diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index 6d2a2fc189dd6..68b5d4018c2f7 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -55,6 +55,17 @@ Description: mount that is prone to get stuck, or a FUSE mount which cannot be trusted to play fair. + (read-write) +What: /sys/class/bdi//strict_limit +Date: October 2022 +Contact: Stefan Roesch +Description: + Forces per-BDI checks for the share of given device in the write-back + cache even before the global background dirty limit is reached. This + is useful in situations where the global limit is much higher than + affordable for given relatively slow (or untrusted) device. Turning + strictlimit on has no visible effect if max_ratio is equal to 100%. + (read-write) What: /sys/class/bdi//stable_pages_required Date: January 2008 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e6f0570cf4900..5e711c739c11b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -554,6 +554,10 @@ named mounts. Specifying both "all" and "named" disables all v1 hierarchies. + cgroup_favordynmods= [KNL] Enable or Disable favordynmods. + Format: { "true" | "false" } + Defaults to the value of CONFIG_CGROUP_FAVOR_DYNMODS. + cgroup.memory= [KNL] Pass options to the cgroup memory controller. Format: nosocket -- Disable socket memory accounting. @@ -669,7 +673,7 @@ kernel/dma/contiguous.c cma_pernuma=nn[MG] - [ARM64,KNL,CMA] + [KNL,CMA] Sets the size of kernel per-numa memory area for contiguous memory allocations. A value of 0 disables per-numa CMA altogether. And If this option is not @@ -679,6 +683,17 @@ which is located in node nid, if the allocation fails, they will fallback to the global default memory area. + numa_cma=:nn[MG][,:nn[MG]] + [KNL,CMA] + Sets the size of kernel numa memory area for + contiguous memory allocations. It will reserve CMA + area for the specified node. + + With numa CMA enabled, DMA users on node nid will + first try to allocate buffer from the numa area + which is located in node nid, if the allocation fails, + they will fallback to the global default memory area. + cmo_free_hint= [PPC] Format: { yes | no } Specify whether pages are marked as being inactive when they are freed. 
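The strict_limit attribute introduced in the sysfs-class-bdi hunk above is a plain boolean read-write file. A minimal user-space sketch that turns it on, assuming the backing device of interest is 8:0 (bdi directories are named by the device's MAJOR:MINOR; substitute the real device, and note this program is illustrative rather than part of the patch):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* 8:0 is only an assumed example; pick the bdi of the target device */
	int fd = open("/sys/class/bdi/8:0/strict_limit", O_WRONLY);

	if (fd < 0) {
		perror("open strict_limit");
		return 1;
	}
	/* "1" enables the per-BDI strictlimit check, "0" disables it again */
	if (write(fd, "1", 1) != 1)
		perror("write strict_limit");
	close(fd);
	return 0;
}

As the ABI text above notes, enabling it only has a visible effect when max_ratio is below 100%.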
This is used in CMO environments diff --git a/MAINTAINERS b/MAINTAINERS index 4b19dfb5d2fd4..3ee119ef87b51 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15577,6 +15577,13 @@ F: Documentation/mm/page_table_check.rst F: include/linux/page_table_check.h F: mm/page_table_check.c +PAGE TOUCHING DMA +M: James Gowans +L: ec2-memo@amazon.com +S: Supported +F: include/linux/dma-page-touching.h +F: kernel/dma/page_touching.c + PANASONIC LAPTOP ACPI EXTRAS DRIVER M: Kenneth Chan L: platform-driver-x86@vger.kernel.org diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 044b98a62f7bb..cc39b1ca95f8e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -180,8 +180,10 @@ config ARM64 select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE + select HAVE_DYNAMIC_FTRACE_WITH_ARGS select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \ if DYNAMIC_FTRACE_WITH_REGS + select HAVE_RELIABLE_STACKTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD @@ -193,6 +195,7 @@ config ARM64 select HAVE_IOREMAP_PROT select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KVM + select HAVE_LIVEPATCH select HAVE_NMI select HAVE_PERF_EVENTS select HAVE_PERF_REGS @@ -204,6 +207,8 @@ config ARM64 select MMU_GATHER_RCU_TABLE_FREE select HAVE_RSEQ select HAVE_STACKPROTECTOR + select HAVE_OBJTOOL + select HAVE_STACK_VALIDATION select HAVE_SYSCALL_TRACEPOINTS select HAVE_KPROBES select HAVE_KRETPROBES @@ -328,6 +333,24 @@ config GENERIC_CALIBRATE_DELAY config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE def_bool y +config ARCH_MEMORY_PROBE + bool "Enable /sys/devices/system/memory/probe interface" + depends on MEMORY_HOTPLUG + help + This option enables a sysfs /sys/devices/system/memory/probe + interface for testing. See Documentation/memory-hotplug.txt + for more information. If you are unsure how to answer this + question, answer N. + +config ARCH_MEMORY_REMOVE + bool "Enable /sys/devices/system/memory/remove interface" + depends on MEMORY_HOTREMOVE + help + This option enables a sysfs /sys/devices/system/memory/remove + interface for testing. See Documentation/memory-hotplug.txt + for more information. If you are unsure how to answer this + question, answer N. + config SMP def_bool y @@ -2304,3 +2327,4 @@ source "drivers/acpi/Kconfig" source "arch/arm64/kvm/Kconfig" +source "kernel/livepatch/Kconfig" diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index 265c4461031f4..c2c68c6f75578 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -20,4 +20,25 @@ config ARM64_RELOC_TEST depends on m tristate "Relocation testing module" +choice + prompt "Choose kernel unwinder" + default UNWINDER_FRAME_POINTER + help + This determines which method will be used for unwinding kernel stack + traces for panics, oopses, bugs, warnings, perf, /proc//stack, + livepatch, lockdep, and more. + +config UNWINDER_FRAME_POINTER + bool "Frame pointer unwinder" + select FRAME_POINTER + help + This option enables the frame pointer unwinder for unwinding kernel + stack traces. + + The unwinder itself is fast and it uses less RAM than the ORC + unwinder, but the kernel text size will grow by ~3% and the kernel's + overall performance will degrade by roughly 5-10%. 
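The frame-pointer unwinder described in the help text above walks the AArch64 frame-record chain: a function prologue that maintains a frame pointer pushes a {previous x29, x30} pair and points x29 at it. A minimal sketch of that walk, assuming kernel context; the real unwinder additionally checks that the frame pointer strictly increases and stays within known stack bounds, and pr_info() here merely stands in for whatever consumer wants the return addresses:

#include <linux/align.h>
#include <linux/printk.h>

struct frame_record {
	unsigned long fp;	/* caller's x29, i.e. the next frame record */
	unsigned long lr;	/* saved x30: return address of this frame */
};

static void walk_frame_records(unsigned long fp)
{
	/* AAPCS64 requires frame records to be 16-byte aligned */
	while (fp && IS_ALIGNED(fp, 16)) {
		const struct frame_record *rec = (const struct frame_record *)fp;

		pr_info("  %pS\n", (void *)rec->lr);
		fp = rec->fp;	/* hop to the caller's frame record */
	}
}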
+ +endchoice + source "drivers/hwtracing/coresight/Kconfig" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index c9496539c3351..ef29e10cb1f69 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -63,20 +63,19 @@ ifeq ($(CONFIG_AS_HAS_ARMV8_2), y) asm-arch := armv8.2-a endif -# Ensure that if the compiler supports branch protection we default it -# off, this will be overridden if we are using branch protection. -branch-prot-flags-y += $(call cc-option,-mbranch-protection=none) - -ifeq ($(CONFIG_ARM64_PTR_AUTH_KERNEL),y) -branch-prot-flags-$(CONFIG_CC_HAS_SIGN_RETURN_ADDRESS) := -msign-return-address=all -# We enable additional protection for leaf functions as there is some -# narrow potential for ROP protection benefits and no substantial -# performance impact has been observed. ifeq ($(CONFIG_ARM64_BTI_KERNEL),y) -branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET_BTI) := -mbranch-protection=pac-ret+leaf+bti + KBUILD_CFLAGS += -mbranch-protection=pac-ret+bti +else ifeq ($(CONFIG_ARM64_PTR_AUTH_KERNEL),y) + ifeq ($(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET),y) + KBUILD_CFLAGS += -mbranch-protection=pac-ret + else + KBUILD_CFLAGS += -msign-return-address=non-leaf + endif else -branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET) := -mbranch-protection=pac-ret+leaf + KBUILD_CFLAGS += $(call cc-option,-mbranch-protection=none) endif + +ifeq ($(CONFIG_ARM64_PTR_AUTH_KERNEL),y) # -march=armv8.3-a enables the non-nops instructions for PAC, to avoid the # compiler to generate them and consequently to break the single image contract # we pass it only to the assembler. This option is utilized only in case of non @@ -86,8 +85,6 @@ asm-arch := armv8.3-a endif endif -KBUILD_CFLAGS += $(branch-prot-flags-y) - ifeq ($(CONFIG_AS_HAS_ARMV8_4), y) # make sure to pass the newest target architecture to -march. 
asm-arch := armv8.4-a @@ -133,6 +130,10 @@ ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_REGS),y) CC_FLAGS_FTRACE := -fpatchable-function-entry=2 endif +ifeq ($(CONFIG_STACK_VALIDATION),y) +KBUILD_CFLAGS += -fno-jump-tables +endif + ifeq ($(CONFIG_KASAN_SW_TAGS), y) KASAN_SHADOW_SCALE_SHIFT := 4 else ifeq ($(CONFIG_KASAN_GENERIC), y) diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S index b2062eeee59e2..1b41899aa9f14 100644 --- a/arch/arm64/crypto/aes-neonbs-core.S +++ b/arch/arm64/crypto/aes-neonbs-core.S @@ -368,15 +368,15 @@ .align 6 -M0: .octa 0x0004080c0105090d02060a0e03070b0f +SYM_DATA_LOCAL(M0, .octa 0x0004080c0105090d02060a0e03070b0f) -M0SR: .octa 0x0004080c05090d010a0e02060f03070b -SR: .octa 0x0f0e0d0c0a09080b0504070600030201 -SRM0: .octa 0x01060b0c0207080d0304090e00050a0f +SYM_DATA_LOCAL(M0SR, .octa 0x0004080c05090d010a0e02060f03070b) +SYM_DATA_LOCAL(SR, .octa 0x0f0e0d0c0a09080b0504070600030201) +SYM_DATA_LOCAL(SRM0, .octa 0x01060b0c0207080d0304090e00050a0f) -M0ISR: .octa 0x0004080c0d0105090a0e0206070b0f03 -ISR: .octa 0x0f0e0d0c080b0a090504070602010003 -ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f +SYM_DATA_LOCAL(M0ISR, .octa 0x0004080c0d0105090a0e0206070b0f03) +SYM_DATA_LOCAL(ISR, .octa 0x0f0e0d0c080b0a090504070602010003) +SYM_DATA_LOCAL(ISRM0, .octa 0x0306090c00070a0d01040b0e0205080f) /* * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds) diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S index dce6dcebfca18..b3b8e56cb87d3 100644 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -63,6 +63,7 @@ // #include +#include #include .text @@ -132,6 +133,8 @@ .endm SYM_FUNC_START_LOCAL(__pmull_p8_core) + stp x29, x30, [sp, #-16]! 
+ mov x29, sp .L__pmull_p8_core: ext t4.8b, ad.8b, ad.8b, #1 // A1 ext t5.8b, ad.8b, ad.8b, #2 // A2 @@ -193,6 +196,7 @@ SYM_FUNC_START_LOCAL(__pmull_p8_core) eor t4.16b, t4.16b, t5.16b eor t6.16b, t6.16b, t3.16b + ldp x29, x30, [sp], #16 ret SYM_FUNC_END(__pmull_p8_core) @@ -207,6 +211,7 @@ SYM_FUNC_END(__pmull_p8_core) pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B .endif + ANNOTATE_INTRA_FUNCTION_CALL bl .L__pmull_p8_core\i eor \rq\().16b, \rq\().16b, t4.16b diff --git a/arch/arm64/crypto/poly1305-armv8.pl b/arch/arm64/crypto/poly1305-armv8.pl index cbc980fb02e33..f460f33c127a0 100644 --- a/arch/arm64/crypto/poly1305-armv8.pl +++ b/arch/arm64/crypto/poly1305-armv8.pl @@ -47,6 +47,8 @@ my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); $code.=<<___; +#include + #ifndef __KERNEL__ # include "arm_arch.h" .extern OPENSSL_armcap_P @@ -888,8 +890,10 @@ .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 +SYM_DATA_START_LOCAL(POLY1305_str) .asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" .align 2 +SYM_DATA_END(POLY1305_str) #if !defined(__KERNEL__) && !defined(_WIN64) .comm OPENSSL_armcap_P,4,4 .hidden OPENSSL_armcap_P diff --git a/arch/arm64/crypto/sha512-armv8.pl b/arch/arm64/crypto/sha512-armv8.pl index 35ec9ae99fe16..6e2a96e05c5a5 100644 --- a/arch/arm64/crypto/sha512-armv8.pl +++ b/arch/arm64/crypto/sha512-armv8.pl @@ -193,6 +193,8 @@ sub BODY_00_xx { } $code.=<<___; +#include + #ifndef __KERNEL__ # include "arm_arch.h" #endif @@ -208,11 +210,11 @@ sub BODY_00_xx { $code.=<<___ if ($SZ==4); #ifndef __KERNEL__ # ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P + ldrsw x16,OPENSSL_armcap_P_rel # else - ldr x16,.LOPENSSL_armcap_P + ldr x16,OPENSSL_armcap_P_rel # endif - adr x17,.LOPENSSL_armcap_P + adr x17,OPENSSL_armcap_P_rel add x16,x16,x17 ldr w16,[x16] tst w16,#ARMV8_SHA256 @@ -237,7 +239,7 @@ sub BODY_00_xx { ldp $E,$F,[$ctx,#4*$SZ] add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input ldp $G,$H,[$ctx,#6*$SZ] - adr $Ktbl,.LK$BITS + adr $Ktbl,K$BITS stp $ctx,$num,[x29,#96] .Loop: @@ -287,8 +289,7 @@ sub BODY_00_xx { .size $func,.-$func .align 6 -.type .LK$BITS,%object -.LK$BITS: +SYM_DATA_START_LOCAL(K$BITS) ___ $code.=<<___ if ($SZ==8); .quad 0x428a2f98d728ae22,0x7137449123ef65cd @@ -353,18 +354,21 @@ sub BODY_00_xx { .long 0 //terminator ___ $code.=<<___; -.size .LK$BITS,.-.LK$BITS +SYM_DATA_END(K$BITS) #ifndef __KERNEL__ .align 3 -.LOPENSSL_armcap_P: +SYM_DATA_START_LOCAL(OPENSSL_armcap_P_rel) # ifdef __ILP32__ .long OPENSSL_armcap_P-. # else .quad OPENSSL_armcap_P-. # endif +SYM_DATA_END(OPENSSL_armcap_P_rel) #endif +SYM_DATA_START_LOCAL(OPENSSL_str) .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by " .align 2 +SYM_DATA_END(OPENSSL_str) ___ if ($SZ==4) { @@ -385,7 +389,7 @@ sub BODY_00_xx { add x29,sp,#0 ld1.32 {$ABCD,$EFGH},[$ctx] - adr $Ktbl,.LK256 + adr $Ktbl,K256 .Loop_hw: ld1 {@MSG[0]-@MSG[3]},[$inp],#64 @@ -644,11 +648,9 @@ () .align 4 sha256_block_neon: .Lneon_entry: - stp x29, x30, [sp, #-16]! 
- mov x29, sp sub sp,sp,#16*4 - adr $Ktbl,.LK256 + adr $Ktbl,K256 add $num,$inp,$num,lsl#6 // len to point at the end of inp ld1.8 {@X[0]},[$inp], #16 @@ -732,8 +734,7 @@ () mov $Xfer,sp b.ne .L_00_48 - ldr x29,[x29] - add sp,sp,#16*4+16 + add sp,sp,#16*4 ret .size sha256_block_neon,.-sha256_block_neon ___ diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h index 51738c56e96cd..1c4e9d0430b77 100644 --- a/arch/arm64/include/asm/alternative-macros.h +++ b/arch/arm64/include/asm/alternative-macros.h @@ -230,7 +230,7 @@ alternative_has_feature_likely(unsigned long feature) "feature must be < ARM64_NCAPS"); asm goto( - ALTERNATIVE_CB("b %l[l_no]", %[feature], alt_cb_patch_nops) + ALTERNATIVE("b %l[l_no]", "nop", %[feature]) : : [feature] "i" (feature) : diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index e5957a53be398..edfe1f0c2a0e1 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -25,6 +25,7 @@ #include #include #include +#include /* * Provide a wxN alias for each wN register so what we can paste a xN @@ -147,6 +148,7 @@ lr .req x30 // link register */ .macro ventry label .align 7 + UNWIND_HINT_EMPTY b \label .endm diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 28be048db3f63..9917429971d48 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -19,7 +19,11 @@ unreachable(); \ } while (0) -#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags)) +#define __WARN_FLAGS(flags) \ +do { \ + __BUG_FLAGS(BUGFLAG_WARNING|(flags)); \ + annotate_reachable(); \ +} while (0) #define HAVE_ARCH_BUG diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index a0badda3a8d1c..15d043e5c3e3b 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -908,6 +908,7 @@ static inline unsigned int get_vmid_bits(u64 mmfr1) return 8; } +s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur); struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id); extern struct arm64_ftr_override id_aa64mmfr1_override; diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 930b0e6c94622..7622782d0bb97 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -56,11 +56,13 @@ extern void fpsimd_signal_preserve_current_state(void); extern void fpsimd_preserve_current_state(void); extern void fpsimd_restore_current_state(void); extern void fpsimd_update_current_state(struct user_fpsimd_state const *state); +extern void fpsimd_kvm_prepare(void); extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state, void *sve_state, unsigned int sve_vl, void *za_state, unsigned int sme_vl, - u64 *svcr); + u64 *svcr, enum fp_type *type, + enum fp_type to_save); extern void fpsimd_flush_task_state(struct task_struct *target); extern void fpsimd_save_and_flush_cpu_state(void); diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index 329dbbd4d50b6..0bc03ecfb257c 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -78,6 +78,26 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) return addr; } +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs * +arch_ftrace_get_regs(struct ftrace_regs *fregs) +{ + return &fregs->regs; +} + +static __always_inline 
void ftrace_instruction_pointer_set( + struct ftrace_regs *fregs, unsigned long pc) +{ + fregs->regs.pc = pc; +} + +#endif + #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS struct dyn_ftrace; struct ftrace_ops; diff --git a/arch/arm64/include/asm/kgdb.h b/arch/arm64/include/asm/kgdb.h index 21fc85e9d2bed..a8cb91d8d59b3 100644 --- a/arch/arm64/include/asm/kgdb.h +++ b/arch/arm64/include/asm/kgdb.h @@ -19,6 +19,7 @@ static inline void arch_kgdb_breakpoint(void) { asm ("brk %0" : : "I" (KGDB_COMPILED_DBG_BRK_IMM)); + annotate_reachable(); } extern void kgdb_handle_bus_error(void); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 577cf444c1135..5424c7bd24531 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -115,6 +115,21 @@ struct kvm_smccc_features { unsigned long vendor_hyp_bmap; }; +/* + * Emulated CPU ID registers per VM + * (Op0, Op1, CRn, CRm, Op2) of the ID registers to be saved in it + * is (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8. + * + * These emulated idregs are VM-wide, but accessed from the context of a vCPU. + * Atomic access to multiple idregs are guarded by kvm_arch.config_lock. + */ +#define IDREG_IDX(id) (((sys_reg_CRm(id) - 1) << 3) | sys_reg_Op2(id)) +#define IDREG(kvm, id) ((kvm)->arch.idregs.regs[IDREG_IDX(id)]) +#define KVM_ARM_ID_REG_NUM (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1) +struct kvm_idregs { + u64 regs[KVM_ARM_ID_REG_NUM]; +}; + struct kvm_arch { struct kvm_s2_mmu mmu; @@ -152,6 +167,12 @@ struct kvm_arch { #define KVM_ARCH_FLAG_EL1_32BIT 4 /* PSCI SYSTEM_SUSPEND enabled for the guest */ #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5 + /* + * AA64DFR0_EL1.PMUver was set as ID_AA64DFR0_EL1_PMUVer_IMP_DEF + * or DFR0_EL1.PerfMon was set as ID_DFR0_EL1_PerfMon_IMPDEF from + * userspace for VCPUs without PMU. + */ +#define KVM_ARCH_FLAG_VCPU_HAS_IMP_DEF_PMU 6 unsigned long flags; @@ -164,11 +185,11 @@ struct kvm_arch { cpumask_var_t supported_cpus; - u8 pfr0_csv2; - u8 pfr0_csv3; - /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; + + /* Emulated CPU ID registers */ + struct kvm_idregs idregs; }; struct kvm_vcpu_fault_info { @@ -309,8 +330,18 @@ struct vcpu_reset_state { struct kvm_vcpu_arch { struct kvm_cpu_context ctxt; - /* Guest floating point state */ + /* + * Guest floating point state + * + * The architecture has two main floating point extensions, + * the original FPSIMD and SVE. These have overlapping + * register views, with the FPSIMD V registers occupying the + * low 128 bits of the SVE Z registers. When the core + * floating point code saves the register state of a task it + * records which view it saved in fp_type. + */ void *sve_state; + enum fp_type fp_type; unsigned int sve_max_vl; u64 svcr; @@ -905,6 +936,8 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, struct kvm_arm_copy_mte_tags *copy_tags); +void kvm_arm_init_id_regs(struct kvm *kvm); + /* Guest/host FPSIMD coordination helpers */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h index 1436fa1cde24d..7d8906737cce9 100644 --- a/arch/arm64/include/asm/linkage.h +++ b/arch/arm64/include/asm/linkage.h @@ -43,4 +43,16 @@ SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \ bti c ; +/* + * Record the address range of each SYM_CODE function in a struct code_range + * in a special section. 
+ */ +#define SYM_CODE_END(name) \ + SYM_END(name, SYM_T_NONE) ;\ + 99: ;\ + .pushsection "sym_code_functions", "aw" ;\ + .quad name ;\ + .quad 99b ;\ + .popsection + #endif diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 56c7df4c65325..5c60d1fa5e299 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -260,9 +260,14 @@ static inline pte_t pte_mkdevmap(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); } -static inline void set_pte(pte_t *ptep, pte_t pte) +static inline void set_pte_nosync(pte_t *ptep, pte_t pte) { WRITE_ONCE(*ptep, pte); +} + +static inline void set_pte(pte_t *ptep, pte_t pte) +{ + set_pte_nosync(ptep, pte); /* * Only if the new pte is valid and kernel, otherwise TLB maintenance diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 400f8956328b9..1b822e618bb4b 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -122,6 +122,12 @@ enum vec_type { ARM64_VEC_MAX, }; +enum fp_type { + FP_STATE_CURRENT, /* Save based on current task state. */ + FP_STATE_FPSIMD, + FP_STATE_SVE, +}; + struct cpu_context { unsigned long x19; unsigned long x20; @@ -152,6 +158,7 @@ struct thread_struct { struct user_fpsimd_state fpsimd_state; } uw; + enum fp_type fp_type; /* registers FPSIMD or SVE? */ unsigned int fpsimd_cpu; void *sve_state; /* SVE registers, if any */ void *za_state; /* ZA register, if any */ diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 40971ac1303f9..50cfd1083563b 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -22,6 +22,7 @@ extern char __irqentry_text_start[], __irqentry_text_end[]; extern char __mmuoff_data_start[], __mmuoff_data_end[]; extern char __entry_tramp_text_start[], __entry_tramp_text_end[]; extern char __relocate_new_kernel_start[], __relocate_new_kernel_end[]; +extern char __sym_code_functions_start[], __sym_code_functions_end[]; static inline size_t entry_tramp_text_size(void) { diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 3a448ab0924b3..c362c45c91771 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -704,6 +704,8 @@ #define ID_DFR0_PERFMON_8_1 0x4 #define ID_DFR0_PERFMON_8_4 0x5 #define ID_DFR0_PERFMON_8_5 0x6 +#define ID_DFR0_PERFMON_8_7 0x7 +#define ID_DFR0_PERFMON_IMP_DEF 0xf #define ID_ISAR4_SWP_FRAC_SHIFT 28 #define ID_ISAR4_PSR_M_SHIFT 24 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 848739c15de82..42ba9d37e8d83 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -68,6 +68,7 @@ int arch_dup_task_struct(struct task_struct *dst, #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ +#define TIF_PATCH_PENDING 7 /* pending live patching update */ #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @@ -100,11 +101,12 @@ int arch_dup_task_struct(struct task_struct *dst, #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) #define _TIF_WORK_MASK 
(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ - _TIF_NOTIFY_SIGNAL) + _TIF_NOTIFY_SIGNAL | _TIF_PATCH_PENDING) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/include/asm/unwind_hints.h b/arch/arm64/include/asm/unwind_hints.h new file mode 100644 index 0000000000000..8655058aa63c8 --- /dev/null +++ b/arch/arm64/include/asm/unwind_hints.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_UNWIND_HINTS_H +#define __ASM_UNWIND_HINTS_H + +#include + +#define UNWIND_HINT_REG_UNDEFINED 0xff +#define UNWIND_HINT_REG_SP 31 + +#ifdef __ASSEMBLY__ + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=UNWIND_HINT_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL end=1 +.endm + +.macro UNWIND_HINT_FUNC sp_offset=0 + UNWIND_HINT sp_reg=UNWIND_HINT_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL +.endm + +.macro UNWIND_HINT_REGS base=UNWIND_HINT_REG_SP offset=0 + UNWIND_HINT sp_reg=\base sp_offset=\offset type=UNWIND_HINT_TYPE_REGS +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_UNWIND_HINTS_H */ diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index a5a256e3f9fe4..7a388b845da76 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -111,6 +112,7 @@ bool acpi_psci_use_hvc(void) { return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_USE_HVC; } +EXPORT_SYMBOL_GPL(acpi_psci_use_hvc); /* * acpi_fadt_sanity_check() - Check FADT presence and carry out sanity @@ -226,6 +228,15 @@ void __init acpi_boot_table_init(void) if (earlycon_acpi_spcr_enable) early_init_dt_scan_chosen_stdout(); } else { +#ifdef CONFIG_HIBERNATION + struct acpi_table_header *facs = NULL; + acpi_get_table(ACPI_SIG_FACS, 1, &facs); + if (facs) { + swsusp_hardware_signature = + ((struct acpi_table_facs *)facs)->hardware_signature; + acpi_put_table(facs); + } +#endif acpi_parse_spcr(earlycon_acpi_spcr_enable, true); if (IS_ENABLED(CONFIG_ACPI_BGRT)) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 91263d09ea650..43aec57da78ac 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -291,11 +291,3 @@ void apply_alternatives_module(void *start, size_t length) __apply_alternatives(®ion, true, &all_capabilities[0]); } #endif - -noinstr void alt_cb_patch_nops(struct alt_instr *alt, __le32 *origptr, - __le32 *updptr, int nr_inst) -{ - for (int i = 0; i < nr_inst; i++) - updptr[i] = cpu_to_le32(aarch64_insn_gen_nop()); -} -EXPORT_SYMBOL(alt_cb_patch_nops); diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index 6b752fe897451..2ab2d4255d8bf 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -11,6 +11,7 @@ #include #include #include +#include #include .text @@ -29,7 +30,8 @@ * branch to what would be the reset vector. It must be executed with the * flat identity mapping. 
*/ -SYM_TYPED_FUNC_START(cpu_soft_restart) +SYM_CODE_START(cpu_soft_restart) + UNWIND_HINT_EMPTY mov_q x12, INIT_SCTLR_EL1_MMU_OFF pre_disable_mmu_workaround /* diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 770a31c6ed81b..292a9f94e617a 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -779,7 +779,7 @@ static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg, return reg; } -static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, +s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { s64 ret = 0; diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index 61a87fa1c3055..9a1a94c3c4dbf 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -9,10 +9,12 @@ #include #include +#include __INIT SYM_CODE_START(efi_enter_kernel) + UNWIND_HINT_EMPTY /* * efi_pe_entry() will have copied the kernel image if necessary and we * end up here with device tree address in x1 and the kernel entry diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index d731b4655df8e..241b148bbf333 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -28,6 +28,7 @@ .macro __EFI_PE_HEADER #ifdef CONFIG_EFI .set .Lpe_header_offset, . - .L_head +SYM_DATA_START_LOCAL(arm64_efi_header) .long PE_MAGIC .short IMAGE_FILE_MACHINE_ARM64 // Machine .short .Lsection_count // NumberOfSections @@ -160,6 +161,7 @@ .balign SEGMENT_ALIGN .Lefi_header_end: +SYM_DATA_END_LABEL(arm64_efi_header, SYM_L_LOCAL, efi_header_end) #else .set .Lpe_header_offset, 0x0 #endif diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 62146d48dba73..43463e4e1c2d9 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include .macro clear_gp_regs .irp n,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29 @@ -37,6 +39,7 @@ .macro kernel_ventry, el:req, ht:req, regsize:req, label:req .align 7 + UNWIND_HINT_EMPTY .Lventry_start\@: .if \el == 0 /* @@ -44,6 +47,7 @@ * skipped by the trampoline vectors, to trigger the cleanup. 
*/ b .Lskip_tramp_vectors_cleanup\@ + UNWIND_HINT_EMPTY .if \regsize == 64 mrs x30, tpidrro_el0 msr tpidrro_el0, xzr @@ -117,6 +121,7 @@ alternative_cb ARM64_ALWAYS_SYSTEM, spectre_v4_patch_fw_mitigation_enable b .L__asm_ssbd_skip\@ // Patched to NOP alternative_cb_end + UNWIND_HINT_REGS ldr_this_cpu \tmp2, arm64_ssbd_callback_required, \tmp1 cbz \tmp2, .L__asm_ssbd_skip\@ ldr \tmp2, [tsk, #TSK_TI_FLAGS] @@ -215,6 +220,7 @@ alternative_cb_end stp x24, x25, [sp, #16 * 12] stp x26, x27, [sp, #16 * 13] stp x28, x29, [sp, #16 * 14] + UNWIND_HINT_REGS .if \el == 0 clear_gp_regs @@ -417,6 +423,7 @@ alternative_else_nop_endif ldp x24, x25, [sp, #16 * 12] ldp x26, x27, [sp, #16 * 13] ldp x28, x29, [sp, #16 * 14] + UNWIND_HINT_EMPTY .if \el == 0 alternative_if ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD @@ -427,7 +434,11 @@ alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 ldr lr, [sp, #S_LR] add sp, sp, #PT_REGS_SIZE // restore sp eret -alternative_else_nop_endif +alternative_else + nop + add sp, sp, #PT_REGS_SIZE // restore sp + nop +alternative_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 bne 4f msr far_el1, x29 @@ -464,11 +475,15 @@ SYM_CODE_START_LOCAL(__swpan_entry_el1) orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR b.eq 1f // TTBR0 access already disabled and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR -SYM_INNER_LABEL(__swpan_entry_el0, SYM_L_LOCAL) __uaccess_ttbr0_disable x21 1: ret SYM_CODE_END(__swpan_entry_el1) +SYM_CODE_START_LOCAL(__swpan_entry_el0) + __uaccess_ttbr0_disable x21 +1: ret +SYM_CODE_END(__swpan_entry_el0) + /* * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR * PAN bit checking. @@ -597,6 +612,7 @@ SYM_CODE_START_LOCAL(ret_to_kernel) SYM_CODE_END(ret_to_kernel) SYM_CODE_START_LOCAL(ret_to_user) + UNWIND_HINT_REGS ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step enable_step_tsk x19, x2 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK @@ -672,6 +688,7 @@ alternative_else_nop_endif .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 + UNWIND_HINT_EMPTY 1: .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry @@ -696,7 +713,9 @@ alternative_else_nop_endif * entry onto the return stack and using a RET instruction to * enter the full-fat kernel vectors. */ + ANNOTATE_INTRA_FUNCTION_CALL bl 2f + UNWIND_HINT_EMPTY b . 2: tramp_map_kernel x30 @@ -727,6 +746,8 @@ alternative_else_nop_endif .endm .macro tramp_exit, regsize = 64 + UNWIND_HINT_EMPTY + sub sp, sp, #PT_REGS_SIZE // revert sp tramp_data_read_var x30, this_cpu_vector get_this_cpu_offset x29 ldr x30, [x30, x29] @@ -771,11 +792,11 @@ SYM_CODE_START_NOALIGN(tramp_vectors) generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE SYM_CODE_END(tramp_vectors) -SYM_CODE_START(tramp_exit_native) +SYM_CODE_START_LOCAL(tramp_exit_native) tramp_exit SYM_CODE_END(tramp_exit_native) -SYM_CODE_START(tramp_exit_compat) +SYM_CODE_START_LOCAL(tramp_exit_compat) tramp_exit 32 SYM_CODE_END(tramp_exit_compat) .popsection // .entry.tramp.text @@ -858,6 +879,7 @@ NOKPROBE(cpu_switch_to) * This is how we return from a fork. 
*/ SYM_CODE_START(ret_from_fork) + UNWIND_HINT_REGS bl schedule_tail cbz x19, 1f // not a kernel thread mov x0, x20 diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 59b5a16bab5d6..ab0ea49620c52 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -125,6 +125,8 @@ struct fpsimd_last_state_struct { u64 *svcr; unsigned int sve_vl; unsigned int sme_vl; + enum fp_type *fp_type; + enum fp_type to_save; }; static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state); @@ -330,15 +332,6 @@ void task_set_vl_onexec(struct task_struct *task, enum vec_type type, * The task can execute SVE instructions while in userspace without * trapping to the kernel. * - * When stored, Z0-Z31 (incorporating Vn in bits[127:0] or the - * corresponding Zn), P0-P15 and FFR are encoded in - * task->thread.sve_state, formatted appropriately for vector - * length task->thread.sve_vl or, if SVCR.SM is set, - * task->thread.sme_vl. - * - * task->thread.sve_state must point to a valid buffer at least - * sve_state_size(task) bytes in size. - * * During any syscall, the kernel may optionally clear TIF_SVE and * discard the vector state except for the FPSIMD subset. * @@ -348,7 +341,15 @@ void task_set_vl_onexec(struct task_struct *task, enum vec_type type, * do_sve_acc() to be called, which does some preparation and then * sets TIF_SVE. * - * When stored, FPSIMD registers V0-V31 are encoded in + * During any syscall, the kernel may optionally clear TIF_SVE and + * discard the vector state except for the FPSIMD subset. + * + * The data will be stored in one of two formats: + * + * * FPSIMD only - FP_STATE_FPSIMD: + * + * When the FPSIMD only state stored task->thread.fp_type is set to + * FP_STATE_FPSIMD, the FPSIMD registers V0-V31 are encoded in * task->thread.uw.fpsimd_state; bits [max : 128] for each of Z0-Z31 are * logically zero but not stored anywhere; P0-P15 and FFR are not * stored and have unspecified values from userspace's point of @@ -356,7 +357,23 @@ void task_set_vl_onexec(struct task_struct *task, enum vec_type type, * but userspace is discouraged from relying on this. * * task->thread.sve_state does not need to be non-NULL, valid or any - * particular size: it must not be dereferenced. + * particular size: it must not be dereferenced and any data stored + * there should be considered stale and not referenced. + * + * * SVE state - FP_STATE_SVE: + * + * When the full SVE state is stored task->thread.fp_type is set to + * FP_STATE_SVE and Z0-Z31 (incorporating Vn in bits[127:0] or the + * corresponding Zn), P0-P15 and FFR are encoded in in + * task->thread.sve_state, formatted appropriately for vector + * length task->thread.sve_vl or, if SVCR.SM is set, + * task->thread.sme_vl. The storage for the vector registers in + * task->thread.uw.fpsimd_state should be ignored. + * + * task->thread.sve_state must point to a valid buffer at least + * sve_state_size(task) bytes in size. The data stored in + * task->thread.uw.fpsimd_state.vregs should be considered stale + * and not referenced. 
* * * FPSR and FPCR are always stored in task->thread.uw.fpsimd_state * irrespective of whether TIF_SVE is clear or set, since these are @@ -378,11 +395,37 @@ static void task_fpsimd_load(void) WARN_ON(!system_supports_fpsimd()); WARN_ON(!have_cpu_fpsimd_context()); - /* Check if we should restore SVE first */ - if (IS_ENABLED(CONFIG_ARM64_SVE) && test_thread_flag(TIF_SVE)) { - sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1); - restore_sve_regs = true; - restore_ffr = true; + if (system_supports_sve()) { + switch (current->thread.fp_type) { + case FP_STATE_FPSIMD: + /* Stop tracking SVE for this task until next use. */ + if (test_and_clear_thread_flag(TIF_SVE)) + sve_user_disable(); + break; + case FP_STATE_SVE: + if (!thread_sm_enabled(¤t->thread) && + !WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE))) + sve_user_enable(); + + if (test_thread_flag(TIF_SVE)) + sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1); + + restore_sve_regs = true; + restore_ffr = true; + break; + default: + /* + * This indicates either a bug in + * fpsimd_save() or memory corruption, we + * should always record an explicit format + * when we save. We always at least have the + * memory allocated for FPSMID registers so + * try that and hope for the best. + */ + WARN_ON_ONCE(1); + clear_thread_flag(TIF_SVE); + break; + } } /* Restore SME, override SVE register configuration if needed */ @@ -398,18 +441,19 @@ static void task_fpsimd_load(void) if (thread_za_enabled(¤t->thread)) za_load_state(current->thread.za_state); - if (thread_sm_enabled(¤t->thread)) { - restore_sve_regs = true; + if (thread_sm_enabled(¤t->thread)) restore_ffr = system_supports_fa64(); - } } - if (restore_sve_regs) + if (restore_sve_regs) { + WARN_ON_ONCE(current->thread.fp_type != FP_STATE_SVE); sve_load_state(sve_pffr(¤t->thread), ¤t->thread.uw.fpsimd_state.fpsr, restore_ffr); - else + } else { + WARN_ON_ONCE(current->thread.fp_type != FP_STATE_FPSIMD); fpsimd_load_state(¤t->thread.uw.fpsimd_state); + } } /* @@ -419,8 +463,8 @@ static void task_fpsimd_load(void) * last, if KVM is involved this may be the guest VM context rather * than the host thread for the VM pointed to by current. This means * that we must always reference the state storage via last rather - * than via current, other than the TIF_ flags which KVM will - * carefully maintain for us. + * than via current, if we are saving KVM state then it will have + * ensured that the type of registers to save is set in last->to_save. */ static void fpsimd_save(void) { @@ -437,7 +481,14 @@ static void fpsimd_save(void) if (test_thread_flag(TIF_FOREIGN_FPSTATE)) return; - if (test_thread_flag(TIF_SVE)) { + /* + * If a task is in a syscall the ABI allows us to only + * preserve the state shared with FPSIMD so don't bother + * saving the full SVE state in that case. 
+ */ + if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE) && + !in_syscall(current_pt_regs())) || + last->to_save == FP_STATE_SVE) { save_sve_regs = true; save_ffr = true; vl = last->sve_vl; @@ -474,8 +525,10 @@ static void fpsimd_save(void) sve_save_state((char *)last->sve_state + sve_ffr_offset(vl), &last->st->fpsr, save_ffr); + *last->fp_type = FP_STATE_SVE; } else { fpsimd_save_state(last->st); + *last->fp_type = FP_STATE_FPSIMD; } } @@ -768,8 +821,7 @@ void fpsimd_sync_to_sve(struct task_struct *task) */ void sve_sync_to_fpsimd(struct task_struct *task) { - if (test_tsk_thread_flag(task, TIF_SVE) || - thread_sm_enabled(&task->thread)) + if (task->thread.fp_type == FP_STATE_SVE) sve_to_fpsimd(task); } @@ -851,8 +903,10 @@ int vec_set_vector_length(struct task_struct *task, enum vec_type type, fpsimd_flush_task_state(task); if (test_and_clear_tsk_thread_flag(task, TIF_SVE) || - thread_sm_enabled(&task->thread)) + thread_sm_enabled(&task->thread)) { sve_to_fpsimd(task); + task->thread.fp_type = FP_STATE_FPSIMD; + } if (system_supports_sme()) { if (type == ARM64_VEC_SME || @@ -1383,6 +1437,7 @@ static void sve_init_regs(void) fpsimd_bind_task_to_cpu(); } else { fpsimd_to_sve(current); + current->thread.fp_type = FP_STATE_SVE; } } @@ -1611,6 +1666,8 @@ void fpsimd_flush_thread(void) current->thread.svcr = 0; } + current->thread.fp_type = FP_STATE_FPSIMD; + put_cpu_fpsimd_context(); kfree(sve_state); kfree(za_state); @@ -1642,6 +1699,31 @@ void fpsimd_signal_preserve_current_state(void) sve_to_fpsimd(current); } +/* + * Called by KVM when entering the guest. + */ +void fpsimd_kvm_prepare(void) +{ + if (!system_supports_sve()) + return; + + /* + * KVM does not save host SVE state since we can only enter + * the guest from a syscall so the ABI means that only the + * non-saved SVE state needs to be saved. If we have left + * SVE enabled for performance reasons then update the task + * state to be FPSIMD only. + */ + get_cpu_fpsimd_context(); + + if (test_and_clear_thread_flag(TIF_SVE)) { + sve_to_fpsimd(current); + current->thread.fp_type = FP_STATE_FPSIMD; + } + + put_cpu_fpsimd_context(); +} + /* * Associate current's FPSIMD context with this cpu * The caller must have ownership of the cpu FPSIMD context before calling @@ -1659,6 +1741,8 @@ static void fpsimd_bind_task_to_cpu(void) last->sve_vl = task_get_sve_vl(current); last->sme_vl = task_get_sme_vl(current); last->svcr = ¤t->thread.svcr; + last->fp_type = ¤t->thread.fp_type; + last->to_save = FP_STATE_CURRENT; current->thread.fpsimd_cpu = smp_processor_id(); /* @@ -1682,7 +1766,8 @@ static void fpsimd_bind_task_to_cpu(void) void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state, unsigned int sve_vl, void *za_state, - unsigned int sme_vl, u64 *svcr) + unsigned int sme_vl, u64 *svcr, + enum fp_type *type, enum fp_type to_save) { struct fpsimd_last_state_struct *last = this_cpu_ptr(&fpsimd_last_state); @@ -1696,6 +1781,8 @@ void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state, last->za_state = za_state; last->sve_vl = sve_vl; last->sme_vl = sme_vl; + last->fp_type = type; + last->to_save = to_save; } /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index cdbbc95eb49d0..2cffaad364fea 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "efi-header.S" @@ -57,9 +58,11 @@ /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. 
*/ - efi_signature_nop // special NOP to identity as PE/COFF executable +SYM_DATA_LOCAL(efi_nop, efi_signature_nop) // special NOP to identity as PE/COFF executable + UNWIND_HINT_EMPTY b primary_entry // branch to kernel start, magic - .quad 0 // Image load offset from start of RAM, little-endian +SYM_DATA_LOCAL(_zero_reserved, .quad 0) // Image load offset from start of RAM, little-endian +SYM_DATA_START_LOCAL(_arm64_common_header) le64sym _kernel_size_le // Effective size of kernel image, little-endian le64sym _kernel_flags_le // Informative flags, little-endian .quad 0 // reserved @@ -67,6 +70,7 @@ .quad 0 // reserved .ascii ARM64_IMAGE_MAGIC // Magic number .long .Lpe_header_offset // Offset to the PE header. +SYM_DATA_END(_arm64_common_header) __EFI_PE_HEADER @@ -113,6 +117,7 @@ SYM_CODE_END(primary_entry) * Preserve the arguments passed by the bootloader in x0 .. x3 */ SYM_CODE_START_LOCAL(preserve_boot_args) + UNWIND_HINT_EMPTY mov x21, x0 // x21=FDT adr_l x0, boot_args // record the contents of @@ -126,7 +131,8 @@ SYM_CODE_START_LOCAL(preserve_boot_args) b dcache_inval_poc // tail call SYM_CODE_END(preserve_boot_args) -SYM_FUNC_START_LOCAL(clear_page_tables) +SYM_CODE_START_LOCAL(clear_page_tables) + UNWIND_HINT_EMPTY /* * Clear the init page tables. */ @@ -135,7 +141,7 @@ SYM_FUNC_START_LOCAL(clear_page_tables) sub x2, x1, x0 mov x1, xzr b __pi_memset // tail call -SYM_FUNC_END(clear_page_tables) +SYM_CODE_END(clear_page_tables) /* * Macro to populate page table entries, these entries can be pointers to the next level @@ -259,8 +265,8 @@ SYM_FUNC_END(clear_page_tables) * x5: attributes to set on the updated region * x6: order of the last level mappings */ -SYM_FUNC_START_LOCAL(remap_region) - sub x3, x3, #1 // make end inclusive +SYM_CODE_START_LOCAL(remap_region) + UNWIND_HINT_EMPTY // Get the index offset for the start of the last level table lsr x1, x1, x6 @@ -278,9 +284,10 @@ SYM_FUNC_START_LOCAL(remap_region) populate_entries x0, x4, x2, x3, x5, x6, x7 ret -SYM_FUNC_END(remap_region) +SYM_CODE_END(remap_region) -SYM_FUNC_START_LOCAL(create_idmap) +SYM_CODE_START_LOCAL(create_idmap) + UNWIND_HINT_EMPTY mov x28, lr /* * The ID map carries a 1:1 mapping of the physical address range @@ -366,9 +373,10 @@ SYM_FUNC_START_LOCAL(create_idmap) adrp x1, init_idmap_pg_end bl dcache_inval_poc ret x28 -SYM_FUNC_END(create_idmap) +SYM_CODE_END(create_idmap) -SYM_FUNC_START_LOCAL(create_kernel_mapping) +SYM_CODE_START_LOCAL(create_kernel_mapping) + UNWIND_HINT_EMPTY adrp x0, init_pg_dir mov_q x5, KIMAGE_VADDR // compile time __va(_text) #ifdef CONFIG_RELOCATABLE @@ -384,7 +392,7 @@ SYM_FUNC_START_LOCAL(create_kernel_mapping) dsb ishst // sync with page table walker ret -SYM_FUNC_END(create_kernel_mapping) +SYM_CODE_END(create_kernel_mapping) /* * Initialize CPU registers with task-specific and cpu-specific context. @@ -417,7 +425,7 @@ SYM_FUNC_END(create_kernel_mapping) * * x0 = __pa(KERNEL_START) */ -SYM_FUNC_START_LOCAL(__primary_switched) +SYM_CODE_START_LOCAL(__primary_switched) adr_l x4, init_task init_cpu_task x4, x5, x6 @@ -467,7 +475,7 @@ SYM_FUNC_START_LOCAL(__primary_switched) ldp x29, x30, [sp], #16 bl start_kernel ASM_BUG() -SYM_FUNC_END(__primary_switched) +SYM_CODE_END(__primary_switched) /* * end early head section, begin head code that is also used for @@ -487,7 +495,8 @@ SYM_FUNC_END(__primary_switched) * booted in EL1 or EL2 respectively, with the top 32 bits containing * potential context flags. These flags are *not* stored in __boot_cpu_mode. 
*/ -SYM_FUNC_START(init_kernel_el) +SYM_CODE_START(init_kernel_el) + UNWIND_HINT_EMPTY mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 b.eq init_el2 @@ -538,26 +547,28 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) mov w0, #BOOT_CPU_MODE_EL2 orr x0, x0, x2 eret -SYM_FUNC_END(init_kernel_el) +SYM_CODE_END(init_kernel_el) /* * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed * in w0. See arch/arm64/include/asm/virt.h for more info. */ -SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag) +SYM_CODE_START_LOCAL(set_cpu_boot_mode_flag) + UNWIND_HINT_EMPTY adr_l x1, __boot_cpu_mode cmp w0, #BOOT_CPU_MODE_EL2 b.ne 1f add x1, x1, #4 1: str w0, [x1] // Save CPU boot mode ret -SYM_FUNC_END(set_cpu_boot_mode_flag) +SYM_CODE_END(set_cpu_boot_mode_flag) /* * This provides a "holding pen" for platforms to hold all secondary * cores are held until we're ready for them to initialise. */ -SYM_FUNC_START(secondary_holding_pen) +SYM_CODE_START(secondary_holding_pen) + UNWIND_HINT_EMPTY bl init_kernel_el // w0=cpu_boot_mode mrs x2, mpidr_el1 mov_q x1, MPIDR_HWID_BITMASK @@ -568,18 +579,19 @@ pen: ldr x4, [x3] b.eq secondary_startup wfe b pen -SYM_FUNC_END(secondary_holding_pen) +SYM_CODE_END(secondary_holding_pen) /* * Secondary entry point that jumps straight into the kernel. Only to * be used where CPUs are brought online dynamically by the kernel. */ -SYM_FUNC_START(secondary_entry) +SYM_CODE_START(secondary_entry) + UNWIND_HINT_EMPTY bl init_kernel_el // w0=cpu_boot_mode b secondary_startup -SYM_FUNC_END(secondary_entry) +SYM_CODE_END(secondary_entry) -SYM_FUNC_START_LOCAL(secondary_startup) +SYM_CODE_START_LOCAL(secondary_startup) /* * Common entry point for secondary CPUs. */ @@ -595,9 +607,10 @@ SYM_FUNC_START_LOCAL(secondary_startup) bl __enable_mmu ldr x8, =__secondary_switched br x8 -SYM_FUNC_END(secondary_startup) +SYM_CODE_END(secondary_startup) -SYM_FUNC_START_LOCAL(__secondary_switched) +SYM_CODE_START_LOCAL(__secondary_switched) + UNWIND_HINT_EMPTY mov x0, x20 bl set_cpu_boot_mode_flag str_l xzr, __early_cpu_boot_status, x3 @@ -617,13 +630,14 @@ SYM_FUNC_START_LOCAL(__secondary_switched) bl secondary_start_kernel ASM_BUG() -SYM_FUNC_END(__secondary_switched) +SYM_CODE_END(__secondary_switched) -SYM_FUNC_START_LOCAL(__secondary_too_slow) +SYM_CODE_START_LOCAL(__secondary_too_slow) + UNWIND_HINT_EMPTY wfe wfi b __secondary_too_slow -SYM_FUNC_END(__secondary_too_slow) +SYM_CODE_END(__secondary_too_slow) /* * The booting CPU updates the failed status @__early_cpu_boot_status, @@ -656,7 +670,8 @@ SYM_FUNC_END(__secondary_too_slow) * Checks if the selected granule size is supported by the CPU. 
* If it isn't, park the CPU */ -SYM_FUNC_START(__enable_mmu) +SYM_CODE_START(__enable_mmu) + UNWIND_HINT_EMPTY mrs x3, ID_AA64MMFR0_EL1 ubfx x3, x3, #ID_AA64MMFR0_EL1_TGRAN_SHIFT, 4 cmp x3, #ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN @@ -670,9 +685,10 @@ SYM_FUNC_START(__enable_mmu) set_sctlr_el1 x0 ret -SYM_FUNC_END(__enable_mmu) +SYM_CODE_END(__enable_mmu) -SYM_FUNC_START(__cpu_secondary_check52bitva) +SYM_CODE_START_LOCAL(__cpu_secondary_check52bitva) + UNWIND_HINT_EMPTY #if VA_BITS > 48 ldr_l x0, vabits_actual cmp x0, #52 @@ -690,9 +706,9 @@ SYM_FUNC_START(__cpu_secondary_check52bitva) #endif 2: ret -SYM_FUNC_END(__cpu_secondary_check52bitva) +SYM_CODE_END(__cpu_secondary_check52bitva) -SYM_FUNC_START_LOCAL(__no_granule_support) +SYM_CODE_START_LOCAL(__no_granule_support) /* Indicate that this CPU can't boot and is stuck in the kernel */ update_early_cpu_boot_status \ CPU_STUCK_IN_KERNEL | CPU_STUCK_REASON_NO_GRAN, x1, x2 @@ -700,10 +716,11 @@ SYM_FUNC_START_LOCAL(__no_granule_support) wfe wfi b 1b -SYM_FUNC_END(__no_granule_support) +SYM_CODE_END(__no_granule_support) #ifdef CONFIG_RELOCATABLE -SYM_FUNC_START_LOCAL(__relocate_kernel) +SYM_CODE_START_LOCAL(__relocate_kernel) + UNWIND_HINT_EMPTY /* * Iterate over each entry in the relocation table, and apply the * relocations in place. @@ -790,10 +807,10 @@ SYM_FUNC_START_LOCAL(__relocate_kernel) #endif ret -SYM_FUNC_END(__relocate_kernel) +SYM_CODE_END(__relocate_kernel) #endif -SYM_FUNC_START_LOCAL(__primary_switch) +SYM_CODE_START_LOCAL(__primary_switch) adrp x1, reserved_pg_dir adrp x2, init_idmap_pg_dir bl __enable_mmu @@ -822,4 +839,4 @@ SYM_FUNC_START_LOCAL(__primary_switch) ldr x8, =__primary_switched adrp x0, KERNEL_START // __pa(KERNEL_START) br x8 -SYM_FUNC_END(__primary_switch) +SYM_CODE_END(__primary_switch) diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S index 0e1d9c3c6a933..c0bec20bf0e09 100644 --- a/arch/arm64/kernel/hibernate-asm.S +++ b/arch/arm64/kernel/hibernate-asm.S @@ -13,6 +13,7 @@ #include #include #include +#include #include /* @@ -46,6 +47,7 @@ */ .pushsection ".hibernate_exit.text", "ax" SYM_CODE_START(swsusp_arch_suspend_exit) + UNWIND_HINT_EMPTY /* * We execute from ttbr0, change ttbr1 to our copied linear map tables * with a break-before-make via the zero page diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 8151412653de2..a47e6185efa33 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -60,7 +60,6 @@ KVM_NVHE_ALIAS(spectre_bhb_patch_loop_iter); KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable); KVM_NVHE_ALIAS(spectre_bhb_patch_wa3); KVM_NVHE_ALIAS(spectre_bhb_patch_clearbhb); -KVM_NVHE_ALIAS(alt_cb_patch_nops); /* Global kernel state accessed by nVHE hyp code. */ KVM_NVHE_ALIAS(kvm_vgic_global_state); diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 76b41e4ca9fa3..a2b755718d2ed 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -19,6 +19,7 @@ #include #include #include +#include void *module_alloc(unsigned long size) { @@ -156,7 +157,8 @@ enum aarch64_insn_movw_imm_type { }; static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, - int lsb, enum aarch64_insn_movw_imm_type imm_type) + int lsb, enum aarch64_insn_movw_imm_type imm_type, + bool early) { u64 imm; s64 sval; @@ -188,7 +190,10 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, /* Update the instruction with the new encoding. 
*/ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); - *place = cpu_to_le32(insn); + if (early) + *place = cpu_to_le32(insn); + else + aarch64_insn_write(place, insn); if (imm > U16_MAX) return -ERANGE; @@ -197,7 +202,8 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, } static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, - int lsb, int len, enum aarch64_insn_imm_type imm_type) + int lsb, int len, enum aarch64_insn_imm_type imm_type, + bool early) { u64 imm, imm_mask; s64 sval; @@ -213,7 +219,10 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, /* Update the instruction's immediate field. */ insn = aarch64_insn_encode_immediate(imm_type, insn, imm); - *place = cpu_to_le32(insn); + if (early) + *place = cpu_to_le32(insn); + else + aarch64_insn_write(place, insn); /* * Extract the upper value bits (including the sign bit) and @@ -232,17 +241,17 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, } static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, - __le32 *place, u64 val) + __le32 *place, u64 val, bool early) { u32 insn; if (!is_forbidden_offset_for_adrp(place)) return reloc_insn_imm(RELOC_OP_PAGE, place, val, 12, 21, - AARCH64_INSN_IMM_ADR); + AARCH64_INSN_IMM_ADR, early); /* patch ADRP to ADR if it is in range */ if (!reloc_insn_imm(RELOC_OP_PREL, place, val & ~0xfff, 0, 21, - AARCH64_INSN_IMM_ADR)) { + AARCH64_INSN_IMM_ADR, early)) { insn = le32_to_cpu(*place); insn &= ~BIT(31); } else { @@ -254,7 +263,10 @@ static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, AARCH64_INSN_BRANCH_NOLINK); } - *place = cpu_to_le32(insn); + if (early) + *place = cpu_to_le32(insn); + else + aarch64_insn_write(place, insn); return 0; } @@ -271,6 +283,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, void *loc; u64 val; Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; + bool early = me->state == MODULE_STATE_UNFORMED; for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* loc corresponds to P in the AArch64 ELF document. */ @@ -323,88 +336,88 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, fallthrough; case R_AARCH64_MOVW_UABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_UABS_G1_NC: overflow_check = false; fallthrough; case R_AARCH64_MOVW_UABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_UABS_G2_NC: overflow_check = false; fallthrough; case R_AARCH64_MOVW_UABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_UABS_G3: /* We're using the top bits so we can't overflow. 
*/ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_SABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_SABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_SABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G0_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_PREL_G0: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G1_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_PREL_G1: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G2_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_PREL_G2: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G3: /* We're using the top bits so we can't overflow. */ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 48, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; /* Immediate instruction relocations. 
*/ case R_AARCH64_LD_PREL_LO19: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, - AARCH64_INSN_IMM_19); + AARCH64_INSN_IMM_19, early); break; case R_AARCH64_ADR_PREL_LO21: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 0, 21, - AARCH64_INSN_IMM_ADR); + AARCH64_INSN_IMM_ADR, early); break; case R_AARCH64_ADR_PREL_PG_HI21_NC: overflow_check = false; fallthrough; case R_AARCH64_ADR_PREL_PG_HI21: - ovf = reloc_insn_adrp(me, sechdrs, loc, val); + ovf = reloc_insn_adrp(me, sechdrs, loc, val, early); if (ovf && ovf != -ERANGE) return ovf; break; @@ -412,40 +425,40 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_LDST8_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 0, 12, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST16_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 1, 11, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST32_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 2, 10, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST64_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 3, 9, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST128_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 4, 8, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_TSTBR14: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 14, - AARCH64_INSN_IMM_14); + AARCH64_INSN_IMM_14, early); break; case R_AARCH64_CONDBR19: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, - AARCH64_INSN_IMM_19); + AARCH64_INSN_IMM_19, early); break; case R_AARCH64_JUMP26: case R_AARCH64_CALL26: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, - AARCH64_INSN_IMM_26); + AARCH64_INSN_IMM_26, early); if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && ovf == -ERANGE) { @@ -453,7 +466,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, if (!val) return -ENOEXEC; ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, - 26, AARCH64_INSN_IMM_26); + 26, AARCH64_INSN_IMM_26, early); } break; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 044a7d7f1f6ad..19cd05eea3f0e 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -331,6 +331,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) clear_tsk_thread_flag(dst, TIF_SME); } + dst->thread.fp_type = FP_STATE_FPSIMD; + /* clear any pending asynchronous tag fault raised by the parent */ clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index bfce41c2a53b3..dd07406d1391a 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -257,6 +258,7 @@ static noinstr void qcom_link_stack_sanitisation(void) "mov x30, %0 \n" : "=&r" (tmp)); } +STACK_FRAME_NON_STANDARD(qcom_link_stack_sanitisation); static bp_hardening_cb_t spectre_v2_get_sw_mitigation_cb(void) { diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index d02dd2be17b3b..18ba01eb2a0e3 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -915,8 +915,7 @@ static int sve_set_common(struct task_struct *target, ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, SVE_PT_FPSIMD_OFFSET); clear_tsk_thread_flag(target, TIF_SVE); - if (type == 
ARM64_VEC_SME) - fpsimd_force_sync_to_sve(target); + target->thread.fp_type = FP_STATE_FPSIMD; goto out; } @@ -939,6 +938,7 @@ static int sve_set_common(struct task_struct *target, if (!target->thread.sve_state) { ret = -ENOMEM; clear_tsk_thread_flag(target, TIF_SVE); + target->thread.fp_type = FP_STATE_FPSIMD; goto out; } @@ -952,6 +952,7 @@ static int sve_set_common(struct task_struct *target, fpsimd_sync_to_sve(target); if (type == ARM64_VEC_SVE) set_tsk_thread_flag(target, TIF_SVE); + target->thread.fp_type = FP_STATE_SVE; BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); start = SVE_PT_SVE_OFFSET; diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index 413f899e4ac63..3ec70802a7f8e 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -16,6 +16,7 @@ #include #include #include +#include .macro turn_off_mmu tmp1, tmp2 mov_q \tmp1, INIT_SCTLR_EL1_MMU_OFF @@ -37,6 +38,7 @@ * safe memory that has been set up to be preserved during the copy operation. */ SYM_CODE_START(arm64_relocate_new_kernel) + UNWIND_HINT_EMPTY /* * The kimage structure isn't allocated specially and may be clobbered * during relocation. We must load any values we need from it prior to diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 82f4572c8ddfc..86f2c5b66bc68 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -207,6 +208,7 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx) __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); clear_thread_flag(TIF_SVE); + current->thread.fp_type = FP_STATE_FPSIMD; /* load the hardware registers from the fpsimd_state structure */ if (!err) @@ -297,6 +299,7 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (sve.head.size <= sizeof(*user->sve)) { clear_thread_flag(TIF_SVE); current->thread.svcr &= ~SVCR_SM_MASK; + current->thread.fp_type = FP_STATE_FPSIMD; goto fpsimd_only; } @@ -332,6 +335,7 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) current->thread.svcr |= SVCR_SM_MASK; else set_thread_flag(TIF_SVE); + current->thread.fp_type = FP_STATE_SVE; fpsimd_only: /* copy the FP and status/control registers */ @@ -937,9 +941,11 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, * FPSIMD register state - flush the saved FPSIMD * register state in case it gets loaded. */ - if (current->thread.svcr & SVCR_SM_MASK) + if (current->thread.svcr & SVCR_SM_MASK) { memset(¤t->thread.uw.fpsimd_state, 0, sizeof(current->thread.uw.fpsimd_state)); + current->thread.fp_type = FP_STATE_FPSIMD; + } current->thread.svcr &= ~(SVCR_ZA_MASK | SVCR_SM_MASK); @@ -1125,6 +1131,9 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) (void __user *)NULL, current); } + if (thread_flags & _TIF_PATCH_PENDING) + klp_update_patch_state(current); + if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 97c9de57725df..82e38d1e7a8ba 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -4,6 +4,7 @@ #include #include #include +#include .text /* @@ -91,6 +92,7 @@ SYM_FUNC_START(__cpu_suspend_enter) str x0, [x1] add x0, x0, #SLEEP_STACK_DATA_SYSTEM_REGS stp x29, lr, [sp, #-16]! 
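The fp_type assignments in the process.c, ptrace.c and signal.c hunks above refer to an enum introduced elsewhere in this series and not visible in this excerpt. Purely for orientation, a sketch of what that tracking presumably looks like follows; FP_STATE_FPSIMD and FP_STATE_SVE match the values used in these hunks, while FP_STATE_CURRENT and the task_fp_state() helper are assumptions added here for illustration only.

#include <linux/sched.h>

enum fp_type {
	FP_STATE_CURRENT,	/* save in whichever format is live in HW */
	FP_STATE_FPSIMD,	/* thread.uw.fpsimd_state holds the V regs */
	FP_STATE_SVE,		/* thread.sve_state holds full Z/P/FFR state */
};

/* Hypothetical debug helper, not part of the patch. */
static const char *task_fp_state(const struct task_struct *tsk)
{
	switch (tsk->thread.fp_type) {
	case FP_STATE_FPSIMD:
		return "fpsimd";
	case FP_STATE_SVE:
		return "sve";
	default:
		return "current";
	}
}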
+ mov x29, sp bl cpu_do_suspend ldp x29, lr, [sp], #16 mov x0, #1 @@ -99,6 +101,7 @@ SYM_FUNC_END(__cpu_suspend_enter) .pushsection ".idmap.text", "awx" SYM_CODE_START(cpu_resume) + UNWIND_HINT_EMPTY bl init_kernel_el bl finalise_el2 #if VA_BITS > 48 @@ -115,7 +118,8 @@ SYM_CODE_END(cpu_resume) .ltorg .popsection -SYM_FUNC_START(_cpu_resume) +SYM_CODE_START(_cpu_resume) + UNWIND_HINT_EMPTY mrs x1, mpidr_el1 adr_l x8, mpidr_hash // x8 = struct mpidr_hash virt address @@ -151,4 +155,4 @@ SYM_FUNC_START(_cpu_resume) ldp x29, lr, [x29] mov x0, #0 ret -SYM_FUNC_END(_cpu_resume) +SYM_CODE_END(_cpu_resume) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 83154303e682c..6f010945c3bbb 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -12,12 +12,116 @@ #include #include #include +#include #include #include #include #include +struct code_range { + unsigned long start; + unsigned long end; +}; + +static struct code_range *sym_code_functions; +static int num_sym_code_functions; + +int __init init_sym_code_functions(void) +{ + size_t size; + + size = (unsigned long)__sym_code_functions_end - + (unsigned long)__sym_code_functions_start; + + sym_code_functions = kmalloc(size, GFP_KERNEL); + if (!sym_code_functions) + return -ENOMEM; + + memcpy(sym_code_functions, __sym_code_functions_start, size); + /* Update num_sym_code_functions after copying sym_code_functions. */ + smp_mb(); + num_sym_code_functions = size / sizeof(struct code_range); + + return 0; +} +early_initcall(init_sym_code_functions); + +/* + * Check the return PC against sym_code_functions[]. If there is a match, then + * the consider the stack frame unreliable. These functions contain low-level + * code where the frame pointer and/or the return address register cannot be + * relied upon. This addresses the following situations: + * + * - Exception handlers and entry assembly + * - Trampoline assembly (e.g., ftrace, kprobes) + * - Hypervisor-related assembly + * - Hibernation-related assembly + * - CPU start-stop, suspend-resume assembly + * - Kernel relocation assembly + * + * Some special cases covered by sym_code_functions[] deserve a mention here: + * + * - All EL1 interrupt and exception stack traces will be considered + * unreliable. This is the correct behavior as interrupts and exceptions + * can happen on any instruction including ones in the frame pointer + * prolog and epilog. Unless stack metadata is available so the unwinder + * can unwind through these special cases, such stack traces will be + * considered unreliable. + * + * - A task can get preempted at the end of an interrupt. Stack traces + * of preempted tasks will show the interrupt frame in the stack trace + * and will be considered unreliable. + * + * - Breakpoints are exceptions. So, all stack traces in the break point + * handler (including probes) will be considered unreliable. + * + * - All of the ftrace entry trampolines are considered unreliable. So, + * all stack traces taken from tracer functions will be considered + * unreliable. + * + * - The Function Graph Tracer return trampoline (return_to_handler) + * and the Kretprobe return trampoline (kretprobe_trampoline) are + * also considered unreliable. + * + * Some of the special cases above can be unwound through using special logic + * in unwind_frame(). + * + * - return_to_handler() is handled by the unwinder by attempting to + * retrieve the original return address from the per-task return + * address stack. 
+ * + * - kretprobe_trampoline() can be handled in a similar fashion by + * attempting to retrieve the original return address from the per-task + * kretprobe instance list. + * + * - I reckon optprobes can be handled in a similar fashion in the future? + * + * - Stack traces taken from the FTrace tracer functions can be handled + * as well. ftrace_call is an inner label defined in the Ftrace entry + * trampoline. This is the location where the call to a tracer function + * is patched. So, if the return PC equals ftrace_call+4, it is + * reliable. At that point, proper stack frames have already been set + * up for the traced function and its caller. + */ +static bool unwinder_is_unreliable(unsigned long pc) +{ + const struct code_range *range; + int i; + + /* + * If sym_code_functions[] were sorted, a binary search could be + * done to make this more performant. + */ + for (i = 0; i < num_sym_code_functions; i++) { + range = &sym_code_functions[i]; + if (pc >= range->start && pc < range->end) + return true; + } + + return false; +} + /* * Start an unwind from a pt_regs. * @@ -76,19 +180,17 @@ static __always_inline void unwind_init_from_task(struct unwind_state *state, * records (e.g. a cycle), determined based on the location and fp value of A * and the location (but not the fp value) of B. */ -static int notrace unwind_next(struct unwind_state *state) +static int notrace unwind_next(struct unwind_state *state, int *reliable) { struct task_struct *tsk = state->task; - unsigned long fp = state->fp; int err; - /* Final frame; nothing to unwind */ - if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) - return -ENOENT; - err = unwind_next_frame_record(state); - if (err) + if (err) { + if (reliable) + *reliable = 0; return err; + } state->pc = ptrauth_strip_insn_pac(state->pc); @@ -114,11 +216,34 @@ static int notrace unwind_next(struct unwind_state *state) state->pc = kretprobe_find_ret_addr(tsk, (void *)state->fp, &state->kr_cur); #endif + /* + * Check the return PC for conditions that make unwinding unreliable. + * In each case, mark the stack trace as such. + */ + + /* + * Make sure that the return address is a proper kernel text address. + * A NULL or invalid return address could mean: + * + * - generated code such as eBPF and optprobe trampolines + * - Foreign code (e.g. EFI runtime services) + * - Procedure Linkage Table (PLT) entries and veneer functions + */ + if (reliable && !__kernel_text_address(state->pc)) + *reliable = 0; + + /* Final frame; nothing to unwind */ + if (state->fp == (unsigned long)task_pt_regs(tsk)->stackframe) + return -ENOENT; + + if (reliable && unwinder_is_unreliable(state->pc)) + *reliable = 0; + return 0; } NOKPROBE_SYMBOL(unwind_next); -static void notrace unwind(struct unwind_state *state, +static void notrace unwind(struct unwind_state *state, int *reliable, stack_trace_consume_fn consume_entry, void *cookie) { while (1) { @@ -126,8 +251,8 @@ static void notrace unwind(struct unwind_state *state, if (!consume_entry(cookie, state->pc)) break; - ret = unwind_next(state); - if (ret < 0) + ret = unwind_next(state, reliable); + if ((ret < 0) || (reliable && !(*reliable))) break; } } @@ -228,5 +353,44 @@ noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, unwind_init_from_task(&state, task); } - unwind(&state, consume_entry, cookie); + unwind(&state, NULL, consume_entry, cookie); +} + +/* + * Walk the stack like arch_stack_walk() but stop the walk as soon as + * some unreliability is detected in the stack. 
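The comment in unwinder_is_unreliable() above already points at the obvious optimisation: if the copied table were sorted at init time (and the ranges kept non-overlapping), the linear scan could become a binary search. A minimal sketch of that variant using the generic sort() and bsearch() helpers; the sort_sym_code_functions() and unwinder_is_unreliable_bsearch() names are illustrative, not part of the patch.

#include <linux/bsearch.h>
#include <linux/sort.h>

static int cmp_code_range(const void *a, const void *b)
{
	const struct code_range *ra = a, *rb = b;

	if (ra->start < rb->start)
		return -1;
	return ra->start > rb->start;
}

/* Is the PC behind @key below, inside or above this range? */
static int cmp_pc_to_range(const void *key, const void *elt)
{
	unsigned long pc = *(const unsigned long *)key;
	const struct code_range *range = elt;

	if (pc < range->start)
		return -1;
	return pc >= range->end;
}

/* Would be called once from init_sym_code_functions(), before any reader. */
static void sort_sym_code_functions(void)
{
	sort(sym_code_functions, num_sym_code_functions,
	     sizeof(struct code_range), cmp_code_range, NULL);
}

static bool unwinder_is_unreliable_bsearch(unsigned long pc)
{
	return bsearch(&pc, sym_code_functions, num_sym_code_functions,
		       sizeof(struct code_range), cmp_pc_to_range) != NULL;
}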
+ */ +noinline noinstr int arch_stack_walk_reliable( + stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task) +{ + struct stack_info stacks[] = { + stackinfo_get_task(task), + STACKINFO_CPU(irq), +#if defined(CONFIG_VMAP_STACK) + STACKINFO_CPU(overflow), +#endif +#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_ARM_SDE_INTERFACE) + STACKINFO_SDEI(normal), + STACKINFO_SDEI(critical), +#endif +#ifdef CONFIG_EFI + STACKINFO_EFI, +#endif + }; + struct unwind_state state = { + .stacks = stacks, + .nr_stacks = ARRAY_SIZE(stacks), + }; + int reliable = 1; + + if (task == current) { + unwind_init_from_caller(&state); + } else { + unwind_init_from_task(&state, task); + } + + unwind(&state, &reliable, consume_entry, cookie); + + return reliable ? 0 : -EINVAL; } diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index c771e94568b9b..9fa3f3472acf4 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -181,21 +181,12 @@ static inline void fp_user_discard(void) if (!system_supports_sve()) return; - /* - * If SME is not active then disable SVE, the registers will - * be cleared when userspace next attempts to access them and - * we do not need to track the SVE register state until then. - */ - clear_thread_flag(TIF_SVE); + if (test_thread_flag(TIF_SVE)) { + unsigned int sve_vq_minus_one; - /* - * task_fpsimd_load() won't be called to update CPACR_EL1 in - * ret_to_user unless TIF_FOREIGN_FPSTATE is still set, which only - * happens if a context switch or kernel_neon_begin() or context - * modification (sigreturn, ptrace) intervenes. - * So, ensure that CPACR_EL1 is already correct for the fast-path case. - */ - sve_user_disable(); + sve_vq_minus_one = sve_vq_from_vl(task_get_sve_vl(current)) - 1; + sve_flush_live(true, sve_vq_minus_one); + } } void do_el0_svc(struct pt_regs *regs) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 45131e354e27f..d62f24c288b9c 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -121,6 +121,12 @@ jiffies = jiffies_64; #define TRAMP_TEXT #endif +#define SYM_CODE_FUNCTIONS \ + . = ALIGN(16); \ + __sym_code_functions_start = .; \ + KEEP(*(sym_code_functions)) \ + __sym_code_functions_end = .; + /* * The size of the PE/COFF section that covers the kernel image, which * runs from _stext to _edata, must be a round multiple of the PE/COFF @@ -246,6 +252,7 @@ SECTIONS CON_INITCALL INIT_RAM_FS *(.init.altinstructions .init.bss) /* from the EFI stub */ + SYM_CODE_FUNCTIONS } .exit.data : { EXIT_DATA diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index de94515fb17c6..85024cccd9d71 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -114,22 +114,6 @@ static int kvm_arm_default_max_vcpus(void) return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; } -static void set_default_spectre(struct kvm *kvm) -{ - /* - * The default is to expose CSV2 == 1 if the HW isn't affected. - * Although this is a per-CPU feature, we make it global because - * asymmetric systems are just a nuisance. - * - * Userspace can override this as long as it doesn't promise - * the impossible. 
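arch_stack_walk_reliable() is the arch hook behind the generic stack_trace_save_tsk_reliable() API that becomes available once the architecture advertises HAVE_RELIABLE_STACKTRACE, and it is what consumers such as the livepatch transition code build on. A rough sketch of such a consumer follows; the buffer size and the check_task_stack() name are illustrative rather than taken from kernel/livepatch/.

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/stacktrace.h>

#define TRACE_DEPTH	64

/*
 * Returns 0 if @task's stack could be walked reliably, or a negative
 * errno (typically -EINVAL propagated from arch_stack_walk_reliable())
 * if the walk hit one of the unreliable cases described above; the
 * caller would then retry the task later.
 */
static int check_task_stack(struct task_struct *task)
{
	/* Static to stay off the kernel stack; callers must serialise. */
	static unsigned long entries[TRACE_DEPTH];
	int nr_entries;

	nr_entries = stack_trace_save_tsk_reliable(task, entries,
						   ARRAY_SIZE(entries));
	if (nr_entries < 0)
		return nr_entries;

	/* ... inspect entries[0..nr_entries - 1] for patched functions ... */
	return 0;
}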
- */ - if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) - kvm->arch.pfr0_csv2 = 1; - if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) - kvm->arch.pfr0_csv3 = 1; -} - /** * kvm_arch_init_vm - initializes a VM data structure * @kvm: pointer to the KVM struct @@ -171,8 +155,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* The maximum number of VCPUs is limited by the host's GIC model */ kvm->max_vcpus = kvm_arm_default_max_vcpus(); - set_default_spectre(kvm); kvm_arm_init_hypercalls(kvm); + kvm_arm_init_id_regs(kvm); return ret; out_free_stage2_pgd: diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index ec8e4494873d4..ec82d0191f767 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -75,11 +75,12 @@ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) { BUG_ON(!current->mm); - BUG_ON(test_thread_flag(TIF_SVE)); if (!system_supports_fpsimd()) return; + fpsimd_kvm_prepare(); + vcpu->arch.fp_state = FP_STATE_HOST_OWNED; vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); @@ -129,9 +130,16 @@ void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu) */ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) { + enum fp_type fp_type; + WARN_ON_ONCE(!irqs_disabled()); if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) { + if (vcpu_has_sve(vcpu)) + fp_type = FP_STATE_SVE; + else + fp_type = FP_STATE_FPSIMD; + /* * Currently we do not support SME guests so SVCR is * always 0 and we just need a variable to point to. @@ -139,10 +147,10 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) fpsimd_bind_state_to_cpu(&vcpu->arch.ctxt.fp_regs, vcpu->arch.sve_state, vcpu->arch.sve_max_vl, - NULL, 0, &vcpu->arch.svcr); + NULL, 0, &vcpu->arch.svcr, + &vcpu->arch.fp_type, fp_type); clear_thread_flag(TIF_FOREIGN_FPSTATE); - update_thread_flag(TIF_SVE, vcpu_has_sve(vcpu)); } } @@ -199,7 +207,5 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); } - update_thread_flag(TIF_SVE, 0); - local_irq_restore(flags); } diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 435346ea1504e..81e30d56bf28b 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -15,13 +15,15 @@ #include #include #include +#include .text /* * u64 __guest_enter(struct kvm_vcpu *vcpu); */ -SYM_FUNC_START(__guest_enter) +SYM_CODE_START(__guest_enter) + UNWIND_HINT_FUNC // x0: vcpu // x1-x17: clobbered by macros // x29: guest context @@ -88,6 +90,7 @@ SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) // vcpu x0-x1 on the stack // If the hyp context is loaded, go straight to hyp_panic + UNWIND_HINT_FUNC get_loaded_vcpu x0, x1 cbnz x0, 1f b hyp_panic @@ -110,6 +113,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // x1: vcpu // x2-x29,lr: vcpu regs // vcpu x0-x1 on the stack + UNWIND_HINT_FUNC sp_offset=16 add x1, x1, #VCPU_CONTEXT @@ -199,6 +203,7 @@ abort_guest_exit_end: msr daifset, #4 // Mask aborts ret + UNWIND_HINT_FUNC _kvm_extable abort_guest_exit_start, 9997f _kvm_extable abort_guest_exit_end, 9997f 9997: @@ -212,4 +217,4 @@ abort_guest_exit_end: msr spsr_el2, x4 orr x0, x0, x5 1: ret -SYM_FUNC_END(__guest_enter) +SYM_CODE_END(__guest_enter) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 8f3f93fa119ed..d80fe00450368 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -14,6 +14,7 @@ #include #include #include +#include .macro save_caller_saved_regs_vect /* x0 and x1 were saved in the vector 
entry */ @@ -150,6 +151,7 @@ SYM_CODE_END(\label) .macro valid_vect target .align 7 + UNWIND_HINT_FUNC 661: esb stp x0, x1, [sp, #-16]! @@ -161,6 +163,7 @@ check_preamble_length 661b, 662b .macro invalid_vect target .align 7 + UNWIND_HINT_FUNC 661: nop stp x0, x1, [sp, #-16]! @@ -208,6 +211,7 @@ SYM_CODE_END(__kvm_hyp_vector) .macro hyp_ventry indirect, spectrev2 .align 7 1: esb + UNWIND_HINT_FUNC .if \spectrev2 != 0 spectrev2_smccc_wa1_smc .else diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 07f9dc9848ef1..0faa330a41edb 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -195,7 +195,7 @@ struct kvm_mem_range { u64 end; }; -static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) +static struct memblock_region *find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) { int cur, left = 0, right = hyp_memblock_nr; struct memblock_region *reg; @@ -218,18 +218,28 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) } else { range->start = reg->base; range->end = end; - return true; + return reg; } } - return false; + return NULL; } bool addr_is_memory(phys_addr_t phys) { struct kvm_mem_range range; - return find_mem_range(phys, &range); + return !!find_mem_range(phys, &range); +} + +static bool addr_is_allowed_memory(phys_addr_t phys) +{ + struct memblock_region *reg; + struct kvm_mem_range range; + + reg = find_mem_range(phys, &range); + + return reg && !(reg->flags & MEMBLOCK_NOMAP); } static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) @@ -348,7 +358,7 @@ static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot pr static int host_stage2_idmap(u64 addr) { struct kvm_mem_range range; - bool is_memory = find_mem_range(addr, &range); + bool is_memory = !!find_mem_range(addr, &range); enum kvm_pgtable_prot prot; int ret; @@ -414,7 +424,7 @@ struct pkvm_mem_share { struct check_walk_data { enum pkvm_page_state desired; - enum pkvm_page_state (*get_page_state)(kvm_pte_t pte); + enum pkvm_page_state (*get_page_state)(kvm_pte_t pte, u64 addr); }; static int __check_page_state_visitor(u64 addr, u64 end, u32 level, @@ -425,10 +435,7 @@ static int __check_page_state_visitor(u64 addr, u64 end, u32 level, struct check_walk_data *d = arg; kvm_pte_t pte = *ptep; - if (kvm_pte_valid(pte) && !addr_is_memory(kvm_pte_to_phys(pte))) - return -EINVAL; - - return d->get_page_state(pte) == d->desired ? 0 : -EPERM; + return d->get_page_state(pte, addr) == d->desired ? 
0 : -EPERM; } static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, @@ -443,8 +450,11 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, return kvm_pgtable_walk(pgt, addr, size, &walker); } -static enum pkvm_page_state host_get_page_state(kvm_pte_t pte) +static enum pkvm_page_state host_get_page_state(kvm_pte_t pte, u64 addr) { + if (!addr_is_allowed_memory(addr)) + return PKVM_NOPAGE; + if (!kvm_pte_valid(pte) && pte) return PKVM_NOPAGE; @@ -511,7 +521,7 @@ static int host_initiate_unshare(u64 *completer_addr, return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED); } -static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte) +static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte, u64 addr) { if (!kvm_pte_valid(pte)) return PKVM_NOPAGE; diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 6cb638b184b18..493b630eacf2f 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -166,7 +166,7 @@ static u32 __vgic_v3_read_ap0rn(int n) val = read_gicreg(ICH_AP0R3_EL2); break; default: - unreachable(); + BUG(); } return val; @@ -190,7 +190,7 @@ static u32 __vgic_v3_read_ap1rn(int n) val = read_gicreg(ICH_AP1R3_EL2); break; default: - unreachable(); + BUG(); } return val; diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index c7e5f6a28c28b..a93648bc47041 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -1014,3 +1014,14 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) return -ENXIO; } + +u8 kvm_arm_pmu_get_pmuver_limit(void) +{ + u64 tmp; + + tmp = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + tmp = cpuid_feature_cap_perfmon_field(tmp, + ID_AA64DFR0_EL1_PMUVer_SHIFT, + ID_AA64DFR0_EL1_PMUVer_V3P4); + return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp); +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 457e74f1f6717..9040b20970aff 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -39,6 +39,9 @@ * 64bit interface. 
*/ +static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val); +static u64 kvm_arm_update_id_reg(const struct kvm_vcpu *vcpu, u32 id, u64 val); +static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding); static u64 sys_reg_to_index(const struct sys_reg_desc *reg); static bool read_from_write_only(struct kvm_vcpu *vcpu, @@ -270,7 +273,7 @@ static bool trap_loregion(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + u64 val = kvm_arm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); u32 sr = reg_to_encoding(r); if (!(val & (0xfUL << ID_AA64MMFR1_EL1_LO_SHIFT))) { @@ -449,10 +452,11 @@ static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return 0; } -static void reset_bvr(struct kvm_vcpu *vcpu, +static u64 reset_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val; + return rd->val; } static bool trap_bcr(struct kvm_vcpu *vcpu, @@ -485,10 +489,11 @@ static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return 0; } -static void reset_bcr(struct kvm_vcpu *vcpu, +static u64 reset_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val; + return rd->val; } static bool trap_wvr(struct kvm_vcpu *vcpu, @@ -522,10 +527,11 @@ static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return 0; } -static void reset_wvr(struct kvm_vcpu *vcpu, +static u64 reset_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val; + return rd->val; } static bool trap_wcr(struct kvm_vcpu *vcpu, @@ -558,25 +564,28 @@ static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return 0; } -static void reset_wcr(struct kvm_vcpu *vcpu, +static u64 reset_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val; + return rd->val; } -static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 amair = read_sysreg(amair_el1); vcpu_write_sys_reg(vcpu, amair, AMAIR_EL1); + return amair; } -static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 actlr = read_sysreg(actlr_el1); vcpu_write_sys_reg(vcpu, actlr, ACTLR_EL1); + return actlr; } -static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 mpidr; @@ -590,7 +599,10 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0); mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1); mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2); - vcpu_write_sys_reg(vcpu, (1ULL << 31) | mpidr, MPIDR_EL1); + mpidr |= (1ULL << 31); + vcpu_write_sys_reg(vcpu, mpidr, MPIDR_EL1); + + return mpidr; } static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, @@ -602,13 +614,13 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } -static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX); /* No PMU 
available, any PMU reg may UNDEF... */ if (!kvm_arm_support_pmu_v3()) - return; + return 0; n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT; n &= ARMV8_PMU_PMCR_N_MASK; @@ -617,33 +629,41 @@ static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= mask; + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0); + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK; + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK; + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 pmcr, val; /* No PMU available, PMCR_EL0 may UNDEF... */ if (!kvm_arm_support_pmu_v3()) - return; + return 0; pmcr = read_sysreg(pmcr_el0); /* @@ -655,6 +675,8 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; __vcpu_sys_reg(vcpu, r->reg) = val; + + return __vcpu_sys_reg(vcpu, r->reg); } static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) @@ -1079,36 +1101,173 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } -/* Read a sanitised cpufeature ID register by sys_reg_desc */ -static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) +static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, + s64 new, s64 cur) { - u32 id = reg_to_encoding(r); - u64 val; + struct arm64_ftr_bits kvm_ftr = *ftrp; - if (sysreg_visible_as_raz(vcpu, r)) - return 0; + /* Some features have different safe value type in KVM than host features */ + switch (id) { + case SYS_ID_AA64DFR0_EL1: + if (kvm_ftr.shift == ID_AA64DFR0_EL1_PMUVer_SHIFT) + kvm_ftr.type = FTR_LOWER_SAFE; + break; + case SYS_ID_DFR0_EL1: + if (kvm_ftr.shift == ID_DFR0_PERFMON_SHIFT) + kvm_ftr.type = FTR_LOWER_SAFE; + break; + } - val = read_sanitised_ftr_reg(id); + return arm64_ftr_safe_value(&kvm_ftr, new, cur); +} - switch (id) { +/** + * arm64_check_features() - Check if a feature register value constitutes + * a subset of features indicated by the idreg's KVM sanitised limit. + * + * This function will check if each feature field of @val is the "safe" value + * against idreg's KVM sanitised limit return from reset() callback. + * If a field value in @val is the same as the one in limit, it is always + * considered the safe value regardless For register fields that are not in + * writable, only the value in limit is considered the safe value. + * + * Return: 0 if all the fields are safe. Otherwise, return negative errno. 
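arm64_check_features() leans on the cpufeature descriptors: arm64_ftr_mask() yields the bit mask of one field and arm64_ftr_value() extracts it (sign-extending signed fields). For orientation only, here is a deliberately simplified, self-contained version of the rule for a single unsigned, lower-is-safe field; the field position in the example (ID_AA64ISAR0_EL1.SHA2 at bits [15:12]) is real, but field_is_safe() is a teaching aid that ignores signed fields, FTR_EXACT fields and the writable-mask handling done by the function below.

#include <linux/bits.h>
#include <linux/types.h>

/*
 * A userspace value is acceptable for an unsigned, lower-is-safe field
 * if it does not advertise more than the KVM-sanitised limit.
 */
static bool field_is_safe(u64 val, u64 limit, unsigned int shift,
			  unsigned int width)
{
	u64 mask = GENMASK_ULL(shift + width - 1, shift);
	u64 f_val = (val & mask) >> shift;
	u64 f_lim = (limit & mask) >> shift;

	/* Identical values are always safe ... */
	if (f_val == f_lim)
		return true;

	/* ... otherwise userspace may only claim less than the limit. */
	return f_val < f_lim;
}

/* Example: ID_AA64ISAR0_EL1.SHA2 lives in bits [15:12]. */
static bool sha2_is_safe(u64 val, u64 limit)
{
	return field_is_safe(val, limit, 12, 4);
}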
+ */ +static int arm64_check_features(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + const struct arm64_ftr_reg *ftr_reg; + const struct arm64_ftr_bits *ftrp = NULL; + u32 id = reg_to_encoding(rd); + u64 writable_mask = rd->val; + u64 limit = 0; + u64 mask = 0; + + /* If the register is RAZ we know the only safe value is 0. */ + if (sysreg_visible_as_raz(vcpu, rd)) + return val ? -E2BIG : 0; + + /* For hidden and unallocated idregs without reset, only val = 0 is allowed. */ + if (rd->reset) { + limit = rd->reset(vcpu, rd); + limit = kvm_arm_update_id_reg(vcpu, id, limit); + ftr_reg = get_arm64_ftr_reg(id); + if (!ftr_reg) + return -EINVAL; + ftrp = ftr_reg->ftr_bits; + } + + for (; ftrp && ftrp->width; ftrp++) { + s64 f_val, f_lim, safe_val; + u64 ftr_mask; + + ftr_mask = arm64_ftr_mask(ftrp); + if ((ftr_mask & writable_mask) != ftr_mask) + continue; + + f_val = arm64_ftr_value(ftrp, val); + f_lim = arm64_ftr_value(ftrp, limit); + mask |= ftr_mask; + + if (f_val == f_lim) + safe_val = f_val; + else + safe_val = kvm_arm64_ftr_safe_value(id, ftrp, f_val, f_lim); + + if (safe_val != f_val) + return -E2BIG; + } + + /* For fields that are not writable, values in limit are the safe values. */ + if ((val & ~mask) != (limit & ~mask)) + return -E2BIG; + + return 0; +} + +static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu) +{ + if (kvm_vcpu_has_pmu(vcpu)) + return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), + IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1)); + else if (test_bit(KVM_ARCH_FLAG_VCPU_HAS_IMP_DEF_PMU, &vcpu->kvm->arch.flags)) + return ID_AA64DFR0_EL1_PMUVer_IMP_DEF; + + return 0; +} + +static u8 perfmon_to_pmuver(u8 perfmon) +{ + switch (perfmon) { + case ID_DFR0_PERFMON_8_0: + return ID_AA64DFR0_EL1_PMUVer_IMP; + case ID_DFR0_PERFMON_IMP_DEF: + return ID_AA64DFR0_EL1_PMUVer_IMP_DEF; + default: + /* Anything ARMv8.1+ and NI have the same value. For now. */ + return perfmon; + } +} + +static u8 pmuver_to_perfmon(u8 pmuver) +{ + switch (pmuver) { + case ID_AA64DFR0_EL1_PMUVer_IMP: + return ID_DFR0_PERFMON_8_0; + case ID_AA64DFR0_EL1_PMUVer_IMP_DEF: + return ID_DFR0_PERFMON_IMP_DEF; + default: + /* Anything ARMv8.1+ and NI have the same value. For now. 
*/ + return pmuver; + } +} + +static int pmuver_update(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val, + u8 pmuver, + bool valid_pmu) +{ + int ret; + + ret = set_id_reg(vcpu, rd, val); + if (ret) + return ret; + + if (valid_pmu) { + val = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1); + val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; + val |= FIELD_PREP(ID_AA64DFR0_EL1_PMUVer_MASK, pmuver); + IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1) = val; + + val = IDREG(vcpu->kvm, SYS_ID_DFR0_EL1); + val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), pmuver_to_perfmon(pmuver)); + IDREG(vcpu->kvm, SYS_ID_DFR0_EL1) = val; + } else { + assign_bit(KVM_ARCH_FLAG_VCPU_HAS_IMP_DEF_PMU, &vcpu->kvm->arch.flags, + pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF); + } + + return 0; +} + +static u64 general_read_kvm_sanitised_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) +{ + return read_sanitised_ftr_reg(reg_to_encoding(rd)); +} + +static u64 kvm_arm_update_id_reg(const struct kvm_vcpu *vcpu, u32 encoding, u64 val) +{ + switch (encoding) { case SYS_ID_AA64PFR0_EL1: if (!vcpu_has_sve(vcpu)) val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); - if (kvm_vgic_global_state.type == VGIC_V3) { - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), 1); - } break; case SYS_ID_AA64PFR1_EL1: if (!kvm_has_mte(vcpu->kvm)) val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); - - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) @@ -1121,29 +1280,51 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r if (!vcpu_has_ptrauth(vcpu)) val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); - if (!cpus_have_final_cap(ARM64_HAS_WFXT)) - val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); break; + } + + return val; +} + +static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding) +{ + u64 val = IDREG(vcpu->kvm, encoding); + + switch (encoding) { case SYS_ID_AA64DFR0_EL1: - /* Limit debug to ARMv8.0 */ - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6); - /* Limit guests to PMUv3 for ARMv8.4 */ - val = cpuid_feature_cap_perfmon_field(val, - ID_AA64DFR0_EL1_PMUVer_SHIFT, - kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_EL1_PMUVer_V3P4 : 0); - /* Hide SPE from guests */ - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer); + /* Set PMUver to the required version */ + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), + vcpu_pmuver(vcpu)); break; case SYS_ID_DFR0_EL1: - /* Limit guests to PMUv3 for ARMv8.4 */ - val = cpuid_feature_cap_perfmon_field(val, - ID_DFR0_PERFMON_SHIFT, - kvm_vcpu_has_pmu(vcpu) ? 
ID_DFR0_PERFMON_8_4 : 0); + val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), + pmuver_to_perfmon(vcpu_pmuver(vcpu))); break; } - return val; + return kvm_arm_update_id_reg(vcpu, encoding, val); +} + +/* Read a sanitised cpufeature ID register by sys_reg_desc */ +static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) +{ + if (sysreg_visible_as_raz(vcpu, r)) + return 0; + + return kvm_arm_read_id_reg(vcpu, reg_to_encoding(r)); +} + +/* + * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is + * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8. + */ +static inline bool is_id_reg(u32 id) +{ + return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && + sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && + sys_reg_CRm(id) < 8); } static unsigned int id_visibility(const struct kvm_vcpu *vcpu, @@ -1204,47 +1385,438 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } +static u64 read_sanitised_id_mmfr4_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val; + u32 id = reg_to_encoding(rd); + + val = read_sanitised_ftr_reg(id); + + /* CCIDX is not supported */ + val &= ~ARM64_FEATURE_MASK(ID_MMFR4_CCIDX); + + return val; +} + +static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val; + u32 id = reg_to_encoding(rd); + + val = read_sanitised_ftr_reg(id); + /* + * The default is to expose CSV2 == 1 if the HW isn't affected. + * Although this is a per-CPU feature, we make it global because + * asymmetric systems are just a nuisance. + * + * Userspace can override this as long as it doesn't promise + * the impossible. + */ + if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) { + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), 1); + } + if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) { + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), 1); + } + + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU); + + if (kvm_vgic_global_state.type == VGIC_V3) { + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), 1); + } + + return val; +} + +static u64 read_sanitised_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val; + u32 id = reg_to_encoding(rd); + + val = read_sanitised_ftr_reg(id); + + /* SME is not supported */ + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); + + if (!system_supports_sve()) + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE); + + return val; +} + static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { - u8 csv2, csv3; + int fp, simd; + bool has_sve = id_aa64pfr0_sve(val); + + simd = cpuid_feature_extract_signed_field(val, ID_AA64PFR0_EL1_AdvSIMD_SHIFT); + fp = cpuid_feature_extract_signed_field(val, ID_AA64PFR0_EL1_FP_SHIFT); + /* AdvSIMD field must have the same value as FP field */ + if (simd != fp) + return -EINVAL; + + /* fp must be supported when sve is supported */ + if (has_sve && (fp < 0)) + return -EINVAL; + + /* Check if there is a conflict with a request via KVM_ARM_VCPU_INIT */ + if (vcpu_has_sve(vcpu) ^ has_sve) + return -EPERM; + + return set_id_reg(vcpu, rd, val); +} + +static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val; + u32 id = reg_to_encoding(rd); + + val = 
read_sanitised_ftr_reg(id); + /* Limit debug to ARMv8.0 */ + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6); + /* + * Initialise the default PMUver before there is a chance to + * create an actual PMU. + */ + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), + kvm_arm_pmu_get_pmuver_limit()); + /* Hide SPE from guests */ + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer); + + return val; +} + +static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 pmuver, host_pmuver, brps, ctx_cmps; + bool valid_pmu; + + brps = FIELD_GET(ID_AA64DFR0_EL1_BRPs_MASK, val); + ctx_cmps = FIELD_GET(ID_AA64DFR0_EL1_CTX_CMPs_MASK, val); + if (ctx_cmps > brps) + return -EINVAL; + + host_pmuver = kvm_arm_pmu_get_pmuver_limit(); /* - * Allow AA64PFR0_EL1.CSV2 to be set from userspace as long as - * it doesn't promise more than what is actually provided (the - * guest could otherwise be covered in ectoplasmic residue). + * Allow AA64DFR0_EL1.PMUver to be set from userspace as long + * as it doesn't promise more than what the HW gives us. We + * allow an IMPDEF PMU though, only if no PMU is supported + * (KVM backward compatibility handling). */ - csv2 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV2_SHIFT); - if (csv2 > 1 || - (csv2 && arm64_get_spectre_v2_state() != SPECTRE_UNAFFECTED)) + pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), val); + if ((pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver > host_pmuver)) return -EINVAL; - /* Same thing for CSV3 */ - csv3 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV3_SHIFT); - if (csv3 > 1 || - (csv3 && arm64_get_meltdown_state() != SPECTRE_UNAFFECTED)) + valid_pmu = (pmuver != 0 && pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF); + + /* Make sure view register and PMU support do match */ + if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) return -EINVAL; - /* We can only differ with CSV[23], and anything else is an error */ - val ^= read_id_reg(vcpu, rd); - val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3)); - if (val) + if (!valid_pmu) { + /* + * Ignore the PMUVer field in @val. The PMUVer would be determined + * by arch flags bit KVM_ARCH_FLAG_VCPU_HAS_IMP_DEF_PMU, + */ + pmuver = FIELD_GET(ID_AA64DFR0_EL1_PMUVer_MASK, + IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1)); + val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; + val |= FIELD_PREP(ID_AA64DFR0_EL1_PMUVer_MASK, pmuver); + } + + return pmuver_update(vcpu, rd, val, pmuver, valid_pmu); +} + +static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val; + u32 id = reg_to_encoding(rd); + + val = read_sanitised_ftr_reg(id); + /* + * Initialise the default PMUver before there is a chance to + * create an actual PMU. + */ + val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), kvm_arm_pmu_get_pmuver_limit()); + + return val; +} + +static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 perfmon, host_perfmon; + bool valid_pmu; + + host_perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); + + /* + * Allow DFR0_EL1.PerfMon to be set from userspace as long as + * it doesn't promise more than what the HW gives us on the + * AArch64 side (as everything is emulated with that), and + * that this is a PMUv3. 
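These setters exist so that a VMM can shrink the ID register view it gives a guest through the normal KVM_GET_ONE_REG/KVM_SET_ONE_REG interface. A hedged userspace sketch of that flow: it lowers ID_AA64DFR0_EL1.PMUVer to plain PMUv3 on a vCPU created with KVM_ARM_VCPU_PMU_V3 and not yet run (see the -EBUSY check added further down). Only ARM64_SYS_REG() and the ioctls come from the UAPI headers; the locally defined macros and limit_guest_pmuver() are illustrative.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <asm/kvm.h>			/* ARM64_SYS_REG() */

#define REG_ID_AA64DFR0_EL1	ARM64_SYS_REG(3, 0, 0, 5, 0)
#define PMUVER_SHIFT		8
#define PMUVER_MASK		(0xfULL << PMUVER_SHIFT)
#define PMUVER_PMUV3		1ULL	/* FEAT_PMUv3, Armv8.0 */

static int limit_guest_pmuver(int vcpu_fd)
{
	uint64_t val;
	struct kvm_one_reg reg = {
		.id	= REG_ID_AA64DFR0_EL1,
		.addr	= (uint64_t)(uintptr_t)&val,
	};

	if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg))
		return -1;

	/* Keep the PMU, just advertise an older architecture version. */
	val &= ~PMUVER_MASK;
	val |= PMUVER_PMUV3 << PMUVER_SHIFT;

	return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}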
+ */ + perfmon = FIELD_GET(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), val); + if ((perfmon != ID_DFR0_PERFMON_IMP_DEF && perfmon > host_perfmon) || + (perfmon != 0 && perfmon < ID_DFR0_PERFMON_8_0)) return -EINVAL; - vcpu->kvm->arch.pfr0_csv2 = csv2; - vcpu->kvm->arch.pfr0_csv3 = csv3; + valid_pmu = (perfmon != 0 && perfmon != ID_DFR0_PERFMON_IMP_DEF); - return 0; + /* Make sure view register and PMU support do match */ + if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) + return -EINVAL; + + if (!valid_pmu) { + /* + * Ignore the PerfMon field in @val. The PerfMon would be determined + * by arch flags bit KVM_ARCH_FLAG_VCPU_HAS_IMP_DEF_PMU, + */ + perfmon = FIELD_GET(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), + IDREG(vcpu->kvm, SYS_ID_DFR0_EL1)); + val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), perfmon); + } + + return pmuver_update(vcpu, rd, val, perfmon_to_pmuver(perfmon), valid_pmu); +} + +static int set_id_aa64isar0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 sm4, sm3, sha1, sha2, sha3; + + sm4 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4), val); + sm3 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3), val); + sha1 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1), val); + sha2 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2), val); + sha3 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3), val); + + /* + * From Arm Architecture Reference Manual for A-profile architecture + * (https://developer.arm.com/documentation/ddi0487/latest/) + * D19.2.61: + * SM4, bits [43:40] + * This field must have the same value as ID_AA64ISAR0_EL1.SM3. + */ + if (sm4 != sm3) + return -EINVAL; + + /* + * From Arm Architecture Reference Manual for A-profile architecture + * (https://developer.arm.com/documentation/ddi0487/latest/) + * D19.2.61: + * SHA1, bits [11:8] + * If the value of ID_AA64ISAR0_EL1.SHA2 is 0b0000, + * this field must have the value 0b0000. + * SHA2, bits [15:12] + * If the value of this field is 0b0010, + * ID_AA64ISAR0_EL1.SHA3 must have the value 0b0001. + * SHA3, bits [35:32] + * If the value of ID_AA64ISAR0_EL1.SHA1 is 0b0000, + * this field must have the value 0b0000. 
+ */ + if (!sha1) { + if (sha2 || sha3) + return -EINVAL; + } else { + if (sha3 && (sha2 != 0b0010)) + return -EINVAL; + } + + return set_id_reg(vcpu, rd, val); +} + +static int set_id_aa64isar1_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 zfr0_i8mm, zfr0_bf16, gpa3, sme; + u8 i8mm, bf16, gpi, gpa; + int advsimd; + + /* Fields in the register we're trying to set - ISAR1 */ + i8mm = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM), val); + bf16 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16), val); + gpi = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), val); + gpa = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), val); + + /* Fields in ZFR0 */ + zfr0_i8mm = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ZFR0_EL1_I8MM), + IDREG(vcpu->kvm, SYS_ID_AA64ZFR0_EL1)); + zfr0_bf16 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ZFR0_EL1_BF16), + IDREG(vcpu->kvm, SYS_ID_AA64ZFR0_EL1)); + + /* Fields in ISAR2 */ + gpa3 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3), + IDREG(vcpu->kvm, SYS_ID_AA64ISAR2_EL1)); + + /* Fields in PFR1 */ + sme = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME), + IDREG(vcpu->kvm, SYS_ID_AA64PFR1_EL1)); + + /* Fields in PFR0 */ + advsimd = cpuid_feature_extract_signed_field(IDREG(vcpu->kvm, + SYS_ID_AA64PFR0_EL1), + ID_AA64PFR0_EL1_AdvSIMD_SHIFT); + + /* + * From Arm Architecture Reference Manual for A-profile architecture + * (https://developer.arm.com/documentation/ddi0487/latest/) + * D19.2.62: + * I8MM, bits [55:52] + * When Advanced SIMD and SVE are both implemented, this field must + * return the same value as ID_AA64ZFR0_EL1.I8MM. + */ + if (vcpu_has_sve(vcpu) && advsimd) { + if (i8mm != zfr0_i8mm) + return -EINVAL; + } + + /* + * From Arm Architecture Reference Manual for A-profile architecture + * (https://developer.arm.com/documentation/ddi0487/latest/) + * D19.2.62: + * BF16, bits [47:44] + * When FEAT_SVE or FEAT_SME is implemented, this field must return + * the same value as ID_AA64ZFR0_EL1.BF16. + */ + if (vcpu_has_sve(vcpu) || sme) { + if (bf16 != zfr0_bf16) + return -EINVAL; + } + + /* + * From Arm Architecture Reference Manual for A-profile architecture + * (https://developer.arm.com/documentation/ddi0487/latest/) + * D19.2.62: + * GPI, bits [31:28] + * If the value of ID_AA64ISAR1_EL1.GPA is nonzero, or the value of + * ID_AA64ISAR2_EL1.GPA3 is nonzero, this field must have the value + * 0b0000. + */ + if (gpi && (gpa || gpa3)) { + return -EINVAL; + } + + /* + * From Arm Architecture Reference Manual for A-profile architecture + * (https://developer.arm.com/documentation/ddi0487/latest/) + * D19.2.62: + * GPA, bits [27:24] + * If the value of ID_AA64ISAR1_EL1.GPI is nonzero, or the value of + * ID_AA64ISAR2_EL1.GPA3 is nonzero, this field must have the value + * 0b0000. + */ + if (gpa && (gpi || gpa3)) { + return -EINVAL; + } + + /* Check ptrauth state matches that requested in vcpu features */ + if ((gpi || gpa || gpa3) != vcpu_has_ptrauth(vcpu)) + return -EINVAL; + + /* + * No need to validate API or APA, since they are FTR_EXACT they must + * match the host value. And who are we to argue if the host screwed + * these up. 
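The pointer-authentication checks here and in the ID_AA64ISAR2_EL1 setter below enforce one combined rule. Restated as a single illustrative predicate (a paraphrase, not code from the patch): at most one of GPI, GPA and GPA3 may be non-zero, and generic authentication may only be advertised when the vCPU was created with the ptrauth features.

#include <linux/types.h>

/* Illustrative restatement of the GPI/GPA/GPA3 consistency rule. */
static bool ptrauth_generic_bits_ok(u8 gpi, u8 gpa, u8 gpa3, bool has_ptrauth)
{
	int nr_algos = !!gpi + !!gpa + !!gpa3;

	/* Only one generic authentication algorithm may be advertised ... */
	if (nr_algos > 1)
		return false;

	/* ... and only if ptrauth was requested via KVM_ARM_VCPU_INIT. */
	return (nr_algos == 1) == has_ptrauth;
}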
+ */ + + return set_id_reg(vcpu, rd, val); +} + +static int set_id_aa64isar2_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 gpi, gpa, gpa3; + + /* Fields in the register we're trying to set - ISAR2 */ + gpa3 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3), val); + + /* Fields in ISAR1 */ + gpi = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), + IDREG(vcpu->kvm, SYS_ID_AA64ISAR1_EL1)); + gpa = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), + IDREG(vcpu->kvm, SYS_ID_AA64ISAR1_EL1)); + + /* + * From Arm Architecture Reference Manual for A-profile architecture + * (https://developer.arm.com/documentation/ddi0487/latest/) + * D19.2.63: + * GPA3, bits [11:8] + * If the value of ID_AA64ISAR1_EL1.GPI is nonzero, or the value of + * ID_AA64ISAR1_EL1.GPA is nonzero, this field must have the value + * 0b0000. + */ + if (gpa3 && (gpi || gpa)) { + return -EINVAL; + } + + /* Check ptrauth state matches that requested in vcpu features */ + if ((gpi || gpa || gpa3) != vcpu_has_ptrauth(vcpu)) + return -EINVAL; + + /* + * No need to validate APA3, since it is FTR_EXACT it must match the + * host value. And who are we to argue if the host screwed it up. + */ + + return set_id_reg(vcpu, rd, val); +} + +static u64 read_sanitised_id_aa64isar2_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val; + u32 id = reg_to_encoding(rd); + + val = read_sanitised_ftr_reg(id); + + if (!cpus_have_final_cap(ARM64_HAS_WFXT)) + val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); + + return val; +} + +static u64 read_sanitised_id_aa64mmfr2_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val; + u32 id = reg_to_encoding(rd); + + val = read_sanitised_ftr_reg(id); + + /* CCIDX is not supported */ + val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; + + return val; } /* * cpufeature ID register user accessors * - * For now, these registers are immutable for userspace, so no values - * are stored, and for set_id_reg() we don't allow the effective value - * to be changed. + * For now, only some registers or some part of registers are mutable for + * userspace. For those registers immutable for userspace, in set_id_reg() + * we don't allow the effective value to be changed. */ static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val) @@ -1256,11 +1828,24 @@ static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { - /* This is what we mean by invariant: you can't change it. */ - if (val != read_id_reg(vcpu, rd)) - return -EINVAL; + u32 id = reg_to_encoding(rd); + int ret = 0; - return 0; + ret = arm64_check_features(vcpu, rd, val); + if (!ret) + IDREG(vcpu->kvm, id) = val; + + /* + * arm64_check_features() returns -E2BIG to indicate the register's + * feature set is a superset of the maximally-allowed register value. + * While it would be nice to precisely describe this to userspace, the + * existing UAPI for KVM_SET_ONE_REG has it that invalid register + * writes return -EINVAL. + */ + if (ret == -E2BIG) + ret = -EINVAL; + + return ret; } static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, @@ -1353,6 +1938,17 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .visibility = mte_visibility, \ } +/* + * Since reset() callback and field val are not used for idregs, they will be + * used for specific purposes for idregs. + * The reset() would return KVM sanitised register value. 
The value would be the + * same as the host kernel sanitised value if there is no KVM sanitisation. + * The val would be used as a mask indicating writable fields for the idreg. + * Only bits with 1 are writable from userspace. This mask might not be + * necessary in the future whenever all ID registers are enabled as writable + * from userspace. + */ + /* sys_reg_desc initialiser for known cpufeature ID registers */ #define ID_SANITISED(name) { \ SYS_DESC(SYS_##name), \ @@ -1360,6 +1956,18 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .get_user = get_id_reg, \ .set_user = set_id_reg, \ .visibility = id_visibility, \ + .reset = general_read_kvm_sanitised_reg,\ + .val = 0, \ +} + +#define ID_SANITISED_WRITABLE(name) { \ + SYS_DESC(SYS_##name), \ + .access = access_id_reg, \ + .get_user = get_id_reg, \ + .set_user = set_id_reg, \ + .visibility = id_visibility, \ + .reset = general_read_kvm_sanitised_reg,\ + .val = GENMASK(63, 0), \ } /* sys_reg_desc initialiser for known cpufeature ID registers */ @@ -1369,6 +1977,8 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .get_user = get_id_reg, \ .set_user = set_id_reg, \ .visibility = aa32_id_visibility, \ + .reset = general_read_kvm_sanitised_reg,\ + .val = 0, \ } /* @@ -1381,7 +1991,9 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .access = access_id_reg, \ .get_user = get_id_reg, \ .set_user = set_id_reg, \ - .visibility = raz_visibility \ + .visibility = raz_visibility, \ + .reset = NULL, \ + .val = 0, \ } /* @@ -1395,6 +2007,8 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .get_user = get_id_reg, \ .set_user = set_id_reg, \ .visibility = raz_visibility, \ + .reset = NULL, \ + .val = 0, \ } /* @@ -1460,7 +2074,13 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* CRm=1 */ AA32_ID_SANITISED(ID_PFR0_EL1), AA32_ID_SANITISED(ID_PFR1_EL1), - AA32_ID_SANITISED(ID_DFR0_EL1), + { SYS_DESC(SYS_ID_DFR0_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_dfr0_el1, + .visibility = aa32_id_visibility, + .reset = read_sanitised_id_dfr0_el1, + .val = GENMASK(63, 0), }, ID_HIDDEN(ID_AFR0_EL1), AA32_ID_SANITISED(ID_MMFR0_EL1), AA32_ID_SANITISED(ID_MMFR1_EL1), @@ -1474,7 +2094,13 @@ static const struct sys_reg_desc sys_reg_descs[] = { AA32_ID_SANITISED(ID_ISAR3_EL1), AA32_ID_SANITISED(ID_ISAR4_EL1), AA32_ID_SANITISED(ID_ISAR5_EL1), - AA32_ID_SANITISED(ID_MMFR4_EL1), + { SYS_DESC(SYS_ID_MMFR4_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_reg, + .visibility = aa32_id_visibility, + .reset = read_sanitised_id_mmfr4_el1, + .val = 0, }, AA32_ID_SANITISED(ID_ISAR6_EL1), /* CRm=3 */ @@ -1489,9 +2115,18 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* AArch64 ID registers */ /* CRm=4 */ - { SYS_DESC(SYS_ID_AA64PFR0_EL1), .access = access_id_reg, - .get_user = get_id_reg, .set_user = set_id_aa64pfr0_el1, }, - ID_SANITISED(ID_AA64PFR1_EL1), + { SYS_DESC(SYS_ID_AA64PFR0_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_aa64pfr0_el1, + .reset = read_sanitised_id_aa64pfr0_el1, + .val = GENMASK(63, 0), }, + { SYS_DESC(SYS_ID_AA64PFR1_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_reg, + .reset = read_sanitised_id_aa64pfr1_el1, + .val = 0, }, ID_UNALLOCATED(4,2), ID_UNALLOCATED(4,3), ID_SANITISED(ID_AA64ZFR0_EL1), @@ -1500,7 +2135,12 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(4,7), /* CRm=5 */ - ID_SANITISED(ID_AA64DFR0_EL1), + { 
SYS_DESC(SYS_ID_AA64DFR0_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_aa64dfr0_el1, + .reset = read_sanitised_id_aa64dfr0_el1, + .val = GENMASK(63, 0), }, ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), ID_UNALLOCATED(5,3), @@ -1510,9 +2150,24 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(5,7), /* CRm=6 */ - ID_SANITISED(ID_AA64ISAR0_EL1), - ID_SANITISED(ID_AA64ISAR1_EL1), - ID_SANITISED(ID_AA64ISAR2_EL1), + { SYS_DESC(SYS_ID_AA64ISAR0_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_aa64isar0_el1, + .reset = general_read_kvm_sanitised_reg, + .val = GENMASK(63, 0), }, + { SYS_DESC(SYS_ID_AA64ISAR1_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_aa64isar1_el1, + .reset = general_read_kvm_sanitised_reg, + .val = GENMASK(63, 0), }, + { SYS_DESC(SYS_ID_AA64ISAR2_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_aa64isar2_el1, + .reset = read_sanitised_id_aa64isar2_el1, + .val = GENMASK(63, 0), }, ID_UNALLOCATED(6,3), ID_UNALLOCATED(6,4), ID_UNALLOCATED(6,5), @@ -1520,9 +2175,14 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(6,7), /* CRm=7 */ - ID_SANITISED(ID_AA64MMFR0_EL1), - ID_SANITISED(ID_AA64MMFR1_EL1), - ID_SANITISED(ID_AA64MMFR2_EL1), + ID_SANITISED_WRITABLE(ID_AA64MMFR0_EL1), + ID_SANITISED_WRITABLE(ID_AA64MMFR1_EL1), + { SYS_DESC(SYS_ID_AA64MMFR2_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_reg, + .reset = read_sanitised_id_aa64mmfr2_el1, + .val = GENMASK(63, 0), }, ID_UNALLOCATED(7,3), ID_UNALLOCATED(7,4), ID_UNALLOCATED(7,5), @@ -1823,6 +2483,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 }, }; +static const struct sys_reg_desc *first_idreg; + static bool trap_dbgdidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -1830,8 +2492,8 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu, if (p->is_write) { return ignore_write(vcpu, p); } else { - u64 dfr = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); - u64 pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u64 dfr = kvm_arm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); + u64 pfr = kvm_arm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); u32 el3 = !!cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR0_EL1_EL3_SHIFT); p->regval = ((((dfr >> ID_AA64DFR0_EL1_WRPs_SHIFT) & 0xf) << 28) | @@ -2621,10 +3283,11 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id, */ #define FUNCTION_INVARIANT(reg) \ - static void get_##reg(struct kvm_vcpu *v, \ + static u64 get_##reg(struct kvm_vcpu *v, \ const struct sys_reg_desc *r) \ { \ ((struct sys_reg_desc *)r)->val = read_sysreg(reg); \ + return ((struct sys_reg_desc *)r)->val; \ } FUNCTION_INVARIANT(midr_el1) @@ -2632,9 +3295,10 @@ FUNCTION_INVARIANT(revidr_el1) FUNCTION_INVARIANT(clidr_el1) FUNCTION_INVARIANT(aidr_el1) -static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) +static u64 get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) { ((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0); + return ((struct sys_reg_desc *)r)->val; } /* ->val is filled in by kvm_sys_reg_table_init() */ @@ -2772,6 +3436,9 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, if (!r) return -ENOENT; + if (is_id_reg(reg_to_encoding(r))) + mutex_lock(&vcpu->kvm->arch.config_lock); + if (r->get_user) { ret = (r->get_user)(vcpu, r, &val); } else { 
@@ -2779,6 +3446,9 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, ret = 0; } + if (is_id_reg(reg_to_encoding(r))) + mutex_unlock(&vcpu->kvm->arch.config_lock); + if (!ret) ret = put_user(val, uaddr); @@ -2816,9 +3486,21 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, if (!r) return -ENOENT; + /* Only allow userspace to change the idregs before VM running */ + if (is_id_reg(reg_to_encoding(r)) && + test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &vcpu->kvm->arch.flags) ) { + if (val == read_id_reg(vcpu, r)) + return 0; + return -EBUSY; + } + if (sysreg_user_write_ignore(vcpu, r)) return 0; + /* ID regs are global to the VM and cannot be updated concurrently */ + if (is_id_reg(reg_to_encoding(r))) + mutex_lock(&vcpu->kvm->arch.config_lock); + if (r->set_user) { ret = (r->set_user)(vcpu, r, val); } else { @@ -2826,6 +3508,9 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, ret = 0; } + if (is_id_reg(reg_to_encoding(r))) + mutex_unlock(&vcpu->kvm->arch.config_lock); + return ret; } @@ -2962,8 +3647,29 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return write_demux_regids(uindices); } +/* Initialize the guest's ID registers with KVM sanitised values. */ +void kvm_arm_init_id_regs(struct kvm *kvm) +{ + const struct sys_reg_desc *idreg = first_idreg; + u32 id = reg_to_encoding(idreg); + u64 val; + + /* Initialize all idregs */ + while (is_id_reg(id)) { + val = 0; + /* Read KVM sanitised register value if available */ + if (idreg->reset) + val = idreg->reset(NULL, idreg); + IDREG(kvm, id) = val; + + idreg++; + id = reg_to_encoding(idreg); + } +} + int kvm_sys_reg_table_init(void) { + struct sys_reg_params params; bool valid = true; unsigned int i; struct sys_reg_desc clidr; @@ -3000,6 +3706,12 @@ int kvm_sys_reg_table_init(void) break; /* Clear all higher bits. */ cache_levels &= (1 << (i*3))-1; + + /* Find the first idreg (SYS_ID_PFR0_EL1) in sys_reg_descs. */ + params = encoding_to_params(SYS_ID_PFR0_EL1); + first_idreg = find_reg(¶ms, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); + if (!first_idreg) + return -EINVAL; return 0; } diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index e4ebb3a379fdb..ec3749147cd7c 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -27,6 +27,13 @@ struct sys_reg_params { bool is_write; }; +#define encoding_to_params(reg) \ + ((struct sys_reg_params){ .Op0 = sys_reg_Op0(reg), \ + .Op1 = sys_reg_Op1(reg), \ + .CRn = sys_reg_CRn(reg), \ + .CRm = sys_reg_CRm(reg), \ + .Op2 = sys_reg_Op2(reg) }) + #define esr_sys64_to_params(esr) \ ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \ .Op1 = ((esr) >> 14) & 0x7, \ @@ -64,13 +71,16 @@ struct sys_reg_desc { struct sys_reg_params *, const struct sys_reg_desc *); - /* Initialization for vcpu. */ - void (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *); + /* + * Initialization for vcpu. Return initialized value, or KVM + * sanitized value for ID registers. + */ + u64 (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *); /* Index into sys_reg[], or 0 if we don't need to save it. 
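The -EBUSY/config_lock handling above means a VMM can only tailor the ID registers between vCPU creation and the first KVM_RUN; later writes must match the already-configured value. As an illustrative userspace sketch (not part of the patch; it assumes an arm64 host and an already-created vcpu_fd), shrinking an ID register via KVM_SET_ONE_REG looks roughly like this:

#include <linux/kvm.h>      /* struct kvm_one_reg, KVM_SET_ONE_REG, ARM64_SYS_REG */
#include <sys/ioctl.h>
#include <stdint.h>

/* ID_AA64ISAR0_EL1 is encoded as op0=3, op1=0, CRn=0, CRm=6, op2=0 */
#define REG_ID_AA64ISAR0_EL1 ARM64_SYS_REG(3, 0, 0, 6, 0)

/*
 * Must be issued before the first KVM_RUN on any vCPU of the VM;
 * afterwards the kernel only accepts a value identical to the current
 * one and returns -EBUSY otherwise.
 */
static int set_id_aa64isar0(int vcpu_fd, uint64_t val)
{
        struct kvm_one_reg reg = {
                .id   = REG_ID_AA64ISAR0_EL1,
                .addr = (uint64_t)&val,
        };

        return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}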
*/ int reg; - /* Value (usually reset value) */ + /* Value (usually reset value), or write mask for idregs */ u64 val; /* Custom get/set_user functions, fallback to generic if NULL */ @@ -122,19 +132,21 @@ static inline bool read_zero(struct kvm_vcpu *vcpu, } /* Reset functions */ -static inline void reset_unknown(struct kvm_vcpu *vcpu, +static inline u64 reset_unknown(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); __vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL; + return __vcpu_sys_reg(vcpu, r->reg); } -static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); __vcpu_sys_reg(vcpu, r->reg) = r->val; + return __vcpu_sys_reg(vcpu, r->reg); } static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu, diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 4b4651ee47f27..b0ca3d6af9ed5 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -418,8 +418,6 @@ void __init bootmem_init(void) arm64_hugetlb_cma_reserve(); #endif - dma_pernuma_cma_reserve(); - kvm_hyp_reserve(); /* diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 4b302dbf78e96..4302653909e78 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -169,16 +169,17 @@ static bool pgattr_change_is_safe(u64 old, u64 new) return ((old ^ new) & ~mask) == 0; } -static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, +static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot) { - pte_t *ptep; - - ptep = pte_set_fixmap_offset(pmdp, addr); do { pte_t old_pte = READ_ONCE(*ptep); - set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); + /* + * Required barriers to make this visible to the table walker + * are deferred to the end of alloc_init_cont_pte(). + */ + set_pte_nosync(ptep, pfn_pte(__phys_to_pfn(phys), prot)); /* * After the PTE entry has been populated once, we @@ -189,8 +190,6 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); - - pte_clear_fixmap(); } static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, @@ -201,6 +200,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, { unsigned long next; pmd_t pmd = READ_ONCE(*pmdp); + pte_t *ptep; BUG_ON(pmd_sect(pmd)); if (pmd_none(pmd)) { @@ -216,6 +216,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, } BUG_ON(pmd_bad(pmd)); + ptep = pte_set_fixmap_offset(pmdp, addr); do { pgprot_t __prot = prot; @@ -226,20 +227,26 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, (flags & NO_CONT_MAPPINGS) == 0) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); - init_pte(pmdp, addr, next, phys, __prot); + init_pte(ptep, addr, next, phys, __prot); + ptep += pte_index(next) - pte_index(addr); phys += next - addr; } while (addr = next, addr != end); + + /* + * Note: barriers and maintenance necessary to clear the fixmap slot + * ensure that all previous pgtable writes are visible to the table + * walker. 
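The restructuring above maps each PTE table through the fixmap once, fills it with set_pte_nosync(), and leaves the barriers to the single pte_clear_fixmap() at the end of alloc_init_cont_pte(), so the synchronisation cost is paid per table rather than per entry. The same batch-then-publish idea, sketched with C11 atomics purely as an analogy (the kernel uses its own dsb()/fixmap primitives, not these):

#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Fill all slots with plain stores, then make the whole batch visible
 * with one release fence, instead of ordering every individual store.
 */
static void publish_table(uint64_t *slots, size_t n, atomic_int *ready)
{
        for (size_t i = 0; i < n; i++)
                slots[i] = 0xdead0000ULL + i;        /* no per-entry ordering */

        atomic_thread_fence(memory_order_release);   /* one barrier for the batch */
        atomic_store_explicit(ready, 1, memory_order_relaxed);
}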
+ */ + pte_clear_fixmap(); } -static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, +static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; - pmd_t *pmdp; - pmdp = pmd_set_fixmap_offset(pudp, addr); do { pmd_t old_pmd = READ_ONCE(*pmdp); @@ -265,8 +272,6 @@ static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, } phys += next - addr; } while (pmdp++, addr = next, addr != end); - - pmd_clear_fixmap(); } static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, @@ -276,6 +281,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, { unsigned long next; pud_t pud = READ_ONCE(*pudp); + pmd_t *pmdp; /* * Check for initial section mappings in the pgd/pud. @@ -294,6 +300,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, } BUG_ON(pud_bad(pud)); + pmdp = pmd_set_fixmap_offset(pudp, addr); do { pgprot_t __prot = prot; @@ -304,10 +311,13 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, (flags & NO_CONT_MAPPINGS) == 0) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); - init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags); + init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags); + pmdp += pmd_index(next) - pmd_index(addr); phys += next - addr; } while (addr = next, addr != end); + + pmd_clear_fixmap(); } static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, diff --git a/arch/arm64/mm/trans_pgd-asm.S b/arch/arm64/mm/trans_pgd-asm.S index 021c31573bcb6..148435248860d 100644 --- a/arch/arm64/mm/trans_pgd-asm.S +++ b/arch/arm64/mm/trans_pgd-asm.S @@ -8,10 +8,12 @@ #include #include #include +#include .macro invalid_vector label SYM_CODE_START_LOCAL(\label) .align 7 + UNWIND_HINT_EMPTY b \label SYM_CODE_END(\label) .endm @@ -19,6 +21,7 @@ SYM_CODE_END(\label) .macro el1_sync_vector SYM_CODE_START_LOCAL(el1_sync) .align 7 + UNWIND_HINT_EMPTY cmp x0, #HVC_SET_VECTORS /* Called from hibernate */ b.ne 1f msr vbar_el2, x1 diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 9794d9174795d..9367905943bd3 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -1,3 +1,4 @@ + // SPDX-License-Identifier: GPL-2.0 /* * kaslr.c @@ -32,6 +33,9 @@ #include #include +/* xen_cpuid_base/hypervisor_cpuid_base inlines */ +#include + #define _SETUP #include /* For COMMAND_LINE_SIZE */ #undef _SETUP @@ -835,6 +839,10 @@ void choose_random_location(unsigned long input, warn("KASLR disabled: 'nokaslr' on cmdline."); return; } + if (xen_cpuid_base() != 0) { + warn("KASLR disabled: Xen hypervisor detected."); + return; + } boot_params_ptr->hdr.loadflags |= KASLR_FLAG; diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 3c5d5c97f8f73..802872d9c68c5 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -257,6 +257,10 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) if (result != ES_OK) goto finish; + result = vc_check_opcode_bytes(&ctxt, exit_code); + if (result != ES_OK) + goto finish; + switch (exit_code) { case SVM_EXIT_RDTSC: case SVM_EXIT_RDTSCP: diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 3ac069a4559b0..b9a58954dc535 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -534,12 +534,8 @@ static void amd_pmu_cpu_reset(int cpu) /* Clear enable bits i.e. 
PerfCntrGlobalCtl.PerfCntrEn */ wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); - /* - * Clear freeze and overflow bits i.e. PerfCntrGLobalStatus.LbrFreeze - * and PerfCntrGLobalStatus.PerfCntrOvfl - */ - wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, - GLOBAL_STATUS_LBRS_FROZEN | amd_pmu_global_cntr_mask); + /* Clear overflow bits i.e. PerfCntrGLobalStatus.PerfCntrOvfl */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask); } static int amd_pmu_cpu_prepare(int cpu) @@ -574,7 +570,6 @@ static void amd_pmu_cpu_starting(int cpu) int i, nb_id; cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; - amd_pmu_cpu_reset(cpu); if (!x86_pmu.amd_nb_constraints) return; @@ -596,6 +591,8 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; + + amd_pmu_cpu_reset(cpu); } static void amd_pmu_cpu_dead(int cpu) @@ -616,6 +613,8 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } + + amd_pmu_cpu_reset(cpu); } static inline void amd_pmu_set_global_ctl(u64 ctl) diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 16f548a661cf6..1bf2ad34188ad 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -43,6 +43,20 @@ static inline uint32_t xen_cpuid_base(void) return hypervisor_cpuid_base("XenVMMXenVMM", 2); } +#ifdef CONFIG_XEN +extern bool __init xen_hvm_need_lapic(void); + +static inline bool __init xen_x2apic_para_available(void) +{ + return xen_hvm_need_lapic(); +} +#else +static inline bool __init xen_x2apic_para_available(void) +{ + return (xen_cpuid_base() != 0); +} +#endif + struct pci_dev; #ifdef CONFIG_XEN_PV_DOM0 diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index ed8ac6bcbafb2..a780572062bdc 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -48,7 +48,7 @@ enum allow_write_msrs { MSR_WRITES_DEFAULT, }; -static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT; +static enum allow_write_msrs allow_writes = MSR_WRITES_OFF; static ssize_t msr_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 18a034613d94d..06352e9acf62e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -52,6 +52,7 @@ #include #include #include +#include #include /* @@ -885,6 +886,9 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS; #endif +#ifdef CONFIG_RANDOMIZE_BASE + printk(KERN_INFO "KASLR %s\n", kaslr_enabled() ? "enabled" : "disabled"); +#endif /* * If we have OLPC OFW, we might end up relocating the fixmap due to diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 3fe76bf17d95e..ecaa466b50b9c 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -10,8 +10,13 @@ */ #ifndef __BOOT_COMPRESSED -#define error(v) pr_err(v) -#define has_cpuflag(f) boot_cpu_has(f) +#define error(v) pr_err(v) +#define has_cpuflag(f) boot_cpu_has(f) +#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__) +#define sev_printk_rtl(fmt, ...) printk_ratelimited(fmt, ##__VA_ARGS__) +#else +#define sev_printk(fmt, ...) +#define sev_printk_rtl(fmt, ...) 
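The kaslr.c hunk above relies on xen_cpuid_base() returning non-zero when running under Xen. Per the xen/hypervisor.h hunk, that helper boils down to scanning the hypervisor CPUID leaves for the "XenVMMXenVMM" signature; a stand-alone sketch of the same probe (illustrative only, using GCC's <cpuid.h> instead of the kernel's hypervisor_cpuid_base()) looks like this:

#include <cpuid.h>
#include <stdio.h>
#include <string.h>

/* Scan the hypervisor CPUID leaves (0x40000000 + n * 0x100) for Xen. */
static unsigned int find_xen_cpuid_base(void)
{
        unsigned int base, eax, signature[3];

        for (base = 0x40000000; base < 0x40010000; base += 0x100) {
                __cpuid(base, eax, signature[0], signature[1], signature[2]);
                /* Xen advertises at least two sub-leaves after the base. */
                if (!memcmp("XenVMMXenVMM", signature, 12) && (eax - base) >= 2)
                        return base;
        }
        return 0;
}

int main(void)
{
        printf("Xen CPUID base: 0x%x\n", find_xen_cpuid_base());
        return 0;
}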
#endif /* I/O parameters for CPUID-related helpers */ @@ -567,6 +572,7 @@ void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) { unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); + u16 opcode = *(unsigned short *)regs->ip; struct cpuid_leaf leaf; int ret; @@ -574,6 +580,10 @@ void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) if (exit_code != SVM_EXIT_CPUID) goto fail; + /* Is it really a CPUID insn? */ + if (opcode != 0xa20f) + goto fail; + leaf.fn = fn; leaf.subfn = subfn; @@ -1064,3 +1074,92 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) RIP_REL_REF(cpuid_ext_range_max) = fn->eax; } } + +static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + unsigned int opcode = (unsigned int)ctxt->insn.opcode.value; + u8 modrm = ctxt->insn.modrm.value; + + switch (exit_code) { + + case SVM_EXIT_IOIO: + case SVM_EXIT_NPF: + /* handled separately */ + return ES_OK; + + case SVM_EXIT_CPUID: + if (opcode == 0xa20f) + return ES_OK; + break; + + case SVM_EXIT_INVD: + if (opcode == 0x080f) + return ES_OK; + break; + + case SVM_EXIT_MONITOR: + if (opcode == 0x010f && modrm == 0xc8) + return ES_OK; + break; + + case SVM_EXIT_MWAIT: + if (opcode == 0x010f && modrm == 0xc9) + return ES_OK; + break; + + case SVM_EXIT_MSR: + /* RDMSR */ + if (opcode == 0x320f || + /* WRMSR */ + opcode == 0x300f) + return ES_OK; + break; + + case SVM_EXIT_RDPMC: + if (opcode == 0x330f) + return ES_OK; + break; + + case SVM_EXIT_RDTSC: + if (opcode == 0x310f) + return ES_OK; + break; + + case SVM_EXIT_RDTSCP: + if (opcode == 0x010f && modrm == 0xf9) + return ES_OK; + break; + + case SVM_EXIT_READ_DR7: + if (opcode == 0x210f && + X86_MODRM_REG(ctxt->insn.modrm.value) == 7) + return ES_OK; + break; + + case SVM_EXIT_VMMCALL: + if (opcode == 0x010f && modrm == 0xd9) + return ES_OK; + + break; + + case SVM_EXIT_WRITE_DR7: + if (opcode == 0x230f && + X86_MODRM_REG(ctxt->insn.modrm.value) == 7) + return ES_OK; + break; + + case SVM_EXIT_WBINVD: + if (opcode == 0x90f) + return ES_OK; + break; + + default: + break; + } + + sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n", + opcode, exit_code, ctxt->regs->ip); + + return ES_UNSUPPORTED; +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index f8a8249ae1177..7fe2b8d6db4fa 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1779,7 +1779,10 @@ static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, struct ghcb *ghcb, unsigned long exit_code) { - enum es_result result; + enum es_result result = vc_check_opcode_bytes(ctxt, exit_code); + + if (result != ES_OK) + return result; switch (exit_code) { case SVM_EXIT_READ_DR7: diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cafacb2e58cce..cece805541284 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -1575,3 +1576,31 @@ unsigned long calibrate_delay_is_known(void) return 0; } #endif + +static int tsc_pm_notifier(struct notifier_block *notifier, + unsigned long pm_event, void *unused) +{ + switch (pm_event) { + case PM_HIBERNATION_PREPARE: + clear_sched_clock_stable(); + break; + case PM_POST_HIBERNATION: + /* Set back to the default */ + if (!check_tsc_unstable()) + set_sched_clock_stable(); + break; + } + + return 0; +}; + +static struct notifier_block tsc_pm_notifier_block = { + .notifier_call = tsc_pm_notifier, 
+}; + +static int tsc_setup_pm_notifier(void) +{ + return register_pm_notifier(&tsc_pm_notifier_block); +} + +subsys_initcall(tsc_setup_pm_notifier); diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index c66807dd02703..2d1284f8cd2ae 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -36,6 +36,13 @@ static unsigned long shared_info_pfn; __ro_after_init bool xen_percpu_upcall; EXPORT_SYMBOL_GPL(xen_percpu_upcall); +void xen_hvm_map_shared_info(void) +{ + xen_hvm_init_shared_info(); + if(shared_info_pfn) + HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn)); +} + void xen_hvm_init_shared_info(void) { struct xen_add_to_physmap xatp; @@ -228,6 +235,7 @@ static void __init xen_hvm_guest_init(void) xen_panic_handler_init(); + xen_setup_syscore_ops(); xen_hvm_smp_init(); WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_hvm, xen_cpu_dead_hvm)); xen_unplug_emulated_devices(); @@ -258,9 +266,15 @@ static __init int xen_parse_no_vector_callback(char *arg) } early_param("xen_no_vector_callback", xen_parse_no_vector_callback); -static __init bool xen_x2apic_available(void) +bool __init xen_hvm_need_lapic(void) { - return x2apic_supported(); + if (xen_pv_domain()) + return false; + if (!xen_hvm_domain()) + return false; + if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) + return false; + return true; } static bool __init msi_ext_dest_id(void) @@ -327,7 +341,7 @@ struct hypervisor_x86 x86_hyper_xen_hvm __initdata = { .detect = xen_platform_hvm, .type = X86_HYPER_XEN_HVM, .init.init_platform = xen_hvm_guest_init, - .init.x2apic_available = xen_x2apic_available, + .init.x2apic_available = xen_x2apic_para_available, .init.init_mem_mapping = xen_hvm_init_mem_mapping, .init.guest_late_init = xen_hvm_guest_late_init, .init.msi_ext_dest_id = msi_ext_dest_id, diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 1d83152c761bc..8be6ffa6bfbea 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -2,17 +2,22 @@ #include #include #include +#include +#include #include #include +#include #include #include +#include #include #include #include #include #include +#include #include "xen-ops.h" #include "mmu.h" @@ -82,3 +87,65 @@ void xen_arch_suspend(void) on_each_cpu(xen_vcpu_notify_suspend, NULL, 1); } + +static int xen_syscore_suspend(void) +{ + struct xen_remove_from_physmap xrfp; + int cpu, ret; + + /* Xen suspend does similar stuffs in its own logic */ + if (xen_suspend_mode_is_xen_suspend()) + return 0; + + for_each_present_cpu(cpu) { + /* + * Nonboot CPUs are already offline, but the last copy of + * runstate info is still accessible. + */ + xen_save_steal_clock(cpu); + } + + xen_shutdown_pirqs(); + + xrfp.domid = DOMID_SELF; + xrfp.gpfn = __pa(HYPERVISOR_shared_info) >> PAGE_SHIFT; + + ret = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrfp); + if (!ret) + HYPERVISOR_shared_info = &xen_dummy_shared_info; + + return ret; +} + +static void xen_syscore_resume(void) +{ + /* Xen suspend does similar stuffs in its own logic */ + if (xen_suspend_mode_is_xen_suspend()) + return; + + /* No need to setup vcpu_info as it's already moved off */ + xen_hvm_map_shared_info(); + + pvclock_resume(); + + /* Nonboot CPUs will be resumed when they're brought up */ + xen_restore_steal_clock(smp_processor_id()); + + gnttab_resume(); + +} + +/* + * These callbacks will be called with interrupts disabled and when having only + * one CPU online. 
+ */ +static struct syscore_ops xen_hvm_syscore_ops = { + .suspend = xen_syscore_suspend, + .resume = xen_syscore_resume +}; + +void __init xen_setup_syscore_ops(void) +{ + if (xen_hvm_domain()) + register_syscore_ops(&xen_hvm_syscore_ops); +} diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 9ef0a5cca96ee..cc11dd2e2f481 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -548,6 +548,9 @@ static void xen_hvm_setup_cpu_clockevents(void) { int cpu = smp_processor_id(); xen_setup_runstate_info(cpu); + if (cpu) + xen_restore_steal_clock(cpu); + /* * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence * doing it xen_hvm_cpu_notify (which gets called by smp_init during diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index b2b2f4315b78d..5c626793c11e2 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -56,6 +56,8 @@ void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); +void xen_callback_vector(void); +void xen_hvm_map_shared_info(void); void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); diff --git a/crypto/Kconfig b/crypto/Kconfig index edf193aff23e7..69907bc4a0984 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1280,6 +1280,7 @@ endif # if CRYPTO_DRBG_MENU config CRYPTO_JITTERENTROPY tristate "CPU Jitter Non-Deterministic RNG (Random Number Generator)" select CRYPTO_RNG + select CRYPTO_SHA3 help CPU Jitter RNG (Random Number Generator) from the Jitterentropy library diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index 50c933f86b218..170f069823815 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -43,7 +43,7 @@ static void public_key_describe(const struct key *asymmetric_key, void public_key_free(struct public_key *key) { if (key) { - kfree(key->key); + kfree_sensitive(key->key); kfree(key->params); kfree(key); } @@ -218,7 +218,7 @@ static int software_key_query(const struct kernel_pkey_params *params, ret = 0; error_free_key: - kfree(key); + kfree_sensitive(key); error_free_tfm: crypto_free_akcipher(tfm); pr_devel("<==%s() = %d\n", __func__, ret); @@ -303,7 +303,7 @@ static int software_key_eds_op(struct kernel_pkey_params *params, ret = req->dst_len; error_free_key: - kfree(key); + kfree_sensitive(key); error_free_req: akcipher_request_free(req); error_free_tfm: @@ -460,7 +460,7 @@ int public_key_verify_signature(const struct public_key *pkey, ret = crypto_wait_req(crypto_akcipher_verify(req), &cwait); error_free_buf: - kfree(buf); + kfree_sensitive(buf); error_free_req: akcipher_request_free(req); error_free_tfm: diff --git a/crypto/dh.c b/crypto/dh.c index 99c3b2ef7adca..5e81fdca41ff6 100644 --- a/crypto/dh.c +++ b/crypto/dh.c @@ -163,6 +163,37 @@ static int dh_is_pubkey_valid(struct dh_ctx *ctx, MPI y) return 0; } +/* + * SP800-56A pair-wise consistency check: + * section 5.6.2.1.4: + * For an FFC key pair (x, y): Use the private key, x, along with the generator g and + * prime modulus p included in the domain parameters associated with the key pair + * to compute g^x mod p. Compare the result to the public key, y. 
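To make the check concrete with toy numbers (illustrative only; the real code works on kernel MPI objects with full-size FFC parameters): with p = 23, g = 5 and private key x = 6, the public key is y = g^x mod p = 8, and the pair-wise test simply recomputes g^x mod p and compares it with y.

#include <stdint.h>
#include <stdio.h>

/* Square-and-multiply modular exponentiation on toy-sized integers. */
static uint64_t modpow(uint64_t base, uint64_t exp, uint64_t mod)
{
        uint64_t result = 1;

        base %= mod;
        while (exp) {
                if (exp & 1)
                        result = (result * base) % mod;
                base = (base * base) % mod;
                exp >>= 1;
        }
        return result;
}

int main(void)
{
        const uint64_t p = 23, g = 5, x = 6;
        const uint64_t y = modpow(g, x, p);            /* public key: 8 */

        /* Pair-wise consistency: recompute g^x mod p and compare with y. */
        printf("pairwise test %s\n",
               modpow(g, x, p) == y ? "passed" : "failed");
        return 0;
}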
+ */ +static int dh_pairwise_test(struct dh_ctx *ctx, MPI y) +{ + int ret; + MPI val; + val = mpi_alloc(0); + if (!val) + return -ENOMEM; + + ret = _compute_val(ctx, ctx->g, val); /*g^x mod p*/ + if (ret) { + goto err_free_val; + } + + if (mpi_cmp(val, y)) { + ret = -EINVAL; + goto err_free_val; + } + + ret = 0; +err_free_val: + mpi_free(val); + return ret; +} + static int dh_compute_value(struct kpp_request *req) { struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); @@ -226,6 +257,11 @@ static int dh_compute_value(struct kpp_request *req) /* SP800-56A rev 3 5.6.2.1.3 key check */ } else { + if (fips_enabled && dh_pairwise_test(ctx, val)) { + fips_fail_notify(); + panic("dh_pairwise_test failed"); + } + if (dh_is_pubkey_valid(ctx, val)) { ret = -EAGAIN; goto err_free_val; diff --git a/crypto/drbg.c b/crypto/drbg.c index ff4ebbc68efab..c58d2f871855f 100644 --- a/crypto/drbg.c +++ b/crypto/drbg.c @@ -1510,13 +1510,14 @@ static int drbg_generate(struct drbg_state *drbg, * Wrapper around drbg_generate which can pull arbitrary long strings * from the DRBG without hitting the maximum request limitation. * - * Parameters: see drbg_generate + * Parameters: see drbg_generate, except @reseed, which triggers reseeding * Return codes: see drbg_generate -- if one drbg_generate request fails, * the entire drbg_generate_long request fails */ static int drbg_generate_long(struct drbg_state *drbg, unsigned char *buf, unsigned int buflen, - struct drbg_string *addtl) + struct drbg_string *addtl, + bool reseed) { unsigned int len = 0; unsigned int slice = 0; @@ -1526,6 +1527,8 @@ static int drbg_generate_long(struct drbg_state *drbg, slice = ((buflen - len) / drbg_max_request_bytes(drbg)); chunk = slice ? drbg_max_request_bytes(drbg) : (buflen - len); mutex_lock(&drbg->drbg_mutex); + if (reseed) + drbg->seeded = DRBG_SEED_STATE_UNSEEDED; err = drbg_generate(drbg, buf + len, chunk, addtl); mutex_unlock(&drbg->drbg_mutex); if (0 > err) @@ -1959,7 +1962,10 @@ static int drbg_kcapi_random(struct crypto_rng *tfm, addtl = &string; } - return drbg_generate_long(drbg, dst, dlen, addtl); + return drbg_generate_long(drbg, dst, dlen, addtl, + (crypto_tfm_get_flags(crypto_rng_tfm(tfm)) & + CRYPTO_TFM_REQ_NEED_RESEED) == + CRYPTO_TFM_REQ_NEED_RESEED); } /* diff --git a/crypto/ecc.c b/crypto/ecc.c index 7315217c8f733..7e25fa1e122f5 100644 --- a/crypto/ecc.c +++ b/crypto/ecc.c @@ -1505,6 +1505,56 @@ int ecc_gen_privkey(unsigned int curve_id, unsigned int ndigits, u64 *privkey) } EXPORT_SYMBOL(ecc_gen_privkey); +/** +* SP800-56A section 5.6.2.1.4 Pair-Wise Consistency Test +* ecc_pairwise_test() - Pair-wise Consistency test +* +* @curve: elliptic curve domain parameters +* @private_key: pregenerated private key for the given curve +* @pk: public key as a point +* @ndigits: curve's number of digits +* +* Pair-wise Consistency test according to SP800-56A section 5.6.2.1.4 +* +* Return: 0 if test is successful, -EINVAL if test is failed. 
+*/ +static int ecc_pairwise_test(const struct ecc_curve *curve, + const u64 *private_key, + struct ecc_point *pk, + unsigned int ndigits) +{ + u64 priv[ECC_MAX_DIGITS]; + struct ecc_point *epk; + int ret; + + ecc_swap_digits(private_key, priv, ndigits); + + epk = ecc_alloc_point(ndigits); + if (!epk) { + ret = -ENOMEM; + goto err; + } + + ecc_point_mult(epk, &curve->g, priv, NULL, curve, ndigits); + + /* check expected public key against the public_key */ + if (vli_cmp(epk->x, pk->x, ndigits)) { + ret = -EINVAL; + goto err_free_point; + } + + if (vli_cmp(epk->y, pk->y, ndigits)) { + ret = -EINVAL; + goto err_free_point; + } + + ret = 0; +err_free_point: + ecc_free_point(epk); +err: + return ret; +} + int ecc_make_pub_key(unsigned int curve_id, unsigned int ndigits, const u64 *private_key, u64 *public_key) { @@ -1534,6 +1584,12 @@ int ecc_make_pub_key(unsigned int curve_id, unsigned int ndigits, goto err_free_point; } + if (fips_enabled && + ecc_pairwise_test(curve, private_key, pk, ndigits)) { + fips_fail_notify(); + panic("ecc_pairwise_test failed"); + } + ecc_swap_digits(pk->x, public_key, ndigits); ecc_swap_digits(pk->y, &public_key[ndigits], ndigits); diff --git a/crypto/ecdh.c b/crypto/ecdh.c index 3049f147e0117..71599cadf0bc7 100644 --- a/crypto/ecdh.c +++ b/crypto/ecdh.c @@ -113,7 +113,7 @@ static int ecdh_compute_value(struct kpp_request *req) free_all: kfree_sensitive(shared_secret); free_pubkey: - kfree(public_key); + kfree_sensitive(public_key); return ret; } diff --git a/crypto/jitterentropy-kcapi.c b/crypto/jitterentropy-kcapi.c index b9edfaa51b273..4b50cbc8a2faf 100644 --- a/crypto/jitterentropy-kcapi.c +++ b/crypto/jitterentropy-kcapi.c @@ -2,7 +2,7 @@ * Non-physical true random number generator based on timing jitter -- * Linux Kernel Crypto API specific code * - * Copyright Stephan Mueller , 2015 + * Copyright Stephan Mueller , 2015 - 2023 * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,6 +37,8 @@ * DAMAGE. */ +#include +#include #include #include #include @@ -46,6 +48,8 @@ #include "jitterentropy.h" +#define JENT_CONDITIONING_HASH "sha3-256-generic" + /*************************************************************************** * Helper function ***************************************************************************/ @@ -60,11 +64,6 @@ void jent_zfree(void *ptr) kfree_sensitive(ptr); } -void jent_memcpy(void *dest, const void *src, unsigned int n) -{ - memcpy(dest, src, n); -} - /* * Obtain a high-resolution time stamp value. The time stamp is used to measure * the execution time of a given code path and its variations. Hence, the time @@ -91,6 +90,91 @@ void jent_get_nstime(__u64 *out) *out = tmp; } +int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, + unsigned int addtl_len, __u64 hash_loop_cnt, + unsigned int stuck) +{ + struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; + SHASH_DESC_ON_STACK(desc, hash_state_desc->tfm); + u8 intermediary[SHA3_256_DIGEST_SIZE]; + __u64 j = 0; + int ret; + + desc->tfm = hash_state_desc->tfm; + + if (sizeof(intermediary) != crypto_shash_digestsize(desc->tfm)) { + pr_warn_ratelimited("Unexpected digest size\n"); + return -EINVAL; + } + + /* + * This loop fills a buffer which is injected into the entropy pool. + * The main reason for this loop is to execute something over which we + * can perform a timing measurement. 
The injection of the resulting + * data into the pool is performed to ensure the result is used and + * the compiler cannot optimize the loop away in case the result is not + * used at all. Yet that data is considered "additional information" + * considering the terminology from SP800-90A without any entropy. + * + * Note, it does not matter which or how much data you inject, we are + * interested in one Keccack1600 compression operation performed with + * the crypto_shash_final. + */ + for (j = 0; j < hash_loop_cnt; j++) { + ret = crypto_shash_init(desc) ?: + crypto_shash_update(desc, intermediary, + sizeof(intermediary)) ?: + crypto_shash_finup(desc, addtl, addtl_len, intermediary); + if (ret) + goto err; + } + + /* + * Inject the data from the previous loop into the pool. This data is + * not considered to contain any entropy, but it stirs the pool a bit. + */ + ret = crypto_shash_update(desc, intermediary, sizeof(intermediary)); + if (ret) + goto err; + + /* + * Insert the time stamp into the hash context representing the pool. + * + * If the time stamp is stuck, do not finally insert the value into the + * entropy pool. Although this operation should not do any harm even + * when the time stamp has no entropy, SP800-90B requires that any + * conditioning operation to have an identical amount of input data + * according to section 3.1.5. + */ + if (!stuck) { + ret = crypto_shash_update(hash_state_desc, (u8 *)&time, + sizeof(__u64)); + } + +err: + shash_desc_zero(desc); + memzero_explicit(intermediary, sizeof(intermediary)); + + return ret; +} + +int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len) +{ + struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; + u8 jent_block[SHA3_256_DIGEST_SIZE]; + /* Obtain data from entropy pool and re-initialize it */ + int ret = crypto_shash_final(hash_state_desc, jent_block) ?: + crypto_shash_init(hash_state_desc) ?: + crypto_shash_update(hash_state_desc, jent_block, + sizeof(jent_block)); + + if (!ret && dst_len) + memcpy(dst, jent_block, dst_len); + + memzero_explicit(jent_block, sizeof(jent_block)); + return ret; +} + /*************************************************************************** * Kernel crypto API interface ***************************************************************************/ @@ -98,32 +182,82 @@ void jent_get_nstime(__u64 *out) struct jitterentropy { spinlock_t jent_lock; struct rand_data *entropy_collector; + struct crypto_shash *tfm; + struct shash_desc *sdesc; }; -static int jent_kcapi_init(struct crypto_tfm *tfm) +static void jent_kcapi_cleanup(struct crypto_tfm *tfm) { struct jitterentropy *rng = crypto_tfm_ctx(tfm); - int ret = 0; - rng->entropy_collector = jent_entropy_collector_alloc(1, 0); - if (!rng->entropy_collector) - ret = -ENOMEM; + spin_lock(&rng->jent_lock); - spin_lock_init(&rng->jent_lock); - return ret; -} + if (rng->sdesc) { + shash_desc_zero(rng->sdesc); + kfree(rng->sdesc); + } + rng->sdesc = NULL; -static void jent_kcapi_cleanup(struct crypto_tfm *tfm) -{ - struct jitterentropy *rng = crypto_tfm_ctx(tfm); + if (rng->tfm) + crypto_free_shash(rng->tfm); + rng->tfm = NULL; - spin_lock(&rng->jent_lock); if (rng->entropy_collector) jent_entropy_collector_free(rng->entropy_collector); rng->entropy_collector = NULL; spin_unlock(&rng->jent_lock); } +static int jent_kcapi_init(struct crypto_tfm *tfm) +{ + struct jitterentropy *rng = crypto_tfm_ctx(tfm); + struct crypto_shash *hash; + struct shash_desc *sdesc; + int size, ret = 0; + + spin_lock_init(&rng->jent_lock); + 
+ /* + * Use SHA3-256 as conditioner. We allocate only the generic + * implementation as we are not interested in high-performance. The + * execution time of the SHA3 operation is measured and adds to the + * Jitter RNG's unpredictable behavior. If we have a slower hash + * implementation, the execution timing variations are larger. When + * using a fast implementation, we would need to call it more often + * as its variations are lower. + */ + hash = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); + if (IS_ERR(hash)) { + pr_err("Cannot allocate conditioning digest\n"); + return PTR_ERR(hash); + } + rng->tfm = hash; + + size = sizeof(struct shash_desc) + crypto_shash_descsize(hash); + sdesc = kmalloc(size, GFP_KERNEL); + if (!sdesc) { + ret = -ENOMEM; + goto err; + } + + sdesc->tfm = hash; + crypto_shash_init(sdesc); + rng->sdesc = sdesc; + + rng->entropy_collector = jent_entropy_collector_alloc(1, 0, sdesc); + if (!rng->entropy_collector) { + ret = -ENOMEM; + goto err; + } + + spin_lock_init(&rng->jent_lock); + return 0; + +err: + jent_kcapi_cleanup(tfm); + return ret; +} + static int jent_kcapi_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *rdata, unsigned int dlen) @@ -180,15 +314,24 @@ static struct rng_alg jent_alg = { .cra_module = THIS_MODULE, .cra_init = jent_kcapi_init, .cra_exit = jent_kcapi_cleanup, - } }; static int __init jent_mod_init(void) { + SHASH_DESC_ON_STACK(desc, tfm); + struct crypto_shash *tfm; int ret = 0; - ret = jent_entropy_init(); + tfm = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + desc->tfm = tfm; + crypto_shash_init(desc); + ret = jent_entropy_init(desc); + shash_desc_zero(desc); + crypto_free_shash(tfm); if (ret) { /* Handle permanent health test error */ if (fips_enabled) diff --git a/crypto/jitterentropy.c b/crypto/jitterentropy.c index 227cedfa4f0ae..c7d7f2caa7793 100644 --- a/crypto/jitterentropy.c +++ b/crypto/jitterentropy.c @@ -2,7 +2,7 @@ * Non-physical true random number generator based on timing jitter -- * Jitter RNG standalone code. * - * Copyright Stephan Mueller , 2015 - 2020 + * Copyright Stephan Mueller , 2015 - 2023 * * Design * ====== @@ -47,7 +47,7 @@ /* * This Jitterentropy RNG is based on the jitterentropy library - * version 2.2.0 provided at https://www.chronox.de/jent.html + * version 3.4.0 provided at https://www.chronox.de/jent.html */ #ifdef __OPTIMIZE__ @@ -57,21 +57,22 @@ typedef unsigned long long __u64; typedef long long __s64; typedef unsigned int __u32; +typedef unsigned char u8; #define NULL ((void *) 0) /* The entropy pool */ struct rand_data { + /* SHA3-256 is used as conditioner */ +#define DATA_SIZE_BITS 256 /* all data values that are vital to maintain the security * of the RNG are marked as SENSITIVE. A user must not * access that information while the RNG executes its loops to * calculate the next random value. 
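For context, all of this conditioning rework stays behind the kernel RNG API, so consumers are unaffected. A minimal kernel-side usage sketch (it assumes the driver keeps registering under its usual "jitterentropy_rng" algorithm name, which does not appear in this hunk):

#include <crypto/rng.h>
#include <linux/err.h>

static int jent_demo_read(u8 *buf, unsigned int len)
{
        struct crypto_rng *rng;
        int ret;

        rng = crypto_alloc_rng("jitterentropy_rng", 0, 0);
        if (IS_ERR(rng))
                return PTR_ERR(rng);

        /* Each request is backed by fresh timing measurements. */
        ret = crypto_rng_get_bytes(rng, buf, len);

        crypto_free_rng(rng);
        return ret;
}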
*/ - __u64 data; /* SENSITIVE Actual random number */ - __u64 old_data; /* SENSITIVE Previous random number */ - __u64 prev_time; /* SENSITIVE Previous time stamp */ -#define DATA_SIZE_BITS ((sizeof(__u64)) * 8) - __u64 last_delta; /* SENSITIVE stuck test */ - __s64 last_delta2; /* SENSITIVE stuck test */ - unsigned int osr; /* Oversample rate */ + void *hash_state; /* SENSITIVE hash state entropy pool */ + __u64 prev_time; /* SENSITIVE Previous time stamp */ + __u64 last_delta; /* SENSITIVE stuck test */ + __s64 last_delta2; /* SENSITIVE stuck test */ + unsigned int osr; /* Oversample rate */ #define JENT_MEMORY_BLOCKS 64 #define JENT_MEMORY_BLOCKSIZE 32 #define JENT_MEMORY_ACCESSLOOPS 128 @@ -301,15 +302,13 @@ static int jent_permanent_health_failure(struct rand_data *ec) * an entropy collection. * * Input: - * @ec entropy collector struct -- may be NULL * @bits is the number of low bits of the timer to consider * @min is the number of bits we shift the timer value to the right at * the end to make sure we have a guaranteed minimum value * * @return Newly calculated loop counter */ -static __u64 jent_loop_shuffle(struct rand_data *ec, - unsigned int bits, unsigned int min) +static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min) { __u64 time = 0; __u64 shuffle = 0; @@ -317,12 +316,7 @@ static __u64 jent_loop_shuffle(struct rand_data *ec, unsigned int mask = (1<data; + /* * We fold the time value as much as possible to ensure that as many * bits of the time stamp are included as possible. @@ -344,81 +338,32 @@ static __u64 jent_loop_shuffle(struct rand_data *ec, * execution time jitter * * This function injects the individual bits of the time value into the - * entropy pool using an LFSR. + * entropy pool using a hash. * - * The code is deliberately inefficient with respect to the bit shifting - * and shall stay that way. This function is the root cause why the code - * shall be compiled without optimization. This function not only acts as - * folding operation, but this function's execution is used to measure - * the CPU execution time jitter. Any change to the loop in this function - * implies that careful retesting must be done. - * - * @ec [in] entropy collector struct - * @time [in] time stamp to be injected - * @loop_cnt [in] if a value not equal to 0 is set, use the given value as - * number of loops to perform the folding - * @stuck [in] Is the time stamp identified as stuck? + * ec [in] entropy collector + * time [in] time stamp to be injected + * stuck [in] Is the time stamp identified as stuck? 
* * Output: - * updated ec->data - * - * @return Number of loops the folding operation is performed + * updated hash context in the entropy collector or error code */ -static void jent_lfsr_time(struct rand_data *ec, __u64 time, __u64 loop_cnt, - int stuck) +static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck) { - unsigned int i; - __u64 j = 0; - __u64 new = 0; -#define MAX_FOLD_LOOP_BIT 4 -#define MIN_FOLD_LOOP_BIT 0 - __u64 fold_loop_cnt = - jent_loop_shuffle(ec, MAX_FOLD_LOOP_BIT, MIN_FOLD_LOOP_BIT); - - /* - * testing purposes -- allow test app to set the counter, not - * needed during runtime - */ - if (loop_cnt) - fold_loop_cnt = loop_cnt; - for (j = 0; j < fold_loop_cnt; j++) { - new = ec->data; - for (i = 1; (DATA_SIZE_BITS) >= i; i++) { - __u64 tmp = time << (DATA_SIZE_BITS - i); - - tmp = tmp >> (DATA_SIZE_BITS - 1); - - /* - * Fibonacci LSFR with polynomial of - * x^64 + x^61 + x^56 + x^31 + x^28 + x^23 + 1 which is - * primitive according to - * http://poincare.matf.bg.ac.rs/~ezivkovm/publications/primpol1.pdf - * (the shift values are the polynomial values minus one - * due to counting bits from 0 to 63). As the current - * position is always the LSB, the polynomial only needs - * to shift data in from the left without wrap. - */ - tmp ^= ((new >> 63) & 1); - tmp ^= ((new >> 60) & 1); - tmp ^= ((new >> 55) & 1); - tmp ^= ((new >> 30) & 1); - tmp ^= ((new >> 27) & 1); - tmp ^= ((new >> 22) & 1); - new <<= 1; - new ^= tmp; - } - } - - /* - * If the time stamp is stuck, do not finally insert the value into - * the entropy pool. Although this operation should not do any harm - * even when the time stamp has no entropy, SP800-90B requires that - * any conditioning operation (SP800-90B considers the LFSR to be a - * conditioning operation) to have an identical amount of input - * data according to section 3.1.5. - */ - if (!stuck) - ec->data = new; +#define SHA3_HASH_LOOP (1<<3) + struct { + int rct_count; + unsigned int apt_observations; + unsigned int apt_count; + unsigned int apt_base; + } addtl = { + ec->rct_count, + ec->apt_observations, + ec->apt_count, + ec->apt_base + }; + + return jent_hash_time(ec->hash_state, time, (u8 *)&addtl, sizeof(addtl), + SHA3_HASH_LOOP, stuck); } /* @@ -452,7 +397,7 @@ static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt) #define MAX_ACC_LOOP_BIT 7 #define MIN_ACC_LOOP_BIT 0 __u64 acc_loop_cnt = - jent_loop_shuffle(ec, MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); + jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); if (NULL == ec || NULL == ec->mem) return; @@ -520,14 +465,15 @@ static int jent_measure_jitter(struct rand_data *ec) stuck = jent_stuck(ec, current_delta); /* Now call the next noise sources which also injects the data */ - jent_lfsr_time(ec, current_delta, 0, stuck); + if (jent_condition_data(ec, current_delta, stuck)) + stuck = 1; return stuck; } /* * Generator of one 64 bit random number - * Function fills rand_data->data + * Function fills rand_data->hash_state * * @ec [in] Reference to entropy collector */ @@ -574,7 +520,7 @@ static void jent_gen_entropy(struct rand_data *ec) * @return 0 when request is fulfilled or an error * * The following error codes can occur: - * -1 entropy_collector is NULL + * -1 entropy_collector is NULL or the generation failed * -2 Intermittent health failure * -3 Permanent health failure */ @@ -604,7 +550,7 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, * Perform startup health tests and return permanent * error if it fails. 
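The return-value contract documented above (-1 for a NULL collector or failed generation, -2 for an intermittent health failure, -3 for a permanent one) is what callers have to translate into errno values. A hypothetical caller, written against the declarations in crypto/jitterentropy.h (the jent_demo_fill name and the errno mapping are illustrative, not the driver's actual policy):

#include <linux/errno.h>

#include "jitterentropy.h"

static int jent_demo_fill(struct rand_data *ec, unsigned char *buf,
                          unsigned int len)
{
        switch (jent_read_entropy(ec, buf, len)) {
        case 0:
                return 0;
        case -2:
                return -EAGAIN;  /* intermittent health failure: retry later */
        case -3:
                return -EFAULT;  /* permanent health failure: give up */
        default:
                return -EINVAL;  /* NULL collector or generation failure */
        }
}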
*/ - if (jent_entropy_init()) + if (jent_entropy_init(ec->hash_state)) return -3; return -2; @@ -614,7 +560,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, tocopy = (DATA_SIZE_BITS / 8); else tocopy = len; - jent_memcpy(p, &ec->data, tocopy); + if (jent_read_random_block(ec->hash_state, p, tocopy)) + return -1; len -= tocopy; p += tocopy; @@ -628,7 +575,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, ***************************************************************************/ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, - unsigned int flags) + unsigned int flags, + void *hash_state) { struct rand_data *entropy_collector; @@ -655,6 +603,8 @@ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, osr = 1; /* minimum sampling rate is 1 */ entropy_collector->osr = osr; + entropy_collector->hash_state = hash_state; + /* fill the data pad with non-zero values */ jent_gen_entropy(entropy_collector); @@ -668,7 +618,7 @@ void jent_entropy_collector_free(struct rand_data *entropy_collector) jent_zfree(entropy_collector); } -int jent_entropy_init(void) +int jent_entropy_init(void *hash_state) { int i; __u64 delta_sum = 0; @@ -681,6 +631,7 @@ int jent_entropy_init(void) /* Required for RCT */ ec.osr = 1; + ec.hash_state = hash_state; /* We could perform statistical tests here, but the problem is * that we only have a few loop counts to do testing. These @@ -718,7 +669,7 @@ int jent_entropy_init(void) /* Invoke core entropy collection logic */ jent_get_nstime(&time); ec.prev_time = time; - jent_lfsr_time(&ec, time, 0, 0); + jent_condition_data(&ec, time, 0); jent_get_nstime(&time2); /* test whether timer works */ diff --git a/crypto/jitterentropy.h b/crypto/jitterentropy.h index 5cc583f6bc6b8..b3890ff26a023 100644 --- a/crypto/jitterentropy.h +++ b/crypto/jitterentropy.h @@ -2,14 +2,18 @@ extern void *jent_zalloc(unsigned int len); extern void jent_zfree(void *ptr); -extern void jent_memcpy(void *dest, const void *src, unsigned int n); extern void jent_get_nstime(__u64 *out); +extern int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, + unsigned int addtl_len, __u64 hash_loop_cnt, + unsigned int stuck); +int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len); struct rand_data; -extern int jent_entropy_init(void); +extern int jent_entropy_init(void *hash_state); extern int jent_read_entropy(struct rand_data *ec, unsigned char *data, unsigned int len); extern struct rand_data *jent_entropy_collector_alloc(unsigned int osr, - unsigned int flags); + unsigned int flags, + void *hash_state); extern void jent_entropy_collector_free(struct rand_data *entropy_collector); diff --git a/crypto/rng.c b/crypto/rng.c index fea082b25fe4b..2ed6a8a0ce5c2 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -11,19 +11,24 @@ #include #include #include +#include +#include #include #include #include #include +#include +#include #include #include #include -#include #include #include "internal.h" -static DEFINE_MUTEX(crypto_default_rng_lock); +static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_reseed_rng_lock); +static struct crypto_rng *crypto_reseed_rng; +static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_default_rng_lock); struct crypto_rng *crypto_default_rng; EXPORT_SYMBOL_GPL(crypto_default_rng); static int crypto_default_rng_refcnt; @@ -113,31 +118,37 @@ struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask) } EXPORT_SYMBOL_GPL(crypto_alloc_rng); -int crypto_get_default_rng(void) +static 
int crypto_get_rng(struct crypto_rng **rngp) { struct crypto_rng *rng; int err; - mutex_lock(&crypto_default_rng_lock); - if (!crypto_default_rng) { + if (!*rngp) { rng = crypto_alloc_rng("stdrng", 0, 0); err = PTR_ERR(rng); if (IS_ERR(rng)) - goto unlock; + return err; err = crypto_rng_reset(rng, NULL, crypto_rng_seedsize(rng)); if (err) { crypto_free_rng(rng); - goto unlock; + return err; } - crypto_default_rng = rng; + *rngp = rng; } - crypto_default_rng_refcnt++; - err = 0; + return 0; +} + +int crypto_get_default_rng(void) +{ + int err; -unlock: + mutex_lock(&crypto_default_rng_lock); + err = crypto_get_rng(&crypto_default_rng); + if (!err) + crypto_default_rng_refcnt++; mutex_unlock(&crypto_default_rng_lock); return err; @@ -153,16 +164,17 @@ void crypto_put_default_rng(void) EXPORT_SYMBOL_GPL(crypto_put_default_rng); #if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE) -int crypto_del_default_rng(void) +static int crypto_del_rng(struct crypto_rng **rngp, int *refcntp, + struct mutex *lock) { int err = -EBUSY; - mutex_lock(&crypto_default_rng_lock); - if (crypto_default_rng_refcnt) + mutex_lock(lock); + if (refcntp && *refcntp) goto out; - crypto_free_rng(crypto_default_rng); - crypto_default_rng = NULL; + crypto_free_rng(*rngp); + *rngp = NULL; err = 0; @@ -171,6 +183,14 @@ int crypto_del_default_rng(void) return err; } + +int crypto_del_default_rng(void) +{ + return crypto_del_rng(&crypto_default_rng, &crypto_default_rng_refcnt, + &crypto_default_rng_lock) ?: + crypto_del_rng(&crypto_reseed_rng, NULL, + &crypto_reseed_rng_lock); +} EXPORT_SYMBOL_GPL(crypto_del_default_rng); #endif @@ -224,5 +244,109 @@ void crypto_unregister_rngs(struct rng_alg *algs, int count) } EXPORT_SYMBOL_GPL(crypto_unregister_rngs); +static ssize_t crypto_devrandom_read(void __user *buf, size_t buflen, + bool reseed) +{ + struct crypto_rng *rng; + u8 tmp[256]; + ssize_t ret; + + if (!buflen) + return 0; + + if (reseed) { + u32 flags = 0; + + /* If reseeding is requested, acquire a lock on + * crypto_reseed_rng so it is not swapped out until + * the initial random bytes are generated. + * + * The algorithm implementation is also protected with + * a separate mutex (drbg->drbg_mutex) around the + * reseed-and-generate operation. + */ + mutex_lock(&crypto_reseed_rng_lock); + + /* If crypto_default_rng is not set, it will be seeded + * at creation in __crypto_get_default_rng and thus no + * reseeding is needed. 
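For reference, the refcounted default-RNG path that the new crypto_get_rng() helper now backs is what in-kernel users keep calling; a minimal sketch of that existing pattern (this is the long-standing API, not the new reseed path):

#include <crypto/rng.h>

static int demo_default_rng_read(u8 *buf, unsigned int len)
{
        int ret;

        ret = crypto_get_default_rng();   /* allocates/seeds stdrng on first use */
        if (ret)
                return ret;

        ret = crypto_rng_get_bytes(crypto_default_rng, buf, len);

        crypto_put_default_rng();         /* drops the reference again */
        return ret;
}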
+ */ + if (crypto_reseed_rng) + flags |= CRYPTO_TFM_REQ_NEED_RESEED; + + ret = crypto_get_rng(&crypto_reseed_rng); + if (ret) { + mutex_unlock(&crypto_reseed_rng_lock); + return ret; + } + + rng = crypto_reseed_rng; + crypto_tfm_set_flags(crypto_rng_tfm(rng), flags); + } else { + ret = crypto_get_default_rng(); + if (ret) + return ret; + rng = crypto_default_rng; + } + + for (;;) { + int err; + int i; + + i = min_t(int, buflen, sizeof(tmp)); + err = crypto_rng_get_bytes(rng, tmp, i); + if (err) { + ret = err; + break; + } + + if (copy_to_user(buf, tmp, i)) { + ret = -EFAULT; + break; + } + + buflen -= i; + buf += i; + ret += i; + + if (!buflen) + break; + + if (need_resched()) { + if (signal_pending(current)) + break; + schedule(); + } + } + + if (reseed) + mutex_unlock(&crypto_reseed_rng_lock); + else + crypto_put_default_rng(); + memzero_explicit(tmp, sizeof(tmp)); + + return ret; +} + +static const struct random_extrng crypto_devrandom_rng = { + .extrng_read = crypto_devrandom_read, + .owner = THIS_MODULE, +}; + +static int __init crypto_rng_init(void) +{ + if (fips_enabled) + random_register_extrng(&crypto_devrandom_rng); + return 0; +} + +static void __exit crypto_rng_exit(void) +{ + random_unregister_extrng(); +} + +late_initcall(crypto_rng_init); +module_exit(crypto_rng_exit); + MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Random Number Generator"); diff --git a/crypto/rsa.c b/crypto/rsa.c index c50f2d2a4d064..c79613cdce6e4 100644 --- a/crypto/rsa.c +++ b/crypto/rsa.c @@ -205,6 +205,32 @@ static int rsa_check_key_length(unsigned int len) return -EINVAL; } +static int rsa_check_exponent_fips(MPI e) +{ + MPI e_max = NULL; + + /* check if odd */ + if (!mpi_test_bit(e, 0)) { + return -EINVAL; + } + + /* check if 2^16 < e < 2^256. */ + if (mpi_cmp_ui(e, 65536) <= 0) { + return -EINVAL; + } + + e_max = mpi_alloc(0); + mpi_set_bit(e_max, 256); + + if (mpi_cmp(e, e_max) >= 0) { + mpi_free(e_max); + return -EINVAL; + } + + mpi_free(e_max); + return 0; +} + static int rsa_set_pub_key(struct crypto_akcipher *tfm, const void *key, unsigned int keylen) { @@ -232,6 +258,11 @@ static int rsa_set_pub_key(struct crypto_akcipher *tfm, const void *key, return -EINVAL; } + if (fips_enabled && rsa_check_exponent_fips(mpi_key->e)) { + rsa_free_mpi_key(mpi_key); + return -EINVAL; + } + return 0; err: @@ -290,6 +321,11 @@ static int rsa_set_priv_key(struct crypto_akcipher *tfm, const void *key, return -EINVAL; } + if (fips_enabled && rsa_check_exponent_fips(mpi_key->e)) { + rsa_free_mpi_key(mpi_key); + return -EINVAL; + } + return 0; err: diff --git a/crypto/seqiv.c b/crypto/seqiv.c index b1bcfe537daf1..952dbeae7a08e 100644 --- a/crypto/seqiv.c +++ b/crypto/seqiv.c @@ -133,6 +133,12 @@ static int seqiv_aead_decrypt(struct aead_request *req) return crypto_aead_decrypt(subreq); } +static int aead_init_seqiv(struct crypto_aead *aead) +{ + crypto_aead_set_flags(aead, CRYPTO_ALG_FIPS140_COMPLIANT); + return aead_init_geniv(aead); +} + static int seqiv_aead_create(struct crypto_template *tmpl, struct rtattr **tb) { struct aead_instance *inst; @@ -150,7 +156,7 @@ static int seqiv_aead_create(struct crypto_template *tmpl, struct rtattr **tb) inst->alg.encrypt = seqiv_aead_encrypt; inst->alg.decrypt = seqiv_aead_decrypt; - inst->alg.init = aead_init_geniv; + inst->alg.init = aead_init_seqiv; inst->alg.exit = aead_exit_geniv; inst->alg.base.cra_ctxsize = sizeof(struct aead_geniv_ctx); diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index b23235d58a122..8e0ab5c2dad04 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ 
-1702,6 +1702,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) ret = min(ret, tcrypt_test("gcm(aria)")); break; + case 59: + ret = min(ret, tcrypt_test("ffdhe2048(dh)")); + break; + case 100: ret = min(ret, tcrypt_test("hmac(md5)")); break; diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 56c39a0c94952..f8cba28b0e230 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4573,7 +4573,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { #endif .alg = "cbcmac(aes)", - .fips_allowed = 1, .test = alg_test_hash, .suite = { .hash = __VECS(aes_cbcmac_tv_template) @@ -4848,7 +4847,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { /* covered by drbg_nopr_hmac_sha256 test */ .alg = "drbg_nopr_hmac_sha384", - .fips_allowed = 1, .test = alg_test_null, }, { .alg = "drbg_nopr_hmac_sha512", @@ -4871,7 +4869,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { /* covered by drbg_nopr_sha256 test */ .alg = "drbg_nopr_sha384", - .fips_allowed = 1, .test = alg_test_null, }, { .alg = "drbg_nopr_sha512", @@ -4907,7 +4904,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { /* covered by drbg_pr_hmac_sha256 test */ .alg = "drbg_pr_hmac_sha384", - .fips_allowed = 1, .test = alg_test_null, }, { .alg = "drbg_pr_hmac_sha512", @@ -4927,7 +4923,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { /* covered by drbg_pr_sha256 test */ .alg = "drbg_pr_sha384", - .fips_allowed = 1, .test = alg_test_null, }, { .alg = "drbg_pr_sha512", @@ -5192,7 +5187,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "ghash", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(ghash_tv_template) } @@ -5729,20 +5723,16 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { #endif - .alg = "xts4096(paes)", - .test = alg_test_null, - .fips_allowed = 1, - }, { - .alg = "xts512(paes)", - .test = alg_test_null, - .fips_allowed = 1, - }, { .alg = "xxhash64", .test = alg_test_hash, .fips_allowed = 1, .suite = { .hash = __VECS(xxhash64_tv_template) } + }, { + .alg = "zlib", + .test = alg_test_null, + .fips_allowed = 1, }, { .alg = "zlib-deflate", .test = alg_test_comp, diff --git a/drivers/Kconfig b/drivers/Kconfig index 19ee995bd0ae1..ab32f94d6304c 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -239,4 +239,5 @@ source "drivers/peci/Kconfig" source "drivers/hte/Kconfig" +source "drivers/amazon/Kconfig" endmenu diff --git a/drivers/Makefile b/drivers/Makefile index bdf1c66141c9b..b44c6599aea35 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -189,3 +189,4 @@ obj-$(CONFIG_COUNTER) += counter/ obj-$(CONFIG_MOST) += most/ obj-$(CONFIG_PECI) += peci/ obj-$(CONFIG_HTE) += hte/ +obj-$(CONFIG_AMAZON_DRIVER_UPDATES) += amazon/ diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c index 31d7ea84a360f..730db6d6635c1 100644 --- a/drivers/acpi/acpica/tbfadt.c +++ b/drivers/acpi/acpica/tbfadt.c @@ -315,23 +315,19 @@ void acpi_tb_parse_fadt(void) ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, NULL, FALSE, TRUE, &acpi_gbl_dsdt_index); - /* If Hardware Reduced flag is set, there is no FACS */ - - if (!acpi_gbl_reduced_hardware) { - if (acpi_gbl_FADT.facs) { - acpi_tb_install_standard_table((acpi_physical_address) - acpi_gbl_FADT.facs, - ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, - NULL, FALSE, TRUE, - &acpi_gbl_facs_index); - } - if (acpi_gbl_FADT.Xfacs) { - acpi_tb_install_standard_table((acpi_physical_address) - acpi_gbl_FADT.Xfacs, - ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, - NULL, FALSE, TRUE, - 
&acpi_gbl_xfacs_index); - } + if (acpi_gbl_FADT.facs) { + acpi_tb_install_standard_table((acpi_physical_address) + acpi_gbl_FADT.facs, + ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, + NULL, FALSE, TRUE, + &acpi_gbl_facs_index); + } + if (acpi_gbl_FADT.Xfacs) { + acpi_tb_install_standard_table((acpi_physical_address) + acpi_gbl_FADT.Xfacs, + ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, + NULL, FALSE, TRUE, + &acpi_gbl_xfacs_index); } } diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c index 633a823be65fb..2bea36ec00905 100644 --- a/drivers/acpi/acpica/tbutils.c +++ b/drivers/acpi/acpica/tbutils.c @@ -36,12 +36,7 @@ acpi_status acpi_tb_initialize_facs(void) { struct acpi_table_facs *facs; - /* If Hardware Reduced flag is set, there is no FACS */ - - if (acpi_gbl_reduced_hardware) { - acpi_gbl_FACS = NULL; - return (AE_OK); - } else if (acpi_gbl_FADT.Xfacs && + if (acpi_gbl_FADT.Xfacs && (!acpi_gbl_FADT.facs || !acpi_gbl_use32_bit_facs_addresses)) { (void)acpi_get_table_by_index(acpi_gbl_xfacs_index, diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 293cdf486fd81..f335d810fd1c1 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -1635,6 +1636,11 @@ int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, arch_setup_dma_ops(dev, 0, U64_MAX, iommu, attr == DEV_DMA_COHERENT); +#ifdef CONFIG_DMA_PAGE_TOUCHING + if (!dev->dma_ops) + setup_dma_page_touching_ops(dev); +#endif + return 0; } EXPORT_SYMBOL_GPL(acpi_dma_configure_id); diff --git a/drivers/amazon/Kconfig b/drivers/amazon/Kconfig new file mode 100644 index 0000000000000..2012cb50eb2a1 --- /dev/null +++ b/drivers/amazon/Kconfig @@ -0,0 +1,42 @@ +# +# Amazon driver updates configuration +# + +config AMAZON_DRIVER_UPDATES + bool "Amazon Driver Updates" + default y + depends on PCI || EXPERIMENTAL + help + Amazon driver updates include out-of-tree drivers and/or modified + versions of the drivers present in the stable kernel tree. + +if AMAZON_DRIVER_UPDATES + +config AMAZON_ENA_ETHERNET + tristate "Elastic Network Adapter (ENA) support" + depends on PCI_MSI && !ENA_ETHERNET + help + This driver supports the Elastic Network Adapter (ENA). + + To compile this driver as a module, choose M here. + The module will be called ena. + +config AMAZON_EFA_INFINIBAND + tristate "Elastic Fabric Adapter (EFA) support" + depends on INFINIBAND_USER_ACCESS && AMAZON_ENA_ETHERNET + help + This driver supports the Elastic Fabric Adapter (EFA). + + To compile this driver as a module, choose M here. + The module will be called efa. + +config AMAZON_IGB_UIO + tristate "DPDK igb_uio driver" + help + This is the direct PCI access driver for igb and + other PCI network devices, for DPDK. + + To compile this driver as a module, choose M here. + The module will be called igb_uio.
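For completeness, a configuration fragment selecting the out-of-tree drivers declared above as modules could look like the following (illustrative; the symbols are exactly the ones defined in this Kconfig file, and dependencies such as PCI_MSI and INFINIBAND_USER_ACCESS still have to be satisfied):

CONFIG_AMAZON_DRIVER_UPDATES=y
CONFIG_AMAZON_ENA_ETHERNET=m
CONFIG_AMAZON_EFA_INFINIBAND=m
CONFIG_AMAZON_IGB_UIO=m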
+ +endif # AMAZON_DRIVER_UPDATES diff --git a/drivers/amazon/Makefile b/drivers/amazon/Makefile new file mode 100644 index 0000000000000..fc5f70dd7487d --- /dev/null +++ b/drivers/amazon/Makefile @@ -0,0 +1,4 @@ +# +# Amazon Driver Updates +# +obj-$(CONFIG_AMAZON_DRIVER_UPDATES) += net/ diff --git a/drivers/amazon/net/Makefile b/drivers/amazon/net/Makefile new file mode 100644 index 0000000000000..7eb6f214798ee --- /dev/null +++ b/drivers/amazon/net/Makefile @@ -0,0 +1,6 @@ +# +# Amazon Driver Updates +# +obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena/ +obj-$(CONFIG_AMAZON_EFA_INFINIBAND) += efa/ +obj-$(CONFIG_AMAZON_IGB_UIO) += igb_uio/ diff --git a/drivers/amazon/net/efa/Makefile b/drivers/amazon/net/efa/Makefile new file mode 100644 index 0000000000000..121bdd4bf6ec4 --- /dev/null +++ b/drivers/amazon/net/efa/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +# Copyright 2024 Amazon.com, Inc. or its affiliates. All rights reserved. +# +# Makefile for Amazon Elastic Fabric Adapter (EFA) device driver. +# + +obj-$(CONFIG_AMAZON_EFA_INFINIBAND) += efa.o + +efa-y := efa_com.o efa_com_cmd.o efa_gdr.o efa_main.o efa_neuron.o efa_p2p.o +efa-y += efa_verbs.o + +efa-$(CONFIG_SYSFS) += efa_sysfs.o diff --git a/drivers/amazon/net/efa/efa.h b/drivers/amazon/net/efa/efa.h new file mode 100644 index 0000000000000..f1dc5739dcd70 --- /dev/null +++ b/drivers/amazon/net/efa/efa.h @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_H_ +#define _EFA_H_ + +#include "kcompat.h" +#include +#include +#include +#include + +#include +#include + +#include "efa_com_cmd.h" + +#define DRV_MODULE_NAME "efa" +#define DEVICE_NAME "Elastic Fabric Adapter (EFA)" + +#define EFA_IRQNAME_SIZE 40 + +#define EFA_MGMNT_MSIX_VEC_IDX 0 +#define EFA_COMP_EQS_VEC_BASE 1 + +struct efa_irq { + irq_handler_t handler; + void *data; + u32 irqn; + u32 vector; + cpumask_t affinity_hint_mask; + char name[EFA_IRQNAME_SIZE]; +}; + +/* Don't use anything other than atomic64 */ +struct efa_stats { + atomic64_t alloc_pd_err; + atomic64_t create_qp_err; + atomic64_t create_cq_err; + atomic64_t reg_mr_err; + atomic64_t alloc_ucontext_err; + atomic64_t create_ah_err; + atomic64_t mmap_err; + atomic64_t keep_alive_rcvd; +}; + +struct efa_dev { + struct ib_device ibdev; + struct efa_com_dev edev; + struct pci_dev *pdev; + struct efa_com_get_device_attr_result dev_attr; + + u64 reg_bar_addr; + u64 reg_bar_len; + u64 mem_bar_addr; + u64 mem_bar_len; + u64 db_bar_addr; + u64 db_bar_len; + + int admin_msix_vector_idx; + struct efa_irq admin_irq; + + struct efa_stats stats; + + /* Array of completion EQs */ + struct efa_eq *eqs; + unsigned int neqs; + + /* Only stores CQs with interrupts enabled */ + struct xarray cqs_xa; +}; + +struct efa_ucontext { + struct ib_ucontext ibucontext; + u16 uarn; +}; + +struct efa_pd { + struct ib_pd ibpd; + u16 pdn; +}; + +struct efa_mr_interconnect_info { + u16 recv_ic_id; + u16 rdma_read_ic_id; + u16 rdma_recv_ic_id; + u8 recv_ic_id_valid : 1; + u8 rdma_read_ic_id_valid : 1; + u8 rdma_recv_ic_id_valid : 1; +}; + +struct efa_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct efa_mr_interconnect_info ic_info; + struct efa_p2pmem *p2pmem; + u64 p2p_ticket; +}; + +struct efa_cq { + struct ib_cq ibcq; + struct efa_ucontext *ucontext; + dma_addr_t dma_addr; + void *cpu_addr; + struct rdma_user_mmap_entry *mmap_entry; + struct rdma_user_mmap_entry *db_mmap_entry; + 
size_t size; + u16 cq_idx; + /* NULL when no interrupts requested */ + struct efa_eq *eq; +}; + +struct efa_qp { + struct ib_qp ibqp; + dma_addr_t rq_dma_addr; + void *rq_cpu_addr; + size_t rq_size; + enum ib_qp_state state; + + /* Used for saving mmap_xa entries */ + struct rdma_user_mmap_entry *sq_db_mmap_entry; + struct rdma_user_mmap_entry *llq_desc_mmap_entry; + struct rdma_user_mmap_entry *rq_db_mmap_entry; + struct rdma_user_mmap_entry *rq_mmap_entry; + + u32 qp_handle; + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; + u32 max_inline_data; +}; + +struct efa_ah { + struct ib_ah ibah; + u16 ah; + /* dest_addr */ + u8 id[EFA_GID_SIZE]; +}; + +struct efa_eq { + struct efa_com_eq eeq; + struct efa_irq irq; +}; + +int efa_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *udata); +int efa_query_port(struct ib_device *ibdev, port_t port, + struct ib_port_attr *props); +int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int efa_query_gid(struct ib_device *ibdev, port_t port, int index, + union ib_gid *gid); +int efa_query_pkey(struct ib_device *ibdev, port_t port, u16 index, + u16 *pkey); +int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); +int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); +struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, + u64 length, u64 virt_addr, + int fd, int access_flags, + struct ib_udata *udata); +int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); +int efa_get_port_immutable(struct ib_device *ibdev, port_t port_num, + struct ib_port_immutable *immutable); +int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata); +void efa_dealloc_ucontext(struct ib_ucontext *ibucontext); +int efa_mmap(struct ib_ucontext *ibucontext, + struct vm_area_struct *vma); +void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +int efa_create_ah(struct ib_ah *ibah, + struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata); +int efa_destroy_ah(struct ib_ah *ibah, u32 flags); +int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata); +enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev, + port_t port_num); +struct rdma_hw_stats *efa_alloc_hw_port_stats(struct ib_device *ibdev, port_t port_num); +struct rdma_hw_stats *efa_alloc_hw_device_stats(struct ib_device *ibdev); +int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + port_t port_num, int index); + +#endif /* _EFA_H_ */ diff --git a/drivers/amazon/net/efa/efa_admin_cmds_defs.h b/drivers/amazon/net/efa/efa_admin_cmds_defs.h new file mode 100644 index 0000000000000..7377c8a9f4d5d --- /dev/null +++ b/drivers/amazon/net/efa/efa_admin_cmds_defs.h @@ -0,0 +1,1066 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_ADMIN_CMDS_H_ +#define _EFA_ADMIN_CMDS_H_ + +#define EFA_ADMIN_API_VERSION_MAJOR 0 +#define EFA_ADMIN_API_VERSION_MINOR 1 + +/* EFA admin queue opcodes */ +enum efa_admin_aq_opcode { + EFA_ADMIN_CREATE_QP = 1, + EFA_ADMIN_MODIFY_QP = 2, + EFA_ADMIN_QUERY_QP = 3, + EFA_ADMIN_DESTROY_QP = 4, + EFA_ADMIN_CREATE_AH = 5, + EFA_ADMIN_DESTROY_AH = 6, + EFA_ADMIN_REG_MR = 7, + EFA_ADMIN_DEREG_MR = 8, + EFA_ADMIN_CREATE_CQ = 9, + EFA_ADMIN_DESTROY_CQ = 10, + EFA_ADMIN_GET_FEATURE = 11, + EFA_ADMIN_SET_FEATURE = 12, + EFA_ADMIN_GET_STATS = 13, + EFA_ADMIN_ALLOC_PD = 14, + EFA_ADMIN_DEALLOC_PD = 15, + EFA_ADMIN_ALLOC_UAR = 16, + EFA_ADMIN_DEALLOC_UAR = 17, + EFA_ADMIN_CREATE_EQ = 18, + EFA_ADMIN_DESTROY_EQ = 19, + EFA_ADMIN_MAX_OPCODE = 19, +}; + +enum efa_admin_aq_feature_id { + EFA_ADMIN_DEVICE_ATTR = 1, + EFA_ADMIN_AENQ_CONFIG = 2, + EFA_ADMIN_NETWORK_ATTR = 3, + EFA_ADMIN_QUEUE_ATTR = 4, + EFA_ADMIN_HW_HINTS = 5, + EFA_ADMIN_HOST_INFO = 6, + EFA_ADMIN_EVENT_QUEUE_ATTR = 7, +}; + +/* QP transport type */ +enum efa_admin_qp_type { + /* Unreliable Datagram */ + EFA_ADMIN_QP_TYPE_UD = 1, + /* Scalable Reliable Datagram */ + EFA_ADMIN_QP_TYPE_SRD = 2, +}; + +/* QP state */ +enum efa_admin_qp_state { + EFA_ADMIN_QP_STATE_RESET = 0, + EFA_ADMIN_QP_STATE_INIT = 1, + EFA_ADMIN_QP_STATE_RTR = 2, + EFA_ADMIN_QP_STATE_RTS = 3, + EFA_ADMIN_QP_STATE_SQD = 4, + EFA_ADMIN_QP_STATE_SQE = 5, + EFA_ADMIN_QP_STATE_ERR = 6, +}; + +enum efa_admin_get_stats_type { + EFA_ADMIN_GET_STATS_TYPE_BASIC = 0, + EFA_ADMIN_GET_STATS_TYPE_MESSAGES = 1, + EFA_ADMIN_GET_STATS_TYPE_RDMA_READ = 2, + EFA_ADMIN_GET_STATS_TYPE_RDMA_WRITE = 3, +}; + +enum efa_admin_get_stats_scope { + EFA_ADMIN_GET_STATS_SCOPE_ALL = 0, + EFA_ADMIN_GET_STATS_SCOPE_QUEUE = 1, +}; + +/* + * QP allocation sizes, converted by fabric QueuePair (QP) create command + * from QP capabilities. + */ +struct efa_admin_qp_alloc_size { + /* Send descriptor ring size in bytes */ + u32 send_queue_ring_size; + + /* Max number of WQEs that can be outstanding on send queue. */ + u32 send_queue_depth; + + /* + * Recv descriptor ring size in bytes, sufficient for user-provided + * number of WQEs + */ + u32 recv_queue_ring_size; + + /* Max number of WQEs that can be outstanding on recv queue */ + u32 recv_queue_depth; +}; + +struct efa_admin_create_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Protection Domain associated with this QP */ + u16 pd; + + /* QP type */ + u8 qp_type; + + /* + * 0 : sq_virt - If set, SQ ring base address is + * virtual (IOVA returned by MR registration) + * 1 : rq_virt - If set, RQ ring base address is + * virtual (IOVA returned by MR registration) + * 7:2 : reserved - MBZ + */ + u8 flags; + + /* + * Send queue (SQ) ring base physical address. This field is not + * used if this is a Low Latency Queue(LLQ). + */ + u64 sq_base_addr; + + /* Receive queue (RQ) ring base address. 
*/ + u64 rq_base_addr; + + /* Index of CQ to be associated with Send Queue completions */ + u32 send_cq_idx; + + /* Index of CQ to be associated with Recv Queue completions */ + u32 recv_cq_idx; + + /* + * Memory registration key for the SQ ring, used only when not in + * LLQ mode and base address is virtual + */ + u32 sq_l_key; + + /* + * Memory registration key for the RQ ring, used only when base + * address is virtual + */ + u32 rq_l_key; + + /* Requested QP allocation sizes */ + struct efa_admin_qp_alloc_size qp_alloc_size; + + /* UAR number */ + u16 uar; + + /* MBZ */ + u16 reserved; + + /* MBZ */ + u32 reserved2; +}; + +struct efa_admin_create_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* + * Opaque handle to be used for consequent admin operations on the + * QP + */ + u32 qp_handle; + + /* + * QP number in the given EFA virtual device. Least-significant bits (as + * needed according to max_qp) carry unique QP ID + */ + u16 qp_num; + + /* MBZ */ + u16 reserved; + + /* Index of sub-CQ for Send Queue completions */ + u16 send_sub_cq_idx; + + /* Index of sub-CQ for Receive Queue completions */ + u16 recv_sub_cq_idx; + + /* SQ doorbell address, as offset to PCIe DB BAR */ + u32 sq_db_offset; + + /* RQ doorbell address, as offset to PCIe DB BAR */ + u32 rq_db_offset; + + /* + * low latency send queue ring base address as an offset to PCIe + * MMIO LLQ_MEM BAR + */ + u32 llq_descriptors_offset; +}; + +struct efa_admin_modify_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* + * Mask indicating which fields should be updated + * 0 : qp_state + * 1 : cur_qp_state + * 2 : qkey + * 3 : sq_psn + * 4 : sq_drained_async_notify + * 5 : rnr_retry + * 31:6 : reserved + */ + u32 modify_mask; + + /* QP handle returned by create_qp command */ + u32 qp_handle; + + /* QP state */ + u32 qp_state; + + /* Override current QP state (before applying the transition) */ + u32 cur_qp_state; + + /* QKey */ + u32 qkey; + + /* SQ PSN */ + u32 sq_psn; + + /* Enable async notification when SQ is drained */ + u8 sq_drained_async_notify; + + /* Number of RNR retries (valid only for SRD QPs) */ + u8 rnr_retry; + + /* MBZ */ + u16 reserved2; +}; + +struct efa_admin_modify_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_query_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* QP handle returned by create_qp command */ + u32 qp_handle; +}; + +struct efa_admin_query_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* QP state */ + u32 qp_state; + + /* QKey */ + u32 qkey; + + /* SQ PSN */ + u32 sq_psn; + + /* Indicates that draining is in progress */ + u8 sq_draining; + + /* Number of RNR retries (valid only for SRD QPs) */ + u8 rnr_retry; + + /* MBZ */ + u16 reserved2; +}; + +struct efa_admin_destroy_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* QP handle returned by create_qp command */ + u32 qp_handle; +}; + +struct efa_admin_destroy_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* + * Create Address Handle command parameters. 
Must not be called more than + * once for the same destination + */ +struct efa_admin_create_ah_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Destination address in network byte order */ + u8 dest_addr[16]; + + /* PD number */ + u16 pd; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_create_ah_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* Target interface address handle (opaque) */ + u16 ah; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_destroy_ah_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Target interface address handle (opaque) */ + u16 ah; + + /* PD number */ + u16 pd; +}; + +struct efa_admin_destroy_ah_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* + * Registration of MemoryRegion, required for QP working with Virtual + * Addresses. In standard verbs semantics, region length is limited to 2GB + * space, but EFA offers larger MR support for large memory space, to ease + * on users working with very large datasets (i.e. full GPU memory mapping). + */ +struct efa_admin_reg_mr_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Protection Domain */ + u16 pd; + + /* MBZ */ + u16 reserved16_w1; + + /* Physical Buffer List, each element is page-aligned. */ + union { + /* + * Inline array of guest-physical page addresses of user + * memory pages (optimization for short region + * registrations) + */ + u64 inline_pbl_array[4]; + + /* points to PBL (direct or indirect, chained if needed) */ + struct efa_admin_ctrl_buff_info pbl; + } pbl; + + /* Memory region length, in bytes. */ + u64 mr_length; + + /* + * flags and page size + * 4:0 : phys_page_size_shift - page size is (1 << + * phys_page_size_shift). Page size is used for + * building the Virtual to Physical address mapping + * 6:5 : reserved - MBZ + * 7 : mem_addr_phy_mode_en - Enable bit for physical + * memory registration (no translation), can be used + * only by privileged clients. If set, PBL must + * contain a single entry. + */ + u8 flags; + + /* + * permissions + * 0 : local_write_enable - Local write permissions: + * must be set for RQ buffers and buffers posted for + * RDMA Read requests + * 1 : remote_write_enable - Remote write + * permissions: must be set to enable RDMA write to + * the region + * 2 : remote_read_enable - Remote read permissions: + * must be set to enable RDMA read from the region + * 7:3 : reserved2 - MBZ + */ + u8 permissions; + + /* MBZ */ + u16 reserved16_w5; + + /* number of pages in PBL (redundant, could be calculated) */ + u32 page_num; + + /* + * IO Virtual Address associated with this MR. If + * mem_addr_phy_mode_en is set, contains the physical address of + * the region. 
+ */ + u64 iova; +}; + +struct efa_admin_reg_mr_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* + * L_Key, to be used in conjunction with local buffer references in + * SQ and RQ WQE, or with virtual RQ/CQ rings + */ + u32 l_key; + + /* + * R_Key, to be used in RDMA messages to refer to remotely accessed + * memory region + */ + u32 r_key; + + /* + * Mask indicating which fields have valid values + * 0 : recv_ic_id + * 1 : rdma_read_ic_id + * 2 : rdma_recv_ic_id + */ + u8 validity; + + /* + * Physical interconnect used by the device to reach the MR for receive + * operation + */ + u8 recv_ic_id; + + /* + * Physical interconnect used by the device to reach the MR for RDMA + * read operation + */ + u8 rdma_read_ic_id; + + /* + * Physical interconnect used by the device to reach the MR for RDMA + * write receive + */ + u8 rdma_recv_ic_id; +}; + +struct efa_admin_dereg_mr_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* L_Key, memory region's l_key */ + u32 l_key; +}; + +struct efa_admin_dereg_mr_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_create_cq_cmd { + struct efa_admin_aq_common_desc aq_common_desc; + + /* + * 4:0 : reserved5 - MBZ + * 5 : interrupt_mode_enabled - if set, cq operates + * in interrupt mode (i.e. CQ events and EQ elements + * are generated), otherwise - polling + * 6 : virt - If set, ring base address is virtual + * (IOVA returned by MR registration) + * 7 : reserved6 - MBZ + */ + u8 cq_caps_1; + + /* + * 4:0 : cq_entry_size_words - size of CQ entry in + * 32-bit words, valid values: 4, 8. + * 5 : set_src_addr - If set, source address will be + * filled on RX completions from unknown senders. + * Requires 8 words CQ entry size. + * 7:6 : reserved7 - MBZ + */ + u8 cq_caps_2; + + /* completion queue depth in # of entries. must be power of 2 */ + u16 cq_depth; + + /* EQ number assigned to this cq */ + u16 eqn; + + /* MBZ */ + u16 reserved; + + /* + * CQ ring base address, virtual or physical depending on 'virt' + * flag + */ + struct efa_common_mem_addr cq_ba; + + /* + * Memory registration key for the ring, used only when base + * address is virtual + */ + u32 l_key; + + /* + * number of sub cqs - must be equal to sub_cqs_per_cq of queue + * attributes. + */ + u16 num_sub_cqs; + + /* UAR number */ + u16 uar; +}; + +struct efa_admin_create_cq_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + u16 cq_idx; + + /* actual cq depth in number of entries */ + u16 cq_actual_depth; + + /* CQ doorbell address, as offset to PCIe DB BAR */ + u32 db_offset; + + /* + * 0 : db_valid - If set, doorbell offset is valid. + * Always set when interrupts are requested. + */ + u32 flags; +}; + +struct efa_admin_destroy_cq_cmd { + struct efa_admin_aq_common_desc aq_common_desc; + + u16 cq_idx; + + /* MBZ */ + u16 reserved1; +}; + +struct efa_admin_destroy_cq_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* + * EFA AQ Get Statistics command. 
Extended statistics are placed in control + * buffer pointed by AQ entry + */ +struct efa_admin_aq_get_stats_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + union { + /* command specific inline data */ + u32 inline_data_w1[3]; + + struct efa_admin_ctrl_buff_info control_buffer; + } u; + + /* stats type as defined in enum efa_admin_get_stats_type */ + u8 type; + + /* stats scope defined in enum efa_admin_get_stats_scope */ + u8 scope; + + u16 scope_modifier; +}; + +struct efa_admin_basic_stats { + u64 tx_bytes; + + u64 tx_pkts; + + u64 rx_bytes; + + u64 rx_pkts; + + u64 rx_drops; +}; + +struct efa_admin_messages_stats { + u64 send_bytes; + + u64 send_wrs; + + u64 recv_bytes; + + u64 recv_wrs; +}; + +struct efa_admin_rdma_read_stats { + u64 read_wrs; + + u64 read_bytes; + + u64 read_wr_err; + + u64 read_resp_bytes; +}; + +struct efa_admin_rdma_write_stats { + u64 write_wrs; + + u64 write_bytes; + + u64 write_wr_err; + + u64 write_recv_bytes; +}; + +struct efa_admin_acq_get_stats_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + union { + struct efa_admin_basic_stats basic_stats; + + struct efa_admin_messages_stats messages_stats; + + struct efa_admin_rdma_read_stats rdma_read_stats; + + struct efa_admin_rdma_write_stats rdma_write_stats; + } u; +}; + +struct efa_admin_get_set_feature_common_desc { + /* MBZ */ + u8 reserved0; + + /* as appears in efa_admin_aq_feature_id */ + u8 feature_id; + + /* MBZ */ + u16 reserved16; +}; + +struct efa_admin_feature_device_attr_desc { + /* Bitmap of efa_admin_aq_feature_id */ + u64 supported_features; + + /* Bitmap of supported page sizes in MR registrations */ + u64 page_size_cap; + + u32 fw_version; + + u32 admin_api_version; + + u32 device_version; + + /* Bar used for SQ and RQ doorbells */ + u16 db_bar; + + /* Indicates how many bits are used on physical address access */ + u8 phys_addr_width; + + /* Indicates how many bits are used on virtual address access */ + u8 virt_addr_width; + + /* + * 0 : rdma_read - If set, RDMA Read is supported on + * TX queues + * 1 : rnr_retry - If set, RNR retry is supported on + * modify QP command + * 2 : data_polling_128 - If set, 128 bytes data + * polling is supported + * 3 : rdma_write - If set, RDMA Write is supported + * on TX queues + * 31:4 : reserved - MBZ + */ + u32 device_caps; + + /* Max RDMA transfer size in bytes */ + u32 max_rdma_size; +}; + +struct efa_admin_feature_queue_attr_desc { + /* The maximum number of queue pairs supported */ + u32 max_qp; + + /* Maximum number of WQEs per Send Queue */ + u32 max_sq_depth; + + /* Maximum size of data that can be sent inline in a Send WQE */ + u32 inline_buf_size; + + /* Maximum number of buffer descriptors per Recv Queue */ + u32 max_rq_depth; + + /* The maximum number of completion queues supported per VF */ + u32 max_cq; + + /* Maximum number of CQEs per Completion Queue */ + u32 max_cq_depth; + + /* Number of sub-CQs to be created for each CQ */ + u16 sub_cqs_per_cq; + + /* Minimum number of WQEs per SQ */ + u16 min_sq_depth; + + /* Maximum number of SGEs (buffers) allowed for a single send WQE */ + u16 max_wr_send_sges; + + /* Maximum number of SGEs allowed for a single recv WQE */ + u16 max_wr_recv_sges; + + /* The maximum number of memory regions supported */ + u32 max_mr; + + /* The maximum number of pages can be registered */ + u32 max_mr_pages; + + /* The maximum number of protection domains supported */ + u32 max_pd; + + /* The maximum number of address handles supported */ + u32 max_ah; + + /* The maximum size of LLQ in 
bytes */ + u32 max_llq_size; + + /* Maximum number of SGEs for a single RDMA read/write WQE */ + u16 max_wr_rdma_sges; + + /* + * Maximum number of bytes that can be written to SQ between two + * consecutive doorbells (in units of 64B). Driver must ensure that only + * complete WQEs are written to queue before issuing a doorbell. + * Examples: max_tx_batch=16 and WQE size = 64B, means up to 16 WQEs can + * be written to SQ between two consecutive doorbells. max_tx_batch=11 + * and WQE size = 128B, means up to 5 WQEs can be written to SQ between + * two consecutive doorbells. Zero means unlimited. + */ + u16 max_tx_batch; +}; + +struct efa_admin_event_queue_attr_desc { + /* The maximum number of event queues supported */ + u32 max_eq; + + /* Maximum number of EQEs per Event Queue */ + u32 max_eq_depth; + + /* Supported events bitmask */ + u32 event_bitmask; +}; + +struct efa_admin_feature_aenq_desc { + /* bitmask for AENQ groups the device can report */ + u32 supported_groups; + + /* bitmask for AENQ groups to report */ + u32 enabled_groups; +}; + +struct efa_admin_feature_network_attr_desc { + /* Raw address data in network byte order */ + u8 addr[16]; + + /* max packet payload size in bytes */ + u32 mtu; +}; + +/* + * When hint value is 0, hints capabilities are not supported or driver + * should use its own predefined value + */ +struct efa_admin_hw_hints { + /* value in ms */ + u16 mmio_read_timeout; + + /* value in ms */ + u16 driver_watchdog_timeout; + + /* value in ms */ + u16 admin_completion_timeout; + + /* poll interval in ms */ + u16 poll_interval; +}; + +struct efa_admin_get_feature_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + struct efa_admin_ctrl_buff_info control_buffer; + + struct efa_admin_get_set_feature_common_desc feature_common; + + u32 raw[11]; +}; + +struct efa_admin_get_feature_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + + struct efa_admin_feature_device_attr_desc device_attr; + + struct efa_admin_feature_aenq_desc aenq; + + struct efa_admin_feature_network_attr_desc network_attr; + + struct efa_admin_feature_queue_attr_desc queue_attr; + + struct efa_admin_event_queue_attr_desc event_queue_attr; + + struct efa_admin_hw_hints hw_hints; + } u; +}; + +struct efa_admin_set_feature_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + struct efa_admin_ctrl_buff_info control_buffer; + + struct efa_admin_get_set_feature_common_desc feature_common; + + union { + u32 raw[11]; + + /* AENQ configuration */ + struct efa_admin_feature_aenq_desc aenq; + } u; +}; + +struct efa_admin_set_feature_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + } u; +}; + +struct efa_admin_alloc_pd_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; +}; + +struct efa_admin_alloc_pd_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + /* PD number */ + u16 pd; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_pd_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* PD number */ + u16 pd; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_pd_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_alloc_uar_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; +}; + +struct efa_admin_alloc_uar_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + /* UAR number */ + u16 uar; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_uar_cmd { + struct efa_admin_aq_common_desc 
aq_common_descriptor; + + /* UAR number */ + u16 uar; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_uar_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_create_eq_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* Size of the EQ in entries, must be power of 2 */ + u16 depth; + + /* MSI-X table entry index */ + u8 msix_vec; + + /* + * 4:0 : entry_size_words - size of EQ entry in + * 32-bit words + * 7:5 : reserved - MBZ + */ + u8 caps; + + /* EQ ring base address */ + struct efa_common_mem_addr ba; + + /* + * Enabled events on this EQ + * 0 : completion_events - Enable completion events + * 31:1 : reserved - MBZ + */ + u32 event_bitmask; + + /* MBZ */ + u32 reserved; +}; + +struct efa_admin_create_eq_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + /* EQ number */ + u16 eqn; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_destroy_eq_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* EQ number */ + u16 eqn; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_destroy_eq_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* asynchronous event notification groups */ +enum efa_admin_aenq_group { + EFA_ADMIN_FATAL_ERROR = 1, + EFA_ADMIN_WARNING = 2, + EFA_ADMIN_NOTIFICATION = 3, + EFA_ADMIN_KEEP_ALIVE = 4, + EFA_ADMIN_AENQ_GROUPS_NUM = 5, +}; + +struct efa_admin_mmio_req_read_less_resp { + u16 req_id; + + u16 reg_off; + + /* value is valid when poll is cleared */ + u32 reg_val; +}; + +enum efa_admin_os_type { + EFA_ADMIN_OS_LINUX = 0, +}; + +struct efa_admin_host_info { + /* OS distribution string format */ + u8 os_dist_str[128]; + + /* Defined in enum efa_admin_os_type */ + u32 os_type; + + /* Kernel version string format */ + u8 kernel_ver_str[32]; + + /* Kernel version numeric format */ + u32 kernel_ver; + + /* + * 7:0 : driver_module_type + * 15:8 : driver_sub_minor + * 23:16 : driver_minor + * 31:24 : driver_major + */ + u32 driver_ver; + + /* + * Device's Bus, Device and Function + * 2:0 : function + * 7:3 : device + * 15:8 : bus + */ + u16 bdf; + + /* + * Spec version + * 7:0 : spec_minor + * 15:8 : spec_major + */ + u16 spec_ver; + + /* + * 0 : intree - Intree driver + * 1 : gdr - GPUDirect RDMA supported + * 31:2 : reserved2 + */ + u32 flags; +}; + +/* create_qp_cmd */ +#define EFA_ADMIN_CREATE_QP_CMD_SQ_VIRT_MASK BIT(0) +#define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_MASK BIT(1) + +/* modify_qp_cmd */ +#define EFA_ADMIN_MODIFY_QP_CMD_QP_STATE_MASK BIT(0) +#define EFA_ADMIN_MODIFY_QP_CMD_CUR_QP_STATE_MASK BIT(1) +#define EFA_ADMIN_MODIFY_QP_CMD_QKEY_MASK BIT(2) +#define EFA_ADMIN_MODIFY_QP_CMD_SQ_PSN_MASK BIT(3) +#define EFA_ADMIN_MODIFY_QP_CMD_SQ_DRAINED_ASYNC_NOTIFY_MASK BIT(4) +#define EFA_ADMIN_MODIFY_QP_CMD_RNR_RETRY_MASK BIT(5) + +/* reg_mr_cmd */ +#define EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK GENMASK(4, 0) +#define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_MASK BIT(7) +#define EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK BIT(0) +#define EFA_ADMIN_REG_MR_CMD_REMOTE_WRITE_ENABLE_MASK BIT(1) +#define EFA_ADMIN_REG_MR_CMD_REMOTE_READ_ENABLE_MASK BIT(2) + +/* reg_mr_resp */ +#define EFA_ADMIN_REG_MR_RESP_RECV_IC_ID_MASK BIT(0) +#define EFA_ADMIN_REG_MR_RESP_RDMA_READ_IC_ID_MASK BIT(1) +#define EFA_ADMIN_REG_MR_RESP_RDMA_RECV_IC_ID_MASK BIT(2) + +/* create_cq_cmd */ +#define EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK BIT(5) +#define EFA_ADMIN_CREATE_CQ_CMD_VIRT_MASK BIT(6) +#define EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) +#define 
EFA_ADMIN_CREATE_CQ_CMD_SET_SRC_ADDR_MASK BIT(5) + +/* create_cq_resp */ +#define EFA_ADMIN_CREATE_CQ_RESP_DB_VALID_MASK BIT(0) + +/* feature_device_attr_desc */ +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_READ_MASK BIT(0) +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK BIT(1) +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_DATA_POLLING_128_MASK BIT(2) +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_WRITE_MASK BIT(3) + +/* create_eq_cmd */ +#define EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) +#define EFA_ADMIN_CREATE_EQ_CMD_VIRT_MASK BIT(6) +#define EFA_ADMIN_CREATE_EQ_CMD_COMPLETION_EVENTS_MASK BIT(0) + +/* host_info */ +#define EFA_ADMIN_HOST_INFO_DRIVER_MODULE_TYPE_MASK GENMASK(7, 0) +#define EFA_ADMIN_HOST_INFO_DRIVER_SUB_MINOR_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_DRIVER_MINOR_MASK GENMASK(23, 16) +#define EFA_ADMIN_HOST_INFO_DRIVER_MAJOR_MASK GENMASK(31, 24) +#define EFA_ADMIN_HOST_INFO_FUNCTION_MASK GENMASK(2, 0) +#define EFA_ADMIN_HOST_INFO_DEVICE_MASK GENMASK(7, 3) +#define EFA_ADMIN_HOST_INFO_BUS_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_SPEC_MINOR_MASK GENMASK(7, 0) +#define EFA_ADMIN_HOST_INFO_SPEC_MAJOR_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_INTREE_MASK BIT(0) +#define EFA_ADMIN_HOST_INFO_GDR_MASK BIT(1) + +#endif /* _EFA_ADMIN_CMDS_H_ */ diff --git a/drivers/amazon/net/efa/efa_admin_defs.h b/drivers/amazon/net/efa/efa_admin_defs.h new file mode 100644 index 0000000000000..83f20c38a8400 --- /dev/null +++ b/drivers/amazon/net/efa/efa_admin_defs.h @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_ADMIN_H_ +#define _EFA_ADMIN_H_ + +enum efa_admin_aq_completion_status { + EFA_ADMIN_SUCCESS = 0, + EFA_ADMIN_RESOURCE_ALLOCATION_FAILURE = 1, + EFA_ADMIN_BAD_OPCODE = 2, + EFA_ADMIN_UNSUPPORTED_OPCODE = 3, + EFA_ADMIN_MALFORMED_REQUEST = 4, + /* Additional status is provided in ACQ entry extended_status */ + EFA_ADMIN_ILLEGAL_PARAMETER = 5, + EFA_ADMIN_UNKNOWN_ERROR = 6, + EFA_ADMIN_RESOURCE_BUSY = 7, +}; + +struct efa_admin_aq_common_desc { + /* + * 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command_id; + + /* as appears in efa_admin_aq_opcode */ + u8 opcode; + + /* + * 0 : phase + * 1 : ctrl_data - control buffer address valid + * 2 : ctrl_data_indirect - control buffer address + * points to list of pages with addresses of control + * buffers + * 7:3 : reserved3 + */ + u8 flags; +}; + +/* + * used in efa_admin_aq_entry. Can point directly to control data, or to a + * page list chunk. Used also at the end of indirect mode page list chunks, + * for chaining. 
+ */ +struct efa_admin_ctrl_buff_info { + u32 length; + + struct efa_common_mem_addr address; +}; + +struct efa_admin_aq_entry { + struct efa_admin_aq_common_desc aq_common_descriptor; + + union { + u32 inline_data_w1[3]; + + struct efa_admin_ctrl_buff_info control_buffer; + } u; + + u32 inline_data_w4[12]; +}; + +struct efa_admin_acq_common_desc { + /* + * command identifier to associate it with the aq descriptor + * 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command; + + u8 status; + + /* + * 0 : phase + * 7:1 : reserved1 + */ + u8 flags; + + u16 extended_status; + + /* + * indicates to the driver which AQ entry has been consumed by the + * device and could be reused + */ + u16 sq_head_indx; +}; + +struct efa_admin_acq_entry { + struct efa_admin_acq_common_desc acq_common_descriptor; + + u32 response_specific_data[14]; +}; + +struct efa_admin_aenq_common_desc { + u16 group; + + u16 syndrom; + + /* + * 0 : phase + * 7:1 : reserved - MBZ + */ + u8 flags; + + u8 reserved1[3]; + + u32 timestamp_low; + + u32 timestamp_high; +}; + +struct efa_admin_aenq_entry { + struct efa_admin_aenq_common_desc aenq_common_desc; + + /* command specific inline data */ + u32 inline_data_w4[12]; +}; + +enum efa_admin_eqe_event_type { + EFA_ADMIN_EQE_EVENT_TYPE_COMPLETION = 0, +}; + +/* Completion event */ +struct efa_admin_comp_event { + /* CQ number */ + u16 cqn; + + /* MBZ */ + u16 reserved; + + /* MBZ */ + u32 reserved2; +}; + +/* Event Queue Element */ +struct efa_admin_eqe { + /* + * 0 : phase + * 8:1 : event_type - Event type + * 31:9 : reserved - MBZ + */ + u32 common; + + /* MBZ */ + u32 reserved; + + union { + /* Event data */ + u32 event_data[2]; + + /* Completion Event */ + struct efa_admin_comp_event comp_event; + } u; +}; + +/* aq_common_desc */ +#define EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK BIT(0) +#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK BIT(1) +#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK BIT(2) + +/* acq_common_desc */ +#define EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define EFA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* aenq_common_desc */ +#define EFA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* eqe */ +#define EFA_ADMIN_EQE_PHASE_MASK BIT(0) +#define EFA_ADMIN_EQE_EVENT_TYPE_MASK GENMASK(8, 1) + +#endif /* _EFA_ADMIN_H_ */ diff --git a/drivers/amazon/net/efa/efa_com.c b/drivers/amazon/net/efa/efa_com.c new file mode 100644 index 0000000000000..d0b13097a0967 --- /dev/null +++ b/drivers/amazon/net/efa/efa_com.c @@ -0,0 +1,1251 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "efa_com.h" +#include "efa_regs_defs.h" + +#define ADMIN_CMD_TIMEOUT_US 30000000 /* usecs */ + +#define EFA_REG_READ_TIMEOUT_US 50000 /* usecs */ +#define EFA_MMIO_READ_INVALID 0xffffffff + +#define EFA_POLL_INTERVAL_MS 100 /* msecs */ + +#define EFA_ASYNC_QUEUE_DEPTH 16 +#define EFA_ADMIN_QUEUE_DEPTH 32 + +#define EFA_CTRL_MAJOR 0 +#define EFA_CTRL_MINOR 0 +#define EFA_CTRL_SUB_MINOR 1 + +enum efa_cmd_status { + EFA_CMD_SUBMITTED, + EFA_CMD_COMPLETED, +}; + +struct efa_comp_ctx { + struct completion wait_event; + struct efa_admin_acq_entry *user_cqe; + u32 comp_size; + enum efa_cmd_status status; + u8 cmd_opcode; + u8 occupied; +}; + +static const char *efa_com_cmd_str(u8 cmd) +{ +#define EFA_CMD_STR_CASE(_cmd) case EFA_ADMIN_##_cmd: return #_cmd + + switch (cmd) { + EFA_CMD_STR_CASE(CREATE_QP); + EFA_CMD_STR_CASE(MODIFY_QP); + EFA_CMD_STR_CASE(QUERY_QP); + EFA_CMD_STR_CASE(DESTROY_QP); + EFA_CMD_STR_CASE(CREATE_AH); + EFA_CMD_STR_CASE(DESTROY_AH); + EFA_CMD_STR_CASE(REG_MR); + EFA_CMD_STR_CASE(DEREG_MR); + EFA_CMD_STR_CASE(CREATE_CQ); + EFA_CMD_STR_CASE(DESTROY_CQ); + EFA_CMD_STR_CASE(GET_FEATURE); + EFA_CMD_STR_CASE(SET_FEATURE); + EFA_CMD_STR_CASE(GET_STATS); + EFA_CMD_STR_CASE(ALLOC_PD); + EFA_CMD_STR_CASE(DEALLOC_PD); + EFA_CMD_STR_CASE(ALLOC_UAR); + EFA_CMD_STR_CASE(DEALLOC_UAR); + EFA_CMD_STR_CASE(CREATE_EQ); + EFA_CMD_STR_CASE(DESTROY_EQ); + default: return "unknown command opcode"; + } +#undef EFA_CMD_STR_CASE +} + +void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low) +{ + *addr_low = lower_32_bits(addr); + *addr_high = upper_32_bits(addr); +} + +static u32 efa_com_reg_read32(struct efa_com_dev *edev, u16 offset) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + struct efa_admin_mmio_req_read_less_resp *read_resp; + unsigned long exp_time; + u32 mmio_read_reg = 0; + u32 err; + + read_resp = mmio_read->read_resp; + + spin_lock(&mmio_read->lock); + mmio_read->seq_num++; + + /* trash DMA req_id to identify when hardware is done */ + read_resp->req_id = mmio_read->seq_num + 0x9aL; + EFA_SET(&mmio_read_reg, EFA_REGS_MMIO_REG_READ_REG_OFF, offset); + EFA_SET(&mmio_read_reg, EFA_REGS_MMIO_REG_READ_REQ_ID, + mmio_read->seq_num); + + writel(mmio_read_reg, edev->reg_bar + EFA_REGS_MMIO_REG_READ_OFF); + + exp_time = jiffies + usecs_to_jiffies(mmio_read->mmio_read_timeout); + do { + if (READ_ONCE(read_resp->req_id) == mmio_read->seq_num) + break; + udelay(1); + } while (time_is_after_jiffies(exp_time)); + + if (read_resp->req_id != mmio_read->seq_num) { + ibdev_err_ratelimited( + edev->efa_dev, + "Reading register timed out. 
expected: req id[%u] offset[%#x] actual: req id[%u] offset[%#x]\n", + mmio_read->seq_num, offset, read_resp->req_id, + read_resp->reg_off); + err = EFA_MMIO_READ_INVALID; + goto out; + } + + if (read_resp->reg_off != offset) { + ibdev_err_ratelimited( + edev->efa_dev, + "Reading register failed: wrong offset provided\n"); + err = EFA_MMIO_READ_INVALID; + goto out; + } + + err = read_resp->reg_val; +out: + spin_unlock(&mmio_read->lock); + return err; +} + +static int efa_com_admin_init_sq(struct efa_com_dev *edev) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_com_admin_sq *sq = &aq->sq; + u16 size = aq->depth * sizeof(*sq->entries); + u32 aq_caps = 0; + u32 addr_high; + u32 addr_low; + + sq->entries = + dma_alloc_coherent(aq->dmadev, size, &sq->dma_addr, GFP_KERNEL); + if (!sq->entries) + return -ENOMEM; + + spin_lock_init(&sq->lock); + + sq->cc = 0; + sq->pc = 0; + sq->phase = 1; + + sq->db_addr = (u32 __iomem *)(edev->reg_bar + EFA_REGS_AQ_PROD_DB_OFF); + + addr_high = upper_32_bits(sq->dma_addr); + addr_low = lower_32_bits(sq->dma_addr); + + writel(addr_low, edev->reg_bar + EFA_REGS_AQ_BASE_LO_OFF); + writel(addr_high, edev->reg_bar + EFA_REGS_AQ_BASE_HI_OFF); + + EFA_SET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_DEPTH, aq->depth); + EFA_SET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE, + sizeof(struct efa_admin_aq_entry)); + + writel(aq_caps, edev->reg_bar + EFA_REGS_AQ_CAPS_OFF); + + return 0; +} + +static int efa_com_admin_init_cq(struct efa_com_dev *edev) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_com_admin_cq *cq = &aq->cq; + u16 size = aq->depth * sizeof(*cq->entries); + u32 acq_caps = 0; + u32 addr_high; + u32 addr_low; + + cq->entries = + dma_alloc_coherent(aq->dmadev, size, &cq->dma_addr, GFP_KERNEL); + if (!cq->entries) + return -ENOMEM; + + spin_lock_init(&cq->lock); + + cq->cc = 0; + cq->phase = 1; + + addr_high = upper_32_bits(cq->dma_addr); + addr_low = lower_32_bits(cq->dma_addr); + + writel(addr_low, edev->reg_bar + EFA_REGS_ACQ_BASE_LO_OFF); + writel(addr_high, edev->reg_bar + EFA_REGS_ACQ_BASE_HI_OFF); + + EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_DEPTH, aq->depth); + EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE, + sizeof(struct efa_admin_acq_entry)); + EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR, + aq->msix_vector_idx); + + writel(acq_caps, edev->reg_bar + EFA_REGS_ACQ_CAPS_OFF); + + return 0; +} + +static int efa_com_admin_init_aenq(struct efa_com_dev *edev, + struct efa_aenq_handlers *aenq_handlers) +{ + struct efa_com_aenq *aenq = &edev->aenq; + u32 addr_low, addr_high; + u32 aenq_caps = 0; + u16 size; + + if (!aenq_handlers) { + ibdev_err(edev->efa_dev, "aenq handlers pointer is NULL\n"); + return -EINVAL; + } + + size = EFA_ASYNC_QUEUE_DEPTH * sizeof(*aenq->entries); + aenq->entries = dma_alloc_coherent(edev->dmadev, size, &aenq->dma_addr, + GFP_KERNEL); + if (!aenq->entries) + return -ENOMEM; + + aenq->aenq_handlers = aenq_handlers; + aenq->depth = EFA_ASYNC_QUEUE_DEPTH; + aenq->cc = 0; + aenq->phase = 1; + + addr_low = lower_32_bits(aenq->dma_addr); + addr_high = upper_32_bits(aenq->dma_addr); + + writel(addr_low, edev->reg_bar + EFA_REGS_AENQ_BASE_LO_OFF); + writel(addr_high, edev->reg_bar + EFA_REGS_AENQ_BASE_HI_OFF); + + EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_DEPTH, aenq->depth); + EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE, + sizeof(struct efa_admin_aenq_entry)); + EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR, + aenq->msix_vector_idx); + writel(aenq_caps, edev->reg_bar + EFA_REGS_AENQ_CAPS_OFF); + 
+ /* + * Init cons_db to mark that all entries in the queue + * are initially available + */ + writel(edev->aenq.cc, edev->reg_bar + EFA_REGS_AENQ_CONS_DB_OFF); + + return 0; +} + +/* ID to be used with efa_com_get_comp_ctx */ +static u16 efa_com_alloc_ctx_id(struct efa_com_admin_queue *aq) +{ + u16 ctx_id; + + spin_lock(&aq->comp_ctx_lock); + ctx_id = aq->comp_ctx_pool[aq->comp_ctx_pool_next]; + aq->comp_ctx_pool_next++; + spin_unlock(&aq->comp_ctx_lock); + + return ctx_id; +} + +static void efa_com_dealloc_ctx_id(struct efa_com_admin_queue *aq, + u16 ctx_id) +{ + spin_lock(&aq->comp_ctx_lock); + aq->comp_ctx_pool_next--; + aq->comp_ctx_pool[aq->comp_ctx_pool_next] = ctx_id; + spin_unlock(&aq->comp_ctx_lock); +} + +static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq, + struct efa_comp_ctx *comp_ctx) +{ + u16 cmd_id = EFA_GET(&comp_ctx->user_cqe->acq_common_descriptor.command, + EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); + u16 ctx_id = cmd_id & (aq->depth - 1); + + ibdev_dbg(aq->efa_dev, "Put completion command_id %#x\n", cmd_id); + comp_ctx->occupied = 0; + efa_com_dealloc_ctx_id(aq, ctx_id); +} + +static struct efa_comp_ctx *efa_com_get_comp_ctx(struct efa_com_admin_queue *aq, + u16 cmd_id, bool capture) +{ + u16 ctx_id = cmd_id & (aq->depth - 1); + + if (aq->comp_ctx[ctx_id].occupied && capture) { + ibdev_err_ratelimited( + aq->efa_dev, + "Completion context for command_id %#x is occupied\n", + cmd_id); + return NULL; + } + + if (capture) { + aq->comp_ctx[ctx_id].occupied = 1; + ibdev_dbg(aq->efa_dev, + "Take completion ctxt for command_id %#x\n", cmd_id); + } + + return &aq->comp_ctx[ctx_id]; +} + +static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct efa_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + struct efa_admin_aq_entry *aqe; + struct efa_comp_ctx *comp_ctx; + u16 queue_size_mask; + u16 cmd_id; + u16 ctx_id; + u16 pi; + + queue_size_mask = aq->depth - 1; + pi = aq->sq.pc & queue_size_mask; + + ctx_id = efa_com_alloc_ctx_id(aq); + + /* cmd_id LSBs are the ctx_id and MSBs are entropy bits from pc */ + cmd_id = ctx_id & queue_size_mask; + cmd_id |= aq->sq.pc & ~queue_size_mask; + cmd_id &= EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; + + cmd->aq_common_descriptor.command_id = cmd_id; + EFA_SET(&cmd->aq_common_descriptor.flags, + EFA_ADMIN_AQ_COMMON_DESC_PHASE, aq->sq.phase); + + comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, true); + if (!comp_ctx) { + efa_com_dealloc_ctx_id(aq, ctx_id); + return ERR_PTR(-EINVAL); + } + + comp_ctx->status = EFA_CMD_SUBMITTED; + comp_ctx->comp_size = comp_size_in_bytes; + comp_ctx->user_cqe = comp; + comp_ctx->cmd_opcode = cmd->aq_common_descriptor.opcode; + + reinit_completion(&comp_ctx->wait_event); + + aqe = &aq->sq.entries[pi]; + memset(aqe, 0, sizeof(*aqe)); + memcpy(aqe, cmd, cmd_size_in_bytes); + + aq->sq.pc++; + atomic64_inc(&aq->stats.submitted_cmd); + + if ((aq->sq.pc & queue_size_mask) == 0) + aq->sq.phase = !aq->sq.phase; + + /* barrier not needed in case of writel */ + writel(aq->sq.pc, aq->sq.db_addr); + + return comp_ctx; +} + +static inline int efa_com_init_comp_ctxt(struct efa_com_admin_queue *aq) +{ + size_t pool_size = aq->depth * sizeof(*aq->comp_ctx_pool); + size_t size = aq->depth * sizeof(struct efa_comp_ctx); + struct efa_comp_ctx *comp_ctx; + u16 i; + + aq->comp_ctx = devm_kzalloc(aq->dmadev, size, GFP_KERNEL); + aq->comp_ctx_pool = devm_kzalloc(aq->dmadev, pool_size, GFP_KERNEL); + if (!aq->comp_ctx || 
!aq->comp_ctx_pool) { + devm_kfree(aq->dmadev, aq->comp_ctx_pool); + devm_kfree(aq->dmadev, aq->comp_ctx); + return -ENOMEM; + } + + for (i = 0; i < aq->depth; i++) { + comp_ctx = efa_com_get_comp_ctx(aq, i, false); + if (comp_ctx) + init_completion(&comp_ctx->wait_event); + + aq->comp_ctx_pool[i] = i; + } + + spin_lock_init(&aq->comp_ctx_lock); + + aq->comp_ctx_pool_next = 0; + + return 0; +} + +static struct efa_comp_ctx *efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct efa_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + struct efa_comp_ctx *comp_ctx; + + spin_lock(&aq->sq.lock); + if (!test_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state)) { + ibdev_err_ratelimited(aq->efa_dev, "Admin queue is closed\n"); + spin_unlock(&aq->sq.lock); + return ERR_PTR(-ENODEV); + } + + comp_ctx = __efa_com_submit_admin_cmd(aq, cmd, cmd_size_in_bytes, comp, + comp_size_in_bytes); + spin_unlock(&aq->sq.lock); + if (IS_ERR(comp_ctx)) + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + + return comp_ctx; +} + +static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq, + struct efa_admin_acq_entry *cqe) +{ + struct efa_comp_ctx *comp_ctx; + u16 cmd_id; + + cmd_id = EFA_GET(&cqe->acq_common_descriptor.command, + EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); + + comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false); + if (!comp_ctx) { + ibdev_err( + aq->efa_dev, + "comp_ctx is NULL. Changing the admin queue running state\n"); + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + return; + } + + comp_ctx->status = EFA_CMD_COMPLETED; + memcpy(comp_ctx->user_cqe, cqe, comp_ctx->comp_size); + + if (!test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state)) + complete(&comp_ctx->wait_event); +} + +static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq) +{ + struct efa_admin_acq_entry *cqe; + u16 queue_size_mask; + u16 comp_num = 0; + u8 phase; + u16 ci; + + queue_size_mask = aq->depth - 1; + + ci = aq->cq.cc & queue_size_mask; + phase = aq->cq.phase; + + cqe = &aq->cq.entries[ci]; + + /* Go over all the completions */ + while ((READ_ONCE(cqe->acq_common_descriptor.flags) & + EFA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK) == phase) { + /* + * Do not read the rest of the completion entry before the + * phase bit was validated + */ + dma_rmb(); + efa_com_handle_single_admin_completion(aq, cqe); + + ci++; + comp_num++; + if (ci == aq->depth) { + ci = 0; + phase = !phase; + } + + cqe = &aq->cq.entries[ci]; + } + + aq->cq.cc += comp_num; + aq->cq.phase = phase; + aq->sq.cc += comp_num; + atomic64_add(comp_num, &aq->stats.completed_cmd); +} + +static int efa_com_comp_status_to_errno(u8 comp_status) +{ + switch (comp_status) { + case EFA_ADMIN_SUCCESS: + return 0; + case EFA_ADMIN_RESOURCE_ALLOCATION_FAILURE: + return -ENOMEM; + case EFA_ADMIN_UNSUPPORTED_OPCODE: + return -EOPNOTSUPP; + case EFA_ADMIN_BAD_OPCODE: + case EFA_ADMIN_MALFORMED_REQUEST: + case EFA_ADMIN_ILLEGAL_PARAMETER: + case EFA_ADMIN_UNKNOWN_ERROR: + return -EINVAL; + default: + return -EINVAL; + } +} + +static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_ctx, + struct efa_com_admin_queue *aq) +{ + unsigned long timeout; + unsigned long flags; + int err; + + timeout = jiffies + usecs_to_jiffies(aq->completion_timeout); + + while (1) { + spin_lock_irqsave(&aq->cq.lock, flags); + efa_com_handle_admin_completion(aq); + spin_unlock_irqrestore(&aq->cq.lock, flags); + + if (comp_ctx->status != EFA_CMD_SUBMITTED) + break; + + if 
(time_is_before_jiffies(timeout)) { + ibdev_err_ratelimited( + aq->efa_dev, + "Wait for completion (polling) timeout\n"); + /* EFA didn't have any completion */ + atomic64_inc(&aq->stats.no_completion); + + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + err = -ETIME; + goto out; + } + + msleep(aq->poll_interval); + } + + err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status); +out: + efa_com_put_comp_ctx(aq, comp_ctx); + return err; +} + +static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx *comp_ctx, + struct efa_com_admin_queue *aq) +{ + unsigned long flags; + int err; + + wait_for_completion_timeout(&comp_ctx->wait_event, + usecs_to_jiffies(aq->completion_timeout)); + + /* + * In case the command wasn't completed find out the root cause. + * There might be 2 kinds of errors + * 1) No completion (timeout reached) + * 2) There is completion but the device didn't get any msi-x interrupt. + */ + if (comp_ctx->status == EFA_CMD_SUBMITTED) { + spin_lock_irqsave(&aq->cq.lock, flags); + efa_com_handle_admin_completion(aq); + spin_unlock_irqrestore(&aq->cq.lock, flags); + + atomic64_inc(&aq->stats.no_completion); + + if (comp_ctx->status == EFA_CMD_COMPLETED) + ibdev_err_ratelimited( + aq->efa_dev, + "The device sent a completion but the driver didn't receive any MSI-X interrupt for admin cmd %s(%d) status %d (ctx: 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", + efa_com_cmd_str(comp_ctx->cmd_opcode), + comp_ctx->cmd_opcode, comp_ctx->status, + comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc); + else + ibdev_err_ratelimited( + aq->efa_dev, + "The device didn't send any completion for admin cmd %s(%d) status %d (ctx 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", + efa_com_cmd_str(comp_ctx->cmd_opcode), + comp_ctx->cmd_opcode, comp_ctx->status, + comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc); + + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + err = -ETIME; + goto out; + } + + err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status); +out: + efa_com_put_comp_ctx(aq, comp_ctx); + return err; +} + +/* + * There are two types to wait for completion. + * Polling mode - wait until the completion is available. + * Async mode - wait on wait queue until the completion is ready + * (or the timeout expired). + * It is expected that the IRQ called efa_com_handle_admin_completion + * to mark the completions. + */ +static int efa_com_wait_and_process_admin_cq(struct efa_comp_ctx *comp_ctx, + struct efa_com_admin_queue *aq) +{ + if (test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state)) + return efa_com_wait_and_process_admin_cq_polling(comp_ctx, aq); + + return efa_com_wait_and_process_admin_cq_interrupts(comp_ctx, aq); +} + +/** + * efa_com_cmd_exec - Execute admin command + * @aq: admin queue. + * @cmd: the admin command to execute. + * @cmd_size: the command size. + * @comp: command completion return entry. + * @comp_size: command completion size. + * Submit an admin command and then wait until the device will return a + * completion. + * The completion will be copied into comp. + * + * @return - 0 on success, negative value on failure. 
+ */ +int efa_com_cmd_exec(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size, + struct efa_admin_acq_entry *comp, + size_t comp_size) +{ + struct efa_comp_ctx *comp_ctx; + int err; + + might_sleep(); + + /* In case of queue FULL */ + down(&aq->avail_cmds); + + ibdev_dbg(aq->efa_dev, "%s (opcode %d)\n", + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), + cmd->aq_common_descriptor.opcode); + comp_ctx = efa_com_submit_admin_cmd(aq, cmd, cmd_size, comp, comp_size); + if (IS_ERR(comp_ctx)) { + ibdev_err_ratelimited( + aq->efa_dev, + "Failed to submit command %s (opcode %u) err %ld\n", + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), + cmd->aq_common_descriptor.opcode, PTR_ERR(comp_ctx)); + + up(&aq->avail_cmds); + atomic64_inc(&aq->stats.cmd_err); + return PTR_ERR(comp_ctx); + } + + err = efa_com_wait_and_process_admin_cq(comp_ctx, aq); + if (err) { + ibdev_err_ratelimited( + aq->efa_dev, + "Failed to process command %s (opcode %u) comp_status %d err %d\n", + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), + cmd->aq_common_descriptor.opcode, + comp_ctx->user_cqe->acq_common_descriptor.status, err); + atomic64_inc(&aq->stats.cmd_err); + } + + up(&aq->avail_cmds); + + return err; +} + +/** + * efa_com_admin_destroy - Destroy the admin and the async events queues. + * @edev: EFA communication layer struct + */ +void efa_com_admin_destroy(struct efa_com_dev *edev) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_com_aenq *aenq = &edev->aenq; + struct efa_com_admin_cq *cq = &aq->cq; + struct efa_com_admin_sq *sq = &aq->sq; + u16 size; + + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + + devm_kfree(edev->dmadev, aq->comp_ctx_pool); + devm_kfree(edev->dmadev, aq->comp_ctx); + + size = aq->depth * sizeof(*sq->entries); + dma_free_coherent(edev->dmadev, size, sq->entries, sq->dma_addr); + + size = aq->depth * sizeof(*cq->entries); + dma_free_coherent(edev->dmadev, size, cq->entries, cq->dma_addr); + + size = aenq->depth * sizeof(*aenq->entries); + dma_free_coherent(edev->dmadev, size, aenq->entries, aenq->dma_addr); +} + +/** + * efa_com_set_admin_polling_mode - Set the admin completion queue polling mode + * @edev: EFA communication layer struct + * @polling: Enable/Disable polling mode + * + * Set the admin completion mode. + */ +void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling) +{ + u32 mask_value = 0; + + if (polling) + EFA_SET(&mask_value, EFA_REGS_INTR_MASK_EN, 1); + + writel(mask_value, edev->reg_bar + EFA_REGS_INTR_MASK_OFF); + if (polling) + set_bit(EFA_AQ_STATE_POLLING_BIT, &edev->aq.state); + else + clear_bit(EFA_AQ_STATE_POLLING_BIT, &edev->aq.state); +} + +static void efa_com_stats_init(struct efa_com_dev *edev) +{ + atomic64_t *s = (atomic64_t *)&edev->aq.stats; + int i; + + for (i = 0; i < sizeof(edev->aq.stats) / sizeof(*s); i++, s++) + atomic64_set(s, 0); +} + +/** + * efa_com_admin_init - Init the admin and the async queues + * @edev: EFA communication layer struct + * @aenq_handlers: Those handlers to be called upon event. + * + * Initialize the admin submission and completion queues. + * Initialize the asynchronous events notification queues. + * + * @return - 0 on success, negative value on failure. 
+ */ +int efa_com_admin_init(struct efa_com_dev *edev, + struct efa_aenq_handlers *aenq_handlers) +{ + struct efa_com_admin_queue *aq = &edev->aq; + u32 timeout; + u32 dev_sts; + u32 cap; + int err; + + dev_sts = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); + if (!EFA_GET(&dev_sts, EFA_REGS_DEV_STS_READY)) { + ibdev_err(edev->efa_dev, + "Device isn't ready, abort com init %#x\n", dev_sts); + return -ENODEV; + } + + aq->depth = EFA_ADMIN_QUEUE_DEPTH; + + aq->dmadev = edev->dmadev; + aq->efa_dev = edev->efa_dev; + set_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state); + + sema_init(&aq->avail_cmds, aq->depth); + + efa_com_stats_init(edev); + + err = efa_com_init_comp_ctxt(aq); + if (err) + return err; + + err = efa_com_admin_init_sq(edev); + if (err) + goto err_destroy_comp_ctxt; + + err = efa_com_admin_init_cq(edev); + if (err) + goto err_destroy_sq; + + efa_com_set_admin_polling_mode(edev, false); + + err = efa_com_admin_init_aenq(edev, aenq_handlers); + if (err) + goto err_destroy_cq; + + cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); + timeout = EFA_GET(&cap, EFA_REGS_CAPS_ADMIN_CMD_TO); + if (timeout) + /* the resolution of timeout reg is 100ms */ + aq->completion_timeout = timeout * 100000; + else + aq->completion_timeout = ADMIN_CMD_TIMEOUT_US; + + aq->poll_interval = EFA_POLL_INTERVAL_MS; + + set_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + + return 0; + +err_destroy_cq: + dma_free_coherent(edev->dmadev, aq->depth * sizeof(*aq->cq.entries), + aq->cq.entries, aq->cq.dma_addr); +err_destroy_sq: + dma_free_coherent(edev->dmadev, aq->depth * sizeof(*aq->sq.entries), + aq->sq.entries, aq->sq.dma_addr); +err_destroy_comp_ctxt: + devm_kfree(edev->dmadev, aq->comp_ctx); + + return err; +} + +/** + * efa_com_admin_q_comp_intr_handler - admin queue interrupt handler + * @edev: EFA communication layer struct + * + * This method goes over the admin completion queue and wakes up + * all the pending threads that wait on the commands wait event. + * + * Note: Should be called after MSI-X interrupt. + */ +void efa_com_admin_q_comp_intr_handler(struct efa_com_dev *edev) +{ + unsigned long flags; + + spin_lock_irqsave(&edev->aq.cq.lock, flags); + efa_com_handle_admin_completion(&edev->aq); + spin_unlock_irqrestore(&edev->aq.cq.lock, flags); +} + +/* + * efa_handle_specific_aenq_event: + * return the handler that is relevant to the specific event group + */ +static efa_aenq_handler efa_com_get_specific_aenq_cb(struct efa_com_dev *edev, + u16 group) +{ + struct efa_aenq_handlers *aenq_handlers = edev->aenq.aenq_handlers; + + if (group < EFA_MAX_HANDLERS && aenq_handlers->handlers[group]) + return aenq_handlers->handlers[group]; + + return aenq_handlers->unimplemented_handler; +} + +/** + * efa_com_aenq_intr_handler - AENQ interrupt handler + * @edev: EFA communication layer struct + * @data: Data of interrupt handler. + * + * Go over the async event notification queue and call the proper aenq handler. 
+ */ +void efa_com_aenq_intr_handler(struct efa_com_dev *edev, void *data) +{ + struct efa_admin_aenq_common_desc *aenq_common; + struct efa_com_aenq *aenq = &edev->aenq; + struct efa_admin_aenq_entry *aenq_e; + efa_aenq_handler handler_cb; + u32 processed = 0; + u8 phase; + u32 ci; + + ci = aenq->cc & (aenq->depth - 1); + phase = aenq->phase; + aenq_e = &aenq->entries[ci]; /* Get first entry */ + aenq_common = &aenq_e->aenq_common_desc; + + /* Go over all the events */ + while ((READ_ONCE(aenq_common->flags) & + EFA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase) { + /* + * Do not read the rest of the completion entry before the + * phase bit was validated + */ + dma_rmb(); + + /* Handle specific event*/ + handler_cb = efa_com_get_specific_aenq_cb(edev, + aenq_common->group); + handler_cb(data, aenq_e); /* call the actual event handler*/ + + /* Get next event entry */ + ci++; + processed++; + + if (ci == aenq->depth) { + ci = 0; + phase = !phase; + } + aenq_e = &aenq->entries[ci]; + aenq_common = &aenq_e->aenq_common_desc; + } + + aenq->cc += processed; + aenq->phase = phase; + + /* Don't update aenq doorbell if there weren't any processed events */ + if (!processed) + return; + + /* barrier not needed in case of writel */ + writel(aenq->cc, edev->reg_bar + EFA_REGS_AENQ_CONS_DB_OFF); +} + +static void efa_com_mmio_reg_read_resp_addr_init(struct efa_com_dev *edev) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + u32 addr_high; + u32 addr_low; + + /* dma_addr_bits is unknown at this point */ + addr_high = (mmio_read->read_resp_dma_addr >> 32) & GENMASK(31, 0); + addr_low = mmio_read->read_resp_dma_addr & GENMASK(31, 0); + + writel(addr_high, edev->reg_bar + EFA_REGS_MMIO_RESP_HI_OFF); + writel(addr_low, edev->reg_bar + EFA_REGS_MMIO_RESP_LO_OFF); +} + +int efa_com_mmio_reg_read_init(struct efa_com_dev *edev) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + + spin_lock_init(&mmio_read->lock); + mmio_read->read_resp = + dma_alloc_coherent(edev->dmadev, sizeof(*mmio_read->read_resp), + &mmio_read->read_resp_dma_addr, GFP_KERNEL); + if (!mmio_read->read_resp) + return -ENOMEM; + + efa_com_mmio_reg_read_resp_addr_init(edev); + + mmio_read->read_resp->req_id = 0; + mmio_read->seq_num = 0; + mmio_read->mmio_read_timeout = EFA_REG_READ_TIMEOUT_US; + + return 0; +} + +void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + + dma_free_coherent(edev->dmadev, sizeof(*mmio_read->read_resp), + mmio_read->read_resp, mmio_read->read_resp_dma_addr); +} + +int efa_com_validate_version(struct efa_com_dev *edev) +{ + u32 min_ctrl_ver = 0; + u32 ctrl_ver_masked; + u32 min_ver = 0; + u32 ctrl_ver; + u32 ver; + + /* + * Make sure the EFA version and the controller version are at least + * as the driver expects + */ + ver = efa_com_reg_read32(edev, EFA_REGS_VERSION_OFF); + ctrl_ver = efa_com_reg_read32(edev, + EFA_REGS_CONTROLLER_VERSION_OFF); + + ibdev_dbg(edev->efa_dev, "efa device version: %d.%d\n", + EFA_GET(&ver, EFA_REGS_VERSION_MAJOR_VERSION), + EFA_GET(&ver, EFA_REGS_VERSION_MINOR_VERSION)); + + EFA_SET(&min_ver, EFA_REGS_VERSION_MAJOR_VERSION, + EFA_ADMIN_API_VERSION_MAJOR); + EFA_SET(&min_ver, EFA_REGS_VERSION_MINOR_VERSION, + EFA_ADMIN_API_VERSION_MINOR); + if (ver < min_ver) { + ibdev_err( + edev->efa_dev, + "EFA version is lower than the minimal version the driver supports\n"); + return -EOPNOTSUPP; + } + + ibdev_dbg( + edev->efa_dev, + "efa controller version: %d.%d.%d implementation version %d\n", + 
EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION), + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION), + EFA_GET(&ctrl_ver, + EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION), + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_IMPL_ID)); + + ctrl_ver_masked = + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION) | + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION) | + EFA_GET(&ctrl_ver, + EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION); + + EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION, + EFA_CTRL_MAJOR); + EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION, + EFA_CTRL_MINOR); + EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION, + EFA_CTRL_SUB_MINOR); + /* Validate the ctrl version without the implementation ID */ + if (ctrl_ver_masked < min_ctrl_ver) { + ibdev_err( + edev->efa_dev, + "EFA ctrl version is lower than the minimal ctrl version the driver supports\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +/** + * efa_com_get_dma_width - Retrieve physical dma address width the device + * supports. + * @edev: EFA communication layer struct + * + * Retrieve the maximum physical address bits the device can handle. + * + * @return: > 0 on Success and negative value otherwise. + */ +int efa_com_get_dma_width(struct efa_com_dev *edev) +{ + u32 caps = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); + int width; + + width = EFA_GET(&caps, EFA_REGS_CAPS_DMA_ADDR_WIDTH); + + ibdev_dbg(edev->efa_dev, "DMA width: %d\n", width); + + if (width < 32 || width > 64) { + ibdev_err(edev->efa_dev, "DMA width illegal value: %d\n", + width); + return -EINVAL; + } + + edev->dma_addr_bits = width; + + return width; +} + +static int wait_for_reset_state(struct efa_com_dev *edev, u32 timeout, int on) +{ + u32 val, i; + + for (i = 0; i < timeout; i++) { + val = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); + + if (EFA_GET(&val, EFA_REGS_DEV_STS_RESET_IN_PROGRESS) == on) + return 0; + + ibdev_dbg(edev->efa_dev, "Reset indication val %d\n", val); + msleep(EFA_POLL_INTERVAL_MS); + } + + return -ETIME; +} + +/** + * efa_com_dev_reset - Perform device FLR to the device. + * @edev: EFA communication layer struct + * @reset_reason: Specify what is the trigger for the reset in case of an error. + * + * @return - 0 on success, negative value on failure. 
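+ *
+ * Illustrative call (reset reasons are the enum efa_regs_reset_reason_types
+ * values; EFA_REGS_RESET_NORMAL is assumed here for an ordinary,
+ * non-error-triggered reset):
+ *
+ *	err = efa_com_dev_reset(edev, EFA_REGS_RESET_NORMAL);
+ *	if (err)
+ *		return err;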
+ */ +int efa_com_dev_reset(struct efa_com_dev *edev, + enum efa_regs_reset_reason_types reset_reason) +{ + u32 stat, timeout, cap; + u32 reset_val = 0; + int err; + + stat = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); + cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); + + if (!EFA_GET(&stat, EFA_REGS_DEV_STS_READY)) { + ibdev_err(edev->efa_dev, + "Device isn't ready, can't reset device\n"); + return -EINVAL; + } + + timeout = EFA_GET(&cap, EFA_REGS_CAPS_RESET_TIMEOUT); + if (!timeout) { + ibdev_err(edev->efa_dev, "Invalid timeout value\n"); + return -EINVAL; + } + + /* start reset */ + EFA_SET(&reset_val, EFA_REGS_DEV_CTL_DEV_RESET, 1); + EFA_SET(&reset_val, EFA_REGS_DEV_CTL_RESET_REASON, reset_reason); + writel(reset_val, edev->reg_bar + EFA_REGS_DEV_CTL_OFF); + + /* reset clears the mmio readless address, restore it */ + efa_com_mmio_reg_read_resp_addr_init(edev); + + err = wait_for_reset_state(edev, timeout, 1); + if (err) { + ibdev_err(edev->efa_dev, "Reset indication didn't turn on\n"); + return err; + } + + /* reset done */ + writel(0, edev->reg_bar + EFA_REGS_DEV_CTL_OFF); + err = wait_for_reset_state(edev, timeout, 0); + if (err) { + ibdev_err(edev->efa_dev, "Reset indication didn't turn off\n"); + return err; + } + + timeout = EFA_GET(&cap, EFA_REGS_CAPS_ADMIN_CMD_TO); + if (timeout) + /* the resolution of timeout reg is 100ms */ + edev->aq.completion_timeout = timeout * 100000; + else + edev->aq.completion_timeout = ADMIN_CMD_TIMEOUT_US; + + return 0; +} + +static int efa_com_create_eq(struct efa_com_dev *edev, + struct efa_com_create_eq_params *params, + struct efa_com_create_eq_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_create_eq_resp resp = {}; + struct efa_admin_create_eq_cmd cmd = {}; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_CREATE_EQ; + EFA_SET(&cmd.caps, EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS, + params->entry_size_in_bytes / 4); + cmd.depth = params->depth; + cmd.event_bitmask = params->event_bitmask; + cmd.msix_vec = params->msix_vec; + + efa_com_set_dma_addr(params->dma_addr, &cmd.ba.mem_addr_high, + &cmd.ba.mem_addr_low); + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create eq[%d]\n", err); + return err; + } + + result->eqn = resp.eqn; + + return 0; +} + +static void efa_com_destroy_eq(struct efa_com_dev *edev, + struct efa_com_destroy_eq_params *params) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_destroy_eq_resp resp = {}; + struct efa_admin_destroy_eq_cmd cmd = {}; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_DESTROY_EQ; + cmd.eqn = params->eqn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy EQ-%u [%d]\n", cmd.eqn, + err); +} + +static void efa_com_arm_eq(struct efa_com_dev *edev, struct efa_com_eq *eeq) +{ + u32 val = 0; + + EFA_SET(&val, EFA_REGS_EQ_DB_EQN, eeq->eqn); + EFA_SET(&val, EFA_REGS_EQ_DB_ARM, 1); + + writel(val, edev->reg_bar + EFA_REGS_EQ_DB_OFF); +} + +void efa_com_eq_comp_intr_handler(struct efa_com_dev *edev, + struct efa_com_eq *eeq) +{ + struct efa_admin_eqe *eqe; + u32 processed = 0; + u8 phase; + u32 ci; + + ci = eeq->cc & (eeq->depth - 1); + phase = eeq->phase; + eqe = &eeq->eqes[ci]; + + /* Go over all the events */ + while 
((READ_ONCE(eqe->common) & EFA_ADMIN_EQE_PHASE_MASK) == phase) { + /* + * Do not read the rest of the completion entry before the + * phase bit was validated + */ + dma_rmb(); + + eeq->cb(eeq, eqe); + + /* Get next event entry */ + ci++; + processed++; + + if (ci == eeq->depth) { + ci = 0; + phase = !phase; + } + + eqe = &eeq->eqes[ci]; + } + + eeq->cc += processed; + eeq->phase = phase; + efa_com_arm_eq(eeq->edev, eeq); +} + +void efa_com_eq_destroy(struct efa_com_dev *edev, struct efa_com_eq *eeq) +{ + struct efa_com_destroy_eq_params params = { + .eqn = eeq->eqn, + }; + + efa_com_destroy_eq(edev, &params); + dma_free_coherent(edev->dmadev, eeq->depth * sizeof(*eeq->eqes), + eeq->eqes, eeq->dma_addr); +} + +int efa_com_eq_init(struct efa_com_dev *edev, struct efa_com_eq *eeq, + efa_eqe_handler cb, u16 depth, u8 msix_vec) +{ + struct efa_com_create_eq_params params = {}; + struct efa_com_create_eq_result result = {}; + int err; + + params.depth = depth; + params.entry_size_in_bytes = sizeof(*eeq->eqes); + EFA_SET(&params.event_bitmask, + EFA_ADMIN_CREATE_EQ_CMD_COMPLETION_EVENTS, 1); + params.msix_vec = msix_vec; + + eeq->eqes = dma_alloc_coherent(edev->dmadev, + params.depth * sizeof(*eeq->eqes), + &params.dma_addr, GFP_KERNEL); + if (!eeq->eqes) + return -ENOMEM; + + err = efa_com_create_eq(edev, &params, &result); + if (err) + goto err_free_coherent; + + eeq->eqn = result.eqn; + eeq->edev = edev; + eeq->dma_addr = params.dma_addr; + eeq->phase = 1; + eeq->depth = params.depth; + eeq->cb = cb; + efa_com_arm_eq(edev, eeq); + + return 0; + +err_free_coherent: + dma_free_coherent(edev->dmadev, params.depth * sizeof(*eeq->eqes), + eeq->eqes, params.dma_addr); + return err; +} diff --git a/drivers/amazon/net/efa/efa_com.h b/drivers/amazon/net/efa/efa_com.h new file mode 100644 index 0000000000000..bced7c3981792 --- /dev/null +++ b/drivers/amazon/net/efa/efa_com.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */ + +#ifndef _EFA_COM_H_ +#define _EFA_COM_H_ + +#include +#include +#include +#include +#include + +#include +#include "kcompat.h" + +#include "efa_common_defs.h" +#include "efa_admin_defs.h" +#include "efa_admin_cmds_defs.h" +#include "efa_regs_defs.h" + +#define EFA_MAX_HANDLERS 256 + +struct efa_com_admin_cq { + struct efa_admin_acq_entry *entries; + dma_addr_t dma_addr; + spinlock_t lock; /* Protects ACQ */ + + u16 cc; /* consumer counter */ + u8 phase; +}; + +struct efa_com_admin_sq { + struct efa_admin_aq_entry *entries; + dma_addr_t dma_addr; + spinlock_t lock; /* Protects ASQ */ + + u32 __iomem *db_addr; + + u16 cc; /* consumer counter */ + u16 pc; /* producer counter */ + u8 phase; + +}; + +/* Don't use anything other than atomic64 */ +struct efa_com_stats_admin { + atomic64_t submitted_cmd; + atomic64_t completed_cmd; + atomic64_t cmd_err; + atomic64_t no_completion; +}; + +enum { + EFA_AQ_STATE_RUNNING_BIT = 0, + EFA_AQ_STATE_POLLING_BIT = 1, +}; + +struct efa_com_admin_queue { + void *dmadev; + void *efa_dev; + struct efa_comp_ctx *comp_ctx; + u32 completion_timeout; /* usecs */ + u16 poll_interval; /* msecs */ + u16 depth; + struct efa_com_admin_cq cq; + struct efa_com_admin_sq sq; + u16 msix_vector_idx; + + unsigned long state; + + /* Count the number of available admin commands */ + struct semaphore avail_cmds; + + struct efa_com_stats_admin stats; + + spinlock_t comp_ctx_lock; /* Protects completion context pool */ + u32 *comp_ctx_pool; + u16 comp_ctx_pool_next; +}; + +struct efa_aenq_handlers; +struct efa_com_eq; +typedef void (*efa_eqe_handler)(struct efa_com_eq *eeq, + struct efa_admin_eqe *eqe); + +struct efa_com_aenq { + struct efa_admin_aenq_entry *entries; + struct efa_aenq_handlers *aenq_handlers; + dma_addr_t dma_addr; + u32 cc; /* consumer counter */ + u16 msix_vector_idx; + u16 depth; + u8 phase; +}; + +struct efa_com_mmio_read { + struct efa_admin_mmio_req_read_less_resp *read_resp; + dma_addr_t read_resp_dma_addr; + u16 seq_num; + u16 mmio_read_timeout; /* usecs */ + /* serializes mmio reads */ + spinlock_t lock; +}; + +struct efa_com_dev { + struct efa_com_admin_queue aq; + struct efa_com_aenq aenq; + u8 __iomem *reg_bar; + void *dmadev; + void *efa_dev; + u32 supported_features; + u32 dma_addr_bits; + + struct efa_com_mmio_read mmio_read; +}; + +struct efa_com_eq { + struct efa_com_dev *edev; + struct efa_admin_eqe *eqes; + dma_addr_t dma_addr; + u32 cc; /* Consumer counter */ + u16 eqn; + u16 depth; + u8 phase; + efa_eqe_handler cb; +}; + +struct efa_com_create_eq_params { + dma_addr_t dma_addr; + u32 event_bitmask; + u16 depth; + u8 entry_size_in_bytes; + u8 msix_vec; +}; + +struct efa_com_create_eq_result { + u16 eqn; +}; + +struct efa_com_destroy_eq_params { + u16 eqn; +}; + +typedef void (*efa_aenq_handler)(void *data, + struct efa_admin_aenq_entry *aenq_e); + +/* Holds aenq handlers. 
Indexed by AENQ event group */ +struct efa_aenq_handlers { + efa_aenq_handler handlers[EFA_MAX_HANDLERS]; + efa_aenq_handler unimplemented_handler; +}; + +void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low); +int efa_com_admin_init(struct efa_com_dev *edev, + struct efa_aenq_handlers *aenq_handlers); +void efa_com_admin_destroy(struct efa_com_dev *edev); +int efa_com_eq_init(struct efa_com_dev *edev, struct efa_com_eq *eeq, + efa_eqe_handler cb, u16 depth, u8 msix_vec); +void efa_com_eq_destroy(struct efa_com_dev *edev, struct efa_com_eq *eeq); +int efa_com_dev_reset(struct efa_com_dev *edev, + enum efa_regs_reset_reason_types reset_reason); +void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling); +void efa_com_admin_q_comp_intr_handler(struct efa_com_dev *edev); +int efa_com_mmio_reg_read_init(struct efa_com_dev *edev); +void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev); + +int efa_com_validate_version(struct efa_com_dev *edev); +int efa_com_get_dma_width(struct efa_com_dev *edev); + +int efa_com_cmd_exec(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size, + struct efa_admin_acq_entry *comp, + size_t comp_size); +void efa_com_aenq_intr_handler(struct efa_com_dev *edev, void *data); +void efa_com_eq_comp_intr_handler(struct efa_com_dev *edev, + struct efa_com_eq *eeq); + +#endif /* _EFA_COM_H_ */ diff --git a/drivers/amazon/net/efa/efa_com_cmd.c b/drivers/amazon/net/efa/efa_com_cmd.c new file mode 100644 index 0000000000000..43f79cb197d2d --- /dev/null +++ b/drivers/amazon/net/efa/efa_com_cmd.c @@ -0,0 +1,816 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "efa_com.h" +#include "efa_com_cmd.h" + +int efa_com_create_qp(struct efa_com_dev *edev, + struct efa_com_create_qp_params *params, + struct efa_com_create_qp_result *res) +{ + struct efa_admin_create_qp_cmd create_qp_cmd = {}; + struct efa_admin_create_qp_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + create_qp_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_QP; + + create_qp_cmd.pd = params->pd; + create_qp_cmd.qp_type = params->qp_type; + create_qp_cmd.rq_base_addr = params->rq_base_addr; + create_qp_cmd.send_cq_idx = params->send_cq_idx; + create_qp_cmd.recv_cq_idx = params->recv_cq_idx; + create_qp_cmd.qp_alloc_size.send_queue_ring_size = + params->sq_ring_size_in_bytes; + create_qp_cmd.qp_alloc_size.send_queue_depth = + params->sq_depth; + create_qp_cmd.qp_alloc_size.recv_queue_ring_size = + params->rq_ring_size_in_bytes; + create_qp_cmd.qp_alloc_size.recv_queue_depth = + params->rq_depth; + create_qp_cmd.uar = params->uarn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&create_qp_cmd, + sizeof(create_qp_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create qp [%d]\n", err); + return err; + } + + res->qp_handle = cmd_completion.qp_handle; + res->qp_num = cmd_completion.qp_num; + res->sq_db_offset = cmd_completion.sq_db_offset; + res->rq_db_offset = cmd_completion.rq_db_offset; + res->llq_descriptors_offset = cmd_completion.llq_descriptors_offset; + res->send_sub_cq_idx = cmd_completion.send_sub_cq_idx; + res->recv_sub_cq_idx = cmd_completion.recv_sub_cq_idx; + + return 0; +} + +int efa_com_modify_qp(struct efa_com_dev *edev, + struct efa_com_modify_qp_params *params) +{ + struct 
efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_modify_qp_cmd cmd = {}; + struct efa_admin_modify_qp_resp resp; + int err; + + cmd.aq_common_desc.opcode = EFA_ADMIN_MODIFY_QP; + cmd.modify_mask = params->modify_mask; + cmd.qp_handle = params->qp_handle; + cmd.qp_state = params->qp_state; + cmd.cur_qp_state = params->cur_qp_state; + cmd.qkey = params->qkey; + cmd.sq_psn = params->sq_psn; + cmd.sq_drained_async_notify = params->sq_drained_async_notify; + cmd.rnr_retry = params->rnr_retry; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to modify qp-%u modify_mask[%#x] [%d]\n", + cmd.qp_handle, cmd.modify_mask, err); + return err; + } + + return 0; +} + +int efa_com_query_qp(struct efa_com_dev *edev, + struct efa_com_query_qp_params *params, + struct efa_com_query_qp_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_query_qp_cmd cmd = {}; + struct efa_admin_query_qp_resp resp; + int err; + + cmd.aq_common_desc.opcode = EFA_ADMIN_QUERY_QP; + cmd.qp_handle = params->qp_handle; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to query qp-%u [%d]\n", + cmd.qp_handle, err); + return err; + } + + result->qp_state = resp.qp_state; + result->qkey = resp.qkey; + result->sq_draining = resp.sq_draining; + result->sq_psn = resp.sq_psn; + result->rnr_retry = resp.rnr_retry; + + return 0; +} + +int efa_com_destroy_qp(struct efa_com_dev *edev, + struct efa_com_destroy_qp_params *params) +{ + struct efa_admin_destroy_qp_resp cmd_completion; + struct efa_admin_destroy_qp_cmd qp_cmd = {}; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + qp_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_QP; + qp_cmd.qp_handle = params->qp_handle; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&qp_cmd, + sizeof(qp_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy qp-%u [%d]\n", + qp_cmd.qp_handle, err); + return err; + } + + return 0; +} + +int efa_com_create_cq(struct efa_com_dev *edev, + struct efa_com_create_cq_params *params, + struct efa_com_create_cq_result *result) +{ + struct efa_admin_create_cq_resp cmd_completion = {}; + struct efa_admin_create_cq_cmd create_cmd = {}; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + create_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_CQ; + EFA_SET(&create_cmd.cq_caps_2, + EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS, + params->entry_size_in_bytes / 4); + create_cmd.cq_depth = params->cq_depth; + create_cmd.num_sub_cqs = params->num_sub_cqs; + create_cmd.uar = params->uarn; + if (params->interrupt_mode_enabled) { + EFA_SET(&create_cmd.cq_caps_1, + EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED, 1); + create_cmd.eqn = params->eqn; + } + if (params->set_src_addr) { + EFA_SET(&create_cmd.cq_caps_2, + EFA_ADMIN_CREATE_CQ_CMD_SET_SRC_ADDR, 1); + } + efa_com_set_dma_addr(params->dma_addr, + &create_cmd.cq_ba.mem_addr_high, + &create_cmd.cq_ba.mem_addr_low); + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&create_cmd, + sizeof(create_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create cq[%d]\n", 
err); + return err; + } + + result->cq_idx = cmd_completion.cq_idx; + result->actual_depth = params->cq_depth; + result->db_off = cmd_completion.db_offset; + result->db_valid = EFA_GET(&cmd_completion.flags, + EFA_ADMIN_CREATE_CQ_RESP_DB_VALID); + + return 0; +} + +int efa_com_destroy_cq(struct efa_com_dev *edev, + struct efa_com_destroy_cq_params *params) +{ + struct efa_admin_destroy_cq_cmd destroy_cmd = {}; + struct efa_admin_destroy_cq_resp destroy_resp; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + destroy_cmd.cq_idx = params->cq_idx; + destroy_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_CQ; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&destroy_cmd, + sizeof(destroy_cmd), + (struct efa_admin_acq_entry *)&destroy_resp, + sizeof(destroy_resp)); + + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy CQ-%u [%d]\n", + params->cq_idx, err); + return err; + } + + return 0; +} + +int efa_com_register_mr(struct efa_com_dev *edev, + struct efa_com_reg_mr_params *params, + struct efa_com_reg_mr_result *result) +{ + struct efa_admin_reg_mr_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_reg_mr_cmd mr_cmd = {}; + int err; + + mr_cmd.aq_common_desc.opcode = EFA_ADMIN_REG_MR; + mr_cmd.pd = params->pd; + mr_cmd.mr_length = params->mr_length_in_bytes; + EFA_SET(&mr_cmd.flags, EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT, + params->page_shift); + mr_cmd.iova = params->iova; + mr_cmd.permissions = params->permissions; + + if (params->inline_pbl) { + memcpy(mr_cmd.pbl.inline_pbl_array, + params->pbl.inline_pbl_array, + sizeof(mr_cmd.pbl.inline_pbl_array)); + } else { + mr_cmd.pbl.pbl.length = params->pbl.pbl.length; + mr_cmd.pbl.pbl.address.mem_addr_low = + params->pbl.pbl.address.mem_addr_low; + mr_cmd.pbl.pbl.address.mem_addr_high = + params->pbl.pbl.address.mem_addr_high; + EFA_SET(&mr_cmd.aq_common_desc.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1); + if (params->indirect) + EFA_SET(&mr_cmd.aq_common_desc.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT, 1); + } + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&mr_cmd, + sizeof(mr_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to register mr [%d]\n", err); + return err; + } + + result->l_key = cmd_completion.l_key; + result->r_key = cmd_completion.r_key; + result->ic_info.recv_ic_id = cmd_completion.recv_ic_id; + result->ic_info.rdma_read_ic_id = cmd_completion.rdma_read_ic_id; + result->ic_info.rdma_recv_ic_id = cmd_completion.rdma_recv_ic_id; + result->ic_info.recv_ic_id_valid = EFA_GET(&cmd_completion.validity, + EFA_ADMIN_REG_MR_RESP_RECV_IC_ID); + result->ic_info.rdma_read_ic_id_valid = EFA_GET(&cmd_completion.validity, + EFA_ADMIN_REG_MR_RESP_RDMA_READ_IC_ID); + result->ic_info.rdma_recv_ic_id_valid = EFA_GET(&cmd_completion.validity, + EFA_ADMIN_REG_MR_RESP_RDMA_RECV_IC_ID); + + return 0; +} + +int efa_com_dereg_mr(struct efa_com_dev *edev, + struct efa_com_dereg_mr_params *params) +{ + struct efa_admin_dereg_mr_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_dereg_mr_cmd mr_cmd = {}; + int err; + + mr_cmd.aq_common_desc.opcode = EFA_ADMIN_DEREG_MR; + mr_cmd.l_key = params->l_key; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&mr_cmd, + sizeof(mr_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + 
"Failed to de-register mr(lkey-%u) [%d]\n", + mr_cmd.l_key, err); + return err; + } + + return 0; +} + +int efa_com_create_ah(struct efa_com_dev *edev, + struct efa_com_create_ah_params *params, + struct efa_com_create_ah_result *result) +{ + struct efa_admin_create_ah_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_create_ah_cmd ah_cmd = {}; + int err; + + ah_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_AH; + + memcpy(ah_cmd.dest_addr, params->dest_addr, sizeof(ah_cmd.dest_addr)); + ah_cmd.pd = params->pdn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&ah_cmd, + sizeof(ah_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create ah for %pI6 [%d]\n", + ah_cmd.dest_addr, err); + return err; + } + + result->ah = cmd_completion.ah; + + return 0; +} + +int efa_com_destroy_ah(struct efa_com_dev *edev, + struct efa_com_destroy_ah_params *params) +{ + struct efa_admin_destroy_ah_resp cmd_completion; + struct efa_admin_destroy_ah_cmd ah_cmd = {}; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + ah_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_AH; + ah_cmd.ah = params->ah; + ah_cmd.pd = params->pdn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&ah_cmd, + sizeof(ah_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy ah-%d pd-%d [%d]\n", + ah_cmd.ah, ah_cmd.pd, err); + return err; + } + + return 0; +} + +bool +efa_com_check_supported_feature_id(struct efa_com_dev *edev, + enum efa_admin_aq_feature_id feature_id) +{ + u32 feature_mask = 1 << feature_id; + + /* Device attributes is always supported */ + if (feature_id != EFA_ADMIN_DEVICE_ATTR && + !(edev->supported_features & feature_mask)) + return false; + + return true; +} + +static int efa_com_get_feature_ex(struct efa_com_dev *edev, + struct efa_admin_get_feature_resp *get_resp, + enum efa_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size) +{ + struct efa_admin_get_feature_cmd get_cmd = {}; + struct efa_com_admin_queue *aq; + int err; + + if (!efa_com_check_supported_feature_id(edev, feature_id)) { + ibdev_err_ratelimited(edev->efa_dev, + "Feature %d isn't supported\n", + feature_id); + return -EOPNOTSUPP; + } + + aq = &edev->aq; + + get_cmd.aq_common_descriptor.opcode = EFA_ADMIN_GET_FEATURE; + + if (control_buff_size) + EFA_SET(&get_cmd.aq_common_descriptor.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1); + + efa_com_set_dma_addr(control_buf_dma_addr, + &get_cmd.control_buffer.address.mem_addr_high, + &get_cmd.control_buffer.address.mem_addr_low); + + get_cmd.control_buffer.length = control_buff_size; + get_cmd.feature_common.feature_id = feature_id; + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *) + &get_cmd, + sizeof(get_cmd), + (struct efa_admin_acq_entry *) + get_resp, + sizeof(*get_resp)); + + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to submit get_feature command %d [%d]\n", + feature_id, err); + return err; + } + + return 0; +} + +static int efa_com_get_feature(struct efa_com_dev *edev, + struct efa_admin_get_feature_resp *get_resp, + enum efa_admin_aq_feature_id feature_id) +{ + return efa_com_get_feature_ex(edev, get_resp, feature_id, 0, 0); +} + +int efa_com_get_device_attr(struct efa_com_dev *edev, + struct efa_com_get_device_attr_result *result) +{ + struct efa_admin_get_feature_resp 
resp; + int err; + + err = efa_com_get_feature(edev, &resp, EFA_ADMIN_DEVICE_ATTR); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get device attributes %d\n", + err); + return err; + } + + result->page_size_cap = resp.u.device_attr.page_size_cap; + result->fw_version = resp.u.device_attr.fw_version; + result->admin_api_version = resp.u.device_attr.admin_api_version; + result->device_version = resp.u.device_attr.device_version; + result->supported_features = resp.u.device_attr.supported_features; + result->phys_addr_width = resp.u.device_attr.phys_addr_width; + result->virt_addr_width = resp.u.device_attr.virt_addr_width; + result->db_bar = resp.u.device_attr.db_bar; + result->max_rdma_size = resp.u.device_attr.max_rdma_size; + result->device_caps = resp.u.device_attr.device_caps; + + if (result->admin_api_version < 1) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to get device attr api version [%u < 1]\n", + result->admin_api_version); + return -EINVAL; + } + + edev->supported_features = resp.u.device_attr.supported_features; + err = efa_com_get_feature(edev, &resp, + EFA_ADMIN_QUEUE_ATTR); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get queue attributes %d\n", + err); + return err; + } + + result->max_qp = resp.u.queue_attr.max_qp; + result->max_sq_depth = resp.u.queue_attr.max_sq_depth; + result->max_rq_depth = resp.u.queue_attr.max_rq_depth; + result->max_cq = resp.u.queue_attr.max_cq; + result->max_cq_depth = resp.u.queue_attr.max_cq_depth; + result->inline_buf_size = resp.u.queue_attr.inline_buf_size; + result->max_sq_sge = resp.u.queue_attr.max_wr_send_sges; + result->max_rq_sge = resp.u.queue_attr.max_wr_recv_sges; + result->max_mr = resp.u.queue_attr.max_mr; + result->max_mr_pages = resp.u.queue_attr.max_mr_pages; + result->max_pd = resp.u.queue_attr.max_pd; + result->max_ah = resp.u.queue_attr.max_ah; + result->max_llq_size = resp.u.queue_attr.max_llq_size; + result->sub_cqs_per_cq = resp.u.queue_attr.sub_cqs_per_cq; + result->max_wr_rdma_sge = resp.u.queue_attr.max_wr_rdma_sges; + result->max_tx_batch = resp.u.queue_attr.max_tx_batch; + result->min_sq_depth = resp.u.queue_attr.min_sq_depth; + + err = efa_com_get_feature(edev, &resp, EFA_ADMIN_NETWORK_ATTR); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get network attributes %d\n", + err); + return err; + } + + memcpy(result->addr, resp.u.network_attr.addr, + sizeof(resp.u.network_attr.addr)); + result->mtu = resp.u.network_attr.mtu; + + if (efa_com_check_supported_feature_id(edev, + EFA_ADMIN_EVENT_QUEUE_ATTR)) { + err = efa_com_get_feature(edev, &resp, + EFA_ADMIN_EVENT_QUEUE_ATTR); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to get event queue attributes %d\n", + err); + return err; + } + + result->max_eq = resp.u.event_queue_attr.max_eq; + result->max_eq_depth = resp.u.event_queue_attr.max_eq_depth; + result->event_bitmask = resp.u.event_queue_attr.event_bitmask; + } + + return 0; +} + +int efa_com_get_hw_hints(struct efa_com_dev *edev, + struct efa_com_get_hw_hints_result *result) +{ + struct efa_admin_get_feature_resp resp; + int err; + + err = efa_com_get_feature(edev, &resp, EFA_ADMIN_HW_HINTS); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get hw hints %d\n", err); + return err; + } + + result->admin_completion_timeout = resp.u.hw_hints.admin_completion_timeout; + result->driver_watchdog_timeout = resp.u.hw_hints.driver_watchdog_timeout; + result->mmio_read_timeout = resp.u.hw_hints.mmio_read_timeout; + 
result->poll_interval = resp.u.hw_hints.poll_interval; + + return 0; +} + +int efa_com_set_feature_ex(struct efa_com_dev *edev, + struct efa_admin_set_feature_resp *set_resp, + struct efa_admin_set_feature_cmd *set_cmd, + enum efa_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size) +{ + struct efa_com_admin_queue *aq; + int err; + + if (!efa_com_check_supported_feature_id(edev, feature_id)) { + ibdev_err_ratelimited(edev->efa_dev, + "Feature %d isn't supported\n", + feature_id); + return -EOPNOTSUPP; + } + + aq = &edev->aq; + + set_cmd->aq_common_descriptor.opcode = EFA_ADMIN_SET_FEATURE; + if (control_buff_size) { + set_cmd->aq_common_descriptor.flags = 0; + EFA_SET(&set_cmd->aq_common_descriptor.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1); + efa_com_set_dma_addr(control_buf_dma_addr, + &set_cmd->control_buffer.address.mem_addr_high, + &set_cmd->control_buffer.address.mem_addr_low); + } + + set_cmd->control_buffer.length = control_buff_size; + set_cmd->feature_common.feature_id = feature_id; + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)set_cmd, + sizeof(*set_cmd), + (struct efa_admin_acq_entry *)set_resp, + sizeof(*set_resp)); + + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to submit set_feature command %d error: %d\n", + feature_id, err); + return err; + } + + return 0; +} + +static int efa_com_set_feature(struct efa_com_dev *edev, + struct efa_admin_set_feature_resp *set_resp, + struct efa_admin_set_feature_cmd *set_cmd, + enum efa_admin_aq_feature_id feature_id) +{ + return efa_com_set_feature_ex(edev, set_resp, set_cmd, feature_id, + 0, 0); +} + +int efa_com_set_aenq_config(struct efa_com_dev *edev, u32 groups) +{ + struct efa_admin_get_feature_resp get_resp; + struct efa_admin_set_feature_resp set_resp; + struct efa_admin_set_feature_cmd cmd = {}; + int err; + + ibdev_dbg(edev->efa_dev, "Configuring aenq with groups[%#x]\n", groups); + + err = efa_com_get_feature(edev, &get_resp, EFA_ADMIN_AENQ_CONFIG); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get aenq attributes: %d\n", + err); + return err; + } + + ibdev_dbg(edev->efa_dev, + "Get aenq groups: supported[%#x] enabled[%#x]\n", + get_resp.u.aenq.supported_groups, + get_resp.u.aenq.enabled_groups); + + if ((get_resp.u.aenq.supported_groups & groups) != groups) { + ibdev_err_ratelimited( + edev->efa_dev, + "Trying to set unsupported aenq groups[%#x] supported[%#x]\n", + groups, get_resp.u.aenq.supported_groups); + return -EOPNOTSUPP; + } + + cmd.u.aenq.enabled_groups = groups; + err = efa_com_set_feature(edev, &set_resp, &cmd, + EFA_ADMIN_AENQ_CONFIG); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to set aenq attributes: %d\n", + err); + return err; + } + + return 0; +} + +int efa_com_alloc_pd(struct efa_com_dev *edev, + struct efa_com_alloc_pd_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_alloc_pd_cmd cmd = {}; + struct efa_admin_alloc_pd_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_ALLOC_PD; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to allocate pd[%d]\n", err); + return err; + } + + result->pdn = resp.pd; + + return 0; +} + +int efa_com_dealloc_pd(struct efa_com_dev *edev, + struct efa_com_dealloc_pd_params *params) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_dealloc_pd_cmd cmd 
= {}; + struct efa_admin_dealloc_pd_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_DEALLOC_PD; + cmd.pd = params->pdn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to deallocate pd-%u [%d]\n", + cmd.pd, err); + return err; + } + + return 0; +} + +int efa_com_alloc_uar(struct efa_com_dev *edev, + struct efa_com_alloc_uar_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_alloc_uar_cmd cmd = {}; + struct efa_admin_alloc_uar_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_ALLOC_UAR; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to allocate uar[%d]\n", err); + return err; + } + + result->uarn = resp.uar; + + return 0; +} + +int efa_com_dealloc_uar(struct efa_com_dev *edev, + struct efa_com_dealloc_uar_params *params) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_dealloc_uar_cmd cmd = {}; + struct efa_admin_dealloc_uar_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_DEALLOC_UAR; + cmd.uar = params->uarn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to deallocate uar-%u [%d]\n", + cmd.uar, err); + return err; + } + + return 0; +} + +int efa_com_get_stats(struct efa_com_dev *edev, + struct efa_com_get_stats_params *params, + union efa_com_get_stats_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_aq_get_stats_cmd cmd = {}; + struct efa_admin_acq_get_stats_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_GET_STATS; + cmd.type = params->type; + cmd.scope = params->scope; + cmd.scope_modifier = params->scope_modifier; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to get stats type-%u scope-%u.%u [%d]\n", + cmd.type, cmd.scope, cmd.scope_modifier, err); + return err; + } + + switch (cmd.type) { + case EFA_ADMIN_GET_STATS_TYPE_BASIC: + result->basic_stats.tx_bytes = resp.u.basic_stats.tx_bytes; + result->basic_stats.tx_pkts = resp.u.basic_stats.tx_pkts; + result->basic_stats.rx_bytes = resp.u.basic_stats.rx_bytes; + result->basic_stats.rx_pkts = resp.u.basic_stats.rx_pkts; + result->basic_stats.rx_drops = resp.u.basic_stats.rx_drops; + break; + case EFA_ADMIN_GET_STATS_TYPE_MESSAGES: + result->messages_stats.send_bytes = resp.u.messages_stats.send_bytes; + result->messages_stats.send_wrs = resp.u.messages_stats.send_wrs; + result->messages_stats.recv_bytes = resp.u.messages_stats.recv_bytes; + result->messages_stats.recv_wrs = resp.u.messages_stats.recv_wrs; + break; + case EFA_ADMIN_GET_STATS_TYPE_RDMA_READ: + result->rdma_read_stats.read_wrs = resp.u.rdma_read_stats.read_wrs; + result->rdma_read_stats.read_bytes = resp.u.rdma_read_stats.read_bytes; + result->rdma_read_stats.read_wr_err = resp.u.rdma_read_stats.read_wr_err; + result->rdma_read_stats.read_resp_bytes = resp.u.rdma_read_stats.read_resp_bytes; + break; + case EFA_ADMIN_GET_STATS_TYPE_RDMA_WRITE: + result->rdma_write_stats.write_wrs = 
resp.u.rdma_write_stats.write_wrs; + result->rdma_write_stats.write_bytes = resp.u.rdma_write_stats.write_bytes; + result->rdma_write_stats.write_wr_err = resp.u.rdma_write_stats.write_wr_err; + result->rdma_write_stats.write_recv_bytes = resp.u.rdma_write_stats.write_recv_bytes; + break; + } + + return 0; +} diff --git a/drivers/amazon/net/efa/efa_com_cmd.h b/drivers/amazon/net/efa/efa_com_cmd.h new file mode 100644 index 0000000000000..720a99ba0f7d1 --- /dev/null +++ b/drivers/amazon/net/efa/efa_com_cmd.h @@ -0,0 +1,340 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_COM_CMD_H_ +#define _EFA_COM_CMD_H_ + +#include "efa_com.h" + +#define EFA_GID_SIZE 16 + +struct efa_com_create_qp_params { + u64 rq_base_addr; + u32 send_cq_idx; + u32 recv_cq_idx; + /* + * Send descriptor ring size in bytes, + * sufficient for user-provided number of WQEs and SGL size + */ + u32 sq_ring_size_in_bytes; + /* Max number of WQEs that will be posted on send queue */ + u32 sq_depth; + /* Recv descriptor ring size in bytes */ + u32 rq_ring_size_in_bytes; + u32 rq_depth; + u16 pd; + u16 uarn; + u8 qp_type; +}; + +struct efa_com_create_qp_result { + u32 qp_handle; + u32 qp_num; + u32 sq_db_offset; + u32 rq_db_offset; + u32 llq_descriptors_offset; + u16 send_sub_cq_idx; + u16 recv_sub_cq_idx; +}; + +struct efa_com_modify_qp_params { + u32 modify_mask; + u32 qp_handle; + u32 qp_state; + u32 cur_qp_state; + u32 qkey; + u32 sq_psn; + u8 sq_drained_async_notify; + u8 rnr_retry; +}; + +struct efa_com_query_qp_params { + u32 qp_handle; +}; + +struct efa_com_query_qp_result { + u32 qp_state; + u32 qkey; + u32 sq_draining; + u32 sq_psn; + u8 rnr_retry; +}; + +struct efa_com_destroy_qp_params { + u32 qp_handle; +}; + +struct efa_com_create_cq_params { + /* cq physical base address in OS memory */ + dma_addr_t dma_addr; + /* completion queue depth in # of entries */ + u16 cq_depth; + u16 num_sub_cqs; + u16 uarn; + u16 eqn; + u8 entry_size_in_bytes; + u8 interrupt_mode_enabled : 1; + u8 set_src_addr : 1; +}; + +struct efa_com_create_cq_result { + /* cq identifier */ + u16 cq_idx; + /* actual cq depth in # of entries */ + u16 actual_depth; + u32 db_off; + bool db_valid; +}; + +struct efa_com_destroy_cq_params { + u16 cq_idx; +}; + +struct efa_com_create_ah_params { + u16 pdn; + /* Destination address in network byte order */ + u8 dest_addr[EFA_GID_SIZE]; +}; + +struct efa_com_create_ah_result { + u16 ah; +}; + +struct efa_com_destroy_ah_params { + u16 ah; + u16 pdn; +}; + +struct efa_com_get_device_attr_result { + u8 addr[EFA_GID_SIZE]; + u64 page_size_cap; + u64 max_mr_pages; + u32 mtu; + u32 fw_version; + u32 admin_api_version; + u32 device_version; + u32 supported_features; + u32 phys_addr_width; + u32 virt_addr_width; + u32 max_qp; + u32 max_sq_depth; /* wqes */ + u32 max_rq_depth; /* wqes */ + u32 max_cq; + u32 max_cq_depth; /* cqes */ + u32 inline_buf_size; + u32 max_mr; + u32 max_pd; + u32 max_ah; + u32 max_llq_size; + u32 max_rdma_size; + u32 device_caps; + u32 max_eq; + u32 max_eq_depth; + u32 event_bitmask; /* EQ events bitmask */ + u16 sub_cqs_per_cq; + u16 max_sq_sge; + u16 max_rq_sge; + u16 max_wr_rdma_sge; + u16 max_tx_batch; + u16 min_sq_depth; + u8 db_bar; +}; + +struct efa_com_get_hw_hints_result { + u16 mmio_read_timeout; + u16 driver_watchdog_timeout; + u16 admin_completion_timeout; + u16 poll_interval; + u32 reserved[4]; +}; + +struct efa_com_mem_addr { + u32 mem_addr_low; + u32 mem_addr_high; 
+}; + +/* Used at indirect mode page list chunks for chaining */ +struct efa_com_ctrl_buff_info { + /* indicates length of the buffer pointed by control_buffer_address. */ + u32 length; + /* points to control buffer (direct or indirect) */ + struct efa_com_mem_addr address; +}; + +struct efa_com_reg_mr_params { + /* Memory region length, in bytes. */ + u64 mr_length_in_bytes; + /* IO Virtual Address associated with this MR. */ + u64 iova; + /* words 8:15: Physical Buffer List, each element is page-aligned. */ + union { + /* + * Inline array of physical addresses of app pages + * (optimization for short region reservations) + */ + u64 inline_pbl_array[4]; + /* + * Describes the next physically contiguous chunk of indirect + * page list. A page list contains physical addresses of command + * data pages. Data pages are 4KB; page list chunks are + * variable-sized. + */ + struct efa_com_ctrl_buff_info pbl; + } pbl; + /* number of pages in PBL (redundant, could be calculated) */ + u32 page_num; + /* Protection Domain */ + u16 pd; + /* + * phys_page_size_shift - page size is (1 << phys_page_size_shift) + * Page size is used for building the Virtual to Physical + * address mapping + */ + u8 page_shift; + /* see permissions field of struct efa_admin_reg_mr_cmd */ + u8 permissions; + u8 inline_pbl; + u8 indirect; +}; + +struct efa_com_mr_interconnect_info { + u16 recv_ic_id; + u16 rdma_read_ic_id; + u16 rdma_recv_ic_id; + u8 recv_ic_id_valid : 1; + u8 rdma_read_ic_id_valid : 1; + u8 rdma_recv_ic_id_valid : 1; +}; + +struct efa_com_reg_mr_result { + /* + * To be used in conjunction with local buffers references in SQ and + * RQ WQE + */ + u32 l_key; + /* + * To be used in incoming RDMA semantics messages to refer to remotely + * accessed memory region + */ + u32 r_key; + struct efa_com_mr_interconnect_info ic_info; +}; + +struct efa_com_dereg_mr_params { + u32 l_key; +}; + +struct efa_com_alloc_pd_result { + u16 pdn; +}; + +struct efa_com_dealloc_pd_params { + u16 pdn; +}; + +struct efa_com_alloc_uar_result { + u16 uarn; +}; + +struct efa_com_dealloc_uar_params { + u16 uarn; +}; + +struct efa_com_get_stats_params { + /* see enum efa_admin_get_stats_type */ + u8 type; + /* see enum efa_admin_get_stats_scope */ + u8 scope; + u16 scope_modifier; +}; + +struct efa_com_basic_stats { + u64 tx_bytes; + u64 tx_pkts; + u64 rx_bytes; + u64 rx_pkts; + u64 rx_drops; +}; + +struct efa_com_messages_stats { + u64 send_bytes; + u64 send_wrs; + u64 recv_bytes; + u64 recv_wrs; +}; + +struct efa_com_rdma_read_stats { + u64 read_wrs; + u64 read_bytes; + u64 read_wr_err; + u64 read_resp_bytes; +}; + +struct efa_com_rdma_write_stats { + u64 write_wrs; + u64 write_bytes; + u64 write_wr_err; + u64 write_recv_bytes; +}; + +union efa_com_get_stats_result { + struct efa_com_basic_stats basic_stats; + struct efa_com_messages_stats messages_stats; + struct efa_com_rdma_read_stats rdma_read_stats; + struct efa_com_rdma_write_stats rdma_write_stats; +}; + +int efa_com_create_qp(struct efa_com_dev *edev, + struct efa_com_create_qp_params *params, + struct efa_com_create_qp_result *res); +int efa_com_modify_qp(struct efa_com_dev *edev, + struct efa_com_modify_qp_params *params); +int efa_com_query_qp(struct efa_com_dev *edev, + struct efa_com_query_qp_params *params, + struct efa_com_query_qp_result *result); +int efa_com_destroy_qp(struct efa_com_dev *edev, + struct efa_com_destroy_qp_params *params); +int efa_com_create_cq(struct efa_com_dev *edev, + struct efa_com_create_cq_params *params, + struct efa_com_create_cq_result 
*result); +int efa_com_destroy_cq(struct efa_com_dev *edev, + struct efa_com_destroy_cq_params *params); +int efa_com_register_mr(struct efa_com_dev *edev, + struct efa_com_reg_mr_params *params, + struct efa_com_reg_mr_result *result); +int efa_com_dereg_mr(struct efa_com_dev *edev, + struct efa_com_dereg_mr_params *params); +int efa_com_create_ah(struct efa_com_dev *edev, + struct efa_com_create_ah_params *params, + struct efa_com_create_ah_result *result); +int efa_com_destroy_ah(struct efa_com_dev *edev, + struct efa_com_destroy_ah_params *params); +int efa_com_get_device_attr(struct efa_com_dev *edev, + struct efa_com_get_device_attr_result *result); +int efa_com_get_hw_hints(struct efa_com_dev *edev, + struct efa_com_get_hw_hints_result *result); +bool +efa_com_check_supported_feature_id(struct efa_com_dev *edev, + enum efa_admin_aq_feature_id feature_id); +int efa_com_set_feature_ex(struct efa_com_dev *edev, + struct efa_admin_set_feature_resp *set_resp, + struct efa_admin_set_feature_cmd *set_cmd, + enum efa_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size); +int efa_com_set_aenq_config(struct efa_com_dev *edev, u32 groups); +int efa_com_alloc_pd(struct efa_com_dev *edev, + struct efa_com_alloc_pd_result *result); +int efa_com_dealloc_pd(struct efa_com_dev *edev, + struct efa_com_dealloc_pd_params *params); +int efa_com_alloc_uar(struct efa_com_dev *edev, + struct efa_com_alloc_uar_result *result); +int efa_com_dealloc_uar(struct efa_com_dev *edev, + struct efa_com_dealloc_uar_params *params); +int efa_com_get_stats(struct efa_com_dev *edev, + struct efa_com_get_stats_params *params, + union efa_com_get_stats_result *result); + +#endif /* _EFA_COM_CMD_H_ */ diff --git a/drivers/amazon/net/efa/efa_common_defs.h b/drivers/amazon/net/efa/efa_common_defs.h new file mode 100644 index 0000000000000..90af1c82c9c62 --- /dev/null +++ b/drivers/amazon/net/efa/efa_common_defs.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_COMMON_H_ +#define _EFA_COMMON_H_ + +#include + +#define EFA_COMMON_SPEC_VERSION_MAJOR 2 +#define EFA_COMMON_SPEC_VERSION_MINOR 0 + +#define EFA_GET(ptr, mask) FIELD_GET(mask##_MASK, *(ptr)) + +#define EFA_SET(ptr, mask, value) \ + ({ \ + typeof(ptr) _ptr = ptr; \ + *_ptr = (*_ptr & ~(mask##_MASK)) | \ + FIELD_PREP(mask##_MASK, value); \ + }) + +struct efa_common_mem_addr { + u32 mem_addr_low; + + u32 mem_addr_high; +}; + +#endif /* _EFA_COMMON_H_ */ diff --git a/drivers/amazon/net/efa/efa_gdr.c b/drivers/amazon/net/efa/efa_gdr.c new file mode 100644 index 0000000000000..eb588f0369664 --- /dev/null +++ b/drivers/amazon/net/efa/efa_gdr.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2019-2023 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include + +#include "efa_p2p.h" +#include "nv-p2p.h" + +#define GPU_PAGE_SHIFT 16 +#define GPU_PAGE_SIZE BIT_ULL(GPU_PAGE_SHIFT) + +int efa_nv_peermem_p2p_get_pages(u64 p2p_token, u32 va_space, + u64 virtual_address, u64 length, + struct nvidia_p2p_page_table **page_table, + void (*free_callback)(void *data), void *data); + +int efa_nv_peermem_p2p_dma_map_pages(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping **dma_mapping); + +int efa_nv_peermem_p2p_dma_unmap_pages(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping *dma_mapping); + +int efa_nv_peermem_p2p_put_pages(u64 p2p_token, + u32 va_space, u64 virtual_address, + struct nvidia_p2p_page_table *page_table); + +struct efa_nvmem_ops { + int (*get_pages)(u64 p2p_token, u32 va_space, u64 virtual_address, + u64 length, struct nvidia_p2p_page_table **page_table, + void (*free_callback)(void *data), void *data); + int (*dma_map_pages)(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping **dma_mapping); + int (*put_pages)(u64 p2p_token, u32 va_space, u64 virtual_address, + struct nvidia_p2p_page_table *page_table); + int (*dma_unmap_pages)(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping *dma_mapping); + bool using_peermem_fp; +}; + +struct efa_nvmem { + struct efa_p2pmem p2pmem; + struct efa_nvmem_ops ops; + struct nvidia_p2p_page_table *pgtbl; + struct nvidia_p2p_dma_mapping *dma_mapping; + u64 virt_start; +}; + +static unsigned int nvmem_pgsz(struct efa_dev *dev, struct efa_p2pmem *p2pmem) +{ + struct efa_nvmem *nvmem; + + nvmem = container_of(p2pmem, struct efa_nvmem, p2pmem); + + switch (nvmem->pgtbl->page_size) { + case NVIDIA_P2P_PAGE_SIZE_4KB: + return SZ_4K; + case NVIDIA_P2P_PAGE_SIZE_64KB: + return SZ_64K; + case NVIDIA_P2P_PAGE_SIZE_128KB: + return SZ_128K; + default: + return 0; + } +} + +static int nvmem_get_peermem_fp(struct efa_nvmem_ops *ops) +{ + ops->get_pages = symbol_get(efa_nv_peermem_p2p_get_pages); + if (!ops->get_pages) + goto err_out; + + ops->put_pages = symbol_get(efa_nv_peermem_p2p_put_pages); + if (!ops->put_pages) + goto err_put_get_pages; + + ops->dma_map_pages = symbol_get(efa_nv_peermem_p2p_dma_map_pages); + if (!ops->dma_map_pages) + goto err_put_put_pages; + + ops->dma_unmap_pages = symbol_get(efa_nv_peermem_p2p_dma_unmap_pages); + if (!ops->dma_unmap_pages) + goto err_put_dma_map_pages; + + ops->using_peermem_fp = true; + return 0; + +err_put_dma_map_pages: + symbol_put(efa_nv_peermem_p2p_dma_map_pages); +err_put_put_pages: + symbol_put(efa_nv_peermem_p2p_put_pages); +err_put_get_pages: + symbol_put(efa_nv_peermem_p2p_get_pages); +err_out: + return -EINVAL; +} + +static int nvmem_get_nvidia_fp(struct efa_nvmem_ops *ops) +{ + ops->get_pages = symbol_get(nvidia_p2p_get_pages); + if (!ops->get_pages) + goto err_out; + + ops->put_pages = symbol_get(nvidia_p2p_put_pages); + if (!ops->put_pages) + goto err_put_get_pages; + + ops->dma_map_pages = symbol_get(nvidia_p2p_dma_map_pages); + if (!ops->dma_map_pages) + goto err_put_put_pages; + + ops->dma_unmap_pages = symbol_get(nvidia_p2p_dma_unmap_pages); + if (!ops->dma_unmap_pages) + goto err_put_dma_map_pages; + + return 0; + +err_put_dma_map_pages: + symbol_put(nvidia_p2p_dma_map_pages); +err_put_put_pages: + symbol_put(nvidia_p2p_put_pages); +err_put_get_pages: + symbol_put(nvidia_p2p_get_pages); +err_out: + return -EINVAL; +} + +static int nvmem_get_fp(struct 
efa_nvmem_ops *ops) +{ + if (!nvmem_get_peermem_fp(ops)) + return 0; + + return nvmem_get_nvidia_fp(ops); +} + +static void nvmem_put_fp(struct efa_nvmem_ops *ops) +{ + if (ops->using_peermem_fp) { + symbol_put(efa_nv_peermem_p2p_dma_unmap_pages); + symbol_put(efa_nv_peermem_p2p_dma_map_pages); + symbol_put(efa_nv_peermem_p2p_put_pages); + symbol_put(efa_nv_peermem_p2p_get_pages); + return; + } + + symbol_put(nvidia_p2p_dma_unmap_pages); + symbol_put(nvidia_p2p_dma_map_pages); + symbol_put(nvidia_p2p_put_pages); + symbol_put(nvidia_p2p_get_pages); +} + +static void nvmem_free_cb(void *data) +{ + pr_debug("Free callback ticket %llu\n", (u64)data); + efa_p2p_put((u64)data, true); +} + +static int nvmem_get_pages(struct efa_dev *dev, struct efa_nvmem *nvmem, + u64 addr, u64 size, u64 ticket) +{ + int err; + + err = nvmem->ops.get_pages(0, 0, addr, size, &nvmem->pgtbl, + nvmem_free_cb, (void *)ticket); + if (err) { + ibdev_dbg(&dev->ibdev, "nvidia_p2p_get_pages failed %d\n", err); + return err; + } + + if (!NVIDIA_P2P_PAGE_TABLE_VERSION_COMPATIBLE(nvmem->pgtbl)) { + ibdev_dbg(&dev->ibdev, "Incompatible page table version %#08x\n", + nvmem->pgtbl->version); + nvmem->ops.put_pages(0, 0, addr, nvmem->pgtbl); + nvmem->pgtbl = NULL; + return -EINVAL; + } + + return 0; +} + +static int nvmem_dma_map(struct efa_dev *dev, struct efa_nvmem *nvmem) +{ + int err; + + err = nvmem->ops.dma_map_pages(dev->pdev, nvmem->pgtbl, + &nvmem->dma_mapping); + if (err) { + ibdev_dbg(&dev->ibdev, "nvidia_p2p_dma_map_pages failed %d\n", + err); + return err; + } + + if (!NVIDIA_P2P_DMA_MAPPING_VERSION_COMPATIBLE(nvmem->dma_mapping)) { + ibdev_dbg(&dev->ibdev, "Incompatible DMA mapping version %#08x\n", + nvmem->dma_mapping->version); + nvmem->ops.dma_unmap_pages(dev->pdev, nvmem->pgtbl, + nvmem->dma_mapping); + nvmem->dma_mapping = NULL; + return -EINVAL; + } + + return 0; +} + +static struct efa_p2pmem *nvmem_get(struct efa_dev *dev, u64 ticket, u64 start, + u64 length) +{ + struct efa_nvmem *nvmem; + u64 virt_start; + u64 virt_end; + u64 pinsz; + int err; + + nvmem = kzalloc(sizeof(*nvmem), GFP_KERNEL); + if (!nvmem) + return NULL; + + virt_start = ALIGN_DOWN(start, GPU_PAGE_SIZE); + virt_end = ALIGN(start + length, GPU_PAGE_SIZE); + pinsz = virt_end - virt_start; + nvmem->virt_start = virt_start; + + err = nvmem_get_fp(&nvmem->ops); + if (err) + /* Nvidia module is not loaded */ + goto err_free; + + err = nvmem_get_pages(dev, nvmem, virt_start, pinsz, ticket); + if (err) + /* Most likely not our pages */ + goto err_put_fp; + + err = nvmem_dma_map(dev, nvmem); + if (err) + goto err_put; + + return &nvmem->p2pmem; + +err_put: + nvmem->ops.put_pages(0, 0, virt_start, nvmem->pgtbl); +err_put_fp: + nvmem_put_fp(&nvmem->ops); +err_free: + kfree(nvmem); + return NULL; +} + +static int nvmem_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list) +{ + struct nvidia_p2p_dma_mapping *dma_mapping; + struct efa_nvmem *nvmem; + int i; + + nvmem = container_of(p2pmem, struct efa_nvmem, p2pmem); + dma_mapping = nvmem->dma_mapping; + + for (i = 0; i < dma_mapping->entries; i++) + page_list[i] = dma_mapping->dma_addresses[i]; + + return 0; +} + +static void nvmem_release(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + bool in_cb) +{ + struct efa_nvmem *nvmem; + + nvmem = container_of(p2pmem, struct efa_nvmem, p2pmem); + + if (!in_cb) { + nvmem->ops.dma_unmap_pages(dev->pdev, nvmem->pgtbl, + nvmem->dma_mapping); + nvmem->ops.put_pages(0, 0, nvmem->virt_start, nvmem->pgtbl); + } + + nvmem_put_fp(&nvmem->ops); 
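+	/*
+	 * When entered from the NVIDIA free callback (in_cb), the page table
+	 * and DMA mapping are already being invalidated by the NVIDIA driver,
+	 * which is why they are not unmapped above in that path; only the
+	 * symbol references and this tracking struct are released.
+	 */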
+ kfree(nvmem); +} + +static char *nvmem_provider_string(void) +{ + struct efa_nvmem_ops ops = {}; + char *prov_string; + + if (nvmem_get_fp(&ops)) + return ""; + + prov_string = ops.using_peermem_fp ? "NVIDIA peermem" : "NVIDIA"; + nvmem_put_fp(&ops); + + return prov_string; +} + +struct nvmem_provider { + struct efa_p2p_provider p2p; +}; + +static const struct nvmem_provider prov = { + .p2p = { + .ops = { + .get_provider_string = nvmem_provider_string, + .try_get = nvmem_get, + .to_page_list = nvmem_to_page_list, + .release = nvmem_release, + .get_page_size = nvmem_pgsz, + }, + .type = EFA_P2P_PROVIDER_NVMEM, + }, +}; + +const struct efa_p2p_provider *nvmem_get_provider(void) +{ + struct efa_nvmem_ops ops = {}; + int err; + + err = request_module("nvidia"); + if (!err) { + err = nvmem_get_nvidia_fp(&ops); + if (err) + request_module("efa_nv_peermem"); + else + nvmem_put_fp(&ops); + } + + return &prov.p2p; +} diff --git a/drivers/amazon/net/efa/efa_io_defs.h b/drivers/amazon/net/efa/efa_io_defs.h new file mode 100644 index 0000000000000..2d8eb96eaa81b --- /dev/null +++ b/drivers/amazon/net/efa/efa_io_defs.h @@ -0,0 +1,305 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_IO_H_ +#define _EFA_IO_H_ + +#define EFA_IO_TX_DESC_NUM_BUFS 2 +#define EFA_IO_TX_DESC_NUM_RDMA_BUFS 1 +#define EFA_IO_TX_DESC_INLINE_MAX_SIZE 32 +#define EFA_IO_TX_DESC_IMM_DATA_SIZE 4 + +enum efa_io_queue_type { + /* send queue (of a QP) */ + EFA_IO_SEND_QUEUE = 1, + /* recv queue (of a QP) */ + EFA_IO_RECV_QUEUE = 2, +}; + +enum efa_io_send_op_type { + /* send message */ + EFA_IO_SEND = 0, + /* RDMA read */ + EFA_IO_RDMA_READ = 1, + /* RDMA write */ + EFA_IO_RDMA_WRITE = 2, +}; + +enum efa_io_comp_status { + /* Successful completion */ + EFA_IO_COMP_STATUS_OK = 0, + /* Flushed during QP destroy */ + EFA_IO_COMP_STATUS_FLUSHED = 1, + /* Internal QP error */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR = 2, + /* Bad operation type */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE = 3, + /* Bad AH */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH = 4, + /* LKEY not registered or does not match IOVA */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY = 5, + /* Message too long */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH = 6, + /* Destination ENI is down or does not run EFA */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS = 7, + /* Connection was reset by remote side */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT = 8, + /* Bad dest QP number (QP does not exist or is in error state) */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN = 9, + /* Destination resource not ready (no WQEs posted on RQ) */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_RNR = 10, + /* Receiver SGL too short */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH = 11, + /* Unexpected status returned by responder */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS = 12, + /* Unresponsive remote - detected locally */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE = 13, +}; + +struct efa_io_tx_meta_desc { + /* Verbs-generated Request ID */ + u16 req_id; + + /* + * control flags + * 3:0 : op_type - enum efa_io_send_op_type + * 4 : has_imm - immediate_data field carries valid + * data. + * 5 : inline_msg - inline mode - inline message data + * follows this descriptor (no buffer descriptors). + * Note that it is different from immediate data + * 6 : meta_extension - Extended metadata. MBZ + * 7 : meta_desc - Indicates metadata descriptor. + * Must be set. 
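+ *
+ * These flags are typically built with the EFA_SET() helper from
+ * efa_common_defs.h and the EFA_IO_TX_META_DESC_* masks defined at the
+ * end of this file; a sketch (meta is a hypothetical pointer to this
+ * descriptor):
+ *
+ *	EFA_SET(&meta->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE, EFA_IO_SEND);
+ *	EFA_SET(&meta->ctrl1, EFA_IO_TX_META_DESC_META_DESC, 1);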
+ */ + u8 ctrl1; + + /* + * control flags + * 0 : phase + * 1 : reserved25 - MBZ + * 2 : first - Indicates first descriptor in + * transaction. Must be set. + * 3 : last - Indicates last descriptor in + * transaction. Must be set. + * 4 : comp_req - Indicates whether completion should + * be posted, after packet is transmitted. Valid only + * for the first descriptor + * 7:5 : reserved29 - MBZ + */ + u8 ctrl2; + + u16 dest_qp_num; + + /* + * If inline_msg bit is set, length of inline message in bytes, + * otherwise length of SGL (number of buffers). + */ + u16 length; + + /* + * immediate data: if has_imm is set, then this field is included + * within Tx message and reported in remote Rx completion. + */ + u32 immediate_data; + + u16 ah; + + u16 reserved; + + /* Queue key */ + u32 qkey; + + u8 reserved2[12]; +}; + +/* + * Tx queue buffer descriptor, for any transport type. Preceded by metadata + * descriptor. + */ +struct efa_io_tx_buf_desc { + /* length in bytes */ + u32 length; + + /* + * 23:0 : lkey - local memory translation key + * 31:24 : reserved - MBZ + */ + u32 lkey; + + /* Buffer address bits[31:0] */ + u32 buf_addr_lo; + + /* Buffer address bits[63:32] */ + u32 buf_addr_hi; +}; + +struct efa_io_remote_mem_addr { + /* length in bytes */ + u32 length; + + /* remote memory translation key */ + u32 rkey; + + /* Buffer address bits[31:0] */ + u32 buf_addr_lo; + + /* Buffer address bits[63:32] */ + u32 buf_addr_hi; +}; + +struct efa_io_rdma_req { + /* Remote memory address */ + struct efa_io_remote_mem_addr remote_mem; + + /* Local memory address */ + struct efa_io_tx_buf_desc local_mem[1]; +}; + +/* + * Tx WQE, composed of tx meta descriptors followed by either tx buffer + * descriptors or inline data + */ +struct efa_io_tx_wqe { + /* TX meta */ + struct efa_io_tx_meta_desc meta; + + union { + /* Send buffer descriptors */ + struct efa_io_tx_buf_desc sgl[2]; + + u8 inline_data[32]; + + /* RDMA local and remote memory addresses */ + struct efa_io_rdma_req rdma_req; + } data; +}; + +/* + * Rx buffer descriptor; RX WQE is composed of one or more RX buffer + * descriptors. + */ +struct efa_io_rx_desc { + /* Buffer address bits[31:0] */ + u32 buf_addr_lo; + + /* Buffer Pointer[63:32] */ + u32 buf_addr_hi; + + /* Verbs-generated request id. */ + u16 req_id; + + /* Length in bytes. */ + u16 length; + + /* + * LKey and control flags + * 23:0 : lkey + * 29:24 : reserved - MBZ + * 30 : first - Indicates first descriptor in WQE + * 31 : last - Indicates last descriptor in WQE + */ + u32 lkey_ctrl; +}; + +/* Common IO completion descriptor */ +struct efa_io_cdesc_common { + /* + * verbs-generated request ID, as provided in the completed tx or rx + * descriptor. 
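+ * The request id is what lets the consumer match this completion
+ * back to the work request it originally posted (see the req_id
+ * fields in the tx meta and rx descriptors above).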
+ */ + u16 req_id; + + u8 status; + + /* + * flags + * 0 : phase - Phase bit + * 2:1 : q_type - enum efa_io_queue_type: send/recv + * 3 : has_imm - indicates that immediate data is + * present - for RX completions only + * 6:4 : op_type - enum efa_io_send_op_type + * 7 : reserved31 - MBZ + */ + u8 flags; + + /* local QP number */ + u16 qp_num; +}; + +/* Tx completion descriptor */ +struct efa_io_tx_cdesc { + /* Common completion info */ + struct efa_io_cdesc_common common; + + /* MBZ */ + u16 reserved16; +}; + +/* Rx Completion Descriptor */ +struct efa_io_rx_cdesc { + /* Common completion info */ + struct efa_io_cdesc_common common; + + /* Transferred length bits[15:0] */ + u16 length; + + /* Remote Address Handle FW index, 0xFFFF indicates invalid ah */ + u16 ah; + + u16 src_qp_num; + + /* Immediate data */ + u32 imm; +}; + +/* Rx Completion Descriptor RDMA write info */ +struct efa_io_rx_cdesc_rdma_write { + /* Transferred length bits[31:16] */ + u16 length_hi; +}; + +/* Extended Rx Completion Descriptor */ +struct efa_io_rx_cdesc_ex { + /* Base RX completion info */ + struct efa_io_rx_cdesc base; + + union { + struct efa_io_rx_cdesc_rdma_write rdma_write; + + /* + * Valid only in case of unknown AH (0xFFFF) and CQ + * set_src_addr is enabled. + */ + u8 src_addr[16]; + } u; +}; + +/* tx_meta_desc */ +#define EFA_IO_TX_META_DESC_OP_TYPE_MASK GENMASK(3, 0) +#define EFA_IO_TX_META_DESC_HAS_IMM_MASK BIT(4) +#define EFA_IO_TX_META_DESC_INLINE_MSG_MASK BIT(5) +#define EFA_IO_TX_META_DESC_META_EXTENSION_MASK BIT(6) +#define EFA_IO_TX_META_DESC_META_DESC_MASK BIT(7) +#define EFA_IO_TX_META_DESC_PHASE_MASK BIT(0) +#define EFA_IO_TX_META_DESC_FIRST_MASK BIT(2) +#define EFA_IO_TX_META_DESC_LAST_MASK BIT(3) +#define EFA_IO_TX_META_DESC_COMP_REQ_MASK BIT(4) + +/* tx_buf_desc */ +#define EFA_IO_TX_BUF_DESC_LKEY_MASK GENMASK(23, 0) + +/* rx_desc */ +#define EFA_IO_RX_DESC_LKEY_MASK GENMASK(23, 0) +#define EFA_IO_RX_DESC_FIRST_MASK BIT(30) +#define EFA_IO_RX_DESC_LAST_MASK BIT(31) + +/* cdesc_common */ +#define EFA_IO_CDESC_COMMON_PHASE_MASK BIT(0) +#define EFA_IO_CDESC_COMMON_Q_TYPE_MASK GENMASK(2, 1) +#define EFA_IO_CDESC_COMMON_HAS_IMM_MASK BIT(3) +#define EFA_IO_CDESC_COMMON_OP_TYPE_MASK GENMASK(6, 4) + +#endif /* _EFA_IO_H_ */ diff --git a/drivers/amazon/net/efa/efa_main.c b/drivers/amazon/net/efa/efa_main.c new file mode 100644 index 0000000000000..7332c296936e6 --- /dev/null +++ b/drivers/amazon/net/efa/efa_main.c @@ -0,0 +1,737 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "kcompat.h" +#include +#include +#include +#include + +#include +#include + +#include "efa.h" +#include "efa_sysfs.h" + +#include "efa_p2p.h" + +#define PCI_DEV_ID_EFA0_VF 0xefa0 +#define PCI_DEV_ID_EFA1_VF 0xefa1 +#define PCI_DEV_ID_EFA2_VF 0xefa2 + +static const struct pci_device_id efa_pci_tbl[] = { + { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA0_VF) }, + { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA1_VF) }, + { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA2_VF) }, + { } +}; + +#define DRV_MODULE_VER_MAJOR 2 +#define DRV_MODULE_VER_MINOR 8 +#define DRV_MODULE_VER_SUBMINOR 0 + +#ifndef DRV_MODULE_VERSION +#define DRV_MODULE_VERSION \ + __stringify(DRV_MODULE_VER_MAJOR) "." \ + __stringify(DRV_MODULE_VER_MINOR) "." \ + __stringify(DRV_MODULE_VER_SUBMINOR) "a" +#endif + +MODULE_VERSION(DRV_MODULE_VERSION); +MODULE_SOFTDEP("pre: ib_uverbs"); + +static char version[] = DEVICE_NAME " v" DRV_MODULE_VERSION; + +MODULE_AUTHOR("Amazon.com, Inc. 
or its affiliates"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION(DEVICE_NAME); +MODULE_DEVICE_TABLE(pci, efa_pci_tbl); + +#define EFA_REG_BAR 0 +#define EFA_MEM_BAR 2 +#define EFA_BASE_BAR_MASK (BIT(EFA_REG_BAR) | BIT(EFA_MEM_BAR)) + +#define EFA_AENQ_ENABLED_GROUPS \ + (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \ + BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE)) + +extern const struct uapi_definition efa_uapi_defs[]; + +/* This handler will called for unknown event group or unimplemented handlers */ +static void unimplemented_aenq_handler(void *data, + struct efa_admin_aenq_entry *aenq_e) +{ + struct efa_dev *dev = (struct efa_dev *)data; + + ibdev_err(&dev->ibdev, + "Unknown event was received or event with unimplemented handler\n"); +} + +static void efa_keep_alive(void *data, struct efa_admin_aenq_entry *aenq_e) +{ + struct efa_dev *dev = (struct efa_dev *)data; + + atomic64_inc(&dev->stats.keep_alive_rcvd); +} + +static struct efa_aenq_handlers aenq_handlers = { + .handlers = { + [EFA_ADMIN_KEEP_ALIVE] = efa_keep_alive, + }, + .unimplemented_handler = unimplemented_aenq_handler +}; + +static void efa_release_bars(struct efa_dev *dev, int bars_mask) +{ + struct pci_dev *pdev = dev->pdev; + int release_bars; + + release_bars = pci_select_bars(pdev, IORESOURCE_MEM) & bars_mask; + pci_release_selected_regions(pdev, release_bars); +} + +static void efa_process_comp_eqe(struct efa_dev *dev, struct efa_admin_eqe *eqe) +{ + u16 cqn = eqe->u.comp_event.cqn; + struct efa_cq *cq; + + /* Safe to load as we're in irq and removal calls synchronize_irq() */ + cq = xa_load(&dev->cqs_xa, cqn); + if (unlikely(!cq)) { + ibdev_err_ratelimited(&dev->ibdev, + "Completion event on non-existent CQ[%u]", + cqn); + return; + } + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +static void efa_process_eqe(struct efa_com_eq *eeq, struct efa_admin_eqe *eqe) +{ + struct efa_dev *dev = container_of(eeq->edev, struct efa_dev, edev); + + if (likely(EFA_GET(&eqe->common, EFA_ADMIN_EQE_EVENT_TYPE) == + EFA_ADMIN_EQE_EVENT_TYPE_COMPLETION)) + efa_process_comp_eqe(dev, eqe); + else + ibdev_err_ratelimited(&dev->ibdev, + "Unknown event type received %lu", + EFA_GET(&eqe->common, + EFA_ADMIN_EQE_EVENT_TYPE)); +} + +static irqreturn_t efa_intr_msix_comp(int irq, void *data) +{ + struct efa_eq *eq = data; + struct efa_com_dev *edev = eq->eeq.edev; + + efa_com_eq_comp_intr_handler(edev, &eq->eeq); + + return IRQ_HANDLED; +} + +static irqreturn_t efa_intr_msix_mgmnt(int irq, void *data) +{ + struct efa_dev *dev = data; + + efa_com_admin_q_comp_intr_handler(&dev->edev); + efa_com_aenq_intr_handler(&dev->edev, data); + + return IRQ_HANDLED; +} + +static int efa_request_irq(struct efa_dev *dev, struct efa_irq *irq) +{ + int err; + + err = request_irq(irq->irqn, irq->handler, 0, irq->name, irq->data); + if (err) { + dev_err(&dev->pdev->dev, "Failed to request irq %s (%d)\n", + irq->name, err); + return err; + } + + irq_set_affinity_hint(irq->irqn, &irq->affinity_hint_mask); + + return 0; +} + +static void efa_setup_comp_irq(struct efa_dev *dev, struct efa_eq *eq, + int vector) +{ + u32 cpu; + + cpu = vector - EFA_COMP_EQS_VEC_BASE; + snprintf(eq->irq.name, EFA_IRQNAME_SIZE, "efa-comp%d@pci:%s", cpu, + pci_name(dev->pdev)); + eq->irq.handler = efa_intr_msix_comp; + eq->irq.data = eq; + eq->irq.vector = vector; + eq->irq.irqn = pci_irq_vector(dev->pdev, vector); + cpumask_set_cpu(cpu, &eq->irq.affinity_hint_mask); +} + +static void efa_free_irq(struct efa_dev *dev, struct efa_irq *irq) +{ 
+ irq_set_affinity_hint(irq->irqn, NULL); + free_irq(irq->irqn, irq->data); +} + +static void efa_setup_mgmnt_irq(struct efa_dev *dev) +{ + u32 cpu; + + snprintf(dev->admin_irq.name, EFA_IRQNAME_SIZE, + "efa-mgmnt@pci:%s", pci_name(dev->pdev)); + dev->admin_irq.handler = efa_intr_msix_mgmnt; + dev->admin_irq.data = dev; + dev->admin_irq.vector = dev->admin_msix_vector_idx; + dev->admin_irq.irqn = pci_irq_vector(dev->pdev, + dev->admin_msix_vector_idx); + cpu = cpumask_first(cpu_online_mask); + cpumask_set_cpu(cpu, + &dev->admin_irq.affinity_hint_mask); + dev_info(&dev->pdev->dev, "Setup irq:%d name:%s\n", + dev->admin_irq.irqn, + dev->admin_irq.name); +} + +static int efa_set_mgmnt_irq(struct efa_dev *dev) +{ + efa_setup_mgmnt_irq(dev); + + return efa_request_irq(dev, &dev->admin_irq); +} + +static int efa_request_doorbell_bar(struct efa_dev *dev) +{ + u8 db_bar_idx = dev->dev_attr.db_bar; + struct pci_dev *pdev = dev->pdev; + int bars; + int err; + + if (!(BIT(db_bar_idx) & EFA_BASE_BAR_MASK)) { + bars = pci_select_bars(pdev, IORESOURCE_MEM) & BIT(db_bar_idx); + + err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (err) { + dev_err(&dev->pdev->dev, + "pci_request_selected_regions for bar %d failed %d\n", + db_bar_idx, err); + return err; + } + } + + dev->db_bar_addr = pci_resource_start(dev->pdev, db_bar_idx); + dev->db_bar_len = pci_resource_len(dev->pdev, db_bar_idx); + + return 0; +} + +static void efa_release_doorbell_bar(struct efa_dev *dev) +{ + if (!(BIT(dev->dev_attr.db_bar) & EFA_BASE_BAR_MASK)) + efa_release_bars(dev, BIT(dev->dev_attr.db_bar)); +} + +static void efa_update_hw_hints(struct efa_dev *dev, + struct efa_com_get_hw_hints_result *hw_hints) +{ + struct efa_com_dev *edev = &dev->edev; + + if (hw_hints->mmio_read_timeout) + edev->mmio_read.mmio_read_timeout = + hw_hints->mmio_read_timeout * 1000; + + if (hw_hints->poll_interval) + edev->aq.poll_interval = hw_hints->poll_interval; + + if (hw_hints->admin_completion_timeout) + edev->aq.completion_timeout = + hw_hints->admin_completion_timeout; +} + +static void efa_stats_init(struct efa_dev *dev) +{ + atomic64_t *s = (atomic64_t *)&dev->stats; + int i; + + for (i = 0; i < sizeof(dev->stats) / sizeof(*s); i++, s++) + atomic64_set(s, 0); +} + +static void efa_set_host_info(struct efa_dev *dev) +{ + struct efa_admin_set_feature_resp resp = {}; + struct efa_admin_set_feature_cmd cmd = {}; + struct efa_admin_host_info *hinf; + u32 bufsz = sizeof(*hinf); + dma_addr_t hinf_dma; + + if (!efa_com_check_supported_feature_id(&dev->edev, + EFA_ADMIN_HOST_INFO)) + return; + + /* Failures in host info set shall not disturb probe */ + hinf = dma_alloc_coherent(&dev->pdev->dev, bufsz, &hinf_dma, + GFP_KERNEL); + if (!hinf) + return; + + strscpy(hinf->os_dist_str, utsname()->release, + sizeof(hinf->os_dist_str)); + hinf->os_type = EFA_ADMIN_OS_LINUX; + strscpy(hinf->kernel_ver_str, utsname()->version, + sizeof(hinf->kernel_ver_str)); + hinf->kernel_ver = LINUX_VERSION_CODE; + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MAJOR, + DRV_MODULE_VER_MAJOR); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MINOR, + DRV_MODULE_VER_MINOR); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_SUB_MINOR, + DRV_MODULE_VER_SUBMINOR); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MODULE_TYPE, + "a"[0]); + EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_BUS, dev->pdev->bus->number); + EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_DEVICE, + PCI_SLOT(dev->pdev->devfn)); + EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_FUNCTION, + 
PCI_FUNC(dev->pdev->devfn)); + EFA_SET(&hinf->spec_ver, EFA_ADMIN_HOST_INFO_SPEC_MAJOR, + EFA_COMMON_SPEC_VERSION_MAJOR); + EFA_SET(&hinf->spec_ver, EFA_ADMIN_HOST_INFO_SPEC_MINOR, + EFA_COMMON_SPEC_VERSION_MINOR); + EFA_SET(&hinf->flags, EFA_ADMIN_HOST_INFO_INTREE, 1); + EFA_SET(&hinf->flags, EFA_ADMIN_HOST_INFO_GDR, 1); + + efa_com_set_feature_ex(&dev->edev, &resp, &cmd, EFA_ADMIN_HOST_INFO, + hinf_dma, bufsz); + + dma_free_coherent(&dev->pdev->dev, bufsz, hinf, hinf_dma); +} + +static void efa_destroy_eq(struct efa_dev *dev, struct efa_eq *eq) +{ + efa_com_eq_destroy(&dev->edev, &eq->eeq); + efa_free_irq(dev, &eq->irq); +} + +static int efa_create_eq(struct efa_dev *dev, struct efa_eq *eq, u8 msix_vec) +{ + int err; + + efa_setup_comp_irq(dev, eq, msix_vec); + err = efa_request_irq(dev, &eq->irq); + if (err) + return err; + + err = efa_com_eq_init(&dev->edev, &eq->eeq, efa_process_eqe, + dev->dev_attr.max_eq_depth, msix_vec); + if (err) + goto err_free_comp_irq; + + return 0; + +err_free_comp_irq: + efa_free_irq(dev, &eq->irq); + return err; +} + +static int efa_create_eqs(struct efa_dev *dev) +{ + unsigned int neqs = dev->dev_attr.max_eq; + int err; + int i; + + neqs = min_t(unsigned int, neqs, num_online_cpus()); + dev->neqs = neqs; + dev->eqs = kcalloc(neqs, sizeof(*dev->eqs), GFP_KERNEL); + if (!dev->eqs) + return -ENOMEM; + + for (i = 0; i < neqs; i++) { + err = efa_create_eq(dev, &dev->eqs[i], + i + EFA_COMP_EQS_VEC_BASE); + if (err) + goto err_destroy_eqs; + } + + return 0; + +err_destroy_eqs: + for (i--; i >= 0; i--) + efa_destroy_eq(dev, &dev->eqs[i]); + kfree(dev->eqs); + + return err; +} + +static void efa_destroy_eqs(struct efa_dev *dev) +{ + int i; + + for (i = 0; i < dev->neqs; i++) + efa_destroy_eq(dev, &dev->eqs[i]); + + kfree(dev->eqs); +} + +static const struct ib_device_ops efa_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_EFA, + .uverbs_abi_ver = EFA_UVERBS_ABI_VERSION, + + .alloc_hw_port_stats = efa_alloc_hw_port_stats, + .alloc_hw_device_stats = efa_alloc_hw_device_stats, + .alloc_pd = efa_alloc_pd, + .alloc_ucontext = efa_alloc_ucontext, + .create_cq = efa_create_cq, + .create_qp = efa_create_qp, + .create_user_ah = efa_create_ah, + .dealloc_pd = efa_dealloc_pd, + .dealloc_ucontext = efa_dealloc_ucontext, + .dereg_mr = efa_dereg_mr, + .destroy_ah = efa_destroy_ah, + .destroy_cq = efa_destroy_cq, + .destroy_qp = efa_destroy_qp, + .get_hw_stats = efa_get_hw_stats, + .get_link_layer = efa_port_link_layer, + .get_port_immutable = efa_get_port_immutable, + .mmap = efa_mmap, + .mmap_free = efa_mmap_free, + .modify_qp = efa_modify_qp, + .query_device = efa_query_device, + .query_gid = efa_query_gid, + .query_pkey = efa_query_pkey, + .query_port = efa_query_port, + .query_qp = efa_query_qp, + .reg_user_mr = efa_reg_mr, + .reg_user_mr_dmabuf = efa_reg_user_mr_dmabuf, + + INIT_RDMA_OBJ_SIZE(ib_ah, efa_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, efa_cq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_pd, efa_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_qp, efa_qp, ibqp), + INIT_RDMA_OBJ_SIZE(ib_ucontext, efa_ucontext, ibucontext), +}; + +static int efa_ib_device_add(struct efa_dev *dev) +{ + struct efa_com_get_hw_hints_result hw_hints; + struct pci_dev *pdev = dev->pdev; + int err; + + efa_stats_init(dev); + + err = efa_com_get_device_attr(&dev->edev, &dev->dev_attr); + if (err) + return err; + + dev_dbg(&dev->pdev->dev, "Doorbells bar (%d)\n", dev->dev_attr.db_bar); + err = efa_request_doorbell_bar(dev); + if (err) + return err; + + err = efa_com_get_hw_hints(&dev->edev, &hw_hints); + if 
(err) + goto err_release_doorbell_bar; + + efa_update_hw_hints(dev, &hw_hints); + + /* Try to enable all the available aenq groups */ + err = efa_com_set_aenq_config(&dev->edev, EFA_AENQ_ENABLED_GROUPS); + if (err) + goto err_release_doorbell_bar; + + err = efa_create_eqs(dev); + if (err) + goto err_release_doorbell_bar; + + efa_set_host_info(dev); + + dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED; + dev->ibdev.phys_port_cnt = 1; + dev->ibdev.num_comp_vectors = dev->neqs ?: 1; + dev->ibdev.dev.parent = &pdev->dev; + + ib_set_device_ops(&dev->ibdev, &efa_dev_ops); + + dev->ibdev.driver_def = efa_uapi_defs; + + err = ib_register_device(&dev->ibdev, "efa_%d", &pdev->dev); + if (err) + goto err_destroy_eqs; + + ibdev_info(&dev->ibdev, "IB device registered\n"); + + return 0; + +err_destroy_eqs: + efa_destroy_eqs(dev); +err_release_doorbell_bar: + efa_release_doorbell_bar(dev); + return err; +} + +static void efa_ib_device_remove(struct efa_dev *dev) +{ + ibdev_info(&dev->ibdev, "Unregister ib device\n"); + ib_unregister_device(&dev->ibdev); + efa_destroy_eqs(dev); + efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_NORMAL); + efa_release_doorbell_bar(dev); +} + +static void efa_disable_msix(struct efa_dev *dev) +{ + pci_free_irq_vectors(dev->pdev); +} + +static int efa_enable_msix(struct efa_dev *dev) +{ + int msix_vecs, irq_num; + + /* + * Reserve the max msix vectors we might need, one vector is reserved + * for admin. + */ + msix_vecs = min_t(int, pci_msix_vec_count(dev->pdev), + num_online_cpus() + 1); + dev_dbg(&dev->pdev->dev, "Trying to enable MSI-X, vectors %d\n", + msix_vecs); + + dev->admin_msix_vector_idx = EFA_MGMNT_MSIX_VEC_IDX; + irq_num = pci_alloc_irq_vectors(dev->pdev, msix_vecs, + msix_vecs, PCI_IRQ_MSIX); + + if (irq_num < 0) { + dev_err(&dev->pdev->dev, "Failed to enable MSI-X. 
irq_num %d\n", + irq_num); + return -ENOSPC; + } + + if (irq_num != msix_vecs) { + efa_disable_msix(dev); + dev_err(&dev->pdev->dev, + "Allocated %d MSI-X (out of %d requested)\n", + irq_num, msix_vecs); + return -ENOSPC; + } + + return 0; +} + +static int efa_device_init(struct efa_com_dev *edev, struct pci_dev *pdev) +{ + int dma_width; + int err; + + err = efa_com_dev_reset(edev, EFA_REGS_RESET_NORMAL); + if (err) + return err; + + err = efa_com_validate_version(edev); + if (err) + return err; + + dma_width = efa_com_get_dma_width(edev); + if (dma_width < 0) { + err = dma_width; + return err; + } + + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(dma_width)); + if (err) { + dev_err(&pdev->dev, "dma_set_mask_and_coherent failed %d\n", err); + return err; + } + + dma_set_max_seg_size(&pdev->dev, UINT_MAX); + return 0; +} + +static struct efa_dev *efa_probe_device(struct pci_dev *pdev) +{ + struct efa_com_dev *edev; + struct efa_dev *dev; + int bars; + int err; + + err = pci_enable_device_mem(pdev); + if (err) { + dev_err(&pdev->dev, "pci_enable_device_mem() failed!\n"); + return ERR_PTR(err); + } + + pci_set_master(pdev); + + dev = ib_alloc_device(efa_dev, ibdev); + if (!dev) { + dev_err(&pdev->dev, "Device alloc failed\n"); + err = -ENOMEM; + goto err_disable_device; + } + + pci_set_drvdata(pdev, dev); + edev = &dev->edev; + edev->efa_dev = dev; + edev->dmadev = &pdev->dev; + dev->pdev = pdev; + xa_init(&dev->cqs_xa); + + bars = pci_select_bars(pdev, IORESOURCE_MEM) & EFA_BASE_BAR_MASK; + err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (err) { + dev_err(&pdev->dev, "pci_request_selected_regions failed %d\n", + err); + goto err_ibdev_destroy; + } + + dev->reg_bar_addr = pci_resource_start(pdev, EFA_REG_BAR); + dev->reg_bar_len = pci_resource_len(pdev, EFA_REG_BAR); + dev->mem_bar_addr = pci_resource_start(pdev, EFA_MEM_BAR); + dev->mem_bar_len = pci_resource_len(pdev, EFA_MEM_BAR); + + edev->reg_bar = devm_ioremap(&pdev->dev, + dev->reg_bar_addr, + dev->reg_bar_len); + if (!edev->reg_bar) { + dev_err(&pdev->dev, "Failed to remap register bar\n"); + err = -EFAULT; + goto err_release_bars; + } + + err = efa_com_mmio_reg_read_init(edev); + if (err) { + dev_err(&pdev->dev, "Failed to init readless MMIO\n"); + goto err_iounmap; + } + + err = efa_device_init(edev, pdev); + if (err) { + dev_err(&pdev->dev, "EFA device init failed\n"); + if (err == -ETIME) + err = -EPROBE_DEFER; + goto err_reg_read_destroy; + } + + err = efa_enable_msix(dev); + if (err) + goto err_reg_read_destroy; + + edev->aq.msix_vector_idx = dev->admin_msix_vector_idx; + edev->aenq.msix_vector_idx = dev->admin_msix_vector_idx; + + err = efa_set_mgmnt_irq(dev); + if (err) + goto err_disable_msix; + + err = efa_com_admin_init(edev, &aenq_handlers); + if (err) + goto err_free_mgmnt_irq; + + err = efa_sysfs_init(dev); + if (err) + goto err_admin_destroy; + + return dev; + +err_admin_destroy: + efa_com_admin_destroy(edev); +err_free_mgmnt_irq: + efa_free_irq(dev, &dev->admin_irq); +err_disable_msix: + efa_disable_msix(dev); +err_reg_read_destroy: + efa_com_mmio_reg_read_destroy(edev); +err_iounmap: + devm_iounmap(&pdev->dev, edev->reg_bar); +err_release_bars: + efa_release_bars(dev, EFA_BASE_BAR_MASK); +err_ibdev_destroy: + ib_dealloc_device(&dev->ibdev); +err_disable_device: + pci_disable_device(pdev); + return ERR_PTR(err); +} + +static void efa_remove_device(struct pci_dev *pdev) +{ + struct efa_dev *dev = pci_get_drvdata(pdev); + struct efa_com_dev *edev; + + edev = &dev->edev; + 
efa_sysfs_destroy(dev); + efa_com_admin_destroy(edev); + efa_free_irq(dev, &dev->admin_irq); + efa_disable_msix(dev); + efa_com_mmio_reg_read_destroy(edev); + devm_iounmap(&pdev->dev, edev->reg_bar); + efa_release_bars(dev, EFA_BASE_BAR_MASK); + xa_destroy(&dev->cqs_xa); + ib_dealloc_device(&dev->ibdev); + pci_disable_device(pdev); +} + +static int efa_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct efa_dev *dev; + int err; + + dev = efa_probe_device(pdev); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + err = efa_ib_device_add(dev); + if (err) + goto err_remove_device; + + return 0; + +err_remove_device: + efa_remove_device(pdev); + return err; +} + +static void efa_remove(struct pci_dev *pdev) +{ + struct efa_dev *dev = pci_get_drvdata(pdev); + + efa_ib_device_remove(dev); + efa_remove_device(pdev); +} + +static struct pci_driver efa_pci_driver = { + .name = DRV_MODULE_NAME, + .id_table = efa_pci_tbl, + .probe = efa_probe, + .remove = efa_remove, +}; + +static int __init efa_init(void) +{ + int err; + + pr_info("%s\n", version); + + err = pci_register_driver(&efa_pci_driver); + if (err) { + pr_err("Couldn't register efa driver\n"); + return err; + } + + efa_p2p_init(); + + return 0; +} + +static void __exit efa_exit(void) +{ + pci_unregister_driver(&efa_pci_driver); +} + +module_init(efa_init); +module_exit(efa_exit); diff --git a/drivers/amazon/net/efa/efa_neuron.c b/drivers/amazon/net/efa/efa_neuron.c new file mode 100644 index 0000000000000..15a9917ac2cc2 --- /dev/null +++ b/drivers/amazon/net/efa/efa_neuron.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2021-2023 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include + +#include "efa_p2p.h" +#include "neuron_p2p.h" + +#define NEURON_PAGE_SHIFT 12 +#define NEURON_PAGE_SIZE BIT_ULL(NEURON_PAGE_SHIFT) + +struct efa_neuronmem_ops { + int (*register_va)(u64 virtual_address, u64 length, + struct neuron_p2p_va_info **vainfo, + void (*free_callback)(void *data), + void *data); + int (*unregister_va)(struct neuron_p2p_va_info *vainfo); +}; + +struct efa_neuronmem { + struct efa_p2pmem p2pmem; + struct efa_neuronmem_ops ops; + struct neuron_p2p_va_info *va_info; + u64 virt_start; +}; + +static unsigned int neuronmem_pgsz(struct efa_dev *dev, + struct efa_p2pmem *p2pmem) +{ + struct efa_neuronmem *neuronmem; + + neuronmem = container_of(p2pmem, struct efa_neuronmem, p2pmem); + return BIT(neuronmem->va_info->shift_page_size); +} + +static int neuronmem_get_fp(struct efa_neuronmem *neuronmem) +{ + neuronmem->ops.register_va = symbol_get(neuron_p2p_register_va); + if (!neuronmem->ops.register_va) + goto err_out; + + neuronmem->ops.unregister_va = symbol_get(neuron_p2p_unregister_va); + if (!neuronmem->ops.unregister_va) + goto err_put_register_va; + + return 0; + +err_put_register_va: + symbol_put(neuron_p2p_register_va); +err_out: + return -EINVAL; +} + +static void neuronmem_put_fp(void) +{ + symbol_put(neuron_p2p_unregister_va); + symbol_put(neuron_p2p_register_va); +} + +static void neuronmem_free_cb(void *data) +{ + pr_debug("Free callback ticket %llu\n", (u64)data); + efa_p2p_put((u64)data, true); +} + +static int neuronmem_register_va(struct efa_dev *dev, struct efa_neuronmem *neuronmem, + u64 addr, u64 size, u64 ticket) +{ + int err; + + err = neuronmem->ops.register_va(addr, size, &neuronmem->va_info, + neuronmem_free_cb, (void *)ticket); + if (err) { + ibdev_dbg(&dev->ibdev, "neuron_p2p_register_va failed %d\n", err); + return err; + } + + return 0; +} 
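+
+/*
+ * neuronmem_get() below is the ->try_get() callback wired into the
+ * provider ops: it aligns the VA range to Neuron pages, resolves the
+ * neuron_p2p symbols and registers the range, returning NULL when the
+ * Neuron driver is absent or the pages are not Neuron memory.
+ */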
+ +static struct efa_p2pmem *neuronmem_get(struct efa_dev *dev, u64 ticket, u64 start, + u64 length) +{ + struct efa_neuronmem *neuronmem; + u64 virt_start; + u64 virt_end; + u64 pinsz; + int err; + + neuronmem = kzalloc(sizeof(*neuronmem), GFP_KERNEL); + if (!neuronmem) + return NULL; + + virt_start = ALIGN_DOWN(start, NEURON_PAGE_SIZE); + virt_end = ALIGN(start + length, NEURON_PAGE_SIZE); + pinsz = virt_end - virt_start; + neuronmem->virt_start = virt_start; + + err = neuronmem_get_fp(neuronmem); + if (err) + /* Neuron module is not loaded */ + goto err_free; + + err = neuronmem_register_va(dev, neuronmem, virt_start, pinsz, ticket); + if (err) + /* Most likely not our pages */ + goto err_put_fp; + + return &neuronmem->p2pmem; + +err_put_fp: + neuronmem_put_fp(); +err_free: + kfree(neuronmem); + return NULL; +} + +static int neuronmem_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list) +{ + struct neuron_p2p_page_info *pg_info; + struct neuron_p2p_va_info *va_info; + struct efa_neuronmem *neuronmem; + int ent_idx, pa_idx; + int pg_idx = 0; + u64 pa; + + neuronmem = container_of(p2pmem, struct efa_neuronmem, p2pmem); + va_info = neuronmem->va_info; + + for (ent_idx = 0; ent_idx < va_info->entries; ent_idx++) { + pg_info = va_info->page_info + ent_idx; + pa = pg_info->physical_address; + for (pa_idx = 0; pa_idx < pg_info->page_count; pa_idx++) { + page_list[pg_idx++] = pa; + pa += BIT(va_info->shift_page_size); + } + } + + return 0; +} + +static void neuronmem_release(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + bool in_cb) +{ + struct efa_neuronmem *neuronmem; + + neuronmem = container_of(p2pmem, struct efa_neuronmem, p2pmem); + + neuronmem->ops.unregister_va(neuronmem->va_info); + neuronmem_put_fp(); + kfree(neuronmem); +} + +static char *neuronmem_provider_string(void) +{ + struct efa_neuronmem dummy = {}; + + if (neuronmem_get_fp(&dummy)) + return ""; + + neuronmem_put_fp(); + return "NEURON"; +} + +struct neuronmem_provider { + struct efa_p2p_provider p2p; +}; + +static const struct neuronmem_provider prov = { + .p2p = { + .ops = { + .get_provider_string = neuronmem_provider_string, + .try_get = neuronmem_get, + .to_page_list = neuronmem_to_page_list, + .release = neuronmem_release, + .get_page_size = neuronmem_pgsz, + }, + .type = EFA_P2P_PROVIDER_NEURON, + }, +}; + +const struct efa_p2p_provider *neuronmem_get_provider(void) +{ + return &prov.p2p; +} diff --git a/drivers/amazon/net/efa/efa_p2p.c b/drivers/amazon/net/efa/efa_p2p.c new file mode 100644 index 0000000000000..ab05f3a5d170b --- /dev/null +++ b/drivers/amazon/net/efa/efa_p2p.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2019-2023 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "efa_p2p.h" + +static struct mutex p2p_list_lock; +static struct list_head p2p_list; +static atomic64_t next_p2p_ticket; + +static const struct efa_p2p_provider *prov_arr[EFA_P2P_PROVIDER_MAX]; + +/* Register all providers here */ +static void p2p_providers_init(void) +{ + prov_arr[EFA_P2P_PROVIDER_NVMEM] = nvmem_get_provider(); + prov_arr[EFA_P2P_PROVIDER_NEURON] = neuronmem_get_provider(); +} + +void efa_p2p_init(void) +{ + mutex_init(&p2p_list_lock); + INIT_LIST_HEAD(&p2p_list); + /* + * Ideally, first ticket would be zero, but that would make callback + * data NULL which is invalid. 
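+ * (The ticket is the opaque pointer handed to the peer driver's free
+ * callback; see nvmem_free_cb() and neuronmem_free_cb().)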
+ */ + atomic64_set(&next_p2p_ticket, 1); + + p2p_providers_init(); +} + +static struct efa_p2pmem *ticket_to_p2p(u64 ticket) +{ + struct efa_p2pmem *p2pmem; + + lockdep_assert_held(&p2p_list_lock); + list_for_each_entry(p2pmem, &p2p_list, list) { + if (p2pmem->ticket == ticket) + return p2pmem; + } + + return NULL; +} + +char *efa_p2p_provider_string(void) +{ + const struct efa_p2p_provider *prov; + char *prov_string; + int i; + + for (i = 0; i < EFA_P2P_PROVIDER_MAX; i++) { + prov = prov_arr[i]; + prov_string = prov->ops.get_provider_string(); + if (prov_string[0] != '\0') + /* Only the first available provider is returned */ + return prov_string; + } + + return ""; +} + +int efa_p2p_put(u64 ticket, bool in_cb) +{ + struct efa_com_dereg_mr_params params = {}; + struct efa_p2pmem *p2pmem; + struct efa_dev *dev; + int err; + + mutex_lock(&p2p_list_lock); + p2pmem = ticket_to_p2p(ticket); + if (!p2pmem) { + pr_debug("Ticket %llu not found in the p2pmem list\n", ticket); + mutex_unlock(&p2p_list_lock); + return 0; + } + + dev = p2pmem->dev; + if (p2pmem->needs_dereg) { + params.l_key = p2pmem->lkey; + err = efa_com_dereg_mr(&dev->edev, ¶ms); + if (err) { + mutex_unlock(&p2p_list_lock); + return err; + } + p2pmem->needs_dereg = false; + } + + list_del(&p2pmem->list); + mutex_unlock(&p2p_list_lock); + p2pmem->prov->ops.release(dev, p2pmem, in_cb); + + return 0; +} + +struct efa_p2pmem *efa_p2p_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, + u64 length) +{ + const struct efa_p2p_provider *prov; + static bool message_printed; + struct efa_p2pmem *p2pmem; + u64 ticket; + int i; + + ticket = atomic64_fetch_inc(&next_p2p_ticket); + for (i = 0; i < EFA_P2P_PROVIDER_MAX; i++) { + prov = prov_arr[i]; + p2pmem = prov->ops.try_get(dev, ticket, start, length); + if (p2pmem) + break; + } + if (!p2pmem) + /* No provider was found, most likely cpu pages */ + return NULL; + + p2pmem->dev = dev; + p2pmem->ticket = ticket; + p2pmem->prov = prov; + mr->p2p_ticket = p2pmem->ticket; + + if (!message_printed) { + pr_info("efa: Acquired peer memory using P2P"); + message_printed = true; + } + + mutex_lock(&p2p_list_lock); + list_add(&p2pmem->list, &p2p_list); + mutex_unlock(&p2p_list_lock); + + return p2pmem; +} + +int efa_p2p_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list) +{ + return p2pmem->prov->ops.to_page_list(dev, p2pmem, page_list); +} + +unsigned int efa_p2p_get_page_size(struct efa_dev *dev, + struct efa_p2pmem *p2pmem) +{ + return p2pmem->prov->ops.get_page_size(dev, p2pmem); +} diff --git a/drivers/amazon/net/efa/efa_p2p.h b/drivers/amazon/net/efa/efa_p2p.h new file mode 100644 index 0000000000000..5a4bf353ec633 --- /dev/null +++ b/drivers/amazon/net/efa/efa_p2p.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2019-2023 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_P2P_H_ +#define _EFA_P2P_H_ + +#include "efa.h" + +struct efa_p2p_ops { + char *(*get_provider_string)(void); + struct efa_p2pmem *(*try_get)(struct efa_dev *dev, u64 ticket, u64 start, + u64 length); + int (*to_page_list)(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list); + void (*release)(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + bool in_cb); + unsigned int (*get_page_size)(struct efa_dev *dev, + struct efa_p2pmem *p2pmem); +}; + +enum efa_p2p_prov { + EFA_P2P_PROVIDER_NVMEM, + EFA_P2P_PROVIDER_NEURON, + EFA_P2P_PROVIDER_MAX, +}; + +struct efa_p2p_provider { + const struct efa_p2p_ops ops; + enum efa_p2p_prov type; +}; + +struct efa_p2pmem { + struct efa_dev *dev; + const struct efa_p2p_provider *prov; + u64 ticket; + u32 lkey; + bool needs_dereg; + struct list_head list; /* member of efa_p2p_list */ +}; + +void efa_p2p_init(void); +char *efa_p2p_provider_string(void); +struct efa_p2pmem *efa_p2p_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, + u64 length); +unsigned int efa_p2p_get_page_size(struct efa_dev *dev, + struct efa_p2pmem *p2pmem); +int efa_p2p_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list); +int efa_p2p_put(u64 ticket, bool in_cb); + +/* Provider specific stuff go here */ +const struct efa_p2p_provider *nvmem_get_provider(void); + +const struct efa_p2p_provider *neuronmem_get_provider(void); + +#endif /* _EFA_P2P_H_ */ diff --git a/drivers/amazon/net/efa/efa_regs_defs.h b/drivers/amazon/net/efa/efa_regs_defs.h new file mode 100644 index 0000000000000..714ae62588004 --- /dev/null +++ b/drivers/amazon/net/efa/efa_regs_defs.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_REGS_H_ +#define _EFA_REGS_H_ + +enum efa_regs_reset_reason_types { + EFA_REGS_RESET_NORMAL = 0, + /* Keep alive timeout */ + EFA_REGS_RESET_KEEP_ALIVE_TO = 1, + EFA_REGS_RESET_ADMIN_TO = 2, + EFA_REGS_RESET_INIT_ERR = 3, + EFA_REGS_RESET_DRIVER_INVALID_STATE = 4, + EFA_REGS_RESET_OS_TRIGGER = 5, + EFA_REGS_RESET_SHUTDOWN = 6, + EFA_REGS_RESET_USER_TRIGGER = 7, + EFA_REGS_RESET_GENERIC = 8, +}; + +/* efa_registers offsets */ + +/* 0 base */ +#define EFA_REGS_VERSION_OFF 0x0 +#define EFA_REGS_CONTROLLER_VERSION_OFF 0x4 +#define EFA_REGS_CAPS_OFF 0x8 +#define EFA_REGS_AQ_BASE_LO_OFF 0x10 +#define EFA_REGS_AQ_BASE_HI_OFF 0x14 +#define EFA_REGS_AQ_CAPS_OFF 0x18 +#define EFA_REGS_ACQ_BASE_LO_OFF 0x20 +#define EFA_REGS_ACQ_BASE_HI_OFF 0x24 +#define EFA_REGS_ACQ_CAPS_OFF 0x28 +#define EFA_REGS_AQ_PROD_DB_OFF 0x2c +#define EFA_REGS_AENQ_CAPS_OFF 0x34 +#define EFA_REGS_AENQ_BASE_LO_OFF 0x38 +#define EFA_REGS_AENQ_BASE_HI_OFF 0x3c +#define EFA_REGS_AENQ_CONS_DB_OFF 0x40 +#define EFA_REGS_INTR_MASK_OFF 0x4c +#define EFA_REGS_DEV_CTL_OFF 0x54 +#define EFA_REGS_DEV_STS_OFF 0x58 +#define EFA_REGS_MMIO_REG_READ_OFF 0x5c +#define EFA_REGS_MMIO_RESP_LO_OFF 0x60 +#define EFA_REGS_MMIO_RESP_HI_OFF 0x64 +#define EFA_REGS_EQ_DB_OFF 0x68 + +/* version register */ +#define EFA_REGS_VERSION_MINOR_VERSION_MASK 0xff +#define EFA_REGS_VERSION_MAJOR_VERSION_MASK 0xff00 + +/* controller_version register */ +#define EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK 0xff +#define EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK 0xff00 +#define EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK 0xff0000 +#define EFA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK 0xff000000 + +/* caps register */ +#define EFA_REGS_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK 0x1 +#define EFA_REGS_CAPS_RESET_TIMEOUT_MASK 0x3e +#define EFA_REGS_CAPS_DMA_ADDR_WIDTH_MASK 0xff00 +#define EFA_REGS_CAPS_ADMIN_CMD_TO_MASK 0xf0000 + +/* aq_caps register */ +#define EFA_REGS_AQ_CAPS_AQ_DEPTH_MASK 0xffff +#define EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK 0xffff0000 + +/* acq_caps register */ +#define EFA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK 0xffff +#define EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK 0xff0000 +#define EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_MASK 0xff000000 + +/* aenq_caps register */ +#define EFA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK 0xffff +#define EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK 0xff0000 +#define EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_MASK 0xff000000 + +/* intr_mask register */ +#define EFA_REGS_INTR_MASK_EN_MASK 0x1 + +/* dev_ctl register */ +#define EFA_REGS_DEV_CTL_DEV_RESET_MASK 0x1 +#define EFA_REGS_DEV_CTL_AQ_RESTART_MASK 0x2 +#define EFA_REGS_DEV_CTL_RESET_REASON_MASK 0xf0000000 + +/* dev_sts register */ +#define EFA_REGS_DEV_STS_READY_MASK 0x1 +#define EFA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK 0x2 +#define EFA_REGS_DEV_STS_AQ_RESTART_FINISHED_MASK 0x4 +#define EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK 0x8 +#define EFA_REGS_DEV_STS_RESET_FINISHED_MASK 0x10 +#define EFA_REGS_DEV_STS_FATAL_ERROR_MASK 0x20 + +/* mmio_reg_read register */ +#define EFA_REGS_MMIO_REG_READ_REQ_ID_MASK 0xffff +#define EFA_REGS_MMIO_REG_READ_REG_OFF_MASK 0xffff0000 + +/* eq_db register */ +#define EFA_REGS_EQ_DB_EQN_MASK 0xffff +#define EFA_REGS_EQ_DB_ARM_MASK 0x80000000 + +#endif /* _EFA_REGS_H_ */ diff --git a/drivers/amazon/net/efa/efa_sysfs.c b/drivers/amazon/net/efa/efa_sysfs.c new file mode 100644 index 0000000000000..1cd729c758159 --- /dev/null +++ b/drivers/amazon/net/efa/efa_sysfs.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 
2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "efa_sysfs.h" +#include "kcompat.h" + +#include +#include + +#include "efa_p2p.h" + +static ssize_t p2p_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%s\n", efa_p2p_provider_string()); +} + +static DEVICE_ATTR_RO(p2p); + +int efa_sysfs_init(struct efa_dev *dev) +{ + struct device *device = &dev->pdev->dev; + + if (device_create_file(device, &dev_attr_p2p)) + dev_err(device, "Failed to create P2P sysfs file\n"); + return 0; +} + +void efa_sysfs_destroy(struct efa_dev *dev) +{ + device_remove_file(&dev->pdev->dev, &dev_attr_p2p); +} diff --git a/drivers/amazon/net/efa/efa_sysfs.h b/drivers/amazon/net/efa/efa_sysfs.h new file mode 100644 index 0000000000000..c390aa547e5a6 --- /dev/null +++ b/drivers/amazon/net/efa/efa_sysfs.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_SYSFS_H_ +#define _EFA_SYSFS_H_ + +#include "efa.h" + +int efa_sysfs_init(struct efa_dev *dev); + +void efa_sysfs_destroy(struct efa_dev *dev); + +#endif /* _EFA_SYSFS_H_ */ diff --git a/drivers/amazon/net/efa/efa_verbs.c b/drivers/amazon/net/efa/efa_verbs.c new file mode 100644 index 0000000000000..6ed927b005a5c --- /dev/null +++ b/drivers/amazon/net/efa/efa_verbs.c @@ -0,0 +1,2267 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "kcompat.h" +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#define UVERBS_MODULE_NAME efa_ib +#include +#include + +#include "efa.h" +#include "efa_io_defs.h" + +#include "efa_p2p.h" + +enum { + EFA_MMAP_DMA_PAGE = 0, + EFA_MMAP_IO_WC, + EFA_MMAP_IO_NC, +}; + +#define EFA_AENQ_ENABLED_GROUPS \ + (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \ + BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE)) + +struct efa_user_mmap_entry { + struct rdma_user_mmap_entry rdma_entry; + u64 address; + u8 mmap_flag; +}; + +#define EFA_DEFINE_DEVICE_STATS(op) \ + op(EFA_SUBMITTED_CMDS, "submitted_cmds") \ + op(EFA_COMPLETED_CMDS, "completed_cmds") \ + op(EFA_CMDS_ERR, "cmds_err") \ + op(EFA_NO_COMPLETION_CMDS, "no_completion_cmds") \ + op(EFA_KEEP_ALIVE_RCVD, "keep_alive_rcvd") \ + op(EFA_ALLOC_PD_ERR, "alloc_pd_err") \ + op(EFA_CREATE_QP_ERR, "create_qp_err") \ + op(EFA_CREATE_CQ_ERR, "create_cq_err") \ + op(EFA_REG_MR_ERR, "reg_mr_err") \ + op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \ + op(EFA_CREATE_AH_ERR, "create_ah_err") \ + op(EFA_MMAP_ERR, "mmap_err") + +#define EFA_DEFINE_PORT_STATS(op) \ + op(EFA_TX_BYTES, "tx_bytes") \ + op(EFA_TX_PKTS, "tx_pkts") \ + op(EFA_RX_BYTES, "rx_bytes") \ + op(EFA_RX_PKTS, "rx_pkts") \ + op(EFA_RX_DROPS, "rx_drops") \ + op(EFA_SEND_BYTES, "send_bytes") \ + op(EFA_SEND_WRS, "send_wrs") \ + op(EFA_RECV_BYTES, "recv_bytes") \ + op(EFA_RECV_WRS, "recv_wrs") \ + op(EFA_RDMA_READ_WRS, "rdma_read_wrs") \ + op(EFA_RDMA_READ_BYTES, "rdma_read_bytes") \ + op(EFA_RDMA_READ_WR_ERR, "rdma_read_wr_err") \ + op(EFA_RDMA_READ_RESP_BYTES, "rdma_read_resp_bytes") \ + op(EFA_RDMA_WRITE_WRS, "rdma_write_wrs") \ + op(EFA_RDMA_WRITE_BYTES, "rdma_write_bytes") \ + op(EFA_RDMA_WRITE_WR_ERR, "rdma_write_wr_err") \ + op(EFA_RDMA_WRITE_RECV_BYTES, "rdma_write_recv_bytes") \ + +#define EFA_STATS_ENUM(ename, name) ename, +#define EFA_STATS_STR(ename, nam) \ + [ename].name = 
nam, + +enum efa_hw_device_stats { + EFA_DEFINE_DEVICE_STATS(EFA_STATS_ENUM) +}; + +static const struct rdma_stat_desc efa_device_stats_descs[] = { + EFA_DEFINE_DEVICE_STATS(EFA_STATS_STR) +}; + +enum efa_hw_port_stats { + EFA_DEFINE_PORT_STATS(EFA_STATS_ENUM) +}; + +static const struct rdma_stat_desc efa_port_stats_descs[] = { + EFA_DEFINE_PORT_STATS(EFA_STATS_STR) +}; + +#define EFA_CHUNK_PAYLOAD_SHIFT 12 +#define EFA_CHUNK_PAYLOAD_SIZE BIT(EFA_CHUNK_PAYLOAD_SHIFT) +#define EFA_CHUNK_PAYLOAD_PTR_SIZE 8 + +#define EFA_CHUNK_SHIFT 12 +#define EFA_CHUNK_SIZE BIT(EFA_CHUNK_SHIFT) +#define EFA_CHUNK_PTR_SIZE sizeof(struct efa_com_ctrl_buff_info) + +#define EFA_PTRS_PER_CHUNK \ + ((EFA_CHUNK_SIZE - EFA_CHUNK_PTR_SIZE) / EFA_CHUNK_PAYLOAD_PTR_SIZE) + +#define EFA_CHUNK_USED_SIZE \ + ((EFA_PTRS_PER_CHUNK * EFA_CHUNK_PAYLOAD_PTR_SIZE) + EFA_CHUNK_PTR_SIZE) + +struct pbl_chunk { + dma_addr_t dma_addr; + u64 *buf; + u32 length; +}; + +struct pbl_chunk_list { + struct pbl_chunk *chunks; + unsigned int size; +}; + +struct pbl_context { + union { + struct { + dma_addr_t dma_addr; + } continuous; + struct { + u32 pbl_buf_size_in_pages; + struct scatterlist *sgl; + int sg_dma_cnt; + struct pbl_chunk_list chunk_list; + } indirect; + } phys; + u64 *pbl_buf; + u32 pbl_buf_size_in_bytes; + u8 physically_continuous; +}; + +static inline struct efa_dev *to_edev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct efa_dev, ibdev); +} + +static inline struct efa_ucontext *to_eucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct efa_ucontext, ibucontext); +} + +static inline struct efa_pd *to_epd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct efa_pd, ibpd); +} + +static inline struct efa_mr *to_emr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct efa_mr, ibmr); +} + +static inline struct efa_qp *to_eqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct efa_qp, ibqp); +} + +static inline struct efa_cq *to_ecq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct efa_cq, ibcq); +} + +static inline struct efa_ah *to_eah(struct ib_ah *ibah) +{ + return container_of(ibah, struct efa_ah, ibah); +} + +static inline struct efa_user_mmap_entry * +to_emmap(struct rdma_user_mmap_entry *rdma_entry) +{ + return container_of(rdma_entry, struct efa_user_mmap_entry, rdma_entry); +} + +#define EFA_DEV_CAP(dev, cap) \ + ((dev)->dev_attr.device_caps & \ + EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_##cap##_MASK) + +#define is_reserved_cleared(reserved) \ + !memchr_inv(reserved, 0, sizeof(reserved)) + +static void *efa_zalloc_mapped(struct efa_dev *dev, dma_addr_t *dma_addr, + size_t size, enum dma_data_direction dir) +{ + void *addr; + + addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); + if (!addr) + return NULL; + + *dma_addr = dma_map_single(&dev->pdev->dev, addr, size, dir); + if (dma_mapping_error(&dev->pdev->dev, *dma_addr)) { + ibdev_err(&dev->ibdev, "Failed to map DMA address\n"); + free_pages_exact(addr, size); + return NULL; + } + + return addr; +} + +static void efa_free_mapped(struct efa_dev *dev, void *cpu_addr, + dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir) +{ + dma_unmap_single(&dev->pdev->dev, dma_addr, size, dir); + free_pages_exact(cpu_addr, size); +} + +int efa_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *udata) +{ + struct efa_com_get_device_attr_result *dev_attr; + struct efa_ibv_ex_query_device_resp resp = {}; + struct efa_dev *dev = to_edev(ibdev); + int err; + + if (udata 
&& udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + ibdev_dbg(ibdev, + "Incompatible ABI params, udata not cleared\n"); + return -EINVAL; + } + + dev_attr = &dev->dev_attr; + + memset(props, 0, sizeof(*props)); + props->max_mr_size = dev_attr->max_mr_pages * PAGE_SIZE; + props->page_size_cap = dev_attr->page_size_cap; + props->vendor_id = dev->pdev->vendor; + props->vendor_part_id = dev->pdev->device; + props->hw_ver = dev->pdev->subsystem_device; + props->max_qp = dev_attr->max_qp; + props->max_cq = dev_attr->max_cq; + props->max_pd = dev_attr->max_pd; + props->max_mr = dev_attr->max_mr; + props->max_ah = dev_attr->max_ah; + props->max_cqe = dev_attr->max_cq_depth; + props->max_qp_wr = min_t(u32, dev_attr->max_sq_depth, + dev_attr->max_rq_depth); + props->max_send_sge = dev_attr->max_sq_sge; + props->max_recv_sge = dev_attr->max_rq_sge; + props->max_sge_rd = dev_attr->max_wr_rdma_sge; + props->max_pkeys = 1; + + if (udata && udata->outlen) { + resp.max_sq_sge = dev_attr->max_sq_sge; + resp.max_rq_sge = dev_attr->max_rq_sge; + resp.max_sq_wr = dev_attr->max_sq_depth; + resp.max_rq_wr = dev_attr->max_rq_depth; + resp.max_rdma_size = dev_attr->max_rdma_size; + + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID; + if (EFA_DEV_CAP(dev, RDMA_READ)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_READ; + + if (EFA_DEV_CAP(dev, RNR_RETRY)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RNR_RETRY; + + if (EFA_DEV_CAP(dev, DATA_POLLING_128)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128; + + if (EFA_DEV_CAP(dev, RDMA_WRITE)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_WRITE; + + if (dev->neqs) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS; + + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(ibdev, + "Failed to copy udata for query_device\n"); + return err; + } + } + + return 0; +} + +int efa_query_port(struct ib_device *ibdev, port_t port, + struct ib_port_attr *props) +{ + struct efa_dev *dev = to_edev(ibdev); + + props->lmc = 1; + + props->state = IB_PORT_ACTIVE; + props->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->active_speed = IB_SPEED_EDR; + props->active_width = IB_WIDTH_4X; + props->max_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); + props->active_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); + props->max_msg_sz = dev->dev_attr.mtu; + props->max_vl_num = 1; + + return 0; +} + +int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct efa_dev *dev = to_edev(ibqp->device); + struct efa_com_query_qp_params params = {}; + struct efa_com_query_qp_result result; + struct efa_qp *qp = to_eqp(ibqp); + int err; + +#define EFA_QUERY_QP_SUPP_MASK \ + (IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | \ + IB_QP_QKEY | IB_QP_SQ_PSN | IB_QP_CAP | IB_QP_RNR_RETRY) + + if (qp_attr_mask & ~EFA_QUERY_QP_SUPP_MASK) { + ibdev_dbg(&dev->ibdev, + "Unsupported qp_attr_mask[%#x] supported[%#x]\n", + qp_attr_mask, EFA_QUERY_QP_SUPP_MASK); + return -EOPNOTSUPP; + } + + memset(qp_attr, 0, sizeof(*qp_attr)); + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + + params.qp_handle = qp->qp_handle; + err = efa_com_query_qp(&dev->edev, ¶ms, &result); + if (err) + return err; + + qp_attr->qp_state = result.qp_state; + qp_attr->qkey = result.qkey; + qp_attr->sq_psn = result.sq_psn; + qp_attr->sq_draining = result.sq_draining; + qp_attr->port_num = 1; + qp_attr->rnr_retry = 
result.rnr_retry; + + qp_attr->cap.max_send_wr = qp->max_send_wr; + qp_attr->cap.max_recv_wr = qp->max_recv_wr; + qp_attr->cap.max_send_sge = qp->max_send_sge; + qp_attr->cap.max_recv_sge = qp->max_recv_sge; + qp_attr->cap.max_inline_data = qp->max_inline_data; + + qp_init_attr->qp_type = ibqp->qp_type; + qp_init_attr->recv_cq = ibqp->recv_cq; + qp_init_attr->send_cq = ibqp->send_cq; + qp_init_attr->qp_context = ibqp->qp_context; + qp_init_attr->cap = qp_attr->cap; + + return 0; +} + +int efa_query_gid(struct ib_device *ibdev, port_t port, int index, + union ib_gid *gid) +{ + struct efa_dev *dev = to_edev(ibdev); + + memcpy(gid->raw, dev->dev_attr.addr, sizeof(dev->dev_attr.addr)); + + return 0; +} + +int efa_query_pkey(struct ib_device *ibdev, port_t port, u16 index, + u16 *pkey) +{ + if (index > 0) + return -EINVAL; + + *pkey = 0xffff; + return 0; +} + +static int efa_pd_dealloc(struct efa_dev *dev, u16 pdn) +{ + struct efa_com_dealloc_pd_params params = { + .pdn = pdn, + }; + + return efa_com_dealloc_pd(&dev->edev, ¶ms); +} + +int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_ibv_alloc_pd_resp resp = {}; + struct efa_com_alloc_pd_result result; + struct efa_pd *pd = to_epd(ibpd); + int err; + + if (udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, udata not cleared\n"); + err = -EINVAL; + goto err_out; + } + + err = efa_com_alloc_pd(&dev->edev, &result); + if (err) + goto err_out; + + pd->pdn = result.pdn; + resp.pdn = result.pdn; + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Failed to copy udata for alloc_pd\n"); + goto err_dealloc_pd; + } + } + + ibdev_dbg(&dev->ibdev, "Allocated pd[%d]\n", pd->pdn); + + return 0; + +err_dealloc_pd: + efa_pd_dealloc(dev, result.pdn); +err_out: + atomic64_inc(&dev->stats.alloc_pd_err); + return err; +} + +int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_pd *pd = to_epd(ibpd); + + ibdev_dbg(&dev->ibdev, "Dealloc pd[%d]\n", pd->pdn); + efa_pd_dealloc(dev, pd->pdn); + return 0; +} + +static int efa_destroy_qp_handle(struct efa_dev *dev, u32 qp_handle) +{ + struct efa_com_destroy_qp_params params = { .qp_handle = qp_handle }; + + return efa_com_destroy_qp(&dev->edev, ¶ms); +} + +static void efa_qp_user_mmap_entries_remove(struct efa_qp *qp) +{ + rdma_user_mmap_entry_remove(qp->rq_mmap_entry); + rdma_user_mmap_entry_remove(qp->rq_db_mmap_entry); + rdma_user_mmap_entry_remove(qp->llq_desc_mmap_entry); + rdma_user_mmap_entry_remove(qp->sq_db_mmap_entry); +} + +int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibqp->pd->device); + struct efa_qp *qp = to_eqp(ibqp); + int err; + + ibdev_dbg(&dev->ibdev, "Destroy qp[%u]\n", ibqp->qp_num); + + err = efa_destroy_qp_handle(dev, qp->qp_handle); + if (err) + return err; + + efa_qp_user_mmap_entries_remove(qp); + + if (qp->rq_cpu_addr) { + ibdev_dbg(&dev->ibdev, + "qp->cpu_addr[0x%p] freed: size[%lu], dma[%pad]\n", + qp->rq_cpu_addr, qp->rq_size, + &qp->rq_dma_addr); + efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr, + qp->rq_size, DMA_TO_DEVICE); + } + + return 0; +} + +static struct rdma_user_mmap_entry* +efa_user_mmap_entry_insert(struct ib_ucontext *ucontext, + u64 address, size_t length, + u8 mmap_flag, u64 *offset) +{ + struct efa_user_mmap_entry 
*entry = kzalloc(sizeof(*entry), GFP_KERNEL); + int err; + + if (!entry) + return NULL; + + entry->address = address; + entry->mmap_flag = mmap_flag; + + err = rdma_user_mmap_entry_insert(ucontext, &entry->rdma_entry, + length); + if (err) { + kfree(entry); + return NULL; + } + *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + + return &entry->rdma_entry; +} + +static int qp_mmap_entries_setup(struct efa_qp *qp, + struct efa_dev *dev, + struct efa_ucontext *ucontext, + struct efa_com_create_qp_params *params, + struct efa_ibv_create_qp_resp *resp) +{ + size_t length; + u64 address; + + address = dev->db_bar_addr + resp->sq_db_offset; + qp->sq_db_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, + PAGE_SIZE, EFA_MMAP_IO_NC, + &resp->sq_db_mmap_key); + if (!qp->sq_db_mmap_entry) + return -ENOMEM; + + resp->sq_db_offset &= ~PAGE_MASK; + + address = dev->mem_bar_addr + resp->llq_desc_offset; + length = PAGE_ALIGN(params->sq_ring_size_in_bytes + + (resp->llq_desc_offset & ~PAGE_MASK)); + + qp->llq_desc_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, length, + EFA_MMAP_IO_WC, + &resp->llq_desc_mmap_key); + if (!qp->llq_desc_mmap_entry) + goto err_remove_mmap; + + resp->llq_desc_offset &= ~PAGE_MASK; + + if (qp->rq_size) { + address = dev->db_bar_addr + resp->rq_db_offset; + + qp->rq_db_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, PAGE_SIZE, + EFA_MMAP_IO_NC, + &resp->rq_db_mmap_key); + if (!qp->rq_db_mmap_entry) + goto err_remove_mmap; + + resp->rq_db_offset &= ~PAGE_MASK; + + address = virt_to_phys(qp->rq_cpu_addr); + qp->rq_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, qp->rq_size, + EFA_MMAP_DMA_PAGE, + &resp->rq_mmap_key); + if (!qp->rq_mmap_entry) + goto err_remove_mmap; + + resp->rq_mmap_size = qp->rq_size; + } + + return 0; + +err_remove_mmap: + efa_qp_user_mmap_entries_remove(qp); + + return -ENOMEM; +} + +static int efa_qp_validate_cap(struct efa_dev *dev, + struct ib_qp_init_attr *init_attr) +{ + if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) { + ibdev_dbg(&dev->ibdev, + "qp: requested send wr[%u] exceeds the max[%u]\n", + init_attr->cap.max_send_wr, + dev->dev_attr.max_sq_depth); + return -EINVAL; + } + if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) { + ibdev_dbg(&dev->ibdev, + "qp: requested receive wr[%u] exceeds the max[%u]\n", + init_attr->cap.max_recv_wr, + dev->dev_attr.max_rq_depth); + return -EINVAL; + } + if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) { + ibdev_dbg(&dev->ibdev, + "qp: requested sge send[%u] exceeds the max[%u]\n", + init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge); + return -EINVAL; + } + if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) { + ibdev_dbg(&dev->ibdev, + "qp: requested sge recv[%u] exceeds the max[%u]\n", + init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge); + return -EINVAL; + } + if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) { + ibdev_dbg(&dev->ibdev, + "qp: requested inline data[%u] exceeds the max[%u]\n", + init_attr->cap.max_inline_data, + dev->dev_attr.inline_buf_size); + return -EINVAL; + } + + return 0; +} + +static int efa_qp_validate_attr(struct efa_dev *dev, + struct ib_qp_init_attr *init_attr) +{ + if (init_attr->qp_type != IB_QPT_DRIVER && + init_attr->qp_type != IB_QPT_UD) { + ibdev_dbg(&dev->ibdev, + "Unsupported qp type %d\n", init_attr->qp_type); + return -EOPNOTSUPP; + } + + if (init_attr->srq) { + ibdev_dbg(&dev->ibdev, "SRQ 
is not supported\n"); + return -EOPNOTSUPP; + } + + if (init_attr->create_flags) { + ibdev_dbg(&dev->ibdev, "Unsupported create flags\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct efa_com_create_qp_params create_qp_params = {}; + struct efa_com_create_qp_result create_qp_resp; + struct efa_dev *dev = to_edev(ibqp->device); + struct efa_ibv_create_qp_resp resp = {}; + struct efa_ibv_create_qp cmd = {}; + struct efa_qp *qp = to_eqp(ibqp); + struct efa_ucontext *ucontext; + int err; + + ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext, + ibucontext); + + err = efa_qp_validate_cap(dev, init_attr); + if (err) + goto err_out; + + err = efa_qp_validate_attr(dev, init_attr); + if (err) + goto err_out; + + if (offsetofend(typeof(cmd), driver_qp_type) > udata->inlen) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, no input udata\n"); + err = -EINVAL; + goto err_out; + } + + if (udata->inlen > sizeof(cmd) && + !ib_is_udata_cleared(udata, sizeof(cmd), + udata->inlen - sizeof(cmd))) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Cannot copy udata for create_qp\n"); + goto err_out; + } + + if (cmd.comp_mask) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + create_qp_params.uarn = ucontext->uarn; + create_qp_params.pd = to_epd(ibqp->pd)->pdn; + + if (init_attr->qp_type == IB_QPT_UD) { + create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_UD; + } else if (cmd.driver_qp_type == EFA_QP_DRIVER_TYPE_SRD) { + create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_SRD; + } else { + ibdev_dbg(&dev->ibdev, + "Unsupported qp type %d driver qp type %d\n", + init_attr->qp_type, cmd.driver_qp_type); + err = -EOPNOTSUPP; + goto err_out; + } + + ibdev_dbg(&dev->ibdev, "Create QP: qp type %d driver qp type %#x\n", + init_attr->qp_type, cmd.driver_qp_type); + create_qp_params.send_cq_idx = to_ecq(init_attr->send_cq)->cq_idx; + create_qp_params.recv_cq_idx = to_ecq(init_attr->recv_cq)->cq_idx; + create_qp_params.sq_depth = init_attr->cap.max_send_wr; + create_qp_params.sq_ring_size_in_bytes = cmd.sq_ring_size; + + create_qp_params.rq_depth = init_attr->cap.max_recv_wr; + create_qp_params.rq_ring_size_in_bytes = cmd.rq_ring_size; + qp->rq_size = PAGE_ALIGN(create_qp_params.rq_ring_size_in_bytes); + if (qp->rq_size) { + qp->rq_cpu_addr = efa_zalloc_mapped(dev, &qp->rq_dma_addr, + qp->rq_size, DMA_TO_DEVICE); + if (!qp->rq_cpu_addr) { + err = -ENOMEM; + goto err_out; + } + + ibdev_dbg(&dev->ibdev, + "qp->cpu_addr[0x%p] allocated: size[%lu], dma[%pad]\n", + qp->rq_cpu_addr, qp->rq_size, &qp->rq_dma_addr); + create_qp_params.rq_base_addr = qp->rq_dma_addr; + } + + err = efa_com_create_qp(&dev->edev, &create_qp_params, + &create_qp_resp); + if (err) + goto err_free_mapped; + + resp.sq_db_offset = create_qp_resp.sq_db_offset; + resp.rq_db_offset = create_qp_resp.rq_db_offset; + resp.llq_desc_offset = create_qp_resp.llq_descriptors_offset; + resp.send_sub_cq_idx = create_qp_resp.send_sub_cq_idx; + resp.recv_sub_cq_idx = create_qp_resp.recv_sub_cq_idx; + + err = qp_mmap_entries_setup(qp, dev, ucontext, &create_qp_params, + &resp); + if (err) + goto err_destroy_qp; + + qp->qp_handle = create_qp_resp.qp_handle; + qp->ibqp.qp_num = 
create_qp_resp.qp_num; + qp->max_send_wr = init_attr->cap.max_send_wr; + qp->max_recv_wr = init_attr->cap.max_recv_wr; + qp->max_send_sge = init_attr->cap.max_send_sge; + qp->max_recv_sge = init_attr->cap.max_recv_sge; + qp->max_inline_data = init_attr->cap.max_inline_data; + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Failed to copy udata for qp[%u]\n", + create_qp_resp.qp_num); + goto err_remove_mmap_entries; + } + } + + ibdev_dbg(&dev->ibdev, "Created qp[%d]\n", qp->ibqp.qp_num); + + return 0; + +err_remove_mmap_entries: + efa_qp_user_mmap_entries_remove(qp); +err_destroy_qp: + efa_destroy_qp_handle(dev, create_qp_resp.qp_handle); +err_free_mapped: + if (qp->rq_size) + efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr, + qp->rq_size, DMA_TO_DEVICE); +err_out: + atomic64_inc(&dev->stats.create_qp_err); + return err; +} + +static const struct { + int valid; + enum ib_qp_attr_mask req_param; + enum ib_qp_attr_mask opt_param; +} srd_qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .req_param = IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY, + }, + }, + [IB_QPS_INIT] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .opt_param = IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY, + }, + [IB_QPS_RTR] = { + .valid = 1, + .opt_param = IB_QP_PKEY_INDEX | + IB_QP_QKEY, + }, + }, + [IB_QPS_RTR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .req_param = IB_QP_SQ_PSN, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY | + IB_QP_RNR_RETRY, + + } + }, + [IB_QPS_RTS] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY, + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = IB_QP_EN_SQD_ASYNC_NOTIFY, + }, + }, + [IB_QPS_SQD] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY, + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = IB_QP_PKEY_INDEX | + IB_QP_QKEY, + } + }, + [IB_QPS_SQE] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY, + } + }, + [IB_QPS_ERR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + } +}; + +static bool efa_modify_srd_qp_is_ok(enum ib_qp_state cur_state, + enum ib_qp_state next_state, + enum ib_qp_attr_mask mask) +{ + enum ib_qp_attr_mask req_param, opt_param; + + if (mask & IB_QP_CUR_STATE && + cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && + cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) + return false; + + if (!srd_qp_state_table[cur_state][next_state].valid) + return false; + + req_param = srd_qp_state_table[cur_state][next_state].req_param; + opt_param = srd_qp_state_table[cur_state][next_state].opt_param; + + if ((mask & req_param) != req_param) + return false; + + if (mask & ~(req_param | opt_param | IB_QP_STATE)) + return false; + + return true; +} + +static int efa_modify_qp_validate(struct efa_dev *dev, struct efa_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + enum ib_qp_state cur_state, + enum ib_qp_state new_state) +{ + int err; + +#define EFA_MODIFY_QP_SUPP_MASK \ + (IB_QP_STATE | IB_QP_CUR_STATE | IB_QP_EN_SQD_ASYNC_NOTIFY | \ + 
IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY | IB_QP_SQ_PSN | \ + IB_QP_RNR_RETRY) + + if (qp_attr_mask & ~EFA_MODIFY_QP_SUPP_MASK) { + ibdev_dbg(&dev->ibdev, + "Unsupported qp_attr_mask[%#x] supported[%#x]\n", + qp_attr_mask, EFA_MODIFY_QP_SUPP_MASK); + return -EOPNOTSUPP; + } + + if (qp->ibqp.qp_type == IB_QPT_DRIVER) + err = !efa_modify_srd_qp_is_ok(cur_state, new_state, + qp_attr_mask); + else + err = !ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD, + qp_attr_mask); + + if (err) { + ibdev_dbg(&dev->ibdev, "Invalid modify QP parameters\n"); + return -EINVAL; + } + + if ((qp_attr_mask & IB_QP_PORT) && qp_attr->port_num != 1) { + ibdev_dbg(&dev->ibdev, "Can't change port num\n"); + return -EOPNOTSUPP; + } + + if ((qp_attr_mask & IB_QP_PKEY_INDEX) && qp_attr->pkey_index) { + ibdev_dbg(&dev->ibdev, "Can't change pkey index\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibqp->device); + struct efa_com_modify_qp_params params = {}; + struct efa_qp *qp = to_eqp(ibqp); + enum ib_qp_state cur_state; + enum ib_qp_state new_state; + int err; + + if (qp_attr_mask & ~IB_QP_ATTR_STANDARD_BITS) + return -EOPNOTSUPP; + + if (udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, udata not cleared\n"); + return -EINVAL; + } + + cur_state = qp_attr_mask & IB_QP_CUR_STATE ? qp_attr->cur_qp_state : + qp->state; + new_state = qp_attr_mask & IB_QP_STATE ? qp_attr->qp_state : cur_state; + + err = efa_modify_qp_validate(dev, qp, qp_attr, qp_attr_mask, cur_state, + new_state); + if (err) + return err; + + params.qp_handle = qp->qp_handle; + + if (qp_attr_mask & IB_QP_STATE) { + EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_QP_STATE, + 1); + EFA_SET(&params.modify_mask, + EFA_ADMIN_MODIFY_QP_CMD_CUR_QP_STATE, 1); + params.cur_qp_state = cur_state; + params.qp_state = new_state; + } + + if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { + EFA_SET(&params.modify_mask, + EFA_ADMIN_MODIFY_QP_CMD_SQ_DRAINED_ASYNC_NOTIFY, 1); + params.sq_drained_async_notify = qp_attr->en_sqd_async_notify; + } + + if (qp_attr_mask & IB_QP_QKEY) { + EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_QKEY, 1); + params.qkey = qp_attr->qkey; + } + + if (qp_attr_mask & IB_QP_SQ_PSN) { + EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_SQ_PSN, 1); + params.sq_psn = qp_attr->sq_psn; + } + + if (qp_attr_mask & IB_QP_RNR_RETRY) { + EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_RNR_RETRY, + 1); + params.rnr_retry = qp_attr->rnr_retry; + } + + err = efa_com_modify_qp(&dev->edev, &params); + if (err) + return err; + + qp->state = new_state; + + return 0; +} + +static int efa_destroy_cq_idx(struct efa_dev *dev, int cq_idx) +{ + struct efa_com_destroy_cq_params params = { .cq_idx = cq_idx }; + + return efa_com_destroy_cq(&dev->edev, &params); +} + +static void efa_cq_user_mmap_entries_remove(struct efa_cq *cq) +{ + rdma_user_mmap_entry_remove(cq->db_mmap_entry); + rdma_user_mmap_entry_remove(cq->mmap_entry); +} + +int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibcq->device); + struct efa_cq *cq = to_ecq(ibcq); + + ibdev_dbg(&dev->ibdev, + "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n", + cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); + + efa_destroy_cq_idx(dev, cq->cq_idx); + efa_cq_user_mmap_entries_remove(cq); + if (cq->eq) { + xa_erase(&dev->cqs_xa, cq->cq_idx); +
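 /* Descriptive note (added comment): with the CQ gone from cqs_xa, wait for any EQ interrupt handler that may still be referencing it to finish before its memory is freed below. */ +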
synchronize_irq(cq->eq->irq.irqn); + } + efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, + DMA_FROM_DEVICE); + return 0; +} + +static struct efa_eq *efa_vec2eq(struct efa_dev *dev, int vec) +{ + return &dev->eqs[vec]; +} + +static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, + struct efa_ibv_create_cq_resp *resp, + bool db_valid) +{ + resp->q_mmap_size = cq->size; + cq->mmap_entry = efa_user_mmap_entry_insert(&cq->ucontext->ibucontext, + virt_to_phys(cq->cpu_addr), + cq->size, EFA_MMAP_DMA_PAGE, + &resp->q_mmap_key); + if (!cq->mmap_entry) + return -ENOMEM; + + if (db_valid) { + cq->db_mmap_entry = + efa_user_mmap_entry_insert(&cq->ucontext->ibucontext, + dev->db_bar_addr + resp->db_off, + PAGE_SIZE, EFA_MMAP_IO_NC, + &resp->db_mmap_key); + if (!cq->db_mmap_entry) { + rdma_user_mmap_entry_remove(cq->mmap_entry); + return -ENOMEM; + } + + resp->db_off &= ~PAGE_MASK; + resp->comp_mask |= EFA_CREATE_CQ_RESP_DB_OFF; + } + + return 0; +} + +int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + struct efa_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct efa_ucontext, ibucontext); + struct efa_com_create_cq_params params = {}; + struct efa_ibv_create_cq_resp resp = {}; + struct efa_com_create_cq_result result; + struct ib_device *ibdev = ibcq->device; + struct efa_dev *dev = to_edev(ibdev); + struct efa_ibv_create_cq cmd = {}; + struct efa_cq *cq = to_ecq(ibcq); + int entries = attr->cqe; + bool set_src_addr; + int err; + + ibdev_dbg(ibdev, "create_cq entries %d\n", entries); + + if (attr->flags) + return -EOPNOTSUPP; + + if (entries < 1 || entries > dev->dev_attr.max_cq_depth) { + ibdev_dbg(ibdev, + "cq: requested entries[%u] non-positive or greater than max[%u]\n", + entries, dev->dev_attr.max_cq_depth); + err = -EINVAL; + goto err_out; + } + + if (offsetofend(typeof(cmd), num_sub_cqs) > udata->inlen) { + ibdev_dbg(ibdev, + "Incompatible ABI params, no input udata\n"); + err = -EINVAL; + goto err_out; + } + + if (udata->inlen > sizeof(cmd) && + !ib_is_udata_cleared(udata, sizeof(cmd), + udata->inlen - sizeof(cmd))) { + ibdev_dbg(ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + ibdev_dbg(ibdev, "Cannot copy udata for create_cq\n"); + goto err_out; + } + + if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_58)) { + ibdev_dbg(ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + set_src_addr = !!(cmd.flags & EFA_CREATE_CQ_WITH_SGID); + if ((cmd.cq_entry_size != sizeof(struct efa_io_rx_cdesc_ex)) && + (set_src_addr || cmd.cq_entry_size != sizeof(struct efa_io_rx_cdesc))) { + ibdev_dbg(ibdev, + "Invalid entry size [%u]\n", cmd.cq_entry_size); + err = -EINVAL; + goto err_out; + } + + if (cmd.num_sub_cqs != dev->dev_attr.sub_cqs_per_cq) { + ibdev_dbg(ibdev, + "Invalid number of sub cqs[%u] expected[%u]\n", + cmd.num_sub_cqs, dev->dev_attr.sub_cqs_per_cq); + err = -EINVAL; + goto err_out; + } + + cq->ucontext = ucontext; + cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs); + cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size, + DMA_FROM_DEVICE); + if (!cq->cpu_addr) { + err = -ENOMEM; + goto err_out; + } + + params.uarn = cq->ucontext->uarn; + params.cq_depth = entries; + params.dma_addr = cq->dma_addr; + params.entry_size_in_bytes = cmd.cq_entry_size; + params.num_sub_cqs = 
cmd.num_sub_cqs; + params.set_src_addr = set_src_addr; + if (cmd.flags & EFA_CREATE_CQ_WITH_COMPLETION_CHANNEL) { + cq->eq = efa_vec2eq(dev, attr->comp_vector); + params.eqn = cq->eq->eeq.eqn; + params.interrupt_mode_enabled = true; + } + + err = efa_com_create_cq(&dev->edev, ¶ms, &result); + if (err) + goto err_free_mapped; + + resp.db_off = result.db_off; + resp.cq_idx = result.cq_idx; + cq->cq_idx = result.cq_idx; + cq->ibcq.cqe = result.actual_depth; + WARN_ON_ONCE(entries != result.actual_depth); + + err = cq_mmap_entries_setup(dev, cq, &resp, result.db_valid); + if (err) { + ibdev_dbg(ibdev, "Could not setup cq[%u] mmap entries\n", + cq->cq_idx); + goto err_destroy_cq; + } + + if (cq->eq) { + err = xa_err(xa_store(&dev->cqs_xa, cq->cq_idx, cq, GFP_KERNEL)); + if (err) { + ibdev_dbg(ibdev, "Failed to store cq[%u] in xarray\n", + cq->cq_idx); + goto err_remove_mmap; + } + } + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(ibdev, + "Failed to copy udata for create_cq\n"); + goto err_xa_erase; + } + } + + ibdev_dbg(ibdev, "Created cq[%d], cq depth[%u]. dma[%pad] virt[0x%p]\n", + cq->cq_idx, result.actual_depth, &cq->dma_addr, cq->cpu_addr); + + return 0; + +err_xa_erase: + if (cq->eq) + xa_erase(&dev->cqs_xa, cq->cq_idx); +err_remove_mmap: + efa_cq_user_mmap_entries_remove(cq); +err_destroy_cq: + efa_destroy_cq_idx(dev, cq->cq_idx); +err_free_mapped: + efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, + DMA_FROM_DEVICE); + +err_out: + atomic64_inc(&dev->stats.create_cq_err); + return err; +} + +static int umem_to_page_list(struct efa_dev *dev, + struct ib_umem *umem, + u64 *page_list, + u32 hp_cnt, + u8 hp_shift) +{ + u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); + struct ib_block_iter biter; + unsigned int hp_idx = 0; + + ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n", + hp_cnt, pages_in_hp); + + rdma_umem_for_each_dma_block(umem, &biter, BIT(hp_shift)) + page_list[hp_idx++] = rdma_block_iter_dma_address(&biter); + + return 0; +} + +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) +{ + struct scatterlist *sglist; + struct page *pg; + int i; + + sglist = kmalloc_array(page_cnt, sizeof(*sglist), GFP_KERNEL); + if (!sglist) + return NULL; + sg_init_table(sglist, page_cnt); + for (i = 0; i < page_cnt; i++) { + pg = vmalloc_to_page(buf); + if (!pg) + goto err; + sg_set_page(&sglist[i], pg, PAGE_SIZE, 0); + buf += PAGE_SIZE / sizeof(*buf); + } + return sglist; + +err: + kfree(sglist); + return NULL; +} + +/* + * create a chunk list of physical pages dma addresses from the supplied + * scatter gather list + */ +static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl) +{ + struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list; + int page_cnt = pbl->phys.indirect.pbl_buf_size_in_pages; + struct scatterlist *pages_sgl = pbl->phys.indirect.sgl; + unsigned int chunk_list_size, chunk_idx, payload_idx; + int sg_dma_cnt = pbl->phys.indirect.sg_dma_cnt; + struct efa_com_ctrl_buff_info *ctrl_buf; + u64 *cur_chunk_buf, *prev_chunk_buf; + struct ib_block_iter biter; + dma_addr_t dma_addr; + int i; + + /* allocate a chunk list that consists of 4KB chunks */ + chunk_list_size = DIV_ROUND_UP(page_cnt, EFA_PTRS_PER_CHUNK); + + chunk_list->size = chunk_list_size; + chunk_list->chunks = kcalloc(chunk_list_size, + sizeof(*chunk_list->chunks), + GFP_KERNEL); + if (!chunk_list->chunks) + return -ENOMEM; + + ibdev_dbg(&dev->ibdev, + "chunk_list_size[%u] - 
pages[%u]\n", chunk_list_size, + page_cnt); + + /* allocate chunk buffers: */ + for (i = 0; i < chunk_list_size; i++) { + chunk_list->chunks[i].buf = kzalloc(EFA_CHUNK_SIZE, GFP_KERNEL); + if (!chunk_list->chunks[i].buf) + goto chunk_list_dealloc; + + chunk_list->chunks[i].length = EFA_CHUNK_USED_SIZE; + } + chunk_list->chunks[chunk_list_size - 1].length = + ((page_cnt % EFA_PTRS_PER_CHUNK) * EFA_CHUNK_PAYLOAD_PTR_SIZE) + + EFA_CHUNK_PTR_SIZE; + + /* fill the dma addresses of sg list pages to chunks: */ + chunk_idx = 0; + payload_idx = 0; + cur_chunk_buf = chunk_list->chunks[0].buf; + rdma_for_each_block(pages_sgl, &biter, sg_dma_cnt, + EFA_CHUNK_PAYLOAD_SIZE) { + cur_chunk_buf[payload_idx++] = + rdma_block_iter_dma_address(&biter); + + if (payload_idx == EFA_PTRS_PER_CHUNK) { + chunk_idx++; + cur_chunk_buf = chunk_list->chunks[chunk_idx].buf; + payload_idx = 0; + } + } + + /* map chunks to dma and fill chunks next ptrs */ + for (i = chunk_list_size - 1; i >= 0; i--) { + dma_addr = dma_map_single(&dev->pdev->dev, + chunk_list->chunks[i].buf, + chunk_list->chunks[i].length, + DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, dma_addr)) { + ibdev_err(&dev->ibdev, + "chunk[%u] dma_map_failed\n", i); + goto chunk_list_unmap; + } + + chunk_list->chunks[i].dma_addr = dma_addr; + ibdev_dbg(&dev->ibdev, + "chunk[%u] mapped at [%pad]\n", i, &dma_addr); + + if (!i) + break; + + prev_chunk_buf = chunk_list->chunks[i - 1].buf; + + ctrl_buf = (struct efa_com_ctrl_buff_info *) + &prev_chunk_buf[EFA_PTRS_PER_CHUNK]; + ctrl_buf->length = chunk_list->chunks[i].length; + + efa_com_set_dma_addr(dma_addr, + &ctrl_buf->address.mem_addr_high, + &ctrl_buf->address.mem_addr_low); + } + + return 0; + +chunk_list_unmap: + for (; i < chunk_list_size; i++) { + dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr, + chunk_list->chunks[i].length, DMA_TO_DEVICE); + } +chunk_list_dealloc: + for (i = 0; i < chunk_list_size; i++) + kfree(chunk_list->chunks[i].buf); + + kfree(chunk_list->chunks); + return -ENOMEM; +} + +static void pbl_chunk_list_destroy(struct efa_dev *dev, struct pbl_context *pbl) +{ + struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list; + int i; + + for (i = 0; i < chunk_list->size; i++) { + dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr, + chunk_list->chunks[i].length, DMA_TO_DEVICE); + kfree(chunk_list->chunks[i].buf); + } + + kfree(chunk_list->chunks); +} + +/* initialize pbl continuous mode: map pbl buffer to a dma address. */ +static int pbl_continuous_initialize(struct efa_dev *dev, + struct pbl_context *pbl) +{ + dma_addr_t dma_addr; + + dma_addr = dma_map_single(&dev->pdev->dev, pbl->pbl_buf, + pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, dma_addr)) { + ibdev_err(&dev->ibdev, "Unable to map pbl to DMA address\n"); + return -ENOMEM; + } + + pbl->phys.continuous.dma_addr = dma_addr; + ibdev_dbg(&dev->ibdev, + "pbl continuous - dma_addr = %pad, size[%u]\n", + &dma_addr, pbl->pbl_buf_size_in_bytes); + + return 0; +} + +/* + * initialize pbl indirect mode: + * create a chunk list out of the dma addresses of the physical pages of + * pbl buffer. 
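+ *
+ * The PBL buffer itself is vmalloc()ed (physically non-contiguous), so it is
+ * first described by a scatterlist, DMA-mapped, and then exposed to the
+ * device as a linked list of 4KB chunks built by pbl_chunk_list_create().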
+ */ +static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl) +{ + u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, EFA_CHUNK_PAYLOAD_SIZE); + struct scatterlist *sgl; + int sg_dma_cnt, err; + + BUILD_BUG_ON(EFA_CHUNK_PAYLOAD_SIZE > PAGE_SIZE); + sgl = efa_vmalloc_buf_to_sg(pbl->pbl_buf, size_in_pages); + if (!sgl) + return -ENOMEM; + + sg_dma_cnt = dma_map_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE); + if (!sg_dma_cnt) { + err = -EINVAL; + goto err_map; + } + + pbl->phys.indirect.pbl_buf_size_in_pages = size_in_pages; + pbl->phys.indirect.sgl = sgl; + pbl->phys.indirect.sg_dma_cnt = sg_dma_cnt; + err = pbl_chunk_list_create(dev, pbl); + if (err) { + ibdev_dbg(&dev->ibdev, + "chunk_list creation failed[%d]\n", err); + goto err_chunk; + } + + ibdev_dbg(&dev->ibdev, + "pbl indirect - size[%u], chunks[%u]\n", + pbl->pbl_buf_size_in_bytes, + pbl->phys.indirect.chunk_list.size); + + return 0; + +err_chunk: + dma_unmap_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE); +err_map: + kfree(sgl); + return err; +} + +static void pbl_indirect_terminate(struct efa_dev *dev, struct pbl_context *pbl) +{ + pbl_chunk_list_destroy(dev, pbl); + dma_unmap_sg(&dev->pdev->dev, pbl->phys.indirect.sgl, + pbl->phys.indirect.pbl_buf_size_in_pages, DMA_TO_DEVICE); + kfree(pbl->phys.indirect.sgl); +} + +/* create a page buffer list from a mapped user memory region */ +static int pbl_create(struct efa_dev *dev, + struct pbl_context *pbl, + struct efa_mr *mr, + int hp_cnt, + u8 hp_shift) +{ + int err; + + pbl->pbl_buf_size_in_bytes = hp_cnt * EFA_CHUNK_PAYLOAD_PTR_SIZE; + pbl->pbl_buf = kvzalloc(pbl->pbl_buf_size_in_bytes, GFP_KERNEL); + if (!pbl->pbl_buf) + return -ENOMEM; + + if (is_vmalloc_addr(pbl->pbl_buf)) { + pbl->physically_continuous = 0; + if (mr->p2pmem) + err = efa_p2p_to_page_list(dev, mr->p2pmem, pbl->pbl_buf); + else + err = umem_to_page_list(dev, mr->umem, pbl->pbl_buf, hp_cnt, + hp_shift); + if (err) + goto err_free; + + err = pbl_indirect_initialize(dev, pbl); + if (err) + goto err_free; + } else { + pbl->physically_continuous = 1; + if (mr->p2pmem) + err = efa_p2p_to_page_list(dev, mr->p2pmem, pbl->pbl_buf); + else + err = umem_to_page_list(dev, mr->umem, pbl->pbl_buf, hp_cnt, + hp_shift); + if (err) + goto err_free; + + err = pbl_continuous_initialize(dev, pbl); + if (err) + goto err_free; + } + + ibdev_dbg(&dev->ibdev, + "user_pbl_created: user_pages[%u], continuous[%u]\n", + hp_cnt, pbl->physically_continuous); + + return 0; + +err_free: + kvfree(pbl->pbl_buf); + return err; +} + +static void pbl_destroy(struct efa_dev *dev, struct pbl_context *pbl) +{ + if (pbl->physically_continuous) + dma_unmap_single(&dev->pdev->dev, pbl->phys.continuous.dma_addr, + pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE); + else + pbl_indirect_terminate(dev, pbl); + + kvfree(pbl->pbl_buf); +} + +static int efa_create_inline_pbl(struct efa_dev *dev, struct efa_mr *mr, + struct efa_com_reg_mr_params *params) +{ + int err; + + params->inline_pbl = 1; + if (mr->p2pmem) + err = efa_p2p_to_page_list(dev, mr->p2pmem, + params->pbl.inline_pbl_array); + else + err = umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array, + params->page_num, params->page_shift); + if (err) + return err; + + ibdev_dbg(&dev->ibdev, + "inline_pbl_array - pages[%u]\n", params->page_num); + + return 0; +} + +static int efa_create_pbl(struct efa_dev *dev, + struct pbl_context *pbl, + struct efa_mr *mr, + struct efa_com_reg_mr_params *params) +{ + int err; + + err = pbl_create(dev, pbl, mr, 
params->page_num, + params->page_shift); + if (err) { + ibdev_dbg(&dev->ibdev, "Failed to create pbl[%d]\n", err); + return err; + } + + params->inline_pbl = 0; + params->indirect = !pbl->physically_continuous; + if (pbl->physically_continuous) { + params->pbl.pbl.length = pbl->pbl_buf_size_in_bytes; + + efa_com_set_dma_addr(pbl->phys.continuous.dma_addr, + &params->pbl.pbl.address.mem_addr_high, + &params->pbl.pbl.address.mem_addr_low); + } else { + params->pbl.pbl.length = + pbl->phys.indirect.chunk_list.chunks[0].length; + + efa_com_set_dma_addr(pbl->phys.indirect.chunk_list.chunks[0].dma_addr, + &params->pbl.pbl.address.mem_addr_high, + &params->pbl.pbl.address.mem_addr_low); + } + + return 0; +} + +static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + int supp_access_flags; + struct efa_mr *mr; + + if (udata && udata->inlen && + !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, udata not cleared\n"); + return ERR_PTR(-EINVAL); + } + + supp_access_flags = + IB_ACCESS_LOCAL_WRITE | + (EFA_DEV_CAP(dev, RDMA_READ) ? IB_ACCESS_REMOTE_READ : 0) | + (EFA_DEV_CAP(dev, RDMA_WRITE) ? IB_ACCESS_REMOTE_WRITE : 0); + + access_flags &= ~IB_ACCESS_OPTIONAL; + if (access_flags & ~supp_access_flags) { + ibdev_dbg(&dev->ibdev, + "Unsupported access flags[%#x], supported[%#x]\n", + access_flags, supp_access_flags); + return ERR_PTR(-EOPNOTSUPP); + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + return mr; +} + +static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start, + u64 length, u64 virt_addr, int access_flags) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_com_reg_mr_params params = {}; + struct efa_com_reg_mr_result result = {}; + struct pbl_context pbl; + unsigned int pg_sz; + int inline_size; + int err; + + params.pd = to_epd(ibpd)->pdn; + params.iova = virt_addr; + params.mr_length_in_bytes = length; + params.permissions = access_flags; + + if (mr->p2pmem) { + pg_sz = efa_p2p_get_page_size(dev, mr->p2pmem); + goto skip_umem_pg_sz; + } + + pg_sz = ib_umem_find_best_pgsz(mr->umem, + dev->dev_attr.page_size_cap, + virt_addr); + if (!pg_sz) { + ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n", + dev->dev_attr.page_size_cap); + return -EOPNOTSUPP; + } + +skip_umem_pg_sz: + params.page_shift = order_base_2(pg_sz); + if (mr->p2pmem) + params.page_num = DIV_ROUND_UP(length + + (virt_addr & (pg_sz - 1)), + pg_sz); + else + params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz); + + ibdev_dbg(&dev->ibdev, + "start %#llx length %#llx params.page_shift %u params.page_num %u\n", + start, length, params.page_shift, params.page_num); + + inline_size = ARRAY_SIZE(params.pbl.inline_pbl_array); + if (params.page_num <= inline_size) { + err = efa_create_inline_pbl(dev, mr, &params); + if (err) + return err; + + err = efa_com_register_mr(&dev->edev, &params, &result); + if (err) + return err; + } else { + err = efa_create_pbl(dev, &pbl, mr, &params); + if (err) + return err; + + err = efa_com_register_mr(&dev->edev, &params, &result); + pbl_destroy(dev, &pbl); + + if (err) + return err; + } + + mr->ibmr.lkey = result.l_key; + mr->ibmr.rkey = result.r_key; + mr->ibmr.length = length; + mr->ic_info.recv_ic_id = result.ic_info.recv_ic_id; + mr->ic_info.rdma_read_ic_id = result.ic_info.rdma_read_ic_id; + mr->ic_info.rdma_recv_ic_id = result.ic_info.rdma_recv_ic_id; +
mr->ic_info.recv_ic_id_valid = result.ic_info.recv_ic_id_valid; + mr->ic_info.rdma_read_ic_id_valid = result.ic_info.rdma_read_ic_id_valid; + mr->ic_info.rdma_recv_ic_id_valid = result.ic_info.rdma_recv_ic_id_valid; + if (mr->p2pmem) { + mr->p2pmem->lkey = result.l_key; + mr->p2pmem->needs_dereg = true; + } + ibdev_dbg(&dev->ibdev, "Registered mr[%d]\n", mr->ibmr.lkey); + + return 0; +} + +struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, + u64 length, u64 virt_addr, + int fd, int access_flags, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct ib_umem_dmabuf *umem_dmabuf; + struct efa_mr *mr; + int err; + + mr = efa_alloc_mr(ibpd, access_flags, udata); + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + goto err_out; + } + + umem_dmabuf = ib_umem_dmabuf_get_pinned(ibpd->device, start, length, fd, + access_flags); + if (IS_ERR(umem_dmabuf)) { + err = PTR_ERR(umem_dmabuf); + ibdev_dbg(&dev->ibdev, "Failed to get dmabuf umem[%d]\n", err); + goto err_free; + } + + mr->umem = &umem_dmabuf->umem; + err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags); + if (err) + goto err_release; + + return &mr->ibmr; + +err_release: + ib_umem_release(mr->umem); +err_free: + kfree(mr); +err_out: + atomic64_inc(&dev->stats.reg_mr_err); + return ERR_PTR(err); +} + +struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_mr *mr; + int err; + + mr = efa_alloc_mr(ibpd, access_flags, udata); + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + goto err_out; + } + + mr->umem = ib_umem_get(ibpd->device, start, length, access_flags); + if (IS_ERR(mr->umem)) { + mr->p2pmem = efa_p2p_get(dev, mr, start, length); + if (mr->p2pmem) { + /* Avoid referencing an error-pointer later on */ + mr->umem = NULL; + goto reg_mr; + } + err = PTR_ERR(mr->umem); + ibdev_dbg(&dev->ibdev, + "Failed to pin and map user space memory[%d]\n", err); + goto err_free; + } + +reg_mr: + err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags); + if (err) + goto err_release; + + return &mr->ibmr; + +err_release: + if (mr->p2pmem) + efa_p2p_put(mr->p2pmem->ticket, false); + else + ib_umem_release(mr->umem); +err_free: + kfree(mr); +err_out: + atomic64_inc(&dev->stats.reg_mr_err); + return ERR_PTR(err); +} + +static int UVERBS_HANDLER(EFA_IB_METHOD_MR_QUERY)(struct uverbs_attr_bundle *attrs) +{ + struct ib_mr *ibmr = uverbs_attr_get_obj(attrs, EFA_IB_ATTR_QUERY_MR_HANDLE); + struct efa_mr *mr = to_emr(ibmr); + u16 ic_id_validity = 0; + int ret; + + ret = uverbs_copy_to(attrs, EFA_IB_ATTR_QUERY_MR_RESP_RECV_IC_ID, + &mr->ic_info.recv_ic_id, sizeof(mr->ic_info.recv_ic_id)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, EFA_IB_ATTR_QUERY_MR_RESP_RDMA_READ_IC_ID, + &mr->ic_info.rdma_read_ic_id, sizeof(mr->ic_info.rdma_read_ic_id)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, EFA_IB_ATTR_QUERY_MR_RESP_RDMA_RECV_IC_ID, + &mr->ic_info.rdma_recv_ic_id, sizeof(mr->ic_info.rdma_recv_ic_id)); + if (ret) + return ret; + + if (mr->ic_info.recv_ic_id_valid) + ic_id_validity |= EFA_QUERY_MR_VALIDITY_RECV_IC_ID; + if (mr->ic_info.rdma_read_ic_id_valid) + ic_id_validity |= EFA_QUERY_MR_VALIDITY_RDMA_READ_IC_ID; + if (mr->ic_info.rdma_recv_ic_id_valid) + ic_id_validity |= EFA_QUERY_MR_VALIDITY_RDMA_RECV_IC_ID; + + return uverbs_copy_to(attrs, EFA_IB_ATTR_QUERY_MR_RESP_IC_ID_VALIDITY, + &ic_id_validity, sizeof(ic_id_validity)); +} + +int 
efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibmr->device); + struct efa_com_dereg_mr_params params; + struct efa_mr *mr = to_emr(ibmr); + int err; + + ibdev_dbg(&dev->ibdev, "Deregister mr[%d]\n", ibmr->lkey); + + if (mr->p2pmem) { + err = efa_p2p_put(mr->p2p_ticket, false); + if (err) + return err; + + kfree(mr); + return 0; + } + params.l_key = mr->ibmr.lkey; + err = efa_com_dereg_mr(&dev->edev, ¶ms); + if (err) + return err; + + ib_umem_release(mr->umem); + kfree(mr); + + return 0; +} + +int efa_get_port_immutable(struct ib_device *ibdev, port_t port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) { + ibdev_dbg(ibdev, "Couldn't query port err[%d]\n", err); + return err; + } + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + +static int efa_dealloc_uar(struct efa_dev *dev, u16 uarn) +{ + struct efa_com_dealloc_uar_params params = { + .uarn = uarn, + }; + + return efa_com_dealloc_uar(&dev->edev, ¶ms); +} + +#define EFA_CHECK_USER_COMP(_dev, _comp_mask, _attr, _mask, _attr_str) \ + (_attr_str = (!(_dev)->dev_attr._attr || ((_comp_mask) & (_mask))) ? \ + NULL : #_attr) + +static int efa_user_comp_handshake(const struct ib_ucontext *ibucontext, + const struct efa_ibv_alloc_ucontext_cmd *cmd) +{ + struct efa_dev *dev = to_edev(ibucontext->device); + char *attr_str; + + if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, max_tx_batch, + EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH, attr_str)) + goto err; + + if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, min_sq_depth, + EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR, + attr_str)) + goto err; + + return 0; + +err: + ibdev_dbg(&dev->ibdev, "Userspace handshake failed for %s attribute\n", + attr_str); + return -EOPNOTSUPP; +} + +int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_dev *dev = to_edev(ibucontext->device); + struct efa_ibv_alloc_ucontext_resp resp = {}; + struct efa_ibv_alloc_ucontext_cmd cmd = {}; + struct efa_com_alloc_uar_result result; + int err; + + /* + * it's fine if the driver does not know all request fields, + * we will ack input fields in our response. 
+ */ + + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Cannot copy udata for alloc_ucontext\n"); + goto err_out; + } + + err = efa_user_comp_handshake(ibucontext, &cmd); + if (err) + goto err_out; + + err = efa_com_alloc_uar(&dev->edev, &result); + if (err) + goto err_out; + + ucontext->uarn = result.uarn; + + resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE; + resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH; + resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq; + resp.inline_buf_size = dev->dev_attr.inline_buf_size; + resp.max_llq_size = dev->dev_attr.max_llq_size; + resp.max_tx_batch = dev->dev_attr.max_tx_batch; + resp.min_sq_wr = dev->dev_attr.min_sq_depth; + + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) + goto err_dealloc_uar; + + return 0; + +err_dealloc_uar: + efa_dealloc_uar(dev, result.uarn); +err_out: + atomic64_inc(&dev->stats.alloc_ucontext_err); + return err; +} + +void efa_dealloc_ucontext(struct ib_ucontext *ibucontext) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_dev *dev = to_edev(ibucontext->device); + + efa_dealloc_uar(dev, ucontext->uarn); +} + +void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry) +{ + struct efa_user_mmap_entry *entry = to_emmap(rdma_entry); + + kfree(entry); +} + +static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext, + struct vm_area_struct *vma) +{ + struct rdma_user_mmap_entry *rdma_entry; + struct efa_user_mmap_entry *entry; + unsigned long va; + int err = 0; + u64 pfn; + + rdma_entry = rdma_user_mmap_entry_get(&ucontext->ibucontext, vma); + if (!rdma_entry) { + ibdev_dbg(&dev->ibdev, + "pgoff[%#lx] does not have valid entry\n", + vma->vm_pgoff); + atomic64_inc(&dev->stats.mmap_err); + return -EINVAL; + } + entry = to_emmap(rdma_entry); + + ibdev_dbg(&dev->ibdev, + "Mapping address[%#llx], length[%#zx], mmap_flag[%d]\n", + entry->address, rdma_entry->npages * PAGE_SIZE, + entry->mmap_flag); + + pfn = entry->address >> PAGE_SHIFT; + switch (entry->mmap_flag) { + case EFA_MMAP_IO_NC: + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot), + rdma_entry); + break; + case EFA_MMAP_IO_WC: + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + pgprot_writecombine(vma->vm_page_prot), + rdma_entry); + break; + case EFA_MMAP_DMA_PAGE: + for (va = vma->vm_start; va < vma->vm_end; + va += PAGE_SIZE, pfn++) { + err = vm_insert_page(vma, va, pfn_to_page(pfn)); + if (err) + break; + } + break; + default: + err = -EINVAL; + } + + if (err) { + ibdev_dbg( + &dev->ibdev, + "Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n", + entry->address, rdma_entry->npages * PAGE_SIZE, + entry->mmap_flag, err); + atomic64_inc(&dev->stats.mmap_err); + } + + rdma_user_mmap_entry_put(rdma_entry); + return err; +} + +int efa_mmap(struct ib_ucontext *ibucontext, + struct vm_area_struct *vma) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_dev *dev = to_edev(ibucontext->device); + size_t length = vma->vm_end - vma->vm_start; + + ibdev_dbg(&dev->ibdev, + "start %#lx, end %#lx, length = %#zx, pgoff = %#lx\n", + vma->vm_start, vma->vm_end, length, vma->vm_pgoff); + + return __efa_mmap(dev, ucontext, vma); +} + +static int efa_ah_destroy(struct efa_dev *dev, struct efa_ah *ah) +{ + struct efa_com_destroy_ah_params params = 
{ + .ah = ah->ah, + .pdn = to_epd(ah->ibah.pd)->pdn, + }; + + return efa_com_destroy_ah(&dev->edev, ¶ms); +} + +int efa_create_ah(struct ib_ah *ibah, + struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) +{ + struct rdma_ah_attr *ah_attr = init_attr->ah_attr; + struct efa_dev *dev = to_edev(ibah->device); + struct efa_com_create_ah_params params = {}; + struct efa_ibv_create_ah_resp resp = {}; + struct efa_com_create_ah_result result; + struct efa_ah *ah = to_eah(ibah); + int err; + + if (!(init_attr->flags & RDMA_CREATE_AH_SLEEPABLE)) { + ibdev_dbg(&dev->ibdev, + "Create address handle is not supported in atomic context\n"); + err = -EOPNOTSUPP; + goto err_out; + } + + if (udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n"); + err = -EINVAL; + goto err_out; + } + + memcpy(params.dest_addr, ah_attr->grh.dgid.raw, + sizeof(params.dest_addr)); + params.pdn = to_epd(ibah->pd)->pdn; + err = efa_com_create_ah(&dev->edev, ¶ms, &result); + if (err) + goto err_out; + + memcpy(ah->id, ah_attr->grh.dgid.raw, sizeof(ah->id)); + ah->ah = result.ah; + + resp.efa_address_handle = result.ah; + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Failed to copy udata for create_ah response\n"); + goto err_destroy_ah; + } + } + ibdev_dbg(&dev->ibdev, "Created ah[%d]\n", ah->ah); + + return 0; + +err_destroy_ah: + efa_ah_destroy(dev, ah); +err_out: + atomic64_inc(&dev->stats.create_ah_err); + return err; +} + +int efa_destroy_ah(struct ib_ah *ibah, u32 flags) +{ + struct efa_dev *dev = to_edev(ibah->pd->device); + struct efa_ah *ah = to_eah(ibah); + + ibdev_dbg(&dev->ibdev, "Destroy ah[%d]\n", ah->ah); + + efa_ah_destroy(dev, ah); + return 0; +} + +struct rdma_hw_stats *efa_alloc_hw_port_stats(struct ib_device *ibdev, + port_t port_num) +{ + return rdma_alloc_hw_stats_struct(efa_port_stats_descs, + ARRAY_SIZE(efa_port_stats_descs), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +struct rdma_hw_stats *efa_alloc_hw_device_stats(struct ib_device *ibdev) +{ + return rdma_alloc_hw_stats_struct(efa_device_stats_descs, + ARRAY_SIZE(efa_device_stats_descs), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int efa_fill_device_stats(struct efa_dev *dev, + struct rdma_hw_stats *stats) +{ + struct efa_com_stats_admin *as = &dev->edev.aq.stats; + struct efa_stats *s = &dev->stats; + + stats->value[EFA_SUBMITTED_CMDS] = atomic64_read(&as->submitted_cmd); + stats->value[EFA_COMPLETED_CMDS] = atomic64_read(&as->completed_cmd); + stats->value[EFA_CMDS_ERR] = atomic64_read(&as->cmd_err); + stats->value[EFA_NO_COMPLETION_CMDS] = atomic64_read(&as->no_completion); + + stats->value[EFA_KEEP_ALIVE_RCVD] = atomic64_read(&s->keep_alive_rcvd); + stats->value[EFA_ALLOC_PD_ERR] = atomic64_read(&s->alloc_pd_err); + stats->value[EFA_CREATE_QP_ERR] = atomic64_read(&s->create_qp_err); + stats->value[EFA_CREATE_CQ_ERR] = atomic64_read(&s->create_cq_err); + stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->reg_mr_err); + stats->value[EFA_ALLOC_UCONTEXT_ERR] = + atomic64_read(&s->alloc_ucontext_err); + stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->create_ah_err); + stats->value[EFA_MMAP_ERR] = atomic64_read(&s->mmap_err); + + return ARRAY_SIZE(efa_device_stats_descs); +} + +static int efa_fill_port_stats(struct efa_dev *dev, struct rdma_hw_stats *stats, + port_t port_num) +{ + struct efa_com_get_stats_params params = {}; + union efa_com_get_stats_result result; + struct 
efa_com_rdma_write_stats *rws; + struct efa_com_rdma_read_stats *rrs; + struct efa_com_messages_stats *ms; + struct efa_com_basic_stats *bs; + int err; + + params.scope = EFA_ADMIN_GET_STATS_SCOPE_ALL; + params.type = EFA_ADMIN_GET_STATS_TYPE_BASIC; + + err = efa_com_get_stats(&dev->edev, &params, &result); + if (err) + return err; + + bs = &result.basic_stats; + stats->value[EFA_TX_BYTES] = bs->tx_bytes; + stats->value[EFA_TX_PKTS] = bs->tx_pkts; + stats->value[EFA_RX_BYTES] = bs->rx_bytes; + stats->value[EFA_RX_PKTS] = bs->rx_pkts; + stats->value[EFA_RX_DROPS] = bs->rx_drops; + + params.type = EFA_ADMIN_GET_STATS_TYPE_MESSAGES; + err = efa_com_get_stats(&dev->edev, &params, &result); + if (err) + return err; + + ms = &result.messages_stats; + stats->value[EFA_SEND_BYTES] = ms->send_bytes; + stats->value[EFA_SEND_WRS] = ms->send_wrs; + stats->value[EFA_RECV_BYTES] = ms->recv_bytes; + stats->value[EFA_RECV_WRS] = ms->recv_wrs; + + params.type = EFA_ADMIN_GET_STATS_TYPE_RDMA_READ; + err = efa_com_get_stats(&dev->edev, &params, &result); + if (err) + return err; + + rrs = &result.rdma_read_stats; + stats->value[EFA_RDMA_READ_WRS] = rrs->read_wrs; + stats->value[EFA_RDMA_READ_BYTES] = rrs->read_bytes; + stats->value[EFA_RDMA_READ_WR_ERR] = rrs->read_wr_err; + stats->value[EFA_RDMA_READ_RESP_BYTES] = rrs->read_resp_bytes; + + if (EFA_DEV_CAP(dev, RDMA_WRITE)) { + params.type = EFA_ADMIN_GET_STATS_TYPE_RDMA_WRITE; + err = efa_com_get_stats(&dev->edev, &params, &result); + if (err) + return err; + + rws = &result.rdma_write_stats; + stats->value[EFA_RDMA_WRITE_WRS] = rws->write_wrs; + stats->value[EFA_RDMA_WRITE_BYTES] = rws->write_bytes; + stats->value[EFA_RDMA_WRITE_WR_ERR] = rws->write_wr_err; + stats->value[EFA_RDMA_WRITE_RECV_BYTES] = rws->write_recv_bytes; + } + + return ARRAY_SIZE(efa_port_stats_descs); +} + +int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + port_t port_num, int index) +{ + if (port_num) + return efa_fill_port_stats(to_edev(ibdev), stats, port_num); + else + return efa_fill_device_stats(to_edev(ibdev), stats); +} + +enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev, + port_t port_num) +{ + return IB_LINK_LAYER_UNSPECIFIED; +} + +DECLARE_UVERBS_NAMED_METHOD(EFA_IB_METHOD_MR_QUERY, + UVERBS_ATTR_IDR(EFA_IB_ATTR_QUERY_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(EFA_IB_ATTR_QUERY_MR_RESP_IC_ID_VALIDITY, + UVERBS_ATTR_TYPE(u16), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(EFA_IB_ATTR_QUERY_MR_RESP_RECV_IC_ID, + UVERBS_ATTR_TYPE(u16), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(EFA_IB_ATTR_QUERY_MR_RESP_RDMA_READ_IC_ID, + UVERBS_ATTR_TYPE(u16), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(EFA_IB_ATTR_QUERY_MR_RESP_RDMA_RECV_IC_ID, + UVERBS_ATTR_TYPE(u16), + UA_MANDATORY)); + +ADD_UVERBS_METHODS(efa_mr, + UVERBS_OBJECT_MR, + &UVERBS_METHOD(EFA_IB_METHOD_MR_QUERY)); + +const struct uapi_definition efa_uapi_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_MR, + &efa_mr), + {}, +}; diff --git a/drivers/amazon/net/efa/kcompat.h b/drivers/amazon/net/efa/kcompat.h new file mode 100644 index 0000000000000..c7b520c633fa7 --- /dev/null +++ b/drivers/amazon/net/efa/kcompat.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */ + +#ifndef _KCOMPAT_H_ +#define _KCOMPAT_H_ + +#include + + +#ifndef sizeof_field +#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) +#endif + +typedef u32 port_t; + +#endif /* _KCOMPAT_H_ */ diff --git a/drivers/amazon/net/efa/neuron_p2p.h b/drivers/amazon/net/efa/neuron_p2p.h new file mode 100644 index 0000000000000..a1ce44003463f --- /dev/null +++ b/drivers/amazon/net/efa/neuron_p2p.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef __NEURON_P2P_H__ +#define __NEURON_P2P_H__ + +struct neuron_p2p_page_info { + u64 physical_address; // PA's that map to the VA (page aligned as defined in va_info) + u32 page_count; // page count each page is shift_page_size size +}; + +struct neuron_p2p_va_info { + void *virtual_address; // Virtual address for which the PA's need to be obtained + u64 size; // The actual size of the memory pointed by the virtual_address + u32 shift_page_size; // log2 of the page size + u32 device_index; // Neuron Device index. + u32 entries; // Number of page_info entries + struct neuron_p2p_page_info page_info[]; +}; + +/** Given the virtual address and length returns the physical address + * + * @param[in] virtual_address - Virtual address of device memory + * @param[in] length - Length of the memory + * @param[out] va_info - Set of physical addresses + * @param[in] free_callback - Callback function to be called. This will be called with a lock held. + * @param[in] data - Data to be used for the callback + * + * @return 0 - Success. + */ +int neuron_p2p_register_va(u64 virtual_address, u64 length, struct neuron_p2p_va_info **vainfo, void (*free_callback) (void *data), void *data); + +/** Give the pa, release the pa from being used by third-party device + * + * @param[in] va_info - Set of physical addresses + * + * @return 0 - Success. + */ +int neuron_p2p_unregister_va(struct neuron_p2p_va_info *vainfo); + +#endif diff --git a/drivers/amazon/net/efa/nv-p2p.h b/drivers/amazon/net/efa/nv-p2p.h new file mode 100644 index 0000000000000..93350530a3eb2 --- /dev/null +++ b/drivers/amazon/net/efa/nv-p2p.h @@ -0,0 +1,478 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef _NV_P2P_H_ +#define _NV_P2P_H_ + +/* + * NVIDIA P2P Structure Versioning + * + * For the nvidia_p2p_*_t structures allocated by the NVIDIA driver, it will + * set the version field of the structure according to the definition used by + * the NVIDIA driver. The "major" field of the version is defined as the upper + * 16 bits, and the "minor" field of the version is defined as the lower 16 + * bits. The version field will always be the first 4 bytes of the structure, + * and third-party drivers should check the value of this field in structures + * allocated by the NVIDIA driver to ensure runtime compatibility. + * + * In general, version numbers will be incremented as follows: + * - When a backwards-compatible change is made to the structure layout, the + * minor version for that structure will be incremented. Third-party drivers + * built against an older minor version will continue to work with the newer + * minor version used by the NVIDIA driver, without recompilation. + * - When a breaking change is made to the structure layout, the major version + * will be incremented. Third-party drivers built against an older major + * version require at least recompilation and potentially additional updates + * to use the new API. + */ +#define NVIDIA_P2P_MAJOR_VERSION_MASK 0xffff0000 +#define NVIDIA_P2P_MINOR_VERSION_MASK 0x0000ffff + +#define NVIDIA_P2P_MAJOR_VERSION(v) \ + (((v) & NVIDIA_P2P_MAJOR_VERSION_MASK) >> 16) + +#define NVIDIA_P2P_MINOR_VERSION(v) \ + (((v) & NVIDIA_P2P_MINOR_VERSION_MASK)) + +#define NVIDIA_P2P_MAJOR_VERSION_MATCHES(p, v) \ + (NVIDIA_P2P_MAJOR_VERSION((p)->version) == NVIDIA_P2P_MAJOR_VERSION(v)) + +#define NVIDIA_P2P_VERSION_COMPATIBLE(p, v) \ + (NVIDIA_P2P_MAJOR_VERSION_MATCHES(p, v) && \ + (NVIDIA_P2P_MINOR_VERSION((p)->version) >= (NVIDIA_P2P_MINOR_VERSION(v)))) + +enum { + NVIDIA_P2P_ARCHITECTURE_TESLA = 0, + NVIDIA_P2P_ARCHITECTURE_FERMI, + NVIDIA_P2P_ARCHITECTURE_CURRENT = NVIDIA_P2P_ARCHITECTURE_FERMI +}; + +#define NVIDIA_P2P_PARAMS_VERSION 0x00010001 + +enum { + NVIDIA_P2P_PARAMS_ADDRESS_INDEX_GPU = 0, + NVIDIA_P2P_PARAMS_ADDRESS_INDEX_THIRD_PARTY_DEVICE, + NVIDIA_P2P_PARAMS_ADDRESS_INDEX_MAX = \ + NVIDIA_P2P_PARAMS_ADDRESS_INDEX_THIRD_PARTY_DEVICE +}; + +#define NVIDIA_P2P_GPU_UUID_LEN 16 + +typedef +struct nvidia_p2p_params { + u32 version; + u32 architecture; + union nvidia_p2p_mailbox_addresses { + struct { + u64 wmb_addr; + u64 wmb_data; + u64 rreq_addr; + u64 rcomp_addr; + u64 reserved[2]; + } fermi; + } addresses[NVIDIA_P2P_PARAMS_ADDRESS_INDEX_MAX+1]; +} nvidia_p2p_params_t; + +/* + * Macro for users to detect + * driver support for persistent pages. + */ +#define NVIDIA_P2P_CAP_GET_PAGES_PERSISTENT_API + +/* + * This API is not supported. + */ +int nvidia_p2p_init_mapping(u64 p2p_token, + struct nvidia_p2p_params *params, + void (*destroy_callback)(void *data), + void *data); + +/* + * This API is not supported. 
+ */ +int nvidia_p2p_destroy_mapping(u64 p2p_token); + +enum nvidia_p2p_page_size_type { + NVIDIA_P2P_PAGE_SIZE_4KB = 0, + NVIDIA_P2P_PAGE_SIZE_64KB, + NVIDIA_P2P_PAGE_SIZE_128KB, + NVIDIA_P2P_PAGE_SIZE_COUNT +}; + +typedef +struct nvidia_p2p_page { + u64 physical_address; + union nvidia_p2p_request_registers { + struct { + u32 wreqmb_h; + u32 rreqmb_h; + u32 rreqmb_0; + u32 reserved[3]; + } fermi; + } registers; +} nvidia_p2p_page_t; + +#define NVIDIA_P2P_PAGE_TABLE_VERSION 0x00010002 + +#define NVIDIA_P2P_PAGE_TABLE_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_PAGE_TABLE_VERSION) + +typedef +struct nvidia_p2p_page_table { + u32 version; + u32 page_size; /* enum nvidia_p2p_page_size_type */ + struct nvidia_p2p_page **pages; + u32 entries; + u8 *gpu_uuid; +} nvidia_p2p_page_table_t; + +/* + * @brief + * Make the pages underlying a range of GPU virtual memory + * accessible to a third-party device. + * + * This API only supports pinned, GPU-resident memory, such as that provided + * by cudaMalloc(). + * + * This API may sleep. + * + * @param[in] p2p_token + * A token that uniquely identifies the P2P mapping. + * @param[in] va_space + * A GPU virtual address space qualifier. + * @param[in] virtual_address + * The start address in the specified virtual address space. + * Address must be aligned to the 64KB boundary. + * @param[in] length + * The length of the requested P2P mapping. + * Length must be a multiple of 64KB. + * @param[out] page_table + * A pointer to an array of structures with P2P PTEs. + * @param[in] free_callback + * A pointer to the function to be invoked when the pages + * underlying the virtual address range are freed + * implicitly. + * @param[in] data + * A non-NULL opaque pointer to private data to be passed to the + * callback function. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -ENOTSUPP if the requested operation is not supported. + * -ENOMEM if the driver failed to allocate memory or if + * insufficient resources were available to complete the operation. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_get_pages( u64 p2p_token, u32 va_space, + u64 virtual_address, u64 length, + struct nvidia_p2p_page_table **page_table, + void (*free_callback)(void *data), void *data); + +/* + * @brief + * Pin and make the pages underlying a range of GPU virtual memory + * accessible to a third-party device. The pages will persist until + * explicitly freed by nvidia_p2p_put_pages_persistent(). + * + * Persistent GPU memory mappings are not supported on PowerPC, + * MIG-enabled devices and vGPU. + * + * This API only supports pinned, GPU-resident memory, such as that provided + * by cudaMalloc(). + * + * This API may sleep. + * + * @param[in] virtual_address + * The start address in the specified virtual address space. + * Address must be aligned to the 64KB boundary. + * @param[in] length + * The length of the requested P2P mapping. + * Length must be a multiple of 64KB. + * @param[out] page_table + * A pointer to an array of structures with P2P PTEs. + * @param[in] flags + * Must be set to zero for now. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -ENOTSUPP if the requested operation is not supported. + * -ENOMEM if the driver failed to allocate memory or if + * insufficient resources were available to complete the operation. + * -EIO if an unknown error occurred. 
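+ *
+ * Illustrative call sequence (sketch only; error handling omitted, and
+ * "addr", "len" and "pt" are placeholder names; addr and len must satisfy
+ * the 64KB alignment rules above):
+ *
+ *   struct nvidia_p2p_page_table *pt;
+ *
+ *   if (nvidia_p2p_get_pages_persistent(addr, len, &pt, 0) != 0)
+ *           return;
+ *   if (NVIDIA_P2P_PAGE_TABLE_VERSION_COMPATIBLE(pt))
+ *           ...access pt->pages[0] .. pt->pages[pt->entries - 1]...
+ *   nvidia_p2p_put_pages_persistent(addr, pt, 0);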
+ */ +int nvidia_p2p_get_pages_persistent(u64 virtual_address, + u64 length, + struct nvidia_p2p_page_table **page_table, + u32 flags); + +#define NVIDIA_P2P_DMA_MAPPING_VERSION 0x00020003 + +#define NVIDIA_P2P_DMA_MAPPING_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_DMA_MAPPING_VERSION) + +struct pci_dev; + +typedef +struct nvidia_p2p_dma_mapping { + u32 version; + enum nvidia_p2p_page_size_type page_size_type; + u32 entries; + u64 *dma_addresses; + void *private; + struct pci_dev *pci_dev; +} nvidia_p2p_dma_mapping_t; + +/* + * @brief + * Make the physical pages retrieved using nvidia_p2p_get_pages accessible to + * a third-party device. + * + * @param[in] peer + * The struct pci_dev * of the peer device that needs to DMA to/from the + * mapping. + * @param[in] page_table + * The page table outlining the physical pages underlying the mapping, as + * retrieved with nvidia_p2p_get_pages(). + * @param[out] dma_mapping + * The DMA mapping containing the DMA addresses to use on the third-party + * device. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -ENOTSUPP if the requested operation is not supported. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_dma_map_pages(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping **dma_mapping); + +/* + * @brief + * Unmap the physical pages previously mapped to the third-party device by + * nvidia_p2p_dma_map_pages(). + * + * @param[in] peer + * The struct pci_dev * of the peer device that the DMA mapping belongs to. + * @param[in] page_table + * The page table backing the DMA mapping to be unmapped. + * @param[in] dma_mapping + * The DMA mapping containing the DMA addresses used by the third-party + * device, as retrieved with nvidia_p2p_dma_map_pages(). After this call + * returns, neither this struct nor the addresses contained within will be + * valid for use by the third-party device. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_dma_unmap_pages(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping *dma_mapping); + +/* + * @brief + * Release a set of pages previously made accessible to + * a third-party device. + * + * This API may sleep. + * + * @param[in] p2p_token + * A token that uniquely identifies the P2P mapping. + * @param[in] va_space + * A GPU virtual address space qualifier. + * @param[in] virtual_address + * The start address in the specified virtual address space. + * @param[in] page_table + * A pointer to the array of structures with P2P PTEs. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_put_pages(u64 p2p_token, + u32 va_space, u64 virtual_address, + struct nvidia_p2p_page_table *page_table); + +/* + * @brief + * Release a set of persistent pages previously made accessible to + * a third-party device. + * + * This API may sleep. + * + * @param[in] virtual_address + * The start address in the specified virtual address space. + * @param[in] page_table + * A pointer to the array of structures with P2P PTEs. + * @param[in] flags + * Must be set to zero for now. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -EIO if an unknown error occurred. 
+ */ +int nvidia_p2p_put_pages_persistent(u64 virtual_address, + struct nvidia_p2p_page_table *page_table, + u32 flags); + +/* + * @brief + * Free a third-party P2P page table. (This function is a no-op.) + * + * @param[in] page_table + * A pointer to the array of structures with P2P PTEs. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + */ +int nvidia_p2p_free_page_table(struct nvidia_p2p_page_table *page_table); + +/* + * @brief + * Free a third-party P2P DMA mapping. (This function is a no-op.) + * + * @param[in] dma_mapping + * A pointer to the DMA mapping structure. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + */ +int nvidia_p2p_free_dma_mapping(struct nvidia_p2p_dma_mapping *dma_mapping); + +#define NVIDIA_P2P_RSYNC_DRIVER_VERSION 0x00010001 + +#define NVIDIA_P2P_RSYNC_DRIVER_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_RSYNC_DRIVER_VERSION) + +typedef +struct nvidia_p2p_rsync_driver { + u32 version; + int (*get_relaxed_ordering_mode)(int *mode, void *data); + void (*put_relaxed_ordering_mode)(int mode, void *data); + void (*wait_for_rsync)(struct pci_dev *gpu, void *data); +} nvidia_p2p_rsync_driver_t; + +/* + * @brief + * Registers the rsync driver. + * + * @param[in] driver + * A pointer to the rsync driver structure. The NVIDIA driver would use, + * + * get_relaxed_ordering_mode to obtain a reference to the current relaxed + * ordering mode (treated as a boolean) from the rsync driver. + * + * put_relaxed_ordering_mode to release a reference to the current relaxed + * ordering mode back to the rsync driver. The NVIDIA driver will call this + * function once for each successful call to get_relaxed_ordering_mode, and + * the relaxed ordering mode must not change until the last reference is + * released. + * + * wait_for_rsync to call into the rsync module to issue RSYNC. This callback + * can't sleep or re-schedule as it may arrive under spinlocks. + * @param[in] data + * A pointer to the rsync driver's private data. + * + * @Returns + * 0 upon successful completion. + * -EINVAL parameters are incorrect. + * -EBUSY if a module is already registered or GPU devices are in use. + */ +int nvidia_p2p_register_rsync_driver(nvidia_p2p_rsync_driver_t *driver, + void *data); + +/* + * @brief + * Unregisters the rsync driver. + * + * @param[in] driver + * A pointer to the rsync driver structure. + * @param[in] data + * A pointer to the rsync driver's private data. + */ +void nvidia_p2p_unregister_rsync_driver(nvidia_p2p_rsync_driver_t *driver, + void *data); + +#define NVIDIA_P2P_RSYNC_REG_INFO_VERSION 0x00020001 + +#define NVIDIA_P2P_RSYNC_REG_INFO_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_RSYNC_REG_INFO_VERSION) + +typedef struct nvidia_p2p_rsync_reg { + void *ptr; + size_t size; + struct pci_dev *ibmnpu; + struct pci_dev *gpu; + u32 cluster_id; + u32 socket_id; +} nvidia_p2p_rsync_reg_t; + +typedef struct nvidia_p2p_rsync_reg_info { + u32 version; + nvidia_p2p_rsync_reg_t *regs; + size_t entries; +} nvidia_p2p_rsync_reg_info_t; + +/* + * @brief + * Gets rsync (GEN-ID) register information associated with the supported + * NPUs. + * + * The caller would use the returned information {GPU device, NPU device, + * socket-id, cluster-id} to pick the optimal generation registers to issue + * RSYNC (NVLink HW flush). 
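+ *
+ * Illustrative usage (sketch only; error handling omitted and "info" is a
+ * placeholder name):
+ *
+ *   nvidia_p2p_rsync_reg_info_t *info;
+ *
+ *   if (nvidia_p2p_get_rsync_registers(&info) == 0) {
+ *           ...pick an entry from info->regs[0] .. info->regs[info->entries - 1]...
+ *           nvidia_p2p_put_rsync_registers(info);
+ *   }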
+ * + * The interface allocates structures to return the information, hence + * nvidia_p2p_put_rsync_registers() must be called to free the structures. + * + * Note, cluster-id is hardcoded to zero as early system configurations would + * only support cluster mode i.e. all devices would share the same cluster-id + * (0). In the future, appropriate kernel support would be needed to query + * cluster-ids. + * + * @param[out] reg_info + * A pointer to the rsync reg info structure. + * + * @Returns + * 0 Upon successful completion. Otherwise, returns negative value. + */ +int nvidia_p2p_get_rsync_registers(nvidia_p2p_rsync_reg_info_t **reg_info); + +/* + * @brief + * Frees the structures allocated by nvidia_p2p_get_rsync_registers(). + * + * @param[in] reg_info + * A pointer to the rsync reg info structure. + */ +void nvidia_p2p_put_rsync_registers(nvidia_p2p_rsync_reg_info_t *reg_info); + +#endif /* _NV_P2P_H_ */ diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile new file mode 100644 index 0000000000000..e60b229b694bd --- /dev/null +++ b/drivers/amazon/net/ena/Makefile @@ -0,0 +1,20 @@ +# +# Makefile for the Elastic Network Adapter (ENA) device drivers. +# ENA Source is: https://github.com/amzn/amzn-drivers. +# Current ENA source is based on ena_linux_2.12.0 tag. +# + +obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena.o + +ena-y := ena_netdev.o ena_ethtool.o ena_lpc.o ena_phc.o ena_xdp.o dim.o \ + net_dim.o ena_com.o ena_eth_com.o + +ena-$(CONFIG_SYSFS) += ena_sysfs.o + +ifdef TEST_AF_XDP + ccflags-y += -DENA_TEST_AF_XDP +endif + +ccflags-y += -DENA_PHC_INCLUDE + +ccflags-y += -include $(srctree)/drivers/amazon/net/ena/config.h diff --git a/drivers/amazon/net/ena/config.h b/drivers/amazon/net/ena/config.h new file mode 100644 index 0000000000000..994cb09b6b989 --- /dev/null +++ b/drivers/amazon/net/ena/config.h @@ -0,0 +1,7 @@ +#ifndef _ENA_CONFIG_H_ +#define _ENA_CONFIG_H_ +#define ENA_HAVE_PCI_DEV_ID 1 +#define ENA_HAVE_XDP_DO_FLUSH 1 +#define ENA_HAVE_CPUMASK_LOCAL_SPREAD 1 +#define ENA_HAVE_UPDATE_AFFINITY_HINT 1 +#endif /* _ENA_CONFIG_H_ */ diff --git a/drivers/amazon/net/ena/dim.c b/drivers/amazon/net/ena/dim.c new file mode 100644 index 0000000000000..1b200be4b3709 --- /dev/null +++ b/drivers/amazon/net/ena/dim.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. 
+ */ + +#include "dim.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) + +bool dim_on_top(struct dim *dim) +{ + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + return true; + case DIM_GOING_RIGHT: + return (dim->steps_left > 1) && (dim->steps_right == 1); + default: /* DIM_GOING_LEFT */ + return (dim->steps_right > 1) && (dim->steps_left == 1); + } +} + +void dim_turn(struct dim *dim) +{ + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + break; + case DIM_GOING_RIGHT: + dim->tune_state = DIM_GOING_LEFT; + dim->steps_left = 0; + break; + case DIM_GOING_LEFT: + dim->tune_state = DIM_GOING_RIGHT; + dim->steps_right = 0; + break; + } +} + +void dim_park_on_top(struct dim *dim) +{ + dim->steps_right = 0; + dim->steps_left = 0; + dim->tired = 0; + dim->tune_state = DIM_PARKING_ON_TOP; +} + +void dim_park_tired(struct dim *dim) +{ + dim->steps_right = 0; + dim->steps_left = 0; + dim->tune_state = DIM_PARKING_TIRED; +} + +void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, + struct dim_stats *curr_stats) +{ + /* u32 holds up to 71 minutes, should be enough */ + u32 delta_us = ktime_us_delta(end->time, start->time); + u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr); + u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr, + start->byte_ctr); + u32 ncomps = BIT_GAP(BITS_PER_TYPE(u32), end->comp_ctr, + start->comp_ctr); + + if (!delta_us) + return; + + curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us); + curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us); + curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC, + delta_us); + curr_stats->cpms = DIV_ROUND_UP(ncomps * USEC_PER_MSEC, delta_us); + if (curr_stats->epms != 0) + curr_stats->cpe_ratio = DIV_ROUND_DOWN_ULL( + curr_stats->cpms * 100, curr_stats->epms); + else + curr_stats->cpe_ratio = 0; + +} + +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */ diff --git a/drivers/amazon/net/ena/dim.h b/drivers/amazon/net/ena/dim.h new file mode 100644 index 0000000000000..633c2473e73ad --- /dev/null +++ b/drivers/amazon/net/ena/dim.h @@ -0,0 +1,338 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef DIM_H +#define DIM_H + +#include +#include "kcompat.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) + +/** + * Number of events between DIM iterations. + * Causes a moderation of the algorithm run. + */ +#define DIM_NEVENTS 64 + +/** + * Is a difference between values justifies taking an action. + * We consider 10% difference as significant. + */ +#define IS_SIGNIFICANT_DIFF(val, ref) \ + (((100UL * abs((val) - (ref))) / (ref)) > 10) + +/** + * Calculate the gap between two values. + * Take wrap-around and variable size into consideration. + */ +#define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) \ + & (BIT_ULL(bits) - 1)) + +/** + * Structure for CQ moderation values. + * Used for communications between DIM and its consumer. + * + * @usec: CQ timer suggestion (by DIM) + * @pkts: CQ packet counter suggestion (by DIM) + * @cq_period_mode: CQ priod count mode (from CQE/EQE) + */ +struct dim_cq_moder { + u16 usec; + u16 pkts; + u16 comps; + u8 cq_period_mode; +}; + +/** + * Structure for DIM sample data. + * Used for communications between DIM and its consumer. 
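The pkt_ctr/byte_ctr/comp_ctr counters fed into dim_calc_stats() above are free-running and may wrap between two samples, which is why the deltas go through BIT_GAP() instead of plain subtraction. A standalone userspace demo of that modular arithmetic (illustrative only, not driver code):

#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(nr)	(1ULL << (nr))
#define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) \
				   & (BIT_ULL(bits) - 1))

int main(void)
{
	uint64_t start = 0xFFFFFFF0u;	/* 32-bit counter just before wrapping */
	uint64_t end = 0x00000010u;	/* same counter after it wrapped */

	/* Plain (end - start) would underflow; BIT_GAP() yields the true
	 * modulo-2^32 distance, 0x20 here.
	 */
	printf("gap = %llu\n", (unsigned long long)BIT_GAP(32, end, start));
	return 0;
}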
+ * + * @time: Sample timestamp + * @pkt_ctr: Number of packets + * @byte_ctr: Number of bytes + * @event_ctr: Number of events + */ +struct dim_sample { + ktime_t time; + u32 pkt_ctr; + u32 byte_ctr; + u16 event_ctr; + u32 comp_ctr; +}; + +/** + * Structure for DIM stats. + * Used for holding current measured rates. + * + * @ppms: Packets per msec + * @bpms: Bytes per msec + * @epms: Events per msec + */ +struct dim_stats { + int ppms; /* packets per msec */ + int bpms; /* bytes per msec */ + int epms; /* events per msec */ + int cpms; /* completions per msec */ + int cpe_ratio; /* ratio of completions to events */ +}; + +/** + * Main structure for dynamic interrupt moderation (DIM). + * Used for holding all information about a specific DIM instance. + * + * @state: Algorithm state (see below) + * @prev_stats: Measured rates from previous iteration (for comparison) + * @start_sample: Sampled data at start of current iteration + * @work: Work to perform on action required + * @priv: A pointer to the struct that points to dim + * @profile_ix: Current moderation profile + * @mode: CQ period count mode + * @tune_state: Algorithm tuning state (see below) + * @steps_right: Number of steps taken towards higher moderation + * @steps_left: Number of steps taken towards lower moderation + * @tired: Parking depth counter + */ +struct dim { + u8 state; + struct dim_stats prev_stats; + struct dim_sample start_sample; + struct dim_sample measuring_sample; + struct work_struct work; + void *priv; + u8 profile_ix; + u8 mode; + u8 tune_state; + u8 steps_right; + u8 steps_left; + u8 tired; +}; + +/** + * enum dim_cq_period_mode + * + * These are the modes for CQ period count. + * + * @DIM_CQ_PERIOD_MODE_START_FROM_EQE: Start counting from EQE + * @DIM_CQ_PERIOD_MODE_START_FROM_CQE: Start counting from CQE (implies timer reset) + * @DIM_CQ_PERIOD_NUM_MODES: Number of modes + */ +enum { + DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0, + DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1, + DIM_CQ_PERIOD_NUM_MODES +}; + +/** + * enum dim_state + * + * These are the DIM algorithm states. + * These will determine if the algorithm is in a valid state to start an iteration. + * + * @DIM_START_MEASURE: This is the first iteration (also after applying a new profile) + * @DIM_MEASURE_IN_PROGRESS: Algorithm is already in progress - check if + * need to perform an action + * @DIM_APPLY_NEW_PROFILE: DIM consumer is currently applying a profile - no need to measure + */ +enum { + DIM_START_MEASURE, + DIM_MEASURE_IN_PROGRESS, + DIM_APPLY_NEW_PROFILE, +}; + +/** + * enum dim_tune_state + * + * These are the DIM algorithm tune states. + * These will determine which action the algorithm should perform. + * + * @DIM_PARKING_ON_TOP: Algorithm found a local top point - exit on significant difference + * @DIM_PARKING_TIRED: Algorithm found a deep top point - don't exit if tired > 0 + * @DIM_GOING_RIGHT: Algorithm is currently trying higher moderation levels + * @DIM_GOING_LEFT: Algorithm is currently trying lower moderation levels + */ +enum { + DIM_PARKING_ON_TOP, + DIM_PARKING_TIRED, + DIM_GOING_RIGHT, + DIM_GOING_LEFT, +}; + +/** + * enum dim_stats_state + * + * These are the DIM algorithm statistics states. + * These will determine the verdict of current iteration. 
+ * + * @DIM_STATS_WORSE: Current iteration shows worse performance than before + * @DIM_STATS_WORSE: Current iteration shows same performance than before + * @DIM_STATS_WORSE: Current iteration shows better performance than before + */ +enum { + DIM_STATS_WORSE, + DIM_STATS_SAME, + DIM_STATS_BETTER, +}; + +/** + * enum dim_step_result + * + * These are the DIM algorithm step results. + * These describe the result of a step. + * + * @DIM_STEPPED: Performed a regular step + * @DIM_TOO_TIRED: Same kind of step was done multiple times - should go to + * tired parking + * @DIM_ON_EDGE: Stepped to the most left/right profile + */ +enum { + DIM_STEPPED, + DIM_TOO_TIRED, + DIM_ON_EDGE, +}; + +/** + * dim_on_top - check if current state is a good place to stop (top location) + * @dim: DIM context + * + * Check if current profile is a good place to park at. + * This will result in reducing the DIM checks frequency as we assume we + * shouldn't probably change profiles, unless traffic pattern wasn't changed. + */ +bool dim_on_top(struct dim *dim); + +/** + * dim_turn - change profile alterning direction + * @dim: DIM context + * + * Go left if we were going right and vice-versa. + * Do nothing if currently parking. + */ +void dim_turn(struct dim *dim); + +/** + * dim_park_on_top - enter a parking state on a top location + * @dim: DIM context + * + * Enter parking state. + * Clear all movement history. + */ +void dim_park_on_top(struct dim *dim); + +/** + * dim_park_tired - enter a tired parking state + * @dim: DIM context + * + * Enter parking state. + * Clear all movement history and cause DIM checks frequency to reduce. + */ +void dim_park_tired(struct dim *dim); + +/** + * dim_calc_stats - calculate the difference between two samples + * @start: start sample + * @end: end sample + * @curr_stats: delta between samples + * + * Calculate the delta between two samples (in data rates). + * Takes into consideration counter wrap-around. 
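To make the relationship between these pieces concrete, here is a hedged sketch of how an iteration verdict could be derived from two dim_stats snapshots using IS_SIGNIFICANT_DIFF(). The real decision logic lives inside net_dim()/rdma_dim() and is more elaborate; example_compare_stats() is illustrative only.

static int example_compare_stats(const struct dim_stats *curr,
				 const struct dim_stats *prev)
{
	/* Avoid dividing by zero inside IS_SIGNIFICANT_DIFF(). */
	if (!prev->bpms || !prev->ppms)
		return DIM_STATS_SAME;

	/* Byte rate is the primary signal... */
	if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms))
		return curr->bpms > prev->bpms ? DIM_STATS_BETTER : DIM_STATS_WORSE;

	/* ...with packet rate as the tie-breaker. */
	if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms))
		return curr->ppms > prev->ppms ? DIM_STATS_BETTER : DIM_STATS_WORSE;

	return DIM_STATS_SAME;
}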
+ */ +void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, + struct dim_stats *curr_stats); + +/** + * dim_update_sample - set a sample's fields with give values + * @event_ctr: number of events to set + * @packets: number of packets to set + * @bytes: number of bytes to set + * @s: DIM sample + */ +static inline void +dim_update_sample(u16 event_ctr, u64 packets, u64 bytes, struct dim_sample *s) +{ + s->time = ktime_get(); + s->pkt_ctr = packets; + s->byte_ctr = bytes; + s->event_ctr = event_ctr; +} + +/** + * dim_update_sample_with_comps - set a sample's fields with given + * values including the completion parameter + * @event_ctr: number of events to set + * @packets: number of packets to set + * @bytes: number of bytes to set + * @comps: number of completions to set + * @s: DIM sample + */ +static inline void +dim_update_sample_with_comps(u16 event_ctr, u64 packets, u64 bytes, u64 comps, + struct dim_sample *s) +{ + dim_update_sample(event_ctr, packets, bytes, s); + s->comp_ctr = comps; +} + +/* Net DIM */ + +/** + * net_dim_get_rx_moderation - provide a CQ moderation object for the given RX profile + * @cq_period_mode: CQ period mode + * @ix: Profile index + */ +struct dim_cq_moder net_dim_get_rx_moderation(u8 cq_period_mode, int ix); + +/** + * net_dim_get_def_rx_moderation - provide the default RX moderation + * @cq_period_mode: CQ period mode + */ +struct dim_cq_moder net_dim_get_def_rx_moderation(u8 cq_period_mode); + +/** + * net_dim_get_tx_moderation - provide a CQ moderation object for the given TX profile + * @cq_period_mode: CQ period mode + * @ix: Profile index + */ +struct dim_cq_moder net_dim_get_tx_moderation(u8 cq_period_mode, int ix); + +/** + * net_dim_get_def_tx_moderation - provide the default TX moderation + * @cq_period_mode: CQ period mode + */ +struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode); + +/** + * net_dim - main DIM algorithm entry point + * @dim: DIM instance information + * @end_sample: Current data measurement + * + * Called by the consumer. + * This is the main logic of the algorithm, where data is processed in order to decide on next + * required action. + */ +void net_dim(struct dim *dim, struct dim_sample end_sample); + +/* RDMA DIM */ + +/* + * RDMA DIM profile: + * profile size must be of RDMA_DIM_PARAMS_NUM_PROFILES. + */ +#define RDMA_DIM_PARAMS_NUM_PROFILES 9 +#define RDMA_DIM_START_PROFILE 0 + +/** + * rdma_dim - Runs the adaptive moderation. + * @dim: The moderation struct. + * @completions: The number of completions collected in this round. + * + * Each call to rdma_dim takes the latest amount of completions that + * have been collected and counts them as a new event. + * Once enough events have been collected the algorithm decides a new + * moderation level. + */ +void rdma_dim(struct dim *dim, u64 completions); + +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */ + +#endif /* DIM_H */ diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h new file mode 100644 index 0000000000000..db52cae4b9ded --- /dev/null +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -0,0 +1,1403 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
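With the dim.h interface now complete, a brief consumer-side sketch may help before the ENA admin definitions continue: a driver typically keeps one struct dim per queue, feeds it samples from the completion path via dim_update_sample()/net_dim(), and applies the suggested profile from the work callback. The my_ring layout and helper names are hypothetical, and INIT_WORK() on dim.work plus setting dim.mode at queue setup are assumed to happen elsewhere.

struct my_ring {			/* hypothetical per-queue context */
	struct dim dim;
	u64 rx_packets;
	u64 rx_bytes;
	u16 irq_events;
	u16 intr_moder_usec;
	u16 intr_moder_pkts;
};

static void my_rx_poll_done(struct my_ring *ring)
{
	struct dim_sample sample;

	dim_update_sample(ring->irq_events, ring->rx_packets,
			  ring->rx_bytes, &sample);
	net_dim(&ring->dim, sample);	/* may schedule ring->dim.work */
}

static void my_dim_work(struct work_struct *work)
{
	struct dim *dim = container_of(work, struct dim, work);
	struct my_ring *ring = container_of(dim, struct my_ring, dim);
	struct dim_cq_moder moder =
		net_dim_get_rx_moderation(dim->mode, dim->profile_ix);

	/* Device-specific step: program moder.usec / moder.pkts into the
	 * queue's interrupt moderation registers (only stored here).
	 */
	ring->intr_moder_usec = moder.usec;
	ring->intr_moder_pkts = moder.pkts;

	dim->state = DIM_START_MEASURE;
}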
+ */ +#ifndef _ENA_ADMIN_H_ +#define _ENA_ADMIN_H_ + +#define ENA_ADMIN_EXTRA_PROPERTIES_STRING_LEN 32 +#define ENA_ADMIN_EXTRA_PROPERTIES_COUNT 32 + +#define ENA_ADMIN_RSS_KEY_PARTS 10 + +#define ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK 0x3F +#define ENA_ADMIN_CUSTOMER_METRICS_MIN_SUPPORT_MASK 0x1F + + /* customer metrics - in correlation with + * ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK + */ +enum ena_admin_customer_metrics_id { + ENA_ADMIN_BW_IN_ALLOWANCE_EXCEEDED = 0, + ENA_ADMIN_BW_OUT_ALLOWANCE_EXCEEDED = 1, + ENA_ADMIN_PPS_ALLOWANCE_EXCEEDED = 2, + ENA_ADMIN_CONNTRACK_ALLOWANCE_EXCEEDED = 3, + ENA_ADMIN_LINKLOCAL_ALLOWANCE_EXCEEDED = 4, + ENA_ADMIN_CONNTRACK_ALLOWANCE_AVAILABLE = 5, +}; + +enum ena_admin_aq_opcode { + ENA_ADMIN_CREATE_SQ = 1, + ENA_ADMIN_DESTROY_SQ = 2, + ENA_ADMIN_CREATE_CQ = 3, + ENA_ADMIN_DESTROY_CQ = 4, + ENA_ADMIN_GET_FEATURE = 8, + ENA_ADMIN_SET_FEATURE = 9, + ENA_ADMIN_GET_STATS = 11, +}; + +enum ena_admin_aq_completion_status { + ENA_ADMIN_SUCCESS = 0, + ENA_ADMIN_RESOURCE_ALLOCATION_FAILURE = 1, + ENA_ADMIN_BAD_OPCODE = 2, + ENA_ADMIN_UNSUPPORTED_OPCODE = 3, + ENA_ADMIN_MALFORMED_REQUEST = 4, + /* Additional status is provided in ACQ entry extended_status */ + ENA_ADMIN_ILLEGAL_PARAMETER = 5, + ENA_ADMIN_UNKNOWN_ERROR = 6, + ENA_ADMIN_RESOURCE_BUSY = 7, +}; + +/* subcommands for the set/get feature admin commands */ +enum ena_admin_aq_feature_id { + ENA_ADMIN_DEVICE_ATTRIBUTES = 1, + ENA_ADMIN_MAX_QUEUES_NUM = 2, + ENA_ADMIN_HW_HINTS = 3, + ENA_ADMIN_LLQ = 4, + ENA_ADMIN_EXTRA_PROPERTIES_STRINGS = 5, + ENA_ADMIN_EXTRA_PROPERTIES_FLAGS = 6, + ENA_ADMIN_MAX_QUEUES_EXT = 7, + ENA_ADMIN_RSS_HASH_FUNCTION = 10, + ENA_ADMIN_STATELESS_OFFLOAD_CONFIG = 11, + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG = 12, + ENA_ADMIN_MTU = 14, + ENA_ADMIN_RSS_HASH_INPUT = 18, + ENA_ADMIN_INTERRUPT_MODERATION = 20, + ENA_ADMIN_AENQ_CONFIG = 26, + ENA_ADMIN_LINK_CONFIG = 27, + ENA_ADMIN_HOST_ATTR_CONFIG = 28, + ENA_ADMIN_PHC_CONFIG = 29, + ENA_ADMIN_FEATURES_OPCODE_NUM = 32, +}; + +/* feature version for the set/get ENA_ADMIN_LLQ feature admin commands */ +enum ena_admin_llq_feature_version { + /* legacy base version in older drivers */ + ENA_ADMIN_LLQ_FEATURE_VERSION_0_LEGACY = 0, + /* support entry_size recommendation by device */ + ENA_ADMIN_LLQ_FEATURE_VERSION_1 = 1, +}; + +/* device capabilities */ +enum ena_admin_aq_caps_id { + ENA_ADMIN_ENI_STATS = 0, + /* ENA SRD customer metrics */ + ENA_ADMIN_ENA_SRD_INFO = 1, + ENA_ADMIN_CUSTOMER_METRICS = 2, + ENA_ADMIN_EXTENDED_RESET_REASONS = 3, + ENA_ADMIN_CDESC_MBZ = 4, +}; + +enum ena_admin_placement_policy_type { + /* descriptors and headers are in host memory */ + ENA_ADMIN_PLACEMENT_POLICY_HOST = 1, + /* descriptors and headers are in device memory (a.k.a Low Latency + * Queue) + */ + ENA_ADMIN_PLACEMENT_POLICY_DEV = 3, +}; + +enum ena_admin_link_types { + ENA_ADMIN_LINK_SPEED_1G = 0x1, + ENA_ADMIN_LINK_SPEED_2_HALF_G = 0x2, + ENA_ADMIN_LINK_SPEED_5G = 0x4, + ENA_ADMIN_LINK_SPEED_10G = 0x8, + ENA_ADMIN_LINK_SPEED_25G = 0x10, + ENA_ADMIN_LINK_SPEED_40G = 0x20, + ENA_ADMIN_LINK_SPEED_50G = 0x40, + ENA_ADMIN_LINK_SPEED_100G = 0x80, + ENA_ADMIN_LINK_SPEED_200G = 0x100, + ENA_ADMIN_LINK_SPEED_400G = 0x200, +}; + +enum ena_admin_completion_policy_type { + /* completion queue entry for each sq descriptor */ + ENA_ADMIN_COMPLETION_POLICY_DESC = 0, + /* completion queue entry upon request in sq descriptor */ + ENA_ADMIN_COMPLETION_POLICY_DESC_ON_DEMAND = 1, + /* current queue head pointer is updated in OS memory upon sq + * descriptor request + 
*/ + ENA_ADMIN_COMPLETION_POLICY_HEAD_ON_DEMAND = 2, + /* current queue head pointer is updated in OS memory for each sq + * descriptor + */ + ENA_ADMIN_COMPLETION_POLICY_HEAD = 3, +}; + +/* basic stats return ena_admin_basic_stats while extanded stats return a + * buffer (string format) with additional statistics per queue and per + * device id + */ +enum ena_admin_get_stats_type { + ENA_ADMIN_GET_STATS_TYPE_BASIC = 0, + ENA_ADMIN_GET_STATS_TYPE_EXTENDED = 1, + /* extra HW stats for specific network interface */ + ENA_ADMIN_GET_STATS_TYPE_ENI = 2, + /* extra HW stats for ENA SRD */ + ENA_ADMIN_GET_STATS_TYPE_ENA_SRD = 3, + ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS = 4, + +}; + +enum ena_admin_get_stats_scope { + ENA_ADMIN_SPECIFIC_QUEUE = 0, + ENA_ADMIN_ETH_TRAFFIC = 1, +}; + +enum ena_admin_phc_feature_version { + /* Readless with error_bound */ + ENA_ADMIN_PHC_FEATURE_VERSION_0 = 0, +}; + +enum ena_admin_phc_error_flags { + ENA_ADMIN_PHC_ERROR_FLAG_TIMESTAMP = BIT(0), + ENA_ADMIN_PHC_ERROR_FLAG_ERROR_BOUND = BIT(1), +}; + +/* ENA SRD configuration for ENI */ +enum ena_admin_ena_srd_flags { + /* Feature enabled */ + ENA_ADMIN_ENA_SRD_ENABLED = BIT(0), + /* UDP support enabled */ + ENA_ADMIN_ENA_SRD_UDP_ENABLED = BIT(1), + /* Bypass Rx UDP ordering */ + ENA_ADMIN_ENA_SRD_UDP_ORDERING_BYPASS_ENABLED = BIT(2), +}; + +struct ena_admin_aq_common_desc { + /* 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command_id; + + /* as appears in ena_admin_aq_opcode */ + u8 opcode; + + /* 0 : phase + * 1 : ctrl_data - control buffer address valid + * 2 : ctrl_data_indirect - control buffer address + * points to list of pages with addresses of control + * buffers + * 7:3 : reserved3 + */ + u8 flags; +}; + +/* used in ena_admin_aq_entry. Can point directly to control data, or to a + * page list chunk. Used also at the end of indirect mode page list chunks, + * for chaining. 
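For clarity, the bit layout documented in ena_admin_aq_common_desc above can be sketched with the ENA_ADMIN_AQ_COMMON_DESC_* masks defined near the bottom of this header. example_fill_aq_common_desc() and its inputs are purely illustrative, assuming a command that carries a control buffer.

static inline void example_fill_aq_common_desc(struct ena_admin_aq_common_desc *desc,
					       u16 cmd_id, u8 phase, bool indirect)
{
	desc->opcode = ENA_ADMIN_GET_FEATURE;	/* arbitrary example opcode */

	/* command_id occupies bits 11:0 of the 16-bit field. */
	desc->command_id = cmd_id & ENA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK;

	/* flags: bit 0 phase, bit 1 control buffer valid, bit 2 indirect list. */
	desc->flags = phase & ENA_ADMIN_AQ_COMMON_DESC_PHASE_MASK;
	desc->flags |= ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK;
	if (indirect)
		desc->flags |= ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK;
}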
+ */ +struct ena_admin_ctrl_buff_info { + u32 length; + + struct ena_common_mem_addr address; +}; + +struct ena_admin_sq { + u16 sq_idx; + + /* 4:0 : reserved + * 7:5 : sq_direction - 0x1 - Tx; 0x2 - Rx + */ + u8 sq_identity; + + u8 reserved1; +}; + +struct ena_admin_aq_entry { + struct ena_admin_aq_common_desc aq_common_descriptor; + + union { + u32 inline_data_w1[3]; + + struct ena_admin_ctrl_buff_info control_buffer; + } u; + + u32 inline_data_w4[12]; +}; + +struct ena_admin_acq_common_desc { + /* command identifier to associate it with the aq descriptor + * 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command; + + u8 status; + + /* 0 : phase + * 7:1 : reserved1 + */ + u8 flags; + + u16 extended_status; + + /* indicates to the driver which AQ entry has been consumed by the + * device and could be reused + */ + u16 sq_head_indx; +}; + +struct ena_admin_acq_entry { + struct ena_admin_acq_common_desc acq_common_descriptor; + + u32 response_specific_data[14]; +}; + +struct ena_admin_aq_create_sq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + /* 4:0 : reserved0_w1 + * 7:5 : sq_direction - 0x1 - Tx, 0x2 - Rx + */ + u8 sq_identity; + + u8 reserved8_w1; + + /* 3:0 : placement_policy - Describing where the SQ + * descriptor ring and the SQ packet headers reside: + * 0x1 - descriptors and headers are in OS memory, + * 0x3 - descriptors and headers in device memory + * (a.k.a Low Latency Queue) + * 6:4 : completion_policy - Describing what policy + * to use for generation completion entry (cqe) in + * the CQ associated with this SQ: 0x0 - cqe for each + * sq descriptor, 0x1 - cqe upon request in sq + * descriptor, 0x2 - current queue head pointer is + * updated in OS memory upon sq descriptor request + * 0x3 - current queue head pointer is updated in OS + * memory for each sq descriptor + * 7 : reserved15_w1 + */ + u8 sq_caps_2; + + /* 0 : is_physically_contiguous - Described if the + * queue ring memory is allocated in physical + * contiguous pages or split. + * 7:1 : reserved17_w1 + */ + u8 sq_caps_3; + + /* associated completion queue id. This CQ must be created prior to SQ + * creation + */ + u16 cq_idx; + + /* submission queue depth in entries */ + u16 sq_depth; + + /* SQ physical base address in OS memory. This field should not be + * used for Low Latency queues. Has to be page aligned. + */ + struct ena_common_mem_addr sq_ba; + + /* specifies queue head writeback location in OS memory. Valid if + * completion_policy is set to completion_policy_head_on_demand or + * completion_policy_head. 
Has to be cache aligned + */ + struct ena_common_mem_addr sq_head_writeback; + + u32 reserved0_w7; + + u32 reserved0_w8; +}; + +enum ena_admin_sq_direction { + ENA_ADMIN_SQ_DIRECTION_TX = 1, + ENA_ADMIN_SQ_DIRECTION_RX = 2, +}; + +struct ena_admin_acq_create_sq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; + + u16 sq_idx; + + u16 reserved; + + /* queue doorbell address as an offset to PCIe MMIO REG BAR */ + u32 sq_doorbell_offset; + + /* low latency queue ring base address as an offset to PCIe MMIO + * LLQ_MEM BAR + */ + u32 llq_descriptors_offset; + + /* low latency queue headers' memory as an offset to PCIe MMIO + * LLQ_MEM BAR + */ + u32 llq_headers_offset; +}; + +struct ena_admin_aq_destroy_sq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + struct ena_admin_sq sq; +}; + +struct ena_admin_acq_destroy_sq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; +}; + +struct ena_admin_aq_create_cq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + /* 4:0 : reserved5 + * 5 : interrupt_mode_enabled - if set, cq operates + * in interrupt mode, otherwise - polling + * 7:6 : reserved6 + */ + u8 cq_caps_1; + + /* 4:0 : cq_entry_size_words - size of CQ entry in + * 32-bit words, valid values: 4, 8. + * 7:5 : reserved7 + */ + u8 cq_caps_2; + + /* completion queue depth in # of entries. must be power of 2 */ + u16 cq_depth; + + /* msix vector assigned to this cq */ + u32 msix_vector; + + /* cq physical base address in OS memory. CQ must be physically + * contiguous + */ + struct ena_common_mem_addr cq_ba; +}; + +struct ena_admin_acq_create_cq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; + + u16 cq_idx; + + /* actual cq depth in number of entries */ + u16 cq_actual_depth; + + u32 numa_node_register_offset; + + u32 cq_head_db_register_offset; + + u32 cq_interrupt_unmask_register_offset; +}; + +struct ena_admin_aq_destroy_cq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + u16 cq_idx; + + u16 reserved1; +}; + +struct ena_admin_acq_destroy_cq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; +}; + +/* ENA AQ Get Statistics command. Extended statistics are placed in control + * buffer pointed by AQ entry + */ +struct ena_admin_aq_get_stats_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + union { + /* command specific inline data */ + u32 inline_data_w1[3]; + + struct ena_admin_ctrl_buff_info control_buffer; + } u; + + /* stats type as defined in enum ena_admin_get_stats_type */ + u8 type; + + /* stats scope defined in enum ena_admin_get_stats_scope */ + u8 scope; + + u16 reserved3; + + /* queue id. used when scope is specific_queue */ + u16 queue_idx; + + /* device id, value 0xFFFF means mine. only privileged device can get + * stats of other device + */ + u16 device_id; + + /* a bitmap representing the requested metric values */ + u64 requested_metrics; +}; + +/* Basic Statistics Command. */ +struct ena_admin_basic_stats { + u32 tx_bytes_low; + + u32 tx_bytes_high; + + u32 tx_pkts_low; + + u32 tx_pkts_high; + + u32 rx_bytes_low; + + u32 rx_bytes_high; + + u32 rx_pkts_low; + + u32 rx_pkts_high; + + u32 rx_drops_low; + + u32 rx_drops_high; + + u32 tx_drops_low; + + u32 tx_drops_high; + + u32 rx_overruns_low; + + u32 rx_overruns_high; +}; + +/* ENI Statistics Command. 
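As a hedged illustration of how the get-stats descriptor above is used: basic stats for the whole interface are requested with the type/scope enums from earlier in this file, and each 64-bit counter comes back as a low/high pair. ENA_ADMIN_STATS_MINE and the example_* helpers are hypothetical names introduced only for this sketch.

#define ENA_ADMIN_STATS_MINE	0xFFFF	/* hypothetical name for the documented
					 * 0xFFFF "my device" id
					 */

static inline void example_prepare_basic_stats_cmd(struct ena_admin_aq_get_stats_cmd *cmd)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->aq_common_descriptor.opcode = ENA_ADMIN_GET_STATS;
	cmd->type = ENA_ADMIN_GET_STATS_TYPE_BASIC;
	cmd->scope = ENA_ADMIN_ETH_TRAFFIC;	/* whole device, not one queue */
	cmd->device_id = ENA_ADMIN_STATS_MINE;
}

static inline u64 example_stat64(u32 low, u32 high)
{
	/* e.g. tx_bytes = example_stat64(bs->tx_bytes_low, bs->tx_bytes_high) */
	return ((u64)high << 32) | low;
}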
*/ +struct ena_admin_eni_stats { + /* The number of packets shaped due to inbound aggregate BW + * allowance being exceeded + */ + u64 bw_in_allowance_exceeded; + + /* The number of packets shaped due to outbound aggregate BW + * allowance being exceeded + */ + u64 bw_out_allowance_exceeded; + + /* The number of packets shaped due to PPS allowance being exceeded */ + u64 pps_allowance_exceeded; + + /* The number of packets shaped due to connection tracking + * allowance being exceeded and leading to failure in establishment + * of new connections + */ + u64 conntrack_allowance_exceeded; + + /* The number of packets shaped due to linklocal packet rate + * allowance being exceeded + */ + u64 linklocal_allowance_exceeded; +}; + +struct ena_admin_ena_srd_stats { + /* Number of packets transmitted over ENA SRD */ + u64 ena_srd_tx_pkts; + + /* Number of packets transmitted or could have been + * transmitted over ENA SRD + */ + u64 ena_srd_eligible_tx_pkts; + + /* Number of packets received over ENA SRD */ + u64 ena_srd_rx_pkts; + + /* Percentage of the ENA SRD resources that is in use */ + u64 ena_srd_resource_utilization; +}; + +/* ENA SRD Statistics Command */ +struct ena_admin_ena_srd_info { + /* ENA SRD configuration bitmap. See ena_admin_ena_srd_flags for + * details + */ + u64 flags; + + struct ena_admin_ena_srd_stats ena_srd_stats; +}; + +/* Customer Metrics Command. */ +struct ena_admin_customer_metrics { + /* A bitmap representing the reported customer metrics according to + * the order they are reported + */ + u64 reported_metrics; +}; + +struct ena_admin_acq_get_stats_resp { + struct ena_admin_acq_common_desc acq_common_desc; + + union { + u64 raw[7]; + + struct ena_admin_basic_stats basic_stats; + + struct ena_admin_eni_stats eni_stats; + + struct ena_admin_ena_srd_info ena_srd_info; + + struct ena_admin_customer_metrics customer_metrics; + } u; +}; + +struct ena_admin_get_set_feature_common_desc { + /* 1:0 : select - 0x1 - current value; 0x3 - default + * value + * 7:3 : reserved3 + */ + u8 flags; + + /* as appears in ena_admin_aq_feature_id */ + u8 feature_id; + + /* The driver specifies the max feature version it supports and the + * device responds with the currently supported feature version. The + * field is zero based + */ + u8 feature_version; + + u8 reserved8; +}; + +struct ena_admin_device_attr_feature_desc { + u32 impl_id; + + u32 device_version; + + /* bitmap of ena_admin_aq_feature_id, which represents supported + * subcommands for the set/get feature admin commands. + */ + u32 supported_features; + + /* bitmap of ena_admin_aq_caps_id, which represents device + * capabilities. + */ + u32 capabilities; + + /* Indicates how many bits are used physical address access. */ + u32 phys_addr_width; + + /* Indicates how many bits are used virtual address access. 
*/ + u32 virt_addr_width; + + /* unicast MAC address (in Network byte order) */ + u8 mac_addr[6]; + + u8 reserved7[2]; + + u32 max_mtu; +}; + +enum ena_admin_llq_header_location { + /* header is in descriptor list */ + ENA_ADMIN_INLINE_HEADER = 1, + /* header in a separate ring, implies 16B descriptor list entry */ + ENA_ADMIN_HEADER_RING = 2, +}; + +enum ena_admin_llq_ring_entry_size { + ENA_ADMIN_LIST_ENTRY_SIZE_128B = 1, + ENA_ADMIN_LIST_ENTRY_SIZE_192B = 2, + ENA_ADMIN_LIST_ENTRY_SIZE_256B = 4, +}; + +enum ena_admin_llq_num_descs_before_header { + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_0 = 0, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_1 = 1, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2 = 2, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_4 = 4, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_8 = 8, +}; + +/* packet descriptor list entry always starts with one or more descriptors, + * followed by a header. The rest of the descriptors are located in the + * beginning of the subsequent entry. Stride refers to how the rest of the + * descriptors are placed. This field is relevant only for inline header + * mode + */ +enum ena_admin_llq_stride_ctrl { + ENA_ADMIN_SINGLE_DESC_PER_ENTRY = 1, + ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY = 2, +}; + +enum ena_admin_accel_mode_feat { + ENA_ADMIN_DISABLE_META_CACHING = 0, + ENA_ADMIN_LIMIT_TX_BURST = 1, +}; + +struct ena_admin_accel_mode_get { + /* bit field of enum ena_admin_accel_mode_feat */ + u16 supported_flags; + + /* maximum burst size between two doorbells. The size is in bytes */ + u16 max_tx_burst_size; +}; + +struct ena_admin_accel_mode_set { + /* bit field of enum ena_admin_accel_mode_feat */ + u16 enabled_flags; + + u16 reserved; +}; + +struct ena_admin_accel_mode_req { + union { + u32 raw[2]; + + struct ena_admin_accel_mode_get get; + + struct ena_admin_accel_mode_set set; + } u; +}; + +struct ena_admin_feature_llq_desc { + u32 max_llq_num; + + u32 max_llq_depth; + + /* specify the header locations the device supports. bitfield of enum + * ena_admin_llq_header_location. + */ + u16 header_location_ctrl_supported; + + /* the header location the driver selected to use. */ + u16 header_location_ctrl_enabled; + + /* if inline header is specified - this is the size of descriptor list + * entry. If header in a separate ring is specified - this is the size + * of header ring entry. bitfield of enum ena_admin_llq_ring_entry_size. + * specify the entry sizes the device supports + */ + u16 entry_size_ctrl_supported; + + /* the entry size the driver selected to use. */ + u16 entry_size_ctrl_enabled; + + /* valid only if inline header is specified. First entry associated with + * the packet includes descriptors and header. Rest of the entries + * occupied by descriptors. This parameter defines the max number of + * descriptors precedding the header in the first entry. The field is + * bitfield of enum ena_admin_llq_num_descs_before_header and specify + * the values the device supports + */ + u16 desc_num_before_header_supported; + + /* the desire field the driver selected to use */ + u16 desc_num_before_header_enabled; + + /* valid only if inline was chosen. bitfield of enum + * ena_admin_llq_stride_ctrl + */ + u16 descriptors_stride_ctrl_supported; + + /* the stride control the driver selected to use */ + u16 descriptors_stride_ctrl_enabled; + + /* feature version of device resp to either GET/SET commands. */ + u8 feature_version; + + /* llq entry size recommended by the device, + * values correlated to enum ena_admin_llq_ring_entry_size. + * used only for GET command. 
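The *_supported / *_enabled pairs above describe a simple negotiation: the device advertises bitfields of the LLQ enums and the driver writes back its selection before issuing the SET_FEATURE command. A sketch of that selection follows, with arbitrary preferences and a hypothetical example_select_llq_cfg() name.

static inline int example_select_llq_cfg(struct ena_admin_feature_llq_desc *llq)
{
	if (!(llq->header_location_ctrl_supported & ENA_ADMIN_INLINE_HEADER))
		return -EOPNOTSUPP;
	llq->header_location_ctrl_enabled = ENA_ADMIN_INLINE_HEADER;

	/* Prefer the smallest ring entry size the device supports. */
	if (llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_128B)
		llq->entry_size_ctrl_enabled = ENA_ADMIN_LIST_ENTRY_SIZE_128B;
	else if (llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B)
		llq->entry_size_ctrl_enabled = ENA_ADMIN_LIST_ENTRY_SIZE_256B;
	else
		return -EOPNOTSUPP;

	if (llq->descriptors_stride_ctrl_supported & ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY)
		llq->descriptors_stride_ctrl_enabled = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY;
	else
		llq->descriptors_stride_ctrl_enabled = ENA_ADMIN_SINGLE_DESC_PER_ENTRY;

	return 0;
}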
+ */ + u8 entry_size_recommended; + + /* max depth of wide llq, or 0 for N/A */ + u16 max_wide_llq_depth; + + /* accelerated low latency queues requirement. driver needs to + * support those requirements in order to use accelerated llq + */ + struct ena_admin_accel_mode_req accel_mode; +}; + +struct ena_admin_queue_ext_feature_fields { + u32 max_tx_sq_num; + + u32 max_tx_cq_num; + + u32 max_rx_sq_num; + + u32 max_rx_cq_num; + + u32 max_tx_sq_depth; + + u32 max_tx_cq_depth; + + u32 max_rx_sq_depth; + + u32 max_rx_cq_depth; + + u32 max_tx_header_size; + + /* Maximum Descriptors number, including meta descriptor, allowed for a + * single Tx packet + */ + u16 max_per_packet_tx_descs; + + /* Maximum Descriptors number allowed for a single Rx packet */ + u16 max_per_packet_rx_descs; +}; + +struct ena_admin_queue_feature_desc { + u32 max_sq_num; + + u32 max_sq_depth; + + u32 max_cq_num; + + u32 max_cq_depth; + + u32 max_legacy_llq_num; + + u32 max_legacy_llq_depth; + + u32 max_header_size; + + /* Maximum Descriptors number, including meta descriptor, allowed for a + * single Tx packet + */ + u16 max_packet_tx_descs; + + /* Maximum Descriptors number allowed for a single Rx packet */ + u16 max_packet_rx_descs; +}; + +struct ena_admin_set_feature_mtu_desc { + /* exclude L2 */ + u32 mtu; +}; + +struct ena_admin_get_extra_properties_strings_desc { + u32 count; +}; + +struct ena_admin_get_extra_properties_flags_desc { + u32 flags; +}; + +struct ena_admin_set_feature_host_attr_desc { + /* host OS info base address in OS memory. host info is 4KB of + * physically contiguous + */ + struct ena_common_mem_addr os_info_ba; + + /* host debug area base address in OS memory. debug area must be + * physically contiguous + */ + struct ena_common_mem_addr debug_ba; + + /* debug area size */ + u32 debug_area_size; +}; + +struct ena_admin_feature_intr_moder_desc { + /* interrupt delay granularity in usec */ + u16 intr_delay_resolution; + + u16 reserved; +}; + +struct ena_admin_get_feature_link_desc { + /* Link speed in Mb */ + u32 speed; + + /* bit field of enum ena_admin_link types */ + u32 supported; + + /* 0 : autoneg + * 1 : duplex - Full Duplex + * 31:2 : reserved2 + */ + u32 flags; +}; + +struct ena_admin_feature_aenq_desc { + /* bitmask for AENQ groups the device can report */ + u32 supported_groups; + + /* bitmask for AENQ groups to report */ + u32 enabled_groups; +}; + +struct ena_admin_feature_offload_desc { + /* 0 : TX_L3_csum_ipv4 + * 1 : TX_L4_ipv4_csum_part - The checksum field + * should be initialized with pseudo header checksum + * 2 : TX_L4_ipv4_csum_full + * 3 : TX_L4_ipv6_csum_part - The checksum field + * should be initialized with pseudo header checksum + * 4 : TX_L4_ipv6_csum_full + * 5 : tso_ipv4 + * 6 : tso_ipv6 + * 7 : tso_ecn + */ + u32 tx; + + /* Receive side supported stateless offload + * 0 : RX_L3_csum_ipv4 - IPv4 checksum + * 1 : RX_L4_ipv4_csum - TCP/UDP/IPv4 checksum + * 2 : RX_L4_ipv6_csum - TCP/UDP/IPv6 checksum + * 3 : RX_hash - Hash calculation + */ + u32 rx_supported; + + u32 rx_enabled; +}; + +enum ena_admin_hash_functions { + ENA_ADMIN_TOEPLITZ = 1, + ENA_ADMIN_CRC32 = 2, +}; + +struct ena_admin_feature_rss_flow_hash_control { + u32 key_parts; + + u32 reserved; + + u32 key[ENA_ADMIN_RSS_KEY_PARTS]; +}; + +struct ena_admin_feature_rss_flow_hash_function { + /* 7:0 : funcs - bitmask of ena_admin_hash_functions */ + u32 supported_func; + + /* 7:0 : selected_func - bitmask of + * ena_admin_hash_functions + */ + u32 selected_func; + + /* initial value */ + u32 init_val; +}; + +/* 
RSS flow hash protocols */ +enum ena_admin_flow_hash_proto { + ENA_ADMIN_RSS_TCP4 = 0, + ENA_ADMIN_RSS_UDP4 = 1, + ENA_ADMIN_RSS_TCP6 = 2, + ENA_ADMIN_RSS_UDP6 = 3, + ENA_ADMIN_RSS_IP4 = 4, + ENA_ADMIN_RSS_IP6 = 5, + ENA_ADMIN_RSS_IP4_FRAG = 6, + ENA_ADMIN_RSS_NOT_IP = 7, + /* TCPv6 with extension header */ + ENA_ADMIN_RSS_TCP6_EX = 8, + /* IPv6 with extension header */ + ENA_ADMIN_RSS_IP6_EX = 9, + ENA_ADMIN_RSS_PROTO_NUM = 16, +}; + +/* RSS flow hash fields */ +enum ena_admin_flow_hash_fields { + /* Ethernet Dest Addr */ + ENA_ADMIN_RSS_L2_DA = BIT(0), + /* Ethernet Src Addr */ + ENA_ADMIN_RSS_L2_SA = BIT(1), + /* ipv4/6 Dest Addr */ + ENA_ADMIN_RSS_L3_DA = BIT(2), + /* ipv4/6 Src Addr */ + ENA_ADMIN_RSS_L3_SA = BIT(3), + /* tcp/udp Dest Port */ + ENA_ADMIN_RSS_L4_DP = BIT(4), + /* tcp/udp Src Port */ + ENA_ADMIN_RSS_L4_SP = BIT(5), +}; + +struct ena_admin_proto_input { + /* flow hash fields (bitwise according to ena_admin_flow_hash_fields) */ + u16 fields; + + u16 reserved2; +}; + +struct ena_admin_feature_rss_hash_control { + struct ena_admin_proto_input supported_fields[ENA_ADMIN_RSS_PROTO_NUM]; + + struct ena_admin_proto_input selected_fields[ENA_ADMIN_RSS_PROTO_NUM]; + + struct ena_admin_proto_input reserved2[ENA_ADMIN_RSS_PROTO_NUM]; + + struct ena_admin_proto_input reserved3[ENA_ADMIN_RSS_PROTO_NUM]; +}; + +struct ena_admin_feature_rss_flow_hash_input { + /* supported hash input sorting + * 1 : L3_sort - support swap L3 addresses if DA is + * smaller than SA + * 2 : L4_sort - support swap L4 ports if DP smaller + * SP + */ + u16 supported_input_sort; + + /* enabled hash input sorting + * 1 : enable_L3_sort - enable swap L3 addresses if + * DA smaller than SA + * 2 : enable_L4_sort - enable swap L4 ports if DP + * smaller than SP + */ + u16 enabled_input_sort; +}; + +struct ena_admin_host_info { + /* Host OS type defined as ENA_ADMIN_OS_* */ + u32 os_type; + + /* os distribution string format */ + u8 os_dist_str[128]; + + /* OS distribution numeric format */ + u32 os_dist; + + /* kernel version string format */ + u8 kernel_ver_str[32]; + + /* Kernel version numeric format */ + u32 kernel_ver; + + /* 7:0 : major + * 15:8 : minor + * 23:16 : sub_minor + * 31:24 : module_type + */ + u32 driver_version; + + /* features bitmap */ + u32 supported_network_features[2]; + + /* ENA spec version of driver */ + u16 ena_spec_version; + + /* ENA device's Bus, Device and Function + * 2:0 : function + * 7:3 : device + * 15:8 : bus + */ + u16 bdf; + + /* Number of CPUs */ + u16 num_cpus; + + u16 reserved; + + /* 0 : reserved + * 1 : rx_offset + * 2 : interrupt_moderation + * 3 : rx_buf_mirroring + * 4 : rss_configurable_function_key + * 5 : reserved + * 6 : rx_page_reuse + * 7 : tx_ipv6_csum_offload + * 8 : phc + * 31:9 : reserved + */ + u32 driver_supported_features; +}; + +struct ena_admin_rss_ind_table_entry { + u16 cq_idx; + + u16 reserved; +}; + +struct ena_admin_feature_rss_ind_table { + /* min supported table size (2^min_size) */ + u16 min_size; + + /* max supported table size (2^max_size) */ + u16 max_size; + + /* table size (2^size) */ + u16 size; + + /* 0 : one_entry_update - The ENA device supports + * setting a single RSS table entry + */ + u8 flags; + + u8 reserved; + + /* index of the inline entry. 0xFFFFFFFF means invalid */ + u32 inline_index; + + /* used for updating single entry, ignored when setting the entire + * table through the control buffer. 
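The packed ena_admin_host_info fields above (driver_version and bdf) follow the bit positions given in their comments. A small sketch using the ENA_ADMIN_HOST_INFO_* shifts and masks defined near the end of this header, with arbitrary version numbers and a hypothetical helper name; PCI_SLOT()/PCI_FUNC() are assumed to come from linux/pci.h.

static inline void example_fill_host_info_ids(struct ena_admin_host_info *hi,
					      struct pci_dev *pdev)
{
	/* bits 7:0 major, 15:8 minor, 23:16 sub_minor (module_type left 0). */
	hi->driver_version = 2 |
			     (12 << ENA_ADMIN_HOST_INFO_MINOR_SHIFT) |
			     (1 << ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT);

	/* bits 2:0 function, 7:3 device, 15:8 bus. */
	hi->bdf = (PCI_FUNC(pdev->devfn) & ENA_ADMIN_HOST_INFO_FUNCTION_MASK) |
		  (PCI_SLOT(pdev->devfn) << ENA_ADMIN_HOST_INFO_DEVICE_SHIFT) |
		  (pdev->bus->number << ENA_ADMIN_HOST_INFO_BUS_SHIFT);
}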
+ */ + struct ena_admin_rss_ind_table_entry inline_entry; +}; + +/* When hint value is 0, driver should use it's own predefined value */ +struct ena_admin_ena_hw_hints { + /* value in ms */ + u16 mmio_read_timeout; + + /* value in ms */ + u16 driver_watchdog_timeout; + + /* Per packet tx completion timeout. value in ms */ + u16 missing_tx_completion_timeout; + + u16 missed_tx_completion_count_threshold_to_reset; + + /* value in ms */ + u16 admin_completion_tx_timeout; + + u16 netdev_wd_timeout; + + u16 max_tx_sgl_size; + + u16 max_rx_sgl_size; + + u16 reserved[8]; +}; + +struct ena_admin_get_feat_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + struct ena_admin_ctrl_buff_info control_buffer; + + struct ena_admin_get_set_feature_common_desc feat_common; + + u32 raw[11]; +}; + +struct ena_admin_queue_ext_feature_desc { + /* version */ + u8 version; + + u8 reserved1[3]; + + union { + struct ena_admin_queue_ext_feature_fields max_queue_ext; + + u32 raw[10]; + }; +}; + +struct ena_admin_feature_phc_desc { + /* PHC version as defined in enum ena_admin_phc_feature_version, + * used only for GET command as max supported PHC version by the device. + */ + u8 version; + + /* Reserved - MBZ */ + u8 reserved1[3]; + + /* PHC doorbell address as an offset to PCIe MMIO REG BAR, + * used only for GET command. + */ + u32 doorbell_offset; + + /* Max time for valid PHC retrieval, passing this threshold will + * fail the get-time request and block PHC requests for + * block_timeout_usec, used only for GET command. + */ + u32 expire_timeout_usec; + + /* PHC requests block period, blocking starts if PHC request expired + * in order to prevent floods on busy device, + * used only for GET command. + */ + u32 block_timeout_usec; + + /* Shared PHC physical address (ena_admin_phc_resp), + * used only for SET command. + */ + struct ena_common_mem_addr output_address; + + /* Shared PHC Size (ena_admin_phc_resp), + * used only for SET command. 
+ */ + u32 output_length; +}; + +struct ena_admin_get_feat_resp { + struct ena_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + + struct ena_admin_device_attr_feature_desc dev_attr; + + struct ena_admin_feature_llq_desc llq; + + struct ena_admin_queue_feature_desc max_queue; + + struct ena_admin_queue_ext_feature_desc max_queue_ext; + + struct ena_admin_feature_aenq_desc aenq; + + struct ena_admin_get_feature_link_desc link; + + struct ena_admin_feature_offload_desc offload; + + struct ena_admin_feature_rss_flow_hash_function flow_hash_func; + + struct ena_admin_feature_rss_flow_hash_input flow_hash_input; + + struct ena_admin_feature_rss_ind_table ind_table; + + struct ena_admin_feature_intr_moder_desc intr_moderation; + + struct ena_admin_ena_hw_hints hw_hints; + + struct ena_admin_feature_phc_desc phc; + + struct ena_admin_get_extra_properties_strings_desc extra_properties_strings; + + struct ena_admin_get_extra_properties_flags_desc extra_properties_flags; + } u; +}; + +struct ena_admin_set_feat_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + struct ena_admin_ctrl_buff_info control_buffer; + + struct ena_admin_get_set_feature_common_desc feat_common; + + union { + u32 raw[11]; + + /* mtu size */ + struct ena_admin_set_feature_mtu_desc mtu; + + /* host attributes */ + struct ena_admin_set_feature_host_attr_desc host_attr; + + /* AENQ configuration */ + struct ena_admin_feature_aenq_desc aenq; + + /* rss flow hash function */ + struct ena_admin_feature_rss_flow_hash_function flow_hash_func; + + /* rss flow hash input */ + struct ena_admin_feature_rss_flow_hash_input flow_hash_input; + + /* rss indirection table */ + struct ena_admin_feature_rss_ind_table ind_table; + + /* LLQ configuration */ + struct ena_admin_feature_llq_desc llq; + + /* PHC configuration */ + struct ena_admin_feature_phc_desc phc; + } u; +}; + +struct ena_admin_set_feat_resp { + struct ena_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + } u; +}; + +struct ena_admin_aenq_common_desc { + u16 group; + + u16 syndrome; + + /* 0 : phase + * 7:1 : reserved - MBZ + */ + u8 flags; + + u8 reserved1[3]; + + u32 timestamp_low; + + u32 timestamp_high; +}; + +/* asynchronous event notification groups */ +enum ena_admin_aenq_group { + ENA_ADMIN_LINK_CHANGE = 0, + ENA_ADMIN_FATAL_ERROR = 1, + ENA_ADMIN_WARNING = 2, + ENA_ADMIN_NOTIFICATION = 3, + ENA_ADMIN_KEEP_ALIVE = 4, + ENA_ADMIN_REFRESH_CAPABILITIES = 5, + ENA_ADMIN_CONF_NOTIFICATIONS = 6, + ENA_ADMIN_DEVICE_REQUEST_RESET = 7, + ENA_ADMIN_AENQ_GROUPS_NUM = 8, +}; + +enum ena_admin_aenq_notification_syndrome { + ENA_ADMIN_UPDATE_HINTS = 2, +}; + +struct ena_admin_aenq_entry { + struct ena_admin_aenq_common_desc aenq_common_desc; + + /* command specific inline data */ + u32 inline_data_w4[12]; +}; + +struct ena_admin_aenq_link_change_desc { + struct ena_admin_aenq_common_desc aenq_common_desc; + + /* 0 : link_status */ + u32 flags; +}; + +struct ena_admin_aenq_keep_alive_desc { + struct ena_admin_aenq_common_desc aenq_common_desc; + + u32 rx_drops_low; + + u32 rx_drops_high; + + u32 tx_drops_low; + + u32 tx_drops_high; + + u32 rx_overruns_low; + + u32 rx_overruns_high; +}; + +struct ena_admin_aenq_conf_notifications_desc { + struct ena_admin_aenq_common_desc aenq_common_desc; + + u64 notifications_bitmap; + + u64 reserved; +}; + +struct ena_admin_ena_mmio_req_read_less_resp { + u16 req_id; + + u16 reg_off; + + /* value is valid when poll is cleared */ + u32 reg_val; +}; + +struct ena_admin_phc_resp { + /* Request Id, 
received from DB register */ + u16 req_id; + + u8 reserved1[6]; + + /* PHC timestamp (nsec) */ + u64 timestamp; + + u8 reserved2[8]; + + /* Timestamp error limit (nsec) */ + u32 error_bound; + + /* Bit field of enum ena_admin_phc_error_flags */ + u32 error_flags; + + u8 reserved3[32]; +}; + +/* aq_common_desc */ +#define ENA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define ENA_ADMIN_AQ_COMMON_DESC_PHASE_MASK BIT(0) +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_SHIFT 1 +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK BIT(1) +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_SHIFT 2 +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK BIT(2) + +/* sq */ +#define ENA_ADMIN_SQ_SQ_DIRECTION_SHIFT 5 +#define ENA_ADMIN_SQ_SQ_DIRECTION_MASK GENMASK(7, 5) + +/* acq_common_desc */ +#define ENA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define ENA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* aq_create_sq_cmd */ +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_SHIFT 5 +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_MASK GENMASK(7, 5) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_PLACEMENT_POLICY_MASK GENMASK(3, 0) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_SHIFT 4 +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_MASK GENMASK(6, 4) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_IS_PHYSICALLY_CONTIGUOUS_MASK BIT(0) + +/* aq_create_cq_cmd */ +#define ENA_ADMIN_AQ_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_SHIFT 5 +#define ENA_ADMIN_AQ_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK BIT(5) +#define ENA_ADMIN_AQ_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) + +/* get_set_feature_common_desc */ +#define ENA_ADMIN_GET_SET_FEATURE_COMMON_DESC_SELECT_MASK GENMASK(1, 0) + +/* get_feature_link_desc */ +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK BIT(0) +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_SHIFT 1 +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_MASK BIT(1) + +/* feature_offload_desc */ +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L3_CSUM_IPV4_MASK BIT(0) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_SHIFT 1 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_MASK BIT(1) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_FULL_SHIFT 2 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_FULL_MASK BIT(2) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_SHIFT 3 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_MASK BIT(3) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_FULL_SHIFT 4 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_FULL_MASK BIT(4) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_SHIFT 5 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_MASK BIT(5) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_SHIFT 6 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_MASK BIT(6) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_SHIFT 7 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_MASK BIT(7) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L3_CSUM_IPV4_MASK BIT(0) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_SHIFT 1 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_MASK BIT(1) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_SHIFT 2 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_MASK BIT(2) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_HASH_SHIFT 3 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_HASH_MASK BIT(3) + +/* feature_rss_flow_hash_function */ +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_FUNCTION_FUNCS_MASK GENMASK(7, 0) +#define 
ENA_ADMIN_FEATURE_RSS_FLOW_HASH_FUNCTION_SELECTED_FUNC_MASK GENMASK(7, 0) + +/* feature_rss_flow_hash_input */ +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_SHIFT 1 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_MASK BIT(1) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_SHIFT 2 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_MASK BIT(2) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L3_SORT_SHIFT 1 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L3_SORT_MASK BIT(1) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L4_SORT_SHIFT 2 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L4_SORT_MASK BIT(2) + +/* host_info */ +#define ENA_ADMIN_HOST_INFO_MAJOR_MASK GENMASK(7, 0) +#define ENA_ADMIN_HOST_INFO_MINOR_SHIFT 8 +#define ENA_ADMIN_HOST_INFO_MINOR_MASK GENMASK(15, 8) +#define ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT 16 +#define ENA_ADMIN_HOST_INFO_SUB_MINOR_MASK GENMASK(23, 16) +#define ENA_ADMIN_HOST_INFO_MODULE_TYPE_SHIFT 24 +#define ENA_ADMIN_HOST_INFO_MODULE_TYPE_MASK GENMASK(31, 24) +#define ENA_ADMIN_HOST_INFO_FUNCTION_MASK GENMASK(2, 0) +#define ENA_ADMIN_HOST_INFO_DEVICE_SHIFT 3 +#define ENA_ADMIN_HOST_INFO_DEVICE_MASK GENMASK(7, 3) +#define ENA_ADMIN_HOST_INFO_BUS_SHIFT 8 +#define ENA_ADMIN_HOST_INFO_BUS_MASK GENMASK(15, 8) +#define ENA_ADMIN_HOST_INFO_RX_OFFSET_SHIFT 1 +#define ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK BIT(1) +#define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_SHIFT 2 +#define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK BIT(2) +#define ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_SHIFT 3 +#define ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_MASK BIT(3) +#define ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_SHIFT 4 +#define ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK BIT(4) +#define ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_SHIFT 6 +#define ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK BIT(6) +#define ENA_ADMIN_HOST_INFO_TX_IPV6_CSUM_OFFLOAD_SHIFT 7 +#define ENA_ADMIN_HOST_INFO_TX_IPV6_CSUM_OFFLOAD_MASK BIT(7) +#define ENA_ADMIN_HOST_INFO_PHC_SHIFT 8 +#define ENA_ADMIN_HOST_INFO_PHC_MASK BIT(8) + +/* feature_rss_ind_table */ +#define ENA_ADMIN_FEATURE_RSS_IND_TABLE_ONE_ENTRY_UPDATE_MASK BIT(0) + +/* aenq_common_desc */ +#define ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* aenq_link_change_desc */ +#define ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK BIT(0) + +#endif /* _ENA_ADMIN_H_ */ diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c new file mode 100644 index 0000000000000..d53bde802ea4d --- /dev/null +++ b/drivers/amazon/net/ena/ena_com.c @@ -0,0 +1,3357 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "ena_com.h" + +/*****************************************************************************/ +/*****************************************************************************/ + +/* Timeout in micro-sec */ +#define ADMIN_CMD_TIMEOUT_US (3000000) + +#define ENA_ASYNC_QUEUE_DEPTH 16 +#define ENA_ADMIN_QUEUE_DEPTH 32 + + +#define ENA_CTRL_MAJOR 0 +#define ENA_CTRL_MINOR 0 +#define ENA_CTRL_SUB_MINOR 1 + +#define MIN_ENA_CTRL_VER \ + (((ENA_CTRL_MAJOR) << \ + (ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT)) | \ + ((ENA_CTRL_MINOR) << \ + (ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT)) | \ + (ENA_CTRL_SUB_MINOR)) + +#define ENA_DMA_ADDR_TO_UINT32_LOW(x) ((u32)((u64)(x))) +#define ENA_DMA_ADDR_TO_UINT32_HIGH(x) ((u32)(((u64)(x)) >> 32)) + +#define ENA_MMIO_READ_TIMEOUT 0xFFFFFFFF + +#define ENA_COM_BOUNCE_BUFFER_CNTRL_CNT 4 + +#define ENA_REGS_ADMIN_INTR_MASK 1 + +#define ENA_MAX_BACKOFF_DELAY_EXP 16U + +#define ENA_MIN_ADMIN_POLL_US 100 + +#define ENA_MAX_ADMIN_POLL_US 5000 + +/* PHC definitions */ +#define ENA_PHC_DEFAULT_EXPIRE_TIMEOUT_USEC 10 +#define ENA_PHC_DEFAULT_BLOCK_TIMEOUT_USEC 1000 +#define ENA_PHC_MAX_ERROR_BOUND 0xFFFFFFFF +#define ENA_PHC_REQ_ID_OFFSET 0xDEAD +#define ENA_PHC_ERROR_FLAGS (ENA_ADMIN_PHC_ERROR_FLAG_TIMESTAMP | \ + ENA_ADMIN_PHC_ERROR_FLAG_ERROR_BOUND) + +/*****************************************************************************/ +/*****************************************************************************/ +/*****************************************************************************/ + +enum ena_cmd_status { + ENA_CMD_SUBMITTED, + ENA_CMD_COMPLETED, + /* Abort - canceled by the driver */ + ENA_CMD_ABORTED, +}; + +struct ena_comp_ctx { + struct completion wait_event; + struct ena_admin_acq_entry *user_cqe; + u32 comp_size; + enum ena_cmd_status status; + /* status from the device */ + u8 comp_status; + u8 cmd_opcode; + bool occupied; +}; + +struct ena_com_stats_ctx { + struct ena_admin_aq_get_stats_cmd get_cmd; + struct ena_admin_acq_get_stats_resp get_resp; +}; + +static int ena_com_mem_addr_set(struct ena_com_dev *ena_dev, + struct ena_common_mem_addr *ena_addr, + dma_addr_t addr) +{ + if (unlikely((addr & GENMASK_ULL(ena_dev->dma_addr_bits - 1, 0)) != addr)) { + netdev_err(ena_dev->net_device, + "DMA address has more bits that the device supports\n"); + return -EINVAL; + } + + ena_addr->mem_addr_low = lower_32_bits(addr); + ena_addr->mem_addr_high = (u16)upper_32_bits(addr); + + return 0; +} + +static int ena_com_admin_init_sq(struct ena_com_admin_queue *admin_queue) +{ + struct ena_com_dev *ena_dev = admin_queue->ena_dev; + struct ena_com_admin_sq *sq = &admin_queue->sq; + u16 size = ADMIN_SQ_SIZE(admin_queue->q_depth); + + sq->entries = dma_zalloc_coherent(admin_queue->q_dmadev, size, &sq->dma_addr, GFP_KERNEL); + + if (unlikely(!sq->entries)) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); + return -ENOMEM; + } + + sq->head = 0; + sq->tail = 0; + sq->phase = 1; + + sq->db_addr = NULL; + + return 0; +} + +static int ena_com_admin_init_cq(struct ena_com_admin_queue *admin_queue) +{ + struct ena_com_dev *ena_dev = admin_queue->ena_dev; + struct ena_com_admin_cq *cq = &admin_queue->cq; + u16 size = ADMIN_CQ_SIZE(admin_queue->q_depth); + + cq->entries = dma_zalloc_coherent(admin_queue->q_dmadev, size, &cq->dma_addr, GFP_KERNEL); + + if (unlikely(!cq->entries)) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); + return -ENOMEM; + } + + cq->head = 0; + cq->phase = 1; + + return 0; +} + +static int 
ena_com_admin_init_aenq(struct ena_com_dev *ena_dev, + struct ena_aenq_handlers *aenq_handlers) +{ + struct ena_com_aenq *aenq = &ena_dev->aenq; + u32 addr_low, addr_high, aenq_caps; + u16 size; + + ena_dev->aenq.q_depth = ENA_ASYNC_QUEUE_DEPTH; + size = ADMIN_AENQ_SIZE(ENA_ASYNC_QUEUE_DEPTH); + aenq->entries = dma_zalloc_coherent(ena_dev->dmadev, size, &aenq->dma_addr, GFP_KERNEL); + + if (unlikely(!aenq->entries)) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); + return -ENOMEM; + } + + aenq->head = aenq->q_depth; + aenq->phase = 1; + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(aenq->dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(aenq->dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_AENQ_BASE_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_AENQ_BASE_HI_OFF); + + aenq_caps = 0; + aenq_caps |= ena_dev->aenq.q_depth & ENA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK; + aenq_caps |= + (sizeof(struct ena_admin_aenq_entry) << ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT) & + ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK; + writel(aenq_caps, ena_dev->reg_bar + ENA_REGS_AENQ_CAPS_OFF); + + if (unlikely(!aenq_handlers)) { + netdev_err(ena_dev->net_device, "AENQ handlers pointer is NULL\n"); + return -EINVAL; + } + + aenq->aenq_handlers = aenq_handlers; + + return 0; +} + +static void comp_ctxt_release(struct ena_com_admin_queue *queue, + struct ena_comp_ctx *comp_ctx) +{ + comp_ctx->user_cqe = NULL; + comp_ctx->occupied = false; + atomic_dec(&queue->outstanding_cmds); +} + +static struct ena_comp_ctx *get_comp_ctxt(struct ena_com_admin_queue *admin_queue, + u16 command_id, bool capture) +{ + if (unlikely(command_id >= admin_queue->q_depth)) { + netdev_err(admin_queue->ena_dev->net_device, + "Command id is larger than the queue size. cmd_id: %u queue size %d\n", + command_id, admin_queue->q_depth); + return NULL; + } + + if (unlikely(!admin_queue->comp_ctx)) { + netdev_err(admin_queue->ena_dev->net_device, "Completion context is NULL\n"); + return NULL; + } + + if (unlikely(admin_queue->comp_ctx[command_id].occupied && capture)) { + netdev_err(admin_queue->ena_dev->net_device, "Completion context is occupied\n"); + return NULL; + } + + if (capture) { + atomic_inc(&admin_queue->outstanding_cmds); + admin_queue->comp_ctx[command_id].occupied = true; + } + + return &admin_queue->comp_ctx[command_id]; +} + +static struct ena_comp_ctx *__ena_com_submit_admin_cmd(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct ena_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + struct ena_comp_ctx *comp_ctx; + u16 tail_masked, cmd_id; + u16 queue_size_mask; + u16 cnt; + + queue_size_mask = admin_queue->q_depth - 1; + + tail_masked = admin_queue->sq.tail & queue_size_mask; + + /* In case of queue FULL */ + cnt = (u16)atomic_read(&admin_queue->outstanding_cmds); + if (unlikely(cnt >= admin_queue->q_depth)) { + netdev_dbg(admin_queue->ena_dev->net_device, "Admin queue is full.\n"); + admin_queue->stats.out_of_space++; + return ERR_PTR(-ENOSPC); + } + + cmd_id = admin_queue->curr_cmd_id; + + cmd->aq_common_descriptor.flags |= admin_queue->sq.phase & + ENA_ADMIN_AQ_COMMON_DESC_PHASE_MASK; + + cmd->aq_common_descriptor.command_id |= cmd_id & + ENA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; + + comp_ctx = get_comp_ctxt(admin_queue, cmd_id, true); + if (unlikely(!comp_ctx)) + return ERR_PTR(-EINVAL); + + comp_ctx->status = ENA_CMD_SUBMITTED; + comp_ctx->comp_size = (u32)comp_size_in_bytes; + comp_ctx->user_cqe = comp; + comp_ctx->cmd_opcode = 
cmd->aq_common_descriptor.opcode; + + reinit_completion(&comp_ctx->wait_event); + + memcpy(&admin_queue->sq.entries[tail_masked], cmd, cmd_size_in_bytes); + + admin_queue->curr_cmd_id = (admin_queue->curr_cmd_id + 1) & + queue_size_mask; + + admin_queue->sq.tail++; + admin_queue->stats.submitted_cmd++; + + if (unlikely((admin_queue->sq.tail & queue_size_mask) == 0)) + admin_queue->sq.phase = !admin_queue->sq.phase; + + writel(admin_queue->sq.tail, admin_queue->sq.db_addr); + + return comp_ctx; +} + +static int ena_com_init_comp_ctxt(struct ena_com_admin_queue *admin_queue) +{ + struct ena_com_dev *ena_dev = admin_queue->ena_dev; + size_t size = admin_queue->q_depth * sizeof(struct ena_comp_ctx); + struct ena_comp_ctx *comp_ctx; + u16 i; + + admin_queue->comp_ctx = devm_kzalloc(admin_queue->q_dmadev, size, GFP_KERNEL); + if (unlikely(!admin_queue->comp_ctx)) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); + return -ENOMEM; + } + + for (i = 0; i < admin_queue->q_depth; i++) { + comp_ctx = get_comp_ctxt(admin_queue, i, false); + if (comp_ctx) + init_completion(&comp_ctx->wait_event); + } + + return 0; +} + +static struct ena_comp_ctx *ena_com_submit_admin_cmd(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct ena_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + unsigned long flags = 0; + struct ena_comp_ctx *comp_ctx; + + spin_lock_irqsave(&admin_queue->q_lock, flags); + if (unlikely(!admin_queue->running_state)) { + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + return ERR_PTR(-ENODEV); + } + comp_ctx = __ena_com_submit_admin_cmd(admin_queue, cmd, + cmd_size_in_bytes, + comp, + comp_size_in_bytes); + if (IS_ERR(comp_ctx)) + admin_queue->running_state = false; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + return comp_ctx; +} + +static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx, + struct ena_com_io_sq *io_sq) +{ + size_t size; + int dev_node = 0; + + memset(&io_sq->desc_addr, 0x0, sizeof(io_sq->desc_addr)); + + io_sq->dma_addr_bits = (u8)ena_dev->dma_addr_bits; + io_sq->desc_entry_size = + (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) ? 
+ sizeof(struct ena_eth_io_tx_desc) : + sizeof(struct ena_eth_io_rx_desc); + + size = io_sq->desc_entry_size * io_sq->q_depth; + io_sq->bus = ena_dev->bus; + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) { + dev_node = dev_to_node(ena_dev->dmadev); + set_dev_node(ena_dev->dmadev, ctx->numa_node); + io_sq->desc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, &io_sq->desc_addr.phys_addr, + GFP_KERNEL); + set_dev_node(ena_dev->dmadev, dev_node); + if (!io_sq->desc_addr.virt_addr) { + io_sq->desc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, + &io_sq->desc_addr.phys_addr, GFP_KERNEL); + } + + if (unlikely(!io_sq->desc_addr.virt_addr)) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); + return -ENOMEM; + } + } + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* Allocate bounce buffers */ + io_sq->bounce_buf_ctrl.buffer_size = + ena_dev->llq_info.desc_list_entry_size; + io_sq->bounce_buf_ctrl.buffers_num = + ENA_COM_BOUNCE_BUFFER_CNTRL_CNT; + io_sq->bounce_buf_ctrl.next_to_use = 0; + + size = (size_t)io_sq->bounce_buf_ctrl.buffer_size * + io_sq->bounce_buf_ctrl.buffers_num; + + dev_node = dev_to_node(ena_dev->dmadev); + set_dev_node(ena_dev->dmadev, ctx->numa_node); + io_sq->bounce_buf_ctrl.base_buffer = devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); + set_dev_node(ena_dev->dmadev, dev_node); + if (!io_sq->bounce_buf_ctrl.base_buffer) + io_sq->bounce_buf_ctrl.base_buffer = + devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); + + if (unlikely(!io_sq->bounce_buf_ctrl.base_buffer)) { + netdev_err(ena_dev->net_device, "Bounce buffer memory allocation failed\n"); + return -ENOMEM; + } + + memcpy(&io_sq->llq_info, &ena_dev->llq_info, + sizeof(io_sq->llq_info)); + + /* Initiate the first bounce buffer */ + io_sq->llq_buf_ctrl.curr_bounce_buf = + ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, io_sq->llq_info.desc_list_entry_size); + io_sq->llq_buf_ctrl.descs_left_in_line = + io_sq->llq_info.descs_num_before_header; + io_sq->disable_meta_caching = + io_sq->llq_info.disable_meta_caching; + + if (io_sq->llq_info.max_entries_in_tx_burst > 0) + io_sq->entries_in_tx_burst_left = + io_sq->llq_info.max_entries_in_tx_burst; + } + + io_sq->tail = 0; + io_sq->next_to_comp = 0; + io_sq->phase = 1; + + return 0; +} + +static int ena_com_init_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx, + struct ena_com_io_cq *io_cq) +{ + size_t size; + int prev_node = 0; + + memset(&io_cq->cdesc_addr, 0x0, sizeof(io_cq->cdesc_addr)); + + /* Use the basic completion descriptor for Rx */ + io_cq->cdesc_entry_size_in_bytes = + (io_cq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) ? 
+ sizeof(struct ena_eth_io_tx_cdesc) : + sizeof(struct ena_eth_io_rx_cdesc_base); + + size = io_cq->cdesc_entry_size_in_bytes * io_cq->q_depth; + io_cq->bus = ena_dev->bus; + + prev_node = dev_to_node(ena_dev->dmadev); + set_dev_node(ena_dev->dmadev, ctx->numa_node); + io_cq->cdesc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, &io_cq->cdesc_addr.phys_addr, GFP_KERNEL); + set_dev_node(ena_dev->dmadev, prev_node); + if (!io_cq->cdesc_addr.virt_addr) { + io_cq->cdesc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, &io_cq->cdesc_addr.phys_addr, + GFP_KERNEL); + } + + if (unlikely(!io_cq->cdesc_addr.virt_addr)) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); + return -ENOMEM; + } + + io_cq->phase = 1; + io_cq->head = 0; + + return 0; +} + +static void ena_com_handle_single_admin_completion(struct ena_com_admin_queue *admin_queue, + struct ena_admin_acq_entry *cqe) +{ + struct ena_comp_ctx *comp_ctx; + u16 cmd_id; + + cmd_id = cqe->acq_common_descriptor.command & + ENA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK; + + comp_ctx = get_comp_ctxt(admin_queue, cmd_id, false); + if (unlikely(!comp_ctx)) { + netdev_err(admin_queue->ena_dev->net_device, + "comp_ctx is NULL. Changing the admin queue running state\n"); + admin_queue->running_state = false; + return; + } + + if (!comp_ctx->occupied) + return; + + comp_ctx->status = ENA_CMD_COMPLETED; + comp_ctx->comp_status = cqe->acq_common_descriptor.status; + + if (comp_ctx->user_cqe) + memcpy(comp_ctx->user_cqe, (void *)cqe, comp_ctx->comp_size); + + if (!admin_queue->polling) + complete(&comp_ctx->wait_event); +} + +static void ena_com_handle_admin_completion(struct ena_com_admin_queue *admin_queue) +{ + struct ena_admin_acq_entry *cqe = NULL; + u16 comp_num = 0; + u16 head_masked; + u8 phase; + + head_masked = admin_queue->cq.head & (admin_queue->q_depth - 1); + phase = admin_queue->cq.phase; + + cqe = &admin_queue->cq.entries[head_masked]; + + /* Go over all the completions */ + while ((READ_ONCE(cqe->acq_common_descriptor.flags) & + ENA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK) == phase) { + /* Do not read the rest of the completion entry before the + * phase bit was validated + */ + dma_rmb(); + ena_com_handle_single_admin_completion(admin_queue, cqe); + + head_masked++; + comp_num++; + if (unlikely(head_masked == admin_queue->q_depth)) { + head_masked = 0; + phase = !phase; + } + + cqe = &admin_queue->cq.entries[head_masked]; + } + + admin_queue->cq.head += comp_num; + admin_queue->cq.phase = phase; + admin_queue->sq.head += comp_num; + admin_queue->stats.completed_cmd += comp_num; +} + +static int ena_com_comp_status_to_errno(struct ena_com_admin_queue *admin_queue, + u8 comp_status) +{ + if (unlikely(comp_status != 0)) + netdev_err(admin_queue->ena_dev->net_device, "Admin command failed[%u]\n", + comp_status); + + switch (comp_status) { + case ENA_ADMIN_SUCCESS: + return 0; + case ENA_ADMIN_RESOURCE_ALLOCATION_FAILURE: + return -ENOMEM; + case ENA_ADMIN_UNSUPPORTED_OPCODE: + return -EOPNOTSUPP; + case ENA_ADMIN_BAD_OPCODE: + case ENA_ADMIN_MALFORMED_REQUEST: + case ENA_ADMIN_ILLEGAL_PARAMETER: + case ENA_ADMIN_UNKNOWN_ERROR: + return -EINVAL; + case ENA_ADMIN_RESOURCE_BUSY: + return -EAGAIN; + } + + return -EINVAL; +} + +static void ena_delay_exponential_backoff_us(u32 exp, u32 delay_us) +{ + exp = min_t(u32, ENA_MAX_BACKOFF_DELAY_EXP, exp); + delay_us = max_t(u32, ENA_MIN_ADMIN_POLL_US, delay_us); + delay_us = min_t(u32, ENA_MAX_ADMIN_POLL_US, delay_us * (1U << exp)); + usleep_range(delay_us, 2 * delay_us); +} 
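/*
 * Illustrative sketch (editorial, not part of this patch): the backoff helper
 * above clamps the exponent, enforces a minimum poll interval, scales the
 * delay by 2^exp, caps it, and then sleeps somewhere in [delay, 2 * delay].
 * The self-contained user-space program below models only that shape; the
 * FAKE_* constants are assumptions standing in for ENA_MIN_ADMIN_POLL_US,
 * ENA_MAX_ADMIN_POLL_US and ENA_MAX_BACKOFF_DELAY_EXP, whose real values
 * live in the driver headers.
 */
#include <stdint.h>
#include <stdio.h>

#define FAKE_MIN_POLL_US	100U	/* assumed floor for the first retry */
#define FAKE_MAX_POLL_US	5000U	/* assumed cap on a single sleep */
#define FAKE_MAX_BACKOFF_EXP	16U	/* assumed clamp on the exponent */

static uint32_t fake_backoff_delay_us(uint32_t exp, uint32_t delay_us)
{
	if (exp > FAKE_MAX_BACKOFF_EXP)
		exp = FAKE_MAX_BACKOFF_EXP;
	if (delay_us < FAKE_MIN_POLL_US)
		delay_us = FAKE_MIN_POLL_US;
	delay_us *= 1U << exp;			/* exponential growth per retry */
	if (delay_us > FAKE_MAX_POLL_US)
		delay_us = FAKE_MAX_POLL_US;	/* bounded, so retries stay cheap */
	return delay_us;
}

int main(void)
{
	/* Print the sleep window a polling loop would use on each retry. */
	for (uint32_t exp = 0; exp < 8; exp++) {
		uint32_t d = fake_backoff_delay_us(exp, 0);

		printf("retry %u: sleep %u..%u us\n", exp, d, 2 * d);
	}
	return 0;
}
/* The polling loops that follow (admin CQ polling, reset-state wait, abort
 * wait) pass an incrementing exponent, so a slow device costs progressively
 * longer, but bounded, sleeps per iteration.
 */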
+ +static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_ctx, + struct ena_com_admin_queue *admin_queue) +{ + unsigned long flags = 0; + unsigned long timeout; + int ret; + u32 exp = 0; + + timeout = jiffies + usecs_to_jiffies(admin_queue->completion_timeout); + + while (1) { + spin_lock_irqsave(&admin_queue->q_lock, flags); + ena_com_handle_admin_completion(admin_queue); + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + if (comp_ctx->status != ENA_CMD_SUBMITTED) + break; + + if (unlikely(time_is_before_jiffies(timeout))) { + netdev_err(admin_queue->ena_dev->net_device, + "Wait for completion (polling) timeout\n"); + /* ENA didn't have any completion */ + spin_lock_irqsave(&admin_queue->q_lock, flags); + admin_queue->stats.no_completion++; + admin_queue->running_state = false; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + ret = -ETIME; + goto err; + } + + ena_delay_exponential_backoff_us(exp++, + admin_queue->ena_dev->ena_min_poll_delay_us); + } + + if (unlikely(comp_ctx->status == ENA_CMD_ABORTED)) { + netdev_err(admin_queue->ena_dev->net_device, "Command was aborted\n"); + spin_lock_irqsave(&admin_queue->q_lock, flags); + admin_queue->stats.aborted_cmd++; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + ret = -ENODEV; + goto err; + } + + ret = ena_com_comp_status_to_errno(admin_queue, comp_ctx->comp_status); +err: + comp_ctxt_release(admin_queue, comp_ctx); + return ret; +} + +/* + * Set the LLQ configurations of the firmware + * + * The driver provides only the enabled feature values to the device, + * which in turn, checks if they are supported. + */ +static int ena_com_set_llq(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + struct ena_com_llq_info *llq_info = &ena_dev->llq_info; + int ret; + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.feat_common.feature_id = ENA_ADMIN_LLQ; + + cmd.u.llq.header_location_ctrl_enabled = llq_info->header_location_ctrl; + cmd.u.llq.entry_size_ctrl_enabled = llq_info->desc_list_entry_size_ctrl; + cmd.u.llq.desc_num_before_header_enabled = llq_info->descs_num_before_header; + cmd.u.llq.descriptors_stride_ctrl_enabled = llq_info->desc_stride_ctrl; + + cmd.u.llq.accel_mode.u.set.enabled_flags = + BIT(ENA_ADMIN_DISABLE_META_CACHING) | + BIT(ENA_ADMIN_LIMIT_TX_BURST); + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, "Failed to set LLQ configurations: %d\n", ret); + + return ret; +} + +static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq_features, + struct ena_llq_configurations *llq_default_cfg) +{ + struct ena_com_llq_info *llq_info = &ena_dev->llq_info; + struct ena_admin_accel_mode_get llq_accel_mode_get; + u16 supported_feat; + int rc; + + memset(llq_info, 0, sizeof(*llq_info)); + + supported_feat = llq_features->header_location_ctrl_supported; + + if (likely(supported_feat & llq_default_cfg->llq_header_location)) { + llq_info->header_location_ctrl = + llq_default_cfg->llq_header_location; + } else { + netdev_err(ena_dev->net_device, + "Invalid header location control, supported: 0x%x\n", supported_feat); + return -EINVAL; + } + + if (likely(llq_info->header_location_ctrl == 
ENA_ADMIN_INLINE_HEADER)) { + supported_feat = llq_features->descriptors_stride_ctrl_supported; + if (likely(supported_feat & llq_default_cfg->llq_stride_ctrl)) { + llq_info->desc_stride_ctrl = llq_default_cfg->llq_stride_ctrl; + } else { + if (supported_feat & ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY) { + llq_info->desc_stride_ctrl = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY; + } else if (supported_feat & ENA_ADMIN_SINGLE_DESC_PER_ENTRY) { + llq_info->desc_stride_ctrl = ENA_ADMIN_SINGLE_DESC_PER_ENTRY; + } else { + netdev_err(ena_dev->net_device, + "Invalid desc_stride_ctrl, supported: 0x%x\n", + supported_feat); + return -EINVAL; + } + + netdev_err(ena_dev->net_device, + "Default llq stride ctrl is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", + llq_default_cfg->llq_stride_ctrl, supported_feat, + llq_info->desc_stride_ctrl); + } + } else { + llq_info->desc_stride_ctrl = 0; + } + + supported_feat = llq_features->entry_size_ctrl_supported; + if (likely(supported_feat & llq_default_cfg->llq_ring_entry_size)) { + llq_info->desc_list_entry_size_ctrl = llq_default_cfg->llq_ring_entry_size; + llq_info->desc_list_entry_size = llq_default_cfg->llq_ring_entry_size_value; + } else { + if (supported_feat & ENA_ADMIN_LIST_ENTRY_SIZE_128B) { + llq_info->desc_list_entry_size_ctrl = ENA_ADMIN_LIST_ENTRY_SIZE_128B; + llq_info->desc_list_entry_size = 128; + } else if (supported_feat & ENA_ADMIN_LIST_ENTRY_SIZE_192B) { + llq_info->desc_list_entry_size_ctrl = ENA_ADMIN_LIST_ENTRY_SIZE_192B; + llq_info->desc_list_entry_size = 192; + } else if (supported_feat & ENA_ADMIN_LIST_ENTRY_SIZE_256B) { + llq_info->desc_list_entry_size_ctrl = ENA_ADMIN_LIST_ENTRY_SIZE_256B; + llq_info->desc_list_entry_size = 256; + } else { + netdev_err(ena_dev->net_device, + "Invalid entry_size_ctrl, supported: 0x%x\n", supported_feat); + return -EINVAL; + } + + netdev_err(ena_dev->net_device, + "Default llq ring entry size is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", + llq_default_cfg->llq_ring_entry_size, supported_feat, + llq_info->desc_list_entry_size); + } + if (unlikely(llq_info->desc_list_entry_size & 0x7)) { + /* The desc list entry size should be whole multiply of 8 + * This requirement comes from __iowrite64_copy() + */ + netdev_err(ena_dev->net_device, "Illegal entry size %d\n", + llq_info->desc_list_entry_size); + return -EINVAL; + } + + if (llq_info->desc_stride_ctrl == ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY) + llq_info->descs_per_entry = llq_info->desc_list_entry_size / + sizeof(struct ena_eth_io_tx_desc); + else + llq_info->descs_per_entry = 1; + + supported_feat = llq_features->desc_num_before_header_supported; + if (likely(supported_feat & llq_default_cfg->llq_num_decs_before_header)) { + llq_info->descs_num_before_header = llq_default_cfg->llq_num_decs_before_header; + } else { + if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2) { + llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2; + } else if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_1) { + llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_1; + } else if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_4) { + llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_4; + } else if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_8) { + llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_8; + } else { + netdev_err(ena_dev->net_device, + "Invalid descs_num_before_header, 
supported: 0x%x\n", + supported_feat); + return -EINVAL; + } + + netdev_err(ena_dev->net_device, + "Default llq num descs before header is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", + llq_default_cfg->llq_num_decs_before_header, supported_feat, + llq_info->descs_num_before_header); + } + /* Check for accelerated queue supported */ + llq_accel_mode_get = llq_features->accel_mode.u.get; + + llq_info->disable_meta_caching = + !!(llq_accel_mode_get.supported_flags & + BIT(ENA_ADMIN_DISABLE_META_CACHING)); + + if (llq_accel_mode_get.supported_flags & BIT(ENA_ADMIN_LIMIT_TX_BURST)) + llq_info->max_entries_in_tx_burst = + llq_accel_mode_get.max_tx_burst_size / + llq_default_cfg->llq_ring_entry_size_value; + + rc = ena_com_set_llq(ena_dev); + if (unlikely(rc)) + netdev_err(ena_dev->net_device, "Cannot set LLQ configuration: %d\n", rc); + + return rc; +} + +static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *comp_ctx, + struct ena_com_admin_queue *admin_queue) +{ + unsigned long flags = 0; + int ret; + + wait_for_completion_timeout(&comp_ctx->wait_event, + usecs_to_jiffies(admin_queue->completion_timeout)); + + /* In case the command wasn't completed find out the root cause. + * There might be 2 kinds of errors + * 1) No completion (timeout reached) + * 2) There is completion but the device didn't get any msi-x interrupt. + */ + if (unlikely(comp_ctx->status == ENA_CMD_SUBMITTED)) { + spin_lock_irqsave(&admin_queue->q_lock, flags); + ena_com_handle_admin_completion(admin_queue); + admin_queue->stats.no_completion++; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + if (comp_ctx->status == ENA_CMD_COMPLETED) { + admin_queue->is_missing_admin_interrupt = true; + netdev_err(admin_queue->ena_dev->net_device, + "The ena device sent a completion but the driver didn't receive a MSI-X interrupt (cmd %d), autopolling mode is %s\n", + comp_ctx->cmd_opcode, admin_queue->auto_polling ? "ON" : "OFF"); + /* Check if fallback to polling is enabled */ + if (admin_queue->auto_polling) + admin_queue->polling = true; + } else { + netdev_err(admin_queue->ena_dev->net_device, + "The ena device didn't send a completion for the admin cmd %d status %d\n", + comp_ctx->cmd_opcode, comp_ctx->status); + } + /* Check if shifted to polling mode. + * This will happen if there is a completion without an interrupt + * and autopolling mode is enabled. 
Continuing normal execution in such case + */ + if (!admin_queue->polling) { + admin_queue->running_state = false; + ret = -ETIME; + goto err; + } + } else if (unlikely(comp_ctx->status == ENA_CMD_ABORTED)) { + netdev_err(admin_queue->ena_dev->net_device, "Command was aborted\n"); + spin_lock_irqsave(&admin_queue->q_lock, flags); + admin_queue->stats.aborted_cmd++; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + ret = -ENODEV; + goto err; + } + + ret = ena_com_comp_status_to_errno(admin_queue, comp_ctx->comp_status); +err: + comp_ctxt_release(admin_queue, comp_ctx); + return ret; +} + +/* This method read the hardware device register through posting writes + * and waiting for response + * On timeout the function will return ENA_MMIO_READ_TIMEOUT + */ +static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + volatile struct ena_admin_ena_mmio_req_read_less_resp *read_resp = + mmio_read->read_resp; + u32 mmio_read_reg, ret, i; + unsigned long flags = 0; + u32 timeout = mmio_read->reg_read_to; + + might_sleep(); + + if (timeout == 0) + timeout = ENA_REG_READ_TIMEOUT; + + /* If readless is disabled, perform regular read */ + if (!mmio_read->readless_supported) + return readl(ena_dev->reg_bar + offset); + + spin_lock_irqsave(&mmio_read->lock, flags); + mmio_read->seq_num++; + + read_resp->req_id = mmio_read->seq_num + 0xDEAD; + mmio_read_reg = (offset << ENA_REGS_MMIO_REG_READ_REG_OFF_SHIFT) & + ENA_REGS_MMIO_REG_READ_REG_OFF_MASK; + mmio_read_reg |= mmio_read->seq_num & + ENA_REGS_MMIO_REG_READ_REQ_ID_MASK; + + writel(mmio_read_reg, ena_dev->reg_bar + ENA_REGS_MMIO_REG_READ_OFF); + + for (i = 0; i < timeout; i++) { + if (READ_ONCE(read_resp->req_id) == mmio_read->seq_num) + break; + + udelay(1); + } + + if (unlikely(i == timeout)) { + netdev_err(ena_dev->net_device, + "Reading reg failed for timeout. expected: req id[%u] offset[%u] actual: req id[%u] offset[%u]\n", + mmio_read->seq_num, offset, read_resp->req_id, read_resp->reg_off); + ret = ENA_MMIO_READ_TIMEOUT; + goto err; + } + + if (unlikely(read_resp->reg_off != offset)) { + netdev_err(ena_dev->net_device, "Read failure: wrong offset provided\n"); + ret = ENA_MMIO_READ_TIMEOUT; + } else { + ret = read_resp->reg_val; + } +err: + spin_unlock_irqrestore(&mmio_read->lock, flags); + + return ret; +} + +/* There are two types to wait for completion. + * Polling mode - wait until the completion is available. + * Async mode - wait on wait queue until the completion is ready + * (or the timeout expired). + * It is expected that the IRQ called ena_com_handle_admin_completion + * to mark the completions. 
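+ * (ena_com_wait_and_process_admin_cq() below simply checks
+ * admin_queue->polling and dispatches to the matching helper.)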
+ */ +static int ena_com_wait_and_process_admin_cq(struct ena_comp_ctx *comp_ctx, + struct ena_com_admin_queue *admin_queue) +{ + if (admin_queue->polling) + return ena_com_wait_and_process_admin_cq_polling(comp_ctx, + admin_queue); + + return ena_com_wait_and_process_admin_cq_interrupts(comp_ctx, + admin_queue); +} + +static int ena_com_destroy_io_sq(struct ena_com_dev *ena_dev, + struct ena_com_io_sq *io_sq) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_destroy_sq_cmd destroy_cmd; + struct ena_admin_acq_destroy_sq_resp_desc destroy_resp; + u8 direction; + int ret; + + memset(&destroy_cmd, 0x0, sizeof(destroy_cmd)); + + if (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) + direction = ENA_ADMIN_SQ_DIRECTION_TX; + else + direction = ENA_ADMIN_SQ_DIRECTION_RX; + + destroy_cmd.sq.sq_identity |= (direction << + ENA_ADMIN_SQ_SQ_DIRECTION_SHIFT) & + ENA_ADMIN_SQ_SQ_DIRECTION_MASK; + + destroy_cmd.sq.sq_idx = io_sq->idx; + destroy_cmd.aq_common_descriptor.opcode = ENA_ADMIN_DESTROY_SQ; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&destroy_cmd, + sizeof(destroy_cmd), + (struct ena_admin_acq_entry *)&destroy_resp, + sizeof(destroy_resp)); + + if (unlikely(ret && (ret != -ENODEV))) + netdev_err(ena_dev->net_device, "Failed to destroy io sq error: %d\n", ret); + + return ret; +} + +static void ena_com_io_queue_free(struct ena_com_dev *ena_dev, + struct ena_com_io_sq *io_sq, + struct ena_com_io_cq *io_cq) +{ + size_t size; + + if (io_cq->cdesc_addr.virt_addr) { + size = io_cq->cdesc_entry_size_in_bytes * io_cq->q_depth; + + dma_free_coherent(ena_dev->dmadev, size, io_cq->cdesc_addr.virt_addr, + io_cq->cdesc_addr.phys_addr); + + io_cq->cdesc_addr.virt_addr = NULL; + } + + if (io_sq->desc_addr.virt_addr) { + size = io_sq->desc_entry_size * io_sq->q_depth; + + dma_free_coherent(ena_dev->dmadev, size, io_sq->desc_addr.virt_addr, + io_sq->desc_addr.phys_addr); + + io_sq->desc_addr.virt_addr = NULL; + } + + if (io_sq->bounce_buf_ctrl.base_buffer) { + devm_kfree(ena_dev->dmadev, io_sq->bounce_buf_ctrl.base_buffer); + io_sq->bounce_buf_ctrl.base_buffer = NULL; + } +} + +static int wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout, + u16 exp_state) +{ + u32 val, exp = 0; + unsigned long timeout_stamp; + + /* Convert timeout from resolution of 100ms to us resolution. 
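+ * For example, a timeout value of 3 corresponds to 3 * 100 * 1000 =
+ * 300,000 us before the conversion to jiffies.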
*/ + timeout_stamp = jiffies + usecs_to_jiffies(100 * 1000 * timeout); + + while (1) { + val = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); + + if (unlikely(val == ENA_MMIO_READ_TIMEOUT)) { + netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); + return -ETIME; + } + + if ((val & ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK) == + exp_state) + return 0; + + if (unlikely(time_is_before_jiffies(timeout_stamp))) + return -ETIME; + + ena_delay_exponential_backoff_us(exp++, ena_dev->ena_min_poll_delay_us); + } +} + +static bool ena_com_check_supported_feature_id(struct ena_com_dev *ena_dev, + enum ena_admin_aq_feature_id feature_id) +{ + u32 feature_mask = 1 << feature_id; + + /* Device attributes is always supported */ + if ((feature_id != ENA_ADMIN_DEVICE_ATTRIBUTES) && + !(ena_dev->supported_features & feature_mask)) + return false; + + return true; +} + +static int ena_com_get_feature_ex(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *get_resp, + enum ena_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size, + u8 feature_ver) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_get_feat_cmd get_cmd; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, feature_id)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", feature_id); + return -EOPNOTSUPP; + } + + memset(&get_cmd, 0x0, sizeof(get_cmd)); + admin_queue = &ena_dev->admin_queue; + + get_cmd.aq_common_descriptor.opcode = ENA_ADMIN_GET_FEATURE; + + if (control_buff_size) + get_cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + else + get_cmd.aq_common_descriptor.flags = 0; + + ret = ena_com_mem_addr_set(ena_dev, + &get_cmd.control_buffer.address, + control_buf_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + get_cmd.control_buffer.length = control_buff_size; + get_cmd.feat_common.feature_version = feature_ver; + get_cmd.feat_common.feature_id = feature_id; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *) + &get_cmd, + sizeof(get_cmd), + (struct ena_admin_acq_entry *) + get_resp, + sizeof(*get_resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to submit get_feature command %d error: %d\n", feature_id, ret); + + return ret; +} + +static int ena_com_get_feature(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *get_resp, + enum ena_admin_aq_feature_id feature_id, + u8 feature_ver) +{ + return ena_com_get_feature_ex(ena_dev, + get_resp, + feature_id, + 0, + 0, + feature_ver); +} + +int ena_com_get_current_hash_function(struct ena_com_dev *ena_dev) +{ + return ena_dev->rss.hash_func; +} + +static void ena_com_hash_key_fill_default_key(struct ena_com_dev *ena_dev) +{ + struct ena_admin_feature_rss_flow_hash_control *hash_key = + (ena_dev->rss).hash_key; + + netdev_rss_key_fill(&hash_key->key, sizeof(hash_key->key)); + /* The key buffer is stored in the device in an array of + * uint32 elements. 
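+ * key_parts therefore counts u32 words, i.e. the key size in bytes
+ * divided by sizeof(u32).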
+ */ + hash_key->key_parts = ENA_ADMIN_RSS_KEY_PARTS; +} + +static int ena_com_hash_key_allocate(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_RSS_HASH_FUNCTION)) + return -EOPNOTSUPP; + + rss->hash_key = dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), + &rss->hash_key_dma_addr, GFP_KERNEL); + + if (unlikely(!rss->hash_key)) + return -ENOMEM; + + return 0; +} + +static void ena_com_hash_key_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (rss->hash_key) + dma_free_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), rss->hash_key, + rss->hash_key_dma_addr); + rss->hash_key = NULL; +} + +static int ena_com_hash_ctrl_init(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + rss->hash_ctrl = dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), + &rss->hash_ctrl_dma_addr, GFP_KERNEL); + + if (unlikely(!rss->hash_ctrl)) + return -ENOMEM; + + return 0; +} + +static void ena_com_hash_ctrl_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (rss->hash_ctrl) + dma_free_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), rss->hash_ctrl, + rss->hash_ctrl_dma_addr); + rss->hash_ctrl = NULL; +} + +static int ena_com_indirect_table_allocate(struct ena_com_dev *ena_dev, + u16 log_size) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + size_t tbl_size; + int ret; + + ret = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG, 0); + if (unlikely(ret)) + return ret; + + if ((get_resp.u.ind_table.min_size > log_size) || + (get_resp.u.ind_table.max_size < log_size)) { + netdev_err(ena_dev->net_device, + "Indirect table size doesn't fit. 
requested size: %d while min is:%d and max %d\n", + 1 << log_size, 1 << get_resp.u.ind_table.min_size, + 1 << get_resp.u.ind_table.max_size); + return -EINVAL; + } + + tbl_size = (1ULL << log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + rss->rss_ind_tbl = dma_zalloc_coherent(ena_dev->dmadev, tbl_size, + &rss->rss_ind_tbl_dma_addr, GFP_KERNEL); + if (unlikely(!rss->rss_ind_tbl)) + goto mem_err1; + + tbl_size = (1ULL << log_size) * sizeof(u16); + rss->host_rss_ind_tbl = devm_kzalloc(ena_dev->dmadev, tbl_size, GFP_KERNEL); + if (unlikely(!rss->host_rss_ind_tbl)) + goto mem_err2; + + rss->tbl_log_size = log_size; + + return 0; + +mem_err2: + tbl_size = (1ULL << log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + dma_free_coherent(ena_dev->dmadev, tbl_size, rss->rss_ind_tbl, rss->rss_ind_tbl_dma_addr); + rss->rss_ind_tbl = NULL; +mem_err1: + rss->tbl_log_size = 0; + return -ENOMEM; +} + +static void ena_com_indirect_table_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + size_t tbl_size = (1ULL << rss->tbl_log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + if (rss->rss_ind_tbl) + dma_free_coherent(ena_dev->dmadev, tbl_size, rss->rss_ind_tbl, + rss->rss_ind_tbl_dma_addr); + rss->rss_ind_tbl = NULL; + + if (rss->host_rss_ind_tbl) + devm_kfree(ena_dev->dmadev, rss->host_rss_ind_tbl); + rss->host_rss_ind_tbl = NULL; +} + +static int ena_com_create_io_sq(struct ena_com_dev *ena_dev, + struct ena_com_io_sq *io_sq, u16 cq_idx) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_create_sq_cmd create_cmd; + struct ena_admin_acq_create_sq_resp_desc cmd_completion; + u8 direction; + int ret; + + memset(&create_cmd, 0x0, sizeof(create_cmd)); + + create_cmd.aq_common_descriptor.opcode = ENA_ADMIN_CREATE_SQ; + + if (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) + direction = ENA_ADMIN_SQ_DIRECTION_TX; + else + direction = ENA_ADMIN_SQ_DIRECTION_RX; + + create_cmd.sq_identity |= (direction << + ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_SHIFT) & + ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_MASK; + + create_cmd.sq_caps_2 |= io_sq->mem_queue_type & + ENA_ADMIN_AQ_CREATE_SQ_CMD_PLACEMENT_POLICY_MASK; + + create_cmd.sq_caps_2 |= (ENA_ADMIN_COMPLETION_POLICY_DESC << + ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_SHIFT) & + ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_MASK; + + create_cmd.sq_caps_3 |= + ENA_ADMIN_AQ_CREATE_SQ_CMD_IS_PHYSICALLY_CONTIGUOUS_MASK; + + create_cmd.cq_idx = cq_idx; + create_cmd.sq_depth = io_sq->q_depth; + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) { + ret = ena_com_mem_addr_set(ena_dev, + &create_cmd.sq_ba, + io_sq->desc_addr.phys_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + } + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&create_cmd, + sizeof(create_cmd), + (struct ena_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Failed to create IO SQ. 
error: %d\n", ret); + return ret; + } + + io_sq->idx = cmd_completion.sq_idx; + + io_sq->db_addr = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + (uintptr_t)cmd_completion.sq_doorbell_offset); + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + io_sq->desc_addr.pbuf_dev_addr = + (u8 __iomem *)((uintptr_t)ena_dev->mem_bar + + cmd_completion.llq_descriptors_offset); + } + + netdev_dbg(ena_dev->net_device, "Created sq[%u], depth[%u]\n", io_sq->idx, io_sq->q_depth); + + return ret; +} + +static int ena_com_ind_tbl_convert_to_device(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_com_io_sq *io_sq; + u16 qid; + int i; + + for (i = 0; i < 1 << rss->tbl_log_size; i++) { + qid = rss->host_rss_ind_tbl[i]; + if (qid >= ENA_TOTAL_NUM_QUEUES) + return -EINVAL; + + io_sq = &ena_dev->io_sq_queues[qid]; + + if (io_sq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX) + return -EINVAL; + + rss->rss_ind_tbl[i].cq_idx = io_sq->idx; + } + + return 0; +} + +static void ena_com_update_intr_delay_resolution(struct ena_com_dev *ena_dev, + u16 intr_delay_resolution) +{ + u16 prev_intr_delay_resolution = ena_dev->intr_delay_resolution; + + if (unlikely(!intr_delay_resolution)) { + netdev_err(ena_dev->net_device, + "Illegal intr_delay_resolution provided. Going to use default 1 usec resolution\n"); + intr_delay_resolution = ENA_DEFAULT_INTR_DELAY_RESOLUTION; + } + + /* update Rx */ + ena_dev->intr_moder_rx_interval = + ena_dev->intr_moder_rx_interval * + prev_intr_delay_resolution / + intr_delay_resolution; + + /* update Tx */ + ena_dev->intr_moder_tx_interval = + ena_dev->intr_moder_tx_interval * + prev_intr_delay_resolution / + intr_delay_resolution; + + ena_dev->intr_delay_resolution = intr_delay_resolution; +} + +/*****************************************************************************/ +/******************************* API ******************************/ +/*****************************************************************************/ + +int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size, + struct ena_admin_acq_entry *comp, + size_t comp_size) +{ + struct ena_comp_ctx *comp_ctx; + int ret; + + comp_ctx = ena_com_submit_admin_cmd(admin_queue, cmd, cmd_size, + comp, comp_size); + if (IS_ERR(comp_ctx)) { + ret = PTR_ERR(comp_ctx); + if (ret == -ENODEV) + netdev_dbg(admin_queue->ena_dev->net_device, + "Failed to submit command [%d]\n", ret); + else + netdev_err(admin_queue->ena_dev->net_device, + "Failed to submit command [%d]\n", ret); + + return ret; + } + + ret = ena_com_wait_and_process_admin_cq(comp_ctx, admin_queue); + if (unlikely(ret)) { + if (admin_queue->running_state) + netdev_err(admin_queue->ena_dev->net_device, + "Failed to process command. ret = %d\n", ret); + else + netdev_dbg(admin_queue->ena_dev->net_device, + "Failed to process command. 
ret = %d\n", ret); + } + return ret; +} + +int ena_com_create_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_io_cq *io_cq) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_create_cq_cmd create_cmd; + struct ena_admin_acq_create_cq_resp_desc cmd_completion; + int ret; + + memset(&create_cmd, 0x0, sizeof(create_cmd)); + + create_cmd.aq_common_descriptor.opcode = ENA_ADMIN_CREATE_CQ; + + create_cmd.cq_caps_2 |= (io_cq->cdesc_entry_size_in_bytes / 4) & + ENA_ADMIN_AQ_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK; + create_cmd.cq_caps_1 |= + ENA_ADMIN_AQ_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK; + + create_cmd.msix_vector = io_cq->msix_vector; + create_cmd.cq_depth = io_cq->q_depth; + + ret = ena_com_mem_addr_set(ena_dev, + &create_cmd.cq_ba, + io_cq->cdesc_addr.phys_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&create_cmd, + sizeof(create_cmd), + (struct ena_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Failed to create IO CQ. error: %d\n", ret); + return ret; + } + + io_cq->idx = cmd_completion.cq_idx; + + io_cq->unmask_reg = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + cmd_completion.cq_interrupt_unmask_register_offset); + + if (cmd_completion.numa_node_register_offset) + io_cq->numa_node_cfg_reg = + (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + cmd_completion.numa_node_register_offset); + + netdev_dbg(ena_dev->net_device, "Created cq[%u], depth[%u]\n", io_cq->idx, io_cq->q_depth); + + return ret; +} + +int ena_com_get_io_handlers(struct ena_com_dev *ena_dev, u16 qid, + struct ena_com_io_sq **io_sq, + struct ena_com_io_cq **io_cq) +{ + if (unlikely(qid >= ENA_TOTAL_NUM_QUEUES)) { + netdev_err(ena_dev->net_device, "Invalid queue number %d but the max is %d\n", qid, + ENA_TOTAL_NUM_QUEUES); + return -EINVAL; + } + + *io_sq = &ena_dev->io_sq_queues[qid]; + *io_cq = &ena_dev->io_cq_queues[qid]; + + return 0; +} + +void ena_com_abort_admin_commands(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_comp_ctx *comp_ctx; + u16 i; + + if (!admin_queue->comp_ctx) + return; + + for (i = 0; i < admin_queue->q_depth; i++) { + comp_ctx = get_comp_ctxt(admin_queue, i, false); + if (unlikely(!comp_ctx)) + break; + + comp_ctx->status = ENA_CMD_ABORTED; + + complete(&comp_ctx->wait_event); + } +} + +void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + unsigned long flags = 0; + u32 exp = 0; + + spin_lock_irqsave(&admin_queue->q_lock, flags); + while (atomic_read(&admin_queue->outstanding_cmds) != 0) { + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + ena_delay_exponential_backoff_us(exp++, ena_dev->ena_min_poll_delay_us); + spin_lock_irqsave(&admin_queue->q_lock, flags); + } + spin_unlock_irqrestore(&admin_queue->q_lock, flags); +} + +int ena_com_destroy_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_io_cq *io_cq) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_destroy_cq_cmd destroy_cmd; + struct ena_admin_acq_destroy_cq_resp_desc destroy_resp; + int ret; + + memset(&destroy_cmd, 0x0, sizeof(destroy_cmd)); + + destroy_cmd.cq_idx = io_cq->idx; + destroy_cmd.aq_common_descriptor.opcode = ENA_ADMIN_DESTROY_CQ; + + ret = 
ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&destroy_cmd, + sizeof(destroy_cmd), + (struct ena_admin_acq_entry *)&destroy_resp, + sizeof(destroy_resp)); + + if (unlikely(ret && (ret != -ENODEV))) + netdev_err(ena_dev->net_device, "Failed to destroy IO CQ. error: %d\n", ret); + + return ret; +} + +bool ena_com_get_admin_running_state(struct ena_com_dev *ena_dev) +{ + return ena_dev->admin_queue.running_state; +} + +void ena_com_set_admin_running_state(struct ena_com_dev *ena_dev, bool state) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + unsigned long flags = 0; + + spin_lock_irqsave(&admin_queue->q_lock, flags); + ena_dev->admin_queue.running_state = state; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); +} + +void ena_com_admin_aenq_enable(struct ena_com_dev *ena_dev) +{ + u16 depth = ena_dev->aenq.q_depth; + + WARN(ena_dev->aenq.head != depth, "Invalid AENQ state\n"); + + /* Init head_db to mark that all entries in the queue + * are initially available + */ + writel(depth, ena_dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); +} + +int ena_com_set_aenq_config(struct ena_com_dev *ena_dev, u32 groups_flag) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + struct ena_admin_get_feat_resp get_resp; + int ret; + + ret = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_AENQ_CONFIG, 0); + if (unlikely(ret)) { + dev_info(ena_dev->dmadev, "Can't get aenq configuration\n"); + return ret; + } + + if ((get_resp.u.aenq.supported_groups & groups_flag) != groups_flag) { + netdev_warn(ena_dev->net_device, + "Trying to set unsupported aenq events. supported flag: 0x%x asked flag: 0x%x\n", + get_resp.u.aenq.supported_groups, groups_flag); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = 0; + cmd.feat_common.feature_id = ENA_ADMIN_AENQ_CONFIG; + cmd.u.aenq.enabled_groups = groups_flag; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, "Failed to config AENQ ret: %d\n", ret); + + return ret; +} + +int ena_com_get_dma_width(struct ena_com_dev *ena_dev) +{ + u32 caps = ena_com_reg_bar_read32(ena_dev, ENA_REGS_CAPS_OFF); + u32 width; + + if (unlikely(caps == ENA_MMIO_READ_TIMEOUT)) { + netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); + return -ETIME; + } + + width = (caps & ENA_REGS_CAPS_DMA_ADDR_WIDTH_MASK) >> + ENA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT; + + netdev_dbg(ena_dev->net_device, "ENA dma width: %d\n", width); + + if (unlikely((width < 32) || width > ENA_MAX_PHYS_ADDR_SIZE_BITS)) { + netdev_err(ena_dev->net_device, "DMA width illegal value: %d\n", width); + return -EINVAL; + } + + ena_dev->dma_addr_bits = width; + + return width; +} + +int ena_com_validate_version(struct ena_com_dev *ena_dev) +{ + u32 ver; + u32 ctrl_ver; + u32 ctrl_ver_masked; + + /* Make sure the ENA version and the controller version are at least + * as the driver expects + */ + ver = ena_com_reg_bar_read32(ena_dev, ENA_REGS_VERSION_OFF); + ctrl_ver = ena_com_reg_bar_read32(ena_dev, + ENA_REGS_CONTROLLER_VERSION_OFF); + + if (unlikely((ver == ENA_MMIO_READ_TIMEOUT) || (ctrl_ver == ENA_MMIO_READ_TIMEOUT))) { + netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); + return 
-ETIME; + } + + dev_info(ena_dev->dmadev, "ENA device version: %d.%d\n", + (ver & ENA_REGS_VERSION_MAJOR_VERSION_MASK) >> ENA_REGS_VERSION_MAJOR_VERSION_SHIFT, + ver & ENA_REGS_VERSION_MINOR_VERSION_MASK); + + dev_info(ena_dev->dmadev, "ENA controller version: %d.%d.%d implementation version %d\n", + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) >> + ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT, + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) >> + ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT, + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK), + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK) >> + ENA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT); + + ctrl_ver_masked = + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) | + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) | + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK); + + /* Validate the ctrl version without the implementation ID */ + if (ctrl_ver_masked < MIN_ENA_CTRL_VER) { + netdev_err(ena_dev->net_device, + "ENA ctrl version is lower than the minimal ctrl version the driver supports\n"); + return -1; + } + + return 0; +} + +static void +ena_com_free_ena_admin_queue_comp_ctx(struct ena_com_dev *ena_dev, + struct ena_com_admin_queue *admin_queue) + +{ + if (!admin_queue->comp_ctx) + return; + + devm_kfree(ena_dev->dmadev, admin_queue->comp_ctx); + + admin_queue->comp_ctx = NULL; +} + +void ena_com_admin_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_com_admin_cq *cq = &admin_queue->cq; + struct ena_com_admin_sq *sq = &admin_queue->sq; + struct ena_com_aenq *aenq = &ena_dev->aenq; + u16 size; + + ena_com_free_ena_admin_queue_comp_ctx(ena_dev, admin_queue); + + size = ADMIN_SQ_SIZE(admin_queue->q_depth); + if (sq->entries) + dma_free_coherent(ena_dev->dmadev, size, sq->entries, sq->dma_addr); + sq->entries = NULL; + + size = ADMIN_CQ_SIZE(admin_queue->q_depth); + if (cq->entries) + dma_free_coherent(ena_dev->dmadev, size, cq->entries, cq->dma_addr); + cq->entries = NULL; + + size = ADMIN_AENQ_SIZE(aenq->q_depth); + if (ena_dev->aenq.entries) + dma_free_coherent(ena_dev->dmadev, size, aenq->entries, aenq->dma_addr); + aenq->entries = NULL; +} + +void ena_com_set_admin_polling_mode(struct ena_com_dev *ena_dev, bool polling) +{ + u32 mask_value = 0; + + if (polling) + mask_value = ENA_REGS_ADMIN_INTR_MASK; + + writel(mask_value, ena_dev->reg_bar + ENA_REGS_INTR_MASK_OFF); + ena_dev->admin_queue.polling = polling; +} + +bool ena_com_get_admin_polling_mode(struct ena_com_dev *ena_dev) +{ + return ena_dev->admin_queue.polling; +} + +void ena_com_set_admin_auto_polling_mode(struct ena_com_dev *ena_dev, + bool polling) +{ + ena_dev->admin_queue.auto_polling = polling; +} + +bool ena_com_phc_supported(struct ena_com_dev *ena_dev) +{ + return ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_PHC_CONFIG); +} + +int ena_com_phc_init(struct ena_com_dev *ena_dev) +{ + struct ena_com_phc_info *phc = &ena_dev->phc; + + memset(phc, 0x0, sizeof(*phc)); + + /* Allocate shared mem used PHC timestamp retrieved from device */ + phc->virt_addr = dma_zalloc_coherent(ena_dev->dmadev, sizeof(*phc->virt_addr), + &phc->phys_addr, GFP_KERNEL); + if (unlikely(!phc->virt_addr)) + return -ENOMEM; + + spin_lock_init(&phc->lock); + + phc->virt_addr->req_id = 0; + phc->virt_addr->timestamp = 0; + + return 0; +} + +int ena_com_phc_config(struct ena_com_dev *ena_dev) +{ + struct ena_com_phc_info *phc = 
&ena_dev->phc; + struct ena_admin_get_feat_resp get_feat_resp; + struct ena_admin_set_feat_resp set_feat_resp; + struct ena_admin_set_feat_cmd set_feat_cmd; + int ret = 0; + + /* Get default device PHC configuration */ + ret = ena_com_get_feature(ena_dev, + &get_feat_resp, + ENA_ADMIN_PHC_CONFIG, + ENA_ADMIN_PHC_FEATURE_VERSION_0); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed to get PHC feature configuration, error: %d\n", ret); + return ret; + } + + /* Supporting only PHC V0 (readless mode with error bound) */ + if (get_feat_resp.u.phc.version != ENA_ADMIN_PHC_FEATURE_VERSION_0) { + netdev_err(ena_dev->net_device, "Unsupprted PHC version (0x%X), error: %d\n", + get_feat_resp.u.phc.version, -EOPNOTSUPP); + return -EOPNOTSUPP; + } + + /* Update PHC doorbell offset according to device value, used to write req_id to PHC bar */ + phc->doorbell_offset = get_feat_resp.u.phc.doorbell_offset; + + /* Update PHC expire timeout according to device or default driver value */ + phc->expire_timeout_usec = (get_feat_resp.u.phc.expire_timeout_usec) ? + get_feat_resp.u.phc.expire_timeout_usec : + ENA_PHC_DEFAULT_EXPIRE_TIMEOUT_USEC; + + /* Update PHC block timeout according to device or default driver value */ + phc->block_timeout_usec = (get_feat_resp.u.phc.block_timeout_usec) ? + get_feat_resp.u.phc.block_timeout_usec : + ENA_PHC_DEFAULT_BLOCK_TIMEOUT_USEC; + + /* Sanity check - expire timeout must not exceed block timeout */ + if (phc->expire_timeout_usec > phc->block_timeout_usec) + phc->expire_timeout_usec = phc->block_timeout_usec; + + /* Prepare PHC config feature command */ + memset(&set_feat_cmd, 0x0, sizeof(set_feat_cmd)); + set_feat_cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + set_feat_cmd.feat_common.feature_id = ENA_ADMIN_PHC_CONFIG; + set_feat_cmd.u.phc.output_length = sizeof(*phc->virt_addr); + ret = ena_com_mem_addr_set(ena_dev, &set_feat_cmd.u.phc.output_address, phc->phys_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Failed setting PHC output address, error: %d\n", + ret); + return ret; + } + + /* Send PHC feature command to the device */ + ret = ena_com_execute_admin_command(&ena_dev->admin_queue, + (struct ena_admin_aq_entry *)&set_feat_cmd, + sizeof(set_feat_cmd), + (struct ena_admin_acq_entry *)&set_feat_resp, + sizeof(set_feat_resp)); + + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Failed to enable PHC, error: %d\n", ret); + return ret; + } + + phc->active = true; + netdev_dbg(ena_dev->net_device, "PHC is active in the device\n"); + + return ret; +} + +void ena_com_phc_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_com_phc_info *phc = &ena_dev->phc; + unsigned long flags = 0; + + /* In case PHC is not supported by the device, silently exiting */ + if (!phc->virt_addr) + return; + + spin_lock_irqsave(&phc->lock, flags); + phc->active = false; + spin_unlock_irqrestore(&phc->lock, flags); + + dma_free_coherent(ena_dev->dmadev, sizeof(*phc->virt_addr), phc->virt_addr, phc->phys_addr); + phc->virt_addr = NULL; +} + +int ena_com_phc_get_timestamp(struct ena_com_dev *ena_dev, u64 *timestamp) +{ + volatile struct ena_admin_phc_resp *read_resp = ena_dev->phc.virt_addr; + const ktime_t zero_system_time = ktime_set(0, 0); + struct ena_com_phc_info *phc = &ena_dev->phc; + ktime_t expire_time; + ktime_t block_time; + unsigned long flags = 0; + int ret = 0; + + if (!phc->active) { + netdev_err(ena_dev->net_device, "PHC feature is not active in the device\n"); + return -EOPNOTSUPP; + } + + spin_lock_irqsave(&phc->lock, flags); 
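+ /* Flow under the lock: handle a possible blocked state, bump req_id,
+ * pre-set the shared response req_id to a non-matching value, write
+ * req_id to the PHC doorbell, then poll until the device echoes the
+ * same req_id back along with timestamp, error_bound and error_flags.
+ */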
+ + /* Check if PHC is in blocked state */ + if (unlikely(ktime_compare(phc->system_time, zero_system_time))) { + /* Check if blocking time expired */ + block_time = ktime_add_us(phc->system_time, phc->block_timeout_usec); + if (!ktime_after(ktime_get(), block_time)) { + /* PHC is still in blocked state, skip PHC request */ + phc->stats.phc_skp++; + ret = -EBUSY; + goto skip; + } + + /* PHC is in active state, update statistics according to req_id and error_flags */ + if ((READ_ONCE(read_resp->req_id) != phc->req_id) || + (read_resp->error_flags & ENA_PHC_ERROR_FLAGS)) { + /* Device didn't update req_id during blocking time or timestamp is invalid, + * this indicates on a device error + */ + phc->stats.phc_err++; + } else { + /* Device updated req_id during blocking time with valid timestamp */ + phc->stats.phc_exp++; + } + } + + /* Setting relative timeouts */ + phc->system_time = ktime_get(); + block_time = ktime_add_us(phc->system_time, phc->block_timeout_usec); + expire_time = ktime_add_us(phc->system_time, phc->expire_timeout_usec); + + /* We expect the device to return this req_id once the new PHC timestamp is updated */ + phc->req_id++; + + /* Initialize PHC shared memory with different req_id value to be able to identify once the + * device changes it to req_id + */ + read_resp->req_id = phc->req_id + ENA_PHC_REQ_ID_OFFSET; + + /* Writing req_id to PHC bar */ + writel(phc->req_id, ena_dev->reg_bar + phc->doorbell_offset); + + /* Stalling until the device updates req_id */ + while (1) { + if (unlikely(ktime_after(ktime_get(), expire_time))) { + /* Gave up waiting for updated req_id, PHC enters into blocked state until + * passing blocking time, during this time any get PHC timestamp or + * error bound requests will fail with device busy error + */ + phc->error_bound = ENA_PHC_MAX_ERROR_BOUND; + ret = -EBUSY; + break; + } + + /* Check if req_id was updated by the device */ + if (READ_ONCE(read_resp->req_id) != phc->req_id) { + /* req_id was not updated by the device yet, check again on next loop */ + continue; + } + + /* req_id was updated by the device which indicates that PHC timestamp, error_bound + * and error_flags are updated too, checking errors before retrieving timestamp and + * error_bound values + */ + if (unlikely(read_resp->error_flags & ENA_PHC_ERROR_FLAGS)) { + /* Retrieved timestamp or error bound errors, PHC enters into blocked state + * until passing blocking time, during this time any get PHC timestamp or + * error bound requests will fail with device busy error + */ + phc->error_bound = ENA_PHC_MAX_ERROR_BOUND; + ret = -EBUSY; + break; + } + + /* PHC timestamp value is returned to the caller */ + *timestamp = read_resp->timestamp; + + /* Error bound value is cached for future retrieval by caller */ + phc->error_bound = read_resp->error_bound; + + /* Update statistic on valid PHC timestamp retrieval */ + phc->stats.phc_cnt++; + + /* This indicates PHC state is active */ + phc->system_time = zero_system_time; + break; + } + +skip: + spin_unlock_irqrestore(&phc->lock, flags); + + return ret; +} + +int ena_com_phc_get_error_bound(struct ena_com_dev *ena_dev, u32 *error_bound) +{ + struct ena_com_phc_info *phc = &ena_dev->phc; + u32 local_error_bound = phc->error_bound; + + if (!phc->active) { + netdev_err(ena_dev->net_device, "PHC feature is not active in the device\n"); + return -EOPNOTSUPP; + } + + if (local_error_bound == ENA_PHC_MAX_ERROR_BOUND) + return -EBUSY; + + *error_bound = local_error_bound; + + return 0; +} + +int ena_com_mmio_reg_read_request_init(struct 
ena_com_dev *ena_dev) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + + spin_lock_init(&mmio_read->lock); + mmio_read->read_resp = dma_zalloc_coherent(ena_dev->dmadev, sizeof(*mmio_read->read_resp), + &mmio_read->read_resp_dma_addr, GFP_KERNEL); + if (unlikely(!mmio_read->read_resp)) + goto err; + + ena_com_mmio_reg_read_request_write_dev_addr(ena_dev); + + mmio_read->read_resp->req_id = 0x0; + mmio_read->seq_num = 0x0; + mmio_read->readless_supported = true; + + return 0; + +err: + + return -ENOMEM; +} + +void ena_com_set_mmio_read_mode(struct ena_com_dev *ena_dev, bool readless_supported) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + + mmio_read->readless_supported = readless_supported; +} + +void ena_com_mmio_reg_read_request_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + + writel(0x0, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_LO_OFF); + writel(0x0, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_HI_OFF); + + dma_free_coherent(ena_dev->dmadev, sizeof(*mmio_read->read_resp), mmio_read->read_resp, + mmio_read->read_resp_dma_addr); + + mmio_read->read_resp = NULL; +} + +void ena_com_mmio_reg_read_request_write_dev_addr(struct ena_com_dev *ena_dev) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + u32 addr_low, addr_high; + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(mmio_read->read_resp_dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(mmio_read->read_resp_dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_HI_OFF); +} + +int ena_com_admin_init(struct ena_com_dev *ena_dev, + struct ena_aenq_handlers *aenq_handlers) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + u32 aq_caps, acq_caps, dev_sts, addr_low, addr_high; + int ret; + + dev_sts = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); + + if (unlikely(dev_sts == ENA_MMIO_READ_TIMEOUT)) { + netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); + return -ETIME; + } + + if (!(dev_sts & ENA_REGS_DEV_STS_READY_MASK)) { + netdev_err(ena_dev->net_device, "Device isn't ready, abort com init\n"); + return -ENODEV; + } + + admin_queue->q_depth = ENA_ADMIN_QUEUE_DEPTH; + + admin_queue->bus = ena_dev->bus; + admin_queue->q_dmadev = ena_dev->dmadev; + admin_queue->polling = false; + admin_queue->curr_cmd_id = 0; + + atomic_set(&admin_queue->outstanding_cmds, 0); + + spin_lock_init(&admin_queue->q_lock); + + ret = ena_com_init_comp_ctxt(admin_queue); + if (unlikely(ret)) + goto error; + + ret = ena_com_admin_init_sq(admin_queue); + if (unlikely(ret)) + goto error; + + ret = ena_com_admin_init_cq(admin_queue); + if (unlikely(ret)) + goto error; + + admin_queue->sq.db_addr = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + ENA_REGS_AQ_DB_OFF); + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(admin_queue->sq.dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(admin_queue->sq.dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_AQ_BASE_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_AQ_BASE_HI_OFF); + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(admin_queue->cq.dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(admin_queue->cq.dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_ACQ_BASE_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_ACQ_BASE_HI_OFF); + + aq_caps = 0; + aq_caps |= admin_queue->q_depth & ENA_REGS_AQ_CAPS_AQ_DEPTH_MASK; + aq_caps |= (sizeof(struct ena_admin_aq_entry) << + 
ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT) & + ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK; + + acq_caps = 0; + acq_caps |= admin_queue->q_depth & ENA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK; + acq_caps |= (sizeof(struct ena_admin_acq_entry) << + ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT) & + ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK; + + writel(aq_caps, ena_dev->reg_bar + ENA_REGS_AQ_CAPS_OFF); + writel(acq_caps, ena_dev->reg_bar + ENA_REGS_ACQ_CAPS_OFF); + ret = ena_com_admin_init_aenq(ena_dev, aenq_handlers); + if (unlikely(ret)) + goto error; + + admin_queue->ena_dev = ena_dev; + admin_queue->running_state = true; + admin_queue->is_missing_admin_interrupt = false; + + return 0; +error: + ena_com_admin_destroy(ena_dev); + + return ret; +} + +int ena_com_create_io_queue(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx) +{ + struct ena_com_io_sq *io_sq; + struct ena_com_io_cq *io_cq; + int ret; + + if (unlikely(ctx->qid >= ENA_TOTAL_NUM_QUEUES)) { + netdev_err(ena_dev->net_device, "Qid (%d) is bigger than max num of queues (%d)\n", + ctx->qid, ENA_TOTAL_NUM_QUEUES); + return -EINVAL; + } + + io_sq = &ena_dev->io_sq_queues[ctx->qid]; + io_cq = &ena_dev->io_cq_queues[ctx->qid]; + + memset(io_sq, 0x0, sizeof(*io_sq)); + memset(io_cq, 0x0, sizeof(*io_cq)); + + /* Init CQ */ + io_cq->q_depth = ctx->queue_size; + io_cq->direction = ctx->direction; + io_cq->qid = ctx->qid; + + io_cq->msix_vector = ctx->msix_vector; + + io_sq->q_depth = ctx->queue_size; + io_sq->direction = ctx->direction; + io_sq->qid = ctx->qid; + + io_sq->mem_queue_type = ctx->mem_queue_type; + + if (ctx->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) + /* header length is limited to 8 bits */ + io_sq->tx_max_header_size = min_t(u32, ena_dev->tx_max_header_size, SZ_256); + + ret = ena_com_init_io_sq(ena_dev, ctx, io_sq); + if (unlikely(ret)) + goto error; + ret = ena_com_init_io_cq(ena_dev, ctx, io_cq); + if (unlikely(ret)) + goto error; + + ret = ena_com_create_io_cq(ena_dev, io_cq); + if (unlikely(ret)) + goto error; + + ret = ena_com_create_io_sq(ena_dev, io_sq, io_cq->idx); + if (unlikely(ret)) + goto destroy_io_cq; + + return 0; + +destroy_io_cq: + ena_com_destroy_io_cq(ena_dev, io_cq); +error: + ena_com_io_queue_free(ena_dev, io_sq, io_cq); + return ret; +} + +void ena_com_destroy_io_queue(struct ena_com_dev *ena_dev, u16 qid) +{ + struct ena_com_io_sq *io_sq; + struct ena_com_io_cq *io_cq; + + if (unlikely(qid >= ENA_TOTAL_NUM_QUEUES)) { + netdev_err(ena_dev->net_device, "Qid (%d) is bigger than max num of queues (%d)\n", + qid, ENA_TOTAL_NUM_QUEUES); + return; + } + + io_sq = &ena_dev->io_sq_queues[qid]; + io_cq = &ena_dev->io_cq_queues[qid]; + + ena_com_destroy_io_sq(ena_dev, io_sq); + ena_com_destroy_io_cq(ena_dev, io_cq); + + ena_com_io_queue_free(ena_dev, io_sq, io_cq); +} + +int ena_com_get_link_params(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *resp) +{ + return ena_com_get_feature(ena_dev, resp, ENA_ADMIN_LINK_CONFIG, 0); +} + +static int ena_get_dev_stats(struct ena_com_dev *ena_dev, + struct ena_com_stats_ctx *ctx, + enum ena_admin_get_stats_type type) +{ + struct ena_admin_acq_get_stats_resp *get_resp = &ctx->get_resp; + struct ena_admin_aq_get_stats_cmd *get_cmd = &ctx->get_cmd; + struct ena_com_admin_queue *admin_queue; + int ret; + + admin_queue = &ena_dev->admin_queue; + + get_cmd->aq_common_descriptor.opcode = ENA_ADMIN_GET_STATS; + get_cmd->aq_common_descriptor.flags = 0; + get_cmd->type = type; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)get_cmd, + 
sizeof(*get_cmd), + (struct ena_admin_acq_entry *)get_resp, + sizeof(*get_resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, "Failed to get stats. error: %d\n", ret); + + return ret; +} + +static void ena_com_set_supported_customer_metrics(struct ena_com_dev *ena_dev) +{ + struct ena_customer_metrics *customer_metrics; + struct ena_com_stats_ctx ctx; + int ret; + + customer_metrics = &ena_dev->customer_metrics; + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_CUSTOMER_METRICS)) { + customer_metrics->supported_metrics = ENA_ADMIN_CUSTOMER_METRICS_MIN_SUPPORT_MASK; + return; + } + + memset(&ctx, 0x0, sizeof(ctx)); + ctx.get_cmd.requested_metrics = ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK; + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS); + if (likely(ret == 0)) + customer_metrics->supported_metrics = + ctx.get_resp.u.customer_metrics.reported_metrics; + else + netdev_err(ena_dev->net_device, + "Failed to query customer metrics support. error: %d\n", ret); +} + +int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + struct ena_admin_get_feat_resp get_resp; + int rc; + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_DEVICE_ATTRIBUTES, 0); + if (rc) + return rc; + + memcpy(&get_feat_ctx->dev_attr, &get_resp.u.dev_attr, + sizeof(get_resp.u.dev_attr)); + + ena_dev->supported_features = get_resp.u.dev_attr.supported_features; + ena_dev->capabilities = get_resp.u.dev_attr.capabilities; + + if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_MAX_QUEUES_EXT, + ENA_FEATURE_MAX_QUEUE_EXT_VER); + if (rc) + return rc; + + if (get_resp.u.max_queue_ext.version != ENA_FEATURE_MAX_QUEUE_EXT_VER) + return -EINVAL; + + memcpy(&get_feat_ctx->max_queue_ext, &get_resp.u.max_queue_ext, + sizeof(get_resp.u.max_queue_ext)); + ena_dev->tx_max_header_size = + get_resp.u.max_queue_ext.max_queue_ext.max_tx_header_size; + } else { + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_MAX_QUEUES_NUM, 0); + memcpy(&get_feat_ctx->max_queues, &get_resp.u.max_queue, + sizeof(get_resp.u.max_queue)); + ena_dev->tx_max_header_size = + get_resp.u.max_queue.max_header_size; + + if (rc) + return rc; + } + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_AENQ_CONFIG, 0); + if (rc) + return rc; + + memcpy(&get_feat_ctx->aenq, &get_resp.u.aenq, + sizeof(get_resp.u.aenq)); + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_STATELESS_OFFLOAD_CONFIG, 0); + if (rc) + return rc; + + memcpy(&get_feat_ctx->offload, &get_resp.u.offload, + sizeof(get_resp.u.offload)); + + /* Driver hints isn't mandatory admin command. 
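
For orientation, a minimal bring-up sketch (not part of the patch; the wrapper name and error labels are hypothetical) showing the ordering these routines assume: readless MMIO reads first, then the admin/AENQ queues, then feature discovery.

/* Hypothetical bring-up ordering; assumes #include "ena_com.h". */
static int example_ena_com_bring_up(struct ena_com_dev *ena_dev,
                                    struct ena_aenq_handlers *aenq_handlers)
{
        struct ena_com_dev_get_features_ctx feat_ctx;
        int rc;

        /* Readless register reads must be working before anything else. */
        rc = ena_com_mmio_reg_read_request_init(ena_dev);
        if (rc)
                return rc;

        /* Admin SQ/CQ plus the asynchronous event queue. */
        rc = ena_com_admin_init(ena_dev, aenq_handlers);
        if (rc)
                goto err_mmio;

        /* Device attributes, queue limits, offloads, hw hints, LLQ. */
        rc = ena_com_get_dev_attr_feat(ena_dev, &feat_ctx);
        if (rc)
                goto err_admin;

        return 0;

err_admin:
        ena_com_admin_destroy(ena_dev);
err_mmio:
        ena_com_mmio_reg_read_request_destroy(ena_dev);
        return rc;
}
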
So in case the + * command isn't supported set driver hints to 0 + */ + rc = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_HW_HINTS, 0); + + if (!rc) + memcpy(&get_feat_ctx->hw_hints, &get_resp.u.hw_hints, sizeof(get_resp.u.hw_hints)); + else if (rc == -EOPNOTSUPP) + memset(&get_feat_ctx->hw_hints, 0x0, sizeof(get_feat_ctx->hw_hints)); + else + return rc; + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_LLQ, ENA_ADMIN_LLQ_FEATURE_VERSION_1); + if (!rc) + memcpy(&get_feat_ctx->llq, &get_resp.u.llq, sizeof(get_resp.u.llq)); + else if (rc == -EOPNOTSUPP) + memset(&get_feat_ctx->llq, 0x0, sizeof(get_feat_ctx->llq)); + else + return rc; + + ena_com_set_supported_customer_metrics(ena_dev); + + return 0; +} + +void ena_com_admin_q_comp_intr_handler(struct ena_com_dev *ena_dev) +{ + ena_com_handle_admin_completion(&ena_dev->admin_queue); +} + +/* ena_handle_specific_aenq_event: + * return the handler that is relevant to the specific event group + */ +static ena_aenq_handler ena_com_get_specific_aenq_cb(struct ena_com_dev *ena_dev, + u16 group) +{ + struct ena_aenq_handlers *aenq_handlers = ena_dev->aenq.aenq_handlers; + + if ((group < ENA_MAX_HANDLERS) && aenq_handlers->handlers[group]) + return aenq_handlers->handlers[group]; + + return aenq_handlers->unimplemented_handler; +} + +/* ena_aenq_intr_handler: + * handles the aenq incoming events. + * pop events from the queue and apply the specific handler + */ +void ena_com_aenq_intr_handler(struct ena_com_dev *ena_dev, void *data) +{ + struct ena_admin_aenq_entry *aenq_e; + struct ena_admin_aenq_common_desc *aenq_common; + struct ena_com_aenq *aenq = &ena_dev->aenq; + u64 timestamp; + ena_aenq_handler handler_cb; + u16 masked_head, processed = 0; + u8 phase; + + masked_head = aenq->head & (aenq->q_depth - 1); + phase = aenq->phase; + aenq_e = &aenq->entries[masked_head]; /* Get first entry */ + aenq_common = &aenq_e->aenq_common_desc; + + /* Go over all the events */ + while ((READ_ONCE(aenq_common->flags) & ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase) { + /* When the phase bit of the AENQ descriptor aligns with the driver's phase bit, + * it signifies the readiness of the entire AENQ descriptor. + * The driver should proceed to read the descriptor's data only after confirming + * and synchronizing the phase bit. + * This memory fence guarantees the correct sequence of accesses to the + * descriptor's memory. + */ + dma_rmb(); + + timestamp = (u64)aenq_common->timestamp_low | + ((u64)aenq_common->timestamp_high << 32); + + netdev_dbg(ena_dev->net_device, "AENQ! 
Group[%x] Syndrome[%x] timestamp: [%llus]\n", + aenq_common->group, aenq_common->syndrome, timestamp); + + /* Handle specific event*/ + handler_cb = ena_com_get_specific_aenq_cb(ena_dev, + aenq_common->group); + handler_cb(data, aenq_e); /* call the actual event handler*/ + + /* Get next event entry */ + masked_head++; + processed++; + + if (unlikely(masked_head == aenq->q_depth)) { + masked_head = 0; + phase = !phase; + } + aenq_e = &aenq->entries[masked_head]; + aenq_common = &aenq_e->aenq_common_desc; + } + + aenq->head += processed; + aenq->phase = phase; + + /* Don't update aenq doorbell if there weren't any processed events */ + if (!processed) + return; + + /* write the aenq doorbell after all AENQ descriptors were read */ + mb(); + writel_relaxed((u32)aenq->head, ena_dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); +#ifndef MMIOWB_NOT_DEFINED + mmiowb(); +#endif +} + +bool ena_com_aenq_has_keep_alive(struct ena_com_dev *ena_dev) +{ + struct ena_admin_aenq_common_desc *aenq_common; + struct ena_com_aenq *aenq = &ena_dev->aenq; + struct ena_admin_aenq_entry *aenq_e; + u8 phase = aenq->phase; + u16 masked_head; + + masked_head = aenq->head & (aenq->q_depth - 1); + aenq_e = &aenq->entries[masked_head]; /* Get first entry */ + aenq_common = &aenq_e->aenq_common_desc; + + /* Go over all the events */ + while ((READ_ONCE(aenq_common->flags) & ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase) { + /* When the phase bit of the AENQ descriptor aligns with the driver's phase bit, + * it signifies the readiness of the entire AENQ descriptor. + * The driver should proceed to read the descriptor's data only after confirming + * and synchronizing the phase bit. + * This memory fence guarantees the correct sequence of accesses to the + * descriptor's memory. + */ + dma_rmb(); + + if (aenq_common->group == ENA_ADMIN_KEEP_ALIVE) + return true; + + /* Get next event entry */ + masked_head++; + + if (unlikely(masked_head == aenq->q_depth)) { + masked_head = 0; + phase = !phase; + } + + aenq_e = &aenq->entries[masked_head]; + aenq_common = &aenq_e->aenq_common_desc; + } + + return false; +} + +int ena_com_dev_reset(struct ena_com_dev *ena_dev, + enum ena_regs_reset_reason_types reset_reason) +{ + u32 reset_reason_msb, reset_reason_lsb; + u32 stat, timeout, cap, reset_val; + int rc; + + stat = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); + cap = ena_com_reg_bar_read32(ena_dev, ENA_REGS_CAPS_OFF); + + if (unlikely((stat == ENA_MMIO_READ_TIMEOUT) || (cap == ENA_MMIO_READ_TIMEOUT))) { + netdev_err(ena_dev->net_device, "Reg read32 timeout occurred\n"); + return -ETIME; + } + + if ((stat & ENA_REGS_DEV_STS_READY_MASK) == 0) { + netdev_err(ena_dev->net_device, "Device isn't ready, can't reset device\n"); + return -EINVAL; + } + + timeout = (cap & ENA_REGS_CAPS_RESET_TIMEOUT_MASK) >> + ENA_REGS_CAPS_RESET_TIMEOUT_SHIFT; + if (timeout == 0) { + netdev_err(ena_dev->net_device, "Invalid timeout value\n"); + return -EINVAL; + } + + /* start reset */ + reset_val = ENA_REGS_DEV_CTL_DEV_RESET_MASK; + + /* For backward compatibility, device will interpret + * bits 24-27 as MSB, bits 28-31 as LSB + */ + reset_reason_lsb = ENA_FIELD_GET(reset_reason, ENA_RESET_REASON_LSB_MASK, + ENA_RESET_REASON_LSB_OFFSET); + + reset_reason_msb = ENA_FIELD_GET(reset_reason, ENA_RESET_REASON_MSB_MASK, + ENA_RESET_REASON_MSB_OFFSET); + + reset_val |= reset_reason_lsb << ENA_REGS_DEV_CTL_RESET_REASON_SHIFT; + + if (ena_com_get_cap(ena_dev, ENA_ADMIN_EXTENDED_RESET_REASONS)) + reset_val |= reset_reason_msb << 
ENA_REGS_DEV_CTL_RESET_REASON_EXT_SHIFT; + else if (reset_reason_msb) { + /* In case the device does not support intended + * extended reset reason fallback to generic + */ + reset_val = ENA_REGS_DEV_CTL_DEV_RESET_MASK; + reset_val |= (ENA_REGS_RESET_GENERIC << ENA_REGS_DEV_CTL_RESET_REASON_SHIFT) & + ENA_REGS_DEV_CTL_RESET_REASON_MASK; + } + writel(reset_val, ena_dev->reg_bar + ENA_REGS_DEV_CTL_OFF); + + /* Write again the MMIO read request address */ + ena_com_mmio_reg_read_request_write_dev_addr(ena_dev); + + rc = wait_for_reset_state(ena_dev, timeout, + ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK); + if (unlikely(rc)) { + netdev_err(ena_dev->net_device, "Reset indication didn't turn on\n"); + return rc; + } + + /* reset done */ + writel(0, ena_dev->reg_bar + ENA_REGS_DEV_CTL_OFF); + rc = wait_for_reset_state(ena_dev, timeout, 0); + if (unlikely(rc)) { + netdev_err(ena_dev->net_device, "Reset indication didn't turn off\n"); + return rc; + } + + timeout = (cap & ENA_REGS_CAPS_ADMIN_CMD_TO_MASK) >> + ENA_REGS_CAPS_ADMIN_CMD_TO_SHIFT; + if (timeout) + /* the resolution of timeout reg is 100ms */ + ena_dev->admin_queue.completion_timeout = timeout * 100000; + else + ena_dev->admin_queue.completion_timeout = ADMIN_CMD_TIMEOUT_US; + + return 0; +} + +int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, + struct ena_admin_eni_stats *stats) +{ + struct ena_com_stats_ctx ctx; + int ret; + + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_ENI_STATS)) { + netdev_err(ena_dev->net_device, "Capability %d isn't supported\n", + ENA_ADMIN_ENI_STATS); + return -EOPNOTSUPP; + } + + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_ENI); + if (likely(ret == 0)) + memcpy(stats, &ctx.get_resp.u.eni_stats, + sizeof(ctx.get_resp.u.eni_stats)); + + return ret; +} + +int ena_com_get_ena_srd_info(struct ena_com_dev *ena_dev, + struct ena_admin_ena_srd_info *info) +{ + struct ena_com_stats_ctx ctx; + int ret; + + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { + netdev_err(ena_dev->net_device, "Capability %d isn't supported\n", + ENA_ADMIN_ENA_SRD_INFO); + return -EOPNOTSUPP; + } + + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_ENA_SRD); + if (likely(ret == 0)) + memcpy(info, &ctx.get_resp.u.ena_srd_info, + sizeof(ctx.get_resp.u.ena_srd_info)); + + return ret; +} + +int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, + struct ena_admin_basic_stats *stats) +{ + struct ena_com_stats_ctx ctx; + int ret; + + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_BASIC); + if (likely(ret == 0)) + memcpy(stats, &ctx.get_resp.u.basic_stats, + sizeof(ctx.get_resp.u.basic_stats)); + + return ret; +} + +int ena_com_get_customer_metrics(struct ena_com_dev *ena_dev, char *buffer, u32 len) +{ + struct ena_admin_aq_get_stats_cmd *get_cmd; + struct ena_com_stats_ctx ctx; + int ret; + + if (unlikely(len > ena_dev->customer_metrics.buffer_len)) { + netdev_err(ena_dev->net_device, + "Invalid buffer size %u. 
The given buffer is too big.\n", len); + return -EINVAL; + } + + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_CUSTOMER_METRICS)) { + netdev_err(ena_dev->net_device, "Capability %d not supported.\n", + ENA_ADMIN_CUSTOMER_METRICS); + return -EOPNOTSUPP; + } + + if (!ena_dev->customer_metrics.supported_metrics) { + netdev_err(ena_dev->net_device, "No supported customer metrics.\n"); + return -EOPNOTSUPP; + } + + get_cmd = &ctx.get_cmd; + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_com_mem_addr_set(ena_dev, + &get_cmd->u.control_buffer.address, + ena_dev->customer_metrics.buffer_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed.\n"); + return ret; + } + + get_cmd->u.control_buffer.length = ena_dev->customer_metrics.buffer_len; + get_cmd->requested_metrics = ena_dev->customer_metrics.supported_metrics; + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS); + if (likely(ret == 0)) + memcpy(buffer, ena_dev->customer_metrics.buffer_virt_addr, len); + else + netdev_err(ena_dev->net_device, "Failed to get customer metrics. error: %d\n", ret); + + return ret; +} + +int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_MTU)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", ENA_ADMIN_MTU); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = 0; + cmd.feat_common.feature_id = ENA_ADMIN_MTU; + cmd.u.mtu.mtu = mtu; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, "Failed to set mtu %d. 
error: %d\n", mtu, ret); + + return ret; +} + +int ena_com_get_offload_settings(struct ena_com_dev *ena_dev, + struct ena_admin_feature_offload_desc *offload) +{ + int ret; + struct ena_admin_get_feat_resp resp; + + ret = ena_com_get_feature(ena_dev, &resp, + ENA_ADMIN_STATELESS_OFFLOAD_CONFIG, 0); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Failed to get offload capabilities %d\n", ret); + return ret; + } + + memcpy(offload, &resp.u.offload, sizeof(resp.u.offload)); + + return 0; +} + +int ena_com_set_hash_function(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + struct ena_admin_get_feat_resp get_resp; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_RSS_HASH_FUNCTION)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_RSS_HASH_FUNCTION); + return -EOPNOTSUPP; + } + + /* Validate hash function is supported */ + ret = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION, 0); + if (unlikely(ret)) + return ret; + + if (!(get_resp.u.flow_hash_func.supported_func & BIT(rss->hash_func))) { + netdev_err(ena_dev->net_device, "Func hash %d isn't supported by device, abort\n", + rss->hash_func); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + cmd.feat_common.feature_id = ENA_ADMIN_RSS_HASH_FUNCTION; + cmd.u.flow_hash_func.init_val = rss->hash_init_val; + cmd.u.flow_hash_func.selected_func = 1 << rss->hash_func; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.control_buffer.address, + rss->hash_key_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + cmd.control_buffer.length = sizeof(*rss->hash_key); + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Failed to set hash function %d. 
error: %d\n", + rss->hash_func, ret); + return -EINVAL; + } + + return 0; +} + +int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, + enum ena_admin_hash_functions func, + const u8 *key, u16 key_len, u32 init_val) +{ + struct ena_admin_feature_rss_flow_hash_control *hash_key; + struct ena_admin_get_feat_resp get_resp; + enum ena_admin_hash_functions old_func; + struct ena_rss *rss = &ena_dev->rss; + int rc; + + hash_key = rss->hash_key; + + /* Make sure size is a mult of DWs */ + if (unlikely(key_len & 0x3)) + return -EINVAL; + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION, + rss->hash_key_dma_addr, + sizeof(*rss->hash_key), 0); + if (unlikely(rc)) + return rc; + + if (!(BIT(func) & get_resp.u.flow_hash_func.supported_func)) { + netdev_err(ena_dev->net_device, "Flow hash function %d isn't supported\n", func); + return -EOPNOTSUPP; + } + + if ((func == ENA_ADMIN_TOEPLITZ) && key) { + if (key_len != sizeof(hash_key->key)) { + netdev_err(ena_dev->net_device, + "key len (%u) doesn't equal the supported size (%zu)\n", key_len, + sizeof(hash_key->key)); + return -EINVAL; + } + memcpy(hash_key->key, key, key_len); + hash_key->key_parts = key_len / sizeof(hash_key->key[0]); + } + + rss->hash_init_val = init_val; + old_func = rss->hash_func; + rss->hash_func = func; + rc = ena_com_set_hash_function(ena_dev); + + /* Restore the old function */ + if (unlikely(rc)) + rss->hash_func = old_func; + + return rc; +} + +int ena_com_get_hash_function(struct ena_com_dev *ena_dev, + enum ena_admin_hash_functions *func) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + int rc; + + if (unlikely(!func)) + return -EINVAL; + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION, + rss->hash_key_dma_addr, + sizeof(*rss->hash_key), 0); + if (unlikely(rc)) + return rc; + + /* ffs() returns 1 in case the lsb is set */ + rss->hash_func = ffs(get_resp.u.flow_hash_func.selected_func); + if (rss->hash_func) + rss->hash_func--; + + *func = rss->hash_func; + + return 0; +} + +int ena_com_get_hash_key(struct ena_com_dev *ena_dev, u8 *key) +{ + struct ena_admin_feature_rss_flow_hash_control *hash_key = + ena_dev->rss.hash_key; + + if (key) + memcpy(key, hash_key->key, + (size_t)(hash_key->key_parts) * sizeof(hash_key->key[0])); + + return 0; +} + +int ena_com_get_hash_ctrl(struct ena_com_dev *ena_dev, + enum ena_admin_flow_hash_proto proto, + u16 *fields) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + int rc; + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_INPUT, + rss->hash_ctrl_dma_addr, + sizeof(*rss->hash_ctrl), 0); + if (unlikely(rc)) + return rc; + + if (fields) + *fields = rss->hash_ctrl->selected_fields[proto].fields; + + return 0; +} + +int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_feature_rss_hash_control *hash_ctrl = rss->hash_ctrl; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_RSS_HASH_INPUT)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_RSS_HASH_INPUT); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + 
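
As a usage sketch for the Toeplitz path in ena_com_fill_hash_function() above: the wrapper name is invented, and it assumes the device advertises Toeplitz and that the caller's key buffer matches the device key size (ENA_HASH_KEY_SIZE in the header added below). Note that on success the fill helper already flushes the key through ena_com_set_hash_function().

/* Sketch: program a Toeplitz hash key; init_val of 0 is a placeholder. */
static int example_set_toeplitz_key(struct ena_com_dev *ena_dev,
                                    const u8 key[ENA_HASH_KEY_SIZE])
{
        /* key_len must be a multiple of 4 and equal the device key size,
         * otherwise ena_com_fill_hash_function() rejects the request.
         */
        return ena_com_fill_hash_function(ena_dev, ENA_ADMIN_TOEPLITZ,
                                          key, ENA_HASH_KEY_SIZE,
                                          0 /* init_val */);
}
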
cmd.feat_common.feature_id = ENA_ADMIN_RSS_HASH_INPUT; + cmd.u.flow_hash_input.enabled_input_sort = + ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_MASK | + ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_MASK; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.control_buffer.address, + rss->hash_ctrl_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + cmd.control_buffer.length = sizeof(*hash_ctrl); + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + if (unlikely(ret)) + netdev_err(ena_dev->net_device, "Failed to set hash input. error: %d\n", ret); + + return ret; +} + +int ena_com_set_default_hash_ctrl(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_feature_rss_hash_control *hash_ctrl = + rss->hash_ctrl; + u16 available_fields = 0; + int rc, i; + + /* Get the supported hash input */ + rc = ena_com_get_hash_ctrl(ena_dev, 0, NULL); + if (unlikely(rc)) + return rc; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_TCP4].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA | + ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_UDP4].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA | + ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_TCP6].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA | + ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_UDP6].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA | + ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_IP4].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_IP6].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_IP4_FRAG].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_NOT_IP].fields = + ENA_ADMIN_RSS_L2_DA | ENA_ADMIN_RSS_L2_SA; + + for (i = 0; i < ENA_ADMIN_RSS_PROTO_NUM; i++) { + available_fields = hash_ctrl->selected_fields[i].fields & + hash_ctrl->supported_fields[i].fields; + if (available_fields != hash_ctrl->selected_fields[i].fields) { + netdev_err(ena_dev->net_device, + "Hash control doesn't support all the desire configuration. proto %x supported %x selected %x\n", + i, hash_ctrl->supported_fields[i].fields, + hash_ctrl->selected_fields[i].fields); + return -EOPNOTSUPP; + } + } + + rc = ena_com_set_hash_ctrl(ena_dev); + + /* In case of failure, restore the old hash ctrl */ + if (unlikely(rc)) + ena_com_get_hash_ctrl(ena_dev, 0, NULL); + + return rc; +} + +int ena_com_fill_hash_ctrl(struct ena_com_dev *ena_dev, + enum ena_admin_flow_hash_proto proto, + u16 hash_fields) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_feature_rss_hash_control *hash_ctrl = rss->hash_ctrl; + u16 supported_fields; + int rc; + + if (proto >= ENA_ADMIN_RSS_PROTO_NUM) { + netdev_err(ena_dev->net_device, "Invalid proto num (%u)\n", proto); + return -EINVAL; + } + + /* Get the ctrl table */ + rc = ena_com_get_hash_ctrl(ena_dev, proto, NULL); + if (unlikely(rc)) + return rc; + + /* Make sure all the fields are supported */ + supported_fields = hash_ctrl->supported_fields[proto].fields; + if ((hash_fields & supported_fields) != hash_fields) { + netdev_err(ena_dev->net_device, + "Proto %d doesn't support the required fields %x. 
supports only: %x\n", + proto, hash_fields, supported_fields); + } + + hash_ctrl->selected_fields[proto].fields = hash_fields; + + rc = ena_com_set_hash_ctrl(ena_dev); + + /* In case of failure, restore the old hash ctrl */ + if (unlikely(rc)) + ena_com_get_hash_ctrl(ena_dev, 0, NULL); + + return 0; +} + +int ena_com_indirect_table_fill_entry(struct ena_com_dev *ena_dev, + u16 entry_idx, u16 entry_value) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (unlikely(entry_idx >= (1 << rss->tbl_log_size))) + return -EINVAL; + + if (unlikely((entry_value > ENA_TOTAL_NUM_QUEUES))) + return -EINVAL; + + rss->host_rss_ind_tbl[entry_idx] = entry_value; + + return 0; +} + +int ena_com_indirect_table_set(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG); + return -EOPNOTSUPP; + } + + ret = ena_com_ind_tbl_convert_to_device(ena_dev); + if (ret) { + netdev_err(ena_dev->net_device, + "Failed to convert host indirection table to device table\n"); + return ret; + } + + memset(&cmd, 0x0, sizeof(cmd)); + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + cmd.feat_common.feature_id = ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG; + cmd.u.ind_table.size = rss->tbl_log_size; + cmd.u.ind_table.inline_index = 0xFFFFFFFF; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.control_buffer.address, + rss->rss_ind_tbl_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + cmd.control_buffer.length = (1ULL << rss->tbl_log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, "Failed to set indirect table. error: %d\n", ret); + + return ret; +} + +int ena_com_indirect_table_get(struct ena_com_dev *ena_dev, u32 *ind_tbl) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + u32 tbl_size; + int i, rc; + + tbl_size = (1ULL << rss->tbl_log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG, + rss->rss_ind_tbl_dma_addr, + tbl_size, 0); + if (unlikely(rc)) + return rc; + + if (!ind_tbl) + return 0; + + for (i = 0; i < (1 << rss->tbl_log_size); i++) + ind_tbl[i] = rss->host_rss_ind_tbl[i]; + + return 0; +} + +int ena_com_rss_init(struct ena_com_dev *ena_dev, u16 indr_tbl_log_size) +{ + int rc; + + memset(&ena_dev->rss, 0x0, sizeof(ena_dev->rss)); + + rc = ena_com_indirect_table_allocate(ena_dev, indr_tbl_log_size); + if (unlikely(rc)) + goto err_indr_tbl; + + /* The following function might return unsupported in case the + * device doesn't support setting the key / hash function. We can safely + * ignore this error and have indirection table support only. 
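
A sketch of how a driver might populate and flush the indirection table with the two helpers above. The round-robin spread over num_queues is only illustrative; the real qid-to-device-queue mapping is driver policy and is not defined by this patch.

/* Sketch: fill the RSS indirection table round-robin and push it to the
 * device. tbl_log_size is the value handed to ena_com_rss_init() and is
 * assumed to be small; num_queues must be non-zero.
 */
static int example_fill_indirection_table(struct ena_com_dev *ena_dev,
                                          u16 tbl_log_size, u16 num_queues)
{
        u32 i;
        int rc;

        for (i = 0; i < (1U << tbl_log_size); i++) {
                rc = ena_com_indirect_table_fill_entry(ena_dev, i,
                                                       i % num_queues);
                if (rc)
                        return rc;
        }

        /* Nothing reaches the device until the table is flushed. */
        return ena_com_indirect_table_set(ena_dev);
}
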
+ */ + rc = ena_com_hash_key_allocate(ena_dev); + if (likely(!rc)) + ena_com_hash_key_fill_default_key(ena_dev); + else if (rc != -EOPNOTSUPP) + goto err_hash_key; + + rc = ena_com_hash_ctrl_init(ena_dev); + if (unlikely(rc)) + goto err_hash_ctrl; + + return 0; + +err_hash_ctrl: + ena_com_hash_key_destroy(ena_dev); +err_hash_key: + ena_com_indirect_table_destroy(ena_dev); +err_indr_tbl: + + return rc; +} + +void ena_com_rss_destroy(struct ena_com_dev *ena_dev) +{ + ena_com_indirect_table_destroy(ena_dev); + ena_com_hash_key_destroy(ena_dev); + ena_com_hash_ctrl_destroy(ena_dev); + + memset(&ena_dev->rss, 0x0, sizeof(ena_dev->rss)); +} + +int ena_com_allocate_host_info(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + host_attr->host_info = dma_zalloc_coherent(ena_dev->dmadev, SZ_4K, + &host_attr->host_info_dma_addr, GFP_KERNEL); + if (unlikely(!host_attr->host_info)) + return -ENOMEM; + + host_attr->host_info->ena_spec_version = ((ENA_COMMON_SPEC_VERSION_MAJOR << + ENA_REGS_VERSION_MAJOR_VERSION_SHIFT) | + (ENA_COMMON_SPEC_VERSION_MINOR)); + + return 0; +} + +int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, + u32 debug_area_size) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + host_attr->debug_area_virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, debug_area_size, + &host_attr->debug_area_dma_addr, GFP_KERNEL); + if (unlikely(!host_attr->debug_area_virt_addr)) { + host_attr->debug_area_size = 0; + return -ENOMEM; + } + + host_attr->debug_area_size = debug_area_size; + + return 0; +} + +int ena_com_allocate_customer_metrics_buffer(struct ena_com_dev *ena_dev) +{ + struct ena_customer_metrics *customer_metrics = &ena_dev->customer_metrics; + + customer_metrics->buffer_len = ENA_CUSTOMER_METRICS_BUFFER_SIZE; + customer_metrics->buffer_virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, customer_metrics->buffer_len, + &customer_metrics->buffer_dma_addr, GFP_KERNEL); + if (unlikely(!customer_metrics->buffer_virt_addr)) + return -ENOMEM; + + return 0; +} + +void ena_com_delete_host_info(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + if (host_attr->host_info) { + dma_free_coherent(ena_dev->dmadev, SZ_4K, host_attr->host_info, + host_attr->host_info_dma_addr); + host_attr->host_info = NULL; + } +} + +void ena_com_delete_debug_area(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + if (host_attr->debug_area_virt_addr) { + dma_free_coherent(ena_dev->dmadev, host_attr->debug_area_size, + host_attr->debug_area_virt_addr, host_attr->debug_area_dma_addr); + host_attr->debug_area_virt_addr = NULL; + } +} + +void ena_com_delete_customer_metrics_buffer(struct ena_com_dev *ena_dev) +{ + struct ena_customer_metrics *customer_metrics = &ena_dev->customer_metrics; + + if (customer_metrics->buffer_virt_addr) { + dma_free_coherent(ena_dev->dmadev, customer_metrics->buffer_len, + customer_metrics->buffer_virt_addr, + customer_metrics->buffer_dma_addr); + customer_metrics->buffer_virt_addr = NULL; + } +} + +int ena_com_set_host_attributes(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + + int ret; + + /* Host attribute config is called before ena_com_get_dev_attr_feat + * so ena_com can't check if the feature is supported. 
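
To show how the host-attribute allocations above pair with their delete counterparts, a hypothetical setup helper. Treating -EOPNOTSUPP from ena_com_set_host_attributes() as non-fatal is an assumption consistent with the comment just above (support cannot be checked this early), not something the patch mandates.

/* Sketch: allocate host info and a debug area, then push both to the device. */
static int example_setup_host_attributes(struct ena_com_dev *ena_dev,
                                         u32 debug_area_size)
{
        int rc;

        rc = ena_com_allocate_host_info(ena_dev);
        if (rc)
                return rc;

        rc = ena_com_allocate_debug_area(ena_dev, debug_area_size);
        if (rc)
                goto err_host_info;

        /* Runs before feature discovery, so an unsupported-feature error
         * from the device is plausible and may be tolerated by the caller.
         */
        rc = ena_com_set_host_attributes(ena_dev);
        if (rc && rc != -EOPNOTSUPP)
                goto err_debug_area;

        return 0;

err_debug_area:
        ena_com_delete_debug_area(ena_dev);
err_host_info:
        ena_com_delete_host_info(ena_dev);
        return rc;
}
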
+ */ + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.feat_common.feature_id = ENA_ADMIN_HOST_ATTR_CONFIG; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.u.host_attr.debug_ba, + host_attr->debug_area_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.u.host_attr.os_info_ba, + host_attr->host_info_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + cmd.u.host_attr.debug_area_size = host_attr->debug_area_size; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, "Failed to set host attributes: %d\n", ret); + + return ret; +} + +/* Interrupt moderation */ +bool ena_com_interrupt_moderation_supported(struct ena_com_dev *ena_dev) +{ + return ena_com_check_supported_feature_id(ena_dev, + ENA_ADMIN_INTERRUPT_MODERATION); +} + +static int ena_com_update_nonadaptive_moderation_interval(struct ena_com_dev *ena_dev, + u32 coalesce_usecs, + u32 intr_delay_resolution, + u32 *intr_moder_interval) +{ + if (!intr_delay_resolution) { + netdev_err(ena_dev->net_device, "Illegal interrupt delay granularity value\n"); + return -EFAULT; + } + + *intr_moder_interval = coalesce_usecs / intr_delay_resolution; + + return 0; +} + +int ena_com_update_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev, + u32 tx_coalesce_usecs) +{ + return ena_com_update_nonadaptive_moderation_interval(ena_dev, + tx_coalesce_usecs, + ena_dev->intr_delay_resolution, + &ena_dev->intr_moder_tx_interval); +} + +int ena_com_update_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev, + u32 rx_coalesce_usecs) +{ + return ena_com_update_nonadaptive_moderation_interval(ena_dev, + rx_coalesce_usecs, + ena_dev->intr_delay_resolution, + &ena_dev->intr_moder_rx_interval); +} + +int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev) +{ + struct ena_admin_get_feat_resp get_resp; + u16 delay_resolution; + int rc; + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_INTERRUPT_MODERATION, 0); + + if (rc) { + if (rc == -EOPNOTSUPP) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_INTERRUPT_MODERATION); + rc = 0; + } else { + netdev_err(ena_dev->net_device, + "Failed to get interrupt moderation admin cmd. 
rc: %d\n", rc); + } + + /* no moderation supported, disable adaptive support */ + ena_com_disable_adaptive_moderation(ena_dev); + return rc; + } + + /* if moderation is supported by device we set adaptive moderation */ + delay_resolution = get_resp.u.intr_moderation.intr_delay_resolution; + ena_com_update_intr_delay_resolution(ena_dev, delay_resolution); + + /* Disable adaptive moderation by default - can be enabled later */ + ena_com_disable_adaptive_moderation(ena_dev); + + return 0; +} + +unsigned int ena_com_get_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev) +{ + return ena_dev->intr_moder_tx_interval; +} + +unsigned int ena_com_get_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev) +{ + return ena_dev->intr_moder_rx_interval; +} + +int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq_features, + struct ena_llq_configurations *llq_default_cfg) +{ + struct ena_com_llq_info *llq_info = &ena_dev->llq_info; + int rc; + + if (!llq_features->max_llq_num) { + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + rc = ena_com_config_llq_info(ena_dev, llq_features, llq_default_cfg); + if (unlikely(rc)) + return rc; + + ena_dev->tx_max_header_size = llq_info->desc_list_entry_size - + (llq_info->descs_num_before_header * sizeof(struct ena_eth_io_tx_desc)); + + if (unlikely(ena_dev->tx_max_header_size == 0)) { + netdev_err(ena_dev->net_device, "The size of the LLQ entry is smaller than needed\n"); + return -EINVAL; + } + + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_DEV; + + return 0; +} diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h new file mode 100644 index 0000000000000..f4f1b676e45eb --- /dev/null +++ b/drivers/amazon/net/ena/ena_com.h @@ -0,0 +1,1236 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
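
A short sketch combining the non-adaptive moderation helpers above (wrapper and argument names are placeholders). The programmed interval is simply coalesce_usecs divided by the intr_delay_resolution reported by the device during ena_com_init_interrupt_moderation().

/* Sketch: fixed (non-adaptive) interrupt moderation setup after admin init. */
static int example_configure_moderation(struct ena_com_dev *ena_dev,
                                        u32 tx_usecs, u32 rx_usecs)
{
        int rc;

        /* Queries the delay resolution; disables adaptive mode by default. */
        rc = ena_com_init_interrupt_moderation(ena_dev);
        if (rc)
                return rc;

        rc = ena_com_update_nonadaptive_moderation_interval_tx(ena_dev, tx_usecs);
        if (rc)
                return rc;

        return ena_com_update_nonadaptive_moderation_interval_rx(ena_dev, rx_usecs);
}
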
+ */ + +#ifndef ENA_COM +#define ENA_COM + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kcompat.h" +#include "ena_common_defs.h" +#include "ena_admin_defs.h" +#include "ena_eth_io_defs.h" +#include "ena_regs_defs.h" + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define ENA_ADMIN_OS_LINUX 1 + +#define ENA_MAX_NUM_IO_QUEUES 128U +/* We need to queues for each IO (on for Tx and one for Rx) */ +#define ENA_TOTAL_NUM_QUEUES (2 * (ENA_MAX_NUM_IO_QUEUES)) + +#define ENA_MAX_HANDLERS 256 + +#define ENA_MAX_PHYS_ADDR_SIZE_BITS 48 + +/* Unit in usec */ +#define ENA_REG_READ_TIMEOUT 200000 + +#define ADMIN_SQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_aq_entry)) +#define ADMIN_CQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_acq_entry)) +#define ADMIN_AENQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_aenq_entry)) + +/* Macros used to extract LSB/MSB from the + * enums defining the reset reasons + */ +#define ENA_RESET_REASON_LSB_OFFSET 0 +#define ENA_RESET_REASON_LSB_MASK 0xf +#define ENA_RESET_REASON_MSB_OFFSET 4 +#define ENA_RESET_REASON_MSB_MASK 0xf0 + +#define ENA_CUSTOMER_METRICS_BUFFER_SIZE 512 + +/*****************************************************************************/ +/*****************************************************************************/ +/* ENA adaptive interrupt moderation settings */ + +#define ENA_INTR_INITIAL_TX_INTERVAL_USECS 64 +#define ENA_INTR_INITIAL_RX_INTERVAL_USECS 20 +#define ENA_DEFAULT_INTR_DELAY_RESOLUTION 1 + +#define ENA_HASH_KEY_SIZE 40 + +#define ENA_HW_HINTS_NO_TIMEOUT 0xFFFF + +#define ENA_FEATURE_MAX_QUEUE_EXT_VER 1 + +struct ena_llq_configurations { + enum ena_admin_llq_header_location llq_header_location; + enum ena_admin_llq_ring_entry_size llq_ring_entry_size; + enum ena_admin_llq_stride_ctrl llq_stride_ctrl; + enum ena_admin_llq_num_descs_before_header llq_num_decs_before_header; + u16 llq_ring_entry_size_value; +}; + +enum queue_direction { + ENA_COM_IO_QUEUE_DIRECTION_TX, + ENA_COM_IO_QUEUE_DIRECTION_RX +}; + +struct ena_com_buf { + dma_addr_t paddr; /**< Buffer physical address */ + u16 len; /**< Buffer length in bytes */ +}; + +struct ena_com_rx_buf_info { + u16 len; + u16 req_id; +}; + +struct ena_com_io_desc_addr { + u8 __iomem *pbuf_dev_addr; /* LLQ address */ + u8 *virt_addr; + dma_addr_t phys_addr; +}; + +struct ena_com_tx_meta { + u16 mss; + u16 l3_hdr_len; + u16 l3_hdr_offset; + u16 l4_hdr_len; /* In words */ +}; + +struct ena_com_llq_info { + u16 header_location_ctrl; + u16 desc_stride_ctrl; + u16 desc_list_entry_size_ctrl; + u16 desc_list_entry_size; + u16 descs_num_before_header; + u16 descs_per_entry; + u16 max_entries_in_tx_burst; + bool disable_meta_caching; +}; + +struct ena_com_io_cq { + struct ena_com_io_desc_addr cdesc_addr; + void *bus; + + /* Interrupt unmask register */ + u32 __iomem *unmask_reg; + + + /* numa configuration register (for TPH) */ + u32 __iomem *numa_node_cfg_reg; + + /* The value to write to the above register to unmask + * the interrupt of this queue + */ + u32 msix_vector ____cacheline_aligned; + + enum queue_direction direction; + + /* holds the number of cdesc of the current packet */ + u16 cur_rx_pkt_cdesc_count; + /* save the first cdesc idx of the current packet */ + u16 cur_rx_pkt_cdesc_start_idx; + + u16 q_depth; + /* Caller qid */ + u16 qid; + + /* Device queue index */ + u16 idx; + u16 head; + u8 phase; + u8 cdesc_entry_size_in_bytes; + +} ____cacheline_aligned; + +struct 
ena_com_io_bounce_buffer_control { + u8 *base_buffer; + u16 next_to_use; + u16 buffer_size; + u16 buffers_num; /* Must be a power of 2 */ +}; + +/* This struct is to keep tracking the current location of the next llq entry */ +struct ena_com_llq_pkt_ctrl { + u8 *curr_bounce_buf; + u16 idx; + u16 descs_left_in_line; +}; + +struct ena_com_io_sq { + struct ena_com_io_desc_addr desc_addr; + void *bus; + + u32 __iomem *db_addr; + + enum queue_direction direction; + enum ena_admin_placement_policy_type mem_queue_type; + + bool disable_meta_caching; + + u32 msix_vector; + struct ena_com_tx_meta cached_tx_meta; + struct ena_com_llq_info llq_info; + struct ena_com_llq_pkt_ctrl llq_buf_ctrl; + struct ena_com_io_bounce_buffer_control bounce_buf_ctrl; + + u16 q_depth; + u16 qid; + + u16 idx; + u16 tail; + u16 next_to_comp; + u16 llq_last_copy_tail; + u32 tx_max_header_size; + u8 phase; + u8 desc_entry_size; + u8 dma_addr_bits; + u16 entries_in_tx_burst_left; +} ____cacheline_aligned; + +struct ena_com_admin_cq { + struct ena_admin_acq_entry *entries; + dma_addr_t dma_addr; + + u16 head; + u8 phase; +}; + +struct ena_com_admin_sq { + struct ena_admin_aq_entry *entries; + dma_addr_t dma_addr; + + u32 __iomem *db_addr; + + u16 head; + u16 tail; + u8 phase; + +}; + +struct ena_com_stats_admin { + u64 aborted_cmd; + u64 submitted_cmd; + u64 completed_cmd; + u64 out_of_space; + u64 no_completion; +}; + +struct ena_com_stats_phc { + u64 phc_cnt; + u64 phc_exp; + u64 phc_skp; + u64 phc_err; +}; + +struct ena_com_admin_queue { + void *q_dmadev; + void *bus; + struct ena_com_dev *ena_dev; + spinlock_t q_lock; /* spinlock for the admin queue */ + + struct ena_comp_ctx *comp_ctx; + u32 completion_timeout; + u16 q_depth; + struct ena_com_admin_cq cq; + struct ena_com_admin_sq sq; + + /* Indicate if the admin queue should poll for completion */ + bool polling; + + /* Define if fallback to polling mode should occur */ + bool auto_polling; + + u16 curr_cmd_id; + + /* Indicate that the ena was initialized and can + * process new admin commands + */ + bool running_state; + + bool is_missing_admin_interrupt; + + /* Count the number of outstanding admin commands */ + atomic_t outstanding_cmds; + + struct ena_com_stats_admin stats; +}; + +struct ena_aenq_handlers; + +struct ena_com_aenq { + u16 head; + u8 phase; + struct ena_admin_aenq_entry *entries; + dma_addr_t dma_addr; + u16 q_depth; + struct ena_aenq_handlers *aenq_handlers; +}; + +struct ena_com_mmio_read { + struct ena_admin_ena_mmio_req_read_less_resp *read_resp; + dma_addr_t read_resp_dma_addr; + u32 reg_read_to; /* in us */ + u16 seq_num; + bool readless_supported; + /* spin lock to ensure a single outstanding read */ + spinlock_t lock; +}; + +/* PTP hardware clock (PHC) MMIO read data info */ +struct ena_com_phc_info { + /* Internal PHC statistics */ + struct ena_com_stats_phc stats; + + /* PHC shared memory - virtual address */ + struct ena_admin_phc_resp *virt_addr; + + /* System time of last PHC request */ + ktime_t system_time; + + /* Spin lock to ensure a single outstanding PHC read */ + spinlock_t lock; + + /* PHC doorbell address as an offset to PCIe MMIO REG BAR */ + u32 doorbell_offset; + + /* Shared memory read expire timeout (usec) + * Max time for valid PHC retrieval, passing this threshold will fail the get time request + * and block new PHC requests for block_timeout_usec in order to prevent floods on busy + * device + */ + u32 expire_timeout_usec; + + /* Shared memory read abort timeout (usec) + * PHC requests block period, blocking starts once 
PHC request expired in order to prevent + * floods on busy device, any PHC requests during block period will be skipped + */ + u32 block_timeout_usec; + + /* PHC shared memory - physical address */ + dma_addr_t phys_addr; + + /* Cached error bound per timestamp sample */ + u32 error_bound; + + /* Request id sent to the device */ + u16 req_id; + + /* True if PHC is active in the device */ + bool active; +}; + +struct ena_rss { + /* Indirect table */ + u16 *host_rss_ind_tbl; + struct ena_admin_rss_ind_table_entry *rss_ind_tbl; + dma_addr_t rss_ind_tbl_dma_addr; + u16 tbl_log_size; + + /* Hash key */ + enum ena_admin_hash_functions hash_func; + struct ena_admin_feature_rss_flow_hash_control *hash_key; + dma_addr_t hash_key_dma_addr; + u32 hash_init_val; + + /* Flow Control */ + struct ena_admin_feature_rss_hash_control *hash_ctrl; + dma_addr_t hash_ctrl_dma_addr; + +}; + +struct ena_customer_metrics { + /* in correlation with ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK + * and ena_admin_customer_metrics_id + */ + u64 supported_metrics; + dma_addr_t buffer_dma_addr; + void *buffer_virt_addr; + u32 buffer_len; +}; + +struct ena_host_attribute { + /* Debug area */ + u8 *debug_area_virt_addr; + dma_addr_t debug_area_dma_addr; + u32 debug_area_size; + + /* Host information */ + struct ena_admin_host_info *host_info; + dma_addr_t host_info_dma_addr; +}; + +/* Each ena_dev is a PCI function. */ +struct ena_com_dev { + struct ena_com_admin_queue admin_queue; + struct ena_com_aenq aenq; + struct ena_com_io_cq io_cq_queues[ENA_TOTAL_NUM_QUEUES]; + struct ena_com_io_sq io_sq_queues[ENA_TOTAL_NUM_QUEUES]; + u8 __iomem *reg_bar; + void __iomem *mem_bar; + void *dmadev; + void *bus; + struct net_device *net_device; + + enum ena_admin_placement_policy_type tx_mem_queue_type; + u32 tx_max_header_size; + u16 stats_func; /* Selected function for extended statistic dump */ + u16 stats_queue; /* Selected queue for extended statistic dump */ + + u32 ena_min_poll_delay_us; + + struct ena_com_mmio_read mmio_read; + struct ena_com_phc_info phc; + + struct ena_rss rss; + u32 supported_features; + u32 capabilities; + u32 dma_addr_bits; + + struct ena_host_attribute host_attr; + bool adaptive_coalescing; + u16 intr_delay_resolution; + + /* interrupt moderation intervals are in usec divided by + * intr_delay_resolution, which is supplied by the device. + */ + u32 intr_moder_tx_interval; + u32 intr_moder_rx_interval; + + struct ena_intr_moder_entry *intr_moder_tbl; + + struct ena_com_llq_info llq_info; + + struct ena_customer_metrics customer_metrics; +}; + +struct ena_com_dev_get_features_ctx { + struct ena_admin_queue_feature_desc max_queues; + struct ena_admin_queue_ext_feature_desc max_queue_ext; + struct ena_admin_device_attr_feature_desc dev_attr; + struct ena_admin_feature_aenq_desc aenq; + struct ena_admin_feature_offload_desc offload; + struct ena_admin_ena_hw_hints hw_hints; + struct ena_admin_feature_llq_desc llq; +}; + +struct ena_com_create_io_ctx { + enum ena_admin_placement_policy_type mem_queue_type; + enum queue_direction direction; + int numa_node; + u32 msix_vector; + u16 queue_size; + u16 qid; +}; + +typedef void (*ena_aenq_handler)(void *data, + struct ena_admin_aenq_entry *aenq_e); + +/* Holds aenq handlers. 
Indexed by AENQ event group */ +struct ena_aenq_handlers { + ena_aenq_handler handlers[ENA_MAX_HANDLERS]; + ena_aenq_handler unimplemented_handler; +}; + +/*****************************************************************************/ +/*****************************************************************************/ + +/* ena_com_mmio_reg_read_request_init - Init the mmio reg read mechanism + * @ena_dev: ENA communication layer struct + * + * Initialize the register read mechanism. + * + * @note: This method must be the first stage in the initialization sequence. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev); + +/* ena_com_phc_init - Allocate and initialize PHC feature + * @ena_dev: ENA communication layer struct + * @note: This method assumes PHC is supported by the device + * @return - 0 on success, negative value on failure + */ +int ena_com_phc_init(struct ena_com_dev *ena_dev); + +/* ena_com_phc_supported - Return if PHC feature is supported by the device + * @ena_dev: ENA communication layer struct + * @note: This method must be called after getting supported features + * @return - supported or not + */ +bool ena_com_phc_supported(struct ena_com_dev *ena_dev); + +/* ena_com_phc_config - Configure PHC feature + * @ena_dev: ENA communication layer struct + * Configure PHC feature in driver and device + * @note: This method assumes PHC is supported by the device + * @return - 0 on success, negative value on failure + */ +int ena_com_phc_config(struct ena_com_dev *ena_dev); + +/* ena_com_phc_destroy - Destroy PHC feature + * @ena_dev: ENA communication layer struct + */ +void ena_com_phc_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_phc_get_timestamp - Retrieve PHC timestamp + * @ena_dev: ENA communication layer struct + * @timestamp: Retrieved PHC timestamp + * @return - 0 on success, negative value on failure + */ +int ena_com_phc_get_timestamp(struct ena_com_dev *ena_dev, u64 *timestamp); + +/* ena_com_phc_get_error_bound - Retrieve cached PHC error bound + * @ena_dev: ENA communication layer struct + * @error_bound: Cached PHC error bound + * @return - 0 on success, negative value on failure + */ +int ena_com_phc_get_error_bound(struct ena_com_dev *ena_dev, u32 *error_bound); + +/* ena_com_set_mmio_read_mode - Enable/disable the indirect mmio reg read mechanism + * @ena_dev: ENA communication layer struct + * @readless_supported: readless mode (enable/disable) + */ +void ena_com_set_mmio_read_mode(struct ena_com_dev *ena_dev, + bool readless_supported); + +/* ena_com_mmio_reg_read_request_write_dev_addr - Write the mmio reg read return + * value physical address. + * @ena_dev: ENA communication layer struct + */ +void ena_com_mmio_reg_read_request_write_dev_addr(struct ena_com_dev *ena_dev); + +/* ena_com_mmio_reg_read_request_destroy - Destroy the mmio reg read mechanism + * @ena_dev: ENA communication layer struct + */ +void ena_com_mmio_reg_read_request_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_admin_init - Init the admin and the async queues + * @ena_dev: ENA communication layer struct + * @aenq_handlers: Those handlers to be called upon event. + * + * Initialize the admin submission and completion queues. + * Initialize the asynchronous events notification queues. + * + * @return - 0 on success, negative value on failure. 
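
To make the AENQ dispatch concrete, a sketch of a handlers table as ena_com_aenq_intr_handler() consumes it. The handler bodies and names are invented; only ENA_ADMIN_KEEP_ALIVE, the handler typedef and the struct layout come from this patch.

/* Sketch: AENQ dispatch table passed to ena_com_admin_init(). */
static void example_keep_alive_handler(void *data,
                                       struct ena_admin_aenq_entry *aenq_e)
{
        /* e.g. record the timestamp consumed by a keep-alive watchdog */
}

static void example_unimplemented_handler(void *data,
                                          struct ena_admin_aenq_entry *aenq_e)
{
        /* events with no registered handler land here */
}

static struct ena_aenq_handlers example_aenq_handlers = {
        .handlers = {
                [ENA_ADMIN_KEEP_ALIVE] = example_keep_alive_handler,
        },
        .unimplemented_handler = example_unimplemented_handler,
};

/* Usage: ena_com_admin_init(ena_dev, &example_aenq_handlers); */
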
+ */ +int ena_com_admin_init(struct ena_com_dev *ena_dev, + struct ena_aenq_handlers *aenq_handlers); + +/* ena_com_admin_destroy - Destroy the admin and the async events queues. + * @ena_dev: ENA communication layer struct + * + * @note: Before calling this method, the caller must validate that the device + * won't send any additional admin completions/aenq. + * To achieve that, a FLR is recommended. + */ +void ena_com_admin_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_dev_reset - Perform device FLR to the device. + * @ena_dev: ENA communication layer struct + * @reset_reason: Specify what is the trigger for the reset in case of an error. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_dev_reset(struct ena_com_dev *ena_dev, + enum ena_regs_reset_reason_types reset_reason); + +/* ena_com_create_io_queue - Create io queue. + * @ena_dev: ENA communication layer struct + * @ctx - create context structure + * + * Create the submission and the completion queues. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_create_io_queue(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx); + +/* ena_com_destroy_io_queue - Destroy IO queue with the queue id - qid. + * @ena_dev: ENA communication layer struct + * @qid - the caller virtual queue id. + */ +void ena_com_destroy_io_queue(struct ena_com_dev *ena_dev, u16 qid); + +/* ena_com_get_io_handlers - Return the io queue handlers + * @ena_dev: ENA communication layer struct + * @qid - the caller virtual queue id. + * @io_sq - IO submission queue handler + * @io_cq - IO completion queue handler. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_get_io_handlers(struct ena_com_dev *ena_dev, u16 qid, + struct ena_com_io_sq **io_sq, + struct ena_com_io_cq **io_cq); + +/* ena_com_admin_aenq_enable - ENAble asynchronous event notifications + * @ena_dev: ENA communication layer struct + * + * After this method, aenq event can be received via AENQ. + */ +void ena_com_admin_aenq_enable(struct ena_com_dev *ena_dev); + +/* ena_com_set_admin_running_state - Set the state of the admin queue + * @ena_dev: ENA communication layer struct + * + * Change the state of the admin queue (enable/disable) + */ +void ena_com_set_admin_running_state(struct ena_com_dev *ena_dev, bool state); + +/* ena_com_get_admin_running_state - Get the admin queue state + * @ena_dev: ENA communication layer struct + * + * Retrieve the state of the admin queue (enable/disable) + * + * @return - current polling mode (enable/disable) + */ +bool ena_com_get_admin_running_state(struct ena_com_dev *ena_dev); + +/* ena_com_set_admin_polling_mode - Set the admin completion queue polling mode + * @ena_dev: ENA communication layer struct + * @polling: ENAble/Disable polling mode + * + * Set the admin completion mode. + */ +void ena_com_set_admin_polling_mode(struct ena_com_dev *ena_dev, bool polling); + +/* ena_com_get_admin_polling_mode - Get the admin completion queue polling mode + * @ena_dev: ENA communication layer struct + * + * Get the admin completion mode. + * If polling mode is on, ena_com_execute_admin_command will perform a + * polling on the admin completion queue for the commands completion, + * otherwise it will wait on wait event. 
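
A sketch of creating a single host-memory TX queue through ena_com_create_io_queue(); qid, depth, MSI-X vector and NUMA node are caller-chosen placeholders.

/* Sketch: create one TX queue in host memory and fetch its SQ/CQ handlers. */
static int example_create_tx_queue(struct ena_com_dev *ena_dev, u16 qid,
                                   u16 depth, u32 msix_vector, int node)
{
        struct ena_com_create_io_ctx ctx = {
                .mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST,
                .direction      = ENA_COM_IO_QUEUE_DIRECTION_TX,
                .numa_node      = node,
                .msix_vector    = msix_vector,
                .queue_size     = depth,
                .qid            = qid,
        };
        struct ena_com_io_sq *io_sq;
        struct ena_com_io_cq *io_cq;
        int rc;

        rc = ena_com_create_io_queue(ena_dev, &ctx);
        if (rc)
                return rc;

        rc = ena_com_get_io_handlers(ena_dev, qid, &io_sq, &io_cq);
        if (rc)
                ena_com_destroy_io_queue(ena_dev, qid);

        return rc;
}
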
+ * + * @return state + */ +bool ena_com_get_admin_polling_mode(struct ena_com_dev *ena_dev); + +/* ena_com_set_admin_auto_polling_mode - Enable autoswitch to polling mode + * @ena_dev: ENA communication layer struct + * @polling: Enable/Disable polling mode + * + * Set the autopolling mode. + * If autopolling is on: + * In case of missing interrupt when data is available switch to polling. + */ +void ena_com_set_admin_auto_polling_mode(struct ena_com_dev *ena_dev, + bool polling); + +/* ena_com_admin_q_comp_intr_handler - admin queue interrupt handler + * @ena_dev: ENA communication layer struct + * + * This method goes over the admin completion queue and wakes up all the pending + * threads that wait on the commands wait event. + * + * @note: Should be called after MSI-X interrupt. + */ +void ena_com_admin_q_comp_intr_handler(struct ena_com_dev *ena_dev); + +/* ena_com_aenq_intr_handler - AENQ interrupt handler + * @ena_dev: ENA communication layer struct + * + * This method goes over the async event notification queue and calls the proper + * aenq handler. + */ +void ena_com_aenq_intr_handler(struct ena_com_dev *ena_dev, void *data); + +/* ena_com_aenq_has_keep_alive - Retrieve if there is a keep alive notification in the aenq + * @ena_dev: ENA communication layer struct + * + * This method goes over the async event notification queue and returns if there + * is a keep alive notification. + * + * @return - true if there is a keep alive notification in the aenq or false otherwise + */ +bool ena_com_aenq_has_keep_alive(struct ena_com_dev *ena_dev); + +/* ena_com_abort_admin_commands - Abort all the outstanding admin commands. + * @ena_dev: ENA communication layer struct + * + * This method aborts all the outstanding admin commands. + * The caller should then call ena_com_wait_for_abort_completion to make sure + * all the commands were completed. + */ +void ena_com_abort_admin_commands(struct ena_com_dev *ena_dev); + +/* ena_com_wait_for_abort_completion - Wait for admin commands abort. + * @ena_dev: ENA communication layer struct + * + * This method waits until all the outstanding admin commands are completed. + */ +void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev); + +/* ena_com_validate_version - Validate the device parameters + * @ena_dev: ENA communication layer struct + * + * This method verifies the device parameters are the same as the saved + * parameters in ena_dev. + * This method is useful after device reset, to validate the device mac address + * and the device offloads are the same as before the reset. + * + * @return - 0 on success negative value otherwise. + */ +int ena_com_validate_version(struct ena_com_dev *ena_dev); + +/* ena_com_get_link_params - Retrieve physical link parameters. + * @ena_dev: ENA communication layer struct + * @resp: Link parameters + * + * Retrieve the physical link parameters, + * like speed, auto-negotiation and full duplex support. + * + * @return - 0 on Success negative value otherwise. + */ +int ena_com_get_link_params(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *resp); + +/* ena_com_get_dma_width - Retrieve physical dma address width the device + * supports. + * @ena_dev: ENA communication layer struct + * + * Retrieve the maximum physical address bits the device can handle. + * + * @return: > 0 on Success and negative value otherwise. 
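
As an illustration of where the two interrupt entry points above are meant to be called from, a hypothetical management MSI-X routine; struct example_adapter and the <linux/interrupt.h> dependency are assumptions, not part of this patch.

#include <linux/interrupt.h>

struct example_adapter {                /* hypothetical driver context */
        struct ena_com_dev *ena_dev;
};

/* Sketch: drain admin completions and AENQ events from the mgmnt vector. */
static irqreturn_t example_mgmnt_intr_handler(int irq, void *data)
{
        struct example_adapter *adapter = data;

        ena_com_admin_q_comp_intr_handler(adapter->ena_dev);
        ena_com_aenq_intr_handler(adapter->ena_dev, adapter);

        return IRQ_HANDLED;
}
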
+ */ +int ena_com_get_dma_width(struct ena_com_dev *ena_dev); + +/* ena_com_set_aenq_config - Set aenq groups configurations + * @ena_dev: ENA communication layer struct + * @groups flag: bit fields flags of enum ena_admin_aenq_group. + * + * Configure which aenq event group the driver would like to receive. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_aenq_config(struct ena_com_dev *ena_dev, u32 groups_flag); + +/* ena_com_get_dev_attr_feat - Get device features + * @ena_dev: ENA communication layer struct + * @get_feat_ctx: returned context that contain the get features. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, + struct ena_com_dev_get_features_ctx *get_feat_ctx); + +/* ena_com_get_dev_basic_stats - Get device basic statistics + * @ena_dev: ENA communication layer struct + * @stats: stats return value + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, + struct ena_admin_basic_stats *stats); + +/* ena_com_get_eni_stats - Get extended network interface statistics + * @ena_dev: ENA communication layer struct + * @stats: stats return value + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, + struct ena_admin_eni_stats *stats); + +/* ena_com_get_ena_srd_info - Get ENA SRD network interface statistics + * @ena_dev: ENA communication layer struct + * @info: ena srd stats and flags + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_ena_srd_info(struct ena_com_dev *ena_dev, + struct ena_admin_ena_srd_info *info); + +/* ena_com_get_customer_metrics - Get customer metrics for network interface + * @ena_dev: ENA communication layer struct + * @buffer: buffer for returned customer metrics + * @len: size of the buffer + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_customer_metrics(struct ena_com_dev *ena_dev, char *buffer, u32 len); + +/* ena_com_set_dev_mtu - Configure the device mtu. + * @ena_dev: ENA communication layer struct + * @mtu: mtu value + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu); + +/* ena_com_get_offload_settings - Retrieve the device offloads capabilities + * @ena_dev: ENA communication layer struct + * @offlad: offload return value + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_offload_settings(struct ena_com_dev *ena_dev, + struct ena_admin_feature_offload_desc *offload); + +/* ena_com_rss_init - Init RSS + * @ena_dev: ENA communication layer struct + * @log_size: indirection log size + * + * Allocate RSS/RFS resources. + * The caller then can configure rss using ena_com_set_hash_function, + * ena_com_set_hash_ctrl and ena_com_indirect_table_set. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_rss_init(struct ena_com_dev *ena_dev, u16 log_size); + +/* ena_com_rss_destroy - Destroy rss + * @ena_dev: ENA communication layer struct + * + * Free all the RSS/RFS resources. + */ +void ena_com_rss_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_get_current_hash_function - Get RSS hash function + * @ena_dev: ENA communication layer struct + * + * Return the current hash function. + * @return: 0 or one of the ena_admin_hash_functions values. 
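
A small sketch around ena_com_get_eni_stats(), showing one reasonable way for a caller to treat a device without the ENI-stats capability (-EOPNOTSUPP) as simply having no data; that policy is the caller's choice, not the patch's.

/* Sketch: fetch ENI stats for an ethtool-style dump, zeroing on no support. */
static int example_read_eni_stats(struct ena_com_dev *ena_dev,
                                  struct ena_admin_eni_stats *stats)
{
        int rc = ena_com_get_eni_stats(ena_dev, stats);

        if (rc == -EOPNOTSUPP) {
                memset(stats, 0, sizeof(*stats));
                return 0;
        }

        return rc;
}
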
+ */ +int ena_com_get_current_hash_function(struct ena_com_dev *ena_dev); + +/* ena_com_fill_hash_function - Fill RSS hash function + * @ena_dev: ENA communication layer struct + * @func: The hash function (Toeplitz or crc) + * @key: Hash key (for toeplitz hash) + * @key_len: key length (max length 10 DW) + * @init_val: initial value for the hash function + * + * Fill the ena_dev resources with the desire hash function, hash key, key_len + * and key initial value (if needed by the hash function). + * To flush the key into the device the caller should call + * ena_com_set_hash_function. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, + enum ena_admin_hash_functions func, + const u8 *key, u16 key_len, u32 init_val); + +/* ena_com_set_hash_function - Flush the hash function and it dependencies to + * the device. + * @ena_dev: ENA communication layer struct + * + * Flush the hash function and it dependencies (key, key length and + * initial value) if needed. + * + * @note: Prior to this method the caller should call ena_com_fill_hash_function + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_hash_function(struct ena_com_dev *ena_dev); + +/* ena_com_get_hash_function - Retrieve the hash function from the device. + * @ena_dev: ENA communication layer struct + * @func: hash function + * + * Retrieve the hash function from the device. + * + * @note: If the caller called ena_com_fill_hash_function but didn't flush + * it to the device, the new configuration will be lost. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_hash_function(struct ena_com_dev *ena_dev, + enum ena_admin_hash_functions *func); + +/* ena_com_get_hash_key - Retrieve the hash key + * @ena_dev: ENA communication layer struct + * @key: hash key + * + * Retrieve the hash key. + * + * @note: If the caller called ena_com_fill_hash_key but didn't flush + * it to the device, the new configuration will be lost. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_hash_key(struct ena_com_dev *ena_dev, u8 *key); +/* ena_com_fill_hash_ctrl - Fill RSS hash control + * @ena_dev: ENA communication layer struct. + * @proto: The protocol to configure. + * @hash_fields: bit mask of ena_admin_flow_hash_fields + * + * Fill the ena_dev resources with the desire hash control (the ethernet + * fields that take part of the hash) for a specific protocol. + * To flush the hash control to the device, the caller should call + * ena_com_set_hash_ctrl. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_fill_hash_ctrl(struct ena_com_dev *ena_dev, + enum ena_admin_flow_hash_proto proto, + u16 hash_fields); + +/* ena_com_set_hash_ctrl - Flush the hash control resources to the device. + * @ena_dev: ENA communication layer struct + * + * Flush the hash control (the ethernet fields that take part of the hash) + * + * @note: Prior to this method the caller should call ena_com_fill_hash_ctrl. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev); + +/* ena_com_get_hash_ctrl - Retrieve the hash control from the device. + * @ena_dev: ENA communication layer struct + * @proto: The protocol to retrieve. + * @fields: bit mask of ena_admin_flow_hash_fields. + * + * Retrieve the hash control from the device. 
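+ * The returned bit mask uses the same ena_admin_flow_hash_fields encoding
+ * that ena_com_fill_hash_ctrl() accepts, so a driver can read the current
+ * setting, toggle individual fields and flush the result back with
+ * ena_com_set_hash_ctrl().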
+ * + * @note: If the caller called ena_com_fill_hash_ctrl but didn't flush + * it to the device, the new configuration will be lost. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_hash_ctrl(struct ena_com_dev *ena_dev, + enum ena_admin_flow_hash_proto proto, + u16 *fields); + +/* ena_com_set_default_hash_ctrl - Set the hash control to a default + * configuration. + * @ena_dev: ENA communication layer struct + * + * Fill the ena_dev resources with the default hash control configuration. + * To flush the hash control to the device, the caller should call + * ena_com_set_hash_ctrl. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_default_hash_ctrl(struct ena_com_dev *ena_dev); + +/* ena_com_indirect_table_fill_entry - Fill a single entry in the RSS + * indirection table + * @ena_dev: ENA communication layer struct. + * @entry_idx - indirection table entry. + * @entry_value - redirection value + * + * Fill a single entry of the RSS indirection table in the ena_dev resources. + * To flush the indirection table to the device, the called should call + * ena_com_indirect_table_set. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_indirect_table_fill_entry(struct ena_com_dev *ena_dev, + u16 entry_idx, u16 entry_value); + +/* ena_com_indirect_table_set - Flush the indirection table to the device. + * @ena_dev: ENA communication layer struct + * + * Flush the indirection hash control to the device. + * Prior to this method the caller should call ena_com_indirect_table_fill_entry + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_indirect_table_set(struct ena_com_dev *ena_dev); + +/* ena_com_indirect_table_get - Retrieve the indirection table from the device. + * @ena_dev: ENA communication layer struct + * @ind_tbl: indirection table + * + * Retrieve the RSS indirection table from the device. + * + * @note: If the caller called ena_com_indirect_table_fill_entry but didn't flush + * it to the device, the new configuration will be lost. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_indirect_table_get(struct ena_com_dev *ena_dev, u32 *ind_tbl); + +/* ena_com_allocate_host_info - Allocate host info resources. + * @ena_dev: ENA communication layer struct + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_allocate_host_info(struct ena_com_dev *ena_dev); + +/* ena_com_allocate_debug_area - Allocate debug area. + * @ena_dev: ENA communication layer struct + * @debug_area_size - debug area size. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, + u32 debug_area_size); + +/* ena_com_allocate_customer_metrics_buffer - Allocate customer metrics resources. + * @ena_dev: ENA communication layer struct + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_allocate_customer_metrics_buffer(struct ena_com_dev *ena_dev); + +/* ena_com_delete_debug_area - Free the debug area resources. + * @ena_dev: ENA communication layer struct + * + * Free the allocated debug area. + */ +void ena_com_delete_debug_area(struct ena_com_dev *ena_dev); + +/* ena_com_delete_host_info - Free the host info resources. + * @ena_dev: ENA communication layer struct + * + * Free the allocated host info. + */ +void ena_com_delete_host_info(struct ena_com_dev *ena_dev); + +/* ena_com_delete_customer_metrics_buffer - Free the customer metrics resources. 
+ * @ena_dev: ENA communication layer struct + * + * Free the allocated customer metrics area. + */ +void ena_com_delete_customer_metrics_buffer(struct ena_com_dev *ena_dev); + +/* ena_com_set_host_attributes - Update the device with the host + * attributes (debug area and host info) base address. + * @ena_dev: ENA communication layer struct + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_host_attributes(struct ena_com_dev *ena_dev); + +/* ena_com_create_io_cq - Create io completion queue. + * @ena_dev: ENA communication layer struct + * @io_cq - io completion queue handler + + * Create IO completion queue. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_create_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_io_cq *io_cq); + +/* ena_com_destroy_io_cq - Destroy io completion queue. + * @ena_dev: ENA communication layer struct + * @io_cq - io completion queue handler + + * Destroy IO completion queue. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_destroy_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_io_cq *io_cq); + +/* ena_com_execute_admin_command - Execute admin command + * @admin_queue: admin queue. + * @cmd: the admin command to execute. + * @cmd_size: the command size. + * @cmd_completion: command completion return value. + * @cmd_comp_size: command completion size. + + * Submit an admin command and then wait until the device returns a + * completion. + * The completion will be copied into cmd_comp. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size, + struct ena_admin_acq_entry *cmd_comp, + size_t cmd_comp_size); + +/* ena_com_init_interrupt_moderation - Init interrupt moderation + * @ena_dev: ENA communication layer struct + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev); + +/* ena_com_interrupt_moderation_supported - Return if interrupt moderation + * capability is supported by the device. + * + * @return - supported or not. + */ +bool ena_com_interrupt_moderation_supported(struct ena_com_dev *ena_dev); + +/* ena_com_update_nonadaptive_moderation_interval_tx - Update the + * non-adaptive interval in Tx direction. + * @ena_dev: ENA communication layer struct + * @tx_coalesce_usecs: Interval in usec. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_update_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev, + u32 tx_coalesce_usecs); + +/* ena_com_update_nonadaptive_moderation_interval_rx - Update the + * non-adaptive interval in Rx direction. + * @ena_dev: ENA communication layer struct + * @rx_coalesce_usecs: Interval in usec. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_update_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev, + u32 rx_coalesce_usecs); + +/* ena_com_get_nonadaptive_moderation_interval_tx - Retrieve the + * non-adaptive interval in Tx direction. + * @ena_dev: ENA communication layer struct + * + * @return - interval in usec + */ +unsigned int ena_com_get_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev); + +/* ena_com_get_nonadaptive_moderation_interval_rx - Retrieve the + * non-adaptive interval in Rx direction. 
+ * @ena_dev: ENA communication layer struct + * + * @return - interval in usec + */ +unsigned int ena_com_get_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev); + +/* ena_com_config_dev_mode - Configure the placement policy of the device. + * @ena_dev: ENA communication layer struct + * @llq_features: LLQ feature descriptor, retrieve via + * ena_com_get_dev_attr_feat. + * @ena_llq_config: The default driver LLQ parameters configurations + */ +int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq_features, + struct ena_llq_configurations *llq_default_config); + +/* ena_com_get_missing_admin_interrupt - Return if there is a missing admin interrupt + * @ena_dev: ENA communication layer struct + * + * @return - true if there is a missing admin interrupt or false otherwise + */ +static inline bool ena_com_get_missing_admin_interrupt(struct ena_com_dev *ena_dev) +{ + return ena_dev->admin_queue.is_missing_admin_interrupt; +} + +/* ena_com_io_sq_to_ena_dev - Extract ena_com_dev using contained field io_sq. + * @io_sq: IO submit queue struct + * + * @return - ena_com_dev struct extracted from io_sq + */ +static inline struct ena_com_dev *ena_com_io_sq_to_ena_dev(struct ena_com_io_sq *io_sq) +{ + return container_of(io_sq, struct ena_com_dev, io_sq_queues[io_sq->qid]); +} + +/* ena_com_io_cq_to_ena_dev - Extract ena_com_dev using contained field io_cq. + * @io_sq: IO submit queue struct + * + * @return - ena_com_dev struct extracted from io_sq + */ +static inline struct ena_com_dev *ena_com_io_cq_to_ena_dev(struct ena_com_io_cq *io_cq) +{ + return container_of(io_cq, struct ena_com_dev, io_cq_queues[io_cq->qid]); +} + +static inline bool ena_com_get_adaptive_moderation_enabled(struct ena_com_dev *ena_dev) +{ + return ena_dev->adaptive_coalescing; +} + +static inline void ena_com_enable_adaptive_moderation(struct ena_com_dev *ena_dev) +{ + ena_dev->adaptive_coalescing = true; +} + +static inline void ena_com_disable_adaptive_moderation(struct ena_com_dev *ena_dev) +{ + ena_dev->adaptive_coalescing = false; +} + +/* ena_com_get_cap - query whether device supports a capability. + * @ena_dev: ENA communication layer struct + * @cap_id: enum value representing the capability + * + * @return - true if capability is supported or false otherwise + */ +static inline bool ena_com_get_cap(struct ena_com_dev *ena_dev, + enum ena_admin_aq_caps_id cap_id) +{ + return !!(ena_dev->capabilities & BIT(cap_id)); +} + +/* ena_com_get_customer_metric_support - query whether device supports a given customer metric. + * @ena_dev: ENA communication layer struct + * @metric_id: enum value representing the customer metric + * + * @return - true if customer metric is supported or false otherwise + */ +static inline bool ena_com_get_customer_metric_support(struct ena_com_dev *ena_dev, + enum ena_admin_customer_metrics_id metric_id) +{ + return !!(ena_dev->customer_metrics.supported_metrics & BIT(metric_id)); +} + +/* ena_com_get_customer_metric_count - return the number of supported customer metrics. + * @ena_dev: ENA communication layer struct + * + * @return - the number of supported customer metrics + */ +static inline int ena_com_get_customer_metric_count(struct ena_com_dev *ena_dev) +{ + return hweight64(ena_dev->customer_metrics.supported_metrics); +} + +/* ena_com_update_intr_reg - Prepare interrupt register + * @intr_reg: interrupt register to update. 
+ * @rx_delay_interval: Rx interval in usecs + * @tx_delay_interval: Tx interval in usecs + * @unmask: unmask enable/disable + * @no_moderation_update: 0 - Indicates that any of the TX/RX intervals was + * updated, 1 - otherwise + * + * Prepare interrupt update register with the supplied parameters. + */ +static inline void ena_com_update_intr_reg(struct ena_eth_io_intr_reg *intr_reg, + u32 rx_delay_interval, + u32 tx_delay_interval, + bool unmask, + bool no_moderation_update) +{ + intr_reg->intr_control = 0; + intr_reg->intr_control |= rx_delay_interval & + ENA_ETH_IO_INTR_REG_RX_INTR_DELAY_MASK; + + intr_reg->intr_control |= + (tx_delay_interval << ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_SHIFT) + & ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_MASK; + + if (unmask) + intr_reg->intr_control |= ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK; + + intr_reg->intr_control |= + (((u32)no_moderation_update) << ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_SHIFT) & + ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_MASK; +} + +static inline u8 *ena_com_get_next_bounce_buffer(struct ena_com_io_bounce_buffer_control *bounce_buf_ctrl) +{ + u16 size, buffers_num; + u8 *buf; + + size = bounce_buf_ctrl->buffer_size; + buffers_num = bounce_buf_ctrl->buffers_num; + + buf = bounce_buf_ctrl->base_buffer + + (bounce_buf_ctrl->next_to_use++ & (buffers_num - 1)) * size; + + prefetchw(bounce_buf_ctrl->base_buffer + + (bounce_buf_ctrl->next_to_use & (buffers_num - 1)) * size); + + return buf; +} + +#endif /* !(ENA_COM) */ diff --git a/drivers/amazon/net/ena/ena_common_defs.h b/drivers/amazon/net/ena/ena_common_defs.h new file mode 100644 index 0000000000000..e210c8a81fc0e --- /dev/null +++ b/drivers/amazon/net/ena/ena_common_defs.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ +#ifndef _ENA_COMMON_H_ +#define _ENA_COMMON_H_ + +#define ENA_COMMON_SPEC_VERSION_MAJOR 2 +#define ENA_COMMON_SPEC_VERSION_MINOR 0 + +/* ENA operates with 48-bit memory addresses. ena_mem_addr_t */ +struct ena_common_mem_addr { + u32 mem_addr_low; + + u16 mem_addr_high; + + /* MBZ */ + u16 reserved16; +}; + +#endif /* _ENA_COMMON_H_ */ diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c new file mode 100644 index 0000000000000..281a4b46f2e89 --- /dev/null +++ b/drivers/amazon/net/ena/ena_eth_com.c @@ -0,0 +1,668 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "ena_eth_com.h" + +static struct ena_eth_io_rx_cdesc_base *ena_com_get_next_rx_cdesc( + struct ena_com_io_cq *io_cq) +{ + struct ena_eth_io_rx_cdesc_base *cdesc; + u16 expected_phase, head_masked; + u16 desc_phase; + + head_masked = io_cq->head & (io_cq->q_depth - 1); + expected_phase = io_cq->phase; + + cdesc = (struct ena_eth_io_rx_cdesc_base *)(io_cq->cdesc_addr.virt_addr + + (head_masked * io_cq->cdesc_entry_size_in_bytes)); + + desc_phase = (READ_ONCE(cdesc->status) & ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_PHASE_SHIFT; + + if (desc_phase != expected_phase) + return NULL; + + /* Make sure we read the rest of the descriptor after the phase bit + * has been read + */ + dma_rmb(); + + return cdesc; +} + +static void *get_sq_desc_regular_queue(struct ena_com_io_sq *io_sq) +{ + u16 tail_masked; + u32 offset; + + tail_masked = io_sq->tail & (io_sq->q_depth - 1); + + offset = tail_masked * io_sq->desc_entry_size; + + return (void *)((uintptr_t)io_sq->desc_addr.virt_addr + offset); +} + +static int ena_com_write_bounce_buffer_to_dev(struct ena_com_io_sq *io_sq, + u8 *bounce_buffer) +{ + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + + u16 dst_tail_mask; + u32 dst_offset; + + dst_tail_mask = io_sq->tail & (io_sq->q_depth - 1); + dst_offset = dst_tail_mask * llq_info->desc_list_entry_size; + + if (is_llq_max_tx_burst_exists(io_sq)) { + if (unlikely(!io_sq->entries_in_tx_burst_left)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Error: trying to send more packets than tx burst allows\n"); + return -ENOSPC; + } + + io_sq->entries_in_tx_burst_left--; + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Decreasing entries_in_tx_burst_left of queue %d to %d\n", io_sq->qid, + io_sq->entries_in_tx_burst_left); + } + + /* Make sure everything was written into the bounce buffer before + * writing the bounce buffer to the device + */ + wmb(); + + /* The line is completed. 
Copy it to dev */ + __iowrite64_copy(io_sq->desc_addr.pbuf_dev_addr + dst_offset, bounce_buffer, + (llq_info->desc_list_entry_size) / 8); + + io_sq->tail++; + + /* Switch phase bit in case of wrap around */ + if (unlikely((io_sq->tail & (io_sq->q_depth - 1)) == 0)) + io_sq->phase ^= 1; + + return 0; +} + +static int ena_com_write_header_to_bounce(struct ena_com_io_sq *io_sq, + u8 *header_src, + u16 header_len) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + u8 *bounce_buffer = pkt_ctrl->curr_bounce_buf; + u16 header_offset; + + if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST)) + return 0; + + header_offset = + llq_info->descs_num_before_header * io_sq->desc_entry_size; + + if (unlikely((header_offset + header_len) > llq_info->desc_list_entry_size)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Trying to write header larger than llq entry can accommodate\n"); + return -EFAULT; + } + + if (unlikely(!bounce_buffer)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, "Bounce buffer is NULL\n"); + return -EFAULT; + } + + memcpy(bounce_buffer + header_offset, header_src, header_len); + + return 0; +} + +static void *get_sq_desc_llq(struct ena_com_io_sq *io_sq) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + u8 *bounce_buffer; + void *sq_desc; + + bounce_buffer = pkt_ctrl->curr_bounce_buf; + + if (unlikely(!bounce_buffer)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, "Bounce buffer is NULL\n"); + return NULL; + } + + sq_desc = bounce_buffer + pkt_ctrl->idx * io_sq->desc_entry_size; + pkt_ctrl->idx++; + pkt_ctrl->descs_left_in_line--; + + return sq_desc; +} + +static int ena_com_close_bounce_buffer(struct ena_com_io_sq *io_sq) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + int rc; + + if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST)) + return 0; + + /* bounce buffer was used, so write it and get a new one */ + if (likely(pkt_ctrl->idx)) { + rc = ena_com_write_bounce_buffer_to_dev(io_sq, + pkt_ctrl->curr_bounce_buf); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to write bounce buffer to device\n"); + return rc; + } + + pkt_ctrl->curr_bounce_buf = + ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, llq_info->desc_list_entry_size); + } + + pkt_ctrl->idx = 0; + pkt_ctrl->descs_left_in_line = llq_info->descs_num_before_header; + return 0; +} + +static void *get_sq_desc(struct ena_com_io_sq *io_sq) +{ + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + return get_sq_desc_llq(io_sq); + + return get_sq_desc_regular_queue(io_sq); +} + +static int ena_com_sq_update_llq_tail(struct ena_com_io_sq *io_sq) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + int rc; + + if (!pkt_ctrl->descs_left_in_line) { + rc = ena_com_write_bounce_buffer_to_dev(io_sq, + pkt_ctrl->curr_bounce_buf); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to write bounce buffer to device\n"); + return rc; + } + + pkt_ctrl->curr_bounce_buf = + ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, llq_info->desc_list_entry_size); + + pkt_ctrl->idx = 0; + if (unlikely(llq_info->desc_stride_ctrl == 
ENA_ADMIN_SINGLE_DESC_PER_ENTRY)) + pkt_ctrl->descs_left_in_line = 1; + else + pkt_ctrl->descs_left_in_line = + llq_info->desc_list_entry_size / io_sq->desc_entry_size; + } + + return 0; +} + +static int ena_com_sq_update_reqular_queue_tail(struct ena_com_io_sq *io_sq) +{ + io_sq->tail++; + + /* Switch phase bit in case of wrap around */ + if (unlikely((io_sq->tail & (io_sq->q_depth - 1)) == 0)) + io_sq->phase ^= 1; + + return 0; +} + +static int ena_com_sq_update_tail(struct ena_com_io_sq *io_sq) +{ + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + return ena_com_sq_update_llq_tail(io_sq); + + return ena_com_sq_update_reqular_queue_tail(io_sq); +} + +static struct ena_eth_io_rx_cdesc_base * + ena_com_rx_cdesc_idx_to_ptr(struct ena_com_io_cq *io_cq, u16 idx) +{ + idx &= (io_cq->q_depth - 1); + return (struct ena_eth_io_rx_cdesc_base *) + ((uintptr_t)io_cq->cdesc_addr.virt_addr + + idx * io_cq->cdesc_entry_size_in_bytes); +} + +static int ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, + u16 *first_cdesc_idx, + u16 *num_descs) +{ + struct ena_com_dev *dev = ena_com_io_cq_to_ena_dev(io_cq); + u16 count = io_cq->cur_rx_pkt_cdesc_count, head_masked; + struct ena_eth_io_rx_cdesc_base *cdesc; + u32 last = 0; + + do { + u32 status; + + cdesc = ena_com_get_next_rx_cdesc(io_cq); + if (!cdesc) + break; + status = READ_ONCE(cdesc->status); + + ena_com_cq_inc_head(io_cq); + if (unlikely((status & ENA_ETH_IO_RX_CDESC_BASE_FIRST_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_FIRST_SHIFT && + count != 0)) { + netdev_err(dev->net_device, + "First bit is on in descriptor #%d on q_id: %d, req_id: %u\n", + count, io_cq->qid, cdesc->req_id); + return -EFAULT; + } + + if (unlikely((status & (ENA_ETH_IO_RX_CDESC_BASE_MBZ7_MASK | + ENA_ETH_IO_RX_CDESC_BASE_MBZ17_MASK)) && + ena_com_get_cap(dev, ENA_ADMIN_CDESC_MBZ))) { + netdev_err(dev->net_device, + "Corrupted RX descriptor #%d on q_id: %d, req_id: %u\n", count, + io_cq->qid, cdesc->req_id); + return -EFAULT; + } + + count++; + last = (status & ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT; + } while (!last); + + if (last) { + *first_cdesc_idx = io_cq->cur_rx_pkt_cdesc_start_idx; + + head_masked = io_cq->head & (io_cq->q_depth - 1); + + *num_descs = count; + io_cq->cur_rx_pkt_cdesc_count = 0; + io_cq->cur_rx_pkt_cdesc_start_idx = head_masked; + + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "ENA q_id: %d packets were completed. 
first desc idx %u descs# %d\n", + io_cq->qid, *first_cdesc_idx, count); + } else { + io_cq->cur_rx_pkt_cdesc_count = count; + *num_descs = 0; + } + + return 0; +} + +static int ena_com_create_meta(struct ena_com_io_sq *io_sq, + struct ena_com_tx_meta *ena_meta) +{ + struct ena_eth_io_tx_meta_desc *meta_desc = NULL; + + meta_desc = get_sq_desc(io_sq); + if (unlikely(!meta_desc)) + return -EFAULT; + + memset(meta_desc, 0x0, sizeof(struct ena_eth_io_tx_meta_desc)); + + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_META_DESC_MASK; + + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_EXT_VALID_MASK; + + /* bits 0-9 of the mss */ + meta_desc->word2 |= ((u32)ena_meta->mss << + ENA_ETH_IO_TX_META_DESC_MSS_LO_SHIFT) & + ENA_ETH_IO_TX_META_DESC_MSS_LO_MASK; + /* bits 10-13 of the mss */ + meta_desc->len_ctrl |= ((ena_meta->mss >> 10) << + ENA_ETH_IO_TX_META_DESC_MSS_HI_SHIFT) & + ENA_ETH_IO_TX_META_DESC_MSS_HI_MASK; + + /* Extended meta desc */ + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_MASK; + meta_desc->len_ctrl |= ((u32)io_sq->phase << + ENA_ETH_IO_TX_META_DESC_PHASE_SHIFT) & + ENA_ETH_IO_TX_META_DESC_PHASE_MASK; + + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_FIRST_MASK; + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_META_STORE_MASK; + + meta_desc->word2 |= ena_meta->l3_hdr_len & + ENA_ETH_IO_TX_META_DESC_L3_HDR_LEN_MASK; + meta_desc->word2 |= (ena_meta->l3_hdr_offset << + ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_SHIFT) & + ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_MASK; + + meta_desc->word2 |= ((u32)ena_meta->l4_hdr_len << + ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_SHIFT) & + ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_MASK; + + return ena_com_sq_update_tail(io_sq); +} + +static int ena_com_create_and_store_tx_meta_desc(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx, + bool *have_meta) +{ + struct ena_com_tx_meta *ena_meta = &ena_tx_ctx->ena_meta; + + /* When disable meta caching is set, don't bother to save the meta and + * compare it to the stored version, just create the meta + */ + if (io_sq->disable_meta_caching) { + *have_meta = true; + return ena_com_create_meta(io_sq, ena_meta); + } + + if (ena_com_meta_desc_changed(io_sq, ena_tx_ctx)) { + *have_meta = true; + /* Cache the meta desc */ + memcpy(&io_sq->cached_tx_meta, ena_meta, + sizeof(struct ena_com_tx_meta)); + return ena_com_create_meta(io_sq, ena_meta); + } + + *have_meta = false; + return 0; +} + +static void ena_com_rx_set_flags(struct ena_com_io_cq *io_cq, + struct ena_com_rx_ctx *ena_rx_ctx, + struct ena_eth_io_rx_cdesc_base *cdesc) +{ + ena_rx_ctx->l3_proto = cdesc->status & + ENA_ETH_IO_RX_CDESC_BASE_L3_PROTO_IDX_MASK; + ena_rx_ctx->l4_proto = + (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_SHIFT; + ena_rx_ctx->l3_csum_err = + !!((cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_SHIFT); + ena_rx_ctx->l4_csum_err = + !!((cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_SHIFT); + ena_rx_ctx->l4_csum_checked = + !!((cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_SHIFT); + ena_rx_ctx->hash = cdesc->hash; + ena_rx_ctx->frag = + (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_SHIFT; + + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "l3_proto %d l4_proto %d l3_csum_err %d l4_csum_err %d hash %d frag %d cdesc_status 
%x\n", + ena_rx_ctx->l3_proto, ena_rx_ctx->l4_proto, ena_rx_ctx->l3_csum_err, + ena_rx_ctx->l4_csum_err, ena_rx_ctx->hash, ena_rx_ctx->frag, cdesc->status); +} + +/*****************************************************************************/ +/***************************** API **********************************/ +/*****************************************************************************/ + +int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx, + int *nb_hw_desc) +{ + struct ena_eth_io_tx_desc *desc = NULL; + struct ena_com_buf *ena_bufs = ena_tx_ctx->ena_bufs; + void *buffer_to_push = ena_tx_ctx->push_header; + u16 header_len = ena_tx_ctx->header_len; + u16 num_bufs = ena_tx_ctx->num_bufs; + u16 start_tail = io_sq->tail; + int i, rc; + bool have_meta; + u64 addr_hi; + + WARN(io_sq->direction != ENA_COM_IO_QUEUE_DIRECTION_TX, "wrong Q type"); + + /* num_bufs +1 for potential meta desc */ + if (unlikely(!ena_com_sq_have_enough_space(io_sq, num_bufs + 1))) { + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Not enough space in the tx queue\n"); + return -ENOMEM; + } + + if (unlikely(header_len > io_sq->tx_max_header_size)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Header size is too large %d max header: %d\n", header_len, + io_sq->tx_max_header_size); + return -EINVAL; + } + + if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV && !buffer_to_push)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Push header wasn't provided in LLQ mode\n"); + return -EINVAL; + } + + rc = ena_com_write_header_to_bounce(io_sq, buffer_to_push, header_len); + if (unlikely(rc)) + return rc; + + rc = ena_com_create_and_store_tx_meta_desc(io_sq, ena_tx_ctx, &have_meta); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to create and store tx meta desc\n"); + return rc; + } + + /* If the caller doesn't want to send packets */ + if (unlikely(!num_bufs && !header_len)) { + rc = ena_com_close_bounce_buffer(io_sq); + if (unlikely(rc)) + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to write buffers to LLQ\n"); + *nb_hw_desc = io_sq->tail - start_tail; + return rc; + } + + desc = get_sq_desc(io_sq); + if (unlikely(!desc)) + return -EFAULT; + memset(desc, 0x0, sizeof(struct ena_eth_io_tx_desc)); + + /* Set first desc when we don't have meta descriptor */ + if (!have_meta) + desc->len_ctrl |= ENA_ETH_IO_TX_DESC_FIRST_MASK; + + desc->buff_addr_hi_hdr_sz |= ((u32)header_len << + ENA_ETH_IO_TX_DESC_HEADER_LENGTH_SHIFT) & + ENA_ETH_IO_TX_DESC_HEADER_LENGTH_MASK; + desc->len_ctrl |= ((u32)io_sq->phase << ENA_ETH_IO_TX_DESC_PHASE_SHIFT) & + ENA_ETH_IO_TX_DESC_PHASE_MASK; + + desc->len_ctrl |= ENA_ETH_IO_TX_DESC_COMP_REQ_MASK; + + /* Bits 0-9 */ + desc->meta_ctrl |= ((u32)ena_tx_ctx->req_id << + ENA_ETH_IO_TX_DESC_REQ_ID_LO_SHIFT) & + ENA_ETH_IO_TX_DESC_REQ_ID_LO_MASK; + + desc->meta_ctrl |= (ena_tx_ctx->df << + ENA_ETH_IO_TX_DESC_DF_SHIFT) & + ENA_ETH_IO_TX_DESC_DF_MASK; + + /* Bits 10-15 */ + desc->len_ctrl |= ((ena_tx_ctx->req_id >> 10) << + ENA_ETH_IO_TX_DESC_REQ_ID_HI_SHIFT) & + ENA_ETH_IO_TX_DESC_REQ_ID_HI_MASK; + + if (ena_tx_ctx->meta_valid) { + desc->meta_ctrl |= (ena_tx_ctx->tso_enable << + ENA_ETH_IO_TX_DESC_TSO_EN_SHIFT) & + ENA_ETH_IO_TX_DESC_TSO_EN_MASK; + desc->meta_ctrl |= ena_tx_ctx->l3_proto & + ENA_ETH_IO_TX_DESC_L3_PROTO_IDX_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l4_proto << + ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_SHIFT) & + 
ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l3_csum_enable << + ENA_ETH_IO_TX_DESC_L3_CSUM_EN_SHIFT) & + ENA_ETH_IO_TX_DESC_L3_CSUM_EN_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l4_csum_enable << + ENA_ETH_IO_TX_DESC_L4_CSUM_EN_SHIFT) & + ENA_ETH_IO_TX_DESC_L4_CSUM_EN_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l4_csum_partial << + ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_SHIFT) & + ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_MASK; + } + + for (i = 0; i < num_bufs; i++) { + /* The first desc share the same desc as the header */ + if (likely(i != 0)) { + rc = ena_com_sq_update_tail(io_sq); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to update sq tail\n"); + return rc; + } + + desc = get_sq_desc(io_sq); + if (unlikely(!desc)) + return -EFAULT; + + memset(desc, 0x0, sizeof(struct ena_eth_io_tx_desc)); + + desc->len_ctrl |= ((u32)io_sq->phase << + ENA_ETH_IO_TX_DESC_PHASE_SHIFT) & + ENA_ETH_IO_TX_DESC_PHASE_MASK; + } + + desc->len_ctrl |= ena_bufs->len & + ENA_ETH_IO_TX_DESC_LENGTH_MASK; + + addr_hi = ((ena_bufs->paddr & + GENMASK_ULL(io_sq->dma_addr_bits - 1, 32)) >> 32); + + desc->buff_addr_lo = (u32)ena_bufs->paddr; + desc->buff_addr_hi_hdr_sz |= addr_hi & + ENA_ETH_IO_TX_DESC_ADDR_HI_MASK; + ena_bufs++; + } + + /* set the last desc indicator */ + desc->len_ctrl |= ENA_ETH_IO_TX_DESC_LAST_MASK; + + rc = ena_com_sq_update_tail(io_sq); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to update sq tail of the last descriptor\n"); + return rc; + } + + rc = ena_com_close_bounce_buffer(io_sq); + + *nb_hw_desc = io_sq->tail - start_tail; + return rc; +} + +int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, + struct ena_com_io_sq *io_sq, + struct ena_com_rx_ctx *ena_rx_ctx) +{ + struct ena_com_rx_buf_info *ena_buf = &ena_rx_ctx->ena_bufs[0]; + struct ena_eth_io_rx_cdesc_base *cdesc = NULL; + u16 q_depth = io_cq->q_depth; + u16 cdesc_idx = 0; + u16 nb_hw_desc; + u16 i = 0; + int rc; + + WARN(io_cq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type"); + + rc = ena_com_cdesc_rx_pkt_get(io_cq, &cdesc_idx, &nb_hw_desc); + if (unlikely(rc != 0)) + return -EFAULT; + + if (nb_hw_desc == 0) { + ena_rx_ctx->descs = nb_hw_desc; + return 0; + } + + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "Fetch rx packet: queue %d completed desc: %d\n", io_cq->qid, nb_hw_desc); + + if (unlikely(nb_hw_desc > ena_rx_ctx->max_bufs)) { + netdev_err(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "Too many RX cdescs (%d) > MAX(%d)\n", nb_hw_desc, ena_rx_ctx->max_bufs); + return -ENOSPC; + } + + cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx); + ena_rx_ctx->pkt_offset = cdesc->offset; + + do { + ena_buf[i].len = cdesc->length; + ena_buf[i].req_id = cdesc->req_id; + if (unlikely(ena_buf[i].req_id >= q_depth)) + return -EIO; + + if (++i >= nb_hw_desc) + break; + + cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx + i); + + } while (1); + + /* Update SQ head ptr */ + io_sq->next_to_comp += nb_hw_desc; + + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "[%s][QID#%d] Updating SQ head to: %d\n", __func__, io_sq->qid, + io_sq->next_to_comp); + + /* Get rx flags from the last pkt */ + ena_com_rx_set_flags(io_cq, ena_rx_ctx, cdesc); + + ena_rx_ctx->descs = nb_hw_desc; + + return 0; +} + +int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq, + struct ena_com_buf *ena_buf, + u16 req_id) +{ + struct ena_eth_io_rx_desc *desc; + + WARN(io_sq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type"); + + 
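+	/* Each receive buffer consumes exactly one descriptor, so a single
+	 * free slot in the submission queue is all that is needed here.
+	 */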
if (unlikely(!ena_com_sq_have_enough_space(io_sq, 1))) + return -ENOSPC; + + /* virt_addr allocation success is checked before calling this function */ + desc = get_sq_desc_regular_queue(io_sq); + + memset(desc, 0x0, sizeof(struct ena_eth_io_rx_desc)); + + desc->length = ena_buf->len; + + desc->ctrl = ENA_ETH_IO_RX_DESC_FIRST_MASK | + ENA_ETH_IO_RX_DESC_LAST_MASK | + ENA_ETH_IO_RX_DESC_COMP_REQ_MASK | + (io_sq->phase & ENA_ETH_IO_RX_DESC_PHASE_MASK); + + desc->req_id = req_id; + + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "[%s] Adding single RX desc, Queue: %u, req_id: %u\n", __func__, io_sq->qid, + req_id); + + desc->buff_addr_lo = (u32)ena_buf->paddr; + desc->buff_addr_hi = + ((ena_buf->paddr & GENMASK_ULL(io_sq->dma_addr_bits - 1, 32)) >> 32); + + return ena_com_sq_update_reqular_queue_tail(io_sq); +} + +bool ena_com_cq_empty(struct ena_com_io_cq *io_cq) +{ + struct ena_eth_io_rx_cdesc_base *cdesc; + + cdesc = ena_com_get_next_rx_cdesc(io_cq); + if (cdesc) + return false; + else + return true; +} diff --git a/drivers/amazon/net/ena/ena_eth_com.h b/drivers/amazon/net/ena/ena_eth_com.h new file mode 100644 index 0000000000000..121e2e212a1b5 --- /dev/null +++ b/drivers/amazon/net/ena/ena_eth_com.h @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef ENA_ETH_COM_H_ +#define ENA_ETH_COM_H_ + +#include "ena_com.h" + +/* we allow 2 DMA descriptors per LLQ entry */ +#define ENA_LLQ_ENTRY_DESC_CHUNK_SIZE (2 * sizeof(struct ena_eth_io_tx_desc)) +#define ENA_LLQ_HEADER (128UL - ENA_LLQ_ENTRY_DESC_CHUNK_SIZE) +#define ENA_LLQ_LARGE_HEADER (256UL - ENA_LLQ_ENTRY_DESC_CHUNK_SIZE) + +struct ena_com_tx_ctx { + struct ena_com_tx_meta ena_meta; + struct ena_com_buf *ena_bufs; + /* For LLQ, header buffer - pushed to the device mem space */ + void *push_header; + + enum ena_eth_io_l3_proto_index l3_proto; + enum ena_eth_io_l4_proto_index l4_proto; + u16 num_bufs; + u16 req_id; + /* For regular queue, indicate the size of the header + * For LLQ, indicate the size of the pushed buffer + */ + u16 header_len; + + u8 meta_valid; + u8 tso_enable; + u8 l3_csum_enable; + u8 l4_csum_enable; + u8 l4_csum_partial; + u8 df; /* Don't fragment */ +}; + +struct ena_com_rx_ctx { + struct ena_com_rx_buf_info *ena_bufs; + enum ena_eth_io_l3_proto_index l3_proto; + enum ena_eth_io_l4_proto_index l4_proto; + bool l3_csum_err; + bool l4_csum_err; + u8 l4_csum_checked; + /* fragmented packet */ + bool frag; + u32 hash; + u16 descs; + u16 max_bufs; + u8 pkt_offset; +}; + +int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx, + int *nb_hw_desc); + +int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, + struct ena_com_io_sq *io_sq, + struct ena_com_rx_ctx *ena_rx_ctx); + +int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq, + struct ena_com_buf *ena_buf, + u16 req_id); + +bool ena_com_cq_empty(struct ena_com_io_cq *io_cq); + +static inline void ena_com_unmask_intr(struct ena_com_io_cq *io_cq, + struct ena_eth_io_intr_reg *intr_reg) +{ + writel(intr_reg->intr_control, io_cq->unmask_reg); +} + +static inline u16 ena_com_used_q_entries(struct ena_com_io_sq *io_sq) +{ + return io_sq->tail - io_sq->next_to_comp; +} + +static inline int ena_com_free_q_entries(struct ena_com_io_sq *io_sq) +{ + return io_sq->q_depth - 1 - ena_com_used_q_entries(io_sq); +} + +/* Check if the submission queue has enough space to hold required_buffers */ +static inline bool 
ena_com_sq_have_enough_space(struct ena_com_io_sq *io_sq, + u16 required_buffers) +{ + int temp; + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) + return ena_com_free_q_entries(io_sq) >= required_buffers; + + /* This calculation doesn't need to be 100% accurate. So to reduce + * the calculation overhead just Subtract 2 lines from the free descs + * (one for the header line and one to compensate the devision + * down calculation. + */ + temp = required_buffers / io_sq->llq_info.descs_per_entry + 2; + + return ena_com_free_q_entries(io_sq) > temp; +} + +static inline bool ena_com_meta_desc_changed(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx) +{ + if (!ena_tx_ctx->meta_valid) + return false; + + return !!memcmp(&io_sq->cached_tx_meta, + &ena_tx_ctx->ena_meta, + sizeof(struct ena_com_tx_meta)); +} + +static inline bool is_llq_max_tx_burst_exists(struct ena_com_io_sq *io_sq) +{ + return (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) && + io_sq->llq_info.max_entries_in_tx_burst > 0; +} + +static inline bool ena_com_is_doorbell_needed(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx) +{ + struct ena_com_llq_info *llq_info; + int descs_after_first_entry; + int num_entries_needed = 1; + u16 num_descs; + + if (!is_llq_max_tx_burst_exists(io_sq)) + return false; + + llq_info = &io_sq->llq_info; + num_descs = ena_tx_ctx->num_bufs; + + if (llq_info->disable_meta_caching || + unlikely(ena_com_meta_desc_changed(io_sq, ena_tx_ctx))) + ++num_descs; + + if (num_descs > llq_info->descs_num_before_header) { + descs_after_first_entry = num_descs - llq_info->descs_num_before_header; + num_entries_needed += DIV_ROUND_UP(descs_after_first_entry, + llq_info->descs_per_entry); + } + + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Queue: %d num_descs: %d num_entries_needed: %d\n", io_sq->qid, num_descs, + num_entries_needed); + + return num_entries_needed > io_sq->entries_in_tx_burst_left; +} + +static inline int ena_com_write_sq_doorbell(struct ena_com_io_sq *io_sq) +{ + u16 max_entries_in_tx_burst = io_sq->llq_info.max_entries_in_tx_burst; + u16 tail = io_sq->tail; + + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Write submission queue doorbell for queue: %d tail: %d\n", io_sq->qid, tail); + + writel(tail, io_sq->db_addr); + + if (is_llq_max_tx_burst_exists(io_sq)) { + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Reset available entries in tx burst for queue %d to %d\n", io_sq->qid, + max_entries_in_tx_burst); + io_sq->entries_in_tx_burst_left = max_entries_in_tx_burst; + } + + return 0; +} + +static inline void ena_com_update_numa_node(struct ena_com_io_cq *io_cq, + u8 numa_node) +{ + struct ena_eth_io_numa_node_cfg_reg numa_cfg; + + if (!io_cq->numa_node_cfg_reg) + return; + + numa_cfg.numa_cfg = (numa_node & ENA_ETH_IO_NUMA_NODE_CFG_REG_NUMA_MASK) + | ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_MASK; + + writel(numa_cfg.numa_cfg, io_cq->numa_node_cfg_reg); +} + +static inline void ena_com_comp_ack(struct ena_com_io_sq *io_sq, u16 elem) +{ + io_sq->next_to_comp += elem; +} + +static inline void ena_com_cq_inc_head(struct ena_com_io_cq *io_cq) +{ + io_cq->head++; + + /* Switch phase bit in case of wrap around */ + if (unlikely((io_cq->head & (io_cq->q_depth - 1)) == 0)) + io_cq->phase ^= 1; +} + +static inline int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, + u16 *req_id) +{ + struct ena_com_dev *dev = ena_com_io_cq_to_ena_dev(io_cq); + u8 expected_phase, cdesc_phase; + struct ena_eth_io_tx_cdesc 
*cdesc; + u16 masked_head; + u8 flags; + + masked_head = io_cq->head & (io_cq->q_depth - 1); + expected_phase = io_cq->phase; + + cdesc = (struct ena_eth_io_tx_cdesc *) + ((uintptr_t)io_cq->cdesc_addr.virt_addr + + (masked_head * io_cq->cdesc_entry_size_in_bytes)); + + flags = READ_ONCE(cdesc->flags); + + /* When the current completion descriptor phase isn't the same as the + * expected, it mean that the device still didn't update + * this completion. + */ + cdesc_phase = flags & ENA_ETH_IO_TX_CDESC_PHASE_MASK; + if (cdesc_phase != expected_phase) + return -EAGAIN; + + if (unlikely((flags & ENA_ETH_IO_TX_CDESC_MBZ6_MASK) && + ena_com_get_cap(dev, ENA_ADMIN_CDESC_MBZ))) { + netdev_err(dev->net_device, "Corrupted TX descriptor on q_id: %d, req_id: %u\n", + io_cq->qid, cdesc->req_id); + return -EFAULT; + } + + dma_rmb(); + + *req_id = READ_ONCE(cdesc->req_id); + if (unlikely(*req_id >= io_cq->q_depth)) { + netdev_err(ena_com_io_cq_to_ena_dev(io_cq)->net_device, "Invalid req id %d\n", + cdesc->req_id); + return -EINVAL; + } + + ena_com_cq_inc_head(io_cq); + + return 0; +} + +#endif /* ENA_ETH_COM_H_ */ diff --git a/drivers/amazon/net/ena/ena_eth_io_defs.h b/drivers/amazon/net/ena/ena_eth_io_defs.h new file mode 100644 index 0000000000000..35b59ee9b0134 --- /dev/null +++ b/drivers/amazon/net/ena/ena_eth_io_defs.h @@ -0,0 +1,401 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ +#ifndef _ENA_ETH_IO_H_ +#define _ENA_ETH_IO_H_ + +enum ena_eth_io_l3_proto_index { + ENA_ETH_IO_L3_PROTO_UNKNOWN = 0, + ENA_ETH_IO_L3_PROTO_IPV4 = 8, + ENA_ETH_IO_L3_PROTO_IPV6 = 11, + ENA_ETH_IO_L3_PROTO_FCOE = 21, + ENA_ETH_IO_L3_PROTO_ROCE = 22, +}; + +enum ena_eth_io_l4_proto_index { + ENA_ETH_IO_L4_PROTO_UNKNOWN = 0, + ENA_ETH_IO_L4_PROTO_TCP = 12, + ENA_ETH_IO_L4_PROTO_UDP = 13, + ENA_ETH_IO_L4_PROTO_ROUTEABLE_ROCE = 23, +}; + +struct ena_eth_io_tx_desc { + /* 15:0 : length - Buffer length in bytes, must + * include any packet trailers that the ENA supposed + * to update like End-to-End CRC, Authentication GMAC + * etc. This length must not include the + * 'Push_Buffer' length. This length must not include + * the 4-byte added in the end for 802.3 Ethernet FCS + * 21:16 : req_id_hi - Request ID[15:10] + * 22 : reserved22 - MBZ + * 23 : meta_desc - MBZ + * 24 : phase + * 25 : reserved1 - MBZ + * 26 : first - Indicates first descriptor in + * transaction + * 27 : last - Indicates last descriptor in + * transaction + * 28 : comp_req - Indicates whether completion + * should be posted, after packet is transmitted. + * Valid only for first descriptor + * 30:29 : reserved29 - MBZ + * 31 : reserved31 - MBZ + */ + u32 len_ctrl; + + /* 3:0 : l3_proto_idx - L3 protocol. This field + * required when l3_csum_en,l3_csum or tso_en are set. + * 4 : DF - IPv4 DF, must be 0 if packet is IPv4 and + * DF flags of the IPv4 header is 0. Otherwise must + * be set to 1 + * 6:5 : reserved5 + * 7 : tso_en - Enable TSO, For TCP only. + * 12:8 : l4_proto_idx - L4 protocol. This field need + * to be set when l4_csum_en or tso_en are set. + * 13 : l3_csum_en - enable IPv4 header checksum. + * 14 : l4_csum_en - enable TCP/UDP checksum. + * 15 : ethernet_fcs_dis - when set, the controller + * will not append the 802.3 Ethernet Frame Check + * Sequence to the packet + * 16 : reserved16 + * 17 : l4_csum_partial - L4 partial checksum. 
when + * set to 0, the ENA calculates the L4 checksum, + * where the Destination Address required for the + * TCP/UDP pseudo-header is taken from the actual + * packet L3 header. when set to 1, the ENA doesn't + * calculate the sum of the pseudo-header, instead, + * the checksum field of the L4 is used instead. When + * TSO enabled, the checksum of the pseudo-header + * must not include the tcp length field. L4 partial + * checksum should be used for IPv6 packet that + * contains Routing Headers. + * 20:18 : reserved18 - MBZ + * 21 : reserved21 - MBZ + * 31:22 : req_id_lo - Request ID[9:0] + */ + u32 meta_ctrl; + + u32 buff_addr_lo; + + /* address high and header size + * 15:0 : addr_hi - Buffer Pointer[47:32] + * 23:16 : reserved16_w2 + * 31:24 : header_length - Header length. For Low + * Latency Queues, this fields indicates the number + * of bytes written to the headers' memory. For + * normal queues, if packet is TCP or UDP, and longer + * than max_header_size, then this field should be + * set to the sum of L4 header offset and L4 header + * size(without options), otherwise, this field + * should be set to 0. For both modes, this field + * must not exceed the max_header_size. + * max_header_size value is reported by the Max + * Queues Feature descriptor + */ + u32 buff_addr_hi_hdr_sz; +}; + +struct ena_eth_io_tx_meta_desc { + /* 9:0 : req_id_lo - Request ID[9:0] + * 11:10 : reserved10 - MBZ + * 12 : reserved12 - MBZ + * 13 : reserved13 - MBZ + * 14 : ext_valid - if set, offset fields in Word2 + * are valid Also MSS High in Word 0 and bits [31:24] + * in Word 3 + * 15 : reserved15 + * 19:16 : mss_hi + * 20 : eth_meta_type - 0: Tx Metadata Descriptor, 1: + * Extended Metadata Descriptor + * 21 : meta_store - Store extended metadata in queue + * cache + * 22 : reserved22 - MBZ + * 23 : meta_desc - MBO + * 24 : phase + * 25 : reserved25 - MBZ + * 26 : first - Indicates first descriptor in + * transaction + * 27 : last - Indicates last descriptor in + * transaction + * 28 : comp_req - Indicates whether completion + * should be posted, after packet is transmitted. + * Valid only for first descriptor + * 30:29 : reserved29 - MBZ + * 31 : reserved31 - MBZ + */ + u32 len_ctrl; + + /* 5:0 : req_id_hi + * 31:6 : reserved6 - MBZ + */ + u32 word1; + + /* 7:0 : l3_hdr_len + * 15:8 : l3_hdr_off + * 21:16 : l4_hdr_len_in_words - counts the L4 header + * length in words. there is an explicit assumption + * that L4 header appears right after L3 header and + * L4 offset is based on l3_hdr_off+l3_hdr_len + * 31:22 : mss_lo + */ + u32 word2; + + u32 reserved; +}; + +struct ena_eth_io_tx_cdesc { + /* Request ID[15:0] */ + u16 req_id; + + u8 status; + + /* flags + * 0 : phase + * 5:1 : reserved1 + * 7:6 : mbz6 - MBZ + */ + u8 flags; + + u16 sub_qid; + + u16 sq_head_idx; +}; + +struct ena_eth_io_rx_desc { + /* In bytes. 
0 means 64KB */ + u16 length; + + /* MBZ */ + u8 reserved2; + + /* 0 : phase + * 1 : reserved1 - MBZ + * 2 : first - Indicates first descriptor in + * transaction + * 3 : last - Indicates last descriptor in transaction + * 4 : comp_req + * 5 : reserved5 - MBO + * 7:6 : reserved6 - MBZ + */ + u8 ctrl; + + u16 req_id; + + /* MBZ */ + u16 reserved6; + + u32 buff_addr_lo; + + u16 buff_addr_hi; + + /* MBZ */ + u16 reserved16_w3; +}; + +/* 4-word format Note: all ethernet parsing information are valid only when + * last=1 + */ +struct ena_eth_io_rx_cdesc_base { + /* 4:0 : l3_proto_idx + * 6:5 : src_vlan_cnt + * 7 : mbz7 - MBZ + * 12:8 : l4_proto_idx + * 13 : l3_csum_err - when set, either the L3 + * checksum error detected, or, the controller didn't + * validate the checksum. This bit is valid only when + * l3_proto_idx indicates IPv4 packet + * 14 : l4_csum_err - when set, either the L4 + * checksum error detected, or, the controller didn't + * validate the checksum. This bit is valid only when + * l4_proto_idx indicates TCP/UDP packet, and, + * ipv4_frag is not set. This bit is valid only when + * l4_csum_checked below is set. + * 15 : ipv4_frag - Indicates IPv4 fragmented packet + * 16 : l4_csum_checked - L4 checksum was verified + * (could be OK or error), when cleared the status of + * checksum is unknown + * 17 : mbz17 - MBZ + * 23:18 : reserved18 + * 24 : phase + * 25 : l3_csum2 - second checksum engine result + * 26 : first - Indicates first descriptor in + * transaction + * 27 : last - Indicates last descriptor in + * transaction + * 29:28 : reserved28 + * 30 : buffer - 0: Metadata descriptor. 1: Buffer + * Descriptor was used + * 31 : reserved31 + */ + u32 status; + + u16 length; + + u16 req_id; + + /* 32-bit hash result */ + u32 hash; + + u16 sub_qid; + + u8 offset; + + u8 reserved; +}; + +/* 8-word format */ +struct ena_eth_io_rx_cdesc_ext { + struct ena_eth_io_rx_cdesc_base base; + + u32 buff_addr_lo; + + u16 buff_addr_hi; + + u16 reserved16; + + u32 reserved_w6; + + u32 reserved_w7; +}; + +struct ena_eth_io_intr_reg { + /* 14:0 : rx_intr_delay + * 29:15 : tx_intr_delay + * 30 : intr_unmask + * 31 : no_moderation_update - 0 - moderation + * updated, 1 - moderation not updated + */ + u32 intr_control; +}; + +struct ena_eth_io_numa_node_cfg_reg { + /* 7:0 : numa + * 30:8 : reserved + * 31 : enabled + */ + u32 numa_cfg; +}; + +/* tx_desc */ +#define ENA_ETH_IO_TX_DESC_LENGTH_MASK GENMASK(15, 0) +#define ENA_ETH_IO_TX_DESC_REQ_ID_HI_SHIFT 16 +#define ENA_ETH_IO_TX_DESC_REQ_ID_HI_MASK GENMASK(21, 16) +#define ENA_ETH_IO_TX_DESC_META_DESC_SHIFT 23 +#define ENA_ETH_IO_TX_DESC_META_DESC_MASK BIT(23) +#define ENA_ETH_IO_TX_DESC_PHASE_SHIFT 24 +#define ENA_ETH_IO_TX_DESC_PHASE_MASK BIT(24) +#define ENA_ETH_IO_TX_DESC_FIRST_SHIFT 26 +#define ENA_ETH_IO_TX_DESC_FIRST_MASK BIT(26) +#define ENA_ETH_IO_TX_DESC_LAST_SHIFT 27 +#define ENA_ETH_IO_TX_DESC_LAST_MASK BIT(27) +#define ENA_ETH_IO_TX_DESC_COMP_REQ_SHIFT 28 +#define ENA_ETH_IO_TX_DESC_COMP_REQ_MASK BIT(28) +#define ENA_ETH_IO_TX_DESC_L3_PROTO_IDX_MASK GENMASK(3, 0) +#define ENA_ETH_IO_TX_DESC_DF_SHIFT 4 +#define ENA_ETH_IO_TX_DESC_DF_MASK BIT(4) +#define ENA_ETH_IO_TX_DESC_TSO_EN_SHIFT 7 +#define ENA_ETH_IO_TX_DESC_TSO_EN_MASK BIT(7) +#define ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_SHIFT 8 +#define ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_MASK GENMASK(12, 8) +#define ENA_ETH_IO_TX_DESC_L3_CSUM_EN_SHIFT 13 +#define ENA_ETH_IO_TX_DESC_L3_CSUM_EN_MASK BIT(13) +#define ENA_ETH_IO_TX_DESC_L4_CSUM_EN_SHIFT 14 +#define ENA_ETH_IO_TX_DESC_L4_CSUM_EN_MASK BIT(14) 
+#define ENA_ETH_IO_TX_DESC_ETHERNET_FCS_DIS_SHIFT 15 +#define ENA_ETH_IO_TX_DESC_ETHERNET_FCS_DIS_MASK BIT(15) +#define ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_SHIFT 17 +#define ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_MASK BIT(17) +#define ENA_ETH_IO_TX_DESC_REQ_ID_LO_SHIFT 22 +#define ENA_ETH_IO_TX_DESC_REQ_ID_LO_MASK GENMASK(31, 22) +#define ENA_ETH_IO_TX_DESC_ADDR_HI_MASK GENMASK(15, 0) +#define ENA_ETH_IO_TX_DESC_HEADER_LENGTH_SHIFT 24 +#define ENA_ETH_IO_TX_DESC_HEADER_LENGTH_MASK GENMASK(31, 24) + +/* tx_meta_desc */ +#define ENA_ETH_IO_TX_META_DESC_REQ_ID_LO_MASK GENMASK(9, 0) +#define ENA_ETH_IO_TX_META_DESC_EXT_VALID_SHIFT 14 +#define ENA_ETH_IO_TX_META_DESC_EXT_VALID_MASK BIT(14) +#define ENA_ETH_IO_TX_META_DESC_MSS_HI_SHIFT 16 +#define ENA_ETH_IO_TX_META_DESC_MSS_HI_MASK GENMASK(19, 16) +#define ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_SHIFT 20 +#define ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_MASK BIT(20) +#define ENA_ETH_IO_TX_META_DESC_META_STORE_SHIFT 21 +#define ENA_ETH_IO_TX_META_DESC_META_STORE_MASK BIT(21) +#define ENA_ETH_IO_TX_META_DESC_META_DESC_SHIFT 23 +#define ENA_ETH_IO_TX_META_DESC_META_DESC_MASK BIT(23) +#define ENA_ETH_IO_TX_META_DESC_PHASE_SHIFT 24 +#define ENA_ETH_IO_TX_META_DESC_PHASE_MASK BIT(24) +#define ENA_ETH_IO_TX_META_DESC_FIRST_SHIFT 26 +#define ENA_ETH_IO_TX_META_DESC_FIRST_MASK BIT(26) +#define ENA_ETH_IO_TX_META_DESC_LAST_SHIFT 27 +#define ENA_ETH_IO_TX_META_DESC_LAST_MASK BIT(27) +#define ENA_ETH_IO_TX_META_DESC_COMP_REQ_SHIFT 28 +#define ENA_ETH_IO_TX_META_DESC_COMP_REQ_MASK BIT(28) +#define ENA_ETH_IO_TX_META_DESC_REQ_ID_HI_MASK GENMASK(5, 0) +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_LEN_MASK GENMASK(7, 0) +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_SHIFT 8 +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_MASK GENMASK(15, 8) +#define ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_SHIFT 16 +#define ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_MASK GENMASK(21, 16) +#define ENA_ETH_IO_TX_META_DESC_MSS_LO_SHIFT 22 +#define ENA_ETH_IO_TX_META_DESC_MSS_LO_MASK GENMASK(31, 22) + +/* tx_cdesc */ +#define ENA_ETH_IO_TX_CDESC_PHASE_MASK BIT(0) +#define ENA_ETH_IO_TX_CDESC_MBZ6_SHIFT 6 +#define ENA_ETH_IO_TX_CDESC_MBZ6_MASK GENMASK(7, 6) + +/* rx_desc */ +#define ENA_ETH_IO_RX_DESC_PHASE_MASK BIT(0) +#define ENA_ETH_IO_RX_DESC_FIRST_SHIFT 2 +#define ENA_ETH_IO_RX_DESC_FIRST_MASK BIT(2) +#define ENA_ETH_IO_RX_DESC_LAST_SHIFT 3 +#define ENA_ETH_IO_RX_DESC_LAST_MASK BIT(3) +#define ENA_ETH_IO_RX_DESC_COMP_REQ_SHIFT 4 +#define ENA_ETH_IO_RX_DESC_COMP_REQ_MASK BIT(4) + +/* rx_cdesc_base */ +#define ENA_ETH_IO_RX_CDESC_BASE_L3_PROTO_IDX_MASK GENMASK(4, 0) +#define ENA_ETH_IO_RX_CDESC_BASE_SRC_VLAN_CNT_SHIFT 5 +#define ENA_ETH_IO_RX_CDESC_BASE_SRC_VLAN_CNT_MASK GENMASK(6, 5) +#define ENA_ETH_IO_RX_CDESC_BASE_MBZ7_SHIFT 7 +#define ENA_ETH_IO_RX_CDESC_BASE_MBZ7_MASK BIT(7) +#define ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_SHIFT 8 +#define ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_MASK GENMASK(12, 8) +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_SHIFT 13 +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_MASK BIT(13) +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_SHIFT 14 +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_MASK BIT(14) +#define ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_SHIFT 15 +#define ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_MASK BIT(15) +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_SHIFT 16 +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_MASK BIT(16) +#define ENA_ETH_IO_RX_CDESC_BASE_MBZ17_SHIFT 17 +#define ENA_ETH_IO_RX_CDESC_BASE_MBZ17_MASK BIT(17) +#define ENA_ETH_IO_RX_CDESC_BASE_PHASE_SHIFT 
24 +#define ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK BIT(24) +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM2_SHIFT 25 +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM2_MASK BIT(25) +#define ENA_ETH_IO_RX_CDESC_BASE_FIRST_SHIFT 26 +#define ENA_ETH_IO_RX_CDESC_BASE_FIRST_MASK BIT(26) +#define ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT 27 +#define ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK BIT(27) +#define ENA_ETH_IO_RX_CDESC_BASE_BUFFER_SHIFT 30 +#define ENA_ETH_IO_RX_CDESC_BASE_BUFFER_MASK BIT(30) + +/* intr_reg */ +#define ENA_ETH_IO_INTR_REG_RX_INTR_DELAY_MASK GENMASK(14, 0) +#define ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_SHIFT 15 +#define ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_MASK GENMASK(29, 15) +#define ENA_ETH_IO_INTR_REG_INTR_UNMASK_SHIFT 30 +#define ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK BIT(30) +#define ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_SHIFT 31 +#define ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_MASK BIT(31) + +/* numa_node_cfg_reg */ +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_NUMA_MASK GENMASK(7, 0) +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_SHIFT 31 +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_MASK BIT(31) + +#endif /* _ENA_ETH_IO_H_ */ diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c new file mode 100644 index 0000000000000..d6147d6b980ae --- /dev/null +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -0,0 +1,1516 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include +#include +#include + +#include "ena_netdev.h" +#include "ena_xdp.h" +#include "ena_phc.h" + +struct ena_stats { + char name[ETH_GSTRING_LEN]; + int stat_offset; +}; + +struct ena_hw_metrics { + char name[ETH_GSTRING_LEN]; +}; + +#define ENA_STAT_ENA_COM_ADMIN_ENTRY(stat) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_com_stats_admin, stat) / sizeof(u64) \ +} + +#define ENA_STAT_ENA_COM_PHC_ENTRY(stat) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_com_stats_phc, stat) / sizeof(u64) \ +} + +#define ENA_STAT_ENTRY(stat, stat_type) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_stats_##stat_type, stat) / sizeof(u64) \ +} + +#define ENA_STAT_HW_ENTRY(stat, stat_type) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_admin_##stat_type, stat) / sizeof(u64) \ +} + +#define ENA_STAT_RX_ENTRY(stat) \ + ENA_STAT_ENTRY(stat, rx) + +#define ENA_STAT_TX_ENTRY(stat) \ + ENA_STAT_ENTRY(stat, tx) + +#define ENA_STAT_GLOBAL_ENTRY(stat) \ + ENA_STAT_ENTRY(stat, dev) + +#define ENA_STAT_ENI_ENTRY(stat) \ + ENA_STAT_HW_ENTRY(stat, eni_stats) + +#define ENA_STAT_ENA_SRD_ENTRY(stat) \ + ENA_STAT_HW_ENTRY(stat, ena_srd_stats) + +#define ENA_STAT_ENA_SRD_MODE_ENTRY(stat) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_admin_ena_srd_info, flags) / sizeof(u64) \ +} + +#define ENA_METRIC_ENI_ENTRY(stat) { \ + .name = #stat \ +} + +static const struct ena_stats ena_stats_global_strings[] = { + ENA_STAT_GLOBAL_ENTRY(total_resets), + ENA_STAT_GLOBAL_ENTRY(reset_fail), + ENA_STAT_GLOBAL_ENTRY(tx_timeout), + ENA_STAT_GLOBAL_ENTRY(wd_expired), + ENA_STAT_GLOBAL_ENTRY(admin_q_pause), + ENA_STAT_GLOBAL_ENTRY(bad_tx_req_id), + ENA_STAT_GLOBAL_ENTRY(bad_rx_req_id), + ENA_STAT_GLOBAL_ENTRY(bad_rx_desc_num), + ENA_STAT_GLOBAL_ENTRY(missing_intr), + ENA_STAT_GLOBAL_ENTRY(suspected_poll_starvation), + ENA_STAT_GLOBAL_ENTRY(missing_tx_cmpl), + ENA_STAT_GLOBAL_ENTRY(rx_desc_malformed), + ENA_STAT_GLOBAL_ENTRY(tx_desc_malformed), + ENA_STAT_GLOBAL_ENTRY(invalid_state), + 
ENA_STAT_GLOBAL_ENTRY(os_netdev_wd), + ENA_STAT_GLOBAL_ENTRY(missing_admin_interrupt), + ENA_STAT_GLOBAL_ENTRY(admin_to), + ENA_STAT_GLOBAL_ENTRY(device_request_reset), + ENA_STAT_GLOBAL_ENTRY(suspend), + ENA_STAT_GLOBAL_ENTRY(resume), + ENA_STAT_GLOBAL_ENTRY(interface_down), + ENA_STAT_GLOBAL_ENTRY(interface_up), +}; + +/* A partial list of hw stats. Used when admin command + * with type ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS is not supported + */ +static const struct ena_stats ena_stats_eni_strings[] = { + ENA_STAT_ENI_ENTRY(bw_in_allowance_exceeded), + ENA_STAT_ENI_ENTRY(bw_out_allowance_exceeded), + ENA_STAT_ENI_ENTRY(pps_allowance_exceeded), + ENA_STAT_ENI_ENTRY(conntrack_allowance_exceeded), + ENA_STAT_ENI_ENTRY(linklocal_allowance_exceeded), +}; + +static const struct ena_hw_metrics ena_hw_stats_strings[] = { + ENA_METRIC_ENI_ENTRY(bw_in_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(bw_out_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(pps_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(conntrack_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(linklocal_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(conntrack_allowance_available), +}; + +static const struct ena_stats ena_srd_info_strings[] = { + ENA_STAT_ENA_SRD_MODE_ENTRY(ena_srd_mode), + ENA_STAT_ENA_SRD_ENTRY(ena_srd_tx_pkts), + ENA_STAT_ENA_SRD_ENTRY(ena_srd_eligible_tx_pkts), + ENA_STAT_ENA_SRD_ENTRY(ena_srd_rx_pkts), + ENA_STAT_ENA_SRD_ENTRY(ena_srd_resource_utilization) +}; + +static const struct ena_stats ena_stats_tx_strings[] = { + ENA_STAT_TX_ENTRY(cnt), + ENA_STAT_TX_ENTRY(bytes), + ENA_STAT_TX_ENTRY(queue_stop), + ENA_STAT_TX_ENTRY(queue_wakeup), + ENA_STAT_TX_ENTRY(dma_mapping_err), + ENA_STAT_TX_ENTRY(linearize), + ENA_STAT_TX_ENTRY(linearize_failed), + ENA_STAT_TX_ENTRY(napi_comp), + ENA_STAT_TX_ENTRY(tx_poll), + ENA_STAT_TX_ENTRY(doorbells), + ENA_STAT_TX_ENTRY(prepare_ctx_err), + ENA_STAT_TX_ENTRY(bad_req_id), + ENA_STAT_TX_ENTRY(llq_buffer_copy), + ENA_STAT_TX_ENTRY(missed_tx), + ENA_STAT_TX_ENTRY(unmask_interrupt), +#ifdef ENA_AF_XDP_SUPPORT + ENA_STAT_TX_ENTRY(xsk_need_wakeup_set), + ENA_STAT_TX_ENTRY(xsk_wakeup_request), +#endif /* ENA_AF_XDP_SUPPORT */ +}; + +static const struct ena_stats ena_stats_rx_strings[] = { + ENA_STAT_RX_ENTRY(cnt), + ENA_STAT_RX_ENTRY(bytes), + ENA_STAT_RX_ENTRY(rx_copybreak_pkt), + ENA_STAT_RX_ENTRY(csum_good), + ENA_STAT_RX_ENTRY(refil_partial), + ENA_STAT_RX_ENTRY(csum_bad), + ENA_STAT_RX_ENTRY(page_alloc_fail), + ENA_STAT_RX_ENTRY(skb_alloc_fail), + ENA_STAT_RX_ENTRY(dma_mapping_err), + ENA_STAT_RX_ENTRY(bad_desc_num), +#ifdef ENA_BUSY_POLL_SUPPORT + ENA_STAT_RX_ENTRY(bp_yield), + ENA_STAT_RX_ENTRY(bp_missed), + ENA_STAT_RX_ENTRY(bp_cleaned), +#endif + ENA_STAT_RX_ENTRY(bad_req_id), + ENA_STAT_RX_ENTRY(empty_rx_ring), + ENA_STAT_RX_ENTRY(csum_unchecked), +#ifdef ENA_XDP_SUPPORT + ENA_STAT_RX_ENTRY(xdp_aborted), + ENA_STAT_RX_ENTRY(xdp_drop), + ENA_STAT_RX_ENTRY(xdp_pass), + ENA_STAT_RX_ENTRY(xdp_tx), + ENA_STAT_RX_ENTRY(xdp_invalid), + ENA_STAT_RX_ENTRY(xdp_redirect), +#endif + ENA_STAT_RX_ENTRY(lpc_warm_up), + ENA_STAT_RX_ENTRY(lpc_full), + ENA_STAT_RX_ENTRY(lpc_wrong_numa), +#ifdef ENA_AF_XDP_SUPPORT + ENA_STAT_RX_ENTRY(xsk_need_wakeup_set), + ENA_STAT_RX_ENTRY(zc_queue_pkt_copy), +#endif /* ENA_AF_XDP_SUPPORT */ +}; + +static const struct ena_stats ena_stats_ena_com_admin_strings[] = { + ENA_STAT_ENA_COM_ADMIN_ENTRY(aborted_cmd), + ENA_STAT_ENA_COM_ADMIN_ENTRY(submitted_cmd), + ENA_STAT_ENA_COM_ADMIN_ENTRY(completed_cmd), + ENA_STAT_ENA_COM_ADMIN_ENTRY(out_of_space), + 
ENA_STAT_ENA_COM_ADMIN_ENTRY(no_completion), +}; + +static const struct ena_stats ena_stats_ena_com_phc_strings[] = { + ENA_STAT_ENA_COM_PHC_ENTRY(phc_cnt), + ENA_STAT_ENA_COM_PHC_ENTRY(phc_exp), + ENA_STAT_ENA_COM_PHC_ENTRY(phc_skp), + ENA_STAT_ENA_COM_PHC_ENTRY(phc_err), +}; + +#define ENA_STATS_ARRAY_GLOBAL ARRAY_SIZE(ena_stats_global_strings) +#define ENA_STATS_ARRAY_TX ARRAY_SIZE(ena_stats_tx_strings) +#define ENA_STATS_ARRAY_RX ARRAY_SIZE(ena_stats_rx_strings) +#define ENA_STATS_ARRAY_ENA_COM_ADMIN ARRAY_SIZE(ena_stats_ena_com_admin_strings) +#define ENA_STATS_ARRAY_ENA_COM_PHC ARRAY_SIZE(ena_stats_ena_com_phc_strings) +#define ENA_STATS_ARRAY_ENI ARRAY_SIZE(ena_stats_eni_strings) +#define ENA_STATS_ARRAY_ENA_SRD ARRAY_SIZE(ena_srd_info_strings) +#define ENA_METRICS_ARRAY_ENI ARRAY_SIZE(ena_hw_stats_strings) + +static const char ena_priv_flags_strings[][ETH_GSTRING_LEN] = { +#define ENA_PRIV_FLAGS_LPC BIT(0) + "local_page_cache", +}; + +#define ENA_PRIV_FLAGS_NR ARRAY_SIZE(ena_priv_flags_strings) + +static void ena_safe_update_stat(u64 *src, u64 *dst, + struct u64_stats_sync *syncp) +{ + unsigned int start; + + do { + start = ena_u64_stats_fetch_begin(syncp); + *(dst) = *src; + } while (ena_u64_stats_fetch_retry(syncp, start)); +} + + +static void ena_metrics_stats(struct ena_adapter *adapter, u64 **data) +{ + struct ena_com_dev *dev = adapter->ena_dev; + const struct ena_stats *ena_stats; + u64 *ptr; + int i; + + if (ena_com_get_cap(dev, ENA_ADMIN_CUSTOMER_METRICS)) { + u32 supported_metrics_count; + int len; + + supported_metrics_count = ena_com_get_customer_metric_count(dev); + len = supported_metrics_count * sizeof(u64); + + /* Fill the data buffer, and advance its pointer */ + ena_com_get_customer_metrics(adapter->ena_dev, (char *)(*data), len); + (*data) += supported_metrics_count; + + } else if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { + ena_com_get_eni_stats(adapter->ena_dev, &adapter->eni_stats); + /* Updating regardless of rc - once we told ethtool how many stats we have + * it will print that much stats. 
We can't leave holes in the stats + */ + for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { + ena_stats = &ena_stats_eni_strings[i]; + + ptr = (u64 *)&adapter->eni_stats + + ena_stats->stat_offset; + + ena_safe_update_stat(ptr, (*data)++, &adapter->syncp); + } + } + + if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { + ena_com_get_ena_srd_info(adapter->ena_dev, &adapter->ena_srd_info); + /* Get ENA SRD mode */ + ptr = (u64 *)&adapter->ena_srd_info; + ena_safe_update_stat(ptr, (*data)++, &adapter->syncp); + for (i = 1; i < ENA_STATS_ARRAY_ENA_SRD; i++) { + ena_stats = &ena_srd_info_strings[i]; + /* Wrapped within an outer struct - need to accommodate an + * additional offset of the ENA SRD mode that was already processed + */ + ptr = (u64 *)&adapter->ena_srd_info + + ena_stats->stat_offset + 1; + + ena_safe_update_stat(ptr, (*data)++, &adapter->syncp); + } + } +} + +static void ena_queue_stats(struct ena_adapter *adapter, u64 **data) +{ + const struct ena_stats *ena_stats; + struct ena_ring *ring; + + u64 *ptr; + int i, j; + + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { + /* Tx stats */ + ring = &adapter->tx_ring[i]; + + for (j = 0; j < ENA_STATS_ARRAY_TX; j++) { + ena_stats = &ena_stats_tx_strings[j]; + + ptr = (u64 *)&ring->tx_stats + ena_stats->stat_offset; + + ena_safe_update_stat(ptr, (*data)++, &ring->syncp); + } + /* XDP TX queues don't have a RX queue counterpart */ + if (!ENA_IS_XDP_INDEX(adapter, i)) { + /* Rx stats */ + ring = &adapter->rx_ring[i]; + + for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { + ena_stats = &ena_stats_rx_strings[j]; + + ptr = (u64 *)&ring->rx_stats + + ena_stats->stat_offset; + + ena_safe_update_stat(ptr, (*data)++, &ring->syncp); + } + } + } +} + +static void ena_com_admin_queue_stats(struct ena_adapter *adapter, u64 **data) +{ + const struct ena_stats *ena_stats; + u64 *ptr; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_ADMIN; i++) { + ena_stats = &ena_stats_ena_com_admin_strings[i]; + + ptr = (u64 *)&adapter->ena_dev->admin_queue.stats + + ena_stats->stat_offset; + + *(*data)++ = *ptr; + } +} + +static void ena_com_phc_stats(struct ena_adapter *adapter, u64 **data) +{ + const struct ena_stats *ena_stats; + u64 *ptr; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_PHC; i++) { + ena_stats = &ena_stats_ena_com_phc_strings[i]; + ptr = (u64 *)&adapter->ena_dev->phc.stats + ena_stats->stat_offset; + *(*data)++ = *ptr; + } +} + +static void ena_get_stats(struct ena_adapter *adapter, + u64 *data, + bool hw_stats_needed) +{ + const struct ena_stats *ena_stats; + u64 *ptr; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) { + ena_stats = &ena_stats_global_strings[i]; + + ptr = (u64 *)&adapter->dev_stats + ena_stats->stat_offset; + + ena_safe_update_stat(ptr, data++, &adapter->syncp); + } + + if (hw_stats_needed) + ena_metrics_stats(adapter, &data); + + ena_queue_stats(adapter, &data); + ena_com_admin_queue_stats(adapter, &data); + + if (ena_phc_is_active(adapter)) + ena_com_phc_stats(adapter, &data); +} + +static void ena_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *stats, + u64 *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + ena_get_stats(adapter, data, true); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) +static int ena_get_ts_info(struct net_device *netdev, struct ethtool_ts_info *info) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_RX_SOFTWARE | + 
SOF_TIMESTAMPING_SOFTWARE; + + info->phc_index = ena_phc_get_index(adapter); + + return 0; +} + +#endif +static int ena_get_sw_stats_count(struct ena_adapter *adapter) +{ + int count = adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) + + adapter->xdp_num_queues * ENA_STATS_ARRAY_TX + + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM_ADMIN; + + if (ena_phc_is_active(adapter)) + count += ENA_STATS_ARRAY_ENA_COM_PHC; + + return count; +} + +static int ena_get_hw_stats_count(struct ena_adapter *adapter) +{ + struct ena_com_dev *dev = adapter->ena_dev; + int count = ENA_STATS_ARRAY_ENA_SRD * + ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO); + + if (ena_com_get_cap(dev, ENA_ADMIN_CUSTOMER_METRICS)) + count += ena_com_get_customer_metric_count(dev); + else if (ena_com_get_cap(dev, ENA_ADMIN_ENI_STATS)) + count += ENA_STATS_ARRAY_ENI; + + return count; +} + +int ena_get_sset_count(struct net_device *netdev, int sset) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + switch (sset) { + case ETH_SS_STATS: + return ena_get_sw_stats_count(adapter) + + ena_get_hw_stats_count(adapter); + case ETH_SS_PRIV_FLAGS: + return ENA_PRIV_FLAGS_NR; + } + + return -EOPNOTSUPP; +} + +static void ena_metrics_stats_strings(struct ena_adapter *adapter, u8 **data) +{ + struct ena_com_dev *dev = adapter->ena_dev; + const struct ena_hw_metrics *ena_metrics; + const struct ena_stats *ena_stats; + int i; + + if (ena_com_get_cap(dev, ENA_ADMIN_CUSTOMER_METRICS)) { + for (i = 0; i < ENA_METRICS_ARRAY_ENI; i++) { + if (ena_com_get_customer_metric_support(dev, i)) { + ena_metrics = &ena_hw_stats_strings[i]; + ethtool_puts(data, ena_metrics->name); + } + } + } else if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { + for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { + ena_stats = &ena_stats_eni_strings[i]; + ethtool_puts(data, ena_stats->name); + } + } + + if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { + for (i = 0; i < ENA_STATS_ARRAY_ENA_SRD; i++) { + ena_stats = &ena_srd_info_strings[i]; + ethtool_puts(data, ena_stats->name); + } + } +} + +static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) +{ + const struct ena_stats *ena_stats; + bool is_xdp; + int i, j; + + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { + is_xdp = ENA_IS_XDP_INDEX(adapter, i); + /* Tx stats */ + for (j = 0; j < ENA_STATS_ARRAY_TX; j++) { + ena_stats = &ena_stats_tx_strings[j]; + + ethtool_sprintf(data, + "queue_%u_%s_%s", i, + is_xdp ? 
"xdp_tx" : "tx", + ena_stats->name); + } + + /* In XDP there isn't an RX queue counterpart */ + if (is_xdp) + continue; + + for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { + ena_stats = &ena_stats_rx_strings[j]; + + ethtool_sprintf(data, "queue_%u_rx_%s", i, ena_stats->name); + } + } +} + +static void ena_com_admin_strings(u8 **data) +{ + const struct ena_stats *ena_stats; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_ADMIN; i++) { + ena_stats = &ena_stats_ena_com_admin_strings[i]; + + ethtool_sprintf(data, + "ena_admin_q_%s", ena_stats->name); + } +} + +static void ena_com_phc_strings(u8 **data) +{ + const struct ena_stats *ena_stats; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_PHC; i++) { + ena_stats = &ena_stats_ena_com_phc_strings[i]; + ethtool_puts(data, ena_stats->name); + } +} + +static void ena_get_strings(struct ena_adapter *adapter, + u8 *data, + bool hw_stats_needed) +{ + const struct ena_stats *ena_stats; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) { + ena_stats = &ena_stats_global_strings[i]; + ethtool_puts(&data, ena_stats->name); + } + + if (hw_stats_needed) + ena_metrics_stats_strings(adapter, &data); + + ena_queue_strings(adapter, &data); + ena_com_admin_strings(&data); + + if (ena_phc_is_active(adapter)) + ena_com_phc_strings(&data); +} + +static void ena_get_ethtool_strings(struct net_device *netdev, + u32 sset, + u8 *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + switch (sset) { + case ETH_SS_STATS: + ena_get_strings(adapter, data, true); + break; + case ETH_SS_PRIV_FLAGS: + memcpy(data, ena_priv_flags_strings, sizeof(ena_priv_flags_strings)); + break; + } +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) +static int ena_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *link_ksettings) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct ena_admin_get_feature_link_desc *link; + struct ena_admin_get_feat_resp feat_resp; + int rc; + + rc = ena_com_get_link_params(ena_dev, &feat_resp); + if (rc) + return rc; + + link = &feat_resp.u.link; + link_ksettings->base.speed = link->speed; + + if (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK) { + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, Autoneg); + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, Autoneg); + } + + link_ksettings->base.autoneg = + (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK) ? 
+ AUTONEG_ENABLE : AUTONEG_DISABLE; + + link_ksettings->base.duplex = DUPLEX_FULL; + + return 0; +} + +#else +static int ena_get_settings(struct net_device *netdev, + struct ethtool_cmd *ecmd) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct ena_admin_get_feature_link_desc *link; + struct ena_admin_get_feat_resp feat_resp; + int rc; + + rc = ena_com_get_link_params(ena_dev, &feat_resp); + if (rc) + return rc; + + link = &feat_resp.u.link; + + ethtool_cmd_speed_set(ecmd, link->speed); + + if (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_MASK) + ecmd->duplex = DUPLEX_FULL; + else + ecmd->duplex = DUPLEX_HALF; + + if (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK) + ecmd->autoneg = AUTONEG_ENABLE; + else + ecmd->autoneg = AUTONEG_DISABLE; + + return 0; +} + +#endif +static int ena_get_coalesce(struct net_device *net_dev, +#ifdef ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED + struct ethtool_coalesce *coalesce, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +#else + struct ethtool_coalesce *coalesce) +#endif +{ + struct ena_adapter *adapter = netdev_priv(net_dev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + + if (!ena_com_interrupt_moderation_supported(ena_dev)) + return -EOPNOTSUPP; + + coalesce->tx_coalesce_usecs = + ena_com_get_nonadaptive_moderation_interval_tx(ena_dev) * + ena_dev->intr_delay_resolution; + + coalesce->rx_coalesce_usecs = + ena_com_get_nonadaptive_moderation_interval_rx(ena_dev) + * ena_dev->intr_delay_resolution; + + coalesce->use_adaptive_rx_coalesce = + ena_com_get_adaptive_moderation_enabled(ena_dev); + + return 0; +} + +static void ena_update_tx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter) +{ + unsigned int val; + int i; + + val = ena_com_get_nonadaptive_moderation_interval_tx(adapter->ena_dev); + + for (i = 0; i < adapter->num_io_queues; i++) { + adapter->tx_ring[i].interrupt_interval_changed = + adapter->tx_ring[i].interrupt_interval != val; + adapter->tx_ring[i].interrupt_interval = val; + } +} + +static void ena_update_rx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter) +{ + unsigned int val; + int i; + + val = ena_com_get_nonadaptive_moderation_interval_rx(adapter->ena_dev); + + for (i = 0; i < adapter->num_io_queues; i++) { + adapter->rx_ring[i].interrupt_interval_changed = + adapter->rx_ring[i].interrupt_interval != val; + adapter->rx_ring[i].interrupt_interval = val; + } +} + +static int ena_set_coalesce(struct net_device *net_dev, +#ifdef ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED + struct ethtool_coalesce *coalesce, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +#else + struct ethtool_coalesce *coalesce) +#endif +{ + struct ena_adapter *adapter = netdev_priv(net_dev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc; + + if (!ena_com_interrupt_moderation_supported(ena_dev)) + return -EOPNOTSUPP; + + rc = ena_com_update_nonadaptive_moderation_interval_tx(ena_dev, + coalesce->tx_coalesce_usecs); + if (rc) + return rc; + + ena_update_tx_rings_nonadaptive_intr_moderation(adapter); + + rc = ena_com_update_nonadaptive_moderation_interval_rx(ena_dev, + coalesce->rx_coalesce_usecs); + if (rc) + return rc; + + ena_update_rx_rings_nonadaptive_intr_moderation(adapter); + + if (coalesce->use_adaptive_rx_coalesce && + !ena_com_get_adaptive_moderation_enabled(ena_dev)) + ena_com_enable_adaptive_moderation(ena_dev); + + if (!coalesce->use_adaptive_rx_coalesce && + 
ena_com_get_adaptive_moderation_enabled(ena_dev)) + ena_com_disable_adaptive_moderation(ena_dev); + + return 0; +} + +static u32 ena_get_msglevel(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + return adapter->msg_enable; +} + +static void ena_set_msglevel(struct net_device *netdev, u32 value) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + adapter->msg_enable = value; +} + +static void ena_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + struct ena_adapter *adapter = netdev_priv(dev); + ssize_t ret = 0; + + ret = strscpy(info->driver, DRV_MODULE_NAME, sizeof(info->driver)); + if (ret < 0) + netif_dbg(adapter, drv, dev, + "module name will be truncated, status = %zd\n", ret); + + ret = strscpy(info->version, DRV_MODULE_GENERATION, sizeof(info->version)); + if (ret < 0) + netif_dbg(adapter, drv, dev, + "module version will be truncated, status = %zd\n", ret); + + ret = strscpy(info->bus_info, pci_name(adapter->pdev), + sizeof(info->bus_info)); + if (ret < 0) + netif_dbg(adapter, drv, dev, + "bus info will be truncated, status = %zd\n", ret); + + info->n_priv_flags = ENA_PRIV_FLAGS_NR; +} + +static void ena_get_ringparam(struct net_device *netdev, +#ifdef ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) +#else + struct ethtool_ringparam *ring) +#endif +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + ring->tx_max_pending = adapter->max_tx_ring_size; + ring->rx_max_pending = adapter->max_rx_ring_size; +#ifdef ENA_LARGE_LLQ_ETHTOOL + if (adapter->ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + bool large_llq_supported = adapter->large_llq_header_supported; + + kernel_ring->tx_push = true; + kernel_ring->tx_push_buf_len = adapter->ena_dev->tx_max_header_size; + if (large_llq_supported) + kernel_ring->tx_push_buf_max_len = ENA_LLQ_LARGE_HEADER; + else + kernel_ring->tx_push_buf_max_len = ENA_LLQ_HEADER; + } else { + kernel_ring->tx_push = false; + kernel_ring->tx_push_buf_max_len = 0; + kernel_ring->tx_push_buf_len = 0; + } + +#endif + ring->tx_pending = adapter->tx_ring[0].ring_size; + ring->rx_pending = adapter->rx_ring[0].ring_size; +} + +static int ena_set_ringparam(struct net_device *netdev, +#ifdef ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) +#else + struct ethtool_ringparam *ring) +#endif +{ + struct ena_adapter *adapter = netdev_priv(netdev); + u32 new_tx_size, new_rx_size, new_tx_push_buf_len; + bool changed = false; + + if (ring->rx_mini_pending || ring->rx_jumbo_pending) + return -EINVAL; + + new_tx_size = clamp_val(ring->tx_pending, ENA_MIN_RING_SIZE, + adapter->max_tx_ring_size); + new_tx_size = rounddown_pow_of_two(new_tx_size); + + new_rx_size = clamp_val(ring->rx_pending, ENA_MIN_RING_SIZE, + adapter->max_rx_ring_size); + new_rx_size = rounddown_pow_of_two(new_rx_size); + + changed |= new_tx_size != adapter->requested_tx_ring_size || + new_rx_size != adapter->requested_rx_ring_size; + + /* This value is ignored if LLQ is not supported */ + new_tx_push_buf_len = adapter->ena_dev->tx_max_header_size; +#ifdef ENA_LARGE_LLQ_ETHTOOL + + if ((adapter->ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) != + kernel_ring->tx_push) { + NL_SET_ERR_MSG_MOD(extack, "Push mode state cannot be modified"); + return -EINVAL; + } + + /* Validate that the push buffer is supported on the 
underlying device */ + if (kernel_ring->tx_push_buf_len) { + enum ena_admin_placement_policy_type placement; + + new_tx_push_buf_len = kernel_ring->tx_push_buf_len; + + placement = adapter->ena_dev->tx_mem_queue_type; + if (placement == ENA_ADMIN_PLACEMENT_POLICY_HOST) + return -EOPNOTSUPP; + + if (new_tx_push_buf_len != ENA_LLQ_HEADER && + new_tx_push_buf_len != ENA_LLQ_LARGE_HEADER) { + bool large_llq_sup = adapter->large_llq_header_supported; + char large_llq_size_str[40]; + + snprintf(large_llq_size_str, 40, ", %lu", ENA_LLQ_LARGE_HEADER); + + NL_SET_ERR_MSG_FMT_MOD(extack, + "Supported tx push buff values: [%lu%s]", + ENA_LLQ_HEADER, + large_llq_sup ? large_llq_size_str : ""); + + return -EINVAL; + } + + changed |= new_tx_push_buf_len != adapter->ena_dev->tx_max_header_size; + } + +#endif + if (!changed) + return 0; + + return ena_update_queue_params(adapter, new_tx_size, new_rx_size, + new_tx_push_buf_len); +} + +#ifdef ETHTOOL_GRXRINGS +static u32 ena_flow_hash_to_flow_type(u16 hash_fields) +{ + u32 data = 0; + + if (hash_fields & ENA_ADMIN_RSS_L2_DA) + data |= RXH_L2DA; + + if (hash_fields & ENA_ADMIN_RSS_L3_DA) + data |= RXH_IP_DST; + + if (hash_fields & ENA_ADMIN_RSS_L3_SA) + data |= RXH_IP_SRC; + + if (hash_fields & ENA_ADMIN_RSS_L4_DP) + data |= RXH_L4_B_2_3; + + if (hash_fields & ENA_ADMIN_RSS_L4_SP) + data |= RXH_L4_B_0_1; + + return data; +} + +static u16 ena_flow_data_to_flow_hash(u32 hash_fields) +{ + u16 data = 0; + + if (hash_fields & RXH_L2DA) + data |= ENA_ADMIN_RSS_L2_DA; + + if (hash_fields & RXH_IP_DST) + data |= ENA_ADMIN_RSS_L3_DA; + + if (hash_fields & RXH_IP_SRC) + data |= ENA_ADMIN_RSS_L3_SA; + + if (hash_fields & RXH_L4_B_2_3) + data |= ENA_ADMIN_RSS_L4_DP; + + if (hash_fields & RXH_L4_B_0_1) + data |= ENA_ADMIN_RSS_L4_SP; + + return data; +} + +static int ena_get_rss_hash(struct ena_com_dev *ena_dev, + struct ethtool_rxnfc *cmd) +{ + enum ena_admin_flow_hash_proto proto; + u16 hash_fields; + int rc; + + cmd->data = 0; + + switch (cmd->flow_type) { + case TCP_V4_FLOW: + proto = ENA_ADMIN_RSS_TCP4; + break; + case UDP_V4_FLOW: + proto = ENA_ADMIN_RSS_UDP4; + break; + case TCP_V6_FLOW: + proto = ENA_ADMIN_RSS_TCP6; + break; + case UDP_V6_FLOW: + proto = ENA_ADMIN_RSS_UDP6; + break; + case IPV4_FLOW: + proto = ENA_ADMIN_RSS_IP4; + break; + case IPV6_FLOW: + proto = ENA_ADMIN_RSS_IP6; + break; + case ETHER_FLOW: + proto = ENA_ADMIN_RSS_NOT_IP; + break; + case AH_V4_FLOW: + case ESP_V4_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + return -EOPNOTSUPP; + default: + return -EINVAL; + } + + rc = ena_com_get_hash_ctrl(ena_dev, proto, &hash_fields); + if (unlikely(rc)) + return rc; + + cmd->data = ena_flow_hash_to_flow_type(hash_fields); + + return 0; +} + +static int ena_set_rss_hash(struct ena_com_dev *ena_dev, + struct ethtool_rxnfc *cmd) +{ + enum ena_admin_flow_hash_proto proto; + u16 hash_fields; + + switch (cmd->flow_type) { + case TCP_V4_FLOW: + proto = ENA_ADMIN_RSS_TCP4; + break; + case UDP_V4_FLOW: + proto = ENA_ADMIN_RSS_UDP4; + break; + case TCP_V6_FLOW: + proto = ENA_ADMIN_RSS_TCP6; + break; + case UDP_V6_FLOW: + proto = ENA_ADMIN_RSS_UDP6; + break; + case IPV4_FLOW: + proto = ENA_ADMIN_RSS_IP4; + break; + case IPV6_FLOW: + proto = ENA_ADMIN_RSS_IP6; + break; + case ETHER_FLOW: + proto = ENA_ADMIN_RSS_NOT_IP; + break; + case AH_V4_FLOW: + case ESP_V4_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + return -EOPNOTSUPP; + default: + return -EINVAL; + } + + hash_fields = 
ena_flow_data_to_flow_hash(cmd->data); + + return ena_com_fill_hash_ctrl(ena_dev, proto, hash_fields); +} + +static int ena_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc = 0; + + switch (info->cmd) { + case ETHTOOL_SRXFH: + rc = ena_set_rss_hash(adapter->ena_dev, info); + break; + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + default: + netif_err(adapter, drv, netdev, + "Command parameter %d is not supported\n", info->cmd); + rc = -EOPNOTSUPP; + } + + return rc; +} + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(3, 2, 0) +static int ena_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info, + void *rules) +#else +static int ena_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info, + u32 *rules) +#endif +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc = 0; + + switch (info->cmd) { + case ETHTOOL_GRXRINGS: + info->data = adapter->num_io_queues; + rc = 0; + break; + case ETHTOOL_GRXFH: + rc = ena_get_rss_hash(adapter->ena_dev, info); + break; + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + default: + netif_err(adapter, drv, netdev, + "Command parameter %d is not supported\n", info->cmd); + rc = -EOPNOTSUPP; + } + + return rc; +} +#endif /* ETHTOOL_GRXRINGS */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) +static u32 ena_get_rxfh_indir_size(struct net_device *netdev) +{ + return ENA_RX_RSS_TABLE_SIZE; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +static u32 ena_get_rxfh_key_size(struct net_device *netdev) +{ + return ENA_HASH_KEY_SIZE; +} +#endif + +static int ena_indirection_table_set(struct ena_adapter *adapter, + const u32 *indir) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int i, rc; + + for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) { + rc = ena_com_indirect_table_fill_entry(ena_dev, + i, + ENA_IO_RXQ_IDX(indir[i])); + if (unlikely(rc)) { + netif_err(adapter, drv, adapter->netdev, + "Cannot fill indirect table (index is too large)\n"); + return rc; + } + } + + rc = ena_com_indirect_table_set(ena_dev); + if (rc) { + netif_err(adapter, drv, adapter->netdev, + "Cannot set indirect table\n"); + return rc == -EPERM ? -EOPNOTSUPP : rc; + } + return rc; +} + +static int ena_indirection_table_get(struct ena_adapter *adapter, u32 *indir) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int i, rc; + + if (!indir) + return 0; + + rc = ena_com_indirect_table_get(ena_dev, indir); + if (unlikely(rc)) + return rc; + + /* Our internal representation of the indices is: even indices + * for Tx and uneven indices for Rx. 
We need to convert the Rx + * indices to be consecutive + */ + for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) + indir[i] = ENA_IO_RXQ_IDX_TO_COMBINED_IDX(indir[i]); + + return rc; +} + +#ifdef ENA_HAVE_ETHTOOL_RXFH_PARAM +static int ena_get_rxfh(struct net_device *netdev, + struct ethtool_rxfh_param *rxfh) +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) +static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, + u8 *hfunc) +#endif /* ENA_HAVE_ETHTOOL_RXFH_PARAM */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + enum ena_admin_hash_functions ena_func; +#ifdef ENA_HAVE_ETHTOOL_RXFH_PARAM + u32 *indir = rxfh->indir; + u8 *hfunc = &rxfh->hfunc; + u8 *key = rxfh->key; +#endif /* ENA_HAVE_ETHTOOL_RXFH_PARAM */ + u8 func; + int rc; + + rc = ena_indirection_table_get(adapter, indir); + if (unlikely(rc)) + return rc; + + /* We call this function in order to check if the device + * supports getting/setting the hash function. + */ + rc = ena_com_get_hash_function(adapter->ena_dev, &ena_func); + if (rc) { + if (rc == -EOPNOTSUPP) + rc = 0; + + return rc; + } + + rc = ena_com_get_hash_key(adapter->ena_dev, key); + if (rc) + return rc; + + switch (ena_func) { + case ENA_ADMIN_TOEPLITZ: + func = ETH_RSS_HASH_TOP; + break; + case ENA_ADMIN_CRC32: + func = ETH_RSS_HASH_CRC32; + break; + default: + netif_err(adapter, drv, netdev, + "Command parameter is not supported\n"); + return -EOPNOTSUPP; + } + +#ifdef ENA_HAVE_ETHTOOL_RXFH_PARAM + *hfunc = func; +#else + if (hfunc) + *hfunc = func; +#endif /* ENA_HAVE_ETHTOOL_RXFH_PARAM */ + + return 0; +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + enum ena_admin_hash_functions ena_func; + int rc; + + rc = ena_indirection_table_get(adapter, indir); + if (unlikely(rc)) + return rc; + + /* We call this function in order to check if the device + * supports getting/setting the hash function. 
+ */ + rc = ena_com_get_hash_function(adapter->ena_dev, &ena_func); + if (rc) { + if (rc == -EOPNOTSUPP) + rc = 0; + + return rc; + } + + rc = ena_com_get_hash_key(adapter->ena_dev, key); + if (rc) + return rc; + + return rc; +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)/* >= 3.16.0 */ +static int ena_get_rxfh(struct net_device *netdev, u32 *indir) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + return ena_indirection_table_get(adapter, indir); +} +#endif /* >= 3.8.0 */ + +#ifdef ENA_HAVE_ETHTOOL_RXFH_PARAM +static int ena_set_rxfh(struct net_device *netdev, + struct ethtool_rxfh_param *rxfh, + struct netlink_ext_ack *extack) +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) +static int ena_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *key, const u8 hfunc) +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +static int ena_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *key) +#endif /* ENA_HAVE_ETHTOOL_RXFH_PARAM */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + enum ena_admin_hash_functions func = 0; +#ifdef ENA_HAVE_ETHTOOL_RXFH_PARAM + u32 *indir = rxfh->indir; + u8 hfunc = rxfh->hfunc; + u8 *key = rxfh->key; +#endif /* ENA_HAVE_ETHTOOL_RXFH_PARAM */ + int rc; + + if (indir) { + rc = ena_indirection_table_set(adapter, indir); + if (rc) + return rc; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) + switch (hfunc) { + case ETH_RSS_HASH_NO_CHANGE: + func = ena_com_get_current_hash_function(ena_dev); + break; + case ETH_RSS_HASH_TOP: + func = ENA_ADMIN_TOEPLITZ; + break; + case ETH_RSS_HASH_CRC32: + func = ENA_ADMIN_CRC32; + break; + default: + netif_err(adapter, drv, netdev, "Unsupported hfunc %d\n", + hfunc); + return -EOPNOTSUPP; + } +#else /* Kernel 3.19 */ + func = ENA_ADMIN_TOEPLITZ; +#endif + + if (key || func) { + rc = ena_com_fill_hash_function(ena_dev, func, key, + ENA_HASH_KEY_SIZE, + 0xFFFFFFFF); + if (unlikely(rc)) { + netif_err(adapter, drv, netdev, "Cannot fill key\n"); + return rc == -EPERM ? 
-EOPNOTSUPP : rc; + } + } + + return 0; +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) /* Kernel > 3.16 */ +static int ena_set_rxfh(struct net_device *netdev, const u32 *indir) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc = 0; + + if (indir) + rc = ena_indirection_table_set(adapter, indir); + + return rc; +} +#endif /* Kernel >= 3.8 */ +#endif /* ETHTOOL_GRXFH */ +#ifndef HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT + +#ifdef ETHTOOL_SCHANNELS +static void ena_get_channels(struct net_device *netdev, + struct ethtool_channels *channels) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + channels->max_combined = adapter->max_num_io_queues; + channels->combined_count = adapter->num_io_queues; +} + +static int ena_set_channels(struct net_device *netdev, + struct ethtool_channels *channels) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + u32 count = channels->combined_count; + /* The check for max value is already done in ethtool */ + if (count < ENA_MIN_NUM_IO_QUEUES) + return -EINVAL; + + if (!ena_xdp_legal_queue_count(adapter, count)) { + if (ena_xdp_present(adapter)) + return -EINVAL; + + xdp_clear_features_flag(netdev); + } else { + xdp_set_features_flag(netdev, ENA_XDP_FEATURES); + } + + if (count > adapter->max_num_io_queues) + return -EINVAL; + +#ifdef ENA_AF_XDP_SUPPORT + if (count != adapter->num_io_queues && ena_is_zc_q_exist(adapter)) { + netdev_err(adapter->netdev, + "Changing channel count not supported with xsk pool loaded\n"); + return -EOPNOTSUPP; + } + +#endif /* ENA_AF_XDP_SUPPORT */ + return ena_update_queue_count(adapter, count); +} +#endif /* ETHTOOL_SCHANNELS */ + +#endif /* HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) +static int ena_get_tunable(struct net_device *netdev, + const struct ethtool_tunable *tuna, void *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int ret = 0; + + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + *(u32 *)data = adapter->rx_copybreak; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int ena_set_tunable(struct net_device *netdev, + const struct ethtool_tunable *tuna, + const void *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int ret = 0; + u32 len; + + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + len = *(u32 *)data; + ret = ena_set_rx_copybreak(adapter, len); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} +#endif /* 3.18.0 */ + +static u32 ena_get_priv_flags(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + u32 priv_flags = 0; + + if (adapter->rx_ring->page_cache) + priv_flags |= ENA_PRIV_FLAGS_LPC; + + return priv_flags; +} + +static int ena_set_priv_flags(struct net_device *netdev, u32 priv_flags) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + /* LPC is the only supported private flag for now */ + return ena_set_lpc_state(adapter, !!(priv_flags & ENA_PRIV_FLAGS_LPC)); +} + +static const struct ethtool_ops ena_ethtool_ops = { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0) + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_USE_ADAPTIVE_RX, +#endif +#ifdef ENA_LARGE_LLQ_ETHTOOL + .supported_ring_params = ETHTOOL_RING_USE_TX_PUSH_BUF_LEN | + ETHTOOL_RING_USE_TX_PUSH, +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) + .get_link_ksettings = ena_get_link_ksettings, +#else + .get_settings = ena_get_settings, +#endif + .get_drvinfo = ena_get_drvinfo, + .get_msglevel = ena_get_msglevel, + 
.set_msglevel = ena_set_msglevel, + .get_link = ethtool_op_get_link, + .get_coalesce = ena_get_coalesce, + .set_coalesce = ena_set_coalesce, + .get_ringparam = ena_get_ringparam, + .set_ringparam = ena_set_ringparam, + .get_sset_count = ena_get_sset_count, + .get_strings = ena_get_ethtool_strings, + .get_ethtool_stats = ena_get_ethtool_stats, +#ifdef ETHTOOL_GRXRINGS + .get_rxnfc = ena_get_rxnfc, + .set_rxnfc = ena_set_rxnfc, +#endif /* ETHTOOL_GRXRINGS */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) + .get_rxfh_indir_size = ena_get_rxfh_indir_size, +#endif /* >= 3.8.0 */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) + .get_rxfh_key_size = ena_get_rxfh_key_size, + .get_rxfh = ena_get_rxfh, + .set_rxfh = ena_set_rxfh, +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) + .get_rxfh_indir = ena_get_rxfh, + .set_rxfh_indir = ena_set_rxfh, +#endif /* >= 3.8.0 */ +#ifndef HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT +#ifdef ETHTOOL_SCHANNELS + .get_channels = ena_get_channels, + .set_channels = ena_set_channels, +#endif /* ETHTOOL_SCHANNELS */ +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) + .get_tunable = ena_get_tunable, + .set_tunable = ena_set_tunable, +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) + .get_ts_info = ena_get_ts_info, +#endif + .get_priv_flags = ena_get_priv_flags, + .set_priv_flags = ena_set_priv_flags, +}; + +void ena_set_ethtool_ops(struct net_device *netdev) +{ + netdev->ethtool_ops = &ena_ethtool_ops; +} + +static void ena_dump_stats_ex(struct ena_adapter *adapter, u8 *buf) +{ + struct net_device *netdev = adapter->netdev; + u8 *strings_buf; + u64 *data_buf; + int strings_num; + int i, rc; + + strings_num = ena_get_sw_stats_count(adapter); + if (strings_num <= 0) { + netif_err(adapter, drv, netdev, "Can't get stats num\n"); + return; + } + + strings_buf = devm_kcalloc(&adapter->pdev->dev, + ETH_GSTRING_LEN, strings_num, + GFP_ATOMIC); + if (!strings_buf) { + netif_err(adapter, drv, netdev, + "Failed to allocate strings_buf\n"); + return; + } + + data_buf = devm_kcalloc(&adapter->pdev->dev, + strings_num, sizeof(u64), + GFP_ATOMIC); + if (!data_buf) { + netif_err(adapter, drv, netdev, + "Failed to allocate data buf\n"); + devm_kfree(&adapter->pdev->dev, strings_buf); + return; + } + + ena_get_strings(adapter, strings_buf, false); + ena_get_stats(adapter, data_buf, false); + + /* If there is a buffer, dump stats, otherwise print them to dmesg */ + if (buf) + for (i = 0; i < strings_num; i++) { + rc = snprintf(buf, ETH_GSTRING_LEN + sizeof(u64), + "%s %llu\n", + strings_buf + i * ETH_GSTRING_LEN, + data_buf[i]); + buf += rc; + } + else + for (i = 0; i < strings_num; i++) + netif_err(adapter, drv, netdev, "%s: %llu\n", + strings_buf + i * ETH_GSTRING_LEN, + data_buf[i]); + + devm_kfree(&adapter->pdev->dev, strings_buf); + devm_kfree(&adapter->pdev->dev, data_buf); +} + +void ena_dump_stats_to_buf(struct ena_adapter *adapter, u8 *buf) +{ + if (!buf) + return; + + ena_dump_stats_ex(adapter, buf); +} + +void ena_dump_stats_to_dmesg(struct ena_adapter *adapter) +{ + ena_dump_stats_ex(adapter, NULL); +} diff --git a/drivers/amazon/net/ena/ena_lpc.c b/drivers/amazon/net/ena/ena_lpc.c new file mode 100644 index 0000000000000..64c3d2d24f398 --- /dev/null +++ b/drivers/amazon/net/ena/ena_lpc.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ +#include "ena_lpc.h" +#include "ena_xdp.h" + +static void ena_free_ring_page_cache(struct ena_ring *rx_ring); + +static void ena_put_unmap_cache_page(struct ena_ring *rx_ring, struct ena_page *ena_page) +{ + dma_unmap_page(rx_ring->dev, ena_page->dma_addr, ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); + + put_page(ena_page->page); +} + +/* Removes a page from page cache and allocate a new one instead. If an + * allocation of a new page fails, the cache entry isn't changed + */ +static void ena_replace_cache_page(struct ena_ring *rx_ring, + struct ena_page *ena_page) +{ + struct page *new_page; + dma_addr_t dma; + + new_page = ena_alloc_map_page(rx_ring, &dma); + + if (unlikely(IS_ERR(new_page))) + return; + + ena_put_unmap_cache_page(rx_ring, ena_page); + + ena_page->page = new_page; + ena_page->dma_addr = dma; +} + +/* Mark the cache page as used and return it. If the page belongs to a different + * NUMA than the current one, free the cache page and allocate another one + * instead. + */ +static struct page *ena_return_cache_page(struct ena_ring *rx_ring, + struct ena_page *ena_page, + dma_addr_t *dma) +{ + /* Remove pages belonging to different node than the one the CPU runs on */ + if (unlikely(page_to_nid(ena_page->page) != numa_mem_id())) { + ena_increase_stat(&rx_ring->rx_stats.lpc_wrong_numa, 1, &rx_ring->syncp); + ena_replace_cache_page(rx_ring, ena_page); + } + + /* Make sure no writes are pending for this page */ + dma_sync_single_for_device(rx_ring->dev, ena_page->dma_addr, + ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); + + /* Increase refcount to 2 so that the page is returned to the + * cache after being freed + */ + page_ref_inc(ena_page->page); + + *dma = ena_page->dma_addr; + + return ena_page->page; +} + +struct page *ena_lpc_get_page(struct ena_ring *rx_ring, dma_addr_t *dma, + bool *is_lpc_page) +{ + struct ena_page_cache *page_cache = rx_ring->page_cache; + u32 head, cache_current_size; + struct ena_page *ena_page; + + /* Cache size of zero indicates disabled cache */ + if (!page_cache) { + *is_lpc_page = false; + return ena_alloc_map_page(rx_ring, dma); + } + + *is_lpc_page = true; + + cache_current_size = page_cache->current_size; + head = page_cache->head; + + ena_page = &page_cache->cache[head]; + /* Warm up phase. We fill the pages for the first time. 
The + * phase is done in the napi context to improve the chances we + * allocate on the correct NUMA node + */ + if (unlikely(cache_current_size < page_cache->max_size)) { + /* Check if oldest allocated page is free */ + if (ena_page->page && page_ref_count(ena_page->page) == 1) { + page_cache->head = (head + 1) % cache_current_size; + return ena_return_cache_page(rx_ring, ena_page, dma); + } + + ena_page = &page_cache->cache[cache_current_size]; + + /* Add a new page to the cache */ + ena_page->page = ena_alloc_map_page(rx_ring, dma); + if (unlikely(IS_ERR(ena_page->page))) + return ena_page->page; + + ena_page->dma_addr = *dma; + + /* Increase refcount to 2 so that the page is returned to the + * cache after being freed + */ + page_ref_inc(ena_page->page); + + page_cache->current_size++; + + ena_increase_stat(&rx_ring->rx_stats.lpc_warm_up, 1, &rx_ring->syncp); + + return ena_page->page; + } + + /* Next page is still in use, so we allocate outside the cache */ + if (unlikely(page_ref_count(ena_page->page) != 1)) { + ena_increase_stat(&rx_ring->rx_stats.lpc_full, 1, &rx_ring->syncp); + *is_lpc_page = false; + return ena_alloc_map_page(rx_ring, dma); + } + + page_cache->head = (head + 1) & (page_cache->max_size - 1); + + return ena_return_cache_page(rx_ring, ena_page, dma); +} + +bool ena_is_lpc_supported(struct ena_adapter *adapter, + struct ena_ring *rx_ring, + bool error_print) +{ +#ifdef ENA_NETDEV_LOGS_WITHOUT_RV + void (*print_log)(const struct net_device *dev, const char *format, ...); +#else + int (*print_log)(const struct net_device *dev, const char *format, ...); +#endif + int channels_nr = adapter->num_io_queues + adapter->xdp_num_queues; + + print_log = (error_print) ? netdev_err : netdev_info; + + /* LPC is disabled below min number of channels */ + if (channels_nr < ENA_LPC_MIN_NUM_OF_CHANNELS) { + print_log(adapter->netdev, + "Local page cache is disabled for less than %d channels\n", + ENA_LPC_MIN_NUM_OF_CHANNELS); + + /* Disable LPC in this case. It can be enabled again through the + * ethtool private flag. + */ + adapter->used_lpc_size = 0; + + return false; + } +#ifdef ENA_XDP_SUPPORT + + /* The driver doesn't support page caches under XDP */ + if (ena_xdp_present_ring(rx_ring)) { + print_log(adapter->netdev, + "Local page cache is disabled when using XDP\n"); + return false; + } +#endif /* ENA_XDP_SUPPORT */ + + return true; +} + +/* Calculate the size of the Local Page Cache. If LPC should be disabled, return + * a size of 0. 
+ */ +static u32 ena_calculate_cache_size(struct ena_adapter *adapter, + struct ena_ring *rx_ring) +{ + u32 page_cache_size = adapter->used_lpc_size; + + /* LPC cache size of 0 means disabled cache */ + if (page_cache_size == 0) + return 0; + + if (!ena_is_lpc_supported(adapter, rx_ring, false)) + return 0; + + /* Clamp the LPC size to its maximum value */ + if (page_cache_size > ENA_LPC_MAX_MULTIPLIER) { + netdev_info(adapter->netdev, + "Configured LPC size %d is too large, reducing to %d (max)\n", + adapter->configured_lpc_size, ENA_LPC_MAX_MULTIPLIER); + + /* Override LPC size to avoid printing this message + * every up/down operation + */ + adapter->configured_lpc_size = ENA_LPC_MAX_MULTIPLIER; + adapter->used_lpc_size = page_cache_size = ENA_LPC_MAX_MULTIPLIER; + } + + page_cache_size = page_cache_size * ENA_LPC_MULTIPLIER_UNIT; + page_cache_size = roundup_pow_of_two(page_cache_size); + + return page_cache_size; +} + +int ena_create_page_caches(struct ena_adapter *adapter) +{ + struct ena_page_cache *cache; + u32 page_cache_size; + int i; + + for (i = 0; i < adapter->num_io_queues; i++) { + struct ena_ring *rx_ring = &adapter->rx_ring[i]; + + page_cache_size = ena_calculate_cache_size(adapter, rx_ring); + + if (!page_cache_size) + return 0; + + cache = vzalloc(sizeof(struct ena_page_cache) + + sizeof(struct ena_page) * page_cache_size); + if (!cache) + goto err_cache_alloc; + + cache->max_size = page_cache_size; + rx_ring->page_cache = cache; + } + + return 0; +err_cache_alloc: + netif_err(adapter, ifup, adapter->netdev, + "Failed to initialize local page caches (LPCs)\n"); + while (--i >= 0) { + struct ena_ring *rx_ring = &adapter->rx_ring[i]; + + ena_free_ring_page_cache(rx_ring); + } + + return -ENOMEM; +} + +/* Release all pages from the page cache */ +static void ena_free_ring_cache_pages(struct ena_adapter *adapter, int qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + struct ena_page_cache *page_cache; + int i; + + /* Page cache is disabled */ + if (!rx_ring->page_cache) + return; + + page_cache = rx_ring->page_cache; + + /* We check size value to make sure we don't + * free pages that weren't allocated. + */ + for (i = 0; i < page_cache->current_size; i++) { + struct ena_page *ena_page = &page_cache->cache[i]; + + WARN_ON(!ena_page->page); + + dma_unmap_page(rx_ring->dev, ena_page->dma_addr, + ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); + + /* If the page is also in the rx buffer, then this operation + * would only decrease its reference count + */ + __free_page(ena_page->page); + } + + page_cache->head = page_cache->current_size = 0; +} + +void ena_free_all_cache_pages(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + ena_free_ring_cache_pages(adapter, i); +} + +static void ena_free_ring_page_cache(struct ena_ring *rx_ring) +{ + if (!rx_ring->page_cache) + return; + + vfree(rx_ring->page_cache); + rx_ring->page_cache = NULL; +} + +void ena_free_page_caches(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) { + struct ena_ring *rx_ring = &adapter->rx_ring[i]; + + ena_free_ring_page_cache(rx_ring); + } +} diff --git a/drivers/amazon/net/ena/ena_lpc.h b/drivers/amazon/net/ena/ena_lpc.h new file mode 100644 index 0000000000000..2953eb24ac4dd --- /dev/null +++ b/drivers/amazon/net/ena/ena_lpc.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "ena_netdev.h" + +/* LPC definitions */ +#define ENA_LPC_DEFAULT_MULTIPLIER 2 +#define ENA_LPC_MAX_MULTIPLIER 32 +#define ENA_LPC_MULTIPLIER_UNIT 1024 +#define ENA_LPC_MIN_NUM_OF_CHANNELS 16 + +/* Store DMA address along with the page */ +struct ena_page { + struct page *page; + dma_addr_t dma_addr; +}; + +struct ena_page_cache { + /* How many pages are produced */ + u32 head; + /* How many of the entries were initialized */ + u32 current_size; + /* Maximum number of pages the cache can hold */ + u32 max_size; + + struct ena_page cache[0]; +} ____cacheline_aligned; + +int ena_create_page_caches(struct ena_adapter *adapter); +void ena_free_page_caches(struct ena_adapter *adapter); +void ena_free_all_cache_pages(struct ena_adapter *adapter); +struct page *ena_lpc_get_page(struct ena_ring *rx_ring, dma_addr_t *dma, + bool *is_lpc_page); +bool ena_is_lpc_supported(struct ena_adapter *adapter, + struct ena_ring *rx_ring, + bool error_print); diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c new file mode 100644 index 0000000000000..083c9546fa033 --- /dev/null +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -0,0 +1,5208 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#ifdef CONFIG_RFS_ACCEL +#include +#endif /* CONFIG_RFS_ACCEL */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_NET_RX_BUSY_POLL) && (LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0)) +#include +#endif +#include + +#include "ena_netdev.h" +#include "ena_pci_id_tbl.h" +#include "ena_sysfs.h" +#include "ena_xdp.h" + +#include "ena_lpc.h" + +#include "ena_phc.h" +static char version[] = DEVICE_NAME " v" DRV_MODULE_GENERATION "\n"; + +MODULE_AUTHOR("Amazon.com, Inc. or its affiliates"); +MODULE_DESCRIPTION(DEVICE_NAME); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_MODULE_GENERATION); + +/* Time in jiffies before concluding the transmitter is hung. */ +#define TX_TIMEOUT (5 * HZ) + +#define ENA_MAX_RINGS min_t(unsigned int, ENA_MAX_NUM_IO_QUEUES, num_possible_cpus()) + +#define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_IFUP | \ + NETIF_MSG_IFDOWN | NETIF_MSG_TX_ERR | NETIF_MSG_RX_ERR) + +#define ENA_HIGH_LOW_TO_U64(high, low) ((((u64)(high)) << 32) | (low)) +#ifndef ENA_LINEAR_FRAG_SUPPORTED + +#define ENA_SKB_PULL_MIN_LEN 64 +#endif + +static int debug = -1; +module_param(debug, int, 0444); +MODULE_PARM_DESC(debug, "Debug level (-1=default,0=none,...,16=all)"); + +static int rx_queue_size = ENA_DEFAULT_RING_SIZE; +module_param(rx_queue_size, int, 0444); +MODULE_PARM_DESC(rx_queue_size, "Rx queue size. The size should be a power of 2. Depending on instance type, max value can be up to 16K\n"); + +#define FORCE_LARGE_LLQ_HEADER_UNINIT_VALUE 0xFFFF +static int force_large_llq_header = FORCE_LARGE_LLQ_HEADER_UNINIT_VALUE; +module_param(force_large_llq_header, int, 0444); +MODULE_PARM_DESC(force_large_llq_header, "Increases maximum supported header size in LLQ mode to 224 bytes, while reducing the maximum TX queue size by half.\n"); + +static int num_io_queues = ENA_MAX_NUM_IO_QUEUES; +module_param(num_io_queues, int, 0444); +MODULE_PARM_DESC(num_io_queues, "Sets number of RX/TX queues to allocate to device. 
The maximum value depends on the device and number of online CPUs.\n"); + +static int enable_bql = 0; +module_param(enable_bql, int, 0444); +MODULE_PARM_DESC(enable_bql, "Enable BQL.\n"); + +static int lpc_size = ENA_LPC_DEFAULT_MULTIPLIER; +module_param(lpc_size, uint, 0444); +MODULE_PARM_DESC(lpc_size, "Each local page cache (lpc) holds N * 1024 pages. This parameter sets N which is rounded up to a multiplier of 2. If zero, the page cache is disabled. Max: 32\n"); + +#ifdef ENA_PHC_SUPPORT +static int phc_enable = 0; +module_param(phc_enable, uint, 0444); +MODULE_PARM_DESC(phc_enable, "Enable PHC.\n"); + +#endif /* ENA_PHC_SUPPORT */ +static struct ena_aenq_handlers aenq_handlers; + +static struct workqueue_struct *ena_wq; + +MODULE_DEVICE_TABLE(pci, ena_pci_tbl); + +static int ena_rss_init_default(struct ena_adapter *adapter); +static void check_for_admin_com_state(struct ena_adapter *adapter); +static void ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat, + struct net_device *netdev); + +static void ena_tx_timeout(struct net_device *dev, unsigned int txqueue) +{ + enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_OS_NETDEV_WD; + struct ena_adapter *adapter = netdev_priv(dev); + unsigned int time_since_last_napi, threshold; + unsigned long jiffies_since_last_intr; + struct ena_ring *tx_ring; + int napi_scheduled; + + if (txqueue >= adapter->num_io_queues) { + netdev_err(dev, "TX timeout on invalid queue %u\n", txqueue); + goto schedule_reset; + } + + threshold = jiffies_to_usecs(dev->watchdog_timeo); + tx_ring = &adapter->tx_ring[txqueue]; + + time_since_last_napi = jiffies_to_usecs(jiffies - tx_ring->tx_stats.last_napi_jiffies); + napi_scheduled = !!(tx_ring->napi->state & NAPIF_STATE_SCHED); + + jiffies_since_last_intr = jiffies - READ_ONCE(adapter->ena_napi[txqueue].last_intr_jiffies); + + netdev_err(dev, + "TX q %d is paused for too long (threshold %u). Time since last napi %u usec. napi scheduled: %d. msecs since last interrupt: %u\n", + txqueue, + threshold, + time_since_last_napi, + napi_scheduled, + jiffies_to_msecs(jiffies_since_last_intr)); + + if (threshold < time_since_last_napi && napi_scheduled) { + netdev_err(dev, + "napi handler hasn't been called for a long time but is scheduled\n"); + reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; + } +schedule_reset: + /* Change the state of the device to trigger reset + * Check that we are not in the middle or a trigger already + */ + if (test_and_set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + ena_reset_device(adapter, reset_reason); + ena_increase_stat(&adapter->dev_stats.tx_timeout, 1, &adapter->syncp); +} + +#ifndef HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER +/* This function is called by the kernel's watchdog and indicates that the queue + * has been closed longer than dev->watchdog_timeo value allows. + * In older kernels the called function doesn't contain the id of the queue + * that's been closed for too long. 
This helper function retrieves this + * information + */ +static void ena_find_and_timeout_queue(struct net_device *dev) +{ + struct ena_adapter *adapter = netdev_priv(dev); + unsigned long trans_start; + struct netdev_queue *txq; + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + txq = netdev_get_tx_queue(dev, i); + trans_start = txq->trans_start; + if (netif_xmit_stopped(txq) && + time_after(jiffies, (trans_start + dev->watchdog_timeo))) { + ena_tx_timeout(dev, i); + return; + } + } + + netdev_warn(dev, "timeout was called, but no offending queue was found\n"); + + /* Change the state of the device to trigger reset + * Check that we are not in the middle or a trigger already + */ + if (test_and_set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + ena_reset_device(adapter, ENA_REGS_RESET_OS_NETDEV_WD); + ena_increase_stat(&adapter->dev_stats.tx_timeout, 1, &adapter->syncp); +} + +#endif +static void update_rx_ring_mtu(struct ena_adapter *adapter, int mtu) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + adapter->rx_ring[i].mtu = mtu; +} + +static int ena_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ena_adapter *adapter = netdev_priv(dev); + int ret; + +#ifndef HAVE_MTU_MIN_MAX_IN_NET_DEVICE + if ((new_mtu > adapter->max_mtu) || (new_mtu < ENA_MIN_MTU)) { + netif_err(adapter, drv, dev, + "Invalid MTU setting. new_mtu: %d max mtu: %d min mtu: %d\n", + new_mtu, adapter->max_mtu, ENA_MIN_MTU); + return -EINVAL; + } +#endif + ret = ena_com_set_dev_mtu(adapter->ena_dev, new_mtu); + if (!ret) { + netif_dbg(adapter, drv, dev, "Set MTU to %d\n", new_mtu); + update_rx_ring_mtu(adapter, new_mtu); + dev->mtu = new_mtu; + } else { + netif_err(adapter, drv, dev, "Failed to set MTU to %d\n", + new_mtu); + } + + return ret; +} + +int ena_xmit_common(struct ena_adapter *adapter, + struct ena_ring *ring, + struct ena_tx_buffer *tx_info, + struct ena_com_tx_ctx *ena_tx_ctx, + u16 next_to_use, + u32 bytes) +{ + int rc, nb_hw_desc; + + if (unlikely(ena_com_is_doorbell_needed(ring->ena_com_io_sq, + ena_tx_ctx))) { + netif_dbg(adapter, tx_queued, adapter->netdev, + "llq tx max burst size of queue %d achieved, writing doorbell to send burst\n", + ring->qid); + ena_ring_tx_doorbell(ring); + } + + /* prepare the packet's descriptors to dma engine */ + rc = ena_com_prepare_tx(ring->ena_com_io_sq, ena_tx_ctx, + &nb_hw_desc); + + /* In case there isn't enough space in the queue for the packet, + * we simply drop it. All other failure reasons of + * ena_com_prepare_tx() are fatal and therefore require a device reset. 
+ */ + if (unlikely(rc)) { + netif_err(adapter, tx_queued, adapter->netdev, + "Failed to prepare tx bufs\n"); + ena_increase_stat(&ring->tx_stats.prepare_ctx_err, 1, &ring->syncp); + if (rc != -ENOMEM) + ena_reset_device(adapter, ENA_REGS_RESET_DRIVER_INVALID_STATE); + return rc; + } + + u64_stats_update_begin(&ring->syncp); + ring->tx_stats.cnt++; + ring->tx_stats.bytes += bytes; + u64_stats_update_end(&ring->syncp); + + tx_info->tx_descs = nb_hw_desc; + tx_info->total_tx_size = bytes; + tx_info->tx_sent_jiffies = jiffies; + tx_info->print_once = 0; + + ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use, + ring->ring_size); + return 0; +} + +static int ena_init_rx_cpu_rmap(struct ena_adapter *adapter) +{ +#ifdef CONFIG_RFS_ACCEL + u32 i; + int rc; + + adapter->netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(adapter->num_io_queues); + if (!adapter->netdev->rx_cpu_rmap) + return -ENOMEM; + for (i = 0; i < adapter->num_io_queues; i++) { + int irq_idx = ENA_IO_IRQ_IDX(i); + + rc = irq_cpu_rmap_add(adapter->netdev->rx_cpu_rmap, +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries[irq_idx].vector); +#else + pci_irq_vector(adapter->pdev, irq_idx)); +#endif + if (rc) { + free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap); + adapter->netdev->rx_cpu_rmap = NULL; + return rc; + } + } +#endif /* CONFIG_RFS_ACCEL */ + return 0; +} + +static void ena_init_io_rings_common(struct ena_adapter *adapter, + struct ena_ring *ring, u16 qid) +{ + ring->qid = qid; + ring->pdev = adapter->pdev; + ring->dev = &adapter->pdev->dev; + ring->netdev = adapter->netdev; + ring->napi = &adapter->ena_napi[qid].napi; + ring->adapter = adapter; + ring->ena_dev = adapter->ena_dev; + ring->per_napi_packets = 0; + ring->cpu = 0; + ring->numa_node = 0; + ring->no_interrupt_event_cnt = 0; + u64_stats_init(&ring->syncp); +} + +void ena_init_io_rings(struct ena_adapter *adapter, + int first_index, int count) +{ + struct ena_com_dev *ena_dev; + struct ena_ring *txr, *rxr; + int i; + + ena_dev = adapter->ena_dev; + + for (i = first_index; i < first_index + count; i++) { + txr = &adapter->tx_ring[i]; + rxr = &adapter->rx_ring[i]; + + /* TX common ring state */ + ena_init_io_rings_common(adapter, txr, i); + + /* TX specific ring state */ + txr->ring_size = adapter->requested_tx_ring_size; + txr->tx_max_header_size = ena_dev->tx_max_header_size; + txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type; + txr->sgl_size = adapter->max_tx_sgl_size; + txr->enable_bql = enable_bql; + txr->interrupt_interval = + ena_com_get_nonadaptive_moderation_interval_tx(ena_dev); + /* Initial value, mark as true */ + txr->interrupt_interval_changed = true; + txr->disable_meta_caching = adapter->disable_meta_caching; +#ifdef ENA_XDP_SUPPORT + spin_lock_init(&txr->xdp_tx_lock); +#endif + + /* Don't init RX queues for xdp queues */ + if (!ENA_IS_XDP_INDEX(adapter, i)) { + /* RX common ring state */ + ena_init_io_rings_common(adapter, rxr, i); + + /* RX specific ring state */ + rxr->ring_size = adapter->requested_rx_ring_size; + rxr->rx_copybreak = adapter->rx_copybreak; + rxr->sgl_size = adapter->max_rx_sgl_size; + rxr->interrupt_interval = + ena_com_get_nonadaptive_moderation_interval_rx(ena_dev); + /* Initial value, mark as true */ + rxr->interrupt_interval_changed = true; + rxr->empty_rx_queue = 0; + rxr->rx_headroom = NET_SKB_PAD; + adapter->ena_napi[i].dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; +#ifdef ENA_XDP_SUPPORT + rxr->xdp_ring = &adapter->tx_ring[i + adapter->num_io_queues]; +#endif + } + } +} + +/* ena_setup_tx_resources - allocate 
I/O Tx resources (Descriptors) + * @adapter: network interface device structure + * @qid: queue index + * + * Return 0 on success, negative on failure + */ +static int ena_setup_tx_resources(struct ena_adapter *adapter, int qid) +{ + struct ena_ring *tx_ring = &adapter->tx_ring[qid]; + struct ena_irq *ena_irq = &adapter->irq_tbl[ENA_IO_IRQ_IDX(qid)]; + int size, i, node; + + if (tx_ring->tx_buffer_info) { + netif_err(adapter, ifup, + adapter->netdev, "tx_buffer_info info is not NULL"); + return -EEXIST; + } + + size = sizeof(struct ena_tx_buffer) * tx_ring->ring_size; + node = cpu_to_node(ena_irq->cpu); + + tx_ring->tx_buffer_info = vzalloc_node(size, node); + if (!tx_ring->tx_buffer_info) { + tx_ring->tx_buffer_info = vzalloc(size); + if (!tx_ring->tx_buffer_info) + goto err_tx_buffer_info; + } + + size = sizeof(u16) * tx_ring->ring_size; + tx_ring->free_ids = vzalloc_node(size, node); + if (!tx_ring->free_ids) { + tx_ring->free_ids = vzalloc(size); + if (!tx_ring->free_ids) + goto err_tx_free_ids; + } + + size = tx_ring->tx_max_header_size; + tx_ring->push_buf_intermediate_buf = vzalloc_node(size, node); + if (!tx_ring->push_buf_intermediate_buf) { + tx_ring->push_buf_intermediate_buf = vzalloc(size); + if (!tx_ring->push_buf_intermediate_buf) + goto err_push_buf_intermediate_buf; + } + + /* Req id ring for TX out of order completions */ + for (i = 0; i < tx_ring->ring_size; i++) + tx_ring->free_ids[i] = i; + + /* Reset tx statistics */ + memset(&tx_ring->tx_stats, 0x0, sizeof(tx_ring->tx_stats)); + + tx_ring->next_to_use = 0; + tx_ring->next_to_clean = 0; + tx_ring->cpu = ena_irq->cpu; + tx_ring->numa_node = node; + return 0; + +err_push_buf_intermediate_buf: + vfree(tx_ring->free_ids); + tx_ring->free_ids = NULL; +err_tx_free_ids: + vfree(tx_ring->tx_buffer_info); + tx_ring->tx_buffer_info = NULL; +err_tx_buffer_info: + return -ENOMEM; +} + +/* ena_free_tx_resources - Free I/O Tx Resources per Queue + * @adapter: network interface device structure + * @qid: queue index + * + * Free all transmit software resources + */ +static void ena_free_tx_resources(struct ena_adapter *adapter, int qid) +{ + struct ena_ring *tx_ring = &adapter->tx_ring[qid]; + + vfree(tx_ring->tx_buffer_info); + tx_ring->tx_buffer_info = NULL; + + vfree(tx_ring->free_ids); + tx_ring->free_ids = NULL; + + vfree(tx_ring->push_buf_intermediate_buf); + tx_ring->push_buf_intermediate_buf = NULL; +} + +int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, int count) +{ + int i, rc = 0; + + for (i = first_index; i < first_index + count; i++) { + rc = ena_setup_tx_resources(adapter, i); + if (rc) + goto err_setup_tx; + } + + return 0; + +err_setup_tx: + + netif_err(adapter, ifup, adapter->netdev, + "Tx queue %d: allocation failed\n", i); + + /* rewind the index freeing the rings as we go */ + while (first_index < i--) + ena_free_tx_resources(adapter, i); + return rc; +} + +void ena_free_all_io_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, int count) +{ + int i; + + for (i = first_index; i < first_index + count; i++) + ena_free_tx_resources(adapter, i); +} + +/* ena_free_all_io_tx_resources - Free I/O Tx Resources for All Queues + * @adapter: board private structure + * + * Free all transmit software resources + */ +void ena_free_all_io_tx_resources(struct ena_adapter *adapter) +{ + ena_free_all_io_tx_resources_in_range(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); +} + +/* ena_setup_rx_resources - allocate I/O Rx resources (Descriptors) + * 
@adapter: network interface device structure + * @qid: queue index + * + * Returns 0 on success, negative on failure + */ +static int ena_setup_rx_resources(struct ena_adapter *adapter, + u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + struct ena_irq *ena_irq = &adapter->irq_tbl[ENA_IO_IRQ_IDX(qid)]; + int size, node, i; + + if (rx_ring->rx_buffer_info) { + netif_err(adapter, ifup, adapter->netdev, + "rx_buffer_info is not NULL"); + return -EEXIST; + } + + /* alloc extra element so in rx path + * we can always prefetch rx_info + 1 + */ + size = sizeof(struct ena_rx_buffer) * (rx_ring->ring_size + 1); + node = cpu_to_node(ena_irq->cpu); + + rx_ring->rx_buffer_info = vzalloc_node(size, node); + if (!rx_ring->rx_buffer_info) { + rx_ring->rx_buffer_info = vzalloc(size); + if (!rx_ring->rx_buffer_info) + return -ENOMEM; + } + + size = sizeof(u16) * rx_ring->ring_size; + rx_ring->free_ids = vzalloc_node(size, node); + if (!rx_ring->free_ids) { + rx_ring->free_ids = vzalloc(size); + if (!rx_ring->free_ids) { + vfree(rx_ring->rx_buffer_info); + rx_ring->rx_buffer_info = NULL; + return -ENOMEM; + } + } + + /* Req id ring for receiving RX pkts out of order */ + for (i = 0; i < rx_ring->ring_size; i++) + rx_ring->free_ids[i] = i; + + /* Reset rx statistics */ + memset(&rx_ring->rx_stats, 0x0, sizeof(rx_ring->rx_stats)); + +#ifdef ENA_BUSY_POLL_SUPPORT + ena_bp_init_lock(rx_ring); +#endif + rx_ring->next_to_clean = 0; + rx_ring->next_to_use = 0; + rx_ring->cpu = ena_irq->cpu; + rx_ring->numa_node = node; + + return 0; +} + +/* ena_free_rx_resources - Free I/O Rx Resources + * @adapter: network interface device structure + * @qid: queue index + * + * Free all receive software resources + */ +static void ena_free_rx_resources(struct ena_adapter *adapter, + u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + + vfree(rx_ring->rx_buffer_info); + rx_ring->rx_buffer_info = NULL; + + vfree(rx_ring->free_ids); + rx_ring->free_ids = NULL; +} + +/* ena_setup_all_rx_resources - allocate I/O Rx queues resources for all queues + * @adapter: board private structure + * + * Return 0 on success, negative on failure + */ +static int ena_setup_all_rx_resources(struct ena_adapter *adapter) +{ + int i, rc = 0; + + for (i = 0; i < adapter->num_io_queues; i++) { + rc = ena_setup_rx_resources(adapter, i); + if (rc) + goto err_setup_rx; + } + + return 0; + +err_setup_rx: + + netif_err(adapter, ifup, adapter->netdev, + "Rx queue %d: allocation failed\n", i); + + /* rewind the index freeing the rings as we go */ + while (i--) + ena_free_rx_resources(adapter, i); + return rc; +} + +/* ena_free_all_io_rx_resources - Free I/O Rx Resources for All Queues + * @adapter: board private structure + * + * Free all receive software resources + */ +static void ena_free_all_io_rx_resources(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + ena_free_rx_resources(adapter, i); +} + +struct page *ena_alloc_map_page(struct ena_ring *rx_ring, + dma_addr_t *dma) +{ + struct page *page; + + /* This would allocate the page on the same NUMA node the executing code + * is running on. 
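+ *
+ * Illustrative aside: dev_alloc_page() takes no NUMA node argument, so
+ * the locality described above relies on the page allocator's default
+ * preference for the node of the CPU executing this path. The ring
+ * metadata allocations earlier in this file instead request a node
+ * explicitly and fall back to any node rather than failing, roughly:
+ *
+ *	buf = vzalloc_node(size, node);
+ *	if (!buf)
+ *		buf = vzalloc(size);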
+ */ + page = dev_alloc_page(); + if (!page) { + ena_increase_stat(&rx_ring->rx_stats.page_alloc_fail, 1, &rx_ring->syncp); + return ERR_PTR(-ENOSPC); + } + + /* To enable NIC-side port-mirroring, AKA SPAN port, + * we make the buffer readable from the nic as well + */ + *dma = dma_map_page(rx_ring->dev, page, 0, ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (unlikely(dma_mapping_error(rx_ring->dev, *dma))) { + ena_increase_stat(&rx_ring->rx_stats.dma_mapping_err, 1, + &rx_ring->syncp); + __free_page(page); + return ERR_PTR(-EIO); + } + + return page; +} + +static int ena_alloc_rx_buffer(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info) +{ + int headroom = rx_ring->rx_headroom; + struct ena_com_buf *ena_buf; + struct page *page; + dma_addr_t dma; + int tailroom; + + /* restore page offset value in case it has been changed by device */ + rx_info->buf_offset = headroom; + + /* if previous allocated page is not used */ + if (unlikely(rx_info->page)) + return 0; +#ifdef ENA_AF_XDP_SUPPORT + + if (unlikely(ENA_IS_XSK_RING(rx_ring))) { + struct xdp_buff *xdp; + + xdp = xsk_buff_alloc(rx_ring->xsk_pool); + if (!xdp) + return -ENOMEM; + + ena_buf = &rx_info->ena_buf; + ena_buf->paddr = xsk_buff_xdp_get_dma(xdp); + ena_buf->len = xsk_pool_get_rx_frame_size(rx_ring->xsk_pool); + + rx_info->xdp = xdp; + + return 0; + } +#endif /* ENA_AF_XDP_SUPPORT */ + + /* We handle DMA here */ + page = ena_lpc_get_page(rx_ring, &dma, &rx_info->is_lpc_page); + if (unlikely(IS_ERR(page))) + return PTR_ERR(page); + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "Allocate page %p, rx_info %p\n", page, rx_info); + + tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + rx_info->page = page; + rx_info->dma_addr = dma; + rx_info->page_offset = 0; + ena_buf = &rx_info->ena_buf; + ena_buf->paddr = dma + headroom; + ena_buf->len = ENA_PAGE_SIZE - headroom - tailroom; + + return 0; +} + +static void ena_unmap_rx_buff_attrs(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info, + unsigned long attrs) +{ + /* LPC pages are unmapped at cache destruction */ + if (rx_info->is_lpc_page) + return; + + ena_dma_unmap_page_attrs(rx_ring->dev, rx_info->dma_addr, ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); +} + +static void ena_free_rx_page(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info) +{ + struct page *page = rx_info->page; + + if (unlikely(!page)) { + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Trying to free unallocated buffer\n"); + return; + } + + ena_unmap_rx_buff_attrs(rx_ring, rx_info, 0); + + __free_page(page); + rx_info->page = NULL; +} + +int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) +{ + u16 next_to_use, req_id; + u32 i; + int rc; + + next_to_use = rx_ring->next_to_use; + + for (i = 0; i < num; i++) { + struct ena_rx_buffer *rx_info; + + req_id = rx_ring->free_ids[next_to_use]; + + rx_info = &rx_ring->rx_buffer_info[req_id]; + + rc = ena_alloc_rx_buffer(rx_ring, rx_info); + if (unlikely(rc < 0)) { +#ifdef ENA_AF_XDP_SUPPORT + if (ENA_IS_XSK_RING(rx_ring)) + break; + +#endif /* ENA_AF_XDP_SUPPORT */ + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Failed to allocate buffer for rx queue %d\n", + rx_ring->qid); + break; + } + rc = ena_com_add_single_rx_desc(rx_ring->ena_com_io_sq, + &rx_info->ena_buf, + req_id); + if (unlikely(rc)) { + netif_warn(rx_ring->adapter, rx_status, rx_ring->netdev, + "Failed to add buffer for rx queue %d\n", + rx_ring->qid); + break; + } + next_to_use = ENA_RX_RING_IDX_NEXT(next_to_use, + rx_ring->ring_size); + } + + if 
(unlikely(i < num)) { + ena_increase_stat(&rx_ring->rx_stats.refil_partial, 1, + &rx_ring->syncp); +#ifdef ENA_AF_XDP_SUPPORT + if (ENA_IS_XSK_RING(rx_ring)) + goto ring_doorbell; + +#endif /* ENA_AF_XDP_SUPPORT */ + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Refilled rx qid %d with only %d buffers (from %d)\n", + rx_ring->qid, i, num); + } + +#ifdef ENA_AF_XDP_SUPPORT +ring_doorbell: +#endif /* ENA_AF_XDP_SUPPORT */ + /* ena_com_write_sq_doorbell issues a wmb() */ + if (likely(i)) + ena_com_write_sq_doorbell(rx_ring->ena_com_io_sq); + + rx_ring->next_to_use = next_to_use; + + return i; +} + +static void ena_free_rx_bufs(struct ena_adapter *adapter, + u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + u32 i; + +#ifdef ENA_AF_XDP_SUPPORT + if (ENA_IS_XSK_RING(rx_ring)) { + ena_xdp_free_rx_bufs_zc(rx_ring); + return; + } + +#endif /* ENA_AF_XDP_SUPPORT */ + for (i = 0; i < rx_ring->ring_size; i++) { + struct ena_rx_buffer *rx_info = &rx_ring->rx_buffer_info[i]; + + if (rx_info->page) + ena_free_rx_page(rx_ring, rx_info); + } +} + +/* ena_refill_all_rx_bufs - allocate all queues Rx buffers + * @adapter: board private structure + */ +static void ena_refill_all_rx_bufs(struct ena_adapter *adapter) +{ + struct ena_ring *rx_ring; + int i, rc, bufs_num; + + for (i = 0; i < adapter->num_io_queues; i++) { + rx_ring = &adapter->rx_ring[i]; + bufs_num = rx_ring->ring_size - 1; + rc = ena_refill_rx_bufs(rx_ring, bufs_num); + + if (unlikely(rc != bufs_num)) + netif_warn(rx_ring->adapter, rx_status, rx_ring->netdev, + "Refilling Queue %d failed. allocated %d buffers from: %d\n", + i, rc, bufs_num); + } +} + +static void ena_free_all_rx_bufs(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + ena_free_rx_bufs(adapter, i); +} + +void ena_unmap_tx_buff(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info) +{ + struct ena_com_buf *ena_buf; + u32 cnt; + int i; + + ena_buf = tx_info->bufs; + cnt = tx_info->num_of_bufs; + + if (unlikely(!cnt)) + return; + + if (tx_info->map_linear_data) { + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(ena_buf, paddr), + dma_unmap_len(ena_buf, len), + DMA_TO_DEVICE); + ena_buf++; + cnt--; + } + + /* unmap remaining mapped pages */ + for (i = 0; i < cnt; i++) { + dma_unmap_page(tx_ring->dev, dma_unmap_addr(ena_buf, paddr), + dma_unmap_len(ena_buf, len), DMA_TO_DEVICE); + ena_buf++; + } +} + +/* ena_free_tx_bufs - Free Tx Buffers per Queue + * @tx_ring: TX ring for which buffers be freed + */ +static void ena_free_tx_bufs(struct ena_ring *tx_ring) +{ + bool print_once = true; + bool is_xdp_ring; + u32 i; + + is_xdp_ring = ENA_IS_XDP_INDEX(tx_ring->adapter, tx_ring->qid); + + for (i = 0; i < tx_ring->ring_size; i++) { + struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i]; + unsigned long jiffies_since_submitted; + + if (!tx_info->skb) + continue; + + jiffies_since_submitted = jiffies - tx_info->tx_sent_jiffies; + if (print_once) { + netif_notice(tx_ring->adapter, ifdown, tx_ring->netdev, + "Free uncompleted tx skb qid %d, idx 0x%x, %u msecs since submission\n", + tx_ring->qid, i, jiffies_to_msecs(jiffies_since_submitted)); + print_once = false; + } else { + netif_dbg(tx_ring->adapter, ifdown, tx_ring->netdev, + "Free uncompleted tx skb qid %d, idx 0x%x, %u msecs since submission\n", + tx_ring->qid, i, jiffies_to_msecs(jiffies_since_submitted)); + } + + ena_unmap_tx_buff(tx_ring, tx_info); + + if (is_xdp_ring) + xdp_return_frame(tx_info->xdpf); + else + dev_kfree_skb_any(tx_info->skb); + } + + if 
(!is_xdp_ring) + netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev, + tx_ring->qid)); +} + +static void ena_free_all_tx_bufs(struct ena_adapter *adapter) +{ + struct ena_ring *tx_ring; + int i; + + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { + tx_ring = &adapter->tx_ring[i]; +#ifdef ENA_AF_XDP_SUPPORT + if (ENA_IS_XSK_RING(tx_ring)) { + ena_xdp_free_tx_bufs_zc(tx_ring); + continue; + } +#endif /* ENA_AF_XDP_SUPPORT */ + ena_free_tx_bufs(tx_ring); + } +} + +static void ena_destroy_all_tx_queues(struct ena_adapter *adapter) +{ + u16 ena_qid; + int i; + + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { + ena_qid = ENA_IO_TXQ_IDX(i); + ena_com_destroy_io_queue(adapter->ena_dev, ena_qid); + } +} + +static void ena_destroy_all_rx_queues(struct ena_adapter *adapter) +{ + u16 ena_qid; + int i; + + for (i = 0; i < adapter->num_io_queues; i++) { + ena_qid = ENA_IO_RXQ_IDX(i); + cancel_work_sync(&adapter->ena_napi[i].dim.work); + ena_xdp_unregister_rxq_info(&adapter->rx_ring[i]); + ena_com_destroy_io_queue(adapter->ena_dev, ena_qid); + } +} + +static void ena_destroy_all_io_queues(struct ena_adapter *adapter) +{ + ena_destroy_all_tx_queues(adapter); + ena_destroy_all_rx_queues(adapter); +} + +int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, + struct ena_tx_buffer *tx_info, bool is_xdp) +{ + if (tx_info) + netif_err(ring->adapter, + tx_done, + ring->netdev, + "tx_info doesn't have valid %s. qid %u req_id %u", + is_xdp ? "xdp frame" : "skb", ring->qid, req_id); + else + netif_err(ring->adapter, + tx_done, + ring->netdev, + "Invalid req_id %u in qid %u\n", + req_id, ring->qid); + + ena_increase_stat(&ring->tx_stats.bad_req_id, 1, &ring->syncp); + ena_reset_device(ring->adapter, ENA_REGS_RESET_INV_TX_REQ_ID); + + return -EFAULT; +} + +static int validate_tx_req_id(struct ena_ring *tx_ring, u16 req_id) +{ + struct ena_tx_buffer *tx_info; + + tx_info = &tx_ring->tx_buffer_info[req_id]; + if (likely(tx_info->skb)) + return 0; + + return handle_invalid_req_id(tx_ring, req_id, tx_info, false); +} + +static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) +{ + struct netdev_queue *txq; + bool above_thresh; + u32 tx_bytes = 0; + u32 total_done = 0; + u16 next_to_clean; + u16 req_id; + int tx_pkts = 0; + int rc; + + next_to_clean = tx_ring->next_to_clean; + txq = netdev_get_tx_queue(tx_ring->netdev, tx_ring->qid); + + while (tx_pkts < budget) { + struct ena_tx_buffer *tx_info; + struct sk_buff *skb; + + rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, + &req_id); + if (rc) { + if (unlikely(rc == -EINVAL)) + handle_invalid_req_id(tx_ring, req_id, NULL, false); + else if (unlikely(rc == -EFAULT)) { + ena_reset_device(tx_ring->adapter, + ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED); + } + break; + } + + /* validate that the request id points to a valid skb */ + rc = validate_tx_req_id(tx_ring, req_id); + if (unlikely(rc)) + break; + + tx_info = &tx_ring->tx_buffer_info[req_id]; + skb = tx_info->skb; + + /* prefetch skb_end_pointer() to speedup skb_shinfo(skb) */ + prefetch(&skb->end); + + tx_info->skb = NULL; + tx_info->tx_sent_jiffies = 0; + + ena_unmap_tx_buff(tx_ring, tx_info); + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d skb %p completed\n", tx_ring->qid, + skb); + + tx_bytes += tx_info->total_tx_size; + dev_kfree_skb(skb); + tx_pkts++; + total_done += tx_info->tx_descs; + + tx_ring->free_ids[next_to_clean] = req_id; + next_to_clean = ENA_TX_RING_IDX_NEXT(next_to_clean, + tx_ring->ring_size); + } 
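+
+ /* Illustrative note on the recycling above: Tx completions can arrive
+ * out of order, so each completed req_id is pushed back into free_ids[]
+ * rather than being implied by its ring position. ENA_TX_RING_IDX_NEXT()
+ * is assumed here to be a plain wrap-around increment; for a
+ * power-of-two ring size that is roughly:
+ *
+ *	next = (idx + 1) & (ring_size - 1);
+ */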
+ + tx_ring->next_to_clean = next_to_clean; + ena_com_comp_ack(tx_ring->ena_com_io_sq, total_done); + + if (tx_ring->enable_bql) + netdev_tx_completed_queue(txq, tx_pkts, tx_bytes); + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d done. total pkts: %d\n", + tx_ring->qid, tx_pkts); + + /* need to make the rings circular update visible to + * ena_start_xmit() before checking for netif_queue_stopped(). + */ + smp_mb(); + + above_thresh = ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, + ENA_TX_WAKEUP_THRESH); + if (unlikely(netif_tx_queue_stopped(txq) && above_thresh)) { + __netif_tx_lock(txq, smp_processor_id()); + above_thresh = + ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, + ENA_TX_WAKEUP_THRESH); + if (netif_tx_queue_stopped(txq) && above_thresh && + test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags)) { + netif_tx_wake_queue(txq); + ena_increase_stat(&tx_ring->tx_stats.queue_wakeup, 1, + &tx_ring->syncp); + } + __netif_tx_unlock(txq); + } + + return tx_pkts; +} + +static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag, u16 len) +{ + struct sk_buff *skb; + +#ifdef ENA_LINEAR_FRAG_SUPPORTED + if (!first_frag) + skb = napi_alloc_skb(rx_ring->napi, len); + else + skb = build_skb(first_frag, len); +#else + if (!first_frag) + skb = napi_alloc_skb(rx_ring->napi, len); + else + skb = napi_alloc_skb(rx_ring->napi, + ENA_SKB_PULL_MIN_LEN); +#endif /* ENA_LINEAR_FRAG_SUPPORTED */ + + if (unlikely(!skb)) { + ena_increase_stat(&rx_ring->rx_stats.skb_alloc_fail, 1, + &rx_ring->syncp); + + netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, + "Failed to allocate skb. first_frag %s\n", + first_frag ? "provided" : "not provided"); + } + + return skb; +} + +static bool ena_try_rx_buf_page_reuse(struct ena_rx_buffer *rx_info, u16 buf_len, + u16 len, int pkt_offset) +{ + struct ena_com_buf *ena_buf = &rx_info->ena_buf; + + /* More than ENA_MIN_RX_BUF_SIZE left in the reused buffer + * for data + headroom + tailroom. + */ + if (SKB_DATA_ALIGN(len + pkt_offset) + ENA_MIN_RX_BUF_SIZE <= ena_buf->len) { + page_ref_inc(rx_info->page); + rx_info->page_offset += buf_len; + ena_buf->paddr += buf_len; + ena_buf->len -= buf_len; + return true; + } + + return false; +} + +static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, + struct ena_com_rx_buf_info *ena_bufs, + u32 descs, + u16 *next_to_clean) +{ + int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + bool is_xdp_loaded = ena_xdp_present_ring(rx_ring); + struct ena_rx_buffer *rx_info; + struct ena_adapter *adapter; + int page_offset, pkt_offset; + dma_addr_t pre_reuse_paddr; + u16 len, req_id, buf = 0; + bool reuse_rx_buf_page; + struct sk_buff *skb; + void *buf_addr; + int buf_offset; + u16 buf_len; +#ifndef ENA_LINEAR_FRAG_SUPPORTED + void *data_addr; + u16 hlen; +#endif + + len = ena_bufs[buf].len; + req_id = ena_bufs[buf].req_id; + + rx_info = &rx_ring->rx_buffer_info[req_id]; + + if (unlikely(!rx_info->page)) { + adapter = rx_ring->adapter; + netif_err(adapter, rx_err, rx_ring->netdev, + "Page is NULL. 
qid %u req_id %u\n", rx_ring->qid, req_id); + ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_INV_RX_REQ_ID); + return NULL; + } + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "rx_info %p page %p\n", + rx_info, rx_info->page); + + buf_offset = rx_info->buf_offset; + pkt_offset = buf_offset - rx_ring->rx_headroom; + page_offset = rx_info->page_offset; + buf_addr = page_address(rx_info->page) + page_offset; + + if ((len <= rx_ring->rx_copybreak) && likely(descs == 1)) { + skb = ena_alloc_skb(rx_ring, NULL, len); + if (unlikely(!skb)) + return NULL; + + skb_copy_to_linear_data(skb, buf_addr + buf_offset, len); + dma_sync_single_for_device(rx_ring->dev, + dma_unmap_addr(&rx_info->ena_buf, paddr) + pkt_offset, + len, + DMA_FROM_DEVICE); + + skb_put(skb, len); + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "RX allocated small packet. len %d.\n", skb->len); +#ifdef ENA_BUSY_POLL_SUPPORT + skb_mark_napi_id(skb, rx_ring->napi); +#endif + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + rx_ring->free_ids[*next_to_clean] = req_id; + *next_to_clean = ENA_RX_RING_IDX_ADD(*next_to_clean, descs, + rx_ring->ring_size); + return skb; + } + + buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom); + + /* If XDP isn't loaded try to reuse part of the RX buffer */ + reuse_rx_buf_page = !is_xdp_loaded && + ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); + + if (!reuse_rx_buf_page) { + ena_unmap_rx_buff_attrs(rx_ring, rx_info, ENA_DMA_ATTR_SKIP_CPU_SYNC); + /* Make sure buf_len represents the actual size used + * by the buffer as expected from skb->truesize + */ + buf_len = ENA_PAGE_SIZE - page_offset; + } + + + skb = ena_alloc_skb(rx_ring, buf_addr, buf_len); + if (unlikely(!skb)) + return NULL; + +#ifdef ENA_LINEAR_FRAG_SUPPORTED + /* Populate skb's linear part */ + skb_reserve(skb, buf_offset); + skb_put(skb, len); +#else + data_addr = buf_addr + buf_offset; + + /* GRO expects us to have the ethernet header in the linear part. + * Copy the first ENA_SKB_PULL_MIN_LEN bytes because it is more + * efficient. + */ + hlen = min_t(u16, len, ENA_SKB_PULL_MIN_LEN); + memcpy(__skb_put(skb, hlen), data_addr, hlen); + if (hlen < len) + skb_add_rx_frag(skb, 0, rx_info->page, + page_offset + buf_offset + hlen, + len - hlen, buf_len); +#endif + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + + do { + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "RX skb updated. len %d. 
data_len %d\n", + skb->len, skb->data_len); + + if (!reuse_rx_buf_page) + rx_info->page = NULL; + + rx_ring->free_ids[*next_to_clean] = req_id; + *next_to_clean = + ENA_RX_RING_IDX_NEXT(*next_to_clean, + rx_ring->ring_size); + if (likely(--descs == 0)) + break; + + buf++; + len = ena_bufs[buf].len; + req_id = ena_bufs[buf].req_id; + + rx_info = &rx_ring->rx_buffer_info[req_id]; + + /* rx_info->buf_offset includes rx_ring->rx_headroom */ + buf_offset = rx_info->buf_offset; + pkt_offset = buf_offset - rx_ring->rx_headroom; + buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom); + page_offset = rx_info->page_offset; + + pre_reuse_paddr = dma_unmap_addr(&rx_info->ena_buf, paddr); + + reuse_rx_buf_page = !is_xdp_loaded && + ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); + + dma_sync_single_for_cpu(rx_ring->dev, + pre_reuse_paddr + pkt_offset, + len, + DMA_FROM_DEVICE); + + if (!reuse_rx_buf_page) { + ena_unmap_rx_buff_attrs(rx_ring, rx_info, ENA_DMA_ATTR_SKIP_CPU_SYNC); + /* Make sure buf_len represents the actual size used + * by the buffer as expected from skb->truesize + */ + buf_len = ENA_PAGE_SIZE - page_offset; + } + + + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, + page_offset + buf_offset, len, buf_len); + + } while (1); + +#ifdef ENA_BUSY_POLL_SUPPORT + skb_mark_napi_id(skb, rx_ring->napi); + +#endif + return skb; +} + +/* ena_rx_checksum - indicate in skb if hw indicated a good cksum + * @adapter: structure containing adapter specific data + * @ena_rx_ctx: received packet context/metadata + * @skb: skb currently being received and modified + */ +void ena_rx_checksum(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb) +{ + /* Rx csum disabled */ + if (unlikely(!(rx_ring->netdev->features & NETIF_F_RXCSUM))) { + skb->ip_summed = CHECKSUM_NONE; + return; + } + + /* For fragmented packets the checksum isn't valid */ + if (ena_rx_ctx->frag) { + skb->ip_summed = CHECKSUM_NONE; + return; + } + + /* if IP and error */ + if (unlikely((ena_rx_ctx->l3_proto == ENA_ETH_IO_L3_PROTO_IPV4) && + (ena_rx_ctx->l3_csum_err))) { + /* ipv4 checksum error */ + skb->ip_summed = CHECKSUM_NONE; + ena_increase_stat(&rx_ring->rx_stats.csum_bad, 1, + &rx_ring->syncp); + netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, + "RX IPv4 header checksum error\n"); + return; + } + + /* if TCP/UDP */ + if (likely((ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_TCP) || + (ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_UDP))) { + if (unlikely(ena_rx_ctx->l4_csum_err)) { + /* TCP/UDP checksum error */ + ena_increase_stat(&rx_ring->rx_stats.csum_bad, 1, + &rx_ring->syncp); + netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, + "RX L4 checksum error\n"); + skb->ip_summed = CHECKSUM_NONE; + return; + } + + if (likely(ena_rx_ctx->l4_csum_checked)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + ena_increase_stat(&rx_ring->rx_stats.csum_good, 1, + &rx_ring->syncp); + } else { + ena_increase_stat(&rx_ring->rx_stats.csum_unchecked, 1, + &rx_ring->syncp); + skb->ip_summed = CHECKSUM_NONE; + } + } else { + skb->ip_summed = CHECKSUM_NONE; + return; + } + +} + +void ena_set_rx_hash(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb) +{ +#ifdef NETIF_F_RXHASH + enum pkt_hash_types hash_type; + + if (likely(rx_ring->netdev->features & NETIF_F_RXHASH)) { + if (likely((ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_TCP) || + (ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_UDP))) + + hash_type = PKT_HASH_TYPE_L4; + else + hash_type = 
PKT_HASH_TYPE_NONE; + + /* Override hash type if the packet is fragmented */ + if (ena_rx_ctx->frag) + hash_type = PKT_HASH_TYPE_NONE; + + skb_set_hash(skb, ena_rx_ctx->hash, hash_type); + } +#endif /* NETIF_F_RXHASH */ +} + +#ifdef ENA_XDP_SUPPORT +static int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp, u16 num_descs) +{ + struct ena_rx_buffer *rx_info; + int ret; + + /* XDP multi-buffer packets not supported */ + if (unlikely(num_descs > 1)) { + netdev_err_once(rx_ring->adapter->netdev, + "xdp: dropped unsupported multi-buffer packets\n"); + ena_increase_stat(&rx_ring->rx_stats.xdp_drop, 1, &rx_ring->syncp); + return ENA_XDP_DROP; + } + + rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; + xdp_prepare_buff(xdp, page_address(rx_info->page), + rx_info->buf_offset, + rx_ring->ena_bufs[0].len, false); + + ret = ena_xdp_execute(rx_ring, xdp); + + /* The xdp program might expand the headers */ + if (ret == ENA_XDP_PASS) { + rx_info->buf_offset = xdp->data - xdp->data_hard_start; + rx_ring->ena_bufs[0].len = xdp->data_end - xdp->data; + } + + return ret; +} + +#endif /* ENA_XDP_SUPPORT */ +/* ena_clean_rx_irq - Cleanup RX irq + * @rx_ring: RX ring to clean + * @napi: napi handler + * @budget: how many packets driver is allowed to clean + * + * Returns the number of cleaned buffers. + */ +static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, + u32 budget) +{ + u16 next_to_clean = rx_ring->next_to_clean; + struct ena_com_rx_ctx ena_rx_ctx; + struct ena_rx_buffer *rx_info; + struct ena_adapter *adapter; + u32 res_budget, work_done; + int rx_copybreak_pkt = 0; + int refill_threshold; + struct sk_buff *skb; + int refill_required; +#ifdef ENA_XDP_SUPPORT + struct xdp_buff xdp; + int xdp_flags = 0; +#endif /* ENA_XDP_SUPPORT */ + int total_len = 0; +#ifdef ENA_XDP_SUPPORT + int xdp_verdict; +#endif /* ENA_XDP_SUPPORT */ + u8 pkt_offset; + int rc = 0; + int i; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "%s qid %d\n", __func__, rx_ring->qid); + res_budget = budget; +#ifdef ENA_XDP_SUPPORT + xdp_init_buff(&xdp, ENA_PAGE_SIZE, &rx_ring->xdp_rxq); +#endif /* ENA_XDP_SUPPORT */ + + do { +#ifdef ENA_XDP_SUPPORT + xdp_verdict = ENA_XDP_PASS; + skb = NULL; +#endif /* ENA_XDP_SUPPORT */ + ena_rx_ctx.ena_bufs = rx_ring->ena_bufs; + ena_rx_ctx.max_bufs = rx_ring->sgl_size; + ena_rx_ctx.descs = 0; + ena_rx_ctx.pkt_offset = 0; + rc = ena_com_rx_pkt(rx_ring->ena_com_io_cq, + rx_ring->ena_com_io_sq, + &ena_rx_ctx); + if (unlikely(rc)) + goto error; + + if (unlikely(ena_rx_ctx.descs == 0)) + break; + + /* First descriptor might have an offset set by the device */ + rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; + pkt_offset = ena_rx_ctx.pkt_offset; + rx_info->buf_offset += pkt_offset; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "rx_poll: q %d got packet from ena. 
descs #: %d l3 proto %d l4 proto %d hash: %x\n", + rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto, + ena_rx_ctx.l4_proto, ena_rx_ctx.hash); + + dma_sync_single_for_cpu(rx_ring->dev, + dma_unmap_addr(&rx_info->ena_buf, paddr) + pkt_offset, + rx_ring->ena_bufs[0].len, + DMA_FROM_DEVICE); + +#ifdef ENA_XDP_SUPPORT + if (ena_xdp_present_ring(rx_ring)) + xdp_verdict = ena_xdp_handle_buff(rx_ring, &xdp, ena_rx_ctx.descs); + + /* allocate skb and fill it */ + if (xdp_verdict == ENA_XDP_PASS) + skb = ena_rx_skb(rx_ring, + rx_ring->ena_bufs, + ena_rx_ctx.descs, + &next_to_clean); +#else + skb = ena_rx_skb(rx_ring, rx_ring->ena_bufs, ena_rx_ctx.descs, + &next_to_clean); +#endif /* ENA_XDP_SUPPORT */ + + if (unlikely(!skb)) { + for (i = 0; i < ena_rx_ctx.descs; i++) { + int req_id = rx_ring->ena_bufs[i].req_id; + + rx_ring->free_ids[next_to_clean] = req_id; + next_to_clean = + ENA_RX_RING_IDX_NEXT(next_to_clean, + rx_ring->ring_size); + +#ifdef ENA_XDP_SUPPORT + /* Packets was passed for transmission, unmap it + * from RX side. + */ + if (xdp_verdict & ENA_XDP_FORWARDED) { + ena_unmap_rx_buff_attrs(rx_ring, + &rx_ring->rx_buffer_info[req_id], + ENA_DMA_ATTR_SKIP_CPU_SYNC); + rx_ring->rx_buffer_info[req_id].page = NULL; + } +#endif /* ENA_XDP_SUPPORT */ + } +#ifdef ENA_XDP_SUPPORT + if (xdp_verdict != ENA_XDP_PASS) { + xdp_flags |= xdp_verdict; + total_len += ena_rx_ctx.ena_bufs[0].len; + res_budget--; + continue; + } +#endif /* ENA_XDP_SUPPORT */ + break; + } + + ena_rx_checksum(rx_ring, &ena_rx_ctx, skb); + + ena_set_rx_hash(rx_ring, &ena_rx_ctx, skb); + + skb_record_rx_queue(skb, rx_ring->qid); + + if ((rx_ring->ena_bufs[0].len <= rx_ring->rx_copybreak) && + likely(ena_rx_ctx.descs == 1)) + rx_copybreak_pkt++; + + total_len += skb->len; + +#ifdef ENA_BUSY_POLL_SUPPORT + if (ena_bp_busy_polling(rx_ring)) + netif_receive_skb(skb); + else + napi_gro_receive(napi, skb); +#else + napi_gro_receive(napi, skb); +#endif /* ENA_BUSY_POLL_SUPPORT */ + + res_budget--; + } while (likely(res_budget)); + + work_done = budget - res_budget; + rx_ring->per_napi_packets += work_done; + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bytes += total_len; + rx_ring->rx_stats.cnt += work_done; + rx_ring->rx_stats.rx_copybreak_pkt += rx_copybreak_pkt; + u64_stats_update_end(&rx_ring->syncp); + + rx_ring->next_to_clean = next_to_clean; + + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); + refill_threshold = + min_t(int, rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER, + ENA_RX_REFILL_THRESH_PACKET); + + /* Optimization, try to batch new rx buffers */ + if (refill_required > refill_threshold) + ena_refill_rx_bufs(rx_ring, refill_required); + +#ifdef ENA_XDP_SUPPORT + if (xdp_flags & ENA_XDP_REDIRECT) + xdp_do_flush(); + if (xdp_flags & ENA_XDP_TX) + ena_ring_tx_doorbell(rx_ring->xdp_ring); +#endif + + return work_done; + +error: +#ifdef ENA_XDP_SUPPORT + if (xdp_flags & ENA_XDP_REDIRECT) + xdp_do_flush(); + +#endif + adapter = netdev_priv(rx_ring->netdev); + + if (rc == -ENOSPC) { + ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_TOO_MANY_RX_DESCS); + } else if (rc == -EFAULT) { + ena_reset_device(adapter, ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED); + } else { + ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, + &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_INV_RX_REQ_ID); + } + return 0; +} + +static void ena_dim_work(struct work_struct *w) +{ + struct dim *dim = container_of(w, struct dim, work); + 
struct dim_cq_moder cur_moder = + net_dim_get_rx_moderation(dim->mode, dim->profile_ix); + struct ena_napi *ena_napi = container_of(dim, struct ena_napi, dim); + + ena_napi->rx_ring->interrupt_interval = cur_moder.usec; + /* DIM will schedule the work in case there was a change in the profile. */ + ena_napi->rx_ring->interrupt_interval_changed = true; + + dim->state = DIM_START_MEASURE; +} + +static void ena_adjust_adaptive_rx_intr_moderation(struct ena_napi *ena_napi) +{ + struct dim_sample dim_sample; + struct ena_ring *rx_ring = ena_napi->rx_ring; + + if (!rx_ring->per_napi_packets) + return; + + rx_ring->non_empty_napi_events++; + + dim_update_sample(rx_ring->non_empty_napi_events, + rx_ring->rx_stats.cnt, + rx_ring->rx_stats.bytes, + &dim_sample); + + net_dim(&ena_napi->dim, dim_sample); + + rx_ring->per_napi_packets = 0; +} + +void ena_unmask_interrupt(struct ena_ring *tx_ring, + struct ena_ring *rx_ring) +{ + u32 rx_interval = tx_ring->interrupt_interval; + struct ena_eth_io_intr_reg intr_reg; + bool no_moderation_update = true; + + /* Rx ring can be NULL when for XDP tx queues which don't have an + * accompanying rx_ring pair. + */ + if (rx_ring) { + rx_interval = ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev) ? + rx_ring->interrupt_interval : + ena_com_get_nonadaptive_moderation_interval_rx(rx_ring->ena_dev); + + no_moderation_update &= !rx_ring->interrupt_interval_changed; + rx_ring->interrupt_interval_changed = false; + } + + no_moderation_update &= !tx_ring->interrupt_interval_changed; + tx_ring->interrupt_interval_changed = false; + + /* Update intr register: rx intr delay, + * tx intr delay and interrupt unmask + */ + ena_com_update_intr_reg(&intr_reg, + rx_interval, + tx_ring->interrupt_interval, + true, + no_moderation_update); + + ena_increase_stat(&tx_ring->tx_stats.unmask_interrupt, 1, + &tx_ring->syncp); + + /* It is a shared MSI-X. + * Tx and Rx CQ have pointer to it. + * So we use one of them to reach the intr reg + * The Tx ring is used because the rx_ring is NULL for XDP queues + */ + ena_com_unmask_intr(tx_ring->ena_com_io_cq, &intr_reg); +} + +void ena_update_ring_numa_node(struct ena_ring *rx_ring) +{ + int cpu = get_cpu(); + int numa_node; + + if (likely(rx_ring->cpu == cpu)) + goto out; + + rx_ring->cpu = cpu; + + numa_node = cpu_to_node(cpu); + + if (likely(rx_ring->numa_node == numa_node)) + goto out; + + put_cpu(); + + if (numa_node != NUMA_NO_NODE) { + ena_com_update_numa_node(rx_ring->ena_com_io_cq, numa_node); + rx_ring->numa_node = numa_node; + } + + return; +out: + put_cpu(); +} + +static int ena_io_poll(struct napi_struct *napi, int budget) +{ + struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); + struct ena_ring *tx_ring, *rx_ring; + int tx_work_done; + int rx_work_done = 0; + int tx_budget; + int napi_comp_call = 0; + int ret; + + tx_ring = ena_napi->tx_ring; + rx_ring = ena_napi->rx_ring; + + tx_budget = tx_ring->ring_size / ENA_TX_POLL_BUDGET_DIVIDER; + + if (!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) || + test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags)) { + napi_complete_done(napi, 0); + return 0; + } +#ifdef ENA_BUSY_POLL_SUPPORT + if (!ena_bp_lock_napi(rx_ring)) + return budget; +#endif + + tx_work_done = ena_clean_tx_irq(tx_ring, tx_budget); + /* On netpoll the budget is zero and the handler should only clean the + * tx completions. 
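+ *
+ * In other words: with budget == 0, rx_work_done stays 0, the Rx clean
+ * below is skipped entirely, and the unmask/moderation branch further
+ * down cannot be taken because (budget > rx_work_done) does not hold, so
+ * only the Tx completions gathered above are processed.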
+ */ + if (likely(budget)) + rx_work_done = ena_clean_rx_irq(rx_ring, napi, budget); + + /* If the device is about to reset or down, avoid unmask + * the interrupt and return 0 so NAPI won't reschedule + */ + if (unlikely(!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) || + test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags))) { + napi_complete_done(napi, 0); + ret = 0; + + } else if ((budget > rx_work_done) && (tx_budget > tx_work_done)) { + napi_comp_call = 1; + + /* Update numa and unmask the interrupt only when schedule + * from the interrupt context (vs from sk_busy_loop) + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) + if (napi_complete_done(napi, rx_work_done) && + READ_ONCE(ena_napi->interrupts_masked)) { +#else + napi_complete_done(napi, rx_work_done); + if (READ_ONCE(ena_napi->interrupts_masked)) { +#endif + smp_rmb(); /* make sure interrupts_masked is read */ + WRITE_ONCE(ena_napi->interrupts_masked, false); + /* We apply adaptive moderation on Rx path only. + * Tx uses static interrupt moderation. + */ + if (ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev)) + ena_adjust_adaptive_rx_intr_moderation(ena_napi); + + ena_update_ring_numa_node(rx_ring); + ena_unmask_interrupt(tx_ring, rx_ring); + } + + ret = rx_work_done; + } else { + ret = budget; + } + + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.napi_comp += napi_comp_call; + tx_ring->tx_stats.tx_poll++; + u64_stats_update_end(&tx_ring->syncp); + +#ifdef ENA_BUSY_POLL_SUPPORT + ena_bp_unlock_napi(rx_ring); +#endif + tx_ring->tx_stats.last_napi_jiffies = jiffies; + + return ret; +} + +static irqreturn_t ena_intr_msix_mgmnt(int irq, void *data) +{ + struct ena_adapter *adapter = (struct ena_adapter *)data; + + ena_com_admin_q_comp_intr_handler(adapter->ena_dev); + + /* Don't call the aenq handler before probe is done */ + if (likely(test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags))) + ena_com_aenq_intr_handler(adapter->ena_dev, data); + + return IRQ_HANDLED; +} + +/* ena_intr_msix_io - MSI-X Interrupt Handler for Tx/Rx + * @irq: interrupt number + * @data: pointer to a network interface private napi device structure + */ +static irqreturn_t ena_intr_msix_io(int irq, void *data) +{ + struct ena_napi *ena_napi = data; + + /* Used to check HW health */ + WRITE_ONCE(ena_napi->last_intr_jiffies, jiffies); + + WRITE_ONCE(ena_napi->interrupts_masked, true); + smp_wmb(); /* write interrupts_masked before calling napi */ + + napi_schedule_irqoff(&ena_napi->napi); + + return IRQ_HANDLED; +} + +/* Reserve a single MSI-X vector for management (admin + aenq). + * plus reserve one vector for each potential io queue. + * the number of potential io queues is the minimum of what the device + * supports and the number of vCPUs. 
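+ *
+ * Rough worked example: with 8 online vCPUs and a device that supports
+ * more than 8 I/O queues, max_num_io_queues is 8 and the driver asks for
+ * 8 + 1 = 9 vectors (one per potential I/O queue plus the management
+ * vector). If the PCI core grants fewer vectors than requested,
+ * num_io_queues is trimmed below to match what was actually allocated.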
+ */ +static int ena_enable_msix(struct ena_adapter *adapter) +{ + int msix_vecs, irq_cnt; +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + int i; +#endif + + if (test_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) { + netif_err(adapter, probe, adapter->netdev, + "Error, MSI-X is already enabled\n"); + return -EPERM; + } + + /* Reserved the max msix vectors we might need */ + msix_vecs = ENA_MAX_MSIX_VEC(adapter->max_num_io_queues); + netif_dbg(adapter, probe, adapter->netdev, + "Trying to enable MSI-X, vectors %d\n", msix_vecs); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries = vzalloc(msix_vecs * sizeof(struct msix_entry)); + + if (!adapter->msix_entries) + return -ENOMEM; + + for (i = 0; i < msix_vecs; i++) + adapter->msix_entries[i].entry = i; + + irq_cnt = pci_enable_msix_range(adapter->pdev, adapter->msix_entries, + ENA_MIN_MSIX_VEC, msix_vecs); +#else + irq_cnt = pci_alloc_irq_vectors(adapter->pdev, ENA_MIN_MSIX_VEC, + msix_vecs, PCI_IRQ_MSIX); +#endif + + if (irq_cnt < 0) { + netif_err(adapter, probe, adapter->netdev, + "Failed to enable MSI-X. irq_cnt %d\n", irq_cnt); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + vfree(adapter->msix_entries); + adapter->msix_entries = NULL; +#endif + return -ENOSPC; + } + + if (irq_cnt != msix_vecs) { + netif_notice(adapter, probe, adapter->netdev, + "Enable only %d MSI-X (out of %d), reduce the number of queues\n", + irq_cnt, msix_vecs); + adapter->num_io_queues = irq_cnt - ENA_ADMIN_MSIX_VEC; + } + + if (ena_init_rx_cpu_rmap(adapter)) + netif_warn(adapter, probe, adapter->netdev, + "Failed to map IRQs to CPUs\n"); + + adapter->msix_vecs = irq_cnt; + set_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags); + + return 0; +} + +static void ena_setup_mgmnt_intr(struct ena_adapter *adapter) +{ + u32 cpu; + + snprintf(adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].name, + ENA_IRQNAME_SIZE, "ena-mgmnt@pci:%s", + pci_name(adapter->pdev)); + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].handler = + ena_intr_msix_mgmnt; + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].data = adapter; + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].vector = +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries[ENA_MGMNT_IRQ_IDX].vector; +#else + pci_irq_vector(adapter->pdev, ENA_MGMNT_IRQ_IDX); +#endif + cpu = cpumask_first(cpu_online_mask); + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].cpu = cpu; + cpumask_set_cpu(cpu, + &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].affinity_hint_mask); +} + +static void ena_setup_io_intr(struct ena_adapter *adapter) +{ + const struct cpumask *affinity = cpu_online_mask; + int irq_idx, i, cpu, io_queue_count, node; + struct net_device *netdev; + + netdev = adapter->netdev; + io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + node = dev_to_node(adapter->ena_dev->dmadev); + + if (node != NUMA_NO_NODE) + affinity = cpumask_of_node(node); + + for (i = 0; i < io_queue_count; i++) { + irq_idx = ENA_IO_IRQ_IDX(i); + cpu = cpumask_local_spread(i, node); + snprintf(adapter->irq_tbl[irq_idx].name, ENA_IRQNAME_SIZE, + "%s-Tx-Rx-%d", netdev->name, i); + adapter->irq_tbl[irq_idx].handler = ena_intr_msix_io; + adapter->irq_tbl[irq_idx].data = &adapter->ena_napi[i]; + adapter->irq_tbl[irq_idx].vector = +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries[irq_idx].vector; +#else + pci_irq_vector(adapter->pdev, irq_idx); +#endif + adapter->irq_tbl[irq_idx].cpu = cpu; + + cpumask_copy(&adapter->irq_tbl[irq_idx].affinity_hint_mask, affinity); + } +} + +static int ena_request_mgmnt_irq(struct ena_adapter *adapter) +{ + unsigned 
long flags = 0; + struct ena_irq *irq; + int rc; + + irq = &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX]; + rc = request_irq(irq->vector, irq->handler, flags, irq->name, + irq->data); + if (rc) { + netif_err(adapter, probe, adapter->netdev, + "Failed to request admin irq\n"); + return rc; + } + + netif_dbg(adapter, probe, adapter->netdev, + "Set affinity hint of mgmnt irq.to 0x%lx (irq vector: %d)\n", + irq->affinity_hint_mask.bits[0], irq->vector); + + irq_update_affinity_hint(irq->vector, &irq->affinity_hint_mask); + + return rc; +} + +static int ena_request_io_irq(struct ena_adapter *adapter) +{ + u32 io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + unsigned long flags = 0; + struct ena_irq *irq; + int rc = 0, i, k; + + if (!test_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to request I/O IRQ: MSI-X is not enabled\n"); + return -EINVAL; + } + + for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) { + irq = &adapter->irq_tbl[i]; + rc = request_irq(irq->vector, irq->handler, flags, irq->name, + irq->data); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to request I/O IRQ. index %d rc %d\n", + i, rc); + goto err; + } + + netif_dbg(adapter, ifup, adapter->netdev, + "Set affinity hint of irq. index %d to 0x%lx (irq vector: %d)\n", + i, irq->affinity_hint_mask.bits[0], irq->vector); + + irq_update_affinity_hint(irq->vector, &irq->affinity_hint_mask); + } + + return rc; + +err: + for (k = ENA_IO_IRQ_FIRST_IDX; k < i; k++) { + irq = &adapter->irq_tbl[k]; + free_irq(irq->vector, irq->data); + } + + return rc; +} + +static void ena_free_mgmnt_irq(struct ena_adapter *adapter) +{ + struct ena_irq *irq; + + irq = &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX]; + synchronize_irq(irq->vector); + irq_update_affinity_hint(irq->vector, NULL); + free_irq(irq->vector, irq->data); +} + +static void ena_free_io_irq(struct ena_adapter *adapter) +{ + u32 io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + struct ena_irq *irq; + int i; + +#ifdef CONFIG_RFS_ACCEL + if (adapter->msix_vecs >= 1) { + free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap); + adapter->netdev->rx_cpu_rmap = NULL; + } +#endif /* CONFIG_RFS_ACCEL */ + + for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) { + irq = &adapter->irq_tbl[i]; + irq_update_affinity_hint(irq->vector, NULL); + free_irq(irq->vector, irq->data); + } +} + +static void ena_disable_msix(struct ena_adapter *adapter) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + if (test_and_clear_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) + pci_disable_msix(adapter->pdev); + + if (adapter->msix_entries) + vfree(adapter->msix_entries); + adapter->msix_entries = NULL; +#else + if (test_and_clear_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) + pci_free_irq_vectors(adapter->pdev); +#endif +} + +static void ena_disable_io_intr_sync(struct ena_adapter *adapter) +{ + u32 io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + int i; + + if (!netif_running(adapter->netdev)) + return; + + for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) + synchronize_irq(adapter->irq_tbl[i].vector); +} + +static void ena_del_napi_in_range(struct ena_adapter *adapter, + int first_index, + int count) +{ + int i; + + for (i = first_index; i < first_index + count; i++) { +#ifdef ENA_BUSY_POLL_SUPPORT + napi_hash_del(&adapter->ena_napi[i].napi); +#endif /* ENA_BUSY_POLL_SUPPORT */ + netif_napi_del(&adapter->ena_napi[i].napi); + +#ifdef 
ENA_XDP_SUPPORT + WARN_ON(ENA_IS_XDP_INDEX(adapter, i) && + adapter->ena_napi[i].rx_ring); +#endif /* ENA_XDP_SUPPORT */ + } +#ifdef ENA_BUSY_POLL_SUPPORT + + /* Wait until all uses of napi struct complete */ + synchronize_net(); +#endif /* ENA_BUSY_POLL_SUPPORT */ +} + +static void ena_init_napi_in_range(struct ena_adapter *adapter, + int first_index, int count) +{ + int (*napi_handler)(struct napi_struct *napi, int budget); + int i; + + for (i = first_index; i < first_index + count; i++) { + struct ena_napi *napi = &adapter->ena_napi[i]; + struct ena_ring *rx_ring, *tx_ring; + + memset(napi, 0, sizeof(*napi)); + + rx_ring = &adapter->rx_ring[i]; + tx_ring = &adapter->tx_ring[i]; + + napi_handler = ena_io_poll; +#ifdef ENA_XDP_SUPPORT +#ifdef ENA_AF_XDP_SUPPORT + if (ENA_IS_XDP_INDEX(adapter, i) || ENA_IS_XSK_RING(rx_ring)) +#else + if (ENA_IS_XDP_INDEX(adapter, i)) +#endif /* ENA_AF_XDP_SUPPORT */ + napi_handler = ena_xdp_io_poll; +#endif /* ENA_XDP_SUPPORT */ + + ena_netif_napi_add(adapter->netdev, &napi->napi, napi_handler); + +#ifdef ENA_BUSY_POLL_SUPPORT + napi_hash_add(&adapter->ena_napi[i].napi); + +#endif /* ENA_BUSY_POLL_SUPPORT */ + if (!ENA_IS_XDP_INDEX(adapter, i)) + napi->rx_ring = rx_ring; + + napi->tx_ring = tx_ring; + napi->qid = i; + } +} + +#ifdef ENA_BUSY_POLL_SUPPORT +static void ena_napi_disable_in_range(struct ena_adapter *adapter, + int first_index, + int count) +{ + struct ena_ring *rx_ring; + int i, timeout; + + for (i = first_index; i < first_index + count; i++) { + napi_disable(&adapter->ena_napi[i].napi); + + rx_ring = &adapter->rx_ring[i]; + timeout = 1000; + while (!ena_bp_disable(rx_ring)) { + netif_info(adapter, ifdown, adapter->netdev, + "Rx queue %d locked\n", i); + usleep_range(1000, 2000); + timeout--; + + if (!timeout) { + WARN(!ena_bp_disable(rx_ring), + "Unable to disable busy poll at ring %d\n", i); + break; + } + } + } +} +#else +static void ena_napi_disable_in_range(struct ena_adapter *adapter, + int first_index, + int count) +{ + int i; + + for (i = first_index; i < first_index + count; i++) + napi_disable(&adapter->ena_napi[i].napi); +} +#endif + +static void ena_napi_enable_in_range(struct ena_adapter *adapter, + int first_index, + int count) +{ + int i; + + for (i = first_index; i < first_index + count; i++) + napi_enable(&adapter->ena_napi[i].napi); +} + +/* Configure the Rx forwarding */ +static int ena_rss_configure(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc; + + /* In case the RSS table wasn't initialized by probe */ + if (!ena_dev->rss.tbl_log_size) { + rc = ena_rss_init_default(adapter); + if (unlikely(rc && (rc != -EOPNOTSUPP))) { + netif_err(adapter, ifup, adapter->netdev, "Failed to init RSS rc: %d\n", rc); + return rc; + } + } + + /* Set indirect table */ + rc = ena_com_indirect_table_set(ena_dev); + if (unlikely(rc && rc != -EOPNOTSUPP)) + return rc; + + /* Configure hash function (if supported) */ + rc = ena_com_set_hash_function(ena_dev); + if (unlikely(rc && (rc != -EOPNOTSUPP))) + return rc; + + /* Configure hash inputs (if supported) */ + rc = ena_com_set_hash_ctrl(ena_dev); + if (unlikely(rc && (rc != -EOPNOTSUPP))) + return rc; + + return 0; +} + +static int ena_up_complete(struct ena_adapter *adapter) +{ + int rc; + + rc = ena_rss_configure(adapter); + if (unlikely(rc)) + return rc; + + ena_change_mtu(adapter->netdev, adapter->netdev->mtu); + + ena_refill_all_rx_bufs(adapter); + + /* enable transmits */ + netif_tx_start_all_queues(adapter->netdev); + + 
ena_napi_enable_in_range(adapter, + 0, + adapter->xdp_num_queues + adapter->num_io_queues); + + return 0; +} + +static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid) +{ + struct ena_com_create_io_ctx ctx; + struct ena_com_dev *ena_dev; + struct ena_ring *tx_ring; + u32 msix_vector; + u16 ena_qid; + int rc; + + ena_dev = adapter->ena_dev; + + tx_ring = &adapter->tx_ring[qid]; + msix_vector = ENA_IO_IRQ_IDX(qid); + ena_qid = ENA_IO_TXQ_IDX(qid); + + memset(&ctx, 0x0, sizeof(ctx)); + + ctx.direction = ENA_COM_IO_QUEUE_DIRECTION_TX; + ctx.qid = ena_qid; + ctx.mem_queue_type = ena_dev->tx_mem_queue_type; + ctx.msix_vector = msix_vector; + ctx.queue_size = tx_ring->ring_size; + ctx.numa_node = tx_ring->numa_node; + + rc = ena_com_create_io_queue(ena_dev, &ctx); + if (unlikely(rc)) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to create I/O TX queue num %d rc: %d\n", + qid, rc); + return rc; + } + + rc = ena_com_get_io_handlers(ena_dev, ena_qid, + &tx_ring->ena_com_io_sq, + &tx_ring->ena_com_io_cq); + if (unlikely(rc)) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to get TX queue handlers. TX queue num %d rc: %d\n", + qid, rc); + ena_com_destroy_io_queue(ena_dev, ena_qid); + return rc; + } + + return rc; +} + +int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, + int first_index, int count) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc, i; + + for (i = first_index; i < first_index + count; i++) { + rc = ena_create_io_tx_queue(adapter, i); + if (unlikely(rc)) + goto create_err; + } + + return 0; + +create_err: + while (i-- > first_index) + ena_com_destroy_io_queue(ena_dev, ENA_IO_TXQ_IDX(i)); + + return rc; +} + +static int ena_create_io_rx_queue(struct ena_adapter *adapter, int qid) +{ + struct ena_com_dev *ena_dev; + struct ena_com_create_io_ctx ctx; + struct ena_ring *rx_ring; + u32 msix_vector; + u16 ena_qid; + int rc; + + ena_dev = adapter->ena_dev; + + rx_ring = &adapter->rx_ring[qid]; + msix_vector = ENA_IO_IRQ_IDX(qid); + ena_qid = ENA_IO_RXQ_IDX(qid); + + memset(&ctx, 0x0, sizeof(ctx)); + + ctx.qid = ena_qid; + ctx.direction = ENA_COM_IO_QUEUE_DIRECTION_RX; + ctx.mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + ctx.msix_vector = msix_vector; + ctx.queue_size = rx_ring->ring_size; + ctx.numa_node = rx_ring->numa_node; + + rc = ena_com_create_io_queue(ena_dev, &ctx); + if (unlikely(rc)) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to create I/O RX queue num %d rc: %d\n", + qid, rc); + return rc; + } + + rc = ena_com_get_io_handlers(ena_dev, ena_qid, + &rx_ring->ena_com_io_sq, + &rx_ring->ena_com_io_cq); + if (unlikely(rc)) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to get RX queue handlers. 
RX queue num %d rc: %d\n", + qid, rc); + goto err; + } + + ena_com_update_numa_node(rx_ring->ena_com_io_cq, ctx.numa_node); + + return rc; +err: + ena_com_destroy_io_queue(ena_dev, ena_qid); + return rc; +} + +static int ena_create_all_io_rx_queues(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc, i; + + for (i = 0; i < adapter->num_io_queues; i++) { + rc = ena_create_io_rx_queue(adapter, i); + if (unlikely(rc)) + goto create_err; + INIT_WORK(&adapter->ena_napi[i].dim.work, ena_dim_work); + + ena_xdp_register_rxq_info(&adapter->rx_ring[i]); + } + + return 0; + +create_err: + while (i--) { + ena_xdp_unregister_rxq_info(&adapter->rx_ring[i]); + cancel_work_sync(&adapter->ena_napi[i].dim.work); + ena_com_destroy_io_queue(ena_dev, ENA_IO_RXQ_IDX(i)); + } + + return rc; +} + +static void set_io_rings_size(struct ena_adapter *adapter, + int new_tx_size, + int new_rx_size) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) { + adapter->tx_ring[i].ring_size = new_tx_size; + adapter->rx_ring[i].ring_size = new_rx_size; + } +} + +/* This function allows queue allocation to backoff when the system is + * low on memory. If there is not enough memory to allocate io queues + * the driver will try to allocate smaller queues. + * + * The backoff algorithm is as follows: + * 1. Try to allocate TX and RX and if successful. + * 1.1. return success + * + * 2. Divide by 2 the size of the larger of RX and TX queues (or both if their size is the same). + * + * 3. If TX or RX is smaller than 256 + * 3.1. return failure. + * 4. else + * 4.1. go back to 1. + */ +static int create_queues_with_size_backoff(struct ena_adapter *adapter) +{ + int rc, cur_rx_ring_size, cur_tx_ring_size; + int new_rx_ring_size, new_tx_ring_size; + + /* current queue sizes might be set to smaller than the requested + * ones due to past queue allocation failures. + */ + set_io_rings_size(adapter, adapter->requested_tx_ring_size, + adapter->requested_rx_ring_size); + + while (1) { +#ifdef ENA_XDP_SUPPORT + if (ena_xdp_present(adapter)) { + rc = ena_setup_and_create_all_xdp_queues(adapter); + + if (rc) + goto err_setup_tx; + } +#endif /* ENA_XDP_SUPPORT */ + rc = ena_setup_tx_resources_in_range(adapter, + 0, + adapter->num_io_queues); + if (rc) + goto err_setup_tx; + + rc = ena_create_io_tx_queues_in_range(adapter, + 0, + adapter->num_io_queues); + if (rc) + goto err_create_tx_queues; + + rc = ena_setup_all_rx_resources(adapter); + if (rc) + goto err_setup_rx; + + rc = ena_create_all_io_rx_queues(adapter); + if (rc) + goto err_create_rx_queues; + + rc = ena_create_page_caches(adapter); + if (rc) /* Cache memory is freed in case of failure */ + goto err_create_rx_queues; + + return 0; + +err_create_rx_queues: + ena_free_all_io_rx_resources(adapter); +err_setup_rx: + ena_destroy_all_tx_queues(adapter); +err_create_tx_queues: + ena_free_all_io_tx_resources(adapter); +err_setup_tx: + if (rc != -ENOMEM) { + netif_err(adapter, ifup, adapter->netdev, + "Queue creation failed with error code %d\n", + rc); + return rc; + } + + cur_tx_ring_size = adapter->tx_ring[0].ring_size; + cur_rx_ring_size = adapter->rx_ring[0].ring_size; + + netif_err(adapter, ifup, adapter->netdev, + "Not enough memory to create queues with sizes TX=%d, RX=%d\n", + cur_tx_ring_size, cur_rx_ring_size); + + new_tx_ring_size = cur_tx_ring_size; + new_rx_ring_size = cur_rx_ring_size; + + /* Decrease the size of the larger queue, or + * decrease both if they are the same size. 
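+ *
+ * Worked example of the backoff: if TX=1024, RX=256 fail with -ENOMEM,
+ * only TX is halved to 512; if TX=512, RX=512 fail, both drop to 256.
+ * Once another halving would take either ring below ENA_MIN_RING_SIZE,
+ * the loop gives up and returns the last error code.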
+ */ + if (cur_rx_ring_size <= cur_tx_ring_size) + new_tx_ring_size = cur_tx_ring_size / 2; + if (cur_rx_ring_size >= cur_tx_ring_size) + new_rx_ring_size = cur_rx_ring_size / 2; + + if (new_tx_ring_size < ENA_MIN_RING_SIZE || + new_rx_ring_size < ENA_MIN_RING_SIZE) { + netif_err(adapter, ifup, adapter->netdev, + "Queue creation failed with the smallest possible queue size of %d for both queues. Not retrying with smaller queues\n", + ENA_MIN_RING_SIZE); + return rc; + } + + netif_err(adapter, ifup, adapter->netdev, + "Retrying queue creation with sizes TX=%d, RX=%d\n", + new_tx_ring_size, + new_rx_ring_size); + + set_io_rings_size(adapter, new_tx_ring_size, + new_rx_ring_size); + } +} + +int ena_up(struct ena_adapter *adapter) +{ + int io_queue_count, rc, i; + + netif_dbg(adapter, ifup, adapter->netdev, "%s\n", __func__); + + io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + ena_setup_io_intr(adapter); + + /* napi poll functions should be initialized before running + * request_irq(), to handle a rare condition where there is a pending + * interrupt, causing the ISR to fire immediately while the poll + * function wasn't set yet, causing a null dereference + */ + ena_init_napi_in_range(adapter, 0, io_queue_count); + + /* Enabling DIM needs to happen before enabling IRQs since DIM + * is run from napi routine + */ + if (ena_com_interrupt_moderation_supported(adapter->ena_dev)) + ena_com_enable_adaptive_moderation(adapter->ena_dev); + + rc = ena_request_io_irq(adapter); + if (rc) + goto err_req_irq; + + rc = create_queues_with_size_backoff(adapter); + if (rc) + goto err_create_queues_with_backoff; + + rc = ena_up_complete(adapter); + if (unlikely(rc)) + goto err_up; + + if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags)) + netif_carrier_on(adapter->netdev); + + ena_increase_stat(&adapter->dev_stats.interface_up, 1, + &adapter->syncp); + + set_bit(ENA_FLAG_DEV_UP, &adapter->flags); + + /* Enable completion queues interrupt */ + for (i = 0; i < adapter->num_io_queues; i++) + ena_unmask_interrupt(&adapter->tx_ring[i], + &adapter->rx_ring[i]); + + /* schedule napi in case we had pending packets + * from the last time we disable napi + */ + for (i = 0; i < io_queue_count; i++) + napi_schedule(&adapter->ena_napi[i].napi); + + return rc; + +err_up: + ena_free_page_caches(adapter); + ena_destroy_all_tx_queues(adapter); + ena_free_all_io_tx_resources(adapter); + ena_destroy_all_rx_queues(adapter); + ena_free_all_io_rx_resources(adapter); +err_create_queues_with_backoff: + ena_free_io_irq(adapter); +err_req_irq: + ena_del_napi_in_range(adapter, 0, io_queue_count); + + return rc; +} + +void ena_down(struct ena_adapter *adapter) +{ + int io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + + netif_dbg(adapter, ifdown, adapter->netdev, "%s\n", __func__); + + clear_bit(ENA_FLAG_DEV_UP, &adapter->flags); + + ena_increase_stat(&adapter->dev_stats.interface_down, 1, + &adapter->syncp); + + netif_carrier_off(adapter->netdev); + netif_tx_disable(adapter->netdev); + + /* After this point the napi handler won't enable the tx queue */ + ena_napi_disable_in_range(adapter, 0, io_queue_count); + + if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) { + int rc; + + rc = ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason); + if (rc) + netif_err(adapter, ifdown, adapter->netdev, + "Device reset failed\n"); + /* stop submitting admin commands on a device that was reset */ + ena_com_set_admin_running_state(adapter->ena_dev, false); + } + + ena_destroy_all_io_queues(adapter); + + 
ena_disable_io_intr_sync(adapter); + ena_free_io_irq(adapter); + ena_del_napi_in_range(adapter, 0, io_queue_count); + + ena_free_all_tx_bufs(adapter); + ena_free_all_rx_bufs(adapter); + ena_free_all_cache_pages(adapter); + ena_free_page_caches(adapter); + ena_free_all_io_tx_resources(adapter); + ena_free_all_io_rx_resources(adapter); +} + +/* ena_open - Called when a network interface is made active + * @netdev: network interface device structure + * + * Returns 0 on success, negative value on failure + * + * The open entry point is called when a network interface is made + * active by the system (IFF_UP). At this point all resources needed + * for transmit and receive operations are allocated, the interrupt + * handler is registered with the OS, the watchdog timer is started, + * and the stack is notified that the interface is ready. + */ +static int ena_open(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc; + + /* Notify the stack of the actual queue counts. */ + rc = netif_set_real_num_tx_queues(netdev, adapter->num_io_queues); + if (rc) { + netif_err(adapter, ifup, netdev, "Can't set num tx queues\n"); + return rc; + } + + rc = netif_set_real_num_rx_queues(netdev, adapter->num_io_queues); + if (rc) { + netif_err(adapter, ifup, netdev, "Can't set num rx queues\n"); + return rc; + } + + rc = ena_up(adapter); + if (rc) + return rc; + + return rc; +} + +/* ena_close - Disables a network interface + * @netdev: network interface device structure + * + * Returns 0, this is not allowed to fail + * + * The close entry point is called when an interface is de-activated + * by the OS. The hardware is still under the drivers control, but + * needs to be disabled. A global MAC reset is issued to stop the + * hardware, and all transmit and receive resources are freed. + */ +static int ena_close(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + u8 *debug_area; + + netif_dbg(adapter, ifdown, netdev, "%s\n", __func__); + + if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) + return 0; + + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + ena_down(adapter); + + /* Check for device status and issue reset if needed*/ + check_for_admin_com_state(adapter); + if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + netif_err(adapter, ifdown, adapter->netdev, + "Destroy failure, restarting device\n"); + + debug_area = adapter->ena_dev->host_attr.debug_area_virt_addr; + if (debug_area) + ena_dump_stats_to_buf(adapter, debug_area); + ena_dump_stats_to_dmesg(adapter); + /* rtnl lock already obtained in dev_ioctl() layer */ + ena_destroy_device(adapter, false); + ena_restore_device(adapter); + } + + return 0; +} + +int ena_set_lpc_state(struct ena_adapter *adapter, bool enabled) +{ + /* In XDP, lpc_size might be positive even with LPC disabled, use cache + * pointer instead. + */ + struct ena_page_cache *page_cache = adapter->rx_ring->page_cache; + + /* Exit early if LPC state doesn't change */ + if (enabled == !!page_cache) + return 0; + + if (enabled && !ena_is_lpc_supported(adapter, adapter->rx_ring, true)) + return -EOPNOTSUPP; + + adapter->used_lpc_size = enabled ? adapter->configured_lpc_size : 0; + + /* rtnl lock is already obtained in dev_ioctl() layer, so it's safe to + * re-initialize IO resources. 
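+	 * The close/up cycle below rebuilds the IO rings, so the page cache is
+	 * set up again according to the new used_lpc_size value.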
+ */ + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) { + ena_close(adapter->netdev); + ena_up(adapter); + } + + return 0; +} + +int ena_update_queue_params(struct ena_adapter *adapter, + u32 new_tx_size, + u32 new_rx_size, + u32 new_llq_header_len) +{ + bool dev_was_up, large_llq_changed = false; + int rc = 0; + + dev_was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + ena_close(adapter->netdev); + adapter->requested_tx_ring_size = new_tx_size; + adapter->requested_rx_ring_size = new_rx_size; + ena_init_io_rings(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); + +#ifdef ENA_LARGE_LLQ_ETHTOOL + large_llq_changed = adapter->ena_dev->tx_mem_queue_type == + ENA_ADMIN_PLACEMENT_POLICY_DEV; + large_llq_changed &= + new_llq_header_len != adapter->ena_dev->tx_max_header_size; + +#endif /* ENA_LARGE_LLQ_ETHTOOL */ + /* a check that the configuration is valid is done by caller */ + if (large_llq_changed) { + bool large_llq_requested = new_llq_header_len == ENA_LLQ_LARGE_HEADER; + + adapter->llq_policy = large_llq_requested ? + ENA_LLQ_HEADER_SIZE_POLICY_LARGE : + ENA_LLQ_HEADER_SIZE_POLICY_NORMAL; + + ena_destroy_device(adapter, false); + rc = ena_restore_device(adapter); + } + + return dev_was_up && !rc ? ena_up(adapter) : rc; +} + +int ena_set_rx_copybreak(struct ena_adapter *adapter, u32 rx_copybreak) +{ + struct ena_ring *rx_ring; + int i; + + if (rx_copybreak > min_t(u16, adapter->netdev->mtu, ENA_PAGE_SIZE)) + return -EINVAL; + + adapter->rx_copybreak = rx_copybreak; + + for (i = 0; i < adapter->num_io_queues; i++) { + rx_ring = &adapter->rx_ring[i]; + rx_ring->rx_copybreak = rx_copybreak; + } + + return 0; +} + +int ena_update_queue_count(struct ena_adapter *adapter, u32 new_channel_count) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; +#ifdef ENA_XDP_SUPPORT + int prev_channel_count; +#endif /* ENA_XDP_SUPPORT */ + bool dev_was_up; + + dev_was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + ena_close(adapter->netdev); +#ifdef ENA_XDP_SUPPORT + prev_channel_count = adapter->num_io_queues; +#endif /* ENA_XDP_SUPPORT */ + adapter->num_io_queues = new_channel_count; +#ifdef ENA_XDP_SUPPORT + if (ena_xdp_present(adapter) && + ena_xdp_allowed(adapter) == ENA_XDP_ALLOWED) { + adapter->xdp_first_ring = new_channel_count; + adapter->xdp_num_queues = new_channel_count; + if (prev_channel_count > new_channel_count) + ena_xdp_exchange_program_rx_in_range(adapter, + NULL, + new_channel_count, + prev_channel_count); + else + ena_xdp_exchange_program_rx_in_range(adapter, + adapter->xdp_bpf_prog, + prev_channel_count, + new_channel_count); + } +#endif /* ENA_XDP_SUPPORT */ + + /* We need to destroy the rss table so that the indirection + * table will be reinitialized by ena_up() + */ + ena_com_rss_destroy(ena_dev); + ena_init_io_rings(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); + return dev_was_up ? 
ena_open(adapter->netdev) : 0; +} + +static void ena_tx_csum(struct ena_com_tx_ctx *ena_tx_ctx, + struct sk_buff *skb, + bool disable_meta_caching) +{ + u32 mss = skb_shinfo(skb)->gso_size; + struct ena_com_tx_meta *ena_meta = &ena_tx_ctx->ena_meta; + u8 l4_protocol = 0; + + if ((skb->ip_summed == CHECKSUM_PARTIAL) || mss) { + ena_tx_ctx->l4_csum_enable = 1; + if (mss) { + ena_tx_ctx->tso_enable = 1; + ena_meta->l4_hdr_len = tcp_hdr(skb)->doff; + ena_tx_ctx->l4_csum_partial = 0; + } else { + ena_tx_ctx->tso_enable = 0; + ena_meta->l4_hdr_len = 0; + ena_tx_ctx->l4_csum_partial = 1; + } + + switch (ip_hdr(skb)->version) { + case IPVERSION: + ena_tx_ctx->l3_proto = ENA_ETH_IO_L3_PROTO_IPV4; + if (ip_hdr(skb)->frag_off & htons(IP_DF)) + ena_tx_ctx->df = 1; + if (mss) + ena_tx_ctx->l3_csum_enable = 1; + l4_protocol = ip_hdr(skb)->protocol; + break; + case 6: + ena_tx_ctx->l3_proto = ENA_ETH_IO_L3_PROTO_IPV6; + l4_protocol = ipv6_hdr(skb)->nexthdr; + break; + default: + break; + } + + if (l4_protocol == IPPROTO_TCP) + ena_tx_ctx->l4_proto = ENA_ETH_IO_L4_PROTO_TCP; + else + ena_tx_ctx->l4_proto = ENA_ETH_IO_L4_PROTO_UDP; + + ena_meta->mss = mss; + ena_meta->l3_hdr_len = skb_network_header_len(skb); + ena_meta->l3_hdr_offset = skb_network_offset(skb); + ena_tx_ctx->meta_valid = 1; + } else if (disable_meta_caching) { + memset(ena_meta, 0, sizeof(*ena_meta)); + ena_tx_ctx->meta_valid = 1; + } else { + ena_tx_ctx->meta_valid = 0; + } +} + +static int ena_check_and_linearize_skb(struct ena_ring *tx_ring, + struct sk_buff *skb) +{ + int num_frags, header_len, rc; + + num_frags = skb_shinfo(skb)->nr_frags; + header_len = skb_headlen(skb); + + if (num_frags < tx_ring->sgl_size) + return 0; + + if ((num_frags == tx_ring->sgl_size) && + (header_len < tx_ring->tx_max_header_size)) + return 0; + + ena_increase_stat(&tx_ring->tx_stats.linearize, 1, &tx_ring->syncp); + + rc = skb_linearize(skb); + if (unlikely(rc)) { + ena_increase_stat(&tx_ring->tx_stats.linearize_failed, 1, + &tx_ring->syncp); + } + + return rc; +} + +static int ena_tx_map_skb(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info, + struct sk_buff *skb, + void **push_hdr, + u16 *header_len) +{ + struct ena_adapter *adapter = tx_ring->adapter; + struct ena_com_buf *ena_buf; + dma_addr_t dma; + u32 skb_head_len, frag_len, last_frag; + u16 push_len = 0; + u16 delta = 0; + int i = 0; + + skb_head_len = skb_headlen(skb); + tx_info->skb = skb; + ena_buf = tx_info->bufs; + + if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* When the device is LLQ mode, the driver will copy + * the header into the device memory space. + * the ena_com layer assume the header is in a linear + * memory space. + * This assumption might be wrong since part of the header + * can be in the fragmented buffers. + * Use skb_header_pointer to make sure the header is in a + * linear memory space. 
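+		 * For example, with tx_max_header_size of 96 bytes and only 64
+		 * linear bytes in the skb, the full 96 bytes are copied into
+		 * push_buf_intermediate_buf and the 32 already-pushed bytes are
+		 * skipped (via 'delta') when the fragments are DMA-mapped below.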
+ */ + + push_len = min_t(u32, skb->len, tx_ring->tx_max_header_size); + *push_hdr = skb_header_pointer(skb, 0, push_len, + tx_ring->push_buf_intermediate_buf); + *header_len = push_len; + if (unlikely(skb->data != *push_hdr)) { + ena_increase_stat(&tx_ring->tx_stats.llq_buffer_copy, 1, + &tx_ring->syncp); + + delta = push_len - skb_head_len; + } + } else { + *push_hdr = NULL; + *header_len = min_t(u32, skb_head_len, + tx_ring->tx_max_header_size); + } + + netif_dbg(adapter, tx_queued, adapter->netdev, + "skb: %p header_buf->vaddr: %p push_len: %d\n", skb, + *push_hdr, push_len); + + if (skb_head_len > push_len) { + dma = dma_map_single(tx_ring->dev, skb->data + push_len, + skb_head_len - push_len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) + goto error_report_dma_error; + + ena_buf->paddr = dma; + ena_buf->len = skb_head_len - push_len; + + ena_buf++; + tx_info->num_of_bufs++; + tx_info->map_linear_data = 1; + } else { + tx_info->map_linear_data = 0; + } + + last_frag = skb_shinfo(skb)->nr_frags; + + for (i = 0; i < last_frag; i++) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + frag_len = skb_frag_size(frag); + + if (unlikely(delta >= frag_len)) { + delta -= frag_len; + continue; + } + + dma = skb_frag_dma_map(tx_ring->dev, frag, delta, + frag_len - delta, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) + goto error_report_dma_error; + + ena_buf->paddr = dma; + ena_buf->len = frag_len - delta; + ena_buf++; + tx_info->num_of_bufs++; + delta = 0; + } + + return 0; + +error_report_dma_error: + ena_increase_stat(&tx_ring->tx_stats.dma_mapping_err, 1, + &tx_ring->syncp); + netif_warn(adapter, tx_queued, adapter->netdev, "Failed to map skb\n"); + + tx_info->skb = NULL; + + ena_unmap_tx_buff(tx_ring, tx_info); + + return -EINVAL; +} + +/* Called with netif_tx_lock. */ +static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ena_adapter *adapter = netdev_priv(dev); + struct ena_tx_buffer *tx_info; + struct ena_com_tx_ctx ena_tx_ctx; + struct ena_ring *tx_ring; + struct netdev_queue *txq; + void *push_hdr; + u16 next_to_use, req_id, header_len; + int qid, rc; + + netif_dbg(adapter, tx_queued, dev, "%s skb %p\n", __func__, skb); + /* Determine which tx ring we will be placed on */ + qid = skb_get_queue_mapping(skb); + tx_ring = &adapter->tx_ring[qid]; + txq = netdev_get_tx_queue(dev, qid); + + rc = ena_check_and_linearize_skb(tx_ring, skb); + if (unlikely(rc)) + goto error_drop_packet; + + next_to_use = tx_ring->next_to_use; + req_id = tx_ring->free_ids[next_to_use]; + tx_info = &tx_ring->tx_buffer_info[req_id]; + tx_info->num_of_bufs = 0; + + WARN(tx_info->skb, "SKB isn't NULL req_id %d\n", req_id); + + rc = ena_tx_map_skb(tx_ring, tx_info, skb, &push_hdr, &header_len); + if (unlikely(rc)) + goto error_drop_packet; + + memset(&ena_tx_ctx, 0x0, sizeof(struct ena_com_tx_ctx)); + ena_tx_ctx.ena_bufs = tx_info->bufs; + ena_tx_ctx.push_header = push_hdr; + ena_tx_ctx.num_bufs = tx_info->num_of_bufs; + ena_tx_ctx.req_id = req_id; + ena_tx_ctx.header_len = header_len; + + /* set flags and meta data */ + ena_tx_csum(&ena_tx_ctx, skb, tx_ring->disable_meta_caching); + + rc = ena_xmit_common(adapter, + tx_ring, + tx_info, + &ena_tx_ctx, + next_to_use, + skb->len); + if (unlikely(rc)) + goto error_unmap_dma; + + if (tx_ring->enable_bql) + netdev_tx_sent_queue(txq, skb->len); + + /* stop the queue when no more space available, the packet can have up + * to sgl_size + 2. 
one for the meta descriptor and one for header + * (if the header is larger than tx_max_header_size). + */ + if (unlikely(!ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, + tx_ring->sgl_size + 2))) { + netif_dbg(adapter, tx_queued, dev, "%s stop queue %d\n", + __func__, qid); + + netif_tx_stop_queue(txq); + ena_increase_stat(&tx_ring->tx_stats.queue_stop, 1, + &tx_ring->syncp); + + /* There is a rare condition where this function decide to + * stop the queue but meanwhile clean_tx_irq updates + * next_to_completion and terminates. + * The queue will remain stopped forever. + * To solve this issue add a mb() to make sure that + * netif_tx_stop_queue() write is vissible before checking if + * there is additional space in the queue. + */ + smp_mb(); + + if (ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, + ENA_TX_WAKEUP_THRESH)) { + netif_tx_wake_queue(txq); + ena_increase_stat(&tx_ring->tx_stats.queue_wakeup, 1, + &tx_ring->syncp); + } + } + + skb_tx_timestamp(skb); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) + if (netif_xmit_stopped(txq) || !netdev_xmit_more()) +#endif + /* trigger the dma engine. ena_ring_tx_doorbell() + * calls a memory barrier inside it. + */ + ena_ring_tx_doorbell(tx_ring); + + return NETDEV_TX_OK; + +error_unmap_dma: + ena_unmap_tx_buff(tx_ring, tx_info); + tx_info->skb = NULL; + +error_drop_packet: +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) + if (!netdev_xmit_more() && ena_com_used_q_entries(tx_ring->ena_com_io_sq)) +#else + if (ena_com_used_q_entries(tx_ring->ena_com_io_sq)) +#endif + ena_ring_tx_doorbell(tx_ring); + + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} + +#ifdef HAVE_SET_RX_MODE + +/* Unicast, Multicast and Promiscuous mode set + * @netdev: network interface device structure + * + * The set_rx_mode entry point is called whenever the unicast or multicast + * address lists or the network interface flags are updated. This routine is + * responsible for configuring the hardware for proper unicast, multicast, + * promiscuous mode, and all-multi behavior. 
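+ * The handler below currently only distinguishes the promiscuous,
+ * all-multicast, empty and non-empty multicast list cases; programming the
+ * device filters accordingly is still a TODO.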
+ */ +static void ena_set_rx_mode(struct net_device *netdev) +{ +/* struct ena_adapter *adapter = netdev_priv(netdev); */ + /* TODO set Rx mode */ + + if (netdev->flags & IFF_PROMISC) { + } else if (netdev->flags & IFF_ALLMULTI) { + } else if (netdev_mc_empty(netdev)) { + } else { + } +} +#endif /* HAVE_SET_RX_MODE */ + +static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pdev) +{ + struct device *dev = &pdev->dev; + struct ena_admin_host_info *host_info; + ssize_t ret; + int rc; + + /* Allocate only the host info */ + rc = ena_com_allocate_host_info(ena_dev); + if (unlikely(rc)) { + dev_err(dev, "Cannot allocate host info\n"); + return; + } + + host_info = ena_dev->host_attr.host_info; + + host_info->bdf = pci_dev_id(pdev); + host_info->os_type = ENA_ADMIN_OS_LINUX; + host_info->kernel_ver = LINUX_VERSION_CODE; + ret = strscpy(host_info->kernel_ver_str, utsname()->version, + sizeof(host_info->kernel_ver_str)); + if (ret < 0) + dev_dbg(dev, + "kernel version string will be truncated, status = %zd\n", ret); + + host_info->os_dist = 0; + ret = strscpy(host_info->os_dist_str, utsname()->release, + sizeof(host_info->os_dist_str)); + if (ret < 0) + dev_dbg(dev, + "OS distribution string will be truncated, status = %zd\n", ret); + + host_info->driver_version = + (DRV_MODULE_GEN_MAJOR) | + (DRV_MODULE_GEN_MINOR << ENA_ADMIN_HOST_INFO_MINOR_SHIFT) | + (DRV_MODULE_GEN_SUBMINOR << ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT) | + ("g"[0] << ENA_ADMIN_HOST_INFO_MODULE_TYPE_SHIFT); + host_info->num_cpus = num_online_cpus(); + + host_info->driver_supported_features = + ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK | + ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK | + ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_MASK | + ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK | + ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK | + ENA_ADMIN_HOST_INFO_TX_IPV6_CSUM_OFFLOAD_MASK | + ENA_ADMIN_HOST_INFO_PHC_MASK; + + rc = ena_com_set_host_attributes(ena_dev); + if (unlikely(rc)) { + if (rc == -EOPNOTSUPP) + dev_warn(dev, "Cannot set host attributes\n"); + else + dev_err(dev, "Cannot set host attributes\n"); + + goto err; + } + + return; + +err: + ena_com_delete_host_info(ena_dev); +} + +static void ena_config_debug_area(struct ena_adapter *adapter) +{ + u32 debug_area_size; + int rc, ss_count; + + ss_count = ena_get_sset_count(adapter->netdev, ETH_SS_STATS); + if (ss_count <= 0) { + netif_err(adapter, drv, adapter->netdev, + "SS count is negative\n"); + return; + } + + /* allocate 32 bytes for each string and 64bit for the value */ + debug_area_size = ss_count * ETH_GSTRING_LEN + sizeof(u64) * ss_count; + + rc = ena_com_allocate_debug_area(adapter->ena_dev, debug_area_size); + if (unlikely(rc)) { + netif_err(adapter, drv, adapter->netdev, + "Cannot allocate debug area\n"); + return; + } + + rc = ena_com_set_host_attributes(adapter->ena_dev); + if (unlikely(rc)) { + if (rc == -EOPNOTSUPP) + netif_warn(adapter, drv, adapter->netdev, "Cannot set host attributes\n"); + else + netif_err(adapter, drv, adapter->netdev, + "Cannot set host attributes\n"); + goto err; + } + + return; +err: + ena_com_delete_debug_area(adapter->ena_dev); +} + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)) +#ifdef NDO_GET_STATS_64_V2 +static void ena_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +#else +static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +#endif +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_ring *rx_ring, *tx_ring; + 
unsigned int start; + u64 rx_overruns; + u64 rx_drops; + u64 tx_drops; + int i; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) +#ifdef NDO_GET_STATS_64_V2 + return; +#else + return NULL; +#endif + + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { + u64 bytes, packets; + + tx_ring = &adapter->tx_ring[i]; + + do { + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); + packets = tx_ring->tx_stats.cnt; + bytes = tx_ring->tx_stats.bytes; + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); + + stats->tx_packets += packets; + stats->tx_bytes += bytes; + + /* In XDP there isn't an RX queue counterpart */ + if (ENA_IS_XDP_INDEX(adapter, i)) + continue; + + rx_ring = &adapter->rx_ring[i]; + + do { + start = ena_u64_stats_fetch_begin(&rx_ring->syncp); + packets = rx_ring->rx_stats.cnt; + bytes = rx_ring->rx_stats.bytes; + } while (ena_u64_stats_fetch_retry(&rx_ring->syncp, start)); + + stats->rx_packets += packets; + stats->rx_bytes += bytes; + } + + do { + start = ena_u64_stats_fetch_begin(&adapter->syncp); + rx_drops = adapter->dev_stats.rx_drops; + tx_drops = adapter->dev_stats.tx_drops; + rx_overruns = adapter->dev_stats.rx_overruns; + } while (ena_u64_stats_fetch_retry(&adapter->syncp, start)); + + stats->rx_dropped = rx_drops; + stats->tx_dropped = tx_drops; + + stats->multicast = 0; + stats->collisions = 0; + + stats->rx_length_errors = 0; + stats->rx_crc_errors = 0; + stats->rx_frame_errors = 0; + stats->rx_fifo_errors = 0; + stats->rx_missed_errors = 0; + stats->tx_window_errors = 0; + stats->rx_over_errors = rx_overruns; + + stats->rx_errors = stats->rx_over_errors; + stats->tx_errors = 0; +#ifndef NDO_GET_STATS_64_V2 + return stats; +#endif +} +#else /* kernel > 2.6.36 */ +static struct net_device_stats *ena_get_stats(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_ring *rx_ring, *tx_ring; + unsigned long rx_drops; + struct net_device_stats *stats = &netdev->stats; + unsigned int start; + int i; + + memset(stats, 0, sizeof(*stats)); + for (i = 0; i < adapter->num_io_queues; i++) { + unsigned long bytes, packets; + + tx_ring = &adapter->tx_ring[i]; + do { + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); + packets = (unsigned long)tx_ring->tx_stats.cnt; + bytes = (unsigned long)tx_ring->tx_stats.bytes; + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); + + stats->tx_packets += packets; + stats->tx_bytes += bytes; + + rx_ring = &adapter->rx_ring[i]; + + do { + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); + packets = (unsigned long)rx_ring->rx_stats.cnt; + bytes = (unsigned long)rx_ring->rx_stats.bytes; + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); + + stats->rx_packets += packets; + stats->rx_bytes += bytes; + } + + do { + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); + rx_drops = (unsigned long)adapter->dev_stats.rx_drops; + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); + + stats->rx_dropped = rx_drops; + + stats->multicast = 0; + stats->collisions = 0; + + stats->rx_length_errors = 0; + stats->rx_crc_errors = 0; + stats->rx_frame_errors = 0; + stats->rx_fifo_errors = 0; + stats->rx_missed_errors = 0; + stats->tx_window_errors = 0; + + stats->rx_errors = 0; + stats->tx_errors = 0; + + return stats; +} +#endif +#ifdef ENA_BUSY_POLL_SUPPORT + +#define ENA_BP_NAPI_BUDGET 8 +static int ena_busy_poll(struct napi_struct *napi) +{ + struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); + struct ena_ring *rx_ring = 
ena_napi->rx_ring; + struct ena_adapter *adapter= rx_ring->adapter; + int done; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return LL_FLUSH_FAILED; + + if (!ena_bp_lock_poll(rx_ring)) + return LL_FLUSH_BUSY; + + done = ena_clean_rx_irq(rx_ring, napi, ENA_BP_NAPI_BUDGET); + if (likely(done)) + rx_ring->rx_stats.bp_cleaned += done; + else + rx_ring->rx_stats.bp_missed++; + + ena_bp_unlock_poll(rx_ring); + + return done; +} +#endif + +static const struct net_device_ops ena_netdev_ops = { + .ndo_open = ena_open, + .ndo_stop = ena_close, + .ndo_start_xmit = ena_start_xmit, +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)) + .ndo_get_stats64 = ena_get_stats64, +#else + .ndo_get_stats = ena_get_stats, +#endif +#ifdef HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER + .ndo_tx_timeout = ena_tx_timeout, +#else + .ndo_tx_timeout = ena_find_and_timeout_queue, +#endif + .ndo_change_mtu = ena_change_mtu, + .ndo_set_mac_address = NULL, +#ifdef HAVE_SET_RX_MODE + .ndo_set_rx_mode = ena_set_rx_mode, +#endif + .ndo_validate_addr = eth_validate_addr, +#ifdef ENA_BUSY_POLL_SUPPORT + .ndo_busy_poll = ena_busy_poll, +#endif +#ifdef ENA_XDP_SUPPORT + .ndo_bpf = ena_xdp, + .ndo_xdp_xmit = ena_xdp_xmit, +#if defined(ENA_TEST_AF_XDP) && defined(ENA_AF_XDP_SUPPORT) + .ndo_xsk_wakeup = ena_xdp_xsk_wakeup, +#endif /* defined(ENA_TEST_AF_XDP) && defined(ENA_AF_XDP_SUPPORT) */ +#endif /* ENA_XDP_SUPPORT */ +}; + +static int ena_calc_io_queue_size(struct ena_adapter *adapter, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + struct ena_admin_feature_llq_desc *llq = &get_feat_ctx->llq; + struct ena_com_dev *ena_dev = adapter->ena_dev; + u32 max_tx_queue_size; + u32 max_rx_queue_size; + u32 tx_queue_size; + + /* If this function is called after driver load, the ring sizes have already + * been configured. Take it into account when recalculating ring size. 
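+	 * For example, on the first call after probe the ring size is still
+	 * zero, so one of the defaults below is chosen (the wide-LLQ default
+	 * when large LLQ headers are in force); on later calls the previously
+	 * configured size is used as the starting point before being clamped
+	 * to the device limits further down.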
+ */ + if (adapter->tx_ring->ring_size) { + tx_queue_size = adapter->tx_ring->ring_size; + } else if (adapter->llq_policy == ENA_LLQ_HEADER_SIZE_POLICY_LARGE && + ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + tx_queue_size = ENA_DEFAULT_WIDE_LLQ_RING_SIZE; + } else { + tx_queue_size = ENA_DEFAULT_RING_SIZE; + } + + if (adapter->rx_ring->ring_size) + rx_queue_size = adapter->rx_ring->ring_size; + + if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { + struct ena_admin_queue_ext_feature_fields *max_queue_ext = + &get_feat_ctx->max_queue_ext.max_queue_ext; + max_rx_queue_size = min_t(u32, max_queue_ext->max_rx_cq_depth, + max_queue_ext->max_rx_sq_depth); + max_tx_queue_size = max_queue_ext->max_tx_cq_depth; + + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + max_tx_queue_size = min_t(u32, max_tx_queue_size, + llq->max_llq_depth); + else + max_tx_queue_size = min_t(u32, max_tx_queue_size, + max_queue_ext->max_tx_sq_depth); + + adapter->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queue_ext->max_per_packet_tx_descs); + adapter->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queue_ext->max_per_packet_rx_descs); + } else { + struct ena_admin_queue_feature_desc *max_queues = + &get_feat_ctx->max_queues; + max_rx_queue_size = min_t(u32, max_queues->max_cq_depth, + max_queues->max_sq_depth); + max_tx_queue_size = max_queues->max_cq_depth; + + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + max_tx_queue_size = min_t(u32, max_tx_queue_size, + llq->max_llq_depth); + else + max_tx_queue_size = min_t(u32, max_tx_queue_size, + max_queues->max_sq_depth); + + adapter->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queues->max_packet_tx_descs); + adapter->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queues->max_packet_rx_descs); + } + + if (adapter->llq_policy == ENA_LLQ_HEADER_SIZE_POLICY_LARGE) { + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + u32 max_wide_llq_size = max_tx_queue_size; + + if (llq->max_wide_llq_depth == 0) { + /* if there is no large llq max depth from device, we divide + * the queue size by 2, leaving the amount of memory + * used by the queues unchanged. 
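+				 * (A 256B wide-LLQ entry is twice the size of
+				 * the default 128B entry, so halving the depth
+				 * keeps the overall LLQ memory footprint the
+				 * same.)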
+ */ + max_wide_llq_size /= 2; + } else if (llq->max_wide_llq_depth < max_wide_llq_size) { + max_wide_llq_size = llq->max_wide_llq_depth; + } + if (max_wide_llq_size != max_tx_queue_size) { + max_tx_queue_size = max_wide_llq_size; + dev_info(&adapter->pdev->dev, + "Forcing large headers and decreasing maximum TX queue size to %d\n", + max_tx_queue_size); + } + } else { + dev_err(&adapter->pdev->dev, + "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); + + adapter->llq_policy = ENA_LLQ_HEADER_SIZE_POLICY_NORMAL; + } + } + + max_tx_queue_size = rounddown_pow_of_two(max_tx_queue_size); + max_rx_queue_size = rounddown_pow_of_two(max_rx_queue_size); + + if (max_tx_queue_size < ENA_MIN_RING_SIZE) { + netdev_err(adapter->netdev, "Device max TX queue size: %d < minimum: %d\n", + max_tx_queue_size, ENA_MIN_RING_SIZE); + return -EINVAL; + } + + if (max_rx_queue_size < ENA_MIN_RING_SIZE) { + netdev_err(adapter->netdev, "Device max RX queue size: %d < minimum: %d\n", + max_rx_queue_size, ENA_MIN_RING_SIZE); + return -EINVAL; + } + + tx_queue_size = clamp_val(tx_queue_size, ENA_MIN_RING_SIZE, + max_tx_queue_size); + rx_queue_size = clamp_val(rx_queue_size, ENA_MIN_RING_SIZE, + max_rx_queue_size); + + tx_queue_size = rounddown_pow_of_two(tx_queue_size); + rx_queue_size = rounddown_pow_of_two(rx_queue_size); + + adapter->max_tx_ring_size = max_tx_queue_size; + adapter->max_rx_ring_size = max_rx_queue_size; + adapter->requested_tx_ring_size = tx_queue_size; + adapter->requested_rx_ring_size = rx_queue_size; + + return 0; +} + +static int ena_device_validate_params(struct ena_adapter *adapter, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + struct net_device *netdev = adapter->netdev; + int rc; + + rc = ether_addr_equal(get_feat_ctx->dev_attr.mac_addr, + adapter->mac_addr); + if (!rc) { + netif_err(adapter, drv, netdev, + "Error, mac address are different\n"); + return -EINVAL; + } + + if (get_feat_ctx->dev_attr.max_mtu < netdev->mtu) { + netif_err(adapter, drv, netdev, + "Error, device max mtu is smaller than netdev MTU\n"); + return -EINVAL; + } + + return 0; +} + +static void ena_set_forced_llq_size_policy(struct ena_adapter *adapter) +{ + /* policy will be set according to device recommendation unless user + * forced either large/normal size + */ + if (force_large_llq_header != FORCE_LARGE_LLQ_HEADER_UNINIT_VALUE) { + /* user selection is prioritized on top of device recommendation */ + adapter->llq_policy = force_large_llq_header ? 
ENA_LLQ_HEADER_SIZE_POLICY_LARGE : + ENA_LLQ_HEADER_SIZE_POLICY_NORMAL; + } +} + +static int ena_set_llq_configurations(struct ena_adapter *adapter, + struct ena_llq_configurations *llq_config, + struct ena_admin_feature_llq_desc *llq) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + bool use_large_llq; + + llq_config->llq_header_location = ENA_ADMIN_INLINE_HEADER; + llq_config->llq_stride_ctrl = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY; + llq_config->llq_num_decs_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2; + + adapter->large_llq_header_supported = + !!(ena_dev->supported_features & BIT(ENA_ADMIN_LLQ)); + adapter->large_llq_header_supported &= + !!(llq->entry_size_ctrl_supported & + ENA_ADMIN_LIST_ENTRY_SIZE_256B); + + use_large_llq = adapter->llq_policy != ENA_LLQ_HEADER_SIZE_POLICY_NORMAL; + use_large_llq &= adapter->large_llq_header_supported; + + if (adapter->llq_policy == ENA_LLQ_HEADER_SIZE_POLICY_UNSPECIFIED) + use_large_llq &= (llq->entry_size_recommended == ENA_ADMIN_LIST_ENTRY_SIZE_256B); + + if (!use_large_llq) { + llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_128B; + llq_config->llq_ring_entry_size_value = 128; + adapter->llq_policy = ENA_LLQ_HEADER_SIZE_POLICY_NORMAL; + } else { + llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_256B; + llq_config->llq_ring_entry_size_value = 256; + adapter->llq_policy = ENA_LLQ_HEADER_SIZE_POLICY_LARGE; + } + + return 0; +} + +static int ena_set_queues_placement_policy(struct pci_dev *pdev, + struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq, + struct ena_llq_configurations *llq_default_configurations) +{ + int rc; + u32 llq_feature_mask; + + llq_feature_mask = 1 << ENA_ADMIN_LLQ; + if (!(ena_dev->supported_features & llq_feature_mask)) { + dev_warn(&pdev->dev, + "LLQ is not supported Fallback to host mode policy.\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + if (!ena_dev->mem_bar) { + netdev_err(ena_dev->net_device, + "LLQ is advertised as supported but device doesn't expose mem bar\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + rc = ena_com_config_dev_mode(ena_dev, llq, llq_default_configurations); + if (unlikely(rc)) { + dev_err(&pdev->dev, + "Failed to configure the device mode. 
Fallback to host mode policy.\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + } + + return 0; +} + +static int ena_map_llq_mem_bar(struct pci_dev *pdev, struct ena_com_dev *ena_dev, + int bars) +{ + bool has_mem_bar = !!(bars & BIT(ENA_MEM_BAR)); + + if (!has_mem_bar) + return 0; + + ena_dev->mem_bar = devm_ioremap_wc(&pdev->dev, + pci_resource_start(pdev, ENA_MEM_BAR), + pci_resource_len(pdev, ENA_MEM_BAR)); + + if (!ena_dev->mem_bar) + return -EFAULT; + + return 0; +} + +static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, + struct ena_com_dev_get_features_ctx *get_feat_ctx, + bool *wd_state) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct net_device *netdev = adapter->netdev; + struct ena_llq_configurations llq_config; + netdev_features_t prev_netdev_features; + struct device *dev = &pdev->dev; + bool readless_supported; + u32 aenq_groups; + int dma_width; + int rc; + + rc = ena_com_mmio_reg_read_request_init(ena_dev); + if (unlikely(rc)) { + dev_err(dev, "Failed to init mmio read less\n"); + return rc; + } + + /* The PCIe configuration space revision id indicate if mmio reg + * read is disabled + */ + readless_supported = !(pdev->revision & ENA_MMIO_DISABLE_REG_READ); + ena_com_set_mmio_read_mode(ena_dev, readless_supported); + + rc = ena_com_dev_reset(ena_dev, ENA_REGS_RESET_NORMAL); + if (rc) { + dev_err(dev, "Can not reset device\n"); + goto err_mmio_read_less; + } + + rc = ena_com_validate_version(ena_dev); + if (rc) { + dev_err(dev, "Device version is too low\n"); + goto err_mmio_read_less; + } + + dma_width = ena_com_get_dma_width(ena_dev); + if (unlikely(dma_width < 0)) { + dev_err(dev, "Invalid dma width value %d", dma_width); + rc = dma_width; + goto err_mmio_read_less; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) + rc = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(dma_width)); + if (rc) { + dev_err(dev, "dma_set_mask_and_coherent failed %d\n", rc); + goto err_mmio_read_less; + } +#else + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(dma_width)); + if (rc) { + dev_err(dev, "pci_set_dma_mask failed %d\n", rc); + goto err_mmio_read_less; + } + + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(dma_width)); + if (rc) { + dev_err(dev, "err_pci_set_consistent_dma_mask failed %d\n", + rc); + goto err_mmio_read_less; + } +#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) */ + + + /* ENA admin level init */ + rc = ena_com_admin_init(ena_dev, &aenq_handlers); + if (unlikely(rc)) { + dev_err(dev, + "Can not initialize ena admin queue with device\n"); + goto err_mmio_read_less; + } + + /* To enable the msix interrupts the driver needs to know the number + * of queues. 
So the driver uses polling mode to retrieve this + * information + */ + ena_com_set_admin_polling_mode(ena_dev, true); + + ena_config_host_info(ena_dev, pdev); + + /* Get Device Attributes*/ + rc = ena_com_get_dev_attr_feat(ena_dev, get_feat_ctx); + if (rc) { + dev_err(dev, "Cannot get attribute for ena device rc=%d\n", rc); + goto err_admin_init; + } + + /* Try to turn all the available aenq groups */ + aenq_groups = BIT(ENA_ADMIN_LINK_CHANGE) | + BIT(ENA_ADMIN_FATAL_ERROR) | + BIT(ENA_ADMIN_WARNING) | + BIT(ENA_ADMIN_NOTIFICATION) | + BIT(ENA_ADMIN_KEEP_ALIVE) | + BIT(ENA_ADMIN_CONF_NOTIFICATIONS) | + BIT(ENA_ADMIN_DEVICE_REQUEST_RESET); + + aenq_groups &= get_feat_ctx->aenq.supported_groups; + + rc = ena_com_set_aenq_config(ena_dev, aenq_groups); + if (rc) { + dev_err(dev, "Cannot configure aenq groups rc= %d\n", rc); + goto err_admin_init; + } + + *wd_state = !!(aenq_groups & BIT(ENA_ADMIN_KEEP_ALIVE)); + + rc = ena_set_llq_configurations(adapter, &llq_config, &get_feat_ctx->llq); + if (rc) { + netdev_err(netdev, "Cannot set llq configuration rc= %d\n", rc); + goto err_admin_init; + } + + rc = ena_set_queues_placement_policy(pdev, ena_dev, &get_feat_ctx->llq, + &llq_config); + if (rc) { + netdev_err(netdev, "Cannot set queues placement policy rc= %d\n", rc); + goto err_admin_init; + } + + rc = ena_calc_io_queue_size(adapter, get_feat_ctx); + if (unlikely(rc)) + goto err_admin_init; + + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + dev_info(&pdev->dev, "ENA Large LLQ is %s\n", + adapter->llq_policy == ENA_LLQ_HEADER_SIZE_POLICY_LARGE ? + "enabled" : "disabled"); + + /* Turned on features shouldn't change due to reset. */ + prev_netdev_features = adapter->netdev->features; + ena_set_dev_offloads(get_feat_ctx, adapter->netdev); + adapter->netdev->features = prev_netdev_features; + + rc = ena_phc_init(adapter); + if (unlikely(rc && (rc != -EOPNOTSUPP))) { + netdev_err(netdev, "Failed initiating PHC, error: %d\n", rc); + goto err_admin_init; + } + + return 0; + +err_admin_init: + ena_com_abort_admin_commands(ena_dev); + ena_com_wait_for_abort_completion(ena_dev); + ena_com_delete_host_info(ena_dev); + ena_com_admin_destroy(ena_dev); +err_mmio_read_less: + ena_com_mmio_reg_read_request_destroy(ena_dev); + + return rc; +} + +static int ena_enable_msix_and_set_admin_interrupts(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct device *dev = &adapter->pdev->dev; + int rc; + + rc = ena_enable_msix(adapter); + if (rc) { + dev_err(dev, "Can not reserve msix vectors\n"); + return rc; + } + + ena_setup_mgmnt_intr(adapter); + + rc = ena_request_mgmnt_irq(adapter); + if (rc) { + dev_err(dev, "Can not setup management interrupts\n"); + goto err_disable_msix; + } + + ena_com_set_admin_polling_mode(ena_dev, false); + + ena_com_admin_aenq_enable(ena_dev); + + return 0; + +err_disable_msix: + ena_disable_msix(adapter); + + return rc; +} + +int ena_destroy_device(struct ena_adapter *adapter, bool graceful) +{ + struct net_device *netdev = adapter->netdev; + struct ena_com_dev *ena_dev = adapter->ena_dev; + bool dev_up; + int rc = 0; + + if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) + return 0; + + netif_carrier_off(netdev); + + del_timer_sync(&adapter->timer_service); + + dev_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + adapter->dev_up_before_reset = dev_up; + if (!graceful) + ena_com_set_admin_running_state(ena_dev, false); + + if (dev_up) + ena_down(adapter); + + /* Stop the device from sending AENQ events (in case reset flag 
is set + * and device is up, ena_down() already reset the device. + */ + if (!(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags) && dev_up)) + rc = ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason); + + ena_free_mgmnt_irq(adapter); + + ena_disable_msix(adapter); + + ena_com_abort_admin_commands(ena_dev); + + ena_com_wait_for_abort_completion(ena_dev); + + ena_com_admin_destroy(ena_dev); + + ena_phc_destroy(adapter); + + ena_com_mmio_reg_read_request_destroy(ena_dev); + + /* return reset reason to default value */ + adapter->reset_reason = ENA_REGS_RESET_NORMAL; + + clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + clear_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + + return rc; +} + +int ena_restore_device(struct ena_adapter *adapter) +{ + struct ena_com_dev_get_features_ctx get_feat_ctx; + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct pci_dev *pdev = adapter->pdev; + struct ena_ring *txr; + int rc, count, i; + bool wd_state; + + set_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); + rc = ena_device_init(adapter, adapter->pdev, &get_feat_ctx, &wd_state); + if (rc) { + dev_err(&pdev->dev, "Can not initialize device\n"); + goto err; + } + adapter->wd_state = wd_state; + + count = adapter->xdp_num_queues + adapter->num_io_queues; + for (i = 0 ; i < count; i++) { + txr = &adapter->tx_ring[i]; + txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type; + txr->tx_max_header_size = ena_dev->tx_max_header_size; + } + + rc = ena_device_validate_params(adapter, &get_feat_ctx); + if (rc) { + dev_err(&pdev->dev, "Validation of device parameters failed\n"); + goto err_device_destroy; + } + + rc = ena_enable_msix_and_set_admin_interrupts(adapter); + if (rc) { + dev_err(&pdev->dev, "Enable MSI-X failed\n"); + goto err_device_destroy; + } + + /* If the interface was up before the reset bring it up */ + if (adapter->dev_up_before_reset) { + rc = ena_up(adapter); + if (rc) { + dev_err(&pdev->dev, "Failed to create I/O queues\n"); + goto err_disable_msix; + } + } + + set_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + + clear_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); + if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags)) + netif_carrier_on(adapter->netdev); + + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); + adapter->last_keep_alive_jiffies = jiffies; + + return rc; +err_disable_msix: + ena_free_mgmnt_irq(adapter); + ena_disable_msix(adapter); +err_device_destroy: + ena_com_abort_admin_commands(ena_dev); + ena_com_wait_for_abort_completion(ena_dev); + ena_com_admin_destroy(ena_dev); + ena_com_dev_reset(ena_dev, ENA_REGS_RESET_DRIVER_INVALID_STATE); + ena_phc_destroy(adapter); + ena_com_mmio_reg_read_request_destroy(ena_dev); +err: + clear_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + clear_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); + dev_err(&pdev->dev, + "Reset attempt failed. 
Can not reset the device\n"); + + return rc; +} + +static void ena_fw_reset_device(struct work_struct *work) +{ + int rc = 0; + + struct ena_adapter *adapter = + container_of(work, struct ena_adapter, reset_task); + + rtnl_lock(); + + if (likely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + rc |= ena_destroy_device(adapter, false); + rc |= ena_restore_device(adapter); + adapter->dev_stats.reset_fail += !!rc; + + dev_err(&adapter->pdev->dev, + "Device reset completed successfully, Driver info: %s\n", + version); + } + + rtnl_unlock(); +} + +static int check_for_rx_interrupt_queue(struct ena_adapter *adapter, + struct ena_ring *rx_ring) +{ + struct ena_napi *ena_napi = container_of(rx_ring->napi, struct ena_napi, napi); + + if (likely(READ_ONCE(ena_napi->last_intr_jiffies) != 0)) + return 0; + + if (ena_com_cq_empty(rx_ring->ena_com_io_cq)) + return 0; + + rx_ring->no_interrupt_event_cnt++; + + if (rx_ring->no_interrupt_event_cnt == ENA_MAX_NO_INTERRUPT_ITERATIONS) { + netif_err(adapter, rx_err, adapter->netdev, + "Potential MSIX issue on Rx side Queue = %d. Reset the device\n", + rx_ring->qid); + + ena_reset_device(adapter, ENA_REGS_RESET_MISS_INTERRUPT); + return -EIO; + } + + return 0; +} + +enum ena_regs_reset_reason_types check_cdesc_in_tx_cq(struct ena_adapter *adapter, + struct ena_ring *tx_ring) +{ + struct net_device *netdev = adapter->netdev; + u16 req_id; + int rc; + + rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, &req_id); + + /* TX CQ is empty */ + if (rc == -EAGAIN) { + netif_err(adapter, tx_err, netdev, "No completion descriptors found in CQ %d", + tx_ring->qid); + + return ENA_REGS_RESET_MISS_TX_CMPL; + } + + /* TX CQ has cdescs */ + netif_err(adapter, tx_err, netdev, + "Completion descriptors found in CQ %d", tx_ring->qid); + + return ENA_REGS_RESET_MISS_INTERRUPT; +} + +static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, struct ena_ring *tx_ring) +{ + unsigned long miss_tx_comp_to_jiffies = adapter->missing_tx_completion_to_jiffies; + struct ena_napi *ena_napi = container_of(tx_ring->napi, struct ena_napi, napi); + enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_MISS_TX_CMPL; + u32 missed_tx_thresh = adapter->missing_tx_completion_threshold; + struct net_device *netdev = adapter->netdev; + unsigned long jiffies_since_last_napi; + unsigned long jiffies_since_last_intr; + u32 missed_tx = 0, new_missed_tx = 0; + unsigned long graceful_timeout; + struct ena_tx_buffer *tx_buf; + unsigned long timeout; + int napi_scheduled; + bool is_expired; + int i, rc = 0; + + for (i = 0; i < tx_ring->ring_size; i++) { + tx_buf = &tx_ring->tx_buffer_info[i]; + if (tx_buf->tx_sent_jiffies == 0) + /* No pending Tx at this location */ + continue; + + timeout = tx_buf->tx_sent_jiffies + miss_tx_comp_to_jiffies; + graceful_timeout = timeout + miss_tx_comp_to_jiffies; + + /* Checking if current TX ring didn't get first interrupt */ + is_expired = time_is_before_jiffies(graceful_timeout); + if (unlikely(READ_ONCE(ena_napi->last_intr_jiffies) == 0 && is_expired)) { + /* If first interrupt is still not received, schedule a reset */ + netif_err(adapter, tx_err, netdev, + "Potential MSIX issue on Tx side Queue = %d. 
Reset the device\n", + tx_ring->qid); + ena_reset_device(adapter, ENA_REGS_RESET_MISS_INTERRUPT); + return -EIO; + } + + /* Checking if current TX buffer got timeout */ + is_expired = time_is_before_jiffies(timeout); + if (unlikely(is_expired)) { + /* Checking if current TX ring got NAPI timeout */ + unsigned long last_napi = READ_ONCE(tx_ring->tx_stats.last_napi_jiffies); + + jiffies_since_last_napi = jiffies - last_napi; + jiffies_since_last_intr = jiffies - READ_ONCE(ena_napi->last_intr_jiffies); + napi_scheduled = !!(READ_ONCE(ena_napi->napi.state) & NAPIF_STATE_SCHED); + if (jiffies_since_last_napi > miss_tx_comp_to_jiffies && napi_scheduled) { + /* We suspect napi isn't called because the bottom half is not run. + * Require a bigger timeout for these cases. + */ + if (time_is_after_jiffies(graceful_timeout)) + continue; + + reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; + } + + missed_tx++; + + if (tx_buf->print_once) + continue; + + /* Add new TX completions which are missed */ + new_missed_tx++; + + netif_notice(adapter, tx_err, netdev, + "TX hasn't completed, qid %d, index %d. %u msecs since last interrupt, %u msecs since last napi execution, napi scheduled: %d\n", + tx_ring->qid, i, jiffies_to_msecs(jiffies_since_last_intr), + jiffies_to_msecs(jiffies_since_last_napi), napi_scheduled); + + tx_buf->print_once = 1; + } + } + + /* Checking if this TX ring missing TX completions have passed the threshold */ + if (unlikely(missed_tx > missed_tx_thresh)) { + jiffies_since_last_intr = jiffies - READ_ONCE(ena_napi->last_intr_jiffies); + jiffies_since_last_napi = jiffies - READ_ONCE(tx_ring->tx_stats.last_napi_jiffies); + netif_err(adapter, tx_err, netdev, + "Lost TX completions are above the threshold (%d > %d). Completion transmission timeout: %u (msec). 
%u msecs since last interrupt, %u msecs since last napi execution.\n", + missed_tx, + missed_tx_thresh, + jiffies_to_msecs(miss_tx_comp_to_jiffies), + jiffies_to_msecs(jiffies_since_last_intr), + jiffies_to_msecs(jiffies_since_last_napi)); + netif_err(adapter, tx_err, netdev, "Resetting the device\n"); + /* Set the reset flag to prevent NAPI from running */ + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + /* Need to make sure that reset reason is visible to ena_io_poll to prevent it + * from accessing CQ concurrently with check_cdesc_in_tx_cq() + */ + smp_mb(); + napi_scheduled = !!(READ_ONCE(ena_napi->napi.state) & NAPIF_STATE_SCHED); + if (!napi_scheduled) + reset_reason = check_cdesc_in_tx_cq(adapter, tx_ring); + /* Update reset reason */ + ena_reset_device(adapter, reset_reason); + rc = -EIO; + } + + /* Add the newly discovered missing TX completions */ + ena_increase_stat(&tx_ring->tx_stats.missed_tx, new_missed_tx, &tx_ring->syncp); + + return rc; +} + +static void check_for_missing_completions(struct ena_adapter *adapter) +{ + struct ena_ring *tx_ring; + struct ena_ring *rx_ring; + int qid, budget, rc; + int io_queue_count; + + io_queue_count = adapter->xdp_num_queues + adapter->num_io_queues; + + /* Make sure the driver doesn't turn the device in other process */ + smp_rmb(); + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return; + + if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + if (adapter->missing_tx_completion_to_jiffies == ENA_HW_HINTS_NO_TIMEOUT) + return; + + budget = min_t(u32, io_queue_count, ENA_MONITORED_TX_QUEUES); + + qid = adapter->last_monitored_tx_qid; + + while (budget) { + qid = (qid + 1) % io_queue_count; + + tx_ring = &adapter->tx_ring[qid]; + rx_ring = &adapter->rx_ring[qid]; + + rc = check_missing_comp_in_tx_queue(adapter, tx_ring); + if (unlikely(rc)) + return; + + rc = !ENA_IS_XDP_INDEX(adapter, qid) ? + check_for_rx_interrupt_queue(adapter, rx_ring) : 0; + if (unlikely(rc)) + return; + + budget--; + } + + adapter->last_monitored_tx_qid = qid; +} + +/* trigger napi schedule after 2 consecutive detections */ +#define EMPTY_RX_REFILL 2 +/* For the rare case where the device runs out of Rx descriptors and the + * napi handler failed to refill new Rx descriptors (due to a lack of memory + * for example). + * This case will lead to a deadlock: + * The device won't send interrupts since all the new Rx packets will be dropped + * The napi handler won't allocate new Rx descriptors so the device will be + * able to send new packets. + * + * This scenario can happen when the kernel's vm.min_free_kbytes is too small. + * It is recommended to have at least 512MB, with a minimum of 128MB for + * constrained environment). 
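+ * (For example, vm.min_free_kbytes can be raised at runtime with
+ * "sysctl -w vm.min_free_kbytes=131072" for a 128MB reserve.)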
+ * + * When such a situation is detected - Reschedule napi + */ +static void check_for_empty_rx_ring(struct ena_adapter *adapter) +{ + struct ena_ring *rx_ring; + int i, refill_required; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return; + + if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + for (i = 0; i < adapter->num_io_queues; i++) { + rx_ring = &adapter->rx_ring[i]; +#ifdef ENA_AF_XDP_SUPPORT + + /* If using UMEM, app might not provide RX buffers and the ring + * can be empty + */ + if (ENA_IS_XSK_RING(rx_ring)) + continue; +#endif /* ENA_AF_XDP_SUPPORT */ + + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); + if (unlikely(refill_required == (rx_ring->ring_size - 1))) { + rx_ring->empty_rx_queue++; + + if (rx_ring->empty_rx_queue >= EMPTY_RX_REFILL) { + ena_increase_stat(&rx_ring->rx_stats.empty_rx_ring, 1, + &rx_ring->syncp); + + netif_err(adapter, drv, adapter->netdev, + "Trigger refill for ring %d\n", i); + + napi_schedule(rx_ring->napi); + rx_ring->empty_rx_queue = 0; + } + } else { + rx_ring->empty_rx_queue = 0; + } + } +} + +/* Check for keep alive expiration */ +static void check_for_missing_keep_alive(struct ena_adapter *adapter) +{ + enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_KEEP_ALIVE_TO; + unsigned long keep_alive_expired; + + if (!adapter->wd_state) + return; + + if (adapter->keep_alive_timeout == ENA_HW_HINTS_NO_TIMEOUT) + return; + + keep_alive_expired = adapter->last_keep_alive_jiffies + + adapter->keep_alive_timeout; + if (unlikely(time_is_before_jiffies(keep_alive_expired))) { + unsigned long jiffies_since_last_keep_alive = + jiffies - adapter->last_keep_alive_jiffies; + netif_err(adapter, drv, adapter->netdev, + "Keep alive watchdog timeout, %u msecs since last keep alive.\n", + jiffies_to_msecs(jiffies_since_last_keep_alive)); + if (ena_com_aenq_has_keep_alive(adapter->ena_dev)) + reset_reason = ENA_REGS_RESET_MISSING_ADMIN_INTERRUPT; + + ena_reset_device(adapter, reset_reason); + } +} + +static void check_for_admin_com_state(struct ena_adapter *adapter) +{ + if (unlikely(!ena_com_get_admin_running_state(adapter->ena_dev))) { + netif_err(adapter, drv, adapter->netdev, + "ENA admin queue is not in running state!\n"); + ena_increase_stat(&adapter->dev_stats.admin_q_pause, 1, + &adapter->syncp); + if (ena_com_get_missing_admin_interrupt(adapter->ena_dev)) + ena_reset_device(adapter, ENA_REGS_RESET_MISSING_ADMIN_INTERRUPT); + else + ena_reset_device(adapter, ENA_REGS_RESET_ADMIN_TO); + } +} + +static void ena_update_hints(struct ena_adapter *adapter, + struct ena_admin_ena_hw_hints *hints) +{ + struct net_device *netdev = adapter->netdev; + + if (hints->admin_completion_tx_timeout) + adapter->ena_dev->admin_queue.completion_timeout = + hints->admin_completion_tx_timeout * 1000; + + if (hints->mmio_read_timeout) + /* convert to usec */ + adapter->ena_dev->mmio_read.reg_read_to = + hints->mmio_read_timeout * 1000; + + if (hints->missed_tx_completion_count_threshold_to_reset) + adapter->missing_tx_completion_threshold = + hints->missed_tx_completion_count_threshold_to_reset; + + if (hints->missing_tx_completion_timeout) { + if (hints->missing_tx_completion_timeout == ENA_HW_HINTS_NO_TIMEOUT) + adapter->missing_tx_completion_to_jiffies = ENA_HW_HINTS_NO_TIMEOUT; + else + adapter->missing_tx_completion_to_jiffies = + msecs_to_jiffies(hints->missing_tx_completion_timeout); + } + + if (hints->netdev_wd_timeout) + netdev->watchdog_timeo = msecs_to_jiffies(hints->netdev_wd_timeout); + + if 
(hints->driver_watchdog_timeout) { + if (hints->driver_watchdog_timeout == ENA_HW_HINTS_NO_TIMEOUT) + adapter->keep_alive_timeout = ENA_HW_HINTS_NO_TIMEOUT; + else + adapter->keep_alive_timeout = + msecs_to_jiffies(hints->driver_watchdog_timeout); + } +} + +static void ena_update_host_info(struct ena_admin_host_info *host_info, + struct net_device *netdev) +{ + host_info->supported_network_features[0] = + netdev->features & GENMASK_ULL(31, 0); + host_info->supported_network_features[1] = + (netdev->features & GENMASK_ULL(63, 32)) >> 32; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) +static void ena_timer_service(struct timer_list *t) +{ + struct ena_adapter *adapter = from_timer(adapter, t, timer_service); +#else +static void ena_timer_service(unsigned long data) +{ + struct ena_adapter *adapter = (struct ena_adapter *)data; +#endif + u8 *debug_area = adapter->ena_dev->host_attr.debug_area_virt_addr; + struct ena_admin_host_info *host_info = + adapter->ena_dev->host_attr.host_info; + + check_for_missing_keep_alive(adapter); + + check_for_admin_com_state(adapter); + + check_for_missing_completions(adapter); + + check_for_empty_rx_ring(adapter); + + if (debug_area) + ena_dump_stats_to_buf(adapter, debug_area); + + if (host_info) + ena_update_host_info(host_info, adapter->netdev); + + if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + /* We don't destroy driver resources if we're not able to + * communicate with the device. Failure in validating the + * version implies unresponsive device. + */ + if (ena_com_validate_version(adapter->ena_dev) == -ETIME) { + netif_err(adapter, drv, adapter->netdev, + "FW isn't responsive, skipping reset routine\n"); + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); + return; + } + + netif_err(adapter, drv, adapter->netdev, + "Trigger reset is on\n"); + + if (adapter->reset_reason != ENA_REGS_RESET_NORMAL) + ena_dump_stats_to_dmesg(adapter); + + queue_work(ena_wq, &adapter->reset_task); + return; + } + + /* Reset the timer */ + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); +} + +static u32 ena_calc_max_io_queue_num(struct pci_dev *pdev, + struct ena_com_dev *ena_dev, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + u32 io_tx_sq_num, io_tx_cq_num, io_rx_num, max_num_io_queues; + + if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { + struct ena_admin_queue_ext_feature_fields *max_queue_ext = + &get_feat_ctx->max_queue_ext.max_queue_ext; + io_rx_num = min_t(u32, max_queue_ext->max_rx_sq_num, + max_queue_ext->max_rx_cq_num); + + io_tx_sq_num = max_queue_ext->max_tx_sq_num; + io_tx_cq_num = max_queue_ext->max_tx_cq_num; + } else { + struct ena_admin_queue_feature_desc *max_queues = + &get_feat_ctx->max_queues; + io_tx_sq_num = max_queues->max_sq_num; + io_tx_cq_num = max_queues->max_cq_num; + io_rx_num = min_t(u32, io_tx_sq_num, io_tx_cq_num); + } + + /* In case of LLQ use the llq fields for the tx SQ/CQ */ + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + io_tx_sq_num = get_feat_ctx->llq.max_llq_num; + + max_num_io_queues = min_t(u32, num_online_cpus(), ENA_MAX_NUM_IO_QUEUES); + max_num_io_queues = min_t(u32, max_num_io_queues, io_rx_num); + max_num_io_queues = min_t(u32, max_num_io_queues, io_tx_sq_num); + max_num_io_queues = min_t(u32, max_num_io_queues, io_tx_cq_num); + /* 1 IRQ for mgmnt and 1 IRQs for each IO direction */ + max_num_io_queues = min_t(u32, max_num_io_queues, pci_msix_vec_count(pdev) - 1); + + return max_num_io_queues; +} + +static void 
ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat, + struct net_device *netdev) +{ + netdev_features_t dev_features = 0; + + /* Set offload features */ + if (feat->offload.tx & + ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_MASK) + dev_features |= NETIF_F_IP_CSUM; + + if (feat->offload.tx & + ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_MASK) + dev_features |= NETIF_F_IPV6_CSUM; + + if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_MASK) + dev_features |= NETIF_F_TSO; + + if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_MASK) + dev_features |= NETIF_F_TSO6; + + if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_MASK) + dev_features |= NETIF_F_TSO_ECN; + + if (feat->offload.rx_supported & + ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_MASK) + dev_features |= NETIF_F_RXCSUM; + + if (feat->offload.rx_supported & + ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_MASK) + dev_features |= NETIF_F_RXCSUM; + + netdev->features = + dev_features | + NETIF_F_SG | +#ifdef NETIF_F_RXHASH + NETIF_F_RXHASH | +#endif /* NETIF_F_RXHASH */ + NETIF_F_HIGHDMA; + +#ifdef HAVE_RHEL6_NET_DEVICE_OPS_EXT + do { + u32 hw_features = get_netdev_hw_features(netdev); + hw_features |= netdev->features; + set_netdev_hw_features(netdev, hw_features); + } while (0); +#else + netdev->hw_features |= netdev->features; +#endif + netdev->vlan_features |= netdev->features; +} + +static void ena_set_conf_feat_params(struct ena_adapter *adapter, + struct ena_com_dev_get_features_ctx *feat) +{ + struct net_device *netdev = adapter->netdev; + + /* Copy mac address */ + if (!is_valid_ether_addr(feat->dev_attr.mac_addr)) { + eth_hw_addr_random(netdev); + ether_addr_copy(adapter->mac_addr, netdev->dev_addr); + } else { + ether_addr_copy(adapter->mac_addr, feat->dev_attr.mac_addr); + eth_hw_addr_set(netdev, adapter->mac_addr); + } + + /* Set offload features */ + ena_set_dev_offloads(feat, netdev); + + adapter->max_mtu = feat->dev_attr.max_mtu; +#ifdef HAVE_MTU_MIN_MAX_IN_NET_DEVICE + netdev->max_mtu = adapter->max_mtu; + netdev->min_mtu = ENA_MIN_MTU; +#endif +} + +static int ena_rss_init_default(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct device *dev = &adapter->pdev->dev; + int rc, i; + u32 val; + + rc = ena_com_rss_init(ena_dev, ENA_RX_RSS_TABLE_LOG_SIZE); + if (unlikely(rc)) { + dev_err(dev, "Cannot init indirect table\n"); + goto err_rss_init; + } + + for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) { + val = ethtool_rxfh_indir_default(i, adapter->num_io_queues); + rc = ena_com_indirect_table_fill_entry(ena_dev, i, + ENA_IO_RXQ_IDX(val)); + if (unlikely(rc)) { + dev_err(dev, "Cannot fill indirect table\n"); + goto err_fill_indir; + } + } + + rc = ena_com_fill_hash_function(ena_dev, ENA_ADMIN_TOEPLITZ, NULL, ENA_HASH_KEY_SIZE, + 0xFFFFFFFF); + if (unlikely(rc && (rc != -EOPNOTSUPP))) { + dev_err(dev, "Cannot fill hash function\n"); + goto err_fill_indir; + } + + rc = ena_com_set_default_hash_ctrl(ena_dev); + if (unlikely(rc && (rc != -EOPNOTSUPP))) { + dev_err(dev, "Cannot fill hash control\n"); + goto err_fill_indir; + } + + return 0; + +err_fill_indir: + ena_com_rss_destroy(ena_dev); +err_rss_init: + + return rc; +} + +static void ena_release_bars(struct ena_com_dev *ena_dev, struct pci_dev *pdev) +{ + int release_bars = pci_select_bars(pdev, IORESOURCE_MEM) & ENA_BAR_MASK; + + pci_release_selected_regions(pdev, release_bars); +} + +/* ena_probe - Device Initialization Routine + * @pdev: PCI device information struct + * 
@ent: entry in ena_pci_tbl + * + * Returns 0 on success, negative on failure + * + * ena_probe initializes an adapter identified by a pci_dev structure. + * The OS initialization, configuring of the adapter private structure, + * and a hardware reset occur. + */ +static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct ena_com_dev_get_features_ctx get_feat_ctx; + struct ena_com_dev *ena_dev = NULL; + struct ena_adapter *adapter; + struct net_device *netdev; + static int adapters_found; + u32 max_num_io_queues; + bool wd_state; + int bars, rc; + + dev_dbg(&pdev->dev, "%s\n", __func__); + + dev_info_once(&pdev->dev, "%s", version); + + rc = pci_enable_device_mem(pdev); + if (rc) { + dev_err(&pdev->dev, "pci_enable_device_mem() failed!\n"); + return rc; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(ENA_MAX_PHYS_ADDR_SIZE_BITS)); + if (rc) { + dev_err(&pdev->dev, "dma_set_mask_and_coherent failed %d\n", rc); + goto err_disable_device; + } +#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) */ + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(ENA_MAX_PHYS_ADDR_SIZE_BITS)); + if (rc) { + dev_err(&pdev->dev, "pci_set_dma_mask failed %d\n", rc); + goto err_disable_device; + } + + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(ENA_MAX_PHYS_ADDR_SIZE_BITS)); + if (rc) { + dev_err(&pdev->dev, "err_pci_set_consistent_dma_mask failed %d\n", + rc); + goto err_disable_device; + } +#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) */ + + pci_set_master(pdev); + + ena_dev = vzalloc(sizeof(*ena_dev)); + if (!ena_dev) { + rc = -ENOMEM; + goto err_disable_device; + } + + bars = pci_select_bars(pdev, IORESOURCE_MEM) & ENA_BAR_MASK; + rc = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (rc) { + dev_err(&pdev->dev, "pci_request_selected_regions failed %d\n", + rc); + goto err_free_ena_dev; + } + + ena_dev->reg_bar = devm_ioremap(&pdev->dev, + pci_resource_start(pdev, ENA_REG_BAR), + pci_resource_len(pdev, ENA_REG_BAR)); + if (!ena_dev->reg_bar) { + dev_err(&pdev->dev, "Failed to remap regs bar\n"); + rc = -EFAULT; + goto err_free_region; + } + + ena_dev->ena_min_poll_delay_us = ENA_ADMIN_POLL_DELAY_US; + + ena_dev->dmadev = &pdev->dev; + + netdev = alloc_etherdev_mq(sizeof(struct ena_adapter), ENA_MAX_RINGS); + if (!netdev) { + dev_err(&pdev->dev, "alloc_etherdev_mq failed\n"); + rc = -ENOMEM; + goto err_free_region; + } + + SET_NETDEV_DEV(netdev, &pdev->dev); + adapter = netdev_priv(netdev); + adapter->ena_dev = ena_dev; + adapter->netdev = netdev; + adapter->pdev = pdev; + adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE); + + ena_dev->net_device = netdev; + + pci_set_drvdata(pdev, adapter); + + rc = ena_phc_alloc(adapter); + if (rc) { + netdev_err(netdev, "ena_phc_alloc failed\n"); + goto err_netdev_destroy; + } + + adapter->llq_policy = ENA_LLQ_HEADER_SIZE_POLICY_UNSPECIFIED; + ena_set_forced_llq_size_policy(adapter); + +#ifdef ENA_PHC_SUPPORT + ena_phc_enable(adapter, !!phc_enable); + +#endif /* ENA_PHC_SUPPORT */ + rc = ena_com_allocate_customer_metrics_buffer(ena_dev); + if (unlikely(rc)) { + netdev_err(netdev, "ena_com_allocate_customer_metrics_buffer failed\n"); + goto err_free_phc; + } + + rc = ena_map_llq_mem_bar(pdev, ena_dev, bars); + if (rc) { + dev_err(&pdev->dev, "ENA LLQ bar mapping failed\n"); + goto err_metrics_destroy; + } + + rc = ena_device_init(adapter, pdev, &get_feat_ctx, &wd_state); + if (rc) { + dev_err(&pdev->dev, "ENA device init failed\n"); + 
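/* A timeout (-ETIME) from device init usually means the device/FW is not responding yet; converting it to -EPROBE_DEFER below asks the driver core to retry the probe later. */ +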
if (rc == -ETIME) + rc = -EPROBE_DEFER; + goto err_metrics_destroy; + } + + /* Initial TX and RX interrupt delay. Assumes 1 usec granularity. + * Updated during device initialization with the real granularity + */ + ena_dev->intr_moder_tx_interval = ENA_INTR_INITIAL_TX_INTERVAL_USECS; + ena_dev->intr_moder_rx_interval = ENA_INTR_INITIAL_RX_INTERVAL_USECS; + ena_dev->intr_delay_resolution = ENA_DEFAULT_INTR_DELAY_RESOLUTION; + max_num_io_queues = ena_calc_max_io_queue_num(pdev, ena_dev, &get_feat_ctx); + if (unlikely(!max_num_io_queues)) { + rc = -EFAULT; + goto err_device_destroy; + } + + ena_set_conf_feat_params(adapter, &get_feat_ctx); + + adapter->reset_reason = ENA_REGS_RESET_NORMAL; + + adapter->num_io_queues = clamp_val(num_io_queues, ENA_MIN_NUM_IO_QUEUES, + max_num_io_queues); + adapter->used_lpc_size = lpc_size; + /* When LPC is enabled after driver load, the configured_lpc_size is + * used. Leaving it as 0, wouldn't change LPC state so we set it to + * different value + */ + adapter->configured_lpc_size = lpc_size ? : ENA_LPC_DEFAULT_MULTIPLIER; + adapter->max_num_io_queues = max_num_io_queues; + adapter->last_monitored_tx_qid = 0; + + adapter->xdp_first_ring = 0; + adapter->xdp_num_queues = 0; + + adapter->rx_copybreak = ENA_DEFAULT_RX_COPYBREAK; + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + adapter->disable_meta_caching = + !!(get_feat_ctx.llq.accel_mode.u.get.supported_flags & + BIT(ENA_ADMIN_DISABLE_META_CACHING)); + + adapter->wd_state = wd_state; + + snprintf(adapter->name, ENA_NAME_MAX_LEN, "ena_%d", adapters_found); + + rc = ena_com_init_interrupt_moderation(adapter->ena_dev); + if (rc) { + dev_err(&pdev->dev, + "Failed to query interrupt moderation feature\n"); + goto err_device_destroy; + } + + ena_init_io_rings(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); + + netdev->netdev_ops = &ena_netdev_ops; + netdev->watchdog_timeo = TX_TIMEOUT; + ena_set_ethtool_ops(netdev); + +#if defined(NETIF_F_MQ_TX_LOCK_OPT) + netdev->features &= ~NETIF_F_MQ_TX_LOCK_OPT; +#endif /* defined(NETIF_F_MQ_TX_LOCK_OPT) */ +#ifdef IFF_UNICAST_FLT + netdev->priv_flags |= IFF_UNICAST_FLT; +#endif /* IFF_UNICAST_FLT */ + + u64_stats_init(&adapter->syncp); + + rc = ena_enable_msix_and_set_admin_interrupts(adapter); + if (rc) { + dev_err(&pdev->dev, + "Failed to enable and set the admin interrupts\n"); + goto err_worker_destroy; + } + rc = ena_sysfs_init(&adapter->pdev->dev); + if (rc) { + dev_err(&pdev->dev, "Cannot init sysfs\n"); + goto err_free_msix; + } + rc = ena_rss_init_default(adapter); + if (unlikely(rc && (rc != -EOPNOTSUPP))) { + dev_err(&pdev->dev, "Cannot init RSS rc: %d\n", rc); + goto err_terminate_sysfs; + } + + ena_config_debug_area(adapter); + +#ifdef ENA_XDP_NETLINK_ADVERTISEMENT + if (ena_xdp_legal_queue_count(adapter, adapter->num_io_queues)) + netdev->xdp_features = ENA_XDP_FEATURES; +#endif + memcpy(adapter->netdev->perm_addr, adapter->mac_addr, netdev->addr_len); + + netif_carrier_off(netdev); + + rc = register_netdev(netdev); + if (rc) { + dev_err(&pdev->dev, "Cannot register net device\n"); + goto err_rss; + } + + INIT_WORK(&adapter->reset_task, ena_fw_reset_device); + + adapter->last_keep_alive_jiffies = jiffies; + adapter->keep_alive_timeout = ENA_DEVICE_KALIVE_TIMEOUT; + adapter->missing_tx_completion_to_jiffies = TX_TIMEOUT; + adapter->missing_tx_completion_threshold = MAX_NUM_OF_TIMEOUTED_PACKETS; + + ena_update_hints(adapter, &get_feat_ctx.hw_hints); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) + 
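/* timer_setup() and from_timer() were introduced in kernel 4.14; older kernels use setup_timer() with an (unsigned long) data argument instead. */ +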
timer_setup(&adapter->timer_service, ena_timer_service, 0); +#else + setup_timer(&adapter->timer_service, ena_timer_service, + (unsigned long)adapter); +#endif + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); + + dev_info(&pdev->dev, + "%s found at mem %lx, mac addr %pM\n", + DEVICE_NAME, (long)pci_resource_start(pdev, 0), + netdev->dev_addr); + + set_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + + adapters_found++; + + return 0; + +err_rss: + ena_com_delete_debug_area(ena_dev); + ena_com_rss_destroy(ena_dev); +err_terminate_sysfs: + ena_sysfs_terminate(&pdev->dev); +err_free_msix: + ena_com_dev_reset(ena_dev, ENA_REGS_RESET_INIT_ERR); + /* stop submitting admin commands on a device that was reset */ + ena_com_set_admin_running_state(ena_dev, false); + ena_free_mgmnt_irq(adapter); + ena_disable_msix(adapter); +err_worker_destroy: + del_timer(&adapter->timer_service); +err_device_destroy: + ena_com_delete_host_info(ena_dev); + ena_com_admin_destroy(ena_dev); +err_metrics_destroy: + ena_com_delete_customer_metrics_buffer(ena_dev); +err_free_phc: + ena_phc_free(adapter); +err_netdev_destroy: + free_netdev(netdev); +err_free_region: + ena_release_bars(ena_dev, pdev); +err_free_ena_dev: + vfree(ena_dev); +err_disable_device: + pci_disable_device(pdev); + return rc; +} + +/*****************************************************************************/ + +/* __ena_shutoff - Helper used in both PCI remove/shutdown routines + * @pdev: PCI device information struct + * @shutdown: Is it a shutdown operation? If false, it is a removal + * + * __ena_shutoff is a helper routine that does the real work on shutdown and + * removal paths; the difference between those paths is whether to + * detach or to unregister the netdevice. + */ +static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) +{ + struct ena_adapter *adapter = pci_get_drvdata(pdev); + struct ena_com_dev *ena_dev; + struct net_device *netdev; + + ena_dev = adapter->ena_dev; + netdev = adapter->netdev; + +#ifdef CONFIG_RFS_ACCEL + if ((adapter->msix_vecs >= 1) && (netdev->rx_cpu_rmap)) { + free_irq_cpu_rmap(netdev->rx_cpu_rmap); + netdev->rx_cpu_rmap = NULL; + } + +#endif /* CONFIG_RFS_ACCEL */ + ena_sysfs_terminate(&adapter->pdev->dev); + /* Make sure timer and reset routine won't be called after + * freeing device resources. + */ + del_timer_sync(&adapter->timer_service); + cancel_work_sync(&adapter->reset_task); + + rtnl_lock(); /* lock released inside the below if-else block */ + adapter->reset_reason = ENA_REGS_RESET_SHUTDOWN; + ena_destroy_device(adapter, true); + + ena_phc_free(adapter); + + if (shutdown) { + netif_device_detach(netdev); + dev_close(netdev); + rtnl_unlock(); + } else { + rtnl_unlock(); + unregister_netdev(netdev); + free_netdev(netdev); + } + + ena_com_rss_destroy(ena_dev); + + ena_com_delete_debug_area(ena_dev); + + ena_com_delete_host_info(ena_dev); + + ena_com_delete_customer_metrics_buffer(ena_dev); + + ena_release_bars(ena_dev, pdev); + + pci_disable_device(pdev); + + vfree(ena_dev); +} + +/* ena_remove - Device Removal Routine + * @pdev: PCI device information struct + * + * ena_remove is called by the PCI subsystem to alert the driver + * that it should release a PCI device.
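+ * The real work is done by __ena_shutoff() with shutdown == false, which also + * unregisters and frees the netdevice.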
+ */ +static void ena_remove(struct pci_dev *pdev) +{ + __ena_shutoff(pdev, false); +} + +/* ena_shutdown - Device Shutdown Routine + * @pdev: PCI device information struct + * + * ena_shutdown is called by the PCI subsystem to alert the driver that + * a shutdown/reboot (or kexec) is happening and device must be disabled. + */ +static void ena_shutdown(struct pci_dev *pdev) +{ + __ena_shutoff(pdev, true); +} + +#ifdef CONFIG_PM +#ifdef ENA_GENERIC_PM_OPS +/* ena_suspend - PM suspend callback + * @dev_d: Device information struct + */ +static int __maybe_unused ena_suspend(struct device *dev_d) +{ + struct pci_dev *pdev = to_pci_dev(dev_d); +#else /* ENA_GENERIC_PM_OPS */ +/* ena_suspend - PM suspend callback + * @pdev: PCI device information struct + * @state:power state + */ +static int ena_suspend(struct pci_dev *pdev, pm_message_t state) +{ +#endif /* ENA_GENERIC_PM_OPS */ + struct ena_adapter *adapter = pci_get_drvdata(pdev); + + ena_increase_stat(&adapter->dev_stats.suspend, 1, &adapter->syncp); + + rtnl_lock(); + if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + dev_err(&pdev->dev, + "Ignoring device reset request as the device is being suspended\n"); + clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + } + ena_destroy_device(adapter, true); + rtnl_unlock(); + return 0; +} + +#ifdef ENA_GENERIC_PM_OPS +/* ena_resume - PM resume callback + * @dev_d: Device information struct + */ +static int __maybe_unused ena_resume(struct device *dev_d) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev_d); +#else /* ENA_GENERIC_PM_OPS */ +/* ena_resume - PM resume callback + * @pdev: PCI device information struct + * + */ +static int ena_resume(struct pci_dev *pdev) +{ + struct ena_adapter *adapter = pci_get_drvdata(pdev); +#endif /* ENA_GENERIC_PM_OPS */ + int rc; + + ena_increase_stat(&adapter->dev_stats.resume, 1, &adapter->syncp); + + rtnl_lock(); +#if LINUX_VERSION_CODE < KERNEL_VERSION(5,5,0) + pci_set_power_state(pdev, PCI_D0); +#endif + rc = ena_restore_device(adapter); + rtnl_unlock(); + return rc; +} +#endif /* CONFIG_PM */ +#ifdef ENA_GENERIC_PM_OPS + +static SIMPLE_DEV_PM_OPS(ena_pm_ops, ena_suspend, ena_resume); +#endif /* ENA_GENERIC_PM_OPS */ + +static struct pci_driver ena_pci_driver = { + .name = DRV_MODULE_NAME, + .id_table = ena_pci_tbl, + .probe = ena_probe, + .remove = ena_remove, + .shutdown = ena_shutdown, +#ifdef ENA_GENERIC_PM_OPS + .driver.pm = &ena_pm_ops, +#else /* ENA_GENERIC_PM_OPS */ +#ifdef CONFIG_PM + .suspend = ena_suspend, + .resume = ena_resume, +#endif /* CONFIG_PM */ +#endif /* ENA_GENERIC_PM_OPS */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,18,0) + .sriov_configure = pci_sriov_configure_simple, +#endif +}; + +static int __init ena_init(void) +{ + int ret; + + ena_wq = create_singlethread_workqueue(DRV_MODULE_NAME); + if (!ena_wq) { + pr_err("Failed to create workqueue\n"); + return -ENOMEM; + } + + ret = pci_register_driver(&ena_pci_driver); + if (ret) + destroy_workqueue(ena_wq); + + return ret; +} + +static void __exit ena_cleanup(void) +{ + pci_unregister_driver(&ena_pci_driver); + + if (ena_wq) { + destroy_workqueue(ena_wq); + ena_wq = NULL; + } +} + +/****************************************************************************** + ******************************** AENQ Handlers ******************************* + *****************************************************************************/ +/* ena_update_on_link_change: + * Notify the network interface about the change in link status + */ +static void 
ena_update_on_link_change(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + struct ena_admin_aenq_link_change_desc *aenq_desc = + (struct ena_admin_aenq_link_change_desc *)aenq_e; + int status = aenq_desc->flags & + ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK; + + if (status) { + netif_dbg(adapter, ifup, adapter->netdev, "%s\n", __func__); + set_bit(ENA_FLAG_LINK_UP, &adapter->flags); + if (!test_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags)) + netif_carrier_on(adapter->netdev); + } else { + clear_bit(ENA_FLAG_LINK_UP, &adapter->flags); + netif_carrier_off(adapter->netdev); + } +} + +static void ena_keep_alive_wd(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + struct ena_admin_aenq_keep_alive_desc *desc; + u64 rx_overruns; + u64 rx_drops; + u64 tx_drops; + + desc = (struct ena_admin_aenq_keep_alive_desc *)aenq_e; + adapter->last_keep_alive_jiffies = jiffies; + + rx_drops = ENA_HIGH_LOW_TO_U64(desc->rx_drops_high, desc->rx_drops_low); + tx_drops = ENA_HIGH_LOW_TO_U64(desc->tx_drops_high, desc->tx_drops_low); + rx_overruns = ENA_HIGH_LOW_TO_U64(desc->rx_overruns_high, desc->rx_overruns_low); + + u64_stats_update_begin(&adapter->syncp); + /* These stats are accumulated by the device, so the counters indicate + * all drops since last reset. + */ + adapter->dev_stats.rx_drops = rx_drops; + adapter->dev_stats.tx_drops = tx_drops; + adapter->dev_stats.rx_overruns = rx_overruns; + u64_stats_update_end(&adapter->syncp); +} + +static void ena_notification(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + struct ena_admin_ena_hw_hints *hints; + + WARN(aenq_e->aenq_common_desc.group != ENA_ADMIN_NOTIFICATION, + "Invalid group(%x) expected %x\n", + aenq_e->aenq_common_desc.group, + ENA_ADMIN_NOTIFICATION); + + switch (aenq_e->aenq_common_desc.syndrome) { + case ENA_ADMIN_UPDATE_HINTS: + hints = (struct ena_admin_ena_hw_hints *) + (&aenq_e->inline_data_w4); + ena_update_hints(adapter, hints); + break; + default: + netif_err(adapter, drv, adapter->netdev, + "Invalid aenq notification link state %d\n", + aenq_e->aenq_common_desc.syndrome); + } +} + +static void ena_refresh_fw_capabilites(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + + netdev_info(adapter->netdev, "Received request to refresh capabilities\n"); + + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); +} + + +static void ena_conf_notification(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + struct ena_admin_aenq_conf_notifications_desc *desc; + u64 bitmap; + int bit; + + desc = (struct ena_admin_aenq_conf_notifications_desc *)aenq_e; + bitmap = desc->notifications_bitmap; + + if (bitmap == 0) { + netif_dbg(adapter, drv, adapter->netdev, + "Empty configuration notification bitmap\n"); + return; + } + + for_each_set_bit(bit, (unsigned long *)&bitmap, BITS_PER_TYPE(bitmap)) { + netif_info(adapter, drv, adapter->netdev, + "Sub-optimal configuration notification code: %d. 
Refer to AWS ENA documentation for additional details and mitigation options.\n", + bit + 1); + } +} + +static void ena_admin_device_request_reset(void *data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)data; + + netdev_warn(adapter->netdev, + "The device has detected an unhealthy state, reset is requested\n"); + + ena_reset_device(adapter, ENA_REGS_RESET_DEVICE_REQUEST); +} + +/* This handler will called for unknown event group or unimplemented handlers*/ +static void unimplemented_aenq_handler(void *data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)data; + + netif_err(adapter, drv, adapter->netdev, + "Unknown event was received or event with unimplemented handler\n"); +} + +static struct ena_aenq_handlers aenq_handlers = { + .handlers = { + [ENA_ADMIN_LINK_CHANGE] = ena_update_on_link_change, + [ENA_ADMIN_NOTIFICATION] = ena_notification, + [ENA_ADMIN_KEEP_ALIVE] = ena_keep_alive_wd, + [ENA_ADMIN_CONF_NOTIFICATIONS] = ena_conf_notification, + [ENA_ADMIN_DEVICE_REQUEST_RESET] = ena_admin_device_request_reset, + [ENA_ADMIN_REFRESH_CAPABILITIES] = ena_refresh_fw_capabilites, + }, + .unimplemented_handler = unimplemented_aenq_handler +}; + +module_init(ena_init); +module_exit(ena_cleanup); diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h new file mode 100644 index 0000000000000..269ced8d531e8 --- /dev/null +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -0,0 +1,698 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef ENA_H +#define ENA_H + +#include "kcompat.h" +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) +#include "dim.h" +#else +#include +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */ +#include +#include +#include +#include +#include +#include +#ifdef ENA_XDP_SUPPORT +#include +#endif +#ifdef HAS_BPF_HEADER +#include +#endif +#include + +#include "ena_com.h" +#include "ena_eth_com.h" + +#define DRV_MODULE_GEN_MAJOR 2 +#define DRV_MODULE_GEN_MINOR 12 +#define DRV_MODULE_GEN_SUBMINOR 0 + +#define DRV_MODULE_NAME "ena" +#ifndef DRV_MODULE_GENERATION +#define DRV_MODULE_GENERATION \ + __stringify(DRV_MODULE_GEN_MAJOR) "." \ + __stringify(DRV_MODULE_GEN_MINOR) "." \ + __stringify(DRV_MODULE_GEN_SUBMINOR) "g" +#endif + +#define DEVICE_NAME "Elastic Network Adapter (ENA)" + +/* 1 for AENQ + ADMIN */ +#define ENA_ADMIN_MSIX_VEC 1 +#define ENA_MAX_MSIX_VEC(io_queues) (ENA_ADMIN_MSIX_VEC + (io_queues)) + +/* The ENA buffer length fields is 16 bit long. So when PAGE_SIZE == 64kB the + * driver passes 0. + * Since the max packet size the ENA handles is ~9kB limit the buffer length to + * 16kB. 
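+ * (A 64kB length, 0x10000, does not fit in a 16-bit field and would truncate to 0.)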
+ */ +#if PAGE_SIZE > SZ_16K +#define ENA_PAGE_SIZE (_AC(SZ_16K, UL)) +#else +#define ENA_PAGE_SIZE PAGE_SIZE +#endif + +#define ENA_MIN_MSIX_VEC 2 + +#define ENA_REG_BAR 0 +#define ENA_MEM_BAR 2 +#define ENA_BAR_MASK (BIT(ENA_REG_BAR) | BIT(ENA_MEM_BAR)) + +#define ENA_DEFAULT_RING_SIZE (1024) +#define ENA_DEFAULT_WIDE_LLQ_RING_SIZE (512) +#define ENA_MIN_RING_SIZE (256) + +#define ENA_MIN_RX_BUF_SIZE (2048) + +#define ENA_MIN_NUM_IO_QUEUES (1) + +#define ENA_TX_WAKEUP_THRESH (MAX_SKB_FRAGS + 2) +#define ENA_DEFAULT_RX_COPYBREAK (256 - NET_IP_ALIGN) + +#define ENA_MIN_MTU 128 + +#define ENA_NAME_MAX_LEN 20 +#define ENA_IRQNAME_SIZE 40 + +#define ENA_PKT_MAX_BUFS 19 + +#define ENA_RX_RSS_TABLE_LOG_SIZE 7 +#define ENA_RX_RSS_TABLE_SIZE (1 << ENA_RX_RSS_TABLE_LOG_SIZE) + +/* The number of tx packet completions that will be handled each NAPI poll + * cycle is ring_size / ENA_TX_POLL_BUDGET_DIVIDER. + */ +#define ENA_TX_POLL_BUDGET_DIVIDER 4 + +/* Refill Rx queue when number of required descriptors is above + * QUEUE_SIZE / ENA_RX_REFILL_THRESH_DIVIDER or ENA_RX_REFILL_THRESH_PACKET + */ +#define ENA_RX_REFILL_THRESH_DIVIDER 8 +#define ENA_RX_REFILL_THRESH_PACKET 256 + +/* Number of queues to check for missing queues per timer service */ +#define ENA_MONITORED_TX_QUEUES 4 +/* Max timeout packets before device reset */ +#define MAX_NUM_OF_TIMEOUTED_PACKETS 128 + +#define ENA_TX_RING_IDX_NEXT(idx, ring_size) (((idx) + 1) & ((ring_size) - 1)) + +#define ENA_RX_RING_IDX_NEXT(idx, ring_size) (((idx) + 1) & ((ring_size) - 1)) +#define ENA_RX_RING_IDX_ADD(idx, n, ring_size) \ + (((idx) + (n)) & ((ring_size) - 1)) + +#define ENA_IO_TXQ_IDX(q) (2 * (q)) +#define ENA_IO_RXQ_IDX(q) (2 * (q) + 1) +#define ENA_IO_TXQ_IDX_TO_COMBINED_IDX(q) ((q) / 2) +#define ENA_IO_RXQ_IDX_TO_COMBINED_IDX(q) (((q) - 1) / 2) + +#define ENA_MGMNT_IRQ_IDX 0 +#define ENA_IO_IRQ_FIRST_IDX 1 +#define ENA_IO_IRQ_IDX(q) (ENA_IO_IRQ_FIRST_IDX + (q)) + +#define ENA_ADMIN_POLL_DELAY_US 5000 + +/* ENA device should send keep alive msg every 1 sec. + * We wait for 6 sec just to be on the safe side. + */ +#define ENA_DEVICE_KALIVE_TIMEOUT (6 * HZ) +#define ENA_MAX_NO_INTERRUPT_ITERATIONS 3 + +#define ENA_MMIO_DISABLE_REG_READ BIT(0) + +struct ena_page_cache; + +#ifdef ENA_PHC_SUPPORT +struct ena_phc_info; + +#endif +struct ena_irq { + irq_handler_t handler; + void *data; + int cpu; + u32 vector; + cpumask_t affinity_hint_mask; + char name[ENA_IRQNAME_SIZE]; +}; + +struct ena_napi { + unsigned long last_intr_jiffies ____cacheline_aligned; + u8 interrupts_masked; + struct napi_struct napi; + struct ena_ring *tx_ring; + struct ena_ring *rx_ring; + u32 qid; + struct dim dim; +}; + +struct ena_tx_buffer { + union { + struct sk_buff *skb; +#ifdef ENA_XDP_SUPPORT + /* XDP buffer structure which is used for sending packets in + * the xdp queues + */ + struct xdp_frame *xdpf; +#endif /* ENA_XDP_SUPPORT */ + }; + /* num of ena desc for this specific skb + * (includes data desc and metadata desc) + */ + u32 tx_descs; + /* num of buffers used by this skb */ + u32 num_of_bufs; + + /* Total size of all buffers in bytes */ + u32 total_tx_size; + + /* Indicate if bufs[0] map the linear data of the skb. */ + u8 map_linear_data; + + /* Used for detect missing tx packets to limit the number of prints */ + u8 print_once; + /* Save the last jiffies to detect missing tx packets + * + * sets to non zero value on ena_start_xmit and set to zero on + * napi and timer_Service_routine. 
+ * + * while this value is not protected by lock, + * a given packet is not expected to be handled by ena_start_xmit + * and by napi/timer_service at the same time. + */ + unsigned long tx_sent_jiffies; + struct ena_com_buf bufs[ENA_PKT_MAX_BUFS]; +} ____cacheline_aligned; + +struct ena_rx_buffer { + struct sk_buff *skb; +#ifdef ENA_AF_XDP_SUPPORT + union { + struct { + struct page *page; + dma_addr_t dma_addr; + }; + /* XSK pool buffer */ + struct xdp_buff *xdp; + }; +#else + struct page *page; + dma_addr_t dma_addr; +#endif /* ENA_AF_XDP_SUPPORT */ + u32 page_offset; + u32 buf_offset; + struct ena_com_buf ena_buf; + bool is_lpc_page; +} ____cacheline_aligned; + +struct ena_stats_tx { + u64 cnt; + u64 bytes; + u64 queue_stop; + u64 prepare_ctx_err; + u64 queue_wakeup; + u64 dma_mapping_err; + u64 linearize; + u64 linearize_failed; + u64 napi_comp; + u64 tx_poll; + u64 doorbells; + u64 bad_req_id; + u64 llq_buffer_copy; + u64 missed_tx; + u64 unmask_interrupt; + u64 last_napi_jiffies; +#ifdef ENA_AF_XDP_SUPPORT + u64 xsk_need_wakeup_set; + u64 xsk_wakeup_request; +#endif /* ENA_AF_XDP_SUPPORT */ +}; + +struct ena_stats_rx { + u64 cnt; + u64 bytes; + u64 rx_copybreak_pkt; + u64 csum_good; + u64 refil_partial; + u64 csum_bad; + u64 page_alloc_fail; + u64 skb_alloc_fail; + u64 dma_mapping_err; + u64 bad_desc_num; +#ifdef ENA_BUSY_POLL_SUPPORT + u64 bp_yield; + u64 bp_missed; + u64 bp_cleaned; +#endif + u64 bad_req_id; + u64 empty_rx_ring; + u64 csum_unchecked; +#ifdef ENA_XDP_SUPPORT + u64 xdp_aborted; + u64 xdp_drop; + u64 xdp_pass; + u64 xdp_tx; + u64 xdp_invalid; + u64 xdp_redirect; +#endif + u64 lpc_warm_up; + u64 lpc_full; + u64 lpc_wrong_numa; +#ifdef ENA_AF_XDP_SUPPORT + u64 xsk_need_wakeup_set; + u64 zc_queue_pkt_copy; +#endif /* ENA_AF_XDP_SUPPORT */ +}; + +struct ena_ring { + /* Holds the empty requests for TX/RX + * out of order completions + */ + u16 *free_ids; + + union { + struct ena_tx_buffer *tx_buffer_info; + struct ena_rx_buffer *rx_buffer_info; + }; + + /* cache ptr to avoid using the adapter */ + struct device *dev; + struct pci_dev *pdev; + struct napi_struct *napi; + struct net_device *netdev; + struct ena_page_cache *page_cache; + struct ena_com_dev *ena_dev; + struct ena_adapter *adapter; + struct ena_com_io_cq *ena_com_io_cq; + struct ena_com_io_sq *ena_com_io_sq; +#ifdef ENA_XDP_SUPPORT + struct bpf_prog *xdp_bpf_prog; + struct xdp_rxq_info xdp_rxq; + spinlock_t xdp_tx_lock; /* synchronize XDP TX/Redirect traffic */ + /* Used for rx queues only to point to the xdp tx ring, to + * which traffic should be redirected from this rx ring. + */ + struct ena_ring *xdp_ring; +#ifdef ENA_AF_XDP_SUPPORT + struct xsk_buff_pool *xsk_pool; +#endif /* ENA_AF_XDP_SUPPORT */ +#endif /* ENA_XDP_SUPPORT */ + + u16 next_to_use; + u16 next_to_clean; + u16 rx_copybreak; + u16 rx_headroom; + u16 qid; + u16 mtu; + u16 sgl_size; + u8 enable_bql; + + /* The maximum header length the device can handle */ + u8 tx_max_header_size; + + bool disable_meta_caching; + u16 no_interrupt_event_cnt; + + /* cpu and NUMA for TPH */ + int cpu; + int numa_node; + + /* number of tx/rx_buffer_info's entries */ + int ring_size; + + enum ena_admin_placement_policy_type tx_mem_queue_type; + + struct ena_com_rx_buf_info ena_bufs[ENA_PKT_MAX_BUFS]; + u32 interrupt_interval; + /* Indicates whether interrupt interval has changed since previous set. + * This flag will be kept up, until cleared by the routine which updates + * the device with the modified interrupt interval value. 
+ */ + bool interrupt_interval_changed; + u32 per_napi_packets; + u16 non_empty_napi_events; + struct u64_stats_sync syncp; + union { + struct ena_stats_tx tx_stats; + struct ena_stats_rx rx_stats; + }; + + u8 *push_buf_intermediate_buf; + int empty_rx_queue; +#ifdef ENA_BUSY_POLL_SUPPORT + atomic_t bp_state; +#endif +} ____cacheline_aligned; + +#ifdef ENA_BUSY_POLL_SUPPORT +enum ena_busy_poll_state_t { + ENA_BP_STATE_IDLE = 0, + ENA_BP_STATE_NAPI, + ENA_BP_STATE_POLL, + ENA_BP_STATE_DISABLE +}; +#endif +struct ena_stats_dev { + u64 tx_timeout; + u64 suspend; + u64 resume; + u64 wd_expired; + u64 interface_up; + u64 interface_down; + u64 admin_q_pause; + u64 rx_drops; + u64 tx_drops; + u64 rx_overruns; + u64 reset_fail; + u64 total_resets; + u64 bad_tx_req_id; + u64 bad_rx_req_id; + u64 bad_rx_desc_num; + u64 missing_intr; + u64 suspected_poll_starvation; + u64 missing_tx_cmpl; + u64 rx_desc_malformed; + u64 tx_desc_malformed; + u64 invalid_state; + u64 os_netdev_wd; + u64 missing_admin_interrupt; + u64 admin_to; + u64 device_request_reset; +}; + +enum ena_flags_t { + ENA_FLAG_DEVICE_RUNNING, + ENA_FLAG_DEV_UP, + ENA_FLAG_LINK_UP, + ENA_FLAG_MSIX_ENABLED, + ENA_FLAG_TRIGGER_RESET, + ENA_FLAG_ONGOING_RESET +}; + +enum ena_llq_header_size_policy_t { + /* Intermediate policy until llq configuration is initialized + * to either NORMAL or LARGE + */ + ENA_LLQ_HEADER_SIZE_POLICY_UNSPECIFIED = 0, + /* Policy for Normal size LLQ entry (128B) */ + ENA_LLQ_HEADER_SIZE_POLICY_NORMAL, + /* Policy for Large size LLQ entry (256B) */ + ENA_LLQ_HEADER_SIZE_POLICY_LARGE +}; + +/* adapter specific private data structure */ +struct ena_adapter { + struct ena_com_dev *ena_dev; + /* OS defined structs */ + struct net_device *netdev; + struct pci_dev *pdev; + + /* rx packets that are shorter than this len will be copied to the skb + * header + */ + u32 rx_copybreak; + u32 max_mtu; + + u32 num_io_queues; + u32 max_num_io_queues; + + /* Local page cache size when it's enabled */ + u32 configured_lpc_size; + /* Current Local page cache size */ + u32 used_lpc_size; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + struct msix_entry *msix_entries; +#endif + int msix_vecs; + + u32 missing_tx_completion_threshold; + + u32 requested_tx_ring_size; + u32 requested_rx_ring_size; + + u32 max_tx_ring_size; + u32 max_rx_ring_size; + + u32 msg_enable; + + /* The policy is used for two purposes: + * 1. Indicates who decided on LLQ entry size (user / device) + * 2. Indicates whether large LLQ is set or not after device + * initialization / configuration. 
+ */ + enum ena_llq_header_size_policy_t llq_policy; + bool large_llq_header_supported; + + u16 max_tx_sgl_size; + u16 max_rx_sgl_size; + + u8 mac_addr[ETH_ALEN]; + + unsigned long keep_alive_timeout; + unsigned long missing_tx_completion_to_jiffies; + + char name[ENA_NAME_MAX_LEN]; +#ifdef ENA_PHC_SUPPORT + + struct ena_phc_info *phc_info; +#endif + + unsigned long flags; + /* TX */ + struct ena_ring tx_ring[ENA_MAX_NUM_IO_QUEUES] + ____cacheline_aligned_in_smp; + + /* RX */ + struct ena_ring rx_ring[ENA_MAX_NUM_IO_QUEUES] + ____cacheline_aligned_in_smp; + + struct ena_napi ena_napi[ENA_MAX_NUM_IO_QUEUES]; + + struct ena_irq irq_tbl[ENA_MAX_MSIX_VEC(ENA_MAX_NUM_IO_QUEUES)]; + + /* timer service */ + struct work_struct reset_task; + struct timer_list timer_service; + + bool wd_state; + bool dev_up_before_reset; + bool disable_meta_caching; + unsigned long last_keep_alive_jiffies; + + struct u64_stats_sync syncp; + struct ena_stats_dev dev_stats; + struct ena_admin_eni_stats eni_stats; + struct ena_admin_ena_srd_info ena_srd_info; + + /* last queue index that was checked for uncompleted tx packets */ + u32 last_monitored_tx_qid; + + enum ena_regs_reset_reason_types reset_reason; + +#ifdef ENA_XDP_SUPPORT + struct bpf_prog *xdp_bpf_prog; +#endif + u32 xdp_first_ring; + u32 xdp_num_queues; +}; + +#define ENA_RESET_STATS_ENTRY(reset_reason, stat) \ + [reset_reason] = { \ + .stat_offset = offsetof(struct ena_stats_dev, stat) / sizeof(u64), \ + .has_counter = true \ +} + +struct ena_reset_stats_offset { + int stat_offset; + bool has_counter; +}; + +static const struct ena_reset_stats_offset resets_to_stats_offset_map[ENA_REGS_RESET_LAST] = { + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_KEEP_ALIVE_TO, wd_expired), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_ADMIN_TO, admin_to), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_MISS_TX_CMPL, missing_tx_cmpl), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_INV_RX_REQ_ID, bad_rx_req_id), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_INV_TX_REQ_ID, bad_tx_req_id), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_TOO_MANY_RX_DESCS, bad_rx_desc_num), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_DRIVER_INVALID_STATE, invalid_state), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_OS_NETDEV_WD, os_netdev_wd), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_MISS_INTERRUPT, missing_intr), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_SUSPECTED_POLL_STARVATION, suspected_poll_starvation), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED, rx_desc_malformed), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED, tx_desc_malformed), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_MISSING_ADMIN_INTERRUPT, missing_admin_interrupt), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_DEVICE_REQUEST, device_request_reset), +}; + +void ena_set_ethtool_ops(struct net_device *netdev); + +void ena_dump_stats_to_dmesg(struct ena_adapter *adapter); + +void ena_dump_stats_to_buf(struct ena_adapter *adapter, u8 *buf); + + +int ena_set_lpc_state(struct ena_adapter *adapter, bool enabled); + +int ena_update_queue_params(struct ena_adapter *adapter, + u32 new_tx_size, + u32 new_rx_size, + u32 new_llq_header_len); + +int ena_update_queue_count(struct ena_adapter *adapter, u32 new_channel_count); + +int ena_set_rx_copybreak(struct ena_adapter *adapter, u32 rx_copybreak); + +/* Increase a stat by cnt while holding syncp seqlock on 32bit machines */ +static inline void ena_increase_stat(u64 *statp, u64 cnt, + struct u64_stats_sync *syncp) +{ + u64_stats_update_begin(syncp); + (*statp) += cnt; + u64_stats_update_end(syncp); +} + +int 
ena_get_sset_count(struct net_device *netdev, int sset); +#ifdef ENA_BUSY_POLL_SUPPORT +static inline void ena_bp_init_lock(struct ena_ring *rx_ring) +{ + /* reset state to idle */ + atomic_set(&rx_ring->bp_state, ENA_BP_STATE_IDLE); +} + +/* called from the napi routine to get ownership of the ring */ +static inline bool ena_bp_lock_napi(struct ena_ring *rx_ring) +{ + int rc = atomic_cmpxchg(&rx_ring->bp_state, ENA_BP_STATE_IDLE, + ENA_BP_STATE_NAPI); + if (rc != ENA_BP_STATE_IDLE) { + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bp_yield++; + u64_stats_update_end(&rx_ring->syncp); + } + + return rc == ENA_BP_STATE_IDLE; +} + +static inline void ena_bp_unlock_napi(struct ena_ring *rx_ring) +{ + WARN_ON(atomic_read(&rx_ring->bp_state) != ENA_BP_STATE_NAPI); + + /* flush any outstanding Rx frames */ + if (rx_ring->napi->gro_list) + napi_gro_flush(rx_ring->napi, false); + + /* reset state to idle */ + atomic_set(&rx_ring->bp_state, ENA_BP_STATE_IDLE); +} + +/* called from ena_ll_busy_poll() */ +static inline bool ena_bp_lock_poll(struct ena_ring *rx_ring) +{ + int rc = atomic_cmpxchg(&rx_ring->bp_state, ENA_BP_STATE_IDLE, + ENA_BP_STATE_POLL); + if (rc != ENA_BP_STATE_IDLE) { + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bp_yield++; + u64_stats_update_end(&rx_ring->syncp); + } + + return rc == ENA_BP_STATE_IDLE; +} + +static inline void ena_bp_unlock_poll(struct ena_ring *rx_ring) +{ + WARN_ON(atomic_read(&rx_ring->bp_state) != ENA_BP_STATE_POLL); + + /* reset state to idle */ + atomic_set(&rx_ring->bp_state, ENA_BP_STATE_IDLE); +} + +/* true if a socket is polling, even if it did not get the lock */ +static inline bool ena_bp_busy_polling(struct ena_ring *rx_ring) +{ + return atomic_read(&rx_ring->bp_state) == ENA_BP_STATE_POLL; +} + +static inline bool ena_bp_disable(struct ena_ring *rx_ring) +{ + int rc = atomic_cmpxchg(&rx_ring->bp_state, ENA_BP_STATE_IDLE, + ENA_BP_STATE_DISABLE); + + return rc == ENA_BP_STATE_IDLE; +} +#endif /* ENA_BUSY_POLL_SUPPORT */ + +static inline void ena_reset_device(struct ena_adapter *adapter, + enum ena_regs_reset_reason_types reset_reason) +{ + const struct ena_reset_stats_offset *ena_reset_stats_offset = + &resets_to_stats_offset_map[reset_reason]; + + if (ena_reset_stats_offset->has_counter) { + u64 *stat_ptr = (u64 *)&adapter->dev_stats + ena_reset_stats_offset->stat_offset; + + ena_increase_stat(stat_ptr, 1, &adapter->syncp); + } + + ena_increase_stat(&adapter->dev_stats.total_resets, 1, &adapter->syncp); + adapter->reset_reason = reset_reason; + /* Make sure reset reason is set before triggering the reset */ + smp_mb__before_atomic(); + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); +} + +/* Allocate a page and DMA map it + * @rx_ring: The IO queue pair which requests the allocation + * + * @return: address of the mapped page in DMA and allocated page address is + * succeeded, or NULL + */ +struct page *ena_alloc_map_page(struct ena_ring *rx_ring, dma_addr_t *dma); + +int ena_destroy_device(struct ena_adapter *adapter, bool graceful); +int ena_restore_device(struct ena_adapter *adapter); +int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, + struct ena_tx_buffer *tx_info, bool is_xdp); + +static inline void ena_ring_tx_doorbell(struct ena_ring *tx_ring) +{ + ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq); + ena_increase_stat(&tx_ring->tx_stats.doorbells, 1, &tx_ring->syncp); +} + +int ena_xmit_common(struct ena_adapter *adapter, + struct ena_ring *ring, + struct ena_tx_buffer *tx_info, + struct 
ena_com_tx_ctx *ena_tx_ctx, + u16 next_to_use, + u32 bytes); +void ena_unmap_tx_buff(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info); +void ena_init_io_rings(struct ena_adapter *adapter, + int first_index, int count); +int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, + int first_index, int count); +int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, int count); +void ena_free_all_io_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, int count); +void ena_free_all_io_tx_resources(struct ena_adapter *adapter); +void ena_down(struct ena_adapter *adapter); +int ena_up(struct ena_adapter *adapter); +void ena_unmask_interrupt(struct ena_ring *tx_ring, struct ena_ring *rx_ring); +void ena_update_ring_numa_node(struct ena_ring *rx_ring); +void ena_rx_checksum(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb); +void ena_set_rx_hash(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb); +int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num); +#endif /* !(ENA_H) */ diff --git a/drivers/amazon/net/ena/ena_pci_id_tbl.h b/drivers/amazon/net/ena/ena_pci_id_tbl.h new file mode 100644 index 0000000000000..3ecdf29160ca7 --- /dev/null +++ b/drivers/amazon/net/ena/ena_pci_id_tbl.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef ENA_PCI_ID_TBL_H_ +#define ENA_PCI_ID_TBL_H_ + +#ifndef PCI_VENDOR_ID_AMAZON +#define PCI_VENDOR_ID_AMAZON 0x1d0f +#endif + +#ifndef PCI_DEV_ID_ENA_PF +#define PCI_DEV_ID_ENA_PF 0x0ec2 +#endif + +#ifndef PCI_DEV_ID_ENA_LLQ_PF +#define PCI_DEV_ID_ENA_LLQ_PF 0x1ec2 +#endif + +#ifndef PCI_DEV_ID_ENA_VF +#define PCI_DEV_ID_ENA_VF 0xec20 +#endif + +#ifndef PCI_DEV_ID_ENA_LLQ_VF +#define PCI_DEV_ID_ENA_LLQ_VF 0xec21 +#endif + +#ifndef PCI_DEV_ID_ENA_RESRV0 +#define PCI_DEV_ID_ENA_RESRV0 0x0051 +#endif + +#define ENA_PCI_ID_TABLE_ENTRY(devid) \ + {PCI_DEVICE(PCI_VENDOR_ID_AMAZON, devid)}, + +static const struct pci_device_id ena_pci_tbl[] = { + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_RESRV0) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_PF) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_LLQ_PF) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_VF) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_LLQ_VF) + { } +}; + +#endif /* ENA_PCI_ID_TBL_H_ */ diff --git a/drivers/amazon/net/ena/ena_phc.c b/drivers/amazon/net/ena/ena_phc.c new file mode 100644 index 0000000000000..705824aab2ef5 --- /dev/null +++ b/drivers/amazon/net/ena/ena_phc.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2022 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include +#include "ena_netdev.h" +#include "ena_phc.h" + +#ifdef ENA_PHC_SUPPORT +#ifdef ENA_PHC_SUPPORT_ADJFREQ +static int ena_phc_adjfreq(struct ptp_clock_info *clock_info, s32 ppb) +{ + return -EOPNOTSUPP; +} +#endif /* ENA_PHC_SUPPORT_ADJFREQ */ + +static int ena_phc_adjtime(struct ptp_clock_info *clock_info, s64 delta) +{ + return -EOPNOTSUPP; +} + +static int ena_phc_feature_enable(struct ptp_clock_info *clock_info, struct ptp_clock_request *rq, + int on) +{ + return -EOPNOTSUPP; +} + +#ifdef ENA_PHC_SUPPORT_GETTIME64 +#ifdef ENA_PHC_SUPPORT_GETTIME64_EXTENDED +static int ena_phc_gettimex64(struct ptp_clock_info *clock_info, struct timespec64 *ts, + struct ptp_system_timestamp *sts) +{ + struct ena_phc_info *phc_info = container_of(clock_info, struct ena_phc_info, clock_info); + unsigned long flags; + u64 timestamp_nsec; + int rc; + + spin_lock_irqsave(&phc_info->lock, flags); + + ptp_read_system_prets(sts); + + rc = ena_com_phc_get_timestamp(phc_info->adapter->ena_dev, ×tamp_nsec); + + ptp_read_system_postts(sts); + + spin_unlock_irqrestore(&phc_info->lock, flags); + + *ts = ns_to_timespec64(timestamp_nsec); + + return rc; +} + +#else /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ +static int ena_phc_gettime64(struct ptp_clock_info *clock_info, struct timespec64 *ts) +{ + struct ena_phc_info *phc_info = container_of(clock_info, struct ena_phc_info, clock_info); + unsigned long flags; + u64 timestamp_nsec; + int rc; + + spin_lock_irqsave(&phc_info->lock, flags); + + rc = ena_com_phc_get_timestamp(phc_info->adapter->ena_dev, ×tamp_nsec); + + spin_unlock_irqrestore(&phc_info->lock, flags); + + *ts = ns_to_timespec64(timestamp_nsec); + + return rc; +} + +#endif /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ +static int ena_phc_settime64(struct ptp_clock_info *clock_info, + const struct timespec64 *ts) +{ + return -EOPNOTSUPP; +} + +#else /* ENA_PHC_SUPPORT_GETTIME64 */ +static int ena_phc_gettime(struct ptp_clock_info *clock_info, struct timespec *ts) +{ + struct ena_phc_info *phc_info = container_of(clock_info, struct ena_phc_info, clock_info); + unsigned long flags; + u64 timestamp_nsec; + u32 remainder; + int rc; + + spin_lock_irqsave(&phc_info->lock, flags); + + rc = ena_com_phc_get_timestamp(phc_info->adapter->ena_dev, ×tamp_nsec); + + spin_unlock_irqrestore(&phc_info->lock, flags); + + ts->tv_sec = div_u64_rem(timestamp_nsec, NSEC_PER_SEC, &remainder); + ts->tv_nsec = remainder; + + return rc; +} + +static int ena_phc_settime(struct ptp_clock_info *clock_info, const struct timespec *ts) +{ + return -EOPNOTSUPP; +} + +#endif /* ENA_PHC_SUPPORT_GETTIME64 */ + +static struct ptp_clock_info ena_ptp_clock_info = { + .owner = THIS_MODULE, + .n_alarm = 0, + .n_ext_ts = 0, + .n_per_out = 0, + .pps = 0, +#ifdef ENA_PHC_SUPPORT_ADJFREQ + .adjfreq = ena_phc_adjfreq, +#endif /* ENA_PHC_SUPPORT_ADJFREQ */ + .adjtime = ena_phc_adjtime, +#ifdef ENA_PHC_SUPPORT_GETTIME64 +#ifdef ENA_PHC_SUPPORT_GETTIME64_EXTENDED + .gettimex64 = ena_phc_gettimex64, +#else /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ + .gettime64 = ena_phc_gettime64, +#endif /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ + .settime64 = ena_phc_settime64, +#else /* ENA_PHC_SUPPORT_GETTIME64 */ + .gettime = ena_phc_gettime, + .settime = ena_phc_settime, +#endif /* ENA_PHC_SUPPORT_GETTIME64 */ + .enable = ena_phc_feature_enable, +}; + +/* Enable/Disable PHC by the kernel, affects on the next init flow */ +void ena_phc_enable(struct ena_adapter *adapter, bool enable) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + if (!phc_info) { + 
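/* phc_info is normally allocated by ena_phc_alloc() during probe, so this path is not expected to be hit. */ +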
netdev_err(adapter->netdev, "phc_info is not allocated\n"); + return; + } + + phc_info->enabled = enable; +} + +/* Check if PHC is enabled by the kernel */ +bool ena_phc_is_enabled(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + return (phc_info && phc_info->enabled); +} + +/* PHC is activated if ptp clock is registered in the kernel */ +bool ena_phc_is_active(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + return (phc_info && phc_info->clock); +} + +static int ena_phc_register(struct ena_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + struct ptp_clock_info *clock_info; + struct ena_phc_info *phc_info; + int rc = 0; + + phc_info = adapter->phc_info; + clock_info = &phc_info->clock_info; + + /* PHC may already be registered in case of a reset */ + if (ena_phc_is_active(adapter)) + return 0; + + phc_info->adapter = adapter; + + spin_lock_init(&phc_info->lock); + + /* Fill the ptp_clock_info struct and register PTP clock */ + *clock_info = ena_ptp_clock_info; + snprintf(clock_info->name, + sizeof(clock_info->name), + "ena-ptp-%02x", + PCI_SLOT(pdev->devfn)); + + phc_info->clock = ptp_clock_register(clock_info, &pdev->dev); + if (IS_ERR(phc_info->clock)) { + rc = PTR_ERR(phc_info->clock); + netdev_err(adapter->netdev, "Failed registering ptp clock, error: %d\n", rc); + phc_info->clock = NULL; + } + + return rc; +} + +static void ena_phc_unregister(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + /* During reset flow, PHC must stay registered to keep kernel's PHC index */ + if (ena_phc_is_active(adapter) && !test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) { + ptp_clock_unregister(phc_info->clock); + phc_info->clock = NULL; + } +} + +int ena_phc_alloc(struct ena_adapter *adapter) +{ + /* Allocate driver specific PHC info */ + adapter->phc_info = vzalloc(sizeof(*adapter->phc_info)); + if (unlikely(!adapter->phc_info)) { + netdev_err(adapter->netdev, "Failed to alloc phc_info\n"); + return -ENOMEM; + } + + return 0; +} + +void ena_phc_free(struct ena_adapter *adapter) +{ + if (adapter->phc_info) { + vfree(adapter->phc_info); + adapter->phc_info = NULL; + } +} + +int ena_phc_init(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct net_device *netdev = adapter->netdev; + int rc = -EOPNOTSUPP; + + /* Validate PHC feature is supported in the device */ + if (!ena_com_phc_supported(ena_dev)) { + netdev_dbg(netdev, "PHC feature is not supported by the device\n"); + goto err_ena_com_phc_init; + } + + /* Validate PHC feature is enabled by the kernel */ + if (!ena_phc_is_enabled(adapter)) { + netdev_dbg(netdev, "PHC feature is not enabled by the kernel\n"); + goto err_ena_com_phc_init; + } + + /* Initialize device specific PHC info */ + rc = ena_com_phc_init(ena_dev); + if (unlikely(rc)) { + netdev_err(netdev, "Failed to init phc, error: %d\n", rc); + goto err_ena_com_phc_init; + } + + /* Configure PHC feature in driver and device */ + rc = ena_com_phc_config(ena_dev); + if (unlikely(rc)) { + netdev_err(netdev, "Failed to config phc, error: %d\n", rc); + goto err_ena_com_phc_config; + } + + /* Register to PTP class driver */ + rc = ena_phc_register(adapter); + if (unlikely(rc)) { + netdev_err(netdev, "Failed to register phc, error: %d\n", rc); + goto err_ena_com_phc_config; + } + + return 0; + +err_ena_com_phc_config: + ena_com_phc_destroy(ena_dev); +err_ena_com_phc_init: + ena_phc_enable(adapter, false); + return rc; +} + +void 
ena_phc_destroy(struct ena_adapter *adapter) +{ + ena_phc_unregister(adapter); + ena_com_phc_destroy(adapter->ena_dev); +} + +int ena_phc_get_index(struct ena_adapter *adapter) +{ + if (ena_phc_is_active(adapter)) + return ptp_clock_index(adapter->phc_info->clock); + + return -1; +} + +int ena_phc_get_error_bound(struct ena_adapter *adapter, u32 *error_bound_nsec) +{ + if (!ena_phc_is_active(adapter)) + return -EOPNOTSUPP; + + return ena_com_phc_get_error_bound(adapter->ena_dev, error_bound_nsec); +} +#endif /* ENA_PHC_SUPPORT */ diff --git a/drivers/amazon/net/ena/ena_phc.h b/drivers/amazon/net/ena/ena_phc.h new file mode 100644 index 0000000000000..5252fc7081199 --- /dev/null +++ b/drivers/amazon/net/ena/ena_phc.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef ENA_PHC_H +#define ENA_PHC_H + +#ifdef ENA_PHC_SUPPORT + +#include + +struct ena_phc_info { + /* PTP hardware capabilities */ + struct ptp_clock_info clock_info; + + /* Registered PTP clock device */ + struct ptp_clock *clock; + + /* Adapter specific private data structure */ + struct ena_adapter *adapter; + + /* PHC lock */ + spinlock_t lock; + + /* Enabled by kernel */ + bool enabled; +}; + +void ena_phc_enable(struct ena_adapter *adapter, bool enable); +bool ena_phc_is_enabled(struct ena_adapter *adapter); +bool ena_phc_is_active(struct ena_adapter *adapter); +int ena_phc_get_index(struct ena_adapter *adapter); +int ena_phc_init(struct ena_adapter *adapter); +void ena_phc_destroy(struct ena_adapter *adapter); +int ena_phc_alloc(struct ena_adapter *adapter); +void ena_phc_free(struct ena_adapter *adapter); +int ena_phc_get_error_bound(struct ena_adapter *adapter, u32 *error_bound); +#else /* ENA_PHC_SUPPORT */ + +static inline void ena_phc_enable(struct ena_adapter *adapter, bool enable) { } +static inline bool ena_phc_is_enabled(struct ena_adapter *adapter) { return false; } +static inline bool ena_phc_is_active(struct ena_adapter *adapter) { return false; } +static inline int ena_phc_get_index(struct ena_adapter *adapter) { return -1; } +static inline int ena_phc_init(struct ena_adapter *adapter) { return 0; } +static inline void ena_phc_destroy(struct ena_adapter *adapter) { } +static inline int ena_phc_alloc(struct ena_adapter *adapter) { return 0; } +static inline void ena_phc_free(struct ena_adapter *adapter) { } +static inline int ena_phc_get_error_bound(struct ena_adapter *adapter, u32 *error_bound) +{ + return 0; +} +#endif /* ENA_PHC_SUPPORT */ + +#endif /* ENA_PHC_H */ diff --git a/drivers/amazon/net/ena/ena_regs_defs.h b/drivers/amazon/net/ena/ena_regs_defs.h new file mode 100644 index 0000000000000..c0f6b8c14e66a --- /dev/null +++ b/drivers/amazon/net/ena/ena_regs_defs.h @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ +#ifndef _ENA_REGS_H_ +#define _ENA_REGS_H_ + +enum ena_regs_reset_reason_types { + ENA_REGS_RESET_NORMAL = 0, + ENA_REGS_RESET_KEEP_ALIVE_TO = 1, + ENA_REGS_RESET_ADMIN_TO = 2, + ENA_REGS_RESET_MISS_TX_CMPL = 3, + ENA_REGS_RESET_INV_RX_REQ_ID = 4, + ENA_REGS_RESET_INV_TX_REQ_ID = 5, + ENA_REGS_RESET_TOO_MANY_RX_DESCS = 6, + ENA_REGS_RESET_INIT_ERR = 7, + ENA_REGS_RESET_DRIVER_INVALID_STATE = 8, + ENA_REGS_RESET_OS_TRIGGER = 9, + ENA_REGS_RESET_OS_NETDEV_WD = 10, + ENA_REGS_RESET_SHUTDOWN = 11, + ENA_REGS_RESET_USER_TRIGGER = 12, + ENA_REGS_RESET_GENERIC = 13, + ENA_REGS_RESET_MISS_INTERRUPT = 14, + ENA_REGS_RESET_SUSPECTED_POLL_STARVATION = 15, + ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED = 16, + ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED = 17, + ENA_REGS_RESET_MISSING_ADMIN_INTERRUPT = 18, + ENA_REGS_RESET_DEVICE_REQUEST = 19, + ENA_REGS_RESET_LAST, +}; + +/* ena_registers offsets */ + +/* 0 base */ +#define ENA_REGS_VERSION_OFF 0x0 +#define ENA_REGS_CONTROLLER_VERSION_OFF 0x4 +#define ENA_REGS_CAPS_OFF 0x8 +#define ENA_REGS_CAPS_EXT_OFF 0xc +#define ENA_REGS_AQ_BASE_LO_OFF 0x10 +#define ENA_REGS_AQ_BASE_HI_OFF 0x14 +#define ENA_REGS_AQ_CAPS_OFF 0x18 +#define ENA_REGS_ACQ_BASE_LO_OFF 0x20 +#define ENA_REGS_ACQ_BASE_HI_OFF 0x24 +#define ENA_REGS_ACQ_CAPS_OFF 0x28 +#define ENA_REGS_AQ_DB_OFF 0x2c +#define ENA_REGS_ACQ_TAIL_OFF 0x30 +#define ENA_REGS_AENQ_CAPS_OFF 0x34 +#define ENA_REGS_AENQ_BASE_LO_OFF 0x38 +#define ENA_REGS_AENQ_BASE_HI_OFF 0x3c +#define ENA_REGS_AENQ_HEAD_DB_OFF 0x40 +#define ENA_REGS_AENQ_TAIL_OFF 0x44 +#define ENA_REGS_INTR_MASK_OFF 0x4c +#define ENA_REGS_DEV_CTL_OFF 0x54 +#define ENA_REGS_DEV_STS_OFF 0x58 +#define ENA_REGS_MMIO_REG_READ_OFF 0x5c +#define ENA_REGS_MMIO_RESP_LO_OFF 0x60 +#define ENA_REGS_MMIO_RESP_HI_OFF 0x64 +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_OFF 0x68 + +/* phc_registers offsets */ + +/* 100 base */ +#define ENA_REGS_PHC_DB_OFF 0x100 + +/* version register */ +#define ENA_REGS_VERSION_MINOR_VERSION_MASK 0xff +#define ENA_REGS_VERSION_MAJOR_VERSION_SHIFT 8 +#define ENA_REGS_VERSION_MAJOR_VERSION_MASK 0xff00 + +/* controller_version register */ +#define ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK 0xff +#define ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT 8 +#define ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK 0xff00 +#define ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT 16 +#define ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK 0xff0000 +#define ENA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT 24 +#define ENA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK 0xff000000 + +/* caps register */ +#define ENA_REGS_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK 0x1 +#define ENA_REGS_CAPS_RESET_TIMEOUT_SHIFT 1 +#define ENA_REGS_CAPS_RESET_TIMEOUT_MASK 0x3e +#define ENA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT 8 +#define ENA_REGS_CAPS_DMA_ADDR_WIDTH_MASK 0xff00 +#define ENA_REGS_CAPS_ADMIN_CMD_TO_SHIFT 16 +#define ENA_REGS_CAPS_ADMIN_CMD_TO_MASK 0xf0000 + +/* aq_caps register */ +#define ENA_REGS_AQ_CAPS_AQ_DEPTH_MASK 0xffff +#define ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK 0xffff0000 + +/* acq_caps register */ +#define ENA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK 0xffff +#define ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK 0xffff0000 + +/* aenq_caps register */ +#define ENA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK 0xffff +#define ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK 0xffff0000 + +/* dev_ctl register */ +#define ENA_REGS_DEV_CTL_DEV_RESET_MASK 0x1 +#define 
ENA_REGS_DEV_CTL_AQ_RESTART_SHIFT 1 +#define ENA_REGS_DEV_CTL_AQ_RESTART_MASK 0x2 +#define ENA_REGS_DEV_CTL_QUIESCENT_SHIFT 2 +#define ENA_REGS_DEV_CTL_QUIESCENT_MASK 0x4 +#define ENA_REGS_DEV_CTL_IO_RESUME_SHIFT 3 +#define ENA_REGS_DEV_CTL_IO_RESUME_MASK 0x8 +#define ENA_REGS_DEV_CTL_RESET_REASON_EXT_SHIFT 24 +#define ENA_REGS_DEV_CTL_RESET_REASON_EXT_MASK 0xf000000 +#define ENA_REGS_DEV_CTL_RESET_REASON_SHIFT 28 +#define ENA_REGS_DEV_CTL_RESET_REASON_MASK 0xf0000000 + +/* dev_sts register */ +#define ENA_REGS_DEV_STS_READY_MASK 0x1 +#define ENA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_SHIFT 1 +#define ENA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK 0x2 +#define ENA_REGS_DEV_STS_AQ_RESTART_FINISHED_SHIFT 2 +#define ENA_REGS_DEV_STS_AQ_RESTART_FINISHED_MASK 0x4 +#define ENA_REGS_DEV_STS_RESET_IN_PROGRESS_SHIFT 3 +#define ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK 0x8 +#define ENA_REGS_DEV_STS_RESET_FINISHED_SHIFT 4 +#define ENA_REGS_DEV_STS_RESET_FINISHED_MASK 0x10 +#define ENA_REGS_DEV_STS_FATAL_ERROR_SHIFT 5 +#define ENA_REGS_DEV_STS_FATAL_ERROR_MASK 0x20 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_SHIFT 6 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_MASK 0x40 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_ACHIEVED_SHIFT 7 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_ACHIEVED_MASK 0x80 + +/* mmio_reg_read register */ +#define ENA_REGS_MMIO_REG_READ_REQ_ID_MASK 0xffff +#define ENA_REGS_MMIO_REG_READ_REG_OFF_SHIFT 16 +#define ENA_REGS_MMIO_REG_READ_REG_OFF_MASK 0xffff0000 + +/* rss_ind_entry_update register */ +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_INDEX_MASK 0xffff +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_SHIFT 16 +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_MASK 0xffff0000 + +/* phc_db_req_id register */ +#define ENA_REGS_PHC_DB_REQ_ID_MASK 0xffff + +#endif /* _ENA_REGS_H_ */ diff --git a/drivers/amazon/net/ena/ena_sysfs.c b/drivers/amazon/net/ena/ena_sysfs.c new file mode 100644 index 0000000000000..109203f5b349a --- /dev/null +++ b/drivers/amazon/net/ena/ena_sysfs.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include +#include +#include +#include + +#include "ena_com.h" +#include "ena_netdev.h" +#ifdef ENA_PHC_SUPPORT +#include "ena_phc.h" +#endif /* ENA_PHC_SUPPORT */ +#include "ena_sysfs.h" + + +static ssize_t ena_store_rx_copybreak(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + unsigned long rx_copybreak; + int rc; + + rc = kstrtoul(buf, 10, &rx_copybreak); + if (rc < 0) + goto exit; + + rtnl_lock(); + rc = ena_set_rx_copybreak(adapter, rx_copybreak); + if (rc) + goto unlock; + rtnl_unlock(); + + return len; +unlock: + rtnl_unlock(); +exit: + return rc; +} + +#define ENA_RX_COPYBREAK_STR_MAX_LEN 7 + +static ssize_t ena_show_rx_copybreak(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + + return snprintf(buf, ENA_RX_COPYBREAK_STR_MAX_LEN, "%d\n", + adapter->rx_copybreak); +} + +static DEVICE_ATTR(rx_copybreak, S_IRUGO | S_IWUSR, ena_show_rx_copybreak, + ena_store_rx_copybreak); +#ifdef ENA_PHC_SUPPORT +/* Max PHC error bound string size takes into account max u32 value, null and new line characters */ +#define ENA_PHC_ERROR_BOUND_STR_MAX_LEN 12 + +static ssize_t ena_show_phc_error_bound(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + u32 error_bound_nsec = 0; + int rc; + + rc = ena_phc_get_error_bound(adapter, &error_bound_nsec); + if (rc != 0) + return rc; + + return snprintf(buf, ENA_PHC_ERROR_BOUND_STR_MAX_LEN, "%u\n", error_bound_nsec); +} + +static DEVICE_ATTR(phc_error_bound, S_IRUGO, ena_show_phc_error_bound, NULL); +#endif /* ENA_PHC_SUPPORT */ + +static ssize_t ena_large_llq_set(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t len) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + enum ena_llq_header_size_policy_t new_llq_policy; + unsigned long large_llq_enabled; + int rc; + + rc = kstrtoul(buf, 10, &large_llq_enabled); + if (rc < 0) + return rc; + + if (large_llq_enabled != 0 && large_llq_enabled != 1) + return -EINVAL; + + rtnl_lock(); + new_llq_policy = large_llq_enabled ? ENA_LLQ_HEADER_SIZE_POLICY_LARGE : + ENA_LLQ_HEADER_SIZE_POLICY_NORMAL; + if (adapter->llq_policy == new_llq_policy) + goto unlock; + + adapter->llq_policy = new_llq_policy; + + ena_destroy_device(adapter, false); + rc = ena_restore_device(adapter); +unlock: + rtnl_unlock(); + + return rc ? 
rc : len; +} + +#define ENA_LARGE_LLQ_STR_MAX_LEN 3 + +static ssize_t ena_large_llq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + bool large_llq_enabled; + + large_llq_enabled = adapter->llq_policy == ENA_LLQ_HEADER_SIZE_POLICY_LARGE; + + return snprintf(buf, ENA_LARGE_LLQ_STR_MAX_LEN, "%d\n", + large_llq_enabled); +} + +static DEVICE_ATTR(large_llq_header, S_IRUGO | S_IWUSR, ena_large_llq_show, + ena_large_llq_set); + +/****************************************************************************** + *****************************************************************************/ +int ena_sysfs_init(struct device *dev) +{ + + if (device_create_file(dev, &dev_attr_rx_copybreak)) + dev_err(dev, "Failed to create rx_copybreak sysfs entry"); + +#ifdef ENA_PHC_SUPPORT + if (device_create_file(dev, &dev_attr_phc_error_bound)) + dev_err(dev, "Failed to create phc_error_bound sysfs entry"); + +#endif /* ENA_PHC_SUPPORT */ + + if (device_create_file(dev, &dev_attr_large_llq_header)) + dev_err(dev, "Failed to create large_llq_header sysfs entry"); + return 0; +} + +/****************************************************************************** + *****************************************************************************/ +void ena_sysfs_terminate(struct device *dev) +{ + device_remove_file(dev, &dev_attr_rx_copybreak); +#ifdef ENA_PHC_SUPPORT + device_remove_file(dev, &dev_attr_phc_error_bound); +#endif /* ENA_PHC_SUPPORT */ + device_remove_file(dev, &dev_attr_large_llq_header); +} diff --git a/drivers/amazon/net/ena/ena_sysfs.h b/drivers/amazon/net/ena/ena_sysfs.h new file mode 100644 index 0000000000000..8c572eee268f3 --- /dev/null +++ b/drivers/amazon/net/ena/ena_sysfs.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef __ENA_SYSFS_H__ +#define __ENA_SYSFS_H__ + +#ifdef CONFIG_SYSFS + +int ena_sysfs_init(struct device *dev); + +void ena_sysfs_terminate(struct device *dev); + +#else /* CONFIG_SYSFS */ + +static inline int ena_sysfs_init(struct device *dev) +{ + return 0; +} + +static inline void ena_sysfs_terminate(struct device *dev) +{ +} + +#endif /* CONFIG_SYSFS */ + +#endif /* __ENA_SYSFS_H__ */ diff --git a/drivers/amazon/net/ena/ena_xdp.c b/drivers/amazon/net/ena/ena_xdp.c new file mode 100644 index 0000000000000..204389ffe5b24 --- /dev/null +++ b/drivers/amazon/net/ena/ena_xdp.c @@ -0,0 +1,1006 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "ena_xdp.h" +#ifdef ENA_XDP_SUPPORT + +static int validate_xdp_req_id(struct ena_ring *tx_ring, u16 req_id) +{ + struct ena_tx_buffer *tx_info; + + tx_info = &tx_ring->tx_buffer_info[req_id]; +#ifdef ENA_AF_XDP_SUPPORT + if (likely(tx_info->total_tx_size)) +#else + if (likely(tx_info->xdpf)) +#endif + return 0; + + return handle_invalid_req_id(tx_ring, req_id, tx_info, true); +} + +static int ena_xdp_tx_map_frame(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info, + struct xdp_frame *xdpf, + struct ena_com_tx_ctx *ena_tx_ctx) +{ + struct ena_adapter *adapter = tx_ring->adapter; + struct ena_com_buf *ena_buf; + int push_len = 0; + dma_addr_t dma; + void *data; + u32 size; + + tx_info->xdpf = xdpf; + data = tx_info->xdpf->data; + size = tx_info->xdpf->len; + + if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* Designate part of the packet for LLQ */ + push_len = min_t(u32, size, tx_ring->tx_max_header_size); + + ena_tx_ctx->push_header = data; + + size -= push_len; + data += push_len; + } + + ena_tx_ctx->header_len = push_len; + + if (size > 0) { + dma = dma_map_single(tx_ring->dev, + data, + size, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) + goto error_report_dma_error; + + tx_info->map_linear_data = 0; + + ena_buf = tx_info->bufs; + ena_buf->paddr = dma; + ena_buf->len = size; + + ena_tx_ctx->ena_bufs = ena_buf; + ena_tx_ctx->num_bufs = tx_info->num_of_bufs = 1; + } + + return 0; + +error_report_dma_error: + ena_increase_stat(&tx_ring->tx_stats.dma_mapping_err, 1, + &tx_ring->syncp); + netif_warn(adapter, tx_queued, adapter->netdev, "Failed to map xdp buff\n"); + + return -EINVAL; +} + +int ena_xdp_xmit_frame(struct ena_ring *tx_ring, + struct ena_adapter *adapter, + struct xdp_frame *xdpf) +{ + struct ena_com_tx_ctx ena_tx_ctx = {}; + struct ena_tx_buffer *tx_info; + u16 next_to_use, req_id; + int rc; + + next_to_use = tx_ring->next_to_use; + req_id = tx_ring->free_ids[next_to_use]; + tx_info = &tx_ring->tx_buffer_info[req_id]; + tx_info->num_of_bufs = 0; + + rc = ena_xdp_tx_map_frame(tx_ring, tx_info, xdpf, &ena_tx_ctx); + if (unlikely(rc)) + goto err; + + ena_tx_ctx.req_id = req_id; + + rc = ena_xmit_common(adapter, + tx_ring, + tx_info, + &ena_tx_ctx, + next_to_use, + xdpf->len); + if (rc) + goto error_unmap_dma; + + return rc; + +error_unmap_dma: + ena_unmap_tx_buff(tx_ring, tx_info); +err: + tx_info->xdpf = NULL; + + return rc; +} + +int ena_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames, u32 flags) +{ + struct ena_adapter *adapter = netdev_priv(dev); + struct ena_ring *tx_ring; + int qid, i, nxmit = 0; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return -ENETDOWN; + + /* We assume that all rings have the same XDP program */ + if (!READ_ONCE(adapter->rx_ring->xdp_bpf_prog)) + return -ENXIO; + + qid = smp_processor_id() % adapter->xdp_num_queues; + qid += adapter->xdp_first_ring; + tx_ring = &adapter->tx_ring[qid]; + + /* Other CPU ids might try to send thorugh this queue */ + spin_lock(&tx_ring->xdp_tx_lock); + + for (i = 0; i < n; i++) { + if (ena_xdp_xmit_frame(tx_ring, adapter, frames[i])) + break; + nxmit++; + } + + /* Ring doorbell to make device aware of the packets */ + if (flags & XDP_XMIT_FLUSH) + ena_ring_tx_doorbell(tx_ring); + + spin_unlock(&tx_ring->xdp_tx_lock); + +#ifndef ENA_XDP_XMIT_FREES_FAILED_DESCS_INTERNALLY + for (i = nxmit; unlikely(i < n); i++) + xdp_return_frame(frames[i]); + +#endif + /* 
Return number of packets sent */ + return nxmit; +} + +static void ena_init_all_xdp_queues(struct ena_adapter *adapter) +{ + adapter->xdp_first_ring = adapter->num_io_queues; + adapter->xdp_num_queues = adapter->num_io_queues; + + ena_init_io_rings(adapter, + adapter->xdp_first_ring, + adapter->xdp_num_queues); +} + +int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter) +{ + u32 xdp_first_ring = adapter->xdp_first_ring; + u32 xdp_num_queues = adapter->xdp_num_queues; + int rc = 0; + + rc = ena_setup_tx_resources_in_range(adapter, xdp_first_ring, xdp_num_queues); + if (rc) + goto setup_err; + + rc = ena_create_io_tx_queues_in_range(adapter, xdp_first_ring, xdp_num_queues); + if (rc) + goto create_err; + + return 0; + +create_err: + ena_free_all_io_tx_resources_in_range(adapter, xdp_first_ring, xdp_num_queues); +setup_err: + return rc; +} + +/* Provides a way for both kernel and bpf-prog to know + * more about the RX-queue a given XDP frame arrived on. + */ +int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) +{ + int rc; + +#ifdef AF_XDP_BUSY_POLL_SUPPORTED +#ifdef ENA_AF_XDP_SUPPORT + rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid, + rx_ring->napi->napi_id); +#else + rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid, 0); +#endif /* ENA_AF_XDP_SUPPORT */ +#else + rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid); +#endif /* AF_XDP_BUSY_POLL_SUPPORTED */ + + netif_dbg(rx_ring->adapter, ifup, rx_ring->netdev, + "Registering RX info for queue %d with napi id %d\n", + rx_ring->qid, rx_ring->napi->napi_id); + if (rc) { + netif_err(rx_ring->adapter, ifup, rx_ring->netdev, + "Failed to register xdp rx queue info. RX queue num %d rc: %d\n", + rx_ring->qid, rc); + goto err; + } + +#ifdef ENA_AF_XDP_SUPPORT + if (ENA_IS_XSK_RING(rx_ring)) { + rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_XSK_BUFF_POOL, NULL); + xsk_pool_set_rxq_info(rx_ring->xsk_pool, &rx_ring->xdp_rxq); + } else { + rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); + } +#else + rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); +#endif /* ENA_AF_XDP_SUPPORT */ + + if (rc) { + netif_err(rx_ring->adapter, ifup, rx_ring->netdev, + "Failed to register xdp rx queue info memory model. 
RX queue num %d rc: %d\n", + rx_ring->qid, rc); + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + } + +err: + return rc; +} + +#ifdef ENA_AF_XDP_SUPPORT +void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring) +{ + struct xsk_buff_pool *xsk_pool = tx_ring->xsk_pool; + int i, xsk_frames = 0; + + for (i = 0; i < tx_ring->ring_size; i++) { + struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i]; + + if (tx_info->tx_sent_jiffies) + xsk_frames++; + + tx_info->tx_sent_jiffies = 0; + } + + if (xsk_frames) + xsk_tx_completed(xsk_pool, xsk_frames); +} + +void ena_xdp_free_rx_bufs_zc(struct ena_ring *rx_ring) +{ + int i = 0; + + for (i = 0; i < rx_ring->ring_size; i++) { + struct ena_rx_buffer *rx_info = &rx_ring->rx_buffer_info[i]; + + if (rx_info->xdp) + xsk_buff_free(rx_info->xdp); + + rx_info->xdp = NULL; + } +} + +#endif /* ENA_AF_XDP_SUPPORT */ +void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring) +{ + netif_dbg(rx_ring->adapter, ifdown, rx_ring->netdev, + "Unregistering RX info for queue %d", + rx_ring->qid); + xdp_rxq_info_unreg_mem_model(&rx_ring->xdp_rxq); + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); +} + +void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, + struct bpf_prog *prog, + int first, int count) +{ + struct bpf_prog *old_bpf_prog; + struct ena_ring *rx_ring; + int i = 0; + + for (i = first; i < count; i++) { + rx_ring = &adapter->rx_ring[i]; + old_bpf_prog = xchg(&rx_ring->xdp_bpf_prog, prog); + + if (!old_bpf_prog && prog) { + rx_ring->rx_headroom = XDP_PACKET_HEADROOM; + } else if (old_bpf_prog && !prog) { + rx_ring->rx_headroom = NET_SKB_PAD; + } + } +} + +static void ena_xdp_exchange_program(struct ena_adapter *adapter, + struct bpf_prog *prog) +{ + struct bpf_prog *old_bpf_prog = xchg(&adapter->xdp_bpf_prog, prog); + + ena_xdp_exchange_program_rx_in_range(adapter, + prog, + 0, + adapter->num_io_queues); + + if (old_bpf_prog) + bpf_prog_put(old_bpf_prog); +} + +static int ena_destroy_and_free_all_xdp_queues(struct ena_adapter *adapter) +{ + bool was_up; + int rc; + + was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + + if (was_up) + ena_down(adapter); + + adapter->xdp_first_ring = 0; + adapter->xdp_num_queues = 0; + ena_xdp_exchange_program(adapter, NULL); + if (was_up) { + rc = ena_up(adapter); + if (rc) + return rc; + } + return 0; +} + +static int ena_xdp_set(struct net_device *netdev, struct netdev_bpf *bpf) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct bpf_prog *prog = bpf->prog; + struct bpf_prog *old_bpf_prog; + int rc, prev_mtu; + bool is_up; + + is_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + rc = ena_xdp_allowed(adapter); + if (rc == ENA_XDP_ALLOWED) { + old_bpf_prog = adapter->xdp_bpf_prog; + if (prog) { + if (!is_up) { + ena_init_all_xdp_queues(adapter); + } else if (!old_bpf_prog) { + ena_down(adapter); + ena_init_all_xdp_queues(adapter); + } + ena_xdp_exchange_program(adapter, prog); + + netif_dbg(adapter, drv, adapter->netdev, "Set a new XDP program\n"); + + if (is_up && !old_bpf_prog) { + rc = ena_up(adapter); + if (rc) + return rc; + } + xdp_features_set_redirect_target(netdev, false); + } else if (old_bpf_prog) { + xdp_features_clear_redirect_target(netdev); + netif_dbg(adapter, drv, adapter->netdev, "Removing XDP program\n"); + + rc = ena_destroy_and_free_all_xdp_queues(adapter); + if (rc) + return rc; + } + + prev_mtu = netdev->max_mtu; + netdev->max_mtu = prog ? 
ENA_XDP_MAX_MTU : adapter->max_mtu; + + if (!old_bpf_prog) + netif_info(adapter, drv, adapter->netdev, + "XDP program is set, changing the max_mtu from %d to %d", + prev_mtu, netdev->max_mtu); + + } else if (rc == ENA_XDP_CURRENT_MTU_TOO_LARGE) { + netif_err(adapter, drv, adapter->netdev, + "Failed to set xdp program, the current MTU (%d) is larger than the maximum allowed MTU (%lu) while xdp is on", + netdev->mtu, ENA_XDP_MAX_MTU); + NL_SET_ERR_MSG_MOD(bpf->extack, + "Failed to set xdp program, the current MTU is larger than the maximum allowed MTU. Check the dmesg for more info"); + return -EINVAL; + } else if (rc == ENA_XDP_NO_ENOUGH_QUEUES) { + netif_err(adapter, drv, adapter->netdev, + "Failed to set xdp program, the Rx/Tx channel count should be at most half of the maximum allowed channel count. The current queue count (%d), the maximal queue count (%d)\n", + adapter->num_io_queues, adapter->max_num_io_queues); + NL_SET_ERR_MSG_MOD(bpf->extack, + "Failed to set xdp program, there is no enough space for allocating XDP queues, Check the dmesg for more info"); + return -EINVAL; + } + + return 0; +} + +#ifdef ENA_AF_XDP_SUPPORT +static bool ena_is_xsk_pool_params_allowed(struct xsk_buff_pool *pool) +{ + return xsk_pool_get_headroom(pool) == 0 && + xsk_pool_get_chunk_size(pool) == ENA_PAGE_SIZE; +} + +static int ena_xsk_pool_enable(struct ena_adapter *adapter, + struct xsk_buff_pool *pool, + u16 qid) +{ + struct ena_ring *rx_ring, *tx_ring; + bool dev_was_up = false; + int err; + + if (qid >= adapter->num_io_queues) { + netdev_err(adapter->netdev, + "Max qid for XSK pool is %d (received %d)\n", + adapter->num_io_queues, qid); + return -EINVAL; + } + + if (ena_is_xsk_pool_params_allowed(pool)) + return -EINVAL; + + rx_ring = &adapter->rx_ring[qid]; + tx_ring = &adapter->tx_ring[qid]; + + err = xsk_pool_dma_map(pool, adapter->ena_dev->dmadev, 0); + if (err) { + ena_increase_stat(&rx_ring->rx_stats.dma_mapping_err, 1, + &rx_ring->syncp); + netif_err(adapter, drv, adapter->netdev, + "Failed to DMA map XSK pool for qid %d\n", qid); + return err; + } + + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) { + dev_was_up = true; + ena_down(adapter); + } + + rx_ring->xsk_pool = tx_ring->xsk_pool = pool; + + netif_dbg(adapter, drv, adapter->netdev, + "Setting XSK pool for queue %d\n", qid); + + return dev_was_up ? ena_up(adapter) : 0; +} + +static int ena_xsk_pool_disable(struct ena_adapter *adapter, + u16 qid) +{ + struct ena_ring *rx_ring, *tx_ring; + bool dev_was_up = false; + + if (qid >= adapter->num_io_queues) + return -EINVAL; + + rx_ring = &adapter->rx_ring[qid]; + tx_ring = &adapter->tx_ring[qid]; + + /* XSK pool isn't attached to this ring */ + if (!rx_ring->xsk_pool) + return 0; + + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) { + dev_was_up = true; + ena_down(adapter); + } + + xsk_pool_dma_unmap(rx_ring->xsk_pool, 0); + + rx_ring->xsk_pool = tx_ring->xsk_pool = NULL; + + netif_dbg(adapter, drv, adapter->netdev, + "Removing XSK pool for queue %d\n", qid); + + return dev_was_up ? ena_up(adapter) : 0; +} + +static int ena_xsk_pool_setup(struct ena_adapter *adapter, + struct xsk_buff_pool *pool, + u16 qid) +{ + return pool ? ena_xsk_pool_enable(adapter, pool, qid) : + ena_xsk_pool_disable(adapter, qid); +} + +#endif /* ENA_AF_XDP_SUPPORT */ +/* This is the main xdp callback, it's used by the kernel to set/unset the xdp + * program as well as to query the current xdp program id. 
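+ * On kernels built with ENA_AF_XDP_SUPPORT it also attaches and detaches the per-queue XSK buffer pool (XDP_SETUP_XSK_POOL).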
+ */ +int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf) +{ +#if defined(ENA_XDP_QUERY_IN_DRIVER) || defined(ENA_AF_XDP_SUPPORT) + struct ena_adapter *adapter = netdev_priv(netdev); + +#endif /* ENA_XDP_QUERY_IN_DRIVER || ENA_AF_XDP_SUPPORT */ + switch (bpf->command) { + case XDP_SETUP_PROG: + return ena_xdp_set(netdev, bpf); +#ifdef ENA_AF_XDP_SUPPORT + case XDP_SETUP_XSK_POOL: + return ena_xsk_pool_setup(adapter, bpf->xsk.pool, bpf->xsk.queue_id); +#endif /* ENA_AF_XDP_SUPPORT */ +#ifdef ENA_XDP_QUERY_IN_DRIVER + case XDP_QUERY_PROG: + bpf->prog_id = adapter->xdp_bpf_prog ? + adapter->xdp_bpf_prog->aux->id : 0; + break; +#endif + default: + return -EINVAL; + } + return 0; +} + +#ifdef ENA_AF_XDP_SUPPORT +int ena_xdp_xsk_wakeup(struct net_device *netdev, u32 qid, u32 flags) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_ring *tx_ring; + struct napi_struct *napi; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return -ENETDOWN; + + if (qid >= adapter->num_io_queues) + return -EINVAL; + + if (!adapter->xdp_bpf_prog) + return -ENXIO; + + tx_ring = &adapter->tx_ring[qid]; + + if (!ENA_IS_XSK_RING(tx_ring)) + return -ENXIO; + + ena_increase_stat(&tx_ring->tx_stats.xsk_wakeup_request, 1, + &tx_ring->syncp); + + napi = tx_ring->napi; + + napi_schedule(napi); + + return 0; +} + +#endif /* ENA_AF_XDP_SUPPORT */ +static int ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) +{ +#ifdef ENA_AF_XDP_SUPPORT + bool is_zc_q = ENA_IS_XSK_RING(tx_ring); +#endif /* ENA_AF_XDP_SUPPORT */ + u32 total_done = 0; + u16 next_to_clean; + int tx_pkts = 0; + u16 req_id; + int rc; + + next_to_clean = tx_ring->next_to_clean; + + while (tx_pkts < budget) { + struct ena_tx_buffer *tx_info; + struct xdp_frame *xdpf; + + rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, + &req_id); + if (rc) { + if (unlikely(rc == -EINVAL)) + handle_invalid_req_id(tx_ring, req_id, NULL, true); + else if (unlikely(rc == -EFAULT)) { + ena_reset_device(tx_ring->adapter, + ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED); + } + break; + } + + /* validate that the request id points to a valid xdp_frame */ + rc = validate_xdp_req_id(tx_ring, req_id); + if (rc) + break; + + tx_info = &tx_ring->tx_buffer_info[req_id]; + + tx_info->tx_sent_jiffies = 0; +#ifdef ENA_AF_XDP_SUPPORT + + if (is_zc_q) + goto log_xdp_packet; +#endif /* ENA_AF_XDP_SUPPORT */ + + xdpf = tx_info->xdpf; + tx_info->xdpf = NULL; + ena_unmap_tx_buff(tx_ring, tx_info); + xdp_return_frame(xdpf); + +#ifdef ENA_AF_XDP_SUPPORT +log_xdp_packet: +#endif /* ENA_AF_XDP_SUPPORT */ + tx_pkts++; + total_done += tx_info->tx_descs; +#ifdef ENA_AF_XDP_SUPPORT + tx_info->total_tx_size = 0; +#endif /* ENA_AF_XDP_SUPPORT */ + tx_ring->free_ids[next_to_clean] = req_id; + next_to_clean = ENA_TX_RING_IDX_NEXT(next_to_clean, + tx_ring->ring_size); + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d pkt #%d req_id %d\n", tx_ring->qid, tx_pkts, req_id); + } + + tx_ring->next_to_clean = next_to_clean; + ena_com_comp_ack(tx_ring->ena_com_io_sq, total_done); + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d done. 
total pkts: %d\n", + tx_ring->qid, tx_pkts); + +#ifdef ENA_AF_XDP_SUPPORT + if (is_zc_q) { + struct xsk_buff_pool *xsk_pool = tx_ring->xsk_pool; + + if (tx_pkts) + xsk_tx_completed(xsk_pool, tx_pkts); + + if (xsk_uses_need_wakeup(xsk_pool)) { + bool needs_wakeup = tx_pkts < budget; + if (needs_wakeup) + xsk_set_tx_need_wakeup(xsk_pool); + else + xsk_clear_tx_need_wakeup(xsk_pool); + } + } + +#endif /* ENA_AF_XDP_SUPPORT */ + return tx_pkts; +} + +#ifdef ENA_AF_XDP_SUPPORT +static bool ena_xdp_xmit_irq_zc(struct ena_ring *tx_ring, + struct napi_struct *napi, + int budget) +{ + struct xsk_buff_pool *xsk_pool = tx_ring->xsk_pool; + int size, rc, push_len = 0, work_done = 0; + struct ena_tx_buffer *tx_info; + struct ena_com_buf *ena_buf; + u16 next_to_use, req_id; + bool need_wakeup = true; + struct xdp_desc desc; + dma_addr_t dma; + + while (likely(work_done < budget)) { + struct ena_com_tx_ctx ena_tx_ctx = {}; + + /* We assume the maximum number of descriptors, which is two + * (meta data included) + */ + if (unlikely(!ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, 2))) + break; + + if (!xsk_tx_peek_desc(xsk_pool, &desc)) + break; + + next_to_use = tx_ring->next_to_use; + req_id = tx_ring->free_ids[next_to_use]; + tx_info = &tx_ring->tx_buffer_info[req_id]; + + size = desc.len; + + if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* Designate part of the packet for LLQ */ + push_len = min_t(u32, size, tx_ring->tx_max_header_size); + ena_tx_ctx.push_header = xsk_buff_raw_get_data(xsk_pool, desc.addr); + ena_tx_ctx.header_len = push_len; + + size -= push_len; + if (!size) + goto xmit_desc; + } + + /* Pass the rest of the descriptor as a DMA address. Assuming + * single page descriptor. + */ + dma = xsk_buff_raw_get_dma(xsk_pool, desc.addr); + ena_buf = tx_info->bufs; + ena_buf->paddr = dma + push_len; + ena_buf->len = size; + + ena_tx_ctx.ena_bufs = ena_buf; + ena_tx_ctx.num_bufs = 1; + +xmit_desc: + ena_tx_ctx.req_id = req_id; + + netif_dbg(tx_ring->adapter, tx_queued, tx_ring->netdev, + "Queueing zc packet on q %d, %s DMA part (req-id %d)\n", + tx_ring->qid, ena_tx_ctx.num_bufs ? 
"with" : "without", req_id); + + rc = ena_xmit_common(tx_ring->adapter, + tx_ring, + tx_info, + &ena_tx_ctx, + next_to_use, + desc.len); + if (rc) + break; + + work_done++; + } + + if (work_done) { + xsk_tx_release(xsk_pool); + ena_ring_tx_doorbell(tx_ring); + } + + if (work_done == budget) { + need_wakeup = false; + if (xsk_uses_need_wakeup(xsk_pool)) + xsk_clear_tx_need_wakeup(xsk_pool); + } + + return need_wakeup; +} + +static struct sk_buff *ena_xdp_rx_skb_zc(struct ena_ring *rx_ring, struct xdp_buff *xdp) +{ + u32 headroom, data_len; + struct sk_buff *skb; + void *data_addr; + + /* Assuming single-page packets for XDP */ + headroom = xdp->data - xdp->data_hard_start; + data_len = xdp->data_end - xdp->data; + data_addr = xdp->data; + + /* allocate a skb to store the frags */ + skb = __napi_alloc_skb(rx_ring->napi, + headroom + data_len, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!skb)) { + ena_increase_stat(&rx_ring->rx_stats.skb_alloc_fail, 1, + &rx_ring->syncp); + netif_err(rx_ring->adapter, rx_err, rx_ring->netdev, + "Failed to allocate skb in zc queue %d\n", rx_ring->qid); + return NULL; + } + + skb_reserve(skb, headroom); + memcpy(__skb_put(skb, data_len), data_addr, data_len); + + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + + return skb; +} + +static int ena_xdp_clean_rx_irq_zc(struct ena_ring *rx_ring, + struct napi_struct *napi, + int budget) +{ + int i, refill_required, work_done, refill_threshold, pkt_copy; + u16 next_to_clean = rx_ring->next_to_clean; + int xdp_verdict, req_id, rc, total_len; + struct ena_com_rx_ctx ena_rx_ctx; + struct ena_rx_buffer *rx_info; + bool xdp_prog_present; + struct xdp_buff *xdp; + struct sk_buff *skb; + u32 xdp_flags = 0; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "%s qid %d\n", __func__, rx_ring->qid); + + ena_rx_ctx.ena_bufs = rx_ring->ena_bufs; + ena_rx_ctx.max_bufs = rx_ring->sgl_size; + + xdp_prog_present = ena_xdp_present_ring(rx_ring); + + work_done = 0; + total_len = 0; + pkt_copy = 0; + + do { + xdp_verdict = ENA_XDP_PASS; + + /* Poll a packet from HW */ + rc = ena_com_rx_pkt(rx_ring->ena_com_io_cq, + rx_ring->ena_com_io_sq, + &ena_rx_ctx); + if (unlikely(rc)) + break; + + /* Polled all RX packets */ + if (unlikely(ena_rx_ctx.descs == 0)) + break; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "rx_poll: q %d got packet from ena. 
descs #: %d l3 proto %d l4 proto %d hash: %x\n", + rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto, + ena_rx_ctx.l4_proto, ena_rx_ctx.hash); + + /* First descriptor might have an offset set by the device */ + rx_info = &rx_ring->rx_buffer_info[ena_rx_ctx.ena_bufs[0].req_id]; + xdp = rx_info->xdp; + xdp->data += ena_rx_ctx.pkt_offset; + xdp->data_end = xdp->data + ena_rx_ctx.ena_bufs[0].len; + xsk_buff_dma_sync_for_cpu(xdp, rx_ring->xsk_pool); + + /* XDP multi-buffer packets not supported */ + if (unlikely(ena_rx_ctx.descs > 1)) { + netdev_err_once(rx_ring->adapter->netdev, + "xdp: dropped unsupported multi-buffer packets\n"); + ena_increase_stat(&rx_ring->rx_stats.xdp_drop, 1, &rx_ring->syncp); + xdp_verdict = ENA_XDP_DROP; + goto skip_xdp_prog; + } + + if (likely(xdp_prog_present)) + xdp_verdict = ena_xdp_execute(rx_ring, xdp); + +skip_xdp_prog: + /* Note that there can be several descriptors, since device + * might not honor MTU + */ + for (i = 0; i < ena_rx_ctx.descs; i++) { + req_id = rx_ring->ena_bufs[i].req_id; + rx_ring->free_ids[next_to_clean] = req_id; + next_to_clean = + ENA_RX_RING_IDX_NEXT(next_to_clean, + rx_ring->ring_size); + } + + if (likely(xdp_verdict)) { + work_done++; + total_len += ena_rx_ctx.ena_bufs[0].len; + xdp_flags |= xdp_verdict; + + /* Mark buffer as consumed when it is redirected */ + if (likely(xdp_verdict & ENA_XDP_FORWARDED)) + rx_info->xdp = NULL; + + continue; + } + + /* XDP PASS */ + skb = ena_xdp_rx_skb_zc(rx_ring, xdp); + if (unlikely(!skb)) { + rc = -ENOMEM; + break; + } + + pkt_copy++; + work_done++; + total_len += ena_rx_ctx.ena_bufs[0].len; + ena_rx_checksum(rx_ring, &ena_rx_ctx, skb); + ena_set_rx_hash(rx_ring, &ena_rx_ctx, skb); + skb_record_rx_queue(skb, rx_ring->qid); + napi_gro_receive(napi, skb); + + } while (likely(work_done <= budget)); + + rx_ring->per_napi_packets += work_done; + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bytes += total_len; + rx_ring->rx_stats.cnt += work_done; + rx_ring->rx_stats.zc_queue_pkt_copy += pkt_copy; + u64_stats_update_end(&rx_ring->syncp); + + rx_ring->next_to_clean = next_to_clean; + + if (xdp_flags & ENA_XDP_REDIRECT) + xdp_do_flush(); + + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); + refill_threshold = + min_t(int, rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER, + ENA_RX_REFILL_THRESH_PACKET); + /* Optimization, try to batch new rx buffers */ + if (refill_required > refill_threshold) + ena_refill_rx_bufs(rx_ring, refill_required); + + if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) { + if (likely(rc || work_done < budget)) { + xsk_set_rx_need_wakeup(rx_ring->xsk_pool); + ena_increase_stat(&rx_ring->rx_stats.xsk_need_wakeup_set, 1, + &rx_ring->syncp); + } else { + xsk_clear_rx_need_wakeup(rx_ring->xsk_pool); + } + } + + if (unlikely(rc)) { + struct ena_adapter *adapter = netdev_priv(rx_ring->netdev); + + if (rc == -ENOSPC) { + ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_TOO_MANY_RX_DESCS); + } else if (rc == -EIO) { + ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_INV_RX_REQ_ID); + } else if (rc == -EFAULT) { + ena_reset_device(adapter, ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED); + } + + return 0; + } + + return work_done; +} + +#endif /* ENA_AF_XDP_SUPPORT */ +/* This is the XDP napi callback. XDP queues use a separate napi callback + * than Rx/Tx queues. 
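+ * For AF_XDP (zero-copy) channels the same handler also drains the XSK Tx ring and polls the XSK Rx ring, updating the need_wakeup hints for both.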
+ */ +int ena_xdp_io_poll(struct napi_struct *napi, int budget) +{ + struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); + struct ena_ring *tx_ring; +#ifdef ENA_AF_XDP_SUPPORT + struct ena_ring *rx_ring; + bool needs_wakeup = true; +#endif /* ENA_AF_XDP_SUPPORT */ + u32 work_done; + int ret; + + tx_ring = ena_napi->tx_ring; + + if (!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) || + test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags)) { + napi_complete_done(napi, 0); + return 0; + } + + work_done = ena_clean_xdp_irq(tx_ring, budget); + +#ifdef ENA_AF_XDP_SUPPORT + /* Take XDP work into account */ + needs_wakeup &= work_done < budget; + + if (!ENA_IS_XSK_RING(tx_ring)) + goto polling_done; + + rx_ring = ena_napi->rx_ring; + + needs_wakeup &= ena_xdp_xmit_irq_zc(tx_ring, napi, budget); + + work_done = ena_xdp_clean_rx_irq_zc(rx_ring, napi, budget); + needs_wakeup &= work_done < budget; + +polling_done: +#endif /* ENA_AF_XDP_SUPPORT */ + /* If the device is about to reset or down, avoid unmask + * the interrupt and return 0 so NAPI won't reschedule + */ + if (unlikely(!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags))) { + napi_complete_done(napi, 0); + ret = 0; +#ifdef ENA_AF_XDP_SUPPORT + } else if (needs_wakeup) { +#else + } else if (budget > work_done) { +#endif /* ENA_AF_XDP_SUPPORT */ + ena_increase_stat(&tx_ring->tx_stats.napi_comp, 1, + &tx_ring->syncp); +#ifdef ENA_AF_XDP_SUPPORT + if (napi_complete_done(napi, work_done) && + READ_ONCE(ena_napi->interrupts_masked)) { + smp_rmb(); /* make sure interrupts_masked is read */ + WRITE_ONCE(ena_napi->interrupts_masked, false); + ena_unmask_interrupt(tx_ring, NULL); + /* Checking the tx_ring since for XDP channels + * napi->rx_ring is NULL and for AF_XDP both are + * xsk rings + */ + if (ENA_IS_XSK_RING(tx_ring)) + ena_update_ring_numa_node(rx_ring); + } +#else + if (napi_complete_done(napi, work_done)) + ena_unmask_interrupt(tx_ring, NULL); +#endif /* ENA_AF_XDP_SUPPORT */ + + ret = work_done; + } else { + ret = budget; + } + + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.tx_poll++; + u64_stats_update_end(&tx_ring->syncp); + tx_ring->tx_stats.last_napi_jiffies = jiffies; + + return ret; +} +#endif /* ENA_XDP_SUPPORT */ diff --git a/drivers/amazon/net/ena/ena_xdp.h b/drivers/amazon/net/ena/ena_xdp.h new file mode 100644 index 0000000000000..b468c7c58c8f1 --- /dev/null +++ b/drivers/amazon/net/ena/ena_xdp.h @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef ENA_XDP_H +#define ENA_XDP_H + +#include "ena_netdev.h" +#ifdef ENA_XDP_SUPPORT +#include +#ifdef ENA_AF_XDP_SUPPORT +#include + +#define ENA_IS_XSK_RING(ring) (!!(ring)->xsk_pool) + +#endif /* ENA_AF_XDP_SUPPORT */ + +/* The max MTU size is configured to be the ethernet frame size without + * the overhead of the ethernet header, which can have a VLAN header, and + * a frame check sequence (FCS). 
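+ * As a rough illustration (estimate only, the exact tail room depends on the kernel build): with a 4 KiB ENA_PAGE_SIZE, subtracting ETH_HLEN (14), ETH_FCS_LEN (4), VLAN_HLEN (4) and the 256 byte XDP_PACKET_HEADROOM leaves about 3818 bytes, and reserving SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) (~320 bytes on a typical 64-bit build) brings the usable MTU down to roughly 3.5 KiB.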
+ * The buffer size we share with the device is defined to be ENA_PAGE_SIZE + */ +#ifdef XDP_HAS_FRAME_SZ +#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ + VLAN_HLEN - XDP_PACKET_HEADROOM - \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) +#else +#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ + VLAN_HLEN - XDP_PACKET_HEADROOM) +#endif + +#define ENA_IS_XDP_INDEX(adapter, index) (((index) >= (adapter)->xdp_first_ring) && \ + ((index) < (adapter)->xdp_first_ring + (adapter)->xdp_num_queues)) + +enum ENA_XDP_ACTIONS { + ENA_XDP_PASS = 0, + ENA_XDP_TX = BIT(0), + ENA_XDP_REDIRECT = BIT(1), + ENA_XDP_DROP = BIT(2) +}; + +#define ENA_XDP_FEATURES (NETDEV_XDP_ACT_BASIC | \ + NETDEV_XDP_ACT_REDIRECT) + +#define ENA_XDP_FORWARDED (ENA_XDP_TX | ENA_XDP_REDIRECT) + +int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter); +void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, + struct bpf_prog *prog, + int first, int count); +int ena_xdp_io_poll(struct napi_struct *napi, int budget); +int ena_xdp_xmit_frame(struct ena_ring *tx_ring, + struct ena_adapter *adapter, + struct xdp_frame *xdpf); +int ena_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames, u32 flags); +int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf); +int ena_xdp_register_rxq_info(struct ena_ring *rx_ring); +void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring); +#ifdef ENA_AF_XDP_SUPPORT +void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring); +void ena_xdp_free_rx_bufs_zc(struct ena_ring *rx_ring); +int ena_xdp_xsk_wakeup(struct net_device *netdev, u32 qid, u32 flags); +#endif /* ENA_AF_XDP_SUPPORT */ + +enum ena_xdp_errors_t { + ENA_XDP_ALLOWED = 0, + ENA_XDP_CURRENT_MTU_TOO_LARGE, + ENA_XDP_NO_ENOUGH_QUEUES, +}; + +static inline bool ena_xdp_present(struct ena_adapter *adapter) +{ + return !!adapter->xdp_bpf_prog; +} + +static inline bool ena_xdp_present_ring(struct ena_ring *ring) +{ + return !!ring->xdp_bpf_prog; +} + +static inline bool ena_xdp_legal_queue_count(struct ena_adapter *adapter, + u32 queues) +{ + return 2 * queues <= adapter->max_num_io_queues; +} + +static inline enum ena_xdp_errors_t ena_xdp_allowed(struct ena_adapter *adapter) +{ + enum ena_xdp_errors_t rc = ENA_XDP_ALLOWED; + + if (adapter->netdev->mtu > ENA_XDP_MAX_MTU) + rc = ENA_XDP_CURRENT_MTU_TOO_LARGE; + else if (!ena_xdp_legal_queue_count(adapter, adapter->num_io_queues)) + rc = ENA_XDP_NO_ENOUGH_QUEUES; + + return rc; +} + +#ifdef ENA_AF_XDP_SUPPORT +static inline bool ena_is_zc_q_exist(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + if (ENA_IS_XSK_RING(&adapter->rx_ring[i])) + return true; + + return false; +} + +#endif /* ENA_AF_XDP_SUPPORT */ +static inline int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) +{ + u32 verdict = ENA_XDP_PASS; + struct bpf_prog *xdp_prog; + struct ena_ring *xdp_ring; + struct xdp_frame *xdpf; + u64 *xdp_stat; + + xdp_prog = READ_ONCE(rx_ring->xdp_bpf_prog); + + verdict = bpf_prog_run_xdp(xdp_prog, xdp); + + switch (verdict) { + case XDP_TX: +#ifdef XDP_CONVERT_TO_FRAME_NAME_CHANGED + xdpf = xdp_convert_buff_to_frame(xdp); +#else + xdpf = convert_to_xdp_frame(xdp); +#endif + if (unlikely(!xdpf)) { + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = ENA_XDP_DROP; + break; + } + + /* Find xmit queue */ + xdp_ring = rx_ring->xdp_ring; + + /* The XDP queues are shared between XDP_TX and XDP_REDIRECT 
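+ * paths, so the per-ring xdp_tx_lock taken below serializes transmissions coming from both.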
*/ + spin_lock(&xdp_ring->xdp_tx_lock); + + if (ena_xdp_xmit_frame(xdp_ring, rx_ring->adapter, xdpf)) + xdp_return_frame(xdpf); + + spin_unlock(&xdp_ring->xdp_tx_lock); + xdp_stat = &rx_ring->rx_stats.xdp_tx; + verdict = ENA_XDP_TX; + break; + case XDP_REDIRECT: + if (likely(!xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog))) { + xdp_stat = &rx_ring->rx_stats.xdp_redirect; + verdict = ENA_XDP_REDIRECT; + break; + } + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = ENA_XDP_DROP; + break; + case XDP_ABORTED: + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = ENA_XDP_DROP; + break; + case XDP_DROP: + xdp_stat = &rx_ring->rx_stats.xdp_drop; + verdict = ENA_XDP_DROP; + break; + case XDP_PASS: + xdp_stat = &rx_ring->rx_stats.xdp_pass; + verdict = ENA_XDP_PASS; + break; + default: + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_invalid; + verdict = ENA_XDP_DROP; + } + + ena_increase_stat(xdp_stat, 1, &rx_ring->syncp); + + return verdict; +} +#else /* ENA_XDP_SUPPORT */ + +#define ENA_IS_XDP_INDEX(adapter, index) (false) +#define xdp_return_frame(frame) do {} while (0) + +static inline bool ena_xdp_present_ring(struct ena_ring *ring) +{ + return false; +} + +static inline int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) +{ + return 0; +} + +static inline void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring) {} + +static inline bool ena_xdp_legal_queue_count(struct ena_adapter *adapter, + u32 queues) +{ + return false; +} + +static inline bool ena_xdp_present(struct ena_adapter *adapter) +{ + return false; +} +#endif /* ENA_XDP_SUPPORT */ +#endif /* ENA_XDP_H */ diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h new file mode 100644 index 0000000000000..32a9cc54dc2b5 --- /dev/null +++ b/drivers/amazon/net/ena/kcompat.h @@ -0,0 +1,1161 @@ +/******************************************************************************* +Modified by Amazon 2015-2016. +Copyright 2015-2016, Amazon.com, Inc. or its affiliates. All Rights Reserved. + +Modifications subject to the terms and conditions of the GNU General +Public License, version 2. +*******************************************************************************/ + +/******************************************************************************* + +Intel 10 Gigabit PCI Express Linux driver +Copyright(c) 1999 - 2013 Intel Corporation. + +This program is free software; you can redistribute it and/or modify it +under the terms and conditions of the GNU General Public License, +version 2, as published by the Free Software Foundation. + +This program is distributed in the hope it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + +The full GNU General Public License is included in this distribution in +the file called "COPYING". + +Contact Information: +e1000-devel Mailing List +Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 + +*******************************************************************************/ + +#ifndef _KCOMPAT_H_ +#define _KCOMPAT_H_ + +#include "config.h" + +#ifndef LINUX_VERSION_CODE +#include +#endif + +#ifndef KERNEL_VERSION +#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) +#endif + +#include + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) ) +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) +#include +#endif + +/* For ACCESS_ONCE, WRITE_ONCE and READ_ONCE macros */ +#include + +#ifndef SZ_256 +#define SZ_256 0x0000100 +#endif + +#ifndef SZ_4K +#define SZ_4K 0x00001000 +#endif + +#ifndef SZ_16K +#define SZ_16K 0x00004000 +#endif + +#ifndef __GFP_COLD +#define __GFP_COLD 0 +#endif + +#if defined(CONFIG_NET_RX_BUSY_POLL) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0) && \ + LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) +#define ENA_BUSY_POLL_SUPPORT +#endif + +/* Distribution kernel version comparison macros. + * Distribution kernel versioning format may be A.B.C-D.E.F and standard + * KERNEL_VERSION macro covers only the first 3 subversions. + * Using 20bit per subversion, as in some cases, subversion D may be a large + * number (6 digits). + */ +#define ENA_KERNEL_VERSION_16BIT(SV1, SV2, SV3) ((SV1 << 40) | (SV2 << 20) | (SV3)) +#define ENA_KERNEL_VERSION_MAJOR(SV1, SV2, SV3) ENA_KERNEL_VERSION_16BIT(SV1, SV2, SV3) +#define ENA_KERNEL_VERSION_MINOR(SV1, SV2, SV3) ENA_KERNEL_VERSION_16BIT(SV1, SV2, SV3) + +#define ENA_KERNEL_VERSION_GTE(SV1, SV2, SV3, SV4, SV5, SV6) \ + ((ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) > \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3))) || \ + (ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) == \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3)) && \ + ENA_KERNEL_VERSION_MINOR(ENA_KERNEL_SUBVERSION_4, ENA_KERNEL_SUBVERSION_5, ENA_KERNEL_SUBVERSION_6) >= \ + ENA_KERNEL_VERSION_MINOR((SV4), (SV5), (SV6)))) + +#define ENA_KERNEL_VERSION_LTE(SV1, SV2, SV3, SV4, SV5, SV6) \ + ((ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) < \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3))) || \ + (ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) == \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3)) && \ + ENA_KERNEL_VERSION_MINOR(ENA_KERNEL_SUBVERSION_4, ENA_KERNEL_SUBVERSION_5, ENA_KERNEL_SUBVERSION_6) <= \ + ENA_KERNEL_VERSION_MINOR((SV4), (SV5), (SV6)))) + +/******************************************************************************/ +/************************** Ubuntu macros *************************************/ +/******************************************************************************/ + +/* Ubuntu Release ABI is the 4th digit of their kernel version. You can find + * it in /usr/src/linux/$(uname -r)/include/generated/utsrelease.h for new + * enough versions of Ubuntu. Otherwise you can simply see it in the output of + * uname as the 4th digit of the kernel. The UTS_UBUNTU_RELEASE_ABI is not in + * the linux-source package, but in the linux-headers package. It begins to + * appear in later releases of 14.04 and 14.10. 
+ * + * Ex: + * + * $uname -r + * 3.13.0-45-generic + * ABI is 45 + * + * + * $uname -r + * 3.16.0-23-generic + * ABI is 23 + */ +#ifdef UTS_UBUNTU_RELEASE_ABI + +#if UTS_UBUNTU_RELEASE_ABI > 255 +#undef UTS_UBUNTU_RELEASE_ABI +#define UTS_UBUNTU_RELEASE_ABI 0 +#endif /* UTS_UBUNTU_RELEASE_ABI > 255 */ + +/* Ubuntu does not provide actual release version macro, so we use the kernel + * version plus the ABI to generate a unique version code specific to Ubuntu. + * In addition, we mask the lower 8 bits of LINUX_VERSION_CODE in order to + * ignore differences in sublevel which are not important since we have the + * ABI value. Otherwise, it becomes impossible to correlate ABI to version for + * ordering checks. + */ +#define UBUNTU_VERSION_CODE (((LINUX_VERSION_CODE & ~0xFF) << 8) + (UTS_UBUNTU_RELEASE_ABI)) + +#endif /* UTS_UBUNTU_RELEASE_ABI */ + +/* Note that the 3rd digit is always zero, and will be ignored. This is + * because Ubuntu kernels are based on x.y.0-ABI values, and while their linux + * version codes are 3 digit, this 3rd digit is superseded by the ABI value. + */ +#define UBUNTU_VERSION(a,b,c,d) ((KERNEL_VERSION(a,b,0) << 8) + (d)) + +/******************************************************************************/ +/**************************** SuSE macros *************************************/ +/******************************************************************************/ + +/* SuSE version macro is the same as Linux kernel version */ +#ifndef SLE_VERSION +#define SLE_VERSION(a,b,c) KERNEL_VERSION(a,b,c) +#endif +#ifdef CONFIG_SUSE_KERNEL +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 14) +#include +#endif +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,28) ) +/* SLES12 is at least 3.12.28+ based */ +#define SLE_VERSION_CODE SLE_VERSION(12,0,0) +#endif /* LINUX_VERSION_CODE == KERNEL_VERSION(x,y,z) */ +#endif /* CONFIG_SUSE_KERNEL */ +#ifndef SLE_VERSION_CODE +#define SLE_VERSION_CODE 0 +#endif /* SLE_VERSION_CODE */ +#ifndef SUSE_VERSION +#define SUSE_VERSION 0 +#endif /* SUSE_VERSION */ + +/******************************************************************************/ +/**************************** RHEL macros *************************************/ +/******************************************************************************/ + +#ifndef RHEL_RELEASE_VERSION +#define RHEL_RELEASE_VERSION(a,b) (((a) << 8) + (b)) +#endif +#ifndef AX_RELEASE_VERSION +#define AX_RELEASE_VERSION(a,b) (((a) << 8) + (b)) +#endif + +#ifndef AX_RELEASE_CODE +#define AX_RELEASE_CODE 0 +#endif + +#ifndef RHEL_RELEASE_CODE +#define RHEL_RELEASE_CODE 0 +#endif + +#if (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,0)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,0) +#elif (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,1)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,1) +#elif (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,2)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,3) +#endif + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,6)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) +#define HAVE_RHEL6_NET_DEVICE_OPS_EXT +#endif + +#if (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,4)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) +#define HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT +#endif /* RHEL >= 6.4 && RHEL < 7.0 */ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) || \ + (SLE_VERSION_CODE && \ + 
LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,48))) +#define HAVE_MTU_MIN_MAX_IN_NET_DEVICE +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,11,0) || \ + (RHEL_RELEASE_CODE && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)) || \ + (SLE_VERSION_CODE && \ + LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,50))) +#define NDO_GET_STATS_64_V2 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0) || \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,5)) +#include +#endif + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) ) +/* The function netif_set_real_num_tx_queues() doesn't return value for + * kernels < 2.6.37 + */ +static inline int _kc_netif_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq) +{ + netif_set_real_num_tx_queues(dev, txq); + return 0; +} +#define netif_set_real_num_tx_queues(dev, txq) \ + _kc_netif_set_real_num_tx_queues(dev, txq) + +#endif /* < 2.6.37 */ + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ) +#if !(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,5)) +typedef u32 netdev_features_t; +#endif +#undef PCI_EXP_TYPE_RC_EC +#define PCI_EXP_TYPE_RC_EC 0xa /* Root Complex Event Collector */ +#ifndef CONFIG_BQL +#define netdev_tx_completed_queue(_q, _p, _b) do {} while (0) +#define netdev_completed_queue(_n, _p, _b) do {} while (0) +#define netdev_tx_sent_queue(_q, _b) do {} while (0) +#define netdev_sent_queue(_n, _b) do {} while (0) +#define netdev_tx_reset_queue(_q) do {} while (0) +#define netdev_reset_queue(_n) do {} while (0) +#endif + +#endif /* < 3.3.0 */ + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) ) +#ifdef NET_ADDR_RANDOM +#define eth_hw_addr_random(N) do { \ + eth_random_addr(N->dev_addr); \ + N->addr_assign_type |= NET_ADDR_RANDOM; \ + } while (0) +#else /* NET_ADDR_RANDOM */ +#define eth_hw_addr_random(N) eth_random_addr(N->dev_addr) +#endif /* NET_ADDR_RANDOM */ +#if !(RHEL_RELEASE_CODE) +/* If probe retry doesn't define, return no device */ +#define EPROBE_DEFER ENODEV +#endif +#endif /* >= 3.4.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) ) +#if !(RHEL_RELEASE_CODE) +static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) +{ + const u16 *a = (const u16 *)addr1; + const u16 *b = (const u16 *)addr2; + + return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0; +} +#endif +#endif /* >= 3.5.0 */ + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) ) +#ifndef eth_random_addr +#define eth_random_addr _kc_eth_random_addr +static inline void _kc_eth_random_addr(u8 *addr) +{ + get_random_bytes(addr, ETH_ALEN); + addr[0] &= 0xfe; /* clear multicast */ + addr[0] |= 0x02; /* set local assignment */ +} +#endif +#endif /* < 3.6.0 */ + +/******************************************************************************/ +#ifndef CONFIG_NET_RX_BUSY_POLL +static inline void skb_mark_napi_id(struct sk_buff *skb, + struct napi_struct *napi) +{ + +} + +static inline void napi_hash_del(struct napi_struct *napi) +{ + +} + +static inline void napi_hash_add(struct napi_struct *napi) +{ + +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0) ) +/* cpu_rmap is buggy on older version and causes 
dead lock */ +#ifdef CONFIG_RFS_ACCEL +#undef CONFIG_RFS_ACCEL +#endif + +#if !(RHEL_RELEASE_CODE) +static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) +{ + return index % n_rx_rings; +} +#endif +#endif /* >= 3.8.0 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) +#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) +# define u64_stats_init(syncp) seqcount_init(syncp.seq) +#else +# define u64_stats_init(syncp) do { } while (0) +#endif + +#if !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ + !(RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,8) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) \ + || (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)))) && \ + !defined(UEK3_RELEASE) +static inline void reinit_completion(struct completion *x) +{ + x->done = 0; +} +#endif /* SLE 12 */ + +#endif /* < 3.13.0 */ + +#if (( LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) ) && \ + (!(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,0) && \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0))) \ + && !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))&& \ + !defined(UEK3_RELEASE))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,13,0,30)) +static inline int pci_enable_msix_range(struct pci_dev *dev, + struct msix_entry *entries, + int minvec, + int maxvec) +{ + int nvec = maxvec; + int rc; + + if (maxvec < minvec) + return -ERANGE; + + do { + rc = pci_enable_msix(dev, entries, nvec); + if (rc < 0) { + return rc; + } else if (rc > 0) { + if (rc < minvec) + return -ENOSPC; + nvec = rc; + } + } while (rc); + + return nvec; +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,1)) +static inline void *devm_kcalloc(struct device *dev, + size_t n, size_t size, gfp_t flags) +{ + return devm_kzalloc(dev, n * size, flags | __GFP_ZERO); +} +#endif + +/*****************************************************************************/ +#if (( LINUX_VERSION_CODE < KERNEL_VERSION(3,13,8) ) && \ + !RHEL_RELEASE_CODE && \ + !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,13,0,30)) +enum pkt_hash_types { + PKT_HASH_TYPE_NONE, /* Undefined type */ + PKT_HASH_TYPE_L2, /* Input: src_MAC, dest_MAC */ + PKT_HASH_TYPE_L3, /* Input: src_IP, dst_IP */ + PKT_HASH_TYPE_L4, /* Input: src_IP, dst_IP, src_port, dst_port */ +}; + +static inline void skb_set_hash(struct sk_buff *skb, __u32 hash, + enum pkt_hash_types type) +{ + skb->l4_rxhash = (type == PKT_HASH_TYPE_L4); + skb->rxhash = hash; +} +#endif + +/*****************************************************************************/ +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) +#if !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0) && \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(6,6)) \ + && !(defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,105)) +static inline int pci_msix_vec_count(struct pci_dev *dev) +{ + int pos; + u16 control; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (!pos) + return -EINVAL; + + pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &control); + return (control & 0x7FF) + 1; +} +#if !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ + !(RHEL_RELEASE_CODE == RHEL_RELEASE_VERSION(7,0)) +static inline void ether_addr_copy(u8 *dst, const u8 *src) +{ + memcpy(dst, src, 6); +} +#endif /* SLE 12 */ 
+#endif /* RHEL 7 */ +#endif + +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(6,8))) +#define napi_gro_flush(napi, flush_old) napi_gro_flush(napi) +#endif + +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,15,0) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,30))) || \ + (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) || \ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0) \ + && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,1)) +#else +static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, + unsigned int start) +{ + return u64_stats_fetch_retry(syncp, start); +} + +static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) +{ + return u64_stats_fetch_begin(syncp); +} + +#endif + +static inline bool ena_u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9,3)) + return u64_stats_fetch_retry_irq(syncp, start); +#else + return u64_stats_fetch_retry(syncp, start); +#endif +} + +static inline unsigned int ena_u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9,3)) + return u64_stats_fetch_begin_irq(syncp); +#else + return u64_stats_fetch_begin(syncp); +#endif +} + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)))) + +#define smp_mb__before_atomic() smp_mb() + +#endif + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) ) +#undef GENMASK +#define GENMASK(h, l) (((U32_C(1) << ((h) - (l) + 1)) - 1) << (l)) +#undef GENMASK_ULL +#define GENMASK_ULL(h, l) (((U64_C(1) << ((h) - (l) + 1)) - 1) << (l)) +#endif +/*****************************************************************************/ + +#ifndef dma_rmb +#define dma_rmb rmb +#endif + +#ifndef writel_relaxed +#define writel_relaxed writel +#endif + +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0) ) \ + || (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) \ + || (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0)) +#else +static inline void netdev_rss_key_fill(void *buffer, size_t len) +{ + get_random_bytes(buffer, len); +} +#endif + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) ) && \ + !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) + +static inline void napi_schedule_irqoff(struct napi_struct *n) +{ + napi_schedule(n); +} + +static inline void __napi_schedule_irqoff(struct napi_struct *n) +{ + __napi_schedule(n); +} + +#ifndef READ_ONCE +#define READ_ONCE(var) (*((volatile typeof(var) *)(&(var)))) +#endif +#endif /* Kernel 3.19 */ + +/*****************************************************************************/ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) \ + || (RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6,7)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) \ + || RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) \ + || (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,19,0,51)) +#else +static inline void napi_complete_done(struct napi_struct *n, int work_done) +{ + 
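+	/* Older kernels have no napi_complete_done(); fall back to plain napi_complete() and drop the work_done hint. */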
napi_complete(n); +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0) \ + || (defined(UBUNTU_VERSION_CODE) && \ + (UBUNTU_VERSION(3,13,0,126) <= UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,14,0,0))) \ + || (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)) + +#else + +static inline void ioremap_release(struct device *dev, void *res) +{ + iounmap(*(void __iomem **)res); +} + + +static inline void __iomem *devm_ioremap_wc(struct device *dev, + resource_size_t offset, + resource_size_t size) +{ + void __iomem **ptr, *addr; + + ptr = devres_alloc(ioremap_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + addr = ioremap_wc(offset, size); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else + devres_free(ptr); + + return addr; +} +#endif + +#if RHEL_RELEASE_CODE && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 5) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) +#define ndo_change_mtu ndo_change_mtu_rh74 +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)) +#ifndef dma_zalloc_coherent +#define dma_zalloc_coherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) +#endif +#endif + +#ifndef dev_info_once +#ifdef CONFIG_PRINTK +#define dev_info_once(dev, fmt, ...) \ +do { \ + static bool __print_once __read_mostly; \ + \ + if (!__print_once) { \ + __print_once = true; \ + dev_info(dev, fmt, ##__VA_ARGS__); \ + } \ +} while (0) +#else +#define dev_info_once(dev, fmt, ...) \ +do { \ + if (0) \ + dev_info(dev, fmt, ##__VA_ARGS__); \ +} while (0) +#endif +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0)) && \ + !(RHEL_RELEASE_CODE && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 2)) +#define netdev_xmit_more() (skb->xmit_more) +#endif + +#ifndef mmiowb +#define MMIOWB_NOT_DEFINED +#endif + +/* In the driver we currently only support CRC32 and Toeplitz. + * Since in kernel erlier than 4.12 the CRC32 define didn't exist + * We define it here to be XOR. Any user who wishes to select CRC32 + * as the hash function, can do so by choosing xor through ethtool. 
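+ * (hence the ETH_RSS_HASH_CRC32 -> ETH_RSS_HASH_XOR alias just below).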
+ */ +#ifndef ETH_RSS_HASH_CRC32 +#define ETH_RSS_HASH_CRC32 ETH_RSS_HASH_XOR +#endif + +#ifndef _ULL +#define _ULL(x) (_AC(x, ULL)) +#endif + +#ifndef ULL +#define ULL(x) (_ULL(x)) +#endif + +#ifndef BIT_ULL +#define BIT_ULL(nr) (ULL(1) << (nr)) +#endif + +#ifndef BITS_PER_TYPE +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) +#endif + +#ifndef DIV_ROUND_DOWN_ULL +#define DIV_ROUND_DOWN_ULL(ll, d) \ + ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; }) +#endif + +/* values are taken from here: https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md */ + +#if defined(CONFIG_BPF) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) || \ + (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 5)))) +#define ENA_XDP_SUPPORT +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5,8,0)) || \ + (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 3) || \ + (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 5))) +#define XDP_HAS_FRAME_SZ +#define XDP_CONVERT_TO_FRAME_NAME_CHANGED +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 5)) +#define ENA_XDP_QUERY_IN_DRIVER +#endif + +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) || \ + (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 3))) || \ + (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 3) + +#define HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) && \ + !(RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 1)) && \ + (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6, 6)))) && \ + !defined(UBUNTU_VERSION_CODE) && \ + !defined(UEK3_RELEASE) && (!defined(DEBIAN_VERSION) || DEBIAN_VERSION != 8) + +#define DO_ONCE(func, ...) \ + ({ \ + static bool ___done = false; \ + if (unlikely(!___done)) { \ + func(__VA_ARGS__); \ + ___done = true; \ + } \ + }) + +#define get_random_once(buf, nbytes) \ + DO_ONCE(get_random_bytes, (buf), (nbytes)) + +#define net_get_random_once(buf, nbytes) \ + get_random_once((buf), (nbytes)) + +/* RSS keys are 40 or 52 bytes long */ +#define NETDEV_RSS_KEY_LEN 52 +static u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; + +static inline void netdev_rss_key_fill(void *buffer, size_t len) +{ + BUG_ON(len > sizeof(netdev_rss_key)); + net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key)); + memcpy(buffer, netdev_rss_key, len); +} +#endif + +#ifndef WRITE_ONCE +#define WRITE_ONCE(x, val) (ACCESS_ONCE(x) = val) +#endif +#ifndef READ_ONCE +#define READ_ONCE(x) ACCESS_ONCE(x) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9 ,0) +#define ENA_GENERIC_PM_OPS +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(4, 6 ,0)) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3))) +/* Linux versions 4.4.216 - 4.5 (non inclusive) back propagated + * page_ref_count() from kernel 4.6. + * Ubuntu didn't add these changes to its 4.4.* kernels. 
+ * UEK added this function in kernel 4.1.12-124.43.1 + * Here is a figure that shows all of the cases: + * Legend: + * -------- page_ref_count() is present in the kernel + * ******** page_ref_count() is missing in the kernel + * + * Distro\Kernel 4.1.12-124.43.1 4.4.216 4.5 4.6 + * | | | | + * Upstrem kernel ***********|**************|--------|******| + * | | | | + * Ubuntu ***********|**************|********|******| + * | | | | + * UEK ***********|--------------|--------|------| + */ +#if (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 124, 43, 1)) || \ + (defined(UBUNTU_VERSION_CODE)) || \ + (!defined(IS_UEK) && !defined(UBUNTU_VERSION_CODE) && \ + !(KERNEL_VERSION(4, 4, 216) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))) +static inline int page_ref_count(struct page *page) +{ + return atomic_read(&page->_count); +} +#endif /* (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 124, 43, 1)) ... */ + +static inline void page_ref_inc(struct page *page) +{ + atomic_inc(&page->_count); +} +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0)) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2))) +static inline struct page *dev_alloc_page(void) +{ + gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; + + gfp_mask |= __GFP_COLD | __GFP_COMP; + + return alloc_pages_node(NUMA_NO_NODE, gfp_mask, 0); +} +#endif + +/* This entry might seem strange because of the #ifndef numa_mem_id(), + * but these defines were taken from the Linux kernel + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) +#ifndef numa_mem_id +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +static inline int numa_mem_id(void) +{ + return __this_cpu_read(_numa_mem_); +} +#else /* CONFIG_HAVE_MEMORYLESS_NODES */ +static inline int numa_mem_id(void) +{ + return numa_node_id(); +} +#endif /* CONFIG_HAVE_MEMORYLESS_NODES */ +#endif /* numa_mem_id */ +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) */ + +#ifndef fallthrough +#define fallthrough do {} while (0) /* fallthrough */ +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) || \ + (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 5)) +#define AF_XDP_BUSY_POLL_SUPPORTED +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) +#define ENA_LINEAR_FRAG_SUPPORTED +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) +#define ENA_NETDEV_LOGS_WITHOUT_RV +#endif + +#if defined(ENA_XDP_SUPPORT) && \ + (LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL == 3) && \ + ENA_KERNEL_VERSION_GTE(5, 3, 18, 150300, 59, 49))) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 5)) +static __always_inline void +xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) +{ + xdp->rxq = rxq; +#ifdef XDP_HAS_FRAME_SZ + xdp->frame_sz = frame_sz; +#endif +} + +static __always_inline void +xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, + int headroom, int data_len, const bool meta_valid) +{ + unsigned char *data = hard_start + headroom; + + xdp->data_hard_start = hard_start; + xdp->data = data; + xdp->data_end = data + data_len; + xdp->data_meta = meta_valid ? data : data + 1; +} + +#endif /* defined(ENA_XDP_SUPPORT) && (LINUX_VERSION_CODE <= KERNEL_VERSION(5, 12, 0) && !SUSE_VERSION(...)) */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 13, 0) +#define ethtool_sprintf(data, fmt, args...) 
\ + do { \ + snprintf(*data, ETH_GSTRING_LEN, fmt, ##args); \ + (*data) += ETH_GSTRING_LEN; \ + } while(0) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 13, 0) +#define ENA_XDP_XMIT_FREES_FAILED_DESCS_INTERNALLY +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) && \ + !(LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 188) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0)) && \ + !(LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 251) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(5, 5, 0))) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6)) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL == 3) && \ + ENA_KERNEL_VERSION_GTE(5, 3, 18, 150300, 59, 43)) +static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) +{ + memcpy(dev->dev_addr, addr, ETH_ALEN); +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) || \ + (defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6) && \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(9, 0)) || \ + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) +#define ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) || \ + (defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 7) && \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(9, 0)) || \ + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 5)) +#define ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE +#endif + +#if defined(ENA_XDP_SUPPORT) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +#define ENA_AF_XDP_SUPPORT +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0) +/* kernels older than 3.3.0 didn't have this function and + * used netif_tx_queue_stopped() for the same purpose + */ +static inline int netif_xmit_stopped(const struct netdev_queue *dev_queue) +{ + return netif_tx_queue_stopped(dev_queue); +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) +#define NAPIF_STATE_SCHED BIT(NAPI_STATE_SCHED) +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)) && \ + !(defined(IS_UEK) && ENA_KERNEL_VERSION_GTE(5, 15, 0, 100, 96, 32)) && \ + !(defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 7)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9, 0))) && \ + !(defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 1)))) +#define bpf_warn_invalid_xdp_action(netdev, xdp_prog, verdict) \ + bpf_warn_invalid_xdp_action(verdict) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) +#define HAS_BPF_HEADER +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 8, 0)) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 7)))) +static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2) +{ + if (cmp1.tv64 < cmp2.tv64) + return -1; + if (cmp1.tv64 > cmp2.tv64) + return 1; + return 0; +} +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0)) && \ + !(RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 7)) && \ + (RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 0)) && \ + (RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 1)))) +static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) +{ + return ktime_compare(cmp1, cmp2) > 0; +} +#endif + +#if IS_ENABLED(CONFIG_PTP_1588_CLOCK) + +#if defined(ENA_PHC_INCLUDE) && ((LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)) || \ + 
(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4))) +#define ENA_PHC_SUPPORT +#endif /* ENA_PHC_SUPPORT */ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 1, 0)) || \ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 2)) +#define ENA_PHC_SUPPORT_GETTIME64 +#endif /* ENA_PHC_SUPPORT_GETTIME64 */ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)) || \ + (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 7)) && \ + (RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(8, 0))) +#define ENA_PHC_SUPPORT_GETTIME64_EXTENDED +#endif /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ + +#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)) && (LINUX_VERSION_CODE < KERNEL_VERSION(6, 2, 0))) +#define ENA_PHC_SUPPORT_ADJFREQ +#endif /* ENA_PHC_SUPPORT_ADJFREQ */ + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 7, 0)) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4)))) +#define ptp_clock_register(info, parent) ptp_clock_register(info) +#endif + +#endif /* CONFIG_PTP_1588_CLOCK */ + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0)) && \ + !(RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 2))) +static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, + unsigned int length) +{ + return netdev_alloc_skb_ip_align(napi->dev, length); +} +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0)) && \ + !(RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 7))) +static inline ssize_t strscpy(char *dest, const char *src, size_t count) +{ + return (ssize_t)strlcpy(dest, src, count); +} +#endif + +static inline void ena_netif_napi_add(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int)) +{ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0)) && \ + !(RHEL_RELEASE_CODE && \ + ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 8)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9, 0))) || \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 2))) +#ifndef NAPI_POLL_WEIGHT +#define NAPI_POLL_WEIGHT 64 +#endif + netif_napi_add(dev, napi, poll, NAPI_POLL_WEIGHT); +#else + netif_napi_add(dev, napi, poll); +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) */ +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0) +#define ENA_LARGE_LLQ_ETHTOOL +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0) +#include +#define ENA_FIELD_GET(value, mask, offset) FIELD_GET(mask, value) +#else +#define ENA_FIELD_GET(value, mask, offset) ((typeof(mask))((value & mask) >> offset)) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) +#define xdp_features_set_redirect_target(netdev, xdp_xmit_supported) +#define xdp_features_clear_redirect_target(netdev) +#define xdp_clear_features_flag(netdev) +#define xdp_set_features_flag(netdev, features) +#else /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) */ +#define ENA_XDP_NETLINK_ADVERTISEMENT +#endif + +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 4))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4, 5, 0, 0)) || \ + (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 105, 0, 0)) +static inline void dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, attrs); + debug_dma_unmap_page(dev, addr, size, dir, false); +} +#endif /* 
RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 4)) */ + +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 9)) && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 0)) && \ + (LINUX_VERSION_CODE != KERNEL_VERSION(4, 14, 0))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4, 5, 0, 0)) || \ + (defined(IS_UEK) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 13)) +#define ENA_DMA_ATTR_SKIP_CPU_SYNC (1 << DMA_ATTR_SKIP_CPU_SYNC) +#elif (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(6, 10))) +#define ENA_DMA_ATTR_SKIP_CPU_SYNC 0 +#else +#define ENA_DMA_ATTR_SKIP_CPU_SYNC DMA_ATTR_SKIP_CPU_SYNC +#endif + +static inline void ena_dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 9)) && \ + (LINUX_VERSION_CODE != KERNEL_VERSION(4, 14, 0))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4, 5, 0, 0)) || \ + (defined(IS_UEK) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 13)) + struct dma_attrs dma_attrs; + + init_dma_attrs(&dma_attrs); + dma_attrs.flags[0] = attrs; + dma_unmap_page_attrs(dev, addr, size, dir, &dma_attrs); +#else + dma_unmap_page_attrs(dev, addr, size, dir, attrs); +#endif +} + +#ifndef ENA_HAVE_PCI_DEV_ID +#define pci_dev_id(pdev) ((((u16)(pdev->bus->number)) << 8) | (pdev->devfn)) +#endif /* ENA_HAVE_PCI_DEV_ID */ + +#ifndef ENA_HAVE_XDP_DO_FLUSH +#define xdp_do_flush xdp_do_flush_map +#endif /* ENA_HAVE_XDP_DO_FLUSH */ + +#ifndef ENA_HAVE_CPUMASK_LOCAL_SPREAD +static inline unsigned int cpumask_local_spread(unsigned int i, int node) +{ + unsigned int cpu; + + /* Wrap: we always want a cpu. */ + i %= num_online_cpus(); + + if (node == NUMA_NO_NODE) { + for_each_cpu(cpu, cpu_online_mask) + if (i-- == 0) + return cpu; + } else { + /* NUMA first. */ + for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask) + if (i-- == 0) + return cpu; + + for_each_cpu(cpu, cpu_online_mask) { + /* Skip NUMA nodes, done above. */ + if (cpumask_test_cpu(cpu, cpumask_of_node(node))) + continue; + + if (i-- == 0) + return cpu; + } + } + return 0; +} +#endif /* ENA_HAVE_CPUMASK_LOCAL_SPREAD */ + +#ifndef ENA_HAVE_UPDATE_AFFINITY_HINT +static inline int irq_update_affinity_hint(unsigned int irq, const struct cpumask *m) +{ + return 0; +} +#endif /* ENA_HAVE_UPDATE_AFFINITY_HINT */ + +#ifndef ENA_HAVE_ETHTOOL_PUTS +#define ethtool_puts ethtool_sprintf +#endif /* ENA_HAVE_ETHTOOL_PUTS */ + +#endif /* _KCOMPAT_H_ */ diff --git a/drivers/amazon/net/ena/net_dim.c b/drivers/amazon/net/ena/net_dim.c new file mode 100644 index 0000000000000..af46903cd53e2 --- /dev/null +++ b/drivers/amazon/net/ena/net_dim.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + */ + +#include "dim.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) + +/* + * Net DIM profiles: + * There are different set of profiles for each CQ period mode. + * There are different set of profiles for RX/TX CQs. 
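+ * Each entry below is a {usec, pkts} moderation pair. The EQE profiles
+ * keep the packet budget fixed and only step the interrupt delay, while
+ * the CQE profiles vary both values. The outer array dimension is
+ * indexed by the CQ period mode, matching DIM_CQ_PERIOD_NUM_MODES.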
+ * Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES + */ +#define NET_DIM_PARAMS_NUM_PROFILES 5 +#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128 +#define NET_DIM_DEF_PROFILE_CQE 1 +#define NET_DIM_DEF_PROFILE_EQE 1 + +#define NET_DIM_RX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ +} + +#define NET_DIM_RX_CQE_PROFILES { \ + {2, 256}, \ + {8, 128}, \ + {16, 64}, \ + {32, 64}, \ + {64, 64} \ +} + +#define NET_DIM_TX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {32, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE} \ +} + +#define NET_DIM_TX_CQE_PROFILES { \ + {5, 128}, \ + {8, 64}, \ + {16, 32}, \ + {32, 32}, \ + {64, 32} \ +} + +static const struct dim_cq_moder +rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_RX_EQE_PROFILES, + NET_DIM_RX_CQE_PROFILES, +}; + +static const struct dim_cq_moder +tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_TX_EQE_PROFILES, + NET_DIM_TX_CQE_PROFILES, +}; + +struct dim_cq_moder +net_dim_get_rx_moderation(u8 cq_period_mode, int ix) +{ + struct dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix]; + + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} + +struct dim_cq_moder +net_dim_get_def_rx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ? + NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_rx_moderation(cq_period_mode, profile_ix); +} + +struct dim_cq_moder +net_dim_get_tx_moderation(u8 cq_period_mode, int ix) +{ + struct dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix]; + + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} + +struct dim_cq_moder +net_dim_get_def_tx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ? + NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_tx_moderation(cq_period_mode, profile_ix); +} + +static int net_dim_step(struct dim *dim) +{ + if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2)) + return DIM_TOO_TIRED; + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + break; + case DIM_GOING_RIGHT: + if (dim->profile_ix == (NET_DIM_PARAMS_NUM_PROFILES - 1)) + return DIM_ON_EDGE; + dim->profile_ix++; + dim->steps_right++; + break; + case DIM_GOING_LEFT: + if (dim->profile_ix == 0) + return DIM_ON_EDGE; + dim->profile_ix--; + dim->steps_left++; + break; + } + + dim->tired++; + return DIM_STEPPED; +} + +static void net_dim_exit_parking(struct dim *dim) +{ + dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT : DIM_GOING_RIGHT; + net_dim_step(dim); +} + +static int net_dim_stats_compare(struct dim_stats *curr, + struct dim_stats *prev) +{ + if (!prev->bpms) + return curr->bpms ? DIM_STATS_BETTER : DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms)) + return (curr->bpms > prev->bpms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (!prev->ppms) + return curr->ppms ? 
DIM_STATS_BETTER : + DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms)) + return (curr->ppms > prev->ppms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (!prev->epms) + return DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms)) + return (curr->epms < prev->epms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + return DIM_STATS_SAME; +} + +static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim) +{ + int prev_state = dim->tune_state; + int prev_ix = dim->profile_ix; + int stats_res; + int step_res; + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + stats_res = net_dim_stats_compare(curr_stats, + &dim->prev_stats); + if (stats_res != DIM_STATS_SAME) + net_dim_exit_parking(dim); + break; + + case DIM_PARKING_TIRED: + dim->tired--; + if (!dim->tired) + net_dim_exit_parking(dim); + break; + + case DIM_GOING_RIGHT: + case DIM_GOING_LEFT: + stats_res = net_dim_stats_compare(curr_stats, + &dim->prev_stats); + if (stats_res != DIM_STATS_BETTER) + dim_turn(dim); + + if (dim_on_top(dim)) { + dim_park_on_top(dim); + break; + } + + step_res = net_dim_step(dim); + switch (step_res) { + case DIM_ON_EDGE: + dim_park_on_top(dim); + break; + case DIM_TOO_TIRED: + dim_park_tired(dim); + break; + } + + break; + } + + if (prev_state != DIM_PARKING_ON_TOP || + dim->tune_state != DIM_PARKING_ON_TOP) + dim->prev_stats = *curr_stats; + + return dim->profile_ix != prev_ix; +} + +void net_dim(struct dim *dim, struct dim_sample end_sample) +{ + struct dim_stats curr_stats; + u16 nevents; + + switch (dim->state) { + case DIM_MEASURE_IN_PROGRESS: + nevents = BIT_GAP(BITS_PER_TYPE(u16), + end_sample.event_ctr, + dim->start_sample.event_ctr); + if (nevents < DIM_NEVENTS) + break; + dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats); + if (net_dim_decision(&curr_stats, dim)) { + dim->state = DIM_APPLY_NEW_PROFILE; + schedule_work(&dim->work); + break; + } + /* fall through */ + case DIM_START_MEASURE: + dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr, + end_sample.byte_ctr, &dim->start_sample); + dim->state = DIM_MEASURE_IN_PROGRESS; + break; + case DIM_APPLY_NEW_PROFILE: + break; + } +} + +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */ diff --git a/drivers/amazon/net/igb_uio/Makefile b/drivers/amazon/net/igb_uio/Makefile new file mode 100644 index 0000000000000..ebced2786f7c8 --- /dev/null +++ b/drivers/amazon/net/igb_uio/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_AMAZON_IGB_UIO) += igb_uio.o diff --git a/drivers/amazon/net/igb_uio/compat.h b/drivers/amazon/net/igb_uio/compat.h new file mode 100644 index 0000000000000..8dbb896ae1185 --- /dev/null +++ b/drivers/amazon/net/igb_uio/compat.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Minimal wrappers to allow compiling igb_uio on older kernels. 
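+ * Each wrapper below is guarded by LINUX_VERSION_CODE and, where needed,
+ * RHEL_RELEASE_CODE checks, so kernels that already provide the helper
+ * keep using their native implementation.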
+ */ + +#ifndef RHEL_RELEASE_VERSION +#define RHEL_RELEASE_VERSION(a, b) (((a) << 8) + (b)) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0) +#define pci_cfg_access_lock pci_block_user_cfg_access +#define pci_cfg_access_unlock pci_unblock_user_cfg_access +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) +#define HAVE_PTE_MASK_PAGE_IOMAP +#endif + +#ifndef PCI_MSIX_ENTRY_SIZE +#define PCI_MSIX_ENTRY_SIZE 16 +#define PCI_MSIX_ENTRY_VECTOR_CTRL 12 +#define PCI_MSIX_ENTRY_CTRL_MASKBIT 1 +#endif + +/* + * for kernels < 2.6.38 and backported patch that moves MSI-X entry definition + * to pci_regs.h Those kernels has PCI_MSIX_ENTRY_SIZE defined but not + * PCI_MSIX_ENTRY_CTRL_MASKBIT + */ +#ifndef PCI_MSIX_ENTRY_CTRL_MASKBIT +#define PCI_MSIX_ENTRY_CTRL_MASKBIT 1 +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(5, 9))) + +static int pci_num_vf(struct pci_dev *dev) +{ + struct iov { + int pos; + int nres; + u32 cap; + u16 ctrl; + u16 total; + u16 initial; + u16 nr_virtfn; + } *iov = (struct iov *)dev->sriov; + + if (!dev->is_physfn) + return 0; + + return iov->nr_virtfn; +} + +#endif /* < 2.6.34 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 39) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4))) + +#define kstrtoul strict_strtoul + +#endif /* < 2.6.39 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 3))) + +/* Check if INTX works to control irq's. + * Set's INTX_DISABLE flag and reads it back + */ +static bool pci_intx_mask_supported(struct pci_dev *pdev) +{ + bool mask_supported = false; + uint16_t orig, new; + + pci_block_user_cfg_access(pdev); + pci_read_config_word(pdev, PCI_COMMAND, &orig); + pci_write_config_word(pdev, PCI_COMMAND, + orig ^ PCI_COMMAND_INTX_DISABLE); + pci_read_config_word(pdev, PCI_COMMAND, &new); + + if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) { + dev_err(&pdev->dev, "Command register changed from " + "0x%x to 0x%x: driver or hardware bug?\n", orig, new); + } else if ((new ^ orig) & PCI_COMMAND_INTX_DISABLE) { + mask_supported = true; + pci_write_config_word(pdev, PCI_COMMAND, orig); + } + pci_unblock_user_cfg_access(pdev); + + return mask_supported; +} + +static bool pci_check_and_mask_intx(struct pci_dev *pdev) +{ + bool pending; + uint32_t status; + + pci_block_user_cfg_access(pdev); + pci_read_config_dword(pdev, PCI_COMMAND, &status); + + /* interrupt is not ours, goes to out */ + pending = (((status >> 16) & PCI_STATUS_INTERRUPT) != 0); + if (pending) { + uint16_t old, new; + + old = status; + if (status != 0) + new = old & (~PCI_COMMAND_INTX_DISABLE); + else + new = old | PCI_COMMAND_INTX_DISABLE; + + if (old != new) + pci_write_config_word(pdev, PCI_COMMAND, new); + } + pci_unblock_user_cfg_access(pdev); + + return pending; +} + +#endif /* < 3.3.0 */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +#define HAVE_PCI_IS_BRIDGE_API 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +#define HAVE_MSI_LIST_IN_GENERIC_DEVICE 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) +#define HAVE_PCI_MSI_MASK_IRQ 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0) +#define HAVE_ALLOC_IRQ_VECTORS 1 +#endif + +static inline bool igbuio_kernel_is_locked_down(void) +{ +#ifdef CONFIG_LOCK_DOWN_KERNEL +#ifdef CONFIG_LOCK_DOWN_IN_EFI_SECURE_BOOT + return kernel_is_locked_down(NULL); 
+#elif defined(CONFIG_EFI_SECURE_BOOT_LOCK_DOWN) + return kernel_is_locked_down(); +#else + return false; +#endif +#else + return false; +#endif +} diff --git a/drivers/amazon/net/igb_uio/igb_uio.c b/drivers/amazon/net/igb_uio/igb_uio.c new file mode 100644 index 0000000000000..ea439d131de1a --- /dev/null +++ b/drivers/amazon/net/igb_uio/igb_uio.c @@ -0,0 +1,674 @@ +// SPDX-License-Identifier: GPL-2.0 +/*- + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * These enum and macro definitions are copied from the + * file rte_pci_dev_features.h + */ +enum rte_intr_mode { + RTE_INTR_MODE_NONE = 0, + RTE_INTR_MODE_LEGACY, + RTE_INTR_MODE_MSI, + RTE_INTR_MODE_MSIX +}; +#define RTE_INTR_MODE_NONE_NAME "none" +#define RTE_INTR_MODE_LEGACY_NAME "legacy" +#define RTE_INTR_MODE_MSI_NAME "msi" +#define RTE_INTR_MODE_MSIX_NAME "msix" + + +#include "compat.h" + +/** + * A structure describing the private information for a uio device. + */ +struct rte_uio_pci_dev { + struct uio_info info; + struct pci_dev *pdev; + enum rte_intr_mode mode; + atomic_t refcnt; +}; + +static int wc_activate; +static char *intr_mode; +static enum rte_intr_mode igbuio_intr_mode_preferred = RTE_INTR_MODE_MSIX; +/* sriov sysfs */ +static ssize_t +show_max_vfs(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 10, "%u\n", dev_num_vf(dev)); +} + +static ssize_t +store_max_vfs(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int err = 0; + unsigned long max_vfs; + struct pci_dev *pdev = to_pci_dev(dev); + + if (0 != kstrtoul(buf, 0, &max_vfs)) + return -EINVAL; + + if (0 == max_vfs) + pci_disable_sriov(pdev); + else if (0 == pci_num_vf(pdev)) + err = pci_enable_sriov(pdev, max_vfs); + else /* do nothing if change max_vfs number */ + err = -EINVAL; + + return err ? err : count; +} + +static DEVICE_ATTR(max_vfs, S_IRUGO | S_IWUSR, show_max_vfs, store_max_vfs); + +static struct attribute *dev_attrs[] = { + &dev_attr_max_vfs.attr, + NULL, +}; + +static const struct attribute_group dev_attr_grp = { + .attrs = dev_attrs, +}; + +#ifndef HAVE_PCI_MSI_MASK_IRQ +/* + * It masks the msix on/off of generating MSI-X messages. + */ +static void +igbuio_msix_mask_irq(struct msi_desc *desc, s32 state) +{ + u32 mask_bits = desc->masked; + unsigned int offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL; + + if (state != 0) + mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; + else + mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT; + + if (mask_bits != desc->masked) { + writel(mask_bits, desc->mask_base + offset); + readl(desc->mask_base); + desc->masked = mask_bits; + } +} + +/* + * It masks the msi on/off of generating MSI messages. 
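+ * It does this by toggling the per-vector mask bit through the device's
+ * MSI capability (desc->mask_pos) and is a no-op when the device does
+ * not support per-vector masking (msi_attrib.maskbit is clear).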
+ */ +static void +igbuio_msi_mask_irq(struct pci_dev *pdev, struct msi_desc *desc, int32_t state) +{ + u32 mask_bits = desc->masked; + u32 offset = desc->irq - pdev->irq; + u32 mask = 1 << offset; + + if (!desc->msi_attrib.maskbit) + return; + + if (state != 0) + mask_bits &= ~mask; + else + mask_bits |= mask; + + if (mask_bits != desc->masked) { + pci_write_config_dword(pdev, desc->mask_pos, mask_bits); + desc->masked = mask_bits; + } +} + +static void +igbuio_mask_irq(struct pci_dev *pdev, enum rte_intr_mode mode, s32 irq_state) +{ + struct msi_desc *desc; + struct list_head *msi_list; + +#ifdef HAVE_MSI_LIST_IN_GENERIC_DEVICE + msi_list = &pdev->dev.msi_list; +#else + msi_list = &pdev->msi_list; +#endif + + if (mode == RTE_INTR_MODE_MSIX) { + list_for_each_entry(desc, msi_list, list) + igbuio_msix_mask_irq(desc, irq_state); + } else if (mode == RTE_INTR_MODE_MSI) { + list_for_each_entry(desc, msi_list, list) + igbuio_msi_mask_irq(pdev, desc, irq_state); + } +} +#endif + +/** + * This is the irqcontrol callback to be registered to uio_info. + * It can be used to disable/enable interrupts from user space processes. + * + * @param info + * pointer to uio_info. + * @param irq_state + * state value. 1 to enable interrupt, 0 to disable interrupt. + * + * @return + * - On success, 0. + * - On failure, a negative value. + */ +static int +igbuio_pci_irqcontrol(struct uio_info *info, s32 irq_state) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *pdev = udev->pdev; + +#ifdef HAVE_PCI_MSI_MASK_IRQ + struct irq_data *irq = irq_get_irq_data(udev->info.irq); +#endif + + pci_cfg_access_lock(pdev); + + if (udev->mode == RTE_INTR_MODE_MSIX || udev->mode == RTE_INTR_MODE_MSI) { +#ifdef HAVE_PCI_MSI_MASK_IRQ + if (irq_state == 1) + pci_msi_unmask_irq(irq); + else + pci_msi_mask_irq(irq); +#else + igbuio_mask_irq(pdev, udev->mode, irq_state); +#endif + } + + if (udev->mode == RTE_INTR_MODE_LEGACY) + pci_intx(pdev, !!irq_state); + + pci_cfg_access_unlock(pdev); + + return 0; +} + +/** + * This is the interrupt handler; it checks whether the interrupt is for the right device. + * If so, the interrupt is disabled here and re-enabled later.
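+ * Only legacy INTx interrupts are masked in hardware here; MSI and
+ * MSI-X vectors are not shared and are automasked.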
+ */ +static irqreturn_t +igbuio_pci_irqhandler(int irq, void *dev_id) +{ + struct rte_uio_pci_dev *udev = (struct rte_uio_pci_dev *)dev_id; + struct uio_info *info = &udev->info; + + /* Legacy mode needs to mask in hardware */ + if (udev->mode == RTE_INTR_MODE_LEGACY && + !pci_check_and_mask_intx(udev->pdev)) + return IRQ_NONE; + + uio_event_notify(info); + + /* Message-signalled interrupts are not shared and are automasked */ + return IRQ_HANDLED; +} + +static int +igbuio_pci_enable_interrupts(struct rte_uio_pci_dev *udev) +{ + int err = 0; +#ifndef HAVE_ALLOC_IRQ_VECTORS + struct msix_entry msix_entry; +#endif + + switch (igbuio_intr_mode_preferred) { + case RTE_INTR_MODE_MSIX: + /* Only 1 msi-x vector needed */ +#ifndef HAVE_ALLOC_IRQ_VECTORS + msix_entry.entry = 0; + if (pci_enable_msix(udev->pdev, &msix_entry, 1) == 0) { + dev_dbg(&udev->pdev->dev, "using MSI-X"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = msix_entry.vector; + udev->mode = RTE_INTR_MODE_MSIX; + break; + } +#else + if (pci_alloc_irq_vectors(udev->pdev, 1, 1, PCI_IRQ_MSIX) == 1) { + dev_dbg(&udev->pdev->dev, "using MSI-X"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = pci_irq_vector(udev->pdev, 0); + udev->mode = RTE_INTR_MODE_MSIX; + break; + } +#endif + + /* falls through - to MSI */ + case RTE_INTR_MODE_MSI: +#ifndef HAVE_ALLOC_IRQ_VECTORS + if (pci_enable_msi(udev->pdev) == 0) { + dev_dbg(&udev->pdev->dev, "using MSI"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = udev->pdev->irq; + udev->mode = RTE_INTR_MODE_MSI; + break; + } +#else + if (pci_alloc_irq_vectors(udev->pdev, 1, 1, PCI_IRQ_MSI) == 1) { + dev_dbg(&udev->pdev->dev, "using MSI"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = pci_irq_vector(udev->pdev, 0); + udev->mode = RTE_INTR_MODE_MSI; + break; + } +#endif + /* falls through - to INTX */ + case RTE_INTR_MODE_LEGACY: + if (pci_intx_mask_supported(udev->pdev)) { + dev_dbg(&udev->pdev->dev, "using INTX"); + udev->info.irq_flags = IRQF_SHARED | IRQF_NO_THREAD; + udev->info.irq = udev->pdev->irq; + udev->mode = RTE_INTR_MODE_LEGACY; + break; + } + dev_notice(&udev->pdev->dev, "PCI INTX mask not supported\n"); + /* falls through - to no IRQ */ + case RTE_INTR_MODE_NONE: + udev->mode = RTE_INTR_MODE_NONE; + udev->info.irq = UIO_IRQ_NONE; + break; + + default: + dev_err(&udev->pdev->dev, "invalid IRQ mode %u", + igbuio_intr_mode_preferred); + udev->info.irq = UIO_IRQ_NONE; + err = -EINVAL; + } + + if (udev->info.irq != UIO_IRQ_NONE) + err = request_irq(udev->info.irq, igbuio_pci_irqhandler, + udev->info.irq_flags, udev->info.name, + udev); + dev_info(&udev->pdev->dev, "uio device registered with irq %ld\n", + udev->info.irq); + + return err; +} + +static void +igbuio_pci_disable_interrupts(struct rte_uio_pci_dev *udev) +{ + if (udev->info.irq) { + free_irq(udev->info.irq, udev); + udev->info.irq = 0; + } + +#ifndef HAVE_ALLOC_IRQ_VECTORS + if (udev->mode == RTE_INTR_MODE_MSIX) + pci_disable_msix(udev->pdev); + if (udev->mode == RTE_INTR_MODE_MSI) + pci_disable_msi(udev->pdev); +#else + if (udev->mode == RTE_INTR_MODE_MSIX || + udev->mode == RTE_INTR_MODE_MSI) + pci_free_irq_vectors(udev->pdev); +#endif +} + + +/** + * This gets called while opening uio device file.
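+ * Interrupt setup is reference counted: bus mastering and the IRQ are
+ * enabled on the first open only, and released again from
+ * igbuio_pci_release() when the last user closes the device.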
+ */ +static int +igbuio_pci_open(struct uio_info *info, struct inode *inode) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *dev = udev->pdev; + int err; + + if (atomic_inc_return(&udev->refcnt) != 1) + return 0; + + /* set bus master, which was cleared by the reset function */ + pci_set_master(dev); + + /* enable interrupts */ + err = igbuio_pci_enable_interrupts(udev); + if (err) { + atomic_dec(&udev->refcnt); + dev_err(&dev->dev, "Enable interrupt fails\n"); + } + return err; +} + +static int +igbuio_pci_release(struct uio_info *info, struct inode *inode) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *dev = udev->pdev; + + if (atomic_dec_and_test(&udev->refcnt)) { + /* disable interrupts */ + igbuio_pci_disable_interrupts(udev); + + /* stop the device from further DMA */ + pci_clear_master(dev); + } + + return 0; +} + +/* Remap pci resources described by bar #pci_bar in uio resource n. */ +static int +igbuio_pci_setup_iomem(struct pci_dev *dev, struct uio_info *info, + int n, int pci_bar, const char *name) +{ + unsigned long addr, len; + void *internal_addr; + + if (n >= ARRAY_SIZE(info->mem)) + return -EINVAL; + + addr = pci_resource_start(dev, pci_bar); + len = pci_resource_len(dev, pci_bar); + if (addr == 0 || len == 0) + return -1; + if (wc_activate == 0) { + internal_addr = ioremap(addr, len); + if (internal_addr == NULL) + return -1; + } else { + internal_addr = NULL; + } + info->mem[n].name = name; + info->mem[n].addr = addr; + info->mem[n].internal_addr = internal_addr; + info->mem[n].size = len; + info->mem[n].memtype = UIO_MEM_PHYS; + return 0; +} + +/* Get pci port io resources described by bar #pci_bar in uio resource n. */ +static int +igbuio_pci_setup_ioport(struct pci_dev *dev, struct uio_info *info, + int n, int pci_bar, const char *name) +{ + unsigned long addr, len; + + if (n >= ARRAY_SIZE(info->port)) + return -EINVAL; + + addr = pci_resource_start(dev, pci_bar); + len = pci_resource_len(dev, pci_bar); + if (addr == 0 || len == 0) + return -EINVAL; + + info->port[n].name = name; + info->port[n].start = addr; + info->port[n].size = len; + info->port[n].porttype = UIO_PORT_X86; + + return 0; +} + +/* Unmap previously ioremap'd resources */ +static void +igbuio_pci_release_iomem(struct uio_info *info) +{ + int i; + + for (i = 0; i < MAX_UIO_MAPS; i++) { + if (info->mem[i].internal_addr) + iounmap(info->mem[i].internal_addr); + } +} + +static int +igbuio_setup_bars(struct pci_dev *dev, struct uio_info *info) +{ + int i, iom, iop, ret; + unsigned long flags; + static const char *bar_names[PCI_STD_RESOURCE_END + 1] = { + "BAR0", + "BAR1", + "BAR2", + "BAR3", + "BAR4", + "BAR5", + }; + + iom = 0; + iop = 0; + + for (i = 0; i < ARRAY_SIZE(bar_names); i++) { + if (pci_resource_len(dev, i) != 0 && + pci_resource_start(dev, i) != 0) { + flags = pci_resource_flags(dev, i); + if (flags & IORESOURCE_MEM) { + ret = igbuio_pci_setup_iomem(dev, info, iom, + i, bar_names[i]); + if (ret != 0) + return ret; + iom++; + } else if (flags & IORESOURCE_IO) { + ret = igbuio_pci_setup_ioport(dev, info, iop, + i, bar_names[i]); + if (ret != 0) + return ret; + iop++; + } + } + } + + return (iom != 0 || iop != 0) ? 
ret : -ENOENT; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 8, 0) +static int __devinit +#else +static int +#endif +igbuio_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + struct rte_uio_pci_dev *udev; + dma_addr_t map_dma_addr; + void *map_addr; + int err; + +#ifdef HAVE_PCI_IS_BRIDGE_API + if (pci_is_bridge(dev)) { + dev_warn(&dev->dev, "Ignoring PCI bridge device\n"); + return -ENODEV; + } +#endif + + udev = kzalloc(sizeof(struct rte_uio_pci_dev), GFP_KERNEL); + if (!udev) + return -ENOMEM; + + /* + * enable device: ask low-level code to enable I/O and + * memory + */ + err = pci_enable_device(dev); + if (err != 0) { + dev_err(&dev->dev, "Cannot enable PCI device\n"); + goto fail_free; + } + + /* enable bus mastering on the device */ + pci_set_master(dev); + + /* remap IO memory */ + err = igbuio_setup_bars(dev, &udev->info); + if (err != 0) + goto fail_release_iomem; + + /* set 64-bit DMA mask */ + err = pci_set_dma_mask(dev, DMA_BIT_MASK(64)); + if (err != 0) { + dev_err(&dev->dev, "Cannot set DMA mask\n"); + goto fail_release_iomem; + } + + err = pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(64)); + if (err != 0) { + dev_err(&dev->dev, "Cannot set consistent DMA mask\n"); + goto fail_release_iomem; + } + + /* fill uio infos */ + udev->info.name = "igb_uio"; + udev->info.version = "0.1"; + udev->info.irqcontrol = igbuio_pci_irqcontrol; + udev->info.open = igbuio_pci_open; + udev->info.release = igbuio_pci_release; + udev->info.priv = udev; + udev->pdev = dev; + atomic_set(&udev->refcnt, 0); + + err = sysfs_create_group(&dev->dev.kobj, &dev_attr_grp); + if (err != 0) + goto fail_release_iomem; + + /* register uio driver */ + err = uio_register_device(&dev->dev, &udev->info); + if (err != 0) + goto fail_remove_group; + + pci_set_drvdata(dev, udev); + + /* + * Doing a harmless dma mapping for attaching the device to + * the iommu identity mapping if kernel boots with iommu=pt. + * Note this is not a problem if no IOMMU at all. 
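+ * The 1 KiB coherent buffer below is allocated and then freed right
+ * away; it exists only to trigger the IOMMU attachment for the device.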
+ */ + map_addr = dma_alloc_coherent(&dev->dev, 1024, &map_dma_addr, + GFP_KERNEL); + if (map_addr) + memset(map_addr, 0, 1024); + + if (!map_addr) + dev_info(&dev->dev, "dma mapping failed\n"); + else { + dev_info(&dev->dev, "mapping 1K dma=%#llx host=%p\n", + (unsigned long long)map_dma_addr, map_addr); + + dma_free_coherent(&dev->dev, 1024, map_addr, map_dma_addr); + dev_info(&dev->dev, "unmapping 1K dma=%#llx host=%p\n", + (unsigned long long)map_dma_addr, map_addr); + } + + return 0; + +fail_remove_group: + sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp); +fail_release_iomem: + igbuio_pci_release_iomem(&udev->info); + pci_disable_device(dev); +fail_free: + kfree(udev); + + return err; +} + +static void +igbuio_pci_remove(struct pci_dev *dev) +{ + struct rte_uio_pci_dev *udev = pci_get_drvdata(dev); + + igbuio_pci_release(&udev->info, NULL); + + sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp); + uio_unregister_device(&udev->info); + igbuio_pci_release_iomem(&udev->info); + pci_disable_device(dev); + pci_set_drvdata(dev, NULL); + kfree(udev); +} + +static int +igbuio_config_intr_mode(char *intr_str) +{ + if (!intr_str) { + pr_info("Use MSIX interrupt by default\n"); + return 0; + } + + if (!strcmp(intr_str, RTE_INTR_MODE_MSIX_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_MSIX; + pr_info("Use MSIX interrupt\n"); + } else if (!strcmp(intr_str, RTE_INTR_MODE_MSI_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_MSI; + pr_info("Use MSI interrupt\n"); + } else if (!strcmp(intr_str, RTE_INTR_MODE_LEGACY_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_LEGACY; + pr_info("Use legacy interrupt\n"); + } else { + pr_info("Error: bad parameter - %s\n", intr_str); + return -EINVAL; + } + + return 0; +} + +static struct pci_driver igbuio_pci_driver = { + .name = "igb_uio", + .id_table = NULL, + .probe = igbuio_pci_probe, + .remove = igbuio_pci_remove, +}; + +static int __init +igbuio_pci_init_module(void) +{ + int ret; + + if (igbuio_kernel_is_locked_down()) { + pr_err("Not able to use module, kernel lock down is enabled\n"); + return -EINVAL; + } + + if (wc_activate != 0) + pr_info("wc_activate is set\n"); + + ret = igbuio_config_intr_mode(intr_mode); + if (ret < 0) + return ret; + + return pci_register_driver(&igbuio_pci_driver); +} + +static void __exit +igbuio_pci_exit_module(void) +{ + pci_unregister_driver(&igbuio_pci_driver); +} + +module_init(igbuio_pci_init_module); +module_exit(igbuio_pci_exit_module); + +module_param(intr_mode, charp, S_IRUGO); +MODULE_PARM_DESC(intr_mode, +"igb_uio interrupt mode (default=msix):\n" +" " RTE_INTR_MODE_MSIX_NAME " Use MSIX interrupt\n" +" " RTE_INTR_MODE_MSI_NAME " Use MSI interrupt\n" +" " RTE_INTR_MODE_LEGACY_NAME " Use Legacy interrupt\n" +"\n"); + +module_param(wc_activate, int, 0); +MODULE_PARM_DESC(wc_activate, +"Activate support for write combining (WC) (default=0)\n" +" 0 - disable\n" +" other - enable\n"); + +MODULE_DESCRIPTION("UIO driver for Intel IGB PCI cards"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 5d39f3e374dae..48d918d846bc3 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -261,6 +262,11 @@ static int memory_block_offline(struct memory_block *mem) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); mem->zone = NULL; + +#ifdef CONFIG_PAGE_REPORTING + page_report_offline(start_pfn, nr_pages); +#endif + out: mem_hotplug_done(); return 
ret; @@ -505,9 +511,10 @@ static DEVICE_ATTR_RW(auto_online_blocks); static ssize_t probe_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - u64 phys_addr; + u64 phys_addr, size; int nid, ret; unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; + mhp_t mhp_flags; ret = kstrtoull(buf, 0, &phys_addr); if (ret) @@ -520,10 +527,12 @@ static ssize_t probe_store(struct device *dev, struct device_attribute *attr, if (ret) return ret; + size = MIN_MEMORY_BLOCK_SIZE * sections_per_block; + mhp_flags = mhp_supports_memmap_on_memory(size) ? + MHP_MEMMAP_ON_MEMORY : MHP_NONE; + nid = memory_add_physaddr_to_nid(phys_addr); - ret = __add_memory(nid, phys_addr, - MIN_MEMORY_BLOCK_SIZE * sections_per_block, - MHP_NONE); + ret = __add_memory(nid, phys_addr, size, mhp_flags); if (ret) goto out; @@ -537,6 +546,33 @@ static ssize_t probe_store(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR_WO(probe); #endif +#ifdef CONFIG_ARCH_MEMORY_REMOVE +static ssize_t remove_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + u64 phys_addr; + int ret; + unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; + + ret = kstrtoull(buf, 0, &phys_addr); + if (ret) + return ret; + + if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1)) + return -EINVAL; + + ret = offline_and_remove_memory(phys_addr, MIN_MEMORY_BLOCK_SIZE * sections_per_block); + + if (ret) + return ret; + + return count; +} + +static DEVICE_ATTR_WO(remove); +#endif + + #ifdef CONFIG_MEMORY_FAILURE /* * Support for offlining pages of memory @@ -885,6 +921,9 @@ static struct attribute *memory_root_attrs[] = { #ifdef CONFIG_ARCH_MEMORY_PROBE &dev_attr_probe.attr, #endif +#ifdef CONFIG_ARCH_MEMORY_REMOVE + &dev_attr_remove.attr, +#endif #ifdef CONFIG_MEMORY_FAILURE &dev_attr_soft_offline_page.attr, diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 5ddf393aa390f..5e5a600a543bf 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -49,6 +49,8 @@ #include #include #include +#include +#include #include #include @@ -82,6 +84,8 @@ enum blkif_state { BLKIF_STATE_CONNECTED, BLKIF_STATE_SUSPENDED, BLKIF_STATE_ERROR, + BLKIF_STATE_FREEZING, + BLKIF_STATE_FROZEN }; struct grant { @@ -135,7 +139,7 @@ static LIST_HEAD(info_list); * by the backend driver. 
*/ -static unsigned int xen_blkif_max_segments = 32; +static unsigned int xen_blkif_max_segments = 64; module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, 0444); MODULE_PARM_DESC(max_indirect_segments, "Maximum amount of segments in indirect requests (default is 32)"); @@ -231,6 +235,7 @@ struct blkfront_info struct list_head requests; struct bio_list bio_list; struct list_head info_list; + struct completion wait_backend_disconnected; }; static unsigned int nr_minors; @@ -270,6 +275,16 @@ static DEFINE_SPINLOCK(minor_lock); static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); static void blkfront_gather_backend_features(struct blkfront_info *info); static int negotiate_mq(struct blkfront_info *info); +static void __blkif_free(struct blkfront_info *info); + +static inline bool blkfront_ring_is_busy(struct blkif_front_ring *ring) +{ + if (RING_SIZE(ring) > RING_FREE_REQUESTS(ring) || + RING_HAS_UNCONSUMED_RESPONSES(ring)) + return true; + else + return false; +} #define for_each_rinfo(info, ptr, idx) \ for ((ptr) = (info)->rinfo, (idx) = 0; \ @@ -1163,6 +1178,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, info->sector_size = sector_size; info->physical_sector_size = physical_sector_size; blkif_set_queue_limits(info); + init_completion(&info->wait_backend_disconnected); xlvbd_flush(info); @@ -1183,6 +1199,8 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, /* Already hold rinfo->ring_lock. */ static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo) { + if (unlikely(rinfo->dev_info->connected == BLKIF_STATE_FREEZING)) + return; if (!RING_FULL(&rinfo->ring)) blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true); } @@ -1300,9 +1318,6 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) static void blkif_free(struct blkfront_info *info, int suspend) { - unsigned int i; - struct blkfront_ring_info *rinfo; - /* Prevent new requests being issued until we fix things up. */ info->connected = suspend ? BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; @@ -1310,6 +1325,14 @@ static void blkif_free(struct blkfront_info *info, int suspend) if (info->rq) blk_mq_stop_hw_queues(info->rq); + __blkif_free(info); +} + +static void __blkif_free(struct blkfront_info *info) +{ + unsigned int i; + struct blkfront_ring_info *rinfo; + for_each_rinfo(info, rinfo, i) blkif_free_ring(rinfo); @@ -1521,8 +1544,10 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) unsigned int eoiflag = XEN_EOI_FLAG_SPURIOUS; if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { - xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); - return IRQ_HANDLED; + if (info->connected != BLKIF_STATE_FREEZING) { + xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); + return IRQ_HANDLED; + } } spin_lock_irqsave(&rinfo->ring_lock, flags); @@ -2013,6 +2038,7 @@ static int blkif_recover(struct blkfront_info *info) unsigned int segs; struct blkfront_ring_info *rinfo; + bool frozen = info->connected == BLKIF_STATE_FROZEN; blkfront_gather_backend_features(info); /* Reset limits changed by blk_mq_update_nr_hw_queues(). 
*/ blkif_set_queue_limits(info); @@ -2034,6 +2060,9 @@ static int blkif_recover(struct blkfront_info *info) kick_pending_request_queues(rinfo); } + if (frozen) + return 0; + list_for_each_entry_safe(req, n, &info->requests, queuelist) { /* Requeue pending requests (flush or discard) */ list_del_init(&req->queuelist); @@ -2336,6 +2365,7 @@ static void blkfront_connect(struct blkfront_info *info) return; case BLKIF_STATE_SUSPENDED: + case BLKIF_STATE_FROZEN: /* * If we are recovering from suspension, we need to wait * for the backend to announce it's features before @@ -2460,12 +2490,36 @@ static void blkback_changed(struct xenbus_device *dev, break; case XenbusStateClosed: - if (dev->state == XenbusStateClosed) + if (dev->state == XenbusStateClosed) { + if (info->connected == BLKIF_STATE_FREEZING) { + __blkif_free(info); + info->connected = BLKIF_STATE_FROZEN; + complete(&info->wait_backend_disconnected); + break; + } + + break; + } + + /* + * We may somehow receive backend's Closed again while thawing + * or restoring and it causes thawing or restoring to fail. + * Ignore such unexpected state anyway. + */ + if (info->connected == BLKIF_STATE_FROZEN && + dev->state == XenbusStateInitialised) { + dev_dbg(&dev->dev, + "ignore the backend's Closed state: %s", + dev->nodename); break; + } fallthrough; case XenbusStateClosing: - blkfront_closing(info); - break; + if (info->connected == BLKIF_STATE_FREEZING) + xenbus_frontend_closed(dev); + else + blkfront_closing(info); + break; } } @@ -2500,6 +2554,94 @@ static int blkfront_is_ready(struct xenbus_device *dev) return info->is_ready && info->xbdev; } +static int blkfront_freeze(struct xenbus_device *dev) +{ + unsigned int i; + struct blkfront_info *info = dev_get_drvdata(&dev->dev); + struct blkfront_ring_info *rinfo; + struct blkif_front_ring *ring; + /* This would be reasonable timeout as used in xenbus_dev_shutdown() */ + unsigned int timeout = 5 * HZ; + int err = 0; + + info->connected = BLKIF_STATE_FREEZING; + + blk_mq_stop_hw_queues(info->rq); + + for (i = 0; i < info->nr_rings; i++) { + rinfo = &info->rinfo[i]; + + gnttab_cancel_free_callback(&rinfo->callback); + flush_work(&rinfo->work); + } + + for (i = 0; i < info->nr_rings; i++) { + spinlock_t *lock; + bool busy; + unsigned long req_timeout_ms = 25; + unsigned long ring_timeout; + + rinfo = &info->rinfo[i]; + ring = &rinfo->ring; + + lock = &rinfo->ring_lock; + + ring_timeout = jiffies + + msecs_to_jiffies(req_timeout_ms * RING_SIZE(ring)); + + do { + spin_lock_irq(lock); + busy = blkfront_ring_is_busy(ring); + spin_unlock_irq(lock); + + if (busy) + msleep(req_timeout_ms); + else + break; + } while (time_is_after_jiffies(ring_timeout)); + + /* Timed out */ + if (busy) { + xenbus_dev_error(dev, err, "the ring is still busy"); + info->connected = BLKIF_STATE_CONNECTED; + return -EBUSY; + } + } + + /* Kick the backend to disconnect */ + xenbus_switch_state(dev, XenbusStateClosing); + + /* + * We don't want to move forward before the frontend is diconnected + * from the backend cleanly. 
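+ * The completion is signalled from blkback_changed() once the backend
+ * reports XenbusStateClosed while we are in BLKIF_STATE_FREEZING.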
+ */ + timeout = wait_for_completion_timeout(&info->wait_backend_disconnected, + timeout); + if (!timeout) { + err = -EBUSY; + xenbus_dev_error(dev, err, "Freezing timed out;" + "the device may become inconsistent state"); + } + + return err; +} + +static int blkfront_restore(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev_get_drvdata(&dev->dev); + int err = 0; + + blkfront_gather_backend_features(info); + xlvbd_flush(info); + err = talk_to_blkback(dev, info); + if (err) + goto out; + blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings); + +out: + return err; +} + static const struct block_device_operations xlvbd_block_fops = { .owner = THIS_MODULE, @@ -2521,6 +2663,9 @@ static struct xenbus_driver blkfront_driver = { .resume = blkfront_resume, .otherend_changed = blkback_changed, .is_ready = blkfront_is_ready, + .freeze = blkfront_freeze, + .thaw = blkfront_restore, + .restore = blkfront_restore }; static void purge_persistent_grants(struct blkfront_info *info) diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index 3da8e85f8aae0..e6a50a84a8b7c 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -549,6 +549,19 @@ config HW_RANDOM_CN10K To compile this driver as a module, choose M here. The module will be called cn10k_rng. If unsure, say Y. +config HW_RANDOM_GRAVITON + tristate "AWS Graviton Random Number Generator support" + depends on HW_RANDOM && ACPI && (ARM64 || COMPILE_TEST) + default HW_RANDOM + help + This driver provides kernel-side support for the Random Number + Generator SMC found on AWS Graviton systems. + + To compile this driver as a module, choose M here: the + module will be called graviton-rng. + + If unsure, say Y. + endif # HW_RANDOM config UML_RANDOM diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile index 3e948cf044762..399eea6b29a1e 100644 --- a/drivers/char/hw_random/Makefile +++ b/drivers/char/hw_random/Makefile @@ -47,3 +47,4 @@ obj-$(CONFIG_HW_RANDOM_XIPHERA) += xiphera-trng.o obj-$(CONFIG_HW_RANDOM_ARM_SMCCC_TRNG) += arm_smccc_trng.o obj-$(CONFIG_HW_RANDOM_CN10K) += cn10k-rng.o obj-$(CONFIG_HW_RANDOM_POLARFIRE_SOC) += mpfs-rng.o +obj-$(CONFIG_HW_RANDOM_GRAVITON) += graviton-rng.o diff --git a/drivers/char/hw_random/graviton-rng.c b/drivers/char/hw_random/graviton-rng.c new file mode 100644 index 0000000000000..3a8f3fe35359b --- /dev/null +++ b/drivers/char/hw_random/graviton-rng.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AWS Graviton TRNG driver + * + * Copyright (C) 2019 Amazon Corp. 
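+ *
+ * The driver talks to firmware through SMCCC SiP calls, using SMC or
+ * HVC depending on the PSCI conduit. Probe matches the TRNG UUID and
+ * queries the interface version; reads then pull up to two machine
+ * words of entropy per GET_RND call for the hwrng core.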
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ARM_SMCCC_SIP_GRAVITON_FEATURE_PROBE \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0x00ff) +#define AWS_GRAVITON_UUID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0xFF01) +#define AWS_GRAVITON_GET_VER \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0xFF03) + +#define AWS_GRAVITON_GET_RND \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_SIP, 0x60) +#define AWS_GRAVITON_GET_RND_LEGACY \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0x60) + +/** + * UID of the Graviton TRNG API: eb4af8a0-89d4-49c9-bc8c5b38dc54308e + */ +#define GRVTN_TRNG_UUID_0 0xa0f83aeb +#define GRVTN_TRNG_UUID_1 0xc949d489 +#define GRVTN_TRNG_UUID_2 0x385b8cbc +#define GRVTN_TRNG_UUID_3 0x8e3054dc + +struct grvtn_rng { + u64 call_id; + struct hwrng rng; +}; + +static void grvtn_smccc_conduit(u64 call_id, struct arm_smccc_res *res) +{ + if (acpi_psci_use_hvc()) + arm_smccc_1_1_hvc(call_id, res); + else + arm_smccc_1_1_smc(call_id, res); +} + +static int grvtn_probe_sip_feature(unsigned long feature) +{ + struct arm_smccc_res res = {}; + + if (acpi_psci_use_hvc()) + arm_smccc_1_1_hvc(ARM_SMCCC_SIP_GRAVITON_FEATURE_PROBE, + feature, 0, &res); + else + arm_smccc_1_1_smc(ARM_SMCCC_SIP_GRAVITON_FEATURE_PROBE, + feature, 0, &res); + + return res.a0; +} + +static int grvtn_trng_read(struct hwrng *rng, void *buf, size_t max, bool wait) +{ + struct grvtn_rng *priv = (struct grvtn_rng *)rng->priv; + struct arm_smccc_res res; + int err = 0; + /* timeout after one waiting period */ + int iter_remain = 2; + size_t count = max > sizeof(ulong) * 2 ? 
sizeof(ulong) * 2 : max; + size_t total = count; + + do { + if (err && wait) + /* Nominal wait is 5us */ + udelay(err); + + grvtn_smccc_conduit(priv->call_id, &res); + + /* In the unlikely event of rolling back to legacy after probe was issued */ + if (unlikely((res.a0 == SMCCC_RET_NOT_SUPPORTED) && (priv->call_id != AWS_GRAVITON_GET_RND_LEGACY))) { + grvtn_smccc_conduit(AWS_GRAVITON_GET_RND_LEGACY, &res); + priv->call_id = AWS_GRAVITON_GET_RND_LEGACY; + } + + err = (int) res.a0; + + if (err < 0) + return err; + + iter_remain--; + } while (iter_remain && err && wait); + + if (err) + return 0; + + if (count > sizeof(ulong)) { + memcpy(buf, &res.a1, sizeof(ulong)); + count -= sizeof(ulong); + buf += sizeof(ulong); + } + memcpy(buf, &res.a2, count); + return total; +} + +static int grvtn_trng_probe(struct platform_device *pdev) +{ + int version; + int err; + struct arm_smccc_res res; + struct grvtn_rng *priv; + + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->rng.name = "graviton"; + priv->rng.read = grvtn_trng_read; + priv->rng.priv = (unsigned long)priv; + priv->rng.quality = 1024; /* all bits are sourced from a HW TRNG */ + priv->call_id = AWS_GRAVITON_GET_RND_LEGACY; /* default mode is legacy */ + + grvtn_smccc_conduit(AWS_GRAVITON_UUID, &res); + + if (res.a0 != GRVTN_TRNG_UUID_0 || res.a1 != GRVTN_TRNG_UUID_1 || + res.a2 != GRVTN_TRNG_UUID_2 || res.a3 != GRVTN_TRNG_UUID_3) { + dev_err(&pdev->dev, "failed to match UUID\n"); + return -ENXIO; + } + + grvtn_smccc_conduit(AWS_GRAVITON_GET_VER, &res); + dev_info(&pdev->dev, "Graviton TRNG, SMC version %d.%d\n", + (u32)res.a0, (u32)res.a1); + + version = grvtn_probe_sip_feature(AWS_GRAVITON_GET_RND); + if (version > 0) + priv->call_id = AWS_GRAVITON_GET_RND; + + platform_set_drvdata(pdev, priv); + err = devm_hwrng_register(&pdev->dev, &priv->rng); + if (err) + dev_err(&pdev->dev, "failed to register hwrng"); + return err; +} + +static const struct acpi_device_id grvtn_trng_acpi_match[] = { + { "AMZN0010", }, + {} +}; + +MODULE_DEVICE_TABLE(acpi, grvtn_trng_acpi_match); + +static struct platform_driver grvtn_trng_driver = { + .probe = grvtn_trng_probe, + .driver = { + .name = "graviton-rng", + .owner = THIS_MODULE, + .acpi_match_table = ACPI_PTR(grvtn_trng_acpi_match), + }, +}; + +module_platform_driver(grvtn_trng_driver); + +MODULE_AUTHOR("Amazon.com, Inc. or it's affiliates"); +MODULE_DESCRIPTION("Graviton TRNG driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/char/random.c b/drivers/char/random.c index fd57eb372d492..55f26452975a5 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -283,6 +284,11 @@ static unsigned int crng_reseed_interval(void) return CRNG_RESEED_INTERVAL; } +/* + * Hook for external RNG. + */ +static const struct random_extrng __rcu *extrng; + /* * This function returns a ChaCha state that you may use for generating * random data. It also returns up to 32 bytes on its own of random data @@ -570,6 +576,9 @@ int __cold random_prepare_cpu(unsigned int cpu) #endif +static const struct file_operations extrng_random_fops; +static const struct file_operations extrng_urandom_fops; + /********************************************************************** * * Entropy accumulation and extraction routines. 
@@ -936,6 +945,19 @@ void __init add_bootloader_randomness(const void *buf, size_t len) credit_init_bits(len * 8); } +void random_register_extrng(const struct random_extrng *rng) +{ + rcu_assign_pointer(extrng, rng); +} +EXPORT_SYMBOL_GPL(random_register_extrng); + +void random_unregister_extrng(void) +{ + RCU_INIT_POINTER(extrng, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(random_unregister_extrng); + #if IS_ENABLED(CONFIG_VMGENID) static BLOCKING_NOTIFIER_HEAD(vmfork_chain); @@ -1307,6 +1329,7 @@ static void __cold try_to_generate_entropy(void) SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags) { + const struct random_extrng *rng; struct iov_iter iter; struct iovec iov; int ret; @@ -1321,6 +1344,18 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags if ((flags & (GRND_INSECURE | GRND_RANDOM)) == (GRND_INSECURE | GRND_RANDOM)) return -EINVAL; + rcu_read_lock(); + rng = rcu_dereference(extrng); + if (rng && !try_module_get(rng->owner)) + rng = NULL; + rcu_read_unlock(); + + if (rng) { + ret = rng->extrng_read(ubuf, len, !!(flags & GRND_RANDOM)); + module_put(rng->owner); + return ret; + } + if (!crng_ready() && !(flags & GRND_INSECURE)) { if (flags & GRND_NONBLOCK) return -EAGAIN; @@ -1329,6 +1364,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags return ret; } + ret = import_single_range(ITER_DEST, ubuf, len, &iov, &iter); if (unlikely(ret)) return ret; @@ -1341,6 +1377,13 @@ static __poll_t random_poll(struct file *file, poll_table *wait) return crng_ready() ? EPOLLIN | EPOLLRDNORM : EPOLLOUT | EPOLLWRNORM; } +static __poll_t +extrng_poll(struct file *file, poll_table * wait) +{ + /* extrng pool is always full, always read, no writes */ + return EPOLLIN | EPOLLRDNORM; +} + static ssize_t write_pool_user(struct iov_iter *iter) { u8 block[BLAKE2S_BLOCK_SIZE]; @@ -1482,9 +1525,60 @@ static int random_fasync(int fd, struct file *filp, int on) return fasync_helper(fd, filp, on, &fasync); } +static int random_open(struct inode *inode, struct file *filp) +{ + const struct random_extrng *rng; + + rcu_read_lock(); + rng = rcu_dereference(extrng); + if (rng && !try_module_get(rng->owner)) + rng = NULL; + rcu_read_unlock(); + + if (!rng) + return 0; + + filp->f_op = &extrng_random_fops; + filp->private_data = rng->owner; + + return 0; +} + +static int urandom_open(struct inode *inode, struct file *filp) +{ + const struct random_extrng *rng; + + rcu_read_lock(); + rng = rcu_dereference(extrng); + if (rng && !try_module_get(rng->owner)) + rng = NULL; + rcu_read_unlock(); + + if (!rng) + return 0; + + filp->f_op = &extrng_urandom_fops; + filp->private_data = rng->owner; + + return 0; +} + +static int extrng_release(struct inode *inode, struct file *filp) +{ + module_put(filp->private_data); + return 0; +} + +static ssize_t +extrng_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) +{ + return rcu_dereference_raw(extrng)->extrng_read(buf, nbytes, false); +} + const struct file_operations random_fops = { .read_iter = random_read_iter, .write_iter = random_write_iter, + .open = random_open, .poll = random_poll, .unlocked_ioctl = random_ioctl, .compat_ioctl = compat_ptr_ioctl, @@ -1497,6 +1591,7 @@ const struct file_operations random_fops = { const struct file_operations urandom_fops = { .read_iter = urandom_read_iter, .write_iter = random_write_iter, + .open = urandom_open, .unlocked_ioctl = random_ioctl, .compat_ioctl = compat_ptr_ioctl, .fasync = random_fasync, @@ -1505,6 +1600,26 
@@ const struct file_operations urandom_fops = { .splice_write = iter_file_splice_write, }; +static const struct file_operations extrng_random_fops = { + .open = random_open, + .read = extrng_read, + .write_iter = random_write_iter, + .poll = extrng_poll, + .unlocked_ioctl = random_ioctl, + .fasync = random_fasync, + .llseek = noop_llseek, + .release = extrng_release, +}; + +static const struct file_operations extrng_urandom_fops = { + .open = urandom_open, + .read = extrng_read, + .write_iter = random_write_iter, + .unlocked_ioctl = random_ioctl, + .fasync = random_fasync, + .llseek = noop_llseek, + .release = extrng_release, +}; /******************************************************************** * diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index a44ba09e49d9c..63edf0c5f7c3b 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -78,6 +78,7 @@ struct psci_0_1_function_ids get_psci_0_1_function_ids(void) static u32 psci_cpu_suspend_feature; static bool psci_system_reset2_supported; +static bool psci_system_off2_hibernate_supported; static inline bool psci_has_ext_power_state(void) { @@ -332,6 +333,28 @@ static void psci_sys_poweroff(void) invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0); } +#ifdef CONFIG_HIBERNATION +static int psci_sys_hibernate(struct sys_off_data *data) +{ + if (system_entering_hibernation()) + invoke_psci_fn(PSCI_FN_NATIVE(1_3, SYSTEM_OFF2), + PSCI_1_3_HIBERNATE_TYPE_OFF, 0, 0); + return NOTIFY_DONE; +} + +static int __init psci_hibernate_init(void) +{ + if (psci_system_off2_hibernate_supported) { + /* Higher priority than EFI shutdown, but only for hibernate */ + register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_FIRMWARE + 2, + psci_sys_hibernate, NULL); + } + return 0; +} +subsys_initcall(psci_hibernate_init); +#endif + static int psci_features(u32 psci_func_id) { return invoke_psci_fn(PSCI_1_0_FN_PSCI_FEATURES, @@ -363,6 +386,7 @@ static const struct { PSCI_ID_NATIVE(1_1, SYSTEM_RESET2), PSCI_ID(1_1, MEM_PROTECT), PSCI_ID_NATIVE(1_1, MEM_PROTECT_CHECK_RANGE), + PSCI_ID_NATIVE(1_3, SYSTEM_OFF2), }; static int psci_debugfs_read(struct seq_file *s, void *data) @@ -513,6 +537,18 @@ static void __init psci_init_system_reset2(void) psci_system_reset2_supported = true; } +static void __init psci_init_system_off2(void) +{ + int ret; + + ret = psci_features(PSCI_FN_NATIVE(1_3, SYSTEM_OFF2)); + if (ret < 0) + return; + + if (ret & BIT(PSCI_1_3_HIBERNATE_TYPE_OFF)) + psci_system_off2_hibernate_supported = true; +} + static void __init psci_init_system_suspend(void) { int ret; @@ -643,6 +679,7 @@ static int __init psci_probe(void) psci_init_cpu_suspend(); psci_init_system_suspend(); psci_init_system_reset2(); + psci_init_system_off2(); kvm_init_hyp_services(); } diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index f30f99166531f..e0264211ca84b 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -124,8 +124,9 @@ config DRM_DEBUG_MODESET_LOCK config DRM_FBDEV_EMULATION bool "Enable legacy fbdev support for your modesetting driver" - depends on DRM_KMS_HELPER - depends on FB=y || FB=DRM_KMS_HELPER + depends on DRM + depends on FB=y || FB=DRM + select DRM_KMS_HELPER select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 95b5ab4b964e2..a608901bade5d 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include 
#include @@ -59,6 +60,12 @@ #include #include +enum netif_freeze_state { + NETIF_FREEZE_STATE_UNFROZEN, + NETIF_FREEZE_STATE_FREEZING, + NETIF_FREEZE_STATE_FROZEN, +}; + /* Module parameters */ #define MAX_QUEUES_DEFAULT 8 static unsigned int xennet_max_queues; @@ -72,6 +79,12 @@ MODULE_PARM_DESC(trusted, "Is the backend trusted"); #define XENNET_TIMEOUT (5 * HZ) +static unsigned int netfront_freeze_timeout_secs = 10; +module_param_named(freeze_timeout_secs, + netfront_freeze_timeout_secs, uint, 0644); +MODULE_PARM_DESC(freeze_timeout_secs, + "timeout when freezing netfront device in seconds"); + static const struct ethtool_ops xennet_ethtool_ops; struct netfront_cb { @@ -181,6 +194,10 @@ struct netfront_info { bool bounce; atomic_t rx_gso_checksum_fixup; + + int freeze_state; + + struct completion wait_backend_disconnected; }; struct netfront_rx_info { @@ -910,6 +927,21 @@ static void xennet_set_rx_rsp_cons(struct netfront_queue *queue, RING_IDX val) spin_unlock_irqrestore(&queue->rx_cons_lock, flags); } +static int xennet_disable_interrupts(struct net_device *dev) +{ + struct netfront_info *np = netdev_priv(dev); + unsigned int num_queues = dev->real_num_tx_queues; + unsigned int i; + struct netfront_queue *queue; + + for (i = 0; i < num_queues; ++i) { + queue = &np->queues[i]; + disable_irq(queue->tx_irq); + disable_irq(queue->rx_irq); + } + return 0; +} + static void xennet_move_rx_slot(struct netfront_queue *queue, struct sk_buff *skb, grant_ref_t ref) { @@ -1719,6 +1751,8 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev) np->queues = NULL; + init_completion(&np->wait_backend_disconnected); + err = -ENOMEM; np->rx_stats = netdev_alloc_pcpu_stats(struct netfront_stats); if (np->rx_stats == NULL) @@ -2245,6 +2279,50 @@ static int xennet_create_queues(struct netfront_info *info, return 0; } +static int netfront_freeze(struct xenbus_device *dev) +{ + struct netfront_info *info = dev_get_drvdata(&dev->dev); + unsigned long timeout = netfront_freeze_timeout_secs * HZ; + int err = 0; + + xennet_disable_interrupts(info->netdev); + + netif_device_detach(info->netdev); + + info->freeze_state = NETIF_FREEZE_STATE_FREEZING; + + /* Kick the backend to disconnect */ + xenbus_switch_state(dev, XenbusStateClosing); + + /* We don't want to move forward before the frontend is diconnected + * from the backend cleanly. + */ + timeout = wait_for_completion_timeout(&info->wait_backend_disconnected, + timeout); + if (!timeout) { + err = -EBUSY; + xenbus_dev_error(dev, err, "Freezing timed out;" + "the device may become inconsistent state"); + return err; + } + + /* Tear down queues */ + xennet_disconnect_backend(info); + xennet_destroy_queues(info); + + info->freeze_state = NETIF_FREEZE_STATE_FROZEN; + + return err; +} + +static int netfront_restore(struct xenbus_device *dev) +{ + /* Kick the backend to re-connect */ + xenbus_switch_state(dev, XenbusStateInitialising); + + return 0; +} + /* Common code used when first setting up, and when resuming. */ static int talk_to_netback(struct xenbus_device *dev, struct netfront_info *info) @@ -2446,6 +2524,13 @@ static int xennet_connect(struct net_device *dev) device_unregister(&np->xbdev->dev); return err; } + } else { + /* + * In the resume / thaw case, the netif needs to be + * reattached, as it was detached in netfront_freeze(). 
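Taken together, netfront_freeze() and the XenbusStateClosed branch added to netback_changed() form a plain completion handshake: the PM path arms wait_backend_disconnected and blocks with a timeout, while the xenbus otherend_changed callback completes it once the backend reports Closed. Reduced to its essentials (the struct and function names below are illustrative, not the driver's):

/*
 * Minimal reduction of the freeze handshake; xenbus plumbing omitted.
 * backend_disconnected is assumed to have been init_completion()'d at
 * probe time, as wait_backend_disconnected is in xennet_create_dev().
 */
#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/types.h>

struct demo_frontend {
	struct completion backend_disconnected;
	bool freezing;
};

static int demo_freeze(struct demo_frontend *fe, unsigned int timeout_secs)
{
	fe->freezing = true;
	/* ... xenbus_switch_state(dev, XenbusStateClosing) goes here ... */
	if (!wait_for_completion_timeout(&fe->backend_disconnected,
					 timeout_secs * HZ))
		return -EBUSY;	/* backend never reached Closed in time */
	return 0;
}

/* Called from the otherend_changed handler once the backend is Closed. */
static void demo_backend_closed(struct demo_frontend *fe)
{
	if (fe->freezing)
		complete(&fe->backend_disconnected);
}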
+ */ + if (np->freeze_state == NETIF_FREEZE_STATE_FROZEN) + netif_device_attach(dev); } rtnl_lock(); @@ -2475,6 +2560,8 @@ static int xennet_connect(struct net_device *dev) spin_unlock_bh(&queue->rx_lock); } + np->freeze_state = NETIF_FREEZE_STATE_UNFROZEN; + return 0; } @@ -2512,10 +2599,22 @@ static void netback_changed(struct xenbus_device *dev, break; case XenbusStateClosed: - if (dev->state == XenbusStateClosed) + if (dev->state == XenbusStateClosed) { + /* dpm context is waiting for the backend */ + if (np->freeze_state == NETIF_FREEZE_STATE_FREEZING) + complete(&np->wait_backend_disconnected); break; + } fallthrough; /* Missed the backend's CLOSING state */ case XenbusStateClosing: + /* We may see unexpected Closed or Closing from the backend. + * Just ignore it not to prevent the frontend from being + * re-connected in the case of PM suspend or hibernation. + */ + if (np->freeze_state == NETIF_FREEZE_STATE_FROZEN && + dev->state == XenbusStateInitialising) { + break; + } xenbus_frontend_closed(dev); break; } @@ -2677,6 +2776,9 @@ static struct xenbus_driver netfront_driver = { .probe = netfront_probe, .remove = xennet_remove, .resume = netfront_resume, + .freeze = netfront_freeze, + .thaw = netfront_restore, + .restore = netfront_restore, .otherend_changed = netback_changed, }; diff --git a/drivers/of/device.c b/drivers/of/device.c index ce225d2590b54..0cc46dec4107f 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -8,6 +8,7 @@ #include #include /* for bus_dma_region */ #include +#include #include #include #include @@ -216,6 +217,11 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, arch_setup_dma_ops(dev, dma_start, size, iommu, coherent); +#ifdef CONFIG_DMA_PAGE_TOUCHING + if (!dev->dma_ops) + setup_dma_page_touching_ops(dev); +#endif + if (!iommu) of_dma_set_restricted_buffer(dev, np); diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 34877a1f43a15..2d5083003987f 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -28,7 +28,6 @@ void pci_msi_teardown_msi_irqs(struct pci_dev *dev) msi_domain_free_irqs_descs_locked(domain, &dev->dev); else pci_msi_legacy_teardown_msi_irqs(dev); - msi_free_msi_descs(&dev->dev); } /** diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index fdd2ec09651e9..5474dd02b3eb0 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -224,8 +224,18 @@ EXPORT_SYMBOL_GPL(pci_write_msi_msg); static void free_msi_irqs(struct pci_dev *dev) { + struct msi_desc *desc; + int i; + + msi_for_each_desc(desc, &dev->dev, MSI_DESC_NOTASSOCIATED) + if (desc->irq) + for (i = 0; i < desc->nvec_used; i++) + BUG_ON(irq_has_action(desc->irq + i)); + pci_msi_teardown_msi_irqs(dev); + msi_free_msi_descs(&dev->dev); + if (dev->msix_base) { iounmap(dev->msix_base); dev->msix_base = NULL; diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 5cfabd5376cc2..c1885029397c5 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -60,6 +60,8 @@ source "drivers/staging/board/Kconfig" source "drivers/staging/gdm724x/Kconfig" +source "drivers/staging/lustrefsx/Kconfig" + source "drivers/staging/fbtft/Kconfig" source "drivers/staging/most/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index f8c3aa9c24182..e28ee59ab6681 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_USB_EMXX) += emxx_udc/ obj-$(CONFIG_MFD_NVEC) += nvec/ obj-$(CONFIG_STAGING_BOARD) += board/ obj-$(CONFIG_LTE_GDM724X) += 
gdm724x/ +obj-$(CONFIG_LUSTREFSX_LNET) += lustrefsx/ obj-$(CONFIG_FB_TFT) += fbtft/ obj-$(CONFIG_MOST) += most/ obj-$(CONFIG_KS7010) += ks7010/ diff --git a/drivers/staging/lustrefsx/Kconfig b/drivers/staging/lustrefsx/Kconfig new file mode 100644 index 0000000000000..81e9bc1043d76 --- /dev/null +++ b/drivers/staging/lustrefsx/Kconfig @@ -0,0 +1,3 @@ +source "drivers/staging/lustrefsx/libcfs/Kconfig" +source "drivers/staging/lustrefsx/lnet/Kconfig" +source "drivers/staging/lustrefsx/lustre/Kconfig" diff --git a/drivers/staging/lustrefsx/Makefile b/drivers/staging/lustrefsx/Makefile new file mode 100644 index 0000000000000..20c7929213c3f --- /dev/null +++ b/drivers/staging/lustrefsx/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += lnet/ +obj-$(CONFIG_LUSTREFSX_FS) += lustre/ +obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs/ diff --git a/drivers/staging/lustrefsx/Makefile.rules b/drivers/staging/lustrefsx/Makefile.rules new file mode 100644 index 0000000000000..2336cfb1c80ae --- /dev/null +++ b/drivers/staging/lustrefsx/Makefile.rules @@ -0,0 +1,8 @@ +ccflags-y += -include $(srctree)/drivers/staging/lustrefsx/undef.h +ccflags-y += -include $(srctree)/drivers/staging/lustrefsx/config.h +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/libcfs/include +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lnet/include +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lnet/include/uapi +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lustre/include +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lustre/include/uapi +ccflags-y += -Wno-format-truncation -Werror diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h new file mode 100644 index 0000000000000..7baa3cd739b71 --- /dev/null +++ b/drivers/staging/lustrefsx/config.h @@ -0,0 +1,1258 @@ +/* config.h. Generated from config.h.in by configure. */ +/* config.h.in. Generated from configure.ac by autoheader. 
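This header freezes the answers that Lustre's autoconf feature probes would normally compute at build time; lustrefsx sources then pick code paths with ordinary preprocessor checks on the HAVE_* macros defined below. A hypothetical consumer of one such macro (the wrapper name is invented, the macro is taken from this file) looks like:

/* Illustrative only: the compat pattern driven by this header. */
#include <linux/kref.h>

static inline unsigned int demo_kref_count(struct kref *kref)
{
#ifdef HAVE_KREF_READ
	return kref_read(kref);			/* newer kernels */
#else
	return atomic_read(&kref->refcount);	/* older struct kref layout */
#endif
}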
*/ + +/* enable libcfs CDEBUG, CWARN */ +#define CDEBUG_ENABLED 1 + +/* enable libcfs ENTRY/EXIT */ +#define CDEBUG_ENTRY_EXIT 1 + +/* enable page state tracking code */ +/* #undef CONFIG_DEBUG_PAGESTATE_TRACKING */ + +/* enable encryption for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_ENCRYPTION */ + +/* posix acls for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_POSIX_ACL */ + +/* enable rw access for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_RW */ + +/* fs security for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_SECURITY */ + +/* extened attributes for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_XATTR */ + +/* embedded llcrypt */ +#define CONFIG_LL_ENCRYPTION 1 + +/* enable invariant checking */ +/* #undef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ + +/* enable lu_ref reference tracking code */ +/* #undef CONFIG_LUSTRE_DEBUG_LU_REF */ + +/* Use the Pinger */ +#define CONFIG_LUSTRE_FS_PINGER 1 + +/* Enable POSIX acl */ +#define CONFIG_LUSTRE_FS_POSIX_ACL 1 + +/* name of ldiskfs debug program */ +#define DEBUGFS "debugfs" + +/* name of ldiskfs dump program */ +#define DUMPE2FS "dumpe2fs" + +/* name of ldiskfs fsck program */ +#define E2FSCK "e2fsck" + +/* name of ldiskfs e2fsprogs package */ +#define E2FSPROGS "e2fsprogs" + +/* name of ldiskfs label program */ +#define E2LABEL "e2label" + +/* do data checksums */ +#define ENABLE_CHECKSUM 1 + +/* enable flock by default */ +#define ENABLE_FLOCK 1 + +/* filldir_t return type is bool or int */ +#define FILLDIR_TYPE bool + +/* rhashtable_walk_init() has 3 args */ +/* #undef HAVE_3ARG_RHASHTABLE_WALK_INIT */ + +/* account_page_dirtied takes three arguments */ +/* #undef HAVE_ACCOUNT_PAGE_DIRTIED_3ARGS */ + +/* account_page_dirtied is exported */ +/* #undef HAVE_ACCOUNT_PAGE_DIRTIED_EXPORT */ + +/* 'get_acl' and 'set_acl' use dentry argument */ +/* #undef HAVE_ACL_WITH_DENTRY */ + +/* aes-sha2 is supported by krb5 */ +/* #undef HAVE_AES_SHA2_SUPPORT */ + +/* aio_complete defined */ +/* #undef HAVE_AIO_COMPLETE */ + +/* 'alloc_file_pseudo' exist */ +#define HAVE_ALLOC_FILE_PSEUDO 1 + +/* alloc_inode_sb() exists */ +#define HAVE_ALLOC_INODE_SB 1 + +/* struct address_space_operations() has migrate_folio() */ +#define HAVE_AOPS_MIGRATE_FOLIO 1 + +/* struct address_space_operations() has read_folio() */ +#define HAVE_AOPS_READ_FOLIO 1 + +/* struct address_space_operations() has release_folio() */ +#define HAVE_AOPS_RELEASE_FOLIO 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_ASM_TYPES_H 1 + +/* backing_dev_info exist */ +/* #undef HAVE_BACKING_DEV_INFO */ + +/* BDI_CAP_MAP_COPY exist */ +/* #undef HAVE_BDI_CAP_MAP_COPY */ + +/* backing_dev_info has io_pages */ +#define HAVE_BDI_IO_PAGES 1 + +/* struct bio has bi_phys_segments member */ +/* #undef HAVE_BIO_BI_PHYS_SEGMENTS */ + +/* bio_endio takes only one argument */ +#define HAVE_BIO_ENDIO_USES_ONE_ARG 1 + +/* 'bio_integrity_enabled' is available */ +/* #undef HAVE_BIO_INTEGRITY_ENABLED */ + +/* kernel has bio_integrity_prep_fn */ +/* #undef HAVE_BIO_INTEGRITY_PREP_FN */ + +/* bio_integrity_prep_fn returns bool */ +#define HAVE_BIO_INTEGRITY_PREP_FN_RETURNS_BOOL 1 + +/* 'bio_set_dev' is available */ +#define HAVE_BIO_SET_DEV 1 + +/* bio_integrity_payload.bip_iter exist */ +#define HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD 1 + +/* Linux bitmap can be allocated */ +#define HAVE_BITMAP_ALLOC 1 + +/* 'bi_bdev' is available */ +#define HAVE_BI_BDEV 1 + +/* struct bio has bi_opf */ +#define HAVE_BI_OPF 1 + +/* 'bi_status' is available */ +#define HAVE_BI_STATUS 1 + +/* kernel has struct blk_integrity_iter */ +#define HAVE_BLK_INTEGRITY_ITER 1 + +/* kernel hash_64() is broken */ +/* #undef HAVE_BROKEN_HASH_64 */ + +/* kernel has struct bvec_iter */ +#define HAVE_BVEC_ITER 1 + +/* if bvec_iter_all exists for multi-page bvec iternation */ +#define HAVE_BVEC_ITER_ALL 1 + +/* struct cache_detail has writers */ +#define HAVE_CACHE_DETAIL_WRITERS 1 + +/* if cache_detail->hash_lock is a spinlock */ +#define HAVE_CACHE_HASH_SPINLOCK 1 + +/* cache_head has hlist cache_list */ +#define HAVE_CACHE_HEAD_HLIST 1 + +/* crypto/internal/cipher.h is present */ +#define HAVE_CIPHER_H 1 + +/* kernel has clean_bdev_aliases */ +#define HAVE_CLEAN_BDEV_ALIASES 1 + +/* 'clear_and_wake_up_bit' is available */ +#define HAVE_CLEAR_AND_WAKE_UP_BIT 1 + +/* compat rdma found */ +/* #undef HAVE_COMPAT_RDMA */ + +/* copy_file_range() is supported */ +#define HAVE_COPY_FILE_RANGE 1 + +/* 'cpus_read_lock' exist */ +#define HAVE_CPUS_READ_LOCK 1 + +/* crypto_alloc_skcipher is defined */ +#define HAVE_CRYPTO_ALLOC_SKCIPHER 1 + +/* crypto hash helper functions are available */ +#define HAVE_CRYPTO_HASH_HELPERS 1 + +/* 'CRYPTO_MAX_ALG_NAME' is 128 */ +#define HAVE_CRYPTO_MAX_ALG_NAME_128 1 + +/* crypto/sha2.h is present */ +#define HAVE_CRYPTO_SHA2_HEADER 1 + +/* current_time() has replaced CURRENT_TIME */ +#define HAVE_CURRENT_TIME 1 + +/* Have db_dirty_records list_t */ +/* #undef HAVE_DB_DIRTY_RECORDS_LIST */ + +/* default_file_splice_read is exported */ +/* #undef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT */ + +/* delete_from_page_cache is exported */ +/* #undef HAVE_DELETE_FROM_PAGE_CACHE */ + +/* dentry.d_child exist */ +#define HAVE_DENTRY_D_CHILD 1 + +/* list dentry.d_u.d_alias exist */ +#define HAVE_DENTRY_D_U_D_ALIAS 1 + +/* DES3 enctype is supported by krb5 */ +/* #undef HAVE_DES3_SUPPORT */ + +/* direct_IO has 2 arguments */ +#define HAVE_DIRECTIO_2ARGS 1 + +/* direct IO uses iov_iter */ +/* #undef HAVE_DIRECTIO_ITER */ + +/* address_spaace_operaions->dirty_folio() member exists */ +#define HAVE_DIRTY_FOLIO 1 + +/* dir_context exist */ +#define HAVE_DIR_CONTEXT 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_DLFCN_H 1 + +/* Have dmu_object_alloc_dnsize in ZFS */ +/* #undef HAVE_DMU_OBJECT_ALLOC_DNSIZE */ + +/* Have dmu_objset_disown() with 3 args */ +/* #undef HAVE_DMU_OBJSET_DISOWN_3ARG */ + +/* Have dmu_objset_own() with 6 args */ +/* #undef HAVE_DMU_OBJSET_OWN_6ARG */ + +/* Have dmu_offset_next() exported */ +/* #undef HAVE_DMU_OFFSET_NEXT */ + +/* Have 6 argument dmu_pretch in ZFS */ +/* #undef HAVE_DMU_PREFETCH_6ARG */ + +/* Have dmu_read_by_dnode() in ZFS */ +/* #undef HAVE_DMU_READ_BY_DNODE */ + +/* Have dmu_tx_hold_write_by_dnode() in ZFS */ +/* #undef HAVE_DMU_TX_HOLD_WRITE_BY_DNODE */ + +/* Have dmu_tx_hold_zap_by_dnode() in ZFS */ +/* #undef HAVE_DMU_TX_HOLD_ZAP_BY_DNODE */ + +/* Have dmu_tx_mark_netfree */ +/* #undef HAVE_DMU_TX_MARK_NETFREE */ + +/* Have native dnode accounting in ZFS */ +/* #undef HAVE_DMU_USEROBJ_ACCOUNTING */ + +/* Have dmu_write_by_dnode() in ZFS */ +/* #undef HAVE_DMU_WRITE_BY_DNODE */ + +/* down_write_killable function exists */ +#define HAVE_DOWN_WRITE_KILLABLE 1 + +/* quotactl_ops.set_dqblk takes struct kqid */ +#define HAVE_DQUOT_KQID 1 + +/* quotactl_ops.set_dqblk takes struct qc_dqblk */ +#define HAVE_DQUOT_QC_DQBLK 1 + +/* dquot_transfer() has user_ns argument */ +#define HAVE_DQUOT_TRANSFER_WITH_USER_NS 1 + +/* Have dsl_pool_config_enter/exit in ZFS */ +/* #undef HAVE_DSL_POOL_CONFIG */ + +/* Have dsl_sync_task_do_nowait in ZFS */ +/* #undef HAVE_DSL_SYNC_TASK_DO_NOWAIT */ + +/* d_compare need 4 arguments */ +#define HAVE_D_COMPARE_4ARGS 1 + +/* d_compare need 5 arguments */ +/* #undef HAVE_D_COMPARE_5ARGS */ + +/* d_count exist */ +#define HAVE_D_COUNT 1 + +/* 'd_init' exists */ +#define HAVE_D_INIT 1 + +/* d_in_lookup is defined */ +#define HAVE_D_IN_LOOKUP 1 + +/* 'd_is_positive' is available */ +#define HAVE_D_IS_POSITIVE 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ENDIAN_H 1 + +/* ethtool_link_settings is defined */ +#define HAVE_ETHTOOL_LINK_SETTINGS 1 + +/* Define to 1 if you have the header file. 
*/ +/* #undef HAVE_EXT2FS_EXT2FS_H */ + +/* ext4_bread takes 4 arguments */ +/* #undef HAVE_EXT4_BREAD_4ARGS */ + +/* ext4_(inc|dec)_count() has 2 arguments */ +/* #undef HAVE_EXT4_INC_DEC_COUNT_2ARGS */ + +/* i_dquot is in ext4_inode_info */ +/* #undef HAVE_EXT4_INFO_DQUOT */ + +/* ext4_free_blocks do not require struct buffer_head */ +/* #undef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD */ + +/* file handle and related syscalls are supported */ +#define HAVE_FHANDLE_GLIBC_SUPPORT 1 + +/* union is unnamed */ +/* #undef HAVE_FID2PATH_ANON_UNIONS */ + +/* filemap_get_folios_contig() is available */ +#define HAVE_FILEMAP_GET_FOLIOS_CONTIG 1 + +/* kernel has file_dentry */ +#define HAVE_FILE_DENTRY 1 + +/* file_operations.[read|write]_iter functions exist */ +#define HAVE_FILE_OPERATIONS_READ_WRITE_ITER 1 + +/* filldir_t needs struct dir_context as argument */ +#define HAVE_FILLDIR_USE_CTX 1 + +/* filldir_t needs struct dir_context and returns bool */ +#define HAVE_FILLDIR_USE_CTX_RETURN_BOOL 1 + +/* FMR pool API is available */ +/* #undef HAVE_FMR_POOL_API */ + +/* file_operations has iterate_shared */ +#define HAVE_FOP_ITERATE_SHARED 1 + +/* force_sig() has task parameter */ +/* #undef HAVE_FORCE_SIG_WITH_TASK */ + +/* 'struct fscrypt_digested_name' exists */ +/* #undef HAVE_FSCRYPT_DIGESTED_NAME */ + +/* embedded llcrypt uses llcrypt_dummy_context_enabled() */ +#define HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED 1 + +/* fscrypt_is_nokey_name() exists */ +#define HAVE_FSCRYPT_IS_NOKEY_NAME 1 + +/* full_name_hash need 3 arguments */ +#define HAVE_FULL_NAME_HASH_3ARGS 1 + +/* generic_write_sync has 2 arguments */ +#define HAVE_GENERIC_WRITE_SYNC_2ARGS 1 + +/* struct genl_dumpit_info has family field */ +#define HAVE_GENL_DUMPIT_INFO 1 + +/* Define to 1 if you have the `gethostbyname' function. 
*/ +#define HAVE_GETHOSTBYNAME 1 + +/* 'get_acl' has a rcu argument */ +#define HAVE_GET_ACL_RCU_ARG 1 + +/* get_inode_usage function exists */ +#define HAVE_GET_INODE_USAGE 1 + +/* get_random_[u32|u64] are available */ +#define HAVE_GET_RANDOM_U32_AND_U64 1 + +/* get_random_u32_below() is available */ +#define HAVE_GET_RANDOM_U32_BELOW 1 + +/* get_request_key_auth() is available */ +#define HAVE_GET_REQUEST_KEY_AUTH 1 + +/* get_user_pages takes 6 arguments */ +/* #undef HAVE_GET_USER_PAGES_6ARG */ + +/* get_user_pages takes gup_flags in arguments */ +#define HAVE_GET_USER_PAGES_GUP_FLAGS 1 + +/* glob_match() is available */ +#define HAVE_GLOB 1 + +/* grab_cache_page_write_begin() has flags argument */ +/* #undef HAVE_GRAB_CACHE_PAGE_WRITE_BEGIN_WITH_FLAGS */ + +/* struct group_info has member gid */ +#define HAVE_GROUP_INFO_GID 1 + +/* Define this is if you enable gss */ +/* #undef HAVE_GSS */ + +/* Define this if you enable gss keyring backend */ +#define HAVE_GSS_KEYRING 1 + +/* Define this if the Kerberos GSS library supports gss_krb5_ccache_name */ +/* #undef HAVE_GSS_KRB5_CCACHE_NAME */ + +/* '__rhashtable_insert_fast()' returns int */ +/* #undef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT */ + +/* Define this if you have Heimdal Kerberos libraries */ +/* #undef HAVE_HEIMDAL */ + +/* hlist_add_after is available */ +/* #undef HAVE_HLIST_ADD_AFTER */ + +/* hotplug state machine is supported */ +#define HAVE_HOTPLUG_STATE_MACHINE 1 + +/* hypervisor_is_type function exists */ +#define HAVE_HYPERVISOR_IS_TYPE 1 + +/* ib_alloc_fast_reg_mr is defined */ +/* #undef HAVE_IB_ALLOC_FAST_REG_MR */ + +/* ib_alloc_pd has 2 arguments */ +#define HAVE_IB_ALLOC_PD_2ARGS 1 + +/* struct ib_cq_init_attr is used by ib_create_cq */ +#define HAVE_IB_CQ_INIT_ATTR 1 + +/* struct ib_device.attrs is defined */ +#define HAVE_IB_DEVICE_ATTRS 1 + +/* if struct ib_device_ops is defined */ +/* #undef HAVE_IB_DEVICE_OPS */ + +/* ib_get_dma_mr is defined */ +/* #undef HAVE_IB_GET_DMA_MR */ + +/* function ib_inc_rkey exist */ +#define HAVE_IB_INC_RKEY 1 + +/* ib_map_mr_sg exists */ +#define HAVE_IB_MAP_MR_SG 1 + +/* ib_map_mr_sg has 5 arguments */ +#define HAVE_IB_MAP_MR_SG_5ARGS 1 + +/* ib_post_send and ib_post_recv have const parameters */ +#define HAVE_IB_POST_SEND_RECV_CONST 1 + +/* struct ib_rdma_wr is defined */ +#define HAVE_IB_RDMA_WR 1 + +/* if ib_sg_dma_address wrapper exists */ +/* #undef HAVE_IB_SG_DMA_ADDRESS */ + +/* inode_operations .getattr member function can gather advance stats */ +/* #undef HAVE_INODEOPS_ENHANCED_GETATTR */ + +/* inode_lock is defined */ +#define HAVE_INODE_LOCK 1 + +/* inode times are using timespec64 */ +#define HAVE_INODE_TIMESPEC64 1 + +/* blk_integrity.interval exist */ +/* #undef HAVE_INTERVAL_BLK_INTEGRITY */ + +/* blk_integrity.interval_exp exist */ +#define HAVE_INTERVAL_EXP_BLK_INTEGRITY 1 + +/* interval trees use rb_tree_cached */ +#define HAVE_INTERVAL_TREE_CACHED 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_INTTYPES_H 1 + +/* address_spaace_operaions->invalidate_folio() member exists */ +#define HAVE_INVALIDATE_FOLIO 1 + +/* address_space invalidate_lock member exists */ +#define HAVE_INVALIDATE_LOCK 1 + +/* address_space_operations.invalidatepage needs 3 arguments */ +/* #undef HAVE_INVALIDATE_RANGE */ + +/* have in_compat_syscall */ +#define HAVE_IN_COMPAT_SYSCALL 1 + +/* 'in_dev_for_each_ifa_rtnl' is defined */ +#define HAVE_IN_DEV_FOR_EACH_IFA_RTNL 1 + +/* inode_operations->rename need flags as argument */ +/* #undef HAVE_IOPS_RENAME_WITH_FLAGS */ + +/* generic_readlink has been removed */ +/* #undef HAVE_IOP_GENERIC_READLINK */ + +/* have iop get_link */ +#define HAVE_IOP_GET_LINK 1 + +/* inode_operations has .set_acl member function */ +#define HAVE_IOP_SET_ACL 1 + +/* inode_operations has {get,set,remove}xattr members */ +/* #undef HAVE_IOP_XATTR */ + +/* iov_iter_get_pages_alloc2() is available */ +#define HAVE_IOV_ITER_GET_PAGES_ALLOC2 1 + +/* if iov_iter has member iter_type */ +#define HAVE_IOV_ITER_HAS_ITER_TYPE_MEMBER 1 + +/* if iov_iter has member type */ +/* #undef HAVE_IOV_ITER_HAS_TYPE_MEMBER */ + +/* iov_iter_init handles directional tag */ +#define HAVE_IOV_ITER_INIT_DIRECTION 1 + +/* iov_iter_rw exist */ +#define HAVE_IOV_ITER_RW 1 + +/* iov_iter_truncate exists */ +#define HAVE_IOV_ITER_TRUNCATE 1 + +/* if iov_iter_type exists */ +#define HAVE_IOV_ITER_TYPE 1 + +/* is_root_inode defined */ +#define HAVE_IS_ROOT_INODE 1 + +/* 'iter_file_splice_write' exists */ +#define HAVE_ITER_FILE_SPLICE_WRITE 1 + +/* struct address_space has i_pages */ +#define HAVE_I_PAGES 1 + +/* if jbd2_journal_get_max_txn_bufs is available */ +/* #undef HAVE_JBD2_JOURNAL_GET_MAX_TXN_BUFS */ + +/* struct jbd2_journal_handle has h_total_credits member */ +/* #undef HAVE_JOURNAL_TOTAL_CREDITS */ + +/* kallsyms_lookup_name is exported by kernel */ +/* #undef HAVE_KALLSYMS_LOOKUP_NAME */ + +/* 'kernel_param_[un]lock' is available */ +#define HAVE_KERNEL_PARAM_LOCK 1 + +/* 'struct kernel_param_ops' is available */ +#define HAVE_KERNEL_PARAM_OPS 1 + +/* kernel_read() signature ends with loff_t *pos */ +#define HAVE_KERNEL_READ_LAST_POSP 1 + +/* kernel_setsockopt still in use */ +/* #undef HAVE_KERNEL_SETSOCKOPT */ + +/* 'getname' has two args */ +#define HAVE_KERN_SOCK_GETNAME_2ARGS 1 + +/* keyring_search has 4 args */ +#define HAVE_KEYRING_SEARCH_4ARGS 1 + +/* struct key_match_data exist */ +#define HAVE_KEY_MATCH_DATA 1 + +/* payload.data is an array */ +#define HAVE_KEY_PAYLOAD_DATA_ARRAY 1 + +/* key_type->instantiate has two args */ +/* #undef HAVE_KEY_TYPE_INSTANTIATE_2ARGS */ + +/* key.usage is of type refcount_t */ +#define HAVE_KEY_USAGE_REFCOUNT 1 + +/* kfree_sensitive() is available. 
*/ +#define HAVE_KFREE_SENSITIVE 1 + +/* kiocb->ki_complete() has 2 arguments */ +#define HAVE_KIOCB_COMPLETE_2ARGS 1 + +/* ki_left exist */ +/* #undef HAVE_KIOCB_KI_LEFT */ + +/* ki_nbytes field exist */ +/* #undef HAVE_KI_NBYTES */ + +/* kmap_to_page is exported by the kernel */ +/* #undef HAVE_KMAP_TO_PAGE */ + +/* struct kobj_type has 'default_groups' member */ +#define HAVE_KOBJ_TYPE_DEFAULT_GROUPS 1 + +/* Define this if you have MIT Kerberos libraries */ +/* #undef HAVE_KRB5 */ + +/* Define this if the function krb5int_derive_key is available */ +/* #undef HAVE_KRB5INT_DERIVE_KEY */ + +/* Define this if the function krb5_derive_key is available */ +/* #undef HAVE_KRB5_DERIVE_KEY */ + +/* Define this if the function krb5_get_error_message is available */ +/* #undef HAVE_KRB5_GET_ERROR_MESSAGE */ + +/* Define this if the function krb5_get_init_creds_opt_set_addressless is + available */ +/* #undef HAVE_KRB5_GET_INIT_CREDS_OPT_SET_ADDRESSLESS */ + +/* kref_read() is available */ +#define HAVE_KREF_READ 1 + +/* kset_find_obj is exported by the kernel */ +#define HAVE_KSET_FIND_OBJ 1 + +/* kernel has kstrtobool_from_user */ +#define HAVE_KSTRTOBOOL_FROM_USER 1 + +/* kthread_worker found */ +/* #undef HAVE_KTHREAD_WORK */ + +/* ktime_add is available */ +#define HAVE_KTIME_ADD 1 + +/* ktime_after is available */ +#define HAVE_KTIME_AFTER 1 + +/* ktime_before is available */ +#define HAVE_KTIME_BEFORE 1 + +/* ktime_compare is available */ +#define HAVE_KTIME_COMPARE 1 + +/* 'ktime_get_real_seconds' is available */ +#define HAVE_KTIME_GET_REAL_SECONDS 1 + +/* 'ktime_get_real_ts64' is available */ +#define HAVE_KTIME_GET_REAL_TS64 1 + +/* 'ktime_get_seconds' is available */ +#define HAVE_KTIME_GET_SECONDS 1 + +/* 'ktime_get_ts64' is available */ +#define HAVE_KTIME_GET_TS64 1 + +/* 'ktime_ms_delta' is available */ +#define HAVE_KTIME_MS_DELTA 1 + +/* 'ktime_to_timespec64' is available */ +#define HAVE_KTIME_TO_TIMESPEC64 1 + +/* ldiskfsfs_dirhash takes an inode argument */ +/* #undef HAVE_LDISKFSFS_GETHASH_INODE_ARG */ + +/* enable use of ldiskfsprogs package */ +/* #undef HAVE_LDISKFSPROGS */ + +/* EXT4_GET_BLOCKS_KEEP_SIZE exists */ +/* #undef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE */ + +/* if ldiskfs_iget takes a flags argument */ +/* #undef HAVE_LDISKFS_IGET_WITH_FLAGS */ + +/* 'ext4_journal_ensure_credits' exists */ +/* #undef HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS */ + +/* Enable ldiskfs osd */ +/* #undef HAVE_LDISKFS_OSD */ + +/* libefence support is requested */ +/* #undef HAVE_LIBEFENCE */ + +/* Define to 1 if you have the `keyutils' library (-lkeyutils). 
*/ +#define HAVE_LIBKEYUTILS 1 + +/* use libpthread for libcfs library */ +#define HAVE_LIBPTHREAD 1 + +/* readline library is available */ +/* #undef HAVE_LIBREADLINE */ + +/* linux/blk-integrity.h is present */ +#define HAVE_LINUX_BLK_INTEGRITY_HEADER 1 + +/* linux/fortify-string.h header available */ +#define HAVE_LINUX_FORTIFY_STRING_HEADER 1 + +/* linux/stdarg.h is present */ +#define HAVE_LINUX_STDARG_HEADER 1 + +/* list_cmp_func_t type is defined */ +#define HAVE_LIST_CMP_FUNC_T 1 + +/* lock_manager_operations has lm_compare_owner */ +/* #undef HAVE_LM_COMPARE_OWNER */ + +/* kernel has locks_lock_file_wait */ +#define HAVE_LOCKS_LOCK_FILE_WAIT 1 + +/* lock_page_memcg is defined */ +#define HAVE_LOCK_PAGE_MEMCG 1 + +/* lookup_user_key() is available */ +#define HAVE_LOOKUP_USER_KEY 1 + +/* Enable lru resize support */ +#define HAVE_LRU_RESIZE_SUPPORT 1 + +/* lsmcontext_init is available */ +/* #undef HAVE_LSMCONTEXT_INIT */ + +/* Define this if the Kerberos GSS library supports + gss_krb5_export_lucid_sec_context */ +/* #undef HAVE_LUCID_CONTEXT_SUPPORT */ + +/* Enable Lustre client crypto via embedded llcrypt */ +#define HAVE_LUSTRE_CRYPTO 1 + +/* enum mapping_flags has AS_EXITING flag */ +#define HAVE_MAPPING_AS_EXITING_FLAG 1 + +/* match_wildcard() is available */ +#define HAVE_MATCH_WILDCARD 1 + +/* memalloc_noreclaim_{save,restore}() is supported */ +#define HAVE_MEMALLOC_RECLAIM 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* mmap_lock API is available. */ +#define HAVE_MMAP_LOCK 1 + +/* kernel module loading is possible */ +#define HAVE_MODULE_LOADING_SUPPORT 1 + +/* Define to 1 if you have the `name_to_handle_at' function. */ +#define HAVE_NAME_TO_HANDLE_AT 1 + +/* support native Linux client */ +/* #undef HAVE_NATIVE_LINUX_CLIENT */ + +/* Define to 1 if you have the header file. */ +#define HAVE_NETDB_H 1 + +/* struct genl_ops has 'start' callback */ +#define HAVE_NETLINK_CALLBACK_START 1 + +/* DEFINE_TIMER uses only 2 arguements */ +#define HAVE_NEW_DEFINE_TIMER 1 + +/* 'kernel_write' aligns with read/write helpers */ +#define HAVE_NEW_KERNEL_WRITE 1 + +/* libnl3 supports nla_get_s32 */ +#define HAVE_NLA_GET_S32 1 + +/* libnl3 supports nla_get_s64 */ +#define HAVE_NLA_GET_S64 1 + +/* 'nla_strdup' is available */ +#define HAVE_NLA_STRDUP 1 + +/* 'nla_strlcpy' is available */ +/* #undef HAVE_NLA_STRLCPY */ + +/* netlink_ext_ack is handled for Netlink dump handlers */ +#define HAVE_NL_DUMP_WITH_EXT_ACK 1 + +/* netlink_ext_ack is an argument to nla_parse type function */ +#define HAVE_NL_PARSE_WITH_EXT_ACK 1 + +/* no_llseek() is available */ +/* #undef HAVE_NO_LLSEEK */ + +/* NR_UNSTABLE_NFS is still in use. 
*/ +/* #undef HAVE_NR_UNSTABLE_NFS */ + +/* ns_to_timespec64() is available */ +#define HAVE_NS_TO_TIMESPEC64 1 + +/* with oldsize */ +/* #undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE */ + +/* OpenSSL EVP_PKEY_get_params */ +/* #undef HAVE_OPENSSL_EVP_PKEY */ + +/* openssl-devel is present */ +/* #undef HAVE_OPENSSL_GETSEPOL */ + +/* OpenSSL HMAC functions needed for SSK */ +/* #undef HAVE_OPENSSL_SSK */ + +/* if Oracle OFED Extensions are enabled */ +/* #undef HAVE_ORACLE_OFED_EXTENSIONS */ + +/* 'pagevec_init' takes one parameter */ +#define HAVE_PAGEVEC_INIT_ONE_PARAM 1 + +/* linux/panic_notifier.h is present */ +#define HAVE_PANIC_NOTIFIER_H 1 + +/* 'param_set_uint_minmax' is available */ +#define HAVE_PARAM_SET_UINT_MINMAX 1 + +/* percpu_counter_init uses GFP_* flag */ +#define HAVE_PERCPU_COUNTER_INIT_GFP_FLAG 1 + +/* 'struct nsproxy' has 'pid_ns_for_children' */ +#define HAVE_PID_NS_FOR_CHILDREN 1 + +/* 'posix_acl_update_mode' is available */ +/* #undef HAVE_POSIX_ACL_UPDATE_MODE */ + +/* posix_acl_valid takes struct user_namespace */ +#define HAVE_POSIX_ACL_VALID_USER_NS 1 + +/* 'prepare_to_wait_event' is available */ +#define HAVE_PREPARE_TO_WAIT_EVENT 1 + +/* processor.h is present */ +#define HAVE_PROCESSOR_H 1 + +/* struct proc_ops exists */ +#define HAVE_PROC_OPS 1 + +/* get_projid function exists */ +#define HAVE_PROJECT_QUOTA 1 + +/* 'PTR_ERR_OR_ZERO' exist */ +#define HAVE_PTR_ERR_OR_ZERO 1 + +/* If available, contains the Python version number currently in use. */ +#define HAVE_PYTHON "3.9" + +/* radix_tree_tag_set exists */ +#define HAVE_RADIX_TREE_TAG_SET 1 + +/* rdma_connect_locked is defined */ +#define HAVE_RDMA_CONNECT_LOCKED 1 + +/* rdma_create_id wants 4 args */ +/* #undef HAVE_RDMA_CREATE_ID_4ARG */ + +/* rdma_create_id wants 5 args */ +#define HAVE_RDMA_CREATE_ID_5ARG 1 + +/* rdma_reject has 4 arguments */ +#define HAVE_RDMA_REJECT_4ARGS 1 + +/* read_cache_page() filler_t needs struct file */ +#define HAVE_READ_CACHE_PAGE_WANTS_FILE 1 + +/* refcount_t is supported */ +#define HAVE_REFCOUNT_T 1 + +/* register_shrinker() returns status */ +#define HAVE_REGISTER_SHRINKER_FORMAT_NAMED 1 + +/* register_shrinker() returns status */ +/* #undef HAVE_REGISTER_SHRINKER_RET */ + +/* rhashtable_lookup() is available */ +#define HAVE_RHASHTABLE_LOOKUP 1 + +/* rhashtable_lookup_get_insert_fast() is available */ +#define HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST 1 + +/* rhashtable_replace_fast() is available */ +#define HAVE_RHASHTABLE_REPLACE 1 + +/* rhashtable_walk_enter() is available */ +#define HAVE_RHASHTABLE_WALK_ENTER 1 + +/* struct rhltable exist */ +#define HAVE_RHLTABLE 1 + +/* rht_bucket_var() is available */ +#define HAVE_RHT_BUCKET_VAR 1 + +/* save_stack_trace_tsk is exported */ +/* #undef HAVE_SAVE_STACK_TRACE_TSK */ + +/* Have sa_spill_alloc in ZFS */ +/* #undef HAVE_SA_SPILL_ALLOC */ + +/* linux/sched header directory exist */ +#define HAVE_SCHED_HEADERS 1 + +/* security_dentry_init_security needs lsmcontext */ +/* #undef HAVE_SECURITY_DENTRY_INIT_SECURTY_WITH_CTX */ + +/* security_dentry_init_security() returns xattr name */ +#define HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG 1 + +/* security_release_secctx has 1 arg. */ +/* #undef HAVE_SEC_RELEASE_SECCTX_1ARG */ + +/* support for selinux */ +#define HAVE_SELINUX 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SELINUX_SELINUX_H 1 + +/* support server */ +/* #undef HAVE_SERVER_SUPPORT */ + +/* Define this if the Kerberos GSS library supports + gss_krb5_set_allowable_enctypes */ +/* #undef HAVE_SET_ALLOWABLE_ENCTYPES */ + +/* shrinker has count_objects member */ +#define HAVE_SHRINKER_COUNT 1 + +/* sk_data_ready uses only one argument */ +#define HAVE_SK_DATA_READY_ONE_ARG 1 + +/* sock_create_kern use net as first parameter */ +#define HAVE_SOCK_CREATE_KERN_USE_NET 1 + +/* Have spa_maxblocksize in ZFS */ +/* #undef HAVE_SPA_MAXBLOCKSIZE */ + +/* struct stacktrace_ops exists */ +/* #undef HAVE_STACKTRACE_OPS */ + +/* Define to 1 if you have the `statx' function. */ +#define HAVE_STATX 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* stringhash.h is present */ +#define HAVE_STRINGHASH 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strnlen' function. */ +#define HAVE_STRNLEN 1 + +/* kernel strscpy is available */ +/* #undef HAVE_STRSCPY */ + +/* struct posix_acl_xattr_{header,entry} defined */ +#define HAVE_STRUCT_POSIX_ACL_XATTR 1 + +/* submit_bio takes two arguments */ +/* #undef HAVE_SUBMIT_BIO_2ARGS */ + +/* 'super_setup_bdi_name' is available */ +#define HAVE_SUPER_SETUP_BDI_NAME 1 + +/* symlink inode operations need struct nameidata argument */ +/* #undef HAVE_SYMLINK_OPS_USE_NAMEIDATA */ + +/* new_sync_[read|write] is exported by the kernel */ +/* #undef HAVE_SYNC_READ_WRITE */ + +/* Define to 1 if you have . */ +#define HAVE_SYS_QUOTA_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* 's_uuid' is an uuid_t */ +#define HAVE_S_UUID_AS_UUID_T 1 + +/* task_is_running() is defined */ +#define HAVE_TASK_IS_RUNNING 1 + +/* 'tcp_sock_set_keepcnt()' exists */ +#define HAVE_TCP_SOCK_SET_KEEPCNT 1 + +/* 'tcp_sock_set_keepidle()' exists */ +#define HAVE_TCP_SOCK_SET_KEEPIDLE 1 + +/* 'tcp_sock_set_keepintvl()' exists */ +#define HAVE_TCP_SOCK_SET_KEEPINTVL 1 + +/* 'tcp_sock_set_nodelay()' exists */ +#define HAVE_TCP_SOCK_SET_NODELAY 1 + +/* 'tcp_sock_set_quickack()' exists */ +#define HAVE_TCP_SOCK_SET_QUICKACK 1 + +/* timer_setup has replaced setup_timer */ +#define HAVE_TIMER_SETUP 1 + +/* 'struct timespec64' is available */ +#define HAVE_TIMESPEC64 1 + +/* 'timespec64_sub' is available */ +#define HAVE_TIMESPEC64_SUB 1 + +/* 'timespec64_to_ktime' is available */ +#define HAVE_TIMESPEC64_TO_KTIME 1 + +/* topology_sibling_cpumask is available */ +#define HAVE_TOPOLOGY_SIBLING_CPUMASK 1 + +/* if totalram_pages is a function */ +#define HAVE_TOTALRAM_PAGES_AS_FUNC 1 + +/* kernel has truncate_inode_pages_final */ +#define HAVE_TRUNCATE_INODE_PAGES_FINAL 1 + +/* if MS_RDONLY was moved to uapi/linux/mount.h */ +#define HAVE_UAPI_LINUX_MOUNT_H 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_UNISTD_H 1 + +/* 'inode_operations' members have user namespace argument */ +#define HAVE_USER_NAMESPACE_ARG 1 + +/* 'enum nlmsgerr_attrs' exists */ +#define HAVE_USRSPC_NLMSGERR 1 + +/* RDMA_PS_TCP exists */ +#define HAVE_USRSPC_RDMA_PS_TCP 1 + +/* 'uuid_t' exist */ +#define HAVE_UUID_T 1 + +/* kernel has vfs_rename with 5 args */ +/* #undef HAVE_VFS_RENAME_5ARGS */ + +/* kernel has vfs_rename with 6 args */ +/* #undef HAVE_VFS_RENAME_6ARGS */ + +/* '__vfs_setxattr' is available */ +/* #undef HAVE_VFS_SETXATTR */ + +/* kernel has vfs_unlink with 3 args */ +/* #undef HAVE_VFS_UNLINK_3ARGS */ + +/* __vmalloc only takes 2 args. */ +#define HAVE_VMALLOC_2ARGS 1 + +/* virtual_address has been replaced by address field */ +#define HAVE_VM_FAULT_ADDRESS 1 + +/* if VM_FAULT_RETRY is defined */ +#define HAVE_VM_FAULT_RETRY 1 + +/* if vm_fault_t type exists */ +#define HAVE_VM_FAULT_T 1 + +/* 'struct vm_operations' remove struct vm_area_struct argument */ +#define HAVE_VM_OPS_USE_VM_FAULT_ONLY 1 + +/* wait_bit.h is present */ +#define HAVE_WAIT_BIT_HEADER_H 1 + +/* if struct wait_bit_queue_entry exists */ +#define HAVE_WAIT_BIT_QUEUE_ENTRY 1 + +/* 'wait_queue_entry_t' is available */ +#define HAVE_WAIT_QUEUE_ENTRY 1 + +/* linux wait_queue_head_t list_head is name head */ +#define HAVE_WAIT_QUEUE_ENTRY_LIST 1 + +/* 'wait_var_event' is available */ +#define HAVE_WAIT_VAR_EVENT 1 + +/* 'wait_woken, is available' */ +#define HAVE_WAIT_WOKEN 1 + +/* kernel Xarray implementation lacks 'xa_is_value' */ +#define HAVE_XARRAY_SUPPORT 1 + +/* needs inode parameter */ +/* #undef HAVE_XATTR_HANDLER_INODE_PARAM */ + +/* xattr_handler has a name member */ +#define HAVE_XATTR_HANDLER_NAME 1 + +/* handler pointer is parameter */ +/* #undef HAVE_XATTR_HANDLER_SIMPLIFIED */ + +/* Have zap_add_by_dnode() in ZFS */ +/* #undef HAVE_ZAP_ADD_BY_DNODE */ + +/* Have zap_lookup_by_dnode() in ZFS */ +/* #undef HAVE_ZAP_LOOKUP_BY_DNODE */ + +/* Have zap_remove_by_dnode() in ZFS */ +/* #undef HAVE_ZAP_REMOVE_ADD_BY_DNODE */ + +/* Have inode_timespec_t */ +/* #undef HAVE_ZFS_INODE_TIMESPEC */ + +/* Have multihost protection in ZFS */ +/* #undef HAVE_ZFS_MULTIHOST */ + +/* Enable zfs osd */ +/* #undef HAVE_ZFS_OSD */ + +/* Have zfs_refcount_add */ +/* #undef HAVE_ZFS_REFCOUNT_ADD */ + +/* Have zfs_refcount.h */ +/* #undef HAVE_ZFS_REFCOUNT_HEADER */ + +/* struct bio has __bi_cnt */ +#define HAVE___BI_CNT 1 + +/* if __ldiskfs_find_entry is available */ +/* #undef HAVE___LDISKFS_FIND_ENTRY */ + +/* function pde_data() available */ +#define HAVE_pde_data 1 + +/* ext4_journal_start takes 3 arguments */ +/* #undef JOURNAL_START_HAS_3ARGS */ + +/* Define this as the Kerberos version number */ +/* #undef KRB5_VERSION */ + +/* enable libcfs LASSERT, LASSERTF */ +#define LIBCFS_DEBUG 1 + +/* use dumplog on panic */ +/* #undef LNET_DUMP_ON_PANIC */ + +/* Define to the sub-directory where libtool stores uninstalled libraries. 
*/ +#define LT_OBJDIR ".libs/" + +/* Fourth number in the Lustre version */ +#define LUSTRE_FIX 0 + +/* First number in the Lustre version */ +#define LUSTRE_MAJOR 2 + +/* Second number in the Lustre version */ +#define LUSTRE_MINOR 15 + +/* Third number in the Lustre version */ +#define LUSTRE_PATCH 3 + +/* A copy of PACKAGE_VERSION */ +#define LUSTRE_VERSION_STRING "2.15.3_114_gb61b66c_dirty" + +/* maximum number of MDS threads */ +/* #undef MDS_MAX_THREADS */ + +/* Report minimum OST free space */ +/* #undef MIN_DF */ + +/* name of ldiskfs mkfs program */ +#define MKE2FS "mke2fs" + +/* 'ktime_get_ns' is not available */ +/* #undef NEED_KTIME_GET_NS */ + +/* 'ktime_get_real_ns' is not available */ +/* #undef NEED_KTIME_GET_REAL_NS */ + +/* lockdep_is_held() argument is const */ +/* #undef NEED_LOCKDEP_IS_HELD_DISCARD_CONST */ + +/* Name of package */ +#define PACKAGE "lustre" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "https://jira.whamcloud.com/" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "Lustre" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "Lustre 2.15.3_114_gb61b66c_dirty" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "lustre" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "2.15.3_114_gb61b66c_dirty" + +/* name of parallel fsck program */ +#define PFSCK "fsck" + +/* enable randomly alloc failure */ +#define RANDOM_FAIL_ALLOC 1 + +/* The size of `unsigned long long', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_LONG_LONG 8 + +/* use tunable backoff TCP */ +/* #undef SOCKNAL_BACKOFF */ + +/* tunable backoff TCP in ms */ +/* #undef SOCKNAL_BACKOFF_MS */ + +/* 'struct stacktrace_ops' address function returns an int */ +/* #undef STACKTRACE_OPS_ADDRESS_RETURN_INT */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* name of ldiskfs tune program */ +#define TUNE2FS "tune2fs" + +/* Define this if the private function, gss_krb5_cache_name, must be used to + tell the Kerberos library which credentials cache to use. 
Otherwise, this + is done by setting the KRB5CCNAME environment variable */ +/* #undef USE_GSS_KRB5_CCACHE_NAME */ + +/* Write when Checking Health */ +/* #undef USE_HEALTH_CHECK_WRITE */ + +/* Version number of package */ +#define VERSION "2.15.3_114_gb61b66c_dirty" + +/* vfs_setxattr() value argument is non-const */ +#define VFS_SETXATTR_VALUE(value) (value) + +/* zfs fix version */ +/* #undef ZFS_FIX */ + +/* zfs major version */ +/* #undef ZFS_MAJOR */ + +/* zfs minor version */ +/* #undef ZFS_MINOR */ + +/* zfs patch version */ +/* #undef ZFS_PATCH */ + +/* get_random_u32() is not available, use prandom_u32 */ +/* #undef get_random_u32 */ + +/* get_random_u32_below() is not available */ +/* #undef get_random_u32_below */ + +/* function pde_data() unavailable */ +/* #undef pde_data */ diff --git a/drivers/staging/lustrefsx/libcfs/Kconfig b/drivers/staging/lustrefsx/libcfs/Kconfig new file mode 100644 index 0000000000000..3675b8381af2e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/Kconfig @@ -0,0 +1,3 @@ +config LUSTREFSX_LIBCFS + depends on m + tristate "Lustre helper library" diff --git a/drivers/staging/lustrefsx/libcfs/Makefile b/drivers/staging/lustrefsx/libcfs/Makefile new file mode 100644 index 0000000000000..6c5ff83ac791a --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs/ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h new file mode 100644 index 0000000000000..b2bd5991632c7 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h @@ -0,0 +1,118 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#ifndef _LIBCFS_BITMAP_H_ +#define _LIBCFS_BITMAP_H_ + +#include +#include + +struct cfs_bitmap { + unsigned int size; + unsigned long data[0]; +}; + +#define CFS_BITMAP_SIZE(nbits) \ + (BITS_TO_LONGS(nbits) * sizeof(long) + sizeof(struct cfs_bitmap)) + +static inline +struct cfs_bitmap *CFS_ALLOCATE_BITMAP(int size) +{ + struct cfs_bitmap *ptr; + + LIBCFS_ALLOC(ptr, CFS_BITMAP_SIZE(size)); + if (ptr == NULL) + RETURN(ptr); + + ptr->size = size; + + RETURN(ptr); +} + +static inline void CFS_RESET_BITMAP(struct cfs_bitmap *bitmap) +{ + if (bitmap->size > 0) { + int nbits = bitmap->size; + + memset(bitmap, 0, CFS_BITMAP_SIZE(nbits)); + bitmap->size = nbits; + } +} + +#define CFS_FREE_BITMAP(ptr) LIBCFS_FREE(ptr, CFS_BITMAP_SIZE(ptr->size)) + +static inline +void cfs_bitmap_set(struct cfs_bitmap *bitmap, int nbit) +{ + set_bit(nbit, bitmap->data); +} + +static inline +void cfs_bitmap_clear(struct cfs_bitmap *bitmap, int nbit) +{ + test_and_clear_bit(nbit, bitmap->data); +} + +static inline +int cfs_bitmap_check(struct cfs_bitmap *bitmap, int nbit) +{ + return test_bit(nbit, bitmap->data); +} + +static inline +int cfs_bitmap_test_and_clear(struct cfs_bitmap *bitmap, int nbit) +{ + return test_and_clear_bit(nbit, bitmap->data); +} + +/* return 0 is bitmap has none set bits */ +static inline +int cfs_bitmap_check_empty(struct cfs_bitmap *bitmap) +{ + return find_first_bit(bitmap->data, bitmap->size) == bitmap->size; +} + +static inline +void cfs_bitmap_copy(struct cfs_bitmap *new, struct cfs_bitmap *old) +{ + size_t newsize; + + LASSERT(new->size >= old->size); + newsize = new->size; + memcpy(new, old, CFS_BITMAP_SIZE(old->size)); + new->size = newsize; +} + +#define cfs_foreach_bit(bitmap, pos) \ + for ((pos) = find_first_bit((bitmap)->data, bitmap->size); \ + (pos) < (bitmap)->size; \ + (pos) = find_next_bit((bitmap)->data, (bitmap)->size, (pos) + 1)) + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/crypto/llcrypt.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/crypto/llcrypt.h new file mode 100644 index 0000000000000..d05ff2af4200b --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/crypto/llcrypt.h @@ -0,0 +1,798 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * llcrypt.h: declarations for per-file encryption + * + * Filesystems that implement per-file encryption must include this header + * file. + * + * Copyright (C) 2015, Google, Inc. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ +#ifndef _LINUX_LLCRYPT_H +#define _LINUX_LLCRYPT_H + +#ifndef DCACHE_ENCRYPTED_NAME +#define DCACHE_ENCRYPTED_NAME 0x02000000 +#endif + +#include +#include +#include +#include +#include + +#define LL_CRYPTO_BLOCK_SIZE 16 + +struct llcrypt_ctx; +struct llcrypt_info; + +struct llcrypt_str { + unsigned char *name; + u32 len; +}; + +struct llcrypt_name { + const struct qstr *usr_fname; + struct llcrypt_str disk_name; + u32 hash; + u32 minor_hash; + struct llcrypt_str crypto_buf; + bool is_ciphertext_name; +}; + +#define LLTR_INIT(n, l) { .name = n, .len = l } +#define LLTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) +#define lname_name(p) ((p)->disk_name.name) +#define lname_len(p) ((p)->disk_name.len) + +/* Maximum value for the third parameter of llcrypt_operations.set_context(). 
*/ +#define LLCRYPT_SET_CONTEXT_MAX_SIZE 40 +#define LLCRYPT_DIGESTED_CHAR_OLD '_' +#define LLCRYPT_DIGESTED_CHAR '+' + +#ifdef CONFIG_LL_ENCRYPTION +/* + * llcrypt superblock flags + */ +#define LL_CFLG_OWN_PAGES (1U << 1) + +/* + * crypto operations for filesystems + */ +struct llcrypt_operations { + unsigned int flags; + const char *key_prefix; + int (*get_context)(struct inode *, void *, size_t); + int (*set_context)(struct inode *, const void *, size_t, void *); + bool (*dummy_context)(struct inode *); + bool (*empty_dir)(struct inode *); + unsigned int max_namelen; +}; + +/* Decryption work */ +struct llcrypt_ctx { + union { + struct { + struct bio *bio; + struct work_struct work; + }; + struct list_head free_list; /* Free list */ + }; + u8 flags; /* Flags */ +}; + +extern bool llcrypt_has_encryption_key(const struct inode *inode); + +static inline bool llcrypt_dummy_context_enabled(struct inode *inode) +{ + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + + if (unlikely(!lsi)) + return false; + + return lsi->lsi_cop->dummy_context && + lsi->lsi_cop->dummy_context(inode); +} + +/* + * When d_splice_alias() moves a directory's encrypted alias to its decrypted + * alias as a result of the encryption key being added, DCACHE_ENCRYPTED_NAME + * must be cleared. Note that we don't have to support arbitrary moves of this + * flag because llcrypt doesn't allow encrypted aliases to be the source or + * target of a rename(). + */ +static inline void llcrypt_handle_d_move(struct dentry *dentry) +{ + dentry->d_flags &= ~DCACHE_ENCRYPTED_NAME; +} + +/* crypto.c */ +extern int __init llcrypt_init(void); +extern void __exit llcrypt_exit(void); +extern void llcrypt_enqueue_decrypt_work(struct work_struct *); +extern struct llcrypt_ctx *llcrypt_get_ctx(gfp_t); +extern void llcrypt_release_ctx(struct llcrypt_ctx *); + +extern struct page *llcrypt_encrypt_pagecache_blocks(struct page *page, + unsigned int len, + unsigned int offs, + gfp_t gfp_flags); +extern int llcrypt_encrypt_block(const struct inode *inode, struct page *src, + struct page *dst, unsigned int len, + unsigned int offs, u64 lblk_num, gfp_t gfp_flags); + +extern int llcrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len, + unsigned int offs); + +extern int llcrypt_decrypt_block(const struct inode *inode, struct page *src, + struct page *dst, unsigned int len, + unsigned int offs, u64 lblk_num, gfp_t gfp_flags); + +static inline int llcrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num) +{ + return llcrypt_decrypt_block(inode, page, page, len, offs, lblk_num, + GFP_NOFS); +} + +static inline bool llcrypt_is_bounce_page(struct page *page) +{ + return page->mapping == NULL; +} + +static inline struct page *llcrypt_pagecache_page(struct page *bounce_page) +{ + return (struct page *)page_private(bounce_page); +} + +extern void llcrypt_free_bounce_page(struct page *bounce_page); + +/* policy.c */ +extern int llcrypt_ioctl_set_policy(struct file *, const void __user *); +extern int llcrypt_ioctl_get_policy(struct file *, void __user *); +extern int llcrypt_ioctl_get_policy_ex(struct file *, void __user *); +extern int llcrypt_has_permitted_context(struct inode *, struct inode *); +extern int llcrypt_inherit_context(struct inode *, struct inode *, + void *, bool); +extern bool llcrypt_policy_has_filename_enc(struct inode *inode); +/* keyring.c */ +extern void llcrypt_sb_free(struct super_block *sb); +extern int llcrypt_ioctl_add_key(struct file *filp, 
void __user *arg); +extern int llcrypt_ioctl_remove_key(struct file *filp, void __user *arg); +extern int llcrypt_ioctl_remove_key_all_users(struct file *filp, + void __user *arg); +extern int llcrypt_ioctl_get_key_status(struct file *filp, void __user *arg); + +/* keysetup.c */ +extern int llcrypt_get_encryption_info(struct inode *); +extern void llcrypt_put_encryption_info(struct inode *); +extern void llcrypt_free_inode(struct inode *); +extern int llcrypt_drop_inode(struct inode *inode); + +/* fname.c */ +extern int llcrypt_setup_filename(struct inode *, const struct qstr *, + int lookup, struct llcrypt_name *); + +static inline void llcrypt_free_filename(struct llcrypt_name *fname) +{ + kfree(fname->crypto_buf.name); +} + +extern int llcrypt_fname_alloc_buffer(const struct inode *, u32, + struct llcrypt_str *); +extern void llcrypt_fname_free_buffer(struct llcrypt_str *); +extern int llcrypt_fname_disk_to_usr(struct inode *, u32, u32, + const struct llcrypt_str *, struct llcrypt_str *); + +#define LLCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32 + +/* Extracts the second-to-last ciphertext block; see explanation below */ +#define LLCRYPT_FNAME_DIGEST(name, len) \ + ((name) + round_down((len) - LL_CRYPTO_BLOCK_SIZE - 1, \ + LL_CRYPTO_BLOCK_SIZE)) + +#define LLCRYPT_FNAME_DIGEST_SIZE LL_CRYPTO_BLOCK_SIZE + +/** + * llcrypt_digested_name - alternate identifier for an on-disk filename + * + * When userspace lists an encrypted directory without access to the key, + * filenames whose ciphertext is longer than LLCRYPT_FNAME_MAX_UNDIGESTED_SIZE + * bytes are shown in this abbreviated form (base64-encoded) rather than as the + * full ciphertext (base64-encoded). This is necessary to allow supporting + * filenames up to NAME_MAX bytes, since base64 encoding expands the length. + * + * To make it possible for filesystems to still find the correct directory entry + * despite not knowing the full on-disk name, we encode any filesystem-specific + * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups, + * followed by the second-to-last ciphertext block of the filename. Due to the + * use of the CBC-CTS encryption mode, the second-to-last ciphertext block + * depends on the full plaintext. (Note that ciphertext stealing causes the + * last two blocks to appear "flipped".) This makes accidental collisions very + * unlikely: just a 1 in 2^128 chance for two filenames to collide even if they + * share the same filesystem-specific hashes. + * + * However, this scheme isn't immune to intentional collisions, which can be + * created by anyone able to create arbitrary plaintext filenames and view them + * without the key. Making the "digest" be a real cryptographic hash like + * SHA-256 over the full ciphertext would prevent this, although it would be + * less efficient and harder to implement, especially since the filesystem would + * need to calculate it for each directory entry examined during a search. + */ +struct llcrypt_digested_name { + u32 hash; + u32 minor_hash; + u8 digest[LLCRYPT_FNAME_DIGEST_SIZE]; +}; + +/** + * llcrypt_match_name() - test whether the given name matches a directory entry + * @fname: the name being searched for + * @de_name: the name from the directory entry + * @de_name_len: the length of @de_name in bytes + * + * Normally @fname->disk_name will be set, and in that case we simply compare + * that to the name stored in the directory entry. 
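A minimal sketch of how a directory search might use this name-matching API, assuming hypothetical myfs_dirent/myfs_first_dirent/myfs_next_dirent helpers; only llcrypt_setup_filename(), llcrypt_match_name() and llcrypt_free_filename() come from this header.

/* Sketch only: the myfs_* names are hypothetical. */
static int myfs_find_entry(struct inode *dir, const struct qstr *name,
                           struct myfs_dirent **res)
{
        struct llcrypt_name fname;
        struct myfs_dirent *de;
        int rc;

        /* lookup=1: derives the on-disk name; without the key a long name
         * is represented by a struct llcrypt_digested_name instead. */
        rc = llcrypt_setup_filename(dir, name, 1, &fname);
        if (rc)
                return rc;

        for (de = myfs_first_dirent(dir); de; de = myfs_next_dirent(dir, de)) {
                if (llcrypt_match_name(&fname, de->de_name, de->de_name_len)) {
                        *res = de;
                        break;
                }
        }

        llcrypt_free_filename(&fname);
        return de ? 0 : -ENOENT;
}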
The only exception is that + * if we don't have the key for an encrypted directory and a filename in it is + * very long, then we won't have the full disk_name and we'll instead need to + * match against the llcrypt_digested_name. + * + * Return: %true if the name matches, otherwise %false. + */ +static inline bool llcrypt_match_name(const struct llcrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + if (unlikely(!fname->disk_name.name)) { + const struct llcrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_')) + return false; + if (de_name_len <= LLCRYPT_FNAME_MAX_UNDIGESTED_SIZE) + return false; + return !memcmp(LLCRYPT_FNAME_DIGEST(de_name, de_name_len), + n->digest, LLCRYPT_FNAME_DIGEST_SIZE); + } + + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + +/* hooks.c */ +extern int llcrypt_file_open(struct inode *inode, struct file *filp); +extern int __llcrypt_prepare_link(struct inode *inode, struct inode *dir, + struct dentry *dentry); +extern int __llcrypt_prepare_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags); +extern int __llcrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct llcrypt_name *fname); +extern int __llcrypt_prepare_symlink(struct inode *dir, unsigned int len, + unsigned int max_len, + struct llcrypt_str *disk_link); +extern int __llcrypt_encrypt_symlink(struct inode *inode, const char *target, + unsigned int len, + struct llcrypt_str *disk_link); +extern const char *llcrypt_get_symlink(struct inode *inode, const void *caddr, + unsigned int max_size, + struct delayed_call *done); +static inline void llcrypt_set_ops(struct super_block *sb, + const struct llcrypt_operations *lsi_cop) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + if (lsi) + lsi->lsi_cop = lsi_cop; +} +#else /* !CONFIG_LL_ENCRYPTION */ + +struct llcrypt_operations; +#define llcrypt_init() 0 +#define llcrypt_exit() {} + +#undef IS_ENCRYPTED +#define IS_ENCRYPTED(x) 0 + +static inline bool llcrypt_has_encryption_key(const struct inode *inode) +{ + return false; +} + +static inline bool llcrypt_dummy_context_enabled(struct inode *inode) +{ + return false; +} + +static inline void llcrypt_handle_d_move(struct dentry *dentry) +{ +} + +/* crypto.c */ +static inline void llcrypt_enqueue_decrypt_work(struct work_struct *work) +{ +} + +static inline struct llcrypt_ctx *llcrypt_get_ctx(gfp_t gfp_flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void llcrypt_release_ctx(struct llcrypt_ctx *ctx) +{ + return; +} + +static inline struct page *llcrypt_encrypt_pagecache_blocks(struct page *page, + unsigned int len, + unsigned int offs, + gfp_t gfp_flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int llcrypt_encrypt_block(const struct inode *inode, + struct page *src, struct page *dst, + unsigned int len, unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_decrypt_pagecache_blocks(struct page *page, + unsigned int len, + unsigned int offs) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_decrypt_block(const struct inode *inode, + struct page *src, struct page *dst, + unsigned int len, unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, + unsigned int len, 
+ unsigned int offs, u64 lblk_num) +{ + return -EOPNOTSUPP; +} + +static inline bool llcrypt_is_bounce_page(struct page *page) +{ + return false; +} + +static inline struct page *llcrypt_pagecache_page(struct page *bounce_page) +{ + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +} + +static inline void llcrypt_free_bounce_page(struct page *bounce_page) +{ +} + +/* policy.c */ +static inline int llcrypt_ioctl_set_policy(struct file *filp, + const void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_ioctl_get_policy(struct file *filp, void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_ioctl_get_policy_ex(struct file *filp, + void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_has_permitted_context(struct inode *parent, + struct inode *child) +{ + return 0; +} + +static inline int llcrypt_inherit_context(struct inode *parent, + struct inode *child, + void *fs_data, bool preload) +{ + return -EOPNOTSUPP; +} +static inline bool llcrypt_policy_has_filename_enc(struct inode *inode) +{ + return false; +} + +/* keyring.c */ +static inline void llcrypt_sb_free(struct super_block *sb) +{ +} + +static inline int llcrypt_ioctl_add_key(struct file *filp, void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_ioctl_remove_key(struct file *filp, void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_ioctl_remove_key_all_users(struct file *filp, + void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_ioctl_get_key_status(struct file *filp, + void __user *arg) +{ + return -EOPNOTSUPP; +} + +/* keysetup.c */ +static inline int llcrypt_get_encryption_info(struct inode *inode) +{ + return -EOPNOTSUPP; +} + +static inline void llcrypt_put_encryption_info(struct inode *inode) +{ + return; +} + +static inline void llcrypt_free_inode(struct inode *inode) +{ +} + +static inline int llcrypt_drop_inode(struct inode *inode) +{ + return 0; +} + + /* fname.c */ +static inline int llcrypt_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct llcrypt_name *fname) +{ + if (IS_ENCRYPTED(dir)) + return -EOPNOTSUPP; + + memset(fname, 0, sizeof(*fname)); + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; +} + +static inline void llcrypt_free_filename(struct llcrypt_name *fname) +{ + return; +} + +static inline int llcrypt_fname_alloc_buffer(const struct inode *inode, + u32 max_encrypted_len, + struct llcrypt_str *crypto_str) +{ + return -EOPNOTSUPP; +} + +static inline void llcrypt_fname_free_buffer(struct llcrypt_str *crypto_str) +{ + return; +} + +static inline int llcrypt_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct llcrypt_str *iname, + struct llcrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +static inline bool llcrypt_match_name(const struct llcrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + /* Encryption support disabled; use standard comparison */ + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + +/* hooks.c */ + +static inline int llcrypt_file_open(struct inode *inode, struct file *filp) +{ + if (IS_ENCRYPTED(inode)) + return -EOPNOTSUPP; + return 0; +} + +static inline int __llcrypt_prepare_link(struct inode *inode, struct inode *dir, + struct dentry *dentry) +{ + return -EOPNOTSUPP; +} + +static inline int __llcrypt_prepare_rename(struct inode 
*old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags) +{ + return -EOPNOTSUPP; +} + +static inline int __llcrypt_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct llcrypt_name *fname) +{ + return -EOPNOTSUPP; +} + +static inline int __llcrypt_prepare_symlink(struct inode *dir, + unsigned int len, + unsigned int max_len, + struct llcrypt_str *disk_link) +{ + return -EOPNOTSUPP; +} + + +static inline int __llcrypt_encrypt_symlink(struct inode *inode, + const char *target, + unsigned int len, + struct llcrypt_str *disk_link) +{ + return -EOPNOTSUPP; +} + +#define llcrypt_get_symlink(inode, caddr, max_size, done) ERR_PTR(-EOPNOTSUPP) + +static inline void llcrypt_set_ops(struct super_block *sb, + const struct llcrypt_operations *lsi_cop) +{ +} + +#endif /* !CONFIG_LL_ENCRYPTION */ + +/** + * llcrypt_require_key - require an inode's encryption key + * @inode: the inode we need the key for + * + * If the inode is encrypted, set up its encryption key if not already done. + * Then require that the key be present and return -ENOKEY otherwise. + * + * No locks are needed, and the key will live as long as the struct inode --- so + * it won't go away from under you. + * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + * if a problem occurred while setting up the encryption key. + */ +static inline int llcrypt_require_key(struct inode *inode) +{ + if (IS_ENCRYPTED(inode)) { + int err = llcrypt_get_encryption_info(inode); + + if (err) + return err; + if (!llcrypt_has_encryption_key(inode)) + return -ENOKEY; + } + return 0; +} + +/** + * llcrypt_prepare_link - prepare to link an inode into a possibly-encrypted directory + * @old_dentry: an existing dentry for the inode being linked + * @dir: the target directory + * @dentry: negative dentry for the target filename + * + * A new link can only be added to an encrypted directory if the directory's + * encryption key is available --- since otherwise we'd have no way to encrypt + * the filename. Therefore, we first set up the directory's encryption key (if + * not already done) and return an error if it's unavailable. + * + * We also verify that the link will not violate the constraint that all files + * in an encrypted directory tree use the same encryption policy. + * + * Return: 0 on success, -ENOKEY if the directory's encryption key is missing, + * -EXDEV if the link would result in an inconsistent encryption policy, or + * another -errno code. + */ +static inline int llcrypt_prepare_link(struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + if (IS_ENCRYPTED(dir)) + return __llcrypt_prepare_link(d_inode(old_dentry), dir, dentry); + return 0; +} + +/** + * llcrypt_prepare_rename - prepare for a rename between possibly-encrypted directories + * @old_dir: source directory + * @old_dentry: dentry for source file + * @new_dir: target directory + * @new_dentry: dentry for target location (may be negative unless exchanging) + * @flags: rename flags (we care at least about %RENAME_EXCHANGE) + * + * Prepare for ->rename() where the source and/or target directories may be + * encrypted. A new link can only be added to an encrypted directory if the + * directory's encryption key is available --- since otherwise we'd have no way + * to encrypt the filename. A rename to an existing name, on the other hand, + * *is* cryptographically possible without the key. 
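A minimal sketch of a link hook built on llcrypt_prepare_link(), assuming a hypothetical myfs_add_entry() and the classic (simplified) ->link signature; the llcrypt helper is the inline defined above.

/* Sketch only: myfs_add_entry() is hypothetical. */
static int myfs_link(struct dentry *old_dentry, struct inode *dir,
                     struct dentry *dentry)
{
        int rc;

        /* -ENOKEY if @dir is encrypted without its key available,
         * -EXDEV if the link would mix encryption policies. */
        rc = llcrypt_prepare_link(old_dentry, dir, dentry);
        if (rc)
                return rc;

        return myfs_add_entry(dir, dentry, d_inode(old_dentry));
}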
However, we take the more + * conservative approach and just forbid all no-key renames. + * + * We also verify that the rename will not violate the constraint that all files + * in an encrypted directory tree use the same encryption policy. + * + * Return: 0 on success, -ENOKEY if an encryption key is missing, -EXDEV if the + * rename would cause inconsistent encryption policies, or another -errno code. + */ +static inline int llcrypt_prepare_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags) +{ + if (IS_ENCRYPTED(old_dir) || IS_ENCRYPTED(new_dir)) + return __llcrypt_prepare_rename(old_dir, old_dentry, + new_dir, new_dentry, flags); + return 0; +} + +/** + * llcrypt_prepare_lookup - prepare to lookup a name in a possibly-encrypted directory + * @dir: directory being searched + * @dentry: filename being looked up + * @fname: (output) the name to use to search the on-disk directory + * + * Prepare for ->lookup() in a directory which may be encrypted by determining + * the name that will actually be used to search the directory on-disk. Lookups + * can be done with or without the directory's encryption key; without the key, + * filenames are presented in encrypted form. Therefore, we'll try to set up + * the directory's encryption key, but even without it the lookup can continue. + * + * This also installs a custom ->d_revalidate() method which will invalidate the + * dentry if it was created without the key and the key is later added. + * + * Return: 0 on success; -ENOENT if key is unavailable but the filename isn't a + * correctly formed encoded ciphertext name, so a negative dentry should be + * created; or another -errno code. + */ +static inline int llcrypt_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct llcrypt_name *fname) +{ + if (IS_ENCRYPTED(dir)) + return __llcrypt_prepare_lookup(dir, dentry, fname); + + memset(fname, 0, sizeof(*fname)); + fname->usr_fname = &dentry->d_name; + fname->disk_name.name = (unsigned char *)dentry->d_name.name; + fname->disk_name.len = dentry->d_name.len; + return 0; +} + +/** + * llcrypt_prepare_setattr - prepare to change a possibly-encrypted inode's attributes + * @dentry: dentry through which the inode is being changed + * @attr: attributes to change + * + * Prepare for ->setattr() on a possibly-encrypted inode. On an encrypted file, + * most attribute changes are allowed even without the encryption key. However, + * without the encryption key we do have to forbid truncates. This is needed + * because the size being truncated to may not be a multiple of the filesystem + * block size, and in that case we'd have to decrypt the final block, zero the + * portion past i_size, and re-encrypt it. (We *could* allow truncating to a + * filesystem block boundary, but it's simpler to just forbid all truncates --- + * and we already forbid all other contents modifications without the key.) + * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + * if a problem occurred while setting up the encryption key. 
+ */ +static inline int llcrypt_prepare_setattr(struct dentry *dentry, + struct iattr *attr) +{ + if (attr->ia_valid & ATTR_SIZE) + return llcrypt_require_key(d_inode(dentry)); + return 0; +} + +/** + * llcrypt_prepare_symlink - prepare to create a possibly-encrypted symlink + * @dir: directory in which the symlink is being created + * @target: plaintext symlink target + * @len: length of @target excluding null terminator + * @max_len: space the filesystem has available to store the symlink target + * @disk_link: (out) the on-disk symlink target being prepared + * + * This function computes the size the symlink target will require on-disk, + * stores it in @disk_link->len, and validates it against @max_len. An + * encrypted symlink may be longer than the original. + * + * Additionally, @disk_link->name is set to @target if the symlink will be + * unencrypted, but left NULL if the symlink will be encrypted. For encrypted + * symlinks, the filesystem must call llcrypt_encrypt_symlink() to create the + * on-disk target later. (The reason for the two-step process is that some + * filesystems need to know the size of the symlink target before creating the + * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.) + * + * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long, + * -ENOKEY if the encryption key is missing, or another -errno code if a problem + * occurred while setting up the encryption key. + */ +static inline int llcrypt_prepare_symlink(struct inode *dir, + const char *target, + unsigned int len, + unsigned int max_len, + struct llcrypt_str *disk_link) +{ + if ((IS_ENCRYPTED(dir) || llcrypt_dummy_context_enabled(dir)) && + llcrypt_policy_has_filename_enc(dir)) + return __llcrypt_prepare_symlink(dir, len, max_len, disk_link); + + disk_link->name = (unsigned char *)target; + disk_link->len = len + 1; + if (disk_link->len > max_len) + return -ENAMETOOLONG; + return 0; +} + +/** + * llcrypt_encrypt_symlink - encrypt the symlink target if needed + * @inode: symlink inode + * @target: plaintext symlink target + * @len: length of @target excluding null terminator + * @disk_link: (in/out) the on-disk symlink target being prepared + * + * If the symlink target needs to be encrypted, then this function encrypts it + * into @disk_link->name. llcrypt_prepare_symlink() must have been called + * previously to compute @disk_link->len. If the filesystem did not allocate a + * buffer for @disk_link->name after calling llcrypt_prepare_link(), then one + * will be kmalloc()'ed and the filesystem will be responsible for freeing it. 
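A minimal sketch of the two-step symlink flow described above, assuming hypothetical myfs_new_inode()/myfs_write_symlink() helpers and a MYFS_MAX_SYMLINK limit; only the llcrypt_* calls come from this header.

/* Sketch only: myfs_* names and MYFS_MAX_SYMLINK are hypothetical. */
static int myfs_symlink(struct inode *dir, struct dentry *dentry,
                        const char *target)
{
        unsigned int len = strlen(target);
        struct llcrypt_str disk_link;
        struct inode *inode;
        int rc;

        /* Step 1: size/validate the on-disk target; may return -ENOKEY. */
        rc = llcrypt_prepare_symlink(dir, target, len, MYFS_MAX_SYMLINK,
                                     &disk_link);
        if (rc)
                return rc;

        inode = myfs_new_inode(dir, S_IFLNK | 0777);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        /* Step 2: produce the on-disk target; a no-op when unencrypted. */
        rc = llcrypt_encrypt_symlink(inode, target, len, &disk_link);
        if (!rc)
                rc = myfs_write_symlink(inode, disk_link.name, disk_link.len);
        if (rc)
                iput(inode);

        /* If llcrypt kmalloc()'ed the buffer, the filesystem frees it. */
        if (disk_link.name != (unsigned char *)target)
                kfree(disk_link.name);
        return rc;
}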
+ * + * Return: 0 on success, -errno on failure + */ +static inline int llcrypt_encrypt_symlink(struct inode *inode, + const char *target, + unsigned int len, + struct llcrypt_str *disk_link) +{ + if (IS_ENCRYPTED(inode)) + return __llcrypt_encrypt_symlink(inode, target, len, disk_link); + return 0; +} + +/* If *pagep is a bounce page, free it and set *pagep to the pagecache page */ +static inline void llcrypt_finalize_bounce_page(struct page **pagep) +{ + struct page *page = *pagep; + + if (llcrypt_is_bounce_page(page)) { + *pagep = llcrypt_pagecache_page(page); + llcrypt_free_bounce_page(page); + } +} + +#endif /* _LINUX_LLCRYPT_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h new file mode 100644 index 0000000000000..79ba6089c3664 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h @@ -0,0 +1,143 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LIBCFS_LIBCFS_H__ +#define __LIBCFS_LIBCFS_H__ + +#include +#include +#include +#include +#ifdef HAVE_SCHED_HEADERS +#include +#endif + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LIBCFS_VERSION "0.7.1" + +/* Sparse annotations */ +#if !defined(__must_hold) +# ifdef __CHECKER__ +# define __must_hold(x) __attribute__((context(x, 1, 1))) +# else /* __CHECKER__ */ +# define __must_hold(x) +# endif /* !__CHECKER__ */ +#endif /* !__must_hold */ + +typedef s32 timeout_t; + +/* need both kernel and user-land acceptor */ +#define LNET_ACCEPTOR_MIN_RESERVED_PORT 512 +#define LNET_ACCEPTOR_MAX_RESERVED_PORT 1023 + +extern struct blocking_notifier_head libcfs_ioctl_list; +static inline int notifier_from_ioctl_errno(int err) +{ + if (err == -EINVAL) + return NOTIFY_OK; + return notifier_from_errno(err) | NOTIFY_STOP_MASK; +} + +int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); + +extern struct workqueue_struct *cfs_rehash_wq; + +void lnet_insert_debugfs(struct ctl_table *table); +void lnet_remove_debugfs(struct ctl_table *table); + +/* helper for sysctl handlers */ +int debugfs_doint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + +/* + * Memory + */ +#if BITS_PER_LONG == 32 +/* limit to lowmem on 32-bit systems */ +#define NUM_CACHEPAGES \ + min(cfs_totalram_pages(), 1UL << (30 - PAGE_SHIFT) * 3 / 4) +#else +#define NUM_CACHEPAGES cfs_totalram_pages() +#endif + +#define wait_var_event_warning(var, condition, format, ...) \ +do { \ + int counter = 4; \ + might_sleep(); \ + if (condition) \ + break; \ + ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + if (schedule_timeout(cfs_time_seconds(1)) == 0)\ + CDEBUG(is_power_of_2(counter++) ? \ + D_WARNING : D_NET, \ + format, ## __VA_ARGS__) \ + ); \ +} while (0) + +/* atomic-context safe vfree */ +void libcfs_vfree_atomic(const void *addr); + +/* interval tree */ + +#ifdef HAVE_INTERVAL_TREE_CACHED +#define interval_tree_root rb_root_cached +#define interval_tree_first rb_first_cached +#define INTERVAL_TREE_ROOT RB_ROOT_CACHED +#else +#define interval_tree_root rb_root +#define interval_tree_first rb_first +#define INTERVAL_TREE_ROOT RB_ROOT +#endif /* HAVE_INTERVAL_TREE_CACHED */ + +#ifndef unsafe_memcpy +#define unsafe_memcpy(to, from, size, reason) memcpy((to), (from), (size)) +#endif + +#define FLEXIBLE_OBJECT \ + "Struct contains a flexible member, the size of object is checked" \ + "and can be safely copied in a single memcpy()" + +#endif /* _LIBCFS_LIBCFS_H_ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h new file mode 100644 index 0000000000000..cb2539e426255 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h @@ -0,0 +1,462 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/libcfs_cpu.h + * + * CPU partition + * . CPU partition is virtual processing unit + * + * . CPU partition can present 1-N cores, or 1-N NUMA nodes, + * in other words, CPU partition is a processors pool. + * + * CPU Partition Table (CPT) + * . a set of CPU partitions + * + * . There are two modes for CPT: CFS_CPU_MODE_NUMA and CFS_CPU_MODE_SMP + * + * . User can specify total number of CPU partitions while creating a + * CPT, ID of CPU partition is always start from 0. + * + * Example: if there are 8 cores on the system, while creating a CPT + * with cpu_npartitions=4: + * core[0, 1] = partition[0], core[2, 3] = partition[1] + * core[4, 5] = partition[2], core[6, 7] = partition[3] + * + * cpu_npartitions=1: + * core[0, 1, ... 7] = partition[0] + * + * . User can also specify CPU partitions by string pattern + * + * Examples: cpu_partitions="0[0,1], 1[2,3]" + * cpu_partitions="N 0[0-3], 1[4-8]" + * + * The first character "N" means following numbers are numa ID + * + * . NUMA allocators, CPU affinity threads are built over CPU partitions, + * instead of HW CPUs or HW nodes. + * + * . By default, Lustre modules should refer to the global cfs_cpt_tab, + * instead of accessing HW CPUs directly, so concurrency of Lustre can be + * configured by cpu_npartitions of the global cfs_cpt_tab + * + * . If cpu_npartitions=1(all CPUs in one pool), lustre should work the + * same way as 2.2 or earlier versions + * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_CPU_H__ +#define __LIBCFS_CPU_H__ + +#include +#include +#include +#include +#include +#include + +#include + +/* any CPU partition */ +#define CFS_CPT_ANY (-1) + +struct cfs_cpt_table; + +#ifdef CONFIG_SMP +extern struct cfs_cpt_table *cfs_cpt_tab; + +/** + * destroy a CPU partition table + */ +void cfs_cpt_table_free(struct cfs_cpt_table *cptab); +/** + * create a cfs_cpt_table with \a ncpt number of partitions + */ +struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt); +/** + * print string information of cpt-table + */ +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len); +/** + * print distance information of cpt-table + */ +int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len); +/** + * return total number of CPU partitions in \a cptab + */ +int cfs_cpt_number(struct cfs_cpt_table *cptab); +/** + * return number of HW cores or hyper-threadings in a CPU partition \a cpt + */ +int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt); +/** + * is there any online CPU in CPU partition \a cpt + */ +int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt); +/** + * return cpumask of CPU partition \a cpt + */ +cpumask_var_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt); +/** + * return nodemask of CPU partition \a cpt + */ +nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt); +/** + * shadow current HW processor ID to CPU-partition ID of \a cptab + */ +int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap); +/** + * shadow HW processor ID \a CPU to CPU-partition ID by \a cptab + */ +int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu); +/** + * shadow HW node ID \a NODE to 
CPU-partition ID by \a cptab + */ +int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node); +/** + * NUMA distance between \a cpt1 and \a cpt2 in \a cptab + */ +unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2); +/** + * bind current thread on a CPU-partition \a cpt of \a cptab + */ +int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt); +/** + * add \a cpu to CPU partition @cpt of \a cptab, return 1 for success, + * otherwise 0 is returned + */ +int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); +/** + * remove \a cpu from CPU partition \a cpt of \a cptab + */ +void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); +/** + * add all cpus in \a mask to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask); +/** + * remove all cpus in \a mask from CPU partition \a cpt + */ +void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask); +/** + * add all cpus in NUMA node \a node to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node); +/** + * remove all cpus in NUMA node \a node from CPU partition \a cpt + */ +void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node); +/** + * add all cpus in node mask \a mask to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask); +/** + * remove all cpus in node mask \a mask from CPU partition \a cpt + */ +void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask); +/** + * convert partition id \a cpt to numa node id, if there are more than one + * nodes in this partition, it might return a different node id each time. 
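A minimal sketch of how a service might consume the CONFIG_SMP API declared above against the global cfs_cpt_tab; the myservice_* names are hypothetical.

/* Sketch only: myservice_* names are hypothetical. */
static void myservice_dump_cpts(void)
{
        int ncpt = cfs_cpt_number(cfs_cpt_tab);
        int cpt;

        for (cpt = 0; cpt < ncpt; cpt++)
                pr_info("cpt %d/%d: %d HW threads, online=%d\n",
                        cpt, ncpt, cfs_cpt_weight(cfs_cpt_tab, cpt),
                        cfs_cpt_online(cfs_cpt_tab, cpt));
}

/* A per-partition worker pins itself to its partition's CPUs. */
static int myservice_worker(void *arg)
{
        int cpt = (long)arg;

        cfs_cpt_bind(cfs_cpt_tab, cpt);
        /* ... service loop; cfs_cpt_current() should now report @cpt ... */
        return 0;
}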
+ */ +int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt); + +int cfs_cpu_init(void); +void cfs_cpu_fini(void); + +#else /* !CONFIG_SMP */ + +#define cfs_cpt_tab ((struct cfs_cpt_table *)NULL) + +static inline void cfs_cpt_table_free(struct cfs_cpt_table *cptab) +{ +} + +static inline struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt) +{ + return NULL; +} + +static inline int cfs_cpt_table_print(struct cfs_cpt_table *cptab, + char *buf, int len) +{ + int rc; + + rc = snprintf(buf, len, "0\t: 0\n"); + len -= rc; + if (len <= 0) + return -EFBIG; + + return rc; +} + +static inline int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, + char *buf, int len) +{ + int rc; + + rc = snprintf(buf, len, "0\t: 0:1\n"); + len -= rc; + if (len <= 0) + return -EFBIG; + + return rc; +} + +static inline cpumask_var_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, + int cpt) +{ + return (cpumask_var_t *) cpu_online_mask; +} + +static inline int cfs_cpt_number(struct cfs_cpt_table *cptab) +{ + return 1; +} + +static inline int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) +{ + return 1; +} + +static inline nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, + int cpt) +{ + return &node_online_map; +} + +static inline unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, + int cpt1, int cpt2) +{ + return 1; +} + +static inline int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, + int node) +{ + return 1; +} + +static inline int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) +{ + return 0; +} + +static inline int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) +{ + return 0; +} + +static inline int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node) +{ + return 0; +} + +static inline int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) +{ + return 0; +} + +static inline int cfs_cpu_init(void) +{ + return 0; +} + +static inline void cfs_cpu_fini(void) +{ +} + +#endif /* CONFIG_SMP */ + +static inline +struct workqueue_struct *cfs_cpt_bind_workqueue(const char *wq_name, + struct cfs_cpt_table *tbl, + int flags, int cpt, int nthrs) +{ + cpumask_var_t *mask = cfs_cpt_cpumask(tbl, cpt); + struct workqueue_attrs attrs = { }; + struct workqueue_struct *wq; + + wq = alloc_workqueue(wq_name, WQ_UNBOUND | flags, nthrs); + if (!wq) + return ERR_PTR(-ENOMEM); + + if (mask && alloc_cpumask_var(&attrs.cpumask, GFP_KERNEL)) { + cpumask_copy(attrs.cpumask, *mask); + cpus_read_lock(); + cfs_apply_workqueue_attrs(wq, &attrs); + cpus_read_unlock(); + free_cpumask_var(attrs.cpumask); + } + + return wq; +} + +/* + * allocate per-cpu-partition data, returned value is an array of pointers, + * variable can be indexed by CPU ID. + * cptab != NULL: size of array is number of CPU partitions + * cptab == NULL: size of array is number of HW cores + */ +void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size); +/* + * destroy per-cpu-partition variable + */ +void cfs_percpt_free(void *vars); +int cfs_percpt_number(void *vars); + +#define cfs_percpt_for_each(var, i, vars) \ + for (i = 0; i < cfs_percpt_number(vars) && \ + ((var) = (vars)[i]) != NULL; i++) + +/* + * percpu partition lock + * + * There are some use-cases like this in Lustre: + * . each CPU partition has it's own private data which is frequently changed, + * and mostly by the local CPU partition. + * . all CPU partitions share some global data, these data are rarely changed. + * + * LNet is typical example. + * CPU partition lock is designed for this kind of use-cases: + * . 
each CPU partition has it's own private lock + * . change on private data just needs to take the private lock + * . read on shared data just needs to take _any_ of private locks + * . change on shared data needs to take _all_ private locks, + * which is slow and should be really rare. + */ +enum { + CFS_PERCPT_LOCK_EX = -1, /* negative */ +}; + +struct cfs_percpt_lock { + /* cpu-partition-table for this lock */ + struct cfs_cpt_table *pcl_cptab; + /* exclusively locked */ + unsigned int pcl_locked; + /* private lock table */ + spinlock_t **pcl_locks; +}; + +/* return number of private locks */ +#define cfs_percpt_lock_num(pcl) cfs_cpt_number(pcl->pcl_cptab) + +/* + * create a cpu-partition lock based on CPU partition table \a cptab, + * each private lock has extra \a psize bytes padding data + */ +struct cfs_percpt_lock *cfs_percpt_lock_create(struct cfs_cpt_table *cptab, + struct lock_class_key *keys); +/* destroy a cpu-partition lock */ +void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl); + +/* lock private lock \a index of \a pcl */ +void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index); + +/* unlock private lock \a index of \a pcl */ +void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index); + +#define CFS_PERCPT_LOCK_KEYS 256 + +/* NB: don't allocate keys dynamically, lockdep needs them to be in ".data" */ +#define cfs_percpt_lock_alloc(cptab) \ +({ \ + static struct lock_class_key ___keys[CFS_PERCPT_LOCK_KEYS]; \ + struct cfs_percpt_lock *___lk; \ + \ + if (cfs_cpt_number(cptab) > CFS_PERCPT_LOCK_KEYS) \ + ___lk = cfs_percpt_lock_create(cptab, NULL); \ + else \ + ___lk = cfs_percpt_lock_create(cptab, ___keys); \ + ___lk; \ +}) + +/** + * allocate \a nr_bytes of physical memory from a contiguous region with the + * properties of \a flags which are bound to the partition id \a cpt. This + * function should only be used for the case when only a few pages of memory + * are need. + */ +static inline void * +cfs_cpt_malloc(struct cfs_cpt_table *cptab, int cpt, size_t nr_bytes, + gfp_t flags) +{ + return kmalloc_node(nr_bytes, flags, + cfs_cpt_spread_node(cptab, cpt)); +} + +/** + * allocate \a nr_bytes of virtually contiguous memory that is bound to the + * partition id \a cpt. + */ +static inline void * +cfs_cpt_vzalloc(struct cfs_cpt_table *cptab, int cpt, size_t nr_bytes) +{ + /* vzalloc_node() sets __GFP_FS by default but no current Kernel + * exported entry-point allows for both a NUMA node specification + * and a custom allocation flags mask. This may be an issue since + * __GFP_FS usage can cause some deadlock situations in our code, + * like when memory reclaim started, within the same context of a + * thread doing FS operations, that can also attempt conflicting FS + * operations, ... + */ + return vzalloc_node(nr_bytes, cfs_cpt_spread_node(cptab, cpt)); +} + +/** + * allocate a single page of memory with the properties of \a flags were + * that page is bound to the partition id \a cpt. + */ +static inline struct page * +cfs_page_cpt_alloc(struct cfs_cpt_table *cptab, int cpt, gfp_t flags) +{ + return alloc_pages_node(cfs_cpt_spread_node(cptab, cpt), flags, 0); +} + +/** + * allocate a chunck of memory from a memory pool that is bound to the + * partition id \a cpt with the properites of \a flags. 
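A minimal sketch of the per-CPT data plus percpu-partition-lock pattern described above: the counter type and names are hypothetical, and cfs_percpt_lock_alloc() is assumed to return NULL on allocation failure; the cfs_percpt_* and cfs_cpt_* calls are the ones declared in this header.

/* Sketch only: my_* names are hypothetical. */
struct my_counter {
        __u64 mc_events;
};

static struct my_counter **my_counters;  /* one per CPU partition */
static struct cfs_percpt_lock *my_lock;

static int my_stats_init(void)
{
        my_counters = cfs_percpt_alloc(cfs_cpt_tab, sizeof(**my_counters));
        if (!my_counters)
                return -ENOMEM;

        my_lock = cfs_percpt_lock_alloc(cfs_cpt_tab);
        if (!my_lock) {             /* assumed failure convention */
                cfs_percpt_free(my_counters);
                return -ENOMEM;
        }
        return 0;
}

/* Fast path: touch only the local partition's data under its private lock. */
static void my_stats_inc(void)
{
        int cpt = cfs_cpt_current(cfs_cpt_tab, 1);

        cfs_percpt_lock(my_lock, cpt);
        my_counters[cpt]->mc_events++;
        cfs_percpt_unlock(my_lock, cpt);
}

/* Slow path: take every private lock for a consistent global snapshot. */
static __u64 my_stats_sum(void)
{
        struct my_counter *c;
        __u64 total = 0;
        int i;

        cfs_percpt_lock(my_lock, CFS_PERCPT_LOCK_EX);
        cfs_percpt_for_each(c, i, my_counters)
                total += c->mc_events;
        cfs_percpt_unlock(my_lock, CFS_PERCPT_LOCK_EX);
        return total;
}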
+ */ +static inline void * +cfs_mem_cache_cpt_alloc(struct kmem_cache *cachep, struct cfs_cpt_table *cptab, + int cpt, gfp_t flags) +{ + return kmem_cache_alloc_node(cachep, flags, + cfs_cpt_spread_node(cptab, cpt)); +} + +/** + * iterate over all CPU partitions in \a cptab + */ +#define cfs_cpt_for_each(i, cptab) \ + for (i = 0; i < cfs_cpt_number(cptab); i++) + +#endif /* __LIBCFS_CPU_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h new file mode 100644 index 0000000000000..f271676ff4948 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h @@ -0,0 +1,319 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Copyright (c) 2014, Intel Corporation. + */ + +#ifndef _LIBCFS_CRYPTO_H +#define _LIBCFS_CRYPTO_H + +struct cfs_crypto_hash_type { + char *cht_name; /**< hash algorithm name, equal to + * format name for crypto api */ + unsigned int cht_key; /**< init key by default (vaild for + * 4 bytes context like crc32, adler */ + unsigned int cht_size; /**< hash digest size */ +}; + +struct cfs_crypto_crypt_type { + char *cct_name; /**< crypto algorithm name, equal to + * format name for crypto api */ + unsigned int cct_size; /**< crypto key size */ +}; + +enum cfs_crypto_hash_alg { + CFS_HASH_ALG_NULL = 0, + CFS_HASH_ALG_ADLER32, + CFS_HASH_ALG_CRC32, + CFS_HASH_ALG_CRC32C, + /* hashes before here will be speed-tested at module load */ + CFS_HASH_ALG_MD5, + CFS_HASH_ALG_SHA1, + CFS_HASH_ALG_SHA256, + CFS_HASH_ALG_SHA384, + CFS_HASH_ALG_SHA512, + CFS_HASH_ALG_MAX, + CFS_HASH_ALG_SPEED_MAX = CFS_HASH_ALG_MD5, + CFS_HASH_ALG_UNKNOWN = 0xff +}; + +enum cfs_crypto_crypt_alg { + CFS_CRYPT_ALG_NULL = 0, + CFS_CRYPT_ALG_AES256_CTR, + CFS_CRYPT_ALG_MAX, + CFS_CRYPT_ALG_UNKNOWN = 0xff +}; + +static struct cfs_crypto_hash_type hash_types[] = { + [CFS_HASH_ALG_NULL] = { + .cht_name = "null", + .cht_key = 0, + .cht_size = 0 + }, + [CFS_HASH_ALG_ADLER32] = { + .cht_name = "adler32", + .cht_key = 1, + .cht_size = 4 + }, + [CFS_HASH_ALG_CRC32] = { + .cht_name = "crc32", + .cht_key = ~0, + .cht_size = 4 + }, + [CFS_HASH_ALG_CRC32C] = { + .cht_name = "crc32c", + .cht_key = ~0, + .cht_size = 4 + }, + [CFS_HASH_ALG_MD5] = { + .cht_name = "md5", + .cht_key = 0, + .cht_size = 16 + }, + [CFS_HASH_ALG_SHA1] = { + .cht_name = "sha1", + .cht_key = 0, + .cht_size = 20 + }, + [CFS_HASH_ALG_SHA256] = { + .cht_name = "sha256", + .cht_key = 0, + .cht_size = 32 + }, + [CFS_HASH_ALG_SHA384] = { + .cht_name = "sha384", + .cht_key = 0, + .cht_size = 48 + }, + [CFS_HASH_ALG_SHA512] = { + 
.cht_name = "sha512", + .cht_key = 0, + .cht_size = 64 + }, + [CFS_HASH_ALG_MAX] = { + .cht_name = NULL, + .cht_key = 0, + .cht_size = 64 + } +}; + +static struct cfs_crypto_crypt_type crypt_types[] = { + [CFS_CRYPT_ALG_NULL] = { + .cct_name = "null", + .cct_size = 0 + }, + [CFS_CRYPT_ALG_AES256_CTR] = { + .cct_name = "ctr(aes)", + .cct_size = 32 + } +}; + +/* Maximum size of hash_types[].cht_size */ +#define CFS_CRYPTO_HASH_DIGESTSIZE_MAX 64 + +/* Array of hash algorithm speed in MByte per second */ +extern int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX]; + +/** + * Return hash algorithm information for the specified algorithm identifier + * + * Hash information includes algorithm name, initial seed, hash size. + * + * \retval cfs_crypto_hash_type for valid ID (CFS_HASH_ALG_*) + * \retval NULL for unknown algorithm identifier + */ +static inline const struct +cfs_crypto_hash_type *cfs_crypto_hash_type(enum cfs_crypto_hash_alg hash_alg) +{ + struct cfs_crypto_hash_type *ht; + + if (hash_alg < CFS_HASH_ALG_MAX) { + ht = &hash_types[hash_alg]; + if (ht->cht_name != NULL) + return ht; + } + return NULL; +} + +/** + * Return hash name for hash algorithm identifier + * + * \param[in] hash_alg hash alrgorithm id (CFS_HASH_ALG_*) + * + * \retval string name of known hash algorithm + * \retval "unknown" if hash algorithm is unknown + */ +static inline const +char *cfs_crypto_hash_name(enum cfs_crypto_hash_alg hash_alg) +{ + const struct cfs_crypto_hash_type *ht; + + ht = cfs_crypto_hash_type(hash_alg); + if (ht) + return ht->cht_name; + + return "unknown"; +} + +/** + * Return digest size for hash algorithm type + * + * \param[in] hash_alg hash alrgorithm id (CFS_HASH_ALG_*) + * + * \retval hash algorithm digest size in bytes + * \retval 0 if hash algorithm type is unknown + */ +static inline +unsigned int cfs_crypto_hash_digestsize(enum cfs_crypto_hash_alg hash_alg) +{ + const struct cfs_crypto_hash_type *ht; + + ht = cfs_crypto_hash_type(hash_alg); + if (ht != NULL) + return ht->cht_size; + + return 0; +} + +/** + * Find hash algorithm ID for the specified algorithm name + * + * \retval hash algorithm ID for valid ID (CFS_HASH_ALG_*) + * \retval CFS_HASH_ALG_UNKNOWN for unknown algorithm name + */ +static inline unsigned char cfs_crypto_hash_alg(const char *algname) +{ + enum cfs_crypto_hash_alg hash_alg; + + for (hash_alg = 0; hash_alg < CFS_HASH_ALG_MAX; hash_alg++) + if (strcmp(hash_types[hash_alg].cht_name, algname) == 0) + return hash_alg; + + return CFS_HASH_ALG_UNKNOWN; +} + +/** + * Return crypt algorithm information for the specified algorithm identifier + * + * Crypt information includes algorithm name, key size. 
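A minimal sketch of the name-to-algorithm helpers above combined with cfs_crypto_hash_digest() (declared further down in this header); checksum_buffer() is hypothetical, and passing a NULL key is assumed to select the algorithm's default seed (cht_key).

/* Sketch only: checksum_buffer() is hypothetical. */
static int checksum_buffer(const void *buf, unsigned int len,
                           unsigned char *out, unsigned int *out_len)
{
        enum cfs_crypto_hash_alg alg = cfs_crypto_hash_alg("crc32c");

        if (alg == CFS_HASH_ALG_UNKNOWN)
                return -EINVAL;

        if (*out_len < cfs_crypto_hash_digestsize(alg))
                return -ENOSPC;

        /* NULL key / key_len == 0: assumed to use the default seed. */
        return cfs_crypto_hash_digest(alg, buf, len, NULL, 0, out, out_len);
}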
+ * + * \retval cfs_crypto_crupt_type for valid ID (CFS_CRYPT_ALG_*) + * \retval NULL for unknown algorithm identifier + */ +static inline const struct +cfs_crypto_crypt_type *cfs_crypto_crypt_type( + enum cfs_crypto_crypt_alg crypt_alg) +{ + struct cfs_crypto_crypt_type *ct; + + if (crypt_alg < CFS_CRYPT_ALG_MAX) { + ct = &crypt_types[crypt_alg]; + if (ct->cct_name != NULL) + return ct; + } + return NULL; +} + +/** + * Return crypt name for crypt algorithm identifier + * + * \param[in] crypt_alg crypt alrgorithm id (CFS_CRYPT_ALG_*) + * + * \retval string name of known crypt algorithm + * \retval "unknown" if hash algorithm is unknown + */ +static inline const +char *cfs_crypto_crypt_name(enum cfs_crypto_crypt_alg crypt_alg) +{ + const struct cfs_crypto_crypt_type *ct; + + ct = cfs_crypto_crypt_type(crypt_alg); + if (ct) + return ct->cct_name; + + return "unknown"; +} + + +/** + * Return key size for crypto algorithm type + * + * \param[in] crypt_alg crypt alrgorithm id (CFS_CRYPT_ALG_*) + * + * \retval crypt algorithm key size in bytes + * \retval 0 if crypt algorithm type is unknown + */ +static inline +unsigned int cfs_crypto_crypt_keysize(enum cfs_crypto_crypt_alg crypt_alg) +{ + const struct cfs_crypto_crypt_type *ct; + + ct = cfs_crypto_crypt_type(crypt_alg); + if (ct != NULL) + return ct->cct_size; + + return 0; +} + +/** + * Find crypto algorithm ID for the specified algorithm name + * + * \retval crypto algorithm ID for valid ID (CFS_CRYPT_ALG_*) + * \retval CFS_CRYPT_ALG_UNKNOWN for unknown algorithm name + */ +static inline unsigned char cfs_crypto_crypt_alg(const char *algname) +{ + enum cfs_crypto_crypt_alg crypt_alg; + + for (crypt_alg = 0; crypt_alg < CFS_CRYPT_ALG_MAX; crypt_alg++) + if (strcmp(crypt_types[crypt_alg].cct_name, algname) == 0) + return crypt_alg; + + return CFS_CRYPT_ALG_UNKNOWN; +} + +int cfs_crypto_hash_digest(enum cfs_crypto_hash_alg hash_alg, + const void *buf, unsigned int buf_len, + unsigned char *key, unsigned int key_len, + unsigned char *hash, unsigned int *hash_len); + +/* cfs crypto hash descriptor */ +struct page; + +struct ahash_request * + cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, + unsigned char *key, unsigned int key_len); +int cfs_crypto_hash_update_page(struct ahash_request *req, + struct page *page, unsigned int offset, + unsigned int len); +int cfs_crypto_hash_update(struct ahash_request *req, const void *buf, + unsigned int buf_len); +int cfs_crypto_hash_final(struct ahash_request *req, + unsigned char *hash, unsigned int *hash_len); +int cfs_crypto_register(void); +void cfs_crypto_unregister(void); +int cfs_crypto_hash_speed(enum cfs_crypto_hash_alg hash_alg); +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h new file mode 100644 index 0000000000000..f7d5bd9dd0126 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h @@ -0,0 +1,328 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/libcfs_debug.h + * + * Debug messages and assertions + * + */ + +#ifndef __LIBCFS_DEBUG_H__ +#define __LIBCFS_DEBUG_H__ + +#include +#include +#include + +/* + * Debugging + */ +extern unsigned int libcfs_subsystem_debug; +extern unsigned int libcfs_stack; +extern unsigned int libcfs_debug; +extern unsigned int libcfs_printk; +extern unsigned int libcfs_watchdog_ratelimit; +extern unsigned int libcfs_console_ratelimit; +extern unsigned int libcfs_console_max_delay; +extern unsigned int libcfs_console_min_delay; +extern unsigned int libcfs_console_backoff; +extern unsigned int libcfs_debug_binary; +extern char *libcfs_debug_file_path; + +struct task_struct; + +int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys); +int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys); +void libcfs_debug_dumpstack(struct task_struct *tsk); + +/* Has there been an LBUG? */ +extern unsigned int libcfs_catastrophe; +extern unsigned int libcfs_panic_on_lbug; + +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif + +#define CDEBUG_DEFAULT_MAX_DELAY (cfs_time_seconds(600)) /* jiffies */ +#define CDEBUG_DEFAULT_MIN_DELAY ((cfs_time_seconds(1) + 1) / 2) /* jiffies */ +#define CDEBUG_DEFAULT_BACKOFF 2 +struct cfs_debug_limit_state { + unsigned long cdls_next; + unsigned int cdls_delay; + int cdls_count; +}; + +struct libcfs_debug_msg_data { + const char *msg_file; + const char *msg_fn; + int msg_subsys; + int msg_line; + int msg_mask; + struct cfs_debug_limit_state *msg_cdls; +}; + +#define LIBCFS_DEBUG_MSG_DATA_INIT(file, func, line, msgdata, mask, cdls)\ +do { \ + (msgdata)->msg_subsys = DEBUG_SUBSYSTEM; \ + (msgdata)->msg_file = (file); \ + (msgdata)->msg_fn = (func); \ + (msgdata)->msg_line = (line); \ + (msgdata)->msg_mask = (mask); \ + (msgdata)->msg_cdls = (cdls); \ +} while (0) + +#define LIBCFS_DEBUG_MSG_DATA_DECL_LOC(file, func, line, msgdata, mask, cdls)\ + static struct libcfs_debug_msg_data msgdata = { \ + .msg_subsys = DEBUG_SUBSYSTEM, \ + .msg_file = (file), \ + .msg_fn = (func), \ + .msg_line = (line), \ + .msg_cdls = (cdls) }; \ + msgdata.msg_mask = (mask) + +#define LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, cdls) \ + LIBCFS_DEBUG_MSG_DATA_DECL_LOC(__FILE__, __func__, __LINE__, \ + msgdata, mask, cdls) + +#ifdef CDEBUG_ENABLED + +#if !defined(__x86_64__) +# ifdef __ia64__ +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_dwarf_cfa() & \ + (THREAD_SIZE - 1))) +# else +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_frame_address(0) & \ + (THREAD_SIZE - 1))) +# endif /* __ia64__ */ + +#define __CHECK_STACK_WITH_LOC(file, func, line, msgdata, mask, cdls) \ +do { \ + if (unlikely(CDEBUG_STACK() > libcfs_stack)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(file, func, line, msgdata, \ + D_WARNING, NULL); \ + libcfs_stack = CDEBUG_STACK(); \ + libcfs_debug_msg(msgdata, "maximum lustre stack %u\n", \ + libcfs_stack); \ + 
(msgdata)->msg_mask = mask; \ + (msgdata)->msg_cdls = cdls; \ + dump_stack(); \ + /*panic("LBUG");*/ \ + } \ +} while (0) +#else /* __x86_64__ */ +#define CDEBUG_STACK() (0L) +#define __CHECK_STACK_WITH_LOC(file, func, line, msgdata, mask, cdls) \ + do {} while (0) +#endif /* __x86_64__ */ + +#define CFS_CHECK_STACK(msgdata, mask, cdls) \ + __CHECK_STACK_WITH_LOC(__FILE__, __func__, __LINE__, \ + msgdata, mask, cdls) +/** + * Filters out logging messages based on mask and subsystem. + */ +static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) +{ + return mask & D_CANTMASK || + ((libcfs_debug & mask) && (libcfs_subsystem_debug & subsystem)); +} + +# define __CDEBUG_WITH_LOC(file, func, line, mask, cdls, format, ...) \ +do { \ + static struct libcfs_debug_msg_data msgdata; \ + \ + __CHECK_STACK_WITH_LOC(file, func, line, &msgdata, mask, cdls); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(file, func, line, \ + &msgdata, mask, cdls); \ + libcfs_debug_msg(&msgdata, format, ## __VA_ARGS__); \ + } \ +} while (0) + +# define CDEBUG(mask, format, ...) \ + __CDEBUG_WITH_LOC(__FILE__, __func__, __LINE__, \ + mask, NULL, format, ## __VA_ARGS__) + +# define CDEBUG_LIMIT(mask, format, ...) \ +do { \ + static struct cfs_debug_limit_state cdls; \ + \ + __CDEBUG_WITH_LOC(__FILE__, __func__, __LINE__, \ + mask, &cdls, format, ## __VA_ARGS__); \ +} while (0) + +# else /* !CDEBUG_ENABLED */ +static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) +{ + return 0; +} +# define CDEBUG(mask, format, ...) (void)(0) +# define CDEBUG_LIMIT(mask, format, ...) (void)(0) +# warning "CDEBUG IS DISABLED. THIS SHOULD NEVER BE DONE FOR PRODUCTION!" +# endif /* CDEBUG_ENABLED */ + +/* + * Lustre Error Checksum: calculates checksum + * of Hex number by XORing each bit. + */ +#define LERRCHKSUM(hexnum) (((hexnum) & 0xf) ^ ((hexnum) >> 4 & 0xf) ^ \ + ((hexnum) >> 8 & 0xf)) + +#define CWARN(format, ...) CDEBUG_LIMIT(D_WARNING, format, ## __VA_ARGS__) +#define CERROR(format, ...) CDEBUG_LIMIT(D_ERROR, format, ## __VA_ARGS__) +#define CNETERR(format, a...) CDEBUG_LIMIT(D_NETERROR, format, ## a) +#define CEMERG(format, ...) CDEBUG_LIMIT(D_EMERG, format, ## __VA_ARGS__) + +#define LCONSOLE(mask, format, ...) CDEBUG(D_CONSOLE | (mask), format, ## __VA_ARGS__) +#define LCONSOLE_INFO(format, ...) CDEBUG_LIMIT(D_CONSOLE, format, ## __VA_ARGS__) +#define LCONSOLE_WARN(format, ...) CDEBUG_LIMIT(D_CONSOLE | D_WARNING, format, ## __VA_ARGS__) +#define LCONSOLE_ERROR_MSG(errnum, format, ...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, \ + "%x-%x: " format, errnum, LERRCHKSUM(errnum), ## __VA_ARGS__) +#define LCONSOLE_ERROR(format, ...) LCONSOLE_ERROR_MSG(0x00, format, ## __VA_ARGS__) + +#define LCONSOLE_EMERG(format, ...) \ + CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__) + +int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, + const char *format1, ...) 
+ __printf(2, 3); + +/* other external symbols that tracefile provides: */ +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_buffer, char *append); + +#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" + +#if defined(CDEBUG_ENTRY_EXIT) + +static inline long libcfs_log_return(struct libcfs_debug_msg_data *msgdata, long rc) +{ + libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n", + rc, rc, rc); + return rc; +} + +static inline void libcfs_log_goto(struct libcfs_debug_msg_data *msgdata, + const char *label, long rc) +{ + libcfs_debug_msg(msgdata, + "Process leaving via %s (rc=%lu : %ld : %#lx)\n", + label, rc, rc, rc); +} + +# define GOTO(label, rc) \ +do { \ + if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(_goto_data, D_TRACE, NULL); \ + libcfs_log_goto(&_goto_data, #label, (long)(rc)); \ + } else { \ + (void)(rc); \ + } \ + \ + goto label; \ +} while (0) + +# if BITS_PER_LONG > 32 +# define RETURN(rc) \ +do { \ + if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL); \ + return (typeof(rc))libcfs_log_return(&msgdata, \ + (long)(rc)); \ + } \ + \ + return rc; \ +} while (0) +# else /* BITS_PER_LONG == 32 */ +/* We need an on-stack variable, because we cannot case a 32-bit pointer + * directly to (long long) without generating a complier warning/error, yet + * casting directly to (long) will truncate 64-bit return values. The log + * values will print as 32-bit values, but they always have been. LU-1436 + */ +# define RETURN(rc) \ +do { \ + if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) { \ + typeof(rc) __rc = (rc); \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL); \ + libcfs_log_return(&msgdata, (long)__rc); \ + return __rc; \ + } \ + \ + return rc; \ +} while (0) + +# endif /* BITS_PER_LONG > 32 */ + +# define ENTRY CDEBUG(D_TRACE, "Process entered\n") +# define EXIT CDEBUG(D_TRACE, "Process leaving\n") + +#else /* !CDEBUG_ENTRY_EXIT */ + +# define GOTO(label, rc) \ + do { \ + ((void)(rc)); \ + goto label; \ + } while (0) + +# define RETURN(rc) return (rc) +# define ENTRY do { } while (0) +# define EXIT do { } while (0) + +#endif /* CDEBUG_ENTRY_EXIT */ + +#define RETURN_EXIT \ +do { \ + EXIT; \ + return; \ +} while (0) + +void cfs_debug_init(void); + +static inline void cfs_tty_write_msg(const char *msg) +{ + struct tty_struct *tty; + + tty = get_current_tty(); + if (!tty) + return; + mutex_lock(&tty->atomic_write_lock); + tty_lock(tty); + if (tty->ops->write && tty->count > 0) + tty->ops->write(tty, msg, strlen(msg)); + tty_unlock(tty); + mutex_unlock(&tty->atomic_write_lock); + wake_up_interruptible_poll(&tty->write_wait, POLL_OUT); + tty_kref_put(tty); +} + +#endif /* __LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h new file mode 100644 index 0000000000000..9e57506974d23 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h @@ -0,0 +1,226 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
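A minimal sketch of the CDEBUG/GOTO/RETURN idiom defined above, assuming a hypothetical myfs_do_work() in a .c file that defines DEBUG_SUBSYSTEM before including the libcfs headers; the macros and D_* masks are the ones this header provides.

/* Sketch only: myfs_do_work() is hypothetical. */
static int myfs_do_work(struct inode *inode, int flags)
{
        int rc;
        ENTRY;                          /* D_TRACE "Process entered" */

        if (flags < 0)
                GOTO(out, rc = -EINVAL);

        CDEBUG(D_INFO, "processing inode, flags=%d\n", flags);
        rc = 0;
out:
        if (rc)
                CWARN("work failed: rc = %d\n", rc);    /* rate-limited */
        RETURN(rc);                     /* logs the return value if D_TRACE */
}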
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Oracle Corporation, Inc. + */ + +#ifndef _LIBCFS_FAIL_H +#define _LIBCFS_FAIL_H + +extern unsigned long cfs_fail_loc; +extern unsigned int cfs_fail_val; +extern int cfs_fail_err; + +extern wait_queue_head_t cfs_race_waitq; +extern int cfs_race_state; + +int __cfs_fail_check_set(__u32 id, __u32 value, int set); +int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set); + +enum { + CFS_FAIL_LOC_NOSET = 0, + CFS_FAIL_LOC_ORSET = 1, + CFS_FAIL_LOC_RESET = 2, + CFS_FAIL_LOC_VALUE = 3 +}; + +/* Failure ranges + "0x0100 - 0x3fff" for Lustre + "0xe000 - 0xefff" for LNet + "0xf000 - 0xffff" for LNDs */ +/* Failure injection control */ +#define CFS_FAIL_MASK_SYS 0x0000FF00 +#define CFS_FAIL_MASK_LOC (0x000000FF | CFS_FAIL_MASK_SYS) + +#define CFS_FAILED_BIT 30 +/* CFS_FAILED is 0x40000000 */ +#define CFS_FAILED BIT(CFS_FAILED_BIT) + +#define CFS_FAIL_ONCE_BIT 31 +/* CFS_FAIL_ONCE is 0x80000000 */ +#define CFS_FAIL_ONCE BIT(CFS_FAIL_ONCE_BIT) + +/* The following flags aren't made to be combined */ +#define CFS_FAIL_SKIP 0x20000000 /* skip N times then fail */ +#define CFS_FAIL_SOME 0x10000000 /* only fail N times */ +#define CFS_FAIL_RAND 0x08000000 /* fail 1/N of the times */ +#define CFS_FAIL_USR1 0x04000000 /* user flag */ + +/* CFS_FAULT may be combined with any one of the above flags. 
*/ +#define CFS_FAULT 0x02000000 /* match any CFS_FAULT_CHECK */ + +static inline bool CFS_FAIL_PRECHECK(__u32 id) +{ + return cfs_fail_loc != 0 && + ((cfs_fail_loc & CFS_FAIL_MASK_LOC) == (id & CFS_FAIL_MASK_LOC) || + (cfs_fail_loc & id & CFS_FAULT)); +} + +static inline int cfs_fail_check_set(__u32 id, __u32 value, int set, int quiet) +{ + unsigned long failed_once = cfs_fail_loc & CFS_FAILED; /* ok if racy */ + int ret = 0; + + if (unlikely(CFS_FAIL_PRECHECK(id) && + (ret = __cfs_fail_check_set(id, value, set)))) { + if (quiet && failed_once) { + CDEBUG(D_INFO, "*** cfs_fail_loc=%x, val=%u***\n", + id, value); + } else { + LCONSOLE_INFO("*** cfs_fail_loc=%x, val=%u***\n", + id, value); + } + } + + return ret; +} + +/* If id hit cfs_fail_loc, return 1, otherwise return 0 */ +#define CFS_FAIL_CHECK(id) \ + cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 0) +#define CFS_FAIL_CHECK_QUIET(id) \ + cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 1) + +/* If id hit cfs_fail_loc and cfs_fail_val == (-1 or value) return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_VALUE(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 0) +#define CFS_FAIL_CHECK_VALUE_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 1) + +/* If id hit cfs_fail_loc, cfs_fail_loc |= value and return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_ORSET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 0) +#define CFS_FAIL_CHECK_ORSET_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 1) + +/* If id hit cfs_fail_loc, cfs_fail_loc = value and return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_RESET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 0) +#define CFS_FAIL_CHECK_RESET_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 1) + +static inline int cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) +{ + if (unlikely(CFS_FAIL_PRECHECK(id))) + return __cfs_fail_timeout_set(id, value, ms, set); + else + return 0; +} + +/* If id hit cfs_fail_loc, sleep for seconds or milliseconds */ +#define CFS_FAIL_TIMEOUT(id, secs) \ + cfs_fail_timeout_set(id, 0, (secs) * 1000, CFS_FAIL_LOC_NOSET) + +#define CFS_FAIL_TIMEOUT_MS(id, ms) \ + cfs_fail_timeout_set(id, 0, ms, CFS_FAIL_LOC_NOSET) + +/* If id hit cfs_fail_loc, cfs_fail_loc |= value and + * sleep seconds or milliseconds */ +#define CFS_FAIL_TIMEOUT_ORSET(id, value, secs) \ + cfs_fail_timeout_set(id, value, (secs) * 1000, CFS_FAIL_LOC_ORSET) + +#define CFS_FAIL_TIMEOUT_RESET(id, value, secs) \ + cfs_fail_timeout_set(id, value, secs * 1000, CFS_FAIL_LOC_RESET) + +#define CFS_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \ + cfs_fail_timeout_set(id, value, ms, CFS_FAIL_LOC_ORSET) + +#define CFS_FAULT_CHECK(id) \ + CFS_FAIL_CHECK(CFS_FAULT | (id)) + +/* The idea here is to synchronise two threads to force a race. The + * first thread that calls this with a matching fail_loc is put to + * sleep. The next thread that calls with the same fail_loc wakes up + * the first and continues. */ +static inline void cfs_race(__u32 id) +{ + if (CFS_FAIL_PRECHECK(id)) { + if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { + int rc; + cfs_race_state = 0; + CERROR("cfs_race id %x sleeping\n", id); + /* + * XXX: don't wait forever as there is no guarantee + * that this branch is executed first. 
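A hedged sketch of how the CFS_FAIL_CHECK()/CFS_FAIL_TIMEOUT() wrappers above are consumed at a fault-injection point. The 0x1234 id, the function and the chosen error are invented; cfs_fail_loc and cfs_fail_val are normally set from user space (for example through lctl set_param fail_loc=... in Lustre test scripts).

#define OBD_FAIL_DEMO_SEND	0x1234		/* hypothetical fail-loc id */

static int demo_send(void)
{
	/* Returns 1 whenever cfs_fail_loc matches this id (subject to the
	 * ONCE/SKIP/SOME/RAND modifier bits), letting tests force an error. */
	if (CFS_FAIL_CHECK(OBD_FAIL_DEMO_SEND))
		return -EIO;

	/* Or stall instead of failing: sleep 2 seconds when the id matches. */
	CFS_FAIL_TIMEOUT(OBD_FAIL_DEMO_SEND, 2);

	return 0;
}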
for testing + * purposes this construction works good enough + */ + rc = wait_event_interruptible_timeout(cfs_race_waitq, + cfs_race_state != 0, + cfs_time_seconds(5)); + CERROR("cfs_fail_race id %x awake: rc=%d\n", id, rc); + } else { + CERROR("cfs_fail_race id %x waking\n", id); + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } + } +} +#define CFS_RACE(id) cfs_race(id) + +/** + * Wait on race. + * + * The first thread that calls this with a matching fail_loc is put to sleep, + * but subseqent callers of this won't sleep. Until another thread that calls + * cfs_race_wakeup(), the first thread will be woken up and continue. + */ +static inline void cfs_race_wait(__u32 id) +{ + if (CFS_FAIL_PRECHECK(id)) { + if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { + int rc; + + cfs_race_state = 0; + CERROR("cfs_race id %x sleeping\n", id); + rc = wait_event_interruptible(cfs_race_waitq, + cfs_race_state != 0); + CERROR("cfs_fail_race id %x awake: rc=%d\n", id, rc); + } + } +} +#define CFS_RACE_WAIT(id) cfs_race_wait(id) + +/** + * Wake up the thread that is waiting on the matching fail_loc. + */ +static inline void cfs_race_wakeup(__u32 id) +{ + if (CFS_FAIL_PRECHECK(id)) { + if (likely(!__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { + CERROR("cfs_fail_race id %x waking\n", id); + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } + } +} +#define CFS_RACE_WAKEUP(id) cfs_race_wakeup(id) + +#endif /* _LIBCFS_FAIL_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_hash.h new file mode 100644 index 0000000000000..bdf3cdd37754f --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_hash.h @@ -0,0 +1,869 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/libcfs_hash.h + * + * Hashing routines + * + */ + +#ifndef __LIBCFS_HASH_H__ +#define __LIBCFS_HASH_H__ + +#include +#include +#include + +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. 
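To make the cfs_race()/cfs_race_wait()/cfs_race_wakeup() pairing concrete, a sketch of two hypothetical call sites that force an ordering during testing; the 0x2345 id and both functions are invented for the example.

#define OBD_FAIL_DEMO_RACE	0x2345		/* hypothetical fail-loc id */

/* Thread A parks until thread B reaches its wakeup point. */
static void demo_thread_a(void)
{
	CFS_RACE_WAIT(OBD_FAIL_DEMO_RACE);	/* first matching caller sleeps */
	/* ... code that must run after thread B's work ... */
}

/* Thread B releases thread A once its half of the race is done. */
static void demo_thread_b(void)
{
	/* ... work that must happen before thread A continues ... */
	CFS_RACE_WAKEUP(OBD_FAIL_DEMO_RACE);	/* wakes the waiter, if any */
}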
+ */ +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define CFS_GOLDEN_RATIO_PRIME_32 0x9e370001UL +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define CFS_GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL + +/** disable debug */ +#define CFS_HASH_DEBUG_NONE 0 +/** record hash depth and output to console when it's too deep, + * computing overhead is low but consume more memory */ +#define CFS_HASH_DEBUG_1 1 +/** expensive, check key validation */ +#define CFS_HASH_DEBUG_2 2 + +#define CFS_HASH_DEBUG_LEVEL CFS_HASH_DEBUG_NONE + +struct cfs_hash_ops; +struct cfs_hash_lock_ops; +struct cfs_hash_hlist_ops; + +union cfs_hash_lock { + rwlock_t rw; /**< rwlock */ + spinlock_t spin; /**< spinlock */ + struct rw_semaphore rw_sem; /**< rwsem */ +}; + +/** + * cfs_hash_bucket is a container of: + * - lock, counter ... + * - array of hash-head starting from hsb_head[0], hash-head can be one of + * . struct cfs_hash_head + * . struct cfs_hash_head_dep + * . struct cfs_hash_dhead + * . struct cfs_hash_dhead_dep + * which depends on requirement of user + * - some extra bytes (caller can require it while creating hash) + */ +struct cfs_hash_bucket { + union cfs_hash_lock hsb_lock; /**< bucket lock */ + __u32 hsb_count; /**< current entries */ + __u32 hsb_version; /**< change version */ + unsigned int hsb_index; /**< index of bucket */ + int hsb_depmax; /**< max depth on bucket */ + long hsb_head[0]; /**< hash-head array */ +}; + +/** + * cfs_hash bucket descriptor, it's normally in stack of caller + */ +struct cfs_hash_bd { + /**< address of bucket */ + struct cfs_hash_bucket *bd_bucket; + /**< offset in bucket */ + unsigned int bd_offset; +}; + +#define CFS_HASH_NAME_LEN 16 /**< default name length */ +#define CFS_HASH_BIGNAME_LEN 64 /**< bigname for param tree */ + +#define CFS_HASH_BKT_BITS 3 /**< default bits of bucket */ +#define CFS_HASH_BITS_MAX 30 /**< max bits of bucket */ +#define CFS_HASH_BITS_MIN CFS_HASH_BKT_BITS + +/** + * common hash attributes. + */ +enum cfs_hash_tag { + /** + * don't need any lock, caller will protect operations with it's + * own lock. With this flag: + * . CFS_HASH_NO_BKTLOCK, CFS_HASH_RW_BKTLOCK, CFS_HASH_SPIN_BKTLOCK + * will be ignored. + * . 
Some functions will be disabled with this flag, i.e: + * cfs_hash_for_each_empty, cfs_hash_rehash + */ + CFS_HASH_NO_LOCK = BIT(0), + /** no bucket lock, use one spinlock to protect the whole hash */ + CFS_HASH_NO_BKTLOCK = BIT(1), + /** rwlock to protect bucket */ + CFS_HASH_RW_BKTLOCK = BIT(2), + /** spinlock to protect bucket */ + CFS_HASH_SPIN_BKTLOCK = BIT(3), + /** always add new item to tail */ + CFS_HASH_ADD_TAIL = BIT(4), + /** hash-table doesn't have refcount on item */ + CFS_HASH_NO_ITEMREF = BIT(5), + /** big name for param-tree */ + CFS_HASH_BIGNAME = BIT(6), + /** track global count */ + CFS_HASH_COUNTER = BIT(7), + /** rehash item by new key */ + CFS_HASH_REHASH_KEY = BIT(8), + /** Enable dynamic hash resizing */ + CFS_HASH_REHASH = BIT(9), + /** can shrink hash-size */ + CFS_HASH_SHRINK = BIT(10), + /** assert hash is empty on exit */ + CFS_HASH_ASSERT_EMPTY = BIT(11), + /** record hlist depth */ + CFS_HASH_DEPTH = BIT(12), + /** + * rehash is always scheduled in a different thread, so current + * change on hash table is non-blocking + */ + CFS_HASH_NBLK_CHANGE = BIT(13), + /** rw semaphore lock to protect bucket */ + CFS_HASH_RW_SEM_BKTLOCK = BIT(14), + /** NB, we typed hs_flags as __u16, please change it + * if you need to extend >=16 flags + */ +}; + +/** most used attributes */ +#define CFS_HASH_DEFAULT (CFS_HASH_RW_BKTLOCK | \ + CFS_HASH_COUNTER | CFS_HASH_REHASH) + +/** + * cfs_hash is a hash-table implementation for general purpose, it can support: + * . two refcount modes + * hash-table with & without refcount + * . four lock modes + * nolock, one-spinlock, rw-bucket-lock, spin-bucket-lock + * . general operations + * lookup, add(add_tail or add_head), delete + * . rehash + * grows or shrink + * . iteration + * locked iteration and unlocked iteration + * . bigname + * support long name hash + * . debug + * trace max searching depth + * + * Rehash: + * When the htable grows or shrinks, a separate task (cfs_hash_rehash_worker) + * is spawned to handle the rehash in the background, it's possible that other + * processes can concurrently perform additions, deletions, and lookups + * without being blocked on rehash completion, because rehash will release + * the global wrlock for each bucket. + * + * rehash and iteration can't run at the same time because it's too tricky + * to keep both of them safe and correct. + * As they are relatively rare operations, so: + * . if iteration is in progress while we try to launch rehash, then + * it just giveup, iterator will launch rehash at the end. + * . if rehash is in progress while we try to iterate the hash table, + * then we just wait (shouldn't be very long time), anyway, nobody + * should expect iteration of whole hash-table to be non-blocking. + * + * During rehashing, a (key,object) pair may be in one of two buckets, + * depending on whether the worker task has yet to transfer the object + * to its new location in the table. Lookups and deletions need to search both + * locations; additions must take care to only insert into the new bucket. 
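As a concrete reading of the geometry above (the numbers are only an example): with hs_cur_bits = 10 and hs_bkt_bits = 3 (the default CFS_HASH_BKT_BITS, both fields of struct cfs_hash below), the table has 2^10 = 1024 hash heads in total, grouped into 2^(10-3) = 128 buckets of 2^3 = 8 hlists each. Since each cfs_hash_bucket carries its own hsb_lock, concurrent callers contend on 128 bucket locks rather than on one table-wide lock, and a rehash only changes hs_cur_bits; the per-bucket layout chosen at create time stays fixed.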
+ */ + +struct cfs_hash { + /** serialize with rehash, or serialize all operations if + * the hash-table has CFS_HASH_NO_BKTLOCK */ + union cfs_hash_lock hs_lock; + /** hash operations */ + struct cfs_hash_ops *hs_ops; + /** hash lock operations */ + struct cfs_hash_lock_ops *hs_lops; + /** hash list operations */ + struct cfs_hash_hlist_ops *hs_hops; + /** hash buckets-table */ + struct cfs_hash_bucket **hs_buckets; + /** total number of items on this hash-table */ + atomic_t hs_count; + /** hash flags, see cfs_hash_tag for detail */ + __u16 hs_flags; + /** # of extra-bytes for bucket, for user saving extended attributes */ + __u16 hs_extra_bytes; + /** wants to iterate */ + __u8 hs_iterating; + /** hash-table is dying */ + __u8 hs_exiting; + /** current hash bits */ + __u8 hs_cur_bits; + /** min hash bits */ + __u8 hs_min_bits; + /** max hash bits */ + __u8 hs_max_bits; + /** bits for rehash */ + __u8 hs_rehash_bits; + /** bits for each bucket */ + __u8 hs_bkt_bits; + /** resize min threshold */ + __u16 hs_min_theta; + /** resize max threshold */ + __u16 hs_max_theta; + /** resize count */ + __u32 hs_rehash_count; + /** # of iterators (caller of cfs_hash_for_each_*) */ + __u32 hs_iterators; + /** rehash workitem */ + struct work_struct hs_rehash_work; + /** refcount on this hash table */ + atomic_t hs_refcount; + /** rehash buckets-table */ + struct cfs_hash_bucket **hs_rehash_buckets; +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 + /** serialize debug members */ + spinlock_t hs_dep_lock; + /** max depth */ + unsigned int hs_dep_max; + /** id of the deepest bucket */ + unsigned int hs_dep_bkt; + /** offset in the deepest bucket */ + unsigned int hs_dep_off; + /** bits when we found the max depth */ + unsigned int hs_dep_bits; + /** workitem to output max depth */ + struct work_struct hs_dep_work; +#endif + /** name of htable */ + char hs_name[0]; +}; + +struct cfs_hash_lock_ops { + /** lock the hash table */ + void (*hs_lock)(union cfs_hash_lock *lock, int exclusive); + /** unlock the hash table */ + void (*hs_unlock)(union cfs_hash_lock *lock, int exclusive); + /** lock the hash bucket */ + void (*hs_bkt_lock)(union cfs_hash_lock *lock, int exclusive); + /** unlock the hash bucket */ + void (*hs_bkt_unlock)(union cfs_hash_lock *lock, int exclusive); +}; + +struct cfs_hash_hlist_ops { + /** return hlist_head of hash-head of @bd */ + struct hlist_head *(*hop_hhead)(struct cfs_hash *hs, struct cfs_hash_bd *bd); + /** return hash-head size */ + int (*hop_hhead_size)(struct cfs_hash *hs); + /** add @hnode to hash-head of @bd */ + int (*hop_hnode_add)(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); + /** remove @hnode from hash-head of @bd */ + int (*hop_hnode_del)(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +}; + +struct cfs_hash_ops { + /** return hashed value from @key */ + unsigned (*hs_hash)(struct cfs_hash *hs, const void *key, unsigned mask); + /** return key address of @hnode */ + void * (*hs_key)(struct hlist_node *hnode); + /** copy key from @hnode to @key */ + void (*hs_keycpy)(struct hlist_node *hnode, void *key); + /** + * compare @key with key of @hnode + * returns 1 on a match + */ + int (*hs_keycmp)(const void *key, struct hlist_node *hnode); + /** return object address of @hnode, i.e: container_of(...hnode) */ + void * (*hs_object)(struct hlist_node *hnode); + /** get refcount of item, always called with holding bucket-lock */ + void (*hs_get)(struct cfs_hash *hs, struct hlist_node *hnode); + /** release refcount of item 
*/ + void (*hs_put)(struct cfs_hash *hs, struct hlist_node *hnode); + /** release refcount of item, always called with holding bucket-lock */ + void (*hs_put_locked)(struct cfs_hash *hs, struct hlist_node *hnode); + /** it's called before removing of @hnode */ + void (*hs_exit)(struct cfs_hash *hs, struct hlist_node *hnode); +}; + +/** total number of buckets in @hs */ +#define CFS_HASH_NBKT(hs) \ + (1U << ((hs)->hs_cur_bits - (hs)->hs_bkt_bits)) + +/** total number of buckets in @hs while rehashing */ +#define CFS_HASH_RH_NBKT(hs) \ + (1U << ((hs)->hs_rehash_bits - (hs)->hs_bkt_bits)) + +/** number of hlist for in bucket */ +#define CFS_HASH_BKT_NHLIST(hs) (1U << (hs)->hs_bkt_bits) + +/** total number of hlist in @hs */ +#define CFS_HASH_NHLIST(hs) (1U << (hs)->hs_cur_bits) + +/** total number of hlist in @hs while rehashing */ +#define CFS_HASH_RH_NHLIST(hs) (1U << (hs)->hs_rehash_bits) + +static inline int +cfs_hash_with_no_lock(struct cfs_hash *hs) +{ + /* caller will serialize all operations for this hash-table */ + return (hs->hs_flags & CFS_HASH_NO_LOCK) != 0; +} + +static inline int +cfs_hash_with_no_bktlock(struct cfs_hash *hs) +{ + /* no bucket lock, one single lock to protect the hash-table */ + return (hs->hs_flags & CFS_HASH_NO_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_rw_bktlock(struct cfs_hash *hs) +{ + /* rwlock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_RW_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_spin_bktlock(struct cfs_hash *hs) +{ + /* spinlock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_SPIN_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_rw_sem_bktlock(struct cfs_hash *hs) +{ + /* rw sem lock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_RW_SEM_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_add_tail(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_ADD_TAIL) != 0; +} + +static inline int +cfs_hash_with_no_itemref(struct cfs_hash *hs) +{ + /* hash-table doesn't keep refcount on item, + * item can't be removed from hash unless it's + * ZERO refcount */ + return (hs->hs_flags & CFS_HASH_NO_ITEMREF) != 0; +} + +static inline int +cfs_hash_with_bigname(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_BIGNAME) != 0; +} + +static inline int +cfs_hash_with_counter(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_COUNTER) != 0; +} + +static inline int +cfs_hash_with_rehash(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_REHASH) != 0; +} + +static inline int +cfs_hash_with_rehash_key(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_REHASH_KEY) != 0; +} + +static inline int +cfs_hash_with_shrink(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_SHRINK) != 0; +} + +static inline int +cfs_hash_with_assert_empty(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_ASSERT_EMPTY) != 0; +} + +static inline int +cfs_hash_with_depth(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_DEPTH) != 0; +} + +static inline int +cfs_hash_with_nblk_change(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_NBLK_CHANGE) != 0; +} + +static inline int +cfs_hash_is_exiting(struct cfs_hash *hs) +{ /* cfs_hash_destroy is called */ + return hs->hs_exiting; +} + +static inline int +cfs_hash_is_rehashing(struct cfs_hash *hs) +{ /* rehash is launched */ + return hs->hs_rehash_bits != 0; +} + +static inline int +cfs_hash_is_iterating(struct cfs_hash *hs) +{ /* someone is calling cfs_hash_for_each_* */ + return hs->hs_iterating || hs->hs_iterators != 0; +} 
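To show how the cfs_hash_ops callbacks above fit together, a minimal sketch for a hypothetical object keyed by a 64-bit id. Everything here (demo_item, the callbacks, the reference counter) is invented; cfs_hash_u64_hash() and cfs_hash_create()/cfs_hash_add() are declared further down in this header.

struct demo_item {
	__u64			di_id;		/* key */
	struct hlist_node	di_hnode;	/* linkage into the table */
	atomic_t		di_ref;		/* managed via hs_get/hs_put */
};

static unsigned demo_hs_hash(struct cfs_hash *hs, const void *key, unsigned mask)
{
	return cfs_hash_u64_hash(*(const __u64 *)key, mask);
}

static void *demo_hs_key(struct hlist_node *hnode)
{
	return &container_of(hnode, struct demo_item, di_hnode)->di_id;
}

static int demo_hs_keycmp(const void *key, struct hlist_node *hnode)
{
	return container_of(hnode, struct demo_item, di_hnode)->di_id ==
	       *(const __u64 *)key;
}

static void *demo_hs_object(struct hlist_node *hnode)
{
	return container_of(hnode, struct demo_item, di_hnode);
}

static void demo_hs_get(struct cfs_hash *hs, struct hlist_node *hnode)
{
	atomic_inc(&container_of(hnode, struct demo_item, di_hnode)->di_ref);
}

static void demo_hs_put(struct cfs_hash *hs, struct hlist_node *hnode)
{
	atomic_dec(&container_of(hnode, struct demo_item, di_hnode)->di_ref);
}

static struct cfs_hash_ops demo_hash_ops = {
	.hs_hash	= demo_hs_hash,
	.hs_key		= demo_hs_key,
	.hs_keycmp	= demo_hs_keycmp,
	.hs_object	= demo_hs_object,
	.hs_get		= demo_hs_get,
	.hs_put		= demo_hs_put,
	.hs_put_locked	= demo_hs_put,
};

A table for these items might then be created with cfs_hash_create("demo", 5, 10, CFS_HASH_BKT_BITS, 0, CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, &demo_hash_ops, CFS_HASH_DEFAULT) and populated with cfs_hash_add(hs, &item->di_id, &item->di_hnode); both are declared later in this header, so the exact argument choices here are illustrative only.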
+ +static inline int +cfs_hash_bkt_size(struct cfs_hash *hs) +{ + return offsetof(struct cfs_hash_bucket, hsb_head[0]) + + hs->hs_hops->hop_hhead_size(hs) * CFS_HASH_BKT_NHLIST(hs) + + hs->hs_extra_bytes; +} + +static inline unsigned +cfs_hash_id(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return hs->hs_ops->hs_hash(hs, key, mask); +} + +static inline void * +cfs_hash_key(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_key(hnode); +} + +static inline void +cfs_hash_keycpy(struct cfs_hash *hs, struct hlist_node *hnode, void *key) +{ + if (hs->hs_ops->hs_keycpy != NULL) + hs->hs_ops->hs_keycpy(hnode, key); +} + +/** + * Returns 1 on a match, + */ +static inline int +cfs_hash_keycmp(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_keycmp(key, hnode); +} + +static inline void * +cfs_hash_object(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_object(hnode); +} + +static inline void +cfs_hash_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_get(hs, hnode); +} + +static inline void +cfs_hash_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_put_locked(hs, hnode); +} + +static inline void +cfs_hash_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_put(hs, hnode); +} + +static inline void +cfs_hash_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + if (hs->hs_ops->hs_exit) + hs->hs_ops->hs_exit(hs, hnode); +} + +static inline void cfs_hash_lock(struct cfs_hash *hs, int excl) +{ + hs->hs_lops->hs_lock(&hs->hs_lock, excl); +} + +static inline void cfs_hash_unlock(struct cfs_hash *hs, int excl) +{ + hs->hs_lops->hs_unlock(&hs->hs_lock, excl); +} + +static inline int cfs_hash_dec_and_lock(struct cfs_hash *hs, + atomic_t *condition) +{ + LASSERT(cfs_hash_with_no_bktlock(hs)); + return atomic_dec_and_lock(condition, &hs->hs_lock.spin); +} + +static inline void cfs_hash_bd_lock(struct cfs_hash *hs, + struct cfs_hash_bd *bd, int excl) +{ + hs->hs_lops->hs_bkt_lock(&bd->bd_bucket->hsb_lock, excl); +} + +static inline void cfs_hash_bd_unlock(struct cfs_hash *hs, + struct cfs_hash_bd *bd, int excl) +{ + hs->hs_lops->hs_bkt_unlock(&bd->bd_bucket->hsb_lock, excl); +} + +/** + * operations on cfs_hash bucket (bd: bucket descriptor), + * they are normally for hash-table without rehash + */ +void cfs_hash_bd_get(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bd); + +static inline void +cfs_hash_bd_get_and_lock(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bd, int excl) +{ + cfs_hash_bd_get(hs, key, bd); + cfs_hash_bd_lock(hs, bd, excl); +} + +static inline unsigned +cfs_hash_bd_index_get(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return bd->bd_offset | (bd->bd_bucket->hsb_index << hs->hs_bkt_bits); +} + +static inline void +cfs_hash_bd_index_set(struct cfs_hash *hs, unsigned index, + struct cfs_hash_bd *bd) +{ + bd->bd_bucket = hs->hs_buckets[index >> hs->hs_bkt_bits]; + bd->bd_offset = index & (CFS_HASH_BKT_NHLIST(hs) - 1U); +} + +static inline void * +cfs_hash_bd_extra_get(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return (void *)bd->bd_bucket + + cfs_hash_bkt_size(hs) - hs->hs_extra_bytes; +} + +static inline __u32 +cfs_hash_bd_version_get(struct cfs_hash_bd *bd) +{ + /* need hold cfs_hash_bd_lock */ + return bd->bd_bucket->hsb_version; +} + +static inline __u32 +cfs_hash_bd_count_get(struct cfs_hash_bd *bd) +{ + /* need hold cfs_hash_bd_lock */ + return 
bd->bd_bucket->hsb_count; +} + +static inline int +cfs_hash_bd_depmax_get(struct cfs_hash_bd *bd) +{ + return bd->bd_bucket->hsb_depmax; +} + +static inline int +cfs_hash_bd_compare(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2) +{ + if (bd1->bd_bucket->hsb_index != bd2->bd_bucket->hsb_index) + return bd1->bd_bucket->hsb_index - bd2->bd_bucket->hsb_index; + + if (bd1->bd_offset != bd2->bd_offset) + return bd1->bd_offset - bd2->bd_offset; + + return 0; +} + +void cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +void cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +void cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old, + struct cfs_hash_bd *bd_new, + struct hlist_node *hnode); + +static inline int +cfs_hash_bd_dec_and_lock(struct cfs_hash *hs, struct cfs_hash_bd *bd, + atomic_t *condition) +{ + LASSERT(cfs_hash_with_spin_bktlock(hs)); + return atomic_dec_and_lock(condition, &bd->bd_bucket->hsb_lock.spin); +} + +static inline struct hlist_head * +cfs_hash_bd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return hs->hs_hops->hop_hhead(hs, bd); +} + +struct hlist_node * +cfs_hash_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key); +struct hlist_node * +cfs_hash_bd_peek_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key); +struct hlist_node * +cfs_hash_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode, + int insist_add); +struct hlist_node * +cfs_hash_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode); + +/** + * operations on cfs_hash bucket (bd: bucket descriptor), + * they are safe for hash-table with rehash + */ +void cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bds); +void cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + int excl); +void cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + int excl); + +static inline void +cfs_hash_dual_bd_get_and_lock(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_dual_bd_get(hs, key, bds); + cfs_hash_dual_bd_lock(hs, bds, excl); +} + +struct hlist_node * +cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key); +struct hlist_node * +cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode, + int insist_add); +struct hlist_node * +cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode); + +/* Hash init/cleanup functions */ +struct cfs_hash * +cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits, + unsigned bkt_bits, unsigned extra_bytes, + unsigned min_theta, unsigned max_theta, + struct cfs_hash_ops *ops, unsigned flags); + +struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs); +void cfs_hash_putref(struct cfs_hash *hs); + +/* Hash addition functions */ +void cfs_hash_add(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +int cfs_hash_add_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +void *cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); + +/* Hash deletion functions */ +void *cfs_hash_del(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +void 
*cfs_hash_del_key(struct cfs_hash *hs, const void *key); + +/* Hash lookup/for_each functions */ +#define CFS_HASH_LOOP_HOG 1024 + +typedef int (*cfs_hash_for_each_cb_t)(struct cfs_hash *hs, + struct cfs_hash_bd *bd, + struct hlist_node *node, + void *data); +void * +cfs_hash_lookup(struct cfs_hash *hs, const void *key); +void +cfs_hash_for_each(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); +void +cfs_hash_for_each_safe(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); +int +cfs_hash_for_each_nolock(struct cfs_hash *hs, cfs_hash_for_each_cb_t, + void *data, int start); +int +cfs_hash_for_each_empty(struct cfs_hash *hs, cfs_hash_for_each_cb_t, + void *data); +void +cfs_hash_for_each_key(struct cfs_hash *hs, const void *key, + cfs_hash_for_each_cb_t, void *data); +typedef int (*cfs_hash_cond_opt_cb_t)(void *obj, void *data); +void +cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t, void *data); + +void +cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned hindex, + cfs_hash_for_each_cb_t, void *data); +int cfs_hash_is_empty(struct cfs_hash *hs); +__u64 cfs_hash_size_get(struct cfs_hash *hs); + +/* + * Rehash - Theta is calculated to be the average chained + * hash depth assuming a perfectly uniform hash function. + */ +void cfs_hash_rehash_cancel_locked(struct cfs_hash *hs); +void cfs_hash_rehash_cancel(struct cfs_hash *hs); +void cfs_hash_rehash(struct cfs_hash *hs, int do_rehash); +void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key, + void *new_key, struct hlist_node *hnode); + +#if CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 +/* Validate hnode references the correct key */ +static inline void +cfs_hash_key_validate(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + LASSERT(cfs_hash_keycmp(hs, key, hnode)); +} + +/* Validate hnode is in the correct bucket */ +static inline void +cfs_hash_bucket_validate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_bd bds[2]; + + cfs_hash_dual_bd_get(hs, cfs_hash_key(hs, hnode), bds); + LASSERT(bds[0].bd_bucket == bd->bd_bucket || + bds[1].bd_bucket == bd->bd_bucket); +} + +#else /* CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 */ + +static inline void +cfs_hash_key_validate(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) {} + +static inline void +cfs_hash_bucket_validate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) {} + +#endif /* CFS_HASH_DEBUG_LEVEL */ + +#define CFS_HASH_THETA_BITS 10 +#define CFS_HASH_MIN_THETA (1U << (CFS_HASH_THETA_BITS - 1)) +#define CFS_HASH_MAX_THETA (1U << (CFS_HASH_THETA_BITS + 1)) + +/* Return integer component of theta */ +static inline int __cfs_hash_theta_int(int theta) +{ + return (theta >> CFS_HASH_THETA_BITS); +} + +/* Return a fractional value between 0 and 999 */ +static inline int __cfs_hash_theta_frac(int theta) +{ + return ((theta * 1000) >> CFS_HASH_THETA_BITS) - + (__cfs_hash_theta_int(theta) * 1000); +} + +static inline int __cfs_hash_theta(struct cfs_hash *hs) +{ + return (atomic_read(&hs->hs_count) << + CFS_HASH_THETA_BITS) >> hs->hs_cur_bits; +} + +static inline void +__cfs_hash_set_theta(struct cfs_hash *hs, int min, int max) +{ + LASSERT(min < max); + hs->hs_min_theta = (__u16)min; + hs->hs_max_theta = (__u16)max; +} + +/* Generic debug formatting routines mainly for proc handler */ +struct seq_file; +void cfs_hash_debug_header(struct seq_file *m); +void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m); + +/* + * Generic djb2 hash algorithm for 
character arrays. + */ +static inline unsigned +cfs_hash_djb2_hash(const void *key, size_t size, unsigned mask) +{ + unsigned i, hash = 5381; + + LASSERT(key != NULL); + + for (i = 0; i < size; i++) + hash = hash * 33 + ((char *)key)[i]; + + return (hash & mask); +} + +/* + * Generic u32 hash algorithm. + */ +static inline unsigned +cfs_hash_u32_hash(const __u32 key, unsigned mask) +{ + return ((key * CFS_GOLDEN_RATIO_PRIME_32) & mask); +} + +/* + * Generic u64 hash algorithm. + */ +static inline unsigned +cfs_hash_u64_hash(const __u64 key, unsigned mask) +{ + return ((unsigned)(key * CFS_GOLDEN_RATIO_PRIME_64) & mask); +} + +/** iterate over all buckets in @bds (array of struct cfs_hash_bd) */ +#define cfs_hash_for_each_bd(bds, n, i) \ + for (i = 0; i < n && (bds)[i].bd_bucket != NULL; i++) + +/** iterate over all buckets of @hs */ +#define cfs_hash_for_each_bucket(hs, bd, pos) \ + for (pos = 0; \ + pos < CFS_HASH_NBKT(hs) && \ + ((bd)->bd_bucket = (hs)->hs_buckets[pos]) != NULL; pos++) + +/** iterate over all hlist of bucket @bd */ +#define cfs_hash_bd_for_each_hlist(hs, bd, hlist) \ + for ((bd)->bd_offset = 0; \ + (bd)->bd_offset < CFS_HASH_BKT_NHLIST(hs) && \ + (hlist = cfs_hash_bd_hhead(hs, bd)) != NULL; \ + (bd)->bd_offset++) + +/* !__LIBCFS__HASH_H__ */ +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h new file mode 100644 index 0000000000000..a60f14286f511 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h @@ -0,0 +1,341 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/libcfs_private.h + * + * Various defines for libcfs. + * + */ + +#ifndef __LIBCFS_PRIVATE_H__ +#define __LIBCFS_PRIVATE_H__ + +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif + +#include +#include + +#ifdef LIBCFS_DEBUG + +/* + * When this is on, LASSERT macro includes check for assignment used instead + * of equality check, but doesn't have unlikely(). Turn this on from time to + * time to make test-builds. This shouldn't be on for production release. + */ +#define LASSERT_CHECKED (0) + +#if LASSERT_CHECKED +/* + * Assertion. + * + * Strange construction with empty "then" clause is used to trigger compiler + * warnings on the assertions of the form LASSERT(a = b); + * + * "warning: suggest parentheses around assignment used as truth value" + * + * requires -Wall. Unfortunately this rules out use of likely/unlikely. 
+ */ +#define LASSERTF(cond, fmt, ...) \ +do { \ + if (cond) \ + ; \ + else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL); \ + libcfs_debug_msg(&__msg_data, \ + "ASSERTION( %s ) failed: " fmt, #cond, \ + ## __VA_ARGS__); \ + lbug_with_loc(&__msg_data); \ + } \ +} while (0) + +#define LASSERT(cond) LASSERTF(cond, "\n") + +#else /* !LASSERT_CHECKED */ + +#define LASSERTF(cond, fmt, ...) \ +do { \ + if (unlikely(!(cond))) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL); \ + libcfs_debug_msg(&__msg_data, \ + "ASSERTION( %s ) failed: " fmt, #cond, \ + ## __VA_ARGS__); \ + lbug_with_loc(&__msg_data); \ + } \ +} while (0) + +#define LASSERT(cond) LASSERTF(cond, "\n") +#endif /* !LASSERT_CHECKED */ +#else /* !LIBCFS_DEBUG */ +/* sizeof is to use expression without evaluating it. */ +# define LASSERT(e) ((void)sizeof!!(e)) +# define LASSERTF(cond, ...) ((void)sizeof!!(cond)) +#endif /* !LIBCFS_DEBUG */ + +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK +/** + * This is for more expensive checks that one doesn't want to be enabled all + * the time. LINVRNT() has to be explicitly enabled by --enable-invariants + * configure option. + */ +# define LINVRNT(exp) LASSERT(exp) +#else +# define LINVRNT(exp) ((void)sizeof!!(exp)) +#endif + +void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msg); + +#define LBUG() \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ + lbug_with_loc(&msgdata); \ +} while(0) + +/* + * Memory + */ +#ifdef LIBCFS_DEBUG + +extern atomic64_t libcfs_kmem; + +# define libcfs_kmem_inc(ptr, size) \ +do { \ + atomic64_add(size, &libcfs_kmem); \ +} while (0) + +# define libcfs_kmem_dec(ptr, size) \ +do { \ + atomic64_sub(size, &libcfs_kmem); \ +} while (0) + +# define libcfs_kmem_read() \ + (long long)atomic64_read(&libcfs_kmem) + +#else +# define libcfs_kmem_inc(ptr, size) do {} while (0) +# define libcfs_kmem_dec(ptr, size) do {} while (0) +# define libcfs_kmem_read() (0) +#endif /* LIBCFS_DEBUG */ + +#ifndef LIBCFS_VMALLOC_SIZE +#define LIBCFS_VMALLOC_SIZE (2 << PAGE_SHIFT) /* 2 pages */ +#endif + +#define LIBCFS_ALLOC_PRE(size, mask) \ +do { \ + LASSERT(!in_interrupt() || \ + ((size) <= LIBCFS_VMALLOC_SIZE && \ + ((mask) & GFP_ATOMIC)) != 0); \ +} while (0) + +#define LIBCFS_ALLOC_POST(ptr, size) \ +do { \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LNET: out of memory at %s:%d (tried to alloc '" \ + #ptr "' = %d)\n", __FILE__, __LINE__, (int)(size)); \ + CERROR("LNET: %lld total bytes allocated by lnet\n", \ + libcfs_kmem_read()); \ + } else { \ + libcfs_kmem_inc((ptr), (size)); \ + CDEBUG(D_MALLOC, "alloc '" #ptr "': %d at %p (tot %lld).\n", \ + (int)(size), (ptr), libcfs_kmem_read()); \ + } \ +} while (0) + +/** + * allocate memory with GFP flags @mask + * The allocated memory is zeroed-out. + */ +#define LIBCFS_ALLOC_GFP(ptr, size, mask) \ +do { \ + LIBCFS_ALLOC_PRE((size), (mask)); \ + (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ? \ + kzalloc((size), (mask)) : vzalloc(size); \ + LIBCFS_ALLOC_POST((ptr), (size)); \ +} while (0) + +/** + * default allocator + */ +#define LIBCFS_ALLOC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, GFP_NOFS) + +/** + * non-sleeping allocator + */ +#define LIBCFS_ALLOC_ATOMIC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, GFP_ATOMIC) + +/** + * allocate memory for specified CPU partition + * \a cptab != NULL, \a cpt is CPU partition id of \a cptab + * \a cptab == NULL, \a cpt is HW NUMA node id + * The allocated memory is zeroed-out. 
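A minimal sketch of the allocation conventions above, using a hypothetical demo_ctx type: LIBCFS_ALLOC() zeroes the memory and reports failures itself through the CERROR/CDEBUG paths, so callers only check the pointer, and LIBCFS_FREE() (defined just below) must be given the same size so the libcfs_kmem accounting stays balanced.

struct demo_ctx {
	int dc_count;
};

static struct demo_ctx *demo_ctx_create(int count)
{
	struct demo_ctx *ctx;

	LASSERTF(count > 0, "count: %d\n", count);

	LIBCFS_ALLOC(ctx, sizeof(*ctx));	/* GFP_NOFS, zero-filled */
	if (ctx == NULL)
		return NULL;			/* failure already logged by LIBCFS_ALLOC_POST */

	ctx->dc_count = count;
	return ctx;
}

static void demo_ctx_destroy(struct demo_ctx *ctx)
{
	LIBCFS_FREE(ctx, sizeof(*ctx));		/* same size as the allocation */
}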
+ */ +#define LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, mask) \ +do { \ + LIBCFS_ALLOC_PRE((size), (mask)); \ + (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ? \ + cfs_cpt_malloc((cptab), (cpt), (size), (mask) | __GFP_ZERO) : \ + cfs_cpt_vzalloc((cptab), (cpt), (size)); \ + LIBCFS_ALLOC_POST((ptr), (size)); \ +} while (0) + +/** default numa allocator */ +#define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size) \ + LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS) + +void init_libcfs_vfree_atomic(void); +void exit_libcfs_vfree_atomic(void); + +#define LIBCFS_FREE(ptr, size) \ +do { \ + int s = (size); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + libcfs_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %lld).\n", \ + s, (ptr), libcfs_kmem_read()); \ + if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ + libcfs_vfree_atomic(ptr); \ + else \ + kfree(ptr); \ +} while (0) + +/******************************************************************************/ + +void libcfs_debug_dumplog(void); +int libcfs_debug_init(unsigned long bufsize); +int libcfs_debug_cleanup(void); +int libcfs_debug_clear_buffer(void); +int libcfs_debug_mark_buffer(const char *text); + +#define LASSERT_ATOMIC_ENABLED (1) + +#if LASSERT_ATOMIC_ENABLED + +/** assert value of @a is equal to @v */ +#define LASSERT_ATOMIC_EQ(a, v) \ + LASSERTF(atomic_read(a) == v, "value: %d\n", atomic_read((a))); + +/** assert value of @a is unequal to @v */ +#define LASSERT_ATOMIC_NE(a, v) \ + LASSERTF(atomic_read(a) != v, "value: %d\n", atomic_read((a))); + +/** assert value of @a is little than @v */ +#define LASSERT_ATOMIC_LT(a, v) \ + LASSERTF(atomic_read(a) < v, "value: %d\n", atomic_read((a))); + +/** assert value of @a is little/equal to @v */ +#define LASSERT_ATOMIC_LE(a, v) \ + LASSERTF(atomic_read(a) <= v, "value: %d\n", atomic_read((a))); + +/** assert value of @a is great than @v */ +#define LASSERT_ATOMIC_GT(a, v) \ + LASSERTF(atomic_read(a) > v, "value: %d\n", atomic_read((a))); + +/** assert value of @a is great/equal to @v */ +#define LASSERT_ATOMIC_GE(a, v) \ + LASSERTF(atomic_read(a) >= v, "value: %d\n", atomic_read((a))); + +/** assert value of @a is great than @v1 and little than @v2 */ +#define LASSERT_ATOMIC_GT_LT(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v > v1 && __v < v2, "value: %d\n", __v);\ +} while (0) + +/** assert value of @a is great than @v1 and little/equal to @v2 */ +#define LASSERT_ATOMIC_GT_LE(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v > v1 && __v <= v2, "value: %d\n", __v);\ +} while (0) + +/** assert value of @a is great/equal to @v1 and little than @v2 */ +#define LASSERT_ATOMIC_GE_LT(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v >= v1 && __v < v2, "value: %d\n", __v);\ +} while (0) + +/** assert value of @a is great/equal to @v1 and little/equal to @v2 */ +#define LASSERT_ATOMIC_GE_LE(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v >= v1 && __v <= v2, "value: %d\n", __v); \ +} while (0) + +#else /* !LASSERT_ATOMIC_ENABLED */ + +#define LASSERT_ATOMIC_EQ(a, v) do {} while (0) +#define LASSERT_ATOMIC_NE(a, v) do {} while (0) +#define LASSERT_ATOMIC_LT(a, v) do {} while (0) +#define LASSERT_ATOMIC_LE(a, v) do {} while (0) +#define LASSERT_ATOMIC_GT(a, v) do {} while (0) +#define LASSERT_ATOMIC_GE(a, v) do {} while (0) +#define LASSERT_ATOMIC_GT_LT(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GT_LE(a, v1, v2) do 
{} while (0) +#define LASSERT_ATOMIC_GE_LT(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GE_LE(a, v1, v2) do {} while (0) + +#endif /* LASSERT_ATOMIC_ENABLED */ + +#define LASSERT_ATOMIC_ZERO(a) LASSERT_ATOMIC_EQ(a, 0) +#define LASSERT_ATOMIC_POS(a) LASSERT_ATOMIC_GT(a, 0) + +#define CFS_ALLOC_PTR(ptr) LIBCFS_ALLOC(ptr, sizeof(*(ptr))); +#define CFS_ALLOC_PTR_ARRAY(ptr, count) \ + LIBCFS_ALLOC(ptr, (count) * sizeof(*(ptr))) + +#define CFS_FREE_PTR(ptr) LIBCFS_FREE(ptr, sizeof(*(ptr))); +#define CFS_FREE_PTR_ARRAY(ptr, count) \ + LIBCFS_FREE(ptr, (count) * sizeof(*(ptr))) + +/* implication */ +#define ergo(a, b) (!(a) || (b)) +/* logical equivalence */ +#define equi(a, b) (!!(a) == !!(b)) + +#define MKSTR(ptr) ((ptr))? (ptr) : "" + +#ifndef HAVE_CFS_SIZE_ROUND +static inline size_t cfs_size_round(size_t val) +{ + return (val + 7) & (~0x7); +} +#define HAVE_CFS_SIZE_ROUND +#endif + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h new file mode 100644 index 0000000000000..bc2e03cf3fb42 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h @@ -0,0 +1,86 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/libcfs_string.h + * + * Generic string manipulation functions. + * + * Author: Nathan Rutman + */ + +#ifndef __LIBCFS_STRING_H__ +#define __LIBCFS_STRING_H__ + +/* libcfs_string.c */ +/* Convert a text string to a bitmask */ +int cfs_str2mask(const char *str, const char *(*bit2str)(int bit), + int *oldmask, int minmask, int allmask, int defmask); + +/** + * Structure to represent NULL-less strings. + */ +struct cfs_lstr { + char *ls_str; + int ls_len; +}; + +/* + * Structure to represent \ token of the syntax. + */ +struct cfs_range_expr { + /* + * Link to cfs_expr_list::el_exprs. 
+ */ + struct list_head re_link; + __u32 re_lo; + __u32 re_hi; + __u32 re_stride; +}; + +struct cfs_expr_list { + struct list_head el_link; + struct list_head el_exprs; +}; + +int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res); +int cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max); +int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list); +int cfs_expr_list_print(char *buffer, int count, + struct cfs_expr_list *expr_list); +int cfs_expr_list_values(struct cfs_expr_list *expr_list, + int max, __u32 **values); +void cfs_expr_list_values_free(__u32 *values, int num); +void cfs_expr_list_free(struct cfs_expr_list *expr_list); +int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp); +void cfs_expr_list_free_list(struct list_head *list); + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_workitem.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_workitem.h new file mode 100644 index 0000000000000..d10ec77ca2cd6 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_workitem.h @@ -0,0 +1,103 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/libcfs_workitem.h + * + * Author: Isaac Huang + * Liang Zhen + * + * A workitems is deferred work with these semantics: + * - a workitem always runs in thread context. + * - a workitem can be concurrent with other workitems but is strictly + * serialized with respect to itself. + * - no CPU affinity, a workitem does not necessarily run on the same CPU + * that schedules it. However, this might change in the future. + * - if a workitem is scheduled again before it has a chance to run, it + * runs only once. + * - if a workitem is scheduled while it runs, it runs again after it + * completes; this ensures that events occurring while other events are + * being processed receive due attention. This behavior also allows a + * workitem to reschedule itself. + * + * Usage notes: + * - a workitem can sleep but it should be aware of how that sleep might + * affect others. + * - a workitem runs inside a kernel thread so there's no user space to access. + * - do not use a workitem if the scheduling latency can't be tolerated. + * + * When wi_action returns non-zero, it means the workitem has either been + * freed or reused and workitem scheduler won't touch it any more. 
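Stepping back to the cfs_expr_list interface declared above in libcfs_string.h, a sketch of parse/match/free usage. The "[lo-hi/stride]" expression, the bounds and the function are assumptions for illustration, not taken from this header.

/* Parse a range expression and test membership of a value. */
static int demo_parse_range(void)
{
	struct cfs_expr_list *el = NULL;
	char expr[] = "[0-7/2]";		/* would describe 0, 2, 4, 6 */
	int rc;

	rc = cfs_expr_list_parse(expr, sizeof(expr) - 1, 0, 255, &el);
	if (rc != 0)
		return rc;

	if (!cfs_expr_list_match(4, el))	/* expected to match the expression above */
		rc = -EINVAL;

	cfs_expr_list_free(el);
	return rc;
}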
+ */ + +#ifndef __LIBCFS_WORKITEM_H__ +#define __LIBCFS_WORKITEM_H__ + +struct cfs_wi_sched; + +void cfs_wi_sched_destroy(struct cfs_wi_sched *); +int cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, int cpt, + int nthrs, struct cfs_wi_sched **); + +struct cfs_workitem; + +typedef int (*cfs_wi_action_t) (struct cfs_workitem *); + +struct cfs_workitem { + /** chain on runq or rerunq */ + struct list_head wi_list; + /** working function */ + cfs_wi_action_t wi_action; + /** in running */ + unsigned short wi_running:1; + /** scheduled */ + unsigned short wi_scheduled:1; +}; + +static inline void +cfs_wi_init(struct cfs_workitem *wi, cfs_wi_action_t action) +{ + INIT_LIST_HEAD(&wi->wi_list); + + wi->wi_running = 0; + wi->wi_scheduled = 0; + wi->wi_action = action; +} + +void cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi); +int cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi); +void cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi); + +int cfs_wi_startup(void); +void cfs_wi_shutdown(void); + +/** # workitem scheduler loops before reschedule */ +#define CFS_WI_RESCHED 128 + +#endif /* __LIBCFS_WORKITEM_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/glob.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/glob.h new file mode 100644 index 0000000000000..fca03b5526878 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/glob.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_GLOB_H +#define _LINUX_GLOB_H + +#ifndef HAVE_GLOB + +#include /* For bool */ +#include /* For __pure */ + +bool __pure glob_match(char const *pat, char const *str); +#endif /* !HAVE_GLOB */ + +#endif /* _LINUX_GLOB_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h new file mode 100644 index 0000000000000..22ffe71a4c3e7 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h @@ -0,0 +1,52 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. 
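A sketch of how the workitem interface above is typically used: embed a struct cfs_workitem in the object that carries the work, attach an action with cfs_wi_init(), and queue it on a scheduler created with cfs_wi_sched_create(). The demo_work type and functions are invented; per the header comment, returning 0 from the action means the item is still valid and the scheduler may touch it again.

struct demo_work {
	struct cfs_workitem	dw_wi;
	int			dw_done;
};

/* Runs in a workitem scheduler thread; serialized against itself. */
static int demo_wi_action(struct cfs_workitem *wi)
{
	struct demo_work *work = container_of(wi, struct demo_work, dw_wi);

	work->dw_done = 1;
	return 0;			/* item not freed: scheduler may still reference it */
}

static void demo_submit(struct cfs_wi_sched *sched, struct demo_work *work)
{
	cfs_wi_init(&work->dw_wi, demo_wi_action);
	cfs_wi_schedule(sched, &work->dw_wi);	/* coalesced if already scheduled */
}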
+ * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_LINUX_CPU_H__ +#define __LIBCFS_LINUX_CPU_H__ + +#include + +#ifndef HAVE_TOPOLOGY_SIBLING_CPUMASK +# define topology_sibling_cpumask(cpu) topology_thread_cpumask(cpu) +#endif /* HAVE_TOPOLOGY_SIBLING_CPUMASK */ + +#ifndef HAVE_CPUS_READ_LOCK +# define cpus_read_lock get_online_cpus +# define cpus_read_unlock put_online_cpus +#endif + +#endif /* __LIBCFS_LINUX_CPU_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fortify-string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fortify-string.h new file mode 100644 index 0000000000000..aeed8c5a0614c --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fortify-string.h @@ -0,0 +1,296 @@ +#ifndef _LIBCFS_FORTIFY_STRING_H +#define _LIBCFS_FORTIFY_STRING_H + +#ifdef HAVE_LINUX_FORTIFY_STRING_HEADER +#include + +/* + * Linux v5.11-11104-ga28a6e860c6c introduces fortify-string.h + * where an unsafe_memcpy is provided in Linux v5.18-rc5-1405-g43213daed6d6 + * + * This following is excerpted from the Linux v6.1 fortified memcpy() + * which resolves some corner cases, one of which is triggered in lustre + */ +#ifndef unsafe_memcpy + +#include +#include +#include + +#ifndef __RENAME +#define __RENAME(x) __asm__(#x) +#endif + +void fortify_panic(const char *name) __noreturn __cold; +void __read_overflow(void) __compiletime_error("detected read beyond size of object (1st parameter)"); +void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)"); +void __read_overflow2_field(size_t avail, size_t wanted) __compiletime_warning("detected read beyond size of field (2nd parameter); maybe use struct_group()?"); +void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)"); +void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("detected write beyond size of field (1st parameter); maybe use struct_group()?"); + +#define __compiletime_strlen(p) \ +({ \ + char *__p = (char *)(p); \ + size_t __ret = SIZE_MAX; \ + size_t __p_size = __member_size(p); \ + if (__p_size != SIZE_MAX && \ + __builtin_constant_p(*__p)) { \ + size_t __p_len = __p_size - 1; \ + if (__builtin_constant_p(__p[__p_len]) && \ + __p[__p_len] == '\0') \ + __ret = __builtin_strlen(__p); \ + } \ + __ret; \ +}) + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) +extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); +extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); +extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); +extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); +extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); +extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); +extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); +extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); +extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); +extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); +#else + +#if defined(__SANITIZE_MEMORY__) +/* + * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the + * corresponding __msan_XXX functions. 
+ */ +#include +#define __underlying_memcpy __msan_memcpy +#define __underlying_memmove __msan_memmove +#define __underlying_memset __msan_memset +#else +#define __underlying_memcpy __builtin_memcpy +#define __underlying_memmove __builtin_memmove +#define __underlying_memset __builtin_memset +#endif + +#define __underlying_memchr __builtin_memchr +#define __underlying_memcmp __builtin_memcmp +#define __underlying_strcat __builtin_strcat +#define __underlying_strcpy __builtin_strcpy +#define __underlying_strlen __builtin_strlen +#define __underlying_strncat __builtin_strncat +#define __underlying_strncpy __builtin_strncpy +#endif + +/** + * unsafe_memcpy - memcpy implementation with no FORTIFY bounds checking + * + * @dst: Destination memory address to write to + * @src: Source memory address to read from + * @bytes: How many bytes to write to @dst from @src + * @justification: Free-form text or comment describing why the use is needed + * + * This should be used for corner cases where the compiler cannot do the + * right thing, or during transitions between APIs, etc. It should be used + * very rarely, and includes a place for justification detailing where bounds + * checking has happened, and why existing solutions cannot be employed. + */ +#define unsafe_memcpy(dst, src, bytes, justification) \ + __underlying_memcpy(dst, src, bytes) + +/* + * Clang's use of __builtin_*object_size() within inlines needs hinting via + * __pass_*object_size(). The preference is to only ever use type 1 (member + * size, rather than struct size), but there remain some stragglers using + * type 0 that will be converted in the future. + */ +#define POS __pass_object_size(1) +#define POS0 __pass_object_size(0) +#define __struct_size(p) __builtin_object_size(p, 0) +#define __member_size(p) __builtin_object_size(p, 1) + +#define __compiletime_lessthan(bounds, length) ( \ + __builtin_constant_p((bounds) < (length)) && \ + (bounds) < (length) \ +) + + +/* + * To make sure the compiler can enforce protection against buffer overflows, + * memcpy(), memmove(), and memset() must not be used beyond individual + * struct members. If you need to copy across multiple members, please use + * struct_group() to create a named mirror of an anonymous struct union. + * (e.g. see struct sk_buff.) Read overflow checking is currently only + * done when a write overflow is also present, or when building with W=1. 
+ * + * Mitigation coverage matrix + * Bounds checking at: + * +-------+-------+-------+-------+ + * | Compile time | Run time | + * memcpy() argument sizes: | write | read | write | read | + * dest source length +-------+-------+-------+-------+ + * memcpy(known, known, constant) | y | y | n/a | n/a | + * memcpy(known, unknown, constant) | y | n | n/a | V | + * memcpy(known, known, dynamic) | n | n | B | B | + * memcpy(known, unknown, dynamic) | n | n | B | V | + * memcpy(unknown, known, constant) | n | y | V | n/a | + * memcpy(unknown, unknown, constant) | n | n | V | V | + * memcpy(unknown, known, dynamic) | n | n | V | B | + * memcpy(unknown, unknown, dynamic) | n | n | V | V | + * +-------+-------+-------+-------+ + * + * y = perform deterministic compile-time bounds checking + * n = cannot perform deterministic compile-time bounds checking + * n/a = no run-time bounds checking needed since compile-time deterministic + * B = can perform run-time bounds checking (currently unimplemented) + * V = vulnerable to run-time overflow (will need refactoring to solve) + * + */ +extern __always_inline __gnu_inline +bool fortify_memcpy_chk(__kernel_size_t size, + const size_t p_size, + const size_t q_size, + const size_t p_size_field, + const size_t q_size_field, + const char *func) +{ + if (__builtin_constant_p(size)) { + /* + * Length argument is a constant expression, so we + * can perform compile-time bounds checking where + * buffer sizes are also known at compile time. + */ + + /* Error when size is larger than enclosing struct. */ + if (__compiletime_lessthan(p_size_field, p_size) && + __compiletime_lessthan(p_size, size)) + __write_overflow(); + if (__compiletime_lessthan(q_size_field, q_size) && + __compiletime_lessthan(q_size, size)) + __read_overflow2(); + + /* Warn when write size argument larger than dest field. */ + if (__compiletime_lessthan(p_size_field, size)) + __write_overflow_field(p_size_field, size); + /* + * Warn for source field over-read when building with W=1 + * or when an over-write happened, so both can be fixed at + * the same time. + */ + if ((IS_ENABLED(KBUILD_EXTRA_WARN1) || + __compiletime_lessthan(p_size_field, size)) && + __compiletime_lessthan(q_size_field, size)) + __read_overflow2_field(q_size_field, size); + } + /* + * At this point, length argument may not be a constant expression, + * so run-time bounds checking can be done where buffer sizes are + * known. (This is not an "else" because the above checks may only + * be compile-time warnings, and we want to still warn for run-time + * overflows.) + */ + + /* + * Always stop accesses beyond the struct that contains the + * field, when the buffer's remaining size is known. + * (The SIZE_MAX test is to optimize away checks where the buffer + * lengths are unknown.) + */ + if ((p_size != SIZE_MAX && p_size < size) || + (q_size != SIZE_MAX && q_size < size)) + fortify_panic(func); + + /* + * Warn when writing beyond destination field size. + * + * We must ignore p_size_field == 0 for existing 0-element + * fake flexible arrays, until they are all converted to + * proper flexible arrays. + * + * The implementation of __builtin_*object_size() behaves + * like sizeof() when not directly referencing a flexible + * array member, which means there will be many bounds checks + * that will appear at run-time, without a way for them to be + * detected at compile-time (as can be done when the destination + * is specifically the flexible array member). 
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832 + */ + if (p_size_field != 0 && p_size_field != SIZE_MAX && + p_size != p_size_field && p_size_field < size) + return true; + + return false; +} + +#define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ + p_size_field, q_size_field, op) ({ \ + const size_t __fortify_size = (size_t)(size); \ + const size_t __p_size = (p_size); \ + const size_t __q_size = (q_size); \ + const size_t __p_size_field = (p_size_field); \ + const size_t __q_size_field = (q_size_field); \ + WARN_ONCE(fortify_memcpy_chk(__fortify_size, __p_size, \ + __q_size, __p_size_field, \ + __q_size_field, #op), \ + #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \ + __fortify_size, \ + "field \"" #p "\" at " __FILE__ ":" __stringify(__LINE__), \ + __p_size_field); \ + __underlying_##op(p, q, __fortify_size); \ +}) + +/* + * Notes about compile-time buffer size detection: + * + * With these types... + * + * struct middle { + * u16 a; + * u8 middle_buf[16]; + * int b; + * }; + * struct end { + * u16 a; + * u8 end_buf[16]; + * }; + * struct flex { + * int a; + * u8 flex_buf[]; + * }; + * + * void func(TYPE *ptr) { ... } + * + * Cases where destination size cannot be currently detected: + * - the size of ptr's object (seemingly by design, gcc & clang fail): + * __builtin_object_size(ptr, 1) == SIZE_MAX + * - the size of flexible arrays in ptr's obj (by design, dynamic size): + * __builtin_object_size(ptr->flex_buf, 1) == SIZE_MAX + * - the size of ANY array at the end of ptr's obj (gcc and clang bug): + * __builtin_object_size(ptr->end_buf, 1) == SIZE_MAX + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836 + * + * Cases where destination size is currently detected: + * - the size of non-array members within ptr's object: + * __builtin_object_size(ptr->a, 1) == 2 + * - the size of non-flexible-array in the middle of ptr's obj: + * __builtin_object_size(ptr->middle_buf, 1) == 16 + * + */ + +/* + * __struct_size() vs __member_size() must be captured here to avoid + * evaluating argument side-effects further into the macro layers. + */ +#define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ + __struct_size(p), __struct_size(q), \ + __member_size(p), __member_size(q), \ + memcpy) + +#endif /* HAVE_LINUX_FORTIFY_STRING_HEADER */ +#endif /* unsafe_memcpy */ + +/* a catch all to ensure an unsafe_memcpy() exists */ +#ifndef unsafe_memcpy +#define unsafe_memcpy(dst, src, bytes, justification) \ + memcpy(dst, src, bytes) +#endif + +#endif /* _LIBCFS_FORTIFY_STRING_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h new file mode 100644 index 0000000000000..6ef6b0716aa6d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/linux/linux-fs.h + * + * Basic library routines. + */ + +#ifndef __LIBCFS_LINUX_CFS_FS_H__ +#define __LIBCFS_LINUX_CFS_FS_H__ + +#include +#include +#include +#include +#include + +#ifndef HAVE_FILE_DENTRY +static inline struct dentry *file_dentry(const struct file *file) +{ + return file->f_path.dentry; +} +#endif + +#ifndef S_DT_SHIFT +#define S_DT_SHIFT 12 +#endif + +#ifndef S_DT +#define S_DT(type) (((type) & S_IFMT) >> S_DT_SHIFT) +#endif +#ifndef DTTOIF +#define DTTOIF(dirtype) ((dirtype) << S_DT_SHIFT) +#endif + +#ifdef HAVE_PROC_OPS +#define PROC_OWNER(_fn) +#else +#define proc_ops file_operations +#define PROC_OWNER(_owner) .owner = (_owner), +#define proc_open open +#define proc_read read +#define proc_write write +#define proc_lseek llseek +#define proc_release release +#define proc_poll poll +#define proc_ioctl unlocked_ioctl +#define proc_compat_ioctl compat_ioctl +#define proc_mmap mmap +#define proc_get_unmapped_area get_unmapped_area +#endif + +static inline void mapping_clear_exiting(struct address_space *mapping) +{ +#ifdef HAVE_MAPPING_AS_EXITING_FLAG + clear_bit(AS_EXITING, &mapping->flags); +#endif +} + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h new file mode 100644 index 0000000000000..3c615bd0df703 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h @@ -0,0 +1,345 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_LINUX_HASH_H__ +#define __LIBCFS_LINUX_HASH_H__ + +#include +#include + +u64 cfs_hashlen_string(const void *salt, const char *name); + +#ifndef hashlen_hash +#define hashlen_hash(hashlen) ((u32)(hashlen)) +#endif + +#ifndef HAVE_STRINGHASH +#ifndef hashlen_create +#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash)) +#endif +#endif /* !HAVE_STRINGHASH */ + +#ifdef HAVE_BROKEN_HASH_64 + +#define GOLDEN_RATIO_32 0x61C88647 +#define GOLDEN_RATIO_64 0x61C8864680B583EBull + +static inline u32 cfs_hash_32(u32 val, unsigned int bits) +{ + /* High bits are more random, so use them. 
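+	 * For instance, cfs_hash_32(key, 8) returns a bucket index in
+	 * [0, 255], taken from the top 8 bits of the 32-bit product
+	 * key * GOLDEN_RATIO_32.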
*/ + return (val * GOLDEN_RATIO_32) >> (32 - bits); +} + +static __always_inline u32 cfs_hash_64(u64 val, unsigned int bits) +{ +#if BITS_PER_LONG == 64 + /* 64x64-bit multiply is efficient on all 64-bit processors */ + return val * GOLDEN_RATIO_64 >> (64 - bits); +#else + /* Hash 64 bits using only 32x32-bit multiply. */ + return cfs_hash_32(((u32)val ^ ((val >> 32) * GOLDEN_RATIO_32)), bits); +#endif +} +#else + +#define cfs_hash_32 hash_32 +#define cfs_hash_64 hash_64 + +#endif /* HAVE_BROKEN_HASH_64 */ + +#ifndef HAVE_RHASHTABLE_WALK_ENTER +static int rhashtable_walk_enter(struct rhashtable *ht, + struct rhashtable_iter *iter) +{ +#ifdef HAVE_3ARG_RHASHTABLE_WALK_INIT + return rhashtable_walk_init(ht, iter, GFP_KERNEL); +#else + return rhashtable_walk_init(ht, iter); +#endif +} +#endif + +#ifndef HAVE_RHLTABLE +struct rhlist_head { + struct rhash_head rhead; + struct rhlist_head __rcu *next; +}; + +struct rhltable { + struct rhashtable ht; +}; + +#define rhl_for_each_entry_rcu(tpos, pos, list, member) \ + for (pos = list; pos && rht_entry(tpos, pos, member); \ + pos = rcu_dereference_raw(pos->next)) + +static inline int rhltable_init(struct rhltable *hlt, + const struct rhashtable_params *params) +{ + return rhashtable_init(&hlt->ht, params); +} + +static inline struct rhlist_head *rhltable_lookup( + struct rhltable *hlt, const void *key, + const struct rhashtable_params params) +{ + struct rhashtable *ht = &hlt->ht; + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct bucket_table *tbl; + struct rhash_head *he; + unsigned int hash; + + tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, key, params); + rht_for_each_rcu(he, tbl, hash) { + if (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) + continue; + return he ? container_of(he, struct rhlist_head, rhead) : NULL; + } + + /* Ensure we see any new tables. 
*/ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + + return NULL; +} + +static inline int rhltable_insert_key( + struct rhltable *hlt, const void *key, struct rhlist_head *list, + const struct rhashtable_params params) +{ +#ifdef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT + return __rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params); +#else + return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params)); +#endif +} + +static inline int rhltable_remove( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return rhashtable_remove_fast(&hlt->ht, &list->rhead, params); +} + +static inline void rhltable_free_and_destroy(struct rhltable *hlt, + void (*free_fn)(void *ptr, + void *arg), + void *arg) +{ + rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); +} + +static inline void rhltable_destroy(struct rhltable *hlt) +{ + rhltable_free_and_destroy(hlt, NULL, NULL); +} + +static inline void rhltable_walk_enter(struct rhltable *hlt, + struct rhashtable_iter *iter) +{ + rhashtable_walk_enter(&hlt->ht, iter); +} +#endif /* !HAVE_RHLTABLE */ + +#ifndef HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST +/** + * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Just like rhashtable_lookup_insert_fast(), but this function returns the + * object if it exists, NULL if it did not and the insertion was successful, + * and an ERR_PTR otherwise. + */ +static inline void *rhashtable_lookup_get_insert_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + const char *key; + void *ret; + int rc; + + rc = rhashtable_lookup_insert_fast(ht, obj, params); + switch (rc) { + case -EEXIST: + key = rht_obj(ht, obj); + ret = rhashtable_lookup_fast(ht, key, params); + break; + case 0: + ret = NULL; + break; + default: + ret = ERR_PTR(rc); + break; + } + return ret; +} +#endif /* !HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST */ + +#ifndef HAVE_RHASHTABLE_LOOKUP +/* + * The function rhashtable_lookup() and rhashtable_lookup_fast() + * are almost the same except rhashtable_lookup() doesn't + * take the RCU read lock. Since this is the case and only + * SLES12 SP3 lacks rhashtable_lookup() just duplicate the + * SLES12 SP3 rhashtable_lookup_fast() minus the RCU read lock. + */ +static inline void *rhashtable_lookup( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + const struct bucket_table *tbl; + struct rhash_head *he; + unsigned int hash; + + tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, key, params); + rht_for_each_rcu(he, tbl, hash) { + if (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) + continue; + return rht_obj(ht, he); + } + + /* Ensure we see any new tables. 
*/ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + + return NULL; +} +#endif /* !HAVE_RHASHTABLE_LOOKUP */ + +#ifndef HAVE_RHT_BUCKET_VAR +static inline struct rhash_head __rcu **rht_bucket_var( + struct bucket_table *tbl, unsigned int hash) +{ + return &tbl->buckets[hash]; +} +#endif + +#ifndef HAVE_RHASHTABLE_REPLACE +/* Internal function, please use rhashtable_replace_fast() instead */ +static inline int __rhashtable_replace_fast( + struct rhashtable *ht, struct bucket_table *tbl, + struct rhash_head *obj_old, struct rhash_head *obj_new, + const struct rhashtable_params params) +{ + struct rhash_head __rcu **pprev; + struct rhash_head *he; + spinlock_t *lock; + unsigned int hash; + int err = -ENOENT; + + /* Minimally, the old and new objects must have same hash + * (which should mean identifiers are the same). + */ + hash = rht_head_hashfn(ht, tbl, obj_old, params); + if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) + return -EINVAL; + + lock = rht_bucket_lock(tbl, hash); + + spin_lock_bh(lock); + + pprev = rht_bucket_var(tbl, hash); + rht_for_each_continue(he, *pprev, tbl, hash) { + if (he != obj_old) { + pprev = &he->next; + continue; + } + + rcu_assign_pointer(obj_new->next, obj_old->next); + rcu_assign_pointer(*pprev, obj_new); + err = 0; + break; + } + + spin_unlock_bh(lock); + + return err; +} + +/** + * rhashtable_replace_fast - replace an object in hash table + * @ht: hash table + * @obj_old: pointer to hash head inside object being replaced + * @obj_new: pointer to hash head inside object which is new + * @params: hash table parameters + * + * Replacing an object doesn't affect the number of elements in the hash table + * or bucket, so we don't need to worry about shrinking or expanding the + * table here. + * + * Returns zero on success, -ENOENT if the entry could not be found, + * -EINVAL if hash is not the same for the old and new objects. + */ +static inline int rhashtable_replace_fast( + struct rhashtable *ht, struct rhash_head *obj_old, + struct rhash_head *obj_new, + const struct rhashtable_params params) +{ + struct bucket_table *tbl; + int err; + + rcu_read_lock(); + + tbl = rht_dereference_rcu(ht->tbl, ht); + + /* Because we have already taken (and released) the bucket + * lock in old_tbl, if we find that future_tbl is not yet + * visible then that guarantees the entry to still be in + * the old tbl if it exists. + */ + while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, + obj_new, params)) && + (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) + ; + + rcu_read_unlock(); + + return err; +} +#endif /* HAVE_RHASHTABLE_REPLACE */ + +#endif /* __LIBCFS_LINUX_HASH_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-list.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-list.h new file mode 100644 index 0000000000000..c457bee35e160 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-list.h @@ -0,0 +1,32 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_LINUX_LIST_H__ +#define __LIBCFS_LINUX_LIST_H__ + +#include + +#ifdef HAVE_HLIST_ADD_AFTER +#define hlist_add_behind(hnode, tail) hlist_add_after(tail, hnode) +#endif /* HAVE_HLIST_ADD_AFTER */ + +#endif /* __LIBCFS_LINUX_LIST_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h new file mode 100644 index 0000000000000..548eb96a2db33 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h @@ -0,0 +1,143 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. 
+ */ + +#ifndef __LIBCFS_LINUX_CFS_MEM_H__ +#define __LIBCFS_LINUX_CFS_MEM_H__ + +#include +#include +#include +#include +#ifdef HAVE_MM_INLINE +# include +#endif +#include +#ifdef HAVE_SCHED_HEADERS +#include +#endif + +#ifdef HAVE_TOTALRAM_PAGES_AS_FUNC + #ifndef cfs_totalram_pages + #define cfs_totalram_pages() totalram_pages() + #endif +#else + #ifndef cfs_totalram_pages + #define cfs_totalram_pages() totalram_pages + #endif +#endif + +#ifndef HAVE_MEMALLOC_RECLAIM +static inline unsigned int memalloc_noreclaim_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC; + + current->flags |= PF_MEMALLOC; + return flags; +} + +static inline void memalloc_noreclaim_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC) | flags; +} +#endif /* !HAVE_MEMALLOC_RECLAIM */ + +#ifndef HAVE_BITMAP_ALLOC +static inline unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags) +{ + return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long), + flags); +} + +static inline unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags) +{ + return bitmap_alloc(nbits, flags | __GFP_ZERO); +} + +static inline void bitmap_free(const unsigned long *bitmap) +{ + kfree(bitmap); +} +#endif /* !HAVE_BITMAP_ALLOC */ + +/* + * Shrinker + */ +#ifndef SHRINK_STOP +# define SHRINK_STOP (~0UL) +#endif + +#ifndef HAVE_MMAP_LOCK +static inline void mmap_write_lock(struct mm_struct *mm) +{ + down_write(&mm->mmap_sem); +} + +static inline bool mmap_write_trylock(struct mm_struct *mm) +{ + return down_write_trylock(&mm->mmap_sem) != 0; +} + +static inline void mmap_write_unlock(struct mm_struct *mm) +{ + up_write(&mm->mmap_sem); +} + +static inline void mmap_read_lock(struct mm_struct *mm) +{ + down_read(&mm->mmap_sem); +} + +static inline bool mmap_read_trylock(struct mm_struct *mm) +{ + return down_read_trylock(&mm->mmap_sem) != 0; +} + +static inline void mmap_read_unlock(struct mm_struct *mm) +{ + up_read(&mm->mmap_sem); +} +#endif + +#ifdef HAVE_VMALLOC_2ARGS +#define __ll_vmalloc(size, flags) __vmalloc(size, flags) +#else +#define __ll_vmalloc(size, flags) __vmalloc(size, flags, PAGE_KERNEL) +#endif + +#ifndef HAVE_KFREE_SENSITIVE +#define kfree_sensitive(x) kzfree(x) +#endif + +#endif /* __LINUX_CFS_MEM_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h new file mode 100644 index 0000000000000..841db69e28742 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h @@ -0,0 +1,189 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LIBCFS_LINUX_MISC_H__ +#define __LIBCFS_LINUX_MISC_H__ + +#include +/* Since Commit 2f8b544477e6 ("block,fs: untangle fs.h and blk_types.h") + * fs.h doesn't include blk_types.h, but we need it. + */ +#include +#include +#include +#include +#include + +#ifndef HAVE_IOV_ITER_TYPE +#ifdef HAVE_IOV_ITER_HAS_TYPE_MEMBER +#define iter_is_iovec(iter) ((iter)->type & ITER_IOVEC) +#define iov_iter_is_kvec(iter) ((iter)->type & ITER_KVEC) +#define iov_iter_is_bvec(iter) ((iter)->type & ITER_BVEC) +#define iov_iter_is_pipe(iter) ((iter)->type & ITER_PIPE) +#define iov_iter_is_discard(iter) ((iter)->type & ITER_DISCARD) +#else +#define iter_is_iovec(iter) 1 +#define iov_iter_is_kvec(iter) 0 +#define iov_iter_is_bvec(iter) 0 +#define iov_iter_is_pipe(iter) 0 +#define iov_iter_is_discard(iter) 0 +#endif +#endif /* HAVE_IOV_ITER_TYPE */ + +int cfs_kernel_write(struct file *filp, const void *buf, size_t count, + loff_t *pos); +ssize_t cfs_kernel_read(struct file *file, void *buf, size_t count, + loff_t *pos); + +/* + * For RHEL6 struct kernel_parm_ops doesn't exist. Also + * the arguments for .set and .get take different + * parameters which is handled below + */ +#ifdef HAVE_KERNEL_PARAM_OPS +#define cfs_kernel_param_arg_t const struct kernel_param +#else +#define cfs_kernel_param_arg_t struct kernel_param_ops +#define kernel_param_ops kernel_param +#endif /* ! HAVE_KERNEL_PARAM_OPS */ + +#ifndef HAVE_KERNEL_PARAM_LOCK +static inline void kernel_param_unlock(struct module *mod) +{ + __kernel_param_unlock(); +} + +static inline void kernel_param_lock(struct module *mod) +{ + __kernel_param_lock(); +} +#endif /* ! HAVE_KERNEL_PARAM_LOCK */ + +int cfs_apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs); + +#ifndef HAVE_KSTRTOBOOL_FROM_USER + +#define kstrtobool strtobool + +int kstrtobool_from_user(const char __user *s, size_t count, bool *res); +#endif /* HAVE_KSTRTOBOOL_FROM_USER */ + +#ifndef HAVE_MATCH_WILDCARD +bool match_wildcard(const char *pattern, const char *str); +#endif /* !HAVE_MATCH_WILDCARD */ + +#ifndef HAVE_KREF_READ +static inline int kref_read(const struct kref *kref) +{ + return atomic_read(&kref->refcount); +} +#endif /* HAVE_KREF_READ */ + +#ifdef HAVE_FORCE_SIG_WITH_TASK +#define cfs_force_sig(sig, task) force_sig((sig), (task)) +#else +#define cfs_force_sig(sig, task) \ +do { \ + unsigned long flags; \ + \ + spin_lock_irqsave(&task->sighand->siglock, flags); \ + task->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; \ + send_sig(sig, task, 1); \ + spin_unlock_irqrestore(&task->sighand->siglock, flags); \ +} while (0) +#endif + +void cfs_arch_init(void); + +#ifndef container_of_safe +/** + * container_of_safe - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + * If IS_ERR_OR_NULL(ptr), ptr is returned unchanged. + * + * Note: Copied from Linux 5.6, with BUILD_BUG_ON_MSG section removed. + */ +#define container_of_safe(ptr, type, member) ({ \ + void *__mptr = (void *)(ptr); \ + IS_ERR_OR_NULL(__mptr) ? 
ERR_CAST(__mptr) : \ + ((type *)(__mptr - offsetof(type, member))); }) +#endif + +/* + * Linux v4.15-rc2-5-g4229a470175b added sizeof_field() + * Linux v5.5-rc4-1-g1f07dcc459d5 removed FIELD_SIZEOF() + * Proved a sizeof_field in terms of FIELD_SIZEOF() when one is not provided + */ +#ifndef sizeof_field +#define sizeof_field(type, member) FIELD_SIZEOF(type, member) +#endif + +#ifndef HAVE_TASK_IS_RUNNING +#define task_is_running(task) (task->state == TASK_RUNNING) +#endif + +#ifndef memset_startat +/** from linux 5.19 include/linux/string.h: */ +#define memset_startat(obj, v, member) \ +({ \ + u8 *__ptr = (u8 *)(obj); \ + typeof(v) __val = (v); \ + memset(__ptr + offsetof(typeof(*(obj)), member), __val, \ + sizeof(*(obj)) - offsetof(typeof(*(obj)), member)); \ +}) +#endif /* memset_startat() */ + +#ifdef HAVE_KALLSYMS_LOOKUP_NAME +static inline void *cfs_kallsyms_lookup_name(const char *name) +{ + return (void *)kallsyms_lookup_name(name); +} +#else +static inline void *cfs_kallsyms_lookup_name(const char *name) +{ + return NULL; +} +#endif + +#ifndef HAVE_KOBJ_TYPE_DEFAULT_GROUPS +#define default_groups default_attrs +#define KOBJ_ATTR_GROUPS(_name) _name##_attrs +#define KOBJ_ATTRIBUTE_GROUPS(_name) +#else +#define KOBJ_ATTR_GROUPS(_name) _name##_groups +#define KOBJ_ATTRIBUTE_GROUPS(_name) ATTRIBUTE_GROUPS(_name) +#endif + +#endif /* __LIBCFS_LINUX_MISC_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h new file mode 100644 index 0000000000000..17b1b30be45b6 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h @@ -0,0 +1,162 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_LINUX_NET_H__ +#define __LIBCFS_LINUX_NET_H__ + +#include +#include + +#ifndef HAVE_NLA_STRDUP +char *nla_strdup(const struct nlattr *nla, gfp_t flags); +#endif /* !HAVE_NLA_STRDUP */ + +#ifdef HAVE_NLA_STRLCPY +#define nla_strscpy nla_strlcpy +#endif /* HAVE_NLA_STRLCPY */ + +#ifndef HAVE_NL_PARSE_WITH_EXT_ACK + +#define NL_SET_BAD_ATTR(extack, attr) + +/* this can be increased when necessary - don't expose to userland */ +#define NETLINK_MAX_COOKIE_LEN 20 + +/** + * struct netlink_ext_ack - netlink extended ACK report struct + * @_msg: message string to report - don't access directly, use + * %NL_SET_ERR_MSG + * @bad_attr: attribute with error + * @cookie: cookie data to return to userspace (for success) + * @cookie_len: actual cookie data length + */ +struct netlink_ext_ack { + const char *_msg; + const struct nlattr *bad_attr; + u8 cookie[NETLINK_MAX_COOKIE_LEN]; + u8 cookie_len; +}; + +#define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG(NULL, msg) + +static inline int cfs_nla_parse(struct nlattr **tb, int maxtype, + const struct nlattr *head, int len, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return nla_parse(tb, maxtype, head, len, policy); +} + +static inline int cfs_nla_parse_nested(struct nlattr *tb[], int maxtype, + const struct nlattr *nla, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return nla_parse_nested(tb, maxtype, nla, policy); +} + +#else /* !HAVE_NL_PARSE_WITH_EXT_ACK */ + +#define cfs_nla_parse_nested nla_parse_nested +#define cfs_nla_parse nla_parse + +#endif + +#ifndef HAVE_GENL_DUMPIT_INFO +struct cfs_genl_dumpit_info { + const struct genl_family *family; + const struct genl_ops *ops; + struct nlattr **attrs; +}; + +static inline const struct cfs_genl_dumpit_info * +lnet_genl_dumpit_info(struct netlink_callback *cb) +{ + return (const struct cfs_genl_dumpit_info *)cb->args[1]; +} +#else +#define cfs_genl_dumpit_info genl_dumpit_info + +static inline const struct cfs_genl_dumpit_info * +lnet_genl_dumpit_info(struct netlink_callback *cb) +{ + return (const struct cfs_genl_dumpit_info *)genl_dumpit_info(cb); +} +#endif /* HAVE_GENL_DUMPIT_INFO */ + +#ifdef HAVE_KERNEL_SETSOCKOPT + +#include + +#if !defined(HAVE_TCP_SOCK_SET_QUICKACK) +static inline void tcp_sock_set_quickack(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (char *)&opt, sizeof(opt)); +} +#endif /* HAVE_TCP_SOCK_SET_QUICKACK */ + +#if !defined(HAVE_TCP_SOCK_SET_NODELAY) +static inline void tcp_sock_set_nodelay(struct sock *sk) +{ + int opt = 1; + struct socket *sock = sk->sk_socket; + + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)&opt, sizeof(opt)); +} +#endif /* HAVE_TCP_SOCK_SET_NODELAY */ + +#if !defined(HAVE_TCP_SOCK_SET_KEEPIDLE) +static inline int tcp_sock_set_keepidle(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, + (char *)&opt, sizeof(opt)); +} +#endif /* HAVE_TCP_SOCK_SET_KEEPIDLE */ + +#if !defined(HAVE_TCP_SOCK_SET_KEEPINTVL) +static inline int tcp_sock_set_keepintvl(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, + (char *)&opt, sizeof(opt)); +} +#endif /* 
HAVE_TCP_SOCK_SET_KEEPINTVL */ + +#if !defined(HAVE_TCP_SOCK_SET_KEEPCNT) +static inline int tcp_sock_set_keepcnt(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, + (char *)&opt, sizeof(opt)); +} +#endif /* HAVE_TCP_SOCK_SET_KEEPCNT */ +#endif /* HAVE_KERNEL_SETSOCKOPT */ + +#endif /* __LIBCFS_LINUX_NET_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h new file mode 100644 index 0000000000000..4a61fe1143858 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h @@ -0,0 +1,250 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/linux/linux-time.h + * + * Implementation of portable time API for Linux (kernel and user-level). 
+ * + * Author: Nikita Danilov + */ + +#ifndef __LIBCFS_LINUX_LINUX_TIME_H__ +#define __LIBCFS_LINUX_LINUX_TIME_H__ + +/* Portable time API */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Generic kernel stuff + */ +#ifndef HAVE_TIMESPEC64 + +typedef __s64 time64_t; + +#if __BITS_PER_LONG == 64 + +# define timespec64 timespec + +static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) +{ + return ts; +} + +static inline struct timespec timespec64_to_timespec(const struct timespec64 ts) +{ + return ts; +} + +#else +struct timespec64 { + time64_t tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ +}; + +static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) +{ + struct timespec64 ret; + + ret.tv_sec = ts.tv_sec; + ret.tv_nsec = ts.tv_nsec; + return ret; +} + +static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) +{ + struct timespec ret; + + ret.tv_sec = (time_t)ts64.tv_sec; + ret.tv_nsec = ts64.tv_nsec; + return ret; +} +#endif /* __BITS_PER_LONG != 64 */ + +#endif /* HAVE_TIMESPEC64 */ + +#ifndef HAVE_NS_TO_TIMESPEC64 +static inline struct timespec64 ns_to_timespec64(const s64 nsec) +{ + struct timespec64 ts; + s32 rem; + + if (!nsec) + return (struct timespec64) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +#endif + +#ifndef HAVE_KTIME_ADD +# define ktime_add(lhs, rhs) ({ (ktime_t){ .tv64 = (lhs).tv64 + (rhs).tv64 }; }) +#endif /* !HAVE_KTIME_ADD */ + +#ifndef HAVE_KTIME_AFTER +static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 > cmp2.tv64; +} +#endif /* !HAVE_KTIME_AFTER */ + +#ifndef HAVE_KTIME_BEFORE +static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 < cmp2.tv64; +} +#endif /* !HAVE_KTIME_BEFORE */ + +#ifndef HAVE_KTIME_COMPARE +static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2) +{ + if (cmp1.tv64 < cmp2.tv64) + return -1; + if (cmp1.tv64 > cmp2.tv64) + return 1; + return 0; +} +#endif /* !HAVE_KTIME_COMPARE */ + +#ifndef HAVE_KTIME_GET_TS64 +void ktime_get_ts64(struct timespec64 *ts); +#endif /* HAVE_KTIME_GET_TS */ + +#ifndef HAVE_KTIME_GET_REAL_TS64 +void ktime_get_real_ts64(struct timespec64 *ts); +#endif /* HAVE_KTIME_GET_REAL_TS */ + +#ifndef HAVE_KTIME_GET_REAL_SECONDS +time64_t ktime_get_real_seconds(void); +#endif /* HAVE_KTIME_GET_REAL_SECONDS */ + +#ifndef HAVE_KTIME_GET_SECONDS +time64_t ktime_get_seconds(void); +#endif /* HAVE_KTIME_GET_SECONDS */ + +#ifdef NEED_KTIME_GET_NS +static inline u64 ktime_get_ns(void) +{ + return ktime_to_ns(ktime_get()); +} +#endif /* NEED_KTIME_GET_NS */ + +#ifdef NEED_KTIME_GET_REAL_NS +static inline u64 ktime_get_real_ns(void) +{ + return ktime_to_ns(ktime_get_real()); +} +#endif /* NEED_KTIME_GET_REAL_NS */ + +#ifndef HAVE_KTIME_MS_DELTA +static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier) +{ + return ktime_to_ms(ktime_sub(later, earlier)); +} +#endif /* HAVE_KTIME_MS_DELTA */ + +#ifndef HAVE_KTIME_TO_TIMESPEC64 +static inline struct timespec64 ktime_to_timespec64(ktime_t kt) +{ + struct timespec ts = ns_to_timespec((kt).tv64); + + return timespec_to_timespec64(ts); +} +#endif /* HAVE_KTIME_TO_TIMESPEC64 */ + +#ifndef HAVE_TIMESPEC64_SUB +static inline struct timespec64 +timespec64_sub(struct timespec64 later, struct timespec64 earlier) +{ + struct 
timespec diff; + + diff = timespec_sub(timespec64_to_timespec(later), + timespec64_to_timespec(earlier)); + return timespec_to_timespec64(diff); +} +#endif + +#ifndef HAVE_TIMESPEC64_TO_KTIME +static inline ktime_t timespec64_to_ktime(struct timespec64 ts) +{ + return ktime_set(ts.tv_sec, ts.tv_nsec); +} +#endif + +static inline unsigned long cfs_time_seconds(time64_t seconds) +{ + return nsecs_to_jiffies64(seconds * NSEC_PER_SEC); +} + +#ifdef HAVE_NEW_DEFINE_TIMER +# ifndef TIMER_DATA_TYPE +# define TIMER_DATA_TYPE struct timer_list * +# endif + +#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ + DEFINE_TIMER((_name), (_function)) +#else +# ifndef TIMER_DATA_TYPE +# define TIMER_DATA_TYPE unsigned long +# endif + +#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ + DEFINE_TIMER((_name), (_function), (_expires), (_data)) +#endif + +#ifdef HAVE_TIMER_SETUP +#define cfs_timer_cb_arg_t struct timer_list * +#define cfs_from_timer(var, callback_timer, timer_fieldname) \ + from_timer(var, callback_timer, timer_fieldname) +#define cfs_timer_setup(timer, callback, data, flags) \ + timer_setup((timer), (callback), (flags)) +#define cfs_timer_cb_arg(var, timer_fieldname) (&(var)->timer_fieldname) +#else +#define cfs_timer_cb_arg_t unsigned long +#define cfs_from_timer(var, data, timer_fieldname) (typeof(var))(data) +#define cfs_timer_setup(timer, callback, data, flags) \ + setup_timer((timer), (callback), (data)) +#define cfs_timer_cb_arg(var, timer_fieldname) (cfs_timer_cb_arg_t)(var) +#endif + +#endif /* __LIBCFS_LINUX_LINUX_TIME_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-uuid.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-uuid.h new file mode 100644 index 0000000000000..df877c0f62813 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-uuid.h @@ -0,0 +1,63 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LIBCFS_LINUX_UUID_H__ +#define __LIBCFS_LINUX_UUID_H__ + +#include + +#define UUID_SIZE 16 + +/* + * The original linux UUID code had uuid_be and uuid_le. + * Later uuid_le was changed to guid_t and uuid_be + * to uuid_t. 
See for details kernel commit: + * + * f9727a17db9bab71ddae91f74f11a8a2f9a0ece6 + */ +#ifndef HAVE_UUID_T +typedef struct { + __u8 b[UUID_SIZE]; +} uuid_t; + +static inline void uuid_copy(uuid_t *dst, uuid_t *src) +{ + memcpy(dst, src, sizeof(uuid_t)); +} + +static inline bool uuid_equal(const uuid_t *u1, const uuid_t *u2) +{ + return memcmp(u1, u2, sizeof(uuid_t)) == 0; +} + +#endif + +#endif /* __LIBCFS_LINUX_UUID_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h new file mode 100644 index 0000000000000..aa257fcdf0c8b --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h @@ -0,0 +1,593 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LIBCFS_LINUX_WAIT_BIT_H +#define __LIBCFS_LINUX_WAIT_BIT_H + +/* Make sure we can see if we have TASK_NOLOAD */ +#include +/* + * Linux wait-bit related types and methods: + */ +#ifdef HAVE_WAIT_BIT_HEADER_H +#include +#endif +#include + +#ifndef HAVE_WAIT_QUEUE_ENTRY +#define wait_queue_entry_t wait_queue_t +#endif + +#ifndef HAVE_WAIT_BIT_HEADER_H +struct wait_bit_queue_entry { + struct wait_bit_key key; + wait_queue_entry_t wq_entry; +}; + +#define ___wait_is_interruptible(state) \ + (!__builtin_constant_p(state) || \ + state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ + +#endif /* ! HAVE_WAIT_BIT_HEADER_H */ + +#ifndef HAVE_PREPARE_TO_WAIT_EVENT +extern long prepare_to_wait_event(wait_queue_head_t *wq_head, + wait_queue_entry_t *wq_entry, int state); +#endif + +/* ___wait_cond_timeout changed number of args in v3.12-rc1-78-g35a2af94c7ce + * so let's define our own ___wait_cond_timeout1 + */ + +#define ___wait_cond_timeout1(condition) \ +({ \ + bool __cond = (condition); \ + if (__cond && !__ret) \ + __ret = 1; \ + __cond || !__ret; \ +}) + +#ifndef HAVE_CLEAR_AND_WAKE_UP_BIT +/** + * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit + * + * @bit: the bit of the word being waited on + * @word: the word being waited on, a kernel virtual address + * + * You can use this helper if bitflags are manipulated atomically rather than + * non-atomically under a lock. + */ +static inline void clear_and_wake_up_bit(int bit, void *word) +{ + clear_bit_unlock(bit, word); + /* See wake_up_bit() for which memory barrier you need to use. */ + smp_mb__after_atomic(); + wake_up_bit(word, bit); +} +#endif /* ! HAVE_CLEAR_AND_WAKE_UP_BIT */ + +#ifndef HAVE_WAIT_VAR_EVENT +extern void __init wait_bit_init(void); +extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, + void *var, int flags); +extern void wake_up_var(void *var); +extern wait_queue_head_t *__var_waitqueue(void *p); + +#define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ +({ \ + __label__ __out; \ + wait_queue_head_t *__wq_head = __var_waitqueue(var); \ + struct wait_bit_queue_entry __wbq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait_var_entry(&__wbq_entry, var, \ + exclusive ? 
WQ_FLAG_EXCLUSIVE : 0); \ + for (;;) { \ + long __int = prepare_to_wait_event(__wq_head, \ + &__wbq_entry.wq_entry, \ + state); \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + goto __out; \ + } \ + \ + cmd; \ + } \ + finish_wait(__wq_head, &__wbq_entry.wq_entry); \ +__out: __ret; \ +}) + +#define __wait_var_event(var, condition) \ + ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + schedule()) + +#define wait_var_event(var, condition) \ +do { \ + might_sleep(); \ + if (condition) \ + break; \ + __wait_var_event(var, condition); \ +} while (0) + +#define __wait_var_event_killable(var, condition) \ + ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ + schedule()) + +#define wait_var_event_killable(var, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_var_event_killable(var, condition); \ + __ret; \ +}) + +#define __wait_var_event_timeout(var, condition, timeout) \ + ___wait_var_event(var, ___wait_cond_timeout1(condition), \ + TASK_UNINTERRUPTIBLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_var_event_timeout(var, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_var_event_timeout(var, condition, timeout); \ + __ret; \ +}) +#else /* !HAVE_WAIT_VAR_EVENT */ +/* linux-3.10.0-1062.el7 defines wait_var_event_timeout() using + * __wait_cond_timeout(), but doesn't define __wait_cond_timeout !!! + */ +# ifndef __wait_cond_timeout +# define ___wait_cond_timeout(condition) \ +({ \ + bool __cond = (condition); \ + if (__cond && !__ret) \ + __ret = 1; \ + __cond || !__ret; \ +}) +# endif /* __wait_cond_timeout */ + +#endif /* ! HAVE_WAIT_VAR_EVENT */ + +/* + * prepare_to_wait_event() does not support an exclusive + * lifo wait. + * However it will not relink the wait_queue_entry if + * it is already linked. So we link to the head of the + * queue here, and it will stay there. + */ +static inline void prepare_to_wait_exclusive_head( + wait_queue_head_t *waitq, wait_queue_entry_t *link) +{ + unsigned long flags; + + spin_lock_irqsave(&(waitq->lock), flags); +#ifdef HAVE_WAIT_QUEUE_ENTRY_LIST + if (list_empty(&link->entry)) +#else + if (list_empty(&link->task_list)) +#endif + __add_wait_queue_exclusive(waitq, link); + spin_unlock_irqrestore(&((waitq)->lock), flags); +} + +#ifndef ___wait_event +/* + * The below macro ___wait_event() has an explicit shadow of the __ret + * variable when used from the wait_event_*() macros. + * + * This is so that both can use the ___wait_cond_timeout1() construct + * to wrap the condition. + * + * The type inconsistency of the wait_event_*() __ret variable is also + * on purpose; we use long where we can return timeout values and int + * otherwise. 
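+ *
+ * For example, on kernels that provide TASK_NOLOAD, the
+ * __wait_event_idle_timeout() macro further below expands to
+ *
+ *	___wait_event(wq_head, ___wait_cond_timeout1(condition), TASK_IDLE,
+ *		      0, timeout, __ret = schedule_timeout(__ret))
+ *
+ * so __ret starts out as the timeout and is refreshed with the remaining
+ * jiffies after every schedule_timeout() call.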
+ */ + +#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \ +({ \ + __label__ __out; \ + wait_queue_entry_t __wq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait(&__wq_entry); \ + if (exclusive) \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + long __int = prepare_to_wait_event(&wq_head, \ + &__wq_entry, state); \ + \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + goto __out; \ + } \ + \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ +__out: __ret; \ +}) +#endif + +#ifndef TASK_NOLOAD + +#define TASK_IDLE TASK_INTERRUPTIBLE + +#define ___wait_event_idle(wq_head, condition, exclusive, ret, cmd) \ +({ \ + wait_queue_entry_t __wq_entry; \ + unsigned long flags; \ + long __ret = ret; /* explicit shadow */ \ + sigset_t __old_blocked, __new_blocked; \ + \ + siginitset(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(0, &__new_blocked, &__old_blocked); \ + init_wait(&__wq_entry); \ + if (exclusive) \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + prepare_to_wait_event(&wq_head, \ + &__wq_entry, \ + TASK_INTERRUPTIBLE); \ + \ + if (condition) \ + break; \ + /* We have to do this here because some signals */ \ + /* are not blockable - ie from strace(1). */ \ + /* In these cases we want to schedule_timeout() */ \ + /* again, because we don't want that to return */ \ + /* -EINTR when the RPC actually succeeded. */ \ + /* the recalc_sigpending() below will deliver the */ \ + /* signal properly. */ \ + if (signal_pending(current)) { \ + spin_lock_irqsave(¤t->sighand->siglock, \ + flags); \ + clear_tsk_thread_flag(current, TIF_SIGPENDING); \ + spin_unlock_irqrestore(¤t->sighand->siglock,\ + flags); \ + } \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#define wait_event_idle(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event_idle(wq_head, condition, 0, 0, schedule());\ +} while (0) + +#define wait_event_idle_exclusive(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event_idle(wq_head, condition, 1, 0, schedule());\ +} while (0) + +#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout)\ + ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ + 1, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout( \ + wq_head, condition, timeout); \ + __ret; \ +}) + +#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition, \ + timeout, cmd1, cmd2) \ + ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ + 1, timeout, \ + cmd1; __ret = schedule_timeout(__ret); cmd2) + +#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\ + cmd1, cmd2) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout_cmd( \ + wq_head, condition, timeout, cmd1, cmd2); \ + __ret; \ +}) + +#define __wait_event_idle_timeout(wq_head, condition, timeout) \ + ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ + 0, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_event_idle_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + 
__ret = __wait_event_idle_timeout(wq_head, condition, \ + timeout); \ + __ret; \ +}) + +#else /* TASK_IDLE */ +#ifndef wait_event_idle +/** + * wait_event_idle - wait for a condition without contributing to system load + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + */ +#define wait_event_idle(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, \ + schedule()); \ +} while (0) +#endif +#ifndef wait_event_idle_exclusive +/** + * wait_event_idle_exclusive - wait for a condition without contributing to + * system load + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag + * set thus if other processes wait on the same list, when this + * process is woken further processes are not considered. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + */ +#define wait_event_idle_exclusive(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, \ + schedule()); \ +} while (0) +#endif +#ifndef wait_event_idle_exclusive_timeout +/** + * wait_event_idle_exclusive_timeout - sleep without load until a condition + * becomes true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq_head is woken up. + * + * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag + * set thus if other processes wait on the same list, when this + * process is woken further processes are not considered. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. 
+ */ +#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout(wq_head, \ + condition, \ + timeout); \ + __ret; \ +}) +#endif +#ifndef wait_event_idle_exclusive_timeout_cmd +#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition, \ + timeout, cmd1, cmd2) \ + ___wait_event(wq_head, ___wait_cond_timeout1(condition), \ + TASK_IDLE, 1, timeout, \ + cmd1; __ret = schedule_timeout(__ret); cmd2) + +#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\ + cmd1, cmd2) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout_cmd( \ + wq_head, condition, timeout, cmd1, cmd2); \ + __ret; \ +}) +#endif + +#ifndef wait_event_idle_timeout + +#define __wait_event_idle_timeout(wq_head, condition, timeout) \ + ___wait_event(wq_head, ___wait_cond_timeout1(condition), \ + TASK_IDLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +/** + * wait_event_idle_timeout - sleep without load until a condition becomes + * true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. 
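+ *
+ * A typical caller looks like the following sketch ('waitq' and 'done'
+ * are placeholders, not symbols from this header):
+ *
+ *	rc = wait_event_idle_timeout(waitq, done != 0, cfs_time_seconds(5));
+ *	if (rc == 0)
+ *		goto out_timeout;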
+ */ +#define wait_event_idle_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_timeout(wq_head, condition, \ + timeout); \ + __ret; \ +}) +#endif +#endif /* TASK_IDLE */ + +/* ___wait_event_lifo is used for lifo exclusive 'idle' waits */ +#ifdef TASK_NOLOAD + +#define ___wait_event_lifo(wq_head, condition, ret, cmd) \ +({ \ + wait_queue_entry_t __wq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait(&__wq_entry); \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + prepare_to_wait_exclusive_head(&wq_head, &__wq_entry); \ + prepare_to_wait_event(&wq_head, &__wq_entry, TASK_IDLE);\ + \ + if (condition) \ + break; \ + \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ + __ret; \ +}) +#else +#define ___wait_event_lifo(wq_head, condition, ret, cmd) \ +({ \ + wait_queue_entry_t __wq_entry; \ + unsigned long flags; \ + long __ret = ret; /* explicit shadow */ \ + sigset_t __old_blocked, __new_blocked; \ + \ + siginitset(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(0, &__new_blocked, &__old_blocked); \ + init_wait(&__wq_entry); \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + prepare_to_wait_exclusive_head(&wq_head, &__wq_entry); \ + prepare_to_wait_event(&wq_head, &__wq_entry, \ + TASK_INTERRUPTIBLE); \ + \ + if (condition) \ + break; \ + /* See justification in ___wait_event_idle */ \ + if (signal_pending(current)) { \ + spin_lock_irqsave(¤t->sighand->siglock, \ + flags); \ + clear_tsk_thread_flag(current, TIF_SIGPENDING); \ + spin_unlock_irqrestore(¤t->sighand->siglock,\ + flags); \ + } \ + cmd; \ + } \ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + finish_wait(&wq_head, &__wq_entry); \ + __ret; \ +}) +#endif + +#define wait_event_idle_exclusive_lifo(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event_lifo(wq_head, condition, 0, schedule()); \ +} while (0) + +#define __wait_event_idle_lifo_timeout(wq_head, condition, timeout) \ + ___wait_event_lifo(wq_head, ___wait_cond_timeout1(condition), \ + timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_event_idle_exclusive_lifo_timeout(wq_head, condition, timeout)\ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_lifo_timeout(wq_head, \ + condition, \ + timeout); \ + __ret; \ +}) + +/* l_wait_event_abortable() is a bit like wait_event_killable() + * except there is a fixed set of signals which will abort: + * LUSTRE_FATAL_SIGS + */ +#define LUSTRE_FATAL_SIGS \ + (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGTERM) | \ + sigmask(SIGQUIT) | sigmask(SIGALRM)) + +#define l_wait_event_abortable(wq, condition) \ +({ \ + sigset_t __new_blocked, __old_blocked; \ + int __ret = 0; \ + siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ + __ret = wait_event_interruptible(wq, condition); \ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#define l_wait_event_abortable_timeout(wq, condition, timeout) \ +({ \ + sigset_t __new_blocked, __old_blocked; \ + int __ret = 0; \ + siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ + __ret = wait_event_interruptible_timeout(wq, condition, timeout);\ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#define l_wait_event_abortable_exclusive(wq, condition) \ +({ \ + sigset_t 
__new_blocked, __old_blocked; \ + int __ret = 0; \ + siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ + __ret = wait_event_interruptible_exclusive(wq, condition); \ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#ifndef HAVE_WAIT_WOKEN +#define WQ_FLAG_WOKEN 0x02 +long wait_woken(wait_queue_entry_t *wait, unsigned int mode, long timeout); +int woken_wake_function(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key); +#endif /* HAVE_WAIT_WOKEN */ + +#endif /* __LICBFS_LINUX_WAIT_BIT_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/processor.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/processor.h new file mode 100644 index 0000000000000..700d01e53db40 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/processor.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Misc low level processor primitives */ +#ifndef _LINUX_PROCESSOR_H +#define _LINUX_PROCESSOR_H + +#include + +/* + * spin_begin is used before beginning a busy-wait loop, and must be paired + * with spin_end when the loop is exited. spin_cpu_relax must be called + * within the loop. + * + * The loop body should be as small and fast as possible, on the order of + * tens of instructions/cycles as a guide. It should and avoid calling + * cpu_relax, or any "spin" or sleep type of primitive including nested uses + * of these primitives. It should not lock or take any other resource. + * Violations of these guidelies will not cause a bug, but may cause sub + * optimal performance. + * + * These loops are optimized to be used where wait times are expected to be + * less than the cost of a context switch (and associated overhead). + * + * Detection of resource owner and decision to spin or sleep or guest-yield + * (e.g., spin lock holder vcpu preempted, or mutex owner not on CPU) can be + * tested within the loop body. + */ +#ifndef spin_begin +#ifdef CONFIG_PPC64 +#define spin_begin() HMT_low() +#else +#define spin_begin() +#endif /* CONFIG_PPC64 */ +#endif /* spin_begin */ + +#ifndef spin_cpu_relax +#define spin_cpu_relax() cpu_relax() +#endif + +/* + * spin_cpu_yield may be called to yield (undirected) to the hypervisor if + * necessary. This should be used if the wait is expected to take longer + * than context switch overhead, but we can't sleep or do a directed yield. + */ +#ifndef spin_cpu_yield +#define spin_cpu_yield() cpu_relax_yield() +#endif + +#ifndef spin_end +#ifdef CONFIG_PPC64 +#define spin_end() HMT_medium() +#else +#define spin_end() +#endif /* CONFIG_PPC64 */ +#endif /* spin_end */ + +/* + * spin_until_cond can be used to wait for a condition to become true. It + * may be expected that the first iteration will true in the common case + * (no spinning), so that callers should not require a first "likely" test + * for the uncontended case before using this primitive. + * + * Usage and implementation guidelines are the same as for the spin_begin + * primitives, above. 
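A sketch of the busy-wait shape described above, written against the spin_begin()/spin_cpu_relax()/spin_end() fallbacks defined earlier in this header (spin_until_cond(), defined next, wraps the same pattern); the flag parameter is illustrative:

#include <linux/compiler.h>

/* Busy-wait for a flag another CPU is expected to set almost immediately. */
static void example_spin_wait(int *flag)
{
	if (likely(READ_ONCE(*flag)))
		return;

	spin_begin();			/* e.g. lowers SMT priority on PPC64 */
	do {
		spin_cpu_relax();	/* keep the loop body tiny, no locks */
	} while (!READ_ONCE(*flag));
	spin_end();			/* restore normal priority */
}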
+ */ +#ifndef spin_until_cond +#define spin_until_cond(cond) \ +do { \ + if (unlikely(!(cond))) { \ + spin_begin(); \ + do { \ + spin_cpu_relax(); \ + } while (!(cond)); \ + spin_end(); \ + } \ +} while (0) + +#endif + +#endif /* _LINUX_PROCESSOR_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/refcount.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/refcount.h new file mode 100644 index 0000000000000..ecbf38561372f --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/refcount.h @@ -0,0 +1,40 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +#ifndef __LIBCFS_LINUX_REFCOUNT_H__ +#define __LIBCFS_LINUX_REFCOUNT_H__ + +#include + +#ifndef HAVE_REFCOUNT_T + +#define refcount_t atomic_t + +#define refcount_set atomic_set +#define refcount_inc atomic_inc +#define refcount_inc_not_zero atomic_inc_not_zero +#define refcount_dec atomic_dec +#define refcount_dec_and_test atomic_dec_and_test +#define refcount_read atomic_read + +#endif + +#endif /* __LIBCFS_LINUX_REFCOUNT_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/xarray.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/xarray.h new file mode 100644 index 0000000000000..74397ab3a080d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/xarray.h @@ -0,0 +1,1766 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _LINUX_XARRAY_H +#define _LINUX_XARRAY_H +/* + * eXtensible Arrays + * Copyright (c) 2017 Microsoft Corporation + * Author: Matthew Wilcox + * + * This is taken from kernel commit: + * + * 7b785645e ("mm: fix page cache convergence regression") + * + * at kernel verison 5.2-rc2 + * + * See Documentation/core-api/xarray.rst for how to use the XArray. + */ +#ifndef HAVE_XARRAY_SUPPORT + +#if defined(NEED_LOCKDEP_IS_HELD_DISCARD_CONST) \ + && defined(CONFIG_LOCKDEP) \ + && defined(lockdep_is_held) +#undef lockdep_is_held + #define lockdep_is_held(lock) \ + lock_is_held((struct lockdep_map *)&(lock)->dep_map) +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The bottom two bits of the entry determine how the XArray interprets + * the contents: + * + * 00: Pointer entry + * 10: Internal entry + * x1: Value entry or tagged pointer + * + * Attempting to store internal entries in the XArray is a bug. + * + * Most internal entries are pointers to the next node in the tree. + * The following internal entries have a special meaning: + * + * 0-62: Sibling entries + * 256: Zero entry + * 257: Retry entry + * + * Errors are also represented as internal entries, but use the negative + * space (-4094 to -2). They're never stored in the slots array; only + * returned by the normal API. 
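Looking back at the refcount compatibility shims above: on kernels without refcount_t they degrade to plain atomic_t operations, so callers can be written once against the refcount API. A sketch with an illustrative object type:

#include <linux/slab.h>

struct example_obj {
	refcount_t	eo_ref;
	/* payload fields would follow */
};

static struct example_obj *example_obj_alloc(void)
{
	struct example_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (obj)
		refcount_set(&obj->eo_ref, 1);	/* caller holds the first reference */
	return obj;
}

static void example_obj_get(struct example_obj *obj)
{
	refcount_inc(&obj->eo_ref);
}

static void example_obj_put(struct example_obj *obj)
{
	if (refcount_dec_and_test(&obj->eo_ref))
		kfree(obj);			/* last reference dropped */
}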
+ */ + +#define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) + +/** + * xa_mk_value() - Create an XArray entry from an integer. + * @v: Value to store in XArray. + * + * Context: Any context. + * Return: An entry suitable for storing in the XArray. + */ +static inline void *xa_mk_value(unsigned long v) +{ + WARN_ON((long)v < 0); + return (void *)((v << 1) | 1); +} + +/** + * xa_to_value() - Get value stored in an XArray entry. + * @entry: XArray entry. + * + * Context: Any context. + * Return: The value stored in the XArray entry. + */ +static inline unsigned long xa_to_value(const void *entry) +{ + return (unsigned long)entry >> 1; +} + +/** + * xa_is_value() - Determine if an entry is a value. + * @entry: XArray entry. + * + * Context: Any context. + * Return: True if the entry is a value, false if it is a pointer. + */ +static inline bool xa_is_value(const void *entry) +{ + return (unsigned long)entry & 1; +} + +/** + * xa_tag_pointer() - Create an XArray entry for a tagged pointer. + * @p: Plain pointer. + * @tag: Tag value (0, 1 or 3). + * + * If the user of the XArray prefers, they can tag their pointers instead + * of storing value entries. Three tags are available (0, 1 and 3). + * These are distinct from the xa_mark_t as they are not replicated up + * through the array and cannot be searched for. + * + * Context: Any context. + * Return: An XArray entry. + */ +static inline void *xa_tag_pointer(void *p, unsigned long tag) +{ + return (void *)((unsigned long)p | tag); +} + +/** + * xa_untag_pointer() - Turn an XArray entry into a plain pointer. + * @entry: XArray entry. + * + * If you have stored a tagged pointer in the XArray, call this function + * to get the untagged version of the pointer. + * + * Context: Any context. + * Return: A pointer. + */ +static inline void *xa_untag_pointer(void *entry) +{ + return (void *)((unsigned long)entry & ~3UL); +} + +/** + * xa_pointer_tag() - Get the tag stored in an XArray entry. + * @entry: XArray entry. + * + * If you have stored a tagged pointer in the XArray, call this function + * to get the tag of that pointer. + * + * Context: Any context. + * Return: A tag. + */ +static inline unsigned int xa_pointer_tag(void *entry) +{ + return (unsigned long)entry & 3UL; +} + +/* + * xa_mk_internal() - Create an internal entry. + * @v: Value to turn into an internal entry. + * + * Internal entries are used for a number of purposes. Entries 0-255 are + * used for sibling entries (only 0-62 are used by the current code). 256 + * is used for the retry entry. 257 is used for the reserved / zero entry. + * Negative internal entries are used to represent errnos. Node pointers + * are also tagged as internal entries in some situations. + * + * Context: Any context. + * Return: An XArray internal entry corresponding to this value. + */ +static inline void *xa_mk_internal(unsigned long v) +{ + return (void *)((v << 2) | 2); +} + +/* + * xa_to_internal() - Extract the value from an internal entry. + * @entry: XArray entry. + * + * Context: Any context. + * Return: The value which was stored in the internal entry. + */ +static inline unsigned long xa_to_internal(const void *entry) +{ + return (unsigned long)entry >> 2; +} + +/* + * xa_is_internal() - Is the entry an internal entry? + * @entry: XArray entry. + * + * Context: Any context. + * Return: %true if the entry is an internal entry. 
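A short sketch of the entry-encoding helpers above; the printed values and the datum being tagged are illustrative (tagging assumes the pointer is at least 4-byte aligned):

#include <linux/printk.h>

static int example_datum;

static void example_encoded_entries(void)
{
	void *entry;

	/* A value entry packs a small integer into the slot itself. */
	entry = xa_mk_value(42);
	if (xa_is_value(entry))
		pr_info("value entry holds %lu\n", xa_to_value(entry));

	/* A tagged pointer carries the pointer plus a 2-bit tag (0, 1 or 3). */
	entry = xa_tag_pointer(&example_datum, 1);
	pr_info("tag %u, pointer %p\n", xa_pointer_tag(entry),
		xa_untag_pointer(entry));
}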
+ */ +static inline bool xa_is_internal(const void *entry) +{ + return ((unsigned long)entry & 3) == 2; +} + +#define XA_ZERO_ENTRY xa_mk_internal(257) + +/** + * xa_is_zero() - Is the entry a zero entry? + * @entry: Entry retrieved from the XArray + * + * The normal API will return NULL as the contents of a slot containing + * a zero entry. You can only see zero entries by using the advanced API. + * + * Return: %true if the entry is a zero entry. + */ +static inline bool xa_is_zero(const void *entry) +{ + return unlikely(entry == XA_ZERO_ENTRY); +} + +/** + * xa_is_err() - Report whether an XArray operation returned an error + * @entry: Result from calling an XArray function + * + * If an XArray operation cannot complete an operation, it will return + * a special value indicating an error. This function tells you + * whether an error occurred; xa_err() tells you which error occurred. + * + * Context: Any context. + * Return: %true if the entry indicates an error. + */ +static inline bool xa_is_err(const void *entry) +{ + return unlikely(xa_is_internal(entry) && + entry >= xa_mk_internal(-MAX_ERRNO)); +} + +/** + * xa_err() - Turn an XArray result into an errno. + * @entry: Result from calling an XArray function. + * + * If an XArray operation cannot complete an operation, it will return + * a special pointer value which encodes an errno. This function extracts + * the errno from the pointer value, or returns 0 if the pointer does not + * represent an errno. + * + * Context: Any context. + * Return: A negative errno or 0. + */ +static inline int xa_err(void *entry) +{ + /* xa_to_internal() would not do sign extension. */ + if (xa_is_err(entry)) + return (long)entry >> 2; + return 0; +} + +/** + * struct xa_limit - Represents a range of IDs. + * @min: The lowest ID to allocate (inclusive). + * @max: The maximum ID to allocate (inclusive). + * + * This structure is used either directly or via the XA_LIMIT() macro + * to communicate the range of IDs that are valid for allocation. + * Two common ranges are predefined for you: + * * xa_limit_32b - [0 - UINT_MAX] + * * xa_limit_31b - [0 - INT_MAX] + */ +struct xa_limit { + u32 max; + u32 min; +}; + +#define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max } + +#define xa_limit_32b XA_LIMIT(0, UINT_MAX) +#define xa_limit_31b XA_LIMIT(0, INT_MAX) + +typedef unsigned __bitwise xa_mark_t; +#define XA_MARK_0 ((__force xa_mark_t)0U) +#define XA_MARK_1 ((__force xa_mark_t)1U) +#define XA_MARK_2 ((__force xa_mark_t)2U) +#define XA_PRESENT ((__force xa_mark_t)8U) +#define XA_MARK_MAX XA_MARK_2 +#define XA_FREE_MARK XA_MARK_0 + +enum xa_lock_type { + XA_LOCK_IRQ = 1, + XA_LOCK_BH = 2, +}; + +/* + * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, + * and we remain compatible with that. + */ +#define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) +#define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) +#define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) +#define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U) +#define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U) +#define XA_FLAGS_ACCOUNT ((__force gfp_t)32U) +#define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ + (__force unsigned)(mark))) + +/* ALLOC is for a normal 0-based alloc. ALLOC1 is for an 1-based alloc */ +#define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) +#define XA_FLAGS_ALLOC1 (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY) + +/** + * struct xarray - The anchor of the XArray. 
+ * @xa_lock: Lock that protects the contents of the XArray. + * + * To use the xarray, define it statically or embed it in your data structure. + * It is a very small data structure, so it does not usually make sense to + * allocate it separately and keep a pointer to it in your data structure. + * + * You may use the xa_lock to protect your own data structures as well. + */ +/* + * If all of the entries in the array are NULL, @xa_head is a NULL pointer. + * If the only non-NULL entry in the array is at index 0, @xa_head is that + * entry. If any other entry in the array is non-NULL, @xa_head points + * to an @xa_node. + */ +struct xarray { + spinlock_t xa_lock; +/* private: The rest of the data structure is not to be used directly. */ + gfp_t xa_flags; + void __rcu *xa_head; +}; + +#define XARRAY_INIT(name, flags) { \ + .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ + .xa_flags = flags, \ + .xa_head = NULL, \ +} + +/** + * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. + * @name: A string that names your XArray. + * @flags: XA_FLAG values. + * + * This is intended for file scope definitions of XArrays. It declares + * and initialises an empty XArray with the chosen name and flags. It is + * equivalent to calling xa_init_flags() on the array, but it does the + * initialisation at compiletime instead of runtime. + */ +#define DEFINE_XARRAY_FLAGS(name, flags) \ + struct xarray name = XARRAY_INIT(name, flags) + +/** + * DEFINE_XARRAY() - Define an XArray. + * @name: A string that names your XArray. + * + * This is intended for file scope definitions of XArrays. It declares + * and initialises an empty XArray with the chosen name. It is equivalent + * to calling xa_init() on the array, but it does the initialisation at + * compiletime instead of runtime. + */ +#define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) + +/** + * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0. + * @name: A string that names your XArray. + * + * This is intended for file scope definitions of allocating XArrays. + * See also DEFINE_XARRAY(). + */ +#define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) + +/** + * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1. + * @name: A string that names your XArray. + * + * This is intended for file scope definitions of allocating XArrays. + * See also DEFINE_XARRAY(). + */ +#define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1) + +void *xa_load(struct xarray *, unsigned long index); +void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); +void *xa_erase(struct xarray *, unsigned long index); +void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, + void *entry, gfp_t); +bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); +void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); +void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); +void *xa_find(struct xarray *xa, unsigned long *index, + unsigned long max, xa_mark_t) __attribute__((nonnull(2))); +void *xa_find_after(struct xarray *xa, unsigned long *index, + unsigned long max, xa_mark_t) __attribute__((nonnull(2))); +unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, + unsigned long max, unsigned int n, xa_mark_t); +void xa_destroy(struct xarray *); + +/** + * xa_init_flags() - Initialise an empty XArray with flags. + * @xa: XArray. + * @flags: XA_FLAG values. 
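A minimal sketch of the normal API declared above, using an illustrative file-scope array and string payload:

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>

static DEFINE_XARRAY(example_array);

static int example_basic_use(void)
{
	char *buf = kstrdup("hello", GFP_KERNEL);
	void *old;

	if (!buf)
		return -ENOMEM;

	/* xa_store() returns the previous entry at index 5, or an error entry. */
	old = xa_store(&example_array, 5, buf, GFP_KERNEL);
	if (xa_is_err(old)) {
		kfree(buf);
		return xa_err(old);
	}

	/* Look it up again, then remove it; xa_erase() hands back the old entry. */
	if (xa_load(&example_array, 5) == buf)
		kfree(xa_erase(&example_array, 5));

	return 0;
}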
+ * + * If you need to initialise an XArray with special flags (eg you need + * to take the lock from interrupt context), use this function instead + * of xa_init(). + * + * Context: Any context. + */ +static inline void xa_init_flags(struct xarray *xa, gfp_t flags) +{ + spin_lock_init(&xa->xa_lock); + xa->xa_flags = flags; + xa->xa_head = NULL; +} + +/** + * xa_init() - Initialise an empty XArray. + * @xa: XArray. + * + * An empty XArray is full of NULL entries. + * + * Context: Any context. + */ +static inline void xa_init(struct xarray *xa) +{ + xa_init_flags(xa, 0); +} + +/** + * xa_empty() - Determine if an array has any present entries. + * @xa: XArray. + * + * Context: Any context. + * Return: %true if the array contains only NULL pointers. + */ +static inline bool xa_empty(const struct xarray *xa) +{ + return xa->xa_head == NULL; +} + +/** + * xa_marked() - Inquire whether any entry in this array has a mark set + * @xa: Array + * @mark: Mark value + * + * Context: Any context. + * Return: %true if any entry has this mark set. + */ +static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) +{ + return xa->xa_flags & XA_FLAGS_MARK(mark); +} + +/** + * xa_for_each_start() - Iterate over a portion of an XArray. + * @xa: XArray. + * @index: Index of @entry. + * @entry: Entry retrieved from array. + * @start: First index to retrieve from array. + * + * During the iteration, @entry will have the value of the entry stored + * in @xa at @index. You may modify @index during the iteration if you + * want to skip or reprocess indices. It is safe to modify the array + * during the iteration. At the end of the iteration, @entry will be set + * to NULL and @index will have a value less than or equal to max. + * + * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n). You have + * to handle your own locking with xas_for_each(), and if you have to unlock + * after each iteration, it will also end up being O(n.log(n)). + * xa_for_each_start() will spin if it hits a retry entry; if you intend to + * see retry entries, you should use the xas_for_each() iterator instead. + * The xas_for_each() iterator will expand into more inline code than + * xa_for_each_start(). + * + * Context: Any context. Takes and releases the RCU lock. + */ +#define xa_for_each_start(xa, index, entry, start) \ + for (index = start, \ + entry = xa_find(xa, &index, ULONG_MAX, XA_PRESENT); \ + entry; \ + entry = xa_find_after(xa, &index, ULONG_MAX, XA_PRESENT)) + +/** + * xa_for_each() - Iterate over present entries in an XArray. + * @xa: XArray. + * @index: Index of @entry. + * @entry: Entry retrieved from array. + * + * During the iteration, @entry will have the value of the entry stored + * in @xa at @index. You may modify @index during the iteration if you want + * to skip or reprocess indices. It is safe to modify the array during the + * iteration. At the end of the iteration, @entry will be set to NULL and + * @index will have a value less than or equal to max. + * + * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have + * to handle your own locking with xas_for_each(), and if you have to unlock + * after each iteration, it will also end up being O(n.log(n)). xa_for_each() + * will spin if it hits a retry entry; if you intend to see retry entries, + * you should use the xas_for_each() iterator instead. The xas_for_each() + * iterator will expand into more inline code than xa_for_each(). + * + * Context: Any context. Takes and releases the RCU lock. 
+ */ +#define xa_for_each(xa, index, entry) \ + xa_for_each_start(xa, index, entry, 0) + +/** + * xa_for_each_marked() - Iterate over marked entries in an XArray. + * @xa: XArray. + * @index: Index of @entry. + * @entry: Entry retrieved from array. + * @filter: Selection criterion. + * + * During the iteration, @entry will have the value of the entry stored + * in @xa at @index. The iteration will skip all entries in the array + * which do not match @filter. You may modify @index during the iteration + * if you want to skip or reprocess indices. It is safe to modify the array + * during the iteration. At the end of the iteration, @entry will be set to + * NULL and @index will have a value less than or equal to max. + * + * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n). + * You have to handle your own locking with xas_for_each(), and if you have + * to unlock after each iteration, it will also end up being O(n.log(n)). + * xa_for_each_marked() will spin if it hits a retry entry; if you intend to + * see retry entries, you should use the xas_for_each_marked() iterator + * instead. The xas_for_each_marked() iterator will expand into more inline + * code than xa_for_each_marked(). + * + * Context: Any context. Takes and releases the RCU lock. + */ +#define xa_for_each_marked(xa, index, entry, filter) \ + for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \ + entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter)) + +#define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) +#define xa_lock(xa) spin_lock(&(xa)->xa_lock) +#define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) +#define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) +#define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) +#define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) +#define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) +#define xa_lock_irqsave(xa, flags) \ + spin_lock_irqsave(&(xa)->xa_lock, flags) +#define xa_unlock_irqrestore(xa, flags) \ + spin_unlock_irqrestore(&(xa)->xa_lock, flags) + +/* + * Versions of the normal API which require the caller to hold the + * xa_lock. If the GFP flags allow it, they will drop the lock to + * allocate memory, then reacquire it afterwards. These functions + * may also re-enable interrupts if the XArray flags indicate the + * locking should be interrupt safe. + */ +void *__xa_erase(struct xarray *, unsigned long index); +void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); +void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, + void *entry, gfp_t); +int __must_check __xa_insert(struct xarray *, unsigned long index, + void *entry, gfp_t); +int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry, + struct xa_limit, gfp_t); +int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry, + struct xa_limit, u32 *next, gfp_t); +void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); +void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); + +/** + * xa_store_bh() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * This function is like calling xa_store() except it disables softirqs + * while holding the array lock. + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. + * Return: The entry which used to be at this index. 
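An iteration sketch using the convenience loops defined above; the array pointer and the use of XA_MARK_0 are illustrative:

#include <linux/printk.h>

static void example_iterate(struct xarray *xa)
{
	unsigned long index;
	void *entry;

	/* Visit every present entry; the macro takes the RCU lock internally. */
	xa_for_each(xa, index, entry)
		pr_info("index %lu -> %p\n", index, entry);

	/* Visit only entries carrying XA_MARK_0. */
	xa_for_each_marked(xa, index, entry, XA_MARK_0)
		pr_info("marked index %lu\n", index);
}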
+ */ +static inline void *xa_store_bh(struct xarray *xa, unsigned long index, + void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock_bh(xa); + curr = __xa_store(xa, index, entry, gfp); + xa_unlock_bh(xa); + + return curr; +} + +/** + * xa_store_irq() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * This function is like calling xa_store() except it disables interrupts + * while holding the array lock. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. + * Return: The entry which used to be at this index. + */ +static inline void *xa_store_irq(struct xarray *xa, unsigned long index, + void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock_irq(xa); + curr = __xa_store(xa, index, entry, gfp); + xa_unlock_irq(xa); + + return curr; +} + +/** + * xa_erase_bh() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * After this function returns, loading from @index will return %NULL. + * If the index is part of a multi-index entry, all indices will be erased + * and none of the entries will be part of a multi-index entry. + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. + * Return: The entry which used to be at this index. + */ +static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) +{ + void *entry; + + xa_lock_bh(xa); + entry = __xa_erase(xa, index); + xa_unlock_bh(xa); + + return entry; +} + +/** + * xa_erase_irq() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * After this function returns, loading from @index will return %NULL. + * If the index is part of a multi-index entry, all indices will be erased + * and none of the entries will be part of a multi-index entry. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. + * Return: The entry which used to be at this index. + */ +static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) +{ + void *entry; + + xa_lock_irq(xa); + entry = __xa_erase(xa, index); + xa_unlock_irq(xa); + + return entry; +} + +/** + * xa_cmpxchg() - Conditionally replace an entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New value to place in array. + * @gfp: Memory allocation flags. + * + * If the entry at @index is the same as @old, replace it with @entry. + * If the return value is equal to @old, then the exchange was successful. + * + * Context: Any context. Takes and releases the xa_lock. May sleep + * if the @gfp flags permit. + * Return: The old value at this index or xa_err() if an error happened. + */ +static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock(xa); + curr = __xa_cmpxchg(xa, index, old, entry, gfp); + xa_unlock(xa); + + return curr; +} + +/** + * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New value to place in array. + * @gfp: Memory allocation flags. + * + * This function is like calling xa_cmpxchg() except it disables softirqs + * while holding the array lock. + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. May sleep if the @gfp flags permit. + * Return: The old value at this index or xa_err() if an error happened. 
+ */ +static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock_bh(xa); + curr = __xa_cmpxchg(xa, index, old, entry, gfp); + xa_unlock_bh(xa); + + return curr; +} + +/** + * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New value to place in array. + * @gfp: Memory allocation flags. + * + * This function is like calling xa_cmpxchg() except it disables interrupts + * while holding the array lock. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. May sleep if the @gfp flags permit. + * Return: The old value at this index or xa_err() if an error happened. + */ +static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock_irq(xa); + curr = __xa_cmpxchg(xa, index, old, entry, gfp); + xa_unlock_irq(xa); + + return curr; +} + +/** + * xa_insert() - Store this entry in the XArray unless another entry is + * already present. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Inserting a NULL entry will store a reserved entry (like xa_reserve()) + * if no entry is present. Inserting will fail if a reserved entry is + * present, even though loading from this index will return NULL. + * + * Context: Any context. Takes and releases the xa_lock. May sleep if + * the @gfp flags permit. + * Return: 0 if the store succeeded. -EBUSY if another entry was present. + * -ENOMEM if memory could not be allocated. + */ +static inline int __must_check xa_insert(struct xarray *xa, + unsigned long index, void *entry, gfp_t gfp) +{ + int err; + + xa_lock(xa); + err = __xa_insert(xa, index, entry, gfp); + xa_unlock(xa); + + return err; +} + +/** + * xa_insert_bh() - Store this entry in the XArray unless another entry is + * already present. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Inserting a NULL entry will store a reserved entry (like xa_reserve()) + * if no entry is present. Inserting will fail if a reserved entry is + * present, even though loading from this index will return NULL. + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. May sleep if the @gfp flags permit. + * Return: 0 if the store succeeded. -EBUSY if another entry was present. + * -ENOMEM if memory could not be allocated. + */ +static inline int __must_check xa_insert_bh(struct xarray *xa, + unsigned long index, void *entry, gfp_t gfp) +{ + int err; + + xa_lock_bh(xa); + err = __xa_insert(xa, index, entry, gfp); + xa_unlock_bh(xa); + + return err; +} + +/** + * xa_insert_irq() - Store this entry in the XArray unless another entry is + * already present. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Inserting a NULL entry will store a reserved entry (like xa_reserve()) + * if no entry is present. Inserting will fail if a reserved entry is + * present, even though loading from this index will return NULL. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. May sleep if the @gfp flags permit. + * Return: 0 if the store succeeded. -EBUSY if another entry was present. + * -ENOMEM if memory could not be allocated. 
+ */ +static inline int __must_check xa_insert_irq(struct xarray *xa, + unsigned long index, void *entry, gfp_t gfp) +{ + int err; + + xa_lock_irq(xa); + err = __xa_insert(xa, index, entry, gfp); + xa_unlock_irq(xa); + + return err; +} + +/** + * xa_alloc() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @entry: New entry. + * @limit: Range of ID to allocate. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Any context. Takes and releases the xa_lock. May sleep if + * the @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory could not be allocated or + * -EBUSY if there are no free entries in @limit. + */ +static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, + void *entry, struct xa_limit limit, gfp_t gfp) +{ + int err; + + xa_lock(xa); + err = __xa_alloc(xa, id, entry, limit, gfp); + xa_unlock(xa); + + return err; +} + +/** + * xa_alloc_bh() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @entry: New entry. + * @limit: Range of ID to allocate. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. May sleep if the @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory could not be allocated or + * -EBUSY if there are no free entries in @limit. + */ +static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, + void *entry, struct xa_limit limit, gfp_t gfp) +{ + int err; + + xa_lock_bh(xa); + err = __xa_alloc(xa, id, entry, limit, gfp); + xa_unlock_bh(xa); + + return err; +} + +/** + * xa_alloc_irq() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @entry: New entry. + * @limit: Range of ID to allocate. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. May sleep if the @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory could not be allocated or + * -EBUSY if there are no free entries in @limit. + */ +static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, + void *entry, struct xa_limit limit, gfp_t gfp) +{ + int err; + + xa_lock_irq(xa); + err = __xa_alloc(xa, id, entry, limit, gfp); + xa_unlock_irq(xa); + + return err; +} + +/** + * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @entry: New entry. + * @limit: Range of allocated ID. + * @next: Pointer to next ID to allocate. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * The search for an empty entry will start at @next and will wrap + * around if necessary. + * + * Context: Any context. 
Takes and releases the xa_lock. May sleep if + * the @gfp flags permit. + * Return: 0 if the allocation succeeded without wrapping. 1 if the + * allocation succeeded after wrapping, -ENOMEM if memory could not be + * allocated or -EBUSY if there are no free entries in @limit. + */ +static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, + struct xa_limit limit, u32 *next, gfp_t gfp) +{ + int err; + + xa_lock(xa); + err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); + xa_unlock(xa); + + return err; +} + +/** + * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @entry: New entry. + * @limit: Range of allocated ID. + * @next: Pointer to next ID to allocate. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * The search for an empty entry will start at @next and will wrap + * around if necessary. + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. May sleep if the @gfp flags permit. + * Return: 0 if the allocation succeeded without wrapping. 1 if the + * allocation succeeded after wrapping, -ENOMEM if memory could not be + * allocated or -EBUSY if there are no free entries in @limit. + */ +static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, + struct xa_limit limit, u32 *next, gfp_t gfp) +{ + int err; + + xa_lock_bh(xa); + err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); + xa_unlock_bh(xa); + + return err; +} + +/** + * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @entry: New entry. + * @limit: Range of allocated ID. + * @next: Pointer to next ID to allocate. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * The search for an empty entry will start at @next and will wrap + * around if necessary. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. May sleep if the @gfp flags permit. + * Return: 0 if the allocation succeeded without wrapping. 1 if the + * allocation succeeded after wrapping, -ENOMEM if memory could not be + * allocated or -EBUSY if there are no free entries in @limit. + */ +static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, + struct xa_limit limit, u32 *next, gfp_t gfp) +{ + int err; + + xa_lock_irq(xa); + err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); + xa_unlock_irq(xa); + + return err; +} + +/** + * xa_reserve() - Reserve this index in the XArray. + * @xa: XArray. + * @index: Index into array. + * @gfp: Memory allocation flags. + * + * Ensures there is somewhere to store an entry at @index in the array. + * If there is already something stored at @index, this function does + * nothing. If there was nothing there, the entry is marked as reserved. + * Loading from a reserved entry returns a %NULL pointer. + * + * If you do not use the entry that you have reserved, call xa_release() + * or xa_erase() to free any unnecessary memory. + * + * Context: Any context. Takes and releases the xa_lock. + * May sleep if the @gfp flags permit. 
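An ID-allocation sketch using the helpers above; the array, limit and wrap handling are illustrative:

static DEFINE_XARRAY_ALLOC(example_ids);
static u32 example_next_id;

/* Hand out the lowest free ID in [0, INT_MAX] and bind it to @object. */
static int example_new_id(void *object, u32 *id)
{
	return xa_alloc(&example_ids, id, object, xa_limit_31b, GFP_KERNEL);
}

/* Hand out IDs cyclically, resuming the search after the last one given out. */
static int example_new_cyclic_id(void *object, u32 *id)
{
	int err = xa_alloc_cyclic(&example_ids, id, object, xa_limit_31b,
				  &example_next_id, GFP_KERNEL);

	return err < 0 ? err : 0;	/* 1 only means the search wrapped */
}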
+ * Return: 0 if the reservation succeeded or -ENOMEM if it failed. + */ +static inline __must_check +int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp)); +} + +/** + * xa_reserve_bh() - Reserve this index in the XArray. + * @xa: XArray. + * @index: Index into array. + * @gfp: Memory allocation flags. + * + * A softirq-disabling version of xa_reserve(). + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. + * Return: 0 if the reservation succeeded or -ENOMEM if it failed. + */ +static inline __must_check +int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp)); +} + +/** + * xa_reserve_irq() - Reserve this index in the XArray. + * @xa: XArray. + * @index: Index into array. + * @gfp: Memory allocation flags. + * + * An interrupt-disabling version of xa_reserve(). + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. + * Return: 0 if the reservation succeeded or -ENOMEM if it failed. + */ +static inline __must_check +int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp)); +} + +/** + * xa_release() - Release a reserved entry. + * @xa: XArray. + * @index: Index of entry. + * + * After calling xa_reserve(), you can call this function to release the + * reservation. If the entry at @index has been stored to, this function + * will do nothing. + */ +static inline void xa_release(struct xarray *xa, unsigned long index) +{ + xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0); +} + +/* Everything below here is the Advanced API. Proceed with caution. */ + +/* + * The xarray is constructed out of a set of 'chunks' of pointers. Choosing + * the best chunk size requires some tradeoffs. A power of two recommends + * itself so that we can walk the tree based purely on shifts and masks. + * Generally, the larger the better; as the number of slots per level of the + * tree increases, the less tall the tree needs to be. But that needs to be + * balanced against the memory consumption of each node. On a 64-bit system, + * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we + * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. + */ +#ifndef XA_CHUNK_SHIFT +#define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) +#endif +#define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) +#define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) +#define XA_MAX_MARKS 3 +#define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG) + +/* + * @count is the count of every non-NULL element in the ->slots array + * whether that is a value entry, a retry entry, a user pointer, + * a sibling entry or a pointer to the next level of the tree. + * @nr_values is the count of every element in ->slots which is + * either a value entry or a sibling of a value entry. 
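A sketch of the reservation helpers above, guaranteeing that a later store cannot fail for lack of memory; the decision logic is illustrative:

/* Pre-allocate the slot at @index, then either fill it or give it back. */
static int example_reserve_slot(struct xarray *xa, unsigned long index,
				void *entry)
{
	int err = xa_reserve(xa, index, GFP_KERNEL);

	if (err)
		return err;	/* -ENOMEM: the slot could not be allocated */

	if (entry)
		xa_store(xa, index, entry, GFP_KERNEL);	/* no allocation needed now */
	else
		xa_release(xa, index);			/* drop the reservation */
	return 0;
}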
+ */ +struct xa_node { + unsigned char shift; /* Bits remaining in each slot */ + unsigned char offset; /* Slot offset in parent */ + unsigned char count; /* Total entry count */ + unsigned char nr_values; /* Value entry count */ + struct xa_node __rcu *parent; /* NULL at top of tree */ + struct xarray *array; /* The array we belong to */ + union { + struct list_head private_list; /* For tree user */ + struct rcu_head rcu_head; /* Used when freeing node */ + }; + void __rcu *slots[XA_CHUNK_SIZE]; + union { + unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; + unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; + }; +}; + +void xa_dump(const struct xarray *); +void xa_dump_node(const struct xa_node *); + +#ifdef XA_DEBUG +#define XA_BUG_ON(xa, x) do { \ + if (x) { \ + xa_dump(xa); \ + BUG(); \ + } \ + } while (0) +#define XA_NODE_BUG_ON(node, x) do { \ + if (x) { \ + if (node) xa_dump_node(node); \ + BUG(); \ + } \ + } while (0) +#else +#define XA_BUG_ON(xa, x) do { } while (0) +#define XA_NODE_BUG_ON(node, x) do { } while (0) +#endif + +/* Private */ +static inline void *xa_head(const struct xarray *xa) +{ + return rcu_dereference_check(xa->xa_head, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_head_locked(const struct xarray *xa) +{ + return rcu_dereference_protected(xa->xa_head, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_entry(const struct xarray *xa, + const struct xa_node *node, unsigned int offset) +{ + XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); + return rcu_dereference_check(node->slots[offset], + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_entry_locked(const struct xarray *xa, + const struct xa_node *node, unsigned int offset) +{ + XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); + return rcu_dereference_protected(node->slots[offset], + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline struct xa_node *xa_parent(const struct xarray *xa, + const struct xa_node *node) +{ + return rcu_dereference_check(node->parent, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline struct xa_node *xa_parent_locked(const struct xarray *xa, + const struct xa_node *node) +{ + return rcu_dereference_protected(node->parent, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_mk_node(const struct xa_node *node) +{ + return (void *)((unsigned long)node | 2); +} + +/* Private */ +static inline struct xa_node *xa_to_node(const void *entry) +{ + return (struct xa_node *)((unsigned long)entry - 2); +} + +/* Private */ +static inline bool xa_is_node(const void *entry) +{ + return xa_is_internal(entry) && (unsigned long)entry > 4096; +} + +/* Private */ +static inline void *xa_mk_sibling(unsigned int offset) +{ + return xa_mk_internal(offset); +} + +/* Private */ +static inline unsigned long xa_to_sibling(const void *entry) +{ + return xa_to_internal(entry); +} + +/** + * xa_is_sibling() - Is the entry a sibling entry? + * @entry: Entry retrieved from the XArray + * + * Return: %true if the entry is a sibling entry. + */ +static inline bool xa_is_sibling(const void *entry) +{ + return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && + (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); +} + +#define XA_RETRY_ENTRY xa_mk_internal(256) + +/** + * xa_is_retry() - Is the entry a retry entry? + * @entry: Entry retrieved from the XArray + * + * Return: %true if the entry is a retry entry. 
+ */ +static inline bool xa_is_retry(const void *entry) +{ + return unlikely(entry == XA_RETRY_ENTRY); +} + +/** + * xa_is_advanced() - Is the entry only permitted for the advanced API? + * @entry: Entry to be stored in the XArray. + * + * Return: %true if the entry cannot be stored by the normal API. + */ +static inline bool xa_is_advanced(const void *entry) +{ + return xa_is_internal(entry) && (entry <= XA_RETRY_ENTRY); +} + +/** + * typedef xa_update_node_t - A callback function from the XArray. + * @node: The node which is being processed + * + * This function is called every time the XArray updates the count of + * present and value entries in a node. It allows advanced users to + * maintain the private_list in the node. + * + * Context: The xa_lock is held and interrupts may be disabled. + * Implementations should not drop the xa_lock, nor re-enable + * interrupts. + */ +typedef void (*xa_update_node_t)(struct xa_node *node); + +/* + * The xa_state is opaque to its users. It contains various different pieces + * of state involved in the current operation on the XArray. It should be + * declared on the stack and passed between the various internal routines. + * The various elements in it should not be accessed directly, but only + * through the provided accessor functions. The below documentation is for + * the benefit of those working on the code, not for users of the XArray. + * + * @xa_node usually points to the xa_node containing the slot we're operating + * on (and @xa_offset is the offset in the slots array). If there is a + * single entry in the array at index 0, there are no allocated xa_nodes to + * point to, and so we store %NULL in @xa_node. @xa_node is set to + * the value %XAS_RESTART if the xa_state is not walked to the correct + * position in the tree of nodes for this operation. If an error occurs + * during an operation, it is set to an %XAS_ERROR value. If we run off the + * end of the allocated nodes, it is set to %XAS_BOUNDS. + */ +struct xa_state { + struct xarray *xa; + unsigned long xa_index; + unsigned char xa_shift; + unsigned char xa_sibs; + unsigned char xa_offset; + unsigned char xa_pad; /* Helps gcc generate better code */ + struct xa_node *xa_node; + struct xa_node *xa_alloc; + xa_update_node_t xa_update; +}; + +/* + * We encode errnos in the xas->xa_node. If an error has happened, we need to + * drop the lock to fix it, and once we've done so the xa_state is invalid. + */ +#define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) +#define XAS_BOUNDS ((struct xa_node *)1UL) +#define XAS_RESTART ((struct xa_node *)3UL) + +#define __XA_STATE(array, index, shift, sibs) { \ + .xa = array, \ + .xa_index = index, \ + .xa_shift = shift, \ + .xa_sibs = sibs, \ + .xa_offset = 0, \ + .xa_pad = 0, \ + .xa_node = XAS_RESTART, \ + .xa_alloc = NULL, \ + .xa_update = NULL \ +} + +/** + * XA_STATE() - Declare an XArray operation state. + * @name: Name of this operation state (usually xas). + * @array: Array to operate on. + * @index: Initial index of interest. + * + * Declare and initialise an xa_state on the stack. + */ +#define XA_STATE(name, array, index) \ + struct xa_state name = __XA_STATE(array, index, 0, 0) + +/** + * XA_STATE_ORDER() - Declare an XArray operation state. + * @name: Name of this operation state (usually xas). + * @array: Array to operate on. + * @index: Initial index of interest. + * @order: Order of entry. + * + * Declare and initialise an xa_state on the stack. 
This variant of + * XA_STATE() allows you to specify the 'order' of the element you + * want to operate on.` + */ +#define XA_STATE_ORDER(name, array, index, order) \ + struct xa_state name = __XA_STATE(array, \ + (index >> order) << order, \ + order - (order % XA_CHUNK_SHIFT), \ + (1U << (order % XA_CHUNK_SHIFT)) - 1) + +#define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) +#define xas_trylock(xas) xa_trylock((xas)->xa) +#define xas_lock(xas) xa_lock((xas)->xa) +#define xas_unlock(xas) xa_unlock((xas)->xa) +#define xas_lock_bh(xas) xa_lock_bh((xas)->xa) +#define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) +#define xas_lock_irq(xas) xa_lock_irq((xas)->xa) +#define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) +#define xas_lock_irqsave(xas, flags) \ + xa_lock_irqsave((xas)->xa, flags) +#define xas_unlock_irqrestore(xas, flags) \ + xa_unlock_irqrestore((xas)->xa, flags) + +/** + * xas_error() - Return an errno stored in the xa_state. + * @xas: XArray operation state. + * + * Return: 0 if no error has been noted. A negative errno if one has. + */ +static inline int xas_error(const struct xa_state *xas) +{ + return xa_err(xas->xa_node); +} + +/** + * xas_set_err() - Note an error in the xa_state. + * @xas: XArray operation state. + * @err: Negative error number. + * + * Only call this function with a negative @err; zero or positive errors + * will probably not behave the way you think they should. If you want + * to clear the error from an xa_state, use xas_reset(). + */ +static inline void xas_set_err(struct xa_state *xas, long err) +{ + xas->xa_node = XA_ERROR(err); +} + +/** + * xas_invalid() - Is the xas in a retry or error state? + * @xas: XArray operation state. + * + * Return: %true if the xas cannot be used for operations. + */ +static inline bool xas_invalid(const struct xa_state *xas) +{ + return (unsigned long)xas->xa_node & 3; +} + +/** + * xas_valid() - Is the xas a valid cursor into the array? + * @xas: XArray operation state. + * + * Return: %true if the xas can be used for operations. + */ +static inline bool xas_valid(const struct xa_state *xas) +{ + return !xas_invalid(xas); +} + +/** + * xas_is_node() - Does the xas point to a node? + * @xas: XArray operation state. + * + * Return: %true if the xas currently references a node. + */ +static inline bool xas_is_node(const struct xa_state *xas) +{ + return xas_valid(xas) && xas->xa_node; +} + +/* True if the pointer is something other than a node */ +static inline bool xas_not_node(struct xa_node *node) +{ + return ((unsigned long)node & 3) || !node; +} + +/* True if the node represents RESTART or an error */ +static inline bool xas_frozen(struct xa_node *node) +{ + return (unsigned long)node & 2; +} + +/* True if the node represents head-of-tree, RESTART or BOUNDS */ +static inline bool xas_top(struct xa_node *node) +{ + return node <= XAS_RESTART; +} + +/** + * xas_reset() - Reset an XArray operation state. + * @xas: XArray operation state. + * + * Resets the error or walk state of the @xas so future walks of the + * array will start from the root. Use this if you have dropped the + * xarray lock and want to reuse the xa_state. + * + * Context: Any context. + */ +static inline void xas_reset(struct xa_state *xas) +{ + xas->xa_node = XAS_RESTART; +} + +/** + * xas_retry() - Retry the operation if appropriate. + * @xas: XArray operation state. + * @entry: Entry from xarray. + * + * The advanced functions may sometimes return an internal entry, such as + * a retry entry or a zero entry. 
This function sets up the @xas to restart + * the walk from the head of the array if needed. + * + * Context: Any context. + * Return: true if the operation needs to be retried. + */ +static inline bool xas_retry(struct xa_state *xas, const void *entry) +{ + if (xa_is_zero(entry)) + return true; + if (!xa_is_retry(entry)) + return false; + xas_reset(xas); + return true; +} + +void *xas_load(struct xa_state *); +void *xas_store(struct xa_state *, void *entry); +void *xas_find(struct xa_state *, unsigned long max); +void *xas_find_conflict(struct xa_state *); + +bool xas_get_mark(const struct xa_state *, xa_mark_t); +void xas_set_mark(const struct xa_state *, xa_mark_t); +void xas_clear_mark(const struct xa_state *, xa_mark_t); +void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); +void xas_init_marks(const struct xa_state *); + +bool xas_nomem(struct xa_state *, gfp_t); +void xas_pause(struct xa_state *); + +void xas_create_range(struct xa_state *); + +/** + * xas_reload() - Refetch an entry from the xarray. + * @xas: XArray operation state. + * + * Use this function to check that a previously loaded entry still has + * the same value. This is useful for the lockless pagecache lookup where + * we walk the array with only the RCU lock to protect us, lock the page, + * then check that the page hasn't moved since we looked it up. + * + * The caller guarantees that @xas is still valid. If it may be in an + * error or restart state, call xas_load() instead. + * + * Return: The entry at this location in the xarray. + */ +static inline void *xas_reload(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (node) + return xa_entry(xas->xa, node, xas->xa_offset); + return xa_head(xas->xa); +} + +/** + * xas_set() - Set up XArray operation state for a different index. + * @xas: XArray operation state. + * @index: New index into the XArray. + * + * Move the operation state to refer to a different index. This will + * have the effect of starting a walk from the top; see xas_next() + * to move to an adjacent index. + */ +static inline void xas_set(struct xa_state *xas, unsigned long index) +{ + xas->xa_index = index; + xas->xa_node = XAS_RESTART; +} + +/** + * xas_set_order() - Set up XArray operation state for a multislot entry. + * @xas: XArray operation state. + * @index: Target of the operation. + * @order: Entry occupies 2^@order indices. + */ +static inline void xas_set_order(struct xa_state *xas, unsigned long index, + unsigned int order) +{ +#ifdef CONFIG_XARRAY_MULTI + xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; + xas->xa_shift = order - (order % XA_CHUNK_SHIFT); + xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; + xas->xa_node = XAS_RESTART; +#else + BUG_ON(order > 0); + xas_set(xas, index); +#endif +} + +/** + * xas_set_update() - Set up XArray operation state for a callback. + * @xas: XArray operation state. + * @update: Function to call when updating a node. + * + * The XArray can notify a caller after it has updated an xa_node. + * This is advanced functionality and is only needed by the page cache. + */ +static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) +{ + xas->xa_update = update; +} + +/** + * xas_next_entry() - Advance iterator to next present entry. + * @xas: XArray operation state. + * @max: Highest index to return. + * + * xas_next_entry() is an inline function to optimise xarray traversal for + * speed. 
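A sketch of the advanced-API store pattern using the xas_* declarations above: xas_nomem() pre-allocates memory outside the lock and asks the caller to retry. The function name is illustrative:

static int example_xas_store(struct xarray *xa, unsigned long index,
			     void *entry)
{
	XA_STATE(xas, xa, index);

	do {
		xas_lock(&xas);
		xas_store(&xas, entry);		/* may record -ENOMEM in the xas */
		xas_unlock(&xas);
	} while (xas_nomem(&xas, GFP_KERNEL));	/* allocate and retry if needed */

	return xas_error(&xas);
}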
It is equivalent to calling xas_find(), and will call xas_find() + * for all the hard cases. + * + * Return: The next present entry after the one currently referred to by @xas. + */ +static inline void *xas_next_entry(struct xa_state *xas, unsigned long max) +{ + struct xa_node *node = xas->xa_node; + void *entry; + + if (unlikely(xas_not_node(node) || node->shift || + xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK))) + return xas_find(xas, max); + + do { + if (unlikely(xas->xa_index >= max)) + return xas_find(xas, max); + if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) + return xas_find(xas, max); + entry = xa_entry(xas->xa, node, xas->xa_offset + 1); + if (unlikely(xa_is_internal(entry))) + return xas_find(xas, max); + xas->xa_offset++; + xas->xa_index++; + } while (!entry); + + return entry; +} + +/* Private */ +static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance, + xa_mark_t mark) +{ + unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark]; + unsigned int offset = xas->xa_offset; + + if (advance) + offset++; + if (XA_CHUNK_SIZE == BITS_PER_LONG) { + if (offset < XA_CHUNK_SIZE) { + unsigned long data = *addr & (~0UL << offset); + if (data) + return __ffs(data); + } + return XA_CHUNK_SIZE; + } + + return find_next_bit(addr, XA_CHUNK_SIZE, offset); +} + +/** + * xas_next_marked() - Advance iterator to next marked entry. + * @xas: XArray operation state. + * @max: Highest index to return. + * @mark: Mark to search for. + * + * xas_next_marked() is an inline function to optimise xarray traversal for + * speed. It is equivalent to calling xas_find_marked(), and will call + * xas_find_marked() for all the hard cases. + * + * Return: The next marked entry after the one currently referred to by @xas. + */ +static inline void *xas_next_marked(struct xa_state *xas, unsigned long max, + xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset; + + if (unlikely(xas_not_node(node) || node->shift)) + return xas_find_marked(xas, max, mark); + offset = xas_find_chunk(xas, true, mark); + xas->xa_offset = offset; + xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset; + if (xas->xa_index > max) + return NULL; + if (offset == XA_CHUNK_SIZE) + return xas_find_marked(xas, max, mark); + return xa_entry(xas->xa, node, offset); +} + +/* + * If iterating while holding a lock, drop the lock and reschedule + * every %XA_CHECK_SCHED loops. + */ +enum { + XA_CHECK_SCHED = 4096, +}; + +/** + * xas_for_each() - Iterate over a range of an XArray. + * @xas: XArray operation state. + * @entry: Entry retrieved from the array. + * @max: Maximum index to retrieve from array. + * + * The loop body will be executed for each entry present in the xarray + * between the current xas position and @max. @entry will be set to + * the entry retrieved from the xarray. It is safe to delete entries + * from the array in the loop body. You should hold either the RCU lock + * or the xa_lock while iterating. If you need to drop the lock, call + * xas_pause() first. + */ +#define xas_for_each(xas, entry, max) \ + for (entry = xas_find(xas, max); entry; \ + entry = xas_next_entry(xas, max)) + +/** + * xas_for_each_marked() - Iterate over a range of an XArray. + * @xas: XArray operation state. + * @entry: Entry retrieved from the array. + * @max: Maximum index to retrieve from array. + * @mark: Mark to search for. + * + * The loop body will be executed for each marked entry in the xarray + * between the current xas position and @max. 
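A read-side walk sketch combining xas_for_each() with xas_retry(), under the RCU lock as the comments above require; the printed output is illustrative:

#include <linux/rcupdate.h>
#include <linux/printk.h>

static void example_xas_walk(struct xarray *xa, unsigned long start,
			     unsigned long max)
{
	XA_STATE(xas, xa, start);
	void *entry;

	rcu_read_lock();
	xas_for_each(&xas, entry, max) {
		/* Restart the walk if we raced with the tree being reshaped. */
		if (xas_retry(&xas, entry))
			continue;
		pr_info("index %lu -> %p\n", xas.xa_index, entry);
	}
	rcu_read_unlock();
}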
@entry will be set to + * the entry retrieved from the xarray. It is safe to delete entries + * from the array in the loop body. You should hold either the RCU lock + * or the xa_lock while iterating. If you need to drop the lock, call + * xas_pause() first. + */ +#define xas_for_each_marked(xas, entry, max, mark) \ + for (entry = xas_find_marked(xas, max, mark); entry; \ + entry = xas_next_marked(xas, max, mark)) + +/** + * xas_for_each_conflict() - Iterate over a range of an XArray. + * @xas: XArray operation state. + * @entry: Entry retrieved from the array. + * + * The loop body will be executed for each entry in the XArray that lies + * within the range specified by @xas. If the loop completes successfully, + * any entries that lie in this range will be replaced by @entry. The caller + * may break out of the loop; if they do so, the contents of the XArray will + * be unchanged. The operation may fail due to an out of memory condition. + * The caller may also call xa_set_err() to exit the loop while setting an + * error to record the reason. + */ +#define xas_for_each_conflict(xas, entry) \ + while ((entry = xas_find_conflict(xas))) + +void *__xas_next(struct xa_state *); +void *__xas_prev(struct xa_state *); + +/** + * xas_prev() - Move iterator to previous index. + * @xas: XArray operation state. + * + * If the @xas was in an error state, it will remain in an error state + * and this function will return %NULL. If the @xas has never been walked, + * it will have the effect of calling xas_load(). Otherwise one will be + * subtracted from the index and the state will be walked to the correct + * location in the array for the next operation. + * + * If the iterator was referencing index 0, this function wraps + * around to %ULONG_MAX. + * + * Return: The entry at the new index. This may be %NULL or an internal + * entry. + */ +static inline void *xas_prev(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (unlikely(xas_not_node(node) || node->shift || + xas->xa_offset == 0)) + return __xas_prev(xas); + + xas->xa_index--; + xas->xa_offset--; + return xa_entry(xas->xa, node, xas->xa_offset); +} + +/** + * xas_next() - Move state to next index. + * @xas: XArray operation state. + * + * If the @xas was in an error state, it will remain in an error state + * and this function will return %NULL. If the @xas has never been walked, + * it will have the effect of calling xas_load(). Otherwise one will be + * added to the index and the state will be walked to the correct + * location in the array for the next operation. + * + * If the iterator was referencing index %ULONG_MAX, this function wraps + * around to 0. + * + * Return: The entry at the new index. This may be %NULL or an internal + * entry. + */ +static inline void *xas_next(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (unlikely(xas_not_node(node) || node->shift || + xas->xa_offset == XA_CHUNK_MASK)) + return __xas_next(xas); + + xas->xa_index++; + xas->xa_offset++; + return xa_entry(xas->xa, node, xas->xa_offset); +} +#endif /* !HAVE_XARRAY_SUPPORT */ + +#endif /* _LINUX_XARRAY_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h new file mode 100644 index 0000000000000..45818dddedd94 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h @@ -0,0 +1,103 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef _LINUX_HASH_H +#define _LINUX_HASH_H +/* Fast hashing routine for ints, longs and pointers. + (C) 2002 Nadia Yvette Chambers, IBM */ + +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ + +#include + +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define GOLDEN_RATIO_PRIME_32 0x9e370001UL +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL + +#if __BITS_PER_LONG == 32 +#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32 +#define hash_long(val, bits) hash_32(val, bits) +#elif __BITS_PER_LONG == 64 +#define hash_long(val, bits) hash_64(val, bits) +#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64 +#else +#error Wordsize not 32 or 64 +#endif + +static __always_inline __u64 hash_64(__u64 val, unsigned int bits) +{ + __u64 hash = val; + + /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ + __u64 n = hash; + n <<= 18; + hash -= n; + n <<= 33; + hash -= n; + n <<= 3; + hash += n; + n <<= 3; + hash -= n; + n <<= 4; + hash += n; + n <<= 2; + hash += n; + + /* High bits are more random, so use them. */ + return hash >> (64 - bits); +} + +static inline __u32 hash_32(__u32 val, unsigned int bits) +{ + /* On some cpus multiply is faster, on others gcc will do shifts */ + __u32 hash = val * GOLDEN_RATIO_PRIME_32; + + /* High bits are more random, so use them. */ + return hash >> (32 - bits); +} + +static inline unsigned long hash_ptr(const void *ptr, unsigned int bits) +{ + return hash_long((unsigned long)ptr, bits); +} + +static inline __u32 hash32_ptr(const void *ptr) +{ + unsigned long val = (unsigned long)ptr; + +#if __BITS_PER_LONG == 64 + val ^= (val >> 32); +#endif + return (__u32)val; +} + +#endif /* _LINUX_HASH_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h new file mode 100644 index 0000000000000..a59e2c97ba2ff --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h @@ -0,0 +1,67 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
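Editor's note: to make the calling convention of these multiplicative hash helpers concrete, here is a minimal sketch: hash_ptr() (or hash_32()/hash_64()) folds a key down to @bits bits, which then indexes a power-of-two bucket array. The include path, bucket count and demo_* names are assumptions for the example only.

#include <libcfs/util/hash.h>	/* include path assumed */

#define DEMO_HASH_BITS	7			/* 128 buckets, arbitrary */
#define DEMO_HASH_SIZE	(1U << DEMO_HASH_BITS)

/* Map an object pointer to a bucket index in [0, DEMO_HASH_SIZE). */
static inline unsigned int demo_bucket_ptr(const void *obj)
{
	return hash_ptr(obj, DEMO_HASH_BITS);
}

/* Map a 64-bit identifier to a bucket index. */
static inline unsigned int demo_bucket_id(__u64 id)
{
	return hash_64(id, DEMO_HASH_BITS);
}

Because the high bits of the product are the best mixed, the helpers return the top @bits bits, so any power-of-two table size up to the word size works.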
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/util/ioctl.h + * + * Utility functions for calling ioctls. + * + */ +#ifndef _LIBCFS_IOCTL_H_ +#define _LIBCFS_IOCTL_H_ + +#include +#include + +/* Sparse annotation. */ +#define __user + +#include + +#define LIBCFS_IOC_INIT(data) \ +do { \ + memset(&(data), 0, sizeof(data)); \ + (data).ioc_hdr.ioc_version = LIBCFS_IOCTL_VERSION; \ + (data).ioc_hdr.ioc_len = sizeof(data); \ +} while (0) + +#define LIBCFS_IOC_INIT_V2(data, hdr) \ +do { \ + memset(&(data), 0, sizeof(data)); \ + (data).hdr.ioc_version = LIBCFS_IOCTL_VERSION2; \ + (data).hdr.ioc_len = sizeof(data); \ +} while (0) + +/* FIXME - rename these to libcfs_ */ +int libcfs_ioctl_pack(struct libcfs_ioctl_data *data, char **pbuf, int max); +void libcfs_ioctl_unpack(struct libcfs_ioctl_data *data, char *pbuf); +int register_ioc_dev(int dev_id, const char *dev_name); +void unregister_ioc_dev(int dev_id); +int l_ioctl(int dev_id, unsigned int opc, void *buf); +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/list.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/list.h new file mode 100644 index 0000000000000..ef69efed6cf1e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/list.h @@ -0,0 +1,499 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_UTIL_LIST_H__ +#define __LIBCFS_UTIL_LIST_H__ + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. 
+ */ + +#define prefetch(a) ((void)a) + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/** + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct list_head * new, + struct list_head * prev, + struct list_head * next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * Insert an entry at the start of a list. + * \param new new entry to be inserted + * \param head list to add it to + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, + struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/** + * Insert an entry at the end of a list. + * \param new new entry to be inserted + * \param head list to add it to + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new, + struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head *prev, + struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * Remove an entry from the list it is currently in. + * \param entry the entry to remove + * Note: list_empty(entry) does not return true after this, the entry is in an + * undefined state. + */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +/** + * Remove an entry from the list it is currently in and reinitialize it. + * \param entry the entry to remove. + */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * Remove an entry from the list it is currently in and insert it at the start + * of another list. + * \param list the entry to move + * \param head the list to move it to + */ +static inline void list_move(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * Remove an entry from the list it is currently in and insert it at the end of + * another list. + * \param list the entry to move + * \param head the list to move it to + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * Test whether a list is empty + * \param head the list to test. + */ +static inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +/** + * Test whether a list is empty and not being modified + * \param head the list to test + * + * Tests whether a list is empty _and_ checks that no other CPU might be + * in the process of modifying either member (next or prev) + * + * NOTE: using list_empty_careful() without synchronization + * can only be safe if the only activity that can happen + * to the list entry is list_del_init(). Eg. it cannot be used + * if another CPU could re-list_add() it. 
+ */ +static inline int list_empty_careful(const struct list_head *head) +{ + struct list_head *next = head->next; + return (next == head) && (next == head->prev); +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * Join two lists + * \param list the new list to add. + * \param head the place to add it in the first list. + * + * The contents of \a list are added at the start of \a head. \a list is in an + * undefined state on return. + */ +static inline void list_splice(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +static inline void list_splice_tail(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head->prev); +} + +/** + * Join two lists and reinitialise the emptied list. + * \param list the new list to add. + * \param head the place to add it in the first list. + * + * The contents of \a list are added at the start of \a head. \a list is empty + * on return. + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * Get the container of a list + * \param ptr the embedded list. + * \param type the type of the struct this is embedded in. + * \param member the member name of the list within the struct. + */ +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) + +/** + * Iterate over a list + * \param pos the iterator + * \param head the list to iterate over + * + * Behaviour is undefined if \a pos is removed from the list in the body of the + * loop. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + +/** + * Iterate over a list safely + * \param pos the iterator + * \param n temporary storage + * \param head the list to iterate over + * + * This is safe to use if \a pos could be removed from the list in the body of + * the loop. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * Iterate over a list continuing after existing point + * \param pos the type * to use as a loop counter + * \param head the list head + * \param member the name of the list_struct within the struct + */ +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member); \ + prefetch(pos->member.next), &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * \defgroup hlist Hash List + * Double linked lists with a single pointer list head. + * Mostly useful for hash tables where the two pointer list head is too + * wasteful. You lose the ability to access the tail in O(1). 
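Editor's note: before the hlist variant, here is a minimal sketch tying the plain list primitives above together: entries are queued with list_add_tail() and drained with list_for_each_safe() plus list_entry(), which tolerates list_del() inside the loop body. The struct demo_job type and the include path are assumptions for the example.

#include <stdlib.h>
#include <libcfs/util/list.h>	/* include path assumed */

/* Hypothetical work item embedding a list_head. */
struct demo_job {
	int		 id;
	struct list_head link;
};

static struct list_head demo_jobs = LIST_HEAD_INIT(demo_jobs);

static void demo_queue(struct demo_job *job)
{
	list_add_tail(&job->link, &demo_jobs);	/* FIFO order */
}

static void demo_drain(void)
{
	struct list_head *pos, *n;

	/* The _safe variant caches the next pointer, so list_del() is fine. */
	list_for_each_safe(pos, n, &demo_jobs) {
		struct demo_job *job = list_entry(pos, struct demo_job, link);

		list_del(&job->link);
		free(job);
	}
}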
+ * @{ + */ + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + +/* @} */ + +/* + * "NULL" might not be defined at this point + */ +#ifdef NULL +#define NULL_P NULL +#else +#define NULL_P ((void *)0) +#endif + +/** + * \addtogroup hlist + * @{ + */ + +#define HLIST_HEAD_INIT { NULL_P } +#define HLIST_HEAD(name) struct hlist_head name = { NULL_P } +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL_P) +#define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL_P, (ptr)->pprev = NULL_P) + +static inline int hlist_unhashed(const struct hlist_node *h) +{ + return !h->pprev; +} + +static inline int hlist_empty(const struct hlist_head *h) +{ + return !h->first; +} + +static inline void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + *pprev = next; + if (next) + next->pprev = pprev; +} + +static inline void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); +} + +static inline void hlist_del_init(struct hlist_node *n) +{ + if (n->pprev) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } +} + +static inline void hlist_add_head(struct hlist_node *n, + struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + h->first = n; + n->pprev = &h->first; +} + +/* next must be != NULL */ +static inline void hlist_add_before(struct hlist_node *n, + struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + next->pprev = &n->next; + *(n->pprev) = n; +} + +static inline void hlist_add_after(struct hlist_node *n, + struct hlist_node *next) +{ + next->next = n->next; + n->next = next; + next->pprev = &n->next; + + if(next->next) + next->next->pprev = &next->next; +} + +#define hlist_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_for_each(pos, head) \ + for (pos = (head)->first; pos && (prefetch(pos->next), 1); \ + pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ + for (pos = (head)->first; pos && (n = pos->next, 1); \ + pos = n) + +/** + * Iterate over an hlist of given type + * \param tpos the type * to use as a loop counter. + * \param pos the &struct hlist_node to use as a loop counter. + * \param head the head for your list. + * \param member the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry(tpos, pos, head, member) \ + for (pos = (head)->first; \ + pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * Iterate over an hlist continuing after existing point + * \param tpos the type * to use as a loop counter. + * \param pos the &struct hlist_node to use as a loop counter. + * \param member the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_continue(tpos, pos, member) \ + for (pos = (pos)->next; \ + pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * Iterate over an hlist continuing from an existing point + * \param tpos the type * to use as a loop counter. + * \param pos the &struct hlist_node to use as a loop counter. + * \param member the name of the hlist_node within the struct. 
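Editor's note: combined with the hash helpers added earlier in this patch, the hlist primitives are the building blocks of a simple chained hash table. The sketch below uses the four-argument hlist_for_each_entry() form defined here (it takes a separate struct hlist_node cursor, unlike the current in-kernel three-argument macro) and assumes container_of() and hash_long() are available; all demo_* names are illustrative.

struct demo_obj {
	unsigned long	  key;
	struct hlist_node hnode;
};

#define DEMO_OBJ_HASH_BITS	6	/* 64 buckets, arbitrary */
static struct hlist_head demo_obj_hash[1U << DEMO_OBJ_HASH_BITS];

static void demo_obj_insert(struct demo_obj *obj)
{
	unsigned int bkt = hash_long(obj->key, DEMO_OBJ_HASH_BITS);

	hlist_add_head(&obj->hnode, &demo_obj_hash[bkt]);
}

static struct demo_obj *demo_obj_lookup(unsigned long key)
{
	unsigned int bkt = hash_long(key, DEMO_OBJ_HASH_BITS);
	struct hlist_node *pos;
	struct demo_obj *obj;

	/* Four-argument form: (tpos, pos, head, member). */
	hlist_for_each_entry(obj, pos, &demo_obj_hash[bkt], hnode)
		if (obj->key == key)
			return obj;
	return NULL;
}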
+ */ +#define hlist_for_each_entry_from(tpos, pos, member) \ + for (; pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * Iterate over an hlist of given type safe against removal of list entry + * \param tpos the type * to use as a loop counter. + * \param pos the &struct hlist_node to use as a loop counter. + * \param n another &struct hlist_node to use as temporary storage + * \param head the head for your list. + * \param member the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_safe(tpos, pos, n, head, member) \ + for (pos = (head)->first; \ + pos && ({ n = pos->next; 1; }) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = n) + +/* @} */ + +/** + * Iterate over a list in reverse order + * \param pos the &struct list_head to use as a loop counter. + * \param head the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ + pos = pos->prev, prefetch(pos->prev)) + +/** + * Iterate over a list of given type + * \param pos the type * to use as a loop counter. + * \param head the head for your list. + * \param member the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + prefetch(pos->member.next); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member), \ + prefetch(pos->member.next)) + +/** + * Iterate backwards over a list of given type. + * \param pos the type * to use as a loop counter. + * \param head the head for your list. + * \param member the name of the list_struct within the struct. + */ +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member); \ + prefetch(pos->member.prev), &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +/** + * Iterate over a list of given type safe against removal of list entry + * \param pos the type * to use as a loop counter. + * \param n another type * to use as temporary storage + * \param head the head for your list. + * \param member the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +/** + * Iterate backwards over a list of given type safely against removal of entry + * \param pos the type * to use as a loop counter. + * \param n another type * to use as temporary storage + * \param head the head for your list. + * \param member the name of the list_struct within the struct. 
+ */ +#define list_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member), \ + n = list_entry(pos->member.prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.prev, typeof(*n), member)) + +#endif /* __LIBCFS_UTIL_LIST_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/param.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/param.h new file mode 100644 index 0000000000000..2fd1e36b07354 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/param.h @@ -0,0 +1,40 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of the + * License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + * + * LGPL HEADER END + * + * Copyright (c) 2015, James Simmons + * + * Author: + * James Simmons + */ +#ifndef _LIBCFS_UTIL_PARAM_H_ +#define _LIBCFS_UTIL_PARAM_H_ + +#include +#include + +static inline void cfs_free_param_data(glob_t *paths) +{ + globfree(paths); +} + +int cfs_get_param_paths(glob_t *paths, const char *pattern, ...) + __attribute__((__format__(__printf__, 2, 3))); + +#endif /* _LIBCFS_UTIL_PARAM_H_ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h new file mode 100644 index 0000000000000..7827718f55a48 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h @@ -0,0 +1,114 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/util/parser.h + * + * A command line parser. 
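Editor's note: the param.h helper above wraps glob(3) to resolve Lustre parameter files under /proc or /sys. A hedged sketch of typical usage follows; the pattern string is only an example, the include path is assumed, and error handling is minimal.

#include <stdio.h>
#include <libcfs/util/param.h>	/* include path assumed */

/* Print every path matching a parameter pattern, e.g. "osc.*.max_dirty_mb". */
static int demo_print_params(const char *pattern)
{
	glob_t paths;
	size_t i;

	if (cfs_get_param_paths(&paths, "%s", pattern) != 0)
		return -1;	/* no match or glob failure */

	for (i = 0; i < paths.gl_pathc; i++)
		printf("%s\n", paths.gl_pathv[i]);

	cfs_free_param_data(&paths);	/* wraps globfree() */
	return 0;
}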
+ *
+ */
+
+#ifndef _PARSER_H_
+#define _PARSER_H_
+
+#define HISTORY 100 /* Don't let history grow unbounded */
+#define MAXARGS 512
+
+#define CMD_COMPLETE 0
+#define CMD_INCOMPLETE 1
+#define CMD_NONE 2
+#define CMD_AMBIG 3
+#define CMD_HELP 4
+
+typedef struct parser_cmd {
+ char *pc_name;
+ int (* pc_func)(int, char **);
+ struct parser_cmd * pc_sub_cmd;
+ char *pc_help;
+} command_t;
+
+typedef struct argcmd {
+ char *ac_name;
+ int (*ac_func)(int, char **);
+ char *ac_help;
+} argcmd_t;
+
+typedef struct network {
+ char *type;
+ char *server;
+ int port;
+} network_t;
+
+int Parser_quit(int argc, char **argv);
+int Parser_version(int argc, char **argv);
+void Parser_init(char *, command_t *); /* Set prompt and load command list */
+int Parser_commands(void); /* Start the command parser */
+void Parser_qhelp(int, char **); /* Quick help routine */
+int Parser_help(int, char **); /* Detailed help routine */
+void Parser_ignore_errors(int ignore); /* Set the ignore errors flag */
+void Parser_printhelp(char *); /* Detailed help routine */
+void Parser_exit(int, char **); /* Shuts down command parser */
+int Parser_execarg(int argc, char **argv, command_t cmds[]);
+int execute_line(char * line);
+int Parser_list_commands(const command_t *cmdlist, char *buffer,
+ size_t buf_size, const char *parent_cmd,
+ int col_start, int col_num);
+
+/* Converts a string to an integer */
+int Parser_int(char *, int *);
+
+/* Prompts for a string, with default values and a maximum length */
+char *Parser_getstr(const char *prompt, const char *deft, char *res,
+ size_t len);
+
+/* Prompts for an integer, with minimum, maximum and default values and base */
+int Parser_getint(const char *prompt, long min, long max, long deft,
+ int base);
+
+/* Prompts for a yes/no, with default */
+int Parser_getbool(const char *prompt, int deft);
+
+/* Extracts an integer from a string, or prompts if it cannot get one */
+long Parser_intarg(const char *inp, const char *prompt, int deft,
+ int min, int max, int base);
+
+/* Extracts a word from the input, or prompts if it cannot get one */
+char *Parser_strarg(char *inp, const char *prompt, const char *deft,
+ char *answer, int len);
+
+/* Extracts an integer from a string with a base */
+int Parser_arg2int(const char *inp, long *result, int base);
+
+/* Convert human readable size string to an int; "1k" -> 1000 */
+int Parser_size(unsigned long *sizep, char *str);
+
+/* Convert a string boolean to an int; "enable" -> 1 */
+int Parser_bool(int *b, char *str);
+
+#endif
diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h
new file mode 100644
index 0000000000000..97d9adb6984d3
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h
@@ -0,0 +1,143 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
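Editor's note: to show how the parser interface above is meant to be wired up, here is a sketch of a tiny interactive tool: a command_t table, Parser_init() to set the prompt, and Parser_commands() to run the read-eval loop, with Parser_execarg() handling one-shot invocation. The command names, handler and include path are invented for the example.

#include <stdio.h>
#include <libcfs/util/parser.h>	/* include path assumed */

static int jt_ping(int argc, char **argv)
{
	printf("pong\n");
	return 0;
}

static command_t cmdlist[] = {
	{ "ping", jt_ping,     NULL, "ping: print pong" },
	{ "help", Parser_help, NULL, "help: list commands" },
	{ "quit", Parser_quit, NULL, "quit: exit the tool" },
	{ NULL, NULL, NULL, NULL }
};

int main(int argc, char **argv)
{
	Parser_init("demo > ", cmdlist);	/* prompt string is arbitrary */
	if (argc > 1)
		return Parser_execarg(argc - 1, argv + 1, cmdlist);
	return Parser_commands();		/* interactive loop */
}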
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/libcfs_string.h + * + * Generic string manipulation functions. + * + * Author: Nathan Rutman + */ + +#ifndef __LIBCFS_UTIL_STRING_H__ +#define __LIBCFS_UTIL_STRING_H__ + +#include +#include + +#include +#include +#include + +static inline +int vscnprintf(char *buf, size_t bufsz, const char *format, va_list args) +{ + int ret; + + if (!bufsz) + return 0; + + ret = vsnprintf(buf, bufsz, format, args); + return (bufsz > ret) ? ret : bufsz - 1; +} + +/* __printf from linux kernel */ +#ifndef __printf +#define __printf(a, b) __attribute__((__format__(printf, a, b))) +#endif + +__printf(3, 4) +static inline int scnprintf(char *buf, size_t bufsz, const char *format, ...) +{ + int ret; + va_list args; + + va_start(args, format); + ret = vscnprintf(buf, bufsz, format, args); + va_end(args); + + return ret; +} + +struct netstrfns { + __u32 nf_type; + char *nf_name; + char *nf_modname; + void (*nf_addr2str)(__u32 addr, char *str, size_t size); + int (*nf_str2addr)(const char *str, int nob, __u32 *addr); + int (*nf_parse_addrlist)(char *str, int len, + struct list_head *list); + int (*nf_print_addrlist)(char *buffer, int count, + struct list_head *list); + int (*nf_match_addr)(__u32 addr, struct list_head *list); + int (*nf_min_max)(struct list_head *nidlist, __u32 *min_nid, + __u32 *max_nid); + int (*nf_expand_addrrange)(struct list_head *addrranges, + __u32 *addrs, int max_addrs); +}; + +/** + * Structure to represent NULL-less strings. + */ +struct cfs_lstr { + char *ls_str; + int ls_len; +}; + +/* + * Structure to represent \ token of the syntax. + */ +struct cfs_range_expr { + /* + * Link to cfs_expr_list::el_exprs. 
+ */ + struct list_head re_link; + __u32 re_lo; + __u32 re_hi; + __u32 re_stride; +}; + +struct cfs_expr_list { + struct list_head el_link; + struct list_head el_exprs; +}; + +int cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp); +int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res); +int cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max); +int cfs_expr2str(struct list_head *list, char *str, size_t size); +int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list); +int cfs_expr_list_print(char *buffer, int count, + struct cfs_expr_list *expr_list); +int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp); +void cfs_expr_list_free(struct cfs_expr_list *expr_list); +void cfs_expr_list_free_list(struct list_head *list); +int cfs_ip_addr_parse(char *str, int len, struct list_head *list); +int cfs_ip_addr_range_gen(__u32 *ip_list, int count, + struct list_head *ip_addr_expr); +int cfs_ip_addr_match(__u32 addr, struct list_head *list); +int cfs_expand_nidlist(struct list_head *nidlist, lnet_nid_t *lnet_nidlist, + int max_nids); +int cfs_parse_nid_parts(char *str, struct list_head *addr, + struct list_head *net_num, __u32 *net_type); +int cfs_abs_path(const char *request_path, char **resolved_path); + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/uapi/linux/llcrypt.h b/drivers/staging/lustrefsx/libcfs/include/uapi/linux/llcrypt.h new file mode 100644 index 0000000000000..c133859bc2169 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/uapi/linux/llcrypt.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * llcrypt user API + * + * These ioctls can be used on filesystems that support llcrypt. See the + * "User API" section of Documentation/filesystems/llcrypt.rst. + */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ +#ifndef _UAPI_LINUX_LLCRYPT_H +#define _UAPI_LINUX_LLCRYPT_H + +#include + +/* Encryption policy flags */ +#define LLCRYPT_POLICY_FLAGS_PAD_4 0x00 +#define LLCRYPT_POLICY_FLAGS_PAD_8 0x01 +#define LLCRYPT_POLICY_FLAGS_PAD_16 0x02 +#define LLCRYPT_POLICY_FLAGS_PAD_32 0x03 +#define LLCRYPT_POLICY_FLAGS_PAD_MASK 0x03 +#define LLCRYPT_POLICY_FLAG_DIRECT_KEY 0x04 +#define LLCRYPT_POLICY_FLAGS_VALID 0x07 + +/* Encryption algorithms */ +#define LLCRYPT_MODE_NULL 0 +#define LLCRYPT_MODE_AES_256_XTS 1 +#define LLCRYPT_MODE_AES_256_CTS 4 +#define LLCRYPT_MODE_AES_128_CBC 5 +#define LLCRYPT_MODE_AES_128_CTS 6 +#define LLCRYPT_MODE_ADIANTUM 9 +#define __LLCRYPT_MODE_MAX 9 + +/* + * Legacy policy version; ad-hoc KDF and no key verification. + * For new encrypted directories, use llcrypt_policy_v2 instead. + * + * Careful: the .version field for this is actually 0, not 1. + */ +#define LLCRYPT_POLICY_V1 0 +#define LLCRYPT_KEY_DESCRIPTOR_SIZE 8 +struct llcrypt_policy_v1 { + __u8 version; + __u8 contents_encryption_mode; + __u8 filenames_encryption_mode; + __u8 flags; + __u8 master_key_descriptor[LLCRYPT_KEY_DESCRIPTOR_SIZE]; +}; +#define llcrypt_policy llcrypt_policy_v1 + +/* + * Process-subscribed "logon" key description prefix and payload format. + * Deprecated; prefer LL_IOC_ADD_ENCRYPTION_KEY instead. + */ +#define LLCRYPT_KEY_DESC_PREFIX "fscrypt:" +#define LLCRYPT_KEY_DESC_PREFIX_SIZE 8 +#define LLCRYPT_MAX_KEY_SIZE 64 +struct llcrypt_key { + __u32 mode; + __u8 raw[LLCRYPT_MAX_KEY_SIZE]; + __u32 size; +}; + +/* + * New policy version with HKDF and key verification (recommended). 
+ */ +#define LLCRYPT_POLICY_V2 2 +#define LLCRYPT_KEY_IDENTIFIER_SIZE 16 +struct llcrypt_policy_v2 { + __u8 version; + __u8 contents_encryption_mode; + __u8 filenames_encryption_mode; + __u8 flags; + __u8 __reserved[4]; + __u8 master_key_identifier[LLCRYPT_KEY_IDENTIFIER_SIZE]; +}; + +/* Struct passed to LL_IOC_GET_ENCRYPTION_POLICY_EX */ +struct llcrypt_get_policy_ex_arg { + __u64 policy_size; /* input/output */ + union { + __u8 version; + struct llcrypt_policy_v1 v1; + struct llcrypt_policy_v2 v2; + } policy; /* output */ +}; + +/* + * v1 policy keys are specified by an arbitrary 8-byte key "descriptor", + * matching llcrypt_policy_v1::master_key_descriptor. + */ +#define LLCRYPT_KEY_SPEC_TYPE_DESCRIPTOR 1 + +/* + * v2 policy keys are specified by a 16-byte key "identifier" which the kernel + * calculates as a cryptographic hash of the key itself, + * matching llcrypt_policy_v2::master_key_identifier. + */ +#define LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER 2 + +/* + * Specifies a key, either for v1 or v2 policies. This doesn't contain the + * actual key itself; this is just the "name" of the key. + */ +struct llcrypt_key_specifier { + __u32 type; /* one of LLCRYPT_KEY_SPEC_TYPE_* */ + __u32 __reserved; + union { + __u8 __reserved[32]; /* reserve some extra space */ + __u8 descriptor[LLCRYPT_KEY_DESCRIPTOR_SIZE]; + __u8 identifier[LLCRYPT_KEY_IDENTIFIER_SIZE]; + } u; +}; + +/* Struct passed to LL_IOC_ADD_ENCRYPTION_KEY */ +struct llcrypt_add_key_arg { + struct llcrypt_key_specifier key_spec; + __u32 raw_size; + __u32 __reserved[9]; + __u8 raw[]; +}; + +/* Struct passed to LL_IOC_REMOVE_ENCRYPTION_KEY */ +struct llcrypt_remove_key_arg { + struct llcrypt_key_specifier key_spec; +#define LLCRYPT_KEY_REMOVAL_STATUS_FLAG_FILES_BUSY 0x00000001 +#define LLCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS 0x00000002 + __u32 removal_status_flags; /* output */ + __u32 __reserved[5]; +}; + +/* Struct passed to LL_IOC_GET_ENCRYPTION_KEY_STATUS */ +struct llcrypt_get_key_status_arg { + /* input */ + struct llcrypt_key_specifier key_spec; + __u32 __reserved[6]; + + /* output */ +#define LLCRYPT_KEY_STATUS_ABSENT 1 +#define LLCRYPT_KEY_STATUS_PRESENT 2 +#define LLCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED 3 + __u32 status; +#define LLCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF 0x00000001 + __u32 status_flags; + __u32 user_count; + __u32 __out_reserved[13]; +}; + +#define LL_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct llcrypt_policy) +#define LL_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) +#define LL_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct llcrypt_policy) +#define LL_IOC_GET_ENCRYPTION_POLICY_EX _IOWR('f', 22, __u8[9]) /* size + version */ +#define LL_IOC_ADD_ENCRYPTION_KEY _IOWR('f', 23, struct llcrypt_add_key_arg) +#define LL_IOC_REMOVE_ENCRYPTION_KEY _IOWR('f', 24, struct llcrypt_remove_key_arg) +#define LL_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS _IOWR('f', 25, struct llcrypt_remove_key_arg) +#define LL_IOC_GET_ENCRYPTION_KEY_STATUS _IOWR('f', 26, struct llcrypt_get_key_status_arg) + +/**********************************************************************/ + +/* old names; don't add anything new here! 
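Editor's note: for the key-management ioctls defined above, the user-space call sequence is expected to look roughly like the sketch below: zero a struct llcrypt_add_key_arg with trailing storage for the raw key, pick the v2 identifier key-specifier type, and issue LL_IOC_ADD_ENCRYPTION_KEY on a file descriptor for the mount. The include path and helper name are assumptions; on success the kernel is expected to fill in key_spec.u.identifier.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/llcrypt.h>	/* include path assumed */

/* Add a raw v2 master key to the filesystem that @mount_fd refers to. */
static int demo_add_key(int mount_fd, const __u8 *key, __u32 key_size)
{
	struct {
		struct llcrypt_add_key_arg arg;
		__u8 raw[LLCRYPT_MAX_KEY_SIZE];	/* storage for arg.raw[] */
	} buf;

	if (key_size > LLCRYPT_MAX_KEY_SIZE)
		return -1;

	memset(&buf, 0, sizeof(buf));
	buf.arg.key_spec.type = LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
	buf.arg.raw_size = key_size;
	memcpy(buf.arg.raw, key, key_size);

	return ioctl(mount_fd, LL_IOC_ADD_ENCRYPTION_KEY, &buf.arg);
}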
*/ +#ifndef __KERNEL__ +#define LL_KEY_DESCRIPTOR_SIZE LLCRYPT_KEY_DESCRIPTOR_SIZE +#define LL_POLICY_FLAGS_PAD_4 LLCRYPT_POLICY_FLAGS_PAD_4 +#define LL_POLICY_FLAGS_PAD_8 LLCRYPT_POLICY_FLAGS_PAD_8 +#define LL_POLICY_FLAGS_PAD_16 LLCRYPT_POLICY_FLAGS_PAD_16 +#define LL_POLICY_FLAGS_PAD_32 LLCRYPT_POLICY_FLAGS_PAD_32 +#define LL_POLICY_FLAGS_PAD_MASK LLCRYPT_POLICY_FLAGS_PAD_MASK +#define LL_POLICY_FLAG_DIRECT_KEY LLCRYPT_POLICY_FLAG_DIRECT_KEY +#define LL_POLICY_FLAGS_VALID LLCRYPT_POLICY_FLAGS_VALID +#define LL_ENCRYPTION_MODE_INVALID 0 /* never used */ +#define LL_ENCRYPTION_MODE_AES_256_XTS LLCRYPT_MODE_AES_256_XTS +#define LL_ENCRYPTION_MODE_AES_256_GCM 2 /* never used */ +#define LL_ENCRYPTION_MODE_AES_256_CBC 3 /* never used */ +#define LL_ENCRYPTION_MODE_AES_256_CTS LLCRYPT_MODE_AES_256_CTS +#define LL_ENCRYPTION_MODE_AES_128_CBC LLCRYPT_MODE_AES_128_CBC +#define LL_ENCRYPTION_MODE_AES_128_CTS LLCRYPT_MODE_AES_128_CTS +#define LL_ENCRYPTION_MODE_SPECK128_256_XTS 7 /* removed */ +#define LL_ENCRYPTION_MODE_SPECK128_256_CTS 8 /* removed */ +#define LL_ENCRYPTION_MODE_ADIANTUM LLCRYPT_MODE_ADIANTUM +#define LL_KEY_DESC_PREFIX LLCRYPT_KEY_DESC_PREFIX +#define LL_KEY_DESC_PREFIX_SIZE LLCRYPT_KEY_DESC_PREFIX_SIZE +#define LL_MAX_KEY_SIZE LLCRYPT_MAX_KEY_SIZE +#endif /* !__KERNEL__ */ + +#endif /* _UAPI_LINUX_LLCRYPT_H */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/Makefile b/drivers/staging/lustrefsx/libcfs/libcfs/Makefile new file mode 100644 index 0000000000000..0cbfedc0f087e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/Makefile @@ -0,0 +1,27 @@ +obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs.o + +libcfs-linux-objs := linux-prim.o +libcfs-linux-objs += linux-hash.o +libcfs-linux-objs += linux-wait.o +libcfs-linux-objs += glob.o +libcfs-linux-objs += xarray.o + +libcfs-crypto-objs := crypto.o fname.o hkdf.o hooks.o keyring.o +libcfs-crypto-objs += keysetup.o keysetup_v1.o policy.o + +libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs)) +libcfs-crypto-objs := $(addprefix crypto/,$(libcfs-crypto-objs)) + + +libcfs-all-objs := libcfs_cpu.o +libcfs-all-objs += debug.o fail.o module.o tracefile.o +libcfs-all-objs += libcfs_string.o hash.o workitem.o +libcfs-all-objs += libcfs_mem.o libcfs_lock.o +libcfs-all-objs += linux-crypto.o linux-crypto-adler.o + +libcfs-y += $(libcfs-linux-objs) $(libcfs-all-objs) +libcfs-y += $(libcfs-crypto-objs) + +ccflags-y += -I$(src) + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/crypto.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/crypto.c new file mode 100644 index 0000000000000..3d18715a06c3d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/crypto.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This contains encryption functions for per-file encryption. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add llcrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. 
+ */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "llcrypt_private.h" + +#ifdef HAVE_CIPHER_H +#include + +MODULE_IMPORT_NS(CRYPTO_INTERNAL); +#endif + +static unsigned int num_prealloc_crypto_pages = 32; +static unsigned int num_prealloc_crypto_ctxs = 128; + +module_param(num_prealloc_crypto_pages, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_pages, + "Number of crypto pages to preallocate"); +module_param(num_prealloc_crypto_ctxs, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_ctxs, + "Number of crypto contexts to preallocate"); + +static mempool_t *llcrypt_bounce_page_pool = NULL; + +static LIST_HEAD(llcrypt_free_ctxs); +static DEFINE_SPINLOCK(llcrypt_ctx_lock); + +static struct workqueue_struct *llcrypt_read_workqueue; +static DEFINE_MUTEX(llcrypt_init_mutex); + +static struct kmem_cache *llcrypt_ctx_cachep; +struct kmem_cache *llcrypt_info_cachep; + +void llcrypt_enqueue_decrypt_work(struct work_struct *work) +{ + queue_work(llcrypt_read_workqueue, work); +} +EXPORT_SYMBOL(llcrypt_enqueue_decrypt_work); + +/** + * llcrypt_release_ctx() - Release a decryption context + * @ctx: The decryption context to release. + * + * If the decryption context was allocated from the pre-allocated pool, return + * it to that pool. Else, free it. + */ +void llcrypt_release_ctx(struct llcrypt_ctx *ctx) +{ + unsigned long flags; + + if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { + kmem_cache_free(llcrypt_ctx_cachep, ctx); + } else { + spin_lock_irqsave(&llcrypt_ctx_lock, flags); + list_add(&ctx->free_list, &llcrypt_free_ctxs); + spin_unlock_irqrestore(&llcrypt_ctx_lock, flags); + } +} +EXPORT_SYMBOL(llcrypt_release_ctx); + +/** + * llcrypt_get_ctx() - Get a decryption context + * @gfp_flags: The gfp flag for memory allocation + * + * Allocate and initialize a decryption context. + * + * Return: A new decryption context on success; an ERR_PTR() otherwise. + */ +struct llcrypt_ctx *llcrypt_get_ctx(gfp_t gfp_flags) +{ + struct llcrypt_ctx *ctx; + unsigned long flags; + + /* + * First try getting a ctx from the free list so that we don't have to + * call into the slab allocator. + */ + spin_lock_irqsave(&llcrypt_ctx_lock, flags); + ctx = list_first_entry_or_null(&llcrypt_free_ctxs, + struct llcrypt_ctx, free_list); + if (ctx) + list_del(&ctx->free_list); + spin_unlock_irqrestore(&llcrypt_ctx_lock, flags); + if (!ctx) { + ctx = kmem_cache_zalloc(llcrypt_ctx_cachep, gfp_flags); + if (!ctx) + return ERR_PTR(-ENOMEM); + ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } + return ctx; +} +EXPORT_SYMBOL(llcrypt_get_ctx); + +struct page *llcrypt_alloc_bounce_page(gfp_t gfp_flags) +{ + return mempool_alloc(llcrypt_bounce_page_pool, gfp_flags); +} + +/** + * llcrypt_free_bounce_page() - free a ciphertext bounce page + * + * Free a bounce page that was allocated by llcrypt_encrypt_pagecache_blocks(), + * or by llcrypt_alloc_bounce_page() directly. 
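Editor's note: a minimal sketch of how the pre-allocated context pool above is used: llcrypt_get_ctx() hands back either a pooled or a freshly allocated context, and llcrypt_release_ctx() returns it to the right place based on FS_CTX_REQUIRES_FREE_ENCRYPT_FL. The surrounding work-item logic is elided.

/* Borrow a decryption context for one read-path work item (sketch only). */
static int demo_with_ctx(void)
{
	struct llcrypt_ctx *ctx;

	ctx = llcrypt_get_ctx(GFP_NOFS);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/* ... attach ctx to a bio or work item and perform the decryption ... */

	llcrypt_release_ctx(ctx);
	return 0;
}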
+ */ +void llcrypt_free_bounce_page(struct page *bounce_page) +{ + if (!bounce_page) + return; + set_page_private(bounce_page, (unsigned long)NULL); + ClearPagePrivate(bounce_page); + mempool_free(bounce_page, llcrypt_bounce_page_pool); +} +EXPORT_SYMBOL(llcrypt_free_bounce_page); + +void llcrypt_generate_iv(union llcrypt_iv *iv, u64 lblk_num, + const struct llcrypt_info *ci) +{ + memset(iv, 0, ci->ci_mode->ivsize); + iv->lblk_num = cpu_to_le64(lblk_num); + + if (llcrypt_is_direct_key_policy(&ci->ci_policy)) + memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE); + + if (ci->ci_essiv_tfm != NULL) + crypto_cipher_encrypt_one(ci->ci_essiv_tfm, iv->raw, iv->raw); +} + +/* Encrypt or decrypt a single filesystem block of file contents */ +int llcrypt_crypt_block(const struct inode *inode, llcrypt_direction_t rw, + u64 lblk_num, struct page *src_page, + struct page *dest_page, unsigned int len, + unsigned int offs, gfp_t gfp_flags) +{ + union llcrypt_iv iv; + struct skcipher_request *req = NULL; + DECLARE_CRYPTO_WAIT(wait); + struct scatterlist dst, src; + struct llcrypt_info *ci = llcrypt_info(inode); + struct crypto_skcipher *tfm = ci->ci_ctfm; + int res = 0; + + if (tfm == NULL) { + if (dest_page != src_page) + memcpy(page_address(dest_page), page_address(src_page), + PAGE_SIZE); + return 0; + } + + if (WARN_ON_ONCE(len <= 0)) + return -EINVAL; + if (WARN_ON_ONCE(len % LL_CRYPTO_BLOCK_SIZE != 0)) + return -EINVAL; + + llcrypt_generate_iv(&iv, lblk_num, ci); + + req = skcipher_request_alloc(tfm, gfp_flags); + if (!req) + return -ENOMEM; + + skcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, &wait); + + sg_init_table(&dst, 1); + sg_set_page(&dst, dest_page, len, offs); + sg_init_table(&src, 1); + sg_set_page(&src, src_page, len, offs); + skcipher_request_set_crypt(req, &src, &dst, len, &iv); + if (rw == FS_DECRYPT) + res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); + else + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + skcipher_request_free(req); + if (res) { + llcrypt_err(inode, "%scryption failed for block %llu: %d", + (rw == FS_DECRYPT ? "De" : "En"), lblk_num, res); + return res; + } + return 0; +} + +/** + * llcrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a pagecache page + * @page: The locked pagecache page containing the block(s) to encrypt + * @len: Total size of the block(s) to encrypt. Must be a nonzero + * multiple of the filesystem's block size. + * @offs: Byte offset within @page of the first block to encrypt. Must be + * a multiple of the filesystem's block size. + * @gfp_flags: Memory allocation flags + * + * A new bounce page is allocated, and the specified block(s) are encrypted into + * it. In the bounce page, the ciphertext block(s) will be located at the same + * offsets at which the plaintext block(s) were located in the source page; any + * other parts of the bounce page will be left uninitialized. However, normally + * blocksize == PAGE_SIZE and the whole page is encrypted at once. + * + * This is for use by the filesystem's ->writepages() method. 
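Editor's note: a hedged sketch of the intended write-path usage, assuming blocksize == PAGE_SIZE: encrypt the locked pagecache page into a bounce page, submit the bounce page for block I/O, then free it. The demo_submit_bio() helper is hypothetical, standing in for whatever the filesystem uses to write the block.

/* Hypothetical ->writepage-style step for one pagecache page. */
static int demo_write_encrypted_page(struct page *page)
{
	struct page *bounce;
	int err;

	bounce = llcrypt_encrypt_pagecache_blocks(page, PAGE_SIZE, 0,
						  GFP_NOFS);
	if (IS_ERR(bounce))
		return PTR_ERR(bounce);

	/* Submit the ciphertext, never the original pagecache page. */
	err = demo_submit_bio(bounce, page->index);	/* hypothetical */

	llcrypt_free_bounce_page(bounce);
	return err;
}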
+ * + * Return: the new encrypted bounce page on success; an ERR_PTR() on failure + */ +struct page *llcrypt_encrypt_pagecache_blocks(struct page *page, + unsigned int len, + unsigned int offs, + gfp_t gfp_flags) + +{ + const struct inode *inode = page->mapping->host; + const unsigned int blockbits = inode->i_blkbits; + const unsigned int blocksize = 1 << blockbits; + struct page *ciphertext_page; + u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) + + (offs >> blockbits); + unsigned int i; + int err; + + if (WARN_ON_ONCE(!PageLocked(page))) + return ERR_PTR(-EINVAL); + + if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize))) + return ERR_PTR(-EINVAL); + + ciphertext_page = llcrypt_alloc_bounce_page(gfp_flags); + if (!ciphertext_page) + return ERR_PTR(-ENOMEM); + + for (i = offs; i < offs + len; i += blocksize, lblk_num++) { + err = llcrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, + blocksize, i, gfp_flags); + if (err) { + llcrypt_free_bounce_page(ciphertext_page); + return ERR_PTR(err); + } + } + SetPagePrivate(ciphertext_page); + set_page_private(ciphertext_page, (unsigned long)page); + return ciphertext_page; +} +EXPORT_SYMBOL(llcrypt_encrypt_pagecache_blocks); + +/** + * llcrypt_encrypt_block() - Encrypt a filesystem block in a page + * @inode: The inode to which this block belongs + * @src: The page containing the block to encrypt + * @dst: The page which will contain the encrypted data + * @len: Size of block to encrypt. Doesn't need to be a multiple of the + * fs block size, but must be a multiple of LL_CRYPTO_BLOCK_SIZE. + * @offs: Byte offset within @page at which the block to encrypt begins + * @lblk_num: Filesystem logical block number of the block, i.e. the 0-based + * number of the block within the file + * @gfp_flags: Memory allocation flags + * + * Encrypt a possibly-compressed filesystem block that is located in an + * arbitrary page, not necessarily in the original pagecache page. The @inode + * and @lblk_num must be specified, as they can't be determined from @page. + * The decrypted data will be stored in @dst. + * + * Return: 0 on success; -errno on failure + */ +int llcrypt_encrypt_block(const struct inode *inode, struct page *src, + struct page *dst, unsigned int len, unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) +{ + return llcrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, src, dst, + len, offs, gfp_flags); +} +EXPORT_SYMBOL(llcrypt_encrypt_block); + +/** + * llcrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a pagecache page + * @page: The locked pagecache page containing the block(s) to decrypt + * @len: Total size of the block(s) to decrypt. Must be a nonzero + * multiple of the filesystem's block size. + * @offs: Byte offset within @page of the first block to decrypt. Must be + * a multiple of the filesystem's block size. + * + * The specified block(s) are decrypted in-place within the pagecache page, + * which must still be locked and not uptodate. Normally, blocksize == + * PAGE_SIZE and the whole page is decrypted at once. + * + * This is for use by the filesystem's ->readpages() method. 
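Editor's note: and the matching read side, again assuming blocksize == PAGE_SIZE: once the block I/O into the locked, not-yet-uptodate pagecache page has completed, the data is decrypted in place before the page is marked uptodate and unlocked. This is a sketch of the expected call site, not code from the patch.

/* Hypothetical read-completion step for one pagecache page. */
static void demo_read_complete(struct page *page, int io_error)
{
	if (!io_error &&
	    llcrypt_decrypt_pagecache_blocks(page, PAGE_SIZE, 0) == 0)
		SetPageUptodate(page);
	unlock_page(page);
}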
+ * + * Return: 0 on success; -errno on failure + */ +int llcrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len, + unsigned int offs) +{ + const struct inode *inode = page->mapping->host; + const unsigned int blockbits = inode->i_blkbits; + const unsigned int blocksize = 1 << blockbits; + u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) + + (offs >> blockbits); + unsigned int i; + int err; + + if (WARN_ON_ONCE(!PageLocked(page))) + return -EINVAL; + + if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize))) + return -EINVAL; + + for (i = offs; i < offs + len; i += blocksize, lblk_num++) { + err = llcrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, + page, blocksize, i, GFP_NOFS); + if (err) + return err; + } + return 0; +} +EXPORT_SYMBOL(llcrypt_decrypt_pagecache_blocks); + +/** + * llcrypt_decrypt_block() - Cache a decrypted filesystem block in a page + * @inode: The inode to which this block belongs + * @src: The page containing the block to decrypt + * @dst: The page which will contain the plain data + * @len: Size of block to decrypt. Doesn't need to be a multiple of the + * fs block size, but must be a multiple of LL_CRYPTO_BLOCK_SIZE. + * @offs: Byte offset within @page at which the block to decrypt begins + * @lblk_num: Filesystem logical block number of the block, i.e. the 0-based + * number of the block within the file + * + * Decrypt a possibly-compressed filesystem block that is located in an + * arbitrary page, not necessarily in the original pagecache page. The @inode + * and @lblk_num must be specified, as they can't be determined from @page. + * The encrypted data will be stored in @dst. + * + * Return: 0 on success; -errno on failure + */ +int llcrypt_decrypt_block(const struct inode *inode, struct page *src, + struct page *dst, unsigned int len, unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) +{ + return llcrypt_crypt_block(inode, FS_DECRYPT, lblk_num, src, dst, + len, offs, gfp_flags); +} +EXPORT_SYMBOL(llcrypt_decrypt_block); + +/* + * Validate dentries in encrypted directories to make sure we aren't potentially + * caching stale dentries after a key has been added. + */ +static int llcrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct dentry *dir; + int err; + int valid; + + /* + * Plaintext names are always valid, since llcrypt doesn't support + * reverting to ciphertext names without evicting the directory's inode + * -- which implies eviction of the dentries in the directory. + */ + if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME)) + return 1; + + /* + * Ciphertext name; valid if the directory's key is still unavailable. + * + * Although llcrypt forbids rename() on ciphertext names, we still must + * use dget_parent() here rather than use ->d_parent directly. That's + * because a corrupted fs image may contain directory hard links, which + * the VFS handles by moving the directory's dentry tree in the dcache + * each time ->lookup() finds the directory and it already has a dentry + * elsewhere. Thus ->d_parent can be changing, and we must safely grab + * a reference to some ->d_parent to prevent it from being freed. 
+ */ + + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dget_parent(dentry); + err = llcrypt_get_encryption_info(d_inode(dir)); + valid = !llcrypt_has_encryption_key(d_inode(dir)); + dput(dir); + + if (err < 0) + return err; + + return valid; +} + +const struct dentry_operations llcrypt_d_ops = { + .d_revalidate = llcrypt_d_revalidate, +}; + +static void llcrypt_destroy(void) +{ + struct llcrypt_ctx *pos, *n; + + list_for_each_entry_safe(pos, n, &llcrypt_free_ctxs, free_list) + kmem_cache_free(llcrypt_ctx_cachep, pos); + INIT_LIST_HEAD(&llcrypt_free_ctxs); + mempool_destroy(llcrypt_bounce_page_pool); + llcrypt_bounce_page_pool = NULL; +} + +/** + * llcrypt_initialize() - allocate major buffers for fs encryption. + * @cop_flags: llcrypt operations flags + * + * We only call this when we start accessing encrypted files, since it + * results in memory getting allocated that wouldn't otherwise be used. + * + * Return: Zero on success, non-zero otherwise. + */ +int llcrypt_initialize(unsigned int cop_flags) +{ + int i, res = -ENOMEM; + + /* No need to allocate a bounce page pool if this FS won't use it. */ + if (cop_flags & LL_CFLG_OWN_PAGES) + return 0; + + mutex_lock(&llcrypt_init_mutex); + if (llcrypt_bounce_page_pool) + goto already_initialized; + + for (i = 0; i < num_prealloc_crypto_ctxs; i++) { + struct llcrypt_ctx *ctx; + + ctx = kmem_cache_zalloc(llcrypt_ctx_cachep, GFP_NOFS); + if (!ctx) + goto fail; + list_add(&ctx->free_list, &llcrypt_free_ctxs); + } + + llcrypt_bounce_page_pool = + mempool_create_page_pool(num_prealloc_crypto_pages, 0); + if (!llcrypt_bounce_page_pool) + goto fail; + +already_initialized: + mutex_unlock(&llcrypt_init_mutex); + return 0; +fail: + llcrypt_destroy(); + mutex_unlock(&llcrypt_init_mutex); + return res; +} + +void llcrypt_msg(const struct inode *inode, int mask, + const char *fmt, ...) +{ + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct va_format vaf; + va_list args; + + if (!__ratelimit(&rs)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (inode) + CDEBUG(mask, "llcrypt (%s, inode %lu): %pV\n", + inode->i_sb->s_id, inode->i_ino, &vaf); + else + CDEBUG(mask, "llcrypt: %pV\n", &vaf); + va_end(args); +} + +/** + * llcrypt_init() - Set up for fs encryption. + */ +int __init llcrypt_init(void) +{ + int err = -ENOMEM; + + /* + * Use an unbound workqueue to allow bios to be decrypted in parallel + * even when they happen to complete on the same CPU. This sacrifices + * locality, but it's worthwhile since decryption is CPU-intensive. + * + * Also use a high-priority workqueue to prioritize decryption work, + * which blocks reads from completing, over regular application tasks. + */ + llcrypt_read_workqueue = alloc_workqueue("llcrypt_read_queue", + WQ_UNBOUND | WQ_HIGHPRI, + num_online_cpus()); + if (!llcrypt_read_workqueue) + goto fail; + + llcrypt_ctx_cachep = KMEM_CACHE(llcrypt_ctx, SLAB_RECLAIM_ACCOUNT); + if (!llcrypt_ctx_cachep) + goto fail_free_queue; + + llcrypt_info_cachep = KMEM_CACHE(llcrypt_info, SLAB_RECLAIM_ACCOUNT); + if (!llcrypt_info_cachep) + goto fail_free_ctx; + + err = llcrypt_init_keyring(); + if (err) + goto fail_free_info; + + return 0; + +fail_free_info: + kmem_cache_destroy(llcrypt_info_cachep); +fail_free_ctx: + kmem_cache_destroy(llcrypt_ctx_cachep); +fail_free_queue: + destroy_workqueue(llcrypt_read_workqueue); +fail: + return err; +} + +/** + * llcrypt_exit() - Clean up for fs encryption. 
+ */ +void __exit llcrypt_exit(void) +{ + llcrypt_exit_keyring(); + + llcrypt_destroy(); + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + + kmem_cache_destroy(llcrypt_info_cachep); + kmem_cache_destroy(llcrypt_ctx_cachep); + destroy_workqueue(llcrypt_read_workqueue); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/fname.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/fname.c new file mode 100644 index 0000000000000..65b6b422cb343 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/fname.c @@ -0,0 +1,567 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This contains functions for filename crypto management + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Uday Savagaonkar, 2014. + * Modified by Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#include +#include +#include "llcrypt_private.h" + +static inline bool llcrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') + return true; + + return false; +} + +/** + * fname_encrypt() - encrypt a filename + * + * The output buffer must be at least as large as the input buffer. + * Any extra space is filled with NUL padding before encryption. + * + * Return: 0 on success, -errno on failure + */ +int fname_encrypt(struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen) +{ + struct skcipher_request *req = NULL; + DECLARE_CRYPTO_WAIT(wait); + struct llcrypt_info *ci = llcrypt_info(inode); + struct crypto_skcipher *tfm = ci->ci_ctfm; + union llcrypt_iv iv; + struct scatterlist sg; + int res; + + /* + * Copy the filename to the output buffer for encrypting in-place and + * pad it with the needed number of NUL bytes. + */ + if (WARN_ON(olen < iname->len)) + return -ENOBUFS; + memcpy(out, iname->name, iname->len); + memset(out + iname->len, 0, olen - iname->len); + + if (tfm == NULL) + return 0; + + /* Initialize the IV */ + llcrypt_generate_iv(&iv, 0, ci); + + /* Set up the encryption request */ + req = skcipher_request_alloc(tfm, GFP_NOFS); + if (!req) + return -ENOMEM; + skcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, &wait); + sg_init_one(&sg, out, olen); + skcipher_request_set_crypt(req, &sg, &sg, olen, &iv); + + /* Do the encryption */ + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + skcipher_request_free(req); + if (res < 0) { + llcrypt_err(inode, "Filename encryption failed: %d", res); + return res; + } + + return 0; +} + +/** + * fname_decrypt() - decrypt a filename + * + * The caller must have allocated sufficient memory for the @oname string. 
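+ *
+ * The @oname buffer is normally obtained with llcrypt_fname_alloc_buffer()
+ * by the caller of llcrypt_fname_disk_to_usr(); for example (abridged from
+ * llcrypt_get_symlink() in hooks.c):
+ *
+ *	err = llcrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
+ *	if (err)
+ *		return ERR_PTR(err);
+ *	err = llcrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);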
+ * + * Return: 0 on success, -errno on failure + */ +static int fname_decrypt(struct inode *inode, + const struct llcrypt_str *iname, + struct llcrypt_str *oname) +{ + struct skcipher_request *req = NULL; + DECLARE_CRYPTO_WAIT(wait); + struct scatterlist src_sg, dst_sg; + struct llcrypt_info *ci = llcrypt_info(inode); + struct crypto_skcipher *tfm = ci->ci_ctfm; + union llcrypt_iv iv; + int res; + + if (tfm == NULL) { + memcpy(oname->name, iname->name, iname->len); + oname->name[iname->len] = '\0'; + oname->len = iname->len; + return 0; + } + + /* Allocate request */ + req = skcipher_request_alloc(tfm, GFP_NOFS); + if (!req) + return -ENOMEM; + skcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, &wait); + + /* Initialize IV */ + llcrypt_generate_iv(&iv, 0, ci); + + /* Create decryption request */ + sg_init_one(&src_sg, iname->name, iname->len); + sg_init_one(&dst_sg, oname->name, oname->len); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, &iv); + res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); + skcipher_request_free(req); + if (res < 0) { + llcrypt_err(inode, "Filename decryption failed: %d", res); + return res; + } + + oname->len = strnlen(oname->name, iname->len); + return 0; +} + +/* + * Old fashion base64 encoding, taken from Linux 5.4. + * + * This base64 encoding is specific to fscrypt and has been replaced since then + * with an RFC 4648 compliant base64-url encoding, see llcrypt_base64url_* + * below. + * The old fashion base64 encoding is kept for compatibility with older clients. + */ + +static const char lookup_table[65] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; + +#define LLCRYPT_BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + +/** + * base64_encode() - + * + * Encodes the input string using characters from the set [A-Za-z0-9+,]. + * The encoded string is roughly 4/3 times the size of the input string. + * + * Return: length of the encoded string + */ +static inline int llcrypt_base64_encode(const u8 *src, int len, char *dst) +{ + int i, bits = 0, ac = 0; + char *cp = dst; + + for (i = 0; i < len; i++) { + ac += src[i] << bits; + bits += 8; + do { + *cp++ = lookup_table[ac & 0x3f]; + ac >>= 6; + bits -= 6; + } while (bits >= 6); + } + if (bits) + *cp++ = lookup_table[ac & 0x3f]; + return cp - dst; +} + +static inline int llcrypt_base64_decode(const char *src, int len, u8 *dst) +{ + int i, bits = 0, ac = 0; + const char *p; + u8 *cp = dst; + + for (i = 0; i < len; i++) { + p = strchr(lookup_table, src[i]); + if (p == NULL || src[i] == 0) + return -2; + ac += (p - lookup_table) << bits; + bits += 6; + if (bits >= 8) { + *cp++ = ac & 0xff; + ac >>= 8; + bits -= 8; + } + } + if (ac) + return -1; + return cp - dst; +} + +/* + * New fashion base64 encoding, taken from Linux 5.14. + * + * This base64 encoding is RFC 4648 compliant base64-url encoding. + */ + +static const char base64url_table[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; + +#define LLCRYPT_BASE64URL_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + +/** + * llcrypt_base64url_encode() - base64url-encode some binary data + * @src: the binary data to encode + * @srclen: the length of @src in bytes + * @dst: (output) the base64url-encoded string. Not NUL-terminated. + * + * Encodes data using base64url encoding, i.e. the "Base 64 Encoding with URL + * and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't used, + * as it's unneeded and not required by the RFC. 
base64url is used instead of + * base64 to avoid the '/' character, which isn't allowed in filenames. + * + * Return: the length of the resulting base64url-encoded string in bytes. + * This will be equal to LLCRYPT_BASE64URL_CHARS(srclen). + */ +static inline int llcrypt_base64url_encode(const u8 *src, int srclen, char *dst) +{ + u32 ac = 0; + int bits = 0; + int i; + char *cp = dst; + + for (i = 0; i < srclen; i++) { + ac = (ac << 8) | src[i]; + bits += 8; + do { + bits -= 6; + *cp++ = base64url_table[(ac >> bits) & 0x3f]; + } while (bits >= 6); + } + if (bits) + *cp++ = base64url_table[(ac << (6 - bits)) & 0x3f]; + return cp - dst; +} + +/** + * llcrypt_base64url_decode() - base64url-decode a string + * @src: the string to decode. Doesn't need to be NUL-terminated. + * @srclen: the length of @src in bytes + * @dst: (output) the decoded binary data + * + * Decodes a string using base64url encoding, i.e. the "Base 64 Encoding with + * URL and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't + * accepted, nor are non-encoding characters such as whitespace. + * + * This implementation hasn't been optimized for performance. + * + * Return: the length of the resulting decoded binary data in bytes, + * or -1 if the string isn't a valid base64url string. + */ +static inline int llcrypt_base64url_decode(const char *src, int srclen, u8 *dst) +{ + u32 ac = 0; + int bits = 0; + int i; + u8 *bp = dst; + + for (i = 0; i < srclen; i++) { + const char *p = strchr(base64url_table, src[i]); + + if (p == NULL || src[i] == 0) + return -1; + ac = (ac << 6) | (p - base64url_table); + bits += 6; + if (bits >= 8) { + bits -= 8; + *bp++ = (u8)(ac >> bits); + } + } + if (ac & ((1 << bits) - 1)) + return -1; + return bp - dst; +} + +static inline int base64_chars(struct lustre_sb_info *lsi, int nbytes) +{ + if (!(lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI)) + return LLCRYPT_BASE64URL_CHARS(nbytes); + else + return LLCRYPT_BASE64_CHARS(nbytes); +} + +bool llcrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret) +{ + const struct llcrypt_info *ci = llcrypt_info(inode); + struct crypto_skcipher *tfm = ci->ci_ctfm; + int padding = 4 << (llcrypt_policy_flags(&ci->ci_policy) & + LLCRYPT_POLICY_FLAGS_PAD_MASK); + u32 encrypted_len; + + if (orig_len > max_len) + return false; + if (tfm == NULL) { + *encrypted_len_ret = orig_len; + } else { + encrypted_len = max(orig_len, (u32)LL_CRYPTO_BLOCK_SIZE); + encrypted_len = round_up(encrypted_len, padding); + *encrypted_len_ret = min(encrypted_len, max_len); + } + return true; +} + +/** + * llcrypt_fname_alloc_buffer - allocate a buffer for presented filenames + * + * Allocate a buffer that is large enough to hold any decrypted or encoded + * filename (null-terminated), for the given maximum encrypted filename length. 
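+ *
+ * The buffer is sized for the worst case of the base64-encoded presented
+ * forms and @max_encrypted_len, and must later be released with
+ * llcrypt_fname_free_buffer().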
+ * + * Return: 0 on success, -errno on failure + */ +int llcrypt_fname_alloc_buffer(const struct inode *inode, + u32 max_encrypted_len, + struct llcrypt_str *crypto_str) +{ + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + const u32 max_encoded_len = + max_t(u32, + base64_chars(lsi, LLCRYPT_FNAME_MAX_UNDIGESTED_SIZE), + 1 + base64_chars(lsi, sizeof(struct llcrypt_digested_name))); + u32 max_presented_len; + + max_presented_len = max(max_encoded_len, max_encrypted_len); + + crypto_str->name = kmalloc(max_presented_len + 1, GFP_NOFS); + if (!crypto_str->name) + return -ENOMEM; + crypto_str->len = max_presented_len; + return 0; +} +EXPORT_SYMBOL(llcrypt_fname_alloc_buffer); + +/** + * llcrypt_fname_free_buffer - free the buffer for presented filenames + * + * Free the buffer allocated by llcrypt_fname_alloc_buffer(). + */ +void llcrypt_fname_free_buffer(struct llcrypt_str *crypto_str) +{ + if (!crypto_str) + return; + kfree(crypto_str->name); + crypto_str->name = NULL; +} +EXPORT_SYMBOL(llcrypt_fname_free_buffer); + +/** + * llcrypt_fname_disk_to_usr() - converts a filename from disk space to user + * space + * + * The caller must have allocated sufficient memory for the @oname string. + * + * If the key is available, we'll decrypt the disk name; otherwise, we'll encode + * it for presentation. Short names are directly base64-encoded, while long + * names are encoded in llcrypt_digested_name format. + * + * Return: 0 on success, -errno on failure + */ +int llcrypt_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct llcrypt_str *iname, + struct llcrypt_str *oname) +{ + int (*b64_encode)(const u8 *src, int srclen, char *dst); + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + const struct qstr qname = LLTR_TO_QSTR(iname); + struct llcrypt_digested_name digested_name; + + if (llcrypt_is_dot_dotdot(&qname)) { + oname->name[0] = '.'; + oname->name[iname->len - 1] = '.'; + oname->len = iname->len; + return 0; + } + + if (llcrypt_has_encryption_key(inode)) { + struct llcrypt_info *ci = llcrypt_info(inode); + struct crypto_skcipher *tfm = ci->ci_ctfm; + + if (tfm && iname->len < LL_CRYPTO_BLOCK_SIZE) + return -EUCLEAN; + + return fname_decrypt(inode, iname, oname); + } + + if (!llcrypt_policy_has_filename_enc(inode)) { + memcpy(oname->name, iname->name, iname->len); + oname->name[iname->len] = '\0'; + oname->len = iname->len; + return 0; + } + + if (!(lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI)) + b64_encode = llcrypt_base64url_encode; + else + b64_encode = llcrypt_base64_encode; + + if (iname->len <= LLCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { + oname->len = b64_encode(iname->name, iname->len, oname->name); + return 0; + } + if (hash) { + digested_name.hash = hash; + digested_name.minor_hash = minor_hash; + } else { + digested_name.hash = 0; + digested_name.minor_hash = 0; + } + memcpy(digested_name.digest, + LLCRYPT_FNAME_DIGEST(iname->name, iname->len), + LLCRYPT_FNAME_DIGEST_SIZE); + if (!(lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI)) + oname->name[0] = LLCRYPT_DIGESTED_CHAR; + else + oname->name[0] = LLCRYPT_DIGESTED_CHAR_OLD; + oname->len = 1 + b64_encode((const u8 *)&digested_name, + sizeof(digested_name), oname->name + 1); + return 0; +} +EXPORT_SYMBOL(llcrypt_fname_disk_to_usr); + +/** + * llcrypt_setup_filename() - prepare to search a possibly encrypted directory + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or 
we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. + * @fname: the filename information to be filled in + * + * Given a user-provided filename @iname, this function sets @fname->disk_name + * to the name that would be stored in the on-disk directory entry, if possible. + * If the directory is unencrypted this is simply @iname. Else, if we have the + * directory's encryption key, then @iname is the plaintext, so we encrypt it to + * get the disk_name. + * + * Else, for keyless @lookup operations, @iname is the presented ciphertext, so + * we decode it to get either the ciphertext disk_name (for short names) or the + * llcrypt_digested_name (for long names). Non-@lookup operations will be + * impossible in this case, so we fail them with ENOKEY. + * + * If successful, llcrypt_free_filename() must be called later to clean up. + * + * Return: 0 on success, -errno on failure + */ +int llcrypt_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct llcrypt_name *fname) +{ + struct lustre_sb_info *lsi = s2lsi(dir->i_sb); + int ret; + int digested; + + memset(fname, 0, sizeof(struct llcrypt_name)); + fname->usr_fname = iname; + + if (!IS_ENCRYPTED(dir) || llcrypt_is_dot_dotdot(iname)) { + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; + } + ret = llcrypt_get_encryption_info(dir); + if (ret) + return ret; + + if (llcrypt_has_encryption_key(dir)) { + struct lustre_sb_info *lsi = s2lsi(dir->i_sb); + + if (!llcrypt_fname_encrypted_size(dir, iname->len, + lsi ? + lsi->lsi_cop->max_namelen : + NAME_MAX, + &fname->crypto_buf.len)) + return -ENAMETOOLONG; + fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, + GFP_NOFS); + if (!fname->crypto_buf.name) + return -ENOMEM; + + ret = fname_encrypt(dir, iname, fname->crypto_buf.name, + fname->crypto_buf.len); + if (ret) + goto errout; + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + return 0; + } + if (!lookup) + return -ENOKEY; + + if (!llcrypt_policy_has_filename_enc(dir)) { + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; + } + + fname->is_ciphertext_name = true; + + /* + * We don't have the key and we are doing a lookup; decode the + * user-supplied name + */ + if ((!(lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI) && + iname->name[0] == LLCRYPT_DIGESTED_CHAR) || + ((lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI) && + iname->name[0] == LLCRYPT_DIGESTED_CHAR_OLD)) { + if (iname->len != 1 + base64_chars(lsi, + sizeof(struct llcrypt_digested_name))) { + return -ENOENT; + } + digested = 1; + } else { + if (iname->len > + base64_chars(lsi, LLCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) + return -ENOENT; + digested = 0; + } + + fname->crypto_buf.name = + kmalloc(max_t(size_t, LLCRYPT_FNAME_MAX_UNDIGESTED_SIZE, + sizeof(struct llcrypt_digested_name)), + GFP_KERNEL); + if (fname->crypto_buf.name == NULL) + return -ENOMEM; + + if (!(lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI)) + ret = llcrypt_base64url_decode(iname->name + digested, + iname->len - digested, + fname->crypto_buf.name); + else + ret = llcrypt_base64_decode(iname->name + digested, + iname->len - digested, + fname->crypto_buf.name); + + if (ret < 0) { + ret = -ENOENT; + goto errout; + } + fname->crypto_buf.len = ret; + if (digested) { + const struct llcrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + fname->hash = n->hash; + 
fname->minor_hash = n->minor_hash; + } else { + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + } + return 0; + +errout: + kfree(fname->crypto_buf.name); + return ret; +} +EXPORT_SYMBOL(llcrypt_setup_filename); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/hkdf.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/hkdf.c new file mode 100644 index 0000000000000..8874bcb0a527b --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/hkdf.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation + * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): + * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". + * + * This is used to derive keys from the llcrypt master keys. + * + * Copyright 2019 Google LLC + */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#include +#ifdef HAVE_CRYPTO_SHA2_HEADER +#include +#else +#include +#endif +#include "llcrypt_private.h" + +/* + * HKDF supports any unkeyed cryptographic hash algorithm, but llcrypt uses + * SHA-512 because it is reasonably secure and efficient; and since it produces + * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of + * entropy from the master key and requires only one iteration of HKDF-Expand. + */ +#define HKDF_HMAC_ALG "hmac(sha512)" +#define HKDF_HASHLEN SHA512_DIGEST_SIZE + +/* + * HKDF consists of two steps: + * + * 1. HKDF-Extract: extract a pseudorandom key of length HKDF_HASHLEN bytes from + * the input keying material and optional salt. + * 2. HKDF-Expand: expand the pseudorandom key into output keying material of + * any length, parameterized by an application-specific info string. + * + * HKDF-Extract can be skipped if the input is already a pseudorandom key of + * length HKDF_HASHLEN bytes. However, cipher modes other than AES-256-XTS take + * shorter keys, and we don't want to force users of those modes to provide + * unnecessarily long master keys. Thus llcrypt still does HKDF-Extract. No + * salt is used, since llcrypt master keys should already be pseudorandom and + * there's no way to persist a random salt per master key from kernel mode. + */ + +/* HKDF-Extract (RFC 5869 section 2.2), unsalted */ +static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, + unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) +{ + static const u8 default_salt[HKDF_HASHLEN]; + SHASH_DESC_ON_STACK(desc, hmac_tfm); + int err; + + err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); + if (err) + return err; + + desc->tfm = hmac_tfm; + err = crypto_shash_digest(desc, ikm, ikmlen, prk); + shash_desc_zero(desc); + return err; +} + +/* + * Compute HKDF-Extract using the given master key as the input keying material, + * and prepare an HMAC transform object keyed by the resulting pseudorandom key. + * + * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many + * times without having to recompute HKDF-Extract each time. 
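+ *
+ * For example, llcrypt_ioctl_add_key() in keyring.c derives the key
+ * identifier of a newly added master key this way (abridged):
+ *
+ *	err = llcrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size);
+ *	...
+ *	err = llcrypt_hkdf_expand(&secret.hkdf, HKDF_CONTEXT_KEY_IDENTIFIER,
+ *				  NULL, 0, arg.key_spec.u.identifier,
+ *				  LLCRYPT_KEY_IDENTIFIER_SIZE);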
+ */ +int llcrypt_init_hkdf(struct llcrypt_hkdf *hkdf, const u8 *master_key, + unsigned int master_key_size) +{ + struct crypto_shash *hmac_tfm; + u8 prk[HKDF_HASHLEN]; + int err; + + hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, 0); + if (IS_ERR(hmac_tfm)) { + llcrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld", + PTR_ERR(hmac_tfm)); + return PTR_ERR(hmac_tfm); + } + + if (WARN_ON(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) { + err = -EINVAL; + goto err_free_tfm; + } + + err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk); + if (err) + goto err_free_tfm; + + err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk)); + if (err) + goto err_free_tfm; + + hkdf->hmac_tfm = hmac_tfm; + goto out; + +err_free_tfm: + crypto_free_shash(hmac_tfm); +out: + memzero_explicit(prk, sizeof(prk)); + return err; +} + +/* + * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which + * was already keyed into 'hkdf->hmac_tfm' by llcrypt_init_hkdf(), into 'okmlen' + * bytes of output keying material parameterized by the application-specific + * 'info' of length 'infolen' bytes, prefixed by "llcrypt\0" and the 'context' + * byte. This is thread-safe and may be called by multiple threads in parallel. + * + * ('context' isn't part of the HKDF specification; it's just a prefix llcrypt + * adds to its application-specific info strings to guarantee that it doesn't + * accidentally repeat an info string when using HKDF for different purposes.) + */ +int llcrypt_hkdf_expand(struct llcrypt_hkdf *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen) +{ + SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); + u8 prefix[9]; + unsigned int i; + int err; + const u8 *prev = NULL; + u8 counter = 1; + u8 tmp[HKDF_HASHLEN]; + + if (WARN_ON(okmlen > 255 * HKDF_HASHLEN)) + return -EINVAL; + + desc->tfm = hkdf->hmac_tfm; + + memcpy(prefix, "fscrypt\0", 8); + prefix[8] = context; + + for (i = 0; i < okmlen; i += HKDF_HASHLEN) { + + err = crypto_shash_init(desc); + if (err) + goto out; + + if (prev) { + err = crypto_shash_update(desc, prev, HKDF_HASHLEN); + if (err) + goto out; + } + + err = crypto_shash_update(desc, prefix, sizeof(prefix)); + if (err) + goto out; + + err = crypto_shash_update(desc, info, infolen); + if (err) + goto out; + + BUILD_BUG_ON(sizeof(counter) != 1); + if (okmlen - i < HKDF_HASHLEN) { + err = crypto_shash_finup(desc, &counter, 1, tmp); + if (err) + goto out; + memcpy(&okm[i], tmp, okmlen - i); + memzero_explicit(tmp, sizeof(tmp)); + } else { + err = crypto_shash_finup(desc, &counter, 1, &okm[i]); + if (err) + goto out; + } + counter++; + prev = &okm[i]; + } + err = 0; +out: + if (unlikely(err)) + memzero_explicit(okm, okmlen); /* so caller doesn't need to */ + shash_desc_zero(desc); + return err; +} + +void llcrypt_destroy_hkdf(struct llcrypt_hkdf *hkdf) +{ + crypto_free_shash(hkdf->hmac_tfm); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/hooks.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/hooks.c new file mode 100644 index 0000000000000..36399511b7fb0 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/hooks.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * fs/crypto/hooks.c + * + * Encryption hooks for higher-level filesystem operations. 
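+ *
+ * These helpers are meant to be called by the filesystem from its own VFS
+ * operations: see llcrypt_file_open(), the __llcrypt_prepare_*() helpers for
+ * link/rename/lookup/symlink, and llcrypt_get_symlink() below.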
+ */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#include "llcrypt_private.h" + +/** + * llcrypt_file_open - prepare to open a possibly-encrypted regular file + * @inode: the inode being opened + * @filp: the struct file being set up + * + * Currently, an encrypted regular file can only be opened if its encryption key + * is available; access to the raw encrypted contents is not supported. + * Therefore, we first set up the inode's encryption key (if not already done) + * and return an error if it's unavailable. + * + * We also verify that if the parent directory (from the path via which the file + * is being opened) is encrypted, then the inode being opened uses the same + * encryption policy. This is needed as part of the enforcement that all files + * in an encrypted directory tree use the same encryption policy, as a + * protection against certain types of offline attacks. Note that this check is + * needed even when opening an *unencrypted* file, since it's forbidden to have + * an unencrypted file in an encrypted directory. + * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + */ +int llcrypt_file_open(struct inode *inode, struct file *filp) +{ + int err; + struct dentry *dir; + + err = llcrypt_require_key(inode); + if (err) + return err; + + dir = dget_parent(file_dentry(filp)); + if (IS_ENCRYPTED(d_inode(dir)) && + !llcrypt_has_permitted_context(d_inode(dir), inode)) { + llcrypt_warn(inode, + "Inconsistent encryption context (parent directory: %lu)", + d_inode(dir)->i_ino); + err = -EPERM; + } + dput(dir); + return err; +} +EXPORT_SYMBOL_GPL(llcrypt_file_open); + +int __llcrypt_prepare_link(struct inode *inode, struct inode *dir, + struct dentry *dentry) +{ + int err; + + err = llcrypt_require_key(dir); + if (err) + return err; + + /* ... in case we looked up ciphertext name before key was added */ + if (dentry->d_flags & DCACHE_ENCRYPTED_NAME) + return -ENOKEY; + + if (!llcrypt_has_permitted_context(dir, inode)) + return -EXDEV; + + return 0; +} +EXPORT_SYMBOL_GPL(__llcrypt_prepare_link); + +int __llcrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + int err; + + err = llcrypt_require_key(old_dir); + if (err) + return err; + + err = llcrypt_require_key(new_dir); + if (err) + return err; + + /* ... 
in case we looked up ciphertext name(s) before key was added */ + if ((old_dentry->d_flags | new_dentry->d_flags) & + DCACHE_ENCRYPTED_NAME) + return -ENOKEY; + + if (old_dir != new_dir) { + if (IS_ENCRYPTED(new_dir) && + !llcrypt_has_permitted_context(new_dir, + d_inode(old_dentry))) + return -EXDEV; + + if ((flags & RENAME_EXCHANGE) && + IS_ENCRYPTED(old_dir) && + !llcrypt_has_permitted_context(old_dir, + d_inode(new_dentry))) + return -EXDEV; + } + return 0; +} +EXPORT_SYMBOL_GPL(__llcrypt_prepare_rename); + +int __llcrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct llcrypt_name *fname) +{ + int err = llcrypt_setup_filename(dir, &dentry->d_name, 1, fname); + + if (err && err != -ENOENT) + return err; + + if (fname->is_ciphertext_name) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_NAME; + spin_unlock(&dentry->d_lock); + d_set_d_op(dentry, &llcrypt_d_ops); + } + return err; +} +EXPORT_SYMBOL_GPL(__llcrypt_prepare_lookup); + +int __llcrypt_prepare_symlink(struct inode *dir, unsigned int len, + unsigned int max_len, + struct llcrypt_str *disk_link) +{ + int err; + + /* + * To calculate the size of the encrypted symlink target we need to know + * the amount of NUL padding, which is determined by the flags set in + * the encryption policy which will be inherited from the directory. + * The easiest way to get access to this is to just load the directory's + * llcrypt_info, since we'll need it to create the dir_entry anyway. + * + * Note: in test_dummy_encryption mode, @dir may be unencrypted. + */ + err = llcrypt_get_encryption_info(dir); + if (err) + return err; + if (!llcrypt_has_encryption_key(dir)) + return -ENOKEY; + + /* + * Calculate the size of the encrypted symlink and verify it won't + * exceed max_len. Note that for historical reasons, encrypted symlink + * targets are prefixed with the ciphertext length, despite this + * actually being redundant with i_size. This decreases by 2 bytes the + * longest symlink target we can accept. + * + * We could recover 1 byte by not counting a null terminator, but + * counting it (even though it is meaningless for ciphertext) is simpler + * for now since filesystems will assume it is there and subtract it. + */ + if (!llcrypt_fname_encrypted_size(dir, len, + max_len - sizeof(struct llcrypt_symlink_data), + &disk_link->len)) + return -ENAMETOOLONG; + disk_link->len += sizeof(struct llcrypt_symlink_data); + + disk_link->name = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(__llcrypt_prepare_symlink); + +int __llcrypt_encrypt_symlink(struct inode *inode, const char *target, + unsigned int len, struct llcrypt_str *disk_link) +{ + int err; + struct qstr iname = QSTR_INIT(target, len); + struct llcrypt_symlink_data *sd; + unsigned int ciphertext_len; + + if (!llcrypt_policy_has_filename_enc(inode)) + return 0; + + err = llcrypt_require_key(inode); + if (err) + return err; + + if (disk_link->name) { + /* filesystem-provided buffer */ + sd = (struct llcrypt_symlink_data *)disk_link->name; + } else { + sd = kmalloc(disk_link->len, GFP_NOFS); + if (!sd) + return -ENOMEM; + } + ciphertext_len = disk_link->len - sizeof(*sd); + sd->len = cpu_to_le16(ciphertext_len); + + err = fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len); + if (err) + goto err_free_sd; + + /* + * Null-terminating the ciphertext doesn't make sense, but we still + * count the null terminator in the length, so we might as well + * initialize it just in case the filesystem writes it out. 
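+ *
+ * (The on-disk format written here is a struct llcrypt_symlink_data: a
+ * little-endian 16-bit ciphertext length followed by the ciphertext itself,
+ * as sized earlier by __llcrypt_prepare_symlink().)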
+ */ + sd->encrypted_path[ciphertext_len] = '\0'; + + /* Cache the plaintext symlink target for later use by get_link() */ + err = -ENOMEM; + inode->i_link = kmemdup(target, len + 1, GFP_NOFS); + if (!inode->i_link) + goto err_free_sd; + + if (!disk_link->name) + disk_link->name = (unsigned char *)sd; + return 0; + +err_free_sd: + if (!disk_link->name) + kfree(sd); + return err; +} +EXPORT_SYMBOL_GPL(__llcrypt_encrypt_symlink); + +/** + * llcrypt_get_symlink - get the target of an encrypted symlink + * @inode: the symlink inode + * @caddr: the on-disk contents of the symlink + * @max_size: size of @caddr buffer + * @done: if successful, will be set up to free the returned target if needed + * + * If the symlink's encryption key is available, we decrypt its target. + * Otherwise, we encode its target for presentation. + * + * This may sleep, so the filesystem must have dropped out of RCU mode already. + * + * Return: the presentable symlink target or an ERR_PTR() + */ +const char *llcrypt_get_symlink(struct inode *inode, const void *caddr, + unsigned int max_size, + struct delayed_call *done) +{ + const struct llcrypt_symlink_data *sd; + struct llcrypt_str cstr, pstr; + bool has_key; + int err; + + /* This is for encrypted symlinks only */ + if (WARN_ON(!IS_ENCRYPTED(inode))) + return ERR_PTR(-EINVAL); + + /* If the decrypted target is already cached, just return it. */ + pstr.name = READ_ONCE(inode->i_link); + if (pstr.name) + return pstr.name; + + /* + * Try to set up the symlink's encryption key, but we can continue + * regardless of whether the key is available or not. + */ + err = llcrypt_get_encryption_info(inode); + if (err) + return ERR_PTR(err); + has_key = llcrypt_has_encryption_key(inode); + + /* + * For historical reasons, encrypted symlink targets are prefixed with + * the ciphertext length, even though this is redundant with i_size. + */ + + if (!llcrypt_policy_has_filename_enc(inode)) { + cstr.name = (unsigned char *)caddr; + cstr.len = strlen(cstr.name); + + if (cstr.len == 0) + return ERR_PTR(-EUCLEAN); + } else { + if (max_size < sizeof(*sd)) + return ERR_PTR(-EUCLEAN); + sd = caddr; + cstr.name = (unsigned char *)sd->encrypted_path; + cstr.len = le16_to_cpu(sd->len); + + if (cstr.len == 0) + return ERR_PTR(-EUCLEAN); + + if (cstr.len + sizeof(*sd) - 1 > max_size) + return ERR_PTR(-EUCLEAN); + } + + err = llcrypt_fname_alloc_buffer(inode, cstr.len, &pstr); + if (err) + return ERR_PTR(err); + + err = llcrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); + if (err) + goto err_kfree; + + err = -EUCLEAN; + if (pstr.name[0] == '\0') + goto err_kfree; + + pstr.name[pstr.len] = '\0'; + + /* + * Cache decrypted symlink targets in i_link for later use. Don't cache + * symlink targets encoded without the key, since those become outdated + * once the key is added. This pairs with the READ_ONCE() above and in + * the VFS path lookup code. 
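+ *
+ * If the target is not cached here (because the key is absent, or because
+ * another thread won the cmpxchg_release() race below), ownership of the
+ * buffer is handed to @done so that the caller frees it once it is finished
+ * with the returned target.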
+ */ + if (!has_key || + cmpxchg_release(&inode->i_link, NULL, pstr.name) != NULL) + set_delayed_call(done, kfree_link, pstr.name); + + return pstr.name; + +err_kfree: + kfree(pstr.name); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(llcrypt_get_symlink); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keyring.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keyring.c new file mode 100644 index 0000000000000..358dda2591245 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keyring.c @@ -0,0 +1,1012 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Filesystem-level keyring for llcrypt + * + * Copyright 2019 Google LLC + */ + +/* + * This file implements management of llcrypt master keys in the + * filesystem-level keyring, including the ioctls: + * + * - LL_IOC_ADD_ENCRYPTION_KEY + * - LL_IOC_REMOVE_ENCRYPTION_KEY + * - LL_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS + * - LL_IOC_GET_ENCRYPTION_KEY_STATUS + * + * See the "User API" section of Documentation/filesystems/llcrypt.rst for more + * information about these ioctls. + */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#include +#include +#include +#include + +#include "llcrypt_private.h" + +static void wipe_master_key_secret(struct llcrypt_master_key_secret *secret) +{ + llcrypt_destroy_hkdf(&secret->hkdf); + memzero_explicit(secret, sizeof(*secret)); +} + +static void move_master_key_secret(struct llcrypt_master_key_secret *dst, + struct llcrypt_master_key_secret *src) +{ + memcpy(dst, src, sizeof(*dst)); + memzero_explicit(src, sizeof(*src)); +} + +static void free_master_key(struct llcrypt_master_key *mk) +{ + size_t i; + + wipe_master_key_secret(&mk->mk_secret); + + for (i = 0; i < ARRAY_SIZE(mk->mk_mode_keys); i++) + crypto_free_skcipher(mk->mk_mode_keys[i]); + + key_put(mk->mk_users); + kfree_sensitive(mk); +} + +static inline bool valid_key_spec(const struct llcrypt_key_specifier *spec) +{ + if (spec->__reserved) + return false; + return master_key_spec_len(spec) != 0; +} + +static int llcrypt_key_instantiate(struct key *key, + struct key_preparsed_payload *prep) +{ + key->payload.data[0] = (struct llcrypt_master_key *)prep->data; + return 0; +} + +static void llcrypt_key_destroy(struct key *key) +{ + free_master_key(key->payload.data[0]); +} + +static void llcrypt_key_describe(const struct key *key, struct seq_file *m) +{ + seq_puts(m, key->description); + + if (key_is_positive(key)) { + const struct llcrypt_master_key *mk = key->payload.data[0]; + + if (!is_master_key_secret_present(&mk->mk_secret)) + seq_puts(m, ": secret removed"); + } +} + +/* + * Type of key in ->lsi_master_keys. Each key of this type represents a master + * key which has been added to the filesystem. Its payload is a + * 'struct llcrypt_master_key'. The "." prefix in the key type name prevents + * users from adding keys of this type via the keyrings syscalls rather than via + * the intended method of LL_IOC_ADD_ENCRYPTION_KEY. + */ +static struct key_type key_type_llcrypt = { + .name = "._llcrypt", + .instantiate = llcrypt_key_instantiate, + .destroy = llcrypt_key_destroy, + .describe = llcrypt_key_describe, +}; + +static int llcrypt_user_key_instantiate(struct key *key, + struct key_preparsed_payload *prep) +{ + /* + * We just charge LLCRYPT_MAX_KEY_SIZE bytes to the user's key quota for + * each key, regardless of the exact key size. The amount of memory + * actually used is greater than the size of the raw key anyway. 
+ */ + return key_payload_reserve(key, LLCRYPT_MAX_KEY_SIZE); +} + +static void llcrypt_user_key_describe(const struct key *key, struct seq_file *m) +{ + seq_puts(m, key->description); +} + +/* + * Type of key in ->mk_users. Each key of this type represents a particular + * user who has added a particular master key. + * + * Note that the name of this key type really should be something like + * ".llcrypt-user" instead of simply ".llcrypt". But the shorter name is chosen + * mainly for simplicity of presentation in /proc/keys when read by a non-root + * user. And it is expected to be rare that a key is actually added by multiple + * users, since users should keep their encryption keys confidential. + */ +static struct key_type key_type_llcrypt_user = { + .name = ".llcrypt", + .instantiate = llcrypt_user_key_instantiate, + .describe = llcrypt_user_key_describe, +}; + +/* Search ->lsi_master_keys or ->mk_users */ +static struct key *search_llcrypt_keyring(struct key *keyring, + struct key_type *type, + const char *description) +{ + /* + * We need to mark the keyring reference as "possessed" so that we + * acquire permission to search it, via the KEY_POS_SEARCH permission. + */ + key_ref_t keyref = make_key_ref(keyring, true /* possessed */); + +#ifdef HAVE_KEYRING_SEARCH_4ARGS + keyref = keyring_search(keyref, type, description, false); +#else + keyref = keyring_search(keyref, type, description); +#endif + if (IS_ERR(keyref)) { + if (PTR_ERR(keyref) == -EAGAIN || /* not found */ + PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */ + keyref = ERR_PTR(-ENOKEY); + return ERR_CAST(keyref); + } + return key_ref_to_ptr(keyref); +} + +#define LLCRYPT_FS_KEYRING_DESCRIPTION_SIZE \ + (CONST_STRLEN("llcrypt-") + sizeof_field(struct super_block, s_id)) + +#define LLCRYPT_MK_DESCRIPTION_SIZE (2 * LLCRYPT_KEY_IDENTIFIER_SIZE + 1) + +#define LLCRYPT_MK_USERS_DESCRIPTION_SIZE \ + (CONST_STRLEN("llcrypt-") + 2 * LLCRYPT_KEY_IDENTIFIER_SIZE + \ + CONST_STRLEN("-users") + 1) + +#define LLCRYPT_MK_USER_DESCRIPTION_SIZE \ + (2 * LLCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1) + +static void format_fs_keyring_description( + char description[LLCRYPT_FS_KEYRING_DESCRIPTION_SIZE], + const struct super_block *sb) +{ + sprintf(description, "llcrypt-%s", sb->s_id); +} + +static void format_mk_description( + char description[LLCRYPT_MK_DESCRIPTION_SIZE], + const struct llcrypt_key_specifier *mk_spec) +{ + sprintf(description, "%*phN", + master_key_spec_len(mk_spec), (u8 *)&mk_spec->u); +} + +static void format_mk_users_keyring_description( + char description[LLCRYPT_MK_USERS_DESCRIPTION_SIZE], + const u8 mk_identifier[LLCRYPT_KEY_IDENTIFIER_SIZE]) +{ + sprintf(description, "llcrypt-%*phN-users", + LLCRYPT_KEY_IDENTIFIER_SIZE, mk_identifier); +} + +static void format_mk_user_description( + char description[LLCRYPT_MK_USER_DESCRIPTION_SIZE], + const u8 mk_identifier[LLCRYPT_KEY_IDENTIFIER_SIZE]) +{ + + sprintf(description, "%*phN.uid.%u", LLCRYPT_KEY_IDENTIFIER_SIZE, + mk_identifier, __kuid_val(current_fsuid())); +} + +/* Create ->lsi_master_keys if needed. Synchronized by llcrypt_add_key_mutex. 
*/ +static int allocate_filesystem_keyring(struct super_block *sb) +{ + char description[LLCRYPT_FS_KEYRING_DESCRIPTION_SIZE]; + struct key *keyring; + struct lustre_sb_info *lsi = s2lsi(sb); + + if (!lsi) + return -EINVAL; + + if (lsi->lsi_master_keys) + return 0; + + format_fs_keyring_description(description, sb); + keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_cred(), KEY_POS_SEARCH | + KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); + + /* Pairs with READ_ONCE() in llcrypt_find_master_key() */ + smp_store_release(&lsi->lsi_master_keys, keyring); + return 0; +} + +void llcrypt_sb_free(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + if (lsi != NULL) { + key_put(lsi->lsi_master_keys); + lsi->lsi_master_keys = NULL; + } +} +EXPORT_SYMBOL(llcrypt_sb_free); + +/* + * Find the specified master key in ->lsi_master_keys. + * Returns ERR_PTR(-ENOKEY) if not found. + */ +struct key *llcrypt_find_master_key(struct super_block *sb, + const struct llcrypt_key_specifier *mk_spec) +{ + struct key *keyring; + char description[LLCRYPT_MK_DESCRIPTION_SIZE]; + struct lustre_sb_info *lsi = s2lsi(sb); + + if (!lsi) + return ERR_PTR(-EINVAL); + + /* pairs with smp_store_release() in allocate_filesystem_keyring() */ + keyring = READ_ONCE(lsi->lsi_master_keys); + if (keyring == NULL) + return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */ + + format_mk_description(description, mk_spec); + return search_llcrypt_keyring(keyring, &key_type_llcrypt, description); +} + +static int allocate_master_key_users_keyring(struct llcrypt_master_key *mk) +{ + char description[LLCRYPT_MK_USERS_DESCRIPTION_SIZE]; + struct key *keyring; + + format_mk_users_keyring_description(description, + mk->mk_spec.u.identifier); + keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_cred(), KEY_POS_SEARCH | + KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); + + mk->mk_users = keyring; + return 0; +} + +/* + * Find the current user's "key" in the master key's ->mk_users. + * Returns ERR_PTR(-ENOKEY) if not found. + */ +static struct key *find_master_key_user(struct llcrypt_master_key *mk) +{ + char description[LLCRYPT_MK_USER_DESCRIPTION_SIZE]; + + format_mk_user_description(description, mk->mk_spec.u.identifier); + return search_llcrypt_keyring(mk->mk_users, &key_type_llcrypt_user, + description); +} + +/* + * Give the current user a "key" in ->mk_users. This charges the user's quota + * and marks the master key as added by the current user, so that it cannot be + * removed by another user with the key. Either the master key's key->sem must + * be held for write, or the master key must be still undergoing initialization. + */ +static int add_master_key_user(struct llcrypt_master_key *mk) +{ + char description[LLCRYPT_MK_USER_DESCRIPTION_SIZE]; + struct key *mk_user; + int err; + + format_mk_user_description(description, mk->mk_spec.u.identifier); + mk_user = key_alloc(&key_type_llcrypt_user, description, + current_fsuid(), current_gid(), current_cred(), + KEY_POS_SEARCH | KEY_USR_VIEW, 0, NULL); + if (IS_ERR(mk_user)) + return PTR_ERR(mk_user); + + err = key_instantiate_and_link(mk_user, NULL, 0, mk->mk_users, NULL); + key_put(mk_user); + return err; +} + +/* + * Remove the current user's "key" from ->mk_users. + * The master key's key->sem must be held for write. 
+ * + * Returns 0 if removed, -ENOKEY if not found, or another -errno code. + */ +static int remove_master_key_user(struct llcrypt_master_key *mk) +{ + struct key *mk_user; + int err; + + mk_user = find_master_key_user(mk); + if (IS_ERR(mk_user)) + return PTR_ERR(mk_user); + err = key_unlink(mk->mk_users, mk_user); + key_put(mk_user); + return err; +} + +/* + * Allocate a new llcrypt_master_key which contains the given secret, set it as + * the payload of a new 'struct key' of type llcrypt, and link the 'struct key' + * into the given keyring. Synchronized by llcrypt_add_key_mutex. + */ +static int add_new_master_key(struct llcrypt_master_key_secret *secret, + const struct llcrypt_key_specifier *mk_spec, + struct key *keyring) +{ + struct llcrypt_master_key *mk; + char description[LLCRYPT_MK_DESCRIPTION_SIZE]; + struct key *key; + int err; + + mk = kzalloc(sizeof(*mk), GFP_KERNEL); + if (!mk) + return -ENOMEM; + + mk->mk_spec = *mk_spec; + + move_master_key_secret(&mk->mk_secret, secret); + init_rwsem(&mk->mk_secret_sem); + + refcount_set(&mk->mk_refcount, 1); /* secret is present */ + INIT_LIST_HEAD(&mk->mk_decrypted_inodes); + spin_lock_init(&mk->mk_decrypted_inodes_lock); + + if (mk_spec->type == LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { + err = allocate_master_key_users_keyring(mk); + if (err) + goto out_free_mk; + err = add_master_key_user(mk); + if (err) + goto out_free_mk; + } + + /* + * Note that we don't charge this key to anyone's quota, since when + * ->mk_users is in use those keys are charged instead, and otherwise + * (when ->mk_users isn't in use) only root can add these keys. + */ + format_mk_description(description, mk_spec); + key = key_alloc(&key_type_llcrypt, description, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(key)) { + err = PTR_ERR(key); + goto out_free_mk; + } + err = key_instantiate_and_link(key, mk, sizeof(*mk), keyring, NULL); + key_put(key); + if (err) + goto out_free_mk; + + return 0; + +out_free_mk: + free_master_key(mk); + return err; +} + +#define KEY_DEAD 1 + +static int add_existing_master_key(struct llcrypt_master_key *mk, + struct llcrypt_master_key_secret *secret) +{ + struct key *mk_user; + bool rekey; + int err; + + /* + * If the current user is already in ->mk_users, then there's nothing to + * do. (Not applicable for v1 policy keys, which have NULL ->mk_users.) + */ + if (mk->mk_users) { + mk_user = find_master_key_user(mk); + if (mk_user != ERR_PTR(-ENOKEY)) { + if (IS_ERR(mk_user)) + return PTR_ERR(mk_user); + key_put(mk_user); + return 0; + } + } + + /* If we'll be re-adding ->mk_secret, try to take the reference. */ + rekey = !is_master_key_secret_present(&mk->mk_secret); + if (rekey && !refcount_inc_not_zero(&mk->mk_refcount)) + return KEY_DEAD; + + /* Add the current user to ->mk_users, if applicable. */ + if (mk->mk_users) { + err = add_master_key_user(mk); + if (err) { + if (rekey && refcount_dec_and_test(&mk->mk_refcount)) + return KEY_DEAD; + return err; + } + } + + /* Re-add the secret if needed. 
*/ + if (rekey) { + down_write(&mk->mk_secret_sem); + move_master_key_secret(&mk->mk_secret, secret); + up_write(&mk->mk_secret_sem); + } + return 0; +} + +static int add_master_key(struct super_block *sb, + struct llcrypt_master_key_secret *secret, + const struct llcrypt_key_specifier *mk_spec) +{ + static DEFINE_MUTEX(llcrypt_add_key_mutex); + struct key *key; + struct lustre_sb_info *lsi = s2lsi(sb); + int err; + + if (!lsi) + return -EINVAL; + + mutex_lock(&llcrypt_add_key_mutex); /* serialize find + link */ +retry: + key = llcrypt_find_master_key(sb, mk_spec); + if (IS_ERR(key)) { + err = PTR_ERR(key); + if (err != -ENOKEY) + goto out_unlock; + /* Didn't find the key in ->lsi_master_keys. Add it. */ + err = allocate_filesystem_keyring(sb); + if (err) + goto out_unlock; + err = add_new_master_key(secret, mk_spec, + lsi->lsi_master_keys); + } else { + /* + * Found the key in ->lsi_master_keys. Re-add the secret if + * needed, and add the user to ->mk_users if needed. + */ + down_write(&key->sem); + err = add_existing_master_key(key->payload.data[0], secret); + up_write(&key->sem); + if (err == KEY_DEAD) { + /* Key being removed or needs to be removed */ + key_invalidate(key); + key_put(key); + goto retry; + } + key_put(key); + } +out_unlock: + mutex_unlock(&llcrypt_add_key_mutex); + return err; +} + +/* + * Add a master encryption key to the filesystem, causing all files which were + * encrypted with it to appear "unlocked" (decrypted) when accessed. + * + * When adding a key for use by v1 encryption policies, this ioctl is + * privileged, and userspace must provide the 'key_descriptor'. + * + * When adding a key for use by v2+ encryption policies, this ioctl is + * unprivileged. This is needed, in general, to allow non-root users to use + * encryption without encountering the visibility problems of process-subscribed + * keyrings and the inability to properly remove keys. This works by having + * each key identified by its cryptographically secure hash --- the + * 'key_identifier'. The cryptographic hash ensures that a malicious user + * cannot add the wrong key for a given identifier. Furthermore, each added key + * is charged to the appropriate user's quota for the keyrings service, which + * prevents a malicious user from adding too many keys. Finally, we forbid a + * user from removing a key while other users have added it too, which prevents + * a user who knows another user's key from causing a denial-of-service by + * removing it at an inopportune time. (We tolerate that a user who knows a key + * can prevent other users from removing it.) + * + * For more details, see the "LL_IOC_ADD_ENCRYPTION_KEY" section of + * Documentation/filesystems/llcrypt.rst. 
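+ *
+ * For illustration only (not part of this patch, and assuming the
+ * corresponding UAPI definitions are visible to userspace with, as in
+ * fscrypt, a trailing raw[] key blob in the struct), a v2-policy key could
+ * be added roughly as follows, where fd is any open file on the filesystem
+ * and key/key_size hold the raw master key:
+ *
+ *	struct llcrypt_add_key_arg *arg = calloc(1, sizeof(*arg) + key_size);
+ *
+ *	arg->key_spec.type = LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
+ *	arg->raw_size = key_size;
+ *	memcpy(arg->raw, key, key_size);
+ *	ioctl(fd, LL_IOC_ADD_ENCRYPTION_KEY, arg);
+ *
+ * On success the computed key identifier is copied back into
+ * arg->key_spec.u.identifier, as done at the end of the
+ * LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER case below.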
+ */ +int llcrypt_ioctl_add_key(struct file *filp, void __user *_uarg) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct llcrypt_add_key_arg __user *uarg = _uarg; + struct llcrypt_add_key_arg arg; + struct llcrypt_master_key_secret secret; + int err; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (!valid_key_spec(&arg.key_spec)) + return -EINVAL; + + if (arg.raw_size < LLCRYPT_MIN_KEY_SIZE || + arg.raw_size > LLCRYPT_MAX_KEY_SIZE) + return -EINVAL; + + if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) + return -EINVAL; + + memset(&secret, 0, sizeof(secret)); + secret.size = arg.raw_size; + err = -EFAULT; + if (copy_from_user(secret.raw, uarg->raw, secret.size)) + goto out_wipe_secret; + + switch (arg.key_spec.type) { + case LLCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + /* + * Only root can add keys that are identified by an arbitrary + * descriptor rather than by a cryptographic hash --- since + * otherwise a malicious user could add the wrong key. + */ + err = -EACCES; + if (!capable(CAP_SYS_ADMIN)) + goto out_wipe_secret; + break; + case LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + err = llcrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size); + if (err) + goto out_wipe_secret; + + /* + * Now that the HKDF context is initialized, the raw key is no + * longer needed. + */ + memzero_explicit(secret.raw, secret.size); + + /* Calculate the key identifier and return it to userspace. */ + err = llcrypt_hkdf_expand(&secret.hkdf, + HKDF_CONTEXT_KEY_IDENTIFIER, + NULL, 0, arg.key_spec.u.identifier, + LLCRYPT_KEY_IDENTIFIER_SIZE); + if (err) + goto out_wipe_secret; + err = -EFAULT; + if (copy_to_user(uarg->key_spec.u.identifier, + arg.key_spec.u.identifier, + LLCRYPT_KEY_IDENTIFIER_SIZE)) + goto out_wipe_secret; + break; + default: + WARN_ON(1); + err = -EINVAL; + goto out_wipe_secret; + } + + err = add_master_key(sb, &secret, &arg.key_spec); +out_wipe_secret: + wipe_master_key_secret(&secret); + return err; +} +EXPORT_SYMBOL_GPL(llcrypt_ioctl_add_key); + +/* + * Verify that the current user has added a master key with the given identifier + * (returns -ENOKEY if not). This is needed to prevent a user from encrypting + * their files using some other user's key which they don't actually know. + * Cryptographically this isn't much of a problem, but the semantics of this + * would be a bit weird, so it's best to just forbid it. + * + * The system administrator (CAP_FOWNER) can override this, which should be + * enough for any use cases where encryption policies are being set using keys + * that were chosen ahead of time but aren't available at the moment. + * + * Note that the key may have already removed by the time this returns, but + * that's okay; we just care whether the key was there at some point. 
+ * + * Return: 0 if the key is added, -ENOKEY if it isn't, or another -errno code + */ +int llcrypt_verify_key_added(struct super_block *sb, + const u8 identifier[LLCRYPT_KEY_IDENTIFIER_SIZE]) +{ + struct llcrypt_key_specifier mk_spec; + struct key *key, *mk_user; + struct llcrypt_master_key *mk; + int err; + + mk_spec.type = LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + memcpy(mk_spec.u.identifier, identifier, LLCRYPT_KEY_IDENTIFIER_SIZE); + + key = llcrypt_find_master_key(sb, &mk_spec); + if (IS_ERR(key)) { + err = PTR_ERR(key); + goto out; + } + mk = key->payload.data[0]; + mk_user = find_master_key_user(mk); + if (IS_ERR(mk_user)) { + err = PTR_ERR(mk_user); + } else { + key_put(mk_user); + err = 0; + } + key_put(key); +out: + if (err == -ENOKEY && capable(CAP_FOWNER)) + err = 0; + return err; +} + +/* + * Try to evict the inode's dentries from the dentry cache. If the inode is a + * directory, then it can have at most one dentry; however, that dentry may be + * pinned by child dentries, so first try to evict the children too. + */ +static void shrink_dcache_inode(struct inode *inode) +{ + struct dentry *dentry; + + if (S_ISDIR(inode->i_mode)) { + dentry = d_find_any_alias(inode); + if (dentry) { + shrink_dcache_parent(dentry); + dput(dentry); + } + } + d_prune_aliases(inode); +} + +static void evict_dentries_for_decrypted_inodes(struct llcrypt_master_key *mk) +{ + struct llcrypt_info *ci; + struct inode *inode; + struct inode *toput_inode = NULL; + + spin_lock(&mk->mk_decrypted_inodes_lock); + + list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) { + inode = ci->ci_inode; + if (igrab(inode) == NULL) + continue; + spin_unlock(&mk->mk_decrypted_inodes_lock); + + shrink_dcache_inode(inode); + iput(toput_inode); + toput_inode = inode; + + spin_lock(&mk->mk_decrypted_inodes_lock); + } + + spin_unlock(&mk->mk_decrypted_inodes_lock); + iput(toput_inode); +} + +static int check_for_busy_inodes(struct super_block *sb, + struct llcrypt_master_key *mk) +{ + struct list_head *pos; + size_t busy_count = 0; + unsigned long ino; + struct dentry *dentry; + char _path[256]; + char *path = NULL; + + spin_lock(&mk->mk_decrypted_inodes_lock); + + list_for_each(pos, &mk->mk_decrypted_inodes) + busy_count++; + + if (busy_count == 0) { + spin_unlock(&mk->mk_decrypted_inodes_lock); + return 0; + } + + { + /* select an example file to show for debugging purposes */ + struct inode *inode = + list_first_entry(&mk->mk_decrypted_inodes, + struct llcrypt_info, + ci_master_key_link)->ci_inode; + ino = inode->i_ino; + dentry = d_find_alias(inode); + } + spin_unlock(&mk->mk_decrypted_inodes_lock); + + if (dentry) { + path = dentry_path_raw(dentry, _path, sizeof(_path)); + dput(dentry); + } + if (IS_ERR_OR_NULL(path)) + path = "(unknown)"; + + llcrypt_warn(NULL, + "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu (%s)", + sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec), + master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, + ino, path); + return -EBUSY; +} + +static int try_to_lock_encrypted_files(struct super_block *sb, + struct llcrypt_master_key *mk) +{ + int err1; + int err2; + + /* + * An inode can't be evicted while it is dirty or has dirty pages. + * Thus, we first have to clean the inodes in ->mk_decrypted_inodes. + * + * Just do it the easy way: call sync_filesystem(). It's overkill, but + * it works, and it's more important to minimize the amount of caches we + * drop than the amount of data we sync. 
Also, unprivileged users can + * already call sync_filesystem() via sys_syncfs() or sys_sync(). + */ + down_read(&sb->s_umount); + err1 = sync_filesystem(sb); + up_read(&sb->s_umount); + /* If a sync error occurs, still try to evict as much as possible. */ + + /* + * Inodes are pinned by their dentries, so we have to evict their + * dentries. shrink_dcache_sb() would suffice, but would be overkill + * and inappropriate for use by unprivileged users. So instead go + * through the inodes' alias lists and try to evict each dentry. + */ + evict_dentries_for_decrypted_inodes(mk); + + /* + * evict_dentries_for_decrypted_inodes() already iput() each inode in + * the list; any inodes for which that dropped the last reference will + * have been evicted due to llcrypt_drop_inode() detecting the key + * removal and telling the VFS to evict the inode. So to finish, we + * just need to check whether any inodes couldn't be evicted. + */ + err2 = check_for_busy_inodes(sb, mk); + + return err1 ?: err2; +} + +/* + * Try to remove an llcrypt master encryption key. + * + * LL_IOC_REMOVE_ENCRYPTION_KEY (all_users=false) removes the current user's + * claim to the key, then removes the key itself if no other users have claims. + * LL_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS (all_users=true) always removes the + * key itself. + * + * To "remove the key itself", first we wipe the actual master key secret, so + * that no more inodes can be unlocked with it. Then we try to evict all cached + * inodes that had been unlocked with the key. + * + * If all inodes were evicted, then we unlink the llcrypt_master_key from the + * keyring. Otherwise it remains in the keyring in the "incompletely removed" + * state (without the actual secret key) where it tracks the list of remaining + * inodes. Userspace can execute the ioctl again later to retry eviction, or + * alternatively can re-add the secret key again. + * + * For more details, see the "Removing keys" section of + * Documentation/filesystems/llcrypt.rst. + */ +static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct llcrypt_remove_key_arg __user *uarg = _uarg; + struct llcrypt_remove_key_arg arg; + struct key *key; + struct llcrypt_master_key *mk; + u32 status_flags = 0; + int err; + bool dead; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (!valid_key_spec(&arg.key_spec)) + return -EINVAL; + + if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) + return -EINVAL; + + /* + * Only root can add and remove keys that are identified by an arbitrary + * descriptor rather than by a cryptographic hash. + */ + if (arg.key_spec.type == LLCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + + /* Find the key being removed. */ + key = llcrypt_find_master_key(sb, &arg.key_spec); + if (IS_ERR(key)) + return PTR_ERR(key); + mk = key->payload.data[0]; + + down_write(&key->sem); + + /* If relevant, remove current user's (or all users) claim to the key */ + if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) { + if (all_users) + err = keyring_clear(mk->mk_users); + else + err = remove_master_key_user(mk); + if (err) { + up_write(&key->sem); + goto out_put_key; + } + if (mk->mk_users->keys.nr_leaves_on_tree != 0) { + /* + * Other users have still added the key too. We removed + * the current user's claim to the key, but we still + * can't remove the key itself. 
+ */ + status_flags |= + LLCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS; + err = 0; + up_write(&key->sem); + goto out_put_key; + } + } + + /* No user claims remaining. Go ahead and wipe the secret. */ + dead = false; + if (is_master_key_secret_present(&mk->mk_secret)) { + down_write(&mk->mk_secret_sem); + wipe_master_key_secret(&mk->mk_secret); + dead = refcount_dec_and_test(&mk->mk_refcount); + up_write(&mk->mk_secret_sem); + } + up_write(&key->sem); + if (dead) { + /* + * No inodes reference the key, and we wiped the secret, so the + * key object is free to be removed from the keyring. + */ + key_invalidate(key); + err = 0; + } else { + /* Some inodes still reference this key; try to evict them. */ + err = try_to_lock_encrypted_files(sb, mk); + if (err == -EBUSY) { + status_flags |= + LLCRYPT_KEY_REMOVAL_STATUS_FLAG_FILES_BUSY; + err = 0; + } + } + /* + * We return 0 if we successfully did something: removed a claim to the + * key, wiped the secret, or tried locking the files again. Users need + * to check the informational status flags if they care whether the key + * has been fully removed including all files locked. + */ +out_put_key: + key_put(key); + if (err == 0) + err = put_user(status_flags, &uarg->removal_status_flags); + return err; +} + +int llcrypt_ioctl_remove_key(struct file *filp, void __user *uarg) +{ + return do_remove_key(filp, uarg, false); +} +EXPORT_SYMBOL_GPL(llcrypt_ioctl_remove_key); + +int llcrypt_ioctl_remove_key_all_users(struct file *filp, void __user *uarg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + return do_remove_key(filp, uarg, true); +} +EXPORT_SYMBOL_GPL(llcrypt_ioctl_remove_key_all_users); + +/* + * Retrieve the status of an llcrypt master encryption key. + * + * We set ->status to indicate whether the key is absent, present, or + * incompletely removed. "Incompletely removed" means that the master key + * secret has been removed, but some files which had been unlocked with it are + * still in use. This field allows applications to easily determine the state + * of an encrypted directory without using a hack such as trying to open a + * regular file in it (which can confuse the "incompletely removed" state with + * absent or present). + * + * In addition, for v2 policy keys we allow applications to determine, via + * ->status_flags and ->user_count, whether the key has been added by the + * current user, by other users, or by both. Most applications should not need + * this, since ordinarily only one user should know a given key. However, if a + * secret key is shared by multiple users, applications may wish to add an + * already-present key to prevent other users from removing it. This ioctl can + * be used to check whether that really is the case before the work is done to + * add the key --- which might e.g. require prompting the user for a passphrase. + * + * For more details, see the "LL_IOC_GET_ENCRYPTION_KEY_STATUS" section of + * Documentation/filesystems/llcrypt.rst. 
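For illustration, a userspace sketch of the status query described above (not part of the patch itself). It assumes that the UAPI definitions used in this file (struct llcrypt_get_key_status_arg, LL_IOC_GET_ENCRYPTION_KEY_STATUS, the LLCRYPT_KEY_STATUS_* values, LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER and LLCRYPT_KEY_IDENTIFIER_SIZE) are exported to userspace by an llcrypt UAPI header; the header name and most error handling are omitted.

/* Hypothetical userspace sketch; the llcrypt UAPI header is assumed. */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/types.h>
/* ... plus the header providing the llcrypt ioctl definitions ... */

static void print_key_status(int dir_fd,
			     const __u8 id[LLCRYPT_KEY_IDENTIFIER_SIZE])
{
	struct llcrypt_get_key_status_arg arg = { 0 };

	arg.key_spec.type = LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
	memcpy(arg.key_spec.u.identifier, id, LLCRYPT_KEY_IDENTIFIER_SIZE);

	if (ioctl(dir_fd, LL_IOC_GET_ENCRYPTION_KEY_STATUS, &arg) != 0) {
		perror("LL_IOC_GET_ENCRYPTION_KEY_STATUS");
		return;
	}

	if (arg.status == LLCRYPT_KEY_STATUS_ABSENT)
		puts("key absent");
	else if (arg.status == LLCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED)
		puts("secret removed, but some decrypted files are still in use");
	else
		printf("key present, user_count=%u%s\n", arg.user_count,
		       (arg.status_flags & LLCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF) ?
		       " (added by this user)" : "");
}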
+ */ +int llcrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct llcrypt_get_key_status_arg arg; + struct key *key; + struct llcrypt_master_key *mk; + int err; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (!valid_key_spec(&arg.key_spec)) + return -EINVAL; + + if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) + return -EINVAL; + + arg.status_flags = 0; + arg.user_count = 0; + memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved)); + + key = llcrypt_find_master_key(sb, &arg.key_spec); + if (IS_ERR(key)) { + if (key != ERR_PTR(-ENOKEY)) + return PTR_ERR(key); + arg.status = LLCRYPT_KEY_STATUS_ABSENT; + err = 0; + goto out; + } + mk = key->payload.data[0]; + down_read(&key->sem); + + if (!is_master_key_secret_present(&mk->mk_secret)) { + arg.status = LLCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED; + err = 0; + goto out_release_key; + } + + arg.status = LLCRYPT_KEY_STATUS_PRESENT; + if (mk->mk_users) { + struct key *mk_user; + + arg.user_count = mk->mk_users->keys.nr_leaves_on_tree; + mk_user = find_master_key_user(mk); + if (!IS_ERR(mk_user)) { + arg.status_flags |= + LLCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF; + key_put(mk_user); + } else if (mk_user != ERR_PTR(-ENOKEY)) { + err = PTR_ERR(mk_user); + goto out_release_key; + } + } + err = 0; +out_release_key: + up_read(&key->sem); + key_put(key); +out: + if (!err && copy_to_user(uarg, &arg, sizeof(arg))) + err = -EFAULT; + return err; +} +EXPORT_SYMBOL_GPL(llcrypt_ioctl_get_key_status); + +int __init llcrypt_init_keyring(void) +{ + int err; + + err = register_key_type(&key_type_llcrypt); + if (err) + return err; + + err = register_key_type(&key_type_llcrypt_user); + if (err) + goto err_unregister_llcrypt; + + return 0; + +err_unregister_llcrypt: + unregister_key_type(&key_type_llcrypt); + return err; +} + +void __exit llcrypt_exit_keyring(void) +{ + unregister_key_type(&key_type_llcrypt_user); + unregister_key_type(&key_type_llcrypt); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keysetup.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keysetup.c new file mode 100644 index 0000000000000..67fe888f895db --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keysetup.c @@ -0,0 +1,635 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Key setup facility for FS encryption support. + * + * Copyright (C) 2015, Google, Inc. + * + * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. + * Heavily modified since then. 
+ */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#include +#ifdef HAVE_CRYPTO_SHA2_HEADER +#include +#else +#include +#endif +#include +#include + +#include "llcrypt_private.h" + +#ifdef HAVE_CIPHER_H +#include + +MODULE_IMPORT_NS(CRYPTO_INTERNAL); +#endif + +static struct crypto_shash *essiv_hash_tfm; + +static struct llcrypt_mode available_modes[] = { + [LLCRYPT_MODE_NULL] = { + .friendly_name = "NULL", + .cipher_str = "null", + .keysize = 0, + .ivsize = 0, + }, + [LLCRYPT_MODE_AES_256_XTS] = { + .friendly_name = "AES-256-XTS", + .cipher_str = "xts(aes)", + .keysize = 64, + .ivsize = 16, + }, + [LLCRYPT_MODE_AES_256_CTS] = { + .friendly_name = "AES-256-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 32, + .ivsize = 16, + }, + [LLCRYPT_MODE_AES_128_CBC] = { + .friendly_name = "AES-128-CBC", + .cipher_str = "cbc(aes)", + .keysize = 16, + .ivsize = 16, + .needs_essiv = true, + }, + [LLCRYPT_MODE_AES_128_CTS] = { + .friendly_name = "AES-128-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 16, + .ivsize = 16, + }, + [LLCRYPT_MODE_ADIANTUM] = { + .friendly_name = "Adiantum", + .cipher_str = "adiantum(xchacha12,aes)", + .keysize = 32, + .ivsize = 32, + }, +}; + +static struct llcrypt_mode * +select_encryption_mode(const union llcrypt_policy *policy, + const struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + return &available_modes[llcrypt_policy_contents_mode(policy)]; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + return &available_modes[llcrypt_policy_fnames_mode(policy)]; + + WARN_ONCE(1, "llcrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return ERR_PTR(-EINVAL); +} + +/* Create a symmetric cipher object for the given encryption mode and key */ +struct crypto_skcipher *llcrypt_allocate_skcipher(struct llcrypt_mode *mode, + const u8 *raw_key, + const struct inode *inode) +{ + struct crypto_skcipher *tfm; + int err; + + if (!strcmp(mode->cipher_str, "null")) + return NULL; + + tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0); + if (IS_ERR(tfm)) { + if (PTR_ERR(tfm) == -ENOENT) { + llcrypt_warn(inode, + "Missing crypto API support for %s (API name: \"%s\")", + mode->friendly_name, mode->cipher_str); + return ERR_PTR(-ENOPKG); + } + llcrypt_err(inode, "Error allocating '%s' transform: %ld", + mode->cipher_str, PTR_ERR(tfm)); + return tfm; + } + if (unlikely(!mode->logged_impl_name)) { + /* + * llcrypt performance can vary greatly depending on which + * crypto algorithm implementation is used. Help people debug + * performance problems by logging the ->cra_driver_name the + * first time a mode is used. Note that multiple threads can + * race here, but it doesn't really matter. 
+ */ + mode->logged_impl_name = true; + pr_info("llcrypt: %s using implementation \"%s\"\n", + mode->friendly_name, + crypto_skcipher_alg(tfm)->base.cra_driver_name); + } + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); + err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize); + if (err) + goto err_free_tfm; + + return tfm; + +err_free_tfm: + crypto_free_skcipher(tfm); + return ERR_PTR(err); +} + +static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) +{ + struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); + + /* init hash transform on demand */ + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + if (PTR_ERR(tfm) == -ENOENT) { + llcrypt_warn(NULL, + "Missing crypto API support for SHA-256"); + return -ENOPKG; + } + llcrypt_err(NULL, + "Error allocating SHA-256 transform: %ld", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + + { + SHASH_DESC_ON_STACK(desc, tfm); + desc->tfm = tfm; + + return crypto_shash_digest(desc, key, keysize, salt); + } +} + +static int init_essiv_generator(struct llcrypt_info *ci, const u8 *raw_key, + int keysize) +{ + int err; + struct crypto_cipher *essiv_tfm; + u8 salt[SHA256_DIGEST_SIZE]; + + if (WARN_ON(ci->ci_mode->ivsize != AES_BLOCK_SIZE)) + return -EINVAL; + + essiv_tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(essiv_tfm)) + return PTR_ERR(essiv_tfm); + + ci->ci_essiv_tfm = essiv_tfm; + + err = derive_essiv_salt(raw_key, keysize, salt); + if (err) + goto out; + + /* + * Using SHA256 to derive the salt/key will result in AES-256 being + * used for IV generation. File contents encryption will still use the + * configured keysize (AES-128) nevertheless. 
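 *
 * Concretely, the transform keyed here is expected to be used by the IV
 * generation path (llcrypt_generate_iv()) roughly as
 *
 *	salt = SHA-256(raw_key)                 (32 bytes, hence an AES-256 key)
 *	IV   = AES-256-Encrypt(salt, 16-byte block holding le64(lblk_num))
 *
 * while file contents remain encrypted with AES-128-CBC under raw_key.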
+ */ + err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); + if (err) + goto out; + +out: + memzero_explicit(salt, sizeof(salt)); + return err; +} + +/* Given the per-file key, set up the file's crypto transform object(s) */ +int llcrypt_set_derived_key(struct llcrypt_info *ci, const u8 *derived_key) +{ + struct llcrypt_mode *mode = ci->ci_mode; + struct crypto_skcipher *ctfm; + int err; + + ctfm = llcrypt_allocate_skcipher(mode, derived_key, ci->ci_inode); + if (IS_ERR(ctfm)) + return PTR_ERR(ctfm); + + ci->ci_ctfm = ctfm; + + if (mode->needs_essiv) { + err = init_essiv_generator(ci, derived_key, mode->keysize); + if (err) { + llcrypt_warn(ci->ci_inode, + "Error initializing ESSIV generator: %d", + err); + return err; + } + } + return 0; +} + +static int setup_per_mode_key(struct llcrypt_info *ci, + struct llcrypt_master_key *mk) +{ + struct llcrypt_mode *mode = ci->ci_mode; + u8 mode_num = mode - available_modes; + struct crypto_skcipher *tfm, *prev_tfm; + u8 mode_key[LLCRYPT_MAX_KEY_SIZE]; + int err; + + if (WARN_ON(mode_num >= ARRAY_SIZE(mk->mk_mode_keys))) + return -EINVAL; + + /* pairs with cmpxchg() below */ + tfm = READ_ONCE(mk->mk_mode_keys[mode_num]); + if (likely(tfm != NULL)) + goto done; + + BUILD_BUG_ON(sizeof(mode_num) != 1); + err = llcrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_PER_MODE_KEY, + &mode_num, sizeof(mode_num), + mode_key, mode->keysize); + if (err) + return err; + tfm = llcrypt_allocate_skcipher(mode, mode_key, ci->ci_inode); + memzero_explicit(mode_key, mode->keysize); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + /* pairs with READ_ONCE() above */ + prev_tfm = cmpxchg(&mk->mk_mode_keys[mode_num], NULL, tfm); + if (prev_tfm != NULL) { + crypto_free_skcipher(tfm); + tfm = prev_tfm; + } +done: + ci->ci_ctfm = tfm; + return 0; +} + +static int llcrypt_setup_v2_file_key(struct llcrypt_info *ci, + struct llcrypt_master_key *mk) +{ + u8 derived_key[LLCRYPT_MAX_KEY_SIZE]; + int err; + + if (ci->ci_policy.v2.flags & LLCRYPT_POLICY_FLAG_DIRECT_KEY) { + /* + * DIRECT_KEY: instead of deriving per-file keys, the per-file + * nonce will be included in all the IVs. But unlike v1 + * policies, for v2 policies in this case we don't encrypt with + * the master key directly but rather derive a per-mode key. + * This ensures that the master key is consistently used only + * for HKDF, avoiding key reuse issues. + */ + if (!llcrypt_mode_supports_direct_key(ci->ci_mode)) { + llcrypt_warn(ci->ci_inode, + "Direct key flag not allowed with %s", + ci->ci_mode->friendly_name); + return -EINVAL; + } + return setup_per_mode_key(ci, mk); + } + + err = llcrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_PER_FILE_KEY, + ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE, + derived_key, ci->ci_mode->keysize); + if (err) + return err; + + err = llcrypt_set_derived_key(ci, derived_key); + memzero_explicit(derived_key, ci->ci_mode->keysize); + return err; +} + +/* + * Find the master key, then set up the inode's actual encryption key. + * + * If the master key is found in the filesystem-level keyring, then the + * corresponding 'struct key' is returned in *master_key_ret with + * ->mk_secret_sem read-locked. This is needed to ensure that only one task + * links the llcrypt_info into ->mk_decrypted_inodes (as multiple tasks may race + * to create an llcrypt_info for the same inode), and to synchronize the master + * key being removed with a new inode starting to use it. 
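 *
 * For orientation, the key-setup call flow implemented below is roughly:
 *
 *	llcrypt_get_encryption_info(inode)
 *	  setup_file_encryption_key(ci, &master_key)
 *	    v1 policy: llcrypt_setup_v1_file_key()   [keysetup_v1.c]
 *	               (or llcrypt_setup_v1_file_key_via_subscribed_keyrings()
 *	                when the key is not in the filesystem-level keyring)
 *	    v2 policy: llcrypt_setup_v2_file_key()
 *	      DIRECT_KEY flag set: setup_per_mode_key()
 *	      otherwise:           HKDF per-file key + llcrypt_set_derived_key()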
+ */ +static int setup_file_encryption_key(struct llcrypt_info *ci, + struct key **master_key_ret) +{ + struct key *key; + struct llcrypt_master_key *mk = NULL; + struct llcrypt_key_specifier mk_spec; + int err; + + switch (ci->ci_policy.version) { + case LLCRYPT_POLICY_V1: + mk_spec.type = LLCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; + memcpy(mk_spec.u.descriptor, + ci->ci_policy.v1.master_key_descriptor, + LLCRYPT_KEY_DESCRIPTOR_SIZE); + break; + case LLCRYPT_POLICY_V2: + mk_spec.type = LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + memcpy(mk_spec.u.identifier, + ci->ci_policy.v2.master_key_identifier, + LLCRYPT_KEY_IDENTIFIER_SIZE); + break; + default: + WARN_ON(1); + return -EINVAL; + } + + key = llcrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); + if (IS_ERR(key)) { + if (key != ERR_PTR(-ENOKEY) || + ci->ci_policy.version != LLCRYPT_POLICY_V1) + return PTR_ERR(key); + + /* + * As a legacy fallback for v1 policies, search for the key in + * the current task's subscribed keyrings too. Don't move this + * to before the search of ->lsi_master_keys, since users + * shouldn't be able to override filesystem-level keys. + */ + return llcrypt_setup_v1_file_key_via_subscribed_keyrings(ci); + } + + mk = key->payload.data[0]; + down_read(&mk->mk_secret_sem); + + /* Has the secret been removed (via LL_IOC_REMOVE_ENCRYPTION_KEY)? */ + if (!is_master_key_secret_present(&mk->mk_secret)) { + err = -ENOKEY; + goto out_release_key; + } + + /* + * Require that the master key be at least as long as the derived key. + * Otherwise, the derived key cannot possibly contain as much entropy as + * that required by the encryption mode it will be used for. For v1 + * policies it's also required for the KDF to work at all. + */ + if (mk->mk_secret.size < ci->ci_mode->keysize) { + llcrypt_warn(NULL, + "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", + master_key_spec_type(&mk_spec), + master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u, + mk->mk_secret.size, ci->ci_mode->keysize); + err = -ENOKEY; + goto out_release_key; + } + + switch (ci->ci_policy.version) { + case LLCRYPT_POLICY_V1: + err = llcrypt_setup_v1_file_key(ci, mk->mk_secret.raw); + break; + case LLCRYPT_POLICY_V2: + err = llcrypt_setup_v2_file_key(ci, mk); + break; + default: + WARN_ON(1); + err = -EINVAL; + break; + } + if (err) + goto out_release_key; + + *master_key_ret = key; + return 0; + +out_release_key: + up_read(&mk->mk_secret_sem); + key_put(key); + return err; +} + +static void put_crypt_info(struct llcrypt_info *ci) +{ + struct key *key; + + if (!ci) + return; + + if (ci->ci_direct_key) { + llcrypt_put_direct_key(ci->ci_direct_key); + } else if ((ci->ci_ctfm != NULL || ci->ci_essiv_tfm != NULL) && + !llcrypt_is_direct_key_policy(&ci->ci_policy)) { + if (ci->ci_ctfm) + crypto_free_skcipher(ci->ci_ctfm); + crypto_free_cipher(ci->ci_essiv_tfm); + } + + key = ci->ci_master_key; + if (key) { + struct llcrypt_master_key *mk = key->payload.data[0]; + + /* + * Remove this inode from the list of inodes that were unlocked + * with the master key. + * + * In addition, if we're removing the last inode from a key that + * already had its secret removed, invalidate the key so that it + * gets removed from ->lsi_master_keys. 
+ */ + spin_lock(&mk->mk_decrypted_inodes_lock); + list_del(&ci->ci_master_key_link); + spin_unlock(&mk->mk_decrypted_inodes_lock); + if (refcount_dec_and_test(&mk->mk_refcount)) + key_invalidate(key); + key_put(key); + } + kmem_cache_free(llcrypt_info_cachep, ci); +} + +int llcrypt_get_encryption_info(struct inode *inode) +{ + struct llcrypt_info *crypt_info; + union llcrypt_context ctx; + struct llcrypt_mode *mode; + struct key *master_key = NULL; + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + int res; + + if (llcrypt_has_encryption_key(inode)) + return 0; + + if (!lsi) + return -ENOKEY; + + res = llcrypt_initialize(lsi->lsi_cop->flags); + if (res) + return res; + + res = lsi->lsi_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0) { + if (!llcrypt_dummy_context_enabled(inode) || + IS_ENCRYPTED(inode)) { + llcrypt_warn(inode, + "Error %d getting encryption context", + res); + return res; + } + /* Fake up a context for an unencrypted directory */ + memset(&ctx, 0, sizeof(ctx)); + ctx.version = LLCRYPT_CONTEXT_V1; + ctx.v1.contents_encryption_mode = LLCRYPT_MODE_AES_256_XTS; + if (lsi->lsi_flags & LSI_FILENAME_ENC) { + ctx.v1.filenames_encryption_mode = + LLCRYPT_MODE_AES_256_CTS; + } else { + llcrypt_warn(inode, + "dummy enc: forcing filenames_encryption_mode to null"); + ctx.v1.filenames_encryption_mode = LLCRYPT_MODE_NULL; + } + memset(ctx.v1.master_key_descriptor, 0x42, + LLCRYPT_KEY_DESCRIPTOR_SIZE); + res = sizeof(ctx.v1); + } + + crypt_info = kmem_cache_zalloc(llcrypt_info_cachep, GFP_NOFS); + if (!crypt_info) + return -ENOMEM; + + crypt_info->ci_inode = inode; + + res = llcrypt_policy_from_context(&crypt_info->ci_policy, &ctx, res); + if (res) { + llcrypt_warn(inode, + "Unrecognized or corrupt encryption context"); + goto out; + } + + switch (ctx.version) { + case LLCRYPT_CONTEXT_V1: + memcpy(crypt_info->ci_nonce, ctx.v1.nonce, + FS_KEY_DERIVATION_NONCE_SIZE); + break; + case LLCRYPT_CONTEXT_V2: + memcpy(crypt_info->ci_nonce, ctx.v2.nonce, + FS_KEY_DERIVATION_NONCE_SIZE); + break; + default: + WARN_ON(1); + res = -EINVAL; + goto out; + } + + if (!llcrypt_supported_policy(&crypt_info->ci_policy, inode)) { + res = -EINVAL; + goto out; + } + + mode = select_encryption_mode(&crypt_info->ci_policy, inode); + if (IS_ERR(mode)) { + res = PTR_ERR(mode); + goto out; + } + WARN_ON(mode->ivsize > LLCRYPT_MAX_IV_SIZE); + crypt_info->ci_mode = mode; + + res = setup_file_encryption_key(crypt_info, &master_key); + if (res) + goto out; + + if (cmpxchg_release(&(llcrypt_info_nocast(inode)), NULL, + crypt_info) == NULL) { + if (master_key) { + struct llcrypt_master_key *mk = + master_key->payload.data[0]; + + refcount_inc(&mk->mk_refcount); + crypt_info->ci_master_key = key_get(master_key); + spin_lock(&mk->mk_decrypted_inodes_lock); + list_add(&crypt_info->ci_master_key_link, + &mk->mk_decrypted_inodes); + spin_unlock(&mk->mk_decrypted_inodes_lock); + } + crypt_info = NULL; + } + res = 0; +out: + if (master_key) { + struct llcrypt_master_key *mk = master_key->payload.data[0]; + + up_read(&mk->mk_secret_sem); + key_put(master_key); + } + if (res == -ENOKEY) + res = 0; + put_crypt_info(crypt_info); + return res; +} +EXPORT_SYMBOL(llcrypt_get_encryption_info); + +/** + * llcrypt_put_encryption_info - free most of an inode's llcrypt data + * + * Free the inode's llcrypt_info. Filesystems must call this when the inode is + * being evicted. An RCU grace period need not have elapsed yet. 
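As an illustration of the calling conventions documented for llcrypt_put_encryption_info(), llcrypt_free_inode() and llcrypt_drop_inode() below, here is a minimal sketch of how a filesystem might hook them into its super_operations. The "myfs" names are hypothetical, the llcrypt header path is omitted, and the sketch assumes a kernel with the ->free_inode() callback; generic_drop_inode() is only an example fallback policy.

/* Hypothetical sketch; not part of this patch. */
#include <linux/fs.h>
#include <linux/mm.h>
/* ... plus the exported llcrypt header declaring the helpers used below ... */

static int myfs_drop_inode(struct inode *inode)
{
	/* Evict now if llcrypt reports that the inode's master key is gone. */
	return generic_drop_inode(inode) || llcrypt_drop_inode(inode);
}

static void myfs_evict_inode(struct inode *inode)
{
	truncate_inode_pages_final(&inode->i_data);
	clear_inode(inode);
	llcrypt_put_encryption_info(inode);	/* frees the llcrypt_info */
}

static void myfs_free_inode(struct inode *inode)
{
	llcrypt_free_inode(inode);	/* ->free_inode() runs after an RCU grace period */
	/* ... then free the filesystem's own inode container ... */
}

static const struct super_operations myfs_sops = {
	.drop_inode	= myfs_drop_inode,
	.evict_inode	= myfs_evict_inode,
	.free_inode	= myfs_free_inode,
};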
+ */ +void llcrypt_put_encryption_info(struct inode *inode) +{ + put_crypt_info(llcrypt_info(inode)); + llcrypt_info_nocast(inode) = NULL; +} +EXPORT_SYMBOL(llcrypt_put_encryption_info); + +/** + * llcrypt_free_inode - free an inode's llcrypt data requiring RCU delay + * + * Free the inode's cached decrypted symlink target, if any. Filesystems must + * call this after an RCU grace period, just before they free the inode. + */ +void llcrypt_free_inode(struct inode *inode) +{ + if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode)) { + kfree(inode->i_link); + inode->i_link = NULL; + } +} +EXPORT_SYMBOL(llcrypt_free_inode); + +/** + * llcrypt_drop_inode - check whether the inode's master key has been removed + * + * Filesystems supporting llcrypt must call this from their ->drop_inode() + * method so that encrypted inodes are evicted as soon as they're no longer in + * use and their master key has been removed. + * + * Return: 1 if llcrypt wants the inode to be evicted now, otherwise 0 + */ +int llcrypt_drop_inode(struct inode *inode) +{ + const struct llcrypt_info *ci; + const struct llcrypt_master_key *mk; + + ci = (struct llcrypt_info *)READ_ONCE(llcrypt_info_nocast(inode)); + /* + * If ci is NULL, then the inode doesn't have an encryption key set up + * so it's irrelevant. If ci_master_key is NULL, then the master key + * was provided via the legacy mechanism of the process-subscribed + * keyrings, so we don't know whether it's been removed or not. + */ + if (!ci || !ci->ci_master_key) + return 0; + mk = ci->ci_master_key->payload.data[0]; + + /* + * Note: since we aren't holding ->mk_secret_sem, the result here can + * immediately become outdated. But there's no correctness problem with + * unnecessarily evicting. Nor is there a correctness problem with not + * evicting while iput() is racing with the key being removed, since + * then the thread removing the key will either evict the inode itself + * or will correctly detect that it wasn't evicted due to the race. 
+ */ + return !is_master_key_secret_present(&mk->mk_secret); +} +EXPORT_SYMBOL_GPL(llcrypt_drop_inode); + +bool llcrypt_has_encryption_key(const struct inode *inode) +{ + /* pairs with cmpxchg_release() in llcrypt_get_encryption_info() */ + return READ_ONCE(llcrypt_info_nocast(inode)) != NULL; +} +EXPORT_SYMBOL_GPL(llcrypt_has_encryption_key); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keysetup_v1.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keysetup_v1.c new file mode 100644 index 0000000000000..e56bce3717d9a --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keysetup_v1.c @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Key setup for v1 encryption policies + * + * Copyright 2015, 2019 Google LLC + */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +/* + * This file implements compatibility functions for the original encryption + * policy version ("v1"), including: + * + * - Deriving per-file keys using the AES-128-ECB based KDF + * (rather than the new method of using HKDF-SHA512) + * + * - Retrieving llcrypt master keys from process-subscribed keyrings + * (rather than the new method of using a filesystem-level keyring) + * + * - Handling policies with the DIRECT_KEY flag set using a master key table + * (rather than the new method of implementing DIRECT_KEY with per-mode keys + * managed alongside the master keys in the filesystem-level keyring) + */ + +#include +#include +#include +#include +#include + +#include "llcrypt_private.h" + +/* Table of keys referenced by DIRECT_KEY policies */ +static DEFINE_HASHTABLE(llcrypt_direct_keys, 6); /* 6 bits = 64 buckets */ +static DEFINE_SPINLOCK(llcrypt_direct_keys_lock); + +/* + * v1 key derivation function. This generates the derived key by encrypting the + * master key with AES-128-ECB using the nonce as the AES key. This provides a + * unique derived key with sufficient entropy for each inode. However, it's + * nonstandard, non-extensible, doesn't evenly distribute the entropy from the + * master key, and is trivially reversible: an attacker who compromises a + * derived key can "decrypt" it to get back to the master key, then derive any + * other key. For all new code, use HKDF instead. + * + * The master key must be at least as long as the derived key. If the master + * key is longer, then only the first 'derived_keysize' bytes are used. 
+ */ +static int derive_key_aes(const u8 *master_key, + const u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE], + u8 *derived_key, unsigned int derived_keysize) +{ + int res = 0; + struct skcipher_request *req = NULL; + DECLARE_CRYPTO_WAIT(wait); + struct scatterlist src_sg, dst_sg; + struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); + req = skcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + skcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, &wait); + res = crypto_skcipher_setkey(tfm, nonce, FS_KEY_DERIVATION_NONCE_SIZE); + if (res < 0) + goto out; + + sg_init_one(&src_sg, master_key, derived_keysize); + sg_init_one(&dst_sg, derived_key, derived_keysize); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize, + NULL); + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); +out: + skcipher_request_free(req); + crypto_free_skcipher(tfm); + return res; +} + +/* + * Search the current task's subscribed keyrings for a "logon" key with + * description prefix:descriptor, and if found acquire a read lock on it and + * return a pointer to its validated payload in *payload_ret. + */ +static struct key * +find_and_lock_process_key(const char *prefix, + const u8 descriptor[LLCRYPT_KEY_DESCRIPTOR_SIZE], + unsigned int min_keysize, + const struct llcrypt_key **payload_ret) +{ + char *description; + struct key *key; + const struct user_key_payload *ukp; + const struct llcrypt_key *payload; + + description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + LLCRYPT_KEY_DESCRIPTOR_SIZE, descriptor); + if (!description) + return ERR_PTR(-ENOMEM); + + key = request_key(&key_type_logon, description, NULL); + kfree(description); + if (IS_ERR(key)) + return key; + + down_read(&key->sem); + ukp = user_key_payload_locked(key); + + if (!ukp) /* was the key revoked before we acquired its semaphore? */ + goto invalid; + + payload = (const struct llcrypt_key *)ukp->data; + + if (ukp->datalen != sizeof(struct llcrypt_key) || + payload->size < 1 || payload->size > LLCRYPT_MAX_KEY_SIZE) { + llcrypt_warn(NULL, + "key with description '%s' has invalid payload", + key->description); + goto invalid; + } + + if (payload->size < min_keysize) { + llcrypt_warn(NULL, + "key with description '%s' is too short (got %u bytes, need %u+ bytes)", + key->description, payload->size, min_keysize); + goto invalid; + } + + *payload_ret = payload; + return key; + +invalid: + up_read(&key->sem); + key_put(key); + return ERR_PTR(-ENOKEY); +} + +/* Master key referenced by DIRECT_KEY policy */ +struct llcrypt_direct_key { + struct hlist_node dk_node; + refcount_t dk_refcount; + const struct llcrypt_mode *dk_mode; + struct crypto_skcipher *dk_ctfm; + u8 dk_descriptor[LLCRYPT_KEY_DESCRIPTOR_SIZE]; + u8 dk_raw[LLCRYPT_MAX_KEY_SIZE]; +}; + +static void free_direct_key(struct llcrypt_direct_key *dk) +{ + if (dk) { + crypto_free_skcipher(dk->dk_ctfm); + kfree_sensitive(dk); + } +} + +void llcrypt_put_direct_key(struct llcrypt_direct_key *dk) +{ + if (!refcount_dec_and_lock(&dk->dk_refcount, &llcrypt_direct_keys_lock)) + return; + hash_del(&dk->dk_node); + spin_unlock(&llcrypt_direct_keys_lock); + + free_direct_key(dk); +} + +/* + * Find/insert the given key into the llcrypt_direct_keys table. If found, it + * is returned with elevated refcount, and 'to_insert' is freed if non-NULL. 
If + * not found, 'to_insert' is inserted and returned if it's non-NULL; otherwise + * NULL is returned. + */ +static struct llcrypt_direct_key * +find_or_insert_direct_key(struct llcrypt_direct_key *to_insert, + const u8 *raw_key, const struct llcrypt_info *ci) +{ + unsigned long hash_key; + struct llcrypt_direct_key *dk; + + /* + * Careful: to avoid potentially leaking secret key bytes via timing + * information, we must key the hash table by descriptor rather than by + * raw key, and use crypto_memneq() when comparing raw keys. + */ + + BUILD_BUG_ON(sizeof(hash_key) > LLCRYPT_KEY_DESCRIPTOR_SIZE); + memcpy(&hash_key, ci->ci_policy.v1.master_key_descriptor, + sizeof(hash_key)); + + spin_lock(&llcrypt_direct_keys_lock); + hash_for_each_possible(llcrypt_direct_keys, dk, dk_node, hash_key) { + if (memcmp(ci->ci_policy.v1.master_key_descriptor, + dk->dk_descriptor, LLCRYPT_KEY_DESCRIPTOR_SIZE) != 0) + continue; + if (ci->ci_mode != dk->dk_mode) + continue; + if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize)) + continue; + /* using existing tfm with same (descriptor, mode, raw_key) */ + refcount_inc(&dk->dk_refcount); + spin_unlock(&llcrypt_direct_keys_lock); + free_direct_key(to_insert); + return dk; + } + if (to_insert) + hash_add(llcrypt_direct_keys, &to_insert->dk_node, hash_key); + spin_unlock(&llcrypt_direct_keys_lock); + return to_insert; +} + +/* Prepare to encrypt directly using the master key in the given mode */ +static struct llcrypt_direct_key * +llcrypt_get_direct_key(const struct llcrypt_info *ci, const u8 *raw_key) +{ + struct llcrypt_direct_key *dk; + int err; + + /* Is there already a tfm for this key? */ + dk = find_or_insert_direct_key(NULL, raw_key, ci); + if (dk) + return dk; + + /* Nope, allocate one. */ + dk = kzalloc(sizeof(*dk), GFP_NOFS); + if (!dk) + return ERR_PTR(-ENOMEM); + refcount_set(&dk->dk_refcount, 1); + dk->dk_mode = ci->ci_mode; + dk->dk_ctfm = llcrypt_allocate_skcipher(ci->ci_mode, raw_key, + ci->ci_inode); + if (IS_ERR(dk->dk_ctfm)) { + err = PTR_ERR(dk->dk_ctfm); + dk->dk_ctfm = NULL; + goto err_free_dk; + } + memcpy(dk->dk_descriptor, ci->ci_policy.v1.master_key_descriptor, + LLCRYPT_KEY_DESCRIPTOR_SIZE); + memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize); + + return find_or_insert_direct_key(dk, raw_key, ci); + +err_free_dk: + free_direct_key(dk); + return ERR_PTR(err); +} + +/* v1 policy, DIRECT_KEY: use the master key directly */ +static int setup_v1_file_key_direct(struct llcrypt_info *ci, + const u8 *raw_master_key) +{ + const struct llcrypt_mode *mode = ci->ci_mode; + struct llcrypt_direct_key *dk; + + if (!llcrypt_mode_supports_direct_key(mode)) { + llcrypt_warn(ci->ci_inode, + "Direct key mode not allowed with %s", + mode->friendly_name); + return -EINVAL; + } + + if (ci->ci_policy.v1.contents_encryption_mode != + ci->ci_policy.v1.filenames_encryption_mode) { + llcrypt_warn(ci->ci_inode, + "Direct key mode not allowed with different contents and filenames modes"); + return -EINVAL; + } + + /* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */ + if (WARN_ON(mode->needs_essiv)) + return -EINVAL; + + dk = llcrypt_get_direct_key(ci, raw_master_key); + if (IS_ERR(dk)) + return PTR_ERR(dk); + ci->ci_direct_key = dk; + ci->ci_ctfm = dk->dk_ctfm; + return 0; +} + +/* v1 policy, !DIRECT_KEY: derive the file's encryption key */ +static int setup_v1_file_key_derived(struct llcrypt_info *ci, + const u8 *raw_master_key) +{ + u8 *derived_key; + int err; + + /* + * This cannot be a stack buffer because it will be passed to the + * 
scatterlist crypto API during derive_key_aes(). + */ + derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS); + if (!derived_key) + return -ENOMEM; + + err = derive_key_aes(raw_master_key, ci->ci_nonce, + derived_key, ci->ci_mode->keysize); + if (err) + goto out; + + err = llcrypt_set_derived_key(ci, derived_key); +out: + kfree_sensitive(derived_key); + return err; +} + +int llcrypt_setup_v1_file_key(struct llcrypt_info *ci, const u8 *raw_master_key) +{ + if (ci->ci_policy.v1.flags & LLCRYPT_POLICY_FLAG_DIRECT_KEY) + return setup_v1_file_key_direct(ci, raw_master_key); + else + return setup_v1_file_key_derived(ci, raw_master_key); +} + +int llcrypt_setup_v1_file_key_via_subscribed_keyrings(struct llcrypt_info *ci) +{ + struct key *key; + const struct llcrypt_key *payload; + int err; + + key = find_and_lock_process_key(LLCRYPT_KEY_DESC_PREFIX, + ci->ci_policy.v1.master_key_descriptor, + ci->ci_mode->keysize, &payload); + if (key == ERR_PTR(-ENOKEY)) { + struct lustre_sb_info *lsi = s2lsi(ci->ci_inode->i_sb); + + if (lsi && lsi->lsi_cop->key_prefix) { + key = + find_and_lock_process_key(lsi->lsi_cop->key_prefix, + ci->ci_policy.v1.master_key_descriptor, + ci->ci_mode->keysize, + &payload); + } + } + if (IS_ERR(key)) + return PTR_ERR(key); + + err = llcrypt_setup_v1_file_key(ci, payload->raw); + up_read(&key->sem); + key_put(key); + return err; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/llcrypt_private.h b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/llcrypt_private.h new file mode 100644 index 0000000000000..06eafaf2b80a9 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/llcrypt_private.h @@ -0,0 +1,499 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * llcrypt_private.h + * + * Copyright (C) 2015, Google, Inc. + * + * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. + * Heavily modified since then. + */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#ifndef _LLCRYPT_PRIVATE_H +#define _LLCRYPT_PRIVATE_H + +#include +#include +#include + +#ifndef CRYPTO_TFM_REQ_FORBID_WEAK_KEYS +#define CRYPTO_TFM_REQ_FORBID_WEAK_KEYS CRYPTO_TFM_REQ_WEAK_KEY +#endif + +#define llcrypt_info(inode) ((struct llcrypt_info *)(inode)->i_private) +#define llcrypt_info_nocast(inode) ((inode)->i_private) + +#define CONST_STRLEN(str) (sizeof(str) - 1) + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 + +#define LLCRYPT_MIN_KEY_SIZE 16 + +#define LLCRYPT_CONTEXT_V1 1 +#define LLCRYPT_CONTEXT_V2 2 + +struct llcrypt_context_v1 { + u8 version; /* LLCRYPT_CONTEXT_V1 */ + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[LLCRYPT_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +}; + +struct llcrypt_context_v2 { + u8 version; /* LLCRYPT_CONTEXT_V2 */ + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 __reserved[4]; + u8 master_key_identifier[LLCRYPT_KEY_IDENTIFIER_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +}; + +/** + * llcrypt_context - the encryption context of an inode + * + * This is the on-disk equivalent of an llcrypt_policy, stored alongside each + * encrypted file usually in a hidden extended attribute. It contains the + * fields from the llcrypt_policy, in order to identify the encryption algorithm + * and key with which the file is encrypted. It also contains a nonce that was + * randomly generated by llcrypt itself; this is used as KDF input or as a tweak + * to cause different files to be encrypted differently. 
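 *
 * For reference, the on-disk sizes implied by the structs above (and enforced
 * by the BUILD_BUG_ON()s in llcrypt_context_size() below) are:
 *
 *	v1: 4 x 1-byte fields + 8-byte descriptor + 16-byte nonce = 28 bytes
 *	v2: 4 x 1-byte fields + 4 reserved bytes + 16-byte identifier
 *	    + 16-byte nonce = 40 bytes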
+ */ +union llcrypt_context { + u8 version; + struct llcrypt_context_v1 v1; + struct llcrypt_context_v2 v2; +}; + +/* + * Return the size expected for the given llcrypt_context based on its version + * number, or 0 if the context version is unrecognized. + */ +static inline int llcrypt_context_size(const union llcrypt_context *ctx) +{ + switch (ctx->version) { + case LLCRYPT_CONTEXT_V1: + BUILD_BUG_ON(sizeof(ctx->v1) != 28); + return sizeof(ctx->v1); + case LLCRYPT_CONTEXT_V2: + BUILD_BUG_ON(sizeof(ctx->v2) != 40); + return sizeof(ctx->v2); + } + return 0; +} + +#undef llcrypt_policy +union llcrypt_policy { + u8 version; + struct llcrypt_policy_v1 v1; + struct llcrypt_policy_v2 v2; +}; + +/* + * Return the size expected for the given llcrypt_policy based on its version + * number, or 0 if the policy version is unrecognized. + */ +static inline int llcrypt_policy_size(const union llcrypt_policy *policy) +{ + switch (policy->version) { + case LLCRYPT_POLICY_V1: + return sizeof(policy->v1); + case LLCRYPT_POLICY_V2: + return sizeof(policy->v2); + } + return 0; +} + +/* Return the contents encryption mode of a valid encryption policy */ +static inline u8 +llcrypt_policy_contents_mode(const union llcrypt_policy *policy) +{ + switch (policy->version) { + case LLCRYPT_POLICY_V1: + return policy->v1.contents_encryption_mode; + case LLCRYPT_POLICY_V2: + return policy->v2.contents_encryption_mode; + } + BUG(); +} + +/* Return the filenames encryption mode of a valid encryption policy */ +static inline u8 +llcrypt_policy_fnames_mode(const union llcrypt_policy *policy) +{ + switch (policy->version) { + case LLCRYPT_POLICY_V1: + return policy->v1.filenames_encryption_mode; + case LLCRYPT_POLICY_V2: + return policy->v2.filenames_encryption_mode; + } + BUG(); +} + +/* Return the flags (LLCRYPT_POLICY_FLAG*) of a valid encryption policy */ +static inline u8 +llcrypt_policy_flags(const union llcrypt_policy *policy) +{ + switch (policy->version) { + case LLCRYPT_POLICY_V1: + return policy->v1.flags; + case LLCRYPT_POLICY_V2: + return policy->v2.flags; + } + BUG(); +} + +static inline bool +llcrypt_is_direct_key_policy(const union llcrypt_policy *policy) +{ + return llcrypt_policy_flags(policy) & LLCRYPT_POLICY_FLAG_DIRECT_KEY; +} + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct llcrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + +/* + * llcrypt_info - the "encryption key" for an inode + * + * When an encrypted file's key is made available, an instance of this struct is + * allocated and stored in '(struct llcrypt_info *)inode->i_private'. + * Once created, it remains until the inode is evicted. + */ +struct llcrypt_info { + + /* The actual crypto transform used for encryption and decryption */ + struct crypto_skcipher *ci_ctfm; + + /* + * Cipher for ESSIV IV generation. Only set for CBC contents + * encryption, otherwise is NULL. + */ + struct crypto_cipher *ci_essiv_tfm; + + /* + * Encryption mode used for this inode. It corresponds to either the + * contents or filenames encryption mode, depending on the inode type. + */ + struct llcrypt_mode *ci_mode; + + /* Back-pointer to the inode */ + struct inode *ci_inode; + + /* + * The master key with which this inode was unlocked (decrypted). This + * will be NULL if the master key was found in a process-subscribed + * keyring rather than in the filesystem-level keyring. 
+ */ + struct key *ci_master_key; + + /* + * Link in list of inodes that were unlocked with the master key. + * Only used when ->ci_master_key is set. + */ + struct list_head ci_master_key_link; + + /* + * If non-NULL, then encryption is done using the master key directly + * and ci_ctfm will equal ci_direct_key->dk_ctfm. + */ + struct llcrypt_direct_key *ci_direct_key; + + /* The encryption policy used by this inode */ + union llcrypt_policy ci_policy; + + /* This inode's nonce, copied from the llcrypt_context */ + u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +}; + +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} llcrypt_direction_t; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 + +static inline bool llcrypt_valid_enc_modes(u32 contents_mode, + u32 filenames_mode) +{ + if (contents_mode == LLCRYPT_MODE_AES_128_CBC && + (filenames_mode == LLCRYPT_MODE_AES_128_CTS || + filenames_mode == LLCRYPT_MODE_NULL)) + return true; + + if (contents_mode == LLCRYPT_MODE_AES_256_XTS && + (filenames_mode == LLCRYPT_MODE_AES_256_CTS || + filenames_mode == LLCRYPT_MODE_NULL)) + return true; + + if (contents_mode == LLCRYPT_MODE_ADIANTUM && + (filenames_mode == LLCRYPT_MODE_ADIANTUM || + filenames_mode == LLCRYPT_MODE_NULL)) + return true; + + return false; +} + +/* crypto.c */ +extern struct kmem_cache *llcrypt_info_cachep; +extern int llcrypt_initialize(unsigned int cop_flags); +extern int llcrypt_crypt_block(const struct inode *inode, + llcrypt_direction_t rw, u64 lblk_num, + struct page *src_page, struct page *dest_page, + unsigned int len, unsigned int offs, + gfp_t gfp_flags); +extern struct page *llcrypt_alloc_bounce_page(gfp_t gfp_flags); +extern const struct dentry_operations llcrypt_d_ops; + +extern void __printf(3, 4) __cold +llcrypt_msg(const struct inode *inode, int mask, const char *fmt, ...); + +#define llcrypt_warn(inode, fmt, ...) \ + llcrypt_msg((inode), D_SEC, fmt, ##__VA_ARGS__) +#define llcrypt_err(inode, fmt, ...) \ + llcrypt_msg((inode), D_ERROR, fmt, ##__VA_ARGS__) + +#define LLCRYPT_MAX_IV_SIZE 32 + +union llcrypt_iv { + struct { + /* logical block number within the file */ + __le64 lblk_num; + + /* per-file nonce; only set in DIRECT_KEY mode */ + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; + }; + u8 raw[LLCRYPT_MAX_IV_SIZE]; +}; + +void llcrypt_generate_iv(union llcrypt_iv *iv, u64 lblk_num, + const struct llcrypt_info *ci); + +/* fname.c */ +extern int fname_encrypt(struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen); +extern bool llcrypt_fname_encrypted_size(const struct inode *inode, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret); + +/* hkdf.c */ + +struct llcrypt_hkdf { + struct crypto_shash *hmac_tfm; +}; + +extern int llcrypt_init_hkdf(struct llcrypt_hkdf *hkdf, const u8 *master_key, + unsigned int master_key_size); + +/* + * The list of contexts in which llcrypt uses HKDF. These values are used as + * the first byte of the HKDF application-specific info string to guarantee that + * info strings are never repeated between contexts. This ensures that all HKDF + * outputs are unique and cryptographically isolated, i.e. knowledge of one + * output doesn't reveal another. 
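 *
 * For example, the two derivations performed in keysetup.c pass
 *
 *	HKDF_CONTEXT_PER_FILE_KEY followed by the 16-byte per-file nonce, and
 *	HKDF_CONTEXT_PER_MODE_KEY followed by the 1-byte mode number,
 *
 * so the leading context byte alone keeps the derived key spaces disjoint.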
+ */ +#define HKDF_CONTEXT_KEY_IDENTIFIER 1 +#define HKDF_CONTEXT_PER_FILE_KEY 2 +#define HKDF_CONTEXT_PER_MODE_KEY 3 + +extern int llcrypt_hkdf_expand(struct llcrypt_hkdf *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen); + +extern void llcrypt_destroy_hkdf(struct llcrypt_hkdf *hkdf); + +/* keyring.c */ + +/* + * llcrypt_master_key_secret - secret key material of an in-use master key + */ +struct llcrypt_master_key_secret { + + /* + * For v2 policy keys: HKDF context keyed by this master key. + * For v1 policy keys: not set (hkdf.hmac_tfm == NULL). + */ + struct llcrypt_hkdf hkdf; + + /* Size of the raw key in bytes. Set even if ->raw isn't set. */ + u32 size; + + /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */ + u8 raw[LLCRYPT_MAX_KEY_SIZE]; + +} __randomize_layout; + +/* + * llcrypt_master_key - an in-use master key + * + * This represents a master encryption key which has been added to the + * filesystem and can be used to "unlock" the encrypted files which were + * encrypted with it. + */ +struct llcrypt_master_key { + + /* + * The secret key material. After LL_IOC_REMOVE_ENCRYPTION_KEY is + * executed, this is wiped and no new inodes can be unlocked with this + * key; however, there may still be inodes in ->mk_decrypted_inodes + * which could not be evicted. As long as some inodes still remain, + * LL_IOC_REMOVE_ENCRYPTION_KEY can be retried, or + * LL_IOC_ADD_ENCRYPTION_KEY can add the secret again. + * + * Locking: protected by key->sem (outer) and mk_secret_sem (inner). + * The reason for two locks is that key->sem also protects modifying + * mk_users, which ranks it above the semaphore for the keyring key + * type, which is in turn above page faults (via keyring_read). But + * sometimes filesystems call llcrypt_get_encryption_info() from within + * a transaction, which ranks it below page faults. So we need a + * separate lock which protects mk_secret but not also mk_users. + */ + struct llcrypt_master_key_secret mk_secret; + struct rw_semaphore mk_secret_sem; + + /* + * For v1 policy keys: an arbitrary key descriptor which was assigned by + * userspace (->descriptor). + * + * For v2 policy keys: a cryptographic hash of this key (->identifier). + */ + struct llcrypt_key_specifier mk_spec; + + /* + * Keyring which contains a key of type 'key_type_llcrypt_user' for each + * user who has added this key. Normally each key will be added by just + * one user, but it's possible that multiple users share a key, and in + * that case we need to keep track of those users so that one user can't + * remove the key before the others want it removed too. + * + * This is NULL for v1 policy keys; those can only be added by root. + * + * Locking: in addition to this keyrings own semaphore, this is + * protected by the master key's key->sem, so we can do atomic + * search+insert. It can also be searched without taking any locks, but + * in that case the returned key may have already been removed. + */ + struct key *mk_users; + + /* + * Length of ->mk_decrypted_inodes, plus one if mk_secret is present. + * Once this goes to 0, the master key is removed from ->lsi_master_keys. + * The 'struct llcrypt_master_key' will continue to live as long as the + * 'struct key' whose payload it is, but we won't let this reference + * count rise again. + */ + refcount_t mk_refcount; + + /* + * List of inodes that were unlocked using this key. This allows the + * inodes to be evicted efficiently if the key is removed. 
+ */ + struct list_head mk_decrypted_inodes; + spinlock_t mk_decrypted_inodes_lock; + + /* Per-mode tfms for DIRECT_KEY policies, allocated on-demand */ + struct crypto_skcipher *mk_mode_keys[__LLCRYPT_MODE_MAX + 1]; + +} __randomize_layout; + +static inline bool +is_master_key_secret_present(const struct llcrypt_master_key_secret *secret) +{ + /* + * The READ_ONCE() is only necessary for llcrypt_drop_inode() and + * llcrypt_key_describe(). These run in atomic context, so they can't + * take ->mk_secret_sem and thus 'secret' can change concurrently which + * would be a data race. But they only need to know whether the secret + * *was* present at the time of check, so READ_ONCE() suffices. + */ + return READ_ONCE(secret->size) != 0; +} + +static inline const char *master_key_spec_type( + const struct llcrypt_key_specifier *spec) +{ + switch (spec->type) { + case LLCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + return "descriptor"; + case LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + return "identifier"; + } + return "[unknown]"; +} + +static inline int master_key_spec_len(const struct llcrypt_key_specifier *spec) +{ + switch (spec->type) { + case LLCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + return LLCRYPT_KEY_DESCRIPTOR_SIZE; + case LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + return LLCRYPT_KEY_IDENTIFIER_SIZE; + } + return 0; +} + +extern struct key * +llcrypt_find_master_key(struct super_block *sb, + const struct llcrypt_key_specifier *mk_spec); + +extern int llcrypt_verify_key_added(struct super_block *sb, + const u8 identifier[LLCRYPT_KEY_IDENTIFIER_SIZE]); + +extern int __init llcrypt_init_keyring(void); + +extern void __exit llcrypt_exit_keyring(void); + +/* keysetup.c */ + +struct llcrypt_mode { + const char *friendly_name; + const char *cipher_str; + int keysize; + int ivsize; + bool logged_impl_name; + bool needs_essiv; +}; + +static inline bool +llcrypt_mode_supports_direct_key(const struct llcrypt_mode *mode) +{ + return mode->ivsize >= offsetofend(union llcrypt_iv, nonce); +} + +extern struct crypto_skcipher * +llcrypt_allocate_skcipher(struct llcrypt_mode *mode, const u8 *raw_key, + const struct inode *inode); + +extern int llcrypt_set_derived_key(struct llcrypt_info *ci, + const u8 *derived_key); + +/* keysetup_v1.c */ + +extern void llcrypt_put_direct_key(struct llcrypt_direct_key *dk); + +extern int llcrypt_setup_v1_file_key(struct llcrypt_info *ci, + const u8 *raw_master_key); + +extern int llcrypt_setup_v1_file_key_via_subscribed_keyrings( + struct llcrypt_info *ci); +/* policy.c */ + +extern bool llcrypt_policies_equal(const union llcrypt_policy *policy1, + const union llcrypt_policy *policy2); +extern bool llcrypt_supported_policy(const union llcrypt_policy *policy_u, + const struct inode *inode); +extern int llcrypt_policy_from_context(union llcrypt_policy *policy_u, + const union llcrypt_context *ctx_u, + int ctx_size); + +#endif /* _LLCRYPT_PRIVATE_H */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/policy.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/policy.c new file mode 100644 index 0000000000000..5d094d53b01a4 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/policy.c @@ -0,0 +1,594 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Encryption policy functions for per-file encryption support. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility. + * + * Originally written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + * Modified by Eric Biggers, 2019 for v2 policy support. 
+ */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#include +#include +#include +#include +#include "llcrypt_private.h" + +/** + * llcrypt_policies_equal - check whether two encryption policies are the same + * + * Return: %true if equal, else %false + */ +bool llcrypt_policies_equal(const union llcrypt_policy *policy1, + const union llcrypt_policy *policy2) +{ + if (policy1->version != policy2->version) + return false; + + return !memcmp(policy1, policy2, llcrypt_policy_size(policy1)); +} + +/** + * llcrypt_supported_policy - check whether an encryption policy is supported + * + * Given an encryption policy, check whether all its encryption modes and other + * settings are supported by this kernel. (But we don't currently don't check + * for crypto API support here, so attempting to use an algorithm not configured + * into the crypto API will still fail later.) + * + * Return: %true if supported, else %false + */ +bool llcrypt_supported_policy(const union llcrypt_policy *policy_u, + const struct inode *inode) +{ + switch (policy_u->version) { + case LLCRYPT_POLICY_V1: { + const struct llcrypt_policy_v1 *policy = &policy_u->v1; + + if (!llcrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) { + llcrypt_warn(inode, + "Unsupported encryption modes (contents %d, filenames %d)", + policy->contents_encryption_mode, + policy->filenames_encryption_mode); + return false; + } + + if (policy->flags & ~LLCRYPT_POLICY_FLAGS_VALID) { + llcrypt_warn(inode, + "Unsupported encryption flags (0x%02x)", + policy->flags); + return false; + } + + return true; + } + case LLCRYPT_POLICY_V2: { + const struct llcrypt_policy_v2 *policy = &policy_u->v2; + + if (!llcrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) { + llcrypt_warn(inode, + "Unsupported encryption modes (contents %d, filenames %d)", + policy->contents_encryption_mode, + policy->filenames_encryption_mode); + return false; + } + + if (policy->flags & ~LLCRYPT_POLICY_FLAGS_VALID) { + llcrypt_warn(inode, + "Unsupported encryption flags (0x%02x)", + policy->flags); + return false; + } + + if (memchr_inv(policy->__reserved, 0, + sizeof(policy->__reserved))) { + llcrypt_warn(inode, + "Reserved bits set in encryption policy"); + return false; + } + + return true; + } + } + return false; +} + +/** + * llcrypt_new_context_from_policy - create a new llcrypt_context from a policy + * + * Create an llcrypt_context for an inode that is being assigned the given + * encryption policy. A new nonce is randomly generated. + * + * Return: the size of the new context in bytes. 
+ */ +static int llcrypt_new_context_from_policy(union llcrypt_context *ctx_u, + const union llcrypt_policy *policy_u) +{ + memset(ctx_u, 0, sizeof(*ctx_u)); + + switch (policy_u->version) { + case LLCRYPT_POLICY_V1: { + const struct llcrypt_policy_v1 *policy = &policy_u->v1; + struct llcrypt_context_v1 *ctx = &ctx_u->v1; + + ctx->version = LLCRYPT_CONTEXT_V1; + ctx->contents_encryption_mode = + policy->contents_encryption_mode; + ctx->filenames_encryption_mode = + policy->filenames_encryption_mode; + ctx->flags = policy->flags; + memcpy(ctx->master_key_descriptor, + policy->master_key_descriptor, + sizeof(ctx->master_key_descriptor)); + get_random_bytes(ctx->nonce, sizeof(ctx->nonce)); + return sizeof(*ctx); + } + case LLCRYPT_POLICY_V2: { + const struct llcrypt_policy_v2 *policy = &policy_u->v2; + struct llcrypt_context_v2 *ctx = &ctx_u->v2; + + ctx->version = LLCRYPT_CONTEXT_V2; + ctx->contents_encryption_mode = + policy->contents_encryption_mode; + ctx->filenames_encryption_mode = + policy->filenames_encryption_mode; + ctx->flags = policy->flags; + memcpy(ctx->master_key_identifier, + policy->master_key_identifier, + sizeof(ctx->master_key_identifier)); + get_random_bytes(ctx->nonce, sizeof(ctx->nonce)); + return sizeof(*ctx); + } + } + BUG(); +} + +/** + * llcrypt_policy_from_context - convert an llcrypt_context to an llcrypt_policy + * + * Given an llcrypt_context, build the corresponding llcrypt_policy. + * + * Return: 0 on success, or -EINVAL if the llcrypt_context has an unrecognized + * version number or size. + * + * This does *not* validate the settings within the policy itself, e.g. the + * modes, flags, and reserved bits. Use llcrypt_supported_policy() for that. + */ +int llcrypt_policy_from_context(union llcrypt_policy *policy_u, + const union llcrypt_context *ctx_u, + int ctx_size) +{ + memset(policy_u, 0, sizeof(*policy_u)); + + if (ctx_size <= 0 || ctx_size != llcrypt_context_size(ctx_u)) + return -EINVAL; + + switch (ctx_u->version) { + case LLCRYPT_CONTEXT_V1: { + const struct llcrypt_context_v1 *ctx = &ctx_u->v1; + struct llcrypt_policy_v1 *policy = &policy_u->v1; + + policy->version = LLCRYPT_POLICY_V1; + policy->contents_encryption_mode = + ctx->contents_encryption_mode; + policy->filenames_encryption_mode = + ctx->filenames_encryption_mode; + policy->flags = ctx->flags; + memcpy(policy->master_key_descriptor, + ctx->master_key_descriptor, + sizeof(policy->master_key_descriptor)); + return 0; + } + case LLCRYPT_CONTEXT_V2: { + const struct llcrypt_context_v2 *ctx = &ctx_u->v2; + struct llcrypt_policy_v2 *policy = &policy_u->v2; + + policy->version = LLCRYPT_POLICY_V2; + policy->contents_encryption_mode = + ctx->contents_encryption_mode; + policy->filenames_encryption_mode = + ctx->filenames_encryption_mode; + policy->flags = ctx->flags; + memcpy(policy->__reserved, ctx->__reserved, + sizeof(policy->__reserved)); + memcpy(policy->master_key_identifier, + ctx->master_key_identifier, + sizeof(policy->master_key_identifier)); + return 0; + } + } + /* unreachable */ + return -EINVAL; +} + +/* Retrieve an inode's encryption policy */ +static int llcrypt_get_policy(struct inode *inode, union llcrypt_policy *policy) +{ + const struct llcrypt_info *ci; + union llcrypt_context ctx; + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + int ret; + + ci = (struct llcrypt_info *)READ_ONCE(llcrypt_info_nocast(inode)); + if (ci) { + /* key available, use the cached policy */ + *policy = ci->ci_policy; + return 0; + } + + if (!IS_ENCRYPTED(inode)) + return -ENODATA; + + if (!lsi) + 
return -ENODATA; + + ret = lsi->lsi_cop->get_context(inode, &ctx, sizeof(ctx)); + if (ret < 0) + return (ret == -ERANGE) ? -EINVAL : ret; + + return llcrypt_policy_from_context(policy, &ctx, ret); +} + +static int set_encryption_policy(struct inode *inode, + const union llcrypt_policy *policy) +{ + union llcrypt_context ctx; + int ctxsize; + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + int err; + + if (!llcrypt_supported_policy(policy, inode)) + return -EINVAL; + + switch (policy->version) { + case LLCRYPT_POLICY_V1: + /* + * The original encryption policy version provided no way of + * verifying that the correct master key was supplied, which was + * insecure in scenarios where multiple users have access to the + * same encrypted files (even just read-only access). The new + * encryption policy version fixes this and also implies use of + * an improved key derivation function and allows non-root users + * to securely remove keys. So as long as compatibility with + * old kernels isn't required, it is recommended to use the new + * policy version for all new encrypted directories. + */ + pr_warn_once("%s (pid %d) is setting deprecated v1 encryption policy; recommend upgrading to v2.\n", + current->comm, current->pid); + break; + case LLCRYPT_POLICY_V2: + err = llcrypt_verify_key_added(inode->i_sb, + policy->v2.master_key_identifier); + if (err) + return err; + break; + default: + WARN_ON(1); + return -EINVAL; + } + + ctxsize = llcrypt_new_context_from_policy(&ctx, policy); + + if (!lsi) + return -EINVAL; + + return lsi->lsi_cop->set_context(inode, &ctx, ctxsize, NULL); +} + +/* Tell if an inode's encryption policy has filename encryption */ +bool llcrypt_policy_has_filename_enc(struct inode *inode) +{ + union llcrypt_policy policy; + int err; + + err = llcrypt_get_policy(inode, &policy); + if (err) + return true; + + if ((policy.version == LLCRYPT_POLICY_V1 && + policy.v1.filenames_encryption_mode == LLCRYPT_MODE_NULL) || + (policy.version == LLCRYPT_POLICY_V2 && + policy.v2.filenames_encryption_mode == LLCRYPT_MODE_NULL)) + return false; + return true; +} +EXPORT_SYMBOL(llcrypt_policy_has_filename_enc); + +int llcrypt_ioctl_set_policy(struct file *filp, const void __user *arg) +{ + union llcrypt_policy policy; + union llcrypt_policy existing_policy; + struct inode *inode = file_inode(filp); + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + u8 version; + int size; + int ret; + + if (get_user(policy.version, (const u8 __user *)arg)) + return -EFAULT; + + size = llcrypt_policy_size(&policy); + if (size <= 0) + return -EINVAL; + + /* + * We should just copy the remaining 'size - 1' bytes here, but a + * bizarre bug in gcc 7 and earlier (fixed by gcc r255731) causes gcc to + * think that size can be 0 here (despite the check above!) *and* that + * it's a compile-time constant. Thus it would think copy_from_user() + * is passed compile-time constant ULONG_MAX, causing the compile-time + * buffer overflow check to fail, breaking the build. This only occurred + * when building an i386 kernel with -Os and branch profiling enabled. + * + * Work around it by just copying the first byte again... + */ + version = policy.version; + if (copy_from_user(&policy, arg, size)) + return -EFAULT; + policy.version = version; + + /* Force file/directory name encryption policy to null if + * LSI_FILENAME_ENC flag is not set on sb. + * This allows enabling filename encryption separately from data + * encryption, and can be useful for interoperability with + * encryption-unaware clients. 
+ */ + if (!(lsi->lsi_flags & LSI_FILENAME_ENC)) { + CWARN("inode %lu: forcing policy filenames_encryption_mode to null\n", + inode->i_ino); + cfs_tty_write_msg("\n\nForcing policy filenames_encryption_mode to null.\n\n"); + switch (policy.version) { + case LLCRYPT_POLICY_V1: + policy.v1.filenames_encryption_mode = LLCRYPT_MODE_NULL; + break; + case LLCRYPT_POLICY_V2: + policy.v2.filenames_encryption_mode = LLCRYPT_MODE_NULL; + break; + } + } + + if (!inode_owner_or_capable(&init_user_ns, inode)) + return -EACCES; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + ret = llcrypt_get_policy(inode, &existing_policy); + if (ret == -ENODATA) { + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + + if (!S_ISDIR(inode->i_mode)) + ret = -ENOTDIR; + else if (IS_DEADDIR(inode)) + ret = -ENOENT; + else if (lsi && !lsi->lsi_cop->empty_dir(inode)) + ret = -ENOTEMPTY; + else + ret = set_encryption_policy(inode, &policy); + } else if (ret == -EINVAL || + (ret == 0 && !llcrypt_policies_equal(&policy, + &existing_policy))) { + /* The file already uses a different encryption policy. */ + ret = -EEXIST; + } + + inode_unlock(inode); + + mnt_drop_write_file(filp); + return ret; +} +EXPORT_SYMBOL(llcrypt_ioctl_set_policy); + +/* Original ioctl version; can only get the original policy version */ +int llcrypt_ioctl_get_policy(struct file *filp, void __user *arg) +{ + union llcrypt_policy policy; + int err; + + err = llcrypt_get_policy(file_inode(filp), &policy); + if (err) + return err; + + if (policy.version != LLCRYPT_POLICY_V1) + return -EINVAL; + + if (copy_to_user(arg, &policy, sizeof(policy.v1))) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL(llcrypt_ioctl_get_policy); + +/* Valid filenames_encryption_mode associated with contents_encryption_mode, + * as imposed by llcrypt_valid_enc_modes() + */ +static inline u8 contents2filenames_encmode(u8 contents_encryption_mode) +{ + if (contents_encryption_mode == LLCRYPT_MODE_AES_128_CBC) + return LLCRYPT_MODE_AES_128_CTS; + if (contents_encryption_mode == LLCRYPT_MODE_AES_256_XTS) + return LLCRYPT_MODE_AES_256_CTS; + if (contents_encryption_mode == LLCRYPT_MODE_ADIANTUM) + return LLCRYPT_MODE_ADIANTUM; + return LLCRYPT_MODE_NULL; +} + +/* Extended ioctl version; can get policies of any version */ +int llcrypt_ioctl_get_policy_ex(struct file *filp, void __user *uarg) +{ + struct llcrypt_get_policy_ex_arg arg; + union llcrypt_policy *policy = (union llcrypt_policy *)&arg.policy; + size_t policy_size; + struct inode *inode = file_inode(filp); + int err; + + /* arg is policy_size, then policy */ + BUILD_BUG_ON(offsetof(typeof(arg), policy_size) != 0); + BUILD_BUG_ON(offsetofend(typeof(arg), policy_size) != + offsetof(typeof(arg), policy)); + BUILD_BUG_ON(sizeof(arg.policy) != sizeof(*policy)); + + err = llcrypt_get_policy(file_inode(filp), policy); + if (err) + return err; + policy_size = llcrypt_policy_size(policy); + + if (copy_from_user(&arg, uarg, sizeof(arg.policy_size))) + return -EFAULT; + + if (policy_size > arg.policy_size) + return -EOVERFLOW; + arg.policy_size = policy_size; + + /* Do not return null filenames_encryption_mode to userspace, as it is + * unknown. Instead, return valid mode associated with + * contents_encryption_mode, as imposed by llcrypt_valid_enc_modes(). 
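The llcrypt_ioctl_get_policy_ex() handler above returns a size-prefixed, variable-length result: the caller states how much room it has, and the kernel either reports -EOVERFLOW or writes back the actual policy size followed by the policy bytes. Below is a small self-contained sketch of that contract; the fixed-capacity struct is a made-up analogue of struct llcrypt_get_policy_ex_arg, not the real layout.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Invented analogue of the ex-arg: a size field followed by the payload. */
struct get_policy_arg {
        uint64_t policy_size;           /* in: capacity, out: actual size */
        uint8_t policy[64];
};

static int fill_result(struct get_policy_arg *arg,
                       const uint8_t *policy, uint64_t actual_size)
{
        if (actual_size > arg->policy_size)
                return -EOVERFLOW;      /* caller's buffer is too small */

        arg->policy_size = actual_size; /* report the real size back */
        memcpy(arg->policy, policy, actual_size);
        return 0;
}

int main(void)
{
        uint8_t policy[24] = { 2 };     /* pretend this is a v2 policy */
        struct get_policy_arg arg = { .policy_size = sizeof(arg.policy) };

        if (fill_result(&arg, policy, sizeof(policy)) == 0)
                printf("returned %llu policy bytes\n",
                       (unsigned long long)arg.policy_size);
        return 0;
}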
+ */ + switch (policy->version) { + case LLCRYPT_POLICY_V1: + if (policy->v1.filenames_encryption_mode == LLCRYPT_MODE_NULL) { + policy->v1.filenames_encryption_mode = + contents2filenames_encmode( + policy->v1.contents_encryption_mode); + CWARN("inode %lu: returning policy filenames_encryption_mode as %d, but is in fact null\n", + inode->i_ino, + policy->v1.filenames_encryption_mode); + } + break; + case LLCRYPT_POLICY_V2: + if (policy->v2.filenames_encryption_mode == LLCRYPT_MODE_NULL) { + policy->v2.filenames_encryption_mode = + contents2filenames_encmode( + policy->v2.contents_encryption_mode); + CWARN("inode %lu: returning policy filenames_encryption_mode as %d, but is in fact null\n", + inode->i_ino, + policy->v2.filenames_encryption_mode); + } + break; + } + + if (copy_to_user(uarg, &arg, sizeof(arg.policy_size) + policy_size)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(llcrypt_ioctl_get_policy_ex); + +/** + * llcrypt_has_permitted_context() - is a file's encryption policy permitted + * within its directory? + * + * @parent: inode for parent directory + * @child: inode for file being looked up, opened, or linked into @parent + * + * Filesystems must call this before permitting access to an inode in a + * situation where the parent directory is encrypted (either before allowing + * ->lookup() to succeed, or for a regular file before allowing it to be opened) + * and before any operation that involves linking an inode into an encrypted + * directory, including link, rename, and cross rename. It enforces the + * constraint that within a given encrypted directory tree, all files use the + * same encryption policy. The pre-access check is needed to detect potentially + * malicious offline violations of this constraint, while the link and rename + * checks are needed to prevent online violations of this constraint. + * + * Return: 1 if permitted, 0 if forbidden. + */ +int llcrypt_has_permitted_context(struct inode *parent, struct inode *child) +{ + union llcrypt_policy parent_policy, child_policy; + int err; + + /* No restrictions on file types which are never encrypted */ + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && + !S_ISLNK(child->i_mode)) + return 1; + + /* No restrictions if the parent directory is unencrypted */ + if (!IS_ENCRYPTED(parent)) + return 1; + + /* Encrypted directories must not contain unencrypted files */ + if (!IS_ENCRYPTED(child)) + return 0; + + /* + * Both parent and child are encrypted, so verify they use the same + * encryption policy. Compare the llcrypt_info structs if the keys are + * available, otherwise retrieve and compare the llcrypt_contexts. + * + * Note that the llcrypt_context retrieval will be required frequently + * when accessing an encrypted directory tree without the key. + * Performance-wise this is not a big deal because we already don't + * really optimize for file access without the key (to the extent that + * such access is even possible), given that any attempted access + * already causes a llcrypt_context retrieval and keyring search. + * + * In any case, if an unexpected error occurs, fall back to "forbidden". 
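llcrypt_policies_equal() itself is defined elsewhere in llcrypt; purely as an assumption about its behaviour, the check amounts to "same version, identical fields", and any failure to fetch either policy is treated as "forbidden". A hypothetical, self-contained sketch of that rule follows, with invented struct sizes and helper names.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Stand-in for union llcrypt_policy: a version byte plus payload bytes. */
union policy {
        uint8_t version;
        uint8_t raw[40];
};

static int policy_size(uint8_t version)
{
        return version == 1 ? 12 : version == 2 ? 24 : -1;  /* invented sizes */
}

/* Assumed semantics: equal only if same version and byte-identical. */
static bool policies_equal(const union policy *a, const union policy *b)
{
        int size = policy_size(a->version);

        if (size < 0 || a->version != b->version)
                return false;
        return memcmp(a->raw, b->raw, size) == 0;
}

/* Mirror of the "fall back to forbidden" rule: any error while fetching
 * either policy counts as a mismatch. Returns 1 if permitted, 0 if not. */
int has_permitted_context(const union policy *parent, int parent_err,
                          const union policy *child, int child_err)
{
        if (parent_err || child_err)
                return 0;
        return policies_equal(parent, child) ? 1 : 0;
}

int main(void)
{
        union policy a = { .raw = { 2 } }, b = { .raw = { 2 } };

        return !has_permitted_context(&a, 0, &b, 0);    /* exit 0 = permitted */
}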
+ */ + + err = llcrypt_get_encryption_info(parent); + if (err) + return 0; + err = llcrypt_get_encryption_info(child); + if (err) + return 0; + + err = llcrypt_get_policy(parent, &parent_policy); + if (err) + return 0; + + err = llcrypt_get_policy(child, &child_policy); + if (err) + return 0; + + return llcrypt_policies_equal(&parent_policy, &child_policy); +} +EXPORT_SYMBOL(llcrypt_has_permitted_context); + +/** + * llcrypt_inherit_context() - Sets a child context from its parent + * @parent: Parent inode from which the context is inherited. + * @child: Child inode that inherits the context from @parent. + * @fs_data: private data given by FS. + * @preload: preload child crypt info if true + * + * Return: 0 on success, -errno on failure + */ +int llcrypt_inherit_context(struct inode *parent, struct inode *child, + void *fs_data, bool preload) +{ + union llcrypt_context ctx; + int ctxsize; + struct llcrypt_info *ci; + struct lustre_sb_info *lsi = s2lsi(parent->i_sb); + int res; + + res = llcrypt_get_encryption_info(parent); + if (res < 0) + return res; + + ci = (struct llcrypt_info *)READ_ONCE(llcrypt_info_nocast(parent)); + if (ci == NULL) + return -ENOKEY; + + if (!lsi) + return -ENOKEY; + + ctxsize = llcrypt_new_context_from_policy(&ctx, &ci->ci_policy); + + BUILD_BUG_ON(sizeof(ctx) != LLCRYPT_SET_CONTEXT_MAX_SIZE); + res = lsi->lsi_cop->set_context(child, &ctx, ctxsize, fs_data); + if (res) + return res; + return preload ? llcrypt_get_encryption_info(child): 0; +} +EXPORT_SYMBOL(llcrypt_inherit_context); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/debug.c b/drivers/staging/lustrefsx/libcfs/libcfs/debug.c new file mode 100644 index 0000000000000..f8ad1461cb6d9 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/debug.c @@ -0,0 +1,736 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/libcfs/debug.c + * + * Author: Phil Schwan + * + */ + +# define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_PANIC_NOTIFIER_H +#include +#endif +#include "tracefile.h" + +static char debug_file_name[1024]; + +unsigned int libcfs_subsystem_debug = LIBCFS_S_DEFAULT; +EXPORT_SYMBOL(libcfs_subsystem_debug); +module_param(libcfs_subsystem_debug, int, 0644); +MODULE_PARM_DESC(libcfs_subsystem_debug, "Lustre kernel debug subsystem mask"); + +unsigned int libcfs_debug = LIBCFS_D_DEFAULT; +EXPORT_SYMBOL(libcfs_debug); +module_param(libcfs_debug, int, 0644); +MODULE_PARM_DESC(libcfs_debug, "Lustre kernel debug mask"); + +static int libcfs_param_debug_mb_set(const char *val, + cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned int num; + + rc = kstrtouint(val, 0, &num); + if (rc < 0) + return rc; + + num = cfs_trace_set_debug_mb(num); + + *((unsigned int *)kp->arg) = num; + num = cfs_trace_get_debug_mb(); + if (num) + /* This value is more precise */ + *((unsigned int *)kp->arg) = num; + + return 0; +} + +/* While debug_mb setting look like unsigned int, in fact + * it needs quite a bunch of extra processing, so we define special + * debug_mb parameter type with corresponding methods to handle this case + */ +static const struct kernel_param_ops param_ops_debug_mb = { + .set = libcfs_param_debug_mb_set, + .get = param_get_uint, +}; + +#define param_check_debug_mb(name, p) \ + __param_check(name, p, unsigned int) + +static unsigned int libcfs_debug_mb; +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_debug_mb, debug_mb, 0644); +#else +module_param_call(libcfs_debug_mb, libcfs_param_debug_mb_set, param_get_uint, + ¶m_ops_debug_mb, 0644); +#endif +MODULE_PARM_DESC(libcfs_debug_mb, "Total debug buffer size."); + +unsigned int libcfs_printk = D_CANTMASK; +module_param(libcfs_printk, uint, 0644); +MODULE_PARM_DESC(libcfs_printk, "Lustre kernel debug console mask"); + +unsigned int libcfs_console_ratelimit = 1; +module_param(libcfs_console_ratelimit, uint, 0644); +MODULE_PARM_DESC(libcfs_console_ratelimit, "Lustre kernel debug console ratelimit (0 to disable)"); + +static int param_set_delay_minmax(const char *val, + cfs_kernel_param_arg_t *kp, + long min, long max) +{ + long d; + int sec; + int rc; + + rc = kstrtoint(val, 0, &sec); + if (rc) + return -EINVAL; + + /* The sysfs setting is in centiseconds */ + d = cfs_time_seconds(sec) / 100; + if (d < min || d > max) + return -EINVAL; + + *((unsigned int *)kp->arg) = d; + + return 0; +} + +static int param_get_delay(char *buffer, cfs_kernel_param_arg_t *kp) +{ + unsigned int d = *(unsigned int *)kp->arg; + + param_get_byte(buffer, kp); + return sprintf(buffer, "%lu%c", jiffies_to_msecs(d * 10) / MSEC_PER_SEC, + strnchr(buffer, PAGE_SIZE, '\n') ? 
'\n' : '\0'); +} + +unsigned int libcfs_console_max_delay; +unsigned int libcfs_console_min_delay; + +static int param_set_console_max_delay(const char *val, + cfs_kernel_param_arg_t *kp) +{ + return param_set_delay_minmax(val, kp, + libcfs_console_min_delay, INT_MAX); +} + +static const struct kernel_param_ops param_ops_console_max_delay = { + .set = param_set_console_max_delay, + .get = param_get_delay, +}; + +#define param_check_console_max_delay(name, p) \ + __param_check(name, p, unsigned int) + +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_console_max_delay, console_max_delay, 0644); +#else +module_param_call(libcfs_console_max_delay, param_set_console_max_delay, + param_get_delay, ¶m_ops_console_max_delay, 0644); +#endif +MODULE_PARM_DESC(libcfs_console_max_delay, "Lustre kernel debug console max delay (jiffies)"); + +static int param_set_console_min_delay(const char *val, + cfs_kernel_param_arg_t *kp) +{ + return param_set_delay_minmax(val, kp, + 1, libcfs_console_max_delay); +} + +static const struct kernel_param_ops param_ops_console_min_delay = { + .set = param_set_console_min_delay, + .get = param_get_delay, +}; + +#define param_check_console_min_delay(name, p) \ + __param_check(name, p, unsigned int) + +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_console_min_delay, console_min_delay, 0644); +#else +module_param_call(libcfs_console_min_delay, param_set_console_min_delay, + param_get_delay, ¶m_ops_console_min_delay, 0644); +#endif +MODULE_PARM_DESC(libcfs_console_min_delay, "Lustre kernel debug console min delay (jiffies)"); + +#ifndef HAVE_PARAM_SET_UINT_MINMAX +static int param_set_uint_minmax(const char *val, + cfs_kernel_param_arg_t *kp, + unsigned int min, unsigned int max) +{ + unsigned int num; + int ret; + + if (!val) + return -EINVAL; + + ret = kstrtouint(val, 0, &num); + if (ret < 0 || num < min || num > max) + return -EINVAL; + + *((unsigned int *)kp->arg) = num; + return 0; +} +#endif + +static int param_set_uintpos(const char *val, + cfs_kernel_param_arg_t *kp) +{ + return param_set_uint_minmax(val, kp, 1, -1); +} + +static const struct kernel_param_ops param_ops_uintpos = { + .set = param_set_uintpos, + .get = param_get_uint, +}; + +#define param_check_uintpos(name, p) \ + __param_check(name, p, unsigned int) + +unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF; +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_console_backoff, uintpos, 0644); +#else +module_param_call(libcfs_console_backoff, param_set_uintpos, param_get_uint, + ¶m_ops_uintpos, 0644); +#endif +MODULE_PARM_DESC(libcfs_console_backoff, "Lustre kernel debug console backoff factor"); + +unsigned int libcfs_debug_binary = 1; + +unsigned int libcfs_stack = 3 * THREAD_SIZE / 4; +EXPORT_SYMBOL(libcfs_stack); + +unsigned int libcfs_catastrophe; +EXPORT_SYMBOL(libcfs_catastrophe); + +unsigned int libcfs_watchdog_ratelimit = 300; +EXPORT_SYMBOL(libcfs_watchdog_ratelimit); + +unsigned int libcfs_panic_on_lbug = 1; +module_param(libcfs_panic_on_lbug, uint, 0644); +MODULE_PARM_DESC(libcfs_panic_on_lbug, "Lustre kernel panic on LBUG"); + +atomic64_t libcfs_kmem = ATOMIC64_INIT(0); +EXPORT_SYMBOL(libcfs_kmem); + +static DECLARE_COMPLETION(debug_complete); + +/* We need to pass a pointer here, but elsewhere this must be a const */ +char *libcfs_debug_file_path = LIBCFS_DEBUG_FILE_PATH_DEFAULT; +EXPORT_SYMBOL(libcfs_debug_file_path); +module_param(libcfs_debug_file_path, charp, 0644); +MODULE_PARM_DESC(libcfs_debug_file_path, + "Path for dumping debug logs, set 'NONE' to prevent log dumping"); 
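The libcfs_debug_mb and console delay parameters above all follow the same stock pattern: a custom kernel_param_ops so that writes are validated and post-processed instead of stored verbatim. Here is a minimal sketch of that pattern with an invented parameter name and arbitrary bounds; it is not part of the patch, just an illustration of the upstream API being used.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

static unsigned int example_mb = 64;

static int example_mb_set(const char *val, const struct kernel_param *kp)
{
        unsigned int num;
        int rc;

        rc = kstrtouint(val, 0, &num);
        if (rc)
                return rc;
        if (num < 1 || num > 4096)              /* arbitrary sanity bounds */
                return -EINVAL;

        *(unsigned int *)kp->arg = num;         /* store the validated value */
        return 0;
}

static const struct kernel_param_ops example_mb_ops = {
        .set = example_mb_set,
        .get = param_get_uint,
};

module_param_cb(example_mb, &example_mb_ops, &example_mb, 0644);
MODULE_PARM_DESC(example_mb, "Example buffer size in MiB (illustrative)");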
+ +int libcfs_panic_in_progress; + +/* libcfs_debug_token2mask() expects the returned string in lower-case */ +static const char *libcfs_debug_subsys2str(int subsys) +{ + static const char *const libcfs_debug_subsystems[] = + LIBCFS_DEBUG_SUBSYS_NAMES; + + if (subsys >= ARRAY_SIZE(libcfs_debug_subsystems)) + return NULL; + + return libcfs_debug_subsystems[subsys]; +} + +/* libcfs_debug_token2mask() expects the returned string in lower-case */ +static const char *libcfs_debug_dbg2str(int debug) +{ + static const char * const libcfs_debug_masks[] = + LIBCFS_DEBUG_MASKS_NAMES; + + if (debug >= ARRAY_SIZE(libcfs_debug_masks)) + return NULL; + + return libcfs_debug_masks[debug]; +} + +int +libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int len = 0; + const char *token; + int i; + + if (mask == 0) { /* "0" */ + if (size > 0) + str[0] = '0'; + len = 1; + } else { /* space-separated tokens */ + for (i = 0; i < 32; i++) { + if ((mask & BIT(i)) == 0) + continue; + + token = fn(i); + if (!token) /* unused bit */ + continue; + + if (len > 0) { /* separator? */ + if (len < size) + str[len] = ' '; + len++; + } + + while (*token != 0) { + if (len < size) + str[len] = *token; + token++; + len++; + } + } + } + + /* terminate 'str' */ + if (len < size) + str[len] = 0; + else + str[size - 1] = 0; + + return len; +} + +int +libcfs_debug_str2mask(int *mask, const char *str, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int m = 0; + int matched; + int n; + int t; + + /* Allow a number for backwards compatibility */ + for (n = strlen(str); n > 0; n--) + if (!isspace(str[n-1])) + break; + matched = n; + t = sscanf(str, "%i%n", &m, &matched); + if (t >= 1 && matched == n) { + /* don't print warning for lctl set_param debug=0 or -1 */ + if (m != 0 && m != -1) + CWARN("You are trying to use a numerical value for the mask - this will be deprecated in a future release.\n"); + *mask = m; + return 0; + } + + return cfs_str2mask(str, fn, mask, is_subsys ? 0 : D_CANTMASK, ~0, + is_subsys ? LIBCFS_S_DEFAULT : LIBCFS_D_DEFAULT); +} + +char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall"; + +/* Upcall function once a Lustre log has been dumped. 
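The libcfs_debug_mask2str()/libcfs_debug_str2mask() pair above convert the debug bitmask to space-separated token names and back. The following is a compact userspace rendition of the mask-to-string direction only; the token table is invented and stands in for LIBCFS_DEBUG_MASKS_NAMES.

#include <stdio.h>
#include <string.h>

static const char *const mask_names[] = { "trace", "inode", "super", "tty" };

/* Render set bits as space-separated tokens, or "0" when the mask is empty. */
static void mask2str(char *str, size_t size, unsigned int mask)
{
        size_t len = 0;
        unsigned int i;

        if (mask == 0) {
                snprintf(str, size, "0");
                return;
        }

        str[0] = '\0';
        for (i = 0; i < sizeof(mask_names) / sizeof(mask_names[0]); i++) {
                if (!(mask & (1U << i)))
                        continue;
                if (len >= size)
                        break;          /* out of room */
                len += snprintf(str + len, size - len, "%s%s",
                                len ? " " : "", mask_names[i]);
        }
}

int main(void)
{
        char buf[64];

        mask2str(buf, sizeof(buf), 0x9);        /* bits 0 and 3 set */
        printf("%s\n", buf);                    /* prints "trace tty" */
        return 0;
}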
+ * + * @file path of the dumped log + */ +static void libcfs_run_debug_log_upcall(char *file) +{ + char *argv[3]; + int rc; + static const char * const envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL + }; + + ENTRY; + argv[0] = lnet_debug_log_upcall; + + LASSERTF(file, "called on a null filename\n"); + argv[1] = file; /* only need to pass the path of the file */ + + argv[2] = NULL; + + rc = call_usermodehelper(argv[0], argv, (char **)envp, 1); + if (rc < 0 && rc != -ENOENT) { + CERROR("Error %d invoking LNET debug log upcall %s %s; check /sys/kernel/debug/lnet/debug_log_upcall\n", + rc, argv[0], argv[1]); + } else { + CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n", + argv[0], argv[1]); + } +} + +/** + * Dump Lustre log to ::debug_file_path by calling tracefile_dump_all_pages() + */ +static void libcfs_debug_dumplog_internal(void *arg) +{ + static time64_t last_dump_time; + time64_t current_time; + + current_time = ktime_get_real_seconds(); + + if (strncmp(libcfs_debug_file_path, "NONE", 4) != 0 && + current_time > last_dump_time) { + last_dump_time = current_time; + snprintf(debug_file_name, sizeof(debug_file_name) - 1, + "%s.%lld.%ld", libcfs_debug_file_path, + (s64)current_time, (uintptr_t)arg); + pr_alert("LustreError: dumping log to %s\n", debug_file_name); + cfs_tracefile_dump_all_pages(debug_file_name); + libcfs_run_debug_log_upcall(debug_file_name); + } +} + +static int libcfs_debug_dumplog_thread(void *arg) +{ + libcfs_debug_dumplog_internal(arg); + complete(&debug_complete); + return 0; +} + +static DEFINE_MUTEX(libcfs_debug_dumplog_lock); + +void libcfs_debug_dumplog(void) +{ + struct task_struct *dumper; + + ENTRY; + + if (mutex_trylock(&libcfs_debug_dumplog_lock) == 0) + return; + + /* If a previous call was interrupted, debug_complete->done + * might be elevated, and so we won't actually wait here. + * So we reinit the completion to ensure we wait for + * one thread to complete, though it might not be the one + * we start if there are overlaping thread. 
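The comment above explains why the completion is re-armed before each dump. Stripped of the Lustre specifics, the pattern is: take a trylock so only one dump runs at a time, reinit the completion, hand the work to a kthread, and wait for it. A minimal kernel-style sketch of that pattern follows; function and thread names are invented.

#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/mutex.h>

static DECLARE_COMPLETION(dump_done);
static DEFINE_MUTEX(dump_lock);

static int dump_worker(void *arg)
{
        /* ... write the log out ... */
        complete(&dump_done);
        return 0;
}

static void trigger_dump(void)
{
        struct task_struct *task;

        if (!mutex_trylock(&dump_lock))
                return;                 /* another dump is already running */

        reinit_completion(&dump_done);  /* drop any stale completions */
        task = kthread_run(dump_worker, NULL, "example_dumper");
        if (!IS_ERR(task))
                wait_for_completion_interruptible(&dump_done);

        mutex_unlock(&dump_lock);
}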
+ */ + reinit_completion(&debug_complete); + dumper = kthread_run(libcfs_debug_dumplog_thread, + (void *)(long)current->pid, + "libcfs_debug_dumper"); + if (IS_ERR(dumper)) + pr_err("LustreError: cannot start log dump thread: rc = %ld\n", + PTR_ERR(dumper)); + else + wait_for_completion_interruptible(&debug_complete); + + mutex_unlock(&libcfs_debug_dumplog_lock); +} +EXPORT_SYMBOL(libcfs_debug_dumplog); + +/* coverity[+kill] */ +void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msgdata) +{ + libcfs_catastrophe = 1; + libcfs_debug_msg(msgdata, "LBUG\n"); + + if (in_interrupt()) { + panic("LBUG in interrupt.\n"); + /* not reached */ + } + + libcfs_debug_dumpstack(NULL); + if (libcfs_panic_on_lbug) + panic("LBUG"); + else + libcfs_debug_dumplog(); + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) + schedule(); +} +EXPORT_SYMBOL(lbug_with_loc); + +#ifdef CONFIG_STACKTRACE + +#ifndef HAVE_SAVE_STACK_TRACE_TSK +#define save_stack_trace_tsk(tsk, trace) \ +do { \ + if (tsk == current) \ + save_stack_trace(trace); \ + else \ + pr_info("No stack, save_stack_trace_tsk() not exported\n"); \ +} while (0) +#endif + +static void cfs_print_stack_trace(unsigned long *entries, unsigned int nr) +{ + unsigned int i; + + /* Prefer %pB for backtraced symbolic names since it was added in: + * Linux v2.6.38-6557-g0f77a8d37825 + * vsprintf: Introduce %pB format specifier + */ + for (i = 0; i < nr; i++) + pr_info("[<0>] %pB\n", (void *)entries[i]); +} + +#define MAX_ST_ENTRIES 100 +static DEFINE_SPINLOCK(st_lock); + +/* Linux v5.1-rc5 214d8ca6ee ("stacktrace: Provide common infrastructure") + * CONFIG_ARCH_STACKWALK indicates that save_stack_trace_tsk symbol is not + * exported. Use symbol_get() to find if save_stack_trace_tsk is available. + */ +#ifdef CONFIG_ARCH_STACKWALK +typedef unsigned int (stack_trace_save_tsk_t)(struct task_struct *task, + unsigned long *store, + unsigned int size, + unsigned int skipnr); +static stack_trace_save_tsk_t *task_dump_stack; +#endif + +void __init cfs_debug_init(void) +{ +#ifdef CONFIG_ARCH_STACKWALK + task_dump_stack = (void *) + cfs_kallsyms_lookup_name("stack_trace_save_tsk"); + +#endif +} + +static void libcfs_call_trace(struct task_struct *tsk) +{ + static unsigned long entries[MAX_ST_ENTRIES]; +#ifdef CONFIG_ARCH_STACKWALK + unsigned int nr_entries; + + spin_lock(&st_lock); + pr_info("Pid: %d, comm: %.20s %s %s\n", tsk->pid, tsk->comm, + init_utsname()->release, init_utsname()->version); + pr_info("Call Trace TBD:\n"); + if (task_dump_stack) { + nr_entries = task_dump_stack(tsk, entries, MAX_ST_ENTRIES, 0); + cfs_print_stack_trace(entries, nr_entries); + } + spin_unlock(&st_lock); +#else + struct stack_trace trace; + + trace.nr_entries = 0; + trace.max_entries = MAX_ST_ENTRIES; + trace.entries = entries; + trace.skip = 0; + + spin_lock(&st_lock); + pr_info("Pid: %d, comm: %.20s %s %s\n", tsk->pid, tsk->comm, + init_utsname()->release, init_utsname()->version); + pr_info("Call Trace:\n"); + save_stack_trace_tsk(tsk, &trace); + cfs_print_stack_trace(trace.entries, trace.nr_entries); + spin_unlock(&st_lock); +#endif +} + +#else /* !CONFIG_STACKTRACE */ + +#ifdef CONFIG_X86 +#include +#include + +#ifdef HAVE_STACKTRACE_OPS +static int print_trace_stack(void *data, char *name) +{ + printk(" <%s> ", name); + return 0; +} + +#ifdef STACKTRACE_OPS_ADDRESS_RETURN_INT +static int +#else +static void +#endif +print_trace_address(void *data, unsigned long addr, int reliable) +{ + char fmt[32]; + + touch_nmi_watchdog(); + sprintf(fmt, " [<%016lx>] %s%%s\n", addr, 
reliable ? "" : "? "); + __print_symbol(fmt, addr); +#ifdef STACKTRACE_OPS_ADDRESS_RETURN_INT + return 0; +#endif +} + +static const struct stacktrace_ops print_trace_ops = { + .stack = print_trace_stack, + .address = print_trace_address, + .walk_stack = print_context_stack, +}; +#endif /* HAVE_STACKTRACE_OPS */ + +static void libcfs_call_trace(struct task_struct *tsk) +{ +#ifdef HAVE_STACKTRACE_OPS + printk("Pid: %d, comm: %.20s\n", tsk->pid, tsk->comm); + printk("\nCall Trace:\n"); + dump_trace(tsk, NULL, NULL, 0, &print_trace_ops, NULL); + printk("\n"); +#else /* !HAVE_STACKTRACE_OPS */ + if (tsk == current) + dump_stack(); + else + CWARN("can't show stack: kernel doesn't export show_task\n"); +#endif /* HAVE_STACKTRACE_OPS */ +} + +#else /* !CONFIG_X86 */ + +static void libcfs_call_trace(struct task_struct *tsk) +{ + if (tsk == current) + dump_stack(); + else + CWARN("can't show stack: kernel doesn't export show_task\n"); +} + +#endif /* CONFIG_X86 */ + +#endif /* CONFIG_STACKTRACE */ + +void libcfs_debug_dumpstack(struct task_struct *tsk) +{ + libcfs_call_trace(tsk ?: current); +} +EXPORT_SYMBOL(libcfs_debug_dumpstack); + +static int panic_notifier(struct notifier_block *self, unsigned long unused1, + void *unused2) +{ + if (libcfs_panic_in_progress) + return 0; + + libcfs_panic_in_progress = 1; + mb(); + +#ifdef LNET_DUMP_ON_PANIC + /* This is currently disabled because it spews far too much to the + * console on the rare cases it is ever triggered. */ + + if (in_interrupt()) { + cfs_trace_debug_print(); + } else { + libcfs_debug_dumplog_internal((void *)(long)current->pid); + } +#endif + return 0; +} + +static struct notifier_block libcfs_panic_notifier = { + .notifier_call = panic_notifier, + .next = NULL, + .priority = 10000, +}; + +static void libcfs_register_panic_notifier(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &libcfs_panic_notifier); +} + +static void libcfs_unregister_panic_notifier(void) +{ + atomic_notifier_chain_unregister(&panic_notifier_list, + &libcfs_panic_notifier); +} + +int libcfs_debug_init(unsigned long bufsize) +{ + unsigned int max = libcfs_debug_mb; + int rc = 0; + + if (libcfs_console_max_delay <= 0 || /* not set by user or */ + libcfs_console_min_delay <= 0 || /* set to invalid values */ + libcfs_console_min_delay >= libcfs_console_max_delay) { + libcfs_console_max_delay = CDEBUG_DEFAULT_MAX_DELAY; + libcfs_console_min_delay = CDEBUG_DEFAULT_MIN_DELAY; + } + + /* If libcfs_debug_mb is uninitialized then just make the + * total buffers smp_num_cpus * TCD_MAX_PAGES + */ + if (max < num_possible_cpus()) { + max = TCD_MAX_PAGES; + } else { + max = (max / num_possible_cpus()); + max <<= (20 - PAGE_SHIFT); + } + + rc = cfs_tracefile_init(max); + if (rc) + return rc; + + libcfs_register_panic_notifier(); + kernel_param_lock(THIS_MODULE); + if (libcfs_debug_mb == 0) + libcfs_debug_mb = cfs_trace_get_debug_mb(); + kernel_param_unlock(THIS_MODULE); + return rc; +} + +int libcfs_debug_cleanup(void) +{ + libcfs_unregister_panic_notifier(); + kernel_param_lock(THIS_MODULE); + cfs_tracefile_exit(); + kernel_param_unlock(THIS_MODULE); + return 0; +} + +int libcfs_debug_clear_buffer(void) +{ + cfs_trace_flush_pages(); + return 0; +} + +/* Debug markers, although printed by S_LNET should not be be marked as such. 
*/ +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_UNDEFINED +int libcfs_debug_mark_buffer(const char *text) +{ + CDEBUG(D_TRACE, + "**************************************************\n"); + LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text); + CDEBUG(D_TRACE, + "**************************************************\n"); + + return 0; +} + +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_LNET diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/fail.c b/drivers/staging/lustrefsx/libcfs/libcfs/fail.c new file mode 100644 index 0000000000000..5623e3f226fa6 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/fail.c @@ -0,0 +1,153 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Oracle Corporation, Inc. + */ + +#include +#include +#include +#include +#include + +unsigned long cfs_fail_loc; +EXPORT_SYMBOL(cfs_fail_loc); + +unsigned int cfs_fail_val; +EXPORT_SYMBOL(cfs_fail_val); + +int cfs_fail_err; +EXPORT_SYMBOL(cfs_fail_err); + +DECLARE_WAIT_QUEUE_HEAD(cfs_race_waitq); +EXPORT_SYMBOL(cfs_race_waitq); + +int cfs_race_state; +EXPORT_SYMBOL(cfs_race_state); + +int __cfs_fail_check_set(u32 id, u32 value, int set) +{ + static atomic_t cfs_fail_count = ATOMIC_INIT(0); + + LASSERT(!(id & CFS_FAIL_ONCE)); + + if ((cfs_fail_loc & (CFS_FAILED | CFS_FAIL_ONCE)) == + (CFS_FAILED | CFS_FAIL_ONCE)) { + atomic_set(&cfs_fail_count, 0); /* paranoia */ + return 0; + } + + /* Fail 1/cfs_fail_val times */ + if (cfs_fail_loc & CFS_FAIL_RAND) { + if (cfs_fail_val < 2 || get_random_u32_below(cfs_fail_val) > 0) + return 0; + } + + /* Skip the first cfs_fail_val, then fail */ + if (cfs_fail_loc & CFS_FAIL_SKIP) { + if (atomic_inc_return(&cfs_fail_count) <= cfs_fail_val) + return 0; + } + + /* check cfs_fail_val... 
*/ + if (set == CFS_FAIL_LOC_VALUE) { + if (cfs_fail_val != -1 && cfs_fail_val != value) + return 0; + } + + /* Fail cfs_fail_val times, overridden by FAIL_ONCE */ + if (cfs_fail_loc & CFS_FAIL_SOME && + (!(cfs_fail_loc & CFS_FAIL_ONCE) || cfs_fail_val <= 1)) { + int count = atomic_inc_return(&cfs_fail_count); + + if (count >= cfs_fail_val) { + set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); + atomic_set(&cfs_fail_count, 0); + /* we are lost race to increase */ + if (count > cfs_fail_val) + return 0; + } + } + + /* Take into account the current call for FAIL_ONCE for ORSET only, + * as RESET is a new fail_loc, it does not change the current call + */ + if ((set == CFS_FAIL_LOC_ORSET) && (value & CFS_FAIL_ONCE)) + set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); + /* Lost race to set CFS_FAILED_BIT. */ + if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc)) { + /* If CFS_FAIL_ONCE is valid, only one process can fail, + * otherwise multi-process can fail at the same time. + */ + if (cfs_fail_loc & CFS_FAIL_ONCE) + return 0; + } + + switch (set) { + case CFS_FAIL_LOC_NOSET: + case CFS_FAIL_LOC_VALUE: + break; + case CFS_FAIL_LOC_ORSET: + cfs_fail_loc |= value & ~(CFS_FAILED | CFS_FAIL_ONCE); + break; + case CFS_FAIL_LOC_RESET: + cfs_fail_loc = value; + atomic_set(&cfs_fail_count, 0); + break; + default: + LASSERTF(0, "called with bad set %u\n", set); + break; + } + + return 1; +} +EXPORT_SYMBOL(__cfs_fail_check_set); + +int __cfs_fail_timeout_set(u32 id, u32 value, int ms, int set) +{ + ktime_t till = ktime_add_ms(ktime_get(), ms); + int ret; + + ret = __cfs_fail_check_set(id, value, set); + if (ret && likely(ms > 0)) { + CERROR("cfs_fail_timeout id %x sleeping for %dms\n", id, ms); + while (ktime_before(ktime_get(), till)) { + schedule_timeout_uninterruptible(cfs_time_seconds(1) + / 10); + set_current_state(TASK_RUNNING); + if (!cfs_fail_loc) { + CERROR("cfs_fail_timeout interrupted\n"); + break; + } + } + if (cfs_fail_loc) + CERROR("cfs_fail_timeout id %x awake\n", id); + } + return ret; +} +EXPORT_SYMBOL(__cfs_fail_timeout_set); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/hash.c b/drivers/staging/lustrefsx/libcfs/libcfs/hash.c new file mode 100644 index 0000000000000..ca234ef096229 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/hash.c @@ -0,0 +1,2126 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/libcfs/hash.c + * + * Implement a hash class for hash process in lustre system. 
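The fail-injection helpers in fail.c above gate code paths on cfs_fail_loc flag bits: fail roughly one call in cfs_fail_val, let the first cfs_fail_val calls through and then fail, or trip only once. The toy userspace model below reproduces just those three behaviours; the flag names are invented and the real CFS_FAIL_* semantics (atomic counters, ORSET/RESET handling) are deliberately simplified.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define FAIL_RAND  0x1      /* fail roughly once every 'val' calls */
#define FAIL_SKIP  0x2      /* let the first 'val' calls through, then fail */
#define FAIL_ONCE  0x4      /* trip at most once, then disarm */

static unsigned int hits;
static bool fired;

static bool fail_check(unsigned int flags, unsigned int val)
{
        if (fired && (flags & FAIL_ONCE))
                return false;           /* already tripped once */

        if (flags & FAIL_RAND)
                return val >= 2 && rand() % val == 0;

        if ((flags & FAIL_SKIP) && ++hits <= val)
                return false;           /* still in the skip window */

        fired = true;
        return true;
}

int main(void)
{
        unsigned int i, failures = 0;

        for (i = 0; i < 10; i++)
                failures += fail_check(FAIL_SKIP, 3);   /* skip 3, then fail */
        printf("%u of 10 calls failed\n", failures);    /* -> 7 */
        return 0;
}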
+ * + * Author: YuZhangyong + * + * 2008-08-15: Brian Behlendorf + * - Simplified API and improved documentation + * - Added per-hash feature flags: + * * CFS_HASH_DEBUG additional validation + * * CFS_HASH_REHASH dynamic rehashing + * - Added per-hash statistics + * - General performance enhancements + * + * 2009-07-31: Liang Zhen + * - move all stuff to libcfs + * - don't allow cur_bits != max_bits without setting of CFS_HASH_REHASH + * - ignore hs_rwlock if without CFS_HASH_REHASH setting + * - buckets are allocated one by one(instead of contiguous memory), + * to avoid unnecessary cacheline conflict + * + * 2010-03-01: Liang Zhen + * - "bucket" is a group of hlist_head now, user can specify bucket size + * by bkt_bits of cfs_hash_create(), all hlist_heads in a bucket share + * one lock for reducing memory overhead. + * + * - support lockless hash, caller will take care of locks: + * avoid lock overhead for hash tables that are already protected + * by locking in the caller for another reason + * + * - support both spin_lock/rwlock for bucket: + * overhead of spinlock contention is lower than read/write + * contention of rwlock, so using spinlock to serialize operations on + * bucket is more reasonable for those frequently changed hash tables + * + * - support one-single lock mode: + * one lock to protect all hash operations to avoid overhead of + * multiple locks if hash table is always small + * + * - removed a lot of unnecessary addref & decref on hash element: + * addref & decref are atomic operations in many use-cases which + * are expensive. + * + * - support non-blocking cfs_hash_add() and cfs_hash_findadd(): + * some lustre use-cases require these functions to be strictly + * non-blocking, we need to schedule required rehash on a different + * thread on those cases. + * + * - safer rehash on large hash table + * In old implementation, rehash function will exclusively lock the + * hash table and finish rehash in one batch, it's dangerous on SMP + * system because rehash millions of elements could take long time. + * New implemented rehash can release lock and relax CPU in middle + * of rehash, it's safe for another thread to search/change on the + * hash table even it's in rehasing. + * + * - support two different refcount modes + * . hash table has refcount on element + * . hash table doesn't change refcount on adding/removing element + * + * - support long name hash table (for param-tree) + * + * - fix a bug for cfs_hash_rehash_key: + * in old implementation, cfs_hash_rehash_key could screw up the + * hash-table because @key is overwritten without any protection. + * Now we need user to define hs_keycpy for those rehash enabled + * hash tables, cfs_hash_rehash_key will overwrite hash-key + * inside lock by calling hs_keycpy. + * + * - better hash iteration: + * Now we support both locked iteration & lockless iteration of hash + * table. Also, user can break the iteration by return 1 in callback. 
+ */ +#include +#include + +#include +#include + +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 +static unsigned int warn_on_depth = 8; +module_param(warn_on_depth, uint, 0644); +MODULE_PARM_DESC(warn_on_depth, "warning when hash depth is high."); +#endif + +struct workqueue_struct *cfs_rehash_wq; + +static inline void +cfs_hash_nl_lock(union cfs_hash_lock *lock, int exclusive) {} + +static inline void +cfs_hash_nl_unlock(union cfs_hash_lock *lock, int exclusive) {} + +static inline void +cfs_hash_spin_lock(union cfs_hash_lock *lock, int exclusive) + __acquires(&lock->spin) +{ + spin_lock(&lock->spin); +} + +static inline void +cfs_hash_spin_unlock(union cfs_hash_lock *lock, int exclusive) + __releases(&lock->spin) +{ + spin_unlock(&lock->spin); +} + +static inline void +cfs_hash_rw_lock(union cfs_hash_lock *lock, int exclusive) + __acquires(&lock->rw) +{ + if (!exclusive) + read_lock(&lock->rw); + else + write_lock(&lock->rw); +} + +static inline void +cfs_hash_rw_unlock(union cfs_hash_lock *lock, int exclusive) + __releases(&lock->rw) +{ + if (!exclusive) + read_unlock(&lock->rw); + else + write_unlock(&lock->rw); +} + +static inline void +cfs_hash_rw_sem_lock(union cfs_hash_lock *lock, int exclusive) + __acquires(&lock->rw_sem) +{ + if (!exclusive) + down_read(&lock->rw_sem); + else + down_write(&lock->rw_sem); +} + +static inline void +cfs_hash_rw_sem_unlock(union cfs_hash_lock *lock, int exclusive) + __releases(&lock->rw_sem) +{ + if (!exclusive) + up_read(&lock->rw_sem); + else + up_write(&lock->rw_sem); +} + +/** No lock hash */ +static struct cfs_hash_lock_ops cfs_hash_nl_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_nl_lock, + .hs_bkt_unlock = cfs_hash_nl_unlock, +}; + +/** no bucket lock, one spinlock to protect everything */ +static struct cfs_hash_lock_ops cfs_hash_nbl_lops = { + .hs_lock = cfs_hash_spin_lock, + .hs_unlock = cfs_hash_spin_unlock, + .hs_bkt_lock = cfs_hash_nl_lock, + .hs_bkt_unlock = cfs_hash_nl_unlock, +}; + +/** spin bucket lock, rehash is enabled */ +static struct cfs_hash_lock_ops cfs_hash_bkt_spin_lops = { + .hs_lock = cfs_hash_rw_lock, + .hs_unlock = cfs_hash_rw_unlock, + .hs_bkt_lock = cfs_hash_spin_lock, + .hs_bkt_unlock = cfs_hash_spin_unlock, +}; + +/** rw bucket lock, rehash is enabled */ +static struct cfs_hash_lock_ops cfs_hash_bkt_rw_lops = { + .hs_lock = cfs_hash_rw_lock, + .hs_unlock = cfs_hash_rw_unlock, + .hs_bkt_lock = cfs_hash_rw_lock, + .hs_bkt_unlock = cfs_hash_rw_unlock, +}; + +/** spin bucket lock, rehash is disabled */ +static struct cfs_hash_lock_ops cfs_hash_nr_bkt_spin_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_spin_lock, + .hs_bkt_unlock = cfs_hash_spin_unlock, +}; + +/** rw bucket lock, rehash is disabled */ +static struct cfs_hash_lock_ops cfs_hash_nr_bkt_rw_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_rw_lock, + .hs_bkt_unlock = cfs_hash_rw_unlock, +}; + +/** rw_sem bucket lock, rehash is disabled */ +static struct cfs_hash_lock_ops cfs_hash_nr_bkt_rw_sem_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_rw_sem_lock, + .hs_bkt_unlock = cfs_hash_rw_sem_unlock, +}; + +/** rw_sem bucket lock, rehash is enabled */ +static struct cfs_hash_lock_ops cfs_hash_bkt_rw_sem_lops = { + .hs_lock = cfs_hash_rw_sem_lock, + .hs_unlock = cfs_hash_rw_sem_unlock, + .hs_bkt_lock = cfs_hash_rw_sem_lock, + .hs_bkt_unlock = 
cfs_hash_rw_sem_unlock, +}; + +static void +cfs_hash_lock_setup(struct cfs_hash *hs) +{ + if (cfs_hash_with_no_lock(hs)) { + hs->hs_lops = &cfs_hash_nl_lops; + + } else if (cfs_hash_with_no_bktlock(hs)) { + hs->hs_lops = &cfs_hash_nbl_lops; + spin_lock_init(&hs->hs_lock.spin); + + } else if (cfs_hash_with_rehash(hs)) { + if (cfs_hash_with_rw_sem_bktlock(hs)) { + init_rwsem(&hs->hs_lock.rw_sem); + hs->hs_lops = &cfs_hash_bkt_rw_sem_lops; + } else { + rwlock_init(&hs->hs_lock.rw); + + if (cfs_hash_with_rw_bktlock(hs)) + hs->hs_lops = &cfs_hash_bkt_rw_lops; + else if (cfs_hash_with_spin_bktlock(hs)) + hs->hs_lops = &cfs_hash_bkt_spin_lops; + else + LBUG(); + } + } else { + if (cfs_hash_with_rw_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_rw_lops; + else if (cfs_hash_with_spin_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_spin_lops; + else if (cfs_hash_with_rw_sem_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_rw_sem_lops; + else + LBUG(); + } +} + +/** + * Simple hash head without depth tracking + * new element is always added to head of hlist + */ +struct cfs_hash_head { + struct hlist_head hh_head; /**< entries list */ +}; + +static int +cfs_hash_hh_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_head); +} + +static struct hlist_head * +cfs_hash_hh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_head *head; + + head = (struct cfs_hash_head *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].hh_head; +} + +static int +cfs_hash_hh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hlist_add_head(hnode, cfs_hash_hh_hhead(hs, bd)); + return -1; /* unknown depth */ +} + +static int +cfs_hash_hh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hlist_del_init(hnode); + return -1; /* unknown depth */ +} + +/** + * Simple hash head with depth tracking + * new element is always added to head of hlist + */ +struct cfs_hash_head_dep { + struct hlist_head hd_head; /**< entries list */ + unsigned int hd_depth; /**< list length */ +}; + +static int +cfs_hash_hd_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_head_dep); +} + +static struct hlist_head * +cfs_hash_hd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_head_dep *head; + + head = (struct cfs_hash_head_dep *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].hd_head; +} + +static int +cfs_hash_hd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_head_dep *hh; + + hh = container_of(cfs_hash_hd_hhead(hs, bd), + struct cfs_hash_head_dep, hd_head); + hlist_add_head(hnode, &hh->hd_head); + return ++hh->hd_depth; +} + +static int +cfs_hash_hd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_head_dep *hh; + + hh = container_of(cfs_hash_hd_hhead(hs, bd), + struct cfs_hash_head_dep, hd_head); + hlist_del_init(hnode); + return --hh->hd_depth; +} + +/** + * double links hash head without depth tracking + * new element is always added to tail of hlist + */ +struct cfs_hash_dhead { + struct hlist_head dh_head; /**< entries list */ + struct hlist_node *dh_tail; /**< the last entry */ +}; + +static int +cfs_hash_dh_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_dhead); +} + +static struct hlist_head * +cfs_hash_dh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_dhead *head; + + head = (struct cfs_hash_dhead *)&bd->bd_bucket->hsb_head[0]; 
+ return &head[bd->bd_offset].dh_head; +} + +static int +cfs_hash_dh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_dhead *dh; + + dh = container_of(cfs_hash_dh_hhead(hs, bd), + struct cfs_hash_dhead, dh_head); + if (dh->dh_tail != NULL) /* not empty */ + hlist_add_behind(hnode, dh->dh_tail); + else /* empty list */ + hlist_add_head(hnode, &dh->dh_head); + dh->dh_tail = hnode; + return -1; /* unknown depth */ +} + +static int +cfs_hash_dh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnd) +{ + struct cfs_hash_dhead *dh; + + dh = container_of(cfs_hash_dh_hhead(hs, bd), + struct cfs_hash_dhead, dh_head); + if (hnd->next == NULL) { /* it's the tail */ + dh->dh_tail = (hnd->pprev == &dh->dh_head.first) ? NULL : + container_of(hnd->pprev, struct hlist_node, next); + } + hlist_del_init(hnd); + return -1; /* unknown depth */ +} + +/** + * double links hash head with depth tracking + * new element is always added to tail of hlist + */ +struct cfs_hash_dhead_dep { + struct hlist_head dd_head; /**< entries list */ + struct hlist_node *dd_tail; /**< the last entry */ + unsigned int dd_depth; /**< list length */ +}; + +static int +cfs_hash_dd_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_dhead_dep); +} + +static struct hlist_head * +cfs_hash_dd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_dhead_dep *head; + + head = (struct cfs_hash_dhead_dep *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].dd_head; +} + +static int +cfs_hash_dd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_dhead_dep *dh; + + dh = container_of(cfs_hash_dd_hhead(hs, bd), + struct cfs_hash_dhead_dep, dd_head); + if (dh->dd_tail != NULL) /* not empty */ + hlist_add_behind(hnode, dh->dd_tail); + else /* empty list */ + hlist_add_head(hnode, &dh->dd_head); + dh->dd_tail = hnode; + return ++dh->dd_depth; +} + +static int +cfs_hash_dd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnd) +{ + struct cfs_hash_dhead_dep *dh; + + dh = container_of(cfs_hash_dd_hhead(hs, bd), + struct cfs_hash_dhead_dep, dd_head); + if (hnd->next == NULL) { /* it's the tail */ + dh->dd_tail = (hnd->pprev == &dh->dd_head.first) ? NULL : + container_of(hnd->pprev, struct hlist_node, next); + } + hlist_del_init(hnd); + return --dh->dd_depth; +} + +static struct cfs_hash_hlist_ops cfs_hash_hh_hops = { + .hop_hhead = cfs_hash_hh_hhead, + .hop_hhead_size = cfs_hash_hh_hhead_size, + .hop_hnode_add = cfs_hash_hh_hnode_add, + .hop_hnode_del = cfs_hash_hh_hnode_del, +}; + +static struct cfs_hash_hlist_ops cfs_hash_hd_hops = { + .hop_hhead = cfs_hash_hd_hhead, + .hop_hhead_size = cfs_hash_hd_hhead_size, + .hop_hnode_add = cfs_hash_hd_hnode_add, + .hop_hnode_del = cfs_hash_hd_hnode_del, +}; + +static struct cfs_hash_hlist_ops cfs_hash_dh_hops = { + .hop_hhead = cfs_hash_dh_hhead, + .hop_hhead_size = cfs_hash_dh_hhead_size, + .hop_hnode_add = cfs_hash_dh_hnode_add, + .hop_hnode_del = cfs_hash_dh_hnode_del, +}; + +static struct cfs_hash_hlist_ops cfs_hash_dd_hops = { + .hop_hhead = cfs_hash_dd_hhead, + .hop_hhead_size = cfs_hash_dd_hhead_size, + .hop_hnode_add = cfs_hash_dd_hnode_add, + .hop_hnode_del = cfs_hash_dd_hnode_del, +}; + +static void +cfs_hash_hlist_setup(struct cfs_hash *hs) +{ + if (cfs_hash_with_add_tail(hs)) { + hs->hs_hops = cfs_hash_with_depth(hs) ? 
+ &cfs_hash_dd_hops : &cfs_hash_dh_hops; + } else { + hs->hs_hops = cfs_hash_with_depth(hs) ? + &cfs_hash_hd_hops : &cfs_hash_hh_hops; + } +} + +static void +cfs_hash_bd_from_key(struct cfs_hash *hs, struct cfs_hash_bucket **bkts, + unsigned int bits, const void *key, struct cfs_hash_bd *bd) +{ + unsigned int index = cfs_hash_id(hs, key, (1U << bits) - 1); + + LASSERT(bits == hs->hs_cur_bits || bits == hs->hs_rehash_bits); + + bd->bd_bucket = bkts[index & ((1U << (bits - hs->hs_bkt_bits)) - 1)]; + bd->bd_offset = index >> (bits - hs->hs_bkt_bits); +} + +void +cfs_hash_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bd) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (likely(hs->hs_rehash_buckets == NULL)) { + cfs_hash_bd_from_key(hs, hs->hs_buckets, + hs->hs_cur_bits, key, bd); + } else { + LASSERT(hs->hs_rehash_bits != 0); + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, bd); + } +} +EXPORT_SYMBOL(cfs_hash_bd_get); + +static inline void +cfs_hash_bd_dep_record(struct cfs_hash *hs, struct cfs_hash_bd *bd, int dep_cur) +{ + if (likely(dep_cur <= bd->bd_bucket->hsb_depmax)) + return; + + bd->bd_bucket->hsb_depmax = dep_cur; +# if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 + if (likely(warn_on_depth == 0 || + max(warn_on_depth, hs->hs_dep_max) >= dep_cur)) + return; + + spin_lock(&hs->hs_dep_lock); + hs->hs_dep_max = dep_cur; + hs->hs_dep_bkt = bd->bd_bucket->hsb_index; + hs->hs_dep_off = bd->bd_offset; + hs->hs_dep_bits = hs->hs_cur_bits; + spin_unlock(&hs->hs_dep_lock); + + queue_work(cfs_rehash_wq, &hs->hs_dep_work); +# endif +} + +void +cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + int rc; + + rc = hs->hs_hops->hop_hnode_add(hs, bd, hnode); + cfs_hash_bd_dep_record(hs, bd, rc); + bd->bd_bucket->hsb_version++; + if (unlikely(bd->bd_bucket->hsb_version == 0)) + bd->bd_bucket->hsb_version++; + bd->bd_bucket->hsb_count++; + + if (cfs_hash_with_counter(hs)) + atomic_inc(&hs->hs_count); + if (!cfs_hash_with_no_itemref(hs)) + cfs_hash_get(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_bd_add_locked); + +void +cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hs->hs_hops->hop_hnode_del(hs, bd, hnode); + + LASSERT(bd->bd_bucket->hsb_count > 0); + bd->bd_bucket->hsb_count--; + bd->bd_bucket->hsb_version++; + if (unlikely(bd->bd_bucket->hsb_version == 0)) + bd->bd_bucket->hsb_version++; + + if (cfs_hash_with_counter(hs)) { + LASSERT(atomic_read(&hs->hs_count) > 0); + atomic_dec(&hs->hs_count); + } + if (!cfs_hash_with_no_itemref(hs)) + cfs_hash_put_locked(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_bd_del_locked); + +void +cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old, + struct cfs_hash_bd *bd_new, struct hlist_node *hnode) +{ + struct cfs_hash_bucket *obkt = bd_old->bd_bucket; + struct cfs_hash_bucket *nbkt = bd_new->bd_bucket; + int rc; + + if (cfs_hash_bd_compare(bd_old, bd_new) == 0) + return; + + /* use cfs_hash_bd_hnode_add/del, to avoid atomic & refcount ops + * in cfs_hash_bd_del/add_locked */ + hs->hs_hops->hop_hnode_del(hs, bd_old, hnode); + rc = hs->hs_hops->hop_hnode_add(hs, bd_new, hnode); + cfs_hash_bd_dep_record(hs, bd_new, rc); + + LASSERT(obkt->hsb_count > 0); + obkt->hsb_count--; + obkt->hsb_version++; + if (unlikely(obkt->hsb_version == 0)) + obkt->hsb_version++; + nbkt->hsb_count++; + nbkt->hsb_version++; + if (unlikely(nbkt->hsb_version == 0)) + nbkt->hsb_version++; +} + +enum { + /** always set, 
for sanity (avoid ZERO intent) */ + CFS_HS_LOOKUP_MASK_FIND = BIT(0), + /** return entry with a ref */ + CFS_HS_LOOKUP_MASK_REF = BIT(1), + /** add entry if not existing */ + CFS_HS_LOOKUP_MASK_ADD = BIT(2), + /** delete entry, ignore other masks */ + CFS_HS_LOOKUP_MASK_DEL = BIT(3), +}; + +enum cfs_hash_lookup_intent { + /** return item w/o refcount */ + CFS_HS_LOOKUP_IT_PEEK = CFS_HS_LOOKUP_MASK_FIND, + /** return item with refcount */ + CFS_HS_LOOKUP_IT_FIND = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_REF), + /** return item w/o refcount if existed, otherwise add */ + CFS_HS_LOOKUP_IT_ADD = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_ADD), + /** return item with refcount if existed, otherwise add */ + CFS_HS_LOOKUP_IT_FINDADD = (CFS_HS_LOOKUP_IT_FIND | + CFS_HS_LOOKUP_MASK_ADD), + /** delete if existed */ + CFS_HS_LOOKUP_IT_FINDDEL = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_DEL) +}; + +static struct hlist_node * +cfs_hash_bd_lookup_intent(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode, + enum cfs_hash_lookup_intent intent) + +{ + struct hlist_head *hhead = cfs_hash_bd_hhead(hs, bd); + struct hlist_node *ehnode; + struct hlist_node *match; + int intent_add = (intent & CFS_HS_LOOKUP_MASK_ADD) != 0; + + /* with this function, we can avoid a lot of useless refcount ops, + * which are expensive atomic operations most time. */ + match = intent_add ? NULL : hnode; + hlist_for_each(ehnode, hhead) { + if (!cfs_hash_keycmp(hs, key, ehnode)) + continue; + + if (match != NULL && match != ehnode) /* can't match */ + continue; + + /* match and ... */ + if ((intent & CFS_HS_LOOKUP_MASK_DEL) != 0) { + cfs_hash_bd_del_locked(hs, bd, ehnode); + return ehnode; + } + + /* caller wants refcount? */ + if ((intent & CFS_HS_LOOKUP_MASK_REF) != 0) + cfs_hash_get(hs, ehnode); + return ehnode; + } + /* no match item */ + if (!intent_add) + return NULL; + + LASSERT(hnode != NULL); + cfs_hash_bd_add_locked(hs, bd, hnode); + return hnode; +} + +struct hlist_node * +cfs_hash_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, NULL, + CFS_HS_LOOKUP_IT_FIND); +} +EXPORT_SYMBOL(cfs_hash_bd_lookup_locked); + +struct hlist_node * +cfs_hash_bd_peek_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, NULL, + CFS_HS_LOOKUP_IT_PEEK); +} +EXPORT_SYMBOL(cfs_hash_bd_peek_locked); + +static void +cfs_hash_multi_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, int excl) +{ + struct cfs_hash_bucket *prev = NULL; + int i; + + /** + * bds must be ascendantly ordered by bd->bd_bucket->hsb_index. + * NB: it's possible that several bds point to the same bucket but + * have different bd::bd_offset, so need take care of deadlock. 
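The ordering rule spelled out in the comment above is the classic deadlock-avoidance discipline: when several bucket locks must be held at once, always acquire them in ascending index order and skip duplicates. A self-contained pthread illustration of the same rule, with an invented bucket count and helper names:

#include <pthread.h>
#include <stdio.h>

#define NBUCKETS 8

static pthread_mutex_t bucket_lock[NBUCKETS];

/* Lock the two buckets a caller needs, lowest index first; if both indices
 * are equal, take the lock only once. */
static void lock_pair(unsigned int i, unsigned int j)
{
        unsigned int lo = i < j ? i : j;
        unsigned int hi = i < j ? j : i;

        pthread_mutex_lock(&bucket_lock[lo]);
        if (hi != lo)
                pthread_mutex_lock(&bucket_lock[hi]);
}

static void unlock_pair(unsigned int i, unsigned int j)
{
        unsigned int lo = i < j ? i : j;
        unsigned int hi = i < j ? j : i;

        if (hi != lo)
                pthread_mutex_unlock(&bucket_lock[hi]);
        pthread_mutex_unlock(&bucket_lock[lo]);
}

int main(void)
{
        unsigned int i;

        for (i = 0; i < NBUCKETS; i++)
                pthread_mutex_init(&bucket_lock[i], NULL);

        lock_pair(5, 2);        /* always acquires bucket 2 before bucket 5 */
        unlock_pair(5, 2);
        printf("locked and unlocked buckets in index order\n");
        return 0;
}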
+ */ + cfs_hash_for_each_bd(bds, n, i) { + if (prev == bds[i].bd_bucket) + continue; + + LASSERT(prev == NULL || + prev->hsb_index < bds[i].bd_bucket->hsb_index); + cfs_hash_bd_lock(hs, &bds[i], excl); + prev = bds[i].bd_bucket; + } +} + +static void +cfs_hash_multi_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, int excl) +{ + struct cfs_hash_bucket *prev = NULL; + int i; + + cfs_hash_for_each_bd(bds, n, i) { + if (prev != bds[i].bd_bucket) { + cfs_hash_bd_unlock(hs, &bds[i], excl); + prev = bds[i].bd_bucket; + } + } +} + +static struct hlist_node * +cfs_hash_multi_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key) +{ + struct hlist_node *ehnode; + unsigned i; + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, NULL, + CFS_HS_LOOKUP_IT_FIND); + if (ehnode != NULL) + return ehnode; + } + return NULL; +} + +static struct hlist_node * +cfs_hash_multi_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key, + struct hlist_node *hnode, int noref) +{ + struct hlist_node *ehnode; + int intent; + unsigned i; + + LASSERT(hnode != NULL); + intent = CFS_HS_LOOKUP_IT_PEEK | (!noref * CFS_HS_LOOKUP_MASK_REF); + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, + NULL, intent); + if (ehnode != NULL) + return ehnode; + } + + if (i == 1) { /* only one bucket */ + cfs_hash_bd_add_locked(hs, &bds[0], hnode); + } else { + struct cfs_hash_bd mybd; + + cfs_hash_bd_get(hs, key, &mybd); + cfs_hash_bd_add_locked(hs, &mybd, hnode); + } + + return hnode; +} + +static struct hlist_node * +cfs_hash_multi_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key, + struct hlist_node *hnode) +{ + struct hlist_node *ehnode; + unsigned i; + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, hnode, + CFS_HS_LOOKUP_IT_FINDDEL); + if (ehnode != NULL) + return ehnode; + } + return NULL; +} + +static void +cfs_hash_bd_order(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2) +{ + int rc; + + if (bd2->bd_bucket == NULL) + return; + + if (bd1->bd_bucket == NULL) { + *bd1 = *bd2; + bd2->bd_bucket = NULL; + return; + } + + rc = cfs_hash_bd_compare(bd1, bd2); + if (rc == 0) { + bd2->bd_bucket = NULL; + + } else if (rc > 0) { + swap(*bd1, *bd2); /* swab bd1 and bd2 */ + } +} + +void +cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bds) +{ + /* NB: caller should hold hs_lock.rw if REHASH is set */ + cfs_hash_bd_from_key(hs, hs->hs_buckets, + hs->hs_cur_bits, key, &bds[0]); + if (likely(hs->hs_rehash_buckets == NULL)) { + /* no rehash or not rehashing */ + bds[1].bd_bucket = NULL; + return; + } + + LASSERT(hs->hs_rehash_bits != 0); + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, &bds[1]); + + cfs_hash_bd_order(&bds[0], &bds[1]); +} + +void +cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_multi_bd_lock(hs, bds, 2, excl); +} + +void +cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_multi_bd_unlock(hs, bds, 2, excl); +} + +struct hlist_node * +cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key) +{ + return cfs_hash_multi_bd_lookup_locked(hs, bds, 2, key); +} + +struct hlist_node * +cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct 
hlist_node *hnode, + int noref) +{ + return cfs_hash_multi_bd_findadd_locked(hs, bds, 2, key, + hnode, noref); +} + +struct hlist_node * +cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode) +{ + return cfs_hash_multi_bd_finddel_locked(hs, bds, 2, key, hnode); +} + +static void +cfs_hash_buckets_free(struct cfs_hash_bucket **buckets, + int bkt_size, int prev_size, int size) +{ + int i; + + for (i = prev_size; i < size; i++) { + if (buckets[i] != NULL) + LIBCFS_FREE(buckets[i], bkt_size); + } + + LIBCFS_FREE(buckets, sizeof(buckets[0]) * size); +} + +/* + * Create or grow bucket memory. Return old_buckets if no allocation was + * needed, the newly allocated buckets if allocation was needed and + * successful, and NULL on error. + */ +static struct cfs_hash_bucket ** +cfs_hash_buckets_realloc(struct cfs_hash *hs, struct cfs_hash_bucket **old_bkts, + unsigned int old_size, unsigned int new_size) +{ + struct cfs_hash_bucket **new_bkts; + int i; + + LASSERT(old_size == 0 || old_bkts != NULL); + + if (old_bkts != NULL && old_size == new_size) + return old_bkts; + + LIBCFS_ALLOC(new_bkts, sizeof(new_bkts[0]) * new_size); + if (new_bkts == NULL) + return NULL; + + if (old_bkts != NULL) { + memcpy(new_bkts, old_bkts, + min(old_size, new_size) * sizeof(*old_bkts)); + } + + for (i = old_size; i < new_size; i++) { + struct hlist_head *hhead; + struct cfs_hash_bd bd; + + LIBCFS_ALLOC(new_bkts[i], cfs_hash_bkt_size(hs)); + if (new_bkts[i] == NULL) { + cfs_hash_buckets_free(new_bkts, cfs_hash_bkt_size(hs), + old_size, new_size); + return NULL; + } + + new_bkts[i]->hsb_index = i; + new_bkts[i]->hsb_version = 1; /* shouldn't be zero */ + new_bkts[i]->hsb_depmax = -1; /* unknown */ + bd.bd_bucket = new_bkts[i]; + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) + INIT_HLIST_HEAD(hhead); + + if (cfs_hash_with_no_lock(hs) || + cfs_hash_with_no_bktlock(hs)) + continue; + + if (cfs_hash_with_rw_bktlock(hs)) + rwlock_init(&new_bkts[i]->hsb_lock.rw); + else if (cfs_hash_with_spin_bktlock(hs)) + spin_lock_init(&new_bkts[i]->hsb_lock.spin); + else if (cfs_hash_with_rw_sem_bktlock(hs)) + init_rwsem(&new_bkts[i]->hsb_lock.rw_sem); + else + LBUG(); /* invalid use-case */ + } + return new_bkts; +} + +/** + * Initialize new libcfs hash, where: + * @name - Descriptive hash name + * @cur_bits - Initial hash table size, in bits + * @max_bits - Maximum allowed hash table resize, in bits + * @ops - Registered hash table operations + * @flags - CFS_HASH_REHASH enable synamic hash resizing + * - CFS_HASH_SORT enable chained hash sort + */ +static void cfs_hash_rehash_worker(struct work_struct *work); + +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 +static void cfs_hash_dep_print(struct work_struct *work) +{ + struct cfs_hash *hs = container_of(work, struct cfs_hash, hs_dep_work); + int dep; + int bkt; + int off; + int bits; + + spin_lock(&hs->hs_dep_lock); + dep = hs->hs_dep_max; + bkt = hs->hs_dep_bkt; + off = hs->hs_dep_off; + bits = hs->hs_dep_bits; + spin_unlock(&hs->hs_dep_lock); + + LCONSOLE_WARN("#### HASH %s (bits: %d): max depth %d at bucket %d/%d\n", + hs->hs_name, bits, dep, bkt, off); + spin_lock(&hs->hs_dep_lock); + hs->hs_dep_bits = 0; /* mark as workitem done */ + spin_unlock(&hs->hs_dep_lock); + return 0; +} + +static void cfs_hash_depth_wi_init(struct cfs_hash *hs) +{ + spin_lock_init(&hs->hs_dep_lock); + INIT_WORK(&hs->hs_dep_work, cfs_hash_dep_print); +} + +static void cfs_hash_depth_wi_cancel(struct cfs_hash *hs) +{ + 
cancel_work_sync(&hs->hs_dep_work); +} + +#else /* CFS_HASH_DEBUG_LEVEL < CFS_HASH_DEBUG_1 */ + +static inline void cfs_hash_depth_wi_init(struct cfs_hash *hs) {} +static inline void cfs_hash_depth_wi_cancel(struct cfs_hash *hs) {} + +#endif /* CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 */ + +struct cfs_hash * +cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits, + unsigned bkt_bits, unsigned extra_bytes, + unsigned min_theta, unsigned max_theta, + struct cfs_hash_ops *ops, unsigned flags) +{ + struct cfs_hash *hs; + int len; + + ENTRY; + + BUILD_BUG_ON(CFS_HASH_THETA_BITS >= 15); + + LASSERT(name != NULL); + LASSERT(ops != NULL); + LASSERT(ops->hs_key); + LASSERT(ops->hs_hash); + LASSERT(ops->hs_object); + LASSERT(ops->hs_keycmp); + LASSERT(ops->hs_get != NULL); + LASSERT(ops->hs_put != NULL || ops->hs_put_locked != NULL); + + if ((flags & CFS_HASH_REHASH) != 0) + flags |= CFS_HASH_COUNTER; /* must have counter */ + + LASSERT(cur_bits > 0); + LASSERT(cur_bits >= bkt_bits); + LASSERT(max_bits >= cur_bits && max_bits < 31); + LASSERT(ergo((flags & CFS_HASH_REHASH) == 0, cur_bits == max_bits)); + LASSERT(ergo((flags & CFS_HASH_REHASH) != 0, + (flags & CFS_HASH_NO_LOCK) == 0)); + LASSERT(ergo((flags & CFS_HASH_REHASH_KEY) != 0, + ops->hs_keycpy != NULL)); + + len = (flags & CFS_HASH_BIGNAME) == 0 ? + CFS_HASH_NAME_LEN : CFS_HASH_BIGNAME_LEN; + LIBCFS_ALLOC(hs, offsetof(struct cfs_hash, hs_name[len])); + if (hs == NULL) + RETURN(NULL); + + strlcpy(hs->hs_name, name, len); + hs->hs_flags = flags; + + atomic_set(&hs->hs_refcount, 1); + atomic_set(&hs->hs_count, 0); + + cfs_hash_lock_setup(hs); + cfs_hash_hlist_setup(hs); + + hs->hs_cur_bits = (__u8)cur_bits; + hs->hs_min_bits = (__u8)cur_bits; + hs->hs_max_bits = (__u8)max_bits; + hs->hs_bkt_bits = (__u8)bkt_bits; + + hs->hs_ops = ops; + hs->hs_extra_bytes = extra_bytes; + hs->hs_rehash_bits = 0; + INIT_WORK(&hs->hs_rehash_work, cfs_hash_rehash_worker); + cfs_hash_depth_wi_init(hs); + + if (cfs_hash_with_rehash(hs)) + __cfs_hash_set_theta(hs, min_theta, max_theta); + + hs->hs_buckets = cfs_hash_buckets_realloc(hs, NULL, 0, + CFS_HASH_NBKT(hs)); + if (hs->hs_buckets != NULL) + return hs; + + LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[len])); + RETURN(NULL); +} +EXPORT_SYMBOL(cfs_hash_create); + +/** + * Cleanup libcfs hash @hs. + */ +static void +cfs_hash_destroy(struct cfs_hash *hs) +{ + struct hlist_node *hnode; + struct hlist_node *pos; + struct cfs_hash_bd bd; + int i; + ENTRY; + + LASSERT(hs != NULL); + LASSERT(!cfs_hash_is_exiting(hs) && + !cfs_hash_is_iterating(hs)); + + /** + * prohibit further rehashes, don't need any lock because + * I'm the only (last) one can change it. 
+ */ + hs->hs_exiting = 1; + if (cfs_hash_with_rehash(hs)) + cfs_hash_rehash_cancel(hs); + + cfs_hash_depth_wi_cancel(hs); + /* rehash should be done/canceled */ + LASSERT(hs->hs_buckets != NULL && + hs->hs_rehash_buckets == NULL); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + LASSERT(bd.bd_bucket != NULL); + /* no need to take this lock, just for consistent code */ + cfs_hash_bd_lock(hs, &bd, 1); + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + LASSERTF(!cfs_hash_with_assert_empty(hs), + "hash %s bucket %u(%u) is not " + " empty: %u items left\n", + hs->hs_name, bd.bd_bucket->hsb_index, + bd.bd_offset, bd.bd_bucket->hsb_count); + /* can't assert key valicate, because we + * can interrupt rehash */ + cfs_hash_bd_del_locked(hs, &bd, hnode); + cfs_hash_exit(hs, hnode); + } + } + LASSERT(bd.bd_bucket->hsb_count == 0); + cfs_hash_bd_unlock(hs, &bd, 1); + cond_resched(); + } + + LASSERT(atomic_read(&hs->hs_count) == 0); + + cfs_hash_buckets_free(hs->hs_buckets, cfs_hash_bkt_size(hs), + 0, CFS_HASH_NBKT(hs)); + i = cfs_hash_with_bigname(hs) ? + CFS_HASH_BIGNAME_LEN : CFS_HASH_NAME_LEN; + LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[i])); + + EXIT; +} + +struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs) +{ + if (atomic_inc_not_zero(&hs->hs_refcount)) + return hs; + return NULL; +} +EXPORT_SYMBOL(cfs_hash_getref); + +void cfs_hash_putref(struct cfs_hash *hs) +{ + if (atomic_dec_and_test(&hs->hs_refcount)) + cfs_hash_destroy(hs); +} +EXPORT_SYMBOL(cfs_hash_putref); + +static inline int +cfs_hash_rehash_bits(struct cfs_hash *hs) +{ + if (cfs_hash_with_no_lock(hs) || + !cfs_hash_with_rehash(hs)) + return -EOPNOTSUPP; + + if (unlikely(cfs_hash_is_exiting(hs))) + return -ESRCH; + + if (unlikely(cfs_hash_is_rehashing(hs))) + return -EALREADY; + + if (unlikely(cfs_hash_is_iterating(hs))) + return -EAGAIN; + + /* XXX: need to handle case with max_theta != 2.0 + * and the case with min_theta != 0.5 */ + if ((hs->hs_cur_bits < hs->hs_max_bits) && + (__cfs_hash_theta(hs) > hs->hs_max_theta)) + return hs->hs_cur_bits + 1; + + if (!cfs_hash_with_shrink(hs)) + return 0; + + if ((hs->hs_cur_bits > hs->hs_min_bits) && + (__cfs_hash_theta(hs) < hs->hs_min_theta)) + return hs->hs_cur_bits - 1; + + return 0; +} + +/** + * don't allow inline rehash if: + * - user wants non-blocking change (add/del) on hash table + * - too many elements + */ +static inline int +cfs_hash_rehash_inline(struct cfs_hash *hs) +{ + return !cfs_hash_with_nblk_change(hs) && + atomic_read(&hs->hs_count) < CFS_HASH_LOOP_HOG; +} + +/** + * Add item @hnode to libcfs hash @hs using @key. The registered + * ops->hs_get function will be called when the item is added. 
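+ *
+ * A minimal usage sketch (illustrative only, not part of the original
+ * interface documentation): "my_ops" must provide at least hs_key, hs_hash,
+ * hs_object, hs_keycmp, hs_get and one of hs_put or hs_put_locked; "obj"
+ * with its embedded mo_key/mo_hnode fields and the min_theta/max_theta
+ * tuning values are assumed placeholders.
+ *
+ *	hs = cfs_hash_create("my_cache", 10, 16, 7, 0,
+ *			     min_theta, max_theta, &my_ops, CFS_HASH_REHASH);
+ *	if (hs != NULL)
+ *		cfs_hash_add(hs, &obj->mo_key, &obj->mo_hnode);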
+ */ +void +cfs_hash_add(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + struct cfs_hash_bd bd; + int bits; + + LASSERT(hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + cfs_hash_bd_get_and_lock(hs, key, &bd, 1); + + cfs_hash_key_validate(hs, key, hnode); + cfs_hash_bd_add_locked(hs, &bd, hnode); + + cfs_hash_bd_unlock(hs, &bd, 1); + + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); +} +EXPORT_SYMBOL(cfs_hash_add); + +static struct hlist_node * +cfs_hash_find_or_add(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode, int noref) +{ + struct hlist_node *ehnode; + struct cfs_hash_bd bds[2]; + int bits = 0; + + LASSERTF(hlist_unhashed(hnode), "hnode = %p\n", hnode); + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); + + cfs_hash_key_validate(hs, key, hnode); + ehnode = cfs_hash_dual_bd_findadd_locked(hs, bds, key, + hnode, noref); + cfs_hash_dual_bd_unlock(hs, bds, 1); + + if (ehnode == hnode) /* new item added */ + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); + + return ehnode; +} + +/** + * Add item @hnode to libcfs hash @hs using @key. The registered + * ops->hs_get function will be called if the item was added. + * Returns 0 on success or -EALREADY on key collisions. + */ +int +cfs_hash_add_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + return cfs_hash_find_or_add(hs, key, hnode, 1) != hnode ? + -EALREADY : 0; +} +EXPORT_SYMBOL(cfs_hash_add_unique); + +/** + * Add item @hnode to libcfs hash @hs using @key. If this @key + * already exists in the hash then ops->hs_get will be called on the + * conflicting entry and that entry will be returned to the caller. + * Otherwise ops->hs_get is called on the item which was added. + */ +void * +cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + hnode = cfs_hash_find_or_add(hs, key, hnode, 0); + + return cfs_hash_object(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_findadd_unique); + +/** + * Delete item @hnode from the libcfs hash @hs using @key. The @key + * is required to ensure the correct hash bucket is locked since there + * is no direct linkage from the item to the bucket. The object + * removed from the hash will be returned and obs->hs_put is called + * on the removed object. + */ +void * +cfs_hash_del(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + void *obj = NULL; + int bits = 0; + struct cfs_hash_bd bds[2]; + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); + + /* NB: do nothing if @hnode is not in hash table */ + if (hnode == NULL || !hlist_unhashed(hnode)) { + if (bds[1].bd_bucket == NULL && hnode != NULL) { + cfs_hash_bd_del_locked(hs, &bds[0], hnode); + } else { + hnode = cfs_hash_dual_bd_finddel_locked(hs, bds, + key, hnode); + } + } + + if (hnode != NULL) { + obj = cfs_hash_object(hs, hnode); + bits = cfs_hash_rehash_bits(hs); + } + + cfs_hash_dual_bd_unlock(hs, bds, 1); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); + + return obj; +} +EXPORT_SYMBOL(cfs_hash_del); + +/** + * Delete item given @key in libcfs hash @hs. The first @key found in + * the hash will be removed, if the key exists multiple times in the hash + * @hs this function must be called once per key. The removed object + * will be returned and ops->hs_put is called on the removed object. 
+ */ +void * +cfs_hash_del_key(struct cfs_hash *hs, const void *key) +{ + return cfs_hash_del(hs, key, NULL); +} +EXPORT_SYMBOL(cfs_hash_del_key); + +/** + * Lookup an item using @key in the libcfs hash @hs and return it. + * If the @key is found in the hash hs->hs_get() is called and the + * matching objects is returned. It is the callers responsibility + * to call the counterpart ops->hs_put using the cfs_hash_put() macro + * when when finished with the object. If the @key was not found + * in the hash @hs NULL is returned. + */ +void * +cfs_hash_lookup(struct cfs_hash *hs, const void *key) +{ + void *obj = NULL; + struct hlist_node *hnode; + struct cfs_hash_bd bds[2]; + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0); + + hnode = cfs_hash_dual_bd_lookup_locked(hs, bds, key); + if (hnode != NULL) + obj = cfs_hash_object(hs, hnode); + + cfs_hash_dual_bd_unlock(hs, bds, 0); + cfs_hash_unlock(hs, 0); + + return obj; +} +EXPORT_SYMBOL(cfs_hash_lookup); + +static void +cfs_hash_for_each_enter(struct cfs_hash *hs) +{ + LASSERT(!cfs_hash_is_exiting(hs)); + + if (!cfs_hash_with_rehash(hs)) + return; + /* + * NB: it's race on cfs_has_t::hs_iterating, but doesn't matter + * because it's just an unreliable signal to rehash-thread, + * rehash-thread will try to finish rehash ASAP when seeing this. + */ + hs->hs_iterating = 1; + + cfs_hash_lock(hs, 1); + hs->hs_iterators++; + cfs_hash_unlock(hs, 1); + + /* NB: iteration is mostly called by service thread, + * we tend to cancel pending rehash-request, instead of + * blocking service thread, we will relaunch rehash request + * after iteration + */ + if (cfs_hash_is_rehashing(hs)) + cfs_hash_rehash_cancel(hs); +} + +static void +cfs_hash_for_each_exit(struct cfs_hash *hs) +{ + int remained; + int bits; + + if (!cfs_hash_with_rehash(hs)) + return; + cfs_hash_lock(hs, 1); + remained = --hs->hs_iterators; + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 1); + /* NB: it's race on cfs_has_t::hs_iterating, see above */ + if (remained == 0) + hs->hs_iterating = 0; + if (bits > 0) { + cfs_hash_rehash(hs, atomic_read(&hs->hs_count) < + CFS_HASH_LOOP_HOG); + } +} + +/** + * For each item in the libcfs hash @hs call the passed callback @func + * and pass to it as an argument each hash item and the private @data. + * + * a) the function may sleep! + * b) during the callback: + * . the bucket lock is held so the callback must never sleep. + * . 
if @removal_safe is true, use can remove current item by + * cfs_hash_bd_del_locked + */ +static __u64 +cfs_hash_for_each_tight(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, + void *data, int remove_safe) +{ + struct hlist_node *hnode; + struct hlist_node *pos; + struct cfs_hash_bd bd; + __u64 count = 0; + int excl = !!remove_safe; + int loop = 0; + int i; + ENTRY; + + cfs_hash_for_each_enter(hs); + + cfs_hash_lock(hs, 0); + LASSERT(!cfs_hash_is_rehashing(hs)); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, excl); + if (func == NULL) { /* only glimpse size */ + count += bd.bd_bucket->hsb_count; + cfs_hash_bd_unlock(hs, &bd, excl); + continue; + } + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + cfs_hash_bucket_validate(hs, &bd, hnode); + count++; + loop++; + if (func(hs, &bd, hnode, data)) { + cfs_hash_bd_unlock(hs, &bd, excl); + goto out; + } + } + } + cfs_hash_bd_unlock(hs, &bd, excl); + if (loop < CFS_HASH_LOOP_HOG) + continue; + loop = 0; + cfs_hash_unlock(hs, 0); + cond_resched(); + cfs_hash_lock(hs, 0); + } + out: + cfs_hash_unlock(hs, 0); + + cfs_hash_for_each_exit(hs); + RETURN(count); +} + +struct cfs_hash_cond_arg { + cfs_hash_cond_opt_cb_t func; + void *arg; +}; + +static int +cfs_hash_cond_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + struct cfs_hash_cond_arg *cond = data; + + if (cond->func(cfs_hash_object(hs, hnode), cond->arg)) + cfs_hash_bd_del_locked(hs, bd, hnode); + return 0; +} + +/** + * Delete item from the libcfs hash @hs when @func return true. + * The write lock being hold during loop for each bucket to avoid + * any object be reference. + */ +void +cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t func, void *data) +{ + struct cfs_hash_cond_arg arg = { + .func = func, + .arg = data, + }; + + cfs_hash_for_each_tight(hs, cfs_hash_cond_del_locked, &arg, 1); +} +EXPORT_SYMBOL(cfs_hash_cond_del); + +void +cfs_hash_for_each(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + cfs_hash_for_each_tight(hs, func, data, 0); +} +EXPORT_SYMBOL(cfs_hash_for_each); + +void +cfs_hash_for_each_safe(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + cfs_hash_for_each_tight(hs, func, data, 1); +} +EXPORT_SYMBOL(cfs_hash_for_each_safe); + +static int +cfs_hash_peek(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + *(int *)data = 0; + return 1; /* return 1 to break the loop */ +} + +int +cfs_hash_is_empty(struct cfs_hash *hs) +{ + int empty = 1; + + cfs_hash_for_each_tight(hs, cfs_hash_peek, &empty, 0); + return empty; +} +EXPORT_SYMBOL(cfs_hash_is_empty); + +__u64 +cfs_hash_size_get(struct cfs_hash *hs) +{ + return cfs_hash_with_counter(hs) ? + atomic_read(&hs->hs_count) : + cfs_hash_for_each_tight(hs, NULL, NULL, 0); +} +EXPORT_SYMBOL(cfs_hash_size_get); + +/* + * cfs_hash_for_each_relax: + * Iterate the hash table and call @func on each item without + * any lock. This function can't guarantee to finish iteration + * if these features are enabled: + * + * a. if rehash_key is enabled, an item can be moved from + * one bucket to another bucket + * b. user can remove non-zero-ref item from hash-table, + * so the item can be removed from hash-table, even worse, + * it's possible that user changed key and insert to another + * hash bucket. 
+ * there's no way for us to finish iteration correctly on previous + * two cases, so iteration has to be stopped on change. + */ +static int +cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, + void *data, int start) +{ + struct hlist_node *hnode; + struct hlist_node *next = NULL; + struct cfs_hash_bd bd; + __u32 version; + int count = 0; + int stop_on_change; + int has_put_locked; + int rc = 0; + int i, end = -1; + ENTRY; + + stop_on_change = cfs_hash_with_rehash_key(hs) || + !cfs_hash_with_no_itemref(hs); + has_put_locked = hs->hs_ops->hs_put_locked != NULL; + cfs_hash_lock(hs, 0); +again: + LASSERT(!cfs_hash_is_rehashing(hs)); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + if (i < start) + continue; + else if (end > 0 && i >= end) + break; + + cfs_hash_bd_lock(hs, &bd, 0); + version = cfs_hash_bd_version_get(&bd); + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hnode = hhead->first; + if (hnode == NULL) + continue; + cfs_hash_get(hs, hnode); + for (; hnode != NULL; hnode = next) { + cfs_hash_bucket_validate(hs, &bd, hnode); + next = hnode->next; + if (next != NULL) + cfs_hash_get(hs, next); + cfs_hash_bd_unlock(hs, &bd, 0); + cfs_hash_unlock(hs, 0); + + rc = func(hs, &bd, hnode, data); + if (stop_on_change || !has_put_locked) + cfs_hash_put(hs, hnode); + + cond_resched(); + count++; + + cfs_hash_lock(hs, 0); + cfs_hash_bd_lock(hs, &bd, 0); + if (stop_on_change) { + if (version != + cfs_hash_bd_version_get(&bd)) + rc = -EINTR; + } else if (has_put_locked) { + cfs_hash_put_locked(hs, hnode); + } + if (rc) /* callback wants to break iteration */ + break; + } + if (next != NULL) { + if (has_put_locked) { + cfs_hash_put_locked(hs, next); + next = NULL; + } + break; + } else if (rc != 0) { + break; + } + } + cfs_hash_bd_unlock(hs, &bd, 0); + if (next != NULL && !has_put_locked) { + cfs_hash_put(hs, next); + next = NULL; + } + if (rc) /* callback wants to break iteration */ + break; + } + + if (start > 0 && rc == 0) { + end = start; + start = 0; + goto again; + } + + cfs_hash_unlock(hs, 0); + return count; +} + +int +cfs_hash_for_each_nolock(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data, int start) +{ + ENTRY; + + if (cfs_hash_with_no_lock(hs) || + cfs_hash_with_rehash_key(hs) || + !cfs_hash_with_no_itemref(hs)) + RETURN(-EOPNOTSUPP); + + if (hs->hs_ops->hs_get == NULL || + (hs->hs_ops->hs_put == NULL && + hs->hs_ops->hs_put_locked == NULL)) + RETURN(-EOPNOTSUPP); + + cfs_hash_for_each_enter(hs); + cfs_hash_for_each_relax(hs, func, data, start); + cfs_hash_for_each_exit(hs); + + RETURN(0); +} +EXPORT_SYMBOL(cfs_hash_for_each_nolock); + +/** + * For each hash bucket in the libcfs hash @hs call the passed callback + * @func until all the hash buckets are empty. The passed callback @func + * or the previously registered callback hs->hs_put must remove the item + * from the hash. You may either use the cfs_hash_del() or hlist_del() + * functions. No rwlocks will be held during the callback @func it is + * safe to sleep if needed. This function will not terminate until the + * hash is empty. Note it is still possible to concurrently add new + * items in to the hash. It is the callers responsibility to ensure + * the required locking is in place to prevent concurrent insertions. 
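+ *
+ * Illustrative callback shape (a sketch only; "my_drop_ref" is an assumed
+ * helper expected to drop the reference pinning the object in the hash).
+ * The callback receives the hash, the bucket descriptor, the item's
+ * hlist_node and the private @data, and returns non-zero to stop the walk:
+ *
+ *	static int my_drop_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ *			      struct hlist_node *hnode, void *data)
+ *	{
+ *		my_drop_ref(cfs_hash_object(hs, hnode));
+ *		return 0;
+ *	}
+ *
+ *	cfs_hash_for_each_empty(hs, my_drop_cb, NULL);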
+ */ +int +cfs_hash_for_each_empty(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + unsigned i = 0; + ENTRY; + + if (cfs_hash_with_no_lock(hs)) + return -EOPNOTSUPP; + + if (hs->hs_ops->hs_get == NULL || + (hs->hs_ops->hs_put == NULL && + hs->hs_ops->hs_put_locked == NULL)) + return -EOPNOTSUPP; + + cfs_hash_for_each_enter(hs); + while (cfs_hash_for_each_relax(hs, func, data, 0)) { + CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n", + hs->hs_name, i++); + } + cfs_hash_for_each_exit(hs); + RETURN(0); +} +EXPORT_SYMBOL(cfs_hash_for_each_empty); + +void +cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned hindex, + cfs_hash_for_each_cb_t func, void *data) +{ + struct hlist_head *hhead; + struct hlist_node *hnode; + struct cfs_hash_bd bd; + + cfs_hash_for_each_enter(hs); + cfs_hash_lock(hs, 0); + if (hindex >= CFS_HASH_NHLIST(hs)) + goto out; + + cfs_hash_bd_index_set(hs, hindex, &bd); + + cfs_hash_bd_lock(hs, &bd, 0); + hhead = cfs_hash_bd_hhead(hs, &bd); + hlist_for_each(hnode, hhead) { + if (func(hs, &bd, hnode, data)) + break; + } + cfs_hash_bd_unlock(hs, &bd, 0); +out: + cfs_hash_unlock(hs, 0); + cfs_hash_for_each_exit(hs); +} + +EXPORT_SYMBOL(cfs_hash_hlist_for_each); + +/* + * For each item in the libcfs hash @hs which matches the @key call + * the passed callback @func and pass to it as an argument each hash + * item and the private @data. During the callback the bucket lock + * is held so the callback must never sleep. + */ +void +cfs_hash_for_each_key(struct cfs_hash *hs, const void *key, + cfs_hash_for_each_cb_t func, void *data) +{ + struct hlist_node *hnode; + struct cfs_hash_bd bds[2]; + unsigned i; + + cfs_hash_lock(hs, 0); + + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0); + + cfs_hash_for_each_bd(bds, 2, i) { + struct hlist_head *hlist = cfs_hash_bd_hhead(hs, &bds[i]); + + hlist_for_each(hnode, hlist) { + cfs_hash_bucket_validate(hs, &bds[i], hnode); + + if (cfs_hash_keycmp(hs, key, hnode)) { + if (func(hs, &bds[i], hnode, data)) + break; + } + } + } + + cfs_hash_dual_bd_unlock(hs, bds, 0); + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_for_each_key); + +/** + * Rehash the libcfs hash @hs to the given @bits. This can be used + * to grow the hash size when excessive chaining is detected, or to + * shrink the hash when it is larger than needed. When the CFS_HASH_REHASH + * flag is set in @hs the libcfs hash may be dynamically rehashed + * during addition or removal if the hash's theta value exceeds + * either the hs->hs_min_theta or hs->max_theta values. By default + * these values are tuned to keep the chained hash depth small, and + * this approach assumes a reasonably uniform hashing function. The + * theta thresholds for @hs are tunable via cfs_hash_set_theta(). 
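+ *
+ * For example (illustrative only), a hash created with CFS_HASH_REHASH in
+ * its flags starts at a table size of 1 << cur_bits and is grown
+ * automatically, up to 1 << max_bits, as additions push theta above
+ * hs_max_theta; no explicit rehash call is needed in that case.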
+ */ +void +cfs_hash_rehash_cancel(struct cfs_hash *hs) +{ + LASSERT(cfs_hash_with_rehash(hs)); + cancel_work_sync(&hs->hs_rehash_work); +} + +void +cfs_hash_rehash(struct cfs_hash *hs, int do_rehash) +{ + int rc; + + LASSERT(cfs_hash_with_rehash(hs) && !cfs_hash_with_no_lock(hs)); + + cfs_hash_lock(hs, 1); + + rc = cfs_hash_rehash_bits(hs); + if (rc <= 0) { + cfs_hash_unlock(hs, 1); + return; + } + + hs->hs_rehash_bits = rc; + if (!do_rehash) { + /* launch and return */ + queue_work(cfs_rehash_wq, &hs->hs_rehash_work); + cfs_hash_unlock(hs, 1); + return; + } + + /* rehash right now */ + cfs_hash_unlock(hs, 1); + + cfs_hash_rehash_worker(&hs->hs_rehash_work); +} + +static int +cfs_hash_rehash_bd(struct cfs_hash *hs, struct cfs_hash_bd *old) +{ + struct cfs_hash_bd new; + struct hlist_head *hhead; + struct hlist_node *hnode; + struct hlist_node *pos; + void *key; + int c = 0; + + /* hold cfs_hash_lock(hs, 1), so don't need any bucket lock */ + cfs_hash_bd_for_each_hlist(hs, old, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + key = cfs_hash_key(hs, hnode); + LASSERT(key != NULL); + /* Validate hnode is in the correct bucket. */ + cfs_hash_bucket_validate(hs, old, hnode); + /* + * Delete from old hash bucket; move to new bucket. + * ops->hs_key must be defined. + */ + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, &new); + cfs_hash_bd_move_locked(hs, old, &new, hnode); + c++; + } + } + return c; +} + +static void +cfs_hash_rehash_worker(struct work_struct *work) +{ + struct cfs_hash *hs = container_of(work, struct cfs_hash, + hs_rehash_work); + struct cfs_hash_bucket **bkts; + struct cfs_hash_bd bd; + unsigned int old_size; + unsigned int new_size; + int bsize; + int count = 0; + int rc = 0; + int i; + + LASSERT(hs != NULL && cfs_hash_with_rehash(hs)); + + cfs_hash_lock(hs, 0); + LASSERT(cfs_hash_is_rehashing(hs)); + + old_size = CFS_HASH_NBKT(hs); + new_size = CFS_HASH_RH_NBKT(hs); + + cfs_hash_unlock(hs, 0); + + /* + * don't need hs::hs_rwlock for hs::hs_buckets, + * because nobody can change bkt-table except me. 
+ */ + bkts = cfs_hash_buckets_realloc(hs, hs->hs_buckets, + old_size, new_size); + cfs_hash_lock(hs, 1); + if (bkts == NULL) { + rc = -ENOMEM; + goto out; + } + + if (bkts == hs->hs_buckets) { + bkts = NULL; /* do nothing */ + goto out; + } + + rc = __cfs_hash_theta(hs); + if ((rc >= hs->hs_min_theta) && (rc <= hs->hs_max_theta)) { + /* free the new allocated bkt-table */ + old_size = new_size; + new_size = CFS_HASH_NBKT(hs); + rc = -EALREADY; + goto out; + } + + LASSERT(hs->hs_rehash_buckets == NULL); + hs->hs_rehash_buckets = bkts; + + rc = 0; + cfs_hash_for_each_bucket(hs, &bd, i) { + if (cfs_hash_is_exiting(hs)) { + rc = -ESRCH; + /* someone wants to destroy the hash, abort now */ + if (old_size < new_size) /* OK to free old bkt-table */ + break; + /* it's shrinking, need free new bkt-table */ + hs->hs_rehash_buckets = NULL; + old_size = new_size; + new_size = CFS_HASH_NBKT(hs); + goto out; + } + + count += cfs_hash_rehash_bd(hs, &bd); + if (count < CFS_HASH_LOOP_HOG || + cfs_hash_is_iterating(hs)) { /* need to finish ASAP */ + continue; + } + + count = 0; + cfs_hash_unlock(hs, 1); + cond_resched(); + cfs_hash_lock(hs, 1); + } + + hs->hs_rehash_count++; + + bkts = hs->hs_buckets; + hs->hs_buckets = hs->hs_rehash_buckets; + hs->hs_rehash_buckets = NULL; + + hs->hs_cur_bits = hs->hs_rehash_bits; +out: + hs->hs_rehash_bits = 0; + bsize = cfs_hash_bkt_size(hs); + cfs_hash_unlock(hs, 1); + /* can't refer to @hs anymore because it could be destroyed */ + if (bkts != NULL) + cfs_hash_buckets_free(bkts, bsize, new_size, old_size); + if (rc != 0) + CDEBUG(D_INFO, "early quit of rehashing: %d\n", rc); +} + +/** + * Rehash the object referenced by @hnode in the libcfs hash @hs. The + * @old_key must be provided to locate the objects previous location + * in the hash, and the @new_key will be used to reinsert the object. + * Use this function instead of a cfs_hash_add() + cfs_hash_del() + * combo when it is critical that there is no window in time where the + * object is missing from the hash. When an object is being rehashed + * the registered cfs_hash_get() and cfs_hash_put() functions will + * not be called. 
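+ *
+ * Minimal sketch (illustrative only; assumes the hash was created with
+ * CFS_HASH_REHASH_KEY and an ops->hs_keycpy method, and that "obj" embeds
+ * the item's hlist_node):
+ *
+ *	cfs_hash_rehash_key(hs, &old_key, &new_key, &obj->mo_hnode);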
+ */ +void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key, + void *new_key, struct hlist_node *hnode) +{ + struct cfs_hash_bd bds[3]; + struct cfs_hash_bd old_bds[2]; + struct cfs_hash_bd new_bd; + + LASSERT(!hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + + cfs_hash_dual_bd_get(hs, old_key, old_bds); + cfs_hash_bd_get(hs, new_key, &new_bd); + + bds[0] = old_bds[0]; + bds[1] = old_bds[1]; + bds[2] = new_bd; + + /* NB: bds[0] and bds[1] are ordered already */ + cfs_hash_bd_order(&bds[1], &bds[2]); + cfs_hash_bd_order(&bds[0], &bds[1]); + + cfs_hash_multi_bd_lock(hs, bds, 3, 1); + if (likely(old_bds[1].bd_bucket == NULL)) { + cfs_hash_bd_move_locked(hs, &old_bds[0], &new_bd, hnode); + } else { + cfs_hash_dual_bd_finddel_locked(hs, old_bds, old_key, hnode); + cfs_hash_bd_add_locked(hs, &new_bd, hnode); + } + /* overwrite key inside locks, otherwise may screw up with + * other operations, i.e: rehash */ + cfs_hash_keycpy(hs, hnode, new_key); + + cfs_hash_multi_bd_unlock(hs, bds, 3, 1); + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_rehash_key); + +void cfs_hash_debug_header(struct seq_file *m) +{ + seq_printf(m, "%-*s cur min max theta t-min t-max flags rehash count maxdep maxdepb distribution\n", + CFS_HASH_BIGNAME_LEN, "name"); +} +EXPORT_SYMBOL(cfs_hash_debug_header); + +static struct cfs_hash_bucket ** +cfs_hash_full_bkts(struct cfs_hash *hs) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (hs->hs_rehash_buckets == NULL) + return hs->hs_buckets; + + LASSERT(hs->hs_rehash_bits != 0); + return hs->hs_rehash_bits > hs->hs_cur_bits ? + hs->hs_rehash_buckets : hs->hs_buckets; +} + +static unsigned int +cfs_hash_full_nbkt(struct cfs_hash *hs) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (hs->hs_rehash_buckets == NULL) + return CFS_HASH_NBKT(hs); + + LASSERT(hs->hs_rehash_bits != 0); + return hs->hs_rehash_bits > hs->hs_cur_bits ? + CFS_HASH_RH_NBKT(hs) : CFS_HASH_NBKT(hs); +} + +void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m) +{ + int dist[8] = { 0, }; + int maxdep = -1; + int maxdepb = -1; + int total = 0; + int theta; + int i; + + cfs_hash_lock(hs, 0); + theta = __cfs_hash_theta(hs); + + seq_printf(m, "%-*s %5d %5d %5d %d.%03d %d.%03d %d.%03d 0x%02x %6d ", + CFS_HASH_BIGNAME_LEN, hs->hs_name, + 1 << hs->hs_cur_bits, 1 << hs->hs_min_bits, + 1 << hs->hs_max_bits, + __cfs_hash_theta_int(theta), __cfs_hash_theta_frac(theta), + __cfs_hash_theta_int(hs->hs_min_theta), + __cfs_hash_theta_frac(hs->hs_min_theta), + __cfs_hash_theta_int(hs->hs_max_theta), + __cfs_hash_theta_frac(hs->hs_max_theta), + hs->hs_flags, hs->hs_rehash_count); + + /* + * The distribution is a summary of the chained hash depth in + * each of the libcfs hash buckets. Each buckets hsb_count is + * divided by the hash theta value and used to generate a + * histogram of the hash distribution. A uniform hash will + * result in all hash buckets being close to the average thus + * only the first few entries in the histogram will be non-zero. + * If you hash function results in a non-uniform hash the will + * be observable by outlier bucks in the distribution histogram. 
+ * + * Uniform hash distribution: 128/128/0/0/0/0/0/0 + * Non-Uniform hash distribution: 128/125/0/0/0/0/2/1 + */ + for (i = 0; i < cfs_hash_full_nbkt(hs); i++) { + struct cfs_hash_bd bd; + + bd.bd_bucket = cfs_hash_full_bkts(hs)[i]; + cfs_hash_bd_lock(hs, &bd, 0); + if (maxdep < bd.bd_bucket->hsb_depmax) { + maxdep = bd.bd_bucket->hsb_depmax; + maxdepb = ffz(~maxdep); + } + total += bd.bd_bucket->hsb_count; + dist[min(fls(bd.bd_bucket->hsb_count / max(theta, 1)), 7)]++; + cfs_hash_bd_unlock(hs, &bd, 0); + } + + seq_printf(m, "%7d %7d %7d ", total, maxdep, maxdepb); + for (i = 0; i < 8; i++) + seq_printf(m, "%d%c", dist[i], (i == 7) ? '\n' : '/'); + + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_debug_str); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c new file mode 100644 index 0000000000000..2616fc9fe9386 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c @@ -0,0 +1,1270 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include + +/** virtual processing unit */ +struct cfs_cpu_partition { + /* CPUs mask for this partition */ + cpumask_var_t cpt_cpumask; + /* nodes mask for this partition */ + nodemask_t *cpt_nodemask; + /* NUMA distance between CPTs */ + unsigned int *cpt_distance; + /* spread rotor for NUMA allocator */ + unsigned int cpt_spread_rotor; + /* NUMA node if cpt_nodemask is empty */ + int cpt_node; +}; + +/** descriptor for CPU partitions */ +struct cfs_cpt_table { + /* spread rotor for NUMA allocator */ + unsigned int ctb_spread_rotor; + /* maximum NUMA distance between all nodes in table */ + unsigned int ctb_distance; + /* # of CPU partitions */ + int ctb_nparts; + /* partitions tables */ + struct cfs_cpu_partition *ctb_parts; + /* shadow HW CPU to CPU partition ID */ + int *ctb_cpu2cpt; + /* all cpus in this partition table */ + cpumask_var_t ctb_cpumask; + /* shadow HW node to CPU partition ID */ + int *ctb_node2cpt; + /* all nodes in this partition table */ + nodemask_t *ctb_nodemask; +}; + +/** Global CPU partition table */ +struct cfs_cpt_table *cfs_cpt_tab __read_mostly; +EXPORT_SYMBOL(cfs_cpt_tab); + +/** + * modparam for setting number of partitions + * + * 0 : estimate best value based on cores or NUMA nodes + * 1 : disable multiple partitions + * >1 : specify number of partitions + */ +static int cpu_npartitions; +module_param(cpu_npartitions, int, 0444); +MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); + +/** + * modparam for setting CPU partitions patterns: + * + * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, + * number in bracket is processor ID (core or HT) + * + * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket + * are NUMA node ID, number before bracket is CPU partition ID. 
+ * + * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology + * + * NB: If user specified cpu_pattern, cpu_npartitions will be ignored + */ +static char *cpu_pattern = "N"; +module_param(cpu_pattern, charp, 0444); +MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern"); + +struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt) +{ + struct cfs_cpt_table *cptab; + int i; + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (!cptab) + return NULL; + + cptab->ctb_nparts = ncpt; + + if (!zalloc_cpumask_var(&cptab->ctb_cpumask, GFP_NOFS)) + goto failed_alloc_cpumask; + + LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + if (!cptab->ctb_nodemask) + goto failed_alloc_nodemask; + + CFS_ALLOC_PTR_ARRAY(cptab->ctb_cpu2cpt, nr_cpu_ids); + if (!cptab->ctb_cpu2cpt) + goto failed_alloc_cpu2cpt; + + memset(cptab->ctb_cpu2cpt, -1, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + + CFS_ALLOC_PTR_ARRAY(cptab->ctb_node2cpt, nr_node_ids); + if (!cptab->ctb_node2cpt) + goto failed_alloc_node2cpt; + + memset(cptab->ctb_node2cpt, -1, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + + CFS_ALLOC_PTR_ARRAY(cptab->ctb_parts, ncpt); + if (!cptab->ctb_parts) + goto failed_alloc_ctb_parts; + + memset(cptab->ctb_parts, -1, ncpt * sizeof(cptab->ctb_parts[0])); + + for (i = 0; i < ncpt; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (!zalloc_cpumask_var(&part->cpt_cpumask, GFP_NOFS)) + goto failed_setting_ctb_parts; + + LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask)); + if (!part->cpt_nodemask) + goto failed_setting_ctb_parts; + + CFS_ALLOC_PTR_ARRAY(part->cpt_distance, cptab->ctb_nparts); + if (!part->cpt_distance) + goto failed_setting_ctb_parts; + + memset(part->cpt_distance, -1, + cptab->ctb_nparts * sizeof(part->cpt_distance[0])); + } + + return cptab; + +failed_setting_ctb_parts: + while (i-- >= 0) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (part->cpt_nodemask) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + free_cpumask_var(part->cpt_cpumask); + + if (part->cpt_distance) { + CFS_FREE_PTR_ARRAY(part->cpt_distance, + cptab->ctb_nparts); + } + } + + if (cptab->ctb_parts) + CFS_FREE_PTR_ARRAY(cptab->ctb_parts, cptab->ctb_nparts); + +failed_alloc_ctb_parts: + if (cptab->ctb_node2cpt) + CFS_FREE_PTR_ARRAY(cptab->ctb_node2cpt, nr_node_ids); + +failed_alloc_node2cpt: + if (cptab->ctb_cpu2cpt) + CFS_FREE_PTR_ARRAY(cptab->ctb_cpu2cpt, nr_cpu_ids); + +failed_alloc_cpu2cpt: + if (cptab->ctb_nodemask) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); +failed_alloc_nodemask: + free_cpumask_var(cptab->ctb_cpumask); +failed_alloc_cpumask: + LIBCFS_FREE(cptab, sizeof(*cptab)); + return NULL; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +void cfs_cpt_table_free(struct cfs_cpt_table *cptab) +{ + int i; + + if (cptab->ctb_cpu2cpt) + CFS_FREE_PTR_ARRAY(cptab->ctb_cpu2cpt, nr_cpu_ids); + + if (cptab->ctb_node2cpt) + CFS_FREE_PTR_ARRAY(cptab->ctb_node2cpt, nr_node_ids); + + for (i = 0; cptab->ctb_parts && i < cptab->ctb_nparts; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (part->cpt_nodemask) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + free_cpumask_var(part->cpt_cpumask); + + if (part->cpt_distance) + CFS_FREE_PTR_ARRAY(part->cpt_distance, + cptab->ctb_nparts); + } + + if (cptab->ctb_parts) + CFS_FREE_PTR_ARRAY(cptab->ctb_parts, cptab->ctb_nparts); + + if (cptab->ctb_nodemask) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + 
free_cpumask_var(cptab->ctb_cpumask); + + LIBCFS_FREE(cptab, sizeof(*cptab)); +} +EXPORT_SYMBOL(cfs_cpt_table_free); + +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + char *tmp = buf; + int rc; + int i; + int j; + + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len <= 0) + goto err; + + rc = snprintf(tmp, len, "%d\t:", i); + len -= rc; + + if (len <= 0) + goto err; + + tmp += rc; + for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) { + rc = snprintf(tmp, len, " %d", j); + len -= rc; + if (len <= 0) + goto err; + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + return tmp - buf; +err: + return -E2BIG; +} +EXPORT_SYMBOL(cfs_cpt_table_print); + +int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + char *tmp = buf; + int rc; + int i; + int j; + + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len <= 0) + goto err; + + rc = snprintf(tmp, len, "%d\t:", i); + len -= rc; + + if (len <= 0) + goto err; + + tmp += rc; + for (j = 0; j < cptab->ctb_nparts; j++) { + rc = snprintf(tmp, len, " %d:%d", j, + cptab->ctb_parts[i].cpt_distance[j]); + len -= rc; + if (len <= 0) + goto err; + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + return tmp - buf; +err: + return -E2BIG; +} +EXPORT_SYMBOL(cfs_cpt_distance_print); + +int cfs_cpt_number(struct cfs_cpt_table *cptab) +{ + return cptab->ctb_nparts; +} +EXPORT_SYMBOL(cfs_cpt_number); + +int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_weight(cptab->ctb_cpumask) : + cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask); +} +EXPORT_SYMBOL(cfs_cpt_weight); + +int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_any_and(cptab->ctb_cpumask, + cpu_online_mask) < nr_cpu_ids : + cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask, + cpu_online_mask) < nr_cpu_ids; +} +EXPORT_SYMBOL(cfs_cpt_online); + +cpumask_var_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + &cptab->ctb_cpumask : &cptab->ctb_parts[cpt].cpt_cpumask; +} +EXPORT_SYMBOL(cfs_cpt_cpumask); + +nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask; +} +EXPORT_SYMBOL(cfs_cpt_nodemask); + +unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2) +{ + LASSERT(cpt1 == CFS_CPT_ANY || (cpt1 >= 0 && cpt1 < cptab->ctb_nparts)); + LASSERT(cpt2 == CFS_CPT_ANY || (cpt2 >= 0 && cpt2 < cptab->ctb_nparts)); + + if (cpt1 == CFS_CPT_ANY || cpt2 == CFS_CPT_ANY) + return cptab->ctb_distance; + + return cptab->ctb_parts[cpt1].cpt_distance[cpt2]; +} +EXPORT_SYMBOL(cfs_cpt_distance); + +/* + * Calculate the maximum NUMA distance between all nodes in the + * from_mask and all nodes in the to_mask. 
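+ *
+ * For example (illustrative values only): if the ACPI SLIT reports a local
+ * distance of 10 and a remote distance of 21 between two nodes, disjoint
+ * per-node masks yield 21, while a mask confined to one node yields 10.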
+ */ +static unsigned int cfs_cpt_distance_calculate(nodemask_t *from_mask, + nodemask_t *to_mask) +{ + unsigned int maximum; + unsigned int distance; + int from; + int to; + + maximum = 0; + for_each_node_mask(from, *from_mask) { + for_each_node_mask(to, *to_mask) { + distance = node_distance(from, to); + if (maximum < distance) + maximum = distance; + } + } + return maximum; +} + +static void cfs_cpt_add_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + cptab->ctb_cpu2cpt[cpu] = cpt; + + cpumask_set_cpu(cpu, cptab->ctb_cpumask); + cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); +} + +static void cfs_cpt_del_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); + cpumask_clear_cpu(cpu, cptab->ctb_cpumask); + + cptab->ctb_cpu2cpt[cpu] = -1; +} + +static void cfs_cpt_add_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + struct cfs_cpu_partition *part; + + if (!node_isset(node, *cptab->ctb_nodemask)) { + unsigned int dist; + + /* first time node is added to the CPT table */ + node_set(node, *cptab->ctb_nodemask); + cptab->ctb_node2cpt[node] = cpt; + + dist = cfs_cpt_distance_calculate(cptab->ctb_nodemask, + cptab->ctb_nodemask); + cptab->ctb_distance = dist; + } + + part = &cptab->ctb_parts[cpt]; + if (!node_isset(node, *part->cpt_nodemask)) { + int cpt2; + + /* first time node is added to this CPT */ + node_set(node, *part->cpt_nodemask); + for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { + struct cfs_cpu_partition *part2; + unsigned int dist; + + part2 = &cptab->ctb_parts[cpt2]; + dist = cfs_cpt_distance_calculate(part->cpt_nodemask, + part2->cpt_nodemask); + part->cpt_distance[cpt2] = dist; + dist = cfs_cpt_distance_calculate(part2->cpt_nodemask, + part->cpt_nodemask); + part2->cpt_distance[cpt] = dist; + } + } +} + +static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; + int cpu; + + for_each_cpu(cpu, part->cpt_cpumask) { + /* this CPT has other CPU belonging to this node? */ + if (cpu_to_node(cpu) == node) + break; + } + + if (cpu >= nr_cpu_ids && node_isset(node, *part->cpt_nodemask)) { + int cpt2; + + /* No more CPUs in the node for this CPT. */ + node_clear(node, *part->cpt_nodemask); + for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { + struct cfs_cpu_partition *part2; + unsigned int dist; + + part2 = &cptab->ctb_parts[cpt2]; + if (node_isset(node, *part2->cpt_nodemask)) + cptab->ctb_node2cpt[node] = cpt2; + + dist = cfs_cpt_distance_calculate(part->cpt_nodemask, + part2->cpt_nodemask); + part->cpt_distance[cpt2] = dist; + dist = cfs_cpt_distance_calculate(part2->cpt_nodemask, + part->cpt_nodemask); + part2->cpt_distance[cpt] = dist; + } + } + + for_each_cpu(cpu, cptab->ctb_cpumask) { + /* this CPT-table has other CPUs belonging to this node? */ + if (cpu_to_node(cpu) == node) + break; + } + + if (cpu >= nr_cpu_ids && node_isset(node, *cptab->ctb_nodemask)) { + /* No more CPUs in the table for this node. 
*/ + node_clear(node, *cptab->ctb_nodemask); + cptab->ctb_node2cpt[node] = -1; + cptab->ctb_distance = + cfs_cpt_distance_calculate(cptab->ctb_nodemask, + cptab->ctb_nodemask); + } +} + +int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) { + CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); + return 0; + } + + if (cptab->ctb_cpu2cpt[cpu] != -1) { + CDEBUG(D_INFO, "CPU %d is already in partition %d\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + if (cpumask_test_cpu(cpu, cptab->ctb_cpumask)) { + CDEBUG(D_INFO, "CPU %d is already in cpumask\n", cpu); + return 0; + } + + if (cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)) { + CDEBUG(D_INFO, "CPU %d is already in partition %d cpumask\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + cfs_cpt_add_cpu(cptab, cpt, cpu); + cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpu); + +void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpu < 0 || cpu >= nr_cpu_ids) { + CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu); + return; + } + + if (cpt == CFS_CPT_ANY) { + /* caller doesn't know the partition ID */ + cpt = cptab->ctb_cpu2cpt[cpu]; + if (cpt < 0) { /* not set in this CPT-table */ + CDEBUG(D_INFO, + "Try to unset cpu %d which is not in CPT-table %p\n", + cpt, cptab); + return; + } + + } else if (cpt != cptab->ctb_cpu2cpt[cpu]) { + CDEBUG(D_INFO, + "CPU %d is not in CPU partition %d\n", cpu, cpt); + return; + } + + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask)); + + cfs_cpt_del_cpu(cptab, cpt, cpu); + cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); +} +EXPORT_SYMBOL(cfs_cpt_unset_cpu); + +int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask) +{ + int cpu; + + if (!cpumask_weight(mask) || + cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) { + CDEBUG(D_INFO, + "No online CPU is found in the CPU mask for CPU partition %d\n", + cpt); + return 0; + } + + for_each_cpu(cpu, mask) { + cfs_cpt_add_cpu(cptab, cpt, cpu); + cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); + } + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpumask); + +void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask) +{ + int cpu; + + for_each_cpu(cpu, mask) { + cfs_cpt_del_cpu(cptab, cpt, cpu); + cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); + } +} +EXPORT_SYMBOL(cfs_cpt_unset_cpumask); + +int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + const cpumask_t *mask; + int cpu; + + if (node < 0 || node >= nr_node_ids) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return 0; + } + + mask = cpumask_of_node(node); + + for_each_cpu(cpu, mask) + cfs_cpt_add_cpu(cptab, cpt, cpu); + + cfs_cpt_add_node(cptab, cpt, node); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_node); + +void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + const cpumask_t *mask; + int cpu; + + if (node < 0 || node >= nr_node_ids) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return; + } + + mask = cpumask_of_node(node); + + for_each_cpu(cpu, mask) + cfs_cpt_del_cpu(cptab, cpt, cpu); + + cfs_cpt_del_node(cptab, cpt, node); +} +EXPORT_SYMBOL(cfs_cpt_unset_node); + +int 
cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask) +{ + int node; + + for_each_node_mask(node, *mask) + cfs_cpt_set_node(cptab, cpt, node); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_nodemask); + +void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask) +{ + int node; + + for_each_node_mask(node, *mask) + cfs_cpt_unset_node(cptab, cpt, node); +} +EXPORT_SYMBOL(cfs_cpt_unset_nodemask); + +int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) +{ + nodemask_t *mask; + int weight; + unsigned int rotor; + int node = 0; + + /* convert CPU partition ID to HW node id */ + + if (cpt < 0 || cpt >= cptab->ctb_nparts) { + mask = cptab->ctb_nodemask; + rotor = cptab->ctb_spread_rotor++; + } else { + mask = cptab->ctb_parts[cpt].cpt_nodemask; + rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; + node = cptab->ctb_parts[cpt].cpt_node; + } + + weight = nodes_weight(*mask); + if (weight > 0) { + rotor %= weight; + + for_each_node_mask(node, *mask) { + if (!rotor--) + return node; + } + } + + return node; +} +EXPORT_SYMBOL(cfs_cpt_spread_node); + +int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) +{ + int cpu; + int cpt; + + preempt_disable(); + cpu = smp_processor_id(); + cpt = cptab->ctb_cpu2cpt[cpu]; + + if (cpt < 0 && remap) { + /* don't return negative value for safety of upper layer, + * instead we shadow the unknown cpu to a valid partition ID + */ + cpt = cpu % cptab->ctb_nparts; + } + preempt_enable(); + return cpt; +} +EXPORT_SYMBOL(cfs_cpt_current); + +int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) +{ + LASSERT(cpu >= 0 && cpu < nr_cpu_ids); + + return cptab->ctb_cpu2cpt[cpu]; +} +EXPORT_SYMBOL(cfs_cpt_of_cpu); + +int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node) +{ + if (node < 0 || node > nr_node_ids) + return CFS_CPT_ANY; + + return cptab->ctb_node2cpt[node]; +} +EXPORT_SYMBOL(cfs_cpt_of_node); + +int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) +{ + nodemask_t *nodemask; + cpumask_t *cpumask; + int cpu; + int rc; + + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpt == CFS_CPT_ANY) { + cpumask = cptab->ctb_cpumask; + nodemask = cptab->ctb_nodemask; + } else { + cpumask = cptab->ctb_parts[cpt].cpt_cpumask; + nodemask = cptab->ctb_parts[cpt].cpt_nodemask; + } + + if (!cpumask_intersects(cpumask, cpu_online_mask)) { + CDEBUG(D_INFO, + "No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n", + cpt); + return -ENODEV; + } + + for_each_online_cpu(cpu) { + if (cpumask_test_cpu(cpu, cpumask)) + continue; + + rc = set_cpus_allowed_ptr(current, cpumask); + set_mems_allowed(*nodemask); + if (!rc) + schedule(); /* switch to allowed CPU */ + + return rc; + } + + /* don't need to set affinity because all online CPUs are covered */ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_bind); + +/** + * Choose max to \a number CPUs from \a node and set them in \a cpt. + * We always prefer to choose CPU in the same core/socket. 
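+ *
+ * For example (illustrative): with 2-way SMT, asking for 4 CPUs from a node
+ * takes both hyperthreads of two cores in the same socket before touching
+ * a third core, keeping the partition topologically compact.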
+ */ +static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, + cpumask_t *node_mask, int number) +{ + cpumask_var_t socket_mask; + cpumask_var_t core_mask; + int rc = 0; + int cpu; + int i; + + LASSERT(number > 0); + + if (number >= cpumask_weight(node_mask)) { + while (!cpumask_empty(node_mask)) { + cpu = cpumask_first(node_mask); + cpumask_clear_cpu(cpu, node_mask); + + if (!cpu_online(cpu)) + continue; + + rc = cfs_cpt_set_cpu(cptab, cpt, cpu); + if (!rc) + return -EINVAL; + } + return 0; + } + + /* + * Allocate scratch buffers + * As we cannot initialize a cpumask_var_t, we need + * to alloc both before we can risk trying to free either + */ + if (!zalloc_cpumask_var(&socket_mask, GFP_NOFS)) + rc = -ENOMEM; + if (!zalloc_cpumask_var(&core_mask, GFP_NOFS)) + rc = -ENOMEM; + if (rc) + goto out; + + while (!cpumask_empty(node_mask)) { + cpu = cpumask_first(node_mask); + + /* get cpumask for cores in the same socket */ + cpumask_and(socket_mask, topology_core_cpumask(cpu), node_mask); + while (!cpumask_empty(socket_mask)) { + /* get cpumask for hts in the same core */ + cpumask_and(core_mask, topology_sibling_cpumask(cpu), + node_mask); + + for_each_cpu(i, core_mask) { + cpumask_clear_cpu(i, socket_mask); + cpumask_clear_cpu(i, node_mask); + + if (!cpu_online(i)) + continue; + + rc = cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + rc = -EINVAL; + goto out; + } + + if (!--number) + goto out; + } + cpu = cpumask_first(socket_mask); + } + } + +out: + free_cpumask_var(socket_mask); + free_cpumask_var(core_mask); + return rc; +} + +#define CPT_WEIGHT_MIN 4u + +static unsigned int cfs_cpt_num_estimate(void) +{ + unsigned int nthr; + unsigned int ncpu = num_online_cpus(); + unsigned int ncpt = 1; + + preempt_disable(); + nthr = cpumask_weight(topology_sibling_cpumask(smp_processor_id())); + preempt_enable(); + + if (ncpu > CPT_WEIGHT_MIN) + for (ncpt = 2; ncpu > 2 * nthr * ncpt; ncpt++) + ; /* nothing */ + +#if (BITS_PER_LONG == 32) + /* config many CPU partitions on 32-bit system could consume + * too much memory + */ + ncpt = min(2U, ncpt); +#endif + while (ncpu % ncpt) + ncpt--; /* worst case is 1 */ + + return ncpt; +} + +static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt) +{ + struct cfs_cpt_table *cptab = NULL; + cpumask_var_t node_mask; + int cpt = 0; + int node; + int num; + int rem; + int rc = 0; + + num = cfs_cpt_num_estimate(); + if (ncpt <= 0) + ncpt = num; + + if (ncpt > num_online_cpus()) { + rc = -EINVAL; + CERROR("libcfs: CPU partition count %d > cores %d: rc = %d\n", + ncpt, num_online_cpus(), rc); + goto failed; + } + + if (ncpt > 4 * num) { + CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n", + ncpt, num); + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (!cptab) { + CERROR("Failed to allocate CPU map(%d)\n", ncpt); + rc = -ENOMEM; + goto failed; + } + + if (!zalloc_cpumask_var(&node_mask, GFP_NOFS)) { + CERROR("Failed to allocate scratch cpumask\n"); + rc = -ENOMEM; + goto failed; + } + + num = num_online_cpus() / ncpt; + rem = num_online_cpus() % ncpt; + for_each_online_node(node) { + cpumask_copy(node_mask, cpumask_of_node(node)); + + while (cpt < ncpt && !cpumask_empty(node_mask)) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; + int ncpu = cpumask_weight(part->cpt_cpumask); + + rc = cfs_cpt_choose_ncpus(cptab, cpt, node_mask, + (rem > 0) + num - ncpu); + if (rc < 0) { + rc = -EINVAL; + goto failed_mask; + } + + ncpu = 
cpumask_weight(part->cpt_cpumask); + if (ncpu == num + !!(rem > 0)) { + cpt++; + rem--; + } + } + } + + free_cpumask_var(node_mask); + + return cptab; + +failed_mask: + free_cpumask_var(node_mask); +failed: + CERROR("Failed (rc = %d) to setup CPU partition table with %d partitions, online HW NUMA nodes: %d, HW CPU cores: %d.\n", + rc, ncpt, num_online_nodes(), num_online_cpus()); + + if (cptab) + cfs_cpt_table_free(cptab); + + return ERR_PTR(rc); +} + +static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) +{ + struct cfs_cpt_table *cptab; + char *pattern_dup; + char *bracket; + char *str; + int node = 0; + int ncpt = 0; + int cpt = 0; + int high; + int rc; + int c; + int i; + + pattern_dup = kstrdup(pattern, GFP_KERNEL); + if (!pattern_dup) { + CERROR("Failed to duplicate pattern '%s'\n", pattern); + return ERR_PTR(-ENOMEM); + } + + str = strim(pattern_dup); + if (*str == 'n' || *str == 'N') { + str++; /* skip 'N' char */ + node = 1; /* NUMA pattern */ + if (*str == '\0') { + node = -1; + for_each_online_node(i) { + if (!cpumask_empty(cpumask_of_node(i))) + ncpt++; + } + if (ncpt == 1) { /* single NUMA node */ + kfree(pattern_dup); + return cfs_cpt_table_create(cpu_npartitions); + } + } + } + + if (!ncpt) { /* scanning bracket which is mark of partition */ + bracket = str; + while ((bracket = strchr(bracket, '['))) { + bracket++; + ncpt++; + } + } + + if (!ncpt || + (node && ncpt > num_online_nodes()) || + (!node && ncpt > num_online_cpus())) { + CERROR("Invalid pattern '%s', or too many partitions %d\n", + pattern_dup, ncpt); + rc = -EINVAL; + goto err_free_str; + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (!cptab) { + CERROR("Failed to allocate CPU partition table\n"); + rc = -ENOMEM; + goto err_free_str; + } + + if (node < 0) { /* shortcut to create CPT from NUMA & CPU topology */ + for_each_online_node(i) { + if (cpumask_empty(cpumask_of_node(i))) + continue; + + rc = cfs_cpt_set_node(cptab, cpt++, i); + if (!rc) { + rc = -EINVAL; + goto err_free_table; + } + } + kfree(pattern_dup); + return cptab; + } + + high = node ? 
nr_node_ids - 1 : nr_cpu_ids - 1; + + for (str = strim(str), c = 0; /* until break */; c++) { + struct cfs_range_expr *range; + struct cfs_expr_list *el; + int n; + + bracket = strchr(str, '['); + if (!bracket) { + if (*str) { + CERROR("Invalid pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } else if (c != ncpt) { + CERROR("Expect %d partitions but found %d\n", + ncpt, c); + rc = -EINVAL; + goto err_free_table; + } + break; + } + + if (sscanf(str, "%d%n", &cpt, &n) < 1) { + CERROR("Invalid CPU pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } + + if (cpt < 0 || cpt >= ncpt) { + CERROR("Invalid partition id %d, total partitions %d\n", + cpt, ncpt); + rc = -EINVAL; + goto err_free_table; + } + + if (cfs_cpt_weight(cptab, cpt)) { + CERROR("Partition %d has already been set.\n", cpt); + rc = -EPERM; + goto err_free_table; + } + + str = strim(str + n); + if (str != bracket) { + CERROR("Invalid pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } + + bracket = strchr(str, ']'); + if (!bracket) { + CERROR("Missing right bracket for partition %d in '%s'\n", + cpt, str); + rc = -EINVAL; + goto err_free_table; + } + + rc = cfs_expr_list_parse(str, (bracket - str) + 1, 0, high, + &el); + if (rc) { + CERROR("Can't parse number range in '%s'\n", str); + rc = -ERANGE; + goto err_free_table; + } + + list_for_each_entry(range, &el->el_exprs, re_link) { + for (i = range->re_lo; i <= range->re_hi; i++) { + if ((i - range->re_lo) % range->re_stride) + continue; + + rc = node ? cfs_cpt_set_node(cptab, cpt, i) + : cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + cfs_expr_list_free(el); + rc = -EINVAL; + goto err_free_table; + } + } + } + + cfs_expr_list_free(el); + + if (!cfs_cpt_online(cptab, cpt)) { + CERROR("No online CPU is found on partition %d\n", cpt); + rc = -ENODEV; + goto err_free_table; + } + + str = strim(bracket + 1); + } + + kfree(pattern_dup); + return cptab; + +err_free_table: + cfs_cpt_table_free(cptab); +err_free_str: + kfree(pattern_dup); + return ERR_PTR(rc); +} + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE +static enum cpuhp_state lustre_cpu_online; + +static int cfs_cpu_online(unsigned int cpu) +{ + return 0; +} +#endif + +static int cfs_cpu_dead(unsigned int cpu) +{ + bool warn; + + /* if all HTs in a core are offline, it may break affinity */ + warn = cpumask_any_and(topology_sibling_cpumask(cpu), + cpu_online_mask) >= nr_cpu_ids; + CDEBUG(warn ? 
D_WARNING : D_INFO, + "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u]\n", + cpu); + return 0; +} + +#ifndef HAVE_HOTPLUG_STATE_MACHINE +static int cfs_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + default: + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) { + CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n", + cpu, action); + break; + } + + cfs_cpu_dead(cpu); + } + + return NOTIFY_OK; +} + +static struct notifier_block cfs_cpu_notifier = { + .notifier_call = cfs_cpu_notify, + .priority = 0 +}; +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + +void cfs_cpu_fini(void) +{ + if (!IS_ERR_OR_NULL(cfs_cpt_tab)) + cfs_cpt_table_free(cfs_cpt_tab); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + if (lustre_cpu_online > 0) + cpuhp_remove_state_nocalls(lustre_cpu_online); + cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); +#else + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ +} + +int cfs_cpu_init(void) +{ + int ret; + + LASSERT(!cfs_cpt_tab); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + ret = cpuhp_setup_state_nocalls(CPUHP_LUSTRE_CFS_DEAD, + "fs/lustre/cfe:dead", NULL, + cfs_cpu_dead); + if (ret < 0) + goto failed_cpu_dead; + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "fs/lustre/cfe:online", + cfs_cpu_online, NULL); + if (ret < 0) + goto failed_cpu_online; + + lustre_cpu_online = ret; +#else + register_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + + cpus_read_lock(); + if (*cpu_pattern) { + cfs_cpt_tab = cfs_cpt_table_create_pattern(cpu_pattern); + if (IS_ERR(cfs_cpt_tab)) { + CERROR("Failed to create cptab from pattern '%s'\n", + cpu_pattern); + ret = PTR_ERR(cfs_cpt_tab); + goto failed_alloc_table; + } + + } else { + cfs_cpt_tab = cfs_cpt_table_create(cpu_npartitions); + if (IS_ERR(cfs_cpt_tab)) { + CERROR("Failed to create cptab with npartitions %d\n", + cpu_npartitions); + ret = PTR_ERR(cfs_cpt_tab); + goto failed_alloc_table; + } + } + + cpus_read_unlock(); + + LCONSOLE(0, "HW NUMA nodes: %d, HW CPU cores: %d, npartitions: %d\n", + num_online_nodes(), num_online_cpus(), + cfs_cpt_number(cfs_cpt_tab)); + return 0; + +failed_alloc_table: + cpus_read_unlock(); + + if (!IS_ERR_OR_NULL(cfs_cpt_tab)) + cfs_cpt_table_free(cfs_cpt_tab); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + if (lustre_cpu_online > 0) + cpuhp_remove_state_nocalls(lustre_cpu_online); +failed_cpu_online: + cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); +failed_cpu_dead: +#else + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + return ret; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_lock.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_lock.c new file mode 100644 index 0000000000000..c4ad568654a13 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_lock.c @@ -0,0 +1,156 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/** destroy cpu-partition lock, see libcfs_private.h for more detail */ +void +cfs_percpt_lock_free(struct cfs_percpt_lock *pcl) +{ + LASSERT(pcl->pcl_locks != NULL); + LASSERT(!pcl->pcl_locked); + + cfs_percpt_free(pcl->pcl_locks); + LIBCFS_FREE(pcl, sizeof(*pcl)); +} +EXPORT_SYMBOL(cfs_percpt_lock_free); + +/** + * create cpu-partition lock, see libcfs_private.h for more detail. + * + * cpu-partition lock is designed for large-scale SMP system, so we need to + * reduce cacheline conflict as possible as we can, that's the + * reason we always allocate cacheline-aligned memory block. + */ +struct cfs_percpt_lock * +cfs_percpt_lock_create(struct cfs_cpt_table *cptab, + struct lock_class_key *keys) +{ + struct cfs_percpt_lock *pcl; + spinlock_t *lock; + int i; + + /* NB: cptab can be NULL, pcl will be for HW CPUs on that case */ + LIBCFS_ALLOC(pcl, sizeof(*pcl)); + if (pcl == NULL) + return NULL; + + pcl->pcl_cptab = cptab; + pcl->pcl_locks = cfs_percpt_alloc(cptab, sizeof(*lock)); + if (pcl->pcl_locks == NULL) { + LIBCFS_FREE(pcl, sizeof(*pcl)); + return NULL; + } + + if (keys == NULL) { + CWARN("Cannot setup class key for percpt lock, you may see " + "recursive locking warnings which are actually fake.\n"); + } + + cfs_percpt_for_each(lock, i, pcl->pcl_locks) { + spin_lock_init(lock); + if (keys != NULL) + lockdep_set_class(lock, &keys[i]); + } + + return pcl; +} +EXPORT_SYMBOL(cfs_percpt_lock_create); + +/** + * lock a CPU partition + * + * \a index != CFS_PERCPT_LOCK_EX + * hold private lock indexed by \a index + * + * \a index == CFS_PERCPT_LOCK_EX + * exclusively lock @pcl and nobody can take private lock + */ +void +cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index) +__acquires(pcl->pcl_locks) +{ + int ncpt = cfs_cpt_number(pcl->pcl_cptab); + int i; + + LASSERT(index >= CFS_PERCPT_LOCK_EX && index < ncpt); + + if (ncpt == 1) { + index = 0; + } else { /* serialize with exclusive lock */ + while (pcl->pcl_locked) + cpu_relax(); + } + + if (likely(index != CFS_PERCPT_LOCK_EX)) { + spin_lock(pcl->pcl_locks[index]); + return; + } + + /* exclusive lock request */ + for (i = 0; i < ncpt; i++) { + spin_lock(pcl->pcl_locks[i]); + if (i == 0) { + LASSERT(!pcl->pcl_locked); + /* nobody should take private lock after this + * so I wouldn't starve for too long time */ + pcl->pcl_locked = 1; + } + } +} +EXPORT_SYMBOL(cfs_percpt_lock); + +/** unlock a CPU partition */ +void +cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index) 
+__releases(pcl->pcl_locks) +{ + int ncpt = cfs_cpt_number(pcl->pcl_cptab); + int i; + + index = ncpt == 1 ? 0 : index; + + if (likely(index != CFS_PERCPT_LOCK_EX)) { + spin_unlock(pcl->pcl_locks[index]); + return; + } + + for (i = ncpt - 1; i >= 0; i--) { + if (i == 0) { + LASSERT(pcl->pcl_locked); + pcl->pcl_locked = 0; + } + spin_unlock(pcl->pcl_locks[i]); + } +} +EXPORT_SYMBOL(cfs_percpt_unlock); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c new file mode 100644 index 0000000000000..d514b017b2eaa --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c @@ -0,0 +1,176 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include + +struct cfs_var_array { + unsigned int va_count; /* # of buffers */ + unsigned int va_size; /* size of each var */ + struct cfs_cpt_table *va_cptab; /* cpu partition table */ + void *va_ptrs[0]; /* buffer addresses */ +}; + +/* + * free per-cpu data, see more detail in cfs_percpt_free + */ +void +cfs_percpt_free(void *vars) +{ + struct cfs_var_array *arr; + int i; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + for (i = 0; i < arr->va_count; i++) { + if (arr->va_ptrs[i] != NULL) + LIBCFS_FREE(arr->va_ptrs[i], arr->va_size); + } + + LIBCFS_FREE(arr, offsetof(struct cfs_var_array, + va_ptrs[arr->va_count])); +} +EXPORT_SYMBOL(cfs_percpt_free); + +/* + * allocate per cpu-partition variables, returned value is an array of pointers, + * variable can be indexed by CPU partition ID, i.e: + * + * arr = cfs_percpt_alloc(cfs_cpu_pt, size); + * then caller can access memory block for CPU 0 by arr[0], + * memory block for CPU 1 by arr[1]... + * memory block for CPU N by arr[N]... + * + * cacheline aligned. 
+ */ +void * +cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size) +{ + struct cfs_var_array *arr; + int count; + int i; + + count = cfs_cpt_number(cptab); + + LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count])); + if (arr == NULL) + return NULL; + + arr->va_size = size = L1_CACHE_ALIGN(size); + arr->va_count = count; + arr->va_cptab = cptab; + + for (i = 0; i < count; i++) { + LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size); + if (arr->va_ptrs[i] == NULL) { + cfs_percpt_free((void *)&arr->va_ptrs[0]); + return NULL; + } + } + + return (void *)&arr->va_ptrs[0]; +} +EXPORT_SYMBOL(cfs_percpt_alloc); + +/* + * return number of CPUs (or number of elements in per-cpu data) + * according to cptab of @vars + */ +int +cfs_percpt_number(void *vars) +{ + struct cfs_var_array *arr; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + return arr->va_count; +} +EXPORT_SYMBOL(cfs_percpt_number); + + +/* + * This is opencoding of vfree_atomic from Linux kernel added in 4.10 with + * minimum changes needed to work on older kernels too. + */ + +#ifndef llist_for_each_safe +#define llist_for_each_safe(pos, n, node) \ + for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) +#endif + +struct vfree_deferred { + struct llist_head list; + struct work_struct wq; +}; +static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); + +static void free_work(struct work_struct *w) +{ + struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); + struct llist_node *t, *llnode; + + llist_for_each_safe(llnode, t, llist_del_all(&p->list)) + vfree((void *)llnode); +} + +void libcfs_vfree_atomic(const void *addr) +{ + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + + if (!addr) + return; + + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); +} +EXPORT_SYMBOL(libcfs_vfree_atomic); + +void __init init_libcfs_vfree_atomic(void) +{ + int i; + + for_each_possible_cpu(i) { + struct vfree_deferred *p; + + p = &per_cpu(vfree_deferred, i); + init_llist_head(&p->list); + INIT_WORK(&p->wq, free_work); + } +} + +void __exit exit_libcfs_vfree_atomic(void) +{ + flush_scheduled_work(); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c new file mode 100644 index 0000000000000..a3ff59c5970e6 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c @@ -0,0 +1,561 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * String manipulation functions. + * + * libcfs/libcfs/libcfs_string.c + * + * Author: Nathan Rutman + */ + +#include +#include +#include + +/* Convert a text string to a bitmask */ +int cfs_str2mask(const char *str, const char *(*bit2str)(int bit), + int *oldmask, int minmask, int allmask, int defmask) +{ + const char *debugstr; + char op = 0; + int newmask = minmask, i, len, found = 0; + + ENTRY; + /* must be a list of tokens separated by whitespace or comma, + * and optionally an operator ('+' or '-'). If an operator + * appears first in , '*oldmask' is used as the starting point + * (relative), otherwise minmask is used (absolute). An operator + * applies to all following tokens up to the next operator. + */ + while (*str != 0) { + while (isspace(*str) || *str == ',') + str++; + if (*str == 0) + break; + if (*str == '+' || *str == '-') { + op = *str++; + if (!found) + /* only if first token is relative */ + newmask = *oldmask; + while (isspace(*str)) + str++; + if (*str == 0) /* trailing op */ + return -EINVAL; + } + + /* find token length */ + for (len = 0; str[len] != 0 && !isspace(str[len]) && + str[len] != '+' && str[len] != '-' && str[len] != ','; + len++); + + /* match token */ + found = 0; + for (i = 0; i < 32; i++) { + debugstr = bit2str(i); + if (debugstr != NULL && + strlen(debugstr) == len && + strncasecmp(str, debugstr, len) == 0) { + if (op == '-') + newmask &= ~BIT(i); + else + newmask |= BIT(i); + found = 1; + break; + } + } + if (!found && len == 3 && + (strncasecmp(str, "ALL", len) == 0)) { + if (op == '-') + newmask = minmask; + else + newmask = allmask; + found = 1; + } + if (!found && strcasecmp(str, "DEFAULT") == 0) { + if (op == '-') + newmask = (newmask & ~defmask) | minmask; + else if (op == '+') + newmask |= defmask; + else + newmask = defmask; + found = 1; + } + if (!found) { + CWARN("unknown mask '%.*s'.\n" + "mask usage: [+|-] ...\n", len, str); + return -EINVAL; + } + str += len; + } + + *oldmask = newmask; + return 0; +} +EXPORT_SYMBOL(cfs_str2mask); + +/** + * Extracts tokens from strings. + * + * Looks for \a delim in string \a next, sets \a res to point to + * substring before the delimiter, sets \a next right after the found + * delimiter. + * + * \retval 1 if \a res points to a string of non-whitespace characters + * \retval 0 otherwise + */ +int +cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res) +{ + char *end; + + if (next->ls_str == NULL) + return 0; + + /* skip leading white spaces */ + while (next->ls_len) { + if (!isspace(*next->ls_str)) + break; + next->ls_str++; + next->ls_len--; + } + + if (next->ls_len == 0) /* whitespaces only */ + return 0; + + if (*next->ls_str == delim) { + /* first non-writespace is the delimiter */ + return 0; + } + + res->ls_str = next->ls_str; + end = memchr(next->ls_str, delim, next->ls_len); + if (end == NULL) { + /* there is no the delimeter in the string */ + end = next->ls_str + next->ls_len; + next->ls_str = NULL; + next->ls_len = 0; + } else { + next->ls_str = end + 1; + next->ls_len -= (end - res->ls_str + 1); + } + + /* skip ending whitespaces */ + while (--end != res->ls_str) { + if (!isspace(*end)) + break; + } + + res->ls_len = end - res->ls_str + 1; + return 1; +} +EXPORT_SYMBOL(cfs_gettok); + +/** + * Converts string to integer. + * + * Accepts decimal and hexadecimal number recordings. 
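+ *
+ * Editorial usage sketch (illustrative only, not part of the original
+ * source): only the first \a nob characters are examined, and \a str
+ * must be writable because a NUL byte is temporarily patched in:
+ *
+ *        char buf[] = "123-456";
+ *        unsigned int num;
+ *
+ *        cfs_str2num_check(buf, 3, &num, 0, 999);
+ *            returns 1 and sets num to 123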
+ * + * \retval 1 if first \a nob chars of \a str convert to decimal or + * hexadecimal integer in the range [\a min, \a max] + * \retval 0 otherwise + */ +int +cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max) +{ + bool all_numbers = true; + char *endp, cache; + int len; + int rc; + + endp = strim(str); + /** + * kstrouint can only handle strings composed + * of only numbers. We need to scan the string + * passed in for the first non-digit character + * and end the string at that location. If we + * don't find any non-digit character we still + * need to place a '\0' at position len since + * we are not interested in the rest of the + * string which is longer than len in size. + * After we are done the character at the + * position we placed '\0' must be restored. + */ + len = min((int)strlen(endp), nob); + for (; endp < str + len; endp++) { + if (!isxdigit(*endp) && *endp != '-' && + *endp != '+') { + all_numbers = false; + break; + } + } + + /* Eat trailing space */ + if (!all_numbers && isspace(*endp)) { + all_numbers = true; + endp--; + } + + cache = *endp; + *endp = '\0'; + + rc = kstrtouint(str, 0, num); + *endp = cache; + if (rc || !all_numbers) + return 0; + + return (*num >= min && *num <= max); +} +EXPORT_SYMBOL(cfs_str2num_check); + +/** + * Parses \ token of the syntax. If \a bracketed is false, + * \a src should only have a single token which can be \ or \* + * + * \retval pointer to allocated range_expr and initialized + * range_expr::re_lo, range_expr::re_hi and range_expr:re_stride if \a + `* src parses to + * \ | + * \ '-' \ | + * \ '-' \ '/' \ + * \retval 0 will be returned if it can be parsed, otherwise -EINVAL or + * -ENOMEM will be returned. + */ +static int +cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max, + int bracketed, struct cfs_range_expr **expr) +{ + struct cfs_range_expr *re; + struct cfs_lstr tok; + + LIBCFS_ALLOC(re, sizeof(*re)); + if (re == NULL) + return -ENOMEM; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + re->re_lo = min; + re->re_hi = max; + re->re_stride = 1; + goto out; + } + + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_lo, min, max)) { + /* is parsed */ + re->re_hi = re->re_lo; + re->re_stride = 1; + goto out; + } + + if (!bracketed || !cfs_gettok(src, '-', &tok)) + goto failed; + + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_lo, min, max)) + goto failed; + + /* - */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_hi, min, max)) { + /* - is parsed */ + re->re_stride = 1; + goto out; + } + + /* go to check '-' '/' */ + if (cfs_gettok(src, '/', &tok)) { + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_hi, min, max)) + goto failed; + + /* - / ... */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_stride, min, max)) { + /* - / is parsed */ + goto out; + } + } + +out: + *expr = re; + return 0; + +failed: + LIBCFS_FREE(re, sizeof(*re)); + return -EINVAL; +} + +/** + * Print the range expression \a re into specified \a buffer. + * If \a bracketed is true, expression does not need additional + * brackets. 
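+ *
+ * Editorial note (illustrative, based on the format strings below): a
+ * single value prints as "5", a plain range as "[0-7]" and a strided
+ * range as "[0-14/2]"; when \a bracketed is true the surrounding "["
+ * and "]" are omitted because the caller emits its own brackets, e.g.:
+ *
+ *        cfs_range_expr_print(buf, sizeof(buf), expr, false);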
+ * + * \retval number of characters written + */ +static int +cfs_range_expr_print(char *buffer, int count, struct cfs_range_expr *expr, + bool bracketed) +{ + int i; + char s[] = "["; + char e[] = "]"; + + if (bracketed) + s[0] = e[0] = '\0'; + + if (expr->re_lo == expr->re_hi) + i = scnprintf(buffer, count, "%u", expr->re_lo); + else if (expr->re_stride == 1) + i = scnprintf(buffer, count, "%s%u-%u%s", + s, expr->re_lo, expr->re_hi, e); + else + i = scnprintf(buffer, count, "%s%u-%u/%u%s", + s, expr->re_lo, expr->re_hi, + expr->re_stride, e); + return i; +} + +/** + * Print a list of range expressions (\a expr_list) into specified \a buffer. + * If the list contains several expressions, separate them with comma + * and surround the list with brackets. + * + * \retval number of characters written + */ +int +cfs_expr_list_print(char *buffer, int count, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + int i = 0, j = 0; + int numexprs = 0; + + if (count <= 0) + return 0; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) + numexprs++; + + if (numexprs > 1) + i += scnprintf(buffer + i, count - i, "["); + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (j++ != 0) + i += scnprintf(buffer + i, count - i, ","); + i += cfs_range_expr_print(buffer + i, count - i, expr, + numexprs > 1); + } + + if (numexprs > 1) + i += scnprintf(buffer + i, count - i, "]"); + + return i; +} +EXPORT_SYMBOL(cfs_expr_list_print); + +/** + * Matches value (\a value) against ranges expression list \a expr_list. + * + * \retval 1 if \a value matches + * \retval 0 otherwise + */ +int +cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (value >= expr->re_lo && value <= expr->re_hi && + ((value - expr->re_lo) % expr->re_stride) == 0) + return 1; + } + + return 0; +} +EXPORT_SYMBOL(cfs_expr_list_match); + +/** + * Convert express list (\a expr_list) to an array of all matched values + * + * \retval N N is total number of all matched values + * \retval 0 if expression list is empty + * \retval < 0 for failure + */ +int +cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp) +{ + struct cfs_range_expr *expr; + __u32 *val; + int count = 0; + int i; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + count++; + } + } + + if (count == 0) /* empty expression list */ + return 0; + + if (count > max) { + CERROR("Number of values %d exceeds max allowed %d\n", + max, count); + return -EINVAL; + } + + CFS_ALLOC_PTR_ARRAY(val, count); + if (val == NULL) + return -ENOMEM; + + count = 0; + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + val[count++] = i; + } + } + + *valpp = val; + return count; +} +EXPORT_SYMBOL(cfs_expr_list_values); + +void +cfs_expr_list_values_free(__u32 *values, int num) +{ + /* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed + * by OBD_FREE() if it's called by module other than libcfs & LNet, + * otherwise we will see fake memory leak */ + CFS_FREE_PTR_ARRAY(values, num); +} +EXPORT_SYMBOL(cfs_expr_list_values_free); + +/** + * Frees cfs_range_expr structures of \a expr_list. 
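+ *
+ * Editorial sketch of the usual expression-list lifecycle (illustrative
+ * only; do_something() is a placeholder and error handling is omitted):
+ *
+ *        struct cfs_expr_list *el;
+ *        char range[] = "[0-7/2]";
+ *
+ *        if (cfs_expr_list_parse(range, strlen(range), 0, 255, &el) == 0) {
+ *                if (cfs_expr_list_match(4, el))
+ *                        do_something();
+ *                cfs_expr_list_free(el);
+ *        }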
+ * + * \retval none + */ +void +cfs_expr_list_free(struct cfs_expr_list *expr_list) +{ + while (!list_empty(&expr_list->el_exprs)) { + struct cfs_range_expr *expr; + + expr = list_entry(expr_list->el_exprs.next, + struct cfs_range_expr, re_link); + list_del(&expr->re_link); + LIBCFS_FREE(expr, sizeof(*expr)); + } + + LIBCFS_FREE(expr_list, sizeof(*expr_list)); +} +EXPORT_SYMBOL(cfs_expr_list_free); + +/** + * Parses \ token of the syntax. + * + * \retval 0 if \a str parses to \ | \ + * \retval -errno otherwise + */ +int +cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *expr; + struct cfs_lstr src; + int rc; + + LIBCFS_ALLOC(expr_list, sizeof(*expr_list)); + if (expr_list == NULL) + return -ENOMEM; + + src.ls_str = str; + src.ls_len = len; + + INIT_LIST_HEAD(&expr_list->el_exprs); + + if (src.ls_str[0] == '[' && + src.ls_str[src.ls_len - 1] == ']') { + src.ls_str++; + src.ls_len -= 2; + + rc = -EINVAL; + while (src.ls_str != NULL) { + struct cfs_lstr tok; + + if (!cfs_gettok(&src, ',', &tok)) { + rc = -EINVAL; + break; + } + + rc = cfs_range_expr_parse(&tok, min, max, 1, &expr); + if (rc != 0) + break; + + list_add_tail(&expr->re_link, &expr_list->el_exprs); + } + } else { + rc = cfs_range_expr_parse(&src, min, max, 0, &expr); + if (rc == 0) + list_add_tail(&expr->re_link, &expr_list->el_exprs); + } + + if (rc != 0) + cfs_expr_list_free(expr_list); + else + *elpp = expr_list; + + return rc; +} +EXPORT_SYMBOL(cfs_expr_list_parse); + +/** + * Frees cfs_expr_list structures of \a list. + * + * For each struct cfs_expr_list structure found on \a list it frees + * range_expr list attached to it and frees the cfs_expr_list itself. + * + * \retval none + */ +void +cfs_expr_list_free_list(struct list_head *list) +{ + struct cfs_expr_list *el; + + while (!list_empty(list)) { + el = list_entry(list->next, + struct cfs_expr_list, el_link); + list_del(&el->el_link); + cfs_expr_list_free(el); + } +} +EXPORT_SYMBOL(cfs_expr_list_free_list); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto-adler.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto-adler.c new file mode 100644 index 0000000000000..6f19bcad2dc33 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto-adler.c @@ -0,0 +1,137 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/* + * This is crypto api shash wrappers to zlib_adler32. 
+ */ + +#include +#include +#include +#include "linux-crypto.h" + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +static int adler32_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 1; + + return 0; +} + +static int adler32_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) + return -EINVAL; + + *mctx = *(u32 *)key; + return 0; +} + +static int adler32_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *cksump = shash_desc_ctx(desc); + + *cksump = *mctx; + + return 0; +} + +static int adler32_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *cksump = shash_desc_ctx(desc); + + *cksump = zlib_adler32(*cksump, data, len); + return 0; +} +static int __adler32_finup(u32 *cksump, const u8 *data, unsigned int len, + u8 *out) +{ + *(u32 *)out = zlib_adler32(*cksump, data, len); + return 0; +} + +static int adler32_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __adler32_finup(shash_desc_ctx(desc), data, len, out); +} + +static int adler32_final(struct shash_desc *desc, u8 *out) +{ + u32 *cksump = shash_desc_ctx(desc); + + *(u32 *)out = *cksump; + return 0; +} + +static int adler32_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __adler32_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} +static struct shash_alg alg = { + .setkey = adler32_setkey, + .init = adler32_init, + .update = adler32_update, + .final = adler32_final, + .finup = adler32_finup, + .digest = adler32_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "adler32", + .cra_driver_name = "adler32-zlib", + .cra_priority = 100, +#ifdef CRYPTO_ALG_OPTIONAL_KEY + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, +#endif + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = NULL, + .cra_init = adler32_cra_init, + } +}; + +int cfs_crypto_adler32_register(void) +{ + return crypto_register_shash(&alg); +} + +void cfs_crypto_adler32_unregister(void) +{ + crypto_unregister_shash(&alg); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto.c new file mode 100644 index 0000000000000..e210b8076445e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto.c @@ -0,0 +1,487 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Copyright (c) 2012, 2014, Intel Corporation. 
+ */ + +#include +#include +#include +#include +#include +#include "linux-crypto.h" + +#ifndef HAVE_CRYPTO_HASH_HELPERS +static inline const char *crypto_ahash_alg_name(struct crypto_ahash *tfm) +{ + return crypto_tfm_alg_name(crypto_ahash_tfm(tfm)); +} + +static inline const char *crypto_ahash_driver_name(struct crypto_ahash *tfm) +{ + return crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm)); +} +#endif + +/** + * Array of hash algorithm speed in MByte per second + */ +int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX]; +EXPORT_SYMBOL(cfs_crypto_hash_speeds); + +/** + * Initialize the state descriptor for the specified hash algorithm. + * + * An internal routine to allocate the hash-specific state in \a hdesc for + * use with cfs_crypto_hash_digest() to compute the hash of a single message, + * though possibly in multiple chunks. The descriptor internal state should + * be freed with cfs_crypto_hash_final(). + * + * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) + * \param[out] type pointer to the hash description in hash_types[] array + * \param[in,out] req ahash request to be initialized + * \param[in] key initial hash value/state, NULL to use default value + * \param[in] key_len length of \a key + * + * \retval 0 on success + * \retval negative errno on failure + */ +static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, + const struct cfs_crypto_hash_type **type, + struct ahash_request **req, + unsigned char *key, + unsigned int key_len) +{ + struct crypto_ahash *tfm; + int err = 0; + + *type = cfs_crypto_hash_type(hash_alg); + if (!*type) { + CWARN("Unsupported hash algorithm id = %d, max id is %d\n", + hash_alg, CFS_HASH_ALG_MAX); + return -EINVAL; + } + + /* Keys are only supported for the hmac version */ + if (key && key_len > 0) { + char *algo_name; + + algo_name = kasprintf(GFP_KERNEL, "hmac(%s)", + (*type)->cht_name); + if (!algo_name) + return -ENOMEM; + + tfm = crypto_alloc_ahash(algo_name, 0, CRYPTO_ALG_ASYNC); + kfree(algo_name); + } else { + tfm = crypto_alloc_ahash((*type)->cht_name, 0, + CRYPTO_ALG_ASYNC); + } + if (IS_ERR(tfm)) { + CDEBUG_LIMIT(PTR_ERR(tfm) == -ENOMEM ? D_ERROR : D_INFO, + "Failed to alloc crypto hash %s: rc = %d\n", + (*type)->cht_name, (int)PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + + *req = ahash_request_alloc(tfm, GFP_KERNEL); + if (!*req) { + CDEBUG(D_INFO, "Failed to alloc ahash_request for %s\n", + (*type)->cht_name); + GOTO(out_free_tfm, err = -ENOMEM); + } + + ahash_request_set_callback(*req, 0, NULL, NULL); + + if (key) + err = crypto_ahash_setkey(tfm, key, key_len); + else if ((*type)->cht_key != 0) + err = crypto_ahash_setkey(tfm, + (unsigned char *)&((*type)->cht_key), + (*type)->cht_size); + if (err) + GOTO(out_free_req, err); + + CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n", + crypto_ahash_alg_name(tfm), crypto_ahash_driver_name(tfm), + cfs_crypto_hash_speeds[hash_alg]); + + err = crypto_ahash_init(*req); + if (err) { +out_free_req: + ahash_request_free(*req); +out_free_tfm: + crypto_free_ahash(tfm); + } + return err; +} + +/** + * Calculate hash digest for the passed buffer. + * + * This should be used when computing the hash on a single contiguous buffer. + * It combines the hash initialization, computation, and cleanup. 
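+ *
+ * Editorial usage sketch (illustrative only; data/data_len are
+ * placeholders, and the CRC32C algorithm id is assumed to be available,
+ * matching the "crc32c" module requested at registration time):
+ *
+ *        unsigned char digest[CFS_CRYPTO_HASH_DIGESTSIZE_MAX];
+ *        unsigned int digest_len = sizeof(digest);
+ *        int rc;
+ *
+ *        rc = cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32C, data, data_len,
+ *                                    NULL, 0, digest, &digest_len);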
+ * + * \param[in] hash_alg id of hash algorithm (CFS_HASH_ALG_*) + * \param[in] buf data buffer on which to compute hash + * \param[in] buf_len length of \a buf in bytes + * \param[in] key initial value/state for algorithm, if \a key = NULL + * use default initial value + * \param[in] key_len length of \a key in bytes + * \param[out] hash pointer to computed hash value, if \a hash = NULL then + * \a hash_len is to digest size in bytes, retval -ENOSPC + * \param[in,out] hash_len size of \a hash buffer + * + * \retval -EINVAL \a buf, \a buf_len, \a hash_len, \a hash_alg invalid + * \retval -ENOENT \a hash_alg is unsupported + * \retval -ENOSPC \a hash is NULL, or \a hash_len less than digest size + * \retval 0 for success + * \retval negative errno for other errors from lower layers. + */ +int cfs_crypto_hash_digest(enum cfs_crypto_hash_alg hash_alg, + const void *buf, unsigned int buf_len, + unsigned char *key, unsigned int key_len, + unsigned char *hash, unsigned int *hash_len) +{ + struct scatterlist sl; + struct ahash_request *req; + int err; + const struct cfs_crypto_hash_type *type; + + if (!buf || buf_len == 0 || !hash_len) + return -EINVAL; + + err = cfs_crypto_hash_alloc(hash_alg, &type, &req, key, key_len); + if (err != 0) + return err; + + if (!hash || *hash_len < type->cht_size) { + *hash_len = type->cht_size; + crypto_free_ahash(crypto_ahash_reqtfm(req)); + ahash_request_free(req); + return -ENOSPC; + } + sg_init_one(&sl, (void *)buf, buf_len); + + ahash_request_set_crypt(req, &sl, hash, sl.length); + err = crypto_ahash_digest(req); + crypto_free_ahash(crypto_ahash_reqtfm(req)); + ahash_request_free(req); + + return err; +} +EXPORT_SYMBOL(cfs_crypto_hash_digest); + +/** + * Allocate and initialize desriptor for hash algorithm. + * + * This should be used to initialize a hash descriptor for multiple calls + * to a single hash function when computing the hash across multiple + * separate buffers or pages using cfs_crypto_hash_update{,_page}(). + * + * The hash descriptor should be freed with cfs_crypto_hash_final(). 
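+ *
+ * Editorial usage sketch for hashing data spread over several buffers
+ * (illustrative only; buf1/buf2 and their lengths are placeholders, and
+ * the CRC32C algorithm id is an assumed example):
+ *
+ *        struct ahash_request *req;
+ *        unsigned char hash[CFS_CRYPTO_HASH_DIGESTSIZE_MAX];
+ *        unsigned int hash_len = sizeof(hash);
+ *
+ *        req = cfs_crypto_hash_init(CFS_HASH_ALG_CRC32C, NULL, 0);
+ *        if (!IS_ERR(req)) {
+ *                cfs_crypto_hash_update(req, buf1, len1);
+ *                cfs_crypto_hash_update(req, buf2, len2);
+ *                cfs_crypto_hash_final(req, hash, &hash_len);
+ *        }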
+ * + * \param[in] hash_alg algorithm id (CFS_HASH_ALG_*) + * \param[in] key initial value/state for algorithm, if \a key = NULL + * use default initial value + * \param[in] key_len length of \a key in bytes + * + * \retval pointer to ahash request + * \retval ERR_PTR(errno) in case of error + */ +struct ahash_request * + cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, + unsigned char *key, unsigned int key_len) +{ + struct ahash_request *req; + int err; + const struct cfs_crypto_hash_type *type; + + err = cfs_crypto_hash_alloc(hash_alg, &type, &req, key, key_len); + if (err) + return ERR_PTR(err); + return req; +} +EXPORT_SYMBOL(cfs_crypto_hash_init); + +/** + * Update hash digest computed on data within the given \a page + * + * \param[in] req ahash request + * \param[in] page data page on which to compute the hash + * \param[in] offset offset within \a page at which to start hash + * \param[in] len length of data on which to compute hash + * + * \retval 0 for success + * \retval negative errno on failure + */ +int cfs_crypto_hash_update_page(struct ahash_request *req, + struct page *page, unsigned int offset, + unsigned int len) +{ + struct scatterlist sl; + + sg_init_table(&sl, 1); + sg_set_page(&sl, page, len, offset & ~PAGE_MASK); + + ahash_request_set_crypt(req, &sl, NULL, sl.length); + return crypto_ahash_update(req); +} +EXPORT_SYMBOL(cfs_crypto_hash_update_page); + +/** + * Update hash digest computed on the specified data + * + * \param[in] req ahash request + * \param[in] buf data buffer on which to compute the hash + * \param[in] buf_len length of \buf on which to compute hash + * + * \retval 0 for success + * \retval negative errno on failure + */ +int cfs_crypto_hash_update(struct ahash_request *req, + const void *buf, unsigned int buf_len) +{ + struct scatterlist sl; + + sg_init_one(&sl, (void *)buf, buf_len); + + ahash_request_set_crypt(req, &sl, NULL, sl.length); + return crypto_ahash_update(req); +} +EXPORT_SYMBOL(cfs_crypto_hash_update); + +/** + * Finish hash calculation, copy hash digest to buffer, clean up hash descriptor + * + * \param[in] req ahash request + * \param[out] hash pointer to hash buffer to store hash digest + * \param[in,out] hash_len pointer to hash buffer size, if \a hash == NULL + * or hash_len == NULL only free \a hdesc instead + * of computing the hash + * + * \retval 0 for success + * \retval -EOVERFLOW if hash_len is too small for the hash digest + * \retval negative errno for other errors from lower layers + */ +int cfs_crypto_hash_final(struct ahash_request *req, + unsigned char *hash, unsigned int *hash_len) +{ + int size = crypto_ahash_digestsize(crypto_ahash_reqtfm(req)); + int err; + + if (!hash || !hash_len) { + err = 0; + goto free; + } + if (*hash_len < size) { + err = -EOVERFLOW; + goto free; + } + + ahash_request_set_crypt(req, NULL, hash, 0); + err = crypto_ahash_final(req); + if (err == 0) + *hash_len = size; +free: + crypto_free_ahash(crypto_ahash_reqtfm(req)); + ahash_request_free(req); + + return err; +} +EXPORT_SYMBOL(cfs_crypto_hash_final); + +/** + * Compute the speed of specified hash function + * + * Run a speed test on the given hash algorithm on buffer using a 1MB buffer + * size. This is a reasonable buffer size for Lustre RPCs, even if the actual + * RPC size is larger or smaller. + * + * The speed is stored internally in the cfs_crypto_hash_speeds[] array, and + * is available through the cfs_crypto_hash_speed() function. 
+ * + * This function needs to stay the same as obd_t10_performance_test() so that + * the speeds are comparable. + * + * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) + * \param[in] buf data buffer on which to compute the hash + * \param[in] buf_len length of \buf on which to compute hash + */ +static void cfs_crypto_performance_test(enum cfs_crypto_hash_alg hash_alg) +{ + int buf_len = max(PAGE_SIZE, 1048576UL); + void *buf; + unsigned long start, end; + int err = 0; + unsigned long bcount; + struct page *page; + unsigned char hash[CFS_CRYPTO_HASH_DIGESTSIZE_MAX]; + unsigned int hash_len = sizeof(hash); + + page = alloc_page(GFP_KERNEL); + if (page == NULL) { + err = -ENOMEM; + goto out_err; + } + + buf = kmap(page); + memset(buf, 0xAD, PAGE_SIZE); + kunmap(page); + + for (start = jiffies, end = start + cfs_time_seconds(1) / 4, + bcount = 0; time_before(jiffies, end) && err == 0; bcount++) { + struct ahash_request *req; + int i; + + req = cfs_crypto_hash_init(hash_alg, NULL, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + break; + } + + for (i = 0; i < buf_len / PAGE_SIZE; i++) { + err = cfs_crypto_hash_update_page(req, page, 0, + PAGE_SIZE); + if (err != 0) + break; + } + + err = cfs_crypto_hash_final(req, hash, &hash_len); + if (err != 0) + break; + } + end = jiffies; + __free_page(page); +out_err: + if (err != 0) { + cfs_crypto_hash_speeds[hash_alg] = err; + CDEBUG(D_INFO, "Crypto hash algorithm %s test error: rc = %d\n", + cfs_crypto_hash_name(hash_alg), err); + } else { + unsigned long tmp; + + tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) * + 1000) / (1024 * 1024); + cfs_crypto_hash_speeds[hash_alg] = (int)tmp; + CDEBUG(D_CONFIG, "Crypto hash algorithm %s speed = %d MB/s\n", + cfs_crypto_hash_name(hash_alg), + cfs_crypto_hash_speeds[hash_alg]); + } +} + +/** + * hash speed in Mbytes per second for valid hash algorithm + * + * Return the performance of the specified \a hash_alg that was + * computed using cfs_crypto_performance_test(). If the performance + * has not yet been computed, do that when it is first requested. + * That avoids computing the speed when it is not actually needed. + * To avoid competing threads computing the checksum speed at the + * same time, only compute a single checksum speed at one time. + * + * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) + * + * \retval positive speed of the hash function in MB/s + * \retval -ENOENT if \a hash_alg is unsupported + * \retval negative errno if \a hash_alg speed is unavailable + */ +int cfs_crypto_hash_speed(enum cfs_crypto_hash_alg hash_alg) +{ + if (hash_alg < CFS_HASH_ALG_MAX) { + if (unlikely(cfs_crypto_hash_speeds[hash_alg] == 0)) { + static DEFINE_MUTEX(crypto_hash_speed_mutex); + + mutex_lock(&crypto_hash_speed_mutex); + if (cfs_crypto_hash_speeds[hash_alg] == 0) + cfs_crypto_performance_test(hash_alg); + mutex_unlock(&crypto_hash_speed_mutex); + } + return cfs_crypto_hash_speeds[hash_alg]; + } + + return -ENOENT; +} +EXPORT_SYMBOL(cfs_crypto_hash_speed); + +/** + * Run the performance test for all hash algorithms. + * + * Run the cfs_crypto_performance_test() benchmark for some of the available + * hash functions at module load time. This can't be reliably done at runtime + * since the CPUs may be under load from thousands of connecting clients when + * the first client connects and the checksum speeds are needed. 
+ * + * Since the setup cost and computation speed of various hash algorithms is + * a function of the buffer size (and possibly internal contention of offload + * engines), this speed only represents an estimate of the actual speed under + * actual usage, but is reasonable for comparing available algorithms. + * + * The actual speeds are available via cfs_crypto_hash_speed() for later + * comparison. + * + * \retval 0 on success + * \retval -ENOMEM if no memory is available for test buffer + */ +static int cfs_crypto_test_hashes(void) +{ + enum cfs_crypto_hash_alg hash_alg; + + for (hash_alg = 1; hash_alg < CFS_HASH_ALG_SPEED_MAX; hash_alg++) + cfs_crypto_performance_test(hash_alg); + + return 0; +} + +static int adler32; + +/** + * Register available hash functions + * + * \retval 0 + */ +int cfs_crypto_register(void) +{ + request_module("crc32c"); + + if (cfs_crypto_adler32_register() == 0) + adler32 = 1; + + /* check all algorithms and do performance test */ + cfs_crypto_test_hashes(); + + return 0; +} + +/** + * Unregister previously registered hash functions + */ +void cfs_crypto_unregister(void) +{ + if (adler32) + cfs_crypto_adler32_unregister(); + adler32 = 0; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto.h b/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto.h new file mode 100644 index 0000000000000..05610dbf3362e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto.h @@ -0,0 +1,37 @@ + /* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/** + * Linux crypto hash specific functions. + */ + +/** + * Functions for start/stop shash adler32 algorithm. + */ +int cfs_crypto_adler32_register(void); +void cfs_crypto_adler32_unregister(void); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/glob.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/glob.c new file mode 100644 index 0000000000000..90192466e6614 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/glob.c @@ -0,0 +1,117 @@ +#ifndef HAVE_GLOB +#include +#include "libcfs/linux/glob.h" + +/** + * glob_match - Shell-style pattern matching, like !fnmatch(pat, str, 0) + * @pat: Shell-style pattern to match, e.g. "*.[ch]". + * @str: String to match. The pattern must match the entire string. + * + * Perform shell-style glob matching, returning true (1) if the match + * succeeds, or false (0) if it fails. Equivalent to !fnmatch(@pat, @str, 0). + * + * Pattern metacharacters are ?, *, [ and \. + * (And, inside character classes, !, - and ].) 
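+ *
+ * Editorial examples (illustrative only):
+ *
+ *        glob_match("*.[ch]", "glob.c")  -> true
+ *        glob_match("a?c", "abc")        -> true
+ *        glob_match("a*b", "acd")        -> false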
+ * + * This is small and simple implementation intended for device blacklists + * where a string is matched against a number of patterns. Thus, it + * does not preprocess the patterns. It is non-recursive, and run-time + * is at most quadratic: strlen(@str)*strlen(@pat). + * + * An example of the worst case is glob_match("*aaaaa", "aaaaaaaaaa"); + * it takes 6 passes over the pattern before matching the string. + * + * Like !fnmatch(@pat, @str, 0) and unlike the shell, this does NOT + * treat / or leading . specially; it isn't actually used for pathnames. + * + * Note that according to glob(7) (and unlike bash), character classes + * are complemented by a leading !; this does not support the regex-style + * [^a-z] syntax. + * + * An opening bracket without a matching close is matched literally. + */ +bool __pure glob_match(char const *pat, char const *str) +{ + /* + * Backtrack to previous * on mismatch and retry starting one + * character later in the string. Because * matches all characters + * (no exception for /), it can be easily proved that there's + * never a need to backtrack multiple levels. + */ + char const *back_pat = NULL, *back_str = back_str; + + /* + * Loop over each token (character or class) in pat, matching + * it against the remaining unmatched tail of str. Return false + * on mismatch, or true after matching the trailing nul bytes. + */ + for (;;) { + unsigned char c = *str++; + unsigned char d = *pat++; + + switch (d) { + case '?': /* Wildcard: anything but nul */ + if (c == '\0') + return false; + break; + case '*': /* Any-length wildcard */ + if (*pat == '\0') /* Optimize trailing * case */ + return true; + back_pat = pat; + back_str = --str; /* Allow zero-length match */ + break; + case '[': { /* Character class */ + bool match = false, inverted = (*pat == '!'); + char const *class = pat + inverted; + unsigned char a = *class++; + + /* + * Iterate over each span in the character class. + * A span is either a single character a, or a + * range a-b. The first span may begin with ']'. + */ + do { + unsigned char b = a; + + if (a == '\0') /* Malformed */ + goto literal; + + if (class[0] == '-' && class[1] != ']') { + b = class[1]; + + if (b == '\0') + goto literal; + + class += 2; + /* Any special action if a > b? */ + } + match |= (a <= c && c <= b); + } while ((a = *class++) != ']'); + + if (match == inverted) + goto backtrack; + pat = class; + } + break; + case '\\': + d = *pat++; + /*FALLTHROUGH*/ + default: /* Literal character */ +literal: + if (c == d) { + if (d == '\0') + return true; + break; + } +backtrack: + if (c == '\0' || !back_pat) + return false; /* No point continuing */ + /* Try again from last *, one character later in str. */ + pat = back_pat; + str = ++back_str; + break; + } + } +} +EXPORT_SYMBOL(glob_match); +#endif /* ! HAVE_GLOB */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c new file mode 100644 index 0000000000000..e4e67c20cee5d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#ifdef HAVE_STRINGHASH +#include +#else +#include +#endif +#include + +#include + +/* Return the "hash_len" (hash and length) of a null-terminated string */ +/* The kernel equivalent is in fs/namei.c but for some strange reason + * RHEL7.5 stuck it in dax/super.c instead. This placement never existed + * upstream so to make life easier we just have the equavilent + */ +u64 cfs_hashlen_string(const void *salt, const char *name) +{ +#ifdef HAVE_FULL_NAME_HASH_3ARGS + unsigned long hash = init_name_hash(salt); +#else + unsigned long hash = init_name_hash(); +#endif + unsigned long len = 0, c; + + c = (unsigned char)*name; + while (c) { + len++; + hash = partial_name_hash(c, hash); + c = (unsigned char)name[len]; + } + return hashlen_create(end_name_hash(hash), len); +} +EXPORT_SYMBOL(cfs_hashlen_string); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c new file mode 100644 index 0000000000000..5f2f6aefb77bb --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c @@ -0,0 +1,275 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#ifdef HAVE_SCHED_HEADERS +#include +#endif +#include +#include +#include + +#if defined(CONFIG_KGDB) +#include +#endif + +#include +#include +#include + +#ifndef HAVE_KTIME_GET_TS64 +void ktime_get_ts64(struct timespec64 *ts) +{ + struct timespec now; + + ktime_get_ts(&now); + *ts = timespec_to_timespec64(now); +} +EXPORT_SYMBOL(ktime_get_ts64); +#endif /* HAVE_KTIME_GET_TS64 */ + +#ifndef HAVE_KTIME_GET_REAL_TS64 +void ktime_get_real_ts64(struct timespec64 *ts) +{ + struct timespec now; + + getnstimeofday(&now); + *ts = timespec_to_timespec64(now); +} +EXPORT_SYMBOL(ktime_get_real_ts64); +#endif /* HAVE_KTIME_GET_REAL_TS64 */ + +#ifndef HAVE_KTIME_GET_REAL_SECONDS +/* + * Get the seconds portion of CLOCK_REALTIME (wall clock). + * This is the clock that can be altered by NTP and is + * independent of a reboot. + */ +time64_t ktime_get_real_seconds(void) +{ + return (time64_t)get_seconds(); +} +EXPORT_SYMBOL(ktime_get_real_seconds); +#endif /* HAVE_KTIME_GET_REAL_SECONDS */ + +#ifndef HAVE_KTIME_GET_SECONDS +/* + * Get the seconds portion of CLOCK_MONOTONIC + * This clock is immutable and is reset across + * reboots. For older platforms this is a + * wrapper around get_seconds which is valid + * until 2038. By that time this will be gone + * one would hope. + */ +time64_t ktime_get_seconds(void) +{ + struct timespec64 now; + + ktime_get_ts64(&now); + return now.tv_sec; +} +EXPORT_SYMBOL(ktime_get_seconds); +#endif /* HAVE_KTIME_GET_SECONDS */ + +static int (*cfs_apply_workqueue_attrs_t)(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs); + +int cfs_apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + if (cfs_apply_workqueue_attrs_t) + return cfs_apply_workqueue_attrs_t(wq, attrs); + return 0; +} +EXPORT_SYMBOL_GPL(cfs_apply_workqueue_attrs); + +#ifndef HAVE_XARRAY_SUPPORT +struct kmem_cache (*radix_tree_node_cachep); +#endif + +void __init cfs_arch_init(void) +{ +#ifndef HAVE_WAIT_VAR_EVENT + wait_bit_init(); +#endif + cfs_apply_workqueue_attrs_t = + (void *)cfs_kallsyms_lookup_name("apply_workqueue_attrs"); +#ifndef HAVE_XARRAY_SUPPORT + radix_tree_node_cachep = + (void *)cfs_kallsyms_lookup_name("radix_tree_node_cachep"); +#endif +} + +int cfs_kernel_write(struct file *filp, const void *buf, size_t count, + loff_t *pos) +{ +#ifdef HAVE_NEW_KERNEL_WRITE + return kernel_write(filp, buf, count, pos); +#else + mm_segment_t __old_fs = get_fs(); + int rc; + + set_fs(KERNEL_DS); + rc = vfs_write(filp, (__force const char __user *)buf, count, pos); + set_fs(__old_fs); + + return rc; +#endif +} +EXPORT_SYMBOL(cfs_kernel_write); + +ssize_t cfs_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) +{ +#ifdef HAVE_KERNEL_READ_LAST_POSP + return kernel_read(file, buf, count, pos); +#else + ssize_t size = kernel_read(file, *pos, buf, count); + + if (size > 0) + *pos += size; + return size; +#endif +} +EXPORT_SYMBOL(cfs_kernel_read); + +#ifndef HAVE_KSET_FIND_OBJ +struct kobject *kset_find_obj(struct kset *kset, const char *name) +{ + struct kobject *ret = NULL; + struct kobject *k; + + spin_lock(&kset->list_lock); + + list_for_each_entry(k, &kset->list, entry) { + if (kobject_name(k) && !strcmp(kobject_name(k), name)) { + if (kref_get_unless_zero(&k->kref)) + ret = k; + break; + } + } + + spin_unlock(&kset->list_lock); + return ret; +} +EXPORT_SYMBOL_GPL(kset_find_obj); +#endif + 
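+/*
+ * Editorial usage sketch for the cfs_kernel_read()/cfs_kernel_write()
+ * compatibility wrappers above (illustrative only; filp and buf are
+ * placeholders):
+ *
+ *        loff_t pos = 0;
+ *        ssize_t rc;
+ *
+ *        rc = cfs_kernel_read(filp, buf, sizeof(buf), &pos);
+ *        if (rc > 0)
+ *                rc = cfs_kernel_write(filp, buf, rc, &pos);
+ *
+ * Both helpers advance the position on success, matching the behaviour
+ * of kernel_read() and kernel_write() on newer kernels.
+ */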
+#ifndef HAVE_MATCH_WILDCARD +/** + * match_wildcard: - parse if a string matches given wildcard pattern + * @pattern: wildcard pattern + * @str: the string to be parsed + * + * Description: Parse the string @str to check if matches wildcard + * pattern @pattern. The pattern may contain two type wildcardes: + * '*' - matches zero or more characters + * '?' - matches one character + * If it's matched, return true, else return false. + */ +bool match_wildcard(const char *pattern, const char *str) +{ + const char *s = str; + const char *p = pattern; + bool star = false; + + while (*s) { + switch (*p) { + case '?': + s++; + p++; + break; + case '*': + star = true; + str = s; + if (!*++p) + return true; + pattern = p; + break; + default: + if (*s == *p) { + s++; + p++; + } else { + if (!star) + return false; + str++; + s = str; + p = pattern; + } + break; + } + } + + if (*p == '*') + ++p; + return !*p; +} +EXPORT_SYMBOL(match_wildcard); +#endif /* !HAVE_MATCH_WILDCARD */ + +#ifndef HAVE_KSTRTOBOOL_FROM_USER +int kstrtobool_from_user(const char __user *s, size_t count, bool *res) +{ + /* Longest string needed to differentiate, newline, terminator */ + char buf[4]; + + count = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, s, count)) + return -EFAULT; + buf[count] = '\0'; + return strtobool(buf, res); +} +EXPORT_SYMBOL(kstrtobool_from_user); +#endif /* !HAVE_KSTRTOBOOL_FROM_USER */ + +#ifndef HAVE_NLA_STRDUP +char *nla_strdup(const struct nlattr *nla, gfp_t flags) +{ + size_t srclen = nla_len(nla); + char *src = nla_data(nla), *dst; + + if (srclen > 0 && src[srclen - 1] == '\0') + srclen--; + + dst = kmalloc(srclen + 1, flags); + if (dst != NULL) { + memcpy(dst, src, srclen); + dst[srclen] = '\0'; + } + return dst; +} +EXPORT_SYMBOL(nla_strdup); +#endif /* !HAVE_NLA_STRDUP */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c new file mode 100644 index 0000000000000..33117c25a1302 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c @@ -0,0 +1,174 @@ +/* + * The implementation of the wait_bit*() and related waiting APIs: + */ +#include +#include +#ifdef HAVE_SCHED_HEADERS +#include +#endif +#include + +#ifndef HAVE_PREPARE_TO_WAIT_EVENT + +#define __add_wait_queue_entry_tail __add_wait_queue_tail + +long prepare_to_wait_event(wait_queue_head_t *wq_head, + wait_queue_entry_t *wq_entry, int state) +{ + unsigned long flags; + long ret = 0; + + spin_lock_irqsave(&wq_head->lock, flags); + if (unlikely(signal_pending_state(state, current))) { + /* + * Exclusive waiter must not fail if it was selected by wakeup, + * it should "consume" the condition we were waiting for. + * + * The caller will recheck the condition and return success if + * we were already woken up, we can not miss the event because + * wakeup locks/unlocks the same wq_head->lock. + * + * But we need to ensure that set-condition + wakeup after that + * can't see us, it should wake up another exclusive waiter if + * we fail. 
+ */ + list_del_init(&wq_entry->task_list); + ret = -ERESTARTSYS; + } else { + if (list_empty(&wq_entry->task_list)) { + if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_entry_tail(wq_head, wq_entry); + else + __add_wait_queue(wq_head, wq_entry); + } + set_current_state(state); + } + spin_unlock_irqrestore(&wq_head->lock, flags); + + return ret; +} +EXPORT_SYMBOL(prepare_to_wait_event); +#endif /* !HAVE_PREPARE_TO_WAIT_EVENT */ + +#ifndef HAVE_WAIT_VAR_EVENT + +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) + +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *__var_waitqueue(void *p) +{ + return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(__var_waitqueue); + +static int +var_wake_function(wait_queue_entry_t *wq_entry, unsigned int mode, + int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wbq_entry = + container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + + if (wbq_entry->key.flags != key->flags || + wbq_entry->key.bit_nr != key->bit_nr) + return 0; + + return autoremove_wake_function(wq_entry, mode, sync, key); +} + +void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, + int flags) +{ + *wbq_entry = (struct wait_bit_queue_entry){ + .key = { + .flags = (var), + .bit_nr = -1, + }, + .wq_entry = { + .private = current, + .func = var_wake_function, +#ifdef HAVE_WAIT_QUEUE_ENTRY_LIST + .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), +#else + .task_list = LIST_HEAD_INIT(wbq_entry->wq_entry.task_list), +#endif + }, + }; +} +EXPORT_SYMBOL(init_wait_var_entry); + +void wake_up_var(void *var) +{ + __wake_up_bit(__var_waitqueue(var), var, -1); +} +EXPORT_SYMBOL(wake_up_var); + +void __init wait_bit_init(void) +{ + int i; + + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); +} +#endif /* ! HAVE_WAIT_VAR_EVENT */ + +#ifndef HAVE_WAIT_WOKEN +/* + * DEFINE_WAIT_FUNC(wait, woken_wake_func); + * + * add_wait_queue(&wq_head, &wait); + * for (;;) { + * if (condition) + * break; + * + * // in wait_woken() // in woken_wake_function() + * + * p->state = mode; wq_entry->flags |= WQ_FLAG_WOKEN; + * smp_mb(); // A try_to_wake_up(): + * if (!(wq_entry->flags & WQ_FLAG_WOKEN)) + * schedule() if (p->state & mode) + * p->state = TASK_RUNNING; p->state = TASK_RUNNING; + * wq_entry->flags &= ~WQ_FLAG_WOKEN; ~~~~~~~~~~~~~~~~~~ + * smp_mb(); // B condition = true; + * } smp_mb(); // C + * remove_wait_queue(&wq_head, &wait); wq_entry->flags |= WQ_FLAG_WOKEN; + */ +long wait_woken(struct wait_queue_entry *wq_entry, unsigned int mode, + long timeout) +{ + /* + * The below executes an smp_mb(), which matches with the full barrier + * executed by the try_to_wake_up() in woken_wake_function() such that + * either we see the store to wq_entry->flags in woken_wake_function() + * or woken_wake_function() sees our store to current->state. + */ + set_current_state(mode); /* A */ + if (!(wq_entry->flags & WQ_FLAG_WOKEN)) + timeout = schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); + + /* + * The below executes an smp_mb(), which matches with the smp_mb() (C) + * in woken_wake_function() such that either we see the wait condition + * being true or the store to wq_entry->flags in woken_wake_function() + * follows ours in the coherence order. 
+ */ + smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */ + + return timeout; +} +EXPORT_SYMBOL(wait_woken); + +int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, + int sync, void *key) +{ + /* Pairs with the smp_store_mb() in wait_woken(). */ + smp_mb(); /* C */ + wq_entry->flags |= WQ_FLAG_WOKEN; + + return default_wake_function(wq_entry, mode, sync, key); +} +EXPORT_SYMBOL(woken_wake_function); +#endif /* HAVE_WAIT_WOKEN */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/xarray.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/xarray.c new file mode 100644 index 0000000000000..fea97febdf2ce --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/xarray.c @@ -0,0 +1,2101 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * XArray implementation + * Copyright (c) 2017 Microsoft Corporation + * Author: Matthew Wilcox + * + * This is taken from kernel commit: + * + * 7b785645e ("mm: fix page cache convergence regression") + * + * at kernel verison 5.2-rc2 + */ +#ifndef HAVE_XARRAY_SUPPORT +#include +#include +#include +#include +#include +#include + +/* + * Coding conventions in this file: + * + * @xa is used to refer to the entire xarray. + * @xas is the 'xarray operation state'. It may be either a pointer to + * an xa_state, or an xa_state stored on the stack. This is an unfortunate + * ambiguity. + * @index is the index of the entry being operated on + * @mark is an xa_mark_t; a small number indicating one of the mark bits. + * @node refers to an xa_node; usually the primary one being operated on by + * this function. + * @offset is the index into the slots array inside an xa_node. + * @parent refers to the @xa_node closer to the head than @node. + * @entry refers to something stored in a slot in the xarray + */ + +static inline unsigned int xa_lock_type(const struct xarray *xa) +{ + return (__force unsigned int)xa->xa_flags & 3; +} + +static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type) +{ + if (lock_type == XA_LOCK_IRQ) + xas_lock_irq(xas); + else if (lock_type == XA_LOCK_BH) + xas_lock_bh(xas); + else + xas_lock(xas); +} + +static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type) +{ + if (lock_type == XA_LOCK_IRQ) + xas_unlock_irq(xas); + else if (lock_type == XA_LOCK_BH) + xas_unlock_bh(xas); + else + xas_unlock(xas); +} + +static inline bool xa_track_free(const struct xarray *xa) +{ + return xa->xa_flags & XA_FLAGS_TRACK_FREE; +} + +static inline bool xa_zero_busy(const struct xarray *xa) +{ + return xa->xa_flags & XA_FLAGS_ZERO_BUSY; +} + +static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark) +{ + if (!(xa->xa_flags & XA_FLAGS_MARK(mark))) + xa->xa_flags |= XA_FLAGS_MARK(mark); +} + +static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark) +{ + if (xa->xa_flags & XA_FLAGS_MARK(mark)) + xa->xa_flags &= ~(XA_FLAGS_MARK(mark)); +} + +static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark) +{ + return node->marks[(__force unsigned)mark]; +} + +static inline bool node_get_mark(struct xa_node *node, + unsigned int offset, xa_mark_t mark) +{ + return test_bit(offset, node_marks(node, mark)); +} + +/* returns true if the bit was set */ +static inline bool node_set_mark(struct xa_node *node, unsigned int offset, + xa_mark_t mark) +{ + return __test_and_set_bit(offset, node_marks(node, mark)); +} + +/* returns true if the bit was set */ +static inline bool node_clear_mark(struct xa_node *node, unsigned int offset, + xa_mark_t mark) 
+{ + return __test_and_clear_bit(offset, node_marks(node, mark)); +} + +static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark) +{ + return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE); +} + +static inline void node_mark_all(struct xa_node *node, xa_mark_t mark) +{ + bitmap_fill(node_marks(node, mark), XA_CHUNK_SIZE); +} + +#define mark_inc(mark) do { \ + mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \ +} while (0) + +/* + * xas_squash_marks() - Merge all marks to the first entry + * @xas: Array operation state. + * + * Set a mark on the first entry if any entry has it set. Clear marks on + * all sibling entries. + */ +static void xas_squash_marks(const struct xa_state *xas) +{ + unsigned int mark = 0; + unsigned int limit = xas->xa_offset + xas->xa_sibs + 1; + + if (!xas->xa_sibs) + return; + + do { + unsigned long *marks = xas->xa_node->marks[mark]; + if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit) + continue; + __set_bit(xas->xa_offset, marks); + bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs); + } while (mark++ != (__force unsigned)XA_MARK_MAX); +} + +/* extracts the offset within this node from the index */ +static unsigned int get_offset(unsigned long index, struct xa_node *node) +{ + return (index >> node->shift) & XA_CHUNK_MASK; +} + +static void xas_set_offset(struct xa_state *xas) +{ + xas->xa_offset = get_offset(xas->xa_index, xas->xa_node); +} + +/* move the index either forwards (find) or backwards (sibling slot) */ +static void xas_move_index(struct xa_state *xas, unsigned long offset) +{ + unsigned int shift = xas->xa_node->shift; + xas->xa_index &= ~XA_CHUNK_MASK << shift; + xas->xa_index += offset << shift; +} + +static void xas_advance(struct xa_state *xas) +{ + xas->xa_offset++; + xas_move_index(xas, xas->xa_offset); +} + +static void *set_bounds(struct xa_state *xas) +{ + xas->xa_node = XAS_BOUNDS; + return NULL; +} + +/* + * Starts a walk. If the @xas is already valid, we assume that it's on + * the right path and just return where we've got to. If we're in an + * error state, return NULL. If the index is outside the current scope + * of the xarray, return NULL without changing @xas->xa_node. Otherwise + * set @xas->xa_node to NULL and return the current head of the array. + */ +static void *xas_start(struct xa_state *xas) +{ + void *entry; + + if (xas_valid(xas)) + return xas_reload(xas); + if (xas_error(xas)) + return NULL; + + entry = xa_head(xas->xa); + if (!xa_is_node(entry)) { + if (xas->xa_index) + return set_bounds(xas); + } else { + if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK) + return set_bounds(xas); + } + + xas->xa_node = NULL; + return entry; +} + +static void *xas_descend(struct xa_state *xas, struct xa_node *node) +{ + unsigned int offset = get_offset(xas->xa_index, node); + void *entry = xa_entry(xas->xa, node, offset); + + xas->xa_node = node; + if (xa_is_sibling(entry)) { + offset = xa_to_sibling(entry); + entry = xa_entry(xas->xa, node, offset); + } + + xas->xa_offset = offset; + return entry; +} + +/** + * xas_load() - Load an entry from the XArray (advanced). + * @xas: XArray operation state. + * + * Usually walks the @xas to the appropriate state to load the entry + * stored at xa_index. However, it will do nothing and return %NULL if + * @xas is in an error state. xas_load() will never expand the tree. 
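+ *
+ * A minimal lookup sketch built on this helper (@xa and @index stand in
+ * for a real array and index; this mirrors the xa_load() wrapper further
+ * down in this file):
+ *
+ *    XA_STATE(xas, &xa, index);
+ *    void *entry;
+ *
+ *    rcu_read_lock();
+ *    do {
+ *            entry = xas_load(&xas);
+ *    } while (xas_retry(&xas, entry));
+ *    rcu_read_unlock();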
+ * + * If the xa_state is set up to operate on a multi-index entry, xas_load() + * may return %NULL or an internal entry, even if there are entries + * present within the range specified by @xas. + * + * Context: Any context. The caller should hold the xa_lock or the RCU lock. + * Return: Usually an entry in the XArray, but see description for exceptions. + */ +void *xas_load(struct xa_state *xas) +{ + void *entry = xas_start(xas); + + while (xa_is_node(entry)) { + struct xa_node *node = xa_to_node(entry); + + if (xas->xa_shift > node->shift) + break; + entry = xas_descend(xas, node); + if (node->shift == 0) + break; + } + return entry; +} +EXPORT_SYMBOL_GPL(xas_load); + +/* Move the radix tree node cache here */ +extern struct kmem_cache *radix_tree_node_cachep; + +static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, + int offset) +{ + __clear_bit(offset, node->tags[tag]); +} + +static void radix_tree_node_rcu_free(struct rcu_head *head) +{ + struct radix_tree_node *node = + container_of(head, struct radix_tree_node, rcu_head); + int i; + + /* + * must only free zeroed nodes into the slab. radix_tree_shrink + * can leave us with a non-NULL entry in the first slot, so clear + * that here to make sure. + */ + for (i = 0; i < RADIX_TREE_MAX_TAGS; i++) + tag_clear(node, i, 0); + + node->slots[0] = NULL; + node->count = 0; + + kmem_cache_free(radix_tree_node_cachep, node); +} + +#define XA_RCU_FREE ((struct xarray *)1) + +static void xa_node_free(struct xa_node *node) +{ + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + node->array = XA_RCU_FREE; + call_rcu(&node->rcu_head, radix_tree_node_rcu_free); +} + +/* + * xas_destroy() - Free any resources allocated during the XArray operation. + * @xas: XArray operation state. + * + * This function is now internal-only. + */ +static void xas_destroy(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_alloc; + + if (!node) + return; + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + kmem_cache_free(radix_tree_node_cachep, node); + xas->xa_alloc = NULL; +} + +/** + * xas_nomem() - Allocate memory if needed. + * @xas: XArray operation state. + * @gfp: Memory allocation flags. + * + * If we need to add new nodes to the XArray, we try to allocate memory + * with GFP_NOWAIT while holding the lock, which will usually succeed. + * If it fails, @xas is flagged as needing memory to continue. The caller + * should drop the lock and call xas_nomem(). If xas_nomem() succeeds, + * the caller should retry the operation. + * + * Forward progress is guaranteed as one node is allocated here and + * stored in the xa_state where it will be found by xas_alloc(). More + * nodes will likely be found in the slab allocator, but we do not tie + * them up here. + * + * Return: true if memory was needed, and was successfully allocated. + */ +bool xas_nomem(struct xa_state *xas, gfp_t gfp) +{ + if (xas->xa_node != XA_ERROR(-ENOMEM)) { + xas_destroy(xas); + return false; + } +#ifdef __GFP_ACCOUNT + if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) + gfp |= __GFP_ACCOUNT; +#endif + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + if (!xas->xa_alloc) + return false; + XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); + xas->xa_node = XAS_RESTART; + return true; +} +EXPORT_SYMBOL_GPL(xas_nomem); + +/* + * __xas_nomem() - Drop locks and allocate memory if needed. + * @xas: XArray operation state. + * @gfp: Memory allocation flags. + * + * Internal variant of xas_nomem(). 
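+ *
+ * For reference, the caller-side retry loop described above for the public
+ * xas_nomem() looks roughly like this (@xa, @index and @item are
+ * placeholders):
+ *
+ *    XA_STATE(xas, &xa, index);
+ *
+ *    do {
+ *            xas_lock(&xas);
+ *            xas_store(&xas, item);
+ *            xas_unlock(&xas);
+ *    } while (xas_nomem(&xas, GFP_KERNEL));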
+ * + * Return: true if memory was needed, and was successfully allocated. + */ +static bool __xas_nomem(struct xa_state *xas, gfp_t gfp) + __must_hold(xas->xa->xa_lock) +{ + unsigned int lock_type = xa_lock_type(xas->xa); + + if (xas->xa_node != XA_ERROR(-ENOMEM)) { + xas_destroy(xas); + return false; + } +#ifdef __GFP_ACCOUNT + if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) + gfp |= __GFP_ACCOUNT; +#endif + if (gfpflags_allow_blocking(gfp)) { + xas_unlock_type(xas, lock_type); + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + xas_lock_type(xas, lock_type); + } else { + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + } + if (!xas->xa_alloc) + return false; + XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); + xas->xa_node = XAS_RESTART; + return true; +} + +static void xas_update(struct xa_state *xas, struct xa_node *node) +{ + if (xas->xa_update) + xas->xa_update(node); + else + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); +} + +static void *xas_alloc(struct xa_state *xas, unsigned int shift) +{ + struct xa_node *parent = xas->xa_node; + struct xa_node *node = xas->xa_alloc; + + if (xas_invalid(xas)) + return NULL; + + if (node) { + xas->xa_alloc = NULL; + } else { + gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN; +#ifdef __GFP_ACCOUNT + if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) + gfp |= __GFP_ACCOUNT; +#endif + node = kmem_cache_alloc(radix_tree_node_cachep, gfp); + if (!node) { + xas_set_err(xas, -ENOMEM); + return NULL; + } + } + + if (parent) { + node->offset = xas->xa_offset; + parent->count++; + XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE); + xas_update(xas, parent); + } + XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + node->shift = shift; + node->count = 0; + node->nr_values = 0; + RCU_INIT_POINTER(node->parent, xas->xa_node); + node->array = xas->xa; + + return node; +} + +#ifdef CONFIG_XARRAY_MULTI +/* Returns the number of indices covered by a given xa_state */ +static unsigned long xas_size(const struct xa_state *xas) +{ + return (xas->xa_sibs + 1UL) << xas->xa_shift; +} +#endif + +/* + * Use this to calculate the maximum index that will need to be created + * in order to add the entry described by @xas. Because we cannot store a + * multiple-index entry at index 0, the calculation is a little more complex + * than you might expect. 
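+ *
+ * Worked example, assuming XA_CHUNK_SHIFT is 6: a 64-index (order-6) entry
+ * stored at index 0 has xa_shift == 6 and xa_sibs == 0, so the mask is 63
+ * and max |= mask gives 63; since mask == max we bump max to 64, which
+ * makes xas_expand() allocate a shift-6 node for the entry instead of
+ * leaving it in the root slot, where its order could not be represented.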
+ */ +static unsigned long xas_max(struct xa_state *xas) +{ + unsigned long max = xas->xa_index; + +#ifdef CONFIG_XARRAY_MULTI + if (xas->xa_shift || xas->xa_sibs) { + unsigned long mask = xas_size(xas) - 1; + max |= mask; + if (mask == max) + max++; + } +#endif + + return max; +} + +/* The maximum index that can be contained in the array without expanding it */ +static unsigned long max_index(void *entry) +{ + if (!xa_is_node(entry)) + return 0; + return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1; +} + +static void xas_shrink(struct xa_state *xas) +{ + struct xarray *xa = xas->xa; + struct xa_node *node = xas->xa_node; + + for (;;) { + void *entry; + + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + if (node->count != 1) + break; + entry = xa_entry_locked(xa, node, 0); + if (!entry) + break; + if (!xa_is_node(entry) && node->shift) + break; + if (xa_is_zero(entry) && xa_zero_busy(xa)) + entry = NULL; + xas->xa_node = XAS_BOUNDS; + + RCU_INIT_POINTER(xa->xa_head, entry); + if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK)) + xa_mark_clear(xa, XA_FREE_MARK); + + node->count = 0; + node->nr_values = 0; + if (!xa_is_node(entry)) + RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY); + xas_update(xas, node); + xa_node_free(node); + if (!xa_is_node(entry)) + break; + node = xa_to_node(entry); + node->parent = NULL; + } +} + +/* + * xas_delete_node() - Attempt to delete an xa_node + * @xas: Array operation state. + * + * Attempts to delete the @xas->xa_node. This will fail if xa->node has + * a non-zero reference count. + */ +static void xas_delete_node(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + for (;;) { + struct xa_node *parent; + + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + if (node->count) + break; + + parent = xa_parent_locked(xas->xa, node); + xas->xa_node = parent; + xas->xa_offset = node->offset; + xa_node_free(node); + + if (!parent) { + xas->xa->xa_head = NULL; + xas->xa_node = XAS_BOUNDS; + return; + } + + parent->slots[xas->xa_offset] = NULL; + parent->count--; + XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE); + node = parent; + xas_update(xas, node); + } + + if (!node->parent) + xas_shrink(xas); +} + +/** + * xas_free_nodes() - Free this node and all nodes that it references + * @xas: Array operation state. + * @top: Node to free + * + * This node has been removed from the tree. We must now free it and all + * of its subnodes. There may be RCU walkers with references into the tree, + * so we must replace all entries with retry markers. 
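+ *
+ * Concurrent RCU readers that observe one of these retry markers are
+ * expected to restart their walk, which the usual
+ * "do { entry = xas_load(&xas); } while (xas_retry(&xas, entry));"
+ * pattern (see xa_load() below) already does.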
+ */ +static void xas_free_nodes(struct xa_state *xas, struct xa_node *top) +{ + unsigned int offset = 0; + struct xa_node *node = top; + + for (;;) { + void *entry = xa_entry_locked(xas->xa, node, offset); + + if (node->shift && xa_is_node(entry)) { + node = xa_to_node(entry); + offset = 0; + continue; + } + if (entry) + RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY); + offset++; + while (offset == XA_CHUNK_SIZE) { + struct xa_node *parent; + + parent = xa_parent_locked(xas->xa, node); + offset = node->offset + 1; + node->count = 0; + node->nr_values = 0; + xas_update(xas, node); + xa_node_free(node); + if (node == top) + return; + node = parent; + } + } +} + +/* + * xas_expand adds nodes to the head of the tree until it has reached + * sufficient height to be able to contain @xas->xa_index + */ +static int xas_expand(struct xa_state *xas, void *head) +{ + struct xarray *xa = xas->xa; + struct xa_node *node = NULL; + unsigned int shift = 0; + unsigned long max = xas_max(xas); + + if (!head) { + if (max == 0) + return 0; + while ((max >> shift) >= XA_CHUNK_SIZE) + shift += XA_CHUNK_SHIFT; + return shift + XA_CHUNK_SHIFT; + } else if (xa_is_node(head)) { + node = xa_to_node(head); + shift = node->shift + XA_CHUNK_SHIFT; + } + xas->xa_node = NULL; + + while (max > max_index(head)) { + xa_mark_t mark = 0; + + XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); + node = xas_alloc(xas, shift); + if (!node) + return -ENOMEM; + + node->count = 1; + if (xa_is_value(head)) + node->nr_values = 1; + RCU_INIT_POINTER(node->slots[0], head); + + /* Propagate the aggregated mark info to the new child */ + for (;;) { + if (xa_track_free(xa) && mark == XA_FREE_MARK) { + node_mark_all(node, XA_FREE_MARK); + if (!xa_marked(xa, XA_FREE_MARK)) { + node_clear_mark(node, 0, XA_FREE_MARK); + xa_mark_set(xa, XA_FREE_MARK); + } + } else if (xa_marked(xa, mark)) { + node_set_mark(node, 0, mark); + } + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } + + /* + * Now that the new node is fully initialised, we can add + * it to the tree + */ + if (xa_is_node(head)) { + xa_to_node(head)->offset = 0; + rcu_assign_pointer(xa_to_node(head)->parent, node); + } + head = xa_mk_node(node); + rcu_assign_pointer(xa->xa_head, head); + xas_update(xas, node); + + shift += XA_CHUNK_SHIFT; + } + + xas->xa_node = node; + return shift; +} + +/* + * xas_create() - Create a slot to store an entry in. + * @xas: XArray operation state. + * @allow_root: %true if we can store the entry in the root directly + * + * Most users will not need to call this function directly, as it is called + * by xas_store(). It is useful for doing conditional store operations + * (see the xa_cmpxchg() implementation for an example). + * + * Return: If the slot already existed, returns the contents of this slot. + * If the slot was newly created, returns %NULL. If it failed to create the + * slot, returns %NULL and indicates the error in @xas. 
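+ *
+ * For reference, the conditional-store operation mentioned above looks
+ * like this from the caller's side (sketch; @xa, @index and @item are
+ * placeholders, and the xa_cmpxchg() wrapper is assumed to be provided by
+ * the matching xarray.h):
+ *
+ *    void *curr = xa_cmpxchg(&xa, index, NULL, item, GFP_KERNEL);
+ *
+ *    if (xa_is_err(curr))
+ *            return xa_err(curr);
+ *    if (curr)
+ *            return -EEXIST;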
+ */ +static void *xas_create(struct xa_state *xas, bool allow_root) +{ + struct xarray *xa = xas->xa; + void *entry; + void __rcu **slot; + struct xa_node *node = xas->xa_node; + int shift; + unsigned int order = xas->xa_shift; + + if (xas_top(node)) { + entry = xa_head_locked(xa); + xas->xa_node = NULL; + if (!entry && xa_zero_busy(xa)) + entry = XA_ZERO_ENTRY; + shift = xas_expand(xas, entry); + if (shift < 0) + return NULL; + if (!shift && !allow_root) + shift = XA_CHUNK_SHIFT; + entry = xa_head_locked(xa); + slot = &xa->xa_head; + } else if (xas_error(xas)) { + return NULL; + } else if (node) { + unsigned int offset = xas->xa_offset; + + shift = node->shift; + entry = xa_entry_locked(xa, node, offset); + slot = &node->slots[offset]; + } else { + shift = 0; + entry = xa_head_locked(xa); + slot = &xa->xa_head; + } + + while (shift > order) { + shift -= XA_CHUNK_SHIFT; + if (!entry) { + node = xas_alloc(xas, shift); + if (!node) + break; + if (xa_track_free(xa)) + node_mark_all(node, XA_FREE_MARK); + rcu_assign_pointer(*slot, xa_mk_node(node)); + } else if (xa_is_node(entry)) { + node = xa_to_node(entry); + } else { + break; + } + entry = xas_descend(xas, node); + slot = &node->slots[xas->xa_offset]; + } + + return entry; +} + +/** + * xas_create_range() - Ensure that stores to this range will succeed + * @xas: XArray operation state. + * + * Creates all of the slots in the range covered by @xas. Sets @xas to + * create single-index entries and positions it at the beginning of the + * range. This is for the benefit of users which have not yet been + * converted to use multi-index entries. + */ +void xas_create_range(struct xa_state *xas) +{ + unsigned long index = xas->xa_index; + unsigned char shift = xas->xa_shift; + unsigned char sibs = xas->xa_sibs; + + xas->xa_index |= ((sibs + 1) << shift) - 1; + if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift) + xas->xa_offset |= sibs; + xas->xa_shift = 0; + xas->xa_sibs = 0; + + for (;;) { + xas_create(xas, true); + if (xas_error(xas)) + goto restore; + if (xas->xa_index <= (index | XA_CHUNK_MASK)) + goto success; + xas->xa_index -= XA_CHUNK_SIZE; + + for (;;) { + struct xa_node *node = xas->xa_node; + xas->xa_node = xa_parent_locked(xas->xa, node); + xas->xa_offset = node->offset - 1; + if (node->offset != 0) + break; + } + } + +restore: + xas->xa_shift = shift; + xas->xa_sibs = sibs; + xas->xa_index = index; + return; +success: + xas->xa_index = index; + if (xas->xa_node) + xas_set_offset(xas); +} +EXPORT_SYMBOL_GPL(xas_create_range); + +static void update_node(struct xa_state *xas, struct xa_node *node, + int count, int values) +{ + if (!node || (!count && !values)) + return; + + node->count += count; + node->nr_values += values; + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE); + xas_update(xas, node); + if (count < 0) + xas_delete_node(xas); +} + +/** + * xas_store() - Store this entry in the XArray. + * @xas: XArray operation state. + * @entry: New entry. + * + * If @xas is operating on a multi-index entry, the entry returned by this + * function is essentially meaningless (it may be an internal entry or it + * may be %NULL, even if there are non-NULL entries at some of the indices + * covered by the range). This is not a problem for any current users, + * and can be changed if needed. + * + * Return: The old entry at this index. 
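+ *
+ * A store of %NULL erases the entry and never needs to allocate memory,
+ * so a minimal locked sketch is simply (@xa and @index are placeholders;
+ * this is what __xa_erase() below boils down to):
+ *
+ *    XA_STATE(xas, &xa, index);
+ *    void *old;
+ *
+ *    xas_lock(&xas);
+ *    old = xas_store(&xas, NULL);
+ *    xas_unlock(&xas);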
+ */ +void *xas_store(struct xa_state *xas, void *entry) +{ + struct xa_node *node; + void __rcu **slot = &xas->xa->xa_head; + unsigned int offset, max; + int count = 0; + int values = 0; + void *first, *next; + bool value = xa_is_value(entry); + + if (entry) { + bool allow_root = !xa_is_node(entry) && !xa_is_zero(entry); + first = xas_create(xas, allow_root); + } else { + first = xas_load(xas); + } + + if (xas_invalid(xas)) + return first; + node = xas->xa_node; + if (node && (xas->xa_shift < node->shift)) + xas->xa_sibs = 0; + if ((first == entry) && !xas->xa_sibs) + return first; + + next = first; + offset = xas->xa_offset; + max = xas->xa_offset + xas->xa_sibs; + if (node) { + slot = &node->slots[offset]; + if (xas->xa_sibs) + xas_squash_marks(xas); + } + if (!entry) + xas_init_marks(xas); + + for (;;) { + /* + * Must clear the marks before setting the entry to NULL, + * otherwise xas_for_each_marked may find a NULL entry and + * stop early. rcu_assign_pointer contains a release barrier + * so the mark clearing will appear to happen before the + * entry is set to NULL. + */ + rcu_assign_pointer(*slot, entry); + if (xa_is_node(next) && (!node || node->shift)) + xas_free_nodes(xas, xa_to_node(next)); + if (!node) + break; + count += !next - !entry; + values += !xa_is_value(first) - !value; + if (entry) { + if (offset == max) + break; + if (!xa_is_sibling(entry)) + entry = xa_mk_sibling(xas->xa_offset); + } else { + if (offset == XA_CHUNK_MASK) + break; + } + next = xa_entry_locked(xas->xa, node, ++offset); + if (!xa_is_sibling(next)) { + if (!entry && (offset > max)) + break; + first = next; + } + slot++; + } + + update_node(xas, node, count, values); + return first; +} +EXPORT_SYMBOL_GPL(xas_store); + +/** + * xas_get_mark() - Returns the state of this mark. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Return: true if the mark is set, false if the mark is clear or @xas + * is in an error state. + */ +bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark) +{ + if (xas_invalid(xas)) + return false; + if (!xas->xa_node) + return xa_marked(xas->xa, mark); + return node_get_mark(xas->xa_node, xas->xa_offset, mark); +} +EXPORT_SYMBOL_GPL(xas_get_mark); + +/** + * xas_set_mark() - Sets the mark on this entry and its parents. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Sets the specified mark on this entry, and walks up the tree setting it + * on all the ancestor entries. Does nothing if @xas has not been walked to + * an entry, or is in an error state. + */ +void xas_set_mark(const struct xa_state *xas, xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset = xas->xa_offset; + + if (xas_invalid(xas)) + return; + + while (node) { + if (node_set_mark(node, offset, mark)) + return; + offset = node->offset; + node = xa_parent_locked(xas->xa, node); + } + + if (!xa_marked(xas->xa, mark)) + xa_mark_set(xas->xa, mark); +} +EXPORT_SYMBOL_GPL(xas_set_mark); + +/** + * xas_clear_mark() - Clears the mark on this entry and its parents. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Clears the specified mark on this entry, and walks back to the head + * attempting to clear it on all the ancestor entries. Does nothing if + * @xas has not been walked to an entry, or is in an error state. 
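+ *
+ * Marks are typically set right after a store, e.g. to tag entries that
+ * still need processing (XA_MARK_0 is just an example mark; @xa, @index
+ * and @item are placeholders):
+ *
+ *    XA_STATE(xas, &xa, index);
+ *
+ *    xas_lock(&xas);
+ *    xas_store(&xas, item);
+ *    xas_set_mark(&xas, XA_MARK_0);
+ *    xas_unlock(&xas);
+ *
+ * with a later xas_clear_mark(&xas, XA_MARK_0) once the entry no longer
+ * needs the tag.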
+ */ +void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset = xas->xa_offset; + + if (xas_invalid(xas)) + return; + + while (node) { + if (!node_clear_mark(node, offset, mark)) + return; + if (node_any_mark(node, mark)) + return; + + offset = node->offset; + node = xa_parent_locked(xas->xa, node); + } + + if (xa_marked(xas->xa, mark)) + xa_mark_clear(xas->xa, mark); +} +EXPORT_SYMBOL_GPL(xas_clear_mark); + +/** + * xas_init_marks() - Initialise all marks for the entry + * @xas: Array operations state. + * + * Initialise all marks for the entry specified by @xas. If we're tracking + * free entries with a mark, we need to set it on all entries. All other + * marks are cleared. + * + * This implementation is not as efficient as it could be; we may walk + * up the tree multiple times. + */ +void xas_init_marks(const struct xa_state *xas) +{ + xa_mark_t mark = 0; + + for (;;) { + if (xa_track_free(xas->xa) && mark == XA_FREE_MARK) + xas_set_mark(xas, mark); + else + xas_clear_mark(xas, mark); + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } +} +EXPORT_SYMBOL_GPL(xas_init_marks); + +/** + * xas_pause() - Pause a walk to drop a lock. + * @xas: XArray operation state. + * + * Some users need to pause a walk and drop the lock they're holding in + * order to yield to a higher priority thread or carry out an operation + * on an entry. Those users should call this function before they drop + * the lock. It resets the @xas to be suitable for the next iteration + * of the loop after the user has reacquired the lock. If most entries + * found during a walk require you to call xas_pause(), the xa_for_each() + * iterator may be more appropriate. + * + * Note that xas_pause() only works for forward iteration. If a user needs + * to pause a reverse iteration, we will need a xas_pause_rev(). + */ +void xas_pause(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (xas_invalid(xas)) + return; + + if (node) { + unsigned int offset = xas->xa_offset; + while (++offset < XA_CHUNK_SIZE) { + if (!xa_is_sibling(xa_entry(xas->xa, node, offset))) + break; + } + xas->xa_index += (offset - xas->xa_offset) << node->shift; + } else { + xas->xa_index++; + } + xas->xa_node = XAS_RESTART; +} +EXPORT_SYMBOL_GPL(xas_pause); + +/* + * __xas_prev() - Find the previous entry in the XArray. + * @xas: XArray operation state. + * + * Helper function for xas_prev() which handles all the complex cases + * out of line. + */ +void *__xas_prev(struct xa_state *xas) +{ + void *entry; + + if (!xas_frozen(xas->xa_node)) + xas->xa_index--; + if (!xas->xa_node) + return set_bounds(xas); + if (xas_not_node(xas->xa_node)) + return xas_load(xas); + + if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) + xas->xa_offset--; + + while (xas->xa_offset == 255) { + xas->xa_offset = xas->xa_node->offset - 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + return set_bounds(xas); + } + + for (;;) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!xa_is_node(entry)) + return entry; + + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } +} +EXPORT_SYMBOL_GPL(__xas_prev); + +/* + * __xas_next() - Find the next entry in the XArray. + * @xas: XArray operation state. + * + * Helper function for xas_next() which handles all the complex cases + * out of line. 
+ */ +void *__xas_next(struct xa_state *xas) +{ + void *entry; + + if (!xas_frozen(xas->xa_node)) + xas->xa_index++; + if (!xas->xa_node) + return set_bounds(xas); + if (xas_not_node(xas->xa_node)) + return xas_load(xas); + + if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) + xas->xa_offset++; + + while (xas->xa_offset == XA_CHUNK_SIZE) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + return set_bounds(xas); + } + + for (;;) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!xa_is_node(entry)) + return entry; + + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } +} +EXPORT_SYMBOL_GPL(__xas_next); + +/** + * xas_find() - Find the next present entry in the XArray. + * @xas: XArray operation state. + * @max: Highest index to return. + * + * If the @xas has not yet been walked to an entry, return the entry + * which has an index >= xas.xa_index. If it has been walked, the entry + * currently being pointed at has been processed, and so we move to the + * next entry. + * + * If no entry is found and the array is smaller than @max, the iterator + * is set to the smallest index not yet in the array. This allows @xas + * to be immediately passed to xas_store(). + * + * Return: The entry, if found, otherwise %NULL. + */ +void *xas_find(struct xa_state *xas, unsigned long max) +{ + void *entry; + + if (xas_error(xas)) + return NULL; + + if (!xas->xa_node) { + xas->xa_index = 1; + return set_bounds(xas); + } else if (xas_top(xas->xa_node)) { + entry = xas_load(xas); + if (entry || xas_not_node(xas->xa_node)) + return entry; + } else if (!xas->xa_node->shift && + xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) { + xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1; + } + + xas_advance(xas); + + while (xas->xa_node && (xas->xa_index <= max)) { + if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + continue; + } + + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (xa_is_node(entry)) { + xas->xa_node = xa_to_node(entry); + xas->xa_offset = 0; + continue; + } + if (entry && !xa_is_sibling(entry)) + return entry; + + xas_advance(xas); + } + + if (!xas->xa_node) + xas->xa_node = XAS_BOUNDS; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find); + +/** + * xas_find_marked() - Find the next marked entry in the XArray. + * @xas: XArray operation state. + * @max: Highest index to return. + * @mark: Mark number to search for. + * + * If the @xas has not yet been walked to an entry, return the marked entry + * which has an index >= xas.xa_index. If it has been walked, the entry + * currently being pointed at has been processed, and so we return the + * first marked entry with an index > xas.xa_index. + * + * If no marked entry is found and the array is smaller than @max, @xas is + * set to the bounds state and xas->xa_index is set to the smallest index + * not yet in the array. This allows @xas to be immediately passed to + * xas_store(). + * + * If no entry is found before @max is reached, @xas is set to the restart + * state. + * + * Return: The entry, if found, otherwise %NULL. 
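+ *
+ * Sketch of a caller, mirroring how xa_find() below uses this function
+ * (@xa and @max are placeholders; XA_MARK_0 is just an example mark):
+ *
+ *    XA_STATE(xas, &xa, 0);
+ *    void *entry;
+ *
+ *    rcu_read_lock();
+ *    do {
+ *            entry = xas_find_marked(&xas, max, XA_MARK_0);
+ *    } while (xas_retry(&xas, entry));
+ *    rcu_read_unlock();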
+ */ +void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) +{ + bool advance = true; + unsigned int offset; + void *entry; + + if (xas_error(xas)) + return NULL; + + if (!xas->xa_node) { + xas->xa_index = 1; + goto out; + } else if (xas_top(xas->xa_node)) { + advance = false; + entry = xa_head(xas->xa); + xas->xa_node = NULL; + if (xas->xa_index > max_index(entry)) + goto out; + if (!xa_is_node(entry)) { + if (xa_marked(xas->xa, mark)) + return entry; + xas->xa_index = 1; + goto out; + } + xas->xa_node = xa_to_node(entry); + xas->xa_offset = xas->xa_index >> xas->xa_node->shift; + } + + while (xas->xa_index <= max) { + if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + break; + advance = false; + continue; + } + + if (!advance) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (xa_is_sibling(entry)) { + xas->xa_offset = xa_to_sibling(entry); + xas_move_index(xas, xas->xa_offset); + } + } + + offset = xas_find_chunk(xas, advance, mark); + if (offset > xas->xa_offset) { + advance = false; + xas_move_index(xas, offset); + /* Mind the wrap */ + if ((xas->xa_index - 1) >= max) + goto max; + xas->xa_offset = offset; + if (offset == XA_CHUNK_SIZE) + continue; + } + + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!xa_is_node(entry)) + return entry; + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } + +out: + if (xas->xa_index > max) + goto max; + return set_bounds(xas); +max: + xas->xa_node = XAS_RESTART; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find_marked); + +/** + * xas_find_conflict() - Find the next present entry in a range. + * @xas: XArray operation state. + * + * The @xas describes both a range and a position within that range. + * + * Context: Any context. Expects xa_lock to be held. + * Return: The next entry in the range covered by @xas or %NULL. + */ +void *xas_find_conflict(struct xa_state *xas) +{ + void *curr; + + if (xas_error(xas)) + return NULL; + + if (!xas->xa_node) + return NULL; + + if (xas_top(xas->xa_node)) { + curr = xas_start(xas); + if (!curr) + return NULL; + while (xa_is_node(curr)) { + struct xa_node *node = xa_to_node(curr); + curr = xas_descend(xas, node); + } + if (curr) + return curr; + } + + if (xas->xa_node->shift > xas->xa_shift) + return NULL; + + for (;;) { + if (xas->xa_node->shift == xas->xa_shift) { + if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs) + break; + } else if (xas->xa_offset == XA_CHUNK_MASK) { + xas->xa_offset = xas->xa_node->offset; + xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node); + if (!xas->xa_node) + break; + continue; + } + curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset); + if (xa_is_sibling(curr)) + continue; + while (xa_is_node(curr)) { + xas->xa_node = xa_to_node(curr); + xas->xa_offset = 0; + curr = xa_entry_locked(xas->xa, xas->xa_node, 0); + } + if (curr) + return curr; + } + xas->xa_offset -= xas->xa_sibs; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find_conflict); + +/** + * xa_load() - Load an entry from an XArray. + * @xa: XArray. + * @index: index into array. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The entry at @index in @xa. 
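+ *
+ * Example (@xa is assumed to hold pointers to a caller-defined
+ * struct foo):
+ *
+ *    struct foo *p = xa_load(&xa, index);
+ *
+ *    if (!p)
+ *            return -ENOENT;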
+ */ +void *xa_load(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + void *entry; + + rcu_read_lock(); + do { + entry = xas_load(&xas); + if (xa_is_zero(entry)) + entry = NULL; + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + return entry; +} +EXPORT_SYMBOL(xa_load); + +static void *xas_result(struct xa_state *xas, void *curr) +{ + if (xa_is_zero(curr)) + return NULL; + if (xas_error(xas)) + curr = xas->xa_node; + return curr; +} + +/** + * __xa_erase() - Erase this entry from the XArray while locked. + * @xa: XArray. + * @index: Index into array. + * + * After this function returns, loading from @index will return %NULL. + * If the index is part of a multi-index entry, all indices will be erased + * and none of the entries will be part of a multi-index entry. + * + * Context: Any context. Expects xa_lock to be held on entry. + * Return: The entry which used to be at this index. + */ +void *__xa_erase(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + return xas_result(&xas, xas_store(&xas, NULL)); +} +EXPORT_SYMBOL(__xa_erase); + +/** + * xa_erase() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * After this function returns, loading from @index will return %NULL. + * If the index is part of a multi-index entry, all indices will be erased + * and none of the entries will be part of a multi-index entry. + * + * Context: Any context. Takes and releases the xa_lock. + * Return: The entry which used to be at this index. + */ +void *xa_erase(struct xarray *xa, unsigned long index) +{ + void *entry; + + xa_lock(xa); + entry = __xa_erase(xa, index); + xa_unlock(xa); + + return entry; +} +EXPORT_SYMBOL(xa_erase); + +/** + * __xa_store() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * You must already be holding the xa_lock when calling this function. + * It will drop the lock if needed to allocate memory, and then reacquire + * it afterwards. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: The old entry at this index or xa_err() if an error happened. + */ +void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return XA_ERROR(-EINVAL); + if (xa_track_free(xa) && !entry) + entry = XA_ZERO_ENTRY; + + do { + curr = xas_store(&xas, entry); + if (xa_track_free(xa)) + xas_clear_mark(&xas, XA_FREE_MARK); + } while (__xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(__xa_store); + +/** + * xa_store() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * After this function returns, loads from this index will return @entry. + * Storing into an existing multislot entry updates the entry of every index. + * The marks associated with @index are unaffected unless @entry is %NULL. + * + * Context: Any context. Takes and releases the xa_lock. + * May sleep if the @gfp flags permit. + * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry + * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation + * failed. 
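+ *
+ * Caller-side sketch (@xa, @index and @item are placeholders):
+ *
+ *    void *old = xa_store(&xa, index, item, GFP_KERNEL);
+ *
+ *    if (xa_is_err(old))
+ *            return xa_err(old);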
+ */ +void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock(xa); + curr = __xa_store(xa, index, entry, gfp); + xa_unlock(xa); + + return curr; +} +EXPORT_SYMBOL(xa_store); + +/** + * __xa_cmpxchg() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * You must already be holding the xa_lock when calling this function. + * It will drop the lock if needed to allocate memory, and then reacquire + * it afterwards. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: The old entry at this index or xa_err() if an error happened. + */ +void *__xa_cmpxchg(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return XA_ERROR(-EINVAL); + + do { + curr = xas_load(&xas); + if (curr == old) { + xas_store(&xas, entry); + if (xa_track_free(xa) && entry && !curr) + xas_clear_mark(&xas, XA_FREE_MARK); + } + } while (__xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(__xa_cmpxchg); + +/** + * __xa_insert() - Store this entry in the XArray if no entry is present. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Inserting a NULL entry will store a reserved entry (like xa_reserve()) + * if no entry is present. Inserting will fail if a reserved entry is + * present, even though loading from this index will return NULL. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: 0 if the store succeeded. -EBUSY if another entry was present. + * -ENOMEM if memory could not be allocated. + */ +int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return -EINVAL; + if (!entry) + entry = XA_ZERO_ENTRY; + + do { + curr = xas_load(&xas); + if (!curr) { + xas_store(&xas, entry); + if (xa_track_free(xa)) + xas_clear_mark(&xas, XA_FREE_MARK); + } else { + xas_set_err(&xas, -EBUSY); + } + } while (__xas_nomem(&xas, gfp)); + + return xas_error(&xas); +} +EXPORT_SYMBOL(__xa_insert); + +#ifdef CONFIG_XARRAY_MULTI +static void xas_set_range(struct xa_state *xas, unsigned long first, + unsigned long last) +{ + unsigned int shift = 0; + unsigned long sibs = last - first; + unsigned int offset = XA_CHUNK_MASK; + + xas_set(xas, first); + + while ((first & XA_CHUNK_MASK) == 0) { + if (sibs < XA_CHUNK_MASK) + break; + if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK)) + break; + shift += XA_CHUNK_SHIFT; + if (offset == XA_CHUNK_MASK) + offset = sibs & XA_CHUNK_MASK; + sibs >>= XA_CHUNK_SHIFT; + first >>= XA_CHUNK_SHIFT; + } + + offset = first & XA_CHUNK_MASK; + if (offset + sibs > XA_CHUNK_MASK) + sibs = XA_CHUNK_MASK - offset; + if ((((first + sibs + 1) << shift) - 1) > last) + sibs -= 1; + + xas->xa_shift = shift; + xas->xa_sibs = sibs; +} + +/** + * xa_store_range() - Store this entry at a range of indices in the XArray. + * @xa: XArray. + * @first: First index to affect. + * @last: Last index to affect. + * @entry: New entry. + * @gfp: Memory allocation flags. 
+ * + * After this function returns, loads from any index between @first and @last, + * inclusive will return @entry. + * Storing into an existing multislot entry updates the entry of every index. + * The marks associated with @index are unaffected unless @entry is %NULL. + * + * Context: Process context. Takes and releases the xa_lock. May sleep + * if the @gfp flags permit. + * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in + * an XArray, or xa_err(-ENOMEM) if memory allocation failed. + */ +void *xa_store_range(struct xarray *xa, unsigned long first, + unsigned long last, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, 0); + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + if (last < first) + return XA_ERROR(-EINVAL); + + do { + xas_lock(&xas); + if (entry) { + unsigned int order = BITS_PER_LONG; + if (last + 1) + order = __ffs(last + 1); + xas_set_order(&xas, last, order); + xas_create(&xas, true); + if (xas_error(&xas)) + goto unlock; + } + do { + xas_set_range(&xas, first, last); + xas_store(&xas, entry); + if (xas_error(&xas)) + goto unlock; + first += xas_size(&xas); + } while (first <= last); +unlock: + xas_unlock(&xas); + } while (xas_nomem(&xas, gfp)); + + return xas_result(&xas, NULL); +} +EXPORT_SYMBOL(xa_store_range); +#endif /* CONFIG_XARRAY_MULTI */ + +/** + * __xa_alloc() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @limit: Range for allocated ID. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory could not be allocated or + * -EBUSY if there are no free entries in @limit. + */ +int __xa_alloc(struct xarray *xa, u32 *id, void *entry, + struct xa_limit limit, gfp_t gfp) +{ + XA_STATE(xas, xa, 0); + + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return -EINVAL; + if (WARN_ON_ONCE(!xa_track_free(xa))) + return -EINVAL; + + if (!entry) + entry = XA_ZERO_ENTRY; + + do { + xas.xa_index = limit.min; + xas_find_marked(&xas, limit.max, XA_FREE_MARK); + if (xas.xa_node == XAS_RESTART) + xas_set_err(&xas, -EBUSY); + else + *id = xas.xa_index; + xas_store(&xas, entry); + xas_clear_mark(&xas, XA_FREE_MARK); + } while (__xas_nomem(&xas, gfp)); + + return xas_error(&xas); +} +EXPORT_SYMBOL(__xa_alloc); + +/** + * __xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @entry: New entry. + * @limit: Range of allocated ID. + * @next: Pointer to next ID to allocate. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * The search for an empty entry will start at @next and will wrap + * around if necessary. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: 0 if the allocation succeeded without wrapping. 1 if the + * allocation succeeded after wrapping, -ENOMEM if memory could not be + * allocated or -EBUSY if there are no free entries in @limit. 
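+ *
+ * Caller sketch (@xa must have been initialised with XA_FLAGS_ALLOC;
+ * @next_id and @item are placeholders, and xa_limit_32b is assumed to be
+ * provided by the matching xarray.h):
+ *
+ *    u32 id;
+ *    int err;
+ *
+ *    xa_lock(&xa);
+ *    err = __xa_alloc_cyclic(&xa, &id, item, xa_limit_32b, &next_id,
+ *                            GFP_KERNEL);
+ *    xa_unlock(&xa);
+ *    if (err < 0)
+ *            return err;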
+ */ +int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, + struct xa_limit limit, u32 *next, gfp_t gfp) +{ + u32 min = limit.min; + int ret; + + limit.min = max(min, *next); + ret = __xa_alloc(xa, id, entry, limit, gfp); + if ((xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED) && ret == 0) { + xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED; + ret = 1; + } + + if (ret < 0 && limit.min > min) { + limit.min = min; + ret = __xa_alloc(xa, id, entry, limit, gfp); + if (ret == 0) + ret = 1; + } + + if (ret >= 0) { + *next = *id + 1; + if (*next == 0) + xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED; + } + return ret; +} +EXPORT_SYMBOL(__xa_alloc_cyclic); + +/** + * __xa_set_mark() - Set this mark on this entry while locked. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Attempting to set a mark on a %NULL entry does not succeed. + * + * Context: Any context. Expects xa_lock to be held on entry. + */ +void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry = xas_load(&xas); + + if (entry) + xas_set_mark(&xas, mark); +} +EXPORT_SYMBOL(__xa_set_mark); + +/** + * __xa_clear_mark() - Clear this mark on this entry while locked. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Context: Any context. Expects xa_lock to be held on entry. + */ +void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry = xas_load(&xas); + + if (entry) + xas_clear_mark(&xas, mark); +} +EXPORT_SYMBOL(__xa_clear_mark); + +/** + * xa_get_mark() - Inquire whether this mark is set on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * This function uses the RCU read lock, so the result may be out of date + * by the time it returns. If you need the result to be stable, use a lock. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: True if the entry at @index has this mark set, false if it doesn't. + */ +bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry; + + rcu_read_lock(); + entry = xas_start(&xas); + while (xas_get_mark(&xas, mark)) { + if (!xa_is_node(entry)) + goto found; + entry = xas_descend(&xas, xa_to_node(entry)); + } + rcu_read_unlock(); + return false; + found: + rcu_read_unlock(); + return true; +} +EXPORT_SYMBOL(xa_get_mark); + +/** + * xa_set_mark() - Set this mark on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Attempting to set a mark on a %NULL entry does not succeed. + * + * Context: Process context. Takes and releases the xa_lock. + */ +void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + xa_lock(xa); + __xa_set_mark(xa, index, mark); + xa_unlock(xa); +} +EXPORT_SYMBOL(xa_set_mark); + +/** + * xa_clear_mark() - Clear this mark on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Clearing a mark always succeeds. + * + * Context: Process context. Takes and releases the xa_lock. + */ +void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + xa_lock(xa); + __xa_clear_mark(xa, index, mark); + xa_unlock(xa); +} +EXPORT_SYMBOL(xa_clear_mark); + +/** + * xa_find() - Search the XArray for an entry. + * @xa: XArray. + * @indexp: Pointer to an index. + * @max: Maximum index to search to. + * @filter: Selection criterion. 
+ * + * Finds the entry in @xa which matches the @filter, and has the lowest + * index that is at least @indexp and no more than @max. + * If an entry is found, @indexp is updated to be the index of the entry. + * This function is protected by the RCU read lock, so it may not find + * entries which are being simultaneously added. It will not return an + * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The entry, if found, otherwise %NULL. + */ +void *xa_find(struct xarray *xa, unsigned long *indexp, + unsigned long max, xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp); + void *entry; + + rcu_read_lock(); + do { + if ((__force unsigned int)filter < XA_MAX_MARKS) + entry = xas_find_marked(&xas, max, filter); + else + entry = xas_find(&xas, max); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + if (entry) + *indexp = xas.xa_index; + return entry; +} +EXPORT_SYMBOL(xa_find); + +/** + * xa_find_after() - Search the XArray for a present entry. + * @xa: XArray. + * @indexp: Pointer to an index. + * @max: Maximum index to search to. + * @filter: Selection criterion. + * + * Finds the entry in @xa which matches the @filter and has the lowest + * index that is above @indexp and no more than @max. + * If an entry is found, @indexp is updated to be the index of the entry. + * This function is protected by the RCU read lock, so it may miss entries + * which are being simultaneously added. It will not return an + * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The pointer, if found, otherwise %NULL. + */ +void *xa_find_after(struct xarray *xa, unsigned long *indexp, + unsigned long max, xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp + 1); + void *entry; + + rcu_read_lock(); + for (;;) { + if ((__force unsigned int)filter < XA_MAX_MARKS) + entry = xas_find_marked(&xas, max, filter); + else + entry = xas_find(&xas, max); + if (xas.xa_node == XAS_BOUNDS) + break; + if (xas.xa_shift) { + if (xas.xa_index & ((1UL << xas.xa_shift) - 1)) + continue; + } else { + if (xas.xa_offset < (xas.xa_index & XA_CHUNK_MASK)) + continue; + } + if (!xas_retry(&xas, entry)) + break; + } + rcu_read_unlock(); + + if (entry) + *indexp = xas.xa_index; + return entry; +} +EXPORT_SYMBOL(xa_find_after); + +static unsigned int xas_extract_present(struct xa_state *xas, void **dst, + unsigned long max, unsigned int n) +{ + void *entry; + unsigned int i = 0; + + rcu_read_lock(); + xas_for_each(xas, entry, max) { + if (xas_retry(xas, entry)) + continue; + dst[i++] = entry; + if (i == n) + break; + } + rcu_read_unlock(); + + return i; +} + +static unsigned int xas_extract_marked(struct xa_state *xas, void **dst, + unsigned long max, unsigned int n, xa_mark_t mark) +{ + void *entry; + unsigned int i = 0; + + rcu_read_lock(); + xas_for_each_marked(xas, entry, max, mark) { + if (xas_retry(xas, entry)) + continue; + dst[i++] = entry; + if (i == n) + break; + } + rcu_read_unlock(); + + return i; +} + +/** + * xa_extract() - Copy selected entries from the XArray into a normal array. + * @xa: The source XArray to copy from. + * @dst: The buffer to copy entries into. + * @start: The first index in the XArray eligible to be selected. + * @max: The last index in the XArray eligible to be selected. + * @n: The maximum number of entries to copy. + * @filter: Selection criterion. + * + * Copies up to @n entries that match @filter from the XArray. 
The + * copied entries will have indices between @start and @max, inclusive. + * + * The @filter may be an XArray mark value, in which case entries which are + * marked with that mark will be copied. It may also be %XA_PRESENT, in + * which case all entries which are not %NULL will be copied. + * + * The entries returned may not represent a snapshot of the XArray at a + * moment in time. For example, if another thread stores to index 5, then + * index 10, calling xa_extract() may return the old contents of index 5 + * and the new contents of index 10. Indices not modified while this + * function is running will not be skipped. + * + * If you need stronger guarantees, holding the xa_lock across calls to this + * function will prevent concurrent modification. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The number of entries copied. + */ +unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start, + unsigned long max, unsigned int n, xa_mark_t filter) +{ + XA_STATE(xas, xa, start); + + if (!n) + return 0; + + if ((__force unsigned int)filter < XA_MAX_MARKS) + return xas_extract_marked(&xas, dst, max, n, filter); + return xas_extract_present(&xas, dst, max, n); +} +EXPORT_SYMBOL(xa_extract); + +/** + * xa_destroy() - Free all internal data structures. + * @xa: XArray. + * + * After calling this function, the XArray is empty and has freed all memory + * allocated for its internal data structures. You are responsible for + * freeing the objects referenced by the XArray. + * + * Context: Any context. Takes and releases the xa_lock, interrupt-safe. + */ +void xa_destroy(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + unsigned long flags; + void *entry; + + xas.xa_node = NULL; + xas_lock_irqsave(&xas, flags); + entry = xa_head_locked(xa); + RCU_INIT_POINTER(xa->xa_head, NULL); + xas_init_marks(&xas); + if (xa_zero_busy(xa)) + xa_mark_clear(xa, XA_FREE_MARK); + /* lockdep checks we're still holding the lock in xas_free_nodes() */ + if (xa_is_node(entry)) + xas_free_nodes(&xas, xa_to_node(entry)); + xas_unlock_irqrestore(&xas, flags); +} +EXPORT_SYMBOL(xa_destroy); + +#ifdef XA_DEBUG +void xa_dump_node(const struct xa_node *node) +{ + unsigned i, j; + + if (!node) + return; + if ((unsigned long)node & 3) { + pr_cont("node %px\n", node); + return; + } + + pr_cont("node %px %s %d parent %px shift %d count %d values %d " + "array %px list %px %px marks", + node, node->parent ? 
"offset" : "max", node->offset, + node->parent, node->shift, node->count, node->nr_values, + node->array, node->private_list.prev, node->private_list.next); + for (i = 0; i < XA_MAX_MARKS; i++) + for (j = 0; j < XA_MARK_LONGS; j++) + pr_cont(" %lx", node->marks[i][j]); + pr_cont("\n"); +} + +void xa_dump_index(unsigned long index, unsigned int shift) +{ + if (!shift) + pr_info("%lu: ", index); + else if (shift >= BITS_PER_LONG) + pr_info("0-%lu: ", ~0UL); + else + pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1)); +} + +void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift) +{ + if (!entry) + return; + + xa_dump_index(index, shift); + + if (xa_is_node(entry)) { + if (shift == 0) { + pr_cont("%px\n", entry); + } else { + unsigned long i; + struct xa_node *node = xa_to_node(entry); + xa_dump_node(node); + for (i = 0; i < XA_CHUNK_SIZE; i++) + xa_dump_entry(node->slots[i], + index + (i << node->shift), node->shift); + } + } else if (xa_is_value(entry)) + pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry), + xa_to_value(entry), entry); + else if (!xa_is_internal(entry)) + pr_cont("%px\n", entry); + else if (xa_is_retry(entry)) + pr_cont("retry (%ld)\n", xa_to_internal(entry)); + else if (xa_is_sibling(entry)) + pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry)); + else if (xa_is_zero(entry)) + pr_cont("zero (%ld)\n", xa_to_internal(entry)); + else + pr_cont("UNKNOWN ENTRY (%px)\n", entry); +} + +void xa_dump(const struct xarray *xa) +{ + void *entry = xa->xa_head; + unsigned int shift = 0; + + pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry, + xa->xa_flags, xa_marked(xa, XA_MARK_0), + xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2)); + if (xa_is_node(entry)) + shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT; + xa_dump_entry(entry, 0, shift); +} +#endif +#endif /* !HAVE_XARRAY_SUPPORT */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/module.c b/drivers/staging/lustrefsx/libcfs/libcfs/module.c new file mode 100644 index 0000000000000..82a75a7fd3e43 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/module.c @@ -0,0 +1,936 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include "tracefile.h" + +struct lnet_debugfs_symlink_def { + const char *name; + const char *target; +}; + +static struct dentry *lnet_debugfs_root; + +BLOCKING_NOTIFIER_HEAD(libcfs_ioctl_list); +EXPORT_SYMBOL(libcfs_ioctl_list); + +static inline size_t libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) +{ + size_t len = sizeof(*data); + + len += (data->ioc_inllen1 + 7) & ~7; + len += (data->ioc_inllen2 + 7) & ~7; + return len; +} + +static bool libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data) +{ + const int maxlen = 1 << 30; + if (data->ioc_hdr.ioc_len > maxlen) + return true; + + if (data->ioc_inllen1 > maxlen) + return true; + + if (data->ioc_inllen2 > maxlen) + return true; + + if (data->ioc_inlbuf1 && !data->ioc_inllen1) + return true; + + if (data->ioc_inlbuf2 && !data->ioc_inllen2) + return true; + + if (data->ioc_pbuf1 && !data->ioc_plen1) + return true; + + if (data->ioc_pbuf2 && !data->ioc_plen2) + return true; + + if (data->ioc_plen1 && !data->ioc_pbuf1) + return true; + + if (data->ioc_plen2 && !data->ioc_pbuf2) + return true; + + if (libcfs_ioctl_packlen(data) != data->ioc_hdr.ioc_len) + return true; + + if (data->ioc_inllen1 && + data->ioc_bulk[((data->ioc_inllen1 + 7) & ~7) + + data->ioc_inllen2 - 1] != '\0') + return true; + + return false; +} + +int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data) +{ + ENTRY; + + if (libcfs_ioctl_is_invalid(data)) { + CERROR("libcfs ioctl: parameter not correctly formatted\n"); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1 != 0) + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + + if (data->ioc_inllen2 != 0) + data->ioc_inlbuf2 = &data->ioc_bulk[0] + + cfs_size_round(data->ioc_inllen1); + + RETURN(0); +} + +int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp, + struct libcfs_ioctl_hdr __user *uhdr) +{ + struct libcfs_ioctl_hdr hdr; + int err; + + ENTRY; + if (copy_from_user(&hdr, uhdr, sizeof(hdr))) + RETURN(-EFAULT); + + if (hdr.ioc_version != LIBCFS_IOCTL_VERSION && + hdr.ioc_version != LIBCFS_IOCTL_VERSION2) { + CERROR("libcfs ioctl: version mismatch expected %#x, got %#x\n", + LIBCFS_IOCTL_VERSION, hdr.ioc_version); + RETURN(-EINVAL); + } + + if (hdr.ioc_len < sizeof(struct libcfs_ioctl_hdr)) { + CERROR("libcfs ioctl: user buffer too small for ioctl\n"); + RETURN(-EINVAL); + } + + if (hdr.ioc_len > LIBCFS_IOC_DATA_MAX) { + CERROR("libcfs ioctl: user buffer is too large %d/%d\n", + hdr.ioc_len, LIBCFS_IOC_DATA_MAX); + RETURN(-EINVAL); + } + + LIBCFS_ALLOC(*hdr_pp, hdr.ioc_len); + if (*hdr_pp == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(*hdr_pp, uhdr, hdr.ioc_len)) + GOTO(free, err = -EFAULT); + + if ((*hdr_pp)->ioc_version != hdr.ioc_version || + (*hdr_pp)->ioc_len != hdr.ioc_len) { + GOTO(free, err = -EINVAL); + } + + RETURN(0); + +free: + LIBCFS_FREE(*hdr_pp, hdr.ioc_len); + RETURN(err); +} + +static int libcfs_ioctl(unsigned long cmd, void __user *uparam) +{ + struct libcfs_ioctl_data *data = NULL; + struct libcfs_ioctl_hdr *hdr; + int err; + ENTRY; + + /* 'cmd' and permissions get checked in our arch-specific caller */ + err = libcfs_ioctl_getdata(&hdr, uparam); + if (err != 0) { + CDEBUG_LIMIT(D_ERROR, + "libcfs ioctl: data header error %d\n", err); + RETURN(err); + } + + if 
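/*
 * Sketch of the packing arithmetic above, not part of the patch: both
 * inline buffers are rounded up to an 8-byte boundary before being
 * appended to ioc_bulk, so the length checked against ioc_hdr.ioc_len is
 *
 *	len = sizeof(struct libcfs_ioctl_data)
 *	    + round_up(ioc_inllen1, 8)
 *	    + round_up(ioc_inllen2, 8);
 *
 * e.g. inllen1 = 10 and inllen2 = 20 contribute 16 + 24 bytes, and
 * libcfs_ioctl_data_adjust() then points ioc_inlbuf1/ioc_inlbuf2 at those
 * two rounded slots inside ioc_bulk.
 */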
(hdr->ioc_version == LIBCFS_IOCTL_VERSION) { + /* The libcfs_ioctl_data_adjust() function performs adjustment + * operations on the libcfs_ioctl_data structure to make + * it usable by the code. This doesn't need to be called + * for new data structures added. */ + data = container_of(hdr, struct libcfs_ioctl_data, ioc_hdr); + err = libcfs_ioctl_data_adjust(data); + if (err != 0) + GOTO(out, err); + } + + CDEBUG(D_IOCTL, "libcfs ioctl cmd %lu\n", cmd); + switch (cmd) { + case IOC_LIBCFS_CLEAR_DEBUG: + libcfs_debug_clear_buffer(); + break; + case IOC_LIBCFS_MARK_DEBUG: + if (data == NULL || + data->ioc_inlbuf1 == NULL || + data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') + GOTO(out, err = -EINVAL); + + libcfs_debug_mark_buffer(data->ioc_inlbuf1); + break; + + default: + err = blocking_notifier_call_chain(&libcfs_ioctl_list, + cmd, hdr); + if (!(err & NOTIFY_STOP_MASK)) + /* No-one claimed the ioctl */ + err = -EINVAL; + else + err = notifier_to_errno(err); + if (copy_to_user(uparam, hdr, hdr->ioc_len) && !err) + err = -EFAULT; + break; + } +out: + LIBCFS_FREE(hdr, hdr->ioc_len); + RETURN(err); +} + +static long +libcfs_psdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (_IOC_TYPE(cmd) != IOC_LIBCFS_TYPE || + _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR || + _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR) { + CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", + _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); + return -EINVAL; + } + + return libcfs_ioctl(cmd, (void __user *)arg); +} + +static const struct file_operations libcfs_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = libcfs_psdev_ioctl, +}; + +static struct miscdevice libcfs_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "lnet", + .fops = &libcfs_fops, +}; + +static int proc_dobitmasks(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + const int tmpstrlen = 512; + char *tmpstr = NULL; + int rc; + size_t nob = *lenp; + loff_t pos = *ppos; + unsigned int *mask = table->data; + int is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0; + int is_printk = (mask == &libcfs_printk) ? 
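/*
 * Sketch, not part of the patch: how another module would claim commands
 * that fall through to the notifier chain above. A handler must set
 * NOTIFY_STOP_MASK (e.g. by returning NOTIFY_STOP on success or
 * notifier_from_errno() on failure), otherwise the command is treated as
 * unclaimed and the caller gets -EINVAL. IOC_LIBCFS_MY_CMD, my_handle()
 * and the other "my_" names are hypothetical.
 *
 *	static int my_ioctl_handler(struct notifier_block *nb,
 *				    unsigned long cmd, void *vhdr)
 *	{
 *		struct libcfs_ioctl_hdr *hdr = vhdr;
 *		int rc;
 *
 *		if (cmd != IOC_LIBCFS_MY_CMD)
 *			return NOTIFY_DONE;
 *		rc = my_handle(hdr);
 *		return rc ? notifier_from_errno(rc) : NOTIFY_STOP;
 *	}
 *
 *	static struct notifier_block my_ioctl_nb = {
 *		.notifier_call = my_ioctl_handler,
 *	};
 *
 *	blocking_notifier_chain_register(&libcfs_ioctl_list, &my_ioctl_nb);
 */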
1 : 0; + + if (!write) { + tmpstr = kmalloc(tmpstrlen, GFP_KERNEL | __GFP_ZERO); + if (!tmpstr) + return -ENOMEM; + libcfs_debug_mask2str(tmpstr, tmpstrlen, *mask, is_subsys); + rc = strlen(tmpstr); + + if (pos >= rc) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); + } + } else { + tmpstr = memdup_user_nul(buffer, nob); + if (IS_ERR(tmpstr)) + return PTR_ERR(tmpstr); + + rc = libcfs_debug_str2mask(mask, strim(tmpstr), is_subsys); + /* Always print LBUG/LASSERT to console, so keep this mask */ + if (is_printk) + *mask |= D_EMERG; + } + + kfree(tmpstr); + return rc; +} + +static int min_watchdog_ratelimit; /* disable ratelimiting */ +static int max_watchdog_ratelimit = (24*60*60); /* limit to once per day */ + +static int proc_dump_kernel(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + size_t nob = *lenp; + + if (!write) + return 0; + + return cfs_trace_dump_debug_buffer_usrstr(buffer, nob); +} + +static int proc_daemon_file(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + size_t nob = *lenp; + loff_t pos = *ppos; + + if (!write) { + int len = strlen(cfs_tracefile); + + if (pos >= len) + return 0; + + return cfs_trace_copyout_string(buffer, nob, + cfs_tracefile + pos, "\n"); + } + + return cfs_trace_daemon_command_usrstr(buffer, nob); +} + +static int libcfs_force_lbug(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + if (write) + LBUG(); + return 0; +} + +static int proc_fail_loc(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc; + long old_fail_loc = cfs_fail_loc; + + if (!*lenp || *ppos) { + *lenp = 0; + return 0; + } + + if (write) { + char *kbuf = memdup_user_nul(buffer, *lenp); + + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + rc = kstrtoul(kbuf, 0, &cfs_fail_loc); + kfree(kbuf); + *ppos += *lenp; + } else { + char kbuf[64/3+3]; + + rc = scnprintf(kbuf, sizeof(kbuf), "%lu\n", cfs_fail_loc); + if (copy_to_user(buffer, kbuf, rc)) + rc = -EFAULT; + else { + *lenp = rc; + *ppos += rc; + } + } + + if (old_fail_loc != cfs_fail_loc) { + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } + return rc; +} + +int debugfs_doint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc; + + if (!*lenp || *ppos) { + *lenp = 0; + return 0; + } + + if (write) { + char *kbuf = memdup_user_nul(buffer, *lenp); + int val; + + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + rc = kstrtoint(kbuf, 0, &val); + kfree(kbuf); + if (!rc) { + if (table->extra1 && val < *(int *)table->extra1) + val = *(int *)table->extra1; + if (table->extra2 && val > *(int *)table->extra2) + val = *(int *)table->extra2; + *(int *)table->data = val; + } + *ppos += *lenp; + } else { + char kbuf[64/3+3]; + + rc = scnprintf(kbuf, sizeof(kbuf), "%u\n", *(int *)table->data); + if (copy_to_user(buffer, kbuf, rc)) + rc = -EFAULT; + else { + *lenp = rc; + *ppos += rc; + } + } + + return rc; +} +EXPORT_SYMBOL(debugfs_doint); + +static int debugfs_dou64(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc; + + if (!*lenp || *ppos) { + *lenp = 0; + return 0; + } + + if (write) { + char *kbuf = memdup_user_nul(buffer, *lenp); + unsigned long long val; + + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + rc = kstrtoull(kbuf, 0, &val); + kfree(kbuf); + if (!rc) + *(u64 *)table->data = val; + *ppos += *lenp; + } else { + char kbuf[64/3+3]; + + rc = 
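/*
 * Note on the handler pattern above, illustrative and not part of the
 * patch: these debugfs handlers are single-shot - a read or write at a
 * non-zero offset returns 0 bytes - and the "64/3+3" scratch buffers are
 * sized for the worst-case decimal form of a 64-bit value (at most 20
 * digits) plus sign, newline and NUL. For debugfs_doint(), ->extra1 and
 * ->extra2 act as an inclusive clamp: with the watchdog_ratelimit entry
 * below, a write of 999999 is stored as 86400 and a write of -5 as 0.
 */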
scnprintf(kbuf, sizeof(kbuf), "%llu\n", + (unsigned long long)*(u64 *)table->data); + if (copy_to_user(buffer, kbuf, rc)) + rc = -EFAULT; + else { + *lenp = rc; + *ppos += rc; + } + } + + return rc; +} + +static int debugfs_dostring(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int len = *lenp; + char *kbuf = table->data; + + if (!len || *ppos) { + *lenp = 0; + return 0; + } + if (len > table->maxlen) + len = table->maxlen; + if (write) { + if (copy_from_user(kbuf, buffer, len)) + return -EFAULT; + memset(kbuf+len, 0, table->maxlen - len); + *ppos = *lenp; + } else { + len = strnlen(kbuf, len); + if (copy_to_user(buffer, kbuf, len)) + return -EFAULT; + if (len < *lenp) { + if (copy_to_user(buffer+len, "\n", 1)) + return -EFAULT; + len += 1; + } + *ppos += len; + *lenp -= len; + } + return len; +} + +static int proc_cpt_table(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + size_t nob = *lenp; + loff_t pos = *ppos; + char *buf = NULL; + int len = 4096; + int rc = 0; + + if (write) + return -EPERM; + + while (1) { + LIBCFS_ALLOC(buf, len); + if (buf == NULL) + return -ENOMEM; + + rc = cfs_cpt_table_print(cfs_cpt_tab, buf, len); + if (rc >= 0) + break; + + if (rc == -EFBIG) { + LIBCFS_FREE(buf, len); + len <<= 1; + continue; + } + goto out; + } + + if (pos >= rc) { + rc = 0; + goto out; + } + + rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); +out: + if (buf != NULL) + LIBCFS_FREE(buf, len); + return rc; +} + +static int proc_cpt_distance(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + size_t nob = *lenp; + loff_t pos = *ppos; + char *buf = NULL; + int len = 4096; + int rc = 0; + + if (write) + return -EPERM; + + while (1) { + LIBCFS_ALLOC(buf, len); + if (buf == NULL) + return -ENOMEM; + + rc = cfs_cpt_distance_print(cfs_cpt_tab, buf, len); + if (rc >= 0) + break; + + if (rc == -EFBIG) { + LIBCFS_FREE(buf, len); + len <<= 1; + continue; + } + goto out; + } + + if (pos >= rc) { + rc = 0; + goto out; + } + + rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); + out: + if (buf != NULL) + LIBCFS_FREE(buf, len); + return rc; +} + +static struct ctl_table lnet_table[] = { + { + .procname = "debug", + .data = &libcfs_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + .procname = "subsystem_debug", + .data = &libcfs_subsystem_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + .procname = "printk", + .data = &libcfs_printk, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + .procname = "cpu_partition_table", + .maxlen = 128, + .mode = 0444, + .proc_handler = &proc_cpt_table, + }, + { + .procname = "cpu_partition_distance", + .maxlen = 128, + .mode = 0444, + .proc_handler = &proc_cpt_distance, + }, + { + .procname = "debug_log_upcall", + .data = lnet_debug_log_upcall, + .maxlen = sizeof(lnet_debug_log_upcall), + .mode = 0644, + .proc_handler = &debugfs_dostring, + }, + { + .procname = "lnet_memused", + .data = (u64 *)&libcfs_kmem.counter, + .maxlen = sizeof(u64), + .mode = 0444, + .proc_handler = &debugfs_dou64, + }, + { + .procname = "catastrophe", + .data = &libcfs_catastrophe, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &debugfs_doint, + }, + { + .procname = "dump_kernel", + .maxlen = 256, + .mode = 0200, + .proc_handler = &proc_dump_kernel, + }, + { + .procname = "daemon_file", + .mode = 0644, + .maxlen = 
256, + .proc_handler = &proc_daemon_file, + }, + { + .procname = "watchdog_ratelimit", + .data = &libcfs_watchdog_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &debugfs_doint, + .extra1 = &min_watchdog_ratelimit, + .extra2 = &max_watchdog_ratelimit, + }, + { + .procname = "force_lbug", + .data = NULL, + .maxlen = 0, + .mode = 0200, + .proc_handler = &libcfs_force_lbug + }, + { + .procname = "fail_loc", + .data = &cfs_fail_loc, + .maxlen = sizeof(cfs_fail_loc), + .mode = 0644, + .proc_handler = &proc_fail_loc + }, + { + .procname = "fail_val", + .data = &cfs_fail_val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &debugfs_doint + }, + { + .procname = "fail_err", + .data = &cfs_fail_err, + .maxlen = sizeof(cfs_fail_err), + .mode = 0644, + .proc_handler = &debugfs_doint, + }, + { + } +}; + +static const struct lnet_debugfs_symlink_def lnet_debugfs_symlinks[] = { + { .name = "console_ratelimit", + .target = "../../../module/libcfs/parameters/libcfs_console_ratelimit" }, + { .name = "debug_path", + .target = "../../../module/libcfs/parameters/libcfs_debug_file_path" }, + { .name = "panic_on_lbug", + .target = "../../../module/libcfs/parameters/libcfs_panic_on_lbug" }, + { .name = "console_backoff", + .target = "../../../module/libcfs/parameters/libcfs_console_backoff" }, + { .name = "debug_mb", + .target = "../../../module/libcfs/parameters/libcfs_debug_mb" }, + { .name = "console_min_delay_centisecs", + .target = "../../../module/libcfs/parameters/libcfs_console_min_delay" }, + { .name = "console_max_delay_centisecs", + .target = "../../../module/libcfs/parameters/libcfs_console_max_delay" }, + { .name = NULL }, +}; + +static ssize_t lnet_debugfs_read(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct ctl_table *table = filp->private_data; + loff_t old_pos = *ppos; + ssize_t rc = -EINVAL; + + if (table) + rc = table->proc_handler(table, 0, (void __user *)buf, + &count, ppos); + /* + * On success, the length read is either in error or in count. 
+ * If ppos changed, then use count, else use error + */ + if (!rc && *ppos != old_pos) + rc = count; + else if (rc > 0) + *ppos += rc; + + return rc; +} + +static ssize_t lnet_debugfs_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ctl_table *table = filp->private_data; + loff_t old_pos = *ppos; + ssize_t rc = -EINVAL; + + if (table) + rc = table->proc_handler(table, 1, (void __user *)buf, &count, + ppos); + if (rc) + return rc; + + if (*ppos == old_pos) + *ppos += count; + + return count; +} + +static const struct file_operations lnet_debugfs_file_operations_rw = { + .open = simple_open, + .read = lnet_debugfs_read, + .write = lnet_debugfs_write, + .llseek = default_llseek, +}; + +static const struct file_operations lnet_debugfs_file_operations_ro = { + .open = simple_open, + .read = lnet_debugfs_read, + .llseek = default_llseek, +}; + +static const struct file_operations lnet_debugfs_file_operations_wo = { + .open = simple_open, + .write = lnet_debugfs_write, + .llseek = default_llseek, +}; + +static const struct file_operations *lnet_debugfs_fops_select(umode_t mode) +{ + if (!(mode & S_IWUGO)) + return &lnet_debugfs_file_operations_ro; + + if (!(mode & S_IRUGO)) + return &lnet_debugfs_file_operations_wo; + + return &lnet_debugfs_file_operations_rw; +} + +void lnet_insert_debugfs(struct ctl_table *table) +{ + if (!lnet_debugfs_root) + lnet_debugfs_root = debugfs_create_dir("lnet", NULL); + + /* Even if we cannot create, just ignore it altogether) */ + if (IS_ERR_OR_NULL(lnet_debugfs_root)) + return; + + /* We don't save the dentry returned in next two calls, because + * we don't call debugfs_remove() but rather remove_recursive() + */ + for (; table && table->procname; table++) + debugfs_create_file(table->procname, table->mode, + lnet_debugfs_root, table, + lnet_debugfs_fops_select(table->mode)); +} +EXPORT_SYMBOL_GPL(lnet_insert_debugfs); + +static void lnet_insert_debugfs_links( + const struct lnet_debugfs_symlink_def *symlinks) +{ + for (; symlinks && symlinks->name; symlinks++) + debugfs_create_symlink(symlinks->name, lnet_debugfs_root, + symlinks->target); +} + +void lnet_remove_debugfs(struct ctl_table *table) +{ + for (; table && table->procname; table++) { + struct qstr dname = QSTR_INIT(table->procname, + strlen(table->procname)); + struct dentry *dentry; + + dentry = d_hash_and_lookup(lnet_debugfs_root, &dname); + debugfs_remove(dentry); + } +} +EXPORT_SYMBOL_GPL(lnet_remove_debugfs); + +static int __init libcfs_init(void) +{ + int rc; + + cfs_arch_init(); + + init_libcfs_vfree_atomic(); + + rc = libcfs_debug_init(5 * 1024 * 1024); + if (rc < 0) { + pr_err("LustreError: libcfs_debug_init: rc = %d\n", rc); + return (rc); + } + + cfs_debug_init(); + + rc = cfs_cpu_init(); + if (rc != 0) + goto cleanup_debug; + + rc = misc_register(&libcfs_dev); + if (rc) { + CERROR("misc_register: error %d\n", rc); + goto cleanup_cpu; + } + + rc = cfs_wi_startup(); + if (rc) { + CERROR("initialize workitem: error %d\n", rc); + goto cleanup_deregister; + } + + cfs_rehash_wq = alloc_workqueue("cfs_rh", WQ_SYSFS, 4); + if (!cfs_rehash_wq) { + rc = -ENOMEM; + CERROR("libcfs: failed to start rehash workqueue: rc = %d\n", + rc); + goto cleanup_deregister; + } + + rc = cfs_crypto_register(); + if (rc) { + CERROR("cfs_crypto_regster: error %d\n", rc); + goto cleanup_wi; + } + + lnet_insert_debugfs(lnet_table); + if (!IS_ERR_OR_NULL(lnet_debugfs_root)) + lnet_insert_debugfs_links(lnet_debugfs_symlinks); + + rc = llcrypt_init(); + if (rc) { + 
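/*
 * Sketch, not part of the patch: how another LNet component could expose
 * a tunable through the same helpers. Only ->procname, ->mode and a
 * ->proc_handler are needed; the file is created under the shared "lnet"
 * debugfs directory and torn down with lnet_remove_debugfs(). The
 * "my_tunable"/"my_table" names are hypothetical.
 *
 *	static int my_tunable;
 *
 *	static struct ctl_table my_table[] = {
 *		{
 *			.procname	= "my_tunable",
 *			.data		= &my_tunable,
 *			.maxlen		= sizeof(int),
 *			.mode		= 0644,
 *			.proc_handler	= &debugfs_doint,
 *		},
 *		{ }
 *	};
 *
 *	lnet_insert_debugfs(my_table);
 *	...
 *	lnet_remove_debugfs(my_table);
 */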
CERROR("llcrypt_init: error %d\n", rc); + goto cleanup_crypto; + } + + CDEBUG(D_OTHER, "portals setup OK\n"); + return 0; +cleanup_crypto: + cfs_crypto_unregister(); +cleanup_wi: + cfs_wi_shutdown(); +cleanup_deregister: + misc_deregister(&libcfs_dev); +cleanup_cpu: + cfs_cpu_fini(); +cleanup_debug: + libcfs_debug_cleanup(); + return rc; +} + +static void __exit libcfs_exit(void) +{ + int rc; + + /* Remove everthing */ + debugfs_remove_recursive(lnet_debugfs_root); + lnet_debugfs_root = NULL; + + CDEBUG(D_MALLOC, "before Portals cleanup: kmem %lld\n", + libcfs_kmem_read()); + + llcrypt_exit(); + + if (cfs_rehash_wq) { + destroy_workqueue(cfs_rehash_wq); + cfs_rehash_wq = NULL; + } + + cfs_crypto_unregister(); + cfs_wi_shutdown(); + + misc_deregister(&libcfs_dev); + + cfs_cpu_fini(); + + /* the below message is checked in test-framework.sh check_mem_leak() */ + if (libcfs_kmem_read() != 0) + CERROR("Portals memory leaked: %lld bytes\n", + libcfs_kmem_read()); + + rc = libcfs_debug_cleanup(); + if (rc) + pr_err("LustreError: libcfs_debug_cleanup: rc = %d\n", rc); + + exit_libcfs_vfree_atomic(); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre helper library"); +MODULE_VERSION(LIBCFS_VERSION); +MODULE_LICENSE("GPL"); + +module_init(libcfs_init); +module_exit(libcfs_exit); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c new file mode 100644 index 0000000000000..ac473c5eae651 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c @@ -0,0 +1,1213 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/libcfs/tracefile.c + * + * Author: Zach Brown + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "tracefile.h" + +#include +#include +#include +#include +#include +#include +#include +#include + + +enum cfs_trace_buf_type { + CFS_TCD_TYPE_PROC = 0, + CFS_TCD_TYPE_SOFTIRQ, + CFS_TCD_TYPE_IRQ, + CFS_TCD_TYPE_CNT +}; + +union cfs_trace_data_union (*cfs_trace_data[CFS_TCD_TYPE_CNT])[NR_CPUS] __cacheline_aligned; + +/* Pages containing records already processed by daemon. 
+ * Link via ->lru, use size in ->private + */ +static LIST_HEAD(daemon_pages); +static long daemon_pages_count; +static long daemon_pages_max; + +char cfs_tracefile[TRACEFILE_NAME_SIZE]; +long long cfs_tracefile_size = CFS_TRACEFILE_SIZE; + +struct task_struct *tctl_task; + +static atomic_t cfs_tage_allocated = ATOMIC_INIT(0); +static DECLARE_RWSEM(cfs_tracefile_sem); + +/* trace file lock routines */ +/* The walking argument indicates the locking comes from all tcd types + * iterator and we must lock it and dissable local irqs to avoid deadlocks + * with other interrupt locks that might be happening. See LU-1311 + * for details. + */ +int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking) + __acquires(&tcd->tcd_lock) +{ + __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_CNT); + if (tcd->tcd_type == CFS_TCD_TYPE_IRQ) + spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags); + else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ) + spin_lock_bh(&tcd->tcd_lock); + else if (unlikely(walking)) + spin_lock_irq(&tcd->tcd_lock); + else + spin_lock(&tcd->tcd_lock); + return 1; +} + +void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking) + __releases(&tcd->tcd_lock) +{ + __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_CNT); + if (tcd->tcd_type == CFS_TCD_TYPE_IRQ) + spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags); + else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ) + spin_unlock_bh(&tcd->tcd_lock); + else if (unlikely(walking)) + spin_unlock_irq(&tcd->tcd_lock); + else + spin_unlock(&tcd->tcd_lock); +} + +#define cfs_tcd_for_each(tcd, i, j) \ + for (i = 0; i < CFS_TCD_TYPE_CNT && cfs_trace_data[i]; i++) \ + for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd); \ + j < num_possible_cpus(); \ + j++, (tcd) = &(*cfs_trace_data[i])[j].tcd) + +#define cfs_tcd_for_each_type_lock(tcd, i, cpu) \ + for (i = 0; i < CFS_TCD_TYPE_CNT && cfs_trace_data[i] && \ + (tcd = &(*cfs_trace_data[i])[cpu].tcd) && \ + cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++) + +enum cfs_trace_buf_type cfs_trace_buf_idx_get(void) +{ + if (in_irq()) + return CFS_TCD_TYPE_IRQ; + if (in_softirq()) + return CFS_TCD_TYPE_SOFTIRQ; + return CFS_TCD_TYPE_PROC; +} + +static inline struct cfs_trace_cpu_data * +cfs_trace_get_tcd(void) +{ + struct cfs_trace_cpu_data *tcd = + &(*cfs_trace_data[cfs_trace_buf_idx_get()])[get_cpu()].tcd; + + cfs_trace_lock_tcd(tcd, 0); + + return tcd; +} + +static inline void cfs_trace_put_tcd(struct cfs_trace_cpu_data *tcd) +{ + cfs_trace_unlock_tcd(tcd, 0); + + put_cpu(); +} + +static inline struct cfs_trace_page * +cfs_tage_from_list(struct list_head *list) +{ + return list_entry(list, struct cfs_trace_page, linkage); +} + +static struct cfs_trace_page *cfs_tage_alloc(gfp_t gfp) +{ + struct page *page; + struct cfs_trace_page *tage; + + /* My caller is trying to free memory */ + if (!in_interrupt() && (current->flags & PF_MEMALLOC)) + return NULL; + + /* + * Don't spam console with allocation failures: they will be reported + * by upper layer anyway. 
+ */ + gfp |= __GFP_NOWARN; + page = alloc_page(gfp); + if (page == NULL) + return NULL; + + tage = kmalloc(sizeof(*tage), gfp); + if (tage == NULL) { + __free_page(page); + return NULL; + } + + tage->page = page; + atomic_inc(&cfs_tage_allocated); + return tage; +} + +static void cfs_tage_free(struct cfs_trace_page *tage) +{ + __LASSERT(tage != NULL); + __LASSERT(tage->page != NULL); + + __free_page(tage->page); + kfree(tage); + atomic_dec(&cfs_tage_allocated); +} + +static void cfs_tage_to_tail(struct cfs_trace_page *tage, + struct list_head *queue) +{ + __LASSERT(tage != NULL); + __LASSERT(queue != NULL); + + list_move_tail(&tage->linkage, queue); +} + +/* return a page that has 'len' bytes left at the end */ +static struct cfs_trace_page * +cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len) +{ + struct cfs_trace_page *tage; + struct task_struct *tsk; + + if (tcd->tcd_cur_pages > 0) { + __LASSERT(!list_empty(&tcd->tcd_pages)); + tage = cfs_tage_from_list(tcd->tcd_pages.prev); + if (tage->used + len <= PAGE_SIZE) + return tage; + } + + if (tcd->tcd_cur_pages < tcd->tcd_max_pages) { + if (tcd->tcd_cur_stock_pages > 0) { + tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev); + --tcd->tcd_cur_stock_pages; + list_del_init(&tage->linkage); + } else { + tage = cfs_tage_alloc(GFP_ATOMIC); + if (unlikely(tage == NULL)) { + if ((!(current->flags & PF_MEMALLOC) || + in_interrupt()) && printk_ratelimit()) + pr_warn("Lustre: cannot allocate a tage (%ld)\n", + tcd->tcd_cur_pages); + return NULL; + } + } + + tage->used = 0; + tage->cpu = smp_processor_id(); + tage->type = tcd->tcd_type; + list_add_tail(&tage->linkage, &tcd->tcd_pages); + tcd->tcd_cur_pages++; + + tsk = tctl_task; + if (tcd->tcd_cur_pages > 8 && tsk) + /* + * wake up tracefiled to process some pages. + */ + wake_up_process(tsk); + + return tage; + } + return NULL; +} + +static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd) +{ + int pgcount = tcd->tcd_cur_pages / 10; + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + if (printk_ratelimit()) + pr_warn("Lustre: debug daemon buffer overflowed; discarding 10%% of pages (%d of %ld)\n", + pgcount + 1, tcd->tcd_cur_pages); + + INIT_LIST_HEAD(&pc.pc_pages); + + list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) { + if (pgcount-- == 0) + break; + + list_del(&tage->linkage); + cfs_tage_free(tage); + tcd->tcd_cur_pages--; + } +} + +/* return a page that has 'len' bytes left at the end */ +static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd, + unsigned long len) +{ + struct cfs_trace_page *tage; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. 
+ */ + + if (len > PAGE_SIZE) { + pr_err("LustreError: cowardly refusing to write %lu bytes in a page\n", + len); + return NULL; + } + + tage = cfs_trace_get_tage_try(tcd, len); + if (tage != NULL) + return tage; + if (tctl_task) + cfs_tcd_shrink(tcd); + if (tcd->tcd_cur_pages > 0) { + tage = cfs_tage_from_list(tcd->tcd_pages.next); + tage->used = 0; + cfs_tage_to_tail(tage, &tcd->tcd_pages); + } + return tage; +} + +static void cfs_set_ptldebug_header(struct ptldebug_header *header, + struct libcfs_debug_msg_data *msgdata, + unsigned long stack) +{ + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + + header->ph_subsys = msgdata->msg_subsys; + header->ph_mask = msgdata->msg_mask; + header->ph_cpu_id = smp_processor_id(); + header->ph_type = cfs_trace_buf_idx_get(); + /* y2038 safe since all user space treats this as unsigned, but + * will overflow in 2106 + */ + header->ph_sec = (u32)ts.tv_sec; + header->ph_usec = ts.tv_nsec / NSEC_PER_USEC; + header->ph_stack = stack; + header->ph_pid = current->pid; + header->ph_line_num = msgdata->msg_line; + header->ph_extern_pid = 0; +} + +static void cfs_vprint_to_console(struct ptldebug_header *hdr, int mask, + struct va_format *vaf, const char *file, + const char *fn) +{ + char *prefix = "Lustre"; + + if (hdr->ph_subsys == S_LND || hdr->ph_subsys == S_LNET) + prefix = "LNet"; + + if (mask & D_CONSOLE) { + if (mask & D_EMERG) + pr_emerg("%sError: %pV", prefix, vaf); + else if (mask & D_ERROR) + pr_err("%sError: %pV", prefix, vaf); + else if (mask & D_WARNING) + pr_warn("%s: %pV", prefix, vaf); + else if (mask & libcfs_printk) + pr_info("%s: %pV", prefix, vaf); + } else { + if (mask & D_EMERG) + pr_emerg("%sError: %d:%d:(%s:%d:%s()) %pV", prefix, + hdr->ph_pid, hdr->ph_extern_pid, file, + hdr->ph_line_num, fn, vaf); + else if (mask & D_ERROR) + pr_err("%sError: %d:%d:(%s:%d:%s()) %pV", prefix, + hdr->ph_pid, hdr->ph_extern_pid, file, + hdr->ph_line_num, fn, vaf); + else if (mask & D_WARNING) + pr_warn("%s: %d:%d:(%s:%d:%s()) %pV", prefix, + hdr->ph_pid, hdr->ph_extern_pid, file, + hdr->ph_line_num, fn, vaf); + else if (mask & (D_CONSOLE | libcfs_printk)) + pr_info("%s: %pV", prefix, vaf); + } +} + +static void cfs_print_to_console(struct ptldebug_header *hdr, int mask, + const char *file, const char *fn, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + cfs_vprint_to_console(hdr, mask, &vaf, file, fn); +} + +int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, + const char *format, ...) +{ + struct cfs_trace_cpu_data *tcd = NULL; + struct ptldebug_header header = {0}; + struct cfs_trace_page *tage; + /* string_buf is used only if tcd != NULL, and is always set then */ + char *string_buf = NULL; + char *debug_buf; + int known_size; + int needed = 85; /* seeded with average message length */ + int max_nob; + va_list ap; + int retry; + int mask = msgdata->msg_mask; + char *file = (char *)msgdata->msg_file; + struct cfs_debug_limit_state *cdls = msgdata->msg_cdls; + + if (strchr(file, '/')) + file = strrchr(file, '/') + 1; + + tcd = cfs_trace_get_tcd(); + + /* cfs_trace_get_tcd() grabs a lock, which disables preemption and + * pins us to a particular CPU. This avoids an smp_processor_id() + * warning on Linux when debugging is enabled. 
+ */ + cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK()); + + if (!tcd) /* arch may not log in IRQ context */ + goto console; + + if (tcd->tcd_cur_pages == 0) + header.ph_flags |= PH_FLAG_FIRST_RECORD; + + if (tcd->tcd_shutting_down) { + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + known_size = strlen(file) + 1; + if (msgdata->msg_fn) + known_size += strlen(msgdata->msg_fn) + 1; + + if (libcfs_debug_binary) + known_size += sizeof(header); + + /* + * May perform an additional pass to update 'needed' and increase + * tage buffer size to match vsnprintf reported size required + * On the second pass (retry=1) use vscnprintf [which returns + * number of bytes written not including the terminating nul] + * to clarify `needed` is used as number of bytes written + * for the remainder of this function + */ + for (retry = 0; retry < 2; retry++) { + tage = cfs_trace_get_tage(tcd, needed + known_size + 1); + if (!tage) { + if (needed + known_size > PAGE_SIZE) + mask |= D_ERROR; + + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + string_buf = (char *)page_address(tage->page) + + tage->used + known_size; + + max_nob = PAGE_SIZE - tage->used - known_size; + if (max_nob <= 0) { + pr_emerg("LustreError: negative max_nob: %d\n", + max_nob); + mask |= D_ERROR; + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + va_start(ap, format); + if (retry) + needed = vscnprintf(string_buf, max_nob, format, ap); + else + needed = vsnprintf(string_buf, max_nob, format, ap); + va_end(ap); + + if (needed < max_nob) /* well. printing ok.. */ + break; + } + + /* `needed` is actual bytes written to string_buf */ + if (*(string_buf + needed - 1) != '\n') { + pr_info("Lustre: format at %s:%d:%s doesn't end in newline\n", + file, msgdata->msg_line, msgdata->msg_fn); + } + + header.ph_len = known_size + needed; + debug_buf = (char *)page_address(tage->page) + tage->used; + + if (libcfs_debug_binary) { + memcpy(debug_buf, &header, sizeof(header)); + tage->used += sizeof(header); + debug_buf += sizeof(header); + } + + strlcpy(debug_buf, file, PAGE_SIZE - tage->used); + tage->used += strlen(file) + 1; + debug_buf += strlen(file) + 1; + + if (msgdata->msg_fn) { + strlcpy(debug_buf, msgdata->msg_fn, PAGE_SIZE - tage->used); + tage->used += strlen(msgdata->msg_fn) + 1; + debug_buf += strlen(msgdata->msg_fn) + 1; + } + + __LASSERT(debug_buf == string_buf); + + tage->used += needed; + __LASSERT(tage->used <= PAGE_SIZE); + +console: + if ((mask & libcfs_printk) == 0) { + /* no console output requested */ + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } + + if (cdls != NULL) { + if (libcfs_console_ratelimit && + cdls->cdls_next != 0 && /* not first time ever */ + time_before(jiffies, cdls->cdls_next)) { + /* skipping a console message */ + cdls->cdls_count++; + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } + + if (time_after(jiffies, cdls->cdls_next + + libcfs_console_max_delay + + cfs_time_seconds(10))) { + /* last timeout was a long time ago */ + cdls->cdls_delay /= libcfs_console_backoff * 4; + } else { + cdls->cdls_delay *= libcfs_console_backoff; + } + + if (cdls->cdls_delay < libcfs_console_min_delay) + cdls->cdls_delay = libcfs_console_min_delay; + else if (cdls->cdls_delay > libcfs_console_max_delay) + cdls->cdls_delay = libcfs_console_max_delay; + + /* ensure cdls_next is never zero after it's been seen */ + cdls->cdls_next = (jiffies + cdls->cdls_delay) | 1; + } + + if (tcd) { + cfs_print_to_console(&header, mask, file, msgdata->msg_fn, + "%s", 
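/*
 * Illustrative note on the rate limiting above, not part of the patch:
 * while messages keep arriving before cdls_next expires they are counted
 * and skipped; each time one does get through shortly after the previous
 * one, cdls_delay is multiplied by libcfs_console_backoff, and only after
 * the console has been quiet for longer than libcfs_console_max_delay
 * plus ten seconds is it divided by four times the backoff. The result is
 * always clamped to [libcfs_console_min_delay, libcfs_console_max_delay],
 * and oring bit 0 into cdls_next keeps it non-zero, since zero is
 * reserved for "never printed before".
 */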
string_buf); + cfs_trace_put_tcd(tcd); + } else { + struct va_format vaf; + + va_start(ap, format); + vaf.fmt = format; + vaf.va = ≈ + cfs_vprint_to_console(&header, mask, + &vaf, file, msgdata->msg_fn); + va_end(ap); + } + + if (cdls != NULL && cdls->cdls_count != 0) { + cfs_print_to_console(&header, mask, file, + msgdata->msg_fn, + "Skipped %d previous similar message%s\n", + cdls->cdls_count, + (cdls->cdls_count > 1) ? "s" : ""); + + cdls->cdls_count = 0; + } + + return 0; +} +EXPORT_SYMBOL(libcfs_debug_msg); + +void +cfs_trace_assertion_failed(const char *str, + struct libcfs_debug_msg_data *msgdata) +{ + struct ptldebug_header hdr; + + libcfs_panic_in_progress = 1; + libcfs_catastrophe = 1; + smp_mb(); + + cfs_set_ptldebug_header(&hdr, msgdata, CDEBUG_STACK()); + + cfs_print_to_console(&hdr, D_EMERG, msgdata->msg_file, msgdata->msg_fn, + "%s", str); + + panic("Lustre debug assertion failure\n"); + + /* not reached */ +} + +static void +panic_collect_pages(struct page_collection *pc) +{ + /* Do the collect_pages job on a single CPU: assumes that all other + * CPUs have been stopped during a panic. If this isn't true for some + * arch, this will have to be implemented separately in each arch. */ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + + INIT_LIST_HEAD(&pc->pc_pages); + + cfs_tcd_for_each(tcd, i, j) { + list_splice_init(&tcd->tcd_pages, &pc->pc_pages); + tcd->tcd_cur_pages = 0; + } +} + +static void collect_pages_on_all_cpus(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + list_splice_init(&tcd->tcd_pages, &pc->pc_pages); + tcd->tcd_cur_pages = 0; + } + } +} + +static void collect_pages(struct page_collection *pc) +{ + INIT_LIST_HEAD(&pc->pc_pages); + + if (libcfs_panic_in_progress) + panic_collect_pages(pc); + else + collect_pages_on_all_cpus(pc); +} + +static void put_pages_back_on_all_cpus(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + struct list_head *cur_head; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + cur_head = tcd->tcd_pages.next; + + list_for_each_entry_safe(tage, tmp, &pc->pc_pages, + linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + if (tage->cpu != cpu || tage->type != i) + continue; + + cfs_tage_to_tail(tage, cur_head); + tcd->tcd_cur_pages++; + } + } + } +} + +static void put_pages_back(struct page_collection *pc) +{ + if (!libcfs_panic_in_progress) + put_pages_back_on_all_cpus(pc); +} + +#ifdef LNET_DUMP_ON_PANIC +void cfs_trace_debug_print(void) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + struct page *page; + + collect_pages(&pc); + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + char *p, *file, *fn; + + __LASSERT_TAGE_INVARIANT(tage); + + page = tage->page; + p = page_address(page); + while (p < ((char *)page_address(page) + tage->used)) { + struct ptldebug_header *hdr; + int len; + hdr = (void *)p; + p += sizeof(*hdr); + file = p; + p += strlen(file) + 1; + fn = p; + p += strlen(fn) + 1; + len = hdr->ph_len - (int)(p - (char *)hdr); + + cfs_print_to_console(hdr, D_EMERG, file, fn, + "%.*s", len, p); + + p += len; + } + + list_del(&tage->linkage); + cfs_tage_free(tage); + } + down_write(&cfs_tracefile_sem); + while ((page = list_first_entry_or_null(&daemon_pages, + struct page, lru)) != NULL) { + char *p, *file, *fn; + + p = page_address(page); + while (p 
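/*
 * Layout of one record inside a trace page, as the parsing loops in
 * cfs_trace_debug_print() expect it (illustrative, not part of the
 * patch):
 *
 *	p -> struct ptldebug_header
 *	     file name, NUL-terminated
 *	     function name, NUL-terminated
 *	     message text
 *
 * The message length is recovered as hdr->ph_len minus the bytes already
 * consumed for the header and the two strings, i.e.
 * len = hdr->ph_len - (p - (char *)hdr) once p points at the text.
 */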
< ((char *)page_address(page) + page->private)) { + struct ptldebug_header *hdr; + int len; + + hdr = (void *)p; + p += sizeof(*hdr); + file = p; + p += strlen(file) + 1; + fn = p; + p += strlen(fn) + 1; + len = hdr->ph_len - (int)(p - (char *)hdr); + + cfs_print_to_console(hdr, D_EMERG, file, fn, + "%.*s", len, p); + + p += len; + } + list_del_init(&page->lru); + daemon_pages_count -= 1; + put_page(page); + } + up_write(&cfs_tracefile_sem); +} +#endif /* LNET_DUMP_ON_PANIC */ + +int cfs_tracefile_dump_all_pages(char *filename) +{ + struct page_collection pc; + struct file *filp; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + char *buf; + struct page *page; + int rc; + + down_write(&cfs_tracefile_sem); + + filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + filp = NULL; + pr_err("LustreError: can't open %s for dump: rc = %d\n", + filename, rc); + goto out; + } + + collect_pages(&pc); + if (list_empty(&pc.pc_pages)) { + rc = 0; + goto close; + } + + /* ok, for now, just write the pages. in the future we'll be building + * iobufs with the pages and calling generic_direct_IO */ + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + buf = kmap(tage->page); + rc = cfs_kernel_write(filp, buf, tage->used, &filp->f_pos); + kunmap(tage->page); + if (rc != (int)tage->used) { + pr_warn("Lustre: wanted to write %u but wrote %d\n", + tage->used, rc); + put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + break; + } + list_del(&tage->linkage); + cfs_tage_free(tage); + } + while ((page = list_first_entry_or_null(&daemon_pages, + struct page, lru)) != NULL) { + buf = page_address(page); + rc = cfs_kernel_write(filp, buf, page->private, &filp->f_pos); + if (rc != (int)page->private) { + pr_warn("Lustre: wanted to write %u but wrote %d\n", + (int)page->private, rc); + break; + } + list_del(&page->lru); + daemon_pages_count -= 1; + put_page(page); + } + rc = vfs_fsync_range(filp, 0, LLONG_MAX, 1); + if (rc) + pr_err("LustreError: sync returns: rc = %d\n", rc); +close: + filp_close(filp, NULL); +out: + up_write(&cfs_tracefile_sem); + return rc; +} + +void cfs_trace_flush_pages(void) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct page *page; + + collect_pages(&pc); + while (!list_empty(&pc.pc_pages)) { + tage = list_first_entry(&pc.pc_pages, + struct cfs_trace_page, linkage); + __LASSERT_TAGE_INVARIANT(tage); + + list_del(&tage->linkage); + cfs_tage_free(tage); + } + + down_write(&cfs_tracefile_sem); + while ((page = list_first_entry_or_null(&daemon_pages, + struct page, lru)) != NULL) { + list_del(&page->lru); + daemon_pages_count -= 1; + put_page(page); + } + up_write(&cfs_tracefile_sem); +} + +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_buffer, char *append) +{ + /* NB if 'append' != NULL, it's a single character to append to the + * copied out string - usually "\n", for /proc entries and "" (i.e. 
a + * terminating zero byte) for sysctl entries */ + int nob = strlen(knl_buffer); + + if (nob > usr_buffer_nob) + nob = usr_buffer_nob; + + if (copy_to_user(usr_buffer, knl_buffer, nob)) + return -EFAULT; + + if (append != NULL && nob < usr_buffer_nob) { + if (copy_to_user(usr_buffer + nob, append, 1)) + return -EFAULT; + + nob++; + } + + return nob; +} +EXPORT_SYMBOL(cfs_trace_copyout_string); + +int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob) +{ + char *str; + char *path; + int rc; + + str = memdup_user_nul(usr_str, usr_str_nob); + if (IS_ERR(str)) + return PTR_ERR(str); + + path = strim(str); + if (path[0] != '/') + rc = -EINVAL; + else + rc = cfs_tracefile_dump_all_pages(path); + kfree(str); + + return rc; +} + +int cfs_trace_daemon_command(char *str) +{ + int rc = 0; + + down_write(&cfs_tracefile_sem); + + if (strcmp(str, "stop") == 0) { + up_write(&cfs_tracefile_sem); + cfs_trace_stop_thread(); + down_write(&cfs_tracefile_sem); + memset(cfs_tracefile, 0, sizeof(cfs_tracefile)); + + } else if (strncmp(str, "size=", 5) == 0) { + unsigned long tmp; + + rc = kstrtoul(str + 5, 10, &tmp); + if (!rc) { + if (tmp < 10 || tmp > 20480) + cfs_tracefile_size = CFS_TRACEFILE_SIZE; + else + cfs_tracefile_size = tmp << 20; + } + } else if (strlen(str) >= sizeof(cfs_tracefile)) { + rc = -ENAMETOOLONG; + } else if (str[0] != '/') { + rc = -EINVAL; + } else { + strcpy(cfs_tracefile, str); + + pr_info("Lustre: debug daemon will attempt to start writing to %s (%lukB max)\n", + cfs_tracefile, (long)(cfs_tracefile_size >> 10)); + + cfs_trace_start_thread(); + } + + up_write(&cfs_tracefile_sem); + return rc; +} + +int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob) +{ + char *str; + int rc; + + str = memdup_user_nul(usr_str, usr_str_nob); + if (IS_ERR(str)) + return PTR_ERR(str); + + rc = cfs_trace_daemon_command(strim(str)); + kfree(str); + + return rc; +} + +int cfs_trace_set_debug_mb(int mb) +{ + int i; + int j; + unsigned long pages; + unsigned long total_mb = (cfs_totalram_pages() >> (20 - PAGE_SHIFT)); + unsigned long limit = max_t(unsigned long, 512, (total_mb * 4) / 5); + struct cfs_trace_cpu_data *tcd; + + if (mb < num_possible_cpus()) { + pr_warn("Lustre: %d MB is too small for debug buffer size, setting it to %d MB.\n", + mb, num_possible_cpus()); + mb = num_possible_cpus(); + } + + if (mb > limit) { + pr_warn("Lustre: %d MB is too large for debug buffer size, setting it to %lu MB.\n", + mb, limit); + mb = limit; + } + + mb /= num_possible_cpus(); + pages = mb << (20 - PAGE_SHIFT); + + down_write(&cfs_tracefile_sem); + + cfs_tcd_for_each(tcd, i, j) + tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100; + + daemon_pages_max = pages; + up_write(&cfs_tracefile_sem); + + return mb; +} + +int cfs_trace_get_debug_mb(void) +{ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + int total_pages = 0; + + down_read(&cfs_tracefile_sem); + + cfs_tcd_for_each(tcd, i, j) + total_pages += tcd->tcd_max_pages; + + up_read(&cfs_tracefile_sem); + + return (total_pages >> (20 - PAGE_SHIFT)) + 1; +} + +static int tracefiled(void *arg) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + struct file *filp; + char *buf; + int last_loop = 0; + int rc; + + while (!last_loop) { + LIST_HEAD(for_daemon_pages); + int for_daemon_pages_count = 0; + schedule_timeout_interruptible(cfs_time_seconds(1)); + if (kthread_should_stop()) + last_loop = 1; + collect_pages(&pc); + if (list_empty(&pc.pc_pages)) + continue; + + filp = 
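/*
 * Worked example for cfs_trace_set_debug_mb() above, illustrative and not
 * part of the patch: on an 8-CPU machine with ample RAM, a request of
 * 256 MB becomes 256 / 8 = 32 MB per CPU, i.e. 32 << (20 - PAGE_SHIFT)
 * pages, which the per-type factors (80%/10%/10% for process, softirq and
 * IRQ context) then split per CPU. Requests smaller than
 * num_possible_cpus() MB are raised to that minimum, and requests above
 * 80% of total memory (never less than 512 MB) are clamped down first.
 */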
NULL; + down_read(&cfs_tracefile_sem); + if (cfs_tracefile[0] != 0) { + filp = filp_open(cfs_tracefile, + O_CREAT | O_RDWR | O_LARGEFILE, + 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + filp = NULL; + pr_warn("Lustre: couldn't open %s: rc = %d\n", + cfs_tracefile, rc); + } + } + up_read(&cfs_tracefile_sem); + + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + __LASSERT_TAGE_INVARIANT(tage); + + if (filp) { + struct dentry *de = file_dentry(filp); + static loff_t f_pos; + + if (f_pos >= (off_t)cfs_tracefile_size) + f_pos = 0; + else if (f_pos > i_size_read(de->d_inode)) + f_pos = i_size_read(de->d_inode); + + buf = kmap(tage->page); + rc = cfs_kernel_write(filp, buf, tage->used, + &f_pos); + kunmap(tage->page); + if (rc != (int)tage->used) { + pr_warn("Lustre: wanted to write %u but wrote %d\n", + tage->used, rc); + put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + break; + } + } + list_del_init(&tage->linkage); + list_add_tail(&tage->page->lru, &for_daemon_pages); + for_daemon_pages_count += 1; + + tage->page->private = (int)tage->used; + kfree(tage); + atomic_dec(&cfs_tage_allocated); + } + + if (filp) + filp_close(filp, NULL); + + down_write(&cfs_tracefile_sem); + list_splice_tail(&for_daemon_pages, &daemon_pages); + daemon_pages_count += for_daemon_pages_count; + while (daemon_pages_count > daemon_pages_max) { + struct page *p = list_first_entry(&daemon_pages, + struct page, lru); + list_del(&p->lru); + put_page(p); + daemon_pages_count -= 1; + } + up_write(&cfs_tracefile_sem); + + if (!list_empty(&pc.pc_pages)) { + int i; + + pr_alert("Lustre: trace pages aren't empty\n"); + pr_err("Lustre: total cpus(%d): ", num_possible_cpus()); + for (i = 0; i < num_possible_cpus(); i++) + if (cpu_online(i)) + pr_cont("%d(on) ", i); + else + pr_cont("%d(off) ", i); + pr_cont("\n"); + + i = 0; + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, + linkage) + pr_err("Lustre: page %d belongs to cpu %d\n", + ++i, tage->cpu); + pr_err("Lustre: There are %d pages unwritten\n", i); + } + __LASSERT(list_empty(&pc.pc_pages)); + } + + return 0; +} + +int cfs_trace_start_thread(void) +{ + struct task_struct *tsk; + int rc = 0; + + if (tctl_task) + return 0; + + tsk = kthread_create(tracefiled, NULL, "ktracefiled"); + if (IS_ERR(tsk)) + rc = -ECHILD; + else if (cmpxchg(&tctl_task, NULL, tsk) != NULL) + /* already running */ + kthread_stop(tsk); + else + wake_up_process(tsk); + + return rc; +} + +void cfs_trace_stop_thread(void) +{ + struct task_struct *tsk; + + tsk = xchg(&tctl_task, NULL); + if (tsk) { + pr_info("Lustre: shutting down debug daemon thread...\n"); + kthread_stop(tsk); + } +} + +/* percents to share the total debug memory for each type */ +static unsigned int pages_factor[CFS_TCD_TYPE_CNT] = { + 80, /* 80% pages for CFS_TCD_TYPE_PROC */ + 10, /* 10% pages for CFS_TCD_TYPE_SOFTIRQ */ + 10 /* 10% pages for CFS_TCD_TYPE_IRQ */ +}; + +int cfs_tracefile_init(int max_pages) +{ + struct cfs_trace_cpu_data *tcd; + int i; + int j; + + /* initialize trace_data */ + memset(cfs_trace_data, 0, sizeof(cfs_trace_data)); + for (i = 0; i < CFS_TCD_TYPE_CNT; i++) { + cfs_trace_data[i] = + kmalloc_array(num_possible_cpus(), + sizeof(union cfs_trace_data_union), + GFP_KERNEL); + if (!cfs_trace_data[i]) + goto out_trace_data; + } + + /* arch related info initialized */ + cfs_tcd_for_each(tcd, i, j) { + int factor = pages_factor[i]; + + spin_lock_init(&tcd->tcd_lock); + tcd->tcd_pages_factor = factor; + tcd->tcd_type = i; + tcd->tcd_cpu = j; + + INIT_LIST_HEAD(&tcd->tcd_pages); + 
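/*
 * Note on the daemon start/stop above, not part of the patch:
 * cfs_trace_start_thread() publishes the new task with
 * cmpxchg(&tctl_task, NULL, tsk), so if two callers race only one thread
 * is kept and the loser stops the thread it just created, while
 * cfs_trace_stop_thread() claims the pointer back with
 * xchg(&tctl_task, NULL) so that exactly one caller performs the
 * kthread_stop().
 */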
INIT_LIST_HEAD(&tcd->tcd_stock_pages); + tcd->tcd_cur_pages = 0; + tcd->tcd_cur_stock_pages = 0; + tcd->tcd_max_pages = (max_pages * factor) / 100; + LASSERT(tcd->tcd_max_pages > 0); + tcd->tcd_shutting_down = 0; + } + daemon_pages_max = max_pages; + + return 0; + +out_trace_data: + for (i = 0; cfs_trace_data[i]; i++) { + kfree(cfs_trace_data[i]); + cfs_trace_data[i] = NULL; + } + pr_err("lnet: Not enough memory\n"); + return -ENOMEM; +} + +static void trace_cleanup_on_all_cpus(void) +{ + struct cfs_trace_cpu_data *tcd; + struct cfs_trace_page *tage; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + if (!tcd->tcd_pages_factor) + /* Not initialised */ + continue; + tcd->tcd_shutting_down = 1; + + while (!list_empty(&tcd->tcd_pages)) { + tage = list_first_entry(&tcd->tcd_pages, + struct cfs_trace_page, + linkage); + __LASSERT_TAGE_INVARIANT(tage); + + list_del(&tage->linkage); + cfs_tage_free(tage); + } + tcd->tcd_cur_pages = 0; + } + } +} + +static void cfs_trace_cleanup(void) +{ + struct page_collection pc; + int i; + + INIT_LIST_HEAD(&pc.pc_pages); + + trace_cleanup_on_all_cpus(); + + for (i = 0; i < CFS_TCD_TYPE_CNT && cfs_trace_data[i]; i++) { + kfree(cfs_trace_data[i]); + cfs_trace_data[i] = NULL; + } +} + +void cfs_tracefile_exit(void) +{ + cfs_trace_stop_thread(); + cfs_trace_flush_pages(); + cfs_trace_cleanup(); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h new file mode 100644 index 0000000000000..406a8d5a1fc5c --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h @@ -0,0 +1,190 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LIBCFS_TRACEFILE_H__ +#define __LIBCFS_TRACEFILE_H__ + +#include + +#define TRACEFILE_NAME_SIZE 1024 +extern char cfs_tracefile[TRACEFILE_NAME_SIZE]; +extern long long cfs_tracefile_size; + +/** + * The path of debug log dump upcall script. 
+ */ +extern char lnet_debug_log_upcall[1024]; + +int cfs_tracefile_dump_all_pages(char *filename); +void cfs_trace_debug_print(void); +void cfs_trace_flush_pages(void); +int cfs_trace_start_thread(void); +void cfs_trace_stop_thread(void); +int cfs_tracefile_init(int max_pages); +void cfs_tracefile_exit(void); + + + +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_str, char *append); +int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_daemon_command(char *str); +int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_set_debug_mb(int mb); +int cfs_trace_get_debug_mb(void); + +extern int libcfs_panic_in_progress; + +#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT)) +#define TCD_STOCK_PAGES (TCD_MAX_PAGES) +#define CFS_TRACEFILE_SIZE (500 << 20) + +union cfs_trace_data_union { + struct cfs_trace_cpu_data { + /* + * Even though this structure is meant to be per-CPU, locking + * is needed because in some places the data may be accessed + * from other CPUs. This lock is directly used in trace_get_tcd + * and trace_put_tcd, which are called in libcfs_debug_msg and + * tcd_for_each_type_lock + */ + spinlock_t tcd_lock; + unsigned long tcd_lock_flags; + + /* + * pages with trace records not yet processed by tracefiled. + */ + struct list_head tcd_pages; + /* number of pages on ->tcd_pages */ + unsigned long tcd_cur_pages; + + /* + * Maximal number of pages allowed on ->tcd_pages + * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current + * implementation. + */ + unsigned long tcd_max_pages; + + /* + * preallocated pages to write trace records into. Pages from + * ->tcd_stock_pages are moved to ->tcd_pages by + * portals_debug_msg(). + * + * This list is necessary, because on some platforms it's + * impossible to perform efficient atomic page allocation in a + * non-blockable context. + * + * Such platforms fill ->tcd_stock_pages "on occasion", when + * tracing code is entered in blockable context. + * + * trace_get_tage_try() tries to get a page from + * ->tcd_stock_pages first and resorts to atomic page + * allocation only if this queue is empty. ->tcd_stock_pages + * is replenished when tracing code is entered in blocking + * context (darwin-tracefile.c:trace_get_tcd()). We try to + * maintain TCD_STOCK_PAGES (40 by default) pages in this + * queue. Atomic allocation is only required if more than + * TCD_STOCK_PAGES pagesful are consumed by trace records all + * emitted in non-blocking contexts. Which is quite unlikely. + */ + struct list_head tcd_stock_pages; + /* number of pages on ->tcd_stock_pages */ + unsigned long tcd_cur_stock_pages; + + unsigned short tcd_shutting_down; + unsigned short tcd_cpu; + unsigned short tcd_type; + /* The factors to share debug memory. */ + unsigned short tcd_pages_factor; + } tcd; + char __pad[L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data))]; +}; + +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct page_collection { + struct list_head pc_pages; +}; + +/* + * small data-structure for each page owned by tracefiled. 
+ */ +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct cfs_trace_page { + /* + * page itself + */ + struct page *page; + /* + * linkage into one of the lists in trace_data_union or + * page_collection + */ + struct list_head linkage; + /* + * number of bytes used within this page + */ + unsigned int used; + /* + * cpu that owns this page + */ + unsigned short cpu; + /* + * type(context) of this page + */ + unsigned short type; +}; + +int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd, + struct cfs_trace_page *tage); + +extern void cfs_trace_assertion_failed(const char *str, + struct libcfs_debug_msg_data *m); + +/* ASSERTION that is safe to use within the debug system */ +#define __LASSERT(cond) \ +do { \ + if (unlikely(!(cond))) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ + cfs_trace_assertion_failed("ASSERTION("#cond") failed", \ + &msgdata); \ + } \ +} while (0) + +#define __LASSERT_TAGE_INVARIANT(tage) \ +do { \ + __LASSERT(tage != NULL); \ + __LASSERT(tage->page != NULL); \ + __LASSERT(tage->used <= PAGE_SIZE); \ + __LASSERT(page_count(tage->page) > 0); \ +} while (0) + +#endif /* __LIBCFS_TRACEFILE_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c new file mode 100644 index 0000000000000..15e3d330b3bea --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c @@ -0,0 +1,191 @@ +/* + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * Copyright (c) 2014, 2017, Intel Corporation. + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define __USE_FILE_OFFSET64 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct ioc_dev { + const char *dev_name; + int dev_fd; +}; + +static struct ioc_dev ioc_dev_list[10]; + +#ifndef ARRAY_SIZE +# define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof((a)[0]))) +#endif /* !ARRAY_SIZE */ + +static int +open_ioc_dev(int dev_id) +{ + const char *dev_name; + + if (dev_id < 0 || dev_id >= ARRAY_SIZE(ioc_dev_list)) { + errno = EINVAL; + return -errno; + } + + dev_name = ioc_dev_list[dev_id].dev_name; + if (!dev_name) { + fprintf(stderr, "unknown device id: %d\n", dev_id); + errno = EINVAL; + return -errno; + } + + if (ioc_dev_list[dev_id].dev_fd < 0) { + int fd = open(dev_name, O_RDWR); + + if (fd < 0) { + fprintf(stderr, "opening %s failed: %s\n" + "hint: the kernel modules may not be loaded\n", + dev_name, strerror(errno)); + return -errno; + } + ioc_dev_list[dev_id].dev_fd = fd; + } + + return ioc_dev_list[dev_id].dev_fd; +} + +int l_ioctl(int dev_id, unsigned int opc, void *buf) +{ + int fd, rc; + + fd = open_ioc_dev(dev_id); + if (fd < 0) + return fd; + + rc = ioctl(fd, opc, buf); + + return rc < 0 ? 
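/*
 * Userspace usage sketch, not part of the patch: a tool registers the
 * misc device once, then issues ioctls by device id; the fd is opened
 * lazily on first use and cached in ioc_dev_list. MY_DEV_ID is a
 * hypothetical slot number, and the header fields shown are the minimum
 * libcfs_ioctl_getdata() expects for a command without inline buffers.
 *
 *	struct libcfs_ioctl_data data = { 0 };
 *
 *	register_ioc_dev(MY_DEV_ID, "/dev/lnet");
 *
 *	data.ioc_hdr.ioc_version = LIBCFS_IOCTL_VERSION;
 *	data.ioc_hdr.ioc_len = sizeof(data);
 *	if (l_ioctl(MY_DEV_ID, IOC_LIBCFS_CLEAR_DEBUG, &data) < 0)
 *		fprintf(stderr, "ioctl failed: %s\n", strerror(errno));
 *
 *	unregister_ioc_dev(MY_DEV_ID);
 */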
-errno : rc; +} + +/* register a device to send ioctls to. */ +int +register_ioc_dev(int dev_id, const char *dev_name) +{ + if (dev_id < 0 || + dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])) + return -EINVAL; + + unregister_ioc_dev(dev_id); + + ioc_dev_list[dev_id].dev_name = dev_name; + ioc_dev_list[dev_id].dev_fd = -1; + + return dev_id; +} + +void +unregister_ioc_dev(int dev_id) +{ + if (dev_id < 0 || + dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])) + return; + + if (ioc_dev_list[dev_id].dev_name && + ioc_dev_list[dev_id].dev_fd >= 0) + close(ioc_dev_list[dev_id].dev_fd); + + ioc_dev_list[dev_id].dev_name = NULL; + ioc_dev_list[dev_id].dev_fd = -1; +} + +static inline size_t libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) +{ + size_t len = sizeof(*data); + + len += (data->ioc_inllen1 + 7) & ~7; + len += (data->ioc_inllen2 + 7) & ~7; + return len; +} + +int libcfs_ioctl_pack(struct libcfs_ioctl_data *data, char **pbuf, int max) +{ + char *ptr; + struct libcfs_ioctl_data *overlay; + + data->ioc_hdr.ioc_len = libcfs_ioctl_packlen(data); + data->ioc_hdr.ioc_version = LIBCFS_IOCTL_VERSION; + + if (*pbuf && libcfs_ioctl_packlen(data) > max) + return 1; + if (!*pbuf) + *pbuf = malloc(data->ioc_hdr.ioc_len); + if (!*pbuf) + return 1; + overlay = (struct libcfs_ioctl_data *)*pbuf; + memcpy(*pbuf, data, sizeof(*data)); + + ptr = overlay->ioc_bulk; + if (data->ioc_inlbuf1) { + memcpy((char *)ptr, (const char *)data->ioc_inlbuf1, + data->ioc_inllen1); + ptr += ((data->ioc_inllen1 + 7) & ~7); + } + if (data->ioc_inlbuf2) { + memcpy((char *)ptr, (const char *)data->ioc_inlbuf2, + data->ioc_inllen2); + ptr += ((data->ioc_inllen2 + 7) & ~7); + } + + return 0; +} + +void +libcfs_ioctl_unpack(struct libcfs_ioctl_data *data, char *pbuf) +{ + struct libcfs_ioctl_data *overlay = (struct libcfs_ioctl_data *)pbuf; + char *ptr; + + /* Preserve the caller's buffer pointers */ + overlay->ioc_inlbuf1 = data->ioc_inlbuf1; + overlay->ioc_inlbuf2 = data->ioc_inlbuf2; + + memcpy(data, pbuf, sizeof(*data)); + ptr = &overlay->ioc_bulk[0]; + + if (data->ioc_inlbuf1) { + memcpy((char *)data->ioc_inlbuf1, (const char *)ptr, + data->ioc_inllen1); + ptr += ((data->ioc_inllen1 + 7) & ~7); + } + if (data->ioc_inlbuf2) { + memcpy((char *)data->ioc_inlbuf2, (const char *)ptr, + data->ioc_inllen2); + ptr += ((data->ioc_inllen2 + 7) & ~7); + } +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c new file mode 100644 index 0000000000000..780a8ab1ac21f --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c @@ -0,0 +1,1647 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. 
All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/libcfs/util/nidstrings.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#ifdef HAVE_NETDB_H +# include +#endif + +/* max value for numeric network address */ +#define MAX_NUMERIC_VALUE 0xffffffff + +#define IPSTRING_LENGTH 16 + +/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids + * consistent in all conversion functions. Some code fragments are copied + * around for the sake of clarity... + */ + +/* CAVEAT EMPTOR! Racey temporary buffer allocation! + * Choose the number of nidstrings to support the MAXIMUM expected number of + * concurrent users. If there are more, the returned string will be volatile. + * NB this number must allow for a process to be descheduled for a timeslice + * between getting its string and using it. + */ + +static char libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE]; +static int libcfs_nidstring_idx; + +char * +libcfs_next_nidstring(void) +{ + char *str; + + str = libcfs_nidstrings[libcfs_nidstring_idx++]; + if (libcfs_nidstring_idx == + sizeof(libcfs_nidstrings)/sizeof(libcfs_nidstrings[0])) + libcfs_nidstring_idx = 0; + + return str; +} + +static int +libcfs_lo_str2addr(const char *str, int nob, __u32 *addr) +{ + *addr = 0; + return 1; +} + +static void +libcfs_ip_addr2str(__u32 addr, char *str, size_t size) +{ + snprintf(str, size, "%u.%u.%u.%u", + (addr >> 24) & 0xff, (addr >> 16) & 0xff, + (addr >> 8) & 0xff, addr & 0xff); +} + +/* CAVEAT EMPTOR XscanfX + * I use "%n" at the end of a sscanf format to detect trailing junk. However + * sscanf may return immediately if it sees the terminating '0' in a string, so + * I initialise the %n variable to the expected length. If sscanf sets it; + * fine, if it doesn't, then the scan ended at the end of the string, which is + * fine too :) */ +static int +libcfs_ip_str2addr(const char *str, int nob, __u32 *addr) +{ + unsigned int a; + unsigned int b; + unsigned int c; + unsigned int d; + int n = nob; /* XscanfX */ + + /* numeric IP? */ + if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 && + n == nob && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) { + *addr = ((a<<24)|(b<<16)|(c<<8)|d); + return 1; + } + +#ifdef HAVE_GETHOSTBYNAME + /* known hostname? 
*/ + if (('a' <= str[0] && str[0] <= 'z') || + ('A' <= str[0] && str[0] <= 'Z')) { + char *tmp; + + tmp = calloc(1, nob + 1); + if (tmp != NULL) { + struct hostent *he; + + memcpy(tmp, str, nob); + tmp[nob] = 0; + + he = gethostbyname(tmp); + + free(tmp); + + if (he != NULL) { + __u32 ip = *(__u32 *)he->h_addr; + + *addr = ntohl(ip); + return 1; + } + } + } +#endif + return 0; +} + +int +cfs_ip_addr_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + struct cfs_lstr src; + int rc; + int i; + + src.ls_str = str; + src.ls_len = len; + i = 0; + + while (src.ls_str != NULL) { + struct cfs_lstr res; + + if (!cfs_gettok(&src, '.', &res)) { + rc = -EINVAL; + goto out; + } + + rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el); + if (rc != 0) + goto out; + + list_add_tail(&el->el_link, list); + i++; + } + + if (i == 4) + return 0; + + rc = -EINVAL; +out: + cfs_expr_list_free_list(list); + + return rc; +} + +int +cfs_expr2str(struct list_head *list, char *str, size_t size) +{ + struct cfs_expr_list *expr; + struct cfs_range_expr *range; + char tmp[LNET_NIDSTR_SIZE]; + size_t len; + bool first; + bool bracket = false; + char *format; + char *tmpc; + + list_for_each_entry(expr, list, el_link) { + first = true; + list_for_each_entry(range, &expr->el_exprs, re_link) { + if (range->re_lo == range->re_hi) { + snprintf(tmp, + LNET_NIDSTR_SIZE, + "%u.", range->re_lo); + } else if (range->re_lo < range->re_hi) { + if (range->re_stride > 1) { + if (first) + format = "[%u-%u/%u,"; + else + format = "%u-%u/%u,"; + snprintf(tmp, LNET_NIDSTR_SIZE, + format, range->re_lo, + range->re_hi, range->re_stride); + bracket = true; + } else { + if (first) + format = "[%u-%u,"; + else + format = "%u-%u,"; + snprintf(tmp, LNET_NIDSTR_SIZE, + format, range->re_lo, + range->re_hi); + bracket = true; + } + } else { + return -EINVAL; + } + len = strlen(tmp); + size -= (len + 1); + if (size < 0) + return -ENOBUFS; + + strncat(str, tmp, size + len); + first = false; + } + if (bracket) { + tmpc = str + (strlen(str) - 1); + size -= 1; + if (size < 0) + return -ENOBUFS; + *tmpc = ']'; + *(tmpc+1) = '.'; + bracket = false; + } + } + + /* + * get rid of the trailing '.' at the end of the string + * only if we actually had something on the list passed in. 
+ * otherwise we could write outside the array + */ + if (!list_empty(list)) + str[strlen(str)-1] = '\0'; + return size; +} + +static int +libcfs_num_addr_range_expand(struct list_head *addrranges, __u32 *addrs, + int max_addrs) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *range; + int i; + int max_idx = max_addrs - 1; + int addrs_idx = max_idx; + + list_for_each_entry(expr_list, addrranges, el_link) { + list_for_each_entry(range, &expr_list->el_exprs, re_link) { + for (i = range->re_lo; i <= range->re_hi; + i += range->re_stride) { + if (addrs_idx < 0) + return -1; + + addrs[addrs_idx] = i; + addrs_idx--; + } + } + } + + return max_idx - addrs_idx; +} + +static int +libcfs_ip_addr_range_expand(struct list_head *addrranges, __u32 *addrs, + int max_addrs) +{ + int rc = 0; + + rc = cfs_ip_addr_range_gen(addrs, max_addrs, addrranges); + + if (rc == -1) + return rc; + else + return max_addrs - rc - 1; +} + +static int +libcfs_ip_addr_range_print(char *buffer, int count, struct list_head *list) +{ + int i = 0, j = 0; + struct cfs_expr_list *el; + + list_for_each_entry(el, list, el_link) { + assert(j++ < 4); + if (i != 0) + i += scnprintf(buffer + i, count - i, "."); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +static int +cfs_ip_addr_range_gen_recurse(__u32 *ip_list, int *count, int shift, + __u32 result, struct list_head *head_el, + struct cfs_expr_list *octet_el) +{ + __u32 value = 0; + int i; + struct cfs_expr_list *next_octet_el; + struct cfs_range_expr *octet_expr; + + /* + * each octet can have multiple expressions so we need to traverse + * all of the expressions + */ + list_for_each_entry(octet_expr, &octet_el->el_exprs, re_link) { + for (i = octet_expr->re_lo; i <= octet_expr->re_hi; i++) { + if (((i - octet_expr->re_lo) % octet_expr->re_stride) == 0) { + /* + * we have a hit calculate the result and + * pass it forward to the next iteration + * of the recursion. + */ + next_octet_el = + list_entry(octet_el->el_link.next, + typeof(*next_octet_el), + el_link); + value = result | (i << (shift * 8)); + if (next_octet_el->el_link.next != head_el) { + /* + * We still have more octets in + * the IP address so traverse + * that. We're doing a depth first + * recursion here. + */ + if (cfs_ip_addr_range_gen_recurse(ip_list, count, + shift - 1, value, + head_el, + next_octet_el) == -1) + return -1; + } else { + /* + * We have hit a leaf so store the + * calculated IP address in the + * list. If we have run out of + * space stop the recursion. + */ + if (*count == -1) + return -1; + /* add ip to the list */ + ip_list[*count] = value; + (*count)--; + } + } + } + } + return 0; +} + +/* + * only generate maximum of count ip addresses from the given expression + */ +int +cfs_ip_addr_range_gen(__u32 *ip_list, int count, struct list_head *ip_addr_expr) +{ + struct cfs_expr_list *octet_el; + int idx = count - 1; + + octet_el = list_entry(ip_addr_expr->next, typeof(*octet_el), el_link); + + (void) cfs_ip_addr_range_gen_recurse(ip_list, &idx, 3, 0, &octet_el->el_link, octet_el); + + return idx; +} + +/** + * Matches address (\a addr) against address set encoded in \a list. 
+ * + * \retval 1 if \a addr matches + * \retval 0 otherwise + */ +int +cfs_ip_addr_match(__u32 addr, struct list_head *list) +{ + struct cfs_expr_list *el; + int i = 0; + + list_for_each_entry_reverse(el, list, el_link) { + if (!cfs_expr_list_match(addr & 0xff, el)) + return 0; + addr >>= 8; + i++; + } + + return i == 4; +} + +static void +libcfs_decnum_addr2str(__u32 addr, char *str, size_t size) +{ + snprintf(str, size, "%u", addr); +} + +static int +libcfs_num_str2addr(const char *str, int nob, __u32 *addr) +{ + int n; + + n = nob; + if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob) + return 1; + + return 0; +} + +/** + * Nf_parse_addrlist method for networks using numeric addresses. + * + * Examples of such networks are gm and elan. + * + * \retval 0 if \a str parsed to numeric address + * \retval errno otherwise + */ +int +libcfs_num_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + int rc; + + rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el); + if (rc == 0) + list_add_tail(&el->el_link, list); + + return rc; +} + +static int +libcfs_num_addr_range_print(char *buffer, int count, struct list_head *list) +{ + struct cfs_expr_list *el; + int i = 0, j = 0; + + list_for_each_entry(el, list, el_link) { + assert(j++ < 1); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +/* + * Nf_match_addr method for networks using numeric addresses + * + * \retval 1 on match + * \retval 0 otherwise + */ +static int +libcfs_num_match(__u32 addr, struct list_head *numaddr) +{ + struct cfs_expr_list *el; + + assert(!list_empty(numaddr)); + el = list_entry(numaddr->next, struct cfs_expr_list, el_link); + + return cfs_expr_list_match(addr, el); +} + +static int cfs_ip_min_max(struct list_head *nidlist, __u32 *min, __u32 *max); +static int cfs_num_min_max(struct list_head *nidlist, __u32 *min, __u32 *max); + +static struct netstrfns libcfs_netstrfns[] = { + { + .nf_type = LOLND, + .nf_name = "lo", + .nf_modname = "klolnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_lo_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max, + .nf_expand_addrrange = libcfs_num_addr_range_expand + }, + { + .nf_type = SOCKLND, + .nf_name = "tcp", + .nf_modname = "ksocklnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max, + .nf_expand_addrrange = libcfs_ip_addr_range_expand + }, + { + .nf_type = O2IBLND, + .nf_name = "o2ib", + .nf_modname = "ko2iblnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max, + .nf_expand_addrrange = libcfs_ip_addr_range_expand + }, + { + .nf_type = GNILND, + .nf_name = "gni", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max, + .nf_expand_addrrange = 
libcfs_num_addr_range_expand + }, + { + .nf_type = GNIIPLND, + .nf_name = "gip", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max, + .nf_expand_addrrange = libcfs_ip_addr_range_expand + }, + { + .nf_type = PTL4LND, + .nf_name = "ptlf", + .nf_modname = "kptl4lnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max, + .nf_expand_addrrange = libcfs_num_addr_range_expand + }, + { + .nf_type = KFILND, + .nf_name = "kfi", + .nf_modname = "kkfilnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max, + .nf_expand_addrrange = libcfs_num_addr_range_expand + } +}; + +static const size_t libcfs_nnetstrfns = + sizeof(libcfs_netstrfns)/sizeof(libcfs_netstrfns[0]); + +static struct netstrfns * +libcfs_lnd2netstrfns(__u32 lnd) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (lnd == libcfs_netstrfns[i].nf_type) + return &libcfs_netstrfns[i]; + + return NULL; +} + +static struct netstrfns * +libcfs_namenum2netstrfns(const char *name) +{ + struct netstrfns *nf; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(name, nf->nf_name, strlen(nf->nf_name))) + return nf; + } + return NULL; +} + +static struct netstrfns * +libcfs_name2netstrfns(const char *name) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (!strcmp(libcfs_netstrfns[i].nf_name, name)) + return &libcfs_netstrfns[i]; + + return NULL; +} + +int +libcfs_isknown_lnd(__u32 lnd) +{ + return libcfs_lnd2netstrfns(lnd) != NULL; +} + +char * +libcfs_lnd2modname(__u32 lnd) +{ + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + return (nf == NULL) ? 
NULL : nf->nf_modname; +} + +int +libcfs_str2lnd(const char *str) +{ + struct netstrfns *nf = libcfs_name2netstrfns(str); + + if (nf != NULL) + return nf->nf_type; + + return -1; +} + +char * +libcfs_lnd2str_r(__u32 lnd, char *buf, size_t buf_size) +{ + struct netstrfns *nf; + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) + snprintf(buf, buf_size, "?%u?", lnd); + else + snprintf(buf, buf_size, "%s", nf->nf_name); + + return buf; +} + +char * +libcfs_net2str_r(__u32 net, char *buf, size_t buf_size) +{ + __u32 nnum = LNET_NETNUM(net); + __u32 lnd = LNET_NETTYP(net); + struct netstrfns *nf; + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) + snprintf(buf, buf_size, "<%u:%u>", lnd, nnum); + else if (nnum == 0) + snprintf(buf, buf_size, "%s", nf->nf_name); + else + snprintf(buf, buf_size, "%s%u", nf->nf_name, nnum); + + return buf; +} + +char * +libcfs_nid2str_r(lnet_nid_t nid, char *buf, size_t buf_size) +{ + __u32 addr = LNET_NIDADDR(nid); + __u32 net = LNET_NIDNET(nid); + __u32 nnum = LNET_NETNUM(net); + __u32 lnd = LNET_NETTYP(net); + struct netstrfns *nf; + + if (nid == LNET_NID_ANY) { + strncpy(buf, "", buf_size); + buf[buf_size - 1] = '\0'; + return buf; + } + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) { + snprintf(buf, buf_size, "%x@<%u:%u>", addr, lnd, nnum); + } else { + size_t addr_len; + + nf->nf_addr2str(addr, buf, buf_size); + addr_len = strlen(buf); + if (nnum == 0) + snprintf(buf + addr_len, buf_size - addr_len, "@%s", + nf->nf_name); + else + snprintf(buf + addr_len, buf_size - addr_len, "@%s%u", + nf->nf_name, nnum); + } + + return buf; +} + +static struct netstrfns * +libcfs_str2net_internal(const char *str, __u32 *net) +{ + struct netstrfns *nf = NULL; + int nob; + unsigned int netnum; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(str, nf->nf_name, strlen(nf->nf_name))) + break; + } + + if (i == libcfs_nnetstrfns) + return NULL; + + nob = strlen(nf->nf_name); + + if (strlen(str) == (unsigned int)nob) { + netnum = 0; + } else { + if (nf->nf_type == LOLND) /* net number not allowed */ + return NULL; + + str += nob; + i = strlen(str); + if (sscanf(str, "%u%n", &netnum, &i) < 1 || + i != (int)strlen(str)) + return NULL; + } + + *net = LNET_MKNET(nf->nf_type, netnum); + return nf; +} + +__u32 +libcfs_str2net(const char *str) +{ + __u32 net; + + if (libcfs_str2net_internal(str, &net) != NULL) + return net; + + return LNET_NET_ANY; +} + +lnet_nid_t +libcfs_str2nid(const char *str) +{ + const char *sep = strchr(str, '@'); + struct netstrfns *nf; + __u32 net; + __u32 addr; + + if (sep != NULL) { + nf = libcfs_str2net_internal(sep + 1, &net); + if (nf == NULL) + return LNET_NID_ANY; + } else { + sep = str + strlen(str); + net = LNET_MKNET(SOCKLND, 0); + nf = libcfs_lnd2netstrfns(SOCKLND); + assert(nf != NULL); + } + + if (!nf->nf_str2addr(str, (int)(sep - str), &addr)) + return LNET_NID_ANY; + + return LNET_MKNID(net, addr); +} + +char * +libcfs_id2str(struct lnet_process_id id) +{ + char *str = libcfs_next_nidstring(); + + if (id.pid == LNET_PID_ANY) { + snprintf(str, LNET_NIDSTR_SIZE, + "LNET_PID_ANY-%s", libcfs_nid2str(id.nid)); + return str; + } + + snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", + ((id.pid & LNET_PID_USERFLAG) != 0) ? 
"U" : "", + (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid)); + return str; +} + +int +libcfs_str2anynid(lnet_nid_t *nidp, const char *str) +{ + if (!strcmp(str, "*")) { + *nidp = LNET_NID_ANY; + return 1; + } + + *nidp = libcfs_str2nid(str); + return *nidp != LNET_NID_ANY; +} + +/** + * Nid range list syntax. + * \verbatim + * + * :== [ ' ' ] + * :== '@' + * :== '*' | + * | + * + * :== ... + * + * :== | + * + * :== '[' [ ',' ] ']' + * :== | + * '-' | + * '-' '/' + * :== | + * :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" | + * "vib" | "ra" | "elan" | "mx" | "ptl" + * \endverbatim + */ + +/** + * Structure to represent \ token of the syntax. + * + * One of this is created for each \ parsed. + */ +struct nidrange { + /** + * Link to list of this structures which is built on nid range + * list parsing. + */ + struct list_head nr_link; + /** + * List head for addrrange::ar_link. + */ + struct list_head nr_addrranges; + /** + * Flag indicating that *@ is found. + */ + int nr_all; + /** + * Pointer to corresponding element of libcfs_netstrfns. + */ + struct netstrfns *nr_netstrfns; + /** + * Number of network. E.g. 5 if \ is "elan5". + */ + int nr_netnum; +}; + +/** + * Structure to represent \ token of the syntax. + */ +struct addrrange { + /** + * Link to nidrange::nr_addrranges. + */ + struct list_head ar_link; + /** + * List head for cfs_expr_list::el_list. + */ + struct list_head ar_numaddr_ranges; +}; + +/** + * Parses \ token on the syntax. + * + * Allocates struct addrrange and links to \a nidrange via + * (nidrange::nr_addrranges) + * + * \retval 0 if \a src parses to '*' | \ | \ + * \retval -errno otherwise + */ +static int +parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange) +{ + struct addrrange *addrrange; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + nidrange->nr_all = 1; + return 0; + } + + addrrange = calloc(1, sizeof(struct addrrange)); + if (addrrange == NULL) + return -ENOMEM; + list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges); + INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges); + + return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str, + src->ls_len, + &addrrange->ar_numaddr_ranges); +} + +/** + * Finds or creates struct nidrange. + * + * Checks if \a src is a valid network name, looks for corresponding + * nidrange on the ist of nidranges (\a nidlist), creates new struct + * nidrange if it is not found. + * + * \retval pointer to struct nidrange matching network specified via \a src + * \retval NULL if \a src does not match any network + */ +static struct nidrange * +add_nidrange(const struct cfs_lstr *src, + struct list_head *nidlist) +{ + struct netstrfns *nf; + struct nidrange *nr; + int endlen; + unsigned netnum; + + if (src->ls_len >= LNET_NIDSTR_SIZE) + return NULL; + + nf = libcfs_namenum2netstrfns(src->ls_str); + if (nf == NULL) + return NULL; + endlen = src->ls_len - strlen(nf->nf_name); + if (endlen == 0) + /* network name only, e.g. "elan" or "tcp" */ + netnum = 0; + else { + /* e.g. 
"elan25" or "tcp23", refuse to parse if + * network name is not appended with decimal or + * hexadecimal number */ + if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name), + endlen, &netnum, 0, MAX_NUMERIC_VALUE)) + return NULL; + } + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns != nf) + continue; + if (nr->nr_netnum != netnum) + continue; + return nr; + } + + nr = calloc(1, sizeof(struct nidrange)); + if (nr == NULL) + return NULL; + list_add_tail(&nr->nr_link, nidlist); + INIT_LIST_HEAD(&nr->nr_addrranges); + nr->nr_netstrfns = nf; + nr->nr_all = 0; + nr->nr_netnum = netnum; + + return nr; +} + +/** + * Parses \ token of the syntax. + * + * \retval 1 if \a src parses to \ '@' \ + * \retval 0 otherwise + */ +static int +parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist) +{ + struct cfs_lstr addrrange; + struct cfs_lstr net; + struct cfs_lstr tmp; + struct nidrange *nr; + + tmp = *src; + if (cfs_gettok(src, '@', &addrrange) == 0) + goto failed; + + if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL) + goto failed; + + nr = add_nidrange(&net, nidlist); + if (nr == NULL) + goto failed; + + if (parse_addrange(&addrrange, nr) != 0) + goto failed; + + return 1; + failed: + fprintf(stderr, "can't parse nidrange: \"%.*s\"\n", + tmp.ls_len, tmp.ls_str); + return 0; +} + +static __u32 +libcfs_net_str_len(const char *str) +{ + int i; + struct netstrfns *nf = NULL; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(str, nf->nf_name, strlen(nf->nf_name))) + return strlen(nf->nf_name); + } + + return 0; +} + +int +parse_net_range(char *str, __u32 len, struct list_head *net_num, + __u32 *net_type) +{ + struct cfs_lstr next; + __u32 net_type_len; + __u32 net; + char *bracket; + char *star; + + if (!str) + return -EINVAL; + + next.ls_str = str; + next.ls_len = len; + + net_type_len = libcfs_net_str_len(str); + + if (net_type_len < len) { + char c = str[net_type_len]; + + str[net_type_len] = '\0'; + net = libcfs_str2net(str); + str[net_type_len] = c; + } else { + net = libcfs_str2net(str); + } + + if (net == LNET_NIDNET(LNET_NID_ANY)) + return -EINVAL; + + *net_type = LNET_NETTYP(net); + + /* + * the net is either followed with an absolute number, *, or an + * expression enclosed in [] + */ + bracket = strchr(next.ls_str, '['); + star = strchr(next.ls_str, '*'); + + /* "*[" pattern not allowed */ + if (bracket && star && star < bracket) + return -EINVAL; + + if (!bracket) { + next.ls_str = str + net_type_len; + next.ls_len = strlen(next.ls_str); + } else { + next.ls_str = bracket; + next.ls_len = strlen(bracket); + } + + /* if there is no net number just return */ + if (next.ls_len == 0) + return 0; + + return libcfs_num_parse(next.ls_str, next.ls_len, + net_num); +} + +int +parse_address(struct cfs_lstr *src, const __u32 net_type, + struct list_head *addr) +{ + int i; + struct netstrfns *nf = NULL; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (net_type == nf->nf_type) + return nf->nf_parse_addrlist(src->ls_str, src->ls_len, + addr); + } + + return -EINVAL; +} + +int +cfs_parse_nid_parts(char *str, struct list_head *addr, + struct list_head *net_num, __u32 *net_type) +{ + struct cfs_lstr next; + struct cfs_lstr addrrange; + bool found = false; + int rc; + + if (!str) + return -EINVAL; + + next.ls_str = str; + next.ls_len = strlen(str); + + rc = cfs_gettok(&next, '@', &addrrange); + if (!rc) + return -EINVAL; + + if (!next.ls_str) { + /* only net is present */ + next.ls_str = str; + 
next.ls_len = strlen(str); + } else { + found = true; + } + + /* assume only net is present */ + rc = parse_net_range(next.ls_str, next.ls_len, net_num, net_type); + + /* + * if we successfully parsed the net range and there is no + * address, or if we fail to parse the net range then return + */ + if ((!rc && !found) || rc) + return rc; + + return parse_address(&addrrange, *net_type, addr); +} + +/** + * Frees addrrange structures of \a list. + * + * For each struct addrrange structure found on \a list it frees + * cfs_expr_list list attached to it and frees the addrrange itself. + * + * \retval none + */ +static void +free_addrranges(struct list_head *list) +{ + while (!list_empty(list)) { + struct addrrange *ar; + + ar = list_entry(list->next, struct addrrange, ar_link); + + cfs_expr_list_free_list(&ar->ar_numaddr_ranges); + list_del(&ar->ar_link); + free(ar); + } +} + +/** + * Frees nidrange strutures of \a list. + * + * For each struct nidrange structure found on \a list it frees + * addrrange list attached to it and frees the nidrange itself. + * + * \retval none + */ +void +cfs_free_nidlist(struct list_head *list) +{ + struct list_head *pos, *next; + struct nidrange *nr; + + list_for_each_safe(pos, next, list) { + nr = list_entry(pos, struct nidrange, nr_link); + free_addrranges(&nr->nr_addrranges); + list_del(pos); + free(nr); + } +} + +/** + * Parses nid range list. + * + * Parses with rigorous syntax and overflow checking \a str into + * \ [ ' ' \ ], compiles \a str into set of + * structures and links that structure to \a nidlist. The resulting + * list can be used to match a NID againts set of NIDS defined by \a + * str. + * \see cfs_match_nid + * + * \retval 1 on success + * \retval 0 otherwise + */ +int +cfs_parse_nidlist(char *str, int len, struct list_head *nidlist) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(nidlist); + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + rc = parse_nidrange(&res, nidlist); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + } + return 1; +} + +/** + * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist). + * + * \see cfs_parse_nidlist() + * + * \retval 1 on match + * \retval 0 otherwises + */ +int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist) +{ + struct nidrange *nr; + struct addrrange *ar; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid))) + continue; + if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid))) + continue; + if (nr->nr_all) + return 1; + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) + if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid), + &ar->ar_numaddr_ranges)) + return 1; + } + return 0; +} + +int +cfs_match_net(__u32 net_id, __u32 net_type, struct list_head *net_num_list) +{ + __u32 net_num; + + if (!net_num_list) + return 0; + + if (net_type != LNET_NETTYP(net_id)) + return 0; + + net_num = LNET_NETNUM(net_id); + + /* + * if there is a net number but the list passed in is empty, then + * there is no match. + */ + if (!net_num && list_empty(net_num_list)) + return 1; + else if (list_empty(net_num_list)) + return 0; + + if (!libcfs_num_match(net_num, net_num_list)) + return 0; + + return 1; +} + +/** + * Print the network part of the nidrange \a nr into the specified \a buffer. 
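(Editorial aside, not part of the patch.) cfs_parse_nidlist(), cfs_match_nid() and cfs_free_nidlist() above form the usual caller sequence for NID-range filtering. A minimal usage sketch follows; the include paths are assumptions (the original #include targets were lost in extraction) and the program must be linked against the libcfs userspace utilities, but every function called is defined in this file.

/* nidlist_demo.c -- illustrative usage sketch, not part of this patch.
 * Include paths are assumptions; the calls are the functions defined above.
 */
#include <stdio.h>
#include <string.h>
#include <libcfs/util/string.h>	/* assumed: cfs_parse_nidlist() etc. */
#include <linux/lnet/nidstr.h>	/* assumed: libcfs_str2nid() etc. */

int main(void)
{
	char range[] = "192.168.1.[2-10]@tcp";	/* one nidrange expression */
	struct list_head nidlist;
	lnet_nid_t nid = libcfs_str2nid("192.168.1.3@tcp");

	/* cfs_parse_nidlist() initialises the list head itself; 1 == success */
	if (!cfs_parse_nidlist(range, strlen(range), &nidlist)) {
		fprintf(stderr, "cannot parse \"%s\"\n", range);
		return 1;
	}

	printf("%s %s covered by %s\n", libcfs_nid2str(nid),
	       cfs_match_nid(nid, &nidlist) ? "is" : "is not", range);

	cfs_free_nidlist(&nidlist);	/* frees nidranges and their addrranges */
	return 0;
}

Note that cfs_parse_nidlist() returns 1 on success and 0 on failure, following the match-style convention used throughout this file rather than the usual 0/-errno.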
+ * + * \retval number of characters written + */ +static int +cfs_print_network(char *buffer, int count, struct nidrange *nr) +{ + struct netstrfns *nf = nr->nr_netstrfns; + + if (nr->nr_netnum == 0) + return scnprintf(buffer, count, "@%s", nf->nf_name); + else + return scnprintf(buffer, count, "@%s%u", + nf->nf_name, nr->nr_netnum); +} + + +/** + * Print a list of addrrange (\a addrranges) into the specified \a buffer. + * At max \a count characters can be printed into \a buffer. + * + * \retval number of characters written + */ +static int +cfs_print_addrranges(char *buffer, int count, struct list_head *addrranges, + struct nidrange *nr) +{ + int i = 0; + struct addrrange *ar; + struct netstrfns *nf = nr->nr_netstrfns; + + list_for_each_entry(ar, addrranges, ar_link) { + if (i != 0) + i += scnprintf(buffer + i, count - i, " "); + i += nf->nf_print_addrlist(buffer + i, count - i, + &ar->ar_numaddr_ranges); + i += cfs_print_network(buffer + i, count - i, nr); + } + return i; +} + +/** + * Print a list of nidranges (\a nidlist) into the specified \a buffer. + * At max \a count characters can be printed into \a buffer. + * Nidranges are separated by a space character. + * + * \retval number of characters written + */ +int cfs_print_nidlist(char *buffer, int count, struct list_head *nidlist) +{ + int i = 0; + struct nidrange *nr; + + if (count <= 0) + return 0; + + list_for_each_entry(nr, nidlist, nr_link) { + if (i != 0) + i += scnprintf(buffer + i, count - i, " "); + + if (nr->nr_all != 0) { + assert(list_empty(&nr->nr_addrranges)); + i += scnprintf(buffer + i, count - i, "*"); + i += cfs_print_network(buffer + i, count - i, nr); + } else { + i += cfs_print_addrranges(buffer + i, count - i, + &nr->nr_addrranges, nr); + } + } + return i; +} + +/** + * Determines minimum and maximum addresses for a single + * numeric address range + * + * \param ar + * \param[out] *min_nid __u32 representation of min NID + * \param[out] *max_nid __u32 representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_ip_ar_min_max(struct addrrange *ar, __u32 *min_nid, + __u32 *max_nid) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *range; + unsigned int min_ip[4] = {0}; + unsigned int max_ip[4] = {0}; + int cur_octet = 0; + bool expect_full_octet = false; + + list_for_each_entry(expr_list, &ar->ar_numaddr_ranges, el_link) { + int re_count = 0; + + list_for_each_entry(range, &expr_list->el_exprs, re_link) { + /* XXX: add support for multiple & non-contig. 
re's */ + if (re_count > 0) + return -EINVAL; + + /* if a previous octet was ranged, then all remaining + * octets must be full for contiguous range */ + if (expect_full_octet && (range->re_lo != 0 || + range->re_hi != 255)) + return -ERANGE; + + if (range->re_stride != 1) + return -ERANGE; + + if (range->re_lo > range->re_hi) + return -EINVAL; + + if (range->re_lo != range->re_hi) + expect_full_octet = true; + + min_ip[cur_octet] = range->re_lo; + max_ip[cur_octet] = range->re_hi; + + re_count++; + } + + cur_octet++; + } + + if (min_nid != NULL) + *min_nid = ((min_ip[0] << 24) | (min_ip[1] << 16) | + (min_ip[2] << 8) | min_ip[3]); + + if (max_nid != NULL) + *max_nid = ((max_ip[0] << 24) | (max_ip[1] << 16) | + (max_ip[2] << 8) | max_ip[3]); + + return 0; +} + +/** + * Determines minimum and maximum addresses for a single + * numeric address range + * + * \param ar + * \param[out] *min_nid __u32 representation of min NID + * \param[out] *max_nid __u32 representation of max NID + * \retval -EINVAL unsupported LNET range + */ +static int cfs_num_ar_min_max(struct addrrange *ar, __u32 *min_nid, + __u32 *max_nid) +{ + struct cfs_expr_list *el; + struct cfs_range_expr *re; + unsigned int min_addr = 0; + unsigned int max_addr = 0; + + list_for_each_entry(el, &ar->ar_numaddr_ranges, el_link) { + int re_count = 0; + + list_for_each_entry(re, &el->el_exprs, re_link) { + if (re_count > 0) + return -EINVAL; + if (re->re_lo > re->re_hi) + return -EINVAL; + + if (re->re_lo < min_addr || min_addr == 0) + min_addr = re->re_lo; + if (re->re_hi > max_addr) + max_addr = re->re_hi; + + re_count++; + } + } + + if (min_nid != NULL) + *min_nid = min_addr; + if (max_nid != NULL) + *max_nid = max_addr; + + return 0; +} + +/** + * Takes a linked list of nidrange expressions, determines the minimum + * and maximum nid and creates appropriate nid structures + * + * \param *nidlist + * \param[out] *min_nid string representation of min NID + * \param[out] *max_nid string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +int cfs_nidrange_find_min_max(struct list_head *nidlist, char *min_nid, + char *max_nid, size_t nidstr_length) +{ + struct nidrange *first_nidrange; + int netnum; + struct netstrfns *nf; + char *lndname; + __u32 min_addr; + __u32 max_addr; + char min_addr_str[IPSTRING_LENGTH]; + char max_addr_str[IPSTRING_LENGTH]; + int rc; + + first_nidrange = list_entry(nidlist->next, struct nidrange, nr_link); + + netnum = first_nidrange->nr_netnum; + nf = first_nidrange->nr_netstrfns; + lndname = nf->nf_name; + + rc = nf->nf_min_max(nidlist, &min_addr, &max_addr); + if (rc < 0) + return rc; + + nf->nf_addr2str(min_addr, min_addr_str, sizeof(min_addr_str)); + nf->nf_addr2str(max_addr, max_addr_str, sizeof(max_addr_str)); + + snprintf(min_nid, nidstr_length, "%s@%s%d", min_addr_str, lndname, + netnum); + snprintf(max_nid, nidstr_length, "%s@%s%d", max_addr_str, lndname, + netnum); + + return 0; +} + +/** + * Determines the min and max NID values for num LNDs + * + * \param *nidlist + * \param[out] *min_nid if provided, returns string representation of min NID + * \param[out] *max_nid if provided, returns string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_num_min_max(struct list_head *nidlist, __u32 *min_nid, + __u32 *max_nid) +{ + struct nidrange *nr; + struct addrrange *ar; + unsigned int tmp_min_addr = 0; + unsigned int tmp_max_addr = 0; + unsigned int 
min_addr = 0; + unsigned int max_addr = 0; + int nidlist_count = 0; + int rc; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nidlist_count > 0) + return -EINVAL; + + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) { + rc = cfs_num_ar_min_max(ar, &tmp_min_addr, + &tmp_max_addr); + if (rc < 0) + return rc; + + if (tmp_min_addr < min_addr || min_addr == 0) + min_addr = tmp_min_addr; + if (tmp_max_addr > max_addr) + max_addr = tmp_min_addr; + } + } + if (max_nid != NULL) + *max_nid = max_addr; + if (min_nid != NULL) + *min_nid = min_addr; + + return 0; +} + +/** + * Takes an nidlist and determines the minimum and maximum + * ip addresses. + * + * \param *nidlist + * \param[out] *min_nid if provided, returns string representation of min NID + * \param[out] *max_nid if provided, returns string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_ip_min_max(struct list_head *nidlist, __u32 *min_nid, + __u32 *max_nid) +{ + struct nidrange *nr; + struct addrrange *ar; + __u32 tmp_min_ip_addr = 0; + __u32 tmp_max_ip_addr = 0; + __u32 min_ip_addr = 0; + __u32 max_ip_addr = 0; + int nidlist_count = 0; + int rc; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nidlist_count > 0) + return -EINVAL; + + if (nr->nr_all) { + min_ip_addr = 0; + max_ip_addr = 0xffffffff; + break; + } + + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) { + rc = cfs_ip_ar_min_max(ar, &tmp_min_ip_addr, + &tmp_max_ip_addr); + if (rc < 0) + return rc; + + if (tmp_min_ip_addr < min_ip_addr || min_ip_addr == 0) + min_ip_addr = tmp_min_ip_addr; + if (tmp_max_ip_addr > max_ip_addr) + max_ip_addr = tmp_max_ip_addr; + } + + nidlist_count++; + } + + if (max_nid != NULL) + *max_nid = max_ip_addr; + if (min_nid != NULL) + *min_nid = min_ip_addr; + + return 0; +} + +static int +libcfs_expand_nidrange(struct nidrange *nr, __u32 *addrs, int max_nids) +{ + struct addrrange *ar; + int rc = 0, count = max_nids; + struct netstrfns *nf = nr->nr_netstrfns; + + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) { + rc = nf->nf_expand_addrrange(&ar->ar_numaddr_ranges, addrs, + count); + if (rc < 0) + return rc; + + count -= rc; + } + + return max_nids - count; +} + +int cfs_expand_nidlist(struct list_head *nidlist, lnet_nid_t *lnet_nidlist, + int max_nids) +{ + struct nidrange *nr; + int rc = 0, count = max_nids; + int i, j = 0; + __u32 *addrs; + struct netstrfns *nf; + __u32 net; + + addrs = calloc(max_nids, sizeof(__u32)); + if (!addrs) + return -ENOMEM; + + list_for_each_entry(nr, nidlist, nr_link) { + rc = libcfs_expand_nidrange(nr, addrs, count); + + if (rc < 0) { + free(addrs); + return rc; + } + + nf = nr->nr_netstrfns; + net = LNET_MKNET(nf->nf_type, nr->nr_netnum); + + for (i = count - 1; i >= count - rc; i--) + lnet_nidlist[j++] = LNET_MKNID(net, addrs[i]); + + count -= rc; + } + + free(addrs); + return max_nids - count; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c new file mode 100644 index 0000000000000..18fe84dc53f6a --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c @@ -0,0 +1,155 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the GNU Lesser General Public License + * (LGPL) version 2.1 or (at your discretion) any later version. 
+ * (LGPL) version 2.1 accompanies this distribution, and is available at + * http://www.gnu.org/licenses/lgpl-2.1.html + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * LGPL HEADER END + */ +/* + * libcfs/libcfs/utils/param.c + * + * This code handles user interaction with the configuration interface + * to the Lustre file system to fine tune it. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Get parameter path matching the pattern + * + * \param[out] paths glob_t structure used to hold the final result + * \param[in] pattern the pattern containing sprintf format specifiers + * which will be used to create the path to match + * + * The \param pattern is appended to the default path glob to complete the + * absolute path to the file the caller is requesting. If the results point + * to one or more files that exist those results are stored in the \param + * paths glob_t structure that is passed by the caller. + * + * Lustre tunables traditionally were in /proc/{sys,fs}/{lnet,lustre} + * but in upstream kernels starting with Linux 4.2 these parameters + * have been moved to /sys/fs/lustre and /sys/kernel/debug/{lnet,lustre} + * so the user tools need to check both locations. + * + * \retval 0 for success, with results stored in \param paths. + * \retval -1 for failure with errno set to report the reason. + */ +int +cfs_get_param_paths(glob_t *paths, const char *pattern, ...) +{ + char topdir[PATH_MAX] = "{/sys/{fs,kernel/debug}/{lnet,lustre}," + "/proc/{fs,sys}/{lnet,lustre}}"; + static bool test_mounted = false; + char path[PATH_MAX]; + char buf[PATH_MAX]; + struct statfs statfsbuf; + va_list args; + int rc; + + + if (test_mounted) + goto skip_mounting; + test_mounted = true; + + rc = statfs("/sys/kernel/debug/", &statfsbuf); + if (rc == 0 && statfsbuf.f_type == DEBUGFS_MAGIC) + goto skip_mounting; + + if (mount("none", "/sys/kernel/debug", "debugfs", 0, "") == -1) { + /* Already mounted or don't have permission to mount is okay */ + if (errno != EPERM && errno != EBUSY) + fprintf(stderr, "Warning: failed to mount debug: %s\n", + strerror(errno)); + } else { + struct stat mtab; + + /* This is all for RHEL6 which is old school. Can be removed + * later when RHEL6 client support is dropped. 
*/ + rc = lstat(_PATH_MOUNTED, &mtab); + if (!rc && !S_ISLNK(mtab.st_mode)) { + FILE *fp = setmntent(_PATH_MOUNTED, "r+"); + + if (fp != NULL) { + const struct mntent fs = { + .mnt_fsname = "debugfs", + .mnt_dir = "/sys/kernel/debug", + .mnt_type = "debugfs", + .mnt_opts = "rw,relatime", + }; + + rc = addmntent(fp, &fs); + if (rc) { + fprintf(stderr, + "failed to add debugfs to %s: %s\n", + _PATH_MOUNTED, strerror(errno)); + } + endmntent(fp); + } else { + fprintf(stderr, "could not open %s: %s\n", + _PATH_MOUNTED, strerror(errno)); + } + } + } +skip_mounting: + va_start(args, pattern); + rc = vsnprintf(buf, sizeof(buf), pattern, args); + va_end(args); + if (rc < 0) { + return rc; + } else if (rc >= sizeof(buf)) { + errno = EINVAL; + return -1; + } + + if (snprintf(path, sizeof(path), "%s/%s", topdir, buf) >= + sizeof(path)) { + errno = E2BIG; + return -1; + } + + rc = glob(path, GLOB_BRACE, NULL, paths); + if (rc != 0) { + switch (rc) { + case GLOB_NOSPACE: + errno = ENOMEM; + break; + case GLOB_ABORTED: + errno = ENODEV; + break; + case GLOB_NOMATCH: + default: + errno = ENOENT; + break; + } + rc = -1; + } + + return rc; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c new file mode 100644 index 0000000000000..c5c8947ef3d7d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c @@ -0,0 +1,850 @@ +/* + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * Copyright (c) 2014, 2017, Intel Corporation. + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_LIBREADLINE +# include +# include +#endif /* HAVE_LIBREADLINE */ +#include +#include + +#include +#include + +/* Top level of commands, initialized by InitParser */ +static command_t *top_level; +/* Parser prompt, set by InitParser */ +static char *parser_prompt; +/* Set to 1 if user types exit or quit */ +static int done; +/* + * Normally, the parser will quit when an error occurs in non-interacive + * mode. Setting this to non-zero will force it to keep buggering on. 
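(Editorial aside, not part of the patch.) The Parser_init(), Parser_commands() and Parser_execarg() entry points defined further down in this file are driven from a command_t table. The sketch below shows a minimal caller; the header path is an assumption and the program must be linked against this parser, but the pc_* fields and Parser_*() calls are exactly the ones used in this file.

/* shell_demo.c -- illustrative usage sketch, not part of this patch. */
#include <stdio.h>
#include <libcfs/util/parser.h>	/* assumed path for command_t and Parser_*() */

static int hello_cmd(int argc, char **argv)
{
	printf("hello, %s\n", argc > 1 ? argv[1] : "world");
	return 0;
}

static command_t cmds[] = {
	{ .pc_name = "hello", .pc_func = hello_cmd,
	  .pc_help = "hello [who]: print a greeting" },
	{ .pc_name = "quit", .pc_func = Parser_quit,
	  .pc_help = "quit: leave the interactive shell" },
	{ .pc_name = NULL }	/* a NULL pc_name terminates the table */
};

int main(int argc, char **argv)
{
	static char prompt[] = "demo > ";
	int rc;

	if (argc > 1)	/* non-interactive: run a single command and exit */
		return Parser_execarg(argc - 1, argv + 1, cmds);

	Parser_init(prompt, cmds);	/* prompt shown by readline() */
	rc = Parser_commands();		/* loop until "quit" or EOF */
	Parser_exit(argc, argv);
	return rc;
}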
+ */ +static int ignore_errors; + +/* static functions */ +static char *skipwhitespace(char *s); +static char *skiptowhitespace(char *s); +static command_t *find_cmd(char *name, command_t cmds[], char **next); +static int process(char *s, char **next, command_t *lookup, command_t **result, + char **prev); + +static char *skipwhitespace(char *s) +{ + char *t; + int len; + + len = (int)strlen(s); + for (t = s; t <= s + len && isspace(*t); t++) + ; + return t; +} + +static char *skiptowhitespace(char *s) +{ + char *t; + + for (t = s; *t && !isspace(*t); t++) + ; + return t; +} + +static int line2args(char *line, char **argv, int maxargs) +{ + char *arg; + int i = 0; + + arg = strtok(line, " \t"); + if (!arg || maxargs < 1) + return 0; + + argv[i++] = arg; + while ((arg = strtok(NULL, " \t")) != NULL && i < maxargs) + argv[i++] = arg; + return i; +} + +/* find a command -- return it if unique otherwise print alternatives */ +static command_t *Parser_findargcmd(char *name, command_t cmds[]) +{ + command_t *cmd; + + for (cmd = cmds; cmd->pc_name; cmd++) { + if (strcmp(name, cmd->pc_name) == 0) + return cmd; + } + return NULL; +} + +void Parser_ignore_errors(int ignore) +{ + ignore_errors = ignore; +} + +int Parser_execarg(int argc, char **argv, command_t cmds[]) +{ + command_t *cmd; + + cmd = Parser_findargcmd(argv[0], cmds); + if (cmd && cmd->pc_func) { + int rc = cmd->pc_func(argc, argv); + + if (rc == CMD_HELP) + fprintf(stderr, "%s\n", cmd->pc_help); + return rc; + } + printf("Try interactive use without arguments or use one of:\n"); + for (cmd = cmds; cmd->pc_name; cmd++) + printf("\"%s\"\n", cmd->pc_name); + printf("as argument.\n"); + + return -1; +} + +/* + * Returns the command_t * (NULL if not found) corresponding to a + * _partial_ match with the first token in name. It sets *next to + * point to the following token. Does not modify *name. + */ +static command_t *find_cmd(char *name, command_t cmds[], char **next) +{ + int i, len; + + if (!cmds || !name) + return NULL; + + /* + * This sets name to point to the first non-white space character, + * and next to the first whitespace after name, len to the length: do + * this with strtok + */ + name = skipwhitespace(name); + *next = skiptowhitespace(name); + len = (int)(*next - name); + if (len == 0) + return NULL; + + for (i = 0; cmds[i].pc_name; i++) { + if (strncasecmp(name, cmds[i].pc_name, len) == 0) { + *next = skipwhitespace(*next); + return &cmds[i]; + } + } + return NULL; +} + +/* + * Recursively process a command line string s and find the command + * corresponding to it. This can be ambiguous, full, incomplete, + * non-existent. + */ +static int process(char *s, char **next, command_t *lookup, + command_t **result, char **prev) +{ + *result = find_cmd(s, lookup, next); + *prev = s; + + /* non existent */ + if (!*result) + return CMD_NONE; + + /* + * found entry: is it ambigous, i.e. not exact command name and + * more than one command in the list matches. 
Note that find_cmd + * points to the first ambiguous entry + */ + if (strncasecmp(s, (*result)->pc_name, strlen((*result)->pc_name))) { + char *another_next; + int found_another = 0; + + command_t *another_result = find_cmd(s, (*result) + 1, + &another_next); + while (another_result) { + if (strncasecmp(s, another_result->pc_name, + strlen(another_result->pc_name)) == 0) { + *result = another_result; + *next = another_next; + goto got_it; + } + another_result = find_cmd(s, another_result + 1, + &another_next); + found_another = 1; + } + if (found_another) + return CMD_AMBIG; + } + +got_it: + /* found a unique command: component or full? */ + if ((*result)->pc_func) + return CMD_COMPLETE; + + if (**next == '\0') + return CMD_INCOMPLETE; + return process(*next, next, (*result)->pc_sub_cmd, + result, prev); +} + +#ifdef HAVE_LIBREADLINE +static command_t *match_tbl; /* Command completion against this table */ +static char *command_generator(const char *text, int state) +{ + static int index, len; + char *name; + + /* Do we have a match table? */ + if (!match_tbl) + return NULL; + + /* If this is the first time called on this word, state is 0 */ + if (!state) { + index = 0; + len = (int)strlen(text); + } + + /* Return next name in the command list that paritally matches test */ + while ((name = (match_tbl + index)->pc_name)) { + index++; + + if (strncasecmp(name, text, len) == 0) + return strdup(name); + } + + /* No more matches */ + return NULL; +} + +/* probably called by readline */ +static char **command_completion(const char *text, int start, int end) +{ + command_t *table; + char *pos; + + match_tbl = top_level; + + for (table = find_cmd(rl_line_buffer, match_tbl, &pos); + table; table = find_cmd(pos, match_tbl, &pos)) { + if (*(pos - 1) == ' ') + match_tbl = table->pc_sub_cmd; + } + + return rl_completion_matches(text, command_generator); +} +#endif + +/* take a string and execute the function or print help */ +int execute_line(char *line) +{ + command_t *cmd, *ambig; + char *prev; + char *next, *tmp; + char *argv[MAXARGS]; + int i; + int rc = 0; + + switch (process(line, &next, top_level, &cmd, &prev)) { + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while ((ambig = find_cmd(prev, cmd, &tmp))) { + fprintf(stderr, "%s ", ambig->pc_name); + cmd = ambig + 1; + } + fprintf(stderr, "\n"); + break; + case CMD_NONE: + fprintf(stderr, "No such command, type help\n"); + break; + case CMD_INCOMPLETE: + fprintf(stderr, + "'%s' incomplete command. Use '%s x' where x is one of:\n", + line, line); + fprintf(stderr, "\t"); + for (i = 0; cmd->pc_sub_cmd[i].pc_name; i++) + fprintf(stderr, "%s ", cmd->pc_sub_cmd[i].pc_name); + fprintf(stderr, "\n"); + break; + case CMD_COMPLETE: + optind = 0; + i = line2args(line, argv, MAXARGS); + rc = cmd->pc_func(i, argv); + + if (rc == CMD_HELP) + fprintf(stderr, "%s\n", cmd->pc_help); + + break; + } + + return rc; +} + +#ifdef HAVE_LIBREADLINE +static void noop_int_fn(int unused) { } +static void noop_void_fn(void) { } +#endif + +/* + * just in case you're ever in an airplane and discover you + * forgot to install readline-dev. 
:) + */ +static int init_input(void) +{ + int interactive = isatty(fileno(stdin)); + +#ifdef HAVE_LIBREADLINE + using_history(); + stifle_history(HISTORY); + + if (!interactive) { + rl_prep_term_function = noop_int_fn; + rl_deprep_term_function = noop_void_fn; + } + + rl_attempted_completion_function = command_completion; + rl_completion_entry_function = command_generator; +#endif + return interactive; +} + +#ifndef HAVE_LIBREADLINE +#define add_history(s) +char *readline(char *prompt) +{ + int size = 2048; + char *line = malloc(size); + char *ptr = line; + int c; + int eof = 0; + + if (!line) + return NULL; + if (prompt) + printf("%s", prompt); + + while (1) { + if ((c = fgetc(stdin)) != EOF) { + if (c == '\n') + goto out; + *ptr++ = (char)c; + + if (ptr - line >= size - 1) { + char *tmp; + + size *= 2; + tmp = malloc(size); + if (!tmp) + goto outfree; + memcpy(tmp, line, ptr - line); + ptr = tmp + (ptr - line); + free(line); + line = tmp; + } + } else { + eof = 1; + if (ferror(stdin) || feof(stdin)) + goto outfree; + goto out; + } + } +out: + *ptr = 0; + if (eof && (strlen(line) == 0)) { + free(line); + line = NULL; + } + return line; +outfree: + free(line); + return NULL; +} +#endif + +/* this is the command execution machine */ +int Parser_commands(void) +{ + char *line, *s; + int rc = 0, save_error = 0; + int interactive; + + interactive = init_input(); + + while (!done) { + line = readline(interactive ? parser_prompt : NULL); + + if (!line) + break; + + s = skipwhitespace(line); + + if (*s) { + add_history(s); + rc = execute_line(s); + } + /* stop on error if not-interactive */ + if (rc != 0 && !interactive) { + if (save_error == 0) + save_error = rc; + if (!ignore_errors) + done = 1; + } + free(line); + } + + if (save_error) + rc = save_error; + + return rc; +} + +/* sets the parser prompt */ +void Parser_init(char *prompt, command_t *cmds) +{ + done = 0; + top_level = cmds; + if (parser_prompt) + free(parser_prompt); + parser_prompt = strdup(prompt); +} + +/* frees the parser prompt */ +void Parser_exit(int argc, char *argv[]) +{ + done = 1; + free(parser_prompt); + parser_prompt = NULL; +} + +/* convert a string to an integer */ +int Parser_int(char *s, int *val) +{ + int ret; + + if (*s != '0') { + ret = sscanf(s, "%d", val); + } else if (*(s + 1) != 'x') { + ret = sscanf(s, "%o", val); + } else { + s++; + ret = sscanf(++s, "%x", val); + } + + return ret; +} + +void Parser_qhelp(int argc, char *argv[]) +{ + printf("usage: %s [COMMAND] [OPTIONS]... [ARGS]\n", + program_invocation_short_name); + printf("Without any parameters, interactive mode is invoked\n"); + + printf("Try '%s help ' or '%s --list-commands' for more information\n", + program_invocation_short_name, program_invocation_short_name); +} + +int Parser_help(int argc, char **argv) +{ + char line[1024]; + char *next, *prev, *tmp; + command_t *result, *ambig; + int i; + + if (argc == 1) { + Parser_qhelp(argc, argv); + return 0; + } + + /* + * Joining command line arguments without space is not critical here + * because of this string is used for search a help topic and assume + * that only one argument will be (the name of topic). For example: + * lst > help ping run + * pingrun: Unknown command. + */ + line[0] = '\0'; + for (i = 1; i < argc; i++) { + if (strlen(argv[i]) >= sizeof(line) - strlen(line)) + return -E2BIG; + /* + * The function strlcat() cannot be used here because of + * this function is used in LNet utils that is not linked + * with libcfs.a. 
+ */ + strncat(line, argv[i], sizeof(line) - strlen(line)); + } + + switch (process(line, &next, top_level, &result, &prev)) { + case CMD_COMPLETE: + fprintf(stderr, "%s: %s\n", line, result->pc_help); + break; + case CMD_NONE: + fprintf(stderr, "%s: Unknown command.\n", line); + break; + case CMD_INCOMPLETE: + fprintf(stderr, + "'%s' incomplete command. Use '%s x' where x is one of:\n", + line, line); + fprintf(stderr, "\t"); + for (i = 0; result->pc_sub_cmd[i].pc_name; i++) + fprintf(stderr, "%s ", result->pc_sub_cmd[i].pc_name); + fprintf(stderr, "\n"); + break; + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while ((ambig = find_cmd(prev, result, &tmp))) { + fprintf(stderr, "%s ", ambig->pc_name); + result = ambig + 1; + } + fprintf(stderr, "\n"); + break; + } + return 0; +} + +void Parser_printhelp(char *cmd) +{ + char *argv[] = { "help", cmd }; + + Parser_help(2, argv); +} + +/* COMMANDS */ + +/** + * Parser_list_commands() - Output a list of the supported commands. + * @cmdlist: Array of structures describing the commands. + * @buffer: String buffer used to temporarily store the output text. + * @buf_size: Length of the string buffer. + * @parent_cmd: When called recursively, contains the name of the parent cmd. + * @col_start: Column where printing should begin. + * @col_num: The number of commands printed in a single row. + * + * The commands and subcommands supported by the utility are printed, arranged + * into several columns for readability. If a command supports subcommands, the + * function is called recursively, and the name of the parent command is + * supplied so that it can be prepended to the names of the subcommands. + * + * Return: The number of items that were printed. + */ +int Parser_list_commands(const command_t *cmdlist, char *buffer, + size_t buf_size, const char *parent_cmd, + int col_start, int col_num) +{ + int col = col_start; + int char_max; + int len; + int count = 0; + int rc; + + if (col_start >= col_num) + return 0; + + char_max = (buf_size - 1) / col_num; /* Reserve 1 char for NUL */ + + for (; cmdlist->pc_name; cmdlist++) { + if (!cmdlist->pc_func && !cmdlist->pc_sub_cmd) + break; + count++; + if (parent_cmd) + len = snprintf(&buffer[col * char_max], + char_max + 1, "%s %s", parent_cmd, + cmdlist->pc_name); + else + len = snprintf(&buffer[col * char_max], + char_max + 1, "%s", cmdlist->pc_name); + + /* Add trailing spaces to pad the entry to the column size */ + if (len < char_max) { + snprintf(&buffer[col * char_max] + len, + char_max - len + 1, "%*s", char_max - len, + " "); + } else { + buffer[(col + 1) * char_max - 1] = ' '; + } + + col++; + if (col >= col_num) { + fprintf(stdout, "%s\n", buffer); + col = 0; + buffer[0] = '\0'; + } + + if (cmdlist->pc_sub_cmd) { + rc = Parser_list_commands(cmdlist->pc_sub_cmd, buffer, + buf_size, cmdlist->pc_name, + col, col_num); + col = (col + rc) % col_num; + count += rc; + } + } + if (!parent_cmd && col != 0) + fprintf(stdout, "%s\n", buffer); + return count; +} + +char *Parser_getstr(const char *prompt, const char *deft, char *res, + size_t len) +{ + char *line = NULL; + int size = strlen(prompt) + strlen(deft) + 8; + char *theprompt; + + theprompt = malloc(size); + assert(theprompt); + + snprintf(theprompt, size, "%s [%s]: ", prompt, deft); + + line = readline(theprompt); + free(theprompt); + + /* + * The function strlcpy() cannot be used here because of + * this function is used in LNet utils that is not linked + * with libcfs.a. 
+ */ + if (!line || *line == '\0') + strncpy(res, deft, len); + else + strncpy(res, line, len); + res[len - 1] = '\0'; + + if (line) { + free(line); + return res; + } + return NULL; +} + +/* get integer from prompt, loop forever to get it */ +int Parser_getint(const char *prompt, long min, long max, long deft, int base) +{ + int rc; + long result; + char *line; + int size = strlen(prompt) + 40; + char *theprompt = malloc(size); + + assert(theprompt); + snprintf(theprompt, size, "%s [%ld, (0x%lx)]: ", prompt, deft, deft); + fflush(stdout); + + do { + line = NULL; + line = readline(theprompt); + if (!line) { + fprintf(stdout, "Please enter an integer.\n"); + fflush(stdout); + continue; + } + if (*line == '\0') { + free(line); + result = deft; + break; + } + rc = Parser_arg2int(line, &result, base); + free(line); + if (rc != 0) { + fprintf(stdout, "Invalid string.\n"); + fflush(stdout); + } else if (result > max || result < min) { + fprintf(stdout, + "Error: response must lie between %ld and %ld.\n", + min, max); + fflush(stdout); + } else { + break; + } + } while (1); + + if (theprompt) + free(theprompt); + return result; +} + +/* get boolean (starting with YyNn; loop forever */ +int Parser_getbool(const char *prompt, int deft) +{ + int result = 0; + char *line; + int size = strlen(prompt) + 8; + char *theprompt = malloc(size); + + assert(theprompt); + fflush(stdout); + + if (deft != 0 && deft != 1) { + fprintf(stderr, "Error: Parser_getbool given bad default %d\n", + deft); + assert(0); + } + snprintf(theprompt, size, "%s [%s]: ", prompt, (deft == 0) ? "N" : "Y"); + + do { + line = NULL; + line = readline(theprompt); + if (!line) { + result = deft; + break; + } + if (*line == '\0') { + result = deft; + break; + } + if (*line == 'y' || *line == 'Y') { + result = 1; + break; + } + if (*line == 'n' || *line == 'N') { + result = 0; + break; + } + if (line) + free(line); + fprintf(stdout, "Invalid string. Must start with yY or nN\n"); + fflush(stdout); + } while (1); + + if (line) + free(line); + if (theprompt) + free(theprompt); + + return result; +} + +/* parse int out of a string or prompt for it */ +long Parser_intarg(const char *inp, const char *prompt, int deft, + int min, int max, int base) +{ + long result; + int rc; + + rc = Parser_arg2int(inp, &result, base); + if (rc == 0) + return result; + else + return Parser_getint(prompt, deft, min, max, base); +} + +/* parse int out of a string or prompt for it */ +char *Parser_strarg(char *inp, const char *prompt, const char *deft, + char *answer, int len) +{ + if (!inp || *inp == '\0') + return Parser_getstr(prompt, deft, answer, len); + else + return inp; +} + +/* + * change a string into a number: return 0 on success. No invalid characters + * allowed. 
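+ * For example, "0x1a" with base 0 converts to 26, while "26abc" is rejected.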
The processing of base and validity follows strtol(3) + */ +int Parser_arg2int(const char *inp, long *result, int base) +{ + char *endptr; + + if ((base != 0) && (base < 2 || base > 36)) + return 1; + + *result = strtol(inp, &endptr, base); + + if (*inp != '\0' && *endptr == '\0') + return 0; + else + return 1; +} + +/* Convert human readable size string to and int; "1k" -> 1000 */ +int Parser_size(unsigned long *sizep, char *str) +{ + unsigned long size; + char mod[32]; + + switch (sscanf(str, "%lu%1[gGmMkK]", &size, mod)) { + default: + return -1; + case 1: + *sizep = size; + return 0; + case 2: + switch (*mod) { + case 'g': + case 'G': + *sizep = size << 30; + return 0; + case 'm': + case 'M': + *sizep = size << 20; + return 0; + case 'k': + case 'K': + *sizep = size << 10; + return 0; + default: + *sizep = size; + return 0; + } + } +} + +/* Convert a string boolean to an int; "enable" -> 1 */ +int Parser_bool(int *b, char *str) +{ + if (!strcasecmp(str, "no") || !strcasecmp(str, "n") || + !strcasecmp(str, "off") || !strcasecmp(str, "down") || + !strcasecmp(str, "disable")) { + *b = 0; + return 0; + } + + if (!strcasecmp(str, "yes") || !strcasecmp(str, "y") || + !strcasecmp(str, "on") || !strcasecmp(str, "up") || + !strcasecmp(str, "enable")) { + *b = 1; + return 0; + } + + return -1; +} + +int Parser_quit(int argc, char **argv) +{ + argc = argc; + argv = argv; + done = 1; + return 0; +} + +int Parser_version(int argc, char **argv) +{ + fprintf(stdout, "%s %s\n", program_invocation_short_name, + LUSTRE_VERSION_STRING); + return 0; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c new file mode 100644 index 0000000000000..700f002d721df --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c @@ -0,0 +1,526 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * String manipulation functions. + * + * libcfs/libcfs/util/string.c + * + * Author: Nathan Rutman + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Extracts tokens from strings. + * + * Looks for \a delim in string \a next, sets \a res to point to + * substring before the delimiter, sets \a next right after the found + * delimiter. 
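+ * Leading and trailing whitespace is stripped from the token returned in \a res; for example, tokenizing " ab , c" with delim ',' sets \a res to "ab" and leaves \a next pointing at " c".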
+ * + * \retval 1 if \a res points to a string of non-whitespace characters + * \retval 0 otherwise + */ +int +cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res) +{ + char *end; + + if (next->ls_str == NULL) + return 0; + + /* skip leading white spaces */ + while (next->ls_len) { + if (!isspace(*next->ls_str)) + break; + next->ls_str++; + next->ls_len--; + } + + if (next->ls_len == 0) /* whitespaces only */ + return 0; + + if (*next->ls_str == delim) { + /* first non-writespace is the delimiter */ + return 0; + } + + res->ls_str = next->ls_str; + end = memchr(next->ls_str, delim, next->ls_len); + if (end == NULL) { + /* there is no the delimeter in the string */ + end = next->ls_str + next->ls_len; + next->ls_str = NULL; + next->ls_len = 0; + } else { + next->ls_str = end + 1; + next->ls_len -= (end - res->ls_str + 1); + } + + /* skip ending whitespaces */ + while (--end != res->ls_str) { + if (!isspace(*end)) + break; + } + + res->ls_len = end - res->ls_str + 1; + return 1; +} + +/** + * Converts string to integer. + * + * Accepts decimal and hexadecimal number recordings. + * + * \retval 1 if first \a nob chars of \a str convert to decimal or + * hexadecimal integer in the range [\a min, \a max] + * \retval 0 otherwise + */ +int +cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max) +{ + char *endp; + + *num = strtoul(str, &endp, 0); + if (endp == str) + return 0; + + for (; endp < str + nob; endp++) { + if (!isspace(*endp)) + return 0; + } + + return (*num >= min && *num <= max); +} + +/** + * Parses \ token of the syntax. If \a bracketed is false, + * \a src should only have a single token which can be \ or \* + * + * \retval pointer to allocated range_expr and initialized + * range_expr::re_lo, range_expr::re_hi and range_expr:re_stride if \a + * src parses to + * \ | + * \ '-' \ | + * \ '-' \ '/' \ + * \retval 0 will be returned if it can be parsed, otherwise -EINVAL or + * -ENOMEM will be returned. + */ +static int +cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max, + int bracketed, struct cfs_range_expr **expr) +{ + struct cfs_range_expr *re; + struct cfs_lstr tok; + + re = calloc(1, sizeof(*re)); + if (re == NULL) + return -ENOMEM; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + re->re_lo = min; + re->re_hi = max; + re->re_stride = 1; + goto out; + } + + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_lo, min, max)) { + /* is parsed */ + re->re_hi = re->re_lo; + re->re_stride = 1; + goto out; + } + + if (!bracketed || !cfs_gettok(src, '-', &tok)) + goto failed; + + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_lo, min, max)) + goto failed; + + /* - */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_hi, min, max)) { + /* - is parsed */ + re->re_stride = 1; + goto out; + } + + /* go to check '-' '/' */ + if (cfs_gettok(src, '/', &tok)) { + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_hi, min, max)) + goto failed; + + /* - / ... */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_stride, min, max)) { + /* - / is parsed */ + goto out; + } + } + + out: + *expr = re; + return 0; + + failed: + free(re); + return -EINVAL; +} + +/** + * Print the range expression \a re into specified \a buffer. + * If \a bracketed is true, expression does not need additional + * brackets. 
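+ * For example, an expression with re_lo 0, re_hi 7 and re_stride 2 prints as "0-7/2", or as "[0-7/2]" when brackets are required.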
+ * + * \retval number of characters written + */ +static int +cfs_range_expr_print(char *buffer, int count, struct cfs_range_expr *expr, + bool bracketed) +{ + int i; + char s[] = "["; + char e[] = "]"; + + if (bracketed) + s[0] = e[0] = '\0'; + + if (expr->re_lo == expr->re_hi) + i = snprintf(buffer, count, "%u", expr->re_lo); + else if (expr->re_stride == 1) + i = snprintf(buffer, count, "%s%u-%u%s", + s, expr->re_lo, expr->re_hi, e); + else + i = snprintf(buffer, count, "%s%u-%u/%u%s", + s, expr->re_lo, expr->re_hi, + expr->re_stride, e); + return i; +} + +/** + * Print a list of range expressions (\a expr_list) into specified \a buffer. + * If the list contains several expressions, separate them with comma + * and surround the list with brackets. + * + * \retval number of characters written + */ +int +cfs_expr_list_print(char *buffer, int count, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + int i = 0, j = 0; + int numexprs = 0; + + if (count <= 0) + return 0; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) + numexprs++; + + if (numexprs > 1) + i += scnprintf(buffer + i, count - i, "["); + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (j++ != 0) + i += scnprintf(buffer + i, count - i, ","); + i += cfs_range_expr_print(buffer + i, count - i, expr, + numexprs > 1); + } + + if (numexprs > 1) + i += scnprintf(buffer + i, count - i, "]"); + + return i; +} + +/** + * Matches value (\a value) against ranges expression list \a expr_list. + * + * \retval 1 if \a value matches + * \retval 0 otherwise + */ +int +cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (value >= expr->re_lo && value <= expr->re_hi && + ((value - expr->re_lo) % expr->re_stride) == 0) + return 1; + } + + return 0; +} + +/** + * Convert express list (\a expr_list) to an array of all matched values + * + * \retval N N is total number of all matched values + * \retval 0 if expression list is empty + * \retval < 0 for failure + */ +int +cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp) +{ + struct cfs_range_expr *expr; + __u32 *val; + int count = 0; + int i; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + count++; + } + } + + if (count == 0) /* empty expression list */ + return 0; + + if (count > max) + return -EINVAL; + + val = calloc(sizeof(val[0]), count); + if (val == NULL) + return -ENOMEM; + + count = 0; + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + val[count++] = i; + } + } + + *valpp = val; + return count; +} + +void +cfs_expr_list_values_free(__u32 *values, int num) +{ + /* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed + * by OBD_FREE() if it's called by module other than libcfs & LNet, + * otherwise we will see fake memory leak */ + free(values); +} + +/** + * Frees cfs_range_expr structures of \a expr_list. + * + * \retval none + */ +void +cfs_expr_list_free(struct cfs_expr_list *expr_list) +{ + while (!list_empty(&expr_list->el_exprs)) { + struct cfs_range_expr *expr; + + expr = list_entry(expr_list->el_exprs.next, + struct cfs_range_expr, re_link); + list_del(&expr->re_link); + free(expr); + } + + free(expr_list); +} + +/** + * Parses \ token of the syntax. 
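+ * Accepted forms are a single number, "*", or a bracketed comma-separated
+ * list of ranges such as "[0-7/2,9]".
+ *
+ * A minimal, illustrative usage sketch of the helpers declared in this
+ * file (the values are made up):
+ *
+ *	struct cfs_expr_list *el;
+ *	int hit;
+ *
+ *	if (cfs_expr_list_parse("[0-7/2]", 7, 0, 255, &el) == 0) {
+ *		hit = cfs_expr_list_match(4, el);
+ *		cfs_expr_list_free(el);
+ *	}
+ *
+ * Here hit would be 1, because 4 is in the set {0, 2, 4, 6} described by
+ * the expression.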
+ * + * \retval 0 if \a str parses to \ | \ + * \retval -errno otherwise + */ +int +cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *expr; + struct cfs_lstr src; + int rc; + + expr_list = calloc(1, sizeof(*expr_list)); + if (expr_list == NULL) + return -ENOMEM; + + src.ls_str = str; + src.ls_len = len; + + INIT_LIST_HEAD(&expr_list->el_exprs); + + if (src.ls_str[0] == '[' && + src.ls_str[src.ls_len - 1] == ']') { + src.ls_str++; + src.ls_len -= 2; + + rc = -EINVAL; + while (src.ls_str != NULL) { + struct cfs_lstr tok; + + if (!cfs_gettok(&src, ',', &tok)) { + rc = -EINVAL; + break; + } + + rc = cfs_range_expr_parse(&tok, min, max, 1, &expr); + if (rc != 0) + break; + + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } else { + rc = cfs_range_expr_parse(&src, min, max, 0, &expr); + if (rc == 0) { + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } + + if (rc != 0) + cfs_expr_list_free(expr_list); + else + *elpp = expr_list; + + return rc; +} + +/** + * Frees cfs_expr_list structures of \a list. + * + * For each struct cfs_expr_list structure found on \a list it frees + * range_expr list attached to it and frees the cfs_expr_list itself. + * + * \retval none + */ +void +cfs_expr_list_free_list(struct list_head *list) +{ + struct cfs_expr_list *el; + + while (!list_empty(list)) { + el = list_entry(list->next, + struct cfs_expr_list, el_link); + list_del(&el->el_link); + cfs_expr_list_free(el); + } +} + +/** + * cfs_abs_path() - Get the absolute path of a relative path + * @request_path: The relative path to be resolved + * @resolved_path: Set to the resolved absolute path + * + * Returns the canonicalized absolute pathname. This function is a wrapper to + * realpath, but will work even if the target file does not exist. All + * directories in the path must exist. + * + * Return: On success, 0 is returned and resolved_path points to an allocated + * string containing the absolute pathname. On error, errno is set + * appropriately, -errno is returned, and resolved_path points to NULL. + */ +int cfs_abs_path(const char *request_path, char **resolved_path) +{ + char buf[PATH_MAX + 1] = ""; + char *path; + char *ptr; + int len; + int rc = 0; + const char *fmt; + + path = malloc(sizeof(buf)); + if (path == NULL) + return -ENOMEM; + + if (request_path[0] != '/') { + if (getcwd(path, sizeof(buf) - 1) == NULL) { + rc = -errno; + goto out; + } + len = snprintf(buf, sizeof(buf), "%s/%s", path, request_path); + if (len >= sizeof(buf)) { + rc = -ENAMETOOLONG; + goto out; + } + } else { + /* skip duplicate leading '/' */ + len = snprintf(buf, sizeof(buf), "%s", + request_path + strspn(request_path, "/") - 1); + if (len >= sizeof(buf)) { + rc = -ENAMETOOLONG; + goto out; + } + } + + /* if filename not in root directory, call realpath for parent path */ + ptr = strrchr(buf, '/'); + if (ptr != buf) { + *ptr = '\0'; + if (path != realpath(buf, path)) { + rc = -errno; + goto out; + } + /* add the filename back */ + len = strlen(path); + fmt = (path[len - 1] == '/') ? 
"%s" : "/%s"; + len = snprintf(path + len, sizeof(buf) - len, fmt, ptr + 1); + if (len >= sizeof(buf) - len) { + rc = -ENAMETOOLONG; + goto out; + } + } else { + len = snprintf(path, sizeof(buf), "%s", buf); + if (len >= sizeof(buf)) { + rc = -ENAMETOOLONG; + goto out; + } + } + +out: + if (rc == 0) { + *resolved_path = path; + } else { + *resolved_path = NULL; + free(path); + } + return rc; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c b/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c new file mode 100644 index 0000000000000..d2b9eb4f871ea --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c @@ -0,0 +1,462 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/libcfs/workitem.c + * + * Author: Isaac Huang + * Liang Zhen + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +#define CFS_WS_NAME_LEN 16 + +struct cfs_wi_sched { + struct list_head ws_list; /* chain on global list */ + /** serialised workitems */ + spinlock_t ws_lock; + /** where schedulers sleep */ + wait_queue_head_t ws_waitq; + /** concurrent workitems */ + struct list_head ws_runq; + /** rescheduled running-workitems, a workitem can be rescheduled + * while running in wi_action(), but we don't to execute it again + * unless it returns from wi_action(), so we put it on ws_rerunq + * while rescheduling, and move it to runq after it returns + * from wi_action() */ + struct list_head ws_rerunq; + /** CPT-table for this scheduler */ + struct cfs_cpt_table *ws_cptab; + /** CPT id for affinity */ + int ws_cpt; + /** number of scheduled workitems */ + int ws_nscheduled; + /** started scheduler thread, protected by cfs_wi_data::wi_glock */ + unsigned int ws_nthreads:30; + /** shutting down, protected by cfs_wi_data::wi_glock */ + unsigned int ws_stopping:1; + /** serialize starting thread, protected by cfs_wi_data::wi_glock */ + unsigned int ws_starting:1; + /** scheduler name */ + char ws_name[CFS_WS_NAME_LEN]; +}; + +static struct cfs_workitem_data { + /** serialize */ + spinlock_t wi_glock; + /** list of all schedulers */ + struct list_head wi_scheds; + /** WI module is initialized */ + int wi_init; + /** shutting down the whole WI module */ + int wi_stopping; +} cfs_wi_data; + +static inline int +cfs_wi_sched_cansleep(struct cfs_wi_sched *sched) +{ + spin_lock(&sched->ws_lock); + if (sched->ws_stopping) { + spin_unlock(&sched->ws_lock); + return 0; + } + + if (!list_empty(&sched->ws_runq)) { + spin_unlock(&sched->ws_lock); + return 0; + } + 
spin_unlock(&sched->ws_lock); + return 1; +} + +/* XXX: + * 0. it only works when called from wi->wi_action. + * 1. when it returns no one shall try to schedule the workitem. + */ +void +cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi) +{ + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + spin_lock(&sched->ws_lock); + + LASSERT(wi->wi_running); + + if (wi->wi_scheduled) { /* cancel pending schedules */ + LASSERT(!list_empty(&wi->wi_list)); + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + } + + LASSERT(list_empty(&wi->wi_list)); + + wi->wi_scheduled = 1; /* LBUG future schedule attempts */ + spin_unlock(&sched->ws_lock); +} +EXPORT_SYMBOL(cfs_wi_exit); + +/** + * cancel schedule request of workitem \a wi + */ +int +cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi) +{ + int rc; + + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + /* + * return 0 if it's running already, otherwise return 1, which + * means the workitem will not be scheduled and will not have + * any race with wi_action. + */ + spin_lock(&sched->ws_lock); + + rc = !(wi->wi_running); + + if (wi->wi_scheduled) { /* cancel pending schedules */ + LASSERT(!list_empty(&wi->wi_list)); + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + + wi->wi_scheduled = 0; + } + + LASSERT (list_empty(&wi->wi_list)); + + spin_unlock(&sched->ws_lock); + return rc; +} +EXPORT_SYMBOL(cfs_wi_deschedule); + +/* + * Workitem scheduled with (serial == 1) is strictly serialised not only with + * itself, but also with others scheduled this way. + * + * Now there's only one static serialised queue, but in the future more might + * be added, and even dynamic creation of serialised queues might be supported. + */ +void +cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi) +{ + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + spin_lock(&sched->ws_lock); + + if (!wi->wi_scheduled) { + LASSERT (list_empty(&wi->wi_list)); + + wi->wi_scheduled = 1; + sched->ws_nscheduled++; + if (!wi->wi_running) { + list_add_tail(&wi->wi_list, &sched->ws_runq); + wake_up(&sched->ws_waitq); + } else { + list_add(&wi->wi_list, &sched->ws_rerunq); + } + } + + LASSERT (!list_empty(&wi->wi_list)); + spin_unlock(&sched->ws_lock); +} +EXPORT_SYMBOL(cfs_wi_schedule); + +static int +cfs_wi_scheduler(void *arg) +{ + struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg; + + /* CPT affinity scheduler? 
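+ * Bind this worker thread to its CPU partition when a CPT table was supplied; a failed bind only produces a warning.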
*/ + if (sched->ws_cptab != NULL) + if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0) + CWARN("Unable to bind %s on CPU partition %d\n", + sched->ws_name, sched->ws_cpt); + + spin_lock(&cfs_wi_data.wi_glock); + + LASSERT(sched->ws_starting == 1); + sched->ws_starting--; + sched->ws_nthreads++; + + spin_unlock(&cfs_wi_data.wi_glock); + + spin_lock(&sched->ws_lock); + + while (!sched->ws_stopping) { + int nloops = 0; + int rc; + struct cfs_workitem *wi; + + while (!list_empty(&sched->ws_runq) && + nloops < CFS_WI_RESCHED) { + wi = list_entry(sched->ws_runq.next, + struct cfs_workitem, wi_list); + LASSERT(wi->wi_scheduled && !wi->wi_running); + + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + + wi->wi_running = 1; + wi->wi_scheduled = 0; + + spin_unlock(&sched->ws_lock); + nloops++; + + rc = (*wi->wi_action) (wi); + + spin_lock(&sched->ws_lock); + if (rc != 0) /* WI should be dead, even be freed! */ + continue; + + wi->wi_running = 0; + if (list_empty(&wi->wi_list)) + continue; + + LASSERT(wi->wi_scheduled); + /* wi is rescheduled, should be on rerunq now, we + * move it to runq so it can run action now */ + list_move_tail(&wi->wi_list, &sched->ws_runq); + } + + if (!list_empty(&sched->ws_runq)) { + spin_unlock(&sched->ws_lock); + /* don't sleep because some workitems still + * expect me to come back soon */ + cond_resched(); + spin_lock(&sched->ws_lock); + continue; + } + + spin_unlock(&sched->ws_lock); + rc = wait_event_interruptible_exclusive(sched->ws_waitq, + !cfs_wi_sched_cansleep(sched)); + spin_lock(&sched->ws_lock); + } + + spin_unlock(&sched->ws_lock); + + spin_lock(&cfs_wi_data.wi_glock); + sched->ws_nthreads--; + spin_unlock(&cfs_wi_data.wi_glock); + + return 0; +} + +void +cfs_wi_sched_destroy(struct cfs_wi_sched *sched) +{ + LASSERT(cfs_wi_data.wi_init); + LASSERT(!cfs_wi_data.wi_stopping); + + spin_lock(&cfs_wi_data.wi_glock); + if (sched->ws_stopping) { + CDEBUG(D_INFO, "%s is in progress of stopping\n", + sched->ws_name); + spin_unlock(&cfs_wi_data.wi_glock); + return; + } + + LASSERT(!list_empty(&sched->ws_list)); + sched->ws_stopping = 1; + + spin_unlock(&cfs_wi_data.wi_glock); + + wake_up_all(&sched->ws_waitq); + + spin_lock(&cfs_wi_data.wi_glock); + { + int i = 2; + + while (sched->ws_nthreads > 0) { + CDEBUG(is_power_of_2(++i / 20) ? 
D_WARNING : D_NET, + "waiting %us for %d %s worker threads to exit\n", + i / 20, sched->ws_nthreads, sched->ws_name); + + spin_unlock(&cfs_wi_data.wi_glock); + schedule_timeout_uninterruptible(cfs_time_seconds(1) + / 20); + spin_lock(&cfs_wi_data.wi_glock); + } + } + + list_del(&sched->ws_list); + + spin_unlock(&cfs_wi_data.wi_glock); + + LASSERT(sched->ws_nscheduled == 0); + + LIBCFS_FREE(sched, sizeof(*sched)); +} +EXPORT_SYMBOL(cfs_wi_sched_destroy); + +int +cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, + int cpt, int nthrs, struct cfs_wi_sched **sched_pp) +{ + struct cfs_wi_sched *sched; + + LASSERT(cfs_wi_data.wi_init); + LASSERT(!cfs_wi_data.wi_stopping); + LASSERT(cptab == NULL || cpt == CFS_CPT_ANY || + (cpt >= 0 && cpt < cfs_cpt_number(cptab))); + + LIBCFS_ALLOC(sched, sizeof(*sched)); + if (sched == NULL) + return -ENOMEM; + + if (strlen(name) > sizeof(sched->ws_name)-1) { + LIBCFS_FREE(sched, sizeof(*sched)); + return -E2BIG; + } + strlcpy(sched->ws_name, name, sizeof(sched->ws_name)); + + sched->ws_cptab = cptab; + sched->ws_cpt = cpt; + + spin_lock_init(&sched->ws_lock); + init_waitqueue_head(&sched->ws_waitq); + + INIT_LIST_HEAD(&sched->ws_runq); + INIT_LIST_HEAD(&sched->ws_rerunq); + INIT_LIST_HEAD(&sched->ws_list); + + for (; nthrs > 0; nthrs--) { + char name[16]; + struct task_struct *task; + + spin_lock(&cfs_wi_data.wi_glock); + while (sched->ws_starting > 0) { + spin_unlock(&cfs_wi_data.wi_glock); + schedule(); + spin_lock(&cfs_wi_data.wi_glock); + } + + sched->ws_starting++; + spin_unlock(&cfs_wi_data.wi_glock); + + if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) { + snprintf(name, sizeof(name), "%s_%02d_%02d", + sched->ws_name, sched->ws_cpt, + sched->ws_nthreads); + } else { + snprintf(name, sizeof(name), "%s_%02d", + sched->ws_name, sched->ws_nthreads); + } + + task = kthread_run(cfs_wi_scheduler, sched, "%s", name); + if (IS_ERR(task)) { + int rc = PTR_ERR(task); + + CERROR("Failed to create thread for " + "WI scheduler %s: %d\n", name, rc); + + spin_lock(&cfs_wi_data.wi_glock); + + /* make up for cfs_wi_sched_destroy */ + list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); + sched->ws_starting--; + + spin_unlock(&cfs_wi_data.wi_glock); + + cfs_wi_sched_destroy(sched); + return rc; + } + } + + spin_lock(&cfs_wi_data.wi_glock); + list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); + spin_unlock(&cfs_wi_data.wi_glock); + + *sched_pp = sched; + return 0; +} +EXPORT_SYMBOL(cfs_wi_sched_create); + +int +cfs_wi_startup(void) +{ + memset(&cfs_wi_data, 0, sizeof(struct cfs_workitem_data)); + + spin_lock_init(&cfs_wi_data.wi_glock); + INIT_LIST_HEAD(&cfs_wi_data.wi_scheds); + cfs_wi_data.wi_init = 1; + + return 0; +} + +void +cfs_wi_shutdown (void) +{ + struct cfs_wi_sched *sched; + + spin_lock(&cfs_wi_data.wi_glock); + cfs_wi_data.wi_stopping = 1; + spin_unlock(&cfs_wi_data.wi_glock); + + /* nobody should contend on this list */ + list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { + sched->ws_stopping = 1; + wake_up_all(&sched->ws_waitq); + } + + list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { + spin_lock(&cfs_wi_data.wi_glock); + + while (sched->ws_nthreads != 0) { + spin_unlock(&cfs_wi_data.wi_glock); + schedule_timeout_uninterruptible(cfs_time_seconds(1) + / 20); + spin_lock(&cfs_wi_data.wi_glock); + } + spin_unlock(&cfs_wi_data.wi_glock); + } + + while (!list_empty(&cfs_wi_data.wi_scheds)) { + sched = list_entry(cfs_wi_data.wi_scheds.next, + struct cfs_wi_sched, ws_list); + list_del(&sched->ws_list); + LIBCFS_FREE(sched, 
sizeof(*sched)); + } + + cfs_wi_data.wi_stopping = 0; + cfs_wi_data.wi_init = 0; +} diff --git a/drivers/staging/lustrefsx/lnet/Kconfig b/drivers/staging/lustrefsx/lnet/Kconfig new file mode 100644 index 0000000000000..0d0686a25fe1e --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/Kconfig @@ -0,0 +1,37 @@ +config LUSTREFSX_LNET + tristate "Lustre networking subsystem (LNet)" + select LUSTREFSX_LIBCFS + depends on m + depends on INET + help + The Lustre network layer, also known as LNet, is a networking abstaction + level API that was initially created to allow Lustre Filesystem to utilize + very different networks like tcp and ib verbs in a uniform way. In the + case of Lustre routers only the LNet layer is required. Lately other + projects are also looking into using LNet as their networking API as well. + +config LUSTREFSX_LNET_SELFTEST + tristate "Lustre networking self testing" + depends on m + depends on LUSTREFSX_LNET + help + Choose Y here if you want to do lnet self testing. To compile this + as a module, choose M here: the module will be called lnet_selftest. + + If unsure, say N. + + See also http://wiki.lustre.org/ + +config LUSTREFSX_LNET_XPRT_IB + tristate "LNET infiniband support" + depends on m + depends on LUSTREFSX_LNET && INFINIBAND && INFINIBAND_ADDR_TRANS + default LUSTREFSX_LNET && INFINIBAND + help + This option allows the LNET users to use infiniband as an + RDMA-enabled transport. + + To compile this as a kernel module, choose M here and it will be + called ko2iblnd. + + If unsure, say N. diff --git a/drivers/staging/lustrefsx/lnet/LICENSE b/drivers/staging/lustrefsx/lnet/LICENSE new file mode 100644 index 0000000000000..92728f4d300d2 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/LICENSE @@ -0,0 +1,363 @@ +Each file in this distribution should contain a header stating the +copyright owner(s), and the licensing terms for that module. Some +files are not eligible for copyright protection, and contain neither. + +All files in this subtree are licensed under the terms and conditions +of the GNU General Public License version 2. + +Reproduced below is the GPL v2, and Linus's clarifying statement from +the Linux kernel source code: + +---------------------------------------- + + NOTE! This copyright does *not* cover user programs that use kernel + services by normal system calls - this is merely considered normal use + of the kernel, and does *not* fall under the heading of "derived work". + Also note that the GPL below is copyrighted by the Free Software + Foundation, but the instance of code that it refers to (the Linux + kernel) is copyrighted by me and others who actually wrote it. + + Linus Torvalds + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. 
(Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. 
BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) 19yy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. 
Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/drivers/staging/lustrefsx/lnet/Makefile b/drivers/staging/lustrefsx/lnet/Makefile new file mode 100644 index 0000000000000..7ee52eb559025 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += lnet/ +obj-$(CONFIG_LUSTREFSX_LNET) += klnds/ +obj-$(CONFIG_LUSTREFSX_LNET_SELFTEST) += selftest/ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/api.h b/drivers/staging/lustrefsx/lnet/include/lnet/api.h new file mode 100644 index 0000000000000..89be9c68e003e --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/api.h @@ -0,0 +1,171 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LNET_API_H__ +#define __LNET_API_H__ + +/** \defgroup lnet LNet + * + * The Lustre Networking subsystem. + * + * LNet is an asynchronous message-passing API, which provides an unreliable + * connectionless service that can't guarantee any order. It supports OFA IB, + * TCP/IP, and Cray Portals, and routes between heterogeneous networks. + * @{ + */ + +#ifndef __KERNEL__ +# error This include is only for kernel use. +#endif + +#include + +/** \defgroup lnet_init_fini Initialization and cleanup + * The LNet must be properly initialized before any LNet calls can be made. + * @{ */ +int LNetNIInit(lnet_pid_t requested_pid); +int LNetNIFini(void); +/** @} lnet_init_fini */ + +/** \defgroup lnet_addr LNet addressing and basic types + * + * Addressing scheme and basic data types of LNet. + * + * The LNet API is memory-oriented, so LNet must be able to address not only + * end-points but also memory region within a process address space. + * An ::lnet_nid_t addresses an end-point. An ::lnet_pid_t identifies a process + * in a node. A portal represents an opening in the address space of a + * process. Match bits is criteria to identify a region of memory inside a + * portal, and offset specifies an offset within the memory region. 
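+ * Taken together, a (nid, pid, portal index, match bits, offset) tuple therefore identifies a particular piece of memory on a particular peer.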
+ * + * LNet creates a table of portals for each process during initialization. + * This table has MAX_PORTALS entries and its size can't be dynamically + * changed. A portal stays empty until the owning process starts to add + * memory regions to it. A portal is sometimes called an index because + * it's an entry in the portals table of a process. + * + * \see LNetMEAttach + * @{ */ +int LNetGetId(unsigned int index, struct lnet_processid *id); +int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order); +lnet_nid_t LNetPrimaryNID(lnet_nid_t nid); +bool LNetIsPeerLocal(lnet_nid_t nid); + +/** @} lnet_addr */ + + +/** \defgroup lnet_me Match entries + * + * A match entry (abbreviated as ME) describes a set of criteria to accept + * incoming requests. + * + * A portal is essentially a match list plus a set of attributes. A match + * list is a chain of MEs. Each ME includes a pointer to a memory descriptor + * and a set of match criteria. The match criteria can be used to reject + * incoming requests based on process ID or the match bits provided in the + * request. MEs can be dynamically inserted into a match list by LNetMEAttach(), + * and must then be attached to an MD with LNetMDAttach(). + * @{ */ +struct lnet_me * +LNetMEAttach(unsigned int portal, + struct lnet_processid *match_id_in, + __u64 match_bits_in, + __u64 ignore_bits_in, + enum lnet_unlink unlink_in, + enum lnet_ins_pos pos_in); +/** @} lnet_me */ + +/** \defgroup lnet_md Memory descriptors + * + * A memory descriptor contains information about a region of a user's + * memory (either in kernel or user space) and optionally points to an + * event queue where information about the operations performed on the + * memory descriptor are recorded. Memory descriptor is abbreviated as + * MD and can be used interchangeably with the memory region it describes. + * + * The LNet API provides two operations to create MDs: LNetMDAttach() + * and LNetMDBind(); one operation to unlink and release the resources + * associated with a MD: LNetMDUnlink(). + * @{ */ +int LNetMDAttach(struct lnet_me *current_in, + const struct lnet_md *md_in, + enum lnet_unlink unlink_in, + struct lnet_handle_md *md_handle_out); + +int LNetMDBind(const struct lnet_md *md_in, + enum lnet_unlink unlink_in, + struct lnet_handle_md *md_handle_out); + +int __LNetMDUnlink(struct lnet_handle_md md_in, bool discard); +#define LNetMDUnlink(handle) __LNetMDUnlink(handle, false) + +void lnet_assert_handler_unused(lnet_handler_t handler); +/** @} lnet_md */ + +/** \defgroup lnet_data Data movement operations + * + * The LNet API provides two data movement operations: LNetPut() + * and LNetGet(). + * @{ */ +int LNetPut(lnet_nid_t self, + struct lnet_handle_md md_in, + enum lnet_ack_req ack_req_in, + struct lnet_process_id target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in, + __u64 hdr_data_in); + +int LNetGet(lnet_nid_t self, + struct lnet_handle_md md_in, + struct lnet_process_id target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in, + bool recovery); +/** @} lnet_data */ + + +/** \defgroup lnet_misc Miscellaneous operations. + * Miscellaneous operations. 
+ * @{ */ + +int LNetSetLazyPortal(int portal); +int LNetClearLazyPortal(int portal); +int LNetCtl(unsigned int cmd, void *arg); +void LNetDebugPeer(struct lnet_processid *id); +int LNetGetPeerDiscoveryStatus(void); +int LNetAddPeer(lnet_nid_t *nids, __u32 num_nids); + +/** @} lnet_misc */ + +/** @} lnet */ +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h new file mode 100644 index 0000000000000..223c6d328bf26 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h @@ -0,0 +1,1243 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/include/lnet/lib-lnet.h + * + * Top level include for library side routines + */ + +#ifndef __LNET_LIB_LNET_H__ +#define __LNET_LIB_LNET_H__ + +/* LNET has 0xeXXX */ +#define CFS_FAIL_PTLRPC_OST_BULK_CB2 0xe000 + +#include + +#include +#include +#include +#include +#include +#include +#include + +extern struct lnet the_lnet; /* THE network */ + +#if (BITS_PER_LONG == 32) +/* 2 CPTs, allowing more CPTs might make us under memory pressure */ +# define LNET_CPT_MAX_BITS 1 + +#else /* 64-bit system */ +/* + * 256 CPTs for thousands of CPUs, allowing more CPTs might make us + * under risk of consuming all lh_cookie. + */ +# define LNET_CPT_MAX_BITS 8 +#endif /* BITS_PER_LONG == 32 */ + +/* max allowed CPT number */ +#define LNET_CPT_MAX (1 << LNET_CPT_MAX_BITS) + +#define LNET_CPT_NUMBER (the_lnet.ln_cpt_number) +#define LNET_CPT_BITS (the_lnet.ln_cpt_bits) +#define LNET_CPT_MASK ((1ULL << LNET_CPT_BITS) - 1) + +/** exclusive lock */ +#define LNET_LOCK_EX CFS_PERCPT_LOCK_EX + +/* default timeout and credits */ +#define DEFAULT_PEER_TIMEOUT 180 +#define DEFAULT_PEER_CREDITS 8 +#define DEFAULT_CREDITS 256 + +/* default number of connections per peer */ +#define DEFAULT_CONNS_PER_PEER 0 + +#ifdef HAVE_KERN_SOCK_GETNAME_2ARGS +#define lnet_kernel_getpeername(sock, addr, addrlen) \ + kernel_getpeername(sock, addr) +#define lnet_kernel_getsockname(sock, addr, addrlen) \ + kernel_getsockname(sock, addr) +#else +#define lnet_kernel_getpeername(sock, addr, addrlen) \ + kernel_getpeername(sock, addr, addrlen) +#define lnet_kernel_getsockname(sock, addr, addrlen) \ + kernel_getsockname(sock, addr, addrlen) +#endif + +/* + * kernel 5.3: commit ef11db3310e272d3d8dbe8739e0770820dd20e52 + * kernel 4.18.0-193.el8: + * added in_dev_for_each_ifa_rtnl and in_dev_for_each_ifa_rcu + * and removed for_ifa and endfor_ifa. 
+ * Use the _rtnl variant as the current locking is rtnl. + */ +#ifdef HAVE_IN_DEV_FOR_EACH_IFA_RTNL +#define DECLARE_CONST_IN_IFADDR(ifa) const struct in_ifaddr *ifa +#define endfor_ifa(in_dev) +#else +#define DECLARE_CONST_IN_IFADDR(ifa) +#define in_dev_for_each_ifa_rtnl(ifa, in_dev) for_ifa((in_dev)) +#define in_dev_for_each_ifa_rcu(ifa, in_dev) for_ifa((in_dev)) +#endif + +#ifndef fallthrough +# if defined(__GNUC__) && __GNUC__ >= 7 +# define fallthrough __attribute__((fallthrough)) /* fallthrough */ +# else +# define fallthrough do {} while (0) /* fallthrough */ +# endif +#endif + +int choose_ipv4_src(__u32 *ret, + int interface, __u32 dst_ipaddr, struct net *ns); + +bool lnet_is_route_alive(struct lnet_route *route); +bool lnet_is_gateway_alive(struct lnet_peer *gw); + +static inline int lnet_is_wire_handle_none(struct lnet_handle_wire *wh) +{ + return (wh->wh_interface_cookie == LNET_WIRE_HANDLE_COOKIE_NONE && + wh->wh_object_cookie == LNET_WIRE_HANDLE_COOKIE_NONE); +} + +static inline int lnet_md_exhausted(struct lnet_libmd *md) +{ + return (md->md_threshold == 0 || + ((md->md_options & LNET_MD_MAX_SIZE) != 0 && + md->md_offset + md->md_max_size > md->md_length)); +} + +static inline int lnet_md_unlinkable(struct lnet_libmd *md) +{ + /* Should unlink md when its refcount is 0 and either: + * - md has been flagged for deletion (by auto unlink or LNetM[DE]Unlink, + * in the latter case md may not be exhausted). + * - auto unlink is on and md is exhausted. + */ + if (md->md_refcount != 0) + return 0; + + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) != 0) + return 1; + + return ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0 && + lnet_md_exhausted(md)); +} + +#define lnet_cpt_table() (the_lnet.ln_cpt_table) +#define lnet_cpt_current() cfs_cpt_current(the_lnet.ln_cpt_table, 1) + +static inline int +lnet_cpt_of_cookie(__u64 cookie) +{ + unsigned int cpt = (cookie >> LNET_COOKIE_TYPE_BITS) & LNET_CPT_MASK; + + /* LNET_CPT_NUMBER doesn't have to be a power of 2, which means we can + * get an illegal cpt from an invalid cookie */ + return cpt < LNET_CPT_NUMBER ?
cpt : cpt % LNET_CPT_NUMBER; +} + +static inline void +lnet_res_lock(int cpt) +{ + cfs_percpt_lock(the_lnet.ln_res_lock, cpt); +} + +static inline void +lnet_res_unlock(int cpt) +{ + cfs_percpt_unlock(the_lnet.ln_res_lock, cpt); +} + +static inline int +lnet_res_lock_current(void) +{ + int cpt = lnet_cpt_current(); + + lnet_res_lock(cpt); + return cpt; +} + +static inline void +lnet_net_lock(int cpt) +{ + cfs_percpt_lock(the_lnet.ln_net_lock, cpt); +} + +static inline void +lnet_net_unlock(int cpt) +{ + cfs_percpt_unlock(the_lnet.ln_net_lock, cpt); +} + +static inline int +lnet_net_lock_current(void) +{ + int cpt = lnet_cpt_current(); + + lnet_net_lock(cpt); + return cpt; +} + +#define LNET_LOCK() lnet_net_lock(LNET_LOCK_EX) +#define LNET_UNLOCK() lnet_net_unlock(LNET_LOCK_EX) + +#define lnet_ptl_lock(ptl) spin_lock(&(ptl)->ptl_lock) +#define lnet_ptl_unlock(ptl) spin_unlock(&(ptl)->ptl_lock) +#define lnet_ni_lock(ni) spin_lock(&(ni)->ni_lock) +#define lnet_ni_unlock(ni) spin_unlock(&(ni)->ni_lock) + +#define MAX_PORTALS 64 + +#define LNET_SMALL_MD_SIZE offsetof(struct lnet_libmd, md_kiov[1]) +extern struct kmem_cache *lnet_mes_cachep; /* MEs kmem_cache */ +extern struct kmem_cache *lnet_small_mds_cachep; /* <= LNET_SMALL_MD_SIZE bytes + * MDs kmem_cache */ +extern struct kmem_cache *lnet_udsp_cachep; +extern struct kmem_cache *lnet_rspt_cachep; +extern struct kmem_cache *lnet_msg_cachep; + +static inline bool +lnet_ni_set_status_locked(struct lnet_ni *ni, __u32 status) +__must_hold(&ni->ni_lock) +{ + bool update = false; + + if (ni->ni_status && ni->ni_status->ns_status != status) { + CDEBUG(D_NET, "ni %s status changed from %#x to %#x\n", + libcfs_nidstr(&ni->ni_nid), + ni->ni_status->ns_status, status); + ni->ni_status->ns_status = status; + update = true; + } + + return update; +} + +static inline unsigned int +lnet_ni_get_status_locked(struct lnet_ni *ni) +__must_hold(&ni->ni_lock) +{ + if (nid_is_lo0(&ni->ni_nid)) + return LNET_NI_STATUS_UP; + else if (atomic_read(&ni->ni_fatal_error_on)) + return LNET_NI_STATUS_DOWN; + else if (ni->ni_status) + return ni->ni_status->ns_status; + else + return LNET_NI_STATUS_UP; +} + +static inline bool +lnet_ni_set_status(struct lnet_ni *ni, __u32 status) +{ + bool update; + + lnet_ni_lock(ni); + update = lnet_ni_set_status_locked(ni, status); + lnet_ni_unlock(ni); + + return update; +} + +static inline void lnet_md_wait_handling(struct lnet_libmd *md, int cpt) +{ + wait_queue_head_t *wq = __var_waitqueue(md); +#if defined(HAVE_WAIT_BIT_QUEUE_ENTRY) || !defined(HAVE_WAIT_VAR_EVENT) + struct wait_bit_queue_entry entry; + wait_queue_entry_t *wqe = &entry.wq_entry; +#else + struct wait_bit_queue entry; + wait_queue_entry_t *wqe = &entry.wait; +#endif + init_wait_var_entry(&entry, md, 0); + prepare_to_wait_event(wq, wqe, TASK_IDLE); + if (md->md_flags & LNET_MD_FLAG_HANDLING) { + /* Race with unlocked call to ->md_handler. + * It is safe to drop the res_lock here as the + * caller has only just claimed it. + */ + lnet_res_unlock(cpt); + schedule(); + /* Cannot check md now, it might be freed. Caller + * must reclaim reference and check. 
+ */ + lnet_res_lock(cpt); + } + finish_wait(wq, wqe); +} + +static inline void +lnet_md_free(struct lnet_libmd *md) +{ + unsigned int size; + + LASSERTF(md->md_rspt_ptr == NULL, "md %p rsp %p\n", md, md->md_rspt_ptr); + + size = offsetof(struct lnet_libmd, md_kiov[md->md_niov]); + + if (size <= LNET_SMALL_MD_SIZE) { + CDEBUG(D_MALLOC, "slab-freed 'md' at %p.\n", md); + kmem_cache_free(lnet_small_mds_cachep, md); + } else { + LIBCFS_FREE(md, size); + } +} + +struct lnet_libhandle *lnet_res_lh_lookup(struct lnet_res_container *rec, + __u64 cookie); +void lnet_res_lh_initialize(struct lnet_res_container *rec, + struct lnet_libhandle *lh); +static inline void +lnet_res_lh_invalidate(struct lnet_libhandle *lh) +{ + /* ALWAYS called with resource lock held */ + /* NB: cookie is still useful, don't reset it */ + list_del(&lh->lh_hash_chain); +} + +static inline void +lnet_md2handle(struct lnet_handle_md *handle, struct lnet_libmd *md) +{ + handle->cookie = md->md_lh.lh_cookie; +} + +static inline struct lnet_libmd * +lnet_handle2md(struct lnet_handle_md *handle) +{ + /* ALWAYS called with resource lock held */ + struct lnet_libhandle *lh; + int cpt; + + cpt = lnet_cpt_of_cookie(handle->cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], + handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, struct lnet_libmd, md_lh); +} + +static inline struct lnet_libmd * +lnet_wire_handle2md(struct lnet_handle_wire *wh) +{ + /* ALWAYS called with resource lock held */ + struct lnet_libhandle *lh; + int cpt; + + if (wh->wh_interface_cookie != the_lnet.ln_interface_cookie) + return NULL; + + cpt = lnet_cpt_of_cookie(wh->wh_object_cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], + wh->wh_object_cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, struct lnet_libmd, md_lh); +} + +static inline void +lnet_peer_net_addref_locked(struct lnet_peer_net *lpn) +{ + atomic_inc(&lpn->lpn_refcount); +} + +extern void lnet_destroy_peer_net_locked(struct lnet_peer_net *lpn); + +static inline void +lnet_peer_net_decref_locked(struct lnet_peer_net *lpn) +{ + if (atomic_dec_and_test(&lpn->lpn_refcount)) + lnet_destroy_peer_net_locked(lpn); +} + +static inline void +lnet_peer_addref_locked(struct lnet_peer *lp) +{ + atomic_inc(&lp->lp_refcount); +} + +extern void lnet_destroy_peer_locked(struct lnet_peer *lp); + +static inline void +lnet_peer_decref_locked(struct lnet_peer *lp) +{ + if (atomic_dec_and_test(&lp->lp_refcount)) + lnet_destroy_peer_locked(lp); +} + +static inline void +lnet_peer_ni_addref_locked(struct lnet_peer_ni *lp) +{ + kref_get(&lp->lpni_kref); +} + +extern void lnet_destroy_peer_ni_locked(struct kref *ref); + +static inline void +lnet_peer_ni_decref_locked(struct lnet_peer_ni *lp) +{ + kref_put(&lp->lpni_kref, lnet_destroy_peer_ni_locked); +} + +static inline int +lnet_isrouter(struct lnet_peer_ni *lpni) +{ + return lpni->lpni_peer_net->lpn_peer->lp_rtr_refcount != 0; +} + +static inline void +lnet_ni_addref_locked(struct lnet_ni *ni, int cpt) +{ + LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); + LASSERT(*ni->ni_refs[cpt] >= 0); + + (*ni->ni_refs[cpt])++; +} + +static inline void +lnet_ni_addref(struct lnet_ni *ni) +{ + lnet_net_lock(0); + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); +} + +static inline void +lnet_ni_decref_locked(struct lnet_ni *ni, int cpt) +{ + LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); + LASSERT(*ni->ni_refs[cpt] > 0); + + (*ni->ni_refs[cpt])--; +} + +static inline void +lnet_ni_decref(struct lnet_ni *ni) +{ + 
lnet_net_lock(0); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); +} + +static inline struct lnet_msg * +lnet_msg_alloc(void) +{ + struct lnet_msg *msg; + + msg = kmem_cache_zalloc(lnet_msg_cachep, GFP_NOFS); + + return (msg); +} + +static inline void +lnet_msg_free(struct lnet_msg *msg) +{ + LASSERT(!msg->msg_onactivelist); + kmem_cache_free(lnet_msg_cachep, msg); +} + +static inline struct lnet_rsp_tracker * +lnet_rspt_alloc(int cpt) +{ + struct lnet_rsp_tracker *rspt; + + rspt = kmem_cache_zalloc(lnet_rspt_cachep, GFP_NOFS); + if (rspt) { + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->lct_health.lch_rst_alloc++; + lnet_net_unlock(cpt); + } + CDEBUG(D_MALLOC, "rspt alloc %p\n", rspt); + return rspt; +} + +static inline void +lnet_rspt_free(struct lnet_rsp_tracker *rspt, int cpt) +{ + CDEBUG(D_MALLOC, "rspt free %p\n", rspt); + + kmem_cache_free(lnet_rspt_cachep, rspt); + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->lct_health.lch_rst_alloc--; + lnet_net_unlock(cpt); +} + +void lnet_ni_free(struct lnet_ni *ni); +void lnet_net_free(struct lnet_net *net); + +struct lnet_net * +lnet_net_alloc(__u32 net_type, struct list_head *netlist); + +struct lnet_ni * +lnet_ni_alloc(struct lnet_net *net, struct cfs_expr_list *el, + char *iface); +struct lnet_ni * +lnet_ni_alloc_w_cpt_array(struct lnet_net *net, __u32 *cpts, __u32 ncpts, + char *iface); + +static inline int +lnet_nid2peerhash(struct lnet_nid *nid) +{ + u32 h = 0; + int i; + + for (i = 0; i < 4; i++) + h = hash_32(nid->nid_addr[i]^h, 32); + return hash_32(LNET_NID_NET(nid) ^ h, LNET_PEER_HASH_BITS); +} + +static inline struct list_head * +lnet_net2rnethash(__u32 net) +{ + return &the_lnet.ln_remote_nets_hash[(LNET_NETNUM(net) + + LNET_NETTYP(net)) & + ((1U << the_lnet.ln_remote_nets_hbits) - 1)]; +} + +static inline void lnet_hdr_from_nid4(struct lnet_hdr *hdr, + const struct lnet_hdr_nid4 *vhdr) +{ + const struct _lnet_hdr_nid4 *hdr_nid4 = (void *)vhdr; + + lnet_nid4_to_nid(le64_to_cpu(hdr_nid4->dest_nid), &hdr->dest_nid); + lnet_nid4_to_nid(le64_to_cpu(hdr_nid4->src_nid), &hdr->src_nid); + hdr->dest_pid = le32_to_cpu(hdr_nid4->dest_pid); + hdr->src_pid = le32_to_cpu(hdr_nid4->src_pid); + hdr->type = le32_to_cpu(hdr_nid4->type); + hdr->payload_length = le32_to_cpu(hdr_nid4->payload_length); + + hdr->msg = hdr_nid4->msg; +} + +static inline void lnet_hdr_to_nid4(const struct lnet_hdr *hdr, + struct lnet_hdr_nid4 *vhdr) +{ + struct _lnet_hdr_nid4 *hdr_nid4 = (void *)vhdr; + + hdr_nid4->dest_nid = cpu_to_le64(lnet_nid_to_nid4(&hdr->dest_nid)); + hdr_nid4->src_nid = cpu_to_le64(lnet_nid_to_nid4(&hdr->src_nid)); + hdr_nid4->dest_pid = cpu_to_le32(hdr->dest_pid); + hdr_nid4->src_pid = cpu_to_le32(hdr->src_pid); + hdr_nid4->type = cpu_to_le32(hdr->type); + hdr_nid4->payload_length = cpu_to_le32(hdr->payload_length); + + hdr_nid4->msg = hdr->msg; +} + +static inline void lnet_hdr_from_nid16(struct lnet_hdr *hdr, + const struct lnet_hdr_nid16 *vhdr) +{ + const struct lnet_hdr *hdr16 = (void *)vhdr; + + hdr->dest_nid = hdr16->dest_nid; + hdr->src_nid = hdr16->src_nid; + hdr->dest_pid = le32_to_cpu(hdr16->dest_pid); + hdr->src_pid = le32_to_cpu(hdr16->src_pid); + hdr->type = le32_to_cpu(hdr16->type); + hdr->payload_length = le32_to_cpu(hdr16->payload_length); +} + +static inline void lnet_hdr_to_nid16(const struct lnet_hdr *hdr, + struct lnet_hdr_nid16 *vhdr) +{ + struct lnet_hdr *hdr16 = (void *)vhdr; + + hdr16->dest_nid = hdr->dest_nid; + hdr16->src_nid = hdr->src_nid; + hdr16->dest_pid = cpu_to_le32(hdr->dest_pid); + hdr16->src_pid 
= cpu_to_le32(hdr->src_pid); + hdr16->type = cpu_to_le32(hdr->type); + hdr16->payload_length = cpu_to_le32(hdr->payload_length); +} + +extern const struct lnet_lnd the_lolnd; +extern int avoid_asym_router_failure; + +extern unsigned int lnet_nid_cpt_hash(struct lnet_nid *nid, + unsigned int number); +extern int lnet_cpt_of_nid_locked(struct lnet_nid *nid, struct lnet_ni *ni); +extern int lnet_cpt_of_nid(lnet_nid_t nid, struct lnet_ni *ni); +extern int lnet_nid2cpt(struct lnet_nid *nid, struct lnet_ni *ni); +extern struct lnet_ni *lnet_nid2ni_locked(lnet_nid_t nid, int cpt); +extern struct lnet_ni *lnet_nid_to_ni_locked(struct lnet_nid *nid, int cpt); +extern struct lnet_ni *lnet_nid2ni_addref(lnet_nid_t nid); +extern struct lnet_ni *lnet_net2ni_locked(__u32 net, int cpt); +extern struct lnet_ni *lnet_net2ni_addref(__u32 net); +extern struct lnet_ni *lnet_nid_to_ni_addref(struct lnet_nid *nid); +struct lnet_net *lnet_get_net_locked(__u32 net_id); + +int lnet_lib_init(void); +void lnet_lib_exit(void); + +extern unsigned int lnet_response_tracking; +extern unsigned lnet_transaction_timeout; +extern unsigned lnet_retry_count; +extern unsigned int lnet_lnd_timeout; +extern unsigned int lnet_numa_range; +extern unsigned int lnet_health_sensitivity; +extern unsigned int lnet_recovery_interval; +extern unsigned int lnet_recovery_limit; +extern unsigned int lnet_peer_discovery_disabled; +extern unsigned int lnet_drop_asym_route; +extern unsigned int router_sensitivity_percentage; +extern int alive_router_check_interval; +extern int live_router_check_interval; +extern int dead_router_check_interval; +extern int portal_rotor; + +void lnet_mt_event_handler(struct lnet_event *event); + +int lnet_notify(struct lnet_ni *ni, lnet_nid_t peer, bool alive, bool reset, + time64_t when); +void lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, + time64_t when); +int lnet_add_route(__u32 net, __u32 hops, struct lnet_nid *gateway, + __u32 priority, __u32 sensitivity); +int lnet_del_route(__u32 net, struct lnet_nid *gw_nid); +void lnet_move_route(struct lnet_route *route, struct lnet_peer *lp, + struct list_head *rt_list); +void lnet_destroy_routes(void); +int lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive, __u32 *priority, + __u32 *sensitivity); +int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg); +struct lnet_ni *lnet_get_next_ni_locked(struct lnet_net *mynet, + struct lnet_ni *prev); +struct lnet_ni *lnet_get_ni_idx_locked(int idx); +int lnet_get_net_healthv_locked(struct lnet_net *net); + +extern int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp, + struct libcfs_ioctl_hdr __user *uparam); +extern int lnet_get_peer_list(__u32 *countp, __u32 *sizep, + struct lnet_process_id __user *ids); +extern void lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all); +extern void lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni, + struct list_head *queue, + time64_t now); +extern int lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, + struct lnet_nid *nid); +extern void lnet_peer_clr_pref_nids(struct lnet_peer_ni *lpni); +extern int lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, + struct lnet_nid *nid); +void lnet_peer_ni_set_selection_priority(struct lnet_peer_ni *lpni, + __u32 priority); +extern void lnet_ni_add_to_recoveryq_locked(struct lnet_ni *ni, + struct list_head *queue, + time64_t now); + +void lnet_router_debugfs_init(void); +void lnet_router_debugfs_fini(void); +int lnet_rtrpools_alloc(int 
im_a_router); +void lnet_destroy_rtrbuf(struct lnet_rtrbuf *rb, int npages); +int lnet_rtrpools_adjust(int tiny, int small, int large); +int lnet_rtrpools_enable(void); +void lnet_rtrpools_disable(void); +void lnet_rtrpools_free(int keep_pools); +void lnet_rtr_transfer_to_peer(struct lnet_peer *src, + struct lnet_peer *target); +struct lnet_remotenet *lnet_find_rnet_locked(__u32 net); +int lnet_dyn_add_net(struct lnet_ioctl_config_data *conf); +int lnet_dyn_del_net(__u32 net); +int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf); +int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf); +int lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason); +struct lnet_net *lnet_get_net_locked(__u32 net_id); +void lnet_net_clr_pref_rtrs(struct lnet_net *net); +int lnet_net_add_pref_rtr(struct lnet_net *net, struct lnet_nid *gw_nid); + +int lnet_islocalnid(struct lnet_nid *nid); +int lnet_islocalnet(__u32 net); +int lnet_islocalnet_locked(__u32 net); + +void lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, + unsigned int offset, unsigned int mlen); +void lnet_build_unlink_event(struct lnet_libmd *md, struct lnet_event *ev); +void lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type); +void lnet_msg_commit(struct lnet_msg *msg, int cpt); +void lnet_msg_decommit(struct lnet_msg *msg, int cpt, int status); + +void lnet_prep_send(struct lnet_msg *msg, int type, + struct lnet_processid *target, unsigned int offset, + unsigned int len); +int lnet_send(struct lnet_nid *nid, struct lnet_msg *msg, + struct lnet_nid *rtr_nid); +int lnet_send_ping(struct lnet_nid *dest_nid, struct lnet_handle_md *mdh, + int nnis, void *user_ptr, lnet_handler_t handler, + bool recovery); +void lnet_return_tx_credits_locked(struct lnet_msg *msg); +void lnet_return_rx_credits_locked(struct lnet_msg *msg); +void lnet_schedule_blocked_locked(struct lnet_rtrbufpool *rbp); +void lnet_drop_routed_msgs_locked(struct list_head *list, int cpt); + +struct list_head **lnet_create_array_of_queues(void); + +/* portals functions */ +/* portals attributes */ +static inline int +lnet_ptl_is_lazy(struct lnet_portal *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_LAZY); +} + +static inline int +lnet_ptl_is_unique(struct lnet_portal *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_MATCH_UNIQUE); +} + +static inline int +lnet_ptl_is_wildcard(struct lnet_portal *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_MATCH_WILDCARD); +} + +static inline void +lnet_ptl_setopt(struct lnet_portal *ptl, int opt) +{ + ptl->ptl_options |= opt; +} + +static inline void +lnet_ptl_unsetopt(struct lnet_portal *ptl, int opt) +{ + ptl->ptl_options &= ~opt; +} + +/* match-table functions */ +struct list_head *lnet_mt_match_head(struct lnet_match_table *mtable, + struct lnet_processid *id, __u64 mbits); +struct lnet_match_table *lnet_mt_of_attach(unsigned int index, + struct lnet_processid *id, + __u64 mbits, __u64 ignore_bits, + enum lnet_ins_pos pos); +int lnet_mt_match_md(struct lnet_match_table *mtable, + struct lnet_match_info *info, struct lnet_msg *msg); + +/* portals match/attach functions */ +void lnet_ptl_attach_md(struct lnet_me *me, struct lnet_libmd *md, + struct list_head *matches, struct list_head *drops); +void lnet_ptl_detach_md(struct lnet_me *me, struct lnet_libmd *md); +int lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg); + +/* initialized and finalize portals */ +int lnet_portals_create(void); +void lnet_portals_destroy(void); + +/* message functions */ +int lnet_parse(struct 
lnet_ni *ni, struct lnet_hdr *hdr, + struct lnet_nid *fromnid, void *private, int rdma_req); +int lnet_parse_local(struct lnet_ni *ni, struct lnet_msg *msg); +int lnet_parse_forward_locked(struct lnet_ni *ni, struct lnet_msg *msg); + +void lnet_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int offset, unsigned int mlen, + unsigned int rlen); +void lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int offset, + unsigned int mlen, unsigned int rlen); +void lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg); + +struct lnet_msg *lnet_create_reply_msg(struct lnet_ni *ni, + struct lnet_msg *get_msg); +void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg, + unsigned int len); +void lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt); +void lnet_clean_zombie_rstqs(void); + +bool lnet_md_discarded(struct lnet_libmd *md); +void lnet_finalize(struct lnet_msg *msg, int rc); +bool lnet_send_error_simulation(struct lnet_msg *msg, + enum lnet_msg_hstatus *hstatus); +void lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni); + +void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, + unsigned int nob, __u32 msg_type); +void lnet_drop_delayed_msg_list(struct list_head *head, char *reason); +void lnet_recv_delayed_msg_list(struct list_head *head); + +int lnet_msg_container_setup(struct lnet_msg_container *container, int cpt); +void lnet_msg_container_cleanup(struct lnet_msg_container *container); +void lnet_msg_containers_destroy(void); +int lnet_msg_containers_create(void); + +char *lnet_health_error2str(enum lnet_msg_hstatus hstatus); +char *lnet_msgtyp2str(int type); +int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold); + +/** \addtogroup lnet_fault_simulation @{ */ + +int lnet_fault_ctl(int cmd, struct libcfs_ioctl_data *data); +int lnet_fault_init(void); +void lnet_fault_fini(void); + +bool lnet_drop_rule_match(struct lnet_hdr *hdr, lnet_nid_t local_nid, + enum lnet_msg_hstatus *hstatus); + +int lnet_delay_rule_add(struct lnet_fault_attr *attr); +int lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown); +int lnet_delay_rule_list(int pos, struct lnet_fault_attr *attr, + struct lnet_fault_stat *stat); +void lnet_delay_rule_reset(void); +void lnet_delay_rule_check(void); +bool lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg); + +/** @} lnet_fault_simulation */ + +void lnet_counters_get_common(struct lnet_counters_common *common); +int lnet_counters_get(struct lnet_counters *counters); +void lnet_counters_reset(void); +static inline void +lnet_ni_set_sel_priority_locked(struct lnet_ni *ni, __u32 priority) +{ + ni->ni_sel_priority = priority; +} + +static inline void +lnet_net_set_sel_priority_locked(struct lnet_net *net, __u32 priority) +{ + net->net_sel_priority = priority; +} + +unsigned int lnet_iov_nob(unsigned int niov, struct kvec *iov); +unsigned int lnet_kiov_nob(unsigned int niov, struct bio_vec *iov); +int lnet_extract_kiov(int dst_niov, struct bio_vec *dst, + int src_niov, struct bio_vec *src, + unsigned int offset, unsigned int len); + +void lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, + unsigned int doffset, + unsigned int nsiov, struct kvec *siov, + unsigned int soffset, unsigned int nob); +void lnet_copy_kiov2iov(unsigned int niov, struct kvec *iov, + unsigned int iovoffset, + unsigned int nkiov, struct bio_vec *kiov, + unsigned int kiovoffset, unsigned int nob); +void lnet_copy_iov2kiov(unsigned int nkiov, 
struct bio_vec *kiov, + unsigned int kiovoffset, + unsigned int niov, struct kvec *iov, + unsigned int iovoffset, unsigned int nob); +void lnet_copy_kiov2kiov(unsigned int ndkiov, struct bio_vec *dkiov, + unsigned int doffset, + unsigned int nskiov, struct bio_vec *skiov, + unsigned int soffset, unsigned int nob); + +static inline void +lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset, + unsigned int nsiov, struct bio_vec *skiov, + unsigned int soffset, unsigned int nob) +{ + struct kvec diov = { .iov_base = dest, .iov_len = dlen }; + + lnet_copy_kiov2iov(1, &diov, doffset, + nsiov, skiov, soffset, nob); +} + +static inline void +lnet_copy_flat2kiov(unsigned int ndiov, struct bio_vec *dkiov, + unsigned int doffset, int slen, void *src, + unsigned int soffset, unsigned int nob) +{ + struct kvec siov = { .iov_base = src, .iov_len = slen }; + lnet_copy_iov2kiov(ndiov, dkiov, doffset, + 1, &siov, soffset, nob); +} + +void lnet_me_unlink(struct lnet_me *me); + +void lnet_md_unlink(struct lnet_libmd *md); +void lnet_md_deconstruct(struct lnet_libmd *lmd, struct lnet_event *ev); +struct page *lnet_kvaddr_to_page(unsigned long vaddr); +struct page *lnet_get_first_page(struct lnet_libmd *md, unsigned int offset); +int lnet_cpt_of_md(struct lnet_libmd *md, unsigned int offset); + +unsigned int lnet_get_lnd_timeout(void); +void lnet_register_lnd(const struct lnet_lnd *lnd); +void lnet_unregister_lnd(const struct lnet_lnd *lnd); + +struct socket *lnet_connect(struct lnet_nid *peer_nid, int interface, + struct sockaddr *peeraddr, struct net *ns); +void lnet_connect_console_error(int rc, struct lnet_nid *peer_nid, + struct sockaddr *sa); +int lnet_count_acceptor_nets(void); +int lnet_acceptor_timeout(void); +int lnet_acceptor_port(void); +int lnet_acceptor_start(void); +void lnet_acceptor_stop(void); + +struct lnet_inetdev { + u32 li_cpt; + u32 li_flags; + u32 li_ipaddr; + u32 li_netmask; + char li_name[IFNAMSIZ]; +}; + +int lnet_inet_enumerate(struct lnet_inetdev **dev_list, struct net *ns); +void lnet_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize); +void lnet_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize); +int lnet_sock_getaddr(struct socket *socket, bool remote, + struct sockaddr_storage *peer); +int lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout); +int lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout); + +struct socket *lnet_sock_listen(int port, int backlog, + struct net *ns); +struct socket *lnet_sock_connect(int interface, int local_port, + struct sockaddr *peeraddr, + struct net *ns); + +int lnet_peers_start_down(void); +int lnet_peer_buffer_credits(struct lnet_net *net); +void lnet_consolidate_routes_locked(struct lnet_peer *orig_lp, + struct lnet_peer *new_lp); +void lnet_router_discovery_complete(struct lnet_peer *lp); +void lnet_router_discovery_ping_reply(struct lnet_peer *lp); + +int lnet_monitor_thr_start(void); +void lnet_monitor_thr_stop(void); + +bool lnet_router_checker_active(void); +void lnet_check_routers(void); +void lnet_wait_router_start(void); +void lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf); + +int lnet_ping_info_validate(struct lnet_ping_info *pinfo); +struct lnet_ping_buffer *lnet_ping_buffer_alloc(int nnis, gfp_t gfp); +void lnet_ping_buffer_free(struct lnet_ping_buffer *pbuf); + +static inline void lnet_ping_buffer_addref(struct lnet_ping_buffer *pbuf) +{ + atomic_inc(&pbuf->pb_refcnt); +} + +static inline void lnet_ping_buffer_decref(struct lnet_ping_buffer *pbuf) +{ 
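+ /* The last put wakes any waiters on pb_refcnt and frees the buffer. */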
+ if (atomic_dec_and_test(&pbuf->pb_refcnt)) { + wake_up_var(&pbuf->pb_refcnt); + lnet_ping_buffer_free(pbuf); + } +} + +static inline int lnet_push_target_resize_needed(void) +{ + return the_lnet.ln_push_target->pb_nnis < the_lnet.ln_push_target_nnis; +} + +int lnet_push_target_resize(void); +int lnet_push_target_post(struct lnet_ping_buffer *pbuf, + struct lnet_handle_md *mdh); +void lnet_peer_push_event(struct lnet_event *ev); + +int lnet_parse_ip2nets(const char **networksp, const char *ip2nets); +int lnet_parse_routes(const char *route_str, int *im_a_router); +int lnet_parse_networks(struct list_head *nilist, const char *networks); +bool lnet_net_unique(__u32 net_id, struct list_head *nilist, + struct lnet_net **net); +bool lnet_ni_unique_net(struct list_head *nilist, char *iface); +void lnet_incr_dlc_seq(void); +__u32 lnet_get_dlc_seq_locked(void); + +struct lnet_peer_net *lnet_get_next_peer_net_locked(struct lnet_peer *lp, + __u32 prev_lpn_id); +struct lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer, + struct lnet_peer_net *peer_net, + struct lnet_peer_ni *prev); +struct lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, lnet_nid_t pref, + int cpt); +struct lnet_peer_ni *lnet_peerni_by_nid_locked(struct lnet_nid *nid, + struct lnet_nid *pref, + int cpt); +struct lnet_peer_ni *lnet_nid2peerni_ex(struct lnet_nid *nid, int cpt); +struct lnet_peer_ni *lnet_peer_get_ni_locked(struct lnet_peer *lp, + lnet_nid_t nid); +struct lnet_peer_ni *lnet_peer_ni_get_locked(struct lnet_peer *lp, + struct lnet_nid *nid); +struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid); +struct lnet_peer_ni *lnet_peer_ni_find_locked(struct lnet_nid *nid); +struct lnet_peer *lnet_find_peer4(lnet_nid_t nid); +struct lnet_peer *lnet_find_peer(struct lnet_nid *nid); +void lnet_peer_net_added(struct lnet_net *net); +void lnet_peer_primary_nid_locked(struct lnet_nid *nid, + struct lnet_nid *result); +int lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block); +void lnet_peer_queue_message(struct lnet_peer *lp, struct lnet_msg *msg); +int lnet_peer_discovery_start(void); +void lnet_peer_discovery_stop(void); +void lnet_push_update_to_peers(int force); +void lnet_peer_tables_cleanup(struct lnet_net *net); +void lnet_peer_uninit(void); +int lnet_peer_tables_create(void); +void lnet_debug_peer(lnet_nid_t nid); +struct lnet_peer_net *lnet_peer_get_net_locked(struct lnet_peer *peer, + __u32 net_id); +bool lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, + struct lnet_nid *nid); +int lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, struct lnet_nid *nid); +void lnet_peer_clr_pref_nids(struct lnet_peer_ni *lpni); +bool lnet_peer_is_pref_rtr_locked(struct lnet_peer_ni *lpni, + struct lnet_nid *gw_nid); +void lnet_peer_clr_pref_rtrs(struct lnet_peer_ni *lpni); +int lnet_peer_add_pref_rtr(struct lnet_peer_ni *lpni, struct lnet_nid *nid); +int lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, + struct lnet_nid *nid); +int lnet_add_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid, bool mr, bool temp); +int lnet_del_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid); +int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk); +int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, + char alivness[LNET_MAX_STR_LEN], + __u32 *cpt_iter, __u32 *refcount, + __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits, + __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis, + __u32 *peer_tx_qnob); +int lnet_get_peer_ni_hstats(struct lnet_ioctl_peer_ni_hstats *stats); + +static 
inline void +lnet_peer_net_set_sel_priority_locked(struct lnet_peer_net *lpn, __u32 priority) +{ + lpn->lpn_sel_priority = priority; +} + + +static inline struct lnet_peer_net * +lnet_find_peer_net_locked(struct lnet_peer *peer, __u32 net_id) +{ + struct lnet_peer_net *peer_net; + + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + if (peer_net->lpn_net_id == net_id) + return peer_net; + } + + return NULL; +} + +static inline bool +lnet_peer_is_multi_rail(struct lnet_peer *lp) +{ + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + return true; + return false; +} + +static inline bool +lnet_peer_ni_is_configured(struct lnet_peer_ni *lpni) +{ + if (lpni->lpni_peer_net->lpn_peer->lp_state & LNET_PEER_CONFIGURED) + return true; + return false; +} + +static inline bool +lnet_peer_ni_is_primary(struct lnet_peer_ni *lpni) +{ + return nid_same(&lpni->lpni_nid, + &lpni->lpni_peer_net->lpn_peer->lp_primary_nid); +} + +bool lnet_peer_is_uptodate(struct lnet_peer *lp); +bool lnet_peer_is_uptodate_locked(struct lnet_peer *lp); +bool lnet_is_discovery_disabled(struct lnet_peer *lp); +bool lnet_is_discovery_disabled_locked(struct lnet_peer *lp); +bool lnet_peer_gw_discovery(struct lnet_peer *lp); + +static inline bool +lnet_peer_needs_push(struct lnet_peer *lp) +{ + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) + return false; + if (lp->lp_state & LNET_PEER_MARK_DELETED) + return false; + if (lp->lp_state & LNET_PEER_FORCE_PUSH) + return true; + if (lp->lp_state & LNET_PEER_NO_DISCOVERY) + return false; + /* if discovery is not enabled then no need to push */ + if (lnet_peer_discovery_disabled) + return false; + if (lp->lp_node_seqno < atomic_read(&the_lnet.ln_ping_target_seqno)) + return true; + return false; +} + +#define LNET_RECOVERY_INTERVAL_MAX 900 +static inline unsigned int +lnet_get_next_recovery_ping(unsigned int ping_count, time64_t now) +{ + unsigned int interval; + + /* 2^9 = 512, 2^10 = 1024 */ + if (ping_count > 9) + interval = LNET_RECOVERY_INTERVAL_MAX; + else + interval = 1 << ping_count; + + return now + interval; +} + +static inline void +lnet_peer_ni_set_next_ping(struct lnet_peer_ni *lpni, time64_t now) +{ + lpni->lpni_next_ping = + lnet_get_next_recovery_ping(lpni->lpni_ping_count, now); +} + +static inline void +lnet_ni_set_next_ping(struct lnet_ni *ni, time64_t now) +{ + ni->ni_next_ping = lnet_get_next_recovery_ping(ni->ni_ping_count, now); +} + +/* + * A peer NI is alive if it satisfies the following two conditions: + * 1. peer NI health >= LNET_MAX_HEALTH_VALUE * router_sensitivity_percentage + * 2. 
the cached NI status received when we discover the peer is UP + */ +static inline bool +lnet_is_peer_ni_alive(struct lnet_peer_ni *lpni) +{ + bool halive = false; + + halive = (atomic_read(&lpni->lpni_healthv) >= + (LNET_MAX_HEALTH_VALUE * router_sensitivity_percentage / 100)); + + return halive && lpni->lpni_ns_status == LNET_NI_STATUS_UP; +} + +static inline void +lnet_update_peer_net_healthv(struct lnet_peer_ni *lpni) +{ + struct lnet_peer_net *lpn; + int best_healthv = 0; + + lpn = lpni->lpni_peer_net; + + list_for_each_entry(lpni, &lpn->lpn_peer_nis, lpni_peer_nis) { + int lpni_healthv = atomic_read(&lpni->lpni_healthv); + if (best_healthv < lpni_healthv) + best_healthv = lpni_healthv; + } + + lpn->lpn_healthv = best_healthv; +} + +static inline void +lnet_set_lpni_healthv_locked(struct lnet_peer_ni *lpni, int value) +{ + if (atomic_read(&lpni->lpni_healthv) == value) + return; + atomic_set(&lpni->lpni_healthv, value); + lnet_update_peer_net_healthv(lpni); +} + +static inline bool +lnet_atomic_add_unless_max(atomic_t *v, int a, int u) +{ + int c = atomic_read(v); + bool mod = false; + int old; + int m; + + if (c == u) + return mod; + + for (;;) { + if (c + a >= u) + m = u; + else + m = c + a; + old = atomic_cmpxchg(v, c, m); + + if (old == u) + break; + + if (old == c) { + mod = true; + break; + } + c = old; + } + + return mod; +} + +static inline void +lnet_inc_lpni_healthv_locked(struct lnet_peer_ni *lpni, int value) +{ + /* only adjust the net health if the lpni health value changed */ + if (lnet_atomic_add_unless_max(&lpni->lpni_healthv, value, + LNET_MAX_HEALTH_VALUE)) + lnet_update_peer_net_healthv(lpni); +} + +static inline void +lnet_inc_healthv(atomic_t *healthv, int value) +{ + lnet_atomic_add_unless_max(healthv, value, LNET_MAX_HEALTH_VALUE); +} + +static inline int +lnet_get_list_len(struct list_head *list) +{ + struct list_head *l; + int count = 0; + + list_for_each(l, list) + count++; + + return count; +} + +void lnet_incr_stats(struct lnet_element_stats *stats, + enum lnet_msg_type msg_type, + enum lnet_stats_type stats_type); + +__u32 lnet_sum_stats(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type); + +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats, + struct lnet_element_stats *stats); + +static inline void +lnet_set_route_aliveness(struct lnet_route *route, bool alive) +{ + bool old = atomic_xchg(&route->lr_alive, alive); + + if (old != alive) + CERROR("route to %s through %s has gone from %s to %s\n", + libcfs_net2str(route->lr_net), + libcfs_nidstr(&route->lr_gateway->lp_primary_nid), + old ? "up" : "down", + alive ? "up" : "down"); +} +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h new file mode 100644 index 0000000000000..0df6857d89573 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h @@ -0,0 +1,1338 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/include/lnet/lib-types.h + * + * Types used by the library side routines that do not need to be + * exposed to the user application + */ + +#ifndef __LNET_LIB_TYPES_H__ +#define __LNET_LIB_TYPES_H__ + +#ifndef __KERNEL__ +# error This include is only for kernel use. +#endif + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +char *libcfs_nidstr_r(const struct lnet_nid *nid, + char *buf, size_t buf_size); + +static inline char *libcfs_nidstr(const struct lnet_nid *nid) +{ + return libcfs_nidstr_r(nid, libcfs_next_nidstring(), + LNET_NIDSTR_SIZE); +} + +int libcfs_strnid(struct lnet_nid *nid, const char *str); +char *libcfs_idstr(struct lnet_processid *id); + +int cfs_match_nid_net(struct lnet_nid *nid, u32 net, + struct list_head *net_num_list, + struct list_head *addr); + +/* Max payload size */ +#define LNET_MAX_PAYLOAD LNET_MTU + +/** limit on the number of fragments in discontiguous MDs */ +#define LNET_MAX_IOV 256 + +/* + * This is the maximum health value. + * All local and peer NIs created have their health default to this value. + */ +#define LNET_MAX_HEALTH_VALUE 1000 +#define LNET_MAX_SELECTION_PRIORITY UINT_MAX + +/* forward refs */ +struct lnet_libmd; + +enum lnet_msg_hstatus { + LNET_MSG_STATUS_OK = 0, + LNET_MSG_STATUS_LOCAL_INTERRUPT, + LNET_MSG_STATUS_LOCAL_DROPPED, + LNET_MSG_STATUS_LOCAL_ABORTED, + LNET_MSG_STATUS_LOCAL_NO_ROUTE, + LNET_MSG_STATUS_LOCAL_ERROR, + LNET_MSG_STATUS_LOCAL_TIMEOUT, + LNET_MSG_STATUS_REMOTE_ERROR, + LNET_MSG_STATUS_REMOTE_DROPPED, + LNET_MSG_STATUS_REMOTE_TIMEOUT, + LNET_MSG_STATUS_NETWORK_TIMEOUT, + LNET_MSG_STATUS_END, +}; + +struct lnet_rsp_tracker { + /* chain on the waiting list */ + struct list_head rspt_on_list; + /* cpt to lock */ + int rspt_cpt; + /* nid of next hop */ + struct lnet_nid rspt_next_hop_nid; + /* deadline of the REPLY/ACK */ + ktime_t rspt_deadline; + /* parent MD */ + struct lnet_handle_md rspt_mdh; +}; + +struct lnet_msg { + struct list_head msg_activelist; + struct list_head msg_list; /* Q for credits/MD */ + + struct lnet_processid msg_target; + /* Primary NID of the source. */ + struct lnet_nid msg_initiator; + /* where is it from, it's only for building event */ + struct lnet_nid msg_from; + __u32 msg_type; + + /* + * hold parameters in case message is with held due + * to discovery + */ + struct lnet_nid msg_src_nid_param; + struct lnet_nid msg_rtr_nid_param; + + /* + * Deadline for the message after which it will be finalized if it + * has not completed. + */ + ktime_t msg_deadline; + + /* The message health status. 
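One of the enum lnet_msg_hstatus values above; LNET_MSG_STATUS_OK means no failure was recorded.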
*/ + enum lnet_msg_hstatus msg_health_status; + /* This is a recovery message */ + bool msg_recovery; + /* force an RDMA even if the message size is < 4K */ + bool msg_rdma_force; + /* the number of times a transmission has been retried */ + int msg_retry_count; + /* flag to indicate that we do not want to resend this message */ + bool msg_no_resend; + + /* committed for sending */ + unsigned int msg_tx_committed:1; + /* CPT # this message committed for sending */ + unsigned int msg_tx_cpt:15; + /* committed for receiving */ + unsigned int msg_rx_committed:1; + /* CPT # this message committed for receiving */ + unsigned int msg_rx_cpt:15; + /* queued for tx credit */ + unsigned int msg_tx_delayed:1; + /* queued for RX buffer */ + unsigned int msg_rx_delayed:1; + /* ready for pending on RX delay list */ + unsigned int msg_rx_ready_delay:1; + + unsigned int msg_vmflush:1; /* VM trying to free memory */ + unsigned int msg_target_is_router:1; /* sending to a router */ + unsigned int msg_routing:1; /* being forwarded */ + unsigned int msg_ack:1; /* ack on finalize (PUT) */ + unsigned int msg_sending:1; /* outgoing message */ + unsigned int msg_receiving:1; /* being received */ + unsigned int msg_txcredit:1; /* taken an NI send credit */ + unsigned int msg_peertxcredit:1; /* taken a peer send credit */ + unsigned int msg_rtrcredit:1; /* taken a globel router credit */ + unsigned int msg_peerrtrcredit:1; /* taken a peer router credit */ + unsigned int msg_onactivelist:1; /* on the activelist */ + unsigned int msg_rdma_get:1; + + struct lnet_peer_ni *msg_txpeer; /* peer I'm sending to */ + struct lnet_peer_ni *msg_rxpeer; /* peer I received from */ + + void *msg_private; + struct lnet_libmd *msg_md; + /* the NI the message was sent or received over */ + struct lnet_ni *msg_txni; + struct lnet_ni *msg_rxni; + + unsigned int msg_len; + unsigned int msg_wanted; + unsigned int msg_offset; + unsigned int msg_niov; + struct bio_vec *msg_kiov; + + struct lnet_event msg_ev; + struct lnet_hdr msg_hdr; +}; + +struct lnet_libhandle { + struct list_head lh_hash_chain; + __u64 lh_cookie; +}; + +#define lh_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) + +struct lnet_me { + struct list_head me_list; + int me_cpt; + struct lnet_processid me_match_id; + unsigned int me_portal; + unsigned int me_pos; /* hash offset in mt_hash */ + __u64 me_match_bits; + __u64 me_ignore_bits; + enum lnet_unlink me_unlink; + struct lnet_libmd *me_md; +}; + +struct lnet_libmd { + struct list_head md_list; + struct lnet_libhandle md_lh; + struct lnet_me *md_me; + char *md_start; + unsigned int md_offset; + unsigned int md_length; + unsigned int md_max_size; + int md_threshold; + int md_refcount; + unsigned int md_options; + unsigned int md_flags; + unsigned int md_niov; /* # frags at end of struct */ + void *md_user_ptr; + struct lnet_rsp_tracker *md_rspt_ptr; + lnet_handler_t md_handler; + struct lnet_handle_md md_bulk_handle; + struct bio_vec md_kiov[LNET_MAX_IOV]; +}; + +#define LNET_MD_FLAG_ZOMBIE BIT(0) +#define LNET_MD_FLAG_AUTO_UNLINK BIT(1) +#define LNET_MD_FLAG_ABORTED BIT(2) +/* LNET_MD_FLAG_HANDLING is set when a non-unlink event handler + * is being called for an event relating to the md. + * It ensures only one such handler runs at a time. + * The final "unlink" event is only called once the + * md_refcount has reached zero, and this flag has been cleared, + * ensuring that it doesn't race with any other event handler + * call. 
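+ * (lnet_md_wait_handling() in lib-lnet.h waits while this flag is set + * before the caller re-checks the MD.)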
+ */ +#define LNET_MD_FLAG_HANDLING BIT(3) +#define LNET_MD_FLAG_DISCARD BIT(4) + +struct lnet_test_peer { + /* info about peers we are trying to fail */ + struct list_head tp_list; /* ln_test_peers */ + struct lnet_nid tp_nid; /* matching nid */ + unsigned int tp_threshold; /* # failures to simulate */ +}; + +#define LNET_COOKIE_TYPE_MD 1 +#define LNET_COOKIE_TYPE_ME 2 +#define LNET_COOKIE_TYPE_EQ 3 +#define LNET_COOKIE_TYPE_BITS 2 +#define LNET_COOKIE_MASK ((1ULL << LNET_COOKIE_TYPE_BITS) - 1ULL) + +struct netstrfns { + u32 nf_type; + char *nf_name; + char *nf_modname; + void (*nf_addr2str)(u32 addr, char *str, size_t size); + void (*nf_addr2str_size)(const __be32 *addr, size_t asize, + char *str, size_t size); + int (*nf_str2addr)(const char *str, int nob, u32 *addr); + int (*nf_str2addr_size)(const char *str, int nob, + __be32 *addr, size_t *asize); + int (*nf_parse_addrlist)(char *str, int len, + struct list_head *list); + int (*nf_print_addrlist)(char *buffer, int count, + struct list_head *list); + int (*nf_match_addr)(u32 addr, struct list_head *list); + int (*nf_min_max)(struct list_head *nidlist, u32 *min_nid, + u32 *max_nid); +}; + +struct lnet_ni; /* forward ref */ +struct socket; + +struct lnet_lnd { + /* fields initialized by the LND */ + __u32 lnd_type; + + int (*lnd_startup)(struct lnet_ni *ni); + void (*lnd_shutdown)(struct lnet_ni *ni); + int (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg); + + /* In data movement APIs below, payload buffers are described as a set + * of 'niov' fragments which are in pages. + * The LND may NOT overwrite these fragment descriptors. + * An 'offset' may specify a byte offset within the set of + * fragments to start from. + */ + + /* Start sending a preformatted message. 'private' is NULL for PUT and + * GET messages; otherwise this is a response to an incoming message + * and 'private' is the 'private' passed to lnet_parse(). Return + * non-zero for immediate failure, otherwise complete later with + * lnet_finalize() */ + int (*lnd_send)(struct lnet_ni *ni, void *private, + struct lnet_msg *msg); + + /* Start receiving 'mlen' bytes of payload data, skipping the following + * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to + * lnet_parse(). Return non-zero for immediate failure, otherwise + * complete later with lnet_finalize(). This also gives back a receive + * credit if the LND does flow control. */ + int (*lnd_recv)(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int niov, + struct bio_vec *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); + + /* lnet_parse() has had to delay processing of this message + * (e.g. waiting for a forwarding buffer or send credits). Give the + * LND a chance to free urgently needed resources. If called, return 0 + * for success and do NOT give back a receive credit; that has to wait + * until lnd_recv() gets called. On failure return < 0 and + * release resources; lnd_recv() will not be called.
*/ + int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, + struct lnet_msg *msg, void **new_privatep); + + /* notification of peer down */ + void (*lnd_notify_peer_down)(struct lnet_nid *peer); + + /* accept a new connection */ + int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock); + + /* get dma_dev priority */ + unsigned int (*lnd_get_dev_prio)(struct lnet_ni *ni, + unsigned int dev_idx); +}; + +struct lnet_tx_queue { + int tq_credits; /* # tx credits free */ + int tq_credits_min; /* lowest it's been */ + int tq_credits_max; /* total # tx credits */ + struct list_head tq_delayed; /* delayed TXs */ +}; + +enum lnet_net_state { + /* set when net block is allocated */ + LNET_NET_STATE_INIT = 0, + /* set when NIs in net are started successfully */ + LNET_NET_STATE_ACTIVE, + /* set if all NIs in net are in FAILED state */ + LNET_NET_STATE_INACTIVE, + /* set when shutting down a NET */ + LNET_NET_STATE_DELETING +}; + +enum lnet_ni_state { + /* initial state when NI is created */ + LNET_NI_STATE_INIT = 0, + /* set when NI is brought up */ + LNET_NI_STATE_ACTIVE, + /* set when NI is being shutdown */ + LNET_NI_STATE_DELETING, +}; + +#define LNET_NI_RECOVERY_PENDING BIT(0) +#define LNET_NI_RECOVERY_FAILED BIT(1) + +enum lnet_stats_type { + LNET_STATS_TYPE_SEND = 0, + LNET_STATS_TYPE_RECV, + LNET_STATS_TYPE_DROP +}; + +struct lnet_comm_count { + atomic_t co_get_count; + atomic_t co_put_count; + atomic_t co_reply_count; + atomic_t co_ack_count; + atomic_t co_hello_count; +}; + +struct lnet_element_stats { + struct lnet_comm_count el_send_stats; + struct lnet_comm_count el_recv_stats; + struct lnet_comm_count el_drop_stats; +}; + +struct lnet_health_local_stats { + atomic_t hlt_local_interrupt; + atomic_t hlt_local_dropped; + atomic_t hlt_local_aborted; + atomic_t hlt_local_no_route; + atomic_t hlt_local_timeout; + atomic_t hlt_local_error; +}; + +struct lnet_health_remote_stats { + atomic_t hlt_remote_dropped; + atomic_t hlt_remote_timeout; + atomic_t hlt_remote_error; + atomic_t hlt_network_timeout; +}; + +struct lnet_net { + /* chain on the ln_nets */ + struct list_head net_list; + + /* net ID, which is composed of + * (net_type << 16) | net_num. 
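+ * LNET_NETTYP() and LNET_NETNUM(), used e.g. by lnet_net2rnethash() in + * lib-lnet.h, recover the type and number halves.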
+ * net_type can be one of the enumerated types defined in + * lnet/include/lnet/nidstr.h */ + __u32 net_id; + + /* round robin selection */ + __u32 net_seq; + + /* total number of CPTs in the array */ + __u32 net_ncpts; + + /* cumulative CPTs of all NIs in this net */ + __u32 *net_cpts; + + /* relative net selection priority */ + __u32 net_sel_priority; + + /* network tunables */ + struct lnet_ioctl_config_lnd_cmn_tunables net_tunables; + + /* + * boolean to indicate that the tunables have been set and + * shouldn't be reset + */ + bool net_tunables_set; + + /* procedural interface */ + const struct lnet_lnd *net_lnd; + + /* list of NIs on this net */ + struct list_head net_ni_list; + + /* list of NIs being added, but not started yet */ + struct list_head net_ni_added; + + /* dying LND instances */ + struct list_head net_ni_zombie; + + /* when I was last alive */ + time64_t net_last_alive; + + /* protects access to net_last_alive */ + spinlock_t net_lock; + + /* list of router nids preferred for this network */ + struct list_head net_rtr_pref_nids; +}; + +struct lnet_ni { + /* chain on the lnet_net structure */ + struct list_head ni_netlist; + + /* chain on the recovery queue */ + struct list_head ni_recovery; + + /* MD handle for recovery ping */ + struct lnet_handle_md ni_ping_mdh; + + spinlock_t ni_lock; + + /* number of CPTs */ + int ni_ncpts; + + /* bond NI on some CPTs */ + __u32 *ni_cpts; + + /* interface's NID */ + struct lnet_nid ni_nid; + + /* instance-specific data */ + void *ni_data; + + /* per ni credits */ + atomic_t ni_tx_credits; + + /* percpt TX queues */ + struct lnet_tx_queue **ni_tx_queues; + + /* percpt reference count */ + int **ni_refs; + + /* pointer to parent network */ + struct lnet_net *ni_net; + + /* my health status */ + struct lnet_ni_status *ni_status; + + /* NI FSM. Protected by lnet_ni_lock() */ + enum lnet_ni_state ni_state; + + /* Recovery state. Protected by lnet_ni_lock() */ + __u32 ni_recovery_state; + + /* When to send the next recovery ping */ + time64_t ni_next_ping; + /* How many pings sent during current recovery period did not receive + * a reply. NB: reset whenever _any_ message arrives on this NI + */ + unsigned int ni_ping_count; + + /* per NI LND tunables */ + struct lnet_lnd_tunables ni_lnd_tunables; + + /* lnd tunables set explicitly */ + bool ni_lnd_tunables_set; + + /* NI statistics */ + struct lnet_element_stats ni_stats; + struct lnet_health_local_stats ni_hstats; + + /* physical device CPT */ + int ni_dev_cpt; + + /* sequence number used to round robin over nis within a net */ + __u32 ni_seq; + + /* + * health value + * initialized to LNET_MAX_HEALTH_VALUE + * Value is decremented every time we fail to send a message over + * this NI because of a NI specific failure. + * Value is incremented if we successfully send a message. + */ + atomic_t ni_healthv; + + /* + * Set to 1 by the LND when it receives an event telling it the device + * has gone into a fatal state. Set to 0 when the LND receives an + * even telling it the device is back online. + */ + atomic_t ni_fatal_error_on; + + /* the relative selection priority of this NI */ + __u32 ni_sel_priority; + + /* + * equivalent interface to use + */ + char *ni_interface; + struct net *ni_net_ns; /* original net namespace */ +}; + +#define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL + +/* + * Descriptor of a ping info buffer: keep a separate indicator of the + * size and a reference count. 
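(The reference count is taken and dropped with lnet_ping_buffer_addref() and lnet_ping_buffer_decref() in lib-lnet.h.)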
The type is used both as a source and + * sink of data, so we need to keep some information outside of the + * area that may be overwritten by network data. + */ +struct lnet_ping_buffer { + int pb_nnis; + atomic_t pb_refcnt; + bool pb_needs_post; + struct lnet_ping_info pb_info; +}; + +#define LNET_PING_BUFFER_SIZE(NNIDS) \ + offsetof(struct lnet_ping_buffer, pb_info.pi_ni[NNIDS]) +#define LNET_PING_BUFFER_LONI(PBUF) ((PBUF)->pb_info.pi_ni[0].ns_nid) +#define LNET_PING_BUFFER_SEQNO(PBUF) ((PBUF)->pb_info.pi_ni[0].ns_status) + +#define LNET_PING_INFO_TO_BUFFER(PINFO) \ + container_of((PINFO), struct lnet_ping_buffer, pb_info) + +struct lnet_nid_list { + struct list_head nl_list; + struct lnet_nid nl_nid; +}; + +struct lnet_peer_ni { + /* chain on lpn_peer_nis */ + struct list_head lpni_peer_nis; + /* chain on remote peer list */ + struct list_head lpni_on_remote_peer_ni_list; + /* chain on recovery queue */ + struct list_head lpni_recovery; + /* chain on peer hash */ + struct list_head lpni_hashlist; + /* messages blocking for tx credits */ + struct list_head lpni_txq; + /* pointer to peer net I'm part of */ + struct lnet_peer_net *lpni_peer_net; + /* statistics kept on each peer NI */ + struct lnet_element_stats lpni_stats; + struct lnet_health_remote_stats lpni_hstats; + /* spin lock protecting credits and lpni_txq */ + spinlock_t lpni_lock; + /* # tx credits available */ + int lpni_txcredits; + /* low water mark */ + int lpni_mintxcredits; + /* + * Each peer_ni in a gateway maintains its own credits. This + * allows more traffic to gateways that have multiple interfaces. + */ + /* # router credits */ + int lpni_rtrcredits; + /* low water mark */ + int lpni_minrtrcredits; + /* bytes queued for sending */ + long lpni_txqnob; + /* network peer is on */ + struct lnet_net *lpni_net; + /* peer's NID */ + struct lnet_nid lpni_nid; + /* # refs */ + struct kref lpni_kref; + /* health value for the peer */ + atomic_t lpni_healthv; + /* recovery ping mdh */ + struct lnet_handle_md lpni_recovery_ping_mdh; + /* When to send the next recovery ping */ + time64_t lpni_next_ping; + /* How many pings sent during current recovery period did not receive + * a reply. NB: reset whenever _any_ message arrives from this peer NI + */ + unsigned int lpni_ping_count; + /* CPT this peer attached on */ + int lpni_cpt; + /* state flags -- protected by lpni_lock */ + unsigned lpni_state; + /* status of the peer NI as reported by the peer */ + __u32 lpni_ns_status; + /* sequence number used to round robin over peer nis within a net */ + __u32 lpni_seq; + /* sequence number used to round robin over gateways */ + __u32 lpni_gw_seq; + /* returned RC ping features. Protected with lpni_lock */ + unsigned int lpni_ping_feats; + /* time last message was received from the peer */ + time64_t lpni_last_alive; + /* preferred local nids: if only one, use lpni_pref.nid */ + union lpni_pref { + struct lnet_nid nid; + struct list_head nids; + } lpni_pref; + /* list of router nids preferred for this peer NI */ + struct list_head lpni_rtr_pref_nids; + /* The relative selection priority of this peer NI */ + __u32 lpni_sel_priority; + /* number of preferred NIDs in lnpi_pref_nids */ + __u32 lpni_pref_nnids; +}; + +/* Preferred path added due to traffic on non-MR peer_ni */ +#define LNET_PEER_NI_NON_MR_PREF BIT(0) +/* peer is being recovered. 
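(chained on a recovery queue via lpni_recovery)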
*/ +#define LNET_PEER_NI_RECOVERY_PENDING BIT(1) +/* recovery ping failed */ +#define LNET_PEER_NI_RECOVERY_FAILED BIT(2) +/* peer is being deleted */ +#define LNET_PEER_NI_DELETING BIT(3) + +struct lnet_peer { + /* chain on pt_peer_list */ + struct list_head lp_peer_list; + + /* list of peer nets */ + struct list_head lp_peer_nets; + + /* list of messages pending discovery*/ + struct list_head lp_dc_pendq; + + /* chain on router list */ + struct list_head lp_rtr_list; + + /* primary NID of the peer */ + struct lnet_nid lp_primary_nid; + + /* source NID to use during discovery */ + struct lnet_nid lp_disc_src_nid; + /* destination NID to use during discovery */ + struct lnet_nid lp_disc_dst_nid; + + /* net to perform discovery on */ + __u32 lp_disc_net_id; + + /* CPT of peer_table */ + int lp_cpt; + + /* number of NIDs on this peer */ + int lp_nnis; + + /* # refs from lnet_route::lr_gateway */ + int lp_rtr_refcount; + + /* + * peer specific health sensitivity value to decrement peer nis in + * this peer with if set to something other than 0 + */ + __u32 lp_health_sensitivity; + + /* messages blocking for router credits */ + struct list_head lp_rtrq; + + /* routes on this peer */ + struct list_head lp_routes; + + /* reference count */ + atomic_t lp_refcount; + + /* lock protecting peer state flags and lpni_rtrq */ + spinlock_t lp_lock; + + /* peer state flags */ + unsigned lp_state; + + /* buffer for data pushed by peer */ + struct lnet_ping_buffer *lp_data; + + /* MD handle for ping in progress */ + struct lnet_handle_md lp_ping_mdh; + + /* MD handle for push in progress */ + struct lnet_handle_md lp_push_mdh; + + /* number of NIDs for sizing push data */ + int lp_data_nnis; + + /* NI config sequence number of peer */ + __u32 lp_peer_seqno; + + /* Local NI config sequence number acked by peer */ + __u32 lp_node_seqno; + + /* Local NI config sequence number sent to peer */ + __u32 lp_node_seqno_sent; + + /* Ping error encountered during discovery. */ + int lp_ping_error; + + /* Push error encountered during discovery. */ + int lp_push_error; + + /* Error encountered during discovery. */ + int lp_dc_error; + + /* time it was put on the ln_dc_working queue */ + time64_t lp_last_queued; + + /* link on discovery-related lists */ + struct list_head lp_dc_list; + + /* tasks waiting on discovery of this peer */ + wait_queue_head_t lp_dc_waitq; + + /* cached peer aliveness */ + bool lp_alive; +}; + +/* + * The status flags in lp_state. Their semantics have chosen so that + * lp_state can be zero-initialized. + * + * A peer is marked MULTI_RAIL in two cases: it was configured using DLC + * as multi-rail aware, or the LNET_PING_FEAT_MULTI_RAIL bit was set. + * + * A peer is marked NO_DISCOVERY if the LNET_PING_FEAT_DISCOVERY bit was + * NOT set when the peer was pinged by discovery. + * + * A peer is marked ROUTER if it indicates so in the feature bit. + */ +#define LNET_PEER_MULTI_RAIL BIT(0) /* Multi-rail aware */ +#define LNET_PEER_NO_DISCOVERY BIT(1) /* Peer disabled discovery */ +#define LNET_PEER_ROUTER_ENABLED BIT(2) /* router feature enabled */ + +/* + * A peer is marked CONFIGURED if it was configured by DLC. + * + * In addition, a peer is marked DISCOVERED if it has fully passed + * through Peer Discovery. + * + * When Peer Discovery is disabled, the discovery thread will mark + * peers REDISCOVER to indicate that they should be re-examined if + * discovery is (re)enabled on the node. + * + * A peer that was created as the result of inbound traffic will not + * be marked at all. 
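+ * + * All of these flags live in lp_state, which is protected by lp_lock.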
+ */ +#define LNET_PEER_CONFIGURED BIT(3) /* Configured via DLC */ +#define LNET_PEER_DISCOVERED BIT(4) /* Peer was discovered */ +#define LNET_PEER_REDISCOVER BIT(5) /* Discovery was disabled */ +/* + * A peer is marked DISCOVERING when discovery is in progress. + * The other flags below correspond to stages of discovery. + */ +#define LNET_PEER_DISCOVERING BIT(6) /* Discovering */ +#define LNET_PEER_DATA_PRESENT BIT(7) /* Remote peer data present */ +#define LNET_PEER_NIDS_UPTODATE BIT(8) /* Remote peer info uptodate */ +#define LNET_PEER_PING_SENT BIT(9) /* Waiting for REPLY to Ping */ +#define LNET_PEER_PUSH_SENT BIT(10) /* Waiting for ACK of Push */ +#define LNET_PEER_PING_FAILED BIT(11) /* Ping send failure */ +#define LNET_PEER_PUSH_FAILED BIT(12) /* Push send failure */ +/* + * A ping can be forced as a way to fix up state, or as a manual + * intervention by an admin. + * A push can be forced in circumstances that would normally not + * allow for one to happen. + */ +#define LNET_PEER_FORCE_PING BIT(13) /* Forced Ping */ +#define LNET_PEER_FORCE_PUSH BIT(14) /* Forced Push */ + +/* force delete even if router */ +#define LNET_PEER_RTR_NI_FORCE_DEL BIT(15) + +/* gw undergoing alive discovery */ +#define LNET_PEER_RTR_DISCOVERY BIT(16) +/* gw has undergone discovery (does not indicate success or failure) */ +#define LNET_PEER_RTR_DISCOVERED BIT(17) + +/* peer is marked for deletion */ +#define LNET_PEER_MARK_DELETION BIT(18) +/* lnet_peer_del()/lnet_peer_del_locked() has been called on the peer */ +#define LNET_PEER_MARK_DELETED BIT(19) +/* lock primary NID to what's requested by ULP */ +#define LNET_PEER_LOCK_PRIMARY BIT(20) +/* this is for informational purposes only. It is set if a peer gets + * configured from Lustre with a primary NID which belongs to another peer + * which is also configured by Lustre as the primary NID. + */ +#define LNET_PEER_BAD_CONFIG BIT(21) + +struct lnet_peer_net { + /* chain on lp_peer_nets */ + struct list_head lpn_peer_nets; + + /* list of peer_nis on this network */ + struct list_head lpn_peer_nis; + + /* pointer to the peer I'm part of */ + struct lnet_peer *lpn_peer; + + /* Net ID */ + __u32 lpn_net_id; + + /* peer net health */ + int lpn_healthv; + + /* time of next router ping on this net */ + time64_t lpn_next_ping; + + /* selection sequence number */ + __u32 lpn_seq; + + /* relative peer net selection priority */ + __u32 lpn_sel_priority; + + /* reference count */ + atomic_t lpn_refcount; +}; + +/* peer hash size */ +#define LNET_PEER_HASH_BITS 9 +#define LNET_PEER_HASH_SIZE (1 << LNET_PEER_HASH_BITS) + +/* + * peer hash table - one per CPT + * + * protected by lnet_net_lock/EX for update + * pt_version + * pt_hash[...] 
+ * pt_peer_list + * pt_peers + * protected by pt_zombie_lock: + * pt_zombie_list + * pt_zombies + * + * pt_zombie lock nests inside lnet_net_lock + */ +struct lnet_peer_table { + int pt_version; /* /proc validity stamp */ + struct list_head *pt_hash; /* NID->peer hash */ + struct list_head pt_peer_list; /* peers */ + int pt_peers; /* # peers */ + struct list_head pt_zombie_list; /* zombie peer_ni */ + int pt_zombies; /* # zombie peers_ni */ + spinlock_t pt_zombie_lock; /* protect list and count */ +}; + +/* peer aliveness is enabled only on routers for peers in a network where the + * struct lnet_ni::ni_peertimeout has been set to a positive value + */ +#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \ + ((lp)->lpni_net) && \ + (lp)->lpni_net->net_tunables.lct_peer_timeout > 0) + +struct lnet_route { + struct list_head lr_list; /* chain on net */ + struct list_head lr_gwlist; /* chain on gateway */ + struct lnet_peer *lr_gateway; /* router node */ + struct lnet_nid lr_nid; /* NID used to add route */ + __u32 lr_net; /* remote network number */ + __u32 lr_lnet; /* local network number */ + int lr_seq; /* sequence for round-robin */ + __u32 lr_hops; /* how far I am */ + unsigned int lr_priority; /* route priority */ + atomic_t lr_alive; /* cached route aliveness */ + bool lr_single_hop; /* this route is single-hop */ +}; + +#define LNET_REMOTE_NETS_HASH_DEFAULT (1U << 7) +#define LNET_REMOTE_NETS_HASH_MAX (1U << 16) +#define LNET_REMOTE_NETS_HASH_SIZE (1 << the_lnet.ln_remote_nets_hbits) + +struct lnet_remotenet { + /* chain on ln_remote_nets_hash */ + struct list_head lrn_list; + /* routes to me */ + struct list_head lrn_routes; + /* my net number */ + __u32 lrn_net; +}; + +/** lnet message has credit and can be submitted to lnd for send/receive */ +#define LNET_CREDIT_OK 0 +/** lnet message is waiting for credit */ +#define LNET_CREDIT_WAIT 1 +/** lnet message is waiting for discovery */ +#define LNET_DC_WAIT 2 + +struct lnet_rtrbufpool { + /* my free buffer pool */ + struct list_head rbp_bufs; + /* messages blocking for a buffer */ + struct list_head rbp_msgs; + /* # pages in each buffer */ + int rbp_npages; + /* requested number of buffers */ + int rbp_req_nbuffers; + /* # buffers actually allocated */ + int rbp_nbuffers; + /* # free buffers / blocked messages */ + int rbp_credits; + /* low water mark */ + int rbp_mincredits; +}; + +struct lnet_rtrbuf { + struct list_head rb_list; /* chain on rbp_bufs */ + struct lnet_rtrbufpool *rb_pool; /* owning pool */ + struct bio_vec rb_kiov[0]; /* the buffer space */ +}; + +#define LNET_PEER_HASHSIZE 503 /* prime! 
*/ + +enum lnet_match_flags { + /* Didn't match anything */ + LNET_MATCHMD_NONE = BIT(0), + /* Matched OK */ + LNET_MATCHMD_OK = BIT(1), + /* Must be discarded */ + LNET_MATCHMD_DROP = BIT(2), + /* match and buffer is exhausted */ + LNET_MATCHMD_EXHAUSTED = BIT(3), + /* match or drop */ + LNET_MATCHMD_FINISH = (LNET_MATCHMD_OK | LNET_MATCHMD_DROP), +}; + +/* Options for struct lnet_portal::ptl_options */ +#define LNET_PTL_LAZY BIT(0) +#define LNET_PTL_MATCH_UNIQUE BIT(1) /* unique match, for RDMA */ +#define LNET_PTL_MATCH_WILDCARD BIT(2) /* wildcard match, request portal */ + +/* parameter for matching operations (GET, PUT) */ +struct lnet_match_info { + __u64 mi_mbits; + struct lnet_processid mi_id; + unsigned int mi_cpt; + unsigned int mi_opc; + unsigned int mi_portal; + unsigned int mi_rlength; + unsigned int mi_roffset; +}; + +/* ME hash of RDMA portal */ +#define LNET_MT_HASH_BITS 8 +#define LNET_MT_HASH_SIZE (1 << LNET_MT_HASH_BITS) +#define LNET_MT_HASH_MASK (LNET_MT_HASH_SIZE - 1) +/* we allocate (LNET_MT_HASH_SIZE + 1) entries for lnet_match_table::mt_hash, + * the last entry is reserved for MEs with ignore-bits */ +#define LNET_MT_HASH_IGNORE LNET_MT_HASH_SIZE +/* __u64 has 2^6 bits, so need 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64) which + * is 4 __u64s as bit-map, and add an extra __u64 (only use one bit) for the + * ME-list with ignore-bits, which is mtable::mt_hash[LNET_MT_HASH_IGNORE] */ +#define LNET_MT_BITS_U64 6 /* 2^6 bits */ +#define LNET_MT_EXHAUSTED_BITS (LNET_MT_HASH_BITS - LNET_MT_BITS_U64) +#define LNET_MT_EXHAUSTED_BMAP ((1 << LNET_MT_EXHAUSTED_BITS) + 1) + +/* portal match table */ +struct lnet_match_table { + /* reserved for upcoming patches, CPU partition ID */ + unsigned int mt_cpt; + unsigned int mt_portal; /* portal index */ + /* match table is set as "enabled" if there's non-exhausted MD + * attached on mt_mhash, it's only valid for wildcard portal */ + unsigned int mt_enabled; + /* bitmap to flag whether MEs on mt_hash are exhausted or not */ + __u64 mt_exhausted[LNET_MT_EXHAUSTED_BMAP]; + struct list_head *mt_mhash; /* matching hash */ +}; + +/* these are only useful for wildcard portal */ +/* Turn off message rotor for wildcard portals */ +#define LNET_PTL_ROTOR_OFF 0 +/* round-robin dispatch all PUT messages for wildcard portals */ +#define LNET_PTL_ROTOR_ON 1 +/* round-robin dispatch routed PUT message for wildcard portals */ +#define LNET_PTL_ROTOR_RR_RT 2 +/* dispatch routed PUT message by hashing source NID for wildcard portals */ +#define LNET_PTL_ROTOR_HASH_RT 3 + +struct lnet_portal { + spinlock_t ptl_lock; + unsigned int ptl_index; /* portal ID, reserved */ + /* flags on this portal: lazy, unique... 
*/ + unsigned int ptl_options; + /* list of messages which are stealing buffer */ + struct list_head ptl_msg_stealing; + /* messages blocking for MD */ + struct list_head ptl_msg_delayed; + /* Match table for each CPT */ + struct lnet_match_table **ptl_mtables; + /* spread rotor of incoming "PUT" */ + unsigned int ptl_rotor; + /* # active entries for this portal */ + int ptl_mt_nmaps; + /* array of active entries' cpu-partition-id */ + int ptl_mt_maps[0]; +}; + +#define LNET_LH_HASH_BITS 12 +#define LNET_LH_HASH_SIZE (1ULL << LNET_LH_HASH_BITS) +#define LNET_LH_HASH_MASK (LNET_LH_HASH_SIZE - 1) + +/* resource container (ME, MD, EQ) */ +struct lnet_res_container { + unsigned int rec_type; /* container type */ + __u64 rec_lh_cookie; /* cookie generator */ + struct list_head rec_active; /* active resource list */ + struct list_head *rec_lh_hash; /* handle hash */ +}; + +/* message container */ +struct lnet_msg_container { + int msc_init; /* initialized or not */ + /* max # threads finalizing */ + int msc_nfinalizers; + /* msgs waiting to complete finalizing */ + struct list_head msc_finalizing; + /* msgs waiting to be resent */ + struct list_head msc_resending; + struct list_head msc_active; /* active message list */ + /* threads doing finalization */ + void **msc_finalizers; + /* threads doing resends */ + void **msc_resenders; +}; + +/* This UDSP structures need to match the user space liblnetconfig structures + * in order for the marshall and unmarshall functions to be common. + */ + +/* Net is described as a + * 1. net type + * 2. num range + */ +struct lnet_ud_net_descr { + __u32 udn_net_type; + struct list_head udn_net_num_range; +}; + +/* each NID range is defined as + * 1. net descriptor + * 2. address range descriptor + */ +struct lnet_ud_nid_descr { + struct lnet_ud_net_descr ud_net_id; + struct list_head ud_addr_range; + __u32 ud_mem_size; +}; + +/* a UDSP rule can have up to three user defined NID descriptors + * - src: defines the local NID range for the rule + * - dst: defines the peer NID range for the rule + * - rte: defines the router NID range for the rule + * + * An action union defines the action to take when the rule + * is matched + */ +struct lnet_udsp { + struct list_head udsp_on_list; + __u32 udsp_idx; + struct lnet_ud_nid_descr udsp_src; + struct lnet_ud_nid_descr udsp_dst; + struct lnet_ud_nid_descr udsp_rte; + enum lnet_udsp_action_type udsp_action_type; + union { + __u32 udsp_priority; + } udsp_action; +}; + +/* Peer Discovery states */ +#define LNET_DC_STATE_SHUTDOWN 0 /* not started */ +#define LNET_DC_STATE_RUNNING 1 /* started up OK */ +#define LNET_DC_STATE_STOPPING 2 /* telling thread to stop */ + +/* Router Checker states */ +#define LNET_MT_STATE_SHUTDOWN 0 /* not started */ +#define LNET_MT_STATE_RUNNING 1 /* started up OK */ +#define LNET_MT_STATE_STOPPING 2 /* telling thread to stop */ + +/* LNet states */ +#define LNET_STATE_SHUTDOWN 0 /* not started */ +#define LNET_STATE_RUNNING 1 /* started up OK */ +#define LNET_STATE_STOPPING 2 /* telling thread to stop */ + +struct lnet { + /* CPU partition table of LNet */ + struct cfs_cpt_table *ln_cpt_table; + /* number of CPTs in ln_cpt_table */ + unsigned int ln_cpt_number; + unsigned int ln_cpt_bits; + + /* protect LNet resources (ME/MD/EQ) */ + struct cfs_percpt_lock *ln_res_lock; + /* # portals */ + int ln_nportals; + /* the vector of portals */ + struct lnet_portal **ln_portals; + /* percpt MD container */ + struct lnet_res_container **ln_md_containers; + + /* Event Queue container */ + struct 
lnet_res_container ln_eq_container; + spinlock_t ln_eq_wait_lock; + + unsigned int ln_remote_nets_hbits; + + /* protect NI, peer table, credits, routers, rtrbuf... */ + struct cfs_percpt_lock *ln_net_lock; + /* percpt message containers for active/finalizing/freed message */ + struct lnet_msg_container **ln_msg_containers; + struct lnet_counters **ln_counters; + struct lnet_peer_table **ln_peer_tables; + /* list of peer nis not on a local network */ + struct list_head ln_remote_peer_ni_list; + /* failure simulation */ + struct list_head ln_test_peers; + struct list_head ln_drop_rules; + struct list_head ln_delay_rules; + /* LND instances */ + struct list_head ln_nets; + /* the loopback NI */ + struct lnet_ni *ln_loni; + /* network zombie list */ + struct list_head ln_net_zombie; + /* resend messages list */ + struct list_head ln_msg_resend; + /* spin lock to protect the msg resend list */ + spinlock_t ln_msg_resend_lock; + + /* remote networks with routes to them */ + struct list_head *ln_remote_nets_hash; + /* validity stamp */ + __u64 ln_remote_nets_version; + /* list of all known routers */ + struct list_head ln_routers; + /* validity stamp */ + __u64 ln_routers_version; + /* percpt router buffer pools */ + struct lnet_rtrbufpool **ln_rtrpools; + + /* + * Ping target / Push source + * + * The ping target and push source share a single buffer. The + * ln_ping_target is protected against concurrent updates by + * ln_api_mutex. + */ + struct lnet_handle_md ln_ping_target_md; + lnet_handler_t ln_ping_target_handler; + struct lnet_ping_buffer *ln_ping_target; + atomic_t ln_ping_target_seqno; + + /* + * Push Target + * + * ln_push_nnis contains the desired size of the push target. + * The lnet_net_lock is used to handle update races. The old + * buffer may linger a while after it has been unlinked, in + * which case the event handler cleans up. + */ + lnet_handler_t ln_push_target_handler; + struct lnet_handle_md ln_push_target_md; + struct lnet_ping_buffer *ln_push_target; + int ln_push_target_nnis; + + /* discovery event queue handle */ + lnet_handler_t ln_dc_handler; + /* discovery requests */ + struct list_head ln_dc_request; + /* discovery working list */ + struct list_head ln_dc_working; + /* discovery expired list */ + struct list_head ln_dc_expired; + /* discovery thread wait queue */ + wait_queue_head_t ln_dc_waitq; + /* discovery startup/shutdown state */ + int ln_dc_state; + + /* monitor thread startup/shutdown state */ + int ln_mt_state; + /* serialise startup/shutdown */ + struct semaphore ln_mt_signal; + + struct mutex ln_api_mutex; + struct mutex ln_lnd_mutex; + /* Have I called LNetNIInit myself? */ + int ln_niinit_self; + /* LNetNIInit/LNetNIFini counter */ + int ln_refcount; + /* SHUTDOWN/RUNNING/STOPPING */ + int ln_state; + + int ln_routing; /* am I a router? */ + lnet_pid_t ln_pid; /* requested pid */ + /* uniquely identifies this ni in this epoch */ + __u64 ln_interface_cookie; + /* registered LNDs */ + const struct lnet_lnd *ln_lnds[NUM_LNDS]; + + /* test protocol compatibility flags */ + unsigned long ln_testprotocompat; + + /* 0 - load the NIs from the mod params + * 1 - do not load the NIs from the mod params + * Reverse logic to ensure that other calls to LNetNIInit + * need no change + */ + bool ln_nis_from_mod_params; + + /* + * completion for the monitor thread. The monitor thread takes care of + * checking routes, timedout messages and resending messages. 
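+ *
+ * A plausible usage pattern (sketch only; the call sites are not in
+ * this header): the monitor thread sleeps between passes with
+ * wait_for_completion_timeout(&the_lnet.ln_mt_wait_complete, ...)
+ * and is woken early by a complete() call when new work, such as a
+ * message queued for resend, arrives.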
+ */ + struct completion ln_mt_wait_complete; + + /* per-cpt resend queues */ + struct list_head **ln_mt_resendqs; + /* local NIs to recover */ + struct list_head ln_mt_localNIRecovq; + /* local NIs to recover */ + struct list_head ln_mt_peerNIRecovq; + /* + * An array of queues for GET/PUT waiting for REPLY/ACK respectively. + * There are CPT number of queues. Since response trackers will be + * added on the fast path we can't afford to grab the exclusive + * net lock to protect these queues. The CPT will be calculated + * based on the mdh cookie. + */ + struct list_head **ln_mt_rstq; + /* + * A response tracker becomes a zombie when the associated MD is queued + * for unlink before the response tracker is detached from the MD. An + * entry on a zombie list can be freed when either the remaining + * operations on the MD complete or when LNet has shut down. + */ + struct list_head **ln_mt_zombie_rstqs; + /* recovery handler */ + lnet_handler_t ln_mt_handler; + + /* + * Completed when the discovery and monitor threads can enter their + * work loops + */ + struct completion ln_started; + /* UDSP list */ + struct list_head ln_udsp_list; +}; + +struct genl_filter_list { + struct list_head lp_list; + void *lp_cursor; + bool lp_first; +}; + +static const struct nla_policy scalar_attr_policy[LN_SCALAR_MAX + 1] = { + [LN_SCALAR_ATTR_LIST] = { .type = NLA_NESTED }, + [LN_SCALAR_ATTR_LIST_SIZE] = { .type = NLA_U16 }, + [LN_SCALAR_ATTR_INDEX] = { .type = NLA_U16 }, + [LN_SCALAR_ATTR_NLA_TYPE] = { .type = NLA_U16 }, + [LN_SCALAR_ATTR_VALUE] = { .type = NLA_STRING }, + [LN_SCALAR_ATTR_KEY_FORMAT] = { .type = NLA_U16 }, +}; + +int lnet_genl_send_scalar_list(struct sk_buff *msg, u32 portid, u32 seq, + const struct genl_family *family, int flags, + u8 cmd, const struct ln_key_list *data[]); + +/* Special workaround for pre-4.19 kernels to send error messages + * from dumpit routines. Newer kernels will send message with + * NL_SET_ERR_MSG information by default if NETLINK_EXT_ACK is set. 
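+ *
+ * Illustrative use (sketch; "rc" is a placeholder): a dumpit handler
+ * would typically end with
+ *
+ *	return lnet_nl_send_error(msg, portid, seq, rc);
+ *
+ * which emits an explicit NLMSG_ERROR record on older kernels and
+ * simply returns rc where NETLINK_EXT_ACK handles the reporting.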
+ */ +static inline int lnet_nl_send_error(struct sk_buff *msg, int portid, int seq, + int error) +{ +#ifndef HAVE_NL_DUMP_WITH_EXT_ACK + struct nlmsghdr *nlh; + + if (!error) + return 0; + + nlh = nlmsg_put(msg, portid, seq, NLMSG_ERROR, sizeof(error), 0); + if (!nlh) + return -ENOMEM; +#ifdef HAVE_NL_PARSE_WITH_EXT_ACK + netlink_ack(msg, nlh, error, NULL); +#else + netlink_ack(msg, nlh, error); +#endif + return nlmsg_len(nlh); +#else + return error; +#endif +} + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lnet_rdma.h b/drivers/staging/lustrefsx/lnet/include/lnet/lnet_rdma.h new file mode 100644 index 0000000000000..6aa5367af007c --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lnet_rdma.h @@ -0,0 +1,89 @@ +#ifndef LUSTRE_NVFS_H +#define LUSTRE_NVFS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#define REGSTR2(x) x##_register_nvfs_dma_ops +#define REGSTR(x) REGSTR2(x) + +#define UNREGSTR2(x) x##_unregister_nvfs_dma_ops +#define UNREGSTR(x) UNREGSTR2(x) + +#define MODULE_PREFIX lustre_v1 + +#define REGISTER_FUNC REGSTR(MODULE_PREFIX) +#define UNREGISTER_FUNC UNREGSTR(MODULE_PREFIX) + +#define NVFS_IO_ERR -1 +#define NVFS_CPU_REQ -2 + +#define NVFS_HOLD_TIME_MS 1000 + +struct nvfs_dma_rw_ops { + unsigned long long ft_bmap; /* feature bitmap */ + + int (*nvfs_blk_rq_map_sg) (struct request_queue *q, + struct request *req, + struct scatterlist *sglist); + + int (*nvfs_dma_map_sg_attrs) (struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir, + unsigned long attrs); + + int (*nvfs_dma_unmap_sg) (struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir); + bool (*nvfs_is_gpu_page) (struct page *); + unsigned int (*nvfs_gpu_index) (struct page *page); + unsigned int (*nvfs_device_priority) (struct device *dev, unsigned int dev_index); +}; + +/* feature list for dma_ops, values indicate bit pos */ +enum ft_bits { + nvfs_ft_prep_sglist = 1ULL << 0, + nvfs_ft_map_sglist = 1ULL << 1, + nvfs_ft_is_gpu_page = 1ULL << 2, + nvfs_ft_device_priority = 1ULL << 3, +}; + +/* check features for use in registration with vendor drivers */ +#define NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) \ + ((ops)->ft_bmap & nvfs_ft_prep_sglist) +#define NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops) \ + ((ops)->ft_bmap & nvfs_ft_map_sglist) +#define NVIDIA_FS_CHECK_FT_GPU_PAGE(ops) \ + ((ops)->ft_bmap & nvfs_ft_is_gpu_page) +#define NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops) \ + ((ops)->ft_bmap & nvfs_ft_device_priority) + +int REGISTER_FUNC (struct nvfs_dma_rw_ops *ops); + +void UNREGISTER_FUNC (void); + +unsigned int lnet_get_dev_prio(struct device *dev, + unsigned int dev_idx); +int lnet_rdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction direction); +int lnet_rdma_unmap_sg(struct device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction); +bool lnet_is_rdma_only_page(struct page *page); +unsigned int lnet_get_dev_idx(struct page *page); + +/* DMA_ATTR_NO_WARN was added to kernel v4.8-11962-ga9a62c9 */ +#ifndef DMA_ATTR_NO_WARN +#define DMA_ATTR_NO_WARN 0 +#endif + +#endif /* LUSTRE_NVFS_H */ + diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h b/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h new file mode 100644 index 0000000000000..ff1fe2381768d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h @@ -0,0 +1,99 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR 
REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/include/lnet/socklnd.h + */ +#ifndef __LNET_LNET_SOCKLND_H__ +#define __LNET_LNET_SOCKLND_H__ + +#include +#include + +struct ksock_hello_msg { + __u32 kshm_magic; /* LNET_PROTO_MAGIC */ + __u32 kshm_version; /* KSOCK_PROTO_V* */ + struct lnet_nid kshm_src_nid; /* sender's nid */ + struct lnet_nid kshm_dst_nid; /* destination nid */ + lnet_pid_t kshm_src_pid; /* sender's pid */ + lnet_pid_t kshm_dst_pid; /* destination pid */ + __u64 kshm_src_incarnation; /* sender's incarnation */ + __u64 kshm_dst_incarnation; /* destination's incarnation */ + __u32 kshm_ctype; /* SOCKLND_CONN_* */ + __u32 kshm_nips; /* always sent as zero */ + __u32 kshm_ips[0]; /* deprecated */ +} __packed; + +struct ksock_hello_msg_nid4 { + __u32 kshm_magic; /* LNET_PROTO_MAGIC */ + __u32 kshm_version; /* KSOCK_PROTO_V* */ + lnet_nid_t kshm_src_nid; /* sender's nid */ + lnet_nid_t kshm_dst_nid; /* destination nid */ + lnet_pid_t kshm_src_pid; /* sender's pid */ + lnet_pid_t kshm_dst_pid; /* destination pid */ + __u64 kshm_src_incarnation; /* sender's incarnation */ + __u64 kshm_dst_incarnation; /* destination's incarnation */ + __u32 kshm_ctype; /* SOCKLND_CONN_* */ + __u32 kshm_nips; /* sent as zero */ + __u32 kshm_ips[0]; /* deprecated */ +} __packed; + +struct ksock_msg_hdr { + __u32 ksh_type; /* type of socklnd message */ + __u32 ksh_csum; /* checksum if != 0 */ + __u64 ksh_zc_cookies[2]; /* Zero-Copy request/ACK + * cookie + */ +} __packed; + +#define KSOCK_MSG_NOOP 0xc0 /* empty */ +#define KSOCK_MSG_LNET 0xc1 /* lnet msg */ + +struct ksock_msg { + struct ksock_msg_hdr ksm_kh; + union { + /* case ksm_kh.ksh_type == KSOCK_MSG_NOOP */ + /* - nothing */ + /* case ksm_kh.ksh_type == KSOCK_MSG_LNET */ + struct lnet_hdr_nid4 lnetmsg_nid4; + /* case ksm_kh.ksh_type == KSOCK_MSG_LNET && + * kshm_version >= KSOCK_PROTO_V4 + */ + struct lnet_hdr_nid16 lnetmsg_nid16; + } __packed ksm_u; +} __packed; +#define ksm_type ksm_kh.ksh_type +#define ksm_csum ksm_kh.ksh_csum +#define ksm_zc_cookies ksm_kh.ksh_zc_cookies + +/* We need to know this number to parse hello msg from ksocklnd in + * other LND (usocklnd, for example) */ +#define KSOCK_PROTO_V2 2 +#define KSOCK_PROTO_V3 3 +#define KSOCK_PROTO_V4 4 + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/udsp.h b/drivers/staging/lustrefsx/lnet/include/lnet/udsp.h new file mode 100644 index 0000000000000..3ba5a30f6a374 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/udsp.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * Copyright (c) 2018-2020 Data Direct Networks. + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * Author: Amir Shehata + */ + +#ifndef UDSP_H +#define UDSP_H + +#include + +/** + * lnet_udsp_add_policy + * Add a policy \new in position \idx + * Must be called with api_mutex held + */ +int lnet_udsp_add_policy(struct lnet_udsp *new, int idx); + +/** + * lnet_udsp_get_policy + * get a policy in position \idx + * Must be called with api_mutex held + */ +struct lnet_udsp *lnet_udsp_get_policy(int idx); + +/** + * lnet_udsp_del_policy + * Delete a policy from position \idx + * Must be called with api_mutex held + */ +int lnet_udsp_del_policy(int idx); + +/** + * lnet_udsp_apply_policies + * apply all stored policies across the system + * Must be called with api_mutex held + * Must NOT be called with lnet_net_lock held + * udsp: NULL to apply on all existing udsps + * non-NULL to apply to specified udsp + * revert: true to revert policy application + */ +int lnet_udsp_apply_policies(struct lnet_udsp *udsp, bool revert); + +/** + * lnet_udsp_apply_policies_on_lpni + * apply all stored policies on specified \lpni + * Must be called with api_mutex held + * Must be called with LNET_LOCK_EX + */ +int lnet_udsp_apply_policies_on_lpni(struct lnet_peer_ni *lpni); + +/** + * lnet_udsp_apply_policies_on_lpn + * Must be called with api_mutex held + * apply all stored policies on specified \lpn + * Must be called with LNET_LOCK_EX + */ +int lnet_udsp_apply_policies_on_lpn(struct lnet_peer_net *lpn); + +/** + * lnet_udsp_apply_policies_on_ni + * apply all stored policies on specified \ni + * Must be called with api_mutex held + * Must be called with LNET_LOCK_EX + */ +int lnet_udsp_apply_policies_on_ni(struct lnet_ni *ni); + +/** + * lnet_udsp_apply_policies_on_net + * apply all stored policies on specified \net + * Must be called with api_mutex held + * Must be called with LNET_LOCK_EX + */ +int lnet_udsp_apply_policies_on_net(struct lnet_net *net); + +/** + * lnet_udsp_alloc + * Allocates a UDSP block and initializes it. + * Return NULL if allocation fails + * pointer to UDSP otherwise. + */ +struct lnet_udsp *lnet_udsp_alloc(void); + +/** + * lnet_udsp_free + * Free a UDSP and all its descriptors + */ +void lnet_udsp_free(struct lnet_udsp *udsp); + +/** + * lnet_udsp_destroy + * Free all the UDSPs + * force: true to indicate shutdown in progress + */ +void lnet_udsp_destroy(bool shutdown); + +/** + * lnet_get_udsp_size + * Return the size needed to store the marshalled UDSP + */ +size_t lnet_get_udsp_size(struct lnet_udsp *udsp); + +/** + * lnet_udsp_marshal + * Marshal the udsp into the bulk memory provided. + * Return success/failure. 
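+ *
+ * Illustrative calling sequence (a sketch; the iou_* fields come from
+ * struct lnet_ioctl_udsp in the DLC ioctl headers):
+ *
+ *	size_t len = lnet_get_udsp_size(udsp);
+ *	... ensure ioc_udsp->iou_bulk has room for len bytes ...
+ *	rc = lnet_udsp_marshal(udsp, ioc_udsp);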
+ */ +int lnet_udsp_marshal(struct lnet_udsp *udsp, + struct lnet_ioctl_udsp *ioc_udsp); +/** + * lnet_udsp_demarshal_add + * Given a bulk containing a single UDSP, + * demarshal and populate a udsp structure then add policy + */ +int lnet_udsp_demarshal_add(void *bulk, __u32 bulk_size); + +/** + * lnet_udsp_get_construct_info + * get information of how the UDSP policies impacted the given + * construct. + */ +void lnet_udsp_get_construct_info(struct lnet_ioctl_construct_udsp_info *info); + +#endif /* UDSP_H */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h new file mode 100644 index 0000000000000..86e46606b0e37 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h @@ -0,0 +1,157 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/libcfs_debug.h + * + * Debug messages and assertions + * + */ + +#ifndef __UAPI_LIBCFS_DEBUG_H__ +#define __UAPI_LIBCFS_DEBUG_H__ + +#include + +/** + * Format for debug message headers + */ +struct ptldebug_header { + __u32 ph_len; + __u32 ph_flags; + __u32 ph_subsys; + __u32 ph_mask; + __u16 ph_cpu_id; + __u16 ph_type; + /* time_t overflow in 2106 */ + __u32 ph_sec; + __u64 ph_usec; + __u32 ph_stack; + __u32 ph_pid; + __u32 ph_extern_pid; + __u32 ph_line_num; +} __attribute__((packed)); + +#define PH_FLAG_FIRST_RECORD 1 + +/* Debugging subsystems (32 bits, non-overlapping) */ +enum libcfs_debug_subsys { + S_UNDEFINED = 0x00000001, + S_MDC = 0x00000002, + S_MDS = 0x00000004, + S_OSC = 0x00000008, + S_OST = 0x00000010, + S_CLASS = 0x00000020, + S_LOG = 0x00000040, + S_LLITE = 0x00000080, + S_RPC = 0x00000100, + S_MGMT = 0x00000200, + S_LNET = 0x00000400, + S_LND = 0x00000800, /* ALL LNDs */ + S_PINGER = 0x00001000, + S_FILTER = 0x00002000, + S_LIBCFS = 0x00004000, + S_ECHO = 0x00008000, + S_LDLM = 0x00010000, + S_LOV = 0x00020000, + S_LQUOTA = 0x00040000, + S_OSD = 0x00080000, + S_LFSCK = 0x00100000, + S_SNAPSHOT = 0x00200000, +/* unused */ + S_LMV = 0x00800000, +/* unused */ + S_SEC = 0x02000000, /* upcall cache */ + S_GSS = 0x04000000, +/* unused */ + S_MGC = 0x10000000, + S_MGS = 0x20000000, + S_FID = 0x40000000, + S_FLD = 0x80000000, +}; +#define LIBCFS_S_DEFAULT (~0) + +#define LIBCFS_DEBUG_SUBSYS_NAMES { \ + "undefined", "mdc", "mds", "osc", "ost", "class", "log", \ + "llite", "rpc", "mgmt", "lnet", "lnd", "pinger", "filter", \ + "libcfs", "echo", "ldlm", "lov", "lquota", "osd", "lfsck", \ + "snapshot", "", "lmv", "", "sec", "gss", "", "mgc", "mgs", \ + "fid", "fld", NULL } + +/* Debugging masks (32 bits, non-overlapping) */ +enum libcfs_debug_masks { + D_TRACE = 0x00000001, /* ENTRY/EXIT markers */ + D_INODE = 0x00000002, + D_SUPER = 0x00000004, + D_IOTRACE = 0x00000008, /* simple, low overhead io tracing */ + D_MALLOC = 0x00000010, /* print malloc, free information */ + D_CACHE = 0x00000020, /* cache-related items */ + D_INFO = 0x00000040, /* general information */ + D_IOCTL = 0x00000080, /* ioctl related information */ + D_NETERROR = 0x00000100, /* network errors */ + D_NET = 0x00000200, /* network communications */ + D_WARNING = 0x00000400, /* CWARN(...) == CDEBUG(D_WARNING, ...) */ + D_BUFFS = 0x00000800, + D_OTHER = 0x00001000, + D_DENTRY = 0x00002000, + D_NETTRACE = 0x00004000, + D_PAGE = 0x00008000, /* bulk page handling */ + D_DLMTRACE = 0x00010000, + D_ERROR = 0x00020000, /* CERROR(...) == CDEBUG(D_ERROR, ...) */ + D_EMERG = 0x00040000, /* CEMERG(...) == CDEBUG(D_EMERG, ...) 
*/ + D_HA = 0x00080000, /* recovery and failover */ + D_RPCTRACE = 0x00100000, /* for distributed debugging */ + D_VFSTRACE = 0x00200000, + D_READA = 0x00400000, /* read-ahead */ + D_MMAP = 0x00800000, + D_CONFIG = 0x01000000, + D_CONSOLE = 0x02000000, + D_QUOTA = 0x04000000, + D_SEC = 0x08000000, + D_LFSCK = 0x10000000, /* For both OI scrub and LFSCK */ + D_HSM = 0x20000000, + D_SNAPSHOT = 0x40000000, + D_LAYOUT = 0x80000000, +}; +#define LIBCFS_D_DEFAULT (D_CANTMASK | D_NETERROR | D_HA | D_CONFIG | D_IOCTL |\ + D_LFSCK) + +#define LIBCFS_DEBUG_MASKS_NAMES { \ + "trace", "inode", "super", "iotrace", "malloc", "cache", "info",\ + "ioctl", "neterror", "net", "warning", "buffs", "other", \ + "dentry", "nettrace", "page", "dlmtrace", "error", "emerg", \ + "ha", "rpctrace", "vfstrace", "reada", "mmap", "config", \ + "console", "quota", "sec", "lfsck", "hsm", "snapshot", "layout",\ + NULL } + +#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) + +#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" + +#endif /* __UAPI_LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h new file mode 100644 index 0000000000000..1bcf47b29e0c9 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h @@ -0,0 +1,162 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Low-level ioctl data structures. Kernel ioctl functions declared here, + * and user space functions are in libcfs/util/ioctl.h. 
+ * + */ + +#ifndef __UAPI_LIBCFS_IOCTL_H__ +#define __UAPI_LIBCFS_IOCTL_H__ + +#include +#include + +/* + * sparse kernel source annotations + */ +#ifndef __user +#define __user +#endif + +#define LIBCFS_IOCTL_VERSION 0x0001000a +#define LIBCFS_IOCTL_VERSION2 0x0001000b + +struct libcfs_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +/** max size to copy from userspace */ +#define LIBCFS_IOC_DATA_MAX (128 * 1024) + +struct libcfs_ioctl_data { + struct libcfs_ioctl_hdr ioc_hdr; + + __u64 ioc_nid; + __u64 ioc_u64[1]; + + __u32 ioc_flags; + __u32 ioc_count; + __u32 ioc_net; + __u32 ioc_u32[7]; + + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + + __u32 ioc_plen1; /* buffers in userspace */ + void __user *ioc_pbuf1; + __u32 ioc_plen2; /* buffers in userspace */ + void __user *ioc_pbuf2; + + char ioc_bulk[0]; +}; + +#define IOCTL_LIBCFS_TYPE long + +#define IOC_LIBCFS_TYPE ('e') +#define IOC_LIBCFS_MIN_NR 30 +/* libcfs ioctls */ +/* IOC_LIBCFS_PANIC obsolete in 2.8.0, was _IOWR('e', 30, IOCTL_LIBCFS_TYPE) */ +#define IOC_LIBCFS_CLEAR_DEBUG _IOWR('e', 31, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_MARK_DEBUG _IOWR('e', 32, IOCTL_LIBCFS_TYPE) +/* IOC_LIBCFS_MEMHOG obsolete in 2.8.0, was _IOWR('e', 36, IOCTL_LIBCFS_TYPE) */ +/* lnet ioctls */ +#define IOC_LIBCFS_GET_NI _IOWR('e', 50, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_FAIL_NID _IOWR('e', 51, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_NOTIFY_ROUTER _IOWR('e', 55, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_UNCONFIGURE _IOWR('e', 56, IOCTL_LIBCFS_TYPE) +/* IOC_LIBCFS_PORTALS_COMPATIBILITY _IOWR('e', 57, IOCTL_LIBCFS_TYPE) */ +#define IOC_LIBCFS_LNET_DIST _IOWR('e', 58, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_CONFIGURE _IOWR('e', 59, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_TESTPROTOCOMPAT _IOWR('e', 60, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PING _IOWR('e', 61, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PING_PEER _IOWR('e', 62, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_LNETST _IOWR('e', 63, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_LNET_FAULT _IOWR('e', 64, IOCTL_LIBCFS_TYPE) +/* lnd ioctls */ +#define IOC_LIBCFS_REGISTER_MYNID _IOWR('e', 70, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_CLOSE_CONNECTION _IOWR('e', 71, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PUSH_CONNECTION _IOWR('e', 72, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_CONN _IOWR('e', 73, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DEL_PEER _IOWR('e', 74, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_ADD_PEER _IOWR('e', 75, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_PEER _IOWR('e', 76, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DISCOVER _IOWR('e', 77, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_ADD_INTERFACE _IOWR('e', 78, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DEL_INTERFACE _IOWR('e', 79, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_INTERFACE _IOWR('e', 80, IOCTL_LIBCFS_TYPE) + + +/* + * DLC Specific IOCTL numbers. + * In order to maintain backward compatibility with any possible external + * tools which might be accessing the IOCTL numbers, a new group of IOCTL + * number have been allocated. 
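+ *
+ * As an illustration of the encoding (derived from the definitions
+ * below, not a new ABI): IOC_LIBCFS_ADD_ROUTE expands to
+ * _IOWR(IOC_LIBCFS_TYPE, 81, IOCTL_CONFIG_SIZE), i.e.
+ * _IOWR('e', 81, struct lnet_ioctl_config_data), so the DLC group
+ * keeps the 'e' ioctl type while encoding the config structure size.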
+ */ +#define IOCTL_CONFIG_SIZE struct lnet_ioctl_config_data +#define IOC_LIBCFS_ADD_ROUTE _IOWR(IOC_LIBCFS_TYPE, 81, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_ROUTE _IOWR(IOC_LIBCFS_TYPE, 82, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_ROUTE _IOWR(IOC_LIBCFS_TYPE, 83, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_NET _IOWR(IOC_LIBCFS_TYPE, 84, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_NET _IOWR(IOC_LIBCFS_TYPE, 85, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_NET _IOWR(IOC_LIBCFS_TYPE, 86, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_CONFIG_RTR _IOWR(IOC_LIBCFS_TYPE, 87, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_BUF _IOWR(IOC_LIBCFS_TYPE, 88, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_BUF _IOWR(IOC_LIBCFS_TYPE, 89, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_PEER_INFO _IOWR(IOC_LIBCFS_TYPE, 90, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LNET_STATS _IOWR(IOC_LIBCFS_TYPE, 91, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_PEER_NI _IOWR(IOC_LIBCFS_TYPE, 92, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_PEER_NI _IOWR(IOC_LIBCFS_TYPE, 93, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_PEER_NI _IOWR(IOC_LIBCFS_TYPE, 94, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 95, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 96, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 97, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_SET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_PEER_LIST _IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS _IOWR(IOC_LIBCFS_TYPE, 101, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_SET_HEALHV _IOWR(IOC_LIBCFS_TYPE, 102, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LOCAL_HSTATS _IOWR(IOC_LIBCFS_TYPE, 103, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_RECOVERY_QUEUE _IOWR(IOC_LIBCFS_TYPE, 104, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_UDSP _IOWR(IOC_LIBCFS_TYPE, 105, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_UDSP _IOWR(IOC_LIBCFS_TYPE, 106, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_UDSP_SIZE _IOWR(IOC_LIBCFS_TYPE, 107, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_UDSP _IOWR(IOC_LIBCFS_TYPE, 108, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_CONST_UDSP_INFO _IOWR(IOC_LIBCFS_TYPE, 109, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_RESET_LNET_STATS _IOWR(IOC_LIBCFS_TYPE, 110, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_SET_CONNS_PER_PEER _IOWR(IOC_LIBCFS_TYPE, 111, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_MAX_NR 111 + +extern int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); + +#endif /* __UAPI_LIBCFS_IOCTL_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h new file mode 100644 index 0000000000000..2b2c05fa3b0b2 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h @@ -0,0 +1,399 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. + * + * LGPL HEADER END + * + */ +/* + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * Author: Amir Shehata + */ + +#ifndef __UAPI_LNET_DLC_H_ +#define __UAPI_LNET_DLC_H_ + +#include +#include +#include + +#define MAX_NUM_SHOW_ENTRIES 32 +#define LNET_MAX_STR_LEN 128 +#define LNET_MAX_SHOW_NUM_CPT 128 +#define LNET_MAX_SHOW_NUM_NID 128 +#define LNET_UNDEFINED_HOPS ((__u32) -1) + +#define LNET_RT_ALIVE (1 << 0) +#define LNET_RT_MULTI_HOP (1 << 1) + +/* + * sparse kernel source annotations + */ +#ifndef __user +#define __user +#endif + +/* + * To allow for future enhancements to extend the tunables + * add a hdr to this structure, so that the version can be set + * and checked for backwards compatibility. Newer versions of LNet + * can still work with older versions of lnetctl. The restriction is + * that the structure can be added to and not removed from in order + * to not invalidate older lnetctl utilities. Moreover, the order of + * fields must remain the same, and new fields appended to the structure + * + * That said all existing LND tunables will be added in this structure + * to avoid future changes. + */ +struct lnet_ioctl_config_lnd_cmn_tunables { + __u32 lct_version; + __s32 lct_peer_timeout; + __s32 lct_peer_tx_credits; + __s32 lct_peer_rtr_credits; + __s32 lct_max_tx_credits; +}; + +struct lnet_ioctl_config_o2iblnd_tunables { + __u32 lnd_version; + __u32 lnd_peercredits_hiw; + __u32 lnd_map_on_demand; + __u32 lnd_concurrent_sends; + __u32 lnd_fmr_pool_size; + __u32 lnd_fmr_flush_trigger; + __u32 lnd_fmr_cache; + __u16 lnd_conns_per_peer; + __u16 lnd_ntx; +}; + +struct lnet_ioctl_config_socklnd_tunables { + __u32 lnd_version; + __u16 lnd_conns_per_peer; + __u16 lnd_pad; +}; + +struct lnet_lnd_tunables { + union { + struct lnet_ioctl_config_o2iblnd_tunables lnd_o2ib; + struct lnet_ioctl_config_socklnd_tunables lnd_sock; + } lnd_tun_u; +}; + +struct lnet_ioctl_config_lnd_tunables { + struct lnet_ioctl_config_lnd_cmn_tunables lt_cmn; + struct lnet_lnd_tunables lt_tun; +}; + +struct lnet_ioctl_net_config { + char ni_interface[LNET_MAX_STR_LEN]; + __u32 ni_status; + __u32 ni_cpts[LNET_MAX_SHOW_NUM_CPT]; + char cfg_bulk[0]; +}; + +#define LNET_TINY_BUF_IDX 0 +#define LNET_SMALL_BUF_IDX 1 +#define LNET_LARGE_BUF_IDX 2 + +/* # different router buffer pools */ +#define LNET_NRBPOOLS (LNET_LARGE_BUF_IDX + 1) + +struct lnet_ioctl_pool_cfg { + struct { + __u32 pl_npages; + __u32 pl_nbuffers; + __u32 pl_credits; + __u32 pl_mincredits; + } pl_pools[LNET_NRBPOOLS]; + __u32 pl_routing; +}; + +struct lnet_ioctl_ping_data { + struct libcfs_ioctl_hdr ping_hdr; + + __u32 op_param; + __u32 ping_count; + __u32 ping_flags; + __u32 mr_info; + struct lnet_process_id ping_id; + struct lnet_process_id __user *ping_buf; + lnet_nid_t ping_src; +}; + +struct lnet_ioctl_config_data { + struct libcfs_ioctl_hdr cfg_hdr; + + __u32 cfg_net; + __u32 cfg_count; + __u64 cfg_nid; + __u32 cfg_ncpts; + + union { + struct { + __u32 rtr_hop; + __u32 rtr_priority; + __u32 rtr_flags; + __u32 rtr_sensitivity; + } cfg_route; + struct { + char net_intf[LNET_MAX_STR_LEN]; + __s32 net_peer_timeout; + __s32 net_peer_tx_credits; + __s32 net_peer_rtr_credits; + __s32 net_max_tx_credits; + __u32 net_cksum_algo; + __u32 net_interface_count; + } cfg_net; + struct { + __u32 buf_enable; + __s32 buf_tiny; + __s32 buf_small; + __s32 buf_large; + } 
cfg_buffers; + } cfg_config_u; + + char cfg_bulk[0]; +}; + +struct lnet_ioctl_comm_count { + __u32 ico_get_count; + __u32 ico_put_count; + __u32 ico_reply_count; + __u32 ico_ack_count; + __u32 ico_hello_count; +}; + +struct lnet_ioctl_element_stats { + __u32 iel_send_count; + __u32 iel_recv_count; + __u32 iel_drop_count; +}; + +enum lnet_health_type { + LNET_HEALTH_TYPE_LOCAL_NI = 0, + LNET_HEALTH_TYPE_PEER_NI, +}; + +struct lnet_ioctl_local_ni_hstats { + struct libcfs_ioctl_hdr hlni_hdr; + lnet_nid_t hlni_nid; + __u32 hlni_local_interrupt; + __u32 hlni_local_dropped; + __u32 hlni_local_aborted; + __u32 hlni_local_no_route; + __u32 hlni_local_timeout; + __u32 hlni_local_error; + __s32 hlni_fatal_error; + __s32 hlni_health_value; + __u32 hlni_ping_count; + __u64 hlni_next_ping; +}; + +struct lnet_ioctl_peer_ni_hstats { + __u32 hlpni_remote_dropped; + __u32 hlpni_remote_timeout; + __u32 hlpni_remote_error; + __u32 hlpni_network_timeout; + __s32 hlpni_health_value; + __u32 hlpni_ping_count; + __u64 hlpni_next_ping; +}; + +struct lnet_ioctl_element_msg_stats { + struct libcfs_ioctl_hdr im_hdr; + __u32 im_idx; + struct lnet_ioctl_comm_count im_send_stats; + struct lnet_ioctl_comm_count im_recv_stats; + struct lnet_ioctl_comm_count im_drop_stats; +}; + +/* + * lnet_ioctl_config_ni + * This structure describes an NI configuration. There are multiple components + * when configuring an NI: Net, Interfaces, CPT list and LND tunables + * A network is passed as a string to the DLC and translated using + * libcfs_str2net() + * An interface is the name of the system configured interface + * (ex eth0, ib1) + * CPT is the list of CPTS LND tunables are passed in the lic_bulk area + */ +struct lnet_ioctl_config_ni { + struct libcfs_ioctl_hdr lic_cfg_hdr; + lnet_nid_t lic_nid; + char lic_ni_intf[LNET_MAX_STR_LEN]; + char lic_legacy_ip2nets[LNET_MAX_STR_LEN]; + __u32 lic_cpts[LNET_MAX_SHOW_NUM_CPT]; + __u32 lic_ncpts; + __u32 lic_status; + __u32 lic_idx; + __s32 lic_dev_cpt; + char pad[4]; + char lic_bulk[0]; +}; + +struct lnet_peer_ni_credit_info { + char cr_aliveness[LNET_MAX_STR_LEN]; + __u32 cr_refcount; + __s32 cr_ni_peer_tx_credits; + __s32 cr_peer_tx_credits; + __s32 cr_peer_min_tx_credits; + __u32 cr_peer_tx_qnob; + __s32 cr_peer_rtr_credits; + __s32 cr_peer_min_rtr_credits; + __u32 cr_ncpt; +}; + +struct lnet_ioctl_peer { + struct libcfs_ioctl_hdr pr_hdr; + __u32 pr_count; + __u32 pr_pad; + lnet_nid_t pr_nid; + + union { + struct lnet_peer_ni_credit_info pr_peer_credits; + } pr_lnd_u; +}; + +struct lnet_ioctl_peer_cfg { + struct libcfs_ioctl_hdr prcfg_hdr; + lnet_nid_t prcfg_prim_nid; + lnet_nid_t prcfg_cfg_nid; + __u32 prcfg_count; + __u32 prcfg_mr; + __u32 prcfg_state; + __u32 prcfg_size; + void __user *prcfg_bulk; +}; + +struct lnet_ioctl_reset_health_cfg { + struct libcfs_ioctl_hdr rh_hdr; + enum lnet_health_type rh_type:32; + __u16 rh_all:1; + __s16 rh_value; + lnet_nid_t rh_nid; +}; + +struct lnet_ioctl_reset_conns_per_peer_cfg { + struct libcfs_ioctl_hdr rcpp_hdr; + __u16 rcpp_all:1; + __s16 rcpp_value; + lnet_nid_t rcpp_nid; +}; + +struct lnet_ioctl_recovery_list { + struct libcfs_ioctl_hdr rlst_hdr; + enum lnet_health_type rlst_type:32; + __u32 rlst_num_nids; + lnet_nid_t rlst_nid_array[LNET_MAX_SHOW_NUM_NID]; +}; + +struct lnet_ioctl_set_value { + struct libcfs_ioctl_hdr sv_hdr; + __u32 sv_value; +}; + +struct lnet_ioctl_lnet_stats { + struct libcfs_ioctl_hdr st_hdr; + struct lnet_counters st_cntrs; +}; + +/* An IP, numeric NID or a Net number is composed of 1 or more of these + * 
descriptor structures. + */ +struct lnet_range_expr { + __u32 re_lo; + __u32 re_hi; + __u32 re_stride; +}; + +/* le_count identifies the number of lnet_range_expr in the bulk + * which follows + */ +struct lnet_expressions { + __u32 le_count; +}; + +/* A net descriptor has the net type, IE: O2IBLND, SOCKLND, etc and an + * expression describing a net number range. + */ +struct lnet_ioctl_udsp_net_descr { + __u32 ud_net_type; + struct lnet_expressions ud_net_num_expr; +}; + +/* The UDSP descriptor header contains the type of matching criteria, SRC, + * DST, RTE, etc and how many lnet_expressions compose the LNet portion of + * the LNet NID. For example an IP can be + * composed of 4 lnet_expressions , a gni can be composed of 1 + */ +struct lnet_ioctl_udsp_descr_hdr { + /* The literals SRC, DST and RTE are encoded + * here. + */ + __u32 ud_descr_type; + __u32 ud_descr_count; +}; + +/* each matching expression in the UDSP is described with this. + * The bulk format is as follows: + * 1. 1x struct lnet_ioctl_udsp_net_descr + * -> the net part of the NID + * 2. >=0 struct lnet_expressions + * -> the address part of the NID + */ +struct lnet_ioctl_udsp_descr { + struct lnet_ioctl_udsp_descr_hdr iud_src_hdr; + struct lnet_ioctl_udsp_net_descr iud_net; +}; + +/* The cumulative UDSP descriptor + * The bulk format is as follows: + * 1. >=1 struct lnet_ioctl_udsp_descr + * + * The size indicated in iou_hdr is the total size of the UDSP. + * + */ +struct lnet_ioctl_udsp { + struct libcfs_ioctl_hdr iou_hdr; + __s32 iou_idx; + __u32 iou_action_type; + __u32 iou_bulk_size; + union { + __u32 priority; + } iou_action; + void __user *iou_bulk; +}; + +/* structure used to request udsp instantiation information on the + * specified construct. + * cud_nid: the NID of the local or remote NI to pull info on. + * cud_nid_priority: NID prio of the requested NID. + * cud_net_priority: net prio of network of the requested NID. + * cud_pref_nid: array of preferred NIDs if it exists. + */ +struct lnet_ioctl_construct_udsp_info { + struct libcfs_ioctl_hdr cud_hdr; + __u32 cud_peer:1; + lnet_nid_t cud_nid; + __u32 cud_nid_priority; + __u32 cud_net_priority; + lnet_nid_t cud_pref_nid[LNET_MAX_SHOW_NUM_NID]; + lnet_nid_t cud_pref_rtr_nid[LNET_MAX_SHOW_NUM_NID]; +}; + +#endif /* _LNET_DLC_H_ */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-idl.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-idl.h new file mode 100644 index 0000000000000..bdff24e8839da --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-idl.h @@ -0,0 +1,298 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __UAPI_LNET_IDL_H__ +#define __UAPI_LNET_IDL_H__ + +#include + +/************************************************************************ + * Core LNet wire message format. + * These are sent in sender's byte order (i.e. receiver flips). + */ + +/** Address of an end-point in an LNet network. + * + * A node can have multiple end-points and hence multiple addresses. + * An LNet network can be a simple network (e.g. tcp0) or a network of + * LNet networks connected by LNet routers. Therefore an end-point address + * has two parts: network ID, and address within a network. + * The most-significant-byte in this format is always 0. A larger value + * would imply a larger nid with a larger address. + * + * \see LNET_NIDNET, LNET_NIDADDR, and LNET_MKNID. + */ +typedef __u64 lnet_nid_t; + +/* + * Address of LNet end-point in extended form + * + * To support addresses larger than 32bits we have + * an extended nid which supports up to 128 bits + * of address and is extensible. + * If nid_size is 0, then the nid can be stored in an lnet_nid_t, + * and the first 8 bytes of the 'struct lnet_nid' are identical to + * the lnet_nid_t in big-endian format. + * If nid_type == 0xff, then all other fields should be ignored + * and this is an ANY wildcard address. In particular, the nid_size + * can be 0xff without making the address too big to fit. + */ +struct lnet_nid { + __u8 nid_size; /* total bytes - 8 */ + __u8 nid_type; + __be16 nid_num; + __be32 nid_addr[4]; +} __attribute__((packed)); + +#define NID_BYTES(nid) ((nid)->nid_size + 8) +#define NID_ADDR_BYTES(nid) ((nid)->nid_size + 4) + +/** + * ID of a process in a node. Shortened as PID to distinguish from + * lnet_process_id, the global process ID. + */ +typedef __u32 lnet_pid_t; + +/* Packed version of struct lnet_process_id to transfer via network */ +struct lnet_process_id_packed { + lnet_nid_t nid; + lnet_pid_t pid; /* node id / process id */ +} __attribute__((packed)); + +/* The wire handle's interface cookie only matches one network interface in + * one epoch (i.e. new cookie when the interface restarts or the node + * reboots). The object cookie only matches one object on that interface + * during that object's lifetime (i.e. no cookie re-use). + */ +struct lnet_handle_wire { + __u64 wh_interface_cookie; + __u64 wh_object_cookie; +} __attribute__((packed)); + +enum lnet_msg_type { + LNET_MSG_ACK = 0, + LNET_MSG_PUT, + LNET_MSG_GET, + LNET_MSG_REPLY, + LNET_MSG_HELLO, +}; + +/* The variant fields of the portals message header are aligned on an 8 + * byte boundary in the message header. Note that all types used in these + * wire structs MUST be fixed size and the smaller types are placed at the + * end. 
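+ *
+ * For example, struct lnet_put below places its 64-bit quantities
+ * (the wire handle, match_bits, hdr_data) first and its __u32 fields
+ * (ptl_index, offset) last, so the 8-byte members stay naturally
+ * aligned without implicit padding.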
+ */ +struct lnet_ack { + struct lnet_handle_wire dst_wmd; + __u64 match_bits; + __u32 mlength; +} __attribute__((packed)); + +struct lnet_put { + struct lnet_handle_wire ack_wmd; + __u64 match_bits; + __u64 hdr_data; + __u32 ptl_index; + __u32 offset; +} __attribute__((packed)); + +struct lnet_get { + struct lnet_handle_wire return_wmd; + __u64 match_bits; + __u32 ptl_index; + __u32 src_offset; + __u32 sink_length; +} __attribute__((packed)); + +struct lnet_reply { + struct lnet_handle_wire dst_wmd; +} __attribute__((packed)); + +struct lnet_hello { + __u64 incarnation; + __u32 type; +} __attribute__((packed)); + +union lnet_cmd_hdr { + struct lnet_ack ack; + struct lnet_put put; + struct lnet_get get; + struct lnet_reply reply; + struct lnet_hello hello; +} __attribute__((packed)); + +/* This is used for message headers that lnet code is manipulating. + * All fields before the union are in host-byte-order. + */ +struct lnet_hdr { + struct lnet_nid dest_nid; + struct lnet_nid src_nid; + lnet_pid_t dest_pid; + lnet_pid_t src_pid; + __u32 type; /* enum lnet_msg_type */ + __u32 payload_length; /* payload data to follow */ + /*<------__u64 aligned------->*/ + union lnet_cmd_hdr msg; +} __attribute__((packed)); + +/* This is used to support conversion between an lnet_hdr and + * the content of a network message. + */ +struct _lnet_hdr_nid4 { + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + lnet_pid_t dest_pid; + lnet_pid_t src_pid; + __u32 type; /* enum lnet_msg_type */ + __u32 payload_length; /* payload data to follow */ + /*<------__u64 aligned------->*/ + union lnet_cmd_hdr msg; +} __attribute__((packed)); + +/* This is stored in a network message buffer. Content cannot be accessed + * without converting to an lnet_hdr. + */ +struct lnet_hdr_nid4 { + char _bytes[sizeof(struct _lnet_hdr_nid4)]; +} __attribute__((packed)); + +/* A HELLO message contains a magic number and protocol version + * code in the header's dest_nid, the peer's NID in the src_nid, and + * LNET_MSG_HELLO in the type field. All other common fields are zero + * (including payload_size; i.e. no payload). + * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is + * running the same protocol and to find out its NID. These LNDs should + * exchange HELLO messages when a connection is first established. Individual + * LNDs can put whatever else they fancy in lnet_hdr::msg. + */ +struct lnet_magicversion { + __u32 magic; /* LNET_PROTO_TCP_MAGIC */ + __u16 version_major; /* increment on incompatible change */ + __u16 version_minor; /* increment on compatible change */ +} __attribute__((packed)); + +/* PROTO MAGIC for LNDs */ +#define LNET_PROTO_IB_MAGIC 0x0be91b91 +#define LNET_PROTO_GNI_MAGIC 0xb00fbabe /* ask Kim */ +#define LNET_PROTO_TCP_MAGIC 0xeebc0ded +#define LNET_PROTO_ACCEPTOR_MAGIC 0xacce7100 +#define LNET_PROTO_PING_MAGIC 0x70696E67 /* 'ping' */ + +/* Placeholder for a future "unified" protocol across all LNDs */ +/* Current LNDs that receive a request with this magic will respond + * with a "stub" reply using their current protocol */ +#define LNET_PROTO_MAGIC 0x45726963 /* ! 
*/ + +#define LNET_PROTO_TCP_VERSION_MAJOR 1 +#define LNET_PROTO_TCP_VERSION_MINOR 0 + +/* Acceptor connection request */ +struct lnet_acceptor_connreq { + __u32 acr_magic; /* LNET_PROTO_ACCEPTOR_MAGIC */ + __u32 acr_version; /* protocol version */ + __u64 acr_nid; /* target NID */ +} __attribute__((packed)); + +#define LNET_PROTO_ACCEPTOR_VERSION 1 + +struct lnet_acceptor_connreq_v2 { + __u32 acr_magic; /* LNET_PROTO_ACCEPTOR_MAGIC */ + __u32 acr_version; /* protocol version - 2 */ + struct lnet_nid acr_nid; /* target NID */ +} __attribute__((packed)); + +/* For use with 16-byte addresses */ +#define LNET_PROTO_ACCEPTOR_VERSION_16 2 + +struct lnet_counters_common { + __u32 lcc_msgs_alloc; + __u32 lcc_msgs_max; + __u32 lcc_errors; + __u32 lcc_send_count; + __u32 lcc_recv_count; + __u32 lcc_route_count; + __u32 lcc_drop_count; + __u64 lcc_send_length; + __u64 lcc_recv_length; + __u64 lcc_route_length; + __u64 lcc_drop_length; +} __attribute__((packed)); + + +#define LNET_NI_STATUS_UP 0x15aac0de +#define LNET_NI_STATUS_DOWN 0xdeadface +#define LNET_NI_STATUS_INVALID 0x00000000 + +struct lnet_ni_status { + lnet_nid_t ns_nid; + __u32 ns_status; + __u32 ns_unused; +} __attribute__((packed)); + +/* + * NB: value of these features equal to LNET_PROTO_PING_VERSION_x + * of old LNet, so there shouldn't be any compatibility issue + */ +#define LNET_PING_FEAT_INVAL (0) /* no feature */ +#define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ +#define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ +#define LNET_PING_FEAT_RTE_DISABLED (1 << 2) /* Routing enabled */ +#define LNET_PING_FEAT_MULTI_RAIL (1 << 3) /* Multi-Rail aware */ +#define LNET_PING_FEAT_DISCOVERY (1 << 4) /* Supports Discovery */ + +/* + * All ping feature bits fit to hit the wire. + * In lnet_assert_wire_constants() this is compared against its open-coded + * value, and in lnet_ping_target_update() it is used to verify that no + * unknown bits have been set. + * New feature bits can be added, just be aware that this does change the + * over-the-wire protocol. + */ +#define LNET_PING_FEAT_BITS (LNET_PING_FEAT_BASE | \ + LNET_PING_FEAT_NI_STATUS | \ + LNET_PING_FEAT_RTE_DISABLED | \ + LNET_PING_FEAT_MULTI_RAIL | \ + LNET_PING_FEAT_DISCOVERY) + +struct lnet_ping_info { + __u32 pi_magic; + __u32 pi_features; + lnet_pid_t pi_pid; + __u32 pi_nnis; + struct lnet_ni_status pi_ni[]; +} __attribute__((packed)); + +#define LNET_PING_INFO_SIZE(NNIDS) \ + offsetof(struct lnet_ping_info, pi_ni[NNIDS]) +#define LNET_PING_INFO_LONI(PINFO) ((PINFO)->pi_ni[0].ns_nid) +#define LNET_PING_INFO_SEQNO(PINFO) ((PINFO)->pi_ni[0].ns_status) + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-nl.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-nl.h new file mode 100644 index 0000000000000..8bc0317c73e1c --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-nl.h @@ -0,0 +1,87 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
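Two idioms in the block above are worth spelling out: LNET_PING_INFO_SIZE() sizes a buffer for the flexible pi_ni[] array with offsetof(), and LNET_PING_FEAT_BITS (0x1f given the five feature bits defined here) is the mask a receiver checks in order to reject unknown feature bits, as the comment says lnet_ping_target_update() does. A standalone userspace sketch of both, with the structures mirrored in stdint types; all ex_* names are invented stand-ins:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    struct ex_ni_status {                   /* mirrors struct lnet_ni_status */
            uint64_t ns_nid;
            uint32_t ns_status;
            uint32_t ns_unused;
    } __attribute__((packed));

    struct ex_ping_info {                   /* mirrors struct lnet_ping_info */
            uint32_t pi_magic;
            uint32_t pi_features;
            uint32_t pi_pid;
            uint32_t pi_nnis;
            struct ex_ni_status pi_ni[];
    } __attribute__((packed));

    #define EX_PING_INFO_SIZE(nnids) offsetof(struct ex_ping_info, pi_ni[nnids])
    #define EX_PING_FEAT_BITS 0x1fu         /* stands in for LNET_PING_FEAT_BITS */

    int main(void)
    {
            /* 16-byte header plus one 16-byte status slot per NI */
            assert(EX_PING_INFO_SIZE(4) == 16 + 4 * sizeof(struct ex_ni_status));

            struct ex_ping_info *pi = calloc(1, EX_PING_INFO_SIZE(4));
            if (!pi)
                    return 1;
            pi->pi_nnis = 4;
            pi->pi_features = 1u << 6;      /* pretend a peer set an unknown bit */

            /* A receiver rejects anything outside the known feature mask. */
            int unknown = (pi->pi_features & ~EX_PING_FEAT_BITS) != 0;

            free(pi);
            return unknown ? 1 : 0;
    }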
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. + * + * LGPL HEADER END + * + */ +/* Copyright (c) 2021, UT-Battelle, LLC + * + * Author: James Simmons + */ + +#ifndef __UAPI_LNET_NL_H__ +#define __UAPI_LNET_NL_H__ + +#include + +enum lnet_nl_key_format { + /* Is it FLOW or BLOCK */ + LNKF_FLOW = 1, + /* Is it SEQUENCE or MAPPING */ + LNKF_MAPPING = 2, + LNKF_SEQUENCE = 4, +}; + +/** + * enum lnet_nl_scalar_attrs - scalar LNet netlink attributes used + * to compose messages for sending or + * receiving. + * + * @LN_SCALAR_ATTR_UNSPEC: unspecified attribute to catch errors + * @LN_SCALAR_ATTR_PAD: padding for 64-bit attributes, ignore + * + * @LN_SCALAR_ATTR_LIST: List of scalar attributes (NLA_NESTED) + * @LN_SCALAR_ATTR_LIST_SIZE: Number of items in scalar list (NLA_U16) + * @LN_SCALAR_ATTR_INDEX: True Netlink attr value (NLA_U16) + * @LN_SCALAR_ATTR_NLA_TYPE: Data format for value part of the pair + * (NLA_U16) + * @LN_SCALAR_ATTR_VALUE: String value of key part of the pair. + * (NLA_NUL_STRING) + * @LN_SCALAR_ATTR_INT_VALUE: Numeric value of key part of the pair. + * (NLA_S64) + * @LN_SCALAR_ATTR_KEY_FORMAT: LNKF_* format of the key value pair. + */ +enum lnet_nl_scalar_attrs { + LN_SCALAR_ATTR_UNSPEC = 0, + LN_SCALAR_ATTR_PAD = LN_SCALAR_ATTR_UNSPEC, + + LN_SCALAR_ATTR_LIST, + LN_SCALAR_ATTR_LIST_SIZE, + LN_SCALAR_ATTR_INDEX, + LN_SCALAR_ATTR_NLA_TYPE, + LN_SCALAR_ATTR_VALUE, + LN_SCALAR_ATTR_INT_VALUE, + LN_SCALAR_ATTR_KEY_FORMAT, + + __LN_SCALAR_ATTR_MAX_PLUS_ONE, +}; + +#define LN_SCALAR_MAX (__LN_SCALAR_ATTR_MAX_PLUS_ONE - 1) + +struct ln_key_props { + char *lkp_value; + __u16 lkp_key_format; + __u16 lkp_data_type; +}; + +struct ln_key_list { + __u16 lkl_maxattr; + struct ln_key_props lkl_list[]; +}; + +#endif /* __UAPI_LNET_NL_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h new file mode 100644 index 0000000000000..d32ec52263f57 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h @@ -0,0 +1,635 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __UAPI_LNET_TYPES_H__ +#define __UAPI_LNET_TYPES_H__ + +#include +#include + +/** \addtogroup lnet + * @{ */ + +#include + +/** \addtogroup lnet_addr + * @{ */ + +#define LNET_VERSION "0.7.0" + +/** Portal reserved for LNet's own use. 
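The ln_key_props/ln_key_list pair above (from lnet-nl.h) is a table format: lkl_maxattr names the highest attribute number and lkl_list[] appears to be indexed by attribute number, each slot pairing a key name with an LNKF_* layout hint and an NLA_* data type. A hypothetical table built that way is sketched below; the ex_* mirrors, the two attribute slots and their names are invented, the NLA_* stand-in values are arbitrary, and statically initialising a flexible array member is a GNU C extension:

    #include <stdint.h>

    enum { EX_NLA_NUL_STRING = 1, EX_NLA_U16 = 2 }; /* arbitrary NLA_* stand-ins */

    #define EX_LNKF_MAPPING 2               /* matches LNKF_MAPPING above */

    struct ex_key_props {                   /* mirrors struct ln_key_props */
            char     *lkp_value;
            uint16_t  lkp_key_format;
            uint16_t  lkp_data_type;
    };

    struct ex_key_list {                    /* mirrors struct ln_key_list */
            uint16_t  lkl_maxattr;
            struct ex_key_props lkl_list[];
    };

    /* Slot 0 is left empty (attribute 0 is the "unspec" catch-all above). */
    static const struct ex_key_list ex_keys = {
            .lkl_maxattr = 2,
            .lkl_list = {
                    [1] = { .lkp_value      = "name",
                            .lkp_key_format = EX_LNKF_MAPPING,
                            .lkp_data_type  = EX_NLA_NUL_STRING },
                    [2] = { .lkp_value      = "count",
                            .lkp_data_type  = EX_NLA_U16 },
            },
    };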
+ * \see lustre/include/lustre/lustre_idl.h for Lustre portal assignments. + */ +#define LNET_RESERVED_PORTAL 0 + +/** wildcard NID that matches any end-point address */ +#define LNET_NID_ANY ((lnet_nid_t) -1) +/** wildcard PID that matches any lnet_pid_t */ +#define LNET_PID_ANY ((lnet_pid_t) -1) + +static inline int LNET_NID_IS_ANY(const struct lnet_nid *nid) +{ + /* A NULL pointer can be used to mean "ANY" */ + return !nid || nid->nid_type == 0xFF; +} + +#define LNET_ANY_NID ((struct lnet_nid) \ + {0xFF, 0xFF, ~0, {~0, ~0, ~0, ~0} }) + +#define LNET_PID_RESERVED 0xf0000000 /* reserved bits in PID */ +#define LNET_PID_USERFLAG 0x80000000 /* set in userspace peers */ +#define LNET_PID_LUSTRE 12345 + +/* how an LNET NID encodes net:address */ +/** extract the address part of an lnet_nid_t */ + +static inline __u32 LNET_NIDADDR(lnet_nid_t nid) +{ + return nid & 0xffffffff; +} + +static inline __u32 LNET_NIDNET(lnet_nid_t nid) +{ + return (nid >> 32) & 0xffffffff; +} + +static inline lnet_nid_t LNET_MKNID(__u32 net, __u32 addr) +{ + return (((__u64)net) << 32) | addr; +} + +static inline __u32 LNET_NETNUM(__u32 net) +{ + return net & 0xffff; +} + +static inline __u32 LNET_NETTYP(__u32 net) +{ + return (net >> 16) & 0xff; +} + +static inline __u32 LNET_MKNET(__u32 type, __u32 num) +{ + return (type << 16) | num; +} + +/** The lolnd NID (i.e. myself) */ +#define LNET_NID_LO_0 LNET_MKNID(LNET_MKNET(LOLND, 0), 0) + +#define LNET_NET_ANY LNET_NIDNET(LNET_NID_ANY) + +static inline int nid_is_nid4(const struct lnet_nid *nid) +{ + return NID_ADDR_BYTES(nid) == 4; +} + +/* LOLND may not be defined yet, so we cannot use an inline */ +#define nid_is_lo0(__nid) \ + ((__nid)->nid_type == LOLND && \ + nid_is_nid4(__nid) && \ + (__nid)->nid_num == 0 && \ + (__nid)->nid_addr[0] == 0) + +static inline __u32 LNET_NID_NET(const struct lnet_nid *nid) +{ + if (LNET_NID_IS_ANY(nid)) + return LNET_NET_ANY; + else + return LNET_MKNET(nid->nid_type, __be16_to_cpu(nid->nid_num)); +} + +static inline void lnet_nid4_to_nid(lnet_nid_t nid4, struct lnet_nid *nid) +{ + if (nid4 == LNET_NID_ANY) { + *nid = LNET_ANY_NID; + return; + } + + nid->nid_size = 0; + nid->nid_type = LNET_NETTYP(LNET_NIDNET(nid4)); + nid->nid_num = __cpu_to_be16(LNET_NETNUM(LNET_NIDNET(nid4))); + nid->nid_addr[0] = __cpu_to_be32(LNET_NIDADDR(nid4)); + nid->nid_addr[1] = nid->nid_addr[2] = nid->nid_addr[3] = 0; +} + +static inline lnet_nid_t lnet_nid_to_nid4(const struct lnet_nid *nid) +{ + if (LNET_NID_IS_ANY(nid)) + return LNET_NID_ANY; + + return LNET_MKNID(LNET_NID_NET(nid), __be32_to_cpu(nid->nid_addr[0])); +} + +static inline int nid_same(const struct lnet_nid *n1, + const struct lnet_nid *n2) +{ + return n1->nid_size == n2->nid_size && + n1->nid_type == n2->nid_type && + n1->nid_num == n2->nid_num && + n1->nid_addr[0] == n2->nid_addr[0] && + n1->nid_addr[1] == n2->nid_addr[1] && + n1->nid_addr[2] == n2->nid_addr[2] && + n1->nid_addr[3] == n2->nid_addr[3]; +} + +/* This can be used when we need to hash a nid */ +static inline unsigned long nidhash(const struct lnet_nid *nid) +{ + int i; + unsigned long hash = 0; + + hash ^= LNET_NID_NET(nid); + for (i = 0; i < 4; i++) + hash ^= nid->nid_addr[i]; + return hash; +} + +struct lnet_counters_health { + __u32 lch_rst_alloc; + __u32 lch_resend_count; + __u32 lch_response_timeout_count; + __u32 lch_local_interrupt_count; + __u32 lch_local_dropped_count; + __u32 lch_local_aborted_count; + __u32 lch_local_no_route_count; + __u32 lch_local_timeout_count; + __u32 lch_local_error_count; + __u32 
lch_remote_dropped_count; + __u32 lch_remote_error_count; + __u32 lch_remote_timeout_count; + __u32 lch_network_timeout_count; +}; + +struct lnet_counters { + struct lnet_counters_common lct_common; + struct lnet_counters_health lct_health; +}; + +/* + * This is a hard-coded limit on the number of interfaces supported by + * the interface bonding implemented by the ksocknal LND. It must be + * defined here because it is used in LNet data structures that are + * common to all LNDs. + */ +#define LNET_INTERFACES_NUM 16 + +/* The minimum number of interfaces per node supported by LNet. */ +#define LNET_INTERFACES_MIN 16 +/* The default - arbitrary - value of the lnet_max_interfaces tunable. */ +#define LNET_INTERFACES_MAX_DEFAULT 200 + +/** + * Objects maintained by the LNet are accessed through handles. Handle types + * have names of the form lnet_handle_xx, where xx is one of the two letter + * object type codes ('md' for memory descriptor, and + * 'me' for match entry). Each type of object is given a unique handle type + * to enhance type checking. + */ +#define LNET_WIRE_HANDLE_COOKIE_NONE (~0ULL) + +struct lnet_handle_md { + __u64 cookie; +}; + +/** + * Invalidate md handle \a h. + */ +static inline void LNetInvalidateMDHandle(struct lnet_handle_md *h) +{ + h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE; +} + +/** + * Check whether handler \a h is invalid. + * + * \return 1 if handle is invalid, 0 if valid. + */ +static inline int LNetMDHandleIsInvalid(struct lnet_handle_md h) +{ + return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie); +} + +/** + * Global process ID. + */ +struct lnet_process_id { + /** node id */ + lnet_nid_t nid; + /** process id */ + lnet_pid_t pid; +}; + +/** + * Global process ID - with large addresses + */ +struct lnet_processid { + /** node id */ + struct lnet_nid nid; + /** process id */ + lnet_pid_t pid; +}; + +static inline void +lnet_pid4_to_pid(struct lnet_process_id pid4, struct lnet_processid *pid) +{ + pid->pid = pid4.pid; + lnet_nid4_to_nid(pid4.nid, &pid->nid); +} + +static inline struct lnet_process_id +lnet_pid_to_pid4(struct lnet_processid *pid) +{ + struct lnet_process_id ret; + + ret.pid = pid->pid; + ret.nid = lnet_nid_to_nid4(&pid->nid); + return ret; +} + +/** @} lnet_addr */ + +/** \addtogroup lnet_me + * @{ */ + +/** + * Specifies whether the match entry or memory descriptor should be unlinked + * automatically (LNET_UNLINK) or not (LNET_RETAIN). + */ +enum lnet_unlink { + LNET_RETAIN = 0, + LNET_UNLINK +}; + +/** + * Values of the type enum lnet_ins_pos are used to control where a new match + * entry is inserted. The value LNET_INS_BEFORE is used to insert the new + * entry before the current entry or before the head of the list. The value + * LNET_INS_AFTER is used to insert the new entry after the current entry + * or after the last item in the list. + */ +enum lnet_ins_pos { + /** insert ME before current position or head of the list */ + LNET_INS_BEFORE, + /** insert ME after current position or tail of the list */ + LNET_INS_AFTER, + /** attach ME at tail of local CPU partition ME list */ + LNET_INS_LOCAL +}; + +/** @} lnet_me */ + +/** \addtogroup lnet_md + * @{ */ + +struct lnet_hdr_nid16 { + char _bytes[sizeof(struct lnet_hdr)]; +} __attribute__((packed)); + +/** + * Event queue handler function type. + * + * The EQ handler runs for each event that is deposited into the EQ. The + * handler is supplied with a pointer to the event that triggered the + * handler invocation. 
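Backing up to the MD handle helpers defined just above: the invalid-cookie value gives callers a cheap "no MD yet" state, so the usual pattern is to invalidate the handle up front and test it before reusing or unlinking it (see LNetMDUnlink, referenced later). A small sketch using only those two inlines; the ex_* names are invented:

    static struct lnet_handle_md ex_mdh;    /* handle owned by some module */

    static void ex_handle_init(void)
    {
            /* Start from the explicit "no MD" state. */
            LNetInvalidateMDHandle(&ex_mdh);
    }

    static int ex_handle_is_live(void)
    {
            return !LNetMDHandleIsInvalid(ex_mdh);
    }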
+ * + * The handler must not block, must be reentrant, and must not call any LNet + * API functions. It should return as quickly as possible. + */ +struct lnet_event; +typedef void (*lnet_handler_t)(struct lnet_event *event); + +/** + * Defines the visible parts of a memory descriptor. Values of this type + * are used to initialize memory descriptors. + */ +struct lnet_md { + /** + * Specify the memory region associated with the memory descriptor. + * If the options field has: + * - LNET_MD_KIOV bit set: The start field points to the starting + * address of an array of struct bio_vec and the length field specifies + * the number of entries in the array. The length can't be bigger + * than LNET_MAX_IOV. The struct bio_vec is used to describe page-based + * fragments that are not necessarily mapped in virtal memory. + * - Otherwise: The memory region is contiguous. The start field + * specifies the starting address for the memory region and the + * length field specifies its length. + * + * When the memory region is fragmented, all fragments but the first + * one must start on page boundary, and all but the last must end on + * page boundary. + */ + void *start; + unsigned int length; + /** + * Specifies the maximum number of operations that can be performed + * on the memory descriptor. An operation is any action that could + * possibly generate an event. In the usual case, the threshold value + * is decremented for each operation on the MD. When the threshold + * drops to zero, the MD becomes inactive and does not respond to + * operations. A threshold value of LNET_MD_THRESH_INF indicates that + * there is no bound on the number of operations that may be applied + * to a MD. + */ + int threshold; + /** + * Specifies the largest incoming request that the memory descriptor + * should respond to. When the unused portion of a MD (length - + * local offset) falls below this value, the MD becomes inactive and + * does not respond to further operations. This value is only used + * if the LNET_MD_MAX_SIZE option is set. + */ + int max_size; + /** + * Specifies the behavior of the memory descriptor. A bitwise OR + * of the following values can be used: + * - LNET_MD_OP_PUT: The LNet PUT operation is allowed on this MD. + * - LNET_MD_OP_GET: The LNet GET operation is allowed on this MD. + * - LNET_MD_MANAGE_REMOTE: The offset used in accessing the memory + * region is provided by the incoming request. By default, the + * offset is maintained locally. When maintained locally, the + * offset is incremented by the length of the request so that + * the next operation (PUT or GET) will access the next part of + * the memory region. Note that only one offset variable exists + * per memory descriptor. If both PUT and GET operations are + * performed on a memory descriptor, the offset is updated each time. + * - LNET_MD_TRUNCATE: The length provided in the incoming request can + * be reduced to match the memory available in the region (determined + * by subtracting the offset from the length of the memory region). + * By default, if the length in the incoming operation is greater + * than the amount of memory available, the operation is rejected. + * - LNET_MD_ACK_DISABLE: An acknowledgment should not be sent for + * incoming PUT operations, even if requested. By default, + * acknowledgments are sent for PUT operations that request an + * acknowledgment. Acknowledgments are never sent for GET operations. + * The data sent in the REPLY serves as an implicit acknowledgment. 
+ * - LNET_MD_KIOV: The start and length fields specify an array of + * struct bio_vec. + * - LNET_MD_MAX_SIZE: The max_size field is valid. + * - LNET_MD_BULK_HANDLE: The bulk_handle field is valid. + * - LNET_MD_TRACK_RESPONSE: Enable response tracking on this MD + * regardless of the value of the lnet_response_tracking param. + * - LNET_MD_NO_TRACK_RESPONSE: Disable response tracking on this MD + * regardless of the value of the lnet_response_tracking param. + * - LNET_MD_GNILND: Disable warning about exceeding LNET_MAX_IOV. + * + * Note: + * - LNET_MD_KIOV allows for a scatter/gather capability for memory + * descriptors. + * - When LNET_MD_MAX_SIZE is set, the total length of the memory + * region (i.e. sum of all fragment lengths) must not be less than + * \a max_size. + */ + unsigned int options; + /** + * A user-specified value that is associated with the memory + * descriptor. The value does not need to be a pointer, but must fit + * in the space used by a pointer. This value is recorded in events + * associated with operations on this MD. + */ + void *user_ptr; + /** + * The event handler used to log the operations performed on + * the memory region. If this argument is NULL operations + * performed on this memory descriptor are not logged. + */ + lnet_handler_t handler; + /** + * The bulk MD handle which was registered to describe the buffers + * either to be used to transfer data to the peer or receive data + * from the peer. This allows LNet to properly determine the NUMA + * node on which the memory was allocated and use that to select the + * nearest local network interface. This value is only used + * if the LNET_MD_BULK_HANDLE option is set. + */ + struct lnet_handle_md bulk_handle; +}; + +/* Max Transfer Unit (minimum supported everywhere). + * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks) + * these limits are system wide and not interface-local. */ +#define LNET_MTU_BITS 20 +#define LNET_MTU (1 << LNET_MTU_BITS) + +/** + * Options for the MD structure. See struct lnet_md::options. + */ +#define LNET_MD_OP_PUT (1 << 0) +/** See struct lnet_md::options. */ +#define LNET_MD_OP_GET (1 << 1) +/** See struct lnet_md::options. */ +#define LNET_MD_MANAGE_REMOTE (1 << 2) +/* unused (1 << 3) */ +/** See struct lnet_md::options. */ +#define LNET_MD_TRUNCATE (1 << 4) +/** See struct lnet_md::options. */ +#define LNET_MD_ACK_DISABLE (1 << 5) +/** See struct lnet_md::options. */ +/* deprecated #define LNET_MD_IOVEC (1 << 6) */ +/** See struct lnet_md::options. */ +#define LNET_MD_MAX_SIZE (1 << 7) +/** See struct lnet_md::options. */ +#define LNET_MD_KIOV (1 << 8) +/** See struct lnet_md::options. */ +#define LNET_MD_BULK_HANDLE (1 << 9) +/** See struct lnet_md::options. */ +#define LNET_MD_TRACK_RESPONSE (1 << 10) +/** See struct lnet_md::options. */ +#define LNET_MD_NO_TRACK_RESPONSE (1 << 11) +/** See struct lnet_md::options. */ +#define LNET_MD_GNILND (1 << 12) + +/** Infinite threshold on MD operations. See struct lnet_md::threshold */ +#define LNET_MD_THRESH_INF (-1) + +/** @} lnet_md */ + +/** \addtogroup lnet_eq + * @{ */ + +/** + * Six types of events can be logged in an event queue. + */ +enum lnet_event_kind { + /** An incoming GET operation has completed on the MD. */ + LNET_EVENT_GET = 1, + /** + * An incoming PUT operation has completed on the MD. The + * underlying layers will not alter the memory (on behalf of this + * operation) once this event has been logged. + */ + LNET_EVENT_PUT, + /** + * A REPLY operation has completed. 
This event is logged after the + * data (if any) from the REPLY has been written into the MD. + */ + LNET_EVENT_REPLY, + /** An acknowledgment has been received. */ + LNET_EVENT_ACK, + /** + * An outgoing send (PUT or GET) operation has completed. This event + * is logged after the entire buffer has been sent and it is safe for + * the caller to reuse the buffer. + * + * Note: + * - The LNET_EVENT_SEND doesn't guarantee message delivery. It can + * happen even when the message has not yet been put out on wire. + * - It's unsafe to assume that in an outgoing GET operation + * the LNET_EVENT_SEND event would happen before the + * LNET_EVENT_REPLY event. The same holds for LNET_EVENT_SEND and + * LNET_EVENT_ACK events in an outgoing PUT operation. + */ + LNET_EVENT_SEND, + /** + * A MD has been unlinked. Note that LNetMDUnlink() does not + * necessarily trigger an LNET_EVENT_UNLINK event. + * \see LNetMDUnlink + */ + LNET_EVENT_UNLINK, +}; + +#define LNET_SEQ_GT(a, b) (((signed long)((a) - (b))) > 0) + +/** + * Information about an event on a MD. + */ +struct lnet_event { + /** The identifier (nid, pid) of the target. */ + struct lnet_processid target; + /** The identifier (nid, pid) of the initiator. */ + struct lnet_processid initiator; + /** The source NID on the initiator. */ + struct lnet_processid source; + /** + * The NID of the immediate sender. If the request has been forwarded + * by routers, this is the NID of the last hop; otherwise it's the + * same as the source. + */ + struct lnet_nid sender; + /** Indicates the type of the event. */ + enum lnet_event_kind type; + /** The portal table index specified in the request */ + unsigned int pt_index; + /** A copy of the match bits specified in the request. */ + __u64 match_bits; + /** The length (in bytes) specified in the request. */ + unsigned int rlength; + /** + * The length (in bytes) of the data that was manipulated by the + * operation. For truncated operations, the manipulated length will be + * the number of bytes specified by the MD (possibly with an offset, + * see struct lnet_md). For all other operations, the manipulated length + * will be the length of the requested operation, i.e. rlength. + */ + unsigned int mlength; + /** + * The handle to the MD associated with the event. The handle may be + * invalid if the MD has been unlinked. + */ + struct lnet_handle_md md_handle; + /** + * A snapshot of relevant state of the MD immediately after the event + * has been processed. + */ + void *md_start; + void *md_user_ptr; + unsigned int md_options; + /** + * 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT. + * \see LNetPut + */ + __u64 hdr_data; + /** + * The message type, to ensure a handler for LNET_EVENT_SEND can + * distinguish between LNET_MSG_GET and LNET_MSG_PUT. + */ + __u32 msg_type; + /** + * Indicates the completion status of the operation. It's 0 for + * successful operations, otherwise it's an error code. + */ + int status; + /** + * Indicates whether the MD has been unlinked. Note that: + * - An event with unlinked set is the last event on the MD. + * - This field is also set for an explicit LNET_EVENT_UNLINK event. + * \see LNetMDUnlink + */ + int unlinked; + /** + * The displacement (in bytes) into the memory region that the + * operation used. The offset can be determined by the operation for + * a remote managed MD or by the local MD. + * \see struct lnet_md::options + */ + unsigned int offset; + /** + * The sequence number for this event. Sequence numbers are unique + * to each event. 
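Pulling the pieces above together: the sketch below defines an event handler that obeys the stated constraints (no blocking, no calls back into the LNet API, return quickly) and fills in a struct lnet_md for a contiguous receive buffer. It uses only types and constants defined in this header; attaching or binding the MD is done through LNet calls that are not part of these UAPI headers, so that step is only noted in a comment. All ex_* names are invented:

    static void ex_md_handler(struct lnet_event *ev)
    {
            /* Runs once per event; must not block or call back into LNet. */
            switch (ev->type) {
            case LNET_EVENT_PUT:
                    /* ev->mlength bytes landed in the buffer; ev->status is
                     * 0 on success and ev->hdr_data carries the sender's
                     * 64-bit out-of-band word (valid for PUT events only). */
                    break;
            case LNET_EVENT_UNLINK:
                    /* ev->unlinked is also set on the final event of an MD. */
                    break;
            default:
                    break;
            }
    }

    static void ex_md_setup(void *buf, unsigned int len)
    {
            struct lnet_md md = {
                    .start     = buf,               /* contiguous region    */
                    .length    = len,
                    .threshold = LNET_MD_THRESH_INF,
                    .options   = LNET_MD_OP_PUT,    /* accept incoming PUTs */
                    .user_ptr  = NULL,
                    .handler   = ex_md_handler,
            };
            /* md would now be handed to the LNet MD attach/bind API. */
            (void)md;
    }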
+ */ + volatile unsigned long sequence; +}; + +/** \addtogroup lnet_data + * @{ */ + +/** + * Specify whether an acknowledgment should be sent by target when the PUT + * operation completes (i.e., when the data has been written to a MD of the + * target process). + * + * \see struct lnet_md::options for the discussion on LNET_MD_ACK_DISABLE + * by which acknowledgments can be disabled for a MD. + */ +enum lnet_ack_req { + /** Request an acknowledgment */ + LNET_ACK_REQ, + /** Request that no acknowledgment should be generated. */ + LNET_NOACK_REQ +}; + +/** + * UDSP action types. There are two available actions: + * 1. PRIORITY - set priority of matching LNet constructs + * 2. PREFERRED LIST - set preferred list of matching LNet constructs + */ +enum lnet_udsp_action_type { + EN_LNET_UDSP_ACTION_NONE = 0, + /** assign a priority to matching constructs */ + EN_LNET_UDSP_ACTION_PRIORITY = 1, + /** assign a preferred list of NIDs to matching constructs */ + EN_LNET_UDSP_ACTION_PREFERRED_LIST = 2, +}; + +/** @} lnet_data */ + +/** @} lnet */ +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h new file mode 100644 index 0000000000000..bbbed82d82874 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h @@ -0,0 +1,154 @@ +/* + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * header for lnet ioctl + */ +/* + * Copyright (c) 2014, 2017, Intel Corporation. + */ +#ifndef __UAPI_LNETCTL_H_ +#define __UAPI_LNETCTL_H_ + +#include +#include + +/** \addtogroup lnet_fault_simulation + * @{ */ + +enum { + LNET_CTL_DROP_ADD, + LNET_CTL_DROP_DEL, + LNET_CTL_DROP_RESET, + LNET_CTL_DROP_LIST, + LNET_CTL_DELAY_ADD, + LNET_CTL_DELAY_DEL, + LNET_CTL_DELAY_RESET, + LNET_CTL_DELAY_LIST, +}; + +#define LNET_ACK_BIT (1 << 0) +#define LNET_PUT_BIT (1 << 1) +#define LNET_GET_BIT (1 << 2) +#define LNET_REPLY_BIT (1 << 3) + +#define HSTATUS_END 11 +#define HSTATUS_LOCAL_INTERRUPT_BIT (1 << 1) +#define HSTATUS_LOCAL_DROPPED_BIT (1 << 2) +#define HSTATUS_LOCAL_ABORTED_BIT (1 << 3) +#define HSTATUS_LOCAL_NO_ROUTE_BIT (1 << 4) +#define HSTATUS_LOCAL_ERROR_BIT (1 << 5) +#define HSTATUS_LOCAL_TIMEOUT_BIT (1 << 6) +#define HSTATUS_REMOTE_ERROR_BIT (1 << 7) +#define HSTATUS_REMOTE_DROPPED_BIT (1 << 8) +#define HSTATUS_REMOTE_TIMEOUT_BIT (1 << 9) +#define HSTATUS_NETWORK_TIMEOUT_BIT (1 << 10) +#define HSTATUS_RANDOM 0xffffffff + +/** ioctl parameter for LNet fault simulation */ +struct lnet_fault_attr { + /** + * source NID of drop rule + * LNET_NID_ANY is wildcard for all sources + * 255.255.255.255@net is wildcard for all addresses from @net + */ + lnet_nid_t fa_src; + /** destination NID of drop rule, see \a dr_src for details */ + lnet_nid_t fa_dst; + /** local NID. 
In case of router this is the NID we're ceiving + * messages on + */ + lnet_nid_t fa_local_nid; + /** + * Portal mask to drop, -1 means all portals, for example: + * fa_ptl_mask = (1 << _LDLM_CB_REQUEST_PORTAL ) | + * (1 << LDLM_CANCEL_REQUEST_PORTAL) + * + * If it is non-zero then only PUT and GET will be filtered, otherwise + * there is no portal filter, all matched messages will be checked. + */ + __u64 fa_ptl_mask; + /** + * message types to drop, for example: + * dra_type = LNET_DROP_ACK_BIT | LNET_DROP_PUT_BIT + * + * If it is non-zero then only specified message types are filtered, + * otherwise all message types will be checked. + */ + __u32 fa_msg_mask; + union { + /** message drop simulation */ + struct { + /** drop rate of this rule */ + __u32 da_rate; + /** + * time interval of message drop, it is exclusive + * with da_rate + */ + __u32 da_interval; + /** error type mask */ + __u32 da_health_error_mask; + /** randomize error generation */ + __u32 da_random:1, + /** drop all messages if flag is set */ + da_drop_all:1; + } drop; + /** message latency simulation */ + struct { + __u32 la_rate; + /** + * time interval of message delay, it is exclusive + * with la_rate + */ + __u32 la_interval; + /** latency to delay */ + __u32 la_latency; + } delay; + __u64 space[8]; + } u; + +}; + +/** fault simluation stats */ +struct lnet_fault_stat { + /** total # matched messages */ + __u64 fs_count; + /** # dropped LNET_MSG_PUT by this rule */ + __u64 fs_put; + /** # dropped LNET_MSG_ACK by this rule */ + __u64 fs_ack; + /** # dropped LNET_MSG_GET by this rule */ + __u64 fs_get; + /** # dropped LNET_MSG_REPLY by this rule */ + __u64 fs_reply; + union { + struct { + /** total # dropped messages */ + __u64 ds_dropped; + } drop; + struct { + /** total # delayed messages */ + __u64 ls_delayed; + } delay; + __u64 space[8]; + } u; +}; + +/** @} lnet_fault_simulation */ + +#define LNET_DEV_ID 0 +#define LNET_DEV_PATH "/dev/lnet" + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h new file mode 100644 index 0000000000000..8749f8a5b0646 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h @@ -0,0 +1,537 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. 
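As a usage sketch for the fault-injection attributes above: a rule is described entirely in data, e.g. a drop rule matching PUTs and ACKs between any pair of NIDs, with the rate and interval fields being mutually exclusive. The helper below only fills the structure (LNET_NID_ANY comes from lnet-types.h); how a rule is actually submitted (the LNET_CTL_DROP_ADD operation over the LNet ioctl device at LNET_DEV_PATH) is not spelled out in this header and is left out. The ex_ name is invented, and the exact semantics of da_rate are not defined here, so treat the value as illustrative:

    static struct lnet_fault_attr ex_drop_rule(void)
    {
            struct lnet_fault_attr attr = {
                    .fa_src      = LNET_NID_ANY,    /* any source          */
                    .fa_dst      = LNET_NID_ANY,    /* any destination     */
                    .fa_ptl_mask = 0,               /* no portal filtering */
                    .fa_msg_mask = LNET_PUT_BIT | LNET_ACK_BIT,
            };

            /* da_rate and da_interval are exclusive; this is a rate-based
             * rule, so the interval stays 0 (zeroed by the initialiser). */
            attr.u.drop.da_rate = 100;              /* illustrative value  */

            return attr;
    }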
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Liang Zhen + */ + +#ifndef __UAPI_LNET_ST_H__ +#define __UAPI_LNET_ST_H__ + +#include +#include +#include + +#define LST_FEAT_NONE (0) +#define LST_FEAT_BULK_LEN (1 << 0) /* enable variable page size */ + +#define LST_FEATS_EMPTY (LST_FEAT_NONE) +#define LST_FEATS_MASK (LST_FEAT_NONE | LST_FEAT_BULK_LEN) + +#define LST_NAME_SIZE 32 /* max name buffer length */ + +#define LSTIO_DEBUG 0xC00 /* debug */ +#define LSTIO_SESSION_NEW 0xC01 /* create session */ +#define LSTIO_SESSION_END 0xC02 /* end session */ +#define LSTIO_SESSION_INFO 0xC03 /* query session */ +#define LSTIO_GROUP_ADD 0xC10 /* add group */ +#define LSTIO_GROUP_LIST 0xC11 /* list all groups in session */ +#define LSTIO_GROUP_INFO 0xC12 /* query defailt infomation of specified group */ +#define LSTIO_GROUP_DEL 0xC13 /* delete group */ +#define LSTIO_NODES_ADD 0xC14 /* add nodes to specified group */ +#define LSTIO_GROUP_UPDATE 0xC15 /* update group */ +#define LSTIO_BATCH_ADD 0xC20 /* add batch */ +#define LSTIO_BATCH_START 0xC21 /* start batch */ +#define LSTIO_BATCH_STOP 0xC22 /* stop batch */ +#define LSTIO_BATCH_DEL 0xC23 /* delete batch */ +#define LSTIO_BATCH_LIST 0xC24 /* show all batches in the session */ +#define LSTIO_BATCH_INFO 0xC25 /* show defail of specified batch */ +#define LSTIO_TEST_ADD 0xC26 /* add test (to batch) */ +#define LSTIO_BATCH_QUERY 0xC27 /* query batch status */ +#define LSTIO_STAT_QUERY 0xC30 /* get stats */ + +/* + * sparse kernel source annotations + */ +#ifndef __user +#define __user +#endif + +struct lst_sid { + lnet_nid_t ses_nid; /* nid of console node */ + __s64 ses_stamp; /* time stamp in milliseconds */ +}; /*** session id */ + +extern struct lst_sid LST_INVALID_SID; + +struct lst_bid { + __u64 bat_id; /* unique id in session */ +}; + +/* Status of test node */ +#define LST_NODE_ACTIVE 0x1 /* node in this session */ +#define LST_NODE_BUSY 0x2 /* node is taken by other session */ +#define LST_NODE_DOWN 0x4 /* node is down */ +#define LST_NODE_UNKNOWN 0x8 /* node not in session */ + +struct lstcon_node_ent { + struct lnet_process_id nde_id; /* id of node */ + int nde_state; /* state of node */ +}; /*** node entry, for list_group command */ + +struct lstcon_ndlist_ent { + int nle_nnode; /* # of nodes */ + int nle_nactive; /* # of active nodes */ + int nle_nbusy; /* # of busy nodes */ + int nle_ndown; /* # of down nodes */ + int nle_nunknown; /* # of unknown nodes */ +}; /*** node_list entry, for list_batch command */ + +struct lstcon_test_ent { + int tse_type; /* test type */ + int tse_loop; /* loop count */ + int tse_concur; /* concurrency of test */ +}; /*** test summary entry, for list_batch command */ + +struct lstcon_batch_ent { + int bae_state; /* batch status */ + int bae_timeout; /* batch timeout */ + int bae_ntest; /* # of tests in the batch */ +}; /*** batch summary entry, for list_batch command */ + +struct lstcon_test_batch_ent { + struct lstcon_ndlist_ent tbe_cli_nle; /* client (group) node_list entry */ + struct lstcon_ndlist_ent tbe_srv_nle; /* server (group) node_list entry */ + union { + struct lstcon_test_ent tbe_test; /* test entry */ + struct lstcon_batch_ent tbe_batch; /* batch entry */ + } u; +}; /*** test/batch verbose information entry, + *** for list_batch command */ + +/* This will go away once we move to netlink */ +#if !defined(__KERNEL__) && !defined(__LIBCFS_UTIL_LIST_H__) +struct list_head { + struct list_head *next, *prev; +}; +#endif + +struct lstcon_rpc_ent { + struct list_head 
rpe_link; /* link chain */ + struct lnet_process_id rpe_peer; /* peer's id */ + /* This has not been used since Lustre 2.2 so its safe to use. + * Update to allow future use of timespec64 + */ + struct { + __s64 tv_sec; + __s64 tv_nsec; + } rpe_stamp; /* time stamp of RPC */ + int rpe_state; /* peer's state */ + int rpe_rpc_errno; /* RPC errno */ + + struct lst_sid rpe_sid; /* peer's session id */ + int rpe_fwk_errno; /* framework errno */ + int rpe_priv[4]; /* private data */ + char rpe_payload[0]; /* private reply payload */ +}; + +struct lstcon_trans_stat { + int trs_rpc_stat[4]; /* RPCs stat (0: total, 1: failed, 2: finished, 4: reserved */ + int trs_rpc_errno; /* RPC errno */ + int trs_fwk_stat[8]; /* framework stat */ + int trs_fwk_errno; /* errno of the first remote error */ + void *trs_fwk_private; /* private framework stat */ +}; + +static inline int +lstcon_rpc_stat_total(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[0] : stat->trs_rpc_stat[0]; +} + +static inline int +lstcon_rpc_stat_success(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[1] : stat->trs_rpc_stat[1]; +} + +static inline int +lstcon_rpc_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[2] : stat->trs_rpc_stat[2]; +} + +static inline int +lstcon_sesop_stat_success(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_sesop_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_sesqry_stat_active(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_sesqry_stat_busy(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_sesqry_stat_unknown(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2]; +} + +static inline int +lstcon_tsbop_stat_success(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_tsbop_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_tsbqry_stat_idle(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_tsbqry_stat_run(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_tsbqry_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2]; +} + +static inline int +lstcon_statqry_stat_success(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_statqry_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +/* create a session */ +struct lstio_session_new_args { + int lstio_ses_key; /* IN: local key */ + int lstio_ses_timeout; /* IN: session timeout */ + int lstio_ses_force; /* IN: force create ? 
*/ + /** IN: session features */ + unsigned lstio_ses_feats; + struct lst_sid __user *lstio_ses_idp; /* OUT: session id */ + int lstio_ses_nmlen; /* IN: name length */ + char __user *lstio_ses_namep; /* IN: session name */ +}; + +/* query current session */ +struct lstio_session_info_args { + struct lst_sid __user *lstio_ses_idp; /* OUT: session id */ + int __user *lstio_ses_keyp; /* OUT: local key */ + /** OUT: session features */ + unsigned __user *lstio_ses_featp; + struct lstcon_ndlist_ent __user *lstio_ses_ndinfo; /* OUT: */ + int lstio_ses_nmlen; /* IN: name length */ + char __user *lstio_ses_namep; /* OUT: session name */ +}; + +/* delete a session */ +struct lstio_session_end_args { + int lstio_ses_key; /* IN: session key */ +}; + +#define LST_OPC_SESSION 1 +#define LST_OPC_GROUP 2 +#define LST_OPC_NODES 3 +#define LST_OPC_BATCHCLI 4 +#define LST_OPC_BATCHSRV 5 + +struct lstio_debug_args { + int lstio_dbg_key; /* IN: session key */ + int lstio_dbg_type; /* IN: debug sessin|batch|group|nodes list */ + int lstio_dbg_flags; /* IN: reserved debug flags */ + int lstio_dbg_timeout; /* IN: timeout of debug */ + + int lstio_dbg_nmlen; /* IN: len of name */ + char __user *lstio_dbg_namep; /* IN: name of group|batch */ + int lstio_dbg_count; /* IN: # of test nodes to debug */ + struct lnet_process_id __user *lstio_dbg_idsp; /* IN: id of test nodes */ + /* OUT: list head of result buffer */ + struct list_head __user *lstio_dbg_resultp; +}; + +struct lstio_group_add_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char __user *lstio_grp_namep; /* IN: group name */ +}; + +struct lstio_group_del_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char __user *lstio_grp_namep; /* IN: group name */ +}; + +#define LST_GROUP_CLEAN 1 /* remove inactive nodes in the group */ +#define LST_GROUP_REFRESH 2 /* refresh inactive nodes in the group */ +#define LST_GROUP_RMND 3 /* delete nodes from the group */ + +struct lstio_group_update_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_opc; /* IN: OPC */ + int lstio_grp_args; /* IN: arguments */ + int lstio_grp_nmlen; /* IN: name length */ + char __user *lstio_grp_namep; /* IN: group name */ + int lstio_grp_count; /* IN: # of nodes id */ + struct lnet_process_id __user *lstio_grp_idsp; /* IN: array of nodes */ + /* OUT: list head of result buffer */ + struct list_head __user *lstio_grp_resultp; +}; + +struct lstio_group_nodes_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char __user *lstio_grp_namep; /* IN: group name */ + int lstio_grp_count; /* IN: # of nodes */ + /** OUT: session features */ + unsigned __user *lstio_grp_featp; + struct lnet_process_id __user *lstio_grp_idsp; /* IN: nodes */ + /* OUT: list head of result buffer */ + struct list_head __user *lstio_grp_resultp; +}; + +struct lstio_group_list_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_idx; /* IN: group idx */ + int lstio_grp_nmlen; /* IN: name len */ + char __user *lstio_grp_namep; /* OUT: name */ +}; + +struct lstio_group_info_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name len */ + char __user *lstio_grp_namep; /* IN: name */ + struct lstcon_ndlist_ent __user *lstio_grp_entp;/* OUT: description of group */ + + int __user *lstio_grp_idxp; /* IN/OUT: node index */ + int __user *lstio_grp_ndentp; /* IN/OUT: # of nodent */ + struct lstcon_node_ent __user *lstio_grp_dentsp;/* 
OUT: nodent array */ +}; + +#define LST_DEFAULT_BATCH "batch" /* default batch name */ + +struct lstio_batch_add_args { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char __user *lstio_bat_namep; /* IN: batch name */ +}; + +struct lstio_batch_del_args { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char __user *lstio_bat_namep; /* IN: batch name */ +}; + +struct lstio_batch_run_args { + /* IN: session key */ + int lstio_bat_key; + /* IN: timeout for the batch */ + int lstio_bat_timeout; + /* IN: name length */ + int lstio_bat_nmlen; + /* IN: batch name */ + char __user *lstio_bat_namep; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_bat_resultp; +}; + +struct lstio_batch_stop_args { + /* IN: session key */ + int lstio_bat_key; + /* IN: abort unfinished test RPC */ + int lstio_bat_force; + /* IN: name length */ + int lstio_bat_nmlen; + /* IN: batch name */ + char __user *lstio_bat_namep; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_bat_resultp; +}; + +struct lstio_batch_query_args { + /* IN: session key */ + int lstio_bat_key; + /* IN: test index */ + int lstio_bat_testidx; + /* IN: is test client? */ + int lstio_bat_client; + /* IN: timeout for waiting */ + int lstio_bat_timeout; + /* IN: name length */ + int lstio_bat_nmlen; + /* IN: batch name */ + char __user *lstio_bat_namep; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_bat_resultp; +}; + +struct lstio_batch_list_args { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_idx; /* IN: index */ + int lstio_bat_nmlen; /* IN: name length */ + char __user *lstio_bat_namep; /* IN: batch name */ +}; + +struct lstio_batch_info_args { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char __user *lstio_bat_namep; /* IN: name */ + int lstio_bat_server; /* IN: query server or not */ + int lstio_bat_testidx; /* IN: test index */ + struct lstcon_test_batch_ent __user *lstio_bat_entp;/* OUT: batch ent */ + + int __user *lstio_bat_idxp; /* IN/OUT: index of node */ + int __user *lstio_bat_ndentp; /* IN/OUT: # of nodent */ + struct lstcon_node_ent __user *lstio_bat_dentsp;/* array of nodent */ +}; + +/* add stat in session */ +struct lstio_stat_args { + /* IN: session key */ + int lstio_sta_key; + /* IN: timeout for stat requst */ + int lstio_sta_timeout; + /* IN: group name length */ + int lstio_sta_nmlen; + /* IN: group name */ + char __user *lstio_sta_namep; + /* IN: # of pid */ + int lstio_sta_count; + /* IN: pid */ + struct lnet_process_id __user *lstio_sta_idsp; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_sta_resultp; +}; + +enum lst_test_type { + LST_TEST_BULK = 1, + LST_TEST_PING = 2 +}; + +/* create a test in a batch */ +#define LST_MAX_CONCUR 1024 /* Max concurrency of test */ + +struct lstio_test_args { + int lstio_tes_key; /* IN: session key */ + int lstio_tes_bat_nmlen; /* IN: batch name len */ + char __user *lstio_tes_bat_name; /* IN: batch name */ + int lstio_tes_type; /* IN: test type */ + int lstio_tes_oneside; /* IN: one sided test */ + int lstio_tes_loop; /* IN: loop count */ + int lstio_tes_concur; /* IN: concurrency */ + + int lstio_tes_dist; /* IN: node distribution in destination groups */ + int lstio_tes_span; /* IN: node span in destination groups */ + int lstio_tes_sgrp_nmlen; /* IN: source group name length */ + char __user *lstio_tes_sgrp_name; /* IN: group name */ + int 
lstio_tes_dgrp_nmlen; /* IN: destination group name length */ + char __user *lstio_tes_dgrp_name; /* IN: group name */ + + /* IN: param buffer len */ + int lstio_tes_param_len; + /* IN: parameter for specified test: + lstio_bulk_param_t, + lstio_ping_param_t, + ... more */ + void __user *lstio_tes_param; + /* OUT: private returned value */ + int __user *lstio_tes_retp; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_tes_resultp; +}; + +enum lst_brw_type { + LST_BRW_READ = 1, + LST_BRW_WRITE = 2 +}; + +enum lst_brw_flags { + LST_BRW_CHECK_NONE = 1, + LST_BRW_CHECK_SIMPLE = 2, + LST_BRW_CHECK_FULL = 3 +}; + +struct lst_test_bulk_param { + int blk_opc; /* bulk operation code */ + int blk_size; /* size (bytes) */ + int blk_time; /* time of running the test*/ + int blk_flags; /* reserved flags */ + int blk_cli_off; /* bulk offset on client */ + int blk_srv_off; /* reserved: bulk offset on server */ +}; + +struct lst_test_ping_param { + int png_size; /* size of ping message */ + int png_time; /* time */ + int png_loop; /* loop */ + int png_flags; /* reserved flags */ +}; + +/* Both struct srpc_counters and struct sfw_counters are sent over the wire */ +struct srpc_counters { + __u32 errors; + __u32 rpcs_sent; + __u32 rpcs_rcvd; + __u32 rpcs_dropped; + __u32 rpcs_expired; + __u64 bulk_get; + __u64 bulk_put; +} __attribute__((packed)); + +struct sfw_counters { + /** milliseconds since current session started */ + __u32 running_ms; + __u32 active_batches; + __u32 zombie_sessions; + __u32 brw_errors; + __u32 ping_errors; +} __attribute__((packed)); + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h new file mode 100644 index 0000000000000..9e4b156450e0b --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h @@ -0,0 +1,107 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, 2017, Intel Corporation. + */ +#ifndef _LNET_NIDSTRINGS_H +#define _LNET_NIDSTRINGS_H + +#include +#include + +/** + * Lustre Network Driver types. + */ +enum { + /* Only add to these values (i.e. don't ever change or redefine them): + * network addresses depend on them... 
*/ + /*QSWLND = 1, removed v2_7_50 */ + SOCKLND = 2, + /*GMLND = 3, removed v2_0_0-rc1a-16-gc660aac */ + /*PTLLND = 4, removed v2_7_50 */ + O2IBLND = 5, + /*CIBLND = 6, removed v2_0_0-rc1a-175-gd2b8a0e */ + /*OPENIBLND = 7, removed v2_0_0-rc1a-175-gd2b8a0e */ + /*IIBLND = 8, removed v2_0_0-rc1a-175-gd2b8a0e */ + LOLND = 9, + /*RALND = 10, removed v2_7_50_0-34-g8be9e41 */ + /*VIBLND = 11, removed v2_0_0-rc1a-175-gd2b8a0e */ + /*MXLND = 12, removed v2_7_50_0-34-g8be9e41 */ + GNILND = 13, + GNIIPLND = 14, + PTL4LND = 15, + KFILND = 16, + + NUM_LNDS +}; + +struct list_head; + +#define LNET_NIDSTR_COUNT 1024 /* # of nidstrings */ +#define LNET_NIDSTR_SIZE 64 /* size of each one (see below for usage) */ + +/* support decl needed by both kernel and user space */ +char *libcfs_next_nidstring(void); +int libcfs_isknown_lnd(__u32 lnd); +char *libcfs_lnd2modname(__u32 lnd); +char *libcfs_lnd2str_r(__u32 lnd, char *buf, __kernel_size_t buf_size); +static inline char *libcfs_lnd2str(__u32 lnd) +{ + return libcfs_lnd2str_r(lnd, libcfs_next_nidstring(), + LNET_NIDSTR_SIZE); +} +int libcfs_str2lnd(const char *str); +char *libcfs_net2str_r(__u32 net, char *buf, __kernel_size_t buf_size); +static inline char *libcfs_net2str(__u32 net) +{ + return libcfs_net2str_r(net, libcfs_next_nidstring(), + LNET_NIDSTR_SIZE); +} +char *libcfs_nid2str_r(lnet_nid_t nid, char *buf, __kernel_size_t buf_size); +static inline char *libcfs_nid2str(lnet_nid_t nid) +{ + return libcfs_nid2str_r(nid, libcfs_next_nidstring(), + LNET_NIDSTR_SIZE); +} + +__u32 libcfs_str2net(const char *str); +lnet_nid_t libcfs_str2nid(const char *str); +int libcfs_str2anynid(lnet_nid_t *nid, const char *str); +int libcfs_num_parse(char *str, int len, struct list_head *list); +char *libcfs_id2str(struct lnet_process_id id); +void cfs_free_nidlist(struct list_head *list); +int cfs_parse_nidlist(char *str, int len, struct list_head *list); +int cfs_print_nidlist(char *buffer, int count, struct list_head *list); +int cfs_match_nid(lnet_nid_t nid, struct list_head *list); +int cfs_match_net(__u32 net_id, __u32 net_type, + struct list_head *net_num_list); + +int cfs_ip_addr_parse(char *str, int len, struct list_head *list); +int cfs_ip_addr_match(__u32 addr, struct list_head *list); +int cfs_nidrange_find_min_max(struct list_head *nidlist, char *min_nid, + char *max_nid, __kernel_size_t nidstr_length); +void cfs_expr_list_free_list(struct list_head *list); + +#endif /* _LNET_NIDSTRINGS_H */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h new file mode 100644 index 0000000000000..2df2cf731db15 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h @@ -0,0 +1,43 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
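For the nidstr helpers above, the _r suffix marks the reentrant variants that take a caller-supplied buffer of LNET_NIDSTR_SIZE, while the inline wrappers hand back one of the LNET_NIDSTR_COUNT internal buffers via libcfs_next_nidstring(), so their return value is best treated as short-lived (the pool is presumably reused in rotation). A small sketch of both calls; it is not standalone, since it needs whatever library or module provides the libcfs_* implementations, and the ex_ name is invented:

    static void ex_log_nid(lnet_nid_t nid)
    {
            char buf[LNET_NIDSTR_SIZE];

            /* Reentrant form: the caller owns the buffer. */
            libcfs_nid2str_r(nid, buf, sizeof(buf));

            /* Convenience form: fine for immediate use, e.g. in a log line,
             * but don't stash the returned pointer for later. */
            const char *s = libcfs_nid2str(nid);

            (void)s;
    }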
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * #defines shared between socknal implementation and utilities + */ +#ifndef __UAPI_LNET_SOCKLND_H__ +#define __UAPI_LNET_SOCKLND_H__ + +#define SOCKLND_CONN_NONE (-1) +#define SOCKLND_CONN_ANY 0 +#define SOCKLND_CONN_CONTROL 1 +#define SOCKLND_CONN_BULK_IN 2 +#define SOCKLND_CONN_BULK_OUT 3 +#define SOCKLND_CONN_NTYPES 4 + +#define SOCKLND_CONN_ACK SOCKLND_CONN_BULK_IN + +#endif diff --git a/drivers/staging/lustrefsx/lnet/klnds/Makefile b/drivers/staging/lustrefsx/lnet/klnds/Makefile new file mode 100644 index 0000000000000..cd375ca2cc67f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/Makefile @@ -0,0 +1,2 @@ +obj-y += o2iblnd/ +obj-y += socklnd/ diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/Makefile new file mode 100644 index 0000000000000..5ce6dc99ffe1a --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_LNET_XPRT_IB) += ko2iblnd.o + +ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd-idl.h b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd-idl.h new file mode 100644 index 0000000000000..35df50b99bbb6 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd-idl.h @@ -0,0 +1,155 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/klnds/o2iblnd/o2iblnd-idl.h + * + * Author: Eric Barton + */ +#ifndef __LNET_O2IBLND_IDL_H__ +#define __LNET_O2IBLND_IDL_H__ + +#include + +/************************************************************************ + * IB Wire message format. + * These are sent in sender's byte order (i.e. receiver flips). + */ + +struct kib_connparams { + u16 ibcp_queue_depth; + u16 ibcp_max_frags; + u32 ibcp_max_msg_size; +} __packed; + +struct kib_immediate_msg { + struct lnet_hdr_nid4 ibim_hdr; /* portals header */ + char ibim_payload[0];/* piggy-backed payload */ +} __packed; + +struct kib_rdma_frag { + u32 rf_nob; /* # bytes this frag */ + u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! 
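The "CAVEAT EMPTOR: misaligned!!" remark above can be verified directly: with the packed attribute, rf_addr sits at byte offset 4 and the frag occupies 12 bytes on the wire, whereas the compiler's natural layout would pad it out. A standalone userspace sketch; the ex_* mirrors are invented, and the natural-layout numbers printed at the end are typically 8 and 16 on 64-bit ABIs:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct ex_rdma_frag {                   /* mirrors struct kib_rdma_frag */
            uint32_t rf_nob;
            uint64_t rf_addr;               /* the "misaligned" u64         */
    } __attribute__((packed));

    struct ex_rdma_frag_natural {           /* same fields, default layout  */
            uint32_t rf_nob;
            uint64_t rf_addr;
    };

    int main(void)
    {
            /* Guaranteed by the packed attribute, matching the wire format. */
            assert(offsetof(struct ex_rdma_frag, rf_addr) == 4);
            assert(sizeof(struct ex_rdma_frag) == 12);

            /* For comparison: what the compiler does without packing. */
            printf("natural layout: rf_addr at %zu, sizeof %zu\n",
                   offsetof(struct ex_rdma_frag_natural, rf_addr),
                   sizeof(struct ex_rdma_frag_natural));
            return 0;
    }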
*/ +} __packed; + +struct kib_rdma_desc { + u32 rd_key; /* local/remote key */ + u32 rd_nfrags; /* # fragments */ + struct kib_rdma_frag rd_frags[0]; /* buffer frags */ +} __packed; + +struct kib_putreq_msg { + struct lnet_hdr_nid4 ibprm_hdr; /* portals header */ + u64 ibprm_cookie; /* opaque completion cookie */ +} __packed; + +struct kib_putack_msg { + u64 ibpam_src_cookie;/* reflected completion cookie */ + u64 ibpam_dst_cookie;/* opaque completion cookie */ + struct kib_rdma_desc ibpam_rd; /* sender's sink buffer */ +} __packed; + +struct kib_get_msg { + struct lnet_hdr_nid4 ibgm_hdr; /* portals header */ + u64 ibgm_cookie; /* opaque completion cookie */ + struct kib_rdma_desc ibgm_rd; /* rdma descriptor */ +} __packed; + +struct kib_completion_msg { + u64 ibcm_cookie; /* opaque completion cookie */ + s32 ibcm_status; /* < 0 failure: >= 0 length */ +} __packed; + +struct kib_msg { + /* First 2 fields fixed FOR ALL TIME */ + u32 ibm_magic; /* I'm an ibnal message */ + u16 ibm_version; /* this is my version number */ + + u8 ibm_type; /* msg type */ + u8 ibm_credits; /* returned credits */ + u32 ibm_nob; /* # bytes in whole message */ + u32 ibm_cksum; /* checksum (0 == no checksum) */ + u64 ibm_srcnid; /* sender's NID */ + u64 ibm_srcstamp; /* sender's incarnation */ + u64 ibm_dstnid; /* destination's NID */ + u64 ibm_dststamp; /* destination's incarnation */ + + union { + struct kib_connparams connparams; + struct kib_immediate_msg immediate; + struct kib_putreq_msg putreq; + struct kib_putack_msg putack; + struct kib_get_msg get; + struct kib_completion_msg completion; + } __packed ibm_u; +} __packed; + +#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */ + +#define IBLND_MSG_VERSION_1 0x11 +#define IBLND_MSG_VERSION_2 0x12 +#define IBLND_MSG_VERSION IBLND_MSG_VERSION_2 + +#define IBLND_MSG_CONNREQ 0xc0 /* connection request */ +#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */ +#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */ +#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */ +#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */ +#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */ +#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ +#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ +#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ + +struct kib_rej { + u32 ibr_magic; /* sender's magic */ + u16 ibr_version; /* sender's version */ + u8 ibr_why; /* reject reason */ + u8 ibr_padding; /* padding */ + u64 ibr_incarnation;/* incarnation of peer_ni */ + struct kib_connparams ibr_cp; /* connection parameters */ +} __packed; + +/* connection rejection reasons */ +#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */ +#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */ +#define IBLND_REJECT_FATAL 3 /* Anything else */ + +#define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer_ni */ +#define IBLND_REJECT_CONN_STALE 5 /* stale peer_ni */ + +/* peer_ni's rdma frags doesn't match mine */ +#define IBLND_REJECT_RDMA_FRAGS 6 +/* peer_ni's msg queue size doesn't match mine */ +#define IBLND_REJECT_MSG_QUEUE_SIZE 7 +#define IBLND_REJECT_INVALID_SRV_ID 8 + +/***********************************************************************/ + +#endif /* __LNET_O2IBLND_IDL_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c new file mode 100644 index 
0000000000000..e9c23326b5c19 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c @@ -0,0 +1,3596 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/klnds/o2iblnd/o2iblnd.c + * + * Author: Eric Barton + */ + +#include +#include +#include + +#include "o2iblnd.h" + +static const struct lnet_lnd the_o2iblnd; + +struct kib_data kiblnd_data; + +static __u32 +kiblnd_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + /* ensure I don't return 0 (== no checksum) */ + return (sum == 0) ? 1 : sum; +} + +static char * +kiblnd_msgtype2str(int type) +{ + switch (type) { + case IBLND_MSG_CONNREQ: + return "CONNREQ"; + + case IBLND_MSG_CONNACK: + return "CONNACK"; + + case IBLND_MSG_NOOP: + return "NOOP"; + + case IBLND_MSG_IMMEDIATE: + return "IMMEDIATE"; + + case IBLND_MSG_PUT_REQ: + return "PUT_REQ"; + + case IBLND_MSG_PUT_NAK: + return "PUT_NAK"; + + case IBLND_MSG_PUT_ACK: + return "PUT_ACK"; + + case IBLND_MSG_PUT_DONE: + return "PUT_DONE"; + + case IBLND_MSG_GET_REQ: + return "GET_REQ"; + + case IBLND_MSG_GET_DONE: + return "GET_DONE"; + + default: + return "???"; + } +} + +static int +kiblnd_msgtype2size(int type) +{ + const int hdr_size = offsetof(struct kib_msg, ibm_u); + + switch (type) { + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + return hdr_size + sizeof(struct kib_connparams); + + case IBLND_MSG_NOOP: + return hdr_size; + + case IBLND_MSG_IMMEDIATE: + return offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[0]); + + case IBLND_MSG_PUT_REQ: + return hdr_size + sizeof(struct kib_putreq_msg); + + case IBLND_MSG_PUT_ACK: + return hdr_size + sizeof(struct kib_putack_msg); + + case IBLND_MSG_GET_REQ: + return hdr_size + sizeof(struct kib_get_msg); + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + return hdr_size + sizeof(struct kib_completion_msg); + default: + return -1; + } +} + +static int kiblnd_unpack_rd(struct kib_msg *msg, bool flip) +{ + struct kib_rdma_desc *rd; + int nob; + int n; + int i; + + LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ || + msg->ibm_type == IBLND_MSG_PUT_ACK); + + rd = msg->ibm_type == IBLND_MSG_GET_REQ ? 
+ &msg->ibm_u.get.ibgm_rd : + &msg->ibm_u.putack.ibpam_rd; + + if (flip) { + __swab32s(&rd->rd_key); + __swab32s(&rd->rd_nfrags); + } + + n = rd->rd_nfrags; + + if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) { + CERROR("Bad nfrags: %d, should be 0 < n <= %d\n", + n, IBLND_MAX_RDMA_FRAGS); + return 1; + } + + nob = offsetof(struct kib_msg, ibm_u) + + kiblnd_rd_msg_size(rd, msg->ibm_type, n); + + if (msg->ibm_nob < nob) { + CERROR("Short %s: %d(%d)\n", + kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob); + return 1; + } + + if (!flip) + return 0; + + for (i = 0; i < n; i++) { + __swab32s(&rd->rd_frags[i].rf_nob); + __swab64s(&rd->rd_frags[i].rf_addr); + } + + return 0; +} + +void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp) +{ + struct kib_net *net = ni->ni_data; + + /* CAVEAT EMPTOR! all message fields not set here should have been + * initialised previously. + */ + msg->ibm_magic = IBLND_MSG_MAGIC; + msg->ibm_version = version; + /* ibm_type */ + msg->ibm_credits = credits; + /* ibm_nob */ + msg->ibm_cksum = 0; + msg->ibm_srcnid = lnet_nid_to_nid4(&ni->ni_nid); + msg->ibm_srcstamp = net->ibn_incarnation; + msg->ibm_dstnid = dstnid; + msg->ibm_dststamp = dststamp; + + if (*kiblnd_tunables.kib_cksum) { + /* NB ibm_cksum zero while computing cksum */ + msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob); + } +} + +int kiblnd_unpack_msg(struct kib_msg *msg, int nob) +{ + const int hdr_size = offsetof(struct kib_msg, ibm_u); + __u32 msg_cksum; + __u16 version; + int msg_nob; + bool flip; + + /* 6 bytes are enough to have received magic + version */ + if (nob < 6) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + if (msg->ibm_magic == IBLND_MSG_MAGIC) { + flip = false; + } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) { + flip = true; + } else { + CERROR("Bad magic: %08x\n", msg->ibm_magic); + return -EPROTO; + } + + version = flip ? __swab16(msg->ibm_version) : msg->ibm_version; + if (version != IBLND_MSG_VERSION && + version != IBLND_MSG_VERSION_1) { + CERROR("Bad version: %x\n", version); + return -EPROTO; + } + + if (nob < hdr_size) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; + if (msg_nob > nob) { + CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); + return -EPROTO; + } + + /* checksum must be computed with ibm_cksum zero and BEFORE anything + * gets flipped + */ + msg_cksum = flip ? 
__swab32(msg->ibm_cksum) : msg->ibm_cksum; + msg->ibm_cksum = 0; + if (msg_cksum != 0 && + msg_cksum != kiblnd_cksum(msg, msg_nob)) { + CERROR("Bad checksum\n"); + return -EPROTO; + } + + msg->ibm_cksum = msg_cksum; + + if (flip) { + /* leave magic unflipped as a clue to peer_ni endianness */ + msg->ibm_version = version; + BUILD_BUG_ON(sizeof(msg->ibm_type) != 1); + BUILD_BUG_ON(sizeof(msg->ibm_credits) != 1); + msg->ibm_nob = msg_nob; + __swab64s(&msg->ibm_srcnid); + __swab64s(&msg->ibm_srcstamp); + __swab64s(&msg->ibm_dstnid); + __swab64s(&msg->ibm_dststamp); + } + + if (msg->ibm_srcnid == LNET_NID_ANY) { + CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); + return -EPROTO; + } + + if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) { + CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type), + msg_nob, kiblnd_msgtype2size(msg->ibm_type)); + return -EPROTO; + } + + switch (msg->ibm_type) { + default: + CERROR("Unknown message type %x\n", msg->ibm_type); + return -EPROTO; + + case IBLND_MSG_NOOP: + case IBLND_MSG_IMMEDIATE: + case IBLND_MSG_PUT_REQ: + break; + + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_GET_REQ: + if (kiblnd_unpack_rd(msg, flip)) + return -EPROTO; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + if (flip) + __swab32s(&msg->ibm_u.completion.ibcm_status); + break; + + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + if (flip) { + __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth); + __swab16s(&msg->ibm_u.connparams.ibcp_max_frags); + __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); + } + break; + } + return 0; +} + +int +kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, + lnet_nid_t nid) +{ + struct kib_peer_ni *peer_ni; + struct kib_net *net = ni->ni_data; + int cpt = lnet_cpt_of_nid(nid, ni); + unsigned long flags; + + LASSERT(net != NULL); + LASSERT(nid != LNET_NID_ANY); + + LIBCFS_CPT_ALLOC(peer_ni, lnet_cpt_table(), cpt, sizeof(*peer_ni)); + if (!peer_ni) { + CERROR("Cannot allocate peer_ni\n"); + return -ENOMEM; + } + + peer_ni->ibp_ni = ni; + peer_ni->ibp_nid = nid; + peer_ni->ibp_error = 0; + peer_ni->ibp_last_alive = 0; + peer_ni->ibp_max_frags = IBLND_MAX_RDMA_FRAGS; + peer_ni->ibp_queue_depth = ni->ni_net->net_tunables.lct_peer_tx_credits; + peer_ni->ibp_queue_depth_mod = 0; /* try to use the default */ + kref_init(&peer_ni->ibp_kref); + + INIT_HLIST_NODE(&peer_ni->ibp_list); + INIT_LIST_HEAD(&peer_ni->ibp_conns); + INIT_LIST_HEAD(&peer_ni->ibp_tx_queue); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT(net->ibn_shutdown == 0); + + /* npeers only grows with the global lock held */ + atomic_inc(&net->ibn_npeers); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + *peerp = peer_ni; + return 0; +} + +void +kiblnd_destroy_peer(struct kref *kref) +{ + struct kib_peer_ni *peer_ni = container_of(kref, struct kib_peer_ni, + ibp_kref); + struct kib_net *net = peer_ni->ibp_ni->ni_data; + + LASSERT(net != NULL); + LASSERT(!kiblnd_peer_active(peer_ni)); + LASSERT(kiblnd_peer_idle(peer_ni)); + LASSERT(list_empty(&peer_ni->ibp_tx_queue)); + + LIBCFS_FREE(peer_ni, sizeof(*peer_ni)); + + /* NB a peer_ni's connections keep a reference on their peer_ni until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer_ni has been cleaned up when its refcount drops to + * zero. 
+ */ + if (atomic_dec_and_test(&net->ibn_npeers)) + wake_up_var(&net->ibn_npeers); +} + +struct kib_peer_ni * +kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid) +{ + /* the caller is responsible for accounting the additional reference + * that this creates + */ + struct kib_peer_ni *peer_ni; + + hash_for_each_possible(kiblnd_data.kib_peers, peer_ni, + ibp_list, nid) { + LASSERT(!kiblnd_peer_idle(peer_ni)); + + /* + * Match a peer if its NID and the NID of the local NI it + * communicates over are the same. Otherwise don't match + * the peer, which will result in a new lnd peer being + * created. + */ + if (peer_ni->ibp_nid != nid || + !nid_same(&peer_ni->ibp_ni->ni_nid, &ni->ni_nid)) + continue; + + CDEBUG(D_NET, "got peer_ni [%p] -> %s (%d) version: %x\n", + peer_ni, libcfs_nid2str(nid), + kref_read(&peer_ni->ibp_kref), + peer_ni->ibp_version); + return peer_ni; + } + return NULL; +} + +void +kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni) +{ + LASSERT(list_empty(&peer_ni->ibp_conns)); + + LASSERT(kiblnd_peer_active(peer_ni)); + hlist_del_init(&peer_ni->ibp_list); + /* lose peerlist's ref */ + kiblnd_peer_decref(peer_ni); +} + +static int +kiblnd_get_peer_info(struct lnet_ni *ni, int index, + lnet_nid_t *nidp, int *count) +{ + struct kib_peer_ni *peer_ni; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + hash_for_each(kiblnd_data.kib_peers, i, peer_ni, ibp_list) { + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + if (index-- > 0) + continue; + + *nidp = peer_ni->ibp_nid; + *count = kref_read(&peer_ni->ibp_kref); + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return 0; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return -ENOENT; +} + +static void +kiblnd_del_peer_locked(struct kib_peer_ni *peer_ni) +{ + struct kib_conn *cnxt; + struct kib_conn *conn; + + if (list_empty(&peer_ni->ibp_conns)) { + kiblnd_unlink_peer_locked(peer_ni); + } else { + list_for_each_entry_safe(conn, cnxt, &peer_ni->ibp_conns, + ibc_list) + kiblnd_close_conn_locked(conn, 0); + /* NB closing peer_ni's last conn unlinked it. */ + } + /* NB peer_ni now unlinked; might even be freed if the peer_ni table had the + * last ref on it. 
*/ +} + +static int +kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) +{ + LIST_HEAD(zombies); + struct hlist_node *pnxt; + struct kib_peer_ni *peer_ni; + int lo; + int hi; + int i; + unsigned long flags; + int rc = -ENOENT; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) { + lo = hash_min(nid, HASH_BITS(kiblnd_data.kib_peers)); + hi = lo; + } else { + lo = 0; + hi = HASH_SIZE(kiblnd_data.kib_peers) - 1; + } + + for (i = lo; i <= hi; i++) { + hlist_for_each_entry_safe(peer_ni, pnxt, + &kiblnd_data.kib_peers[i], ibp_list) { + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || peer_ni->ibp_nid == nid)) + continue; + + if (!list_empty(&peer_ni->ibp_tx_queue)) { + LASSERT(list_empty(&peer_ni->ibp_conns)); + + list_splice_init(&peer_ni->ibp_tx_queue, + &zombies); + } + + kiblnd_del_peer_locked(peer_ni); + rc = 0; /* matched something */ + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(&zombies, -EIO, LNET_MSG_STATUS_LOCAL_ERROR); + + return rc; +} + +static struct kib_conn * +kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) +{ + struct kib_peer_ni *peer_ni; + struct kib_conn *conn; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + hash_for_each(kiblnd_data.kib_peers, i, peer_ni, ibp_list) { + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + list_for_each_entry(conn, &peer_ni->ibp_conns, + ibc_list) { + if (index-- > 0) + continue; + + kiblnd_conn_addref(conn); + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return conn; + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return NULL; +} + +static void +kiblnd_debug_rx(struct kib_rx *rx) +{ + CDEBUG(D_CONSOLE, " %p msg_type %x cred %d\n", + rx, rx->rx_msg->ibm_type, + rx->rx_msg->ibm_credits); +} + +static void +kiblnd_debug_tx(struct kib_tx *tx) +{ + CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lld " + "cookie %#llx msg %s%s type %x cred %d\n", + tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting, + tx->tx_status, ktime_to_ns(tx->tx_deadline), tx->tx_cookie, + tx->tx_lntmsg[0] == NULL ? "-" : "!", + tx->tx_lntmsg[1] == NULL ? 
"-" : "!", + tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits); +} + +void +kiblnd_debug_conn(struct kib_conn *conn) +{ + struct list_head *tmp; + int i; + + spin_lock(&conn->ibc_lock); + + CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s:\n", + atomic_read(&conn->ibc_refcount), conn, + conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + CDEBUG(D_CONSOLE, " state %d nposted %d/%d cred %d o_cred %d " + " r_cred %d\n", conn->ibc_state, conn->ibc_noops_posted, + conn->ibc_nsends_posted, conn->ibc_credits, + conn->ibc_outstanding_credits, conn->ibc_reserved_credits); + CDEBUG(D_CONSOLE, " comms_err %d\n", conn->ibc_comms_error); + + CDEBUG(D_CONSOLE, " early_rxs:\n"); + list_for_each(tmp, &conn->ibc_early_rxs) + kiblnd_debug_rx(list_entry(tmp, struct kib_rx, rx_list)); + + CDEBUG(D_CONSOLE, " tx_noops:\n"); + list_for_each(tmp, &conn->ibc_tx_noops) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_nocred:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_nocred) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_rsrvd) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue:\n"); + list_for_each(tmp, &conn->ibc_tx_queue) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " active_txs:\n"); + list_for_each(tmp, &conn->ibc_active_txs) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " rxs:\n"); + for (i = 0; i < IBLND_RX_MSGS(conn); i++) + kiblnd_debug_rx(&conn->ibc_rxs[i]); + + spin_unlock(&conn->ibc_lock); +} + +static void +kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) +{ + /* XXX There is no path record for iWARP, set by netdev->change_mtu? */ + if (cmid->route.path_rec == NULL) + return; + + if (*kiblnd_tunables.kib_ib_mtu) + cmid->route.path_rec->mtu = + ib_mtu_int_to_enum(*kiblnd_tunables.kib_ib_mtu); +} + +static int +kiblnd_get_completion_vector(struct kib_conn *conn, int cpt) +{ + cpumask_var_t *mask; + int vectors; + int off; + int i; + lnet_nid_t ibp_nid; + + vectors = conn->ibc_cmid->device->num_comp_vectors; + if (vectors <= 1) + return 0; + + mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt); + + /* hash NID to CPU id in this partition... */ + ibp_nid = conn->ibc_peer->ibp_nid; + off = do_div(ibp_nid, cpumask_weight(*mask)); + for_each_cpu(i, *mask) { + if (off-- == 0) + return i % vectors; + } + + LBUG(); + return 1; +} + +/* + * Get the scheduler bound to this CPT. If the scheduler has no + * threads, which means that the CPT has no CPUs, then grab the + * next scheduler that we can use. + * + * This case would be triggered if a NUMA node is configured with + * no associated CPUs. + */ +static struct kib_sched_info * +kiblnd_get_scheduler(int cpt) +{ + struct kib_sched_info *sched; + int i; + + sched = kiblnd_data.kib_scheds[cpt]; + + if (sched->ibs_nthreads > 0) + return sched; + + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { + if (sched->ibs_nthreads > 0) { + CDEBUG(D_NET, "scheduler[%d] has no threads. 
selected scheduler[%d]\n", + cpt, sched->ibs_cpt); + return sched; + } + } + + return NULL; +} + +static unsigned int kiblnd_send_wrs(struct kib_conn *conn) +{ + /* + * One WR for the LNet message + * And ibc_max_frags for the transfer WRs + */ + int ret; + int multiplier = 1 + conn->ibc_max_frags; + + /* FastReg needs two extra WRs for map and invalidate */ + if (IS_FAST_REG_DEV(conn->ibc_hdev->ibh_dev)) + multiplier += 2; + + /* account for a maximum of ibc_queue_depth in-flight transfers */ + ret = multiplier * conn->ibc_queue_depth; + + if (ret > conn->ibc_hdev->ibh_max_qp_wr) { + CDEBUG(D_NET, "peer_credits %u will result in send work " + "request size %d larger than maximum %d device " + "can handle\n", conn->ibc_queue_depth, ret, + conn->ibc_hdev->ibh_max_qp_wr); + conn->ibc_queue_depth = + conn->ibc_hdev->ibh_max_qp_wr / multiplier; + } + + /* don't go beyond the maximum the device can handle */ + return min(ret, conn->ibc_hdev->ibh_max_qp_wr); +} + +struct kib_conn * +kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, + int state, int version) +{ + /* CAVEAT EMPTOR: + * If the new conn is created successfully it takes over the caller's + * ref on 'peer_ni'. It also "owns" 'cmid' and destroys it when it itself + * is destroyed. On failure, the caller's ref on 'peer_ni' remains and + * she must dispose of 'cmid'. (Actually I'd block forever if I tried + * to destroy 'cmid' here since I'm called from the CM which still has + * its ref on 'cmid'). */ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + struct kib_net *net = peer_ni->ibp_ni->ni_data; + struct kib_dev *dev; + struct ib_qp_init_attr init_qp_attr = {}; + struct kib_sched_info *sched; +#ifdef HAVE_IB_CQ_INIT_ATTR + struct ib_cq_init_attr cq_attr = {}; +#endif + struct kib_conn *conn; + struct ib_cq *cq; + unsigned long flags; + int cpt; + int rc; + int i; + + LASSERT(net != NULL); + LASSERT(!in_interrupt()); + + dev = net->ibn_dev; + + cpt = lnet_cpt_of_nid(peer_ni->ibp_nid, peer_ni->ibp_ni); + sched = kiblnd_get_scheduler(cpt); + + if (sched == NULL) { + CERROR("no schedulers available. node is unhealthy\n"); + goto failed_0; + } + + /* + * The cpt might have changed if we ended up selecting a non cpt + * native scheduler. So use the scheduler's cpt instead. 
+ */ + cpt = sched->ibs_cpt; + + LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn)); + if (conn == NULL) { + CERROR("Can't allocate connection for %s\n", + libcfs_nid2str(peer_ni->ibp_nid)); + goto failed_0; + } + + conn->ibc_state = IBLND_CONN_INIT; + conn->ibc_version = version; + conn->ibc_peer = peer_ni; /* I take the caller's ref */ + cmid->context = conn; /* for future CM callbacks */ + conn->ibc_cmid = cmid; + conn->ibc_max_frags = peer_ni->ibp_max_frags; + conn->ibc_queue_depth = peer_ni->ibp_queue_depth; + conn->ibc_rxs = NULL; + conn->ibc_rx_pages = NULL; + + INIT_LIST_HEAD(&conn->ibc_early_rxs); + INIT_LIST_HEAD(&conn->ibc_tx_noops); + INIT_LIST_HEAD(&conn->ibc_tx_queue); + INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); + INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); + INIT_LIST_HEAD(&conn->ibc_active_txs); + INIT_LIST_HEAD(&conn->ibc_zombie_txs); + spin_lock_init(&conn->ibc_lock); + + LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt, + sizeof(*conn->ibc_connvars)); + if (conn->ibc_connvars == NULL) { + CERROR("Can't allocate in-progress connection state\n"); + goto failed_2; + } + + write_lock_irqsave(glock, flags); + if (dev->ibd_failover) { + write_unlock_irqrestore(glock, flags); + CERROR("%s: failover in progress\n", dev->ibd_ifname); + goto failed_2; + } + + if (dev->ibd_hdev->ibh_ibdev != cmid->device) { + /* wakeup failover thread and teardown connection */ + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + wake_up(&kiblnd_data.kib_failover_waitq); + } + + write_unlock_irqrestore(glock, flags); + CERROR("cmid HCA(%s), kib_dev(%s) need failover\n", + cmid->device->name, dev->ibd_ifname); + goto failed_2; + } + + kiblnd_hdev_addref_locked(dev->ibd_hdev); + conn->ibc_hdev = dev->ibd_hdev; + + kiblnd_setup_mtu_locked(cmid); + + write_unlock_irqrestore(glock, flags); + +#ifdef HAVE_IB_CQ_INIT_ATTR + cq_attr.cqe = IBLND_CQ_ENTRIES(conn); + cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt); + cq = ib_create_cq(cmid->device, + kiblnd_cq_completion, kiblnd_cq_event, conn, + &cq_attr); +#else + cq = ib_create_cq(cmid->device, + kiblnd_cq_completion, kiblnd_cq_event, conn, + IBLND_CQ_ENTRIES(conn), + kiblnd_get_completion_vector(conn, cpt)); +#endif + if (IS_ERR(cq)) { + /* + * on MLX-5 (possibly MLX-4 as well) this error could be + * hit if the concurrent_sends and/or peer_tx_credits is set + * too high. 
Or due to an MLX-5 bug which tries to + * allocate 256kb via kmalloc for WR cookie array + */ + CERROR("Failed to create CQ with %d CQEs: %ld\n", + IBLND_CQ_ENTRIES(conn), PTR_ERR(cq)); + goto failed_2; + } + + conn->ibc_cq = cq; + + rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (rc != 0) { + CERROR("Can't request completion notification: %d\n", rc); + goto failed_2; + } + + init_qp_attr.event_handler = kiblnd_qp_event; + init_qp_attr.qp_context = conn; + init_qp_attr.cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge; + init_qp_attr.cap.max_recv_sge = 1; + init_qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_qp_attr.qp_type = IB_QPT_RC; + init_qp_attr.send_cq = cq; + init_qp_attr.recv_cq = cq; + + if (peer_ni->ibp_queue_depth_mod && + peer_ni->ibp_queue_depth_mod < peer_ni->ibp_queue_depth) { + conn->ibc_queue_depth = peer_ni->ibp_queue_depth_mod; + CDEBUG(D_NET, "Use reduced queue depth %u (from %u)\n", + peer_ni->ibp_queue_depth_mod, + peer_ni->ibp_queue_depth); + } + + do { + /* kiblnd_send_wrs() can change the connection's queue depth if + * the maximum work requests for the device is maxed out + */ + init_qp_attr.cap.max_send_wr = kiblnd_send_wrs(conn); + init_qp_attr.cap.max_recv_wr = IBLND_RECV_WRS(conn); + rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, + &init_qp_attr); + if (rc != -ENOMEM || conn->ibc_queue_depth < 2) + break; + conn->ibc_queue_depth--; + } while (rc); + + if (rc) { + CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, " + "send_sge: %d, recv_sge: %d\n", + rc, init_qp_attr.cap.max_send_wr, + init_qp_attr.cap.max_recv_wr, + init_qp_attr.cap.max_send_sge, + init_qp_attr.cap.max_recv_sge); + goto failed_2; + } + + conn->ibc_sched = sched; + + if (!peer_ni->ibp_queue_depth_mod && + conn->ibc_queue_depth != peer_ni->ibp_queue_depth) { + CWARN("peer %s - queue depth reduced from %u to %u" + " to allow for qp creation\n", + libcfs_nid2str(peer_ni->ibp_nid), + peer_ni->ibp_queue_depth, + conn->ibc_queue_depth); + peer_ni->ibp_queue_depth_mod = conn->ibc_queue_depth; + } + + LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, + IBLND_RX_MSGS(conn) * sizeof(struct kib_rx)); + if (conn->ibc_rxs == NULL) { + CERROR("Cannot allocate RX buffers\n"); + goto failed_2; + } + + rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt, + IBLND_RX_MSG_PAGES(conn)); + if (rc != 0) + goto failed_2; + + kiblnd_map_rx_descs(conn); + + /* 1 ref for caller and each rxmsg */ + atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn)); + conn->ibc_nrx = IBLND_RX_MSGS(conn); + + /* post receives */ + for (i = 0; i < IBLND_RX_MSGS(conn); i++) { + rc = kiblnd_post_rx(&conn->ibc_rxs[i], IBLND_POSTRX_NO_CREDIT); + if (rc != 0) { + CERROR("Can't post rxmsg: %d\n", rc); + + /* Make posted receives complete */ + kiblnd_abort_receives(conn); + + /* correct # of posted buffers + * NB locking needed now I'm racing with completion */ + spin_lock_irqsave(&sched->ibs_lock, flags); + conn->ibc_nrx -= IBLND_RX_MSGS(conn) - i; + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + /* cmid will be destroyed by CM(ofed) after cm_callback + * returned, so we can't refer it anymore + * (by kiblnd_connd()->kiblnd_destroy_conn) */ + rdma_destroy_qp(conn->ibc_cmid); + conn->ibc_cmid = NULL; + + /* Drop my own and unused rxbuffer refcounts */ + while (i++ <= IBLND_RX_MSGS(conn)) + kiblnd_conn_decref(conn); + + return NULL; + } + } + + /* Init successful! 
*/ + LASSERT (state == IBLND_CONN_ACTIVE_CONNECT || + state == IBLND_CONN_PASSIVE_WAIT); + conn->ibc_state = state; + + /* 1 more conn */ + atomic_inc(&net->ibn_nconns); + return conn; + + failed_2: + kiblnd_destroy_conn(conn); + LIBCFS_FREE(conn, sizeof(*conn)); + failed_0: + return NULL; +} + +void +kiblnd_destroy_conn(struct kib_conn *conn) +{ + struct rdma_cm_id *cmid = conn->ibc_cmid; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + + LASSERT (!in_interrupt()); + LASSERT (atomic_read(&conn->ibc_refcount) == 0); + LASSERT(list_empty(&conn->ibc_early_rxs)); + LASSERT(list_empty(&conn->ibc_tx_noops)); + LASSERT(list_empty(&conn->ibc_tx_queue)); + LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd)); + LASSERT(list_empty(&conn->ibc_tx_queue_nocred)); + LASSERT(list_empty(&conn->ibc_active_txs)); + LASSERT (conn->ibc_noops_posted == 0); + LASSERT (conn->ibc_nsends_posted == 0); + + switch (conn->ibc_state) { + default: + /* conn must be completely disengaged from the network */ + LBUG(); + + case IBLND_CONN_DISCONNECTED: + /* connvars should have been freed already */ + LASSERT (conn->ibc_connvars == NULL); + break; + + case IBLND_CONN_INIT: + break; + } + + /* conn->ibc_cmid might be destroyed by CM already */ + if (cmid != NULL && cmid->qp != NULL) + rdma_destroy_qp(cmid); + + if (conn->ibc_cq) + ib_destroy_cq(conn->ibc_cq); + + kiblnd_txlist_done(&conn->ibc_zombie_txs, -ECONNABORTED, + LNET_MSG_STATUS_OK); + + if (conn->ibc_rx_pages != NULL) + kiblnd_unmap_rx_descs(conn); + + if (conn->ibc_rxs != NULL) + CFS_FREE_PTR_ARRAY(conn->ibc_rxs, IBLND_RX_MSGS(conn)); + + if (conn->ibc_connvars != NULL) + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + + if (conn->ibc_hdev != NULL) + kiblnd_hdev_decref(conn->ibc_hdev); + + /* See CAVEAT EMPTOR above in kiblnd_create_conn */ + if (conn->ibc_state != IBLND_CONN_INIT) { + struct kib_net *net = peer_ni->ibp_ni->ni_data; + + kiblnd_peer_decref(peer_ni); + rdma_destroy_id(cmid); + atomic_dec(&net->ibn_nconns); + } +} + +int +kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why) +{ + struct kib_conn *conn; + struct kib_conn *cnxt; + int count = 0; + + list_for_each_entry_safe(conn, cnxt, &peer_ni->ibp_conns, + ibc_list) { + CDEBUG(D_NET, "Closing conn -> %s, " + "version: %x, reason: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + conn->ibc_version, why); + + kiblnd_close_conn_locked(conn, why); + count++; + } + + return count; +} + +int +kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni, + int version, __u64 incarnation) +{ + struct kib_conn *conn; + struct kib_conn *cnxt; + int count = 0; + + list_for_each_entry_safe(conn, cnxt, &peer_ni->ibp_conns, + ibc_list) { + if (conn->ibc_version == version && + conn->ibc_incarnation == incarnation) + continue; + + CDEBUG(D_NET, "Closing stale conn -> %s version: %x, " + "incarnation:%#llx(%x, %#llx)\n", + libcfs_nid2str(peer_ni->ibp_nid), + conn->ibc_version, conn->ibc_incarnation, + version, incarnation); + + kiblnd_close_conn_locked(conn, -ESTALE); + count++; + } + + return count; +} + +static int +kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid) +{ + struct kib_peer_ni *peer_ni; + struct hlist_node *pnxt; + int lo; + int hi; + int i; + unsigned long flags; + int count = 0; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) { + lo = hash_min(nid, HASH_BITS(kiblnd_data.kib_peers)); + hi = lo; + } else { + lo = 0; + hi = HASH_SIZE(kiblnd_data.kib_peers) - 1; + } + + for (i = lo; i <= hi; i++) { + 
hlist_for_each_entry_safe(peer_ni, pnxt, + &kiblnd_data.kib_peers[i], ibp_list) { + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || nid == peer_ni->ibp_nid)) + continue; + + count += kiblnd_close_peer_conns_locked(peer_ni, 0); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* wildcards always succeed */ + if (nid == LNET_NID_ANY) + return 0; + + return (count == 0) ? -ENOENT : 0; +} + +static int +kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + int rc = -EINVAL; + + switch(cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; + int count = 0; + + rc = kiblnd_get_peer_info(ni, data->ioc_count, + &nid, &count); + data->ioc_nid = nid; + data->ioc_count = count; + break; + } + + case IOC_LIBCFS_DEL_PEER: { + rc = kiblnd_del_peer(ni, data->ioc_nid); + break; + } + case IOC_LIBCFS_GET_CONN: { + struct kib_conn *conn; + + rc = 0; + conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); + if (conn == NULL) { + rc = -ENOENT; + break; + } + + LASSERT(conn->ibc_cmid != NULL); + data->ioc_nid = conn->ibc_peer->ibp_nid; + if (conn->ibc_cmid->route.path_rec == NULL) + data->ioc_u32[0] = 0; /* iWarp has no path MTU */ + else + data->ioc_u32[0] = + ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu); + kiblnd_conn_decref(conn); + break; + } + case IOC_LIBCFS_CLOSE_CONNECTION: { + rc = kiblnd_close_matching_conns(ni, data->ioc_nid); + break; + } + + default: + break; + } + + return rc; +} + +static void +kiblnd_free_pages(struct kib_pages *p) +{ + int npages = p->ibp_npages; + int i; + + for (i = 0; i < npages; i++) { + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); + } + + LIBCFS_FREE(p, offsetof(struct kib_pages, ibp_pages[npages])); +} + +int +kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages) +{ + struct kib_pages *p; + int i; + + LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt, + offsetof(struct kib_pages, ibp_pages[npages])); + if (p == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", npages); + return -ENOMEM; + } + + memset(p, 0, offsetof(struct kib_pages, ibp_pages[npages])); + p->ibp_npages = npages; + + for (i = 0; i < npages; i++) { + p->ibp_pages[i] = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, + GFP_NOFS); + if (p->ibp_pages[i] == NULL) { + CERROR("Can't allocate page %d of %d\n", i, npages); + kiblnd_free_pages(p); + return -ENOMEM; + } + } + + *pp = p; + return 0; +} + +void +kiblnd_unmap_rx_descs(struct kib_conn *conn) +{ + struct kib_rx *rx; + int i; + + LASSERT (conn->ibc_rxs != NULL); + LASSERT (conn->ibc_hdev != NULL); + + for (i = 0; i < IBLND_RX_MSGS(conn); i++) { + rx = &conn->ibc_rxs[i]; + + LASSERT(rx->rx_nob >= 0); /* not posted */ + + kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(rx, rx_msgunmap, + rx->rx_msgaddr), + IBLND_MSG_SIZE, DMA_FROM_DEVICE); + } + + kiblnd_free_pages(conn->ibc_rx_pages); + + conn->ibc_rx_pages = NULL; +} + +void +kiblnd_map_rx_descs(struct kib_conn *conn) +{ + struct kib_rx *rx; + struct page *pg; + int pg_off; + int ipg; + int i; + + for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn); i++) { + pg = conn->ibc_rx_pages->ibp_pages[ipg]; + rx = &conn->ibc_rxs[i]; + + rx->rx_conn = conn; + rx->rx_msg = (struct kib_msg *)(((char *)page_address(pg)) + pg_off); + + rx->rx_msgaddr = + kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, + rx->rx_msg, IBLND_MSG_SIZE, + DMA_FROM_DEVICE); + LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev, + 
rx->rx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr); + + CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n", + i, rx->rx_msg, rx->rx_msgaddr, + (__u64)(page_to_phys(pg) + pg_off)); + + pg_off += IBLND_MSG_SIZE; + LASSERT(pg_off <= PAGE_SIZE); + + if (pg_off == PAGE_SIZE) { + pg_off = 0; + ipg++; + LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn)); + } + } +} + +static void +kiblnd_unmap_tx_pool(struct kib_tx_pool *tpo) +{ + struct kib_hca_dev *hdev = tpo->tpo_hdev; + struct kib_tx *tx; + int i; + + LASSERT (tpo->tpo_pool.po_allocated == 0); + + if (hdev == NULL) + return; + + for (i = 0; i < tpo->tpo_pool.po_size; i++) { + tx = &tpo->tpo_tx_descs[i]; + kiblnd_dma_unmap_single(hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(tx, tx_msgunmap, + tx->tx_msgaddr), + IBLND_MSG_SIZE, DMA_TO_DEVICE); + } + + kiblnd_hdev_decref(hdev); + tpo->tpo_hdev = NULL; +} + +static struct kib_hca_dev * +kiblnd_current_hdev(struct kib_dev *dev) +{ + struct kib_hca_dev *hdev; + unsigned long flags; + int i = 0; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + while (dev->ibd_failover) { + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + if (i++ % 50 == 0) + CDEBUG(D_NET, "%s: Wait for failover\n", + dev->ibd_ifname); + schedule_timeout_interruptible(cfs_time_seconds(1) / 100); + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + + kiblnd_hdev_addref_locked(dev->ibd_hdev); + hdev = dev->ibd_hdev; + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + return hdev; +} + +static void +kiblnd_map_tx_pool(struct kib_tx_pool *tpo) +{ + struct kib_pages *txpgs = tpo->tpo_tx_pages; + struct kib_pool *pool = &tpo->tpo_pool; + struct kib_net *net = pool->po_owner->ps_net; + struct kib_dev *dev; + struct page *page; + struct kib_tx *tx; + int page_offset; + int ipage; + int i; + + LASSERT (net != NULL); + + dev = net->ibn_dev; + + /* pre-mapped messages are not bigger than 1 page */ + BUILD_BUG_ON(IBLND_MSG_SIZE > PAGE_SIZE); + + /* No fancy arithmetic when we do the buffer calculations */ + BUILD_BUG_ON(PAGE_SIZE % IBLND_MSG_SIZE != 0); + + tpo->tpo_hdev = kiblnd_current_hdev(dev); + + for (ipage = page_offset = i = 0; i < pool->po_size; i++) { + page = txpgs->ibp_pages[ipage]; + tx = &tpo->tpo_tx_descs[i]; + + tx->tx_msg = (struct kib_msg *)(((char *)page_address(page)) + + page_offset); + + tx->tx_msgaddr = kiblnd_dma_map_single(tpo->tpo_hdev->ibh_ibdev, + tx->tx_msg, + IBLND_MSG_SIZE, + DMA_TO_DEVICE); + LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev, + tx->tx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr); + + list_add(&tx->tx_list, &pool->po_free_list); + + page_offset += IBLND_MSG_SIZE; + LASSERT(page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT(ipage <= txpgs->ibp_npages); + } + } +} + +static void +kiblnd_destroy_fmr_pool(struct kib_fmr_pool *fpo) +{ + LASSERT(fpo->fpo_map_count == 0); + +#ifdef HAVE_FMR_POOL_API + if (fpo->fpo_is_fmr && fpo->fmr.fpo_fmr_pool) { + ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool); + } else +#endif /* HAVE_FMR_POOL_API */ + { + struct kib_fast_reg_descriptor *frd, *tmp; + int i = 0; + + list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list, + frd_list) { + list_del(&frd->frd_list); +#ifndef HAVE_IB_MAP_MR_SG + ib_free_fast_reg_page_list(frd->frd_frpl); +#endif + ib_dereg_mr(frd->frd_mr); + LIBCFS_FREE(frd, sizeof(*frd)); + i++; + } + if (i < fpo->fast_reg.fpo_pool_size) + CERROR("FastReg pool still has %d regions registered\n", + 
fpo->fast_reg.fpo_pool_size - i); + } + + if (fpo->fpo_hdev) + kiblnd_hdev_decref(fpo->fpo_hdev); + + LIBCFS_FREE(fpo, sizeof(*fpo)); +} + +static void +kiblnd_destroy_fmr_pool_list(struct list_head *head) +{ + struct kib_fmr_pool *fpo, *tmp; + + list_for_each_entry_safe(fpo, tmp, head, fpo_list) { + list_del(&fpo->fpo_list); + kiblnd_destroy_fmr_pool(fpo); + } +} + +static int +kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables, + int ncpts) +{ + int size = tunables->lnd_fmr_pool_size / ncpts; + + return max(IBLND_FMR_POOL, size); +} + +static int +kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables, + int ncpts) +{ + int size = tunables->lnd_fmr_flush_trigger / ncpts; + + return max(IBLND_FMR_POOL_FLUSH, size); +} + +#ifdef HAVE_FMR_POOL_API +static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps, + struct kib_fmr_pool *fpo) +{ + struct ib_fmr_pool_param param = { + .max_pages_per_fmr = IBLND_MAX_RDMA_FRAGS, + .page_shift = PAGE_SHIFT, + .access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE), + .pool_size = fps->fps_pool_size, + .dirty_watermark = fps->fps_flush_trigger, + .flush_function = NULL, + .flush_arg = NULL, + .cache = !!fps->fps_cache }; + int rc = 0; + + fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, + &param); + if (IS_ERR(fpo->fmr.fpo_fmr_pool)) { + rc = PTR_ERR(fpo->fmr.fpo_fmr_pool); + if (rc != -ENOSYS) + CERROR("Failed to create FMR pool: %d\n", rc); + else + CERROR("FMRs are not supported\n"); + } + fpo->fpo_is_fmr = true; + + return rc; +} +#endif /* HAVE_FMR_POOL_API */ + +static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, + struct kib_fmr_pool *fpo, + enum kib_dev_caps dev_caps) +{ + struct kib_fast_reg_descriptor *frd, *tmp; + int i, rc; + +#ifdef HAVE_FMR_POOL_API + fpo->fpo_is_fmr = false; +#endif + + INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list); + fpo->fast_reg.fpo_pool_size = 0; + for (i = 0; i < fps->fps_pool_size; i++) { + LIBCFS_CPT_ALLOC(frd, lnet_cpt_table(), fps->fps_cpt, + sizeof(*frd)); + if (!frd) { + CERROR("Failed to allocate a new fast_reg descriptor\n"); + rc = -ENOMEM; + goto out; + } + frd->frd_mr = NULL; + +#ifndef HAVE_IB_MAP_MR_SG + frd->frd_frpl = ib_alloc_fast_reg_page_list(fpo->fpo_hdev->ibh_ibdev, + IBLND_MAX_RDMA_FRAGS); + if (IS_ERR(frd->frd_frpl)) { + rc = PTR_ERR(frd->frd_frpl); + CERROR("Failed to allocate ib_fast_reg_page_list: %d\n", + rc); + frd->frd_frpl = NULL; + goto out_middle; + } +#endif + +#ifdef HAVE_IB_ALLOC_FAST_REG_MR + frd->frd_mr = ib_alloc_fast_reg_mr(fpo->fpo_hdev->ibh_pd, + IBLND_MAX_RDMA_FRAGS); +#else + /* + * it is expected to get here if this is an MLX-5 card. + * MLX-4 cards will always use FMR and MLX-5 cards will + * always use fast_reg. It turns out that some MLX-5 cards + * (possibly due to older FW versions) do not natively support + * gaps. So we will need to track them here. + */ + frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd, +#ifdef IB_MR_TYPE_SG_GAPS + ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) && + (dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT)) ?
+ IB_MR_TYPE_SG_GAPS : + IB_MR_TYPE_MEM_REG, +#else + IB_MR_TYPE_MEM_REG, +#endif + IBLND_MAX_RDMA_FRAGS); + if ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) && + (dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT)) + CWARN("using IB_MR_TYPE_SG_GAPS, expect a performance drop\n"); +#endif + if (IS_ERR(frd->frd_mr)) { + rc = PTR_ERR(frd->frd_mr); + CERROR("Failed to allocate ib_fast_reg_mr: %d\n", rc); + frd->frd_mr = NULL; + goto out_middle; + } + + /* indicate that the local invalidate needs to be generated */ + frd->frd_valid = false; + + list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); + fpo->fast_reg.fpo_pool_size++; + } + + return 0; + +out_middle: + if (frd->frd_mr) + ib_dereg_mr(frd->frd_mr); +#ifndef HAVE_IB_MAP_MR_SG + if (frd->frd_frpl) + ib_free_fast_reg_page_list(frd->frd_frpl); +#endif + LIBCFS_FREE(frd, sizeof(*frd)); + +out: + list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list, + frd_list) { + list_del(&frd->frd_list); +#ifndef HAVE_IB_MAP_MR_SG + ib_free_fast_reg_page_list(frd->frd_frpl); +#endif + ib_dereg_mr(frd->frd_mr); + LIBCFS_FREE(frd, sizeof(*frd)); + } + + return rc; +} + +static int kiblnd_create_fmr_pool(struct kib_fmr_poolset *fps, + struct kib_fmr_pool **pp_fpo) +{ + struct kib_dev *dev = fps->fps_net->ibn_dev; + struct kib_fmr_pool *fpo; + int rc; + + LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo)); + if (!fpo) { + return -ENOMEM; + } + memset(fpo, 0, sizeof(*fpo)); + + fpo->fpo_hdev = kiblnd_current_hdev(dev); + +#ifdef HAVE_FMR_POOL_API + if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) + rc = kiblnd_alloc_fmr_pool(fps, fpo); + else +#endif /* HAVE_FMR_POOL_API */ + rc = kiblnd_alloc_freg_pool(fps, fpo, dev->ibd_dev_caps); + if (rc) + goto out_fpo; + + fpo->fpo_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; + fpo->fpo_owner = fps; + *pp_fpo = fpo; + + return 0; + +out_fpo: + kiblnd_hdev_decref(fpo->fpo_hdev); + LIBCFS_FREE(fpo, sizeof(*fpo)); + return rc; +} + +static void +kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps, struct list_head *zombies) +{ + struct kib_fmr_pool *fpo; + + if (fps->fps_net == NULL) /* intialized? */ + return; + + spin_lock(&fps->fps_lock); + + while ((fpo = list_first_entry_or_null(&fps->fps_pool_list, + struct kib_fmr_pool, + fpo_list)) != NULL) { + fpo->fpo_failed = 1; + if (fpo->fpo_map_count == 0) + list_move(&fpo->fpo_list, zombies); + else + list_move(&fpo->fpo_list, &fps->fps_failed_pool_list); + } + + spin_unlock(&fps->fps_lock); +} + +static void +kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps) +{ + if (fps->fps_net != NULL) { /* initialized? 
*/ + kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); + kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list); + } +} + +static int +kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts, + struct kib_net *net, + struct lnet_ioctl_config_o2iblnd_tunables *tunables) +{ + struct kib_fmr_pool *fpo; + int rc; + + memset(fps, 0, sizeof(struct kib_fmr_poolset)); + + fps->fps_net = net; + fps->fps_cpt = cpt; + + fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts); + fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts); + fps->fps_cache = tunables->lnd_fmr_cache; + + spin_lock_init(&fps->fps_lock); + INIT_LIST_HEAD(&fps->fps_pool_list); + INIT_LIST_HEAD(&fps->fps_failed_pool_list); + + rc = kiblnd_create_fmr_pool(fps, &fpo); + if (rc == 0) + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + + return rc; +} + +static int +kiblnd_fmr_pool_is_idle(struct kib_fmr_pool *fpo, time64_t now) +{ + if (fpo->fpo_map_count != 0) /* still in use */ + return 0; + if (fpo->fpo_failed) + return 1; + return now >= fpo->fpo_deadline; +} + +#if defined(HAVE_FMR_POOL_API) || !defined(HAVE_IB_MAP_MR_SG) +static int +kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd) +{ + struct kib_hca_dev *hdev; + __u64 *pages = tx->tx_pages; + int npages; + int size; + int i; + + hdev = tx->tx_pool->tpo_hdev; + + for (i = 0, npages = 0; i < rd->rd_nfrags; i++) { + for (size = 0; size < rd->rd_frags[i].rf_nob; + size += hdev->ibh_page_size) { + pages[npages++] = (rd->rd_frags[i].rf_addr & + hdev->ibh_page_mask) + size; + } + } + + return npages; +} +#endif + +void +kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status) +{ + LIST_HEAD(zombies); + struct kib_fmr_pool *fpo = fmr->fmr_pool; + struct kib_fmr_poolset *fps; + time64_t now = ktime_get_seconds(); + struct kib_fmr_pool *tmp; + + if (!fpo) + return; + + fps = fpo->fpo_owner; + +#ifdef HAVE_FMR_POOL_API + if (fpo->fpo_is_fmr) { + if (fmr->fmr_pfmr) { + ib_fmr_pool_unmap(fmr->fmr_pfmr); + fmr->fmr_pfmr = NULL; + } + + if (status) { + int rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool); + LASSERT(!rc); + } + } else +#endif /* HAVE_FMR_POOL_API */ + { + struct kib_fast_reg_descriptor *frd = fmr->fmr_frd; + if (frd) { + frd->frd_posted = false; + fmr->fmr_frd = NULL; + spin_lock(&fps->fps_lock); + list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); + spin_unlock(&fps->fps_lock); + } + } + fmr->fmr_pool = NULL; + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count--; /* decref the pool */ + + list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) { + /* the first pool is persistent */ + if (fps->fps_pool_list.next == &fpo->fpo_list) + continue; + + if (kiblnd_fmr_pool_is_idle(fpo, now)) { + list_move(&fpo->fpo_list, &zombies); + fps->fps_version++; + } + } + spin_unlock(&fps->fps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_fmr_pool_list(&zombies); +} + +int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, + struct kib_rdma_desc *rd, u32 nob, u64 iov, + struct kib_fmr *fmr) +{ + struct kib_fmr_pool *fpo; + __u64 version; + bool is_rx = (rd != tx->tx_rd); +#ifdef HAVE_FMR_POOL_API + __u64 *pages = tx->tx_pages; + bool tx_pages_mapped = false; + int npages = 0; +#endif + int rc; + +again: + spin_lock(&fps->fps_lock); + version = fps->fps_version; + list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) { + fpo->fpo_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; + fpo->fpo_map_count++; + +#ifdef HAVE_FMR_POOL_API + fmr->fmr_pfmr = NULL; + if (fpo->fpo_is_fmr) { + struct 
ib_pool_fmr *pfmr; + + spin_unlock(&fps->fps_lock); + + if (!tx_pages_mapped) { + npages = kiblnd_map_tx_pages(tx, rd); + tx_pages_mapped = true; + } + + pfmr = kib_fmr_pool_map(fpo->fmr.fpo_fmr_pool, + pages, npages, iov); + if (likely(!IS_ERR(pfmr))) { + fmr->fmr_key = is_rx ? pfmr->fmr->rkey + : pfmr->fmr->lkey; + fmr->fmr_frd = NULL; + fmr->fmr_pfmr = pfmr; + fmr->fmr_pool = fpo; + return 0; + } + rc = PTR_ERR(pfmr); + } else +#endif /* HAVE_FMR_POOL_API */ + { + if (!list_empty(&fpo->fast_reg.fpo_pool_list)) { + struct kib_fast_reg_descriptor *frd; +#ifdef HAVE_IB_MAP_MR_SG + struct ib_reg_wr *wr; + int n; +#else + struct ib_rdma_wr *wr; + struct ib_fast_reg_page_list *frpl; +#endif + struct ib_mr *mr; + + frd = list_first_entry( + &fpo->fast_reg.fpo_pool_list, + struct kib_fast_reg_descriptor, + frd_list); + list_del(&frd->frd_list); + spin_unlock(&fps->fps_lock); + +#ifndef HAVE_IB_MAP_MR_SG + frpl = frd->frd_frpl; +#endif + mr = frd->frd_mr; + + if (!frd->frd_valid) { + struct ib_rdma_wr *inv_wr; + __u32 key = is_rx ? mr->rkey : mr->lkey; + + frd->frd_valid = true; + inv_wr = &frd->frd_inv_wr; + memset(inv_wr, 0, sizeof(*inv_wr)); + + inv_wr->wr.opcode = IB_WR_LOCAL_INV; + inv_wr->wr.wr_id = IBLND_WID_MR; + inv_wr->wr.ex.invalidate_rkey = key; + + /* Bump the key */ + key = ib_inc_rkey(key); + ib_update_fast_reg_key(mr, key); + } + +#ifdef HAVE_IB_MAP_MR_SG +#ifdef HAVE_IB_MAP_MR_SG_5ARGS + n = ib_map_mr_sg(mr, tx->tx_frags, + rd->rd_nfrags, NULL, PAGE_SIZE); +#else + n = ib_map_mr_sg(mr, tx->tx_frags, + rd->rd_nfrags, PAGE_SIZE); +#endif /* HAVE_IB_MAP_MR_SG_5ARGS */ + if (unlikely(n != rd->rd_nfrags)) { + CERROR("Failed to map mr %d/%d elements\n", + n, rd->rd_nfrags); + return n < 0 ? n : -EINVAL; + } + + wr = &frd->frd_fastreg_wr; + memset(wr, 0, sizeof(*wr)); + + wr->wr.opcode = IB_WR_REG_MR; + wr->wr.wr_id = IBLND_WID_MR; + wr->wr.num_sge = 0; + wr->wr.send_flags = 0; + wr->mr = mr; + wr->key = is_rx ? mr->rkey : mr->lkey; + wr->access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); +#else /* HAVE_IB_MAP_MR_SG */ + if (!tx_pages_mapped) { + npages = kiblnd_map_tx_pages(tx, rd); + tx_pages_mapped = true; + } + + LASSERT(npages <= frpl->max_page_list_len); + memcpy(frpl->page_list, pages, + sizeof(*pages) * npages); + + /* Prepare FastReg WR */ + wr = &frd->frd_fastreg_wr; + memset(wr, 0, sizeof(*wr)); + + wr->wr.opcode = IB_WR_FAST_REG_MR; + wr->wr.wr_id = IBLND_WID_MR; + + wr->wr.wr.fast_reg.iova_start = iov; + wr->wr.wr.fast_reg.page_list = frpl; + wr->wr.wr.fast_reg.page_list_len = npages; + wr->wr.wr.fast_reg.page_shift = PAGE_SHIFT; + wr->wr.wr.fast_reg.length = nob; + wr->wr.wr.fast_reg.rkey = + is_rx ? mr->rkey : mr->lkey; + wr->wr.wr.fast_reg.access_flags = + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); +#endif /* HAVE_IB_MAP_MR_SG */ + + fmr->fmr_key = is_rx ? mr->rkey : mr->lkey; + fmr->fmr_frd = frd; + fmr->fmr_pool = fpo; + frd->frd_posted = false; + return 0; + } + spin_unlock(&fps->fps_lock); + rc = -EAGAIN; + } + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count--; + if (rc != -EAGAIN) { + spin_unlock(&fps->fps_lock); + return rc; + } + + /* EAGAIN and ... 
*/ + if (version != fps->fps_version) { + spin_unlock(&fps->fps_lock); + goto again; + } + } + + if (fps->fps_increasing) { + spin_unlock(&fps->fps_lock); + CDEBUG(D_NET, "Another thread is allocating new " + "FMR pool, waiting for her to complete\n"); + wait_var_event(fps, !fps->fps_increasing); + goto again; + + } + + if (ktime_get_seconds() < fps->fps_next_retry) { + /* someone failed recently */ + spin_unlock(&fps->fps_lock); + return -EAGAIN; + } + + fps->fps_increasing = 1; + spin_unlock(&fps->fps_lock); + + CDEBUG(D_NET, "Allocate new FMR pool\n"); + rc = kiblnd_create_fmr_pool(fps, &fpo); + spin_lock(&fps->fps_lock); + fps->fps_increasing = 0; + wake_up_var(fps); + if (rc == 0) { + fps->fps_version++; + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + } else { + fps->fps_next_retry = ktime_get_seconds() + IBLND_POOL_RETRY; + } + spin_unlock(&fps->fps_lock); + + goto again; +} + +static void +kiblnd_fini_pool(struct kib_pool *pool) +{ + LASSERT(list_empty(&pool->po_free_list)); + LASSERT(pool->po_allocated == 0); + + CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name); +} + +static void +kiblnd_init_pool(struct kib_poolset *ps, struct kib_pool *pool, int size) +{ + CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name); + + memset(pool, 0, sizeof(struct kib_pool)); + INIT_LIST_HEAD(&pool->po_free_list); + pool->po_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; + pool->po_owner = ps; + pool->po_size = size; +} + +static void +kiblnd_destroy_pool_list(struct list_head *head) +{ + struct kib_pool *pool; + + while ((pool = list_first_entry_or_null(head, + struct kib_pool, + po_list)) != NULL) { + list_del(&pool->po_list); + + LASSERT(pool->po_owner != NULL); + pool->po_owner->ps_pool_destroy(pool); + } +} + +static void +kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies) +{ + struct kib_pool *po; + + if (ps->ps_net == NULL) /* intialized? */ + return; + + spin_lock(&ps->ps_lock); + while ((po = list_first_entry_or_null(&ps->ps_pool_list, + struct kib_pool, + po_list)) != NULL) { + po->po_failed = 1; + if (po->po_allocated == 0) + list_move(&po->po_list, zombies); + else + list_move(&po->po_list, &ps->ps_failed_pool_list); + } + spin_unlock(&ps->ps_lock); +} + +static void +kiblnd_fini_poolset(struct kib_poolset *ps) +{ + if (ps->ps_net != NULL) { /* initialized? 
*/ + kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); + kiblnd_destroy_pool_list(&ps->ps_pool_list); + } +} + +static int +kiblnd_init_poolset(struct kib_poolset *ps, int cpt, + struct kib_net *net, char *name, int size, + kib_ps_pool_create_t po_create, + kib_ps_pool_destroy_t po_destroy, + kib_ps_node_init_t nd_init, + kib_ps_node_fini_t nd_fini) +{ + struct kib_pool *pool; + int rc; + + memset(ps, 0, sizeof(struct kib_poolset)); + + ps->ps_cpt = cpt; + ps->ps_net = net; + ps->ps_pool_create = po_create; + ps->ps_pool_destroy = po_destroy; + ps->ps_node_init = nd_init; + ps->ps_node_fini = nd_fini; + ps->ps_pool_size = size; + if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name)) + >= sizeof(ps->ps_name)) + return -E2BIG; + spin_lock_init(&ps->ps_lock); + INIT_LIST_HEAD(&ps->ps_pool_list); + INIT_LIST_HEAD(&ps->ps_failed_pool_list); + + rc = ps->ps_pool_create(ps, size, &pool); + if (rc == 0) + list_add(&pool->po_list, &ps->ps_pool_list); + else + CERROR("Failed to create the first pool for %s\n", ps->ps_name); + + return rc; +} + +static int +kiblnd_pool_is_idle(struct kib_pool *pool, time64_t now) +{ + if (pool->po_allocated != 0) /* still in use */ + return 0; + if (pool->po_failed) + return 1; + return now >= pool->po_deadline; +} + +void +kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node) +{ + LIST_HEAD(zombies); + struct kib_poolset *ps = pool->po_owner; + struct kib_pool *tmp; + time64_t now = ktime_get_seconds(); + + spin_lock(&ps->ps_lock); + + if (ps->ps_node_fini != NULL) + ps->ps_node_fini(pool, node); + + LASSERT(pool->po_allocated > 0); + list_add(node, &pool->po_free_list); + pool->po_allocated--; + + list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) { + /* the first pool is persistent */ + if (ps->ps_pool_list.next == &pool->po_list) + continue; + + if (kiblnd_pool_is_idle(pool, now)) + list_move(&pool->po_list, &zombies); + } + spin_unlock(&ps->ps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_pool_list(&zombies); +} + +struct list_head * +kiblnd_pool_alloc_node(struct kib_poolset *ps) +{ + struct list_head *node; + struct kib_pool *pool; + int rc; + unsigned int interval = 1; + ktime_t time_before; + unsigned int trips = 0; + +again: + spin_lock(&ps->ps_lock); + list_for_each_entry(pool, &ps->ps_pool_list, po_list) { + if (list_empty(&pool->po_free_list)) + continue; + + pool->po_allocated++; + pool->po_deadline = ktime_get_seconds() + + IBLND_POOL_DEADLINE; + node = pool->po_free_list.next; + list_del(node); + + if (ps->ps_node_init != NULL) { + /* still hold the lock */ + ps->ps_node_init(pool, node); + } + spin_unlock(&ps->ps_lock); + return node; + } + + /* no available tx pool and ... */ + if (ps->ps_increasing) { + /* another thread is allocating a new pool */ + spin_unlock(&ps->ps_lock); + trips++; + CDEBUG(D_NET, + "Another thread is allocating new %s pool, waiting %d jiffies for her to complete. 
trips = %d\n", + ps->ps_name, interval, trips); + + schedule_timeout_interruptible(interval); + if (interval < cfs_time_seconds(1)) + interval *= 2; + + goto again; + } + + if (ktime_get_seconds() < ps->ps_next_retry) { + /* someone failed recently */ + spin_unlock(&ps->ps_lock); + return NULL; + } + + ps->ps_increasing = 1; + spin_unlock(&ps->ps_lock); + + CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name); + time_before = ktime_get(); + rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); + CDEBUG(D_NET, "ps_pool_create took %lld ms to complete", + ktime_ms_delta(ktime_get(), time_before)); + + spin_lock(&ps->ps_lock); + ps->ps_increasing = 0; + if (rc == 0) { + list_add_tail(&pool->po_list, &ps->ps_pool_list); + } else { + ps->ps_next_retry = ktime_get_seconds() + IBLND_POOL_RETRY; + CERROR("Can't allocate new %s pool because out of memory\n", + ps->ps_name); + } + spin_unlock(&ps->ps_lock); + + goto again; +} + +static void +kiblnd_destroy_tx_pool(struct kib_pool *pool) +{ + struct kib_tx_pool *tpo = container_of(pool, struct kib_tx_pool, + tpo_pool); + int i; + + LASSERT (pool->po_allocated == 0); + + if (tpo->tpo_tx_pages != NULL) { + kiblnd_unmap_tx_pool(tpo); + kiblnd_free_pages(tpo->tpo_tx_pages); + } + + if (tpo->tpo_tx_descs == NULL) + goto out; + + for (i = 0; i < pool->po_size; i++) { + struct kib_tx *tx = &tpo->tpo_tx_descs[i]; + int wrq_sge = *kiblnd_tunables.kib_wrq_sge; + + list_del(&tx->tx_list); + if (tx->tx_pages != NULL) + CFS_FREE_PTR_ARRAY(tx->tx_pages, LNET_MAX_IOV); + if (tx->tx_frags != NULL) + CFS_FREE_PTR_ARRAY(tx->tx_frags, + IBLND_MAX_RDMA_FRAGS); + if (tx->tx_wrq != NULL) + CFS_FREE_PTR_ARRAY(tx->tx_wrq, + IBLND_MAX_RDMA_FRAGS); + if (tx->tx_sge != NULL) { + /* +1 is for the lnet header/message itself */ + CFS_FREE_PTR_ARRAY(tx->tx_sge, + (IBLND_MAX_RDMA_FRAGS * + wrq_sge + 1)); + } + if (tx->tx_rd != NULL) + LIBCFS_FREE(tx->tx_rd, + offsetof(struct kib_rdma_desc, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + } + + CFS_FREE_PTR_ARRAY(tpo->tpo_tx_descs, pool->po_size); +out: + kiblnd_fini_pool(pool); + CFS_FREE_PTR(tpo); +} + +static int kiblnd_tx_pool_size(struct lnet_ni *ni, int ncpts) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + int ntx; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + ntx = tunables->lnd_ntx / ncpts; + + return max(IBLND_TX_POOL, ntx); +} + +static int +kiblnd_create_tx_pool(struct kib_poolset *ps, int size, struct kib_pool **pp_po) +{ + int i; + int npg; + struct kib_pool *pool; + struct kib_tx_pool *tpo; + + LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo)); + if (tpo == NULL) { + CERROR("Failed to allocate TX pool\n"); + return -ENOMEM; + } + + pool = &tpo->tpo_pool; + kiblnd_init_pool(ps, pool, size); + tpo->tpo_tx_descs = NULL; + tpo->tpo_tx_pages = NULL; + + npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE; + if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) { + CERROR("Can't allocate tx pages: %d\n", npg); + CFS_FREE_PTR(tpo); + return -ENOMEM; + } + + LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt, + size * sizeof(struct kib_tx)); + if (tpo->tpo_tx_descs == NULL) { + CERROR("Can't allocate %d tx descriptors\n", size); + ps->ps_pool_destroy(pool); + return -ENOMEM; + } + + memset(tpo->tpo_tx_descs, 0, size * sizeof(struct kib_tx)); + + for (i = 0; i < size; i++) { + struct kib_tx *tx = &tpo->tpo_tx_descs[i]; + int wrq_sge = *kiblnd_tunables.kib_wrq_sge; + + tx->tx_pool = tpo; + if (ps->ps_net->ibn_fmr_ps != NULL) { + 
LIBCFS_CPT_ALLOC(tx->tx_pages, + lnet_cpt_table(), ps->ps_cpt, + LNET_MAX_IOV * sizeof(*tx->tx_pages)); + if (tx->tx_pages == NULL) + break; + } + + LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt, + IBLND_MAX_RDMA_FRAGS * + sizeof(*tx->tx_frags)); + if (tx->tx_frags == NULL) + break; + + sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS); + + LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt, + IBLND_MAX_RDMA_FRAGS * + sizeof(*tx->tx_wrq)); + if (tx->tx_wrq == NULL) + break; + + /* +1 is for the lnet header/message itself */ + LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt, + (IBLND_MAX_RDMA_FRAGS * wrq_sge + 1) * + sizeof(*tx->tx_sge)); + if (tx->tx_sge == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt, + offsetof(struct kib_rdma_desc, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + if (tx->tx_rd == NULL) + break; + } + + if (i == size) { + kiblnd_map_tx_pool(tpo); + *pp_po = pool; + return 0; + } + + ps->ps_pool_destroy(pool); + return -ENOMEM; +} + +static void +kiblnd_tx_init(struct kib_pool *pool, struct list_head *node) +{ + struct kib_tx_poolset *tps = container_of(pool->po_owner, + struct kib_tx_poolset, + tps_poolset); + struct kib_tx *tx = list_entry(node, struct kib_tx, tx_list); + + tx->tx_cookie = tps->tps_next_tx_cookie++; +} + +static void +kiblnd_net_fini_pools(struct kib_net *net) +{ + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + struct kib_tx_poolset *tps; + struct kib_fmr_poolset *fps; + + if (net->ibn_tx_ps != NULL) { + tps = net->ibn_tx_ps[i]; + kiblnd_fini_poolset(&tps->tps_poolset); + } + + if (net->ibn_fmr_ps != NULL) { + fps = net->ibn_fmr_ps[i]; + kiblnd_fini_fmr_poolset(fps); + } + } + + if (net->ibn_tx_ps != NULL) { + cfs_percpt_free(net->ibn_tx_ps); + net->ibn_tx_ps = NULL; + } + + if (net->ibn_fmr_ps != NULL) { + cfs_percpt_free(net->ibn_fmr_ps); + net->ibn_fmr_ps = NULL; + } +} + +static int +kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, __u32 *cpts, + int ncpts) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; +#ifdef HAVE_IB_GET_DMA_MR + unsigned long flags; +#endif + int cpt; + int rc; + int i; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + +#ifdef HAVE_IB_GET_DMA_MR + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + /* + * if lnd_map_on_demand is zero then we have effectively disabled + * FMR or FastReg and we're using global memory regions + * exclusively. + */ + if (!tunables->lnd_map_on_demand) { + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + goto create_tx_pool; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +#endif + + if (tunables->lnd_fmr_pool_size < tunables->lnd_ntx / 4) { + CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n", + tunables->lnd_fmr_pool_size, + tunables->lnd_ntx / 4); + rc = -EINVAL; + goto failed; + } + + /* TX pool must be created later than FMR, see LU-2268 + * for details */ + LASSERT(net->ibn_tx_ps == NULL); + + /* premapping can fail if ibd_nmr > 1, so we always create + * FMR pool and map-on-demand if premapping failed */ + + net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct kib_fmr_poolset)); + if (net->ibn_fmr_ps == NULL) { + CERROR("Failed to allocate FMR pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? 
i : cpts[i]; + rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts, + net, tunables); + if (rc != 0) { + CERROR("Can't initialize FMR pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + if (i > 0) + LASSERT(i == ncpts); + +#ifdef HAVE_IB_GET_DMA_MR + create_tx_pool: +#endif + net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct kib_tx_poolset)); + if (net->ibn_tx_ps == NULL) { + CERROR("Failed to allocate tx pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? i : cpts[i]; + rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset, + cpt, net, "TX", + kiblnd_tx_pool_size(ni, ncpts), + kiblnd_create_tx_pool, + kiblnd_destroy_tx_pool, + kiblnd_tx_init, NULL); + if (rc != 0) { + CERROR("Can't initialize TX pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + return 0; + failed: + kiblnd_net_fini_pools(net); + LASSERT(rc != 0); + return rc; +} + +static int +kiblnd_port_get_attr(struct kib_hca_dev *hdev) +{ + struct ib_port_attr *port_attr; + int rc; + unsigned long flags; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + + LIBCFS_ALLOC(port_attr, sizeof(*port_attr)); + if (port_attr == NULL) { + CDEBUG(D_NETERROR, "Out of memory\n"); + return -ENOMEM; + } + + rc = ib_query_port(hdev->ibh_ibdev, hdev->ibh_port, port_attr); + + write_lock_irqsave(g_lock, flags); + + if (rc == 0) + hdev->ibh_state = port_attr->state == IB_PORT_ACTIVE + ? IBLND_DEV_PORT_ACTIVE + : IBLND_DEV_PORT_DOWN; + + write_unlock_irqrestore(g_lock, flags); + LIBCFS_FREE(port_attr, sizeof(*port_attr)); + + if (rc != 0) { + CDEBUG(D_NETERROR, "Failed to query IB port: %d\n", rc); + return rc; + } + return 0; +} + +static inline void +kiblnd_set_ni_fatal_on(struct kib_hca_dev *hdev, int val) +{ + struct kib_net *net; + + /* for health check */ + list_for_each_entry(net, &hdev->ibh_dev->ibd_nets, ibn_list) { + if (val) + CDEBUG(D_NETERROR, "Fatal device error for NI %s\n", + libcfs_nidstr(&net->ibn_ni->ni_nid)); + atomic_set(&net->ibn_ni->ni_fatal_error_on, val); + } +} + +void +kiblnd_event_handler(struct ib_event_handler *handler, struct ib_event *event) +{ + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + struct kib_hca_dev *hdev; + unsigned long flags; + + hdev = container_of(handler, struct kib_hca_dev, ibh_event_handler); + + write_lock_irqsave(g_lock, flags); + + switch (event->event) { + case IB_EVENT_DEVICE_FATAL: + CDEBUG(D_NET, "IB device fatal\n"); + hdev->ibh_state = IBLND_DEV_FATAL; + kiblnd_set_ni_fatal_on(hdev, 1); + break; + case IB_EVENT_PORT_ACTIVE: + CDEBUG(D_NET, "IB port active\n"); + if (event->element.port_num == hdev->ibh_port) { + hdev->ibh_state = IBLND_DEV_PORT_ACTIVE; + kiblnd_set_ni_fatal_on(hdev, 0); + } + break; + case IB_EVENT_PORT_ERR: + CDEBUG(D_NET, "IB port err\n"); + if (event->element.port_num == hdev->ibh_port) { + hdev->ibh_state = IBLND_DEV_PORT_DOWN; + kiblnd_set_ni_fatal_on(hdev, 1); + } + break; + default: + break; + } + write_unlock_irqrestore(g_lock, flags); +} + +static int +kiblnd_hdev_get_attr(struct kib_hca_dev *hdev) +{ + struct ib_device_attr *dev_attr; + int rc = 0; + int rc2 = 0; + + /* It's safe to assume a HCA can handle a page size + * matching that of the native system */ + hdev->ibh_page_shift = PAGE_SHIFT; + hdev->ibh_page_size = 1 << PAGE_SHIFT; + hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); + +#ifndef HAVE_IB_DEVICE_ATTRS + LIBCFS_ALLOC(dev_attr, sizeof(*dev_attr)); + if (dev_attr == NULL) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + 
rc = ib_query_device(hdev->ibh_ibdev, dev_attr); + if (rc != 0) { + CERROR("Failed to query IB device: %d\n", rc); + goto out_clean_attr; + } +#else + dev_attr = &hdev->ibh_ibdev->attrs; +#endif + + hdev->ibh_mr_size = dev_attr->max_mr_size; + hdev->ibh_max_qp_wr = dev_attr->max_qp_wr; + + /* Setup device Memory Registration capabilities */ +#ifdef HAVE_FMR_POOL_API +#ifdef HAVE_IB_DEVICE_OPS + if (hdev->ibh_ibdev->ops.alloc_fmr && + hdev->ibh_ibdev->ops.dealloc_fmr && + hdev->ibh_ibdev->ops.map_phys_fmr && + hdev->ibh_ibdev->ops.unmap_fmr) { +#else + if (hdev->ibh_ibdev->alloc_fmr && + hdev->ibh_ibdev->dealloc_fmr && + hdev->ibh_ibdev->map_phys_fmr && + hdev->ibh_ibdev->unmap_fmr) { +#endif + LCONSOLE_INFO("Using FMR for registration\n"); + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FMR_ENABLED; + } else +#endif /* HAVE_FMR_POOL_API */ + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + LCONSOLE_INFO("Using FastReg for registration\n"); + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FASTREG_ENABLED; +#ifndef HAVE_IB_ALLOC_FAST_REG_MR +#ifdef IB_DEVICE_SG_GAPS_REG + if (dev_attr->device_cap_flags & IB_DEVICE_SG_GAPS_REG) + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT; +#endif +#endif + } else { + rc = -ENOSYS; + } + + rc2 = kiblnd_port_get_attr(hdev); + if (rc2 != 0) + return rc2; + + if (rc != 0) + rc = -EINVAL; + +#ifndef HAVE_IB_DEVICE_ATTRS +out_clean_attr: + LIBCFS_FREE(dev_attr, sizeof(*dev_attr)); +#endif + + if (rc == -ENOSYS) + CERROR("IB device does not support FMRs nor FastRegs, can't " + "register memory: %d\n", rc); + else if (rc == -EINVAL) + CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); + return rc; +} + +#ifdef HAVE_IB_GET_DMA_MR +static void +kiblnd_hdev_cleanup_mrs(struct kib_hca_dev *hdev) +{ + if (hdev->ibh_mrs == NULL) + return; + + ib_dereg_mr(hdev->ibh_mrs); + + hdev->ibh_mrs = NULL; +} +#endif + +void +kiblnd_hdev_destroy(struct kib_hca_dev *hdev) +{ + if (hdev->ibh_event_handler.device != NULL) + ib_unregister_event_handler(&hdev->ibh_event_handler); + +#ifdef HAVE_IB_GET_DMA_MR + kiblnd_hdev_cleanup_mrs(hdev); +#endif + + if (hdev->ibh_pd != NULL) + ib_dealloc_pd(hdev->ibh_pd); + + if (hdev->ibh_cmid != NULL) + rdma_destroy_id(hdev->ibh_cmid); + + LIBCFS_FREE(hdev, sizeof(*hdev)); +} + +#ifdef HAVE_IB_GET_DMA_MR +static int +kiblnd_hdev_setup_mrs(struct kib_hca_dev *hdev) +{ + struct ib_mr *mr; + int acflags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE; + + mr = ib_get_dma_mr(hdev->ibh_pd, acflags); + if (IS_ERR(mr)) { + CERROR("Failed ib_get_dma_mr: %ld\n", PTR_ERR(mr)); + kiblnd_hdev_cleanup_mrs(hdev); + return PTR_ERR(mr); + } + + hdev->ibh_mrs = mr; + + return 0; +} +#endif + +static int +kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) +{ /* DUMMY */ + return 0; +} + +static int kiblnd_get_link_status(struct net_device *dev) +{ + int ret = -1; + + LASSERT(dev); + + if (!netif_running(dev)) + ret = 0; + /* Some devices may not be providing link settings */ + else if (dev->ethtool_ops->get_link) + ret = dev->ethtool_ops->get_link(dev); + + return ret; +} + +static int +kiblnd_dev_need_failover(struct kib_dev *dev, struct net *ns) +{ + struct rdma_cm_id *cmid; + struct sockaddr_in srcaddr; + struct sockaddr_in dstaddr; + int rc; + + if (dev->ibd_hdev == NULL || /* initializing */ + dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */ + *kiblnd_tunables.kib_dev_failover > 1) /* debugging */ + return 1; + + /* XXX: it's UGLY, but I don't have better way to find + * ib-bonding 
HCA failover because: + * + * a. no reliable CM event for HCA failover... + * b. no OFED API to get ib_device for current net_device... + * + * We have only two choices at this point: + * + * a. rdma_bind_addr(), it will conflict with listener cmid + * b. rdma_resolve_addr() to zero addr */ + cmid = kiblnd_rdma_create_id(ns, kiblnd_dummy_callback, dev, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cmid)) { + rc = PTR_ERR(cmid); + CERROR("Failed to create cmid for failover: %d\n", rc); + return rc; + } + + memset(&srcaddr, 0, sizeof(srcaddr)); + srcaddr.sin_family = AF_INET; + srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); + + memset(&dstaddr, 0, sizeof(dstaddr)); + dstaddr.sin_family = AF_INET; + rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, 1); + if (rc != 0 || cmid->device == NULL) { + CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", + dev->ibd_ifname, &dev->ibd_ifip, + cmid->device, rc); + rdma_destroy_id(cmid); + return rc; + } + + rc = dev->ibd_hdev->ibh_ibdev != cmid->device; /* true for failover */ + rdma_destroy_id(cmid); + return rc; +} + +int +kiblnd_dev_failover(struct kib_dev *dev, struct net *ns) +{ + LIST_HEAD(zombie_tpo); + LIST_HEAD(zombie_ppo); + LIST_HEAD(zombie_fpo); + struct rdma_cm_id *cmid = NULL; + struct kib_hca_dev *hdev = NULL; + struct kib_hca_dev *old; + struct ib_pd *pd; + struct kib_net *net; + struct sockaddr_in addr; + struct net_device *netdev; + unsigned long flags; + int rc = 0; + int i; + bool set_fatal = true; + + LASSERT(*kiblnd_tunables.kib_dev_failover > 1 || + dev->ibd_can_failover || + dev->ibd_hdev == NULL); + + rc = kiblnd_dev_need_failover(dev, ns); + if (rc <= 0) + goto out; + + if (dev->ibd_hdev != NULL && + dev->ibd_hdev->ibh_cmid != NULL) { + /* XXX it's not good to close old listener at here, + * because we can fail to create new listener. + * But we have to close it now, otherwise rdma_bind_addr + * will return EADDRINUSE... How crap! 
*/ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + cmid = dev->ibd_hdev->ibh_cmid; + /* make next schedule of kiblnd_dev_need_failover() + * return 1 for me */ + dev->ibd_hdev->ibh_cmid = NULL; + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + rdma_destroy_id(cmid); + } + + cmid = kiblnd_rdma_create_id(ns, kiblnd_cm_callback, dev, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(cmid)) { + rc = PTR_ERR(cmid); + CERROR("Failed to create cmid for failover: %d\n", rc); + goto out; + } + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); + addr.sin_port = htons(*kiblnd_tunables.kib_service); + + /* Bind to failover device or port */ + rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr); + if (rc != 0 || cmid->device == NULL) { + CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", + dev->ibd_ifname, &dev->ibd_ifip, + cmid->device, rc); + if (!rc && !cmid->device) + set_fatal = false; + rdma_destroy_id(cmid); + goto out; + } + + LIBCFS_ALLOC(hdev, sizeof(*hdev)); + if (hdev == NULL) { + CERROR("Failed to allocate kib_hca_dev\n"); + rdma_destroy_id(cmid); + rc = -ENOMEM; + goto out; + } + + atomic_set(&hdev->ibh_ref, 1); + hdev->ibh_dev = dev; + hdev->ibh_cmid = cmid; + hdev->ibh_ibdev = cmid->device; + hdev->ibh_port = cmid->port_num; + +#ifdef HAVE_IB_ALLOC_PD_2ARGS + pd = ib_alloc_pd(cmid->device, 0); +#else + pd = ib_alloc_pd(cmid->device); +#endif + if (IS_ERR(pd)) { + rc = PTR_ERR(pd); + CERROR("Can't allocate PD: %d\n", rc); + goto out; + } + + hdev->ibh_pd = pd; + + rc = rdma_listen(cmid, 0); + if (rc != 0) { + CERROR("Can't start new listener: %d\n", rc); + goto out; + } + + rc = kiblnd_hdev_get_attr(hdev); + if (rc != 0) { + CERROR("Can't get device attributes: %d\n", rc); + goto out; + } + +#ifdef HAVE_IB_GET_DMA_MR + rc = kiblnd_hdev_setup_mrs(hdev); + if (rc != 0) { + CERROR("Can't setup device: %d\n", rc); + goto out; + } +#endif + + INIT_IB_EVENT_HANDLER(&hdev->ibh_event_handler, + hdev->ibh_ibdev, kiblnd_event_handler); + ib_register_event_handler(&hdev->ibh_event_handler); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + old = dev->ibd_hdev; + dev->ibd_hdev = hdev; /* take over the refcount */ + hdev = old; + + list_for_each_entry(net, &dev->ibd_nets, ibn_list) { + cfs_cpt_for_each(i, lnet_cpt_table()) { + kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset, + &zombie_tpo); + + if (net->ibn_fmr_ps != NULL) + kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i], + &zombie_fpo); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + out: + if (!list_empty(&zombie_tpo)) + kiblnd_destroy_pool_list(&zombie_tpo); + if (!list_empty(&zombie_ppo)) + kiblnd_destroy_pool_list(&zombie_ppo); + if (!list_empty(&zombie_fpo)) + kiblnd_destroy_fmr_pool_list(&zombie_fpo); + if (hdev != NULL) + kiblnd_hdev_decref(hdev); + + if (rc != 0) { + dev->ibd_failed_failover++; + } else { + dev->ibd_failed_failover = 0; + + if (set_fatal) { + rcu_read_lock(); + netdev = dev_get_by_name_rcu(ns, dev->ibd_ifname); + if (netdev && (kiblnd_get_link_status(netdev) == 1)) + kiblnd_set_ni_fatal_on(dev->ibd_hdev, 0); + rcu_read_unlock(); + } + } + + return rc; +} + +void +kiblnd_destroy_dev(struct kib_dev *dev) +{ + LASSERT(dev->ibd_nnets == 0); + LASSERT(list_empty(&dev->ibd_nets)); + + list_del(&dev->ibd_fail_list); + list_del(&dev->ibd_list); + + if (dev->ibd_hdev != NULL) + kiblnd_hdev_decref(dev->ibd_hdev); + + LIBCFS_FREE(dev, sizeof(*dev)); +} + +static void +kiblnd_base_shutdown(void) 
+{ + struct kib_sched_info *sched; + struct kib_peer_ni *peer_ni; + int i; + + LASSERT(list_empty(&kiblnd_data.kib_devs)); + + CDEBUG(D_MALLOC, "before LND base cleanup: kmem %lld\n", + libcfs_kmem_read()); + + switch (kiblnd_data.kib_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + case IBLND_INIT_DATA: + hash_for_each(kiblnd_data.kib_peers, i, peer_ni, ibp_list) + LASSERT(0); + LASSERT(list_empty(&kiblnd_data.kib_connd_zombies)); + LASSERT(list_empty(&kiblnd_data.kib_connd_conns)); + LASSERT(list_empty(&kiblnd_data.kib_reconn_list)); + LASSERT(list_empty(&kiblnd_data.kib_reconn_wait)); + + /* flag threads to terminate; wake and wait for them to die */ + kiblnd_data.kib_shutdown = 1; + + /* NB: we really want to stop scheduler threads net by net + * instead of the whole module, this should be improved + * with dynamic configuration LNet. + */ + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) + wake_up_all(&sched->ibs_waitq); + + wake_up(&kiblnd_data.kib_connd_waitq); + wake_up(&kiblnd_data.kib_failover_waitq); + + wait_var_event_warning(&kiblnd_data.kib_nthreads, + !atomic_read(&kiblnd_data.kib_nthreads), + "Waiting for %d threads to terminate\n", + atomic_read(&kiblnd_data.kib_nthreads)); + fallthrough; + + case IBLND_INIT_NOTHING: + break; + } + + if (kiblnd_data.kib_scheds != NULL) + cfs_percpt_free(kiblnd_data.kib_scheds); + + CDEBUG(D_MALLOC, "after LND base cleanup: kmem %lld\n", + libcfs_kmem_read()); + + kiblnd_data.kib_init = IBLND_INIT_NOTHING; + module_put(THIS_MODULE); +} + +static void +kiblnd_shutdown(struct lnet_ni *ni) +{ + struct kib_net *net = ni->ni_data; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + + LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL); + + if (net == NULL) + goto out; + + CDEBUG(D_MALLOC, "before LND net cleanup: kmem %lld\n", + libcfs_kmem_read()); + + write_lock_irqsave(g_lock, flags); + net->ibn_shutdown = 1; + write_unlock_irqrestore(g_lock, flags); + + switch (net->ibn_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + /* nuke all existing peers within this net */ + kiblnd_del_peer(ni, LNET_NID_ANY); + + /* Wait for all peer_ni state to clean up */ + wait_var_event_warning(&net->ibn_npeers, + atomic_read(&net->ibn_npeers) == 0, + "%s: waiting for %d peers to disconnect\n", + libcfs_nidstr(&ni->ni_nid), + atomic_read(&net->ibn_npeers)); + + kiblnd_net_fini_pools(net); + + write_lock_irqsave(g_lock, flags); + LASSERT(net->ibn_dev->ibd_nnets > 0); + net->ibn_dev->ibd_nnets--; + list_del(&net->ibn_list); + write_unlock_irqrestore(g_lock, flags); + + fallthrough; + + case IBLND_INIT_NOTHING: + LASSERT (atomic_read(&net->ibn_nconns) == 0); + + if (net->ibn_dev != NULL && + net->ibn_dev->ibd_nnets == 0) + kiblnd_destroy_dev(net->ibn_dev); + + break; + } + + CDEBUG(D_MALLOC, "after LND net cleanup: kmem %lld\n", + libcfs_kmem_read()); + + net->ibn_init = IBLND_INIT_NOTHING; + ni->ni_data = NULL; + + LIBCFS_FREE(net, sizeof(*net)); + +out: + if (list_empty(&kiblnd_data.kib_devs)) + kiblnd_base_shutdown(); +} + +static int +kiblnd_base_startup(struct net *ns) +{ + struct kib_sched_info *sched; + int rc; + int i; + + LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING); + + if (!try_module_get(THIS_MODULE)) + goto failed; + + memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */ + + rwlock_init(&kiblnd_data.kib_global_lock); + + INIT_LIST_HEAD(&kiblnd_data.kib_devs); + INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs); + + hash_init(kiblnd_data.kib_peers); + + 
spin_lock_init(&kiblnd_data.kib_connd_lock); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_waits); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); + INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list); + INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait); + + init_waitqueue_head(&kiblnd_data.kib_connd_waitq); + init_waitqueue_head(&kiblnd_data.kib_failover_waitq); + + kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*sched)); + if (kiblnd_data.kib_scheds == NULL) + goto failed; + + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { + int nthrs; + + spin_lock_init(&sched->ibs_lock); + INIT_LIST_HEAD(&sched->ibs_conns); + init_waitqueue_head(&sched->ibs_waitq); + + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds); + } else { + /* max to half of CPUs, another half is reserved for + * upper layer modules */ + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + } + + sched->ibs_nthreads_max = nthrs; + sched->ibs_cpt = i; + } + + kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR; + + /* lists/ptrs/locks initialised */ + kiblnd_data.kib_init = IBLND_INIT_DATA; + /*****************************************************/ + + rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd"); + if (rc != 0) { + CERROR("Can't spawn o2iblnd connd: %d\n", rc); + goto failed; + } + + if (*kiblnd_tunables.kib_dev_failover != 0) + rc = kiblnd_thread_start(kiblnd_failover_thread, ns, + "kiblnd_failover"); + + if (rc != 0) { + CERROR("Can't spawn o2iblnd failover thread: %d\n", rc); + goto failed; + } + + /* flag everything initialised */ + kiblnd_data.kib_init = IBLND_INIT_ALL; + /*****************************************************/ + + return 0; + + failed: + kiblnd_base_shutdown(); + return -ENETDOWN; +} + +static int +kiblnd_start_schedulers(struct kib_sched_info *sched) +{ + int rc = 0; + int nthrs; + int i; + + if (sched->ibs_nthreads == 0) { + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = sched->ibs_nthreads_max; + } else { + nthrs = cfs_cpt_weight(lnet_cpt_table(), + sched->ibs_cpt); + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + nthrs = min(IBLND_N_SCHED_HIGH, nthrs); + } + } else { + LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max); + /* increase one thread if there is new interface */ + nthrs = (sched->ibs_nthreads < sched->ibs_nthreads_max); + } + + for (i = 0; i < nthrs; i++) { + long id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i); + + rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, + "kiblnd_sd_%02ld_%02ld", + KIB_THREAD_CPT(id), KIB_THREAD_TID(id)); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + sched->ibs_cpt, sched->ibs_nthreads + i, rc); + break; + } + + sched->ibs_nthreads += i; + return rc; +} + +static int kiblnd_dev_start_threads(struct kib_dev *dev, bool newdev, u32 *cpts, + int ncpts) +{ + int cpt; + int rc; + int i; + + for (i = 0; i < ncpts; i++) { + struct kib_sched_info *sched; + + cpt = (cpts == NULL) ? 
i : cpts[i]; + sched = kiblnd_data.kib_scheds[cpt]; + + if (!newdev && sched->ibs_nthreads > 0) + continue; + + rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); + if (rc != 0) { + CERROR("Failed to start scheduler threads for %s\n", + dev->ibd_ifname); + return rc; + } + } + return 0; +} + +static struct kib_dev * +kiblnd_dev_search(char *ifname) +{ + struct kib_dev *alias = NULL; + struct kib_dev *dev; + char *colon; + char *colon2; + + colon = strchr(ifname, ':'); + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + return dev; + + if (alias != NULL) + continue; + + colon2 = strchr(dev->ibd_ifname, ':'); + if (colon != NULL) + *colon = 0; + if (colon2 != NULL) + *colon2 = 0; + + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + alias = dev; + + if (colon != NULL) + *colon = ':'; + if (colon2 != NULL) + *colon2 = ':'; + } + return alias; +} + +static int +kiblnd_startup(struct lnet_ni *ni) +{ + char *ifname = NULL; + struct lnet_inetdev *ifaces = NULL; + struct kib_dev *ibdev = NULL; + struct kib_net *net = NULL; + unsigned long flags; + int rc; + int i; + bool newdev; + + LASSERT(ni->ni_net->net_lnd == &the_o2iblnd); + + if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { + rc = kiblnd_base_startup(ni->ni_net_ns); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + ni->ni_data = net; + if (net == NULL) { + rc = -ENOMEM; + goto failed; + } + + net->ibn_ni = ni; + net->ibn_incarnation = ktime_get_real_ns() / NSEC_PER_USEC; + + kiblnd_tunables_setup(ni); + + /* + * Multi-Rail wants each secondary + * IP to be treated as an unique 'struct ni' interface. + */ + if (ni->ni_interface != NULL) { + /* Use the IPoIB interface specified in 'networks=' */ + ifname = ni->ni_interface; + } else { + ifname = *kiblnd_tunables.kib_default_ipif; + } + + if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { + CERROR("IPoIB interface name too long: %s\n", ifname); + rc = -E2BIG; + goto failed; + } + + rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns); + if (rc < 0) + goto failed; + + for (i = 0; i < rc; i++) { + if (strcmp(ifname, ifaces[i].li_name) == 0) + break; + } + + if (i == rc) { + CERROR("ko2iblnd: No matching interfaces\n"); + rc = -ENOENT; + goto failed; + } + + ibdev = kiblnd_dev_search(ifname); + newdev = ibdev == NULL; + /* hmm...create kib_dev even for alias */ + if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) { + LIBCFS_ALLOC(ibdev, sizeof(*ibdev)); + if (!ibdev) { + rc = -ENOMEM; + goto failed; + } + + ibdev->ibd_ifip = ifaces[i].li_ipaddr; + strlcpy(ibdev->ibd_ifname, ifaces[i].li_name, + sizeof(ibdev->ibd_ifname)); + ibdev->ibd_can_failover = !!(ifaces[i].li_flags & IFF_MASTER); + + INIT_LIST_HEAD(&ibdev->ibd_nets); + INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */ + INIT_LIST_HEAD(&ibdev->ibd_fail_list); + + /* initialize the device */ + rc = kiblnd_dev_failover(ibdev, ni->ni_net_ns); + if (rc) { + CERROR("ko2iblnd: Can't initialize device: rc = %d\n", + rc); + goto failed; + } + + list_add_tail(&ibdev->ibd_list, &kiblnd_data.kib_devs); + } + + net->ibn_dev = ibdev; + ni->ni_nid.nid_addr[0] = cpu_to_be32(ibdev->ibd_ifip); + + ni->ni_dev_cpt = ifaces[i].li_cpt; + + rc = kiblnd_dev_start_threads(ibdev, newdev, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto failed; + + rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) { + CERROR("Failed to initialize NI pools: %d\n", rc); + goto failed; + } + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + 
ibdev->ibd_nnets++; + list_add_tail(&net->ibn_list, &ibdev->ibd_nets); + /* for health check */ + if (ibdev->ibd_hdev->ibh_state == IBLND_DEV_PORT_DOWN) + kiblnd_set_ni_fatal_on(ibdev->ibd_hdev, 1); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + net->ibn_init = IBLND_INIT_ALL; + + return 0; + +failed: + if (net != NULL && net->ibn_dev == NULL && ibdev != NULL) + kiblnd_destroy_dev(ibdev); + + kfree(ifaces); + kiblnd_shutdown(ni); + + CDEBUG(D_NET, "Configuration of device %s failed: rc = %d\n", + ifname ? ifname : "", rc); + + return -ENETDOWN; +} + +static const struct lnet_lnd the_o2iblnd = { + .lnd_type = O2IBLND, + .lnd_startup = kiblnd_startup, + .lnd_shutdown = kiblnd_shutdown, + .lnd_ctl = kiblnd_ctl, + .lnd_send = kiblnd_send, + .lnd_recv = kiblnd_recv, + .lnd_get_dev_prio = kiblnd_get_dev_prio, +}; + +static void ko2inlnd_assert_wire_constants(void) +{ + BUILD_BUG_ON(IBLND_MSG_MAGIC != 0x0be91b91); + BUILD_BUG_ON(IBLND_MSG_VERSION_1 != 0x11); + BUILD_BUG_ON(IBLND_MSG_VERSION_2 != 0x12); + BUILD_BUG_ON(IBLND_MSG_VERSION != IBLND_MSG_VERSION_2); + + BUILD_BUG_ON(IBLND_MSG_CONNREQ != 0xc0); + BUILD_BUG_ON(IBLND_MSG_CONNACK != 0xc1); + BUILD_BUG_ON(IBLND_MSG_NOOP != 0xd0); + BUILD_BUG_ON(IBLND_MSG_IMMEDIATE != 0xd1); + BUILD_BUG_ON(IBLND_MSG_PUT_REQ != 0xd2); + BUILD_BUG_ON(IBLND_MSG_PUT_NAK != 0xd3); + BUILD_BUG_ON(IBLND_MSG_PUT_ACK != 0xd4); + BUILD_BUG_ON(IBLND_MSG_PUT_DONE != 0xd5); + BUILD_BUG_ON(IBLND_MSG_GET_REQ != 0xd6); + BUILD_BUG_ON(IBLND_MSG_GET_DONE != 0xd7); + + BUILD_BUG_ON(IBLND_REJECT_CONN_RACE != 1); + BUILD_BUG_ON(IBLND_REJECT_NO_RESOURCES != 2); + BUILD_BUG_ON(IBLND_REJECT_FATAL != 3); + BUILD_BUG_ON(IBLND_REJECT_CONN_UNCOMPAT != 4); + BUILD_BUG_ON(IBLND_REJECT_CONN_STALE != 5); + BUILD_BUG_ON(IBLND_REJECT_RDMA_FRAGS != 6); + BUILD_BUG_ON(IBLND_REJECT_MSG_QUEUE_SIZE != 7); + BUILD_BUG_ON(IBLND_REJECT_INVALID_SRV_ID != 8); + + BUILD_BUG_ON((int)sizeof(struct kib_connparams) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_connparams, ibcp_queue_depth) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_connparams *)0)->ibcp_queue_depth) != 2); + BUILD_BUG_ON((int)offsetof(struct kib_connparams, ibcp_max_frags) != 2); + BUILD_BUG_ON((int)sizeof(((struct kib_connparams *)0)->ibcp_max_frags) != 2); + BUILD_BUG_ON((int)offsetof(struct kib_connparams, ibcp_max_msg_size) != 4); + BUILD_BUG_ON((int)sizeof(((struct kib_connparams *)0)->ibcp_max_msg_size) != 4); + + BUILD_BUG_ON((int)sizeof(struct kib_immediate_msg) != 72); + BUILD_BUG_ON((int)offsetof(struct kib_immediate_msg, ibim_hdr) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_immediate_msg *)0)->ibim_hdr) != 72); + BUILD_BUG_ON((int)offsetof(struct kib_immediate_msg, ibim_payload) != 72); + BUILD_BUG_ON((int)sizeof(((struct kib_immediate_msg *)0)->ibim_payload) != 0); + + BUILD_BUG_ON((int)sizeof(struct kib_rdma_frag) != 12); + BUILD_BUG_ON((int)offsetof(struct kib_rdma_frag, rf_nob) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_rdma_frag *)0)->rf_nob) != 4); + BUILD_BUG_ON((int)offsetof(struct kib_rdma_frag, rf_addr) != 4); + BUILD_BUG_ON((int)sizeof(((struct kib_rdma_frag *)0)->rf_addr) != 8); + + BUILD_BUG_ON((int)sizeof(struct kib_rdma_desc) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_rdma_desc, rd_key) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_rdma_desc *)0)->rd_key) != 4); + BUILD_BUG_ON((int)offsetof(struct kib_rdma_desc, rd_nfrags) != 4); + BUILD_BUG_ON((int)sizeof(((struct kib_rdma_desc *)0)->rd_nfrags) != 4); + BUILD_BUG_ON((int)offsetof(struct kib_rdma_desc, rd_frags) != 8); + 
BUILD_BUG_ON((int)sizeof(((struct kib_rdma_desc *)0)->rd_frags) != 0); + + BUILD_BUG_ON((int)sizeof(struct kib_putreq_msg) != 80); + BUILD_BUG_ON((int)offsetof(struct kib_putreq_msg, ibprm_hdr) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_putreq_msg *)0)->ibprm_hdr) != 72); + BUILD_BUG_ON((int)offsetof(struct kib_putreq_msg, ibprm_cookie) != 72); + BUILD_BUG_ON((int)sizeof(((struct kib_putreq_msg *)0)->ibprm_cookie) != 8); + + BUILD_BUG_ON((int)sizeof(struct kib_putack_msg) != 24); + BUILD_BUG_ON((int)offsetof(struct kib_putack_msg, ibpam_src_cookie) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_putack_msg *)0)->ibpam_src_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_putack_msg, ibpam_dst_cookie) != 8); + BUILD_BUG_ON((int)sizeof(((struct kib_putack_msg *)0)->ibpam_dst_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_putack_msg, ibpam_rd) != 16); + BUILD_BUG_ON((int)sizeof(((struct kib_putack_msg *)0)->ibpam_rd) != 8); + + BUILD_BUG_ON((int)sizeof(struct kib_get_msg) != 88); + BUILD_BUG_ON((int)offsetof(struct kib_get_msg, ibgm_hdr) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_get_msg *)0)->ibgm_hdr) != 72); + BUILD_BUG_ON((int)offsetof(struct kib_get_msg, ibgm_cookie) != 72); + BUILD_BUG_ON((int)sizeof(((struct kib_get_msg *)0)->ibgm_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_get_msg, ibgm_rd) != 80); + BUILD_BUG_ON((int)sizeof(((struct kib_get_msg *)0)->ibgm_rd) != 8); + + BUILD_BUG_ON((int)sizeof(struct kib_completion_msg) != 12); + BUILD_BUG_ON((int)offsetof(struct kib_completion_msg, ibcm_cookie) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_completion_msg *)0)->ibcm_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_completion_msg, ibcm_status) != 8); + BUILD_BUG_ON((int)sizeof(((struct kib_completion_msg *)0)->ibcm_status) != 4); + + /* Checks for struct kib_msg */ + //BUILD_BUG_ON((int)sizeof(struct kib_msg) != 12); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_magic) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_magic) != 4); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_version) != 4); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_version) != 2); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_type) != 6); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_type) != 1); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_credits) != 7); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_credits) != 1); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_nob) != 8); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_nob) != 4); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_cksum) != 12); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_cksum) != 4); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_srcnid) != 16); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_srcnid) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_srcstamp) != 24); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_srcstamp) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_dstnid) != 32); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_dstnid) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_dststamp) != 40); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_dststamp) != 8); + + /* Connparams */ + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.connparams.ibcp_queue_depth) != 48); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.connparams.ibcp_queue_depth) != 2); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.connparams.ibcp_max_frags) != 50); + BUILD_BUG_ON((int)sizeof(((struct 
kib_msg *)0)->ibm_u.connparams.ibcp_max_frags) != 2); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.connparams.ibcp_max_msg_size) != 52); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.connparams.ibcp_max_msg_size) != 4); + + /* Immediate message */ + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.immediate.ibim_hdr) != 48); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.immediate.ibim_hdr) != 72); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.immediate.ibim_payload) != 120); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.immediate.ibim_payload) != 0); + + /* PUT req message */ + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.putreq.ibprm_hdr) != 48); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.putreq.ibprm_hdr) != 72); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.putreq.ibprm_cookie) != 120); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.putreq.ibprm_cookie) != 8); + + /* Put ACK */ + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.putack.ibpam_src_cookie) != 48); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.putack.ibpam_src_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.putack.ibpam_dst_cookie) != 56); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.putack.ibpam_dst_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.putack.ibpam_rd) != 64); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.putack.ibpam_rd) != 8); + + /* GET message */ + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.get.ibgm_hdr) != 48); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.get.ibgm_hdr) != 72); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.get.ibgm_cookie) != 120); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.get.ibgm_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.get.ibgm_rd) != 128); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.get.ibgm_rd) != 8); + + /* Completion message */ + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.completion.ibcm_cookie) != 48); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.completion.ibcm_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.completion.ibcm_status) != 56); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.completion.ibcm_status) != 4); + + /* Sanity checks */ + BUILD_BUG_ON(sizeof(struct kib_msg) > IBLND_MSG_SIZE); + BUILD_BUG_ON(offsetof(struct kib_msg, + ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) > + IBLND_MSG_SIZE); + BUILD_BUG_ON(offsetof(struct kib_msg, + ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) > + IBLND_MSG_SIZE); +} + +static void __exit ko2iblnd_exit(void) +{ + lnet_unregister_lnd(&the_o2iblnd); +} + +static int __init ko2iblnd_init(void) +{ + int rc; + + ko2inlnd_assert_wire_constants(); + + rc = kiblnd_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_o2iblnd); + + return 0; +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("OpenIB gen2 LNet Network Driver"); +MODULE_VERSION("2.8.0"); +MODULE_LICENSE("GPL"); + +module_init(ko2iblnd_init); +module_exit(ko2iblnd_exit); diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h new file mode 100644 index 0000000000000..d3f651224ee47 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h @@ -0,0 +1,1170 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/klnds/o2iblnd/o2iblnd.h + * + * Author: Eric Barton + */ + +#include +#include + +#if defined(NEED_LOCKDEP_IS_HELD_DISCARD_CONST) \ + && defined(CONFIG_LOCKDEP) \ + && defined(lockdep_is_held) +#undef lockdep_is_held + #define lockdep_is_held(lock) \ + lock_is_held((struct lockdep_map *)&(lock)->dep_map) +#endif + +#ifdef HAVE_COMPAT_RDMA +#include + +#ifdef LINUX_3_17_COMPAT_H +#undef NEED_KTIME_GET_REAL_NS +#endif + +#define HAVE_NLA_PUT_U64_64BIT 1 +#define HAVE_NLA_PARSE_6_PARAMS 1 +#define HAVE_NETLINK_EXTACK 1 + + +/* MOFED has its own bitmap_alloc backport */ +#define HAVE_BITMAP_ALLOC 1 + +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#ifdef HAVE_FMR_POOL_API +#include +#endif + +#define DEBUG_SUBSYSTEM S_LND + +#include +#include +#include "o2iblnd-idl.h" + +#define IBLND_PEER_HASH_BITS 7 /* log2 of # peer_ni lists */ +#define IBLND_N_SCHED 2 +#define IBLND_N_SCHED_HIGH 4 + +struct kib_tunables { + int *kib_dev_failover; /* HCA failover */ + unsigned int *kib_service; /* IB service number */ + int *kib_cksum; /* checksum struct kib_msg? */ + int *kib_timeout; /* comms timeout (seconds) */ + int *kib_keepalive; /* keepalive timeout (seconds) */ + char **kib_default_ipif; /* default IPoIB interface */ + int *kib_retry_count; + int *kib_rnr_retry_count; + int *kib_ib_mtu; /* IB MTU */ + int *kib_require_priv_port;/* accept only privileged ports */ + int *kib_use_priv_port; /* use privileged port for active connect */ + /* # threads on each CPT */ + int *kib_nscheds; + int *kib_wrq_sge; /* # sg elements per wrq */ + int *kib_use_fastreg_gaps; /* enable discontiguous fastreg fragment support */ +}; + +extern struct kib_tunables kiblnd_tunables; + +#define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */ +#define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when eagerly to return credits */ + +#define IBLND_CREDITS_DEFAULT 8 /* default # of peer_ni credits */ +#define IBLND_CREDITS_MAX ((typeof(((struct kib_msg *) 0)->ibm_credits)) - 1) /* Max # of peer_ni credits */ + +/* when eagerly to return credits */ +#define IBLND_CREDITS_HIGHWATER(t, conn) ((conn->ibc_version) == IBLND_MSG_VERSION_1 ? \ + IBLND_CREDIT_HIGHWATER_V1 : \ + min(t->lnd_peercredits_hiw, (__u32)conn->ibc_queue_depth - 1)) + +#ifdef HAVE_RDMA_CREATE_ID_5ARG +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ + rdma_create_id((ns) ? 
(ns) : &init_net, cb, dev, ps, qpt) +#else +# ifdef HAVE_RDMA_CREATE_ID_4ARG +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ + rdma_create_id(cb, dev, ps, qpt) +# else +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ + rdma_create_id(cb, dev, ps) +# endif +#endif + +/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */ +#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1) +#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0) + +/* max size of queued messages (inc hdr) */ +#define IBLND_MSG_SIZE (4<<10) +/* max # of fragments supported. + 1 for unaligned case */ +#define IBLND_MAX_RDMA_FRAGS (LNET_MAX_IOV + 1) + +/************************/ +/* derived constants... */ +/* Pools (shared by connections on each CPT) */ +/* These pools can grow at runtime, so don't need give a very large value */ +#define IBLND_TX_POOL 256 +#define IBLND_FMR_POOL 256 +#define IBLND_FMR_POOL_FLUSH 192 + +/* RX messages (per connection) */ +#define IBLND_RX_MSGS(c) \ + ((c->ibc_queue_depth) * 2 + IBLND_OOB_MSGS(c->ibc_version)) +#define IBLND_RX_MSG_BYTES(c) (IBLND_RX_MSGS(c) * IBLND_MSG_SIZE) +#define IBLND_RX_MSG_PAGES(c) \ + ((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE) + +/* WRs and CQEs (per connection) */ +#define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c) + +/* 2 = LNet msg + Transfer chain */ +#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + kiblnd_send_wrs(c)) + +struct kib_hca_dev; + +/* o2iblnd can run over aliased interface */ +#ifdef IFALIASZ +#define KIB_IFNAME_SIZE IFALIASZ +#else +#define KIB_IFNAME_SIZE 256 +#endif + +enum kib_dev_caps { + IBLND_DEV_CAPS_FASTREG_ENABLED = BIT(0), + IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT = BIT(1), +#ifdef HAVE_FMR_POOL_API + IBLND_DEV_CAPS_FMR_ENABLED = BIT(2), +#endif +}; + +#define IS_FAST_REG_DEV(dev) \ + ((dev)->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED) + + +struct kib_dev { + struct list_head ibd_list; /* chain on kib_devs */ + struct list_head ibd_fail_list; /* chain on kib_failed_devs */ + __u32 ibd_ifip; /* IPoIB interface IP */ + /** IPoIB interface name */ + char ibd_ifname[KIB_IFNAME_SIZE]; + int ibd_nnets; /* # nets extant */ + + time64_t ibd_next_failover; + /* # failover failures */ + int ibd_failed_failover; + /* failover in progress */ + unsigned int ibd_failover; + /* IPoIB interface is a bonding master */ + unsigned int ibd_can_failover; + struct list_head ibd_nets; + struct kib_hca_dev *ibd_hdev; + enum kib_dev_caps ibd_dev_caps; +}; + +struct kib_hca_dev { + struct rdma_cm_id *ibh_cmid; /* listener cmid */ + struct ib_device *ibh_ibdev; /* IB device */ + int ibh_page_shift; /* page shift of current HCA */ + int ibh_page_size; /* page size of current HCA */ + __u64 ibh_page_mask; /* page mask of current HCA */ + __u64 ibh_mr_size; /* size of MR */ + int ibh_max_qp_wr; /* maximum work requests size */ +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *ibh_mrs; /* global MR */ +#endif + struct ib_pd *ibh_pd; /* PD */ + u8 ibh_port; /* port number */ + struct ib_event_handler + ibh_event_handler; /* IB event handler */ + int ibh_state; /* device status */ +#define IBLND_DEV_PORT_DOWN 0 +#define IBLND_DEV_PORT_ACTIVE 1 +#define IBLND_DEV_FATAL 2 + struct kib_dev *ibh_dev; /* owner */ + atomic_t ibh_ref; /* refcount */ +}; + +/** # of seconds to keep pool alive */ +#define IBLND_POOL_DEADLINE 300 +/** # of seconds to retry if allocation failed */ +#define IBLND_POOL_RETRY 1 + +struct kib_pages { + int ibp_npages; /* # pages */ + struct page *ibp_pages[0]; /* page array */ +}; + +struct kib_pool; +struct kib_poolset; + 
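+/*
+ * Note on the pool-set callbacks declared below: this is a summary of the
+ * behaviour implemented by kiblnd_pool_alloc_node() in o2iblnd.c.  A
+ * pool-set grows on demand by calling ps_pool_create() once every pool on
+ * ps_pool_list is exhausted.  Growth is serialised with ps_increasing:
+ * concurrent allocators back off with an interruptible sleep whose
+ * interval doubles up to one second, then retry.  If ps_pool_create()
+ * fails, ps_next_retry is set so that no further attempt is made for
+ * IBLND_POOL_RETRY seconds.  IBLND_POOL_DEADLINE above is the number of
+ * seconds an idle pool is kept alive before it is retired.
+ */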
+typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps, + int inc, struct kib_pool **pp_po); +typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po); +typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node); +typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node); + +struct kib_net; + +#define IBLND_POOL_NAME_LEN 32 + +struct kib_poolset { + /* serialize */ + spinlock_t ps_lock; + /* network it belongs to */ + struct kib_net *ps_net; + /* pool set name */ + char ps_name[IBLND_POOL_NAME_LEN]; + /* list of pools */ + struct list_head ps_pool_list; + /* failed pool list */ + struct list_head ps_failed_pool_list; + /* time stamp for retry if failed to allocate */ + time64_t ps_next_retry; + /* is allocating new pool */ + int ps_increasing; + /* new pool size */ + int ps_pool_size; + /* CPT id */ + int ps_cpt; + + /* create a new pool */ + kib_ps_pool_create_t ps_pool_create; + /* destroy a pool */ + kib_ps_pool_destroy_t ps_pool_destroy; + /* initialize new allocated node */ + kib_ps_node_init_t ps_node_init; + /* finalize node */ + kib_ps_node_fini_t ps_node_fini; +}; + +struct kib_pool { + /* chain on pool list */ + struct list_head po_list; + /* pre-allocated node */ + struct list_head po_free_list; + /* pool_set of this pool */ + struct kib_poolset *po_owner; + /* deadline of this pool */ + time64_t po_deadline; + /* # of elements in use */ + int po_allocated; + /* pool is created on failed HCA */ + int po_failed; + /* # of pre-allocated elements */ + int po_size; +}; + +struct kib_tx_poolset { + struct kib_poolset tps_poolset; /* pool-set */ + __u64 tps_next_tx_cookie; /* cookie of TX */ +}; + +struct kib_tx_pool { + struct kib_pool tpo_pool; /* pool */ + struct kib_hca_dev *tpo_hdev; /* device for this pool */ + struct kib_tx *tpo_tx_descs; /* all the tx descriptors */ + struct kib_pages *tpo_tx_pages; /* premapped tx msg pages */ +}; + +struct kib_fmr_poolset { + spinlock_t fps_lock; /* serialize */ + struct kib_net *fps_net; /* IB network */ + struct list_head fps_pool_list; /* FMR pool list */ + struct list_head fps_failed_pool_list; /* FMR pool list */ + __u64 fps_version; /* validity stamp */ + int fps_cpt; /* CPT id */ + int fps_pool_size; + int fps_flush_trigger; + int fps_cache; + /* is allocating new pool */ + int fps_increasing; + /* time stamp for retry if failed to allocate */ + time64_t fps_next_retry; +}; + +#ifndef HAVE_IB_RDMA_WR +struct ib_rdma_wr { + struct ib_send_wr wr; +}; +#endif + +struct kib_fast_reg_descriptor { /* For fast registration */ + struct list_head frd_list; + struct ib_rdma_wr frd_inv_wr; +#ifdef HAVE_IB_MAP_MR_SG + struct ib_reg_wr frd_fastreg_wr; +#else + struct ib_rdma_wr frd_fastreg_wr; + struct ib_fast_reg_page_list *frd_frpl; +#endif + struct ib_mr *frd_mr; + bool frd_valid; + bool frd_posted; +}; + +struct kib_fmr_pool { + struct list_head fpo_list; /* chain on pool list */ + struct kib_hca_dev *fpo_hdev; /* device for this pool */ + struct kib_fmr_poolset *fpo_owner; /* owner of this pool */ +#ifdef HAVE_FMR_POOL_API + union { + struct { + struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */ + } fmr; +#endif + struct { /* For fast registration */ + struct list_head fpo_pool_list; + int fpo_pool_size; + } fast_reg; +#ifdef HAVE_FMR_POOL_API + }; + bool fpo_is_fmr; /* True if FMR pools allocated */ +#endif + time64_t fpo_deadline; /* deadline of this pool */ + int fpo_failed; /* fmr pool is failed */ + int fpo_map_count; /* # of mapped FMR */ +}; + +struct kib_fmr { + struct kib_fmr_pool 
*fmr_pool; /* pool of FMR */ +#ifdef HAVE_FMR_POOL_API + struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */ +#endif /* HAVE_FMR_POOL_API */ + struct kib_fast_reg_descriptor *fmr_frd; + u32 fmr_key; +}; + +#ifdef HAVE_FMR_POOL_API + +#ifdef HAVE_ORACLE_OFED_EXTENSIONS +#define kib_fmr_pool_map(pool, pgs, n, iov) \ + ib_fmr_pool_map_phys((pool), (pgs), (n), (iov), NULL) +#else +#define kib_fmr_pool_map(pool, pgs, n, iov) \ + ib_fmr_pool_map_phys((pool), (pgs), (n), (iov)) +#endif + +#endif /* HAVE_FMR_POOL_API */ + +struct kib_net { + /* chain on struct kib_dev::ibd_nets */ + struct list_head ibn_list; + __u64 ibn_incarnation;/* my epoch */ + int ibn_init; /* initialisation state */ + int ibn_shutdown; /* shutting down? */ + + atomic_t ibn_npeers; /* # peers extant */ + atomic_t ibn_nconns; /* # connections extant */ + + struct kib_tx_poolset **ibn_tx_ps; /* tx pool-set */ + struct kib_fmr_poolset **ibn_fmr_ps; /* fmr pool-set */ + + struct kib_dev *ibn_dev; /* underlying IB device */ + struct lnet_ni *ibn_ni; /* LNet interface */ +}; + +#define KIB_THREAD_SHIFT 16 +#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid)) +#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT) +#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1)) + +struct kib_sched_info { + /* serialise */ + spinlock_t ibs_lock; + /* schedulers sleep here */ + wait_queue_head_t ibs_waitq; + /* conns to check for rx completions */ + struct list_head ibs_conns; + /* number of scheduler threads */ + int ibs_nthreads; + /* max allowed scheduler threads */ + int ibs_nthreads_max; + int ibs_cpt; /* CPT id */ +}; + +struct kib_data { + int kib_init; /* initialisation state */ + int kib_shutdown; /* shut down? */ + struct list_head kib_devs; /* IB devices extant */ + /* list head of failed devices */ + struct list_head kib_failed_devs; + /* schedulers sleep here */ + wait_queue_head_t kib_failover_waitq; + atomic_t kib_nthreads; /* # live threads */ + /* stabilize net/dev/peer_ni/conn ops */ + rwlock_t kib_global_lock; + /* hash table of all my known peers */ + DECLARE_HASHTABLE(kib_peers, IBLND_PEER_HASH_BITS); + /* the connd task (serialisation assertions) */ + void *kib_connd; + /* connections to setup/teardown */ + struct list_head kib_connd_conns; + /* connections with zero refcount */ + struct list_head kib_connd_zombies; + /* connections to reconnect */ + struct list_head kib_reconn_list; + /* peers wait for reconnection */ + struct list_head kib_reconn_wait; + /* connections wait for completion */ + struct list_head kib_connd_waits; + /* + * The second that peers are pulled out from \a kib_reconn_wait + * for reconnection. + */ + time64_t kib_reconn_sec; + /* connection daemon sleeps here */ + wait_queue_head_t kib_connd_waitq; + spinlock_t kib_connd_lock; /* serialise */ + struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ + /* percpt data for schedulers */ + struct kib_sched_info **kib_scheds; +}; + +#define IBLND_INIT_NOTHING 0 +#define IBLND_INIT_DATA 1 +#define IBLND_INIT_ALL 2 + +struct kib_rx { /* receive message */ + /* queue for attention */ + struct list_head rx_list; + /* owning conn */ + struct kib_conn *rx_conn; + /* # bytes received (-1 while posted) */ + int rx_nob; + /* message buffer (host vaddr) */ + struct kib_msg *rx_msg; + /* message buffer (I/O addr) */ + __u64 rx_msgaddr; + /* for dma_unmap_single() */ + DEFINE_DMA_UNMAP_ADDR(rx_msgunmap); + /* receive work item... 
*/ + struct ib_recv_wr rx_wrq; + /* ...and its memory */ + struct ib_sge rx_sge; +}; + +#define IBLND_POSTRX_DONT_POST 0 /* don't post */ +#define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */ +#define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer_ni back 1 credit */ +#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give myself back 1 reserved credit */ + +struct kib_tx { /* transmit message */ + /* queue on idle_txs ibc_tx_queue etc. */ + struct list_head tx_list; + /* pool I'm from */ + struct kib_tx_pool *tx_pool; + /* owning conn */ + struct kib_conn *tx_conn; + /* # tx callbacks outstanding */ + short tx_sending; + /* queued for sending */ + short tx_queued; + /* waiting for peer_ni */ + short tx_waiting; + /* LNET completion status */ + int tx_status; + /* health status of the transmit */ + enum lnet_msg_hstatus tx_hstatus; + /* completion deadline */ + ktime_t tx_deadline; + /* completion cookie */ + __u64 tx_cookie; + /* lnet msgs to finalize on completion */ + struct lnet_msg *tx_lntmsg[2]; + /* message buffer (host vaddr) */ + struct kib_msg *tx_msg; + /* message buffer (I/O addr) */ + __u64 tx_msgaddr; + /* for dma_unmap_single() */ + DEFINE_DMA_UNMAP_ADDR(tx_msgunmap); + /* # send work items */ + int tx_nwrq; + /* # used scatter/gather elements */ + int tx_nsge; + /* send work items... */ + struct ib_rdma_wr *tx_wrq; + /* ...and their memory */ + struct ib_sge *tx_sge; + /* rdma descriptor */ + struct kib_rdma_desc *tx_rd; + /* # entries in... */ + int tx_nfrags; + /* dma_map_sg descriptor */ + struct scatterlist *tx_frags; + /* rdma phys page addrs */ + __u64 *tx_pages; + /* gaps in fragments */ + bool tx_gaps; + /* FMR */ + struct kib_fmr tx_fmr; + /* dma direction */ + int tx_dmadir; +}; + +struct kib_connvars { + /* connection-in-progress variables */ + struct kib_msg cv_msg; +}; + +struct kib_conn { + /* scheduler information */ + struct kib_sched_info *ibc_sched; + /* owning peer_ni */ + struct kib_peer_ni *ibc_peer; + /* HCA bound on */ + struct kib_hca_dev *ibc_hdev; + /* stash on peer_ni's conn list */ + struct list_head ibc_list; + /* schedule for attention */ + struct list_head ibc_sched_list; + /* version of connection */ + __u16 ibc_version; + /* reconnect later */ + __u16 ibc_reconnect:1; + /* which instance of the peer */ + __u64 ibc_incarnation; + /* # users */ + atomic_t ibc_refcount; + /* what's happening */ + int ibc_state; + /* # uncompleted sends */ + int ibc_nsends_posted; + /* # uncompleted NOOPs */ + int ibc_noops_posted; + /* # credits I have */ + int ibc_credits; + /* # credits to return */ + int ibc_outstanding_credits; + /* # ACK/DONE msg credits */ + int ibc_reserved_credits; + /* set on comms error */ + int ibc_comms_error; + /* connections queue depth */ + __u16 ibc_queue_depth; + /* connections max frags */ + __u16 ibc_max_frags; + /* count of timeout txs waiting on cq */ + __u16 ibc_waits; + /* receive buffers owned */ + unsigned int ibc_nrx:16; + /* scheduled for attention */ + unsigned int ibc_scheduled:1; + /* CQ callback fired */ + unsigned int ibc_ready:1; + /* time of last send */ + ktime_t ibc_last_send; + /** link chain for kiblnd_check_conns only */ + struct list_head ibc_connd_list; + /** rxs completed before ESTABLISHED */ + struct list_head ibc_early_rxs; + /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */ + struct list_head ibc_tx_noops; + /* sends that need a credit */ + struct list_head ibc_tx_queue; + /* sends that don't need a credit */ + struct list_head ibc_tx_queue_nocred; + /* sends that need to reserve an ACK/DONE msg */ + 
struct list_head ibc_tx_queue_rsrvd; + /* active tx awaiting completion */ + struct list_head ibc_active_txs; + /* zombie tx awaiting done */ + struct list_head ibc_zombie_txs; + /* serialise */ + spinlock_t ibc_lock; + /* the rx descs */ + struct kib_rx *ibc_rxs; + /* premapped rx msg pages */ + struct kib_pages *ibc_rx_pages; + + /* CM id */ + struct rdma_cm_id *ibc_cmid; + /* completion queue */ + struct ib_cq *ibc_cq; + + /* in-progress connection state */ + struct kib_connvars *ibc_connvars; +}; + +#define IBLND_CONN_INIT 0 /* being initialised */ +#define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ +#define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */ +#define IBLND_CONN_ESTABLISHED 3 /* connection established */ +#define IBLND_CONN_CLOSING 4 /* being closed */ +#define IBLND_CONN_DISCONNECTED 5 /* disconnected */ + +struct kib_peer_ni { + /* on peer_ni hash chain */ + struct hlist_node ibp_list; + /* who's on the other end(s) */ + lnet_nid_t ibp_nid; + /* LNet interface */ + struct lnet_ni *ibp_ni; + /* all active connections */ + struct list_head ibp_conns; + /* next connection to send on for round robin */ + struct kib_conn *ibp_next_conn; + /* msgs waiting for a conn */ + struct list_head ibp_tx_queue; + /* incarnation of peer_ni */ + __u64 ibp_incarnation; + /* when (in seconds) I was last alive */ + time64_t ibp_last_alive; + /* # users */ + struct kref ibp_kref; + /* version of peer_ni */ + __u16 ibp_version; + /* current passive connection attempts */ + unsigned short ibp_accepting; + /* current active connection attempts */ + unsigned short ibp_connecting; + /* reconnect this peer_ni later */ + unsigned char ibp_reconnecting; + /* counter of how many times we triggered a conn race */ + unsigned char ibp_races; + /* # consecutive reconnection attempts to this peer */ + unsigned int ibp_reconnected; + /* errno on closing this peer_ni */ + int ibp_error; + /* max map_on_demand */ + __u16 ibp_max_frags; + /* max_peer_credits */ + __u16 ibp_queue_depth; + /* reduced value which allows conn to be created if max fails */ + __u16 ibp_queue_depth_mod; +}; + +#ifndef HAVE_IB_INC_RKEY +/** + * ib_inc_rkey - increments the key portion of the given rkey. Can be used + * for calculating a new rkey for type 2 memory windows. + * @rkey - the rkey to increment. + */ +static inline u32 ib_inc_rkey(u32 rkey) +{ + const u32 mask = 0x000000ff; + return ((rkey + 1) & mask) | (rkey & ~mask); +} +#endif + +extern struct kib_data kiblnd_data; + +extern void kiblnd_hdev_destroy(struct kib_hca_dev *hdev); + +int kiblnd_msg_queue_size(int version, struct lnet_ni *ni); + +static inline int kiblnd_timeout(void) +{ + return *kiblnd_tunables.kib_timeout ? 
*kiblnd_tunables.kib_timeout : + lnet_get_lnd_timeout(); +} + +static inline int +kiblnd_concurrent_sends(int version, struct lnet_ni *ni) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + int concurrent_sends; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + concurrent_sends = tunables->lnd_concurrent_sends; + + if (version == IBLND_MSG_VERSION_1) { + if (concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2) + return IBLND_MSG_QUEUE_SIZE_V1 * 2; + + if (concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2) + return IBLND_MSG_QUEUE_SIZE_V1 / 2; + } + + return concurrent_sends; +} + +static inline void +kiblnd_hdev_addref_locked(struct kib_hca_dev *hdev) +{ + LASSERT(atomic_read(&hdev->ibh_ref) > 0); + atomic_inc(&hdev->ibh_ref); +} + +static inline void +kiblnd_hdev_decref(struct kib_hca_dev *hdev) +{ + LASSERT(atomic_read(&hdev->ibh_ref) > 0); + if (atomic_dec_and_test(&hdev->ibh_ref)) + kiblnd_hdev_destroy(hdev); +} + +static inline int +kiblnd_dev_can_failover(struct kib_dev *dev) +{ + if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */ + return 0; + + if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */ + return 0; + + if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */ + return 1; + + return dev->ibd_can_failover; +} + +#define kiblnd_conn_addref(conn) \ +do { \ + CDEBUG(D_NET, "conn[%p] (%d)++\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + atomic_inc(&(conn)->ibc_refcount); \ +} while (0) + +#define kiblnd_conn_decref(conn) \ +do { \ + unsigned long flags; \ + \ + CDEBUG(D_NET, "conn[%p] (%d)--\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + LASSERT_ATOMIC_POS(&(conn)->ibc_refcount); \ + if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \ + list_add_tail(&(conn)->ibc_list, \ + &kiblnd_data.kib_connd_zombies); \ + wake_up(&kiblnd_data.kib_connd_waitq); \ + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\ + } \ +} while (0) + +void kiblnd_destroy_peer(struct kref *kref); + +static inline void kiblnd_peer_addref(struct kib_peer_ni *peer_ni) +{ + CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)++\n", + peer_ni, libcfs_nid2str(peer_ni->ibp_nid), + kref_read(&peer_ni->ibp_kref)); + kref_get(&(peer_ni)->ibp_kref); +} + +static inline void kiblnd_peer_decref(struct kib_peer_ni *peer_ni) +{ + CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)--\n", + peer_ni, libcfs_nid2str(peer_ni->ibp_nid), + kref_read(&peer_ni->ibp_kref)); + kref_put(&peer_ni->ibp_kref, kiblnd_destroy_peer); +} + +static inline bool +kiblnd_peer_connecting(struct kib_peer_ni *peer_ni) +{ + return peer_ni->ibp_connecting != 0 || + peer_ni->ibp_reconnecting != 0 || + peer_ni->ibp_accepting != 0; +} + +static inline bool +kiblnd_peer_idle(struct kib_peer_ni *peer_ni) +{ + return !kiblnd_peer_connecting(peer_ni) && list_empty(&peer_ni->ibp_conns); +} + +static inline int +kiblnd_peer_active(struct kib_peer_ni *peer_ni) +{ + /* Am I in the peer_ni hash table? 
*/ + return !hlist_unhashed(&peer_ni->ibp_list); +} + +static inline struct kib_conn * +kiblnd_get_conn_locked(struct kib_peer_ni *peer_ni) +{ + struct list_head *next; + + LASSERT(!list_empty(&peer_ni->ibp_conns)); + + /* Advance to next connection, be sure to skip the head node */ + if (!peer_ni->ibp_next_conn || + peer_ni->ibp_next_conn->ibc_list.next == &peer_ni->ibp_conns) + next = peer_ni->ibp_conns.next; + else + next = peer_ni->ibp_next_conn->ibc_list.next; + peer_ni->ibp_next_conn = list_entry(next, struct kib_conn, ibc_list); + + return peer_ni->ibp_next_conn; +} + +static inline int +kiblnd_send_keepalive(struct kib_conn *conn) +{ + s64 keepalive_ns = *kiblnd_tunables.kib_keepalive * NSEC_PER_SEC; + + return (*kiblnd_tunables.kib_keepalive > 0) && + ktime_after(ktime_get(), + ktime_add_ns(conn->ibc_last_send, keepalive_ns)); +} + +static inline int +kiblnd_need_noop(struct kib_conn *conn) +{ + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + + if (conn->ibc_outstanding_credits < + IBLND_CREDITS_HIGHWATER(tunables, conn) && + !kiblnd_send_keepalive(conn)) + return 0; /* No need to send NOOP */ + + if (IBLND_OOB_CAPABLE(conn->ibc_version)) { + if (!list_empty(&conn->ibc_tx_queue_nocred)) + return 0; /* NOOP can be piggybacked */ + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || + conn->ibc_credits == 0); + } + + if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */ + !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */ + conn->ibc_credits == 0) /* no credit */ + return 0; + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) /* giving back credits */ + return 0; + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1); +} + +static inline void +kiblnd_abort_receives(struct kib_conn *conn) +{ + ib_modify_qp(conn->ibc_cmid->qp, + &kiblnd_data.kib_error_qpa, IB_QP_STATE); +} + +static inline const char * +kiblnd_queue2str(struct kib_conn *conn, struct list_head *q) +{ + if (q == &conn->ibc_tx_queue) + return "tx_queue"; + + if (q == &conn->ibc_tx_queue_rsrvd) + return "tx_queue_rsrvd"; + + if (q == &conn->ibc_tx_queue_nocred) + return "tx_queue_nocred"; + + if (q == &conn->ibc_active_txs) + return "active_txs"; + + LBUG(); + return NULL; +} + +/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the + * lowest bits of the work request id to stash the work item type. 
*/ + +#define IBLND_WID_INVAL 0 +#define IBLND_WID_TX 1 +#define IBLND_WID_RX 2 +#define IBLND_WID_RDMA 3 +#define IBLND_WID_MR 4 +#define IBLND_WID_MASK 7UL + +static inline __u64 +kiblnd_ptr2wreqid (void *ptr, int type) +{ + unsigned long lptr = (unsigned long)ptr; + + LASSERT ((lptr & IBLND_WID_MASK) == 0); + LASSERT ((type & ~IBLND_WID_MASK) == 0); + return (__u64)(lptr | type); +} + +static inline void * +kiblnd_wreqid2ptr (__u64 wreqid) +{ + return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK); +} + +static inline int +kiblnd_wreqid2type (__u64 wreqid) +{ + return (wreqid & IBLND_WID_MASK); +} + +static inline void +kiblnd_set_conn_state(struct kib_conn *conn, int state) +{ + conn->ibc_state = state; + smp_mb(); +} + +static inline void +kiblnd_init_msg(struct kib_msg *msg, int type, int body_nob) +{ + msg->ibm_type = type; + msg->ibm_nob = offsetof(struct kib_msg, ibm_u) + body_nob; +} + +static inline int +kiblnd_rd_size(struct kib_rdma_desc *rd) +{ + int i; + int size; + + for (i = size = 0; i < rd->rd_nfrags; i++) + size += rd->rd_frags[i].rf_nob; + + return size; +} + +static inline __u64 +kiblnd_rd_frag_addr(struct kib_rdma_desc *rd, int index) +{ + return rd->rd_frags[index].rf_addr; +} + +static inline int +kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index) +{ + return rd->rd_frags[index].rf_nob; +} + +static inline __u32 +kiblnd_rd_frag_key(struct kib_rdma_desc *rd, int index) +{ + return rd->rd_key; +} + +static inline int +kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob) +{ + if (nob < rd->rd_frags[index].rf_nob) { + rd->rd_frags[index].rf_addr += nob; + rd->rd_frags[index].rf_nob -= nob; + } else { + index ++; + } + + return index; +} + +static inline int +kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n) +{ + LASSERT (msgtype == IBLND_MSG_GET_REQ || + msgtype == IBLND_MSG_PUT_ACK); + + return msgtype == IBLND_MSG_GET_REQ ? 
+ offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) : + offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]); +} + +static inline __u64 +kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return ib_dma_mapping_error(dev, dma_addr); +} + +static inline __u64 kiblnd_dma_map_single(struct ib_device *dev, + void *msg, size_t size, + enum dma_data_direction direction) +{ + return ib_dma_map_single(dev, msg, size, direction); +} + +static inline void kiblnd_dma_unmap_single(struct ib_device *dev, + __u64 addr, size_t size, + enum dma_data_direction direction) +{ + ib_dma_unmap_single(dev, addr, size, direction); +} + +#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0) +#define KIBLND_UNMAP_ADDR(p, m, a) (a) + +static inline int kiblnd_dma_map_sg(struct kib_hca_dev *hdev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + int count; + + count = lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device, + sg, nents, direction); + + if (count != 0) + return count; + + return ib_dma_map_sg(hdev->ibh_ibdev, sg, nents, direction); +} + +static inline void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + int count; + + count = lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device, + sg, nents, direction); + if (count != 0) + return; + + ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction); +} + +#ifndef HAVE_IB_SG_DMA_ADDRESS +#include <linux/scatterlist.h> +#define ib_sg_dma_address(dev, sg) sg_dma_address(sg) +#define ib_sg_dma_len(dev, sg) sg_dma_len(sg) +#endif + +static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev, + struct scatterlist *sg) +{ + return ib_sg_dma_address(dev, sg); +} + +static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, + struct scatterlist *sg) +{ + return ib_sg_dma_len(dev, sg); +} + +#ifndef HAVE_RDMA_CONNECT_LOCKED +#define rdma_connect_locked(cmid, cpp) rdma_connect(cmid, cpp) +#endif + +/* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly + * right because OFED1.2 defines it as const, to use it we have to add + * (void *) cast to overcome "const" */ + +#define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data) +#define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len) + +void kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs); +void kiblnd_map_rx_descs(struct kib_conn *conn); +void kiblnd_unmap_rx_descs(struct kib_conn *conn); +void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node); +struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps); + +int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, + struct kib_rdma_desc *rd, u32 nob, u64 iov, + struct kib_fmr *fmr); +void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status); + +int kiblnd_tunables_setup(struct lnet_ni *ni); +int kiblnd_tunables_init(void); + +int kiblnd_connd (void *arg); +int kiblnd_scheduler(void *arg); +#define kiblnd_thread_start(fn, data, namefmt, arg...) 
\ + ({ \ + struct task_struct *__task = kthread_run(fn, data, \ + namefmt, ##arg); \ + if (!IS_ERR(__task)) \ + atomic_inc(&kiblnd_data.kib_nthreads); \ + PTR_ERR_OR_ZERO(__task); \ + }) + +int kiblnd_failover_thread (void *arg); + +int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages); + +int kiblnd_cm_callback(struct rdma_cm_id *cmid, + struct rdma_cm_event *event); +int kiblnd_translate_mtu(int value); + +int kiblnd_dev_failover(struct kib_dev *dev, struct net *ns); +int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, + lnet_nid_t nid); +bool kiblnd_reconnect_peer(struct kib_peer_ni *peer); +void kiblnd_destroy_dev(struct kib_dev *dev); +void kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni); +struct kib_peer_ni *kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid); +int kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni, + int version, u64 incarnation); +int kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why); + +struct kib_conn *kiblnd_create_conn(struct kib_peer_ni *peer_ni, + struct rdma_cm_id *cmid, + int state, int version); +void kiblnd_destroy_conn(struct kib_conn *conn); +void kiblnd_close_conn(struct kib_conn *conn, int error); +void kiblnd_close_conn_locked(struct kib_conn *conn, int error); + +void kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid); +void kiblnd_txlist_done(struct list_head *txlist, int status, + enum lnet_msg_hstatus hstatus); + +void kiblnd_qp_event(struct ib_event *event, void *arg); +void kiblnd_cq_event(struct ib_event *event, void *arg); +void kiblnd_cq_completion(struct ib_cq *cq, void *arg); + +void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp); +int kiblnd_unpack_msg(struct kib_msg *msg, int nob); +int kiblnd_post_rx(struct kib_rx *rx, int credit); + +int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); +int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, + struct bio_vec *kiov, unsigned int offset, unsigned int mlen, + unsigned int rlen); +unsigned int kiblnd_get_dev_prio(struct lnet_ni *ni, unsigned int dev_idx); + + diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c new file mode 100644 index 0000000000000..75895d69b080a --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -0,0 +1,4021 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/klnds/o2iblnd/o2iblnd_cb.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +#define MAX_CONN_RACES_BEFORE_ABORT 20 + +static void kiblnd_peer_alive(struct kib_peer_ni *peer_ni); +static void kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, + int error); +static struct ib_rdma_wr * +kiblnd_init_tx_msg_payload(struct lnet_ni *ni, struct kib_tx *tx, + int type, int body_nob, int payload_nob); +#define kiblnd_init_tx_msg(ni, tx, type, body) \ + kiblnd_init_tx_msg_payload(ni, tx, type, body, 0) +static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, + int resid, struct kib_rdma_desc *dstrd, u64 dstcookie); +static void kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn); +static void kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn); + +static void kiblnd_unmap_tx(struct kib_tx *tx); +static void kiblnd_check_sends_locked(struct kib_conn *conn); + +void +kiblnd_tx_done(struct kib_tx *tx) +{ + struct lnet_msg *lntmsg[2]; + int rc; + int i; + + LASSERT (!in_interrupt()); + LASSERT (!tx->tx_queued); /* mustn't be queued for sending */ + LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */ + LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer_ni response */ + LASSERT (tx->tx_pool != NULL); + + kiblnd_unmap_tx(tx); + + /* tx may have up to 2 lnet msgs to finalise */ + lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; + rc = tx->tx_status; + + if (tx->tx_conn != NULL) { + kiblnd_conn_decref(tx->tx_conn); + tx->tx_conn = NULL; + } + + tx->tx_nwrq = tx->tx_nsge = 0; + tx->tx_status = 0; + + kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list); + + /* delay finalize until my descs have been freed */ + for (i = 0; i < 2; i++) { + if (lntmsg[i] == NULL) + continue; + + /* propagate health status to LNet for requests */ + if (i == 0 && lntmsg[i]) + lntmsg[i]->msg_health_status = tx->tx_hstatus; + + lnet_finalize(lntmsg[i], rc); + } +} + +void +kiblnd_txlist_done(struct list_head *txlist, int status, + enum lnet_msg_hstatus hstatus) +{ + struct kib_tx *tx; + + while ((tx = list_first_entry_or_null(txlist, + struct kib_tx, + tx_list)) != NULL) { + list_del(&tx->tx_list); + /* complete now */ + tx->tx_waiting = 0; + tx->tx_status = status; + if (hstatus != LNET_MSG_STATUS_OK) + tx->tx_hstatus = hstatus; + kiblnd_tx_done(tx); + } +} + +static struct kib_tx * +kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target) +{ + struct kib_net *net = ni->ni_data; + struct list_head *node; + struct kib_tx *tx; + struct kib_tx_poolset *tps; + + tps = net->ibn_tx_ps[lnet_cpt_of_nid(target, ni)]; + node = kiblnd_pool_alloc_node(&tps->tps_poolset); + if (node == NULL) + return NULL; + tx = container_of(node, struct kib_tx, tx_list); + + LASSERT (tx->tx_nwrq == 0); + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_sending == 0); + LASSERT (!tx->tx_waiting); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); + LASSERT (tx->tx_nfrags == 0); + + tx->tx_gaps = false; + tx->tx_hstatus = LNET_MSG_STATUS_OK; + + return tx; +} + +static void +kiblnd_drop_rx(struct kib_rx *rx) +{ + struct kib_conn *conn = rx->rx_conn; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; + + spin_lock_irqsave(&sched->ibs_lock, flags); + LASSERT(conn->ibc_nrx > 0); + conn->ibc_nrx--; + 
spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_conn_decref(conn); +} + +int +kiblnd_post_rx(struct kib_rx *rx, int credit) +{ + struct kib_conn *conn = rx->rx_conn; + struct kib_net *net = conn->ibc_peer->ibp_ni->ni_data; + struct ib_recv_wr *bad_wrq = NULL; +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *mr = conn->ibc_hdev->ibh_mrs; +#endif + int rc; + + LASSERT (net != NULL); + LASSERT (!in_interrupt()); + LASSERT (credit == IBLND_POSTRX_NO_CREDIT || + credit == IBLND_POSTRX_PEER_CREDIT || + credit == IBLND_POSTRX_RSRVD_CREDIT); +#ifdef HAVE_IB_GET_DMA_MR + LASSERT(mr != NULL); + + rx->rx_sge.lkey = mr->lkey; +#else + rx->rx_sge.lkey = conn->ibc_hdev->ibh_pd->local_dma_lkey; +#endif + rx->rx_sge.addr = rx->rx_msgaddr; + rx->rx_sge.length = IBLND_MSG_SIZE; + + rx->rx_wrq.next = NULL; + rx->rx_wrq.sg_list = &rx->rx_sge; + rx->rx_wrq.num_sge = 1; + rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX); + + LASSERT (conn->ibc_state >= IBLND_CONN_INIT); + LASSERT (rx->rx_nob >= 0); /* not posted */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) { + kiblnd_drop_rx(rx); /* No more posts for this rx */ + return 0; + } + + rx->rx_nob = -1; /* flag posted */ + + /* NB: need an extra reference after ib_post_recv because we don't + * own this rx (and rx::rx_conn) anymore, LU-5678. + */ + kiblnd_conn_addref(conn); +#ifdef HAVE_IB_POST_SEND_RECV_CONST + rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, + (const struct ib_recv_wr **)&bad_wrq); +#else + rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq); +#endif + if (unlikely(rc != 0)) { + CERROR("Can't post rx for %s: %d, bad_wrq: %p\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq); + rx->rx_nob = 0; + } + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */ + goto out; + + if (unlikely(rc != 0)) { + kiblnd_close_conn(conn, rc); + kiblnd_drop_rx(rx); /* No more posts for this rx */ + goto out; + } + + if (credit == IBLND_POSTRX_NO_CREDIT) + goto out; + + spin_lock(&conn->ibc_lock); + if (credit == IBLND_POSTRX_PEER_CREDIT) + conn->ibc_outstanding_credits++; + else + conn->ibc_reserved_credits++; + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + +out: + kiblnd_conn_decref(conn); + return rc; +} + +static struct kib_tx * +kiblnd_find_waiting_tx_locked(struct kib_conn *conn, int txtype, u64 cookie) +{ + struct kib_tx *tx; + + list_for_each_entry(tx, &conn->ibc_active_txs, tx_list) { + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_sending != 0 || tx->tx_waiting); + + if (tx->tx_cookie != cookie) + continue; + + if (tx->tx_waiting && + tx->tx_msg->ibm_type == txtype) + return tx; + + CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", + tx->tx_waiting ? "" : "NOT ", + tx->tx_msg->ibm_type, txtype); + } + return NULL; +} + +static void +kiblnd_handle_completion(struct kib_conn *conn, int txtype, int status, u64 cookie) +{ + struct kib_tx *tx; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + int idle; + + spin_lock(&conn->ibc_lock); + + tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie); + if (tx == NULL) { + spin_unlock(&conn->ibc_lock); + + CWARN("Unmatched completion type %x cookie %#llx from %s\n", + txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_close_conn(conn, -EPROTO); + return; + } + + if (tx->tx_status == 0) { /* success so far */ + if (status < 0) { /* failed? 
*/ + if (status == -ECONNABORTED) { + CDEBUG(D_NET, "bad status for connection to %s " + "with completion type %x\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + txtype); + } + + tx->tx_status = status; + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR; + } else if (txtype == IBLND_MSG_GET_REQ) { + lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); + } + } + + tx->tx_waiting = 0; + + idle = !tx->tx_queued && (tx->tx_sending == 0); + if (idle) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(tx); +} + +static void +kiblnd_send_completion(struct kib_conn *conn, int type, int status, u64 cookie) +{ + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_tx *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + + if (tx == NULL) { + CERROR("Can't get tx for completion %x for %s\n", + type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + tx->tx_msg->ibm_u.completion.ibcm_status = status; + tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; + kiblnd_init_tx_msg(ni, tx, type, sizeof(struct kib_completion_msg)); + + kiblnd_queue_tx(tx, conn); +} + +static void +kiblnd_handle_rx(struct kib_rx *rx) +{ + struct kib_msg *msg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + int credits = msg->ibm_credits; + struct kib_tx *tx; + int rc = 0; + int rc2; + int post_credit; + struct lnet_hdr hdr; + struct lnet_nid srcnid; + + LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + CDEBUG (D_NET, "Received %x[%d] from %s\n", + msg->ibm_type, credits, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + if (credits != 0) { + /* Have I received credits that will let me send? */ + spin_lock(&conn->ibc_lock); + + if (conn->ibc_credits + credits > + conn->ibc_queue_depth) { + rc2 = conn->ibc_credits; + spin_unlock(&conn->ibc_lock); + + CERROR("Bad credits from %s: %d + %d > %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc2, credits, + conn->ibc_queue_depth); + + kiblnd_close_conn(conn, -EPROTO); + kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT); + return; + } + + conn->ibc_credits += credits; + + /* This ensures the credit taken by NOOP can be returned */ + if (msg->ibm_type == IBLND_MSG_NOOP && + !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */ + conn->ibc_outstanding_credits++; + + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + } + + switch (msg->ibm_type) { + default: + CERROR("Bad IBLND message type %x from %s\n", + msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_NO_CREDIT; + rc = -EPROTO; + break; + + case IBLND_MSG_NOOP: + if (IBLND_OOB_CAPABLE(conn->ibc_version)) { + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + } + + if (credits != 0) /* credit already posted */ + post_credit = IBLND_POSTRX_NO_CREDIT; + else /* a keepalive NOOP */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_IMMEDIATE: + post_credit = IBLND_POSTRX_DONT_POST; + lnet_hdr_from_nid4(&hdr, &msg->ibm_u.immediate.ibim_hdr); + lnet_nid4_to_nid(msg->ibm_srcnid, &srcnid); + rc = lnet_parse(ni, &hdr, &srcnid, rx, 0); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + lnet_hdr_from_nid4(&hdr, &msg->ibm_u.putreq.ibprm_hdr); + lnet_nid4_to_nid(msg->ibm_srcnid, &srcnid); + rc = lnet_parse(ni, &hdr, &srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_NAK: + CWARN ("PUT_NACK from 
%s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_PUT_ACK: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + + spin_lock(&conn->ibc_lock); + tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.putack.ibpam_src_cookie); + if (tx != NULL) + list_del(&tx->tx_list); + spin_unlock(&conn->ibc_lock); + + if (tx == NULL) { + CERROR("Unmatched PUT_ACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + rc = -EPROTO; + break; + } + + LASSERT (tx->tx_waiting); + /* CAVEAT EMPTOR: I could be racing with tx_complete, but... + * (a) I can overwrite tx_msg since my peer_ni has received it! + * (b) tx_waiting set tells tx_complete() it's not done. */ + + tx->tx_nwrq = tx->tx_nsge = 0; /* overwrite PUT_REQ */ + + rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE, + kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd), + &msg->ibm_u.putack.ibpam_rd, + msg->ibm_u.putack.ibpam_dst_cookie); + if (rc2 < 0) + CERROR("Can't setup rdma for PUT to %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); + + spin_lock(&conn->ibc_lock); + tx->tx_waiting = 0; /* clear waiting and queue atomically */ + kiblnd_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); + break; + + case IBLND_MSG_PUT_DONE: + post_credit = IBLND_POSTRX_PEER_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_GET_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + lnet_hdr_from_nid4(&hdr, &msg->ibm_u.get.ibgm_hdr); + lnet_nid4_to_nid(msg->ibm_srcnid, &srcnid); + rc = lnet_parse(ni, &hdr, &srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_GET_DONE: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + } + + if (rc < 0) /* protocol error */ + kiblnd_close_conn(conn, rc); + + if (post_credit != IBLND_POSTRX_DONT_POST) + kiblnd_post_rx(rx, post_credit); +} + +static void +kiblnd_rx_complete(struct kib_rx *rx, int status, int nob) +{ + struct kib_msg *msg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_net *net = ni->ni_data; + int rc; + int err = -EIO; + + LASSERT(net); + LASSERT(rx->rx_nob < 0); /* was posted */ + rx->rx_nob = 0; /* isn't now */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) + goto ignore; + + if (status != IB_WC_SUCCESS) { + CNETERR("Rx from %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), status); + goto failed; + } + + LASSERT(nob >= 0); + rx->rx_nob = nob; + + rc = kiblnd_unpack_msg(msg, rx->rx_nob); + if (rc != 0) { + CERROR("Error %d unpacking rx from %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + goto failed; + } + + if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + msg->ibm_dstnid != lnet_nid_to_nid4(&ni->ni_nid) || + msg->ibm_srcstamp != conn->ibc_incarnation || + msg->ibm_dststamp != net->ibn_incarnation) { + CERROR("Stale rx from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + err = -ESTALE; + goto failed; + } + + /* set time last known alive */ + kiblnd_peer_alive(conn->ibc_peer); + + /* racing with connection establishment/teardown! 
*/ + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + + write_lock_irqsave(g_lock, flags); + /* must check holding global lock to eliminate race */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); + write_unlock_irqrestore(g_lock, flags); + return; + } + write_unlock_irqrestore(g_lock, flags); + } + kiblnd_handle_rx(rx); + return; + +failed: + CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); + kiblnd_close_conn(conn, err); +ignore: + kiblnd_drop_rx(rx); /* Don't re-post rx. */ +} + +static int +kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx, + struct kib_rdma_desc *rd, u32 nob) +{ + struct kib_hca_dev *hdev; + struct kib_dev *dev; + struct kib_fmr_poolset *fps; + int cpt; + int rc; + int i; + + LASSERT(tx->tx_pool != NULL); + LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL); + + dev = net->ibn_dev; + hdev = tx->tx_pool->tpo_hdev; + cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; + + /* + * If we're dealing with FastReg, but the device doesn't + * support GAPS and the tx has GAPS, then there is no real point + * in trying to map the memory, because it'll just fail. So + * preemptively fail with an appropriate message + */ + if (IS_FAST_REG_DEV(dev) && + !(dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT) && + tx->tx_gaps) { + CERROR("Using FastReg with no GAPS support, but tx has gaps. " + "Try setting use_fastreg_gaps to 1\n"); + return -EPROTONOSUPPORT; + } + +#ifdef HAVE_FMR_POOL_API + /* + * FMR does not support gaps but the tx has gaps then + * we should make sure that the number of fragments we'll be sending + * over fits within the number of fragments negotiated on the + * connection, otherwise, we won't be able to RDMA the data. + * We need to maintain the number of fragments negotiation on the + * connection for backwards compatibility. + */ + if (tx->tx_gaps && (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED)) { + if (tx->tx_conn && + tx->tx_conn->ibc_max_frags <= rd->rd_nfrags) { + CERROR("TX number of frags (%d) is <= than connection" + " number of frags (%d). Consider setting peer's" + " map_on_demand to 256\n", tx->tx_nfrags, + tx->tx_conn->ibc_max_frags); + return -EFBIG; + } + } +#endif + + fps = net->ibn_fmr_ps[cpt]; + rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->tx_fmr); + if (rc != 0) { + CERROR("Can't map %u bytes (%u/%u)s: %d\n", nob, + tx->tx_nfrags, rd->rd_nfrags, rc); + return rc; + } + + /* + * If rd is not tx_rd, it's going to get sent to a peer_ni, who will + * need the rkey + */ + rd->rd_key = tx->tx_fmr.fmr_key; + /* + * for FastReg or FMR with no gaps we can accumulate all + * the fragments in one FastReg or FMR fragment. + */ + if ( +#ifdef HAVE_FMR_POOL_API + ((dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) + && !tx->tx_gaps) || +#endif + IS_FAST_REG_DEV(dev)) { + /* FMR requires zero based address */ +#ifdef HAVE_FMR_POOL_API + if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) + rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; +#endif + rd->rd_frags[0].rf_nob = nob; + rd->rd_nfrags = 1; + } else { + /* + * We're transmitting with gaps using FMR. + * We'll need to use multiple fragments and identify the + * zero based address of each fragment. 
+ */ + for (i = 0; i < rd->rd_nfrags; i++) { + rd->rd_frags[i].rf_addr &= ~hdev->ibh_page_mask; + rd->rd_frags[i].rf_addr += i << hdev->ibh_page_shift; + } + } + + return 0; +} + +static void +kiblnd_unmap_tx(struct kib_tx *tx) +{ + if ( +#ifdef HAVE_FMR_POOL_API + tx->tx_fmr.fmr_pfmr || +#endif + tx->tx_fmr.fmr_frd) + kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status); + + if (tx->tx_nfrags != 0) { + kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev, + tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); + tx->tx_nfrags = 0; + } +} + +#ifdef HAVE_IB_GET_DMA_MR +static struct ib_mr * +kiblnd_find_rd_dma_mr(struct lnet_ni *ni, struct kib_rdma_desc *rd) +{ + struct kib_net *net = ni->ni_data; + struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + + /* + * if map-on-demand is turned on and the device supports + * either FMR or FastReg then use that. Otherwise use global + * memory regions. If that's not available either, then you're + * dead in the water and fail the operation. + */ + if (tunables->lnd_map_on_demand && (IS_FAST_REG_DEV(net->ibn_dev) +#ifdef HAVE_FMR_POOL_API + || net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED +#endif + )) + return NULL; + + /* + * hdev->ibh_mrs can be NULL. This case is dealt with gracefully + * in the call chain. The mapping will fail with appropriate error + * message. + */ + return hdev->ibh_mrs; +} +#endif + +static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx, + struct kib_rdma_desc *rd, int nfrags) +{ + struct kib_net *net = ni->ni_data; + struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *mr = NULL; +#endif + __u32 nob; + int i; + + /* If rd is not tx_rd, it's going to get sent to a peer_ni and I'm the + * RDMA sink */ + tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + tx->tx_nfrags = nfrags; + + rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx->tx_frags, + tx->tx_nfrags, tx->tx_dmadir); + + for (i = 0, nob = 0; i < rd->rd_nfrags; i++) { + rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len( + hdev->ibh_ibdev, &tx->tx_frags[i]); + rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address( + hdev->ibh_ibdev, &tx->tx_frags[i]); + nob += rd->rd_frags[i].rf_nob; + } + +#ifdef HAVE_IB_GET_DMA_MR + mr = kiblnd_find_rd_dma_mr(ni, rd); + if (mr != NULL) { + /* found pre-mapping MR */ + rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey; + return 0; + } +#endif + + if (net->ibn_fmr_ps != NULL) + return kiblnd_fmr_map_tx(net, tx, rd, nob); + + return -EINVAL; +} + +static int kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx, + struct kib_rdma_desc *rd, int nkiov, + struct bio_vec *kiov, int offset, int nob) +{ + struct kib_net *net = ni->ni_data; + struct scatterlist *sg; + int fragnob; + int max_nkiov; + int sg_count = 0; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT(nob > 0); + LASSERT(nkiov > 0); + LASSERT(net != NULL); + + while (offset >= kiov->bv_len) { + offset -= kiov->bv_len; + nkiov--; + kiov++; + LASSERT(nkiov > 0); + } + + max_nkiov = nkiov; + + sg = tx->tx_frags; + do { + LASSERT(nkiov > 0); + + if (!sg) { + CERROR("lacking enough sg entries to map tx\n"); + return -EFAULT; + } + sg_count++; + + fragnob = min((int)(kiov->bv_len - offset), nob); + + /* + * We're allowed to start at a non-aligned page offset in + * the first fragment and end at a non-aligned page offset + * in the last fragment. 
+ */ + if ((fragnob < (int)(kiov->bv_len - offset)) && + nkiov < max_nkiov && nob > fragnob) { + CDEBUG(D_NET, "fragnob %d < available page %d: with" + " remaining %d kiovs with %d nob left\n", + fragnob, (int)(kiov->bv_len - offset), + nkiov, nob); + tx->tx_gaps = true; + } + + sg_set_page(sg, kiov->bv_page, fragnob, + kiov->bv_offset + offset); + sg = sg_next(sg); + + offset = 0; + kiov++; + nkiov--; + nob -= fragnob; + } while (nob > 0); + + return kiblnd_map_tx(ni, tx, rd, sg_count); +} + +static int +kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit) +__must_hold(&conn->ibc_lock) +{ + struct kib_msg *msg = tx->tx_msg; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct lnet_ni *ni = peer_ni->ibp_ni; + struct kib_fast_reg_descriptor *frd = tx->tx_fmr.fmr_frd; + int ver = conn->ibc_version; + int rc; + int done; + + LASSERT(tx->tx_queued); + /* We rely on this for QP sizing */ + LASSERT(tx->tx_nwrq > 0 && tx->tx_nsge >= 0); + LASSERT(tx->tx_nwrq <= 1 + conn->ibc_max_frags); + + LASSERT(credit == 0 || credit == 1); + LASSERT(conn->ibc_outstanding_credits >= 0); + LASSERT(conn->ibc_outstanding_credits <= conn->ibc_queue_depth); + LASSERT(conn->ibc_credits >= 0); + LASSERT(conn->ibc_credits <= conn->ibc_queue_depth); + + if (conn->ibc_nsends_posted == + kiblnd_concurrent_sends(ver, ni)) { + /* tx completions outstanding... */ + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } + + if (credit != 0 && conn->ibc_credits == 0) { /* no credits */ + CDEBUG(D_NET, "%s: no credits\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } + + if (credit != 0 && !IBLND_OOB_CAPABLE(ver) && + conn->ibc_credits == 1 && /* last credit reserved */ + msg->ibm_type != IBLND_MSG_NOOP) { /* for NOOP */ + CDEBUG(D_NET, "%s: not using last credit\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } + + /* NB don't drop ibc_lock before bumping tx_sending */ + list_del(&tx->tx_list); + tx->tx_queued = 0; + + if (msg->ibm_type == IBLND_MSG_NOOP && + (!kiblnd_need_noop(conn) || /* redundant NOOP */ + (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */ + conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) { + /* OK to drop when posted enough NOOPs, since + * kiblnd_check_sends_locked will queue NOOP again when + * posted NOOPs complete */ + spin_unlock(&conn->ibc_lock); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); + spin_lock(&conn->ibc_lock); + CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n", + libcfs_nid2str(peer_ni->ibp_nid), + conn->ibc_noops_posted); + return 0; + } + + kiblnd_pack_msg(peer_ni->ibp_ni, msg, ver, conn->ibc_outstanding_credits, + peer_ni->ibp_nid, conn->ibc_incarnation); + + conn->ibc_credits -= credit; + conn->ibc_outstanding_credits = 0; + conn->ibc_nsends_posted++; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted++; + + /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA + * PUT. If so, it was first queued here as a PUT_REQ, sent and + * stashed on ibc_active_txs, matched by an incoming PUT_ACK, + * and then re-queued here. It's (just) possible that + * tx_sending is non-zero if we've not done the tx_complete() + * from the first send; hence the ++ rather than = below. */ + tx->tx_sending++; + list_add(&tx->tx_list, &conn->ibc_active_txs); + + /* I'm still holding ibc_lock! */ + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) { + CDEBUG(D_NET, "connection to %s is not established\n", + conn->ibc_peer? 
libcfs_nid2str(conn->ibc_peer->ibp_nid): "NULL"); + rc = -ECONNABORTED; + } else if (tx->tx_pool->tpo_pool.po_failed || + conn->ibc_hdev != tx->tx_pool->tpo_hdev) { + /* close_conn will launch failover */ + rc = -ENETDOWN; + } else { + struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr; + struct ib_send_wr *wr = &tx->tx_wrq[0].wr; + + if (frd != NULL && !frd->frd_posted) { + wr = &frd->frd_inv_wr.wr; + wr->next = &frd->frd_fastreg_wr.wr; + frd->frd_fastreg_wr.wr.next = &tx->tx_wrq[0].wr; + } + + LASSERTF(bad->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX), + "bad wr_id %#llx, opc %d, flags %d, peer_ni: %s\n", + bad->wr_id, bad->opcode, bad->send_flags, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + bad = NULL; + if (lnet_send_error_simulation(tx->tx_lntmsg[0], &tx->tx_hstatus)) + rc = -EINVAL; + else +#ifdef HAVE_IB_POST_SEND_RECV_CONST + rc = ib_post_send(conn->ibc_cmid->qp, wr, + (const struct ib_send_wr **)&bad); +#else + rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad); +#endif + if (frd && !frd->frd_posted) { + /* The local invalidate becomes invalid (has been + * successfully used) if the post succeeds or the + * failing wr was not the invalidate. */ + frd->frd_valid = + !(rc == 0 || (bad != &frd->frd_inv_wr.wr)); + } + } + + conn->ibc_last_send = ktime_get(); + + if (rc == 0) { + if (frd != NULL) + frd->frd_posted = true; + return 0; + } + + /* NB credits are transferred in the actual + * message, which can only be the last work item */ + conn->ibc_credits += credit; + conn->ibc_outstanding_credits += msg->ibm_credits; + conn->ibc_nsends_posted--; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted--; + + tx->tx_status = rc; + tx->tx_waiting = 0; + tx->tx_sending--; + + done = (tx->tx_sending == 0); + if (done) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CERROR("Error %d posting transmit to %s\n", + rc, libcfs_nid2str(peer_ni->ibp_nid)); + else + CDEBUG(D_NET, "Error %d posting transmit to %s\n", + rc, libcfs_nid2str(peer_ni->ibp_nid)); + + kiblnd_close_conn(conn, rc); + + if (done) + kiblnd_tx_done(tx); + + spin_lock(&conn->ibc_lock); + + return -EIO; +} + +static void +kiblnd_check_sends_locked(struct kib_conn *conn) +{ + int ver = conn->ibc_version; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_tx *tx; + + /* Don't send anything until after the connection is established */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CDEBUG(D_NET, "%s too soon\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + LASSERT(conn->ibc_nsends_posted <= + kiblnd_concurrent_sends(ver, ni)); + LASSERT (!IBLND_OOB_CAPABLE(ver) || + conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver)); + LASSERT (conn->ibc_reserved_credits >= 0); + + while (conn->ibc_reserved_credits > 0 && + (tx = list_first_entry_or_null(&conn->ibc_tx_queue_rsrvd, + struct kib_tx, tx_list)) != NULL) { + list_move_tail(&tx->tx_list, &conn->ibc_tx_queue); + conn->ibc_reserved_credits--; + } + + if (kiblnd_need_noop(conn)) { + spin_unlock(&conn->ibc_lock); + + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + if (tx != NULL) + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0); + + spin_lock(&conn->ibc_lock); + if (tx != NULL) + kiblnd_queue_tx_locked(tx, conn); + } + + for (;;) { + int credit; + + if (!list_empty(&conn->ibc_tx_queue_nocred)) { + credit = 0; + tx = list_first_entry(&conn->ibc_tx_queue_nocred, + struct kib_tx, tx_list); + } else if (!list_empty(&conn->ibc_tx_noops)) { + LASSERT (!IBLND_OOB_CAPABLE(ver)); + credit 
= 1; + tx = list_first_entry(&conn->ibc_tx_noops, + struct kib_tx, tx_list); + } else if (!list_empty(&conn->ibc_tx_queue)) { + credit = 1; + tx = list_first_entry(&conn->ibc_tx_queue, + struct kib_tx, tx_list); + } else + break; + + if (kiblnd_post_tx_locked(conn, tx, credit) != 0) + break; + } +} + +static void +kiblnd_tx_complete(struct kib_tx *tx, int status) +{ + int failed = (status != IB_WC_SUCCESS); + struct kib_conn *conn = tx->tx_conn; + int idle; + + if (tx->tx_sending <= 0) { + CERROR("Received an event on a freed tx: %p status %d\n", + tx, tx->tx_status); + return; + } + + if (failed) { + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CNETERR("Tx -> %s cookie %#llx" + " sending %d waiting %d: failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_cookie, tx->tx_sending, tx->tx_waiting, + status); + + kiblnd_close_conn(conn, -EIO); + } else { + kiblnd_peer_alive(conn->ibc_peer); + } + + spin_lock(&conn->ibc_lock); + + /* I could be racing with rdma completion. Whoever makes 'tx' idle + * gets to free it, which also drops its ref on 'conn'. */ + + tx->tx_sending--; + conn->ibc_nsends_posted--; + if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted--; + + if (failed) { + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; + tx->tx_waiting = 0; /* don't wait for peer_ni */ + tx->tx_status = -EIO; + } + + idle = (tx->tx_sending == 0) && /* This is the final callback */ + !tx->tx_waiting && /* Not waiting for peer_ni */ + !tx->tx_queued; /* Not re-queued (PUT_DONE) */ + if (idle) + list_del(&tx->tx_list); + + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(tx); +} + + +static void +kiblnd_init_tx_sge(struct kib_tx *tx, u64 addr, unsigned int len) +{ + struct ib_sge *sge = &tx->tx_sge[tx->tx_nsge]; + struct kib_hca_dev *hdev = tx->tx_pool->tpo_hdev; +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *mr = hdev->ibh_mrs; +#endif + + *sge = (struct ib_sge) { +#ifdef HAVE_IB_GET_DMA_MR + .lkey = mr->lkey, +#else + .lkey = hdev->ibh_pd->local_dma_lkey, +#endif + .addr = addr, + .length = len, + }; + + tx->tx_nsge++; +} + +static struct ib_rdma_wr * +kiblnd_init_tx_msg_payload(struct lnet_ni *ni, struct kib_tx *tx, int type, + int body_nob, int payload) +{ + struct ib_rdma_wr *wrq; + int nob = offsetof(struct kib_msg, ibm_u) + body_nob; + + LASSERT(tx->tx_nwrq >= 0); + LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1); + LASSERT(nob <= IBLND_MSG_SIZE); + + kiblnd_init_msg(tx->tx_msg, type, body_nob + payload); + + wrq = &tx->tx_wrq[tx->tx_nwrq]; + + *wrq = (struct ib_rdma_wr) { + .wr = { + .wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX), + .num_sge = 1, + .sg_list = &tx->tx_sge[tx->tx_nsge], + .opcode = IB_WR_SEND, + .send_flags = IB_SEND_SIGNALED, + }, + }; + + kiblnd_init_tx_sge(tx, tx->tx_msgaddr, nob); + + tx->tx_nwrq++; + return wrq; +} + +static int +kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, + int resid, struct kib_rdma_desc *dstrd, u64 dstcookie) +{ + struct kib_msg *ibmsg = tx->tx_msg; + struct kib_rdma_desc *srcrd = tx->tx_rd; + struct ib_rdma_wr *wrq = NULL; + struct ib_sge *sge; + int rc = resid; + int srcidx; + int dstidx; + int sge_nob; + int wrq_sge; + + LASSERT(!in_interrupt()); + LASSERT(tx->tx_nwrq == 0 && tx->tx_nsge == 0); + LASSERT(type == IBLND_MSG_GET_DONE || type == IBLND_MSG_PUT_DONE); + + for (srcidx = dstidx = wrq_sge = sge_nob = 0; + resid > 0; resid -= sge_nob) { + int prev = dstidx; + + if (srcidx >= srcrd->rd_nfrags) { + CERROR("Src buffer exhausted: %d frags\n", srcidx); + 
rc = -EPROTO; + break; + } + + if (dstidx >= dstrd->rd_nfrags) { + CERROR("Dst buffer exhausted: %d frags\n", dstidx); + rc = -EPROTO; + break; + } + + if (tx->tx_nwrq >= conn->ibc_max_frags) { + CERROR("RDMA has too many fragments for peer_ni %s (%d), " + "src idx/frags: %d/%d dst idx/frags: %d/%d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + conn->ibc_max_frags, + srcidx, srcrd->rd_nfrags, + dstidx, dstrd->rd_nfrags); + rc = -EMSGSIZE; + break; + } + + sge_nob = min3(kiblnd_rd_frag_size(srcrd, srcidx), + kiblnd_rd_frag_size(dstrd, dstidx), + resid); + + sge = &tx->tx_sge[tx->tx_nsge]; + sge->addr = kiblnd_rd_frag_addr(srcrd, srcidx); + sge->lkey = kiblnd_rd_frag_key(srcrd, srcidx); + sge->length = sge_nob; + + if (wrq_sge == 0) { + wrq = &tx->tx_wrq[tx->tx_nwrq]; + + wrq->wr.next = &(wrq + 1)->wr; + wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA); + wrq->wr.sg_list = sge; + wrq->wr.opcode = IB_WR_RDMA_WRITE; + wrq->wr.send_flags = 0; + +#ifdef HAVE_IB_RDMA_WR + wrq->remote_addr = kiblnd_rd_frag_addr(dstrd, + dstidx); + wrq->rkey = kiblnd_rd_frag_key(dstrd, + dstidx); +#else + wrq->wr.wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, + dstidx); + wrq->wr.wr.rdma.rkey = kiblnd_rd_frag_key(dstrd, + dstidx); +#endif + } + + srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, sge_nob); + dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, sge_nob); + + wrq_sge++; + if (wrq_sge == *kiblnd_tunables.kib_wrq_sge || dstidx != prev) { + tx->tx_nwrq++; + wrq->wr.num_sge = wrq_sge; + wrq_sge = 0; + } + tx->tx_nsge++; + } + + if (rc < 0) /* no RDMA if completing with failure */ + tx->tx_nwrq = tx->tx_nsge = 0; + + ibmsg->ibm_u.completion.ibcm_status = rc; + ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; + kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx, + type, sizeof(struct kib_completion_msg)); + + return rc; +} + +static void +kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn) +{ + struct list_head *q; + s64 timeout_ns; + + LASSERT(tx->tx_nwrq > 0); /* work items set up */ + LASSERT(!tx->tx_queued); /* not queued for sending already */ + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (conn->ibc_state >= IBLND_CONN_DISCONNECTED) { + CDEBUG(D_NET, "connection with %s is disconnected\n", + conn->ibc_peer? 
libcfs_nid2str(conn->ibc_peer->ibp_nid): "NULL"); + + tx->tx_status = -ECONNABORTED; + tx->tx_waiting = 0; + if (tx->tx_conn != NULL) { + /* PUT_DONE first attached to conn as a PUT_REQ */ + LASSERT(tx->tx_conn == conn); + LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); + tx->tx_conn = NULL; + kiblnd_conn_decref(conn); + } + list_add(&tx->tx_list, &conn->ibc_zombie_txs); + + return; + } + + timeout_ns = kiblnd_timeout() * NSEC_PER_SEC; + tx->tx_queued = 1; + tx->tx_deadline = ktime_add_ns(ktime_get(), timeout_ns); + + if (tx->tx_conn == NULL) { + kiblnd_conn_addref(conn); + tx->tx_conn = conn; + LASSERT (tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE); + } else { + /* PUT_DONE first attached to conn as a PUT_REQ */ + LASSERT (tx->tx_conn == conn); + LASSERT (tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); + } + + switch (tx->tx_msg->ibm_type) { + default: + LBUG(); + + case IBLND_MSG_PUT_REQ: + case IBLND_MSG_GET_REQ: + q = &conn->ibc_tx_queue_rsrvd; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + q = &conn->ibc_tx_queue_nocred; + break; + + case IBLND_MSG_NOOP: + if (IBLND_OOB_CAPABLE(conn->ibc_version)) + q = &conn->ibc_tx_queue_nocred; + else + q = &conn->ibc_tx_noops; + break; + + case IBLND_MSG_IMMEDIATE: + q = &conn->ibc_tx_queue; + break; + } + + list_add_tail(&tx->tx_list, q); +} + +static void +kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn) +{ + spin_lock(&conn->ibc_lock); + kiblnd_queue_tx_locked(tx, conn); + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); +} + +static int +kiblnd_resolve_addr_cap(struct rdma_cm_id *cmid, + struct sockaddr_in *srcaddr, + struct sockaddr_in *dstaddr, + int timeout_ms) +{ + unsigned short port; + int rc; + + /* allow the port to be reused */ + rc = rdma_set_reuseaddr(cmid, 1); + if (rc != 0) { + CERROR("Unable to set reuse on cmid: %d\n", rc); + return rc; + } + + /* look for a free privileged port */ + for (port = PROT_SOCK-1; port > 0; port--) { + srcaddr->sin_port = htons(port); + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)srcaddr, + (struct sockaddr *)dstaddr, + timeout_ms); + if (rc == 0) { + CDEBUG(D_NET, "bound to port %hu\n", port); + return 0; + } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) { + CDEBUG(D_NET, "bind to port %hu failed: %d\n", + port, rc); + } else { + return rc; + } + } + + CERROR("cannot bind to a free privileged port: rc = %d\n", rc); + + return rc; +} + +static int +kiblnd_resolve_addr(struct rdma_cm_id *cmid, + struct sockaddr_in *srcaddr, + struct sockaddr_in *dstaddr, + int timeout_ms) +{ + const struct cred *old_creds = NULL; + struct cred *new_creds; + int rc; + + if (!capable(CAP_NET_BIND_SERVICE)) { + new_creds = prepare_kernel_cred(NULL); + if (!new_creds) + return -ENOMEM; + + cap_raise(new_creds->cap_effective, CAP_NET_BIND_SERVICE); + old_creds = override_creds(new_creds); + } + + rc = kiblnd_resolve_addr_cap(cmid, srcaddr, dstaddr, timeout_ms); + + if (old_creds) + revert_creds(old_creds); + + return rc; +} + +static void +kiblnd_connect_peer(struct kib_peer_ni *peer_ni) +{ + struct rdma_cm_id *cmid; + struct kib_dev *dev; + struct kib_net *net = peer_ni->ibp_ni->ni_data; + struct sockaddr_in srcaddr; + struct sockaddr_in dstaddr; + int rc; + + LASSERT (net != NULL); + LASSERT (peer_ni->ibp_connecting > 0); + + cmid = kiblnd_rdma_create_id(peer_ni->ibp_ni->ni_net_ns, + kiblnd_cm_callback, peer_ni, + RDMA_PS_TCP, IB_QPT_RC); + + if (IS_ERR(cmid)) { + CERROR("Can't create CMID for %s: %ld\n", + 
libcfs_nid2str(peer_ni->ibp_nid), PTR_ERR(cmid)); + rc = PTR_ERR(cmid); + goto failed; + } + + dev = net->ibn_dev; + memset(&srcaddr, 0, sizeof(srcaddr)); + srcaddr.sin_family = AF_INET; + srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip); + + memset(&dstaddr, 0, sizeof(dstaddr)); + dstaddr.sin_family = AF_INET; + dstaddr.sin_port = htons(*kiblnd_tunables.kib_service); + dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer_ni->ibp_nid)); + + kiblnd_peer_addref(peer_ni); /* cmid's ref */ + + if (*kiblnd_tunables.kib_use_priv_port) { + rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, + kiblnd_timeout() * 1000); + } else { + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, + kiblnd_timeout() * 1000); + } + if (rc != 0) { + /* Can't initiate address resolution: */ + CERROR("Can't resolve addr for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + goto failed2; + } + + return; + + failed2: + kiblnd_peer_connect_failed(peer_ni, 1, rc); + kiblnd_peer_decref(peer_ni); /* cmid's ref */ + rdma_destroy_id(cmid); + return; + failed: + kiblnd_peer_connect_failed(peer_ni, 1, rc); +} + +bool +kiblnd_reconnect_peer(struct kib_peer_ni *peer_ni) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + char *reason = NULL; + LIST_HEAD(txs); + unsigned long flags; + + write_lock_irqsave(glock, flags); + if (peer_ni->ibp_reconnecting == 0) { + if (peer_ni->ibp_accepting) + reason = "accepting"; + else if (peer_ni->ibp_connecting) + reason = "connecting"; + else if (!list_empty(&peer_ni->ibp_conns)) + reason = "connected"; + else /* connected then closed */ + reason = "closed"; + + goto no_reconnect; + } + + if (peer_ni->ibp_accepting) + CNETERR("Detecting race between accepting and reconnecting\n"); + peer_ni->ibp_reconnecting--; + + if (!kiblnd_peer_active(peer_ni)) { + list_splice_init(&peer_ni->ibp_tx_queue, &txs); + reason = "unlinked"; + goto no_reconnect; + } + + peer_ni->ibp_connecting++; + peer_ni->ibp_reconnected++; + + write_unlock_irqrestore(glock, flags); + + kiblnd_connect_peer(peer_ni); + return true; + + no_reconnect: + write_unlock_irqrestore(glock, flags); + + CWARN("Abort reconnection of %s: %s\n", + libcfs_nid2str(peer_ni->ibp_nid), reason); + kiblnd_txlist_done(&txs, -ECONNABORTED, + LNET_MSG_STATUS_LOCAL_ABORTED); + return false; +} + +void +kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) +{ + struct kib_peer_ni *peer_ni; + struct kib_peer_ni *peer2; + struct kib_conn *conn; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + int rc; + int i; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems + */ + + LASSERT(!tx || !tx->tx_conn); /* only set when assigned a conn */ + LASSERT(!tx || tx->tx_nwrq > 0); /* work items have been set up */ + + /* First time, just use a read lock since I expect to find my peer_ni + * connected + */ + read_lock_irqsave(g_lock, flags); + + peer_ni = kiblnd_find_peer_locked(ni, nid); + if (peer_ni != NULL && !list_empty(&peer_ni->ibp_conns)) { + /* Found a peer_ni with an established connection */ + conn = kiblnd_get_conn_locked(peer_ni); + kiblnd_conn_addref(conn); /* 1 ref for me... 
*/ + + read_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + return; + } + + read_unlock(g_lock); + /* Re-try with a write lock */ + write_lock(g_lock); + + peer_ni = kiblnd_find_peer_locked(ni, nid); + if (peer_ni != NULL) { + if (list_empty(&peer_ni->ibp_conns)) { + /* found a peer_ni, but it's still connecting... */ + LASSERT(kiblnd_peer_connecting(peer_ni)); + if (tx != NULL) + list_add_tail(&tx->tx_list, + &peer_ni->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer_ni); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + write_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + return; + } + + write_unlock_irqrestore(g_lock, flags); + + /* Allocate a peer_ni ready to add to the peer_ni table and retry */ + rc = kiblnd_create_peer(ni, &peer_ni, nid); + if (rc != 0) { + CERROR("Can't create peer_ni %s\n", libcfs_nid2str(nid)); + if (tx != NULL) { + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); + } + return; + } + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(ni, nid); + if (peer2 != NULL) { + if (list_empty(&peer2->ibp_conns)) { + /* found a peer_ni, but it's still connecting... */ + LASSERT(kiblnd_peer_connecting(peer2)); + if (tx != NULL) + list_add_tail(&tx->tx_list, + &peer2->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer2); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + write_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + + kiblnd_peer_decref(peer_ni); + return; + } + + /* Brand new peer_ni */ + LASSERT(peer_ni->ibp_connecting == 0); + tunables = &peer_ni->ibp_ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + peer_ni->ibp_connecting = tunables->lnd_conns_per_peer; + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT(((struct kib_net *)ni->ni_data)->ibn_shutdown == 0); + + if (tx != NULL) + list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue); + + kiblnd_peer_addref(peer_ni); + hash_add(kiblnd_data.kib_peers, &peer_ni->ibp_list, nid); + + write_unlock_irqrestore(g_lock, flags); + + for (i = 0; i < tunables->lnd_conns_per_peer; i++) + kiblnd_connect_peer(peer_ni); + kiblnd_peer_decref(peer_ni); +} + +int +kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) +{ + struct kib_dev *dev = ((struct kib_net *)ni->ni_data)->ibn_dev; + struct lnet_hdr *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + struct lnet_processid *target = &lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct bio_vec *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + struct kib_msg *ibmsg; + struct kib_rdma_desc *rd; + struct kib_tx *tx; + int nob; + int rc; + + /* NB 'private' is different depending on what we're sending.... 
*/ + + CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_idstr(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + + /* Thread context */ + LASSERT (!in_interrupt()); + + tx = kiblnd_get_idle_tx(ni, lnet_nid_to_nid4(&target->nid)); + if (tx == NULL) { + CERROR("Can't allocate %s txd for %s\n", + lnet_msgtyp2str(type), + libcfs_nidstr(&target->nid)); + return -ENOMEM; + } + ibmsg = tx->tx_msg; + + switch (type) { + default: + LBUG(); + return (-EIO); + + case LNET_MSG_ACK: + LASSERT (payload_nob == 0); + break; + + case LNET_MSG_GET: + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* is the REPLY message too small for RDMA? */ + nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); + if (nob <= IBLND_MSG_SIZE && !lntmsg->msg_rdma_force) + break; /* send IMMEDIATE */ + + rd = &ibmsg->ibm_u.get.ibgm_rd; + rc = kiblnd_setup_rd_kiov(ni, tx, rd, + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_kiov, + 0, lntmsg->msg_md->md_length); + if (rc != 0) { + CERROR("Can't setup GET sink for %s: %d\n", + libcfs_nidstr(&target->nid), rc); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); + return -EIO; + } + + nob = offsetof(struct kib_get_msg, ibgm_rd.rd_frags[rd->rd_nfrags]); + ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; + lnet_hdr_to_nid4(hdr, &ibmsg->ibm_u.get.ibgm_hdr); + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob); + + tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg); + if (tx->tx_lntmsg[1] == NULL) { + CERROR("Can't create reply for GET -> %s\n", + libcfs_nidstr(&target->nid)); + kiblnd_tx_done(tx); + return -EIO; + } + + /* finalise lntmsg[0,1] on completion */ + tx->tx_lntmsg[0] = lntmsg; + tx->tx_waiting = 1; /* waiting for GET_DONE */ + kiblnd_launch_tx(ni, tx, lnet_nid_to_nid4(&target->nid)); + return 0; + + case LNET_MSG_REPLY: + case LNET_MSG_PUT: + /* Is the payload small enough not to need RDMA? 
*/ + nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob <= IBLND_MSG_SIZE && !lntmsg->msg_rdma_force) + break; /* send IMMEDIATE */ + + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + payload_niov, payload_kiov, + payload_offset, payload_nob); + if (rc != 0) { + CERROR("Can't setup PUT src for %s: %d\n", + libcfs_nidstr(&target->nid), rc); + kiblnd_tx_done(tx); + return -EIO; + } + + lnet_hdr_to_nid4(hdr, &ibmsg->ibm_u.putreq.ibprm_hdr); + ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, + sizeof(struct kib_putreq_msg)); + + /* finalise lntmsg[0,1] on completion */ + tx->tx_lntmsg[0] = lntmsg; + tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ + kiblnd_launch_tx(ni, tx, lnet_nid_to_nid4(&target->nid)); + return 0; + } + + /* send IMMEDIATE */ + LASSERT(offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]) + <= IBLND_MSG_SIZE); + + ibmsg = tx->tx_msg; + lnet_hdr_to_nid4(hdr, &ibmsg->ibm_u.immediate.ibim_hdr); + + if (IS_FAST_REG_DEV(dev) && payload_nob) { + struct ib_rdma_wr *wrq; + int i; + + nob = offsetof(struct kib_immediate_msg, ibim_payload[0]); + wrq = kiblnd_init_tx_msg_payload(ni, tx, IBLND_MSG_IMMEDIATE, + nob, payload_nob); + + rd = tx->tx_rd; + rc = kiblnd_setup_rd_kiov(ni, tx, rd, + payload_niov, payload_kiov, + payload_offset, payload_nob); + if (rc != 0) { + CERROR("Can't setup IMMEDIATE src for %s: %d\n", + libcfs_nidstr(&target->nid), rc); + kiblnd_tx_done(tx); + return -EIO; + } + + /* lets generate a SGE chain */ + for (i = 0; i < rd->rd_nfrags; i++) { + kiblnd_init_tx_sge(tx, rd->rd_frags[i].rf_addr, + rd->rd_frags[i].rf_nob); + wrq->wr.num_sge++; + } + } else { + lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg, + offsetof(struct kib_msg, + ibm_u.immediate.ibim_payload), + payload_niov, payload_kiov, + payload_offset, payload_nob); + + nob = offsetof(struct kib_immediate_msg, + ibim_payload[payload_nob]); + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob); + } + + /* finalise lntmsg on completion */ + tx->tx_lntmsg[0] = lntmsg; + + kiblnd_launch_tx(ni, tx, lnet_nid_to_nid4(&target->nid)); + return 0; +} + +static void +kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg) +{ + struct lnet_processid *target = &lntmsg->msg_target; + unsigned int niov = lntmsg->msg_niov; + struct bio_vec *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + struct kib_tx *tx; + int rc; + + tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid); + if (tx == NULL) { + CERROR("Can't get tx for REPLY to %s\n", + libcfs_nidstr(&target->nid)); + goto failed_0; + } + + if (nob == 0) + rc = 0; + else + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + niov, kiov, offset, nob); + + if (rc != 0) { + CERROR("Can't setup GET src for %s: %d\n", + libcfs_nidstr(&target->nid), rc); + goto failed_1; + } + + rc = kiblnd_init_rdma(rx->rx_conn, tx, + IBLND_MSG_GET_DONE, nob, + &rx->rx_msg->ibm_u.get.ibgm_rd, + rx->rx_msg->ibm_u.get.ibgm_cookie); + if (rc < 0) { + CERROR("Can't setup rdma for GET from %s: %d\n", + libcfs_nidstr(&target->nid), rc); + goto failed_1; + } + + if (nob == 0) { + /* No RDMA: local completion may happen now! 
*/ + lnet_finalize(lntmsg, 0); + } else { + /* RDMA: lnet_finalize(lntmsg) when it + * completes */ + tx->tx_lntmsg[0] = lntmsg; + } + + kiblnd_queue_tx(tx, rx->rx_conn); + return; + + +failed_1: + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); +failed_0: + lnet_finalize(lntmsg, -EIO); +} + +unsigned int +kiblnd_get_dev_prio(struct lnet_ni *ni, unsigned int dev_idx) +{ + struct kib_net *net = ni->ni_data; + struct device *dev = NULL; + + if (net) + dev = net->ibn_dev->ibd_hdev->ibh_ibdev->dma_device; + + return lnet_get_dev_prio(dev, dev_idx); + +} + +int +kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, struct bio_vec *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + struct kib_rx *rx = private; + struct kib_msg *rxmsg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct kib_tx *tx; + __u64 ibprm_cookie; + int nob; + int post_credit = IBLND_POSTRX_PEER_CREDIT; + int rc = 0; + + LASSERT (mlen <= rlen); + LASSERT (!in_interrupt()); + + switch (rxmsg->ibm_type) { + default: + LBUG(); + /* fallthrough */ + case IBLND_MSG_IMMEDIATE: + nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[rlen]); + if (nob > rx->rx_nob) { + CERROR("Immediate message from %s too big: %d(%d)\n", + libcfs_nidstr(&lntmsg->msg_hdr.src_nid), + nob, rx->rx_nob); + rc = -EPROTO; + break; + } + + lnet_copy_flat2kiov(niov, kiov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(struct kib_msg, + ibm_u.immediate.ibim_payload), + mlen); + lnet_finalize(lntmsg, 0); + break; + + case IBLND_MSG_PUT_REQ: { + struct kib_msg *txmsg; + struct kib_rdma_desc *rd; + ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; + + if (mlen == 0) { + lnet_finalize(lntmsg, 0); + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, + 0, ibprm_cookie); + break; + } + + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + if (tx == NULL) { + CERROR("Can't allocate tx for %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* Not replying will break the connection */ + rc = -ENOMEM; + break; + } + + txmsg = tx->tx_msg; + rd = &txmsg->ibm_u.putack.ibpam_rd; + rc = kiblnd_setup_rd_kiov(ni, tx, rd, + niov, kiov, offset, mlen); + if (rc != 0) { + CERROR("Can't setup PUT sink for %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); + /* tell peer_ni it's over */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, + rc, ibprm_cookie); + break; + } + + nob = offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[rd->rd_nfrags]); + txmsg->ibm_u.putack.ibpam_src_cookie = ibprm_cookie; + txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_DONE */ + kiblnd_queue_tx(tx, conn); + + /* reposted buffer reserved for PUT_DONE */ + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + } + + case IBLND_MSG_GET_REQ: + if (lntmsg != NULL) { + /* Optimized GET; RDMA lntmsg's payload */ + kiblnd_reply(ni, rx, lntmsg); + } else { + /* GET didn't match anything */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE, + -ENODATA, + rxmsg->ibm_u.get.ibgm_cookie); + } + break; + } + + kiblnd_post_rx(rx, post_credit); + return rc; +} + +static void +kiblnd_thread_fini (void) +{ + atomic_dec (&kiblnd_data.kib_nthreads); +} + +static void +kiblnd_peer_alive(struct kib_peer_ni *peer_ni) +{ + /* This is racy, but everyone's only 
writing ktime_get_seconds() */ + peer_ni->ibp_last_alive = ktime_get_seconds(); + smp_mb(); +} + +static void +kiblnd_peer_notify(struct kib_peer_ni *peer_ni) +{ + int error = 0; + time64_t last_alive = 0; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (kiblnd_peer_idle(peer_ni) && peer_ni->ibp_error != 0) { + error = peer_ni->ibp_error; + peer_ni->ibp_error = 0; + + last_alive = peer_ni->ibp_last_alive; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (error != 0) + lnet_notify(peer_ni->ibp_ni, + peer_ni->ibp_nid, false, false, last_alive); +} + +void +kiblnd_close_conn_locked(struct kib_conn *conn, int error) +{ + /* This just does the immediate housekeeping. 'error' is zero for a + * normal shutdown which can happen only after the connection has been + * established. If the connection is established, schedule the + * connection to be finished off by the connd. Otherwise the connd is + * already dealing with it (either to set it up or tear it down). + * Caller holds kib_global_lock exclusively in irq context */ + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct kib_dev *dev; + unsigned long flags; + + LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (error != 0 && conn->ibc_comms_error == 0) + conn->ibc_comms_error = error; + + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) + return; /* already being handled */ + + if (error == 0 && + list_empty(&conn->ibc_tx_noops) && + list_empty(&conn->ibc_tx_queue) && + list_empty(&conn->ibc_tx_queue_rsrvd) && + list_empty(&conn->ibc_tx_queue_nocred) && + list_empty(&conn->ibc_active_txs)) { + CDEBUG(D_NET, "closing conn to %s\n", + libcfs_nid2str(peer_ni->ibp_nid)); + } else { + CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n", + libcfs_nid2str(peer_ni->ibp_nid), error, + list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", + list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)", + list_empty(&conn->ibc_tx_queue_rsrvd) ? + "" : "(sending_rsrvd)", + list_empty(&conn->ibc_tx_queue_nocred) ? + "" : "(sending_nocred)", + list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)"); + } + + dev = ((struct kib_net *)peer_ni->ibp_ni->ni_data)->ibn_dev; + if (peer_ni->ibp_next_conn == conn) + /* clear next_conn so it won't be used */ + peer_ni->ibp_next_conn = NULL; + list_del(&conn->ibc_list); + /* connd (see below) takes over ibc_list's ref */ + + if (list_empty(&peer_ni->ibp_conns) && /* no more conns */ + kiblnd_peer_active(peer_ni)) { /* still in peer_ni table */ + kiblnd_unlink_peer_locked(peer_ni); + + /* set/clear error on last conn */ + peer_ni->ibp_error = conn->ibc_comms_error; + } + + kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING); + + if (error != 0 && + kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + wake_up(&kiblnd_data.kib_failover_waitq); + } + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + + list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns); + wake_up(&kiblnd_data.kib_connd_waitq); + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); +} + +void +kiblnd_close_conn(struct kib_conn *conn, int error) +{ + unsigned long flags; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + kiblnd_close_conn_locked(conn, error); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +static void +kiblnd_handle_early_rxs(struct kib_conn *conn) +{ + unsigned long flags; + struct kib_rx *rx; + + LASSERT(!in_interrupt()); + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + while ((rx = list_first_entry_or_null(&conn->ibc_early_rxs, + struct kib_rx, + rx_list)) != NULL) { + list_del(&rx->rx_list); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_handle_rx(rx); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +void +kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs) +{ + LIST_HEAD(zombies); + struct kib_tx *nxt; + struct kib_tx *tx; + + spin_lock(&conn->ibc_lock); + + list_for_each_entry_safe(tx, nxt, txs, tx_list) { + if (txs == &conn->ibc_active_txs) { + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_waiting || + tx->tx_sending != 0); + if (conn->ibc_comms_error == -ETIMEDOUT) { + if (tx->tx_waiting && !tx->tx_sending) + tx->tx_hstatus = + LNET_MSG_STATUS_REMOTE_TIMEOUT; + else if (tx->tx_sending) + tx->tx_hstatus = + LNET_MSG_STATUS_NETWORK_TIMEOUT; + } + } else { + LASSERT(tx->tx_queued); + if (conn->ibc_comms_error == -ETIMEDOUT) + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT; + else + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + + tx->tx_status = -ECONNABORTED; + tx->tx_waiting = 0; + + /* + * TODO: This makes an assumption that + * kiblnd_tx_complete() will be called for each tx. If + * that event is dropped we could end up with stale + * connections floating around. We'd like to deal with + * that in a better way. + * + * Also that means we can exceed the timeout by many + * seconds. + */ + if (tx->tx_sending == 0) { + tx->tx_queued = 0; + list_move(&tx->tx_list, &zombies); + } else { + /* keep tx until cq destroy */ + list_move(&tx->tx_list, &conn->ibc_zombie_txs); + conn->ibc_waits ++; + } + } + + spin_unlock(&conn->ibc_lock); + + /* + * aborting transmits occurs when finalizing the connection. + * The connection is finalized on error. + * Passing LNET_MSG_STATUS_OK to txlist_done() will not + * override the value already set in tx->tx_hstatus above. 
+ */ + kiblnd_txlist_done(&zombies, -ECONNABORTED, LNET_MSG_STATUS_OK); +} + +static bool +kiblnd_tx_may_discard(struct kib_conn *conn) +{ + bool rc = false; + struct kib_tx *nxt; + struct kib_tx *tx; + + spin_lock(&conn->ibc_lock); + + list_for_each_entry_safe(tx, nxt, &conn->ibc_zombie_txs, tx_list) { + if (tx->tx_sending > 0 && tx->tx_lntmsg[0] && + lnet_md_discarded(tx->tx_lntmsg[0]->msg_md)) { + tx->tx_sending --; + if (tx->tx_sending == 0) { + kiblnd_conn_decref(tx->tx_conn); + tx->tx_conn = NULL; + rc = true; + } + } + } + + spin_unlock(&conn->ibc_lock); + return rc; +} + +static void +kiblnd_finalise_conn(struct kib_conn *conn) +{ + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state > IBLND_CONN_INIT); + + /* abort_receives moves QP state to IB_QPS_ERR. This is only required + * for connections that didn't get as far as being connected, because + * rdma_disconnect() does this for free. */ + kiblnd_abort_receives(conn); + + kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); + + /* Complete all tx descs not waiting for sends to complete. + * NB we should be safe from RDMA now that the QP has changed state */ + + CDEBUG(D_NET, "abort connection with %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + kiblnd_abort_txs(conn, &conn->ibc_tx_noops); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred); + kiblnd_abort_txs(conn, &conn->ibc_active_txs); + + kiblnd_handle_early_rxs(conn); +} + +static void +kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, + int error) +{ + LIST_HEAD(zombies); + unsigned long flags; + enum lnet_msg_hstatus hstatus; + + LASSERT(error != 0); + LASSERT(!in_interrupt()); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (active) { + LASSERT(peer_ni->ibp_connecting > 0); + peer_ni->ibp_connecting--; + } else { + LASSERT (peer_ni->ibp_accepting > 0); + peer_ni->ibp_accepting--; + } + + if (kiblnd_peer_connecting(peer_ni)) { + /* another connection attempt under way... 
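+ * leave the peer_ni and its queued txs alone until the remaining + * attempt(s) have succeeded or failed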
*/ + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return; + } + + peer_ni->ibp_reconnected = 0; + if (list_empty(&peer_ni->ibp_conns)) { + /* Take peer_ni's blocked transmits to complete with error */ + list_splice_init(&peer_ni->ibp_tx_queue, &zombies); + + if (kiblnd_peer_active(peer_ni)) + kiblnd_unlink_peer_locked(peer_ni); + + peer_ni->ibp_error = error; + } else { + /* Can't have blocked transmits if there are connections */ + LASSERT(list_empty(&peer_ni->ibp_tx_queue)); + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_peer_notify(peer_ni); + + if (list_empty(&zombies)) + return; + + CNETERR("Deleting messages for %s: connection failed\n", + libcfs_nid2str(peer_ni->ibp_nid)); + + switch (error) { + case -EHOSTUNREACH: + case -ETIMEDOUT: + hstatus = LNET_MSG_STATUS_NETWORK_TIMEOUT; + break; + case -ECONNREFUSED: + hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; + break; + default: + hstatus = LNET_MSG_STATUS_LOCAL_DROPPED; + break; + } + + kiblnd_txlist_done(&zombies, error, hstatus); +} + +static void +kiblnd_connreq_done(struct kib_conn *conn, int status) +{ + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct kib_tx *tx; + LIST_HEAD(txs); + unsigned long flags; + int active; + + active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + CDEBUG(D_NET,"%s: active(%d), version(%x), status(%d)\n", + libcfs_nid2str(peer_ni->ibp_nid), active, + conn->ibc_version, status); + + LASSERT (!in_interrupt()); + LASSERT ((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT && + peer_ni->ibp_connecting > 0) || + (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && + peer_ni->ibp_accepting > 0)); + + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; + + if (status != 0) { + /* failed to establish connection */ + kiblnd_peer_connect_failed(peer_ni, active, status); + kiblnd_finalise_conn(conn); + return; + } + + /* connection established */ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + conn->ibc_last_send = ktime_get(); + kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); + kiblnd_peer_alive(peer_ni); + + /* Add conn to peer_ni's list and nuke any dangling conns from a different + * peer_ni instance... 
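+ * (any version or incarnation mismatch below closes them via + * kiblnd_close_stale_conns_locked())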
*/ + kiblnd_conn_addref(conn); /* +1 ref for ibc_list */ + list_add(&conn->ibc_list, &peer_ni->ibp_conns); + peer_ni->ibp_reconnected = 0; + if (active) + peer_ni->ibp_connecting--; + else + peer_ni->ibp_accepting--; + + if (peer_ni->ibp_version == 0) { + peer_ni->ibp_version = conn->ibc_version; + peer_ni->ibp_incarnation = conn->ibc_incarnation; + } + + if (peer_ni->ibp_version != conn->ibc_version || + peer_ni->ibp_incarnation != conn->ibc_incarnation) { + kiblnd_close_stale_conns_locked(peer_ni, conn->ibc_version, + conn->ibc_incarnation); + peer_ni->ibp_version = conn->ibc_version; + peer_ni->ibp_incarnation = conn->ibc_incarnation; + } + + /* grab pending txs while I have the lock */ + list_splice_init(&peer_ni->ibp_tx_queue, &txs); + + if (!kiblnd_peer_active(peer_ni) || /* peer_ni has been deleted */ + conn->ibc_comms_error != 0) { /* error has happened already */ + + /* start to shut down connection */ + kiblnd_close_conn_locked(conn, -ECONNABORTED); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(&txs, -ECONNABORTED, + LNET_MSG_STATUS_LOCAL_ERROR); + + return; + } + + /* +1 ref for myself, this connection is visible to other threads + * now, refcount of peer:ibp_conns can be released by connection + * close from either a different thread, or the calling of + * kiblnd_check_sends_locked() below. See bz21911 for details. + */ + kiblnd_conn_addref(conn); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* Schedule blocked txs + * Note: if we are running with conns_per_peer > 1, these blocked + * txs will all get scheduled to the first connection which gets + * scheduled. We won't be using round robin on this first batch. + */ + spin_lock(&conn->ibc_lock); + while ((tx = list_first_entry_or_null(&txs, struct kib_tx, + tx_list)) != NULL) { + list_del(&tx->tx_list); + + kiblnd_queue_tx_locked(tx, conn); + } + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + + /* schedule blocked rxs */ + kiblnd_handle_early_rxs(conn); + kiblnd_conn_decref(conn); +} + +static void +kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej) +{ + int rc; + +#ifdef HAVE_RDMA_REJECT_4ARGS + rc = rdma_reject(cmid, rej, sizeof(*rej), IB_CM_REJ_CONSUMER_DEFINED); +#else + rc = rdma_reject(cmid, rej, sizeof(*rej)); +#endif + + if (rc != 0) + CWARN("Error %d sending reject\n", rc); +} + +static int +kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) +{ + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + struct kib_msg *reqmsg = priv; + struct kib_msg *ackmsg; + struct kib_dev *ibdev; + struct kib_peer_ni *peer_ni; + struct kib_peer_ni *peer2; + struct kib_conn *conn; + struct lnet_ni *ni = NULL; + struct kib_net *net = NULL; + lnet_nid_t nid; + struct rdma_conn_param cp; + struct kib_rej rej; + int version = IBLND_MSG_VERSION; + unsigned long flags; + int rc; + struct sockaddr_in *peer_addr; + + LASSERT(!in_interrupt()); + /* cmid inherits 'context' from the corresponding listener id */ + ibdev = cmid->context; + LASSERT(ibdev); + + memset(&rej, 0, sizeof(rej)); + rej.ibr_magic = IBLND_MSG_MAGIC; + rej.ibr_why = IBLND_REJECT_FATAL; + rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE; + + peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr); + if (*kiblnd_tunables.kib_require_priv_port && + ntohs(peer_addr->sin_port) >= PROT_SOCK) { + __u32 ip = ntohl(peer_addr->sin_addr.s_addr); + CERROR("peer_ni's port (%pI4h:%hu) is not privileged\n", + &ip, ntohs(peer_addr->sin_port)); + goto failed; + } + + if (priv_nob 
< offsetof(struct kib_msg, ibm_type)) { + CERROR("Short connection request\n"); + goto failed; + } + + /* Future protocol version compatibility support! If the + * o2iblnd-specific protocol changes, or when LNET unifies + * protocols over all LNDs, the initial connection will + * negotiate a protocol version. I trap this here to avoid + * console errors; the reject tells the peer_ni which protocol I + * speak. */ + if (reqmsg->ibm_magic == LNET_PROTO_MAGIC || + reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) + goto failed; + if (reqmsg->ibm_magic == IBLND_MSG_MAGIC && + reqmsg->ibm_version != IBLND_MSG_VERSION && + reqmsg->ibm_version != IBLND_MSG_VERSION_1) + goto failed; + if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1)) + goto failed; + + rc = kiblnd_unpack_msg(reqmsg, priv_nob); + if (rc != 0) { + CERROR("Can't parse connection request: %d\n", rc); + goto failed; + } + + nid = reqmsg->ibm_srcnid; + ni = lnet_nid2ni_addref(reqmsg->ibm_dstnid); + + if (ni != NULL) { + net = (struct kib_net *)ni->ni_data; + rej.ibr_incarnation = net->ibn_incarnation; + } + + if (ni == NULL || /* no matching net */ + lnet_nid_to_nid4(&ni->ni_nid) != + reqmsg->ibm_dstnid || /* right NET, wrong NID! */ + net->ibn_dev != ibdev) { /* wrong device */ + CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): bad dst nid %s\n", libcfs_nid2str(nid), + ni ? libcfs_nidstr(&ni->ni_nid) : "NA", + ibdev->ibd_ifname, ibdev->ibd_nnets, + &ibdev->ibd_ifip, + libcfs_nid2str(reqmsg->ibm_dstnid)); + + goto failed; + } + + /* check time stamp as soon as possible */ + if (reqmsg->ibm_dststamp != 0 && + reqmsg->ibm_dststamp != net->ibn_incarnation) { + CWARN("Stale connection request\n"); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } + + /* I can accept peer_ni's version */ + version = reqmsg->ibm_version; + + if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) { + CERROR("Unexpected connreq msg type: %x from %s\n", + reqmsg->ibm_type, libcfs_nid2str(nid)); + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_queue_depth > + kiblnd_msg_queue_size(version, ni)) { + CERROR("Can't accept conn from %s, queue depth too large: %d (<=%d wanted)\n", + libcfs_nid2str(nid), + reqmsg->ibm_u.connparams.ibcp_queue_depth, + kiblnd_msg_queue_size(version, ni)); + + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE; + + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_max_frags > + IBLND_MAX_RDMA_FRAGS) { + CWARN("Can't accept conn from %s (version %x): max_frags %d too large (%d wanted)\n", + libcfs_nid2str(nid), version, + reqmsg->ibm_u.connparams.ibcp_max_frags, + IBLND_MAX_RDMA_FRAGS); + + if (version >= IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; + + goto failed; + } else if (reqmsg->ibm_u.connparams.ibcp_max_frags < + IBLND_MAX_RDMA_FRAGS && + net->ibn_fmr_ps == NULL) { + CWARN("Can't accept conn from %s (version %x): max_frags %d incompatible without FMR pool (%d wanted)\n", + libcfs_nid2str(nid), version, + reqmsg->ibm_u.connparams.ibcp_max_frags, + IBLND_MAX_RDMA_FRAGS); + + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; + + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("Can't accept %s: message size %d too big (%d max)\n", + libcfs_nid2str(nid), + reqmsg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + goto failed; + } + + /* assume 'nid' is a new peer_ni; create */ + rc = 
kiblnd_create_peer(ni, &peer_ni, nid); + if (rc != 0) { + CERROR("Can't create peer_ni for %s\n", libcfs_nid2str(nid)); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } + + /* We have validated the peer's parameters so use those */ + peer_ni->ibp_max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags; + peer_ni->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth; + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(ni, nid); + if (peer2 != NULL) { + if (peer2->ibp_version == 0) { + peer2->ibp_version = version; + peer2->ibp_incarnation = reqmsg->ibm_srcstamp; + } + + /* not the guy I've talked with */ + if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp || + peer2->ibp_version != version) { + kiblnd_close_peer_conns_locked(peer2, -ESTALE); + + if (kiblnd_peer_active(peer2)) { + peer2->ibp_incarnation = reqmsg->ibm_srcstamp; + peer2->ibp_version = version; + } + write_unlock_irqrestore(g_lock, flags); + + CWARN("Conn stale %s version %x/%x incarnation %llu/%llu\n", + libcfs_nid2str(nid), peer2->ibp_version, version, + peer2->ibp_incarnation, reqmsg->ibm_srcstamp); + + kiblnd_peer_decref(peer_ni); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } + + /* Tie-break connection race in favour of the higher NID. + * If we keep running into a race condition multiple times, + * we have to assume that the connection attempt with the + * higher NID is stuck in a connecting state and will never + * recover. As such, we pass through this if-block and let + * the lower NID connection win so we can move forward. + */ + if (peer2->ibp_connecting != 0 && + nid < lnet_nid_to_nid4(&ni->ni_nid) && + peer2->ibp_races < MAX_CONN_RACES_BEFORE_ABORT) { + peer2->ibp_races++; + write_unlock_irqrestore(g_lock, flags); + + CDEBUG(D_NET, "Conn race %s\n", + libcfs_nid2str(peer2->ibp_nid)); + + kiblnd_peer_decref(peer_ni); + rej.ibr_why = IBLND_REJECT_CONN_RACE; + goto failed; + } + if (peer2->ibp_races >= MAX_CONN_RACES_BEFORE_ABORT) + CNETERR("Conn race %s: unresolved after %d attempts, letting lower NID win\n", + libcfs_nid2str(peer2->ibp_nid), + MAX_CONN_RACES_BEFORE_ABORT); + /* + * passive connection is allowed even this peer_ni is waiting for + * reconnection. + */ + peer2->ibp_reconnecting = 0; + peer2->ibp_races = 0; + peer2->ibp_accepting++; + kiblnd_peer_addref(peer2); + + /* Race with kiblnd_launch_tx (active connect) to create peer_ni + * so copy validated parameters since we now know what the + * peer_ni's limits are */ + peer2->ibp_max_frags = peer_ni->ibp_max_frags; + peer2->ibp_queue_depth = peer_ni->ibp_queue_depth; + + write_unlock_irqrestore(g_lock, flags); + kiblnd_peer_decref(peer_ni); + peer_ni = peer2; + } else { + /* Brand new peer_ni */ + LASSERT(peer_ni->ibp_accepting == 0); + LASSERT(peer_ni->ibp_version == 0 && + peer_ni->ibp_incarnation == 0); + + peer_ni->ibp_accepting = 1; + peer_ni->ibp_version = version; + peer_ni->ibp_incarnation = reqmsg->ibm_srcstamp; + + /* I have a ref on ni that prevents it being shutdown */ + LASSERT(net->ibn_shutdown == 0); + + kiblnd_peer_addref(peer_ni); + hash_add(kiblnd_data.kib_peers, &peer_ni->ibp_list, nid); + + write_unlock_irqrestore(g_lock, flags); + } + + conn = kiblnd_create_conn(peer_ni, cmid, IBLND_CONN_PASSIVE_WAIT, + version); + if (!conn) { + kiblnd_peer_connect_failed(peer_ni, 0, -ENOMEM); + kiblnd_peer_decref(peer_ni); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } + + /* conn now "owns" cmid, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. 
+ */ + conn->ibc_incarnation = reqmsg->ibm_srcstamp; + conn->ibc_credits = conn->ibc_queue_depth; + conn->ibc_reserved_credits = conn->ibc_queue_depth; + LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + + IBLND_OOB_MSGS(version) <= IBLND_RX_MSGS(conn)); + + ackmsg = &conn->ibc_connvars->cv_msg; + memset(ackmsg, 0, sizeof(*ackmsg)); + + kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, + sizeof(ackmsg->ibm_u.connparams)); + ackmsg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; + ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; + ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + + kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = ackmsg; + cp.private_data_len = ackmsg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid)); + + rc = rdma_accept(cmid, &cp); + if (rc != 0) { + CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc); + rej.ibr_version = version; + rej.ibr_why = IBLND_REJECT_FATAL; + + kiblnd_reject(cmid, &rej); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + lnet_ni_decref(ni); + return 0; + + failed: + if (ni != NULL) { + rej.ibr_cp.ibcp_queue_depth = + kiblnd_msg_queue_size(version, ni); + rej.ibr_cp.ibcp_max_frags = IBLND_MAX_RDMA_FRAGS; + lnet_ni_decref(ni); + } + + rej.ibr_version = version; + kiblnd_reject(cmid, &rej); + + return -ECONNREFUSED; +} + +static void +kiblnd_check_reconnect(struct kib_conn *conn, int version, + u64 incarnation, int why, struct kib_connparams *cp) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + char *reason; + int msg_size = IBLND_MSG_SIZE; + int frag_num = -1; + int queue_dep = -1; + bool reconnect; + unsigned long flags; + + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + LASSERT(peer_ni->ibp_connecting > 0); /* 'conn' at least */ + + if (cp) { + msg_size = cp->ibcp_max_msg_size; + frag_num = cp->ibcp_max_frags; + queue_dep = cp->ibcp_queue_depth; + } + + write_lock_irqsave(glock, flags); + /* retry connection if it's still needed and no other connection + * attempts (active or passive) are in progress + * NB: reconnect is still needed even when ibp_tx_queue is + * empty if ibp_version != version because reconnect may be + * initiated. 
+ */ + reconnect = (!list_empty(&peer_ni->ibp_tx_queue) || + peer_ni->ibp_version != version) && + peer_ni->ibp_connecting && + peer_ni->ibp_accepting == 0; + if (!reconnect) { + reason = "no need"; + goto out; + } + + switch (why) { + default: + reason = "Unknown"; + break; + + case IBLND_REJECT_RDMA_FRAGS: { + if (!cp) { + reason = "can't negotiate max frags"; + goto out; + } + + if (conn->ibc_max_frags <= frag_num) { + reason = "unsupported max frags"; + goto out; + } + + peer_ni->ibp_max_frags = frag_num; + reason = "rdma fragments"; + break; + } + case IBLND_REJECT_MSG_QUEUE_SIZE: + if (!cp) { + reason = "can't negotiate queue depth"; + goto out; + } + if (conn->ibc_queue_depth <= queue_dep) { + reason = "unsupported queue depth"; + goto out; + } + + peer_ni->ibp_queue_depth = queue_dep; + reason = "queue depth"; + break; + + case IBLND_REJECT_CONN_STALE: + reason = "stale"; + break; + + case IBLND_REJECT_CONN_RACE: + reason = "conn race"; + break; + + case IBLND_REJECT_CONN_UNCOMPAT: + reason = "version negotiation"; + break; + } + + conn->ibc_reconnect = 1; + peer_ni->ibp_reconnecting++; + peer_ni->ibp_version = version; + if (incarnation != 0) + peer_ni->ibp_incarnation = incarnation; + out: + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + CNETERR("%s: %s (%s), %x, %x, msg_size: %d, queue_depth: %d/%d, max_frags: %d/%d\n", + libcfs_nid2str(peer_ni->ibp_nid), + reconnect ? "reconnect" : "don't reconnect", + reason, IBLND_MSG_VERSION, version, msg_size, + conn->ibc_queue_depth, queue_dep, + conn->ibc_max_frags, frag_num); + /* + * if conn::ibc_reconnect is TRUE, connd will reconnect to the peer_ni + * while destroying the zombie + */ +} + +static void +kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) +{ + struct kib_peer_ni *peer_ni = conn->ibc_peer; + int status = -ECONNREFUSED; + + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + switch (reason) { + case IB_CM_REJ_STALE_CONN: + kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0, + IBLND_REJECT_CONN_STALE, NULL); + break; + + case IB_CM_REJ_INVALID_SERVICE_ID: + status = -EHOSTUNREACH; + CNETERR("%s rejected: no listener at %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + *kiblnd_tunables.kib_service); + break; + + case IB_CM_REJ_CONSUMER_DEFINED: + if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) { + struct kib_rej *rej = priv; + struct kib_connparams *cp = NULL; + bool flip = false; + __u64 incarnation = -1; + + /* NB. default incarnation is -1 because: + * a) V1 will ignore dst incarnation in connreq. + * b) V2 will provide incarnation while rejecting me, + * -1 will be overwrote. + * + * if I try to connect to a V1 peer_ni with V2 protocol, + * it rejected me then upgrade to V2, I have no idea + * about the upgrading and try to reconnect with V1, + * in this case upgraded V2 can find out I'm trying to + * talk to the old guy and reject me(incarnation is -1). + */ + + if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) || + rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) { + __swab32s(&rej->ibr_magic); + __swab16s(&rej->ibr_version); + flip = true; + } + + if (priv_nob >= sizeof(struct kib_rej) && + rej->ibr_version > IBLND_MSG_VERSION_1) { + /* priv_nob is always 148 in current version + * of OFED, so we still need to check version. 
+ * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) + */ + cp = &rej->ibr_cp; + + if (flip) { + __swab64s(&rej->ibr_incarnation); + __swab16s(&cp->ibcp_queue_depth); + __swab16s(&cp->ibcp_max_frags); + __swab32s(&cp->ibcp_max_msg_size); + } + + incarnation = rej->ibr_incarnation; + } + + if (rej->ibr_magic != IBLND_MSG_MAGIC && + rej->ibr_magic != LNET_PROTO_MAGIC) { + CERROR("%s rejected: consumer defined fatal error\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + } + + if (rej->ibr_version != IBLND_MSG_VERSION && + rej->ibr_version != IBLND_MSG_VERSION_1) { + CERROR("%s rejected: o2iblnd version %x error\n", + libcfs_nid2str(peer_ni->ibp_nid), + rej->ibr_version); + break; + } + + if (rej->ibr_why == IBLND_REJECT_FATAL && + rej->ibr_version == IBLND_MSG_VERSION_1) { + CDEBUG(D_NET, "rejected by old version peer_ni %s: %x\n", + libcfs_nid2str(peer_ni->ibp_nid), + rej->ibr_version); + + if (conn->ibc_version != IBLND_MSG_VERSION_1) + rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT; + } + + switch (rej->ibr_why) { + case IBLND_REJECT_CONN_RACE: + case IBLND_REJECT_CONN_STALE: + case IBLND_REJECT_CONN_UNCOMPAT: + case IBLND_REJECT_MSG_QUEUE_SIZE: + case IBLND_REJECT_RDMA_FRAGS: + kiblnd_check_reconnect(conn, rej->ibr_version, + incarnation, + rej->ibr_why, cp); + break; + + case IBLND_REJECT_NO_RESOURCES: + CERROR("%s rejected: o2iblnd no resources\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + + case IBLND_REJECT_FATAL: + CERROR("%s rejected: o2iblnd fatal error\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + + default: + CERROR("%s rejected: o2iblnd reason %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + rej->ibr_why); + break; + } + break; + } + fallthrough; + default: + CNETERR("%s rejected: reason %d, size %d\n", + libcfs_nid2str(peer_ni->ibp_nid), reason, priv_nob); + break; + } + + kiblnd_connreq_done(conn, status); +} + +static void +kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob) +{ + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct lnet_ni *ni = peer_ni->ibp_ni; + struct kib_net *net = ni->ni_data; + struct kib_msg *msg = priv; + int ver = conn->ibc_version; + int rc = kiblnd_unpack_msg(msg, priv_nob); + unsigned long flags; + + LASSERT (net != NULL); + + if (rc != 0) { + CERROR("Can't unpack connack from %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + goto failed; + } + + if (msg->ibm_type != IBLND_MSG_CONNACK) { + CERROR("Unexpected message %d from %s\n", + msg->ibm_type, libcfs_nid2str(peer_ni->ibp_nid)); + rc = -EPROTO; + goto failed; + } + + if (ver != msg->ibm_version) { + CERROR("%s replied version %x is different with " + "requested version %x\n", + libcfs_nid2str(peer_ni->ibp_nid), msg->ibm_version, ver); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_queue_depth > + conn->ibc_queue_depth) { + CERROR("%s has incompatible queue depth %d (<=%d wanted)\n", + libcfs_nid2str(peer_ni->ibp_nid), + msg->ibm_u.connparams.ibcp_queue_depth, + conn->ibc_queue_depth); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_max_frags > + conn->ibc_max_frags) { + CERROR("%s has incompatible max_frags %d (<=%d wanted)\n", + libcfs_nid2str(peer_ni->ibp_nid), + msg->ibm_u.connparams.ibcp_max_frags, + conn->ibc_max_frags); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("%s max message size %d too big (%d max)\n", + libcfs_nid2str(peer_ni->ibp_nid), + msg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + rc = -EPROTO; + goto failed; + } + + 
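/* under the global lock, check the reply is addressed to this NI and matches its current incarnation; anything else is treated as stale */ +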
read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + if (msg->ibm_dstnid == lnet_nid_to_nid4(&ni->ni_nid) && + msg->ibm_dststamp == net->ibn_incarnation) + rc = 0; + else + rc = -ESTALE; + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (rc != 0) { + CERROR("Bad connection reply from %s, rc = %d, " + "version: %x max_frags: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc, + msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags); + goto failed; + } + + conn->ibc_incarnation = msg->ibm_srcstamp; + conn->ibc_credits = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_queue_depth = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags; + LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + + IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn)); + + kiblnd_connreq_done(conn, 0); + return; + + failed: + /* NB My QP has already established itself, so I handle anything going + * wrong here by setting ibc_comms_error. + * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then + * immediately tears it down. */ + + LASSERT (rc != 0); + conn->ibc_comms_error = rc; + kiblnd_connreq_done(conn, 0); +} + +static int +kiblnd_active_connect(struct rdma_cm_id *cmid) +{ + struct kib_peer_ni *peer_ni = cmid->context; + struct kib_conn *conn; + struct kib_msg *msg; + struct rdma_conn_param cp; + int version; + __u64 incarnation; + unsigned long flags; + int rc; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + incarnation = peer_ni->ibp_incarnation; + version = (peer_ni->ibp_version == 0) ? IBLND_MSG_VERSION : + peer_ni->ibp_version; + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + conn = kiblnd_create_conn(peer_ni, cmid, IBLND_CONN_ACTIVE_CONNECT, + version); + if (conn == NULL) { + kiblnd_peer_connect_failed(peer_ni, 1, -ENOMEM); + kiblnd_peer_decref(peer_ni); /* lose cmid's ref */ + return -ENOMEM; + } + + /* conn "owns" cmid now, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. 
conn also takes over cmid's ref + * on peer_ni */ + + msg = &conn->ibc_connvars->cv_msg; + + memset(msg, 0, sizeof(*msg)); + kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); + msg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; + msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; + msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + + kiblnd_pack_msg(peer_ni->ibp_ni, msg, version, + 0, peer_ni->ibp_nid, incarnation); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = msg; + cp.private_data_len = msg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + LASSERT(cmid->context == (void *)conn); + LASSERT(conn->ibc_cmid == cmid); + rc = rdma_connect_locked(cmid, &cp); + if (rc != 0) { + CERROR("Can't connect to %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + return 0; +} + +int +kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) +{ + struct kib_peer_ni *peer_ni; + struct kib_conn *conn; + int rc; + + switch (event->event) { + default: + CERROR("Unexpected event: %d, status: %d\n", + event->event, event->status); + LBUG(); + + case RDMA_CM_EVENT_CONNECT_REQUEST: + /* destroy cmid on failure */ + rc = kiblnd_passive_connect(cmid, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + CDEBUG(D_NET, "connreq: %d\n", rc); + return rc; + + case RDMA_CM_EVENT_ADDR_ERROR: + peer_ni = cmid->context; + CNETERR("%s: ADDR ERROR %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer_ni); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ADDR_RESOLVED: + peer_ni = cmid->context; + + CDEBUG(D_NET,"%s Addr resolved: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + + if (event->status != 0) { + CNETERR("Can't resolve address for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + rc = event->status; + } else { + rc = rdma_resolve_route( + cmid, kiblnd_timeout() * 1000); + if (rc == 0) { + struct kib_net *net = peer_ni->ibp_ni->ni_data; + struct kib_dev *dev = net->ibn_dev; + + CDEBUG(D_NET, "%s: connection bound to "\ + "%s:%pI4h:%s\n", + libcfs_nid2str(peer_ni->ibp_nid), + dev->ibd_ifname, + &dev->ibd_ifip, cmid->device->name); + + return 0; + } + + /* Can't initiate route resolution */ + CERROR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + } + kiblnd_peer_connect_failed(peer_ni, 1, rc); + kiblnd_peer_decref(peer_ni); + return rc; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_ERROR: + peer_ni = cmid->context; + CNETERR("%s: ROUTE ERROR %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer_ni); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + peer_ni = cmid->context; + CDEBUG(D_NET,"%s Route resolved: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + + if (event->status == 0) + return kiblnd_active_connect(cmid); + + CNETERR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer_ni, 1, event->status); + kiblnd_peer_decref(peer_ni); + return event->status; /* rc != 0 destroys cmid */ 
+ + case RDMA_CM_EVENT_UNREACHABLE: + conn = cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CNETERR("%s: UNREACHABLE %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); + kiblnd_connreq_done(conn, -ENETDOWN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_CONNECT_ERROR: + conn = cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CNETERR("%s: CONNECT ERROR %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); + kiblnd_connreq_done(conn, -ENOTCONN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_REJECTED: + conn = cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CERROR ("%s: REJECTED %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + event->status); + kiblnd_connreq_done(conn, -ECONNRESET); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + kiblnd_rejected(conn, event->status, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + break; + } + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_ESTABLISHED: + conn = cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CDEBUG(D_NET, "ESTABLISHED (passive): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, 0); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + CDEBUG(D_NET, "ESTABLISHED(active): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_check_connreply(conn, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + break; + } + /* net keeps its ref on conn! */ + return 0; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n"); + return 0; + + case RDMA_CM_EVENT_DISCONNECTED: + conn = cmid->context; + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CERROR("%s DISCONNECTED\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, -ECONNRESET); + } else { + kiblnd_close_conn(conn, 0); + } + kiblnd_conn_decref(conn); + cmid->context = NULL; + return 0; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + LCONSOLE_ERROR_MSG(0x131, + "Received notification of device removal\n" + "Please shutdown LNET to allow this to proceed\n"); + /* Can't remove network from underneath LNET for now, so I have + * to ignore this */ + return 0; + + case RDMA_CM_EVENT_ADDR_CHANGE: + LCONSOLE_INFO("Physical link changed (eg hca/port)\n"); + return 0; + } +} + +static int +kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) +{ + struct kib_tx *tx; + + list_for_each_entry(tx, txs, tx_list) { + if (txs != &conn->ibc_active_txs) { + LASSERT(tx->tx_queued); + } else { + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_waiting || tx->tx_sending != 0); + } + + if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { + CERROR("Timed out tx: %s(WSQ:%d%d%d), %lld seconds\n", + kiblnd_queue2str(conn, txs), + tx->tx_waiting, tx->tx_sending, tx->tx_queued, + kiblnd_timeout() + + ktime_ms_delta(ktime_get(), + tx->tx_deadline) / MSEC_PER_SEC); + return 1; + } + } + + return 0; +} + +static int +kiblnd_conn_timed_out_locked(struct kib_conn *conn) +{ + return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) || + kiblnd_check_txs_locked(conn, &conn->ibc_active_txs); +} + +static void 
+kiblnd_check_conns (int idx) +{ + LIST_HEAD(closes); + LIST_HEAD(checksends); + LIST_HEAD(timedout_txs); + struct hlist_head *peers = &kiblnd_data.kib_peers[idx]; + struct kib_peer_ni *peer_ni; + struct kib_conn *conn; + struct kib_tx *tx, *tx_tmp; + unsigned long flags; + + /* NB. We expect to have a look at all the peers and not find any + * RDMAs to time out, so we just use a shared lock while we + * take a look... + */ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + hlist_for_each_entry(peer_ni, peers, ibp_list) { + /* Check tx_deadline */ + list_for_each_entry_safe(tx, tx_tmp, &peer_ni->ibp_tx_queue, tx_list) { + if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { + CWARN("Timed out tx for %s: %lld seconds\n", + libcfs_nid2str(peer_ni->ibp_nid), + ktime_ms_delta(ktime_get(), + tx->tx_deadline) / MSEC_PER_SEC); + list_move(&tx->tx_list, &timedout_txs); + } + } + + list_for_each_entry(conn, &peer_ni->ibp_conns, ibc_list) { + int timedout; + int sendnoop; + + LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED); + + spin_lock(&conn->ibc_lock); + + sendnoop = kiblnd_need_noop(conn); + timedout = kiblnd_conn_timed_out_locked(conn); + if (!sendnoop && !timedout) { + spin_unlock(&conn->ibc_lock); + continue; + } + + if (timedout) { + CERROR("Timed out RDMA with %s (%lld): c: %u, oc: %u, rc: %u\n", + libcfs_nid2str(peer_ni->ibp_nid), + ktime_get_seconds() + - peer_ni->ibp_last_alive, + conn->ibc_credits, + conn->ibc_outstanding_credits, + conn->ibc_reserved_credits); + list_add(&conn->ibc_connd_list, &closes); + } else { + list_add(&conn->ibc_connd_list, &checksends); + } + /* +ref for 'closes' or 'checksends' */ + kiblnd_conn_addref(conn); + + spin_unlock(&conn->ibc_lock); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (!list_empty(&timedout_txs)) + kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT, + LNET_MSG_STATUS_NETWORK_TIMEOUT); + + /* Handle timeout by closing the whole + * connection. We can only be sure RDMA activity + * has ceased once the QP has been modified. + */ + while ((conn = list_first_entry_or_null(&closes, + struct kib_conn, + ibc_connd_list)) != NULL) { + list_del(&conn->ibc_connd_list); + kiblnd_close_conn(conn, -ETIMEDOUT); + kiblnd_conn_decref(conn); + } + + /* In case we have enough credits to return via a + * NOOP, but there were no non-blocking tx descs + * free to do it last time... + */ + while ((conn = list_first_entry_or_null(&checksends, + struct kib_conn, + ibc_connd_list)) != NULL) { + list_del(&conn->ibc_connd_list); + + spin_lock(&conn->ibc_lock); + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + + kiblnd_conn_decref(conn); + } +} + +static void +kiblnd_disconnect_conn(struct kib_conn *conn) +{ + LASSERT (!in_interrupt()); + LASSERT (current == kiblnd_data.kib_connd); + LASSERT (conn->ibc_state == IBLND_CONN_CLOSING); + + rdma_disconnect(conn->ibc_cmid); + kiblnd_finalise_conn(conn); + + kiblnd_peer_notify(conn->ibc_peer); +} + +/* + * High-water for reconnection to the same peer_ni, reconnection attempt should + * be delayed after trying more than KIB_RECONN_HIGH_RACE. + */ +#define KIB_RECONN_HIGH_RACE 10 +/* + * Allow connd to take a break and handle other things after consecutive + * reconnection attemps. 
+ */ +#define KIB_RECONN_BREAK 100 + +int +kiblnd_connd (void *arg) +{ + spinlock_t *lock = &kiblnd_data.kib_connd_lock; + wait_queue_entry_t wait; + unsigned long flags; + struct kib_conn *conn; + int timeout; + int i; + bool dropped_lock; + int peer_index = 0; + unsigned long deadline = jiffies; + + init_wait(&wait); + kiblnd_data.kib_connd = current; + + spin_lock_irqsave(lock, flags); + + while (!kiblnd_data.kib_shutdown) { + int reconn = 0; + + dropped_lock = false; + + conn = list_first_entry_or_null(&kiblnd_data.kib_connd_zombies, + struct kib_conn, ibc_list); + if (conn) { + struct kib_peer_ni *peer_ni = NULL; + + list_del(&conn->ibc_list); + if (conn->ibc_reconnect) { + peer_ni = conn->ibc_peer; + kiblnd_peer_addref(peer_ni); + } + + spin_unlock_irqrestore(lock, flags); + dropped_lock = true; + + kiblnd_destroy_conn(conn); + + spin_lock_irqsave(lock, flags); + if (!peer_ni) { + LIBCFS_FREE(conn, sizeof(*conn)); + continue; + } + + conn->ibc_peer = peer_ni; + if (peer_ni->ibp_reconnected < KIB_RECONN_HIGH_RACE) + list_add_tail(&conn->ibc_list, + &kiblnd_data.kib_reconn_list); + else + list_add_tail(&conn->ibc_list, + &kiblnd_data.kib_reconn_wait); + } + + conn = list_first_entry_or_null(&kiblnd_data.kib_connd_conns, + struct kib_conn, ibc_list); + if (conn) { + int wait; + + list_del(&conn->ibc_list); + + spin_unlock_irqrestore(lock, flags); + dropped_lock = true; + + kiblnd_disconnect_conn(conn); + wait = conn->ibc_waits; + if (wait == 0) /* keep ref for connd_wait, see below */ + kiblnd_conn_decref(conn); + + spin_lock_irqsave(lock, flags); + + if (wait) + list_add_tail(&conn->ibc_list, + &kiblnd_data.kib_connd_waits); + } + + while (reconn < KIB_RECONN_BREAK) { + if (kiblnd_data.kib_reconn_sec != + ktime_get_real_seconds()) { + kiblnd_data.kib_reconn_sec = ktime_get_real_seconds(); + list_splice_init(&kiblnd_data.kib_reconn_wait, + &kiblnd_data.kib_reconn_list); + } + + conn = list_first_entry_or_null(&kiblnd_data.kib_reconn_list, + struct kib_conn, ibc_list); + if (!conn) + break; + + list_del(&conn->ibc_list); + + spin_unlock_irqrestore(lock, flags); + dropped_lock = true; + + reconn += kiblnd_reconnect_peer(conn->ibc_peer); + kiblnd_peer_decref(conn->ibc_peer); + LIBCFS_FREE(conn, sizeof(*conn)); + + spin_lock_irqsave(lock, flags); + } + + conn = list_first_entry_or_null(&kiblnd_data.kib_connd_waits, + struct kib_conn, ibc_list); + if (conn) { + list_del(&conn->ibc_list); + spin_unlock_irqrestore(lock, flags); + + dropped_lock = kiblnd_tx_may_discard(conn); + if (dropped_lock) + kiblnd_conn_decref(conn); + + spin_lock_irqsave(lock, flags); + if (!dropped_lock) + list_add_tail(&conn->ibc_list, + &kiblnd_data.kib_connd_waits); + } + + /* careful with the jiffy wrap... */ + timeout = (int)(deadline - jiffies); + if (timeout <= 0) { + const int n = 4; + const int p = 1; + int chunk = HASH_SIZE(kiblnd_data.kib_peers); + unsigned int lnd_timeout; + + spin_unlock_irqrestore(lock, flags); + dropped_lock = true; + + /* Time to check for RDMA timeouts on a few more + * peers: I do checks every 'p' seconds on a + * proportion of the peer_ni table and I need to check + * every connection 'n' times within a timeout + * interval, to ensure I detect a timeout on any + * connection within (n+1)/n times the timeout + * interval. 
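+ * For example (illustrative numbers): with n = 4, p = 1 and a 50 + * second lnd timeout, chunk becomes HASH_SIZE * 4 / 50, i.e. about + * 8% of the peer table is scanned each second, so the whole table + * is covered roughly every 12.5 seconds, n times per timeout + * interval.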
+ */ + + lnd_timeout = kiblnd_timeout(); + if (lnd_timeout > n * p) + chunk = (chunk * n * p) / lnd_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + kiblnd_check_conns(peer_index); + peer_index = (peer_index + 1) % + HASH_SIZE(kiblnd_data.kib_peers); + } + + deadline += cfs_time_seconds(p); + spin_lock_irqsave(lock, flags); + } + + if (dropped_lock) + continue; + + /* Nothing to do for 'timeout' */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); + spin_unlock_irqrestore(lock, flags); + + schedule_timeout(timeout); + + remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); + spin_lock_irqsave(lock, flags); + } + + spin_unlock_irqrestore(lock, flags); + + kiblnd_thread_fini(); + return 0; +} + +void +kiblnd_qp_event(struct ib_event *event, void *arg) +{ + struct kib_conn *conn = arg; + + switch (event->event) { + case IB_EVENT_COMM_EST: + CDEBUG(D_NET, "%s established\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* We received a packet but connection isn't established + * probably handshake packet was lost, so free to + * force make connection established */ + rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST); + return; + + case IB_EVENT_PORT_ERR: + case IB_EVENT_DEVICE_FATAL: + CERROR("Fatal device error for NI %s\n", + libcfs_nidstr(&conn->ibc_peer->ibp_ni->ni_nid)); + atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 1); + return; + + case IB_EVENT_PORT_ACTIVE: + CERROR("Port reactivated for NI %s\n", + libcfs_nidstr(&conn->ibc_peer->ibp_ni->ni_nid)); + atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 0); + return; + + default: + CERROR("%s: Async QP event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); + return; + } +} + +static void +kiblnd_complete (struct ib_wc *wc) +{ + switch (kiblnd_wreqid2type(wc->wr_id)) { + default: + LBUG(); + + case IBLND_WID_MR: + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) + CNETERR("FastReg failed: %d\n", wc->status); + return; + + case IBLND_WID_RDMA: + /* We only get RDMA completion notification if it fails. All + * subsequent work items, including the final SEND will fail + * too. However we can't print out any more info about the + * failing RDMA because 'tx' might be back on the idle list or + * even reused already if we didn't manage to post all our work + * items */ + CNETERR("RDMA (tx: %p) failed: %d\n", + kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_TX: + kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_RX: + kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status, + wc->byte_len); + return; + } +} + +void +kiblnd_cq_completion(struct ib_cq *cq, void *arg) +{ + /* NB I'm not allowed to schedule this conn once its refcount has + * reached 0. Since fundamentally I'm racing with scheduler threads + * consuming my CQ I could be called after all completions have + * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0 + * and this CQ is about to be destroyed so I NOOP. 
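+ * (the ibc_nrx / ibc_nsends_posted test below is what implements + * that NOOP)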
*/ + struct kib_conn *conn = arg; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; + + LASSERT(cq == conn->ibc_cq); + + spin_lock_irqsave(&sched->ibs_lock, flags); + + conn->ibc_ready = 1; + + if (!conn->ibc_scheduled && + (conn->ibc_nrx > 0 || + conn->ibc_nsends_posted > 0)) { + kiblnd_conn_addref(conn); /* +1 ref for sched_conns */ + conn->ibc_scheduled = 1; + list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns); + + if (waitqueue_active(&sched->ibs_waitq)) + wake_up(&sched->ibs_waitq); + } + + spin_unlock_irqrestore(&sched->ibs_lock, flags); +} + +void +kiblnd_cq_event(struct ib_event *event, void *arg) +{ + struct kib_conn *conn = arg; + + CERROR("%s: async CQ event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); +} + +int +kiblnd_scheduler(void *arg) +{ + long id = (long)arg; + struct kib_sched_info *sched; + struct kib_conn *conn; + wait_queue_entry_t wait; + unsigned long flags; + struct ib_wc wc; + bool did_something; + int rc; + + init_wait(&wait); + + sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)]; + + rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt); + if (rc != 0) { + CWARN("Unable to bind on CPU partition %d, please verify whether all CPUs are healthy and reload modules if necessary, otherwise your system might under risk of low performance\n", sched->ibs_cpt); + } + + spin_lock_irqsave(&sched->ibs_lock, flags); + + while (!kiblnd_data.kib_shutdown) { + if (need_resched()) { + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + cond_resched(); + + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + did_something = false; + + conn = list_first_entry_or_null(&sched->ibs_conns, + struct kib_conn, + ibc_sched_list); + if (conn) { + /* take over kib_sched_conns' ref on conn... */ + LASSERT(conn->ibc_scheduled); + list_del(&conn->ibc_sched_list); + conn->ibc_ready = 0; + + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + wc.wr_id = IBLND_WID_INVAL; + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + if (rc == 0) { + rc = ib_req_notify_cq(conn->ibc_cq, + IB_CQ_NEXT_COMP); + if (rc < 0) { + CWARN("%s: ib_req_notify_cq failed: %d, closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + spin_lock_irqsave(&sched->ibs_lock, + flags); + continue; + } + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + } + + if (unlikely(rc > 0 && wc.wr_id == IBLND_WID_INVAL)) { + LCONSOLE_ERROR( + "ib_poll_cq (rc: %d) returned invalid " + "wr_id, opcode %d, status: %d, " + "vendor_err: %d, conn: %s status: %d\n" + "please upgrade firmware and OFED or " + "contact vendor.\n", rc, + wc.opcode, wc.status, wc.vendor_err, + libcfs_nid2str(conn->ibc_peer->ibp_nid), + conn->ibc_state); + rc = -EINVAL; + } + + if (rc < 0) { + CWARN("%s: ib_poll_cq failed: %d, closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + spin_lock_irqsave(&sched->ibs_lock, flags); + continue; + } + + spin_lock_irqsave(&sched->ibs_lock, flags); + + if (rc != 0 || conn->ibc_ready) { + /* There may be another completion waiting; get + * another scheduler to check while I handle + * this one... 
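+ * (requeue the conn with a fresh ref so another scheduler thread + * can poll this CQ while I process the current completion)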
*/ + /* +1 ref for sched_conns */ + kiblnd_conn_addref(conn); + list_add_tail(&conn->ibc_sched_list, + &sched->ibs_conns); + if (waitqueue_active(&sched->ibs_waitq)) + wake_up(&sched->ibs_waitq); + } else { + conn->ibc_scheduled = 0; + } + + if (rc != 0) { + spin_unlock_irqrestore(&sched->ibs_lock, flags); + kiblnd_complete(&wc); + + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + kiblnd_conn_decref(conn); /* ..drop my ref from above */ + did_something = true; + } + + if (did_something) + continue; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&sched->ibs_waitq, &wait); + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + schedule(); + + remove_wait_queue(&sched->ibs_waitq, &wait); + set_current_state(TASK_RUNNING); + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_thread_fini(); + return 0; +} + +int +kiblnd_failover_thread(void *arg) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + struct kib_dev *dev; + struct net *ns = arg; + wait_queue_entry_t wait; + unsigned long flags; + int rc; + + LASSERT(*kiblnd_tunables.kib_dev_failover != 0); + + init_wait(&wait); + write_lock_irqsave(glock, flags); + + while (!kiblnd_data.kib_shutdown) { + bool do_failover = false; + int long_sleep; + + list_for_each_entry(dev, &kiblnd_data.kib_failed_devs, + ibd_fail_list) { + if (ktime_get_seconds() < dev->ibd_next_failover) + continue; + do_failover = true; + break; + } + + if (do_failover) { + list_del_init(&dev->ibd_fail_list); + dev->ibd_failover = 1; + write_unlock_irqrestore(glock, flags); + + rc = kiblnd_dev_failover(dev, ns); + + write_lock_irqsave(glock, flags); + + LASSERT(dev->ibd_failover); + dev->ibd_failover = 0; + if (rc >= 0) { /* Device is OK or failover succeed */ + dev->ibd_next_failover = ktime_get_seconds() + 3; + continue; + } + + /* failed to failover, retry later */ + dev->ibd_next_failover = ktime_get_seconds() + + min(dev->ibd_failed_failover, 10); + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + } + + continue; + } + + /* long sleep if no more pending failover */ + long_sleep = list_empty(&kiblnd_data.kib_failed_devs); + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); + write_unlock_irqrestore(glock, flags); + + rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) : + cfs_time_seconds(1)); + set_current_state(TASK_RUNNING); + remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); + write_lock_irqsave(glock, flags); + + if (!long_sleep || rc != 0) + continue; + + /* have a long sleep, routine check all active devices, + * we need checking like this because if there is not active + * connection on the dev and no SEND from local, we may listen + * on wrong HCA for ever while there is a bonding failover + */ + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + } + } + } + + write_unlock_irqrestore(glock, flags); + + kiblnd_thread_fini(); + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c new file mode 100644 index 0000000000000..95e72002c1c74 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -0,0 +1,332 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/klnds/o2iblnd/o2iblnd_modparams.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +#define CURRENT_LND_VERSION 1 + +static int service = 987; +module_param(service, int, 0444); +MODULE_PARM_DESC(service, "service number (within RDMA_PS_TCP)"); + +static int cksum = 0; +module_param(cksum, int, 0644); +MODULE_PARM_DESC(cksum, "set non-zero to enable message (not RDMA) checksums"); + +static int timeout; +module_param(timeout, int, 0644); +MODULE_PARM_DESC(timeout, "timeout (seconds)"); + +/* Number of threads in each scheduler pool which is percpt, + * we will estimate reasonable value based on CPUs if it's set to zero. */ +static int nscheds; +module_param(nscheds, int, 0444); +MODULE_PARM_DESC(nscheds, "number of threads in each scheduler pool"); + +static unsigned int conns_per_peer = 1; +module_param(conns_per_peer, uint, 0444); +MODULE_PARM_DESC(conns_per_peer, "number of connections per peer"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int ntx = 512; +module_param(ntx, int, 0444); +MODULE_PARM_DESC(ntx, "# of message descriptors allocated for each pool"); + +/* NB: this value is shared by all CPTs */ +static int credits = DEFAULT_CREDITS; +module_param(credits, int, 0444); +MODULE_PARM_DESC(credits, "# concurrent sends"); + +static int peer_credits = DEFAULT_PEER_CREDITS; +module_param(peer_credits, int, 0444); +MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); + +static int peer_credits_hiw = 0; +module_param(peer_credits_hiw, int, 0444); +MODULE_PARM_DESC(peer_credits_hiw, "when eagerly to return credits"); + +static int peer_buffer_credits = 0; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); + +static int peer_timeout = DEFAULT_PEER_TIMEOUT; +module_param(peer_timeout, int, 0444); +MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +static char *ipif_name = "ib0"; +module_param(ipif_name, charp, 0444); +MODULE_PARM_DESC(ipif_name, "IPoIB interface name"); + +static int retry_count = 5; +module_param(retry_count, int, 0644); +MODULE_PARM_DESC(retry_count, "Number of times to retry connection operations"); + +static int rnr_retry_count = 6; +module_param(rnr_retry_count, int, 0644); +MODULE_PARM_DESC(rnr_retry_count, "RNR retransmissions"); + +static int keepalive = 100; +module_param(keepalive, int, 0644); +MODULE_PARM_DESC(keepalive, "Idle time in seconds before sending a keepalive"); + +static int ib_mtu; +module_param(ib_mtu, 
int, 0444); +MODULE_PARM_DESC(ib_mtu, "IB MTU 256/512/1024/2048/4096"); + +static int concurrent_sends; +module_param(concurrent_sends, int, 0444); +MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing"); + +static int use_fastreg_gaps; +module_param(use_fastreg_gaps, int, 0444); +MODULE_PARM_DESC(use_fastreg_gaps, "Enable discontiguous fastreg fragment support. Expect performance drop"); + +/* + * map_on_demand is a flag used to determine if we can use FMR or FastReg. + * This is applicable for kernels which support global memory regions. For + * later kernels this flag is always enabled, since we will always either + * use FMR or FastReg. + * For kernels which support global memory regions map_on_demand defaults + * to 0 which means we will be using global memory regions exclusively. + * If it is set to a value other than 0, then we will behave as follows: + * 1. Always default the number of fragments to IBLND_MAX_RDMA_FRAGS + * 2. Create FMR/FastReg pools + * 3. Negotiate the supported number of fragments per connection + * 4. Attempt to transmit using global memory regions only if + * map-on-demand is not turned on, otherwise use FMR or FastReg + * 5. In case of transmitting tx with GAPS over FMR we will need to + * transmit it with multiple fragments. Look at the comments in + * kiblnd_fmr_map_tx() for an explanation of the behavior. + * + * For later kernels we default map_on_demand to 1 and not allow + * it to be set to 0, since there is no longer support for global memory + * regions. Behavior: + * 1. Default the number of fragments to IBLND_MAX_RDMA_FRAGS + * 2. Create FMR/FastReg pools + * 3. Negotiate the supported number of fragments per connection + * 4. Look at the comments in kiblnd_fmr_map_tx() for an explanation of + * the behavior when transmitting with GAPS versus contiguous.
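[Editorial note] All of the tunables declared in o2iblnd_modparams.c follow the same standard kernel pattern: a static variable, module_param() with a sysfs permission mask (0444 read-only, 0644 writable at runtime under /sys/module/<module>/parameters/), and MODULE_PARM_DESC() supplying the text shown by modinfo. A stand-alone sketch with a hypothetical parameter name, for orientation only:

#include <linux/module.h>
#include <linux/moduleparam.h>

/* Hypothetical tunable; readable and root-writable via
 * /sys/module/<module>/parameters/example_timeout. */
static int example_timeout = 50;
module_param(example_timeout, int, 0644);
MODULE_PARM_DESC(example_timeout, "example timeout (seconds)");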
+ */ +#ifdef HAVE_IB_GET_DMA_MR +#define MOD_STR "map on demand" +#else +#define MOD_STR "map on demand (obsolete)" +#endif +static int map_on_demand = 1; +module_param(map_on_demand, int, 0444); +MODULE_PARM_DESC(map_on_demand, MOD_STR); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int fmr_pool_size = 512; +module_param(fmr_pool_size, int, 0444); +MODULE_PARM_DESC(fmr_pool_size, "size of fmr pool on each CPT (>= ntx / 4)"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int fmr_flush_trigger = 384; +module_param(fmr_flush_trigger, int, 0444); +MODULE_PARM_DESC(fmr_flush_trigger, "# dirty FMRs that triggers pool flush"); + +static int fmr_cache = 1; +module_param(fmr_cache, int, 0444); +MODULE_PARM_DESC(fmr_cache, "non-zero to enable FMR caching"); + +/* + * 0: disable failover + * 1: enable failover if necessary + * 2: force to failover (for debug) + */ +static int dev_failover = 0; +module_param(dev_failover, int, 0444); +MODULE_PARM_DESC(dev_failover, "HCA failover for bonding (0 off, 1 on, other values reserved)"); + +static int require_privileged_port; +module_param(require_privileged_port, int, 0644); +MODULE_PARM_DESC(require_privileged_port, "require privileged port when accepting connection"); + +static int use_privileged_port = 1; +module_param(use_privileged_port, int, 0644); +MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection"); + +static unsigned int wrq_sge = 2; +module_param(wrq_sge, uint, 0444); +MODULE_PARM_DESC(wrq_sge, "# scatter/gather element per work request"); + +struct kib_tunables kiblnd_tunables = { + .kib_dev_failover = &dev_failover, + .kib_service = &service, + .kib_cksum = &cksum, + .kib_timeout = &timeout, + .kib_keepalive = &keepalive, + .kib_default_ipif = &ipif_name, + .kib_retry_count = &retry_count, + .kib_rnr_retry_count = &rnr_retry_count, + .kib_ib_mtu = &ib_mtu, + .kib_require_priv_port = &require_privileged_port, + .kib_use_priv_port = &use_privileged_port, + .kib_nscheds = &nscheds, + .kib_wrq_sge = &wrq_sge, + .kib_use_fastreg_gaps = &use_fastreg_gaps, +}; + +static struct lnet_ioctl_config_o2iblnd_tunables default_tunables; + +/* # messages/RDMAs in-flight */ +int +kiblnd_msg_queue_size(int version, struct lnet_ni *ni) +{ + if (version == IBLND_MSG_VERSION_1) + return IBLND_MSG_QUEUE_SIZE_V1; + else if (ni) + return ni->ni_net->net_tunables.lct_peer_tx_credits; + else + return peer_credits; +} + +int +kiblnd_tunables_setup(struct lnet_ni *ni) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables; + + /* + * if there was no tunables specified, setup the tunables to be + * defaulted + */ + if (!ni->ni_lnd_tunables_set) + memcpy(&ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib, + &default_tunables, sizeof(*tunables)); + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + + /* Current API version */ + tunables->lnd_version = CURRENT_LND_VERSION; + + if (*kiblnd_tunables.kib_ib_mtu && + ib_mtu_enum_to_int(ib_mtu_int_to_enum(*kiblnd_tunables.kib_ib_mtu)) != + *kiblnd_tunables.kib_ib_mtu) { + CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n", + *kiblnd_tunables.kib_ib_mtu); + return -EINVAL; + } + + net_tunables = &ni->ni_net->net_tunables; + + if (net_tunables->lct_peer_timeout == -1) + net_tunables->lct_peer_timeout = peer_timeout; + + if (net_tunables->lct_max_tx_credits == -1) + net_tunables->lct_max_tx_credits = credits; + + if (net_tunables->lct_peer_tx_credits == -1) + 
net_tunables->lct_peer_tx_credits = peer_credits; + + if (net_tunables->lct_peer_rtr_credits == -1) + net_tunables->lct_peer_rtr_credits = peer_buffer_credits; + + if (net_tunables->lct_peer_tx_credits < IBLND_CREDITS_DEFAULT) + net_tunables->lct_peer_tx_credits = IBLND_CREDITS_DEFAULT; + + if (net_tunables->lct_peer_tx_credits > IBLND_CREDITS_MAX) + net_tunables->lct_peer_tx_credits = IBLND_CREDITS_MAX; + + if (net_tunables->lct_peer_tx_credits > + net_tunables->lct_max_tx_credits) + net_tunables->lct_peer_tx_credits = + net_tunables->lct_max_tx_credits; + +#ifndef HAVE_IB_GET_DMA_MR + /* + * For kernels which do not support global memory regions, always + * enable map_on_demand + */ + if (tunables->lnd_map_on_demand == 0) + tunables->lnd_map_on_demand = 1; +#endif + + if (!tunables->lnd_peercredits_hiw) + tunables->lnd_peercredits_hiw = peer_credits_hiw; + + if (tunables->lnd_peercredits_hiw < net_tunables->lct_peer_tx_credits / 2) + tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits / 2; + + if (tunables->lnd_peercredits_hiw >= net_tunables->lct_peer_tx_credits) + tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits - 1; + + if (tunables->lnd_concurrent_sends == 0) + tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits; + + if (tunables->lnd_concurrent_sends > net_tunables->lct_peer_tx_credits * 2) + tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits * 2; + + if (tunables->lnd_concurrent_sends < net_tunables->lct_peer_tx_credits / 2) + tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits / 2; + + if (tunables->lnd_concurrent_sends < net_tunables->lct_peer_tx_credits) { + CWARN("Concurrent sends %d is lower than message " + "queue size: %d, performance may drop slightly.\n", + tunables->lnd_concurrent_sends, + net_tunables->lct_peer_tx_credits); + } + + if (!tunables->lnd_fmr_pool_size) + tunables->lnd_fmr_pool_size = fmr_pool_size; + if (!tunables->lnd_fmr_flush_trigger) + tunables->lnd_fmr_flush_trigger = fmr_flush_trigger; + if (!tunables->lnd_fmr_cache) + tunables->lnd_fmr_cache = fmr_cache; + if (!tunables->lnd_ntx) + tunables->lnd_ntx = ntx; + if (!tunables->lnd_conns_per_peer) { + tunables->lnd_conns_per_peer = (conns_per_peer) ? 
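[Editorial note] kiblnd_tunables_setup() above clamps lnd_peercredits_hiw into the range [peer_tx_credits / 2, peer_tx_credits - 1] and lnd_concurrent_sends into [peer_tx_credits / 2, peer_tx_credits * 2]. A small sketch of the first clamp, with a worked value; the helper name is hypothetical and only mirrors the logic shown above:

/* Mirror of the peercredits_hiw clamp above (hypothetical helper). */
static int demo_clamp_hiw(int hiw, int peer_tx_credits)
{
	if (hiw < peer_tx_credits / 2)
		hiw = peer_tx_credits / 2;
	if (hiw >= peer_tx_credits)
		hiw = peer_tx_credits - 1;
	return hiw;	/* e.g. hiw = 0, peer_tx_credits = 8  ->  4 */
}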
+ conns_per_peer : 1; + } + + return 0; +} + +int +kiblnd_tunables_init(void) +{ + default_tunables.lnd_version = CURRENT_LND_VERSION; + default_tunables.lnd_peercredits_hiw = peer_credits_hiw; + default_tunables.lnd_map_on_demand = map_on_demand; + default_tunables.lnd_concurrent_sends = concurrent_sends; + default_tunables.lnd_fmr_pool_size = fmr_pool_size; + default_tunables.lnd_fmr_flush_trigger = fmr_flush_trigger; + default_tunables.lnd_fmr_cache = fmr_cache; + default_tunables.lnd_ntx = ntx; + default_tunables.lnd_conns_per_peer = conns_per_peer; + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/Makefile b/drivers/staging/lustrefsx/lnet/klnds/socklnd/Makefile new file mode 100644 index 0000000000000..6e6ec925b891f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += ksocklnd.o + +ksocklnd-y := socklnd.o socklnd_cb.o socklnd_lib.o +ksocklnd-y += socklnd_modparams.o socklnd_proto.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c new file mode 100644 index 0000000000000..9ea8c318c8190 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c @@ -0,0 +1,2596 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/klnds/socklnd/socklnd.c + * + * Author: Zach Brown + * Author: Peter J. 
Braam + * Author: Phil Schwan + * Author: Eric Barton + */ + +#include +#include +#include "socklnd.h" +#include + +static const struct lnet_lnd the_ksocklnd; +struct ksock_nal_data ksocknal_data; + +static struct ksock_interface * +ksocknal_ip2iface(struct lnet_ni *ni, struct sockaddr *addr) +{ + struct ksock_net *net = ni->ni_data; + struct ksock_interface *iface; + + iface = &net->ksnn_interface; + + if (rpc_cmp_addr((struct sockaddr *)&iface->ksni_addr, addr)) + return iface; + + return NULL; +} + +static struct ksock_interface * +ksocknal_index2iface(struct lnet_ni *ni, int index) +{ + struct ksock_net *net = ni->ni_data; + struct ksock_interface *iface; + + iface = &net->ksnn_interface; + + if (iface->ksni_index == index) + return iface; + + return NULL; +} + +static int ksocknal_ip2index(struct sockaddr *addr, struct lnet_ni *ni) +{ + struct net_device *dev; + int ret = -1; + DECLARE_CONST_IN_IFADDR(ifa); + + if (addr->sa_family != AF_INET) + /* No IPv6 support yet */ + return ret; + + rcu_read_lock(); + for_each_netdev(ni->ni_net_ns, dev) { + int flags = dev_get_flags(dev); + struct in_device *in_dev; + + if (flags & IFF_LOOPBACK) /* skip the loopback IF */ + continue; + + if (!(flags & IFF_UP)) + continue; + + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + continue; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (ifa->ifa_local == + ((struct sockaddr_in *)addr)->sin_addr.s_addr) + ret = dev->ifindex; + } + endfor_ifa(in_dev); + if (ret >= 0) + break; + } + rcu_read_unlock(); + + return ret; +} + +static struct ksock_conn_cb * +ksocknal_create_conn_cb(struct sockaddr *addr) +{ + struct ksock_conn_cb *conn_cb; + + LIBCFS_ALLOC(conn_cb, sizeof(*conn_cb)); + if (!conn_cb) + return NULL; + + refcount_set(&conn_cb->ksnr_refcount, 1); + conn_cb->ksnr_peer = NULL; + conn_cb->ksnr_retry_interval = 0; /* OK to connect at any time */ + rpc_copy_addr((struct sockaddr *)&conn_cb->ksnr_addr, addr); + rpc_set_port((struct sockaddr *)&conn_cb->ksnr_addr, + rpc_get_port(addr)); + conn_cb->ksnr_myiface = -1; + conn_cb->ksnr_scheduled = 0; + conn_cb->ksnr_connecting = 0; + conn_cb->ksnr_connected = 0; + conn_cb->ksnr_deleted = 0; + conn_cb->ksnr_conn_count = 0; + conn_cb->ksnr_ctrl_conn_count = 0; + conn_cb->ksnr_blki_conn_count = 0; + conn_cb->ksnr_blko_conn_count = 0; + conn_cb->ksnr_max_conns = 0; + + return conn_cb; +} + +void +ksocknal_destroy_conn_cb(struct ksock_conn_cb *conn_cb) +{ + LASSERT(refcount_read(&conn_cb->ksnr_refcount) == 0); + + if (conn_cb->ksnr_peer) + ksocknal_peer_decref(conn_cb->ksnr_peer); + + LIBCFS_FREE(conn_cb, sizeof(*conn_cb)); +} + +static struct ksock_peer_ni * +ksocknal_create_peer(struct lnet_ni *ni, struct lnet_processid *id) +{ + int cpt = lnet_nid2cpt(&id->nid, ni); + struct ksock_net *net = ni->ni_data; + struct ksock_peer_ni *peer_ni; + + LASSERT(!LNET_NID_IS_ANY(&id->nid)); + LASSERT(id->pid != LNET_PID_ANY); + LASSERT(!in_interrupt()); + + if (!atomic_inc_unless_negative(&net->ksnn_npeers)) { + CERROR("Can't create peer_ni: network shutdown\n"); + return ERR_PTR(-ESHUTDOWN); + } + + LIBCFS_CPT_ALLOC(peer_ni, lnet_cpt_table(), cpt, sizeof(*peer_ni)); + if (!peer_ni) { + atomic_dec(&net->ksnn_npeers); + return ERR_PTR(-ENOMEM); + } + + peer_ni->ksnp_ni = ni; + peer_ni->ksnp_id = *id; + refcount_set(&peer_ni->ksnp_refcount, 1); /* 1 ref for caller */ + peer_ni->ksnp_closing = 0; + peer_ni->ksnp_accepting = 0; + peer_ni->ksnp_proto = NULL; + peer_ni->ksnp_last_alive = 0; + peer_ni->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; + peer_ni->ksnp_conn_cb = 
NULL; + + INIT_LIST_HEAD(&peer_ni->ksnp_conns); + INIT_LIST_HEAD(&peer_ni->ksnp_tx_queue); + INIT_LIST_HEAD(&peer_ni->ksnp_zc_req_list); + spin_lock_init(&peer_ni->ksnp_lock); + + return peer_ni; +} + +void +ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni) +{ + struct ksock_net *net = peer_ni->ksnp_ni->ni_data; + + CDEBUG (D_NET, "peer_ni %s %p deleted\n", + libcfs_idstr(&peer_ni->ksnp_id), peer_ni); + + LASSERT(refcount_read(&peer_ni->ksnp_refcount) == 0); + LASSERT(peer_ni->ksnp_accepting == 0); + LASSERT(list_empty(&peer_ni->ksnp_conns)); + LASSERT(peer_ni->ksnp_conn_cb == NULL); + LASSERT(list_empty(&peer_ni->ksnp_tx_queue)); + LASSERT(list_empty(&peer_ni->ksnp_zc_req_list)); + + LIBCFS_FREE(peer_ni, sizeof(*peer_ni)); + + /* NB a peer_ni's connections and conn_cb keep a reference on their + * peer_ni until they are destroyed, so we can be assured that _all_ + * state to do with this peer_ni has been cleaned up when its refcount + * drops to zero. + */ + if (atomic_dec_and_test(&net->ksnn_npeers)) + wake_up_var(&net->ksnn_npeers); +} + +struct ksock_peer_ni * +ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_processid *id) +{ + struct ksock_peer_ni *peer_ni; + unsigned long hash = nidhash(&id->nid); + + hash_for_each_possible(ksocknal_data.ksnd_peers, peer_ni, + ksnp_list, hash) { + LASSERT(!peer_ni->ksnp_closing); + + if (peer_ni->ksnp_ni != ni) + continue; + + if (!nid_same(&peer_ni->ksnp_id.nid, &id->nid) || + peer_ni->ksnp_id.pid != id->pid) + continue; + + CDEBUG(D_NET, "got peer_ni [%p] -> %s (%d)\n", + peer_ni, libcfs_idstr(id), + refcount_read(&peer_ni->ksnp_refcount)); + return peer_ni; + } + return NULL; +} + +struct ksock_peer_ni * +ksocknal_find_peer(struct lnet_ni *ni, struct lnet_processid *id) +{ + struct ksock_peer_ni *peer_ni; + + read_lock(&ksocknal_data.ksnd_global_lock); + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) /* +1 ref for caller? */ + ksocknal_peer_addref(peer_ni); + read_unlock(&ksocknal_data.ksnd_global_lock); + + return peer_ni; +} + +static void +ksocknal_unlink_peer_locked(struct ksock_peer_ni *peer_ni) +{ + int i; + struct ksock_interface *iface; + + for (i = 0; i < peer_ni->ksnp_n_passive_ips; i++) { + struct sockaddr_in sa = { .sin_family = AF_INET }; + LASSERT(i < LNET_INTERFACES_NUM); + sa.sin_addr.s_addr = htonl(peer_ni->ksnp_passive_ips[i]); + + iface = ksocknal_ip2iface(peer_ni->ksnp_ni, + (struct sockaddr *)&sa); + /* + * All IPs in peer_ni->ksnp_passive_ips[] come from the + * interface list, therefore the call must succeed. 
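[Editorial note] ksocknal_find_peer_locked() above only walks one bucket of the ksnd_peers table: peers are inserted with hash_add() keyed on nidhash(), and lookups use hash_for_each_possible() with a full identity compare inside the loop because distinct NIDs can share a bucket. A self-contained sketch of that kernel hashtable pattern, with hypothetical types:

#include <linux/hashtable.h>
#include <linux/types.h>

struct demo_peer {
	struct hlist_node node;
	u64 nid;
};

static DEFINE_HASHTABLE(demo_peers, 7);	/* 2^7 = 128 buckets */

static void demo_add(struct demo_peer *p)
{
	hash_add(demo_peers, &p->node, p->nid);
}

static struct demo_peer *demo_find(u64 nid)
{
	struct demo_peer *p;

	/* Only the bucket that nid hashes to is scanned. */
	hash_for_each_possible(demo_peers, p, node, nid)
		if (p->nid == nid)
			return p;
	return NULL;
}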
+ */ + LASSERT(iface != NULL); + + CDEBUG(D_NET, "peer_ni=%p iface=%p ksni_nroutes=%d\n", + peer_ni, iface, iface->ksni_nroutes); + iface->ksni_npeers--; + } + + LASSERT(list_empty(&peer_ni->ksnp_conns)); + LASSERT(peer_ni->ksnp_conn_cb == NULL); + LASSERT(!peer_ni->ksnp_closing); + peer_ni->ksnp_closing = 1; + hlist_del(&peer_ni->ksnp_list); + /* lose peerlist's ref */ + ksocknal_peer_decref(peer_ni); +} + +static int +ksocknal_get_peer_info(struct lnet_ni *ni, int index, + struct lnet_process_id *id, __u32 *myip, __u32 *peer_ip, + int *port, int *conn_count, int *share_count) +{ + struct ksock_peer_ni *peer_ni; + struct ksock_conn_cb *conn_cb; + int i; + int j; + int rc = -ENOENT; + + read_lock(&ksocknal_data.ksnd_global_lock); + + hash_for_each(ksocknal_data.ksnd_peers, i, peer_ni, ksnp_list) { + + if (peer_ni->ksnp_ni != ni) + continue; + + if (peer_ni->ksnp_n_passive_ips == 0 && + peer_ni->ksnp_conn_cb == NULL) { + if (index-- > 0) + continue; + + id->pid = peer_ni->ksnp_id.pid; + id->nid = lnet_nid_to_nid4(&peer_ni->ksnp_id.nid); + *myip = 0; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + for (j = 0; j < peer_ni->ksnp_n_passive_ips; j++) { + if (index-- > 0) + continue; + + id->pid = peer_ni->ksnp_id.pid; + id->nid = lnet_nid_to_nid4(&peer_ni->ksnp_id.nid); + *myip = peer_ni->ksnp_passive_ips[j]; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + if (peer_ni->ksnp_conn_cb) { + if (index-- > 0) + continue; + + conn_cb = peer_ni->ksnp_conn_cb; + + id->pid = peer_ni->ksnp_id.pid; + id->nid = lnet_nid_to_nid4(&peer_ni->ksnp_id.nid); + if (conn_cb->ksnr_addr.ss_family == AF_INET) { + struct sockaddr_in *sa = + (void *)&conn_cb->ksnr_addr; + + rc = choose_ipv4_src(myip, + conn_cb->ksnr_myiface, + ntohl(sa->sin_addr.s_addr), + ni->ni_net_ns); + *peer_ip = ntohl(sa->sin_addr.s_addr); + *port = ntohs(sa->sin_port); + } else { + *myip = 0xFFFFFFFF; + *peer_ip = 0xFFFFFFFF; + *port = 0; + rc = -ENOTSUPP; + } + *conn_count = conn_cb->ksnr_conn_count; + *share_count = 1; + goto out; + } + } +out: + read_unlock(&ksocknal_data.ksnd_global_lock); + return rc; +} + +static unsigned int +ksocknal_get_conn_count_by_type(struct ksock_conn_cb *conn_cb, + int type) +{ + unsigned int count = 0; + + switch (type) { + case SOCKLND_CONN_CONTROL: + count = conn_cb->ksnr_ctrl_conn_count; + break; + case SOCKLND_CONN_BULK_IN: + count = conn_cb->ksnr_blki_conn_count; + break; + case SOCKLND_CONN_BULK_OUT: + count = conn_cb->ksnr_blko_conn_count; + break; + case SOCKLND_CONN_ANY: + count = conn_cb->ksnr_conn_count; + break; + default: + LBUG(); + break; + } + + return count; +} + +static unsigned int +ksocknal_get_conns_per_peer(struct ksock_peer_ni *peer_ni) +{ + struct lnet_ni *ni = peer_ni->ksnp_ni; + struct lnet_ioctl_config_socklnd_tunables *tunables; + + LASSERT(ni); + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_sock; + + return tunables->lnd_conns_per_peer; +} + +static void +ksocknal_incr_conn_count(struct ksock_conn_cb *conn_cb, + int type) +{ + conn_cb->ksnr_conn_count++; + + /* check if all connections of the given type got created */ + switch (type) { + case SOCKLND_CONN_CONTROL: + conn_cb->ksnr_ctrl_conn_count++; + /* there's a single control connection per peer, + * two in case of loopback + */ + conn_cb->ksnr_connected |= BIT(type); + break; + case SOCKLND_CONN_BULK_IN: + conn_cb->ksnr_blki_conn_count++; + if (conn_cb->ksnr_blki_conn_count >= conn_cb->ksnr_max_conns) + conn_cb->ksnr_connected |= 
BIT(type); + break; + case SOCKLND_CONN_BULK_OUT: + conn_cb->ksnr_blko_conn_count++; + if (conn_cb->ksnr_blko_conn_count >= conn_cb->ksnr_max_conns) + conn_cb->ksnr_connected |= BIT(type); + break; + case SOCKLND_CONN_ANY: + if (conn_cb->ksnr_conn_count >= conn_cb->ksnr_max_conns) + conn_cb->ksnr_connected |= BIT(type); + break; + default: + LBUG(); + break; + } + + CDEBUG(D_NET, "Add conn type %d, ksnr_connected %x ksnr_max_conns %d\n", + type, conn_cb->ksnr_connected, conn_cb->ksnr_max_conns); +} + + +static void +ksocknal_decr_conn_count(struct ksock_conn_cb *conn_cb, + int type) +{ + conn_cb->ksnr_conn_count--; + + /* check if all connections of the given type got created */ + switch (type) { + case SOCKLND_CONN_CONTROL: + conn_cb->ksnr_ctrl_conn_count--; + /* there's a single control connection per peer, + * two in case of loopback + */ + if (conn_cb->ksnr_ctrl_conn_count == 0) + conn_cb->ksnr_connected &= ~BIT(type); + break; + case SOCKLND_CONN_BULK_IN: + conn_cb->ksnr_blki_conn_count--; + if (conn_cb->ksnr_blki_conn_count < conn_cb->ksnr_max_conns) + conn_cb->ksnr_connected &= ~BIT(type); + break; + case SOCKLND_CONN_BULK_OUT: + conn_cb->ksnr_blko_conn_count--; + if (conn_cb->ksnr_blko_conn_count < conn_cb->ksnr_max_conns) + conn_cb->ksnr_connected &= ~BIT(type); + break; + case SOCKLND_CONN_ANY: + if (conn_cb->ksnr_conn_count < conn_cb->ksnr_max_conns) + conn_cb->ksnr_connected &= ~BIT(type); + break; + default: + LBUG(); + break; + } + + CDEBUG(D_NET, "Del conn type %d, ksnr_connected %x ksnr_max_conns %d\n", + type, conn_cb->ksnr_connected, conn_cb->ksnr_max_conns); +} + +static void +ksocknal_associate_cb_conn_locked(struct ksock_conn_cb *conn_cb, + struct ksock_conn *conn) +{ + struct ksock_peer_ni *peer_ni = conn_cb->ksnr_peer; + int type = conn->ksnc_type; + struct ksock_interface *iface; + int conn_iface; + + conn_iface = ksocknal_ip2index((struct sockaddr *)&conn->ksnc_myaddr, + peer_ni->ksnp_ni); + conn->ksnc_conn_cb = conn_cb; + ksocknal_conn_cb_addref(conn_cb); + + if (conn_cb->ksnr_myiface != conn_iface) { + if (conn_cb->ksnr_myiface < 0) { + /* route wasn't bound locally yet (the initial route) */ + CDEBUG(D_NET, "Binding %s %pIS to interface %d\n", + libcfs_idstr(&peer_ni->ksnp_id), + &conn_cb->ksnr_addr, + conn_iface); + } else { + CDEBUG(D_NET, + "Rebinding %s %pIS from interface %d to %d\n", + libcfs_idstr(&peer_ni->ksnp_id), + &conn_cb->ksnr_addr, + conn_cb->ksnr_myiface, + conn_iface); + + iface = ksocknal_index2iface(peer_ni->ksnp_ni, + conn_cb->ksnr_myiface); + if (iface) + iface->ksni_nroutes--; + } + conn_cb->ksnr_myiface = conn_iface; + iface = ksocknal_index2iface(peer_ni->ksnp_ni, + conn_cb->ksnr_myiface); + if (iface) + iface->ksni_nroutes++; + } + + ksocknal_incr_conn_count(conn_cb, type); + + /* Successful connection => further attempts can + * proceed immediately + */ + conn_cb->ksnr_retry_interval = 0; +} + +static void +ksocknal_add_conn_cb_locked(struct ksock_peer_ni *peer_ni, + struct ksock_conn_cb *conn_cb) +{ + struct ksock_conn *conn; + struct ksock_net *net = peer_ni->ksnp_ni->ni_data; + + LASSERT(!peer_ni->ksnp_closing); + LASSERT(!conn_cb->ksnr_peer); + LASSERT(!conn_cb->ksnr_scheduled); + LASSERT(!conn_cb->ksnr_connecting); + LASSERT(conn_cb->ksnr_connected == 0); + + conn_cb->ksnr_peer = peer_ni; + ksocknal_peer_addref(peer_ni); + + /* set the conn_cb's interface to the current net's interface */ + conn_cb->ksnr_myiface = net->ksnn_interface.ksni_index; + net->ksnn_interface.ksni_nroutes++; + + /* peer_ni's route list takes over my ref on 
'route' */ + peer_ni->ksnp_conn_cb = conn_cb; + + list_for_each_entry(conn, &peer_ni->ksnp_conns, ksnc_list) { + if (!rpc_cmp_addr((struct sockaddr *)&conn->ksnc_peeraddr, + (struct sockaddr *)&conn_cb->ksnr_addr)) + continue; + + ksocknal_associate_cb_conn_locked(conn_cb, conn); + /* keep going (typed conns) */ + } +} + +static void +ksocknal_del_conn_cb_locked(struct ksock_conn_cb *conn_cb) +{ + struct ksock_peer_ni *peer_ni = conn_cb->ksnr_peer; + struct ksock_interface *iface; + struct ksock_conn *conn; + struct ksock_conn *cnxt; + + LASSERT(!conn_cb->ksnr_deleted); + + /* Close associated conns */ + list_for_each_entry_safe(conn, cnxt, &peer_ni->ksnp_conns, ksnc_list) { + if (conn->ksnc_conn_cb != conn_cb) + continue; + + ksocknal_close_conn_locked(conn, 0); + } + + if (conn_cb->ksnr_myiface >= 0) { + iface = ksocknal_index2iface(peer_ni->ksnp_ni, + conn_cb->ksnr_myiface); + if (iface) + iface->ksni_nroutes--; + } + + conn_cb->ksnr_deleted = 1; + ksocknal_conn_cb_decref(conn_cb); /* drop peer_ni's ref */ + peer_ni->ksnp_conn_cb = NULL; + + if (list_empty(&peer_ni->ksnp_conns)) { + /* I've just removed the last route to a peer_ni with no active + * connections + */ + ksocknal_unlink_peer_locked(peer_ni); + } +} + +int +ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id4, + struct sockaddr *addr) +{ + struct ksock_peer_ni *peer_ni; + struct ksock_peer_ni *peer2; + struct ksock_conn_cb *conn_cb; + struct lnet_processid id; + + if (id4.nid == LNET_NID_ANY || + id4.pid == LNET_PID_ANY) + return (-EINVAL); + + id.pid = id4.pid; + lnet_nid4_to_nid(id4.nid, &id.nid); + + /* Have a brand new peer_ni ready... */ + peer_ni = ksocknal_create_peer(ni, &id); + if (IS_ERR(peer_ni)) + return PTR_ERR(peer_ni); + + conn_cb = ksocknal_create_conn_cb(addr); + if (!conn_cb) { + ksocknal_peer_decref(peer_ni); + return -ENOMEM; + } + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + /* always called with a ref on ni, so shutdown can't have started */ + LASSERT(atomic_read(&((struct ksock_net *)ni->ni_data)->ksnn_npeers) + >= 0); + + peer2 = ksocknal_find_peer_locked(ni, &id); + if (peer2 != NULL) { + ksocknal_peer_decref(peer_ni); + peer_ni = peer2; + } else { + /* peer_ni table takes my ref on peer_ni */ + hash_add(ksocknal_data.ksnd_peers, &peer_ni->ksnp_list, + nidhash(&id.nid)); + } + + if (peer_ni->ksnp_conn_cb) { + ksocknal_conn_cb_decref(conn_cb); + } else { + ksocknal_add_conn_cb_locked(peer_ni, conn_cb); + /* Remember conns_per_peer setting at the time + * of connection initiation. It will define the + * max number of conns per type for this conn_cb + * while it's in use. 
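[Editorial note] The ksnr_connected field maintained by ksocknal_incr_conn_count()/ksocknal_decr_conn_count() above is a small bitmask: bit N is set once enough connections of type N exist and cleared again when the count drops back below ksnr_max_conns, letting the connection daemon see at a glance which connection types still need establishing. A reduced sketch with hypothetical names (the real control-connection case above sets its bit after a single connection rather than comparing against the maximum):

#include <linux/bits.h>

enum { DEMO_CONN_CONTROL, DEMO_CONN_BULK_IN, DEMO_CONN_BULK_OUT, DEMO_CONN_NTYPES };

struct demo_conn_cb {
	unsigned int connected;		/* BIT(type) set when enough conns of that type exist */
	int count[DEMO_CONN_NTYPES];
	int max_conns;
};

static void demo_incr_conn(struct demo_conn_cb *cb, int type)
{
	if (++cb->count[type] >= cb->max_conns)
		cb->connected |= BIT(type);
}

static void demo_decr_conn(struct demo_conn_cb *cb, int type)
{
	if (--cb->count[type] < cb->max_conns)
		cb->connected &= ~BIT(type);
}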
+ */ + conn_cb->ksnr_max_conns = ksocknal_get_conns_per_peer(peer_ni); + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return 0; +} + +static void +ksocknal_del_peer_locked(struct ksock_peer_ni *peer_ni, __u32 ip) +{ + struct ksock_conn *conn; + struct ksock_conn *cnxt; + struct ksock_conn_cb *conn_cb; + + LASSERT(!peer_ni->ksnp_closing); + + /* Extra ref prevents peer_ni disappearing until I'm done with it */ + ksocknal_peer_addref(peer_ni); + conn_cb = peer_ni->ksnp_conn_cb; + if (conn_cb) + ksocknal_del_conn_cb_locked(conn_cb); + + list_for_each_entry_safe(conn, cnxt, &peer_ni->ksnp_conns, + ksnc_list) + ksocknal_close_conn_locked(conn, 0); + + ksocknal_peer_decref(peer_ni); + /* NB peer_ni unlinks itself when last conn/conn_cb is removed */ +} + +static int +ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id4, __u32 ip) +{ + LIST_HEAD(zombies); + struct hlist_node *pnxt; + struct ksock_peer_ni *peer_ni; + int lo; + int hi; + int i; + int rc = -ENOENT; + struct lnet_processid id; + + id.pid = id4.pid; + lnet_nid4_to_nid(id4.nid, &id.nid); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (!LNET_NID_IS_ANY(&id.nid)) { + lo = hash_min(nidhash(&id.nid), + HASH_BITS(ksocknal_data.ksnd_peers)); + hi = lo; + } else { + lo = 0; + hi = HASH_SIZE(ksocknal_data.ksnd_peers) - 1; + } + + for (i = lo; i <= hi; i++) { + hlist_for_each_entry_safe(peer_ni, pnxt, + &ksocknal_data.ksnd_peers[i], + ksnp_list) { + if (peer_ni->ksnp_ni != ni) + continue; + + if (!((LNET_NID_IS_ANY(&id.nid) || + nid_same(&peer_ni->ksnp_id.nid, &id.nid)) && + (id.pid == LNET_PID_ANY || + peer_ni->ksnp_id.pid == id.pid))) + continue; + + ksocknal_peer_addref(peer_ni); /* a ref for me... */ + + ksocknal_del_peer_locked(peer_ni, ip); + + if (peer_ni->ksnp_closing && + !list_empty(&peer_ni->ksnp_tx_queue)) { + LASSERT(list_empty(&peer_ni->ksnp_conns)); + LASSERT(peer_ni->ksnp_conn_cb == NULL); + + list_splice_init(&peer_ni->ksnp_tx_queue, + &zombies); + } + + ksocknal_peer_decref(peer_ni); /* ...till here */ + + rc = 0; /* matched! */ + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(ni, &zombies, -ENETDOWN); + + return rc; +} + +static struct ksock_conn * +ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) +{ + struct ksock_peer_ni *peer_ni; + struct ksock_conn *conn; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + hash_for_each(ksocknal_data.ksnd_peers, i, peer_ni, ksnp_list) { + LASSERT(!peer_ni->ksnp_closing); + + if (peer_ni->ksnp_ni != ni) + continue; + + list_for_each_entry(conn, &peer_ni->ksnp_conns, + ksnc_list) { + if (index-- > 0) + continue; + + ksocknal_conn_addref(conn); + read_unlock(&ksocknal_data.ksnd_global_lock); + return conn; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return NULL; +} + +static struct ksock_sched * +ksocknal_choose_scheduler_locked(unsigned int cpt) +{ + struct ksock_sched *sched = ksocknal_data.ksnd_schedulers[cpt]; + int i; + + if (sched->kss_nthreads == 0) { + cfs_percpt_for_each(sched, i, ksocknal_data.ksnd_schedulers) { + if (sched->kss_nthreads > 0) { + CDEBUG(D_NET, "scheduler[%d] has no threads. 
selected scheduler[%d]\n", + cpt, sched->kss_cpt); + return sched; + } + } + return NULL; + } + + return sched; +} + +int +ksocknal_accept(struct lnet_ni *ni, struct socket *sock) +{ + struct ksock_connreq *cr; + int rc; + struct sockaddr_storage peer; + + rc = lnet_sock_getaddr(sock, true, &peer); + if (rc != 0) { + CERROR("Can't determine new connection's address\n"); + return rc; + } + + LIBCFS_ALLOC(cr, sizeof(*cr)); + if (cr == NULL) { + LCONSOLE_ERROR_MSG(0x12f, + "Dropping connection request from %pIS: memory exhausted\n", + &peer); + return -ENOMEM; + } + + lnet_ni_addref(ni); + cr->ksncr_ni = ni; + cr->ksncr_sock = sock; + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + return 0; +} + +static int +ksocknal_connecting(struct ksock_conn_cb *conn_cb, struct sockaddr *sa) +{ + if (conn_cb && + rpc_cmp_addr((struct sockaddr *)&conn_cb->ksnr_addr, sa)) + return conn_cb->ksnr_connecting; + return 0; +} + +int +ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb, + struct socket *sock, int type) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + LIST_HEAD(zombies); + struct lnet_processid peerid; + u64 incarnation; + struct ksock_conn *conn; + struct ksock_conn *conn2; + struct ksock_peer_ni *peer_ni = NULL; + struct ksock_peer_ni *peer2; + struct ksock_sched *sched; + struct ksock_hello_msg *hello; + int cpt; + struct ksock_tx *tx; + struct ksock_tx *txtmp; + int rc; + int rc2; + int active; + int num_dup = 0; + char *warn = NULL; + + active = (conn_cb != NULL); + + LASSERT(active == (type != SOCKLND_CONN_NONE)); + + LIBCFS_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + rc = -ENOMEM; + goto failed_0; + } + + conn->ksnc_peer = NULL; + conn->ksnc_conn_cb = NULL; + conn->ksnc_sock = sock; + /* 2 ref, 1 for conn, another extra ref prevents socket + * being closed before establishment of connection */ + refcount_set(&conn->ksnc_sock_refcount, 2); + conn->ksnc_type = type; + ksocknal_lib_save_callback(sock, conn); + refcount_set(&conn->ksnc_conn_refcount, 1); /* 1 ref for me */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + + INIT_LIST_HEAD(&conn->ksnc_tx_queue); + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + conn->ksnc_tx_carrier = NULL; + atomic_set (&conn->ksnc_tx_nob, 0); + + LIBCFS_ALLOC(hello, offsetof(struct ksock_hello_msg, + kshm_ips[LNET_INTERFACES_NUM])); + if (hello == NULL) { + rc = -ENOMEM; + goto failed_1; + } + + /* stash conn's local and remote addrs */ + rc = ksocknal_lib_get_conn_addrs(conn); + if (rc != 0) + goto failed_1; + + /* Find out/confirm peer_ni's NID and connection type and get the + * vector of interfaces she's willing to let me connect to. 
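[Editorial note] ksocknal_accept() above is the producer half of a classic work-queue hand-off: the accepted socket is wrapped in a small request, appended to ksnd_connd_connreqs under ksnd_connd_lock, and a connection daemon is woken to complete the handshake outside the acceptor context. A stripped-down sketch of both halves of that pattern; every name here is hypothetical:

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct demo_req {
	struct list_head list;
	/* payload (e.g. the accepted socket) would live here */
};

static LIST_HEAD(demo_reqs);
static DEFINE_SPINLOCK(demo_lock);
static DECLARE_WAIT_QUEUE_HEAD(demo_waitq);

/* Producer: queue a request and wake a daemon thread. */
static void demo_enqueue(struct demo_req *req)
{
	spin_lock_bh(&demo_lock);
	list_add_tail(&req->list, &demo_reqs);
	wake_up(&demo_waitq);
	spin_unlock_bh(&demo_lock);
}

/* Consumer: take one request off the queue, or NULL if it is empty. */
static struct demo_req *demo_dequeue(void)
{
	struct demo_req *req;

	spin_lock_bh(&demo_lock);
	req = list_first_entry_or_null(&demo_reqs, struct demo_req, list);
	if (req)
		list_del(&req->list);
	spin_unlock_bh(&demo_lock);
	return req;
}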
+ * Passive connections use the listener timeout since the peer_ni sends + * eagerly + */ + + if (active) { + peer_ni = conn_cb->ksnr_peer; + LASSERT(ni == peer_ni->ksnp_ni); + + /* Active connection sends HELLO eagerly */ + hello->kshm_nips = 0; + peerid = peer_ni->ksnp_id; + + write_lock_bh(global_lock); + conn->ksnc_proto = peer_ni->ksnp_proto; + write_unlock_bh(global_lock); + + if (conn->ksnc_proto == NULL) { + conn->ksnc_proto = &ksocknal_protocol_v3x; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 2) + conn->ksnc_proto = &ksocknal_protocol_v2x; + else if (*ksocknal_tunables.ksnd_protocol == 1) + conn->ksnc_proto = &ksocknal_protocol_v1x; +#endif + } + + rc = ksocknal_send_hello(ni, conn, &peerid.nid, hello); + if (rc != 0) + goto failed_1; + } else { + peerid.nid = LNET_ANY_NID; + peerid.pid = LNET_PID_ANY; + + /* Passive, get protocol from peer_ni */ + conn->ksnc_proto = NULL; + } + + rc = ksocknal_recv_hello(ni, conn, hello, &peerid, &incarnation); + if (rc < 0) + goto failed_1; + + LASSERT(rc == 0 || active); + LASSERT(conn->ksnc_proto != NULL); + LASSERT(!LNET_NID_IS_ANY(&peerid.nid)); + + cpt = lnet_nid2cpt(&peerid.nid, ni); + + if (active) { + ksocknal_peer_addref(peer_ni); + write_lock_bh(global_lock); + } else { + peer_ni = ksocknal_create_peer(ni, &peerid); + if (IS_ERR(peer_ni)) { + rc = PTR_ERR(peer_ni); + goto failed_1; + } + + write_lock_bh(global_lock); + + /* called with a ref on ni, so shutdown can't have started */ + LASSERT(atomic_read(&((struct ksock_net *)ni->ni_data)->ksnn_npeers) >= 0); + + peer2 = ksocknal_find_peer_locked(ni, &peerid); + if (peer2 == NULL) { + /* NB this puts an "empty" peer_ni in the peer_ni + * table (which takes my ref) */ + hash_add(ksocknal_data.ksnd_peers, + &peer_ni->ksnp_list, nidhash(&peerid.nid)); + } else { + ksocknal_peer_decref(peer_ni); + peer_ni = peer2; + } + + /* +1 ref for me */ + ksocknal_peer_addref(peer_ni); + peer_ni->ksnp_accepting++; + + /* Am I already connecting to this guy? Resolve in + * favour of higher NID... + */ + if (memcmp(&peerid.nid, &ni->ni_nid, sizeof(peerid.nid)) < 0 && + ksocknal_connecting(peer_ni->ksnp_conn_cb, + ((struct sockaddr *) &conn->ksnc_peeraddr))) { + rc = EALREADY; + warn = "connection race resolution"; + goto failed_2; + } + } + + if (peer_ni->ksnp_closing || + (active && conn_cb->ksnr_deleted)) { + /* peer_ni/conn_cb got closed under me */ + rc = -ESTALE; + warn = "peer_ni/conn_cb removed"; + goto failed_2; + } + + if (peer_ni->ksnp_proto == NULL) { + /* Never connected before. + * NB recv_hello may have returned EPROTO to signal my peer_ni + * wants a different protocol than the one I asked for. + */ + LASSERT(list_empty(&peer_ni->ksnp_conns)); + + peer_ni->ksnp_proto = conn->ksnc_proto; + peer_ni->ksnp_incarnation = incarnation; + } + + if (peer_ni->ksnp_proto != conn->ksnc_proto || + peer_ni->ksnp_incarnation != incarnation) { + /* peer_ni rebooted or I've got the wrong protocol version */ + ksocknal_close_peer_conns_locked(peer_ni, NULL, 0); + + peer_ni->ksnp_proto = NULL; + rc = ESTALE; + warn = peer_ni->ksnp_incarnation != incarnation ? 
+ "peer_ni rebooted" : + "wrong proto version"; + goto failed_2; + } + + switch (rc) { + default: + LBUG(); + case 0: + break; + case EALREADY: + warn = "lost conn race"; + goto failed_2; + case EPROTO: + warn = "retry with different protocol version"; + goto failed_2; + } + + /* Refuse to duplicate an existing connection, unless this is a + * loopback connection */ + if (!rpc_cmp_addr((struct sockaddr *)&conn->ksnc_peeraddr, + (struct sockaddr *)&conn->ksnc_myaddr)) { + list_for_each_entry(conn2, &peer_ni->ksnp_conns, ksnc_list) { + if (!rpc_cmp_addr( + (struct sockaddr *)&conn2->ksnc_peeraddr, + (struct sockaddr *)&conn->ksnc_peeraddr) || + !rpc_cmp_addr( + (struct sockaddr *)&conn2->ksnc_myaddr, + (struct sockaddr *)&conn->ksnc_myaddr) || + conn2->ksnc_type != conn->ksnc_type) + continue; + + num_dup++; + /* If max conns per type is not registered in conn_cb + * as ksnr_max_conns, use ni's conns_per_peer + */ + if ((peer_ni->ksnp_conn_cb && + num_dup < peer_ni->ksnp_conn_cb->ksnr_max_conns) || + (!peer_ni->ksnp_conn_cb && + num_dup < ksocknal_get_conns_per_peer(peer_ni))) + continue; + + /* Reply on a passive connection attempt so the peer_ni + * realises we're connected. + */ + LASSERT(rc == 0); + if (!active) + rc = EALREADY; + + warn = "duplicate"; + goto failed_2; + } + } + /* If the connection created by this route didn't bind to the IP + * address the route connected to, the connection/route matching + * code below probably isn't going to work. + */ + if (active && + !rpc_cmp_addr((struct sockaddr *)&conn_cb->ksnr_addr, + (struct sockaddr *)&conn->ksnc_peeraddr)) { + CERROR("Route %s %pIS connected to %pIS\n", + libcfs_idstr(&peer_ni->ksnp_id), + &conn_cb->ksnr_addr, + &conn->ksnc_peeraddr); + } + + /* Search for a conn_cb corresponding to the new connection and + * create an association. This allows incoming connections created + * by conn_cbs in my peer_ni to match my own conn_cb entries so I don't + * continually create duplicate conn_cbs. + */ + conn_cb = peer_ni->ksnp_conn_cb; + + if (conn_cb && rpc_cmp_addr((struct sockaddr *)&conn->ksnc_peeraddr, + (struct sockaddr *)&conn_cb->ksnr_addr)) + ksocknal_associate_cb_conn_locked(conn_cb, conn); + + conn->ksnc_peer = peer_ni; /* conn takes my ref on peer_ni */ + peer_ni->ksnp_last_alive = ktime_get_seconds(); + peer_ni->ksnp_send_keepalive = 0; + peer_ni->ksnp_error = 0; + + sched = ksocknal_choose_scheduler_locked(cpt); + if (!sched) { + CERROR("no schedulers available. node is unhealthy\n"); + goto failed_2; + } + /* + * The cpt might have changed if we ended up selecting a non cpt + * native scheduler. So use the scheduler's cpt instead. + */ + cpt = sched->kss_cpt; + sched->kss_nconns++; + conn->ksnc_scheduler = sched; + + conn->ksnc_tx_last_post = ktime_get_seconds(); + /* Set the deadline for the outgoing HELLO to drain */ + conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; + conn->ksnc_tx_deadline = ktime_get_seconds() + + ksocknal_timeout(); + smp_mb(); /* order with adding to peer_ni's conn list */ + + list_add(&conn->ksnc_list, &peer_ni->ksnp_conns); + ksocknal_conn_addref(conn); + + ksocknal_new_packet(conn, 0); + + conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn); + + /* Take packets blocking for this connection. 
*/ + list_for_each_entry_safe(tx, txtmp, &peer_ni->ksnp_tx_queue, tx_list) { + if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == + SOCKNAL_MATCH_NO) + continue; + + list_del(&tx->tx_list); + ksocknal_queue_tx_locked(tx, conn); + } + + write_unlock_bh(global_lock); + /* We've now got a new connection. Any errors from here on are just + * like "normal" comms errors and we close the connection normally. + * NB (a) we still have to send the reply HELLO for passive + * connections, + * (b) normal I/O on the conn is blocked until I setup and call the + * socket callbacks. + */ + + CDEBUG(D_NET, "New conn %s p %d.x %pIS -> %pISp" + " incarnation:%lld sched[%d]\n", + libcfs_idstr(&peerid), conn->ksnc_proto->pro_version, + &conn->ksnc_myaddr, &conn->ksnc_peeraddr, + incarnation, cpt); + + if (!active) { + hello->kshm_nips = 0; + rc = ksocknal_send_hello(ni, conn, &peerid.nid, hello); + } + + LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg, + kshm_ips[LNET_INTERFACES_NUM])); + + /* setup the socket AFTER I've received hello (it disables + * SO_LINGER). I might call back to the acceptor who may want + * to send a protocol version response and then close the + * socket; this ensures the socket only tears down after the + * response has been sent. + */ + if (rc == 0) + rc = ksocknal_lib_setup_sock(sock); + + write_lock_bh(global_lock); + + /* NB my callbacks block while I hold ksnd_global_lock */ + ksocknal_lib_set_callback(sock, conn); + + if (!active) + peer_ni->ksnp_accepting--; + + write_unlock_bh(global_lock); + + if (rc != 0) { + write_lock_bh(global_lock); + if (!conn->ksnc_closing) { + /* could be closed by another thread */ + ksocknal_close_conn_locked(conn, rc); + } + write_unlock_bh(global_lock); + } else if (ksocknal_connsock_addref(conn) == 0) { + /* Allow I/O to proceed. */ + ksocknal_read_callback(conn); + ksocknal_write_callback(conn); + ksocknal_connsock_decref(conn); + } + + ksocknal_connsock_decref(conn); + ksocknal_conn_decref(conn); + return rc; + +failed_2: + + if (!peer_ni->ksnp_closing && + list_empty(&peer_ni->ksnp_conns) && + peer_ni->ksnp_conn_cb == NULL) { + list_splice_init(&peer_ni->ksnp_tx_queue, &zombies); + ksocknal_unlink_peer_locked(peer_ni); + } + + write_unlock_bh(global_lock); + + if (warn != NULL) { + if (rc < 0) + CERROR("Not creating conn %s type %d: %s\n", + libcfs_idstr(&peerid), conn->ksnc_type, warn); + else + CDEBUG(D_NET, "Not creating conn %s type %d: %s\n", + libcfs_idstr(&peerid), conn->ksnc_type, warn); + } + + if (!active) { + if (rc > 0) { + /* Request retry by replying with CONN_NONE + * ksnc_proto has been set already + */ + conn->ksnc_type = SOCKLND_CONN_NONE; + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, &peerid.nid, hello); + } + + write_lock_bh(global_lock); + peer_ni->ksnp_accepting--; + write_unlock_bh(global_lock); + } + + /* + * If we get here without an error code, just use -EALREADY. + * Depending on how we got here, the error may be positive + * or negative. Normalize the value for ksocknal_txlist_done(). + */ + rc2 = (rc == 0 ? -EALREADY : (rc > 0 ? 
-rc : rc)); + ksocknal_txlist_done(ni, &zombies, rc2); + ksocknal_peer_decref(peer_ni); + +failed_1: + if (hello != NULL) + LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg, + kshm_ips[LNET_INTERFACES_NUM])); + + LIBCFS_FREE(conn, sizeof(*conn)); + +failed_0: + sock_release(sock); + + return rc; +} + +void +ksocknal_close_conn_locked(struct ksock_conn *conn, int error) +{ + /* This just does the immmediate housekeeping, and queues the + * connection for the reaper to terminate. + * Caller holds ksnd_global_lock exclusively in irq context */ + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_conn_cb *conn_cb; + struct ksock_conn *conn2; + int conn_count; + int duplicate_count = 0; + + LASSERT(peer_ni->ksnp_error == 0); + LASSERT(!conn->ksnc_closing); + conn->ksnc_closing = 1; + + /* ksnd_deathrow_conns takes over peer_ni's ref */ + list_del(&conn->ksnc_list); + + conn_cb = conn->ksnc_conn_cb; + if (conn_cb != NULL) { + /* dissociate conn from cb... */ + LASSERT(!conn_cb->ksnr_deleted); + + conn_count = ksocknal_get_conn_count_by_type(conn_cb, + conn->ksnc_type); + /* connected bit is set only if all connections + * of the given type got created + */ + if (conn_count == conn_cb->ksnr_max_conns) + LASSERT((conn_cb->ksnr_connected & + BIT(conn->ksnc_type)) != 0); + + if (conn_count == 1) { + list_for_each_entry(conn2, &peer_ni->ksnp_conns, + ksnc_list) { + if (conn2->ksnc_conn_cb == conn_cb && + conn2->ksnc_type == conn->ksnc_type) + duplicate_count += 1; + } + if (duplicate_count > 0) + CERROR("Found %d duplicate conns type %d\n", + duplicate_count, + conn->ksnc_type); + } + ksocknal_decr_conn_count(conn_cb, conn->ksnc_type); + + conn->ksnc_conn_cb = NULL; + + /* drop conn's ref on conn_cb */ + ksocknal_conn_cb_decref(conn_cb); + } + + if (list_empty(&peer_ni->ksnp_conns)) { + /* No more connections to this peer_ni */ + + if (!list_empty(&peer_ni->ksnp_tx_queue)) { + struct ksock_tx *tx; + + LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x); + + /* throw them to the last connection..., + * these TXs will be send to /dev/null by scheduler */ + list_for_each_entry(tx, &peer_ni->ksnp_tx_queue, + tx_list) + ksocknal_tx_prep(conn, tx); + + spin_lock_bh(&conn->ksnc_scheduler->kss_lock); + list_splice_init(&peer_ni->ksnp_tx_queue, + &conn->ksnc_tx_queue); + spin_unlock_bh(&conn->ksnc_scheduler->kss_lock); + } + + /* renegotiate protocol version */ + peer_ni->ksnp_proto = NULL; + /* stash last conn close reason */ + peer_ni->ksnp_error = error; + + if (peer_ni->ksnp_conn_cb == NULL) { + /* I've just closed last conn belonging to a + * peer_ni with no connections to it + */ + ksocknal_unlink_peer_locked(peer_ni); + } + } + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_deathrow_conns); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_peer_failed(struct ksock_peer_ni *peer_ni) +{ + bool notify = false; + time64_t last_alive = 0; + + /* There has been a connection failure or comms error; but I'll only + * tell LNET I think the peer_ni is dead if it's to another kernel and + * there are no connections or connection attempts in existence. 
*/ + + read_lock(&ksocknal_data.ksnd_global_lock); + + if ((peer_ni->ksnp_id.pid & LNET_PID_USERFLAG) == 0 && + list_empty(&peer_ni->ksnp_conns) && + peer_ni->ksnp_accepting == 0 && + !ksocknal_find_connecting_conn_cb_locked(peer_ni)) { + notify = true; + last_alive = peer_ni->ksnp_last_alive; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (notify) + lnet_notify(peer_ni->ksnp_ni, + lnet_nid_to_nid4(&peer_ni->ksnp_id.nid), + false, false, last_alive); +} + +void +ksocknal_finalize_zcreq(struct ksock_conn *conn) +{ + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_tx *tx; + struct ksock_tx *tmp; + LIST_HEAD(zlist); + + /* NB safe to finalize TXs because closing of socket will + * abort all buffered data */ + LASSERT(conn->ksnc_sock == NULL); + + spin_lock(&peer_ni->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, &peer_ni->ksnp_zc_req_list, + tx_zc_list) { + if (tx->tx_conn != conn) + continue; + + LASSERT(tx->tx_msg.ksm_zc_cookies[0] != 0); + + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_zc_aborted = 1; /* mark it as not-acked */ + list_move(&tx->tx_zc_list, &zlist); + } + + spin_unlock(&peer_ni->ksnp_lock); + + while ((tx = list_first_entry_or_null(&zlist, struct ksock_tx, + tx_zc_list)) != NULL) { + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } +} + +void +ksocknal_terminate_conn(struct ksock_conn *conn) +{ + /* This gets called by the reaper (guaranteed thread context) to + * disengage the socket from its callbacks and close it. + * ksnc_refcount will eventually hit zero, and then the reaper will + * destroy it. + */ + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_sched *sched = conn->ksnc_scheduler; + bool failed = false; + + LASSERT(conn->ksnc_closing); + + /* wake up the scheduler to "send" all remaining packets to /dev/null */ + spin_lock_bh(&sched->kss_lock); + + /* a closing conn is always ready to tx */ + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && + !list_empty(&conn->ksnc_tx_queue)) { + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up(&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); + + /* serialise with callbacks */ + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_lib_reset_callback(conn->ksnc_sock, conn); + + /* OK, so this conn may not be completely disengaged from its + * scheduler yet, but it _has_ committed to terminate... + */ + conn->ksnc_scheduler->kss_nconns--; + + if (peer_ni->ksnp_error != 0) { + /* peer_ni's last conn closed in error */ + LASSERT(list_empty(&peer_ni->ksnp_conns)); + failed = true; + peer_ni->ksnp_error = 0; /* avoid multiple notifications */ + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (failed) + ksocknal_peer_failed(peer_ni); + + /* The socket is closed on the final put; either here, or in + * ksocknal_{send,recv}msg(). Since we set up the linger2 option + * when the connection was established, this will close the socket + * immediately, aborting anything buffered in it. Any hung + * zero-copy transmits will therefore complete in finite time. 
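[Editorial note] ksocknal_peer_failed() above follows the "decide under the lock, notify outside it" discipline: the peer's state is inspected under the read lock, but lnet_notify() is only called after the lock is dropped, since the upper layer may sleep or take other locks. A tiny sketch of that shape with hypothetical names standing in for the real peer state and for lnet_notify():

#include <linux/spinlock.h>
#include <linux/types.h>

struct demo_peer_state {
	rwlock_t lock;
	int conns;
	bool connecting;
};

static void demo_notify_upper_layer(struct demo_peer_state *st)
{
	/* hypothetical stand-in for lnet_notify(); may sleep */
}

static void demo_maybe_notify(struct demo_peer_state *st)
{
	bool notify;

	/* Only inspect state while the lock is held ... */
	read_lock(&st->lock);
	notify = (st->conns == 0 && !st->connecting);
	read_unlock(&st->lock);

	/* ... and call out after it has been released. */
	if (notify)
		demo_notify_upper_layer(st);
}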
+ */ + ksocknal_connsock_decref(conn); +} + +void +ksocknal_queue_zombie_conn(struct ksock_conn *conn) +{ + /* Queue the conn for the reaper to destroy */ + LASSERT(refcount_read(&conn->ksnc_conn_refcount) == 0); + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_destroy_conn(struct ksock_conn *conn) +{ + time64_t last_rcv; + + /* Final coup-de-grace of the reaper */ + CDEBUG(D_NET, "connection %p\n", conn); + + LASSERT(refcount_read(&conn->ksnc_conn_refcount) == 0); + LASSERT(refcount_read(&conn->ksnc_sock_refcount) == 0); + LASSERT(conn->ksnc_sock == NULL); + LASSERT(conn->ksnc_conn_cb == NULL); + LASSERT(!conn->ksnc_tx_scheduled); + LASSERT(!conn->ksnc_rx_scheduled); + LASSERT(list_empty(&conn->ksnc_tx_queue)); + + /* complete current receive if any */ + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_LNET_PAYLOAD: + last_rcv = conn->ksnc_rx_deadline - + ksocknal_timeout(); + CERROR("Completing partial receive from %s[%d], ip %pISp, with error, wanted: %d, left: %d, last alive is %lld secs ago\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + conn->ksnc_type, + &conn->ksnc_peeraddr, + conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left, + ktime_get_seconds() - last_rcv); + if (conn->ksnc_lnet_msg) + conn->ksnc_lnet_msg->msg_health_status = + LNET_MSG_STATUS_REMOTE_ERROR; + lnet_finalize(conn->ksnc_lnet_msg, -EIO); + break; + case SOCKNAL_RX_LNET_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of lnet header from %s, ip %pISp, with error, protocol: %d.x.\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + &conn->ksnc_peeraddr, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of ksock message from %s, ip %pISp, with error, protocol: %d.x.\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + &conn->ksnc_peeraddr, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_SLOP: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of slops from %s, ip %pISp, with error\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + &conn->ksnc_peeraddr); + break; + default: + LBUG(); + break; + } + + ksocknal_peer_decref(conn->ksnc_peer); + + LIBCFS_FREE(conn, sizeof(*conn)); +} + +int +ksocknal_close_peer_conns_locked(struct ksock_peer_ni *peer_ni, + struct sockaddr *addr, int why) +{ + struct ksock_conn *conn; + struct ksock_conn *cnxt; + int count = 0; + + list_for_each_entry_safe(conn, cnxt, &peer_ni->ksnp_conns, ksnc_list) { + if (!addr || + rpc_cmp_addr(addr, + (struct sockaddr *)&conn->ksnc_peeraddr)) { + count++; + ksocknal_close_conn_locked(conn, why); + } + } + + return count; +} + +int +ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why) +{ + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + int count; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + count = ksocknal_close_peer_conns_locked( + peer_ni, (struct sockaddr *)&conn->ksnc_peeraddr, why); + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return count; +} + +int +ksocknal_close_matching_conns(struct lnet_processid *id, __u32 ipaddr) +{ + struct ksock_peer_ni *peer_ni; + struct hlist_node *pnxt; + int lo; + int hi; + int i; + int count = 0; + struct sockaddr_in sa = {.sin_family = AF_INET}; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (!LNET_NID_IS_ANY(&id->nid)) { + lo = hash_min(nidhash(&id->nid), + 
HASH_BITS(ksocknal_data.ksnd_peers)); + hi = lo; + } else { + lo = 0; + hi = HASH_SIZE(ksocknal_data.ksnd_peers) - 1; + } + + sa.sin_addr.s_addr = htonl(ipaddr); + for (i = lo; i <= hi; i++) { + hlist_for_each_entry_safe(peer_ni, pnxt, + &ksocknal_data.ksnd_peers[i], + ksnp_list) { + + if (!((LNET_NID_IS_ANY(&id->nid) || + nid_same(&id->nid, &peer_ni->ksnp_id.nid)) && + (id->pid == LNET_PID_ANY || + id->pid == peer_ni->ksnp_id.pid))) + continue; + + count += ksocknal_close_peer_conns_locked( + peer_ni, + ipaddr ? (struct sockaddr *)&sa : NULL, 0); + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + /* wildcards always succeed */ + if (LNET_NID_IS_ANY(&id->nid) || id->pid == LNET_PID_ANY || + ipaddr == 0) + return 0; + + return (count == 0 ? -ENOENT : 0); +} + +void +ksocknal_notify_gw_down(struct lnet_nid *gw_nid) +{ + /* The router is telling me she's been notified of a change in + * gateway state.... + */ + struct lnet_processid id = { + .pid = LNET_PID_ANY, + .nid = *gw_nid, + }; + + CDEBUG(D_NET, "gw %s down\n", libcfs_nidstr(gw_nid)); + + /* If the gateway crashed, close all open connections... */ + ksocknal_close_matching_conns(&id, 0); + return; + + /* We can only establish new connections + * if we have autroutes, and these connect on demand. + */ +} + +static void +ksocknal_push_peer(struct ksock_peer_ni *peer_ni) +{ + int index; + int i; + struct ksock_conn *conn; + + for (index = 0; ; index++) { + read_lock(&ksocknal_data.ksnd_global_lock); + + i = 0; + conn = NULL; + + list_for_each_entry(conn, &peer_ni->ksnp_conns, ksnc_list) { + if (i++ == index) { + ksocknal_conn_addref(conn); + break; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (i <= index) + break; + + ksocknal_lib_push_conn (conn); + ksocknal_conn_decref(conn); + } +} + +static int +ksocknal_push(struct lnet_ni *ni, struct lnet_processid *id) +{ + int lo; + int hi; + int bkt; + int rc = -ENOENT; + + if (!LNET_NID_IS_ANY(&id->nid)) { + lo = hash_min(nidhash(&id->nid), + HASH_BITS(ksocknal_data.ksnd_peers)); + hi = lo; + } else { + lo = 0; + hi = HASH_SIZE(ksocknal_data.ksnd_peers) - 1; + } + + for (bkt = lo; bkt <= hi; bkt++) { + int peer_off; /* searching offset in peer_ni hash table */ + + for (peer_off = 0; ; peer_off++) { + struct ksock_peer_ni *peer_ni; + int i = 0; + + read_lock(&ksocknal_data.ksnd_global_lock); + hlist_for_each_entry(peer_ni, + &ksocknal_data.ksnd_peers[bkt], + ksnp_list) { + if (!((LNET_NID_IS_ANY(&id->nid) || + nid_same(&id->nid, + &peer_ni->ksnp_id.nid)) && + (id->pid == LNET_PID_ANY || + id->pid == peer_ni->ksnp_id.pid))) + continue; + + if (i++ == peer_off) { + ksocknal_peer_addref(peer_ni); + break; + } + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (i <= peer_off) /* no match */ + break; + + rc = 0; + ksocknal_push_peer(peer_ni); + ksocknal_peer_decref(peer_ni); + } + } + return rc; +} + +int +ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) +{ + struct lnet_process_id id4 = {}; + struct lnet_processid id = {}; + struct libcfs_ioctl_data *data = arg; + int rc; + + switch(cmd) { + case IOC_LIBCFS_GET_INTERFACE: { + struct ksock_net *net = ni->ni_data; + struct ksock_interface *iface; + struct sockaddr_in *sa; + + read_lock(&ksocknal_data.ksnd_global_lock); + + if (data->ioc_count >= 1) { + rc = -ENOENT; + } else { + rc = 0; + iface = &net->ksnn_interface; + + sa = (void *)&iface->ksni_addr; + if (sa->sin_family == AF_INET) + data->ioc_u32[0] = ntohl(sa->sin_addr.s_addr); + else + data->ioc_u32[0] = 0xFFFFFFFF; + data->ioc_u32[1] = 
iface->ksni_netmask; + data->ioc_u32[2] = iface->ksni_npeers; + data->ioc_u32[3] = iface->ksni_nroutes; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return rc; + } + + case IOC_LIBCFS_GET_PEER: { + __u32 myip = 0; + __u32 ip = 0; + int port = 0; + int conn_count = 0; + int share_count = 0; + + rc = ksocknal_get_peer_info(ni, data->ioc_count, + &id4, &myip, &ip, &port, + &conn_count, &share_count); + if (rc != 0) + return rc; + + data->ioc_nid = id4.nid; + data->ioc_count = share_count; + data->ioc_u32[0] = ip; + data->ioc_u32[1] = port; + data->ioc_u32[2] = myip; + data->ioc_u32[3] = conn_count; + data->ioc_u32[4] = id4.pid; + return 0; + } + + case IOC_LIBCFS_ADD_PEER: { + struct sockaddr_in sa = {.sin_family = AF_INET}; + + id4.nid = data->ioc_nid; + id4.pid = LNET_PID_LUSTRE; + sa.sin_addr.s_addr = htonl(data->ioc_u32[0]); + sa.sin_port = htons(data->ioc_u32[1]); + return ksocknal_add_peer(ni, id4, (struct sockaddr *)&sa); + } + case IOC_LIBCFS_DEL_PEER: + id4.nid = data->ioc_nid; + id4.pid = LNET_PID_ANY; + return ksocknal_del_peer(ni, id4, + data->ioc_u32[0]); /* IP */ + + case IOC_LIBCFS_GET_CONN: { + int txmem; + int rxmem; + int nagle; + struct ksock_conn *conn = ksocknal_get_conn_by_idx(ni, data->ioc_count); + struct sockaddr_in *psa = (void *)&conn->ksnc_peeraddr; + struct sockaddr_in *mysa = (void *)&conn->ksnc_myaddr; + + if (conn == NULL) + return -ENOENT; + + ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle); + + data->ioc_count = txmem; + data->ioc_nid = lnet_nid_to_nid4(&conn->ksnc_peer->ksnp_id.nid); + data->ioc_flags = nagle; + if (psa->sin_family == AF_INET) + data->ioc_u32[0] = ntohl(psa->sin_addr.s_addr); + else + data->ioc_u32[0] = 0xFFFFFFFF; + data->ioc_u32[1] = rpc_get_port((struct sockaddr *) + &conn->ksnc_peeraddr); + if (mysa->sin_family == AF_INET) + data->ioc_u32[2] = ntohl(mysa->sin_addr.s_addr); + else + data->ioc_u32[2] = 0xFFFFFFFF; + data->ioc_u32[3] = conn->ksnc_type; + data->ioc_u32[4] = conn->ksnc_scheduler->kss_cpt; + data->ioc_u32[5] = rxmem; + data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid; + ksocknal_conn_decref(conn); + return 0; + } + + case IOC_LIBCFS_CLOSE_CONNECTION: + lnet_nid4_to_nid(data->ioc_nid, &id.nid); + id.pid = LNET_PID_ANY; + return ksocknal_close_matching_conns(&id, + data->ioc_u32[0]); + + case IOC_LIBCFS_REGISTER_MYNID: + /* Ignore if this is a noop */ + if (nid_is_nid4(&ni->ni_nid) && + data->ioc_nid == lnet_nid_to_nid4(&ni->ni_nid)) + return 0; + + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nidstr(&ni->ni_nid)); + return -EINVAL; + + case IOC_LIBCFS_PUSH_CONNECTION: + lnet_nid4_to_nid(data->ioc_nid, &id.nid); + id.pid = LNET_PID_ANY; + return ksocknal_push(ni, &id); + + default: + return -EINVAL; + } + /* not reached */ +} + +static void +ksocknal_free_buffers (void) +{ + LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0); + + if (ksocknal_data.ksnd_schedulers != NULL) + cfs_percpt_free(ksocknal_data.ksnd_schedulers); + + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + LIST_HEAD(zlist); + struct ksock_tx *tx; + + list_splice_init(&ksocknal_data.ksnd_idle_noop_txs, &zlist); + spin_unlock(&ksocknal_data.ksnd_tx_lock); + + while ((tx = list_first_entry_or_null(&zlist, struct ksock_tx, + tx_list)) != NULL) { + list_del(&tx->tx_list); + LIBCFS_FREE(tx, tx->tx_desc_size); + } + } else { + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } +} + +static int ksocknal_get_link_status(struct net_device *dev) 
+{ + int ret = -1; + + LASSERT(dev); + + if (!netif_running(dev)) { + ret = 0; + CDEBUG(D_NET, "device not running\n"); + } + /* Some devices may not be providing link settings */ + else if (dev->ethtool_ops->get_link) { + ret = dev->ethtool_ops->get_link(dev); + CDEBUG(D_NET, "get_link returns %u\n", ret); + } + + return ret; +} + +static int +ksocknal_handle_link_state_change(struct net_device *dev, + unsigned char operstate) +{ + struct lnet_ni *ni = NULL; + struct ksock_net *net; + struct ksock_net *cnxt; + int ifindex; + unsigned char link_down = !(operstate == IF_OPER_UP); + struct in_device *in_dev; + bool found_ip = false; + struct ksock_interface *ksi = NULL; + struct sockaddr_in *sa; + DECLARE_CONST_IN_IFADDR(ifa); + + ifindex = dev->ifindex; + + if (!ksocknal_data.ksnd_nnets) + goto out; + + list_for_each_entry_safe(net, cnxt, &ksocknal_data.ksnd_nets, + ksnn_list) { + + ksi = &net->ksnn_interface; + sa = (void *)&ksi->ksni_addr; + found_ip = false; + + if (ksi->ksni_index != ifindex || + strcmp(ksi->ksni_name, dev->name)) + continue; + + ni = net->ksnn_ni; + + in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) { + CDEBUG(D_NET, "Interface %s has no IPv4 status.\n", + dev->name); + CDEBUG(D_NET, "set link fatal state to 1\n"); + atomic_set(&ni->ni_fatal_error_on, 1); + continue; + } + in_dev_for_each_ifa_rtnl(ifa, in_dev) { + if (sa->sin_addr.s_addr == ifa->ifa_local) + found_ip = true; + } + endfor_ifa(in_dev); + + if (!found_ip) { + CDEBUG(D_NET, "Interface %s has no matching ip\n", + dev->name); + CDEBUG(D_NET, "set link fatal state to 1\n"); + atomic_set(&ni->ni_fatal_error_on, 1); + continue; + } + + if (link_down) { + CDEBUG(D_NET, "set link fatal state to 1\n"); + atomic_set(&ni->ni_fatal_error_on, link_down); + } else { + CDEBUG(D_NET, "set link fatal state to %u\n", + (ksocknal_get_link_status(dev) == 0)); + atomic_set(&ni->ni_fatal_error_on, + (ksocknal_get_link_status(dev) == 0)); + } + } +out: + return 0; +} + + +static int +ksocknal_handle_inetaddr_change(struct in_ifaddr *ifa, unsigned long event) +{ + struct lnet_ni *ni; + struct ksock_net *net; + struct ksock_net *cnxt; + struct net_device *event_netdev = ifa->ifa_dev->dev; + int ifindex; + struct ksock_interface *ksi = NULL; + struct sockaddr_in *sa; + + if (!ksocknal_data.ksnd_nnets) + goto out; + + ifindex = event_netdev->ifindex; + + list_for_each_entry_safe(net, cnxt, &ksocknal_data.ksnd_nets, + ksnn_list) { + + ksi = &net->ksnn_interface; + sa = (void *)&ksi->ksni_addr; + + if (ksi->ksni_index != ifindex || + strcmp(ksi->ksni_name, event_netdev->name)) + continue; + + if (sa->sin_addr.s_addr == ifa->ifa_local) { + CDEBUG(D_NET, "set link fatal state to %u\n", + (event == NETDEV_DOWN)); + ni = net->ksnn_ni; + atomic_set(&ni->ni_fatal_error_on, + (event == NETDEV_DOWN)); + } + } +out: + return 0; +} + +/************************************ + * Net device notifier event handler + ************************************/ +static int ksocknal_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + unsigned char operstate; + + operstate = dev->operstate; + + CDEBUG(D_NET, "devevent: status=%ld, iface=%s ifindex %d state %u\n", + event, dev->name, dev->ifindex, operstate); + + switch (event) { + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + ksocknal_handle_link_state_change(dev, operstate); + break; + } + + return NOTIFY_OK; +} + +/************************************ + * Inetaddr notifier event handler + 
************************************/ +static int ksocknal_inetaddr_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + + CDEBUG(D_NET, "addrevent: status %ld ip addr %pI4, netmask %pI4.\n", + event, &ifa->ifa_address, &ifa->ifa_mask); + + switch (event) { + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + ksocknal_handle_inetaddr_change(ifa, event); + break; + + } + return NOTIFY_OK; +} + +static struct notifier_block ksocknal_dev_notifier_block = { + .notifier_call = ksocknal_device_event, +}; + +static struct notifier_block ksocknal_inetaddr_notifier_block = { + .notifier_call = ksocknal_inetaddr_event, +}; + +static void +ksocknal_base_shutdown(void) +{ + struct ksock_sched *sched; + struct ksock_peer_ni *peer_ni; + int i; + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %lld\n", + libcfs_kmem_read()); + LASSERT (ksocknal_data.ksnd_nnets == 0); + + if (ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL) { + unregister_netdevice_notifier(&ksocknal_dev_notifier_block); + unregister_inetaddr_notifier(&ksocknal_inetaddr_notifier_block); + } + + switch (ksocknal_data.ksnd_init) { + default: + LASSERT(0); + fallthrough; + + case SOCKNAL_INIT_ALL: + case SOCKNAL_INIT_DATA: + hash_for_each(ksocknal_data.ksnd_peers, i, peer_ni, ksnp_list) + LASSERT(0); + + LASSERT(list_empty(&ksocknal_data.ksnd_nets)); + LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns)); + LASSERT(list_empty(&ksocknal_data.ksnd_zombie_conns)); + LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs)); + LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes)); + + if (ksocknal_data.ksnd_schedulers != NULL) { + cfs_percpt_for_each(sched, i, + ksocknal_data.ksnd_schedulers) { + + LASSERT(list_empty(&sched->kss_tx_conns)); + LASSERT(list_empty(&sched->kss_rx_conns)); + LASSERT(list_empty(&sched->kss_zombie_noop_txs)); + LASSERT(sched->kss_nconns == 0); + } + } + + /* flag threads to terminate; wake and wait for them to die */ + ksocknal_data.ksnd_shuttingdown = 1; + wake_up_all(&ksocknal_data.ksnd_connd_waitq); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + if (ksocknal_data.ksnd_schedulers != NULL) { + cfs_percpt_for_each(sched, i, + ksocknal_data.ksnd_schedulers) + wake_up_all(&sched->kss_waitq); + } + + wait_var_event_warning(&ksocknal_data.ksnd_nthreads, + atomic_read(&ksocknal_data.ksnd_nthreads) == 0, + "waiting for %d threads to terminate\n", + atomic_read(&ksocknal_data.ksnd_nthreads)); + + ksocknal_free_buffers(); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %lld\n", + libcfs_kmem_read()); + + module_put(THIS_MODULE); +} + +static int +ksocknal_base_startup(void) +{ + struct ksock_sched *sched; + int rc; + int i; + + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + LASSERT(ksocknal_data.ksnd_nnets == 0); + + memset(&ksocknal_data, 0, sizeof(ksocknal_data)); /* zero pointers */ + + hash_init(ksocknal_data.ksnd_peers); + + rwlock_init(&ksocknal_data.ksnd_global_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_nets); + + spin_lock_init(&ksocknal_data.ksnd_reaper_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_enomem_conns); + INIT_LIST_HEAD(&ksocknal_data.ksnd_zombie_conns); + INIT_LIST_HEAD(&ksocknal_data.ksnd_deathrow_conns); + init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); + + spin_lock_init(&ksocknal_data.ksnd_connd_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_connreqs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_routes); + init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq); + + 
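wait_var_event_warning() used in ksocknal_base_shutdown() above is a libcfs helper; with the periodic warning stripped away, the teardown is the stock flag/wake/drain handshake. A minimal sketch of that handshake with the in-kernel wait_var_event() API and illustrative my_* names (the driver's exit-side hook is ksocknal_thread_fini(), whose body is not part of this hunk):

#include <linux/atomic.h>
#include <linux/wait.h>
#include <linux/wait_bit.h>

static atomic_t my_nthreads = ATOMIC_INIT(0);	/* live service threads */
static int my_shuttingdown;			/* polled by service threads */
static DECLARE_WAIT_QUEUE_HEAD(my_waitq);

static void my_thread_exit(void)	/* each service thread calls this on exit */
{
	if (atomic_dec_and_test(&my_nthreads))
		wake_up_var(&my_nthreads);
}

static void my_shutdown(void)
{
	my_shuttingdown = 1;		/* tell the threads to stop */
	wake_up_all(&my_waitq);		/* kick any thread sleeping on work */

	/* sleep until the last thread has run my_thread_exit() */
	wait_var_event(&my_nthreads, atomic_read(&my_nthreads) == 0);
}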
spin_lock_init(&ksocknal_data.ksnd_tx_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_noop_txs); + + /* NB memset above zeros whole of ksocknal_data */ + + /* flag lists/ptrs/locks initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; + if (!try_module_get(THIS_MODULE)) + goto failed; + + /* Create a scheduler block per available CPT */ + ksocknal_data.ksnd_schedulers = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*sched)); + if (ksocknal_data.ksnd_schedulers == NULL) + goto failed; + + cfs_percpt_for_each(sched, i, ksocknal_data.ksnd_schedulers) { + int nthrs; + + /* + * make sure not to allocate more threads than there are + * cores/CPUs in teh CPT + */ + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds); + } else { + /* + * max to half of CPUs, assume another half should be + * reserved for upper layer modules + */ + nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); + } + + sched->kss_nthreads_max = nthrs; + sched->kss_cpt = i; + + spin_lock_init(&sched->kss_lock); + INIT_LIST_HEAD(&sched->kss_rx_conns); + INIT_LIST_HEAD(&sched->kss_tx_conns); + INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); + init_waitqueue_head(&sched->kss_waitq); + } + + ksocknal_data.ksnd_connd_starting = 0; + ksocknal_data.ksnd_connd_failed_stamp = 0; + ksocknal_data.ksnd_connd_starting_stamp = ktime_get_real_seconds(); + /* must have at least 2 connds to remain responsive to accepts while + * connecting */ + if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1) + *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1; + + if (*ksocknal_tunables.ksnd_nconnds_max < + *ksocknal_tunables.ksnd_nconnds) { + ksocknal_tunables.ksnd_nconnds_max = + ksocknal_tunables.ksnd_nconnds; + } + + for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) { + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + ksocknal_data.ksnd_connd_starting++; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + + rc = ksocknal_thread_start(ksocknal_connd, + (void *)((uintptr_t)i), + "socknal_cd%02d", i); + if (rc != 0) { + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + ksocknal_data.ksnd_connd_starting--; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + CERROR("Can't spawn socknal connd: %d\n", rc); + goto failed; + } + } + + rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper"); + if (rc != 0) { + CERROR ("Can't spawn socknal reaper: %d\n", rc); + goto failed; + } + + register_netdevice_notifier(&ksocknal_dev_notifier_block); + register_inetaddr_notifier(&ksocknal_inetaddr_notifier_block); + + /* flag everything initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; + + return 0; + + failed: + ksocknal_base_shutdown(); + return -ENETDOWN; +} + +static int +ksocknal_debug_peerhash(struct lnet_ni *ni) +{ + struct ksock_peer_ni *peer_ni; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + hash_for_each(ksocknal_data.ksnd_peers, i, peer_ni, ksnp_list) { + struct ksock_conn_cb *conn_cb; + struct ksock_conn *conn; + + if (peer_ni->ksnp_ni != ni) + continue; + + CWARN("Active peer_ni on shutdown: %s, ref %d, closing %d, accepting %d, err %d, zcookie %llu, txq %d, zc_req %d\n", + libcfs_idstr(&peer_ni->ksnp_id), + refcount_read(&peer_ni->ksnp_refcount), + peer_ni->ksnp_closing, + peer_ni->ksnp_accepting, peer_ni->ksnp_error, + peer_ni->ksnp_zc_next_cookie, + !list_empty(&peer_ni->ksnp_tx_queue), + !list_empty(&peer_ni->ksnp_zc_req_list)); + + conn_cb = peer_ni->ksnp_conn_cb; + if (conn_cb) { + CWARN("ConnCB: ref %d, 
schd %d, conn %d, cnted %d, del %d\n", + refcount_read(&conn_cb->ksnr_refcount), + conn_cb->ksnr_scheduled, conn_cb->ksnr_connecting, + conn_cb->ksnr_connected, conn_cb->ksnr_deleted); + } + + list_for_each_entry(conn, &peer_ni->ksnp_conns, ksnc_list) { + CWARN("Conn: ref %d, sref %d, t %d, c %d\n", + refcount_read(&conn->ksnc_conn_refcount), + refcount_read(&conn->ksnc_sock_refcount), + conn->ksnc_type, conn->ksnc_closing); + } + break; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; +} + +void +ksocknal_shutdown(struct lnet_ni *ni) +{ + struct ksock_net *net = ni->ni_data; + struct lnet_process_id anyid = { + .nid = LNET_NID_ANY, + .pid = LNET_PID_ANY, + }; + + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL); + LASSERT(ksocknal_data.ksnd_nnets > 0); + + /* prevent new peers */ + atomic_add(SOCKNAL_SHUTDOWN_BIAS, &net->ksnn_npeers); + + /* Delete all peers */ + ksocknal_del_peer(ni, anyid, 0); + + /* Wait for all peer_ni state to clean up */ + wait_var_event_warning(&net->ksnn_npeers, + atomic_read(&net->ksnn_npeers) == + SOCKNAL_SHUTDOWN_BIAS, + "waiting for %d peers to disconnect\n", + ksocknal_debug_peerhash(ni) + + atomic_read(&net->ksnn_npeers) - + SOCKNAL_SHUTDOWN_BIAS); + + LASSERT(net->ksnn_interface.ksni_npeers == 0); + LASSERT(net->ksnn_interface.ksni_nroutes == 0); + + list_del(&net->ksnn_list); + LIBCFS_FREE(net, sizeof(*net)); + + ksocknal_data.ksnd_nnets--; + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); +} + +static int +ksocknal_search_new_ipif(struct ksock_net *net) +{ + int new_ipif = 0; + char *ifnam = &net->ksnn_interface.ksni_name[0]; + char *colon = strchr(ifnam, ':'); + bool found = false; + struct ksock_net *tmp; + + if (colon != NULL) + *colon = 0; + + list_for_each_entry(tmp, &ksocknal_data.ksnd_nets, ksnn_list) { + char *ifnam2 = &tmp->ksnn_interface.ksni_name[0]; + char *colon2 = strchr(ifnam2, ':'); + + if (colon2 != NULL) + *colon2 = 0; + + found = strcmp(ifnam, ifnam2) == 0; + if (colon2 != NULL) + *colon2 = ':'; + } + + new_ipif += !found; + if (colon != NULL) + *colon = ':'; + + return new_ipif; +} + +static int +ksocknal_start_schedulers(struct ksock_sched *sched) +{ + int nthrs; + int rc = 0; + int i; + + if (sched->kss_nthreads == 0) { + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = sched->kss_nthreads_max; + } else { + nthrs = cfs_cpt_weight(lnet_cpt_table(), + sched->kss_cpt); + nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); + nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs); + } + nthrs = min(nthrs, sched->kss_nthreads_max); + } else { + LASSERT(sched->kss_nthreads <= sched->kss_nthreads_max); + /* increase two threads if there is new interface */ + nthrs = min(2, sched->kss_nthreads_max - sched->kss_nthreads); + } + + for (i = 0; i < nthrs; i++) { + long id; + + id = KSOCK_THREAD_ID(sched->kss_cpt, sched->kss_nthreads + i); + rc = ksocknal_thread_start(ksocknal_scheduler, (void *)id, + "socknal_sd%02d_%02d", + sched->kss_cpt, + (int)KSOCK_THREAD_SID(id)); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + sched->kss_cpt, (int) KSOCK_THREAD_SID(id), rc); + break; + } + + sched->kss_nthreads += i; + return rc; +} + +static int +ksocknal_net_start_threads(struct ksock_net *net, __u32 *cpts, int ncpts) +{ + int newif = ksocknal_search_new_ipif(net); + int rc; + int i; + + if (ncpts > 0 && ncpts > cfs_cpt_number(lnet_cpt_table())) + return -EINVAL; + + for (i = 0; i < ncpts; i++) { + struct ksock_sched *sched; + int cpt = (cpts == NULL) ? 
i : cpts[i]; + + LASSERT(cpt < cfs_cpt_number(lnet_cpt_table())); + sched = ksocknal_data.ksnd_schedulers[cpt]; + + if (!newif && sched->kss_nthreads > 0) + continue; + + rc = ksocknal_start_schedulers(sched); + if (rc != 0) + return rc; + } + return 0; +} + +int +ksocknal_startup(struct lnet_ni *ni) +{ + struct ksock_net *net; + struct ksock_interface *ksi = NULL; + struct lnet_inetdev *ifaces = NULL; + struct sockaddr_in *sa; + int i = 0; + int rc; + + LASSERT (ni->ni_net->net_lnd == &the_ksocklnd); + if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) { + rc = ksocknal_base_startup(); + if (rc != 0) + return rc; + } + LIBCFS_ALLOC(net, sizeof(*net)); + if (net == NULL) + goto fail_0; + net->ksnn_incarnation = ktime_get_real_ns(); + ni->ni_data = net; + + ksocknal_tunables_setup(ni); + + rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns); + if (rc < 0) + goto fail_1; + + ksi = &net->ksnn_interface; + + /* Use the first discovered interface or look in the list */ + if (ni->ni_interface) { + for (i = 0; i < rc; i++) + if (strcmp(ifaces[i].li_name, ni->ni_interface) == 0) + break; + + /* ni_interfaces doesn't contain the interface we want */ + if (i == rc) { + CERROR("ksocklnd: failed to find interface %s\n", + ni->ni_interface); + goto fail_1; + } + } + + ni->ni_dev_cpt = ifaces[i].li_cpt; + sa = (void *)&ksi->ksni_addr; + memset(sa, 0, sizeof(*sa)); + sa->sin_family = AF_INET; + sa->sin_addr.s_addr = htonl(ifaces[i].li_ipaddr); + ksi->ksni_index = ksocknal_ip2index((struct sockaddr *)sa, ni); + ksi->ksni_netmask = ifaces[i].li_netmask; + strlcpy(ksi->ksni_name, ifaces[i].li_name, sizeof(ksi->ksni_name)); + + /* call it before add it to ksocknal_data.ksnd_nets */ + rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto fail_1; + + LASSERT(ksi); + LASSERT(ksi->ksni_addr.ss_family == AF_INET); + ni->ni_nid.nid_addr[0] = + ((struct sockaddr_in *)&ksi->ksni_addr)->sin_addr.s_addr; + list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets); + net->ksnn_ni = ni; + ksocknal_data.ksnd_nnets++; + + return 0; + +fail_1: + LIBCFS_FREE(net, sizeof(*net)); +fail_0: + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); + + return -ENETDOWN; +} + +static void __exit ksocklnd_exit(void) +{ + lnet_unregister_lnd(&the_ksocklnd); +} + +static const struct lnet_lnd the_ksocklnd = { + .lnd_type = SOCKLND, + .lnd_startup = ksocknal_startup, + .lnd_shutdown = ksocknal_shutdown, + .lnd_ctl = ksocknal_ctl, + .lnd_send = ksocknal_send, + .lnd_recv = ksocknal_recv, + .lnd_notify_peer_down = ksocknal_notify_gw_down, + .lnd_accept = ksocknal_accept, +}; + +static int __init ksocklnd_init(void) +{ + int rc; + + /* check ksnr_connected/connecting field large enough */ + BUILD_BUG_ON(SOCKLND_CONN_NTYPES > 4); + BUILD_BUG_ON(SOCKLND_CONN_ACK != SOCKLND_CONN_BULK_IN); + + rc = ksocknal_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_ksocklnd); + + return 0; +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("TCP Socket LNet Network Driver"); +MODULE_VERSION("2.8.0"); +MODULE_LICENSE("GPL"); + +module_init(ksocklnd_init); +module_exit(ksocklnd_exit); diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h new file mode 100644 index 0000000000000..81db1c3a3e2b2 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h @@ -0,0 +1,682 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef _SOCKLND_SOCKLND_H_ +#define _SOCKLND_SOCKLND_H_ + +#define DEBUG_PORTAL_ALLOC +#define DEBUG_SUBSYSTEM S_LND + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#ifndef NETIF_F_CSUM_MASK +# define NETIF_F_CSUM_MASK NETIF_F_ALL_CSUM +#endif + +/* assume one thread for each connection type */ +#define SOCKNAL_NSCHEDS 3 +#define SOCKNAL_NSCHEDS_HIGH (SOCKNAL_NSCHEDS << 1) + +#define SOCKNAL_PEER_HASH_BITS 7 /* log2 of # peer_ni lists */ +#define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */ +#define SOCKNAL_ENOMEM_RETRY 1 /* seconds between retries */ + +#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */ +#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */ + +#define SOCKNAL_VERSION_DEBUG 0 /* enable protocol version debugging */ + +/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled). + * no risk if we're not running on a CONFIG_HIGHMEM platform. */ +#ifdef CONFIG_HIGHMEM +# define SOCKNAL_RISK_KMAP_DEADLOCK 0 +#else +# define SOCKNAL_RISK_KMAP_DEADLOCK 1 +#endif + +/* per scheduler state */ +struct ksock_sched { + /* serialise */ + spinlock_t kss_lock; + /* conn waiting to be written */ + struct list_head kss_rx_conns; + struct list_head kss_tx_conns; + /* zombie noop tx list */ + struct list_head kss_zombie_noop_txs; + /* where scheduler sleeps */ + wait_queue_head_t kss_waitq; + /* # connections assigned to this scheduler */ + int kss_nconns; + /* max allowed threads */ + int kss_nthreads_max; + /* number of threads */ + int kss_nthreads; + /* CPT id */ + int kss_cpt; +}; + +#define KSOCK_CPT_SHIFT 16 +#define KSOCK_THREAD_ID(cpt, sid) (((cpt) << KSOCK_CPT_SHIFT) | (sid)) +#define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT) +#define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1)) + +struct ksock_interface { /* in-use interface */ + int ksni_index; /* Linux interface index */ + struct sockaddr_storage ksni_addr; /* interface's address */ + __u32 ksni_netmask; /* interface's network mask */ + int ksni_nroutes; /* # routes using (active) */ + int ksni_npeers; /* # peers using (passive) */ + char ksni_name[IFNAMSIZ]; /* interface name */ +}; + +struct ksock_tunables { + /* "stuck" socket timeout (seconds) */ + int *ksnd_timeout; + /* # scheduler threads in each pool while starting */ + int *ksnd_nscheds; + int *ksnd_nconnds; /* # connection daemons */ + int *ksnd_nconnds_max; /* max # connection daemons */ + int *ksnd_min_reconnectms; /* first connection retry after (ms)... 
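KSOCK_THREAD_ID() above packs a scheduler thread's CPT and its per-CPT sequence number into the single long that ksocknal_start_schedulers() later hands to ksocknal_thread_start(), and the two companion macros unpack it again for the thread name. A self-contained userspace demo of the arithmetic (the macros are copied from the header; main() and the sample values are illustrative):

#include <stdio.h>

#define KSOCK_CPT_SHIFT		16
#define KSOCK_THREAD_ID(cpt, sid)	(((cpt) << KSOCK_CPT_SHIFT) | (sid))
#define KSOCK_THREAD_CPT(id)		((id) >> KSOCK_CPT_SHIFT)
#define KSOCK_THREAD_SID(id)		((id) & ((1UL << KSOCK_CPT_SHIFT) - 1))

int main(void)
{
	long id = KSOCK_THREAD_ID(3, 7);	/* thread 7 of the CPT 3 scheduler */

	/* prints: id=0x30007 cpt=3 sid=7 */
	printf("id=%#lx cpt=%ld sid=%ld\n", (unsigned long)id,
	       (long)KSOCK_THREAD_CPT(id), (long)KSOCK_THREAD_SID(id));
	return 0;
}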
*/ + int *ksnd_max_reconnectms; /* ...exponentially increasing to this */ + int *ksnd_eager_ack; /* make TCP ack eagerly? */ + int *ksnd_typed_conns; /* drive sockets by type? */ + int *ksnd_min_bulk; /* smallest "large" message */ + int *ksnd_tx_buffer_size; /* socket tx buffer size */ + int *ksnd_rx_buffer_size; /* socket rx buffer size */ + int *ksnd_nagle; /* enable NAGLE? */ + int *ksnd_round_robin; /* round robin for multiple interfaces */ + int *ksnd_keepalive; /* # secs for sending keepalive NOOP */ + int *ksnd_keepalive_idle; /* # idle secs before 1st probe */ + int *ksnd_keepalive_count; /* # probes */ + int *ksnd_keepalive_intvl; /* time between probes */ + int *ksnd_credits; /* # concurrent sends */ + int *ksnd_peertxcredits; /* # concurrent sends to 1 peer_ni */ + int *ksnd_peerrtrcredits; /* # per-peer_ni router buffer credits */ + int *ksnd_peertimeout; /* seconds to consider peer_ni dead */ + int *ksnd_enable_csum; /* enable check sum */ + int *ksnd_inject_csum_error; /* set non-zero to inject checksum error */ + int *ksnd_nonblk_zcack; /* always send zc-ack on non-blocking connection */ + unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload size */ + int *ksnd_zc_recv; /* enable ZC receive (for Chelsio TOE) */ + int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */ + int *ksnd_irq_affinity; /* enable IRQ affinity? */ +#ifdef SOCKNAL_BACKOFF + int *ksnd_backoff_init; /* initial TCP backoff */ + int *ksnd_backoff_max; /* maximum TCP backoff */ +#endif +#if SOCKNAL_VERSION_DEBUG + int *ksnd_protocol; /* protocol version */ +#endif + int *ksnd_conns_per_peer; /* for typed mode, yields: + * 1 + 2*conns_per_peer total + * for untyped: + * conns_per_peer total + */ +}; + +struct ksock_net { + __u64 ksnn_incarnation; /* my epoch */ + struct list_head ksnn_list; /* chain on global list */ + atomic_t ksnn_npeers; /* # peers */ + struct ksock_interface ksnn_interface; /* IP interface */ + struct lnet_ni *ksnn_ni; +}; +/* When the ksock_net is shut down, this (negative) bias is added to + * ksnn_npeers, which prevents new peers from being added. 
+ */ +#define SOCKNAL_SHUTDOWN_BIAS (INT_MIN+1) + +/** connd timeout */ +#define SOCKNAL_CONND_TIMEOUT 120 +/** reserved thread for accepting & creating new connd */ +#define SOCKNAL_CONND_RESV 1 + +struct ksock_nal_data { + int ksnd_init; /* initialisation state */ + int ksnd_nnets; /* # networks set up */ + struct list_head ksnd_nets; /* list of nets */ + /* stabilize peer_ni/conn ops */ + rwlock_t ksnd_global_lock; + /* hash table of all my known peers */ + DECLARE_HASHTABLE(ksnd_peers, SOCKNAL_PEER_HASH_BITS); + + atomic_t ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + /* schedulers information */ + struct ksock_sched **ksnd_schedulers; + + atomic_t ksnd_nactive_txs; /* #active txs */ + + /* conns to close: reaper_lock*/ + struct list_head ksnd_deathrow_conns; + /* conns to free: reaper_lock */ + struct list_head ksnd_zombie_conns; + /* conns to retry: reaper_lock*/ + struct list_head ksnd_enomem_conns; + /* reaper sleeps here */ + wait_queue_head_t ksnd_reaper_waitq; + /* when reaper will wake */ + time64_t ksnd_reaper_waketime; + /* serialise */ + spinlock_t ksnd_reaper_lock; + + int ksnd_enomem_tx; /* test ENOMEM sender */ + int ksnd_stall_tx; /* test sluggish sender */ + int ksnd_stall_rx; /* test sluggish receiver */ + + /* incoming connection requests */ + struct list_head ksnd_connd_connreqs; + /* routes waiting to be connected */ + struct list_head ksnd_connd_routes; + /* connds sleep here */ + wait_queue_head_t ksnd_connd_waitq; + /* # connds connecting */ + int ksnd_connd_connecting; + /** time stamp of the last failed connecting attempt */ + time64_t ksnd_connd_failed_stamp; + /** # starting connd */ + unsigned ksnd_connd_starting; + /** time stamp of the last starting connd */ + time64_t ksnd_connd_starting_stamp; + /** # running connd */ + unsigned ksnd_connd_running; + /* serialise */ + spinlock_t ksnd_connd_lock; + + /* list head for freed noop tx */ + struct list_head ksnd_idle_noop_txs; + /* serialise, g_lock unsafe */ + spinlock_t ksnd_tx_lock; +}; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_ALL 2 + +/* A packet just assembled for transmission is represented by 1 + * struct iovec fragment - the portals header - followed by 0 + * or more struct bio_vec fragments. + * + * On the receive side, initially 1 struct kvec fragment is posted for + * receive (the header). Once the header has been received, the payload is + * received into struct bio_vec fragments. + */ +struct ksock_conn; /* forward ref */ +struct ksock_conn_cb; /* forward ref */ +struct ksock_proto; /* forward ref */ + +struct ksock_tx { /* transmit packet */ + struct list_head tx_list; /* queue on conn for transmission etc */ + struct list_head tx_zc_list; /* queue on peer_ni for ZC request */ + refcount_t tx_refcount; /* tx reference count */ + int tx_nob; /* # packet bytes */ + int tx_resid; /* residual bytes */ + int tx_niov; /* # packet kvec frags */ + int tx_nkiov; /* # packet page frags */ + unsigned short tx_zc_aborted; /* aborted ZC request */ + unsigned short tx_zc_capable:1; /* payload is large enough for ZC */ + unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? 
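SOCKNAL_SHUTDOWN_BIAS is the whole trick behind the peer drain in ksocknal_shutdown() earlier in socklnd.c: a single atomic_add() drives ksnn_npeers deeply negative so new peers are refused, and "all peers gone" becomes the exact test ksnn_npeers == SOCKNAL_SHUTDOWN_BIAS. A small userspace model of the arithmetic (a plain int stands in for the atomic_t, and the peer count is illustrative):

#include <limits.h>
#include <stdio.h>

#define SOCKNAL_SHUTDOWN_BIAS	(INT_MIN + 1)

int main(void)
{
	int npeers = 2;				/* two peers still attached */

	npeers += SOCKNAL_SHUTDOWN_BIAS;	/* shutdown starts: bias the counter */
	printf("peers left: %d\n", npeers - SOCKNAL_SHUTDOWN_BIAS);	/* 2 */

	npeers--;				/* one peer torn down */
	npeers--;				/* the last peer torn down */

	/* the shutdown path may proceed once only the bare bias remains */
	printf("drained: %s\n", npeers == SOCKNAL_SHUTDOWN_BIAS ? "yes" : "no");
	return 0;
}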
*/ + unsigned short tx_nonblk:1; /* it's a non-blocking ACK */ + struct bio_vec *tx_kiov; /* packet page frags */ + struct ksock_conn *tx_conn; /* owning conn */ + struct lnet_msg *tx_lnetmsg; /* lnet message for lnet_finalize() */ + time64_t tx_deadline; /* when (in secs) tx times out */ + struct ksock_msg tx_msg; /* socklnd message buffer */ + int tx_desc_size; /* size of this descriptor */ + enum lnet_msg_hstatus tx_hstatus; /* health status of tx */ + struct kvec tx_hdr; /* virt hdr */ + struct bio_vec tx_payload[0]; /* paged payload */ +}; + +#define KSOCK_NOOP_TX_SIZE ((int)offsetof(struct ksock_tx, tx_payload[0])) + +/* space for the rx frag descriptors; we either read a single contiguous + * header, or up to LNET_MAX_IOV frags of payload of either type. */ +union ksock_rxiovspace { + struct kvec iov[LNET_MAX_IOV]; + struct bio_vec kiov[LNET_MAX_IOV]; +}; + +#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */ +#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */ +#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */ +#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */ +#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */ +#define SOCKNAL_RX_SLOP 6 /* skipping body */ + +struct ksock_conn { + struct ksock_peer_ni *ksnc_peer; /* owning peer_ni */ + struct ksock_conn_cb *ksnc_conn_cb; /* owning conn control block */ + struct list_head ksnc_list; /* on peer_ni's conn list */ + struct socket *ksnc_sock; /* actual socket */ + void *ksnc_saved_data_ready; /* socket's original + * data_ready() cb */ + void *ksnc_saved_write_space; /* socket's original + * write_space() cb */ + refcount_t ksnc_conn_refcount; /* conn refcount */ + refcount_t ksnc_sock_refcount; /* sock refcount */ + struct ksock_sched *ksnc_scheduler; /* who schedules this + * connection */ + struct sockaddr_storage ksnc_myaddr; /* my address */ + struct sockaddr_storage ksnc_peeraddr; /* peer_ni's address */ + signed int ksnc_type:3; /* type of connection, + * should be signed + * value */ + unsigned int ksnc_closing:1; /* being shut down */ + unsigned int ksnc_flip:1; /* flip or not, only for V2.x */ + unsigned int ksnc_zc_capable:1; /* enable to ZC */ + const struct ksock_proto *ksnc_proto; /* protocol for the connection */ + + /* READER */ + + /* where I enq waiting input or a forwarding descriptor */ + struct list_head ksnc_rx_list; + time64_t ksnc_rx_deadline; /* when (in seconds) receive times out */ + __u8 ksnc_rx_started; /* started receiving a message */ + __u8 ksnc_rx_ready; /* data ready to read */ + __u8 ksnc_rx_scheduled;/* being progressed */ + __u8 ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # kvec frags */ + struct kvec *ksnc_rx_iov; /* the kvec frags */ + int ksnc_rx_nkiov; /* # page frags */ + struct bio_vec *ksnc_rx_kiov; /* the page frags */ + union ksock_rxiovspace ksnc_rx_iov_space;/* space for frag descriptors */ + __u32 ksnc_rx_csum; /* partial checksum for incoming + * data */ + struct lnet_msg *ksnc_lnet_msg; /* rx lnet_finalize arg*/ + struct ksock_msg ksnc_msg; /* incoming message buffer: + * V2.x message takes the + * whole struct + * V1.x message is a bare + * struct lnet_hdr_nid4, it's + * stored in + * ksnc_msg.ksm_u.lnetmsg + */ + /* -- WRITER -- */ + /* where I enq waiting for output space */ + struct list_head ksnc_tx_list; + /* packets waiting to be sent */ + struct list_head 
ksnc_tx_queue; + /* next TX that can carry a LNet message or ZC-ACK */ + struct ksock_tx *ksnc_tx_carrier; + /* when (in seconds) tx times out */ + time64_t ksnc_tx_deadline; + /* send buffer marker */ + int ksnc_tx_bufnob; + /* # bytes queued */ + atomic_t ksnc_tx_nob; + /* write space */ + int ksnc_tx_ready; + /* being progressed */ + int ksnc_tx_scheduled; + /* time stamp of the last posted TX */ + time64_t ksnc_tx_last_post; +}; + +#define SOCKNAL_CONN_COUNT_MAX_BITS 8 /* max conn count bits */ + +struct ksock_conn_cb { + struct list_head ksnr_connd_list;/* chain on ksnr_connd_routes */ + struct ksock_peer_ni *ksnr_peer; /* owning peer_ni */ + refcount_t ksnr_refcount; /* # users */ + time64_t ksnr_timeout; /* when (in secs) reconnection + * can happen next + */ + time64_t ksnr_retry_interval;/* secs between retries */ + int ksnr_myiface; /* interface index */ + struct sockaddr_storage ksnr_addr; /* IP address to connect to */ + unsigned int ksnr_scheduled:1;/* scheduled for attention */ + unsigned int ksnr_connecting:1;/* connection in progress */ + unsigned int ksnr_connected:4;/* connections by type */ + unsigned int ksnr_deleted:1; /* been removed from peer_ni? */ + unsigned int ksnr_ctrl_conn_count:2; /* # conns by type */ + unsigned int ksnr_blki_conn_count:8; + unsigned int ksnr_blko_conn_count:8; + int ksnr_conn_count;/* total # conns for this cb */ + unsigned int ksnr_max_conns; /* conns_per_peer at peer + * creation + */ +}; + +#define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */ + +struct ksock_peer_ni { + struct hlist_node ksnp_list; /* stash on global peer_ni list */ + time64_t ksnp_last_alive;/* when (in seconds) I was last alive */ + struct lnet_processid ksnp_id; /* who's on the other end(s) */ + refcount_t ksnp_refcount; /* # users */ + int ksnp_closing; /* being closed */ + int ksnp_accepting; /* # passive connections pending */ + int ksnp_error; /* errno on closing last conn */ + __u64 ksnp_zc_next_cookie;/* ZC completion cookie */ + __u64 ksnp_incarnation; /* latest known peer_ni incarnation */ + const struct ksock_proto *ksnp_proto; /* latest known protocol */ + struct list_head ksnp_conns; /* all active connections */ + struct ksock_conn_cb *ksnp_conn_cb; /* conn control block */ + struct list_head ksnp_tx_queue; /* waiting packets */ + spinlock_t ksnp_lock; /* serialize, g_lock unsafe */ + /* zero copy requests wait for ACK */ + struct list_head ksnp_zc_req_list; + time64_t ksnp_send_keepalive; /* time to send keepalive */ + struct lnet_ni *ksnp_ni; /* which network */ + int ksnp_n_passive_ips; /* # of... 
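ksnr_connected above is a small bitmask with one bit per SOCKLND_CONN_* type, which is why ksocklnd_init() earlier insists on BUILD_BUG_ON(SOCKLND_CONN_NTYPES > 4); the launch path in socklnd_cb.c asserts that ksocknal_conn_cb_mask() & ~ksnr_connected is still non-zero before scheduling a connect. A userspace illustration of that test; the numeric type values are an assumption here, chosen to match lnet/socklnd.h, which this patch does not show:

#include <stdio.h>

#define SOCKLND_CONN_ANY	0	/* assumed values, see lead-in */
#define SOCKLND_CONN_CONTROL	1
#define SOCKLND_CONN_BULK_IN	2
#define SOCKLND_CONN_BULK_OUT	3
#define BIT(n)			(1U << (n))

int main(void)
{
	/* typed mode wants one control, one bulk-in and one bulk-out conn */
	unsigned int wanted = BIT(SOCKLND_CONN_CONTROL) |
			      BIT(SOCKLND_CONN_BULK_IN) |
			      BIT(SOCKLND_CONN_BULK_OUT);
	/* so far only the control connection is established */
	unsigned int connected = BIT(SOCKLND_CONN_CONTROL);

	/* same shape as the check in ksocknal_launch_connection_locked():
	 * keep launching while a wanted type is still missing.
	 * prints: still missing: 0xc */
	printf("still missing: %#x\n", wanted & ~connected & 0xfU);
	return 0;
}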
*/ + __u32 ksnp_passive_ips[LNET_INTERFACES_NUM]; /* preferred local interfaces */ +}; + +struct ksock_connreq { + /* stash on ksnd_connd_connreqs */ + struct list_head ksncr_list; + /* chosen NI */ + struct lnet_ni *ksncr_ni; + /* accepted socket */ + struct socket *ksncr_sock; +}; + +extern struct ksock_nal_data ksocknal_data; +extern struct ksock_tunables ksocknal_tunables; + +#define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */ +#define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */ +#define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not preferred */ + +struct ksock_proto { + int pro_version; /* version number of protocol */ + int (*pro_send_hello)(struct ksock_conn *, struct ksock_hello_msg *); /* handshake function */ + int (*pro_recv_hello)(struct ksock_conn *, struct ksock_hello_msg *, int);/* handshake function */ + void (*pro_pack)(struct ksock_tx *); /* message pack */ + void (*pro_unpack)(struct ksock_msg *, struct lnet_hdr *); /* message unpack */ + struct ksock_tx *(*pro_queue_tx_msg)(struct ksock_conn *, struct ksock_tx *); /* queue tx on the connection */ + int (*pro_queue_tx_zcack)(struct ksock_conn *, struct ksock_tx *, __u64); /* queue ZC ack on the connection */ + int (*pro_handle_zcreq)(struct ksock_conn *, __u64, int); /* handle ZC request */ + int (*pro_handle_zcack)(struct ksock_conn *, __u64, __u64); /* handle ZC ACK */ + int (*pro_match_tx)(struct ksock_conn *, struct ksock_tx *, int); /* msg type matches the connection type: + * return value: + * return MATCH_NO : no + * return MATCH_YES : matching type + * return MATCH_MAY : can be backup */ +}; + +extern const struct ksock_proto ksocknal_protocol_v1x; +extern const struct ksock_proto ksocknal_protocol_v2x; +extern const struct ksock_proto ksocknal_protocol_v3x; +extern const struct ksock_proto ksocknal_protocol_v4x; + +#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR +#define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR +#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR + +#ifndef CPU_MASK_NONE +#define CPU_MASK_NONE 0UL +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) +#undef netdev_notifier_info_to_dev +#define netdev_notifier_info_to_dev(ndev) ndev +#endif + +static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len) +{ +#if 1 + return crc32_le(crc, p, len); +#else + while (len-- > 0) + crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ; + + return crc; +#endif +} + +static inline int +ksocknal_conn_cb_mask(void) +{ + if (!*ksocknal_tunables.ksnd_typed_conns) + return BIT(SOCKLND_CONN_ANY); + + return (BIT(SOCKLND_CONN_CONTROL) | + BIT(SOCKLND_CONN_BULK_IN) | + BIT(SOCKLND_CONN_BULK_OUT)); +} + +static inline void +ksocknal_conn_addref(struct ksock_conn *conn) +{ + refcount_inc(&conn->ksnc_conn_refcount); +} + +extern void ksocknal_queue_zombie_conn(struct ksock_conn *conn); +extern void ksocknal_finalize_zcreq(struct ksock_conn *conn); + +static inline void +ksocknal_conn_decref(struct ksock_conn *conn) +{ + if (refcount_dec_and_test(&conn->ksnc_conn_refcount)) + ksocknal_queue_zombie_conn(conn); +} + +static inline int +ksocknal_connsock_addref(struct ksock_conn *conn) +{ + int rc = -ESHUTDOWN; + + read_lock(&ksocknal_data.ksnd_global_lock); + if (!conn->ksnc_closing) { + refcount_inc(&conn->ksnc_sock_refcount); + rc = 0; + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + return (rc); +} + +static inline void +ksocknal_connsock_decref(struct ksock_conn *conn) +{ + if 
(refcount_dec_and_test(&conn->ksnc_sock_refcount)) { + LASSERT (conn->ksnc_closing); + sock_release(conn->ksnc_sock); + conn->ksnc_sock = NULL; + ksocknal_finalize_zcreq(conn); + } +} + +static inline void +ksocknal_tx_addref(struct ksock_tx *tx) +{ + refcount_inc(&tx->tx_refcount); +} + +extern void ksocknal_tx_prep(struct ksock_conn *, struct ksock_tx *tx); +extern void ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int error); + +static inline void +ksocknal_tx_decref(struct ksock_tx *tx) +{ + if (refcount_dec_and_test(&tx->tx_refcount)) + ksocknal_tx_done(NULL, tx, 0); +} + +static inline void +ksocknal_conn_cb_addref(struct ksock_conn_cb *conn_cb) +{ + refcount_inc(&conn_cb->ksnr_refcount); +} + +extern void ksocknal_destroy_conn_cb(struct ksock_conn_cb *conn_cb); + +static inline void +ksocknal_conn_cb_decref(struct ksock_conn_cb *conn_cb) +{ + if (refcount_dec_and_test(&conn_cb->ksnr_refcount)) + ksocknal_destroy_conn_cb(conn_cb); +} + +static inline void +ksocknal_peer_addref(struct ksock_peer_ni *peer_ni) +{ + refcount_inc(&peer_ni->ksnp_refcount); +} + +extern void ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni); + +static inline void +ksocknal_peer_decref(struct ksock_peer_ni *peer_ni) +{ + if (refcount_dec_and_test(&peer_ni->ksnp_refcount)) + ksocknal_destroy_peer(peer_ni); +} + +static inline int ksocknal_timeout(void) +{ + return *ksocknal_tunables.ksnd_timeout ?: lnet_get_lnd_timeout(); +} + +static inline int ksocknal_conns_per_peer(void) +{ + return *ksocknal_tunables.ksnd_conns_per_peer ?: 1; +} + +int ksocknal_startup(struct lnet_ni *ni); +void ksocknal_shutdown(struct lnet_ni *ni); +int ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg); +int ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); +int ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, + struct bio_vec *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int ksocknal_accept(struct lnet_ni *ni, struct socket *sock); + +int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, + struct sockaddr *addr); +struct ksock_peer_ni *ksocknal_find_peer_locked(struct lnet_ni *ni, + struct lnet_processid *id); +struct ksock_peer_ni *ksocknal_find_peer(struct lnet_ni *ni, + struct lnet_processid *id); +extern void ksocknal_peer_failed(struct ksock_peer_ni *peer_ni); +extern int ksocknal_create_conn(struct lnet_ni *ni, + struct ksock_conn_cb *conn_cb, + struct socket *sock, int type); +extern void ksocknal_close_conn_locked(struct ksock_conn *conn, int why); +extern void ksocknal_terminate_conn(struct ksock_conn *conn); +extern void ksocknal_destroy_conn(struct ksock_conn *conn); +extern int ksocknal_close_peer_conns_locked(struct ksock_peer_ni *peer_ni, + struct sockaddr *peer, int why); +extern int ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why); +int ksocknal_close_matching_conns(struct lnet_processid *id, __u32 ipaddr); +extern struct ksock_conn *ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, + struct ksock_tx *tx, int nonblk); + +extern int ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, + struct lnet_processid *id); +extern struct ksock_tx *ksocknal_alloc_tx(int type, int size); +extern void ksocknal_free_tx(struct ksock_tx *tx); +extern struct ksock_tx *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); +extern void ksocknal_next_tx_carrier(struct ksock_conn *conn); +extern void ksocknal_queue_tx_locked(struct ksock_tx *tx, struct 
ksock_conn *conn); +extern void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, + int error); +#define ksocknal_thread_start(fn, data, namefmt, arg...) \ + ({ \ + struct task_struct *__task = kthread_run(fn, data, \ + namefmt, ##arg); \ + if (!IS_ERR(__task)) \ + atomic_inc(&ksocknal_data.ksnd_nthreads); \ + PTR_ERR_OR_ZERO(__task); \ + }) + +extern void ksocknal_thread_fini(void); +extern void ksocknal_launch_all_connections_locked(struct ksock_peer_ni *peer_ni); +extern struct ksock_conn_cb *ksocknal_find_connectable_conn_cb_locked(struct ksock_peer_ni *peer_ni); +extern struct ksock_conn_cb *ksocknal_find_connecting_conn_cb_locked(struct ksock_peer_ni *peer_ni); +extern int ksocknal_new_packet(struct ksock_conn *conn, int skip); +extern int ksocknal_scheduler(void *arg); +extern int ksocknal_connd(void *arg); +extern int ksocknal_reaper(void *arg); +int ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, + struct lnet_nid *peer_nid, + struct ksock_hello_msg *hello); +int ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, + struct ksock_hello_msg *hello, + struct lnet_processid *id, + __u64 *incarnation); +extern void ksocknal_read_callback(struct ksock_conn *conn); +extern void ksocknal_write_callback(struct ksock_conn *conn); + +extern int ksocknal_lib_zc_capable(struct ksock_conn *conn); +extern void ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn); +extern void ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn); +extern void ksocknal_lib_reset_callback(struct socket *sock, + struct ksock_conn *conn); +extern void ksocknal_lib_push_conn(struct ksock_conn *conn); +extern int ksocknal_lib_get_conn_addrs(struct ksock_conn *conn); +extern int ksocknal_lib_setup_sock(struct socket *so); +extern int ksocknal_lib_send_hdr(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov); +extern int ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov); +extern void ksocknal_lib_eager_ack(struct ksock_conn *conn); +extern int ksocknal_lib_recv_iov(struct ksock_conn *conn, + struct kvec *scratchiov); +extern int ksocknal_lib_recv_kiov(struct ksock_conn *conn, struct page **pages, + struct kvec *scratchiov); +extern int ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, + int *rxmem, int *nagle); + +extern int ksocknal_tunables_init(void); +extern void ksocknal_tunables_setup(struct lnet_ni *ni); + +extern void ksocknal_lib_csum_tx(struct ksock_tx *tx); + +extern int ksocknal_lib_memory_pressure(struct ksock_conn *conn); + +#endif /* _SOCKLND_SOCKLND_H_ */ diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c new file mode 100644 index 0000000000000..4e7a61b3f5751 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c @@ -0,0 +1,2694 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include "socklnd.h" +#include + +struct ksock_tx * +ksocknal_alloc_tx(int type, int size) +{ + struct ksock_tx *tx = NULL; + + if (type == KSOCK_MSG_NOOP) { + LASSERT(size == KSOCK_NOOP_TX_SIZE); + + /* searching for a noop tx in free list */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + tx = list_first_entry_or_null(&ksocknal_data.ksnd_idle_noop_txs, + struct ksock_tx, tx_list); + if (tx) { + LASSERT(tx->tx_desc_size == size); + list_del(&tx->tx_list); + } + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } + + if (tx == NULL) + LIBCFS_ALLOC(tx, size); + + if (tx == NULL) + return NULL; + + refcount_set(&tx->tx_refcount, 1); + tx->tx_zc_aborted = 0; + tx->tx_zc_capable = 0; + tx->tx_zc_checked = 0; + tx->tx_hstatus = LNET_MSG_STATUS_OK; + tx->tx_desc_size = size; + + atomic_inc(&ksocknal_data.ksnd_nactive_txs); + + return tx; +} + +struct ksock_tx * +ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) +{ + struct ksock_tx *tx; + + tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE); + if (tx == NULL) { + CERROR("Can't allocate noop tx desc\n"); + return NULL; + } + + tx->tx_conn = NULL; + tx->tx_lnetmsg = NULL; + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_niov = 1; + tx->tx_nonblk = nonblk; + + tx->tx_msg.ksm_csum = 0; + tx->tx_msg.ksm_type = KSOCK_MSG_NOOP; + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_msg.ksm_zc_cookies[1] = cookie; + + return tx; +} + + +void +ksocknal_free_tx(struct ksock_tx *tx) +{ + atomic_dec(&ksocknal_data.ksnd_nactive_txs); + + if (tx->tx_lnetmsg == NULL && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) { + /* it's a noop tx */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs); + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } else { + LIBCFS_FREE(tx, tx->tx_desc_size); + } +} + +static int +ksocknal_send_hdr(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) +{ + struct kvec *iov = &tx->tx_hdr; + int nob; + int rc; + + LASSERT(tx->tx_niov > 0); + + /* Never touch tx->tx_hdr inside ksocknal_lib_send_hdr() */ + rc = ksocknal_lib_send_hdr(conn, tx, scratch_iov); + + if (rc <= 0) /* sent nothing? */ + return rc; + + nob = rc; + LASSERT(nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" iov */ + LASSERT(tx->tx_niov == 1); + + if (nob < (int) iov->iov_len) { + iov->iov_base += nob; + iov->iov_len -= nob; + return rc; + } + + LASSERT(nob == iov->iov_len); + tx->tx_niov--; + + return rc; +} + +static int +ksocknal_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) +{ + struct bio_vec *kiov = tx->tx_kiov; + int nob; + int rc; + + LASSERT(tx->tx_niov == 0); + LASSERT(tx->tx_nkiov > 0); + + /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ + rc = ksocknal_lib_send_kiov(conn, tx, scratch_iov); + + if (rc <= 0) /* sent nothing? 
*/ + return rc; + + nob = rc; + LASSERT(nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" kiov */ + do { + LASSERT(tx->tx_nkiov > 0); + + if (nob < (int)kiov->bv_len) { + kiov->bv_offset += nob; + kiov->bv_len -= nob; + return rc; + } + + nob -= (int)kiov->bv_len; + tx->tx_kiov = ++kiov; + tx->tx_nkiov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_transmit(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) +{ + int rc; + int bufnob; + + if (ksocknal_data.ksnd_stall_tx != 0) + schedule_timeout_uninterruptible( + cfs_time_seconds(ksocknal_data.ksnd_stall_tx)); + + LASSERT(tx->tx_resid != 0); + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT(conn->ksnc_closing); + return -ESHUTDOWN; + } + + do { + if (ksocknal_data.ksnd_enomem_tx > 0) { + /* testing... */ + ksocknal_data.ksnd_enomem_tx--; + rc = -EAGAIN; + } else if (tx->tx_niov != 0) { + rc = ksocknal_send_hdr(conn, tx, scratch_iov); + } else { + rc = ksocknal_send_kiov(conn, tx, scratch_iov); + } + + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + if (rc > 0) /* sent something? */ + conn->ksnc_tx_bufnob += rc; /* account it */ + + if (bufnob < conn->ksnc_tx_bufnob) { + /* allocated send buffer bytes < computed; infer + * something got ACKed */ + conn->ksnc_tx_deadline = ktime_get_seconds() + + ksocknal_timeout(); + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_tx_bufnob = bufnob; + smp_mb(); + } + + if (rc <= 0) { /* Didn't write anything? */ + /* some stacks return 0 instead of -EAGAIN */ + if (rc == 0) + rc = -EAGAIN; + + /* Check if EAGAIN is due to memory pressure */ + if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) + rc = -ENOMEM; + + break; + } + + /* socket's wmem_queued now includes 'rc' bytes */ + atomic_sub (rc, &conn->ksnc_tx_nob); + rc = 0; + + } while (tx->tx_resid != 0); + + ksocknal_connsock_decref(conn); + return rc; +} + +static int +ksocknal_recv_iov(struct ksock_conn *conn, struct kvec *scratchiov) +{ + struct kvec *iov = conn->ksnc_rx_iov; + int nob; + int rc; + + LASSERT(conn->ksnc_rx_niov > 0); + + /* Never touch conn->ksnc_rx_iov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_iov(conn, scratchiov); + + if (rc <= 0) + return rc; + + /* received something... */ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_rx_deadline = ktime_get_seconds() + + ksocknal_timeout(); + smp_mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT(conn->ksnc_rx_niov > 0); + + if (nob < (int)iov->iov_len) { + iov->iov_len -= nob; + iov->iov_base += nob; + return -EAGAIN; + } + + nob -= iov->iov_len; + conn->ksnc_rx_iov = ++iov; + conn->ksnc_rx_niov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_recv_kiov(struct ksock_conn *conn, struct page **rx_scratch_pgs, + struct kvec *scratch_iov) +{ + struct bio_vec *kiov = conn->ksnc_rx_kiov; + int nob; + int rc; + + LASSERT(conn->ksnc_rx_nkiov > 0); + /* Never touch conn->ksnc_rx_kiov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_kiov(conn, rx_scratch_pgs, scratch_iov); + + if (rc <= 0) + return rc; + + /* received something... 
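ksocknal_send_kiov() and ksocknal_recv_iov() above share the same bookkeeping after a short transfer: fully transferred fragments are retired, and a partially transferred fragment keeps its remaining bytes by advancing its offset and shrinking its length. A standalone userspace model of that consume loop (struct frag and the byte counts are illustrative):

#include <stddef.h>
#include <stdio.h>

struct frag {
	size_t off;
	size_t len;
};

/* Retire 'nob' transferred bytes from the fragment array and return the
 * index of the first fragment that still has data outstanding. */
static size_t consume(struct frag *frags, size_t nfrags, size_t nob)
{
	size_t i = 0;

	while (nob > 0 && i < nfrags) {
		if (nob < frags[i].len) {
			frags[i].off += nob;	/* partial: bump the offset */
			frags[i].len -= nob;
			return i;
		}
		nob -= frags[i].len;		/* whole fragment consumed */
		i++;
	}
	return i;
}

int main(void)
{
	struct frag frags[] = { { 0, 4096 }, { 0, 4096 }, { 0, 1024 } };
	size_t first = consume(frags, 3, 6000);	/* a short write of 6000 bytes */

	/* prints: resume at frag 1, off 1904, len 2192 */
	printf("resume at frag %zu, off %zu, len %zu\n",
	       first, frags[first].off, frags[first].len);
	return 0;
}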
*/ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_rx_deadline = ktime_get_seconds() + + ksocknal_timeout(); + smp_mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT(conn->ksnc_rx_nkiov > 0); + + if (nob < (int) kiov->bv_len) { + kiov->bv_offset += nob; + kiov->bv_len -= nob; + return -EAGAIN; + } + + nob -= kiov->bv_len; + conn->ksnc_rx_kiov = ++kiov; + conn->ksnc_rx_nkiov--; + } while (nob != 0); + + return 1; +} + +static int +ksocknal_receive(struct ksock_conn *conn, struct page **rx_scratch_pgs, + struct kvec *scratch_iov) +{ + /* Return 1 on success, 0 on EOF, < 0 on error. + * Caller checks ksnc_rx_nob_wanted to determine + * progress/completion. */ + int rc; + ENTRY; + + if (ksocknal_data.ksnd_stall_rx != 0) + schedule_timeout_uninterruptible( + cfs_time_seconds(ksocknal_data.ksnd_stall_rx)); + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT(conn->ksnc_closing); + return -ESHUTDOWN; + } + + for (;;) { + if (conn->ksnc_rx_niov != 0) + rc = ksocknal_recv_iov(conn, scratch_iov); + else + rc = ksocknal_recv_kiov(conn, rx_scratch_pgs, + scratch_iov); + + if (rc <= 0) { + /* error/EOF or partial receive */ + if (rc == -EAGAIN) { + rc = 1; + } else if (rc == 0 && conn->ksnc_rx_started) { + /* EOF in the middle of a message */ + rc = -EPROTO; + } + break; + } + + /* Completed a fragment */ + + if (conn->ksnc_rx_nob_wanted == 0) { + rc = 1; + break; + } + } + + ksocknal_connsock_decref(conn); + RETURN(rc); +} + +void +ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int rc) +{ + struct lnet_msg *lnetmsg = tx->tx_lnetmsg; + enum lnet_msg_hstatus hstatus = tx->tx_hstatus; + + LASSERT(ni != NULL || tx->tx_conn != NULL); + + if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted)) { + rc = -EIO; + if (hstatus == LNET_MSG_STATUS_OK) + hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + + if (tx->tx_conn != NULL) + ksocknal_conn_decref(tx->tx_conn); + + ksocknal_free_tx(tx); + if (lnetmsg != NULL) { /* KSOCK_MSG_NOOP go without lnetmsg */ + lnetmsg->msg_health_status = hstatus; + lnet_finalize(lnetmsg, rc); + } +} + +void +ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) +{ + struct ksock_tx *tx; + + while ((tx = list_first_entry_or_null(txlist, struct ksock_tx, + tx_list)) != NULL) { + if (error && tx->tx_lnetmsg) { + CNETERR("Deleting packet type %d len %d %s->%s\n", + tx->tx_lnetmsg->msg_type, + tx->tx_lnetmsg->msg_len, + libcfs_nidstr(&tx->tx_lnetmsg->msg_initiator), + libcfs_nidstr(&tx->tx_lnetmsg->msg_target.nid)); + } else if (error) { + CNETERR("Deleting noop packet\n"); + } + + list_del(&tx->tx_list); + + if (tx->tx_hstatus == LNET_MSG_STATUS_OK) { + if (error == -ETIMEDOUT) + tx->tx_hstatus = + LNET_MSG_STATUS_LOCAL_TIMEOUT; + else if (error == -ENETDOWN || + error == -EHOSTUNREACH || + error == -ENETUNREACH || + error == -ECONNREFUSED || + error == -ECONNRESET) + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; + /* + * for all other errors we don't want to + * retransmit + */ + else if (error) + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + + LASSERT(refcount_read(&tx->tx_refcount) == 1); + ksocknal_tx_done(ni, tx, error); + } +} + +static void +ksocknal_check_zc_req(struct ksock_tx *tx) +{ + struct ksock_conn *conn = tx->tx_conn; + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + + /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx + * to ksnp_zc_req_list if some 
fragment of this message should be sent + * zero-copy. Our peer_ni will send an ACK containing this cookie when + * she has received this message to tell us we can signal completion. + * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on + * ksnp_zc_req_list. */ + LASSERT (tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT (tx->tx_zc_capable); + + tx->tx_zc_checked = 1; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x || + !conn->ksnc_zc_capable) + return; + + /* assign cookie and queue tx to pending list, it will be released when + * a matching ack is received. See ksocknal_handle_zcack() */ + + ksocknal_tx_addref(tx); + + spin_lock(&peer_ni->ksnp_lock); + + /* ZC_REQ is going to be pinned to the peer_ni */ + tx->tx_deadline = ktime_get_seconds() + + ksocknal_timeout(); + + LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); + + tx->tx_msg.ksm_zc_cookies[0] = peer_ni->ksnp_zc_next_cookie++; + + if (peer_ni->ksnp_zc_next_cookie == 0) + peer_ni->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; + + list_add_tail(&tx->tx_zc_list, &peer_ni->ksnp_zc_req_list); + + spin_unlock(&peer_ni->ksnp_lock); +} + +static void +ksocknal_uncheck_zc_req(struct ksock_tx *tx) +{ + struct ksock_peer_ni *peer_ni = tx->tx_conn->ksnc_peer; + + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_zc_capable); + + tx->tx_zc_checked = 0; + + spin_lock(&peer_ni->ksnp_lock); + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* Not waiting for an ACK */ + spin_unlock(&peer_ni->ksnp_lock); + return; + } + + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_del(&tx->tx_zc_list); + + spin_unlock(&peer_ni->ksnp_lock); + + ksocknal_tx_decref(tx); +} + +static int +ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) +{ + int rc; + bool error_sim = false; + + if (lnet_send_error_simulation(tx->tx_lnetmsg, &tx->tx_hstatus)) { + error_sim = true; + rc = -EINVAL; + goto simulate_error; + } + + if (tx->tx_zc_capable && !tx->tx_zc_checked) + ksocknal_check_zc_req(tx); + + rc = ksocknal_transmit(conn, tx, scratch_iov); + + CDEBUG(D_NET, "send(%d) %d\n", tx->tx_resid, rc); + + if (tx->tx_resid == 0) { + /* Sent everything OK */ + LASSERT(rc == 0); + + return 0; + } + + if (rc == -EAGAIN) + return rc; + + if (rc == -ENOMEM) { + static int counter; + + counter++; /* exponential backoff warnings */ + if ((counter & (-counter)) == counter) + CWARN("%u ENOMEM tx %p (%lld allocated)\n", + counter, conn, libcfs_kmem_read()); + + /* Queue on ksnd_enomem_conns for retry after a timeout */ + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* enomem list takes over scheduler's ref... 
*/ + LASSERT(conn->ksnc_tx_scheduled); + list_add_tail(&conn->ksnc_tx_list, + &ksocknal_data.ksnd_enomem_conns); + if (ktime_get_seconds() + SOCKNAL_ENOMEM_RETRY < + ksocknal_data.ksnd_reaper_waketime) + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* + * set the health status of the message which determines + * whether we should retry the transmit + */ + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + return (rc); + } + +simulate_error: + + /* Actual error */ + LASSERT(rc < 0); + + if (!error_sim) { + /* + * set the health status of the message which determines + * whether we should retry the transmit + */ + if (rc == -ETIMEDOUT) + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_TIMEOUT; + else + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + + if (!conn->ksnc_closing) { + switch (rc) { + case -ECONNRESET: + LCONSOLE_WARN("Host %pIS reset our connection while we were sending data; it may have rebooted.\n", + &conn->ksnc_peeraddr); + break; + default: + LCONSOLE_WARN("There was an unexpected network error while writing to %pIS: %d.\n", + &conn->ksnc_peeraddr, rc); + break; + } + CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pISp\n", + conn, rc, libcfs_idstr(&conn->ksnc_peer->ksnp_id), + &conn->ksnc_peeraddr); + } + + if (tx->tx_zc_checked) + ksocknal_uncheck_zc_req(tx); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings(conn, + (conn->ksnc_closing) ? 0 : rc); + + return rc; +} + +static void +ksocknal_launch_connection_locked(struct ksock_conn_cb *conn_cb) +{ + /* called holding write lock on ksnd_global_lock */ + + LASSERT(!conn_cb->ksnr_scheduled); + LASSERT(!conn_cb->ksnr_connecting); + LASSERT((ksocknal_conn_cb_mask() & ~conn_cb->ksnr_connected) != 0); + + /* scheduling conn for connd */ + conn_cb->ksnr_scheduled = 1; + + /* extra ref for connd */ + ksocknal_conn_cb_addref(conn_cb); + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&conn_cb->ksnr_connd_list, + &ksocknal_data.ksnd_connd_routes); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); +} + +void +ksocknal_launch_all_connections_locked(struct ksock_peer_ni *peer_ni) +{ + struct ksock_conn_cb *conn_cb; + + /* called holding write lock on ksnd_global_lock */ + for (;;) { + /* launch any/all connections that need it */ + conn_cb = ksocknal_find_connectable_conn_cb_locked(peer_ni); + if (conn_cb == NULL) + return; + + ksocknal_launch_connection_locked(conn_cb); + } +} + +struct ksock_conn * +ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, struct ksock_tx *tx, int nonblk) +{ + struct ksock_conn *c; + struct ksock_conn *conn; + struct ksock_conn *typed = NULL; + struct ksock_conn *fallback = NULL; + int tnob = 0; + int fnob = 0; + + list_for_each_entry(c, &peer_ni->ksnp_conns, ksnc_list) { + int nob = atomic_read(&c->ksnc_tx_nob) + + c->ksnc_sock->sk->sk_wmem_queued; + int rc; + + LASSERT (!c->ksnc_closing); + LASSERT (c->ksnc_proto != NULL && + c->ksnc_proto->pro_match_tx != NULL); + + rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk); + + switch (rc) { + default: + LBUG(); + case SOCKNAL_MATCH_NO: /* protocol rejected the tx */ + continue; + + case SOCKNAL_MATCH_YES: /* typed connection */ + if (typed == NULL || tnob > nob || + (tnob == nob && *ksocknal_tunables.ksnd_round_robin && + typed->ksnc_tx_last_post > c->ksnc_tx_last_post)) { + typed = c; + tnob = nob; + } + break; + + case SOCKNAL_MATCH_MAY: /* fallback connection */ + if (fallback == NULL || fnob > nob || + (fnob 
== nob && *ksocknal_tunables.ksnd_round_robin && + fallback->ksnc_tx_last_post > c->ksnc_tx_last_post)) { + fallback = c; + fnob = nob; + } + break; + } + } + + /* prefer the typed selection */ + conn = (typed != NULL) ? typed : fallback; + + if (conn != NULL) + conn->ksnc_tx_last_post = ktime_get_seconds(); + + return conn; +} + +void +ksocknal_tx_prep(struct ksock_conn *conn, struct ksock_tx *tx) +{ + conn->ksnc_proto->pro_pack(tx); + + atomic_add (tx->tx_nob, &conn->ksnc_tx_nob); + ksocknal_conn_addref(conn); /* +1 ref for tx */ + tx->tx_conn = conn; +} + +void +ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn) +{ + struct ksock_sched *sched = conn->ksnc_scheduler; + struct ksock_msg *msg = &tx->tx_msg; + struct ksock_tx *ztx = NULL; + int bufnob = 0; + + /* called holding global lock (read or irq-write) and caller may + * not have dropped this lock between finding conn and calling me, + * so we don't need the {get,put}connsock dance to deref + * ksnc_sock... */ + LASSERT(!conn->ksnc_closing); + + CDEBUG(D_NET, "Sending to %s ip %pISp\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + &conn->ksnc_peeraddr); + + ksocknal_tx_prep(conn, tx); + + /* Ensure the frags we've been given EXACTLY match the number of + * bytes we want to send. Many TCP/IP stacks disregard any total + * size parameters passed to them and just look at the frags. + * + * We always expect at least 1 mapped fragment containing the + * complete ksocknal message header. + */ + LASSERT(lnet_iov_nob(tx->tx_niov, &tx->tx_hdr) + + lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) == + (unsigned int)tx->tx_nob); + LASSERT(tx->tx_niov >= 1); + LASSERT(tx->tx_resid == tx->tx_nob); + + CDEBUG(D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n", + tx, tx->tx_lnetmsg ? tx->tx_lnetmsg->msg_type : KSOCK_MSG_NOOP, + tx->tx_nob, tx->tx_niov, tx->tx_nkiov); + + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + spin_lock_bh(&sched->kss_lock); + + if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) { + /* First packet starts the timeout */ + conn->ksnc_tx_deadline = ktime_get_seconds() + + ksocknal_timeout(); + if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */ + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_tx_bufnob = 0; + smp_mb(); /* order with adding to tx_queue */ + } + + if (msg->ksm_type == KSOCK_MSG_NOOP) { + /* The packet is noop ZC ACK, try to piggyback the ack_cookie + * on a normal packet so I don't need to send it */ + LASSERT (msg->ksm_zc_cookies[1] != 0); + LASSERT (conn->ksnc_proto->pro_queue_tx_zcack != NULL); + + if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0)) + ztx = tx; /* ZC ACK piggybacked on ztx release tx later */ + + } else { + /* It's a normal packet - can it piggback a noop zc-ack that + * has been queued already? 
*/ + LASSERT (msg->ksm_zc_cookies[1] == 0); + LASSERT (conn->ksnc_proto->pro_queue_tx_msg != NULL); + + ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx); + /* ztx will be released later */ + } + + if (ztx != NULL) { + atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob); + list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs); + } + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) { /* not scheduled to send */ + /* +1 ref for scheduler */ + ksocknal_conn_addref(conn); + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + wake_up(&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); +} + + +struct ksock_conn_cb * +ksocknal_find_connectable_conn_cb_locked(struct ksock_peer_ni *peer_ni) +{ + time64_t now = ktime_get_seconds(); + struct ksock_conn_cb *conn_cb; + + conn_cb = peer_ni->ksnp_conn_cb; + if (!conn_cb) + return NULL; + + LASSERT(!conn_cb->ksnr_connecting || conn_cb->ksnr_scheduled); + + if (conn_cb->ksnr_scheduled) /* connections being established */ + return NULL; + + /* all conn types connected ? */ + if ((ksocknal_conn_cb_mask() & ~conn_cb->ksnr_connected) == 0) + return NULL; + + if (!(conn_cb->ksnr_retry_interval == 0 || /* first attempt */ + now >= conn_cb->ksnr_timeout)) { + CDEBUG(D_NET, + "Too soon to retry route %pIS (cnted %d, interval %lld, %lld secs later)\n", + &conn_cb->ksnr_addr, + conn_cb->ksnr_connected, + conn_cb->ksnr_retry_interval, + conn_cb->ksnr_timeout - now); + return NULL; + } + + return conn_cb; +} + +struct ksock_conn_cb * +ksocknal_find_connecting_conn_cb_locked(struct ksock_peer_ni *peer_ni) +{ + struct ksock_conn_cb *conn_cb; + + conn_cb = peer_ni->ksnp_conn_cb; + if (!conn_cb) + return NULL; + + LASSERT(!conn_cb->ksnr_connecting || conn_cb->ksnr_scheduled); + + return conn_cb->ksnr_scheduled ? conn_cb : NULL; +} + +int +ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, + struct lnet_processid *id) +{ + struct ksock_peer_ni *peer_ni; + struct ksock_conn *conn; + struct sockaddr_in sa; + rwlock_t *g_lock; + int retry; + int rc; + + LASSERT(tx->tx_conn == NULL); + + g_lock = &ksocknal_data.ksnd_global_lock; + + for (retry = 0;; retry = 1) { + read_lock(g_lock); + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) { + if (ksocknal_find_connectable_conn_cb_locked(peer_ni) == NULL) { + conn = ksocknal_find_conn_locked(peer_ni, tx, tx->tx_nonblk); + if (conn != NULL) { + /* I've got nothing that need to be + * connecting and I do have an actual + * connection... + */ + ksocknal_queue_tx_locked(tx, conn); + read_unlock(g_lock); + return 0; + } + } + } + + /* I'll need a write lock... 
*/ + read_unlock(g_lock); + + write_lock_bh(g_lock); + + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) + break; + + write_unlock_bh(g_lock); + + if ((id->pid & LNET_PID_USERFLAG) != 0) { + CERROR("Refusing to create a connection to userspace process %s\n", + libcfs_idstr(id)); + return -EHOSTUNREACH; + } + + if (retry) { + CERROR("Can't find peer_ni %s\n", libcfs_idstr(id)); + return -EHOSTUNREACH; + } + + memset(&sa, 0, sizeof(sa)); + sa.sin_family = AF_INET; + sa.sin_addr.s_addr = id->nid.nid_addr[0]; + sa.sin_port = htons(lnet_acceptor_port()); + { + struct lnet_process_id id4 = { + .pid = id->pid, + .nid = lnet_nid_to_nid4(&id->nid), + }; + rc = ksocknal_add_peer(ni, id4, (struct sockaddr *)&sa); + } + if (rc != 0) { + CERROR("Can't add peer_ni %s: %d\n", + libcfs_idstr(id), rc); + return rc; + } + } + + ksocknal_launch_all_connections_locked(peer_ni); + + conn = ksocknal_find_conn_locked(peer_ni, tx, tx->tx_nonblk); + if (conn != NULL) { + /* Connection exists; queue message on it */ + ksocknal_queue_tx_locked (tx, conn); + write_unlock_bh(g_lock); + return (0); + } + + if (peer_ni->ksnp_accepting > 0 || + ksocknal_find_connecting_conn_cb_locked(peer_ni) != NULL) { + /* the message is going to be pinned to the peer_ni */ + tx->tx_deadline = ktime_get_seconds() + + ksocknal_timeout(); + + /* Queue the message until a connection is established */ + list_add_tail(&tx->tx_list, &peer_ni->ksnp_tx_queue); + write_unlock_bh(g_lock); + return 0; + } + + write_unlock_bh(g_lock); + + /* NB Routes may be ignored if connections to them failed recently */ + CNETERR("No usable routes to %s\n", libcfs_idstr(id)); + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR; + return (-EHOSTUNREACH); +} + +int +ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) +{ + /* '1' for consistency with code that checks !mpflag to restore */ + unsigned int mpflag = 1; + int type = lntmsg->msg_type; + struct lnet_processid *target = &lntmsg->msg_target; + unsigned int payload_niov = lntmsg->msg_niov; + struct bio_vec *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + struct ksock_tx *tx; + int desc_size; + int rc; + + /* NB 'private' is different depending on what we're sending. + * Just ignore it... 
+ */ + + CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_idstr(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + LASSERT (!in_interrupt ()); + + desc_size = offsetof(struct ksock_tx, + tx_payload[payload_niov]); + + if (lntmsg->msg_vmflush) + mpflag = memalloc_noreclaim_save(); + + tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size); + if (tx == NULL) { + CERROR("Can't allocate tx desc type %d size %d\n", + type, desc_size); + if (lntmsg->msg_vmflush) + memalloc_noreclaim_restore(mpflag); + return -ENOMEM; + } + + tx->tx_conn = NULL; /* set when assigned a conn */ + tx->tx_lnetmsg = lntmsg; + + tx->tx_niov = 1; + tx->tx_kiov = tx->tx_payload; + tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov, + payload_niov, payload_kiov, + payload_offset, payload_nob); + + if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload) + tx->tx_zc_capable = 1; + + tx->tx_msg.ksm_csum = 0; + tx->tx_msg.ksm_type = KSOCK_MSG_LNET; + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_msg.ksm_zc_cookies[1] = 0; + + /* The first fragment will be set later in pro_pack */ + rc = ksocknal_launch_packet(ni, tx, target); + /* + * We can't test lntsmg->msg_vmflush again as lntmsg may + * have been freed. + */ + if (!mpflag) + memalloc_noreclaim_restore(mpflag); + + if (rc == 0) + return (0); + + lntmsg->msg_health_status = tx->tx_hstatus; + ksocknal_free_tx(tx); + return -EIO; +} + +void +ksocknal_thread_fini (void) +{ + if (atomic_dec_and_test(&ksocknal_data.ksnd_nthreads)) + wake_up_var(&ksocknal_data.ksnd_nthreads); +} + +int +ksocknal_new_packet(struct ksock_conn *conn, int nob_to_skip) +{ + static char ksocknal_slop_buffer[4096]; + int nob; + unsigned int niov; + int skipped; + + LASSERT(conn->ksnc_proto != NULL); + + if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) { + /* Remind the socket to ack eagerly... */ + ksocknal_lib_eager_ack(conn); + } + + if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_started = 0; + smp_mb(); /* racing with timeout thread */ + + switch (conn->ksnc_proto->pro_version) { + case KSOCK_PROTO_V2: + case KSOCK_PROTO_V3: + conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER; + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg; + + conn->ksnc_rx_nob_wanted = sizeof(struct ksock_msg_hdr); + conn->ksnc_rx_nob_left = sizeof(struct ksock_msg_hdr); + conn->ksnc_rx_iov[0].iov_len = + sizeof(struct ksock_msg_hdr); + break; + + case KSOCK_PROTO_V1: + /* Receiving bare struct lnet_hdr_nid4 */ + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(struct lnet_hdr_nid4); + conn->ksnc_rx_nob_left = sizeof(struct lnet_hdr_nid4); + + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = + (void *)&conn->ksnc_msg.ksm_u.lnetmsg_nid4; + conn->ksnc_rx_iov[0].iov_len = + sizeof(struct lnet_hdr_nid4); + break; + + default: + LBUG(); + } + conn->ksnc_rx_niov = 1; + + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_csum = ~0; + return (1); + } + + /* Set up to skip as much as possible now. 
If there's more left + * (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + skipped = 0; + niov = 0; + + do { + nob = min_t(int, nob_to_skip, sizeof(ksocknal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -= nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof(conn->ksnc_rx_iov_space) / sizeof(struct kvec)); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_nob_wanted = skipped; + return (0); +} + +static int +ksocknal_process_receive(struct ksock_conn *conn, + struct page **rx_scratch_pgs, + struct kvec *scratch_iov) +{ + struct _lnet_hdr_nid4 *lhdr; + struct lnet_processid *id; + struct lnet_hdr hdr; + int rc; + + LASSERT(refcount_read(&conn->ksnc_conn_refcount) > 0); + + /* NB: sched lock NOT held */ + /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */ + LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + again: + if (conn->ksnc_rx_nob_wanted != 0) { + rc = ksocknal_receive(conn, rx_scratch_pgs, + scratch_iov); + + if (rc <= 0) { + struct lnet_processid *ksnp_id; + + ksnp_id = &conn->ksnc_peer->ksnp_id; + + LASSERT(rc != -EAGAIN); + if (rc == 0) + CDEBUG(D_NET, "[%p] EOF from %s ip %pISp\n", + conn, libcfs_idstr(ksnp_id), + &conn->ksnc_peeraddr); + else if (!conn->ksnc_closing) + CERROR("[%p] Error %d on read from %s ip %pISp\n", + conn, rc, libcfs_idstr(ksnp_id), + &conn->ksnc_peeraddr); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 0 : rc); + return (rc == 0 ? 
-ESHUTDOWN : rc); + } + + if (conn->ksnc_rx_nob_wanted != 0) { + /* short read */ + return (-EAGAIN); + } + } + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_flip) { + __swab32s(&conn->ksnc_msg.ksm_type); + __swab32s(&conn->ksnc_msg.ksm_csum); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]); + } + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + /* NOOP Checksum error */ + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return (-EIO); + } + + if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) { + __u64 cookie = 0; + + LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) + cookie = conn->ksnc_msg.ksm_zc_cookies[0]; + + rc = conn->ksnc_proto->pro_handle_zcack( + conn, cookie, conn->ksnc_msg.ksm_zc_cookies[1]); + + if (rc != 0) { + CERROR("%s: Unknown ZC-ACK cookie: %llu, %llu\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + cookie, + conn->ksnc_msg.ksm_zc_cookies[1]); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return rc; + } + } + + switch (conn->ksnc_msg.ksm_type) { + case KSOCK_MSG_NOOP: + ksocknal_new_packet(conn, 0); + return 0; /* NOOP is done and just return */ + + case KSOCK_MSG_LNET: + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(struct lnet_hdr_nid4); + conn->ksnc_rx_nob_left = sizeof(struct lnet_hdr_nid4); + + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + conn->ksnc_rx_iov[0].iov_base = + (void *)&conn->ksnc_msg.ksm_u.lnetmsg_nid4; + conn->ksnc_rx_iov[0].iov_len = + sizeof(struct lnet_hdr_nid4); + + conn->ksnc_rx_niov = 1; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + + goto again; /* read lnet header now */ + + default: + CERROR("%s: Unknown message type: %x\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_type); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return -EPROTO; + } + + case SOCKNAL_RX_LNET_HEADER: + /* unpack message header */ + conn->ksnc_proto->pro_unpack(&conn->ksnc_msg, &hdr); + + if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) { + /* Userspace peer_ni */ + id = &conn->ksnc_peer->ksnp_id; + + /* Substitute process ID assigned at connection time */ + hdr.src_pid = id->pid; + hdr.src_nid = id->nid; + } + + conn->ksnc_rx_state = SOCKNAL_RX_PARSE; + ksocknal_conn_addref(conn); /* ++ref while parsing */ + + + rc = lnet_parse(conn->ksnc_peer->ksnp_ni, + &hdr, + &conn->ksnc_peer->ksnp_id.nid, + conn, 0); + if (rc < 0) { + /* I just received garbage: give up on this conn */ + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, rc); + ksocknal_conn_decref(conn); + return (-EPROTO); + } + + /* I'm racing with ksocknal_recv() */ + LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_PARSE || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD); + + if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD) + return 0; + + /* ksocknal_recv() got called */ + goto again; + + case SOCKNAL_RX_LNET_PAYLOAD: + /* payload all received */ + rc = 0; + + if (conn->ksnc_rx_nob_left == 0 && /* not truncating */ + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + 
CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + rc = -EIO; + } + + if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) { + LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); + + lhdr = (void *)&conn->ksnc_msg.ksm_u.lnetmsg_nid4; + id = &conn->ksnc_peer->ksnp_id; + + rc = conn->ksnc_proto->pro_handle_zcreq( + conn, + conn->ksnc_msg.ksm_zc_cookies[0], + *ksocknal_tunables.ksnd_nonblk_zcack || + le64_to_cpu(lhdr->src_nid) != + lnet_nid_to_nid4(&id->nid)); + } + + if (rc && conn->ksnc_lnet_msg) + conn->ksnc_lnet_msg->msg_health_status = + LNET_MSG_STATUS_REMOTE_ERROR; + lnet_finalize(conn->ksnc_lnet_msg, rc); + + if (rc != 0) { + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, rc); + return (-EPROTO); + } + fallthrough; + + case SOCKNAL_RX_SLOP: + /* starting new packet? */ + if (ksocknal_new_packet(conn, conn->ksnc_rx_nob_left)) + return 0; /* come back later */ + goto again; /* try to finish reading slop now */ + + default: + break; + } + + /* Not Reached */ + LBUG (); + return (-EINVAL); /* keep gcc happy */ +} + +int +ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int niov, + struct bio_vec *kiov, unsigned int offset, unsigned int mlen, + unsigned int rlen) +{ + struct ksock_conn *conn = private; + struct ksock_sched *sched = conn->ksnc_scheduler; + + LASSERT (mlen <= rlen); + LASSERT (niov <= LNET_MAX_IOV); + + conn->ksnc_lnet_msg = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + if (mlen == 0) { + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + conn->ksnc_rx_niov = 0; + } else { + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_iov = NULL; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + conn->ksnc_rx_nkiov = + lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov, + niov, kiov, offset, mlen); + } + + LASSERT (mlen == + lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + LASSERT (conn->ksnc_rx_scheduled); + + spin_lock_bh(&sched->kss_lock); + + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_PARSE_WAIT: + list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); + wake_up(&sched->kss_waitq); + LASSERT(conn->ksnc_rx_ready); + break; + + case SOCKNAL_RX_PARSE: + /* scheduler hasn't noticed I'm parsing yet */ + break; + } + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD; + + spin_unlock_bh(&sched->kss_lock); + ksocknal_conn_decref(conn); + return 0; +} + +static inline int +ksocknal_sched_cansleep(struct ksock_sched *sched) +{ + int rc; + + spin_lock_bh(&sched->kss_lock); + + rc = (!ksocknal_data.ksnd_shuttingdown && + list_empty(&sched->kss_rx_conns) && + list_empty(&sched->kss_tx_conns)); + + spin_unlock_bh(&sched->kss_lock); + return rc; +} + +int ksocknal_scheduler(void *arg) +{ + struct ksock_sched *sched; + struct ksock_conn *conn; + struct ksock_tx *tx; + int rc; + long id = (long)arg; + struct page **rx_scratch_pgs; + struct kvec *scratch_iov; + + sched = ksocknal_data.ksnd_schedulers[KSOCK_THREAD_CPT(id)]; + + LIBCFS_CPT_ALLOC(rx_scratch_pgs, lnet_cpt_table(), sched->kss_cpt, + sizeof(*rx_scratch_pgs) * LNET_MAX_IOV); + if (!rx_scratch_pgs) { + CERROR("Unable to allocate scratch pages\n"); + return -ENOMEM; + } + + LIBCFS_CPT_ALLOC(scratch_iov, lnet_cpt_table(), sched->kss_cpt, + sizeof(*scratch_iov) * LNET_MAX_IOV); + if (!scratch_iov) { + CERROR("Unable to 
allocate scratch iov\n"); + return -ENOMEM; + } + + rc = cfs_cpt_bind(lnet_cpt_table(), sched->kss_cpt); + if (rc != 0) { + CWARN("Can't set CPU partition affinity to %d: %d\n", + sched->kss_cpt, rc); + } + + spin_lock_bh(&sched->kss_lock); + + while (!ksocknal_data.ksnd_shuttingdown) { + bool did_something = false; + + /* Ensure I progress everything semi-fairly */ + conn = list_first_entry_or_null(&sched->kss_rx_conns, + struct ksock_conn, + ksnc_rx_list); + if (conn) { + list_del(&conn->ksnc_rx_list); + + LASSERT(conn->ksnc_rx_scheduled); + LASSERT(conn->ksnc_rx_ready); + + /* clear rx_ready in case receive isn't complete. + * Do it BEFORE we call process_recv, since + * data_ready can set it any time after we release + * kss_lock. */ + conn->ksnc_rx_ready = 0; + spin_unlock_bh(&sched->kss_lock); + + rc = ksocknal_process_receive(conn, rx_scratch_pgs, + scratch_iov); + + spin_lock_bh(&sched->kss_lock); + + /* I'm the only one that can clear this flag */ + LASSERT(conn->ksnc_rx_scheduled); + + /* Did process_receive get everything it wanted? */ + if (rc == 0) + conn->ksnc_rx_ready = 1; + + if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { + /* Conn blocked waiting for ksocknal_recv() + * I change its state (under lock) to signal + * it can be rescheduled */ + conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; + } else if (conn->ksnc_rx_ready) { + /* reschedule for rx */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + } else { + conn->ksnc_rx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } + + did_something = true; + } + + if (!list_empty(&sched->kss_tx_conns)) { + LIST_HEAD(zlist); + + list_splice_init(&sched->kss_zombie_noop_txs, &zlist); + + conn = list_first_entry(&sched->kss_tx_conns, + struct ksock_conn, + ksnc_tx_list); + list_del(&conn->ksnc_tx_list); + + LASSERT(conn->ksnc_tx_scheduled); + LASSERT(conn->ksnc_tx_ready); + LASSERT(!list_empty(&conn->ksnc_tx_queue)); + + tx = list_first_entry(&conn->ksnc_tx_queue, + struct ksock_tx, tx_list); + + if (conn->ksnc_tx_carrier == tx) + ksocknal_next_tx_carrier(conn); + + /* dequeue now so empty list => more to send */ + list_del(&tx->tx_list); + + /* Clear tx_ready in case send isn't complete. Do + * it BEFORE we call process_transmit, since + * write_space can set it any time after we release + * kss_lock. */ + conn->ksnc_tx_ready = 0; + spin_unlock_bh(&sched->kss_lock); + + if (!list_empty(&zlist)) { + /* free zombie noop txs, it's fast because + * noop txs are just put in freelist */ + ksocknal_txlist_done(NULL, &zlist, 0); + } + + rc = ksocknal_process_transmit(conn, tx, scratch_iov); + + if (rc == -ENOMEM || rc == -EAGAIN) { + /* Incomplete send: replace tx on HEAD of tx_queue */ + spin_lock_bh(&sched->kss_lock); + list_add(&tx->tx_list, + &conn->ksnc_tx_queue); + } else { + /* Complete send; tx -ref */ + ksocknal_tx_decref(tx); + + spin_lock_bh(&sched->kss_lock); + /* assume space for more */ + conn->ksnc_tx_ready = 1; + } + + if (rc == -ENOMEM) { + /* Do nothing; after a short timeout, this + * conn will be reposted on kss_tx_conns. */ + } else if (conn->ksnc_tx_ready && + !list_empty(&conn->ksnc_tx_queue)) { + /* reschedule for tx */ + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + } else { + conn->ksnc_tx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } + + did_something = true; + } + if (!did_something || /* nothing to do */ + need_resched()) { /* hogging CPU? 
*/ + spin_unlock_bh(&sched->kss_lock); + + if (!did_something) { /* wait for something to do */ + rc = wait_event_interruptible_exclusive( + sched->kss_waitq, + !ksocknal_sched_cansleep(sched)); + LASSERT (rc == 0); + } else { + cond_resched(); + } + + spin_lock_bh(&sched->kss_lock); + } + } + + spin_unlock_bh(&sched->kss_lock); + CFS_FREE_PTR_ARRAY(rx_scratch_pgs, LNET_MAX_IOV); + CFS_FREE_PTR_ARRAY(scratch_iov, LNET_MAX_IOV); + ksocknal_thread_fini(); + return 0; +} + +/* + * Add connection to kss_rx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_read_callback(struct ksock_conn *conn) +{ + struct ksock_sched *sched; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_rx_ready = 1; + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + spin_unlock_bh(&sched->kss_lock); +} + +/* + * Add connection to kss_tx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_write_callback(struct ksock_conn *conn) +{ + struct ksock_sched *sched; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && /* not being progressed */ + !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */ + list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up(&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); +} + +static const struct ksock_proto * +ksocknal_parse_proto_version(struct ksock_hello_msg *hello) +{ + __u32 version = 0; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + version = hello->kshm_version; + else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC)) + version = __swab32(hello->kshm_version); + + if (version) { +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 1) + return NULL; + + if (*ksocknal_tunables.ksnd_protocol == 2 && + version == KSOCK_PROTO_V3) + return NULL; +#endif + if (version == KSOCK_PROTO_V2) + return &ksocknal_protocol_v2x; + + if (version == KSOCK_PROTO_V3) + return &ksocknal_protocol_v3x; + + return NULL; + } + + if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { + struct lnet_magicversion *hmv; + + BUILD_BUG_ON(sizeof(struct lnet_magicversion) != + offsetof(struct ksock_hello_msg, kshm_src_nid)); + + hmv = (struct lnet_magicversion *)hello; + + if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) && + hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR)) + return &ksocknal_protocol_v1x; + } + + return NULL; +} + +int +ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, + struct lnet_nid *peer_nid, struct ksock_hello_msg *hello) +{ + /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ + struct ksock_net *net = (struct ksock_net *)ni->ni_data; + + LASSERT(hello->kshm_nips <= LNET_INTERFACES_NUM); + + /* rely on caller to hold a ref on socket so it wouldn't disappear */ + LASSERT(conn->ksnc_proto != NULL); + + hello->kshm_src_nid = ni->ni_nid; + hello->kshm_dst_nid = *peer_nid; + hello->kshm_src_pid = the_lnet.ln_pid; + + hello->kshm_src_incarnation = net->ksnn_incarnation; + hello->kshm_ctype = conn->ksnc_type; + + return conn->ksnc_proto->pro_send_hello(conn, hello); +} + +static int +ksocknal_invert_type(int type) +{ + switch (type) { + case SOCKLND_CONN_ANY: + case 
SOCKLND_CONN_CONTROL: + return (type); + case SOCKLND_CONN_BULK_IN: + return SOCKLND_CONN_BULK_OUT; + case SOCKLND_CONN_BULK_OUT: + return SOCKLND_CONN_BULK_IN; + default: + return (SOCKLND_CONN_NONE); + } +} + +int +ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, + struct ksock_hello_msg *hello, + struct lnet_processid *peerid, + __u64 *incarnation) +{ + /* Return < 0 fatal error + * 0 success + * EALREADY lost connection race + * EPROTO protocol version mismatch + */ + struct socket *sock = conn->ksnc_sock; + int active = (conn->ksnc_proto != NULL); + int timeout; + int proto_match; + int rc; + const struct ksock_proto *proto; + struct lnet_processid recv_id; + + /* socket type set on active connections - not set on passive */ + LASSERT(!active == !(conn->ksnc_type != SOCKLND_CONN_NONE)); + + timeout = active ? ksocknal_timeout() : + lnet_acceptor_timeout(); + + rc = lnet_sock_read(sock, &hello->kshm_magic, + sizeof(hello->kshm_magic), timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0); + return rc; + } + + if (hello->kshm_magic != LNET_PROTO_MAGIC && + hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) && + hello->kshm_magic != le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { + /* Unexpected magic! */ + CERROR("Bad magic(1) %#08x (%#08x expected) from %pIS\n", + __cpu_to_le32 (hello->kshm_magic), + LNET_PROTO_TCP_MAGIC, &conn->ksnc_peeraddr); + return -EPROTO; + } + + rc = lnet_sock_read(sock, &hello->kshm_version, + sizeof(hello->kshm_version), timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0); + return rc; + } + + proto = ksocknal_parse_proto_version(hello); + if (proto == NULL) { + if (!active) { + /* unknown protocol from peer_ni, + * tell peer_ni my protocol. 
+ */ + conn->ksnc_proto = &ksocknal_protocol_v3x; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 2) + conn->ksnc_proto = &ksocknal_protocol_v2x; + else if (*ksocknal_tunables.ksnd_protocol == 1) + conn->ksnc_proto = &ksocknal_protocol_v1x; +#endif + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, &ni->ni_nid, + hello); + } + + CERROR("Unknown protocol version (%d.x expected) from %pIS\n", + conn->ksnc_proto->pro_version, &conn->ksnc_peeraddr); + + return -EPROTO; + } + + proto_match = (conn->ksnc_proto == proto); + conn->ksnc_proto = proto; + + /* receive the rest of hello message anyway */ + rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout); + if (rc != 0) { + CERROR("Error %d reading or checking hello from from %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0); + return rc; + } + + *incarnation = hello->kshm_src_incarnation; + + if (LNET_NID_IS_ANY(&hello->kshm_src_nid)) { + CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY from %pIS\n", + &conn->ksnc_peeraddr); + return -EPROTO; + } + + if (!active && + rpc_get_port((struct sockaddr *)&conn->ksnc_peeraddr) > + LNET_ACCEPTOR_MAX_RESERVED_PORT) { + /* Userspace NAL assigns peer_ni process ID from socket */ + recv_id.pid = rpc_get_port((struct sockaddr *) + &conn->ksnc_peeraddr) | + LNET_PID_USERFLAG; + LASSERT(conn->ksnc_peeraddr.ss_family == AF_INET); + memset(&recv_id.nid, 0, sizeof(recv_id.nid)); + recv_id.nid.nid_type = ni->ni_nid.nid_type; + recv_id.nid.nid_num = ni->ni_nid.nid_num; + recv_id.nid.nid_addr[0] = + ((struct sockaddr_in *) + &conn->ksnc_peeraddr)->sin_addr.s_addr; + } else { + recv_id.nid = hello->kshm_src_nid; + recv_id.pid = hello->kshm_src_pid; + } + + if (!active) { + *peerid = recv_id; + + /* peer_ni determines type */ + conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype); + if (conn->ksnc_type == SOCKLND_CONN_NONE) { + CERROR("Unexpected type %d from %s ip %pIS\n", + hello->kshm_ctype, libcfs_idstr(peerid), + &conn->ksnc_peeraddr); + return -EPROTO; + } + return 0; + } + + if (peerid->pid != recv_id.pid || + !nid_same(&peerid->nid, &recv_id.nid)) { + LCONSOLE_ERROR_MSG(0x130, + "Connected successfully to %s on host %pIS, but they claimed they were %s; please check your Lustre configuration.\n", + libcfs_idstr(peerid), + &conn->ksnc_peeraddr, + libcfs_idstr(&recv_id)); + return -EPROTO; + } + + if (hello->kshm_ctype == SOCKLND_CONN_NONE) { + /* Possible protocol mismatch or I lost the connection race */ + return proto_match ? 
EALREADY : EPROTO; + } + + if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) { + CERROR("Mismatched types: me %d, %s ip %pIS %d\n", + conn->ksnc_type, libcfs_idstr(peerid), + &conn->ksnc_peeraddr, + hello->kshm_ctype); + return -EPROTO; + } + return 0; +} + +static bool +ksocknal_connect(struct ksock_conn_cb *conn_cb) +{ + LIST_HEAD(zombies); + struct ksock_peer_ni *peer_ni = conn_cb->ksnr_peer; + int type; + int wanted; + struct socket *sock; + time64_t deadline; + bool retry_later = false; + int rc = 0; + + deadline = ktime_get_seconds() + ksocknal_timeout(); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + LASSERT(conn_cb->ksnr_scheduled); + LASSERT(!conn_cb->ksnr_connecting); + + conn_cb->ksnr_connecting = 1; + + for (;;) { + wanted = ksocknal_conn_cb_mask() & ~conn_cb->ksnr_connected; + + /* stop connecting if peer_ni/cb got closed under me, or + * conn cb got connected while queued + */ + if (peer_ni->ksnp_closing || conn_cb->ksnr_deleted || + wanted == 0) { + retry_later = false; + break; + } + + /* reschedule if peer_ni is connecting to me */ + if (peer_ni->ksnp_accepting > 0) { + CDEBUG(D_NET, + "peer_ni %s(%d) already connecting to me, retry later.\n", + libcfs_nidstr(&peer_ni->ksnp_id.nid), + peer_ni->ksnp_accepting); + retry_later = true; + } + + if (retry_later) /* needs reschedule */ + break; + + if ((wanted & BIT(SOCKLND_CONN_ANY)) != 0) { + type = SOCKLND_CONN_ANY; + } else if ((wanted & BIT(SOCKLND_CONN_CONTROL)) != 0) { + type = SOCKLND_CONN_CONTROL; + } else if ((wanted & BIT(SOCKLND_CONN_BULK_IN)) != 0 && + conn_cb->ksnr_blki_conn_count <= conn_cb->ksnr_blko_conn_count) { + type = SOCKLND_CONN_BULK_IN; + } else { + LASSERT ((wanted & BIT(SOCKLND_CONN_BULK_OUT)) != 0); + type = SOCKLND_CONN_BULK_OUT; + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (ktime_get_seconds() >= deadline) { + rc = -ETIMEDOUT; + lnet_connect_console_error( + rc, &peer_ni->ksnp_id.nid, + (struct sockaddr *)&conn_cb->ksnr_addr); + goto failed; + } + + sock = lnet_connect(&peer_ni->ksnp_id.nid, + conn_cb->ksnr_myiface, + (struct sockaddr *)&conn_cb->ksnr_addr, + peer_ni->ksnp_ni->ni_net_ns); + if (IS_ERR(sock)) { + rc = PTR_ERR(sock); + goto failed; + } + + rc = ksocknal_create_conn(peer_ni->ksnp_ni, conn_cb, sock, + type); + if (rc < 0) { + lnet_connect_console_error( + rc, &peer_ni->ksnp_id.nid, + (struct sockaddr *)&conn_cb->ksnr_addr); + goto failed; + } + + /* A +ve RC means I have to retry because I lost the connection + * race or I have to renegotiate protocol version + */ + retry_later = (rc != 0); + if (retry_later) + CDEBUG(D_NET, "peer_ni %s: conn race, retry later.\n", + libcfs_nidstr(&peer_ni->ksnp_id.nid)); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + } + + conn_cb->ksnr_scheduled = 0; + conn_cb->ksnr_connecting = 0; + + if (retry_later) { + /* re-queue for attention; this frees me up to handle + * the peer_ni's incoming connection request + */ + + if (rc == EALREADY || + (rc == 0 && peer_ni->ksnp_accepting > 0)) { + /* We want to introduce a delay before next + * attempt to connect if we lost conn race, but + * the race is resolved quickly usually, so + * min_reconnectms should be good heuristic + */ + conn_cb->ksnr_retry_interval = + *ksocknal_tunables.ksnd_min_reconnectms / 1000; + conn_cb->ksnr_timeout = ktime_get_seconds() + + conn_cb->ksnr_retry_interval; + } + + ksocknal_launch_connection_locked(conn_cb); + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + return retry_later; + + failed: + 
write_lock_bh(&ksocknal_data.ksnd_global_lock); + + conn_cb->ksnr_scheduled = 0; + conn_cb->ksnr_connecting = 0; + + /* This is a retry rather than a new connection */ + conn_cb->ksnr_retry_interval *= 2; + conn_cb->ksnr_retry_interval = + max_t(time64_t, conn_cb->ksnr_retry_interval, + *ksocknal_tunables.ksnd_min_reconnectms / 1000); + conn_cb->ksnr_retry_interval = + min_t(time64_t, conn_cb->ksnr_retry_interval, + *ksocknal_tunables.ksnd_max_reconnectms / 1000); + + LASSERT(conn_cb->ksnr_retry_interval); + conn_cb->ksnr_timeout = ktime_get_seconds() + + conn_cb->ksnr_retry_interval; + + if (!list_empty(&peer_ni->ksnp_tx_queue) && + peer_ni->ksnp_accepting == 0 && + !ksocknal_find_connecting_conn_cb_locked(peer_ni)) { + struct ksock_conn *conn; + + /* ksnp_tx_queue is queued on a conn on successful + * connection for V1.x and V2.x + */ + conn = list_first_entry_or_null(&peer_ni->ksnp_conns, + struct ksock_conn, ksnc_list); + if (conn) + LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x || + conn->ksnc_proto == &ksocknal_protocol_v4x); + + /* take all the blocked packets while I've got the lock and + * complete below... + */ + list_splice_init(&peer_ni->ksnp_tx_queue, &zombies); + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_peer_failed(peer_ni); + ksocknal_txlist_done(peer_ni->ksnp_ni, &zombies, rc); + return 0; +} + +/* + * check whether we need to create more connds. + * It will try to create new thread if it's necessary, @timeout can + * be updated if failed to create, so caller wouldn't keep try while + * running out of resource. + */ +static int +ksocknal_connd_check_start(time64_t sec, long *timeout) +{ + int rc; + int total = ksocknal_data.ksnd_connd_starting + + ksocknal_data.ksnd_connd_running; + + if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { + /* still in initializing */ + return 0; + } + + if (total >= *ksocknal_tunables.ksnd_nconnds_max || + total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) { + /* can't create more connd, or still have enough + * threads to handle more connecting */ + return 0; + } + + if (list_empty(&ksocknal_data.ksnd_connd_routes)) { + /* no pending connecting request */ + return 0; + } + + if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) { + /* may run out of resource, retry later */ + *timeout = cfs_time_seconds(1); + return 0; + } + + if (ksocknal_data.ksnd_connd_starting > 0) { + /* serialize starting to avoid flood */ + return 0; + } + + ksocknal_data.ksnd_connd_starting_stamp = sec; + ksocknal_data.ksnd_connd_starting++; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + + /* NB: total is the next id */ + rc = ksocknal_thread_start(ksocknal_connd, NULL, + "socknal_cd%02d", total); + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + if (rc == 0) + return 1; + + /* we tried ... */ + LASSERT(ksocknal_data.ksnd_connd_starting > 0); + ksocknal_data.ksnd_connd_starting--; + ksocknal_data.ksnd_connd_failed_stamp = ktime_get_real_seconds(); + + return 1; +} + +/* + * check whether current thread can exit, it will return 1 if there are too + * many threads and no creating in past 120 seconds. + * Also, this function may update @timeout to make caller come back + * again to recheck these conditions. 
+ */ +static int +ksocknal_connd_check_stop(time64_t sec, long *timeout) +{ + int val; + + if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { + /* still in initializing */ + return 0; + } + + if (ksocknal_data.ksnd_connd_starting > 0) { + /* in progress of starting new thread */ + return 0; + } + + if (ksocknal_data.ksnd_connd_running <= + *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */ + return 0; + } + + /* created thread in past 120 seconds? */ + val = (int)(ksocknal_data.ksnd_connd_starting_stamp + + SOCKNAL_CONND_TIMEOUT - sec); + + *timeout = (val > 0) ? cfs_time_seconds(val) : + cfs_time_seconds(SOCKNAL_CONND_TIMEOUT); + if (val > 0) + return 0; + + /* no creating in past 120 seconds */ + + return ksocknal_data.ksnd_connd_running > + ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV; +} + +/* Go through connd_cbs queue looking for a conn_cb that we can process + * right now, @timeout_p can be updated if we need to come back later */ +static struct ksock_conn_cb * +ksocknal_connd_get_conn_cb_locked(signed long *timeout_p) +{ + time64_t now = ktime_get_seconds(); + time64_t conn_timeout; + struct ksock_conn_cb *conn_cb; + + /* connd_routes can contain both pending and ordinary routes */ + list_for_each_entry(conn_cb, &ksocknal_data.ksnd_connd_routes, + ksnr_connd_list) { + + conn_timeout = conn_cb->ksnr_timeout; + + if (conn_cb->ksnr_retry_interval == 0 || + now >= conn_timeout) + return conn_cb; + + if (*timeout_p == MAX_SCHEDULE_TIMEOUT || + *timeout_p > cfs_time_seconds(conn_timeout - now)) + *timeout_p = cfs_time_seconds(conn_timeout - now); + } + + return NULL; +} + +int +ksocknal_connd(void *arg) +{ + spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; + struct ksock_connreq *cr; + wait_queue_entry_t wait; + int cons_retry = 0; + + init_wait(&wait); + + spin_lock_bh(connd_lock); + + LASSERT(ksocknal_data.ksnd_connd_starting > 0); + ksocknal_data.ksnd_connd_starting--; + ksocknal_data.ksnd_connd_running++; + + while (!ksocknal_data.ksnd_shuttingdown) { + struct ksock_conn_cb *conn_cb = NULL; + time64_t sec = ktime_get_real_seconds(); + long timeout = MAX_SCHEDULE_TIMEOUT; + bool dropped_lock = false; + + if (ksocknal_connd_check_stop(sec, &timeout)) { + /* wakeup another one to check stop */ + wake_up(&ksocknal_data.ksnd_connd_waitq); + break; + } + + if (ksocknal_connd_check_start(sec, &timeout)) { + /* created new thread */ + dropped_lock = true; + } + + cr = list_first_entry_or_null(&ksocknal_data.ksnd_connd_connreqs, + struct ksock_connreq, ksncr_list); + if (cr) { + /* Connection accepted by the listener */ + list_del(&cr->ksncr_list); + spin_unlock_bh(connd_lock); + dropped_lock = true; + + ksocknal_create_conn(cr->ksncr_ni, NULL, + cr->ksncr_sock, SOCKLND_CONN_NONE); + lnet_ni_decref(cr->ksncr_ni); + LIBCFS_FREE(cr, sizeof(*cr)); + + spin_lock_bh(connd_lock); + } + + /* Only handle an outgoing connection request if there + * is a thread left to handle incoming connections and + * create new connd + */ + if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV < + ksocknal_data.ksnd_connd_running) + conn_cb = ksocknal_connd_get_conn_cb_locked(&timeout); + + if (conn_cb) { + list_del(&conn_cb->ksnr_connd_list); + ksocknal_data.ksnd_connd_connecting++; + spin_unlock_bh(connd_lock); + dropped_lock = true; + + if (ksocknal_connect(conn_cb)) { + /* consecutive retry */ + if (cons_retry++ > SOCKNAL_INSANITY_RECONN) { + CWARN("massive consecutive re-connecting to %pIS\n", + &conn_cb->ksnr_addr); + cons_retry = 0; + } + } else { + cons_retry = 0; + } 
+ + ksocknal_conn_cb_decref(conn_cb); + + spin_lock_bh(connd_lock); + ksocknal_data.ksnd_connd_connecting--; + } + + if (dropped_lock) { + if (!need_resched()) + continue; + spin_unlock_bh(connd_lock); + cond_resched(); + spin_lock_bh(connd_lock); + continue; + } + + /* Nothing to do for 'timeout' */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, + &wait); + spin_unlock_bh(connd_lock); + + schedule_timeout(timeout); + + remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait); + spin_lock_bh(connd_lock); + } + ksocknal_data.ksnd_connd_running--; + spin_unlock_bh(connd_lock); + + ksocknal_thread_fini(); + return 0; +} + +static struct ksock_conn * +ksocknal_find_timed_out_conn(struct ksock_peer_ni *peer_ni) +{ + /* We're called with a shared lock on ksnd_global_lock */ + struct ksock_conn *conn; + struct ksock_tx *tx; + struct ksock_sched *sched; + + list_for_each_entry(conn, &peer_ni->ksnp_conns, ksnc_list) { + int error; + + /* Don't need the {get,put}connsock dance to deref ksnc_sock */ + LASSERT (!conn->ksnc_closing); + sched = conn->ksnc_scheduler; + + error = conn->ksnc_sock->sk->sk_err; + if (error != 0) { + ksocknal_conn_addref(conn); + + switch (error) { + case ECONNRESET: + CNETERR("A connection with %s (%pISp) was reset; it may have rebooted.\n", + libcfs_idstr(&peer_ni->ksnp_id), + &conn->ksnc_peeraddr); + break; + case ETIMEDOUT: + CNETERR("A connection with %s (%pISp) timed out; the network or node may be down.\n", + libcfs_idstr(&peer_ni->ksnp_id), + &conn->ksnc_peeraddr); + break; + default: + CNETERR("An unexpected network error %d occurred with %s (%pISp\n", + error, + libcfs_idstr(&peer_ni->ksnp_id), + &conn->ksnc_peeraddr); + break; + } + + return conn; + } + + if (conn->ksnc_rx_started && + ktime_get_seconds() >= conn->ksnc_rx_deadline) { + /* Timed out incomplete incoming message */ + ksocknal_conn_addref(conn); + CNETERR("Timeout receiving from %s (%pISp), state %d wanted %d left %d\n", + libcfs_idstr(&peer_ni->ksnp_id), + &conn->ksnc_peeraddr, + conn->ksnc_rx_state, + conn->ksnc_rx_nob_wanted, + conn->ksnc_rx_nob_left); + return conn; + } + + spin_lock_bh(&sched->kss_lock); + if ((!list_empty(&conn->ksnc_tx_queue) || + conn->ksnc_sock->sk->sk_wmem_queued != 0) && + ktime_get_seconds() >= conn->ksnc_tx_deadline) { + /* Timed out messages queued for sending or + * buffered in the socket's send buffer + */ + ksocknal_conn_addref(conn); + list_for_each_entry(tx, &conn->ksnc_tx_queue, + tx_list) + tx->tx_hstatus = + LNET_MSG_STATUS_LOCAL_TIMEOUT; + CNETERR("Timeout sending data to %s (%pISp) the network or that node may be down.\n", + libcfs_idstr(&peer_ni->ksnp_id), + &conn->ksnc_peeraddr); + spin_unlock_bh(&sched->kss_lock); + return conn; + } + spin_unlock_bh(&sched->kss_lock); + } + + return (NULL); +} + +static inline void +ksocknal_flush_stale_txs(struct ksock_peer_ni *peer_ni) +{ + struct ksock_tx *tx; + LIST_HEAD(stale_txs); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + while ((tx = list_first_entry_or_null(&peer_ni->ksnp_tx_queue, + struct ksock_tx, + tx_list)) != NULL) { + if (ktime_get_seconds() < tx->tx_deadline) + break; + + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT; + + list_move_tail(&tx->tx_list, &stale_txs); + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(peer_ni->ksnp_ni, &stale_txs, -ETIMEDOUT); +} + +static int +ksocknal_send_keepalive_locked(struct ksock_peer_ni *peer_ni) +__must_hold(&ksocknal_data.ksnd_global_lock) +{ + struct ksock_sched *sched; 
+ struct ksock_conn *conn; + struct ksock_tx *tx; + + /* last_alive will be updated by create_conn */ + if (list_empty(&peer_ni->ksnp_conns)) + return 0; + + if (peer_ni->ksnp_proto != &ksocknal_protocol_v3x && + peer_ni->ksnp_proto != &ksocknal_protocol_v4x) + return 0; + + if (*ksocknal_tunables.ksnd_keepalive <= 0 || + ktime_get_seconds() < peer_ni->ksnp_last_alive + + *ksocknal_tunables.ksnd_keepalive) + return 0; + + if (ktime_get_seconds() < peer_ni->ksnp_send_keepalive) + return 0; + + /* retry 10 secs later, so we wouldn't put pressure + * on this peer_ni if we failed to send keepalive this time */ + peer_ni->ksnp_send_keepalive = ktime_get_seconds() + 10; + + conn = ksocknal_find_conn_locked(peer_ni, NULL, 1); + if (conn != NULL) { + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + if (!list_empty(&conn->ksnc_tx_queue)) { + spin_unlock_bh(&sched->kss_lock); + /* there is an queued ACK, don't need keepalive */ + return 0; + } + + spin_unlock_bh(&sched->kss_lock); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + /* cookie = 1 is reserved for keepalive PING */ + tx = ksocknal_alloc_tx_noop(1, 1); + if (tx == NULL) { + read_lock(&ksocknal_data.ksnd_global_lock); + return -ENOMEM; + } + + if (ksocknal_launch_packet(peer_ni->ksnp_ni, tx, &peer_ni->ksnp_id) + == 0) { + read_lock(&ksocknal_data.ksnd_global_lock); + return 1; + } + + ksocknal_free_tx(tx); + read_lock(&ksocknal_data.ksnd_global_lock); + + return -EIO; +} + + +static void +ksocknal_check_peer_timeouts(int idx) +{ + struct hlist_head *peers = &ksocknal_data.ksnd_peers[idx]; + struct ksock_peer_ni *peer_ni; + struct ksock_conn *conn; + struct ksock_tx *tx; + + again: + /* NB. We expect to have a look at all the peers and not find any + * connections to time out, so we just use a shared lock while we + * take a look... + */ + read_lock(&ksocknal_data.ksnd_global_lock); + + hlist_for_each_entry(peer_ni, peers, ksnp_list) { + struct ksock_tx *tx_stale; + time64_t deadline = 0; + int resid = 0; + int n = 0; + + if (ksocknal_send_keepalive_locked(peer_ni) != 0) { + read_unlock(&ksocknal_data.ksnd_global_lock); + goto again; + } + + conn = ksocknal_find_timed_out_conn(peer_ni); + + if (conn != NULL) { + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_close_conn_and_siblings(conn, -ETIMEDOUT); + + /* NB we won't find this one again, but we can't + * just proceed with the next peer_ni, since we dropped + * ksnd_global_lock and it might be dead already! 
+ */ + ksocknal_conn_decref(conn); + goto again; + } + + /* we can't process stale txs right here because we're + * holding only shared lock + */ + tx = list_first_entry_or_null(&peer_ni->ksnp_tx_queue, + struct ksock_tx, tx_list); + if (tx && ktime_get_seconds() >= tx->tx_deadline) { + ksocknal_peer_addref(peer_ni); + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_flush_stale_txs(peer_ni); + + ksocknal_peer_decref(peer_ni); + goto again; + } + + if (list_empty(&peer_ni->ksnp_zc_req_list)) + continue; + + tx_stale = NULL; + spin_lock(&peer_ni->ksnp_lock); + list_for_each_entry(tx, &peer_ni->ksnp_zc_req_list, tx_zc_list) { + if (ktime_get_seconds() < tx->tx_deadline) + break; + /* ignore the TX if connection is being closed */ + if (tx->tx_conn->ksnc_closing) + continue; + n++; + if (tx_stale == NULL) + tx_stale = tx; + } + + if (tx_stale == NULL) { + spin_unlock(&peer_ni->ksnp_lock); + continue; + } + + deadline = tx_stale->tx_deadline; + resid = tx_stale->tx_resid; + conn = tx_stale->tx_conn; + ksocknal_conn_addref(conn); + + spin_unlock(&peer_ni->ksnp_lock); + read_unlock(&ksocknal_data.ksnd_global_lock); + + CERROR("Total %d stale ZC_REQs for peer_ni %s detected; the " + "oldest(%p) timed out %lld secs ago, " + "resid: %d, wmem: %d\n", + n, libcfs_nidstr(&peer_ni->ksnp_id.nid), tx_stale, + ktime_get_seconds() - deadline, + resid, conn->ksnc_sock->sk->sk_wmem_queued); + + ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); + ksocknal_conn_decref(conn); + goto again; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +int ksocknal_reaper(void *arg) +{ + wait_queue_entry_t wait; + struct ksock_conn *conn; + struct ksock_sched *sched; + LIST_HEAD(enomem_conns); + int nenomem_conns; + time64_t timeout; + int i; + int peer_index = 0; + time64_t deadline = ktime_get_seconds(); + + init_wait(&wait); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + while (!ksocknal_data.ksnd_shuttingdown) { + conn = list_first_entry_or_null(&ksocknal_data.ksnd_deathrow_conns, + struct ksock_conn, ksnc_list); + if (conn) { + list_del(&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_terminate_conn(conn); + ksocknal_conn_decref(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + conn = list_first_entry_or_null(&ksocknal_data.ksnd_zombie_conns, + struct ksock_conn, ksnc_list); + if (conn) { + list_del(&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_destroy_conn(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + list_splice_init(&ksocknal_data.ksnd_enomem_conns, + &enomem_conns); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* reschedule all the connections that stalled with ENOMEM... */ + nenomem_conns = 0; + while ((conn = list_first_entry_or_null(&enomem_conns, + struct ksock_conn, + ksnc_tx_list)) != NULL) { + list_del(&conn->ksnc_tx_list); + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + LASSERT(conn->ksnc_tx_scheduled); + conn->ksnc_tx_ready = 1; + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + wake_up(&sched->kss_waitq); + + spin_unlock_bh(&sched->kss_lock); + nenomem_conns++; + } + + /* careful with the jiffy wrap... 
*/ + while ((timeout = deadline - ktime_get_seconds()) <= 0) { + const int n = 4; + const int p = 1; + int chunk = HASH_SIZE(ksocknal_data.ksnd_peers); + unsigned int lnd_timeout; + + /* Time to check for timeouts on a few more peers: I + * do checks every 'p' seconds on a proportion of the + * peer_ni table and I need to check every connection + * 'n' times within a timeout interval, to ensure I + * detect a timeout on any connection within (n+1)/n + * times the timeout interval. + */ + + lnd_timeout = ksocknal_timeout(); + if (lnd_timeout > n * p) + chunk = (chunk * n * p) / lnd_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + ksocknal_check_peer_timeouts(peer_index); + peer_index = (peer_index + 1) % + HASH_SIZE(ksocknal_data.ksnd_peers); + } + + deadline += p; + } + + if (nenomem_conns != 0) { + /* Reduce my timeout if I rescheduled ENOMEM conns. + * This also prevents me getting woken immediately + * if any go back on my enomem list. */ + timeout = SOCKNAL_ENOMEM_RETRY; + } + ksocknal_data.ksnd_reaper_waketime = ktime_get_seconds() + + timeout; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); + + if (!ksocknal_data.ksnd_shuttingdown && + list_empty(&ksocknal_data.ksnd_deathrow_conns) && + list_empty(&ksocknal_data.ksnd_zombie_conns)) + schedule_timeout(cfs_time_seconds(timeout)); + + set_current_state(TASK_RUNNING); + remove_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + } + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_thread_fini(); + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c new file mode 100644 index 0000000000000..46cb3c68e26ed --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c @@ -0,0 +1,698 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include "socklnd.h" + +int +ksocknal_lib_get_conn_addrs(struct ksock_conn *conn) +{ + int rc = lnet_sock_getaddr(conn->ksnc_sock, true, + &conn->ksnc_peeraddr); + + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... 
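+	 * the conn is still being established here, so it cannot have
+	 * started closing yet (see the LASSERT below).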
*/ + LASSERT(!conn->ksnc_closing); + + if (rc != 0) { + CERROR("Error %d getting sock peer_ni IP\n", rc); + return rc; + } + + rc = lnet_sock_getaddr(conn->ksnc_sock, false, + &conn->ksnc_myaddr); + if (rc != 0) { + CERROR("Error %d getting sock local IP\n", rc); + return rc; + } + + return 0; +} + +int +ksocknal_lib_zc_capable(struct ksock_conn *conn) +{ + int caps = conn->ksnc_sock->sk->sk_route_caps; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x) + return 0; + + /* ZC if the socket supports scatter/gather and doesn't need software + * checksums */ + return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_CSUM_MASK) != 0); +} + +int +ksocknal_lib_send_hdr(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratchiov) +{ + struct socket *sock = conn->ksnc_sock; + int nob = 0; + int rc; + + if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */ + conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */ + tx->tx_nob == tx->tx_resid && /* frist sending */ + tx->tx_msg.ksm_csum == 0) /* not checksummed */ + ksocknal_lib_csum_tx(tx); + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + + { +#if SOCKNAL_SINGLE_FRAG_TX + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else + unsigned int niov = tx->tx_niov; +#endif + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + + if (tx->tx_niov) { + scratchiov[0] = tx->tx_hdr; + nob += scratchiov[0].iov_len; + } + + if (!list_empty(&conn->ksnc_tx_queue) || + nob < tx->tx_resid) + msg.msg_flags |= MSG_MORE; + + rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob); + } + return rc; +} + +int +ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratchiov) +{ + struct socket *sock = conn->ksnc_sock; + struct bio_vec *kiov = tx->tx_kiov; + int rc; + int nob; + + /* Not NOOP message */ + LASSERT(tx->tx_lnetmsg != NULL); + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + if (tx->tx_msg.ksm_zc_cookies[0] != 0) { + /* Zero copy is enabled */ + struct sock *sk = sock->sk; + struct page *page = kiov->bv_page; + int offset = kiov->bv_offset; + int fragsize = kiov->bv_len; + int msgflg = MSG_DONTWAIT; + + CDEBUG(D_NET, "page %p + offset %x for %d\n", + page, offset, kiov->bv_len); + + if (!list_empty(&conn->ksnc_tx_queue) || + fragsize < tx->tx_resid) + msgflg |= MSG_MORE; + + rc = sk->sk_prot->sendpage(sk, page, + offset, fragsize, msgflg); + } else { +#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else +#ifdef CONFIG_HIGHMEM +#warning "XXX risk of kmap deadlock on multiple frags..." +#endif + unsigned int niov = tx->tx_nkiov; +#endif + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + int i; + + for (nob = i = 0; i < niov; i++) { + scratchiov[i].iov_base = kmap(kiov[i].bv_page) + + kiov[i].bv_offset; + nob += scratchiov[i].iov_len = kiov[i].bv_len; + } + + if (!list_empty(&conn->ksnc_tx_queue) || + nob < tx->tx_resid) + msg.msg_flags |= MSG_MORE; + + rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob); + + for (i = 0; i < niov; i++) + kunmap(kiov[i].bv_page); + } + return rc; +} + +void +ksocknal_lib_eager_ack(struct ksock_conn *conn) +{ + struct socket *sock = conn->ksnc_sock; + + /* Remind the socket to ACK eagerly. 
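+	 * (TCP_QUICKACK is only a hint and is not sticky, so it is set
+	 * again every time this function runs.)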
If I don't, the socket might + * think I'm about to send something it could piggy-back the ACK on, + * introducing delay in completing zero-copy sends in my peer_ni. + */ + + tcp_sock_set_quickack(sock->sk, 1); +} + +int +ksocknal_lib_recv_iov(struct ksock_conn *conn, struct kvec *scratchiov) +{ +#if SOCKNAL_SINGLE_FRAG_RX + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else + unsigned int niov = conn->ksnc_rx_niov; +#endif + struct kvec *iov = conn->ksnc_rx_iov; + struct msghdr msg = { + .msg_flags = 0 + }; + int nob; + int i; + int rc; + int fragnob; + int sum; + __u32 saved_csum; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + LASSERT (niov > 0); + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = iov[i]; + nob += scratchiov[i].iov_len; + } + LASSERT (nob <= conn->ksnc_rx_nob_wanted); + + rc = kernel_recvmsg(conn->ksnc_sock, &msg, scratchiov, niov, nob, + MSG_DONTWAIT); + + saved_csum = 0; + if (conn->ksnc_proto == &ksocknal_protocol_v2x) { + saved_csum = conn->ksnc_msg.ksm_csum; + conn->ksnc_msg.ksm_csum = 0; + } + + if (saved_csum != 0) { + /* accumulate checksum */ + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT (i < niov); + + fragnob = iov[i].iov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + iov[i].iov_base, fragnob); + } + conn->ksnc_msg.ksm_csum = saved_csum; + } + + return rc; +} + +static void +ksocknal_lib_kiov_vunmap(void *addr) +{ + if (addr == NULL) + return; + + vunmap(addr); +} + +static void * +ksocknal_lib_kiov_vmap(struct bio_vec *kiov, int niov, + struct kvec *iov, struct page **pages) +{ + void *addr; + int nob; + int i; + + if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL) + return NULL; + + LASSERT (niov <= LNET_MAX_IOV); + + if (niov < 2 || + niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags) + return NULL; + + for (nob = i = 0; i < niov; i++) { + if ((kiov[i].bv_offset != 0 && i > 0) || + (kiov[i].bv_offset + kiov[i].bv_len != + PAGE_SIZE && i < niov - 1)) + return NULL; + + pages[i] = kiov[i].bv_page; + nob += kiov[i].bv_len; + } + + addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL); + if (addr == NULL) + return NULL; + + iov->iov_base = addr + kiov[0].bv_offset; + iov->iov_len = nob; + + return addr; +} + +int +ksocknal_lib_recv_kiov(struct ksock_conn *conn, struct page **pages, + struct kvec *scratchiov) +{ +#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK + struct kvec scratch; + struct kvec *scratchiov = &scratch; + struct page **pages = NULL; + unsigned int niov = 1; +#else +#ifdef CONFIG_HIGHMEM +#warning "XXX risk of kmap deadlock on multiple frags..." +#endif + unsigned int niov = conn->ksnc_rx_nkiov; +#endif + struct bio_vec *kiov = conn->ksnc_rx_kiov; + struct msghdr msg = { + .msg_flags = 0 + }; + int nob; + int i; + int rc; + void *base; + void *addr; + int sum; + int fragnob; + int n; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. 
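+	 * so build a private scratch iovec first: either a single
+	 * vmap()ed region covering every fragment, or one kmap()ed
+	 * entry per page.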
*/ + if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) { + nob = scratchiov[0].iov_len; + n = 1; + + } else { + for (nob = i = 0; i < niov; i++) { + nob += scratchiov[i].iov_len = kiov[i].bv_len; + scratchiov[i].iov_base = kmap(kiov[i].bv_page) + + kiov[i].bv_offset; + } + n = niov; + } + + LASSERT (nob <= conn->ksnc_rx_nob_wanted); + + rc = kernel_recvmsg(conn->ksnc_sock, &msg, scratchiov, n, nob, + MSG_DONTWAIT); + + if (conn->ksnc_msg.ksm_csum != 0) { + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT(i < niov); + + /* Dang! have to kmap again because I have nowhere to + * stash the mapped address. But by doing it while the + * page is still mapped, the kernel just bumps the map + * count and returns me the address it stashed. + */ + base = kmap(kiov[i].bv_page) + kiov[i].bv_offset; + fragnob = kiov[i].bv_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + base, fragnob); + + kunmap(kiov[i].bv_page); + } + } + + if (addr != NULL) { + ksocknal_lib_kiov_vunmap(addr); + } else { + for (i = 0; i < niov; i++) + kunmap(kiov[i].bv_page); + } + + return rc; +} + +void +ksocknal_lib_csum_tx(struct ksock_tx *tx) +{ + int i; + __u32 csum; + void *base; + + LASSERT(tx->tx_hdr.iov_base == (void *)&tx->tx_msg); + LASSERT(tx->tx_conn != NULL); + LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x); + + tx->tx_msg.ksm_csum = 0; + + csum = ksocknal_csum(~0, (void *)tx->tx_hdr.iov_base, + tx->tx_hdr.iov_len); + + for (i = 0; i < tx->tx_nkiov; i++) { + base = kmap(tx->tx_kiov[i].bv_page) + + tx->tx_kiov[i].bv_offset; + + csum = ksocknal_csum(csum, base, tx->tx_kiov[i].bv_len); + + kunmap(tx->tx_kiov[i].bv_page); + } + + if (*ksocknal_tunables.ksnd_inject_csum_error) { + csum++; + *ksocknal_tunables.ksnd_inject_csum_error = 0; + } + + tx->tx_msg.ksm_csum = csum; +} + +int +ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, int *rxmem, int *nagle) +{ + struct socket *sock = conn->ksnc_sock; + struct tcp_sock *tp = tcp_sk(sock->sk); + + if (ksocknal_connsock_addref(conn) < 0) { + LASSERT(conn->ksnc_closing); + *txmem = 0; + *rxmem = 0; + *nagle = 0; + return -ESHUTDOWN; + } + + lnet_sock_getbuf(sock, txmem, rxmem); + + *nagle = !(tp->nonagle & TCP_NAGLE_OFF); + + ksocknal_connsock_decref(conn); + + + return 0; +} + +int +ksocknal_lib_setup_sock (struct socket *sock) +{ + int rc; + int keep_idle; + int keep_intvl; + int keep_count; + int do_keepalive; + struct tcp_sock *tp = tcp_sk(sock->sk); + + sock->sk->sk_allocation = GFP_NOFS; + + /* Ensure this socket aborts active sends immediately when closed. 
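+	 * i.e. clear SO_LINGER so close() never blocks, and make
+	 * tp->linger2 negative so the connection is reset rather than
+	 * left lingering in FIN_WAIT2.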
*/ + sock_reset_flag(sock->sk, SOCK_LINGER); + + tp->linger2 = -1; + + if (!*ksocknal_tunables.ksnd_nagle) + tcp_sock_set_nodelay(sock->sk); + + lnet_sock_setbuf(sock, + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size); + +/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ +#ifdef SOCKNAL_BACKOFF + if (*ksocknal_tunables.ksnd_backoff_init > 0) { + int option = *ksocknal_tunables.ksnd_backoff_init; +#ifdef SOCKNAL_BACKOFF_MS + option *= 1000; +#endif + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_BACKOFF_INIT, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't set initial tcp backoff %d: %d\n", + option, rc); + return rc; + } + } + + if (*ksocknal_tunables.ksnd_backoff_max > 0) { + int option = *ksocknal_tunables.ksnd_backoff_max; +#ifdef SOCKNAL_BACKOFF_MS + option *= 1000; +#endif + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_BACKOFF_MAX, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't set maximum tcp backoff %d: %d\n", + option, rc); + return rc; + } + } +#endif + + /* snapshot tunables */ + keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; + keep_count = *ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; + + do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); + +#ifdef HAVE_KERNEL_SETSOCKOPT + /* open-coded version doesn't work in all kernels, and + * there is no helper function, so call kernel_setsockopt() + * directly. + */ + { + int option = (do_keepalive ? 1 : 0); + kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&option, sizeof(option)); + } +#else + if (sock->sk->sk_prot->keepalive) + sock->sk->sk_prot->keepalive(sock->sk, do_keepalive); + if (do_keepalive) + sock_set_flag(sock->sk, SOCK_KEEPOPEN); + else + sock_reset_flag(sock->sk, SOCK_KEEPOPEN); +#endif /* HAVE_KERNEL_SETSOCKOPT */ + + if (!do_keepalive) + return (0); + + rc = tcp_sock_set_keepidle(sock->sk, keep_idle); + if (rc != 0) { + CERROR("Can't set TCP_KEEPIDLE: %d\n", rc); + return rc; + } + + rc = tcp_sock_set_keepintvl(sock->sk, keep_intvl); + if (rc != 0) { + CERROR("Can't set TCP_KEEPINTVL: %d\n", rc); + return rc; + } + + rc = tcp_sock_set_keepcnt(sock->sk, keep_count); + if (rc != 0) { + CERROR("Can't set TCP_KEEPCNT: %d\n", rc); + return rc; + } + + return (0); +} + +void +ksocknal_lib_push_conn(struct ksock_conn *conn) +{ + struct sock *sk; + struct tcp_sock *tp; + int nonagle; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) /* being shut down */ + return; + + sk = conn->ksnc_sock->sk; + tp = tcp_sk(sk); + + lock_sock(sk); + nonagle = tp->nonagle; + tp->nonagle = TCP_NAGLE_OFF; + release_sock(sk); + + tcp_sock_set_nodelay(conn->ksnc_sock->sk); + + lock_sock(sk); + tp->nonagle = nonagle; + release_sock(sk); + + ksocknal_connsock_decref(conn); +} + +void ksocknal_read_callback(struct ksock_conn *conn); +void ksocknal_write_callback(struct ksock_conn *conn); +/* + * socket call back in Linux + */ +static void +#ifdef HAVE_SK_DATA_READY_ONE_ARG +ksocknal_data_ready(struct sock *sk) +#else +ksocknal_data_ready(struct sock *sk, int n) +#endif +{ + struct ksock_conn *conn; + + /* interleave correctly with closing sockets... 
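+	 * ksocknal_terminate_conn() clears sk_user_data while holding
+	 * ksnd_global_lock for write, so taking the lock for read here
+	 * is enough to serialise with connection teardown.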
*/ + LASSERT(!in_irq()); + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = sk->sk_user_data; + if (conn == NULL) { /* raced with ksocknal_terminate_conn */ + LASSERT(sk->sk_data_ready != &ksocknal_data_ready); +#ifdef HAVE_SK_DATA_READY_ONE_ARG + sk->sk_data_ready(sk); +#else + sk->sk_data_ready(sk, n); +#endif + } else + ksocknal_read_callback(conn); + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +static void +ksocknal_write_space (struct sock *sk) +{ + struct ksock_conn *conn; + int wspace; + int min_wpace; + + /* interleave correctly with closing sockets... */ + LASSERT(!in_irq()); + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = sk->sk_user_data; + wspace = sk_stream_wspace(sk); + min_wpace = sk_stream_min_wspace(sk); + + CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", + sk, wspace, min_wpace, conn, + (conn == NULL) ? "" : (conn->ksnc_tx_ready ? + " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? + " scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty(&conn->ksnc_tx_queue) ? + " empty" : " queued")); + + if (conn == NULL) { /* raced with ksocknal_terminate_conn */ + LASSERT (sk->sk_write_space != &ksocknal_write_space); + sk->sk_write_space (sk); + + read_unlock(&ksocknal_data.ksnd_global_lock); + return; + } + + if (wspace >= min_wpace) { /* got enough space */ + ksocknal_write_callback(conn); + + /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the + * ENOMEM check in ksocknal_transmit is race-free (think about + * it). */ + + clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +void +ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn) +{ + conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; + conn->ksnc_saved_write_space = sock->sk->sk_write_space; +} + +void +ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn) +{ + sock->sk->sk_user_data = conn; + sock->sk->sk_data_ready = ksocknal_data_ready; + sock->sk->sk_write_space = ksocknal_write_space; +} + +void +ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn) +{ + /* Remove conn's network callbacks. + * NB I _have_ to restore the callback, rather than storing a noop, + * since the socket could survive past this module being unloaded!! */ + sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; + sock->sk->sk_write_space = conn->ksnc_saved_write_space; + + /* A callback could be in progress already; they hold a read lock + * on ksnd_global_lock (to serialise with me) and NOOP if + * sk_user_data is NULL. */ + sock->sk->sk_user_data = NULL; + + return ; +} + +int +ksocknal_lib_memory_pressure(struct ksock_conn *conn) +{ + int rc = 0; + struct ksock_sched *sched; + + sched = conn->ksnc_scheduler; + spin_lock_bh(&sched->kss_lock); + + if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) && + !conn->ksnc_tx_ready) { + /* SOCK_NOSPACE is set when the socket fills + * and cleared in the write_space callback + * (which also sets ksnc_tx_ready). 
If + * SOCK_NOSPACE and ksnc_tx_ready are BOTH + * zero, I didn't fill the socket and + * write_space won't reschedule me, so I + * return -ENOMEM to get my caller to retry + * after a timeout */ + rc = -ENOMEM; + } + + spin_unlock_bh(&sched->kss_lock); + + return rc; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c new file mode 100644 index 0000000000000..10aee437590bb --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Eric Barton + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +#include +#if defined(__x86_64__) || defined(__i386__) +#include +#endif +#ifdef HAVE_ETHTOOL_LINK_SETTINGS +#include +#include +#endif + +#define CURRENT_LND_VERSION 1 + +static int sock_timeout; +module_param(sock_timeout, int, 0644); +MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)"); + +static int credits = DEFAULT_CREDITS; +module_param(credits, int, 0444); +MODULE_PARM_DESC(credits, "# concurrent sends"); + +static int peer_credits = DEFAULT_PEER_CREDITS; +module_param(peer_credits, int, 0444); +MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); + +static int peer_buffer_credits; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); + +static int peer_timeout = DEFAULT_PEER_TIMEOUT; +module_param(peer_timeout, int, 0444); +MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +/* Number of daemons in each thread pool which is percpt, + * we will estimate reasonable value based on CPUs if it's not set. 
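+ * ("percpt" here means one scheduler pool per CPU partition; leaving
+ * nscheds at 0 selects the automatic estimate).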
*/ +static unsigned int nscheds; +module_param(nscheds, int, 0444); +MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting"); + +static int nconnds = 4; +module_param(nconnds, int, 0444); +MODULE_PARM_DESC(nconnds, "# connection daemons while starting"); + +static int nconnds_max = 64; +module_param(nconnds_max, int, 0444); +MODULE_PARM_DESC(nconnds_max, "max # connection daemons"); + +static int min_reconnectms = 1000; +module_param(min_reconnectms, int, 0644); +MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)"); + +static int max_reconnectms = 60000; +module_param(max_reconnectms, int, 0644); +MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)"); + +static int eager_ack; +module_param(eager_ack, int, 0644); +MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly"); + +static int typed_conns = 1; +module_param(typed_conns, int, 0444); +MODULE_PARM_DESC(typed_conns, "use different sockets for bulk"); + +static int min_bulk = (1<<10); +module_param(min_bulk, int, 0644); +MODULE_PARM_DESC(min_bulk, "smallest 'large' message"); + +# define DEFAULT_BUFFER_SIZE 0 +static int tx_buffer_size = DEFAULT_BUFFER_SIZE; +module_param(tx_buffer_size, int, 0644); +MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)"); + +static int rx_buffer_size = DEFAULT_BUFFER_SIZE; +module_param(rx_buffer_size, int, 0644); +MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)"); + +static int nagle = 0; +module_param(nagle, int, 0644); +MODULE_PARM_DESC(nagle, "enable NAGLE?"); + +static int round_robin = 1; +module_param(round_robin, int, 0644); +MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces"); + +static int keepalive = 30; +module_param(keepalive, int, 0644); +MODULE_PARM_DESC(keepalive, "# seconds before send keepalive"); + +static int keepalive_idle = 30; +module_param(keepalive_idle, int, 0644); +MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe"); + +#define DEFAULT_KEEPALIVE_COUNT 5 +static int keepalive_count = DEFAULT_KEEPALIVE_COUNT; +module_param(keepalive_count, int, 0644); +MODULE_PARM_DESC(keepalive_count, "# missed probes == dead"); + +static int keepalive_intvl = 5; +module_param(keepalive_intvl, int, 0644); +MODULE_PARM_DESC(keepalive_intvl, "seconds between probes"); + +static int enable_csum = 0; +module_param(enable_csum, int, 0644); +MODULE_PARM_DESC(enable_csum, "enable check sum"); + +static int inject_csum_error = 0; +module_param(inject_csum_error, int, 0644); +MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error"); + +static int enable_irq_affinity = 0; +module_param(enable_irq_affinity, int, 0644); +MODULE_PARM_DESC(enable_irq_affinity, "enable IRQ affinity"); + +static int nonblk_zcack = 1; +module_param(nonblk_zcack, int, 0644); +MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection"); + +static unsigned int zc_min_payload = (16 << 10); +module_param(zc_min_payload, int, 0644); +MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy"); + +static unsigned int zc_recv = 0; +module_param(zc_recv, int, 0644); +MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver"); + +static unsigned int zc_recv_min_nfrags = 16; +module_param(zc_recv_min_nfrags, int, 0644); +MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv"); + +static unsigned int conns_per_peer = DEFAULT_CONNS_PER_PEER; +module_param(conns_per_peer, uint, 0644); +MODULE_PARM_DESC(conns_per_peer, 
"number of connections per peer"); + +/* By default skip_mr_route_setup is 0 (do not skip) */ +static unsigned int skip_mr_route_setup; +module_param(skip_mr_route_setup, uint, 0444); +MODULE_PARM_DESC(skip_mr_route_setup, "skip automatic setup of linux routes for MR"); + +#ifdef SOCKNAL_BACKOFF +static int backoff_init = 3; +module_param(backoff_init, int, 0644); +MODULE_PARM_DESC(backoff_init, "seconds for initial tcp backoff"); + +static int backoff_max = 3; +module_param(backoff_max, int, 0644); +MODULE_PARM_DESC(backoff_max, "seconds for maximum tcp backoff"); +#endif + +#if SOCKNAL_VERSION_DEBUG +static int protocol = 3; +module_param(protocol, int, 0644); +MODULE_PARM_DESC(protocol, "protocol version"); +#endif + +static inline bool is_native_host(void) +{ +#if (!(defined(__x86_64__) || defined(__i386))) + return true; +#elif defined(HAVE_HYPERVISOR_IS_TYPE) + return hypervisor_is_type(X86_HYPER_NATIVE); +#else + return x86_hyper == NULL; +#endif +} + +struct ksock_tunables ksocknal_tunables; +static struct lnet_ioctl_config_socklnd_tunables default_tunables; + +#ifdef HAVE_ETHTOOL_LINK_SETTINGS +static int ksocklnd_ni_get_eth_intf_speed(struct lnet_ni *ni) +{ + struct net_device *dev; + int intf_idx = -1; + int ret = -1; + + DECLARE_CONST_IN_IFADDR(ifa); + + /* check if ni has interface assigned */ + if (!ni->ni_net_ns || !ni->ni_interface) + return 0; + + rtnl_lock(); + for_each_netdev(ni->ni_net_ns, dev) { + int flags = dev_get_flags(dev); + struct in_device *in_dev; + + if (flags & IFF_LOOPBACK) /* skip the loopback IF */ + continue; + + if (!(flags & IFF_UP)) + continue; + + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + continue; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (strcmp(ifa->ifa_label, ni->ni_interface) == 0) + intf_idx = dev->ifindex; + } + endfor_ifa(in_dev); + + if (intf_idx >= 0) + break; + } + if (intf_idx >= 0) { + struct ethtool_link_ksettings cmd; + int ethtool_ret; + + /* Some devices may not be providing link settings */ + ethtool_ret = __ethtool_get_link_ksettings(dev, &cmd); + if (!ethtool_ret) + ret = cmd.base.speed; + else + ret = ethtool_ret; + } + rtnl_unlock(); + + return ret; +} + +static int ksocklnd_speed2cpp(int speed) +{ + /* Use the minimum of 1Gbps to avoid calling ilog2 with 0 */ + if (speed < 1000) + speed = 1000; + + /* Pick heuristically optimal conns_per_peer value + * for the specified ethernet interface speed (Mbps) + */ + return ilog2(speed/1000) / 2 + 1; +} +#endif + +static int ksocklnd_lookup_conns_per_peer(struct lnet_ni *ni) +{ + int cpp = 1; +#ifdef HAVE_ETHTOOL_LINK_SETTINGS + int speed = ksocklnd_ni_get_eth_intf_speed(ni); + + if (ni->ni_interface) + CDEBUG(D_NET, "intf %s speed %d\n", ni->ni_interface, speed); + + if (speed > 0) + cpp = ksocklnd_speed2cpp(speed); +#endif + return cpp; +} + +int ksocknal_tunables_init(void) +{ + default_tunables.lnd_version = CURRENT_LND_VERSION; + default_tunables.lnd_conns_per_peer = conns_per_peer; + + /* initialize ksocknal_tunables structure */ + ksocknal_tunables.ksnd_timeout = &sock_timeout; + ksocknal_tunables.ksnd_nscheds = &nscheds; + ksocknal_tunables.ksnd_nconnds = &nconnds; + ksocknal_tunables.ksnd_nconnds_max = &nconnds_max; + ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms; + ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms; + ksocknal_tunables.ksnd_eager_ack = &eager_ack; + ksocknal_tunables.ksnd_typed_conns = &typed_conns; + ksocknal_tunables.ksnd_min_bulk = &min_bulk; + ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size; + 
ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size; + ksocknal_tunables.ksnd_nagle = &nagle; + ksocknal_tunables.ksnd_round_robin = &round_robin; + ksocknal_tunables.ksnd_keepalive = &keepalive; + ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle; + ksocknal_tunables.ksnd_keepalive_count = &keepalive_count; + ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl; + ksocknal_tunables.ksnd_credits = &credits; + ksocknal_tunables.ksnd_peertxcredits = &peer_credits; + ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits; + ksocknal_tunables.ksnd_peertimeout = &peer_timeout; + ksocknal_tunables.ksnd_enable_csum = &enable_csum; + ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error; + ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack; + ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload; + ksocknal_tunables.ksnd_zc_recv = &zc_recv; + ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags; + if (conns_per_peer > ((1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1)) { + CWARN("socklnd conns_per_peer is capped at %u.\n", + (1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1); + } + ksocknal_tunables.ksnd_conns_per_peer = &conns_per_peer; + + if (enable_irq_affinity) { + CWARN("irq_affinity is removed from socklnd because modern " + "computer always has fast CPUs and more cores than " + "# NICs, although you still can set irq_affinity by " + "another way, please check manual for details.\n"); + } + ksocknal_tunables.ksnd_irq_affinity = &enable_irq_affinity; + +#ifdef SOCKNAL_BACKOFF + ksocknal_tunables.ksnd_backoff_init = &backoff_init; + ksocknal_tunables.ksnd_backoff_max = &backoff_max; +#endif + +#if SOCKNAL_VERSION_DEBUG + ksocknal_tunables.ksnd_protocol = &protocol; +#endif + + if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10)) + *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10); + + /* When on a hypervisor set the minimum zero copy size + * above the maximum payload size + */ + if (!is_native_host()) + *ksocknal_tunables.ksnd_zc_min_payload = (16 << 20) + 1; + + return 0; +} + +void ksocknal_tunables_setup(struct lnet_ni *ni) +{ + struct lnet_ioctl_config_socklnd_tunables *tunables; + struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables; + + /* If no tunables specified, setup default tunables */ + if (!ni->ni_lnd_tunables_set) + memcpy(&ni->ni_lnd_tunables.lnd_tun_u.lnd_sock, + &default_tunables, sizeof(*tunables)); + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_sock; + + /* Current API version */ + tunables->lnd_version = CURRENT_LND_VERSION; + + net_tunables = &ni->ni_net->net_tunables; + + if (net_tunables->lct_peer_timeout == -1) + net_tunables->lct_peer_timeout = + *ksocknal_tunables.ksnd_peertimeout; + + if (net_tunables->lct_max_tx_credits == -1) + net_tunables->lct_max_tx_credits = + *ksocknal_tunables.ksnd_credits; + + if (net_tunables->lct_peer_tx_credits == -1) + net_tunables->lct_peer_tx_credits = + *ksocknal_tunables.ksnd_peertxcredits; + + if (net_tunables->lct_peer_tx_credits > + net_tunables->lct_max_tx_credits) + net_tunables->lct_peer_tx_credits = + net_tunables->lct_max_tx_credits; + + if (net_tunables->lct_peer_rtr_credits == -1) + net_tunables->lct_peer_rtr_credits = + *ksocknal_tunables.ksnd_peerrtrcredits; + + if (!tunables->lnd_conns_per_peer) + tunables->lnd_conns_per_peer = + ksocklnd_lookup_conns_per_peer(ni); +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c new file mode 100644 index 0000000000000..40a1ffbea1405 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c @@ -0,0 +1,1001 @@ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2017, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +/* + * Protocol entries : + * pro_send_hello : send hello message + * pro_recv_hello : receive hello message + * pro_pack : pack message header + * pro_unpack : unpack message header + * pro_queue_tx_zcack() : Called holding BH lock: kss_lock + * return 1 if ACK is piggybacked, otherwise return 0 + * pro_queue_tx_msg() : Called holding BH lock: kss_lock + * return the ACK that piggybacked by my message, or NULL + * pro_handle_zcreq() : handler of incoming ZC-REQ + * pro_handle_zcack() : handler of incoming ZC-ACK + * pro_match_tx() : Called holding glock + */ + +static struct ksock_tx * +ksocknal_queue_tx_msg_v1(struct ksock_conn *conn, struct ksock_tx *tx_msg) +{ + /* V1.x, just enqueue it */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + return NULL; +} + +void +ksocknal_next_tx_carrier(struct ksock_conn *conn) +{ + struct ksock_tx *tx = conn->ksnc_tx_carrier; + + /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */ + LASSERT(!list_empty(&conn->ksnc_tx_queue)); + LASSERT(tx != NULL); + + /* Next TX that can carry ZC-ACK or LNet message */ + if (tx->tx_list.next == &conn->ksnc_tx_queue) { + /* no more packets queued */ + conn->ksnc_tx_carrier = NULL; + } else { + conn->ksnc_tx_carrier = list_next_entry(tx, tx_list); + LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type == + tx->tx_msg.ksm_type); + } +} + +static int +ksocknal_queue_tx_zcack_v2(struct ksock_conn *conn, + struct ksock_tx *tx_ack, __u64 cookie) +{ + struct ksock_tx *tx = conn->ksnc_tx_carrier; + + LASSERT (tx_ack == NULL || + tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + /* + * Enqueue or piggyback tx_ack / cookie + * . no tx can piggyback cookie of tx_ack (or cookie), just + * enqueue the tx_ack (if tx_ack != NUL) and return NULL. + * . There is tx can piggyback cookie of tx_ack (or cookie), + * piggyback the cookie and return the tx. 
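+	 * In terms of the return value: 0 means the ACK had to be
+	 * queued as its own tx (or nothing was queued), 1 means the
+	 * cookie was piggybacked on a queued tx.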
+ */ + if (tx == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) { + /* tx is noop zc-ack, can't piggyback zc-ack cookie */ + if (tx_ack != NULL) + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + return 0; + } + + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET); + LASSERT(tx->tx_msg.ksm_zc_cookies[1] == 0); + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + /* piggyback the zc-ack cookie */ + tx->tx_msg.ksm_zc_cookies[1] = cookie; + /* move on to the next TX which can carry cookie */ + ksocknal_next_tx_carrier(conn); + + return 1; +} + +static struct ksock_tx * +ksocknal_queue_tx_msg_v2(struct ksock_conn *conn, struct ksock_tx *tx_msg) +{ + struct ksock_tx *tx = conn->ksnc_tx_carrier; + + /* + * Enqueue tx_msg: + * . If there is no NOOP on the connection, just enqueue + * tx_msg and return NULL + * . If there is NOOP on the connection, piggyback the cookie + * and replace the NOOP tx, and return the NOOP tx. + */ + if (tx == NULL) { /* nothing on queue */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_msg; + return NULL; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + return NULL; + } + + LASSERT (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + /* There is a noop zc-ack can be piggybacked */ + tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1]; + ksocknal_next_tx_carrier(conn); + + /* use new_tx to replace the noop zc-ack packet */ + list_splice(&tx->tx_list, &tx_msg->tx_list); + + return tx; +} + +static int +ksocknal_queue_tx_zcack_v3(struct ksock_conn *conn, + struct ksock_tx *tx_ack, __u64 cookie) +{ + struct ksock_tx *tx; + + if (conn->ksnc_type != SOCKLND_CONN_ACK) + return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie); + + /* non-blocking ZC-ACK (to router) */ + LASSERT (tx_ack == NULL || + tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + if ((tx = conn->ksnc_tx_carrier) == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + /* conn->ksnc_tx_carrier != NULL */ + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */ + return 1; + + if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) { + /* replace the keepalive PING with a real ACK */ + LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[0] || + cookie == tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: %llu\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), cookie); + return 1; /* XXX return error in the future */ + } + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* NOOP tx has only one ZC-ACK cookie, can carry at least one more */ + if (tx->tx_msg.ksm_zc_cookies[1] > cookie) { + tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1]; + tx->tx_msg.ksm_zc_cookies[1] = cookie; + } else { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + } + + if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) { + /* not likely to carry more ACKs, skip it to simplify logic */ + ksocknal_next_tx_carrier(conn); + } + + return 1; + } + + /* takes two or more cookies already */ + + if (tx->tx_msg.ksm_zc_cookies[0] > 
tx->tx_msg.ksm_zc_cookies[1]) { + __u64 tmp = 0; + + /* two separated cookies: (a+2, a) or (a+1, a) */ + LASSERT (tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] <= 2); + + if (tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] == 2) { + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) + tmp = cookie; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) { + tmp = tx->tx_msg.ksm_zc_cookies[1]; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) { + tmp = tx->tx_msg.ksm_zc_cookies[0]; + } + + if (tmp != 0) { + /* range of cookies */ + tx->tx_msg.ksm_zc_cookies[0] = tmp - 1; + tx->tx_msg.ksm_zc_cookies[1] = tmp + 1; + return 1; + } + + } else { + /* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is a range + * of cookies + */ + if (cookie >= tx->tx_msg.ksm_zc_cookies[0] && + cookie <= tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: %llu\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + cookie); + return 1; /* XXX: return error in the future */ + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) { + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + return 1; + } + } + + /* failed to piggyback ZC-ACK */ + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue); + /* the next tx can piggyback at least 1 ACK */ + ksocknal_next_tx_carrier(conn); + } + + return 0; +} + +static int +ksocknal_match_tx(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) +{ + int nob; + +#if SOCKNAL_VERSION_DEBUG + if (!*ksocknal_tunables.ksnd_typed_conns) + return SOCKNAL_MATCH_YES; +#endif + + if (tx == NULL || tx->tx_lnetmsg == NULL) { + /* noop packet */ + nob = sizeof(struct ksock_msg_hdr); + } else { + nob = tx->tx_lnetmsg->msg_len + + ((conn->ksnc_proto == &ksocknal_protocol_v1x) ? 
+ 0 : sizeof(struct ksock_msg_hdr)) + + sizeof(struct lnet_hdr_nid4); + } + + /* default checking for typed connection */ + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_BULK_IN: + return SOCKNAL_MATCH_MAY; + + case SOCKLND_CONN_BULK_OUT: + if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +static int +ksocknal_match_tx_v3(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) +{ + int nob; + + if (tx == NULL || tx->tx_lnetmsg == NULL) + nob = sizeof(struct ksock_msg_hdr); + else + nob = sizeof(struct ksock_msg_hdr) + + sizeof(struct lnet_hdr_nid4) + + tx->tx_lnetmsg->msg_len; + + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_ACK: + if (nonblk) + return SOCKNAL_MATCH_YES; + else if (tx == NULL || tx->tx_lnetmsg == NULL) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_BULK_OUT: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +static int +ksocknal_match_tx_v4(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) +{ + int nob; + + if (!tx || !tx->tx_lnetmsg) + nob = sizeof(struct ksock_msg_hdr); + else + nob = sizeof(struct ksock_msg_hdr) + + sizeof(struct lnet_hdr_nid16) + + tx->tx_lnetmsg->msg_len; + + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_ACK: + if (nonblk) + return SOCKNAL_MATCH_YES; + else if (tx == NULL || tx->tx_lnetmsg == NULL) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_BULK_OUT: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +/* (Sink) handle incoming ZC request from sender */ +static int +ksocknal_handle_zcreq(struct ksock_conn *c, __u64 cookie, int remote) +{ + struct ksock_peer_ni *peer_ni = c->ksnc_peer; + struct ksock_conn *conn; + struct ksock_tx *tx; + int rc; + + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = ksocknal_find_conn_locked(peer_ni, NULL, !!remote); + if (conn != NULL) { + struct ksock_sched *sched = conn->ksnc_scheduler; + + LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL); + + spin_lock_bh(&sched->kss_lock); + + rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie); + + spin_unlock_bh(&sched->kss_lock); + + if (rc) { /* piggybacked */ + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + /* ACK connection is not ready, or can't piggyback the ACK */ + tx = ksocknal_alloc_tx_noop(cookie, !!remote); + if (tx == NULL) + 
return -ENOMEM; + + rc = ksocknal_launch_packet(peer_ni->ksnp_ni, tx, &peer_ni->ksnp_id); + if (rc == 0) + return 0; + + ksocknal_free_tx(tx); + return rc; +} + +/* (Sender) handle ZC_ACK from sink */ +static int +ksocknal_handle_zcack(struct ksock_conn *conn, __u64 cookie1, __u64 cookie2) +{ + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_tx *tx; + struct ksock_tx *tmp; + LIST_HEAD(zlist); + int count; + + if (cookie1 == 0) + cookie1 = cookie2; + + count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1); + + if (cookie2 == SOCKNAL_KEEPALIVE_PING && + (conn->ksnc_proto == &ksocknal_protocol_v3x || + conn->ksnc_proto == &ksocknal_protocol_v4x)) { + /* keepalive PING for V3.x, just ignore it */ + return count == 1 ? 0 : -EPROTO; + } + + spin_lock(&peer_ni->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, &peer_ni->ksnp_zc_req_list, + tx_zc_list) { + __u64 c = tx->tx_msg.ksm_zc_cookies[0]; + + if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) { + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_move(&tx->tx_zc_list, &zlist); + + if (--count == 0) + break; + } + } + + spin_unlock(&peer_ni->ksnp_lock); + + while ((tx = list_first_entry_or_null(&zlist, struct ksock_tx, + tx_zc_list)) != NULL) { + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } + + return count == 0 ? 0 : -EPROTO; +} + +static int +ksocknal_send_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello) +{ + struct socket *sock = conn->ksnc_sock; + struct _lnet_hdr_nid4 *hdr; + struct lnet_magicversion *hmv; + int rc; + int i; + + BUILD_BUG_ON(sizeof(struct lnet_magicversion) != + offsetof(struct _lnet_hdr_nid4, src_nid)); + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate struct lnet_hdr_nid4\n"); + return -ENOMEM; + } + + hmv = (struct lnet_magicversion *)&hdr->dest_nid; + + /* Re-organize V2.x message header to V1.x (struct lnet_hdr_nid4) + * header and send out + */ + hmv->magic = cpu_to_le32 (LNET_PROTO_TCP_MAGIC); + hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR); + hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR); + + if (the_lnet.ln_testprotocompat) { + /* single-shot proto check */ + if (test_and_clear_bit(0, &the_lnet.ln_testprotocompat)) + hmv->version_major++; /* just different! 
*/ + + if (test_and_clear_bit(1, &the_lnet.ln_testprotocompat)) + hmv->magic = LNET_PROTO_MAGIC; + } + + hdr->src_nid = cpu_to_le64(lnet_nid_to_nid4(&hello->kshm_src_nid)); + hdr->src_pid = cpu_to_le32 (hello->kshm_src_pid); + hdr->type = cpu_to_le32 (LNET_MSG_HELLO); + hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32)); + hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype); + hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation); + + rc = lnet_sock_write(sock, hdr, sizeof(*hdr), lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO hdr to %pISp\n", + rc, &conn->ksnc_peeraddr); + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + for (i = 0; i < (int) hello->kshm_nips; i++) + hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]); + + rc = lnet_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d) to %pISp\n", + rc, hello->kshm_nips, + &conn->ksnc_peeraddr); + } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); + + return rc; +} + +static int +ksocknal_send_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + struct ksock_hello_msg_nid4 *hello4; + + CFS_ALLOC_PTR(hello4); + if (!hello4) { + CERROR("Can't allocate struct ksock_hello_msg_nid4\n"); + return -ENOMEM; + } + + hello->kshm_magic = LNET_PROTO_MAGIC; + hello->kshm_version = conn->ksnc_proto->pro_version; + + hello4->kshm_magic = LNET_PROTO_MAGIC; + hello4->kshm_version = conn->ksnc_proto->pro_version; + hello4->kshm_src_nid = lnet_nid_to_nid4(&hello->kshm_src_nid); + hello4->kshm_dst_nid = lnet_nid_to_nid4(&hello->kshm_dst_nid); + hello4->kshm_src_pid = hello->kshm_src_pid; + hello4->kshm_dst_pid = hello->kshm_dst_pid; + hello4->kshm_src_incarnation = hello->kshm_src_incarnation; + hello4->kshm_dst_incarnation = hello->kshm_dst_incarnation; + hello4->kshm_ctype = hello->kshm_ctype; + hello4->kshm_nips = hello->kshm_nips; + + if (the_lnet.ln_testprotocompat) { + /* single-shot proto check */ + if (test_and_clear_bit(0, &the_lnet.ln_testprotocompat)) + hello->kshm_version++; /* just different! 
*/ + } + hello4->kshm_magic = LNET_PROTO_MAGIC; + hello4->kshm_version = hello->kshm_version; + hello4->kshm_src_nid = lnet_nid_to_nid4(&hello->kshm_src_nid); + hello4->kshm_dst_nid = lnet_nid_to_nid4(&hello->kshm_dst_nid); + hello4->kshm_src_pid = hello->kshm_src_pid; + hello4->kshm_dst_pid = hello->kshm_dst_pid; + hello4->kshm_src_incarnation = hello->kshm_src_incarnation; + hello4->kshm_dst_incarnation = hello->kshm_dst_incarnation; + hello4->kshm_ctype = hello->kshm_ctype; + hello4->kshm_nips = hello->kshm_nips; + + rc = lnet_sock_write(sock, hello4, sizeof(*hello4), + lnet_acceptor_timeout()); + CFS_FREE_PTR(hello4); + if (rc) { + CNETERR("Error %d sending HELLO hdr to %pISp\n", + rc, &conn->ksnc_peeraddr); + return rc; + } + + if (hello->kshm_nips == 0) + return 0; + + rc = lnet_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d) to %pISp\n", rc, + hello->kshm_nips, + &conn->ksnc_peeraddr); + } + + return rc; +} + +static int +ksocknal_send_hello_v4(struct ksock_conn *conn, struct ksock_hello_msg *hello) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + + hello->kshm_magic = LNET_PROTO_MAGIC; + hello->kshm_version = conn->ksnc_proto->pro_version; + + rc = lnet_sock_write(sock, hello, sizeof(*hello), + lnet_acceptor_timeout()); + + if (rc != 0) + CNETERR("Error %d sending HELLO hdr to %pISp\n", + rc, &conn->ksnc_peeraddr); + return rc; +} + +static int +ksocknal_recv_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello, + int timeout) +{ + struct socket *sock = conn->ksnc_sock; + struct _lnet_hdr_nid4 *hdr; + int rc; + int i; + + CFS_ALLOC_PTR(hdr); + if (!hdr) { + CERROR("Can't allocate struct lnet_hdr_nid4\n"); + return -ENOMEM; + } + + rc = lnet_sock_read(sock, &hdr->src_nid, + sizeof(*hdr) - offsetof(struct _lnet_hdr_nid4, + src_nid), + timeout); + if (rc != 0) { + CERROR("Error %d reading rest of HELLO hdr from %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0 && rc != -EALREADY); + goto out; + } + + /* ...and check we got what we expected */ + if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) { + CERROR("Expecting a HELLO hdr, but got type %d from %pIS\n", + le32_to_cpu(hdr->type), + &conn->ksnc_peeraddr); + rc = -EPROTO; + goto out; + } + + lnet_nid4_to_nid(le64_to_cpu(hdr->src_nid), &hello->kshm_src_nid); + hello->kshm_src_pid = le32_to_cpu(hdr->src_pid); + hello->kshm_src_incarnation = le64_to_cpu(hdr->msg.hello.incarnation); + hello->kshm_ctype = le32_to_cpu(hdr->msg.hello.type); + hello->kshm_nips = le32_to_cpu(hdr->payload_length) / sizeof(__u32); + + if (hello->kshm_nips > LNET_INTERFACES_NUM) { + CERROR("Bad nips %d from ip %pIS\n", + hello->kshm_nips, &conn->ksnc_peeraddr); + rc = -EPROTO; + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + rc = lnet_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR("Error %d reading IPs from ip %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0 && rc != -EALREADY); + goto out; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %pIS\n", + i, &conn->ksnc_peeraddr); + rc = -EPROTO; + break; + } + } +out: + CFS_FREE_PTR(hdr); + + return rc; +} + +static int +ksocknal_recv_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello, + int timeout) +{ + struct socket *sock = conn->ksnc_sock; + struct 
ksock_hello_msg_nid4 *hello4 = (void *)hello; + int rc; + int i; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + conn->ksnc_flip = 0; + else + conn->ksnc_flip = 1; + + rc = lnet_sock_read(sock, &hello4->kshm_src_nid, + offsetof(struct ksock_hello_msg_nid4, kshm_ips) - + offsetof(struct ksock_hello_msg_nid4, kshm_src_nid), + timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + if (conn->ksnc_flip) { + /* These must be copied in reverse order to avoid corruption. */ + hello->kshm_nips = __swab32(hello4->kshm_nips); + hello->kshm_ctype = __swab32(hello4->kshm_ctype); + hello->kshm_dst_incarnation = __swab64(hello4->kshm_dst_incarnation); + hello->kshm_src_incarnation = __swab64(hello4->kshm_src_incarnation); + hello->kshm_dst_pid = __swab32(hello4->kshm_dst_pid); + hello->kshm_src_pid = __swab32(hello4->kshm_src_pid); + lnet_nid4_to_nid(hello4->kshm_dst_nid, &hello->kshm_dst_nid); + lnet_nid4_to_nid(hello4->kshm_src_nid, &hello->kshm_src_nid); + } else { + /* These must be copied in reverse order to avoid corruption. */ + hello->kshm_nips = hello4->kshm_nips; + hello->kshm_ctype = hello4->kshm_ctype; + hello->kshm_dst_incarnation = hello4->kshm_dst_incarnation; + hello->kshm_src_incarnation = hello4->kshm_src_incarnation; + hello->kshm_dst_pid = hello4->kshm_dst_pid; + hello->kshm_src_pid = hello4->kshm_src_pid; + lnet_nid4_to_nid(hello4->kshm_dst_nid, &hello->kshm_dst_nid); + lnet_nid4_to_nid(hello4->kshm_src_nid, &hello->kshm_src_nid); + } + + if (hello->kshm_nips > LNET_INTERFACES_NUM) { + CERROR("Bad nips %d from ip %pIS\n", + hello->kshm_nips, &conn->ksnc_peeraddr); + return -EPROTO; + } + + if (hello->kshm_nips == 0) + return 0; + + rc = lnet_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR("Error %d reading IPs from ip %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + if (conn->ksnc_flip) + __swab32s(&hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %pIS\n", + i, &conn->ksnc_peeraddr); + return -EPROTO; + } + } + + return 0; +} + +static int +ksocknal_recv_hello_v4(struct ksock_conn *conn, struct ksock_hello_msg *hello, + int timeout) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + conn->ksnc_flip = 0; + else + conn->ksnc_flip = 1; + + rc = lnet_sock_read(sock, &hello->kshm_src_nid, + sizeof(*hello) - + offsetof(struct ksock_hello_msg, kshm_src_nid), + timeout); + if (rc) { + CERROR("Error %d reading HELLO from %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + if (conn->ksnc_flip) { + __swab32s(&hello->kshm_src_pid); + __swab32s(&hello->kshm_dst_pid); + __swab64s(&hello->kshm_src_incarnation); + __swab64s(&hello->kshm_dst_incarnation); + __swab32s(&hello->kshm_ctype); + } + + return 0; +} + +static void +ksocknal_pack_msg_v1(struct ksock_tx *tx) +{ + /* V1.x has no KSOCK_MSG_NOOP */ + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_lnetmsg != NULL); + + lnet_hdr_to_nid4(&tx->tx_lnetmsg->msg_hdr, + &tx->tx_msg.ksm_u.lnetmsg_nid4); + tx->tx_hdr.iov_base = (void *)&tx->tx_msg.ksm_u.lnetmsg_nid4; + tx->tx_hdr.iov_len = sizeof(struct lnet_hdr_nid4); + + tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(struct lnet_hdr_nid4); + tx->tx_resid = tx->tx_nob; +} + +static void 
+ksocknal_pack_msg_v2(struct ksock_tx *tx) +{ + int hdr_size; + + tx->tx_hdr.iov_base = (void *)&tx->tx_msg; + + switch (tx->tx_msg.ksm_type) { + case KSOCK_MSG_LNET: + LASSERT(tx->tx_lnetmsg != NULL); + hdr_size = (sizeof(struct ksock_msg_hdr) + + sizeof(struct lnet_hdr_nid4)); + + lnet_hdr_to_nid4(&tx->tx_lnetmsg->msg_hdr, + &tx->tx_msg.ksm_u.lnetmsg_nid4); + tx->tx_hdr.iov_len = hdr_size; + tx->tx_resid = tx->tx_nob = hdr_size + tx->tx_lnetmsg->msg_len; + break; + case KSOCK_MSG_NOOP: + LASSERT(tx->tx_lnetmsg == NULL); + hdr_size = sizeof(struct ksock_msg_hdr); + + tx->tx_hdr.iov_len = hdr_size; + tx->tx_resid = tx->tx_nob = hdr_size; + break; + default: + LASSERT(0); + } + /* Don't checksum before start sending, because packet can be + * piggybacked with ACK + */ +} + +static void +ksocknal_pack_msg_v4(struct ksock_tx *tx) +{ + int hdr_size; + + tx->tx_hdr.iov_base = (void *)&tx->tx_msg; + + switch (tx->tx_msg.ksm_type) { + case KSOCK_MSG_LNET: + LASSERT(tx->tx_lnetmsg != NULL); + hdr_size = (sizeof(struct ksock_msg_hdr) + + sizeof(struct lnet_hdr_nid16)); + + lnet_hdr_to_nid16(&tx->tx_lnetmsg->msg_hdr, + &tx->tx_msg.ksm_u.lnetmsg_nid16); + tx->tx_hdr.iov_len = hdr_size; + tx->tx_resid = tx->tx_nob = hdr_size + tx->tx_lnetmsg->msg_len; + break; + case KSOCK_MSG_NOOP: + LASSERT(tx->tx_lnetmsg == NULL); + hdr_size = sizeof(struct ksock_msg_hdr); + + tx->tx_hdr.iov_len = hdr_size; + tx->tx_resid = tx->tx_nob = hdr_size; + break; + default: + LASSERT(0); + } + /* Don't checksum before start sending, because packet can be + * piggybacked with ACK + */ +} + +static void +ksocknal_unpack_msg_v1(struct ksock_msg *msg, struct lnet_hdr *hdr) +{ + msg->ksm_csum = 0; + msg->ksm_type = KSOCK_MSG_LNET; + msg->ksm_zc_cookies[0] = msg->ksm_zc_cookies[1] = 0; + lnet_hdr_from_nid4(hdr, &msg->ksm_u.lnetmsg_nid4); +} + +static void +ksocknal_unpack_msg_v2(struct ksock_msg *msg, struct lnet_hdr *hdr) +{ + lnet_hdr_from_nid4(hdr, &msg->ksm_u.lnetmsg_nid4); +} + +static void +ksocknal_unpack_msg_v4(struct ksock_msg *msg, struct lnet_hdr *hdr) +{ + lnet_hdr_from_nid16(hdr, &msg->ksm_u.lnetmsg_nid16); +} + +const struct ksock_proto ksocknal_protocol_v1x = +{ + .pro_version = KSOCK_PROTO_V1, + .pro_send_hello = ksocknal_send_hello_v1, + .pro_recv_hello = ksocknal_recv_hello_v1, + .pro_pack = ksocknal_pack_msg_v1, + .pro_unpack = ksocknal_unpack_msg_v1, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v1, + .pro_handle_zcreq = NULL, + .pro_handle_zcack = NULL, + .pro_queue_tx_zcack = NULL, + .pro_match_tx = ksocknal_match_tx +}; + +const struct ksock_proto ksocknal_protocol_v2x = +{ + .pro_version = KSOCK_PROTO_V2, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v2, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx +}; + +const struct ksock_proto ksocknal_protocol_v3x = +{ + .pro_version = KSOCK_PROTO_V3, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx_v3 +}; + +const struct ksock_proto 
ksocknal_protocol_v4x = { + .pro_version = KSOCK_PROTO_V4, + .pro_send_hello = ksocknal_send_hello_v4, + .pro_recv_hello = ksocknal_recv_hello_v4, + .pro_pack = ksocknal_pack_msg_v4, + .pro_unpack = ksocknal_unpack_msg_v4, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx_v4, +}; diff --git a/drivers/staging/lustrefsx/lnet/lnet/Makefile b/drivers/staging/lustrefsx/lnet/lnet/Makefile new file mode 100644 index 0000000000000..95813fbdafda6 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += lnet.o + +lnet-y := api-ni.o config.o nidstrings.o lnet_rdma.o +lnet-y += lib-me.o lib-msg.o lib-md.o lib-ptl.o +lnet-y += lib-socket.o lib-move.o module.o lo.o +lnet-y += router.o router_proc.o acceptor.o peer.o net_fault.o udsp.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lnet/lnet/acceptor.c b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c new file mode 100644 index 0000000000000..51ffd29da7c1b --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c @@ -0,0 +1,570 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include + +static int accept_port = 988; +static int accept_backlog = 127; +static int accept_timeout = 5; + +static struct { + int pta_shutdown; + struct socket *pta_sock; + struct completion pta_signal; + struct net *pta_ns; + wait_queue_head_t pta_waitq; + atomic_t pta_ready; +#ifdef HAVE_SK_DATA_READY_ONE_ARG + void (*pta_odata)(struct sock *); +#else + void (*pta_odata)(struct sock *, int); +#endif +} lnet_acceptor_state = { + .pta_shutdown = 1 +}; + +int +lnet_acceptor_port(void) +{ + return accept_port; +} + +static inline int +lnet_accept_magic(__u32 magic, __u32 constant) +{ + return (magic == constant || + magic == __swab32(constant)); +} + +EXPORT_SYMBOL(lnet_acceptor_port); + +static char *accept_type = "secure"; + +module_param_named(accept, accept_type, charp, 0444); +MODULE_PARM_DESC(accept, "Accept connections (secure|all|none)"); +module_param(accept_port, int, 0444); +MODULE_PARM_DESC(accept_port, "Acceptor's port (same on all nodes)"); +module_param(accept_backlog, int, 0444); +MODULE_PARM_DESC(accept_backlog, "Acceptor's listen backlog"); +module_param(accept_timeout, int, 0644); +MODULE_PARM_DESC(accept_timeout, "Acceptor's timeout (seconds)"); + +int +lnet_acceptor_timeout(void) +{ + return accept_timeout; +} +EXPORT_SYMBOL(lnet_acceptor_timeout); + +void +lnet_connect_console_error(int rc, struct lnet_nid *peer_nid, + struct sockaddr *sa) +{ + switch (rc) { + /* "normal" errors */ + case -ECONNREFUSED: + CNETERR("Connection to %s at host %pISp was refused: check that Lustre is running on that node.\n", + libcfs_nidstr(peer_nid), sa); + break; + case -EHOSTUNREACH: + case -ENETUNREACH: + CNETERR("Connection to %s at host %pIS was unreachable: the network or that node may be down, or Lustre may be misconfigured.\n", + libcfs_nidstr(peer_nid), sa); + break; + case -ETIMEDOUT: + CNETERR("Connection to %s at host %pISp took too long: that node may be hung or experiencing high load.\n", + libcfs_nidstr(peer_nid), sa); + break; + case -ECONNRESET: + LCONSOLE_ERROR_MSG(0x11b, + "Connection to %s at host %pISp was reset: is it running a compatible version of Lustre and is %s one of its NIDs?\n", + libcfs_nidstr(peer_nid), sa, + libcfs_nidstr(peer_nid)); + break; + case -EPROTO: + LCONSOLE_ERROR_MSG(0x11c, + "Protocol error connecting to %s at host %pISp: is it running a compatible version of Lustre?\n", + libcfs_nidstr(peer_nid), sa); + break; + case -EADDRINUSE: + LCONSOLE_ERROR_MSG(0x11d, + "No privileged ports available to connect to %s at host %pISp\n", + libcfs_nidstr(peer_nid), sa); + break; + default: + LCONSOLE_ERROR_MSG(0x11e, + "Unexpected error %d connecting to %s at host %pISp\n", + rc, libcfs_nidstr(peer_nid), sa); + break; + } +} +EXPORT_SYMBOL(lnet_connect_console_error); + +struct socket * +lnet_connect(struct lnet_nid *peer_nid, int interface, + struct sockaddr *peeraddr, + struct net *ns) +{ + struct lnet_acceptor_connreq cr1; + struct lnet_acceptor_connreq_v2 cr2; + void *cr; + int crsize; + struct socket *sock; + int rc; + int port; + + BUILD_BUG_ON(sizeof(cr) > 16); /* not too big to be on the stack */ + + LASSERT(peeraddr->sa_family == AF_INET || + peeraddr->sa_family == AF_INET6); + + for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT; + port >= LNET_ACCEPTOR_MIN_RESERVED_PORT; + --port) { + /* Iterate through reserved ports. 
*/ + sock = lnet_sock_connect(interface, port, peeraddr, ns); + if (IS_ERR(sock)) { + rc = PTR_ERR(sock); + if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) + continue; + goto failed; + } + + BUILD_BUG_ON(LNET_PROTO_ACCEPTOR_VERSION != 1); + + if (nid_is_nid4(peer_nid)) { + cr1.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr1.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + cr1.acr_nid = lnet_nid_to_nid4(peer_nid); + cr = &cr1; + crsize = sizeof(cr1); + + if (the_lnet.ln_testprotocompat) { + /* single-shot proto check */ + if (test_and_clear_bit( + 2, &the_lnet.ln_testprotocompat)) + cr1.acr_version++; + if (test_and_clear_bit( + 3, &the_lnet.ln_testprotocompat)) + cr1.acr_magic = LNET_PROTO_MAGIC; + } + + } else { + cr2.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr2.acr_version = LNET_PROTO_ACCEPTOR_VERSION_16; + cr2.acr_nid = *peer_nid; + cr = &cr2; + crsize = sizeof(cr2); + } + + rc = lnet_sock_write(sock, cr, crsize, accept_timeout); + if (rc != 0) + goto failed_sock; + + return sock; + } + + rc = -EADDRINUSE; + goto failed; + +failed_sock: + sock_release(sock); +failed: + lnet_connect_console_error(rc, peer_nid, peeraddr); + return ERR_PTR(rc); +} +EXPORT_SYMBOL(lnet_connect); + +static int +lnet_accept(struct socket *sock, __u32 magic) +{ + struct lnet_acceptor_connreq cr; + struct lnet_acceptor_connreq_v2 cr2; + struct lnet_nid nid; + struct sockaddr_storage peer; + int peer_version; + int rc; + int flip; + struct lnet_ni *ni; + char *str; + + LASSERT(sizeof(cr) <= 16); /* not too big for the stack */ + + rc = lnet_sock_getaddr(sock, true, &peer); + if (rc != 0) { + CERROR("Can't determine new connection's address\n"); + return rc; + } + + if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) { + + if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) { + /* future version compatibility! + * When LNET unifies protocols over all LNDs, the first + * thing sent will be a version query. I send back + * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */ + + memset(&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + rc = lnet_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response to LNET magic from %pIS: %d\n", + &peer, rc); + return -EPROTO; + } + + if (lnet_accept_magic(magic, LNET_PROTO_TCP_MAGIC)) + str = "'old' socknal/tcpnal"; + else + str = "unrecognised"; + + LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %pIS" + " magic %08x: %s acceptor protocol\n", + &peer, magic, str); + return -EPROTO; + } + + flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC); + + rc = lnet_sock_read(sock, &cr.acr_version, + sizeof(cr.acr_version), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request version from %pIS\n", + rc, &peer); + return -EIO; + } + + if (flip) + __swab32s(&cr.acr_version); + + switch (cr.acr_version) { + default: + /* future version compatibility! + * An acceptor-specific protocol rev will first send a version + * query. I send back my current version to tell her I'm + * "old". 
*/ + peer_version = cr.acr_version; + + memset(&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + + rc = lnet_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response to version %d from %pIS: %d\n", + peer_version, &peer, rc); + return -EPROTO; + + case LNET_PROTO_ACCEPTOR_VERSION: + + rc = lnet_sock_read(sock, &cr.acr_nid, + sizeof(cr) - + offsetof(struct lnet_acceptor_connreq, + acr_nid), + accept_timeout); + if (rc) + break; + if (flip) + __swab64s(&cr.acr_nid); + + lnet_nid4_to_nid(cr.acr_nid, &nid); + break; + + case LNET_PROTO_ACCEPTOR_VERSION_16: + rc = lnet_sock_read(sock, &cr2.acr_nid, + sizeof(cr2) - + offsetof(struct lnet_acceptor_connreq_v2, + acr_nid), + accept_timeout); + if (rc) + break; + nid = cr2.acr_nid; + break; + } + if (rc != 0) { + CERROR("Error %d reading connection request from %pIS\n", + rc, &peer); + return -EIO; + } + + ni = lnet_nid_to_ni_addref(&nid); + if (ni == NULL || /* no matching net */ + !nid_same(&ni->ni_nid, &nid)) { + /* right NET, wrong NID! */ + if (ni != NULL) + lnet_ni_decref(ni); + LCONSOLE_ERROR_MSG(0x120, + "Refusing connection from %pIS for %s: No matching NI\n", + &peer, libcfs_nidstr(&nid)); + return -EPERM; + } + + if (ni->ni_net->net_lnd->lnd_accept == NULL) { + /* This catches a request for the loopback LND */ + lnet_ni_decref(ni); + LCONSOLE_ERROR_MSG(0x121, + "Refusing connection from %pIS for %s: NI does not accept IP connections\n", + &peer, libcfs_nidstr(&nid)); + return -EPERM; + } + + CDEBUG(D_NET, "Accept %s from %pI4h\n", libcfs_nidstr(&nid), &peer); + + rc = ni->ni_net->net_lnd->lnd_accept(ni, sock); + + lnet_ni_decref(ni); + return rc; +} + +#ifdef HAVE_SK_DATA_READY_ONE_ARG +static void lnet_acceptor_ready(struct sock *sk) +#else +static void lnet_acceptor_ready(struct sock *sk, int len) +#endif +{ + /* Ensure pta_odata has actually been set before calling it */ + rmb(); +#ifdef HAVE_SK_DATA_READY_ONE_ARG + lnet_acceptor_state.pta_odata(sk); +#else + lnet_acceptor_state.pta_odata(sk, 0); +#endif + + atomic_set(&lnet_acceptor_state.pta_ready, 1); + wake_up(&lnet_acceptor_state.pta_waitq); +} + +static int +lnet_acceptor(void *arg) +{ + struct socket *newsock; + int rc; + __u32 magic; + struct sockaddr_storage peer; + int secure = (int)((uintptr_t)arg); + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + lnet_acceptor_state.pta_sock = + lnet_sock_listen(accept_port, accept_backlog, + lnet_acceptor_state.pta_ns); + if (IS_ERR(lnet_acceptor_state.pta_sock)) { + rc = PTR_ERR(lnet_acceptor_state.pta_sock); + if (rc == -EADDRINUSE) + LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port" + " %d: port already in use\n", + accept_port); + else + LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port " + "%d: unexpected error %d\n", + accept_port, rc); + + lnet_acceptor_state.pta_sock = NULL; + } else { + rc = 0; + LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port); + init_waitqueue_head(&lnet_acceptor_state.pta_waitq); + lnet_acceptor_state.pta_odata = + lnet_acceptor_state.pta_sock->sk->sk_data_ready; + /* ensure pta_odata gets set before there is any chance of + * lnet_acceptor_ready() trying to read it.
+ */ + wmb(); + lnet_acceptor_state.pta_sock->sk->sk_data_ready = + lnet_acceptor_ready; + atomic_set(&lnet_acceptor_state.pta_ready, 1); + } + + /* set init status and unblock parent */ + lnet_acceptor_state.pta_shutdown = rc; + complete(&lnet_acceptor_state.pta_signal); + + if (rc != 0) + return rc; + + while (!lnet_acceptor_state.pta_shutdown) { + + wait_event_idle(lnet_acceptor_state.pta_waitq, + lnet_acceptor_state.pta_shutdown || + atomic_read(&lnet_acceptor_state.pta_ready)); + if (!atomic_read(&lnet_acceptor_state.pta_ready)) + continue; + atomic_set(&lnet_acceptor_state.pta_ready, 0); + rc = kernel_accept(lnet_acceptor_state.pta_sock, &newsock, + SOCK_NONBLOCK); + if (rc != 0) { + if (rc != -EAGAIN) { + CWARN("Accept error %d: pausing...\n", rc); + schedule_timeout_uninterruptible( + cfs_time_seconds(1)); + } + continue; + } + + /* make sure we call lnet_sock_accept() again, until it fails */ + atomic_set(&lnet_acceptor_state.pta_ready, 1); + + rc = lnet_sock_getaddr(newsock, true, &peer); + if (rc != 0) { + CERROR("Can't determine new connection's address\n"); + goto failed; + } + + if (secure && + rpc_get_port((struct sockaddr *)&peer) > + LNET_ACCEPTOR_MAX_RESERVED_PORT) { + CERROR("Refusing connection from %pISp: insecure port.\n", + &peer); + goto failed; + } + + rc = lnet_sock_read(newsock, &magic, sizeof(magic), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request from %pIS\n", + rc, &peer); + goto failed; + } + + rc = lnet_accept(newsock, magic); + if (rc != 0) + goto failed; + + continue; + +failed: + sock_release(newsock); + } + + lnet_acceptor_state.pta_sock->sk->sk_data_ready = + lnet_acceptor_state.pta_odata; + sock_release(lnet_acceptor_state.pta_sock); + lnet_acceptor_state.pta_sock = NULL; + + CDEBUG(D_NET, "Acceptor stopping\n"); + + /* unblock lnet_acceptor_stop() */ + complete(&lnet_acceptor_state.pta_signal); + return 0; +} + +static inline int +accept2secure(const char *acc, long *sec) +{ + if (!strcmp(acc, "secure")) { + *sec = 1; + return 1; + } else if (!strcmp(acc, "all")) { + *sec = 0; + return 1; + } else if (!strcmp(acc, "none")) { + return 0; + } else { + LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n", + acc); + return -EINVAL; + } +} + +int +lnet_acceptor_start(void) +{ + struct task_struct *task; + int rc; + long rc2; + long secure; + + /* if acceptor is already running return immediately */ + if (!lnet_acceptor_state.pta_shutdown) + return 0; + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + init_completion(&lnet_acceptor_state.pta_signal); + rc = accept2secure(accept_type, &secure); + if (rc <= 0) + return rc; + + if (lnet_count_acceptor_nets() == 0) /* not required */ + return 0; + if (current->nsproxy && current->nsproxy->net_ns) + lnet_acceptor_state.pta_ns = current->nsproxy->net_ns; + else + lnet_acceptor_state.pta_ns = &init_net; + task = kthread_run(lnet_acceptor, (void *)(uintptr_t)secure, + "acceptor_%03ld", secure); + if (IS_ERR(task)) { + rc2 = PTR_ERR(task); + CERROR("Can't start acceptor thread: %ld\n", rc2); + return -ESRCH; + } + + /* wait for acceptor to startup */ + wait_for_completion(&lnet_acceptor_state.pta_signal); + + if (!lnet_acceptor_state.pta_shutdown) { + /* started OK */ + LASSERT(lnet_acceptor_state.pta_sock != NULL); + return 0; + } + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + return -ENETDOWN; +} + +void +lnet_acceptor_stop(void) +{ + if (lnet_acceptor_state.pta_shutdown) /* not running */ + return; + + /* If still required, return immediately */ + if 
(the_lnet.ln_refcount && lnet_count_acceptor_nets() > 0) + return; + + lnet_acceptor_state.pta_shutdown = 1; + wake_up(&lnet_acceptor_state.pta_waitq); + + /* block until acceptor signals exit */ + wait_for_completion(&lnet_acceptor_state.pta_signal); +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/api-ni.c b/drivers/staging/lustrefsx/lnet/lnet/api-ni.c new file mode 100644 index 0000000000000..b99c85b73d0e0 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/api-ni.c @@ -0,0 +1,4884 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include +#ifdef HAVE_SCHED_HEADERS +#include +#endif +#include +#include + +#define D_LNI D_CONSOLE + +/* + * initialize ln_api_mutex statically, since it needs to be used in + * discovery_set callback. That module parameter callback can be called + * before module init completes. The mutex needs to be ready for use then. + */ +struct lnet the_lnet = { + .ln_api_mutex = __MUTEX_INITIALIZER(the_lnet.ln_api_mutex), +}; /* THE state of the network */ +EXPORT_SYMBOL(the_lnet); + +static char *ip2nets = ""; +module_param(ip2nets, charp, 0444); +MODULE_PARM_DESC(ip2nets, "LNET network <- IP table"); + +static char *networks = ""; +module_param(networks, charp, 0444); +MODULE_PARM_DESC(networks, "local networks"); + +static char *routes = ""; +module_param(routes, charp, 0444); +MODULE_PARM_DESC(routes, "routes to non-local networks"); + +static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; +module_param(rnet_htable_size, int, 0444); +MODULE_PARM_DESC(rnet_htable_size, "size of remote network hash table"); + +static int use_tcp_bonding; +module_param(use_tcp_bonding, int, 0444); +MODULE_PARM_DESC(use_tcp_bonding, + "use_tcp_bonding parameter has been removed"); + +unsigned int lnet_numa_range = 0; +module_param(lnet_numa_range, uint, 0444); +MODULE_PARM_DESC(lnet_numa_range, + "NUMA range to consider during Multi-Rail selection"); + +/* + * lnet_health_sensitivity determines by how much we decrement the health + * value on sending error. The value defaults to 100, which means the + * interface health is decremented by 100 points on every failure.
+ */ +unsigned int lnet_health_sensitivity = 100; +static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_health_sensitivity = { + .set = sensitivity_set, + .get = param_get_int, +}; +#define param_check_health_sensitivity(name, p) \ + __param_check(name, p, int) +module_param(lnet_health_sensitivity, health_sensitivity, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_health_sensitivity, sensitivity_set, param_get_int, + &lnet_health_sensitivity, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_health_sensitivity, + "Value to decrement the health value by on error"); + +/* + * lnet_recovery_interval determines how often we should perform recovery + * on unhealthy interfaces. + */ +unsigned int lnet_recovery_interval = 1; +static int recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_recovery_interval = { + .set = recovery_interval_set, + .get = param_get_int, +}; +#define param_check_recovery_interval(name, p) \ + __param_check(name, p, int) +module_param(lnet_recovery_interval, recovery_interval, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_recovery_interval, recovery_interval_set, param_get_int, + &lnet_recovery_interval, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_recovery_interval, + "DEPRECATED - Interval to recover unhealthy interfaces in seconds"); + +unsigned int lnet_recovery_limit; +module_param(lnet_recovery_limit, uint, 0644); +MODULE_PARM_DESC(lnet_recovery_limit, + "How long to attempt recovery of unhealthy peer interfaces in seconds. Set to 0 to allow indefinite recovery"); + +static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT; +static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp); + +static struct kernel_param_ops param_ops_interfaces_max = { + .set = intf_max_set, + .get = param_get_int, +}; + +#define param_check_interfaces_max(name, p) \ + __param_check(name, p, int) + +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(lnet_interfaces_max, interfaces_max, 0644); +#else +module_param_call(lnet_interfaces_max, intf_max_set, param_get_int, + ¶m_ops_interfaces_max, 0644); +#endif +MODULE_PARM_DESC(lnet_interfaces_max, + "Maximum number of interfaces in a node."); + +unsigned lnet_peer_discovery_disabled = 0; +static int discovery_set(const char *val, cfs_kernel_param_arg_t *kp); + +static struct kernel_param_ops param_ops_discovery_disabled = { + .set = discovery_set, + .get = param_get_int, +}; + +#define param_check_discovery_disabled(name, p) \ + __param_check(name, p, int) +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(lnet_peer_discovery_disabled, discovery_disabled, 0644); +#else +module_param_call(lnet_peer_discovery_disabled, discovery_set, param_get_int, + ¶m_ops_discovery_disabled, 0644); +#endif +MODULE_PARM_DESC(lnet_peer_discovery_disabled, + "Set to 1 to disable peer discovery on this node."); + +unsigned int lnet_drop_asym_route; +static int drop_asym_route_set(const char *val, cfs_kernel_param_arg_t *kp); + +static struct kernel_param_ops param_ops_drop_asym_route = { + .set = drop_asym_route_set, + .get = param_get_int, +}; + +#define param_check_drop_asym_route(name, p) \ + __param_check(name, p, int) +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(lnet_drop_asym_route, drop_asym_route, 0644); +#else +module_param_call(lnet_drop_asym_route, drop_asym_route_set, param_get_int, + ¶m_ops_drop_asym_route, 0644); +#endif +MODULE_PARM_DESC(lnet_drop_asym_route, + 
"Set to 1 to drop asymmetrical route messages."); + +#define LNET_TRANSACTION_TIMEOUT_DEFAULT 50 +unsigned int lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_DEFAULT; +static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_transaction_timeout = { + .set = transaction_to_set, + .get = param_get_int, +}; + +#define param_check_transaction_timeout(name, p) \ + __param_check(name, p, int) +module_param(lnet_transaction_timeout, transaction_timeout, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int, + &lnet_transaction_timeout, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_transaction_timeout, + "Maximum number of seconds to wait for a peer response."); + +#define LNET_RETRY_COUNT_DEFAULT 2 +unsigned int lnet_retry_count = LNET_RETRY_COUNT_DEFAULT; +static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_retry_count = { + .set = retry_count_set, + .get = param_get_int, +}; + +#define param_check_retry_count(name, p) \ + __param_check(name, p, int) +module_param(lnet_retry_count, retry_count, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_retry_count, retry_count_set, param_get_int, + &lnet_retry_count, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_retry_count, + "Maximum number of times to retry transmitting a message"); + +unsigned int lnet_response_tracking = 3; +static int response_tracking_set(const char *val, cfs_kernel_param_arg_t *kp); + +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_response_tracking = { + .set = response_tracking_set, + .get = param_get_int, +}; + +#define param_check_response_tracking(name, p) \ + __param_check(name, p, int) +module_param(lnet_response_tracking, response_tracking, 0644); +#else +module_param_call(lnet_response_tracking, response_tracking_set, param_get_int, + &lnet_response_tracking, 0644); +#endif +MODULE_PARM_DESC(lnet_response_tracking, + "(0|1|2|3) LNet Internal Only|GET Reply only|PUT ACK only|Full Tracking (default)"); + +#define LNET_LND_TIMEOUT_DEFAULT ((LNET_TRANSACTION_TIMEOUT_DEFAULT - 1) / \ + (LNET_RETRY_COUNT_DEFAULT + 1)) +unsigned int lnet_lnd_timeout = LNET_LND_TIMEOUT_DEFAULT; +static void lnet_set_lnd_timeout(void) +{ + lnet_lnd_timeout = (lnet_transaction_timeout - 1) / + (lnet_retry_count + 1); +} + +/* + * This sequence number keeps track of how many times DLC was used to + * update the local NIs. It is incremented when a NI is added or + * removed and checked when sending a message to determine if there is + * a need to re-run the selection algorithm. See lnet_select_pathway() + * for more details on its usage. + */ +static atomic_t lnet_dlc_seq_no = ATOMIC_INIT(0); + +static int lnet_ping(struct lnet_process_id id, struct lnet_nid *src_nid, + signed long timeout, struct lnet_process_id __user *ids, + int n_ids); + +static int lnet_discover(struct lnet_process_id id, __u32 force, + struct lnet_process_id __user *ids, int n_ids); + +static int +sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *sensitivity = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_health_sensitivity'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. 
+ */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value > LNET_MAX_HEALTH_VALUE) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid health value. Maximum: %d value = %lu\n", + LNET_MAX_HEALTH_VALUE, value); + return -EINVAL; + } + + if (*sensitivity != 0 && value == 0 && lnet_retry_count != 0) { + lnet_retry_count = 0; + lnet_set_lnd_timeout(); + } + + *sensitivity = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + CWARN("'lnet_recovery_interval' has been deprecated\n"); + + return 0; +} + +static int +discovery_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *discovery_off = (unsigned *)kp->arg; + unsigned long value; + struct lnet_ping_buffer *pbuf; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_peer_discovery_disabled'\n"); + return rc; + } + + value = (value) ? 1 : 0; + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value == *discovery_off) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + /* + * We still want to set the discovery value even when LNet is not + * running. This is the case when LNet is being loaded and we want + * the module parameters to take effect. Otherwise if we're + * changing the value dynamically, we want to set it after + * updating the peers + */ + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + *discovery_off = value; + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + /* tell peers that discovery setting has changed */ + lnet_net_lock(LNET_LOCK_EX); + pbuf = the_lnet.ln_ping_target; + if (value) + pbuf->pb_info.pi_features &= ~LNET_PING_FEAT_DISCOVERY; + else + pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY; + lnet_net_unlock(LNET_LOCK_EX); + + /* only send a push when we're turning off discovery */ + if (*discovery_off <= 0 && value > 0) + lnet_push_update_to_peers(1); + *discovery_off = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +drop_asym_route_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned int *drop_asym_route = (unsigned int *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for " + "'lnet_drop_asym_route'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value == *drop_asym_route) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + *drop_asym_route = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *transaction_to = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_transaction_timeout'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value <= lnet_retry_count || value == 0) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid value for lnet_transaction_timeout (%lu). 
" + "Has to be greater than lnet_retry_count (%u)\n", + value, lnet_retry_count); + return -EINVAL; + } + + if (value == *transaction_to) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + *transaction_to = value; + /* Update the lnet_lnd_timeout now that we've modified the + * transaction timeout + */ + lnet_set_lnd_timeout(); + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +retry_count_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *retry_count = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_retry_count'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (lnet_health_sensitivity == 0 && value > 0) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Can not set lnet_retry_count when health feature is turned off\n"); + return -EINVAL; + } + + if (value > lnet_transaction_timeout) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid value for lnet_retry_count (%lu). " + "Has to be smaller than lnet_transaction_timeout (%u)\n", + value, lnet_transaction_timeout); + return -EINVAL; + } + + *retry_count = value; + + /* Update the lnet_lnd_timeout now that we've modified the + * retry count + */ + lnet_set_lnd_timeout(); + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +intf_max_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int value, rc; + + rc = kstrtoint(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_interfaces_max'\n"); + return rc; + } + + if (value < LNET_INTERFACES_MIN) { + CWARN("max interfaces provided are too small, setting to %d\n", + LNET_INTERFACES_MAX_DEFAULT); + value = LNET_INTERFACES_MAX_DEFAULT; + } + + *(int *)kp->arg = value; + + return 0; +} + +static int +response_tracking_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned long new_value; + + rc = kstrtoul(val, 0, &new_value); + if (rc) { + CERROR("Invalid value for 'lnet_response_tracking'\n"); + return -EINVAL; + } + + if (new_value < 0 || new_value > 3) { + CWARN("Invalid value (%lu) for 'lnet_response_tracking'\n", + new_value); + return -EINVAL; + } + + lnet_response_tracking = new_value; + + return 0; +} + +static const char * +lnet_get_routes(void) +{ + return routes; +} + +static const char * +lnet_get_networks(void) +{ + const char *nets; + int rc; + + if (*networks != 0 && *ip2nets != 0) { + LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or " + "'ip2nets' but not both at once\n"); + return NULL; + } + + if (*ip2nets != 0) { + rc = lnet_parse_ip2nets(&nets, ip2nets); + return (rc == 0) ? 
nets : NULL; + } + + if (*networks != 0) + return networks; + + return "tcp"; +} + +static void +lnet_init_locks(void) +{ + spin_lock_init(&the_lnet.ln_eq_wait_lock); + spin_lock_init(&the_lnet.ln_msg_resend_lock); + init_completion(&the_lnet.ln_mt_wait_complete); + mutex_init(&the_lnet.ln_lnd_mutex); +} + +struct kmem_cache *lnet_mes_cachep; /* MEs kmem_cache */ +struct kmem_cache *lnet_small_mds_cachep; /* <= LNET_SMALL_MD_SIZE bytes + * MDs kmem_cache */ +struct kmem_cache *lnet_udsp_cachep; /* udsp cache */ +struct kmem_cache *lnet_rspt_cachep; /* response tracker cache */ +struct kmem_cache *lnet_msg_cachep; + +static int +lnet_slab_setup(void) +{ + /* create specific kmem_cache for MEs and small MDs (i.e., originally + * allocated in kmem_cache). + */ + lnet_mes_cachep = kmem_cache_create("lnet_MEs", sizeof(struct lnet_me), + 0, 0, NULL); + if (!lnet_mes_cachep) + return -ENOMEM; + + lnet_small_mds_cachep = kmem_cache_create("lnet_small_MDs", + LNET_SMALL_MD_SIZE, 0, 0, + NULL); + if (!lnet_small_mds_cachep) + return -ENOMEM; + + lnet_udsp_cachep = kmem_cache_create("lnet_udsp", + sizeof(struct lnet_udsp), + 0, 0, NULL); + if (!lnet_udsp_cachep) + return -ENOMEM; + + lnet_rspt_cachep = kmem_cache_create("lnet_rspt", sizeof(struct lnet_rsp_tracker), + 0, 0, NULL); + if (!lnet_rspt_cachep) + return -ENOMEM; + + lnet_msg_cachep = kmem_cache_create("lnet_msg", sizeof(struct lnet_msg), + 0, 0, NULL); + if (!lnet_msg_cachep) + return -ENOMEM; + + return 0; +} + +static void +lnet_slab_cleanup(void) +{ + if (lnet_msg_cachep) { + kmem_cache_destroy(lnet_msg_cachep); + lnet_msg_cachep = NULL; + } + + if (lnet_rspt_cachep) { + kmem_cache_destroy(lnet_rspt_cachep); + lnet_rspt_cachep = NULL; + } + + if (lnet_udsp_cachep) { + kmem_cache_destroy(lnet_udsp_cachep); + lnet_udsp_cachep = NULL; + } + + if (lnet_small_mds_cachep) { + kmem_cache_destroy(lnet_small_mds_cachep); + lnet_small_mds_cachep = NULL; + } + + if (lnet_mes_cachep) { + kmem_cache_destroy(lnet_mes_cachep); + lnet_mes_cachep = NULL; + } +} + +static int +lnet_create_remote_nets_table(void) +{ + int i; + struct list_head *hash; + + LASSERT(the_lnet.ln_remote_nets_hash == NULL); + LASSERT(the_lnet.ln_remote_nets_hbits > 0); + CFS_ALLOC_PTR_ARRAY(hash, LNET_REMOTE_NETS_HASH_SIZE); + if (hash == NULL) { + CERROR("Failed to create remote nets hash table\n"); + return -ENOMEM; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + INIT_LIST_HEAD(&hash[i]); + the_lnet.ln_remote_nets_hash = hash; + return 0; +} + +static void +lnet_destroy_remote_nets_table(void) +{ + int i; + + if (the_lnet.ln_remote_nets_hash == NULL) + return; + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i])); + + CFS_FREE_PTR_ARRAY(the_lnet.ln_remote_nets_hash, + LNET_REMOTE_NETS_HASH_SIZE); + the_lnet.ln_remote_nets_hash = NULL; +} + +static void +lnet_destroy_locks(void) +{ + if (the_lnet.ln_res_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_res_lock); + the_lnet.ln_res_lock = NULL; + } + + if (the_lnet.ln_net_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_net_lock); + the_lnet.ln_net_lock = NULL; + } +} + +static int +lnet_create_locks(void) +{ + lnet_init_locks(); + + the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_res_lock == NULL) + goto failed; + + the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_net_lock == NULL) + goto failed; + + return 0; + + failed: + lnet_destroy_locks(); + return -ENOMEM; +} + +static void 
lnet_assert_wire_constants(void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * running on Linux robert.bartonsoftware.com 2.6.8-1.521 + * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux + * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) + */ + + /* Constants... */ + BUILD_BUG_ON(LNET_PROTO_TCP_MAGIC != 0xeebc0ded); + BUILD_BUG_ON(LNET_PROTO_TCP_VERSION_MAJOR != 1); + BUILD_BUG_ON(LNET_PROTO_TCP_VERSION_MINOR != 0); + BUILD_BUG_ON(LNET_MSG_ACK != 0); + BUILD_BUG_ON(LNET_MSG_PUT != 1); + BUILD_BUG_ON(LNET_MSG_GET != 2); + BUILD_BUG_ON(LNET_MSG_REPLY != 3); + BUILD_BUG_ON(LNET_MSG_HELLO != 4); + + BUILD_BUG_ON((int)sizeof(lnet_nid_t) != 8); + BUILD_BUG_ON((int)sizeof(lnet_pid_t) != 4); + + /* Checks for struct lnet_nid */ + BUILD_BUG_ON((int)sizeof(struct lnet_nid) != 20); + BUILD_BUG_ON((int)offsetof(struct lnet_nid, nid_size) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_nid *)0)->nid_size) != 1); + BUILD_BUG_ON((int)offsetof(struct lnet_nid, nid_type) != 1); + BUILD_BUG_ON((int)sizeof(((struct lnet_nid *)0)->nid_type) != 1); + BUILD_BUG_ON((int)offsetof(struct lnet_nid, nid_num) != 2); + BUILD_BUG_ON((int)sizeof(((struct lnet_nid *)0)->nid_num) != 2); + BUILD_BUG_ON((int)offsetof(struct lnet_nid, nid_addr) != 4); + BUILD_BUG_ON((int)sizeof(((struct lnet_nid *)0)->nid_addr) != 16); + + /* Checks for struct lnet_process_id_packed */ + BUILD_BUG_ON((int)sizeof(struct lnet_process_id_packed) != 12); + BUILD_BUG_ON((int)offsetof(struct lnet_process_id_packed, nid) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_process_id_packed *)0)->nid) != 8); + BUILD_BUG_ON((int)offsetof(struct lnet_process_id_packed, pid) != 8); + BUILD_BUG_ON((int)sizeof(((struct lnet_process_id_packed *)0)->pid) != 4); + + /* Checks for struct lnet_handle_wire */ + BUILD_BUG_ON((int)sizeof(struct lnet_handle_wire) != 16); + BUILD_BUG_ON((int)offsetof(struct lnet_handle_wire, + wh_interface_cookie) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_handle_wire *)0)->wh_interface_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct lnet_handle_wire, + wh_object_cookie) != 8); + BUILD_BUG_ON((int)sizeof(((struct lnet_handle_wire *)0)->wh_object_cookie) != 8); + + /* Checks for struct struct lnet_magicversion */ + BUILD_BUG_ON((int)sizeof(struct lnet_magicversion) != 8); + BUILD_BUG_ON((int)offsetof(struct lnet_magicversion, magic) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_magicversion *)0)->magic) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_magicversion, version_major) != 4); + BUILD_BUG_ON((int)sizeof(((struct lnet_magicversion *)0)->version_major) != 2); + BUILD_BUG_ON((int)offsetof(struct lnet_magicversion, + version_minor) != 6); + BUILD_BUG_ON((int)sizeof(((struct lnet_magicversion *)0)->version_minor) != 2); + + /* Checks for struct _lnet_hdr_nid4 */ + BUILD_BUG_ON((int)sizeof(struct _lnet_hdr_nid4) != 72); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, dest_nid) != 0); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->dest_nid) != 8); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, src_nid) != 8); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->src_nid) != 8); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, dest_pid) != 16); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->dest_pid) != 4); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, src_pid) != 20); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->src_pid) != 4); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, type) != 24); + BUILD_BUG_ON((int)sizeof(((struct 
_lnet_hdr_nid4 *)0)->type) != 4); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, payload_length) != 28); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->payload_length) != 4); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg) != 32); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg) != 40); + + /* Ack */ + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.ack.dst_wmd) != 32); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.ack.dst_wmd) != 16); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.ack.match_bits) != 48); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.ack.match_bits) != 8); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.ack.mlength) != 56); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.ack.mlength) != 4); + + /* Put */ + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.put.ack_wmd) != 32); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.put.ack_wmd) != 16); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.put.match_bits) != 48); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.put.match_bits) != 8); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.put.hdr_data) != 56); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.put.hdr_data) != 8); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.put.ptl_index) != 64); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.put.ptl_index) != 4); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.put.offset) != 68); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.put.offset) != 4); + + /* Get */ + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.get.return_wmd) != 32); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.get.return_wmd) != 16); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.get.match_bits) != 48); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.get.match_bits) != 8); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.get.ptl_index) != 56); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.get.ptl_index) != 4); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.get.src_offset) != 60); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.get.src_offset) != 4); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.get.sink_length) != 64); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.get.sink_length) != 4); + + /* Reply */ + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.reply.dst_wmd) != 32); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.reply.dst_wmd) != 16); + + /* Hello */ + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.hello.incarnation) != 32); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.hello.incarnation) != 8); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.hello.type) != 40); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.hello.type) != 4); + + /* Checks for struct lnet_ni_status and related constants */ + BUILD_BUG_ON(LNET_NI_STATUS_INVALID != 0x00000000); + BUILD_BUG_ON(LNET_NI_STATUS_UP != 0x15aac0de); + BUILD_BUG_ON(LNET_NI_STATUS_DOWN != 0xdeadface); + + /* Checks for struct lnet_ni_status */ + BUILD_BUG_ON((int)sizeof(struct lnet_ni_status) != 16); + BUILD_BUG_ON((int)offsetof(struct lnet_ni_status, ns_nid) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_ni_status *)0)->ns_nid) != 8); + BUILD_BUG_ON((int)offsetof(struct lnet_ni_status, ns_status) != 8); + 
BUILD_BUG_ON((int)sizeof(((struct lnet_ni_status *)0)->ns_status) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_ni_status, ns_unused) != 12); + BUILD_BUG_ON((int)sizeof(((struct lnet_ni_status *)0)->ns_unused) != 4); + + /* Checks for struct lnet_ping_info and related constants */ + BUILD_BUG_ON(LNET_PROTO_PING_MAGIC != 0x70696E67); + BUILD_BUG_ON(LNET_PING_FEAT_INVAL != 0); + BUILD_BUG_ON(LNET_PING_FEAT_BASE != 1); + BUILD_BUG_ON(LNET_PING_FEAT_NI_STATUS != 2); + BUILD_BUG_ON(LNET_PING_FEAT_RTE_DISABLED != 4); + BUILD_BUG_ON(LNET_PING_FEAT_MULTI_RAIL != 8); + BUILD_BUG_ON(LNET_PING_FEAT_DISCOVERY != 16); + BUILD_BUG_ON(LNET_PING_FEAT_BITS != 31); + + /* Checks for struct lnet_ping_info */ + BUILD_BUG_ON((int)sizeof(struct lnet_ping_info) != 16); + BUILD_BUG_ON((int)offsetof(struct lnet_ping_info, pi_magic) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_ping_info *)0)->pi_magic) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_ping_info, pi_features) != 4); + BUILD_BUG_ON((int)sizeof(((struct lnet_ping_info *)0)->pi_features) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_ping_info, pi_pid) != 8); + BUILD_BUG_ON((int)sizeof(((struct lnet_ping_info *)0)->pi_pid) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_ping_info, pi_nnis) != 12); + BUILD_BUG_ON((int)sizeof(((struct lnet_ping_info *)0)->pi_nnis) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_ping_info, pi_ni) != 16); + BUILD_BUG_ON(offsetof(struct lnet_ping_info, pi_ni) != sizeof(struct lnet_ping_info)); + + /* Acceptor connection request */ + BUILD_BUG_ON(LNET_PROTO_ACCEPTOR_VERSION != 1); + + /* Checks for struct lnet_acceptor_connreq */ + BUILD_BUG_ON((int)sizeof(struct lnet_acceptor_connreq) != 16); + BUILD_BUG_ON((int)offsetof(struct lnet_acceptor_connreq, acr_magic) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_acceptor_connreq *)0)->acr_magic) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_acceptor_connreq, acr_version) != 4); + BUILD_BUG_ON((int)sizeof(((struct lnet_acceptor_connreq *)0)->acr_version) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_acceptor_connreq, acr_nid) != 8); + BUILD_BUG_ON((int)sizeof(((struct lnet_acceptor_connreq *)0)->acr_nid) != 8); + + /* Checks for struct lnet_acceptor_connreq_v2 */ + BUILD_BUG_ON((int)sizeof(struct lnet_acceptor_connreq_v2) != 28); + BUILD_BUG_ON((int)offsetof(struct lnet_acceptor_connreq_v2, acr_magic) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_acceptor_connreq_v2 *)0)->acr_magic) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_acceptor_connreq_v2, acr_version) != 4); + BUILD_BUG_ON((int)sizeof(((struct lnet_acceptor_connreq_v2 *)0)->acr_version) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_acceptor_connreq_v2, acr_nid) != 8); + BUILD_BUG_ON((int)sizeof(((struct lnet_acceptor_connreq_v2 *)0)->acr_nid) != 20); + + /* Checks for struct lnet_counters_common */ + BUILD_BUG_ON((int)sizeof(struct lnet_counters_common) != 60); + BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_msgs_alloc) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_msgs_alloc) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_msgs_max) != 4); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_msgs_max) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_errors) != 8); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_errors) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_send_count) != 12); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_send_count) != 4); 
+ BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_recv_count) != 16); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_recv_count) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_route_count) != 20); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_route_count) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_drop_count) != 24); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_drop_count) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_send_length) != 28); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_send_length) != 8); + BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_recv_length) != 36); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_recv_length) != 8); + BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_route_length) != 44); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_route_length) != 8); + BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_drop_length) != 52); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_drop_length) != 8); +} + +static const struct lnet_lnd *lnet_find_lnd_by_type(__u32 type) +{ + const struct lnet_lnd *lnd; + + /* holding lnd mutex */ + if (type >= NUM_LNDS) + return NULL; + lnd = the_lnet.ln_lnds[type]; + LASSERT(!lnd || lnd->lnd_type == type); + + return lnd; +} + +unsigned int +lnet_get_lnd_timeout(void) +{ + return lnet_lnd_timeout; +} +EXPORT_SYMBOL(lnet_get_lnd_timeout); + +void +lnet_register_lnd(const struct lnet_lnd *lnd) +{ + mutex_lock(&the_lnet.ln_lnd_mutex); + + LASSERT(libcfs_isknown_lnd(lnd->lnd_type)); + LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == NULL); + + the_lnet.ln_lnds[lnd->lnd_type] = lnd; + + CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type)); + + mutex_unlock(&the_lnet.ln_lnd_mutex); +} +EXPORT_SYMBOL(lnet_register_lnd); + +void +lnet_unregister_lnd(const struct lnet_lnd *lnd) +{ + mutex_lock(&the_lnet.ln_lnd_mutex); + + LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == lnd); + + the_lnet.ln_lnds[lnd->lnd_type] = NULL; + CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type)); + + mutex_unlock(&the_lnet.ln_lnd_mutex); +} +EXPORT_SYMBOL(lnet_unregister_lnd); + +static void +lnet_counters_get_common_locked(struct lnet_counters_common *common) +{ + struct lnet_counters *ctr; + int i; + + /* FIXME !!! There is no assert_lnet_net_locked() to ensure this is + * actually called under the protection of the lnet_net_lock.
+ */ + memset(common, 0, sizeof(*common)); + + cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { + common->lcc_msgs_max += ctr->lct_common.lcc_msgs_max; + common->lcc_msgs_alloc += ctr->lct_common.lcc_msgs_alloc; + common->lcc_errors += ctr->lct_common.lcc_errors; + common->lcc_send_count += ctr->lct_common.lcc_send_count; + common->lcc_recv_count += ctr->lct_common.lcc_recv_count; + common->lcc_route_count += ctr->lct_common.lcc_route_count; + common->lcc_drop_count += ctr->lct_common.lcc_drop_count; + common->lcc_send_length += ctr->lct_common.lcc_send_length; + common->lcc_recv_length += ctr->lct_common.lcc_recv_length; + common->lcc_route_length += ctr->lct_common.lcc_route_length; + common->lcc_drop_length += ctr->lct_common.lcc_drop_length; + } +} + +void +lnet_counters_get_common(struct lnet_counters_common *common) +{ + lnet_net_lock(LNET_LOCK_EX); + lnet_counters_get_common_locked(common); + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_get_common); + +int +lnet_counters_get(struct lnet_counters *counters) +{ + struct lnet_counters *ctr; + struct lnet_counters_health *health = &counters->lct_health; + int i, rc = 0; + + memset(counters, 0, sizeof(*counters)); + + lnet_net_lock(LNET_LOCK_EX); + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + GOTO(out_unlock, rc = -ENODEV); + + lnet_counters_get_common_locked(&counters->lct_common); + + cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { + health->lch_rst_alloc += ctr->lct_health.lch_rst_alloc; + health->lch_resend_count += ctr->lct_health.lch_resend_count; + health->lch_response_timeout_count += + ctr->lct_health.lch_response_timeout_count; + health->lch_local_interrupt_count += + ctr->lct_health.lch_local_interrupt_count; + health->lch_local_dropped_count += + ctr->lct_health.lch_local_dropped_count; + health->lch_local_aborted_count += + ctr->lct_health.lch_local_aborted_count; + health->lch_local_no_route_count += + ctr->lct_health.lch_local_no_route_count; + health->lch_local_timeout_count += + ctr->lct_health.lch_local_timeout_count; + health->lch_local_error_count += + ctr->lct_health.lch_local_error_count; + health->lch_remote_dropped_count += + ctr->lct_health.lch_remote_dropped_count; + health->lch_remote_error_count += + ctr->lct_health.lch_remote_error_count; + health->lch_remote_timeout_count += + ctr->lct_health.lch_remote_timeout_count; + health->lch_network_timeout_count += + ctr->lct_health.lch_network_timeout_count; + } +out_unlock: + lnet_net_unlock(LNET_LOCK_EX); + return rc; +} +EXPORT_SYMBOL(lnet_counters_get); + +void +lnet_counters_reset(void) +{ + struct lnet_counters *counters; + int i; + + lnet_net_lock(LNET_LOCK_EX); + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + goto avoid_reset; + + cfs_percpt_for_each(counters, i, the_lnet.ln_counters) + memset(counters, 0, sizeof(struct lnet_counters)); +avoid_reset: + lnet_net_unlock(LNET_LOCK_EX); +} + +static char * +lnet_res_type2str(int type) +{ + switch (type) { + default: + LBUG(); + case LNET_COOKIE_TYPE_MD: + return "MD"; + case LNET_COOKIE_TYPE_ME: + return "ME"; + case LNET_COOKIE_TYPE_EQ: + return "EQ"; + } +} + +static void +lnet_res_container_cleanup(struct lnet_res_container *rec) +{ + int count = 0; + + if (rec->rec_type == 0) /* not set yet, it's uninitialized */ + return; + + while (!list_empty(&rec->rec_active)) { + struct list_head *e = rec->rec_active.next; + + list_del_init(e); + if (rec->rec_type == LNET_COOKIE_TYPE_MD) { + lnet_md_free(list_entry(e, struct lnet_libmd, md_list)); + + } else { /* NB: Active MEs should be 
attached to portals */ + LBUG(); + } + count++; + } + + if (count > 0) { + /* Found alive MD/ME/EQ, user really should unlink/free + * all of them before finalizing LNet, but if someone didn't, + * we have to recycle garbage for him */ + CERROR("%d active elements on exit of %s container\n", + count, lnet_res_type2str(rec->rec_type)); + } + + if (rec->rec_lh_hash != NULL) { + CFS_FREE_PTR_ARRAY(rec->rec_lh_hash, LNET_LH_HASH_SIZE); + rec->rec_lh_hash = NULL; + } + + rec->rec_type = 0; /* mark it as finalized */ +} + +static int +lnet_res_container_setup(struct lnet_res_container *rec, int cpt, int type) +{ + int rc = 0; + int i; + + LASSERT(rec->rec_type == 0); + + rec->rec_type = type; + INIT_LIST_HEAD(&rec->rec_active); + + rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type; + + /* Arbitrary choice of hash table size */ + LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt, + LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0])); + if (rec->rec_lh_hash == NULL) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < LNET_LH_HASH_SIZE; i++) + INIT_LIST_HEAD(&rec->rec_lh_hash[i]); + + return 0; + +out: + CERROR("Failed to setup %s resource container\n", + lnet_res_type2str(type)); + lnet_res_container_cleanup(rec); + return rc; +} + +static void +lnet_res_containers_destroy(struct lnet_res_container **recs) +{ + struct lnet_res_container *rec; + int i; + + cfs_percpt_for_each(rec, i, recs) + lnet_res_container_cleanup(rec); + + cfs_percpt_free(recs); +} + +static struct lnet_res_container ** +lnet_res_containers_create(int type) +{ + struct lnet_res_container **recs; + struct lnet_res_container *rec; + int rc; + int i; + + recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec)); + if (recs == NULL) { + CERROR("Failed to allocate %s resource containers\n", + lnet_res_type2str(type)); + return NULL; + } + + cfs_percpt_for_each(rec, i, recs) { + rc = lnet_res_container_setup(rec, i, type); + if (rc != 0) { + lnet_res_containers_destroy(recs); + return NULL; + } + } + + return recs; +} + +struct lnet_libhandle * +lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie) +{ + /* ALWAYS called with lnet_res_lock held */ + struct list_head *head; + struct lnet_libhandle *lh; + unsigned int hash; + + if ((cookie & LNET_COOKIE_MASK) != rec->rec_type) + return NULL; + + hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS); + head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK]; + + list_for_each_entry(lh, head, lh_hash_chain) { + if (lh->lh_cookie == cookie) + return lh; + } + + return NULL; +} + +void +lnet_res_lh_initialize(struct lnet_res_container *rec, + struct lnet_libhandle *lh) +{ + /* ALWAYS called with lnet_res_lock held */ + unsigned int ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS; + unsigned int hash; + + lh->lh_cookie = rec->rec_lh_cookie; + rec->rec_lh_cookie += 1 << ibits; + + hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK; + + list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]); +} + +struct list_head ** +lnet_create_array_of_queues(void) +{ + struct list_head **qs; + struct list_head *q; + int i; + + qs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct list_head)); + if (!qs) { + CERROR("Failed to allocate queues\n"); + return NULL; + } + + cfs_percpt_for_each(q, i, qs) + INIT_LIST_HEAD(q); + + return qs; +} + +static int lnet_unprepare(void); + +static int +lnet_prepare(lnet_pid_t requested_pid) +{ + /* Prepare to bring up the network */ + struct lnet_res_container **recs; + int rc = 0; + + if (requested_pid == LNET_PID_ANY) { + /* Don't instantiate LNET
just for me */ + return -ENETDOWN; + } + + LASSERT(the_lnet.ln_refcount == 0); + + the_lnet.ln_routing = 0; + + LASSERT((requested_pid & LNET_PID_USERFLAG) == 0); + the_lnet.ln_pid = requested_pid; + + INIT_LIST_HEAD(&the_lnet.ln_test_peers); + INIT_LIST_HEAD(&the_lnet.ln_remote_peer_ni_list); + INIT_LIST_HEAD(&the_lnet.ln_nets); + INIT_LIST_HEAD(&the_lnet.ln_routers); + INIT_LIST_HEAD(&the_lnet.ln_drop_rules); + INIT_LIST_HEAD(&the_lnet.ln_delay_rules); + INIT_LIST_HEAD(&the_lnet.ln_dc_request); + INIT_LIST_HEAD(&the_lnet.ln_dc_working); + INIT_LIST_HEAD(&the_lnet.ln_dc_expired); + INIT_LIST_HEAD(&the_lnet.ln_mt_localNIRecovq); + INIT_LIST_HEAD(&the_lnet.ln_mt_peerNIRecovq); + INIT_LIST_HEAD(&the_lnet.ln_udsp_list); + init_waitqueue_head(&the_lnet.ln_dc_waitq); + the_lnet.ln_mt_handler = NULL; + init_completion(&the_lnet.ln_started); + + rc = lnet_slab_setup(); + if (rc != 0) + goto failed; + + rc = lnet_create_remote_nets_table(); + if (rc != 0) + goto failed; + + /* + * NB the interface cookie in wire handles guards against delayed + * replies and ACKs appearing valid after reboot. + */ + the_lnet.ln_interface_cookie = ktime_get_real_ns(); + + the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct lnet_counters)); + if (the_lnet.ln_counters == NULL) { + CERROR("Failed to allocate counters for LNet\n"); + rc = -ENOMEM; + goto failed; + } + + rc = lnet_peer_tables_create(); + if (rc != 0) + goto failed; + + rc = lnet_msg_containers_create(); + if (rc != 0) + goto failed; + + rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0, + LNET_COOKIE_TYPE_EQ); + if (rc != 0) + goto failed; + + recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD); + if (recs == NULL) { + rc = -ENOMEM; + goto failed; + } + + the_lnet.ln_md_containers = recs; + + rc = lnet_portals_create(); + if (rc != 0) { + CERROR("Failed to create portals for LNet: %d\n", rc); + goto failed; + } + + the_lnet.ln_mt_zombie_rstqs = lnet_create_array_of_queues(); + if (!the_lnet.ln_mt_zombie_rstqs) { + rc = -ENOMEM; + goto failed; + } + + return 0; + + failed: + lnet_unprepare(); + return rc; +} + +static int +lnet_unprepare(void) +{ + /* NB no LNET_LOCK since this is the last reference. 
All LND instances + * have shut down already, so it is safe to unlink and free all + * descriptors, even those that appear committed to a network op (eg MD + * with non-zero pending count) */ + + lnet_fail_nid(LNET_NID_ANY, 0); + + LASSERT(the_lnet.ln_refcount == 0); + LASSERT(list_empty(&the_lnet.ln_test_peers)); + LASSERT(list_empty(&the_lnet.ln_nets)); + + if (the_lnet.ln_mt_zombie_rstqs) { + lnet_clean_zombie_rstqs(); + the_lnet.ln_mt_zombie_rstqs = NULL; + } + + lnet_assert_handler_unused(the_lnet.ln_mt_handler); + the_lnet.ln_mt_handler = NULL; + + lnet_portals_destroy(); + + if (the_lnet.ln_md_containers != NULL) { + lnet_res_containers_destroy(the_lnet.ln_md_containers); + the_lnet.ln_md_containers = NULL; + } + + lnet_res_container_cleanup(&the_lnet.ln_eq_container); + + lnet_msg_containers_destroy(); + lnet_peer_uninit(); + lnet_rtrpools_free(0); + + if (the_lnet.ln_counters != NULL) { + cfs_percpt_free(the_lnet.ln_counters); + the_lnet.ln_counters = NULL; + } + lnet_destroy_remote_nets_table(); + lnet_udsp_destroy(true); + lnet_slab_cleanup(); + + return 0; +} + +struct lnet_ni * +lnet_net2ni_locked(__u32 net_id, int cpt) +{ + struct lnet_ni *ni; + struct lnet_net *net; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (net->net_id == net_id) { + ni = list_entry(net->net_ni_list.next, struct lnet_ni, + ni_netlist); + return ni; + } + } + + return NULL; +} + +struct lnet_ni * +lnet_net2ni_addref(__u32 net) +{ + struct lnet_ni *ni; + + lnet_net_lock(0); + ni = lnet_net2ni_locked(net, 0); + if (ni) + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); + + return ni; +} +EXPORT_SYMBOL(lnet_net2ni_addref); + +struct lnet_net * +lnet_get_net_locked(__u32 net_id) +{ + struct lnet_net *net; + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (net->net_id == net_id) + return net; + } + + return NULL; +} + +void +lnet_net_clr_pref_rtrs(struct lnet_net *net) +{ + struct list_head zombies; + struct lnet_nid_list *ne; + struct lnet_nid_list *tmp; + + INIT_LIST_HEAD(&zombies); + + lnet_net_lock(LNET_LOCK_EX); + list_splice_init(&net->net_rtr_pref_nids, &zombies); + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry_safe(ne, tmp, &zombies, nl_list) { + list_del_init(&ne->nl_list); + LIBCFS_FREE(ne, sizeof(*ne)); + } +} + +int +lnet_net_add_pref_rtr(struct lnet_net *net, + struct lnet_nid *gw_nid) +__must_hold(&the_lnet.ln_api_mutex) +{ + struct lnet_nid_list *ne; + + /* This function is called with api_mutex held. When the api_mutex + * is held the list can not be modified, as it is only modified as + * a result of applying a UDSP and that happens under api_mutex + * lock. 
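The cookie scheme set up by lnet_res_lh_initialize() and looked up by lnet_res_lh_lookup() above packs the resource type into the low LNET_COOKIE_TYPE_BITS bits and the CPT just above it, then advances the per-container counter in the bits above both, so those fields never change for a given container. A minimal standalone sketch of that layout, using made-up bit widths and hash size in place of the real LNET_COOKIE_TYPE_BITS/LNET_CPT_BITS/LNET_LH_HASH_SIZE values (which are not defined in this hunk):

/*
 * Illustrative sketch of the cookie layout used by
 * lnet_res_lh_initialize()/lnet_res_lh_lookup(); the bit widths below
 * are stand-ins, not the real LNET_* constants.
 */
#include <stdint.h>
#include <stdio.h>

#define COOKIE_TYPE_BITS 2              /* stand-in for LNET_COOKIE_TYPE_BITS */
#define CPT_BITS         4              /* stand-in for LNET_CPT_BITS */
#define LH_HASH_SIZE     512            /* stand-in for LNET_LH_HASH_SIZE */
#define LH_HASH_MASK     (LH_HASH_SIZE - 1)
#define COOKIE_MASK      ((1ULL << COOKIE_TYPE_BITS) - 1)

static uint64_t next_cookie;

/* Seed the per-container cookie, as lnet_res_container_setup() does. */
static void container_setup(unsigned int cpt, unsigned int type)
{
        next_cookie = ((uint64_t)cpt << COOKIE_TYPE_BITS) | type;
}

/* Hand out a cookie and advance the counter above the type/cpt fields. */
static uint64_t cookie_alloc(void)
{
        uint64_t c = next_cookie;

        next_cookie += 1ULL << (COOKIE_TYPE_BITS + CPT_BITS);
        return c;
}

/* Bucket index used for the lh_hash_chain lists. */
static unsigned int cookie_bucket(uint64_t cookie)
{
        return (cookie >> (COOKIE_TYPE_BITS + CPT_BITS)) & LH_HASH_MASK;
}

int main(void)
{
        container_setup(3, 1);          /* cpt 3, cookie type 1 */

        for (int i = 0; i < 4; i++) {
                uint64_t c = cookie_alloc();

                printf("cookie %#llx type %llu bucket %u\n",
                       (unsigned long long)c,
                       (unsigned long long)(c & COOKIE_MASK),
                       cookie_bucket(c));
        }
        return 0;
}

Because only the counter bits move, a lookup can cheaply reject a cookie of the wrong type (the low-bits check at the top of lnet_res_lh_lookup()) before touching the hash chain.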
+ */ + list_for_each_entry(ne, &net->net_rtr_pref_nids, nl_list) { + if (nid_same(&ne->nl_nid, gw_nid)) + return -EEXIST; + } + + LIBCFS_ALLOC(ne, sizeof(*ne)); + if (!ne) + return -ENOMEM; + + ne->nl_nid = *gw_nid; + + /* Lock the cpt to protect against addition and checks in the + * selection algorithm + */ + lnet_net_lock(LNET_LOCK_EX); + list_add(&ne->nl_list, &net->net_rtr_pref_nids); + lnet_net_unlock(LNET_LOCK_EX); + + return 0; +} + +bool +lnet_net_is_pref_rtr_locked(struct lnet_net *net, struct lnet_nid *rtr_nid) +{ + struct lnet_nid_list *ne; + + CDEBUG(D_NET, "%s: rtr pref empty: %d\n", + libcfs_net2str(net->net_id), + list_empty(&net->net_rtr_pref_nids)); + + if (list_empty(&net->net_rtr_pref_nids)) + return false; + + list_for_each_entry(ne, &net->net_rtr_pref_nids, nl_list) { + CDEBUG(D_NET, "Comparing pref %s with gw %s\n", + libcfs_nidstr(&ne->nl_nid), + libcfs_nidstr(rtr_nid)); + if (nid_same(rtr_nid, &ne->nl_nid)) + return true; + } + + return false; +} + +static unsigned int +lnet_nid4_cpt_hash(lnet_nid_t nid, unsigned int number) +{ + __u64 key = nid; + __u64 pair_bits = 0x0001000100010001LLU; + __u64 mask = pair_bits * 0xFF; + __u64 pair_sum; + + /* Use (sum-by-multiplication of nid bytes) mod (number of CPTs) + * to match nid to a CPT. + */ + pair_sum = (key & mask) + ((key >> 8) & mask); + pair_sum = (pair_sum * pair_bits) >> 48; + + CDEBUG(D_NET, "Match nid %s to cpt %u\n", + libcfs_nid2str(nid), (unsigned int)(pair_sum) % number); + + return (unsigned int)(pair_sum) % number; +} + +unsigned int +lnet_nid_cpt_hash(struct lnet_nid *nid, unsigned int number) +{ + unsigned int val; + u32 h = 0; + int i; + + LASSERT(number >= 1 && number <= LNET_CPT_NUMBER); + + if (number == 1) + return 0; + + if (nid_is_nid4(nid)) + return lnet_nid4_cpt_hash(lnet_nid_to_nid4(nid), number); + + for (i = 0; i < 4; i++) + h = hash_32(nid->nid_addr[i]^h, 32); + val = hash_32(LNET_NID_NET(nid) ^ h, LNET_CPT_BITS); + if (val < number) + return val; + return (unsigned int)(h + val + (val >> 1)) % number; +} + +int +lnet_cpt_of_nid_locked(struct lnet_nid *nid, struct lnet_ni *ni) +{ + struct lnet_net *net; + + /* must called with hold of lnet_net_lock */ + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + /* + * If NI is provided then use the CPT identified in the NI cpt + * list if one exists. If one doesn't exist, then that NI is + * associated with all CPTs and it follows that the net it belongs + * to is implicitly associated with all CPTs, so just hash the nid + * and return that. 
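lnet_nid4_cpt_hash() above relies on a SWAR trick: masking with 0x00FF00FF00FF00FF and adding the nid to its copy shifted right by 8 puts each pair of nid bytes into its own 16-bit lane, and multiplying by 0x0001000100010001 folds the four lane sums into bits 48-63, so the final shift by 48 yields the plain sum of all eight nid bytes. A standalone sketch that checks this against a naive byte loop (the test nid value is arbitrary):

/*
 * Standalone check of the byte-sum trick in lnet_nid4_cpt_hash(): the
 * masked adds build 16-bit lanes of byte pairs and the multiply folds
 * all lanes into the top 16 bits.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static unsigned int pair_sum_hash(uint64_t key, unsigned int number)
{
        const uint64_t pair_bits = 0x0001000100010001ULL;
        const uint64_t mask = pair_bits * 0xFF;       /* 0x00FF00FF00FF00FF */
        uint64_t pair_sum;

        pair_sum = (key & mask) + ((key >> 8) & mask);
        pair_sum = (pair_sum * pair_bits) >> 48;

        return (unsigned int)pair_sum % number;
}

/* Reference: sum the eight bytes one at a time. */
static unsigned int byte_sum_hash(uint64_t key, unsigned int number)
{
        unsigned int sum = 0;

        for (int i = 0; i < 8; i++)
                sum += (key >> (8 * i)) & 0xFF;
        return sum % number;
}

int main(void)
{
        uint64_t nid = 0x0123456789abcdefULL;   /* arbitrary test value */

        for (unsigned int ncpts = 1; ncpts <= 16; ncpts++)
                assert(pair_sum_hash(nid, ncpts) == byte_sum_hash(nid, ncpts));

        printf("nid %#llx -> cpt %u of 8\n",
               (unsigned long long)nid, pair_sum_hash(nid, 8));
        return 0;
}

The lane sums never exceed 2040, so nothing carries between lanes and the bits-48..63 field holds the exact byte sum before the modulo.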
+ */ + if (ni != NULL) { + if (ni->ni_cpts != NULL) + return ni->ni_cpts[lnet_nid_cpt_hash(nid, + ni->ni_ncpts)]; + else + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + } + + /* no NI provided so look at the net */ + net = lnet_get_net_locked(LNET_NID_NET(nid)); + + if (net != NULL && net->net_cpts != NULL) { + return net->net_cpts[lnet_nid_cpt_hash(nid, net->net_ncpts)]; + } + + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); +} + +int +lnet_nid2cpt(struct lnet_nid *nid, struct lnet_ni *ni) +{ + int cpt; + int cpt2; + + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + cpt = lnet_net_lock_current(); + + cpt2 = lnet_cpt_of_nid_locked(nid, ni); + + lnet_net_unlock(cpt); + + return cpt2; +} +EXPORT_SYMBOL(lnet_nid2cpt); + +int +lnet_cpt_of_nid(lnet_nid_t nid4, struct lnet_ni *ni) +{ + struct lnet_nid nid; + + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + lnet_nid4_to_nid(nid4, &nid); + return lnet_nid2cpt(&nid, ni); +} +EXPORT_SYMBOL(lnet_cpt_of_nid); + +int +lnet_islocalnet_locked(__u32 net_id) +{ + struct lnet_net *net; + bool local; + + net = lnet_get_net_locked(net_id); + + local = net != NULL; + + return local; +} + +int +lnet_islocalnet(__u32 net_id) +{ + int cpt; + bool local; + + cpt = lnet_net_lock_current(); + + local = lnet_islocalnet_locked(net_id); + + lnet_net_unlock(cpt); + + return local; +} + +struct lnet_ni * +lnet_nid_to_ni_locked(struct lnet_nid *nid, int cpt) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (nid_same(&ni->ni_nid, nid)) + return ni; + } + } + + return NULL; +} + +struct lnet_ni * +lnet_nid2ni_locked(lnet_nid_t nid4, int cpt) +{ + struct lnet_nid nid; + + lnet_nid4_to_nid(nid4, &nid); + return lnet_nid_to_ni_locked(&nid, cpt); +} + +struct lnet_ni * +lnet_nid2ni_addref(lnet_nid_t nid4) +{ + struct lnet_ni *ni; + struct lnet_nid nid; + + lnet_nid4_to_nid(nid4, &nid); + + lnet_net_lock(0); + ni = lnet_nid_to_ni_locked(&nid, 0); + if (ni) + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); + + return ni; +} +EXPORT_SYMBOL(lnet_nid2ni_addref); + +struct lnet_ni * +lnet_nid_to_ni_addref(struct lnet_nid *nid) +{ + struct lnet_ni *ni; + + lnet_net_lock(0); + ni = lnet_nid_to_ni_locked(nid, 0); + if (ni) + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); + + return ni; +} +EXPORT_SYMBOL(lnet_nid_to_ni_addref); + +int +lnet_islocalnid(struct lnet_nid *nid) +{ + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + ni = lnet_nid_to_ni_locked(nid, cpt); + lnet_net_unlock(cpt); + + return ni != NULL; +} + +int +lnet_count_acceptor_nets(void) +{ + /* Return the # of NIs that need the acceptor. 
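The *_addref() helpers above (lnet_net2ni_addref(), lnet_nid2ni_addref(), lnet_nid_to_ni_addref()) all follow the same idiom: take the net lock, find the object, and take a reference before dropping the lock, so the NI cannot be freed between the lookup and the caller's use of it. A generic pthread-based sketch of that idiom, not LNet code (the object type and match callback are invented):

/* Lookup-with-reference idiom: the ref is taken while the lock is held. */
#include <pthread.h>
#include <stdlib.h>

struct obj {
        int refcount;
        struct obj *next;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *table;               /* list of live objects */

static struct obj *obj_lookup_addref(int (*match)(struct obj *))
{
        struct obj *o;

        pthread_mutex_lock(&table_lock);
        for (o = table; o != NULL; o = o->next) {
                if (match(o)) {
                        o->refcount++;  /* grab the ref under the lock */
                        break;
                }
        }
        pthread_mutex_unlock(&table_lock);
        return o;                       /* stays valid until obj_decref() */
}

static void obj_decref(struct obj *o)
{
        int free_it;

        pthread_mutex_lock(&table_lock);
        free_it = (--o->refcount == 0);
        pthread_mutex_unlock(&table_lock);
        if (free_it)
                free(o);
}

static int match_any(struct obj *o) { (void)o; return 1; }

int main(void)
{
        struct obj *o = calloc(1, sizeof(*o));

        o->refcount = 1;                /* the table's own reference */
        table = o;

        struct obj *found = obj_lookup_addref(match_any);
        obj_decref(found);              /* drop lookup ref; table ref remains */
        return 0;
}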
*/ + int count = 0; + struct lnet_net *net; + int cpt; + + cpt = lnet_net_lock_current(); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + /* all socklnd type networks should have the acceptor + * thread started */ + if (net->net_lnd->lnd_accept != NULL) + count++; + } + + lnet_net_unlock(cpt); + + return count; +} + +struct lnet_ping_buffer * +lnet_ping_buffer_alloc(int nnis, gfp_t gfp) +{ + struct lnet_ping_buffer *pbuf; + + LIBCFS_ALLOC_GFP(pbuf, LNET_PING_BUFFER_SIZE(nnis), gfp); + if (pbuf) { + pbuf->pb_nnis = nnis; + pbuf->pb_needs_post = false; + atomic_set(&pbuf->pb_refcnt, 1); + } + + return pbuf; +} + +void +lnet_ping_buffer_free(struct lnet_ping_buffer *pbuf) +{ + LASSERT(atomic_read(&pbuf->pb_refcnt) == 0); + LIBCFS_FREE(pbuf, LNET_PING_BUFFER_SIZE(pbuf->pb_nnis)); +} + +static struct lnet_ping_buffer * +lnet_ping_target_create(int nnis) +{ + struct lnet_ping_buffer *pbuf; + + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (pbuf == NULL) { + CERROR("Can't allocate ping source [%d]\n", nnis); + return NULL; + } + + pbuf->pb_info.pi_nnis = nnis; + pbuf->pb_info.pi_pid = the_lnet.ln_pid; + pbuf->pb_info.pi_magic = LNET_PROTO_PING_MAGIC; + pbuf->pb_info.pi_features = + LNET_PING_FEAT_NI_STATUS | LNET_PING_FEAT_MULTI_RAIL; + + return pbuf; +} + +static inline int +lnet_get_net_ni_count_locked(struct lnet_net *net) +{ + struct lnet_ni *ni; + int count = 0; + + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) + count++; + + return count; +} + +static inline int +lnet_get_net_ni_count_pre(struct lnet_net *net) +{ + struct lnet_ni *ni; + int count = 0; + + list_for_each_entry(ni, &net->net_ni_added, ni_netlist) + count++; + + return count; +} + +static inline int +lnet_get_ni_count(void) +{ + struct lnet_ni *ni; + struct lnet_net *net; + int count = 0; + + lnet_net_lock(0); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) + count++; + } + + lnet_net_unlock(0); + + return count; +} + +void +lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf) +{ + struct lnet_ni_status *stat; + int nnis; + int i; + + __swab32s(&pbuf->pb_info.pi_magic); + __swab32s(&pbuf->pb_info.pi_features); + __swab32s(&pbuf->pb_info.pi_pid); + __swab32s(&pbuf->pb_info.pi_nnis); + nnis = pbuf->pb_info.pi_nnis; + if (nnis > pbuf->pb_nnis) + nnis = pbuf->pb_nnis; + for (i = 0; i < nnis; i++) { + stat = &pbuf->pb_info.pi_ni[i]; + __swab64s(&stat->ns_nid); + __swab32s(&stat->ns_status); + } +} + +int +lnet_ping_info_validate(struct lnet_ping_info *pinfo) +{ + if (!pinfo) + return -EINVAL; + if (pinfo->pi_magic != LNET_PROTO_PING_MAGIC) + return -EPROTO; + if (!(pinfo->pi_features & LNET_PING_FEAT_NI_STATUS)) + return -EPROTO; + /* Loopback is guaranteed to be present */ + if (pinfo->pi_nnis < 1 || pinfo->pi_nnis > lnet_interfaces_max) + return -ERANGE; + if (LNET_PING_INFO_LONI(pinfo) != LNET_NID_LO_0) + return -EPROTO; + return 0; +} + +static void +lnet_ping_target_destroy(void) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + lnet_net_lock(LNET_LOCK_EX); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + lnet_ni_lock(ni); + ni->ni_status = NULL; + lnet_ni_unlock(ni); + } + } + + lnet_ping_buffer_decref(the_lnet.ln_ping_target); + the_lnet.ln_ping_target = NULL; + + lnet_net_unlock(LNET_LOCK_EX); +} + +static void +lnet_ping_target_event_handler(struct lnet_event *event) +{ + struct lnet_ping_buffer *pbuf = event->md_user_ptr; + + if (event->unlinked) + 
lnet_ping_buffer_decref(pbuf); +} + +static int +lnet_ping_target_setup(struct lnet_ping_buffer **ppbuf, + struct lnet_handle_md *ping_mdh, + int ni_count, bool set_eq) +{ + struct lnet_processid id = { + .nid = LNET_ANY_NID, + .pid = LNET_PID_ANY + }; + struct lnet_me *me; + struct lnet_md md = { NULL }; + int rc; + + if (set_eq) + the_lnet.ln_ping_target_handler = + lnet_ping_target_event_handler; + + *ppbuf = lnet_ping_target_create(ni_count); + if (*ppbuf == NULL) { + rc = -ENOMEM; + goto fail_free_eq; + } + + /* Ping target ME/MD */ + me = LNetMEAttach(LNET_RESERVED_PORTAL, &id, + LNET_PROTO_PING_MATCHBITS, 0, + LNET_UNLINK, LNET_INS_AFTER); + if (IS_ERR(me)) { + rc = PTR_ERR(me); + CERROR("Can't create ping target ME: %d\n", rc); + goto fail_decref_ping_buffer; + } + + /* initialize md content */ + md.start = &(*ppbuf)->pb_info; + md.length = LNET_PING_INFO_SIZE((*ppbuf)->pb_nnis); + md.threshold = LNET_MD_THRESH_INF; + md.max_size = 0; + md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE | + LNET_MD_MANAGE_REMOTE; + md.handler = the_lnet.ln_ping_target_handler; + md.user_ptr = *ppbuf; + + rc = LNetMDAttach(me, &md, LNET_RETAIN, ping_mdh); + if (rc != 0) { + CERROR("Can't attach ping target MD: %d\n", rc); + goto fail_decref_ping_buffer; + } + lnet_ping_buffer_addref(*ppbuf); + + return 0; + +fail_decref_ping_buffer: + LASSERT(atomic_read(&(*ppbuf)->pb_refcnt) == 1); + lnet_ping_buffer_decref(*ppbuf); + *ppbuf = NULL; +fail_free_eq: + return rc; +} + +static void +lnet_ping_md_unlink(struct lnet_ping_buffer *pbuf, + struct lnet_handle_md *ping_mdh) +{ + LNetMDUnlink(*ping_mdh); + LNetInvalidateMDHandle(ping_mdh); + + /* NB the MD could be busy; this just starts the unlink */ + wait_var_event_warning(&pbuf->pb_refcnt, + atomic_read(&pbuf->pb_refcnt) <= 1, + "Still waiting for ping data MD to unlink\n"); +} + +static void +lnet_ping_target_install_locked(struct lnet_ping_buffer *pbuf) +{ + struct lnet_ni *ni; + struct lnet_net *net; + struct lnet_ni_status *ns; + int i; + int rc; + + i = 0; + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + LASSERT(i < pbuf->pb_nnis); + + ns = &pbuf->pb_info.pi_ni[i]; + + if (!nid_is_nid4(&ni->ni_nid)) + continue; + ns->ns_nid = lnet_nid_to_nid4(&ni->ni_nid); + + lnet_ni_lock(ni); + ns->ns_status = lnet_ni_get_status_locked(ni); + ni->ni_status = ns; + lnet_ni_unlock(ni); + + i++; + } + } + /* + * We (ab)use the ns_status of the loopback interface to + * transmit the sequence number. The first interface listed + * must be the loopback interface. + */ + rc = lnet_ping_info_validate(&pbuf->pb_info); + if (rc) { + LCONSOLE_EMERG("Invalid ping target: %d\n", rc); + LBUG(); + } + LNET_PING_BUFFER_SEQNO(pbuf) = + atomic_inc_return(&the_lnet.ln_ping_target_seqno); +} + +static void +lnet_ping_target_update(struct lnet_ping_buffer *pbuf, + struct lnet_handle_md ping_mdh) +{ + struct lnet_ping_buffer *old_pbuf = NULL; + struct lnet_handle_md old_ping_md; + + /* switch the NIs to point to the new ping info created */ + lnet_net_lock(LNET_LOCK_EX); + + if (!the_lnet.ln_routing) + pbuf->pb_info.pi_features |= LNET_PING_FEAT_RTE_DISABLED; + if (!lnet_peer_discovery_disabled) + pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY; + + /* Ensure only known feature bits have been set. 
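lnet_swap_pinginfo() above only runs when a ping or push buffer arrived from a peer of the opposite endianness, which the handlers detect by comparing pi_magic against the byte-swapped LNET_PROTO_PING_MAGIC (as the push handler further down does). A standalone sketch of that detect-and-swab pattern; the header struct and magic value here are stand-ins, not the LNet wire format:

/*
 * Detect a foreign-endian header by its byte-reversed magic, then swab
 * every multi-byte field.  Struct and magic are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

#define PING_MAGIC 0x70696e67u          /* stand-in for LNET_PROTO_PING_MAGIC */

struct ping_hdr {
        uint32_t pi_magic;
        uint32_t pi_nnis;
};

static uint32_t swab32(uint32_t v)
{
        return __builtin_bswap32(v);
}

static int ping_hdr_fixup(struct ping_hdr *h)
{
        if (h->pi_magic == PING_MAGIC)
                return 0;                       /* same byte order */
        if (h->pi_magic != swab32(PING_MAGIC))
                return -1;                      /* not a ping buffer at all */

        h->pi_magic = swab32(h->pi_magic);      /* foreign order: swab fields */
        h->pi_nnis = swab32(h->pi_nnis);
        return 0;
}

int main(void)
{
        /* Simulate a header written on a machine of the other endianness. */
        struct ping_hdr h = { swab32(PING_MAGIC), swab32(3) };

        if (ping_hdr_fixup(&h) == 0)
                printf("magic ok, nnis=%u\n", h.pi_nnis);
        return 0;
}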
*/ + LASSERT(pbuf->pb_info.pi_features & LNET_PING_FEAT_BITS); + LASSERT(!(pbuf->pb_info.pi_features & ~LNET_PING_FEAT_BITS)); + + lnet_ping_target_install_locked(pbuf); + + if (the_lnet.ln_ping_target) { + old_pbuf = the_lnet.ln_ping_target; + old_ping_md = the_lnet.ln_ping_target_md; + } + the_lnet.ln_ping_target_md = ping_mdh; + the_lnet.ln_ping_target = pbuf; + + lnet_net_unlock(LNET_LOCK_EX); + + if (old_pbuf) { + /* unlink and free the old ping info */ + lnet_ping_md_unlink(old_pbuf, &old_ping_md); + lnet_ping_buffer_decref(old_pbuf); + } + + lnet_push_update_to_peers(0); +} + +static void +lnet_ping_target_fini(void) +{ + lnet_ping_md_unlink(the_lnet.ln_ping_target, + &the_lnet.ln_ping_target_md); + + lnet_assert_handler_unused(the_lnet.ln_ping_target_handler); + lnet_ping_target_destroy(); +} + +/* Resize the push target. */ +int lnet_push_target_resize(void) +{ + struct lnet_handle_md mdh; + struct lnet_handle_md old_mdh; + struct lnet_ping_buffer *pbuf; + struct lnet_ping_buffer *old_pbuf; + int nnis; + int rc; + +again: + nnis = the_lnet.ln_push_target_nnis; + if (nnis <= 0) { + CDEBUG(D_NET, "Invalid nnis %d\n", nnis); + return -EINVAL; + } + + /* NB: lnet_ping_buffer_alloc() sets pbuf refcount to 1. That ref is + * dropped when we need to resize again (see "old_pbuf" below) or when + * LNet is shutdown (see lnet_push_target_fini()) + */ + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (!pbuf) { + CDEBUG(D_NET, "Can't allocate pbuf for nnis %d\n", nnis); + return -ENOMEM; + } + + rc = lnet_push_target_post(pbuf, &mdh); + if (rc) { + CDEBUG(D_NET, "Failed to post push target: %d\n", rc); + lnet_ping_buffer_decref(pbuf); + return rc; + } + + lnet_net_lock(LNET_LOCK_EX); + old_pbuf = the_lnet.ln_push_target; + old_mdh = the_lnet.ln_push_target_md; + the_lnet.ln_push_target = pbuf; + the_lnet.ln_push_target_md = mdh; + lnet_net_unlock(LNET_LOCK_EX); + + if (old_pbuf) { + LNetMDUnlink(old_mdh); + /* Drop ref set by lnet_ping_buffer_alloc() */ + lnet_ping_buffer_decref(old_pbuf); + } + + /* Received another push or reply that requires a larger buffer */ + if (nnis < the_lnet.ln_push_target_nnis) + goto again; + + CDEBUG(D_NET, "nnis %d success\n", nnis); + return 0; +} + +int lnet_push_target_post(struct lnet_ping_buffer *pbuf, + struct lnet_handle_md *mdhp) +{ + struct lnet_processid id = { LNET_ANY_NID, LNET_PID_ANY }; + struct lnet_md md = { NULL }; + struct lnet_me *me; + int rc; + + me = LNetMEAttach(LNET_RESERVED_PORTAL, &id, + LNET_PROTO_PING_MATCHBITS, 0, + LNET_UNLINK, LNET_INS_AFTER); + if (IS_ERR(me)) { + rc = PTR_ERR(me); + CERROR("Can't create push target ME: %d\n", rc); + return rc; + } + + pbuf->pb_needs_post = false; + + /* This reference is dropped by lnet_push_target_event_handler() */ + lnet_ping_buffer_addref(pbuf); + + /* initialize md content */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(pbuf->pb_nnis); + md.threshold = 1; + md.max_size = 0; + md.options = LNET_MD_OP_PUT | LNET_MD_TRUNCATE; + md.user_ptr = pbuf; + md.handler = the_lnet.ln_push_target_handler; + + rc = LNetMDAttach(me, &md, LNET_UNLINK, mdhp); + if (rc) { + CERROR("Can't attach push MD: %d\n", rc); + lnet_ping_buffer_decref(pbuf); + pbuf->pb_needs_post = true; + return rc; + } + + CDEBUG(D_NET, "posted push target %p\n", pbuf); + + return 0; +} + +static void lnet_push_target_event_handler(struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf = ev->md_user_ptr; + + CDEBUG(D_NET, "type %d status %d unlinked %d\n", ev->type, ev->status, + ev->unlinked); + + if 
(pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(pbuf); + + if (ev->type == LNET_EVENT_UNLINK) { + /* Drop ref added by lnet_push_target_post() */ + lnet_ping_buffer_decref(pbuf); + return; + } + + lnet_peer_push_event(ev); + if (ev->unlinked) + /* Drop ref added by lnet_push_target_post */ + lnet_ping_buffer_decref(pbuf); +} + +/* Initialize the push target. */ +static int lnet_push_target_init(void) +{ + int rc; + + if (the_lnet.ln_push_target) + return -EALREADY; + + the_lnet.ln_push_target_handler = + lnet_push_target_event_handler; + + rc = LNetSetLazyPortal(LNET_RESERVED_PORTAL); + LASSERT(rc == 0); + + /* Start at the required minimum, we'll enlarge if required. */ + the_lnet.ln_push_target_nnis = LNET_INTERFACES_MIN; + + rc = lnet_push_target_resize(); + + if (rc) { + LNetClearLazyPortal(LNET_RESERVED_PORTAL); + the_lnet.ln_push_target_handler = NULL; + } + + return rc; +} + +/* Clean up the push target. */ +static void lnet_push_target_fini(void) +{ + if (!the_lnet.ln_push_target) + return; + + /* Unlink and invalidate to prevent new references. */ + LNetMDUnlink(the_lnet.ln_push_target_md); + LNetInvalidateMDHandle(&the_lnet.ln_push_target_md); + + /* Wait for the unlink to complete. */ + wait_var_event_warning(&the_lnet.ln_push_target->pb_refcnt, + atomic_read(&the_lnet.ln_push_target->pb_refcnt) <= 1, + "Still waiting for ping data MD to unlink\n"); + + /* Drop ref set by lnet_ping_buffer_alloc() */ + lnet_ping_buffer_decref(the_lnet.ln_push_target); + the_lnet.ln_push_target = NULL; + the_lnet.ln_push_target_nnis = 0; + + LNetClearLazyPortal(LNET_RESERVED_PORTAL); + lnet_assert_handler_unused(the_lnet.ln_push_target_handler); + the_lnet.ln_push_target_handler = NULL; +} + +static int +lnet_ni_tq_credits(struct lnet_ni *ni) +{ + int credits; + + LASSERT(ni->ni_ncpts >= 1); + + if (ni->ni_ncpts == 1) + return ni->ni_net->net_tunables.lct_max_tx_credits; + + credits = ni->ni_net->net_tunables.lct_max_tx_credits / ni->ni_ncpts; + credits = max(credits, 8 * ni->ni_net->net_tunables.lct_peer_tx_credits); + credits = min(credits, ni->ni_net->net_tunables.lct_max_tx_credits); + + return credits; +} + +static void +lnet_ni_unlink_locked(struct lnet_ni *ni) +{ + /* move it to zombie list and nobody can find it anymore */ + LASSERT(!list_empty(&ni->ni_netlist)); + list_move(&ni->ni_netlist, &ni->ni_net->net_ni_zombie); + lnet_ni_decref_locked(ni, 0); +} + +static void +lnet_clear_zombies_nis_locked(struct lnet_net *net) +{ + int i; + int islo; + struct lnet_ni *ni; + struct list_head *zombie_list = &net->net_ni_zombie; + + /* + * Now wait for the NIs I just nuked to show up on the zombie + * list and shut them down in guaranteed thread context + */ + i = 2; + while (!list_empty(zombie_list)) { + int *ref; + int j; + + ni = list_entry(zombie_list->next, + struct lnet_ni, ni_netlist); + list_del_init(&ni->ni_netlist); + /* the ni should be in deleting state. 
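lnet_ni_tq_credits() above splits the NI's transmit credits across its CPTs, but never lets a single queue fall below eight times the per-peer credits and never above the NI total. A couple of worked numbers (the tunable values are invented) make the clamping easier to see:

/* Worked example of the per-CPT credit split in lnet_ni_tq_credits(). */
#include <stdio.h>

static int tq_credits(int max_tx_credits, int peer_tx_credits, int ncpts)
{
        int credits;

        if (ncpts == 1)
                return max_tx_credits;

        credits = max_tx_credits / ncpts;
        if (credits < 8 * peer_tx_credits)      /* max(credits, 8 * peer credits) */
                credits = 8 * peer_tx_credits;
        if (credits > max_tx_credits)           /* but never above the NI total */
                credits = max_tx_credits;
        return credits;
}

int main(void)
{
        /* plenty of credits: the plain division wins -> 256 per queue */
        printf("%d\n", tq_credits(1024, 8, 4));
        /* small NI: 64/4 = 16 is raised to 8*16 = 128, then capped at 64 */
        printf("%d\n", tq_credits(64, 16, 4));
        return 0;
}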
If it's not it's + * a bug */ + LASSERT(ni->ni_state == LNET_NI_STATE_DELETING); + cfs_percpt_for_each(ref, j, ni->ni_refs) { + if (*ref == 0) + continue; + /* still busy, add it back to zombie list */ + list_add(&ni->ni_netlist, zombie_list); + break; + } + + if (!list_empty(&ni->ni_netlist)) { + /* Unlock mutex while waiting to allow other + * threads to read the LNet state and fall through + * to avoid deadlock + */ + lnet_net_unlock(LNET_LOCK_EX); + mutex_unlock(&the_lnet.ln_api_mutex); + + ++i; + if ((i & (-i)) == i) { + CDEBUG(D_WARNING, + "Waiting for zombie LNI %s\n", + libcfs_nidstr(&ni->ni_nid)); + } + schedule_timeout_uninterruptible(cfs_time_seconds(1)); + + mutex_lock(&the_lnet.ln_api_mutex); + lnet_net_lock(LNET_LOCK_EX); + continue; + } + + lnet_net_unlock(LNET_LOCK_EX); + + islo = ni->ni_net->net_lnd->lnd_type == LOLND; + + LASSERT(!in_interrupt()); + /* Holding the LND mutex makes it safe for lnd_shutdown + * to call module_put(). Module unload cannot finish + * until lnet_unregister_lnd() completes, and that + * requires the LND mutex. + */ + mutex_unlock(&the_lnet.ln_api_mutex); + mutex_lock(&the_lnet.ln_lnd_mutex); + (net->net_lnd->lnd_shutdown)(ni); + mutex_unlock(&the_lnet.ln_lnd_mutex); + mutex_lock(&the_lnet.ln_api_mutex); + + if (!islo) + CDEBUG(D_LNI, "Removed LNI %s\n", + libcfs_nidstr(&ni->ni_nid)); + + lnet_ni_free(ni); + i = 2; + lnet_net_lock(LNET_LOCK_EX); + } +} + +/* shutdown down the NI and release refcount */ +static void +lnet_shutdown_lndni(struct lnet_ni *ni) +{ + int i; + struct lnet_net *net = ni->ni_net; + + lnet_net_lock(LNET_LOCK_EX); + lnet_ni_lock(ni); + ni->ni_state = LNET_NI_STATE_DELETING; + lnet_ni_unlock(ni); + lnet_ni_unlink_locked(ni); + lnet_incr_dlc_seq(); + lnet_net_unlock(LNET_LOCK_EX); + + /* clear messages for this NI on the lazy portal */ + for (i = 0; i < the_lnet.ln_nportals; i++) + lnet_clear_lazy_portal(ni, i, "Shutting down NI"); + + lnet_net_lock(LNET_LOCK_EX); + lnet_clear_zombies_nis_locked(net); + lnet_net_unlock(LNET_LOCK_EX); +} + +static void +lnet_shutdown_lndnet(struct lnet_net *net) +{ + struct lnet_ni *ni; + + lnet_net_lock(LNET_LOCK_EX); + + list_del_init(&net->net_list); + + while (!list_empty(&net->net_ni_list)) { + ni = list_entry(net->net_ni_list.next, + struct lnet_ni, ni_netlist); + lnet_net_unlock(LNET_LOCK_EX); + lnet_shutdown_lndni(ni); + lnet_net_lock(LNET_LOCK_EX); + } + + lnet_net_unlock(LNET_LOCK_EX); + + /* Do peer table cleanup for this net */ + lnet_peer_tables_cleanup(net); + + lnet_net_free(net); +} + +static void +lnet_shutdown_lndnets(void) +{ + struct lnet_net *net; + LIST_HEAD(resend); + struct lnet_msg *msg, *tmp; + + /* NB called holding the global mutex */ + + /* All quiet on the API front */ + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING || + the_lnet.ln_state == LNET_STATE_STOPPING); + LASSERT(the_lnet.ln_refcount == 0); + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_STOPPING; + + /* + * move the nets to the zombie list to avoid them being + * picked up for new work. LONET is also included in the + * Nets that will be moved to the zombie list + */ + list_splice_init(&the_lnet.ln_nets, &the_lnet.ln_net_zombie); + + /* Drop the cached loopback Net. 
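The zombie-NI wait loop above throttles its "Waiting for zombie LNI" message with the test (i & (-i)) == i, which holds exactly when the iteration counter is a power of two, so a stuck NI is reported with exponentially growing gaps instead of once per one-second sleep. A quick standalone check of that predicate:

/* (i & -i) isolates the lowest set bit; it equals i only for powers of two. */
#include <stdio.h>

static int is_pow2(int i)
{
        return (i & (-i)) == i;
}

int main(void)
{
        for (int i = 1; i <= 20; i++)
                if (is_pow2(i))
                        printf("would log on iteration %d\n", i);
        /* prints 1, 2, 4, 8, 16: each message waits twice as long as the last */
        return 0;
}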
*/ + if (the_lnet.ln_loni != NULL) { + lnet_ni_decref_locked(the_lnet.ln_loni, 0); + the_lnet.ln_loni = NULL; + } + lnet_net_unlock(LNET_LOCK_EX); + + /* iterate through the net zombie list and delete each net */ + while (!list_empty(&the_lnet.ln_net_zombie)) { + net = list_entry(the_lnet.ln_net_zombie.next, + struct lnet_net, net_list); + lnet_shutdown_lndnet(net); + } + + spin_lock(&the_lnet.ln_msg_resend_lock); + list_splice(&the_lnet.ln_msg_resend, &resend); + spin_unlock(&the_lnet.ln_msg_resend_lock); + + list_for_each_entry_safe(msg, tmp, &resend, msg_list) { + list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + lnet_finalize(msg, -ECANCELED); + } + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); +} + +static int +lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun) +{ + int rc = -EINVAL; + struct lnet_tx_queue *tq; + int i; + struct lnet_net *net = ni->ni_net; + + mutex_lock(&the_lnet.ln_lnd_mutex); + + if (tun) { + memcpy(&ni->ni_lnd_tunables, tun, sizeof(*tun)); + ni->ni_lnd_tunables_set = true; + } + + rc = (net->net_lnd->lnd_startup)(ni); + + mutex_unlock(&the_lnet.ln_lnd_mutex); + + if (rc != 0) { + LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s\n", + rc, libcfs_lnd2str(net->net_lnd->lnd_type)); + goto failed0; + } + + lnet_ni_lock(ni); + ni->ni_state = LNET_NI_STATE_ACTIVE; + lnet_ni_unlock(ni); + + /* We keep a reference on the loopback net through the loopback NI */ + if (net->net_lnd->lnd_type == LOLND) { + lnet_ni_addref(ni); + LASSERT(the_lnet.ln_loni == NULL); + the_lnet.ln_loni = ni; + ni->ni_net->net_tunables.lct_peer_tx_credits = 0; + ni->ni_net->net_tunables.lct_peer_rtr_credits = 0; + ni->ni_net->net_tunables.lct_max_tx_credits = 0; + ni->ni_net->net_tunables.lct_peer_timeout = 0; + return 0; + } + + if (ni->ni_net->net_tunables.lct_peer_tx_credits == 0 || + ni->ni_net->net_tunables.lct_max_tx_credits == 0) { + LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n", + libcfs_lnd2str(net->net_lnd->lnd_type), + ni->ni_net->net_tunables.lct_peer_tx_credits == 0 ? + "" : "per-peer "); + /* shutdown the NI since if we get here then it must've already + * been started + */ + lnet_shutdown_lndni(ni); + return -EINVAL; + } + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + tq->tq_credits_min = + tq->tq_credits_max = + tq->tq_credits = lnet_ni_tq_credits(ni); + } + + atomic_set(&ni->ni_tx_credits, + lnet_ni_tq_credits(ni) * ni->ni_ncpts); + atomic_set(&ni->ni_healthv, LNET_MAX_HEALTH_VALUE); + + CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", + libcfs_nidstr(&ni->ni_nid), + ni->ni_net->net_tunables.lct_peer_tx_credits, + lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER, + ni->ni_net->net_tunables.lct_peer_rtr_credits, + ni->ni_net->net_tunables.lct_peer_timeout); + + return 0; +failed0: + lnet_ni_free(ni); + return rc; +} + +static int +lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun) +{ + struct lnet_ni *ni; + struct lnet_net *net_l = NULL; + LIST_HEAD(local_ni_list); + int rc; + int ni_count = 0; + __u32 lnd_type; + const struct lnet_lnd *lnd; + int peer_timeout = + net->net_tunables.lct_peer_timeout; + int maxtxcredits = + net->net_tunables.lct_max_tx_credits; + int peerrtrcredits = + net->net_tunables.lct_peer_rtr_credits; + + /* + * make sure that this net is unique. If it isn't then + * we are adding interfaces to an already existing network, and + * 'net' is just a convenient way to pass in the list. 
+ * if it is unique we need to find the LND and load it if + * necessary. + */ + if (lnet_net_unique(net->net_id, &the_lnet.ln_nets, &net_l)) { + lnd_type = LNET_NETTYP(net->net_id); + + mutex_lock(&the_lnet.ln_lnd_mutex); + lnd = lnet_find_lnd_by_type(lnd_type); + + if (lnd == NULL) { + mutex_unlock(&the_lnet.ln_lnd_mutex); + rc = request_module("%s", libcfs_lnd2modname(lnd_type)); + mutex_lock(&the_lnet.ln_lnd_mutex); + + lnd = lnet_find_lnd_by_type(lnd_type); + if (lnd == NULL) { + mutex_unlock(&the_lnet.ln_lnd_mutex); + CERROR("Can't load LND %s, module %s, rc=%d\n", + libcfs_lnd2str(lnd_type), + libcfs_lnd2modname(lnd_type), rc); +#ifndef HAVE_MODULE_LOADING_SUPPORT + LCONSOLE_ERROR_MSG(0x104, "Your kernel must be " + "compiled with kernel module " + "loading support."); +#endif + rc = -EINVAL; + goto failed0; + } + } + + net->net_lnd = lnd; + + mutex_unlock(&the_lnet.ln_lnd_mutex); + + net_l = net; + } + + /* + * net_l: if the network being added is unique then net_l + * will point to that network + * if the network being added is not unique then + * net_l points to the existing network. + * + * When we enter the loop below, we'll pick NIs off he + * network beign added and start them up, then add them to + * a local ni list. Once we've successfully started all + * the NIs then we join the local NI list (of started up + * networks) with the net_l->net_ni_list, which should + * point to the correct network to add the new ni list to + * + * If any of the new NIs fail to start up, then we want to + * iterate through the local ni list, which should include + * any NIs which were successfully started up, and shut + * them down. + * + * After than we want to delete the network being added, + * to avoid a memory leak. + */ + while (!list_empty(&net->net_ni_added)) { + ni = list_entry(net->net_ni_added.next, struct lnet_ni, + ni_netlist); + list_del_init(&ni->ni_netlist); + + /* make sure that the the NI we're about to start + * up is actually unique. if it's not fail. */ + if (!lnet_ni_unique_net(&net_l->net_ni_list, + ni->ni_interface)) { + rc = -EEXIST; + goto failed1; + } + + /* adjust the pointer the parent network, just in case it + * the net is a duplicate */ + ni->ni_net = net_l; + + rc = lnet_startup_lndni(ni, tun); + + if (rc < 0) + goto failed1; + + lnet_ni_addref(ni); + list_add_tail(&ni->ni_netlist, &local_ni_list); + + ni_count++; + } + + lnet_net_lock(LNET_LOCK_EX); + list_splice_tail(&local_ni_list, &net_l->net_ni_list); + lnet_incr_dlc_seq(); + lnet_net_unlock(LNET_LOCK_EX); + + /* if the network is not unique then we don't want to keep + * it around after we're done. Free it. Otherwise add that + * net to the global the_lnet.ln_nets */ + if (net_l != net && net_l != NULL) { + /* + * TODO - note. 
currently the tunables can not be updated + * once added + */ + lnet_net_free(net); + } else { + /* + * restore tunables after it has been overwitten by the + * lnd + */ + if (peer_timeout != -1) + net->net_tunables.lct_peer_timeout = peer_timeout; + if (maxtxcredits != -1) + net->net_tunables.lct_max_tx_credits = maxtxcredits; + if (peerrtrcredits != -1) + net->net_tunables.lct_peer_rtr_credits = peerrtrcredits; + + lnet_net_lock(LNET_LOCK_EX); + list_add_tail(&net->net_list, &the_lnet.ln_nets); + lnet_net_unlock(LNET_LOCK_EX); + } + + return ni_count; + +failed1: + /* + * shutdown the new NIs that are being started up + * free the NET being started + */ + while (!list_empty(&local_ni_list)) { + ni = list_entry(local_ni_list.next, struct lnet_ni, + ni_netlist); + + lnet_shutdown_lndni(ni); + } + +failed0: + lnet_net_free(net); + + return rc; +} + +static int +lnet_startup_lndnets(struct list_head *netlist) +{ + struct lnet_net *net; + int rc; + int ni_count = 0; + + /* + * Change to running state before bringing up the LNDs. This + * allows lnet_shutdown_lndnets() to assert that we've passed + * through here. + */ + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_RUNNING; + lnet_net_unlock(LNET_LOCK_EX); + + while (!list_empty(netlist)) { + net = list_entry(netlist->next, struct lnet_net, net_list); + list_del_init(&net->net_list); + + rc = lnet_startup_lndnet(net, NULL); + + if (rc < 0) + goto failed; + + ni_count += rc; + } + + return ni_count; +failed: + lnet_shutdown_lndnets(); + + return rc; +} + +static int lnet_genl_parse_list(struct sk_buff *msg, + const struct ln_key_list *data[], u16 idx) +{ + const struct ln_key_list *list = data[idx]; + const struct ln_key_props *props; + struct nlattr *node; + u16 count; + + if (!list) + return 0; + + if (!list->lkl_maxattr) + return -ERANGE; + + props = list->lkl_list; + if (!props) + return -EINVAL; + + node = nla_nest_start(msg, LN_SCALAR_ATTR_LIST); + if (!node) + return -ENOBUFS; + + for (count = 1; count <= list->lkl_maxattr; count++) { + struct nlattr *key = nla_nest_start(msg, count); + + if (count == 1) + nla_put_u16(msg, LN_SCALAR_ATTR_LIST_SIZE, + list->lkl_maxattr); + + nla_put_u16(msg, LN_SCALAR_ATTR_INDEX, count); + if (props[count].lkp_value) + nla_put_string(msg, LN_SCALAR_ATTR_VALUE, + props[count].lkp_value); + if (props[count].lkp_key_format) + nla_put_u16(msg, LN_SCALAR_ATTR_KEY_FORMAT, + props[count].lkp_key_format); + nla_put_u16(msg, LN_SCALAR_ATTR_NLA_TYPE, + props[count].lkp_data_type); + if (props[count].lkp_data_type == NLA_NESTED) { + int rc; + + rc = lnet_genl_parse_list(msg, data, ++idx); + if (rc < 0) + return rc; + idx = rc; + } + + nla_nest_end(msg, key); + } + + nla_nest_end(msg, node); + return idx; +} + +int lnet_genl_send_scalar_list(struct sk_buff *msg, u32 portid, u32 seq, + const struct genl_family *family, int flags, + u8 cmd, const struct ln_key_list *data[]) +{ + int rc = 0; + void *hdr; + + if (!data[0]) + return -EINVAL; + + hdr = genlmsg_put(msg, portid, seq, family, flags, cmd); + if (!hdr) + GOTO(canceled, rc = -EMSGSIZE); + + rc = lnet_genl_parse_list(msg, data, 0); + if (rc < 0) + GOTO(canceled, rc); + + genlmsg_end(msg, hdr); +canceled: + if (rc < 0) + genlmsg_cancel(msg, hdr); + return rc > 0 ? 0 : rc; +} +EXPORT_SYMBOL(lnet_genl_send_scalar_list); + +/** + * Initialize LNet library. + * + * Automatically called at module loading time. Caller has to call + * lnet_lib_exit() after a call to lnet_lib_init(), if and only if the + * latter returned 0. 
It must be called exactly once. + * + * \retval 0 on success + * \retval -ve on failures. + */ +int lnet_lib_init(void) +{ + int rc; + + lnet_assert_wire_constants(); + + /* refer to global cfs_cpt_table for now */ + the_lnet.ln_cpt_table = cfs_cpt_tab; + the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_tab); + + LASSERT(the_lnet.ln_cpt_number > 0); + if (the_lnet.ln_cpt_number > LNET_CPT_MAX) { + /* we are under risk of consuming all lh_cookie */ + CERROR("Can't have %d CPTs for LNet (max allowed is %d), " + "please change setting of CPT-table and retry\n", + the_lnet.ln_cpt_number, LNET_CPT_MAX); + return -E2BIG; + } + + while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number) + the_lnet.ln_cpt_bits++; + + rc = lnet_create_locks(); + if (rc != 0) { + CERROR("Can't create LNet global locks: %d\n", rc); + return rc; + } + + the_lnet.ln_refcount = 0; + INIT_LIST_HEAD(&the_lnet.ln_net_zombie); + INIT_LIST_HEAD(&the_lnet.ln_msg_resend); + + /* The hash table size is the number of bits it takes to express the set + * ln_num_routes, minus 1 (better to under estimate than over so we + * don't waste memory). */ + if (rnet_htable_size <= 0) + rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; + else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX) + rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX; + the_lnet.ln_remote_nets_hbits = max_t(int, 1, + order_base_2(rnet_htable_size) - 1); + + /* All LNDs apart from the LOLND are in separate modules. They + * register themselves when their module loads, and unregister + * themselves when their module is unloaded. */ + lnet_register_lnd(&the_lolnd); + return 0; +} + +/** + * Finalize LNet library. + * + * \pre lnet_lib_init() called with success. + * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls. + * + * As this happens at module-unload, all lnds must already be unloaded, + * so they must already be unregistered. + */ +void lnet_lib_exit(void) +{ + int i; + + LASSERT(the_lnet.ln_refcount == 0); + lnet_unregister_lnd(&the_lolnd); + for (i = 0; i < NUM_LNDS; i++) + LASSERT(!the_lnet.ln_lnds[i]); + lnet_destroy_locks(); +} + +/** + * Set LNet PID and start LNet interfaces, routing, and forwarding. + * + * Users must call this function at least once before any other functions. + * For each successful call there must be a corresponding call to + * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is + * ignored. + * + * The PID used by LNet may be different from the one requested. + * See LNetGetId(). + * + * \param requested_pid PID requested by the caller. + * + * \return >= 0 on success, and < 0 error code on failures. 
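Both sizing computations in lnet_lib_init() above are ceil(log2()) calculations: ln_cpt_bits is grown until 1 << ln_cpt_bits covers every CPT, and the remote-nets hash uses one bit fewer than would cover rnet_htable_size (floored at 1), deliberately under-sizing the table as the comment says. A standalone sketch, assuming order_base_2(n) returns the smallest b with (1 << b) >= n:

/* ceil(log2()) sizing, mirroring the two loops in lnet_lib_init(). */
#include <stdio.h>

static int bits_for(int n)              /* smallest b with (1 << b) >= n */
{
        int b = 0;

        while ((1 << b) < n)
                b++;
        return b;
}

int main(void)
{
        int ncpts = 6, rnet_htable_size = 128;  /* example values only */
        int hbits = bits_for(rnet_htable_size) - 1;

        if (hbits < 1)
                hbits = 1;

        printf("cpt_bits=%d (covers %d CPTs)\n", bits_for(ncpts), ncpts);
        printf("remote nets hash: %d bits -> %d buckets\n", hbits, 1 << hbits);
        return 0;
}

With these inputs the sketch prints 3 CPT bits and a 64-bucket remote-nets hash for a requested size of 128, i.e. half the requested table.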
+ */ +int +LNetNIInit(lnet_pid_t requested_pid) +{ + int im_a_router = 0; + int rc; + int ni_count; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; + LIST_HEAD(net_head); + struct lnet_net *net; + + mutex_lock(&the_lnet.ln_api_mutex); + + CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount); + + if (the_lnet.ln_state == LNET_STATE_STOPPING) { + mutex_unlock(&the_lnet.ln_api_mutex); + return -ESHUTDOWN; + } + + if (the_lnet.ln_refcount > 0) { + rc = the_lnet.ln_refcount++; + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + rc = lnet_prepare(requested_pid); + if (rc != 0) { + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + /* create a network for Loopback network */ + net = lnet_net_alloc(LNET_MKNET(LOLND, 0), &net_head); + if (net == NULL) { + rc = -ENOMEM; + goto err_empty_list; + } + + /* Add in the loopback NI */ + if (lnet_ni_alloc(net, NULL, NULL) == NULL) { + rc = -ENOMEM; + goto err_empty_list; + } + + if (use_tcp_bonding) + CWARN("use_tcp_bonding has been removed. Use Multi-Rail and Dynamic Discovery instead, see LU-13641\n"); + + /* If LNet is being initialized via DLC it is possible + * that the user requests not to load module parameters (ones which + * are supported by DLC) on initialization. Therefore, make sure not + * to load networks, routes and forwarding from module parameters + * in this case. On cleanup in case of failure only clean up + * routes if it has been loaded */ + if (!the_lnet.ln_nis_from_mod_params) { + rc = lnet_parse_networks(&net_head, lnet_get_networks()); + if (rc < 0) + goto err_empty_list; + } + + ni_count = lnet_startup_lndnets(&net_head); + if (ni_count < 0) { + rc = ni_count; + goto err_empty_list; + } + + if (!the_lnet.ln_nis_from_mod_params) { + rc = lnet_parse_routes(lnet_get_routes(), &im_a_router); + if (rc != 0) + goto err_shutdown_lndnis; + + rc = lnet_rtrpools_alloc(im_a_router); + if (rc != 0) + goto err_destroy_routes; + } + + rc = lnet_acceptor_start(); + if (rc != 0) + goto err_destroy_routes; + + the_lnet.ln_refcount = 1; + /* Now I may use my own API functions... */ + + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, ni_count, true); + if (rc != 0) + goto err_acceptor_stop; + + lnet_ping_target_update(pbuf, ping_mdh); + + the_lnet.ln_mt_handler = lnet_mt_event_handler; + + rc = lnet_push_target_init(); + if (rc != 0) + goto err_stop_ping; + + rc = lnet_peer_discovery_start(); + if (rc != 0) + goto err_destroy_push_target; + + rc = lnet_monitor_thr_start(); + if (rc != 0) + goto err_stop_discovery_thr; + + lnet_fault_init(); + lnet_router_debugfs_init(); + + mutex_unlock(&the_lnet.ln_api_mutex); + + complete_all(&the_lnet.ln_started); + + /* wait for all routers to start */ + lnet_wait_router_start(); + + return 0; + +err_stop_discovery_thr: + lnet_peer_discovery_stop(); +err_destroy_push_target: + lnet_push_target_fini(); +err_stop_ping: + lnet_ping_target_fini(); +err_acceptor_stop: + the_lnet.ln_refcount = 0; + lnet_acceptor_stop(); +err_destroy_routes: + if (!the_lnet.ln_nis_from_mod_params) + lnet_destroy_routes(); +err_shutdown_lndnis: + lnet_shutdown_lndnets(); +err_empty_list: + lnet_unprepare(); + LASSERT(rc < 0); + mutex_unlock(&the_lnet.ln_api_mutex); + while (!list_empty(&net_head)) { + struct lnet_net *net; + + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + lnet_net_free(net); + } + return rc; +} +EXPORT_SYMBOL(LNetNIInit); + +/** + * Stop LNet interfaces, routing, and forwarding. 
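LNetNIInit() above and LNetNIFini() below form a reference-counted pair under ln_api_mutex: only the first init performs the real bring-up and only the last fini tears it down, while intermediate calls merely adjust ln_refcount. A minimal sketch of that contract, not the LNet implementation (the names are invented):

/* Reference-counted init/fini pair guarded by a single mutex. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t api_mutex = PTHREAD_MUTEX_INITIALIZER;
static int refcount;

static int subsys_init(void)
{
        pthread_mutex_lock(&api_mutex);
        if (refcount > 0) {
                refcount++;             /* already up: just take a reference */
        } else {
                /* ... real bring-up would happen here ... */
                refcount = 1;
        }
        pthread_mutex_unlock(&api_mutex);
        return 0;
}

static void subsys_fini(void)
{
        pthread_mutex_lock(&api_mutex);
        if (refcount > 1) {
                refcount--;             /* other users remain */
        } else {
                /* ... real teardown, in reverse order of bring-up ... */
                refcount = 0;
        }
        pthread_mutex_unlock(&api_mutex);
}

int main(void)
{
        subsys_init();
        subsys_init();                  /* second user: no re-initialization */
        subsys_fini();
        subsys_fini();                  /* last user: teardown happens here */
        printf("refcount=%d\n", refcount);
        return 0;
}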
+ * + * Users must call this function once for each successful call to LNetNIInit(). + * Once the LNetNIFini() operation has been started, the results of pending + * API operations are undefined. + * + * \return always 0 for current implementation. + */ +int +LNetNIFini(void) +{ + mutex_lock(&the_lnet.ln_api_mutex); + + LASSERT(the_lnet.ln_refcount > 0); + + if (the_lnet.ln_refcount != 1) { + the_lnet.ln_refcount--; + } else { + LASSERT(!the_lnet.ln_niinit_self); + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); + + lnet_fault_fini(); + + lnet_router_debugfs_fini(); + lnet_monitor_thr_stop(); + lnet_peer_discovery_stop(); + lnet_push_target_fini(); + lnet_ping_target_fini(); + + /* Teardown fns that use my own API functions BEFORE here */ + the_lnet.ln_refcount = 0; + + lnet_acceptor_stop(); + lnet_destroy_routes(); + lnet_shutdown_lndnets(); + lnet_unprepare(); + } + + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; +} +EXPORT_SYMBOL(LNetNIFini); + +/** + * Grabs the ni data from the ni structure and fills the out + * parameters + * + * \param[in] ni network interface structure + * \param[out] cfg_ni NI config information + * \param[out] tun network and LND tunables + */ +static void +lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_ni *cfg_ni, + struct lnet_ioctl_config_lnd_tunables *tun, + struct lnet_ioctl_element_stats *stats, + __u32 tun_size) +{ + size_t min_size = 0; + int i; + + if (!ni || !cfg_ni || !tun || !nid_is_nid4(&ni->ni_nid)) + return; + + if (ni->ni_interface != NULL) { + strncpy(cfg_ni->lic_ni_intf, + ni->ni_interface, + sizeof(cfg_ni->lic_ni_intf)); + } + + cfg_ni->lic_nid = lnet_nid_to_nid4(&ni->ni_nid); + cfg_ni->lic_status = lnet_ni_get_status_locked(ni); + cfg_ni->lic_dev_cpt = ni->ni_dev_cpt; + + memcpy(&tun->lt_cmn, &ni->ni_net->net_tunables, sizeof(tun->lt_cmn)); + + if (stats) { + stats->iel_send_count = lnet_sum_stats(&ni->ni_stats, + LNET_STATS_TYPE_SEND); + stats->iel_recv_count = lnet_sum_stats(&ni->ni_stats, + LNET_STATS_TYPE_RECV); + stats->iel_drop_count = lnet_sum_stats(&ni->ni_stats, + LNET_STATS_TYPE_DROP); + } + + /* + * tun->lt_tun will always be present, but in order to be + * backwards compatible, we need to deal with the cases when + * tun->lt_tun is smaller than what the kernel has, because it + * comes from an older version of a userspace program, then we'll + * need to copy as much information as we have available space. + */ + min_size = tun_size - sizeof(tun->lt_cmn); + memcpy(&tun->lt_tun, &ni->ni_lnd_tunables, min_size); + + /* copy over the cpts */ + if (ni->ni_ncpts == LNET_CPT_NUMBER && + ni->ni_cpts == NULL) { + for (i = 0; i < ni->ni_ncpts; i++) + cfg_ni->lic_cpts[i] = i; + } else { + for (i = 0; + ni->ni_cpts != NULL && i < ni->ni_ncpts && + i < LNET_MAX_SHOW_NUM_CPT; + i++) + cfg_ni->lic_cpts[i] = ni->ni_cpts[i]; + } + cfg_ni->lic_ncpts = ni->ni_ncpts; +} + +/** + * NOTE: This is a legacy function left in the code to be backwards + * compatible with older userspace programs. It should eventually be + * removed. 
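lnet_fill_ni_info() above and the legacy variant that follows copy tunables into a buffer whose layout may come from an older userspace tool, so only as many bytes as the caller's structure can hold are copied. A generic sketch of that struct-prefix compatibility idiom; the field names and sizes are made up and it assumes the usual no-padding layout for consecutive int32_t members:

/* Copy only the prefix that both sides of the ABI agree on. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct tunables_v2 {            /* what this kernel knows about */
        int32_t peer_timeout;
        int32_t peer_credits;
        int32_t conns_per_peer; /* field an old userspace doesn't have */
};

/* Copy no more than the caller said its buffer can hold. */
static size_t copy_prefix(void *dst, size_t dst_size,
                          const void *src, size_t src_size)
{
        size_t n = dst_size < src_size ? dst_size : src_size;

        memcpy(dst, src, n);
        return n;
}

int main(void)
{
        struct tunables_v2 kern = { 180, 8, 4 };
        int32_t old_userspace[2];       /* v1 layout: only the first two fields */

        size_t n = copy_prefix(old_userspace, sizeof(old_userspace),
                               &kern, sizeof(kern));
        printf("copied %zu bytes: timeout=%d credits=%d\n",
               n, old_userspace[0], old_userspace[1]);
        return 0;
}

The newer field is simply never copied to the old caller, and (as in the legacy path) the reported length can be trimmed so userspace knows the kernel had less data than it asked for.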
+ * + * Grabs the ni data from the ni structure and fills the out + * parameters + * + * \param[in] ni network interface structure + * \param[out] config config information + */ +static void +lnet_fill_ni_info_legacy(struct lnet_ni *ni, + struct lnet_ioctl_config_data *config) +{ + struct lnet_ioctl_net_config *net_config; + struct lnet_ioctl_config_lnd_tunables *lnd_cfg = NULL; + size_t min_size, tunable_size = 0; + int i; + + if (!ni || !config || !nid_is_nid4(&ni->ni_nid)) + return; + + net_config = (struct lnet_ioctl_net_config *) config->cfg_bulk; + if (!net_config) + return; + + if (!ni->ni_interface) + return; + + strncpy(net_config->ni_interface, + ni->ni_interface, + sizeof(net_config->ni_interface)); + + config->cfg_nid = lnet_nid_to_nid4(&ni->ni_nid); + config->cfg_config_u.cfg_net.net_peer_timeout = + ni->ni_net->net_tunables.lct_peer_timeout; + config->cfg_config_u.cfg_net.net_max_tx_credits = + ni->ni_net->net_tunables.lct_max_tx_credits; + config->cfg_config_u.cfg_net.net_peer_tx_credits = + ni->ni_net->net_tunables.lct_peer_tx_credits; + config->cfg_config_u.cfg_net.net_peer_rtr_credits = + ni->ni_net->net_tunables.lct_peer_rtr_credits; + + net_config->ni_status = lnet_ni_get_status_locked(ni); + + if (ni->ni_cpts) { + int num_cpts = min(ni->ni_ncpts, LNET_MAX_SHOW_NUM_CPT); + + for (i = 0; i < num_cpts; i++) + net_config->ni_cpts[i] = ni->ni_cpts[i]; + + config->cfg_ncpts = num_cpts; + } + + /* + * See if user land tools sent in a newer and larger version + * of struct lnet_tunables than what the kernel uses. + */ + min_size = sizeof(*config) + sizeof(*net_config); + + if (config->cfg_hdr.ioc_len > min_size) + tunable_size = config->cfg_hdr.ioc_len - min_size; + + /* Don't copy too much data to user space */ + min_size = min(tunable_size, sizeof(ni->ni_lnd_tunables)); + lnd_cfg = (struct lnet_ioctl_config_lnd_tunables *)net_config->cfg_bulk; + + if (lnd_cfg && min_size) { + memcpy(&lnd_cfg->lt_tun, &ni->ni_lnd_tunables, min_size); + config->cfg_config_u.cfg_net.net_interface_count = 1; + + /* Tell user land that kernel side has less data */ + if (tunable_size > sizeof(ni->ni_lnd_tunables)) { + min_size = tunable_size - sizeof(ni->ni_lnd_tunables); + config->cfg_hdr.ioc_len -= min_size; + } + } +} + +struct lnet_ni * +lnet_get_ni_idx_locked(int idx) +{ + struct lnet_ni *ni; + struct lnet_net *net; + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (idx-- == 0) + return ni; + } + } + + return NULL; +} + +int lnet_get_net_healthv_locked(struct lnet_net *net) +{ + struct lnet_ni *ni; + int best_healthv = 0; + int healthv, ni_fatal; + + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + healthv = atomic_read(&ni->ni_healthv); + ni_fatal = atomic_read(&ni->ni_fatal_error_on); + if (!ni_fatal && healthv > best_healthv) + best_healthv = healthv; + } + + return best_healthv; +} + +struct lnet_ni * +lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev) +{ + struct lnet_ni *ni; + struct lnet_net *net = mynet; + + /* + * It is possible that the net has been cleaned out while there is + * a message being sent. 
This function accessed the net without + * checking if the list is empty + */ + if (prev == NULL) { + if (net == NULL) + net = list_entry(the_lnet.ln_nets.next, struct lnet_net, + net_list); + if (list_empty(&net->net_ni_list)) + return NULL; + ni = list_entry(net->net_ni_list.next, struct lnet_ni, + ni_netlist); + + return ni; + } + + if (prev->ni_netlist.next == &prev->ni_net->net_ni_list) { + /* if you reached the end of the ni list and the net is + * specified, then there are no more nis in that net */ + if (net != NULL) + return NULL; + + /* we reached the end of this net ni list. move to the + * next net */ + if (prev->ni_net->net_list.next == &the_lnet.ln_nets) + /* no more nets and no more NIs. */ + return NULL; + + /* get the next net */ + net = list_entry(prev->ni_net->net_list.next, struct lnet_net, + net_list); + if (list_empty(&net->net_ni_list)) + return NULL; + /* get the ni on it */ + ni = list_entry(net->net_ni_list.next, struct lnet_ni, + ni_netlist); + + return ni; + } + + if (list_empty(&prev->ni_netlist)) + return NULL; + + /* there are more nis left */ + ni = list_entry(prev->ni_netlist.next, struct lnet_ni, ni_netlist); + + return ni; +} + +int +lnet_get_net_config(struct lnet_ioctl_config_data *config) +{ + struct lnet_ni *ni; + int cpt; + int rc = -ENOENT; + int idx = config->cfg_count; + + cpt = lnet_net_lock_current(); + + ni = lnet_get_ni_idx_locked(idx); + + if (ni != NULL) { + rc = 0; + lnet_ni_lock(ni); + lnet_fill_ni_info_legacy(ni, config); + lnet_ni_unlock(ni); + } + + lnet_net_unlock(cpt); + return rc; +} + +int +lnet_get_ni_config(struct lnet_ioctl_config_ni *cfg_ni, + struct lnet_ioctl_config_lnd_tunables *tun, + struct lnet_ioctl_element_stats *stats, + __u32 tun_size) +{ + struct lnet_ni *ni; + int cpt; + int rc = -ENOENT; + + if (!cfg_ni || !tun || !stats) + return -EINVAL; + + cpt = lnet_net_lock_current(); + + ni = lnet_get_ni_idx_locked(cfg_ni->lic_idx); + + if (ni) { + rc = 0; + lnet_ni_lock(ni); + lnet_fill_ni_info(ni, cfg_ni, tun, stats, tun_size); + lnet_ni_unlock(ni); + } + + lnet_net_unlock(cpt); + return rc; +} + +int lnet_get_ni_stats(struct lnet_ioctl_element_msg_stats *msg_stats) +{ + struct lnet_ni *ni; + int cpt; + int rc = -ENOENT; + + if (!msg_stats) + return -EINVAL; + + cpt = lnet_net_lock_current(); + + ni = lnet_get_ni_idx_locked(msg_stats->im_idx); + + if (ni) { + lnet_usr_translate_stats(msg_stats, &ni->ni_stats); + rc = 0; + } + + lnet_net_unlock(cpt); + + return rc; +} + +static int lnet_add_net_common(struct lnet_net *net, + struct lnet_ioctl_config_lnd_tunables *tun) +{ + struct lnet_handle_md ping_mdh; + struct lnet_ping_buffer *pbuf; + struct lnet_remotenet *rnet; + struct lnet_ni *ni; + int net_ni_count; + __u32 net_id; + int rc; + + lnet_net_lock(LNET_LOCK_EX); + rnet = lnet_find_rnet_locked(net->net_id); + lnet_net_unlock(LNET_LOCK_EX); + /* + * make sure that the net added doesn't invalidate the current + * configuration LNet is keeping + */ + if (rnet) { + CERROR("Adding net %s will invalidate routing configuration\n", + libcfs_net2str(net->net_id)); + lnet_net_free(net); + return -EUSERS; + } + + /* + * make sure you calculate the correct number of slots in the ping + * buffer. Since the ping info is a flattened list of all the NIs, + * we should allocate enough slots to accomodate the number of NIs + * which will be added. 
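The getters above (lnet_get_net_config(), lnet_get_ni_config(), lnet_get_ni_stats()) are driven by an index cursor: userspace repeats the ioctl with cfg_count / lic_idx / im_idx set to 0, 1, 2, ... until -ENOENT comes back, and lnet_get_ni_idx_locked() turns each index into the corresponding NI. A standalone sketch of that enumeration loop, with enumerate() standing in for the real ioctl:

/* Index-cursor enumeration: keep asking until -ENOENT. */
#include <errno.h>
#include <stdio.h>

#define NITEMS 3

static int enumerate(int idx, int *out) /* stand-in for the config ioctl */
{
        if (idx >= NITEMS)
                return -ENOENT;         /* no more interfaces */
        *out = 100 + idx;               /* pretend per-NI data */
        return 0;
}

int main(void)
{
        int value;

        for (int idx = 0; enumerate(idx, &value) == 0; idx++)
                printf("ni[%d] -> %d\n", idx, value);
        return 0;
}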
+ * + * since ni hasn't been configured yet, use + * lnet_get_net_ni_count_pre() which checks the net_ni_added list + */ + net_ni_count = lnet_get_net_ni_count_pre(net); + + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + net_ni_count + lnet_get_ni_count(), + false); + if (rc < 0) { + lnet_net_free(net); + return rc; + } + + if (tun) + memcpy(&net->net_tunables, + &tun->lt_cmn, sizeof(net->net_tunables)); + else + memset(&net->net_tunables, -1, sizeof(net->net_tunables)); + + net_id = net->net_id; + + rc = lnet_startup_lndnet(net, + (tun) ? &tun->lt_tun : NULL); + if (rc < 0) + goto failed; + + lnet_net_lock(LNET_LOCK_EX); + net = lnet_get_net_locked(net_id); + LASSERT(net); + + /* apply the UDSPs */ + rc = lnet_udsp_apply_policies_on_net(net); + if (rc) + CERROR("Failed to apply UDSPs on local net %s\n", + libcfs_net2str(net->net_id)); + + /* At this point we lost track of which NI was just added, so we + * just re-apply the policies on all of the NIs on this net + */ + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + rc = lnet_udsp_apply_policies_on_ni(ni); + if (rc) + CERROR("Failed to apply UDSPs on ni %s\n", + libcfs_nidstr(&ni->ni_nid)); + } + lnet_net_unlock(LNET_LOCK_EX); + + /* + * Start the acceptor thread if this is the first network + * being added that requires the thread. + */ + if (net->net_lnd->lnd_accept) { + rc = lnet_acceptor_start(); + if (rc < 0) { + /* shutdown the net that we just started */ + CERROR("Failed to start up acceptor thread\n"); + lnet_shutdown_lndnet(net); + goto failed; + } + } + + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_net_added(net); + lnet_net_unlock(LNET_LOCK_EX); + + lnet_ping_target_update(pbuf, ping_mdh); + + return 0; + +failed: + lnet_ping_md_unlink(pbuf, &ping_mdh); + lnet_ping_buffer_decref(pbuf); + return rc; +} + +static void +lnet_set_tune_defaults(struct lnet_ioctl_config_lnd_tunables *tun) +{ + if (tun) { + if (!tun->lt_cmn.lct_peer_timeout) + tun->lt_cmn.lct_peer_timeout = DEFAULT_PEER_TIMEOUT; + if (!tun->lt_cmn.lct_peer_tx_credits) + tun->lt_cmn.lct_peer_tx_credits = DEFAULT_PEER_CREDITS; + if (!tun->lt_cmn.lct_max_tx_credits) + tun->lt_cmn.lct_max_tx_credits = DEFAULT_CREDITS; + } +} + +static int lnet_handle_legacy_ip2nets(char *ip2nets, + struct lnet_ioctl_config_lnd_tunables *tun) +{ + struct lnet_net *net; + const char *nets; + int rc; + LIST_HEAD(net_head); + + rc = lnet_parse_ip2nets(&nets, ip2nets); + if (rc < 0) + return rc; + + rc = lnet_parse_networks(&net_head, nets); + if (rc < 0) + return rc; + + lnet_set_tune_defaults(tun); + + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out; + } + while (!list_empty(&net_head)) { + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + rc = lnet_add_net_common(net, tun); + if (rc < 0) + goto out; + } + +out: + mutex_unlock(&the_lnet.ln_api_mutex); + + while (!list_empty(&net_head)) { + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + lnet_net_free(net); + } + return rc; +} + +int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf) +{ + struct lnet_net *net; + struct lnet_ni *ni; + struct lnet_ioctl_config_lnd_tunables *tun = NULL; + int rc, i; + __u32 net_id, lnd_type; + + /* get the tunables if they are available */ + if (conf->lic_cfg_hdr.ioc_len >= + sizeof(*conf) + sizeof(*tun)) + tun = (struct lnet_ioctl_config_lnd_tunables *) + conf->lic_bulk; + + /* handle legacy ip2nets from DLC */ + if (conf->lic_legacy_ip2nets[0] 
!= '\0') + return lnet_handle_legacy_ip2nets(conf->lic_legacy_ip2nets, + tun); + + net_id = LNET_NIDNET(conf->lic_nid); + lnd_type = LNET_NETTYP(net_id); + + if (!libcfs_isknown_lnd(lnd_type)) { + CERROR("No valid net and lnd information provided\n"); + return -EINVAL; + } + + net = lnet_net_alloc(net_id, NULL); + if (!net) + return -ENOMEM; + + for (i = 0; i < conf->lic_ncpts; i++) { + if (conf->lic_cpts[i] >= LNET_CPT_NUMBER) + return -EINVAL; + } + + ni = lnet_ni_alloc_w_cpt_array(net, conf->lic_cpts, conf->lic_ncpts, + conf->lic_ni_intf); + if (!ni) + return -ENOMEM; + + lnet_set_tune_defaults(tun); + + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) + rc = -ESHUTDOWN; + else + rc = lnet_add_net_common(net, tun); + + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; +} + +int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) +{ + struct lnet_net *net; + struct lnet_ni *ni; + __u32 net_id = LNET_NIDNET(conf->lic_nid); + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; + int rc; + int net_count; + __u32 addr; + + /* don't allow userspace to shutdown the LOLND */ + if (LNET_NETTYP(net_id) == LOLND) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto unlock_api_mutex; + } + + lnet_net_lock(0); + + net = lnet_get_net_locked(net_id); + if (!net) { + CERROR("net %s not found\n", + libcfs_net2str(net_id)); + rc = -ENOENT; + goto unlock_net; + } + + addr = LNET_NIDADDR(conf->lic_nid); + if (addr == 0) { + /* remove the entire net */ + net_count = lnet_get_net_ni_count_locked(net); + + lnet_net_unlock(0); + + /* create and link a new ping info, before removing the old one */ + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + lnet_get_ni_count() - net_count, + false); + if (rc != 0) + goto unlock_api_mutex; + + lnet_shutdown_lndnet(net); + + lnet_acceptor_stop(); + + lnet_ping_target_update(pbuf, ping_mdh); + + goto unlock_api_mutex; + } + + ni = lnet_nid2ni_locked(conf->lic_nid, 0); + if (!ni) { + CERROR("nid %s not found\n", + libcfs_nid2str(conf->lic_nid)); + rc = -ENOENT; + goto unlock_net; + } + + net_count = lnet_get_net_ni_count_locked(net); + + lnet_net_unlock(0); + + /* create and link a new ping info, before removing the old one */ + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + lnet_get_ni_count() - 1, false); + if (rc != 0) + goto unlock_api_mutex; + + lnet_shutdown_lndni(ni); + + lnet_acceptor_stop(); + + lnet_ping_target_update(pbuf, ping_mdh); + + /* check if the net is empty and remove it if it is */ + if (net_count == 1) + lnet_shutdown_lndnet(net); + + goto unlock_api_mutex; + +unlock_net: + lnet_net_unlock(0); +unlock_api_mutex: + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; +} + +/* + * lnet_dyn_add_net and lnet_dyn_del_net are now deprecated. + * They are only expected to be called for unique networks. + * That can be as a result of older DLC library + * calls. Multi-Rail DLC and beyond no longer uses these APIs. + */ +int +lnet_dyn_add_net(struct lnet_ioctl_config_data *conf) +{ + struct lnet_net *net; + LIST_HEAD(net_head); + int rc; + struct lnet_ioctl_config_lnd_tunables tun; + const char *nets = conf->cfg_config_u.cfg_net.net_intf; + + /* Create a net/ni structures for the network string */ + rc = lnet_parse_networks(&net_head, nets); + if (rc <= 0) + return rc == 0 ? 
-EINVAL : rc; + + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out_unlock_clean; + } + + if (rc > 1) { + rc = -EINVAL; /* only add one network per call */ + goto out_unlock_clean; + } + + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + + LASSERT(lnet_net_unique(net->net_id, &the_lnet.ln_nets, NULL)); + + memset(&tun, 0, sizeof(tun)); + + tun.lt_cmn.lct_peer_timeout = + (!conf->cfg_config_u.cfg_net.net_peer_timeout) ? DEFAULT_PEER_TIMEOUT : + conf->cfg_config_u.cfg_net.net_peer_timeout; + tun.lt_cmn.lct_peer_tx_credits = + (!conf->cfg_config_u.cfg_net.net_peer_tx_credits) ? DEFAULT_PEER_CREDITS : + conf->cfg_config_u.cfg_net.net_peer_tx_credits; + tun.lt_cmn.lct_peer_rtr_credits = + conf->cfg_config_u.cfg_net.net_peer_rtr_credits; + tun.lt_cmn.lct_max_tx_credits = + (!conf->cfg_config_u.cfg_net.net_max_tx_credits) ? DEFAULT_CREDITS : + conf->cfg_config_u.cfg_net.net_max_tx_credits; + + rc = lnet_add_net_common(net, &tun); + +out_unlock_clean: + mutex_unlock(&the_lnet.ln_api_mutex); + while (!list_empty(&net_head)) { + /* net_head list is empty in success case */ + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + lnet_net_free(net); + } + return rc; +} + +int +lnet_dyn_del_net(__u32 net_id) +{ + struct lnet_net *net; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; + int rc; + int net_ni_count; + + /* don't allow userspace to shutdown the LOLND */ + if (LNET_NETTYP(net_id) == LOLND) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out; + } + + lnet_net_lock(0); + + net = lnet_get_net_locked(net_id); + if (net == NULL) { + lnet_net_unlock(0); + rc = -EINVAL; + goto out; + } + + net_ni_count = lnet_get_net_ni_count_locked(net); + + lnet_net_unlock(0); + + /* create and link a new ping info, before removing the old one */ + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + lnet_get_ni_count() - net_ni_count, false); + if (rc != 0) + goto out; + + lnet_shutdown_lndnet(net); + + lnet_acceptor_stop(); + + lnet_ping_target_update(pbuf, ping_mdh); + +out: + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; +} + +void lnet_incr_dlc_seq(void) +{ + atomic_inc(&lnet_dlc_seq_no); +} + +__u32 lnet_get_dlc_seq_locked(void) +{ + return atomic_read(&lnet_dlc_seq_no); +} + +static void +lnet_ni_set_healthv(lnet_nid_t nid, int value, bool all) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (all || (nid_is_nid4(&ni->ni_nid) && + lnet_nid_to_nid4(&ni->ni_nid) == nid)) { + atomic_set(&ni->ni_healthv, value); + if (list_empty(&ni->ni_recovery) && + value < LNET_MAX_HEALTH_VALUE) { + CERROR("manually adding local NI %s to recovery\n", + libcfs_nidstr(&ni->ni_nid)); + list_add_tail(&ni->ni_recovery, + &the_lnet.ln_mt_localNIRecovq); + lnet_ni_addref_locked(ni, 0); + } + if (!all) { + lnet_net_unlock(LNET_LOCK_EX); + return; + } + } + } + } + lnet_net_unlock(LNET_LOCK_EX); +} + +static void +lnet_ni_set_conns_per_peer(lnet_nid_t nid, int value, bool all) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (lnet_nid_to_nid4(&ni->ni_nid) != nid && !all) + 
continue; + if (LNET_NETTYP(net->net_id) == SOCKLND) + ni->ni_lnd_tunables.lnd_tun_u.lnd_sock.lnd_conns_per_peer = value; + else if (LNET_NETTYP(net->net_id) == O2IBLND) + ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib.lnd_conns_per_peer = value; + if (!all) { + lnet_net_unlock(LNET_LOCK_EX); + return; + } + } + } + lnet_net_unlock(LNET_LOCK_EX); +} + +static int +lnet_get_local_ni_hstats(struct lnet_ioctl_local_ni_hstats *stats) +{ + int cpt, rc = 0; + struct lnet_ni *ni; + lnet_nid_t nid = stats->hlni_nid; + + cpt = lnet_net_lock_current(); + ni = lnet_nid2ni_locked(nid, cpt); + + if (!ni) { + rc = -ENOENT; + goto unlock; + } + + stats->hlni_local_interrupt = atomic_read(&ni->ni_hstats.hlt_local_interrupt); + stats->hlni_local_dropped = atomic_read(&ni->ni_hstats.hlt_local_dropped); + stats->hlni_local_aborted = atomic_read(&ni->ni_hstats.hlt_local_aborted); + stats->hlni_local_no_route = atomic_read(&ni->ni_hstats.hlt_local_no_route); + stats->hlni_local_timeout = atomic_read(&ni->ni_hstats.hlt_local_timeout); + stats->hlni_local_error = atomic_read(&ni->ni_hstats.hlt_local_error); + stats->hlni_fatal_error = atomic_read(&ni->ni_fatal_error_on); + stats->hlni_health_value = atomic_read(&ni->ni_healthv); + stats->hlni_ping_count = ni->ni_ping_count; + stats->hlni_next_ping = ni->ni_next_ping; + +unlock: + lnet_net_unlock(cpt); + + return rc; +} + +static int +lnet_get_local_ni_recovery_list(struct lnet_ioctl_recovery_list *list) +{ + struct lnet_ni *ni; + int i = 0; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(ni, &the_lnet.ln_mt_localNIRecovq, ni_recovery) { + if (!nid_is_nid4(&ni->ni_nid)) + continue; + list->rlst_nid_array[i] = lnet_nid_to_nid4(&ni->ni_nid); + i++; + if (i >= LNET_MAX_SHOW_NUM_NID) + break; + } + lnet_net_unlock(LNET_LOCK_EX); + list->rlst_num_nids = i; + + return 0; +} + +static int +lnet_get_peer_ni_recovery_list(struct lnet_ioctl_recovery_list *list) +{ + struct lnet_peer_ni *lpni; + int i = 0; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(lpni, &the_lnet.ln_mt_peerNIRecovq, lpni_recovery) { + list->rlst_nid_array[i] = lnet_nid_to_nid4(&lpni->lpni_nid); + i++; + if (i >= LNET_MAX_SHOW_NUM_NID) + break; + } + lnet_net_unlock(LNET_LOCK_EX); + list->rlst_num_nids = i; + + return 0; +} + +/** + * LNet ioctl handler. + * + */ +int +LNetCtl(unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + struct lnet_ioctl_config_data *config; + struct lnet_process_id id4 = {}; + struct lnet_processid id = {}; + struct lnet_ni *ni; + struct lnet_nid nid; + int rc; + + BUILD_BUG_ON(sizeof(struct lnet_ioctl_net_config) + + sizeof(struct lnet_ioctl_config_data) > LIBCFS_IOC_DATA_MAX); + + switch (cmd) { + case IOC_LIBCFS_GET_NI: + rc = LNetGetId(data->ioc_count, &id); + data->ioc_nid = lnet_nid_to_nid4(&id.nid); + return rc; + + case IOC_LIBCFS_FAIL_NID: + return lnet_fail_nid(data->ioc_nid, data->ioc_count); + + case IOC_LIBCFS_ADD_ROUTE: { + /* default router sensitivity to 1 */ + unsigned int sensitivity = 1; + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + if (config->cfg_config_u.cfg_route.rtr_sensitivity) { + sensitivity = + config->cfg_config_u.cfg_route.rtr_sensitivity; + } + + lnet_nid4_to_nid(config->cfg_nid, &nid); + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_add_route(config->cfg_net, + config->cfg_config_u.cfg_route.rtr_hop, + &nid, + config->cfg_config_u.cfg_route. 
+ rtr_priority, sensitivity); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_DEL_ROUTE: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + lnet_nid4_to_nid(config->cfg_nid, &nid); + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_del_route(config->cfg_net, &nid); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + + case IOC_LIBCFS_GET_ROUTE: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_route(config->cfg_count, + &config->cfg_net, + &config->cfg_config_u.cfg_route.rtr_hop, + &config->cfg_nid, + &config->cfg_config_u.cfg_route.rtr_flags, + &config->cfg_config_u.cfg_route. + rtr_priority, + &config->cfg_config_u.cfg_route. + rtr_sensitivity); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + + case IOC_LIBCFS_GET_LOCAL_NI: { + struct lnet_ioctl_config_ni *cfg_ni; + struct lnet_ioctl_config_lnd_tunables *tun = NULL; + struct lnet_ioctl_element_stats *stats; + __u32 tun_size; + + cfg_ni = arg; + + /* get the tunables if they are available */ + if (cfg_ni->lic_cfg_hdr.ioc_len < + sizeof(*cfg_ni) + sizeof(*stats) + sizeof(*tun)) + return -EINVAL; + + stats = (struct lnet_ioctl_element_stats *) + cfg_ni->lic_bulk; + tun = (struct lnet_ioctl_config_lnd_tunables *) + (cfg_ni->lic_bulk + sizeof(*stats)); + + tun_size = cfg_ni->lic_cfg_hdr.ioc_len - sizeof(*cfg_ni) - + sizeof(*stats); + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_ni_config(cfg_ni, tun, stats, tun_size); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS: { + struct lnet_ioctl_element_msg_stats *msg_stats = arg; + + if (msg_stats->im_hdr.ioc_len != sizeof(*msg_stats)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_ni_stats(msg_stats); + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_GET_NET: { + size_t total = sizeof(*config) + + sizeof(struct lnet_ioctl_net_config); + config = arg; + + if (config->cfg_hdr.ioc_len < total) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_net_config(config); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_LNET_STATS: + { + struct lnet_ioctl_lnet_stats *lnet_stats = arg; + + if (lnet_stats->st_hdr.ioc_len < sizeof(*lnet_stats)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_counters_get(&lnet_stats->st_cntrs); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_RESET_LNET_STATS: + { + mutex_lock(&the_lnet.ln_api_mutex); + lnet_counters_reset(); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + case IOC_LIBCFS_CONFIG_RTR: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + if (config->cfg_config_u.cfg_buffers.buf_enable) { + rc = lnet_rtrpools_enable(); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + lnet_rtrpools_disable(); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + + case IOC_LIBCFS_ADD_BUF: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_rtrpools_adjust(config->cfg_config_u.cfg_buffers. + buf_tiny, + config->cfg_config_u.cfg_buffers. + buf_small, + config->cfg_config_u.cfg_buffers. 
+ buf_large); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + + case IOC_LIBCFS_SET_NUMA_RANGE: { + struct lnet_ioctl_set_value *numa; + numa = arg; + if (numa->sv_hdr.ioc_len != sizeof(*numa)) + return -EINVAL; + lnet_net_lock(LNET_LOCK_EX); + lnet_numa_range = numa->sv_value; + lnet_net_unlock(LNET_LOCK_EX); + return 0; + } + + case IOC_LIBCFS_GET_NUMA_RANGE: { + struct lnet_ioctl_set_value *numa; + numa = arg; + if (numa->sv_hdr.ioc_len != sizeof(*numa)) + return -EINVAL; + numa->sv_value = lnet_numa_range; + return 0; + } + + case IOC_LIBCFS_GET_BUF: { + struct lnet_ioctl_pool_cfg *pool_cfg; + size_t total = sizeof(*config) + sizeof(*pool_cfg); + + config = arg; + + if (config->cfg_hdr.ioc_len < total) + return -EINVAL; + + pool_cfg = (struct lnet_ioctl_pool_cfg *)config->cfg_bulk; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_rtr_pool_cfg(config->cfg_count, pool_cfg); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_LOCAL_HSTATS: { + struct lnet_ioctl_local_ni_hstats *stats = arg; + + if (stats->hlni_hdr.ioc_len < sizeof(*stats)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_local_ni_hstats(stats); + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_GET_RECOVERY_QUEUE: { + struct lnet_ioctl_recovery_list *list = arg; + if (list->rlst_hdr.ioc_len < sizeof(*list)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + if (list->rlst_type == LNET_HEALTH_TYPE_LOCAL_NI) + rc = lnet_get_local_ni_recovery_list(list); + else + rc = lnet_get_peer_ni_recovery_list(list); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_ADD_PEER_NI: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_add_peer_ni(cfg->prcfg_prim_nid, + cfg->prcfg_cfg_nid, + cfg->prcfg_mr, false); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_DEL_PEER_NI: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_del_peer_ni(cfg->prcfg_prim_nid, + cfg->prcfg_cfg_nid); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_PEER_INFO: { + struct lnet_ioctl_peer *peer_info = arg; + + if (peer_info->pr_hdr.ioc_len < sizeof(*peer_info)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_peer_ni_info( + peer_info->pr_count, + &peer_info->pr_nid, + peer_info->pr_lnd_u.pr_peer_credits.cr_aliveness, + &peer_info->pr_lnd_u.pr_peer_credits.cr_ncpt, + &peer_info->pr_lnd_u.pr_peer_credits.cr_refcount, + &peer_info->pr_lnd_u.pr_peer_credits.cr_ni_peer_tx_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_tx_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_rtr_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_min_tx_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_tx_qnob); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_PEER_NI: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_peer_info(cfg, + (void __user *)cfg->prcfg_bulk); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_PEER_LIST: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + + 
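+ /* Illustrative note: prcfg_bulk is treated as a userspace array of
+ * struct lnet_process_id; lnet_get_peer_list() copies the peer list
+ * into it and updates prcfg_count and prcfg_size through the
+ * pointers passed below.
+ */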
mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_peer_list(&cfg->prcfg_count, &cfg->prcfg_size, + (struct lnet_process_id __user *)cfg->prcfg_bulk); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_SET_HEALHV: { + struct lnet_ioctl_reset_health_cfg *cfg = arg; + int value; + if (cfg->rh_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + if (cfg->rh_value < 0 || + cfg->rh_value > LNET_MAX_HEALTH_VALUE) + value = LNET_MAX_HEALTH_VALUE; + else + value = cfg->rh_value; + CDEBUG(D_NET, "Manually setting healthv to %d for %s:%s. all = %d\n", + value, (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI) ? + "local" : "peer", libcfs_nid2str(cfg->rh_nid), cfg->rh_all); + mutex_lock(&the_lnet.ln_api_mutex); + if (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI) + lnet_ni_set_healthv(cfg->rh_nid, value, + cfg->rh_all); + else + lnet_peer_ni_set_healthv(cfg->rh_nid, value, + cfg->rh_all); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + case IOC_LIBCFS_SET_CONNS_PER_PEER: { + struct lnet_ioctl_reset_conns_per_peer_cfg *cfg = arg; + int value; + + if (cfg->rcpp_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + if (cfg->rcpp_value < 0) + value = 1; + else + value = cfg->rcpp_value; + CDEBUG(D_NET, + "Setting conns_per_peer to %d for %s. all = %d\n", + value, libcfs_nid2str(cfg->rcpp_nid), cfg->rcpp_all); + mutex_lock(&the_lnet.ln_api_mutex); + lnet_ni_set_conns_per_peer(cfg->rcpp_nid, value, cfg->rcpp_all); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + case IOC_LIBCFS_NOTIFY_ROUTER: { + time64_t deadline = ktime_get_real_seconds() - data->ioc_u64[0]; + + /* The deadline passed in by the user should be some time in + * seconds in the future since the UNIX epoch. We have to map + * that deadline to the wall clock. + */ + deadline += ktime_get_seconds(); + return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, false, + deadline); + } + + case IOC_LIBCFS_LNET_DIST: + rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]); + if (rc < 0 && rc != -EHOSTUNREACH) + return rc; + + data->ioc_u32[0] = rc; + return 0; + + case IOC_LIBCFS_TESTPROTOCOMPAT: + the_lnet.ln_testprotocompat = data->ioc_flags; + return 0; + + case IOC_LIBCFS_LNET_FAULT: + return lnet_fault_ctl(data->ioc_flags, data); + + case IOC_LIBCFS_PING: { + signed long timeout; + + id4.nid = data->ioc_nid; + id4.pid = data->ioc_u32[0]; + + /* If timeout is negative then set default of 3 minutes */ + if (((s32)data->ioc_u32[1] <= 0) || + data->ioc_u32[1] > (DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC)) + timeout = cfs_time_seconds(DEFAULT_PEER_TIMEOUT); + else + timeout = nsecs_to_jiffies(data->ioc_u32[1] * NSEC_PER_MSEC); + + rc = lnet_ping(id4, &LNET_ANY_NID, timeout, data->ioc_pbuf1, + data->ioc_plen1 / sizeof(struct lnet_process_id)); + + if (rc < 0) + return rc; + + data->ioc_count = rc; + return 0; + } + + case IOC_LIBCFS_PING_PEER: { + struct lnet_ioctl_ping_data *ping = arg; + struct lnet_nid src_nid = LNET_ANY_NID; + struct lnet_peer *lp; + signed long timeout; + + /* Check if the supplied ping data supports source nid + * NB: This check is sufficient if lnet_ioctl_ping_data has + * additional fields added, but if they are re-ordered or + * fields removed then this will break. It is expected that + * these ioctls will be replaced with netlink implementation, so + * it is probably not worth coming up with a more robust version + * compatibility scheme. 
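+ * For example, older userspace that does not provide ping_src simply
+ * sends a shorter structure, in which case src_nid below is left as
+ * LNET_ANY_NID.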
+ */ + if (ping->ping_hdr.ioc_len >= sizeof(struct lnet_ioctl_ping_data)) + lnet_nid4_to_nid(ping->ping_src, &src_nid); + + /* If timeout is negative then set default of 3 minutes */ + if (((s32)ping->op_param) <= 0 || + ping->op_param > (DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC)) + timeout = cfs_time_seconds(DEFAULT_PEER_TIMEOUT); + else + timeout = nsecs_to_jiffies(ping->op_param * NSEC_PER_MSEC); + + rc = lnet_ping(ping->ping_id, &src_nid, timeout, + ping->ping_buf, + ping->ping_count); + if (rc < 0) + return rc; + + mutex_lock(&the_lnet.ln_api_mutex); + lp = lnet_find_peer4(ping->ping_id.nid); + if (lp) { + ping->ping_id.nid = + lnet_nid_to_nid4(&lp->lp_primary_nid); + ping->mr_info = lnet_peer_is_multi_rail(lp); + lnet_peer_decref_locked(lp); + } + mutex_unlock(&the_lnet.ln_api_mutex); + + ping->ping_count = rc; + return 0; + } + + case IOC_LIBCFS_DISCOVER: { + struct lnet_ioctl_ping_data *discover = arg; + struct lnet_peer *lp; + + rc = lnet_discover(discover->ping_id, discover->op_param, + discover->ping_buf, + discover->ping_count); + if (rc < 0) + return rc; + + mutex_lock(&the_lnet.ln_api_mutex); + lp = lnet_find_peer4(discover->ping_id.nid); + if (lp) { + discover->ping_id.nid = + lnet_nid_to_nid4(&lp->lp_primary_nid); + discover->mr_info = lnet_peer_is_multi_rail(lp); + lnet_peer_decref_locked(lp); + } + mutex_unlock(&the_lnet.ln_api_mutex); + + discover->ping_count = rc; + return 0; + } + + case IOC_LIBCFS_ADD_UDSP: { + struct lnet_ioctl_udsp *ioc_udsp = arg; + __u32 bulk_size = ioc_udsp->iou_hdr.ioc_len; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_udsp_demarshal_add(arg, bulk_size); + if (!rc) { + rc = lnet_udsp_apply_policies(NULL, false); + CDEBUG(D_NET, "policy application returned %d\n", rc); + rc = 0; + } + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_DEL_UDSP: { + struct lnet_ioctl_udsp *ioc_udsp = arg; + int idx = ioc_udsp->iou_idx; + + if (ioc_udsp->iou_hdr.ioc_len < sizeof(*ioc_udsp)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_udsp_del_policy(idx); + if (!rc) { + rc = lnet_udsp_apply_policies(NULL, false); + CDEBUG(D_NET, "policy re-application returned %d\n", + rc); + rc = 0; + } + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_GET_UDSP_SIZE: { + struct lnet_ioctl_udsp *ioc_udsp = arg; + struct lnet_udsp *udsp; + + if (ioc_udsp->iou_hdr.ioc_len < sizeof(*ioc_udsp)) + return -EINVAL; + + rc = 0; + + mutex_lock(&the_lnet.ln_api_mutex); + udsp = lnet_udsp_get_policy(ioc_udsp->iou_idx); + if (!udsp) { + rc = -ENOENT; + } else { + /* coming in iou_idx will hold the idx of the udsp + * to get the size of. going out the iou_idx will + * hold the size of the UDSP found at the passed + * in index. 
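+ * For illustration: userspace would typically call
+ * IOC_LIBCFS_GET_UDSP_SIZE with iou_idx set to the policy index,
+ * allocate a buffer of the returned size, and then issue
+ * IOC_LIBCFS_GET_UDSP with the same index to fetch the marshalled
+ * policy.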
+ */ + ioc_udsp->iou_idx = lnet_get_udsp_size(udsp); + if (ioc_udsp->iou_idx < 0) + rc = -EINVAL; + } + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_GET_UDSP: { + struct lnet_ioctl_udsp *ioc_udsp = arg; + struct lnet_udsp *udsp; + + if (ioc_udsp->iou_hdr.ioc_len < sizeof(*ioc_udsp)) + return -EINVAL; + + rc = 0; + + mutex_lock(&the_lnet.ln_api_mutex); + udsp = lnet_udsp_get_policy(ioc_udsp->iou_idx); + if (!udsp) + rc = -ENOENT; + else + rc = lnet_udsp_marshal(udsp, ioc_udsp); + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_GET_CONST_UDSP_INFO: { + struct lnet_ioctl_construct_udsp_info *info = arg; + + if (info->cud_hdr.ioc_len < sizeof(*info)) + return -EINVAL; + + CDEBUG(D_NET, "GET_UDSP_INFO for %s\n", + libcfs_nid2str(info->cud_nid)); + + mutex_lock(&the_lnet.ln_api_mutex); + lnet_udsp_get_construct_info(info); + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; + } + + default: + ni = lnet_net2ni_addref(data->ioc_net); + if (ni == NULL) + return -EINVAL; + + if (ni->ni_net->net_lnd->lnd_ctl == NULL) + rc = -EINVAL; + else + rc = ni->ni_net->net_lnd->lnd_ctl(ni, cmd, arg); + + lnet_ni_decref(ni); + return rc; + } + /* not reached */ +} +EXPORT_SYMBOL(LNetCtl); + +void LNetDebugPeer(struct lnet_processid *id) +{ + lnet_debug_peer(lnet_nid_to_nid4(&id->nid)); +} +EXPORT_SYMBOL(LNetDebugPeer); + +/** + * Determine if the specified peer \a nid is on the local node. + * + * \param nid peer nid to check + * + * \retval true If peer NID is on the local node. + * \retval false If peer NID is not on the local node. + */ +bool LNetIsPeerLocal(lnet_nid_t nid) +{ + struct lnet_net *net; + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (lnet_nid_to_nid4(&ni->ni_nid) == nid) { + lnet_net_unlock(cpt); + return true; + } + } + } + lnet_net_unlock(cpt); + + return false; +} +EXPORT_SYMBOL(LNetIsPeerLocal); + +/** + * Retrieve the struct lnet_process_id ID of LNet interface at \a index. + * Note that all interfaces share a same PID, as requested by LNetNIInit(). + * + * \param index Index of the interface to look up. + * \param id On successful return, this location will hold the + * struct lnet_process_id ID of the interface. + * + * \retval 0 If an interface exists at \a index. + * \retval -ENOENT If no interface has been found. + */ +int +LNetGetId(unsigned int index, struct lnet_processid *id) +{ + struct lnet_ni *ni; + struct lnet_net *net; + int cpt; + int rc = -ENOENT; + + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_net_lock_current(); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (!nid_is_nid4(&ni->ni_nid)) + /* FIXME this needs to be handled */ + continue; + if (index-- != 0) + continue; + + id->nid = ni->ni_nid; + id->pid = the_lnet.ln_pid; + rc = 0; + break; + } + } + + lnet_net_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetGetId); + +struct ping_data { + int rc; + int replied; + struct lnet_handle_md mdh; + struct completion completion; +}; + +static void +lnet_ping_event_handler(struct lnet_event *event) +{ + struct ping_data *pd = event->md_user_ptr; + + CDEBUG(D_NET, "ping event (%d %d)%s\n", + event->type, event->status, + event->unlinked ? 
" unlinked" : ""); + + if (event->status) { + if (!pd->rc) + pd->rc = event->status; + } else if (event->type == LNET_EVENT_REPLY) { + pd->replied = 1; + pd->rc = event->mlength; + } + if (event->unlinked) + complete(&pd->completion); +} + +static int lnet_ping(struct lnet_process_id id, struct lnet_nid *src_nid, + signed long timeout, struct lnet_process_id __user *ids, + int n_ids) +{ + struct lnet_md md = { NULL }; + struct ping_data pd = { 0 }; + struct lnet_ping_buffer *pbuf; + struct lnet_process_id tmpid; + int i; + int nob; + int rc; + int rc2; + + /* n_ids limit is arbitrary */ + if (n_ids <= 0 || id.nid == LNET_NID_ANY) + return -EINVAL; + + /* + * if the user buffer has more space than the lnet_interfaces_max + * then only fill it up to lnet_interfaces_max + */ + if (n_ids > lnet_interfaces_max) + n_ids = lnet_interfaces_max; + + if (id.pid == LNET_PID_ANY) + id.pid = LNET_PID_LUSTRE; + + pbuf = lnet_ping_buffer_alloc(n_ids, GFP_NOFS); + if (!pbuf) + return -ENOMEM; + + /* initialize md content */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(n_ids); + md.threshold = 2; /* GET/REPLY */ + md.max_size = 0; + md.options = LNET_MD_TRUNCATE; + md.user_ptr = &pd; + md.handler = lnet_ping_event_handler; + + init_completion(&pd.completion); + + rc = LNetMDBind(&md, LNET_UNLINK, &pd.mdh); + if (rc != 0) { + CERROR("Can't bind MD: %d\n", rc); + goto fail_ping_buffer_decref; + } + + rc = LNetGet(lnet_nid_to_nid4(src_nid), pd.mdh, id, + LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0, false); + + if (rc != 0) { + /* Don't CERROR; this could be deliberate! */ + rc2 = LNetMDUnlink(pd.mdh); + LASSERT(rc2 == 0); + + /* NB must wait for the UNLINK event below... */ + } + + if (wait_for_completion_timeout(&pd.completion, timeout) == 0) { + /* Ensure completion in finite time... */ + LNetMDUnlink(pd.mdh); + wait_for_completion(&pd.completion); + } + if (!pd.replied) { + rc = -EIO; + goto fail_ping_buffer_decref; + } + + nob = pd.rc; + LASSERT(nob >= 0 && nob <= LNET_PING_INFO_SIZE(n_ids)); + + rc = -EPROTO; /* if I can't parse... */ + + if (nob < 8) { + CERROR("%s: ping info too short %d\n", + libcfs_id2str(id), nob); + goto fail_ping_buffer_decref; + } + + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { + lnet_swap_pinginfo(pbuf); + } else if (pbuf->pb_info.pi_magic != LNET_PROTO_PING_MAGIC) { + CERROR("%s: Unexpected magic %08x\n", + libcfs_id2str(id), pbuf->pb_info.pi_magic); + goto fail_ping_buffer_decref; + } + + if ((pbuf->pb_info.pi_features & LNET_PING_FEAT_NI_STATUS) == 0) { + CERROR("%s: ping w/o NI status: 0x%x\n", + libcfs_id2str(id), pbuf->pb_info.pi_features); + goto fail_ping_buffer_decref; + } + + if (nob < LNET_PING_INFO_SIZE(0)) { + CERROR("%s: Short reply %d(%d min)\n", + libcfs_id2str(id), + nob, (int)LNET_PING_INFO_SIZE(0)); + goto fail_ping_buffer_decref; + } + + if (pbuf->pb_info.pi_nnis < n_ids) + n_ids = pbuf->pb_info.pi_nnis; + + if (nob < LNET_PING_INFO_SIZE(n_ids)) { + CERROR("%s: Short reply %d(%d expected)\n", + libcfs_id2str(id), + nob, (int)LNET_PING_INFO_SIZE(n_ids)); + goto fail_ping_buffer_decref; + } + + rc = -EFAULT; /* if I segv in copy_to_user()... 
*/ + + memset(&tmpid, 0, sizeof(tmpid)); + for (i = 0; i < n_ids; i++) { + tmpid.pid = pbuf->pb_info.pi_pid; + tmpid.nid = pbuf->pb_info.pi_ni[i].ns_nid; + if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid))) + goto fail_ping_buffer_decref; + } + rc = pbuf->pb_info.pi_nnis; + + fail_ping_buffer_decref: + lnet_ping_buffer_decref(pbuf); + return rc; +} + +static int +lnet_discover(struct lnet_process_id id, __u32 force, + struct lnet_process_id __user *ids, int n_ids) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_ni *p; + struct lnet_peer *lp; + struct lnet_process_id *buf; + int cpt; + int i; + int rc; + + if (n_ids <= 0 || + id.nid == LNET_NID_ANY) + return -EINVAL; + + if (id.pid == LNET_PID_ANY) + id.pid = LNET_PID_LUSTRE; + + /* + * If the user buffer has more space than the lnet_interfaces_max, + * then only fill it up to lnet_interfaces_max. + */ + if (n_ids > lnet_interfaces_max) + n_ids = lnet_interfaces_max; + + CFS_ALLOC_PTR_ARRAY(buf, n_ids); + if (!buf) + return -ENOMEM; + + cpt = lnet_net_lock_current(); + lpni = lnet_nid2peerni_locked(id.nid, LNET_NID_ANY, cpt); + if (IS_ERR(lpni)) { + rc = PTR_ERR(lpni); + goto out; + } + + /* + * Clearing the NIDS_UPTODATE flag ensures the peer will + * be discovered, provided discovery has not been disabled. + */ + lp = lpni->lpni_peer_net->lpn_peer; + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + /* If the force flag is set, force a PING and PUSH as well. */ + if (force) + lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + rc = lnet_discover_peer_locked(lpni, cpt, true); + if (rc) + goto out_decref; + + /* The lpni (or lp) for this NID may have changed and our ref is + * the only thing keeping the old one around. Release the ref + * and lookup the lpni again + */ + lnet_peer_ni_decref_locked(lpni); + lpni = lnet_find_peer_ni_locked(id.nid); + if (!lpni) { + rc = -ENOENT; + goto out; + } + lp = lpni->lpni_peer_net->lpn_peer; + + i = 0; + p = NULL; + while ((p = lnet_get_next_peer_ni_locked(lp, NULL, p)) != NULL) { + buf[i].pid = id.pid; + buf[i].nid = lnet_nid_to_nid4(&p->lpni_nid); + if (++i >= n_ids) + break; + } + rc = i; + +out_decref: + lnet_peer_ni_decref_locked(lpni); +out: + lnet_net_unlock(cpt); + + if (rc >= 0) + if (copy_to_user(ids, buf, rc * sizeof(*buf))) + rc = -EFAULT; + CFS_FREE_PTR_ARRAY(buf, n_ids); + + return rc; +} + +/** + * Retrieve peer discovery status. + * + * \retval 1 if lnet_peer_discovery_disabled is 0 + * \retval 0 if lnet_peer_discovery_disabled is 1 + */ +int +LNetGetPeerDiscoveryStatus(void) +{ + return !lnet_peer_discovery_disabled; +} +EXPORT_SYMBOL(LNetGetPeerDiscoveryStatus); diff --git a/drivers/staging/lustrefsx/lnet/lnet/config.c b/drivers/staging/lustrefsx/lnet/lnet/config.c new file mode 100644 index 0000000000000..09fe96d6c1011 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/config.c @@ -0,0 +1,1636 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include + +/* tmp struct for parsing routes */ +struct lnet_text_buf { + struct list_head ltb_list; /* stash on lists */ + int ltb_size; /* allocated size */ + char ltb_text[0]; /* text buffer */ +}; + +static int lnet_tbnob = 0; /* track text buf allocation */ +#define LNET_MAX_TEXTBUF_NOB (64<<10) /* bound allocation */ +#define LNET_SINGLE_TEXTBUF_NOB (4<<10) + +#define SPACESTR " \t\v\r\n" +#define DELIMITERS ":()[]" + +#ifndef HAVE_STRSCPY +#define strscpy(s1, s2, sz) strlcpy((s1), (s2), (sz)) +#endif + +static void +lnet_syntax(const char *name, const char *str, int offset, int width) +{ + static char dots[LNET_SINGLE_TEXTBUF_NOB]; + static char dashes[LNET_SINGLE_TEXTBUF_NOB]; + + memset(dots, '.', sizeof(dots)); + dots[sizeof(dots)-1] = 0; + memset(dashes, '-', sizeof(dashes)); + dashes[sizeof(dashes)-1] = 0; + + LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str); + LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n", + (int)strlen(name), dots, offset, dots, + (width < 1) ? 0 : width - 1, dashes); +} + +static int +lnet_issep (char c) +{ + switch (c) { + case '\n': + case '\r': + case ';': + return 1; + default: + return 0; + } +} + +bool +lnet_net_unique(__u32 net_id, struct list_head *netlist, + struct lnet_net **net) +{ + struct lnet_net *net_l; + + if (!netlist) + return true; + + list_for_each_entry(net_l, netlist, net_list) { + if (net_l->net_id == net_id) { + if (net != NULL) + *net = net_l; + return false; + } + } + + return true; +} + +/* check that the NI is unique within the list of NIs already added to + * a network */ +bool +lnet_ni_unique_net(struct list_head *nilist, char *iface) +{ + struct list_head *tmp; + struct lnet_ni *ni; + + list_for_each(tmp, nilist) { + ni = list_entry(tmp, struct lnet_ni, ni_netlist); + + if (ni->ni_interface != NULL && + strncmp(ni->ni_interface, iface, strlen(iface)) == 0) + return false; + } + + return true; +} +static bool +in_array(__u32 *array, __u32 size, __u32 value) +{ + int i; + + for (i = 0; i < size; i++) { + if (array[i] == value) + return false; + } + + return true; +} + +static int +lnet_net_append_cpts(__u32 *cpts, __u32 ncpts, struct lnet_net *net) +{ + __u32 *added_cpts = NULL; + int i, j = 0, rc = 0; + + /* + * no need to go futher since a subset of the NIs already exist on + * all CPTs + */ + if (net->net_ncpts == LNET_CPT_NUMBER) + return 0; + + if (cpts == NULL) { + /* there is an NI which will exist on all CPTs */ + if (net->net_cpts != NULL) + CFS_FREE_PTR_ARRAY(net->net_cpts, net->net_ncpts); + net->net_cpts = NULL; + net->net_ncpts = LNET_CPT_NUMBER; + return 0; + } + + if (net->net_cpts == NULL) { + CFS_ALLOC_PTR_ARRAY(net->net_cpts, ncpts); + if (net->net_cpts == NULL) + return -ENOMEM; + memcpy(net->net_cpts, cpts, ncpts * sizeof(*net->net_cpts)); + net->net_ncpts = ncpts; + return 0; + } + + CFS_ALLOC_PTR_ARRAY(added_cpts, LNET_CPT_NUMBER); + if (added_cpts == NULL) + return -ENOMEM; + + for (i = 0; i < ncpts; i++) { + if (!in_array(net->net_cpts, net->net_ncpts, cpts[i])) 
{ + added_cpts[j] = cpts[i]; + j++; + } + } + + /* append the new cpts if any to the list of cpts in the net */ + if (j > 0) { + __u32 *array = NULL, *loc; + __u32 total_entries = j + net->net_ncpts; + + CFS_ALLOC_PTR_ARRAY(array, total_entries); + if (array == NULL) { + rc = -ENOMEM; + goto failed; + } + + memcpy(array, net->net_cpts, + net->net_ncpts * sizeof(*net->net_cpts)); + loc = array + net->net_ncpts; + memcpy(loc, added_cpts, j * sizeof(*net->net_cpts)); + + CFS_FREE_PTR_ARRAY(net->net_cpts, net->net_ncpts); + net->net_ncpts = total_entries; + net->net_cpts = array; + } + +failed: + CFS_FREE_PTR_ARRAY(added_cpts, LNET_CPT_NUMBER); + + return rc; +} + +static void +lnet_net_remove_cpts(__u32 *cpts, __u32 ncpts, struct lnet_net *net) +{ + struct lnet_ni *ni; + int rc; + + /* + * Operation Assumption: + * This function is called after an NI has been removed from + * its parent net. + * + * if we're removing an NI which exists on all CPTs then + * we have to check if any of the other NIs on this net also + * exists on all CPTs. If none, then we need to build our Net CPT + * list based on the remaining NIs. + * + * If the NI being removed exist on a subset of the CPTs then we + * alo rebuild the Net CPT list based on the remaining NIs, which + * should resutl in the expected Net CPT list. + */ + + /* + * sometimes this function can be called due to some failure + * creating an NI, before any of the cpts are allocated, so check + * for that case and don't do anything + */ + if (ncpts == 0) + return; + + if (ncpts == LNET_CPT_NUMBER) { + /* + * first iteration through the NI list in the net to see + * if any of the NIs exist on all the CPTs. If one is + * found then our job is done. + */ + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (ni->ni_ncpts == LNET_CPT_NUMBER) + return; + } + } + + /* + * Rebuild the Net CPT list again, thereby only including only the + * CPTs which the remaining NIs are associated with. + */ + if (net->net_cpts != NULL) { + CFS_FREE_PTR_ARRAY(net->net_cpts, net->net_ncpts); + net->net_cpts = NULL; + } + + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts, + net); + if (rc != 0) { + CERROR("Out of Memory\n"); + /* + * do our best to keep on going. Delete + * the net cpts and set it to NULL. This + * way we can keep on going but less + * efficiently, since memory accesses might be + * accross CPT lines. + */ + if (net->net_cpts != NULL) { + CFS_FREE_PTR_ARRAY(net->net_cpts, + net->net_ncpts); + net->net_cpts = NULL; + net->net_ncpts = LNET_CPT_NUMBER; + } + return; + } + } +} + +void +lnet_ni_free(struct lnet_ni *ni) +{ + lnet_net_remove_cpts(ni->ni_cpts, ni->ni_ncpts, ni->ni_net); + + if (ni->ni_refs != NULL) + cfs_percpt_free(ni->ni_refs); + + if (ni->ni_tx_queues != NULL) + cfs_percpt_free(ni->ni_tx_queues); + + if (ni->ni_cpts != NULL) + cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts); + + if (ni->ni_interface != NULL) { + LIBCFS_FREE(ni->ni_interface, + strlen(ni->ni_interface) + 1); + } + + /* release reference to net namespace */ + if (ni->ni_net_ns != NULL) + put_net(ni->ni_net_ns); + + LIBCFS_FREE(ni, sizeof(*ni)); +} + +void +lnet_net_free(struct lnet_net *net) +{ + struct list_head *tmp, *tmp2; + struct lnet_ni *ni; + + LASSERT(list_empty(&net->net_ni_zombie)); + + /* + * delete any nis that haven't been added yet. 
This could happen + * if there is a failure on net startup + */ + list_for_each_safe(tmp, tmp2, &net->net_ni_added) { + ni = list_entry(tmp, struct lnet_ni, ni_netlist); + list_del_init(&ni->ni_netlist); + lnet_ni_free(ni); + } + + /* delete any nis which have been started. */ + list_for_each_safe(tmp, tmp2, &net->net_ni_list) { + ni = list_entry(tmp, struct lnet_ni, ni_netlist); + list_del_init(&ni->ni_netlist); + lnet_ni_free(ni); + } + + if (net->net_cpts != NULL) + CFS_FREE_PTR_ARRAY(net->net_cpts, net->net_ncpts); + + LIBCFS_FREE(net, sizeof(*net)); +} + +struct lnet_net * +lnet_net_alloc(__u32 net_id, struct list_head *net_list) +{ + struct lnet_net *net; + + if (!lnet_net_unique(net_id, net_list, &net)) { + CDEBUG(D_NET, "Returning duplicate net %p %s\n", net, + libcfs_net2str(net->net_id)); + return net; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + if (net == NULL) { + CERROR("Out of memory creating network %s\n", + libcfs_net2str(net_id)); + return NULL; + } + + INIT_LIST_HEAD(&net->net_list); + INIT_LIST_HEAD(&net->net_ni_list); + INIT_LIST_HEAD(&net->net_ni_added); + INIT_LIST_HEAD(&net->net_ni_zombie); + INIT_LIST_HEAD(&net->net_rtr_pref_nids); + spin_lock_init(&net->net_lock); + + net->net_id = net_id; + net->net_last_alive = ktime_get_seconds(); + + net->net_sel_priority = LNET_MAX_SELECTION_PRIORITY; + + /* initialize global paramters to undefiend */ + net->net_tunables.lct_peer_timeout = -1; + net->net_tunables.lct_max_tx_credits = -1; + net->net_tunables.lct_peer_tx_credits = -1; + net->net_tunables.lct_peer_rtr_credits = -1; + + if (net_list) + list_add_tail(&net->net_list, net_list); + + return net; +} + +static int +lnet_ni_add_interface(struct lnet_ni *ni, char *iface) +{ + size_t iface_len = strlen(iface) + 1; + + if (ni == NULL) + return -ENOMEM; + + if (ni->ni_interface != NULL) { + LCONSOLE_ERROR_MSG(0x115, "%s: interface %s already set for net %s: rc = %d\n", + iface, ni->ni_interface, + libcfs_net2str(LNET_NID_NET(&ni->ni_nid)), + -EINVAL); + return -EINVAL; + } + + /* Allocate memory for the interface, so the code parsing input into + * tokens and adding interfaces can free the input safely. + * ni->ni_interface is freed in lnet_ni_free(). + */ + LIBCFS_ALLOC(ni->ni_interface, iface_len); + + if (ni->ni_interface == NULL) { + CERROR("%s: cannot allocate net interface name: rc = %d\n", + iface, -ENOMEM); + return -ENOMEM; + } + + strscpy(ni->ni_interface, iface, iface_len); + + return 0; +} + +static struct lnet_ni * +lnet_ni_alloc_common(struct lnet_net *net, char *iface) +{ + struct lnet_tx_queue *tq; + struct lnet_ni *ni; + int i; + + if (iface != NULL) + /* make sure that this NI is unique in the net it's + * being added to */ + if (!lnet_ni_unique_net(&net->net_ni_added, iface)) + return NULL; + + LIBCFS_ALLOC(ni, sizeof(*ni)); + if (ni == NULL) { + CERROR("Out of memory creating network interface %s%s\n", + libcfs_net2str(net->net_id), + (iface != NULL) ? 
iface : ""); + return NULL; + } + + spin_lock_init(&ni->ni_lock); + INIT_LIST_HEAD(&ni->ni_netlist); + INIT_LIST_HEAD(&ni->ni_recovery); + LNetInvalidateMDHandle(&ni->ni_ping_mdh); + ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_refs[0])); + if (ni->ni_refs == NULL) + goto failed; + + ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_tx_queues[0])); + if (ni->ni_tx_queues == NULL) + goto failed; + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) + INIT_LIST_HEAD(&tq->tq_delayed); + + ni->ni_net = net; + /* LND will fill in the address part of the NID */ + ni->ni_nid.nid_type = LNET_NETTYP(net->net_id); + ni->ni_nid.nid_num = cpu_to_be16(LNET_NETNUM(net->net_id)); + + /* Store net namespace in which current ni is being created */ + if (current->nsproxy && current->nsproxy->net_ns) + ni->ni_net_ns = get_net(current->nsproxy->net_ns); + else + ni->ni_net_ns = get_net(&init_net); + + ni->ni_state = LNET_NI_STATE_INIT; + ni->ni_sel_priority = LNET_MAX_SELECTION_PRIORITY; + list_add_tail(&ni->ni_netlist, &net->net_ni_added); + + /* + * if an interface name is provided then make sure to add in that + * interface name in NI + */ + if (iface) + if (lnet_ni_add_interface(ni, iface) != 0) + goto failed; + + return ni; +failed: + lnet_ni_free(ni); + return NULL; +} + +/* allocate and add to the provided network */ +struct lnet_ni * +lnet_ni_alloc(struct lnet_net *net, struct cfs_expr_list *el, char *iface) +{ + struct lnet_ni *ni; + int rc; + + ni = lnet_ni_alloc_common(net, iface); + if (!ni) + return NULL; + + if (!el) { + ni->ni_cpts = NULL; + ni->ni_ncpts = LNET_CPT_NUMBER; + } else { + rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts); + if (rc <= 0) { + CERROR("Failed to set CPTs for NI %s(%s): %d\n", + libcfs_net2str(net->net_id), + (iface != NULL) ? iface : "", rc); + goto failed; + } + + LASSERT(rc <= LNET_CPT_NUMBER); + if (rc == LNET_CPT_NUMBER) { + CFS_FREE_PTR_ARRAY(ni->ni_cpts, rc); + ni->ni_cpts = NULL; + } + + ni->ni_ncpts = rc; + } + + rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts, net); + if (rc != 0) + goto failed; + + return ni; +failed: + lnet_ni_free(ni); + return NULL; +} + +struct lnet_ni * +lnet_ni_alloc_w_cpt_array(struct lnet_net *net, __u32 *cpts, __u32 ncpts, + char *iface) +{ + struct lnet_ni *ni; + int rc; + + ni = lnet_ni_alloc_common(net, iface); + if (!ni) + return NULL; + + if (ncpts == 0) { + ni->ni_cpts = NULL; + ni->ni_ncpts = LNET_CPT_NUMBER; + } else { + size_t array_size = ncpts * sizeof(ni->ni_cpts[0]); + + CFS_ALLOC_PTR_ARRAY(ni->ni_cpts, ncpts); + if (ni->ni_cpts == NULL) + goto failed; + memcpy(ni->ni_cpts, cpts, array_size); + ni->ni_ncpts = ncpts; + } + + rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts, net); + if (rc != 0) + goto failed; + + return ni; +failed: + lnet_ni_free(ni); + return NULL; +} + +/* + * Parse the networks string and create the matching set of NIs on the + * nilist. 
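+ *
+ * For illustration, a networks string such as
+ * "tcp0(eth0,eth1)[0,1],o2ib(ib0)" would create a tcp0 net with NIs on
+ * eth0 and eth1 restricted to CPTs 0 and 1, plus an o2ib net on ib0;
+ * both the interface list and the CPT expression are optional.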
+ */ +int +lnet_parse_networks(struct list_head *netlist, const char *networks) +{ + struct cfs_expr_list *net_el = NULL; + struct cfs_expr_list *ni_el = NULL; + int tokensize; + char *tokens; + char *str; + struct lnet_net *net; + struct lnet_ni *ni = NULL; + __u32 net_id; + int nnets = 0; + + if (networks == NULL) { + CERROR("networks string is undefined\n"); + return -EINVAL; + } + + if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) { + /* _WAY_ conservative */ + LCONSOLE_ERROR_MSG(0x112, "Can't parse networks: string too " + "long\n"); + return -EINVAL; + } + + tokensize = strlen(networks) + 1; + + LIBCFS_ALLOC(tokens, tokensize); + if (tokens == NULL) { + CERROR("Can't allocate net tokens\n"); + return -ENOMEM; + } + + memcpy(tokens, networks, tokensize); + str = tokens; + + /* + * Main parser loop. + * + * NB we don't check interface conflicts here; it's the LNDs + * responsibility (if it cares at all) + */ + do { + char *nistr; + char *elstr; + char *name; + int rc; + + /* + * Parse a network string into its components. + * + * {"("...")"}{"[""]"} + */ + + /* Network name (mandatory) */ + while (isspace(*str)) + *str++ = '\0'; + if (!*str) + break; + name = str; + str += strcspn(str, SPACESTR ":()[],"); + while (isspace(*str)) + *str++ = '\0'; + + /* Interface list (optional) */ + if (*str == '(') { + *str++ = '\0'; + nistr = str; + str += strcspn(str, ")"); + if (*str != ')') { + str = nistr; + goto failed_syntax; + } + do { + *str++ = '\0'; + } while (isspace(*str)); + } else { + nistr = NULL; + } + + /* CPT expression (optional) */ + if (*str == '[') { + elstr = str; + str += strcspn(str, "]"); + if (*str != ']') { + str = elstr; + goto failed_syntax; + } + rc = cfs_expr_list_parse(elstr, str - elstr + 1, + 0, LNET_CPT_NUMBER - 1, + &net_el); + if (rc != 0) { + str = elstr; + goto failed_syntax; + } + *elstr = '\0'; + do { + *str++ = '\0'; + } while (isspace(*str)); + } + + /* Bad delimiters */ + if (*str && (strchr(DELIMITERS, *str) != NULL)) + goto failed_syntax; + + /* go to the next net if it exits */ + str += strcspn(str, ","); + if (*str == ',') + *str++ = '\0'; + + /* + * At this point the name is properly terminated. + */ + net_id = libcfs_str2net(name); + if (net_id == LNET_NET_ANY) { + LCONSOLE_ERROR_MSG(0x113, + "Unrecognised network type\n"); + str = name; + goto failed_syntax; + } + + if (LNET_NETTYP(net_id) == LOLND) { + /* Loopback is implicit, and there can be only one. */ + if (net_el) { + cfs_expr_list_free(net_el); + net_el = NULL; + } + /* Should we error out instead? */ + continue; + } + + /* + * All network paramaters are now known. + */ + nnets++; + + /* always allocate a net, since we will eventually add an + * interface to it, or we will fail, in which case we'll + * just delete it */ + net = lnet_net_alloc(net_id, netlist); + if (IS_ERR_OR_NULL(net)) + goto failed; + + if (!nistr) { + /* + * No interface list was specified, allocate a + * ni using the defaults. 
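+ * (e.g. a bare "tcp" entry in the networks string, with no
+ * parenthesised interface list).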
+ */ + ni = lnet_ni_alloc(net, net_el, NULL); + if (IS_ERR_OR_NULL(ni)) + goto failed; + + if (!nistr) { + if (net_el) { + cfs_expr_list_free(net_el); + net_el = NULL; + } + continue; + } + } + + do { + elstr = NULL; + + /* Interface name (mandatory) */ + while (isspace(*nistr)) + *nistr++ = '\0'; + name = nistr; + nistr += strcspn(nistr, SPACESTR "[],"); + while (isspace(*nistr)) + *nistr++ = '\0'; + + /* CPT expression (optional) */ + if (*nistr == '[') { + elstr = nistr; + nistr += strcspn(nistr, "]"); + if (*nistr != ']') { + str = elstr; + goto failed_syntax; + } + rc = cfs_expr_list_parse(elstr, + nistr - elstr + 1, + 0, LNET_CPT_NUMBER - 1, + &ni_el); + if (rc != 0) { + str = elstr; + goto failed_syntax; + } + *elstr = '\0'; + do { + *nistr++ = '\0'; + } while (isspace(*nistr)); + } else { + ni_el = net_el; + } + + /* + * End of single interface specificaton, + * advance to the start of the next one, if + * any. + */ + if (*nistr == ',') { + do { + *nistr++ = '\0'; + } while (isspace(*nistr)); + if (!*nistr) { + str = nistr; + goto failed_syntax; + } + } else if (*nistr) { + str = nistr; + goto failed_syntax; + } + + /* + * At this point the name is properly terminated. + */ + if (!*name) { + str = name; + goto failed_syntax; + } + + ni = lnet_ni_alloc(net, ni_el, name); + if (IS_ERR_OR_NULL(ni)) + goto failed; + + if (ni_el) { + if (ni_el != net_el) { + cfs_expr_list_free(ni_el); + ni_el = NULL; + } + } + } while (*nistr); + + if (net_el) { + cfs_expr_list_free(net_el); + net_el = NULL; + } + } while (*str); + + LIBCFS_FREE(tokens, tokensize); + return nnets; + + failed_syntax: + lnet_syntax("networks", networks, (int)(str - tokens), strlen(str)); + failed: + /* free the net list and all the nis on each net */ + while (!list_empty(netlist)) { + net = list_entry(netlist->next, struct lnet_net, net_list); + + list_del_init(&net->net_list); + lnet_net_free(net); + } + + if (ni_el && ni_el != net_el) + cfs_expr_list_free(ni_el); + if (net_el) + cfs_expr_list_free(net_el); + + LIBCFS_FREE(tokens, tokensize); + + return -EINVAL; +} + +static struct lnet_text_buf *lnet_new_text_buf(int str_len) +{ + struct lnet_text_buf *ltb; + int nob; + + /* NB allocate space for the terminating 0 */ + nob = offsetof(struct lnet_text_buf, ltb_text[str_len + 1]); + if (nob > LNET_SINGLE_TEXTBUF_NOB) { + /* _way_ conservative for "route net gateway..." 
*/ + CERROR("text buffer too big\n"); + return NULL; + } + + if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) { + CERROR("Too many text buffers\n"); + return NULL; + } + + LIBCFS_ALLOC(ltb, nob); + if (ltb == NULL) + return NULL; + + ltb->ltb_size = nob; + ltb->ltb_text[0] = 0; + lnet_tbnob += nob; + return ltb; +} + +static void +lnet_free_text_buf(struct lnet_text_buf *ltb) +{ + lnet_tbnob -= ltb->ltb_size; + LIBCFS_FREE(ltb, ltb->ltb_size); +} + +static void +lnet_free_text_bufs(struct list_head *tbs) +{ + struct lnet_text_buf *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, struct lnet_text_buf, ltb_list); + + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + } +} + +static int +lnet_str2tbs_sep(struct list_head *tbs, const char *str) +{ + LIST_HEAD(pending); + const char *sep; + int nob; + int i; + struct lnet_text_buf *ltb; + + /* Split 'str' into separate commands */ + for (;;) { + /* skip leading whitespace */ + while (isspace(*str)) + str++; + + /* scan for separator or comment */ + for (sep = str; *sep != 0; sep++) + if (lnet_issep(*sep) || *sep == '#') + break; + + nob = (int)(sep - str); + if (nob > 0) { + ltb = lnet_new_text_buf(nob); + if (ltb == NULL) { + lnet_free_text_bufs(&pending); + return -ENOMEM; + } + + for (i = 0; i < nob; i++) + if (isspace(str[i])) + ltb->ltb_text[i] = ' '; + else + ltb->ltb_text[i] = str[i]; + + ltb->ltb_text[nob] = 0; + + list_add_tail(<b->ltb_list, &pending); + } + + if (*sep == '#') { + /* scan for separator */ + do { + sep++; + } while (*sep != 0 && !lnet_issep(*sep)); + } + + if (*sep == 0) + break; + + str = sep + 1; + } + + list_splice(&pending, tbs->prev); + return 0; +} + +static int +lnet_expand1tb(struct list_head *list, + char *str, char *sep1, char *sep2, + char *item, int itemlen) +{ + int len1 = (int)(sep1 - str); + int len2 = strlen(sep2 + 1); + struct lnet_text_buf *ltb; + + LASSERT (*sep1 == '['); + LASSERT (*sep2 == ']'); + + ltb = lnet_new_text_buf(len1 + itemlen + len2); + if (ltb == NULL) + return -ENOMEM; + + memcpy(ltb->ltb_text, str, len1); + memcpy(<b->ltb_text[len1], item, itemlen); + memcpy(<b->ltb_text[len1+itemlen], sep2 + 1, len2); + ltb->ltb_text[len1 + itemlen + len2] = 0; + + list_add_tail(<b->ltb_list, list); + return 0; +} + +static int +lnet_str2tbs_expand(struct list_head *tbs, char *str) +{ + char num[16]; + LIST_HEAD(pending); + char *sep; + char *sep2; + char *parsed; + char *enditem; + int lo; + int hi; + int stride; + int i; + int nob; + int scanned; + + sep = strchr(str, '['); + if (sep == NULL) /* nothing to expand */ + return 0; + + sep2 = strchr(sep, ']'); + if (sep2 == NULL) + goto failed; + + for (parsed = sep; parsed < sep2; parsed = enditem) { + + enditem = ++parsed; + while (enditem < sep2 && *enditem != ',') + enditem++; + + if (enditem == parsed) /* no empty items */ + goto failed; + + if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) { + + if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) { + + /* simple string enumeration */ + if (lnet_expand1tb(&pending, str, sep, sep2, + parsed, (int)(enditem - parsed)) != 0) + goto failed; + + continue; + } + + stride = 1; + } + + /* range expansion */ + + if (enditem != parsed + scanned) /* no trailing junk */ + goto failed; + + if (hi < 0 || lo < 0 || stride < 0 || hi < lo || + (hi - lo) % stride != 0) + goto failed; + + for (i = lo; i <= hi; i += stride) { + + snprintf(num, sizeof(num), "%d", i); + nob = strlen(num); + if (nob + 1 == sizeof(num)) + goto failed; + + if (lnet_expand1tb(&pending, str, sep, sep2, + 
num, nob) != 0) + goto failed; + } + } + + list_splice(&pending, tbs->prev); + return 1; + + failed: + lnet_free_text_bufs(&pending); + return -EINVAL; +} + +static int +lnet_parse_hops (char *str, unsigned int *hops) +{ + int len = strlen(str); + int nob = len; + + return (sscanf(str, "%u%n", hops, &nob) >= 1 && + nob == len && + *hops > 0 && *hops < 256); +} + +#define LNET_PRIORITY_SEPARATOR (':') + +static int +lnet_parse_priority(char *str, unsigned int *priority, char **token) +{ + int nob; + char *sep; + int len; + + sep = strchr(str, LNET_PRIORITY_SEPARATOR); + if (sep == NULL) { + *priority = 0; + return 0; + } + len = strlen(sep + 1); + + if ((sscanf((sep+1), "%u%n", priority, &nob) < 1) || (len != nob)) { + /* Update the caller's token pointer so it treats the found + priority as the token to report in the error message. */ + *token += sep - str + 1; + return -EINVAL; + } + + CDEBUG(D_NET, "gateway %s, priority %d, nob %d\n", str, *priority, nob); + + /* + * Change priority separator to \0 to be able to parse NID + */ + *sep = '\0'; + return 0; +} + +static int +lnet_parse_route(char *str, int *im_a_router) +{ + /* static scratch buffer OK (single threaded) */ + static char cmd[LNET_SINGLE_TEXTBUF_NOB]; + + LIST_HEAD(nets); + LIST_HEAD(gateways); + struct list_head *tmp1; + struct list_head *tmp2; + __u32 net; + struct lnet_nid nid; + struct lnet_text_buf *ltb; + int rc; + char *sep; + char *token = str; + int ntokens = 0; + int myrc = -1; + __u32 hops; + int got_hops = 0; + unsigned int priority = 0; + + /* save a copy of the string for error messages */ + strncpy(cmd, str, sizeof(cmd)); + cmd[sizeof(cmd) - 1] = '\0'; + + sep = str; + for (;;) { + /* scan for token start */ + while (isspace(*sep)) + sep++; + if (*sep == 0) { + if (ntokens < (got_hops ? 3 : 2)) + goto token_error; + break; + } + + ntokens++; + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !isspace(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens == 1) { + tmp2 = &nets; /* expanding nets */ + } else if (ntokens == 2 && + lnet_parse_hops(token, &hops)) { + got_hops = 1; /* got a hop count */ + continue; + } else { + tmp2 = &gateways; /* expanding gateways */ + } + + ltb = lnet_new_text_buf(strlen(token)); + if (ltb == NULL) + goto out; + + strcpy(ltb->ltb_text, token); + tmp1 = <b->ltb_list; + list_add_tail(tmp1, tmp2); + + while (tmp1 != tmp2) { + ltb = list_entry(tmp1, struct lnet_text_buf, ltb_list); + + rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text); + if (rc < 0) + goto token_error; + + tmp1 = tmp1->next; + + if (rc > 0) { /* expanded! 
*/ + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + continue; + } + + if (ntokens == 1) { + net = libcfs_str2net(ltb->ltb_text); + if (net == LNET_NET_ANY || + LNET_NETTYP(net) == LOLND) + goto token_error; + } else { + rc = lnet_parse_priority(ltb->ltb_text, + &priority, &token); + if (rc < 0) + goto token_error; + + if (libcfs_strnid(&nid, ltb->ltb_text) != 0 || + nid_is_lo0(&nid)) + goto token_error; + } + } + } + + /* if there are no hops set then we want to flag this value as + * unset since hops is an optional parameter */ + if (!got_hops) + hops = LNET_UNDEFINED_HOPS; + + LASSERT(!list_empty(&nets)); + LASSERT(!list_empty(&gateways)); + + list_for_each(tmp1, &nets) { + ltb = list_entry(tmp1, struct lnet_text_buf, ltb_list); + net = libcfs_str2net(ltb->ltb_text); + LASSERT(net != LNET_NET_ANY); + + list_for_each(tmp2, &gateways) { + ltb = list_entry(tmp2, struct lnet_text_buf, ltb_list); + LASSERT(libcfs_strnid(&nid, ltb->ltb_text) == 0); + + if (lnet_islocalnid(&nid)) { + *im_a_router = 1; + continue; + } + + rc = lnet_add_route(net, hops, &nid, priority, 1); + if (rc != 0 && rc != -EEXIST && rc != -EHOSTUNREACH) { + CERROR("Can't create route " + "to %s via %s\n", + libcfs_net2str(net), + libcfs_nidstr(&nid)); + goto out; + } + } + } + + myrc = 0; + goto out; + +token_error: + lnet_syntax("routes", cmd, (int)(token - str), strlen(token)); +out: + lnet_free_text_bufs(&nets); + lnet_free_text_bufs(&gateways); + return myrc; +} + +static int +lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router) +{ + struct lnet_text_buf *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, struct lnet_text_buf, ltb_list); + + if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) { + lnet_free_text_bufs(tbs); + return -EINVAL; + } + + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + } + + return 0; +} + +int +lnet_parse_routes(const char *routes, int *im_a_router) +{ + LIST_HEAD(tbs); + int rc = 0; + + *im_a_router = 0; + + if (lnet_str2tbs_sep(&tbs, routes) < 0) { + CERROR("Error parsing routes\n"); + rc = -EINVAL; + } else { + rc = lnet_parse_route_tbs(&tbs, im_a_router); + } + + LASSERT (lnet_tbnob == 0); + return rc; +} + +static int +lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip) +{ + LIST_HEAD(list); + int rc; + int i; + + rc = cfs_ip_addr_parse(token, len, &list); + if (rc != 0) + return rc; + + for (rc = i = 0; !rc && i < nip; i++) + rc = cfs_ip_addr_match(ipaddrs[i], &list); + + cfs_expr_list_free_list(&list); + + return rc; +} + +static int +lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip) +{ + static char tokens[LNET_SINGLE_TEXTBUF_NOB]; + + int matched = 0; + int ntokens = 0; + int len; + char *net = NULL; + char *sep; + char *token; + int rc; + + LASSERT(strlen(net_entry) < sizeof(tokens)); + + /* work on a copy of the string */ + strcpy(tokens, net_entry); + sep = tokens; + for (;;) { + /* scan for token start */ + while (isspace(*sep)) + sep++; + if (*sep == 0) + break; + + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !isspace(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens++ == 0) { + net = token; + continue; + } + + len = strlen(token); + + rc = lnet_match_network_token(token, len, ipaddrs, nip); + if (rc < 0) { + lnet_syntax("ip2nets", net_entry, + (int)(token - tokens), len); + return rc; + } + + matched |= (rc != 0); + } + + if (!matched) + return 0; + + strcpy(net_entry, net); /* replace with matched net */ + return 1; +} + +static __u32 +lnet_netspec2net(char *netspec) 
+{ + char *bracket = strchr(netspec, '('); + __u32 net; + + if (bracket != NULL) + *bracket = 0; + + net = libcfs_str2net(netspec); + + if (bracket != NULL) + *bracket = '('; + + return net; +} + +static int +lnet_splitnets(char *source, struct list_head *nets) +{ + int offset = 0; + int offset2; + int len; + struct lnet_text_buf *tb; + struct lnet_text_buf *tb2; + struct list_head *t; + char *sep; + char *bracket; + __u32 net; + + LASSERT(!list_empty(nets)); + LASSERT(nets->next == nets->prev); /* single entry */ + + tb = list_entry(nets->next, struct lnet_text_buf, ltb_list); + + for (;;) { + sep = strchr(tb->ltb_text, ','); + bracket = strchr(tb->ltb_text, '('); + + if (sep != NULL && + bracket != NULL && + bracket < sep) { + /* netspec lists interfaces... */ + + offset2 = offset + (int)(bracket - tb->ltb_text); + len = strlen(bracket); + + bracket = strchr(bracket + 1, ')'); + + if (bracket == NULL || + !(bracket[1] == ',' || bracket[1] == 0)) { + lnet_syntax("ip2nets", source, offset2, len); + return -EINVAL; + } + + sep = (bracket[1] == 0) ? NULL : bracket + 1; + } + + if (sep != NULL) + *sep++ = 0; + + net = lnet_netspec2net(tb->ltb_text); + if (net == LNET_NET_ANY) { + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + + list_for_each(t, nets) { + tb2 = list_entry(t, struct lnet_text_buf, ltb_list); + + if (tb2 == tb) + continue; + + if (net == lnet_netspec2net(tb2->ltb_text)) { + /* duplicate network */ + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + } + + if (sep == NULL) + return 0; + + offset += (int)(sep - tb->ltb_text); + len = strlen(sep); + tb2 = lnet_new_text_buf(len); + if (tb2 == NULL) + return -ENOMEM; + + strncpy(tb2->ltb_text, sep, len); + tb2->ltb_text[len] = '\0'; + list_add_tail(&tb2->ltb_list, nets); + + tb = tb2; + } +} + +static int +lnet_match_networks(const char **networksp, const char *ip2nets, + __u32 *ipaddrs, int nip) +{ + static char networks[LNET_SINGLE_TEXTBUF_NOB]; + static char source[LNET_SINGLE_TEXTBUF_NOB]; + + LIST_HEAD(raw_entries); + LIST_HEAD(matched_nets); + LIST_HEAD(current_nets); + struct list_head *t; + struct list_head *t2; + struct lnet_text_buf *tb; + int len; + int count; + int rc; + + if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) { + CERROR("Error parsing ip2nets\n"); + LASSERT(lnet_tbnob == 0); + return -EINVAL; + } + + networks[0] = 0; + count = 0; + len = 0; + rc = 0; + + while (!list_empty(&raw_entries)) { + tb = list_entry(raw_entries.next, struct lnet_text_buf, + ltb_list); + + strncpy(source, tb->ltb_text, sizeof(source)); + source[sizeof(source) - 1] = '\0'; + + /* replace ltb_text with the network(s) add on match */ + rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip); + if (rc < 0) + break; + + list_del(&tb->ltb_list); + + if (rc == 0) { /* no match */ + lnet_free_text_buf(tb); + continue; + } + + /* split into separate networks */ + INIT_LIST_HEAD(&current_nets); + list_add(&tb->ltb_list, &current_nets); + rc = lnet_splitnets(source, &current_nets); + if (rc < 0) + break; + + list_for_each_safe(t, t2, &current_nets) { + tb = list_entry(t, struct lnet_text_buf, ltb_list); + + list_move_tail(&tb->ltb_list, &matched_nets); + + len += scnprintf(networks + len, sizeof(networks) - len, + "%s%s", (len == 0) ? 
"" : ",", + tb->ltb_text); + + if (len >= sizeof(networks)) { + CERROR("Too many matched networks\n"); + rc = -E2BIG; + goto out; + } + } + + count++; + } + + out: + lnet_free_text_bufs(&raw_entries); + lnet_free_text_bufs(&matched_nets); + lnet_free_text_bufs(¤t_nets); + LASSERT(lnet_tbnob == 0); + + if (rc < 0) + return rc; + + *networksp = networks; + return count; +} + +int lnet_inet_enumerate(struct lnet_inetdev **dev_list, struct net *ns) +{ + struct lnet_inetdev *ifaces = NULL; + struct net_device *dev; + int nalloc = 0; + int nip = 0; + DECLARE_CONST_IN_IFADDR(ifa); + + rtnl_lock(); + for_each_netdev(ns, dev) { + int flags = dev_get_flags(dev); + struct in_device *in_dev; + int node_id; + int cpt; + + if (flags & IFF_LOOPBACK) /* skip the loopback IF */ + continue; + + if (!(flags & IFF_UP)) { + CWARN("lnet: Ignoring interface %s: it's down\n", + dev->name); + continue; + } + + in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) { + CWARN("lnet: Interface %s has no IPv4 status.\n", + dev->name); + continue; + } + + node_id = dev_to_node(&dev->dev); + cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id); + + in_dev_for_each_ifa_rtnl(ifa, in_dev) { + if (nip >= nalloc) { + struct lnet_inetdev *tmp; + + nalloc += LNET_INTERFACES_NUM; + tmp = krealloc(ifaces, nalloc * sizeof(*tmp), + GFP_KERNEL); + if (!tmp) { + kfree(ifaces); + ifaces = NULL; + nip = -ENOMEM; + goto unlock_rtnl; + } + ifaces = tmp; + } + + ifaces[nip].li_cpt = cpt; + ifaces[nip].li_flags = flags; + ifaces[nip].li_ipaddr = ntohl(ifa->ifa_local); + ifaces[nip].li_netmask = ntohl(ifa->ifa_mask); + strlcpy(ifaces[nip].li_name, ifa->ifa_label, + sizeof(ifaces[nip].li_name)); + nip++; + } + endfor_ifa(in_dev); + } +unlock_rtnl: + rtnl_unlock(); + + if (nip == 0) { + CERROR("lnet: Can't find any usable interfaces, rc = -ENOENT\n"); + nip = -ENOENT; + } + + *dev_list = ifaces; + return nip; +} +EXPORT_SYMBOL(lnet_inet_enumerate); + +int +lnet_parse_ip2nets(const char **networksp, const char *ip2nets) +{ + struct lnet_inetdev *ifaces = NULL; + __u32 *ipaddrs = NULL; + int nip; + int rc; + int i; + + if (current->nsproxy && current->nsproxy->net_ns) + nip = lnet_inet_enumerate(&ifaces, current->nsproxy->net_ns); + else + nip = lnet_inet_enumerate(&ifaces, &init_net); + if (nip < 0) { + if (nip != -ENOENT) { + LCONSOLE_ERROR_MSG(0x117, + "Error %d enumerating local IP interfaces for ip2nets to match\n", + nip); + } else { + LCONSOLE_ERROR_MSG(0x118, + "No local IP interfaces for ip2nets to match\n"); + } + return nip; + } + + CFS_ALLOC_PTR_ARRAY(ipaddrs, nip); + if (!ipaddrs) { + rc = -ENOMEM; + CERROR("lnet: Can't allocate ipaddrs[%d], rc = %d\n", + nip, rc); + goto out_free_addrs; + } + + for (i = 0; i < nip; i++) + ipaddrs[i] = ifaces[i].li_ipaddr; + + rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip); + if (rc < 0) { + LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc); + } else if (rc == 0) { + LCONSOLE_ERROR_MSG(0x11a, "ip2nets does not match " + "any local IP interfaces\n"); + rc = -ENOENT; + } + CFS_FREE_PTR_ARRAY(ipaddrs, nip); +out_free_addrs: + kfree(ifaces); + return rc > 0 ? 0 : rc; +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-md.c b/drivers/staging/lustrefsx/lnet/lnet/lib-md.c new file mode 100644 index 0000000000000..ba318a2929632 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-md.c @@ -0,0 +1,558 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/lnet/lib-md.c + * + * Memory Descriptor management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/* must be called with lnet_res_lock held */ +void +lnet_md_unlink(struct lnet_libmd *md) +{ + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) { + /* first unlink attempt... */ + struct lnet_me *me = md->md_me; + + md->md_flags |= LNET_MD_FLAG_ZOMBIE; + + /* Disassociate from ME (if any), and unlink it if it was created + * with LNET_UNLINK */ + if (me != NULL) { + /* detach MD from portal */ + lnet_ptl_detach_md(me, md); + if (me->me_unlink == LNET_UNLINK) + lnet_me_unlink(me); + } + + /* ensure all future handle lookups fail */ + lnet_res_lh_invalidate(&md->md_lh); + } + + if (md->md_refcount != 0) { + CDEBUG(D_NET, "Queueing unlink of md %p\n", md); + return; + } + + CDEBUG(D_NET, "Unlinking md %p\n", md); + + LASSERT(!list_empty(&md->md_list)); + list_del_init(&md->md_list); + LASSERT(!(md->md_flags & LNET_MD_FLAG_HANDLING)); + lnet_md_free(md); +} + +struct page * +lnet_kvaddr_to_page(unsigned long vaddr) +{ + if (is_vmalloc_addr((void *)vaddr)) + return vmalloc_to_page((void *)vaddr); + +#ifdef CONFIG_HIGHMEM + +#ifdef HAVE_KMAP_TO_PAGE + /* + * This ifdef is added to handle the kernel versions + * which have kmap_to_page() function exported. If so, + * we should use it. Otherwise, remain with the legacy check. 
+ */ + return kmap_to_page((void *)vaddr); +#else + + if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { + /* No highmem pages only used for bulk (kiov) I/O */ + CERROR("find page for address in highmem\n"); + LBUG(); + } + return virt_to_page(vaddr); +#endif /* HAVE_KMAP_TO_PAGE */ +#else + + return virt_to_page(vaddr); +#endif /* CONFIG_HIGHMEM */ +} +EXPORT_SYMBOL(lnet_kvaddr_to_page); + +struct page * +lnet_get_first_page(struct lnet_libmd *md, unsigned int offset) +{ + unsigned int niov; + struct bio_vec *kiov; + + /* + * if the md_options has a bulk handle then we want to look at the + * bulk md because that's the data which we will be DMAing + */ + if (md && (md->md_options & LNET_MD_BULK_HANDLE) != 0 && + !LNetMDHandleIsInvalid(md->md_bulk_handle)) + md = lnet_handle2md(&md->md_bulk_handle); + + if (!md || md->md_niov == 0) + return NULL; + + kiov = md->md_kiov; + niov = md->md_niov; + + while (offset >= kiov->bv_len) { + offset -= kiov->bv_len; + niov--; + kiov++; + if (niov == 0) { + CERROR("offset %d goes beyond kiov\n", offset); + return NULL; + } + } + + return kiov->bv_page; +} + +int +lnet_cpt_of_md(struct lnet_libmd *md, unsigned int offset) +{ + struct page *page; + int cpt = CFS_CPT_ANY; + + page = lnet_get_first_page(md, offset); + if (!page) { + CDEBUG(D_NET, "Couldn't resolve first page of md %p with offset %u\n", + md, offset); + goto out; + } + + cpt = cfs_cpt_of_node(lnet_cpt_table(), page_to_nid(page)); + +out: + return cpt; +} + +static int lnet_md_validate(const struct lnet_md *umd); + +static struct lnet_libmd * +lnet_md_build(const struct lnet_md *umd, int unlink) +{ + int i; + unsigned int niov; + int total_length = 0; + struct lnet_libmd *lmd; + unsigned int size; + + if (lnet_md_validate(umd) != 0) + return ERR_PTR(-EINVAL); + + if (umd->options & LNET_MD_KIOV) + niov = umd->length; + else + niov = DIV_ROUND_UP(offset_in_page(umd->start) + umd->length, + PAGE_SIZE); + size = offsetof(struct lnet_libmd, md_kiov[niov]); + + if (size <= LNET_SMALL_MD_SIZE) { + lmd = kmem_cache_zalloc(lnet_small_mds_cachep, GFP_NOFS); + if (lmd) { + CDEBUG(D_MALLOC, + "slab-alloced 'md' of size %u at %p.\n", + size, lmd); + } else { + CDEBUG(D_MALLOC, "failed to allocate 'md' of size %u\n", + size); + } + } else { + LIBCFS_ALLOC(lmd, size); + } + + if (!lmd) + return ERR_PTR(-ENOMEM); + + lmd->md_niov = niov; + INIT_LIST_HEAD(&lmd->md_list); + lmd->md_me = NULL; + lmd->md_start = umd->start; + lmd->md_offset = 0; + lmd->md_max_size = umd->max_size; + lmd->md_options = umd->options; + lmd->md_user_ptr = umd->user_ptr; + lmd->md_handler = NULL; + lmd->md_threshold = umd->threshold; + lmd->md_refcount = 0; + lmd->md_flags = (unlink == LNET_UNLINK) ? 
LNET_MD_FLAG_AUTO_UNLINK : 0; + lmd->md_bulk_handle = umd->bulk_handle; + + if (umd->options & LNET_MD_KIOV) { + memcpy(lmd->md_kiov, umd->start, + niov * sizeof(lmd->md_kiov[0])); + + for (i = 0; i < (int)niov; i++) { + /* We take the page pointer on trust */ + if (lmd->md_kiov[i].bv_offset + + lmd->md_kiov[i].bv_len > PAGE_SIZE) { + lnet_md_free(lmd); + return ERR_PTR(-EINVAL); /* invalid length */ + } + + total_length += lmd->md_kiov[i].bv_len; + } + + lmd->md_length = total_length; + + if ((umd->options & LNET_MD_MAX_SIZE) && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) { /* illegal max_size */ + lnet_md_free(lmd); + return ERR_PTR(-EINVAL); + } + } else { /* contiguous - split into pages */ + void *pa = umd->start; + int len = umd->length; + + lmd->md_length = len; + i = 0; + while (len) { + int plen; + + plen = min_t(int, len, PAGE_SIZE - offset_in_page(pa)); + + lmd->md_kiov[i].bv_page = + lnet_kvaddr_to_page((unsigned long) pa); + lmd->md_kiov[i].bv_offset = offset_in_page(pa); + lmd->md_kiov[i].bv_len = plen; + + len -= plen; + pa += plen; + i += 1; + } + WARN(!(lmd->md_options & LNET_MD_GNILND) && i > LNET_MAX_IOV, + "Max IOV exceeded: %d should be < %d\n", + i, LNET_MAX_IOV); + if ((umd->options & LNET_MD_MAX_SIZE) && /* max size used */ + (umd->max_size < 0 || + umd->max_size > (int)umd->length)) { /* illegal max_size */ + lnet_md_free(lmd); + return ERR_PTR(-EINVAL); + } + lmd->md_options |= LNET_MD_KIOV; + } + + return lmd; +} + +/* must be called with resource lock held */ +static void +lnet_md_link(struct lnet_libmd *md, lnet_handler_t handler, int cpt) +{ + struct lnet_res_container *container = the_lnet.ln_md_containers[cpt]; + + /* NB we are passed an allocated, but inactive md. + * Caller may lnet_md_unlink() it, or may lnet_md_free() it. + */ + /* This implementation doesn't know how to create START events or + * disable END events. Best to LASSERT our caller is compliant so + * we find out quickly... */ + /* TODO - reevaluate what should be here in light of + * the removal of the start and end events + * maybe there we shouldn't even allow LNET_EQ_NONE!) + * LASSERT (handler != NULL); + */ + md->md_handler = handler; + + lnet_res_lh_initialize(container, &md->md_lh); + + LASSERT(list_empty(&md->md_list)); + list_add(&md->md_list, &container->rec_active); +} + +void lnet_assert_handler_unused(lnet_handler_t handler) +{ + struct lnet_res_container *container; + int cpt; + + if (!handler) + return; + cfs_percpt_for_each(container, cpt, the_lnet.ln_md_containers) { + struct lnet_libmd *md; + + lnet_res_lock(cpt); + list_for_each_entry(md, &container->rec_active, md_list) + LASSERT(md->md_handler != handler); + lnet_res_unlock(cpt); + } +} +EXPORT_SYMBOL(lnet_assert_handler_unused); + +/* must be called with lnet_res_lock held */ +void +lnet_md_deconstruct(struct lnet_libmd *lmd, struct lnet_event *ev) +{ + ev->md_start = lmd->md_start; + ev->md_options = lmd->md_options; + ev->md_user_ptr = lmd->md_user_ptr; +} + +static int +lnet_md_validate(const struct lnet_md *umd) +{ + if (umd->start == NULL && umd->length != 0) { + CERROR("MD start pointer can not be NULL with length %u\n", + umd->length); + return -EINVAL; + } + + if ((umd->options & LNET_MD_KIOV) && + umd->length > LNET_MAX_IOV) { + CERROR("Invalid option: too many fragments %u, %d max\n", + umd->length, LNET_MAX_IOV); + return -EINVAL; + } + + return 0; +} + +/** + * Create a memory descriptor and attach it to a ME + * + * \param me An ME to associate the new MD with. 
+ * \param umd Provides initial values for the user-visible parts of a MD. + * Other than its use for initialization, there is no linkage between this + * structure and the MD maintained by the LNet. + * \param unlink A flag to indicate whether the MD is automatically unlinked + * when it becomes inactive, either because the operation threshold drops to + * zero or because the available memory becomes less than \a umd.max_size. + * (Note that the check for unlinking a MD only occurs after the completion + * of a successful operation on the MD.) The value LNET_UNLINK enables auto + * unlinking; the value LNET_RETAIN disables it. + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(). + * + * The ME will either be linked to the new MD, or it will be freed. + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + */ +int +LNetMDAttach(struct lnet_me *me, const struct lnet_md *umd, + enum lnet_unlink unlink, struct lnet_handle_md *handle) +{ + LIST_HEAD(matches); + LIST_HEAD(drops); + struct lnet_libmd *md; + int cpt; + + LASSERT(the_lnet.ln_refcount > 0); + LASSERT(!me->me_md); + + if ((umd->options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) { + CERROR("Invalid option: no MD_OP set\n"); + md = ERR_PTR(-EINVAL); + } else + md = lnet_md_build(umd, unlink); + + cpt = me->me_cpt; + lnet_res_lock(cpt); + + if (IS_ERR(md)) { + lnet_me_unlink(me); + lnet_res_unlock(cpt); + return PTR_ERR(md); + } + + lnet_md_link(md, umd->handler, cpt); + + /* attach this MD to portal of ME and check if it matches any + * blocked msgs on this portal */ + lnet_ptl_attach_md(me, md, &matches, &drops); + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + + lnet_drop_delayed_msg_list(&drops, "Bad match"); + lnet_recv_delayed_msg_list(&matches); + + return 0; +} +EXPORT_SYMBOL(LNetMDAttach); + +/** + * Create a "free floating" memory descriptor - a MD that is not associated + * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations. + * + * \param umd,unlink See the discussion for LNetMDAttach(). + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(), + * and LNetGet() operations. + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + */ +int +LNetMDBind(const struct lnet_md *umd, enum lnet_unlink unlink, + struct lnet_handle_md *handle) +{ + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_refcount > 0); + + if ((umd->options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) { + CERROR("Invalid option: GET|PUT illegal on active MDs\n"); + return -EINVAL; + } + + md = lnet_md_build(umd, unlink); + if (IS_ERR(md)) + return PTR_ERR(md); + + if (md->md_length > LNET_MTU) { + CERROR("Invalid length: too big transfer size %u, %d max\n", + md->md_length, LNET_MTU); + rc = -EINVAL; + goto out_free; + } + + cpt = lnet_res_lock_current(); + + lnet_md_link(md, umd->handler, cpt); + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + return 0; + + out_free: + lnet_md_free(md); + return rc; +} +EXPORT_SYMBOL(LNetMDBind); + +/** + * Unlink the memory descriptor from any ME it may be linked to and release + * the internal resources associated with it. As a result, active messages + * associated with the MD may get aborted. 
+ * + * This function does not free the memory region associated with the MD; + * i.e., the memory the user allocated for this MD. If the ME associated with + * this MD is not NULL and was created with auto unlink enabled, the ME is + * unlinked as well (see LNetMEAttach()). + * + * Explicitly unlinking a MD via this function call has the same behavior as + * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK + * is generated in the latter case. + * + * An unlinked event can be reported in two ways: + * - If there's no pending operations on the MD, it's unlinked immediately + * and an LNET_EVENT_UNLINK event is logged before this function returns. + * - Otherwise, the MD is only marked for deletion when this function + * returns, and the unlinked event will be piggybacked on the event of + * the completion of the last operation by setting the unlinked field of + * the event. No dedicated LNET_EVENT_UNLINK event is generated. + * + * Note that in both cases the unlinked field of the event is always set; no + * more event will happen on the MD after such an event is logged. + * + * \param mdh A handle for the MD to be unlinked. + * + * \retval 0 On success. + * \retval -ENOENT If \a mdh does not point to a valid MD object. + */ +int +__LNetMDUnlink(struct lnet_handle_md mdh, bool discard) +{ + struct lnet_event ev; + struct lnet_libmd *md = NULL; + lnet_handler_t handler = NULL; + int cpt; + + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + while (!md) { + md = lnet_handle2md(&mdh); + if (!md) { + lnet_res_unlock(cpt); + return -ENOENT; + } + if (md->md_refcount == 0 && + md->md_flags & LNET_MD_FLAG_HANDLING) { + /* Race with unlocked call to ->md_handler. */ + lnet_md_wait_handling(md, cpt); + md = NULL; + } + } + + md->md_flags |= LNET_MD_FLAG_ABORTED; + /* If the MD is busy, lnet_md_unlink just marks it for deletion, and + * when the LND is done, the completion event flags that the MD was + * unlinked. Otherwise, we enqueue an event now... */ + if (md->md_handler && md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + handler = md->md_handler; + } + + if (discard) + md->md_flags |= LNET_MD_FLAG_DISCARD; + + if (md->md_rspt_ptr != NULL) + lnet_detach_rsp_tracker(md, cpt); + + lnet_md_unlink(md); + + lnet_res_unlock(cpt); + + if (handler) + handler(&ev); + + return 0; +} +EXPORT_SYMBOL(__LNetMDUnlink); + +bool +lnet_md_discarded(struct lnet_libmd *md) +{ + bool rc; + int cpt; + + if (md == NULL) + return false; + + cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + lnet_res_lock(cpt); + rc = md->md_flags & LNET_MD_FLAG_DISCARD; + lnet_res_unlock(cpt); + + return rc; +} +EXPORT_SYMBOL(lnet_md_discarded); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-me.c b/drivers/staging/lustrefsx/lnet/lnet/lib-me.c new file mode 100644 index 0000000000000..8d7c9ee97f94b --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-me.c @@ -0,0 +1,155 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/lnet/lib-me.c + * + * Match Entry management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/** + * Create and attach a match entry to the match list of \a portal. The new + * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach() + * can be used to attach a MD to an empty ME. + * + * \param portal The portal table index where the ME should be attached. + * \param match_id Specifies the match criteria for the process ID of + * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be + * used to wildcard either of the identifiers in the struct lnet_process_id + * structure. + * \param match_bits,ignore_bits Specify the match criteria to apply + * to the match bits in the incoming request. The ignore bits are used + * to mask out insignificant bits in the incoming match bits. The resulting + * bits are then compared to the ME's match bits to determine if the + * incoming request meets the match criteria. + * \param unlink Indicates whether the ME should be unlinked when the memory + * descriptor associated with it is unlinked (Note that the check for + * unlinking a ME only occurs when the memory descriptor is unlinked.). + * Valid values are LNET_RETAIN and LNET_UNLINK. + * \param pos Indicates whether the new ME should be prepended or + * appended to the match list. Allowed constants: LNET_INS_BEFORE, + * LNET_INS_AFTER. + * + * \retval A handle to the newly created ME is returned on success + * \retval ERR_PTR(-EINVAL) If \a portal is invalid. + * \retval ERR_PTR(-ENOMEM) If new ME object cannot be allocated. 
+ */ +struct lnet_me * +LNetMEAttach(unsigned int portal, + struct lnet_processid *match_id, + __u64 match_bits, __u64 ignore_bits, + enum lnet_unlink unlink, enum lnet_ins_pos pos) +{ + struct lnet_match_table *mtable; + struct lnet_me *me; + struct list_head *head; + + LASSERT(the_lnet.ln_refcount > 0); + + if ((int)portal >= the_lnet.ln_nportals) + return ERR_PTR(-EINVAL); + + mtable = lnet_mt_of_attach(portal, match_id, + match_bits, ignore_bits, pos); + if (mtable == NULL) /* can't match portal type */ + return ERR_PTR(-EPERM); + + me = kmem_cache_zalloc(lnet_mes_cachep, GFP_NOFS); + if (me == NULL) { + CDEBUG(D_MALLOC, "failed to allocate 'me'\n"); + return ERR_PTR(-ENOMEM); + } + CDEBUG(D_MALLOC, "slab-alloced 'me' at %p.\n", me); + + lnet_res_lock(mtable->mt_cpt); + + me->me_portal = portal; + me->me_match_id = *match_id; + me->me_match_bits = match_bits; + me->me_ignore_bits = ignore_bits; + me->me_unlink = unlink; + me->me_md = NULL; + + me->me_cpt = mtable->mt_cpt; + + if (ignore_bits != 0) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, match_id, match_bits); + + me->me_pos = head - &mtable->mt_mhash[0]; + if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL) + list_add_tail(&me->me_list, head); + else + list_add(&me->me_list, head); + + lnet_res_unlock(mtable->mt_cpt); + return me; +} +EXPORT_SYMBOL(LNetMEAttach); + +/* call with lnet_res_lock please */ +void +lnet_me_unlink(struct lnet_me *me) +{ + list_del(&me->me_list); + + if (me->me_md != NULL) { + struct lnet_libmd *md = me->me_md; + + /* detach MD from portal of this ME */ + lnet_ptl_detach_md(me, md); + lnet_md_unlink(md); + } + + CDEBUG(D_MALLOC, "slab-freed 'me' at %p.\n", me); + kmem_cache_free(lnet_mes_cachep, me); +} + +#if 0 +static void +lib_me_dump(struct lnet_me *me) +{ + CWARN("Match Entry %p (%#llx)\n", me, + me->me_lh.lh_cookie); + + CWARN("\tMatch/Ignore\t= %016lx / %016lx\n", + me->me_match_bits, me->me_ignore_bits); + + CWARN("\tMD\t= %p\n", me->md); + CWARN("\tprev\t= %p\n", + list_entry(me->me_list.prev, struct lnet_me, me_list)); + CWARN("\tnext\t= %p\n", + list_entry(me->me_list.next, struct lnet_me, me_list)); +} +#endif diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-move.c b/drivers/staging/lustrefsx/lnet/lnet/lib-move.c new file mode 100644 index 0000000000000..5c50c9e179ac2 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-move.c @@ -0,0 +1,5456 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/lnet/lib-move.c + * + * Data movement routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +#include +#include +#include +#include + +static int local_nid_dist_zero = 1; +module_param(local_nid_dist_zero, int, 0444); +MODULE_PARM_DESC(local_nid_dist_zero, "Reserved"); + +struct lnet_send_data { + struct lnet_ni *sd_best_ni; + struct lnet_peer_ni *sd_best_lpni; + struct lnet_peer_ni *sd_final_dst_lpni; + struct lnet_peer *sd_peer; + struct lnet_peer *sd_gw_peer; + struct lnet_peer_ni *sd_gw_lpni; + struct lnet_peer_net *sd_peer_net; + struct lnet_msg *sd_msg; + struct lnet_nid sd_dst_nid; + struct lnet_nid sd_src_nid; + struct lnet_nid sd_rtr_nid; + int sd_cpt; + int sd_md_cpt; + __u32 sd_send_case; +}; + +static inline bool +lnet_msg_is_response(struct lnet_msg *msg) +{ + return msg->msg_type == LNET_MSG_ACK || msg->msg_type == LNET_MSG_REPLY; +} + +static inline bool +lnet_response_tracking_enabled(__u32 msg_type, unsigned int md_options) +{ + if (md_options & LNET_MD_NO_TRACK_RESPONSE) + /* Explicitly disabled in MD options */ + return false; + + if (md_options & LNET_MD_TRACK_RESPONSE) + /* Explicity enabled in MD options */ + return true; + + if (lnet_response_tracking == 3) + /* Enabled for all message types */ + return true; + + if (msg_type == LNET_MSG_PUT) + return lnet_response_tracking == 2; + + if (msg_type == LNET_MSG_GET) + return lnet_response_tracking == 1; + + return false; +} + +static inline struct lnet_comm_count * +get_stats_counts(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type) +{ + switch (stats_type) { + case LNET_STATS_TYPE_SEND: + return &stats->el_send_stats; + case LNET_STATS_TYPE_RECV: + return &stats->el_recv_stats; + case LNET_STATS_TYPE_DROP: + return &stats->el_drop_stats; + default: + CERROR("Unknown stats type\n"); + } + + return NULL; +} + +void lnet_incr_stats(struct lnet_element_stats *stats, + enum lnet_msg_type msg_type, + enum lnet_stats_type stats_type) +{ + struct lnet_comm_count *counts = get_stats_counts(stats, stats_type); + if (!counts) + return; + + switch (msg_type) { + case LNET_MSG_ACK: + atomic_inc(&counts->co_ack_count); + break; + case LNET_MSG_PUT: + atomic_inc(&counts->co_put_count); + break; + case LNET_MSG_GET: + atomic_inc(&counts->co_get_count); + break; + case LNET_MSG_REPLY: + atomic_inc(&counts->co_reply_count); + break; + case LNET_MSG_HELLO: + atomic_inc(&counts->co_hello_count); + break; + default: + CERROR("There is a BUG in the code. 
Unknown message type\n"); + break; + } +} + +__u32 lnet_sum_stats(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type) +{ + struct lnet_comm_count *counts = get_stats_counts(stats, stats_type); + if (!counts) + return 0; + + return (atomic_read(&counts->co_ack_count) + + atomic_read(&counts->co_put_count) + + atomic_read(&counts->co_get_count) + + atomic_read(&counts->co_reply_count) + + atomic_read(&counts->co_hello_count)); +} + +static inline void assign_stats(struct lnet_ioctl_comm_count *msg_stats, + struct lnet_comm_count *counts) +{ + msg_stats->ico_get_count = atomic_read(&counts->co_get_count); + msg_stats->ico_put_count = atomic_read(&counts->co_put_count); + msg_stats->ico_reply_count = atomic_read(&counts->co_reply_count); + msg_stats->ico_ack_count = atomic_read(&counts->co_ack_count); + msg_stats->ico_hello_count = atomic_read(&counts->co_hello_count); +} + +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats, + struct lnet_element_stats *stats) +{ + struct lnet_comm_count *counts; + + LASSERT(msg_stats); + LASSERT(stats); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_SEND); + if (!counts) + return; + assign_stats(&msg_stats->im_send_stats, counts); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_RECV); + if (!counts) + return; + assign_stats(&msg_stats->im_recv_stats, counts); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_DROP); + if (!counts) + return; + assign_stats(&msg_stats->im_drop_stats, counts); +} + +int +lnet_fail_nid(lnet_nid_t nid4, unsigned int threshold) +{ + struct lnet_test_peer *tp; + struct list_head *el; + struct list_head *next; + struct lnet_nid nid; + LIST_HEAD(cull); + + lnet_nid4_to_nid(nid4, &nid); + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + if (threshold != 0) { + /* Adding a new entry */ + LIBCFS_ALLOC(tp, sizeof(*tp)); + if (tp == NULL) + return -ENOMEM; + + tp->tp_nid = nid; + tp->tp_threshold = threshold; + + lnet_net_lock(0); + list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers); + lnet_net_unlock(0); + return 0; + } + + lnet_net_lock(0); + + list_for_each_safe(el, next, &the_lnet.ln_test_peers) { + tp = list_entry(el, struct lnet_test_peer, tp_list); + + if (tp->tp_threshold == 0 || /* needs culling anyway */ + LNET_NID_IS_ANY(&nid) || /* removing all entries */ + nid_same(&tp->tp_nid, &nid)) { /* matched this one */ + list_move(&tp->tp_list, &cull); + } + } + + lnet_net_unlock(0); + + while (!list_empty(&cull)) { + tp = list_entry(cull.next, struct lnet_test_peer, tp_list); + + list_del(&tp->tp_list); + LIBCFS_FREE(tp, sizeof(*tp)); + } + return 0; +} + +static int +fail_peer(struct lnet_nid *nid, int outgoing) +{ + struct lnet_test_peer *tp; + struct list_head *el; + struct list_head *next; + LIST_HEAD(cull); + int fail = 0; + + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + lnet_net_lock(0); + + list_for_each_safe(el, next, &the_lnet.ln_test_peers) { + tp = list_entry(el, struct lnet_test_peer, tp_list); + + if (tp->tp_threshold == 0) { + /* zombie entry */ + if (outgoing) { + /* only cull zombies on outgoing tests, + * since we may be at interrupt priority on + * incoming messages. 
*/ + list_move(&tp->tp_list, &cull); + } + continue; + } + + if (LNET_NID_IS_ANY(&tp->tp_nid) || /* fail every peer */ + nid_same(nid, &tp->tp_nid)) { /* fail this peer */ + fail = 1; + + if (tp->tp_threshold != LNET_MD_THRESH_INF) { + tp->tp_threshold--; + if (outgoing && + tp->tp_threshold == 0) { + /* see above */ + list_move(&tp->tp_list, &cull); + } + } + break; + } + } + + lnet_net_unlock(0); + + while (!list_empty(&cull)) { + tp = list_entry(cull.next, struct lnet_test_peer, tp_list); + list_del(&tp->tp_list); + + LIBCFS_FREE(tp, sizeof(*tp)); + } + + return fail; +} + +unsigned int +lnet_iov_nob(unsigned int niov, struct kvec *iov) +{ + unsigned int nob = 0; + + LASSERT(niov == 0 || iov != NULL); + while (niov-- > 0) + nob += (iov++)->iov_len; + + return (nob); +} +EXPORT_SYMBOL(lnet_iov_nob); + +void +lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, + unsigned int nsiov, struct kvec *siov, unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + + if (nob == 0) + return; + + /* skip complete frags before 'doffset' */ + LASSERT(ndiov > 0); + while (doffset >= diov->iov_len) { + doffset -= diov->iov_len; + diov++; + ndiov--; + LASSERT(ndiov > 0); + } + + /* skip complete frags before 'soffset' */ + LASSERT(nsiov > 0); + while (soffset >= siov->iov_len) { + soffset -= siov->iov_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + LASSERT(ndiov > 0); + LASSERT(nsiov > 0); + this_nob = min3((unsigned int)diov->iov_len - doffset, + (unsigned int)siov->iov_len - soffset, + nob); + + memcpy((char *)diov->iov_base + doffset, + (char *)siov->iov_base + soffset, this_nob); + nob -= this_nob; + + if (diov->iov_len > doffset + this_nob) { + doffset += this_nob; + } else { + diov++; + ndiov--; + doffset = 0; + } + + if (siov->iov_len > soffset + this_nob) { + soffset += this_nob; + } else { + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); +} +EXPORT_SYMBOL(lnet_copy_iov2iov); + +unsigned int +lnet_kiov_nob(unsigned int niov, struct bio_vec *kiov) +{ + unsigned int nob = 0; + + LASSERT(niov == 0 || kiov != NULL); + while (niov-- > 0) + nob += (kiov++)->bv_len; + + return (nob); +} +EXPORT_SYMBOL(lnet_kiov_nob); + +void +lnet_copy_kiov2kiov(unsigned int ndiov, struct bio_vec *diov, + unsigned int doffset, + unsigned int nsiov, struct bio_vec *siov, + unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + char *daddr = NULL; + char *saddr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (ndiov > 0); + while (doffset >= diov->bv_len) { + doffset -= diov->bv_len; + diov++; + ndiov--; + LASSERT(ndiov > 0); + } + + LASSERT(nsiov > 0); + while (soffset >= siov->bv_len) { + soffset -= siov->bv_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + LASSERT(ndiov > 0); + LASSERT(nsiov > 0); + this_nob = min3(diov->bv_len - doffset, + siov->bv_len - soffset, + nob); + + if (daddr == NULL) + daddr = ((char *)kmap(diov->bv_page)) + + diov->bv_offset + doffset; + if (saddr == NULL) + saddr = ((char *)kmap(siov->bv_page)) + + siov->bv_offset + soffset; + + /* Vanishing risk of kmap deadlock when mapping 2 pages. 
+ * However in practice at least one of the kiovs will be mapped + * kernel pages and the map/unmap will be NOOPs */ + + memcpy (daddr, saddr, this_nob); + nob -= this_nob; + + if (diov->bv_len > doffset + this_nob) { + daddr += this_nob; + doffset += this_nob; + } else { + kunmap(diov->bv_page); + daddr = NULL; + diov++; + ndiov--; + doffset = 0; + } + + if (siov->bv_len > soffset + this_nob) { + saddr += this_nob; + soffset += this_nob; + } else { + kunmap(siov->bv_page); + saddr = NULL; + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); + + if (daddr != NULL) + kunmap(diov->bv_page); + if (saddr != NULL) + kunmap(siov->bv_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2kiov); + +void +lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, + unsigned int nkiov, struct bio_vec *kiov, + unsigned int kiovoffset, + unsigned int nob) +{ + /* NB iov, kiov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT(niov > 0); + } + + LASSERT(nkiov > 0); + while (kiovoffset >= kiov->bv_len) { + kiovoffset -= kiov->bv_len; + kiov++; + nkiov--; + LASSERT(nkiov > 0); + } + + do { + LASSERT(niov > 0); + LASSERT(nkiov > 0); + this_nob = min3((unsigned int)iov->iov_len - iovoffset, + (unsigned int)kiov->bv_len - kiovoffset, + nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->bv_page)) + + kiov->bv_offset + kiovoffset; + + memcpy((char *)iov->iov_base + iovoffset, addr, this_nob); + nob -= this_nob; + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + + if (kiov->bv_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->bv_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->bv_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2iov); + +void +lnet_copy_iov2kiov(unsigned int nkiov, struct bio_vec *kiov, + unsigned int kiovoffset, + unsigned int niov, struct kvec *iov, unsigned int iovoffset, + unsigned int nob) +{ + /* NB kiov, iov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (nkiov > 0); + while (kiovoffset >= kiov->bv_len) { + kiovoffset -= kiov->bv_len; + kiov++; + nkiov--; + LASSERT(nkiov > 0); + } + + LASSERT(niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT(niov > 0); + } + + do { + LASSERT(nkiov > 0); + LASSERT(niov > 0); + this_nob = min3((unsigned int)kiov->bv_len - kiovoffset, + (unsigned int)iov->iov_len - iovoffset, + nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->bv_page)) + + kiov->bv_offset + kiovoffset; + + memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob); + nob -= this_nob; + + if (kiov->bv_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->bv_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->bv_page); +} +EXPORT_SYMBOL(lnet_copy_iov2kiov); + +int +lnet_extract_kiov(int dst_niov, struct bio_vec *dst, + int src_niov, struct bio_vec *src, + unsigned int offset, unsigned int len) +{ + /* 
Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT(src_niov > 0); + while (offset >= src->bv_len) { /* skip initial frags */ + offset -= src->bv_len; + src_niov--; + src++; + LASSERT(src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT(src_niov > 0); + LASSERT((int)niov <= dst_niov); + + frag_len = src->bv_len - offset; + dst->bv_page = src->bv_page; + dst->bv_offset = src->bv_offset + offset; + + if (len <= frag_len) { + dst->bv_len = len; + LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); + return niov; + } + + dst->bv_len = frag_len; + LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +EXPORT_SYMBOL(lnet_extract_kiov); + +void +lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int offset, unsigned int mlen, + unsigned int rlen) +{ + unsigned int niov = 0; + struct kvec *iov = NULL; + struct bio_vec *kiov = NULL; + int rc; + + LASSERT (!in_interrupt ()); + LASSERT (mlen == 0 || msg != NULL); + + if (msg != NULL) { + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_sending); + LASSERT(rlen == msg->msg_len); + LASSERT(mlen <= msg->msg_len); + LASSERT(msg->msg_offset == offset); + LASSERT(msg->msg_wanted == mlen); + + msg->msg_receiving = 0; + + if (mlen != 0) { + niov = msg->msg_niov; + kiov = msg->msg_kiov; + + LASSERT (niov > 0); + LASSERT ((iov == NULL) != (kiov == NULL)); + } + } + + rc = (ni->ni_net->net_lnd->lnd_recv)(ni, private, msg, delayed, + niov, kiov, offset, mlen, + rlen); + if (rc < 0) + lnet_finalize(msg, rc); +} + +static void +lnet_setpayloadbuffer(struct lnet_msg *msg) +{ + struct lnet_libmd *md = msg->msg_md; + + LASSERT(msg->msg_len > 0); + LASSERT(!msg->msg_routing); + LASSERT(md != NULL); + LASSERT(msg->msg_niov == 0); + LASSERT(msg->msg_kiov == NULL); + + msg->msg_niov = md->md_niov; + msg->msg_kiov = md->md_kiov; +} + +void +lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_processid *target, + unsigned int offset, unsigned int len) +{ + msg->msg_type = type; + msg->msg_target = *target; + msg->msg_len = len; + msg->msg_offset = offset; + + if (len != 0) + lnet_setpayloadbuffer(msg); + + memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); + msg->msg_hdr.type = type; + /* dest_nid will be overwritten by lnet_select_pathway() */ + msg->msg_hdr.dest_nid = target->nid; + msg->msg_hdr.dest_pid = target->pid; + /* src_nid will be set later */ + msg->msg_hdr.src_pid = the_lnet.ln_pid; + msg->msg_hdr.payload_length = len; +} + +void +lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg) +{ + void *priv = msg->msg_private; + int rc; + + LASSERT(!in_interrupt()); + LASSERT(nid_is_lo0(&ni->ni_nid) || + (msg->msg_txcredit && msg->msg_peertxcredit)); + + rc = (ni->ni_net->net_lnd->lnd_send)(ni, priv, msg); + if (rc < 0) { + msg->msg_no_resend = true; + lnet_finalize(msg, rc); + } +} + +static int +lnet_ni_eager_recv(struct lnet_ni *ni, struct lnet_msg *msg) +{ + int rc; + + LASSERT(!msg->msg_sending); + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_rx_ready_delay); + LASSERT(ni->ni_net->net_lnd->lnd_eager_recv != NULL); + + msg->msg_rx_ready_delay = 1; + rc = (ni->ni_net->net_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, + &msg->msg_private); + if (rc != 0) { + CERROR("recv from %s / send to %s aborted: " 
+ "eager_recv failed %d\n", + libcfs_nidstr(&msg->msg_rxpeer->lpni_nid), + libcfs_idstr(&msg->msg_target), rc); + LASSERT(rc < 0); /* required by my callers */ + } + + return rc; +} + +static bool +lnet_is_peer_deadline_passed(struct lnet_peer_ni *lpni, time64_t now) +{ + time64_t deadline; + + deadline = lpni->lpni_last_alive + + lpni->lpni_net->net_tunables.lct_peer_timeout; + + /* + * assume peer_ni is alive as long as we're within the configured + * peer timeout + */ + if (deadline > now) + return false; + + return true; +} + +/* NB: returns 1 when alive, 0 when dead, negative when error; + * may drop the lnet_net_lock */ +static int +lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lpni, + struct lnet_msg *msg) +{ + time64_t now = ktime_get_seconds(); + + if (!lnet_peer_aliveness_enabled(lpni)) + return -ENODEV; + + /* + * If we're resending a message, let's attempt to send it even if + * the peer is down to fulfill our resend quota on the message + */ + if (msg->msg_retry_count > 0) + return 1; + + /* try and send recovery messages irregardless */ + if (msg->msg_recovery) + return 1; + + /* always send any responses */ + if (lnet_msg_is_response(msg)) + return 1; + + /* always send non-routed messages */ + if (!msg->msg_routing) + return 1; + + if (!lnet_is_peer_deadline_passed(lpni, now)) + return true; + + return lnet_is_peer_ni_alive(lpni); +} + +/** + * \param msg The message to be sent. + * \param do_send True if lnet_ni_send() should be called in this function. + * lnet_send() is going to lnet_net_unlock immediately after this, so + * it sets do_send FALSE and I don't do the unlock/send/lock bit. + * + * \retval LNET_CREDIT_OK If \a msg sent or OK to send. + * \retval LNET_CREDIT_WAIT If \a msg blocked for credit. + * \retval -EHOSTUNREACH If the next hop of the message appears dead. + * \retval -ECANCELED If the MD of the message has been unlinked. 
+ */ +static int +lnet_post_send_locked(struct lnet_msg *msg, int do_send) +{ + struct lnet_peer_ni *lp = msg->msg_txpeer; + struct lnet_ni *ni = msg->msg_txni; + int cpt = msg->msg_tx_cpt; + struct lnet_tx_queue *tq = ni->ni_tx_queues[cpt]; + + /* non-lnet_send() callers have checked before */ + LASSERT(!do_send || msg->msg_tx_delayed); + LASSERT(!msg->msg_receiving); + LASSERT(msg->msg_tx_committed); + + /* can't get here if we're sending to the loopback interface */ + if (the_lnet.ln_loni) + LASSERT(!nid_same(&lp->lpni_nid, &the_lnet.ln_loni->ni_nid)); + + /* NB 'lp' is always the next hop */ + if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && + lnet_peer_alive_locked(ni, lp, msg) == 0) { + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += + msg->msg_len; + lnet_net_unlock(cpt); + if (msg->msg_txpeer) + lnet_incr_stats(&msg->msg_txpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_DROP); + if (msg->msg_txni) + lnet_incr_stats(&msg->msg_txni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_DROP); + + CNETERR("Dropping message for %s: peer not alive\n", + libcfs_idstr(&msg->msg_target)); + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_DROPPED; + if (do_send) + lnet_finalize(msg, -EHOSTUNREACH); + + lnet_net_lock(cpt); + return -EHOSTUNREACH; + } + + if (msg->msg_md != NULL && + (msg->msg_md->md_flags & LNET_MD_FLAG_ABORTED) != 0) { + lnet_net_unlock(cpt); + + CNETERR("Aborting message for %s: LNetM[DE]Unlink() already " + "called on the MD/ME.\n", + libcfs_idstr(&msg->msg_target)); + if (do_send) { + msg->msg_no_resend = true; + CDEBUG(D_NET, "msg %p to %s canceled and will not be resent\n", + msg, libcfs_idstr(&msg->msg_target)); + lnet_finalize(msg, -ECANCELED); + } + + lnet_net_lock(cpt); + return -ECANCELED; + } + + if (!msg->msg_peertxcredit) { + spin_lock(&lp->lpni_lock); + LASSERT((lp->lpni_txcredits < 0) == + !list_empty(&lp->lpni_txq)); + + msg->msg_peertxcredit = 1; + lp->lpni_txqnob += msg->msg_len + sizeof(struct lnet_hdr_nid4); + lp->lpni_txcredits--; + + if (lp->lpni_txcredits < lp->lpni_mintxcredits) + lp->lpni_mintxcredits = lp->lpni_txcredits; + + if (lp->lpni_txcredits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &lp->lpni_txq); + spin_unlock(&lp->lpni_lock); + return LNET_CREDIT_WAIT; + } + spin_unlock(&lp->lpni_lock); + } + + if (!msg->msg_txcredit) { + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + msg->msg_txcredit = 1; + tq->tq_credits--; + atomic_dec(&ni->ni_tx_credits); + + if (tq->tq_credits < tq->tq_credits_min) + tq->tq_credits_min = tq->tq_credits; + + if (tq->tq_credits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &tq->tq_delayed); + return LNET_CREDIT_WAIT; + } + } + + if (unlikely(!list_empty(&the_lnet.ln_delay_rules)) && + lnet_delay_rule_match_locked(&msg->msg_hdr, msg)) { + msg->msg_tx_delayed = 1; + return LNET_CREDIT_WAIT; + } + + /* unset the tx_delay flag as we're going to send it now */ + msg->msg_tx_delayed = 0; + + if (do_send) { + lnet_net_unlock(cpt); + lnet_ni_send(ni, msg); + lnet_net_lock(cpt); + } + return LNET_CREDIT_OK; +} + + +static struct lnet_rtrbufpool * +lnet_msg2bufpool(struct lnet_msg *msg) +{ + struct lnet_rtrbufpool *rbp; + int cpt; + + LASSERT(msg->msg_rx_committed); + + cpt = msg->msg_rx_cpt; + rbp = &the_lnet.ln_rtrpools[cpt][0]; + + LASSERT(msg->msg_len <= LNET_MTU); + while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_SIZE) { + rbp++; + LASSERT(rbp < 
&the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]); + } + + return rbp; +} + +static int +lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) +{ + /* lnet_parse is going to lnet_net_unlock immediately after this, so it + * sets do_recv FALSE and I don't do the unlock/send/lock bit. + * I return LNET_CREDIT_WAIT if msg blocked and LNET_CREDIT_OK if + * received or OK to receive */ + struct lnet_peer_ni *lpni = msg->msg_rxpeer; + struct lnet_peer *lp; + struct lnet_rtrbufpool *rbp; + struct lnet_rtrbuf *rb; + + LASSERT(msg->msg_kiov == NULL); + LASSERT(msg->msg_niov == 0); + LASSERT(msg->msg_routing); + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_sending); + LASSERT(lpni->lpni_peer_net); + LASSERT(lpni->lpni_peer_net->lpn_peer); + + lp = lpni->lpni_peer_net->lpn_peer; + + /* non-lnet_parse callers only receive delayed messages */ + LASSERT(!do_recv || msg->msg_rx_delayed); + + if (!msg->msg_peerrtrcredit) { + /* lpni_lock protects the credit manipulation */ + spin_lock(&lpni->lpni_lock); + + msg->msg_peerrtrcredit = 1; + lpni->lpni_rtrcredits--; + if (lpni->lpni_rtrcredits < lpni->lpni_minrtrcredits) + lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits; + + if (lpni->lpni_rtrcredits < 0) { + spin_unlock(&lpni->lpni_lock); + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + /* lp_lock protects the lp_rtrq */ + spin_lock(&lp->lp_lock); + list_add_tail(&msg->msg_list, &lp->lp_rtrq); + spin_unlock(&lp->lp_lock); + return LNET_CREDIT_WAIT; + } + spin_unlock(&lpni->lpni_lock); + } + + rbp = lnet_msg2bufpool(msg); + + if (!msg->msg_rtrcredit) { + msg->msg_rtrcredit = 1; + rbp->rbp_credits--; + if (rbp->rbp_credits < rbp->rbp_mincredits) + rbp->rbp_mincredits = rbp->rbp_credits; + + if (rbp->rbp_credits < 0) { + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &rbp->rbp_msgs); + return LNET_CREDIT_WAIT; + } + } + + LASSERT(!list_empty(&rbp->rbp_bufs)); + rb = list_entry(rbp->rbp_bufs.next, struct lnet_rtrbuf, rb_list); + list_del(&rb->rb_list); + + msg->msg_niov = rbp->rbp_npages; + msg->msg_kiov = &rb->rb_kiov[0]; + + /* unset the msg-rx_delayed flag since we're receiving the message */ + msg->msg_rx_delayed = 0; + + if (do_recv) { + int cpt = msg->msg_rx_cpt; + + lnet_net_unlock(cpt); + lnet_ni_recv(msg->msg_rxni, msg->msg_private, msg, 1, + 0, msg->msg_len, msg->msg_len); + lnet_net_lock(cpt); + } + return LNET_CREDIT_OK; +} + +void +lnet_return_tx_credits_locked(struct lnet_msg *msg) +{ + struct lnet_peer_ni *txpeer = msg->msg_txpeer; + struct lnet_ni *txni = msg->msg_txni; + struct lnet_msg *msg2; + + if (msg->msg_txcredit) { + struct lnet_ni *ni = msg->msg_txni; + struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt]; + + /* give back NI txcredits */ + msg->msg_txcredit = 0; + + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + tq->tq_credits++; + atomic_inc(&ni->ni_tx_credits); + if (tq->tq_credits <= 0) { + msg2 = list_entry(tq->tq_delayed.next, + struct lnet_msg, msg_list); + list_del(&msg2->msg_list); + + LASSERT(msg2->msg_txni == ni); + LASSERT(msg2->msg_tx_delayed); + LASSERT(msg2->msg_tx_cpt == msg->msg_tx_cpt); + + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (msg->msg_peertxcredit) { + /* give back peer txcredits */ + msg->msg_peertxcredit = 0; + + spin_lock(&txpeer->lpni_lock); + LASSERT((txpeer->lpni_txcredits < 0) == + !list_empty(&txpeer->lpni_txq)); + + txpeer->lpni_txqnob -= 
msg->msg_len + + sizeof(struct lnet_hdr_nid4); + LASSERT(txpeer->lpni_txqnob >= 0); + + txpeer->lpni_txcredits++; + if (txpeer->lpni_txcredits <= 0) { + int msg2_cpt; + + msg2 = list_entry(txpeer->lpni_txq.next, + struct lnet_msg, msg_list); + list_del(&msg2->msg_list); + spin_unlock(&txpeer->lpni_lock); + + LASSERT(msg2->msg_txpeer == txpeer); + LASSERT(msg2->msg_tx_delayed); + + msg2_cpt = msg2->msg_tx_cpt; + + /* + * The msg_cpt can be different from the msg2_cpt + * so we need to make sure we lock the correct cpt + * for msg2. + * Once we call lnet_post_send_locked() it is no + * longer safe to access msg2, since it could've + * been freed by lnet_finalize(), but we still + * need to relock the correct cpt, so we cache the + * msg2_cpt for the purpose of the check that + * follows the call to lnet_pose_send_locked(). + */ + if (msg2_cpt != msg->msg_tx_cpt) { + lnet_net_unlock(msg->msg_tx_cpt); + lnet_net_lock(msg2_cpt); + } + (void) lnet_post_send_locked(msg2, 1); + if (msg2_cpt != msg->msg_tx_cpt) { + lnet_net_unlock(msg2_cpt); + lnet_net_lock(msg->msg_tx_cpt); + } + } else { + spin_unlock(&txpeer->lpni_lock); + } + } + + if (txni != NULL) { + msg->msg_txni = NULL; + lnet_ni_decref_locked(txni, msg->msg_tx_cpt); + } + + if (txpeer != NULL) { + msg->msg_txpeer = NULL; + lnet_peer_ni_decref_locked(txpeer); + } +} + +void +lnet_schedule_blocked_locked(struct lnet_rtrbufpool *rbp) +{ + struct lnet_msg *msg; + + if (list_empty(&rbp->rbp_msgs)) + return; + msg = list_entry(rbp->rbp_msgs.next, + struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + (void)lnet_post_routed_recv_locked(msg, 1); +} + +void +lnet_drop_routed_msgs_locked(struct list_head *list, int cpt) +{ + struct lnet_msg *msg; + struct lnet_msg *tmp; + + lnet_net_unlock(cpt); + + list_for_each_entry_safe(msg, tmp, list, msg_list) { + lnet_ni_recv(msg->msg_rxni, msg->msg_private, NULL, + 0, 0, 0, msg->msg_hdr.payload_length); + list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_ERROR; + lnet_finalize(msg, -ECANCELED); + } + + lnet_net_lock(cpt); +} + +void +lnet_return_rx_credits_locked(struct lnet_msg *msg) +{ + struct lnet_peer_ni *rxpeerni = msg->msg_rxpeer; + struct lnet_peer *lp; + struct lnet_ni *rxni = msg->msg_rxni; + struct lnet_msg *msg2; + + if (msg->msg_rtrcredit) { + /* give back global router credits */ + struct lnet_rtrbuf *rb; + struct lnet_rtrbufpool *rbp; + + /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays + * there until it gets one allocated, or aborts the wait + * itself */ + LASSERT(msg->msg_kiov != NULL); + + rb = list_entry(msg->msg_kiov, struct lnet_rtrbuf, rb_kiov[0]); + rbp = rb->rb_pool; + + msg->msg_kiov = NULL; + msg->msg_rtrcredit = 0; + + LASSERT(rbp == lnet_msg2bufpool(msg)); + + LASSERT((rbp->rbp_credits > 0) == + !list_empty(&rbp->rbp_bufs)); + + /* If routing is now turned off, we just drop this buffer and + * don't bother trying to return credits. */ + if (!the_lnet.ln_routing) { + lnet_destroy_rtrbuf(rb, rbp->rbp_npages); + goto routing_off; + } + + /* It is possible that a user has lowered the desired number of + * buffers in this pool. Make sure we never put back + * more buffers than the stated number. */ + if (unlikely(rbp->rbp_credits >= rbp->rbp_req_nbuffers)) { + /* Discard this buffer so we don't have too + * many. 
*/ + lnet_destroy_rtrbuf(rb, rbp->rbp_npages); + rbp->rbp_nbuffers--; + } else { + list_add(&rb->rb_list, &rbp->rbp_bufs); + rbp->rbp_credits++; + if (rbp->rbp_credits <= 0) + lnet_schedule_blocked_locked(rbp); + } + } + +routing_off: + if (msg->msg_peerrtrcredit) { + LASSERT(rxpeerni); + LASSERT(rxpeerni->lpni_peer_net); + LASSERT(rxpeerni->lpni_peer_net->lpn_peer); + + /* give back peer router credits */ + msg->msg_peerrtrcredit = 0; + + spin_lock(&rxpeerni->lpni_lock); + rxpeerni->lpni_rtrcredits++; + spin_unlock(&rxpeerni->lpni_lock); + + lp = rxpeerni->lpni_peer_net->lpn_peer; + spin_lock(&lp->lp_lock); + + /* drop all messages which are queued to be routed on that + * peer. */ + if (!the_lnet.ln_routing) { + LIST_HEAD(drop); + list_splice_init(&lp->lp_rtrq, &drop); + spin_unlock(&lp->lp_lock); + lnet_drop_routed_msgs_locked(&drop, msg->msg_rx_cpt); + } else if (!list_empty(&lp->lp_rtrq)) { + int msg2_cpt; + + msg2 = list_entry(lp->lp_rtrq.next, + struct lnet_msg, msg_list); + list_del(&msg2->msg_list); + msg2_cpt = msg2->msg_rx_cpt; + spin_unlock(&lp->lp_lock); + /* + * messages on the lp_rtrq can be from any NID in + * the peer, which means they might have different + * cpts. We need to make sure we lock the right + * one. + */ + if (msg2_cpt != msg->msg_rx_cpt) { + lnet_net_unlock(msg->msg_rx_cpt); + lnet_net_lock(msg2_cpt); + } + (void) lnet_post_routed_recv_locked(msg2, 1); + if (msg2_cpt != msg->msg_rx_cpt) { + lnet_net_unlock(msg2_cpt); + lnet_net_lock(msg->msg_rx_cpt); + } + } else { + spin_unlock(&lp->lp_lock); + } + } + if (rxni != NULL) { + msg->msg_rxni = NULL; + lnet_ni_decref_locked(rxni, msg->msg_rx_cpt); + } + if (rxpeerni != NULL) { + msg->msg_rxpeer = NULL; + lnet_peer_ni_decref_locked(rxpeerni); + } +} + +static struct lnet_peer_ni * +lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, + struct lnet_peer *peer, + struct lnet_peer_ni *best_lpni, + struct lnet_peer_net *peer_net) +{ + /* + * Look at the peer NIs for the destination peer that connect + * to the chosen net. If a peer_ni is preferred when using the + * best_ni to communicate, we use that one. If there is no + * preferred peer_ni, or there are multiple preferred peer_ni, + * the available transmit credits are used. If the transmit + * credits are equal, we round-robin over the peer_ni. + */ + struct lnet_peer_ni *lpni = NULL; + int best_lpni_credits = (best_lpni) ? best_lpni->lpni_txcredits : + INT_MIN; + int best_lpni_healthv = (best_lpni) ? 
+ atomic_read(&best_lpni->lpni_healthv) : 0; + bool best_lpni_is_preferred = false; + bool lpni_is_preferred; + int lpni_healthv; + __u32 lpni_sel_prio; + __u32 best_sel_prio = LNET_MAX_SELECTION_PRIORITY; + + while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { + /* + * if the best_ni we've chosen aleady has this lpni + * preferred, then let's use it + */ + if (best_ni) { + lpni_is_preferred = lnet_peer_is_pref_nid_locked( + lpni, &best_ni->ni_nid); + CDEBUG(D_NET, "%s lpni_is_preferred = %d\n", + libcfs_nidstr(&best_ni->ni_nid), + lpni_is_preferred); + } else { + lpni_is_preferred = false; + } + + lpni_healthv = atomic_read(&lpni->lpni_healthv); + lpni_sel_prio = lpni->lpni_sel_priority; + + if (best_lpni) + CDEBUG(D_NET, "n:[%s, %s] h:[%d, %d] p:[%d, %d] c:[%d, %d] s:[%d, %d]\n", + libcfs_nidstr(&lpni->lpni_nid), + libcfs_nidstr(&best_lpni->lpni_nid), + lpni_healthv, best_lpni_healthv, + lpni_sel_prio, best_sel_prio, + lpni->lpni_txcredits, best_lpni_credits, + lpni->lpni_seq, best_lpni->lpni_seq); + else + goto select_lpni; + + /* pick the healthiest peer ni */ + if (lpni_healthv < best_lpni_healthv) + continue; + else if (lpni_healthv > best_lpni_healthv) { + if (best_lpni_is_preferred) + best_lpni_is_preferred = false; + goto select_lpni; + } + + if (lpni_sel_prio > best_sel_prio) + continue; + else if (lpni_sel_prio < best_sel_prio) { + if (best_lpni_is_preferred) + best_lpni_is_preferred = false; + goto select_lpni; + } + + /* if this is a preferred peer use it */ + if (!best_lpni_is_preferred && lpni_is_preferred) { + best_lpni_is_preferred = true; + goto select_lpni; + } else if (best_lpni_is_preferred && !lpni_is_preferred) { + /* this is not the preferred peer so let's ignore + * it. + */ + continue; + } + + if (lpni->lpni_txcredits < best_lpni_credits) + /* We already have a peer that has more credits + * available than this one. No need to consider + * this peer further. + */ + continue; + else if (lpni->lpni_txcredits > best_lpni_credits) + goto select_lpni; + + /* The best peer found so far and the current peer + * have the same number of available credits let's + * make sure to select between them using Round Robin + */ + if (best_lpni && (best_lpni->lpni_seq <= lpni->lpni_seq)) + continue; +select_lpni: + best_lpni_is_preferred = lpni_is_preferred; + best_lpni_healthv = lpni_healthv; + best_sel_prio = lpni_sel_prio; + best_lpni = lpni; + best_lpni_credits = lpni->lpni_txcredits; + } + + /* if we still can't find a peer ni then we can't reach it */ + if (!best_lpni) { + __u32 net_id = (peer_net) ? peer_net->lpn_net_id : + LNET_NIDNET(dst_nid); + CDEBUG(D_NET, "no peer_ni found on peer net %s\n", + libcfs_net2str(net_id)); + return NULL; + } + + CDEBUG(D_NET, "sd_best_lpni = %s\n", + libcfs_nidstr(&best_lpni->lpni_nid)); + + return best_lpni; +} + +/* + * Prerequisite: the best_ni should already be set in the sd + * Find the best lpni. + * If the net id is provided then restrict lpni selection on + * that particular net. + * Otherwise find any reachable lpni. When dealing with an MR + * gateway and it has multiple lpnis which we can use + * we want to select the best one from the list of reachable + * ones. 
+ */ +static inline struct lnet_peer_ni * +lnet_find_best_lpni(struct lnet_ni *lni, lnet_nid_t dst_nid, + struct lnet_peer *peer, __u32 net_id) +{ + struct lnet_peer_net *peer_net; + + /* find the best_lpni on any local network */ + if (net_id == LNET_NET_ANY) { + struct lnet_peer_ni *best_lpni = NULL; + struct lnet_peer_net *lpn; + list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_peer_nets) { + /* no net specified find any reachable peer ni */ + if (!lnet_islocalnet_locked(lpn->lpn_net_id)) + continue; + best_lpni = lnet_select_peer_ni(lni, dst_nid, peer, + best_lpni, lpn); + } + + return best_lpni; + } + /* restrict on the specified net */ + peer_net = lnet_peer_get_net_locked(peer, net_id); + if (peer_net) + return lnet_select_peer_ni(lni, dst_nid, peer, NULL, peer_net); + + return NULL; +} + +static int +lnet_compare_gw_lpnis(struct lnet_peer_ni *lpni1, struct lnet_peer_ni *lpni2) +{ + if (lpni1->lpni_txqnob < lpni2->lpni_txqnob) + return 1; + + if (lpni1->lpni_txqnob > lpni2->lpni_txqnob) + return -1; + + if (lpni1->lpni_txcredits > lpni2->lpni_txcredits) + return 1; + + if (lpni1->lpni_txcredits < lpni2->lpni_txcredits) + return -1; + + return 0; +} + +/* Compare route priorities and hop counts */ +static int +lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2) +{ + int r1_hops = (r1->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r1->lr_hops; + int r2_hops = (r2->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r2->lr_hops; + + if (r1->lr_priority < r2->lr_priority) + return 1; + + if (r1->lr_priority > r2->lr_priority) + return -1; + + if (r1_hops < r2_hops) + return 1; + + if (r1_hops > r2_hops) + return -1; + + return 0; +} + +static struct lnet_route * +lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net, + struct lnet_peer_ni *remote_lpni, + struct lnet_route **prev_route, + struct lnet_peer_ni **gwni) +{ + struct lnet_peer_ni *lpni, *best_gw_ni = NULL; + struct lnet_route *best_route; + struct lnet_route *last_route; + struct lnet_route *route; + int rc; + bool best_rte_is_preferred = false; + struct lnet_nid *gw_pnid; + + CDEBUG(D_NET, "Looking up a route to %s, from %s\n", + libcfs_net2str(rnet->lrn_net), libcfs_net2str(src_net)); + + best_route = last_route = NULL; + list_for_each_entry(route, &rnet->lrn_routes, lr_list) { + if (!lnet_is_route_alive(route)) + continue; + gw_pnid = &route->lr_gateway->lp_primary_nid; + + /* no protection on below fields, but it's harmless */ + if (last_route && (last_route->lr_seq - route->lr_seq < 0)) + last_route = route; + + /* if the best route found is in the preferred list then + * tag it as preferred and use it later on. But if we + * didn't find any routes which are on the preferred list + * then just use the best route possible. + */ + rc = lnet_peer_is_pref_rtr_locked(remote_lpni, gw_pnid); + + if (!best_route || (rc && !best_rte_is_preferred)) { + /* Restrict the selection of the router NI on the + * src_net provided. If the src_net is LNET_NID_ANY, + * then select the best interface available. 
+ */ + lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, + route->lr_gateway, + src_net); + if (!lpni) { + CDEBUG(D_NET, + "Gateway %s does not have a peer NI on net %s\n", + libcfs_nidstr(gw_pnid), + libcfs_net2str(src_net)); + continue; + } + } + + if (rc && !best_rte_is_preferred) { + /* This is the first preferred route we found, + * so it beats any route found previously + */ + best_route = route; + if (!last_route) + last_route = route; + best_gw_ni = lpni; + best_rte_is_preferred = true; + CDEBUG(D_NET, "preferred gw = %s\n", + libcfs_nidstr(gw_pnid)); + continue; + } else if ((!rc) && best_rte_is_preferred) + /* The best route we found so far is in the preferred + * list, so it beats any non-preferred route + */ + continue; + + if (!best_route) { + best_route = last_route = route; + best_gw_ni = lpni; + continue; + } + + rc = lnet_compare_routes(route, best_route); + if (rc == -1) + continue; + + /* Restrict the selection of the router NI on the + * src_net provided. If the src_net is LNET_NID_ANY, + * then select the best interface available. + */ + lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, + route->lr_gateway, + src_net); + if (!lpni) { + CDEBUG(D_NET, + "Gateway %s does not have a peer NI on net %s\n", + libcfs_nidstr(gw_pnid), + libcfs_net2str(src_net)); + continue; + } + + if (rc == 1) { + best_route = route; + best_gw_ni = lpni; + continue; + } + + rc = lnet_compare_gw_lpnis(lpni, best_gw_ni); + if (rc == -1) + continue; + + if (rc == 1 || route->lr_seq <= best_route->lr_seq) { + best_route = route; + best_gw_ni = lpni; + continue; + } + } + + *prev_route = last_route; + *gwni = best_gw_ni; + + return best_route; +} + +static inline unsigned int +lnet_dev_prio_of_md(struct lnet_ni *ni, unsigned int dev_idx) +{ + if (dev_idx == UINT_MAX) + return UINT_MAX; + + if (!ni || !ni->ni_net || !ni->ni_net->net_lnd || + !ni->ni_net->net_lnd->lnd_get_dev_prio) + return UINT_MAX; + + return ni->ni_net->net_lnd->lnd_get_dev_prio(ni, dev_idx); +} + +static struct lnet_ni * +lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, + struct lnet_peer *peer, struct lnet_peer_net *peer_net, + struct lnet_msg *msg, int md_cpt) +{ + struct lnet_libmd *md = msg->msg_md; + unsigned int offset = msg->msg_offset; + unsigned int shortest_distance; + struct lnet_ni *ni = NULL; + int best_credits; + int best_healthv; + __u32 best_sel_prio; + unsigned int best_dev_prio; + unsigned int dev_idx = UINT_MAX; + struct page *page = lnet_get_first_page(md, offset); + msg->msg_rdma_force = lnet_is_rdma_only_page(page); + + if (msg->msg_rdma_force) + dev_idx = lnet_get_dev_idx(page); + + /* + * If there is no peer_ni that we can send to on this network, + * then there is no point in looking for a new best_ni here. 
+ */ + if (!lnet_get_next_peer_ni_locked(peer, peer_net, NULL)) + return best_ni; + + if (best_ni == NULL) { + best_sel_prio = LNET_MAX_SELECTION_PRIORITY; + shortest_distance = UINT_MAX; + best_dev_prio = UINT_MAX; + best_credits = INT_MIN; + best_healthv = 0; + } else { + best_dev_prio = lnet_dev_prio_of_md(best_ni, dev_idx); + shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt, + best_ni->ni_dev_cpt); + best_credits = atomic_read(&best_ni->ni_tx_credits); + best_healthv = atomic_read(&best_ni->ni_healthv); + best_sel_prio = best_ni->ni_sel_priority; + } + + while ((ni = lnet_get_next_ni_locked(local_net, ni))) { + unsigned int distance; + int ni_credits; + int ni_healthv; + int ni_fatal; + __u32 ni_sel_prio; + unsigned int ni_dev_prio; + + ni_credits = atomic_read(&ni->ni_tx_credits); + ni_healthv = atomic_read(&ni->ni_healthv); + ni_fatal = atomic_read(&ni->ni_fatal_error_on); + ni_sel_prio = ni->ni_sel_priority; + + /* + * calculate the distance from the CPT on which + * the message memory is allocated to the CPT of + * the NI's physical device + */ + distance = cfs_cpt_distance(lnet_cpt_table(), + md_cpt, + ni->ni_dev_cpt); + + ni_dev_prio = lnet_dev_prio_of_md(ni, dev_idx); + + /* + * All distances smaller than the NUMA range + * are treated equally. + */ + if (distance < lnet_numa_range) + distance = lnet_numa_range; + + /* + * Select on health, selection policy, direct dma prio, + * shorter distance, available credits, then round-robin. + */ + if (ni_fatal) + continue; + + if (best_ni) + CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d, p:%u, g:%u, h:%d] with best_ni %s [c:%d, d:%d, s:%d, p:%u, g:%u, h:%d]\n", + libcfs_nidstr(&ni->ni_nid), ni_credits, distance, + ni->ni_seq, ni_sel_prio, ni_dev_prio, ni_healthv, + (best_ni) ? libcfs_nidstr(&best_ni->ni_nid) + : "not selected", best_credits, shortest_distance, + (best_ni) ? best_ni->ni_seq : 0, + best_sel_prio, best_dev_prio, best_healthv); + else + goto select_ni; + + if (ni_healthv < best_healthv) + continue; + else if (ni_healthv > best_healthv) + goto select_ni; + + if (ni_sel_prio > best_sel_prio) + continue; + else if (ni_sel_prio < best_sel_prio) + goto select_ni; + + if (ni_dev_prio > best_dev_prio) + continue; + else if (ni_dev_prio < best_dev_prio) + goto select_ni; + + if (distance > shortest_distance) + continue; + else if (distance < shortest_distance) + goto select_ni; + + if (ni_credits < best_credits) + continue; + else if (ni_credits > best_credits) + goto select_ni; + + if (best_ni && best_ni->ni_seq <= ni->ni_seq) + continue; + +select_ni: + best_sel_prio = ni_sel_prio; + best_dev_prio = ni_dev_prio; + shortest_distance = distance; + best_healthv = ni_healthv; + best_ni = ni; + best_credits = ni_credits; + } + + CDEBUG(D_NET, "selected best_ni %s\n", + (best_ni) ? libcfs_nidstr(&best_ni->ni_nid) : "no selection"); + + return best_ni; +} + +static bool +lnet_reserved_msg(struct lnet_msg *msg) +{ + if (msg->msg_type == LNET_MSG_PUT) { + if (msg->msg_hdr.msg.put.ptl_index == LNET_RESERVED_PORTAL) + return true; + } else if (msg->msg_type == LNET_MSG_GET) { + if (msg->msg_hdr.msg.get.ptl_index == LNET_RESERVED_PORTAL) + return true; + } + return false; +} + +/* + * Traffic to the LNET_RESERVED_PORTAL may not trigger peer discovery, + * because such traffic is required to perform discovery. We therefore + * exclude all GET and PUT on that portal. We also exclude all ACK and + * REPLY traffic, but that is because the portal is not tracked in the + * message structure for these message types. 
We could restrict this + * further by also checking for LNET_PROTO_PING_MATCHBITS. + */ +static bool +lnet_msg_discovery(struct lnet_msg *msg) +{ + return !(lnet_reserved_msg(msg) || lnet_msg_is_response(msg)); +} + +#define SRC_SPEC 0x0001 +#define SRC_ANY 0x0002 +#define LOCAL_DST 0x0004 +#define REMOTE_DST 0x0008 +#define MR_DST 0x0010 +#define NMR_DST 0x0020 +#define SND_RESP 0x0040 + +/* The following to defines are used for return codes */ +#define REPEAT_SEND 0x1000 +#define PASS_THROUGH 0x2000 + +/* The different cases lnet_select pathway needs to handle */ +#define SRC_SPEC_LOCAL_MR_DST (SRC_SPEC | LOCAL_DST | MR_DST) +#define SRC_SPEC_ROUTER_MR_DST (SRC_SPEC | REMOTE_DST | MR_DST) +#define SRC_SPEC_LOCAL_NMR_DST (SRC_SPEC | LOCAL_DST | NMR_DST) +#define SRC_SPEC_ROUTER_NMR_DST (SRC_SPEC | REMOTE_DST | NMR_DST) +#define SRC_ANY_LOCAL_MR_DST (SRC_ANY | LOCAL_DST | MR_DST) +#define SRC_ANY_ROUTER_MR_DST (SRC_ANY | REMOTE_DST | MR_DST) +#define SRC_ANY_LOCAL_NMR_DST (SRC_ANY | LOCAL_DST | NMR_DST) +#define SRC_ANY_ROUTER_NMR_DST (SRC_ANY | REMOTE_DST | NMR_DST) + +static int +lnet_handle_lo_send(struct lnet_send_data *sd) +{ + struct lnet_msg *msg = sd->sd_msg; + int cpt = sd->sd_cpt; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return -ESHUTDOWN; + + /* No send credit hassles with LOLND */ + lnet_ni_addref_locked(the_lnet.ln_loni, cpt); + msg->msg_hdr.dest_nid = the_lnet.ln_loni->ni_nid; + if (!msg->msg_routing) + msg->msg_hdr.src_nid = the_lnet.ln_loni->ni_nid; + msg->msg_target.nid = the_lnet.ln_loni->ni_nid; + lnet_msg_commit(msg, cpt); + msg->msg_txni = the_lnet.ln_loni; + + return LNET_CREDIT_OK; +} + +static int +lnet_handle_send(struct lnet_send_data *sd) +{ + struct lnet_ni *best_ni = sd->sd_best_ni; + struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; + struct lnet_peer_ni *final_dst_lpni = sd->sd_final_dst_lpni; + struct lnet_msg *msg = sd->sd_msg; + int cpt2; + __u32 send_case = sd->sd_send_case; + int rc; + __u32 routing = send_case & REMOTE_DST; + struct lnet_rsp_tracker *rspt; + + /* Increment sequence number of the selected peer, peer net, + * local ni and local net so that we pick the next ones + * in Round Robin. + */ + best_lpni->lpni_peer_net->lpn_seq++; + best_lpni->lpni_seq = best_lpni->lpni_peer_net->lpn_seq; + best_ni->ni_net->net_seq++; + best_ni->ni_seq = best_ni->ni_net->net_seq; + + CDEBUG(D_NET, "%s NI seq info: [%d:%d:%d:%u] %s LPNI seq info [%d:%d:%d:%u]\n", + libcfs_nidstr(&best_ni->ni_nid), + best_ni->ni_seq, best_ni->ni_net->net_seq, + atomic_read(&best_ni->ni_tx_credits), + best_ni->ni_sel_priority, + libcfs_nidstr(&best_lpni->lpni_nid), + best_lpni->lpni_seq, best_lpni->lpni_peer_net->lpn_seq, + best_lpni->lpni_txcredits, + best_lpni->lpni_sel_priority); + + /* + * grab a reference on the peer_ni so it sticks around even if + * we need to drop and relock the lnet_net_lock below. + */ + lnet_peer_ni_addref_locked(best_lpni); + + /* + * Use lnet_cpt_of_nid() to determine the CPT used to commit the + * message. This ensures that we get a CPT that is correct for + * the NI when the NI has been restricted to a subset of all CPTs. + * If the selected CPT differs from the one currently locked, we + * must unlock and relock the lnet_net_lock(), and then check whether + * the configuration has changed. We don't have a hold on the best_ni + * yet, and it may have vanished. 
+ */ + cpt2 = lnet_cpt_of_nid_locked(&best_lpni->lpni_nid, best_ni); + if (sd->sd_cpt != cpt2) { + __u32 seq = lnet_get_dlc_seq_locked(); + lnet_net_unlock(sd->sd_cpt); + sd->sd_cpt = cpt2; + lnet_net_lock(sd->sd_cpt); + if (seq != lnet_get_dlc_seq_locked()) { + lnet_peer_ni_decref_locked(best_lpni); + return REPEAT_SEND; + } + } + + /* + * store the best_lpni in the message right away to avoid having + * to do the same operation under different conditions + */ + msg->msg_txpeer = best_lpni; + msg->msg_txni = best_ni; + + /* + * grab a reference for the best_ni since now it's in use in this + * send. The reference will be dropped in lnet_finalize() + */ + lnet_ni_addref_locked(msg->msg_txni, sd->sd_cpt); + + /* + * Always set the target.nid to the best peer picked. Either the + * NID will be one of the peer NIDs selected, or the same NID as + * what was originally set in the target or it will be the NID of + * a router if this message should be routed + */ + msg->msg_target.nid = msg->msg_txpeer->lpni_nid; + + /* + * lnet_msg_commit assigns the correct cpt to the message, which + * is used to decrement the correct refcount on the ni when it's + * time to return the credits + */ + lnet_msg_commit(msg, sd->sd_cpt); + + /* + * If we are routing the message then we keep the src_nid that was + * set by the originator. If we are not routing then we are the + * originator and set it here. + */ + if (!msg->msg_routing) + msg->msg_hdr.src_nid = msg->msg_txni->ni_nid; + + if (routing) { + msg->msg_target_is_router = 1; + msg->msg_target.pid = LNET_PID_LUSTRE; + /* + * since we're routing we want to ensure that the + * msg_hdr.dest_nid is set to the final destination. When + * the router receives this message it knows how to route + * it. + * + * final_dst_lpni is set at the beginning of the + * lnet_select_pathway() function and is never changed. + * It's safe to use it here. + */ + msg->msg_hdr.dest_nid = final_dst_lpni->lpni_nid; + } else { + /* + * if we're not routing set the dest_nid to the best peer + * ni NID that we picked earlier in the algorithm. 
+ */ + msg->msg_hdr.dest_nid = msg->msg_txpeer->lpni_nid; + } + + /* + * if we have response tracker block update it with the next hop + * nid + */ + if (msg->msg_md) { + rspt = msg->msg_md->md_rspt_ptr; + if (rspt) { + rspt->rspt_next_hop_nid = + msg->msg_txpeer->lpni_nid; + CDEBUG(D_NET, "rspt_next_hop_nid = %s\n", + libcfs_nidstr(&rspt->rspt_next_hop_nid)); + } + } + + rc = lnet_post_send_locked(msg, 0); + + if (!rc) + CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) %s : %s try# %d\n", + libcfs_nidstr(&msg->msg_hdr.src_nid), + libcfs_nidstr(&msg->msg_txni->ni_nid), + libcfs_nidstr(&sd->sd_src_nid), + libcfs_nidstr(&msg->msg_hdr.dest_nid), + libcfs_nidstr(&sd->sd_dst_nid), + libcfs_nidstr(&msg->msg_txpeer->lpni_nid), + libcfs_nidstr(&sd->sd_rtr_nid), + lnet_msgtyp2str(msg->msg_type), msg->msg_retry_count); + + return rc; +} + +static inline void +lnet_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, struct lnet_ni *lni, + struct lnet_msg *msg) +{ + if (!lnet_peer_is_multi_rail(lpni->lpni_peer_net->lpn_peer) && + !lnet_msg_is_response(msg) && lpni->lpni_pref_nnids == 0) { + CDEBUG(D_NET, "Setting preferred local NID %s on NMR peer %s\n", + libcfs_nidstr(&lni->ni_nid), + libcfs_nidstr(&lpni->lpni_nid)); + lnet_peer_ni_set_non_mr_pref_nid(lpni, &lni->ni_nid); + } +} + +/* + * Source Specified + * Local Destination + * non-mr peer + * + * use the source and destination NIDs as the pathway + */ +static int +lnet_handle_spec_local_nmr_dst(struct lnet_send_data *sd) +{ + /* the destination lpni is set before we get here. */ + + /* find local NI */ + sd->sd_best_ni = lnet_nid_to_ni_locked(&sd->sd_src_nid, sd->sd_cpt); + if (!sd->sd_best_ni) { + CERROR("Can't send to %s: src %s is not a local nid\n", + libcfs_nidstr(&sd->sd_dst_nid), + libcfs_nidstr(&sd->sd_src_nid)); + return -EINVAL; + } + + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); + + return lnet_handle_send(sd); +} + +/* + * Source Specified + * Local Destination + * MR Peer + * + * Don't run the selection algorithm on the peer NIs. By specifying the + * local NID, we're also saying that we should always use the destination NID + * provided. This handles the case where we should be using the same + * destination NID for the all the messages which belong to the same RPC + * request. + */ +static int +lnet_handle_spec_local_mr_dst(struct lnet_send_data *sd) +{ + sd->sd_best_ni = lnet_nid_to_ni_locked(&sd->sd_src_nid, sd->sd_cpt); + if (!sd->sd_best_ni) { + CERROR("Can't send to %s: src %s is not a local nid\n", + libcfs_nidstr(&sd->sd_dst_nid), + libcfs_nidstr(&sd->sd_src_nid)); + return -EINVAL; + } + + if (sd->sd_best_lpni && + nid_same(&sd->sd_best_lpni->lpni_nid, + &the_lnet.ln_loni->ni_nid)) + return lnet_handle_lo_send(sd); + else if (sd->sd_best_lpni) + return lnet_handle_send(sd); + + CERROR("can't send to %s. no NI on %s\n", + libcfs_nidstr(&sd->sd_dst_nid), + libcfs_net2str(sd->sd_best_ni->ni_net->net_id)); + + return -EHOSTUNREACH; +} + +struct lnet_ni * +lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni, + struct lnet_peer *peer, + struct lnet_peer_net *peer_net, + struct lnet_msg *msg, + int cpt) +{ + struct lnet_net *local_net; + struct lnet_ni *best_ni; + + local_net = lnet_get_net_locked(peer_net->lpn_net_id); + if (!local_net) + return NULL; + + /* + * Iterate through the NIs in this local Net and select + * the NI to send from. The selection is determined by + * these 3 criterion in the following priority: + * 1. NUMA + * 2. NI available credits + * 3. 
Round Robin + */ + best_ni = lnet_get_best_ni(local_net, cur_best_ni, + peer, peer_net, msg, cpt); + + return best_ni; +} + +static int +lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, struct lnet_msg *msg, + int cpt) +{ + struct lnet_peer *peer; + struct lnet_peer_ni *new_lpni; + int rc; + + lnet_peer_ni_addref_locked(lpni); + + peer = lpni->lpni_peer_net->lpn_peer; + + if (lnet_peer_gw_discovery(peer)) { + lnet_peer_ni_decref_locked(lpni); + return 0; + } + + if (!lnet_msg_discovery(msg) || lnet_peer_is_uptodate(peer)) { + lnet_peer_ni_decref_locked(lpni); + return 0; + } + + rc = lnet_discover_peer_locked(lpni, cpt, false); + if (rc) { + lnet_peer_ni_decref_locked(lpni); + return rc; + } + + new_lpni = lnet_find_peer_ni_locked(lnet_nid_to_nid4(&lpni->lpni_nid)); + if (!new_lpni) { + lnet_peer_ni_decref_locked(lpni); + return -ENOENT; + } + + peer = new_lpni->lpni_peer_net->lpn_peer; + spin_lock(&peer->lp_lock); + if (lpni == new_lpni && lnet_peer_is_uptodate_locked(peer)) { + /* The peer NI did not change and the peer is up to date. + * Nothing more to do. + */ + spin_unlock(&peer->lp_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_peer_ni_decref_locked(new_lpni); + return 0; + } + spin_unlock(&peer->lp_lock); + + /* Either the peer NI changed during discovery, or the peer isn't up + * to date. In both cases we want to queue the message on the + * (possibly new) peer's pending queue and queue the peer for discovery + */ + msg->msg_sending = 0; + msg->msg_txpeer = NULL; + lnet_net_unlock(cpt); + lnet_peer_queue_message(peer, msg); + lnet_net_lock(cpt); + + lnet_peer_ni_decref_locked(lpni); + lnet_peer_ni_decref_locked(new_lpni); + + CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n", + msg, libcfs_nidstr(&peer->lp_primary_nid)); + + return LNET_DC_WAIT; +} + +static int +lnet_handle_find_routed_path(struct lnet_send_data *sd, + struct lnet_nid *dst_nid, + struct lnet_peer_ni **gw_lpni, + struct lnet_peer **gw_peer) +{ + int rc; + struct lnet_peer *gw; + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_net *best_lpn = NULL; + struct lnet_remotenet *rnet, *best_rnet = NULL; + struct lnet_route *best_route = NULL; + struct lnet_route *last_route = NULL; + struct lnet_peer_ni *lpni = NULL; + struct lnet_peer_ni *gwni = NULL; + bool route_found = false; + struct lnet_nid *src_nid = + !LNET_NID_IS_ANY(&sd->sd_src_nid) || !sd->sd_best_ni + ? &sd->sd_src_nid + : &sd->sd_best_ni->ni_nid; + int best_lpn_healthv = 0; + __u32 best_lpn_sel_prio = LNET_MAX_SELECTION_PRIORITY; + + CDEBUG(D_NET, "using src nid %s for route restriction\n", + src_nid ? libcfs_nidstr(src_nid) : "ANY"); + + /* If a router nid was specified then we are replying to a GET or + * sending an ACK. In this case we use the gateway associated with the + * specified router nid. + */ + if (!LNET_NID_IS_ANY(&sd->sd_rtr_nid)) { + gwni = lnet_peer_ni_find_locked(&sd->sd_rtr_nid); + if (gwni) { + gw = gwni->lpni_peer_net->lpn_peer; + lnet_peer_ni_decref_locked(gwni); + if (gw->lp_rtr_refcount) + route_found = true; + } else { + CWARN("No peer NI for gateway %s. 
Attempting to find an alternative route.\n", + libcfs_nidstr(&sd->sd_rtr_nid)); + } + } + + if (!route_found) { + if (sd->sd_msg->msg_routing || (src_nid && !LNET_NID_IS_ANY(src_nid))) { + /* If I'm routing this message then I need to find the + * next hop based on the destination NID + * + * We also find next hop based on the destination NID + * if the source NI was specified + */ + best_rnet = lnet_find_rnet_locked(LNET_NID_NET(&sd->sd_dst_nid)); + if (!best_rnet) { + CERROR("Unable to send message from %s to %s - Route table may be misconfigured\n", + (src_nid && LNET_NID_IS_ANY(src_nid)) ? + "any local NI" : + libcfs_nidstr(src_nid), + libcfs_nidstr(&sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + } else { + /* we've already looked up the initial lpni using + * dst_nid + */ + lpni = sd->sd_best_lpni; + /* the peer tree must be in existence */ + LASSERT(lpni && lpni->lpni_peer_net && + lpni->lpni_peer_net->lpn_peer); + lp = lpni->lpni_peer_net->lpn_peer; + + list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { + /* is this remote network reachable? */ + rnet = lnet_find_rnet_locked(lpn->lpn_net_id); + if (!rnet) + continue; + + if (!best_lpn) { + best_lpn = lpn; + best_rnet = rnet; + } + + /* select the preferred peer net */ + if (best_lpn_healthv > lpn->lpn_healthv) + continue; + else if (best_lpn_healthv < lpn->lpn_healthv) + goto use_lpn; + + if (best_lpn_sel_prio < lpn->lpn_sel_priority) + continue; + else if (best_lpn_sel_prio > lpn->lpn_sel_priority) + goto use_lpn; + + if (best_lpn->lpn_seq <= lpn->lpn_seq) + continue; +use_lpn: + best_lpn_healthv = lpn->lpn_healthv; + best_lpn_sel_prio = lpn->lpn_sel_priority; + best_lpn = lpn; + best_rnet = rnet; + } + + if (!best_lpn) { + CERROR("peer %s has no available nets\n", + libcfs_nidstr(&sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + + sd->sd_best_lpni = lnet_find_best_lpni(sd->sd_best_ni, + lnet_nid_to_nid4(&sd->sd_dst_nid), + lp, + best_lpn->lpn_net_id); + if (!sd->sd_best_lpni) { + CERROR("peer %s is unreachable\n", + libcfs_nidstr(&sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + + /* We're attempting to round robin over the remote peer + * NI's so update the final destination we selected + */ + sd->sd_final_dst_lpni = sd->sd_best_lpni; + + /* Increment the sequence number of the remote lpni so + * we can round robin over the different interfaces of + * the remote lpni + */ + sd->sd_best_lpni->lpni_seq++; + } + + /* + * find the best route. Restrict the selection on the net of the + * local NI if we've already picked the local NI to send from. + * Otherwise, let's pick any route we can find and then find + * a local NI we can reach the route's gateway on. Any route we + * select will be reachable by virtue of the restriction we have + * when adding a route. + */ + best_route = lnet_find_route_locked(best_rnet, + LNET_NID_NET(src_nid), + sd->sd_best_lpni, + &last_route, &gwni); + + if (!best_route) { + CERROR("no route to %s from %s\n", + libcfs_nidstr(dst_nid), + libcfs_nidstr(src_nid)); + return -EHOSTUNREACH; + } + + if (!gwni) { + CERROR("Internal Error. Route expected to %s from %s\n", + libcfs_nidstr(dst_nid), + libcfs_nidstr(src_nid)); + return -EFAULT; + } + + gw = best_route->lr_gateway; + LASSERT(gw == gwni->lpni_peer_net->lpn_peer); + } + + /* + * If the router checker is not active then discover the gateway here. 
+ * This ensures we are able to take advantage of multi-rail routing, but + * if the router checker is active then we do not unnecessarily delay + * messages while the gateway is being checked by the dedicated monitor + * thread. + * + * NB: We're only checking the alive_router_check_interval here, rather + * than calling lnet_router_checker_active(), because the other + * conditions that are checked by that function are either + * irrelevant (the_lnet.ln_routing) or must be true (list of routers + * is not empty) + */ + if (alive_router_check_interval <= 0) { + rc = lnet_initiate_peer_discovery(gwni, sd->sd_msg, sd->sd_cpt); + if (rc) + return rc; + } + + if (!sd->sd_best_ni) { + lpn = gwni->lpni_peer_net; + sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, gw, lpn, + sd->sd_msg, + sd->sd_md_cpt); + if (!sd->sd_best_ni) { + CERROR("Internal Error. Expected local ni on %s but none found: %s\n", + libcfs_net2str(lpn->lpn_net_id), + libcfs_nidstr(&sd->sd_src_nid)); + return -EFAULT; + } + } + + *gw_lpni = gwni; + *gw_peer = gw; + + /* + * increment the sequence numbers since now we're sure we're + * going to use this path + */ + if (LNET_NID_IS_ANY(&sd->sd_rtr_nid)) { + LASSERT(best_route && last_route); + best_route->lr_seq = last_route->lr_seq + 1; + if (best_lpn) + best_lpn->lpn_seq++; + } + + return 0; +} + +/* + * Handle two cases: + * + * Case 1: + * Source specified + * Remote destination + * Non-MR destination + * + * Case 2: + * Source specified + * Remote destination + * MR destination + * + * The handling of these two cases is similar. Even though the destination + * can be MR or non-MR, we'll deal directly with the router. + */ +static int +lnet_handle_spec_router_dst(struct lnet_send_data *sd) +{ + int rc; + struct lnet_peer_ni *gw_lpni = NULL; + struct lnet_peer *gw_peer = NULL; + + /* find local NI */ + sd->sd_best_ni = lnet_nid_to_ni_locked(&sd->sd_src_nid, sd->sd_cpt); + if (!sd->sd_best_ni) { + CERROR("Can't send to %s: src %s is not a local nid\n", + libcfs_nidstr(&sd->sd_dst_nid), + libcfs_nidstr(&sd->sd_src_nid)); + return -EINVAL; + } + + rc = lnet_handle_find_routed_path(sd, &sd->sd_dst_nid, + &gw_lpni, &gw_peer); + if (rc) + return rc; + + if (sd->sd_send_case & NMR_DST) + /* + * since the final destination is non-MR let's set its preferred + * NID before we send + */ + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, + sd->sd_msg); + + /* + * We're going to send to the gw found so let's set its + * info + */ + sd->sd_peer = gw_peer; + sd->sd_best_lpni = gw_lpni; + + return lnet_handle_send(sd); +} + +struct lnet_ni * +lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt, + struct lnet_msg *msg, bool discovery) +{ + struct lnet_peer_net *lpn = NULL; + struct lnet_peer_net *best_lpn = NULL; + struct lnet_net *net = NULL; + struct lnet_net *best_net = NULL; + struct lnet_ni *best_ni = NULL; + int best_lpn_healthv = 0; + int best_net_healthv = 0; + int net_healthv; + __u32 best_lpn_sel_prio = LNET_MAX_SELECTION_PRIORITY; + __u32 lpn_sel_prio; + __u32 best_net_sel_prio = LNET_MAX_SELECTION_PRIORITY; + __u32 net_sel_prio; + bool exit = false; + + /* + * The peer can have multiple interfaces, some of them can be on + * the local network and others on a routed network. We should + * prefer the local network.
However if the local network is not + * available then we need to try the routed network + */ + + /* go through all the peer nets and find the best_ni */ + list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_peer_nets) { + /* + * The peer's list of nets can contain non-local nets. We + * want to only examine the local ones. + */ + net = lnet_get_net_locked(lpn->lpn_net_id); + if (!net) + continue; + + lpn_sel_prio = lpn->lpn_sel_priority; + net_healthv = lnet_get_net_healthv_locked(net); + net_sel_prio = net->net_sel_priority; + + /* + * if this is a discovery message and lp_disc_net_id is + * specified then use that net to send the discovery on. + */ + if (peer->lp_disc_net_id == lpn->lpn_net_id && + discovery) { + exit = true; + goto select_lpn; + } + + if (!best_lpn) + goto select_lpn; + + /* always select the lpn with the best health */ + if (best_lpn_healthv > lpn->lpn_healthv) + continue; + else if (best_lpn_healthv < lpn->lpn_healthv) + goto select_lpn; + + /* select the preferred peer and local nets */ + if (best_lpn_sel_prio < lpn_sel_prio) + continue; + else if (best_lpn_sel_prio > lpn_sel_prio) + goto select_lpn; + + if (best_net_healthv > net_healthv) + continue; + else if (best_net_healthv < net_healthv) + goto select_lpn; + + if (best_net_sel_prio < net_sel_prio) + continue; + else if (best_net_sel_prio > net_sel_prio) + goto select_lpn; + + if (best_lpn->lpn_seq < lpn->lpn_seq) + continue; + else if (best_lpn->lpn_seq > lpn->lpn_seq) + goto select_lpn; + + /* round robin over the local networks */ + if (best_net->net_seq <= net->net_seq) + continue; + +select_lpn: + best_net_healthv = net_healthv; + best_net_sel_prio = net_sel_prio; + best_lpn_healthv = lpn->lpn_healthv; + best_lpn_sel_prio = lpn_sel_prio; + best_lpn = lpn; + best_net = net; + + if (exit) + break; + } + + if (best_lpn) { + /* Select the best NI on the same net as best_lpn chosen + * above + */ + best_ni = lnet_find_best_ni_on_spec_net(NULL, peer, best_lpn, + msg, md_cpt); + } + + return best_ni; +} + +static struct lnet_ni * +lnet_find_existing_preferred_best_ni(struct lnet_peer_ni *lpni, int cpt) +{ + struct lnet_ni *best_ni = NULL; + struct lnet_peer_net *peer_net = lpni->lpni_peer_net; + struct lnet_peer_ni *lpni_entry; + + /* + * We must use a consistent source address when sending to a + * non-MR peer. However, a non-MR peer can have multiple NIDs + * on multiple networks, and we may even need to talk to this + * peer on multiple networks -- certain types of + * load-balancing configuration do this. + * + * So we need to pick the NI the peer prefers for this + * particular network. + */ + LASSERT(peer_net); + list_for_each_entry(lpni_entry, &peer_net->lpn_peer_nis, + lpni_peer_nis) { + if (lpni_entry->lpni_pref_nnids == 0) + continue; + LASSERT(lpni_entry->lpni_pref_nnids == 1); + best_ni = lnet_nid_to_ni_locked(&lpni_entry->lpni_pref.nid, + cpt); + break; + } + + return best_ni; +} + +/* Prerequisite: sd->sd_peer and sd->sd_best_lpni should be set */ +static int +lnet_select_preferred_best_ni(struct lnet_send_data *sd) +{ + struct lnet_ni *best_ni = NULL; + + /* + * We must use a consistent source address when sending to a + * non-MR peer. However, a non-MR peer can have multiple NIDs + * on multiple networks, and we may even need to talk to this + * peer on multiple networks -- certain types of + * load-balancing configuration do this. + * + * So we need to pick the NI the peer prefers for this + * particular network. + * + * An exception is traffic on LNET_RESERVED_PORTAL. 
Internal LNet + * traffic doesn't care which source NI is used, and we don't actually + * want to restrict local recovery pings to a single source NI. + */ + if (!lnet_reserved_msg(sd->sd_msg)) + best_ni = lnet_find_existing_preferred_best_ni(sd->sd_best_lpni, + sd->sd_cpt); + + if (!best_ni) + best_ni = lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, + sd->sd_best_lpni->lpni_peer_net, + sd->sd_msg, + sd->sd_md_cpt); + + /* If there is no best_ni we don't have a route */ + if (!best_ni) { + CERROR("no path to %s from net %s\n", + libcfs_nidstr(&sd->sd_best_lpni->lpni_nid), + libcfs_net2str(sd->sd_best_lpni->lpni_net->net_id)); + return -EHOSTUNREACH; + } + + sd->sd_best_ni = best_ni; + + /* Set preferred NI if necessary. */ + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); + + return 0; +} + + +/* + * Source not specified + * Local destination + * Non-MR Peer + * + * always use the same source NID for NMR peers + * If we've talked to that peer before then we already have a preferred + * source NI associated with it. Otherwise, we select a preferred local NI + * and store it in the peer + */ +static int +lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd) +{ + int rc = 0; + + /* sd->sd_best_lpni is already set to the final destination */ + + /* + * At this point we should've created the peer ni and peer. If we + * can't find it, then something went wrong. Instead of assert + * output a relevant message and fail the send + */ + if (!sd->sd_best_lpni) { + CERROR("Internal fault. Unable to send msg %s to %s. NID not known\n", + lnet_msgtyp2str(sd->sd_msg->msg_type), + libcfs_nidstr(&sd->sd_dst_nid)); + return -EFAULT; + } + + if (sd->sd_msg->msg_routing) { + /* If I'm forwarding this message then I can choose any NI + * on the destination peer net + */ + sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, + sd->sd_peer, + sd->sd_best_lpni->lpni_peer_net, + sd->sd_msg, + sd->sd_md_cpt); + if (!sd->sd_best_ni) { + CERROR("Unable to forward message to %s. No local NI available\n", + libcfs_nidstr(&sd->sd_dst_nid)); + rc = -EHOSTUNREACH; + } + } else + rc = lnet_select_preferred_best_ni(sd); + + if (!rc) + rc = lnet_handle_send(sd); + + return rc; +} + +static int +lnet_handle_any_mr_dsta(struct lnet_send_data *sd) +{ + /* + * NOTE we've already handled the remote peer case. So we only + * need to worry about the local case here. + * + * if we're sending a response, ACK or reply, we need to send it + * to the destination NID given to us. At this point we already + * have the peer_ni we're suppose to send to, so just find the + * best_ni on the peer net and use that. Since we're sending to an + * MR peer then we can just run the selection algorithm on our + * local NIs and pick the best one. + */ + if (sd->sd_send_case & SND_RESP) { + sd->sd_best_ni = + lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, + sd->sd_best_lpni->lpni_peer_net, + sd->sd_msg, + sd->sd_md_cpt); + + if (!sd->sd_best_ni) { + /* + * We're not going to deal with not able to send + * a response to the provided final destination + */ + CERROR("Can't send response to %s. No local NI available\n", + libcfs_nidstr(&sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + + return lnet_handle_send(sd); + } + + /* + * If we get here that means we're sending a fresh request, PUT or + * GET, so we need to run our standard selection algorithm. + * First find the best local interface that's on any of the peer's + * networks. 
+ */ + sd->sd_best_ni = lnet_find_best_ni_on_local_net(sd->sd_peer, + sd->sd_md_cpt, + sd->sd_msg, + lnet_msg_discovery(sd->sd_msg)); + if (sd->sd_best_ni) { + sd->sd_best_lpni = + lnet_find_best_lpni(sd->sd_best_ni, + lnet_nid_to_nid4(&sd->sd_dst_nid), + sd->sd_peer, + sd->sd_best_ni->ni_net->net_id); + + /* + * if we're successful in selecting a peer_ni on the local + * network, then send to it. Otherwise fall through and + * try and see if we can reach it over another routed + * network + */ + if (sd->sd_best_lpni && + nid_same(&sd->sd_best_lpni->lpni_nid, + &the_lnet.ln_loni->ni_nid)) { + /* + * in case we initially started with a routed + * destination, let's reset to local + */ + sd->sd_send_case &= ~REMOTE_DST; + sd->sd_send_case |= LOCAL_DST; + return lnet_handle_lo_send(sd); + } else if (sd->sd_best_lpni) { + /* + * in case we initially started with a routed + * destination, let's reset to local + */ + sd->sd_send_case &= ~REMOTE_DST; + sd->sd_send_case |= LOCAL_DST; + return lnet_handle_send(sd); + } + + CERROR("Internal Error. Expected to have a best_lpni: " + "%s -> %s\n", + libcfs_nidstr(&sd->sd_src_nid), + libcfs_nidstr(&sd->sd_dst_nid)); + + return -EFAULT; + } + + /* + * Peer doesn't have a local network. Let's see if there is + * a remote network we can reach it on. + */ + return PASS_THROUGH; +} + +/* + * Case 1: + * Source NID not specified + * Local destination + * MR peer + * + * Case 2: + * Source NID not speified + * Remote destination + * MR peer + * + * In both of these cases if we're sending a response, ACK or REPLY, then + * we need to send to the destination NID provided. + * + * In the remote case let's deal with MR routers. + * + */ + +static int +lnet_handle_any_mr_dst(struct lnet_send_data *sd) +{ + int rc = 0; + struct lnet_peer *gw_peer = NULL; + struct lnet_peer_ni *gw_lpni = NULL; + + /* + * handle sending a response to a remote peer here so we don't + * have to worry about it if we hit lnet_handle_any_mr_dsta() + */ + if (sd->sd_send_case & REMOTE_DST && + sd->sd_send_case & SND_RESP) { + struct lnet_peer_ni *gw; + struct lnet_peer *gw_peer; + + rc = lnet_handle_find_routed_path( + sd, &sd->sd_dst_nid, &gw, &gw_peer); + if (rc < 0) { + CERROR("Can't send response to %s. No route available\n", + libcfs_nidstr(&sd->sd_dst_nid)); + return -EHOSTUNREACH; + } else if (rc > 0) { + return rc; + } + + sd->sd_best_lpni = gw; + sd->sd_peer = gw_peer; + + return lnet_handle_send(sd); + } + + /* + * Even though the NID for the peer might not be on a local network, + * since the peer is MR there could be other interfaces on the + * local network. In that case we'd still like to prefer the local + * network over the routed network. If we're unable to do that + * then we select the best router among the different routed networks, + * and if the router is MR then we can deal with it as such. + */ + rc = lnet_handle_any_mr_dsta(sd); + if (rc != PASS_THROUGH) + return rc; + + /* + * Now that we must route to the destination, we must consider the + * MR case, where the destination has multiple interfaces, some of + * which we can route to and others we do not. For this reason we + * need to select the destination which we can route to and if + * there are multiple, we need to round robin. 
+ */ + rc = lnet_handle_find_routed_path(sd, &sd->sd_dst_nid, + &gw_lpni, &gw_peer); + if (rc) + return rc; + + sd->sd_send_case &= ~LOCAL_DST; + sd->sd_send_case |= REMOTE_DST; + + sd->sd_peer = gw_peer; + sd->sd_best_lpni = gw_lpni; + + return lnet_handle_send(sd); +} + +/* + * Source not specified + * Remote destination + * Non-MR peer + * + * Must send to the specified peer NID using the same source NID that + * we've used before. If it's the first time to talk to that peer then + * find the source NI and assign it as preferred to that peer + */ +static int +lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) +{ + int rc; + struct lnet_peer_ni *gw_lpni = NULL; + struct lnet_peer *gw_peer = NULL; + + /* + * Let's see if we have a preferred NI to talk to this NMR peer + */ + sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd->sd_best_lpni, + sd->sd_cpt); + + /* + * find the router and that'll find the best NI if we didn't find + * it already. + */ + rc = lnet_handle_find_routed_path(sd, &sd->sd_dst_nid, &gw_lpni, + &gw_peer); + if (rc) + return rc; + + /* + * set the best_ni we've chosen as the preferred one for + * this peer + */ + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); + + /* we'll be sending to the gw */ + sd->sd_best_lpni = gw_lpni; + sd->sd_peer = gw_peer; + + return lnet_handle_send(sd); +} + +static int +lnet_handle_send_case_locked(struct lnet_send_data *sd) +{ + /* + * turn off the SND_RESP bit. + * It will be checked in the case handling + */ + __u32 send_case = sd->sd_send_case &= ~SND_RESP ; + + CDEBUG(D_NET, "Source %s%s to %s %s %s destination\n", + (send_case & SRC_SPEC) ? "Specified: " : "ANY", + (send_case & SRC_SPEC) ? libcfs_nidstr(&sd->sd_src_nid) : "", + (send_case & MR_DST) ? "MR: " : "NMR: ", + libcfs_nidstr(&sd->sd_dst_nid), + (send_case & LOCAL_DST) ? "local" : "routed"); + + switch (send_case) { + /* + * For all cases where the source is specified, we should always + * use the destination NID, whether it's an MR destination or not, + * since we're continuing a series of related messages for the + * same RPC + */ + case SRC_SPEC_LOCAL_NMR_DST: + return lnet_handle_spec_local_nmr_dst(sd); + case SRC_SPEC_LOCAL_MR_DST: + return lnet_handle_spec_local_mr_dst(sd); + case SRC_SPEC_ROUTER_NMR_DST: + case SRC_SPEC_ROUTER_MR_DST: + return lnet_handle_spec_router_dst(sd); + case SRC_ANY_LOCAL_NMR_DST: + return lnet_handle_any_local_nmr_dst(sd); + case SRC_ANY_LOCAL_MR_DST: + case SRC_ANY_ROUTER_MR_DST: + return lnet_handle_any_mr_dst(sd); + case SRC_ANY_ROUTER_NMR_DST: + return lnet_handle_any_router_nmr_dst(sd); + default: + CERROR("Unknown send case\n"); + return -1; + } +} + +static int +lnet_select_pathway(struct lnet_nid *src_nid, + struct lnet_nid *dst_nid, + struct lnet_msg *msg, + struct lnet_nid *rtr_nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer *peer; + struct lnet_send_data send_data; + int cpt, rc; + int md_cpt; + __u32 send_case = 0; + bool final_hop; + bool mr_forwarding_allowed; + + memset(&send_data, 0, sizeof(send_data)); + + /* + * get an initial CPT to use for locking. The idea here is not to + * serialize the calls to select_pathway, so that as many + * operations can run concurrently as possible. To do that we use + * the CPT where this call is being executed. Later on when we + * determine the CPT to use in lnet_message_commit, we switch the + * lock and check if there was any configuration change. If none, + * then we proceed, if there is, then we restart the operation. 
+ */ + cpt = lnet_net_lock_current(); + + md_cpt = lnet_cpt_of_md(msg->msg_md, msg->msg_offset); + if (md_cpt == CFS_CPT_ANY) + md_cpt = cpt; + +again: + + /* + * If we're being asked to send to the loopback interface, there + * is no need to go through any selection. We can just shortcut + * the entire process and send over lolnd + */ + send_data.sd_msg = msg; + send_data.sd_cpt = cpt; + if (nid_is_lo0(dst_nid)) { + rc = lnet_handle_lo_send(&send_data); + lnet_net_unlock(cpt); + return rc; + } + + /* + * find an existing peer_ni, or create one and mark it as having been + * created due to network traffic. This call will create the + * peer->peer_net->peer_ni tree. + */ + lpni = lnet_peerni_by_nid_locked(dst_nid, NULL, cpt); + if (IS_ERR(lpni)) { + lnet_net_unlock(cpt); + return PTR_ERR(lpni); + } + + /* + * Cache the original src_nid and rtr_nid. If we need to resend the + * message then we'll need to know whether the src_nid was originally + * specified for this message. If it was originally specified, + * then we need to keep using the same src_nid since it's + * continuing the same sequence of messages. Similarly, rtr_nid will + * affect our choice of next hop. + */ + if (src_nid) + msg->msg_src_nid_param = *src_nid; + else + msg->msg_src_nid_param = LNET_ANY_NID; + if (rtr_nid) + msg->msg_rtr_nid_param = *rtr_nid; + else + msg->msg_rtr_nid_param = LNET_ANY_NID; + + /* + * If necessary, perform discovery on the peer that owns this peer_ni. + * Note, this can result in the ownership of this peer_ni changing + * to another peer object. + */ + rc = lnet_initiate_peer_discovery(lpni, msg, cpt); + if (rc) { + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + return rc; + } + lnet_peer_ni_decref_locked(lpni); + + peer = lpni->lpni_peer_net->lpn_peer; + + /* + * Identify the different send cases + */ + if (!src_nid || LNET_NID_IS_ANY(src_nid)) { + send_case |= SRC_ANY; + if (lnet_get_net_locked(LNET_NID_NET(dst_nid))) + send_case |= LOCAL_DST; + else + send_case |= REMOTE_DST; + } else { + send_case |= SRC_SPEC; + if (LNET_NID_NET(src_nid) == LNET_NID_NET(dst_nid)) + send_case |= LOCAL_DST; + else + send_case |= REMOTE_DST; + } + + final_hop = false; + if (msg->msg_routing && (send_case & LOCAL_DST)) + final_hop = true; + + /* Determine whether to allow MR forwarding for this message. + * NB: MR forwarding is allowed if the message originator and the + * destination are both MR capable, and the destination lpni that was + * originally chosen by the originator is unhealthy or down. + * We check the MR capability of the destination further below + */ + mr_forwarding_allowed = false; + if (final_hop) { + struct lnet_peer *src_lp; + struct lnet_peer_ni *src_lpni; + + src_lpni = lnet_peerni_by_nid_locked(&msg->msg_hdr.src_nid, + NULL, cpt); + /* We don't fail the send if we hit any errors here. We'll just + * try to send it via non-multi-rail criteria + */ + if (!IS_ERR(src_lpni)) { + /* Drop ref taken by lnet_nid2peerni_locked() */ + lnet_peer_ni_decref_locked(src_lpni); + src_lp = lpni->lpni_peer_net->lpn_peer; + if (lnet_peer_is_multi_rail(src_lp) && + !lnet_is_peer_ni_alive(lpni)) + mr_forwarding_allowed = true; + + } + CDEBUG(D_NET, "msg %p MR forwarding %s\n", msg, + mr_forwarding_allowed ? "allowed" : "not allowed"); + } + + /* + * Deal with the peer as NMR in the following cases: + * 1. the peer is NMR + * 2. We're trying to recover a specific peer NI + * 3. I'm a router sending to the final destination and MR forwarding is + * not allowed for this message (as determined above). 
+ * In this case the source of the message would've + * already selected the final destination so my job + * is to honor the selection. + */ + if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery || + (final_hop && !mr_forwarding_allowed)) + send_case |= NMR_DST; + else + send_case |= MR_DST; + + if (lnet_msg_is_response(msg)) + send_case |= SND_RESP; + + /* assign parameters to the send_data */ + if (rtr_nid) + send_data.sd_rtr_nid = *rtr_nid; + else + send_data.sd_rtr_nid = LNET_ANY_NID; + if (src_nid) + send_data.sd_src_nid = *src_nid; + else + send_data.sd_src_nid = LNET_ANY_NID; + send_data.sd_dst_nid = *dst_nid; + send_data.sd_best_lpni = lpni; + /* + * keep a pointer to the final destination in case we're going to + * route, so we'll need to access it later + */ + send_data.sd_final_dst_lpni = lpni; + send_data.sd_peer = peer; + send_data.sd_md_cpt = md_cpt; + send_data.sd_send_case = send_case; + + rc = lnet_handle_send_case_locked(&send_data); + + /* + * Update the local cpt since send_data.sd_cpt might've been + * updated as a result of calling lnet_handle_send_case_locked(). + */ + cpt = send_data.sd_cpt; + + if (rc == REPEAT_SEND) + goto again; + + lnet_net_unlock(cpt); + + return rc; +} + +int +lnet_send(struct lnet_nid *src_nid, struct lnet_msg *msg, + struct lnet_nid *rtr_nid) +{ + struct lnet_nid *dst_nid = &msg->msg_target.nid; + int rc; + + /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ + LASSERT(msg->msg_txpeer == NULL); + LASSERT(msg->msg_txni == NULL); + LASSERT(!msg->msg_sending); + LASSERT(!msg->msg_target_is_router); + LASSERT(!msg->msg_receiving); + + msg->msg_sending = 1; + + LASSERT(!msg->msg_tx_committed); + + rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid); + if (rc < 0) { + if (rc == -EHOSTUNREACH) + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_ERROR; + else + msg->msg_health_status = LNET_MSG_STATUS_LOCAL_ERROR; + return rc; + } + + if (rc == LNET_CREDIT_OK) + lnet_ni_send(msg->msg_txni, msg); + + /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT or LNET_DC_WAIT */ + return 0; +} + +enum lnet_mt_event_type { + MT_TYPE_LOCAL_NI = 0, + MT_TYPE_PEER_NI +}; + +struct lnet_mt_event_info { + enum lnet_mt_event_type mt_type; + struct lnet_nid mt_nid; +}; + +/* called with res_lock held */ +void +lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt) +{ + struct lnet_rsp_tracker *rspt; + + /* + * msg has a refcount on the MD so the MD is not going away. + * The rspt queue for the cpt is protected by + * the lnet_net_lock(cpt). cpt is the cpt of the MD cookie. + */ + if (!md->md_rspt_ptr) + return; + + rspt = md->md_rspt_ptr; + + /* debug code */ + LASSERT(rspt->rspt_cpt == cpt); + + md->md_rspt_ptr = NULL; + + if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) { + /* + * The monitor thread has invalidated this handle because the + * response timed out, but it failed to lookup the MD. That + * means this response tracker is on the zombie list. We can + * safely remove it under the resource lock (held by caller) and + * free the response tracker block. + */ + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, cpt); + } else { + /* + * invalidate the handle to indicate that a response has been + * received, which will then lead the monitor thread to clean up + * the rspt block. 
+ */ + LNetInvalidateMDHandle(&rspt->rspt_mdh); + } +} + +void +lnet_clean_zombie_rstqs(void) +{ + struct lnet_rsp_tracker *rspt, *tmp; + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + list_for_each_entry_safe(rspt, tmp, + the_lnet.ln_mt_zombie_rstqs[i], + rspt_on_list) { + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + } + } + + cfs_percpt_free(the_lnet.ln_mt_zombie_rstqs); +} + +static void +lnet_finalize_expired_responses(void) +{ + struct lnet_libmd *md; + struct lnet_rsp_tracker *rspt, *tmp; + ktime_t now; + int i; + + if (the_lnet.ln_mt_rstq == NULL) + return; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + LIST_HEAD(local_queue); + + lnet_net_lock(i); + if (!the_lnet.ln_mt_rstq[i]) { + lnet_net_unlock(i); + continue; + } + list_splice_init(the_lnet.ln_mt_rstq[i], &local_queue); + lnet_net_unlock(i); + + now = ktime_get(); + + list_for_each_entry_safe(rspt, tmp, &local_queue, rspt_on_list) { + /* + * The rspt mdh will be invalidated when a response + * is received or whenever we want to discard the + * block the monitor thread will walk the queue + * and clean up any rsts with an invalid mdh. + * The monitor thread will walk the queue until + * the first unexpired rspt block. This means that + * some rspt blocks which received their + * corresponding responses will linger in the + * queue until they are cleaned up eventually. + */ + lnet_res_lock(i); + if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) { + lnet_res_unlock(i); + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + continue; + } + + if (ktime_compare(now, rspt->rspt_deadline) >= 0 || + the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) { + struct lnet_peer_ni *lpni; + struct lnet_nid nid; + + md = lnet_handle2md(&rspt->rspt_mdh); + if (!md) { + /* MD has been queued for unlink, but + * rspt hasn't been detached (Note we've + * checked above that the rspt_mdh is + * valid). Since we cannot lookup the MD + * we're unable to detach the rspt + * ourselves. Thus, move the rspt to the + * zombie list where we'll wait for + * either: + * 1. The remaining operations on the + * MD to complete. In this case the + * final operation will result in + * lnet_msg_detach_md()-> + * lnet_detach_rsp_tracker() where + * we will clean up this response + * tracker. + * 2. LNet to shutdown. In this case + * we'll wait until after all LND Nets + * have shutdown and then we can + * safely free any remaining response + * tracker blocks on the zombie list. + * Note: We need to hold the resource + * lock when adding to the zombie list + * because we may have concurrent access + * with lnet_detach_rsp_tracker(). 
+ */ + LNetInvalidateMDHandle(&rspt->rspt_mdh); + list_move(&rspt->rspt_on_list, + the_lnet.ln_mt_zombie_rstqs[i]); + lnet_res_unlock(i); + continue; + } + LASSERT(md->md_rspt_ptr == rspt); + md->md_rspt_ptr = NULL; + lnet_res_unlock(i); + + LNetMDUnlink(rspt->rspt_mdh); + + nid = rspt->rspt_next_hop_nid; + + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + + /* If we're shutting down we just want to clean + * up the rspt blocks + */ + if (the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) + continue; + + lnet_net_lock(i); + the_lnet.ln_counters[i]->lct_health.lch_response_timeout_count++; + lnet_net_unlock(i); + + CDEBUG(D_NET, + "Response timeout: md = %p: nid = %s\n", + md, libcfs_nidstr(&nid)); + + /* + * If there is a timeout on the response + * from the next hop decrement its health + * value so that we don't use it + */ + lnet_net_lock(0); + lpni = lnet_peer_ni_find_locked(&nid); + if (lpni) { + lnet_handle_remote_failure_locked(lpni); + lnet_peer_ni_decref_locked(lpni); + } + lnet_net_unlock(0); + } else { + lnet_res_unlock(i); + break; + } + } + + if (!list_empty(&local_queue)) { + lnet_net_lock(i); + list_splice(&local_queue, the_lnet.ln_mt_rstq[i]); + lnet_net_unlock(i); + } + } +} + +static void +lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt) +{ + struct lnet_msg *msg; + + while (!list_empty(resendq)) { + struct lnet_peer_ni *lpni; + + msg = list_entry(resendq->next, struct lnet_msg, + msg_list); + + list_del_init(&msg->msg_list); + + lpni = lnet_peer_ni_find_locked(&msg->msg_hdr.dest_nid); + if (!lpni) { + lnet_net_unlock(cpt); + CERROR("Expected that a peer is already created for %s\n", + libcfs_nidstr(&msg->msg_hdr.dest_nid)); + msg->msg_no_resend = true; + lnet_finalize(msg, -EFAULT); + lnet_net_lock(cpt); + } else { + int rc; + + lnet_peer_ni_decref_locked(lpni); + + lnet_net_unlock(cpt); + CDEBUG(D_NET, "resending %s->%s: %s recovery %d try# %d\n", + libcfs_nidstr(&msg->msg_src_nid_param), + libcfs_idstr(&msg->msg_target), + lnet_msgtyp2str(msg->msg_type), + msg->msg_recovery, + msg->msg_retry_count); + rc = lnet_send(&msg->msg_src_nid_param, msg, + &msg->msg_rtr_nid_param); + if (rc) { + CERROR("Error sending %s to %s: %d\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_idstr(&msg->msg_target), rc); + msg->msg_no_resend = true; + lnet_finalize(msg, rc); + } + lnet_net_lock(cpt); + if (!rc) + the_lnet.ln_counters[cpt]->lct_health.lch_resend_count++; + } + } +} + +static void +lnet_resend_pending_msgs(void) +{ + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + lnet_net_lock(i); + lnet_resend_pending_msgs_locked(the_lnet.ln_mt_resendqs[i], i); + lnet_net_unlock(i); + } +} + +/* called with cpt and ni_lock held */ +static void +lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt, bool force) +{ + struct lnet_handle_md recovery_mdh; + + LNetInvalidateMDHandle(&recovery_mdh); + + if (ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING || + force) { + recovery_mdh = ni->ni_ping_mdh; + LNetInvalidateMDHandle(&ni->ni_ping_mdh); + } + lnet_ni_unlock(ni); + lnet_net_unlock(cpt); + if (!LNetMDHandleIsInvalid(recovery_mdh)) + LNetMDUnlink(recovery_mdh); + lnet_net_lock(cpt); + lnet_ni_lock(ni); +} + +static void +lnet_recover_local_nis(void) +{ + struct lnet_mt_event_info *ev_info; + LIST_HEAD(processed_list); + LIST_HEAD(local_queue); + struct lnet_handle_md mdh; + struct lnet_ni *tmp; + struct lnet_ni *ni; + struct lnet_nid nid; + int healthv; + int rc; + time64_t now; + + /* + * splice the recovery queue on a local queue. 
We will iterate + * through the local queue and update it as needed. Once we're + * done with the traversal, we'll splice the local queue back on + * the head of the ln_mt_localNIRecovq. Any newly added local NIs + * will be traversed in the next iteration. + */ + lnet_net_lock(0); + list_splice_init(&the_lnet.ln_mt_localNIRecovq, + &local_queue); + lnet_net_unlock(0); + + now = ktime_get_seconds(); + + list_for_each_entry_safe(ni, tmp, &local_queue, ni_recovery) { + /* + * if an NI is being deleted or it is now healthy, there + * is no need to keep it around in the recovery queue. + * The monitor thread is the only thread responsible for + * removing the NI from the recovery queue. + * Multiple threads can be adding NIs to the recovery + * queue. + */ + healthv = atomic_read(&ni->ni_healthv); + + lnet_net_lock(0); + lnet_ni_lock(ni); + if (ni->ni_state != LNET_NI_STATE_ACTIVE || + healthv == LNET_MAX_HEALTH_VALUE) { + list_del_init(&ni->ni_recovery); + lnet_unlink_ni_recovery_mdh_locked(ni, 0, false); + lnet_ni_unlock(ni); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); + continue; + } + + /* + * if the local NI failed recovery we must unlink the md. + * But we want to keep the local_ni on the recovery queue + * so we can continue the attempts to recover it. + */ + if (ni->ni_recovery_state & LNET_NI_RECOVERY_FAILED) { + lnet_unlink_ni_recovery_mdh_locked(ni, 0, true); + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_FAILED; + } + + + lnet_ni_unlock(ni); + + if (now < ni->ni_next_ping) { + lnet_net_unlock(0); + continue; + } + + lnet_net_unlock(0); + + CDEBUG(D_NET, "attempting to recover local ni: %s\n", + libcfs_nidstr(&ni->ni_nid)); + + lnet_ni_lock(ni); + if (!(ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING)) { + ni->ni_recovery_state |= LNET_NI_RECOVERY_PENDING; + lnet_ni_unlock(ni); + + LIBCFS_ALLOC(ev_info, sizeof(*ev_info)); + if (!ev_info) { + CERROR("out of memory. Can't recover %s\n", + libcfs_nidstr(&ni->ni_nid)); + lnet_ni_lock(ni); + ni->ni_recovery_state &= + ~LNET_NI_RECOVERY_PENDING; + lnet_ni_unlock(ni); + continue; + } + + mdh = ni->ni_ping_mdh; + /* + * Invalidate the ni mdh in case it's deleted. + * We'll unlink the mdh in this case below. + */ + LNetInvalidateMDHandle(&ni->ni_ping_mdh); + nid = ni->ni_nid; + + /* + * remove the NI from the local queue and drop the + * reference count to it while we're recovering + * it. The reason for that, is that the NI could + * be deleted, and the way the code is structured + * is if we don't drop the NI, then the deletion + * code will enter a loop waiting for the + * reference count to be removed while holding the + * ln_mutex_lock(). When we look up the peer to + * send to in lnet_select_pathway() we will try to + * lock the ln_mutex_lock() as well, leading to + * a deadlock. By dropping the refcount and + * removing it from the list, we allow for the NI + * to be removed, then we use the cached NID to + * look it up again. If it's gone, then we just + * continue examining the rest of the queue. 
+ */ + lnet_net_lock(0); + list_del_init(&ni->ni_recovery); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); + + ev_info->mt_type = MT_TYPE_LOCAL_NI; + ev_info->mt_nid = nid; + rc = lnet_send_ping(&nid, &mdh, LNET_INTERFACES_MIN, + ev_info, the_lnet.ln_mt_handler, + true); + /* lookup the nid again */ + lnet_net_lock(0); + ni = lnet_nid_to_ni_locked(&nid, 0); + if (!ni) { + /* + * the NI has been deleted when we dropped + * the ref count + */ + lnet_net_unlock(0); + LNetMDUnlink(mdh); + continue; + } + ni->ni_ping_count++; + + ni->ni_ping_mdh = mdh; + lnet_ni_add_to_recoveryq_locked(ni, &processed_list, + now); + + if (rc) { + lnet_ni_lock(ni); + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + lnet_ni_unlock(ni); + } + lnet_net_unlock(0); + } else + lnet_ni_unlock(ni); + } + + /* + * put back the remaining NIs on the ln_mt_localNIRecovq to be + * reexamined in the next iteration. + */ + list_splice_init(&processed_list, &local_queue); + lnet_net_lock(0); + list_splice(&local_queue, &the_lnet.ln_mt_localNIRecovq); + lnet_net_unlock(0); +} + +static int +lnet_resendqs_create(void) +{ + struct list_head **resendqs; + resendqs = lnet_create_array_of_queues(); + + if (!resendqs) + return -ENOMEM; + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_resendqs = resendqs; + lnet_net_unlock(LNET_LOCK_EX); + + return 0; +} + +static void +lnet_clean_local_ni_recoveryq(void) +{ + struct lnet_ni *ni; + + /* This is only called when the monitor thread has stopped */ + lnet_net_lock(0); + + while (!list_empty(&the_lnet.ln_mt_localNIRecovq)) { + ni = list_entry(the_lnet.ln_mt_localNIRecovq.next, + struct lnet_ni, ni_recovery); + list_del_init(&ni->ni_recovery); + lnet_ni_lock(ni); + lnet_unlink_ni_recovery_mdh_locked(ni, 0, true); + lnet_ni_unlock(ni); + lnet_ni_decref_locked(ni, 0); + } + + lnet_net_unlock(0); +} + +static void +lnet_unlink_lpni_recovery_mdh_locked(struct lnet_peer_ni *lpni, int cpt, + bool force) +{ + struct lnet_handle_md recovery_mdh; + + LNetInvalidateMDHandle(&recovery_mdh); + + if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING || force) { + recovery_mdh = lpni->lpni_recovery_ping_mdh; + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); + } + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(cpt); + if (!LNetMDHandleIsInvalid(recovery_mdh)) + LNetMDUnlink(recovery_mdh); + lnet_net_lock(cpt); + spin_lock(&lpni->lpni_lock); +} + +static void +lnet_clean_peer_ni_recoveryq(void) +{ + struct lnet_peer_ni *lpni, *tmp; + + lnet_net_lock(LNET_LOCK_EX); + + list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_mt_peerNIRecovq, + lpni_recovery) { + list_del_init(&lpni->lpni_recovery); + spin_lock(&lpni->lpni_lock); + lnet_unlink_lpni_recovery_mdh_locked(lpni, LNET_LOCK_EX, true); + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + } + + lnet_net_unlock(LNET_LOCK_EX); +} + +static void +lnet_clean_resendqs(void) +{ + struct lnet_msg *msg, *tmp; + LIST_HEAD(msgs); + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + lnet_net_lock(i); + list_splice_init(the_lnet.ln_mt_resendqs[i], &msgs); + lnet_net_unlock(i); + list_for_each_entry_safe(msg, tmp, &msgs, msg_list) { + list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + lnet_finalize(msg, -ESHUTDOWN); + } + } + + cfs_percpt_free(the_lnet.ln_mt_resendqs); +} + +static void +lnet_recover_peer_nis(void) +{ + struct lnet_mt_event_info *ev_info; + LIST_HEAD(processed_list); + LIST_HEAD(local_queue); + struct lnet_handle_md mdh; + struct lnet_peer_ni *lpni; + struct lnet_peer_ni *tmp; + struct lnet_nid nid; + 
int healthv; + int rc; + time64_t now; + + /* + * Always use cpt 0 for locking across all interactions with + * ln_mt_peerNIRecovq + */ + lnet_net_lock(0); + list_splice_init(&the_lnet.ln_mt_peerNIRecovq, + &local_queue); + lnet_net_unlock(0); + + now = ktime_get_seconds(); + + list_for_each_entry_safe(lpni, tmp, &local_queue, + lpni_recovery) { + /* + * The same protection strategy is used here as is in the + * local recovery case. + */ + lnet_net_lock(0); + healthv = atomic_read(&lpni->lpni_healthv); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_state & LNET_PEER_NI_DELETING || + healthv == LNET_MAX_HEALTH_VALUE) { + list_del_init(&lpni->lpni_recovery); + lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, false); + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + continue; + } + + /* + * If the peer NI has failed recovery we must unlink the + * md. But we want to keep the peer ni on the recovery + * queue so we can try to continue recovering it + */ + if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_FAILED) { + lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, true); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_FAILED; + } + + spin_unlock(&lpni->lpni_lock); + + if (now < lpni->lpni_next_ping) { + lnet_net_unlock(0); + continue; + } + + lnet_net_unlock(0); + + /* + * NOTE: we're racing with peer deletion from user space. + * It's possible that a peer is deleted after we check its + * state. In this case the recovery can create a new peer + */ + spin_lock(&lpni->lpni_lock); + if (!(lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING) && + !(lpni->lpni_state & LNET_PEER_NI_DELETING)) { + lpni->lpni_state |= LNET_PEER_NI_RECOVERY_PENDING; + spin_unlock(&lpni->lpni_lock); + + LIBCFS_ALLOC(ev_info, sizeof(*ev_info)); + if (!ev_info) { + CERROR("out of memory. Can't recover %s\n", + libcfs_nidstr(&lpni->lpni_nid)); + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + spin_unlock(&lpni->lpni_lock); + continue; + } + + /* look at the comments in lnet_recover_local_nis() */ + mdh = lpni->lpni_recovery_ping_mdh; + nid = lpni->lpni_nid; + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); + lnet_net_lock(0); + list_del_init(&lpni->lpni_recovery); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + + ev_info->mt_type = MT_TYPE_PEER_NI; + ev_info->mt_nid = nid; + rc = lnet_send_ping(&nid, &mdh, LNET_INTERFACES_MIN, + ev_info, the_lnet.ln_mt_handler, + true); + lnet_net_lock(0); + /* + * lnet_find_peer_ni_locked() grabs a refcount for + * us. No need to take it explicitly. + */ + lpni = lnet_peer_ni_find_locked(&nid); + if (!lpni) { + lnet_net_unlock(0); + LNetMDUnlink(mdh); + continue; + } + + lpni->lpni_ping_count++; + + lpni->lpni_recovery_ping_mdh = mdh; + + lnet_peer_ni_add_to_recoveryq_locked(lpni, + &processed_list, + now); + if (rc) { + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + spin_unlock(&lpni->lpni_lock); + } + + /* Drop the ref taken by lnet_find_peer_ni_locked() */ + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + } else + spin_unlock(&lpni->lpni_lock); + } + + list_splice_init(&processed_list, &local_queue); + lnet_net_lock(0); + list_splice(&local_queue, &the_lnet.ln_mt_peerNIRecovq); + lnet_net_unlock(0); +} + +static int +lnet_monitor_thread(void *arg) +{ + time64_t rsp_timeout = 0; + time64_t now; + + wait_for_completion(&the_lnet.ln_started); + /* + * The monitor thread takes care of the following: + * 1. Checks the aliveness of routers + * 2. 
Checks if there are messages on the resend queue to resend + * them. + * 3. Check if there are any NIs on the local recovery queue and + * pings them + * 4. Checks if there are any NIs on the remote recovery queue + * and pings them. + */ + while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) { + now = ktime_get_real_seconds(); + + if (lnet_router_checker_active()) + lnet_check_routers(); + + lnet_resend_pending_msgs(); + + if (now >= rsp_timeout) { + lnet_finalize_expired_responses(); + rsp_timeout = now + (lnet_transaction_timeout / 2); + } + + lnet_recover_local_nis(); + lnet_recover_peer_nis(); + + /* + * TODO do we need to check if we should sleep without + * timeout? Technically, an active system will always + * have messages in flight so this check will always + * evaluate to false. And on an idle system do we care + * if we wake up every 1 second? Although, we've seen + * cases where we get a complaint that an idle thread + * is waking up unnecessarily. + */ + wait_for_completion_interruptible_timeout( + &the_lnet.ln_mt_wait_complete, + cfs_time_seconds(1)); + /* Must re-init the completion before testing anything, + * including ln_mt_state. + */ + reinit_completion(&the_lnet.ln_mt_wait_complete); + } + + /* Shutting down */ + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); + + /* signal that the monitor thread is exiting */ + up(&the_lnet.ln_mt_signal); + + return 0; +} + +/* + * lnet_send_ping + * Sends a ping. + * Returns == 0 if success + * Returns > 0 if LNetMDBind or prior fails + * Returns < 0 if LNetGet fails + */ +int +lnet_send_ping(struct lnet_nid *dest_nid, + struct lnet_handle_md *mdh, int nnis, + void *user_data, lnet_handler_t handler, bool recovery) +{ + struct lnet_md md = { NULL }; + struct lnet_process_id id; + struct lnet_ping_buffer *pbuf; + int rc; + + if (LNET_NID_IS_ANY(dest_nid)) { + rc = -EHOSTUNREACH; + goto fail_error; + } + + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (!pbuf) { + rc = ENOMEM; + goto fail_error; + } + + /* initialize md content */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(nnis); + md.threshold = 2; /* GET/REPLY */ + md.max_size = 0; + md.options = LNET_MD_TRUNCATE | LNET_MD_TRACK_RESPONSE; + md.user_ptr = user_data; + md.handler = handler; + + rc = LNetMDBind(&md, LNET_UNLINK, mdh); + if (rc) { + lnet_ping_buffer_decref(pbuf); + CERROR("Can't bind MD: %d\n", rc); + rc = -rc; /* change the rc to positive */ + goto fail_error; + } + id.pid = LNET_PID_LUSTRE; + id.nid = lnet_nid_to_nid4(dest_nid); + + rc = LNetGet(LNET_NID_ANY, *mdh, id, + LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0, recovery); + + if (rc) + goto fail_unlink_md; + + return 0; + +fail_unlink_md: + LNetMDUnlink(*mdh); + LNetInvalidateMDHandle(mdh); +fail_error: + return rc; +} + +static void +lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, + int status, bool send, bool unlink_event) +{ + struct lnet_nid *nid = &ev_info->mt_nid; + + if (ev_info->mt_type == MT_TYPE_LOCAL_NI) { + struct lnet_ni *ni; + + lnet_net_lock(0); + ni = lnet_nid_to_ni_locked(nid, 0); + if (!ni) { + lnet_net_unlock(0); + return; + } + lnet_ni_lock(ni); + if (!send || (send && status != 0)) + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + if (status) + ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED; + lnet_ni_unlock(ni); + lnet_net_unlock(0); + + if (status != 0) { + CERROR("local NI (%s) recovery failed with %d\n", + libcfs_nidstr(nid), status); + return; + } + /* + * need to 
increment healthv for the ni here, because in + * the lnet_finalize() path we don't have access to this + * NI. And in order to get access to it, we'll need to + * carry forward too much information. + * In the peer case, it'll naturally be incremented + */ + if (!unlink_event) + lnet_inc_healthv(&ni->ni_healthv, + lnet_health_sensitivity); + } else { + struct lnet_peer_ni *lpni; + int cpt; + + cpt = lnet_net_lock_current(); + lpni = lnet_peer_ni_find_locked(nid); + if (!lpni) { + lnet_net_unlock(cpt); + return; + } + spin_lock(&lpni->lpni_lock); + if (!send || (send && status != 0)) + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + if (status) + lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED; + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + + if (status != 0) + CERROR("peer NI (%s) recovery failed with %d\n", + libcfs_nidstr(nid), status); + } +} + +void +lnet_mt_event_handler(struct lnet_event *event) +{ + struct lnet_mt_event_info *ev_info = event->md_user_ptr; + struct lnet_ping_buffer *pbuf; + + /* TODO: remove assert */ + LASSERT(event->type == LNET_EVENT_REPLY || + event->type == LNET_EVENT_SEND || + event->type == LNET_EVENT_UNLINK); + + CDEBUG(D_NET, "Received event: %d status: %d\n", event->type, + event->status); + + switch (event->type) { + case LNET_EVENT_UNLINK: + CDEBUG(D_NET, "%s recovery ping unlinked\n", + libcfs_nidstr(&ev_info->mt_nid)); + fallthrough; + case LNET_EVENT_REPLY: + lnet_handle_recovery_reply(ev_info, event->status, false, + event->type == LNET_EVENT_UNLINK); + break; + case LNET_EVENT_SEND: + CDEBUG(D_NET, "%s recovery message sent %s:%d\n", + libcfs_nidstr(&ev_info->mt_nid), + (event->status) ? "unsuccessfully" : + "successfully", event->status); + lnet_handle_recovery_reply(ev_info, event->status, true, false); + break; + default: + CERROR("Unexpected event: %d\n", event->type); + break; + } + if (event->unlinked) { + LIBCFS_FREE(ev_info, sizeof(*ev_info)); + pbuf = LNET_PING_INFO_TO_BUFFER(event->md_start); + lnet_ping_buffer_decref(pbuf); + } +} + +static int +lnet_rsp_tracker_create(void) +{ + struct list_head **rstqs; + rstqs = lnet_create_array_of_queues(); + + if (!rstqs) + return -ENOMEM; + + the_lnet.ln_mt_rstq = rstqs; + + return 0; +} + +static void +lnet_rsp_tracker_clean(void) +{ + lnet_finalize_expired_responses(); + + cfs_percpt_free(the_lnet.ln_mt_rstq); + the_lnet.ln_mt_rstq = NULL; +} + +int lnet_monitor_thr_start(void) +{ + int rc = 0; + struct task_struct *task; + + if (the_lnet.ln_mt_state != LNET_MT_STATE_SHUTDOWN) + return -EALREADY; + + rc = lnet_resendqs_create(); + if (rc) + return rc; + + rc = lnet_rsp_tracker_create(); + if (rc) + goto clean_queues; + + sema_init(&the_lnet.ln_mt_signal, 0); + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_RUNNING; + lnet_net_unlock(LNET_LOCK_EX); + task = kthread_run(lnet_monitor_thread, NULL, "monitor_thread"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Can't start monitor thread: %d\n", rc); + goto clean_thread; + } + + return 0; + +clean_thread: + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); + /* block until event callback signals exit */ + down(&the_lnet.ln_mt_signal); + /* clean up */ + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); + 
the_lnet.ln_mt_handler = NULL; + return rc; +clean_queues: + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); + return rc; +} + +void lnet_monitor_thr_stop(void) +{ + if (the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) + return; + + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING); + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); + + /* tell the monitor thread that we're shutting down */ + complete(&the_lnet.ln_mt_wait_complete); + + /* block until monitor thread signals that it's done */ + mutex_unlock(&the_lnet.ln_api_mutex); + down(&the_lnet.ln_mt_signal); + mutex_lock(&the_lnet.ln_api_mutex); + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN); + + /* perform cleanup tasks */ + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); +} + +void +lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob, + __u32 msg_type) +{ + lnet_net_lock(cpt); + lnet_incr_stats(&ni->ni_stats, msg_type, LNET_STATS_TYPE_DROP); + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += nob; + lnet_net_unlock(cpt); + + lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); +} + +static void +lnet_recv_put(struct lnet_ni *ni, struct lnet_msg *msg) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + + if (msg->msg_wanted != 0) + lnet_setpayloadbuffer(msg); + + lnet_build_msg_event(msg, LNET_EVENT_PUT); + + /* Must I ACK? If so I'll grab the ack_wmd out of the header and put + * it back into the ACK during lnet_finalize() */ + msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && + (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0); + + lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed, + msg->msg_offset, msg->msg_wanted, hdr->payload_length); +} + +static int +lnet_parse_put(struct lnet_ni *ni, struct lnet_msg *msg) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_match_info info; + int rc; + bool ready_delay; + + /* Convert put fields to host byte order */ + hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits); + hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); + hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); + + /* Primary peer NID. 
*/ + info.mi_id.nid = msg->msg_initiator; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + info.mi_cpt = lnet_nid2cpt(&msg->msg_initiator, ni); + + msg->msg_rx_ready_delay = ni->ni_net->net_lnd->lnd_eager_recv == NULL; + ready_delay = msg->msg_rx_ready_delay; + + again: + rc = lnet_ptl_match_md(&info, msg); + switch (rc) { + default: + LBUG(); + + case LNET_MATCHMD_OK: + lnet_recv_put(ni, msg); + return 0; + + case LNET_MATCHMD_NONE: + if (ready_delay) + /* no eager_recv or has already called it, should + * have been attached on delayed list */ + return 0; + + rc = lnet_ni_eager_recv(ni, msg); + if (rc == 0) { + ready_delay = true; + goto again; + } + fallthrough; + + case LNET_MATCHMD_DROP: + CNETERR("Dropping PUT from %s portal %d match %llu" + " offset %d length %d: %d\n", + libcfs_idstr(&info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, info.mi_rlength, rc); + + return -ENOENT; /* -ve: OK but no match */ + } +} + +static int +lnet_parse_get(struct lnet_ni *ni, struct lnet_msg *msg, int rdma_get) +{ + struct lnet_match_info info; + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_processid source_id; + struct lnet_handle_wire reply_wmd; + int rc; + + /* Convert get fields to host byte order */ + hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits); + hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index); + hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); + hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); + + source_id.nid = hdr->src_nid; + source_id.pid = hdr->src_pid; + /* Primary peer NID */ + info.mi_id.nid = msg->msg_initiator; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_GET; + info.mi_portal = hdr->msg.get.ptl_index; + info.mi_rlength = hdr->msg.get.sink_length; + info.mi_roffset = hdr->msg.get.src_offset; + info.mi_mbits = hdr->msg.get.match_bits; + info.mi_cpt = lnet_nid2cpt(&msg->msg_initiator, ni); + + rc = lnet_ptl_match_md(&info, msg); + if (rc == LNET_MATCHMD_DROP) { + CNETERR("Dropping GET from %s portal %d match %llu" + " offset %d length %d\n", + libcfs_idstr(&info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, info.mi_rlength); + return -ENOENT; /* -ve: OK but no match */ + } + + LASSERT(rc == LNET_MATCHMD_OK); + + lnet_build_msg_event(msg, LNET_EVENT_GET); + + reply_wmd = hdr->msg.get.return_wmd; + + lnet_prep_send(msg, LNET_MSG_REPLY, &source_id, + msg->msg_offset, msg->msg_wanted); + + msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; + + if (rdma_get) { + /* The LND completes the REPLY from her recv procedure */ + lnet_ni_recv(ni, msg->msg_private, msg, 0, + msg->msg_offset, msg->msg_len, msg->msg_len); + return 0; + } + + lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); + msg->msg_receiving = 0; + + rc = lnet_send(&ni->ni_nid, msg, &msg->msg_from); + if (rc < 0) { + /* didn't get as far as lnet_ni_send() */ + CERROR("%s: Unable to send REPLY for GET from %s: %d\n", + libcfs_nidstr(&ni->ni_nid), + libcfs_idstr(&info.mi_id), rc); + + lnet_finalize(msg, rc); + } + + return 0; +} + +static int +lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) +{ + void *private = msg->msg_private; + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_processid src = {}; + struct lnet_libmd *md; + unsigned int rlength; + unsigned int mlength; + int cpt; + + cpt = 
lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CNETERR("%s: Dropping REPLY from %s for %s " + "MD %#llx.%#llx\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(&src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return -ENOENT; /* -ve: OK but no match */ + } + + LASSERT(md->md_offset == 0); + + rlength = hdr->payload_length; + mlength = min(rlength, md->md_length); + + if (mlength < rlength && + (md->md_options & LNET_MD_TRUNCATE) == 0) { + CNETERR("%s: Dropping REPLY from %s length %d " + "for MD %#llx would overflow (%d)\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(&src), + rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, + mlength); + lnet_res_unlock(cpt); + return -ENOENT; /* -ve: OK but no match */ + } + + CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md %#llx\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(&src), + mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, mlength); + + if (mlength != 0) + lnet_setpayloadbuffer(msg); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength); + return 0; +} + +static int +lnet_parse_ack(struct lnet_ni *ni, struct lnet_msg *msg) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_processid src = {}; + struct lnet_libmd *md; + int cpt; + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* Convert ack fields to host byte order */ + hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); + hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); + + cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + /* Don't moan; this is expected */ + CDEBUG(D_NET, + "%s: Dropping ACK from %s to %s MD %#llx.%#llx\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(&src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return -ENOENT; /* -ve! 
*/ + } + + CDEBUG(D_NET, "%s: ACK from %s into md %#llx\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(&src), + hdr->msg.ack.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_ACK); + + lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len); + return 0; +} + +/** + * \retval LNET_CREDIT_OK If \a msg is forwarded + * \retval LNET_CREDIT_WAIT If \a msg is blocked because w/o buffer + * \retval -ve error code + */ +int +lnet_parse_forward_locked(struct lnet_ni *ni, struct lnet_msg *msg) +{ + int rc = 0; + + if (!the_lnet.ln_routing) + return -ECANCELED; + + if (msg->msg_rxpeer->lpni_rtrcredits <= 0 || + lnet_msg2bufpool(msg)->rbp_credits <= 0) { + if (ni->ni_net->net_lnd->lnd_eager_recv == NULL) { + msg->msg_rx_ready_delay = 1; + } else { + lnet_net_unlock(msg->msg_rx_cpt); + rc = lnet_ni_eager_recv(ni, msg); + lnet_net_lock(msg->msg_rx_cpt); + } + } + + if (rc == 0) + rc = lnet_post_routed_recv_locked(msg, 0); + return rc; +} + +int +lnet_parse_local(struct lnet_ni *ni, struct lnet_msg *msg) +{ + int rc; + + switch (msg->msg_type) { + case LNET_MSG_ACK: + rc = lnet_parse_ack(ni, msg); + break; + case LNET_MSG_PUT: + rc = lnet_parse_put(ni, msg); + break; + case LNET_MSG_GET: + rc = lnet_parse_get(ni, msg, msg->msg_rdma_get); + break; + case LNET_MSG_REPLY: + rc = lnet_parse_reply(ni, msg); + break; + default: /* prevent an unused label if !kernel */ + LASSERT(0); + return -EPROTO; + } + + LASSERT(rc == 0 || rc == -ENOENT); + return rc; +} + +char * +lnet_msgtyp2str (int type) +{ + switch (type) { + case LNET_MSG_ACK: + return ("ACK"); + case LNET_MSG_PUT: + return ("PUT"); + case LNET_MSG_GET: + return ("GET"); + case LNET_MSG_REPLY: + return ("REPLY"); + case LNET_MSG_HELLO: + return ("HELLO"); + default: + return (""); + } +} +EXPORT_SYMBOL(lnet_msgtyp2str); + +int +lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, + struct lnet_nid *from_nid, void *private, int rdma_req) +{ + struct lnet_peer_ni *lpni; + struct lnet_msg *msg; + __u32 payload_length; + lnet_pid_t dest_pid; + struct lnet_nid dest_nid; + struct lnet_nid src_nid; + bool push = false; + int for_me; + __u32 type; + int rc = 0; + int cpt; + time64_t now = ktime_get_seconds(); + + LASSERT (!in_interrupt ()); + + type = hdr->type; + src_nid = hdr->src_nid; + dest_nid = hdr->dest_nid; + dest_pid = hdr->dest_pid; + payload_length = hdr->payload_length; + + for_me = nid_same(&ni->ni_nid, &dest_nid); + cpt = lnet_nid2cpt(from_nid, ni); + + CDEBUG(D_NET, "TRACE: %s(%s) <- %s : %s - %s\n", + libcfs_nidstr(&dest_nid), + libcfs_nidstr(&ni->ni_nid), + libcfs_nidstr(&src_nid), + lnet_msgtyp2str(type), + (for_me) ? "for me" : "routed"); + + switch (type) { + case LNET_MSG_ACK: + case LNET_MSG_GET: + if (payload_length > 0) { + CERROR("%s, src %s: bad %s payload %d (0 expected)\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), + lnet_msgtyp2str(type), payload_length); + return -EPROTO; + } + break; + + case LNET_MSG_PUT: + case LNET_MSG_REPLY: + if (payload_length > + (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) { + CERROR("%s, src %s: bad %s payload %d " + "(%d max expected)\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), + lnet_msgtyp2str(type), + payload_length, + for_me ? 
LNET_MAX_PAYLOAD : LNET_MTU); + return -EPROTO; + } + break; + + default: + CERROR("%s, src %s: Bad message type 0x%x\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), type); + return -EPROTO; + } + + /* Only update net_last_alive for incoming GETs on the reserved portal + * (i.e. incoming lnet/discovery pings). + * This avoids situations where the router's own traffic results in NI + * status changes + */ + if (the_lnet.ln_routing && type == LNET_MSG_GET && + hdr->msg.get.ptl_index == LNET_RESERVED_PORTAL && + !lnet_islocalnid(&src_nid) && + ni->ni_net->net_last_alive != now) { + lnet_ni_lock(ni); + spin_lock(&ni->ni_net->net_lock); + ni->ni_net->net_last_alive = now; + spin_unlock(&ni->ni_net->net_lock); + push = lnet_ni_set_status_locked(ni, LNET_NI_STATUS_UP); + lnet_ni_unlock(ni); + } + + if (push) + lnet_push_update_to_peers(1); + + /* Regard a bad destination NID as a protocol error. Senders should + * know what they're doing; if they don't they're misconfigured, buggy + * or malicious so we chop them off at the knees :) */ + + if (!for_me) { + if (LNET_NID_NET(&dest_nid) == LNET_NID_NET(&ni->ni_nid)) { + /* should have gone direct */ + CERROR("%s, src %s: Bad dest nid %s " + "(should have been sent direct)\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), + libcfs_nidstr(&dest_nid)); + return -EPROTO; + } + + if (lnet_islocalnid(&dest_nid)) { + /* dest is another local NI; sender should have used + * this node's NID on its own network */ + CERROR("%s, src %s: Bad dest nid %s " + "(it's my nid but on a different network)\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), + libcfs_nidstr(&dest_nid)); + return -EPROTO; + } + + if (rdma_req && type == LNET_MSG_GET) { + CERROR("%s, src %s: Bad optimized GET for %s " + "(final destination must be me)\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), + libcfs_nidstr(&dest_nid)); + return -EPROTO; + } + + if (!the_lnet.ln_routing) { + CERROR("%s, src %s: Dropping message for %s " + "(routing not enabled)\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), + libcfs_nidstr(&dest_nid)); + goto drop; + } + } + + /* Message looks OK; we're not going to return an error, so we MUST + * call back lnd_recv() come what may... */ + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(&src_nid, 0)) { /* shall we now? */ + CERROR("%s, src %s: Dropping %s to simulate failure\n", + libcfs_nidstr(from_nid), libcfs_nidstr(&src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + /* FIXME need to support large-addr nid */ + if (!list_empty(&the_lnet.ln_drop_rules) && + lnet_drop_rule_match(hdr, lnet_nid_to_nid4(&ni->ni_nid), NULL)) { + CDEBUG(D_NET, + "%s, src %s, dst %s: Dropping %s to simulate silent message loss\n", + libcfs_nidstr(from_nid), libcfs_nidstr(&src_nid), + libcfs_nidstr(&dest_nid), lnet_msgtyp2str(type)); + goto drop; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("%s, src %s: Dropping %s (out of memory)\n", + libcfs_nidstr(from_nid), libcfs_nidstr(&src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + /* msg zeroed in lnet_msg_alloc; i.e. 
flags all clear, + * pointers NULL etc */ + + msg->msg_type = type; + msg->msg_private = private; + msg->msg_receiving = 1; + msg->msg_rdma_get = rdma_req; + msg->msg_len = msg->msg_wanted = payload_length; + msg->msg_offset = 0; + msg->msg_hdr = *hdr; + /* for building message event */ + msg->msg_from = *from_nid; + if (!for_me) { + msg->msg_target.pid = dest_pid; + msg->msg_target.nid = dest_nid; + msg->msg_routing = 1; + } + + lnet_net_lock(cpt); + lpni = lnet_peerni_by_nid_locked(from_nid, &ni->ni_nid, cpt); + if (IS_ERR(lpni)) { + lnet_net_unlock(cpt); + rc = PTR_ERR(lpni); + CERROR("%s, src %s: Dropping %s (error %d looking up sender)\n", + libcfs_nidstr(from_nid), libcfs_nidstr(&src_nid), + lnet_msgtyp2str(type), rc); + lnet_msg_free(msg); + if (rc == -ESHUTDOWN) + /* We are shutting down. Don't do anything more */ + return 0; + goto drop; + } + + /* If this message was forwarded to us from a router then we may need + * to update router aliveness or check for an asymmetrical route + * (or both) + */ + if (((lnet_drop_asym_route && for_me) || + !lpni->lpni_peer_net->lpn_peer->lp_alive) && + LNET_NID_NET(&src_nid) != LNET_NID_NET(from_nid)) { + __u32 src_net_id = LNET_NID_NET(&src_nid); + struct lnet_peer *gw = lpni->lpni_peer_net->lpn_peer; + struct lnet_route *route; + bool found = false; + + list_for_each_entry(route, &gw->lp_routes, lr_gwlist) { + if (route->lr_net == src_net_id) { + found = true; + /* If we're transitioning the gateway from + * dead -> alive, and discovery is disabled + * locally or on the gateway, then we need to + * update the cached route aliveness for each + * route to the src_nid's net. + * + * Otherwise, we're only checking for + * symmetrical route, and we can break the + * loop + */ + if (!gw->lp_alive && + lnet_is_discovery_disabled(gw)) + lnet_set_route_aliveness(route, true); + else + break; + } + } + if (lnet_drop_asym_route && for_me && !found) { + /* Drop ref taken by lnet_nid2peerni_locked() */ + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + /* we would not use from_nid to route a message to + * src_nid + * => asymmetric routing detected but forbidden + */ + CERROR("%s, src %s: Dropping asymmetrical route %s\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), lnet_msgtyp2str(type)); + lnet_msg_free(msg); + goto drop; + } + if (!gw->lp_alive) { + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni2; + + gw->lp_alive = true; + /* Mark all remote NIs on src_nid's net UP */ + lpn = lnet_peer_get_net_locked(gw, src_net_id); + if (lpn) + list_for_each_entry(lpni2, &lpn->lpn_peer_nis, + lpni_peer_nis) + lpni2->lpni_ns_status = LNET_NI_STATUS_UP; + } + } + + lpni->lpni_last_alive = now; + + msg->msg_rxpeer = lpni; + msg->msg_rxni = ni; + lnet_ni_addref_locked(ni, cpt); + /* Multi-Rail: Primary NID of source. */ + lnet_peer_primary_nid_locked(&src_nid, &msg->msg_initiator); + + /* + * mark the status of this lpni as UP since we received a message + * from it. The ping response reports back the ns_status which is + * marked on the remote as up or down and we cache it here. 
+ */ + msg->msg_rxpeer->lpni_ns_status = LNET_NI_STATUS_UP; + + lnet_msg_commit(msg, cpt); + + /* message delay simulation */ + if (unlikely(!list_empty(&the_lnet.ln_delay_rules) && + lnet_delay_rule_match_locked(hdr, msg))) { + lnet_net_unlock(cpt); + return 0; + } + + if (!for_me) { + rc = lnet_parse_forward_locked(ni, msg); + lnet_net_unlock(cpt); + + if (rc < 0) + goto free_drop; + + if (rc == LNET_CREDIT_OK) { + lnet_ni_recv(ni, msg->msg_private, msg, 0, + 0, payload_length, payload_length); + } + return 0; + } + + lnet_net_unlock(cpt); + + rc = lnet_parse_local(ni, msg); + if (rc != 0) + goto free_drop; + return 0; + + free_drop: + LASSERT(msg->msg_md == NULL); + lnet_finalize(msg, rc); + + drop: + lnet_drop_message(ni, cpt, private, payload_length, type); + return 0; +} +EXPORT_SYMBOL(lnet_parse); + +void +lnet_drop_delayed_msg_list(struct list_head *head, char *reason) +{ + while (!list_empty(head)) { + struct lnet_processid id = {}; + struct lnet_msg *msg; + + msg = list_entry(head->next, struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_md == NULL); + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CWARN("Dropping delayed PUT from %s portal %d match %llu" + " offset %d length %d: %s\n", + libcfs_idstr(&id), + msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length, reason); + + /* NB I can't drop msg's ref on msg_rxpeer until after I've + * called lnet_drop_message(), so I just hang onto msg as well + * until that's done */ + + lnet_drop_message(msg->msg_rxni, msg->msg_rx_cpt, + msg->msg_private, msg->msg_len, + msg->msg_type); + + msg->msg_no_resend = true; + /* + * NB: message will not generate event because w/o attached MD, + * but we still should give error code so lnet_msg_decommit() + * can skip counters operations and other checks. + */ + lnet_finalize(msg, -ENOENT); + } +} + +void +lnet_recv_delayed_msg_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct lnet_msg *msg; + struct lnet_processid id; + + msg = list_entry(head->next, struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + /* md won't disappear under me, since each msg + * holds a ref on it */ + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_md != NULL); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_rxni != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " + "match %llu offset %d length %d.\n", + libcfs_idstr(&id), msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length); + + lnet_recv_put(msg->msg_rxni, msg); + } +} + +static void +lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, + struct lnet_libmd *md, struct lnet_handle_md mdh) +{ + s64 timeout_ns; + struct lnet_rsp_tracker *local_rspt; + + /* + * MD has a refcount taken by message so it's not going away. + * The MD however can be looked up. We need to secure the access + * to the md_rspt_ptr by taking the res_lock. + * The rspt can be accessed without protection up to when it gets + * added to the list. 
+ */ + + lnet_res_lock(cpt); + local_rspt = md->md_rspt_ptr; + timeout_ns = lnet_transaction_timeout * NSEC_PER_SEC; + if (local_rspt != NULL) { + /* + * we already have an rspt attached to the md, so we'll + * update the deadline on that one. + */ + lnet_rspt_free(rspt, cpt); + } else { + /* new md */ + rspt->rspt_mdh = mdh; + rspt->rspt_cpt = cpt; + /* store the rspt so we can access it when we get the REPLY */ + md->md_rspt_ptr = rspt; + local_rspt = rspt; + } + local_rspt->rspt_deadline = ktime_add_ns(ktime_get(), timeout_ns); + + /* + * add to the list of tracked responses. It's added to tail of the + * list in order to expire all the older entries first. + */ + lnet_net_lock(cpt); + list_move_tail(&local_rspt->rspt_on_list, the_lnet.ln_mt_rstq[cpt]); + lnet_net_unlock(cpt); + lnet_res_unlock(cpt); +} + +/** + * Initiate an asynchronous PUT operation. + * + * There are several events associated with a PUT: completion of the send on + * the initiator node (LNET_EVENT_SEND), and when the send completes + * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating + * that the operation was accepted by the target. The event LNET_EVENT_PUT is + * used at the target node to indicate the completion of incoming data + * delivery. + * + * The local events will be logged in the EQ associated with the MD pointed to + * by \a mdh handle. Using a MD without an associated EQ results in these + * events being discarded. In this case, the caller must have another + * mechanism (e.g., a higher level protocol) for determining when it is safe + * to modify the memory region associated with the MD. + * + * Note that LNet does not guarantee the order of LNET_EVENT_SEND and + * LNET_EVENT_ACK, though intuitively ACK should happen after SEND. + * + * \param self Indicates the NID of a local interface through which to send + * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself. + * \param mdh A handle for the MD that describes the memory to be sent. The MD + * must be "free floating" (See LNetMDBind()). + * \param ack Controls whether an acknowledgment is requested. + * Acknowledgments are only sent when they are requested by the initiating + * process and the target MD enables them. + * \param target A process identifier for the target process. + * \param portal The index in the \a target's portal table. + * \param match_bits The match bits to use for MD selection at the target + * process. + * \param offset The offset into the target MD (only used when the target + * MD has the LNET_MD_MANAGE_REMOTE option set). + * \param hdr_data 64 bits of user data that can be included in the message + * header. This data is written to an event queue entry at the target if an + * EQ is present on the matching MD. + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists). + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + * + * \see struct lnet_event::hdr_data and lnet_event_kind_t. 
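+ *
+ * A minimal usage sketch (illustrative only; "peer_nid", "portal" and
+ * "match_bits" are placeholders assumed to have been chosen by the caller,
+ * and \a mdh to have been obtained via LNetMDBind()):
+ *
+ *	struct lnet_process_id tgt;
+ *	int rc;
+ *
+ *	tgt.nid = peer_nid;
+ *	tgt.pid = LNET_PID_LUSTRE;
+ *
+ *	rc = LNetPut(LNET_NID_ANY, mdh, LNET_ACK_REQ, tgt, portal,
+ *		     match_bits, 0, 0);
+ *	if (rc != 0)
+ *		CERROR("LNetPut to %s failed: %d\n", libcfs_id2str(tgt), rc);
+ *
+ * On success, completion is reported through the MD's handler: first
+ * LNET_EVENT_SEND, and LNET_EVENT_ACK only if the initiator requested an
+ * acknowledgment and the target MD enables one.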
+ */ +int +LNetPut(lnet_nid_t self4, struct lnet_handle_md mdh, enum lnet_ack_req ack, + struct lnet_process_id target4, unsigned int portal, + __u64 match_bits, unsigned int offset, + __u64 hdr_data) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + struct lnet_processid target; + struct lnet_rsp_tracker *rspt = NULL; + struct lnet_nid self; + + LASSERT(the_lnet.ln_refcount > 0); + + lnet_nid4_to_nid(self4, &self); + lnet_nid4_to_nid(target4.nid, &target.nid); + target.pid = target4.pid; + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(&target.nid, 1)) { /* shall we now? */ + CERROR("Dropping PUT to %s: simulated failure\n", + libcfs_id2str(target4)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping PUT to %s: ENOMEM on struct lnet_msg\n", + libcfs_id2str(target4)); + return -ENOMEM; + } + msg->msg_vmflush = !!(current->flags & PF_MEMALLOC); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + + if (ack == LNET_ACK_REQ) { + rspt = lnet_rspt_alloc(cpt); + if (!rspt) { + CERROR("Dropping PUT to %s: ENOMEM on response tracker\n", + libcfs_id2str(target4)); + return -ENOMEM; + } + INIT_LIST_HEAD(&rspt->rspt_on_list); + } + + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping PUT (%llu:%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target4), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + lnet_res_unlock(cpt); + + if (rspt) + lnet_rspt_free(rspt, cpt); + + lnet_msg_free(msg); + return -ENOENT; + } + + CDEBUG(D_NET, "%s -> %s\n", __func__, libcfs_id2str(target4)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_PUT, &target, 0, md->md_length); + + msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); + msg->msg_hdr.msg.put.hdr_data = hdr_data; + + /* NB handles only looked up by creator (no flips) */ + if (ack == LNET_ACK_REQ) { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + } else { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + } + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + if (rspt && lnet_response_tracking_enabled(LNET_MSG_PUT, + md->md_options)) + lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + else if (rspt) + lnet_rspt_free(rspt, cpt); + + if (CFS_FAIL_CHECK_ORSET(CFS_FAIL_PTLRPC_OST_BULK_CB2, + CFS_FAIL_ONCE)) + rc = -EIO; + else + rc = lnet_send(&self, msg, NULL); + + if (rc != 0) { + CNETERR("Error sending PUT to %s: %d\n", + libcfs_id2str(target4), rc); + msg->msg_no_resend = true; + lnet_finalize(msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetPut); + +/* + * The LND can DMA direct to the GET md (i.e. no REPLY msg). This + * returns a msg for the LND to pass to lnet_finalize() when the sink + * data has been received. 
+ * + * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when + * lnet_finalize() is called on it, so the LND must call this first + */ +struct lnet_msg * +lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg) +{ + struct lnet_msg *msg = lnet_msg_alloc(); + struct lnet_libmd *getmd = getmsg->msg_md; + struct lnet_processid *peer_id = &getmsg->msg_target; + int cpt; + + LASSERT(!getmsg->msg_target_is_router); + LASSERT(!getmsg->msg_routing); + + if (msg == NULL) { + CERROR("%s: Dropping REPLY from %s: can't allocate msg\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(peer_id)); + goto drop; + } + + cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie); + lnet_res_lock(cpt); + + LASSERT(getmd->md_refcount > 0); + + if (getmd->md_threshold == 0) { + CERROR("%s: Dropping REPLY from %s for inactive MD %p\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(peer_id), + getmd); + lnet_res_unlock(cpt); + goto drop; + } + + LASSERT(getmd->md_offset == 0); + + CDEBUG(D_NET, "%s: Reply from %s md %p\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(peer_id), getmd); + + /* setup information for lnet_build_msg_event */ + msg->msg_initiator = + getmsg->msg_txpeer->lpni_peer_net->lpn_peer->lp_primary_nid; + msg->msg_from = peer_id->nid; + msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ + msg->msg_hdr.src_nid = peer_id->nid; + msg->msg_hdr.payload_length = getmd->md_length; + msg->msg_receiving = 1; /* required by lnet_msg_attach_md */ + + lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length); + lnet_res_unlock(cpt); + + cpt = lnet_nid2cpt(&peer_id->nid, ni); + + lnet_net_lock(cpt); + lnet_msg_commit(msg, cpt); + lnet_net_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + return msg; + + drop: + cpt = lnet_nid2cpt(&peer_id->nid, ni); + + lnet_net_lock(cpt); + lnet_incr_stats(&ni->ni_stats, LNET_MSG_GET, LNET_STATS_TYPE_DROP); + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += + getmd->md_length; + lnet_net_unlock(cpt); + + if (msg != NULL) + lnet_msg_free(msg); + + return NULL; +} +EXPORT_SYMBOL(lnet_create_reply_msg); + +void +lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *reply, + unsigned int len) +{ + /* Set the REPLY length, now the RDMA that elides the REPLY message has + * completed and I know it. */ + LASSERT(reply != NULL); + LASSERT(reply->msg_type == LNET_MSG_GET); + LASSERT(reply->msg_ev.type == LNET_EVENT_REPLY); + + /* NB I trusted my peer to RDMA. If she tells me she's written beyond + * the end of my buffer, I might as well be dead. */ + LASSERT(len <= reply->msg_ev.mlength); + + reply->msg_ev.mlength = len; +} +EXPORT_SYMBOL(lnet_set_reply_msg_len); + +/** + * Initiate an asynchronous GET operation. + * + * On the initiator node, an LNET_EVENT_SEND is logged when the GET request + * is sent, and an LNET_EVENT_REPLY is logged when the data returned from + * the target node in the REPLY has been written to local MD. + * + * On the target node, an LNET_EVENT_GET is logged when the GET request + * arrives and is accepted into a MD. + * + * \param self,target,portal,match_bits,offset See the discussion in LNetPut(). + * \param mdh A handle for the MD that describes the memory into which the + * requested data will be received. The MD must be "free floating" (See LNetMDBind()). + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists) of the MD. + * \retval -EIO Simulated failure. 
+ * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + */ +int +LNetGet(lnet_nid_t self4, struct lnet_handle_md mdh, + struct lnet_process_id target4, unsigned int portal, + __u64 match_bits, unsigned int offset, bool recovery) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + struct lnet_rsp_tracker *rspt; + int cpt; + int rc; + struct lnet_nid self; + struct lnet_processid target; + + LASSERT(the_lnet.ln_refcount > 0); + + lnet_nid4_to_nid(self4, &self); + lnet_nid4_to_nid(target4.nid, &target.nid); + target.pid = target4.pid; + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(&target.nid, 1)) /* shall we now? */ + { + CERROR("Dropping GET to %s: simulated failure\n", + libcfs_id2str(target4)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (!msg) { + CERROR("Dropping GET to %s: ENOMEM on struct lnet_msg\n", + libcfs_id2str(target4)); + return -ENOMEM; + } + + cpt = lnet_cpt_of_cookie(mdh.cookie); + + rspt = lnet_rspt_alloc(cpt); + if (!rspt) { + CERROR("Dropping GET to %s: ENOMEM on response tracker\n", + libcfs_id2str(target4)); + return -ENOMEM; + } + INIT_LIST_HEAD(&rspt->rspt_on_list); + + msg->msg_recovery = recovery; + + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping GET (%llu:%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target4), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + + lnet_msg_free(msg); + lnet_rspt_free(rspt, cpt); + return -ENOENT; + } + + CDEBUG(D_NET, "%s -> %s\n", __func__, libcfs_id2str(target4)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_GET, &target, 0, 0); + + msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset); + msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length); + + /* NB handles only looked up by creator (no flips) */ + msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + if (lnet_response_tracking_enabled(LNET_MSG_GET, md->md_options)) + lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + else + lnet_rspt_free(rspt, cpt); + + rc = lnet_send(&self, msg, NULL); + if (rc < 0) { + CNETERR("Error sending GET to %s: %d\n", + libcfs_id2str(target4), rc); + msg->msg_no_resend = true; + lnet_finalize(msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetGet); + +/** + * Calculate distance to node at \a dstnid. + * + * \param dstnid Target NID. + * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid + * is saved here. + * \param orderp If not NULL, order of the route to reach \a dstnid is saved + * here. + * + * \retval 0 If \a dstnid belongs to a local interface, and reserved option + * local_nid_dist_zero is set, which is the default. + * \retval positives Distance to target NID, i.e. number of hops plus one. + * \retval -EHOSTUNREACH If \a dstnid is not reachable. 
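+ *
+ * Illustrative sketch (the destination NID "dst" is a placeholder assumed
+ * to be known to the caller):
+ *
+ *	lnet_nid_t src;
+ *	__u32 order;
+ *	int hops;
+ *
+ *	hops = LNetDist(dst, &src, &order);
+ *	if (hops < 0)
+ *		CERROR("peer is unreachable: %d\n", hops);
+ *
+ * On a non-negative return, src and order hold the values described for
+ * \a srcnidp and \a orderp above.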
+ */ +int +LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) +{ + struct list_head *e; + struct lnet_ni *ni = NULL; + struct lnet_remotenet *rnet; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; + bool matched_dstnet = false; + + /* if !local_nid_dist_zero, I don't return a distance of 0 ever + * (when lustre sees a distance of 0, it substitutes 0@lo), so I + * keep order 0 free for 0@lo and order 1 free for a local NID + * match */ + + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_net_lock_current(); + + while ((ni = lnet_get_next_ni_locked(NULL, ni))) { + /* FIXME support large-addr nid */ + if (lnet_nid_to_nid4(&ni->ni_nid) == dstnid) { + if (srcnidp != NULL) + *srcnidp = dstnid; + if (orderp != NULL) { + if (dstnid == LNET_NID_LO_0) + *orderp = 0; + else + *orderp = 1; + } + lnet_net_unlock(cpt); + + return local_nid_dist_zero ? 0 : 1; + } + + if (!matched_dstnet && LNET_NID_NET(&ni->ni_nid) == dstnet) { + matched_dstnet = true; + /* We matched the destination net, but we may have + * additional local NIs to inspect. + * + * We record the nid and order as appropriate, but + * they may be overwritten if we match local NI above. + */ + if (srcnidp) + /* FIXME support large-addr nids */ + *srcnidp = lnet_nid_to_nid4(&ni->ni_nid); + + if (orderp) { + /* Check if ni was originally created in + * current net namespace. + * If not, assign order above 0xffff0000, + * to make this ni not a priority. + */ + if (current->nsproxy && + !net_eq(ni->ni_net_ns, + current->nsproxy->net_ns)) + *orderp = order + 0xffff0000; + else + *orderp = order; + } + } + + order++; + } + + if (matched_dstnet) { + lnet_net_unlock(cpt); + return 1; + } + + rn_list = lnet_net2rnethash(dstnet); + list_for_each(e, rn_list) { + rnet = list_entry(e, struct lnet_remotenet, lrn_list); + + if (rnet->lrn_net == dstnet) { + struct lnet_route *route; + struct lnet_route *shortest = NULL; + __u32 shortest_hops = LNET_UNDEFINED_HOPS; + __u32 route_hops; + + LASSERT(!list_empty(&rnet->lrn_routes)); + + list_for_each_entry(route, &rnet->lrn_routes, + lr_list) { + route_hops = route->lr_hops; + if (route_hops == LNET_UNDEFINED_HOPS) + route_hops = 1; + if (shortest == NULL || + route_hops < shortest_hops) { + shortest = route; + shortest_hops = route_hops; + } + } + + LASSERT(shortest != NULL); + hops = shortest_hops; + if (srcnidp != NULL) { + struct lnet_net *net; + net = lnet_get_net_locked(shortest->lr_lnet); + LASSERT(net); + ni = lnet_get_next_ni_locked(net, NULL); + /* FIXME support large-addr nids */ + *srcnidp = lnet_nid_to_nid4(&ni->ni_nid); + } + if (orderp != NULL) + *orderp = order; + lnet_net_unlock(cpt); + return hops + 1; + } + order++; + } + + lnet_net_unlock(cpt); + return -EHOSTUNREACH; +} +EXPORT_SYMBOL(LNetDist); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c b/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c new file mode 100644 index 0000000000000..ed979bcbd9d08 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c @@ -0,0 +1,1346 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lnet/lnet/lib-msg.c
+ *
+ * Message decoding, parsing and finalizing routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <lnet/lib-lnet.h>
+
+void
+lnet_build_unlink_event(struct lnet_libmd *md, struct lnet_event *ev)
+{
+	ENTRY;
+
+	memset(ev, 0, sizeof(*ev));
+
+	ev->status = 0;
+	ev->unlinked = 1;
+	ev->type = LNET_EVENT_UNLINK;
+	lnet_md_deconstruct(md, ev);
+	lnet_md2handle(&ev->md_handle, md);
+	EXIT;
+}
+
+/*
+ * Don't need any lock, must be called after lnet_commit_md
+ */
+void
+lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type)
+{
+	struct lnet_hdr *hdr = &msg->msg_hdr;
+	struct lnet_event *ev = &msg->msg_ev;
+
+	LASSERT(!msg->msg_routing);
+
+	ev->type = ev_type;
+	ev->msg_type = msg->msg_type;
+
+	if (ev_type == LNET_EVENT_SEND) {
+		/* event for active message */
+		ev->target.nid = hdr->dest_nid;
+		ev->target.pid = hdr->dest_pid;
+		ev->initiator.nid = LNET_ANY_NID;
+		ev->initiator.pid = the_lnet.ln_pid;
+		ev->source.nid = LNET_ANY_NID;
+		ev->source.pid = the_lnet.ln_pid;
+		ev->sender = LNET_ANY_NID;
+	} else {
+		/* event for passive message */
+		ev->target.pid = hdr->dest_pid;
+		ev->target.nid = hdr->dest_nid;
+		ev->initiator.pid = hdr->src_pid;
+		/* Multi-Rail: resolve src_nid to "primary" peer NID */
+		ev->initiator.nid = msg->msg_initiator;
+		/* Multi-Rail: track source NID.
*/ + ev->source.pid = hdr->src_pid; + ev->source.nid = hdr->src_nid; + ev->rlength = hdr->payload_length; + ev->sender = msg->msg_from; + ev->mlength = msg->msg_wanted; + ev->offset = msg->msg_offset; + } + + switch (ev_type) { + default: + LBUG(); + + case LNET_EVENT_PUT: /* passive PUT */ + ev->pt_index = hdr->msg.put.ptl_index; + ev->match_bits = hdr->msg.put.match_bits; + ev->hdr_data = hdr->msg.put.hdr_data; + return; + + case LNET_EVENT_GET: /* passive GET */ + ev->pt_index = hdr->msg.get.ptl_index; + ev->match_bits = hdr->msg.get.match_bits; + ev->hdr_data = 0; + return; + + case LNET_EVENT_ACK: /* ACK */ + ev->match_bits = hdr->msg.ack.match_bits; + ev->mlength = hdr->msg.ack.mlength; + return; + + case LNET_EVENT_REPLY: /* REPLY */ + return; + + case LNET_EVENT_SEND: /* active message */ + if (msg->msg_type == LNET_MSG_PUT) { + ev->pt_index = le32_to_cpu(hdr->msg.put.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits); + ev->offset = le32_to_cpu(hdr->msg.put.offset); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->payload_length); + ev->hdr_data = le64_to_cpu(hdr->msg.put.hdr_data); + + } else { + LASSERT(msg->msg_type == LNET_MSG_GET); + ev->pt_index = le32_to_cpu(hdr->msg.get.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->msg.get.sink_length); + ev->offset = le32_to_cpu(hdr->msg.get.src_offset); + ev->hdr_data = 0; + } + return; + } +} + +void +lnet_msg_commit(struct lnet_msg *msg, int cpt) +{ + struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt]; + struct lnet_counters_common *common; + s64 timeout_ns; + + /* set the message deadline */ + timeout_ns = lnet_transaction_timeout * NSEC_PER_SEC; + msg->msg_deadline = ktime_add_ns(ktime_get(), timeout_ns); + + /* routed message can be committed for both receiving and sending */ + LASSERT(!msg->msg_tx_committed); + + if (msg->msg_sending) { + LASSERT(!msg->msg_receiving); + msg->msg_tx_cpt = cpt; + msg->msg_tx_committed = 1; + if (msg->msg_rx_committed) { /* routed message REPLY */ + LASSERT(msg->msg_onactivelist); + return; + } + } else { + LASSERT(!msg->msg_sending); + msg->msg_rx_cpt = cpt; + msg->msg_rx_committed = 1; + } + + LASSERT(!msg->msg_onactivelist); + + msg->msg_onactivelist = 1; + list_add_tail(&msg->msg_activelist, &container->msc_active); + + common = &the_lnet.ln_counters[cpt]->lct_common; + common->lcc_msgs_alloc++; + if (common->lcc_msgs_alloc > common->lcc_msgs_max) + common->lcc_msgs_max = common->lcc_msgs_alloc; +} + +static void +lnet_msg_decommit_tx(struct lnet_msg *msg, int status) +{ + struct lnet_counters_common *common; + struct lnet_event *ev = &msg->msg_ev; + + LASSERT(msg->msg_tx_committed); + if (status != 0) + goto out; + + common = &(the_lnet.ln_counters[msg->msg_tx_cpt]->lct_common); + switch (ev->type) { + default: /* routed message */ + LASSERT(msg->msg_routing); + LASSERT(msg->msg_rx_committed); + LASSERT(ev->type == 0); + + common->lcc_route_length += msg->msg_len; + common->lcc_route_count++; + goto incr_stats; + + case LNET_EVENT_PUT: + /* should have been decommitted */ + LASSERT(!msg->msg_rx_committed); + /* overwritten while sending ACK */ + LASSERT(msg->msg_type == LNET_MSG_ACK); + msg->msg_type = LNET_MSG_PUT; /* fix type */ + break; + + case LNET_EVENT_SEND: + LASSERT(!msg->msg_rx_committed); + if (msg->msg_type == LNET_MSG_PUT) + common->lcc_send_length += msg->msg_len; + break; + + case LNET_EVENT_GET: + LASSERT(msg->msg_rx_committed); + /* overwritten while sending reply, we 
should never be + * here for optimized GET */ + LASSERT(msg->msg_type == LNET_MSG_REPLY); + msg->msg_type = LNET_MSG_GET; /* fix type */ + break; + } + + common->lcc_send_count++; + +incr_stats: + if (msg->msg_txpeer) + lnet_incr_stats(&msg->msg_txpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_SEND); + if (msg->msg_txni) + lnet_incr_stats(&msg->msg_txni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_SEND); + out: + lnet_return_tx_credits_locked(msg); + msg->msg_tx_committed = 0; +} + +static void +lnet_msg_decommit_rx(struct lnet_msg *msg, int status) +{ + struct lnet_counters_common *common; + struct lnet_event *ev = &msg->msg_ev; + + LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */ + LASSERT(msg->msg_rx_committed); + + if (status != 0) + goto out; + + common = &(the_lnet.ln_counters[msg->msg_rx_cpt]->lct_common); + switch (ev->type) { + default: + LASSERT(ev->type == 0); + LASSERT(msg->msg_routing); + goto incr_stats; + + case LNET_EVENT_ACK: + LASSERT(msg->msg_type == LNET_MSG_ACK); + break; + + case LNET_EVENT_GET: + /* type is "REPLY" if it's an optimized GET on passive side, + * because optimized GET will never be committed for sending, + * so message type wouldn't be changed back to "GET" by + * lnet_msg_decommit_tx(), see details in lnet_parse_get() */ + LASSERT(msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_GET); + common->lcc_send_length += msg->msg_wanted; + break; + + case LNET_EVENT_PUT: + LASSERT(msg->msg_type == LNET_MSG_PUT); + break; + + case LNET_EVENT_REPLY: + /* type is "GET" if it's an optimized GET on active side, + * see details in lnet_create_reply_msg() */ + LASSERT(msg->msg_type == LNET_MSG_GET || + msg->msg_type == LNET_MSG_REPLY); + break; + } + + common->lcc_recv_count++; + +incr_stats: + if (msg->msg_rxpeer) + lnet_incr_stats(&msg->msg_rxpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_RECV); + if (msg->msg_rxni) + lnet_incr_stats(&msg->msg_rxni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_RECV); + if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY) + common->lcc_recv_length += msg->msg_wanted; + + out: + lnet_return_rx_credits_locked(msg); + msg->msg_rx_committed = 0; +} + +void +lnet_msg_decommit(struct lnet_msg *msg, int cpt, int status) +{ + int cpt2 = cpt; + + LASSERT(msg->msg_tx_committed || msg->msg_rx_committed); + LASSERT(msg->msg_onactivelist); + + if (msg->msg_tx_committed) { /* always decommit for sending first */ + LASSERT(cpt == msg->msg_tx_cpt); + lnet_msg_decommit_tx(msg, status); + } + + if (msg->msg_rx_committed) { + /* forwarding msg committed for both receiving and sending */ + if (cpt != msg->msg_rx_cpt) { + lnet_net_unlock(cpt); + cpt2 = msg->msg_rx_cpt; + lnet_net_lock(cpt2); + } + lnet_msg_decommit_rx(msg, status); + } + + list_del(&msg->msg_activelist); + msg->msg_onactivelist = 0; + + the_lnet.ln_counters[cpt2]->lct_common.lcc_msgs_alloc--; + + if (cpt2 != cpt) { + lnet_net_unlock(cpt2); + lnet_net_lock(cpt); + } +} + +void +lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, + unsigned int offset, unsigned int mlen) +{ + /* NB: @offset and @len are only useful for receiving */ + /* Here, we attach the MD on lnet_msg and mark it busy and + * decrementing its threshold. Come what may, the lnet_msg "owns" + * the MD until a call to lnet_msg_detach_md or lnet_finalize() + * signals completion. 
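+	 * For illustration: an MD created with md_threshold == 2 can be
+	 * matched by two operations; each attach below drops the threshold
+	 * by one, and once it reaches zero (with md_refcount back at zero)
+	 * the MD becomes eligible for unlink.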
*/ + LASSERT(!msg->msg_routing); + + msg->msg_md = md; + if (msg->msg_receiving) { /* committed for receiving */ + msg->msg_offset = offset; + msg->msg_wanted = mlen; + } + + md->md_refcount++; + if (md->md_threshold != LNET_MD_THRESH_INF) { + LASSERT(md->md_threshold > 0); + md->md_threshold--; + } + + /* build umd in event */ + lnet_md2handle(&msg->msg_ev.md_handle, md); + lnet_md_deconstruct(md, &msg->msg_ev); +} + +static int +lnet_complete_msg_locked(struct lnet_msg *msg, int cpt) +{ + struct lnet_handle_wire ack_wmd; + int rc; + int status = msg->msg_ev.status; + + LASSERT(msg->msg_onactivelist); + + if (status == 0 && msg->msg_ack) { + /* Only send an ACK if the PUT completed successfully */ + + lnet_msg_decommit(msg, cpt, 0); + + msg->msg_ack = 0; + lnet_net_unlock(cpt); + + LASSERT(msg->msg_ev.type == LNET_EVENT_PUT); + LASSERT(!msg->msg_routing); + + ack_wmd = msg->msg_hdr.msg.put.ack_wmd; + + lnet_prep_send(msg, LNET_MSG_ACK, &msg->msg_ev.source, 0, 0); + + msg->msg_hdr.msg.ack.dst_wmd = ack_wmd; + msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits; + msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength); + + rc = lnet_send(&msg->msg_ev.target.nid, msg, + &msg->msg_from); + + lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. + * + * Also, there is possibility that message is committed for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either + * because CPT for sending can be different with CPT for + * receiving, so we should return back to lnet_finalize() + * to make sure we are locking the correct partition. + */ + return rc; + + } else if (status == 0 && /* OK so far */ + (msg->msg_routing && !msg->msg_sending)) { + /* not forwarded */ + LASSERT(!msg->msg_receiving); /* called back recv already */ + lnet_net_unlock(cpt); + + rc = lnet_send(NULL, msg, NULL); + + lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. + * + * Also, there is possibility that message is committed for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either: + * - The rule is message must decommit for sending first if + * the it's committed for both sending and receiving + * - CPT for sending can be different with CPT for receiving, + * so we should return back to lnet_finalize() to make + * sure we are locking the correct partition. + */ + return rc; + } + + lnet_msg_decommit(msg, cpt, status); + lnet_msg_free(msg); + return 0; +} + +static void +lnet_dec_healthv_locked(atomic_t *healthv, int sensitivity) +{ + int h = atomic_read(healthv); + + if (h < sensitivity) { + atomic_set(healthv, 0); + } else { + h -= sensitivity; + atomic_set(healthv, h); + } +} + +/* must hold net_lock/0 */ +void +lnet_ni_add_to_recoveryq_locked(struct lnet_ni *ni, + struct list_head *recovery_queue, time64_t now) +{ + if (!list_empty(&ni->ni_recovery)) + return; + + if (atomic_read(&ni->ni_healthv) == LNET_MAX_HEALTH_VALUE) + return; + + /* This NI is going on the recovery queue, so take a ref on it */ + lnet_ni_addref_locked(ni, 0); + + lnet_ni_set_next_ping(ni, now); + + CDEBUG(D_NET, "%s added to recovery queue. 
ping count: %u next ping: %lld health :%d\n", + libcfs_nidstr(&ni->ni_nid), + ni->ni_ping_count, + ni->ni_next_ping, + atomic_read(&ni->ni_healthv)); + + list_add_tail(&ni->ni_recovery, recovery_queue); +} + +static void +lnet_handle_local_failure(struct lnet_ni *local_ni) +{ + /* + * the lnet_net_lock(0) is used to protect the addref on the ni + * and the recovery queue. + */ + lnet_net_lock(0); + /* the mt could've shutdown and cleaned up the queues */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + lnet_net_unlock(0); + return; + } + + lnet_dec_healthv_locked(&local_ni->ni_healthv, lnet_health_sensitivity); + lnet_ni_add_to_recoveryq_locked(local_ni, &the_lnet.ln_mt_localNIRecovq, + ktime_get_seconds()); + lnet_net_unlock(0); +} + +/* must hold net_lock/0 */ +void +lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni) +{ + __u32 sensitivity = lnet_health_sensitivity; + __u32 lp_sensitivity; + + /* + * If there is a health sensitivity in the peer then use that + * instead of the globally set one. + */ + lp_sensitivity = lpni->lpni_peer_net->lpn_peer->lp_health_sensitivity; + if (lp_sensitivity) + sensitivity = lp_sensitivity; + + lnet_dec_healthv_locked(&lpni->lpni_healthv, sensitivity); + + /* update the peer_net's health value */ + lnet_update_peer_net_healthv(lpni); + + /* + * add the peer NI to the recovery queue if it's not already there + * and it's health value is actually below the maximum. It's + * possible that the sensitivity might be set to 0, and the health + * value will not be reduced. In this case, there is no reason to + * invoke recovery + */ + lnet_peer_ni_add_to_recoveryq_locked(lpni, + &the_lnet.ln_mt_peerNIRecovq, + ktime_get_seconds()); +} + +static void +lnet_handle_remote_failure(struct lnet_peer_ni *lpni) +{ + /* lpni could be NULL if we're in the LOLND case */ + if (!lpni) + return; + + lnet_net_lock(0); + /* the mt could've shutdown and cleaned up the queues */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + lnet_net_unlock(0); + return; + } + lnet_handle_remote_failure_locked(lpni); + lnet_net_unlock(0); +} + +static void +lnet_incr_hstats(struct lnet_ni *ni, struct lnet_peer_ni *lpni, + enum lnet_msg_hstatus hstatus) +{ + struct lnet_counters_health *health; + + health = &the_lnet.ln_counters[0]->lct_health; + + switch (hstatus) { + case LNET_MSG_STATUS_LOCAL_INTERRUPT: + atomic_inc(&ni->ni_hstats.hlt_local_interrupt); + health->lch_local_interrupt_count++; + break; + case LNET_MSG_STATUS_LOCAL_DROPPED: + atomic_inc(&ni->ni_hstats.hlt_local_dropped); + health->lch_local_dropped_count++; + break; + case LNET_MSG_STATUS_LOCAL_ABORTED: + atomic_inc(&ni->ni_hstats.hlt_local_aborted); + health->lch_local_aborted_count++; + break; + case LNET_MSG_STATUS_LOCAL_NO_ROUTE: + atomic_inc(&ni->ni_hstats.hlt_local_no_route); + health->lch_local_no_route_count++; + break; + case LNET_MSG_STATUS_LOCAL_TIMEOUT: + atomic_inc(&ni->ni_hstats.hlt_local_timeout); + health->lch_local_timeout_count++; + break; + case LNET_MSG_STATUS_LOCAL_ERROR: + atomic_inc(&ni->ni_hstats.hlt_local_error); + health->lch_local_error_count++; + break; + case LNET_MSG_STATUS_REMOTE_DROPPED: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_remote_dropped); + health->lch_remote_dropped_count++; + break; + case LNET_MSG_STATUS_REMOTE_ERROR: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_remote_error); + health->lch_remote_error_count++; + break; + case LNET_MSG_STATUS_REMOTE_TIMEOUT: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_remote_timeout); + 
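+		/* The global health counter below is bumped even when the
+		 * peer NI is unknown; the per-NI stats above are updated
+		 * only when lpni is set. */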
health->lch_remote_timeout_count++; + break; + case LNET_MSG_STATUS_NETWORK_TIMEOUT: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_network_timeout); + health->lch_network_timeout_count++; + break; + case LNET_MSG_STATUS_OK: + break; + default: + LBUG(); + } +} + +static void +lnet_resend_msg_locked(struct lnet_msg *msg) +{ + msg->msg_retry_count++; + + /* + * remove message from the active list and reset it to prepare + * for a resend. Two exceptions to this + * + * 1. the router case. When a message is being routed it is + * committed for rx when received and committed for tx when + * forwarded. We don't want to remove it from the active list, since + * code which handles receiving expects it to remain on the active + * list. + * + * 2. The REPLY case. Reply messages use the same message + * structure for the GET that was received. + */ + if (!msg->msg_routing && msg->msg_type != LNET_MSG_REPLY) { + list_del_init(&msg->msg_activelist); + msg->msg_onactivelist = 0; + } + /* + * The msg_target.nid which was originally set + * when calling LNetGet() or LNetPut() might've + * been overwritten if we're routing this message. + * Call lnet_msg_decommit_tx() to return the credit + * this message consumed. The message will + * consume another credit when it gets resent. + */ + msg->msg_target.nid = msg->msg_hdr.dest_nid; + lnet_msg_decommit_tx(msg, -EAGAIN); + msg->msg_sending = 0; + msg->msg_receiving = 0; + msg->msg_target_is_router = 0; + + CDEBUG(D_NET, "%s->%s:%s:%s - queuing msg (%p) for resend\n", + libcfs_nidstr(&msg->msg_hdr.src_nid), + libcfs_nidstr(&msg->msg_hdr.dest_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(msg->msg_health_status), msg); + + list_add_tail(&msg->msg_list, the_lnet.ln_mt_resendqs[msg->msg_tx_cpt]); + + complete(&the_lnet.ln_mt_wait_complete); +} + +int +lnet_check_finalize_recursion_locked(struct lnet_msg *msg, + struct list_head *containerq, + int nworkers, void **workers) +{ + int my_slot = -1; + int i; + + list_add_tail(&msg->msg_list, containerq); + + for (i = 0; i < nworkers; i++) { + if (workers[i] == current) + break; + + if (my_slot < 0 && workers[i] == NULL) + my_slot = i; + } + + if (i < nworkers || my_slot < 0) + return -1; + + workers[my_slot] = current; + + return my_slot; +} + +int +lnet_attempt_msg_resend(struct lnet_msg *msg) +{ + struct lnet_msg_container *container; + int my_slot; + int cpt; + + /* we can only resend tx_committed messages */ + LASSERT(msg->msg_tx_committed); + + /* don't resend recovery messages */ + if (msg->msg_recovery) { + CDEBUG(D_NET, "msg %s->%s is a recovery ping. retry# %d\n", + libcfs_nidstr(&msg->msg_from), + libcfs_nidstr(&msg->msg_target.nid), + msg->msg_retry_count); + return -ENOTRECOVERABLE; + } + + /* + * if we explicitly indicated we don't want to resend then just + * return + */ + if (msg->msg_no_resend) { + CDEBUG(D_NET, "msg %s->%s requested no resend. 
retry# %d\n", + libcfs_nidstr(&msg->msg_from), + libcfs_nidstr(&msg->msg_target.nid), + msg->msg_retry_count); + return -ENOTRECOVERABLE; + } + + /* check if the message has exceeded the number of retries */ + if (msg->msg_retry_count >= lnet_retry_count) { + CNETERR("msg %s->%s exceeded retry count %d\n", + libcfs_nidstr(&msg->msg_from), + libcfs_nidstr(&msg->msg_target.nid), + msg->msg_retry_count); + return -ENOTRECOVERABLE; + } + + cpt = msg->msg_tx_cpt; + lnet_net_lock(cpt); + + /* check again under lock */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + container = the_lnet.ln_msg_containers[cpt]; + my_slot = + lnet_check_finalize_recursion_locked(msg, + &container->msc_resending, + container->msc_nfinalizers, + container->msc_resenders); + + /* enough threads are resending */ + if (my_slot == -1) { + lnet_net_unlock(cpt); + return 0; + } + + while (!list_empty(&container->msc_resending)) { + msg = list_entry(container->msc_resending.next, + struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + /* + * resending the message will require us to call + * lnet_msg_decommit_tx() which will return the credit + * which this message holds. This could trigger another + * queued message to be sent. If that message fails and + * requires a resend we will recurse. + * But since at this point the slot is taken, the message + * will be queued in the container and dealt with + * later. This breaks the recursion. + */ + lnet_resend_msg_locked(msg); + } + + /* + * msc_resenders is an array of process pointers. Each entry holds + * a pointer to the current process operating on the message. An + * array entry is created per CPT. If the array slot is already + * set, then it means that there is a thread on the CPT currently + * resending a message. + * Once the thread finishes clear the slot to enable the thread to + * take on more resend work. + */ + container->msc_resenders[my_slot] = NULL; + lnet_net_unlock(cpt); + + return 0; +} + +/* + * Do a health check on the message: + * return -1 if we're not going to handle the error or + * if we've reached the maximum number of retries. + * success case will return -1 as well + * return 0 if it the message is requeued for send + */ +static int +lnet_health_check(struct lnet_msg *msg) +{ + enum lnet_msg_hstatus hstatus = msg->msg_health_status; + struct lnet_peer_ni *lpni; + struct lnet_ni *ni; + bool lo = false; + bool attempt_local_resend; + bool attempt_remote_resend; + bool handle_local_health; + bool handle_remote_health; + + /* if we're shutting down no point in handling health. */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) + return -1; + + LASSERT(msg->msg_tx_committed || msg->msg_rx_committed); + + /* + * if we're sending to the LOLND then the msg_txpeer will not be + * set. So no need to sanity check it. + */ + if (msg->msg_tx_committed && + !nid_is_lo0(&msg->msg_txni->ni_nid)) + LASSERT(msg->msg_txpeer); + else if (msg->msg_tx_committed && + nid_is_lo0(&msg->msg_txni->ni_nid)) + lo = true; + + if (hstatus != LNET_MSG_STATUS_OK && + ktime_compare(ktime_get(), msg->msg_deadline) >= 0) + return -1; + + /* + * always prefer txni/txpeer if they message is committed for both + * directions. 
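+	 * A routed message, for example, is committed for both rx and tx,
+	 * so its health accounting is done against the local NI and the
+	 * peer NI it was forwarded on.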
+ */ + if (msg->msg_tx_committed) { + ni = msg->msg_txni; + lpni = msg->msg_txpeer; + attempt_local_resend = attempt_remote_resend = true; + } else { + ni = msg->msg_rxni; + lpni = msg->msg_rxpeer; + attempt_local_resend = attempt_remote_resend = false; + } + + if (!lo) + LASSERT(ni && lpni); + else + LASSERT(ni); + + CDEBUG(D_NET, "health check: %s->%s: %s: %s\n", + libcfs_nidstr(&ni->ni_nid), + (lo) ? "self" : libcfs_nidstr(&lpni->lpni_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(hstatus)); + + /* + * stats are only incremented for errors so avoid wasting time + * incrementing statistics if there is no error. Similarly, whether to + * update health values or perform resends is only applicable for + * messages with a health status != OK. + */ + if (hstatus != LNET_MSG_STATUS_OK) { + /* Don't further decrement the health value if a recovery + * message failed. + */ + if (msg->msg_recovery) + handle_local_health = handle_remote_health = false; + else + handle_local_health = handle_remote_health = true; + + /* For local failures, health/recovery/resends are not needed if + * I only have a single (non-lolnd) interface. NB: pb_nnis + * includes the lolnd interface, so a single-rail node would + * have pb_nnis == 2. + */ + if (the_lnet.ln_ping_target->pb_nnis <= 2) { + handle_local_health = false; + attempt_local_resend = false; + } + + lnet_net_lock(0); + lnet_incr_hstats(ni, lpni, hstatus); + /* For remote failures, health/recovery/resends are not needed + * if the peer only has a single interface. Special case for + * routers where we rely on health feature to manage route + * aliveness. NB: unlike pb_nnis above, lp_nnis does _not_ + * include the lolnd, so a single-rail node would have + * lp_nnis == 1. + */ + if (lpni && lpni->lpni_peer_net && + lpni->lpni_peer_net->lpn_peer && + lpni->lpni_peer_net->lpn_peer->lp_nnis <= 1) { + attempt_remote_resend = false; + if (!lnet_isrouter(lpni)) + handle_remote_health = false; + } + /* Do not put my interfaces into peer NI recovery. They should + * be handled with local NI recovery. + */ + if (handle_remote_health && lpni && + lnet_nid_to_ni_locked(&lpni->lpni_nid, 0)) + handle_remote_health = false; + lnet_net_unlock(0); + } + + switch (hstatus) { + case LNET_MSG_STATUS_OK: + /* + * increment the local ni health whether we successfully + * received or sent a message on it. + * + * Ping counts are reset to 0 as appropriate to allow for + * faster recovery. + */ + lnet_inc_healthv(&ni->ni_healthv, lnet_health_sensitivity); + /* + * It's possible msg_txpeer is NULL in the LOLND + * case. Only increment the peer's health if we're + * receiving a message from it. It's the only sure way to + * know that a remote interface is up. + * If this interface is part of a router, then take that + * as indication that the router is fully healthy. + */ + if (lpni && msg->msg_rx_committed) { + lnet_net_lock(0); + lpni->lpni_ping_count = 0; + ni->ni_ping_count = 0; + /* + * If we're receiving a message from the router or + * I'm a router, then set that lpni's health to + * maximum so we can commence communication + */ + if (lnet_isrouter(lpni) || the_lnet.ln_routing) { + lnet_set_lpni_healthv_locked(lpni, + LNET_MAX_HEALTH_VALUE); + } else { + __u32 sensitivity = lpni->lpni_peer_net-> + lpn_peer->lp_health_sensitivity; + + lnet_inc_lpni_healthv_locked(lpni, + (sensitivity) ? sensitivity : + lnet_health_sensitivity); + /* This peer NI may have previously aged out + * of recovery. 
Now that we've received a + * message from it, we can continue recovery + * if its health value is still below the + * maximum. + */ + lnet_peer_ni_add_to_recoveryq_locked(lpni, + &the_lnet.ln_mt_peerNIRecovq, + ktime_get_seconds()); + } + lnet_net_unlock(0); + } + + /* we can finalize this message */ + return -1; + case LNET_MSG_STATUS_LOCAL_INTERRUPT: + case LNET_MSG_STATUS_LOCAL_DROPPED: + case LNET_MSG_STATUS_LOCAL_ABORTED: + case LNET_MSG_STATUS_LOCAL_NO_ROUTE: + case LNET_MSG_STATUS_LOCAL_TIMEOUT: + if (handle_local_health) + lnet_handle_local_failure(ni); + if (attempt_local_resend) + return lnet_attempt_msg_resend(msg); + break; + case LNET_MSG_STATUS_LOCAL_ERROR: + if (handle_local_health) + lnet_handle_local_failure(ni); + return -1; + case LNET_MSG_STATUS_REMOTE_DROPPED: + if (handle_remote_health) + lnet_handle_remote_failure(lpni); + if (attempt_remote_resend) + return lnet_attempt_msg_resend(msg); + break; + case LNET_MSG_STATUS_REMOTE_ERROR: + case LNET_MSG_STATUS_REMOTE_TIMEOUT: + if (handle_remote_health) + lnet_handle_remote_failure(lpni); + return -1; + case LNET_MSG_STATUS_NETWORK_TIMEOUT: + if (handle_remote_health) + lnet_handle_remote_failure(lpni); + if (handle_local_health) + lnet_handle_local_failure(ni); + return -1; + default: + LBUG(); + } + + /* no resend is needed */ + return -1; +} + +static void +lnet_msg_detach_md(struct lnet_msg *msg, int status) +{ + struct lnet_libmd *md = msg->msg_md; + lnet_handler_t handler = NULL; + int cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + int unlink; + + lnet_res_lock(cpt); + while (md->md_flags & LNET_MD_FLAG_HANDLING) + /* An event handler is running - wait for it to + * complete to avoid races. + */ + lnet_md_wait_handling(md, cpt); + + /* Now it's safe to drop my caller's ref */ + md->md_refcount--; + LASSERT(md->md_refcount >= 0); + + unlink = lnet_md_unlinkable(md); + if (md->md_handler) { + if ((md->md_flags & LNET_MD_FLAG_ABORTED) && !status) { + msg->msg_ev.status = -ETIMEDOUT; + CDEBUG(D_NET, "md 0x%p already unlinked\n", md); + } else { + msg->msg_ev.status = status; + } + msg->msg_ev.unlinked = unlink; + handler = md->md_handler; + if (!unlink) + md->md_flags |= LNET_MD_FLAG_HANDLING; + } + + if (unlink || (md->md_refcount == 0 && + md->md_threshold == LNET_MD_THRESH_INF)) + lnet_detach_rsp_tracker(md, cpt); + + msg->msg_md = NULL; + if (unlink) + lnet_md_unlink(md); + + lnet_res_unlock(cpt); + + if (handler) { + handler(&msg->msg_ev); + if (!unlink) { + lnet_res_lock(cpt); + md->md_flags &= ~LNET_MD_FLAG_HANDLING; + wake_up_var(md); + lnet_res_unlock(cpt); + } + } +} + +static bool +lnet_is_health_check(struct lnet_msg *msg) +{ + bool hc = true; + int status = msg->msg_ev.status; + + if ((!msg->msg_tx_committed && !msg->msg_rx_committed) || + !msg->msg_onactivelist) { + CDEBUG(D_NET, "msg %p not committed for send or receive\n", + msg); + return false; + } + + if ((msg->msg_tx_committed && !msg->msg_txpeer) || + (msg->msg_rx_committed && !msg->msg_rxpeer)) { + /* The optimized GET case does not set msg_rxpeer, but status + * could be zero. Only print the error message if we have a + * non-zero status. 
+ */ + if (status) + CDEBUG(D_NET, "msg %p status %d cannot retry\n", msg, + status); + return false; + } + + /* Check for status inconsistencies */ + if ((!status && msg->msg_health_status != LNET_MSG_STATUS_OK) || + (status && msg->msg_health_status == LNET_MSG_STATUS_OK)) { + CDEBUG(D_NET, "Msg %p is in inconsistent state, don't perform health " + "checking (%d, %d)\n", msg, status, + msg->msg_health_status); + hc = false; + } + + CDEBUG(D_NET, "health check = %d, status = %d, hstatus = %d\n", + hc, status, msg->msg_health_status); + + return hc; +} + +char * +lnet_health_error2str(enum lnet_msg_hstatus hstatus) +{ + switch (hstatus) { + case LNET_MSG_STATUS_LOCAL_INTERRUPT: + return "LOCAL_INTERRUPT"; + case LNET_MSG_STATUS_LOCAL_DROPPED: + return "LOCAL_DROPPED"; + case LNET_MSG_STATUS_LOCAL_ABORTED: + return "LOCAL_ABORTED"; + case LNET_MSG_STATUS_LOCAL_NO_ROUTE: + return "LOCAL_NO_ROUTE"; + case LNET_MSG_STATUS_LOCAL_TIMEOUT: + return "LOCAL_TIMEOUT"; + case LNET_MSG_STATUS_LOCAL_ERROR: + return "LOCAL_ERROR"; + case LNET_MSG_STATUS_REMOTE_DROPPED: + return "REMOTE_DROPPED"; + case LNET_MSG_STATUS_REMOTE_ERROR: + return "REMOTE_ERROR"; + case LNET_MSG_STATUS_REMOTE_TIMEOUT: + return "REMOTE_TIMEOUT"; + case LNET_MSG_STATUS_NETWORK_TIMEOUT: + return "NETWORK_TIMEOUT"; + case LNET_MSG_STATUS_OK: + return "OK"; + default: + return ""; + } +} + +bool +lnet_send_error_simulation(struct lnet_msg *msg, + enum lnet_msg_hstatus *hstatus) +{ + if (!msg) + return false; + + if (list_empty(&the_lnet.ln_drop_rules)) + return false; + + /* match only health rules */ + if (!lnet_drop_rule_match(&msg->msg_hdr, LNET_NID_ANY, + hstatus)) + return false; + + CDEBUG(D_NET, "src %s(%s)->dst %s: %s simulate health error: %s\n", + libcfs_nidstr(&msg->msg_hdr.src_nid), + libcfs_nidstr(&msg->msg_txni->ni_nid), + libcfs_nidstr(&msg->msg_hdr.dest_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(*hstatus)); + + return true; +} +EXPORT_SYMBOL(lnet_send_error_simulation); + +void +lnet_finalize(struct lnet_msg *msg, int status) +{ + struct lnet_msg_container *container; + int my_slot; + int cpt; + int rc; + + LASSERT(!in_interrupt()); + + if (msg == NULL) + return; + + msg->msg_ev.status = status; + + if (lnet_is_health_check(msg)) { + /* + * Check the health status of the message. If it has one + * of the errors that we're supposed to handle, and it has + * not timed out, then + * 1. Decrement the appropriate health_value + * 2. queue the message on the resend queue + + * if the message send is success, timed out or failed in the + * health check for any reason then we'll just finalize the + * message. Otherwise just return since the message has been + * put on the resend queue. + */ + if (!lnet_health_check(msg)) + return; + } + + /* + * We're not going to resend this message so detach its MD and invoke + * the appropriate callbacks + */ + if (msg->msg_md != NULL) + lnet_msg_detach_md(msg, status); + +again: + if (!msg->msg_tx_committed && !msg->msg_rx_committed) { + /* not committed to network yet */ + LASSERT(!msg->msg_onactivelist); + lnet_msg_free(msg); + return; + } + + /* + * NB: routed message can be committed for both receiving and sending, + * we should finalize in LIFO order and keep counters correct. + * (finalize sending first then finalize receiving) + */ + cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt; + lnet_net_lock(cpt); + + container = the_lnet.ln_msg_containers[cpt]; + + /* Recursion breaker. 
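+	 * (Completing a message can itself send another one, e.g. an ACK
+	 * for a PUT or the forwarding of a routed message, and that send
+	 * can lead back into lnet_finalize().)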
Don't complete the message here if I am (or + * enough other threads are) already completing messages */ + my_slot = lnet_check_finalize_recursion_locked(msg, + &container->msc_finalizing, + container->msc_nfinalizers, + container->msc_finalizers); + + /* enough threads are resending */ + if (my_slot == -1) { + lnet_net_unlock(cpt); + return; + } + + rc = 0; + while (!list_empty(&container->msc_finalizing)) { + msg = list_entry(container->msc_finalizing.next, + struct lnet_msg, msg_list); + + list_del_init(&msg->msg_list); + + /* NB drops and regains the lnet lock if it actually does + * anything, so my finalizing friends can chomp along too */ + rc = lnet_complete_msg_locked(msg, cpt); + if (rc != 0) + break; + } + + if (unlikely(!list_empty(&the_lnet.ln_delay_rules))) { + lnet_net_unlock(cpt); + lnet_delay_rule_check(); + lnet_net_lock(cpt); + } + + container->msc_finalizers[my_slot] = NULL; + lnet_net_unlock(cpt); + + if (rc != 0) + goto again; +} +EXPORT_SYMBOL(lnet_finalize); + +void +lnet_msg_container_cleanup(struct lnet_msg_container *container) +{ + int count = 0; + + if (container->msc_init == 0) + return; + + while (!list_empty(&container->msc_active)) { + struct lnet_msg *msg; + + msg = list_entry(container->msc_active.next, + struct lnet_msg, msg_activelist); + LASSERT(msg->msg_onactivelist); + msg->msg_onactivelist = 0; + list_del_init(&msg->msg_activelist); + lnet_msg_free(msg); + count++; + } + + if (count > 0) + CERROR("%d active msg on exit\n", count); + + if (container->msc_finalizers != NULL) { + CFS_FREE_PTR_ARRAY(container->msc_finalizers, + container->msc_nfinalizers); + container->msc_finalizers = NULL; + } + + if (container->msc_resenders != NULL) { + CFS_FREE_PTR_ARRAY(container->msc_resenders, + container->msc_nfinalizers); + container->msc_resenders = NULL; + } + container->msc_init = 0; +} + +int +lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) +{ + int rc = 0; + + container->msc_init = 1; + + INIT_LIST_HEAD(&container->msc_active); + INIT_LIST_HEAD(&container->msc_finalizing); + INIT_LIST_HEAD(&container->msc_resending); + + /* number of CPUs */ + container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt); + if (container->msc_nfinalizers == 0) + container->msc_nfinalizers = 1; + + LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt, + container->msc_nfinalizers * + sizeof(*container->msc_finalizers)); + + if (container->msc_finalizers == NULL) { + CERROR("Failed to allocate message finalizers\n"); + lnet_msg_container_cleanup(container); + return -ENOMEM; + } + + LIBCFS_CPT_ALLOC(container->msc_resenders, lnet_cpt_table(), cpt, + container->msc_nfinalizers * + sizeof(*container->msc_resenders)); + + if (container->msc_resenders == NULL) { + CERROR("Failed to allocate message resenders\n"); + lnet_msg_container_cleanup(container); + return -ENOMEM; + } + + return rc; +} + +void +lnet_msg_containers_destroy(void) +{ + struct lnet_msg_container *container; + int i; + + if (the_lnet.ln_msg_containers == NULL) + return; + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) + lnet_msg_container_cleanup(container); + + cfs_percpt_free(the_lnet.ln_msg_containers); + the_lnet.ln_msg_containers = NULL; +} + +int +lnet_msg_containers_create(void) +{ + struct lnet_msg_container *container; + int rc; + int i; + + the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*container)); + + if (the_lnet.ln_msg_containers == NULL) { + CERROR("Failed to allocate cpu-partition data for network\n"); + 
return -ENOMEM; + } + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) { + rc = lnet_msg_container_setup(container, i); + if (rc != 0) { + lnet_msg_containers_destroy(); + return rc; + } + } + + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c b/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c new file mode 100644 index 0000000000000..cbe7a30eb50bd --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c @@ -0,0 +1,991 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/lnet/lib-ptl.c + * + * portal & match routines + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/* NB: add /proc interfaces in upcoming patches */ +int portal_rotor = LNET_PTL_ROTOR_HASH_RT; +module_param(portal_rotor, int, 0644); +MODULE_PARM_DESC(portal_rotor, "redirect PUTs to different cpu-partitions"); + +static int +lnet_ptl_match_type(unsigned int index, struct lnet_processid *match_id, + __u64 mbits, __u64 ignore_bits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[index]; + int unique; + + unique = (ignore_bits == 0 && + !LNET_NID_IS_ANY(&match_id->nid) && + match_id->pid != LNET_PID_ANY); + + LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl)); + + /* prefer to check w/o any lock */ + if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) + goto match; + + /* unset, new portal */ + lnet_ptl_lock(ptl); + /* check again with lock */ + if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) { + lnet_ptl_unlock(ptl); + goto match; + } + + /* still not set */ + if (unique) + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE); + else + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD); + + lnet_ptl_unlock(ptl); + + return 1; + + match: + if ((lnet_ptl_is_unique(ptl) && !unique) || + (lnet_ptl_is_wildcard(ptl) && unique)) + return 0; + return 1; +} + +static void +lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; + + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + mtable->mt_enabled = 1; + + ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt; + for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) { + LASSERT(ptl->ptl_mt_maps[i] != cpt); + if (ptl->ptl_mt_maps[i] < cpt) + break; + + /* swap to order */ + ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i]; + ptl->ptl_mt_maps[i] = cpt; + } + + ptl->ptl_mt_nmaps++; +} + +static void +lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; 
+ + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + if (LNET_CPT_NUMBER == 1) + return; /* never disable the only match-table */ + + mtable->mt_enabled = 0; + + LASSERT(ptl->ptl_mt_nmaps > 0 && + ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER); + + /* remove it from mt_maps */ + ptl->ptl_mt_nmaps--; + for (i = 0; i < ptl->ptl_mt_nmaps; i++) { + if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */ + ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1]; + } +} + +static int +lnet_try_match_md(struct lnet_libmd *md, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + /* ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock; + * lnet_match_blocked_msg() relies on this to avoid races */ + unsigned int offset; + unsigned int mlength; + struct lnet_me *me = md->md_me; + + /* MD exhausted */ + if (lnet_md_exhausted(md)) + return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED; + + /* mismatched MD op */ + if ((md->md_options & info->mi_opc) == 0) + return LNET_MATCHMD_NONE; + + /* mismatched ME nid/pid? */ + if (!LNET_NID_IS_ANY(&me->me_match_id.nid) && + !nid_same(&me->me_match_id.nid, &info->mi_id.nid)) + return LNET_MATCHMD_NONE; + + if (me->me_match_id.pid != LNET_PID_ANY && + me->me_match_id.pid != info->mi_id.pid) + return LNET_MATCHMD_NONE; + + /* mismatched ME matchbits? */ + if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0) + return LNET_MATCHMD_NONE; + + /* Hurrah! This _is_ a match; check it out... */ + + if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0) + offset = md->md_offset; + else + offset = info->mi_roffset; + + if ((md->md_options & LNET_MD_MAX_SIZE) != 0) { + mlength = md->md_max_size; + LASSERT(md->md_offset + mlength <= md->md_length); + } else { + mlength = md->md_length - offset; + } + + if (info->mi_rlength <= mlength) { /* fits in allowed space */ + mlength = info->mi_rlength; + } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) { + /* this packet _really_ is too big */ + CERROR("Matching packet from %s, match %llu" + " length %d too big: %d left, %d allowed\n", + libcfs_idstr(&info->mi_id), info->mi_mbits, + info->mi_rlength, md->md_length - offset, mlength); + + return LNET_MATCHMD_DROP; + } + + /* Commit to this ME/MD */ + CDEBUG(D_NET, "Incoming %s index %x from %s of " + "length %d/%d into md %#llx [%d] + %d\n", + (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get", + info->mi_portal, libcfs_idstr(&info->mi_id), mlength, + info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset); + + lnet_msg_attach_md(msg, md, offset, mlength); + md->md_offset = offset + mlength; + + if (!lnet_md_exhausted(md)) + return LNET_MATCHMD_OK; + + /* Auto-unlink NOW, so the ME gets unlinked if required. + * We bumped md->md_refcount above so the MD just gets flagged + * for unlink when it is finalized. */ + if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0) + lnet_md_unlink(md); + + return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED; +} + +static struct lnet_match_table * +lnet_match2mt(struct lnet_portal *ptl, struct lnet_processid *id, __u64 mbits) +{ + if (LNET_CPT_NUMBER == 1) + return ptl->ptl_mtables[0]; /* the only one */ + + /* if it's a unique portal, return match-table hashed by NID */ + return lnet_ptl_is_unique(ptl) ? 
+ ptl->ptl_mtables[lnet_nid2cpt(&id->nid, NULL)] : NULL; +} + +struct lnet_match_table * +lnet_mt_of_attach(unsigned int index, struct lnet_processid *id, + __u64 mbits, __u64 ignore_bits, enum lnet_ins_pos pos) +{ + struct lnet_portal *ptl; + struct lnet_match_table *mtable; + + /* NB: called w/o lock */ + LASSERT(index < the_lnet.ln_nportals); + + if (!lnet_ptl_match_type(index, id, mbits, ignore_bits)) + return NULL; + + ptl = the_lnet.ln_portals[index]; + + mtable = lnet_match2mt(ptl, id, mbits); + if (mtable != NULL) /* unique portal or only one match-table */ + return mtable; + + /* it's a wildcard portal */ + switch (pos) { + default: + return NULL; + case LNET_INS_BEFORE: + case LNET_INS_AFTER: + /* posted by no affinity thread, always hash to specific + * match-table to avoid buffer stealing which is heavy */ + return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER]; + case LNET_INS_LOCAL: + /* posted by cpu-affinity thread */ + return ptl->ptl_mtables[lnet_cpt_current()]; + } +} + +static struct lnet_match_table * +lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + unsigned int nmaps; + unsigned int rotor; + unsigned int cpt; + bool routed; + + /* NB: called w/o lock */ + LASSERT(info->mi_portal < the_lnet.ln_nportals); + ptl = the_lnet.ln_portals[info->mi_portal]; + + LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)); + + mtable = lnet_match2mt(ptl, &info->mi_id, info->mi_mbits); + if (mtable != NULL) + return mtable; + + /* it's a wildcard portal */ + routed = LNET_NID_NET(&msg->msg_hdr.src_nid) != + LNET_NID_NET(&msg->msg_hdr.dest_nid); + + if (portal_rotor == LNET_PTL_ROTOR_OFF || + (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) { + cpt = lnet_cpt_current(); + if (ptl->ptl_mtables[cpt]->mt_enabled) + return ptl->ptl_mtables[cpt]; + } + + rotor = ptl->ptl_rotor++; /* get round-robin factor */ + if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed) + cpt = info->mi_cpt; + else + cpt = rotor % LNET_CPT_NUMBER; + + if (!ptl->ptl_mtables[cpt]->mt_enabled) { + /* is there any active entry for this portal? 
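+		 * For illustration: with ptl_mt_nmaps == 2 and
+		 * ptl_mt_maps == { 0, 2 }, a rotor value of 5 remaps
+		 * cpt to ptl_mt_maps[5 % 2] == 2.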
*/ + nmaps = ptl->ptl_mt_nmaps; + /* map to an active mtable to avoid heavy "stealing" */ + if (nmaps != 0) { + /* NB: there is possibility that ptl_mt_maps is being + * changed because we are not under protection of + * lnet_ptl_lock, but it shouldn't hurt anything */ + cpt = ptl->ptl_mt_maps[rotor % nmaps]; + } + } + + return ptl->ptl_mtables[cpt]; +} + +static int +lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos) +{ + __u64 *bmap; + int i; + + if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + return 0; + + if (pos < 0) { /* check all bits */ + for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) { + if (mtable->mt_exhausted[i] != (__u64)(-1)) + return 0; + } + return 1; + } + + LASSERT(pos <= LNET_MT_HASH_IGNORE); + /* mtable::mt_mhash[pos] is marked as exhausted or not */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + return ((*bmap) & (1ULL << pos)) != 0; +} + +static void +lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted) +{ + __u64 *bmap; + + LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])); + LASSERT(pos <= LNET_MT_HASH_IGNORE); + + /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + if (!exhausted) + *bmap &= ~(1ULL << pos); + else + *bmap |= 1ULL << pos; +} + +struct list_head * +lnet_mt_match_head(struct lnet_match_table *mtable, + struct lnet_processid *id, __u64 mbits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal]; + + if (lnet_ptl_is_wildcard(ptl)) { + return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK]; + } else { + unsigned long hash = mbits + nidhash(&id->nid) + id->pid; + + LASSERT(lnet_ptl_is_unique(ptl)); + hash = hash_long(hash, LNET_MT_HASH_BITS); + return &mtable->mt_mhash[hash & LNET_MT_HASH_MASK]; + } +} + +int +lnet_mt_match_md(struct lnet_match_table *mtable, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct list_head *head; + struct lnet_me *me; + struct lnet_me *tmp; + int exhausted = 0; + int rc; + + /* any ME with ignore bits? 
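+	 * (MEs attached with non-zero ignore bits are all chained on the
+	 * dedicated LNET_MT_HASH_IGNORE list, so that list is scanned
+	 * first; the match-bits hash bucket is then re-checked below when
+	 * a match has not yet been found.)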
*/ + if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE])) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, &info->mi_id, + info->mi_mbits); + again: + /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */ + if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + exhausted = LNET_MATCHMD_EXHAUSTED; + + list_for_each_entry_safe(me, tmp, head, me_list) { + /* ME attached but MD not attached yet */ + if (me->me_md == NULL) + continue; + + LASSERT(me == me->me_md->md_me); + + rc = lnet_try_match_md(me->me_md, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) == 0) + exhausted = 0; /* mlist is not empty */ + + if ((rc & LNET_MATCHMD_FINISH) != 0) { + /* don't return EXHAUSTED bit because we don't know + * whether the mlist is empty or not */ + return rc & ~LNET_MATCHMD_EXHAUSTED; + } + } + + if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */ + lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1); + if (!lnet_mt_test_exhausted(mtable, -1)) + exhausted = 0; + } + + if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) { + head = lnet_mt_match_head(mtable, &info->mi_id, + info->mi_mbits); + goto again; /* re-check MEs w/o ignore-bits */ + } + + if (info->mi_opc == LNET_MD_OP_GET || + !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal])) + return LNET_MATCHMD_DROP | exhausted; + + return LNET_MATCHMD_NONE | exhausted; +} + +static int +lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg) +{ + int rc; + + /* message arrived before any buffer posting on this portal, + * simply delay or drop this message */ + if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl))) + return 0; + + lnet_ptl_lock(ptl); + /* check it again with hold of lock */ + if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) { + lnet_ptl_unlock(ptl); + return 0; + } + + if (lnet_ptl_is_lazy(ptl)) { + if (msg->msg_rx_ready_delay) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + } + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + + lnet_ptl_unlock(ptl); + return rc; +} + +static int +lnet_ptl_match_delay(struct lnet_portal *ptl, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + int first = ptl->ptl_mt_maps[0]; /* read w/o lock */ + int rc = 0; + int i; + + /* + * Steal buffer from other CPTs, and delay msg if nothing to + * steal. This function is more expensive than a regular + * match, but we don't expect it can happen a lot. The return + * code contains one of LNET_MATCHMD_OK, LNET_MATCHMD_DROP, or + * LNET_MATCHMD_NONE. + */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + for (i = 0; i < LNET_CPT_NUMBER; i++) { + struct lnet_match_table *mtable; + int cpt; + + cpt = (first + i) % LNET_CPT_NUMBER; + mtable = ptl->ptl_mtables[cpt]; + if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled) + continue; + + lnet_res_lock(cpt); + lnet_ptl_lock(ptl); + + if (i == 0) { + /* The first try, add to stealing list. */ + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_stealing); + } + + if (!list_empty(&msg->msg_list)) { + /* On stealing list. */ + rc = lnet_mt_match_md(mtable, info, msg); + + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && + mtable->mt_enabled) + lnet_ptl_disable_mt(ptl, cpt); + + if ((rc & LNET_MATCHMD_FINISH) != 0) { + /* Match found, remove from stealing list. 
*/ + list_del_init(&msg->msg_list); + } else if (i == LNET_CPT_NUMBER - 1 || /* (1) */ + ptl->ptl_mt_nmaps == 0 || /* (2) */ + (ptl->ptl_mt_nmaps == 1 && /* (3) */ + ptl->ptl_mt_maps[0] == cpt)) { + /* + * No match found, and this is either + * (1) the last cpt to check, or + * (2) there is no active cpt, or + * (3) this is the only active cpt. + * There is nothing to steal: delay or + * drop the message. + */ + list_del_init(&msg->msg_list); + + if (lnet_ptl_is_lazy(ptl)) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + } else { + /* Do another iteration. */ + rc = 0; + } + } else { + /* + * No longer on stealing list: another thread + * matched the message in lnet_ptl_attach_md(). + * We are now expected to handle the message. + */ + rc = msg->msg_md == NULL ? + LNET_MATCHMD_DROP : LNET_MATCHMD_OK; + } + + lnet_ptl_unlock(ptl); + lnet_res_unlock(cpt); + + /* + * Note that test (1) above ensures that we always + * exit the loop through this break statement. + * + * LNET_MATCHMD_NONE means msg was added to the + * delayed queue, and we may no longer reference it + * after lnet_ptl_unlock() and lnet_res_unlock(). + */ + if (rc & (LNET_MATCHMD_FINISH | LNET_MATCHMD_NONE)) + break; + } + + return rc; +} + +int +lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + int rc; + + CDEBUG(D_NET, + "Request from %s of length %d into portal %d MB=%#llx\n", + libcfs_idstr(&info->mi_id), + info->mi_rlength, info->mi_portal, info->mi_mbits); + + if (info->mi_portal >= the_lnet.ln_nportals) { + CERROR("Invalid portal %d not in [0-%d]\n", + info->mi_portal, the_lnet.ln_nportals); + return LNET_MATCHMD_DROP; + } + + ptl = the_lnet.ln_portals[info->mi_portal]; + rc = lnet_ptl_match_early(ptl, msg); + if (rc != 0) /* matched or delayed early message */ + return rc; + + mtable = lnet_mt_of_match(info, msg); + lnet_res_lock(mtable->mt_cpt); + + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = LNET_MATCHMD_DROP; + goto out1; + } + + rc = lnet_mt_match_md(mtable, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) { + lnet_ptl_lock(ptl); + lnet_ptl_disable_mt(ptl, mtable->mt_cpt); + lnet_ptl_unlock(ptl); + } + + if ((rc & LNET_MATCHMD_FINISH) != 0) /* matched or dropping */ + goto out1; + + if (!msg->msg_rx_ready_delay) + goto out1; + + LASSERT(lnet_ptl_is_lazy(ptl)); + LASSERT(!msg->msg_rx_delayed); + + /* NB: we don't expect "delay" can happen a lot */ + if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) { + lnet_ptl_lock(ptl); + + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(mtable->mt_cpt); + rc = LNET_MATCHMD_NONE; + } else { + lnet_res_unlock(mtable->mt_cpt); + rc = lnet_ptl_match_delay(ptl, info, msg); + } + + /* LNET_MATCHMD_NONE means msg was added to the delay queue */ + if (rc & LNET_MATCHMD_NONE) { + CDEBUG(D_NET, + "Delaying %s from %s ptl %d MB %#llx off %d len %d\n", + info->mi_opc == LNET_MD_OP_PUT ? 
"PUT" : "GET", + libcfs_idstr(&info->mi_id), info->mi_portal, + info->mi_mbits, info->mi_roffset, info->mi_rlength); + } + goto out0; + out1: + lnet_res_unlock(mtable->mt_cpt); + out0: + /* EXHAUSTED bit is only meaningful for internal functions */ + return rc & ~LNET_MATCHMD_EXHAUSTED; +} + +void +lnet_ptl_detach_md(struct lnet_me *me, struct lnet_libmd *md) +{ + LASSERT(me->me_md == md && md->md_me == me); + + me->me_md = NULL; + md->md_me = NULL; +} + +/* called with lnet_res_lock held */ +void +lnet_ptl_attach_md(struct lnet_me *me, struct lnet_libmd *md, + struct list_head *matches, struct list_head *drops) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[me->me_portal]; + struct lnet_match_table *mtable; + struct list_head *head; + struct lnet_msg *tmp; + struct lnet_msg *msg; + int exhausted = 0; + int cpt; + + LASSERT(md->md_refcount == 0); /* a brand new MD */ + + me->me_md = md; + md->md_me = me; + + cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + mtable = ptl->ptl_mtables[cpt]; + + if (list_empty(&ptl->ptl_msg_stealing) && + list_empty(&ptl->ptl_msg_delayed) && + !lnet_mt_test_exhausted(mtable, me->me_pos)) + return; + + lnet_ptl_lock(ptl); + head = &ptl->ptl_msg_stealing; + again: + list_for_each_entry_safe(msg, tmp, head, msg_list) { + struct lnet_match_info info; + struct lnet_hdr *hdr; + int rc; + + LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing); + + hdr = &msg->msg_hdr; + /* Multi-Rail: Primary peer NID */ + info.mi_id.nid = msg->msg_initiator; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + + rc = lnet_try_match_md(md, &info, msg); + + exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0; + if ((rc & LNET_MATCHMD_NONE) != 0) { + if (exhausted) + break; + continue; + } + + /* Hurrah! 
This _is_ a match */ + LASSERT((rc & LNET_MATCHMD_FINISH) != 0); + list_del_init(&msg->msg_list); + + if (head == &ptl->ptl_msg_stealing) { + if (exhausted) + break; + /* stealing thread will handle the message */ + continue; + } + + if ((rc & LNET_MATCHMD_OK) != 0) { + list_add_tail(&msg->msg_list, matches); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " + "match %llu offset %d length %d.\n", + libcfs_idstr(&info.mi_id), + info.mi_portal, info.mi_mbits, + info.mi_roffset, info.mi_rlength); + } else { + list_add_tail(&msg->msg_list, drops); + } + + if (exhausted) + break; + } + + if (!exhausted && head == &ptl->ptl_msg_stealing) { + head = &ptl->ptl_msg_delayed; + goto again; + } + + if (lnet_ptl_is_wildcard(ptl) && !exhausted) { + lnet_mt_set_exhausted(mtable, me->me_pos, 0); + if (!mtable->mt_enabled) + lnet_ptl_enable_mt(ptl, cpt); + } + + lnet_ptl_unlock(ptl); +} + +static void +lnet_ptl_cleanup(struct lnet_portal *ptl) +{ + struct lnet_match_table *mtable; + int i; + + if (ptl->ptl_mtables == NULL) /* uninitialized portal */ + return; + + LASSERT(list_empty(&ptl->ptl_msg_delayed)); + LASSERT(list_empty(&ptl->ptl_msg_stealing)); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + struct list_head *mhash; + struct lnet_me *me; + int j; + + if (mtable->mt_mhash == NULL) /* uninitialized match-table */ + continue; + + mhash = mtable->mt_mhash; + /* cleanup ME */ + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) { + while (!list_empty(&mhash[j])) { + me = list_entry(mhash[j].next, + struct lnet_me, me_list); + CERROR("Active ME %p on exit\n", me); + list_del(&me->me_list); + CDEBUG(D_MALLOC, + "slab-freed 'me' at %p in cleanup.\n", + me); + kmem_cache_free(lnet_mes_cachep, me); + } + } + /* the extra entry is for MEs with ignore bits */ + CFS_FREE_PTR_ARRAY(mhash, LNET_MT_HASH_SIZE + 1); + } + + cfs_percpt_free(ptl->ptl_mtables); + ptl->ptl_mtables = NULL; +} + +static int +lnet_ptl_setup(struct lnet_portal *ptl, int index) +{ + struct lnet_match_table *mtable; + struct list_head *mhash; + int i; + int j; + + ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct lnet_match_table)); + if (ptl->ptl_mtables == NULL) { + CERROR("Failed to create match table for portal %d\n", index); + return -ENOMEM; + } + + ptl->ptl_index = index; + INIT_LIST_HEAD(&ptl->ptl_msg_delayed); + INIT_LIST_HEAD(&ptl->ptl_msg_stealing); + spin_lock_init(&ptl->ptl_lock); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + /* the extra entry is for MEs with ignore bits */ + LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i, + sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1)); + if (mhash == NULL) { + CERROR("Failed to create match hash for portal %d\n", + index); + goto failed; + } + + memset(&mtable->mt_exhausted[0], -1, + sizeof(mtable->mt_exhausted[0]) * + LNET_MT_EXHAUSTED_BMAP); + mtable->mt_mhash = mhash; + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) + INIT_LIST_HEAD(&mhash[j]); + + mtable->mt_portal = index; + mtable->mt_cpt = i; + } + + return 0; + failed: + lnet_ptl_cleanup(ptl); + return -ENOMEM; +} + +#define PORTAL_SIZE (offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER])) +void +lnet_portals_destroy(void) +{ + int i; + + if (the_lnet.ln_portals == NULL) + return; + + for (i = 0; i < the_lnet.ln_nportals; i++) + if (the_lnet.ln_portals[i]) { + lnet_ptl_cleanup(the_lnet.ln_portals[i]); + LIBCFS_FREE(the_lnet.ln_portals[i], PORTAL_SIZE); + } + + CFS_FREE_PTR_ARRAY(the_lnet.ln_portals, the_lnet.ln_nportals); + the_lnet.ln_portals = NULL; +} + +int +lnet_portals_create(void) +{ + int i; 
+ + the_lnet.ln_nportals = MAX_PORTALS; + CFS_ALLOC_PTR_ARRAY(the_lnet.ln_portals, the_lnet.ln_nportals); + if (the_lnet.ln_portals == NULL) { + CERROR("Failed to allocate portals table\n"); + return -ENOMEM; + } + + for (i = 0; i < the_lnet.ln_nportals; i++) { + LIBCFS_ALLOC(the_lnet.ln_portals[i], PORTAL_SIZE); + if (!the_lnet.ln_portals[i] || + lnet_ptl_setup(the_lnet.ln_portals[i], i)) { + lnet_portals_destroy(); + return -ENOMEM; + } + } + + return 0; +} + +/** + * Turn on the lazy portal attribute. Use with caution! + * + * This portal attribute only affects incoming PUT requests to the portal, + * and is off by default. By default, if there's no matching MD for an + * incoming PUT request, it is simply dropped. With the lazy attribute on, + * such requests are queued indefinitely until either a matching MD is + * posted to the portal or the lazy attribute is turned off. + * + * It would prevent dropped requests, however it should be regarded as the + * last line of defense - i.e. users must keep a close watch on active + * buffers on a lazy portal and once it becomes too low post more buffers as + * soon as possible. This is because delayed requests usually have detrimental + * effects on underlying network connections. A few delayed requests often + * suffice to bring an underlying connection to a complete halt, due to flow + * control mechanisms. + * + * There's also a DOS attack risk. If users don't post match-all MDs on a + * lazy portal, a malicious peer can easily stop a service by sending some + * PUT requests with match bits that won't match any MD. A routed server is + * especially vulnerable since the connections to its neighbor routers are + * shared among all clients. + * + * \param portal Index of the portal to enable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ +int +LNetSetLazyPortal(int portal) +{ + struct lnet_portal *ptl; + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + CDEBUG(D_NET, "Setting portal %d lazy\n", portal); + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + lnet_ptl_setopt(ptl, LNET_PTL_LAZY); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + return 0; +} +EXPORT_SYMBOL(LNetSetLazyPortal); + +int +lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason) +{ + struct lnet_portal *ptl; + LIST_HEAD(zombies); + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + if (!lnet_ptl_is_lazy(ptl)) { + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + return 0; + } + + if (ni != NULL) { + struct lnet_msg *msg, *tmp; + + /* grab all messages which are on the NI passed in */ + list_for_each_entry_safe(msg, tmp, &ptl->ptl_msg_delayed, + msg_list) { + if (msg->msg_txni == ni || msg->msg_rxni == ni) + list_move(&msg->msg_list, &zombies); + } + } else { + if (the_lnet.ln_state != LNET_STATE_RUNNING) + CWARN("Active lazy portal %d on exit\n", portal); + else + CDEBUG(D_NET, "clearing portal %d lazy\n", portal); + + /* grab all the blocked messages atomically */ + list_splice_init(&ptl->ptl_msg_delayed, &zombies); + + lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY); + } + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + lnet_drop_delayed_msg_list(&zombies, reason); + + return 0; +} + +/** + * Turn off the lazy portal attribute. 
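+ * A typical (illustrative) pairing with LNetSetLazyPortal() brackets the
+ * lifetime of a service, for example:
+ *
+ *	LNetSetLazyPortal(portal);
+ *	... post match-all MDs and handle requests ...
+ *	LNetClearLazyPortal(portal);
+ *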
Delayed requests on the portal, + * if any, will be all dropped when this function returns. + * + * \param portal Index of the portal to disable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ +int +LNetClearLazyPortal(int portal) +{ + return lnet_clear_lazy_portal(NULL, portal, + "Clearing lazy portal attr"); +} +EXPORT_SYMBOL(LNetClearLazyPortal); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c new file mode 100644 index 0000000000000..90cdc3e2b4dbe --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -0,0 +1,434 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include +#include +#include +/* For sys_open & sys_close */ +#include +#include +#include + +#include +#include +#include +#include + +int +lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + long jiffies_left = cfs_time_seconds(timeout); + unsigned long then; + + LASSERT(nob > 0); + /* Caller may pass a zero timeout if she thinks the socket buffer is + * empty enough to take the whole message immediately */ + + for (;;) { + struct kvec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_flags = (timeout == 0) ? 
MSG_DONTWAIT : 0 + }; + + if (timeout != 0) { + struct sock *sk = sock->sk; + + /* Set send timeout to remaining time */ + lock_sock(sk); + sk->sk_sndtimeo = jiffies_left; + release_sock(sk); + } + + then = jiffies; + rc = kernel_sendmsg(sock, &msg, &iov, 1, nob); + jiffies_left -= jiffies - then; + + if (rc == nob) + return 0; + + if (rc < 0) + return rc; + + if (rc == 0) { + CERROR("Unexpected zero rc\n"); + return -ECONNABORTED; + } + + if (jiffies_left <= 0) + return -EAGAIN; + + buffer = ((char *)buffer) + rc; + nob -= rc; + } + return 0; +} +EXPORT_SYMBOL(lnet_sock_write); + +int +lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + long jiffies_left = cfs_time_seconds(timeout); + unsigned long then; + + LASSERT(nob > 0); + LASSERT(jiffies_left > 0); + + for (;;) { + struct kvec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_flags = 0 + }; + struct sock *sk = sock->sk; + + /* Set receive timeout to remaining time */ + lock_sock(sk); + sk->sk_rcvtimeo = jiffies_left; + release_sock(sk); + + then = jiffies; + rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0); + jiffies_left -= jiffies - then; + + if (rc < 0) + return rc; + + if (rc == 0) + return -ECONNRESET; + + buffer = ((char *)buffer) + rc; + nob -= rc; + + if (nob == 0) + return 0; + + if (jiffies_left <= 0) + return -ETIMEDOUT; + } +} +EXPORT_SYMBOL(lnet_sock_read); + +int choose_ipv4_src(__u32 *ret, int interface, __u32 dst_ipaddr, struct net *ns) +{ + struct net_device *dev; + struct in_device *in_dev; + int err; + DECLARE_CONST_IN_IFADDR(ifa); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(ns, interface); + err = -EINVAL; + if (!dev || !(dev->flags & IFF_UP)) + goto out; + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + goto out; + err = -ENOENT; + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (err || + ((dst_ipaddr ^ ntohl(ifa->ifa_local)) + & ntohl(ifa->ifa_mask)) == 0) { + /* This address at least as good as what we + * already have + */ + *ret = ntohl(ifa->ifa_local); + err = 0; + } + } + endfor_ifa(in_dev); +out: + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(choose_ipv4_src); + +static struct socket * +lnet_sock_create(int interface, struct sockaddr *remaddr, + int local_port, struct net *ns) +{ + struct socket *sock; + int rc; + int family; + + family = AF_INET6; + if (remaddr) + family = remaddr->sa_family; +retry: +#ifdef HAVE_SOCK_CREATE_KERN_USE_NET + rc = sock_create_kern(ns, family, SOCK_STREAM, 0, &sock); +#else + rc = sock_create_kern(family, SOCK_STREAM, 0, &sock); +#endif + if (rc == -EAFNOSUPPORT && family == AF_INET6 && !remaddr) { + family = AF_INET; + goto retry; + } + + if (rc) { + CERROR("Can't create socket: %d\n", rc); + return ERR_PTR(rc); + } + + sock->sk->sk_reuseport = 1; + + if (interface >= 0 || local_port != 0) { + struct sockaddr_storage locaddr = {}; + + switch (family) { + case AF_INET: { + struct sockaddr_in *sin = (void *)&locaddr; + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + if (interface >= 0 && remaddr) { + struct sockaddr_in *rem = (void *)remaddr; + __u32 ip; + + rc = choose_ipv4_src(&ip, + interface, + ntohl(rem->sin_addr.s_addr), + ns); + if (rc) + goto failed; + sin->sin_addr.s_addr = htonl(ip); + } + sin->sin_port = htons(local_port); + break; + } +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: { + struct sockaddr_in6 *sin6 = (void *)&locaddr; + int val = 0; + + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = in6addr_any; + + /* Make sure we get both IPv4 and IPv6 connections. 
+ * This is the default, but it can be overridden so we + * force it back. + */ +#ifdef HAVE_KERNEL_SETSOCKOPT + kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, + (char *) &val, sizeof(val)); +#elif defined(_LINUX_SOCKPTR_H) + /* sockptr_t was introduced around + * v5.8-rc4-1952-ga7b75c5a8c41 and allows a + * kernel address to be passed to ->setsockopt + */ + if (ipv6_only_sock(sock->sk)) { + sockptr_t optval = KERNEL_SOCKPTR(&val); + + sock->ops->setsockopt(sock, + IPPROTO_IPV6, IPV6_V6ONLY, + optval, sizeof(val)); + } +#else + /* From v5.7-rc6-2614-g5a892ff2facb when + * kernel_setsockopt() was removed until + * sockptr_t (above) there is no clean way to + * pass kernel address to setsockopt. We could + * use get_fs()/set_fs(), but in this particular + * situation there is an easier way. It depends + * on the fact that at least for these few + * kernels a NULL address to ipv6_setsockopt() + * is treated like the address of a zero. + */ + if (ipv6_only_sock(sock->sk) && !val) { + void *optval = NULL; + + sock->ops->setsockopt(sock, + IPPROTO_IPV6, IPV6_V6ONLY, + optval, sizeof(val)); + } +#endif /* HAVE_KERNEL_SETSOCKOPT */ + + if (interface >= 0 && remaddr) { + struct sockaddr_in6 *rem = (void *)remaddr; + + ipv6_dev_get_saddr(ns, + dev_get_by_index(ns, + interface), + &rem->sin6_addr, 0, + &sin6->sin6_addr); + } + sin6->sin6_port = htons(local_port); + break; + } +#endif /* IS_ENABLED(CONFIG_IPV6) */ + } + rc = kernel_bind(sock, (struct sockaddr *)&locaddr, + sizeof(locaddr)); + if (rc == -EADDRINUSE) { + CDEBUG(D_NET, "Port %d already in use\n", local_port); + goto failed; + } + if (rc != 0) { + CERROR("Error trying to bind to port %d: %d\n", + local_port, rc); + goto failed; + } + } + return sock; + +failed: + sock_release(sock); + return ERR_PTR(rc); +} + +void +lnet_sock_setbuf(struct socket *sock, int txbufsize, int rxbufsize) +{ + struct sock *sk = sock->sk; + + if (txbufsize != 0) { + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = txbufsize; + sk->sk_write_space(sk); + } + + if (rxbufsize != 0) { + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + sk->sk_sndbuf = rxbufsize; + } +} +EXPORT_SYMBOL(lnet_sock_setbuf); + +int +lnet_sock_getaddr(struct socket *sock, bool remote, + struct sockaddr_storage *peer) +{ + int rc; +#ifndef HAVE_KERN_SOCK_GETNAME_2ARGS + int len = sizeof(*peer); +#endif + + if (remote) + rc = lnet_kernel_getpeername(sock, + (struct sockaddr *)peer, &len); + else + rc = lnet_kernel_getsockname(sock, + (struct sockaddr *)peer, &len); + if (rc < 0) { + CERROR("Error %d getting sock %s IP/port\n", + rc, remote ? 
"peer" : "local"); + return rc; + } + if (peer->ss_family == AF_INET6) { + struct sockaddr_in6 *in6 = (void *)peer; + struct sockaddr_in *in = (void *)peer; + short port = in6->sin6_port; + + if (ipv6_addr_v4mapped(&in6->sin6_addr)) { + /* Pretend it is a v4 socket */ + memset(in, 0, sizeof(*in)); + in->sin_family = AF_INET; + in->sin_port = port; + memcpy(&in->sin_addr, &in6->sin6_addr.s6_addr32[3], 4); + } + } + return 0; +} +EXPORT_SYMBOL(lnet_sock_getaddr); + +void lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize) +{ + if (txbufsize != NULL) + *txbufsize = sock->sk->sk_sndbuf; + + if (rxbufsize != NULL) + *rxbufsize = sock->sk->sk_rcvbuf; +} +EXPORT_SYMBOL(lnet_sock_getbuf); + +struct socket * +lnet_sock_listen(int local_port, int backlog, struct net *ns) +{ + struct socket *sock; + int rc; + + sock = lnet_sock_create(-1, NULL, local_port, ns); + if (IS_ERR(sock)) { + rc = PTR_ERR(sock); + if (rc == -EADDRINUSE) + CERROR("Can't create socket: port %d already in use\n", + local_port); + return ERR_PTR(rc); + } + + rc = kernel_listen(sock, backlog); + if (rc == 0) + return sock; + + CERROR("Can't set listen backlog %d: %d\n", backlog, rc); + sock_release(sock); + return ERR_PTR(rc); +} + +struct socket * +lnet_sock_connect(int interface, int local_port, + struct sockaddr *peeraddr, + struct net *ns) +{ + struct socket *sock; + int rc; + + sock = lnet_sock_create(interface, peeraddr, local_port, ns); + if (IS_ERR(sock)) + return sock; + + rc = kernel_connect(sock, peeraddr, sizeof(struct sockaddr_in6), 0); + if (rc == 0) + return sock; + + /* EADDRNOTAVAIL probably means we're already connected to the same + * peer/port on the same local port on a differently typed + * connection. Let our caller retry with a different local + * port... */ + + CDEBUG_LIMIT(rc == -EADDRNOTAVAIL ? 
D_NET : D_NETERROR, + "Error %d connecting %d -> %pISp\n", rc, + local_port, peeraddr); + + sock_release(sock); + return ERR_PTR(rc); +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/lnet_rdma.c b/drivers/staging/lustrefsx/lnet/lnet/lnet_rdma.c new file mode 100644 index 0000000000000..c5c9d9ffe8b50 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lnet_rdma.c @@ -0,0 +1,208 @@ +#include +#include +#include + +#define ERROR_PRINT_DEADLINE 3600 + +atomic_t nvfs_shutdown = ATOMIC_INIT(1); +struct nvfs_dma_rw_ops *nvfs_ops = NULL; +struct percpu_counter nvfs_n_ops; + +static inline long nvfs_count_ops(void) +{ + return percpu_counter_sum(&nvfs_n_ops); +} + +static struct nvfs_dma_rw_ops *nvfs_get_ops(void) +{ + if (!nvfs_ops || atomic_read(&nvfs_shutdown)) + return NULL; + + percpu_counter_inc(&nvfs_n_ops); + + return nvfs_ops; +} + +static inline void nvfs_put_ops(void) +{ + percpu_counter_dec(&nvfs_n_ops); +} + +static inline bool nvfs_check_feature_set(struct nvfs_dma_rw_ops *ops) +{ + bool supported = true; + static time64_t last_printed; + + if (unlikely(!NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops))) { + if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE) + CDEBUG(D_CONSOLE, + "NVFS sg list preparation callback missing\n"); + supported = false; + } + if (unlikely(!NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops))) { + if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE) + CDEBUG(D_CONSOLE, + "NVFS DMA mapping callbacks missing\n"); + supported = false; + } + if (unlikely(!NVIDIA_FS_CHECK_FT_GPU_PAGE(ops))) { + if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE) + CDEBUG(D_CONSOLE, + "NVFS page identification callback missing\n"); + supported = false; + } + if (unlikely(!NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops))) { + if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE) + CDEBUG(D_CONSOLE, + "NVFS device priority callback not missing\n"); + supported = false; + } + + if (unlikely(!supported && + ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE))) + last_printed = ktime_get_seconds(); + else if (supported) + last_printed = 0; + + return supported; +} + +int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops) +{ + if (!ops || !nvfs_check_feature_set(ops)) + return -EINVAL; + + nvfs_ops = ops; + (void)percpu_counter_init(&nvfs_n_ops, 0, GFP_KERNEL); + atomic_set(&nvfs_shutdown, 0); + CDEBUG(D_NET, "registering nvfs %p\n", ops); + return 0; +} +EXPORT_SYMBOL(REGISTER_FUNC); + +void UNREGISTER_FUNC(void) +{ + (void)atomic_cmpxchg(&nvfs_shutdown, 0, 1); + do { + CDEBUG(D_NET, "Attempting to de-register nvfs: %ld\n", + nvfs_count_ops()); + msleep(NVFS_HOLD_TIME_MS); + } while (nvfs_count_ops()); + nvfs_ops = NULL; + percpu_counter_destroy(&nvfs_n_ops); +} +EXPORT_SYMBOL(UNREGISTER_FUNC); + +unsigned int +lnet_get_dev_prio(struct device *dev, unsigned int dev_idx) +{ + unsigned int dev_prio = UINT_MAX; + struct nvfs_dma_rw_ops *nvfs_ops; + + if (!dev) + return dev_prio; + + nvfs_ops = nvfs_get_ops(); + if (!nvfs_ops) + return dev_prio; + + dev_prio = nvfs_ops->nvfs_device_priority (dev, dev_idx); + + nvfs_put_ops(); + return dev_prio; +} +EXPORT_SYMBOL(lnet_get_dev_prio); + +int lnet_rdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction direction) +{ + struct nvfs_dma_rw_ops *nvfs_ops = nvfs_get_ops(); + + if (nvfs_ops) { + int count; + + count = nvfs_ops->nvfs_dma_map_sg_attrs(dev, + sg, nents, direction, + DMA_ATTR_NO_WARN); + + if (unlikely((count == NVFS_IO_ERR))) { + nvfs_put_ops(); + return -EIO; + } + + if 
(unlikely(count == NVFS_CPU_REQ)) + nvfs_put_ops(); + else + return count; + } + + return 0; +} +EXPORT_SYMBOL(lnet_rdma_map_sg_attrs); + +int lnet_rdma_unmap_sg(struct device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + struct nvfs_dma_rw_ops *nvfs_ops = nvfs_get_ops(); + + if (nvfs_ops) { + int count; + + count = nvfs_ops->nvfs_dma_unmap_sg(dev, sg, + nents, direction); + + /* drop the count we got by calling nvfs_get_ops() */ + nvfs_put_ops(); + + if (count) { + nvfs_put_ops(); + return count; + } + } + + return 0; +} +EXPORT_SYMBOL(lnet_rdma_unmap_sg); + +bool +lnet_is_rdma_only_page(struct page *page) +{ + bool found = false; + struct nvfs_dma_rw_ops *nvfs_ops; + + if (!page) + return found; + + nvfs_ops = nvfs_get_ops(); + if (!nvfs_ops) + return found; + + if (!nvfs_ops->nvfs_is_gpu_page(page)) + goto out; + + found = true; + +out: + nvfs_put_ops(); + return found; +} +EXPORT_SYMBOL(lnet_is_rdma_only_page); + +unsigned int +lnet_get_dev_idx(struct page *page) +{ + unsigned int dev_idx = UINT_MAX; + struct nvfs_dma_rw_ops *nvfs_ops; + + nvfs_ops = nvfs_get_ops(); + if (!nvfs_ops) + return dev_idx; + + dev_idx = nvfs_ops->nvfs_gpu_index(page); + + nvfs_put_ops(); + return dev_idx; +} +EXPORT_SYMBOL(lnet_get_dev_idx); + diff --git a/drivers/staging/lustrefsx/lnet/lnet/lo.c b/drivers/staging/lustrefsx/lnet/lnet/lo.c new file mode 100644 index 0000000000000..d4c9ed101e803 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lo.c @@ -0,0 +1,92 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2017, Intel Corporation. 
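The nvfs wrappers above hide a three-way contract from their callers: a negative return means the NVFS callbacks failed, a positive return is the number of GPU-mapped segments (the helper keeps its nvfs reference until lnet_rdma_unmap_sg() releases it), and 0 means no GPU-direct path is available. A minimal sketch of how a caller might consume that contract, assuming the standard streaming DMA API on the fallback path; example_map_sg() is hypothetical and not part of the patch:

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>
/* plus whichever LNet header declares lnet_rdma_map_sg_attrs() */

static int example_map_sg(struct device *dev, struct scatterlist *sg,
			  int nents, enum dma_data_direction dir)
{
	int count = lnet_rdma_map_sg_attrs(dev, sg, nents, dir);

	if (count < 0)		/* NVFS reported an I/O error (-EIO) */
		return count;
	if (count > 0)		/* segments mapped by the nvfs callbacks */
		return count;

	/* 0: no nvfs ops registered, or NVFS asked for the CPU path;
	 * fall back to the ordinary DMA mapping.
	 */
	return dma_map_sg(dev, sg, nents, dir);
}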
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +static int +lolnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) +{ + LASSERT(!lntmsg->msg_routing); + LASSERT(!lntmsg->msg_target_is_router); + + return lnet_parse(ni, &lntmsg->msg_hdr, &ni->ni_nid, lntmsg, 0); +} + +static int +lolnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, + struct bio_vec *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + struct lnet_msg *sendmsg = private; + + if (lntmsg) { /* not discarding */ + lnet_copy_kiov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + + lnet_finalize(lntmsg, 0); + } + + lnet_finalize(sendmsg, 0); + return 0; +} + +static int lolnd_instanced; + +static void +lolnd_shutdown(struct lnet_ni *ni) +{ + CDEBUG (D_NET, "shutdown\n"); + LASSERT(lolnd_instanced); + + lolnd_instanced = 0; +} + +static int +lolnd_startup(struct lnet_ni *ni) +{ + LASSERT (ni->ni_net->net_lnd == &the_lolnd); + LASSERT (!lolnd_instanced); + lolnd_instanced = 1; + + return (0); +} + +const struct lnet_lnd the_lolnd = { + .lnd_type = LOLND, + .lnd_startup = lolnd_startup, + .lnd_shutdown = lolnd_shutdown, + .lnd_send = lolnd_send, + .lnd_recv = lolnd_recv +}; diff --git a/drivers/staging/lustrefsx/lnet/lnet/module.c b/drivers/staging/lustrefsx/lnet/lnet/module.c new file mode 100644 index 0000000000000..e4fe3f8aa2381 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/module.c @@ -0,0 +1,277 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +static int config_on_load = 0; +module_param(config_on_load, int, 0444); +MODULE_PARM_DESC(config_on_load, "configure network at module load"); + +static DEFINE_MUTEX(lnet_config_mutex); + +static int +lnet_configure(void *arg) +{ + /* 'arg' only there so I can be passed to cfs_create_thread() */ + int rc = 0; + + mutex_lock(&lnet_config_mutex); + + if (!the_lnet.ln_niinit_self) { + rc = try_module_get(THIS_MODULE); + + if (rc != 1) + goto out; + + rc = LNetNIInit(LNET_PID_LUSTRE); + if (rc >= 0) { + the_lnet.ln_niinit_self = 1; + rc = 0; + } else { + module_put(THIS_MODULE); + } + } + +out: + mutex_unlock(&lnet_config_mutex); + return rc; +} + +static int +lnet_unconfigure (void) +{ + int refcount; + + mutex_lock(&lnet_config_mutex); + + if (the_lnet.ln_niinit_self) { + the_lnet.ln_niinit_self = 0; + LNetNIFini(); + module_put(THIS_MODULE); + } + + mutex_lock(&the_lnet.ln_api_mutex); + refcount = the_lnet.ln_refcount; + mutex_unlock(&the_lnet.ln_api_mutex); + + mutex_unlock(&lnet_config_mutex); + + return (refcount == 0) ? 0 : -EBUSY; +} + +static int +lnet_dyn_configure_net(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_data *conf = + (struct lnet_ioctl_config_data *)hdr; + int rc; + + if (conf->cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_add_net(conf); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_dyn_unconfigure_net(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_data *conf = + (struct lnet_ioctl_config_data *) hdr; + int rc; + + if (conf->cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_del_net(conf->cfg_net); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_dyn_configure_ni(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_ni *conf = + (struct lnet_ioctl_config_ni *)hdr; + int rc; + + if (conf->lic_cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_add_ni(conf); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_dyn_unconfigure_ni(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_ni *conf = + (struct lnet_ioctl_config_ni *) hdr; + int rc; + + if (conf->lic_cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_del_ni(conf); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_ioctl(struct notifier_block *nb, + unsigned long cmd, void *vdata) +{ + struct libcfs_ioctl_hdr *hdr = vdata; + int rc; + + switch (cmd) { + case IOC_LIBCFS_CONFIGURE: { + struct libcfs_ioctl_data *data = + (struct libcfs_ioctl_data *)hdr; + + if (data->ioc_hdr.ioc_len < sizeof(*data)) { + rc = -EINVAL; + } else { + the_lnet.ln_nis_from_mod_params = data->ioc_flags; + rc = lnet_configure(NULL); + } + break; + } + + case IOC_LIBCFS_UNCONFIGURE: + rc = lnet_unconfigure(); + break; + + case IOC_LIBCFS_ADD_NET: + rc = lnet_dyn_configure_net(hdr); + break; + + case IOC_LIBCFS_DEL_NET: + rc = lnet_dyn_unconfigure_net(hdr); + break; + + case IOC_LIBCFS_ADD_LOCAL_NI: + rc = lnet_dyn_configure_ni(hdr); + break; + + case 
IOC_LIBCFS_DEL_LOCAL_NI: + rc = lnet_dyn_unconfigure_ni(hdr); + break; + + default: + /* Passing LNET_PID_ANY only gives me a ref if the net is up + * already; I'll need it to ensure the net can't go down while + * I'm called into it */ + rc = LNetNIInit(LNET_PID_ANY); + if (rc >= 0) { + rc = LNetCtl(cmd, hdr); + LNetNIFini(); + } + break; + } + return notifier_from_ioctl_errno(rc); +} + +static struct notifier_block lnet_ioctl_handler = { + .notifier_call = lnet_ioctl, +}; + +static int __init lnet_init(void) +{ + int rc; + ENTRY; + + rc = lnet_lib_init(); + if (rc != 0) { + CERROR("lnet_lib_init: error %d\n", rc); + RETURN(rc); + } + + if (live_router_check_interval != INT_MIN || + dead_router_check_interval != INT_MIN) + LCONSOLE_WARN("live_router_check_interval and dead_router_check_interval have been deprecated. Use alive_router_check_interval instead. Ignoring these deprecated parameters.\n"); + + rc = blocking_notifier_chain_register(&libcfs_ioctl_list, + &lnet_ioctl_handler); + LASSERT(rc == 0); + + if (config_on_load) { + /* Have to schedule a separate thread to avoid deadlocking + * in modload */ + (void)kthread_run(lnet_configure, NULL, "lnet_initd"); + } + + RETURN(0); +} + +static void __exit lnet_exit(void) +{ + int rc; + + rc = blocking_notifier_chain_unregister(&libcfs_ioctl_list, + &lnet_ioctl_handler); + LASSERT(rc == 0); + + lnet_lib_exit(); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Networking layer"); +MODULE_VERSION(LNET_VERSION); +MODULE_LICENSE("GPL"); + +module_init(lnet_init); +module_exit(lnet_exit); diff --git a/drivers/staging/lustrefsx/lnet/lnet/net_fault.c b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c new file mode 100644 index 0000000000000..c2f81fb150887 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c @@ -0,0 +1,1114 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/lnet/net_fault.c + * + * Lustre network fault simulation + * + * Author: liang.zhen@intel.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include + +#define LNET_MSG_MASK (LNET_PUT_BIT | LNET_ACK_BIT | \ + LNET_GET_BIT | LNET_REPLY_BIT) + +struct lnet_drop_rule { + /** link chain on the_lnet.ln_drop_rules */ + struct list_head dr_link; + /** attributes of this rule */ + struct lnet_fault_attr dr_attr; + /** lock to protect \a dr_drop_at and \a dr_stat */ + spinlock_t dr_lock; + /** + * the message sequence to drop, which means message is dropped when + * dr_stat.drs_count == dr_drop_at + */ + unsigned long dr_drop_at; + /** + * seconds to drop the next message, it's exclusive with dr_drop_at + */ + time64_t dr_drop_time; + /** baseline to caculate dr_drop_time */ + time64_t dr_time_base; + /** statistic of dropped messages */ + struct lnet_fault_stat dr_stat; +}; + +static bool +lnet_fault_nid_match(lnet_nid_t nid, lnet_nid_t msg_nid) +{ + if (nid == msg_nid || nid == LNET_NID_ANY) + return true; + + if (LNET_NIDNET(nid) != LNET_NIDNET(msg_nid)) + return false; + + /* 255.255.255.255@net is wildcard for all addresses in a network */ + return LNET_NIDADDR(nid) == LNET_NIDADDR(LNET_NID_ANY); +} + +static bool +lnet_fault_attr_match(struct lnet_fault_attr *attr, lnet_nid_t src, + lnet_nid_t local_nid, lnet_nid_t dst, + unsigned int type, unsigned int portal) +{ + if (!lnet_fault_nid_match(attr->fa_src, src) || + !lnet_fault_nid_match(attr->fa_dst, dst) || + !lnet_fault_nid_match(attr->fa_local_nid, local_nid)) + return false; + + if (!(attr->fa_msg_mask & BIT(type))) + return false; + + /* NB: ACK and REPLY have no portal, but they should have been + * rejected by message mask */ + if (attr->fa_ptl_mask != 0 && /* has portal filter */ + !(attr->fa_ptl_mask & (1ULL << portal))) + return false; + + return true; +} + +static int +lnet_fault_attr_validate(struct lnet_fault_attr *attr) +{ + if (attr->fa_msg_mask == 0) + attr->fa_msg_mask = LNET_MSG_MASK; /* all message types */ + + if (attr->fa_ptl_mask == 0) /* no portal filter */ + return 0; + + /* NB: only PUT and GET can be filtered if portal filter has been set */ + attr->fa_msg_mask &= LNET_GET_BIT | LNET_PUT_BIT; + if (attr->fa_msg_mask == 0) { + CDEBUG(D_NET, "can't find valid message type bits %x\n", + attr->fa_msg_mask); + return -EINVAL; + } + return 0; +} + +static void +lnet_fault_stat_inc(struct lnet_fault_stat *stat, unsigned int type) +{ + /* NB: fs_counter is NOT updated by this function */ + switch (type) { + case LNET_MSG_PUT: + stat->fs_put++; + return; + case LNET_MSG_ACK: + stat->fs_ack++; + return; + case LNET_MSG_GET: + stat->fs_get++; + return; + case LNET_MSG_REPLY: + stat->fs_reply++; + return; + } +} + +/** + * LNet message drop simulation + */ + +/** + * Add a new drop rule to LNet + * There is no check for duplicated drop rule, all rules will be checked for + * incoming message. 
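For reference, lnet_drop_rule_add() below expects exactly one of da_rate / da_interval to be non-zero and reuses the matching rules above (NID wildcards via LNET_NID_ANY, optional portal and message-type filters). A hedged sketch of an attribute block for a rate-based rule; the initializer is illustrative only and would normally reach lnet_drop_rule_add() through the LNET_CTL_DROP_ADD case of lnet_fault_ctl():

/* drop roughly 1 in 100 PUT or GET messages aimed at portal 9 */
struct lnet_fault_attr attr = {
	.fa_src			= LNET_NID_ANY,		/* any source NID */
	.fa_dst			= LNET_NID_ANY,		/* any destination NID */
	.fa_local_nid		= LNET_NID_ANY,		/* any local NI */
	.fa_ptl_mask		= 1ULL << 9,		/* portal filter */
	.fa_msg_mask		= LNET_PUT_BIT | LNET_GET_BIT,
	.u.drop.da_rate		= 100,			/* rate based ... */
	.u.drop.da_interval	= 0,			/* ... so interval stays 0 */
};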
+ */ +static int +lnet_drop_rule_add(struct lnet_fault_attr *attr) +{ + struct lnet_drop_rule *rule; + ENTRY; + + if (!((attr->u.drop.da_rate == 0) ^ (attr->u.drop.da_interval == 0))) { + CDEBUG(D_NET, + "please provide either drop rate or drop interval, " + "but not both at the same time %d/%d\n", + attr->u.drop.da_rate, attr->u.drop.da_interval); + RETURN(-EINVAL); + } + + if (lnet_fault_attr_validate(attr) != 0) + RETURN(-EINVAL); + + CFS_ALLOC_PTR(rule); + if (rule == NULL) + RETURN(-ENOMEM); + + spin_lock_init(&rule->dr_lock); + + rule->dr_attr = *attr; + if (attr->u.drop.da_interval != 0) { + rule->dr_time_base = ktime_get_seconds() + attr->u.drop.da_interval; + rule->dr_drop_time = ktime_get_seconds() + + get_random_u32_below(attr->u.drop.da_interval); + } else { + rule->dr_drop_at = get_random_u32_below(attr->u.drop.da_rate); + } + + lnet_net_lock(LNET_LOCK_EX); + list_add(&rule->dr_link, &the_lnet.ln_drop_rules); + lnet_net_unlock(LNET_LOCK_EX); + + CDEBUG(D_NET, "Added drop rule: src %s, dst %s, rate %d, interval %d\n", + libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_src), + attr->u.drop.da_rate, attr->u.drop.da_interval); + RETURN(0); +} + +/** + * Remove matched drop rules from lnet, all rules that can match \a src and + * \a dst will be removed. + * If \a src is zero, then all rules have \a dst as destination will be remove + * If \a dst is zero, then all rules have \a src as source will be removed + * If both of them are zero, all rules will be removed + */ +static int +lnet_drop_rule_del(lnet_nid_t src, lnet_nid_t dst) +{ + struct lnet_drop_rule *rule; + struct lnet_drop_rule *tmp; + LIST_HEAD(zombies); + int n = 0; + ENTRY; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry_safe(rule, tmp, &the_lnet.ln_drop_rules, dr_link) { + if (rule->dr_attr.fa_src != src && src != 0) + continue; + + if (rule->dr_attr.fa_dst != dst && dst != 0) + continue; + + list_move(&rule->dr_link, &zombies); + } + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry_safe(rule, tmp, &zombies, dr_link) { + CDEBUG(D_NET, "Remove drop rule: src %s->dst: %s (1/%d, %d)\n", + libcfs_nid2str(rule->dr_attr.fa_src), + libcfs_nid2str(rule->dr_attr.fa_dst), + rule->dr_attr.u.drop.da_rate, + rule->dr_attr.u.drop.da_interval); + + list_del(&rule->dr_link); + CFS_FREE_PTR(rule); + n++; + } + + RETURN(n); +} + +/** + * List drop rule at position of \a pos + */ +static int +lnet_drop_rule_list(int pos, struct lnet_fault_attr *attr, + struct lnet_fault_stat *stat) +{ + struct lnet_drop_rule *rule; + int cpt; + int i = 0; + int rc = -ENOENT; + ENTRY; + + cpt = lnet_net_lock_current(); + list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) { + if (i++ < pos) + continue; + + spin_lock(&rule->dr_lock); + *attr = rule->dr_attr; + *stat = rule->dr_stat; + spin_unlock(&rule->dr_lock); + rc = 0; + break; + } + + lnet_net_unlock(cpt); + RETURN(rc); +} + +/** + * reset counters for all drop rules + */ +static void +lnet_drop_rule_reset(void) +{ + struct lnet_drop_rule *rule; + int cpt; + ENTRY; + + cpt = lnet_net_lock_current(); + + list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) { + struct lnet_fault_attr *attr = &rule->dr_attr; + + spin_lock(&rule->dr_lock); + + memset(&rule->dr_stat, 0, sizeof(rule->dr_stat)); + if (attr->u.drop.da_rate != 0) { + rule->dr_drop_at = get_random_u32_below(attr->u.drop.da_rate); + } else { + rule->dr_drop_time = ktime_get_seconds() + + get_random_u32_below(attr->u.drop.da_interval); + rule->dr_time_base = ktime_get_seconds() + attr->u.drop.da_interval; + } + 
spin_unlock(&rule->dr_lock); + } + + lnet_net_unlock(cpt); + EXIT; +} + +static void +lnet_fault_match_health(enum lnet_msg_hstatus *hstatus, __u32 mask) +{ + int choice; + int delta; + int best_delta; + int i; + + /* assign a random failure */ + choice = get_random_u32_below(LNET_MSG_STATUS_END - LNET_MSG_STATUS_OK); + if (choice == 0) + choice++; + + if (mask == HSTATUS_RANDOM) { + *hstatus = choice; + return; + } + + if (mask & BIT(choice)) { + *hstatus = choice; + return; + } + + /* round to the closest ON bit */ + i = HSTATUS_END; + best_delta = HSTATUS_END; + while (i > 0) { + if (mask & BIT(i)) { + delta = choice - i; + if (delta < 0) + delta *= -1; + if (delta < best_delta) { + best_delta = delta; + choice = i; + } + } + i--; + } + + *hstatus = choice; +} + +/** + * check source/destination NID, portal, message type and drop rate, + * decide whether should drop this message or not + */ +static bool +drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, + lnet_nid_t local_nid, lnet_nid_t dst, + unsigned int type, unsigned int portal, + enum lnet_msg_hstatus *hstatus) +{ + struct lnet_fault_attr *attr = &rule->dr_attr; + bool drop; + + if (!lnet_fault_attr_match(attr, src, local_nid, dst, type, portal)) + return false; + + if (attr->u.drop.da_drop_all) { + CDEBUG(D_NET, "set to drop all messages\n"); + drop = true; + goto drop_matched; + } + + /* + * if we're trying to match a health status error but it hasn't + * been set in the rule, then don't match + */ + if ((hstatus && !attr->u.drop.da_health_error_mask) || + (!hstatus && attr->u.drop.da_health_error_mask)) + return false; + + /* match this rule, check drop rate now */ + spin_lock(&rule->dr_lock); + if (attr->u.drop.da_random) { + int value = get_random_u32_below(attr->u.drop.da_interval); + if (value >= (attr->u.drop.da_interval / 2)) + drop = true; + else + drop = false; + } else if (rule->dr_drop_time != 0) { /* time based drop */ + time64_t now = ktime_get_seconds(); + + rule->dr_stat.fs_count++; + drop = now >= rule->dr_drop_time; + if (drop) { + if (now > rule->dr_time_base) + rule->dr_time_base = now; + + rule->dr_drop_time = rule->dr_time_base + + get_random_u32_below(attr->u.drop.da_interval); + rule->dr_time_base += attr->u.drop.da_interval; + + CDEBUG(D_NET, "Drop Rule %s->%s: next drop : %lld\n", + libcfs_nid2str(attr->fa_src), + libcfs_nid2str(attr->fa_dst), + rule->dr_drop_time); + } + + } else { /* rate based drop */ + __u64 count; + + drop = rule->dr_stat.fs_count++ == rule->dr_drop_at; + count = rule->dr_stat.fs_count; + if (do_div(count, attr->u.drop.da_rate) == 0) { + rule->dr_drop_at = rule->dr_stat.fs_count + + get_random_u32_below(attr->u.drop.da_rate); + CDEBUG(D_NET, "Drop Rule %s->%s: next drop: %lu\n", + libcfs_nid2str(attr->fa_src), + libcfs_nid2str(attr->fa_dst), rule->dr_drop_at); + } + } + +drop_matched: + + if (drop) { /* drop this message, update counters */ + if (hstatus) + lnet_fault_match_health(hstatus, + attr->u.drop.da_health_error_mask); + lnet_fault_stat_inc(&rule->dr_stat, type); + rule->dr_stat.u.drop.ds_dropped++; + } + + spin_unlock(&rule->dr_lock); + return drop; +} + +/** + * Check if message from \a src to \a dst can match any existed drop rule + */ +bool +lnet_drop_rule_match(struct lnet_hdr *hdr, + lnet_nid_t local_nid, + enum lnet_msg_hstatus *hstatus) +{ + lnet_nid_t src = lnet_nid_to_nid4(&hdr->src_nid); + lnet_nid_t dst = lnet_nid_to_nid4(&hdr->dest_nid); + unsigned int typ = hdr->type; + struct lnet_drop_rule *rule; + unsigned int ptl = -1; + bool drop = false; + int 
cpt; + + /* NB: if Portal is specified, then only PUT and GET will be + * filtered by drop rule */ + if (typ == LNET_MSG_PUT) + ptl = le32_to_cpu(hdr->msg.put.ptl_index); + else if (typ == LNET_MSG_GET) + ptl = le32_to_cpu(hdr->msg.get.ptl_index); + + cpt = lnet_net_lock_current(); + list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) { + drop = drop_rule_match(rule, src, local_nid, dst, typ, ptl, + hstatus); + if (drop) + break; + } + lnet_net_unlock(cpt); + + return drop; +} + +/** + * LNet Delay Simulation + */ +/** timestamp (second) to send delayed message */ +#define msg_delay_send msg_ev.hdr_data + +struct lnet_delay_rule { + /** link chain on the_lnet.ln_delay_rules */ + struct list_head dl_link; + /** link chain on delay_dd.dd_sched_rules */ + struct list_head dl_sched_link; + /** attributes of this rule */ + struct lnet_fault_attr dl_attr; + /** lock to protect \a below members */ + spinlock_t dl_lock; + /** refcount of delay rule */ + atomic_t dl_refcount; + /** + * the message sequence to delay, which means message is delayed when + * dl_stat.fs_count == dl_delay_at + */ + unsigned long dl_delay_at; + /** + * seconds to delay the next message, it's exclusive with dl_delay_at + */ + time64_t dl_delay_time; + /** baseline to caculate dl_delay_time */ + time64_t dl_time_base; + /** seconds until we send the next delayed message */ + time64_t dl_msg_send; + /** delayed message list */ + struct list_head dl_msg_list; + /** statistic of delayed messages */ + struct lnet_fault_stat dl_stat; + /** timer to wakeup delay_daemon */ + struct timer_list dl_timer; +}; + +struct delay_daemon_data { + /** serialise rule add/remove */ + struct mutex dd_mutex; + /** protect rules on \a dd_sched_rules */ + spinlock_t dd_lock; + /** scheduled delay rules (by timer) */ + struct list_head dd_sched_rules; + /** deamon thread sleeps at here */ + wait_queue_head_t dd_waitq; + /** controler (lctl command) wait at here */ + wait_queue_head_t dd_ctl_waitq; + /** deamon is running */ + unsigned int dd_running; + /** deamon stopped */ + unsigned int dd_stopped; +}; + +static struct delay_daemon_data delay_dd; + +static void +delay_rule_decref(struct lnet_delay_rule *rule) +{ + if (atomic_dec_and_test(&rule->dl_refcount)) { + LASSERT(list_empty(&rule->dl_sched_link)); + LASSERT(list_empty(&rule->dl_msg_list)); + LASSERT(list_empty(&rule->dl_link)); + + CFS_FREE_PTR(rule); + } +} + +/** + * check source/destination NID, portal, message type and delay rate, + * decide whether should delay this message or not + */ +static bool +delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, + lnet_nid_t dst, unsigned int type, unsigned int portal, + struct lnet_msg *msg) +{ + struct lnet_fault_attr *attr = &rule->dl_attr; + bool delay; + time64_t now = ktime_get_seconds(); + + if (!lnet_fault_attr_match(attr, src, LNET_NID_ANY, + dst, type, portal)) + return false; + + /* match this rule, check delay rate now */ + spin_lock(&rule->dl_lock); + if (rule->dl_delay_time != 0) { /* time based delay */ + rule->dl_stat.fs_count++; + delay = now >= rule->dl_delay_time; + if (delay) { + if (now > rule->dl_time_base) + rule->dl_time_base = now; + + rule->dl_delay_time = rule->dl_time_base + + get_random_u32_below(attr->u.delay.la_interval); + rule->dl_time_base += attr->u.delay.la_interval; + + CDEBUG(D_NET, "Delay Rule %s->%s: next delay : %lld\n", + libcfs_nid2str(attr->fa_src), + libcfs_nid2str(attr->fa_dst), + rule->dl_delay_time); + } + + } else { /* rate based delay */ + __u64 count; + + delay = 
rule->dl_stat.fs_count++ == rule->dl_delay_at; + /* generate the next random rate sequence */ + count = rule->dl_stat.fs_count; + if (do_div(count, attr->u.delay.la_rate) == 0) { + rule->dl_delay_at = rule->dl_stat.fs_count + + get_random_u32_below(attr->u.delay.la_rate); + CDEBUG(D_NET, "Delay Rule %s->%s: next delay: %lu\n", + libcfs_nid2str(attr->fa_src), + libcfs_nid2str(attr->fa_dst), rule->dl_delay_at); + } + } + + if (!delay) { + spin_unlock(&rule->dl_lock); + return false; + } + + /* delay this message, update counters */ + lnet_fault_stat_inc(&rule->dl_stat, type); + rule->dl_stat.u.delay.ls_delayed++; + + list_add_tail(&msg->msg_list, &rule->dl_msg_list); + msg->msg_delay_send = now + attr->u.delay.la_latency; + if (rule->dl_msg_send == -1) { + rule->dl_msg_send = msg->msg_delay_send; + mod_timer(&rule->dl_timer, + jiffies + cfs_time_seconds(attr->u.delay.la_latency)); + } + + spin_unlock(&rule->dl_lock); + return true; +} + +/** + * check if \a msg can match any Delay Rule, receiving of this message + * will be delayed if there is a match. + */ +bool +lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg) +{ + struct lnet_delay_rule *rule; + lnet_nid_t src = lnet_nid_to_nid4(&hdr->src_nid); + lnet_nid_t dst = lnet_nid_to_nid4(&hdr->dest_nid); + unsigned int typ = hdr->type; + unsigned int ptl = -1; + + /* NB: called with hold of lnet_net_lock */ + + /* NB: if Portal is specified, then only PUT and GET will be + * filtered by delay rule */ + if (typ == LNET_MSG_PUT) + ptl = le32_to_cpu(hdr->msg.put.ptl_index); + else if (typ == LNET_MSG_GET) + ptl = le32_to_cpu(hdr->msg.get.ptl_index); + + list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) { + if (delay_rule_match(rule, src, dst, typ, ptl, msg)) + return true; + } + + return false; +} + +/** check out delayed messages for send */ +static void +delayed_msg_check(struct lnet_delay_rule *rule, bool all, + struct list_head *msg_list) +{ + struct lnet_msg *msg; + struct lnet_msg *tmp; + time64_t now = ktime_get_seconds(); + + if (!all && rule->dl_msg_send > now) + return; + + spin_lock(&rule->dl_lock); + list_for_each_entry_safe(msg, tmp, &rule->dl_msg_list, msg_list) { + if (!all && msg->msg_delay_send > now) + break; + + msg->msg_delay_send = 0; + list_move_tail(&msg->msg_list, msg_list); + } + + if (list_empty(&rule->dl_msg_list)) { + del_timer(&rule->dl_timer); + rule->dl_msg_send = -1; + + } else if (!list_empty(msg_list)) { + /* dequeued some timedout messages, update timer for the + * next delayed message on rule */ + msg = list_entry(rule->dl_msg_list.next, + struct lnet_msg, msg_list); + rule->dl_msg_send = msg->msg_delay_send; + mod_timer(&rule->dl_timer, + jiffies + + cfs_time_seconds(msg->msg_delay_send - now)); + } + spin_unlock(&rule->dl_lock); +} + +static void +delayed_msg_process(struct list_head *msg_list, bool drop) +{ + struct lnet_msg *msg; + + while (!list_empty(msg_list)) { + struct lnet_ni *ni; + int cpt; + int rc; + + msg = list_entry(msg_list->next, struct lnet_msg, msg_list); + + if (msg->msg_sending) { + /* Delayed send */ + list_del_init(&msg->msg_list); + ni = msg->msg_txni; + CDEBUG(D_NET, "TRACE: msg %p %s -> %s : %s\n", msg, + libcfs_nidstr(&ni->ni_nid), + libcfs_nidstr(&msg->msg_txpeer->lpni_nid), + lnet_msgtyp2str(msg->msg_type)); + lnet_ni_send(ni, msg); + continue; + } + + /* Delayed receive */ + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_rxni != NULL); + + ni = msg->msg_rxni; + cpt = msg->msg_rx_cpt; + + list_del_init(&msg->msg_list); + if (drop) { + rc = 
-ECANCELED; + + } else if (!msg->msg_routing) { + rc = lnet_parse_local(ni, msg); + if (rc == 0) + continue; + + } else { + lnet_net_lock(cpt); + rc = lnet_parse_forward_locked(ni, msg); + lnet_net_unlock(cpt); + + switch (rc) { + case LNET_CREDIT_OK: + lnet_ni_recv(ni, msg->msg_private, msg, 0, + 0, msg->msg_len, msg->msg_len); + fallthrough; + case LNET_CREDIT_WAIT: + continue; + default: /* failures */ + break; + } + } + + lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len, + msg->msg_type); + lnet_finalize(msg, rc); + } +} + +/** + * Process delayed messages for scheduled rules + * This function can either be called by delay_rule_daemon, or by lnet_finalise + */ +void +lnet_delay_rule_check(void) +{ + struct lnet_delay_rule *rule; + LIST_HEAD(msgs); + + while (1) { + if (list_empty(&delay_dd.dd_sched_rules)) + break; + + spin_lock_bh(&delay_dd.dd_lock); + if (list_empty(&delay_dd.dd_sched_rules)) { + spin_unlock_bh(&delay_dd.dd_lock); + break; + } + + rule = list_entry(delay_dd.dd_sched_rules.next, + struct lnet_delay_rule, dl_sched_link); + list_del_init(&rule->dl_sched_link); + spin_unlock_bh(&delay_dd.dd_lock); + + delayed_msg_check(rule, false, &msgs); + delay_rule_decref(rule); /* -1 for delay_dd.dd_sched_rules */ + } + + if (!list_empty(&msgs)) + delayed_msg_process(&msgs, false); +} + +/** deamon thread to handle delayed messages */ +static int +lnet_delay_rule_daemon(void *arg) +{ + delay_dd.dd_running = 1; + wake_up(&delay_dd.dd_ctl_waitq); + + while (delay_dd.dd_running) { + wait_event_interruptible(delay_dd.dd_waitq, + !delay_dd.dd_running || + !list_empty(&delay_dd.dd_sched_rules)); + lnet_delay_rule_check(); + } + + /* in case more rules have been enqueued after my last check */ + lnet_delay_rule_check(); + delay_dd.dd_stopped = 1; + wake_up(&delay_dd.dd_ctl_waitq); + + return 0; +} + +static void +delay_timer_cb(cfs_timer_cb_arg_t data) +{ + struct lnet_delay_rule *rule = cfs_from_timer(rule, data, dl_timer); + + spin_lock_bh(&delay_dd.dd_lock); + if (list_empty(&rule->dl_sched_link) && delay_dd.dd_running) { + atomic_inc(&rule->dl_refcount); + list_add_tail(&rule->dl_sched_link, &delay_dd.dd_sched_rules); + wake_up(&delay_dd.dd_waitq); + } + spin_unlock_bh(&delay_dd.dd_lock); +} + +/** + * Add a new delay rule to LNet + * There is no check for duplicated delay rule, all rules will be checked for + * incoming message. + */ +int +lnet_delay_rule_add(struct lnet_fault_attr *attr) +{ + struct lnet_delay_rule *rule; + int rc = 0; + ENTRY; + + if (!((attr->u.delay.la_rate == 0) ^ + (attr->u.delay.la_interval == 0))) { + CDEBUG(D_NET, + "please provide either delay rate or delay interval, " + "but not both at the same time %d/%d\n", + attr->u.delay.la_rate, attr->u.delay.la_interval); + RETURN(-EINVAL); + } + + if (attr->u.delay.la_latency == 0) { + CDEBUG(D_NET, "delay latency cannot be zero\n"); + RETURN(-EINVAL); + } + + if (lnet_fault_attr_validate(attr) != 0) + RETURN(-EINVAL); + + CFS_ALLOC_PTR(rule); + if (rule == NULL) + RETURN(-ENOMEM); + + mutex_lock(&delay_dd.dd_mutex); + if (!delay_dd.dd_running) { + struct task_struct *task; + + /* NB: although LND threads will process delayed message + * in lnet_finalize, but there is no guarantee that LND + * threads will be waken up if no other message needs to + * be handled. + * Only one daemon thread, performance is not the concern + * of this simualation module. 
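lnet_delay_rule_add() applies the same rate/interval exclusivity check as the drop rules and additionally requires a non-zero la_latency. A time-based counterpart to the drop-rule sketch above, again illustrative only:

/* about once per 60-second window, hold a matched PUT for 5 seconds */
struct lnet_fault_attr attr = {
	.fa_src			= LNET_NID_ANY,
	.fa_dst			= LNET_NID_ANY,
	.fa_local_nid		= LNET_NID_ANY,
	.fa_msg_mask		= LNET_PUT_BIT,
	.u.delay.la_rate	= 0,		/* interval based ... */
	.u.delay.la_interval	= 60,		/* ... one delay window per minute */
	.u.delay.la_latency	= 5,		/* seconds each message is queued */
};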
+ */ + task = kthread_run(lnet_delay_rule_daemon, NULL, "lnet_dd"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + GOTO(failed, rc); + } + wait_event(delay_dd.dd_ctl_waitq, delay_dd.dd_running); + } + + cfs_timer_setup(&rule->dl_timer, delay_timer_cb, + (unsigned long)rule, 0); + + spin_lock_init(&rule->dl_lock); + INIT_LIST_HEAD(&rule->dl_msg_list); + INIT_LIST_HEAD(&rule->dl_sched_link); + + rule->dl_attr = *attr; + if (attr->u.delay.la_interval != 0) { + rule->dl_time_base = ktime_get_seconds() + + attr->u.delay.la_interval; + rule->dl_delay_time = ktime_get_seconds() + + get_random_u32_below(attr->u.delay.la_interval); + } else { + rule->dl_delay_at = get_random_u32_below(attr->u.delay.la_rate); + } + + rule->dl_msg_send = -1; + + lnet_net_lock(LNET_LOCK_EX); + atomic_set(&rule->dl_refcount, 1); + list_add(&rule->dl_link, &the_lnet.ln_delay_rules); + lnet_net_unlock(LNET_LOCK_EX); + + CDEBUG(D_NET, "Added delay rule: src %s, dst %s, rate %d\n", + libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_src), + attr->u.delay.la_rate); + + mutex_unlock(&delay_dd.dd_mutex); + RETURN(0); + failed: + mutex_unlock(&delay_dd.dd_mutex); + CFS_FREE_PTR(rule); + return rc; +} + +/** + * Remove matched Delay Rules from lnet, if \a shutdown is true or both \a src + * and \a dst are zero, all rules will be removed, otherwise only matched rules + * will be removed. + * If \a src is zero, then all rules have \a dst as destination will be remove + * If \a dst is zero, then all rules have \a src as source will be removed + * + * When a delay rule is removed, all delayed messages of this rule will be + * processed immediately. + */ +int +lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown) +{ + struct lnet_delay_rule *rule; + struct lnet_delay_rule *tmp; + LIST_HEAD(rule_list); + LIST_HEAD(msg_list); + int n = 0; + bool cleanup; + ENTRY; + + if (shutdown) + src = dst = 0; + + mutex_lock(&delay_dd.dd_mutex); + lnet_net_lock(LNET_LOCK_EX); + + list_for_each_entry_safe(rule, tmp, &the_lnet.ln_delay_rules, dl_link) { + if (rule->dl_attr.fa_src != src && src != 0) + continue; + + if (rule->dl_attr.fa_dst != dst && dst != 0) + continue; + + CDEBUG(D_NET, "Remove delay rule: src %s->dst: %s (1/%d, %d)\n", + libcfs_nid2str(rule->dl_attr.fa_src), + libcfs_nid2str(rule->dl_attr.fa_dst), + rule->dl_attr.u.delay.la_rate, + rule->dl_attr.u.delay.la_interval); + /* refcount is taken over by rule_list */ + list_move(&rule->dl_link, &rule_list); + } + + /* check if we need to shutdown delay_daemon */ + cleanup = list_empty(&the_lnet.ln_delay_rules) && + !list_empty(&rule_list); + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry_safe(rule, tmp, &rule_list, dl_link) { + list_del_init(&rule->dl_link); + + del_timer_sync(&rule->dl_timer); + delayed_msg_check(rule, true, &msg_list); + delay_rule_decref(rule); /* -1 for the_lnet.ln_delay_rules */ + n++; + } + + if (cleanup) { /* no more delay rule, shutdown delay_daemon */ + LASSERT(delay_dd.dd_running); + delay_dd.dd_running = 0; + wake_up(&delay_dd.dd_waitq); + + while (!delay_dd.dd_stopped) + wait_event(delay_dd.dd_ctl_waitq, delay_dd.dd_stopped); + } + mutex_unlock(&delay_dd.dd_mutex); + + if (!list_empty(&msg_list)) + delayed_msg_process(&msg_list, shutdown); + + RETURN(n); +} + +/** + * List Delay Rule at position of \a pos + */ +int +lnet_delay_rule_list(int pos, struct lnet_fault_attr *attr, + struct lnet_fault_stat *stat) +{ + struct lnet_delay_rule *rule; + int cpt; + int i = 0; + int rc = -ENOENT; + ENTRY; + + cpt = lnet_net_lock_current(); + 
list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) { + if (i++ < pos) + continue; + + spin_lock(&rule->dl_lock); + *attr = rule->dl_attr; + *stat = rule->dl_stat; + spin_unlock(&rule->dl_lock); + rc = 0; + break; + } + + lnet_net_unlock(cpt); + RETURN(rc); +} + +/** + * reset counters for all Delay Rules + */ +void +lnet_delay_rule_reset(void) +{ + struct lnet_delay_rule *rule; + int cpt; + ENTRY; + + cpt = lnet_net_lock_current(); + + list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) { + struct lnet_fault_attr *attr = &rule->dl_attr; + + spin_lock(&rule->dl_lock); + + memset(&rule->dl_stat, 0, sizeof(rule->dl_stat)); + if (attr->u.delay.la_rate != 0) { + rule->dl_delay_at = get_random_u32_below(attr->u.delay.la_rate); + } else { + rule->dl_delay_time = ktime_get_seconds() + + get_random_u32_below(attr->u.delay.la_interval); + rule->dl_time_base = ktime_get_seconds() + + attr->u.delay.la_interval; + } + spin_unlock(&rule->dl_lock); + } + + lnet_net_unlock(cpt); + EXIT; +} + +int +lnet_fault_ctl(int opc, struct libcfs_ioctl_data *data) +{ + struct lnet_fault_attr *attr; + struct lnet_fault_stat *stat; + + attr = (struct lnet_fault_attr *)data->ioc_inlbuf1; + + switch (opc) { + default: + return -EINVAL; + + case LNET_CTL_DROP_ADD: + if (attr == NULL) + return -EINVAL; + + return lnet_drop_rule_add(attr); + + case LNET_CTL_DROP_DEL: + if (attr == NULL) + return -EINVAL; + + data->ioc_count = lnet_drop_rule_del(attr->fa_src, + attr->fa_dst); + return 0; + + case LNET_CTL_DROP_RESET: + lnet_drop_rule_reset(); + return 0; + + case LNET_CTL_DROP_LIST: + stat = (struct lnet_fault_stat *)data->ioc_inlbuf2; + if (attr == NULL || stat == NULL) + return -EINVAL; + + return lnet_drop_rule_list(data->ioc_count, attr, stat); + + case LNET_CTL_DELAY_ADD: + if (attr == NULL) + return -EINVAL; + + return lnet_delay_rule_add(attr); + + case LNET_CTL_DELAY_DEL: + if (attr == NULL) + return -EINVAL; + + data->ioc_count = lnet_delay_rule_del(attr->fa_src, + attr->fa_dst, false); + return 0; + + case LNET_CTL_DELAY_RESET: + lnet_delay_rule_reset(); + return 0; + + case LNET_CTL_DELAY_LIST: + stat = (struct lnet_fault_stat *)data->ioc_inlbuf2; + if (attr == NULL || stat == NULL) + return -EINVAL; + + return lnet_delay_rule_list(data->ioc_count, attr, stat); + } +} + +int +lnet_fault_init(void) +{ + BUILD_BUG_ON(LNET_PUT_BIT != BIT(LNET_MSG_PUT)); + BUILD_BUG_ON(LNET_ACK_BIT != BIT(LNET_MSG_ACK)); + BUILD_BUG_ON(LNET_GET_BIT != BIT(LNET_MSG_GET)); + BUILD_BUG_ON(LNET_REPLY_BIT != BIT(LNET_MSG_REPLY)); + + mutex_init(&delay_dd.dd_mutex); + spin_lock_init(&delay_dd.dd_lock); + init_waitqueue_head(&delay_dd.dd_waitq); + init_waitqueue_head(&delay_dd.dd_ctl_waitq); + INIT_LIST_HEAD(&delay_dd.dd_sched_rules); + + return 0; +} + +void +lnet_fault_fini(void) +{ + lnet_drop_rule_del(0, 0); + lnet_delay_rule_del(0, 0, true); + + LASSERT(list_empty(&the_lnet.ln_drop_rules)); + LASSERT(list_empty(&the_lnet.ln_delay_rules)); + LASSERT(list_empty(&delay_dd.dd_sched_rules)); +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c b/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c new file mode 100644 index 0000000000000..16e16f6360adb --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c @@ -0,0 +1,1190 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
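All of the fault-injection machinery is driven through lnet_fault_ctl(): the LIST operations take a rule position in ioc_count and return -ENOENT once it runs past the last rule, so enumerating rules is a simple loop. A hypothetical in-kernel sketch of that pattern (the real caller is the ioctl path, where attr and stat live in the ioctl buffers; example_dump_drop_rules() is not part of the patch):

static void example_dump_drop_rules(void)
{
	struct lnet_fault_attr attr;
	struct lnet_fault_stat stat;
	struct libcfs_ioctl_data data = {
		.ioc_inlbuf1 = (char *)&attr,
		.ioc_inlbuf2 = (char *)&stat,
	};
	int pos;

	for (pos = 0; ; pos++) {
		data.ioc_count = pos;
		if (lnet_fault_ctl(LNET_CTL_DROP_LIST, &data) != 0)
			break;	/* -ENOENT: no rule at this position */
		CDEBUG(D_NET, "drop rule %d: dropped %llu of %llu matched\n",
		       pos, (unsigned long long)stat.u.drop.ds_dropped,
		       (unsigned long long)stat.fs_count);
	}
}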
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/lnet/nidstrings.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include + +/* max value for numeric network address */ +#define MAX_NUMERIC_VALUE 0xffffffff + +#define IPSTRING_LENGTH 16 + +/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids + * consistent in all conversion functions. Some code fragments are copied + * around for the sake of clarity... + */ + +/* CAVEAT EMPTOR! Racey temporary buffer allocation! + * Choose the number of nidstrings to support the MAXIMUM expected number of + * concurrent users. If there are more, the returned string will be volatile. + * NB this number must allow for a process to be descheduled for a timeslice + * between getting its string and using it. + */ + +static char libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE]; +static int libcfs_nidstring_idx; + +static DEFINE_SPINLOCK(libcfs_nidstring_lock); + +static struct netstrfns *libcfs_namenum2netstrfns(const char *name); + +char * +libcfs_next_nidstring(void) +{ + char *str; + unsigned long flags; + + spin_lock_irqsave(&libcfs_nidstring_lock, flags); + + str = libcfs_nidstrings[libcfs_nidstring_idx++]; + if (libcfs_nidstring_idx == ARRAY_SIZE(libcfs_nidstrings)) + libcfs_nidstring_idx = 0; + + spin_unlock_irqrestore(&libcfs_nidstring_lock, flags); + return str; +} +EXPORT_SYMBOL(libcfs_next_nidstring); + +/** + * Nid range list syntax. + * \verbatim + * + * :== [ ' ' ] + * :== '@' + * :== '*' | + * | + * + * :== ... + * + * :== | + * + * :== '[' [ ',' ] ']' + * :== | + * '-' | + * '-' '/' + * :== | + * :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" | + * "vib" | "ra" | "elan" | "mx" | "ptl" + * \endverbatim + */ + +/** + * Structure to represent \ token of the syntax. + * + * One of this is created for each \ parsed. + */ +struct nidrange { + /** + * Link to list of this structures which is built on nid range + * list parsing. + */ + struct list_head nr_link; + /** + * List head for addrrange::ar_link. + */ + struct list_head nr_addrranges; + /** + * Flag indicating that *@ is found. + */ + int nr_all; + /** + * Pointer to corresponding element of libcfs_netstrfns. + */ + struct netstrfns *nr_netstrfns; + /** + * Number of network. E.g. 5 if \ is "elan5". + */ + int nr_netnum; +}; + +/** + * Structure to represent \ token of the syntax. + */ +struct addrrange { + /** + * Link to nidrange::nr_addrranges. + */ + struct list_head ar_link; + /** + * List head for cfs_expr_list::el_list. 
+ */ + struct list_head ar_numaddr_ranges; +}; + +/** + * Parses \ token on the syntax. + * + * Allocates struct addrrange and links to \a nidrange via + * (nidrange::nr_addrranges) + * + * \retval 0 if \a src parses to '*' | \ | \ + * \retval -errno otherwise + */ +static int +parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange) +{ + struct addrrange *addrrange; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + nidrange->nr_all = 1; + return 0; + } + + CFS_ALLOC_PTR(addrrange); + if (addrrange == NULL) + return -ENOMEM; + list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges); + INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges); + + return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str, + src->ls_len, + &addrrange->ar_numaddr_ranges); +} + +/** + * Finds or creates struct nidrange. + * + * Checks if \a src is a valid network name, looks for corresponding + * nidrange on the ist of nidranges (\a nidlist), creates new struct + * nidrange if it is not found. + * + * \retval pointer to struct nidrange matching network specified via \a src + * \retval NULL if \a src does not match any network + */ +static struct nidrange * +add_nidrange(const struct cfs_lstr *src, + struct list_head *nidlist) +{ + struct netstrfns *nf; + struct nidrange *nr; + int endlen; + unsigned netnum; + + if (src->ls_len >= LNET_NIDSTR_SIZE) + return NULL; + + nf = libcfs_namenum2netstrfns(src->ls_str); + if (nf == NULL) + return NULL; + endlen = src->ls_len - strlen(nf->nf_name); + if (endlen == 0) + /* network name only, e.g. "elan" or "tcp" */ + netnum = 0; + else { + /* e.g. "elan25" or "tcp23", refuse to parse if + * network name is not appended with decimal or + * hexadecimal number */ + if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name), + endlen, &netnum, 0, MAX_NUMERIC_VALUE)) + return NULL; + } + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns != nf) + continue; + if (nr->nr_netnum != netnum) + continue; + return nr; + } + + CFS_ALLOC_PTR(nr); + if (nr == NULL) + return NULL; + list_add_tail(&nr->nr_link, nidlist); + INIT_LIST_HEAD(&nr->nr_addrranges); + nr->nr_netstrfns = nf; + nr->nr_all = 0; + nr->nr_netnum = netnum; + + return nr; +} + +/** + * Parses \ token of the syntax. + * + * \retval 1 if \a src parses to \ '@' \ + * \retval 0 otherwise + */ +static int +parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist) +{ + struct cfs_lstr addrrange; + struct cfs_lstr net; + struct nidrange *nr; + + if (cfs_gettok(src, '@', &addrrange) == 0) + goto failed; + + if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL) + goto failed; + + nr = add_nidrange(&net, nidlist); + if (nr == NULL) + goto failed; + + if (parse_addrange(&addrrange, nr) != 0) + goto failed; + + return 1; +failed: + return 0; +} + +/** + * Frees addrrange structures of \a list. + * + * For each struct addrrange structure found on \a list it frees + * cfs_expr_list list attached to it and frees the addrrange itself. + * + * \retval none + */ +static void +free_addrranges(struct list_head *list) +{ + while (!list_empty(list)) { + struct addrrange *ar; + + ar = list_entry(list->next, struct addrrange, ar_link); + + cfs_expr_list_free_list(&ar->ar_numaddr_ranges); + list_del(&ar->ar_link); + CFS_FREE_PTR(ar); + } +} + +/** + * Frees nidrange strutures of \a list. + * + * For each struct nidrange structure found on \a list it frees + * addrrange list attached to it and frees the nidrange itself. 
+ * + * \retval none + */ +void +cfs_free_nidlist(struct list_head *list) +{ + struct list_head *pos, *next; + struct nidrange *nr; + + list_for_each_safe(pos, next, list) { + nr = list_entry(pos, struct nidrange, nr_link); + free_addrranges(&nr->nr_addrranges); + list_del(pos); + CFS_FREE_PTR(nr); + } +} +EXPORT_SYMBOL(cfs_free_nidlist); + +/** + * Parses nid range list. + * + * Parses with rigorous syntax and overflow checking \a str into + * \ [ ' ' \ ], compiles \a str into set of + * structures and links that structure to \a nidlist. The resulting + * list can be used to match a NID againts set of NIDS defined by \a + * str. + * \see cfs_match_nid + * + * \retval 1 on success + * \retval 0 otherwise + */ +int +cfs_parse_nidlist(char *str, int len, struct list_head *nidlist) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(nidlist); + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + rc = parse_nidrange(&res, nidlist); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + } + return 1; +} +EXPORT_SYMBOL(cfs_parse_nidlist); + +/** + * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist). + * + * \see cfs_parse_nidlist() + * + * \retval 1 on match + * \retval 0 otherwises + */ +int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist) +{ + struct nidrange *nr; + struct addrrange *ar; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid))) + continue; + if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid))) + continue; + if (nr->nr_all) + return 1; + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) + if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid), + &ar->ar_numaddr_ranges)) + return 1; + } + return 0; +} +EXPORT_SYMBOL(cfs_match_nid); + +/** + * Print the network part of the nidrange \a nr into the specified \a buffer. + * + * \retval number of characters written + */ +static int +cfs_print_network(char *buffer, int count, struct nidrange *nr) +{ + struct netstrfns *nf = nr->nr_netstrfns; + + if (nr->nr_netnum == 0) + return scnprintf(buffer, count, "@%s", nf->nf_name); + else + return scnprintf(buffer, count, "@%s%u", + nf->nf_name, nr->nr_netnum); +} + +/** + * Print a list of addrrange (\a addrranges) into the specified \a buffer. + * At max \a count characters can be printed into \a buffer. + * + * \retval number of characters written + */ +static int +cfs_print_addrranges(char *buffer, int count, struct list_head *addrranges, + struct nidrange *nr) +{ + int i = 0; + struct addrrange *ar; + struct netstrfns *nf = nr->nr_netstrfns; + + list_for_each_entry(ar, addrranges, ar_link) { + if (i != 0) + i += scnprintf(buffer + i, count - i, " "); + i += nf->nf_print_addrlist(buffer + i, count - i, + &ar->ar_numaddr_ranges); + i += cfs_print_network(buffer + i, count - i, nr); + } + return i; +} + +/** + * Print a list of nidranges (\a nidlist) into the specified \a buffer. + * At max \a count characters can be printed into \a buffer. + * Nidranges are separated by a space character. 
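A minimal usage sketch, not part of the patch, tying together the nidlist API in this file (parse, match, print, free). The range string, buffer size, and error handling are illustrative only.

    static void nidlist_demo(char *spec, lnet_nid_t nid)
    {
    	LIST_HEAD(nidlist);
    	char buf[256] = "";

    	/* cfs_parse_nidlist() returns 1 on success, 0 on a parse error */
    	if (!cfs_parse_nidlist(spec, strlen(spec), &nidlist))
    		return;

    	if (cfs_match_nid(nid, &nidlist))
    		CDEBUG(D_NET, "%s matches '%s'\n",
    		       libcfs_nid2str(nid), spec);

    	/* print the canonical form back, then release the list */
    	cfs_print_nidlist(buf, sizeof(buf), &nidlist);
    	CDEBUG(D_NET, "canonical: %s\n", buf);
    	cfs_free_nidlist(&nidlist);
    }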
+ * + * \retval number of characters written + */ +int cfs_print_nidlist(char *buffer, int count, struct list_head *nidlist) +{ + int i = 0; + struct nidrange *nr; + + if (count <= 0) + return 0; + + list_for_each_entry(nr, nidlist, nr_link) { + if (i != 0) + i += scnprintf(buffer + i, count - i, " "); + + if (nr->nr_all != 0) { + LASSERT(list_empty(&nr->nr_addrranges)); + i += scnprintf(buffer + i, count - i, "*"); + i += cfs_print_network(buffer + i, count - i, nr); + } else { + i += cfs_print_addrranges(buffer + i, count - i, + &nr->nr_addrranges, nr); + } + } + return i; +} +EXPORT_SYMBOL(cfs_print_nidlist); + +static int +libcfs_lo_str2addr(const char *str, int nob, __u32 *addr) +{ + *addr = 0; + return 1; +} + +static void +libcfs_ip_addr2str(__u32 addr, char *str, size_t size) +{ + snprintf(str, size, "%u.%u.%u.%u", + (addr >> 24) & 0xff, (addr >> 16) & 0xff, + (addr >> 8) & 0xff, addr & 0xff); +} + +static void +libcfs_ip_addr2str_size(const __be32 *addr, size_t asize, + char *str, size_t size) +{ + struct sockaddr_storage sa = {}; + + switch (asize) { + case 4: + sa.ss_family = AF_INET; + memcpy(&((struct sockaddr_in *)(&sa))->sin_addr.s_addr, + addr, asize); + break; + case 16: + sa.ss_family = AF_INET6; + memcpy(&((struct sockaddr_in6 *)(&sa))->sin6_addr.s6_addr, + addr, asize); + break; + default: + return; + } + + rpc_ntop((struct sockaddr *)&sa, str, size); +} + +/* CAVEAT EMPTOR XscanfX + * I use "%n" at the end of a sscanf format to detect trailing junk. However + * sscanf may return immediately if it sees the terminating '0' in a string, so + * I initialise the %n variable to the expected length. If sscanf sets it; + * fine, if it doesn't, then the scan ended at the end of the string, which is + * fine too :) */ +static int +libcfs_ip_str2addr(const char *str, int nob, __u32 *addr) +{ + unsigned int a; + unsigned int b; + unsigned int c; + unsigned int d; + int n = nob; /* XscanfX */ + + /* numeric IP? */ + if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 && + n == nob && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) { + *addr = ((a<<24)|(b<<16)|(c<<8)|d); + return 1; + } + return 0; +} + +static int +libcfs_ip_str2addr_size(const char *str, int nob, + __be32 *addr, size_t *alen) +{ + struct sockaddr_storage sa; + + /* Note: 'net' arg to rpc_pton is only needed for link-local + * addresses. Such addresses would not work with LNet routing, + * so we can assume they aren't used. So it doesn't matter + * which net namespace is passed. 
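A sketch of the IPv4 packing that libcfs_ip_addr2str() and libcfs_ip_str2addr() agree on. Both helpers are static, so a check like this would have to live in this file; the address 10.1.2.3 (0x0a010203) is just an example.

    static void ip_roundtrip_check(void)
    {
    	char str[IPSTRING_LENGTH];
    	__u32 addr;

    	/* 10.1.2.3 packs as (10 << 24) | (1 << 16) | (2 << 8) | 3 */
    	libcfs_ip_addr2str(0x0a010203, str, sizeof(str));

    	if (libcfs_ip_str2addr(str, strlen(str), &addr))
    		LASSERT(addr == 0x0a010203);
    }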
+ */ + if (rpc_pton(&init_net, str, nob, + (struct sockaddr *)&sa, sizeof(sa)) == 0) + return 0; + if (sa.ss_family == AF_INET6) { + memcpy(addr, + &((struct sockaddr_in6 *)(&sa))->sin6_addr.s6_addr, + 16); + *alen = 16; + return 1; + } + if (sa.ss_family == AF_INET) { + memcpy(addr, + &((struct sockaddr_in *)(&sa))->sin_addr.s_addr, + 4); + *alen = 4; + return 1; + } + return 0; +} + + +/* Used by lnet/config.c so it can't be static */ +int +cfs_ip_addr_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + struct cfs_lstr src; + int rc; + int i; + + src.ls_str = str; + src.ls_len = len; + i = 0; + + while (src.ls_str != NULL) { + struct cfs_lstr res; + + if (!cfs_gettok(&src, '.', &res)) { + rc = -EINVAL; + goto out; + } + + rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el); + if (rc != 0) + goto out; + + list_add_tail(&el->el_link, list); + i++; + } + + if (i == 4) + return 0; + + rc = -EINVAL; +out: + cfs_expr_list_free_list(list); + + return rc; +} + +static int +libcfs_ip_addr_range_print(char *buffer, int count, struct list_head *list) +{ + int i = 0, j = 0; + struct cfs_expr_list *el; + + list_for_each_entry(el, list, el_link) { + LASSERT(j++ < 4); + if (i != 0) + i += scnprintf(buffer + i, count - i, "."); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +/** + * Matches address (\a addr) against address set encoded in \a list. + * + * \retval 1 if \a addr matches + * \retval 0 otherwise + */ +int +cfs_ip_addr_match(__u32 addr, struct list_head *list) +{ + struct cfs_expr_list *el; + int i = 0; + + list_for_each_entry_reverse(el, list, el_link) { + if (!cfs_expr_list_match(addr & 0xff, el)) + return 0; + addr >>= 8; + i++; + } + + return i == 4; +} + +/** + * Print the network part of the nidrange \a nr into the specified \a buffer. + * + * \retval number of characters written + */ +static void +libcfs_decnum_addr2str(__u32 addr, char *str, size_t size) +{ + snprintf(str, size, "%u", addr); +} + +static int +libcfs_num_str2addr(const char *str, int nob, __u32 *addr) +{ + int n; + + n = nob; + if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob) + return 1; + + return 0; +} + +/** + * Nf_parse_addrlist method for networks using numeric addresses. + * + * Examples of such networks are gm and elan. 
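A sketch, not part of the patch, of how an IP address range is first parsed into four per-octet expression lists by cfs_ip_addr_parse() and then matched octet by octet by cfs_ip_addr_match(). The range and the test address are illustrative.

    static bool ip_range_demo(void)
    {
    	LIST_HEAD(exprs);
    	char range[] = "10.10.[1-3].[0-255]";
    	bool hit = false;

    	/* cfs_ip_addr_parse() returns 0 and four expr lists on success */
    	if (cfs_ip_addr_parse(range, strlen(range), &exprs) == 0) {
    		hit = cfs_ip_addr_match(0x0a0a0207, &exprs);	/* 10.10.2.7 */
    		cfs_expr_list_free_list(&exprs);
    	}
    	return hit;
    }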
+ * + * \retval 0 if \a str parsed to numeric address + * \retval errno otherwise + */ +int +libcfs_num_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + int rc; + + rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el); + if (rc == 0) + list_add_tail(&el->el_link, list); + + return rc; +} + +static int +libcfs_num_addr_range_print(char *buffer, int count, struct list_head *list) +{ + int i = 0, j = 0; + struct cfs_expr_list *el; + + list_for_each_entry(el, list, el_link) { + LASSERT(j++ < 1); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +/* + * Nf_match_addr method for networks using numeric addresses + * + * \retval 1 on match + * \retval 0 otherwise + */ +static int +libcfs_num_match(__u32 addr, struct list_head *numaddr) +{ + struct cfs_expr_list *el; + + LASSERT(!list_empty(numaddr)); + el = list_entry(numaddr->next, struct cfs_expr_list, el_link); + + return cfs_expr_list_match(addr, el); +} + +static struct netstrfns libcfs_netstrfns[] = { + { .nf_type = LOLND, + .nf_name = "lo", + .nf_modname = "klolnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_lo_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match + }, + { .nf_type = SOCKLND, + .nf_name = "tcp", + .nf_modname = "ksocklnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_addr2str_size = libcfs_ip_addr2str_size, + .nf_str2addr = libcfs_ip_str2addr, + .nf_str2addr_size = libcfs_ip_str2addr_size, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match + }, + { .nf_type = O2IBLND, + .nf_name = "o2ib", + .nf_modname = "ko2iblnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match + }, + { .nf_type = GNILND, + .nf_name = "gni", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match + }, + { .nf_type = GNIIPLND, + .nf_name = "gip", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match + }, + { .nf_type = PTL4LND, + .nf_name = "ptlf", + .nf_modname = "kptl4lnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match + }, + { + .nf_type = KFILND, + .nf_name = "kfi", + .nf_modname = "kkfilnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match + }, +}; + +static const size_t libcfs_nnetstrfns = ARRAY_SIZE(libcfs_netstrfns); + +static struct netstrfns * +type2net_info(__u32 net_type) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + if (libcfs_netstrfns[i].nf_type == net_type) + return &libcfs_netstrfns[i]; + } + + return NULL; +} + +int +cfs_match_net(__u32 net_id, __u32 net_type, struct list_head *net_num_list) +{ + __u32 net_num; + + if (!net_num_list) + return 0; + + if 
(net_type != LNET_NETTYP(net_id)) + return 0; + + net_num = LNET_NETNUM(net_id); + + /* if there is a net number but the list passed in is empty, then + * there is no match. + */ + if (!net_num && list_empty(net_num_list)) + return 1; + else if (list_empty(net_num_list)) + return 0; + + if (!libcfs_num_match(net_num, net_num_list)) + return 0; + + return 1; +} + +int +cfs_match_nid_net(struct lnet_nid *nid, __u32 net_type, + struct list_head *net_num_list, + struct list_head *addr) +{ + __u32 address; + struct netstrfns *nf; + + if (!addr || !net_num_list) + return 0; + + nf = type2net_info(LNET_NETTYP(LNET_NID_NET(nid))); + if (!nf || !net_num_list || !addr) + return 0; + + /* FIXME handle long-addr nid */ + address = LNET_NIDADDR(lnet_nid_to_nid4(nid)); + + /* if either the address or net number don't match then no match */ + if (!nf->nf_match_addr(address, addr) || + !cfs_match_net(LNET_NID_NET(nid), net_type, net_num_list)) + return 0; + + return 1; +} +EXPORT_SYMBOL(cfs_match_nid_net); + +static struct netstrfns * +libcfs_lnd2netstrfns(__u32 lnd) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (lnd == libcfs_netstrfns[i].nf_type) + return &libcfs_netstrfns[i]; + + return NULL; +} + +static struct netstrfns * +libcfs_namenum2netstrfns(const char *name) +{ + struct netstrfns *nf; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(name, nf->nf_name, strlen(nf->nf_name))) + return nf; + } + return NULL; +} + +static struct netstrfns * +libcfs_name2netstrfns(const char *name) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (!strcmp(libcfs_netstrfns[i].nf_name, name)) + return &libcfs_netstrfns[i]; + + return NULL; +} + +int +libcfs_isknown_lnd(__u32 lnd) +{ + return libcfs_lnd2netstrfns(lnd) != NULL; +} +EXPORT_SYMBOL(libcfs_isknown_lnd); + +char * +libcfs_lnd2modname(__u32 lnd) +{ + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + return (nf == NULL) ? 
NULL : nf->nf_modname; +} +EXPORT_SYMBOL(libcfs_lnd2modname); + +int +libcfs_str2lnd(const char *str) +{ + struct netstrfns *nf = libcfs_name2netstrfns(str); + + if (nf != NULL) + return nf->nf_type; + + return -ENXIO; +} +EXPORT_SYMBOL(libcfs_str2lnd); + +char * +libcfs_lnd2str_r(__u32 lnd, char *buf, size_t buf_size) +{ + struct netstrfns *nf; + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) + snprintf(buf, buf_size, "?%u?", lnd); + else + snprintf(buf, buf_size, "%s", nf->nf_name); + + return buf; +} +EXPORT_SYMBOL(libcfs_lnd2str_r); + +char * +libcfs_net2str_r(__u32 net, char *buf, size_t buf_size) +{ + __u32 nnum = LNET_NETNUM(net); + __u32 lnd = LNET_NETTYP(net); + struct netstrfns *nf; + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) + snprintf(buf, buf_size, "<%u:%u>", lnd, nnum); + else if (nnum == 0) + snprintf(buf, buf_size, "%s", nf->nf_name); + else + snprintf(buf, buf_size, "%s%u", nf->nf_name, nnum); + + return buf; +} +EXPORT_SYMBOL(libcfs_net2str_r); + +char * +libcfs_nid2str_r(lnet_nid_t nid, char *buf, size_t buf_size) +{ + __u32 addr = LNET_NIDADDR(nid); + __u32 net = LNET_NIDNET(nid); + __u32 nnum = LNET_NETNUM(net); + __u32 lnd = LNET_NETTYP(net); + struct netstrfns *nf; + + if (nid == LNET_NID_ANY) { + strncpy(buf, "", buf_size); + buf[buf_size - 1] = '\0'; + return buf; + } + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) { + snprintf(buf, buf_size, "%x@<%u:%u>", addr, lnd, nnum); + } else { + size_t addr_len; + + nf->nf_addr2str(addr, buf, buf_size); + addr_len = strlen(buf); + if (nnum == 0) + snprintf(buf + addr_len, buf_size - addr_len, "@%s", + nf->nf_name); + else + snprintf(buf + addr_len, buf_size - addr_len, "@%s%u", + nf->nf_name, nnum); + } + + return buf; +} +EXPORT_SYMBOL(libcfs_nid2str_r); + +char * +libcfs_nidstr_r(const struct lnet_nid *nid, char *buf, size_t buf_size) +{ + __u32 nnum; + __u32 lnd; + struct netstrfns *nf; + + if (LNET_NID_IS_ANY(nid)) { + strncpy(buf, "", buf_size); + buf[buf_size - 1] = '\0'; + return buf; + } + + nnum = be16_to_cpu(nid->nid_num); + lnd = nid->nid_type; + nf = libcfs_lnd2netstrfns(lnd); + if (nf) { + size_t addr_len; + + if (nf->nf_addr2str_size) + nf->nf_addr2str_size(nid->nid_addr, NID_ADDR_BYTES(nid), + buf, buf_size); + else + nf->nf_addr2str(ntohl(nid->nid_addr[0]), buf, buf_size); + addr_len = strlen(buf); + if (nnum == 0) + snprintf(buf + addr_len, buf_size - addr_len, "@%s", + nf->nf_name); + else + snprintf(buf + addr_len, buf_size - addr_len, "@%s%u", + nf->nf_name, nnum); + } else { + int l = 0; + int words = DIV_ROUND_UP(NID_ADDR_BYTES(nid), 4); + int i; + + for (i = 0; i < words && i < 4; i++) + l = snprintf(buf+l, buf_size-l, "%s%x", + i ? 
":" : "", ntohl(nid->nid_addr[i])); + snprintf(buf+l, buf_size-l, "@<%u:%u>", lnd, nnum); + } + + return buf; +} +EXPORT_SYMBOL(libcfs_nidstr_r); + +static struct netstrfns * +libcfs_str2net_internal(const char *str, __u32 *net) +{ + struct netstrfns *nf = NULL; + int nob; + unsigned int netnum; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(str, nf->nf_name, strlen(nf->nf_name))) + break; + } + + if (i == libcfs_nnetstrfns) + return NULL; + + nob = strlen(nf->nf_name); + + if (strlen(str) == (unsigned int)nob) { + netnum = 0; + } else { + if (nf->nf_type == LOLND) /* net number not allowed */ + return NULL; + + str += nob; + i = strlen(str); + if (sscanf(str, "%u%n", &netnum, &i) < 1 || + i != (int)strlen(str)) + return NULL; + } + + *net = LNET_MKNET(nf->nf_type, netnum); + return nf; +} + +__u32 +libcfs_str2net(const char *str) +{ + __u32 net; + + if (libcfs_str2net_internal(str, &net) != NULL) + return net; + + return LNET_NET_ANY; +} +EXPORT_SYMBOL(libcfs_str2net); + +lnet_nid_t +libcfs_str2nid(const char *str) +{ + const char *sep = strchr(str, '@'); + struct netstrfns *nf; + __u32 net; + __u32 addr; + + if (sep != NULL) { + nf = libcfs_str2net_internal(sep + 1, &net); + if (nf == NULL) + return LNET_NID_ANY; + } else { + sep = str + strlen(str); + net = LNET_MKNET(SOCKLND, 0); + nf = libcfs_lnd2netstrfns(SOCKLND); + LASSERT(nf != NULL); + } + + if (!nf->nf_str2addr(str, (int)(sep - str), &addr)) + return LNET_NID_ANY; + + return LNET_MKNID(net, addr); +} +EXPORT_SYMBOL(libcfs_str2nid); + +int +libcfs_strnid(struct lnet_nid *nid, const char *str) +{ + const char *sep = strchr(str, '@'); + struct netstrfns *nf; + __u32 net; + + if (sep != NULL) { + nf = libcfs_str2net_internal(sep + 1, &net); + if (nf == NULL) + return -EINVAL; + } else { + sep = str + strlen(str); + net = LNET_MKNET(SOCKLND, 0); + nf = libcfs_lnd2netstrfns(SOCKLND); + LASSERT(nf != NULL); + } + + memset(nid, 0, sizeof(*nid)); + nid->nid_type = LNET_NETTYP(net); + nid->nid_num = htons(LNET_NETNUM(net)); + if (nf->nf_str2addr_size) { + size_t asize = 0; + + if (!nf->nf_str2addr_size(str, (int)(sep - str), + nid->nid_addr, &asize)) + return -EINVAL; + nid->nid_size = asize - 4; + } else { + __u32 addr; + + if (!nf->nf_str2addr(str, (int)(sep - str), &addr)) + return -EINVAL; + nid->nid_addr[0] = htonl(addr); + nid->nid_size = 0; + } + return 0; +} +EXPORT_SYMBOL(libcfs_strnid); + +char * +libcfs_id2str(struct lnet_process_id id) +{ + char *str = libcfs_next_nidstring(); + + if (id.pid == LNET_PID_ANY) { + snprintf(str, LNET_NIDSTR_SIZE, + "LNET_PID_ANY-%s", libcfs_nid2str(id.nid)); + return str; + } + + snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", + ((id.pid & LNET_PID_USERFLAG) != 0) ? "U" : "", + (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid)); + return str; +} +EXPORT_SYMBOL(libcfs_id2str); + +char * +libcfs_idstr(struct lnet_processid *id) +{ + char *str = libcfs_next_nidstring(); + + if (id->pid == LNET_PID_ANY) { + snprintf(str, LNET_NIDSTR_SIZE, + "LNET_PID_ANY-%s", libcfs_nidstr(&id->nid)); + return str; + } + + snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", + ((id->pid & LNET_PID_USERFLAG) != 0) ? 
"U" : "", + (id->pid & ~LNET_PID_USERFLAG), libcfs_nidstr(&id->nid)); + return str; +} +EXPORT_SYMBOL(libcfs_idstr); + +int +libcfs_str2anynid(lnet_nid_t *nidp, const char *str) +{ + if (!strcmp(str, "*")) { + *nidp = LNET_NID_ANY; + return 1; + } + + *nidp = libcfs_str2nid(str); + return *nidp != LNET_NID_ANY; +} +EXPORT_SYMBOL(libcfs_str2anynid); diff --git a/drivers/staging/lustrefsx/lnet/lnet/peer.c b/drivers/staging/lustrefsx/lnet/lnet/peer.c new file mode 100644 index 0000000000000..7a438ea086c4e --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/peer.c @@ -0,0 +1,4314 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/lnet/peer.c + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#ifdef HAVE_SCHED_HEADERS +#include +#endif +#include + +#include +#include +#include + +/* Value indicating that recovery needs to re-check a peer immediately. 
*/ +#define LNET_REDISCOVER_PEER (1) + +static int lnet_peer_queue_for_discovery(struct lnet_peer *lp); + +static void +lnet_peer_remove_from_remote_list(struct lnet_peer_ni *lpni) +{ + if (!list_empty(&lpni->lpni_on_remote_peer_ni_list)) { + list_del_init(&lpni->lpni_on_remote_peer_ni_list); + lnet_peer_ni_decref_locked(lpni); + } +} + +void +lnet_peer_net_added(struct lnet_net *net) +{ + struct lnet_peer_ni *lpni, *tmp; + + list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_remote_peer_ni_list, + lpni_on_remote_peer_ni_list) { + + if (LNET_NID_NET(&lpni->lpni_nid) == net->net_id) { + lpni->lpni_net = net; + + spin_lock(&lpni->lpni_lock); + lpni->lpni_txcredits = + lpni->lpni_net->net_tunables.lct_peer_tx_credits; + lpni->lpni_mintxcredits = lpni->lpni_txcredits; + lpni->lpni_rtrcredits = + lnet_peer_buffer_credits(lpni->lpni_net); + lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits; + spin_unlock(&lpni->lpni_lock); + + lnet_peer_remove_from_remote_list(lpni); + } + } +} + +static void +lnet_peer_tables_destroy(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + if (!the_lnet.ln_peer_tables) + return; + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + hash = ptable->pt_hash; + if (!hash) /* not intialized */ + break; + + LASSERT(list_empty(&ptable->pt_zombie_list)); + + ptable->pt_hash = NULL; + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + LASSERT(list_empty(&hash[j])); + + CFS_FREE_PTR_ARRAY(hash, LNET_PEER_HASH_SIZE); + } + + cfs_percpt_free(the_lnet.ln_peer_tables); + the_lnet.ln_peer_tables = NULL; +} + +int +lnet_peer_tables_create(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ptable)); + if (the_lnet.ln_peer_tables == NULL) { + CERROR("Failed to allocate cpu-partition peer tables\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i, + LNET_PEER_HASH_SIZE * sizeof(*hash)); + if (hash == NULL) { + CERROR("Failed to create peer hash table\n"); + lnet_peer_tables_destroy(); + return -ENOMEM; + } + + spin_lock_init(&ptable->pt_zombie_lock); + INIT_LIST_HEAD(&ptable->pt_zombie_list); + + INIT_LIST_HEAD(&ptable->pt_peer_list); + + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + INIT_LIST_HEAD(&hash[j]); + ptable->pt_hash = hash; /* sign of initialization */ + } + + return 0; +} + +static struct lnet_peer_ni * +lnet_peer_ni_alloc(struct lnet_nid *nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_net *net; + int cpt; + + cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + LIBCFS_CPT_ALLOC(lpni, lnet_cpt_table(), cpt, sizeof(*lpni)); + if (!lpni) + return NULL; + + INIT_LIST_HEAD(&lpni->lpni_txq); + INIT_LIST_HEAD(&lpni->lpni_hashlist); + INIT_LIST_HEAD(&lpni->lpni_peer_nis); + INIT_LIST_HEAD(&lpni->lpni_recovery); + INIT_LIST_HEAD(&lpni->lpni_on_remote_peer_ni_list); + INIT_LIST_HEAD(&lpni->lpni_rtr_pref_nids); + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); + kref_init(&lpni->lpni_kref); + lpni->lpni_sel_priority = LNET_MAX_SELECTION_PRIORITY; + + spin_lock_init(&lpni->lpni_lock); + + if (lnet_peers_start_down()) + lpni->lpni_ns_status = LNET_NI_STATUS_DOWN; + else + lpni->lpni_ns_status = LNET_NI_STATUS_UP; + lpni->lpni_ping_feats = LNET_PING_FEAT_INVAL; + lpni->lpni_nid = *nid; + lpni->lpni_cpt = cpt; + atomic_set(&lpni->lpni_healthv, LNET_MAX_HEALTH_VALUE); + + net = lnet_get_net_locked(LNET_NID_NET(nid)); + lpni->lpni_net = net; + if (net) { 
+ lpni->lpni_txcredits = net->net_tunables.lct_peer_tx_credits; + lpni->lpni_mintxcredits = lpni->lpni_txcredits; + lpni->lpni_rtrcredits = lnet_peer_buffer_credits(net); + lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits; + } else { + /* + * This peer_ni is not on a local network, so we + * cannot add the credits here. In case the net is + * added later, add the peer_ni to the remote peer ni + * list so it can be easily found and revisited. + */ + /* FIXME: per-net implementation instead? */ + lnet_peer_ni_addref_locked(lpni); + list_add_tail(&lpni->lpni_on_remote_peer_ni_list, + &the_lnet.ln_remote_peer_ni_list); + } + + CDEBUG(D_NET, "%p nid %s\n", lpni, libcfs_nidstr(&lpni->lpni_nid)); + + return lpni; +} + +static struct lnet_peer_net * +lnet_peer_net_alloc(__u32 net_id) +{ + struct lnet_peer_net *lpn; + + LIBCFS_CPT_ALLOC(lpn, lnet_cpt_table(), CFS_CPT_ANY, sizeof(*lpn)); + if (!lpn) + return NULL; + + INIT_LIST_HEAD(&lpn->lpn_peer_nets); + INIT_LIST_HEAD(&lpn->lpn_peer_nis); + lpn->lpn_net_id = net_id; + lpn->lpn_sel_priority = LNET_MAX_SELECTION_PRIORITY; + + CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id)); + + return lpn; +} + +void +lnet_destroy_peer_net_locked(struct lnet_peer_net *lpn) +{ + struct lnet_peer *lp; + + CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id)); + + LASSERT(atomic_read(&lpn->lpn_refcount) == 0); + LASSERT(list_empty(&lpn->lpn_peer_nis)); + LASSERT(list_empty(&lpn->lpn_peer_nets)); + lp = lpn->lpn_peer; + lpn->lpn_peer = NULL; + LIBCFS_FREE(lpn, sizeof(*lpn)); + + lnet_peer_decref_locked(lp); +} + +static struct lnet_peer * +lnet_peer_alloc(struct lnet_nid *nid) +{ + struct lnet_peer *lp; + + LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), CFS_CPT_ANY, sizeof(*lp)); + if (!lp) + return NULL; + + INIT_LIST_HEAD(&lp->lp_rtrq); + INIT_LIST_HEAD(&lp->lp_routes); + INIT_LIST_HEAD(&lp->lp_peer_list); + INIT_LIST_HEAD(&lp->lp_peer_nets); + INIT_LIST_HEAD(&lp->lp_dc_list); + INIT_LIST_HEAD(&lp->lp_dc_pendq); + INIT_LIST_HEAD(&lp->lp_rtr_list); + init_waitqueue_head(&lp->lp_dc_waitq); + spin_lock_init(&lp->lp_lock); + lp->lp_primary_nid = *nid; + lp->lp_disc_src_nid = LNET_ANY_NID; + lp->lp_disc_dst_nid = LNET_ANY_NID; + if (lnet_peers_start_down()) + lp->lp_alive = false; + else + lp->lp_alive = true; + + /* + * all peers created on a router should have health on + * if it's not already on. + */ + if (the_lnet.ln_routing && !lnet_health_sensitivity) + lp->lp_health_sensitivity = 1; + + /* + * Turn off discovery for loopback peer. If you're creating a peer + * for the loopback interface then that was initiated when we + * attempted to send a message over the loopback. There is no need + * to ever use a different interface when sending messages to + * myself. 
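The allocators above build a three-level hierarchy, peer -> peer_net -> peer_ni. A trivial sketch of walking back up it, which the rest of this file does inline:

    static struct lnet_peer *
    lpni2peer(struct lnet_peer_ni *lpni)
    {
    	/* an attached peer_ni points at its peer_net, which points at
    	 * the owning peer */
    	return lpni->lpni_peer_net ? lpni->lpni_peer_net->lpn_peer : NULL;
    }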
+ */ + if (nid_is_lo0(nid)) + lp->lp_state = LNET_PEER_NO_DISCOVERY; + lp->lp_cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + CDEBUG(D_NET, "%p nid %s\n", lp, libcfs_nidstr(&lp->lp_primary_nid)); + + return lp; +} + +void +lnet_destroy_peer_locked(struct lnet_peer *lp) +{ + CDEBUG(D_NET, "%p nid %s\n", lp, libcfs_nidstr(&lp->lp_primary_nid)); + + LASSERT(atomic_read(&lp->lp_refcount) == 0); + LASSERT(lp->lp_rtr_refcount == 0); + LASSERT(list_empty(&lp->lp_peer_nets)); + LASSERT(list_empty(&lp->lp_peer_list)); + LASSERT(list_empty(&lp->lp_dc_list)); + + if (lp->lp_data) + lnet_ping_buffer_decref(lp->lp_data); + + /* + * if there are messages still on the pending queue, then make + * sure to queue them on the ln_msg_resend list so they can be + * resent at a later point if the discovery thread is still + * running. + * If the discovery thread has stopped, then the wakeup will be a + * no-op, and it is expected the lnet_shutdown_lndnets() will + * eventually be called, which will traverse this list and + * finalize the messages on the list. + * We can not resend them now because we're holding the cpt lock. + * Releasing the lock can cause an inconsistent state + */ + spin_lock(&the_lnet.ln_msg_resend_lock); + spin_lock(&lp->lp_lock); + list_splice(&lp->lp_dc_pendq, &the_lnet.ln_msg_resend); + spin_unlock(&lp->lp_lock); + spin_unlock(&the_lnet.ln_msg_resend_lock); + wake_up(&the_lnet.ln_dc_waitq); + + LIBCFS_FREE(lp, sizeof(*lp)); +} + +/* + * Detach a peer_ni from its peer_net. If this was the last peer_ni on + * that peer_net, detach the peer_net from the peer. + * + * Call with lnet_net_lock/EX held + */ +static void +lnet_peer_detach_peer_ni_locked(struct lnet_peer_ni *lpni) +{ + struct lnet_peer_table *ptable; + struct lnet_peer_net *lpn; + struct lnet_peer *lp; + + /* + * Belts and suspenders: gracefully handle teardown of a + * partially connected peer_ni. + */ + lpn = lpni->lpni_peer_net; + + list_del_init(&lpni->lpni_peer_nis); + /* + * If there are no lpni's left, we detach lpn from + * lp_peer_nets, so it cannot be found anymore. + */ + if (list_empty(&lpn->lpn_peer_nis)) + list_del_init(&lpn->lpn_peer_nets); + + /* Update peer NID count. */ + lp = lpn->lpn_peer; + lp->lp_nnis--; + + /* + * If there are no more peer nets, make the peer unfindable + * via the peer_tables. + * + * Otherwise, if the peer is DISCOVERED, tell discovery to + * take another look at it. This is a no-op if discovery for + * this peer did the detaching. + */ + if (list_empty(&lp->lp_peer_nets)) { + list_del_init(&lp->lp_peer_list); + ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; + ptable->pt_peers--; + } else if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) { + /* Discovery isn't running, nothing to do here. */ + } else if (lp->lp_state & LNET_PEER_DISCOVERED) { + lnet_peer_queue_for_discovery(lp); + wake_up(&the_lnet.ln_dc_waitq); + } + CDEBUG(D_NET, "peer %s NID %s\n", + libcfs_nidstr(&lp->lp_primary_nid), + libcfs_nidstr(&lpni->lpni_nid)); +} + +/* called with lnet_net_lock LNET_LOCK_EX held */ +static int +lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni, bool force) +{ + struct lnet_peer_table *ptable = NULL; + + /* don't remove a peer_ni if it's also a gateway */ + if (lnet_isrouter(lpni) && !force) { + CERROR("Peer NI %s is a gateway. Can not delete it\n", + libcfs_nidstr(&lpni->lpni_nid)); + return -EBUSY; + } + + lnet_peer_remove_from_remote_list(lpni); + + /* remove peer ni from the hash list. 
*/ + list_del_init(&lpni->lpni_hashlist); + + /* + * indicate the peer is being deleted so the monitor thread can + * remove it from the recovery queue. + */ + spin_lock(&lpni->lpni_lock); + lpni->lpni_state |= LNET_PEER_NI_DELETING; + spin_unlock(&lpni->lpni_lock); + + /* decrement the ref count on the peer table */ + ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; + + /* + * The peer_ni can no longer be found with a lookup. But there + * can be current users, so keep track of it on the zombie + * list until the reference count has gone to zero. + * + * The last reference may be lost in a place where the + * lnet_net_lock locks only a single cpt, and that cpt may not + * be lpni->lpni_cpt. So the zombie list of lnet_peer_table + * has its own lock. + */ + spin_lock(&ptable->pt_zombie_lock); + list_add(&lpni->lpni_hashlist, &ptable->pt_zombie_list); + ptable->pt_zombies++; + spin_unlock(&ptable->pt_zombie_lock); + + /* no need to keep this peer_ni on the hierarchy anymore */ + lnet_peer_detach_peer_ni_locked(lpni); + + /* remove hashlist reference on peer_ni */ + lnet_peer_ni_decref_locked(lpni); + + return 0; +} + +void lnet_peer_uninit(void) +{ + struct lnet_peer_ni *lpni, *tmp; + + lnet_net_lock(LNET_LOCK_EX); + + /* remove all peer_nis from the remote peer and the hash list */ + list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_remote_peer_ni_list, + lpni_on_remote_peer_ni_list) + lnet_peer_ni_del_locked(lpni, false); + + lnet_peer_tables_destroy(); + + lnet_net_unlock(LNET_LOCK_EX); +} + +static int +lnet_peer_del_locked(struct lnet_peer *peer) +{ + struct lnet_peer_ni *lpni = NULL, *lpni2; + int rc = 0, rc2 = 0; + + CDEBUG(D_NET, "peer %s\n", libcfs_nidstr(&peer->lp_primary_nid)); + + spin_lock(&peer->lp_lock); + peer->lp_state |= LNET_PEER_MARK_DELETED; + spin_unlock(&peer->lp_lock); + + lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni); + while (lpni != NULL) { + lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni); + rc = lnet_peer_ni_del_locked(lpni, false); + if (rc != 0) + rc2 = rc; + lpni = lpni2; + } + + return rc2; +} + +/* + * Discovering this peer is taking too long. Cancel any Ping or Push + * that discovery is waiting on by unlinking the relevant MDs. The + * lnet_discovery_event_handler() will proceed from here and complete + * the cleanup. + */ +static void lnet_peer_cancel_discovery(struct lnet_peer *lp) +{ + struct lnet_handle_md ping_mdh; + struct lnet_handle_md push_mdh; + + LNetInvalidateMDHandle(&ping_mdh); + LNetInvalidateMDHandle(&push_mdh); + + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_PING_SENT) { + ping_mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + } + if (lp->lp_state & LNET_PEER_PUSH_SENT) { + push_mdh = lp->lp_push_mdh; + LNetInvalidateMDHandle(&lp->lp_push_mdh); + } + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(ping_mdh)) + LNetMDUnlink(ping_mdh); + if (!LNetMDHandleIsInvalid(push_mdh)) + LNetMDUnlink(push_mdh); +} + +static int +lnet_peer_del(struct lnet_peer *peer) +{ + int rc; + + lnet_peer_cancel_discovery(peer); + lnet_net_lock(LNET_LOCK_EX); + rc = lnet_peer_del_locked(peer); + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +/* + * Delete a NID from a peer. Call with ln_api_mutex held. + * + * Error codes: + * -EPERM: Non-DLC deletion from DLC-configured peer. + * -ENOENT: No lnet_peer_ni corresponding to the nid. + * -ECHILD: The lnet_peer_ni isn't connected to the peer. + * -EBUSY: The lnet_peer_ni is the primary, and not the only peer_ni. 
+ */ +static int +lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid4, unsigned int flags) +{ + struct lnet_peer_ni *lpni; + struct lnet_nid primary_nid = lp->lp_primary_nid; + struct lnet_nid nid; + int rc = 0; + bool force = (flags & LNET_PEER_RTR_NI_FORCE_DEL) ? true : false; + + lnet_nid4_to_nid(nid4, &nid); + if (!(flags & LNET_PEER_CONFIGURED)) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + rc = -EPERM; + goto out; + } + } + + lpni = lnet_peer_ni_find_locked(&nid); + if (!lpni) { + rc = -ENOENT; + goto out; + } + lnet_peer_ni_decref_locked(lpni); + if (lp != lpni->lpni_peer_net->lpn_peer) { + rc = -ECHILD; + goto out; + } + + /* + * This function only allows deletion of the primary NID if it + * is the only NID. + */ + if (nid_same(&nid, &lp->lp_primary_nid) && lp->lp_nnis != 1 && !force) { + rc = -EBUSY; + goto out; + } + + lnet_net_lock(LNET_LOCK_EX); + + if (nid_same(&nid, &lp->lp_primary_nid) && lp->lp_nnis != 1 && force) { + struct lnet_peer_ni *lpni2; + /* assign the next peer_ni to be the primary */ + lpni2 = lnet_get_next_peer_ni_locked(lp, NULL, lpni); + LASSERT(lpni2); + lp->lp_primary_nid = lpni2->lpni_nid; + } + rc = lnet_peer_ni_del_locked(lpni, force); + + lnet_net_unlock(LNET_LOCK_EX); + +out: + CDEBUG(D_NET, "peer %s NID %s flags %#x: %d\n", + libcfs_nidstr(&primary_nid), libcfs_nidstr(&nid), + flags, rc); + + return rc; +} + +static void +lnet_peer_table_cleanup_locked(struct lnet_net *net, + struct lnet_peer_table *ptable) +{ + int i; + struct lnet_peer_ni *next; + struct lnet_peer_ni *lpni; + struct lnet_peer *peer; + + for (i = 0; i < LNET_PEER_HASH_SIZE; i++) { + list_for_each_entry_safe(lpni, next, &ptable->pt_hash[i], + lpni_hashlist) { + if (net != NULL && net != lpni->lpni_net) + continue; + + peer = lpni->lpni_peer_net->lpn_peer; + if (!nid_same(&peer->lp_primary_nid, + &lpni->lpni_nid)) { + lnet_peer_ni_del_locked(lpni, false); + continue; + } + /* + * Removing the primary NID implies removing + * the entire peer. Advance next beyond any + * peer_ni that belongs to the same peer. + */ + list_for_each_entry_from(next, &ptable->pt_hash[i], + lpni_hashlist) { + if (next->lpni_peer_net->lpn_peer != peer) + break; + } + lnet_peer_del_locked(peer); + } + } +} + +static void +lnet_peer_ni_finalize_wait(struct lnet_peer_table *ptable) +{ + wait_var_event_warning(&ptable->pt_zombies, + ptable->pt_zombies == 0, + "Waiting for %d zombies on peer table\n", + ptable->pt_zombies); +} + +static void +lnet_peer_table_del_rtrs_locked(struct lnet_net *net, + struct lnet_peer_table *ptable) +{ + struct lnet_peer_ni *lp; + struct lnet_peer_ni *tmp; + struct lnet_nid gw_nid; + int i; + + for (i = 0; i < LNET_PEER_HASH_SIZE; i++) { + list_for_each_entry_safe(lp, tmp, &ptable->pt_hash[i], + lpni_hashlist) { + if (net != lp->lpni_net) + continue; + + if (!lnet_isrouter(lp)) + continue; + + gw_nid = lp->lpni_peer_net->lpn_peer->lp_primary_nid; + + lnet_net_unlock(LNET_LOCK_EX); + lnet_del_route(LNET_NET_ANY, &gw_nid); + lnet_net_lock(LNET_LOCK_EX); + } + } +} + +void +lnet_peer_tables_cleanup(struct lnet_net *net) +{ + int i; + struct lnet_peer_table *ptable; + + LASSERT(the_lnet.ln_state != LNET_STATE_SHUTDOWN || net != NULL); + /* If just deleting the peers for a NI, get rid of any routes these + * peers are gateways for. 
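A hypothetical caller-side sketch, not in the patch, showing how the error codes documented for lnet_peer_del_nid() above might be handled; the caller is assumed to hold ln_api_mutex and to be acting on behalf of DLC.

    static int del_nid_example(struct lnet_peer *lp, lnet_nid_t nid4)
    {
    	int rc = lnet_peer_del_nid(lp, nid4, LNET_PEER_CONFIGURED);

    	switch (rc) {
    	case 0:		/* NID removed */
    	case -ENOENT:	/* no such peer_ni: nothing to do */
    		return 0;
    	case -EBUSY:	/* primary NID while other NIDs remain */
    	case -ECHILD:	/* peer_ni belongs to another peer */
    	case -EPERM:	/* non-DLC delete on a DLC-configured peer */
    	default:
    		return rc;
    	}
    }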
*/ + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_table_del_rtrs_locked(net, ptable); + lnet_net_unlock(LNET_LOCK_EX); + } + + /* Start the cleanup process */ + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_table_cleanup_locked(net, ptable); + lnet_net_unlock(LNET_LOCK_EX); + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) + lnet_peer_ni_finalize_wait(ptable); +} + +static struct lnet_peer_ni * +lnet_get_peer_ni_locked(struct lnet_peer_table *ptable, struct lnet_nid *nid) +{ + struct list_head *peers; + struct lnet_peer_ni *lp; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return NULL; + + peers = &ptable->pt_hash[lnet_nid2peerhash(nid)]; + list_for_each_entry(lp, peers, lpni_hashlist) { + if (nid_same(&lp->lpni_nid, nid)) { + lnet_peer_ni_addref_locked(lp); + return lp; + } + } + + return NULL; +} + +struct lnet_peer_ni * +lnet_find_peer_ni_locked(lnet_nid_t nid4) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_table *ptable; + int cpt; + struct lnet_nid nid; + + lnet_nid4_to_nid(nid4, &nid); + + cpt = lnet_nid_cpt_hash(&nid, LNET_CPT_NUMBER); + + ptable = the_lnet.ln_peer_tables[cpt]; + lpni = lnet_get_peer_ni_locked(ptable, &nid); + + return lpni; +} + +struct lnet_peer_ni * +lnet_peer_ni_find_locked(struct lnet_nid *nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_table *ptable; + int cpt; + + cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + ptable = the_lnet.ln_peer_tables[cpt]; + lpni = lnet_get_peer_ni_locked(ptable, nid); + + return lpni; +} + +struct lnet_peer_ni * +lnet_peer_get_ni_locked(struct lnet_peer *lp, lnet_nid_t nid) +{ + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + + lpn = lnet_peer_get_net_locked(lp, LNET_NIDNET(nid)); + if (!lpn) + return NULL; + + list_for_each_entry(lpni, &lpn->lpn_peer_nis, lpni_peer_nis) { + if (lnet_nid_to_nid4(&lpni->lpni_nid) == nid) + return lpni; + } + + return NULL; +} + +struct lnet_peer_ni * +lnet_peer_ni_get_locked(struct lnet_peer *lp, struct lnet_nid *nid) +{ + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + + lpn = lnet_peer_get_net_locked(lp, LNET_NID_NET(nid)); + if (!lpn) + return NULL; + + list_for_each_entry(lpni, &lpn->lpn_peer_nis, lpni_peer_nis) { + if (nid_same(&lpni->lpni_nid, nid)) + return lpni; + } + + return NULL; +} + +struct lnet_peer * +lnet_find_peer4(lnet_nid_t nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer *lp = NULL; + int cpt; + + cpt = lnet_net_lock_current(); + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_addref_locked(lp); + lnet_peer_ni_decref_locked(lpni); + } + lnet_net_unlock(cpt); + + return lp; +} + +struct lnet_peer * +lnet_find_peer(struct lnet_nid *nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer *lp = NULL; + int cpt; + + cpt = lnet_net_lock_current(); + lpni = lnet_peer_ni_find_locked(nid); + if (lpni) { + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_addref_locked(lp); + lnet_peer_ni_decref_locked(lpni); + } + lnet_net_unlock(cpt); + + return lp; +} + +struct lnet_peer_net * +lnet_get_next_peer_net_locked(struct lnet_peer *lp, __u32 prev_lpn_id) +{ + struct lnet_peer_net *net; + + if (!prev_lpn_id) { + /* no net id provided return the first net */ + net = list_first_entry_or_null(&lp->lp_peer_nets, + struct lnet_peer_net, + lpn_peer_nets); + + return net; + } + + /* find the net after the one provided */ + list_for_each_entry(net, &lp->lp_peer_nets, lpn_peer_nets) 
{ + if (net->lpn_net_id == prev_lpn_id) { + /* + * if we reached the end of the list loop to the + * beginning. + */ + if (net->lpn_peer_nets.next == &lp->lp_peer_nets) + return list_first_entry_or_null(&lp->lp_peer_nets, + struct lnet_peer_net, + lpn_peer_nets); + else + return list_next_entry(net, lpn_peer_nets); + } + } + + return NULL; +} + +struct lnet_peer_ni * +lnet_get_next_peer_ni_locked(struct lnet_peer *peer, + struct lnet_peer_net *peer_net, + struct lnet_peer_ni *prev) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_net *net = peer_net; + + if (!prev) { + if (!net) { + if (list_empty(&peer->lp_peer_nets)) + return NULL; + + net = list_entry(peer->lp_peer_nets.next, + struct lnet_peer_net, + lpn_peer_nets); + } + lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, + lpni_peer_nis); + + return lpni; + } + + if (prev->lpni_peer_nis.next == &prev->lpni_peer_net->lpn_peer_nis) { + /* + * if you reached the end of the peer ni list and the peer + * net is specified then there are no more peer nis in that + * net. + */ + if (net) + return NULL; + + /* + * we reached the end of this net ni list. move to the + * next net + */ + if (prev->lpni_peer_net->lpn_peer_nets.next == + &peer->lp_peer_nets) + /* no more nets and no more NIs. */ + return NULL; + + /* get the next net */ + net = list_entry(prev->lpni_peer_net->lpn_peer_nets.next, + struct lnet_peer_net, + lpn_peer_nets); + /* get the ni on it */ + lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, + lpni_peer_nis); + + return lpni; + } + + /* there are more nis left */ + lpni = list_entry(prev->lpni_peer_nis.next, + struct lnet_peer_ni, lpni_peer_nis); + + return lpni; +} + +/* Call with the ln_api_mutex held */ +int lnet_get_peer_list(u32 *countp, u32 *sizep, struct lnet_process_id __user *ids) +{ + struct lnet_process_id id; + struct lnet_peer_table *ptable; + struct lnet_peer *lp; + __u32 count = 0; + __u32 size = 0; + int lncpt; + int cpt; + __u32 i; + int rc; + + rc = -ESHUTDOWN; + if (the_lnet.ln_state != LNET_STATE_RUNNING) + goto done; + + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + + /* + * Count the number of peers, and return E2BIG if the buffer + * is too small. We'll also return the desired size. + */ + rc = -E2BIG; + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + count += ptable->pt_peers; + } + size = count * sizeof(*ids); + if (size > *sizep) + goto done; + + /* + * Walk the peer lists and copy out the primary nids. + * This is safe because the peer lists are only modified + * while the ln_api_mutex is held. So we don't need to + * hold the lnet_net_lock as well, and can therefore + * directly call copy_to_user(). + */ + rc = -EFAULT; + memset(&id, 0, sizeof(id)); + id.pid = LNET_PID_LUSTRE; + i = 0; + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { + if (!nid_is_nid4(&lp->lp_primary_nid)) + continue; + if (i >= count) + goto done; + id.nid = lnet_nid_to_nid4(&lp->lp_primary_nid); + if (copy_to_user(&ids[i], &id, sizeof(id))) + goto done; + i++; + } + } + rc = 0; +done: + *countp = count; + *sizep = size; + return rc; +} + +/* + * Start pushes to peers that need to be updated for a configuration + * change on this node. 
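The iteration idiom for lnet_get_next_peer_ni_locked() defined above, as also used by lnet_peer_del_locked() and lnet_peer_clr_non_mr_pref_nids(); sketch only, and the caller must hold the net lock.

    static void log_peer_nis_locked(struct lnet_peer *lp)
    {
    	struct lnet_peer_ni *lpni = NULL;

    	/* passing NULL for the peer_net walks every NI of the peer */
    	while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL)
    		CDEBUG(D_NET, "peer %s has NI %s\n",
    		       libcfs_nidstr(&lp->lp_primary_nid),
    		       libcfs_nidstr(&lpni->lpni_nid));
    }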
+ */ +void +lnet_push_update_to_peers(int force) +{ + struct lnet_peer_table *ptable; + struct lnet_peer *lp; + int lncpt; + int cpt; + + lnet_net_lock(LNET_LOCK_EX); + if (lnet_peer_discovery_disabled) + force = 0; + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { + if (force) { + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + lp->lp_state |= LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + } + if (lnet_peer_needs_push(lp)) + lnet_peer_queue_for_discovery(lp); + } + } + lnet_net_unlock(LNET_LOCK_EX); + wake_up(&the_lnet.ln_dc_waitq); +} + +/* find the NID in the preferred gateways for the remote peer + * return: + * false: list is not empty and NID is not preferred + * false: list is empty + * true: nid is found in the list + */ +bool +lnet_peer_is_pref_rtr_locked(struct lnet_peer_ni *lpni, + struct lnet_nid *gw_nid) +{ + struct lnet_nid_list *ne; + + CDEBUG(D_NET, "%s: rtr pref emtpy: %d\n", + libcfs_nidstr(&lpni->lpni_nid), + list_empty(&lpni->lpni_rtr_pref_nids)); + + if (list_empty(&lpni->lpni_rtr_pref_nids)) + return false; + + /* iterate through all the preferred NIDs and see if any of them + * matches the provided gw_nid + */ + list_for_each_entry(ne, &lpni->lpni_rtr_pref_nids, nl_list) { + CDEBUG(D_NET, "Comparing pref %s with gw %s\n", + libcfs_nidstr(&ne->nl_nid), + libcfs_nidstr(gw_nid)); + if (nid_same(&ne->nl_nid, gw_nid)) + return true; + } + + return false; +} + +void +lnet_peer_clr_pref_rtrs(struct lnet_peer_ni *lpni) +{ + struct list_head zombies; + struct lnet_nid_list *ne; + struct lnet_nid_list *tmp; + int cpt = lpni->lpni_cpt; + + INIT_LIST_HEAD(&zombies); + + lnet_net_lock(cpt); + list_splice_init(&lpni->lpni_rtr_pref_nids, &zombies); + lnet_net_unlock(cpt); + + list_for_each_entry_safe(ne, tmp, &zombies, nl_list) { + list_del(&ne->nl_list); + LIBCFS_FREE(ne, sizeof(*ne)); + } +} + +int +lnet_peer_add_pref_rtr(struct lnet_peer_ni *lpni, + struct lnet_nid *gw_nid) +{ + int cpt = lpni->lpni_cpt; + struct lnet_nid_list *ne = NULL; + + /* This function is called with api_mutex held. When the api_mutex + * is held the list can not be modified, as it is only modified as + * a result of applying a UDSP and that happens under api_mutex + * lock. + */ + __must_hold(&the_lnet.ln_api_mutex); + + list_for_each_entry(ne, &lpni->lpni_rtr_pref_nids, nl_list) { + if (nid_same(&ne->nl_nid, gw_nid)) + return -EEXIST; + } + + LIBCFS_CPT_ALLOC(ne, lnet_cpt_table(), cpt, sizeof(*ne)); + if (!ne) + return -ENOMEM; + + ne->nl_nid = *gw_nid; + + /* Lock the cpt to protect against addition and checks in the + * selection algorithm + */ + lnet_net_lock(cpt); + list_add(&ne->nl_list, &lpni->lpni_rtr_pref_nids); + lnet_net_unlock(cpt); + + return 0; +} + +/* + * Test whether a ni is a preferred ni for this peer_ni, e.g, whether + * this is a preferred point-to-point path. Call with lnet_net_lock in + * shared mmode. + */ +bool +lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, struct lnet_nid *nid) +{ + struct lnet_nid_list *ne; + + if (lpni->lpni_pref_nnids == 0) + return false; + if (lpni->lpni_pref_nnids == 1) + return nid_same(&lpni->lpni_pref.nid, nid); + list_for_each_entry(ne, &lpni->lpni_pref.nids, nl_list) { + if (nid_same(&ne->nl_nid, nid)) + return true; + } + return false; +} + +/* + * Set a single ni as preferred, provided no preferred ni is already + * defined. 
Only to be used for non-multi-rail peer_ni. + */ +int +lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, + struct lnet_nid *nid) +{ + int rc = 0; + + if (!nid) + return -EINVAL; + spin_lock(&lpni->lpni_lock); + if (LNET_NID_IS_ANY(nid)) { + rc = -EINVAL; + } else if (lpni->lpni_pref_nnids > 0) { + rc = -EPERM; + } else if (lpni->lpni_pref_nnids == 0) { + lpni->lpni_pref.nid = *nid; + lpni->lpni_pref_nnids = 1; + lpni->lpni_state |= LNET_PEER_NI_NON_MR_PREF; + } + spin_unlock(&lpni->lpni_lock); + + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nidstr(&lpni->lpni_nid), libcfs_nidstr(nid), rc); + return rc; +} + +/* + * Clear the preferred NID from a non-multi-rail peer_ni, provided + * this preference was set by lnet_peer_ni_set_non_mr_pref_nid(). + */ +int +lnet_peer_ni_clr_non_mr_pref_nid(struct lnet_peer_ni *lpni) +{ + int rc = 0; + + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF) { + lpni->lpni_pref_nnids = 0; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + } else if (lpni->lpni_pref_nnids == 0) { + rc = -ENOENT; + } else { + rc = -EPERM; + } + spin_unlock(&lpni->lpni_lock); + + CDEBUG(D_NET, "peer %s: %d\n", + libcfs_nidstr(&lpni->lpni_nid), rc); + return rc; +} + +void +lnet_peer_ni_set_selection_priority(struct lnet_peer_ni *lpni, __u32 priority) +{ + lpni->lpni_sel_priority = priority; +} + +/* + * Clear the preferred NIDs from a non-multi-rail peer. + */ +void +lnet_peer_clr_non_mr_pref_nids(struct lnet_peer *lp) +{ + struct lnet_peer_ni *lpni = NULL; + + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) + lnet_peer_ni_clr_non_mr_pref_nid(lpni); +} + +int +lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, struct lnet_nid *nid) +{ + struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; + struct lnet_nid_list *ne1 = NULL; + struct lnet_nid_list *ne2 = NULL; + struct lnet_nid *tmp_nid = NULL; + int rc = 0; + + if (LNET_NID_IS_ANY(nid)) { + rc = -EINVAL; + goto out; + } + + if (lpni->lpni_pref_nnids == 1 && + nid_same(&lpni->lpni_pref.nid, nid)) { + rc = -EEXIST; + goto out; + } + + /* A non-MR node may have only one preferred NI per peer_ni */ + if (lpni->lpni_pref_nnids > 0 && + !(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + rc = -EPERM; + goto out; + } + + /* add the new preferred nid to the list of preferred nids */ + if (lpni->lpni_pref_nnids != 0) { + size_t alloc_size = sizeof(*ne1); + + if (lpni->lpni_pref_nnids == 1) { + tmp_nid = &lpni->lpni_pref.nid; + INIT_LIST_HEAD(&lpni->lpni_pref.nids); + } + + list_for_each_entry(ne1, &lpni->lpni_pref.nids, nl_list) { + if (nid_same(&ne1->nl_nid, nid)) { + rc = -EEXIST; + goto out; + } + } + + LIBCFS_CPT_ALLOC(ne1, lnet_cpt_table(), lpni->lpni_cpt, + alloc_size); + if (!ne1) { + rc = -ENOMEM; + goto out; + } + + /* move the originally stored nid to the list */ + if (lpni->lpni_pref_nnids == 1) { + LIBCFS_CPT_ALLOC(ne2, lnet_cpt_table(), + lpni->lpni_cpt, alloc_size); + if (!ne2) { + rc = -ENOMEM; + goto out; + } + INIT_LIST_HEAD(&ne2->nl_list); + ne2->nl_nid = *tmp_nid; + } + ne1->nl_nid = *nid; + } + + lnet_net_lock(LNET_LOCK_EX); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_pref_nnids == 0) { + lpni->lpni_pref.nid = *nid; + } else { + if (ne2) + list_add_tail(&ne2->nl_list, &lpni->lpni_pref.nids); + list_add_tail(&ne1->nl_list, &lpni->lpni_pref.nids); + } + lpni->lpni_pref_nnids++; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(LNET_LOCK_EX); + +out: + if (rc == -EEXIST && (lpni->lpni_state & 
LNET_PEER_NI_NON_MR_PREF)) { + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + } + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nidstr(&lp->lp_primary_nid), libcfs_nidstr(nid), rc); + return rc; +} + +int +lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, struct lnet_nid *nid) +{ + struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; + struct lnet_nid_list *ne = NULL; + int rc = 0; + + if (lpni->lpni_pref_nnids == 0) { + rc = -ENOENT; + goto out; + } + + if (lpni->lpni_pref_nnids == 1) { + if (!nid_same(&lpni->lpni_pref.nid, nid)) { + rc = -ENOENT; + goto out; + } + } else { + list_for_each_entry(ne, &lpni->lpni_pref.nids, nl_list) { + if (nid_same(&ne->nl_nid, nid)) + goto remove_nid_entry; + } + rc = -ENOENT; + ne = NULL; + goto out; + } + +remove_nid_entry: + lnet_net_lock(LNET_LOCK_EX); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_pref_nnids == 1) + lpni->lpni_pref.nid = LNET_ANY_NID; + else { + list_del_init(&ne->nl_list); + if (lpni->lpni_pref_nnids == 2) { + struct lnet_nid_list *ne, *tmp; + + list_for_each_entry_safe(ne, tmp, + &lpni->lpni_pref.nids, + nl_list) { + lpni->lpni_pref.nid = ne->nl_nid; + list_del_init(&ne->nl_list); + LIBCFS_FREE(ne, sizeof(*ne)); + } + } + } + lpni->lpni_pref_nnids--; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(LNET_LOCK_EX); + + if (ne) + LIBCFS_FREE(ne, sizeof(*ne)); +out: + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nidstr(&lp->lp_primary_nid), libcfs_nidstr(nid), rc); + return rc; +} + +void +lnet_peer_clr_pref_nids(struct lnet_peer_ni *lpni) +{ + struct list_head zombies; + struct lnet_nid_list *ne; + struct lnet_nid_list *tmp; + + INIT_LIST_HEAD(&zombies); + + lnet_net_lock(LNET_LOCK_EX); + if (lpni->lpni_pref_nnids == 1) + lpni->lpni_pref.nid = LNET_ANY_NID; + else if (lpni->lpni_pref_nnids > 1) + list_splice_init(&lpni->lpni_pref.nids, &zombies); + lpni->lpni_pref_nnids = 0; + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry_safe(ne, tmp, &zombies, nl_list) { + list_del_init(&ne->nl_list); + LIBCFS_FREE(ne, sizeof(*ne)); + } +} + +void +lnet_peer_primary_nid_locked(struct lnet_nid *nid, struct lnet_nid *result) +{ + struct lnet_peer_ni *lpni; + + *result = *nid; + lpni = lnet_peer_ni_find_locked(nid); + if (lpni) { + *result = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; + lnet_peer_ni_decref_locked(lpni); + } +} + +bool +lnet_is_discovery_disabled_locked(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + if (lnet_peer_discovery_disabled) + return true; + + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL) || + (lp->lp_state & LNET_PEER_NO_DISCOVERY)) { + return true; + } + + return false; +} + +/* + * Peer Discovery + */ +bool +lnet_is_discovery_disabled(struct lnet_peer *lp) +{ + bool rc = false; + + spin_lock(&lp->lp_lock); + rc = lnet_is_discovery_disabled_locked(lp); + spin_unlock(&lp->lp_lock); + + return rc; +} + +int +LNetAddPeer(lnet_nid_t *nids, __u32 num_nids) +{ + lnet_nid_t pnid = 0; + bool mr; + int i, rc; + + if (!nids || num_nids < 1) + return -EINVAL; + + rc = LNetNIInit(LNET_PID_ANY); + if (rc < 0) + return rc; + + mutex_lock(&the_lnet.ln_api_mutex); + + mr = lnet_peer_discovery_disabled == 0; + + rc = 0; + for (i = 0; i < num_nids; i++) { + if (nids[i] == LNET_NID_LO_0) + continue; + + if (!pnid) { + pnid = nids[i]; + rc = lnet_add_peer_ni(pnid, LNET_NID_ANY, mr, true); + } else if (lnet_peer_discovery_disabled) { + rc = lnet_add_peer_ni(nids[i], LNET_NID_ANY, mr, true); + } else { + rc = 
lnet_add_peer_ni(pnid, nids[i], mr, true); + } + + if (rc && rc != -EEXIST) + goto unlock; + } + +unlock: + mutex_unlock(&the_lnet.ln_api_mutex); + + LNetNIFini(); + + return rc == -EEXIST ? 0 : rc; +} +EXPORT_SYMBOL(LNetAddPeer); + +/* FIXME support large-addr nid */ +lnet_nid_t +LNetPrimaryNID(lnet_nid_t nid) +{ + struct lnet_peer *lp; + struct lnet_peer_ni *lpni; + lnet_nid_t primary_nid = nid; + int rc = 0; + int cpt; + + if (nid == LNET_NID_LO_0) + return LNET_NID_LO_0; + + cpt = lnet_net_lock_current(); + lpni = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt); + if (IS_ERR(lpni)) { + rc = PTR_ERR(lpni); + goto out_unlock; + } + lp = lpni->lpni_peer_net->lpn_peer; + + /* If discovery is disabled locally then we needn't bother running + * discovery here because discovery will not modify whatever + * primary NID is currently set for this peer. If the specified peer is + * down then this discovery can introduce long delays into the mount + * process, so skip it if it isn't necessary. + */ + while (!lnet_peer_discovery_disabled && !lnet_peer_is_uptodate(lp)) { + spin_lock(&lp->lp_lock); + /* force a full discovery cycle */ + lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + + rc = lnet_discover_peer_locked(lpni, cpt, true); + if (rc) + goto out_decref; + /* The lpni (or lp) for this NID may have changed and our ref is + * the only thing keeping the old one around. Release the ref + * and lookup the lpni again + */ + lnet_peer_ni_decref_locked(lpni); + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + rc = -ENOENT; + goto out_unlock; + } + lp = lpni->lpni_peer_net->lpn_peer; + + /* If we find that the peer has discovery disabled then we will + * not modify whatever primary NID is currently set for this + * peer. Thus, we can break out of this loop even if the peer + * is not fully up to date. + */ + if (lnet_is_discovery_disabled(lp)) + break; + } + primary_nid = lnet_nid_to_nid4(&lp->lp_primary_nid); +out_decref: + lnet_peer_ni_decref_locked(lpni); +out_unlock: + lnet_net_unlock(cpt); + + CDEBUG(D_NET, "NID %s primary NID %s rc %d\n", libcfs_nid2str(nid), + libcfs_nid2str(primary_nid), rc); + return primary_nid; +} +EXPORT_SYMBOL(LNetPrimaryNID); + +struct lnet_peer_net * +lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id) +{ + struct lnet_peer_net *peer_net; + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + if (peer_net->lpn_net_id == net_id) + return peer_net; + } + return NULL; +} + +/* + * Attach a peer_ni to a peer_net and peer. This function assumes + * peer_ni is not already attached to the peer_net/peer. The peer_ni + * may be attached to a different peer, in which case it will be + * properly detached first. The whole operation is done atomically. + * + * This function consumes the reference on lpni and Always returns 0. + * This is the last function called from functions that do return an + * int, so returning 0 here allows the compiler to do a tail call. + */ +static int +lnet_peer_attach_peer_ni(struct lnet_peer *lp, + struct lnet_peer_net *lpn, + struct lnet_peer_ni *lpni, + unsigned flags) +{ + struct lnet_peer_table *ptable; + bool new_lpn = false; + int rc; + + /* Install the new peer_ni */ + lnet_net_lock(LNET_LOCK_EX); + /* Add peer_ni to global peer table hash, if necessary. 
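A sketch of how a kernel client of the two exported calls above, LNetAddPeer() and LNetPrimaryNID(), might pre-populate a peer and then resolve its primary NID. The addresses are illustrative and error handling is minimal.

    static void primary_nid_demo(void)
    {
    	lnet_nid_t nids[2];

    	nids[0] = libcfs_str2nid("10.1.2.3@tcp");
    	nids[1] = libcfs_str2nid("10.1.2.4@tcp");

    	if (LNetAddPeer(nids, 2) == 0)
    		CDEBUG(D_NET, "primary NID is %s\n",
    		       libcfs_nid2str(LNetPrimaryNID(nids[1])));
    }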
*/ + if (list_empty(&lpni->lpni_hashlist)) { + int hash = lnet_nid2peerhash(&lpni->lpni_nid); + + ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; + list_add_tail(&lpni->lpni_hashlist, &ptable->pt_hash[hash]); + ptable->pt_version++; + lnet_peer_ni_addref_locked(lpni); + } + + /* Detach the peer_ni from an existing peer, if necessary. */ + if (lpni->lpni_peer_net) { + LASSERT(lpni->lpni_peer_net != lpn); + LASSERT(lpni->lpni_peer_net->lpn_peer != lp); + lnet_peer_detach_peer_ni_locked(lpni); + lnet_peer_net_decref_locked(lpni->lpni_peer_net); + lpni->lpni_peer_net = NULL; + } + + /* Add peer_ni to peer_net */ + lpni->lpni_peer_net = lpn; + if (nid_same(&lp->lp_primary_nid, &lpni->lpni_nid)) + list_add(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); + else + list_add_tail(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); + lnet_update_peer_net_healthv(lpni); + lnet_peer_net_addref_locked(lpn); + + /* Add peer_net to peer */ + if (!lpn->lpn_peer) { + new_lpn = true; + lpn->lpn_peer = lp; + if (nid_same(&lp->lp_primary_nid, &lpni->lpni_nid)) + list_add(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + else + list_add_tail(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + lnet_peer_addref_locked(lp); + } + + /* Add peer to global peer list, if necessary */ + ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; + if (list_empty(&lp->lp_peer_list)) { + list_add_tail(&lp->lp_peer_list, &ptable->pt_peer_list); + ptable->pt_peers++; + } + + + /* Update peer state */ + spin_lock(&lp->lp_lock); + if (flags & LNET_PEER_CONFIGURED) { + if (!(lp->lp_state & LNET_PEER_CONFIGURED)) + lp->lp_state |= LNET_PEER_CONFIGURED; + } + if (flags & LNET_PEER_MULTI_RAIL) { + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + } + spin_unlock(&lp->lp_lock); + + lp->lp_nnis++; + + /* apply UDSPs */ + if (new_lpn) { + rc = lnet_udsp_apply_policies_on_lpn(lpn); + if (rc) + CERROR("Failed to apply UDSPs on lpn %s\n", + libcfs_net2str(lpn->lpn_net_id)); + } + rc = lnet_udsp_apply_policies_on_lpni(lpni); + if (rc) + CERROR("Failed to apply UDSPs on lpni %s\n", + libcfs_nidstr(&lpni->lpni_nid)); + + CDEBUG(D_NET, "peer %s NID %s flags %#x\n", + libcfs_nidstr(&lp->lp_primary_nid), + libcfs_nidstr(&lpni->lpni_nid), flags); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(LNET_LOCK_EX); + + return 0; +} + +/* + * Create a new peer, with nid as its primary nid. + * + * Call with the lnet_api_mutex held. + */ +static int +lnet_peer_add(lnet_nid_t nid4, unsigned int flags) +{ + struct lnet_nid nid; + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + int rc = 0; + + LASSERT(nid4 != LNET_NID_ANY); + + /* + * No need for the lnet_net_lock here, because the + * lnet_api_mutex is held. + */ + lpni = lnet_find_peer_ni_locked(nid4); + if (lpni) { + /* A peer with this NID already exists. */ + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_ni_decref_locked(lpni); + /* + * This is an error if the peer was configured and the + * primary NID differs or an attempt is made to change + * the Multi-Rail flag. Otherwise the assumption is + * that an existing peer is being modified. 
+ */ + if (lp->lp_state & LNET_PEER_CONFIGURED) { + if (lnet_nid_to_nid4(&lp->lp_primary_nid) != nid4) + rc = -EEXIST; + else if ((lp->lp_state ^ flags) & LNET_PEER_MULTI_RAIL) + rc = -EPERM; + goto out; + } else if (!(flags & LNET_PEER_CONFIGURED)) { + if (lnet_nid_to_nid4(&lp->lp_primary_nid) == nid4) { + rc = -EEXIST; + goto out; + } + } + /* Delete and recreate as a configured peer. */ + rc = lnet_peer_del(lp); + if (rc) + goto out; + } + + /* Create peer, peer_net, and peer_ni. */ + rc = -ENOMEM; + lnet_nid4_to_nid(nid4, &nid); + lp = lnet_peer_alloc(&nid); + if (!lp) + goto out; + lpn = lnet_peer_net_alloc(LNET_NID_NET(&nid)); + if (!lpn) + goto out_free_lp; + lpni = lnet_peer_ni_alloc(&nid); + if (!lpni) + goto out_free_lpn; + + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); + +out_free_lpn: + LIBCFS_FREE(lpn, sizeof(*lpn)); +out_free_lp: + LIBCFS_FREE(lp, sizeof(*lp)); +out: + CDEBUG(D_NET, "peer %s NID flags %#x: %d\n", + libcfs_nid2str(nid4), flags, rc); + return rc; +} + +/* + * Add a NID to a peer. Call with ln_api_mutex held. + * + * Error codes: + * -EPERM: Non-DLC addition to a DLC-configured peer. + * -EEXIST: The NID was configured by DLC for a different peer. + * -ENOMEM: Out of memory. + * -ENOTUNIQ: Adding a second peer NID on a single network on a + * non-multi-rail peer. + */ +static int +lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid4, unsigned int flags) +{ + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + struct lnet_nid nid; + int rc = 0; + + LASSERT(lp); + LASSERT(nid4 != LNET_NID_ANY); + + lnet_nid4_to_nid(nid4, &nid); + + /* A configured peer can only be updated through configuration. */ + if (!(flags & LNET_PEER_CONFIGURED)) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + rc = -EPERM; + goto out; + } + } + + /* + * The MULTI_RAIL flag can be set but not cleared, because + * that would leave the peer struct in an invalid state. + */ + if (flags & LNET_PEER_MULTI_RAIL) { + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + spin_unlock(&lp->lp_lock); + } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + rc = -EPERM; + goto out; + } + + lpni = lnet_find_peer_ni_locked(nid4); + if (lpni) { + /* + * A peer_ni already exists. This is only a problem if + * it is not connected to this peer and was configured + * by DLC. + */ + if (lpni->lpni_peer_net->lpn_peer == lp) + goto out_free_lpni; + if (lnet_peer_ni_is_configured(lpni)) { + rc = -EEXIST; + goto out_free_lpni; + } + /* If this is the primary NID, destroy the peer. */ + if (lnet_peer_ni_is_primary(lpni)) { + struct lnet_peer *rtr_lp = + lpni->lpni_peer_net->lpn_peer; + int rtr_refcount = rtr_lp->lp_rtr_refcount; + /* + * if we're trying to delete a router it means + * we're moving this peer NI to a new peer so must + * transfer router properties to the new peer + */ + if (rtr_refcount > 0) { + flags |= LNET_PEER_RTR_NI_FORCE_DEL; + lnet_rtr_transfer_to_peer(rtr_lp, lp); + } + lnet_peer_del(lpni->lpni_peer_net->lpn_peer); + lnet_peer_ni_decref_locked(lpni); + lpni = lnet_peer_ni_alloc(&nid); + if (!lpni) { + rc = -ENOMEM; + goto out_free_lpni; + } + } + } else { + lpni = lnet_peer_ni_alloc(&nid); + if (!lpni) { + rc = -ENOMEM; + goto out_free_lpni; + } + } + + /* + * Get the peer_net. Check that we're not adding a second + * peer_ni on a peer_net of a non-multi-rail peer. 
+ */ + lpn = lnet_peer_get_net_locked(lp, LNET_NIDNET(nid4)); + if (!lpn) { + lpn = lnet_peer_net_alloc(LNET_NIDNET(nid4)); + if (!lpn) { + rc = -ENOMEM; + goto out_free_lpni; + } + } else if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + rc = -ENOTUNIQ; + goto out_free_lpni; + } + + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); + +out_free_lpni: + lnet_peer_ni_decref_locked(lpni); +out: + CDEBUG(D_NET, "peer %s NID %s flags %#x: %d\n", + libcfs_nidstr(&lp->lp_primary_nid), libcfs_nid2str(nid4), + flags, rc); + return rc; +} + +/* + * Update the primary NID of a peer, if possible. + * + * Call with the lnet_api_mutex held. + */ +static int +lnet_peer_set_primary_nid(struct lnet_peer *lp, lnet_nid_t nid, + unsigned int flags) +{ + struct lnet_nid old = lp->lp_primary_nid; + int rc = 0; + + if (lnet_nid_to_nid4(&lp->lp_primary_nid) == nid) + goto out; + + lnet_nid4_to_nid(nid, &lp->lp_primary_nid); + + rc = lnet_peer_add_nid(lp, nid, flags); + if (rc) { + lp->lp_primary_nid = old; + goto out; + } +out: + CDEBUG(D_NET, "peer %s NID %s: %d\n", + libcfs_nidstr(&old), libcfs_nid2str(nid), rc); + + return rc; +} + +/* + * lpni creation initiated due to traffic either sending or receiving. + */ +static int +lnet_peer_ni_traffic_add(struct lnet_nid *nid, struct lnet_nid *pref) +{ + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + unsigned flags = 0; + int rc = 0; + + if (LNET_NID_IS_ANY(nid)) { + rc = -EINVAL; + goto out; + } + + /* lnet_net_lock is not needed here because ln_api_lock is held */ + lpni = lnet_peer_ni_find_locked(nid); + if (lpni) { + /* + * We must have raced with another thread. Since we + * know next to nothing about a peer_ni created by + * traffic, we just assume everything is ok and + * return. + */ + lnet_peer_ni_decref_locked(lpni); + goto out; + } + + /* Create peer, peer_net, and peer_ni. */ + rc = -ENOMEM; + lp = lnet_peer_alloc(nid); + if (!lp) + goto out; + lpn = lnet_peer_net_alloc(LNET_NID_NET(nid)); + if (!lpn) + goto out_free_lp; + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) + goto out_free_lpn; + lnet_peer_ni_set_non_mr_pref_nid(lpni, pref); + + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); + +out_free_lpn: + LIBCFS_FREE(lpn, sizeof(*lpn)); +out_free_lp: + LIBCFS_FREE(lp, sizeof(*lp)); +out: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nidstr(nid), rc); + return rc; +} + +/* + * Implementation of IOC_LIBCFS_ADD_PEER_NI. + * + * This API handles the following combinations: + * Create a peer with its primary NI if only the prim_nid is provided + * Add a NID to a peer identified by the prim_nid. The peer identified + * by the prim_nid must already exist. + * The peer being created may be non-MR. + * + * The caller must hold ln_api_mutex. This prevents the peer from + * being created/modified/deleted by a different thread. + */ +int +lnet_add_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr, bool temp) +{ + struct lnet_peer *lp = NULL; + struct lnet_peer_ni *lpni; + unsigned int flags = 0; + + /* The prim_nid must always be specified */ + if (prim_nid == LNET_NID_ANY) + return -EINVAL; + + if (!temp) + flags = LNET_PEER_CONFIGURED; + + if (mr) + flags |= LNET_PEER_MULTI_RAIL; + + /* + * If nid isn't specified, we must create a new peer with + * prim_nid as its primary nid. + */ + if (nid == LNET_NID_ANY) + return lnet_peer_add(prim_nid, flags); + + /* Look up the prim_nid, which must exist. 
*/ + lpni = lnet_find_peer_ni_locked(prim_nid); + if (!lpni) + return -ENOENT; + lnet_peer_ni_decref_locked(lpni); + lp = lpni->lpni_peer_net->lpn_peer; + + /* Peer must have been configured. */ + if (!temp && !(lp->lp_state & LNET_PEER_CONFIGURED)) { + CDEBUG(D_NET, "peer %s was not configured\n", + libcfs_nid2str(prim_nid)); + return -ENOENT; + } + + /* Primary NID must match */ + if (lnet_nid_to_nid4(&lp->lp_primary_nid) != prim_nid) { + CDEBUG(D_NET, "prim_nid %s is not primary for peer %s\n", + libcfs_nid2str(prim_nid), + libcfs_nidstr(&lp->lp_primary_nid)); + return -ENODEV; + } + + /* Multi-Rail flag must match. */ + if ((lp->lp_state ^ flags) & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "multi-rail state mismatch for peer %s\n", + libcfs_nid2str(prim_nid)); + return -EPERM; + } + + return lnet_peer_add_nid(lp, nid, flags); +} + +/* + * Implementation of IOC_LIBCFS_DEL_PEER_NI. + * + * This API handles the following combinations: + * Delete a NI from a peer if both prim_nid and nid are provided. + * Delete a peer if only prim_nid is provided. + * Delete a peer if its primary nid is provided. + * + * The caller must hold ln_api_mutex. This prevents the peer from + * being modified/deleted by a different thread. + */ +int +lnet_del_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid) +{ + struct lnet_peer *lp; + struct lnet_peer_ni *lpni; + unsigned flags; + + if (prim_nid == LNET_NID_ANY) + return -EINVAL; + + lpni = lnet_find_peer_ni_locked(prim_nid); + if (!lpni) + return -ENOENT; + lnet_peer_ni_decref_locked(lpni); + lp = lpni->lpni_peer_net->lpn_peer; + + if (prim_nid != lnet_nid_to_nid4(&lp->lp_primary_nid)) { + CDEBUG(D_NET, "prim_nid %s is not primary for peer %s\n", + libcfs_nid2str(prim_nid), + libcfs_nidstr(&lp->lp_primary_nid)); + return -ENODEV; + } + + lnet_net_lock(LNET_LOCK_EX); + if (lp->lp_rtr_refcount > 0) { + lnet_net_unlock(LNET_LOCK_EX); + CERROR("%s is a router. 
Can not be deleted\n",
+		       libcfs_nid2str(prim_nid));
+		return -EBUSY;
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	if (nid == LNET_NID_ANY || nid == lnet_nid_to_nid4(&lp->lp_primary_nid))
+		return lnet_peer_del(lp);
+
+	flags = LNET_PEER_CONFIGURED;
+	if (lp->lp_state & LNET_PEER_MULTI_RAIL)
+		flags |= LNET_PEER_MULTI_RAIL;
+
+	return lnet_peer_del_nid(lp, nid, flags);
+}
+
+void
+lnet_destroy_peer_ni_locked(struct kref *ref)
+{
+	struct lnet_peer_ni *lpni = container_of(ref, struct lnet_peer_ni,
+						 lpni_kref);
+	struct lnet_peer_table *ptable;
+	struct lnet_peer_net *lpn;
+
+	CDEBUG(D_NET, "%p nid %s\n", lpni, libcfs_nidstr(&lpni->lpni_nid));
+
+	LASSERT(kref_read(&lpni->lpni_kref) == 0);
+	LASSERT(list_empty(&lpni->lpni_txq));
+	LASSERT(lpni->lpni_txqnob == 0);
+	LASSERT(list_empty(&lpni->lpni_peer_nis));
+	LASSERT(list_empty(&lpni->lpni_on_remote_peer_ni_list));
+
+	lpn = lpni->lpni_peer_net;
+	lpni->lpni_peer_net = NULL;
+	lpni->lpni_net = NULL;
+
+	if (!list_empty(&lpni->lpni_hashlist)) {
+		/* remove the peer ni from the zombie list */
+		ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt];
+		spin_lock(&ptable->pt_zombie_lock);
+		list_del_init(&lpni->lpni_hashlist);
+		ptable->pt_zombies--;
+		spin_unlock(&ptable->pt_zombie_lock);
+	}
+
+	if (lpni->lpni_pref_nnids > 1) {
+		struct lnet_nid_list *ne, *tmp;
+
+		list_for_each_entry_safe(ne, tmp, &lpni->lpni_pref.nids,
+					 nl_list) {
+			list_del_init(&ne->nl_list);
+			LIBCFS_FREE(ne, sizeof(*ne));
+		}
+	}
+	LIBCFS_FREE(lpni, sizeof(*lpni));
+
+	if (lpn)
+		lnet_peer_net_decref_locked(lpn);
+}
+
+struct lnet_peer_ni *
+lnet_nid2peerni_ex(struct lnet_nid *nid, int cpt)
+{
+	struct lnet_peer_ni *lpni = NULL;
+	int rc;
+
+	if (the_lnet.ln_state != LNET_STATE_RUNNING)
+		return ERR_PTR(-ESHUTDOWN);
+
+	/*
+	 * find if a peer_ni already exists.
+	 * If so then just return that.
+	 */
+	lpni = lnet_peer_ni_find_locked(nid);
+	if (lpni)
+		return lpni;
+
+	lnet_net_unlock(cpt);
+
+	rc = lnet_peer_ni_traffic_add(nid, NULL);
+	if (rc) {
+		lpni = ERR_PTR(rc);
+		goto out_net_relock;
+	}
+
+	lpni = lnet_peer_ni_find_locked(nid);
+	LASSERT(lpni);
+
+out_net_relock:
+	lnet_net_lock(cpt);
+
+	return lpni;
+}
+
+/*
+ * Get a peer_ni for the given nid, create it if necessary. Takes a
+ * hold on the peer_ni.
+ */
+struct lnet_peer_ni *
+lnet_peerni_by_nid_locked(struct lnet_nid *nid,
+			  struct lnet_nid *pref, int cpt)
+{
+	struct lnet_peer_ni *lpni = NULL;
+	int rc;
+
+	if (the_lnet.ln_state != LNET_STATE_RUNNING)
+		return ERR_PTR(-ESHUTDOWN);
+
+	/*
+	 * find if a peer_ni already exists.
+	 * If so then just return that.
+	 */
+	lpni = lnet_peer_ni_find_locked(nid);
+	if (lpni)
+		return lpni;
+
+	/*
+	 * Slow path:
+	 * use the lnet_api_mutex to serialize the creation of the peer_ni
+	 * and the creation/deletion of the local ni/net. When a local ni is
+	 * created, if there exists a set of peer_nis on that network,
+	 * they need to be traversed and updated. When a local NI is
+	 * deleted, which could result in a network being deleted, then
+	 * all peer nis on that network need to be removed as well.
+	 *
+	 * Creation through traffic should also be serialized with
+	 * creation through DLC.
+	 */
+	lnet_net_unlock(cpt);
+	mutex_lock(&the_lnet.ln_api_mutex);
+	/*
+	 * Shutdown is only set under the ln_api_lock, so a single
+	 * check here is sufficient.
+ */ + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + lpni = ERR_PTR(-ESHUTDOWN); + goto out_mutex_unlock; + } + + rc = lnet_peer_ni_traffic_add(nid, pref); + if (rc) { + lpni = ERR_PTR(rc); + goto out_mutex_unlock; + } + + lpni = lnet_peer_ni_find_locked(nid); + LASSERT(lpni); + +out_mutex_unlock: + mutex_unlock(&the_lnet.ln_api_mutex); + lnet_net_lock(cpt); + + /* Lock has been dropped, check again for shutdown. */ + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + if (!IS_ERR(lpni)) + lnet_peer_ni_decref_locked(lpni); + lpni = ERR_PTR(-ESHUTDOWN); + } + + return lpni; +} + +struct lnet_peer_ni * +lnet_nid2peerni_locked(lnet_nid_t nid4, lnet_nid_t pref4, int cpt) +{ + struct lnet_nid nid, pref; + + lnet_nid4_to_nid(nid4, &nid); + lnet_nid4_to_nid(pref4, &pref); + if (pref4 == LNET_NID_ANY) + return lnet_peerni_by_nid_locked(&nid, NULL, cpt); + else + return lnet_peerni_by_nid_locked(&nid, &pref, cpt); +} + +bool +lnet_peer_gw_discovery(struct lnet_peer *lp) +{ + bool rc = false; + + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_RTR_DISCOVERY) + rc = true; + spin_unlock(&lp->lp_lock); + + return rc; +} + +bool +lnet_peer_is_uptodate(struct lnet_peer *lp) +{ + bool rc; + + spin_lock(&lp->lp_lock); + rc = lnet_peer_is_uptodate_locked(lp); + spin_unlock(&lp->lp_lock); + return rc; +} + +/* + * Is a peer uptodate from the point of view of discovery? + * + * If it is currently being processed, obviously not. + * A forced Ping or Push is also handled by the discovery thread. + * + * Otherwise look at whether the peer needs rediscovering. + */ +bool +lnet_peer_is_uptodate_locked(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + bool rc; + + if (lp->lp_state & (LNET_PEER_DISCOVERING | + LNET_PEER_FORCE_PING | + LNET_PEER_FORCE_PUSH)) { + rc = false; + } else if (lp->lp_state & LNET_PEER_REDISCOVER) { + rc = false; + } else if (lnet_peer_needs_push(lp)) { + rc = false; + } else if (lp->lp_state & LNET_PEER_DISCOVERED) { + if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) + rc = true; + else + rc = false; + } else { + rc = false; + } + + return rc; +} + +/* Add the message to the peer's lp_dc_pendq and queue the peer for discovery */ +void +lnet_peer_queue_message(struct lnet_peer *lp, struct lnet_msg *msg) +{ + /* The discovery thread holds net_lock/EX and lp_lock when it splices + * the lp_dc_pendq onto a local list for resending. Thus, we do the same + * when adding to the list and queuing the peer to ensure that we do not + * strand any messages on the lp_dc_pendq. This scheme ensures the + * message will be resent even if the peer is already being discovered. + * Therefore we needn't check the return value of + * lnet_peer_queue_for_discovery(lp). + */ + lnet_net_lock(LNET_LOCK_EX); + spin_lock(&lp->lp_lock); + list_add_tail(&msg->msg_list, &lp->lp_dc_pendq); + spin_unlock(&lp->lp_lock); + lnet_peer_queue_for_discovery(lp); + lnet_net_unlock(LNET_LOCK_EX); +} + +/* + * Queue a peer for the attention of the discovery thread. Call with + * lnet_net_lock/EX held. Returns 0 if the peer was queued, and + * -EALREADY if the peer was already queued. 
+ */ +static int lnet_peer_queue_for_discovery(struct lnet_peer *lp) +{ + int rc; + + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_DISCOVERING)) + lp->lp_state |= LNET_PEER_DISCOVERING; + spin_unlock(&lp->lp_lock); + if (list_empty(&lp->lp_dc_list)) { + lnet_peer_addref_locked(lp); + list_add_tail(&lp->lp_dc_list, &the_lnet.ln_dc_request); + wake_up(&the_lnet.ln_dc_waitq); + rc = 0; + } else { + rc = -EALREADY; + } + + CDEBUG(D_NET, "Queue peer %s: %d\n", + libcfs_nidstr(&lp->lp_primary_nid), rc); + + return rc; +} + +/* + * Discovery of a peer is complete. Wake all waiters on the peer. + * Call with lnet_net_lock/EX held. + */ +static void lnet_peer_discovery_complete(struct lnet_peer *lp, int dc_error) +{ + struct lnet_msg *msg, *tmp; + int rc = 0; + LIST_HEAD(pending_msgs); + + CDEBUG(D_NET, "Discovery complete. Dequeue peer %s\n", + libcfs_nidstr(&lp->lp_primary_nid)); + + spin_lock(&lp->lp_lock); + /* Our caller dropped lp_lock which may have allowed another thread to + * set LNET_PEER_DISCOVERING, or it may be set if dc_error is non-zero. + * Ensure it is cleared. + */ + lp->lp_state &= ~LNET_PEER_DISCOVERING; + if (dc_error) { + lp->lp_dc_error = dc_error; + lp->lp_state |= LNET_PEER_REDISCOVER; + } + list_splice_init(&lp->lp_dc_pendq, &pending_msgs); + spin_unlock(&lp->lp_lock); + list_del_init(&lp->lp_dc_list); + wake_up(&lp->lp_dc_waitq); + + if (lp->lp_rtr_refcount > 0) + lnet_router_discovery_complete(lp); + + lnet_net_unlock(LNET_LOCK_EX); + + /* iterate through all pending messages and send them again */ + list_for_each_entry_safe(msg, tmp, &pending_msgs, msg_list) { + list_del_init(&msg->msg_list); + if (dc_error) { + lnet_finalize(msg, dc_error); + continue; + } + + CDEBUG(D_NET, "sending pending message %s to target %s\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_idstr(&msg->msg_target)); + rc = lnet_send(&msg->msg_src_nid_param, msg, + &msg->msg_rtr_nid_param); + if (rc < 0) { + CNETERR("Error sending %s to %s: %d\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_idstr(&msg->msg_target), rc); + lnet_finalize(msg, rc); + } + } + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_decref_locked(lp); +} + +/* + * Handle inbound push. + * Like any event handler, called with lnet_res_lock/CPT held. + */ +void lnet_peer_push_event(struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf; + struct lnet_peer *lp; + + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md_start + ev->offset); + + /* lnet_find_peer() adds a refcount */ + lp = lnet_find_peer(&ev->source.nid); + if (!lp) { + CDEBUG(D_NET, "Push Put from unknown %s (source %s). Ignoring...\n", + libcfs_nidstr(&ev->initiator.nid), + libcfs_nidstr(&ev->source.nid)); + pbuf->pb_needs_post = true; + return; + } + + /* Ensure peer state remains consistent while we modify it. */ + spin_lock(&lp->lp_lock); + + /* + * If some kind of error happened the contents of the message + * cannot be used. Clear the NIDS_UPTODATE and set the + * FORCE_PING flag to trigger a ping. + */ + if (ev->status) { + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Push Put error %d from %s (source %s)\n", + ev->status, + libcfs_nidstr(&lp->lp_primary_nid), + libcfs_nidstr(&ev->source.nid)); + goto out; + } + + /* + * A push with invalid or corrupted info. Clear the UPTODATE + * flag to trigger a ping. 
+ */ + if (lnet_ping_info_validate(&pbuf->pb_info)) { + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Corrupted Push from %s\n", + libcfs_nidstr(&lp->lp_primary_nid)); + goto out; + } + + /* + * Make sure we'll allocate the correct size ping buffer when + * pinging the peer. + */ + if (lp->lp_data_nnis < pbuf->pb_info.pi_nnis) + lp->lp_data_nnis = pbuf->pb_info.pi_nnis; + + /* + * A non-Multi-Rail peer is not supposed to be capable of + * sending a push. + */ + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL)) { + CERROR("Push from non-Multi-Rail peer %s dropped\n", + libcfs_nidstr(&lp->lp_primary_nid)); + goto out; + } + + /* + * The peer may have discovery disabled at its end. Set + * NO_DISCOVERY as appropriate. + */ + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY)) { + CDEBUG(D_NET, "Peer %s has discovery disabled\n", + libcfs_nidstr(&lp->lp_primary_nid)); + /* + * Mark the peer for deletion if we already know about it + * and it's going from discovery set to no discovery set + */ + if (!(lp->lp_state & (LNET_PEER_NO_DISCOVERY | + LNET_PEER_DISCOVERING)) && + lp->lp_state & LNET_PEER_DISCOVERED) { + CDEBUG(D_NET, "Marking %s:0x%x for deletion\n", + libcfs_nidstr(&lp->lp_primary_nid), + lp->lp_state); + lp->lp_state |= LNET_PEER_MARK_DELETION; + } + lp->lp_state |= LNET_PEER_NO_DISCOVERY; + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, "Peer %s has discovery enabled\n", + libcfs_nidstr(&lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + } + + /* + * Update the MULTI_RAIL flag based on the push. If the peer + * was configured with DLC then the setting should match what + * DLC put in. + * NB: We verified above that the MR feature bit is set in pi_features + */ + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "peer %s(%p) is MR\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("Push says %s is Multi-Rail, DLC says not\n", + libcfs_nidstr(&lp->lp_primary_nid)); + } else if (lnet_peer_discovery_disabled) { + CDEBUG(D_NET, "peer %s(%p) not MR: DD disabled locally\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, "peer %s(%p) not MR: DD disabled remotely\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + } else { + CDEBUG(D_NET, "peer %s(%p) is MR capable\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + + /* + * Check for truncation of the Put message. Clear the + * NIDS_UPTODATE flag and set FORCE_PING to trigger a ping, + * and tell discovery to allocate a bigger buffer. + */ + if (ev->mlength < ev->rlength) { + if (the_lnet.ln_push_target_nnis < pbuf->pb_info.pi_nnis) + the_lnet.ln_push_target_nnis = pbuf->pb_info.pi_nnis; + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Truncated Push from %s (%d nids)\n", + libcfs_nidstr(&lp->lp_primary_nid), + pbuf->pb_info.pi_nnis); + goto out; + } + + /* always assume new data */ + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + + /* + * If there is data present that hasn't been processed yet, + * we'll replace it if the Put contained newer data and it + * fits. We're racing with a Ping or earlier Push in this + * case. 
+ */ + if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + if (LNET_PING_BUFFER_SEQNO(pbuf) > + LNET_PING_BUFFER_SEQNO(lp->lp_data) && + pbuf->pb_info.pi_nnis <= lp->lp_data->pb_nnis) { + memcpy(&lp->lp_data->pb_info, &pbuf->pb_info, + LNET_PING_INFO_SIZE(pbuf->pb_info.pi_nnis)); + CDEBUG(D_NET, "Ping/Push race from %s: %u vs %u\n", + libcfs_nidstr(&lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf), + LNET_PING_BUFFER_SEQNO(lp->lp_data)); + } + goto out; + } + + /* + * Allocate a buffer to copy the data. On a failure we drop + * the Push and set FORCE_PING to force the discovery + * thread to fix the problem by pinging the peer. + */ + lp->lp_data = lnet_ping_buffer_alloc(lp->lp_data_nnis, GFP_ATOMIC); + if (!lp->lp_data) { + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Cannot allocate Push buffer for %s %u\n", + libcfs_nidstr(&lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf)); + goto out; + } + + /* Success */ + unsafe_memcpy(&lp->lp_data->pb_info, &pbuf->pb_info, + LNET_PING_INFO_SIZE(pbuf->pb_info.pi_nnis), FLEXIBLE_OBJECT); + lp->lp_state |= LNET_PEER_DATA_PRESENT; + CDEBUG(D_NET, "Received Push %s %u\n", + libcfs_nidstr(&lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf)); + +out: + /* We've processed this buffer. It can be reposted */ + pbuf->pb_needs_post = true; + + /* + * Queue the peer for discovery if not done, force it on the request + * queue and wake the discovery thread if the peer was already queued, + * because its status changed. + */ + spin_unlock(&lp->lp_lock); + lnet_net_lock(LNET_LOCK_EX); + if (!lnet_peer_is_uptodate(lp) && lnet_peer_queue_for_discovery(lp)) { + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_request); + wake_up(&the_lnet.ln_dc_waitq); + } + /* Drop refcount from lookup */ + lnet_peer_decref_locked(lp); + lnet_net_unlock(LNET_LOCK_EX); +} + +/* + * Clear the discovery error state, unless we're already discovering + * this peer, in which case the error is current. + */ +static void lnet_peer_clear_discovery_error(struct lnet_peer *lp) +{ + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_DISCOVERING)) + lp->lp_dc_error = 0; + spin_unlock(&lp->lp_lock); +} + +/* + * Peer discovery slow path. The ln_api_mutex is held on entry, and + * dropped/retaken within this function. An lnet_peer_ni is passed in + * because discovery could tear down an lnet_peer. + */ +int +lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block) +{ + DEFINE_WAIT(wait); + struct lnet_peer *lp; + int rc = 0; + int count = 0; + +again: + lnet_net_unlock(cpt); + lnet_net_lock(LNET_LOCK_EX); + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_clear_discovery_error(lp); + + /* + * We're willing to be interrupted. The lpni can become a + * zombie if we race with DLC, so we must check for that. + */ + for (;;) { + /* Keep lp alive when the lnet_net_lock is unlocked */ + lnet_peer_addref_locked(lp); + prepare_to_wait(&lp->lp_dc_waitq, &wait, TASK_INTERRUPTIBLE); + if (signal_pending(current)) + break; + if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) + break; + /* + * Don't repeat discovery if discovery is disabled. 
This is + * done to ensure we can use discovery as a standard ping as + * well for backwards compatibility with routers which do not + * have discovery or have discovery disabled + */ + if (lnet_is_discovery_disabled(lp) && count > 0) + break; + if (lp->lp_dc_error) + break; + if (lnet_peer_is_uptodate(lp)) + break; + if (lp->lp_state & LNET_PEER_MARK_DELETED) + break; + lnet_peer_queue_for_discovery(lp); + count++; + CDEBUG(D_NET, "Discovery attempt # %d\n", count); + + /* + * If caller requested a non-blocking operation then + * return immediately. Once discovery is complete any + * pending messages that were stopped due to discovery + * will be transmitted. + */ + if (!block) + break; + + lnet_net_unlock(LNET_LOCK_EX); + schedule(); + finish_wait(&lp->lp_dc_waitq, &wait); + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_decref_locked(lp); + /* Peer may have changed */ + lp = lpni->lpni_peer_net->lpn_peer; + } + finish_wait(&lp->lp_dc_waitq, &wait); + + lnet_net_unlock(LNET_LOCK_EX); + lnet_net_lock(cpt); + lnet_peer_decref_locked(lp); + /* + * The peer may have changed, so re-check and rediscover if that turns + * out to have been the case. The reference count on lp ensured that + * even if it was unlinked from lpni the memory could not be recycled. + * Thus the check below is sufficient to determine whether the peer + * changed. If the peer changed, then lp must not be dereferenced. + */ + if (lp != lpni->lpni_peer_net->lpn_peer) + goto again; + + if (signal_pending(current)) + rc = -EINTR; + else if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) + rc = -ESHUTDOWN; + else if (lp->lp_dc_error) + rc = lp->lp_dc_error; + else if (!block) + CDEBUG(D_NET, "non-blocking discovery\n"); + else if (!lnet_peer_is_uptodate(lp) && + !(lnet_is_discovery_disabled(lp) || + (lp->lp_state & LNET_PEER_MARK_DELETED))) + goto again; + + CDEBUG(D_NET, "peer %s NID %s: %d. %s\n", + (lp ? libcfs_nidstr(&lp->lp_primary_nid) : "(none)"), + libcfs_nidstr(&lpni->lpni_nid), rc, + (!block) ? "pending discovery" : "discovery complete"); + + return rc; +} + +/* Handle an incoming ack for a push. */ +static void +lnet_discovery_event_ack(struct lnet_peer *lp, struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf; + + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md_start); + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_PUSH_SENT; + lp->lp_push_error = ev->status; + if (ev->status) + lp->lp_state |= LNET_PEER_PUSH_FAILED; + else + lp->lp_node_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + spin_unlock(&lp->lp_lock); + + CDEBUG(D_NET, "peer %s ev->status %d\n", + libcfs_nidstr(&lp->lp_primary_nid), ev->status); +} + +/* Handle a Reply message. This is the reply to a Ping message. */ +static void +lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf; + int rc; + + spin_lock(&lp->lp_lock); + + lp->lp_disc_src_nid = ev->target.nid; + lp->lp_disc_dst_nid = ev->source.nid; + + /* + * If some kind of error happened the contents of message + * cannot be used. Set PING_FAILED to trigger a retry. + */ + if (ev->status) { + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = ev->status; + CDEBUG(D_NET, "Ping Reply error %d from %s (source %s)\n", + ev->status, + libcfs_nidstr(&lp->lp_primary_nid), + libcfs_nidstr(&ev->source.nid)); + goto out; + } + + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md_start); + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(pbuf); + + /* + * A reply with invalid or corrupted info. 
Set PING_FAILED to + * trigger a retry. + */ + rc = lnet_ping_info_validate(&pbuf->pb_info); + if (rc) { + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + CDEBUG(D_NET, "Corrupted Ping Reply from %s: %d\n", + libcfs_nidstr(&lp->lp_primary_nid), rc); + goto out; + } + + /* + * The peer may have discovery disabled at its end. Set + * NO_DISCOVERY as appropriate. + */ + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY) || + lnet_peer_discovery_disabled) { + CDEBUG(D_NET, "Peer %s has discovery disabled\n", + libcfs_nidstr(&lp->lp_primary_nid)); + + /* Detect whether this peer has toggled discovery from on to + * off and whether we can delete and re-create the peer. Peers + * that were manually configured cannot be deleted by discovery. + * We need to delete this peer and re-create it if the peer was + * not configured manually, is currently considered DD capable, + * and either: + * 1. We've already discovered the peer (the peer has toggled + * the discovery feature from on to off), or + * 2. The peer is considered MR, but it was not user configured + * (this was a "temporary" peer created via the kernel APIs + * that we're discovering for the first time) + */ + if (!(lp->lp_state & (LNET_PEER_CONFIGURED | + LNET_PEER_NO_DISCOVERY)) && + (lp->lp_state & (LNET_PEER_DISCOVERED | + LNET_PEER_MULTI_RAIL))) { + CDEBUG(D_NET, "Marking %s:0x%x for deletion\n", + libcfs_nidstr(&lp->lp_primary_nid), + lp->lp_state); + lp->lp_state |= LNET_PEER_MARK_DELETION; + } + lp->lp_state |= LNET_PEER_NO_DISCOVERY; + } else { + CDEBUG(D_NET, "Peer %s has discovery enabled\n", + libcfs_nidstr(&lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + } + + /* + * Update the MULTI_RAIL flag based on the reply. If the peer + * was configured with DLC then the setting should match what + * DLC put in. + */ + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) { + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "peer %s(%p) is MR\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("Reply says %s is Multi-Rail, DLC says not\n", + libcfs_nidstr(&lp->lp_primary_nid)); + } else if (lnet_peer_discovery_disabled) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled locally\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled remotely\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + } else { + CDEBUG(D_NET, "peer %s(%p) is MR capable\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("DLC says %s is Multi-Rail, Reply says not\n", + libcfs_nidstr(&lp->lp_primary_nid)); + } else { + CERROR("Multi-Rail state vanished from %s\n", + libcfs_nidstr(&lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_MULTI_RAIL; + } + } + + /* + * Make sure we'll allocate the correct size ping buffer when + * pinging the peer. + */ + if (lp->lp_data_nnis < pbuf->pb_info.pi_nnis) + lp->lp_data_nnis = pbuf->pb_info.pi_nnis; + + /* + * Check for truncation of the Reply. Clear PING_SENT and set + * PING_FAILED to trigger a retry. 
+ */ + if (pbuf->pb_nnis < pbuf->pb_info.pi_nnis) { + if (the_lnet.ln_push_target_nnis < pbuf->pb_info.pi_nnis) + the_lnet.ln_push_target_nnis = pbuf->pb_info.pi_nnis; + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + CDEBUG(D_NET, "Truncated Reply from %s (%d nids)\n", + libcfs_nidstr(&lp->lp_primary_nid), + pbuf->pb_info.pi_nnis); + goto out; + } + + /* + * Check the sequence numbers in the reply. These are only + * available if the reply came from a Multi-Rail peer. + */ + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL && + pbuf->pb_info.pi_nnis > 1 && + lnet_nid_to_nid4(&lp->lp_primary_nid) == + pbuf->pb_info.pi_ni[1].ns_nid) { + if (LNET_PING_BUFFER_SEQNO(pbuf) < lp->lp_peer_seqno) + CDEBUG(D_NET, "peer %s: seq# got %u have %u. peer rebooted?\n", + libcfs_nidstr(&lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf), + lp->lp_peer_seqno); + + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + } + + /* We're happy with the state of the data in the buffer. */ + CDEBUG(D_NET, "peer %s data present %u. state = 0x%x\n", + libcfs_nidstr(&lp->lp_primary_nid), lp->lp_peer_seqno, + lp->lp_state); + if (lp->lp_state & LNET_PEER_DATA_PRESENT) + lnet_ping_buffer_decref(lp->lp_data); + else + lp->lp_state |= LNET_PEER_DATA_PRESENT; + lnet_ping_buffer_addref(pbuf); + lp->lp_data = pbuf; +out: + lp->lp_state &= ~LNET_PEER_PING_SENT; + spin_unlock(&lp->lp_lock); + + lnet_net_lock(LNET_LOCK_EX); + /* + * If this peer is a gateway, call the routing callback to + * handle the ping reply + */ + if (lp->lp_rtr_refcount > 0) + lnet_router_discovery_ping_reply(lp); + lnet_net_unlock(LNET_LOCK_EX); +} + +/* + * Send event handling. Only matters for error cases, where we clean + * up state on the peer and peer_ni that would otherwise be updated in + * the REPLY event handler for a successful Ping, and the ACK event + * handler for a successful Push. + */ +static int +lnet_discovery_event_send(struct lnet_peer *lp, struct lnet_event *ev) +{ + int rc = 0; + + if (!ev->status) + goto out; + + spin_lock(&lp->lp_lock); + if (ev->msg_type == LNET_MSG_GET) { + lp->lp_state &= ~LNET_PEER_PING_SENT; + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = ev->status; + } else { /* ev->msg_type == LNET_MSG_PUT */ + lp->lp_state &= ~LNET_PEER_PUSH_SENT; + lp->lp_state |= LNET_PEER_PUSH_FAILED; + lp->lp_push_error = ev->status; + } + spin_unlock(&lp->lp_lock); + rc = LNET_REDISCOVER_PEER; +out: + CDEBUG(D_NET, "%s Send to %s: %d\n", + (ev->msg_type == LNET_MSG_GET ? "Ping" : "Push"), + libcfs_nidstr(&ev->target.nid), rc); + return rc; +} + +/* + * Unlink event handling. This event is only seen if a call to + * LNetMDUnlink() caused the event to be unlinked. If this call was + * made after the event was set up in LNetGet() or LNetPut() then we + * assume the Ping or Push timed out. 
+ */
+static void
+lnet_discovery_event_unlink(struct lnet_peer *lp, struct lnet_event *ev)
+{
+	spin_lock(&lp->lp_lock);
+	/* We've passed through LNetGet() */
+	if (lp->lp_state & LNET_PEER_PING_SENT) {
+		lp->lp_state &= ~LNET_PEER_PING_SENT;
+		lp->lp_state |= LNET_PEER_PING_FAILED;
+		lp->lp_ping_error = -ETIMEDOUT;
+		CDEBUG(D_NET, "Ping Unlink for message to peer %s\n",
+		       libcfs_nidstr(&lp->lp_primary_nid));
+	}
+	/* We've passed through LNetPut() */
+	if (lp->lp_state & LNET_PEER_PUSH_SENT) {
+		lp->lp_state &= ~LNET_PEER_PUSH_SENT;
+		lp->lp_state |= LNET_PEER_PUSH_FAILED;
+		lp->lp_push_error = -ETIMEDOUT;
+		CDEBUG(D_NET, "Push Unlink for message to peer %s\n",
+		       libcfs_nidstr(&lp->lp_primary_nid));
+	}
+	spin_unlock(&lp->lp_lock);
+}
+
+/*
+ * Event handler for the discovery EQ.
+ *
+ * Called with lnet_res_lock(cpt) held. The cpt is the
+ * lnet_cpt_of_cookie() of the md handle cookie.
+ */
+static void lnet_discovery_event_handler(struct lnet_event *event)
+{
+	struct lnet_peer *lp = event->md_user_ptr;
+	struct lnet_ping_buffer *pbuf;
+	int rc;
+
+	/* discovery needs to take another look */
+	rc = LNET_REDISCOVER_PEER;
+
+	CDEBUG(D_NET, "Received event: %d\n", event->type);
+
+	switch (event->type) {
+	case LNET_EVENT_ACK:
+		lnet_discovery_event_ack(lp, event);
+		break;
+	case LNET_EVENT_REPLY:
+		lnet_discovery_event_reply(lp, event);
+		break;
+	case LNET_EVENT_SEND:
+		/* Only send failure triggers a retry. */
+		rc = lnet_discovery_event_send(lp, event);
+		break;
+	case LNET_EVENT_UNLINK:
+		/* LNetMDUnlink() was called */
+		lnet_discovery_event_unlink(lp, event);
+		break;
+	default:
+		/* Invalid events. */
+		LBUG();
+	}
+	lnet_net_lock(LNET_LOCK_EX);
+	if (event->unlinked) {
+		pbuf = LNET_PING_INFO_TO_BUFFER(event->md_start);
+		lnet_ping_buffer_decref(pbuf);
+		lnet_peer_decref_locked(lp);
+	}
+
+	/* put peer back at end of request queue, if discovery not already
+	 * done */
+	if (rc == LNET_REDISCOVER_PEER && !lnet_peer_is_uptodate(lp) &&
+	    lnet_peer_queue_for_discovery(lp)) {
+		list_move_tail(&lp->lp_dc_list, &the_lnet.ln_dc_request);
+		wake_up(&the_lnet.ln_dc_waitq);
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+
+/*
+ * Build a peer from incoming data.
+ *
+ * The NIDs in the incoming data are supposed to be structured as follows:
+ * - loopback
+ * - primary NID
+ * - other NIDs in same net
+ * - NIDs in second net
+ * - NIDs in third net
+ * - ...
+ * This is due to the way the list of NIDs in the data is created.
+ *
+ * Note that this function will mark the peer uptodate unless an
+ * ENOMEM is encountered. All other errors are due to a conflict
+ * between the DLC configuration and what discovery sees. We treat DLC
+ * as binding, and therefore set the NIDS_UPTODATE flag to prevent the
+ * peer from becoming stuck in discovery.
+ */
+static int lnet_peer_merge_data(struct lnet_peer *lp,
+				struct lnet_ping_buffer *pbuf)
+{
+	struct lnet_peer_net *lpn;
+	struct lnet_peer_ni *lpni;
+	lnet_nid_t *curnis = NULL;
+	struct lnet_ni_status *addnis = NULL;
+	lnet_nid_t *delnis = NULL;
+	unsigned flags;
+	int ncurnis;
+	int naddnis;
+	int ndelnis;
+	int nnis = 0;
+	int i;
+	int j;
+	int rc;
+
+	flags = LNET_PEER_DISCOVERED;
+	if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL)
+		flags |= LNET_PEER_MULTI_RAIL;
+
+	/*
+	 * Cache the routing feature for the peer; whether it is enabled
+	 * or disabled as reported by the remote peer.
+ */ + spin_lock(&lp->lp_lock); + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_RTE_DISABLED)) + lp->lp_state |= LNET_PEER_ROUTER_ENABLED; + else + lp->lp_state &= ~LNET_PEER_ROUTER_ENABLED; + spin_unlock(&lp->lp_lock); + + nnis = max_t(int, lp->lp_nnis, pbuf->pb_info.pi_nnis); + CFS_ALLOC_PTR_ARRAY(curnis, nnis); + CFS_ALLOC_PTR_ARRAY(addnis, nnis); + CFS_ALLOC_PTR_ARRAY(delnis, nnis); + if (!curnis || !addnis || !delnis) { + rc = -ENOMEM; + goto out; + } + ncurnis = 0; + naddnis = 0; + ndelnis = 0; + + /* Construct the list of NIDs present in peer. */ + lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) + curnis[ncurnis++] = lnet_nid_to_nid4(&lpni->lpni_nid); + + /* + * Check for NIDs in pbuf not present in curnis[]. + * The loop starts at 1 to skip the loopback NID. + */ + for (i = 1; i < pbuf->pb_info.pi_nnis; i++) { + for (j = 0; j < ncurnis; j++) + if (pbuf->pb_info.pi_ni[i].ns_nid == curnis[j]) + break; + if (j == ncurnis) + addnis[naddnis++] = pbuf->pb_info.pi_ni[i]; + } + /* + * Check for NIDs in curnis[] not present in pbuf. + * The nested loop starts at 1 to skip the loopback NID. + * + * But never add the loopback NID to delnis[]: if it is + * present in curnis[] then this peer is for this node. + */ + for (i = 0; i < ncurnis; i++) { + if (curnis[i] == LNET_NID_LO_0) + continue; + for (j = 1; j < pbuf->pb_info.pi_nnis; j++) { + if (curnis[i] == pbuf->pb_info.pi_ni[j].ns_nid) { + /* + * update the information we cache for the + * peer with the latest information we + * received + */ + lpni = lnet_find_peer_ni_locked(curnis[i]); + if (lpni) { + lpni->lpni_ns_status = pbuf->pb_info.pi_ni[j].ns_status; + lnet_peer_ni_decref_locked(lpni); + } + break; + } + } + if (j == pbuf->pb_info.pi_nnis) + delnis[ndelnis++] = curnis[i]; + } + + /* + * If we get here and the discovery is disabled then we don't want + * to add or delete any NIs. We just updated the ones we have some + * information on, and call it a day + */ + rc = 0; + if (lnet_is_discovery_disabled(lp)) + goto out; + + for (i = 0; i < naddnis; i++) { + rc = lnet_peer_add_nid(lp, addnis[i].ns_nid, flags); + if (rc) { + CERROR("Error adding NID %s to peer %s: %d\n", + libcfs_nid2str(addnis[i].ns_nid), + libcfs_nidstr(&lp->lp_primary_nid), rc); + if (rc == -ENOMEM) + goto out; + } + lpni = lnet_find_peer_ni_locked(addnis[i].ns_nid); + if (lpni) { + lpni->lpni_ns_status = addnis[i].ns_status; + lnet_peer_ni_decref_locked(lpni); + } + } + + for (i = 0; i < ndelnis; i++) { + /* + * for routers it's okay to delete the primary_nid because + * the upper layers don't really rely on it. So if we're + * being told that the router changed its primary_nid + * then it's okay to delete it. + */ + if (lp->lp_rtr_refcount > 0) + flags |= LNET_PEER_RTR_NI_FORCE_DEL; + rc = lnet_peer_del_nid(lp, delnis[i], flags); + if (rc) { + CERROR("Error deleting NID %s from peer %s: %d\n", + libcfs_nid2str(delnis[i]), + libcfs_nidstr(&lp->lp_primary_nid), rc); + if (rc == -ENOMEM) + goto out; + } + } + + /* The peer net for the primary NID should be the first entry in the + * peer's lp_peer_nets list, and the peer NI for the primary NID should + * be the first entry in its peer net's lpn_peer_nis list. 
+ */ + lpni = lnet_find_peer_ni_locked(pbuf->pb_info.pi_ni[1].ns_nid); + if (!lpni) { + CERROR("Internal error: Failed to lookup peer NI for primary NID: %s\n", + libcfs_nid2str(pbuf->pb_info.pi_ni[1].ns_nid)); + goto out; + } + + lnet_peer_ni_decref_locked(lpni); + + lpn = lpni->lpni_peer_net; + if (lpn->lpn_peer_nets.prev != &lp->lp_peer_nets) + list_move(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + + if (lpni->lpni_peer_nis.prev != &lpni->lpni_peer_net->lpn_peer_nis) + list_move(&lpni->lpni_peer_nis, + &lpni->lpni_peer_net->lpn_peer_nis); + + /* + * Errors other than -ENOMEM are due to peers having been + * configured with DLC. Ignore these because DLC overrides + * Discovery. + */ + rc = 0; +out: + CFS_FREE_PTR_ARRAY(curnis, nnis); + CFS_FREE_PTR_ARRAY(addnis, nnis); + CFS_FREE_PTR_ARRAY(delnis, nnis); + lnet_ping_buffer_decref(pbuf); + CDEBUG(D_NET, "peer %s (%p): %d\n", + libcfs_nidstr(&lp->lp_primary_nid), lp, rc); + + if (rc) { + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + spin_unlock(&lp->lp_lock); + } + return rc; +} + +/* + * The data in pbuf says lp is its primary peer, but the data was + * received by a different peer. Try to update lp with the data. + */ +static int +lnet_peer_set_primary_data(struct lnet_peer *lp, struct lnet_ping_buffer *pbuf) +{ + struct lnet_handle_md mdh; + + /* Queue lp for discovery, and force it on the request queue. */ + lnet_net_lock(LNET_LOCK_EX); + if (lnet_peer_queue_for_discovery(lp)) + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_request); + lnet_net_unlock(LNET_LOCK_EX); + + LNetInvalidateMDHandle(&mdh); + + /* + * Decide whether we can move the peer to the DATA_PRESENT state. + * + * We replace stale data for a multi-rail peer, repair PING_FAILED + * status, and preempt FORCE_PING. + * + * If after that we have DATA_PRESENT, we merge it into this peer. + */ + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + if (lp->lp_peer_seqno < LNET_PING_BUFFER_SEQNO(pbuf)) { + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + } else if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + lnet_ping_buffer_decref(pbuf); + pbuf = lp->lp_data; + lp->lp_data = NULL; + } + } + if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + lnet_ping_buffer_decref(lp->lp_data); + lp->lp_data = NULL; + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + } + if (lp->lp_state & LNET_PEER_PING_FAILED) { + mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + lp->lp_state &= ~LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + } + if (lp->lp_state & LNET_PEER_FORCE_PING) + lp->lp_state &= ~LNET_PEER_FORCE_PING; + lp->lp_state |= LNET_PEER_NIDS_UPTODATE; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + if (pbuf) + return lnet_peer_merge_data(lp, pbuf); + + CDEBUG(D_NET, "peer %s\n", libcfs_nidstr(&lp->lp_primary_nid)); + return 0; +} + +static bool lnet_is_nid_in_ping_info(lnet_nid_t nid, struct lnet_ping_info *pinfo) +{ + int i; + + for (i = 0; i < pinfo->pi_nnis; i++) { + if (pinfo->pi_ni[i].ns_nid == nid) + return true; + } + + return false; +} + +/* Delete a peer that has been marked for deletion. NB: when this peer was added + * to the discovery queue a reference was taken that will prevent the peer from + * actually being freed by this function. 
After this function exits the + * discovery thread should call lnet_peer_discovery_complete() which will + * drop that reference as well as wake any waiters that may also be holding a + * ref on the peer + */ +static int lnet_peer_deletion(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct list_head rlist; + struct lnet_route *route, *tmp; + int sensitivity = lp->lp_health_sensitivity; + int rc = 0; + + INIT_LIST_HEAD(&rlist); + + CDEBUG(D_NET, "peer %s(%p) state %#x\n", + libcfs_nidstr(&lp->lp_primary_nid), lp, lp->lp_state); + + /* no-op if lnet_peer_del() has already been called on this peer */ + if (lp->lp_state & LNET_PEER_MARK_DELETED) + goto clear_discovering; + + spin_unlock(&lp->lp_lock); + + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING || + the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) { + mutex_unlock(&the_lnet.ln_api_mutex); + spin_lock(&lp->lp_lock); + rc = -ESHUTDOWN; + goto clear_discovering; + } + + lnet_peer_cancel_discovery(lp); + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry_safe(route, tmp, + &lp->lp_routes, + lr_gwlist) + lnet_move_route(route, NULL, &rlist); + + /* lnet_peer_del_locked() deletes all the peer NIs owned by this peer */ + rc = lnet_peer_del_locked(lp); + if (rc) + CNETERR("Internal error: Unable to delete peer %s rc %d\n", + libcfs_nidstr(&lp->lp_primary_nid), rc); + + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry_safe(route, tmp, + &rlist, lr_list) { + /* re-add these routes */ + lnet_add_route(route->lr_net, + route->lr_hops, + &route->lr_nid, + route->lr_priority, + sensitivity); + LIBCFS_FREE(route, sizeof(*route)); + } + + mutex_unlock(&the_lnet.ln_api_mutex); + + spin_lock(&lp->lp_lock); + + rc = 0; + +clear_discovering: + lp->lp_state &= ~(LNET_PEER_DISCOVERING | LNET_PEER_FORCE_PING | + LNET_PEER_FORCE_PUSH); + + return rc; +} + +/* + * Update a peer using the data received. + */ +static int lnet_peer_data_present(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_ping_buffer *pbuf; + struct lnet_peer_ni *lpni; + lnet_nid_t nid = LNET_NID_ANY; + unsigned flags; + int rc = 0; + + pbuf = lp->lp_data; + lp->lp_data = NULL; + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + lp->lp_state |= LNET_PEER_NIDS_UPTODATE; + spin_unlock(&lp->lp_lock); + + /* + * Modifications of peer structures are done while holding the + * ln_api_mutex. A global lock is required because we may be + * modifying multiple peer structures, and a mutex greatly + * simplifies memory management. + * + * The actual changes to the data structures must also protect + * against concurrent lookups, for which the lnet_net_lock in + * LNET_LOCK_EX mode is used. + */ + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out; + } + + /* + * If this peer is not on the peer list then it is being torn + * down, and our reference count may be all that is keeping it + * alive. Don't do any work on it. + */ + if (list_empty(&lp->lp_peer_list)) + goto out; + + flags = LNET_PEER_DISCOVERED; + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) + flags |= LNET_PEER_MULTI_RAIL; + + /* + * Check whether the primary NID in the message matches the + * primary NID of the peer. If it does, update the peer, if + * it it does not, check whether there is already a peer with + * that primary NID. If no such peer exists, try to update + * the primary NID of the current peer (allowed if it was + * created due to message traffic) and complete the update. 
+ * If the peer did exist, hand off the data to it. + * + * The peer for the loopback interface is a special case: this + * is the peer for the local node, and we want to set its + * primary NID to the correct value here. Moreover, this peer + * can show up with only the loopback NID in the ping buffer. + */ + if (pbuf->pb_info.pi_nnis <= 1) { + lnet_ping_buffer_decref(pbuf); + goto out; + } + nid = pbuf->pb_info.pi_ni[1].ns_nid; + if (nid_is_lo0(&lp->lp_primary_nid)) { + rc = lnet_peer_set_primary_nid(lp, nid, flags); + if (!rc) + rc = lnet_peer_merge_data(lp, pbuf); + /* + * if the primary nid of the peer is present in the ping info returned + * from the peer, but it's not the local primary peer we have + * cached and discovery is disabled, then we don't want to update + * our local peer info, by adding or removing NIDs, we just want + * to update the status of the nids that we currently have + * recorded in that peer. + */ + } else if (lnet_nid_to_nid4(&lp->lp_primary_nid) == nid || + (lnet_is_nid_in_ping_info(lnet_nid_to_nid4(&lp->lp_primary_nid), + &pbuf->pb_info) && + lnet_is_discovery_disabled(lp))) { + rc = lnet_peer_merge_data(lp, pbuf); + } else { + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni || lp == lpni->lpni_peer_net->lpn_peer) { + rc = lnet_peer_set_primary_nid(lp, nid, flags); + if (rc) { + CERROR("Primary NID error %s versus %s: %d\n", + libcfs_nidstr(&lp->lp_primary_nid), + libcfs_nid2str(nid), rc); + } else { + rc = lnet_peer_merge_data(lp, pbuf); + } + if (lpni) + lnet_peer_ni_decref_locked(lpni); + } else { + struct lnet_peer *new_lp; + new_lp = lpni->lpni_peer_net->lpn_peer; + /* + * if lp has discovery/MR enabled that means new_lp + * should have discovery/MR enabled as well, since + * it's the same peer, which we're about to merge + */ + spin_lock(&lp->lp_lock); + spin_lock(&new_lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_NO_DISCOVERY)) + new_lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + new_lp->lp_state |= LNET_PEER_MULTI_RAIL; + /* If we're processing a ping reply then we may be + * about to send a push to the peer that we ping'd. + * Since the ping reply that we're processing was + * received by lp, we need to set the discovery source + * NID for new_lp to the NID stored in lp. + */ + if (!LNET_NID_IS_ANY(&lp->lp_disc_src_nid)) { + new_lp->lp_disc_src_nid = lp->lp_disc_src_nid; + new_lp->lp_disc_dst_nid = lp->lp_disc_dst_nid; + } + spin_unlock(&new_lp->lp_lock); + spin_unlock(&lp->lp_lock); + + rc = lnet_peer_set_primary_data(new_lp, pbuf); + lnet_consolidate_routes_locked(lp, new_lp); + lnet_peer_ni_decref_locked(lpni); + } + } +out: + CDEBUG(D_NET, "peer %s(%p): %d. state = 0x%x\n", + libcfs_nidstr(&lp->lp_primary_nid), lp, rc, + lp->lp_state); + mutex_unlock(&the_lnet.ln_api_mutex); + + spin_lock(&lp->lp_lock); + /* Tell discovery to re-check the peer immediately. */ + if (!rc) + rc = LNET_REDISCOVER_PEER; + return rc; +} + +/* + * A ping failed. Clear the PING_FAILED state and set the + * FORCE_PING state, to ensure a retry even if discovery is + * disabled. This avoids being left with incorrect state. 
+ */ +static int lnet_peer_ping_failed(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_handle_md mdh; + int rc; + + mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + lp->lp_state &= ~LNET_PEER_PING_FAILED; + lp->lp_state |= LNET_PEER_FORCE_PING; + rc = lp->lp_ping_error; + lp->lp_ping_error = 0; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + CDEBUG(D_NET, "peer %s:%d\n", + libcfs_nidstr(&lp->lp_primary_nid), rc); + + spin_lock(&lp->lp_lock); + return rc ? rc : LNET_REDISCOVER_PEER; +} + +/* Active side of ping. */ +static int lnet_peer_send_ping(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + int nnis; + int rc; + int cpt; + + lp->lp_state |= LNET_PEER_PING_SENT; + lp->lp_state &= ~LNET_PEER_FORCE_PING; + spin_unlock(&lp->lp_lock); + + cpt = lnet_net_lock_current(); + /* Refcount for MD. */ + lnet_peer_addref_locked(lp); + lnet_net_unlock(cpt); + + nnis = max(lp->lp_data_nnis, LNET_INTERFACES_MIN); + + rc = lnet_send_ping(&lp->lp_primary_nid, &lp->lp_ping_mdh, nnis, lp, + the_lnet.ln_dc_handler, false); + + /* + * if LNetMDBind in lnet_send_ping fails we need to decrement the + * refcount on the peer, otherwise LNetMDUnlink will be called + * which will eventually do that. + */ + if (rc > 0) { + lnet_net_lock(cpt); + lnet_peer_decref_locked(lp); + lnet_net_unlock(cpt); + rc = -rc; /* change the rc to negative value */ + goto fail_error; + } else if (rc < 0) { + goto fail_error; + } + + CDEBUG(D_NET, "peer %s\n", libcfs_nidstr(&lp->lp_primary_nid)); + + spin_lock(&lp->lp_lock); + return 0; + +fail_error: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nidstr(&lp->lp_primary_nid), rc); + /* + * The errors that get us here are considered hard errors and + * cause Discovery to terminate. So we clear PING_SENT, but do + * not set either PING_FAILED or FORCE_PING. In fact we need + * to clear PING_FAILED, because the unlink event handler will + * have set it if we called LNetMDUnlink() above. + */ + spin_lock(&lp->lp_lock); + lp->lp_state &= ~(LNET_PEER_PING_SENT | LNET_PEER_PING_FAILED); + return rc; +} + +/* + * This function exists because you cannot call LNetMDUnlink() from an + * event handler. + */ +static int lnet_peer_push_failed(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_handle_md mdh; + int rc; + + mdh = lp->lp_push_mdh; + LNetInvalidateMDHandle(&lp->lp_push_mdh); + lp->lp_state &= ~LNET_PEER_PUSH_FAILED; + rc = lp->lp_push_error; + lp->lp_push_error = 0; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + CDEBUG(D_NET, "peer %s\n", libcfs_nidstr(&lp->lp_primary_nid)); + spin_lock(&lp->lp_lock); + return rc ? rc : LNET_REDISCOVER_PEER; +} + +/* + * Mark the peer as discovered. + */ +static int lnet_peer_discovered(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + lp->lp_state |= LNET_PEER_DISCOVERED; + lp->lp_state &= ~(LNET_PEER_DISCOVERING | + LNET_PEER_REDISCOVER); + + lp->lp_dc_error = 0; + + CDEBUG(D_NET, "peer %s\n", libcfs_nidstr(&lp->lp_primary_nid)); + + return 0; +} + +/* Active side of push. */ +static int lnet_peer_send_push(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_ping_buffer *pbuf; + struct lnet_process_id id; + struct lnet_md md; + int cpt; + int rc; + + /* Don't push to a non-multi-rail peer. 
*/ + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state &= ~LNET_PEER_FORCE_PUSH; + /* if peer's NIDs are uptodate then peer is discovered */ + if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) { + rc = lnet_peer_discovered(lp); + return rc; + } + + return 0; + } + + lp->lp_state |= LNET_PEER_PUSH_SENT; + lp->lp_state &= ~LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + + cpt = lnet_net_lock_current(); + pbuf = the_lnet.ln_ping_target; + lnet_ping_buffer_addref(pbuf); + lnet_net_unlock(cpt); + + /* Push source MD */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(pbuf->pb_nnis); + md.threshold = 2; /* Put/Ack */ + md.max_size = 0; + md.options = LNET_MD_TRACK_RESPONSE; + md.handler = the_lnet.ln_dc_handler; + md.user_ptr = lp; + + rc = LNetMDBind(&md, LNET_UNLINK, &lp->lp_push_mdh); + if (rc) { + lnet_ping_buffer_decref(pbuf); + CERROR("Can't bind push source MD: %d\n", rc); + goto fail_error; + } + + cpt = lnet_net_lock_current(); + /* Refcount for MD. */ + lnet_peer_addref_locked(lp); + id.pid = LNET_PID_LUSTRE; + if (!LNET_NID_IS_ANY(&lp->lp_disc_dst_nid)) + id.nid = lnet_nid_to_nid4(&lp->lp_disc_dst_nid); + else + id.nid = lnet_nid_to_nid4(&lp->lp_primary_nid); + lnet_net_unlock(cpt); + + rc = LNetPut(lnet_nid_to_nid4(&lp->lp_disc_src_nid), lp->lp_push_mdh, + LNET_ACK_REQ, id, LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0, 0); + + /* + * reset the discovery nid. There is no need to restrict sending + * from that source, if we call lnet_push_update_to_peers(). It'll + * get set to a specific NID, if we initiate discovery from the + * scratch + */ + lp->lp_disc_src_nid = LNET_ANY_NID; + lp->lp_disc_dst_nid = LNET_ANY_NID; + + if (rc) + goto fail_unlink; + + CDEBUG(D_NET, "peer %s\n", libcfs_nidstr(&lp->lp_primary_nid)); + + spin_lock(&lp->lp_lock); + return 0; + +fail_unlink: + LNetMDUnlink(lp->lp_push_mdh); + LNetInvalidateMDHandle(&lp->lp_push_mdh); +fail_error: + CDEBUG(D_NET, "peer %s(%p): %d\n", libcfs_nidstr(&lp->lp_primary_nid), + lp, rc); + /* + * The errors that get us here are considered hard errors and + * cause Discovery to terminate. So we clear PUSH_SENT, but do + * not set PUSH_FAILED. In fact we need to clear PUSH_FAILED, + * because the unlink event handler will have set it if we + * called LNetMDUnlink() above. + */ + spin_lock(&lp->lp_lock); + lp->lp_state &= ~(LNET_PEER_PUSH_SENT | LNET_PEER_PUSH_FAILED); + return rc; +} + +/* + * Wait for work to be queued or some other change that must be + * attended to. Returns non-zero if the discovery thread should shut + * down. + */ +static int lnet_peer_discovery_wait_for_work(void) +{ + int cpt; + int rc = 0; + + DEFINE_WAIT(wait); + + cpt = lnet_net_lock_current(); + for (;;) { + prepare_to_wait(&the_lnet.ln_dc_waitq, &wait, + TASK_INTERRUPTIBLE); + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) + break; + if (lnet_push_target_resize_needed() || + the_lnet.ln_push_target->pb_needs_post) + break; + if (!list_empty(&the_lnet.ln_dc_request)) + break; + if (!list_empty(&the_lnet.ln_msg_resend)) + break; + lnet_net_unlock(cpt); + + /* + * wakeup max every second to check if there are peers that + * have been stuck on the working queue for greater than + * the peer timeout. 
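+		 * The wait is interruptible, so a wake_up() on
+		 * ln_dc_waitq ends the sleep early; the one second
+		 * timeout only bounds how long a stuck peer can go
+		 * unnoticed.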
+		 */
+		schedule_timeout(cfs_time_seconds(1));
+		finish_wait(&the_lnet.ln_dc_waitq, &wait);
+		cpt = lnet_net_lock_current();
+	}
+	finish_wait(&the_lnet.ln_dc_waitq, &wait);
+
+	if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING)
+		rc = -ESHUTDOWN;
+
+	lnet_net_unlock(cpt);
+
+	CDEBUG(D_NET, "woken: %d\n", rc);
+
+	return rc;
+}
+
+/*
+ * Messages that were pending on a destroyed peer will be put on a global
+ * resend list. The message resend list is checked by the discovery
+ * thread when it wakes up, and the messages are then resent. These
+ * messages may still be sendable if the lpni that originally caused
+ * them to be re-queued has been transferred to another peer.
+ *
+ * It is possible that LNet could be shut down while we're iterating
+ * through the list. lnet_shutdown_lndnets() will attempt to access the
+ * resend list, but will have to wait until the spinlock is released, by
+ * which time there shouldn't be any more messages on the resend list.
+ * During shutdown lnet_send() will fail and lnet_finalize() will be called
+ * for the messages so they can be released. The other case is that
+ * lnet_shutdown_lndnets() can finalize all the messages before this
+ * function can visit the resend list, in which case this function will be
+ * a no-op.
+ */
+static void lnet_resend_msgs(void)
+{
+	struct lnet_msg *msg, *tmp;
+	LIST_HEAD(resend);
+	int rc;
+
+	spin_lock(&the_lnet.ln_msg_resend_lock);
+	list_splice(&the_lnet.ln_msg_resend, &resend);
+	spin_unlock(&the_lnet.ln_msg_resend_lock);
+
+	list_for_each_entry_safe(msg, tmp, &resend, msg_list) {
+		list_del_init(&msg->msg_list);
+		rc = lnet_send(&msg->msg_src_nid_param, msg,
+			       &msg->msg_rtr_nid_param);
+		if (rc < 0) {
+			CNETERR("Error sending %s to %s: %d\n",
+				lnet_msgtyp2str(msg->msg_type),
+				libcfs_idstr(&msg->msg_target), rc);
+			lnet_finalize(msg, rc);
+		}
+	}
+}
+
+/* The discovery thread. */
+static int lnet_peer_discovery(void *arg)
+{
+	struct lnet_peer *lp;
+	int rc;
+
+	wait_for_completion(&the_lnet.ln_started);
+
+	CDEBUG(D_NET, "started\n");
+
+	for (;;) {
+		if (lnet_peer_discovery_wait_for_work())
+			break;
+
+		if (lnet_push_target_resize_needed())
+			lnet_push_target_resize();
+		else if (the_lnet.ln_push_target->pb_needs_post)
+			lnet_push_target_post(the_lnet.ln_push_target,
+					      &the_lnet.ln_push_target_md);
+
+		lnet_resend_msgs();
+
+		lnet_net_lock(LNET_LOCK_EX);
+		if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) {
+			lnet_net_unlock(LNET_LOCK_EX);
+			break;
+		}
+
+		/*
+		 * Process all incoming discovery work requests. When
+		 * discovery must wait on a peer to change state, it
+		 * is added to the tail of the ln_dc_working queue. A
+		 * timestamp keeps track of when the peer was added,
+		 * so we can time out discovery requests that take too
+		 * long.
+		 */
+		while (!list_empty(&the_lnet.ln_dc_request)) {
+			lp = list_first_entry(&the_lnet.ln_dc_request,
+					      struct lnet_peer, lp_dc_list);
+			list_move(&lp->lp_dc_list, &the_lnet.ln_dc_working);
+			/*
+			 * set the time the peer was put on the dc_working
+			 * queue. It shouldn't remain on the queue
+			 * forever, in case the GET message (for ping)
+			 * doesn't get a REPLY or the PUT message (for
+			 * push) doesn't get an ACK.
+ */ + lp->lp_last_queued = ktime_get_real_seconds(); + lnet_net_unlock(LNET_LOCK_EX); + + if (lnet_push_target_resize_needed()) + lnet_push_target_resize(); + else if (the_lnet.ln_push_target->pb_needs_post) + lnet_push_target_post(the_lnet.ln_push_target, + &the_lnet.ln_push_target_md); + + /* + * Select an action depending on the state of + * the peer and whether discovery is disabled. + * The check whether discovery is disabled is + * done after the code that handles processing + * for arrived data, cleanup for failures, and + * forcing a Ping or Push. + */ + spin_lock(&lp->lp_lock); + CDEBUG(D_NET, "peer %s(%p) state %#x\n", + libcfs_nidstr(&lp->lp_primary_nid), lp, + lp->lp_state); + if (lp->lp_state & (LNET_PEER_MARK_DELETION | + LNET_PEER_MARK_DELETED)) + rc = lnet_peer_deletion(lp); + else if (lp->lp_state & LNET_PEER_DATA_PRESENT) + rc = lnet_peer_data_present(lp); + else if (lp->lp_state & LNET_PEER_PING_FAILED) + rc = lnet_peer_ping_failed(lp); + else if (lp->lp_state & LNET_PEER_PUSH_FAILED) + rc = lnet_peer_push_failed(lp); + else if (lp->lp_state & LNET_PEER_FORCE_PING) + rc = lnet_peer_send_ping(lp); + else if (lp->lp_state & LNET_PEER_FORCE_PUSH) + rc = lnet_peer_send_push(lp); + else if (!(lp->lp_state & LNET_PEER_NIDS_UPTODATE)) + rc = lnet_peer_send_ping(lp); + else if (lnet_peer_needs_push(lp)) + rc = lnet_peer_send_push(lp); + else + rc = lnet_peer_discovered(lp); + CDEBUG(D_NET, "peer %s(%p) state %#x rc %d\n", + libcfs_nidstr(&lp->lp_primary_nid), lp, + lp->lp_state, rc); + + if (rc == LNET_REDISCOVER_PEER) { + spin_unlock(&lp->lp_lock); + lnet_net_lock(LNET_LOCK_EX); + list_move(&lp->lp_dc_list, + &the_lnet.ln_dc_request); + } else if (rc || + !(lp->lp_state & LNET_PEER_DISCOVERING)) { + spin_unlock(&lp->lp_lock); + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_discovery_complete(lp, rc); + } else { + spin_unlock(&lp->lp_lock); + lnet_net_lock(LNET_LOCK_EX); + } + + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) + break; + + } + + lnet_net_unlock(LNET_LOCK_EX); + } + + CDEBUG(D_NET, "stopping\n"); + /* + * Clean up before telling lnet_peer_discovery_stop() that + * we're done. Use wake_up() below to somewhat reduce the + * size of the thundering herd if there are multiple threads + * waiting on discovery of a single peer. + */ + + /* Queue cleanup 1: stop all pending pings and pushes. */ + lnet_net_lock(LNET_LOCK_EX); + while (!list_empty(&the_lnet.ln_dc_working)) { + lp = list_first_entry(&the_lnet.ln_dc_working, + struct lnet_peer, lp_dc_list); + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_expired); + lnet_net_unlock(LNET_LOCK_EX); + lnet_peer_cancel_discovery(lp); + lnet_net_lock(LNET_LOCK_EX); + } + lnet_net_unlock(LNET_LOCK_EX); + + /* Queue cleanup 2: wait for the expired queue to clear. */ + while (!list_empty(&the_lnet.ln_dc_expired)) + schedule_timeout_uninterruptible(cfs_time_seconds(1)); + + /* Queue cleanup 3: clear the request queue. */ + lnet_net_lock(LNET_LOCK_EX); + while (!list_empty(&the_lnet.ln_dc_request)) { + lp = list_first_entry(&the_lnet.ln_dc_request, + struct lnet_peer, lp_dc_list); + lnet_peer_discovery_complete(lp, -ESHUTDOWN); + } + lnet_net_unlock(LNET_LOCK_EX); + + lnet_assert_handler_unused(the_lnet.ln_dc_handler); + the_lnet.ln_dc_handler = NULL; + + the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; + wake_up(&the_lnet.ln_dc_waitq); + + CDEBUG(D_NET, "stopped\n"); + + return 0; +} + +/* ln_api_mutex is held on entry. 
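+ * If kthread_run() fails, the event handler and the RUNNING state set
+ * below are rolled back so that a later start attempt can succeed.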
*/ +int lnet_peer_discovery_start(void) +{ + struct task_struct *task; + int rc = 0; + + if (the_lnet.ln_dc_state != LNET_DC_STATE_SHUTDOWN) + return -EALREADY; + + the_lnet.ln_dc_handler = lnet_discovery_event_handler; + the_lnet.ln_dc_state = LNET_DC_STATE_RUNNING; + task = kthread_run(lnet_peer_discovery, NULL, "lnet_discovery"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Can't start peer discovery thread: %d\n", rc); + + the_lnet.ln_dc_handler = NULL; + + the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; + } + + CDEBUG(D_NET, "discovery start: %d\n", rc); + + return rc; +} + +/* ln_api_mutex is held on entry. */ +void lnet_peer_discovery_stop(void) +{ + if (the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN) + return; + + LASSERT(the_lnet.ln_dc_state == LNET_DC_STATE_RUNNING); + the_lnet.ln_dc_state = LNET_DC_STATE_STOPPING; + + /* In the LNetNIInit() path we may be stopping discovery before it + * entered its work loop + */ + if (!completion_done(&the_lnet.ln_started)) + complete(&the_lnet.ln_started); + else + wake_up(&the_lnet.ln_dc_waitq); + + mutex_unlock(&the_lnet.ln_api_mutex); + wait_event(the_lnet.ln_dc_waitq, + the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN); + mutex_lock(&the_lnet.ln_api_mutex); + + LASSERT(list_empty(&the_lnet.ln_dc_request)); + LASSERT(list_empty(&the_lnet.ln_dc_working)); + LASSERT(list_empty(&the_lnet.ln_dc_expired)); + + CDEBUG(D_NET, "discovery stopped\n"); +} + +/* Debugging */ + +void +lnet_debug_peer(lnet_nid_t nid) +{ + char *aliveness = "NA"; + struct lnet_peer_ni *lp; + int cpt; + + cpt = lnet_cpt_of_nid(nid, NULL); + lnet_net_lock(cpt); + + lp = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt); + if (IS_ERR(lp)) { + lnet_net_unlock(cpt); + CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid)); + return; + } + + if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp)) + aliveness = (lnet_is_peer_ni_alive(lp)) ? "up" : "down"; + + CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n", + libcfs_nidstr(&lp->lpni_nid), kref_read(&lp->lpni_kref), + aliveness, lp->lpni_net->net_tunables.lct_peer_tx_credits, + lp->lpni_rtrcredits, lp->lpni_minrtrcredits, + lp->lpni_txcredits, lp->lpni_mintxcredits, lp->lpni_txqnob); + + lnet_peer_ni_decref_locked(lp); + + lnet_net_unlock(cpt); +} + +/* Gathering information for userspace. 
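+ * lnet_get_peer_ni_info() returns a single peer NI per call, indexed
+ * by peer_index, while lnet_get_peer_info() copies per-NI credit,
+ * statistics and health blocks for a whole peer into a user-supplied
+ * bulk buffer.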
*/ + +int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, + char aliveness[LNET_MAX_STR_LEN], + __u32 *cpt_iter, __u32 *refcount, + __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits, + __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credits, + __u32 *peer_tx_qnob) +{ + struct lnet_peer_table *peer_table; + struct lnet_peer_ni *lp; + int j; + int lncpt; + bool found = false; + + /* get the number of CPTs */ + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + + /* if the cpt number to be examined is >= the number of cpts in + * the system then indicate that there are no more cpts to examin + */ + if (*cpt_iter >= lncpt) + return -ENOENT; + + /* get the current table */ + peer_table = the_lnet.ln_peer_tables[*cpt_iter]; + /* if the ptable is NULL then there are no more cpts to examine */ + if (peer_table == NULL) + return -ENOENT; + + lnet_net_lock(*cpt_iter); + + for (j = 0; j < LNET_PEER_HASH_SIZE && !found; j++) { + struct list_head *peers = &peer_table->pt_hash[j]; + + list_for_each_entry(lp, peers, lpni_hashlist) { + if (!nid_is_nid4(&lp->lpni_nid)) + continue; + if (peer_index-- > 0) + continue; + + snprintf(aliveness, LNET_MAX_STR_LEN, "NA"); + if (lnet_isrouter(lp) || + lnet_peer_aliveness_enabled(lp)) + snprintf(aliveness, LNET_MAX_STR_LEN, + lnet_is_peer_ni_alive(lp) ? "up" : "down"); + + *nid = lnet_nid_to_nid4(&lp->lpni_nid); + *refcount = kref_read(&lp->lpni_kref); + *ni_peer_tx_credits = + lp->lpni_net->net_tunables.lct_peer_tx_credits; + *peer_tx_credits = lp->lpni_txcredits; + *peer_rtr_credits = lp->lpni_rtrcredits; + *peer_min_rtr_credits = lp->lpni_mintxcredits; + *peer_tx_qnob = lp->lpni_txqnob; + + found = true; + } + + } + lnet_net_unlock(*cpt_iter); + + *cpt_iter = lncpt; + + return found ? 0 : -ENOENT; +} + +/* ln_api_mutex is held, which keeps the peer list stable */ +int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) +{ + struct lnet_ioctl_element_stats *lpni_stats; + struct lnet_ioctl_element_msg_stats *lpni_msg_stats; + struct lnet_ioctl_peer_ni_hstats *lpni_hstats; + struct lnet_peer_ni_credit_info *lpni_info; + struct lnet_peer_ni *lpni; + struct lnet_peer *lp; + lnet_nid_t nid; + __u32 size; + int rc; + + lp = lnet_find_peer4(cfg->prcfg_prim_nid); + + if (!lp) { + rc = -ENOENT; + goto out; + } + + size = sizeof(nid) + sizeof(*lpni_info) + sizeof(*lpni_stats) + + sizeof(*lpni_msg_stats) + sizeof(*lpni_hstats); + size *= lp->lp_nnis; + if (size > cfg->prcfg_size) { + cfg->prcfg_size = size; + rc = -E2BIG; + goto out_lp_decref; + } + + cfg->prcfg_prim_nid = lnet_nid_to_nid4(&lp->lp_primary_nid); + cfg->prcfg_mr = lnet_peer_is_multi_rail(lp); + cfg->prcfg_cfg_nid = lnet_nid_to_nid4(&lp->lp_primary_nid); + cfg->prcfg_count = lp->lp_nnis; + cfg->prcfg_size = size; + cfg->prcfg_state = lp->lp_state; + + /* Allocate helper buffers. 
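+	 * The scratch structures below are allocated once and re-filled
+	 * for every peer NI in the loop; each block is staged here and
+	 * then copied out with copy_to_user().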
*/ + rc = -ENOMEM; + LIBCFS_ALLOC(lpni_info, sizeof(*lpni_info)); + if (!lpni_info) + goto out_lp_decref; + LIBCFS_ALLOC(lpni_stats, sizeof(*lpni_stats)); + if (!lpni_stats) + goto out_free_info; + LIBCFS_ALLOC(lpni_msg_stats, sizeof(*lpni_msg_stats)); + if (!lpni_msg_stats) + goto out_free_stats; + LIBCFS_ALLOC(lpni_hstats, sizeof(*lpni_hstats)); + if (!lpni_hstats) + goto out_free_msg_stats; + + + lpni = NULL; + rc = -EFAULT; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { + if (!nid_is_nid4(&lpni->lpni_nid)) + continue; + nid = lnet_nid_to_nid4(&lpni->lpni_nid); + if (copy_to_user(bulk, &nid, sizeof(nid))) + goto out_free_hstats; + bulk += sizeof(nid); + + memset(lpni_info, 0, sizeof(*lpni_info)); + snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN, "NA"); + if (lnet_isrouter(lpni) || + lnet_peer_aliveness_enabled(lpni)) + snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN, + lnet_is_peer_ni_alive(lpni) ? "up" : "down"); + + lpni_info->cr_refcount = kref_read(&lpni->lpni_kref); + lpni_info->cr_ni_peer_tx_credits = (lpni->lpni_net != NULL) ? + lpni->lpni_net->net_tunables.lct_peer_tx_credits : 0; + lpni_info->cr_peer_tx_credits = lpni->lpni_txcredits; + lpni_info->cr_peer_rtr_credits = lpni->lpni_rtrcredits; + lpni_info->cr_peer_min_rtr_credits = lpni->lpni_minrtrcredits; + lpni_info->cr_peer_min_tx_credits = lpni->lpni_mintxcredits; + lpni_info->cr_peer_tx_qnob = lpni->lpni_txqnob; + if (copy_to_user(bulk, lpni_info, sizeof(*lpni_info))) + goto out_free_hstats; + bulk += sizeof(*lpni_info); + + memset(lpni_stats, 0, sizeof(*lpni_stats)); + lpni_stats->iel_send_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_SEND); + lpni_stats->iel_recv_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_RECV); + lpni_stats->iel_drop_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_DROP); + if (copy_to_user(bulk, lpni_stats, sizeof(*lpni_stats))) + goto out_free_hstats; + bulk += sizeof(*lpni_stats); + lnet_usr_translate_stats(lpni_msg_stats, &lpni->lpni_stats); + if (copy_to_user(bulk, lpni_msg_stats, sizeof(*lpni_msg_stats))) + goto out_free_hstats; + bulk += sizeof(*lpni_msg_stats); + lpni_hstats->hlpni_network_timeout = + atomic_read(&lpni->lpni_hstats.hlt_network_timeout); + lpni_hstats->hlpni_remote_dropped = + atomic_read(&lpni->lpni_hstats.hlt_remote_dropped); + lpni_hstats->hlpni_remote_timeout = + atomic_read(&lpni->lpni_hstats.hlt_remote_timeout); + lpni_hstats->hlpni_remote_error = + atomic_read(&lpni->lpni_hstats.hlt_remote_error); + lpni_hstats->hlpni_health_value = + atomic_read(&lpni->lpni_healthv); + lpni_hstats->hlpni_ping_count = lpni->lpni_ping_count; + lpni_hstats->hlpni_next_ping = lpni->lpni_next_ping; + if (copy_to_user(bulk, lpni_hstats, sizeof(*lpni_hstats))) + goto out_free_hstats; + bulk += sizeof(*lpni_hstats); + } + rc = 0; + +out_free_hstats: + LIBCFS_FREE(lpni_hstats, sizeof(*lpni_hstats)); +out_free_msg_stats: + LIBCFS_FREE(lpni_msg_stats, sizeof(*lpni_msg_stats)); +out_free_stats: + LIBCFS_FREE(lpni_stats, sizeof(*lpni_stats)); +out_free_info: + LIBCFS_FREE(lpni_info, sizeof(*lpni_info)); +out_lp_decref: + lnet_peer_decref_locked(lp); +out: + return rc; +} + +/* must hold net_lock/0 */ +void +lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni, + struct list_head *recovery_queue, + time64_t now) +{ + /* the mt could've shutdown and cleaned up the queues */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) + return; + + if (!list_empty(&lpni->lpni_recovery)) + return; + + if 
(atomic_read(&lpni->lpni_healthv) == LNET_MAX_HEALTH_VALUE) + return; + + if (!lpni->lpni_last_alive) { + CDEBUG(D_NET, + "lpni %s(%p) not eligible for recovery last alive %lld\n", + libcfs_nidstr(&lpni->lpni_nid), lpni, + lpni->lpni_last_alive); + return; + } + + if (lnet_recovery_limit && + now > lpni->lpni_last_alive + lnet_recovery_limit) { + CDEBUG(D_NET, "lpni %s aged out last alive %lld\n", + libcfs_nidstr(&lpni->lpni_nid), + lpni->lpni_last_alive); + /* Reset the ping count so that if this peer NI is added back to + * the recovery queue we will send the first ping right away. + */ + lpni->lpni_ping_count = 0; + return; + } + + /* This peer NI is going on the recovery queue, so take a ref on it */ + lnet_peer_ni_addref_locked(lpni); + + lnet_peer_ni_set_next_ping(lpni, now); + + CDEBUG(D_NET, "%s added to recovery queue. ping count: %u next ping: %lld last alive: %lld health: %d\n", + libcfs_nidstr(&lpni->lpni_nid), + lpni->lpni_ping_count, + lpni->lpni_next_ping, + lpni->lpni_last_alive, + atomic_read(&lpni->lpni_healthv)); + + list_add_tail(&lpni->lpni_recovery, recovery_queue); +} + +/* Call with the ln_api_mutex held */ +void +lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all) +{ + struct lnet_peer_table *ptable; + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + int lncpt; + int cpt; + time64_t now; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return; + + now = ktime_get_seconds(); + + if (!all) { + lnet_net_lock(LNET_LOCK_EX); + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + lnet_net_unlock(LNET_LOCK_EX); + return; + } + lnet_set_lpni_healthv_locked(lpni, value); + lnet_peer_ni_add_to_recoveryq_locked(lpni, + &the_lnet.ln_mt_peerNIRecovq, now); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(LNET_LOCK_EX); + return; + } + + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + + /* + * Walk all the peers and reset the health value for each one to the + * specified value. + */ + lnet_net_lock(LNET_LOCK_EX); + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { + list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { + list_for_each_entry(lpni, &lpn->lpn_peer_nis, + lpni_peer_nis) { + lnet_set_lpni_healthv_locked(lpni, + value); + lnet_peer_ni_add_to_recoveryq_locked(lpni, + &the_lnet.ln_mt_peerNIRecovq, now); + } + } + } + } + lnet_net_unlock(LNET_LOCK_EX); +} + diff --git a/drivers/staging/lustrefsx/lnet/lnet/router.c b/drivers/staging/lustrefsx/lnet/lnet/router.c new file mode 100644 index 0000000000000..9002cf0bcbe89 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/router.c @@ -0,0 +1,1835 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +#define LNET_NRB_TINY_MIN 512 /* min value for each CPT */ +#define LNET_NRB_TINY (LNET_NRB_TINY_MIN * 4) +#define LNET_NRB_SMALL_MIN 4096 /* min value for each CPT */ +#define LNET_NRB_SMALL (LNET_NRB_SMALL_MIN * 4) +#define LNET_NRB_SMALL_PAGES 1 +#define LNET_NRB_LARGE_MIN 256 /* min value for each CPT */ +#define LNET_NRB_LARGE (LNET_NRB_LARGE_MIN * 4) +#define LNET_NRB_LARGE_PAGES ((LNET_MTU + PAGE_SIZE - 1) >> \ + PAGE_SHIFT) + +static char *forwarding = ""; +module_param(forwarding, charp, 0444); +MODULE_PARM_DESC(forwarding, "Explicitly enable/disable forwarding between networks"); + +static int tiny_router_buffers; +module_param(tiny_router_buffers, int, 0444); +MODULE_PARM_DESC(tiny_router_buffers, "# of 0 payload messages to buffer in the router"); +static int small_router_buffers; +module_param(small_router_buffers, int, 0444); +MODULE_PARM_DESC(small_router_buffers, "# of small (1 page) messages to buffer in the router"); +static int large_router_buffers; +module_param(large_router_buffers, int, 0444); +MODULE_PARM_DESC(large_router_buffers, "# of large messages to buffer in the router"); +static int peer_buffer_credits; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# router buffer credits per peer"); + +static int auto_down = 1; +module_param(auto_down, int, 0444); +MODULE_PARM_DESC(auto_down, "Automatically mark peers down on comms error"); + +int +lnet_peer_buffer_credits(struct lnet_net *net) +{ + /* NI option overrides LNet default */ + if (net->net_tunables.lct_peer_rtr_credits > 0) + return net->net_tunables.lct_peer_rtr_credits; + if (peer_buffer_credits > 0) + return peer_buffer_credits; + + /* As an approximation, allow this peer the same number of router + * buffers as it is allowed outstanding sends */ + return net->net_tunables.lct_peer_tx_credits; +} + +static int check_routers_before_use; +module_param(check_routers_before_use, int, 0444); +MODULE_PARM_DESC(check_routers_before_use, "Assume routers are down and ping them before use"); + +int avoid_asym_router_failure = 1; +module_param(avoid_asym_router_failure, int, 0644); +MODULE_PARM_DESC(avoid_asym_router_failure, "Avoid asymmetrical router failures (0 to disable)"); + +int dead_router_check_interval = INT_MIN; +module_param(dead_router_check_interval, int, 0444); +MODULE_PARM_DESC(dead_router_check_interval, "(DEPRECATED - Use alive_router_check_interval)"); + +int live_router_check_interval = INT_MIN; +module_param(live_router_check_interval, int, 0444); +MODULE_PARM_DESC(live_router_check_interval, "(DEPRECATED - Use alive_router_check_interval)"); + +int alive_router_check_interval = 60; +module_param(alive_router_check_interval, int, 0644); +MODULE_PARM_DESC(alive_router_check_interval, "Seconds between live router health checks (<= 0 to disable)"); + +static int router_ping_timeout = 50; +module_param(router_ping_timeout, int, 0644); +MODULE_PARM_DESC(router_ping_timeout, "Seconds to wait for the reply to a router health query"); + +/* + * A value between 0 and 100. 0 meaning that even if router's interfaces + * have the worse health still consider the gateway usable. + * 100 means that at least one interface on the route's remote net is 100% + * healthy to consider the route alive. 
+ * The default is set to 100 to ensure we maintain the original behavior. + */ +unsigned int router_sensitivity_percentage = 100; +static int rtr_sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp); +static struct kernel_param_ops param_ops_rtr_sensitivity = { + .set = rtr_sensitivity_set, + .get = param_get_int, +}; +#define param_check_rtr_sensitivity(name, p) \ + __param_check(name, p, int) +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(router_sensitivity_percentage, rtr_sensitivity, S_IRUGO|S_IWUSR); +#else +module_param_call(router_sensitivity_percentage, rtr_sensitivity_set, param_get_int, + &router_sensitivity_percentage, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(router_sensitivity_percentage, + "How healthy a gateway should be to be used in percent"); + +static void lnet_add_route_to_rnet(struct lnet_remotenet *rnet, + struct lnet_route *route); +static void lnet_del_route_from_rnet(struct lnet_nid *gw_nid, + struct list_head *route_list, + struct list_head *zombies); + +static int +rtr_sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *sen = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'router_sensitivity_percentage'\n"); + return rc; + } + + if (value < 0 || value > 100) { + CERROR("Invalid value: %lu for 'router_sensitivity_percentage'\n", value); + return -EINVAL; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + *sen = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +void +lnet_move_route(struct lnet_route *route, struct lnet_peer *lp, + struct list_head *rt_list) +{ + struct lnet_remotenet *rnet; + struct list_head zombies; + struct list_head *l; + + INIT_LIST_HEAD(&zombies); + + if (rt_list) + l = rt_list; + else + l = &zombies; + + rnet = lnet_find_rnet_locked(route->lr_net); + LASSERT(rnet); + + CDEBUG(D_NET, "deleting route %s->%s\n", + libcfs_net2str(route->lr_net), + libcfs_nidstr(&route->lr_nid)); + + /* + * use the gateway's lp_primary_nid to delete the route as the + * lr_nid can be a constituent NID of the peer + */ + lnet_del_route_from_rnet( + &route->lr_gateway->lp_primary_nid, + &rnet->lrn_routes, l); + + if (lp) { + route = list_first_entry(l, struct lnet_route, + lr_list); + route->lr_gateway = lp; + lnet_add_route_to_rnet(rnet, route); + } else { + while (!list_empty(l) && !rt_list) { + route = list_first_entry(l, struct lnet_route, + lr_list); + list_del(&route->lr_list); + LIBCFS_FREE(route, sizeof(*route)); + } + } +} + +void +lnet_rtr_transfer_to_peer(struct lnet_peer *src, struct lnet_peer *target) +{ + struct lnet_route *route; + struct lnet_route *tmp, *tmp2; + + lnet_net_lock(LNET_LOCK_EX); + CDEBUG(D_NET, "transfering routes from %s -> %s\n", + libcfs_nidstr(&src->lp_primary_nid), + libcfs_nidstr(&target->lp_primary_nid)); + list_for_each_entry(route, &src->lp_routes, lr_gwlist) { + CDEBUG(D_NET, "%s: %s->%s\n", + libcfs_nidstr(&src->lp_primary_nid), + libcfs_net2str(route->lr_net), + libcfs_nidstr(&route->lr_nid)); + } + list_splice_init(&src->lp_rtrq, &target->lp_rtrq); + list_for_each_entry_safe(route, tmp, &src->lp_routes, lr_gwlist) { + struct lnet_route *r2; + bool present = false; + list_for_each_entry_safe(r2, tmp2, &target->lp_routes, lr_gwlist) { + if (route->lr_net == r2->lr_net) { + if (route->lr_priority >= r2->lr_priority) + present = true; + else if (route->lr_hops 
>= r2->lr_hops) + present = true; + else + lnet_move_route(r2, NULL, NULL); + } + } + if (present) + lnet_move_route(route, NULL, NULL); + else + lnet_move_route(route, target, NULL); + } + + if (list_empty(&target->lp_rtr_list)) { + lnet_peer_addref_locked(target); + list_add_tail(&target->lp_rtr_list, &the_lnet.ln_routers); + } + + the_lnet.ln_routers_version++; + lnet_net_unlock(LNET_LOCK_EX); +} + +int +lnet_peers_start_down(void) +{ + return check_routers_before_use; +} + +/* + * The peer_net of a gateway is alive if at least one of the peer_ni's on + * that peer_net is alive. + */ +static bool +lnet_is_gateway_net_alive(struct lnet_peer_net *lpn) +{ + struct lnet_peer_ni *lpni; + + list_for_each_entry(lpni, &lpn->lpn_peer_nis, lpni_peer_nis) { + if (lnet_is_peer_ni_alive(lpni)) + return true; + } + + return false; +} + +/* + * a gateway is alive only if all its nets are alive + * called with cpt lock held + */ +bool lnet_is_gateway_alive(struct lnet_peer *gw) +{ + struct lnet_peer_net *lpn; + + if (!gw->lp_alive) + return false; + + list_for_each_entry(lpn, &gw->lp_peer_nets, lpn_peer_nets) { + if (!lnet_is_gateway_net_alive(lpn)) + return false; + } + + return true; +} + +/* + * lnet_is_route_alive() needs to be called with cpt lock held + * A route is alive if the gateway can route between the local network and + * the remote network of the route. + * This means at least one NI is alive on each of the local and remote + * networks of the gateway. + */ +bool lnet_is_route_alive(struct lnet_route *route) +{ + struct lnet_peer *gw = route->lr_gateway; + struct lnet_peer_net *llpn; + struct lnet_peer_net *rlpn; + + /* If the gateway is down then all routes are considered down */ + if (!gw->lp_alive) + return false; + + /* + * if discovery is disabled then rely on the cached aliveness + * information. This is handicapped information which we log when + * we receive the discovery ping response. The most uptodate + * aliveness information can only be obtained when discovery is + * enabled. + */ + if (lnet_is_discovery_disabled(gw)) + return atomic_read(&route->lr_alive) == 1; + + /* + * check the gateway's interfaces on the local network + */ + llpn = lnet_peer_get_net_locked(gw, route->lr_lnet); + if (!llpn) + return false; + + if (!lnet_is_gateway_net_alive(llpn)) + return false; + + /* + * For single hop routes avoid_asym_router_failure dictates + * that the remote net must exist on the gateway. For multi-hop + * routes the next-hop will not have the remote net. + */ + if (avoid_asym_router_failure && + (route->lr_hops == 1 || route->lr_single_hop)) { + rlpn = lnet_peer_get_net_locked(gw, route->lr_net); + if (!rlpn) + return false; + if (!lnet_is_gateway_net_alive(rlpn)) + return false; + } + + spin_lock(&gw->lp_lock); + if (!(gw->lp_state & LNET_PEER_ROUTER_ENABLED)) { + spin_unlock(&gw->lp_lock); + if (gw->lp_rtr_refcount > 0) + CERROR("peer %s is being used as a gateway but routing feature is not turned on\n", + libcfs_nidstr(&gw->lp_primary_nid)); + return false; + } + spin_unlock(&gw->lp_lock); + + return true; +} + +void +lnet_consolidate_routes_locked(struct lnet_peer *orig_lp, + struct lnet_peer *new_lp) +{ + struct lnet_peer_ni *lpni; + struct lnet_route *route; + + /* + * Although a route is correlated with a peer, but when it's added + * a specific NID is used. That NID refers to a peer_ni within + * a peer. There could be other peer_nis on the same net, which + * can be used to send to that gateway. 
However when we are + * consolidating gateways because of discovery, the nid used to + * add the route might've moved between gateway peers. In this + * case we want to move the route to the new gateway as well. The + * intent here is not to confuse the user who added the route. + */ + list_for_each_entry(route, &orig_lp->lp_routes, lr_gwlist) { + lpni = lnet_peer_ni_get_locked(orig_lp, &route->lr_nid); + if (!lpni) { + lnet_net_lock(LNET_LOCK_EX); + list_move(&route->lr_gwlist, &new_lp->lp_routes); + lnet_net_unlock(LNET_LOCK_EX); + } + } +} + +static inline void +lnet_check_route_inconsistency(struct lnet_route *route) +{ + if (!route->lr_single_hop && + (route->lr_hops == 1 || route->lr_hops == LNET_UNDEFINED_HOPS)) { + CWARN("route %s->%s is detected to be multi-hop but hop count is set to %d\n", + libcfs_net2str(route->lr_net), + libcfs_nidstr(&route->lr_gateway->lp_primary_nid), + (int) route->lr_hops); + } +} + +static void +lnet_set_route_hop_type(struct lnet_peer *gw, struct lnet_route *route) +{ + struct lnet_peer_net *lpn; + bool single_hop = false; + + list_for_each_entry(lpn, &gw->lp_peer_nets, lpn_peer_nets) { + if (route->lr_net == lpn->lpn_net_id) { + single_hop = true; + break; + } + } + route->lr_single_hop = single_hop; + lnet_check_route_inconsistency(route); +} + +/* Must hold net_lock/EX */ +void +lnet_router_discovery_ping_reply(struct lnet_peer *lp) +{ + struct lnet_ping_buffer *pbuf = lp->lp_data; + struct lnet_peer_net *llpn; + struct lnet_route *route; + bool single_hop = false; + bool net_up = false; + unsigned lp_state; + __u32 net; + int i; + + + spin_lock(&lp->lp_lock); + lp_state = lp->lp_state; + + /* only handle replies if discovery is disabled. */ + if (!lnet_is_discovery_disabled_locked(lp)) { + spin_unlock(&lp->lp_lock); + return; + } + + spin_unlock(&lp->lp_lock); + + if (lp_state & LNET_PEER_PING_FAILED || + pbuf->pb_info.pi_features & LNET_PING_FEAT_RTE_DISABLED) { + CDEBUG(D_NET, "Set routes down for gw %s because %s %d\n", + libcfs_nidstr(&lp->lp_primary_nid), + lp_state & LNET_PEER_PING_FAILED ? "ping failed" : + "route feature is disabled", lp->lp_ping_error); + /* If the ping failed or the peer has routing disabled then + * mark the routes served by this peer down + */ + list_for_each_entry(route, &lp->lp_routes, lr_gwlist) + lnet_set_route_aliveness(route, false); + return; + } + + CDEBUG(D_NET, "Discovery is disabled. Processing reply for gw: %s:%d\n", + libcfs_nidstr(&lp->lp_primary_nid), pbuf->pb_info.pi_nnis); + + /* + * examine the ping response to determine if the routes on that + * gateway should be declared alive. + * The route is alive if: + * 1. local network to reach the route is alive and + * 2. 
route is single hop, avoid_asym_router_failure is set and
+	 *    there exists at least one NI on the route's remote net
+	 *    which is up
+	 */
+	list_for_each_entry(route, &lp->lp_routes, lr_gwlist) {
+		llpn = lnet_peer_get_net_locked(lp, route->lr_lnet);
+		if (!llpn) {
+			lnet_set_route_aliveness(route, false);
+			continue;
+		}
+
+		if (!lnet_is_gateway_net_alive(llpn)) {
+			lnet_set_route_aliveness(route, false);
+			continue;
+		}
+
+		single_hop = net_up = false;
+		for (i = 1; i < pbuf->pb_info.pi_nnis; i++) {
+			net = LNET_NIDNET(pbuf->pb_info.pi_ni[i].ns_nid);
+
+			if (route->lr_net == net) {
+				single_hop = true;
+				if (pbuf->pb_info.pi_ni[i].ns_status ==
+				    LNET_NI_STATUS_UP) {
+					net_up = true;
+					break;
+				}
+			}
+		}
+
+		route->lr_single_hop = single_hop;
+		if (avoid_asym_router_failure &&
+		    (route->lr_hops == 1 || route->lr_single_hop))
+			lnet_set_route_aliveness(route, net_up);
+		else
+			lnet_set_route_aliveness(route, true);
+
+		/*
+		 * warn that the route is configured as single-hop but it
+		 * really is multi-hop as far as we can tell.
+		 */
+		lnet_check_route_inconsistency(route);
+	}
+}
+
+void
+lnet_router_discovery_complete(struct lnet_peer *lp)
+{
+	struct lnet_peer_ni *lpni = NULL;
+	struct lnet_route *route;
+
+	spin_lock(&lp->lp_lock);
+	lp->lp_state &= ~LNET_PEER_RTR_DISCOVERY;
+	lp->lp_state |= LNET_PEER_RTR_DISCOVERED;
+	lp->lp_alive = lp->lp_dc_error == 0;
+	spin_unlock(&lp->lp_lock);
+
+	if (!lp->lp_dc_error) {
+		/* ping replies are being handled when discovery is disabled */
+		if (lnet_is_discovery_disabled_locked(lp))
+			return;
+
+		/*
+		 * mark single-hop routes. If the remote net is not
+		 * configured on the gateway, we assume this is intentional
+		 * and we mark the gateway as multi-hop
+		 */
+		list_for_each_entry(route, &lp->lp_routes, lr_gwlist) {
+			lnet_set_route_aliveness(route, true);
+			lnet_set_route_hop_type(lp, route);
+		}
+
+		return;
+	}
+
+	/*
+	 * We do not send messages directly to the remote interfaces
+	 * of an LNet router. As such, we rely on the PING response
+	 * to determine the up/down status of these interfaces. If
+	 * a PING response is not received, or some other problem with
+	 * discovery occurs that prevents us from getting this status,
+	 * we assume all interfaces are down until we're able to
+	 * determine otherwise.
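+	 * Marking every interface down also marks every route through
+	 * this gateway down just below, so the gateway should not be
+	 * used for routing until a later discovery round succeeds.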
+ */ + CDEBUG(D_NET, "%s: Router discovery failed %d\n", + libcfs_nidstr(&lp->lp_primary_nid), lp->lp_dc_error); + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) + lpni->lpni_ns_status = LNET_NI_STATUS_DOWN; + + list_for_each_entry(route, &lp->lp_routes, lr_gwlist) + lnet_set_route_aliveness(route, false); +} + +static void +lnet_rtr_addref_locked(struct lnet_peer *lp) +{ + LASSERT(lp->lp_rtr_refcount >= 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lp_rtr_refcount++; + if (lp->lp_rtr_refcount == 1) { + list_add_tail(&lp->lp_rtr_list, &the_lnet.ln_routers); + /* addref for the_lnet.ln_routers */ + lnet_peer_addref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +static void +lnet_rtr_decref_locked(struct lnet_peer *lp) +{ + LASSERT(atomic_read(&lp->lp_refcount) > 0); + LASSERT(lp->lp_rtr_refcount > 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lp_rtr_refcount--; + if (lp->lp_rtr_refcount == 0) { + LASSERT(list_empty(&lp->lp_routes)); + + list_del(&lp->lp_rtr_list); + /* decref for the_lnet.ln_routers */ + lnet_peer_decref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +struct lnet_remotenet * +lnet_find_rnet_locked(__u32 net) +{ + struct lnet_remotenet *rnet; + struct list_head *tmp; + struct list_head *rn_list; + + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); + + rn_list = lnet_net2rnethash(net); + list_for_each(tmp, rn_list) { + rnet = list_entry(tmp, struct lnet_remotenet, lrn_list); + + if (rnet->lrn_net == net) + return rnet; + } + return NULL; +} + +static void lnet_shuffle_seed(void) +{ + static int seeded; + struct lnet_ni *ni = NULL; + + if (seeded) + return; + + /* Nodes with small feet have little entropy + * the NID for this node gives the most entropy in the low bits */ + while ((ni = lnet_get_next_ni_locked(NULL, ni))) + add_device_randomness(&ni->ni_nid, sizeof(ni->ni_nid)); + + seeded = 1; +} + +/* NB expects LNET_LOCK held */ +static void +lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route) +{ + struct lnet_peer_net *lpn; + unsigned int offset = 0; + unsigned int len = 0; + struct list_head *e; + time64_t now; + + lnet_shuffle_seed(); + + list_for_each(e, &rnet->lrn_routes) + len++; + + /* + * Randomly adding routes to the list is done to ensure that when + * different nodes are using the same list of routers, they end up + * preferring different routers. 
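+	 * The offset below is drawn uniformly from [0, len], i.e. one of
+	 * the len + 1 possible insertion points in the existing list, so
+	 * each node ends up with its own shuffled ordering.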
+ */ + offset = get_random_u32_below(len + 1); + list_for_each(e, &rnet->lrn_routes) { + if (offset == 0) + break; + offset--; + } + list_add(&route->lr_list, e); + /* + * force a router check on the gateway to make sure the route is + * alive + */ + now = ktime_get_real_seconds(); + list_for_each_entry(lpn, &route->lr_gateway->lp_peer_nets, + lpn_peer_nets) { + lpn->lpn_next_ping = now; + } + + the_lnet.ln_remote_nets_version++; + + /* add the route on the gateway list */ + list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes); + + /* take a router reference count on the gateway */ + lnet_rtr_addref_locked(route->lr_gateway); +} + +int +lnet_add_route(__u32 net, __u32 hops, struct lnet_nid *gateway, + __u32 priority, __u32 sensitivity) +{ + struct list_head *route_entry; + struct lnet_remotenet *rnet; + struct lnet_remotenet *rnet2; + struct lnet_route *route; + struct lnet_peer_ni *lpni; + struct lnet_peer *gw; + int add_route; + int rc; + + CDEBUG(D_NET, "Add route: remote net %s hops %d priority %u gw %s\n", + libcfs_net2str(net), hops, priority, libcfs_nidstr(gateway)); + + if (LNET_NID_IS_ANY(gateway) || + nid_is_lo0(gateway) || + net == LNET_NET_ANY || + LNET_NETTYP(net) == LOLND || + LNET_NID_NET(gateway) == net || + (hops != LNET_UNDEFINED_HOPS && (hops < 1 || hops > 255))) + return -EINVAL; + + /* it's a local network */ + if (lnet_islocalnet(net)) + return -EEXIST; + + if (!lnet_islocalnet(LNET_NID_NET(gateway))) { + CERROR("Cannot add route with gateway %s. There is no local interface configured on LNet %s\n", + libcfs_nidstr(gateway), + libcfs_net2str(LNET_NID_NET(gateway))); + return -EHOSTUNREACH; + } + + /* Assume net, route, all new */ + LIBCFS_ALLOC(route, sizeof(*route)); + LIBCFS_ALLOC(rnet, sizeof(*rnet)); + if (route == NULL || rnet == NULL) { + CERROR("Out of memory creating route %s %d %s\n", + libcfs_net2str(net), hops, libcfs_nidstr(gateway)); + if (route != NULL) + LIBCFS_FREE(route, sizeof(*route)); + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + return -ENOMEM; + } + + INIT_LIST_HEAD(&rnet->lrn_routes); + rnet->lrn_net = net; + /* store the local and remote net that the route represents */ + route->lr_lnet = LNET_NID_NET(gateway); + route->lr_net = net; + route->lr_nid = *gateway; + route->lr_priority = priority; + route->lr_hops = hops; + if (lnet_peers_start_down()) + atomic_set(&route->lr_alive, 0); + else + atomic_set(&route->lr_alive, 1); + + lnet_net_lock(LNET_LOCK_EX); + + /* + * lnet_nid2peerni_ex() grabs a ref on the lpni. 
We will need to + * lose that once we're done + */ + lpni = lnet_nid2peerni_ex(gateway, LNET_LOCK_EX); + if (IS_ERR(lpni)) { + lnet_net_unlock(LNET_LOCK_EX); + + LIBCFS_FREE(route, sizeof(*route)); + LIBCFS_FREE(rnet, sizeof(*rnet)); + + rc = PTR_ERR(lpni); + CERROR("Error %d creating route %s %d %s\n", rc, + libcfs_net2str(net), hops, + libcfs_nidstr(gateway)); + return rc; + } + + LASSERT(lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer); + gw = lpni->lpni_peer_net->lpn_peer; + + route->lr_gateway = gw; + + rnet2 = lnet_find_rnet_locked(net); + if (rnet2 == NULL) { + /* new network */ + list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net)); + rnet2 = rnet; + } + + /* Search for a duplicate route (it's a NOOP if it is) */ + add_route = 1; + list_for_each(route_entry, &rnet2->lrn_routes) { + struct lnet_route *route2; + + route2 = list_entry(route_entry, struct lnet_route, lr_list); + if (route2->lr_gateway == route->lr_gateway) { + add_route = 0; + break; + } + + /* our lookups must be true */ + LASSERT(!nid_same(&route2->lr_gateway->lp_primary_nid, + gateway)); + } + + /* + * It is possible to add multiple routes through the same peer, + * but it'll be using a different NID of that peer. When the + * gateway is discovered, discovery will consolidate the different + * peers into one peer. In this case the discovery code will have + * to move the routes from the peer that's being deleted to the + * consolidated peer lp_routes list + */ + if (add_route) { + gw->lp_health_sensitivity = sensitivity; + lnet_add_route_to_rnet(rnet2, route); + if (lnet_peer_discovery_disabled) + CWARN("Consider turning discovery on to enable full Multi-Rail routing functionality\n"); + } + + /* + * get rid of the reference on the lpni. + */ + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(LNET_LOCK_EX); + + /* If avoid_asym_router_failure is enabled and hop count is not + * set to 1 for a route that is actually single-hop, then the + * feature will fail to prevent the router from being selected + * if it is missing a NI on the remote network due to misconfiguration. + */ + if (avoid_asym_router_failure && hops == LNET_UNDEFINED_HOPS) + CWARN("Use hops = 1 for a single-hop route when avoid_asym_router_failure feature is enabled\n"); + + rc = 0; + + if (!add_route) { + rc = -EEXIST; + LIBCFS_FREE(route, sizeof(*route)); + } + + if (rnet != rnet2) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + /* kick start the monitor thread to handle the added route */ + complete(&the_lnet.ln_mt_wait_complete); + + return rc; +} + +void +lnet_del_route_from_rnet(struct lnet_nid *gw_nid, + struct list_head *route_list, + struct list_head *zombies) +{ + struct lnet_peer *gateway; + struct lnet_route *route; + struct lnet_route *tmp; + + list_for_each_entry_safe(route, tmp, route_list, lr_list) { + gateway = route->lr_gateway; + if (gw_nid && !nid_same(gw_nid, &gateway->lp_primary_nid)) + continue; + + /* + * move to zombie to delete outside the lock + * Note that this function is called with the + * ln_api_mutex held as well as the exclusive net + * lock. Adding to the remote net list happens + * under the same conditions. 
Same goes for the + * gateway router list + */ + list_move(&route->lr_list, zombies); + the_lnet.ln_remote_nets_version++; + + list_del(&route->lr_gwlist); + lnet_rtr_decref_locked(gateway); + } +} + +int +lnet_del_route(__u32 net, struct lnet_nid *gw) +{ + LIST_HEAD(rnet_zombies); + struct lnet_remotenet *rnet; + struct lnet_remotenet *tmp; + struct list_head *rn_list; + struct lnet_peer_ni *lpni; + struct lnet_route *route; + struct lnet_nid gw_nid; + LIST_HEAD(zombies); + struct lnet_peer *lp = NULL; + int i = 0; + + CDEBUG(D_NET, "Del route: net %s : gw %s\n", + libcfs_net2str(net), libcfs_nidstr(gw)); + + /* NB Caller may specify either all routes via the given gateway + * or a specific route entry actual NIDs) */ + + lnet_net_lock(LNET_LOCK_EX); + + if (gw) + lpni = lnet_peer_ni_find_locked(gw); + else + lpni = NULL; + if (lpni) { + lp = lpni->lpni_peer_net->lpn_peer; + LASSERT(lp); + gw_nid = lp->lp_primary_nid; + gw = &gw_nid; + lnet_peer_ni_decref_locked(lpni); + } + + if (net != LNET_NET_ANY) { + rnet = lnet_find_rnet_locked(net); + if (!rnet) { + lnet_net_unlock(LNET_LOCK_EX); + return -ENOENT; + } + lnet_del_route_from_rnet(gw, &rnet->lrn_routes, + &zombies); + if (list_empty(&rnet->lrn_routes)) + list_move(&rnet->lrn_list, &rnet_zombies); + goto delete_zombies; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + + list_for_each_entry_safe(rnet, tmp, rn_list, lrn_list) { + lnet_del_route_from_rnet(gw, &rnet->lrn_routes, + &zombies); + if (list_empty(&rnet->lrn_routes)) + list_move(&rnet->lrn_list, &rnet_zombies); + } + } + +delete_zombies: + /* + * check if there are any routes remaining on the gateway + * If there are no more routes make sure to set the peer's + * lp_disc_net_id to 0 (invalid), in case we add more routes in + * the future on that gateway, then we start our discovery process + * from scratch + */ + if (lpni) { + if (list_empty(&lp->lp_routes)) + lp->lp_disc_net_id = 0; + } + + lnet_net_unlock(LNET_LOCK_EX); + + while (!list_empty(&zombies)) { + route = list_first_entry(&zombies, struct lnet_route, lr_list); + list_del(&route->lr_list); + LIBCFS_FREE(route, sizeof(*route)); + } + + while (!list_empty(&rnet_zombies)) { + rnet = list_first_entry(&rnet_zombies, struct lnet_remotenet, + lrn_list); + list_del(&rnet->lrn_list); + LIBCFS_FREE(rnet, sizeof(*rnet)); + } + + return 0; +} + +void +lnet_destroy_routes(void) +{ + lnet_del_route(LNET_NET_ANY, NULL); +} + +int lnet_get_rtr_pool_cfg(int cpt, struct lnet_ioctl_pool_cfg *pool_cfg) +{ + struct lnet_rtrbufpool *rbp; + int i, rc = -ENOENT, j; + + if (the_lnet.ln_rtrpools == NULL) + return rc; + + + cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) { + if (i != cpt) + continue; + + lnet_net_lock(i); + for (j = 0; j < LNET_NRBPOOLS; j++) { + pool_cfg->pl_pools[j].pl_npages = rbp[j].rbp_npages; + pool_cfg->pl_pools[j].pl_nbuffers = rbp[j].rbp_nbuffers; + pool_cfg->pl_pools[j].pl_credits = rbp[j].rbp_credits; + pool_cfg->pl_pools[j].pl_mincredits = rbp[j].rbp_mincredits; + } + lnet_net_unlock(i); + rc = 0; + break; + } + + lnet_net_lock(LNET_LOCK_EX); + pool_cfg->pl_routing = the_lnet.ln_routing; + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +int +lnet_get_route(int idx, __u32 *net, __u32 *hops, lnet_nid_t *gateway, + __u32 *flags, __u32 *priority, __u32 *sensitivity) +{ + struct lnet_remotenet *rnet; + struct list_head *rn_list; + struct lnet_route *route; + struct list_head *e1; + struct list_head *e2; + int cpt; + int i; + + cpt = lnet_net_lock_current(); 
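+	/* Remote nets are kept in ln_remote_nets_hash; walk every bucket
+	 * and every route on each remote net, counting idx down until the
+	 * requested entry is found.
+	 */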
+ + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each(e1, rn_list) { + rnet = list_entry(e1, struct lnet_remotenet, lrn_list); + + list_for_each(e2, &rnet->lrn_routes) { + route = list_entry(e2, struct lnet_route, + lr_list); + + if (idx-- == 0) { + *net = rnet->lrn_net; + *gateway = lnet_nid_to_nid4(&route->lr_nid); + *hops = route->lr_hops; + *priority = route->lr_priority; + *sensitivity = route->lr_gateway-> + lp_health_sensitivity; + if (lnet_is_route_alive(route)) + *flags |= LNET_RT_ALIVE; + else + *flags &= ~LNET_RT_ALIVE; + if (route->lr_single_hop) + *flags &= ~LNET_RT_MULTI_HOP; + else + *flags |= LNET_RT_MULTI_HOP; + lnet_net_unlock(cpt); + return 0; + } + } + } + } + + lnet_net_unlock(cpt); + return -ENOENT; +} + +static void +lnet_wait_known_routerstate(void) +{ + struct lnet_peer *rtr; + struct list_head *entry; + int all_known; + + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING); + + for (;;) { + int cpt = lnet_net_lock_current(); + + all_known = 1; + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, struct lnet_peer, + lp_rtr_list); + + spin_lock(&rtr->lp_lock); + + if ((rtr->lp_state & LNET_PEER_RTR_DISCOVERED) == 0) { + all_known = 0; + spin_unlock(&rtr->lp_lock); + break; + } + spin_unlock(&rtr->lp_lock); + } + + lnet_net_unlock(cpt); + + if (all_known) + return; + + schedule_timeout_uninterruptible(cfs_time_seconds(1)); + } +} + +static inline bool +lnet_net_set_status_locked(struct lnet_net *net, __u32 status) +{ + struct lnet_ni *ni; + bool update = false; + + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) + if (lnet_ni_set_status(ni, status)) + update = true; + + return update; +} + +static bool +lnet_update_ni_status_locked(void) +{ + struct lnet_net *net; + struct lnet_ni *ni; + bool push = false; + time64_t now; + time64_t timeout; + + LASSERT(the_lnet.ln_routing); + + timeout = router_ping_timeout + alive_router_check_interval; + + now = ktime_get_seconds(); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (net->net_lnd->lnd_type == LOLND) + continue; + + if (now < net->net_last_alive + timeout) + goto check_ni_fatal; + + spin_lock(&net->net_lock); + /* re-check with lock */ + if (now < net->net_last_alive + timeout) { + spin_unlock(&net->net_lock); + goto check_ni_fatal; + } + spin_unlock(&net->net_lock); + + /* + * if the net didn't receive any traffic for past the + * timeout on any of its constituent NIs, then mark all + * the NIs down. + */ + if (lnet_net_set_status_locked(net, LNET_NI_STATUS_DOWN)) { + push = true; + continue; + } + +check_ni_fatal: + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + /* lnet_ni_set_status() will perform the same check of + * ni_status while holding the ni lock. We can safely + * check ni_status without that lock because it is only + * written to under net_lock/EX and our caller is + * holding a net lock. + */ + if (atomic_read(&ni->ni_fatal_error_on) && + ni->ni_status && + ni->ni_status->ns_status != LNET_NI_STATUS_DOWN && + lnet_ni_set_status(ni, LNET_NI_STATUS_DOWN)) + push = true; + } + } + + return push; +} + +void lnet_wait_router_start(void) +{ + if (check_routers_before_use) { + /* Note that a helpful side-effect of pinging all known routers + * at startup is that it makes them drop stale connections they + * may have to a previous instance of me. 
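+		 * lnet_wait_known_routerstate() polls once a second
+		 * until every router on ln_routers has
+		 * LNET_PEER_RTR_DISCOVERED set.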
*/ + lnet_wait_known_routerstate(); + } +} + +/* + * This function is called from the monitor thread to check if there are + * any active routers that need to be checked. + */ +bool lnet_router_checker_active(void) +{ + /* Router Checker thread needs to run when routing is enabled in + * order to call lnet_update_ni_status_locked() */ + if (the_lnet.ln_routing) + return true; + + return !list_empty(&the_lnet.ln_routers) && + alive_router_check_interval > 0; +} + +void +lnet_check_routers(void) +{ + struct lnet_peer_net *first_lpn; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + struct list_head *entry; + struct lnet_peer *rtr; + bool push = false; + bool needs_ping; + bool found_lpn; + __u64 version; + __u32 net_id; + time64_t now; + int cpt; + int rc; + + cpt = lnet_net_lock_current(); +rescan: + version = the_lnet.ln_routers_version; + + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, struct lnet_peer, + lp_rtr_list); + + /* If we're currently discovering the peer then don't + * issue another discovery + */ + if (rtr->lp_state & LNET_PEER_RTR_DISCOVERY) + continue; + + now = ktime_get_real_seconds(); + + /* find the next local peer net which needs to be ping'd */ + needs_ping = false; + first_lpn = NULL; + found_lpn = false; + net_id = rtr->lp_disc_net_id; + do { + lpn = lnet_get_next_peer_net_locked(rtr, net_id); + if (!lpn) { + CERROR("gateway %s has no networks\n", + libcfs_nidstr(&rtr->lp_primary_nid)); + break; + } + + /* We looped back to the first peer net */ + if (first_lpn == lpn) + break; + if (!first_lpn) + first_lpn = lpn; + + net_id = lpn->lpn_net_id; + if (!lnet_islocalnet_locked(net_id)) + continue; + + found_lpn = true; + + CDEBUG(D_NET, "rtr %s(%p) %s(%p) next ping %lld\n", + libcfs_nidstr(&rtr->lp_primary_nid), rtr, + libcfs_net2str(net_id), lpn, + lpn->lpn_next_ping); + + needs_ping = now >= lpn->lpn_next_ping; + + } while (!needs_ping); + + if (!found_lpn || !lpn) { + CERROR("no local network found for gateway %s\n", + libcfs_nidstr(&rtr->lp_primary_nid)); + continue; + } + + if (!needs_ping) + continue; + + spin_lock(&rtr->lp_lock); + /* make sure we fully discover the router */ + rtr->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + rtr->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH | + LNET_PEER_RTR_DISCOVERY; + spin_unlock(&rtr->lp_lock); + + /* find the peer_ni associated with the primary NID */ + lpni = lnet_peer_get_ni_locked( + rtr, lnet_nid_to_nid4(&rtr->lp_primary_nid)); + if (!lpni) { + CDEBUG(D_NET, "Expected to find an lpni for %s, but non found\n", + libcfs_nidstr(&rtr->lp_primary_nid)); + continue; + } + lnet_peer_ni_addref_locked(lpni); + + /* specify the net to use */ + rtr->lp_disc_net_id = lpn->lpn_net_id; + + /* discover the router */ + CDEBUG(D_NET, "discover %s, cpt = %d\n", + libcfs_nidstr(&lpni->lpni_nid), cpt); + rc = lnet_discover_peer_locked(lpni, cpt, false); + + /* drop ref taken above */ + lnet_peer_ni_decref_locked(lpni); + + if (!rc) + lpn->lpn_next_ping = now + alive_router_check_interval; + else + CERROR("Failed to discover router %s\n", + libcfs_nidstr(&rtr->lp_primary_nid)); + + /* NB cpt lock was dropped in lnet_discover_peer_locked() */ + if (version != the_lnet.ln_routers_version) { + /* the routers list has changed */ + goto rescan; + } + } + + if (the_lnet.ln_routing) + push = lnet_update_ni_status_locked(); + + lnet_net_unlock(cpt); + + /* if the status of the ni changed update the peers */ + if (push) + lnet_push_update_to_peers(1); +} + +void +lnet_destroy_rtrbuf(struct lnet_rtrbuf *rb, 
int npages) +{ + int sz = offsetof(struct lnet_rtrbuf, rb_kiov[npages]); + + while (--npages >= 0) + __free_page(rb->rb_kiov[npages].bv_page); + + LIBCFS_FREE(rb, sz); +} + +static struct lnet_rtrbuf * +lnet_new_rtrbuf(struct lnet_rtrbufpool *rbp, int cpt) +{ + int npages = rbp->rbp_npages; + int sz = offsetof(struct lnet_rtrbuf, rb_kiov[npages]); + struct page *page; + struct lnet_rtrbuf *rb; + int i; + + LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz); + if (rb == NULL) + return NULL; + + rb->rb_pool = rbp; + + for (i = 0; i < npages; i++) { + page = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_KERNEL | + __GFP_ZERO | __GFP_NORETRY); + if (page == NULL) { + while (--i >= 0) + __free_page(rb->rb_kiov[i].bv_page); + + LIBCFS_FREE(rb, sz); + return NULL; + } + + rb->rb_kiov[i].bv_len = PAGE_SIZE; + rb->rb_kiov[i].bv_offset = 0; + rb->rb_kiov[i].bv_page = page; + } + + return rb; +} + +static void +lnet_rtrpool_free_bufs(struct lnet_rtrbufpool *rbp, int cpt) +{ + int npages = rbp->rbp_npages; + struct lnet_rtrbuf *rb; + LIST_HEAD(tmp); + + if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */ + return; + + lnet_net_lock(cpt); + list_splice_init(&rbp->rbp_msgs, &tmp); + lnet_drop_routed_msgs_locked(&tmp, cpt); + list_splice_init(&rbp->rbp_bufs, &tmp); + rbp->rbp_req_nbuffers = 0; + rbp->rbp_nbuffers = rbp->rbp_credits = 0; + rbp->rbp_mincredits = 0; + lnet_net_unlock(cpt); + + /* Free buffers on the free list. */ + while (!list_empty(&tmp)) { + rb = list_entry(tmp.next, struct lnet_rtrbuf, rb_list); + list_del(&rb->rb_list); + lnet_destroy_rtrbuf(rb, npages); + } +} + +static int +lnet_rtrpool_adjust_bufs(struct lnet_rtrbufpool *rbp, int nbufs, int cpt) +{ + LIST_HEAD(rb_list); + struct lnet_rtrbuf *rb; + int num_rb; + int num_buffers = 0; + int old_req_nbufs; + int npages = rbp->rbp_npages; + + lnet_net_lock(cpt); + /* If we are called for less buffers than already in the pool, we + * just lower the req_nbuffers number and excess buffers will be + * thrown away as they are returned to the free list. Credits + * then get adjusted as well. + * If we already have enough buffers allocated to serve the + * increase requested, then we can treat that the same way as we + * do the decrease. */ + num_rb = nbufs - rbp->rbp_nbuffers; + if (nbufs <= rbp->rbp_req_nbuffers || num_rb <= 0) { + rbp->rbp_req_nbuffers = nbufs; + lnet_net_unlock(cpt); + return 0; + } + /* store the older value of rbp_req_nbuffers and then set it to + * the new request to prevent lnet_return_rx_credits_locked() from + * freeing buffers that we need to keep around */ + old_req_nbufs = rbp->rbp_req_nbuffers; + rbp->rbp_req_nbuffers = nbufs; + lnet_net_unlock(cpt); + + /* allocate the buffers on a local list first. If all buffers are + * allocated successfully then join this list to the rbp buffer + * list. If not then free all allocated buffers. */ + while (num_rb-- > 0) { + rb = lnet_new_rtrbuf(rbp, cpt); + if (rb == NULL) { + CERROR("lnet: error allocating %ux%u page router buffers on CPT %u: rc = %d\n", + nbufs, npages, cpt, -ENOMEM); + + lnet_net_lock(cpt); + rbp->rbp_req_nbuffers = old_req_nbufs; + lnet_net_unlock(cpt); + + goto failed; + } + + list_add(&rb->rb_list, &rb_list); + num_buffers++; + } + + lnet_net_lock(cpt); + + list_splice_tail(&rb_list, &rbp->rbp_bufs); + rbp->rbp_nbuffers += num_buffers; + rbp->rbp_credits += num_buffers; + rbp->rbp_mincredits = rbp->rbp_credits; + /* We need to schedule blocked msg using the newly + * added buffers. 
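+	 * lnet_schedule_blocked_locked() is called for as long as both a
+	 * free buffer and a queued message remain.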
*/ + while (!list_empty(&rbp->rbp_bufs) && + !list_empty(&rbp->rbp_msgs)) + lnet_schedule_blocked_locked(rbp); + + lnet_net_unlock(cpt); + + return 0; + +failed: + while (!list_empty(&rb_list)) { + rb = list_entry(rb_list.next, struct lnet_rtrbuf, rb_list); + list_del(&rb->rb_list); + lnet_destroy_rtrbuf(rb, npages); + } + + return -ENOMEM; +} + +static void +lnet_rtrpool_init(struct lnet_rtrbufpool *rbp, int npages) +{ + INIT_LIST_HEAD(&rbp->rbp_msgs); + INIT_LIST_HEAD(&rbp->rbp_bufs); + + rbp->rbp_npages = npages; + rbp->rbp_credits = 0; + rbp->rbp_mincredits = 0; +} + +void +lnet_rtrpools_free(int keep_pools) +{ + struct lnet_rtrbufpool *rtrp; + int i; + + if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */ + return; + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_free_bufs(&rtrp[LNET_TINY_BUF_IDX], i); + lnet_rtrpool_free_bufs(&rtrp[LNET_SMALL_BUF_IDX], i); + lnet_rtrpool_free_bufs(&rtrp[LNET_LARGE_BUF_IDX], i); + } + + if (!keep_pools) { + cfs_percpt_free(the_lnet.ln_rtrpools); + the_lnet.ln_rtrpools = NULL; + } +} + +static int +lnet_nrb_tiny_calculate(void) +{ + int nrbs = LNET_NRB_TINY; + + if (tiny_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "tiny_router_buffers=%d invalid when " + "routing enabled\n", tiny_router_buffers); + return -EINVAL; + } + + if (tiny_router_buffers > 0) + nrbs = tiny_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_TINY_MIN); +} + +static int +lnet_nrb_small_calculate(void) +{ + int nrbs = LNET_NRB_SMALL; + + if (small_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "small_router_buffers=%d invalid when " + "routing enabled\n", small_router_buffers); + return -EINVAL; + } + + if (small_router_buffers > 0) + nrbs = small_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_SMALL_MIN); +} + +static int +lnet_nrb_large_calculate(void) +{ + int nrbs = LNET_NRB_LARGE; + + if (large_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "large_router_buffers=%d invalid when " + "routing enabled\n", large_router_buffers); + return -EINVAL; + } + + if (large_router_buffers > 0) + nrbs = large_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_LARGE_MIN); +} + +int +lnet_rtrpools_alloc(int im_a_router) +{ + struct lnet_rtrbufpool *rtrp; + int nrb_tiny; + int nrb_small; + int nrb_large; + int rc; + int i; + + if (!strcmp(forwarding, "")) { + /* not set either way */ + if (!im_a_router) + return 0; + } else if (!strcmp(forwarding, "disabled")) { + /* explicitly disabled */ + return 0; + } else if (!strcmp(forwarding, "enabled")) { + /* explicitly enabled */ + } else { + rc = -EINVAL; + LCONSOLE_ERROR_MSG(0x10b, + "lnet: forwarding='%s' not set to either 'enabled' or 'disabled': rc = %d\n", + forwarding, rc); + return rc; + } + + nrb_tiny = lnet_nrb_tiny_calculate(); + if (nrb_tiny < 0) + return -EINVAL; + + nrb_small = lnet_nrb_small_calculate(); + if (nrb_small < 0) + return -EINVAL; + + nrb_large = lnet_nrb_large_calculate(); + if (nrb_large < 0) + return -EINVAL; + + the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(), + LNET_NRBPOOLS * + sizeof(struct lnet_rtrbufpool)); + if (the_lnet.ln_rtrpools == NULL) { + rc = -ENOMEM; + LCONSOLE_ERROR_MSG(0x10c, + "lnet: error allocating router buffer pool: rc = %d\n", + rc); + return rc; + } + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_init(&rtrp[LNET_TINY_BUF_IDX], 0); + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_TINY_BUF_IDX], + nrb_tiny, i); + if (rc) + goto failed; + + 
lnet_rtrpool_init(&rtrp[LNET_SMALL_BUF_IDX], + LNET_NRB_SMALL_PAGES); + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_SMALL_BUF_IDX], + nrb_small, i); + if (rc) + goto failed; + + lnet_rtrpool_init(&rtrp[LNET_LARGE_BUF_IDX], + LNET_NRB_LARGE_PAGES); + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_LARGE_BUF_IDX], + nrb_large, i); + if (rc) + goto failed; + } + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_routing = 1; + lnet_net_unlock(LNET_LOCK_EX); + complete(&the_lnet.ln_mt_wait_complete); + return 0; + + failed: + lnet_rtrpools_free(0); + return rc; +} + +static int +lnet_rtrpools_adjust_helper(int tiny, int small, int large) +{ + int nrb = 0; + int rc = 0; + int i; + struct lnet_rtrbufpool *rtrp; + + /* If the provided values for each buffer pool are different than the + * configured values, we need to take action. */ + if (tiny >= 0) { + tiny_router_buffers = tiny; + nrb = lnet_nrb_tiny_calculate(); + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_TINY_BUF_IDX], + nrb, i); + if (rc != 0) + return rc; + } + } + if (small >= 0) { + small_router_buffers = small; + nrb = lnet_nrb_small_calculate(); + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_SMALL_BUF_IDX], + nrb, i); + if (rc != 0) + return rc; + } + } + if (large >= 0) { + large_router_buffers = large; + nrb = lnet_nrb_large_calculate(); + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_LARGE_BUF_IDX], + nrb, i); + if (rc != 0) + return rc; + } + } + + return 0; +} + +int +lnet_rtrpools_adjust(int tiny, int small, int large) +{ + /* this function doesn't revert the changes if adding new buffers + * failed. It's up to the user space caller to revert the + * changes. */ + + if (!the_lnet.ln_routing) + return 0; + + return lnet_rtrpools_adjust_helper(tiny, small, large); +} + +int +lnet_rtrpools_enable(void) +{ + int rc = 0; + + if (the_lnet.ln_routing) + return 0; + + if (the_lnet.ln_rtrpools == NULL) + /* If routing is turned off, and we have never + * initialized the pools before, just call the + * standard buffer pool allocation routine as + * if we are just configuring this for the first + * time. */ + rc = lnet_rtrpools_alloc(1); + else + rc = lnet_rtrpools_adjust_helper(0, 0, 0); + if (rc != 0) + return rc; + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_routing = 1; + + the_lnet.ln_ping_target->pb_info.pi_features &= + ~LNET_PING_FEAT_RTE_DISABLED; + lnet_net_unlock(LNET_LOCK_EX); + + if (lnet_peer_discovery_disabled) + CWARN("Consider turning discovery on to enable full " + "Multi-Rail routing functionality\n"); + + return rc; +} + +void +lnet_rtrpools_disable(void) +{ + if (!the_lnet.ln_routing) + return; + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_routing = 0; + the_lnet.ln_ping_target->pb_info.pi_features |= + LNET_PING_FEAT_RTE_DISABLED; + + tiny_router_buffers = 0; + small_router_buffers = 0; + large_router_buffers = 0; + lnet_net_unlock(LNET_LOCK_EX); + lnet_rtrpools_free(1); +} + +static inline void +lnet_notify_peer_down(struct lnet_ni *ni, struct lnet_nid *nid) +{ + if (ni->ni_net->net_lnd->lnd_notify_peer_down != NULL) + (ni->ni_net->net_lnd->lnd_notify_peer_down)(nid); +} + +/* + * ni: local NI used to communicate with the peer + * nid: peer NID + * alive: true if peer is alive, false otherwise + * reset: reset health value. This is requested by the LND. + * when: notificaiton time. 
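+ * Return: 0 on success or when the notification is ignored (peer NID not
+ *         found, or auto-down disabled), -EINVAL if the NID is on a
+ *         different net than ni or the notification time is in the future,
+ *         -ESHUTDOWN if LNet is shutting down.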
+ */ +int +lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset, + time64_t when) +{ + struct lnet_peer_ni *lpni = NULL; + struct lnet_route *route; + struct lnet_peer *lp; + time64_t now = ktime_get_seconds(); + int cpt; + + LASSERT(!in_interrupt()); + + CDEBUG(D_NET, "%s notifying %s: %s\n", + (ni == NULL) ? "userspace" : libcfs_nidstr(&ni->ni_nid), + libcfs_nid2str(nid), alive ? "up" : "down"); + + if (ni != NULL && + LNET_NID_NET(&ni->ni_nid) != LNET_NIDNET(nid)) { + CWARN("Ignoring notification of %s %s by %s (different net)\n", + libcfs_nid2str(nid), alive ? "birth" : "death", + libcfs_nidstr(&ni->ni_nid)); + return -EINVAL; + } + + /* can't do predictions... */ + if (when > now) { + CWARN("Ignoring prediction from %s of %s %s %lld seconds in the future\n", + ni ? libcfs_nidstr(&ni->ni_nid) : "userspace", + libcfs_nid2str(nid), alive ? "up" : "down", when - now); + return -EINVAL; + } + + if (ni != NULL && !alive && /* LND telling me she's down */ + !auto_down) { /* auto-down disabled */ + CDEBUG(D_NET, "Auto-down disabled\n"); + return 0; + } + + /* must lock 0 since this is used for synchronization */ + lnet_net_lock(0); + + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + lnet_net_unlock(0); + return -ESHUTDOWN; + } + + lpni = lnet_find_peer_ni_locked(nid); + if (lpni == NULL) { + /* nid not found */ + lnet_net_unlock(0); + CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid)); + return 0; + } + + if (alive) { + if (reset) { + lpni->lpni_ns_status = LNET_NI_STATUS_UP; + lnet_set_lpni_healthv_locked(lpni, + LNET_MAX_HEALTH_VALUE); + } else { + __u32 sensitivity = lpni->lpni_peer_net-> + lpn_peer->lp_health_sensitivity; + + lnet_inc_lpni_healthv_locked(lpni, + (sensitivity) ? sensitivity : + lnet_health_sensitivity); + } + } else if (reset) { + lpni->lpni_ns_status = LNET_NI_STATUS_DOWN; + } + + /* recalculate aliveness */ + alive = lnet_is_peer_ni_alive(lpni); + + lp = lpni->lpni_peer_net->lpn_peer; + /* If this is an LNet router then update route aliveness */ + if (lp->lp_rtr_refcount) { + if (reset) + /* reset flag indicates gateway peer went up or down */ + lp->lp_alive = alive; + + /* If discovery is disabled, locally or on the gateway, then + * any routes using lpni as next-hop need to be updated + * + * NB: We can get many notifications while a route is down, so + * we try and avoid the expensive net_lock/EX here for the + * common case of receiving duplicate lnet_notify() calls (i.e. + * only grab EX lock when we actually need to update the route + * aliveness). + */ + if (lnet_is_discovery_disabled(lp)) { + list_for_each_entry(route, &lp->lp_routes, lr_gwlist) { + if (nid_same(&route->lr_nid, &lpni->lpni_nid)) + lnet_set_route_aliveness(route, alive); + } + } + } + + lnet_net_unlock(0); + + if (ni != NULL && !alive) + lnet_notify_peer_down(ni, &lpni->lpni_nid); + + cpt = lpni->lpni_cpt; + lnet_net_lock(cpt); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + + return 0; +} +EXPORT_SYMBOL(lnet_notify); diff --git a/drivers/staging/lustrefsx/lnet/lnet/router_proc.c b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c new file mode 100644 index 0000000000000..926891481d641 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c @@ -0,0 +1,902 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +#include +#include + +/* This is really lnet_proc.c. You might need to update sanity test 215 + * if any file format is changed. */ + +#define LNET_LOFFT_BITS (sizeof(loff_t) * 8) +/* + * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system + */ +#define LNET_PROC_CPT_BITS (LNET_CPT_BITS + 1) +/* change version, 16 bits or 8 bits */ +#define LNET_PROC_VER_BITS \ + clamp_t(int, LNET_LOFFT_BITS / 4, 8, 16) + +#define LNET_PROC_HASH_BITS LNET_PEER_HASH_BITS +/* + * bits for peer hash offset + * NB: we don't use the highest bit of *ppos because it's signed + */ +#define LNET_PROC_HOFF_BITS (LNET_LOFFT_BITS - \ + LNET_PROC_CPT_BITS - \ + LNET_PROC_VER_BITS - \ + LNET_PROC_HASH_BITS - 1) +/* bits for hash index + position */ +#define LNET_PROC_HPOS_BITS (LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS) +/* bits for peer hash table + hash version */ +#define LNET_PROC_VPOS_BITS (LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS) + +#define LNET_PROC_CPT_MASK ((1ULL << LNET_PROC_CPT_BITS) - 1) +#define LNET_PROC_VER_MASK ((1ULL << LNET_PROC_VER_BITS) - 1) +#define LNET_PROC_HASH_MASK ((1ULL << LNET_PROC_HASH_BITS) - 1) +#define LNET_PROC_HOFF_MASK ((1ULL << LNET_PROC_HOFF_BITS) - 1) + +#define LNET_PROC_CPT_GET(pos) \ + (int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK) + +#define LNET_PROC_VER_GET(pos) \ + (int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK) + +#define LNET_PROC_HASH_GET(pos) \ + (int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK) + +#define LNET_PROC_HOFF_GET(pos) \ + (int)((pos) & LNET_PROC_HOFF_MASK) + +#define LNET_PROC_POS_MAKE(cpt, ver, hash, off) \ + (((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) | \ + ((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) | \ + ((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \ + ((off) & LNET_PROC_HOFF_MASK)) + +#define LNET_PROC_VERSION(v) ((unsigned int)((v) & LNET_PROC_VER_MASK)) + +static int proc_lnet_stats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc; + struct lnet_counters *ctrs; + struct lnet_counters_common common; + size_t nob = *lenp; + loff_t pos = *ppos; + int len; + char tmpstr[256]; /* 7 %u and 4 u64 */ + + if (write) { + lnet_counters_reset(); + return 0; + } + + /* read */ + + LIBCFS_ALLOC(ctrs, sizeof(*ctrs)); + if (ctrs == NULL) + return -ENOMEM; + + rc = lnet_counters_get(ctrs); + if (rc) + goto out_no_ctrs; + + common = ctrs->lct_common; + + len = scnprintf(tmpstr, sizeof(tmpstr), + "%u %u %u %u %u %u %u %llu %llu " + "%llu %llu", + common.lcc_msgs_alloc, common.lcc_msgs_max, + common.lcc_errors, + common.lcc_send_count, common.lcc_recv_count, + common.lcc_route_count, common.lcc_drop_count, + common.lcc_send_length, common.lcc_recv_length, + 
common.lcc_route_length, common.lcc_drop_length); + + if (pos >= len) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); +out_no_ctrs: + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return rc; +} + +static int +proc_lnet_routes(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + const int tmpsiz = 256; + char *tmpstr; + char *s; + int rc = 0; + int len; + int ver; + int off; + + BUILD_BUG_ON(sizeof(loff_t) < 4); + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT(!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += scnprintf(s, tmpstr + tmpsiz - s, "Routing %s\n", + the_lnet.ln_routing ? "enabled" : "disabled"); + LASSERT(tmpstr + tmpsiz - s > 0); + + s += scnprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %8s %7s %s\n", + "net", "hops", "priority", "state", "router"); + LASSERT(tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_remote_nets_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *n; + struct list_head *r; + struct lnet_route *route = NULL; + struct lnet_remotenet *rnet = NULL; + int skip = off - 1; + struct list_head *rn_list; + int i; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) { + lnet_net_unlock(0); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL; + i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + + n = rn_list->next; + + while (n != rn_list && route == NULL) { + rnet = list_entry(n, struct lnet_remotenet, + lrn_list); + + r = rnet->lrn_routes.next; + + while (r != &rnet->lrn_routes) { + struct lnet_route *re = + list_entry(r, struct lnet_route, + lr_list); + if (skip == 0) { + route = re; + break; + } + + skip--; + r = r->next; + } + + n = n->next; + } + } + + if (route != NULL) { + __u32 net = rnet->lrn_net; + __u32 hops = route->lr_hops; + unsigned int priority = route->lr_priority; + int alive = lnet_is_route_alive(route); + + s += scnprintf(s, tmpstr + tmpsiz - s, + "%-8s %4d %8u %7s %s\n", + libcfs_net2str(net), hops, + priority, + alive ? 
"up" : "down", + libcfs_nidstr(&route->lr_nid)); + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int +proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int rc = 0; + char *tmpstr; + char *s; + const int tmpsiz = 256; + int len; + int ver; + int off; + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT(!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += scnprintf(s, tmpstr + tmpsiz - s, + "%-4s %7s %5s %s\n", + "ref", "rtr_ref", "alive", "router"); + LASSERT(tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_routers_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *r; + struct lnet_peer *peer = NULL; + int skip = off - 1; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) { + lnet_net_unlock(0); + + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + r = the_lnet.ln_routers.next; + + while (r != &the_lnet.ln_routers) { + struct lnet_peer *lp = + list_entry(r, struct lnet_peer, + lp_rtr_list); + + if (skip == 0) { + peer = lp; + break; + } + + skip--; + r = r->next; + } + + if (peer != NULL) { + struct lnet_nid *nid = &peer->lp_primary_nid; + int nrefs = atomic_read(&peer->lp_refcount); + int nrtrrefs = peer->lp_rtr_refcount; + int alive = lnet_is_gateway_alive(peer); + + s += scnprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %5s %s\n", + nrefs, nrtrrefs, + alive ? "up" : "down", + libcfs_nidstr(nid)); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +/* TODO: there should be no direct access to ptable. 
We should add a set + * of APIs that give access to the ptable and its members */ +static int +proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + const int tmpsiz = 256; + struct lnet_peer_table *ptable; + char *tmpstr = NULL; + char *s; + int cpt = LNET_PROC_CPT_GET(*ppos); + int ver = LNET_PROC_VER_GET(*ppos); + int hash = LNET_PROC_HASH_GET(*ppos); + int hoff = LNET_PROC_HOFF_GET(*ppos); + int rc = 0; + int len; + + if (write) { + int i; + struct lnet_peer_ni *peer; + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(i); + for (hash = 0; hash < LNET_PEER_HASH_SIZE; hash++) { + list_for_each_entry(peer, + &ptable->pt_hash[hash], + lpni_hashlist) { + peer->lpni_mintxcredits = + peer->lpni_txcredits; + peer->lpni_minrtrcredits = + peer->lpni_rtrcredits; + } + } + lnet_net_unlock(i); + } + *ppos += *lenp; + return 0; + } + + if (*lenp == 0) + return 0; + + BUILD_BUG_ON(LNET_PROC_HASH_BITS < LNET_PEER_HASH_BITS); + + if (cpt >= LNET_CPT_NUMBER) { + *lenp = 0; + return 0; + } + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += scnprintf(s, tmpstr + tmpsiz - s, + "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n", + "nid", "refs", "state", "last", "max", + "rtr", "min", "tx", "min", "queue"); + LASSERT(tmpstr + tmpsiz - s > 0); + + hoff++; + } else { + struct lnet_peer_ni *peer; + struct list_head *p; + int skip; + + again: + p = NULL; + peer = NULL; + skip = hoff - 1; + + lnet_net_lock(cpt); + ptable = the_lnet.ln_peer_tables[cpt]; + if (hoff == 1) + ver = LNET_PROC_VERSION(ptable->pt_version); + + if (ver != LNET_PROC_VERSION(ptable->pt_version)) { + lnet_net_unlock(cpt); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + while (hash < LNET_PEER_HASH_SIZE) { + if (p == NULL) + p = ptable->pt_hash[hash].next; + + while (p != &ptable->pt_hash[hash]) { + struct lnet_peer_ni *lp = + list_entry(p, struct lnet_peer_ni, + lpni_hashlist); + if (skip == 0) { + peer = lp; + + /* minor optimization: start from idx+1 + * on next iteration if we've just + * drained lpni_hashlist */ + if (lp->lpni_hashlist.next == + &ptable->pt_hash[hash]) { + hoff = 1; + hash++; + } else { + hoff++; + } + + break; + } + + skip--; + p = lp->lpni_hashlist.next; + } + + if (peer != NULL) + break; + + p = NULL; + hoff = 1; + hash++; + } + + if (peer != NULL) { + struct lnet_nid nid = peer->lpni_nid; + int nrefs = kref_read(&peer->lpni_kref); + time64_t lastalive = -1; + char *aliveness = "NA"; + int maxcr = (peer->lpni_net) ? + peer->lpni_net->net_tunables.lct_peer_tx_credits : 0; + int txcr = peer->lpni_txcredits; + int mintxcr = peer->lpni_mintxcredits; + int rtrcr = peer->lpni_rtrcredits; + int minrtrcr = peer->lpni_minrtrcredits; + int txqnob = peer->lpni_txqnob; + + if (lnet_isrouter(peer) || + lnet_peer_aliveness_enabled(peer)) + aliveness = lnet_is_peer_ni_alive(peer) ? 
+ "up" : "down"; + + lnet_net_unlock(cpt); + + s += scnprintf(s, tmpstr + tmpsiz - s, + "%-24s %4d %5s %5lld %5d %5d %5d %5d %5d %d\n", + libcfs_nidstr(&nid), nrefs, aliveness, + lastalive, maxcr, rtrcr, minrtrcr, txcr, + mintxcr, txqnob); + LASSERT(tmpstr + tmpsiz - s > 0); + + } else { /* peer is NULL */ + lnet_net_unlock(cpt); + } + + if (hash == LNET_PEER_HASH_SIZE) { + cpt++; + hash = 0; + hoff = 1; + if (peer == NULL && cpt < LNET_CPT_NUMBER) + goto again; + } + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff); + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int proc_lnet_buffers(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + size_t nob = *lenp; + loff_t pos = *ppos; + char *s; + char *tmpstr; + int tmpsiz; + int idx; + int len; + int rc; + int i; + + LASSERT(!write); + + /* (4 %d) * 4 * LNET_CPT_NUMBER */ + tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER; + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + s += scnprintf(s, tmpstr + tmpsiz - s, + "%5s %5s %7s %7s\n", + "pages", "count", "credits", "min"); + LASSERT(tmpstr + tmpsiz - s > 0); + + if (the_lnet.ln_rtrpools == NULL) + goto out; /* I'm not a router */ + + for (idx = 0; idx < LNET_NRBPOOLS; idx++) { + struct lnet_rtrbufpool *rbp; + + lnet_net_lock(LNET_LOCK_EX); + cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) { + s += scnprintf(s, tmpstr + tmpsiz - s, + "%5d %5d %7d %7d\n", + rbp[idx].rbp_npages, + rbp[idx].rbp_nbuffers, + rbp[idx].rbp_credits, + rbp[idx].rbp_mincredits); + LASSERT(tmpstr + tmpsiz - s > 0); + } + lnet_net_unlock(LNET_LOCK_EX); + } + + out: + len = s - tmpstr; + + if (pos >= min_t(int, len, strlen(tmpstr))) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, NULL); + + LIBCFS_FREE(tmpstr, tmpsiz); + return rc; +} + +static int +proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int tmpsiz = 128 * LNET_CPT_NUMBER; + int rc = 0; + char *tmpstr; + char *s; + int len; + + if (*lenp == 0) + return 0; + + if (write) { + /* Just reset the min stat. 
*/ + struct lnet_ni *ni; + struct lnet_net *net; + + lnet_net_lock(0); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + struct lnet_tx_queue *tq; + int i; + int j; + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + for (j = 0; ni->ni_cpts != NULL && + j < ni->ni_ncpts; j++) { + if (i == ni->ni_cpts[j]) + break; + } + + if (j == ni->ni_ncpts) + continue; + + if (i != 0) + lnet_net_lock(i); + tq->tq_credits_min = tq->tq_credits; + if (i != 0) + lnet_net_unlock(i); + } + } + } + lnet_net_unlock(0); + *ppos += *lenp; + return 0; + } + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += scnprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n", + "nid", "status", "alive", "refs", "peer", + "rtr", "max", "tx", "min"); + LASSERT (tmpstr + tmpsiz - s > 0); + } else { + struct lnet_ni *ni = NULL; + int skip = *ppos - 1; + + lnet_net_lock(0); + + ni = lnet_get_ni_idx_locked(skip); + + if (ni != NULL) { + struct lnet_tx_queue *tq; + char *stat; + time64_t now = ktime_get_seconds(); + time64_t last_alive = -1; + int i; + int j; + + if (the_lnet.ln_routing) + last_alive = now - ni->ni_net->net_last_alive; + + lnet_ni_lock(ni); + LASSERT(ni->ni_status != NULL); + stat = (lnet_ni_get_status_locked(ni) == + LNET_NI_STATUS_UP) ? "up" : "down"; + lnet_ni_unlock(ni); + + /* @lo forever alive */ + if (ni->ni_net->net_lnd->lnd_type == LOLND) { + last_alive = 0; + stat = "up"; + } + + /* we actually output credits information for + * TX queue of each partition */ + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + for (j = 0; ni->ni_cpts != NULL && + j < ni->ni_ncpts; j++) { + if (i == ni->ni_cpts[j]) + break; + } + + if (j == ni->ni_ncpts) + continue; + + if (i != 0) + lnet_net_lock(i); + + s += scnprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5lld %4d %4d %4d %5d %5d %5d\n", + libcfs_nidstr(&ni->ni_nid), stat, + last_alive, *ni->ni_refs[i], + ni->ni_net->net_tunables.lct_peer_tx_credits, + ni->ni_net->net_tunables.lct_peer_rtr_credits, + tq->tq_credits_max, + tq->tq_credits, tq->tq_credits_min); + if (i != 0) + lnet_net_unlock(i); + } + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos += 1; + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +struct lnet_portal_rotors { + int pr_value; + const char *pr_name; + const char *pr_desc; +}; + +static struct lnet_portal_rotors portal_rotors[] = { + { + .pr_value = LNET_PTL_ROTOR_OFF, + .pr_name = "OFF", + .pr_desc = "Turn off message rotor for wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_ON, + .pr_name = "ON", + .pr_desc = "round-robin dispatch all PUT messages for " + "wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_RR_RT, + .pr_name = "RR_RT", + .pr_desc = "round-robin dispatch routed PUT message for " + "wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_HASH_RT, + .pr_name = "HASH_RT", + .pr_desc = "dispatch routed PUT message by hashing source " + "NID for wildcard portals" + }, + { + .pr_value = -1, + .pr_name = NULL, + .pr_desc = NULL + }, +}; + +static int proc_lnet_portal_rotor(struct ctl_table *table, int write, + void __user 
*buffer, size_t *lenp, + loff_t *ppos) +{ + const int buf_len = 128; + size_t nob = *lenp; + loff_t pos = *ppos; + char *buf; + char *tmp; + int rc; + int i; + + if (!write) { + LIBCFS_ALLOC(buf, buf_len); + if (buf == NULL) + return -ENOMEM; + + lnet_res_lock(0); + + for (i = 0; portal_rotors[i].pr_value >= 0; i++) { + if (portal_rotors[i].pr_value == portal_rotor) + break; + } + + LASSERT(portal_rotors[i].pr_value == portal_rotor); + lnet_res_unlock(0); + + rc = scnprintf(buf, buf_len, + "{\n\tportals: all\n" + "\trotor: %s\n\tdescription: %s\n}", + portal_rotors[i].pr_name, + portal_rotors[i].pr_desc); + + if (pos >= min_t(int, rc, buf_len)) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + buf + pos, "\n"); + } + LIBCFS_FREE(buf, buf_len); + + return rc; + } + + buf = memdup_user_nul(buffer, nob); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + tmp = strim(buf); + + rc = -EINVAL; + lnet_res_lock(0); + for (i = 0; portal_rotors[i].pr_name != NULL; i++) { + if (strncasecmp(portal_rotors[i].pr_name, tmp, + strlen(portal_rotors[i].pr_name)) == 0) { + portal_rotor = portal_rotors[i].pr_value; + rc = 0; + break; + } + } + lnet_res_unlock(0); + kfree(buf); + + return rc; +} + +static struct ctl_table lnet_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. + */ + { + .procname = "stats", + .mode = 0644, + .proc_handler = &proc_lnet_stats, + }, + { + .procname = "routes", + .mode = 0444, + .proc_handler = &proc_lnet_routes, + }, + { + .procname = "routers", + .mode = 0444, + .proc_handler = &proc_lnet_routers, + }, + { + .procname = "peers", + .mode = 0644, + .proc_handler = &proc_lnet_peers, + }, + { + .procname = "buffers", + .mode = 0444, + .proc_handler = &proc_lnet_buffers, + }, + { + .procname = "nis", + .mode = 0644, + .proc_handler = &proc_lnet_nis, + }, + { + .procname = "portal_rotor", + .mode = 0644, + .proc_handler = &proc_lnet_portal_rotor, + }, + { + .procname = "lnet_lnd_timeout", + .data = &lnet_lnd_timeout, + .maxlen = sizeof(lnet_lnd_timeout), + .mode = 0444, + .proc_handler = &debugfs_doint, + }, + { .procname = NULL } +}; + +void lnet_router_debugfs_init(void) +{ + lnet_insert_debugfs(lnet_table); +} + +void lnet_router_debugfs_fini(void) +{ + lnet_remove_debugfs(lnet_table); +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/udsp.c b/drivers/staging/lustrefsx/lnet/lnet/udsp.c new file mode 100644 index 0000000000000..08c1a7fcccc0d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/udsp.c @@ -0,0 +1,1557 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * Copyright (c) 2018-2020 Data Direct Networks. + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * lnet/lnet/udsp.c + * + * User Defined Selection Policies (UDSP) are introduced to add + * ability of fine traffic control. The policies are instantiated + * on LNet constructs and allow preference of some constructs + * over others as an extension of the selection algorithm. + * The order of operation is defined by the selection algorithm logical flow: + * + * 1. Iterate over all the networks that a peer can be reached on + * and select the best local network + * - The remote network with the highest priority is examined + * (Network Rule) + * - The local network with the highest priority is selected + * (Network Rule) + * - The local NI with the highest priority is selected + * (NID Rule) + * 2. If the peer is a remote peer and has no local networks, + * - then select the remote peer network with the highest priority + * (Network Rule) + * - Select the highest priority remote peer_ni on the network selected + * (NID Rule) + * - Now that the peer's network and NI are decided, select the router + * in round robin from the peer NI's preferred router list. + * (Router Rule) + * - Select the highest priority local NI on the local net of the + * selected route. + * (NID Rule) + * 3. Otherwise for local peers, select the peer_ni from the peer. + * - highest priority peer NI is selected + * (NID Rule) + * - Select the peer NI which has the local NI selected on its + * preferred list. + * (NID Pair Rule) + * + * Accordingly, the User Interface allows for the following: + * - Adding a local network udsp: if multiple local networks are + * available, each one can have a priority. + * - Adding a local NID udsp: after a local network is chosen, + * if there are multiple NIs, each one can have a priority. + * - Adding a remote NID udsp: assign priority to a peer NID. + * - Adding a NID pair udsp: allows to specify local NIDs + * to be added on the list on the specified peer NIs + * When selecting a peer NI, the one with the + * local NID being used on its list is preferred. + * - Adding a Router udsp: similar to the NID pair udsp. + * Specified router NIDs are added on the list on the specified peer NIs. + * When sending to a remote peer, remote net is selected and the peer NID + * is selected. The router which has its nid on the peer NI list + * is preferred. + * - Deleting a udsp: use the specified policy index to remove it + * from the policy list. 
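+ *
+ * For example, a rule that matches on both --dst and --src (see the
+ * syntax below) is a NID pair rule: local NIDs matching --src are added
+ * to the preferred list of each peer NI matching --dst, and at selection
+ * time a peer NI whose preferred list contains the chosen local NI is
+ * preferred.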
+ * + * Generally, the syntax is as follows + * lnetctl policy + * --src: ip2nets syntax specifying the local NID to match + * --dst: ip2nets syntax specifying the remote NID to match + * --rte: ip2nets syntax specifying the router NID to match + * --priority: Priority to apply to rule matches + * --idx: Index of where to insert or delete the rule + * By default add appends to the end of the rule list + * + * Author: Amir Shehata + */ + +#include + +#include +#include + +struct udsp_info { + struct lnet_peer_ni *udi_lpni; + struct lnet_peer_net *udi_lpn; + struct lnet_ni *udi_ni; + struct lnet_net *udi_net; + struct lnet_ud_nid_descr *udi_match; + struct lnet_ud_nid_descr *udi_action; + __u32 udi_priority; + enum lnet_udsp_action_type udi_type; + bool udi_local; + bool udi_revert; +}; + +typedef int (*udsp_apply_rule)(struct udsp_info *); + +enum udsp_apply { + UDSP_APPLY_ON_PEERS = 0, + UDSP_APPLY_PRIO_ON_NIS = 1, + UDSP_APPLY_RTE_ON_NETS = 2, + UDSP_APPLY_MAX_ENUM = 3, +}; + +#define RULE_NOT_APPLICABLE -1 + +static inline bool +lnet_udsp_is_net_rule(struct lnet_ud_nid_descr *match) +{ + return list_empty(&match->ud_addr_range); +} + +static bool +lnet_udsp_expr_list_equal(struct list_head *e1, + struct list_head *e2) +{ + struct cfs_expr_list *expr1; + struct cfs_expr_list *expr2; + struct cfs_range_expr *range1, *range2; + + if (list_empty(e1) && list_empty(e2)) + return true; + + if (lnet_get_list_len(e1) != lnet_get_list_len(e2)) + return false; + + expr2 = list_first_entry(e2, struct cfs_expr_list, el_link); + + list_for_each_entry(expr1, e1, el_link) { + if (lnet_get_list_len(&expr1->el_exprs) != + lnet_get_list_len(&expr2->el_exprs)) + return false; + + range2 = list_first_entry(&expr2->el_exprs, + struct cfs_range_expr, + re_link); + + list_for_each_entry(range1, &expr1->el_exprs, re_link) { + if (range1->re_lo != range2->re_lo || + range1->re_hi != range2->re_hi || + range1->re_stride != range2->re_stride) + return false; + range2 = list_next_entry(range2, re_link); + } + expr2 = list_next_entry(expr2, el_link); + } + + return true; +} + +static bool +lnet_udsp_nid_descr_equal(struct lnet_ud_nid_descr *e1, + struct lnet_ud_nid_descr *e2) +{ + if (e1->ud_net_id.udn_net_type != e2->ud_net_id.udn_net_type || + !lnet_udsp_expr_list_equal(&e1->ud_net_id.udn_net_num_range, + &e2->ud_net_id.udn_net_num_range) || + !lnet_udsp_expr_list_equal(&e1->ud_addr_range, &e2->ud_addr_range)) + return false; + + return true; +} + +static bool +lnet_udsp_action_equal(struct lnet_udsp *e1, struct lnet_udsp *e2) +{ + if (e1->udsp_action_type != e2->udsp_action_type) + return false; + + if (e1->udsp_action_type == EN_LNET_UDSP_ACTION_PRIORITY && + e1->udsp_action.udsp_priority != e2->udsp_action.udsp_priority) + return false; + + return true; +} + +static bool +lnet_udsp_equal(struct lnet_udsp *e1, struct lnet_udsp *e2) +{ + /* check each NID descr */ + if (!lnet_udsp_nid_descr_equal(&e1->udsp_src, &e2->udsp_src) || + !lnet_udsp_nid_descr_equal(&e1->udsp_dst, &e2->udsp_dst) || + !lnet_udsp_nid_descr_equal(&e1->udsp_rte, &e2->udsp_rte)) + return false; + + return true; +} + +/* it is enough to look at the net type of the descriptor. 
If the criteria + * is present the net must be specified + */ +static inline bool +lnet_udsp_criteria_present(struct lnet_ud_nid_descr *descr) +{ + return (descr->ud_net_id.udn_net_type != 0); +} + +static int +lnet_udsp_apply_rule_on_ni(struct udsp_info *udi) +{ + int rc; + struct lnet_ni *ni = udi->udi_ni; + struct lnet_ud_nid_descr *ni_match = udi->udi_match; + __u32 priority = (udi->udi_revert) ? -1 : udi->udi_priority; + + rc = cfs_match_nid_net( + &ni->ni_nid, + ni_match->ud_net_id.udn_net_type, + &ni_match->ud_net_id.udn_net_num_range, + &ni_match->ud_addr_range); + if (!rc) + return 0; + + CDEBUG(D_NET, "apply udsp on ni %s\n", + libcfs_nidstr(&ni->ni_nid)); + + /* Detected match. Set NIDs priority */ + lnet_ni_set_sel_priority_locked(ni, priority); + + return 0; +} + +static int +lnet_udsp_apply_rte_list_on_net(struct lnet_net *net, + struct lnet_ud_nid_descr *rte_action, + bool revert) +{ + struct lnet_remotenet *rnet; + struct list_head *rn_list; + struct lnet_route *route; + struct lnet_peer_ni *lpni; + bool cleared = false; + struct lnet_nid *gw_nid, *gw_prim_nid; + int rc = 0; + int i; + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each_entry(rnet, rn_list, lrn_list) { + list_for_each_entry(route, &rnet->lrn_routes, lr_list) { + /* look if gw nid on the same net matches */ + gw_prim_nid = + &route->lr_gateway->lp_primary_nid; + lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(route->lr_gateway, + NULL, + lpni)) != NULL) { + if (!lnet_get_net_locked(lpni->lpni_peer_net->lpn_net_id)) + continue; + gw_nid = &lpni->lpni_nid; + rc = cfs_match_nid_net( + gw_nid, + rte_action->ud_net_id.udn_net_type, + &rte_action->ud_net_id.udn_net_num_range, + &rte_action->ud_addr_range); + if (rc) + break; + } + /* match gw primary nid on a remote network */ + if (!rc) { + gw_nid = gw_prim_nid; + rc = cfs_match_nid_net( + gw_nid, + rte_action->ud_net_id.udn_net_type, + &rte_action->ud_net_id.udn_net_num_range, + &rte_action->ud_addr_range); + } + if (!rc) + continue; + lnet_net_unlock(LNET_LOCK_EX); + if (!cleared || revert) { + lnet_net_clr_pref_rtrs(net); + cleared = true; + if (revert) { + lnet_net_lock(LNET_LOCK_EX); + continue; + } + } + /* match. 
Add to pref NIDs */ + CDEBUG(D_NET, "udsp net->gw: %s->%s\n", + libcfs_net2str(net->net_id), + libcfs_nidstr(gw_prim_nid)); + rc = lnet_net_add_pref_rtr(net, gw_prim_nid); + lnet_net_lock(LNET_LOCK_EX); + /* success if EEXIST return */ + if (rc && rc != -EEXIST) { + CERROR("Failed to add %s to %s pref rtr list\n", + libcfs_nidstr(gw_prim_nid), + libcfs_net2str(net->net_id)); + return rc; + } + } + } + } + + return rc; +} + +static int +lnet_udsp_apply_rte_rule_on_nets(struct udsp_info *udi) +{ + int rc = 0; + int last_failure = 0; + struct lnet_net *net; + struct lnet_ud_nid_descr *match = udi->udi_match; + struct lnet_ud_nid_descr *rte_action = udi->udi_action; + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (LNET_NETTYP(net->net_id) != match->ud_net_id.udn_net_type) + continue; + + rc = cfs_match_net(net->net_id, + match->ud_net_id.udn_net_type, + &match->ud_net_id.udn_net_num_range); + if (!rc) + continue; + + CDEBUG(D_NET, "apply rule on %s\n", + libcfs_net2str(net->net_id)); + rc = lnet_udsp_apply_rte_list_on_net(net, rte_action, + udi->udi_revert); + if (rc) + last_failure = rc; + } + + return last_failure; +} + +static int +lnet_udsp_apply_rte_rule_on_net(struct udsp_info *udi) +{ + int rc = 0; + struct lnet_net *net = udi->udi_net; + struct lnet_ud_nid_descr *match = udi->udi_match; + struct lnet_ud_nid_descr *rte_action = udi->udi_action; + + rc = cfs_match_net(net->net_id, + match->ud_net_id.udn_net_type, + &match->ud_net_id.udn_net_num_range); + if (!rc) + return 0; + + CDEBUG(D_NET, "apply rule on %s\n", + libcfs_net2str(net->net_id)); + rc = lnet_udsp_apply_rte_list_on_net(net, rte_action, + udi->udi_revert); + + return rc; +} + +static int +lnet_udsp_apply_prio_rule_on_net(struct udsp_info *udi) +{ + int rc; + struct lnet_ud_nid_descr *match = udi->udi_match; + struct lnet_net *net = udi->udi_net; + __u32 priority = (udi->udi_revert) ? 
-1 : udi->udi_priority; + + if (!lnet_udsp_is_net_rule(match)) + return RULE_NOT_APPLICABLE; + + rc = cfs_match_net(net->net_id, + match->ud_net_id.udn_net_type, + &match->ud_net_id.udn_net_num_range); + if (!rc) + return 0; + + CDEBUG(D_NET, "apply rule on %s\n", + libcfs_net2str(net->net_id)); + + lnet_net_set_sel_priority_locked(net, priority); + + return 0; +} + +static int +lnet_udsp_apply_rule_on_nis(struct udsp_info *udi) +{ + int rc = 0; + struct lnet_ni *ni; + struct lnet_net *net; + struct lnet_ud_nid_descr *ni_match = udi->udi_match; + int last_failure = 0; + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (LNET_NETTYP(net->net_id) != ni_match->ud_net_id.udn_net_type) + continue; + + udi->udi_net = net; + if (!lnet_udsp_apply_prio_rule_on_net(udi)) + continue; + + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + udi->udi_ni = ni; + rc = lnet_udsp_apply_rule_on_ni(udi); + if (rc) + last_failure = rc; + } + } + + return last_failure; +} + +static int +lnet_udsp_apply_rte_list_on_lpni(struct lnet_peer_ni *lpni, + struct lnet_ud_nid_descr *rte_action, + bool revert) +{ + struct lnet_remotenet *rnet; + struct list_head *rn_list; + struct lnet_route *route; + bool cleared = false; + struct lnet_nid *gw_nid; + int rc = 0; + int i; + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each_entry(rnet, rn_list, lrn_list) { + list_for_each_entry(route, &rnet->lrn_routes, lr_list) { + gw_nid = &route->lr_gateway->lp_primary_nid; + rc = cfs_match_nid_net( + gw_nid, + rte_action->ud_net_id.udn_net_type, + &rte_action->ud_net_id.udn_net_num_range, + &rte_action->ud_addr_range); + if (!rc) + continue; + lnet_net_unlock(LNET_LOCK_EX); + if (!cleared || revert) { + CDEBUG(D_NET, "%spref rtr nids from lpni %s\n", + (revert) ? "revert " : "clear ", + libcfs_nidstr(&lpni->lpni_nid)); + lnet_peer_clr_pref_rtrs(lpni); + cleared = true; + if (revert) { + lnet_net_lock(LNET_LOCK_EX); + continue; + } + } + CDEBUG(D_NET, "add gw nid %s as preferred for peer %s\n", + libcfs_nidstr(gw_nid), + libcfs_nidstr(&lpni->lpni_nid)); + /* match. Add to pref NIDs */ + rc = lnet_peer_add_pref_rtr(lpni, gw_nid); + lnet_net_lock(LNET_LOCK_EX); + /* success if EEXIST return */ + if (rc && rc != -EEXIST) { + CERROR("Failed to add %s to %s pref rtr list\n", + libcfs_nidstr(gw_nid), + libcfs_nidstr(&lpni->lpni_nid)); + return rc; + } + } + } + } + + return rc; +} + +static int +lnet_udsp_apply_ni_list(struct lnet_peer_ni *lpni, + struct lnet_ud_nid_descr *ni_action, + bool revert) +{ + int rc = 0; + struct lnet_ni *ni; + struct lnet_net *net; + bool cleared = false; + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (LNET_NETTYP(net->net_id) != ni_action->ud_net_id.udn_net_type) + continue; + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + rc = cfs_match_nid_net( + &ni->ni_nid, + ni_action->ud_net_id.udn_net_type, + &ni_action->ud_net_id.udn_net_num_range, + &ni_action->ud_addr_range); + if (!rc) + continue; + lnet_net_unlock(LNET_LOCK_EX); + if (!cleared || revert) { + lnet_peer_clr_pref_nids(lpni); + CDEBUG(D_NET, "%spref nids from lpni %s\n", + (revert) ? "revert " : "clear ", + libcfs_nidstr(&lpni->lpni_nid)); + cleared = true; + if (revert) { + lnet_net_lock(LNET_LOCK_EX); + continue; + } + } + CDEBUG(D_NET, "add nid %s as preferred for peer %s\n", + libcfs_nidstr(&ni->ni_nid), + libcfs_nidstr(&lpni->lpni_nid)); + /* match. 
Add to pref NIDs */ + rc = lnet_peer_add_pref_nid(lpni, &ni->ni_nid); + lnet_net_lock(LNET_LOCK_EX); + /* success if EEXIST return */ + if (rc && rc != -EEXIST) { + CERROR("Failed to add %s to %s pref nid list\n", + libcfs_nidstr(&ni->ni_nid), + libcfs_nidstr(&lpni->lpni_nid)); + return rc; + } + } + } + + return rc; +} + +static int +lnet_udsp_apply_rule_on_lpni(struct udsp_info *udi) +{ + int rc; + struct lnet_peer_ni *lpni = udi->udi_lpni; + struct lnet_ud_nid_descr *lp_match = udi->udi_match; + struct lnet_ud_nid_descr *action = udi->udi_action; + __u32 priority = (udi->udi_revert) ? -1 : udi->udi_priority; + bool local = udi->udi_local; + enum lnet_udsp_action_type type = udi->udi_type; + + rc = cfs_match_nid_net( + &lpni->lpni_nid, + lp_match->ud_net_id.udn_net_type, + &lp_match->ud_net_id.udn_net_num_range, + &lp_match->ud_addr_range); + + /* check if looking for a net match */ + if (!rc && + (lnet_get_list_len(&lp_match->ud_addr_range) || + !cfs_match_net(udi->udi_lpn->lpn_net_id, + lp_match->ud_net_id.udn_net_type, + &lp_match->ud_net_id.udn_net_num_range))) { + return 0; + } + + if (type == EN_LNET_UDSP_ACTION_PREFERRED_LIST && local) { + rc = lnet_udsp_apply_ni_list(lpni, action, + udi->udi_revert); + if (rc) + return rc; + } else if (type == EN_LNET_UDSP_ACTION_PREFERRED_LIST && + !local) { + rc = lnet_udsp_apply_rte_list_on_lpni(lpni, action, + udi->udi_revert); + if (rc) + return rc; + } else { + lnet_peer_ni_set_selection_priority(lpni, priority); + } + + return 0; +} + +static int +lnet_udsp_apply_rule_on_lpn(struct udsp_info *udi) +{ + int rc; + struct lnet_ud_nid_descr *match = udi->udi_match; + struct lnet_peer_net *lpn = udi->udi_lpn; + __u32 priority = (udi->udi_revert) ? -1 : udi->udi_priority; + + if (udi->udi_type == EN_LNET_UDSP_ACTION_PREFERRED_LIST || + !lnet_udsp_is_net_rule(match)) + return RULE_NOT_APPLICABLE; + + rc = cfs_match_net(lpn->lpn_net_id, + match->ud_net_id.udn_net_type, + &match->ud_net_id.udn_net_num_range); + if (!rc) + return 0; + + CDEBUG(D_NET, "apply rule on lpn %s\n", + libcfs_net2str(lpn->lpn_net_id)); + lnet_peer_net_set_sel_priority_locked(lpn, priority); + + return 0; +} + +static int +lnet_udsp_apply_rule_on_lpnis(struct udsp_info *udi) +{ + /* iterate over all the peers in the system and find if any of the + * peers match the criteria. 
If they do, clear the preferred list + * and add the new list + */ + int lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + struct lnet_ud_nid_descr *lp_match = udi->udi_match; + struct lnet_peer_table *ptable; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + struct lnet_peer *lp; + int last_failure = 0; + int cpt; + int rc; + + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { + CDEBUG(D_NET, "udsp examining lp %s\n", + libcfs_nidstr(&lp->lp_primary_nid)); + list_for_each_entry(lpn, + &lp->lp_peer_nets, + lpn_peer_nets) { + CDEBUG(D_NET, "udsp examining lpn %s\n", + libcfs_net2str(lpn->lpn_net_id)); + + if (LNET_NETTYP(lpn->lpn_net_id) != + lp_match->ud_net_id.udn_net_type) + continue; + + udi->udi_lpn = lpn; + + if (!lnet_udsp_apply_rule_on_lpn(udi)) + continue; + + list_for_each_entry(lpni, + &lpn->lpn_peer_nis, + lpni_peer_nis) { + CDEBUG(D_NET, "udsp examining lpni %s\n", + libcfs_nidstr(&lpni->lpni_nid)); + udi->udi_lpni = lpni; + rc = lnet_udsp_apply_rule_on_lpni(udi); + if (rc) + last_failure = rc; + } + } + } + } + + return last_failure; +} + +static int +lnet_udsp_apply_single_policy(struct lnet_udsp *udsp, struct udsp_info *udi, + udsp_apply_rule *cbs) +{ + int rc; + + if (lnet_udsp_criteria_present(&udsp->udsp_dst) && + lnet_udsp_criteria_present(&udsp->udsp_src)) { + /* NID Pair rule */ + if (!cbs[UDSP_APPLY_ON_PEERS]) + return 0; + + if (udsp->udsp_action_type != + EN_LNET_UDSP_ACTION_PREFERRED_LIST) { + CERROR("Bad action type. Expected %d got %d\n", + EN_LNET_UDSP_ACTION_PREFERRED_LIST, + udsp->udsp_action_type); + return 0; + } + udi->udi_match = &udsp->udsp_dst; + udi->udi_action = &udsp->udsp_src; + udi->udi_type = EN_LNET_UDSP_ACTION_PREFERRED_LIST; + udi->udi_local = true; + + CDEBUG(D_NET, "applying udsp (%p) dst->src\n", + udsp); + rc = cbs[UDSP_APPLY_ON_PEERS](udi); + if (rc) + return rc; + } else if (lnet_udsp_criteria_present(&udsp->udsp_dst) && + lnet_udsp_criteria_present(&udsp->udsp_rte)) { + /* Router rule */ + if (!cbs[UDSP_APPLY_ON_PEERS]) + return 0; + + if (udsp->udsp_action_type != + EN_LNET_UDSP_ACTION_PREFERRED_LIST) { + CERROR("Bad action type. Expected %d got %d\n", + EN_LNET_UDSP_ACTION_PREFERRED_LIST, + udsp->udsp_action_type); + return 0; + } + + if (lnet_udsp_criteria_present(&udsp->udsp_src)) { + CERROR("only one of src or dst can be specified\n"); + return 0; + } + udi->udi_match = &udsp->udsp_dst; + udi->udi_action = &udsp->udsp_rte; + udi->udi_type = EN_LNET_UDSP_ACTION_PREFERRED_LIST; + udi->udi_local = false; + + CDEBUG(D_NET, "applying udsp (%p) dst->rte\n", + udsp); + rc = cbs[UDSP_APPLY_ON_PEERS](udi); + if (rc) + return rc; + } else if (lnet_udsp_criteria_present(&udsp->udsp_dst)) { + /* destination priority rule */ + if (!cbs[UDSP_APPLY_ON_PEERS]) + return 0; + + if (udsp->udsp_action_type != + EN_LNET_UDSP_ACTION_PRIORITY) { + CERROR("Bad action type. 
Expected %d got %d\n", + EN_LNET_UDSP_ACTION_PRIORITY, + udsp->udsp_action_type); + return 0; + } + udi->udi_match = &udsp->udsp_dst; + udi->udi_type = EN_LNET_UDSP_ACTION_PRIORITY; + if (udsp->udsp_action_type != + EN_LNET_UDSP_ACTION_PRIORITY) { + udi->udi_priority = 0; + } else { + udi->udi_priority = udsp->udsp_action.udsp_priority; + } + udi->udi_local = true; + + CDEBUG(D_NET, "applying udsp (%p) on destination\n", + udsp); + rc = cbs[UDSP_APPLY_ON_PEERS](udi); + if (rc) + return rc; + } else if (lnet_udsp_criteria_present(&udsp->udsp_src)) { + /* source priority rule */ + if (!cbs[UDSP_APPLY_PRIO_ON_NIS]) + return 0; + + if (udsp->udsp_action_type != + EN_LNET_UDSP_ACTION_PRIORITY) { + CERROR("Bad action type. Expected %d got %d\n", + EN_LNET_UDSP_ACTION_PRIORITY, + udsp->udsp_action_type); + return 0; + } + udi->udi_match = &udsp->udsp_src; + udi->udi_type = EN_LNET_UDSP_ACTION_PRIORITY; + if (udsp->udsp_action_type != + EN_LNET_UDSP_ACTION_PRIORITY) { + udi->udi_priority = 0; + } else { + udi->udi_priority = udsp->udsp_action.udsp_priority; + } + udi->udi_local = true; + + CDEBUG(D_NET, "applying udsp (%p) on source\n", + udsp); + rc = cbs[UDSP_APPLY_PRIO_ON_NIS](udi); + } else { + CERROR("Bad UDSP policy\n"); + return 0; + } + + return 0; +} + +static int +lnet_udsp_apply_policies_helper(struct lnet_udsp *udsp, struct udsp_info *udi, + udsp_apply_rule *cbs) +{ + int rc; + int last_failure = 0; + + if (udsp) + return lnet_udsp_apply_single_policy(udsp, udi, cbs); + + list_for_each_entry_reverse(udsp, + &the_lnet.ln_udsp_list, + udsp_on_list) { + rc = lnet_udsp_apply_single_policy(udsp, udi, cbs); + if (rc) + last_failure = rc; + } + + return last_failure; +} + +int +lnet_udsp_apply_policies_on_ni(struct lnet_ni *ni) +{ + struct udsp_info udi; + udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {NULL}; + + memset(&udi, 0, sizeof(udi)); + + udi.udi_ni = ni; + + cbs[UDSP_APPLY_PRIO_ON_NIS] = lnet_udsp_apply_rule_on_ni; + + return lnet_udsp_apply_policies_helper(NULL, &udi, cbs); +} + +int +lnet_udsp_apply_policies_on_net(struct lnet_net *net) +{ + struct udsp_info udi; + udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {NULL}; + + memset(&udi, 0, sizeof(udi)); + + udi.udi_net = net; + + cbs[UDSP_APPLY_PRIO_ON_NIS] = lnet_udsp_apply_prio_rule_on_net; + cbs[UDSP_APPLY_RTE_ON_NETS] = lnet_udsp_apply_rte_rule_on_net; + + return lnet_udsp_apply_policies_helper(NULL, &udi, cbs); +} + +int +lnet_udsp_apply_policies_on_lpni(struct lnet_peer_ni *lpni) +{ + struct udsp_info udi; + udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {NULL}; + + memset(&udi, 0, sizeof(udi)); + + udi.udi_lpni = lpni; + + cbs[UDSP_APPLY_ON_PEERS] = lnet_udsp_apply_rule_on_lpni; + + return lnet_udsp_apply_policies_helper(NULL, &udi, cbs); +} + +int +lnet_udsp_apply_policies_on_lpn(struct lnet_peer_net *lpn) +{ + struct udsp_info udi; + udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {NULL}; + + memset(&udi, 0, sizeof(udi)); + + udi.udi_lpn = lpn; + + cbs[UDSP_APPLY_ON_PEERS] = lnet_udsp_apply_rule_on_lpn; + + return lnet_udsp_apply_policies_helper(NULL, &udi, cbs); +} + +int +lnet_udsp_apply_policies(struct lnet_udsp *udsp, bool revert) +{ + int rc; + struct udsp_info udi; + udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {NULL}; + + memset(&udi, 0, sizeof(udi)); + + cbs[UDSP_APPLY_ON_PEERS] = lnet_udsp_apply_rule_on_lpnis; + cbs[UDSP_APPLY_PRIO_ON_NIS] = lnet_udsp_apply_rule_on_nis; + cbs[UDSP_APPLY_RTE_ON_NETS] = lnet_udsp_apply_rte_rule_on_nets; + + udi.udi_revert = revert; + + lnet_net_lock(LNET_LOCK_EX); + rc = 
lnet_udsp_apply_policies_helper(udsp, &udi, cbs); + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +struct lnet_udsp * +lnet_udsp_get_policy(int idx) +{ + int i = 0; + struct lnet_udsp *udsp = NULL; + bool found = false; + + CDEBUG(D_NET, "Get UDSP at idx = %d\n", idx); + + if (idx < 0) + return NULL; + + list_for_each_entry(udsp, &the_lnet.ln_udsp_list, udsp_on_list) { + CDEBUG(D_NET, "iterating over upsp %d:%d:%d\n", + udsp->udsp_idx, i, idx); + if (i == idx) { + found = true; + break; + } + i++; + } + + CDEBUG(D_NET, "Found UDSP (%p)\n", udsp); + + if (!found) + return NULL; + + return udsp; +} + +int +lnet_udsp_add_policy(struct lnet_udsp *new, int idx) +{ + struct lnet_udsp *udsp; + struct lnet_udsp *insert = NULL; + int i = 0; + + list_for_each_entry(udsp, &the_lnet.ln_udsp_list, udsp_on_list) { + CDEBUG(D_NET, "found udsp i = %d:%d, idx = %d\n", + i, udsp->udsp_idx, idx); + if (i == idx) { + insert = udsp; + new->udsp_idx = idx; + } + i++; + if (lnet_udsp_equal(udsp, new)) { + if (!lnet_udsp_action_equal(udsp, new) && + udsp->udsp_action_type == EN_LNET_UDSP_ACTION_PRIORITY && + new->udsp_action_type == EN_LNET_UDSP_ACTION_PRIORITY) { + udsp->udsp_action.udsp_priority = new->udsp_action.udsp_priority; + CDEBUG(D_NET, "udsp: %p index %d updated priority to %d\n", + udsp, + udsp->udsp_idx, + udsp->udsp_action.udsp_priority); + return 0; + } + return -EALREADY; + } + } + + if (insert) { + list_add(&new->udsp_on_list, insert->udsp_on_list.prev); + i = 0; + list_for_each_entry(udsp, + &the_lnet.ln_udsp_list, + udsp_on_list) { + if (i <= idx) { + i++; + continue; + } + udsp->udsp_idx++; + } + } else { + list_add_tail(&new->udsp_on_list, &the_lnet.ln_udsp_list); + new->udsp_idx = i; + } + + CDEBUG(D_NET, "udsp: %p added at index %d\n", new, new->udsp_idx); + + CDEBUG(D_NET, "udsp list:\n"); + list_for_each_entry(udsp, &the_lnet.ln_udsp_list, udsp_on_list) + CDEBUG(D_NET, "udsp %p:%d\n", udsp, udsp->udsp_idx); + + return 0; +} + +int +lnet_udsp_del_policy(int idx) +{ + struct lnet_udsp *udsp; + struct lnet_udsp *tmp; + bool removed = false; + + if (idx < 0) { + lnet_udsp_destroy(false); + return 0; + } + + CDEBUG(D_NET, "del udsp at idx = %d\n", idx); + + list_for_each_entry_safe(udsp, + tmp, + &the_lnet.ln_udsp_list, + udsp_on_list) { + if (removed) + udsp->udsp_idx--; + if (udsp->udsp_idx == idx && !removed) { + list_del_init(&udsp->udsp_on_list); + lnet_udsp_apply_policies(udsp, true); + lnet_udsp_free(udsp); + removed = true; + } + } + + return 0; +} + +static void +lnet_udsp_get_ni_info(struct lnet_ioctl_construct_udsp_info *info, + struct lnet_ni *ni) +{ + struct lnet_nid_list *ne; + struct lnet_net *net = ni->ni_net; + int i = 0; + + LASSERT(ni); + + info->cud_nid_priority = ni->ni_sel_priority; + if (net) { + info->cud_net_priority = ni->ni_net->net_sel_priority; + list_for_each_entry(ne, &net->net_rtr_pref_nids, nl_list) { + if (i < LNET_MAX_SHOW_NUM_NID) + info->cud_pref_rtr_nid[i] = + lnet_nid_to_nid4(&ne->nl_nid); + else + break; + i++; + } + } +} + +static void +lnet_udsp_get_peer_info(struct lnet_ioctl_construct_udsp_info *info, + struct lnet_peer_ni *lpni) +{ + struct lnet_nid_list *ne; + int i = 0; + + /* peer tree structure needs to be in existence */ + LASSERT(lpni && lpni->lpni_peer_net && + lpni->lpni_peer_net->lpn_peer); + + info->cud_nid_priority = lpni->lpni_sel_priority; + CDEBUG(D_NET, "lpni %s has %d pref nids\n", + libcfs_nidstr(&lpni->lpni_nid), + lpni->lpni_pref_nnids); + if (lpni->lpni_pref_nnids == 1) { + info->cud_pref_nid[0] = 
lnet_nid_to_nid4(&lpni->lpni_pref.nid); + } else if (lpni->lpni_pref_nnids > 1) { + struct list_head *list = &lpni->lpni_pref.nids; + + list_for_each_entry(ne, list, nl_list) { + if (i < LNET_MAX_SHOW_NUM_NID) + info->cud_pref_nid[i] = + lnet_nid_to_nid4(&ne->nl_nid); + else + break; + i++; + } + } + + i = 0; + list_for_each_entry(ne, &lpni->lpni_rtr_pref_nids, nl_list) { + if (i < LNET_MAX_SHOW_NUM_NID) + info->cud_pref_rtr_nid[i] = + lnet_nid_to_nid4(&ne->nl_nid); + else + break; + i++; + } + + info->cud_net_priority = lpni->lpni_peer_net->lpn_sel_priority; +} + +void +lnet_udsp_get_construct_info(struct lnet_ioctl_construct_udsp_info *info) +{ + struct lnet_ni *ni; + struct lnet_peer_ni *lpni; + + lnet_net_lock(0); + if (!info->cud_peer) { + ni = lnet_nid2ni_locked(info->cud_nid, 0); + if (ni) + lnet_udsp_get_ni_info(info, ni); + } else { + lpni = lnet_find_peer_ni_locked(info->cud_nid); + if (!lpni) { + CDEBUG(D_NET, "nid %s is not found\n", + libcfs_nid2str(info->cud_nid)); + } else { + lnet_udsp_get_peer_info(info, lpni); + lnet_peer_ni_decref_locked(lpni); + } + } + lnet_net_unlock(0); +} + +struct lnet_udsp * +lnet_udsp_alloc(void) +{ + struct lnet_udsp *udsp; + + udsp = kmem_cache_alloc(lnet_udsp_cachep, GFP_NOFS | __GFP_ZERO); + + if (!udsp) + return NULL; + + INIT_LIST_HEAD(&udsp->udsp_on_list); + INIT_LIST_HEAD(&udsp->udsp_src.ud_addr_range); + INIT_LIST_HEAD(&udsp->udsp_src.ud_net_id.udn_net_num_range); + INIT_LIST_HEAD(&udsp->udsp_dst.ud_addr_range); + INIT_LIST_HEAD(&udsp->udsp_dst.ud_net_id.udn_net_num_range); + INIT_LIST_HEAD(&udsp->udsp_rte.ud_addr_range); + INIT_LIST_HEAD(&udsp->udsp_rte.ud_net_id.udn_net_num_range); + + CDEBUG(D_MALLOC, "udsp alloc %p\n", udsp); + return udsp; +} + +static void +lnet_udsp_nid_descr_free(struct lnet_ud_nid_descr *nid_descr) +{ + struct list_head *net_range = &nid_descr->ud_net_id.udn_net_num_range; + + if (!lnet_udsp_criteria_present(nid_descr)) + return; + + /* memory management is a bit tricky here. When we allocate the + * memory to store the NID descriptor we allocate a large buffer + * for all the data, so we need to free the entire buffer at + * once. 
If the net is present the net_range->next points to that + * buffer otherwise if the ud_addr_range is present then it's the + * ud_addr_range.next + */ + if (!list_empty(net_range)) + LIBCFS_FREE(net_range->next, nid_descr->ud_mem_size); + else if (!list_empty(&nid_descr->ud_addr_range)) + LIBCFS_FREE(nid_descr->ud_addr_range.next, + nid_descr->ud_mem_size); +} + +void +lnet_udsp_free(struct lnet_udsp *udsp) +{ + lnet_udsp_nid_descr_free(&udsp->udsp_src); + lnet_udsp_nid_descr_free(&udsp->udsp_dst); + lnet_udsp_nid_descr_free(&udsp->udsp_rte); + + CDEBUG(D_MALLOC, "udsp free %p\n", udsp); + kmem_cache_free(lnet_udsp_cachep, udsp); +} + +void +lnet_udsp_destroy(bool shutdown) +{ + struct lnet_udsp *udsp, *tmp; + + CDEBUG(D_NET, "Destroying UDSPs in the system\n"); + + list_for_each_entry_safe(udsp, tmp, &the_lnet.ln_udsp_list, + udsp_on_list) { + list_del(&udsp->udsp_on_list); + if (!shutdown) + lnet_udsp_apply_policies(udsp, true); + lnet_udsp_free(udsp); + } +} + +static size_t +lnet_size_marshaled_nid_descr(struct lnet_ud_nid_descr *descr) +{ + struct cfs_expr_list *expr; + int expr_count = 0; + int range_count = 0; + size_t size = sizeof(struct lnet_ioctl_udsp_descr); + + if (!lnet_udsp_criteria_present(descr)) + return size; + + /* we always have one net expression */ + if (!list_empty(&descr->ud_net_id.udn_net_num_range)) { + expr = list_first_entry(&descr->ud_net_id.udn_net_num_range, + struct cfs_expr_list, el_link); + + /* count the number of cfs_range_expr in the net expression */ + range_count = lnet_get_list_len(&expr->el_exprs); + } + + /* count the number of cfs_range_expr in the address expressions */ + list_for_each_entry(expr, &descr->ud_addr_range, el_link) { + expr_count++; + range_count += lnet_get_list_len(&expr->el_exprs); + } + + size += (sizeof(struct lnet_expressions) * expr_count); + size += (sizeof(struct lnet_range_expr) * range_count); + + return size; +} + +size_t +lnet_get_udsp_size(struct lnet_udsp *udsp) +{ + size_t size = sizeof(struct lnet_ioctl_udsp); + + size += lnet_size_marshaled_nid_descr(&udsp->udsp_src); + size += lnet_size_marshaled_nid_descr(&udsp->udsp_dst); + size += lnet_size_marshaled_nid_descr(&udsp->udsp_rte); + + CDEBUG(D_NET, "get udsp (%p) size: %d\n", udsp, (int)size); + + return size; +} + +static int +copy_exprs(struct cfs_expr_list *expr, void __user **bulk, + __u32 *bulk_size) +{ + struct cfs_range_expr *range; + struct lnet_range_expr range_expr; + + /* copy over the net range expressions to the bulk */ + list_for_each_entry(range, &expr->el_exprs, re_link) { + range_expr.re_lo = range->re_lo; + range_expr.re_hi = range->re_hi; + range_expr.re_stride = range->re_stride; + CDEBUG(D_NET, "Copy Range %u:%u:%u\n", + range_expr.re_lo, range_expr.re_hi, + range_expr.re_stride); + if (copy_to_user(*bulk, &range_expr, sizeof(range_expr))) { + CDEBUG(D_NET, "Failed to copy range_expr\n"); + return -EFAULT; + } + *bulk += sizeof(range_expr); + *bulk_size -= sizeof(range_expr); + } + + return 0; +} + +static int +copy_nid_range(struct lnet_ud_nid_descr *nid_descr, char *type, + void __user **bulk, __u32 *bulk_size) +{ + struct lnet_ioctl_udsp_descr ioc_udsp_descr; + struct cfs_expr_list *expr; + struct lnet_expressions ioc_expr; + int expr_count; + int net_expr_count; + int rc; + + memset(&ioc_udsp_descr, 0, sizeof(ioc_udsp_descr)); + ioc_udsp_descr.iud_src_hdr.ud_descr_type = *(__u32 *)type; + + /* if criteria not present, copy over the static part of the NID + * descriptor + */ + if (!lnet_udsp_criteria_present(nid_descr)) { + CDEBUG(D_NET, 
"Descriptor %u:%u:%u:%u\n", + ioc_udsp_descr.iud_src_hdr.ud_descr_type, + ioc_udsp_descr.iud_src_hdr.ud_descr_count, + ioc_udsp_descr.iud_net.ud_net_type, + ioc_udsp_descr.iud_net.ud_net_num_expr.le_count); + if (copy_to_user(*bulk, &ioc_udsp_descr, + sizeof(ioc_udsp_descr))) { + CDEBUG(D_NET, "failed to copy ioc_udsp_descr\n"); + return -EFAULT; + } + *bulk += sizeof(ioc_udsp_descr); + *bulk_size -= sizeof(ioc_udsp_descr); + return 0; + } + + expr_count = lnet_get_list_len(&nid_descr->ud_addr_range); + + /* copy the net information */ + if (!list_empty(&nid_descr->ud_net_id.udn_net_num_range)) { + expr = list_first_entry(&nid_descr->ud_net_id.udn_net_num_range, + struct cfs_expr_list, el_link); + net_expr_count = lnet_get_list_len(&expr->el_exprs); + } else { + net_expr_count = 0; + } + + /* set the total expression count */ + ioc_udsp_descr.iud_src_hdr.ud_descr_count = expr_count; + ioc_udsp_descr.iud_net.ud_net_type = + nid_descr->ud_net_id.udn_net_type; + ioc_udsp_descr.iud_net.ud_net_num_expr.le_count = net_expr_count; + + CDEBUG(D_NET, "Descriptor %u:%u:%u:%u\n", + ioc_udsp_descr.iud_src_hdr.ud_descr_type, + ioc_udsp_descr.iud_src_hdr.ud_descr_count, + ioc_udsp_descr.iud_net.ud_net_type, + ioc_udsp_descr.iud_net.ud_net_num_expr.le_count); + + /* copy over the header info to the bulk */ + if (copy_to_user(*bulk, &ioc_udsp_descr, sizeof(ioc_udsp_descr))) { + CDEBUG(D_NET, "Failed to copy data\n"); + return -EFAULT; + } + *bulk += sizeof(ioc_udsp_descr); + *bulk_size -= sizeof(ioc_udsp_descr); + + /* copy over the net num expression if it exists */ + if (net_expr_count) { + rc = copy_exprs(expr, bulk, bulk_size); + if (rc) + return rc; + } + + /* copy the address range */ + list_for_each_entry(expr, &nid_descr->ud_addr_range, el_link) { + ioc_expr.le_count = lnet_get_list_len(&expr->el_exprs); + if (copy_to_user(*bulk, &ioc_expr, sizeof(ioc_expr))) { + CDEBUG(D_NET, "failex to copy ioc_expr\n"); + return -EFAULT; + } + *bulk += sizeof(ioc_expr); + *bulk_size -= sizeof(ioc_expr); + + rc = copy_exprs(expr, bulk, bulk_size); + if (rc) + return rc; + } + + return 0; +} + +int +lnet_udsp_marshal(struct lnet_udsp *udsp, struct lnet_ioctl_udsp *ioc_udsp) +{ + int rc = -ENOMEM; + void __user *bulk; + __u32 bulk_size; + + if (!ioc_udsp) + return -EINVAL; + + bulk = ioc_udsp->iou_bulk; + bulk_size = ioc_udsp->iou_hdr.ioc_len + + ioc_udsp->iou_bulk_size; + + CDEBUG(D_NET, "marshal udsp (%p)\n", udsp); + CDEBUG(D_NET, "MEM -----> bulk: %p:0x%x\n", bulk, bulk_size); + /* make sure user space allocated enough buffer to marshal the + * udsp + */ + if (bulk_size != lnet_get_udsp_size(udsp)) { + rc = -ENOSPC; + goto fail; + } + + ioc_udsp->iou_idx = udsp->udsp_idx; + ioc_udsp->iou_action_type = udsp->udsp_action_type; + ioc_udsp->iou_action.priority = udsp->udsp_action.udsp_priority; + + bulk_size -= sizeof(*ioc_udsp); + + rc = copy_nid_range(&udsp->udsp_src, "SRC", &bulk, &bulk_size); + if (rc) + goto fail; + + rc = copy_nid_range(&udsp->udsp_dst, "DST", &bulk, &bulk_size); + if (rc) + goto fail; + + rc = copy_nid_range(&udsp->udsp_rte, "RTE", &bulk, &bulk_size); + if (rc) + goto fail; + + CDEBUG(D_NET, "MEM <----- bulk: %p\n", bulk); + + /* we should've consumed the entire buffer */ + LASSERT(bulk_size == 0); + return 0; + +fail: + CERROR("Failed to marshal udsp: %d\n", rc); + return rc; +} + +static void +copy_range_info(void **bulk, void **buf, struct list_head *list, + int count) +{ + struct lnet_range_expr *range_expr; + struct cfs_range_expr *range; + struct cfs_expr_list *exprs; + int 
range_count = count; + int i; + + if (range_count == 0) + return; + + if (range_count == -1) { + struct lnet_expressions *e; + + e = *bulk; + range_count = e->le_count; + *bulk += sizeof(*e); + } + + exprs = *buf; + INIT_LIST_HEAD(&exprs->el_link); + INIT_LIST_HEAD(&exprs->el_exprs); + list_add_tail(&exprs->el_link, list); + *buf += sizeof(*exprs); + + for (i = 0; i < range_count; i++) { + range_expr = *bulk; + range = *buf; + INIT_LIST_HEAD(&range->re_link); + range->re_lo = range_expr->re_lo; + range->re_hi = range_expr->re_hi; + range->re_stride = range_expr->re_stride; + CDEBUG(D_NET, "Copy Range %u:%u:%u\n", + range->re_lo, + range->re_hi, + range->re_stride); + list_add_tail(&range->re_link, &exprs->el_exprs); + *bulk += sizeof(*range_expr); + *buf += sizeof(*range); + } +} + +static int +copy_ioc_udsp_descr(struct lnet_ud_nid_descr *nid_descr, char *type, + void **bulk, __u32 *bulk_size) +{ + struct lnet_ioctl_udsp_descr *ioc_nid = *bulk; + struct lnet_expressions *exprs; + __u32 descr_type; + int expr_count = 0; + int range_count = 0; + int i; + __u32 size; + int remaining_size = *bulk_size; + void *tmp = *bulk; + __u32 alloc_size; + void *buf; + size_t range_expr_s = sizeof(struct lnet_range_expr); + size_t lnet_exprs_s = sizeof(struct lnet_expressions); + + CDEBUG(D_NET, "%s: bulk = %p:%u\n", type, *bulk, *bulk_size); + + /* criteria not present, skip over the static part of the + * bulk, which is included for each NID descriptor + */ + if (ioc_nid->iud_net.ud_net_type == 0) { + remaining_size -= sizeof(*ioc_nid); + if (remaining_size < 0) { + CERROR("Truncated userspace udsp buffer given\n"); + return -EINVAL; + } + *bulk += sizeof(*ioc_nid); + *bulk_size = remaining_size; + return 0; + } + + descr_type = ioc_nid->iud_src_hdr.ud_descr_type; + if (descr_type != *(__u32 *)type) { + CERROR("Bad NID descriptor type. Expected %s, given %c%c%c\n", + type, (__u8)descr_type, (__u8)(descr_type << 4), + (__u8)(descr_type << 8)); + return -EINVAL; + } + + /* calculate the total size to verify we have enough buffer. + * Start of by finding how many ranges there are for the net + * expression. + */ + range_count = ioc_nid->iud_net.ud_net_num_expr.le_count; + size = sizeof(*ioc_nid) + (range_count * range_expr_s); + remaining_size -= size; + if (remaining_size < 0) { + CERROR("Truncated userspace udsp buffer given\n"); + return -EINVAL; + } + + CDEBUG(D_NET, "Total net num ranges in %s: %d:%u\n", type, + range_count, size); + /* the number of expressions for the NID. 
IE 4 for IP, 1 for GNI */ + expr_count = ioc_nid->iud_src_hdr.ud_descr_count; + CDEBUG(D_NET, "addr as %d exprs\n", expr_count); + /* point tmp to the beginning of the NID expressions */ + tmp += size; + for (i = 0; i < expr_count; i++) { + /* get the number of ranges per expression */ + exprs = tmp; + range_count += exprs->le_count; + size = (range_expr_s * exprs->le_count) + lnet_exprs_s; + remaining_size -= size; + CDEBUG(D_NET, "expr %d:%d:%u:%d:%d\n", i, exprs->le_count, + size, remaining_size, range_count); + if (remaining_size < 0) { + CERROR("Truncated userspace udsp buffer given\n"); + return -EINVAL; + } + tmp += size; + } + + *bulk_size = remaining_size; + + /* copy over the net type */ + nid_descr->ud_net_id.udn_net_type = ioc_nid->iud_net.ud_net_type; + + CDEBUG(D_NET, "%u\n", nid_descr->ud_net_id.udn_net_type); + + /* allocate the total memory required to copy this NID descriptor */ + alloc_size = (sizeof(struct cfs_expr_list) * (expr_count + 1)) + + (sizeof(struct cfs_range_expr) * (range_count)); + LIBCFS_ALLOC(buf, alloc_size); + if (!buf) + return -ENOMEM; + + /* store the amount of memory allocated so we can free it later on */ + nid_descr->ud_mem_size = alloc_size; + + /* copy over the net number range */ + range_count = ioc_nid->iud_net.ud_net_num_expr.le_count; + *bulk += sizeof(*ioc_nid); + CDEBUG(D_NET, "bulk = %p\n", *bulk); + copy_range_info(bulk, &buf, &nid_descr->ud_net_id.udn_net_num_range, + range_count); + CDEBUG(D_NET, "bulk = %p\n", *bulk); + + /* copy over the NID descriptor */ + for (i = 0; i < expr_count; i++) { + copy_range_info(bulk, &buf, &nid_descr->ud_addr_range, -1); + CDEBUG(D_NET, "bulk = %p\n", *bulk); + } + + return 0; +} + +int +lnet_udsp_demarshal_add(void *bulk, __u32 bulk_size) +{ + struct lnet_ioctl_udsp *ioc_udsp; + struct lnet_udsp *udsp; + int rc = -ENOMEM; + int idx; + + if (bulk_size < sizeof(*ioc_udsp)) + return -ENOSPC; + + udsp = lnet_udsp_alloc(); + if (!udsp) + return rc; + + ioc_udsp = bulk; + + udsp->udsp_action_type = ioc_udsp->iou_action_type; + udsp->udsp_action.udsp_priority = ioc_udsp->iou_action.priority; + idx = ioc_udsp->iou_idx; + + CDEBUG(D_NET, "demarshal descr %u:%u:%d:%u\n", udsp->udsp_action_type, + udsp->udsp_action.udsp_priority, idx, bulk_size); + + bulk += sizeof(*ioc_udsp); + bulk_size -= sizeof(*ioc_udsp); + + rc = copy_ioc_udsp_descr(&udsp->udsp_src, "SRC", &bulk, &bulk_size); + if (rc < 0) + goto free_udsp; + + rc = copy_ioc_udsp_descr(&udsp->udsp_dst, "DST", &bulk, &bulk_size); + if (rc < 0) + goto free_udsp; + + rc = copy_ioc_udsp_descr(&udsp->udsp_rte, "RTE", &bulk, &bulk_size); + if (rc < 0) + goto free_udsp; + + return lnet_udsp_add_policy(udsp, idx); + +free_udsp: + lnet_udsp_free(udsp); + return rc; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/Makefile b/drivers/staging/lustrefsx/lnet/selftest/Makefile new file mode 100644 index 0000000000000..5380812715f7f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTREFSX_LNET_SELFTEST) += lnet_selftest.o + +lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o +lnet_selftest-y += rpc.o module.o ping_test.o brw_test.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lnet/selftest/brw_test.c b/drivers/staging/lustrefsx/lnet/selftest/brw_test.c new file mode 100644 index 0000000000000..2e77d8fa6d6b6 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/brw_test.c @@ -0,0 +1,524 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER 
OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/brw_test.c + * + * Author: Isaac Huang + */ + +#include "selftest.h" + +static int brw_srv_workitems = SFW_TEST_WI_MAX; +module_param(brw_srv_workitems, int, 0644); +MODULE_PARM_DESC(brw_srv_workitems, "# BRW server workitems"); + +static int brw_inject_errors; +module_param(brw_inject_errors, int, 0644); +MODULE_PARM_DESC(brw_inject_errors, "# data errors to inject randomly, zero by default"); + +#define BRW_POISON 0xbeefbeefbeefbeefULL +#define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL +#define BRW_MSIZE sizeof(__u64) + +static void +brw_client_fini(struct sfw_test_instance *tsi) +{ + struct srpc_bulk *bulk; + struct sfw_test_unit *tsu; + + LASSERT(tsi->tsi_is_client); + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + bulk = tsu->tsu_private; + if (bulk == NULL) + continue; + + srpc_free_bulk(bulk); + tsu->tsu_private = NULL; + } +} + +static int +brw_client_init(struct sfw_test_instance *tsi) +{ + struct sfw_session *sn = tsi->tsi_batch->bat_session; + int flags; + int off; + int npg; + int len; + int opc; + struct srpc_bulk *bulk; + struct sfw_test_unit *tsu; + + LASSERT(sn != NULL); + LASSERT(tsi->tsi_is_client); + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + struct test_bulk_req *breq = &tsi->tsi_u.bulk_v0; + + opc = breq->blk_opc; + flags = breq->blk_flags; + npg = breq->blk_npg; + /* NB: this is not going to work for variable page size, + * but we have to keep it for compatibility */ + len = npg * PAGE_SIZE; + off = 0; + + } else { + struct test_bulk_req_v1 *breq = &tsi->tsi_u.bulk_v1; + + /* I should never get this step if it's unknown feature + * because make_session will reject unknown feature */ + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + opc = breq->blk_opc; + flags = breq->blk_flags; + len = breq->blk_len; + off = breq->blk_offset & ~PAGE_MASK; + npg = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + + if (off % BRW_MSIZE != 0) + return -EINVAL; + + if (npg > LNET_MAX_IOV || npg <= 0) + return -EINVAL; + + if (opc != LST_BRW_READ && opc != LST_BRW_WRITE) + return -EINVAL; + + if (flags != LST_BRW_CHECK_NONE && + flags != LST_BRW_CHECK_FULL && flags != LST_BRW_CHECK_SIMPLE) + return -EINVAL; + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid, NULL), + off, npg, len, opc == LST_BRW_READ); + if (bulk == NULL) { + brw_client_fini(tsi); + return -ENOMEM; + } + + tsu->tsu_private = bulk; + } + + return 0; +} + +#define BRW_POISON 0xbeefbeefbeefbeefULL +#define BRW_MAGIC 
0xeeb0eeb1eeb2eeb3ULL +#define BRW_MSIZE sizeof(__u64) + +static int brw_inject_one_error(void) +{ + struct timespec64 ts; + + if (brw_inject_errors <= 0) return 0; + + ktime_get_ts64(&ts); + + if (((ts.tv_nsec / NSEC_PER_USEC) & 1) == 0) + return 0; + + return brw_inject_errors--; +} + +static void +brw_fill_page(struct page *pg, int off, int len, int pattern, __u64 magic) +{ + char *addr = page_address(pg) + off; + int i; + + LASSERT(addr != NULL); + LASSERT(off % BRW_MSIZE == 0 && len % BRW_MSIZE == 0); + + if (pattern == LST_BRW_CHECK_NONE) + return; + + if (magic == BRW_MAGIC) + magic += brw_inject_one_error(); + + if (pattern == LST_BRW_CHECK_SIMPLE) { + memcpy(addr, &magic, BRW_MSIZE); + if (len > BRW_MSIZE) { + addr += len - BRW_MSIZE; + memcpy(addr, &magic, BRW_MSIZE); + } + return; + } + + if (pattern == LST_BRW_CHECK_FULL) { + for (i = 0; i < len; i += BRW_MSIZE) + memcpy(addr + i, &magic, BRW_MSIZE); + return; + } + LBUG(); +} + +static int +brw_check_page(struct page *pg, int off, int len, int pattern, __u64 magic) +{ + char *addr = page_address(pg) + off; + __u64 data = 0; /* make compiler happy */ + int i; + + LASSERT(addr != NULL); + LASSERT(off % BRW_MSIZE == 0 && len % BRW_MSIZE == 0); + + if (pattern == LST_BRW_CHECK_NONE) + return 0; + + if (pattern == LST_BRW_CHECK_SIMPLE) { + data = *((__u64 *) addr); + if (data != magic) + goto bad_data; + + if (len > BRW_MSIZE) { + addr += len - BRW_MSIZE; + data = *((__u64 *) addr); + if (data != magic) + goto bad_data; + } + return 0; + } + + if (pattern == LST_BRW_CHECK_FULL) { + for (i = 0; i < len; i += BRW_MSIZE) { + data = *(__u64 *)(addr + i); + if (data != magic) + goto bad_data; + } + return 0; + } + + LBUG(); + +bad_data: + CERROR ("Bad data in page %p: %#llx, %#llx expected\n", + pg, data, magic); + return 1; +} + +static void +brw_fill_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) +{ + int i; + struct page *pg; + + for (i = 0; i < bk->bk_niov; i++) { + int off; + int len; + + pg = bk->bk_iovs[i].bv_page; + off = bk->bk_iovs[i].bv_offset; + len = bk->bk_iovs[i].bv_len; + brw_fill_page(pg, off, len, pattern, magic); + } +} + +static int +brw_check_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) +{ + int i; + struct page *pg; + + for (i = 0; i < bk->bk_niov; i++) { + int off; + int len; + + pg = bk->bk_iovs[i].bv_page; + off = bk->bk_iovs[i].bv_offset; + len = bk->bk_iovs[i].bv_len; + if (brw_check_page(pg, off, len, pattern, magic) != 0) { + CERROR("Bulk page %p (%d/%d) is corrupted!\n", + pg, i, bk->bk_niov); + return 1; + } + } + + return 0; +} + +static int +brw_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, + struct srpc_client_rpc **rpcpp) +{ + struct srpc_bulk *bulk = tsu->tsu_private; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_client_rpc *rpc; + struct srpc_brw_reqst *req; + int flags; + int npg; + int len; + int opc; + int rc; + + LASSERT(sn != NULL); + LASSERT(bulk != NULL); + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + struct test_bulk_req *breq = &tsi->tsi_u.bulk_v0; + + opc = breq->blk_opc; + flags = breq->blk_flags; + npg = breq->blk_npg; + len = npg * PAGE_SIZE; + + } else { + struct test_bulk_req_v1 *breq = &tsi->tsi_u.bulk_v1; + int off; + + /* I should never get this step if it's unknown feature + * because make_session will reject unknown feature */ + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + opc = breq->blk_opc; + flags = breq->blk_flags; + len = breq->blk_len; + 
off = breq->blk_offset; + npg = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + + rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc); + if (rc != 0) + return rc; + + memcpy(&rpc->crpc_bulk, bulk, offsetof(struct srpc_bulk, bk_iovs[npg])); + if (opc == LST_BRW_WRITE) + brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC); + else + brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_POISON); + + req = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + req->brw_flags = flags; + req->brw_rw = opc; + req->brw_len = len; + + *rpcpp = rpc; + return 0; +} + +static void +brw_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) +{ + __u64 magic = BRW_MAGIC; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_msg *msg = &rpc->crpc_replymsg; + struct srpc_brw_reply *reply = &msg->msg_body.brw_reply; + struct srpc_brw_reqst *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + + LASSERT(sn != NULL); + + if (rpc->crpc_status != 0) { + CERROR("BRW RPC to %s failed with %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_status); + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_brw_errors); + return; + } + + if (msg->msg_magic != SRPC_MSG_MAGIC) { + __swab64s(&magic); + __swab32s(&reply->brw_status); + } + + CDEBUG(reply->brw_status ? D_WARNING : D_NET, + "BRW RPC to %s finished with brw_status: %d\n", + libcfs_id2str(rpc->crpc_dest), reply->brw_status); + + if (reply->brw_status != 0) { + atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -(int)reply->brw_status; + return; + } + + if (reqst->brw_rw == LST_BRW_WRITE) + return; + + if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) { + CERROR("Bulk data from %s is corrupted!\n", + libcfs_id2str(rpc->crpc_dest)); + atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -EBADMSG; + } +} + +static void +brw_server_rpc_done(struct srpc_server_rpc *rpc) +{ + struct srpc_bulk *blk = rpc->srpc_bulk; + + if (blk == NULL) + return; + + if (rpc->srpc_status != 0) + CERROR("Bulk transfer %s %s has failed: %d\n", + blk->bk_sink ? "from" : "to", + libcfs_id2str(rpc->srpc_peer), rpc->srpc_status); + else + CDEBUG(D_NET, "Transferred %d pages bulk data %s %s\n", + blk->bk_niov, blk->bk_sink ? "from" : "to", + libcfs_id2str(rpc->srpc_peer)); + + sfw_free_pages(rpc); +} + +static int +brw_bulk_ready(struct srpc_server_rpc *rpc, int status) +{ + __u64 magic = BRW_MAGIC; + struct srpc_brw_reply *reply = &rpc->srpc_replymsg.msg_body.brw_reply; + struct srpc_brw_reqst *reqst; + struct srpc_msg *reqstmsg; + + LASSERT (rpc->srpc_bulk != NULL); + LASSERT (rpc->srpc_reqstbuf != NULL); + + reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + reqst = &reqstmsg->msg_body.brw_reqst; + + if (status != 0) { + CERROR ("BRW bulk %s failed for RPC from %s: %d\n", + reqst->brw_rw == LST_BRW_READ ? 
"READ" : "WRITE", + libcfs_id2str(rpc->srpc_peer), status); + return -EIO; + } + + if (reqst->brw_rw == LST_BRW_READ) + return 0; + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) + __swab64s(&magic); + + if (brw_check_bulk(rpc->srpc_bulk, reqst->brw_flags, magic) != 0) { + CERROR ("Bulk data from %s is corrupted!\n", + libcfs_id2str(rpc->srpc_peer)); + reply->brw_status = EBADMSG; + } + + return 0; +} + +static int +brw_server_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + struct srpc_msg *replymsg = &rpc->srpc_replymsg; + struct srpc_msg *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_brw_reply *reply = &replymsg->msg_body.brw_reply; + struct srpc_brw_reqst *reqst = &reqstmsg->msg_body.brw_reqst; + int npg; + int rc; + + LASSERT (sv->sv_id == SRPC_SERVICE_BRW); + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { + LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + __swab32s(&reqst->brw_rw); + __swab32s(&reqst->brw_len); + __swab32s(&reqst->brw_flags); + __swab64s(&reqst->brw_rpyid); + __swab64s(&reqst->brw_bulkid); + } + LASSERT (reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id)); + + reply->brw_status = 0; + rpc->srpc_done = brw_server_rpc_done; + + if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) || + (reqst->brw_flags != LST_BRW_CHECK_NONE && + reqst->brw_flags != LST_BRW_CHECK_FULL && + reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) { + reply->brw_status = EINVAL; + return 0; + } + + if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + replymsg->msg_ses_feats = LST_FEATS_MASK; + reply->brw_status = EPROTO; + return 0; + } + + if ((reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { + /* compat with old version */ + if ((reqst->brw_len & ~PAGE_MASK) != 0) { + reply->brw_status = EINVAL; + return 0; + } + npg = reqst->brw_len >> PAGE_SHIFT; + + } else { + npg = (reqst->brw_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + + replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; + + if (reqst->brw_len == 0 || npg > LNET_MAX_IOV) { + reply->brw_status = EINVAL; + return 0; + } + + rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg, + reqst->brw_len, + reqst->brw_rw == LST_BRW_WRITE); + if (rc != 0) + return rc; + + if (reqst->brw_rw == LST_BRW_READ) + brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC); + else + brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON); + + return 0; +} + +struct sfw_test_client_ops brw_test_client; + +void brw_init_test_client(void) +{ + brw_test_client.tso_init = brw_client_init; + brw_test_client.tso_fini = brw_client_fini; + brw_test_client.tso_prep_rpc = brw_client_prep_rpc; + brw_test_client.tso_done_rpc = brw_client_done_rpc; +}; + +struct srpc_service brw_test_service; + +void brw_init_test_service(void) +{ + brw_test_service.sv_id = SRPC_SERVICE_BRW; + brw_test_service.sv_name = "brw_test"; + brw_test_service.sv_handler = brw_server_handle; + brw_test_service.sv_bulk_ready = brw_bulk_ready; + brw_test_service.sv_wi_total = brw_srv_workitems; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/conctl.c b/drivers/staging/lustrefsx/lnet/selftest/conctl.c new file mode 100644 index 0000000000000..9afbdae89d398 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/conctl.c @@ -0,0 +1,929 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/conctl.c + * + * IOC handle in kernel + * + * Author: Liang Zhen + */ + +#include +#include +#include "console.h" + +static int +lst_session_new_ioctl(struct lstio_session_new_args *args) +{ + char *name; + int rc; + + if (args->lstio_ses_idp == NULL || /* address for output sid */ + args->lstio_ses_key == 0 || /* no key is specified */ + args->lstio_ses_namep == NULL || /* session name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_ses_namep, + args->lstio_ses_nmlen)) { + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_ses_nmlen] = 0; + + rc = lstcon_session_new(name, + args->lstio_ses_key, + args->lstio_ses_feats, + args->lstio_ses_timeout, + args->lstio_ses_force, + args->lstio_ses_idp); + + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return rc; +} + +static int +lst_session_end_ioctl(struct lstio_session_end_args *args) +{ + if (args->lstio_ses_key != console_session.ses_key) + return -EACCES; + + return lstcon_session_end(); +} + +static int +lst_session_info_ioctl(struct lstio_session_info_args *args) +{ + /* no checking of key */ + + if (args->lstio_ses_idp == NULL || /* address for ouput sid */ + args->lstio_ses_keyp == NULL || /* address for ouput key */ + args->lstio_ses_featp == NULL || /* address for ouput features */ + args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */ + args->lstio_ses_namep == NULL || /* address for ouput name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_session_info(args->lstio_ses_idp, + args->lstio_ses_keyp, + args->lstio_ses_featp, + args->lstio_ses_ndinfo, + args->lstio_ses_namep, + args->lstio_ses_nmlen); +} + +static int +lst_debug_ioctl(struct lstio_debug_args *args) +{ + char *name = NULL; + int client = 1; + int rc; + + if (args->lstio_dbg_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_dbg_resultp == NULL) + return -EINVAL; + + if (args->lstio_dbg_namep != NULL && /* name of batch/group */ + (args->lstio_dbg_nmlen <= 0 || + args->lstio_dbg_nmlen > LST_NAME_SIZE)) + return -EINVAL; + + if (args->lstio_dbg_namep != NULL) { + LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_dbg_namep, + args->lstio_dbg_nmlen)) { + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + + return -EFAULT; 
+ } + + name[args->lstio_dbg_nmlen] = 0; + } + + rc = -EINVAL; + + switch (args->lstio_dbg_type) { + case LST_OPC_SESSION: + rc = lstcon_session_debug(args->lstio_dbg_timeout, + args->lstio_dbg_resultp); + break; + + case LST_OPC_BATCHSRV: + client = 0; + fallthrough; + case LST_OPC_BATCHCLI: + if (name == NULL) + goto out; + + rc = lstcon_batch_debug(args->lstio_dbg_timeout, + name, client, args->lstio_dbg_resultp); + break; + + case LST_OPC_GROUP: + if (name == NULL) + goto out; + + rc = lstcon_group_debug(args->lstio_dbg_timeout, + name, args->lstio_dbg_resultp); + break; + + case LST_OPC_NODES: + if (args->lstio_dbg_count <= 0 || + args->lstio_dbg_idsp == NULL) + goto out; + + rc = lstcon_nodes_debug(args->lstio_dbg_timeout, + args->lstio_dbg_count, + args->lstio_dbg_idsp, + args->lstio_dbg_resultp); + break; + + default: + break; + } + +out: + if (name != NULL) + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + + return rc; +} + +static int +lst_group_add_ioctl(struct lstio_group_add_args *args) +{ + char *name; + int rc; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_add(name); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +static int +lst_group_del_ioctl(struct lstio_group_del_args *args) +{ + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_del(name); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +static int +lst_group_update_ioctl(struct lstio_group_update_args *args) +{ + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_resultp == NULL || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + switch (args->lstio_grp_opc) { + case LST_GROUP_CLEAN: + rc = lstcon_group_clean(name, args->lstio_grp_args); + break; + + case LST_GROUP_REFRESH: + rc = lstcon_group_refresh(name, args->lstio_grp_resultp); + break; + + case LST_GROUP_RMND: + if (args->lstio_grp_count <= 0 || + args->lstio_grp_idsp == NULL) { + rc = -EINVAL; + break; + } + rc = lstcon_nodes_remove(name, args->lstio_grp_count, + args->lstio_grp_idsp, + args->lstio_grp_resultp); + break; + + default: + rc = -EINVAL; + break; + } + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +static int +lst_nodes_add_ioctl(struct 
lstio_group_nodes_args *args) +{ + unsigned int feats; + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_idsp == NULL || /* array of ids */ + args->lstio_grp_count <= 0 || + args->lstio_grp_resultp == NULL || + args->lstio_grp_featp == NULL || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_nodes_add(name, args->lstio_grp_count, + args->lstio_grp_idsp, &feats, + args->lstio_grp_resultp); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + if (rc == 0 && + copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats))) { + return -EINVAL; + } + + return rc; +} + +static int +lst_group_list_ioctl(struct lstio_group_list_args *args) +{ + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_idx < 0 || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_group_list(args->lstio_grp_idx, + args->lstio_grp_nmlen, + args->lstio_grp_namep); +} + +static int +lst_group_info_ioctl(struct lstio_group_info_args *args) +{ + char *name; + int ndent; + int index; + int rc; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_grp_entp == NULL && /* output: group entry */ + args->lstio_grp_dentsp == NULL) /* output: node entry */ + return -EINVAL; + + if (args->lstio_grp_dentsp != NULL) { /* have node entry */ + if (args->lstio_grp_idxp == NULL || /* node index */ + args->lstio_grp_ndentp == NULL) /* # of node entry */ + return -EINVAL; + + if (copy_from_user(&ndent, args->lstio_grp_ndentp, + sizeof(ndent)) || + copy_from_user(&index, args->lstio_grp_idxp, + sizeof(index))) + return -EFAULT; + + if (ndent <= 0 || index < 0) + return -EINVAL; + } + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_info(name, args->lstio_grp_entp, + &index, &ndent, args->lstio_grp_dentsp); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + if (rc != 0) + return rc; + + if (args->lstio_grp_dentsp != NULL && + (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) || + copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent)))) + return -EFAULT; + + return 0; +} + +static int +lst_batch_add_ioctl(struct lstio_batch_add_args *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = 
lstcon_batch_add(name); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_run_ioctl(struct lstio_batch_run_args *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_run(name, args->lstio_bat_timeout, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_stop_ioctl(struct lstio_batch_stop_args *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_stop(name, args->lstio_bat_force, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_query_ioctl(struct lstio_batch_query_args *args) +{ + char *name; + int rc; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_bat_testidx < 0) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_test_batch_query(name, + args->lstio_bat_testidx, + args->lstio_bat_client, + args->lstio_bat_timeout, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_list_ioctl(struct lstio_batch_list_args *args) +{ + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_idx < 0 || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_batch_list(args->lstio_bat_idx, + args->lstio_bat_nmlen, + args->lstio_bat_namep); +} + +static int +lst_batch_info_ioctl(struct lstio_batch_info_args *args) +{ + char *name; + int rc; + int index; + int ndent; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || /* batch name */ + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_bat_entp == NULL && /* output: batch entry */ + args->lstio_bat_dentsp == NULL) /* output: node entry */ + return -EINVAL; + + if (args->lstio_bat_dentsp != NULL) { /* have node entry */ + if (args->lstio_bat_idxp == NULL || /* node index */ + args->lstio_bat_ndentp == NULL) /* # of node 
entry */ + return -EINVAL; + + if (copy_from_user(&index, args->lstio_bat_idxp, + sizeof(index)) || + copy_from_user(&ndent, args->lstio_bat_ndentp, + sizeof(ndent))) + return -EFAULT; + + if (ndent <= 0 || index < 0) + return -EINVAL; + } + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_info(name, + args->lstio_bat_entp, args->lstio_bat_server, + args->lstio_bat_testidx, &index, &ndent, + args->lstio_bat_dentsp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + if (rc != 0) + return rc; + + if (args->lstio_bat_dentsp != NULL && + (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) || + copy_to_user(args->lstio_bat_ndentp, &ndent, sizeof(ndent)))) + rc = -EFAULT; + + return rc; +} + +static int +lst_stat_query_ioctl(struct lstio_stat_args *args) +{ + int rc; + char *name = NULL; + + /* TODO: not finished */ + if (args->lstio_sta_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_sta_resultp == NULL) + return -EINVAL; + + if (args->lstio_sta_idsp != NULL) { + if (args->lstio_sta_count <= 0) + return -EINVAL; + + rc = lstcon_nodes_stat(args->lstio_sta_count, + args->lstio_sta_idsp, + args->lstio_sta_timeout, + args->lstio_sta_resultp); + } else if (args->lstio_sta_namep != NULL) { + if (args->lstio_sta_nmlen <= 0 || + args->lstio_sta_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_sta_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + rc = copy_from_user(name, args->lstio_sta_namep, + args->lstio_sta_nmlen); + if (rc == 0) + rc = lstcon_group_stat(name, args->lstio_sta_timeout, + args->lstio_sta_resultp); + else + rc = -EFAULT; + + } else { + rc = -EINVAL; + } + + if (name != NULL) + LIBCFS_FREE(name, args->lstio_sta_nmlen + 1); + return rc; +} + +static int lst_test_add_ioctl(struct lstio_test_args *args) +{ + char *batch_name; + char *src_name = NULL; + char *dst_name = NULL; + void *param = NULL; + int ret = 0; + int rc = -ENOMEM; + + if (args->lstio_tes_resultp == NULL || + args->lstio_tes_retp == NULL || + args->lstio_tes_bat_name == NULL || /* no specified batch */ + args->lstio_tes_bat_nmlen <= 0 || + args->lstio_tes_bat_nmlen > LST_NAME_SIZE || + args->lstio_tes_sgrp_name == NULL || /* no source group */ + args->lstio_tes_sgrp_nmlen <= 0 || + args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE || + args->lstio_tes_dgrp_name == NULL || /* no target group */ + args->lstio_tes_dgrp_nmlen <= 0 || + args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_tes_loop == 0 || /* negative is infinite */ + args->lstio_tes_concur <= 0 || + args->lstio_tes_dist <= 0 || + args->lstio_tes_span <= 0) + return -EINVAL; + + /* have parameter, check if parameter length is valid */ + if (args->lstio_tes_param != NULL && + (args->lstio_tes_param_len <= 0 || + args->lstio_tes_param_len > + PAGE_SIZE - sizeof(struct lstcon_test))) + return -EINVAL; + + LIBCFS_ALLOC(batch_name, args->lstio_tes_bat_nmlen + 1); + if (batch_name == NULL) + return rc; + + LIBCFS_ALLOC(src_name, args->lstio_tes_sgrp_nmlen + 1); + if (src_name == NULL) + goto out; + + LIBCFS_ALLOC(dst_name, args->lstio_tes_dgrp_nmlen + 1); + if (dst_name == NULL) + goto out; + + if (args->lstio_tes_param != NULL) { + LIBCFS_ALLOC(param, args->lstio_tes_param_len); + if (param == NULL) + goto out; + if (copy_from_user(param, 
args->lstio_tes_param, + args->lstio_tes_param_len)) { + rc = -EFAULT; + goto out; + } + } + + rc = -EFAULT; + if (copy_from_user(batch_name, args->lstio_tes_bat_name, + args->lstio_tes_bat_nmlen) || + copy_from_user(src_name, args->lstio_tes_sgrp_name, + args->lstio_tes_sgrp_nmlen) || + copy_from_user(dst_name, args->lstio_tes_dgrp_name, + args->lstio_tes_dgrp_nmlen)) + goto out; + + rc = lstcon_test_add(batch_name, + args->lstio_tes_type, + args->lstio_tes_loop, + args->lstio_tes_concur, + args->lstio_tes_dist, args->lstio_tes_span, + src_name, dst_name, param, + args->lstio_tes_param_len, + &ret, args->lstio_tes_resultp); + + if (ret != 0) + rc = (copy_to_user(args->lstio_tes_retp, &ret, + sizeof(ret))) ? -EFAULT : 0; +out: + if (batch_name != NULL) + LIBCFS_FREE(batch_name, args->lstio_tes_bat_nmlen + 1); + + if (src_name != NULL) + LIBCFS_FREE(src_name, args->lstio_tes_sgrp_nmlen + 1); + + if (dst_name != NULL) + LIBCFS_FREE(dst_name, args->lstio_tes_dgrp_nmlen + 1); + + if (param != NULL) + LIBCFS_FREE(param, args->lstio_tes_param_len); + + return rc; +} + +int +lstcon_ioctl_entry(struct notifier_block *nb, + unsigned long cmd, void *vdata) +{ + struct libcfs_ioctl_hdr *hdr = vdata; + struct libcfs_ioctl_data *data; + char *buf = NULL; + int rc = -EINVAL; + int opc; + + if (cmd != IOC_LIBCFS_LNETST) + goto err; + + data = container_of(hdr, struct libcfs_ioctl_data, ioc_hdr); + + opc = data->ioc_u32[0]; + + if (data->ioc_plen1 > PAGE_SIZE) + goto err; + + LIBCFS_ALLOC(buf, data->ioc_plen1); + if (buf == NULL) { + rc = -ENOMEM; + goto err; + } + + /* copy in parameter */ + if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) { + rc = -EFAULT; + goto out_free_buf; + } + + mutex_lock(&console_session.ses_mutex); + + console_session.ses_laststamp = ktime_get_real_seconds(); + + if (console_session.ses_shutdown) { + rc = -ESHUTDOWN; + goto out; + } + + if (console_session.ses_expired) + lstcon_session_end(); + + if (opc != LSTIO_SESSION_NEW && + console_session.ses_state == LST_SESSION_NONE) { + CDEBUG(D_NET, "LST no active session\n"); + rc = -ESRCH; + goto out; + } + + memset(&console_session.ses_trans_stat, 0, + sizeof(struct lstcon_trans_stat)); + + switch (opc) { + case LSTIO_SESSION_NEW: + rc = lst_session_new_ioctl((struct lstio_session_new_args *)buf); + break; + case LSTIO_SESSION_END: + rc = lst_session_end_ioctl((struct lstio_session_end_args *)buf); + break; + case LSTIO_SESSION_INFO: + rc = lst_session_info_ioctl((struct lstio_session_info_args *)buf); + break; + case LSTIO_DEBUG: + rc = lst_debug_ioctl((struct lstio_debug_args *)buf); + break; + case LSTIO_GROUP_ADD: + rc = lst_group_add_ioctl((struct lstio_group_add_args *)buf); + break; + case LSTIO_GROUP_DEL: + rc = lst_group_del_ioctl((struct lstio_group_del_args *)buf); + break; + case LSTIO_GROUP_UPDATE: + rc = lst_group_update_ioctl((struct lstio_group_update_args *)buf); + break; + case LSTIO_NODES_ADD: + rc = lst_nodes_add_ioctl((struct lstio_group_nodes_args *)buf); + break; + case LSTIO_GROUP_LIST: + rc = lst_group_list_ioctl((struct lstio_group_list_args *)buf); + break; + case LSTIO_GROUP_INFO: + rc = lst_group_info_ioctl((struct lstio_group_info_args *)buf); + break; + case LSTIO_BATCH_ADD: + rc = lst_batch_add_ioctl((struct lstio_batch_add_args *)buf); + break; + case LSTIO_BATCH_START: + rc = lst_batch_run_ioctl((struct lstio_batch_run_args *)buf); + break; + case LSTIO_BATCH_STOP: + rc = lst_batch_stop_ioctl((struct lstio_batch_stop_args *)buf); + break; + case LSTIO_BATCH_QUERY: + rc = 
lst_batch_query_ioctl((struct lstio_batch_query_args *)buf); + break; + case LSTIO_BATCH_LIST: + rc = lst_batch_list_ioctl((struct lstio_batch_list_args *)buf); + break; + case LSTIO_BATCH_INFO: + rc = lst_batch_info_ioctl((struct lstio_batch_info_args *)buf); + break; + case LSTIO_TEST_ADD: + rc = lst_test_add_ioctl((struct lstio_test_args *)buf); + break; + case LSTIO_STAT_QUERY: + rc = lst_stat_query_ioctl((struct lstio_stat_args *)buf); + break; + default: + rc = -EINVAL; + goto out; + } + + if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat, + sizeof(struct lstcon_trans_stat))) + rc = -EFAULT; +out: + mutex_unlock(&console_session.ses_mutex); +out_free_buf: + LIBCFS_FREE(buf, data->ioc_plen1); +err: + return notifier_from_ioctl_errno(rc); +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.c b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c new file mode 100644 index 0000000000000..d2147e6bb8b44 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c @@ -0,0 +1,1398 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
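Every lst_*_ioctl() handler in conctl.c above repeats the same guard sequence before touching a user-supplied name: reject lengths outside (0, LST_NAME_SIZE], allocate length + 1 bytes, copy_from_user(), then NUL-terminate explicitly because the user buffer is never assumed to be terminated. A compact userspace model of that sequence is sketched below; dup_bounded_name(), NAME_MAX_LEN and the memcpy-based copy callback are illustrative stand-ins rather than symbols from this patch.

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define NAME_MAX_LEN 32			/* stand-in for LST_NAME_SIZE */

	/* stand-in for copy_from_user(): returns 0 on success, non-zero on fault */
	typedef int (*copy_in_fn)(char *dst, const char *src, int len);

	static int dup_bounded_name(const char *ubuf, int len, copy_in_fn copy_in,
				    char **out)
	{
		char *name;

		if (!ubuf || len <= 0 || len > NAME_MAX_LEN)
			return -EINVAL;		/* same bounds check as the handlers */

		name = malloc(len + 1);
		if (!name)
			return -ENOMEM;

		if (copy_in(name, ubuf, len)) {
			free(name);
			return -EFAULT;
		}

		name[len] = '\0';		/* user data is not trusted to be terminated */
		*out = name;
		return 0;
	}

	static int fake_copy_in(char *dst, const char *src, int len)
	{
		memcpy(dst, src, len);		/* a real kernel copy would fault-check here */
		return 0;
	}

	int main(void)
	{
		char *name = NULL;
		int rc = dup_bounded_name("brw_batch", 9, fake_copy_in, &name);

		if (rc == 0) {
			printf("validated name: %s\n", name);
			free(name);
		}
		return rc ? 1 : 0;
	}
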
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/conctl.c + * + * Console framework rpcs + * + * Author: Liang Zhen + */ + + +#include +#include +#include "timer.h" +#include "conrpc.h" +#include "console.h" + +void lstcon_rpc_stat_reply(struct lstcon_rpc_trans *, struct srpc_msg *, + struct lstcon_node *, struct lstcon_trans_stat *); + +static void +lstcon_rpc_done(struct srpc_client_rpc *rpc) +{ + struct lstcon_rpc *crpc = rpc->crpc_priv; + + LASSERT(crpc != NULL && rpc == crpc->crp_rpc); + LASSERT(crpc->crp_posted && !crpc->crp_finished); + + spin_lock(&rpc->crpc_lock); + + if (crpc->crp_trans == NULL) { + /* Orphan RPC is not in any transaction, + * I'm just a poor body and nobody loves me */ + spin_unlock(&rpc->crpc_lock); + + /* release it */ + lstcon_rpc_put(crpc); + return; + } + + /* not an orphan RPC */ + crpc->crp_finished = 1; + + if (crpc->crp_stamp_ns == 0) { + /* not aborted */ + LASSERT(crpc->crp_status == 0); + + crpc->crp_stamp_ns = ktime_get_ns(); + crpc->crp_status = rpc->crpc_status; + } + + /* wakeup (transaction)thread if I'm the last RPC in the transaction */ + if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining)) + wake_up(&crpc->crp_trans->tas_waitq); + + spin_unlock(&rpc->crpc_lock); +} + +static int +lstcon_rpc_init(struct lstcon_node *nd, int service, unsigned int feats, + int bulk_npg, int bulk_len, int embedded, + struct lstcon_rpc *crpc) +{ + memset(crpc, 0, sizeof(*crpc)); + + crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service, + feats, bulk_npg, bulk_len, + lstcon_rpc_done, (void *)crpc); + if (crpc->crp_rpc == NULL) + return -ENOMEM; + + crpc->crp_node = nd; + crpc->crp_embedded = embedded; + INIT_LIST_HEAD(&crpc->crp_link); + + atomic_inc(&console_session.ses_rpc_counter); + + return 0; +} + +static int +lstcon_rpc_prep(struct lstcon_node *nd, int service, unsigned int feats, + int bulk_npg, int bulk_len, struct lstcon_rpc **crpcpp) +{ + struct lstcon_rpc *crpc = NULL; + int rc; + + spin_lock(&console_session.ses_rpc_lock); + + if (!list_empty(&console_session.ses_rpc_freelist)) { + crpc = list_entry(console_session.ses_rpc_freelist.next, + struct lstcon_rpc, crp_link); + list_del_init(&crpc->crp_link); + } + + spin_unlock(&console_session.ses_rpc_lock); + + if (crpc == NULL) { + LIBCFS_ALLOC(crpc, sizeof(*crpc)); + if (crpc == NULL) + return -ENOMEM; + } + + rc = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, crpc); + if (rc == 0) { + *crpcpp = crpc; + return 0; + } + + LIBCFS_FREE(crpc, sizeof(*crpc)); + + return rc; +} + +void +lstcon_rpc_put(struct lstcon_rpc *crpc) +{ + struct srpc_bulk *bulk = &crpc->crp_rpc->crpc_bulk; + int i; + + LASSERT(list_empty(&crpc->crp_link)); + + for (i = 0; i < bulk->bk_niov; i++) { + if (bulk->bk_iovs[i].bv_page == NULL) + continue; + + __free_page(bulk->bk_iovs[i].bv_page); + } + + srpc_client_rpc_decref(crpc->crp_rpc); + + if (crpc->crp_embedded) { + /* embedded RPC, don't recycle it */ + memset(crpc, 0, sizeof(*crpc)); + crpc->crp_embedded = 1; + + } else { + spin_lock(&console_session.ses_rpc_lock); + + list_add(&crpc->crp_link, + &console_session.ses_rpc_freelist); + + spin_unlock(&console_session.ses_rpc_lock); + } + + /* RPC is not alive now */ + atomic_dec(&console_session.ses_rpc_counter); +} + +static void +lstcon_rpc_post(struct lstcon_rpc *crpc) +{ + struct lstcon_rpc_trans *trans = crpc->crp_trans; + + LASSERT (trans != NULL); + + atomic_inc(&trans->tas_remaining); + crpc->crp_posted = 1; + + sfw_post_rpc(crpc->crp_rpc); +} + +static char * 
+lstcon_rpc_trans_name(int transop) +{ + if (transop == LST_TRANS_SESNEW) + return "SESNEW"; + + if (transop == LST_TRANS_SESEND) + return "SESEND"; + + if (transop == LST_TRANS_SESQRY) + return "SESQRY"; + + if (transop == LST_TRANS_SESPING) + return "SESPING"; + + if (transop == LST_TRANS_TSBCLIADD) + return "TSBCLIADD"; + + if (transop == LST_TRANS_TSBSRVADD) + return "TSBSRVADD"; + + if (transop == LST_TRANS_TSBRUN) + return "TSBRUN"; + + if (transop == LST_TRANS_TSBSTOP) + return "TSBSTOP"; + + if (transop == LST_TRANS_TSBCLIQRY) + return "TSBCLIQRY"; + + if (transop == LST_TRANS_TSBSRVQRY) + return "TSBSRVQRY"; + + if (transop == LST_TRANS_STATQRY) + return "STATQRY"; + + return "Unknown"; +} + +int +lstcon_rpc_trans_prep(struct list_head *translist, int transop, + struct lstcon_rpc_trans **transpp) +{ + struct lstcon_rpc_trans *trans; + + if (translist != NULL) { + list_for_each_entry(trans, translist, tas_link) { + /* Can't enqueue two private transaction on + * the same object */ + if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE) + return -EPERM; + } + } + + /* create a trans group */ + LIBCFS_ALLOC(trans, sizeof(*trans)); + if (trans == NULL) + return -ENOMEM; + + trans->tas_opc = transop; + + if (translist == NULL) + INIT_LIST_HEAD(&trans->tas_olink); + else + list_add_tail(&trans->tas_olink, translist); + + list_add_tail(&trans->tas_link, &console_session.ses_trans_list); + + INIT_LIST_HEAD(&trans->tas_rpcs_list); + atomic_set(&trans->tas_remaining, 0); + init_waitqueue_head(&trans->tas_waitq); + + spin_lock(&console_session.ses_rpc_lock); + trans->tas_features = console_session.ses_features; + spin_unlock(&console_session.ses_rpc_lock); + + *transpp = trans; + return 0; +} + +void +lstcon_rpc_trans_addreq(struct lstcon_rpc_trans *trans, struct lstcon_rpc *crpc) +{ + list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list); + crpc->crp_trans = trans; +} + +void +lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error) +{ + struct srpc_client_rpc *rpc; + struct lstcon_rpc *crpc; + struct lstcon_node *nd; + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + rpc = crpc->crp_rpc; + + spin_lock(&rpc->crpc_lock); + + if (!crpc->crp_posted || /* not posted */ + crpc->crp_stamp_ns != 0) { /* rpc done or aborted already */ + if (crpc->crp_stamp_ns == 0) { + crpc->crp_stamp_ns = ktime_get_ns(); + crpc->crp_status = -EINTR; + } + spin_unlock(&rpc->crpc_lock); + continue; + } + + crpc->crp_stamp_ns = ktime_get_ns(); + crpc->crp_status = error; + + spin_unlock(&rpc->crpc_lock); + + sfw_abort_rpc(rpc); + + if (error != -ETIMEDOUT) + continue; + + nd = crpc->crp_node; + if (ktime_to_ns(nd->nd_stamp) > crpc->crp_stamp_ns) + continue; + + nd->nd_stamp = ktime_set(0, crpc->crp_stamp_ns); + nd->nd_state = LST_NODE_DOWN; + } +} + +static int +lstcon_rpc_trans_check(struct lstcon_rpc_trans *trans) +{ + if (console_session.ses_shutdown && + !list_empty(&trans->tas_olink)) /* Not an end session RPC */ + return 1; + + return (atomic_read(&trans->tas_remaining) == 0) ? 
1: 0; +} + +int +lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout) +{ + struct lstcon_rpc *crpc; + int rc; + + if (list_empty(&trans->tas_rpcs_list)) + return 0; + + if (timeout < LST_TRANS_MIN_TIMEOUT) + timeout = LST_TRANS_MIN_TIMEOUT; + + CDEBUG(D_NET, "Transaction %s started\n", + lstcon_rpc_trans_name(trans->tas_opc)); + + /* post all requests */ + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + LASSERT(!crpc->crp_posted); + + lstcon_rpc_post(crpc); + } + + mutex_unlock(&console_session.ses_mutex); + + rc = wait_event_interruptible_timeout(trans->tas_waitq, + lstcon_rpc_trans_check(trans), + cfs_time_seconds(timeout)); + + rc = (rc > 0)? 0: ((rc < 0)? -EINTR: -ETIMEDOUT); + + mutex_lock(&console_session.ses_mutex); + + if (console_session.ses_shutdown) + rc = -ESHUTDOWN; + + if (rc != 0 || atomic_read(&trans->tas_remaining) != 0) { + /* treat short timeout as canceled */ + if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2) + rc = -EINTR; + + lstcon_rpc_trans_abort(trans, rc); + } + + CDEBUG(D_NET, "Transaction %s stopped: %d\n", + lstcon_rpc_trans_name(trans->tas_opc), rc); + + lstcon_rpc_trans_stat(trans, lstcon_trans_stat()); + + return rc; +} + +static int +lstcon_rpc_get_reply(struct lstcon_rpc *crpc, struct srpc_msg **msgpp) +{ + struct lstcon_node *nd = crpc->crp_node; + struct srpc_client_rpc *rpc = crpc->crp_rpc; + struct srpc_generic_reply *rep; + + LASSERT(nd != NULL && rpc != NULL); + LASSERT(crpc->crp_stamp_ns != 0); + + if (crpc->crp_status != 0) { + *msgpp = NULL; + return crpc->crp_status; + } + + *msgpp = &rpc->crpc_replymsg; + if (!crpc->crp_unpacked) { + sfw_unpack_message(*msgpp); + crpc->crp_unpacked = 1; + } + + if (ktime_to_ns(nd->nd_stamp) > crpc->crp_stamp_ns) + return 0; + + nd->nd_stamp = ktime_set(0, crpc->crp_stamp_ns); + rep = &(*msgpp)->msg_body.reply; + + if (rep->sid.ses_nid == LNET_NID_ANY) + nd->nd_state = LST_NODE_UNKNOWN; + else if (lstcon_session_match(rep->sid)) + nd->nd_state = LST_NODE_ACTIVE; + else + nd->nd_state = LST_NODE_BUSY; + + return 0; +} + +void +lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, + struct lstcon_trans_stat *stat) +{ + struct lstcon_rpc *crpc; + struct srpc_msg *rep; + int error; + + LASSERT(stat != NULL); + + memset(stat, 0, sizeof(*stat)); + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + lstcon_rpc_stat_total(stat, 1); + + LASSERT(crpc->crp_stamp_ns != 0); + + error = lstcon_rpc_get_reply(crpc, &rep); + if (error != 0) { + lstcon_rpc_stat_failure(stat, 1); + if (stat->trs_rpc_errno == 0) + stat->trs_rpc_errno = -error; + + continue; + } + + lstcon_rpc_stat_success(stat, 1); + + lstcon_rpc_stat_reply(trans, rep, crpc->crp_node, stat); + } + + if (trans->tas_opc == LST_TRANS_SESNEW && stat->trs_fwk_errno == 0) { + stat->trs_fwk_errno = + lstcon_session_feats_check(trans->tas_features); + } + + CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, " + "RPC error(%d), Framework error(%d)\n", + lstcon_rpc_trans_name(trans->tas_opc), + lstcon_rpc_stat_success(stat, 0), + lstcon_rpc_stat_failure(stat, 0), + lstcon_rpc_stat_total(stat, 0), + stat->trs_rpc_errno, stat->trs_fwk_errno); +} + +int +lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, + struct list_head __user *head_up, + lstcon_rpc_readent_func_t readent) +{ + struct list_head tmp; + struct list_head __user *next; + struct lstcon_rpc_ent *ent; + struct srpc_generic_reply *rep; + struct lstcon_rpc *crpc; + struct srpc_msg *msg; + struct lstcon_node *nd; + struct timespec64 ts; 
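In the transaction code above, lstcon_rpc_post() increments tas_remaining for every request handed to sfw_post_rpc(), lstcon_rpc_done() decrements it and wakes tas_waitq when the last reply (or abort) comes in, and lstcon_rpc_trans_postwait() sleeps on that counter with a bounded, interruptible wait, aborting whatever is still outstanding on timeout. The pthread sketch below models only the count-and-wake part of that flow; struct toy_trans, the fixed worker count, and the omission of the timeout/abort path are illustrative simplifications, not the kernel implementation.

	#include <pthread.h>
	#include <stdio.h>

	/* toy stand-in for a transaction: outstanding count + wait queue */
	struct toy_trans {
		pthread_mutex_t lock;
		pthread_cond_t  done;
		int             remaining;
	};

	static void toy_complete(struct toy_trans *t)
	{
		pthread_mutex_lock(&t->lock);
		if (--t->remaining == 0)
			pthread_cond_signal(&t->done);	/* wake the waiting console thread */
		pthread_mutex_unlock(&t->lock);
	}

	static void *worker(void *arg)
	{
		toy_complete(arg);			/* pretend the RPC replied */
		return NULL;
	}

	int main(void)					/* build with -lpthread */
	{
		struct toy_trans t = {
			.lock = PTHREAD_MUTEX_INITIALIZER,
			.done = PTHREAD_COND_INITIALIZER,
			.remaining = 3,
		};
		pthread_t tid[3];

		for (int i = 0; i < 3; i++)
			pthread_create(&tid[i], NULL, worker, &t);

		pthread_mutex_lock(&t.lock);
		while (t.remaining != 0)		/* a real caller would use a timed wait */
			pthread_cond_wait(&t.done, &t.lock);
		pthread_mutex_unlock(&t.lock);

		for (int i = 0; i < 3; i++)
			pthread_join(tid[i], NULL);

		printf("all replies in\n");
		return 0;
	}
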
+ int error; + s64 dur; + + LASSERT(head_up != NULL); + + next = head_up; + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + if (copy_from_user(&tmp, next, + sizeof(struct list_head))) + return -EFAULT; + + if (tmp.next == head_up) + return 0; + + next = tmp.next; + + ent = list_entry(next, struct lstcon_rpc_ent, rpe_link); + + LASSERT(crpc->crp_stamp_ns != 0); + + error = lstcon_rpc_get_reply(crpc, &msg); + + nd = crpc->crp_node; + + dur = crpc->crp_stamp_ns - + console_session.ses_id.ses_stamp * NSEC_PER_MSEC; + ts = ns_to_timespec64(dur); + + if (copy_to_user(&ent->rpe_peer, + &nd->nd_id, sizeof(struct lnet_process_id)) || + copy_to_user(&ent->rpe_stamp, &ts, sizeof(ts)) || + copy_to_user(&ent->rpe_state, + &nd->nd_state, sizeof(nd->nd_state)) || + copy_to_user(&ent->rpe_rpc_errno, &error, + sizeof(error))) + return -EFAULT; + + if (error != 0) + continue; + + /* RPC is done */ + rep = (struct srpc_generic_reply *)&msg->msg_body.reply; + + if (copy_to_user(&ent->rpe_sid, + &rep->sid, sizeof(rep->sid)) || + copy_to_user(&ent->rpe_fwk_errno, + &rep->status, sizeof(rep->status))) + return -EFAULT; + + if (readent == NULL) + continue; + + error = readent(trans->tas_opc, msg, ent); + if (error != 0) + return error; + } + + return 0; +} + +void +lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans) +{ + struct srpc_client_rpc *rpc; + struct lstcon_rpc *crpc; + struct lstcon_rpc *tmp; + int count = 0; + + list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list, crp_link) { + rpc = crpc->crp_rpc; + + spin_lock(&rpc->crpc_lock); + + /* free it if not posted or finished already */ + if (!crpc->crp_posted || crpc->crp_finished) { + spin_unlock(&rpc->crpc_lock); + + list_del_init(&crpc->crp_link); + lstcon_rpc_put(crpc); + + continue; + } + + /* rpcs can be still not callbacked (even LNetMDUnlink is + * called) because huge timeout for inaccessible network, + * don't make user wait for them, just abandon them, they + * will be recycled in callback */ + + LASSERT(crpc->crp_status != 0); + + crpc->crp_node = NULL; + crpc->crp_trans = NULL; + list_del_init(&crpc->crp_link); + count++; + + spin_unlock(&rpc->crpc_lock); + + atomic_dec(&trans->tas_remaining); + } + + LASSERT(atomic_read(&trans->tas_remaining) == 0); + + list_del(&trans->tas_link); + if (!list_empty(&trans->tas_olink)) + list_del(&trans->tas_olink); + + CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n", + lstcon_rpc_trans_name(trans->tas_opc), count); + + LIBCFS_FREE(trans, sizeof(*trans)); +} + +int +lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, + unsigned int feats, struct lstcon_rpc **crpc) +{ + struct srpc_mksn_reqst *msrq; + struct srpc_rmsn_reqst *rsrq; + int rc; + + switch (transop) { + case LST_TRANS_SESNEW: + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION, + feats, 0, 0, crpc); + if (rc != 0) + return rc; + + msrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst; + msrq->mksn_sid = console_session.ses_id; + msrq->mksn_force = console_session.ses_force; + strlcpy(msrq->mksn_name, console_session.ses_name, + sizeof(msrq->mksn_name)); + break; + + case LST_TRANS_SESEND: + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION, + feats, 0, 0, crpc); + if (rc != 0) + return rc; + + rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst; + rsrq->rmsn_sid = console_session.ses_id; + break; + + default: + LBUG(); + } + + return 0; +} + +int +lstcon_dbgrpc_prep(struct lstcon_node *nd, unsigned int feats, + struct lstcon_rpc **crpc) +{ + struct srpc_debug_reqst *drq; + int rc; + + 
rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + drq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; + + drq->dbg_sid = console_session.ses_id; + drq->dbg_flags = 0; + + return rc; +} + +int +lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, + struct lstcon_tsb_hdr *tsb, struct lstcon_rpc **crpc) +{ + struct lstcon_batch *batch; + struct srpc_batch_reqst *brq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + brq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst; + + brq->bar_sid = console_session.ses_id; + brq->bar_bid = tsb->tsb_id; + brq->bar_testidx = tsb->tsb_index; + brq->bar_opc = transop == LST_TRANS_TSBRUN ? SRPC_BATCH_OPC_RUN : + (transop == LST_TRANS_TSBSTOP ? SRPC_BATCH_OPC_STOP: + SRPC_BATCH_OPC_QUERY); + + if (transop != LST_TRANS_TSBRUN && + transop != LST_TRANS_TSBSTOP) + return 0; + + LASSERT (tsb->tsb_index == 0); + + batch = (struct lstcon_batch *)tsb; + brq->bar_arg = batch->bat_arg; + + return 0; +} + +int +lstcon_statrpc_prep(struct lstcon_node *nd, unsigned int feats, + struct lstcon_rpc **crpc) +{ + struct srpc_stat_reqst *srq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + srq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst; + + srq->str_sid = console_session.ses_id; + srq->str_type = 0; /* XXX remove it */ + + return 0; +} + +static struct lnet_process_id_packed * +lstcon_next_id(int idx, int nkiov, struct bio_vec *kiov) +{ + struct lnet_process_id_packed *pid; + int i; + + i = idx / SFW_ID_PER_PAGE; + + LASSERT (i < nkiov); + + pid = (struct lnet_process_id_packed *)page_address(kiov[i].bv_page); + + return &pid[idx % SFW_ID_PER_PAGE]; +} + +static int +lstcon_dstnodes_prep(struct lstcon_group *grp, int idx, + int dist, int span, int nkiov, struct bio_vec *kiov) +{ + struct lnet_process_id_packed *pid; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int start; + int end; + int i = 0; + + LASSERT (dist >= 1); + LASSERT (span >= 1); + LASSERT (grp->grp_nnode >= 1); + + if (span > grp->grp_nnode) + return -EINVAL; + + start = ((idx / dist) * span) % grp->grp_nnode; + end = ((idx / dist) * span + span - 1) % grp->grp_nnode; + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { + nd = ndl->ndl_node; + if (i < start) { + i++; + continue; + } + + if (i > (end >= start ? 
end : grp->grp_nnode)) + break; + + pid = lstcon_next_id((i - start), nkiov, kiov); + pid->nid = nd->nd_id.nid; + pid->pid = nd->nd_id.pid; + i++; + } + + if (start <= end) /* done */ + return 0; + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { + if (i > grp->grp_nnode + end) + break; + + nd = ndl->ndl_node; + pid = lstcon_next_id((i - start), nkiov, kiov); + pid->nid = nd->nd_id.nid; + pid->pid = nd->nd_id.pid; + i++; + } + + return 0; +} + +static int +lstcon_pingrpc_prep(struct lst_test_ping_param *param, + struct srpc_test_reqst *req) +{ + struct test_ping_req *prq = &req->tsr_u.ping; + + prq->png_size = param->png_size; + prq->png_flags = param->png_flags; + /* TODO dest */ + return 0; +} + +static int +lstcon_bulkrpc_v0_prep(struct lst_test_bulk_param *param, + struct srpc_test_reqst *req) +{ + struct test_bulk_req *brq = &req->tsr_u.bulk_v0; + + brq->blk_opc = param->blk_opc; + brq->blk_npg = (param->blk_size + PAGE_SIZE - 1) / + PAGE_SIZE; + brq->blk_flags = param->blk_flags; + + return 0; +} + +static int +lstcon_bulkrpc_v1_prep(struct lst_test_bulk_param *param, bool is_client, + struct srpc_test_reqst *req) +{ + struct test_bulk_req_v1 *brq = &req->tsr_u.bulk_v1; + + brq->blk_opc = param->blk_opc; + brq->blk_flags = param->blk_flags; + brq->blk_len = param->blk_size; + brq->blk_offset = is_client ? param->blk_cli_off : param->blk_srv_off; + + return 0; +} + +int +lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, + struct lstcon_test *test, struct lstcon_rpc **crpc) +{ + struct lstcon_group *sgrp = test->tes_src_grp; + struct lstcon_group *dgrp = test->tes_dst_grp; + struct srpc_test_reqst *trq; + struct srpc_bulk *bulk; + int i; + int npg = 0; + int nob = 0; + int rc = 0; + + if (transop == LST_TRANS_TSBCLIADD) { + npg = sfw_id_pages(test->tes_span); + nob = (feats & LST_FEAT_BULK_LEN) == 0 ? + npg * PAGE_SIZE : + sizeof(struct lnet_process_id_packed) * test->tes_span; + } + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc); + if (rc != 0) + return rc; + + trq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst; + + if (transop == LST_TRANS_TSBSRVADD) { + int ndist = (sgrp->grp_nnode + test->tes_dist - 1) / test->tes_dist; + int nspan = (dgrp->grp_nnode + test->tes_span - 1) / test->tes_span; + int nmax = (ndist + nspan - 1) / nspan; + + trq->tsr_ndest = 0; + trq->tsr_loop = nmax * test->tes_dist * test->tes_concur; + + } else { + bulk = &(*crpc)->crp_rpc->crpc_bulk; + + for (i = 0; i < npg; i++) { + int len; + + LASSERT(nob > 0); + + len = (feats & LST_FEAT_BULK_LEN) == 0 ? + PAGE_SIZE : min_t(int, nob, PAGE_SIZE); + nob -= len; + + bulk->bk_iovs[i].bv_offset = 0; + bulk->bk_iovs[i].bv_len = len; + bulk->bk_iovs[i].bv_page = + alloc_page(GFP_KERNEL); + + if (bulk->bk_iovs[i].bv_page == NULL) { + lstcon_rpc_put(*crpc); + return -ENOMEM; + } + } + + bulk->bk_sink = 0; + + LASSERT (transop == LST_TRANS_TSBCLIADD); + + rc = lstcon_dstnodes_prep(test->tes_dst_grp, + test->tes_cliidx++, + test->tes_dist, + test->tes_span, + npg, &bulk->bk_iovs[0]); + if (rc != 0) { + lstcon_rpc_put(*crpc); + return rc; + } + + trq->tsr_ndest = test->tes_span; + trq->tsr_loop = test->tes_loop; + } + + trq->tsr_sid = console_session.ses_id; + trq->tsr_bid = test->tes_hdr.tsb_id; + trq->tsr_concur = test->tes_concur; + trq->tsr_is_client = (transop == LST_TRANS_TSBCLIADD) ? 
1 : 0; + trq->tsr_stop_onerr = !!test->tes_stop_onerr; + + switch (test->tes_type) { + case LST_TEST_PING: + trq->tsr_service = SRPC_SERVICE_PING; + rc = lstcon_pingrpc_prep((struct lst_test_ping_param *) + &test->tes_param[0], trq); + break; + + case LST_TEST_BULK: + trq->tsr_service = SRPC_SERVICE_BRW; + if ((feats & LST_FEAT_BULK_LEN) == 0) { + rc = lstcon_bulkrpc_v0_prep((struct lst_test_bulk_param *) + &test->tes_param[0], trq); + } else { + rc = lstcon_bulkrpc_v1_prep((struct lst_test_bulk_param *) + &test->tes_param[0], + trq->tsr_is_client, trq); + } + + break; + default: + LBUG(); + break; + } + + return rc; +} + +static int +lstcon_sesnew_stat_reply(struct lstcon_rpc_trans *trans, + struct lstcon_node *nd, struct srpc_msg *reply) +{ + struct srpc_mksn_reply *mksn_rep = &reply->msg_body.mksn_reply; + int status = mksn_rep->mksn_status; + + if (status == 0 && + (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + mksn_rep->mksn_status = EPROTO; + status = EPROTO; + } + + if (status == EPROTO) { + CNETERR("session protocol error from %s: %u\n", + libcfs_nid2str(nd->nd_id.nid), + reply->msg_ses_feats); + } + + if (status != 0) + return status; + + if (!trans->tas_feats_updated) { + spin_lock(&console_session.ses_rpc_lock); + if (!trans->tas_feats_updated) { /* recheck with lock */ + trans->tas_feats_updated = 1; + trans->tas_features = reply->msg_ses_feats; + } + spin_unlock(&console_session.ses_rpc_lock); + } + + if (reply->msg_ses_feats != trans->tas_features) { + CNETERR("Framework features %x from %s is different with " + "features on this transaction: %x\n", + reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid), + trans->tas_features); + status = mksn_rep->mksn_status = EPROTO; + } + + if (status == 0) { + /* session timeout on remote node */ + nd->nd_timeout = mksn_rep->mksn_timeout; + } + + return status; +} + +void +lstcon_rpc_stat_reply(struct lstcon_rpc_trans *trans, struct srpc_msg *msg, + struct lstcon_node *nd, struct lstcon_trans_stat *stat) +{ + struct srpc_rmsn_reply *rmsn_rep; + struct srpc_debug_reply *dbg_rep; + struct srpc_batch_reply *bat_rep; + struct srpc_test_reply *test_rep; + struct srpc_stat_reply *stat_rep; + int rc = 0; + + switch (trans->tas_opc) { + case LST_TRANS_SESNEW: + rc = lstcon_sesnew_stat_reply(trans, nd, msg); + if (rc == 0) { + lstcon_sesop_stat_success(stat, 1); + return; + } + + lstcon_sesop_stat_failure(stat, 1); + break; + + case LST_TRANS_SESEND: + rmsn_rep = &msg->msg_body.rmsn_reply; + /* ESRCH is not an error for end session */ + if (rmsn_rep->rmsn_status == 0 || + rmsn_rep->rmsn_status == ESRCH) { + lstcon_sesop_stat_success(stat, 1); + return; + } + + lstcon_sesop_stat_failure(stat, 1); + rc = rmsn_rep->rmsn_status; + break; + + case LST_TRANS_SESQRY: + case LST_TRANS_SESPING: + dbg_rep = &msg->msg_body.dbg_reply; + + if (dbg_rep->dbg_status == ESRCH) { + lstcon_sesqry_stat_unknown(stat, 1); + return; + } + + if (lstcon_session_match(dbg_rep->dbg_sid)) + lstcon_sesqry_stat_active(stat, 1); + else + lstcon_sesqry_stat_busy(stat, 1); + return; + + case LST_TRANS_TSBRUN: + case LST_TRANS_TSBSTOP: + bat_rep = &msg->msg_body.bat_reply; + + if (bat_rep->bar_status == 0) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + if (bat_rep->bar_status == EPERM && + trans->tas_opc == LST_TRANS_TSBSTOP) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + lstcon_tsbop_stat_failure(stat, 1); + rc = bat_rep->bar_status; + break; + + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + bat_rep = &msg->msg_body.bat_reply; + + if 
(bat_rep->bar_active != 0) + lstcon_tsbqry_stat_run(stat, 1); + else + lstcon_tsbqry_stat_idle(stat, 1); + + if (bat_rep->bar_status == 0) + return; + + lstcon_tsbqry_stat_failure(stat, 1); + rc = bat_rep->bar_status; + break; + + case LST_TRANS_TSBCLIADD: + case LST_TRANS_TSBSRVADD: + test_rep = &msg->msg_body.tes_reply; + + if (test_rep->tsr_status == 0) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + lstcon_tsbop_stat_failure(stat, 1); + rc = test_rep->tsr_status; + break; + + case LST_TRANS_STATQRY: + stat_rep = &msg->msg_body.stat_reply; + + if (stat_rep->str_status == 0) { + lstcon_statqry_stat_success(stat, 1); + return; + } + + lstcon_statqry_stat_failure(stat, 1); + rc = stat_rep->str_status; + break; + + default: + LBUG(); + } + + if (stat->trs_fwk_errno == 0) + stat->trs_fwk_errno = rc; +} + +int +lstcon_rpc_trans_ndlist(struct list_head *ndlist, + struct list_head *translist, int transop, + void *arg, lstcon_rpc_cond_func_t condition, + struct lstcon_rpc_trans **transpp) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + struct lstcon_rpc *rpc; + unsigned int feats; + int rc; + + /* Creating session RPC for list of nodes */ + + rc = lstcon_rpc_trans_prep(translist, transop, &trans); + if (rc != 0) { + CERROR("Can't create transaction %d: %d\n", transop, rc); + return rc; + } + + feats = trans->tas_features; + list_for_each_entry(ndl, ndlist, ndl_link) { + rc = condition == NULL ? 1 : + condition(transop, ndl->ndl_node, arg); + + if (rc == 0) + continue; + + if (rc < 0) { + CDEBUG(D_NET, "Condition error while creating RPC " + "for transaction %d: %d\n", transop, rc); + break; + } + + nd = ndl->ndl_node; + + switch (transop) { + case LST_TRANS_SESNEW: + case LST_TRANS_SESEND: + rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc); + break; + case LST_TRANS_SESQRY: + case LST_TRANS_SESPING: + rc = lstcon_dbgrpc_prep(nd, feats, &rpc); + break; + case LST_TRANS_TSBCLIADD: + case LST_TRANS_TSBSRVADD: + rc = lstcon_testrpc_prep(nd, transop, feats, + (struct lstcon_test *)arg, + &rpc); + break; + case LST_TRANS_TSBRUN: + case LST_TRANS_TSBSTOP: + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + rc = lstcon_batrpc_prep(nd, transop, feats, + (struct lstcon_tsb_hdr *)arg, + &rpc); + break; + case LST_TRANS_STATQRY: + rc = lstcon_statrpc_prep(nd, feats, &rpc); + break; + default: + rc = -EINVAL; + break; + } + + if (rc != 0) { + CERROR("Failed to create RPC for transaction %s: %d\n", + lstcon_rpc_trans_name(transop), rc); + break; + } + + lstcon_rpc_trans_addreq(trans, rpc); + } + + if (rc == 0) { + *transpp = trans; + return 0; + } + + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +static void +lstcon_rpc_pinger(void *arg) +{ + struct stt_timer *ptimer = arg; + struct lstcon_rpc_trans *trans; + struct lstcon_rpc *crpc; + struct srpc_msg *rep; + struct srpc_debug_reqst *drq; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int intv; + int count = 0; + int rc; + + /* RPC pinger is a special case of transaction, + * it's called by the timer at 8-second intervals.
+ */ + mutex_lock(&console_session.ses_mutex); + + if (console_session.ses_shutdown || console_session.ses_expired) { + mutex_unlock(&console_session.ses_mutex); + return; + } + + if (!console_session.ses_expired && + ktime_get_real_seconds() - console_session.ses_laststamp > + (time64_t)console_session.ses_timeout) + console_session.ses_expired = 1; + + trans = console_session.ses_ping; + + LASSERT(trans != NULL); + + list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) { + nd = ndl->ndl_node; + + if (console_session.ses_expired) { + /* idle console, end session on all nodes */ + if (nd->nd_state != LST_NODE_ACTIVE) + continue; + + rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND, + trans->tas_features, &crpc); + if (rc != 0) { + CERROR("Out of memory\n"); + break; + } + + lstcon_rpc_trans_addreq(trans, crpc); + lstcon_rpc_post(crpc); + + continue; + } + + crpc = &nd->nd_ping; + + if (crpc->crp_rpc != NULL) { + LASSERT(crpc->crp_trans == trans); + LASSERT(!list_empty(&crpc->crp_link)); + + spin_lock(&crpc->crp_rpc->crpc_lock); + + LASSERT(crpc->crp_posted); + + if (!crpc->crp_finished) { + /* in flight */ + spin_unlock(&crpc->crp_rpc->crpc_lock); + continue; + } + + spin_unlock(&crpc->crp_rpc->crpc_lock); + + lstcon_rpc_get_reply(crpc, &rep); + + list_del_init(&crpc->crp_link); + + lstcon_rpc_put(crpc); + } + + if (nd->nd_state != LST_NODE_ACTIVE) + continue; + + intv = div_u64(ktime_ms_delta(ktime_get(), nd->nd_stamp), + MSEC_PER_SEC); + if (intv < nd->nd_timeout / 2) + continue; + + rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG, + trans->tas_features, 0, 0, 1, crpc); + if (rc != 0) { + CERROR("Out of memory\n"); + break; + } + + drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; + + drq->dbg_sid = console_session.ses_id; + drq->dbg_flags = 0; + + lstcon_rpc_trans_addreq(trans, crpc); + lstcon_rpc_post(crpc); + + count++; + } + + if (console_session.ses_expired) { + mutex_unlock(&console_session.ses_mutex); + return; + } + + CDEBUG(D_NET, "Ping %d nodes in session\n", count); + + ptimer->stt_expires = ktime_get_real_seconds() + LST_PING_INTERVAL; + stt_add_timer(ptimer); + + mutex_unlock(&console_session.ses_mutex); +} + +int +lstcon_rpc_pinger_start(void) +{ + struct stt_timer *ptimer; + int rc; + + LASSERT(list_empty(&console_session.ses_rpc_freelist)); + LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0); + + rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING, + &console_session.ses_ping); + if (rc != 0) { + CERROR("Failed to create console pinger\n"); + return rc; + } + + ptimer = &console_session.ses_ping_timer; + ptimer->stt_expires = ktime_get_real_seconds() + LST_PING_INTERVAL; + + stt_add_timer(ptimer); + + return 0; +} + +void +lstcon_rpc_pinger_stop(void) +{ + LASSERT (console_session.ses_shutdown); + + stt_del_timer(&console_session.ses_ping_timer); + + lstcon_rpc_trans_abort(console_session.ses_ping, -ESHUTDOWN); + lstcon_rpc_trans_stat(console_session.ses_ping, lstcon_trans_stat()); + lstcon_rpc_trans_destroy(console_session.ses_ping); + + memset(lstcon_trans_stat(), 0, sizeof(struct lstcon_trans_stat)); + + console_session.ses_ping = NULL; +} + +void +lstcon_rpc_cleanup_wait(void) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_rpc *crpc; + struct list_head *pacer; + LIST_HEAD(zlist); + + /* Called with hold of global mutex */ + + LASSERT(console_session.ses_shutdown); + + while (!list_empty(&console_session.ses_trans_list)) { + list_for_each(pacer, &console_session.ses_trans_list) { + trans = list_entry(pacer, struct lstcon_rpc_trans, + 
tas_link); + + CDEBUG(D_NET, "Session closed, wakeup transaction %s\n", + lstcon_rpc_trans_name(trans->tas_opc)); + + wake_up(&trans->tas_waitq); + } + + mutex_unlock(&console_session.ses_mutex); + + CWARN("Session is shutting down, " + "waiting for termination of transactions\n"); + schedule_timeout_uninterruptible(cfs_time_seconds(1)); + + mutex_lock(&console_session.ses_mutex); + } + + spin_lock(&console_session.ses_rpc_lock); + + lst_wait_until((atomic_read(&console_session.ses_rpc_counter) == 0), + console_session.ses_rpc_lock, + "Network is not accessible or target is down, " + "waiting for %d console RPCs to be recycled\n", + atomic_read(&console_session.ses_rpc_counter)); + + list_splice_init(&console_session.ses_rpc_freelist, &zlist); + + spin_unlock(&console_session.ses_rpc_lock); + + while (!list_empty(&zlist)) { + crpc = list_entry(zlist.next, struct lstcon_rpc, crp_link); + + list_del(&crpc->crp_link); + LIBCFS_FREE(crpc, sizeof(*crpc)); + } +} + +int +lstcon_rpc_module_init(void) +{ + INIT_LIST_HEAD(&console_session.ses_ping_timer.stt_list); + console_session.ses_ping_timer.stt_func = lstcon_rpc_pinger; + console_session.ses_ping_timer.stt_data = &console_session.ses_ping_timer; + + console_session.ses_ping = NULL; + + spin_lock_init(&console_session.ses_rpc_lock); + atomic_set(&console_session.ses_rpc_counter, 0); + INIT_LIST_HEAD(&console_session.ses_rpc_freelist); + + return 0; +} + +void +lstcon_rpc_module_fini(void) +{ + LASSERT(list_empty(&console_session.ses_rpc_freelist)); + LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0); +} + diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.h b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h new file mode 100644 index 0000000000000..4defb121497fc --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h @@ -0,0 +1,145 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation.
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * /lnet/selftest/conrpc.h + * + * Console rpc + * + * Author: Liang Zhen + */ + +#ifndef __LST_CONRPC_H__ +#define __LST_CONRPC_H__ + +#include +#include +#include "rpc.h" +#include "selftest.h" + +/* Console rpc and rpc transaction */ +#define LST_TRANS_TIMEOUT 30 +#define LST_TRANS_MIN_TIMEOUT 3 + +#define LST_VALIDATE_TIMEOUT(t) \ + clamp_t(int, t, LST_TRANS_MIN_TIMEOUT, LST_TRANS_TIMEOUT) + +#define LST_PING_INTERVAL 8 + +struct lstcon_rpc_trans; +struct lstcon_tsb_hdr; +struct lstcon_test; +struct lstcon_node; + +struct lstcon_rpc { + struct list_head crp_link; /* chain on rpc transaction */ + struct srpc_client_rpc *crp_rpc; /* client rpc */ + struct lstcon_node *crp_node; /* destination node */ + struct lstcon_rpc_trans *crp_trans; /* conrpc transaction */ + + unsigned int crp_posted:1; /* rpc is posted */ + unsigned int crp_finished:1; /* rpc is finished */ + unsigned int crp_unpacked:1; /* reply is unpacked */ + /** RPC is embedded in other structure and can't free it */ + unsigned int crp_embedded:1; + int crp_status; /* console rpc errors */ + s64 crp_stamp_ns; /* replied time stamp */ +}; + +struct lstcon_rpc_trans { + /* link chain on owner list */ + struct list_head tas_olink; + /* link chain on global list */ + struct list_head tas_link; + /* operation code of transaction */ + int tas_opc; + /* features mask is uptodate */ + unsigned tas_feats_updated; + /* test features mask */ + unsigned tas_features; + wait_queue_head_t tas_waitq; /* wait queue head */ + atomic_t tas_remaining; /* # of un-scheduled rpcs */ + struct list_head tas_rpcs_list; /* queued requests */ +}; + +#define LST_TRANS_PRIVATE 0x1000 + +#define LST_TRANS_SESNEW (LST_TRANS_PRIVATE | 0x01) +#define LST_TRANS_SESEND (LST_TRANS_PRIVATE | 0x02) +#define LST_TRANS_SESQRY 0x03 +#define LST_TRANS_SESPING 0x04 + +#define LST_TRANS_TSBCLIADD (LST_TRANS_PRIVATE | 0x11) +#define LST_TRANS_TSBSRVADD (LST_TRANS_PRIVATE | 0x12) +#define LST_TRANS_TSBRUN (LST_TRANS_PRIVATE | 0x13) +#define LST_TRANS_TSBSTOP (LST_TRANS_PRIVATE | 0x14) +#define LST_TRANS_TSBCLIQRY 0x15 +#define LST_TRANS_TSBSRVQRY 0x16 + +#define LST_TRANS_STATQRY 0x21 + +typedef int (*lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *); +typedef int (*lstcon_rpc_readent_func_t)(int, struct srpc_msg *, + struct lstcon_rpc_ent __user *); + +int lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, + unsigned int version, struct lstcon_rpc **crpc); +int lstcon_dbgrpc_prep(struct lstcon_node *nd, + unsigned int version, struct lstcon_rpc **crpc); +int lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version, + struct lstcon_tsb_hdr *tsb, struct lstcon_rpc **crpc); +int lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version, + struct lstcon_test *test, struct lstcon_rpc **crpc); +int lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version, + struct lstcon_rpc **crpc); +void lstcon_rpc_put(struct lstcon_rpc *crpc); +int lstcon_rpc_trans_prep(struct list_head *translist, + int transop, struct lstcon_rpc_trans **transpp); +int lstcon_rpc_trans_ndlist(struct list_head *ndlist, + struct list_head *translist, int transop, + void *arg, lstcon_rpc_cond_func_t condition, + struct lstcon_rpc_trans **transpp); +void lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, + struct lstcon_trans_stat *stat); +int lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, + struct list_head __user *head_up, + lstcon_rpc_readent_func_t readent); +void 
lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error); +void lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans); +void lstcon_rpc_trans_addreq(struct lstcon_rpc_trans *trans, + struct lstcon_rpc *req); +int lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout); +int lstcon_rpc_pinger_start(void); +void lstcon_rpc_pinger_stop(void); +void lstcon_rpc_cleanup_wait(void); +int lstcon_rpc_module_init(void); +void lstcon_rpc_module_fini(void); + + +#endif diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.c b/drivers/staging/lustrefsx/lnet/selftest/console.c new file mode 100644 index 0000000000000..25de1f25242e0 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/console.c @@ -0,0 +1,2105 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/conctl.c + * + * Infrastructure of LST console + * + * Author: Liang Zhen + */ + +#include +#include +#include "console.h" +#include "conrpc.h" + +#define LST_NODE_STATE_COUNTER(nd, p) \ +do { \ + if ((nd)->nd_state == LST_NODE_ACTIVE) \ + (p)->nle_nactive ++; \ + else if ((nd)->nd_state == LST_NODE_BUSY) \ + (p)->nle_nbusy ++; \ + else if ((nd)->nd_state == LST_NODE_DOWN) \ + (p)->nle_ndown ++; \ + else \ + (p)->nle_nunknown ++; \ + (p)->nle_nnode ++; \ +} while (0) + +struct lstcon_session console_session; + +static void +lstcon_node_get(struct lstcon_node *nd) +{ + LASSERT (nd->nd_ref >= 1); + + nd->nd_ref++; +} + +static int +lstcon_node_find(struct lnet_process_id id, struct lstcon_node **ndpp, + int create) +{ + struct lstcon_ndlink *ndl; + unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE; + + LASSERT(id.nid != LNET_NID_ANY); + + list_for_each_entry(ndl, &console_session.ses_ndl_hash[idx], + ndl_hlink) { + if (ndl->ndl_node->nd_id.nid != id.nid || + ndl->ndl_node->nd_id.pid != id.pid) + continue; + + lstcon_node_get(ndl->ndl_node); + *ndpp = ndl->ndl_node; + return 0; + } + + if (!create) + return -ENOENT; + + LIBCFS_ALLOC(*ndpp, sizeof(**ndpp) + sizeof(*ndl)); + if (*ndpp == NULL) + return -ENOMEM; + + ndl = (struct lstcon_ndlink *)(*ndpp + 1); + + ndl->ndl_node = *ndpp; + + ndl->ndl_node->nd_ref = 1; + ndl->ndl_node->nd_id = id; + ndl->ndl_node->nd_stamp = ktime_get(); + ndl->ndl_node->nd_state = LST_NODE_UNKNOWN; + ndl->ndl_node->nd_timeout = 0; + memset(&ndl->ndl_node->nd_ping, 0, sizeof(ndl->ndl_node->nd_ping)); + + /* queued in global hash & list, no refcount is taken by + * global hash & list, if caller release his refcount, + * node will be released */ + 
list_add_tail(&ndl->ndl_hlink, &console_session.ses_ndl_hash[idx]); + list_add_tail(&ndl->ndl_link, &console_session.ses_ndl_list); + + return 0; +} + +static void +lstcon_node_put(struct lstcon_node *nd) +{ + struct lstcon_ndlink *ndl; + + LASSERT(nd->nd_ref > 0); + + if (--nd->nd_ref > 0) + return; + + ndl = (struct lstcon_ndlink *)(nd + 1); + + LASSERT(!list_empty(&ndl->ndl_link)); + LASSERT(!list_empty(&ndl->ndl_hlink)); + + /* remove from session */ + list_del(&ndl->ndl_link); + list_del(&ndl->ndl_hlink); + + LIBCFS_FREE(nd, sizeof(*nd) + sizeof(*ndl)); +} + +static int +lstcon_ndlink_find(struct list_head *hash, struct lnet_process_id id, + struct lstcon_ndlink **ndlpp, int create) +{ + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int rc; + + if (id.nid == LNET_NID_ANY) + return -EINVAL; + + /* search in hash */ + list_for_each_entry(ndl, &hash[idx], ndl_hlink) { + if (ndl->ndl_node->nd_id.nid != id.nid || + ndl->ndl_node->nd_id.pid != id.pid) + continue; + + *ndlpp = ndl; + return 0; + } + + if (create == 0) + return -ENOENT; + + /* find or create in session hash */ + rc = lstcon_node_find(id, &nd, (create == 1) ? 1 : 0); + if (rc != 0) + return rc; + + LIBCFS_ALLOC(ndl, sizeof(*ndl)); + if (ndl == NULL) { + lstcon_node_put(nd); + return -ENOMEM; + } + + *ndlpp = ndl; + + ndl->ndl_node = nd; + INIT_LIST_HEAD(&ndl->ndl_link); + list_add_tail(&ndl->ndl_hlink, &hash[idx]); + + return 0; +} + +static void +lstcon_ndlink_release(struct lstcon_ndlink *ndl) +{ + LASSERT(list_empty(&ndl->ndl_link)); + LASSERT(!list_empty(&ndl->ndl_hlink)); + + list_del(&ndl->ndl_hlink); /* delete from hash */ + lstcon_node_put(ndl->ndl_node); + + LIBCFS_FREE(ndl, sizeof(*ndl)); +} + +static int +lstcon_group_alloc(char *name, struct lstcon_group **grpp) +{ + struct lstcon_group *grp; + int i; + + LIBCFS_ALLOC(grp, offsetof(struct lstcon_group, + grp_ndl_hash[LST_NODE_HASHSIZE])); + if (grp == NULL) + return -ENOMEM; + + grp->grp_ref = 1; + if (name != NULL) { + if (strlen(name) > sizeof(grp->grp_name)-1) { + LIBCFS_FREE(grp, offsetof(struct lstcon_group, + grp_ndl_hash[LST_NODE_HASHSIZE])); + return -E2BIG; + } + strncpy(grp->grp_name, name, sizeof(grp->grp_name)); + } + + INIT_LIST_HEAD(&grp->grp_link); + INIT_LIST_HEAD(&grp->grp_ndl_list); + INIT_LIST_HEAD(&grp->grp_trans_list); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + INIT_LIST_HEAD(&grp->grp_ndl_hash[i]); + + *grpp = grp; + + return 0; +} + +static void +lstcon_group_addref(struct lstcon_group *grp) +{ + grp->grp_ref++; +} + +static void lstcon_group_ndlink_release(struct lstcon_group *, + struct lstcon_ndlink *); + +static void +lstcon_group_drain(struct lstcon_group *grp, int keep) +{ + struct lstcon_ndlink *ndl; + struct lstcon_ndlink *tmp; + + list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) { + if ((ndl->ndl_node->nd_state & keep) == 0) + lstcon_group_ndlink_release(grp, ndl); + } +} + +static void +lstcon_group_decref(struct lstcon_group *grp) +{ + int i; + + if (--grp->grp_ref > 0) + return; + + if (!list_empty(&grp->grp_link)) + list_del(&grp->grp_link); + + lstcon_group_drain(grp, 0); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + LASSERT(list_empty(&grp->grp_ndl_hash[i])); + + LIBCFS_FREE(grp, offsetof(struct lstcon_group, + grp_ndl_hash[LST_NODE_HASHSIZE])); +} + +static int +lstcon_group_find(const char *name, struct lstcon_group **grpp) +{ + struct lstcon_group *grp; + + list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { + if 
(strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0) + continue; + + lstcon_group_addref(grp); /* +1 ref for caller */ + *grpp = grp; + return 0; + } + + return -ENOENT; +} + +static int +lstcon_group_ndlink_find(struct lstcon_group *grp, struct lnet_process_id id, + struct lstcon_ndlink **ndlpp, int create) +{ + int rc; + + rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create); + if (rc != 0) + return rc; + + if (!list_empty(&(*ndlpp)->ndl_link)) + return 0; + + list_add_tail(&(*ndlpp)->ndl_link, &grp->grp_ndl_list); + grp->grp_nnode++; + + return 0; +} + +static void +lstcon_group_ndlink_release(struct lstcon_group *grp, struct lstcon_ndlink *ndl) +{ + list_del_init(&ndl->ndl_link); + lstcon_ndlink_release(ndl); + grp->grp_nnode--; +} + +static void +lstcon_group_ndlink_move(struct lstcon_group *old, + struct lstcon_group *new, struct lstcon_ndlink *ndl) +{ + unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) % + LST_NODE_HASHSIZE; + + old->grp_nnode--; + + list_move_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[idx]); + list_move_tail(&ndl->ndl_link, &new->grp_ndl_list); + new->grp_nnode++; +} + +static void +lstcon_group_move(struct lstcon_group *old, struct lstcon_group *new) +{ + struct lstcon_ndlink *ndl; + + while (!list_empty(&old->grp_ndl_list)) { + ndl = list_entry(old->grp_ndl_list.next, + struct lstcon_ndlink, ndl_link); + lstcon_group_ndlink_move(old, new, ndl); + } +} + +static int +lstcon_sesrpc_condition(int transop, struct lstcon_node *nd, void *arg) +{ + struct lstcon_group *grp = arg; + + switch (transop) { + case LST_TRANS_SESNEW: + if (nd->nd_state == LST_NODE_ACTIVE) + return 0; + break; + + case LST_TRANS_SESEND: + if (nd->nd_state != LST_NODE_ACTIVE) + return 0; + + if (grp != NULL && nd->nd_ref > 1) + return 0; + break; + + case LST_TRANS_SESQRY: + break; + + default: + LBUG(); + } + + return 1; +} + +static int +lstcon_sesrpc_readent(int transop, struct srpc_msg *msg, + struct lstcon_rpc_ent __user *ent_up) +{ + struct srpc_debug_reply *rep; + + switch (transop) { + case LST_TRANS_SESNEW: + case LST_TRANS_SESEND: + return 0; + + case LST_TRANS_SESQRY: + rep = &msg->msg_body.dbg_reply; + + if (copy_to_user(&ent_up->rpe_priv[0], + &rep->dbg_timeout, sizeof(int)) || + copy_to_user(&ent_up->rpe_payload[0], + &rep->dbg_name, LST_NAME_SIZE)) + return -EFAULT; + + return 0; + + default: + LBUG(); + } + + return 0; +} + +static int +lstcon_group_nodes_add(struct lstcon_group *grp, + int count, struct lnet_process_id __user *ids_up, + unsigned int *featp, + struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_ndlink *ndl; + struct lstcon_group *tmp; + struct lnet_process_id id; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0 ; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* skip if it's in this group already */ + rc = lstcon_group_ndlink_find(grp, id, &ndl, 0); + if (rc == 0) + continue; + + /* add to tmp group */ + rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1); + if (rc != 0) { + CERROR("Can't create ndlink, out of memory\n"); + break; + } + } + + if (rc != 0) { + lstcon_group_decref(tmp); + return rc; + } + + rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, + &tmp->grp_trans_list, LST_TRANS_SESNEW, + tmp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + lstcon_group_decref(tmp); + return rc; + } + + /* post all RPCs */ 
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_sesrpc_readent); + *featp = trans->tas_features; + + /* destroy all RPCs */ + lstcon_rpc_trans_destroy(trans); + + lstcon_group_move(tmp, grp); + lstcon_group_decref(tmp); + + return rc; +} + +static int +lstcon_group_nodes_remove(struct lstcon_group *grp, + int count, struct lnet_process_id __user *ids_up, + struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_ndlink *ndl; + struct lstcon_group *tmp; + struct lnet_process_id id; + int rc; + int i; + + /* End session and remove node from the group */ + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + goto error; + } + + /* move node to tmp group */ + if (lstcon_group_ndlink_find(grp, id, &ndl, 0) == 0) + lstcon_group_ndlink_move(grp, tmp, ndl); + } + + rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, + &tmp->grp_trans_list, LST_TRANS_SESEND, + tmp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + goto error; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* release nodes anyway, because we can't rollback status */ + lstcon_group_decref(tmp); + + return rc; +error: + lstcon_group_move(tmp, grp); + lstcon_group_decref(tmp); + + return rc; +} + +int +lstcon_group_add(char *name) +{ + struct lstcon_group *grp; + int rc; + + rc = (lstcon_group_find(name, &grp) == 0)? -EEXIST: 0; + if (rc != 0) { + /* a group with the same name already exists */ + lstcon_group_decref(grp); + return rc; + } + + rc = lstcon_group_alloc(name, &grp); + if (rc != 0) { + CERROR("Can't allocate descriptor for group %s\n", name); + return -ENOMEM; + } + + list_add_tail(&grp->grp_link, &console_session.ses_grp_list); + + return rc; +} + +int +lstcon_nodes_add(char *name, int count, struct lnet_process_id __user *ids_up, + unsigned *featp, struct list_head __user *result_up) +{ + struct lstcon_group *grp; + int rc; + + LASSERT (count > 0); + LASSERT (ids_up != NULL); + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by other threads or test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + + return -EBUSY; + } + + rc = lstcon_group_nodes_add(grp, count, ids_up, featp, result_up); + + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_group_del(char *name) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by other threads or test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + return -EBUSY; + } + + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &grp->grp_trans_list, LST_TRANS_SESEND, + grp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + lstcon_group_decref(grp); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + lstcon_rpc_trans_destroy(trans); + + lstcon_group_decref(grp); + /* -ref for session, it's destroyed, + * status can't be rolled back, destroy group
anyway */ + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_group_clean(char *name, int args) +{ + struct lstcon_group *grp = NULL; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + return -EBUSY; + } + + args = (LST_NODE_ACTIVE | LST_NODE_BUSY | + LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args; + + lstcon_group_drain(grp, args); + + lstcon_group_decref(grp); + /* release empty group */ + if (list_empty(&grp->grp_ndl_list)) + lstcon_group_decref(grp); + + return 0; +} + +int +lstcon_nodes_remove(char *name, int count, + struct lnet_process_id __user *ids_up, + struct list_head __user *result_up) +{ + struct lstcon_group *grp = NULL; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + return -EBUSY; + } + + rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up); + + lstcon_group_decref(grp); + /* release empty group */ + if (list_empty(&grp->grp_ndl_list)) + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_group_refresh(char *name, struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + return -EBUSY; + } + + /* re-invite all inactive nodes into the group */ + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &grp->grp_trans_list, LST_TRANS_SESNEW, + grp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + /* local error, return */ + CDEBUG(D_NET, "Can't create transaction: %d\n", rc); + lstcon_group_decref(grp); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* -ref for me */ + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_group_list(int index, int len, char __user *name_up) +{ + struct lstcon_group *grp; + + LASSERT(index >= 0); + LASSERT(name_up != NULL); + + list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { + if (index-- == 0) { + return copy_to_user(name_up, grp->grp_name, len) ?
+ -EFAULT : 0; + } + } + + return -ENOENT; +} + +static int +lstcon_nodes_getent(struct list_head *head, int *index_p, + int *count_p, struct lstcon_node_ent __user *dents_up) +{ + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int count = 0; + int index = 0; + + LASSERT(index_p != NULL && count_p != NULL); + LASSERT(dents_up != NULL); + LASSERT(*index_p >= 0); + LASSERT(*count_p > 0); + + list_for_each_entry(ndl, head, ndl_link) { + if (index++ < *index_p) + continue; + + if (count >= *count_p) + break; + + nd = ndl->ndl_node; + if (copy_to_user(&dents_up[count].nde_id, + &nd->nd_id, sizeof(nd->nd_id)) || + copy_to_user(&dents_up[count].nde_state, + &nd->nd_state, sizeof(nd->nd_state))) + return -EFAULT; + + count ++; + } + + if (index <= *index_p) + return -ENOENT; + + *count_p = count; + *index_p = index; + + return 0; +} + +int +lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gents_p, + int *index_p, int *count_p, + struct lstcon_node_ent __user *dents_up) +{ + struct lstcon_ndlist_ent *gentp; + struct lstcon_group *grp; + struct lstcon_ndlink *ndl; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (dents_up != NULL) { + /* verbose query */ + rc = lstcon_nodes_getent(&grp->grp_ndl_list, + index_p, count_p, dents_up); + lstcon_group_decref(grp); + + return rc; + } + + /* non-verbose query */ + CFS_ALLOC_PTR(gentp); + if (gentp == NULL) { + CERROR("Can't allocate ndlist_ent\n"); + lstcon_group_decref(grp); + + return -ENOMEM; + } + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp); + + rc = copy_to_user(gents_p, gentp, + sizeof(struct lstcon_ndlist_ent)) ? -EFAULT : 0; + + CFS_FREE_PTR(gentp); + + lstcon_group_decref(grp); + + return 0; +} + +static int +lstcon_batch_find(const char *name, struct lstcon_batch **batpp) +{ + struct lstcon_batch *bat; + + list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { + if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) { + *batpp = bat; + return 0; + } + } + + return -ENOENT; +} + +int +lstcon_batch_add(char *name) +{ + struct lstcon_batch *bat; + int i; + int rc; + + rc = (lstcon_batch_find(name, &bat) == 0)? 
-EEXIST: 0; + if (rc != 0) { + CDEBUG(D_NET, "Batch %s already exists\n", name); + return rc; + } + + LIBCFS_ALLOC(bat, sizeof(*bat)); + if (bat == NULL) { + CERROR("Can't allocate descriptor for batch %s\n", name); + return -ENOMEM; + } + + CFS_ALLOC_PTR_ARRAY(bat->bat_cli_hash, LST_NODE_HASHSIZE); + if (bat->bat_cli_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat, sizeof(*bat)); + + return -ENOMEM; + } + + CFS_ALLOC_PTR_ARRAY(bat->bat_srv_hash, LST_NODE_HASHSIZE); + if (bat->bat_srv_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(*bat)); + + return -ENOMEM; + } + + if (strlen(name) > sizeof(bat->bat_name)-1) { + LIBCFS_FREE(bat->bat_srv_hash, LST_NODE_HASHSIZE); + LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(*bat)); + return -E2BIG; + } + strncpy(bat->bat_name, name, sizeof(bat->bat_name)); + bat->bat_hdr.tsb_index = 0; + bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie; + + bat->bat_ntest = 0; + bat->bat_state = LST_BATCH_IDLE; + + INIT_LIST_HEAD(&bat->bat_cli_list); + INIT_LIST_HEAD(&bat->bat_srv_list); + INIT_LIST_HEAD(&bat->bat_test_list); + INIT_LIST_HEAD(&bat->bat_trans_list); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + INIT_LIST_HEAD(&bat->bat_cli_hash[i]); + INIT_LIST_HEAD(&bat->bat_srv_hash[i]); + } + + list_add_tail(&bat->bat_link, &console_session.ses_bat_list); + + return rc; +} + +int +lstcon_batch_list(int index, int len, char __user *name_up) +{ + struct lstcon_batch *bat; + + LASSERT(name_up != NULL); + LASSERT(index >= 0); + + list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { + if (index-- == 0) { + return copy_to_user(name_up, bat->bat_name, len) ? + -EFAULT : 0; + } + } + + return -ENOENT; +} + +int +lstcon_batch_info(char *name, struct lstcon_test_batch_ent __user *ent_up, + int server, int testidx, int *index_p, int *ndent_p, + struct lstcon_node_ent __user *dents_up) +{ + struct lstcon_test_batch_ent *entp; + struct list_head *clilst; + struct list_head *srvlst; + struct lstcon_test *test = NULL; + struct lstcon_batch *bat; + struct lstcon_ndlink *ndl; + int rc; + + rc = lstcon_batch_find(name, &bat); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + if (testidx > 0) { + /* query test, test index start from 1 */ + list_for_each_entry(test, &bat->bat_test_list, tes_link) { + if (testidx-- == 1) + break; + } + + if (testidx > 0) { + CDEBUG(D_NET, "Can't find specified test in batch\n"); + return -ENOENT; + } + } + + clilst = (test == NULL) ? &bat->bat_cli_list : + &test->tes_src_grp->grp_ndl_list; + srvlst = (test == NULL) ? &bat->bat_srv_list : + &test->tes_dst_grp->grp_ndl_list; + + if (dents_up != NULL) { + rc = lstcon_nodes_getent((server ? 
srvlst: clilst), + index_p, ndent_p, dents_up); + return rc; + } + + /* non-verbose query */ + CFS_ALLOC_PTR(entp); + if (entp == NULL) + return -ENOMEM; + + if (test == NULL) { + entp->u.tbe_batch.bae_ntest = bat->bat_ntest; + entp->u.tbe_batch.bae_state = bat->bat_state; + + } else { + + entp->u.tbe_test.tse_type = test->tes_type; + entp->u.tbe_test.tse_loop = test->tes_loop; + entp->u.tbe_test.tse_concur = test->tes_concur; + } + + list_for_each_entry(ndl, clilst, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle); + + list_for_each_entry(ndl, srvlst, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle); + + rc = copy_to_user(ent_up, entp, + sizeof(struct lstcon_test_batch_ent)) ? -EFAULT : 0; + + CFS_FREE_PTR(entp); + + return rc; +} + +static int +lstcon_batrpc_condition(int transop, struct lstcon_node *nd, void *arg) +{ + switch (transop) { + case LST_TRANS_TSBRUN: + if (nd->nd_state != LST_NODE_ACTIVE) + return -ENETDOWN; + break; + + case LST_TRANS_TSBSTOP: + if (nd->nd_state != LST_NODE_ACTIVE) + return 0; + break; + + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + break; + } + + return 1; +} + +static int +lstcon_batch_op(struct lstcon_batch *bat, int transop, + struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list, + &bat->bat_trans_list, transop, + bat, lstcon_batrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_batch_run(char *name, int timeout, struct list_head __user *result_up) +{ + struct lstcon_batch *bat; + int rc; + + if (lstcon_batch_find(name, &bat) != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + bat->bat_arg = timeout; + + rc = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up); + + /* mark batch as running if it's started on any node */ + if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0) != 0) + bat->bat_state = LST_BATCH_RUNNING; + + return rc; +} + +int +lstcon_batch_stop(char *name, int force, struct list_head __user *result_up) +{ + struct lstcon_batch *bat; + int rc; + + if (lstcon_batch_find(name, &bat) != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + bat->bat_arg = force; + + rc = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up); + + /* mark batch as stopped if all RPCs finished */ + if (lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0) == 0) + bat->bat_state = LST_BATCH_IDLE; + + return rc; +} + +static void +lstcon_batch_destroy(struct lstcon_batch *bat) +{ + struct lstcon_ndlink *ndl; + struct lstcon_test *test; + int i; + + list_del(&bat->bat_link); + + while (!list_empty(&bat->bat_test_list)) { + test = list_entry(bat->bat_test_list.next, + struct lstcon_test, tes_link); + LASSERT(list_empty(&test->tes_trans_list)); + + list_del(&test->tes_link); + + lstcon_group_decref(test->tes_src_grp); + lstcon_group_decref(test->tes_dst_grp); + + LIBCFS_FREE(test, offsetof(struct lstcon_test, + tes_param[test->tes_paramlen])); + } + + LASSERT(list_empty(&bat->bat_trans_list)); + + while (!list_empty(&bat->bat_cli_list)) { + ndl = list_entry(bat->bat_cli_list.next, + struct lstcon_ndlink, ndl_link); + list_del_init(&ndl->ndl_link); + + lstcon_ndlink_release(ndl); + } + + while (!list_empty(&bat->bat_srv_list)) { + ndl =
list_entry(bat->bat_srv_list.next, + struct lstcon_ndlink, ndl_link); + list_del_init(&ndl->ndl_link); + + lstcon_ndlink_release(ndl); + } + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + LASSERT(list_empty(&bat->bat_cli_hash[i])); + LASSERT(list_empty(&bat->bat_srv_hash[i])); + } + + LIBCFS_FREE(bat->bat_cli_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat->bat_srv_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(*bat)); +} + +static int +lstcon_testrpc_condition(int transop, struct lstcon_node *nd, void *arg) +{ + struct lstcon_test *test = arg; + struct lstcon_batch *batch; + struct lstcon_ndlink *ndl; + struct list_head *hash; + struct list_head *head; + + LASSERT(test != NULL); + + batch = test->tes_batch; + LASSERT(batch != NULL); + + if (test->tes_oneside && + transop == LST_TRANS_TSBSRVADD) + return 0; + + if (nd->nd_state != LST_NODE_ACTIVE) + return -ENETDOWN; + + if (transop == LST_TRANS_TSBCLIADD) { + hash = batch->bat_cli_hash; + head = &batch->bat_cli_list; + + } else { + LASSERT (transop == LST_TRANS_TSBSRVADD); + + hash = batch->bat_srv_hash; + head = &batch->bat_srv_list; + } + + LASSERT (nd->nd_id.nid != LNET_NID_ANY); + + if (lstcon_ndlink_find(hash, nd->nd_id, &ndl, 1) != 0) + return -ENOMEM; + + if (list_empty(&ndl->ndl_link)) + list_add_tail(&ndl->ndl_link, head); + + return 1; +} + +static int +lstcon_test_nodes_add(struct lstcon_test *test, + struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + int transop; + int rc; + + LASSERT (test->tes_src_grp != NULL); + LASSERT (test->tes_dst_grp != NULL); + + transop = LST_TRANS_TSBSRVADD; + grp = test->tes_dst_grp; +again: + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &test->tes_trans_list, transop, + test, lstcon_testrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + if (lstcon_trans_stat()->trs_rpc_errno != 0 || + lstcon_trans_stat()->trs_fwk_errno != 0) { + lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* return if any error */ + CDEBUG(D_NET, "Failed to add test %s, " + "RPC error %d, framework error %d\n", + transop == LST_TRANS_TSBCLIADD ? 
"client" : "server", + lstcon_trans_stat()->trs_rpc_errno, + lstcon_trans_stat()->trs_fwk_errno); + + return rc; + } + + lstcon_rpc_trans_destroy(trans); + + if (transop == LST_TRANS_TSBCLIADD) + return rc; + + transop = LST_TRANS_TSBCLIADD; + grp = test->tes_src_grp; + test->tes_cliidx = 0; + + /* requests to test clients */ + goto again; +} + +static int +lstcon_verify_batch(const char *name, struct lstcon_batch **batch) +{ + int rc; + + rc = lstcon_batch_find(name, batch); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return rc; + } + + if ((*batch)->bat_state != LST_BATCH_IDLE) { + CDEBUG(D_NET, "Can't change running batch %s\n", name); + return -EINVAL; + } + + return 0; +} + +static int +lstcon_verify_group(const char *name, struct lstcon_group **grp) +{ + int rc; + struct lstcon_ndlink *ndl; + + rc = lstcon_group_find(name, grp); + if (rc != 0) { + CDEBUG(D_NET, "can't find group %s\n", name); + return rc; + } + + list_for_each_entry(ndl, &(*grp)->grp_ndl_list, ndl_link) { + if (ndl->ndl_node->nd_state == LST_NODE_ACTIVE) { + return 0; + } + } + + CDEBUG(D_NET, "Group %s has no ACTIVE nodes\n", name); + + return -EINVAL; +} + +int +lstcon_test_add(char *batch_name, int type, int loop, + int concur, int dist, int span, + char *src_name, char *dst_name, + void *param, int paramlen, int *retp, + struct list_head __user *result_up) +{ + struct lstcon_test *test = NULL; + int rc; + struct lstcon_group *src_grp = NULL; + struct lstcon_group *dst_grp = NULL; + struct lstcon_batch *batch = NULL; + + /* + * verify that a batch of the given name exists, and the groups + * that will be part of the batch exist and have at least one + * active node + */ + rc = lstcon_verify_batch(batch_name, &batch); + if (rc != 0) + goto out; + + rc = lstcon_verify_group(src_name, &src_grp); + if (rc != 0) + goto out; + + rc = lstcon_verify_group(dst_name, &dst_grp); + if (rc != 0) + goto out; + + if (dst_grp->grp_userland) + *retp = 1; + + LIBCFS_ALLOC(test, offsetof(struct lstcon_test, tes_param[paramlen])); + if (!test) { + CERROR("Can't allocate test descriptor\n"); + rc = -ENOMEM; + + goto out; + } + + test->tes_hdr.tsb_id = batch->bat_hdr.tsb_id; + test->tes_batch = batch; + test->tes_type = type; + test->tes_oneside = 0; /* TODO */ + test->tes_loop = loop; + test->tes_concur = concur; + test->tes_stop_onerr = 1; /* TODO */ + test->tes_span = span; + test->tes_dist = dist; + test->tes_cliidx = 0; /* just used for creating RPC */ + test->tes_src_grp = src_grp; + test->tes_dst_grp = dst_grp; + INIT_LIST_HEAD(&test->tes_trans_list); + + if (param != NULL) { + test->tes_paramlen = paramlen; + memcpy(&test->tes_param[0], param, paramlen); + } + + rc = lstcon_test_nodes_add(test, result_up); + + if (rc != 0) + goto out; + + if (lstcon_trans_stat()->trs_rpc_errno != 0 || + lstcon_trans_stat()->trs_fwk_errno != 0) + CDEBUG(D_NET, "Failed to add test %d to batch %s\n", type, + batch_name); + + /* add to test list anyway, so user can check what's going on */ + list_add_tail(&test->tes_link, &batch->bat_test_list); + + batch->bat_ntest++; + test->tes_hdr.tsb_index = batch->bat_ntest; + + /* hold groups so nobody can change them */ + return rc; +out: + if (test != NULL) + LIBCFS_FREE(test, offsetof(struct lstcon_test, + tes_param[paramlen])); + + if (dst_grp != NULL) + lstcon_group_decref(dst_grp); + + if (src_grp != NULL) + lstcon_group_decref(src_grp); + + return rc; +} + +static int +lstcon_test_find(struct lstcon_batch *batch, int idx, + struct lstcon_test **testpp) +{ + struct lstcon_test 
*test; + + list_for_each_entry(test, &batch->bat_test_list, tes_link) { + if (idx == test->tes_hdr.tsb_index) { + *testpp = test; + return 0; + } + } + + return -ENOENT; +} + +static int +lstcon_tsbrpc_readent(int transop, struct srpc_msg *msg, + struct lstcon_rpc_ent __user *ent_up) +{ + struct srpc_batch_reply *rep = &msg->msg_body.bat_reply; + + LASSERT (transop == LST_TRANS_TSBCLIQRY || + transop == LST_TRANS_TSBSRVQRY); + + /* positive errno, framework error code */ + if (copy_to_user(&ent_up->rpe_priv[0], + &rep->bar_active, sizeof(rep->bar_active))) + return -EFAULT; + + return 0; +} + +int +lstcon_test_batch_query(char *name, int testidx, int client, + int timeout, struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + struct list_head *translist; + struct list_head *ndlist; + struct lstcon_tsb_hdr *hdr; + struct lstcon_batch *batch; + struct lstcon_test *test = NULL; + int transop; + int rc; + + rc = lstcon_batch_find(name, &batch); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch: %s\n", name); + return rc; + } + + if (testidx == 0) { + translist = &batch->bat_trans_list; + ndlist = &batch->bat_cli_list; + hdr = &batch->bat_hdr; + + } else { + /* query specified test only */ + rc = lstcon_test_find(batch, testidx, &test); + if (rc != 0) { + CDEBUG(D_NET, "Can't find test: %d\n", testidx); + return rc; + } + + translist = &test->tes_trans_list; + ndlist = &test->tes_src_grp->grp_ndl_list; + hdr = &test->tes_hdr; + } + + transop = client ? LST_TRANS_TSBCLIQRY : LST_TRANS_TSBSRVQRY; + + rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr, + lstcon_batrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, timeout); + + if (testidx == 0 && /* query a batch, not a test */ + lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) == 0 && + lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0) == 0) { + /* all RPCs finished, and no active test */ + batch->bat_state = LST_BATCH_IDLE; + } + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_tsbrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +static int +lstcon_statrpc_readent(int transop, struct srpc_msg *msg, + struct lstcon_rpc_ent __user *ent_up) +{ + struct srpc_stat_reply *rep = &msg->msg_body.stat_reply; + struct sfw_counters __user *sfwk_stat; + struct srpc_counters __user *srpc_stat; + struct lnet_counters_common __user *lnet_stat; + + if (rep->str_status != 0) + return 0; + + sfwk_stat = (struct sfw_counters __user *)&ent_up->rpe_payload[0]; + srpc_stat = (struct srpc_counters __user *) + ((char __user *)sfwk_stat + sizeof(*sfwk_stat)); + lnet_stat = (struct lnet_counters_common __user *) + ((char __user *)srpc_stat + sizeof(*srpc_stat)); + + if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) || + copy_to_user(srpc_stat, &rep->str_rpc, sizeof(*srpc_stat)) || + copy_to_user(lnet_stat, &rep->str_lnet, sizeof(*lnet_stat))) + return -EFAULT; + + return 0; +} + +static int +lstcon_ndlist_stat(struct list_head *ndlist, + int timeout, struct list_head __user *result_up) +{ + LIST_HEAD(head); + struct lstcon_rpc_trans *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(ndlist, &head, + LST_TRANS_STATQRY, NULL, NULL, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_statrpc_readent); + lstcon_rpc_trans_destroy(trans); + + 
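+	/* NB: per-node sfw/srpc/lnet counter blocks are copied into the
+	 * userland result entries by lstcon_statrpc_readent() above. */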
return rc; +} + +int +lstcon_group_stat(char *grp_name, int timeout, + struct list_head __user *result_up) +{ + struct lstcon_group *grp; + int rc; + + rc = lstcon_group_find(grp_name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", grp_name); + return rc; + } + + rc = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up); + + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_nodes_stat(int count, struct lnet_process_id __user *ids_up, + int timeout, struct list_head __user *result_up) +{ + struct lstcon_ndlink *ndl; + struct lstcon_group *tmp; + struct lnet_process_id id; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0 ; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* add to tmp group */ + rc = lstcon_group_ndlink_find(tmp, id, &ndl, 2); + if (rc != 0) { + CDEBUG((rc == -ENOMEM) ? D_ERROR : D_NET, + "Failed to find or create %s: %d\n", + libcfs_id2str(id), rc); + break; + } + } + + if (rc != 0) { + lstcon_group_decref(tmp); + return rc; + } + + rc = lstcon_ndlist_stat(&tmp->grp_ndl_list, timeout, result_up); + + lstcon_group_decref(tmp); + + return rc; +} + +static int +lstcon_debug_ndlist(struct list_head *ndlist, + struct list_head *translist, + int timeout, struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY, + NULL, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_sesrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_session_debug(int timeout, struct list_head __user *result_up) +{ + return lstcon_debug_ndlist(&console_session.ses_ndl_list, + NULL, timeout, result_up); +} + +int +lstcon_batch_debug(int timeout, char *name, + int client, struct list_head __user *result_up) +{ + struct lstcon_batch *bat; + int rc; + + rc = lstcon_batch_find(name, &bat); + if (rc != 0) + return -ENOENT; + + rc = lstcon_debug_ndlist(client ? 
&bat->bat_cli_list : + &bat->bat_srv_list, + NULL, timeout, result_up); + + return rc; +} + +int +lstcon_group_debug(int timeout, char *name, + struct list_head __user *result_up) +{ + struct lstcon_group *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) + return -ENOENT; + + rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, + timeout, result_up); + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_nodes_debug(int timeout, int count, + struct lnet_process_id __user *ids_up, + struct list_head __user *result_up) +{ + struct lnet_process_id id; + struct lstcon_ndlink *ndl; + struct lstcon_group *grp; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Out of memory\n"); + return rc; + } + + for (i = 0; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* node is added to tmp group */ + rc = lstcon_group_ndlink_find(grp, id, &ndl, 1); + if (rc != 0) { + CERROR("Can't create node link\n"); + break; + } + } + + if (rc != 0) { + lstcon_group_decref(grp); + return rc; + } + + rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, + timeout, result_up); + + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_session_match(struct lst_sid sid) +{ + return (console_session.ses_id.ses_nid == sid.ses_nid && + console_session.ses_id.ses_stamp == sid.ses_stamp) ? 1: 0; +} + +static void +lstcon_new_session_id(struct lst_sid *sid) +{ + struct lnet_processid id; + + LASSERT(console_session.ses_state == LST_SESSION_NONE); + + LNetGetId(1, &id); + sid->ses_nid = lnet_nid_to_nid4(&id.nid); + sid->ses_stamp = div_u64(ktime_get_ns(), NSEC_PER_MSEC); +} + +int +lstcon_session_new(char *name, int key, unsigned feats, + int timeout, int force, struct lst_sid __user *sid_up) +{ + int rc = 0; + int i; + + if (console_session.ses_state != LST_SESSION_NONE) { + /* session exists */ + if (!force) { + CNETERR("Session %s already exists\n", + console_session.ses_name); + return -EEXIST; + } + + rc = lstcon_session_end(); + + /* lstcon_session_end() only return local error */ + if (rc != 0) + return rc; + } + + if ((feats & ~LST_FEATS_MASK) != 0) { + CNETERR("Unknown session features %x\n", + (feats & ~LST_FEATS_MASK)); + return -EINVAL; + } + + for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) + LASSERT(list_empty(&console_session.ses_ndl_hash[i])); + + lstcon_new_session_id(&console_session.ses_id); + + console_session.ses_key = key; + console_session.ses_state = LST_SESSION_ACTIVE; + console_session.ses_force = !!force; + console_session.ses_features = feats; + console_session.ses_feats_updated = 0; + console_session.ses_timeout = (timeout <= 0) ? 
+ LST_CONSOLE_TIMEOUT : timeout; + + if (strlen(name) > sizeof(console_session.ses_name)-1) + return -E2BIG; + strlcpy(console_session.ses_name, name, + sizeof(console_session.ses_name)); + + rc = lstcon_batch_add(LST_DEFAULT_BATCH); + if (rc != 0) + return rc; + + rc = lstcon_rpc_pinger_start(); + if (rc != 0) { + struct lstcon_batch *bat = NULL; + + lstcon_batch_find(LST_DEFAULT_BATCH, &bat); + lstcon_batch_destroy(bat); + + return rc; + } + + if (copy_to_user(sid_up, &console_session.ses_id, + sizeof(struct lst_sid)) == 0) + return rc; + + lstcon_session_end(); + + return -EFAULT; +} + +int +lstcon_session_info(struct lst_sid __user *sid_up, int __user *key_up, + unsigned __user *featp, + struct lstcon_ndlist_ent __user *ndinfo_up, + char __user *name_up, int len) +{ + struct lstcon_ndlist_ent *entp; + struct lstcon_ndlink *ndl; + int rc = 0; + + if (console_session.ses_state != LST_SESSION_ACTIVE) + return -ESRCH; + + LIBCFS_ALLOC(entp, sizeof(*entp)); + if (entp == NULL) + return -ENOMEM; + + list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, entp); + + if (copy_to_user(sid_up, &console_session.ses_id, + sizeof(struct lst_sid)) || + copy_to_user(key_up, &console_session.ses_key, + sizeof(*key_up)) || + copy_to_user(featp, &console_session.ses_features, + sizeof(*featp)) || + copy_to_user(ndinfo_up, entp, sizeof(*entp)) || + copy_to_user(name_up, console_session.ses_name, len)) + rc = -EFAULT; + + LIBCFS_FREE(entp, sizeof(*entp)); + + return rc; +} + +int +lstcon_session_end(void) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + struct lstcon_batch *bat; + int rc = 0; + + LASSERT (console_session.ses_state == LST_SESSION_ACTIVE); + + rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list, + NULL, LST_TRANS_SESEND, NULL, + lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + console_session.ses_shutdown = 1; + + lstcon_rpc_pinger_stop(); + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + lstcon_rpc_trans_destroy(trans); + /* User can do nothing even rpc failed, so go on */ + + /* waiting for orphan rpcs to die */ + lstcon_rpc_cleanup_wait(); + + console_session.ses_id = LST_INVALID_SID; + console_session.ses_state = LST_SESSION_NONE; + console_session.ses_key = 0; + console_session.ses_force = 0; + console_session.ses_feats_updated = 0; + + /* destroy all batches */ + while (!list_empty(&console_session.ses_bat_list)) { + bat = list_entry(console_session.ses_bat_list.next, + struct lstcon_batch, bat_link); + + lstcon_batch_destroy(bat); + } + + /* destroy all groups */ + while (!list_empty(&console_session.ses_grp_list)) { + grp = list_entry(console_session.ses_grp_list.next, + struct lstcon_group, grp_link); + LASSERT(grp->grp_ref == 1); + + lstcon_group_decref(grp); + } + + /* all nodes should be released */ + LASSERT(list_empty(&console_session.ses_ndl_list)); + + console_session.ses_shutdown = 0; + console_session.ses_expired = 0; + + return rc; +} + +int +lstcon_session_feats_check(unsigned feats) +{ + int rc = 0; + + if ((feats & ~LST_FEATS_MASK) != 0) { + CERROR("Can't support these features: %x\n", + (feats & ~LST_FEATS_MASK)); + return -EPROTO; + } + + spin_lock(&console_session.ses_rpc_lock); + + if (!console_session.ses_feats_updated) { + console_session.ses_feats_updated = 1; + console_session.ses_features = feats; + } + + if (console_session.ses_features != feats) + rc = -EPROTO; + + 
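+	/*
+	 * NB: the first feature mask reported by a test node is recorded as
+	 * the session's feature set; any node that later reports a different
+	 * mask is rejected with -EPROTO.
+	 */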
spin_unlock(&console_session.ses_rpc_lock); + + if (rc != 0) { + CERROR("remote features %x do not match with " + "session features %x of console\n", + feats, console_session.ses_features); + } + + return rc; +} + +static int +lstcon_acceptor_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_msg *rep = &rpc->srpc_replymsg; + struct srpc_msg *req = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_join_reqst *jreq = &req->msg_body.join_reqst; + struct srpc_join_reply *jrep = &rep->msg_body.join_reply; + struct lstcon_group *grp = NULL; + struct lstcon_ndlink *ndl; + int rc = 0; + + sfw_unpack_message(req); + + mutex_lock(&console_session.ses_mutex); + + jrep->join_sid = console_session.ses_id; + + if (console_session.ses_id.ses_nid == LNET_NID_ANY) { + jrep->join_status = ESRCH; + goto out; + } + + if (lstcon_session_feats_check(req->msg_ses_feats) != 0) { + jrep->join_status = EPROTO; + goto out; + } + + if (jreq->join_sid.ses_nid != LNET_NID_ANY && + !lstcon_session_match(jreq->join_sid)) { + jrep->join_status = EBUSY; + goto out; + } + + if (lstcon_group_find(jreq->join_group, &grp) != 0) { + rc = lstcon_group_alloc(jreq->join_group, &grp); + if (rc != 0) { + CERROR("Out of memory\n"); + goto out; + } + + list_add_tail(&grp->grp_link, + &console_session.ses_grp_list); + lstcon_group_addref(grp); + } + + if (grp->grp_ref > 2) { + /* Group in using */ + jrep->join_status = EBUSY; + goto out; + } + + rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0); + if (rc == 0) { + jrep->join_status = EEXIST; + goto out; + } + + rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1); + if (rc != 0) { + CERROR("Out of memory\n"); + goto out; + } + + ndl->ndl_node->nd_state = LST_NODE_ACTIVE; + ndl->ndl_node->nd_timeout = console_session.ses_timeout; + + if (grp->grp_userland == 0) + grp->grp_userland = 1; + + strlcpy(jrep->join_session, console_session.ses_name, + sizeof(jrep->join_session)); + jrep->join_timeout = console_session.ses_timeout; + jrep->join_status = 0; + +out: + rep->msg_ses_feats = console_session.ses_features; + if (grp != NULL) + lstcon_group_decref(grp); + + mutex_unlock(&console_session.ses_mutex); + + return rc; +} + +static struct srpc_service lstcon_acceptor_service; + +static void lstcon_init_acceptor_service(void) +{ + /* initialize selftest console acceptor service table */ + lstcon_acceptor_service.sv_name = "join session"; + lstcon_acceptor_service.sv_handler = lstcon_acceptor_handle; + lstcon_acceptor_service.sv_id = SRPC_SERVICE_JOIN; + lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX; +} + +static struct notifier_block lstcon_ioctl_handler = { + .notifier_call = lstcon_ioctl_entry, +}; + +/* initialize console */ +int +lstcon_console_init(void) +{ + int i; + int rc; + + console_session.ses_id = LST_INVALID_SID; + console_session.ses_state = LST_SESSION_NONE; + console_session.ses_timeout = 0; + console_session.ses_force = 0; + console_session.ses_expired = 0; + console_session.ses_feats_updated = 0; + console_session.ses_features = LST_FEATS_MASK; + console_session.ses_laststamp = ktime_get_real_seconds(); + + mutex_init(&console_session.ses_mutex); + + INIT_LIST_HEAD(&console_session.ses_ndl_list); + INIT_LIST_HEAD(&console_session.ses_grp_list); + INIT_LIST_HEAD(&console_session.ses_bat_list); + INIT_LIST_HEAD(&console_session.ses_trans_list); + + CFS_ALLOC_PTR_ARRAY(console_session.ses_ndl_hash, + LST_GLOBAL_HASHSIZE); + if (console_session.ses_ndl_hash == NULL) + return -ENOMEM; + + for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) + 
INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]); + + + /* initialize acceptor service table */ + lstcon_init_acceptor_service(); + + rc = srpc_add_service(&lstcon_acceptor_service); + LASSERT(rc != -EBUSY); + if (rc != 0) { + CFS_FREE_PTR_ARRAY(console_session.ses_ndl_hash, + LST_GLOBAL_HASHSIZE); + return rc; + } + + rc = srpc_service_add_buffers(&lstcon_acceptor_service, + lstcon_acceptor_service.sv_wi_total); + if (rc != 0) { + rc = -ENOMEM; + goto out; + } + + rc = blocking_notifier_chain_register(&libcfs_ioctl_list, + &lstcon_ioctl_handler); + if (rc == 0) { + lstcon_rpc_module_init(); + return 0; + } + +out: + srpc_shutdown_service(&lstcon_acceptor_service); + srpc_remove_service(&lstcon_acceptor_service); + + CFS_FREE_PTR_ARRAY(console_session.ses_ndl_hash, LST_GLOBAL_HASHSIZE); + + srpc_wait_service_shutdown(&lstcon_acceptor_service); + + return rc; +} + +int +lstcon_console_fini(void) +{ + int i; + + blocking_notifier_chain_unregister(&libcfs_ioctl_list, + &lstcon_ioctl_handler); + + mutex_lock(&console_session.ses_mutex); + + srpc_shutdown_service(&lstcon_acceptor_service); + srpc_remove_service(&lstcon_acceptor_service); + + if (console_session.ses_state != LST_SESSION_NONE) + lstcon_session_end(); + + lstcon_rpc_module_fini(); + + mutex_unlock(&console_session.ses_mutex); + + LASSERT(list_empty(&console_session.ses_ndl_list)); + LASSERT(list_empty(&console_session.ses_grp_list)); + LASSERT(list_empty(&console_session.ses_bat_list)); + LASSERT(list_empty(&console_session.ses_trans_list)); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + LASSERT(list_empty(&console_session.ses_ndl_hash[i])); + + CFS_FREE_PTR_ARRAY(console_session.ses_ndl_hash, + LST_GLOBAL_HASHSIZE); + + srpc_wait_service_shutdown(&lstcon_acceptor_service); + + return 0; +} + diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.h b/drivers/staging/lustrefsx/lnet/selftest/console.h new file mode 100644 index 0000000000000..13144cc8dd4cd --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/console.h @@ -0,0 +1,262 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/console.h + * + * kernel structure for LST console + * + * Author: Liang Zhen + */ + +#ifndef __LST_CONSOLE_H__ +#define __LST_CONSOLE_H__ + +#include + +#include +#include +#include "selftest.h" +#include "conrpc.h" + +/* node descriptor */ +struct lstcon_node { + struct lnet_process_id nd_id; /* id of the node */ + int nd_ref; /* reference count */ + int nd_state; /* state of the node */ + int nd_timeout; /* session timeout */ + ktime_t nd_stamp; /* last RPC reply timestamp */ + struct lstcon_rpc nd_ping; /* ping rpc */ +}; + +/* node link descriptor */ +struct lstcon_ndlink { + struct list_head ndl_link; /* chain on list */ + struct list_head ndl_hlink; /* chain on hash */ + struct lstcon_node *ndl_node; /* pointer to node */ +}; + +/* (alias of nodes) group descriptor */ +struct lstcon_group { + struct list_head grp_link; /* chain on global group list */ + int grp_ref; /* reference count */ + int grp_userland; /* has userland nodes */ + int grp_nnode; /* # of nodes */ + char grp_name[LST_NAME_SIZE]; /* group name */ + + struct list_head grp_trans_list; /* transaction list */ + struct list_head grp_ndl_list; /* nodes list */ + struct list_head grp_ndl_hash[0];/* hash table for nodes */ +}; + +#define LST_BATCH_IDLE 0xB0 /* idle batch */ +#define LST_BATCH_RUNNING 0xB1 /* running batch */ + +struct lstcon_tsb_hdr { + struct lst_bid tsb_id; /* batch ID */ + int tsb_index; /* test index */ +}; + +/* (tests ) batch descriptor */ +struct lstcon_batch { + /* test_batch header */ + struct lstcon_tsb_hdr bat_hdr; + /* chain on session's batches list */ + struct list_head bat_link; + /* # of test */ + int bat_ntest; + /* state of the batch */ + int bat_state; + /* parameter for run|stop, timeout for run, force for stop */ + int bat_arg; + /* name of batch */ + char bat_name[LST_NAME_SIZE]; + + /* list head of tests (lstcon_test_t) */ + struct list_head bat_test_list; + /* list head of transaction */ + struct list_head bat_trans_list; + /* list head of client nodes (struct lstcon_node) */ + struct list_head bat_cli_list; + /* hash table of client nodes */ + struct list_head *bat_cli_hash; + /* list head of server nodes */ + struct list_head bat_srv_list; + /* hash table of server nodes */ + struct list_head *bat_srv_hash; +}; + +/* a single test descriptor */ +struct lstcon_test { + /* test batch header */ + struct lstcon_tsb_hdr tes_hdr; + /* chain on batch's tests list */ + struct list_head tes_link; + /* pointer to batch */ + struct lstcon_batch *tes_batch; + + int tes_type; /* type of the test, i.e: bulk, ping */ + int tes_stop_onerr; /* stop on error */ + int tes_oneside; /* one-sided test */ + int tes_concur; /* concurrency */ + int tes_loop; /* loop count */ + int tes_dist; /* nodes distribution of target group */ + int tes_span; /* nodes span of target group */ + int tes_cliidx; /* client index, used for RPC creating */ + + struct list_head tes_trans_list; /* transaction list */ + struct lstcon_group *tes_src_grp; /* group run the test */ + struct lstcon_group *tes_dst_grp; /* target group */ + + int tes_paramlen; /* test parameter length */ + char tes_param[0]; /* test parameter */ +}; + +#define LST_GLOBAL_HASHSIZE 503 /* global nodes hash table size */ +#define LST_NODE_HASHSIZE 239 /* node hash table (for batch or group) */ + +#define LST_SESSION_NONE 0x0 /* no session */ +#define LST_SESSION_ACTIVE 0x1 /* working session */ + +#define LST_CONSOLE_TIMEOUT 300 /* default console timeout */ + +struct 
lstcon_session { + struct mutex ses_mutex; /* only 1 thread in session */ + struct lst_sid ses_id; /* global session id */ + int ses_key; /* local session key */ + int ses_state; /* state of session */ + int ses_timeout; /* timeout in seconds */ + time64_t ses_laststamp; /* last operation stamp (seconds) */ + /** tests features of the session */ + unsigned ses_features; + /** features are synced with remote test nodes */ + unsigned ses_feats_updated:1; + /** force creating */ + unsigned ses_force:1; + /** session is shutting down */ + unsigned ses_shutdown:1; + /** console is timedout */ + unsigned ses_expired:1; + __u64 ses_id_cookie; /* batch id cookie */ + char ses_name[LST_NAME_SIZE]; /* session name */ + struct lstcon_rpc_trans *ses_ping; /* session pinger */ + struct stt_timer ses_ping_timer; /* timer for pinger */ + struct lstcon_trans_stat ses_trans_stat;/* transaction stats */ + + struct list_head ses_trans_list; /* global list of transaction */ + struct list_head ses_grp_list; /* global list of groups */ + struct list_head ses_bat_list; /* global list of batches */ + struct list_head ses_ndl_list; /* global list of nodes */ + struct list_head *ses_ndl_hash; /* hash table of nodes */ + + spinlock_t ses_rpc_lock; /* serialize */ + atomic_t ses_rpc_counter;/* # of initialized RPCs */ + struct list_head ses_rpc_freelist;/* idle console rpc */ +}; /* session descriptor */ + +extern struct lstcon_session console_session; + +static inline struct lstcon_trans_stat * +lstcon_trans_stat(void) +{ + return &console_session.ses_trans_stat; +} + +static inline struct list_head * +lstcon_id2hash(struct lnet_process_id id, struct list_head *hash) +{ + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + + return &hash[idx]; +} + +extern int lstcon_session_match(struct lst_sid sid); +extern int lstcon_session_new(char *name, int key, unsigned version, + int timeout, int flags, struct lst_sid __user *sid_up); +extern int lstcon_session_info(struct lst_sid __user *sid_up, int __user *key, + unsigned __user *verp, + struct lstcon_ndlist_ent __user *entp, + char __user *name_up, int len); +extern int lstcon_session_end(void); +extern int lstcon_session_debug(int timeout, + struct list_head __user *result_up); +extern int lstcon_session_feats_check(unsigned feats); +extern int lstcon_batch_debug(int timeout, char *name, + int client, struct list_head __user *result_up); +extern int lstcon_group_debug(int timeout, char *name, + struct list_head __user *result_up); +extern int lstcon_nodes_debug(int timeout, int nnd, + struct lnet_process_id __user *nds_up, + struct list_head __user *result_up); +extern int lstcon_group_add(char *name); +extern int lstcon_group_del(char *name); +extern int lstcon_group_clean(char *name, int args); +extern int lstcon_group_refresh(char *name, struct list_head __user *result_up); +extern int lstcon_nodes_add(char *name, int nnd, + struct lnet_process_id __user *nds_up, + unsigned *featp, + struct list_head __user *result_up); +extern int lstcon_nodes_remove(char *name, int nnd, + struct lnet_process_id __user *nds_up, + struct list_head __user *result_up); +extern int lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gent_up, + int *index_p, int *ndent_p, + struct lstcon_node_ent __user *ndents_up); +extern int lstcon_group_list(int idx, int len, char __user *name_up); +extern int lstcon_batch_add(char *name); +extern int lstcon_batch_run(char *name, int timeout, + struct list_head __user *result_up); +extern int lstcon_batch_stop(char *name, int 
force, + struct list_head __user *result_up); +extern int lstcon_test_batch_query(char *name, int testidx, + int client, int timeout, + struct list_head __user *result_up); +extern int lstcon_batch_del(char *name); +extern int lstcon_batch_list(int idx, int namelen, char __user *name_up); +extern int lstcon_batch_info(char *name, + struct lstcon_test_batch_ent __user *ent_up, + int server, int testidx, int *index_p, + int *ndent_p, + struct lstcon_node_ent __user *dents_up); +extern int lstcon_group_stat(char *grp_name, int timeout, + struct list_head __user *result_up); +extern int lstcon_nodes_stat(int count, struct lnet_process_id __user *ids_up, + int timeout, struct list_head __user *result_up); +extern int lstcon_test_add(char *batch_name, int type, int loop, + int concur, int dist, int span, + char *src_name, char *dst_name, + void *param, int paramlen, int *retp, + struct list_head __user *result_up); + +int lstcon_ioctl_entry(struct notifier_block *nb, + unsigned long cmd, void *vdata); +int lstcon_console_init(void); +int lstcon_console_fini(void); + +#endif diff --git a/drivers/staging/lustrefsx/lnet/selftest/framework.c b/drivers/staging/lustrefsx/lnet/selftest/framework.c new file mode 100644 index 0000000000000..7e048ad4595e4 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/framework.c @@ -0,0 +1,1766 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/framework.c + * + * Author: Isaac Huang + * Author: Liang Zhen + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +struct lst_sid LST_INVALID_SID = { .ses_nid = LNET_NID_ANY, .ses_stamp = -1}; + +static int session_timeout = 100; +module_param(session_timeout, int, 0444); +MODULE_PARM_DESC(session_timeout, "test session timeout in seconds (100 by default, 0 == never)"); + +static int rpc_timeout = 64; +module_param(rpc_timeout, int, 0644); +MODULE_PARM_DESC(rpc_timeout, "rpc timeout in seconds (64 by default, 0 == never)"); + +#define sfw_unpack_id(id) \ +do { \ + __swab64s(&(id).nid); \ + __swab32s(&(id).pid); \ +} while (0) + +#define sfw_unpack_sid(sid) \ +do { \ + __swab64s(&(sid).ses_nid); \ + __swab64s(&(sid).ses_stamp); \ +} while (0) + +#define sfw_unpack_fw_counters(fc) \ +do { \ + __swab32s(&(fc).running_ms); \ + __swab32s(&(fc).active_batches); \ + __swab32s(&(fc).zombie_sessions); \ + __swab32s(&(fc).brw_errors); \ + __swab32s(&(fc).ping_errors); \ +} while (0) + +#define sfw_unpack_rpc_counters(rc) \ +do { \ + __swab32s(&(rc).errors); \ + __swab32s(&(rc).rpcs_sent); \ + __swab32s(&(rc).rpcs_rcvd); \ + __swab32s(&(rc).rpcs_dropped); \ + __swab32s(&(rc).rpcs_expired); \ + __swab64s(&(rc).bulk_get); \ + __swab64s(&(rc).bulk_put); \ +} while (0) + +#define sfw_unpack_lnet_counters(lc) \ +do { \ + __swab32s(&(lc).lcc_errors); \ + __swab32s(&(lc).lcc_msgs_max); \ + __swab32s(&(lc).lcc_msgs_alloc); \ + __swab32s(&(lc).lcc_send_count); \ + __swab32s(&(lc).lcc_recv_count); \ + __swab32s(&(lc).lcc_drop_count); \ + __swab32s(&(lc).lcc_route_count); \ + __swab64s(&(lc).lcc_send_length); \ + __swab64s(&(lc).lcc_recv_length); \ + __swab64s(&(lc).lcc_drop_length); \ + __swab64s(&(lc).lcc_route_length); \ +} while (0) + +#define sfw_test_active(t) (atomic_read(&(t)->tsi_nactive) != 0) +#define sfw_batch_active(b) (atomic_read(&(b)->bat_nactive) != 0) + +static struct smoketest_framework { + /* RPCs to be recycled */ + struct list_head fw_zombie_rpcs; + /* stopping sessions */ + struct list_head fw_zombie_sessions; + /* registered test cases */ + struct list_head fw_tests; + /* # zombie sessions */ + atomic_t fw_nzombies; + /* serialise */ + spinlock_t fw_lock; + /* _the_ session */ + struct sfw_session *fw_session; + /* shutdown in progress */ + int fw_shuttingdown; + /* running RPC */ + struct srpc_server_rpc *fw_active_srpc; +} sfw_data; + +/* forward ref's */ +static int sfw_stop_batch(struct sfw_batch *tsb, int force); +static void sfw_destroy_session(struct sfw_session *sn); + +static inline struct sfw_test_case * +sfw_find_test_case(int id) +{ + struct sfw_test_case *tsc; + + LASSERT(id <= SRPC_SERVICE_MAX_ID); + LASSERT(id > SRPC_FRAMEWORK_SERVICE_MAX_ID); + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + if (tsc->tsc_srv_service->sv_id == id) + return tsc; + } + + return NULL; +} + +static int +sfw_register_test(struct srpc_service *service, + struct sfw_test_client_ops *cliops) +{ + struct sfw_test_case *tsc; + + if (sfw_find_test_case(service->sv_id) != NULL) { + CERROR ("Failed to register test %s (%d)\n", + service->sv_name, service->sv_id); + return -EEXIST; + } + + LIBCFS_ALLOC(tsc, sizeof(*tsc)); + if (tsc == NULL) + return -ENOMEM; + + tsc->tsc_cli_ops = cliops; + tsc->tsc_srv_service = service; + + list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests); + return 0; +} + +static void +sfw_add_session_timer (void) +{ + struct sfw_session *sn = sfw_data.fw_session; + 
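+	/*
+	 * NB: this is a one-shot timer; it is removed while a framework RPC
+	 * is being handled and re-armed when handling completes (see
+	 * sfw_handle_server_rpc()), so an idle session expires sn_timeout
+	 * seconds after its last framework RPC.
+	 */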
struct stt_timer *timer = &sn->sn_timer; + + LASSERT (!sfw_data.fw_shuttingdown); + + if (sn == NULL || sn->sn_timeout == 0) + return; + + LASSERT (!sn->sn_timer_active); + + sn->sn_timer_active = 1; + timer->stt_expires = ktime_get_real_seconds()+ sn->sn_timeout; + stt_add_timer(timer); +} + +static int +sfw_del_session_timer (void) +{ + struct sfw_session *sn = sfw_data.fw_session; + + if (sn == NULL || !sn->sn_timer_active) + return 0; + + LASSERT (sn->sn_timeout != 0); + + if (stt_del_timer(&sn->sn_timer)) { /* timer defused */ + sn->sn_timer_active = 0; + return 0; + } + + return EBUSY; /* racing with sfw_session_expired() */ +} + +/* called with sfw_data.fw_lock held */ +static void +sfw_deactivate_session (void) +__must_hold(&sfw_data.fw_lock) +{ + struct sfw_session *sn = sfw_data.fw_session; + int nactive = 0; + struct sfw_batch *tsb; + struct sfw_test_case *tsc; + + if (sn == NULL) return; + + LASSERT(!sn->sn_timer_active); + + sfw_data.fw_session = NULL; + atomic_inc(&sfw_data.fw_nzombies); + list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions); + + spin_unlock(&sfw_data.fw_lock); + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + srpc_abort_service(tsc->tsc_srv_service); + } + + spin_lock(&sfw_data.fw_lock); + + list_for_each_entry(tsb, &sn->sn_batches, bat_list) { + if (sfw_batch_active(tsb)) { + nactive++; + sfw_stop_batch(tsb, 1); + } + } + + if (nactive != 0) + return; /* wait for active batches to stop */ + + list_del_init(&sn->sn_list); + spin_unlock(&sfw_data.fw_lock); + + sfw_destroy_session(sn); + + spin_lock(&sfw_data.fw_lock); +} + + +static void +sfw_session_expired (void *data) +{ + struct sfw_session *sn = data; + + spin_lock(&sfw_data.fw_lock); + + LASSERT (sn->sn_timer_active); + LASSERT (sn == sfw_data.fw_session); + + CWARN ("Session expired! 
sid: %s-%llu, name: %s\n", + libcfs_nid2str(sn->sn_id.ses_nid), + sn->sn_id.ses_stamp, &sn->sn_name[0]); + + sn->sn_timer_active = 0; + sfw_deactivate_session(); + + spin_unlock(&sfw_data.fw_lock); +} + +static inline void +sfw_init_session(struct sfw_session *sn, struct lst_sid sid, + unsigned features, const char *name) +{ + struct stt_timer *timer = &sn->sn_timer; + + memset(sn, 0, sizeof(struct sfw_session)); + INIT_LIST_HEAD(&sn->sn_list); + INIT_LIST_HEAD(&sn->sn_batches); + atomic_set(&sn->sn_refcount, 1); /* +1 for caller */ + atomic_set(&sn->sn_brw_errors, 0); + atomic_set(&sn->sn_ping_errors, 0); + strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name)); + + sn->sn_timer_active = 0; + sn->sn_id = sid; + sn->sn_features = features; + sn->sn_timeout = session_timeout; + sn->sn_started = ktime_get(); + + timer->stt_data = sn; + timer->stt_func = sfw_session_expired; + INIT_LIST_HEAD(&timer->stt_list); +} + +/* completion handler for incoming framework RPCs */ +static void +sfw_server_rpc_done(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + int status = rpc->srpc_status; + + CDEBUG (D_NET, + "Incoming framework RPC done: " + "service %s, peer %s, status %s:%d\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), + status); + + if (rpc->srpc_bulk != NULL) + sfw_free_pages(rpc); +} + +static void +sfw_client_rpc_fini(struct srpc_client_rpc *rpc) +{ + LASSERT(rpc->crpc_bulk.bk_niov == 0); + LASSERT(list_empty(&rpc->crpc_list)); + LASSERT(atomic_read(&rpc->crpc_refcount) == 0); + + CDEBUG(D_NET, "Outgoing framework RPC done: " + "service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), + rpc->crpc_aborted, rpc->crpc_status); + + spin_lock(&sfw_data.fw_lock); + + /* my callers must finish all RPCs before shutting me down */ + LASSERT(!sfw_data.fw_shuttingdown); + list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs); + + spin_unlock(&sfw_data.fw_lock); +} + +static struct sfw_batch * +sfw_find_batch(struct lst_bid bid) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct sfw_batch *bat; + + LASSERT(sn != NULL); + + list_for_each_entry(bat, &sn->sn_batches, bat_list) { + if (bat->bat_id.bat_id == bid.bat_id) + return bat; + } + + return NULL; +} + +static struct sfw_batch * +sfw_bid2batch(struct lst_bid bid) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct sfw_batch *bat; + + LASSERT (sn != NULL); + + bat = sfw_find_batch(bid); + if (bat != NULL) + return bat; + + LIBCFS_ALLOC(bat, sizeof(*bat)); + if (bat == NULL) + return NULL; + + bat->bat_error = 0; + bat->bat_session = sn; + bat->bat_id = bid; + atomic_set(&bat->bat_nactive, 0); + INIT_LIST_HEAD(&bat->bat_tests); + + list_add_tail(&bat->bat_list, &sn->sn_batches); + return bat; +} + +static int +sfw_get_stats(struct srpc_stat_reqst *request, struct srpc_stat_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct sfw_counters *cnt = &reply->str_fw; + struct sfw_batch *bat; + + reply->str_sid = (sn == NULL) ? 
LST_INVALID_SID : sn->sn_id; + + if (request->str_sid.ses_nid == LNET_NID_ANY) { + reply->str_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->str_sid, sn->sn_id)) { + reply->str_status = ESRCH; + return 0; + } + + lnet_counters_get_common(&reply->str_lnet); + srpc_get_counters(&reply->str_rpc); + + /* send over the msecs since the session was started + - with 32 bits to send, this is ~49 days */ + cnt->running_ms = ktime_ms_delta(ktime_get(), sn->sn_started); + cnt->brw_errors = atomic_read(&sn->sn_brw_errors); + cnt->ping_errors = atomic_read(&sn->sn_ping_errors); + cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies); + + cnt->active_batches = 0; + list_for_each_entry(bat, &sn->sn_batches, bat_list) { + if (atomic_read(&bat->bat_nactive) > 0) + cnt->active_batches++; + } + + reply->str_status = 0; + return 0; +} + +int +sfw_make_session(struct srpc_mksn_reqst *request, struct srpc_mksn_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct srpc_msg *msg = container_of(request, struct srpc_msg, + msg_body.mksn_reqst); + int cplen = 0; + + if (request->mksn_sid.ses_nid == LNET_NID_ANY) { + reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + reply->mksn_status = EINVAL; + return 0; + } + + if (sn != NULL) { + reply->mksn_status = 0; + reply->mksn_sid = sn->sn_id; + reply->mksn_timeout = sn->sn_timeout; + + if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) { + atomic_inc(&sn->sn_refcount); + return 0; + } + + if (!request->mksn_force) { + reply->mksn_status = EBUSY; + cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0], + sizeof(reply->mksn_name)); + if (cplen >= sizeof(reply->mksn_name)) + return -E2BIG; + return 0; + } + } + + /* reject the request if it requires unknown features + * NB: old version will always accept all features because it's not + * aware of struct srpc_msg::msg_ses_feats, it's a defect but it's also + * harmless because it will return zero feature to console, and it's + * console's responsibility to make sure all nodes in a session have + * same feature mask. */ + if ((msg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + reply->mksn_status = EPROTO; + return 0; + } + + /* brand new or create by force */ + LIBCFS_ALLOC(sn, sizeof(*sn)); + if (sn == NULL) { + CERROR("dropping RPC mksn under memory pressure\n"); + return -ENOMEM; + } + + sfw_init_session(sn, request->mksn_sid, + msg->msg_ses_feats, &request->mksn_name[0]); + + spin_lock(&sfw_data.fw_lock); + + sfw_deactivate_session(); + LASSERT(sfw_data.fw_session == NULL); + sfw_data.fw_session = sn; + + spin_unlock(&sfw_data.fw_lock); + + reply->mksn_status = 0; + reply->mksn_sid = sn->sn_id; + reply->mksn_timeout = sn->sn_timeout; + return 0; +} + +static int +sfw_remove_session(struct srpc_rmsn_reqst *request, + struct srpc_rmsn_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + + reply->rmsn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (request->rmsn_sid.ses_nid == LNET_NID_ANY) { + reply->rmsn_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->rmsn_sid, sn->sn_id)) { + reply->rmsn_status = (sn == NULL) ? 
ESRCH : EBUSY; + return 0; + } + + if (!atomic_dec_and_test(&sn->sn_refcount)) { + reply->rmsn_status = 0; + return 0; + } + + spin_lock(&sfw_data.fw_lock); + sfw_deactivate_session(); + spin_unlock(&sfw_data.fw_lock); + + reply->rmsn_status = 0; + reply->rmsn_sid = LST_INVALID_SID; + LASSERT(sfw_data.fw_session == NULL); + return 0; +} + +static int +sfw_debug_session(struct srpc_debug_reqst *request, + struct srpc_debug_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + + if (sn == NULL) { + reply->dbg_status = ESRCH; + reply->dbg_sid = LST_INVALID_SID; + return 0; + } + + reply->dbg_status = 0; + reply->dbg_sid = sn->sn_id; + reply->dbg_timeout = sn->sn_timeout; + if (strlcpy(reply->dbg_name, &sn->sn_name[0], sizeof(reply->dbg_name)) + >= sizeof(reply->dbg_name)) + return -E2BIG; + + return 0; +} + +static void +sfw_test_rpc_fini(struct srpc_client_rpc *rpc) +{ + struct sfw_test_unit *tsu = rpc->crpc_priv; + struct sfw_test_instance *tsi = tsu->tsu_instance; + + /* Called with hold of tsi->tsi_lock */ + LASSERT(list_empty(&rpc->crpc_list)); + list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); +} + +static inline int +sfw_test_buffers(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc; + struct srpc_service *svc; + int nbuf; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + LASSERT(tsc != NULL); + svc = tsc->tsc_srv_service; + LASSERT(svc != NULL); + + nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts; + return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA); +} + +static int +sfw_load_test(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc; + struct srpc_service *svc; + int nbuf; + int rc; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + nbuf = sfw_test_buffers(tsi); + LASSERT(tsc != NULL); + svc = tsc->tsc_srv_service; + + if (tsi->tsi_is_client) { + tsi->tsi_ops = tsc->tsc_cli_ops; + return 0; + } + + rc = srpc_service_add_buffers(svc, nbuf); + if (rc != 0) { + CWARN("Failed to reserve enough buffers: " + "service %s, %d needed: %d\n", svc->sv_name, nbuf, rc); + /* NB: this error handler is not strictly correct, because + * it may release more buffers than already allocated, + * but it doesn't matter because request portal should + * be lazy portal and will grow buffers if necessary. */ + srpc_service_remove_buffers(svc, nbuf); + return -ENOMEM; + } + + CDEBUG(D_NET, "Reserved %d buffers for test %s\n", + nbuf * (srpc_serv_is_framework(svc) ? + 1 : cfs_cpt_number(cfs_cpt_tab)), svc->sv_name); + return 0; +} + +static void +sfw_unload_test(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + LASSERT(tsc != NULL); + + if (tsi->tsi_is_client) + return; + + /* shrink buffers, because request portal is lazy portal + * which can grow buffers at runtime so we may leave + * some buffers behind, but never mind... 
*/ + srpc_service_remove_buffers(tsc->tsc_srv_service, + sfw_test_buffers(tsi)); +} + +static void +sfw_destroy_test_instance(struct sfw_test_instance *tsi) +{ + struct srpc_client_rpc *rpc; + struct sfw_test_unit *tsu; + + if (!tsi->tsi_is_client) goto clean; + + tsi->tsi_ops->tso_fini(tsi); + + LASSERT(!tsi->tsi_stopping); + LASSERT(list_empty(&tsi->tsi_active_rpcs)); + LASSERT(!sfw_test_active(tsi)); + + while (!list_empty(&tsi->tsi_units)) { + tsu = list_entry(tsi->tsi_units.next, + struct sfw_test_unit, tsu_list); + list_del(&tsu->tsu_list); + LIBCFS_FREE(tsu, sizeof(*tsu)); + } + + while (!list_empty(&tsi->tsi_free_rpcs)) { + rpc = list_entry(tsi->tsi_free_rpcs.next, + struct srpc_client_rpc, crpc_list); + list_del(&rpc->crpc_list); + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } + +clean: + sfw_unload_test(tsi); + LIBCFS_FREE(tsi, sizeof(*tsi)); +} + +static void +sfw_destroy_batch(struct sfw_batch *tsb) +{ + struct sfw_test_instance *tsi; + + LASSERT(!sfw_batch_active(tsb)); + LASSERT(list_empty(&tsb->bat_list)); + + while (!list_empty(&tsb->bat_tests)) { + tsi = list_entry(tsb->bat_tests.next, + struct sfw_test_instance, tsi_list); + list_del_init(&tsi->tsi_list); + sfw_destroy_test_instance(tsi); + } + + LIBCFS_FREE(tsb, sizeof(*tsb)); +} + +static void +sfw_destroy_session(struct sfw_session *sn) +{ + struct sfw_batch *batch; + + LASSERT(list_empty(&sn->sn_list)); + LASSERT(sn != sfw_data.fw_session); + + while (!list_empty(&sn->sn_batches)) { + batch = list_entry(sn->sn_batches.next, + struct sfw_batch, bat_list); + list_del_init(&batch->bat_list); + sfw_destroy_batch(batch); + } + + LIBCFS_FREE(sn, sizeof(*sn)); + atomic_dec(&sfw_data.fw_nzombies); +} + +static void +sfw_unpack_addtest_req(struct srpc_msg *msg) +{ + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; + + LASSERT (msg->msg_type == SRPC_MSG_TEST_REQST); + LASSERT (req->tsr_is_client); + + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + if (req->tsr_service == SRPC_SERVICE_BRW) { + if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { + struct test_bulk_req *bulk = &req->tsr_u.bulk_v0; + + __swab32s(&bulk->blk_opc); + __swab32s(&bulk->blk_npg); + __swab32s(&bulk->blk_flags); + + } else { + struct test_bulk_req_v1 *bulk = &req->tsr_u.bulk_v1; + + __swab16s(&bulk->blk_opc); + __swab16s(&bulk->blk_flags); + __swab32s(&bulk->blk_offset); + __swab32s(&bulk->blk_len); + } + + return; + } + + if (req->tsr_service == SRPC_SERVICE_PING) { + struct test_ping_req *ping = &req->tsr_u.ping; + + __swab32s(&ping->png_size); + __swab32s(&ping->png_flags); + return; + } + + LBUG(); +} + +static int +sfw_add_test_instance(struct sfw_batch *tsb, struct srpc_server_rpc *rpc) +{ + struct srpc_msg *msg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; + struct srpc_bulk *bk = rpc->srpc_bulk; + int ndest = req->tsr_ndest; + struct sfw_test_unit *tsu; + struct sfw_test_instance *tsi; + int i; + int rc; + + LIBCFS_ALLOC(tsi, sizeof(*tsi)); + if (tsi == NULL) { + CERROR ("Can't allocate test instance for batch: %llu\n", + tsb->bat_id.bat_id); + return -ENOMEM; + } + + spin_lock_init(&tsi->tsi_lock); + atomic_set(&tsi->tsi_nactive, 0); + INIT_LIST_HEAD(&tsi->tsi_units); + INIT_LIST_HEAD(&tsi->tsi_free_rpcs); + INIT_LIST_HEAD(&tsi->tsi_active_rpcs); + + tsi->tsi_stopping = 0; + tsi->tsi_batch = tsb; + tsi->tsi_loop = req->tsr_loop; + tsi->tsi_concur = req->tsr_concur; + tsi->tsi_service = req->tsr_service; + 
tsi->tsi_is_client = !!(req->tsr_is_client); + tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr); + + rc = sfw_load_test(tsi); + if (rc != 0) { + LIBCFS_FREE(tsi, sizeof(*tsi)); + return rc; + } + + LASSERT (!sfw_batch_active(tsb)); + + if (!tsi->tsi_is_client) { + /* it's test server, just add it to tsb */ + list_add_tail(&tsi->tsi_list, &tsb->bat_tests); + return 0; + } + + LASSERT (bk != NULL); + LASSERT (bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest); + LASSERT((unsigned int)bk->bk_len >= + sizeof(struct lnet_process_id_packed) * ndest); + + sfw_unpack_addtest_req(msg); + memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u)); + + for (i = 0; i < ndest; i++) { + struct lnet_process_id_packed *dests; + struct lnet_process_id_packed id; + int j; + + dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].bv_page); + LASSERT (dests != NULL); /* my pages are within KVM always */ + id = dests[i % SFW_ID_PER_PAGE]; + if (msg->msg_magic != SRPC_MSG_MAGIC) + sfw_unpack_id(id); + + for (j = 0; j < tsi->tsi_concur; j++) { + LIBCFS_ALLOC(tsu, sizeof(*tsu)); + if (tsu == NULL) { + rc = -ENOMEM; + CERROR ("Can't allocate tsu for %d\n", + tsi->tsi_service); + goto error; + } + + tsu->tsu_dest.nid = id.nid; + tsu->tsu_dest.pid = id.pid; + tsu->tsu_instance = tsi; + tsu->tsu_private = NULL; + list_add_tail(&tsu->tsu_list, &tsi->tsi_units); + } + } + + rc = tsi->tsi_ops->tso_init(tsi); + if (rc == 0) { + list_add_tail(&tsi->tsi_list, &tsb->bat_tests); + return 0; + } + +error: + LASSERT(rc != 0); + sfw_destroy_test_instance(tsi); + return rc; +} + +static void +sfw_test_unit_done(struct sfw_test_unit *tsu) +{ + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_batch *tsb = tsi->tsi_batch; + struct sfw_session *sn = tsb->bat_session; + + LASSERT (sfw_test_active(tsi)); + + if (!atomic_dec_and_test(&tsi->tsi_nactive)) + return; + + /* the test instance is done */ + spin_lock(&tsi->tsi_lock); + + tsi->tsi_stopping = 0; + + spin_unlock(&tsi->tsi_lock); + + spin_lock(&sfw_data.fw_lock); + + if (!atomic_dec_and_test(&tsb->bat_nactive) ||/* tsb still active */ + sn == sfw_data.fw_session) { /* sn also active */ + spin_unlock(&sfw_data.fw_lock); + return; + } + + LASSERT(!list_empty(&sn->sn_list)); /* I'm a zombie! 
*/ + + list_for_each_entry(tsb, &sn->sn_batches, bat_list) { + if (sfw_batch_active(tsb)) { + spin_unlock(&sfw_data.fw_lock); + return; + } + } + + list_del_init(&sn->sn_list); + spin_unlock(&sfw_data.fw_lock); + + sfw_destroy_session(sn); +} + +static void +sfw_test_rpc_done(struct srpc_client_rpc *rpc) +{ + struct sfw_test_unit *tsu = rpc->crpc_priv; + struct sfw_test_instance *tsi = tsu->tsu_instance; + int done = 0; + + tsi->tsi_ops->tso_done_rpc(tsu, rpc); + + spin_lock(&tsi->tsi_lock); + + LASSERT(sfw_test_active(tsi)); + LASSERT(!list_empty(&rpc->crpc_list)); + + list_del_init(&rpc->crpc_list); + + /* batch is stopping or loop is done or get error */ + if (tsi->tsi_stopping || + tsu->tsu_loop == 0 || + (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr)) + done = 1; + + /* dec ref for poster */ + srpc_client_rpc_decref(rpc); + + spin_unlock(&tsi->tsi_lock); + + if (!done) { + swi_schedule_workitem(&tsu->tsu_worker); + return; + } + + sfw_test_unit_done(tsu); +} + +int +sfw_create_test_rpc(struct sfw_test_unit *tsu, struct lnet_process_id peer, + unsigned features, int nblk, int blklen, + struct srpc_client_rpc **rpcpp) +{ + struct srpc_client_rpc *rpc = NULL; + struct sfw_test_instance *tsi = tsu->tsu_instance; + + spin_lock(&tsi->tsi_lock); + + LASSERT (sfw_test_active(tsi)); + + if (!list_empty(&tsi->tsi_free_rpcs)) { + /* pick request from buffer */ + rpc = list_entry(tsi->tsi_free_rpcs.next, + struct srpc_client_rpc, crpc_list); + LASSERT(nblk == rpc->crpc_bulk.bk_niov); + list_del_init(&rpc->crpc_list); + } + + spin_unlock(&tsi->tsi_lock); + + if (rpc == NULL) { + rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk, + blklen, sfw_test_rpc_done, + sfw_test_rpc_fini, tsu); + } else { + srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk, + blklen, sfw_test_rpc_done, + sfw_test_rpc_fini, tsu); + } + + if (rpc == NULL) { + CERROR("Can't create rpc for test %d\n", tsi->tsi_service); + return -ENOMEM; + } + + rpc->crpc_reqstmsg.msg_ses_feats = features; + *rpcpp = rpc; + + return 0; +} + +static int +sfw_run_test(struct swi_workitem *wi) +{ + struct sfw_test_unit *tsu = container_of(wi, struct sfw_test_unit, tsu_worker); + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct srpc_client_rpc *rpc = NULL; + + LASSERT (wi == &tsu->tsu_worker); + + if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) { + LASSERT (rpc == NULL); + goto test_done; + } + + LASSERT (rpc != NULL); + + spin_lock(&tsi->tsi_lock); + + if (tsi->tsi_stopping) { + list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); + spin_unlock(&tsi->tsi_lock); + goto test_done; + } + + if (tsu->tsu_loop > 0) + tsu->tsu_loop--; + + list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs); + spin_unlock(&tsi->tsi_lock); + + spin_lock(&rpc->crpc_lock); + rpc->crpc_timeout = rpc_timeout; + srpc_post_rpc(rpc); + spin_unlock(&rpc->crpc_lock); + return 0; + +test_done: + /* + * No one can schedule me now since: + * - previous RPC, if any, has done and + * - no new RPC is initiated. + * - my batch is still active; no one can run it again now. 
+ * Cancel pending schedules and prevent future schedule attempts: + */ + swi_exit_workitem(wi); + sfw_test_unit_done(tsu); + return 1; +} + +static int +sfw_run_batch(struct sfw_batch *tsb) +{ + struct swi_workitem *wi; + struct sfw_test_unit *tsu; + struct sfw_test_instance *tsi; + + if (sfw_batch_active(tsb)) { + CDEBUG(D_NET, "Batch already active: %llu (%d)\n", + tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive)); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + if (!tsi->tsi_is_client) /* skip server instances */ + continue; + + LASSERT(!tsi->tsi_stopping); + LASSERT(!sfw_test_active(tsi)); + + atomic_inc(&tsb->bat_nactive); + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + atomic_inc(&tsi->tsi_nactive); + tsu->tsu_loop = tsi->tsi_loop; + wi = &tsu->tsu_worker; + swi_init_workitem(wi, sfw_run_test, + lst_sched_test[lnet_cpt_of_nid(tsu->tsu_dest.nid, NULL)]); + swi_schedule_workitem(wi); + } + } + + return 0; +} + +static int +sfw_stop_batch(struct sfw_batch *tsb, int force) +{ + struct sfw_test_instance *tsi; + struct srpc_client_rpc *rpc; + + if (!sfw_batch_active(tsb)) { + CDEBUG(D_NET, "Batch %llu inactive\n", tsb->bat_id.bat_id); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + spin_lock(&tsi->tsi_lock); + + if (!tsi->tsi_is_client || + !sfw_test_active(tsi) || tsi->tsi_stopping) { + spin_unlock(&tsi->tsi_lock); + continue; + } + + tsi->tsi_stopping = 1; + + if (!force) { + spin_unlock(&tsi->tsi_lock); + continue; + } + + /* abort launched rpcs in the test */ + list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) { + spin_lock(&rpc->crpc_lock); + + srpc_abort_rpc(rpc, -EINTR); + + spin_unlock(&rpc->crpc_lock); + } + + spin_unlock(&tsi->tsi_lock); + } + + return 0; +} + +static int +sfw_query_batch(struct sfw_batch *tsb, int testidx, + struct srpc_batch_reply *reply) +{ + struct sfw_test_instance *tsi; + + if (testidx < 0) + return -EINVAL; + + if (testidx == 0) { + reply->bar_active = atomic_read(&tsb->bat_nactive); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + if (testidx-- > 1) + continue; + + reply->bar_active = atomic_read(&tsi->tsi_nactive); + return 0; + } + + return -ENOENT; +} + +void +sfw_free_pages(struct srpc_server_rpc *rpc) +{ + srpc_free_bulk(rpc->srpc_bulk); + rpc->srpc_bulk = NULL; +} + +int +sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, + int sink) +{ + LASSERT(rpc->srpc_bulk == NULL); + LASSERT(npages > 0 && npages <= LNET_MAX_IOV); + + rpc->srpc_bulk = srpc_alloc_bulk(cpt, 0, npages, len, sink); + if (rpc->srpc_bulk == NULL) + return -ENOMEM; + + return 0; +} + +static int +sfw_add_test(struct srpc_server_rpc *rpc) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct srpc_test_reply *reply = &rpc->srpc_replymsg.msg_body.tes_reply; + struct srpc_test_reqst *request; + int rc; + struct sfw_batch *bat; + + request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst; + reply->tsr_sid = (sn == NULL) ? 
LST_INVALID_SID : sn->sn_id; + + if (request->tsr_loop == 0 || + request->tsr_concur == 0 || + request->tsr_sid.ses_nid == LNET_NID_ANY || + request->tsr_ndest > SFW_MAX_NDESTS || + (request->tsr_is_client && request->tsr_ndest == 0) || + request->tsr_concur > SFW_MAX_CONCUR || + request->tsr_service > SRPC_SERVICE_MAX_ID || + request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) { + reply->tsr_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->tsr_sid, sn->sn_id) || + sfw_find_test_case(request->tsr_service) == NULL) { + reply->tsr_status = ENOENT; + return 0; + } + + bat = sfw_bid2batch(request->tsr_bid); + if (bat == NULL) { + CERROR("dropping RPC %s from %s under memory pressure\n", + rpc->srpc_scd->scd_svc->sv_name, + libcfs_id2str(rpc->srpc_peer)); + return -ENOMEM; + } + + if (sfw_batch_active(bat)) { + reply->tsr_status = EBUSY; + return 0; + } + + if (request->tsr_is_client && rpc->srpc_bulk == NULL) { + /* rpc will be resumed later in sfw_bulk_ready */ + int npg = sfw_id_pages(request->tsr_ndest); + int len; + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + len = npg * PAGE_SIZE; + + } else { + len = sizeof(struct lnet_process_id_packed) * + request->tsr_ndest; + } + + return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1); + } + + rc = sfw_add_test_instance(bat, rpc); + CDEBUG (rc == 0 ? D_NET : D_WARNING, + "%s test: sv %d %s, loop %d, concur %d, ndest %d\n", + rc == 0 ? "Added" : "Failed to add", request->tsr_service, + request->tsr_is_client ? "client" : "server", + request->tsr_loop, request->tsr_concur, request->tsr_ndest); + + reply->tsr_status = (rc < 0) ? -rc : rc; + return 0; +} + +static int +sfw_control_batch(struct srpc_batch_reqst *request, + struct srpc_batch_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + int rc = 0; + struct sfw_batch *bat; + + reply->bar_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (sn == NULL || !sfw_sid_equal(request->bar_sid, sn->sn_id)) { + reply->bar_status = ESRCH; + return 0; + } + + bat = sfw_find_batch(request->bar_bid); + if (bat == NULL) { + reply->bar_status = ENOENT; + return 0; + } + + switch (request->bar_opc) { + case SRPC_BATCH_OPC_RUN: + rc = sfw_run_batch(bat); + break; + + case SRPC_BATCH_OPC_STOP: + rc = sfw_stop_batch(bat, request->bar_arg); + break; + + case SRPC_BATCH_OPC_QUERY: + rc = sfw_query_batch(bat, request->bar_testidx, reply); + break; + + default: + return -EINVAL; /* drop it */ + } + + reply->bar_status = (rc < 0) ? 
-rc : rc; + return 0; +} + +static int +sfw_handle_server_rpc(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + struct srpc_msg *reply = &rpc->srpc_replymsg; + struct srpc_msg *request = &rpc->srpc_reqstbuf->buf_msg; + unsigned features = LST_FEATS_MASK; + int rc = 0; + + LASSERT(sfw_data.fw_active_srpc == NULL); + LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + spin_lock(&sfw_data.fw_lock); + + if (sfw_data.fw_shuttingdown) { + spin_unlock(&sfw_data.fw_lock); + return -ESHUTDOWN; + } + + /* Remove timer to avoid racing with it or expiring active session */ + if (sfw_del_session_timer() != 0) { + CERROR("dropping RPC %s from %s: racing with expiry timer\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer)); + spin_unlock(&sfw_data.fw_lock); + return -EAGAIN; + } + + sfw_data.fw_active_srpc = rpc; + spin_unlock(&sfw_data.fw_lock); + + sfw_unpack_message(request); + LASSERT(request->msg_type == srpc_service2request(sv->sv_id)); + + /* rpc module should have checked this */ + LASSERT(request->msg_version == SRPC_MSG_VERSION); + + if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION && + sv->sv_id != SRPC_SERVICE_DEBUG) { + struct sfw_session *sn = sfw_data.fw_session; + + if (sn != NULL && + sn->sn_features != request->msg_ses_feats) { + CNETERR("Features of framework RPC don't match " + "features of current session: %x/%x\n", + request->msg_ses_feats, sn->sn_features); + reply->msg_body.reply.status = EPROTO; + reply->msg_body.reply.sid = sn->sn_id; + goto out; + } + + } else if ((request->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + /* NB: at this point, old version will ignore features and + * create new session anyway, so console should be able + * to handle this */ + reply->msg_body.reply.status = EPROTO; + goto out; + } + + switch(sv->sv_id) { + default: + LBUG (); + case SRPC_SERVICE_TEST: + rc = sfw_add_test(rpc); + break; + + case SRPC_SERVICE_BATCH: + rc = sfw_control_batch(&request->msg_body.bat_reqst, + &reply->msg_body.bat_reply); + break; + + case SRPC_SERVICE_QUERY_STAT: + rc = sfw_get_stats(&request->msg_body.stat_reqst, + &reply->msg_body.stat_reply); + break; + + case SRPC_SERVICE_DEBUG: + rc = sfw_debug_session(&request->msg_body.dbg_reqst, + &reply->msg_body.dbg_reply); + break; + + case SRPC_SERVICE_MAKE_SESSION: + rc = sfw_make_session(&request->msg_body.mksn_reqst, + &reply->msg_body.mksn_reply); + break; + + case SRPC_SERVICE_REMOVE_SESSION: + rc = sfw_remove_session(&request->msg_body.rmsn_reqst, + &reply->msg_body.rmsn_reply); + break; + } + + if (sfw_data.fw_session != NULL) + features = sfw_data.fw_session->sn_features; + out: + reply->msg_ses_feats = features; + rpc->srpc_done = sfw_server_rpc_done; + spin_lock(&sfw_data.fw_lock); + + if (!sfw_data.fw_shuttingdown) + sfw_add_session_timer(); + + sfw_data.fw_active_srpc = NULL; + spin_unlock(&sfw_data.fw_lock); + return rc; +} + +static int +sfw_bulk_ready(struct srpc_server_rpc *rpc, int status) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + int rc; + + LASSERT(rpc->srpc_bulk != NULL); + LASSERT(sv->sv_id == SRPC_SERVICE_TEST); + LASSERT(sfw_data.fw_active_srpc == NULL); + LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client); + + spin_lock(&sfw_data.fw_lock); + + if (status != 0) { + CERROR("Bulk transfer failed for RPC: " + "service %s, peer %s, status %d\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer), status); + spin_unlock(&sfw_data.fw_lock); + return -EIO; + } + + if (sfw_data.fw_shuttingdown) { + spin_unlock(&sfw_data.fw_lock); + return -ESHUTDOWN; + } 
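+ + /* As in sfw_handle_server_rpc(): remove the session timer to avoid racing with it or expiring the active session before this bulk-completed RPC is taken over. */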
+ + if (sfw_del_session_timer() != 0) { + CERROR("dropping RPC %s from %s: racing with expiry timer\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer)); + spin_unlock(&sfw_data.fw_lock); + return -EAGAIN; + } + + sfw_data.fw_active_srpc = rpc; + spin_unlock(&sfw_data.fw_lock); + + rc = sfw_add_test(rpc); + + spin_lock(&sfw_data.fw_lock); + + if (!sfw_data.fw_shuttingdown) + sfw_add_session_timer(); + + sfw_data.fw_active_srpc = NULL; + spin_unlock(&sfw_data.fw_lock); + return rc; +} + +struct srpc_client_rpc * +sfw_create_rpc(struct lnet_process_id peer, int service, + unsigned features, int nbulkiov, int bulklen, + void (*done)(struct srpc_client_rpc *), void *priv) +{ + struct srpc_client_rpc *rpc = NULL; + + spin_lock(&sfw_data.fw_lock); + + LASSERT (!sfw_data.fw_shuttingdown); + LASSERT (service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) { + rpc = list_entry(sfw_data.fw_zombie_rpcs.next, + struct srpc_client_rpc, crpc_list); + list_del(&rpc->crpc_list); + + srpc_init_client_rpc(rpc, peer, service, 0, 0, + done, sfw_client_rpc_fini, priv); + } + + spin_unlock(&sfw_data.fw_lock); + + if (rpc == NULL) { + rpc = srpc_create_client_rpc(peer, service, + nbulkiov, bulklen, done, + nbulkiov != 0 ? NULL : + sfw_client_rpc_fini, + priv); + } + + if (rpc != NULL) /* "session" is concept in framework */ + rpc->crpc_reqstmsg.msg_ses_feats = features; + + return rpc; +} + +void +sfw_unpack_message(struct srpc_msg *msg) +{ + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + /* srpc module should guarantee I wouldn't get crap */ + LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + if (msg->msg_type == SRPC_MSG_STAT_REQST) { + struct srpc_stat_reqst *req = &msg->msg_body.stat_reqst; + + __swab32s(&req->str_type); + __swab64s(&req->str_rpyid); + sfw_unpack_sid(req->str_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_STAT_REPLY) { + struct srpc_stat_reply *rep = &msg->msg_body.stat_reply; + + __swab32s(&rep->str_status); + sfw_unpack_sid(rep->str_sid); + sfw_unpack_fw_counters(rep->str_fw); + sfw_unpack_rpc_counters(rep->str_rpc); + sfw_unpack_lnet_counters(rep->str_lnet); + return; + } + + if (msg->msg_type == SRPC_MSG_MKSN_REQST) { + struct srpc_mksn_reqst *req = &msg->msg_body.mksn_reqst; + + __swab64s(&req->mksn_rpyid); + __swab32s(&req->mksn_force); + sfw_unpack_sid(req->mksn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_MKSN_REPLY) { + struct srpc_mksn_reply *rep = &msg->msg_body.mksn_reply; + + __swab32s(&rep->mksn_status); + __swab32s(&rep->mksn_timeout); + sfw_unpack_sid(rep->mksn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_RMSN_REQST) { + struct srpc_rmsn_reqst *req = &msg->msg_body.rmsn_reqst; + + __swab64s(&req->rmsn_rpyid); + sfw_unpack_sid(req->rmsn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_RMSN_REPLY) { + struct srpc_rmsn_reply *rep = &msg->msg_body.rmsn_reply; + + __swab32s(&rep->rmsn_status); + sfw_unpack_sid(rep->rmsn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_DEBUG_REQST) { + struct srpc_debug_reqst *req = &msg->msg_body.dbg_reqst; + + __swab64s(&req->dbg_rpyid); + __swab32s(&req->dbg_flags); + sfw_unpack_sid(req->dbg_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) { + struct srpc_debug_reply *rep = &msg->msg_body.dbg_reply; + + __swab32s(&rep->dbg_nbatch); + __swab32s(&rep->dbg_timeout); + sfw_unpack_sid(rep->dbg_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_BATCH_REQST) { + struct srpc_batch_reqst *req = 
&msg->msg_body.bat_reqst; + + __swab32s(&req->bar_opc); + __swab64s(&req->bar_rpyid); + __swab32s(&req->bar_testidx); + __swab32s(&req->bar_arg); + sfw_unpack_sid(req->bar_sid); + __swab64s(&req->bar_bid.bat_id); + return; + } + + if (msg->msg_type == SRPC_MSG_BATCH_REPLY) { + struct srpc_batch_reply *rep = &msg->msg_body.bat_reply; + + __swab32s(&rep->bar_status); + sfw_unpack_sid(rep->bar_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_TEST_REQST) { + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; + + __swab64s(&req->tsr_rpyid); + __swab64s(&req->tsr_bulkid); + __swab32s(&req->tsr_loop); + __swab32s(&req->tsr_ndest); + __swab32s(&req->tsr_concur); + __swab32s(&req->tsr_service); + sfw_unpack_sid(req->tsr_sid); + __swab64s(&req->tsr_bid.bat_id); + return; + } + + if (msg->msg_type == SRPC_MSG_TEST_REPLY) { + struct srpc_test_reply *rep = &msg->msg_body.tes_reply; + + __swab32s(&rep->tsr_status); + sfw_unpack_sid(rep->tsr_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_JOIN_REQST) { + struct srpc_join_reqst *req = &msg->msg_body.join_reqst; + + __swab64s(&req->join_rpyid); + sfw_unpack_sid(req->join_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_JOIN_REPLY) { + struct srpc_join_reply *rep = &msg->msg_body.join_reply; + + __swab32s(&rep->join_status); + __swab32s(&rep->join_timeout); + sfw_unpack_sid(rep->join_sid); + return; + } + + LBUG (); +} + +void +sfw_abort_rpc(struct srpc_client_rpc *rpc) +{ + LASSERT(atomic_read(&rpc->crpc_refcount) > 0); + LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + spin_lock(&rpc->crpc_lock); + srpc_abort_rpc(rpc, -EINTR); + spin_unlock(&rpc->crpc_lock); +} + +void +sfw_post_rpc(struct srpc_client_rpc *rpc) +{ + spin_lock(&rpc->crpc_lock); + + LASSERT(!rpc->crpc_closed); + LASSERT(!rpc->crpc_aborted); + LASSERT(list_empty(&rpc->crpc_list)); + LASSERT(!sfw_data.fw_shuttingdown); + + rpc->crpc_timeout = rpc_timeout; + srpc_post_rpc(rpc); + + spin_unlock(&rpc->crpc_lock); +} + +static struct srpc_service sfw_services[] = { + { .sv_id = SRPC_SERVICE_DEBUG, .sv_name = "debug", }, + { .sv_id = SRPC_SERVICE_QUERY_STAT, .sv_name = "query stats", }, + { .sv_id = SRPC_SERVICE_MAKE_SESSION, .sv_name = "make session", }, + { .sv_id = SRPC_SERVICE_REMOVE_SESSION, .sv_name = "remove session", }, + { .sv_id = SRPC_SERVICE_BATCH, .sv_name = "batch service", }, + { .sv_id = SRPC_SERVICE_TEST, .sv_name = "test service", }, + { .sv_id = 0, } }; + +int +sfw_startup (void) +{ + int i; + int rc; + int error; + struct srpc_service *sv; + struct sfw_test_case *tsc; + + + if (session_timeout < 0) { + CERROR ("Session timeout must be non-negative: %d\n", + session_timeout); + return -EINVAL; + } + + if (rpc_timeout < 0) { + CERROR ("RPC timeout must be non-negative: %d\n", + rpc_timeout); + return -EINVAL; + } + + if (session_timeout == 0) + CWARN ("Zero session_timeout specified " + "- test sessions never expire.\n"); + + if (rpc_timeout == 0) + CWARN ("Zero rpc_timeout specified " + "- test RPCs never expire.\n"); + + memset(&sfw_data, 0, sizeof(struct smoketest_framework)); + + sfw_data.fw_session = NULL; + sfw_data.fw_active_srpc = NULL; + spin_lock_init(&sfw_data.fw_lock); + atomic_set(&sfw_data.fw_nzombies, 0); + INIT_LIST_HEAD(&sfw_data.fw_tests); + INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs); + INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions); + + brw_init_test_client(); + brw_init_test_service(); + rc = sfw_register_test(&brw_test_service, &brw_test_client); + LASSERT (rc == 0); + + ping_init_test_client(); + ping_init_test_service(); + rc
= sfw_register_test(&ping_test_service, &ping_test_client); + LASSERT (rc == 0); + + error = 0; + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + sv = tsc->tsc_srv_service; + + rc = srpc_add_service(sv); + LASSERT(rc != -EBUSY); + if (rc != 0) { + CWARN("Failed to add %s service: %d\n", + sv->sv_name, rc); + error = rc; + } + } + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) break; + + sv->sv_bulk_ready = NULL; + sv->sv_handler = sfw_handle_server_rpc; + sv->sv_wi_total = SFW_FRWK_WI_MAX; + if (sv->sv_id == SRPC_SERVICE_TEST) + sv->sv_bulk_ready = sfw_bulk_ready; + + rc = srpc_add_service(sv); + LASSERT (rc != -EBUSY); + if (rc != 0) { + CWARN ("Failed to add %s service: %d\n", + sv->sv_name, rc); + error = rc; + } + + /* about to sfw_shutdown, no need to add buffer */ + if (error) continue; + + rc = srpc_service_add_buffers(sv, sv->sv_wi_total); + if (rc != 0) { + CWARN("Failed to reserve enough buffers: " + "service %s, %d needed: %d\n", + sv->sv_name, sv->sv_wi_total, rc); + error = -ENOMEM; + } + } + + if (error != 0) + sfw_shutdown(); + return error; +} + +void +sfw_shutdown (void) +{ + struct srpc_service *sv; + struct sfw_test_case *tsc; + int i; + + spin_lock(&sfw_data.fw_lock); + + sfw_data.fw_shuttingdown = 1; + lst_wait_until(sfw_data.fw_active_srpc == NULL, sfw_data.fw_lock, + "waiting for active RPC to finish.\n"); + + if (sfw_del_session_timer() != 0) + lst_wait_until(sfw_data.fw_session == NULL, sfw_data.fw_lock, + "waiting for session timer to explode.\n"); + + sfw_deactivate_session(); + lst_wait_until(atomic_read(&sfw_data.fw_nzombies) == 0, + sfw_data.fw_lock, + "waiting for %d zombie sessions to die.\n", + atomic_read(&sfw_data.fw_nzombies)); + + spin_unlock(&sfw_data.fw_lock); + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) + break; + + srpc_shutdown_service(sv); + srpc_remove_service(sv); + } + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + sv = tsc->tsc_srv_service; + srpc_shutdown_service(sv); + srpc_remove_service(sv); + } + + while (!list_empty(&sfw_data.fw_zombie_rpcs)) { + struct srpc_client_rpc *rpc; + + rpc = list_entry(sfw_data.fw_zombie_rpcs.next, + struct srpc_client_rpc, crpc_list); + list_del(&rpc->crpc_list); + + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) + break; + + srpc_wait_service_shutdown(sv); + } + + while (!list_empty(&sfw_data.fw_tests)) { + tsc = list_entry(sfw_data.fw_tests.next, + struct sfw_test_case, tsc_list); + + srpc_wait_service_shutdown(tsc->tsc_srv_service); + + list_del(&tsc->tsc_list); + LIBCFS_FREE(tsc, sizeof(*tsc)); + } +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/module.c b/drivers/staging/lustrefsx/lnet/selftest/module.c new file mode 100644 index 0000000000000..1441600e1a327 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/module.c @@ -0,0 +1,170 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" +#include "console.h" + +enum { + LST_INIT_NONE = 0, + LST_INIT_WI_SERIAL, + LST_INIT_WI_TEST, + LST_INIT_RPC, + LST_INIT_FW, + LST_INIT_CONSOLE +}; + +static int lst_init_step = LST_INIT_NONE; + +struct cfs_wi_sched *lst_sched_serial; +struct cfs_wi_sched **lst_sched_test; + +static void +lnet_selftest_exit(void) +{ + int i; + + switch (lst_init_step) { + case LST_INIT_CONSOLE: + lstcon_console_fini(); + fallthrough; + case LST_INIT_FW: + sfw_shutdown(); + fallthrough; + case LST_INIT_RPC: + srpc_shutdown(); + fallthrough; + case LST_INIT_WI_TEST: + for (i = 0; + i < cfs_cpt_number(lnet_cpt_table()); i++) { + if (lst_sched_test[i] == NULL) + continue; + cfs_wi_sched_destroy(lst_sched_test[i]); + } + CFS_FREE_PTR_ARRAY(lst_sched_test, + cfs_cpt_number(lnet_cpt_table())); + lst_sched_test = NULL; + fallthrough; + case LST_INIT_WI_SERIAL: + cfs_wi_sched_destroy(lst_sched_serial); + lst_sched_serial = NULL; + fallthrough; + case LST_INIT_NONE: + break; + default: + LBUG(); + } +} + +void +lnet_selftest_structure_assertion(void) +{ + BUILD_BUG_ON(sizeof(struct srpc_msg) != 160); + BUILD_BUG_ON(sizeof(struct srpc_test_reqst) != 70); + BUILD_BUG_ON(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_concur) != + 72); + BUILD_BUG_ON(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_ndest) != + 78); + BUILD_BUG_ON(sizeof(struct srpc_stat_reply) != 136); + BUILD_BUG_ON(sizeof(struct srpc_stat_reqst) != 28); +} + +static int __init +lnet_selftest_init(void) +{ + int nscheds; + int rc; + int i; + + rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY, + 1, &lst_sched_serial); + if (rc != 0) { + CERROR("Failed to create serial WI scheduler for LST\n"); + return rc; + } + lst_init_step = LST_INIT_WI_SERIAL; + + nscheds = cfs_cpt_number(lnet_cpt_table()); + CFS_ALLOC_PTR_ARRAY(lst_sched_test, nscheds); + if (lst_sched_test == NULL) { + rc = -ENOMEM; + goto error; + } + + lst_init_step = LST_INIT_WI_TEST; + for (i = 0; i < nscheds; i++) { + int nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + + /* reserve at least one CPU for LND */ + nthrs = max(nthrs - 1, 1); + rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i, + nthrs, &lst_sched_test[i]); + if (rc != 0) { + CERROR("Failed to create CPU partition affinity WI scheduler %d for LST\n", + i); + goto error; + } + } + + rc = srpc_startup(); + if (rc != 0) { + CERROR("LST can't startup rpc\n"); + goto error; + } + lst_init_step = LST_INIT_RPC; + + rc = sfw_startup(); + if (rc != 0) { + CERROR("LST can't startup framework\n"); + goto error; + } + lst_init_step = LST_INIT_FW; + + rc = lstcon_console_init(); + if (rc != 0) { + CERROR("LST can't startup console\n"); + goto error; + } + lst_init_step = LST_INIT_CONSOLE; + return 0; +error: + lnet_selftest_exit(); + return rc; +} + +MODULE_AUTHOR("OpenSFS, Inc. 
"); +MODULE_DESCRIPTION("LNet Selftest"); +MODULE_VERSION("2.8.0"); +MODULE_LICENSE("GPL"); + +module_init(lnet_selftest_init); +module_exit(lnet_selftest_exit); diff --git a/drivers/staging/lustrefsx/lnet/selftest/ping_test.c b/drivers/staging/lustrefsx/lnet/selftest/ping_test.c new file mode 100644 index 0000000000000..021cb431108dd --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/ping_test.c @@ -0,0 +1,226 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/conctl.c + * + * Test client & Server + * + * Author: Liang Zhen + */ + +#include "selftest.h" + +#define LST_PING_TEST_MAGIC 0xbabeface + +static int ping_srv_workitems = SFW_TEST_WI_MAX; +module_param(ping_srv_workitems, int, 0644); +MODULE_PARM_DESC(ping_srv_workitems, "# PING server workitems"); + +struct lst_ping_data { + spinlock_t pnd_lock; /* serialize */ + int pnd_counter; /* sequence counter */ +}; + +static struct lst_ping_data lst_ping_data; + +static int +ping_client_init(struct sfw_test_instance *tsi) +{ + struct sfw_session *sn = tsi->tsi_batch->bat_session; + + LASSERT(tsi->tsi_is_client); + LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0); + + spin_lock_init(&lst_ping_data.pnd_lock); + lst_ping_data.pnd_counter = 0; + + return 0; +} + +static void +ping_client_fini(struct sfw_test_instance *tsi) +{ + struct sfw_session *sn = tsi->tsi_batch->bat_session; + int errors; + + LASSERT (sn != NULL); + LASSERT (tsi->tsi_is_client); + + errors = atomic_read(&sn->sn_ping_errors); + if (errors) + CWARN ("%d pings have failed.\n", errors); + else + CDEBUG (D_NET, "Ping test finished OK.\n"); +} + +static int +ping_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, + struct srpc_client_rpc **rpc) +{ + struct srpc_ping_reqst *req; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct timespec64 ts; + int rc; + + LASSERT(sn != NULL); + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc); + if (rc != 0) + return rc; + + req = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst; + + req->pnr_magic = LST_PING_TEST_MAGIC; + + spin_lock(&lst_ping_data.pnd_lock); + req->pnr_seq = lst_ping_data.pnd_counter++; + spin_unlock(&lst_ping_data.pnd_lock); + + ktime_get_real_ts64(&ts); + req->pnr_time_sec = ts.tv_sec; + req->pnr_time_nsec = ts.tv_nsec; + + return rc; +} + +static void +ping_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) +{ + 
struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_ping_reqst *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; + struct srpc_ping_reply *reply = &rpc->crpc_replymsg.msg_body.ping_reply; + struct timespec64 ts; + + LASSERT(sn != NULL); + + if (rpc->crpc_status != 0) { + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_ping_errors); + CERROR ("Unable to ping %s (%d): %d\n", + libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq, rpc->crpc_status); + return; + } + + if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) { + __swab32s(&reply->pnr_seq); + __swab32s(&reply->pnr_magic); + __swab32s(&reply->pnr_status); + } + + if (reply->pnr_magic != LST_PING_TEST_MAGIC) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); + CERROR ("Bad magic %u from %s, %u expected.\n", + reply->pnr_magic, libcfs_id2str(rpc->crpc_dest), + LST_PING_TEST_MAGIC); + return; + } + + if (reply->pnr_seq != reqst->pnr_seq) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); + CERROR ("Bad seq %u from %s, %u expected.\n", + reply->pnr_seq, libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq); + return; + } + + ktime_get_real_ts64(&ts); + CDEBUG(D_NET, "%d reply in %llu nsec\n", reply->pnr_seq, + (u64)((ts.tv_sec - reqst->pnr_time_sec) * NSEC_PER_SEC + + (ts.tv_nsec - reqst->pnr_time_nsec))); +} + +static int +ping_server_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + struct srpc_msg *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_msg *replymsg = &rpc->srpc_replymsg; + struct srpc_ping_reqst *req = &reqstmsg->msg_body.ping_reqst; + struct srpc_ping_reply *rep = &rpc->srpc_replymsg.msg_body.ping_reply; + + LASSERT (sv->sv_id == SRPC_SERVICE_PING); + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { + LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + __swab32s(&req->pnr_seq); + __swab32s(&req->pnr_magic); + __swab64s(&req->pnr_time_sec); + __swab64s(&req->pnr_time_nsec); + } + LASSERT (reqstmsg->msg_type == srpc_service2request(sv->sv_id)); + + if (req->pnr_magic != LST_PING_TEST_MAGIC) { + CERROR ("Unexpected magic %08x from %s\n", + req->pnr_magic, libcfs_id2str(rpc->srpc_peer)); + return -EINVAL; + } + + rep->pnr_seq = req->pnr_seq; + rep->pnr_magic = LST_PING_TEST_MAGIC; + + if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + replymsg->msg_ses_feats = LST_FEATS_MASK; + rep->pnr_status = EPROTO; + return 0; + } + + replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; + + CDEBUG(D_NET, "Got ping %d from %s\n", + req->pnr_seq, libcfs_id2str(rpc->srpc_peer)); + return 0; +} + +struct sfw_test_client_ops ping_test_client; + +void ping_init_test_client(void) +{ + ping_test_client.tso_init = ping_client_init; + ping_test_client.tso_fini = ping_client_fini; + ping_test_client.tso_prep_rpc = ping_client_prep_rpc; + ping_test_client.tso_done_rpc = ping_client_done_rpc; +} + +struct srpc_service ping_test_service; + +void ping_init_test_service(void) +{ + ping_test_service.sv_id = SRPC_SERVICE_PING; + ping_test_service.sv_name = "ping_test"; + ping_test_service.sv_handler = ping_server_handle; + ping_test_service.sv_wi_total = ping_srv_workitems; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.c b/drivers/staging/lustrefsx/lnet/selftest/rpc.c new file mode 100644 index 0000000000000..d0bdd019e47e6 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.c @@ -0,0 +1,1685 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE
COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/rpc.c + * + * Author: Isaac Huang + * + * 2012-05-13: Liang Zhen + * - percpt data for service to improve smp performance + * - code cleanup + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +enum srpc_state { + SRPC_STATE_NONE, + SRPC_STATE_NI_INIT, + SRPC_STATE_EQ_INIT, + SRPC_STATE_RUNNING, + SRPC_STATE_STOPPING, +}; + +static struct smoketest_rpc { + spinlock_t rpc_glock; /* global lock */ + struct srpc_service *rpc_services[SRPC_SERVICE_MAX_ID + 1]; + lnet_handler_t rpc_lnet_handler;/* _the_ LNet event handler */ + enum srpc_state rpc_state; + struct srpc_counters rpc_counters; + __u64 rpc_matchbits; /* matchbits counter */ +} srpc_data; + +static inline int +srpc_serv_portal(int svc_id) +{ + return svc_id < SRPC_FRAMEWORK_SERVICE_MAX_ID ? 
+ SRPC_FRAMEWORK_REQUEST_PORTAL : SRPC_REQUEST_PORTAL; +} + +/* forward ref's */ +static int srpc_handle_rpc(struct swi_workitem *wi); + +void srpc_get_counters(struct srpc_counters *cnt) +{ + spin_lock(&srpc_data.rpc_glock); + *cnt = srpc_data.rpc_counters; + spin_unlock(&srpc_data.rpc_glock); +} + +void srpc_set_counters(const struct srpc_counters *cnt) +{ + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters = *cnt; + spin_unlock(&srpc_data.rpc_glock); +} + +static int +srpc_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i, int off, + int nob) +{ + LASSERT(off < PAGE_SIZE); + LASSERT(nob > 0 && nob <= PAGE_SIZE); + + bk->bk_iovs[i].bv_offset = off; + bk->bk_iovs[i].bv_page = pg; + bk->bk_iovs[i].bv_len = nob; + return nob; +} + +void +srpc_free_bulk(struct srpc_bulk *bk) +{ + int i; + struct page *pg; + + LASSERT(bk != NULL); + + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].bv_page; + if (pg == NULL) + break; + + __free_page(pg); + } + + LIBCFS_FREE(bk, offsetof(struct srpc_bulk, bk_iovs[bk->bk_niov])); +} + +struct srpc_bulk * +srpc_alloc_bulk(int cpt, unsigned bulk_off, unsigned bulk_npg, + unsigned bulk_len, int sink) +{ + struct srpc_bulk *bk; + int i; + + LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV); + + LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt, + offsetof(struct srpc_bulk, bk_iovs[bulk_npg])); + if (bk == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", bulk_npg); + return NULL; + } + + memset(bk, 0, offsetof(struct srpc_bulk, bk_iovs[bulk_npg])); + bk->bk_sink = sink; + bk->bk_len = bulk_len; + bk->bk_niov = bulk_npg; + + for (i = 0; i < bulk_npg; i++) { + struct page *pg; + int nob; + + pg = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_KERNEL); + if (pg == NULL) { + CERROR("Can't allocate page %d of %d\n", i, bulk_npg); + srpc_free_bulk(bk); + return NULL; + } + + nob = min_t(unsigned, bulk_off + bulk_len, PAGE_SIZE) - + bulk_off; + + srpc_add_bulk_page(bk, pg, i, bulk_off, nob); + bulk_len -= nob; + bulk_off = 0; + } + + return bk; +} + +static inline __u64 +srpc_next_id (void) +{ + __u64 id; + + spin_lock(&srpc_data.rpc_glock); + id = srpc_data.rpc_matchbits++; + spin_unlock(&srpc_data.rpc_glock); + return id; +} + +static void +srpc_init_server_rpc(struct srpc_server_rpc *rpc, + struct srpc_service_cd *scd, + struct srpc_buffer *buffer) +{ + memset(rpc, 0, sizeof(*rpc)); + swi_init_workitem(&rpc->srpc_wi, srpc_handle_rpc, + srpc_serv_is_framework(scd->scd_svc) ? 
+ lst_sched_serial : lst_sched_test[scd->scd_cpt]); + + rpc->srpc_ev.ev_fired = 1; /* no event expected now */ + + rpc->srpc_scd = scd; + rpc->srpc_reqstbuf = buffer; + rpc->srpc_peer = buffer->buf_peer; + rpc->srpc_self = buffer->buf_self; + LNetInvalidateMDHandle(&rpc->srpc_replymdh); +} + +static void +srpc_service_fini(struct srpc_service *svc) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + struct srpc_buffer *buf; + struct list_head *q; + int i; + + if (svc->sv_cpt_data == NULL) + return; + + cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { + while (1) { + if (!list_empty(&scd->scd_buf_posted)) + q = &scd->scd_buf_posted; + else if (!list_empty(&scd->scd_buf_blocked)) + q = &scd->scd_buf_blocked; + else + break; + + while (!list_empty(q)) { + buf = list_entry(q->next, + struct srpc_buffer, + buf_list); + list_del(&buf->buf_list); + LIBCFS_FREE(buf, sizeof(*buf)); + } + } + + LASSERT(list_empty(&scd->scd_rpc_active)); + + while (!list_empty(&scd->scd_rpc_free)) { + rpc = list_entry(scd->scd_rpc_free.next, + struct srpc_server_rpc, + srpc_list); + list_del(&rpc->srpc_list); + LIBCFS_FREE(rpc, sizeof(*rpc)); + } + } + + cfs_percpt_free(svc->sv_cpt_data); + svc->sv_cpt_data = NULL; +} + +static int +srpc_service_nrpcs(struct srpc_service *svc) +{ + int nrpcs = svc->sv_wi_total / svc->sv_ncpts; + + return srpc_serv_is_framework(svc) ? + max(nrpcs, SFW_FRWK_WI_MIN) : max(nrpcs, SFW_TEST_WI_MIN); +} + +int srpc_add_buffer(struct swi_workitem *wi); + +static int +srpc_service_init(struct srpc_service *svc) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int nrpcs; + int i; + int j; + + svc->sv_shuttingdown = 0; + + svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct srpc_service_cd)); + if (svc->sv_cpt_data == NULL) + return -ENOMEM; + + svc->sv_ncpts = srpc_serv_is_framework(svc) ? 
+ 1 : cfs_cpt_number(lnet_cpt_table()); + nrpcs = srpc_service_nrpcs(svc); + + cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { + scd->scd_cpt = i; + scd->scd_svc = svc; + spin_lock_init(&scd->scd_lock); + INIT_LIST_HEAD(&scd->scd_rpc_free); + INIT_LIST_HEAD(&scd->scd_rpc_active); + INIT_LIST_HEAD(&scd->scd_buf_posted); + INIT_LIST_HEAD(&scd->scd_buf_blocked); + + scd->scd_ev.ev_data = scd; + scd->scd_ev.ev_type = SRPC_REQUEST_RCVD; + + /* NB: don't use lst_sched_serial for adding buffer, + * see details in srpc_service_add_buffers() */ + swi_init_workitem(&scd->scd_buf_wi, + srpc_add_buffer, lst_sched_test[i]); + + if (i != 0 && srpc_serv_is_framework(svc)) { + /* NB: framework service only needs srpc_service_cd for + * one partition, but we allocate for all to make + * it easier to implement, it will waste a little + * memory but nobody should care about this */ + continue; + } + + for (j = 0; j < nrpcs; j++) { + LIBCFS_CPT_ALLOC(rpc, lnet_cpt_table(), + i, sizeof(*rpc)); + if (rpc == NULL) { + srpc_service_fini(svc); + return -ENOMEM; + } + list_add(&rpc->srpc_list, &scd->scd_rpc_free); + } + } + + return 0; +} + +int +srpc_add_service(struct srpc_service *sv) +{ + int id = sv->sv_id; + + LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID); + + if (srpc_service_init(sv) != 0) + return -ENOMEM; + + spin_lock(&srpc_data.rpc_glock); + + LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); + + if (srpc_data.rpc_services[id] != NULL) { + spin_unlock(&srpc_data.rpc_glock); + goto failed; + } + + srpc_data.rpc_services[id] = sv; + spin_unlock(&srpc_data.rpc_glock); + + CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name); + return 0; + +failed: + srpc_service_fini(sv); + return -EBUSY; +} + +int +srpc_remove_service(struct srpc_service *sv) +{ + int id = sv->sv_id; + + spin_lock(&srpc_data.rpc_glock); + + if (srpc_data.rpc_services[id] != sv) { + spin_unlock(&srpc_data.rpc_glock); + return -ENOENT; + } + + srpc_data.rpc_services[id] = NULL; + spin_unlock(&srpc_data.rpc_glock); + return 0; +} + +static int +srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf, + int len, int options, struct lnet_process_id peer4, + struct lnet_handle_md *mdh, struct srpc_event *ev) +{ + int rc; + struct lnet_md md; + struct lnet_me *me; + struct lnet_processid peer; + + peer.pid = peer4.pid; + lnet_nid4_to_nid(peer4.nid, &peer.nid); + + me = LNetMEAttach(portal, &peer, matchbits, 0, LNET_UNLINK, + local ? LNET_INS_LOCAL : LNET_INS_AFTER); + if (IS_ERR(me)) { + rc = PTR_ERR(me); + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + md.threshold = 1; + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.options = options; + md.handler = srpc_data.rpc_lnet_handler; + + rc = LNetMDAttach(me, &md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + + return -ENOMEM; + } + + CDEBUG(D_NET, + "Posted passive RDMA: peer %s, portal %d, matchbits %#llx\n", + libcfs_id2str(peer4), portal, matchbits); + return 0; +} + +static int +srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len, + int options, struct lnet_process_id peer, + lnet_nid_t self, struct lnet_handle_md *mdh, + struct srpc_event *ev) +{ + int rc; + struct lnet_md md; + + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.handler = srpc_data.rpc_lnet_handler; + md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 
2 : 1; + md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET); + + rc = LNetMDBind(&md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR("LNetMDBind failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options. + * they're only meaningful for MDs attached to an ME (i.e. passive + * buffers... + */ + if ((options & LNET_MD_OP_PUT) != 0) { + rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer, + portal, matchbits, 0, 0); + } else { + LASSERT((options & LNET_MD_OP_GET) != 0); + + rc = LNetGet(self, *mdh, peer, portal, matchbits, 0, false); + } + + if (rc != 0) { + CERROR("LNet%s(%s, %d, %lld) failed: %d\n", + ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get", + libcfs_id2str(peer), portal, matchbits, rc); + + /* The forthcoming unlink event will complete this operation + * with failure, so fall through and return success here. + */ + rc = LNetMDUnlink(*mdh); + LASSERT(rc == 0); + } else { + CDEBUG(D_NET, + "Posted active RDMA: peer %s, portal %u, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + } + return 0; +} + +static int +srpc_post_passive_rqtbuf(int service, int local, void *buf, int len, + struct lnet_handle_md *mdh, struct srpc_event *ev) +{ + struct lnet_process_id any = {0}; + + any.nid = LNET_NID_ANY; + any.pid = LNET_PID_ANY; + + return srpc_post_passive_rdma(srpc_serv_portal(service), + local, service, buf, len, + LNET_MD_OP_PUT, any, mdh, ev); +} + +static int +srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf) +__must_hold(&scd->scd_lock) +{ + struct srpc_service *sv = scd->scd_svc; + struct srpc_msg *msg = &buf->buf_msg; + int rc; + + LNetInvalidateMDHandle(&buf->buf_mdh); + list_add(&buf->buf_list, &scd->scd_buf_posted); + scd->scd_buf_nposted++; + spin_unlock(&scd->scd_lock); + + rc = srpc_post_passive_rqtbuf(sv->sv_id, + !srpc_serv_is_framework(sv), + msg, sizeof(*msg), &buf->buf_mdh, + &scd->scd_ev); + + /* At this point, a RPC (new or delayed) may have arrived in + * msg and its event handler has been called. 
So we must add + * buf to scd_buf_posted _before_ dropping scd_lock */ + + spin_lock(&scd->scd_lock); + + if (rc == 0) { + if (!sv->sv_shuttingdown) + return 0; + + spin_unlock(&scd->scd_lock); + /* srpc_shutdown_service might have tried to unlink me + * when my buf_mdh was still invalid */ + LNetMDUnlink(buf->buf_mdh); + spin_lock(&scd->scd_lock); + return 0; + } + + scd->scd_buf_nposted--; + if (sv->sv_shuttingdown) + return rc; /* don't allow to change scd_buf_posted */ + + list_del(&buf->buf_list); + spin_unlock(&scd->scd_lock); + + LIBCFS_FREE(buf, sizeof(*buf)); + + spin_lock(&scd->scd_lock); + return rc; +} + +int +srpc_add_buffer(struct swi_workitem *wi) +{ + struct srpc_service_cd *scd = container_of(wi, struct srpc_service_cd, + scd_buf_wi); + struct srpc_buffer *buf; + int rc = 0; + + /* it's called by workitem scheduler threads, these threads + * should have been set CPT affinity, so buffers will be posted + * on CPT local list of Portal */ + spin_lock(&scd->scd_lock); + + while (scd->scd_buf_adjust > 0 && + !scd->scd_svc->sv_shuttingdown) { + scd->scd_buf_adjust--; /* consume it */ + scd->scd_buf_posting++; + + spin_unlock(&scd->scd_lock); + + LIBCFS_ALLOC(buf, sizeof(*buf)); + if (buf == NULL) { + CERROR("Failed to add new buf to service: %s\n", + scd->scd_svc->sv_name); + spin_lock(&scd->scd_lock); + rc = -ENOMEM; + break; + } + + spin_lock(&scd->scd_lock); + if (scd->scd_svc->sv_shuttingdown) { + spin_unlock(&scd->scd_lock); + LIBCFS_FREE(buf, sizeof(*buf)); + + spin_lock(&scd->scd_lock); + rc = -ESHUTDOWN; + break; + } + + rc = srpc_service_post_buffer(scd, buf); + if (rc != 0) + break; /* buf has been freed inside */ + + LASSERT(scd->scd_buf_posting > 0); + scd->scd_buf_posting--; + scd->scd_buf_total++; + scd->scd_buf_low = max(2, scd->scd_buf_total / 4); + } + + if (rc != 0) { + scd->scd_buf_err_stamp = ktime_get_real_seconds(); + scd->scd_buf_err = rc; + + LASSERT(scd->scd_buf_posting > 0); + scd->scd_buf_posting--; + } + + spin_unlock(&scd->scd_lock); + return 0; +} + +int +srpc_service_add_buffers(struct srpc_service *sv, int nbuffer) +{ + struct srpc_service_cd *scd; + int rc = 0; + int i; + + LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + scd->scd_buf_err = 0; + scd->scd_buf_err_stamp = 0; + scd->scd_buf_posting = 0; + scd->scd_buf_adjust = nbuffer; + /* start to post buffers */ + swi_schedule_workitem(&scd->scd_buf_wi); + spin_unlock(&scd->scd_lock); + + /* framework service only post buffer for one partition */ + if (srpc_serv_is_framework(sv)) + break; + } + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + /* + * NB: srpc_service_add_buffers() can be called inside + * thread context of lst_sched_serial, and we don't normally + * allow to sleep inside thread context of WI scheduler + * because it will block current scheduler thread from doing + * anything else, even worse, it could deadlock if it's + * waiting on result from another WI of the same scheduler. + * However, it's safe at here because scd_buf_wi is scheduled + * by thread in a different WI scheduler (lst_sched_test), + * so we don't have any risk of deadlock, though this could + * block all WIs pending on lst_sched_serial for a moment + * which is not good but not fatal. 
+ */ + lst_wait_until(scd->scd_buf_err != 0 || + (scd->scd_buf_adjust == 0 && + scd->scd_buf_posting == 0), + scd->scd_lock, "waiting for adding buffer\n"); + + if (scd->scd_buf_err != 0 && rc == 0) + rc = scd->scd_buf_err; + + spin_unlock(&scd->scd_lock); + } + + return rc; +} + +void +srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer) +{ + struct srpc_service_cd *scd; + int num; + int i; + + LASSERT(!sv->sv_shuttingdown); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + num = scd->scd_buf_total + scd->scd_buf_posting; + scd->scd_buf_adjust -= min(nbuffer, num); + + spin_unlock(&scd->scd_lock); + } +} + +/* returns 1 if sv has finished, otherwise 0 */ +int +srpc_finish_service(struct srpc_service *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; + + LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */ + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + if (!swi_deschedule_workitem(&scd->scd_buf_wi)) { + spin_unlock(&scd->scd_lock); + return 0; + } + + if (scd->scd_buf_nposted > 0) { + CDEBUG(D_NET, "waiting for %d posted buffers to unlink\n", + scd->scd_buf_nposted); + spin_unlock(&scd->scd_lock); + return 0; + } + + if (list_empty(&scd->scd_rpc_active)) { + spin_unlock(&scd->scd_lock); + continue; + } + + rpc = list_entry(scd->scd_rpc_active.next, + struct srpc_server_rpc, srpc_list); + CNETERR("Active RPC %p on shutdown: sv %s, peer %s, wi %s scheduled %d running %d, ev fired %d type %d status %d lnet %d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), + rpc->srpc_wi.swi_workitem.wi_scheduled, + rpc->srpc_wi.swi_workitem.wi_running, + rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type, + rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet); + spin_unlock(&scd->scd_lock); + return 0; + } + + /* no lock needed from now on */ + srpc_service_fini(sv); + return 1; +} + +/* called with sv->sv_lock held */ +static void +srpc_service_recycle_buffer(struct srpc_service_cd *scd, + struct srpc_buffer *buf) +__must_hold(&scd->scd_lock) +{ + if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) { + if (srpc_service_post_buffer(scd, buf) != 0) { + CWARN("Failed to post %s buffer\n", + scd->scd_svc->sv_name); + } + return; + } + + /* service is shutting down, or we want to recycle some buffers */ + scd->scd_buf_total--; + + if (scd->scd_buf_adjust < 0) { + scd->scd_buf_adjust++; + if (scd->scd_buf_adjust < 0 && + scd->scd_buf_total == 0 && scd->scd_buf_posting == 0) { + CDEBUG(D_INFO, + "Try to recycle %d buffers but nothing left\n", + scd->scd_buf_adjust); + scd->scd_buf_adjust = 0; + } + } + + spin_unlock(&scd->scd_lock); + LIBCFS_FREE(buf, sizeof(*buf)); + spin_lock(&scd->scd_lock); +} + +void +srpc_abort_service(struct srpc_service *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; + + CDEBUG(D_NET, "Aborting service: id %d, name %s\n", + sv->sv_id, sv->sv_name); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + /* schedule in-flight RPCs to notice the abort, NB: + * racing with incoming RPCs; complete fix should make test + * RPCs carry session ID in their headers + */ + list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) { + rpc->srpc_aborted = 1; + swi_schedule_workitem(&rpc->srpc_wi); + } + + spin_unlock(&scd->scd_lock); + } +} + +void +srpc_shutdown_service(struct srpc_service *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + struct srpc_buffer
*buf; + int i; + + CDEBUG(D_NET, "Shutting down service: id %d, name %s\n", + sv->sv_id, sv->sv_name); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) + spin_lock(&scd->scd_lock); + + sv->sv_shuttingdown = 1; /* i.e. no new active RPC */ + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) + spin_unlock(&scd->scd_lock); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + /* schedule in-flight RPCs to notice the shutdown */ + list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) + swi_schedule_workitem(&rpc->srpc_wi); + + spin_unlock(&scd->scd_lock); + + /* OK to traverse scd_buf_posted without lock, since no one + * touches scd_buf_posted now + */ + list_for_each_entry(buf, &scd->scd_buf_posted, buf_list) + LNetMDUnlink(buf->buf_mdh); + } +} + +static int +srpc_send_request(struct srpc_client_rpc *rpc) +{ + struct srpc_event *ev = &rpc->crpc_reqstev; + int rc; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REQUEST_SENT; + + rc = srpc_post_active_rdma(srpc_serv_portal(rpc->crpc_service), + rpc->crpc_service, &rpc->crpc_reqstmsg, + sizeof(struct srpc_msg), LNET_MD_OP_PUT, + rpc->crpc_dest, LNET_NID_ANY, + &rpc->crpc_reqstmdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_prepare_reply(struct srpc_client_rpc *rpc) +{ + struct srpc_event *ev = &rpc->crpc_replyev; + u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid; + int rc; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_RCVD; + + *id = srpc_next_id(); + + rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, + &rpc->crpc_replymsg, + sizeof(struct srpc_msg), + LNET_MD_OP_PUT, rpc->crpc_dest, + &rpc->crpc_replymdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_prepare_bulk(struct srpc_client_rpc *rpc) +{ + struct srpc_bulk *bk = &rpc->crpc_bulk; + struct srpc_event *ev = &rpc->crpc_bulkev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT(bk->bk_niov <= LNET_MAX_IOV); + + /* nothing to do */ + if (bk->bk_niov == 0) + return 0; + + opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_BULK_REQ_RCVD; + + *id = srpc_next_id(); + + rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->crpc_dest, &bk->bk_mdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_do_bulk(struct srpc_server_rpc *rpc) +{ + struct srpc_event *ev = &rpc->srpc_ev; + struct srpc_bulk *bk = rpc->srpc_bulk; + __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT(bk != NULL); + + opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = bk->bk_sink ? 
SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT; + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->srpc_peer, rpc->srpc_self, + &bk->bk_mdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; +} + +/* only called from srpc_handle_rpc */ +static void +srpc_server_rpc_done(struct srpc_server_rpc *rpc, int status) +{ + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + struct srpc_buffer *buffer; + + LASSERT(status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE); + + rpc->srpc_status = status; + + CDEBUG_LIMIT(status == 0 ? D_NET : D_NETERROR, + "Server RPC %p done: service %s, peer %s, status %s:%d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), status); + + if (status != 0) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_dropped++; + spin_unlock(&srpc_data.rpc_glock); + } + + if (rpc->srpc_done != NULL) + (*rpc->srpc_done) (rpc); + LASSERT(rpc->srpc_bulk == NULL); + + spin_lock(&scd->scd_lock); + + if (rpc->srpc_reqstbuf != NULL) { + /* NB might drop sv_lock in srpc_service_recycle_buffer, but + * sv won't go away for scd_rpc_active must not be empty + */ + srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf); + rpc->srpc_reqstbuf = NULL; + } + + list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */ + + /* + * No one can schedule me now since: + * - I'm not on scd_rpc_active. + * - all LNet events have been fired. + * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT(rpc->srpc_ev.ev_fired); + swi_exit_workitem(&rpc->srpc_wi); + + if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) { + buffer = list_entry(scd->scd_buf_blocked.next, + struct srpc_buffer, buf_list); + list_del(&buffer->buf_list); + + srpc_init_server_rpc(rpc, scd, buffer); + list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active); + swi_schedule_workitem(&rpc->srpc_wi); + } else { + list_add(&rpc->srpc_list, &scd->scd_rpc_free); + } + + spin_unlock(&scd->scd_lock); +} + +/* handles an incoming RPC */ +static int srpc_handle_rpc(struct swi_workitem *wi) +{ + struct srpc_server_rpc *rpc = container_of(wi, struct srpc_server_rpc, + srpc_wi); + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + struct srpc_event *ev = &rpc->srpc_ev; + int rc = 0; + + LASSERT(wi == &rpc->srpc_wi); + + spin_lock(&scd->scd_lock); + + if (sv->sv_shuttingdown || rpc->srpc_aborted) { + spin_unlock(&scd->scd_lock); + + if (rpc->srpc_bulk != NULL) + LNetMDUnlink(rpc->srpc_bulk->bk_mdh); + LNetMDUnlink(rpc->srpc_replymdh); + + if (ev->ev_fired) { /* no more event, OK to finish */ + srpc_server_rpc_done(rpc, -ESHUTDOWN); + return 1; + } + return 0; + } + + spin_unlock(&scd->scd_lock); + + switch (wi->swi_state) { + default: + LBUG(); + fallthrough; + case SWI_STATE_NEWBORN: { + struct srpc_msg *msg; + struct srpc_generic_reply *reply; + + msg = &rpc->srpc_reqstbuf->buf_msg; + reply = &rpc->srpc_replymsg.msg_body.reply; + + if (msg->msg_magic == 0) { + /* moaned already in srpc_lnet_ev_handler */ + srpc_server_rpc_done(rpc, EBADMSG); + return 1; + } + + srpc_unpack_msg_hdr(msg); + if (msg->msg_version != SRPC_MSG_VERSION) { + CWARN("Version mismatch: %u, %u expected, from %s\n", + msg->msg_version, SRPC_MSG_VERSION, + libcfs_id2str(rpc->srpc_peer)); + reply->status = EPROTO; + /* drop through and send reply */ + } else { + reply->status = 0; + rc = (*sv->sv_handler)(rpc); + LASSERT(reply->status == 0 || 
!rpc->srpc_bulk); + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_BULK_STARTED; + + if (rpc->srpc_bulk != NULL) { + rc = srpc_do_bulk(rpc); + if (rc == 0) + return 0; /* wait for bulk */ + + LASSERT(ev->ev_fired); + ev->ev_status = rc; + } + } + fallthrough; + case SWI_STATE_BULK_STARTED: + LASSERT(rpc->srpc_bulk == NULL || ev->ev_fired); + + if (rpc->srpc_bulk != NULL) { + rc = ev->ev_status; + + if (sv->sv_bulk_ready != NULL) + rc = (*sv->sv_bulk_ready) (rpc, rc); + + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_REPLY_SUBMITTED; + rc = srpc_send_reply(rpc); + if (rc == 0) + return 0; /* wait for reply */ + srpc_server_rpc_done(rpc, rc); + return 1; + + case SWI_STATE_REPLY_SUBMITTED: + if (!ev->ev_fired) { + CERROR("RPC %p: bulk %p, service %d\n", + rpc, rpc->srpc_bulk, sv->sv_id); + CERROR("Event: status %d, type %d, lnet %d\n", + ev->ev_status, ev->ev_type, ev->ev_lnet); + LASSERT(ev->ev_fired); + } + + wi->swi_state = SWI_STATE_DONE; + srpc_server_rpc_done(rpc, ev->ev_status); + return 1; + } + + return 0; +} + +static void +srpc_client_rpc_expired (void *data) +{ + struct srpc_client_rpc *rpc = data; + + CWARN("Client RPC expired: service %d, peer %s, timeout %d.\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + rpc->crpc_timeout); + + spin_lock(&rpc->crpc_lock); + + rpc->crpc_timeout = 0; + srpc_abort_rpc(rpc, -ETIMEDOUT); + + spin_unlock(&rpc->crpc_lock); + + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_expired++; + spin_unlock(&srpc_data.rpc_glock); +} + +static void +srpc_add_client_rpc_timer(struct srpc_client_rpc *rpc) +{ + struct stt_timer *timer = &rpc->crpc_timer; + + if (rpc->crpc_timeout == 0) + return; + + INIT_LIST_HEAD(&timer->stt_list); + timer->stt_data = rpc; + timer->stt_func = srpc_client_rpc_expired; + timer->stt_expires = ktime_get_real_seconds() + rpc->crpc_timeout; + stt_add_timer(timer); +} + +/* + * Called with rpc->crpc_lock held. + * + * Upon exit the RPC expiry timer is not queued and the handler is not + * running on any CPU. + */ +static void +srpc_del_client_rpc_timer(struct srpc_client_rpc *rpc) +{ + /* timer not planted or already exploded */ + if (rpc->crpc_timeout == 0) + return; + + /* timer successfully defused */ + if (stt_del_timer(&rpc->crpc_timer)) + return; + + /* timer detonated, wait for it to explode */ + while (rpc->crpc_timeout != 0) { + spin_unlock(&rpc->crpc_lock); + + schedule(); + + spin_lock(&rpc->crpc_lock); + } +} + +static void +srpc_client_rpc_done(struct srpc_client_rpc *rpc, int status) +{ + struct swi_workitem *wi = &rpc->crpc_wi; + + LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE); + + spin_lock(&rpc->crpc_lock); + + rpc->crpc_closed = 1; + if (rpc->crpc_status == 0) + rpc->crpc_status = status; + + srpc_del_client_rpc_timer(rpc); + + CDEBUG_LIMIT((status == 0) ? D_NET : D_NETERROR, + "Client RPC done: service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(wi->swi_state), rpc->crpc_aborted, status); + + /* + * No one can schedule me now since: + * - RPC timer has been defused. + * - all LNet events have been fired. + * - crpc_closed has been set, preventing srpc_abort_rpc from + * scheduling me. 
+ * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT(!srpc_event_pending(rpc)); + swi_exit_workitem(wi); + + spin_unlock(&rpc->crpc_lock); + + (*rpc->crpc_done)(rpc); +} + +/* sends an outgoing RPC */ +int +srpc_send_rpc(struct swi_workitem *wi) +{ + int rc = 0; + struct srpc_client_rpc *rpc; + struct srpc_msg *reply; + int do_bulk; + + LASSERT(wi != NULL); + + rpc = container_of(wi, struct srpc_client_rpc, crpc_wi); + + LASSERT(rpc != NULL); + LASSERT(wi == &rpc->crpc_wi); + + reply = &rpc->crpc_replymsg; + do_bulk = rpc->crpc_bulk.bk_niov > 0; + + spin_lock(&rpc->crpc_lock); + + if (rpc->crpc_aborted) { + spin_unlock(&rpc->crpc_lock); + goto abort; + } + + spin_unlock(&rpc->crpc_lock); + + switch (wi->swi_state) { + default: + LBUG(); + case SWI_STATE_NEWBORN: + LASSERT(!srpc_event_pending(rpc)); + + rc = srpc_prepare_reply(rpc); + if (rc != 0) { + srpc_client_rpc_done(rpc, rc); + return 1; + } + + rc = srpc_prepare_bulk(rpc); + if (rc != 0) + break; + + wi->swi_state = SWI_STATE_REQUEST_SUBMITTED; + rc = srpc_send_request(rpc); + break; + + case SWI_STATE_REQUEST_SUBMITTED: + /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any + * order; however, they're processed in a strict order: + * rqt, rpy, and bulk. + */ + if (!rpc->crpc_reqstev.ev_fired) + break; + + rc = rpc->crpc_reqstev.ev_status; + if (rc != 0) + break; + + wi->swi_state = SWI_STATE_REQUEST_SENT; + fallthrough; + case SWI_STATE_REQUEST_SENT: { + enum srpc_msg_type type; + + type = srpc_service2reply(rpc->crpc_service); + + if (!rpc->crpc_replyev.ev_fired) + break; + + rc = rpc->crpc_replyev.ev_status; + if (rc != 0) + break; + + srpc_unpack_msg_hdr(reply); + if (reply->msg_type != type || + (reply->msg_magic != SRPC_MSG_MAGIC && + reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CWARN("Bad message from %s: type %u (%d expected), magic %u (%d expected).\n", + libcfs_id2str(rpc->crpc_dest), + reply->msg_type, type, + reply->msg_magic, SRPC_MSG_MAGIC); + rc = -EBADMSG; + break; + } + + if (do_bulk && reply->msg_body.reply.status != 0) { + CWARN("Remote error %d at %s, unlink bulk buffer in case peer didn't initiate bulk transfer\n", + reply->msg_body.reply.status, + libcfs_id2str(rpc->crpc_dest)); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + } + + wi->swi_state = SWI_STATE_REPLY_RECEIVED; + } + fallthrough; + case SWI_STATE_REPLY_RECEIVED: + if (do_bulk && !rpc->crpc_bulkev.ev_fired) + break; + + rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0; + + /* Bulk buffer was unlinked due to remote error. Clear error + * since reply buffer still contains valid data. + * NB rpc->crpc_done shouldn't look into bulk data in case of + * remote error. 
+ */ + if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK && + rpc->crpc_status == 0 && reply->msg_body.reply.status != 0) + rc = 0; + + wi->swi_state = SWI_STATE_DONE; + srpc_client_rpc_done(rpc, rc); + return 1; + } + + if (rc != 0) { + spin_lock(&rpc->crpc_lock); + srpc_abort_rpc(rpc, rc); + spin_unlock(&rpc->crpc_lock); + } + +abort: + if (rpc->crpc_aborted) { + LNetMDUnlink(rpc->crpc_reqstmdh); + LNetMDUnlink(rpc->crpc_replymdh); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + + if (!srpc_event_pending(rpc)) { + srpc_client_rpc_done(rpc, -EINTR); + return 1; + } + } + return 0; +} + +struct srpc_client_rpc * +srpc_create_client_rpc(struct lnet_process_id peer, int service, + int nbulkiov, int bulklen, + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv) +{ + struct srpc_client_rpc *rpc; + + LIBCFS_ALLOC(rpc, offsetof(struct srpc_client_rpc, + crpc_bulk.bk_iovs[nbulkiov])); + if (rpc == NULL) + return NULL; + + srpc_init_client_rpc(rpc, peer, service, nbulkiov, + bulklen, rpc_done, rpc_fini, priv); + return rpc; +} + +/* called with rpc->crpc_lock held */ +void +srpc_abort_rpc(struct srpc_client_rpc *rpc, int why) +{ + LASSERT(why != 0); + + if (rpc->crpc_aborted || /* already aborted */ + rpc->crpc_closed) /* callback imminent */ + return; + + CDEBUG(D_NET, + "Aborting RPC: service %d, peer %s, state %s, why %d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), why); + + rpc->crpc_aborted = 1; + rpc->crpc_status = why; + swi_schedule_workitem(&rpc->crpc_wi); +} + +/* called with rpc->crpc_lock held */ +void +srpc_post_rpc(struct srpc_client_rpc *rpc) +{ + LASSERT(!rpc->crpc_aborted); + LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); + + CDEBUG(D_NET, "Posting RPC: peer %s, service %d, timeout %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_service, + rpc->crpc_timeout); + + srpc_add_client_rpc_timer(rpc); + swi_schedule_workitem(&rpc->crpc_wi); +} + + +int +srpc_send_reply(struct srpc_server_rpc *rpc) +{ + struct srpc_event *ev = &rpc->srpc_ev; + struct srpc_msg *msg = &rpc->srpc_replymsg; + struct srpc_buffer *buffer = rpc->srpc_reqstbuf; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + __u64 rpyid; + int rc; + + LASSERT(buffer != NULL); + rpyid = buffer->buf_msg.msg_body.reqst.rpyid; + + spin_lock(&scd->scd_lock); + + if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) { + /* Repost buffer before replying since test client + * might send me another RPC once it gets the reply + */ + if (srpc_service_post_buffer(scd, buffer) != 0) + CWARN("Failed to repost %s buffer\n", sv->sv_name); + rpc->srpc_reqstbuf = NULL; + } + + spin_unlock(&scd->scd_lock); + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_SENT; + + msg->msg_magic = SRPC_MSG_MAGIC; + msg->msg_version = SRPC_MSG_VERSION; + msg->msg_type = srpc_service2reply(sv->sv_id); + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg, + sizeof(*msg), LNET_MD_OP_PUT, + rpc->srpc_peer, rpc->srpc_self, + &rpc->srpc_replymdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; +} + +/* when in kernel always called with LNET_LOCK() held, and in thread context */ +static void +srpc_lnet_ev_handler(struct lnet_event *ev) +{ + struct srpc_service_cd *scd; + struct srpc_event *rpcev = ev->md_user_ptr; + struct srpc_client_rpc *crpc; + struct srpc_server_rpc *srpc; + struct srpc_buffer *buffer; + struct srpc_service *sv; + struct srpc_msg *msg; 
+ enum srpc_msg_type type; + + LASSERT(!in_interrupt()); + + if (ev->status != 0) { + __u32 errors; + + spin_lock(&srpc_data.rpc_glock); + if (ev->status != -ECANCELED) /* cancellation is not error */ + srpc_data.rpc_counters.errors++; + errors = srpc_data.rpc_counters.errors; + spin_unlock(&srpc_data.rpc_glock); + + CNETERR("LNet event status %d type %d, RPC errors %u\n", + ev->status, ev->type, errors); + } + + rpcev->ev_lnet = ev->type; + + switch (rpcev->ev_type) { + default: + CERROR("Unknown event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG(); + fallthrough; + case SRPC_REQUEST_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_sent++; + spin_unlock(&srpc_data.rpc_glock); + } + fallthrough; + case SRPC_REPLY_RCVD: + case SRPC_BULK_REQ_RCVD: + crpc = rpcev->ev_data; + + if (rpcev != &crpc->crpc_reqstev && + rpcev != &crpc->crpc_replyev && + rpcev != &crpc->crpc_bulkev) { + CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n", + rpcev, crpc, &crpc->crpc_reqstev, + &crpc->crpc_replyev, &crpc->crpc_bulkev); + CERROR("Bad event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, + rpcev->ev_lnet); + LBUG(); + } + + spin_lock(&crpc->crpc_lock); + + LASSERT(rpcev->ev_fired == 0); + rpcev->ev_fired = 1; + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + -EINTR : ev->status; + swi_schedule_workitem(&crpc->crpc_wi); + + spin_unlock(&crpc->crpc_lock); + break; + + case SRPC_REQUEST_RCVD: + scd = rpcev->ev_data; + sv = scd->scd_svc; + + LASSERT(rpcev == &scd->scd_ev); + + spin_lock(&scd->scd_lock); + + LASSERT(ev->unlinked); + LASSERT(ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->type != LNET_EVENT_UNLINK || + sv->sv_shuttingdown); + + buffer = container_of(ev->md_start, struct srpc_buffer, + buf_msg); + buffer->buf_peer = lnet_pid_to_pid4(&ev->source); + buffer->buf_self = lnet_nid_to_nid4(&ev->target.nid); + + LASSERT(scd->scd_buf_nposted > 0); + scd->scd_buf_nposted--; + + if (sv->sv_shuttingdown) { + /* Leave buffer on scd->scd_buf_nposted since + * srpc_finish_service needs to traverse it. + */ + spin_unlock(&scd->scd_lock); + break; + } + + if (scd->scd_buf_err_stamp != 0 && + scd->scd_buf_err_stamp < ktime_get_real_seconds()) { + /* re-enable adding buffer */ + scd->scd_buf_err_stamp = 0; + scd->scd_buf_err = 0; + } + + if (scd->scd_buf_err == 0 && /* adding buffer is enabled */ + scd->scd_buf_adjust == 0 && + scd->scd_buf_nposted < scd->scd_buf_low) { + scd->scd_buf_adjust = max(scd->scd_buf_total / 2, + SFW_TEST_WI_MIN); + swi_schedule_workitem(&scd->scd_buf_wi); + } + + list_del(&buffer->buf_list); /* from scd->scd_buf_posted */ + msg = &buffer->buf_msg; + type = srpc_service2request(sv->sv_id); + + if (ev->status != 0 || ev->mlength != sizeof(*msg) || + (msg->msg_type != type && + msg->msg_type != __swab32(type)) || + (msg->msg_magic != SRPC_MSG_MAGIC && + msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CERROR("Dropping RPC (%s) from %s: status %d mlength %d type %u magic %u.\n", + sv->sv_name, libcfs_idstr(&ev->initiator), + ev->status, ev->mlength, + msg->msg_type, msg->msg_magic); + + /* NB can't call srpc_service_recycle_buffer here since + * it may call LNetM[DE]Attach. 
The invalid magic tells + * srpc_handle_rpc to drop this RPC + */ + msg->msg_magic = 0; + } + + if (!list_empty(&scd->scd_rpc_free)) { + srpc = list_entry(scd->scd_rpc_free.next, + struct srpc_server_rpc, + srpc_list); + list_del(&srpc->srpc_list); + + srpc_init_server_rpc(srpc, scd, buffer); + list_add_tail(&srpc->srpc_list, + &scd->scd_rpc_active); + swi_schedule_workitem(&srpc->srpc_wi); + } else { + list_add_tail(&buffer->buf_list, + &scd->scd_buf_blocked); + } + + spin_unlock(&scd->scd_lock); + + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_rcvd++; + spin_unlock(&srpc_data.rpc_glock); + break; + + case SRPC_BULK_GET_RPLD: + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_REPLY || + ev->type == LNET_EVENT_UNLINK); + + if (!ev->unlinked) + break; /* wait for final event */ + fallthrough; + case SRPC_BULK_PUT_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + spin_lock(&srpc_data.rpc_glock); + + if (rpcev->ev_type == SRPC_BULK_GET_RPLD) + srpc_data.rpc_counters.bulk_get += ev->mlength; + else + srpc_data.rpc_counters.bulk_put += ev->mlength; + + spin_unlock(&srpc_data.rpc_glock); + } + fallthrough; + case SRPC_REPLY_SENT: + srpc = rpcev->ev_data; + scd = srpc->srpc_scd; + + LASSERT(rpcev == &srpc->srpc_ev); + + spin_lock(&scd->scd_lock); + + rpcev->ev_fired = 1; + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + -EINTR : ev->status; + swi_schedule_workitem(&srpc->srpc_wi); + + spin_unlock(&scd->scd_lock); + break; + } +} + + +int +srpc_startup (void) +{ + int rc; + + memset(&srpc_data, 0, sizeof(struct smoketest_rpc)); + spin_lock_init(&srpc_data.rpc_glock); + + /* 1 second pause to avoid timestamp reuse */ + schedule_timeout_uninterruptible(cfs_time_seconds(1)); + srpc_data.rpc_matchbits = ((__u64) ktime_get_real_seconds()) << 48; + + srpc_data.rpc_state = SRPC_STATE_NONE; + + rc = LNetNIInit(LNET_PID_LUSTRE); + if (rc < 0) { + CERROR("LNetNIInit() has failed: %d\n", rc); + return rc; + } + + srpc_data.rpc_state = SRPC_STATE_NI_INIT; + + srpc_data.rpc_lnet_handler = srpc_lnet_ev_handler; + + rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + LASSERT(rc == 0); + rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL); + LASSERT(rc == 0); + + srpc_data.rpc_state = SRPC_STATE_EQ_INIT; + + rc = stt_startup(); + + if (rc != 0) + srpc_shutdown(); + else + srpc_data.rpc_state = SRPC_STATE_RUNNING; + + return rc; +} + +void +srpc_shutdown (void) +{ + int i; + int rc; + int state; + + state = srpc_data.rpc_state; + srpc_data.rpc_state = SRPC_STATE_STOPPING; + + switch (state) { + default: + LBUG(); + fallthrough; + case SRPC_STATE_RUNNING: + spin_lock(&srpc_data.rpc_glock); + + for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) { + struct srpc_service *sv = srpc_data.rpc_services[i]; + + LASSERTF(sv == NULL, + "service not empty: id %d, name %s\n", + i, sv->sv_name); + } + + spin_unlock(&srpc_data.rpc_glock); + + stt_shutdown(); + fallthrough; + + case SRPC_STATE_EQ_INIT: + rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL); + LASSERT(rc == 0); + lnet_assert_handler_unused(srpc_data.rpc_lnet_handler); + fallthrough; + + case SRPC_STATE_NI_INIT: + LNetNIFini(); + } +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.h b/drivers/staging/lustrefsx/lnet/selftest/rpc.h new file mode 100644 index 0000000000000..7b0b786cce324 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.h @@ -0,0 +1,296 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __SELFTEST_RPC_H__ +#define __SELFTEST_RPC_H__ + +#include + +/* + * LST wired structures + * + * XXX: *REPLY == *REQST + 1 + */ +enum srpc_msg_type { + SRPC_MSG_MKSN_REQST = 0, + SRPC_MSG_MKSN_REPLY = 1, + SRPC_MSG_RMSN_REQST = 2, + SRPC_MSG_RMSN_REPLY = 3, + SRPC_MSG_BATCH_REQST = 4, + SRPC_MSG_BATCH_REPLY = 5, + SRPC_MSG_STAT_REQST = 6, + SRPC_MSG_STAT_REPLY = 7, + SRPC_MSG_TEST_REQST = 8, + SRPC_MSG_TEST_REPLY = 9, + SRPC_MSG_DEBUG_REQST = 10, + SRPC_MSG_DEBUG_REPLY = 11, + SRPC_MSG_BRW_REQST = 12, + SRPC_MSG_BRW_REPLY = 13, + SRPC_MSG_PING_REQST = 14, + SRPC_MSG_PING_REPLY = 15, + SRPC_MSG_JOIN_REQST = 16, + SRPC_MSG_JOIN_REPLY = 17, +}; + +/* CAVEAT EMPTOR: + * All struct srpc_*_reqst's 1st field must be matchbits of reply buffer, + * and 2nd field matchbits of bulk buffer if any. + * + * All struct srpc_*_reply's 1st field must be a __u32 status, and 2nd field + * session id if needed. 
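+ *
+ * For example, struct srpc_brw_reqst below starts with brw_rpyid (reply
+ * buffer matchbits) followed by brw_bulkid (bulk buffer matchbits), and
+ * struct srpc_brw_reply starts with a __u32 brw_status.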
+ */ +struct srpc_generic_reqst { + __u64 rpyid; /* reply buffer matchbits */ + __u64 bulkid; /* bulk buffer matchbits */ +} __packed; + +struct srpc_generic_reply { + __u32 status; + struct lst_sid sid; +} __packed; + +/* FRAMEWORK RPCs */ +struct srpc_mksn_reqst { + __u64 mksn_rpyid; /* reply buffer matchbits */ + struct lst_sid mksn_sid; /* session id */ + __u32 mksn_force; /* use brute force */ + char mksn_name[LST_NAME_SIZE]; +} __packed; /* make session request */ + +struct srpc_mksn_reply { + __u32 mksn_status; /* session status */ + struct lst_sid mksn_sid; /* session id */ + __u32 mksn_timeout; /* session timeout */ + char mksn_name[LST_NAME_SIZE]; +} __packed; /* make session reply */ + +struct srpc_rmsn_reqst { + __u64 rmsn_rpyid; /* reply buffer matchbits */ + struct lst_sid rmsn_sid; /* session id */ +} __packed; /* remove session request */ + +struct srpc_rmsn_reply { + __u32 rmsn_status; + struct lst_sid rmsn_sid; /* session id */ +} __packed; /* remove session reply */ + +struct srpc_join_reqst { + __u64 join_rpyid; /* reply buffer matchbits */ + struct lst_sid join_sid; /* session id to join */ + char join_group[LST_NAME_SIZE]; /* group name */ +} __packed; + +struct srpc_join_reply { + __u32 join_status; /* returned status */ + struct lst_sid join_sid; /* session id */ + __u32 join_timeout; /* # seconds' inactivity to expire */ + char join_session[LST_NAME_SIZE]; /* session name */ +} __packed; + +struct srpc_debug_reqst { + __u64 dbg_rpyid; /* reply buffer matchbits */ + struct lst_sid dbg_sid; /* session id */ + __u32 dbg_flags; /* bitmap of debug */ +} __packed; + +struct srpc_debug_reply { + __u32 dbg_status; /* returned code */ + struct lst_sid dbg_sid; /* session id */ + __u32 dbg_timeout; /* session timeout */ + __u32 dbg_nbatch; /* # of batches in the node */ + char dbg_name[LST_NAME_SIZE]; /* session name */ +} __packed; + +#define SRPC_BATCH_OPC_RUN 1 +#define SRPC_BATCH_OPC_STOP 2 +#define SRPC_BATCH_OPC_QUERY 3 + +struct srpc_batch_reqst { + __u64 bar_rpyid; /* reply buffer matchbits */ + struct lst_sid bar_sid; /* session id */ + struct lst_bid bar_bid; /* batch id */ + __u32 bar_opc; /* create/start/stop batch */ + __u32 bar_testidx; /* index of test */ + __u32 bar_arg; /* parameters */ +} __packed; + +struct srpc_batch_reply { + __u32 bar_status; /* status of request */ + struct lst_sid bar_sid; /* session id */ + __u32 bar_active; /* # of active tests in batch/test */ + __u32 bar_time; /* remained time */ +} __packed; + +struct srpc_stat_reqst { + __u64 str_rpyid; /* reply buffer matchbits */ + struct lst_sid str_sid; /* session id */ + __u32 str_type; /* type of stat */ +} __packed; + +struct srpc_stat_reply { + __u32 str_status; + struct lst_sid str_sid; + struct sfw_counters str_fw; + struct srpc_counters str_rpc; + struct lnet_counters_common str_lnet; +} __packed; + +struct test_bulk_req { + __u32 blk_opc; /* bulk operation code */ + __u32 blk_npg; /* # of pages */ + __u32 blk_flags; /* reserved flags */ +} __packed; + +struct test_bulk_req_v1 { + /** bulk operation code */ + __u16 blk_opc; + /** data check flags */ + __u16 blk_flags; + /** data length */ + __u32 blk_len; + /** bulk offset */ + __u32 blk_offset; +} __packed; + +struct test_ping_req { + __u32 png_size; /* size of ping message */ + __u32 png_flags; /* reserved flags */ +} __packed; + +struct srpc_test_reqst { + __u64 tsr_rpyid; /* reply buffer matchbits */ + __u64 tsr_bulkid; /* bulk buffer matchbits */ + struct lst_sid tsr_sid; /* session id */ + struct lst_bid tsr_bid; /* batch id */ 
+ __u32 tsr_service; /* test type: bulk|ping|... */ + /* test client loop count or # server buffers needed */ + __u32 tsr_loop; + __u32 tsr_concur; /* concurrency of test */ + __u8 tsr_is_client; /* is test client or not */ + __u8 tsr_stop_onerr; /* stop on error */ + __u32 tsr_ndest; /* # of dest nodes */ + + union { + struct test_ping_req ping; + struct test_bulk_req bulk_v0; + struct test_bulk_req_v1 bulk_v1; + } tsr_u; +} __packed; + +struct srpc_test_reply { + __u32 tsr_status; /* returned code */ + struct lst_sid tsr_sid; +} __packed; + +/* TEST RPCs */ +struct srpc_ping_reqst { + __u64 pnr_rpyid; + __u32 pnr_magic; + __u32 pnr_seq; + __u64 pnr_time_sec; + __u64 pnr_time_nsec; +} __packed; + +struct srpc_ping_reply { + __u32 pnr_status; + __u32 pnr_magic; + __u32 pnr_seq; +} __packed; + +struct srpc_brw_reqst { + __u64 brw_rpyid; /* reply buffer matchbits */ + __u64 brw_bulkid; /* bulk buffer matchbits */ + __u32 brw_rw; /* read or write */ + __u32 brw_len; /* bulk data len */ + __u32 brw_flags; /* bulk data patterns */ +} __packed; /* bulk r/w request */ + +struct srpc_brw_reply { + __u32 brw_status; +} __packed; /* bulk r/w reply */ + +#define SRPC_MSG_MAGIC 0xeeb0f00d +#define SRPC_MSG_VERSION 1 + +struct srpc_msg { + /** magic number */ + __u32 msg_magic; + /** message version number */ + __u32 msg_version; + /** type of message body: enum srpc_msg_type */ + __u32 msg_type; + __u32 msg_reserved0; + __u32 msg_reserved1; + /** test session features */ + __u32 msg_ses_feats; + union { + struct srpc_generic_reqst reqst; + struct srpc_generic_reply reply; + + struct srpc_mksn_reqst mksn_reqst; + struct srpc_mksn_reply mksn_reply; + struct srpc_rmsn_reqst rmsn_reqst; + struct srpc_rmsn_reply rmsn_reply; + struct srpc_debug_reqst dbg_reqst; + struct srpc_debug_reply dbg_reply; + struct srpc_batch_reqst bat_reqst; + struct srpc_batch_reply bat_reply; + struct srpc_stat_reqst stat_reqst; + struct srpc_stat_reply stat_reply; + struct srpc_test_reqst tes_reqst; + struct srpc_test_reply tes_reply; + struct srpc_join_reqst join_reqst; + struct srpc_join_reply join_reply; + + struct srpc_ping_reqst ping_reqst; + struct srpc_ping_reply ping_reply; + struct srpc_brw_reqst brw_reqst; + struct srpc_brw_reply brw_reply; + } msg_body; +} __packed; + +static inline void +srpc_unpack_msg_hdr(struct srpc_msg *msg) +{ + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + /* We do not swap the magic number here as it is needed to + determine whether the body needs to be swapped. */ + /* __swab32s(&msg->msg_magic); */ + __swab32s(&msg->msg_type); + __swab32s(&msg->msg_version); + __swab32s(&msg->msg_ses_feats); + __swab32s(&msg->msg_reserved0); + __swab32s(&msg->msg_reserved1); +} + +#endif /* __SELFTEST_RPC_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/selftest/selftest.h b/drivers/staging/lustrefsx/lnet/selftest/selftest.h new file mode 100644 index 0000000000000..27126be9cb086 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/selftest.h @@ -0,0 +1,613 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/selftest.h + * + * Author: Isaac Huang + */ +#ifndef __SELFTEST_SELFTEST_H__ +#define __SELFTEST_SELFTEST_H__ + +#define LNET_ONLY + +#include +#include +#include +#include +#include + +#include "rpc.h" +#include "timer.h" + +#ifndef MADE_WITHOUT_COMPROMISE +#define MADE_WITHOUT_COMPROMISE +#endif + + +#define SWI_STATE_NEWBORN 0 +#define SWI_STATE_REPLY_SUBMITTED 1 +#define SWI_STATE_REPLY_SENT 2 +#define SWI_STATE_REQUEST_SUBMITTED 3 +#define SWI_STATE_REQUEST_SENT 4 +#define SWI_STATE_REPLY_RECEIVED 5 +#define SWI_STATE_BULK_STARTED 6 +#define SWI_STATE_DONE 10 + +/* forward refs */ +struct srpc_service; +struct srpc_service_cd; +struct sfw_test_unit; +struct sfw_test_instance; + +/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework + * services, e.g. create/modify session. + */ +#define SRPC_SERVICE_DEBUG 0 +#define SRPC_SERVICE_MAKE_SESSION 1 +#define SRPC_SERVICE_REMOVE_SESSION 2 +#define SRPC_SERVICE_BATCH 3 +#define SRPC_SERVICE_TEST 4 +#define SRPC_SERVICE_QUERY_STAT 5 +#define SRPC_SERVICE_JOIN 6 +#define SRPC_FRAMEWORK_SERVICE_MAX_ID 10 +/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */ +#define SRPC_SERVICE_BRW 11 +#define SRPC_SERVICE_PING 12 +#define SRPC_SERVICE_MAX_ID 12 + +#define SRPC_REQUEST_PORTAL 50 +/* a lazy portal for framework RPC requests */ +#define SRPC_FRAMEWORK_REQUEST_PORTAL 51 +/* all reply/bulk RDMAs go to this portal */ +#define SRPC_RDMA_PORTAL 52 + +static inline enum srpc_msg_type +srpc_service2request (int service) +{ + switch (service) { + default: + LBUG (); + case SRPC_SERVICE_DEBUG: + return SRPC_MSG_DEBUG_REQST; + + case SRPC_SERVICE_MAKE_SESSION: + return SRPC_MSG_MKSN_REQST; + + case SRPC_SERVICE_REMOVE_SESSION: + return SRPC_MSG_RMSN_REQST; + + case SRPC_SERVICE_BATCH: + return SRPC_MSG_BATCH_REQST; + + case SRPC_SERVICE_TEST: + return SRPC_MSG_TEST_REQST; + + case SRPC_SERVICE_QUERY_STAT: + return SRPC_MSG_STAT_REQST; + + case SRPC_SERVICE_BRW: + return SRPC_MSG_BRW_REQST; + + case SRPC_SERVICE_PING: + return SRPC_MSG_PING_REQST; + + case SRPC_SERVICE_JOIN: + return SRPC_MSG_JOIN_REQST; + } +} + +static inline enum srpc_msg_type +srpc_service2reply (int service) +{ + return srpc_service2request(service) + 1; +} + +enum srpc_event_type { + SRPC_BULK_REQ_RCVD = 1, /* passive bulk request(PUT sink/GET source) received */ + SRPC_BULK_PUT_SENT = 2, /* active bulk PUT sent (source) */ + SRPC_BULK_GET_RPLD = 3, /* active bulk GET replied (sink) */ + SRPC_REPLY_RCVD = 4, /* incoming reply received */ + SRPC_REPLY_SENT = 5, /* outgoing reply sent */ + SRPC_REQUEST_RCVD = 6, /* incoming request received */ + SRPC_REQUEST_SENT = 7, /* outgoing request sent */ +}; + +/* RPC event */ +struct srpc_event { + enum srpc_event_type ev_type; /* what's up */ + enum lnet_event_kind ev_lnet; /* LNet event type */ + int ev_fired; /* LNet event fired? 
*/ + int ev_status; /* LNet event status */ + void *ev_data; /* owning server/client RPC */ +}; + +/* bulk descriptor */ +struct srpc_bulk { + int bk_len; /* len of bulk data */ + struct lnet_handle_md bk_mdh; + int bk_sink; /* sink/source */ + int bk_niov; /* # iov in bk_iovs */ + struct bio_vec bk_iovs[0]; +}; + +/* message buffer descriptor */ +struct srpc_buffer { + struct list_head buf_list; /* chain on srpc_service::*_msgq */ + struct srpc_msg buf_msg; + struct lnet_handle_md buf_mdh; + lnet_nid_t buf_self; + struct lnet_process_id buf_peer; +}; + +struct swi_workitem; +typedef int (*swi_action_t)(struct swi_workitem *); + +struct swi_workitem { + struct cfs_wi_sched *swi_sched; + struct cfs_workitem swi_workitem; + swi_action_t swi_action; + int swi_state; +}; + +/* server-side state of a RPC */ +struct srpc_server_rpc { + /* chain on srpc_service::*_rpcq */ + struct list_head srpc_list; + struct srpc_service_cd *srpc_scd; + struct swi_workitem srpc_wi; + struct srpc_event srpc_ev; /* bulk/reply event */ + lnet_nid_t srpc_self; + struct lnet_process_id srpc_peer; + struct srpc_msg srpc_replymsg; + struct lnet_handle_md srpc_replymdh; + struct srpc_buffer *srpc_reqstbuf; + struct srpc_bulk *srpc_bulk; + + unsigned int srpc_aborted; /* being given up */ + int srpc_status; + void (*srpc_done)(struct srpc_server_rpc *); +}; + +/* client-side state of a RPC */ +struct srpc_client_rpc { + struct list_head crpc_list; /* chain on user's lists */ + spinlock_t crpc_lock; /* serialize */ + int crpc_service; + atomic_t crpc_refcount; + /* # seconds to wait for reply */ + int crpc_timeout; + struct stt_timer crpc_timer; + struct swi_workitem crpc_wi; + struct lnet_process_id crpc_dest; + + void (*crpc_done)(struct srpc_client_rpc *); + void (*crpc_fini)(struct srpc_client_rpc *); + int crpc_status; /* completion status */ + void *crpc_priv; /* caller data */ + + /* state flags */ + unsigned int crpc_aborted:1; /* being given up */ + unsigned int crpc_closed:1; /* completed */ + + /* RPC events */ + struct srpc_event crpc_bulkev; /* bulk event */ + struct srpc_event crpc_reqstev; /* request event */ + struct srpc_event crpc_replyev; /* reply event */ + + /* bulk, request(reqst), and reply exchanged on wire */ + struct srpc_msg crpc_reqstmsg; + struct srpc_msg crpc_replymsg; + struct lnet_handle_md crpc_reqstmdh; + struct lnet_handle_md crpc_replymdh; + struct srpc_bulk crpc_bulk; +}; + +#define srpc_client_rpc_size(rpc) \ +offsetof(struct srpc_client_rpc, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) + +#define srpc_client_rpc_addref(rpc) \ +do { \ + CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n", \ + (rpc), libcfs_id2str((rpc)->crpc_dest), \ + atomic_read(&(rpc)->crpc_refcount)); \ + LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ + atomic_inc(&(rpc)->crpc_refcount); \ +} while (0) + +#define srpc_client_rpc_decref(rpc) \ +do { \ + CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n", \ + (rpc), libcfs_id2str((rpc)->crpc_dest), \ + atomic_read(&(rpc)->crpc_refcount)); \ + LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ + if (atomic_dec_and_test(&(rpc)->crpc_refcount)) \ + srpc_destroy_client_rpc(rpc); \ +} while (0) + +#define srpc_event_pending(rpc) ((rpc)->crpc_bulkev.ev_fired == 0 || \ + (rpc)->crpc_reqstev.ev_fired == 0 || \ + (rpc)->crpc_replyev.ev_fired == 0) + +/* CPU partition data of srpc service */ +struct srpc_service_cd { + /** serialize */ + spinlock_t scd_lock; + /** backref to service */ + struct srpc_service *scd_svc; + /** event buffer */ + struct srpc_event scd_ev; + /** free RPC descriptors */ 
+ struct list_head scd_rpc_free; + /** in-flight RPCs */ + struct list_head scd_rpc_active; + /** workitem for posting buffer */ + struct swi_workitem scd_buf_wi; + /** CPT id */ + int scd_cpt; + /** error code for scd_buf_wi */ + int scd_buf_err; + /** timestamp for scd_buf_err */ + time64_t scd_buf_err_stamp; + /** total # request buffers */ + int scd_buf_total; + /** # posted request buffers */ + int scd_buf_nposted; + /** in progress of buffer posting */ + int scd_buf_posting; + /** allocate more buffers if scd_buf_nposted < scd_buf_low */ + int scd_buf_low; + /** increase/decrease some buffers */ + int scd_buf_adjust; + /** posted message buffers */ + struct list_head scd_buf_posted; + /** blocked for RPC descriptor */ + struct list_head scd_buf_blocked; +}; + +/* number of server workitems (mini-thread) for testing service */ +#define SFW_TEST_WI_MIN 256 +#define SFW_TEST_WI_MAX 2048 +/* extra buffers for tolerating buggy peers, or unbalanced number + * of peers between partitions */ +#define SFW_TEST_WI_EXTRA 64 + +/* number of server workitems (mini-thread) for framework service */ +#define SFW_FRWK_WI_MIN 16 +#define SFW_FRWK_WI_MAX 256 + +struct srpc_service { + int sv_id; /* service id */ + const char *sv_name; /* human readable name */ + int sv_wi_total; /* total server workitems */ + int sv_shuttingdown; + int sv_ncpts; + /* percpt data for srpc_service */ + struct srpc_service_cd **sv_cpt_data; + /* Service callbacks: + * - sv_handler: process incoming RPC request + * - sv_bulk_ready: notify bulk data + */ + int (*sv_handler)(struct srpc_server_rpc *); + int (*sv_bulk_ready)(struct srpc_server_rpc *, int); +}; + +struct sfw_session { + /* chain on fw_zombie_sessions */ + struct list_head sn_list; + struct lst_sid sn_id; /* unique identifier */ + /* # seconds' inactivity to expire */ + unsigned int sn_timeout; + int sn_timer_active; + unsigned int sn_features; + struct stt_timer sn_timer; + struct list_head sn_batches; /* list of batches */ + char sn_name[LST_NAME_SIZE]; + atomic_t sn_refcount; + atomic_t sn_brw_errors; + atomic_t sn_ping_errors; + ktime_t sn_started; +}; + +#define sfw_sid_equal(sid0, sid1) ((sid0).ses_nid == (sid1).ses_nid && \ + (sid0).ses_stamp == (sid1).ses_stamp) + +struct sfw_batch { + struct list_head bat_list; /* chain on sn_batches */ + struct lst_bid bat_id; /* batch id */ + int bat_error; /* error code of batch */ + struct sfw_session *bat_session; /* batch's session */ + atomic_t bat_nactive; /* # of active tests */ + struct list_head bat_tests; /* test instances */ +}; + +struct sfw_test_client_ops { + int (*tso_init)(struct sfw_test_instance *tsi); /* intailize test client */ + void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */ + int (*tso_prep_rpc)(struct sfw_test_unit *tsu, + struct lnet_process_id dest, + struct srpc_client_rpc **rpc); /* prep a tests rpc */ + void (*tso_done_rpc)(struct sfw_test_unit *tsu, + struct srpc_client_rpc *rpc); /* done a test rpc */ +}; + +struct sfw_test_instance { + struct list_head tsi_list; /* chain on batch */ + int tsi_service; /* test type */ + struct sfw_batch *tsi_batch; /* batch */ + struct sfw_test_client_ops *tsi_ops; /* test client operations */ + + /* public parameter for all test units */ + unsigned int tsi_is_client:1; /* is test client */ + unsigned int tsi_stoptsu_onerr:1; /* stop tsu on error */ + int tsi_concur; /* concurrency */ + int tsi_loop; /* loop count */ + + /* status of test instance */ + spinlock_t tsi_lock; /* serialize */ + unsigned int tsi_stopping:1; /* 
test is stopping */ + atomic_t tsi_nactive; /* # of active test unit */ + struct list_head tsi_units; /* test units */ + struct list_head tsi_free_rpcs; /* free rpcs */ + struct list_head tsi_active_rpcs;/* active rpcs */ + + union { + struct test_ping_req ping; /* ping parameter */ + struct test_bulk_req bulk_v0; /* bulk parameter */ + struct test_bulk_req_v1 bulk_v1; /* bulk v1 parameter */ + } tsi_u; +}; + +/* XXX: trailing (PAGE_SIZE % sizeof(struct lnet_process_id)) bytes at + * the end of pages are not used */ +#define SFW_MAX_CONCUR LST_MAX_CONCUR +#define SFW_ID_PER_PAGE (PAGE_SIZE / sizeof(struct lnet_process_id_packed)) +#define SFW_MAX_NDESTS (LNET_MAX_IOV * SFW_ID_PER_PAGE) +#define sfw_id_pages(n) (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE) + +struct sfw_test_unit { + struct list_head tsu_list; /* chain on lst_test_instance */ + struct lnet_process_id tsu_dest; /* id of dest node */ + int tsu_loop; /* loop count of the test */ + struct sfw_test_instance *tsu_instance; /* pointer to test instance */ + void *tsu_private; /* private data */ + struct swi_workitem tsu_worker; /* workitem of the test unit */ +}; + +struct sfw_test_case { + struct list_head tsc_list; /* chain on fw_tests */ + struct srpc_service *tsc_srv_service; /* test service */ + struct sfw_test_client_ops *tsc_cli_ops; /* ops of test client */ +}; + +struct srpc_client_rpc * +sfw_create_rpc(struct lnet_process_id peer, int service, + unsigned features, int nbulkiov, int bulklen, + void (*done)(struct srpc_client_rpc *), void *priv); +int sfw_create_test_rpc(struct sfw_test_unit *tsu, + struct lnet_process_id peer, unsigned int features, + int nblk, int blklen, struct srpc_client_rpc **rpc); +void sfw_abort_rpc(struct srpc_client_rpc *rpc); +void sfw_post_rpc(struct srpc_client_rpc *rpc); +void sfw_client_rpc_done(struct srpc_client_rpc *rpc); +void sfw_unpack_message(struct srpc_msg *msg); +void sfw_free_pages(struct srpc_server_rpc *rpc); +void sfw_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i); +int sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, + int sink); +int sfw_make_session(struct srpc_mksn_reqst *request, + struct srpc_mksn_reply *reply); + +struct srpc_client_rpc * +srpc_create_client_rpc(struct lnet_process_id peer, int service, + int nbulkiov, int bulklen, + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv); +void srpc_post_rpc(struct srpc_client_rpc *rpc); +void srpc_abort_rpc(struct srpc_client_rpc *rpc, int why); +void srpc_free_bulk(struct srpc_bulk *bk); +struct srpc_bulk *srpc_alloc_bulk(int cpt, unsigned int off, + unsigned int bulk_npg, unsigned int bulk_len, + int sink); +int srpc_send_rpc(struct swi_workitem *wi); +int srpc_send_reply(struct srpc_server_rpc *rpc); +int srpc_add_service(struct srpc_service *sv); +int srpc_remove_service(struct srpc_service *sv); +void srpc_shutdown_service(struct srpc_service *sv); +void srpc_abort_service(struct srpc_service *sv); +int srpc_finish_service(struct srpc_service *sv); +int srpc_service_add_buffers(struct srpc_service *sv, int nbuffer); +void srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer); +void srpc_get_counters(struct srpc_counters *cnt); +void srpc_set_counters(const struct srpc_counters *cnt); + +extern struct cfs_wi_sched *lst_sched_serial; +extern struct cfs_wi_sched **lst_sched_test; + +static inline int +srpc_serv_is_framework(struct srpc_service *svc) +{ + return svc->sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID; +} + +static 
inline int +swi_wi_action(struct cfs_workitem *wi) +{ + struct swi_workitem *swi; + + swi = container_of(wi, struct swi_workitem, swi_workitem); + return swi->swi_action(swi); +} + +static inline void +swi_init_workitem(struct swi_workitem *swi, + swi_action_t action, struct cfs_wi_sched *sched) +{ + swi->swi_sched = sched; + swi->swi_action = action; + swi->swi_state = SWI_STATE_NEWBORN; + cfs_wi_init(&swi->swi_workitem, swi_wi_action); +} + +static inline void +swi_schedule_workitem(struct swi_workitem *wi) +{ + cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem); +} + +static inline void +swi_exit_workitem(struct swi_workitem *swi) +{ + cfs_wi_exit(swi->swi_sched, &swi->swi_workitem); +} + +static inline int +swi_deschedule_workitem(struct swi_workitem *swi) +{ + return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem); +} + +int sfw_startup(void); +int srpc_startup(void); +void sfw_shutdown(void); +void srpc_shutdown(void); + +static inline void +srpc_destroy_client_rpc(struct srpc_client_rpc *rpc) +{ + LASSERT (rpc != NULL); + LASSERT (!srpc_event_pending(rpc)); + LASSERT (atomic_read(&rpc->crpc_refcount) == 0); + + if (rpc->crpc_fini == NULL) { + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } else { + (*rpc->crpc_fini) (rpc); + } +} + +static inline void +srpc_init_client_rpc(struct srpc_client_rpc *rpc, struct lnet_process_id peer, + int service, int nbulkiov, int bulklen, + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv) +{ + LASSERT(nbulkiov <= LNET_MAX_IOV); + + memset(rpc, 0, offsetof(struct srpc_client_rpc, + crpc_bulk.bk_iovs[nbulkiov])); + + INIT_LIST_HEAD(&rpc->crpc_list); + swi_init_workitem(&rpc->crpc_wi, srpc_send_rpc, + lst_sched_test[lnet_cpt_of_nid(peer.nid, NULL)]); + spin_lock_init(&rpc->crpc_lock); + atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */ + + rpc->crpc_dest = peer; + rpc->crpc_priv = priv; + rpc->crpc_service = service; + rpc->crpc_bulk.bk_len = bulklen; + rpc->crpc_bulk.bk_niov = nbulkiov; + rpc->crpc_done = rpc_done; + rpc->crpc_fini = rpc_fini; + LNetInvalidateMDHandle(&rpc->crpc_reqstmdh); + LNetInvalidateMDHandle(&rpc->crpc_replymdh); + LNetInvalidateMDHandle(&rpc->crpc_bulk.bk_mdh); + + /* no event is expected at this point */ + rpc->crpc_bulkev.ev_fired = + rpc->crpc_reqstev.ev_fired = + rpc->crpc_replyev.ev_fired = 1; + + rpc->crpc_reqstmsg.msg_magic = SRPC_MSG_MAGIC; + rpc->crpc_reqstmsg.msg_version = SRPC_MSG_VERSION; + rpc->crpc_reqstmsg.msg_type = srpc_service2request(service); +} + +static inline const char * +swi_state2str (int state) +{ +#define STATE2STR(x) case x: return #x + switch(state) { + default: + LBUG(); + STATE2STR(SWI_STATE_NEWBORN); + STATE2STR(SWI_STATE_REPLY_SUBMITTED); + STATE2STR(SWI_STATE_REPLY_SENT); + STATE2STR(SWI_STATE_REQUEST_SUBMITTED); + STATE2STR(SWI_STATE_REQUEST_SENT); + STATE2STR(SWI_STATE_REPLY_RECEIVED); + STATE2STR(SWI_STATE_BULK_STARTED); + STATE2STR(SWI_STATE_DONE); + } +#undef STATE2STR +} + +#define lst_wait_until(cond, lock, fmt, ...) \ +do { \ + int __I = 2; \ + while (!(cond)) { \ + CDEBUG(is_power_of_2(++__I) ? D_WARNING : D_NET, \ + fmt, ## __VA_ARGS__); \ + spin_unlock(&(lock)); \ + \ + schedule_timeout_uninterruptible( \ + cfs_time_seconds(1) / 10); \ + \ + spin_lock(&(lock)); \ + } \ +} while (0) + +static inline void +srpc_wait_service_shutdown(struct srpc_service *sv) +{ + int i = 2; + + LASSERT(sv->sv_shuttingdown); + + while (srpc_finish_service(sv) == 0) { + i++; + CDEBUG(((i & -i) == i) ? 
D_WARNING : D_NET, + "Waiting for %s service to shutdown...\n", + sv->sv_name); + schedule_timeout_uninterruptible(cfs_time_seconds(1) / 10); + } +} + +extern struct sfw_test_client_ops ping_test_client; +extern struct srpc_service ping_test_service; +void ping_init_test_client(void); +void ping_init_test_service(void); + +extern struct sfw_test_client_ops brw_test_client; +extern struct srpc_service brw_test_service; +void brw_init_test_client(void); +void brw_init_test_service(void); + +#endif /* __SELFTEST_SELFTEST_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/selftest/timer.c b/drivers/staging/lustrefsx/lnet/selftest/timer.c new file mode 100644 index 0000000000000..8a35334b065cd --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/timer.c @@ -0,0 +1,244 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/timer.c + * + * Author: Isaac Huang + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + + +/* + * Timers are implemented as a sorted queue of expiry times. The queue + * is slotted, with each slot holding timers which expire in a + * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are + * sorted by increasing expiry time. The number of slots is 2**7 (128), + * to cover a time period of 1024 seconds into the future before wrapping. 
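+ *
+ * For example, with STTIMER_MINPOLL = 3 and STTIMER_NSLOTS = 128 (the
+ * values defined below), a timer expiring at time t is hashed to slot
+ * (t >> 3) & 127, so timers due within the same 8-second window share
+ * a slot.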
+ */ +#define STTIMER_MINPOLL 3 /* log2 min poll interval (8 s) */ +#define STTIMER_SLOTTIME (1 << STTIMER_MINPOLL) +#define STTIMER_SLOTTIMEMASK (~(STTIMER_SLOTTIME - 1)) +#define STTIMER_NSLOTS (1 << 7) +#define STTIMER_SLOT(t) (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \ + (STTIMER_NSLOTS - 1))]) + +static struct st_timer_data { + spinlock_t stt_lock; + /* start time of the slot processed previously */ + time64_t stt_prev_slot; + struct list_head stt_hash[STTIMER_NSLOTS]; + int stt_shuttingdown; + wait_queue_head_t stt_waitq; + int stt_nthreads; +} stt_data; + +void +stt_add_timer(struct stt_timer *timer) +{ + struct list_head *pos; + + spin_lock(&stt_data.stt_lock); + + LASSERT(stt_data.stt_nthreads > 0); + LASSERT(!stt_data.stt_shuttingdown); + LASSERT(timer->stt_func != NULL); + LASSERT(list_empty(&timer->stt_list)); + LASSERT(timer->stt_expires > ktime_get_real_seconds()); + + /* a simple insertion sort */ + list_for_each_prev(pos, STTIMER_SLOT(timer->stt_expires)) { + struct stt_timer *old = list_entry(pos, struct stt_timer, + stt_list); + + if (timer->stt_expires >= old->stt_expires) + break; + } + list_add(&timer->stt_list, pos); + + spin_unlock(&stt_data.stt_lock); +} + +/* + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) + * + * CAVEAT EMPTOR: + * When 0 is returned, it is possible that timer->stt_func _is_ running on + * another CPU. + */ +int +stt_del_timer(struct stt_timer *timer) +{ + int ret = 0; + + spin_lock(&stt_data.stt_lock); + + LASSERT(stt_data.stt_nthreads > 0); + LASSERT(!stt_data.stt_shuttingdown); + + if (!list_empty(&timer->stt_list)) { + ret = 1; + list_del_init(&timer->stt_list); + } + + spin_unlock(&stt_data.stt_lock); + return ret; +} + +/* called with stt_data.stt_lock held */ +static int +stt_expire_list(struct list_head *slot, time64_t now) +{ + int expired = 0; + struct stt_timer *timer; + + while (!list_empty(slot)) { + timer = list_entry(slot->next, struct stt_timer, stt_list); + + if (timer->stt_expires > now) + break; + + list_del_init(&timer->stt_list); + spin_unlock(&stt_data.stt_lock); + + expired++; + (*timer->stt_func) (timer->stt_data); + + spin_lock(&stt_data.stt_lock); + } + + return expired; +} + +static int +stt_check_timers(time64_t *last) +{ + int expired = 0; + time64_t now; + time64_t this_slot; + + now = ktime_get_real_seconds(); + this_slot = now & STTIMER_SLOTTIMEMASK; + + spin_lock(&stt_data.stt_lock); + + while (this_slot >= *last) { + expired += stt_expire_list(STTIMER_SLOT(this_slot), now); + this_slot = this_slot - STTIMER_SLOTTIME; + } + + *last = now & STTIMER_SLOTTIMEMASK; + spin_unlock(&stt_data.stt_lock); + return expired; +} + + +static int +stt_timer_main (void *arg) +{ + int rc = 0; + + while (!stt_data.stt_shuttingdown) { + stt_check_timers(&stt_data.stt_prev_slot); + + rc = wait_event_timeout(stt_data.stt_waitq, + stt_data.stt_shuttingdown, + cfs_time_seconds(STTIMER_SLOTTIME)); + } + + spin_lock(&stt_data.stt_lock); + stt_data.stt_nthreads--; + spin_unlock(&stt_data.stt_lock); + return rc; +} + +static int +stt_start_timer_thread (void) +{ + struct task_struct *task; + + LASSERT(!stt_data.stt_shuttingdown); + + task = kthread_run(stt_timer_main, NULL, "st_timer"); + if (IS_ERR(task)) + return PTR_ERR(task); + + spin_lock(&stt_data.stt_lock); + stt_data.stt_nthreads++; + spin_unlock(&stt_data.stt_lock); + return 0; +} + + +int +stt_startup (void) +{ + int rc = 0; + int i; + + 
stt_data.stt_shuttingdown = 0; + stt_data.stt_prev_slot = ktime_get_real_seconds() & STTIMER_SLOTTIMEMASK; + + spin_lock_init(&stt_data.stt_lock); + for (i = 0; i < STTIMER_NSLOTS; i++) + INIT_LIST_HEAD(&stt_data.stt_hash[i]); + + stt_data.stt_nthreads = 0; + init_waitqueue_head(&stt_data.stt_waitq); + rc = stt_start_timer_thread(); + if (rc != 0) + CERROR ("Can't spawn timer thread: %d\n", rc); + + return rc; +} + +void +stt_shutdown(void) +{ + int i; + + spin_lock(&stt_data.stt_lock); + + for (i = 0; i < STTIMER_NSLOTS; i++) + LASSERT(list_empty(&stt_data.stt_hash[i])); + + stt_data.stt_shuttingdown = 1; + + wake_up(&stt_data.stt_waitq); + lst_wait_until(stt_data.stt_nthreads == 0, stt_data.stt_lock, + "waiting for %d threads to terminate\n", + stt_data.stt_nthreads); + + spin_unlock(&stt_data.stt_lock); +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/timer.h b/drivers/staging/lustrefsx/lnet/selftest/timer.h new file mode 100644 index 0000000000000..bd90553e2d942 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/timer.h @@ -0,0 +1,48 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/timer.h + * + * Author: Isaac Huang + */ +#ifndef __SELFTEST_TIMER_H__ +#define __SELFTEST_TIMER_H__ + +struct stt_timer { + struct list_head stt_list; + time64_t stt_expires; + void (*stt_func)(void *); + void *stt_data; +}; + +void stt_add_timer(struct stt_timer *timer); +int stt_del_timer(struct stt_timer *timer); +int stt_startup(void); +void stt_shutdown(void); + +#endif /* __SELFTEST_TIMER_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/Kconfig b/drivers/staging/lustrefsx/lustre/Kconfig new file mode 100644 index 0000000000000..c565c870d805b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/Kconfig @@ -0,0 +1,41 @@ +config LUSTREFSX_FS + tristate "Lustre file system client support" + depends on m + select LUSTREFSX_LIBCFS + depends on LUSTREFSX_LNET + select CRYPTO + select CRYPTO_CRC32 + select CRYPTO_CRC32_PCLMUL if X86 + select CRYPTO_CRC32C + select CRYPTO_MD5 + select CRYPTO_SHA1 + select CRYPTO_SHA256 + select CRYPTO_SHA512 + depends on MULTIUSER + help + This option enables Lustre file system client support. Choose Y + here if you want to access a Lustre file system cluster. To compile + this file system support as a module, choose M here: the module will + be called lustre. + + To mount Lustre file systems, you also need to install the user space + mount.lustre and other user space commands which can be found in the + lustre-client package. 
+ + Lustre file system is the most popular cluster file system in high + performance computing. Source code of both kernel space and user space + Lustre components can also be found at + http://git.whamcloud.com/?p=fs/lustre-release.git;a=summary + + If unsure, say N. + + See also http://wiki.lustre.org/ + +config LUSTRE_DEBUG_EXPENSIVE_CHECK + bool "Enable Lustre DEBUG checks" + depends on LUSTREFSX_FS + help + This option is mainly for debug purpose. It enables Lustre code to do + expensive checks that may have a performance impact. + + Use with caution. If unsure, say N. diff --git a/drivers/staging/lustrefsx/lustre/LICENSE b/drivers/staging/lustrefsx/lustre/LICENSE new file mode 100644 index 0000000000000..edb73cdedca6a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/LICENSE @@ -0,0 +1,372 @@ +Each file in this distribution contains a header stating the copyright +owner(s), and the licensing terms for that file. Some files are not +eligible for copyright protection, and contain neither. + +There are many files which may be covered by a separate license that +you signed or otherwise agreed to before downloading this software. +If you did not agree to such an agreement, or if the file does not +mention that license, then you can redistribute and/or modify it under +the terms of version 2 of the GNU General Public License. Each file +is very clear about which license is applicable. + +In any case, Lustre is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the license +text for more details. + +Reproduced below is the GNU General Public License version 2, and +Linus's clarifying statement from the Linux kernel source code: + +---------------------------------------- + + NOTE! This copyright does *not* cover user programs that use kernel + services by normal system calls - this is merely considered normal use + of the kernel, and does *not* fall under the heading of "derived work". + Also note that the GPL below is copyrighted by the Free Software + Foundation, but the instance of code that it refers to (the Linux + kernel) is copyrighted by me and others who actually wrote it. + + Linus Torvalds + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. 
+ +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. 
+ +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) 19yy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/drivers/staging/lustrefsx/lustre/Makefile b/drivers/staging/lustrefsx/lustre/Makefile new file mode 100644 index 0000000000000..207cab53c0633 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTREFSX_FS) += fid/ +obj-$(CONFIG_LUSTREFSX_FS) += obdclass/ +obj-$(CONFIG_LUSTREFSX_FS) += ptlrpc/ +obj-$(CONFIG_LUSTREFSX_FS) += obdecho/ +obj-$(CONFIG_LUSTREFSX_FS) += mgc/ +obj-$(CONFIG_LUSTREFSX_FS) += lov/ osc/ mdc/ lmv/ llite/ fld/ + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/fid/Makefile b/drivers/staging/lustrefsx/lustre/fid/Makefile new file mode 100644 index 0000000000000..22be6773ba08f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += fid.o + +fid-y := fid_request.o lproc_fid.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_handler.c b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c new file mode 100644 index 0000000000000..06196e66b971e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c @@ -0,0 +1,616 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fid/fid_handler.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include +#include +#include +#include "fid_internal.h" + +/* Assigns client to sequence controller node. */ +int seq_server_set_cli(const struct lu_env *env, struct lu_server_seq *seq, + struct lu_client_seq *cli) +{ + int rc = 0; + ENTRY; + + /* + * Ask client for new range, assign that range to ->seq_space and write + * seq state to backing store should be atomic. + */ + mutex_lock(&seq->lss_mutex); + + if (!cli) { + CDEBUG(D_INFO, "%s: Detached sequence client\n", seq->lss_name); + seq->lss_cli = NULL; + GOTO(out_up, rc = 0); + } + + if (seq->lss_cli) { + CDEBUG(D_HA, "%s: Sequence controller is already assigned\n", + seq->lss_name); + GOTO(out_up, rc = -EEXIST); + } + + CDEBUG(D_INFO, "%s: Attached sequence controller %s\n", + seq->lss_name, cli->lcs_name); + + seq->lss_cli = cli; + cli->lcs_space.lsr_index = seq->lss_site->ss_node_id; + EXIT; +out_up: + mutex_unlock(&seq->lss_mutex); + return rc; +} +EXPORT_SYMBOL(seq_server_set_cli); +/* + * allocate \a w units of sequence from range \a from. 
+ */ +static inline void range_alloc(struct lu_seq_range *to, + struct lu_seq_range *from, + __u64 width) +{ + width = min(lu_seq_range_space(from), width); + to->lsr_start = from->lsr_start; + to->lsr_end = from->lsr_start + width; + from->lsr_start += width; +} + +/** + * On controller node, allocate new super sequence for regular sequence server. + * As this super sequence controller, this node suppose to maintain fld + * and update index. + * \a out range always has currect mds node number of requester. + */ + +static int __seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc; + ENTRY; + + LASSERT(lu_seq_range_is_sane(space)); + + if (lu_seq_range_is_exhausted(space)) { + CERROR("%s: Sequences space is exhausted\n", + seq->lss_name); + RETURN(-ENOSPC); + } else { + range_alloc(out, space, seq->lss_width); + } + + rc = seq_store_update(env, seq, out, 1 /* sync */); + + LCONSOLE_INFO("%s: super-sequence allocation rc = %d " DRANGE"\n", + seq->lss_name, rc, PRANGE(out)); + + RETURN(rc); +} + +int seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + int rc; + ENTRY; + + mutex_lock(&seq->lss_mutex); + rc = __seq_server_alloc_super(seq, out, env); + mutex_unlock(&seq->lss_mutex); + + RETURN(rc); +} + +int seq_server_alloc_spec(struct lu_server_seq *seq, + struct lu_seq_range *spec, + const struct lu_env *env) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc = -ENOSPC; + ENTRY; + + /* + * In some cases (like recovery after a disaster) + * we may need to allocate sequences manually + * Notice some sequences can be lost if requested + * range doesn't start at the beginning of current + * free space. Also notice it's not possible now + * to allocate sequences out of natural order. + */ + if (spec->lsr_start >= spec->lsr_end) + RETURN(-EINVAL); + if (spec->lsr_flags != LU_SEQ_RANGE_MDT && + spec->lsr_flags != LU_SEQ_RANGE_OST) + RETURN(-EINVAL); + + mutex_lock(&seq->lss_mutex); + if (spec->lsr_start >= space->lsr_start) { + space->lsr_start = spec->lsr_end; + rc = seq_store_update(env, seq, spec, 1 /* sync */); + + LCONSOLE_INFO("%s: "DRANGE" sequences allocated: rc = %d \n", + seq->lss_name, PRANGE(spec), rc); + } + mutex_unlock(&seq->lss_mutex); + + RETURN(rc); +} + +static int __seq_set_init(const struct lu_env *env, + struct lu_server_seq *seq) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc; + + range_alloc(&seq->lss_lowater_set, space, seq->lss_set_width); + range_alloc(&seq->lss_hiwater_set, space, seq->lss_set_width); + + rc = seq_store_update(env, seq, NULL, 1); + + return rc; +} + +/* + * This function implements new seq allocation algorithm using async + * updates to seq file on disk. ref bug 18857 for details. + * there are four variable to keep track of this process + * + * lss_space; - available lss_space + * lss_lowater_set; - lu_seq_range for all seqs before barrier, i.e. safe to use + * lss_hiwater_set; - lu_seq_range after barrier, i.e. allocated but may be + * not yet committed + * + * when lss_lowater_set reaches the end it is replaced with hiwater one and + * a write operation is initiated to allocate new hiwater range. + * if last seq write opearion is still not committed, current operation is + * flaged as sync write op. 
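For readers who want to see the low/high watermark scheme described above in isolation, here is a standalone C sketch of range_alloc() and the rotation performed by range_alloc_set(). It is not part of the patch: the types, names, and widths are simplified stand-ins for the kernel structures, and the on-disk update is reduced to a return flag.

```c
/*
 * Standalone model of the low/high water sequence-range rotation
 * described in the comment above.  Types and names are simplified
 * stand-ins for the kernel structures, not the real definitions.
 */
#include <stdint.h>
#include <stdio.h>

struct seq_range { uint64_t start, end; };	/* [start, end) */

static uint64_t range_space(const struct seq_range *r)
{
	return r->end - r->start;
}

static int range_is_exhausted(const struct seq_range *r)
{
	return range_space(r) == 0;
}

/* Carve up to @width sequences off the front of @from into @to. */
static void range_alloc(struct seq_range *to, struct seq_range *from,
			uint64_t width)
{
	if (width > range_space(from))
		width = range_space(from);
	to->start = from->start;
	to->end = from->start + width;
	from->start += width;
}

/*
 * Grant one range of @width from @lo; when @lo runs dry, promote @hi
 * to @lo and carve a fresh @hi from @space.  Returns 1 when the caller
 * would have to write the shrunken @space back to disk.
 */
static int range_alloc_set(struct seq_range *out, struct seq_range *space,
			   struct seq_range *lo, struct seq_range *hi,
			   uint64_t width, uint64_t set_width)
{
	int need_update = 0;

	if (range_is_exhausted(lo)) {
		*lo = *hi;			   /* switch to hiwater range */
		range_alloc(hi, space, set_width); /* pre-allocate next set */
		need_update = 1;		   /* *space changed on disk */
	}
	range_alloc(out, lo, width);
	return need_update;
}

int main(void)
{
	struct seq_range space = { 0x400, 0x10000 };
	struct seq_range lo, hi, out;
	int i;

	/* initial fill of both sets, as __seq_set_init() does */
	range_alloc(&lo, &space, 64);
	range_alloc(&hi, &space, 64);

	for (i = 0; i < 6; i++) {
		int update = range_alloc_set(&out, &space, &lo, &hi, 16, 64);

		printf("[%#llx, %#llx) update=%d\n",
		       (unsigned long long)out.start,
		       (unsigned long long)out.end, update);
	}
	return 0;
}
```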
+ */ +static int range_alloc_set(const struct lu_env *env, + struct lu_seq_range *out, + struct lu_server_seq *seq) +{ + struct lu_seq_range *space = &seq->lss_space; + struct lu_seq_range *loset = &seq->lss_lowater_set; + struct lu_seq_range *hiset = &seq->lss_hiwater_set; + int rc = 0; + + if (lu_seq_range_is_zero(loset)) + __seq_set_init(env, seq); + + if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_ALLOC)) /* exhaust set */ + loset->lsr_start = loset->lsr_end; + + if (lu_seq_range_is_exhausted(loset)) { + /* reached high water mark. */ + struct lu_device *dev = seq->lss_site->ss_lu->ls_top_dev; + int obd_num_clients = dev->ld_obd->obd_num_exports; + __u64 set_sz; + + /* calculate new seq width based on number of clients */ + set_sz = max(seq->lss_set_width, + obd_num_clients * seq->lss_width); + set_sz = min(lu_seq_range_space(space), set_sz); + + /* Switch to hiwater range now */ + *loset = *hiset; + /* allocate new hiwater range */ + range_alloc(hiset, space, set_sz); + + /* update ondisk seq with new *space */ + rc = seq_store_update(env, seq, NULL, seq->lss_need_sync); + } + + LASSERTF(!lu_seq_range_is_exhausted(loset) || + lu_seq_range_is_sane(loset), + DRANGE"\n", PRANGE(loset)); + + if (rc == 0) + range_alloc(out, loset, seq->lss_width); + + RETURN(rc); +} + +/** + * Check if the sequence server has sequence avaible + * + * Check if the sequence server has sequence avaible, if not, then + * allocating super sequence from sequence manager (MDT0). + * + * \param[in] env execution environment + * \param[in] seq server sequence + * + * \retval negative errno if allocating new sequence fails + * \retval 0 if there is enough sequence or allocating + * new sequence succeeds + */ +int seq_server_check_and_alloc_super(const struct lu_env *env, + struct lu_server_seq *seq) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc = 0; + + ENTRY; + + /* Check if available space ends and allocate new super seq */ + if (lu_seq_range_is_exhausted(space)) { + if (!seq->lss_cli) { + CERROR("%s: No sequence controller is attached.\n", + seq->lss_name); + RETURN(-ENODEV); + } + + rc = seq_client_alloc_super(seq->lss_cli, env); + if (rc) { + CDEBUG(D_HA, + "%s: Can't allocate super-sequence: rc = %d\n", + seq->lss_name, rc); + RETURN(rc); + } + + /* Saving new range to allocation space. 
*/ + *space = seq->lss_cli->lcs_space; + LASSERT(lu_seq_range_is_sane(space)); + if (!seq->lss_cli->lcs_srv) { + struct lu_server_fld *fld; + + /* Insert it to the local FLDB */ + fld = seq->lss_site->ss_server_fld; + mutex_lock(&fld->lsf_lock); + rc = fld_insert_entry(env, fld, space); + mutex_unlock(&fld->lsf_lock); + } + } + + if (lu_seq_range_is_zero(&seq->lss_lowater_set)) + __seq_set_init(env, seq); + + RETURN(rc); +} +EXPORT_SYMBOL(seq_server_check_and_alloc_super); + +static int __seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc = 0; + + ENTRY; + + LASSERT(lu_seq_range_is_sane(space)); + + rc = seq_server_check_and_alloc_super(env, seq); + if (rc < 0) { + if (rc == -EINPROGRESS) { + static int printed; + + if (printed++ % 8 == 0) + LCONSOLE_INFO("%s: Waiting to contact MDT0000 to allocate super-sequence: rc = %d\n", + seq->lss_name, rc); + } else { + CERROR("%s: Allocated super-sequence failed: rc = %d\n", + seq->lss_name, rc); + } + RETURN(rc); + } + + rc = range_alloc_set(env, out, seq); + if (rc != 0) { + CERROR("%s: Allocated meta-sequence failed: rc = %d\n", + seq->lss_name, rc); + RETURN(rc); + } + + CDEBUG(D_INFO, "%s: Allocated meta-sequence " DRANGE"\n", + seq->lss_name, PRANGE(out)); + + RETURN(rc); +} + +int seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + int rc; + ENTRY; + + mutex_lock(&seq->lss_mutex); + rc = __seq_server_alloc_meta(seq, out, env); + mutex_unlock(&seq->lss_mutex); + + RETURN(rc); +} +EXPORT_SYMBOL(seq_server_alloc_meta); + +static int seq_server_handle(struct lu_site *site, + const struct lu_env *env, + __u32 opc, struct lu_seq_range *out) +{ + int rc; + struct seq_server_site *ss_site; + struct dt_device *dev; + ENTRY; + + ss_site = lu_site2seq(site); + + switch (opc) { + case SEQ_ALLOC_META: + if (!ss_site->ss_server_seq) { + rc = -EINVAL; + CERROR("Sequence server is not initialized: rc = %d\n", + rc); + RETURN(rc); + } + + dev = lu2dt_dev(ss_site->ss_server_seq->lss_obj->do_lu.lo_dev); + if (dev->dd_rdonly) + RETURN(-EROFS); + + rc = seq_server_alloc_meta(ss_site->ss_server_seq, out, env); + break; + case SEQ_ALLOC_SUPER: + if (!ss_site->ss_control_seq) { + rc = -EINVAL; + CERROR("Sequence controller is not initialized: rc = %d\n", + rc); + RETURN(rc); + } + + dev = lu2dt_dev(ss_site->ss_control_seq->lss_obj->do_lu.lo_dev); + if (dev->dd_rdonly) + RETURN(-EROFS); + + rc = seq_server_alloc_super(ss_site->ss_control_seq, out, env); + break; + default: + rc = -EINVAL; + break; + } + + RETURN(rc); +} + +static int seq_handler(struct tgt_session_info *tsi) +{ + struct lu_seq_range *out, *tmp; + struct lu_site *site; + int rc; + __u32 *opc; + + ENTRY; + + LASSERT(!(lustre_msg_get_flags(tgt_ses_req(tsi)->rq_reqmsg) & MSG_REPLAY)); + site = tsi->tsi_exp->exp_obd->obd_lu_dev->ld_site; + LASSERT(site != NULL); + + opc = req_capsule_client_get(tsi->tsi_pill, &RMF_SEQ_OPC); + if (opc) { + out = req_capsule_server_get(tsi->tsi_pill, &RMF_SEQ_RANGE); + if (!out) + RETURN(err_serious(-EPROTO)); + + tmp = req_capsule_client_get(tsi->tsi_pill, &RMF_SEQ_RANGE); + + /* + * seq client passed mdt id, we need to pass that using out + * range parameter + */ + out->lsr_index = tmp->lsr_index; + out->lsr_flags = tmp->lsr_flags; + rc = seq_server_handle(site, tsi->tsi_env, *opc, out); + } else { + rc = err_serious(-EPROTO); + } + + RETURN(rc); +} + +struct tgt_handler seq_handlers[] = { 
+TGT_SEQ_HDL(HAS_REPLY, SEQ_QUERY, seq_handler), +}; +EXPORT_SYMBOL(seq_handlers); + +/* context key constructor/destructor: seq_key_init, seq_key_fini */ +LU_KEY_INIT_FINI(seq, struct seq_thread_info); + +/* context key: seq_thread_key */ +LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD | LCT_DT_THREAD); + +static void seq_server_debugfs_fini(struct lu_server_seq *seq) +{ + debugfs_remove_recursive(seq->lss_debugfs_entry); +} + +static void seq_server_debugfs_init(struct lu_server_seq *seq) +{ + ENTRY; + + seq->lss_debugfs_entry = debugfs_create_dir(seq->lss_name, + seq_debugfs_dir); + + ldebugfs_add_vars(seq->lss_debugfs_entry, + seq_server_debugfs_list, seq); + + if (seq->lss_type == LUSTRE_SEQ_CONTROLLER) + debugfs_create_file("fldb", 0644, seq->lss_debugfs_entry, + seq, &seq_fld_debugfs_seq_fops); +} + +int seq_server_init(const struct lu_env *env, + struct lu_server_seq *seq, + struct dt_device *dev, + const char *prefix, + enum lu_mgr_type type, + struct seq_server_site *ss) +{ + int rc, is_srv = (type == LUSTRE_SEQ_SERVER); + ENTRY; + + LASSERT(dev != NULL); + LASSERT(prefix != NULL); + LASSERT(ss != NULL); + LASSERT(ss->ss_lu != NULL); + + /* + * Check all lu_fid fields are converted in fid_cpu_to_le() and friends + * and that there is no padding added by compiler to the struct. + */ + { + struct lu_fid tst; + + BUILD_BUG_ON(sizeof(tst) != sizeof(tst.f_seq) + + sizeof(tst.f_oid) + sizeof(tst.f_ver)); + } + + seq->lss_cli = NULL; + seq->lss_type = type; + seq->lss_site = ss; + lu_seq_range_init(&seq->lss_space); + + lu_seq_range_init(&seq->lss_lowater_set); + lu_seq_range_init(&seq->lss_hiwater_set); + seq->lss_set_width = LUSTRE_SEQ_BATCH_WIDTH; + + mutex_init(&seq->lss_mutex); + + seq->lss_width = is_srv ? + LUSTRE_SEQ_META_WIDTH : LUSTRE_SEQ_SUPER_WIDTH; + + snprintf(seq->lss_name, sizeof(seq->lss_name), + "%s-%s", (is_srv ? "srv" : "ctl"), prefix); + + rc = seq_store_init(seq, env, dev); + if (rc) + GOTO(out, rc); + /* Request backing store for saved sequence info. */ + rc = seq_store_read(seq, env); + if (rc == -ENODATA) { + + /* Nothing is read, init by default value. */ + seq->lss_space = is_srv ? + LUSTRE_SEQ_ZERO_RANGE : + LUSTRE_SEQ_SPACE_RANGE; + + seq->lss_space.lsr_index = ss->ss_node_id; + LCONSOLE_INFO("%s: No data found on store. 
Initialize space: rc = %d\n", + seq->lss_name, rc); + + rc = seq_store_update(env, seq, NULL, 0); + if (rc) { + CERROR("%s: Can't write space data: rc = %d\n", + seq->lss_name, rc); + } + } else if (rc) { + CERROR("%s: Can't read space data: rc = %d\n", + seq->lss_name, rc); + GOTO(out, rc); + } + + if (is_srv) { + LASSERT(lu_seq_range_is_sane(&seq->lss_space)); + } else { + LASSERT(!lu_seq_range_is_zero(&seq->lss_space) && + lu_seq_range_is_sane(&seq->lss_space)); + } + + seq_server_debugfs_init(seq); + + EXIT; +out: + if (rc) + seq_server_fini(seq, env); + return rc; +} +EXPORT_SYMBOL(seq_server_init); + +void seq_server_fini(struct lu_server_seq *seq, + const struct lu_env *env) +{ + ENTRY; + + seq_server_debugfs_fini(seq); + seq_store_fini(seq, env); + + EXIT; +} +EXPORT_SYMBOL(seq_server_fini); + +int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss) +{ + if (!ss) + RETURN(0); + + if (ss->ss_server_seq) { + seq_server_fini(ss->ss_server_seq, env); + OBD_FREE_PTR(ss->ss_server_seq); + ss->ss_server_seq = NULL; + } + + if (ss->ss_control_seq) { + seq_server_fini(ss->ss_control_seq, env); + OBD_FREE_PTR(ss->ss_control_seq); + ss->ss_control_seq = NULL; + } + + if (ss->ss_client_seq) { + seq_client_fini(ss->ss_client_seq); + OBD_FREE_PTR(ss->ss_client_seq); + ss->ss_client_seq = NULL; + } + + RETURN(0); +} +EXPORT_SYMBOL(seq_site_fini); + +int fid_server_mod_init(void) +{ + LU_CONTEXT_KEY_INIT(&seq_thread_key); + return lu_context_key_register(&seq_thread_key); +} + +void fid_server_mod_exit(void) +{ + lu_context_key_degister(&seq_thread_key); +} diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_internal.h b/drivers/staging/lustrefsx/lustre/fid/fid_internal.h new file mode 100644 index 0000000000000..c2b0f5f688f1d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_internal.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fid/fid_internal.h + * + * Author: Yury Umanets + */ +#ifndef __FID_INTERNAL_H +#define __FID_INTERNAL_H + +#include + +#ifdef HAVE_SERVER_SUPPORT +# define HAVE_SEQ_SERVER + +struct req_capsule; + +struct seq_thread_info { + struct req_capsule *sti_pill; + struct lu_seq_range sti_space; + struct lu_buf sti_buf; +}; + +enum { + SEQ_TXN_STORE_CREDITS = 20 +}; + +extern struct lu_context_key seq_thread_key; + +extern struct ldebugfs_vars seq_server_debugfs_list[]; + +/* Store API functions. 
*/ +struct dt_device; + +int seq_store_init(struct lu_server_seq *seq, + const struct lu_env *env, + struct dt_device *dt); + +void seq_store_fini(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_store_read(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq, + struct lu_seq_range *out, int sync); + +int seq_server_alloc_spec(struct lu_server_seq *seq, + struct lu_seq_range *spec, + const struct lu_env *env); + +int fid_server_mod_init(void); + +void fid_server_mod_exit(void); + +# endif /* HAVE_SERVER_SUPPORT */ + +/* Functions used internally in module. */ +int seq_client_alloc_super(struct lu_client_seq *seq, + const struct lu_env *env); + +extern struct dentry *seq_debugfs_dir; + +extern struct ldebugfs_vars seq_client_debugfs_list[]; + +extern const struct file_operations seq_fld_debugfs_seq_fops; + +#endif /* __FID_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_lib.c b/drivers/staging/lustrefsx/lustre/fid/fid_lib.c new file mode 100644 index 0000000000000..4bd05526fe283 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_lib.c @@ -0,0 +1,98 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fid/fid_lib.c + * + * Miscellaneous fid functions. + * + * Author: Nikita Danilov + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include + +/** + * A cluster-wide range from which fid-sequences are granted to servers and + * then clients. + * + * Fid namespace: + *
+ * Normal FID:        seq:64 [2^33,2^64-1]      oid:32          ver:32
+ * IGIF      :        0:32, ino:32              gen:32          0:32
+ * IDIF      :        0:31, 1:1, ost-index:16,  objd:48         0:32
+ * 
+ * + * The first 0x400 sequences of normal FID are reserved for special purpose. + * FID_SEQ_START + 1 is for local file id generation. + * FID_SEQ_START + 2 is for .lustre directory and its objects + */ +const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = { + .lsr_start = FID_SEQ_NORMAL, + .lsr_end = (__u64)~0ULL, +}; + +/* Zero range, used for init and other purposes. */ +const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = { + .lsr_start = 0, +}; + +/* Lustre Big Fs Lock fid. */ +const struct lu_fid LUSTRE_BFL_FID = { .f_seq = FID_SEQ_SPECIAL, + .f_oid = FID_OID_SPECIAL_BFL, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LUSTRE_BFL_FID); + +/** Special fid for ".lustre" directory */ +const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_DOT_LUSTRE_FID); + +/** Special fid for "fid" special object in .lustre */ +const struct lu_fid LU_OBF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE_OBF, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_OBF_FID); + +/** Special fid for "lost+found" special object in .lustre */ +const struct lu_fid LU_LPF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE_LPF, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_LPF_FID); + +/** "/lost+found" - special FID for ldiskfs backend, invislbe to client. */ +const struct lu_fid LU_BACKEND_LPF_FID = { .f_seq = FID_SEQ_LOCAL_FILE, + .f_oid = OSD_LPF_OID, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_BACKEND_LPF_FID); diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_request.c b/drivers/staging/lustrefsx/lustre/fid/fid_request.c new file mode 100644 index 0000000000000..2fa8590506c0f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_request.c @@ -0,0 +1,523 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
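As a quick aid to the namespace table and reserved-sequence note above, the following standalone sketch models the three FID fields and the boundary of the general-purpose ("normal") sequence space. The constants are derived from the comment (2^33 plus the 0x400 reserved sequences) rather than copied from the Lustre headers, so treat them as illustrative only.

```c
/*
 * Minimal illustration of the FID layout in the table above.  The
 * struct mirrors the three on-wire fields (seq:64, oid:32, ver:32);
 * the boundary constants are derived from the comment, not taken from
 * the Lustre headers.
 */
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

struct fid_sketch {
	uint64_t f_seq;	/* cluster-wide sequence number */
	uint32_t f_oid;	/* object id inside the sequence */
	uint32_t f_ver;	/* version, normally 0 */
};

#define SKETCH_SEQ_BASE		(1ULL << 33)	/* 2^33 */
#define SKETCH_SEQ_RESERVED	0x400ULL	/* special-purpose sequences */
#define SKETCH_SEQ_NORMAL	(SKETCH_SEQ_BASE + SKETCH_SEQ_RESERVED)

static int fid_is_normal(const struct fid_sketch *fid)
{
	return fid->f_seq >= SKETCH_SEQ_NORMAL;
}

int main(void)
{
	struct fid_sketch fid = { SKETCH_SEQ_NORMAL + 7, 1, 0 };

	printf("[%#" PRIx64 ":0x%" PRIx32 ":0x%" PRIx32 "] normal=%d\n",
	       fid.f_seq, fid.f_oid, fid.f_ver, fid_is_normal(&fid));
	return 0;
}
```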
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fid/fid_request.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include +#include +/* mdc RPC locks */ +#include +#include "fid_internal.h" + +struct dentry *seq_debugfs_dir; + +static int seq_client_rpc(struct lu_client_seq *seq, + struct lu_seq_range *output, __u32 opc, + const char *opcname) +{ + struct obd_export *exp = seq->lcs_exp; + struct ptlrpc_request *req; + struct lu_seq_range *out, *in; + __u32 *op; + unsigned int debug_mask; + int rc; + ENTRY; + + LASSERT(exp != NULL && !IS_ERR(exp)); + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_SEQ_QUERY, + LUSTRE_MDS_VERSION, SEQ_QUERY); + if (!req) + RETURN(-ENOMEM); + + /* Init operation code */ + op = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_OPC); + *op = opc; + + /* Zero out input range, this is not recovery yet. */ + in = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_RANGE); + lu_seq_range_init(in); + + ptlrpc_request_set_replen(req); + + in->lsr_index = seq->lcs_space.lsr_index; + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + fld_range_set_mdt(in); + else + fld_range_set_ost(in); + + if (opc == SEQ_ALLOC_SUPER) { + req->rq_request_portal = SEQ_CONTROLLER_PORTAL; + req->rq_reply_portal = MDC_REPLY_PORTAL; + /* + * During allocating super sequence for data object, + * the current thread might hold the export of MDT0(MDT0 + * precreating objects on this OST), and it will send the + * request to MDT0 here, so we can not keep resending the + * request here, otherwise if MDT0 is failed(umounted), + * it can not release the export of MDT0 + */ + if (seq->lcs_type == LUSTRE_SEQ_DATA) { + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + debug_mask = D_CONSOLE; + } else { + if (seq->lcs_type == LUSTRE_SEQ_METADATA) { + req->rq_reply_portal = MDC_REPLY_PORTAL; + req->rq_request_portal = SEQ_METADATA_PORTAL; + } else { + req->rq_reply_portal = OSC_REPLY_PORTAL; + req->rq_request_portal = SEQ_DATA_PORTAL; + } + + debug_mask = D_INFO; + } + + /* Allow seq client RPC during recovery time. */ + req->rq_allow_replay = 1; + + ptlrpc_at_set_req_timeout(req); + + rc = ptlrpc_queue_wait(req); + + if (rc) + GOTO(out_req, rc); + + out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE); + *output = *out; + + if (!lu_seq_range_is_sane(output)) { + CERROR("%s: Invalid range received from server: " + DRANGE"\n", seq->lcs_name, PRANGE(output)); + GOTO(out_req, rc = -EINVAL); + } + + if (lu_seq_range_is_exhausted(output)) { + CERROR("%s: Range received from server is exhausted: " + DRANGE"]\n", seq->lcs_name, PRANGE(output)); + GOTO(out_req, rc = -EINVAL); + } + + CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence "DRANGE"]\n", + seq->lcs_name, opcname, PRANGE(output)); + + EXIT; +out_req: + ptlrpc_req_finished(req); + return rc; +} + +/* Request sequence-controller node to allocate new super-sequence. 
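The client-side RPC above refuses ranges that are malformed or empty. A minimal standalone version of that reply check, with a plain struct and helpers standing in for the kernel's lu_seq_range routines, looks like this:

```c
/*
 * Sketch of the reply validation performed after the SEQ_QUERY RPC
 * above: a returned range must be well formed (start <= end) and must
 * not be empty, otherwise the client treats the reply as an error.
 * Simplified userspace stand-in, not the kernel helpers.
 */
#include <stdint.h>
#include <errno.h>
#include <stdio.h>

struct seq_range { uint64_t start, end; };	/* [start, end) */

static int range_is_sane(const struct seq_range *r)
{
	return r->start <= r->end;
}

static int range_is_exhausted(const struct seq_range *r)
{
	return r->start == r->end;
}

static int check_seq_reply(const struct seq_range *out)
{
	if (!range_is_sane(out)) {
		fprintf(stderr, "invalid range from server\n");
		return -EINVAL;
	}
	if (range_is_exhausted(out)) {
		fprintf(stderr, "exhausted range from server\n");
		return -EINVAL;
	}
	return 0;
}

int main(void)
{
	struct seq_range good = { 0x400, 0x800 }, empty = { 0x400, 0x400 };

	printf("good: %d, empty: %d\n",
	       check_seq_reply(&good), check_seq_reply(&empty));
	return 0;
}
```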
*/ +int seq_client_alloc_super(struct lu_client_seq *seq, + const struct lu_env *env) +{ + int rc; + ENTRY; + + mutex_lock(&seq->lcs_mutex); + + if (seq->lcs_srv) { +#ifdef HAVE_SEQ_SERVER + LASSERT(env != NULL); + rc = seq_server_alloc_super(seq->lcs_srv, &seq->lcs_space, env); +#else + rc = 0; +#endif + } else { + /* + * Check whether the connection to seq controller has been + * setup (lcs_exp != NULL) + */ + if (!seq->lcs_exp) { + mutex_unlock(&seq->lcs_mutex); + RETURN(-EINPROGRESS); + } + + rc = seq_client_rpc(seq, &seq->lcs_space, + SEQ_ALLOC_SUPER, "super"); + } + mutex_unlock(&seq->lcs_mutex); + RETURN(rc); +} + +/* Request sequence-controller node to allocate new meta-sequence. */ +static int seq_client_alloc_meta(const struct lu_env *env, + struct lu_client_seq *seq) +{ + int rc; + ENTRY; + + if (seq->lcs_srv) { +#ifdef HAVE_SEQ_SERVER + LASSERT(env); + rc = seq_server_alloc_meta(seq->lcs_srv, &seq->lcs_space, env); +#else + rc = 0; +#endif + } else { + do { + /* + * If meta server return -EINPROGRESS or EAGAIN, + * it means meta server might not be ready to + * allocate super sequence from sequence controller + * (MDT0)yet + */ + rc = seq_client_rpc(seq, &seq->lcs_space, + SEQ_ALLOC_META, "meta"); + if (rc == -EINPROGRESS || rc == -EAGAIN) + /* + * MDT0 is not ready, let's wait for 2 + * seconds and retry. + */ + ssleep(2); + + } while (rc == -EINPROGRESS || rc == -EAGAIN); + } + + RETURN(rc); +} + +/* Allocate new sequence for client. */ +static int seq_client_alloc_seq(const struct lu_env *env, + struct lu_client_seq *seq, u64 *seqnr) +{ + int rc; + ENTRY; + + LASSERT(lu_seq_range_is_sane(&seq->lcs_space)); + + if (lu_seq_range_is_exhausted(&seq->lcs_space)) { + rc = seq_client_alloc_meta(env, seq); + if (rc) { + if (rc != -EINPROGRESS) + CERROR("%s: Cannot allocate new meta-sequence: rc = %d\n", + seq->lcs_name, rc); + RETURN(rc); + } else { + CDEBUG(D_INFO, "%s: New range - "DRANGE"\n", + seq->lcs_name, PRANGE(&seq->lcs_space)); + } + } else { + rc = 0; + } + + LASSERT(!lu_seq_range_is_exhausted(&seq->lcs_space)); + *seqnr = seq->lcs_space.lsr_start; + seq->lcs_space.lsr_start += 1; + + CDEBUG(D_INFO, "%s: Allocated sequence [%#llx]\n", seq->lcs_name, + *seqnr); + + RETURN(rc); +} + +/** + * Allocate the whole non-used seq to the caller. + * + * \param[in] env pointer to the thread context + * \param[in,out] seq pointer to the client sequence manager + * \param[out] seqnr to hold the new allocated sequence + * + * \retval 0 for new sequence allocated. + * \retval Negative error number on failure. + */ +int seq_client_get_seq(const struct lu_env *env, + struct lu_client_seq *seq, u64 *seqnr) +{ + int rc; + + LASSERT(seqnr != NULL); + + mutex_lock(&seq->lcs_mutex); + + rc = seq_client_alloc_seq(env, seq, seqnr); + if (rc) { + CERROR("%s: Can't allocate new sequence: rc = %d\n", + seq->lcs_name, rc); + } else { + CDEBUG(D_INFO, "%s: New sequence [0x%16.16llx]\n", + seq->lcs_name, *seqnr); + seq->lcs_fid.f_seq = *seqnr; + seq->lcs_fid.f_ver = 0; + /* + * The caller require the whole seq, + * so marked this seq to be used + */ + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + seq->lcs_fid.f_oid = + LUSTRE_METADATA_SEQ_MAX_WIDTH; + else + seq->lcs_fid.f_oid = LUSTRE_DATA_SEQ_MAX_WIDTH; + } + mutex_unlock(&seq->lcs_mutex); + + return rc; +} +EXPORT_SYMBOL(seq_client_get_seq); + +/** + * Allocate new fid on passed client @seq and save it to @fid. 
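seq_client_alloc_meta() above copes with an MDT0 that is not yet ready by sleeping and retrying. The loop below is only a userspace model of that policy, with a fake RPC and sleep() in place of ssleep(); it is not the kernel code.

```c
/*
 * Compact model of the "retry on -EINPROGRESS or -EAGAIN" loop used
 * while MDT0 is not yet able to hand out a super sequence.  The RPC is
 * faked so the example terminates after a couple of attempts.
 */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

static int fake_alloc_meta_rpc(int *attempts)
{
	/* pretend the controller needs a couple of tries to come up */
	return (*attempts)++ < 2 ? -EINPROGRESS : 0;
}

static int alloc_meta_with_retry(void)
{
	int attempts = 0;
	int rc;

	do {
		rc = fake_alloc_meta_rpc(&attempts);
		if (rc == -EINPROGRESS || rc == -EAGAIN)
			sleep(2);	/* MDT0 not ready, wait and retry */
	} while (rc == -EINPROGRESS || rc == -EAGAIN);

	return rc;
}

int main(void)
{
	printf("rc = %d\n", alloc_meta_with_retry());
	return 0;
}
```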
+ * + * \param[in] env pointer to the thread context + * \param[in,out] seq pointer to the client sequence manager + * \param[out] fid to hold the new allocated fid + * + * \retval 1 for notify the caller that sequence switch + * is performed to allow it to setup FLD for it. + * \retval 0 for new FID allocated in current sequence. + * \retval Negative error number on failure. + */ +int seq_client_alloc_fid(const struct lu_env *env, + struct lu_client_seq *seq, struct lu_fid *fid) +{ + int rc; + ENTRY; + + LASSERT(seq != NULL); + LASSERT(fid != NULL); + + mutex_lock(&seq->lcs_mutex); + + if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_EXHAUST)) + seq->lcs_fid.f_oid = seq->lcs_width; + + if (unlikely(!fid_is_zero(&seq->lcs_fid) && + fid_oid(&seq->lcs_fid) < seq->lcs_width)) { + /* Just bump last allocated fid and return to caller. */ + seq->lcs_fid.f_oid++; + rc = 0; + } else { + u64 seqnr; + + rc = seq_client_alloc_seq(env, seq, &seqnr); + if (rc) { + if (rc != -EINPROGRESS) + CERROR("%s: Can't allocate new sequence: rc = %d\n", + seq->lcs_name, rc); + } else { + CDEBUG(D_INFO, "%s: New sequence [0x%16.16llx]\n", + seq->lcs_name, seqnr); + + seq->lcs_fid.f_seq = seqnr; + seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID; + seq->lcs_fid.f_ver = 0; + rc = 1; + } + } + + if (rc >= 0) { + *fid = seq->lcs_fid; + CDEBUG(D_INFO, "%s: Allocated FID "DFID"\n", seq->lcs_name, + PFID(fid)); + } + mutex_unlock(&seq->lcs_mutex); + + RETURN(rc); +} +EXPORT_SYMBOL(seq_client_alloc_fid); + +/* + * Finish the current sequence due to disconnect. + * See mdc_import_event() + */ +void seq_client_flush(struct lu_client_seq *seq) +{ + LASSERT(seq != NULL); + mutex_lock(&seq->lcs_mutex); + + fid_zero(&seq->lcs_fid); + /** + * this id shld not be used for seq range allocation. + * set to -1 for dgb check. + */ + seq->lcs_space.lsr_index = -1; + + lu_seq_range_init(&seq->lcs_space); + mutex_unlock(&seq->lcs_mutex); +} +EXPORT_SYMBOL(seq_client_flush); + +static void seq_client_debugfs_fini(struct lu_client_seq *seq) +{ + debugfs_remove_recursive(seq->lcs_debugfs_entry); +} + +static void seq_client_debugfs_init(struct lu_client_seq *seq) +{ + seq->lcs_debugfs_entry = debugfs_create_dir(seq->lcs_name, + seq_debugfs_dir); + + ldebugfs_add_vars(seq->lcs_debugfs_entry, + seq_client_debugfs_list, seq); +} + +void seq_client_fini(struct lu_client_seq *seq) +{ + ENTRY; + + seq_client_debugfs_fini(seq); + + if (seq->lcs_exp) { + class_export_put(seq->lcs_exp); + seq->lcs_exp = NULL; + } + + seq->lcs_srv = NULL; + EXIT; +} +EXPORT_SYMBOL(seq_client_fini); + +void seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv) +{ + ENTRY; + + LASSERT(seq != NULL); + LASSERT(prefix != NULL); + + seq->lcs_srv = srv; + seq->lcs_type = type; + + mutex_init(&seq->lcs_mutex); + if (type == LUSTRE_SEQ_METADATA) + seq->lcs_width = LUSTRE_METADATA_SEQ_MAX_WIDTH; + else + seq->lcs_width = LUSTRE_DATA_SEQ_MAX_WIDTH; + + /* Make sure that things are clear before work is started. 
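seq_client_alloc_fid() above either bumps the object id within the current sequence or switches to a freshly allocated sequence and reports the switch to its caller. The sketch below models just that decision; the width, initial oid, and the stand-in sequence allocator are illustrative values, not the Lustre constants.

```c
/*
 * Model of the FID allocation fast/slow path: bump the object id while
 * the current sequence still has room, otherwise switch to a new
 * sequence and report the switch (return 1) so the caller can update
 * its location database.  All values are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

struct fid_sketch { uint64_t f_seq; uint32_t f_oid; uint32_t f_ver; };

#define SKETCH_FID_INIT_OID	1u
#define SKETCH_SEQ_WIDTH	4u	/* tiny width to force switches */

static uint64_t next_seq = 0x200000400ULL;	/* pretend allocator */

static int alloc_fid(struct fid_sketch *cur, struct fid_sketch *out)
{
	int switched = 0;

	if (cur->f_seq != 0 && cur->f_oid < SKETCH_SEQ_WIDTH) {
		cur->f_oid++;			/* fast path: same sequence */
	} else {
		cur->f_seq = next_seq++;	/* slow path: new sequence */
		cur->f_oid = SKETCH_FID_INIT_OID;
		cur->f_ver = 0;
		switched = 1;
	}
	*out = *cur;
	return switched;
}

int main(void)
{
	struct fid_sketch cur = { 0, 0, 0 }, fid;
	int i;

	for (i = 0; i < 10; i++) {
		int sw = alloc_fid(&cur, &fid);

		printf("[%#llx:0x%x:0x%x]%s\n",
		       (unsigned long long)fid.f_seq, (unsigned)fid.f_oid,
		       (unsigned)fid.f_ver, sw ? " (new sequence)" : "");
	}
	return 0;
}
```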
*/ + seq_client_flush(seq); + + if (exp) + seq->lcs_exp = class_export_get(exp); + + snprintf(seq->lcs_name, sizeof(seq->lcs_name), + "cli-%s", prefix); + + seq_client_debugfs_init(seq); +} +EXPORT_SYMBOL(seq_client_init); + +int client_fid_init(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type) +{ + struct client_obd *cli = &obd->u.cli; + char *prefix; + int rc = 0; + ENTRY; + + down_write(&cli->cl_seq_rwsem); + OBD_ALLOC_PTR(cli->cl_seq); + if (!cli->cl_seq) + GOTO(out, rc = -ENOMEM); + + OBD_ALLOC(prefix, MAX_OBD_NAME + 5); + if (!prefix) + GOTO(out, rc = -ENOMEM); + + snprintf(prefix, MAX_OBD_NAME + 5, "cli-%s", obd->obd_name); + + /* Init client side sequence-manager */ + seq_client_init(cli->cl_seq, exp, type, prefix, NULL); + OBD_FREE(prefix, MAX_OBD_NAME + 5); + +out: + if (rc && cli->cl_seq) { + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + } + up_write(&cli->cl_seq_rwsem); + + RETURN(rc); +} +EXPORT_SYMBOL(client_fid_init); + +int client_fid_fini(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + ENTRY; + + down_write(&cli->cl_seq_rwsem); + if (cli->cl_seq) { + seq_client_fini(cli->cl_seq); + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + } + up_write(&cli->cl_seq_rwsem); + + RETURN(0); +} +EXPORT_SYMBOL(client_fid_fini); + +static int __init fid_init(void) +{ + struct dentry *de; +#ifdef HAVE_SERVER_SUPPORT + int rc = fid_server_mod_init(); + + if (rc) + return rc; +#endif + de = debugfs_create_dir(LUSTRE_SEQ_NAME, + debugfs_lustre_root); + if (!IS_ERR(de)) + seq_debugfs_dir = de; + return PTR_ERR_OR_ZERO(de); +} + +static void __exit fid_exit(void) +{ +# ifdef HAVE_SERVER_SUPPORT + fid_server_mod_exit(); +# endif + debugfs_remove_recursive(seq_debugfs_dir); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre File IDentifier"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(fid_init); +module_exit(fid_exit); diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_store.c b/drivers/staging/lustrefsx/lustre/fid/fid_store.c new file mode 100644 index 0000000000000..e73e8498ece59 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_store.c @@ -0,0 +1,249 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fid/fid_store.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include "fid_internal.h" + +static struct lu_buf *seq_store_buf(struct seq_thread_info *info) +{ + struct lu_buf *buf; + + buf = &info->sti_buf; + buf->lb_buf = &info->sti_space; + buf->lb_len = sizeof(info->sti_space); + return buf; +} + +struct seq_update_callback { + struct dt_txn_commit_cb suc_cb; + struct lu_server_seq *suc_seq; +}; + +void seq_update_cb(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct seq_update_callback *ccb; + + ccb = container_of(cb, struct seq_update_callback, suc_cb); + + LASSERT(ccb->suc_seq != NULL); + + ccb->suc_seq->lss_need_sync = 0; + OBD_FREE_PTR(ccb); +} + +int seq_update_cb_add(struct thandle *th, struct lu_server_seq *seq) +{ + struct seq_update_callback *ccb; + struct dt_txn_commit_cb *dcb; + int rc; + + OBD_ALLOC_PTR(ccb); + if (!ccb) + return -ENOMEM; + + ccb->suc_seq = seq; + seq->lss_need_sync = 1; + + dcb = &ccb->suc_cb; + dcb->dcb_func = seq_update_cb; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "seq_update_cb", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) + OBD_FREE_PTR(ccb); + return rc; +} + +/* This function implies that caller takes care about locking. */ +int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq, + struct lu_seq_range *out, int sync) +{ + struct dt_device *dt_dev = lu2dt_dev(seq->lss_obj->do_lu.lo_dev); + struct seq_thread_info *info; + struct thandle *th; + loff_t pos = 0; + int rc; + + if (dt_dev->dd_rdonly) + RETURN(0); + + info = lu_context_key_get(&env->le_ctx, &seq_thread_key); + LASSERT(info != NULL); + + th = dt_trans_create(env, dt_dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + /* Store ranges in le format. */ + range_cpu_to_le(&info->sti_space, &seq->lss_space); + + rc = dt_declare_record_write(env, seq->lss_obj, + seq_store_buf(info), 0, th); + if (rc) + GOTO(exit, rc); + + if (out) { + rc = fld_declare_server_create(env, + seq->lss_site->ss_server_fld, + out, th); + if (rc) + GOTO(exit, rc); + } + + rc = dt_trans_start_local(env, dt_dev, th); + if (rc) + GOTO(exit, rc); + + rc = dt_record_write(env, seq->lss_obj, seq_store_buf(info), &pos, th); + if (rc) { + CERROR("%s: Can't write space data, rc %d\n", + seq->lss_name, rc); + GOTO(exit, rc); + } else if (out) { + rc = fld_server_create(env, seq->lss_site->ss_server_fld, out, + th); + if (rc) { + CERROR("%s: Can't Update fld database, rc %d\n", + seq->lss_name, rc); + GOTO(exit, rc); + } + } + /* + * next sequence update will need sync until this update is committed + * in case of sync operation this is not needed obviously + */ + if (!sync) + /* if callback can't be added then sync always */ + sync = !!seq_update_cb_add(th, seq); + + th->th_sync |= sync; +exit: + dt_trans_stop(env, dt_dev, th); + return rc; +} + +/* + * This function implies that caller takes care about locking or locking is not + * needed (init time). 
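The end of seq_store_update() above chooses between a synchronous and an asynchronous transaction: asynchronous writes register a commit callback, and the write only falls back to sync when that registration fails or the caller demanded sync. A boiled-down model of that decision, with the callback machinery faked, is:

```c
/*
 * Model of the sync-vs-async decision at the tail of the store update:
 * try to register a commit callback; if that fails (or the caller asked
 * for sync), force a synchronous transaction.  The callback itself is
 * simulated by a flag.
 */
#include <stdbool.h>
#include <stdio.h>

struct seq_state { bool need_sync; };

/* pretend to register a commit callback; nonzero means it failed */
static int add_commit_cb(struct seq_state *seq, bool fail)
{
	if (fail)
		return -1;
	seq->need_sync = true;	/* cleared later by the commit callback */
	return 0;
}

static bool decide_sync(struct seq_state *seq, bool caller_wants_sync,
			bool cb_fails)
{
	bool sync = caller_wants_sync;

	if (!sync)
		sync = add_commit_cb(seq, cb_fails) != 0;
	return sync;
}

int main(void)
{
	struct seq_state seq = { false };

	printf("async ok  -> sync=%d\n", decide_sync(&seq, false, false));
	printf("cb failed -> sync=%d\n", decide_sync(&seq, false, true));
	printf("forced    -> sync=%d\n", decide_sync(&seq, true, false));
	return 0;
}
```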
+ */ +int seq_store_read(struct lu_server_seq *seq, + const struct lu_env *env) +{ + struct seq_thread_info *info; + loff_t pos = 0; + int rc; + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &seq_thread_key); + LASSERT(info != NULL); + + rc = dt_read(env, seq->lss_obj, seq_store_buf(info), &pos); + + if (rc == sizeof(info->sti_space)) { + range_le_to_cpu(&seq->lss_space, &info->sti_space); + CDEBUG(D_INFO, "%s: Space - "DRANGE"\n", + seq->lss_name, PRANGE(&seq->lss_space)); + rc = 0; + } else if (rc == 0) { + rc = -ENODATA; + } else if (rc > 0) { + CERROR("%s: Read only %d bytes of %d\n", seq->lss_name, + rc, (int)sizeof(info->sti_space)); + rc = -EIO; + } + + RETURN(rc); +} + +int seq_store_init(struct lu_server_seq *seq, + const struct lu_env *env, + struct dt_device *dt) +{ + struct dt_object *dt_obj; + struct lu_fid fid; + struct lu_attr attr; + struct dt_object_format dof; + const char *name; + int rc; + ENTRY; + + name = seq->lss_type == LUSTRE_SEQ_SERVER ? + LUSTRE_SEQ_SRV_NAME : LUSTRE_SEQ_CTL_NAME; + + if (seq->lss_type == LUSTRE_SEQ_SERVER) + lu_local_obj_fid(&fid, FID_SEQ_SRV_OID); + else + lu_local_obj_fid(&fid, FID_SEQ_CTL_OID); + + memset(&attr, 0, sizeof(attr)); + attr.la_valid = LA_MODE; + attr.la_mode = S_IFREG | 0666; + dof.dof_type = DFT_REGULAR; + + dt_obj = dt_find_or_create(env, dt, &fid, &dof, &attr); + if (!IS_ERR(dt_obj)) { + seq->lss_obj = dt_obj; + rc = 0; + } else { + CERROR("%s: Can't find \"%s\" obj %d\n", + seq->lss_name, name, (int)PTR_ERR(dt_obj)); + rc = PTR_ERR(dt_obj); + } + + RETURN(rc); +} + +void seq_store_fini(struct lu_server_seq *seq, const struct lu_env *env) +{ + ENTRY; + + if (seq->lss_obj) { + if (!IS_ERR(seq->lss_obj)) + dt_object_put(env, seq->lss_obj); + seq->lss_obj = NULL; + } + + EXIT; +} diff --git a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c new file mode 100644 index 0000000000000..f4d9b6a8e0861 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c @@ -0,0 +1,635 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. 
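seq_store_read() above maps the raw read length onto three outcomes: a complete record, no record yet, or a truncated record. The same mapping in isolation, with an assumed record size standing in for sizeof(info->sti_space):

```c
/*
 * Interpretation of the read result used when loading the saved
 * sequence range: full record -> 0, empty file -> -ENODATA (first
 * start, use defaults), partial record -> -EIO, otherwise propagate
 * the read error.  RECORD_SIZE is an assumed placeholder.
 */
#include <errno.h>
#include <stdio.h>

#define RECORD_SIZE 24	/* stand-in for sizeof(info->sti_space) */

static int interpret_read(int nread)
{
	if (nread == RECORD_SIZE)
		return 0;		/* stored range loaded */
	if (nread == 0)
		return -ENODATA;	/* nothing stored yet: use defaults */
	if (nread > 0)
		return -EIO;		/* truncated record: refuse to guess */
	return nread;			/* propagate the read error */
}

int main(void)
{
	printf("%d %d %d %d\n", interpret_read(RECORD_SIZE),
	       interpret_read(0), interpret_read(7), interpret_read(-EIO));
	return 0;
}
```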
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fid/lproc_fid.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include +#include +#include "fid_internal.h" + +/* Format: [0x64BIT_INT - 0x64BIT_INT] + 32 bytes just in case */ +#define MAX_FID_RANGE_STRLEN (32 + 2 * 2 * sizeof(__u64)) +/** + * Reduce the SEQ range allocated to a node to a strict subset of the range + * currently-allocated SEQ range. If the specified range is "clear", then + * drop all allocated sequences and request a new one from the master. + * + * Note: this function should only be used for testing, it is not necessarily + * safe for production use. + */ +static int +ldebugfs_fid_write_common(const char __user *buffer, size_t count, + struct lu_seq_range *range) +{ + char kernbuf[MAX_FID_RANGE_STRLEN]; + struct lu_seq_range tmp = { + .lsr_start = 0, + }; + int rc; + + ENTRY; + LASSERT(range); + + if (count >= sizeof(kernbuf)) + RETURN(-EINVAL); + + if (copy_from_user(kernbuf, buffer, count)) + RETURN(-EFAULT); + + kernbuf[count] = 0; + + if (count == 5 && strcmp(kernbuf, "clear") == 0) { + memset(range, 0, sizeof(*range)); + RETURN(count); + } + + /* of the form "[0x0000000240000400 - 0x000000028000400]" */ + rc = sscanf(kernbuf, "[%llx - %llx]\n", + (unsigned long long *)&tmp.lsr_start, + (unsigned long long *)&tmp.lsr_end); + if (rc != 2) + RETURN(-EINVAL); + if (!lu_seq_range_is_sane(&tmp) || lu_seq_range_is_zero(&tmp) || + tmp.lsr_start < range->lsr_start || tmp.lsr_end > range->lsr_end) + RETURN(-EINVAL); + *range = tmp; + RETURN(0); +} + +#ifdef HAVE_SERVER_SUPPORT +/* + * Server side debugfs stuff. + */ +static ssize_t +ldebugfs_server_fid_space_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct lu_server_seq *seq = m->private; + int rc; + + ENTRY; + + mutex_lock(&seq->lss_mutex); + rc = ldebugfs_fid_write_common(buffer, count, &seq->lss_space); + if (rc == 0) { + CDEBUG(D_INFO, "%s: Space: " DRANGE "\n", + seq->lss_name, PRANGE(&seq->lss_space)); + } + mutex_unlock(&seq->lss_mutex); + + RETURN(count); +} + +static int +ldebugfs_server_fid_space_seq_show(struct seq_file *m, void *unused) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)m->private; + ENTRY; + + mutex_lock(&seq->lss_mutex); + seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lss_space)); + mutex_unlock(&seq->lss_mutex); + + RETURN(0); +} + +static int +ldebugfs_server_fid_server_seq_show(struct seq_file *m, void *unused) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)m->private; + struct client_obd *cli; + ENTRY; + + if (seq->lss_cli) { + if (seq->lss_cli->lcs_exp != NULL) { + cli = &seq->lss_cli->lcs_exp->exp_obd->u.cli; + seq_printf(m, "%s\n", cli->cl_target_uuid.uuid); + } else { + seq_printf(m, "%s\n", seq->lss_cli->lcs_srv->lss_name); + } + } else { + seq_puts(m, "\n"); + } + + RETURN(0); +} + +static ssize_t ldebugfs_server_fid_width_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct lu_server_seq *seq = m->private; + int rc; + + ENTRY; + mutex_lock(&seq->lss_mutex); + + rc = kstrtoull_from_user(buffer, count, 0, &seq->lss_width); + if (rc) { + CERROR("%s: invalid FID sequence width: rc = %d\n", + seq->lss_name, rc); + GOTO(out_unlock, count = rc); + } + + CDEBUG(D_INFO, "%s: Width: %llu\n", + seq->lss_name, 
seq->lss_width); +out_unlock: + mutex_unlock(&seq->lss_mutex); + + RETURN(count); +} + +static int +ldebugfs_server_fid_width_seq_show(struct seq_file *m, void *unused) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)m->private; + + ENTRY; + mutex_lock(&seq->lss_mutex); + seq_printf(m, "%llu\n", seq->lss_width); + mutex_unlock(&seq->lss_mutex); + + RETURN(0); +} + +LDEBUGFS_SEQ_FOPS(ldebugfs_server_fid_space); +LDEBUGFS_SEQ_FOPS(ldebugfs_server_fid_width); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_server_fid_server); + +struct ldebugfs_vars seq_server_debugfs_list[] = { + { .name = "space", + .fops = &ldebugfs_server_fid_space_fops }, + { .name = "width", + .fops = &ldebugfs_server_fid_width_fops }, + { .name = "server", + .fops = &ldebugfs_server_fid_server_fops}, + { NULL } +}; + +struct fld_seq_param { + struct lu_env fsp_env; + struct dt_it *fsp_it; + struct lu_server_fld *fsp_fld; + struct lu_server_seq *fsp_seq; + unsigned int fsp_stop:1; +}; + +/* + * XXX: below is a copy of the functions in lustre/fld/lproc_fld.c. + * we want to avoid this duplication either by exporting the + * functions or merging fid and fld into a single module. + */ +static void *fldb_seq_start(struct seq_file *p, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct dt_key *key; + int rc; + + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->load(¶m->fsp_env, param->fsp_it, *pos); + if (rc <= 0) + return NULL; + + key = iops->key(¶m->fsp_env, param->fsp_it); + if (IS_ERR(key)) + return NULL; + + *pos = be64_to_cpu(*(__u64 *)key); + + return param; +} + +static void fldb_seq_stop(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + const struct dt_it_ops *iops; + struct lu_server_fld *fld; + struct dt_object *obj; + + if (param == NULL) + return; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + iops->put(¶m->fsp_env, param->fsp_it); +} + +static void *fldb_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + int rc; + + ++*pos; + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->next(¶m->fsp_env, param->fsp_it); + if (rc > 0) { + param->fsp_stop = 1; + return NULL; + } + + *pos = be64_to_cpu(*(__u64 *)iops->key(¶m->fsp_env, param->fsp_it)); + return param; +} + +static int fldb_seq_show(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct lu_seq_range fld_rec; + int rc; + + if (param == NULL || param->fsp_stop) + return 0; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->rec(¶m->fsp_env, param->fsp_it, + (struct dt_rec *)&fld_rec, 0); + if (rc != 0) { + CERROR("%s: read record error: rc = %d\n", + fld->lsf_name, rc); + } else if (fld_rec.lsr_start != 0) { + range_be_to_cpu(&fld_rec, &fld_rec); + seq_printf(p, DRANGE"\n", PRANGE(&fld_rec)); + } + + return rc; +} + +static const struct seq_operations fldb_sops = { + .start = fldb_seq_start, + .stop = 
fldb_seq_stop, + .next = fldb_seq_next, + .show = fldb_seq_show, +}; + +static int fldb_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct lu_server_seq *ss = inode->i_private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct fld_seq_param *param = NULL; + int env_init = 0; + int rc; + + fld = ss->lss_site->ss_server_fld; + LASSERT(fld != NULL); + + rc = seq_open(file, &fldb_sops); + if (rc) + return rc; + + obj = fld->lsf_obj; + if (obj == NULL) { + seq = file->private_data; + seq->private = NULL; + return 0; + } + + OBD_ALLOC_PTR(param); + if (param == NULL) + GOTO(out, rc = -ENOMEM); + + rc = lu_env_init(¶m->fsp_env, LCT_MD_THREAD); + if (rc != 0) + GOTO(out, rc); + + env_init = 1; + iops = &obj->do_index_ops->dio_it; + param->fsp_it = iops->init(¶m->fsp_env, obj, 0); + if (IS_ERR(param->fsp_it)) + GOTO(out, rc = PTR_ERR(param->fsp_it)); + + param->fsp_fld = fld; + param->fsp_seq = ss; + param->fsp_stop = 0; + + seq = file->private_data; + seq->private = param; +out: + if (rc != 0) { + if (env_init == 1) + lu_env_fini(¶m->fsp_env); + if (param != NULL) + OBD_FREE_PTR(param); + } + return rc; +} + +static int fldb_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct fld_seq_param *param; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + + param = seq->private; + if (param == NULL) { + seq_release(inode, file); + return 0; + } + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + LASSERT(iops != NULL); + LASSERT(param->fsp_it != NULL); + iops->fini(¶m->fsp_env, param->fsp_it); + lu_env_fini(¶m->fsp_env); + OBD_FREE_PTR(param); + seq_release(inode, file); + + return 0; +} + +static ssize_t fldb_seq_write(struct file *file, const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct fld_seq_param *param; + struct lu_seq_range range; + int rc = 0; + char _buffer[MAX_FID_RANGE_STRLEN]; + char *buffer = _buffer; + char *tmp; + ENTRY; + + param = seq->private; + if (param == NULL) + RETURN(-EINVAL); + + if (len >= sizeof(_buffer)) + RETURN(-EINVAL); + + if (copy_from_user(buffer, buf, len)) + GOTO(out, rc = -EFAULT); + buffer[len] = 0; + + /* + * format - [0x0000000200000007-0x0000000200000008):0:mdt + */ + if (*buffer != '[') + GOTO(out, rc = -EINVAL); + buffer++; + + tmp = strchr(buffer, '-'); + if (!tmp) + GOTO(out, rc = -EINVAL); + *tmp++ = '\0'; + rc = kstrtoull(buffer, 0, &range.lsr_start); + if (rc) + GOTO(out, rc); + buffer = tmp; + + tmp = strchr(buffer, ')'); + if (!tmp) + GOTO(out, rc = -EINVAL); + *tmp++ = '\0'; + rc = kstrtoull(buffer, 0, &range.lsr_end); + if (rc) + GOTO(out, rc); + buffer = tmp; + + if (*buffer != ':') + GOTO(out, rc = -EINVAL); + buffer++; + + tmp = strchr(buffer, ':'); + if (!tmp) + GOTO(out, rc = -EINVAL); + *tmp++ = '\0'; + rc = kstrtouint(buffer, 0, &range.lsr_index); + if (rc) + GOTO(out, rc); + buffer = tmp; + + if (strncmp(buffer, "mdt", 3) == 0) + range.lsr_flags = LU_SEQ_RANGE_MDT; + else if (strncmp(buffer, "ost", 3) == 0) + range.lsr_flags = LU_SEQ_RANGE_OST; + else + GOTO(out, rc = -EINVAL); + + rc = seq_server_alloc_spec(param->fsp_seq->lss_site->ss_control_seq, + &range, ¶m->fsp_env); + +out: + RETURN(rc < 0 ? 
rc : len); +} + +const struct file_operations seq_fld_debugfs_seq_fops = { + .owner = THIS_MODULE, + .open = fldb_seq_open, + .read = seq_read, + .write = fldb_seq_write, + .release = fldb_seq_release, +}; + +#endif /* HAVE_SERVER_SUPPORT */ + +/* Client side debugfs stuff */ +static ssize_t +ldebugfs_client_fid_space_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct lu_client_seq *seq = m->private; + int rc; + + ENTRY; + + mutex_lock(&seq->lcs_mutex); + rc = ldebugfs_fid_write_common(buffer, count, &seq->lcs_space); + if (rc == 0) { + CDEBUG(D_INFO, "%s: Space: " DRANGE "\n", + seq->lcs_name, PRANGE(&seq->lcs_space)); + } + + mutex_unlock(&seq->lcs_mutex); + + RETURN(count); +} + +static int ldebugfs_client_fid_space_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + ENTRY; + mutex_lock(&seq->lcs_mutex); + seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lcs_space)); + mutex_unlock(&seq->lcs_mutex); + + RETURN(0); +} + +static ssize_t ldebugfs_client_fid_width_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct lu_client_seq *seq = m->private; + u64 val; + u64 max; + int rc; + + ENTRY; + rc = kstrtoull_from_user(buffer, count, 0, &val); + if (rc) + return rc; + + mutex_lock(&seq->lcs_mutex); + if (seq->lcs_type == LUSTRE_SEQ_DATA) + max = LUSTRE_DATA_SEQ_MAX_WIDTH; + else + max = LUSTRE_METADATA_SEQ_MAX_WIDTH; + + if (val <= max) { + seq->lcs_width = val; + + CDEBUG(D_INFO, "%s: Sequence size: %llu\n", seq->lcs_name, + seq->lcs_width); + } else { + count = -ERANGE; + } + + mutex_unlock(&seq->lcs_mutex); + RETURN(count); +} + +static int +ldebugfs_client_fid_width_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + ENTRY; + mutex_lock(&seq->lcs_mutex); + seq_printf(m, "%llu\n", seq->lcs_width); + mutex_unlock(&seq->lcs_mutex); + + RETURN(0); +} + +static int +ldebugfs_client_fid_fid_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + ENTRY; + mutex_lock(&seq->lcs_mutex); + seq_printf(m, DFID"\n", PFID(&seq->lcs_fid)); + mutex_unlock(&seq->lcs_mutex); + + RETURN(0); +} + +static int +ldebugfs_client_fid_server_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + struct client_obd *cli; + ENTRY; + + if (seq->lcs_exp) { + cli = &seq->lcs_exp->exp_obd->u.cli; + seq_printf(m, "%s\n", cli->cl_target_uuid.uuid); +#ifdef HAVE_SERVER_SUPPORT + } else { + seq_printf(m, "%s\n", seq->lcs_srv->lss_name); +#endif /* HAVE_SERVER_SUPPORT */ + } + + RETURN(0); +} + +LDEBUGFS_SEQ_FOPS(ldebugfs_client_fid_space); +LDEBUGFS_SEQ_FOPS(ldebugfs_client_fid_width); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_client_fid_server); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_client_fid_fid); + +struct ldebugfs_vars seq_client_debugfs_list[] = { + { .name = "space", + .fops = &ldebugfs_client_fid_space_fops }, + { .name = "width", + .fops = &ldebugfs_client_fid_width_fops }, + { .name = "server", + .fops = &ldebugfs_client_fid_server_fops}, + { .name = "fid", + .fops = &ldebugfs_client_fid_fid_fops }, + { NULL } +}; diff --git a/drivers/staging/lustrefsx/lustre/fld/Makefile b/drivers/staging/lustrefsx/lustre/fld/Makefile new file mode 100644 index 0000000000000..722c19fe30409 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/fld/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += fld.o + +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/include + +fld-y := fld_request.o fld_cache.o lproc_fld.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules + diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_cache.c b/drivers/staging/lustrefsx/lustre/fld/fld_cache.c new file mode 100644 index 0000000000000..e77df9652141e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_cache.c @@ -0,0 +1,492 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fld/fld_cache.c + * + * FLD (Fids Location Database) + * + * Author: Pravin Shelar + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include +#include +#include +#include +#include "fld_internal.h" + +/** + * create fld cache. + */ +struct fld_cache *fld_cache_init(const char *name, int cache_size, + int cache_threshold) +{ + struct fld_cache *cache; + + ENTRY; + + LASSERT(name != NULL); + LASSERT(cache_threshold < cache_size); + + OBD_ALLOC_PTR(cache); + if (cache == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + INIT_LIST_HEAD(&cache->fci_entries_head); + INIT_LIST_HEAD(&cache->fci_lru); + + cache->fci_cache_count = 0; + rwlock_init(&cache->fci_lock); + + strlcpy(cache->fci_name, name, sizeof(cache->fci_name)); + + cache->fci_cache_size = cache_size; + cache->fci_threshold = cache_threshold; + + /* Init fld cache info. */ + memset(&cache->fci_stat, 0, sizeof(cache->fci_stat)); + + CDEBUG(D_INFO, "%s: FLD cache - Size: %d, Threshold: %d\n", + cache->fci_name, cache_size, cache_threshold); + + RETURN(cache); +} + +/** + * destroy fld cache. + */ +void fld_cache_fini(struct fld_cache *cache) +{ + LASSERT(cache != NULL); + fld_cache_flush(cache); + + CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); + CDEBUG(D_INFO, " Cache reqs: %llu\n", cache->fci_stat.fst_cache); + CDEBUG(D_INFO, " Total reqs: %llu\n", cache->fci_stat.fst_count); + + OBD_FREE_PTR(cache); +} + +/** + * delete given node from list. + */ +static void fld_cache_entry_delete(struct fld_cache *cache, + struct fld_cache_entry *node) +{ + list_del(&node->fce_list); + list_del(&node->fce_lru); + cache->fci_cache_count--; + OBD_FREE_PTR(node); +} + +/** + * fix list by checking new entry with NEXT entry in order. 
+ */ +static void fld_fix_new_list(struct fld_cache *cache) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *f_next; + struct lu_seq_range *c_range; + struct lu_seq_range *n_range; + struct list_head *head = &cache->fci_entries_head; + + ENTRY; + +restart_fixup: + + list_for_each_entry_safe(f_curr, f_next, head, fce_list) { + c_range = &f_curr->fce_range; + n_range = &f_next->fce_range; + + LASSERT(lu_seq_range_is_sane(c_range)); + if (&f_next->fce_list == head) + break; + + if (c_range->lsr_flags != n_range->lsr_flags) + continue; + + LASSERTF(c_range->lsr_start <= n_range->lsr_start, + "cur lsr_start "DRANGE" next lsr_start "DRANGE"\n", + PRANGE(c_range), PRANGE(n_range)); + + /* check merge possibility with next range */ + if (c_range->lsr_end == n_range->lsr_start) { + if (c_range->lsr_index != n_range->lsr_index) + continue; + n_range->lsr_start = c_range->lsr_start; + fld_cache_entry_delete(cache, f_curr); + continue; + } + + /* check if current range overlaps with next range. */ + if (n_range->lsr_start < c_range->lsr_end) { + if (c_range->lsr_index == n_range->lsr_index) { + n_range->lsr_start = c_range->lsr_start; + n_range->lsr_end = max(c_range->lsr_end, + n_range->lsr_end); + fld_cache_entry_delete(cache, f_curr); + } else { + if (n_range->lsr_end <= c_range->lsr_end) { + *n_range = *c_range; + fld_cache_entry_delete(cache, f_curr); + } else + n_range->lsr_start = c_range->lsr_end; + } + + /* we could have overlap over next + * range too. better restart. + */ + goto restart_fixup; + } + + /* kill duplicates */ + if (c_range->lsr_start == n_range->lsr_start && + c_range->lsr_end == n_range->lsr_end) + fld_cache_entry_delete(cache, f_curr); + } + + EXIT; +} + +/** + * add node to fld cache + */ +static inline void fld_cache_entry_add(struct fld_cache *cache, + struct fld_cache_entry *f_new, + struct list_head *pos) +{ + list_add(&f_new->fce_list, pos); + list_add(&f_new->fce_lru, &cache->fci_lru); + + cache->fci_cache_count++; + fld_fix_new_list(cache); +} + +/** + * Check if cache needs to be shrunk. If so - do it. + * Remove one entry in list and so on until cache is shrunk enough. + */ +static int fld_cache_shrink(struct fld_cache *cache) +{ + int num = 0; + + ENTRY; + + LASSERT(cache != NULL); + + if (cache->fci_cache_count < cache->fci_cache_size) + RETURN(0); + + while (cache->fci_cache_count + cache->fci_threshold > + cache->fci_cache_size && + !list_empty(&cache->fci_lru)) { + struct fld_cache_entry *flde = + list_last_entry(&cache->fci_lru, struct fld_cache_entry, + fce_lru); + + fld_cache_entry_delete(cache, flde); + num++; + } + + CDEBUG(D_INFO, "%s: FLD cache - Shrunk by %d entries\n", + cache->fci_name, num); + + RETURN(0); +} + +/** + * kill all fld cache entries. + */ +void fld_cache_flush(struct fld_cache *cache) +{ + ENTRY; + + write_lock(&cache->fci_lock); + cache->fci_cache_size = 0; + fld_cache_shrink(cache); + write_unlock(&cache->fci_lock); + + EXIT; +} + +/** + * punch hole in existing range. divide this range and add new + * entry accordingly. + */ + +static void fld_cache_punch_hole(struct fld_cache *cache, + struct fld_cache_entry *f_curr, + struct fld_cache_entry *f_new) +{ + const struct lu_seq_range *range = &f_new->fce_range; + const u64 new_start = range->lsr_start; + const u64 new_end = range->lsr_end; + struct fld_cache_entry *fldt; + + ENTRY; + OBD_ALLOC_GFP(fldt, sizeof(*fldt), GFP_ATOMIC); + if (!fldt) { + OBD_FREE_PTR(f_new); + EXIT; + /* overlap is not allowed, so dont mess up list. 
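The shrink policy in fld_cache_shrink() above keeps evicting from the LRU tail until the entry count has dropped to at least the threshold below the configured size. The following is a minimal standalone sketch of that bookkeeping in plain userspace C; the names are invented here and the real code frees list entries rather than decrementing a counter.

#include <stdio.h>

/* Illustrative stand-in for the cache bookkeeping fields. */
struct toy_cache {
        int count;      /* current number of entries    */
        int size;       /* preferred maximum            */
        int threshold;  /* hysteresis, e.g. 10% of size */
};

/* Evict (here: just count down) entries until count + threshold <= size. */
static int toy_shrink(struct toy_cache *c)
{
        int evicted = 0;

        if (c->count < c->size)
                return 0;               /* nothing to do yet */

        while (c->count + c->threshold > c->size && c->count > 0) {
                c->count--;             /* real code drops the LRU tail entry */
                evicted++;
        }
        return evicted;
}

int main(void)
{
        struct toy_cache c = { .count = 100, .size = 100, .threshold = 10 };

        printf("evicted %d, left %d\n", toy_shrink(&c), c.count);
        /* prints: evicted 10, left 90 */
        return 0;
}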
*/ + return; + } + /* break f_curr RANGE into three RANGES: + * f_curr, f_new , fldt + */ + + /* fldt */ + fldt->fce_range.lsr_start = new_end; + fldt->fce_range.lsr_end = f_curr->fce_range.lsr_end; + fldt->fce_range.lsr_index = f_curr->fce_range.lsr_index; + + /* f_curr */ + f_curr->fce_range.lsr_end = new_start; + + /* add these two entries to list */ + fld_cache_entry_add(cache, f_new, &f_curr->fce_list); + fld_cache_entry_add(cache, fldt, &f_new->fce_list); + + /* no need to fixup */ + EXIT; +} + +/** + * handle range overlap in fld cache. + */ +static void fld_cache_overlap_handle(struct fld_cache *cache, + struct fld_cache_entry *f_curr, + struct fld_cache_entry *f_new) +{ + const struct lu_seq_range *range = &f_new->fce_range; + const u64 new_start = range->lsr_start; + const u64 new_end = range->lsr_end; + const u32 mdt = range->lsr_index; + + /* this is overlap case, these case are checking overlapping with + * prev range only. fixup will handle overlaping with next range. + */ + + if (f_curr->fce_range.lsr_index == mdt) { + f_curr->fce_range.lsr_start = min(f_curr->fce_range.lsr_start, + new_start); + + f_curr->fce_range.lsr_end = max(f_curr->fce_range.lsr_end, + new_end); + + OBD_FREE_PTR(f_new); + fld_fix_new_list(cache); + + } else if (new_start <= f_curr->fce_range.lsr_start && + f_curr->fce_range.lsr_end <= new_end) { + /* case 1: new range completely overshadowed existing range. + * e.g. whole range migrated. update fld cache entry + */ + + f_curr->fce_range = *range; + OBD_FREE_PTR(f_new); + fld_fix_new_list(cache); + + } else if (f_curr->fce_range.lsr_start < new_start && + new_end < f_curr->fce_range.lsr_end) { + /* case 2: new range fit within existing range. */ + + fld_cache_punch_hole(cache, f_curr, f_new); + + } else if (new_end <= f_curr->fce_range.lsr_end) { + /* case 3: overlap: + * [new_start [c_start new_end) c_end) + */ + + LASSERT(new_start <= f_curr->fce_range.lsr_start); + + f_curr->fce_range.lsr_start = new_end; + fld_cache_entry_add(cache, f_new, f_curr->fce_list.prev); + + } else if (f_curr->fce_range.lsr_start <= new_start) { + /* case 4: overlap: + * [c_start [new_start c_end) new_end) + */ + + LASSERT(f_curr->fce_range.lsr_end <= new_end); + + f_curr->fce_range.lsr_end = new_start; + fld_cache_entry_add(cache, f_new, &f_curr->fce_list); + } else + CERROR("NEW range ="DRANGE" curr = "DRANGE"\n", + PRANGE(range), PRANGE(&f_curr->fce_range)); +} + +struct fld_cache_entry +*fld_cache_entry_create(const struct lu_seq_range *range) +{ + struct fld_cache_entry *f_new; + + LASSERT(lu_seq_range_is_sane(range)); + + OBD_ALLOC_PTR(f_new); + if (!f_new) + RETURN(ERR_PTR(-ENOMEM)); + + f_new->fce_range = *range; + RETURN(f_new); +} + +/** + * Insert FLD entry in FLD cache. + * + * This function handles all cases of merging and breaking up of + * ranges. + */ +int fld_cache_insert_nolock(struct fld_cache *cache, + struct fld_cache_entry *f_new) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *n; + struct list_head *head; + struct list_head *prev = NULL; + const u64 new_start = f_new->fce_range.lsr_start; + const u64 new_end = f_new->fce_range.lsr_end; + __u32 new_flags = f_new->fce_range.lsr_flags; + + ENTRY; + + /* + * Duplicate entries are eliminated in insert op. + * So we don't need to search new entry before starting + * insertion loop. 
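fld_cache_punch_hole() and fld_cache_overlap_handle() above distinguish the ways a new half-open range can overlap an existing one with a different location index. The classification below is only an illustration of those interval tests; the struct and function names are invented, and the real code also maintains the LRU list and the per-range index/flags.

#include <stdio.h>
#include <stdint.h>

struct rng { uint64_t start, end; };    /* half-open: [start, end) */

/* Classify how a new range nw relates to an existing range cur. */
static const char *classify(struct rng cur, struct rng nw)
{
        if (nw.start <= cur.start && cur.end <= nw.end)
                return "new covers cur: replace cur";
        if (cur.start < nw.start && nw.end < cur.end)
                return "new inside cur: punch hole, cur splits in two";
        if (nw.start <= cur.start && nw.end <= cur.end)
                return "left overlap: cur.start = new.end, insert new before";
        if (cur.start <= nw.start && cur.end <= nw.end)
                return "right overlap: cur.end = new.start, insert new after";
        return "no overlap";
}

int main(void)
{
        struct rng cur = { 0x200, 0x400 };
        struct rng a = { 0x100, 0x300 };        /* left overlap */
        struct rng b = { 0x280, 0x300 };        /* punched hole */

        printf("%s\n", classify(cur, a));
        printf("%s\n", classify(cur, b));
        return 0;
}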
+ */ + + fld_cache_shrink(cache); + + head = &cache->fci_entries_head; + + list_for_each_entry_safe(f_curr, n, head, fce_list) { + /* add list if next is end of list */ + if (new_end < f_curr->fce_range.lsr_start || + (new_end == f_curr->fce_range.lsr_start && + new_flags != f_curr->fce_range.lsr_flags)) + break; + + prev = &f_curr->fce_list; + /* check if this range is to left of new range. */ + if (new_start < f_curr->fce_range.lsr_end && + new_flags == f_curr->fce_range.lsr_flags) { + fld_cache_overlap_handle(cache, f_curr, f_new); + goto out; + } + } + + if (prev == NULL) + prev = head; + + CDEBUG(D_INFO, "insert range "DRANGE"\n", PRANGE(&f_new->fce_range)); + /* Add new entry to cache and lru list. */ + fld_cache_entry_add(cache, f_new, prev); +out: + RETURN(0); +} + +int fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + int rc; + + flde = fld_cache_entry_create(range); + if (IS_ERR(flde)) + RETURN(PTR_ERR(flde)); + + write_lock(&cache->fci_lock); + rc = fld_cache_insert_nolock(cache, flde); + write_unlock(&cache->fci_lock); + if (rc) + OBD_FREE_PTR(flde); + + RETURN(rc); +} + +void fld_cache_delete_nolock(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *tmp; + struct list_head *head; + + head = &cache->fci_entries_head; + list_for_each_entry_safe(flde, tmp, head, fce_list) { + /* add list if next is end of list */ + if (range->lsr_start == flde->fce_range.lsr_start || + (range->lsr_end == flde->fce_range.lsr_end && + range->lsr_flags == flde->fce_range.lsr_flags)) { + fld_cache_entry_delete(cache, flde); + break; + } + } +} + +/** + * lookup \a seq sequence for range in fld cache. + */ +int fld_cache_lookup(struct fld_cache *cache, + const u64 seq, struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *prev = NULL; + struct list_head *head; + + ENTRY; + + read_lock(&cache->fci_lock); + head = &cache->fci_entries_head; + + cache->fci_stat.fst_count++; + list_for_each_entry(flde, head, fce_list) { + if (flde->fce_range.lsr_start > seq) { + if (prev != NULL) + *range = prev->fce_range; + break; + } + + prev = flde; + if (lu_seq_range_within(&flde->fce_range, seq)) { + *range = flde->fce_range; + + cache->fci_stat.fst_cache++; + read_unlock(&cache->fci_lock); + RETURN(0); + } + } + read_unlock(&cache->fci_lock); + RETURN(-ENOENT); +} diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_handler.c b/drivers/staging/lustrefsx/lustre/fld/fld_handler.c new file mode 100644 index 0000000000000..6f01007c59e8c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_handler.c @@ -0,0 +1,485 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fld/fld_handler.c + * + * FLD (Fids Location Database) + * + * Author: Yury Umanets + * Author: WangDi + * Author: Pravin Shelar + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include + +#include +#include +#include +#include +#include +#include +#include "fld_internal.h" + +/* context key constructor/destructor: fld_key_init, fld_key_fini */ +LU_KEY_INIT_FINI(fld, struct fld_thread_info); + +/* context key: fld_thread_key */ +/* MGS thread may create llog file causing FLD lookup */ +LU_CONTEXT_KEY_DEFINE(fld, LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD); + +int fld_server_mod_init(void) +{ + LU_CONTEXT_KEY_INIT(&fld_thread_key); + return lu_context_key_register(&fld_thread_key); +} + +void fld_server_mod_exit(void) +{ + lu_context_key_degister(&fld_thread_key); +} + +int fld_declare_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range, + struct thandle *th) +{ + int rc; + + rc = fld_declare_index_create(env, fld, range, th); + RETURN(rc); +} +EXPORT_SYMBOL(fld_declare_server_create); + +/** + * Insert FLD index entry and update FLD cache. + * + * This function is called from the sequence allocator when a super-sequence + * is granted to a server. + */ +int fld_server_create(const struct lu_env *env, struct lu_server_fld *fld, + const struct lu_seq_range *range, struct thandle *th) +{ + int rc; + + mutex_lock(&fld->lsf_lock); + rc = fld_index_create(env, fld, range, th); + mutex_unlock(&fld->lsf_lock); + + RETURN(rc); +} +EXPORT_SYMBOL(fld_server_create); + +/** + * Extract index information from fld name like srv-fsname-MDT0000 + **/ +int fld_name_to_index(const char *name, u32 *index) +{ + char *dash; + int rc; + + ENTRY; + + CDEBUG(D_INFO, "get index from %s\n", name); + dash = strrchr(name, '-'); + if (!dash) + RETURN(-EINVAL); + dash++; + rc = target_name2index(dash, index, NULL); + RETURN(rc); +} + +/** + * Retrieve fldb entry from MDT0 and add to local FLDB and cache. + **/ +int fld_update_from_controller(const struct lu_env *env, + struct lu_server_fld *fld) +{ + struct fld_thread_info *info; + struct lu_seq_range *range; + struct lu_seq_range_array *lsra; + u32 index; + struct ptlrpc_request *req; + int rc; + int i; + + ENTRY; + + /* + * Update only happens during initalization, i.e. 
local FLDB + * does not exist yet + */ + if (!fld->lsf_new) + RETURN(0); + + rc = fld_name_to_index(fld->lsf_name, &index); + if (rc < 0) + RETURN(rc); + + /* No need update fldb for MDT0 */ + if (index == 0) + RETURN(0); + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + range = &info->fti_lrange; + memset(range, 0, sizeof(*range)); + range->lsr_index = index; + fld_range_set_mdt(range); + + do { + rc = fld_client_rpc(fld->lsf_control_exp, range, FLD_READ, + &req); + if (rc != 0) + GOTO(out, rc); + + LASSERT(req != NULL); + lsra = (struct lu_seq_range_array *)req_capsule_server_get( + &req->rq_pill, &RMF_GENERIC_DATA); + if (!lsra) + GOTO(out, rc = -EPROTO); + + range_array_le_to_cpu(lsra, lsra); + for (i = 0; i < lsra->lsra_count; i++) { + int rc1; + + if (lsra->lsra_lsr[i].lsr_flags != LU_SEQ_RANGE_MDT) + GOTO(out, rc = -EINVAL); + + if (lsra->lsra_lsr[i].lsr_index != index) + GOTO(out, rc = -EINVAL); + + mutex_lock(&fld->lsf_lock); + rc1 = fld_insert_entry(env, fld, &lsra->lsra_lsr[i]); + mutex_unlock(&fld->lsf_lock); + + if (rc1 != 0) + GOTO(out, rc = rc1); + } + if (rc == -EAGAIN) + *range = lsra->lsra_lsr[lsra->lsra_count - 1]; + } while (rc == -EAGAIN); + + fld->lsf_new = 1; +out: + if (req) + ptlrpc_req_finished(req); + + RETURN(rc); +} +EXPORT_SYMBOL(fld_update_from_controller); + +/** + * Lookup sequece in local cache/fldb. + **/ +int fld_local_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range) +{ + struct lu_seq_range *erange; + struct fld_thread_info *info; + int rc; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + erange = &info->fti_lrange; + + /* Lookup it in the cache. */ + rc = fld_cache_lookup(fld->lsf_cache, seq, erange); + if (rc == 0) { + if (unlikely(fld_range_type(erange) != fld_range_type(range) && + !fld_range_is_any(range))) { + CERROR("%s: FLD cache range "DRANGE" does not match requested flag %x: rc = %d\n", + fld->lsf_name, PRANGE(erange), range->lsr_flags, + -EIO); + RETURN(-EIO); + } + *range = *erange; + RETURN(0); + } + RETURN(rc); +} +EXPORT_SYMBOL(fld_local_lookup); + +/** + * Lookup MDT/OST by seq, returns a range for given seq. + * + * If that entry is not cached in fld cache, request is sent to super + * sequence controller node (MDT0). All other MDT[1...N] and client + * cache fld entries, but this cache is not persistent. + */ +int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range) +{ + u32 index; + int rc; + + ENTRY; + + rc = fld_local_lookup(env, fld, seq, range); + if (likely(rc == 0)) + RETURN(rc); + + rc = fld_name_to_index(fld->lsf_name, &index); + if (rc < 0) + RETURN(rc); + + if (index == 0 && rc == LDD_F_SV_TYPE_MDT) { + /* + * On server side, all entries should be in cache. + * If we can not find it in cache, just return error + */ + CERROR("%s: Cannot find sequence %#llx: rc = %d\n", + fld->lsf_name, seq, -ENOENT); + RETURN(-ENOENT); + } else { + int i; + + if (!fld->lsf_control_exp) { + CERROR("%s: lookup %#llx, but not connects to MDT0 yet: rc = %d.\n", + fld->lsf_name, seq, -EIO); + RETURN(-EIO); + } + /* + * send request to mdt0 i.e. super seq. controller. + * This is temporary solution, long term solution is fld + * replication on all mdt servers. 
+ */ + range->lsr_start = seq; + for (i = 0; i < 5; i++) { + rc = fld_client_rpc(fld->lsf_control_exp, + range, FLD_QUERY, NULL); + if (rc != -EAGAIN) + break; + schedule_timeout_interruptible(cfs_time_seconds(1)); + } + if (rc == 0) + fld_cache_insert(fld->lsf_cache, range); + } + RETURN(rc); +} +EXPORT_SYMBOL(fld_server_lookup); + +/** + * All MDT server handle fld lookup operation. But only MDT0 has fld index. + * if entry is not found in cache we need to forward lookup request to MDT0 + */ +static int fld_handle_lookup(struct tgt_session_info *tsi) +{ + struct obd_export *exp = tsi->tsi_exp; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_server_fld *fld; + struct lu_seq_range *in; + struct lu_seq_range *out; + int rc; + + ENTRY; + + in = req_capsule_client_get(tsi->tsi_pill, &RMF_FLD_MDFLD); + if (!in) + RETURN(err_serious(-EPROTO)); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (unlikely(rc != 0)) + RETURN(err_serious(rc)); + + out = req_capsule_server_get(tsi->tsi_pill, &RMF_FLD_MDFLD); + if (!out) + RETURN(err_serious(-EPROTO)); + *out = *in; + + fld = lu_site2seq(site)->ss_server_fld; + + rc = fld_server_lookup(tsi->tsi_env, fld, in->lsr_start, out); + + CDEBUG(D_INFO, "%s: FLD req handle: error %d (range: "DRANGE")\n", + fld->lsf_name, rc, PRANGE(out)); + + RETURN(rc); +} + +static int fld_handle_read(struct tgt_session_info *tsi) +{ + struct obd_export *exp = tsi->tsi_exp; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_seq_range *in; + void *data; + int rc; + + ENTRY; + + req_capsule_set(tsi->tsi_pill, &RQF_FLD_READ); + + in = req_capsule_client_get(tsi->tsi_pill, &RMF_FLD_MDFLD); + if (!in) + RETURN(err_serious(-EPROTO)); + + req_capsule_set_size(tsi->tsi_pill, &RMF_GENERIC_DATA, RCL_SERVER, + PAGE_SIZE); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (unlikely(rc != 0)) + RETURN(err_serious(rc)); + + data = req_capsule_server_get(tsi->tsi_pill, &RMF_GENERIC_DATA); + + rc = fld_server_read(tsi->tsi_env, lu_site2seq(site)->ss_server_fld, + in, data, PAGE_SIZE); + RETURN(rc); +} + +static int fld_handle_query(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + req_capsule_set(tsi->tsi_pill, &RQF_FLD_QUERY); + + rc = fld_handle_lookup(tsi); + + RETURN(rc); +} + +/* + * Returns true, if fid is local to this server node. + * + * WARNING: this function is *not* guaranteed to return false if fid is + * remote: it makes an educated conservative guess only. + * + * fid_is_local() is supposed to be used in assertion checks only. 
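fld_server_lookup() above caps its fallback RPC to the controller at five attempts, sleeping about a second between tries and giving up on any error other than -EAGAIN. Below is a userspace sketch of that bounded-retry pattern; the callback, names and the "succeeds on the third try" behaviour are all invented for illustration.

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

/* Retry op() up to max_tries times while it keeps returning -EAGAIN. */
static int retry_eagain(int (*op)(void *), void *arg, int max_tries)
{
        int rc = -EAGAIN;
        int i;

        for (i = 0; i < max_tries && rc == -EAGAIN; i++) {
                if (i > 0)
                        sleep(1);       /* back off between attempts */
                rc = op(arg);
        }
        return rc;
}

static int flaky_op(void *arg)
{
        int *calls = arg;

        return ++(*calls) < 3 ? -EAGAIN : 0;    /* succeeds on the 3rd try */
}

int main(void)
{
        int calls = 0;
        int rc = retry_eagain(flaky_op, &calls, 5);

        printf("rc=%d after %d calls\n", rc, calls);    /* rc=0 after 3 calls */
        return 0;
}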
+ */ +int fid_is_local(const struct lu_env *env, + struct lu_site *site, const struct lu_fid *fid) +{ + int result; + struct seq_server_site *ss_site; + struct lu_seq_range *range; + struct fld_thread_info *info; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + range = &info->fti_lrange; + + result = 1; /* conservatively assume fid is local */ + ss_site = lu_site2seq(site); + if (ss_site->ss_client_fld) { + int rc; + + rc = fld_cache_lookup(ss_site->ss_client_fld->lcf_cache, + fid_seq(fid), range); + if (rc == 0) + result = (range->lsr_index == ss_site->ss_node_id); + } + return result; +} +EXPORT_SYMBOL(fid_is_local); + +static void fld_server_debugfs_fini(struct lu_server_fld *fld) +{ + debugfs_remove_recursive(fld->lsf_debugfs_entry); +} + +static void fld_server_debugfs_init(struct lu_server_fld *fld) +{ + ENTRY; + fld->lsf_debugfs_entry = debugfs_create_dir(fld->lsf_name, + fld_debugfs_dir); + + debugfs_create_file("fldb", 0444, fld->lsf_debugfs_entry, fld, + &fld_debugfs_seq_fops); +} + +int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, const char *prefix, int type) +{ + int cache_size, cache_threshold; + int rc; + + ENTRY; + + snprintf(fld->lsf_name, sizeof(fld->lsf_name), "srv-%s", prefix); + + cache_size = FLD_SERVER_CACHE_SIZE / sizeof(struct fld_cache_entry); + + cache_threshold = cache_size * FLD_SERVER_CACHE_THRESHOLD / 100; + + mutex_init(&fld->lsf_lock); + fld->lsf_cache = fld_cache_init(fld->lsf_name, cache_size, + cache_threshold); + if (IS_ERR(fld->lsf_cache)) { + rc = PTR_ERR(fld->lsf_cache); + fld->lsf_cache = NULL; + RETURN(rc); + } + + rc = fld_index_init(env, fld, dt, type); + if (rc) + GOTO(out_cache, rc); + + fld_server_debugfs_init(fld); + + fld->lsf_control_exp = NULL; + fld->lsf_seq_lookup = fld_server_lookup; + + fld->lsf_seq_lookup = fld_server_lookup; + RETURN(0); +out_cache: + fld_cache_fini(fld->lsf_cache); + return rc; +} +EXPORT_SYMBOL(fld_server_init); + +void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld) +{ + ENTRY; + + fld_server_debugfs_fini(fld); + fld_index_fini(env, fld); + + if (fld->lsf_cache) { + if (!IS_ERR(fld->lsf_cache)) + fld_cache_fini(fld->lsf_cache); + fld->lsf_cache = NULL; + } + + EXIT; +} +EXPORT_SYMBOL(fld_server_fini); + +struct tgt_handler fld_handlers[] = { +TGT_FLD_HDL_VAR(0, FLD_QUERY, fld_handle_query), +TGT_FLD_HDL_VAR(0, FLD_READ, fld_handle_read), +}; +EXPORT_SYMBOL(fld_handlers); diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_index.c b/drivers/staging/lustrefsx/lustre/fld/fld_index.c new file mode 100644 index 0000000000000..7188f45b95869 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_index.c @@ -0,0 +1,531 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fld/fld_index.c + * + * Author: WangDi + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include +#include +#include +#include +#include +#include "fld_internal.h" + +static const char fld_index_name[] = "fld"; + +static const struct lu_seq_range IGIF_FLD_RANGE = { + .lsr_start = FID_SEQ_IGIF, + .lsr_end = FID_SEQ_IGIF_MAX + 1, + .lsr_index = 0, + .lsr_flags = LU_SEQ_RANGE_MDT +}; + +static const struct lu_seq_range DOT_LUSTRE_FLD_RANGE = { + .lsr_start = FID_SEQ_DOT_LUSTRE, + .lsr_end = FID_SEQ_DOT_LUSTRE + 1, + .lsr_index = 0, + .lsr_flags = LU_SEQ_RANGE_MDT +}; + +static const struct lu_seq_range ROOT_FLD_RANGE = { + .lsr_start = FID_SEQ_ROOT, + .lsr_end = FID_SEQ_ROOT + 1, + .lsr_index = 0, + .lsr_flags = LU_SEQ_RANGE_MDT +}; + +static const struct dt_index_features fld_index_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(u64), + .dif_keysize_max = sizeof(u64), + .dif_recsize_min = sizeof(struct lu_seq_range), + .dif_recsize_max = sizeof(struct lu_seq_range), + .dif_ptrsize = 4 +}; + +int fld_declare_index_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *new_range, + struct thandle *th) +{ + struct lu_seq_range *tmp; + struct lu_seq_range *range; + struct fld_thread_info *info; + int rc = 0; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + range = &info->fti_lrange; + tmp = &info->fti_irange; + memset(range, 0, sizeof(*range)); + + rc = fld_index_lookup(env, fld, new_range->lsr_start, range); + if (rc == 0) { + /* In case of duplicate entry, the location must be same */ + LASSERT((lu_seq_range_compare_loc(new_range, range) == 0)); + GOTO(out, rc = -EEXIST); + } + + if (rc != -ENOENT) { + CERROR("%s: lookup range "DRANGE" error: rc = %d\n", + fld->lsf_name, PRANGE(range), rc); + GOTO(out, rc); + } + + /* + * Check for merge case, since the fld entry can only be increamental, + * so we will only check whether it can be merged from the left. + */ + if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 && + lu_seq_range_compare_loc(new_range, range) == 0) { + range_cpu_to_be(tmp, range); + rc = dt_declare_delete(env, fld->lsf_obj, + (struct dt_key *)&tmp->lsr_start, th); + if (rc) { + CERROR("%s: declare record "DRANGE" failed: rc = %d\n", + fld->lsf_name, PRANGE(range), rc); + GOTO(out, rc); + } + *tmp = *new_range; + tmp->lsr_start = range->lsr_start; + } else { + *tmp = *new_range; + } + + range_cpu_to_be(tmp, tmp); + rc = dt_declare_insert(env, fld->lsf_obj, (struct dt_rec *)tmp, + (struct dt_key *)&tmp->lsr_start, th); +out: + RETURN(rc); +} + +/** + * insert range in fld store. + * + * \param range range to be inserted + * \param th transaction for this operation as it could compound + * transaction. + * + * \retval 0 success + * \retval -ve error + * + * The whole fld index insertion is protected by seq->lss_mutex (see + * seq_server_alloc_super), i.e. only one thread will access fldb each + * time, so we do not need worry the fld file and cache will being + * changed between declare and create. 
+ * Because the fld entry can only be increamental, so we will only check + * whether it can be merged from the left. + * + * Caller must hold fld->lsf_lock + **/ +int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, + const struct lu_seq_range *new_range, struct thandle *th) +{ + struct lu_seq_range *range; + struct lu_seq_range *tmp; + struct fld_thread_info *info; + int rc = 0; + int deleted = 0; + struct fld_cache_entry *flde; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + + LASSERT(mutex_is_locked(&fld->lsf_lock)); + + range = &info->fti_lrange; + memset(range, 0, sizeof(*range)); + tmp = &info->fti_irange; + rc = fld_index_lookup(env, fld, new_range->lsr_start, range); + if (rc != -ENOENT) { + rc = rc == 0 ? -EEXIST : rc; + GOTO(out, rc); + } + + if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 && + lu_seq_range_compare_loc(new_range, range) == 0) { + range_cpu_to_be(tmp, range); + rc = dt_delete(env, fld->lsf_obj, + (struct dt_key *)&tmp->lsr_start, th); + if (rc != 0) + GOTO(out, rc); + *tmp = *new_range; + tmp->lsr_start = range->lsr_start; + deleted = 1; + } else { + *tmp = *new_range; + } + + range_cpu_to_be(tmp, tmp); + rc = dt_insert(env, fld->lsf_obj, (struct dt_rec *)tmp, + (struct dt_key *)&tmp->lsr_start, th); + if (rc != 0) { + CERROR("%s: insert range "DRANGE" failed: rc = %d\n", + fld->lsf_name, PRANGE(new_range), rc); + GOTO(out, rc); + } + + flde = fld_cache_entry_create(new_range); + if (IS_ERR(flde)) + GOTO(out, rc = PTR_ERR(flde)); + + write_lock(&fld->lsf_cache->fci_lock); + if (deleted) + fld_cache_delete_nolock(fld->lsf_cache, new_range); + rc = fld_cache_insert_nolock(fld->lsf_cache, flde); + write_unlock(&fld->lsf_cache->fci_lock); + if (rc) + OBD_FREE_PTR(flde); +out: + RETURN(rc); +} + +/** + * lookup range for a seq passed. note here we only care about the start/end, + * caller should handle the attached location data (flags, index). + * + * \param seq seq for lookup. + * \param range result of lookup. + * + * \retval 0 found, \a range is the matched range; + * \retval -ENOENT not found, \a range is the left-side range; + * \retval -ve other error; + */ +int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range) +{ + struct lu_seq_range *fld_rec; + struct fld_thread_info *info; + int rc; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + fld_rec = &info->fti_rec; + + rc = fld_cache_lookup(fld->lsf_cache, seq, fld_rec); + if (rc == 0) { + *range = *fld_rec; + if (lu_seq_range_within(range, seq)) + rc = 0; + else + rc = -ENOENT; + } + + CDEBUG(D_INFO, "%s: lookup seq = %#llx range : "DRANGE" rc = %d\n", + fld->lsf_name, seq, PRANGE(range), rc); + + RETURN(rc); +} + +/** + * insert entry in fld store. 
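As the comments above note, sequence ranges are only ever allocated in increasing order, so a new FLDB record can at most be merged with the record immediately to its left. A toy, array-backed illustration of that rule follows; the names are invented, and the real code instead deletes and re-inserts the persistent index record because it is keyed by lsr_start.

#include <stdio.h>
#include <stdint.h>

struct rec { uint64_t start, end; uint32_t index; };    /* [start, end) -> MDT index */

/* Append a record, merging with the last one when contiguous and co-located. */
static int append_record(struct rec *tab, int n, struct rec nw)
{
        if (n > 0 && tab[n - 1].end == nw.start && tab[n - 1].index == nw.index) {
                tab[n - 1].end = nw.end;        /* extend in place instead of adding */
                return n;
        }
        tab[n] = nw;
        return n + 1;
}

int main(void)
{
        struct rec tab[4];
        int n = 0;

        n = append_record(tab, n, (struct rec){ 0x200000400, 0x200000420, 0 });
        n = append_record(tab, n, (struct rec){ 0x200000420, 0x200000440, 0 }); /* merges */
        n = append_record(tab, n, (struct rec){ 0x200000440, 0x200000460, 1 }); /* new MDT */

        printf("%d records, first ends at %#llx\n", n,
               (unsigned long long)tab[0].end); /* 2 records, first ends at 0x200000440 */
        return 0;
}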
+ * + * \param env relevant lu_env + * \param fld fld store + * \param range range to be inserted + * + * \retval 0 success + * \retval -ve error + * + * Caller must hold fld->lsf_lock + **/ + +int fld_insert_entry(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range) +{ + struct thandle *th; + struct dt_device *dt = lu2dt_dev(fld->lsf_obj->do_lu.lo_dev); + int rc; + + ENTRY; + + LASSERT(mutex_is_locked(&fld->lsf_lock)); + + if (dt->dd_rdonly) + RETURN(0); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = fld_declare_index_create(env, fld, range, th); + if (rc != 0) { + if (rc == -EEXIST) + rc = 0; + GOTO(out, rc); + } + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out, rc); + + rc = fld_index_create(env, fld, range, th); + if (rc == -EEXIST) + rc = 0; +out: + dt_trans_stop(env, dt, th); + RETURN(rc); +} +EXPORT_SYMBOL(fld_insert_entry); + +static int fld_insert_special_entries(const struct lu_env *env, + struct lu_server_fld *fld) +{ + int rc; + + rc = fld_insert_entry(env, fld, &IGIF_FLD_RANGE); + if (rc != 0) + RETURN(rc); + + rc = fld_insert_entry(env, fld, &DOT_LUSTRE_FLD_RANGE); + if (rc != 0) + RETURN(rc); + + rc = fld_insert_entry(env, fld, &ROOT_FLD_RANGE); + + RETURN(rc); +} + +int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, int type) +{ + struct dt_object *dt_obj = NULL; + struct lu_fid fid; + struct lu_attr *attr = NULL; + struct lu_seq_range *range = NULL; + struct fld_thread_info *info; + struct dt_object_format dof; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; + u32 index; + int range_count = 0; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + + lu_local_obj_fid(&fid, FLD_INDEX_OID); + OBD_ALLOC_PTR(attr); + if (!attr) + RETURN(-ENOMEM); + + memset(attr, 0, sizeof(*attr)); + attr->la_valid = LA_MODE; + attr->la_mode = S_IFREG | 0666; + dof.dof_type = DFT_INDEX; + dof.u.dof_idx.di_feat = &fld_index_features; + + dt_obj = dt_locate(env, dt, &fid); + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + dt_obj = NULL; + GOTO(out, rc); + } + + LASSERT(dt_obj != NULL); + if (!dt_object_exists(dt_obj)) { + dt_object_put(env, dt_obj); + dt_obj = dt_find_or_create(env, dt, &fid, &dof, attr); + fld->lsf_new = 1; + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + CERROR("%s: Can't find \"%s\" obj %d\n", fld->lsf_name, + fld_index_name, rc); + dt_obj = NULL; + GOTO(out, rc); + } + } + + fld->lsf_obj = dt_obj; + rc = dt_obj->do_ops->do_index_try(env, dt_obj, &fld_index_features); + if (rc != 0) { + CERROR("%s: File \"%s\" is not an index: rc = %d!\n", + fld->lsf_name, fld_index_name, rc); + GOTO(out, rc); + } + + range = &info->fti_rec; + /* Load fld entry to cache */ + iops = &dt_obj->do_index_ops->dio_it; + it = iops->init(env, dt_obj, 0); + if (IS_ERR(it)) + GOTO(out, rc = PTR_ERR(it)); + + rc = iops->load(env, it, 0); + if (rc > 0) + rc = 0; + else if (rc == 0) + rc = iops->next(env, it); + + if (rc < 0) + GOTO(out_it_fini, rc); + + while (rc == 0) { + rc = iops->rec(env, it, (struct dt_rec *)range, 0); + if (rc != 0) + GOTO(out_it_put, rc); + + range_be_to_cpu(range, range); + + /* + * Newly created ldiskfs IAM indexes may include a + * zeroed-out key and record. Ignore it here. 
+ */ + if (range->lsr_start < range->lsr_end) { + rc = fld_cache_insert(fld->lsf_cache, range); + if (rc != 0) + GOTO(out_it_put, rc); + + range_count++; + } + + rc = iops->next(env, it); + if (rc < 0) + GOTO(out_it_fini, rc); + } + + if (range_count == 0) + fld->lsf_new = 1; + + rc = fld_name_to_index(fld->lsf_name, &index); + if (rc < 0) + GOTO(out_it_put, rc); + else + rc = 0; + + if (index == 0 && type == LU_SEQ_RANGE_MDT) { + /* + * Note: fld_insert_entry will detect whether these + * special entries already exist inside FLDB + */ + mutex_lock(&fld->lsf_lock); + rc = fld_insert_special_entries(env, fld); + mutex_unlock(&fld->lsf_lock); + if (rc != 0) { + CERROR("%s: insert special entries failed!: rc = %d\n", + fld->lsf_name, rc); + GOTO(out_it_put, rc); + } + } +out_it_put: + iops->put(env, it); +out_it_fini: + iops->fini(env, it); +out: + if (attr) + OBD_FREE_PTR(attr); + + if (rc < 0) { + if (dt_obj) + dt_object_put(env, dt_obj); + fld->lsf_obj = NULL; + } + RETURN(rc); +} + +void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld) +{ + ENTRY; + if (fld->lsf_obj) { + if (!IS_ERR(fld->lsf_obj)) + dt_object_put(env, fld->lsf_obj); + fld->lsf_obj = NULL; + } + EXIT; +} + +int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, + struct lu_seq_range *range, void *data, int data_len) +{ + struct lu_seq_range_array *lsra = data; + struct fld_thread_info *info; + struct dt_object *dt_obj = fld->lsf_obj; + struct lu_seq_range *entry; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; + + ENTRY; + + lsra->lsra_count = 0; + iops = &dt_obj->do_index_ops->dio_it; + it = iops->init(env, dt_obj, 0); + if (IS_ERR(it)) + RETURN(PTR_ERR(it)); + + rc = iops->load(env, it, range->lsr_end); + if (rc <= 0) + GOTO(out_it_fini, rc); + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + entry = &info->fti_rec; + do { + rc = iops->rec(env, it, (struct dt_rec *)entry, 0); + if (rc != 0) + GOTO(out_it_put, rc); + + if (offsetof(typeof(*lsra), lsra_lsr[lsra->lsra_count + 1]) > + data_len) + GOTO(out, rc = -EAGAIN); + + range_be_to_cpu(entry, entry); + if (entry->lsr_index == range->lsr_index && + entry->lsr_flags == range->lsr_flags && + entry->lsr_start > range->lsr_start) { + lsra->lsra_lsr[lsra->lsra_count] = *entry; + lsra->lsra_count++; + } + + rc = iops->next(env, it); + } while (rc == 0); + if (rc > 0) + rc = 0; +out: + range_array_cpu_to_le(lsra, lsra); +out_it_put: + iops->put(env, it); +out_it_fini: + iops->fini(env, it); + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_internal.h b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h new file mode 100644 index 0000000000000..84c0f92ac21f4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h @@ -0,0 +1,214 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fld/fld_internal.h + * + * Subsystem Description: + * FLD is FID Location Database, which stores where (IE, on which MDT) + * FIDs are located. + * The database is basically a record file, each record consists of a FID + * sequence range, MDT/OST index, and flags. The FLD for the whole FS + * is only stored on the sequence controller(MDT0) right now, but each target + * also has its local FLD, which only stores the local sequence. + * + * The FLD subsystem usually has two tasks: + * 1. maintain the database, i.e. when the sequence controller allocates + * new sequence ranges to some nodes, it will call the FLD API to insert the + * location information in FLDB. + * + * 2. Handle requests from other nodes, i.e. if client needs to know where + * the FID is located, if it can not find the information in the local cache, + * it will send a FLD lookup RPC to the FLD service, and the FLD service will + * look up the FLDB entry and return the location information to client. + * + * Author: Yury Umanets + * Author: Tom WangDi + */ +#ifndef __FLD_INTERNAL_H +#define __FLD_INTERNAL_H + +#include +#include +#include + +struct fld_stats { + __u64 fst_count; + __u64 fst_cache; +}; + +struct lu_fld_hash { + const char *fh_name; + int (*fh_hash_func)(struct lu_client_fld *fld, + __u64 seq); + struct lu_fld_target * (*fh_scan_func)(struct lu_client_fld *fld, + __u64 seq); +}; + +struct fld_cache_entry { + struct list_head fce_lru; + struct list_head fce_list; + /** + * fld cache entries are sorted on range->lsr_start field. */ + struct lu_seq_range fce_range; +}; + +struct fld_cache { + /** + * Cache guard, protects fci_hash mostly because others immutable after + * init is finished. + */ + rwlock_t fci_lock; + + /** + * Cache shrink threshold */ + int fci_threshold; + + /** + * Prefered number of cached entries */ + int fci_cache_size; + + /** + * Current number of cached entries. Protected by \a fci_lock */ + int fci_cache_count; + + /** + * LRU list fld entries. */ + struct list_head fci_lru; + + /** + * sorted fld entries. */ + struct list_head fci_entries_head; + + /** + * Cache statistics. + */ + struct fld_stats fci_stat; + + /** + * Cache name used for debug and messages. + */ + char fci_name[LUSTRE_MDT_MAXNAMELEN]; +}; + +enum { + /* 4M of FLD cache will not hurt client a lot. */ + FLD_SERVER_CACHE_SIZE = (4 * 0x100000), + + /* 1M of FLD cache will not hurt client a lot. */ + FLD_CLIENT_CACHE_SIZE = (1 * 0x100000) +}; + +enum { + /* Cache threshold is 10 percent of size. */ + FLD_SERVER_CACHE_THRESHOLD = 10, + + /* Cache threshold is 10 percent of size. 
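The subsystem description above reduces to a single mapping: given a FID's sequence number, find the range that contains it and return the owning MDT/OST index. A minimal standalone version of that lookup is sketched below; the table contents and names are illustrative only and do not reproduce the cache's actual sorted-list walk or statistics.

#include <stdio.h>
#include <stdint.h>

struct fld_rec { uint64_t start, end; uint32_t index; };        /* [start, end) -> target */

/* Return the target index owning seq, or -1 if no range covers it. */
static int fld_lookup(const struct fld_rec *tab, int n, uint64_t seq)
{
        int i;

        for (i = 0; i < n; i++)
                if (tab[i].start <= seq && seq < tab[i].end)
                        return (int)tab[i].index;
        return -1;
}

int main(void)
{
        const struct fld_rec tab[] = {
                { 0x200000400, 0x200000420, 0 },        /* MDT0000 */
                { 0x200000420, 0x200000440, 1 },        /* MDT0001 */
        };

        printf("seq %#llx -> MDT%04d\n", 0x200000425ULL,
               fld_lookup(tab, 2, 0x200000425ULL));     /* -> MDT0001 */
        return 0;
}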
*/ + FLD_CLIENT_CACHE_THRESHOLD = 10 +}; + +extern struct lu_fld_hash fld_hash[]; + +# ifdef HAVE_SERVER_SUPPORT +struct fld_thread_info { + struct lu_seq_range fti_rec; + struct lu_seq_range fti_lrange; + struct lu_seq_range fti_irange; +}; + +extern struct lu_context_key fld_thread_key; + +struct dt_device; +int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, int type); + +void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld); + +int fld_declare_index_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *new_range, + struct thandle *th); + +int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, + const struct lu_seq_range *new_range, struct thandle *th); + +int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range); + +int fld_name_to_index(const char *name, __u32 *index); + +int fld_server_mod_init(void); +void fld_server_mod_exit(void); + +int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, + struct lu_seq_range *range, void *data, int data_len); + +extern const struct file_operations fld_debugfs_seq_fops; +extern struct dentry *fld_debugfs_dir; + +# endif /* HAVE_SERVER_SUPPORT */ + +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, __u32 fld_op, + struct ptlrpc_request **reqp); + +extern struct ldebugfs_vars fld_client_debugfs_list[]; + +struct fld_cache *fld_cache_init(const char *name, + int cache_size, int cache_threshold); + +void fld_cache_fini(struct fld_cache *cache); + +void fld_cache_flush(struct fld_cache *cache); + +int fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range); + +struct fld_cache_entry +*fld_cache_entry_create(const struct lu_seq_range *range); + +int fld_cache_insert_nolock(struct fld_cache *cache, + struct fld_cache_entry *f_new); +void fld_cache_delete_nolock(struct fld_cache *cache, + const struct lu_seq_range *range); +int fld_cache_lookup(struct fld_cache *cache, + const u64 seq, struct lu_seq_range *range); + +static inline const char * +fld_target_name(const struct lu_fld_target *tar) +{ +#ifdef HAVE_SERVER_SUPPORT + if (tar->ft_srv != NULL) + return tar->ft_srv->lsf_name; +#endif + + return tar->ft_exp->exp_obd->obd_name; +} + +#endif /* __FLD_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_request.c b/drivers/staging/lustrefsx/lustre/fld/fld_request.c new file mode 100644 index 0000000000000..e381eb87634fc --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_request.c @@ -0,0 +1,544 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fld/fld_request.c + * + * FLD (Fids Location Database) + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "fld_internal.h" + +static int fld_rrb_hash(struct lu_client_fld *fld, u64 seq) +{ + LASSERT(fld->lcf_count > 0); + return do_div(seq, fld->lcf_count); +} + +static struct lu_fld_target * +fld_rrb_scan(struct lu_client_fld *fld, u64 seq) +{ + struct lu_fld_target *target; + int hash; + + ENTRY; + + /* + * Because almost all of special sequence located in MDT0, + * it should go to index 0 directly, instead of calculating + * hash again, and also if other MDTs is not being connected, + * the fld lookup requests(for seq on MDT0) should not be + * blocked because of other MDTs + */ + if (fid_seq_is_norm(seq)) + hash = fld_rrb_hash(fld, seq); + else + hash = 0; + +again: + list_for_each_entry(target, &fld->lcf_targets, ft_chain) { + if (target->ft_idx == hash) + RETURN(target); + } + + if (hash != 0) { + /* + * It is possible the remote target(MDT) are not connected to + * with client yet, so we will refer this to MDT0, which should + * be connected during mount + */ + hash = 0; + goto again; + } + + CERROR("%s: Can't find target by hash %d (seq %#llx). Targets (%d):\n", + fld->lcf_name, hash, seq, fld->lcf_count); + + list_for_each_entry(target, &fld->lcf_targets, ft_chain) { + const char *srv_name = target->ft_srv != NULL ? + target->ft_srv->lsf_name : ""; + const char *exp_name = target->ft_exp != NULL ? + (char *)target->ft_exp->exp_obd->obd_uuid.uuid : + ""; + + CERROR(" exp: 0x%p (%s), srv: 0x%p (%s), idx: %llu\n", + target->ft_exp, exp_name, target->ft_srv, + srv_name, target->ft_idx); + } + + /* + * If target is not found, there is logical error anyway, so here is + * LBUG() to catch this situation. + */ + LBUG(); + RETURN(NULL); +} + +struct lu_fld_hash fld_hash[] = { + { + .fh_name = "RRB", + .fh_hash_func = fld_rrb_hash, + .fh_scan_func = fld_rrb_scan + }, + { + NULL, + } +}; + +static struct lu_fld_target * +fld_client_get_target(struct lu_client_fld *fld, u64 seq) +{ + struct lu_fld_target *target; + + ENTRY; + + LASSERT(fld->lcf_hash != NULL); + + spin_lock(&fld->lcf_lock); + target = fld->lcf_hash->fh_scan_func(fld, seq); + spin_unlock(&fld->lcf_lock); + + if (target) { + CDEBUG(D_INFO, "%s: Found target (idx %llu) by seq %#llx\n", + fld->lcf_name, target->ft_idx, seq); + } + + RETURN(target); +} + +/* + * Add export to FLD. This is usually done by CMM and LMV as they are main users + * of FLD module. 
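fld_rrb_hash() and fld_rrb_scan() above pick a target by taking the sequence modulo the number of configured targets, pin special (non-normal) sequences to MDT0, and fall back to MDT0 when the hashed target is not connected yet. The sketch below shows only the selection rule; seq_is_norm() is a stand-in for the real fid_seq_is_norm() test and the cut-off value is an assumption for illustration.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Stand-in for fid_seq_is_norm(): treat "large" sequences as normal ones. */
static bool seq_is_norm(uint64_t seq)
{
        return seq >= 0x200000400ULL;
}

/* Pick a target index for a sequence, given 'count' configured targets. */
static unsigned int pick_target(uint64_t seq, unsigned int count)
{
        if (!seq_is_norm(seq))
                return 0;               /* special sequences live on MDT0 */
        return (unsigned int)(seq % count);
}

int main(void)
{
        printf("seq %#llx with 4 targets -> %u\n",
               0x200000405ULL, pick_target(0x200000405ULL, 4));  /* 0x...405 % 4 = 1 */
        printf("special seq 0x2 -> %u\n", pick_target(0x2, 4));  /* pinned to 0 */
        return 0;
}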
+ */ +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar) +{ + const char *name; + struct lu_fld_target *target, *tmp; + + ENTRY; + + LASSERT(tar != NULL); + name = fld_target_name(tar); + LASSERT(name != NULL); + LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL); + + CDEBUG(D_INFO, "%s: Adding target %s (idx %llu)\n", fld->lcf_name, + name, tar->ft_idx); + + OBD_ALLOC_PTR(target); + if (!target) + RETURN(-ENOMEM); + + spin_lock(&fld->lcf_lock); + list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) { + if (tmp->ft_idx == tar->ft_idx) { + spin_unlock(&fld->lcf_lock); + OBD_FREE_PTR(target); + CERROR("Target %s exists in FLD and known as %s:#%llu\n", + name, fld_target_name(tmp), tmp->ft_idx); + RETURN(-EEXIST); + } + } + + target->ft_exp = tar->ft_exp; + if (target->ft_exp) + class_export_get(target->ft_exp); + target->ft_srv = tar->ft_srv; + target->ft_idx = tar->ft_idx; + + list_add_tail(&target->ft_chain, &fld->lcf_targets); + + fld->lcf_count++; + spin_unlock(&fld->lcf_lock); + + RETURN(0); +} +EXPORT_SYMBOL(fld_client_add_target); + +/* Remove export from FLD */ +int fld_client_del_target(struct lu_client_fld *fld, u64 idx) +{ + struct lu_fld_target *target, *tmp; + + ENTRY; + + spin_lock(&fld->lcf_lock); + list_for_each_entry_safe(target, tmp, &fld->lcf_targets, ft_chain) { + if (target->ft_idx == idx) { + fld->lcf_count--; + list_del(&target->ft_chain); + spin_unlock(&fld->lcf_lock); + + if (target->ft_exp) + class_export_put(target->ft_exp); + + OBD_FREE_PTR(target); + RETURN(0); + } + } + spin_unlock(&fld->lcf_lock); + RETURN(-ENOENT); +} + +struct dentry *fld_debugfs_dir; + +static void fld_client_debugfs_init(struct lu_client_fld *fld) +{ + ENTRY; + fld->lcf_debugfs_entry = debugfs_create_dir(fld->lcf_name, + fld_debugfs_dir); + ldebugfs_add_vars(fld->lcf_debugfs_entry, + fld_client_debugfs_list, + fld); +} + +void fld_client_debugfs_fini(struct lu_client_fld *fld) +{ + debugfs_remove_recursive(fld->lcf_debugfs_entry); +} +EXPORT_SYMBOL(fld_client_debugfs_fini); + +static inline int hash_is_sane(int hash) +{ + return (hash >= 0 && hash < ARRAY_SIZE(fld_hash)); +} + +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash) +{ + int cache_size, cache_threshold; + int rc = 0; + + ENTRY; + snprintf(fld->lcf_name, sizeof(fld->lcf_name), + "cli-%s", prefix); + + if (!hash_is_sane(hash)) { + CERROR("%s: Wrong hash function %#x\n", + fld->lcf_name, hash); + RETURN(-EINVAL); + } + + fld->lcf_count = 0; + spin_lock_init(&fld->lcf_lock); + fld->lcf_hash = &fld_hash[hash]; + INIT_LIST_HEAD(&fld->lcf_targets); + + cache_size = FLD_CLIENT_CACHE_SIZE / + sizeof(struct fld_cache_entry); + + cache_threshold = cache_size * + FLD_CLIENT_CACHE_THRESHOLD / 100; + + fld->lcf_cache = fld_cache_init(fld->lcf_name, + cache_size, cache_threshold); + if (IS_ERR(fld->lcf_cache)) { + rc = PTR_ERR(fld->lcf_cache); + fld->lcf_cache = NULL; + GOTO(out, rc); + } + + fld_client_debugfs_init(fld); + EXIT; +out: + if (rc) + fld_client_fini(fld); + else + CDEBUG(D_INFO, "%s: Using \"%s\" hash\n", + fld->lcf_name, fld->lcf_hash->fh_name); + return rc; +} +EXPORT_SYMBOL(fld_client_init); + +void fld_client_fini(struct lu_client_fld *fld) +{ + struct lu_fld_target *target, *tmp; + + ENTRY; + + spin_lock(&fld->lcf_lock); + list_for_each_entry_safe(target, tmp, &fld->lcf_targets, ft_chain) { + fld->lcf_count--; + list_del(&target->ft_chain); + if (target->ft_exp) + class_export_put(target->ft_exp); + OBD_FREE_PTR(target); + } + spin_unlock(&fld->lcf_lock); + + if 
(fld->lcf_cache) { + if (!IS_ERR(fld->lcf_cache)) + fld_cache_fini(fld->lcf_cache); + fld->lcf_cache = NULL; + } + + EXIT; +} +EXPORT_SYMBOL(fld_client_fini); + +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, u32 fld_op, + struct ptlrpc_request **reqp) +{ + struct ptlrpc_request *req = NULL; + struct lu_seq_range *prange; + u32 *op; + int rc = 0; + struct obd_import *imp; + + ENTRY; + + LASSERT(exp != NULL); + + imp = class_exp2cliimp(exp); +again: + switch (fld_op) { + case FLD_QUERY: + req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, + LUSTRE_MDS_VERSION, FLD_QUERY); + if (!req) + RETURN(-ENOMEM); + + /* + * XXX: only needed when talking to old server(< 2.6), it should + * be removed when < 2.6 server is not supported + */ + op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC); + *op = FLD_LOOKUP; + + /* + * For MDS_MDS seq lookup, it will always use LWP connection, + * but LWP will be evicted after restart, so cause the error. + * so we will set no_delay for seq lookup request, once the + * request fails because of the eviction. always retry here + */ + if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) { + req->rq_allow_replay = 1; + req->rq_no_delay = 1; + } + break; + case FLD_READ: + req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_READ, + LUSTRE_MDS_VERSION, FLD_READ); + if (!req) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, + RCL_SERVER, PAGE_SIZE); + break; + default: + rc = -EINVAL; + break; + } + + if (rc != 0) + RETURN(rc); + + prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD); + *prange = *range; + ptlrpc_request_set_replen(req); + req->rq_request_portal = FLD_REQUEST_PORTAL; + req->rq_reply_portal = MDC_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (OBD_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ) && req->rq_no_delay) { + /* the same error returned by ptlrpc_import_delay_req */ + rc = -EAGAIN; + req->rq_status = rc; + } else { + rc = ptlrpc_queue_wait(req); + } + + if (rc == -ENOENT) { + /* Don't loop forever on non-existing FID sequences. 
*/ + GOTO(out_req, rc); + } + + if (rc != 0) { + if (imp->imp_state != LUSTRE_IMP_CLOSED && + !imp->imp_deactive && + imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS && + OCD_HAS_FLAG(&imp->imp_connect_data, LIGHTWEIGHT) && + rc != -ENOTSUPP) { + /* LWP is not replayable, retry after a while */ + rc = -EAGAIN; + } + if (rc == -EAGAIN) { + ptlrpc_req_finished(req); + if (msleep_interruptible(2 * MSEC_PER_SEC)) + GOTO(out_req, rc = -EINTR); + rc = 0; + goto again; + } + GOTO(out_req, rc); + } + + if (fld_op == FLD_QUERY) { + prange = req_capsule_server_get(&req->rq_pill, + &RMF_FLD_MDFLD); + if (!prange) + GOTO(out_req, rc = -EFAULT); + *range = *prange; + } + + EXIT; +out_req: + if (rc != 0 || !reqp) { + ptlrpc_req_finished(req); + req = NULL; + } + + if (reqp) + *reqp = req; + + return rc; +} + +int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, + u32 flags, const struct lu_env *env) +{ + struct lu_seq_range res = { 0 }; + struct lu_fld_target *target; + struct lu_fld_target *origin; + int rc; + + ENTRY; + + rc = fld_cache_lookup(fld->lcf_cache, seq, &res); + if (rc == 0) { + *mds = res.lsr_index; + RETURN(0); + } + + /* Can not find it in the cache */ + target = fld_client_get_target(fld, seq); + LASSERT(target != NULL); + origin = target; +again: + CDEBUG(D_INFO, "%s: Lookup fld entry (seq: %#llx) on target %s (idx %llu)\n", + fld->lcf_name, seq, fld_target_name(target), target->ft_idx); + + res.lsr_start = seq; + fld_range_set_type(&res, flags); + +#ifdef HAVE_SERVER_SUPPORT + if (target->ft_srv) { + LASSERT(env != NULL); + rc = fld_server_lookup(env, target->ft_srv, seq, &res); + } else +#endif /* HAVE_SERVER_SUPPORT */ + { + rc = fld_client_rpc(target->ft_exp, &res, FLD_QUERY, NULL); + } + + if (rc == -ESHUTDOWN) { + /* + * If fld lookup failed because the target has been shutdown, + * then try next target in the list, until trying all targets + * or fld lookup succeeds + */ + spin_lock(&fld->lcf_lock); + /* + * If the next entry in the list is the head of the list, + * move to the next entry after the head and retrieve + * the target. Else retreive the next target entry. + */ + if (target->ft_chain.next == &fld->lcf_targets) + target = list_entry(target->ft_chain.next->next, + struct lu_fld_target, ft_chain); + else + target = list_entry(target->ft_chain.next, + struct lu_fld_target, + ft_chain); + spin_unlock(&fld->lcf_lock); + if (target != origin) + goto again; + } + if (rc == 0) { + *mds = res.lsr_index; + fld_cache_insert(fld->lcf_cache, &res); + } + + RETURN(rc); +} +EXPORT_SYMBOL(fld_client_lookup); + +void fld_client_flush(struct lu_client_fld *fld) +{ + fld_cache_flush(fld->lcf_cache); +} + +static int __init fld_init(void) +{ +#ifdef HAVE_SERVER_SUPPORT + int rc; + + rc = fld_server_mod_init(); + if (rc) + return rc; +#endif /* HAVE_SERVER_SUPPORT */ + + fld_debugfs_dir = debugfs_create_dir(LUSTRE_FLD_NAME, + debugfs_lustre_root); + return PTR_ERR_OR_ZERO(fld_debugfs_dir); +} + +static void __exit fld_exit(void) +{ +#ifdef HAVE_SERVER_SUPPORT + fld_server_mod_exit(); +#endif /* HAVE_SERVER_SUPPORT */ + + debugfs_remove_recursive(fld_debugfs_dir); +} + +MODULE_AUTHOR("OpenSFS, Inc. 
"); +MODULE_DESCRIPTION("Lustre FID Location Database"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(fld_init); +module_exit(fld_exit); diff --git a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c new file mode 100644 index 0000000000000..91641015c94bd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c @@ -0,0 +1,357 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fld/lproc_fld.c + * + * FLD (FIDs Location Database) + * + * Author: Yury Umanets + * Di Wang + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include + +#ifdef HAVE_SERVER_SUPPORT +#include +#endif +#include +#include +#include +#include "fld_internal.h" + +static int +fld_debugfs_targets_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)m->private; + struct lu_fld_target *target; + + ENTRY; + spin_lock(&fld->lcf_lock); + list_for_each_entry(target, &fld->lcf_targets, ft_chain) + seq_printf(m, "%s\n", fld_target_name(target)); + spin_unlock(&fld->lcf_lock); + + RETURN(0); +} + +static int +fld_debugfs_hash_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)m->private; + + ENTRY; + spin_lock(&fld->lcf_lock); + seq_printf(m, "%s\n", fld->lcf_hash->fh_name); + spin_unlock(&fld->lcf_lock); + + RETURN(0); +} + +static ssize_t +fld_debugfs_hash_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct lu_client_fld *fld = m->private; + struct lu_fld_hash *hash = NULL; + char fh_name[8]; + int i; + + if (count > sizeof(fh_name)) + return -ENAMETOOLONG; + + if (copy_from_user(fh_name, buffer, count) != 0) + return -EFAULT; + + for (i = 0; fld_hash[i].fh_name; i++) { + if (count != strlen(fld_hash[i].fh_name)) + continue; + + if (!strncmp(fld_hash[i].fh_name, fh_name, count)) { + hash = &fld_hash[i]; + break; + } + } + + if (hash) { + spin_lock(&fld->lcf_lock); + fld->lcf_hash = hash; + spin_unlock(&fld->lcf_lock); + + CDEBUG(D_INFO, "%s: Changed hash to \"%s\"\n", + fld->lcf_name, hash->fh_name); + } + + return count; +} + +static ssize_t ldebugfs_cache_flush_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *pos) +{ + struct seq_file *m = file->private_data; + struct lu_client_fld *fld = m->private; + + ENTRY; + fld_cache_flush(fld->lcf_cache); + + CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", 
fld->lcf_name); + + RETURN(count); +} + +LDEBUGFS_SEQ_FOPS_RO(fld_debugfs_targets); +LDEBUGFS_SEQ_FOPS(fld_debugfs_hash); +LDEBUGFS_FOPS_WR_ONLY(fld, cache_flush); + +struct ldebugfs_vars fld_client_debugfs_list[] = { + { .name = "targets", + .fops = &fld_debugfs_targets_fops }, + { .name = "hash", + .fops = &fld_debugfs_hash_fops }, + { .name = "cache_flush", + .fops = &fld_cache_flush_fops }, + { NULL } +}; + +#ifdef HAVE_SERVER_SUPPORT +struct fld_seq_param { + struct lu_env fsp_env; + struct dt_it *fsp_it; + struct lu_server_fld *fsp_fld; + unsigned int fsp_stop:1; +}; + +static void *fldb_seq_start(struct seq_file *p, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct dt_key *key; + int rc; + + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->load(¶m->fsp_env, param->fsp_it, *pos); + if (rc <= 0) + return NULL; + + key = iops->key(¶m->fsp_env, param->fsp_it); + if (IS_ERR(key)) + return NULL; + + *pos = be64_to_cpu(*(__u64 *)key); + + return param; +} + +static void fldb_seq_stop(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + const struct dt_it_ops *iops; + struct lu_server_fld *fld; + struct dt_object *obj; + + if (param == NULL) + return; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + iops->put(¶m->fsp_env, param->fsp_it); +} + +static void *fldb_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + int rc; + + ++*pos; + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->next(¶m->fsp_env, param->fsp_it); + if (rc > 0) { + param->fsp_stop = 1; + return NULL; + } + + *pos = be64_to_cpu(*(__u64 *)iops->key(¶m->fsp_env, param->fsp_it)); + return param; +} + +static int fldb_seq_show(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct fld_thread_info *info; + struct lu_seq_range *fld_rec; + int rc; + + if (param == NULL || param->fsp_stop) + return 0; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + info = lu_context_key_get(¶m->fsp_env.le_ctx, + &fld_thread_key); + fld_rec = &info->fti_rec; + rc = iops->rec(¶m->fsp_env, param->fsp_it, + (struct dt_rec *)fld_rec, 0); + if (rc != 0) { + CERROR("%s:read record error: rc %d\n", + fld->lsf_name, rc); + } else if (fld_rec->lsr_start != 0) { + range_be_to_cpu(fld_rec, fld_rec); + seq_printf(p, DRANGE"\n", PRANGE(fld_rec)); + } + + return rc; +} + +static const struct seq_operations fldb_sops = { + .start = fldb_seq_start, + .stop = fldb_seq_stop, + .next = fldb_seq_next, + .show = fldb_seq_show, +}; + +static int fldb_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct lu_server_fld *fld = inode->i_private; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct fld_seq_param *param = NULL; + int env_init = 0; + int rc; + + rc = seq_open(file, &fldb_sops); + if (rc) + GOTO(out, rc); + + obj = fld->lsf_obj; + if (obj == NULL) { + seq = 
file->private_data; + seq->private = NULL; + return 0; + } + + OBD_ALLOC_PTR(param); + if (param == NULL) + GOTO(out, rc = -ENOMEM); + + rc = lu_env_init(¶m->fsp_env, LCT_MD_THREAD); + if (rc != 0) + GOTO(out, rc); + + env_init = 1; + iops = &obj->do_index_ops->dio_it; + param->fsp_it = iops->init(¶m->fsp_env, obj, 0); + if (IS_ERR(param->fsp_it)) + GOTO(out, rc = PTR_ERR(param->fsp_it)); + + param->fsp_fld = fld; + param->fsp_stop = 0; + + seq = file->private_data; + seq->private = param; +out: + if (rc != 0) { + if (env_init == 1) + lu_env_fini(¶m->fsp_env); + if (param != NULL) + OBD_FREE_PTR(param); + } + return rc; +} + +static int fldb_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct fld_seq_param *param; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + + param = seq->private; + if (param == NULL) { + lprocfs_seq_release(inode, file); + return 0; + } + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + LASSERT(iops != NULL); + LASSERT(param->fsp_it != NULL); + iops->fini(¶m->fsp_env, param->fsp_it); + lu_env_fini(¶m->fsp_env); + OBD_FREE_PTR(param); + lprocfs_seq_release(inode, file); + + return 0; +} + +const struct file_operations fld_debugfs_seq_fops = { + .owner = THIS_MODULE, + .open = fldb_seq_open, + .read = seq_read, + .release = fldb_seq_release, +}; + +# endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/include/cl_object.h b/drivers/staging/lustrefsx/lustre/include/cl_object.h new file mode 100644 index 0000000000000..4d2d059e6243f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/cl_object.h @@ -0,0 +1,2710 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#ifndef _LUSTRE_CL_OBJECT_H +#define _LUSTRE_CL_OBJECT_H + +/** \defgroup clio clio + * + * Client objects implement io operations and cache pages. + * + * Examples: lov and osc are implementations of cl interface. + * + * Big Theory Statement. + * + * Layered objects. + * + * Client implementation is based on the following data-types: + * + * - cl_object + * + * - cl_page + * + * - cl_lock represents an extent lock on an object. + * + * - cl_io represents high-level i/o activity such as whole read/write + * system call, or write-out of pages from under the lock being + * canceled. cl_io has sub-ios that can be stopped and resumed + * independently, thus achieving high degree of transfer + * parallelism. 
Single cl_io can be advanced forward by + * the multiple threads (although in the most usual case of + * read/write system call it is associated with the single user + * thread, that issued the system call). + * + * Terminology + * + * - to avoid confusion high-level I/O operation like read or write system + * call is referred to as "an io", whereas low-level I/O operation, like + * RPC, is referred to as "a transfer" + * + * - "generic code" means generic (not file system specific) code in the + * hosting environment. "cl-code" means code (mostly in cl_*.c files) that + * is not layer specific. + * + * Locking. + * + * - i_mutex + * - PG_locked + * - cl_object_header::coh_page_guard + * - lu_site::ls_guard + * + * See the top comment in cl_object.c for the description of overall locking and + * reference-counting design. + * + * See comments below for the description of i/o, page, and dlm-locking + * design. + * + * @{ + */ + +/* + * super-class definitions. + */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct obd_info; +struct inode; + +struct cl_device; + +struct cl_object; + +struct cl_page; +struct cl_page_slice; +struct cl_lock; +struct cl_lock_slice; + +struct cl_lock_operations; +struct cl_page_operations; + +struct cl_io; +struct cl_io_slice; + +struct cl_req_attr; + +/** + * Device in the client stack. + * + * \see vvp_device, lov_device, lovsub_device, osc_device + */ +struct cl_device { + /** Super-class. */ + struct lu_device cd_lu_dev; +}; + +/** \addtogroup cl_object cl_object + * @{ */ +/** + * "Data attributes" of cl_object. Data attributes can be updated + * independently for a sub-object, and top-object's attributes are calculated + * from sub-objects' ones. + */ +struct cl_attr { + /** Object size, in bytes */ + loff_t cat_size; + /** + * Known minimal size, in bytes. + * + * This is only valid when at least one DLM lock is held. + */ + loff_t cat_kms; + /** Modification time. Measured in seconds since epoch. */ + time64_t cat_mtime; + /** Access time. Measured in seconds since epoch. */ + time64_t cat_atime; + /** Change time. Measured in seconds since epoch. */ + time64_t cat_ctime; + /** + * Blocks allocated to this cl_object on the server file system. + * + * \todo XXX An interface for block size is needed. + */ + __u64 cat_blocks; + /** + * User identifier for quota purposes. + */ + uid_t cat_uid; + /** + * Group identifier for quota purposes. + */ + gid_t cat_gid; + + /* nlink of the directory */ + __u64 cat_nlink; + + /* Project identifier for quota purpose. */ + __u32 cat_projid; +}; + +/** + * Fields in cl_attr that are being set. + */ +enum cl_attr_valid { + CAT_SIZE = BIT(0), + CAT_KMS = BIT(1), + CAT_MTIME = BIT(3), + CAT_ATIME = BIT(4), + CAT_CTIME = BIT(5), + CAT_BLOCKS = BIT(6), + CAT_UID = BIT(7), + CAT_GID = BIT(8), + CAT_PROJID = BIT(9), +}; + +/** + * Sub-class of lu_object with methods common for objects on the client + * stacks. + * + * cl_object: represents a regular file system object, both a file and a + * stripe. cl_object is based on lu_object: it is identified by a fid, + * layered, cached, hashed, and lrued. Important distinction with the server + * side, where md_object and dt_object are used, is that cl_object "fans out" + * at the lov/sns level: depending on the file layout, single file is + * represented as a set of "sub-objects" (stripes). At the implementation + * level, struct lov_object contains an array of cl_objects. 
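
[Editorial sketch, not part of this hunk] The cl_attr / cl_attr_valid pair defined above is used by callers that update only a subset of attributes and select the fields with a CAT_* mask. A minimal illustration, assuming the usual helper names (cl_object_attr_lock(), cl_object_attr_update(), cl_object_attr_unlock()) and honouring the rule, stated for coo_attr_update() below, that the attribute guard of the top-object is held across the update:

	static int example_set_size(const struct lu_env *env,
				    struct cl_object *obj, loff_t new_size)
	{
		struct cl_attr attr = {
			.cat_size  = new_size,
			.cat_mtime = ktime_get_real_seconds(),
		};
		int rc;

		cl_object_attr_lock(obj);		/* coh_attr_guard */
		rc = cl_object_attr_update(env, obj, &attr,
					   CAT_SIZE | CAT_MTIME);
		cl_object_attr_unlock(obj);
		return rc;
	}
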
Each sub-object + * is a full-fledged cl_object, having its fid, living in the lru and hash + * table. + * + * This leads to the next important difference with the server side: on the + * client, it's quite usual to have objects with the different sequence of + * layers. For example, typical top-object is composed of the following + * layers: + * + * - vvp + * - lov + * + * whereas its sub-objects are composed of + * + * - lovsub + * - osc + * + * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep + * track of the object-subobject relationship. + * + * Sub-objects are not cached independently: when top-object is about to + * be discarded from the memory, all its sub-objects are torn-down and + * destroyed too. + * + * \see vvp_object, lov_object, lovsub_object, osc_object + */ +struct cl_object { + /** super class */ + struct lu_object co_lu; + /** per-object-layer operations */ + const struct cl_object_operations *co_ops; + /** offset of page slice in cl_page buffer */ + int co_slice_off; +}; + +/** + * Description of the client object configuration. This is used for the + * creation of a new client object that is identified by a more state than + * fid. + */ +struct cl_object_conf { + /** Super-class. */ + struct lu_object_conf coc_lu; + union { + /** + * Object layout. This is consumed by lov. + */ + struct lu_buf coc_layout; + /** + * Description of particular stripe location in the + * cluster. This is consumed by osc. + */ + struct lov_oinfo *coc_oinfo; + } u; + /** + * VFS inode. This is consumed by vvp. + */ + struct inode *coc_inode; + /** + * Layout lock handle. + */ + struct ldlm_lock *coc_lock; + /** + * Operation to handle layout, OBJECT_CONF_XYZ. + */ + int coc_opc; +}; + +enum { + /** configure layout, set up a new stripe, must be called while + * holding layout lock. */ + OBJECT_CONF_SET = 0, + /** invalidate the current stripe configuration due to losing + * layout lock. */ + OBJECT_CONF_INVALIDATE = 1, + /** wait for old layout to go away so that new layout can be + * set up. */ + OBJECT_CONF_WAIT = 2 +}; + +enum { + CL_LAYOUT_GEN_NONE = (u32)-2, /* layout lock was cancelled */ + CL_LAYOUT_GEN_EMPTY = (u32)-1, /* for empty layout */ +}; + +struct cl_layout { + /** the buffer to return the layout in lov_mds_md format. */ + struct lu_buf cl_buf; + /** size of layout in lov_mds_md format. */ + size_t cl_size; + /** Layout generation. */ + u32 cl_layout_gen; + /** whether layout is a composite one */ + bool cl_is_composite; + /** Whether layout is a HSM released one */ + bool cl_is_released; +}; + +/** + * Operations implemented for each cl object layer. + * + * \see vvp_ops, lov_ops, lovsub_ops, osc_ops + */ +struct cl_object_operations { + /** + * Initialize page slice for this layer. Called top-to-bottom through + * every object layer when a new cl_page is instantiated. Layer + * keeping private per-page data, or requiring its own page operations + * vector should allocate these data here, and attach then to the page + * by calling cl_page_slice_add(). \a vmpage is locked (in the VM + * sense). Optional. + * + * \retval NULL success. + * + * \retval ERR_PTR(errno) failure code. + * + * \retval valid-pointer pointer to already existing referenced page + * to be used instead of newly created. + */ + int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); + /** + * Initialize lock slice for this layer. Called top-to-bottom through + * every object layer when a new cl_lock is instantiated. 
Layer + * keeping private per-lock data, or requiring its own lock operations + * vector should allocate these data here, and attach then to the lock + * by calling cl_lock_slice_add(). Mandatory. + */ + int (*coo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + /** + * Initialize io state for a given layer. + * + * called top-to-bottom once per io existence to initialize io + * state. If layer wants to keep some state for this type of io, it + * has to embed struct cl_io_slice in lu_env::le_ses, and register + * slice with cl_io_slice_add(). It is guaranteed that all threads + * participating in this io share the same session. + */ + int (*coo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + /** + * Fill portion of \a attr that this layer controls. This method is + * called top-to-bottom through all object layers. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return 0: to continue + * \return +ve: to stop iterating through layers (but 0 is returned + * from enclosing cl_object_attr_get()) + * \return -ve: to signal error + */ + int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); + /** + * Update attributes. + * + * \a valid is a bitmask composed from enum #cl_attr_valid, and + * indicating what attributes are to be set. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return the same convention as for + * cl_object_operations::coo_attr_get() is used. + */ + int (*coo_attr_update)(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); + /** + * Update object configuration. Called top-to-bottom to modify object + * configuration. + * + * XXX error conditions and handling. + */ + int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); + /** + * Glimpse ast. Executed when glimpse ast arrives for a lock on this + * object. Layers are supposed to fill parts of \a lvb that will be + * shipped to the glimpse originator as a glimpse result. + * + * \see vvp_object_glimpse(), lovsub_object_glimpse(), + * \see osc_object_glimpse() + */ + int (*coo_glimpse)(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb); + /** + * Object prune method. Called when the layout is going to change on + * this object, therefore each layer has to clean up their cache, + * mainly pages and locks. + */ + int (*coo_prune)(const struct lu_env *env, struct cl_object *obj); + /** + * Object getstripe method. + */ + int (*coo_getstripe)(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum, size_t size); + /** + * Get FIEMAP mapping from the object. + */ + int (*coo_fiemap)(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, + struct fiemap *fiemap, size_t *buflen); + /** + * Get layout and generation of the object. + */ + int (*coo_layout_get)(const struct lu_env *env, struct cl_object *obj, + struct cl_layout *layout); + /** + * Get maximum size of the object. + */ + loff_t (*coo_maxbytes)(struct cl_object *obj); + /** + * Set request attributes. + */ + void (*coo_req_attr_set)(const struct lu_env *env, + struct cl_object *obj, + struct cl_req_attr *attr); + /** + * Flush \a obj data corresponding to \a lock. Used for DoM + * locks in llite's cancelling blocking ast callback. 
+ */ + int (*coo_object_flush)(const struct lu_env *env, + struct cl_object *obj, + struct ldlm_lock *lock); +}; + +/** + * Extended header for client object. + */ +struct cl_object_header { + /** Standard lu_object_header. cl_object::co_lu::lo_header points + * here. */ + struct lu_object_header coh_lu; + + /** + * Parent object. It is assumed that an object has a well-defined + * parent, but not a well-defined child (there may be multiple + * sub-objects, for the same top-object). cl_object_header::coh_parent + * field allows certain code to be written generically, without + * limiting possible cl_object layouts unduly. + */ + struct cl_object_header *coh_parent; + /** + * Protects consistency between cl_attr of parent object and + * attributes of sub-objects, that the former is calculated ("merged") + * from. + * + * \todo XXX this can be read/write lock if needed. + */ + spinlock_t coh_attr_guard; + /** + * Size of cl_page + page slices + */ + unsigned short coh_page_bufsize; + /** + * Number of objects above this one: 0 for a top-object, 1 for its + * sub-object, etc. + */ + unsigned char coh_nesting; +}; + +/** + * Helper macro: iterate over all layers of the object \a obj, assigning every + * layer top-to-bottom to \a slice. + */ +#define cl_object_for_each(slice, obj) \ + list_for_each_entry((slice), \ + &(obj)->co_lu.lo_header->loh_layers,\ + co_lu.lo_linkage) + +/** + * Helper macro: iterate over all layers of the object \a obj, assigning every + * layer bottom-to-top to \a slice. + */ +#define cl_object_for_each_reverse(slice, obj) \ + list_for_each_entry_reverse((slice), \ + &(obj)->co_lu.lo_header->loh_layers,\ + co_lu.lo_linkage) + +/** @} cl_object */ + +#define CL_PAGE_EOF ((pgoff_t)~0ull) + +/** \addtogroup cl_page cl_page + * @{ */ + +/** \struct cl_page + * Layered client page. + * + * cl_page: represents a portion of a file, cached in the memory. All pages + * of the given file are of the same size, and are kept in the radix tree + * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects + * of the top-level file object are first class cl_objects, they have their + * own radix trees of pages and hence page is implemented as a sequence of + * struct cl_pages's, linked into double-linked list through + * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the + * corresponding radix tree at the corresponding logical offset. + * + * cl_page is associated with VM page of the hosting environment (struct + * page in Linux kernel, for example), struct page. It is assumed, that this + * association is implemented by one of cl_page layers (top layer in the + * current design) that + * + * - intercepts per-VM-page call-backs made by the environment (e.g., + * memory pressure), + * + * - translates state (page flag bits) and locking between lustre and + * environment. + * + * The association between cl_page and struct page is immutable and + * established when cl_page is created. + * + * cl_page can be "owned" by a particular cl_io (see below), guaranteeing + * this io an exclusive access to this page w.r.t. other io attempts and + * various events changing page state (such as transfer completion, or + * eviction of the page from the memory). Note, that in general cl_io + * cannot be identified with a particular thread, and page ownership is not + * exactly equal to the current thread holding a lock on the page. 
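
[Editorial sketch] The exclusive-ownership protocol described here is driven through cl_page_own()/cl_page_disown() (both discussed further below in this comment); their exact signatures are assumed in this illustration. An io that needs a cached page all to itself first takes ownership, works on the page, and then returns it to the cache:

	static int example_touch_page(const struct lu_env *env,
				      struct cl_io *io, struct cl_page *pg)
	{
		int rc;

		rc = cl_page_own(env, io, pg);	/* CPS_CACHED -> CPS_OWNED */
		if (rc != 0)
			return rc;		/* e.g. page is being freed */

		/* ... operate on pg; no other io can own it meanwhile ... */

		cl_page_disown(env, io, pg);	/* back to CPS_CACHED */
		return 0;
	}
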
Layer + * implementing association between cl_page and struct page has to implement + * ownership on top of available synchronization mechanisms. + * + * While lustre client maintains the notion of an page ownership by io, + * hosting MM/VM usually has its own page concurrency control + * mechanisms. For example, in Linux, page access is synchronized by the + * per-page PG_locked bit-lock, and generic kernel code (generic_file_*()) + * takes care to acquire and release such locks as necessary around the + * calls to the file system methods (->readpage(), ->prepare_write(), + * ->commit_write(), etc.). This leads to the situation when there are two + * different ways to own a page in the client: + * + * - client code explicitly and voluntary owns the page (cl_page_own()); + * + * - VM locks a page and then calls the client, that has "to assume" + * the ownership from the VM (cl_page_assume()). + * + * Dual methods to release ownership are cl_page_disown() and + * cl_page_unassume(). + * + * cl_page is reference counted (cl_page::cp_ref). When reference counter + * drops to 0, the page is returned to the cache, unless it is in + * cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * The general logic guaranteeing the absence of "existential races" for + * pages is the following: + * + * - there are fixed known ways for a thread to obtain a new reference + * to a page: + * + * - by doing a lookup in the cl_object radix tree, protected by the + * spin-lock; + * + * - by starting from VM-locked struct page and following some + * hosting environment method (e.g., following ->private pointer in + * the case of Linux kernel), see cl_vmpage_page(); + * + * - when the page enters cl_page_state::CPS_FREEING state, all these + * ways are severed with the proper synchronization + * (cl_page_delete()); + * + * - entry into cl_page_state::CPS_FREEING is serialized by the VM page + * lock; + * + * - no new references to the page in cl_page_state::CPS_FREEING state + * are allowed (checked in cl_page_get()). + * + * Together this guarantees that when last reference to a + * cl_page_state::CPS_FREEING page is released, it is safe to destroy the + * page, as neither references to it can be acquired at that point, nor + * ones exist. + * + * cl_page is a state machine. States are enumerated in enum + * cl_page_state. Possible state transitions are enumerated in + * cl_page_state_set(). State transition process (i.e., actual changing of + * cl_page::cp_state field) is protected by the lock on the underlying VM + * page. + * + * Linux Kernel implementation. + * + * Binding between cl_page and struct page (which is a typedef for + * struct page) is implemented in the vvp layer. cl_page is attached to the + * ->private pointer of the struct page, together with the setting of + * PG_private bit in page->flags, and acquiring additional reference on the + * struct page (much like struct buffer_head, or any similar file system + * private data structures). + * + * PG_locked lock is used to implement both ownership and transfer + * synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}} + * states. No additional references are acquired for the duration of the + * transfer. + * + * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where + * write-out is "protected" by the special PG_writeback bit. + */ + +/** + * States of cl_page. cl_page.c assumes particular order here. 
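
[Editorial sketch] The second ownership path described above, entered from the VM side, can be pictured as follows; cl_vmpage_page() and cl_page_assume()/cl_page_unassume() are the entry points named in this comment, while cl_page_put() as the reference-drop counterpart and the exact prototypes are assumptions:

	static int example_from_vmpage(const struct lu_env *env,
				       struct cl_io *io,
				       struct cl_object *obj,
				       struct page *vmpage)
	{
		struct cl_page *pg;

		/* vmpage is PG_locked here, so the cl_page binding is stable */
		pg = cl_vmpage_page(vmpage, obj);
		if (pg == NULL)
			return -ENOENT;		/* page is (being) deleted */

		cl_page_assume(env, io, pg);	/* take over VM's ownership */
		/* ... */
		cl_page_unassume(env, io, pg);
		cl_page_put(env, pg);		/* ref from cl_vmpage_page() */
		return 0;
	}
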
+ * + * The page state machine is rather crude, as it doesn't recognize finer page + * states like "dirty" or "up to date". This is because such states are not + * always well defined for the whole stack (see, for example, the + * implementation of the read-ahead, that hides page up-to-dateness to track + * cache hits accurately). Such sub-states are maintained by the layers that + * are interested in them. + */ +enum cl_page_state { + /** + * Page is in the cache, un-owned. Page leaves cached state in the + * following cases: + * + * - [cl_page_state::CPS_OWNED] io comes across the page and + * owns it; + * + * - [cl_page_state::CPS_PAGEOUT] page is dirty, the + * req-formation engine decides that it wants to include this page + * into an RPC being constructed, and yanks it from the cache; + * + * - [cl_page_state::CPS_FREEING] VM callback is executed to + * evict the page form the memory; + * + * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL + */ + CPS_CACHED = 1, + /** + * Page is exclusively owned by some cl_io. Page may end up in this + * state as a result of + * + * - io creating new page and immediately owning it; + * + * - [cl_page_state::CPS_CACHED] io finding existing cached page + * and owning it; + * + * - [cl_page_state::CPS_OWNED] io finding existing owned page + * and waiting for owner to release the page; + * + * Page leaves owned state in the following cases: + * + * - [cl_page_state::CPS_CACHED] io decides to leave the page in + * the cache, doing nothing; + * + * - [cl_page_state::CPS_PAGEIN] io starts read transfer for + * this page; + * + * - [cl_page_state::CPS_PAGEOUT] io starts immediate write + * transfer for this page; + * + * - [cl_page_state::CPS_FREEING] io decides to destroy this + * page (e.g., as part of truncate or extent lock cancellation). + * + * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL + */ + CPS_OWNED, + /** + * Page is being written out, as a part of a transfer. This state is + * entered when req-formation logic decided that it wants this page to + * be sent through the wire _now_. Specifically, it means that once + * this state is achieved, transfer completion handler (with either + * success or failure indication) is guaranteed to be executed against + * this page independently of any locks and any scheduling decisions + * made by the hosting environment (that effectively means that the + * page is never put into cl_page_state::CPS_PAGEOUT state "in + * advance". This property is mentioned, because it is important when + * reasoning about possible dead-locks in the system). The page can + * enter this state as a result of + * + * - [cl_page_state::CPS_OWNED] an io requesting an immediate + * write-out of this page, or + * + * - [cl_page_state::CPS_CACHED] req-forming engine deciding + * that it has enough dirty pages cached to issue a "good" + * transfer. + * + * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer + * is completed---it is moved into cl_page_state::CPS_CACHED state. + * + * Underlying VM page is locked for the duration of transfer. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL + */ + CPS_PAGEOUT, + /** + * Page is being read in, as a part of a transfer. This is quite + * similar to the cl_page_state::CPS_PAGEOUT state, except that + * read-in is always "immediate"---there is no such thing a sudden + * construction of read request from cached, presumably not up to date, + * pages. + * + * Underlying VM page is locked for the duration of transfer. 
+ * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL + */ + CPS_PAGEIN, + /** + * Page is being destroyed. This state is entered when client decides + * that page has to be deleted from its host object, as, e.g., a part + * of truncate. + * + * Once this state is reached, there is no way to escape it. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL + */ + CPS_FREEING, + CPS_NR +}; + +enum cl_page_type { + /** Host page, the page is from the host inode which the cl_page + * belongs to. */ + CPT_CACHEABLE = 1, + + /** Transient page, the transient cl_page is used to bind a cl_page + * to vmpage which is not belonging to the same object of cl_page. + * it is used in DirectIO and lockless IO. */ + CPT_TRANSIENT, + CPT_NR +}; + +#define CP_STATE_BITS 4 +#define CP_TYPE_BITS 2 +#define CP_MAX_LAYER 3 + +/** + * Fields are protected by the lock on struct page, except for atomics and + * immutables. + * + * \invariant Data type invariants are in cl_page_invariant(). Basically: + * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked + * list, consistent with the parent/child pointers in the cl_page::cp_obj and + * cl_page::cp_owner (when set). + */ +struct cl_page { + /** Reference counter. */ + atomic_t cp_ref; + /** layout_entry + stripe index, composed using lov_comp_index() */ + unsigned int cp_lov_index; + /** page->index of the page within the whole file */ + pgoff_t cp_page_index; + /** An object this page is a part of. Immutable after creation. */ + struct cl_object *cp_obj; + /** vmpage */ + struct page *cp_vmpage; + /** + * Assigned if doing direct IO, because in this case cp_vmpage is not + * a valid page cache page, hence the inode cannot be inferred from + * cp_vmpage->mapping->host. + */ + struct inode *cp_inode; + /** Linkage of pages within group. Pages must be owned */ + struct list_head cp_batch; + /** array of slices offset. Immutable after creation. */ + unsigned char cp_layer_offset[CP_MAX_LAYER]; /* 24 bits */ + /** current slice index */ + unsigned char cp_layer_count:2; /* 26 bits */ + /** + * Page state. This field is const to avoid accidental update, it is + * modified only internally within cl_page.c. Protected by a VM lock. + */ + enum cl_page_state cp_state:CP_STATE_BITS; /* 30 bits */ + /** + * Page type. Only CPT_TRANSIENT is used so far. Immutable after + * creation. + */ + enum cl_page_type cp_type:CP_TYPE_BITS; /* 32 bits */ + /* which slab kmem index this memory allocated from */ + short int cp_kmem_index; /* 48 bits */ + unsigned int cp_unused1:16; /* 64 bits */ + + /** + * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned + * by sub-io. Protected by a VM lock. + */ + struct cl_io *cp_owner; + /** List of references to this page, for debugging. */ + struct lu_ref cp_reference; + /** Link to an object, for debugging. */ + struct lu_ref_link cp_obj_ref; + /** Link to a queue, for debugging. */ + struct lu_ref_link cp_queue_ref; + /** Assigned if doing a sync_io */ + struct cl_sync_io *cp_sync_io; +}; + +/** + * Per-layer part of cl_page. + * + * \see vvp_page, lov_page, osc_page + */ +struct cl_page_slice { + struct cl_page *cpl_page; + /** + * Object slice corresponding to this page slice. Immutable after + * creation. + */ + struct cl_object *cpl_obj; + const struct cl_page_operations *cpl_ops; +}; + +/** + * Lock mode. For the client extent locks. 
+ * + * \ingroup cl_lock + */ +enum cl_lock_mode { + CLM_READ, + CLM_WRITE, + CLM_GROUP, + CLM_MAX, +}; + +/** + * Requested transfer type. + */ +enum cl_req_type { + CRT_READ, + CRT_WRITE, + CRT_NR +}; + +/** + * Per-layer page operations. + * + * Methods taking an \a io argument are for the activity happening in the + * context of given \a io. Page is assumed to be owned by that io, except for + * the obvious cases (like cl_page_operations::cpo_own()). + * + * \see vvp_page_ops, lov_page_ops, osc_page_ops + */ +struct cl_page_operations { + /** + * cl_page<->struct page methods. Only one layer in the stack has to + * implement these. Current code assumes that this functionality is + * provided by the topmost layer, see cl_page_disown0() as an example. + */ + + /** + * Called when \a io acquires this page into the exclusive + * ownership. When this method returns, it is guaranteed that the is + * not owned by other io, and no transfer is going on against + * it. Optional. + * + * \see cl_page_own() + * \see vvp_page_own(), lov_page_own() + */ + int (*cpo_own)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock); + /** Called when ownership it yielded. Optional. + * + * \see cl_page_disown() + * \see vvp_page_disown() + */ + void (*cpo_disown)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** + * Called for a page that is already "owned" by \a io from VM point of + * view. Optional. + * + * \see cl_page_assume() + * \see vvp_page_assume(), lov_page_assume() + */ + void (*cpo_assume)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** Dual to cl_page_operations::cpo_assume(). Optional. Called + * bottom-to-top when IO releases a page without actually unlocking + * it. + * + * \see cl_page_unassume() + * \see vvp_page_unassume() + */ + void (*cpo_unassume)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Announces whether the page contains valid data or not by \a uptodate. + * + * \see cl_page_export() + * \see vvp_page_export() + */ + void (*cpo_export)(const struct lu_env *env, + const struct cl_page_slice *slice, int uptodate); + /** + * Checks whether underlying VM page is locked (in the suitable + * sense). Used for assertions. + * + * \retval -EBUSY: page is protected by a lock of a given mode; + * \retval -ENODATA: page is not protected by a lock; + * \retval 0: this layer cannot decide. (Should never happen.) + */ + int (*cpo_is_vmlocked)(const struct lu_env *env, + const struct cl_page_slice *slice); + + /** + * Update file attributes when all we have is this page. Used for tiny + * writes to update attributes when we don't have a full cl_io. + */ + void (*cpo_page_touch)(const struct lu_env *env, + const struct cl_page_slice *slice, size_t to); + /** + * Page destruction. + */ + + /** + * Called when page is truncated from the object. Optional. + * + * \see cl_page_discard() + * \see vvp_page_discard(), osc_page_discard() + */ + void (*cpo_discard)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Called when page is removed from the cache, and is about to being + * destroyed. Optional. + * + * \see cl_page_delete() + * \see vvp_page_delete(), osc_page_delete() + */ + void (*cpo_delete)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** Destructor. Frees resources and slice itself. 
*/ + void (*cpo_fini)(const struct lu_env *env, + struct cl_page_slice *slice, + struct pagevec *pvec); + /** + * Optional debugging helper. Prints given page slice. + * + * \see cl_page_print() + */ + int (*cpo_print)(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t p); + /** + * \name transfer + * + * Transfer methods. + * + * @{ + */ + /** + * Request type dependent vector of operations. + * + * Transfer operations depend on transfer mode (cl_req_type). To avoid + * passing transfer mode to each and every of these methods, and to + * avoid branching on request type inside of the methods, separate + * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are + * provided. That is, method invocation usually looks like + * + * slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...); + */ + struct { + /** + * Called when a page is submitted for a transfer as a part of + * cl_page_list. + * + * \return 0 : page is eligible for submission; + * \return -EALREADY : skip this page; + * \return -ve : error. + * + * \see cl_page_prep() + */ + int (*cpo_prep)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Completion handler. This is guaranteed to be eventually + * fired after cl_page_operations::cpo_prep() or + * cl_page_operations::cpo_make_ready() call. + * + * This method can be called in a non-blocking context. It is + * guaranteed however, that the page involved and its object + * are pinned in memory (and, hence, calling cl_page_put() is + * safe). + * + * \see cl_page_completion() + */ + void (*cpo_completion)(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret); + /** + * Called when cached page is about to be added to the + * ptlrpc request as a part of req formation. + * + * \return 0 : proceed with this page; + * \return -EAGAIN : skip this page; + * \return -ve : error. + * + * \see cl_page_make_ready() + */ + int (*cpo_make_ready)(const struct lu_env *env, + const struct cl_page_slice *slice); + } io[CRT_NR]; + /** + * Tell transfer engine that only [to, from] part of a page should be + * transmitted. + * + * This is used for immediate transfers. + * + * \todo XXX this is not very good interface. It would be much better + * if all transfer parameters were supplied as arguments to + * cl_io_operations::cio_submit() call, but it is not clear how to do + * this for page queues. + * + * \see cl_page_clip() + */ + void (*cpo_clip)(const struct lu_env *env, + const struct cl_page_slice *slice, + int from, int to); + /** + * Write out a page by kernel. This is only called by ll_writepage + * right now. + * + * \see cl_page_flush() + */ + int (*cpo_flush)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** @} transfer */ +}; + +/** + * Helper macro, dumping detailed information about \a page into a log. + */ +#define CL_PAGE_DEBUG(mask, env, page, format, ...) \ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + cl_page_print(env, &msgdata, lu_cdebug_printer, page); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Helper macro, dumping shorter information about \a page into a log. + */ +#define CL_PAGE_HEADER(mask, env, page, format, ...) 
\ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +static inline struct page *cl_page_vmpage(const struct cl_page *page) +{ + LASSERT(page->cp_vmpage != NULL); + return page->cp_vmpage; +} + +/** + * Check if a cl_page is in use. + * + * Client cache holds a refcount, this refcount will be dropped when + * the page is taken out of cache, see vvp_page_delete(). + */ +static inline bool __page_in_use(const struct cl_page *page, int refc) +{ + return (atomic_read(&page->cp_ref) > refc + 1); +} + +/** + * Caller itself holds a refcount of cl_page. + */ +#define cl_page_in_use(pg) __page_in_use(pg, 1) +/** + * Caller doesn't hold a refcount. + */ +#define cl_page_in_use_noref(pg) __page_in_use(pg, 0) + +/** @} cl_page */ + +/** \addtogroup cl_lock cl_lock + * @{ */ +/** \struct cl_lock + * + * Extent locking on the client. + * + * LAYERING + * + * The locking model of the new client code is built around + * + * struct cl_lock + * + * data-type representing an extent lock on a regular file. cl_lock is a + * layered object (much like cl_object and cl_page), it consists of a header + * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to + * cl_lock::cll_layers list through cl_lock_slice::cls_linkage. + * + * Typical cl_lock consists of one layer: + * + * - lov_lock (lov specific data). + * + * lov_lock contains an array of sub-locks. Each of these sub-locks is a + * normal cl_lock: it has a header (struct cl_lock) and a list of layers: + * + * - osc_lock + * + * Each sub-lock is associated with a cl_object (representing stripe + * sub-object or the file to which top-level cl_lock is associated to), and is + * linked into that cl_object::coh_locks. In this respect cl_lock is similar to + * cl_object (that at lov layer also fans out into multiple sub-objects), and + * is different from cl_page, that doesn't fan out (there is usually exactly + * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock + * a "top-lock" and its lovsub-osc portion a "sub-lock". + * + * LIFE CYCLE + * + * cl_lock is a cacheless data container for the requirements of locks to + * complete the IO. cl_lock is created before I/O starts and destroyed when the + * I/O is complete. + * + * cl_lock depends on LDLM lock to fulfill lock semantics. LDLM lock is attached + * to cl_lock at OSC layer. LDLM lock is still cacheable. + * + * INTERFACE AND USAGE + * + * Two major methods are supported for cl_lock: clo_enqueue and clo_cancel. A + * cl_lock is enqueued by cl_lock_request(), which will call clo_enqueue() + * methods for each layer to enqueue the lock. At the LOV layer, if a cl_lock + * consists of multiple sub cl_locks, each sub locks will be enqueued + * correspondingly. At OSC layer, the lock enqueue request will tend to reuse + * cached LDLM lock; otherwise a new LDLM lock will have to be requested from + * OST side. + * + * cl_lock_cancel() must be called to release a cl_lock after use. clo_cancel() + * method will be called for each layer to release the resource held by this + * lock. At OSC layer, the reference count of LDLM lock, which is held at + * clo_enqueue time, is released. + * + * LDLM lock can only be canceled if there is no cl_lock using it. 
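
[Editorial sketch] Putting the enqueue/cancel interface just described into code: request an extent lock covering a page range, then cancel it once the IO no longer needs the range. The descriptor fields follow struct cl_lock_descr below; the exact prototypes of cl_lock_request()/cl_lock_cancel() and the final per-layer destructor step are assumptions:

	static int example_lock_extent(const struct lu_env *env,
				       struct cl_io *io, struct cl_object *obj,
				       pgoff_t start, pgoff_t end)
	{
		struct cl_lock lock;
		int rc;

		memset(&lock, 0, sizeof(lock));
		lock.cll_descr.cld_obj	     = obj;
		lock.cll_descr.cld_start     = start;
		lock.cll_descr.cld_end	     = end;
		lock.cll_descr.cld_mode	     = CLM_READ;
		lock.cll_descr.cld_enq_flags = 0;

		rc = cl_lock_request(env, io, &lock);	/* clo_enqueue() per layer */
		if (rc != 0)
			return rc;

		/* ... pages in [start, end] are covered by DLM lock(s) ... */

		cl_lock_cancel(env, &lock);	/* clo_cancel(): drop DLM lock ref */
		/* real code also runs the per-layer clo_fini() destructors here */
		return 0;
	}
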
+ * + * Overall process of the locking during IO operation is as following: + * + * - once parameters for IO are setup in cl_io, cl_io_operations::cio_lock() + * is called on each layer. Responsibility of this method is to add locks, + * needed by a given layer into cl_io.ci_lockset. + * + * - once locks for all layers were collected, they are sorted to avoid + * dead-locks (cl_io_locks_sort()), and enqueued. + * + * - when all locks are acquired, IO is performed; + * + * - locks are released after IO is complete. + * + * Striping introduces major additional complexity into locking. The + * fundamental problem is that it is generally unsafe to actively use (hold) + * two locks on the different OST servers at the same time, as this introduces + * inter-server dependency and can lead to cascading evictions. + * + * Basic solution is to sub-divide large read/write IOs into smaller pieces so + * that no multi-stripe locks are taken (note that this design abandons POSIX + * read/write semantics). Such pieces ideally can be executed concurrently. At + * the same time, certain types of IO cannot be sub-divived, without + * sacrificing correctness. This includes: + * + * - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee + * atomicity; + * + * - ftruncate(fd, offset), where [offset, EOF] lock has to be taken. + * + * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where + * buf is a part of memory mapped Lustre file, a lock or locks protecting buf + * has to be held together with the usual lock on [offset, offset + count]. + * + * Interaction with DLM + * + * In the expected setup, cl_lock is ultimately backed up by a collection of + * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is + * implemented in osc layer, that also matches DLM events (ASTs, cancellation, + * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed + * description of interaction with DLM. + */ + +/** + * Lock description. + */ +struct cl_lock_descr { + /** Object this lock is granted for. */ + struct cl_object *cld_obj; + /** Index of the first page protected by this lock. */ + pgoff_t cld_start; + /** Index of the last page (inclusive) protected by this lock. */ + pgoff_t cld_end; + /** Group ID, for group lock */ + __u64 cld_gid; + /** Lock mode. */ + enum cl_lock_mode cld_mode; + /** + * flags to enqueue lock. A combination of bit-flags from + * enum cl_enq_flags. + */ + __u32 cld_enq_flags; +}; + +#define DDESCR "%s(%d):[%lu, %lu]:%x" +#define PDESCR(descr) \ + cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode, \ + (descr)->cld_start, (descr)->cld_end, (descr)->cld_enq_flags + +const char *cl_lock_mode_name(const enum cl_lock_mode mode); + +/** + * Layered client lock. + */ +struct cl_lock { + /** List of slices. Immutable after creation. */ + struct list_head cll_layers; + /** lock attribute, extent, cl_object, etc. */ + struct cl_lock_descr cll_descr; +}; + +/** + * Per-layer part of cl_lock + * + * \see lov_lock, osc_lock + */ +struct cl_lock_slice { + struct cl_lock *cls_lock; + /** Object slice corresponding to this lock slice. Immutable after + * creation. */ + struct cl_object *cls_obj; + const struct cl_lock_operations *cls_ops; + /** Linkage into cl_lock::cll_layers. Immutable after creation. */ + struct list_head cls_linkage; +}; + +/** + * + * \see lov_lock_ops, osc_lock_ops + */ +struct cl_lock_operations { + /** @{ */ + /** + * Attempts to enqueue the lock. Called top-to-bottom. 
+ * + * \retval 0 this layer has enqueued the lock successfully + * \retval >0 this layer has enqueued the lock, but need to wait on + * @anchor for resources + * \retval -ve failure + * + * \see lov_lock_enqueue(), osc_lock_enqueue() + */ + int (*clo_enqueue)(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, struct cl_sync_io *anchor); + /** + * Cancel a lock, release its DLM lock ref, while does not cancel the + * DLM lock + */ + void (*clo_cancel)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** @} */ + /** + * Destructor. Frees resources and the slice. + * + * \see lov_lock_fini(), osc_lock_fini() + */ + void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice); + /** + * Optional debugging helper. Prints given lock slice. + */ + int (*clo_print)(const struct lu_env *env, + void *cookie, lu_printer_t p, + const struct cl_lock_slice *slice); +}; + +#define CL_LOCK_DEBUG(mask, env, lock, format, ...) \ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + cl_lock_print(env, &msgdata, lu_cdebug_printer, lock); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +#define CL_LOCK_ASSERT(expr, env, lock) do { \ + if (likely(expr)) \ + break; \ + \ + CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr); \ + LBUG(); \ +} while (0) + +/** @} cl_lock */ + +/** \addtogroup cl_page_list cl_page_list + * Page list used to perform collective operations on a group of pages. + * + * Pages are added to the list one by one. cl_page_list acquires a reference + * for every page in it. Page list is used to perform collective operations on + * pages: + * + * - submit pages for an immediate transfer, + * + * - own pages on behalf of certain io (waiting for each page in turn), + * + * - discard pages. + * + * When list is finalized, it releases references on all pages it still has. + * + * \todo XXX concurrency control. + * + * @{ + */ +struct cl_page_list { + unsigned pl_nr; + struct list_head pl_pages; +}; + +/** + * A 2-queue of pages. A convenience data-type for common use case, 2-queue + * contains an incoming page list and an outgoing page list. + */ +struct cl_2queue { + struct cl_page_list c2_qin; + struct cl_page_list c2_qout; +}; + +/** @} cl_page_list */ + +/** \addtogroup cl_io cl_io + * @{ */ +/** \struct cl_io + * I/O + * + * cl_io represents a high level I/O activity like + * read(2)/write(2)/truncate(2) system call, or cancellation of an extent + * lock. + * + * cl_io is a layered object, much like cl_{object,page,lock} but with one + * important distinction. We want to minimize number of calls to the allocator + * in the fast path, e.g., in the case of read(2) when everything is cached: + * client already owns the lock over region being read, and data are cached + * due to read-ahead. To avoid allocation of cl_io layers in such situations, + * per-layer io state is stored in the session, associated with the io, see + * struct {vvp,lov,osc}_io for example. Sessions allocation is amortized + * by using free-lists, see cl_env_get(). + * + * There is a small predefined number of possible io types, enumerated in enum + * cl_io_type. + * + * cl_io is a state machine, that can be advanced concurrently by the multiple + * threads. It is up to these threads to control the concurrency and, + * specifically, to detect when io is done, and its state can be safely + * released. 
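
[Editorial sketch] Before the numbered plan that follows, this is roughly how a top-level caller drives the io state machine for a read; cl_io_loop() is referenced further below in this header, while cl_io_init()/cl_io_fini() and the range fields are assumptions (the io is presumed to arrive zero-initialized from a per-thread slot):

	static int example_drive_read(const struct lu_env *env, struct cl_io *io,
				      struct cl_object *obj,
				      loff_t pos, size_t count)
	{
		int rc;

		io->u.ci_rw.crw_pos   = pos;
		io->u.ci_rw.crw_count = count;

		rc = cl_io_init(env, io, CIT_READ, obj); /* per-layer coo_io_init() */
		if (rc == 0)
			/* iter_init, lock, start, end, unlock, iter_fini, ... */
			rc = cl_io_loop(env, io);
		cl_io_fini(env, io);			 /* bottom-to-top cio_fini() */
		return rc;
	}
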
+ * + * For read/write io overall execution plan is as following: + * + * (0) initialize io state through all layers; + * + * (1) loop: prepare chunk of work to do + * + * (2) call all layers to collect locks they need to process current chunk + * + * (3) sort all locks to avoid dead-locks, and acquire them + * + * (4) process the chunk: call per-page methods + * cl_io_operations::cio_prepare_write(), + * cl_io_operations::cio_commit_write() for write) + * + * (5) release locks + * + * (6) repeat loop. + * + * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to + * address allocation efficiency issues mentioned above), and returns with the + * special error condition from per-page method when current sub-io has to + * block. This causes io loop to be repeated, and lov switches to the next + * sub-io in its cl_io_operations::cio_iter_init() implementation. + */ + +/** IO types */ +enum cl_io_type { + /** read system call */ + CIT_READ = 1, + /** write system call */ + CIT_WRITE, + /** truncate, utime system calls */ + CIT_SETATTR, + /** get data version */ + CIT_DATA_VERSION, + /** + * page fault handling + */ + CIT_FAULT, + /** + * fsync system call handling + * To write out a range of file + */ + CIT_FSYNC, + /** + * glimpse. An io context to acquire glimpse lock. + */ + CIT_GLIMPSE, + /** + * Miscellaneous io. This is used for occasional io activity that + * doesn't fit into other types. Currently this is used for: + * + * - cancellation of an extent lock. This io exists as a context + * to write dirty pages from under the lock being canceled back + * to the server; + * + * - VM induced page write-out. An io context for writing page out + * for memory cleansing; + * + * - grouplock. An io context to acquire group lock. + * + * CIT_MISC io is used simply as a context in which locks and pages + * are manipulated. Such io has no internal "process", that is, + * cl_io_loop() is never called for it. + */ + CIT_MISC, + /** + * ladvise handling + * To give advice about access of a file + */ + CIT_LADVISE, + /** + * SEEK_HOLE/SEEK_DATA handling to search holes or data + * across all file objects + */ + CIT_LSEEK, + CIT_OP_NR +}; + +/** + * States of cl_io state machine + */ +enum cl_io_state { + /** Not initialized. */ + CIS_ZERO, + /** Initialized. */ + CIS_INIT, + /** IO iteration started. */ + CIS_IT_STARTED, + /** Locks taken. */ + CIS_LOCKED, + /** Actual IO is in progress. */ + CIS_IO_GOING, + /** IO for the current iteration finished. */ + CIS_IO_FINISHED, + /** Locks released. */ + CIS_UNLOCKED, + /** Iteration completed. */ + CIS_IT_ENDED, + /** cl_io finalized. */ + CIS_FINI +}; + +/** + * IO state private for a layer. + * + * This is usually embedded into layer session data, rather than allocated + * dynamically. + * + * \see vvp_io, lov_io, osc_io + */ +struct cl_io_slice { + struct cl_io *cis_io; + /** corresponding object slice. Immutable after creation. */ + struct cl_object *cis_obj; + /** io operations. Immutable after creation. */ + const struct cl_io_operations *cis_iop; + /** + * linkage into a list of all slices for a given cl_io, hanging off + * cl_io::ci_layers. Immutable after creation. + */ + struct list_head cis_linkage; +}; + +typedef void (*cl_commit_cbt)(const struct lu_env *, struct cl_io *, + struct pagevec *); + +struct cl_read_ahead { + /* Maximum page index the readahead window will end. + * This is determined DLM lock coverage, RPC and stripe boundary. + * cra_end is included. 
*/ + pgoff_t cra_end_idx; + /* optimal RPC size for this read, by pages */ + unsigned long cra_rpc_pages; + /* Release callback. If readahead holds resources underneath, this + * function should be called to release it. */ + void (*cra_release)(const struct lu_env *env, + struct cl_read_ahead *ra); + + /* Callback data for cra_release routine */ + void *cra_dlmlock; + void *cra_oio; + + /* whether lock is in contention */ + bool cra_contention; +}; + +static inline void cl_read_ahead_release(const struct lu_env *env, + struct cl_read_ahead *ra) +{ + if (ra->cra_release != NULL) + ra->cra_release(env, ra); + memset(ra, 0, sizeof(*ra)); +} + + +/** + * Per-layer io operations. + * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops + */ +struct cl_io_operations { + /** + * Vector of io state transition methods for every io type. + * + * \see cl_page_operations::io + */ + struct { + /** + * Prepare io iteration at a given layer. + * + * Called top-to-bottom at the beginning of each iteration of + * "io loop" (if it makes sense for this type of io). Here + * layer selects what work it will do during this iteration. + * + * \see cl_io_operations::cio_iter_fini() + */ + int (*cio_iter_init) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Finalize io iteration. + * + * Called bottom-to-top at the end of each iteration of "io + * loop". Here layers can decide whether IO has to be + * continued. + * + * \see cl_io_operations::cio_iter_init() + */ + void (*cio_iter_fini) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Collect locks for the current iteration of io. + * + * Called top-to-bottom to collect all locks necessary for + * this iteration. This methods shouldn't actually enqueue + * anything, instead it should post a lock through + * cl_io_lock_add(). Once all locks are collected, they are + * sorted and enqueued in the proper order. + */ + int (*cio_lock) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Finalize unlocking. + * + * Called bottom-to-top to finish layer specific unlocking + * functionality, after generic code released all locks + * acquired by cl_io_operations::cio_lock(). + */ + void (*cio_unlock)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Start io iteration. + * + * Once all locks are acquired, called top-to-bottom to + * commence actual IO. In the current implementation, + * top-level vvp_io_{read,write}_start() does all the work + * synchronously by calling generic_file_*(), so other layers + * are called when everything is done. + */ + int (*cio_start)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Called top-to-bottom at the end of io loop. Here layer + * might wait for an unfinished asynchronous io. + */ + void (*cio_end) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Called bottom-to-top to notify layers that read/write IO + * iteration finished, with \a nob bytes transferred. + */ + void (*cio_advance)(const struct lu_env *env, + const struct cl_io_slice *slice, + size_t nob); + /** + * Called once per io, bottom-to-top to release io resources. + */ + void (*cio_fini) (const struct lu_env *env, + const struct cl_io_slice *slice); + } op[CIT_OP_NR]; + + /** + * Submit pages from \a queue->c2_qin for IO, and move + * successfully submitted pages into \a queue->c2_qout. Return + * non-zero if failed to submit even the single page. 
If + * submission failed after some pages were moved into \a + * queue->c2_qout, completion callback with non-zero ioret is + * executed on them. + */ + int (*cio_submit)(const struct lu_env *env, + const struct cl_io_slice *slice, + enum cl_req_type crt, + struct cl_2queue *queue); + /** + * Queue async page for write. + * The difference between cio_submit and cio_queue is that + * cio_submit is for urgent request. + */ + int (*cio_commit_async)(const struct lu_env *env, + const struct cl_io_slice *slice, + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb); + /** + * Release active extent. + */ + void (*cio_extent_release)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Decide maximum read ahead extent + * + * \pre io->ci_type == CIT_READ + */ + int (*cio_read_ahead)(const struct lu_env *env, + const struct cl_io_slice *slice, + pgoff_t start, struct cl_read_ahead *ra); + /** + * + * Reserve LRU slots before IO. + */ + int (*cio_lru_reserve) (const struct lu_env *env, + const struct cl_io_slice *slice, + loff_t pos, size_t bytes); + /** + * Optional debugging helper. Print given io slice. + */ + int (*cio_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_io_slice *slice); +}; + +/** + * Flags to lock enqueue procedure. + * \ingroup cl_lock + */ +enum cl_enq_flags { + /** + * instruct server to not block, if conflicting lock is found. Instead + * -EAGAIN is returned immediately. + */ + CEF_NONBLOCK = 0x00000001, + /** + * Tell lower layers this is a glimpse request, translated to + * LDLM_FL_HAS_INTENT at LDLM layer. + * + * Also, because glimpse locks never block other locks, we count this + * as automatically compatible with other osc locks. + * (see osc_lock_compatible) + */ + CEF_GLIMPSE = 0x00000002, + /** + * tell the server to instruct (though a flag in the blocking ast) an + * owner of the conflicting lock, that it can drop dirty pages + * protected by this lock, without sending them to the server. + */ + CEF_DISCARD_DATA = 0x00000004, + /** + * tell the sub layers that it must be a `real' lock. This is used for + * mmapped-buffer locks, glimpse locks, manually requested locks + * (LU_LADVISE_LOCKAHEAD) that must never be converted into lockless + * mode. + * + * \see vvp_mmap_locks(), cl_glimpse_lock, cl_request_lock(). + */ + CEF_MUST = 0x00000008, + /** + * tell the sub layers that never request a `real' lock. This flag is + * not used currently. + * + * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless + * conversion policy: ci_lockreq describes generic information of lock + * requirement for this IO, especially for locks which belong to the + * object doing IO; however, lock itself may have precise requirements + * that are described by the enqueue flags. + */ + CEF_NEVER = 0x00000010, + /** + * tell the dlm layer this is a speculative lock request + * speculative lock requests are locks which are not requested as part + * of an I/O operation. Instead, they are requested because we expect + * to use them in the future. They are requested asynchronously at the + * ptlrpc layer. + * + * Currently used for asynchronous glimpse locks and manually requested + * locks (LU_LADVISE_LOCKAHEAD). + */ + CEF_SPECULATIVE = 0x00000020, + /** + * enqueue a lock to test DLM lock existence. + */ + CEF_PEEK = 0x00000040, + /** + * Lock match only. Used by group lock in I/O as group lock + * is known to exist. 
+ */ + CEF_LOCK_MATCH = 0x00000080, + /** + * tell the DLM layer to lock only the requested range + */ + CEF_LOCK_NO_EXPAND = 0x00000100, + /** + * mask of enq_flags. + */ + CEF_MASK = 0x000001ff, +}; + +/** + * Link between lock and io. Intermediate structure is needed, because the + * same lock can be part of multiple io's simultaneously. + */ +struct cl_io_lock_link { + /** linkage into one of cl_lockset lists. */ + struct list_head cill_linkage; + struct cl_lock cill_lock; + /** optional destructor */ + void (*cill_fini)(const struct lu_env *env, + struct cl_io_lock_link *link); +}; +#define cill_descr cill_lock.cll_descr + +/** + * Lock-set represents a collection of locks, that io needs at a + * time. Generally speaking, client tries to avoid holding multiple locks when + * possible, because + * + * - holding extent locks over multiple ost's introduces the danger of + * "cascading timeouts"; + * + * - holding multiple locks over the same ost is still dead-lock prone, + * see comment in osc_lock_enqueue(), + * + * but there are certain situations where this is unavoidable: + * + * - O_APPEND writes have to take [0, EOF] lock for correctness; + * + * - truncate has to take [new-size, EOF] lock for correctness; + * + * - SNS has to take locks across full stripe for correctness; + * + * - in the case when user level buffer, supplied to {read,write}(file0), + * is a part of a memory mapped lustre file, client has to take a dlm + * locks on file0, and all files that back up the buffer (or a part of + * the buffer, that is being processed in the current chunk, in any + * case, there are situations where at least 2 locks are necessary). + * + * In such cases we at least try to take locks in the same consistent + * order. To this end, all locks are first collected, then sorted, and then + * enqueued. + */ +struct cl_lockset { + /** locks to be acquired. */ + struct list_head cls_todo; + /** locks acquired. */ + struct list_head cls_done; +}; + +/** + * Lock requirements(demand) for IO. It should be cl_io_lock_req, + * but 'req' is always to be thought as 'request' :-) + */ +enum cl_io_lock_dmd { + /** Always lock data (e.g., O_APPEND). */ + CILR_MANDATORY = 0, + /** Layers are free to decide between local and global locking. */ + CILR_MAYBE, + /** Never lock: there is no cache (e.g., liblustre). */ + CILR_NEVER +}; + +enum cl_fsync_mode { + /** start writeback, do not wait for them to finish */ + CL_FSYNC_NONE = 0, + /** start writeback and wait for them to finish */ + CL_FSYNC_LOCAL = 1, + /** discard all of dirty pages in a specific file range */ + CL_FSYNC_DISCARD = 2, + /** start writeback and make sure they have reached storage before + * return. OST_SYNC RPC must be issued and finished */ + CL_FSYNC_ALL = 3 +}; + +struct cl_io_rw_common { + loff_t crw_pos; + size_t crw_count; + int crw_nonblock; +}; +enum cl_setattr_subtype { + /** regular setattr **/ + CL_SETATTR_REG = 1, + /** truncate(2) **/ + CL_SETATTR_TRUNC, + /** fallocate(2) - mode preallocate **/ + CL_SETATTR_FALLOCATE +}; + +struct cl_io_range { + loff_t cir_pos; + size_t cir_count; +}; + +struct cl_io_pt { + struct cl_io_pt *cip_next; + struct kiocb cip_iocb; + struct iov_iter cip_iter; + struct file *cip_file; + enum cl_io_type cip_iot; + unsigned int cip_need_restart:1; + loff_t cip_pos; + size_t cip_count; + ssize_t cip_result; +}; + +/** + * State for io. 
+ * + * cl_io is shared by all threads participating in this IO (in current + * implementation only one thread advances IO, but parallel IO design and + * concurrent copy_*_user() require multiple threads acting on the same IO. It + * is up to these threads to serialize their activities, including updates to + * mutable cl_io fields. + */ +struct cl_io { + /** type of this IO. Immutable after creation. */ + enum cl_io_type ci_type; + /** current state of cl_io state machine. */ + enum cl_io_state ci_state; + /** main object this io is against. Immutable after creation. */ + struct cl_object *ci_obj; + /** top level dio_aio */ + struct cl_dio_aio *ci_dio_aio; + /** + * Upper layer io, of which this io is a part of. Immutable after + * creation. + */ + struct cl_io *ci_parent; + /** List of slices. Immutable after creation. */ + struct list_head ci_layers; + /** list of locks (to be) acquired by this io. */ + struct cl_lockset ci_lockset; + /** lock requirements, this is just a help info for sublayers. */ + enum cl_io_lock_dmd ci_lockreq; + /** layout version when this IO occurs */ + __u32 ci_layout_version; + union { + struct cl_rd_io { + struct cl_io_rw_common rd; + } ci_rd; + struct cl_wr_io { + struct cl_io_rw_common wr; + int wr_append; + int wr_sync; + } ci_wr; + struct cl_io_rw_common ci_rw; + struct cl_setattr_io { + struct ost_lvb sa_attr; + unsigned int sa_attr_flags; + unsigned int sa_avalid; /* ATTR_* */ + unsigned int sa_xvalid; /* OP_XVALID */ + int sa_stripe_index; + struct ost_layout sa_layout; + const struct lu_fid *sa_parent_fid; + /* SETATTR interface is used for regular setattr, */ + /* truncate(2) and fallocate(2) subtypes */ + enum cl_setattr_subtype sa_subtype; + /* The following are used for fallocate(2) */ + int sa_falloc_mode; + loff_t sa_falloc_offset; + loff_t sa_falloc_end; + uid_t sa_falloc_uid; + gid_t sa_falloc_gid; + } ci_setattr; + struct cl_data_version_io { + u64 dv_data_version; + u32 dv_layout_version; + int dv_flags; + } ci_data_version; + struct cl_fault_io { + /** page index within file. */ + pgoff_t ft_index; + /** bytes valid byte on a faulted page. */ + size_t ft_nob; + /** writable page? for nopage() only */ + int ft_writable; + /** page of an executable? */ + int ft_executable; + /** page_mkwrite() */ + int ft_mkwrite; + /** resulting page */ + struct cl_page *ft_page; + } ci_fault; + struct cl_fsync_io { + loff_t fi_start; + loff_t fi_end; + /** file system level fid */ + struct lu_fid *fi_fid; + enum cl_fsync_mode fi_mode; + /* how many pages were written/discarded */ + unsigned int fi_nr_written; + } ci_fsync; + struct cl_ladvise_io { + __u64 li_start; + __u64 li_end; + /** file system level fid */ + struct lu_fid *li_fid; + enum lu_ladvise_type li_advice; + __u64 li_flags; + } ci_ladvise; + struct cl_lseek_io { + loff_t ls_start; + loff_t ls_result; + int ls_whence; + } ci_lseek; + struct cl_misc_io { + time64_t lm_next_rpc_time; + } ci_misc; + } u; + struct cl_2queue ci_queue; + size_t ci_nob; + int ci_result; + unsigned int ci_continue:1, + /** + * This io has held grouplock, to inform sublayers that + * don't do lockless i/o. + */ + ci_no_srvlock:1, + /** + * The whole IO need to be restarted because layout has been changed + */ + ci_need_restart:1, + /** + * to not refresh layout - the IO issuer knows that the layout won't + * change(page operations, layout change causes all page to be + * discarded), or it doesn't matter if it changes(sync). + */ + ci_ignore_layout:1, + /** + * Need MDS intervention to complete a write. 
+ * Write intent is required for the following cases:
+ * 1. component being written is not initialized, or
+ * 2. the mirrored files are NOT in WRITE_PENDING state.
+ */
+ ci_need_write_intent:1,
+ /**
+ * Check if layout changed after the IO finishes. Mainly for HSM
+ * requirement. If IO occurs on open files, it doesn't need to
+ * verify layout because HSM won't release open files.
+ * Right now, only two operations need to verify layout: glimpse
+ * and setattr.
+ */
+ ci_verify_layout:1,
+ /**
+ * file is released, restore has to be triggered by vvp layer
+ */
+ ci_restore_needed:1,
+ /**
+ * O_NOATIME
+ */
+ ci_noatime:1,
+ /* Tell sublayers not to expand LDLM locks requested for this IO */
+ ci_lock_no_expand:1,
+ /**
+ * Set if non-delay RPC should be used for this IO.
+ *
+ * If this file has multiple mirrors, and if the OSTs of the current
+ * mirror are inaccessible, non-delay RPC would error out quickly so
+ * that the upper layer can try to access the next mirror.
+ */
+ ci_ndelay:1,
+ /**
+ * Set if IO is triggered by async workqueue readahead.
+ */
+ ci_async_readahead:1,
+ /**
+ * Ignore lockless and do normal locking for this io.
+ */
+ ci_dio_lock:1,
+ /**
+ * Set if we've tried all mirrors for this read IO; if it's not set,
+ * the read IO will check to-be-read OSCs' status, and fast-switch to
+ * another mirror if some of the OSTs are not healthy.
+ */
+ ci_tried_all_mirrors:1,
+ /**
+ * Random read hint; readahead will be disabled.
+ */
+ ci_rand_read:1,
+ /**
+ * Sequential read hint.
+ */
+ ci_seq_read:1,
+ /**
+ * Do parallel (async) submission of DIO RPCs. Note DIO is still sync
+ * to userspace, only the RPCs are submitted async, then waited for at
+ * the llite layer before returning.
+ */
+ ci_parallel_dio:1;
+ /**
+ * Bypass quota check
+ */
+ unsigned ci_noquota:1,
+ /**
+ * The filesystem must exclusively acquire invalidate_lock before
+ * invalidating page cache in truncate / hole punch / DLM extent
+ * lock blocking AST path (and thus calling into ->invalidatepage)
+ * to block races between page cache invalidation and page cache
+ * filling functions (fault, read, ...)
+ */
+ ci_invalidate_page_cache:1;
+
+ /**
+ * How many times the read has retried before this one.
+ * Set by the top level and consumed by the LOV.
+ */
+ unsigned ci_ndelay_tried;
+ /**
+ * Designated mirror index for this I/O.
+ */
+ unsigned ci_designated_mirror;
+ /**
+ * Number of pages owned by this IO. For invariant checking.
+ */
+ unsigned ci_owned_nr;
+ /**
+ * Range of write intent. Valid if ci_need_write_intent is set.
+ */
+ struct lu_extent ci_write_intent;
+};
+
+/** @} cl_io */
+
+/**
+ * Per-transfer attributes.
+ */
+struct cl_req_attr {
+ enum cl_req_type cra_type;
+ u64 cra_flags;
+ struct cl_page *cra_page;
+ /** Generic attributes for the server consumption. */
+ struct obdo *cra_oa;
+ /** Jobid */
+ char cra_jobid[LUSTRE_JOBID_SIZE];
+};
+
+enum cache_stats_item {
+ /** how many cache lookups were performed */
+ CS_lookup = 0,
+ /** how many times cache lookup resulted in a hit */
+ CS_hit,
+ /** how many entities are in the cache right now */
+ CS_total,
+ /** how many entities in the cache are actively used (and cannot be
+ * evicted) right now */
+ CS_busy,
+ /** how many entities were created at all */
+ CS_create,
+ CS_NR
+};
+
+#define CS_NAMES { "lookup", "hit", "total", "busy", "create" }
+
+/**
+ * Stats for a generic cache (similar to inode, lu_object, etc. caches).
+ */ +struct cache_stats { + const char *cs_name; + atomic_t cs_stats[CS_NR]; +}; + +/** These are not exported so far */ +void cache_stats_init (struct cache_stats *cs, const char *name); + +/** + * Client-side site. This represents particular client stack. "Global" + * variables should (directly or indirectly) be added here to allow multiple + * clients to co-exist in the single address space. + */ +struct cl_site { + struct lu_site cs_lu; + /** + * Statistical counters. Atomics do not scale, something better like + * per-cpu counters is needed. + * + * These are exported as /proc/fs/lustre/llite/.../site + * + * When interpreting keep in mind that both sub-locks (and sub-pages) + * and top-locks (and top-pages) are accounted here. + */ + struct cache_stats cs_pages; + atomic_t cs_pages_state[CPS_NR]; +}; + +int cl_site_init(struct cl_site *s, struct cl_device *top); +void cl_site_fini(struct cl_site *s); +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl); + +/** + * Output client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *site, struct seq_file *m); + +/** + * \name helpers + * + * Type conversion and accessory functions. + */ +/** @{ */ + +static inline struct cl_site *lu2cl_site(const struct lu_site *site) +{ + return container_of(site, struct cl_site, cs_lu); +} + +static inline struct cl_device *lu2cl_dev(const struct lu_device *d) +{ + LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d)); + return container_of_safe(d, struct cl_device, cd_lu_dev); +} + +static inline struct lu_device *cl2lu_dev(struct cl_device *d) +{ + return &d->cd_lu_dev; +} + +static inline struct cl_object *lu2cl(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev)); + return container_of_safe(o, struct cl_object, co_lu); +} + +static inline const struct cl_object_conf * +lu2cl_conf(const struct lu_object_conf *conf) +{ + return container_of_safe(conf, struct cl_object_conf, coc_lu); +} + +static inline struct cl_object *cl_object_next(const struct cl_object *obj) +{ + return obj ? 
lu2cl(lu_object_next(&obj->co_lu)) : NULL; +} + +static inline struct cl_object_header *luh2coh(const struct lu_object_header *h) +{ + return container_of_safe(h, struct cl_object_header, coh_lu); +} + +static inline struct cl_site *cl_object_site(const struct cl_object *obj) +{ + return lu2cl_site(obj->co_lu.lo_dev->ld_site); +} + +static inline +struct cl_object_header *cl_object_header(const struct cl_object *obj) +{ + return luh2coh(obj->co_lu.lo_header); +} + +static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t) +{ + return lu_device_init(&d->cd_lu_dev, t); +} + +static inline void cl_device_fini(struct cl_device *d) +{ + lu_device_fini(&d->cd_lu_dev); +} + +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, + const struct cl_page_operations *ops); +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops); +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, const struct cl_io_operations *ops); +/** @} helpers */ + +/** \defgroup cl_object cl_object + * @{ */ +struct cl_object *cl_object_top (struct cl_object *o); +struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd, + const struct lu_fid *fid, + const struct cl_object_conf *c); + +int cl_object_header_init(struct cl_object_header *h); +void cl_object_header_fini(struct cl_object_header *h); +void cl_object_put (const struct lu_env *env, struct cl_object *o); +void cl_object_get (struct cl_object *o); +void cl_object_attr_lock (struct cl_object *o); +void cl_object_attr_unlock(struct cl_object *o); +int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +int cl_object_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int cl_object_glimpse (const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb); +int cl_conf_set (const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); +int cl_object_prune (const struct lu_env *env, struct cl_object *obj); +void cl_object_kill (const struct lu_env *env, struct cl_object *obj); +int cl_object_getstripe(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum, size_t size); +int cl_object_fiemap(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, struct fiemap *fiemap, + size_t *buflen); +int cl_object_layout_get(const struct lu_env *env, struct cl_object *obj, + struct cl_layout *cl); +loff_t cl_object_maxbytes(struct cl_object *obj); +int cl_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock); + + +/** + * Returns true, iff \a o0 and \a o1 are slices of the same object. + */ +static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1) +{ + return cl_object_header(o0) == cl_object_header(o1); +} + +static inline void cl_object_page_init(struct cl_object *clob, int size) +{ + clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize; + cl_object_header(clob)->coh_page_bufsize += cfs_size_round(size); + WARN_ON(cl_object_header(clob)->coh_page_bufsize > 512); +} + +static inline void *cl_object_page_slice(struct cl_object *clob, + struct cl_page *page) +{ + return (void *)((char *)page + clob->co_slice_off); +} + +/** + * Return refcount of cl_object. 
+ */ +static inline int cl_object_refc(struct cl_object *clob) +{ + struct lu_object_header *header = clob->co_lu.lo_header; + return atomic_read(&header->loh_ref); +} + +/** @} cl_object */ + +/** \defgroup cl_page cl_page + * @{ */ +struct cl_page *cl_page_find (const struct lu_env *env, + struct cl_object *obj, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type); +struct cl_page *cl_page_alloc (const struct lu_env *env, + struct cl_object *o, pgoff_t ind, + struct page *vmpage, + enum cl_page_type type); +void cl_page_get (struct cl_page *page); +void cl_page_put (const struct lu_env *env, + struct cl_page *page); +void cl_pagevec_put (const struct lu_env *env, + struct cl_page *page, + struct pagevec *pvec); +void cl_page_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +struct cl_page *cl_vmpage_page (struct page *vmpage, struct cl_object *obj); +struct cl_page *cl_page_top (struct cl_page *page); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype); + +/** + * \name ownership + * + * Functions dealing with the ownership of page by io. + */ +/** @{ */ + +int cl_page_own (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_own_try (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_unassume (const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); +void cl_page_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_is_owned (const struct cl_page *pg, const struct cl_io *io); + +/** @} ownership */ + +/** + * \name transfer + * + * Functions dealing with the preparation of a page for a transfer, and + * tracking transfer state. + */ +/** @{ */ +int cl_page_prep (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_completion (const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret); +int cl_page_make_ready (const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt); +int cl_page_cache_add (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_clip (const struct lu_env *env, struct cl_page *pg, + int from, int to); +int cl_page_flush (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); + +/** @} transfer */ + + +/** + * \name helper routines + * Functions to discard, delete and export a cl_page. 
+ */ +/** @{ */ +void cl_page_discard(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); +void cl_page_delete(const struct lu_env *env, struct cl_page *pg); +int cl_page_is_vmlocked(const struct lu_env *env, + const struct cl_page *pg); +void cl_page_touch(const struct lu_env *env, const struct cl_page *pg, + size_t to); +void cl_page_export(const struct lu_env *env, + struct cl_page *pg, int uptodate); +loff_t cl_offset(const struct cl_object *obj, pgoff_t idx); +pgoff_t cl_index(const struct cl_object *obj, loff_t offset); +size_t cl_page_size(const struct cl_object *obj); + +void cl_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock); +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr); +/* @} helper */ + +/** + * Data structure managing a client's cached pages. A count of + * "unstable" pages is maintained, and an LRU of clean pages is + * maintained. "unstable" pages are pages pinned by the ptlrpc + * layer for recovery purposes. + */ +struct cl_client_cache { + /** + * # of client cache refcount + * # of users (OSCs) + 2 (held by llite and lov) + */ + atomic_t ccc_users; + /** + * # of threads are doing shrinking + */ + unsigned int ccc_lru_shrinkers; + /** + * # of LRU entries available + */ + atomic_long_t ccc_lru_left; + /** + * List of entities(OSCs) for this LRU cache + */ + struct list_head ccc_lru; + /** + * Max # of LRU entries + */ + unsigned long ccc_lru_max; + /** + * Lock to protect ccc_lru list + */ + spinlock_t ccc_lru_lock; + /** + * Set if unstable check is enabled + */ + unsigned int ccc_unstable_check:1; + /** + * # of unstable pages for this mount point + */ + atomic_long_t ccc_unstable_nr; + /** + * Waitq for awaiting unstable pages to reach zero. 
+ * Used at umounting time and signaled on BRW commit + */ + wait_queue_head_t ccc_unstable_waitq; + /** + * Serialize max_cache_mb write operation + */ + struct mutex ccc_max_cache_mb_lock; +}; +/** + * cl_cache functions + */ +struct cl_client_cache *cl_cache_init(unsigned long lru_page_max); +void cl_cache_incref(struct cl_client_cache *cache); +void cl_cache_decref(struct cl_client_cache *cache); + +/** @} cl_page */ + +/** \defgroup cl_lock cl_lock + * @{ */ +int cl_lock_request(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock); +int cl_lock_init(const struct lu_env *env, struct cl_lock *lock, + const struct cl_io *io); +void cl_lock_fini(const struct lu_env *env, struct cl_lock *lock); +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype); +void cl_lock_release(const struct lu_env *env, struct cl_lock *lock); + +int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock, struct cl_sync_io *anchor); +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock); + +/** @} cl_lock */ + +/** \defgroup cl_io cl_io + * @{ */ + +int cl_io_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_sub_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_rw_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count); +int cl_io_loop (const struct lu_env *env, struct cl_io *io); + +void cl_io_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_iter_init (const struct lu_env *env, struct cl_io *io); +void cl_io_iter_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_lock (const struct lu_env *env, struct cl_io *io); +void cl_io_unlock (const struct lu_env *env, struct cl_io *io); +int cl_io_start (const struct lu_env *env, struct cl_io *io); +void cl_io_end (const struct lu_env *env, struct cl_io *io); +int cl_io_lock_add (const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link); +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr); +int cl_io_submit_rw (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue); +int cl_io_submit_sync (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout); +int cl_io_commit_async (const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb); +void cl_io_extent_release (const struct lu_env *env, struct cl_io *io); +int cl_io_lru_reserve(const struct lu_env *env, struct cl_io *io, + loff_t pos, size_t bytes); +int cl_io_read_ahead (const struct lu_env *env, struct cl_io *io, + pgoff_t start, struct cl_read_ahead *ra); +void cl_io_rw_advance (const struct lu_env *env, struct cl_io *io, + size_t nob); + +/** + * True, iff \a io is an O_APPEND write(2). + */ +static inline int cl_io_is_append(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append; +} + +static inline int cl_io_is_sync_write(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync; +} + +static inline int cl_io_is_mkwrite(const struct cl_io *io) +{ + return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite; +} + +/** + * True, iff \a io is a truncate(2). 
+ */ +static inline int cl_io_is_trunc(const struct cl_io *io) +{ + return io->ci_type == CIT_SETATTR && + (io->u.ci_setattr.sa_avalid & ATTR_SIZE) && + (io->u.ci_setattr.sa_subtype != CL_SETATTR_FALLOCATE); +} + +static inline int cl_io_is_fallocate(const struct cl_io *io) +{ + return (io->ci_type == CIT_SETATTR) && + (io->u.ci_setattr.sa_subtype == CL_SETATTR_FALLOCATE); +} + +struct cl_io *cl_io_top(struct cl_io *io); + +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io); + +#define CL_IO_SLICE_CLEAN(obj, base) memset_startat(obj, 0, base) + +/** @} cl_io */ + +/** \defgroup cl_page_list cl_page_list + * @{ */ + +/** + * Last page in the page list. + */ +static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist) +{ + LASSERT(plist->pl_nr > 0); + return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch); +} + +static inline struct cl_page *cl_page_list_first(struct cl_page_list *plist) +{ + LASSERT(plist->pl_nr > 0); + return list_first_entry(&plist->pl_pages, struct cl_page, cp_batch); +} + +/** + * Iterate over pages in a page list. + */ +#define cl_page_list_for_each(page, list) \ + list_for_each_entry((page), &(list)->pl_pages, cp_batch) + +/** + * Iterate over pages in a page list, taking possible removals into account. + */ +#define cl_page_list_for_each_safe(page, temp, list) \ + list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch) + +void cl_page_list_init(struct cl_page_list *plist); +void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page, + bool get_ref); +void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page); +void cl_page_list_move_head(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page); +void cl_page_list_splice(struct cl_page_list *list, + struct cl_page_list *head); +void cl_page_list_del(const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist); + +void cl_2queue_init(struct cl_2queue *queue); +void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page, + bool get_ref); +void cl_2queue_disown(const struct lu_env *env, struct cl_io *io, + struct cl_2queue *queue); +void cl_2queue_assume(const struct lu_env *env, struct cl_io *io, + struct cl_2queue *queue); +void cl_2queue_discard(const struct lu_env *env, struct cl_io *io, + struct cl_2queue *queue); +void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue); +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page); + +/** @} cl_page_list */ + +void cl_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr); + +/** \defgroup cl_sync_io cl_sync_io + * @{ */ + +struct cl_sync_io; +struct cl_dio_aio; +struct cl_sub_dio; + +typedef void (cl_sync_io_end_t)(const struct lu_env *, struct cl_sync_io *); + +void cl_sync_io_init_notify(struct cl_sync_io *anchor, int nr, void *dio_aio, + cl_sync_io_end_t *end); + +int cl_sync_io_wait(const struct lu_env *env, struct cl_sync_io *anchor, + long timeout); +void cl_sync_io_note(const struct lu_env *env, struct cl_sync_io *anchor, + int ioret); 
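+/*
+ * A minimal usage sketch of the synchronous-transfer anchor declared below
+ * (illustrative only; "nr_pages", "timeout" and "rc" are placeholder names):
+ *
+ *     struct cl_sync_io anchor;
+ *
+ *     cl_sync_io_init(&anchor, nr_pages);
+ *     ... submit nr_pages pages, each carrying a pointer to &anchor ...
+ *     rc = cl_sync_io_wait(env, &anchor, timeout);
+ *
+ * The transfer completion path calls cl_sync_io_note(env, &anchor, ioret)
+ * once per page; the final note wakes up the waiter in cl_sync_io_wait().
+ */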
+int cl_sync_io_wait_recycle(const struct lu_env *env, struct cl_sync_io *anchor,
+ long timeout, int ioret);
+struct cl_dio_aio *cl_dio_aio_alloc(struct kiocb *iocb, struct cl_object *obj,
+ bool is_aio);
+struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio, bool sync);
+void cl_dio_aio_free(const struct lu_env *env, struct cl_dio_aio *aio);
+void cl_sub_dio_free(struct cl_sub_dio *sdio);
+static inline void cl_sync_io_init(struct cl_sync_io *anchor, int nr)
+{
+ cl_sync_io_init_notify(anchor, nr, NULL, NULL);
+}
+
+/**
+ * Anchor for synchronous transfer. This is allocated on the stack by the
+ * thread doing a synchronous transfer, and a pointer to this structure is
+ * set up in every page submitted for transfer. The transfer completion
+ * routine updates the anchor and wakes up the waiting thread when the
+ * transfer is complete.
+ */
+struct cl_sync_io {
+ /** number of pages yet to be transferred. */
+ atomic_t csi_sync_nr;
+ /** error code. */
+ int csi_sync_rc;
+ /** completion to be signaled when transfer is complete. */
+ wait_queue_head_t csi_waitq;
+ /** callback to invoke when this IO is finished */
+ cl_sync_io_end_t *csi_end_io;
+ /* private pointer for an associated DIO/AIO */
+ void *csi_dio_aio;
+};
+
+/** direct IO pages */
+struct ll_dio_pages {
+ /*
+ * page array to be written. We don't support
+ * partial pages except the last one.
+ */
+ struct page **ldp_pages;
+ /** # of pages in the array. */
+ size_t ldp_count;
+ /* the file offset of the first page. */
+ loff_t ldp_file_offset;
+};
+
+/* Top level struct used for AIO and DIO */
+struct cl_dio_aio {
+ struct cl_sync_io cda_sync;
+ struct cl_object *cda_obj;
+ struct kiocb *cda_iocb;
+ ssize_t cda_bytes;
+ unsigned cda_no_aio_complete:1,
+ cda_creator_free:1;
+};
+
+/* Sub-dio used for splitting DIO (and AIO, because AIO is DIO) according to
+ * the layout/striping, so we can do parallel submit of DIO RPCs
+ */
+struct cl_sub_dio {
+ struct cl_sync_io csd_sync;
+ struct cl_page_list csd_pages;
+ ssize_t csd_bytes;
+ struct cl_dio_aio *csd_ll_aio;
+ struct ll_dio_pages csd_dio_pages;
+ unsigned csd_creator_free:1;
+};
+#if defined(HAVE_DIRECTIO_ITER) || defined(HAVE_IOV_ITER_RW) || \
+ defined(HAVE_DIRECTIO_2ARGS)
+#define HAVE_DIO_ITER 1
+#endif
+
+void ll_release_user_pages(struct page **pages, int npages);
+
+/** @} cl_sync_io */
+
+/** \defgroup cl_env cl_env
+ *
+ * lu_env handling for a client.
+ *
+ * lu_env is an environment within which lustre code executes. Its major part
+ * is lu_context---a fast memory allocation mechanism that is used to conserve
+ * precious kernel stack space. Originally lu_env was designed for a server,
+ * where
+ *
+ * - there is a (mostly) fixed number of threads, and
+ *
+ * - call chains have no non-lustre portions inserted between lustre code.
+ *
+ * On a client both these assumptions fail, because every user thread can
+ * potentially execute lustre code as part of a system call, and lustre calls
+ * into VFS or MM that call back into lustre.
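+ *
+ * In such call chains the client code brackets each entry into lustre with
+ * cl_env_get()/cl_env_put(), declared below. A minimal sketch (illustrative
+ * only, with error handling reduced to the essentials):
+ *
+ *     __u16 refcheck;
+ *     struct lu_env *env = cl_env_get(&refcheck);
+ *
+ *     if (IS_ERR(env))
+ *             return PTR_ERR(env);
+ *     ... run lustre code using env ...
+ *     cl_env_put(env, &refcheck);
+ *
+ * Since any number of such threads may enter and leave lustre concurrently,
+ * allocating a fresh environment on every entry would be too expensive.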
+ * + * To deal with that, cl_env wrapper functions implement the following + * optimizations: + * + * - allocation and destruction of environment is amortized by caching no + * longer used environments instead of destroying them; + * + * \see lu_env, lu_context, lu_context_key + * @{ */ + +struct lu_env *cl_env_get(__u16 *refcheck); +struct lu_env *cl_env_alloc(__u16 *refcheck, __u32 tags); +void cl_env_put(struct lu_env *env, __u16 *refcheck); +unsigned cl_env_cache_purge(unsigned nr); +struct lu_env *cl_env_percpu_get(void); +void cl_env_percpu_put(struct lu_env *env); + +/** @} cl_env */ + +/* + * Misc + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr); +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb); + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next); +/** @} clio */ + +int cl_global_init(void); +void cl_global_fini(void); + +#endif /* _LINUX_CL_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/dt_object.h b/drivers/staging/lustrefsx/lustre/include/dt_object.h new file mode 100644 index 0000000000000..f24d7d359453a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/dt_object.h @@ -0,0 +1,3054 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LUSTRE_DT_OBJECT_H +#define __LUSTRE_DT_OBJECT_H + +/** \defgroup dt dt + * Sub-class of lu_object with methods common for "data" objects in OST stack. + * + * Data objects behave like regular files: you can read/write them, get and + * set their attributes. Implementation of dt interface is supposed to + * implement some form of garbage collection, normally reference counting + * (nlink) based one. + * + * Examples: osd (lustre/osd) is an implementation of dt interface. + * @{ + */ + +#include +/* + * super-class definitions. 
+ */ +#include + +#include + +struct seq_file; +struct proc_dir_entry; +struct lustre_cfg; + +struct thandle; +struct dt_device; +struct dt_object; +struct dt_index_features; +struct niobuf_local; +struct niobuf_remote; +struct ldlm_enqueue_info; + +typedef enum { + MNTOPT_USERXATTR = 0x00000001, + MNTOPT_ACL = 0x00000002, +} mntopt_t; + +struct dt_device_param { + unsigned ddp_max_name_len; + unsigned ddp_max_nlink; + unsigned ddp_symlink_max; + mntopt_t ddp_mntopts; + unsigned ddp_max_ea_size; + unsigned ddp_mount_type; + unsigned long long ddp_maxbytes; + /* per-inode space consumption */ + short ddp_inodespace; + /* maximum number of blocks in an extent */ + unsigned ddp_max_extent_blks; + /* per-extent insertion overhead to be used by client for grant + * calculation */ + unsigned int ddp_extent_tax; + unsigned int ddp_brw_size; /* optimal RPC size */ + /* T10PI checksum type, zero if not supported */ + enum cksum_types ddp_t10_cksum_type; + bool ddp_has_lseek_data_hole; +}; + +/** + * Per-transaction commit callback function + */ +struct dt_txn_commit_cb; +typedef void (*dt_cb_t)(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err); +/** + * Special per-transaction callback for cases when just commit callback + * is needed and per-device callback are not convenient to use + */ +#define TRANS_COMMIT_CB_MAGIC 0xa0a00a0a +#define MAX_COMMIT_CB_STR_LEN 32 + +#define DCB_TRANS_STOP 0x1 +struct dt_txn_commit_cb { + struct list_head dcb_linkage; + dt_cb_t dcb_func; + void *dcb_data; + __u32 dcb_magic; + __u32 dcb_flags; + char dcb_name[MAX_COMMIT_CB_STR_LEN]; +}; + +/** + * Operations on dt device. + */ +struct dt_device_operations { + /** + * Return device-wide statistics. + * + * Return device-wide stats including block size, total and + * free blocks, total and free objects, etc. See struct obd_statfs + * for the details. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[out] osfs stats information + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_statfs)(const struct lu_env *env, + struct dt_device *dev, + struct obd_statfs *osfs, + struct obd_statfs_info *info); + + /** + * Create transaction. + * + * Create in-memory structure representing the transaction for the + * caller. The structure returned will be used by the calling thread + * to specify the transaction the updates belong to. Once created + * successfully ->dt_trans_stop() must be called in any case (with + * ->dt_trans_start() and updates or not) so that the transaction + * handle and other resources can be released by the layers below. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * + * \retval pointer to handle if creation succeeds + * \retval ERR_PTR(errno) if creation fails + */ + struct thandle *(*dt_trans_create)(const struct lu_env *env, + struct dt_device *dev); + + /** + * Start transaction. + * + * Start the transaction. The transaction described by \a th can be + * started only once. Another start is considered as an error. + * A thread is not supposed to start a transaction while another + * transaction isn't closed by the thread (though multiple handles + * can be created). The caller should start the transaction once + * all possible updates are declared (see the ->do_declare_* methods + * below) and all the needed resources are reserved. 
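+ *
+ * The expected calling order, shown here as a rough sketch (the declare
+ * step uses whichever ->do_declare_*() methods match the planned updates;
+ * "dev", "obj" and "th" are placeholder names):
+ *
+ *     th = ->dt_trans_create(env, dev);
+ *     ->do_declare_*(env, obj, ..., th);     declare every update
+ *     rc = ->dt_trans_start(env, dev, th);
+ *     ->do_*(env, obj, ..., th);             apply the declared updates
+ *     rc = ->dt_trans_stop(env, dev, th);    always called, even on error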
+ * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_trans_start)(const struct lu_env *env, + struct dt_device *dev, + struct thandle *th); + + /** + * Stop transaction. + * + * Once stopped the transaction described by \a th is complete (all + * the needed updates are applied) and further processing such as + * flushing to disk, sending to another target, etc, is handled by + * lower layers. The caller can't access this transaction by the + * handle anymore (except from the commit callbacks, see below). + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_trans_stop)(const struct lu_env *env, + struct dt_device *dev, + struct thandle *th); + + /** + * Add commit callback to the transaction. + * + * Add a commit callback to the given transaction handle. The callback + * will be called when the associated transaction is stored. I.e. the + * transaction will survive an event like power off if the callback did + * run. The number of callbacks isn't limited, but you should note that + * some disk filesystems do handle the commit callbacks in the thread + * handling commit/flush of all the transactions, meaning that new + * transactions are blocked from commit and flush until all the + * callbacks are done. Also, note multiple callbacks can be running + * concurrently using multiple CPU cores. The callbacks will be running + * in a special environment which can not be used to pass data around. + * + * \param[in] th transaction handle + * \param[in] dcb commit callback description + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_trans_cb_add)(struct thandle *th, + struct dt_txn_commit_cb *dcb); + + /** + * Return FID of root index object. + * + * Return the FID of the root object in the filesystem. This object + * is usually provided as a bootstrap point by a disk filesystem. + * This is up to the implementation which FID to use, though + * [FID_SEQ_ROOT:1:0] is reserved for this purpose. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[out] fid FID of the root object + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_root_get)(const struct lu_env *env, + struct dt_device *dev, + struct lu_fid *f); + + /** + * Return device configuration data. + * + * Return device (disk fs, actually) specific configuration. + * The configuration isn't subject to change at runtime. + * See struct dt_device_param for the details. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[out] param configuration parameters + */ + void (*dt_conf_get)(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param); + + /** + * Return device's super block. + * + * \param[in] dev dt device + */ + struct super_block *(*dt_mnt_sb_get)(const struct dt_device *dev); + + /** + * Sync the device. + * + * Sync all the cached state (dirty buffers, pages, etc) to the + * persistent storage. The method returns control once the sync is + * complete. This operation may incur significant I/O to disk and + * should be reserved for cases where a global sync is strictly + * necessary. 
+ * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_sync)(const struct lu_env *env, + struct dt_device *dev); + + /** + * Make device read-only. + * + * Prevent new modifications to the device. This is a very specific + * state where all the changes are accepted successfully and the + * commit callbacks are called, but persistent state never changes. + * Used only in the tests to simulate power-off scenario. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_ro)(const struct lu_env *env, + struct dt_device *dev); + + /** + * Start transaction commit asynchronously. + * + + * Provide a hint to the underlying filesystem that it should start + * committing soon. The control returns immediately. It's up to the + * layer implementing the method how soon to start committing. Usually + * this should be throttled to some extent, otherwise the number of + * aggregated transaction goes too high causing performance drop. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_commit_async)(const struct lu_env *env, + struct dt_device *dev); + + /** + * The unit of \a count is byte for block or inodes for metadata. + * + * If \a count > 0, reserve quota in advance of an operation that + * changes the quota assignment, such as chgrp() or rename() into + * a directory with a different group ID. + * + * If \a count < 0, free the reserved quota previously. + * + * \param[in] env execution environment for this thread + * \param[in] dev the bottom OSD device to reserve quota + * \param[in] type quota type (LQUOTA_RES_DT or LQUOTA_RES_MD) + * \param[in] uid quota uid + * \param[in] gid quota gid + * \param[in] count space (bytes or inodes) to reserve or free + * \param[in] md true for inode, false for block + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_reserve_or_free_quota)(const struct lu_env *env, + struct dt_device *dev, + enum quota_type type, __u64 uid, + __u64 gid, __s64 count, bool md); +}; + +struct dt_index_features { + /** required feature flags from enum dt_index_flags */ + __u32 dif_flags; + /** minimal required key size */ + size_t dif_keysize_min; + /** maximal required key size, 0 if no limit */ + size_t dif_keysize_max; + /** minimal required record size */ + size_t dif_recsize_min; + /** maximal required record size, 0 if no limit */ + size_t dif_recsize_max; + /** pointer size for record */ + size_t dif_ptrsize; +}; + +enum dt_index_flags { + /** index supports variable sized keys */ + DT_IND_VARKEY = BIT(0), + /** index supports variable sized records */ + DT_IND_VARREC = BIT(1), + /** index can be modified */ + DT_IND_UPDATE = BIT(2), + /** index supports records with non-unique (duplicate) keys */ + DT_IND_NONUNQ = BIT(3), + /** + * index support fixed-size keys sorted with natural numerical way + * and is able to return left-side value if no exact value found + */ + DT_IND_RANGE = BIT(4), +}; + +/* for dt_read_lock() and dt_write_lock() object lock rule */ +enum dt_object_role { + DT_SRC_PARENT, + DT_SRC_CHILD, + DT_TGT_PARENT, + DT_TGT_CHILD, + DT_TGT_ORPHAN, + DT_LASTID, +}; + +/** + * Features, required from index to support file system directories (mapping + * names to 
fids). + */ +extern const struct dt_index_features dt_directory_features; +extern const struct dt_index_features dt_otable_features; +extern const struct dt_index_features dt_lfsck_layout_orphan_features; +extern const struct dt_index_features dt_lfsck_layout_dangling_features; +extern const struct dt_index_features dt_lfsck_namespace_features; + +/* index features supported by the accounting objects */ +extern const struct dt_index_features dt_acct_features; + +/* index features supported by the quota global indexes */ +extern const struct dt_index_features dt_quota_glb_features; + +/* index features supported by the quota slave indexes */ +extern const struct dt_index_features dt_quota_slv_features; + +/* index features supported by the nodemap index */ +extern const struct dt_index_features dt_nodemap_features; + +/** + * This is a general purpose dt allocation hint. + * It now contains the parent object. + * It can contain any allocation hint in the future. + */ +struct dt_allocation_hint { + struct dt_object *dah_parent; + const void *dah_eadata; + int dah_eadata_len; + int dah_acl_len; + __u32 dah_mode; + int dah_append_stripes; + bool dah_can_block; + char *dah_append_pool; +}; + +/** + * object type specifier. + */ + +enum dt_format_type { + DFT_REGULAR, + DFT_DIR, + /** for mknod */ + DFT_NODE, + /** for special index */ + DFT_INDEX, + /** for symbolic link */ + DFT_SYM, +}; + +/** + * object format specifier. + */ +struct dt_object_format { + /** type for dt object */ + enum dt_format_type dof_type; + union { + struct dof_regular { + int striped; + } dof_reg; + struct dof_dir { + } dof_dir; + struct dof_node { + } dof_node; + /** + * special index need feature as parameter to create + * special idx + */ + struct dof_index { + const struct dt_index_features *di_feat; + } dof_idx; + } u; +}; + +enum dt_format_type dt_mode_to_dft(__u32 mode); + +typedef __u64 dt_obj_version_t; + +union ldlm_policy_data; + +struct md_layout_change; + +/** + * A dt_object provides common operations to create and destroy + * objects and to manage regular and extended attributes. + */ +struct dt_object_operations { + /** + * Get read lock on object. + * + * Read lock is compatible with other read locks, so it's shared. + * Read lock is not compatible with write lock which is exclusive. + * The lock is blocking and can't be used from an interrupt context. + * + * \param[in] env execution environment for this thread + * \param[in] dt object to lock for reading + * \param[in] role a hint to debug locks (see kernel's mutexes) + */ + void (*do_read_lock)(const struct lu_env *env, + struct dt_object *dt, + unsigned role); + + /* + * Get write lock on object. + * + * Write lock is exclusive and cannot be shared. The lock is blocking + * and can't be used from an interrupt context. + * + * \param[in] env execution environment for this thread + * \param[in] dt object to lock for writing + * \param[in] role a hint to debug locks (see kernel's mutexes) + * + */ + void (*do_write_lock)(const struct lu_env *env, + struct dt_object *dt, + unsigned role); + + /** + * Release read lock. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + */ + void (*do_read_unlock)(const struct lu_env *env, + struct dt_object *dt); + + /** + * Release write lock. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + */ + void (*do_write_unlock)(const struct lu_env *env, + struct dt_object *dt); + + /** + * Check whether write lock is held. 
+ *
+ * The caller can learn whether a write lock is held on the object.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ *
+ * \retval 0 no write lock
+ * \retval 1 write lock is held
+ */
+ int (*do_write_locked)(const struct lu_env *env,
+ struct dt_object *dt);
+
+ /**
+ * Declare intention to request regular attributes.
+ *
+ * Notify the underlying filesystem that the caller may request regular
+ * attributes with ->do_attr_get() soon. This allows OSD to implement
+ * prefetching logic in an object-oriented manner. The implementation
+ * can be noop. This method should avoid expensive delays such as
+ * waiting on disk I/O, otherwise the goal of enabling a performance
+ * optimization would be defeated.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_declare_attr_get)(const struct lu_env *env,
+ struct dt_object *dt);
+
+ /**
+ * Return regular attributes.
+ *
+ * The object must exist. Currently all the attributes should be
+ * returned, but in the future this can be improved so that only
+ * a selected set is returned. This can improve performance as in
+ * some cases attributes are stored in different places and
+ * getting them all can be an iterative and expensive process.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[out] attr attributes to fill
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_attr_get)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lu_attr *attr);
+
+ /**
+ * Declare intention to change regular object's attributes.
+ *
+ * Notify the underlying filesystem that the regular attributes may
+ * change in this transaction. This enables the layer below to prepare
+ * resources (e.g. journal credits in ext4). This method should be
+ * called between creating the transaction and starting it. Note that
+ * the la_valid field of \a attr specifies which attributes will change.
+ * The object need not exist.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] attr attributes to change specified in attr.la_valid
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_declare_attr_set)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct lu_attr *attr,
+ struct thandle *th);
+
+ /**
+ * Change regular attributes.
+ *
+ * Change regular attributes in the given transaction. Note only
+ * attributes flagged by attr.la_valid change. The object must
+ * exist. If the layer implementing this method is responsible for
+ * quota, then the method should maintain object accounting for the
+ * given credentials when la_uid/la_gid changes.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] attr new attributes to apply
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_attr_set)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct lu_attr *attr,
+ struct thandle *th);
+
+ /**
+ * Declare intention to request an extended attribute.
+ *
+ * Notify the underlying filesystem that the caller may request an
+ * extended attribute with ->do_xattr_get() soon. This allows OSD to
+ * implement prefetching logic in an object-oriented manner. The
+ * implementation can be noop.
This method should avoid expensive delays such as + * waiting on disk I/O, otherwise the goal of enabling a performance + * optimization would be defeated. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf unused, may be removed in the future + * \param[in] name name of the extended attribute + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_xattr_get)(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + const char *name); + + /** + * Return a value of an extended attribute. + * + * The object must exist. If the buffer is NULL, then the method + * must return the size of the value. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] buf buffer in which to store the value + * \param[in] name name of the extended attribute + * + * \retval 0 on success + * \retval -ERANGE if \a buf is too small + * \retval negative negated errno on error + * \retval positive value's size if \a buf is NULL or has zero size + */ + int (*do_xattr_get)(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + const char *name); + + /** + * Declare intention to change an extended attribute. + * + * Notify the underlying filesystem that the extended attribute may + * change in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. The object + * need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer storing new value of the attribute + * \param[in] name name of the attribute + * \param[in] fl LU_XATTR_CREATE - fail if EA exists + * LU_XATTR_REPLACE - fail if EA doesn't exist + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_xattr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, + int fl, + struct thandle *th); + + /** + * Set an extended attribute. + * + * Change or replace the specified extended attribute (EA). + * The flags passed in \a fl dictate whether the EA is to be + * created or replaced, as follows. + * LU_XATTR_CREATE - fail if EA exists + * LU_XATTR_REPLACE - fail if EA doesn't exist + * The object must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer storing new value of the attribute + * \param[in] name name of the attribute + * \param[in] fl flags indicating EA creation or replacement + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_xattr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, + int fl, + struct thandle *th); + + /** + * Declare intention to delete an extended attribute. + * + * Notify the underlying filesystem that the extended attribute may + * be deleted in this transaction. This enables the layer below to + * prepare resources (e.g. journal credits in ext4). This method + * should be called between creating the transaction and starting it. + * The object need not exist. 
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] name name of the attribute + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th); + + /** + * Delete an extended attribute. + * + * This method deletes the specified extended attribute. The object + * must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] name name of the attribute + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th); + + /** + * Return a list of the extended attributes. + * + * Fills the passed buffer with a list of the extended attributes + * found in the object. The names are separated with '\0'. + * The object must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] buf buffer to put the list in + * + * \retval positive bytes used/required in the buffer + * \retval negative negated errno on error + */ + int (*do_xattr_list)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf); + + /** + * Prepare allocation hint for a new object. + * + * This method is used by the caller to inform OSD of the parent-child + * relationship between two objects and enable efficient object + * allocation. Filled allocation hint will be passed to ->do_create() + * later. + * + * \param[in] env execution environment for this thread + * \param[out] ah allocation hint + * \param[in] parent parent object (can be NULL) + * \param[in] child child object + * \param[in] _mode type of the child object + */ + void (*do_ah_init)(const struct lu_env *env, + struct dt_allocation_hint *ah, + struct dt_object *parent, + struct dt_object *child, + umode_t mode); + + /** + * Declare intention to create a new object. + * + * Notify the underlying filesystem that the object may be created + * in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. + * + * If the layer implementing this method is responsible for quota, + * then the method should reserve an object for the given credentials + * and return an error if quota is over. If object creation later + * fails for some reason, then the reservation should be released + * properly (usually in ->dt_trans_stop()). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] attr attributes of the new object + * \param[in] hint allocation hint + * \param[in] dof object format + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_create)(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + + /** + * Create new object. + * + * The method creates the object passed with the specified attributes + * and object format. Object allocation procedure can use information + * stored in the allocation hint. 
Different object formats are supported + * (see enum dt_format_type and struct dt_object_format) depending on + * the device. If creation succeeds, then LOHA_EXISTS flag must be set + * in the LU-object header attributes. + * + * If the layer implementing this method is responsible for quota, + * then the method should maintain object accounting for the given + * credentials. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] attr attributes of the new object + * \param[in] hint allocation hint + * \param[in] dof object format + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_create)(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + + /** + * Declare intention to destroy an object. + * + * Notify the underlying filesystem that the object may be destroyed + * in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. The object + * need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_destroy)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Destroy an object. + * + * This method destroys the object and all the resources associated + * with the object (data, key/value pairs, extended attributes, etc). + * The object must exist. If destroy is successful, then flag + * LU_OBJECT_HEARD_BANSHEE should be set to forbid access to this + * instance of in-core object. Any subsequent access to the same FID + * should get another instance with no LOHA_EXIST flag set. + * + * If the layer implementing this method is responsible for quota, + * then the method should maintain object accounting for the given + * credentials. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_destroy)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Try object as an index. + * + * Announce that this object is going to be used as an index. This + * operation checks that object supports indexing operations and + * installs appropriate dt_index_operations vector on success. + * Also probes for features. Operation is successful if all required + * features are supported. It's not possible to access the object + * with index methods before ->do_index_try() returns success. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] feat index features + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_index_try)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_index_features *feat); + + /** + * Declare intention to increment nlink count. + * + * Notify the underlying filesystem that the nlink regular attribute + * be changed in this transaction. This enables the layer below to + * prepare resources (e.g. journal credits in ext4). This method + * should be called between creating the transaction and starting it. + * The object need not exist. 
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_ref_add)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Increment nlink. + * + * Increment nlink (from the regular attributes set) in the given + * transaction. Note the absolute limit for nlink should be learnt + * from struct dt_device_param::ddp_max_nlink. The object must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_ref_add)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + + /** + * Declare intention to decrement nlink count. + * + * Notify the underlying filesystem that the nlink regular attribute may + * be changed in this transaction. This enables the layer below to + * prepare resources (e.g. journal credits in ext4). This method + * should be called between creating the transaction and starting it. + * The object need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_ref_del)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Decrement nlink. + * + * Decrement nlink (from the regular attributes set) in the given + * transaction. The object must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_ref_del)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Sync object. + * + * The method is called to sync the specified range of the object to + * persistent storage. The control is returned once the operation is + * complete. The difference from ->do_sync() is that the object can + * be in-sync with the persistent storage (nothing to flush), then + * the method returns quickly with no I/O overhead. So, this method + * should be preferred over ->do_sync() where possible. Also note that + * if the object isn't clean, then some disk filesystems will call + * ->do_sync() to maintain overall consistency, in which case it's + * still very expensive. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] start start of the range to sync + * \param[in] end end of the range to sync + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_object_sync)(const struct lu_env *env, struct dt_object *obj, + __u64 start, __u64 end); + + /** + * Lock object. + * + * Lock object(s) using Distributed Lock Manager (LDLM). + * + * Get LDLM locks for the object. Currently used to lock "remote" + * objects in DNE configuration - a service running on MDTx needs + * to lock an object on MDTy.
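+ *
+ * An illustrative pairing with ->do_object_unlock(), using the
+ * dt_object_lock()/dt_object_unlock() helpers declared later in this header
+ * (a minimal sketch only; @lh, @einfo and @policy are caller-provided and
+ * error handling is omitted):
+ *
+ *	rc = dt_object_lock(env, dt, &lh, &einfo, &policy);
+ *	... access the remote object ...
+ *	rc = dt_object_unlock(env, dt, &einfo, &policy);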
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] lh lock handle, sometimes used, sometimes not + * \param[in] einfo ldlm callbacks, locking type and mode + * \param[out] einfo private data to be passed to unlock later + * \param[in] policy inodebits data + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_object_lock)(const struct lu_env *env, struct dt_object *dt, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy); + + /** + * Unlock object. + * + * Release LDLM lock(s) granted with ->do_object_lock(). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] einfo lock handles, from ->do_object_lock() + * \param[in] policy inodebits data + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_object_unlock)(const struct lu_env *env, + struct dt_object *dt, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy); + + /** + * Invalidate attribute cache. + * + * This method invalidates the attribute cache of the object; it is + * implemented on OSP only. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_invalidate)(const struct lu_env *env, struct dt_object *dt); + + /** + * Check object stale state. + * + * OSP only. + * + * \param[in] dt object + * + * \retval true for stale object + * \retval false for not stale object + */ + bool (*do_check_stale)(struct dt_object *dt); + + /** + * Declare intention to instantiate an extended layout component. + * + * \param[in] env execution environment + * \param[in] dt DT object + * \param[in] layout data structure to describe the changes to + * the DT object's layout + * \param[in] buf buffer containing client's lovea or empty + * + * \retval 0 success + * \retval -ne error code + */ + int (*do_declare_layout_change)(const struct lu_env *env, + struct dt_object *dt, + struct md_layout_change *mlc, + struct thandle *th); + + /** + * Client is trying to write to un-instantiated layout component. + * + * \param[in] env execution environment + * \param[in] dt DT object + * \param[in] layout data structure to describe the changes to + * the DT object's layout + * \param[in] buf buffer containing client's lovea or empty + * + * \retval 0 success + * \retval -ne error code + */ + int (*do_layout_change)(const struct lu_env *env, struct dt_object *dt, + struct md_layout_change *mlc, + struct thandle *th); +}; + +enum dt_bufs_type { + DT_BUFS_TYPE_READ = 0x0000, + DT_BUFS_TYPE_WRITE = 0x0001, + DT_BUFS_TYPE_READAHEAD = 0x0002, + DT_BUFS_TYPE_LOCAL = 0x0004, +}; + +/** + * Per-dt-object operations on "file body" - unstructured raw data. + */ +struct dt_body_operations { + /** + * Read data. + * + * Read unstructured data from an existing regular object. + * Only data before attr.la_size is returned. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] buf buffer (including size) to copy data in + * \param[in] pos position in the object to start + * \param[out] pos original value of \a pos + bytes returned + * + * \retval positive bytes read on success + * \retval negative negated errno on error + */ + ssize_t (*dbo_read)(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + loff_t *pos); + + /** + * Declare intention to write data to object.
+ * + * Notify the underlying filesystem that data may be written in + * this transaction. This enables the layer below to prepare resources + * (e.g. journal credits in ext4). This method should be called + * between creating the transaction and starting it. The object need + * not exist. If the layer implementing this method is responsible for + * quota, then the method should reserve space for the given credentials + * and return an error if quota is over. If the write later fails + * for some reason, then the reserve should be released properly + * (usually in ->dt_trans_stop()). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer (including size) to copy data from + * \param[in] pos position in the object to start + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + ssize_t (*dbo_declare_write)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + loff_t pos, + struct thandle *th); + + /** + * Write unstructured data to regular existing object. + * + * The method allocates space and puts data in. Also, the method should + * maintain attr.la_size properly. Partial writes are possible. + * + * If the layer implementing this method is responsible for quota, + * then the method should maintain space accounting for the given + * credentials. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer (including size) to copy data from + * \param[in] pos position in the object to start + * \param[out] pos \a pos + bytes written + * \param[in] th transaction handle + * + * \retval positive bytes written on success + * \retval negative negated errno on error + */ + ssize_t (*dbo_write)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + loff_t *pos, + struct thandle *th); + + /** + * Return buffers for data. + * + * This method is used to access data with no copying. It's so-called + * zero-copy I/O. The method returns the descriptors for the internal + * buffers where data are managed by the disk filesystem. For example, + * pagecache in case of ext4 or ARC with ZFS. Then other components + * (e.g. networking) can transfer data from or to the buffers with no + * additional copying. + * + * The method should fill an array of struct niobuf_local, where + * each element describes a full or partial page for data at specific + * offset. The caller should use page/lnb_page_offset/len to find data + * at object's offset lnb_file_offset. + * + * The memory referenced by the descriptors can't change its purpose + * until the complementary ->dbo_bufs_put() is called. The caller should + * specify if the buffers are used to read or modify data so that OSD + * can decide how to initialize the buffers: bring all the data for + * reads or just bring partial buffers for write. Note: the method does + * not check whether output array is large enough. 
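+ *
+ * An illustrative read-side sequence built on the dt_bufs_get(),
+ * dt_read_prep() and dt_bufs_put() helpers declared later in this header
+ * (a minimal sketch only: error handling is omitted and @rnb is assumed
+ * to describe the remote I/O region being served):
+ *
+ *	nr = dt_bufs_get(env, dt, rnb, lnb, maxlnb, DT_BUFS_TYPE_READ);
+ *	rc = dt_read_prep(env, dt, lnb, nr);
+ *	... transfer the data referenced by lnb[0..nr-1], e.g. over the net ...
+ *	rc = dt_bufs_put(env, dt, lnb, nr);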
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] pos position in the object to start + * \param[in] len size of region in bytes + * \param[out] lb array of descriptors to fill + * \param[in] maxlnb max slots in @lnb array + * \param[in] rw 0 if used to read, 1 if used for write + * + * \retval positive number of descriptors on success + * \retval negative negated errno on error + */ + int (*dbo_bufs_get)(const struct lu_env *env, + struct dt_object *dt, + loff_t pos, + ssize_t len, + struct niobuf_local *lb, + int maxlnb, + enum dt_bufs_type rw); + + /** + * Release reference granted by ->dbo_bufs_get(). + * + * Release the reference granted by the previous ->dbo_bufs_get(). + * Note the references are counted. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] lb array of descriptors to fill + * \param[in] nr size of the array + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_bufs_put)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lb, + int nr); + + /** + * Prepare buffers for reading. + * + * The method is called on the given buffers to fill them with data + * if that wasn't done in ->dbo_bufs_get(). The idea is that the + * caller should be able to get few buffers for discontiguous regions + * using few calls to ->dbo_bufs_get() and then request them all for + * the preparation with a single call, so that OSD can fire many I/Os + * to run concurrently. It's up to the specific OSD whether to implement + * this logic in ->dbo_read_prep() or just use ->dbo_bufs_get() to + * prepare data for every requested region individually. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] lnb array of buffer descriptors + * \param[in] nr size of the array + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_read_prep)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lnb, + int nr); + + /** + * Prepare buffers for write. + * + * This method is called on the given buffers to ensure the partial + * buffers contain correct data. The underlying idea is the same as + * in ->db_read_prep(). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] lb array of buffer descriptors + * \param[in] nr size of the array + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_write_prep)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lb, + int nr); + + /** + * Declare intention to write data stored in the buffers. + * + * Notify the underlying filesystem that data may be written in + * this transaction. This enables the layer below to prepare resources + * (e.g. journal credits in ext4). This method should be called + * between creating the transaction and starting it. + * + * If the layer implementing this method is responsible for quota, + * then the method should be reserving a space for the given + * credentials and return an error if quota is exceeded. If the write + * later fails for some reason, then the reserve should be released + * properly (usually in ->dt_trans_stop()). 
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] lb array of descriptors + * \param[in] nr size of the array + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_declare_write_commit)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lb, + int nr, + struct thandle *th); + + /** + * Write to existing object. + * + * This method is used to write data to a persistent storage using + * the buffers returned by ->dbo_bufs_get(). The caller puts new + * data into the buffers using own mechanisms (e.g. direct transfer + * from a NIC). The method should maintain attr.la_size. Also, + * attr.la_blocks should be maintained but this can be done in lazy + * manner, when actual allocation happens. + * + * If the layer implementing this method is responsible for quota, + * then the method should maintain space accounting for the given + * credentials. + * + * user_size parameter is the apparent size of the file, ie the size + * of the clear text version of the file. It can differ from the actual + * amount of valuable data received when a file is encrypted, + * because encrypted pages always contain PAGE_SIZE bytes of data, + * even if clear text data is only a few bytes. + * In case of encrypted file, apparent size will be stored as the inode + * size, so that servers return to clients an object size they can use + * to determine clear text size. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] lb array of descriptors for the buffers + * \param[in] nr size of the array + * \param[in] th transaction handle + * \param[in] user_size apparent size + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_write_commit)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lb, + int nr, + struct thandle *th, + __u64 user_size); + + /** + * Return logical to physical block mapping for a given extent + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] fm describe the region to map and the output buffer + * see the details in include/linux/fiemap.h + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_fiemap_get)(const struct lu_env *env, + struct dt_object *dt, + struct fiemap *fm); + + /** + * Declare intention to deallocate space from an object. + * + * Notify the underlying filesystem that space may be deallocated in + * this transactions. This enables the layer below to prepare resources + * (e.g. journal credits in ext4). This method should be called between + * creating the transaction and starting it. The object need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] start the start of the region to deallocate + * \param[in] end the end of the region to deallocate + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_declare_punch)(const struct lu_env *env, + struct dt_object *dt, + __u64 start, + __u64 end, + struct thandle *th); + + /** + * Deallocate specified region in an object. + * + * This method is used to deallocate (release) space possibly consumed + * by the given region of the object. If the layer implementing this + * method is responsible for quota, then the method should maintain + * space accounting for the given credentials. 
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] start the start of the region to deallocate + * \param[in] end the end of the region to deallocate + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_punch)(const struct lu_env *env, + struct dt_object *dt, + __u64 start, + __u64 end, + struct thandle *th); + /** + * Give advice on a specified region in an object. + * + * This method is used to give advice about the access pattern on a + * given region of the object. The disk filesystem understands + * the advice and tunes cache/read-ahead policies. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] start the start of the region affected + * \param[in] end the end of the region affected + * \param[in] advice advice type + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_ladvise)(const struct lu_env *env, + struct dt_object *dt, + __u64 start, + __u64 end, + enum lu_ladvise_type advice); + + /** + * Declare intention to preallocate space for an object + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] start the start of the region to preallocate + * \param[in] end the end of the region to preallocate + * \param[in] mode fallocate mode + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_declare_fallocate)(const struct lu_env *env, + struct dt_object *dt, __u64 start, + __u64 end, int mode, struct thandle *th); + /** + * Allocate specified region for an object + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] start the start of the region to allocate + * \param[in] end the end of the region to allocate + * \param[in] mode fallocate mode + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_fallocate)(const struct lu_env *env, + struct dt_object *dt, + __u64 start, + __u64 end, + int mode, + struct thandle *th); + /** + * Do SEEK_HOLE/SEEK_DATA request on object + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] offset the offset to start seek from + * \param[in] whence seek mode, SEEK_HOLE or SEEK_DATA + * + * \retval hole/data offset on success + * \retval negative negated errno on error + */ + loff_t (*dbo_lseek)(const struct lu_env *env, struct dt_object *dt, + loff_t offset, int whence); +}; + +/** + * Incomplete type of index record. + */ +struct dt_rec; + +/** + * Incomplete type of index key. + */ +struct dt_key; + +/** + * Incomplete type of dt iterator. + */ +struct dt_it; + +/** + * Per-dt-object operations on an object as an index. An index is a set of + * key/value pairs abstracted from an on-disk representation. An index + * supports a number of operations, including lookup by key, insert and + * delete. Also, an index can be iterated to find the pairs one by one, + * from the beginning or from a specified point. + */ +struct dt_index_operations { + /** + * Lookup in an index by key. + * + * The method returns a value for the given key. Key/value format + * and size should have been negotiated with ->do_index_try() before. + * Thus it's the caller's responsibility to provide the method with + * a proper key and a big enough buffer. No external locking is + * required; all the internal consistency should be implemented by + * the method or lower layers. The object should have been created + * with type DFT_INDEX or DFT_DIR.
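+ *
+ * An illustrative call after a successful ->do_index_try() (a minimal
+ * sketch only; @val and @name are caller-provided and sized as negotiated):
+ *
+ *	rc = dt->do_index_ops->dio_lookup(env, dt,
+ *					  (struct dt_rec *)val,
+ *					  (const struct dt_key *)name);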
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] rec buffer where value will be stored + * \param[in] key key + * + * \retval 0 on success + * \retval -ENOENT if key isn't found + * \retval negative negated errno on error + */ + int (*dio_lookup)(const struct lu_env *env, + struct dt_object *dt, + struct dt_rec *rec, + const struct dt_key *key); + + /** + * Declare intention to insert a key/value into an index. + * + * Notify the underlying filesystem that new key/value may be inserted + * in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. key/value + * format and size is subject to ->do_index_try(). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] rec buffer storing value + * \param[in] key key + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dio_declare_insert)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th); + + /** + * Insert a new key/value pair into an index. + * + * The method inserts specified key/value pair into the given index + * object. The internal consistency is maintained by the method or + * the functionality below. The format and size of key/value should + * have been negotiated before using ->do_index_try(), no additional + * information can be specified to the method. The keys are unique + * in a given index. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] rec buffer storing value + * \param[in] key key + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dio_insert)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th); + + /** + * Declare intention to delete a key/value from an index. + * + * Notify the underlying filesystem that key/value may be deleted in + * this transaction. This enables the layer below to prepare resources + * (e.g. journal credits in ext4). This method should be called + * between creating the transaction and starting it. Key/value format + * and size is subject to ->do_index_try(). The object need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] key key + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dio_declare_delete)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th); + + /** + * Delete key/value pair from an index. + * + * The method deletes specified key and corresponding value from the + * given index object. The internal consistency is maintained by the + * method or the functionality below. The format and size of the key + * should have been negotiated before using ->do_index_try(), no + * additional information can be specified to the method. 
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] key key + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dio_delete)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th); + + /** + * Iterator interface. + * + * Methods to iterate over an existing index, list the keys stored and + * associated values, get key/value size, etc. + */ + struct dt_it_ops { + /** + * Allocate and initialize new iterator. + * + * The iterator is a handler to be used in the subsequent + * methods to access index's content. Note the position is + * not defined at this point and should be initialized with + * ->get() or ->load() method. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] attr ask the iterator to return part of + the records, see LUDA_* for details + * + * \retval pointer iterator pointer on success + * \retval ERR_PTR(errno) on error + */ + struct dt_it *(*init)(const struct lu_env *env, + struct dt_object *dt, + __u32 attr); + + /** + * Release iterator. + * + * Release the specified iterator and all the resources + * associated (e.g. the object, index cache, etc). + * + * \param[in] env execution environment for this thread + * \param[in] di iterator to release + */ + void (*fini)(const struct lu_env *env, + struct dt_it *di); + + /** + * Move position of iterator. + * + * Move the position of the specified iterator to the specified + * key. + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * \param[in] key key to position to + * + * \retval 0 if exact key is found + * \retval 1 if at the record with least key + * not larger than the key + * \retval negative negated errno on error + */ + int (*get)(const struct lu_env *env, + struct dt_it *di, + const struct dt_key *key); + + /** + * Release position + * + * Complimentary method for dt_it_ops::get() above. Some + * implementation can increase a reference on the iterator in + * dt_it_ops::get(). So the caller should be able to release + * with dt_it_ops::put(). + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + */ + void (*put)(const struct lu_env *env, + struct dt_it *di); + + /** + * Move to next record. + * + * Moves the position of the iterator to a next record + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * + * \retval 1 if no more records + * \retval 0 on success, the next record is found + * \retval negative negated errno on error + */ + int (*next)(const struct lu_env *env, + struct dt_it *di); + + /** + * Return key. + * + * Returns a pointer to a buffer containing the key of the + * record at the current position. The pointer is valid and + * retains data until ->get(), ->load() and ->fini() methods + * are called. + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * + * \retval pointer to key on success + * \retval ERR_PTR(errno) on error + */ + struct dt_key *(*key)(const struct lu_env *env, + const struct dt_it *di); + + /** + * Return key size. + * + * Returns size of the key at the current position. 
+ * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * + * \retval key's size on success + * \retval negative negated errno on error + */ + int (*key_size)(const struct lu_env *env, + const struct dt_it *di); + + /** + * Return record. + * + * Stores the value of the record at the current position. The + * buffer must be big enough (as negotiated with + * ->do_index_try() or ->rec_size()). The caller can specify + * she is interested only in part of the record, using attr + * argument (see LUDA_* definitions for the details). + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * \param[out] rec buffer to store value in + * \param[in] attr specify part of the value to copy + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*rec)(const struct lu_env *env, + const struct dt_it *di, + struct dt_rec *rec, + __u32 attr); + + /** + * Return record size. + * + * Returns size of the record at the current position. The + * \a attr can be used to specify only the parts of the record + * needed to be returned. (see LUDA_* definitions for the + * details). + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * \param[in] attr part of the record to return + * + * \retval record's size on success + * \retval negative negated errno on error + */ + int (*rec_size)(const struct lu_env *env, + const struct dt_it *di, + __u32 attr); + + /** + * Return a cookie (hash). + * + * Returns the cookie (usually hash) of the key at the current + * position. This allows the caller to resume iteration at this + * position later. The exact value is specific to implementation + * and should not be interpreted by the caller. + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * + * \retval cookie/hash of the key + */ + __u64 (*store)(const struct lu_env *env, + const struct dt_it *di); + + /** + * Initialize position using cookie/hash. + * + * Initializes the current position of the iterator to one + * described by the cookie/hash as returned by ->store() + * previously. + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * \param[in] hash cookie/hash value + * + * \retval positive if current position points to + * record with least cookie not larger + * than cookie + * \retval 0 if current position matches cookie + * \retval negative negated errno on error + */ + int (*load)(const struct lu_env *env, + const struct dt_it *di, + __u64 hash); + + /** + * Not used + */ + int (*key_rec)(const struct lu_env *env, + const struct dt_it *di, + void *key_rec); + } dio_it; +}; + +enum dt_otable_it_valid { + DOIV_ERROR_HANDLE = 0x0001, + DOIV_DRYRUN = 0x0002, +}; + +enum dt_otable_it_flags { + /* Exit when fail. */ + DOIF_FAILOUT = 0x0001, + + /* Reset iteration position to the device beginning. */ + DOIF_RESET = 0x0002, + + /* There is up layer component uses the iteration. */ + DOIF_OUTUSED = 0x0004, + + /* Check only without repairing. */ + DOIF_DRYRUN = 0x0008, +}; + +/* otable based iteration needs to use the common DT iteration APIs. + * To initialize the iteration, it needs call dio_it::init() firstly. + * Here is how the otable based iteration should prepare arguments to + * call dt_it_ops::init(). + * + * For otable based iteration, the 32-bits 'attr' for dt_it_ops::init() + * is composed of two parts: + * low 16-bits is for valid bits, high 16-bits is for flags bits. 
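+ *
+ * For a generic walk over an index with these methods, an illustrative
+ * sequence (a sketch only: error handling is omitted, @iops is
+ * &obj->do_index_ops->dio_it after a successful ->do_index_try(), and
+ * @attr/@hash are as negotiated above) is:
+ *
+ *	it = iops->init(env, obj, attr);
+ *	rc = iops->load(env, it, hash);
+ *	do {
+ *		key = iops->key(env, it);
+ *		rc = iops->rec(env, it, rec, attr);
+ *	} while (iops->next(env, it) == 0);
+ *	iops->put(env, it);
+ *	iops->fini(env, it);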
*/ +#define DT_OTABLE_IT_FLAGS_SHIFT 16 +#define DT_OTABLE_IT_FLAGS_MASK 0xffff0000 + +struct dt_device { + struct lu_device dd_lu_dev; + const struct dt_device_operations *dd_ops; + + /** + * List of dt_txn_callback (see below). This is not protected in any + * way, because callbacks are supposed to be added/deleted only during + * single-threaded start-up shut-down procedures. + */ + struct list_head dd_txn_callbacks; + unsigned int dd_record_fid_accessed:1, + dd_rdonly:1; + + /* sysfs and debugfs handling */ + struct dentry *dd_debugfs_entry; + + const struct attribute **dd_def_attrs; + struct kobject dd_kobj; + struct kobj_type dd_ktype; + struct completion dd_kobj_unregister; +}; + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t); +void dt_device_fini(struct dt_device *dev); + +static inline int lu_device_is_dt(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_DT); +} + +static inline struct dt_device * lu2dt_dev(struct lu_device *l) +{ + LASSERT(lu_device_is_dt(l)); + return container_of_safe(l, struct dt_device, dd_lu_dev); +} + +struct dt_object { + struct lu_object do_lu; + const struct dt_object_operations *do_ops; + const struct dt_body_operations *do_body_ops; + const struct dt_index_operations *do_index_ops; +}; + +/* + * In-core representation of per-device local object OID storage + */ +struct local_oid_storage { + /* all initialized llog systems on this node linked by this */ + struct list_head los_list; + + /* how many handle's reference this los has */ + atomic_t los_refcount; + struct dt_device *los_dev; + struct dt_object *los_obj; + + /* data used to generate new fids */ + struct mutex los_id_lock; + __u64 los_seq; + __u32 los_last_oid; +}; + +static inline struct lu_device *dt2lu_dev(struct dt_device *d) +{ + return &d->dd_lu_dev; +} + +static inline struct dt_object *lu2dt(struct lu_object *l) +{ + LASSERT(l == NULL || IS_ERR(l) || lu_device_is_dt(l->lo_dev)); + return container_of_safe(l, struct dt_object, do_lu); +} + +int dt_object_init(struct dt_object *obj, + struct lu_object_header *h, struct lu_device *d); + +void dt_object_fini(struct dt_object *obj); + +static inline int dt_object_exists(const struct dt_object *dt) +{ + return lu_object_exists(&dt->do_lu); +} + +static inline int dt_object_remote(const struct dt_object *dt) +{ + return lu_object_remote(&dt->do_lu); +} + +static inline struct dt_object *lu2dt_obj(struct lu_object *o) +{ + LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev))); + return container_of_safe(o, struct dt_object, do_lu); +} + +static inline struct dt_object *dt_object_child(struct dt_object *o) +{ + return container_of(lu_object_next(&(o)->do_lu), + struct dt_object, do_lu); +} + +struct dt_quota_reserve_rec { + enum quota_type qrr_type; + union lquota_id qrr_id; + __u64 qrr_count; +}; + +/** + * This is the general purpose transaction handle. + * 1. Transaction Life Cycle + * This transaction handle is allocated upon starting a new transaction, + * and deallocated after this transaction is committed. + * 2. Transaction Nesting + * We do _NOT_ support nested transaction. So, every thread should only + * have one active transaction, and a transaction only belongs to one + * thread. Due to this, transaction handle need no reference count. + * 3. Transaction & dt_object locking + * dt_object locks should be taken inside transaction. + * 4. Transaction & RPC + * No RPC request should be issued inside transaction. 
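+ *
+ * An illustrative life cycle built from the helpers declared later in this
+ * header (a minimal sketch only: error handling is omitted and a single
+ * attribute update is assumed as the payload):
+ *
+ *	th = dt_trans_create(env, dev);
+ *	rc = dt_declare_attr_set(env, obj, &attr, th);
+ *	rc = dt_trans_start(env, dev, th);
+ *	rc = dt_attr_set(env, obj, &attr, th);
+ *	rc = dt_trans_stop(env, dev, th);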
+ */ +struct thandle { + /** the dt device on which the transactions are executed */ + struct dt_device *th_dev; + + /* point to the top thandle, XXX this is a bit hacky right now, + * but normal device trans callback triggered by the bottom + * device (OSP/OSD == sub thandle layer) needs to get the + * top_thandle (see dt_txn_hook_start/stop()), so we put the + * top thandle here for now, will fix it when we have better + * callback mechanism */ + struct thandle *th_top; + + /* reserved quota for this handle */ + struct dt_quota_reserve_rec th_reserved_quota; + + /** the last operation result in this transaction. + * this value is used in recovery */ + __s32 th_result; + + /** whether we need sync commit */ + unsigned int th_sync:1, + /* local transaction, no need to inform other layers */ + th_local:1, + /* whether we need to wait for the transaction to be submitted + * (sent to remote target) */ + th_wait_submit:1, + /* complex transaction which will track updates on all targets, + * including OSTs */ + th_complex:1, + /* whether to ignore quota */ + th_ignore_quota:1, + /* whether to restart the transaction */ + th_restart_tran:1; +}; + +/** + * Transaction call-backs. + * + * These are invoked by the OSD (or the underlying transaction engine) when + * the transaction changes state. + * + * Call-backs are used by upper layers to modify transaction parameters and to + * perform some actions for each transaction state transition. A typical + * example is the MDT registering a call-back to write into the last-received + * file before each transaction commit. + */ +struct dt_txn_callback { + int (*dtc_txn_start)(const struct lu_env *env, + struct thandle *txn, void *cookie); + int (*dtc_txn_stop)(const struct lu_env *env, + struct thandle *txn, void *cookie); + void *dtc_cookie; + __u32 dtc_tag; + struct list_head dtc_linkage; +}; + +void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb); +void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb); + +int dt_txn_hook_start(const struct lu_env *env, + struct dt_device *dev, struct thandle *txn); +int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn); + +int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj); + +/** + * Callback function used for parsing path.
+ * \see llo_store_resolve + */ +typedef int (*dt_entry_func_t)(const struct lu_env *env, + const char *name, + void *pvt); + +#define DT_MAX_PATH 1024 + +int dt_path_parser(const struct lu_env *env, + char *local, dt_entry_func_t entry_func, + void *data); + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid); + +struct dt_object *dt_store_open(const struct lu_env *env, + struct dt_device *dt, + const char *dirname, + const char *filename, + struct lu_fid *fid); + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *attr); + +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, + const struct lu_fid *fid, + struct lu_device *top_dev, + const struct lu_object_conf *conf); + +static inline struct dt_object * +dt_locate(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *fid) +{ + return dt_locate_at(env, dev, fid, + dev->dd_lu_dev.ld_site->ls_top_dev, NULL); +} + +static inline struct dt_object * +dt_object_locate(struct dt_object *dto, struct dt_device *dt_dev) +{ + struct lu_object *lo; + + list_for_each_entry(lo, &dto->do_lu.lo_header->loh_layers, lo_linkage) { + if (lo->lo_dev == &dt_dev->dd_lu_dev) + return container_of(lo, struct dt_object, do_lu); + } + return NULL; +} + +static inline void dt_object_put(const struct lu_env *env, + struct dt_object *dto) +{ + lu_object_put(env, &dto->do_lu); +} + +static inline void dt_object_put_nocache(const struct lu_env *env, + struct dt_object *dto) +{ + lu_object_put_nocache(env, &dto->do_lu); +} + +int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *first_fid, + struct local_oid_storage **los); +void local_oid_storage_fini(const struct lu_env *env, + struct local_oid_storage *los); +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid); +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th); +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, struct dt_object_format *dof, + struct thandle *th); +struct dt_object *local_file_find(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name); +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode); +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode); +struct dt_object * +local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name); + +static inline int dt_object_lock(const struct lu_env *env, 
+ struct dt_object *o, struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(o != NULL); + LASSERT(o->do_ops != NULL); + LASSERT(o->do_ops->do_object_lock != NULL); + return o->do_ops->do_object_lock(env, o, lh, einfo, policy); +} + +static inline int dt_object_unlock(const struct lu_env *env, + struct dt_object *o, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(o != NULL); + LASSERT(o->do_ops != NULL); + LASSERT(o->do_ops->do_object_unlock != NULL); + return o->do_ops->do_object_unlock(env, o, einfo, policy); +} + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid); + +static inline int dt_object_sync(const struct lu_env *env, struct dt_object *o, + __u64 start, __u64 end) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_object_sync); + return o->do_ops->do_object_sync(env, o, start, end); +} + +static inline int dt_fid_alloc(const struct lu_env *env, + struct dt_device *d, + struct lu_fid *fid, + struct lu_object *parent, + const struct lu_name *name) +{ + struct lu_device *l = dt2lu_dev(d); + + return l->ld_ops->ldo_fid_alloc(env, l, fid, parent, name); +} + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th); +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th); +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o); + + +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th); +typedef int (*dt_index_page_build_t)(const struct lu_env *env, + union lu_page *lp, size_t nob, + const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg); +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg); +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg); + +static inline struct thandle *dt_trans_create(const struct lu_env *env, + struct dt_device *d) +{ + LASSERT(d->dd_ops->dt_trans_create); + return d->dd_ops->dt_trans_create(env, d); +} + +static inline int dt_trans_start(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + return d->dd_ops->dt_trans_start(env, d, th); +} + +/* for this transaction hooks shouldn't be called */ +static inline int dt_trans_start_local(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + th->th_local = 1; + return d->dd_ops->dt_trans_start(env, d, th); +} + +static inline int dt_trans_stop(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_stop); + return d->dd_ops->dt_trans_stop(env, d, th); +} + +static inline int dt_trans_cb_add(struct thandle *th, + struct dt_txn_commit_cb *dcb) +{ + LASSERT(th->th_dev->dd_ops->dt_trans_cb_add); + dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC; + return th->th_dev->dd_ops->dt_trans_cb_add(th, dcb); +} +/** @} dt */ + + +static inline int dt_declare_record_write(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + loff_t pos, + 
struct thandle *th) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERTF(dt->do_body_ops, DFID" doesn't exit\n", + PFID(lu_object_fid(&dt->do_lu))); + LASSERT(dt->do_body_ops->dbo_declare_write); + rc = dt->do_body_ops->dbo_declare_write(env, dt, buf, pos, th); + return rc; +} + +static inline int dt_declare_create(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_create); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_CREATE)) + return cfs_fail_err; + + return dt->do_ops->do_declare_create(env, dt, attr, hint, dof, th); +} + +static inline int dt_create(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_create); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_CREATE)) + return cfs_fail_err; + + return dt->do_ops->do_create(env, dt, attr, hint, dof, th); +} + +static inline int dt_declare_destroy(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_destroy); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_DESTROY)) + return cfs_fail_err; + + return dt->do_ops->do_declare_destroy(env, dt, th); +} + +static inline int dt_destroy(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_destroy); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DESTROY)) + return cfs_fail_err; + + return dt->do_ops->do_destroy(env, dt, th); +} + +static inline void dt_read_lock(const struct lu_env *env, + struct dt_object *dt, + unsigned role) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_read_lock); + dt->do_ops->do_read_lock(env, dt, role); +} + +static inline void dt_write_lock(const struct lu_env *env, + struct dt_object *dt, + unsigned role) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_lock); + dt->do_ops->do_write_lock(env, dt, role); +} + +static inline void dt_read_unlock(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_read_unlock); + dt->do_ops->do_read_unlock(env, dt); +} + +static inline void dt_write_unlock(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_unlock); + dt->do_ops->do_write_unlock(env, dt); +} + +static inline int dt_write_locked(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_locked); + return dt->do_ops->do_write_locked(env, dt); +} + +static inline bool dt_object_stale(struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_check_stale); + + return dt->do_ops->do_check_stale(dt); +} + +static inline int dt_declare_attr_get(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_attr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_ATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_attr_get(env, dt); +} + +static inline int dt_attr_get(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *la) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + 
LASSERT(dt->do_ops->do_attr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_ATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_attr_get(env, dt, la); +} + +static inline int dt_declare_attr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *la, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_attr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_ATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_attr_set(env, dt, la, th); +} + +static inline int dt_attr_set(const struct lu_env *env, struct dt_object *dt, + const struct lu_attr *la, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_attr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_ATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_attr_set(env, dt, la, th); +} + +static inline int dt_declare_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_add); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_REF_ADD)) + return cfs_fail_err; + + return dt->do_ops->do_declare_ref_add(env, dt, th); +} + +static inline int dt_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_add); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_REF_ADD)) + return cfs_fail_err; + + return dt->do_ops->do_ref_add(env, dt, th); +} + +static inline int dt_declare_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_REF_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_declare_ref_del(env, dt, th); +} + +static inline int dt_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_REF_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_ref_del(env, dt, th); +} + +static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d, + struct niobuf_remote *rnb, + struct niobuf_local *lnb, int maxlnb, + enum dt_bufs_type rw) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_get); + return d->do_body_ops->dbo_bufs_get(env, d, rnb->rnb_offset, + rnb->rnb_len, lnb, maxlnb, rw); +} + +static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_put); + return d->do_body_ops->dbo_bufs_put(env, d, lnb, n); +} + +static inline int dt_write_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_write_prep); + return d->do_body_ops->dbo_write_prep(env, d, lnb, n); +} + +static inline int dt_declare_write_commit(const struct lu_env *env, + struct dt_object *d, + struct niobuf_local *lnb, + int n, struct thandle *th) +{ + LASSERTF(d != NULL, "dt is NULL when we want to declare write\n"); + LASSERT(th != NULL); + return d->do_body_ops->dbo_declare_write_commit(env, d, lnb, n, th); +} + + +static inline int dt_write_commit(const struct lu_env *env, + struct dt_object *d, struct niobuf_local *lnb, + int n, struct thandle *th, __u64 size) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + 
LASSERT(d->do_body_ops->dbo_write_commit); + return d->do_body_ops->dbo_write_commit(env, d, lnb, n, th, size); +} + +static inline int dt_read_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_read_prep); + return d->do_body_ops->dbo_read_prep(env, d, lnb, n); +} + +static inline int dt_declare_write(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, loff_t pos, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_write); + return dt->do_body_ops->dbo_declare_write(env, dt, buf, pos, th); +} + +static inline ssize_t dt_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_write); + return dt->do_body_ops->dbo_write(env, dt, buf, pos, th); +} + +static inline int dt_declare_punch(const struct lu_env *env, + struct dt_object *dt, __u64 start, + __u64 end, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_punch); + return dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th); +} + +static inline int dt_punch(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_punch); + return dt->do_body_ops->dbo_punch(env, dt, start, end, th); +} + +static inline int dt_ladvise(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, int advice) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_ladvise); + return dt->do_body_ops->dbo_ladvise(env, dt, start, end, advice); +} + +static inline int dt_declare_fallocate(const struct lu_env *env, + struct dt_object *dt, __u64 start, + __u64 end, int mode, struct thandle *th) +{ + LASSERT(dt); + if (!dt->do_body_ops) + return -EOPNOTSUPP; + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_fallocate); + return dt->do_body_ops->dbo_declare_fallocate(env, dt, start, end, + mode, th); +} + +static inline int dt_falloc(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, int mode, + struct thandle *th) +{ + LASSERT(dt); + if (!dt->do_body_ops) + return -EOPNOTSUPP; + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_fallocate); + return dt->do_body_ops->dbo_fallocate(env, dt, start, end, mode, th); +} + +static inline int dt_fiemap_get(const struct lu_env *env, struct dt_object *d, + struct fiemap *fm) +{ + LASSERT(d); + if (d->do_body_ops == NULL) + return -EPROTO; + if (d->do_body_ops->dbo_fiemap_get == NULL) + return -EOPNOTSUPP; + return d->do_body_ops->dbo_fiemap_get(env, d, fm); +} + +static inline loff_t dt_lseek(const struct lu_env *env, struct dt_object *d, + loff_t offset, int whence) +{ + LASSERT(d); + if (d->do_body_ops == NULL) + return -EPROTO; + if (d->do_body_ops->dbo_lseek == NULL) + return -EOPNOTSUPP; + return d->do_body_ops->dbo_lseek(env, d, offset, whence); +} + +static inline int dt_statfs_info(const struct lu_env *env, + struct dt_device *dev, + struct obd_statfs *osfs, + struct obd_statfs_info *info) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_statfs); + return dev->dd_ops->dt_statfs(env, dev, osfs, info); +} + +static inline int dt_statfs(const struct lu_env *env, struct dt_device *dev, + struct obd_statfs *osfs) +{ + return 
dt_statfs_info(env, dev, osfs, NULL); +} + +static inline int dt_root_get(const struct lu_env *env, struct dt_device *dev, + struct lu_fid *f) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_root_get); + return dev->dd_ops->dt_root_get(env, dev, f); +} + +static inline void dt_conf_get(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_conf_get); + return dev->dd_ops->dt_conf_get(env, dev, param); +} + +static inline struct super_block *dt_mnt_sb_get(const struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + if (dev->dd_ops->dt_mnt_sb_get) + return dev->dd_ops->dt_mnt_sb_get(dev); + + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int dt_sync(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_sync); + return dev->dd_ops->dt_sync(env, dev); +} + +static inline int dt_ro(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_ro); + return dev->dd_ops->dt_ro(env, dev); +} + +static inline int dt_declare_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_insert); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_INSERT)) + return cfs_fail_err; + + return dt->do_index_ops->dio_declare_insert(env, dt, rec, key, th); +} + +static inline int dt_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_insert); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_INSERT)) + return cfs_fail_err; + + return dt->do_index_ops->dio_insert(env, dt, rec, key, th); +} + +static inline int dt_declare_xattr_del(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_XATTR_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_declare_xattr_del(env, dt, name, th); +} + +static inline int dt_xattr_del(const struct lu_env *env, + struct dt_object *dt, const char *name, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_del(env, dt, name, th); +} + +static inline int dt_declare_xattr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, int fl, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_XATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_xattr_set(env, dt, buf, name, fl, th); +} + +static inline int dt_xattr_set(const struct lu_env *env, + struct dt_object *dt, const struct lu_buf *buf, + const char *name, int fl, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_set(env, dt, buf, name, fl, th); +} + +static inline int dt_declare_xattr_get(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + 
const char *name) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_XATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_xattr_get(env, dt, buf, name); +} + +static inline int dt_xattr_get(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + const char *name) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_get(env, dt, buf, name); +} + +static inline int dt_xattr_list(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_list); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_LIST)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_list(env, dt, buf); +} + +static inline int dt_invalidate(const struct lu_env *env, struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_invalidate); + + return dt->do_ops->do_invalidate(env, dt); +} + +static inline int dt_declare_delete(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_delete); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_DELETE)) + return cfs_fail_err; + + return dt->do_index_ops->dio_declare_delete(env, dt, key, th); +} + +static inline int dt_delete(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_delete); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DELETE)) + return cfs_fail_err; + + return dt->do_index_ops->dio_delete(env, dt, key, th); +} + +static inline int dt_commit_async(const struct lu_env *env, + struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_commit_async); + return dev->dd_ops->dt_commit_async(env, dev); +} + +static inline int dt_reserve_or_free_quota(const struct lu_env *env, + struct dt_device *dev, + enum quota_type type, __u64 uid, + __u64 gid, int count, bool is_md) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_reserve_or_free_quota); + return dev->dd_ops->dt_reserve_or_free_quota(env, dev, type, uid, gid, + count, is_md); +} + +static inline int dt_lookup(const struct lu_env *env, + struct dt_object *dt, + struct dt_rec *rec, + const struct dt_key *key) +{ + int ret; + + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_lookup); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_LOOKUP)) + return cfs_fail_err; + + ret = dt->do_index_ops->dio_lookup(env, dt, rec, key); + if (ret > 0) + ret = 0; + else if (ret == 0) + ret = -ENOENT; + return ret; +} + +static inline int dt_declare_layout_change(const struct lu_env *env, + struct dt_object *o, + struct md_layout_change *mlc, + struct thandle *th) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_declare_layout_change); + return o->do_ops->do_declare_layout_change(env, o, mlc, th); +} + +static inline int dt_layout_change(const struct lu_env *env, + struct dt_object *o, + struct md_layout_change *mlc, + struct thandle *th) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_layout_change); + return o->do_ops->do_layout_change(env, o, mlc, th); +} + +struct dt_find_hint { + struct lu_fid *dfh_fid; + struct dt_device *dfh_dt; + struct dt_object *dfh_o; +}; + 
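The wrappers above follow the dt/OSD declare-execute convention: each modification is first declared against a transaction handle so the backend can reserve credits, and is only executed after the transaction has been started; dt_lookup() additionally normalizes the index result to 0 when the key is found and -ENOENT when it is absent. The minimal caller sketch below is illustrative only and not part of the patch; it assumes the dt_trans_create()/dt_trans_start()/dt_trans_stop() helpers that dt_object.h provides outside of this hunk.

/* Illustrative sketch only -- not part of the patch above. */
static int example_index_insert(const struct lu_env *env,
                                struct dt_device *dev,
                                struct dt_object *obj,
                                const struct dt_rec *rec,
                                const struct dt_key *key)
{
        struct thandle *th;
        int rc, rc2;

        /* assumed helper declared elsewhere in dt_object.h */
        th = dt_trans_create(env, dev);
        if (IS_ERR(th))
                return PTR_ERR(th);

        /* declare phase: reserve transaction credits before it starts */
        rc = dt_declare_insert(env, obj, rec, key, th);
        if (rc == 0)
                rc = dt_trans_start(env, dev, th);

        /* execute phase: the insert runs under the started transaction */
        if (rc == 0)
                rc = dt_insert(env, obj, rec, key, th);

        rc2 = dt_trans_stop(env, dev, th);
        return rc != 0 ? rc : rc2;
}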
+struct dt_insert_rec { + union { + const struct lu_fid *rec_fid; + void *rec_data; + }; + union { + struct { + __u32 rec_type; + __u32 rec_padding; + }; + __u64 rec_misc; + }; +}; + +struct dt_thread_info { + char dti_buf[DT_MAX_PATH]; + struct dt_find_hint dti_dfh; + struct lu_attr dti_attr; + struct lu_fid dti_fid; + struct dt_object_format dti_dof; + struct lustre_mdt_attrs dti_lma; + struct lu_buf dti_lb; + struct lu_object_conf dti_conf; + loff_t dti_off; + struct dt_insert_rec dti_dt_rec; +}; + +extern struct lu_context_key dt_key; + +static inline struct dt_thread_info *dt_info(const struct lu_env *env) +{ + struct dt_thread_info *dti; + + dti = lu_context_key_get(&env->le_ctx, &dt_key); + LASSERT(dti); + return dti; +} + +int dt_global_init(void); +void dt_global_fini(void); +int dt_tunables_init(struct dt_device *dt, struct obd_type *type, + const char *name, struct ldebugfs_vars *list); +int dt_tunables_fini(struct dt_device *dt); + +# ifdef CONFIG_PROC_FS +int lprocfs_dt_blksize_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_kbytestotal_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_kbytesfree_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_kbytesavail_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_filestotal_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_filesfree_seq_show(struct seq_file *m, void *v); +# endif /* CONFIG_PROC_FS */ + +#endif /* __LUSTRE_DT_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/interval_tree.h b/drivers/staging/lustrefsx/lustre/include/interval_tree.h new file mode 100644 index 0000000000000..9d6f3031b4293 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/interval_tree.h @@ -0,0 +1,130 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/interval_tree.h + * + * Author: Huang Wei + * Author: Jay Xiong + */ + +#ifndef _INTERVAL_H__ +#define _INTERVAL_H__ + +#include +#include +#include + +struct interval_node { + struct interval_node *in_left; + struct interval_node *in_right; + struct interval_node *in_parent; + unsigned in_color:1, + in_intree:1, /** set if the node is in tree */ + in_res1:30; + __u8 in_res2[4]; /** tags, 8-bytes aligned */ + __u64 in_max_high; + struct interval_node_extent { + __u64 start; + __u64 end; + } in_extent; +}; + +enum interval_iter { + INTERVAL_ITER_CONT = 1, + INTERVAL_ITER_STOP = 2 +}; + +static inline int interval_is_intree(struct interval_node *node) +{ + return node->in_intree == 1; +} + +static inline __u64 interval_low(struct interval_node *node) +{ + return node->in_extent.start; +} + +static inline __u64 interval_high(struct interval_node *node) +{ + return node->in_extent.end; +} + +static inline int interval_set(struct interval_node *node, + __u64 start, __u64 end) +{ + if (start > end) + return -ERANGE; + node->in_extent.start = start; + node->in_extent.end = end; + node->in_max_high = end; + return 0; +} + +static inline void interval_init(struct interval_node *node) +{ + memset(node, 0, sizeof(*node)); +} + +int node_equal(struct interval_node *n1, struct interval_node *n2); + +/* Rules to write an interval callback. + * - the callback returns INTERVAL_ITER_STOP when it thinks the iteration + * should be stopped. It will then cause the iteration function to return + * immediately with return value INTERVAL_ITER_STOP. + * - callbacks for interval_iterate and interval_iterate_reverse: Every + * nodes in the tree will be set to @node before the callback being called + * - callback for interval_search: Only overlapped node will be set to @node + * before the callback being called. + */ +typedef enum interval_iter (*interval_callback_t)(struct interval_node *node, + void *args); + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root); +void interval_erase(struct interval_node *node, struct interval_node **root); + +/* Search the extents in the tree and call @func for each overlapped + * extents. */ +enum interval_iter interval_search(struct interval_node *root, + struct interval_node_extent *ex, + interval_callback_t func, void *data); + +/* Iterate every node in the tree - by reverse order or regular order. */ +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, void *data); +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func,void *data); + +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter); +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ex); +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex); +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/llog_swab.h b/drivers/staging/lustrefsx/lustre/include/llog_swab.h new file mode 100644 index 0000000000000..cf48167ff8042 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/llog_swab.h @@ -0,0 +1,70 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * We assume all nodes are either little-endian or big-endian, and we + * always send messages in the sender's native format. The receiver + * detects the message format by checking the 'magic' field of the message + * (see lustre_msg_swabbed() below). + * + * Each type has corresponding 'lustre_swab_xxxtypexxx()' routines + * are implemented in ptlrpc/pack_generic.c. These 'swabbers' convert the + * type from "other" endian, in-place in the message buffer. + * + * A swabber takes a single pointer argument. The caller must already have + * verified that the length of the message buffer >= sizeof (type). + * + * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine + * may be defined that swabs just the variable part, after the caller has + * verified that the message buffer is large enough. + */ + +#ifndef _LLOG_SWAB_H_ +#define _LLOG_SWAB_H_ + +#include +struct lustre_cfg; + +void lustre_swab_lu_fid(struct lu_fid *fid); +void lustre_swab_ost_id(struct ost_id *oid); +void lustre_swab_ll_fid(struct ll_fid *fid); +void lustre_swab_llogd_body(struct llogd_body *d); +void lustre_swab_llog_hdr(struct llog_log_hdr *h); +void lustre_swab_llogd_conn_body(struct llogd_conn_body *d); +void lustre_swab_llog_rec(struct llog_rec_hdr *rec); +void lustre_swab_llog_id(struct llog_logid *lid); +void lustre_swab_lu_seq_range(struct lu_seq_range *range); +#ifdef HAVE_SERVER_SUPPORT +void lustre_swab_update_ops(struct update_ops *uops, unsigned int op_count); +#endif +void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg); +void lustre_swab_cfg_marker(struct cfg_marker *marker, + int swab, int size); + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h new file mode 100644 index 0000000000000..d5fc7da4fbda2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h @@ -0,0 +1,1124 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lprocfs_status.h + * + * Top level header file for LProc + * + * Author: Hariharan Thantry thantry@users.sourceforge.net + */ +#ifndef _LPROCFS_STATUS_H +#define _LPROCFS_STATUS_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Linux 5.6 introduces proc_ops with v5.5-8862-gd56c0d45f0e2 + * Now that proc and debugfs use separate operation vector types, + * separate containers are also needed. + */ +struct lprocfs_vars { + const char *name; + const struct proc_ops *fops; + void *data; + /** /proc file mode. */ + mode_t proc_mode; +}; + +/** Provide a debugfs container */ +struct ldebugfs_vars { + const char *name; + const struct file_operations *fops; + void *data; + /** debugfs file mode. */ + mode_t proc_mode; +}; + +static inline unsigned int pct(unsigned long a, unsigned long b) +{ + return b ? a * 100 / b : 0; +} + +#define PAGES_TO_MiB(pages) ((pages) >> (20 - PAGE_SHIFT)) +#define MiB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + +/** + * Append a space separated list of current set flags to str. + */ +#define flag2str(port, flag) \ + do { \ + if ((port)->port##_##flag) { \ + seq_printf(m, "%s" #flag, first ? "" : ", "); \ + first = false; \ + } \ + } while (0) + +void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, __u64 flags2, + const char *sep); +void obd_connect_data_seqprint(struct seq_file *m, + struct obd_connect_data *ocd); + +/* if we find more consumers this could be generalized */ +#define OBD_HIST_MAX 32 +struct obd_histogram { + spinlock_t oh_lock; + unsigned long oh_buckets[OBD_HIST_MAX]; +}; + +struct obd_hist_pcpu { + struct percpu_counter oh_pc_buckets[OBD_HIST_MAX]; + bool oh_initialized; +}; + +enum { + RENAME_SAMEDIR_SIZE = 0, + RENAME_CROSSDIR_SRC_SIZE, + RENAME_CROSSDIR_TGT_SIZE, + RENAME_LAST, +}; + +struct rename_stats { + ktime_t rs_init; + struct obd_histogram rs_hist[RENAME_LAST]; +}; + +/* An lprocfs counter can be configured using the enum bit masks below. + * + * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already + * protects this counter from concurrent updates. If not specified, + * lprocfs uses an internal per-counter lock variable. External locks are + * not used to protect counter increments, but are used to protect + * counter readout and resets. + * + * LPROCFS_CNTR_AVGMINMAX indicates multi-valued counter samples + * (i.e. counter can be incremented by more than "1"). When specified, + * the counter maintains min, max and sum in addition to a simple + * invocation count. This allows averages to be computed. + * If not specified, the counter is an increment-by-1 counter. + * min, max, sum, etc. are not maintained. + * + * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of + * squares (for multi-valued counter samples only). This allows + * external computation of standard deviation, but involves a 64-bit + * multiply per counter increment. 
+ */ + +enum { + LPROCFS_CNTR_EXTERNALLOCK = 0x0001, + LPROCFS_CNTR_AVGMINMAX = 0x0002, + LPROCFS_CNTR_STDDEV = 0x0004, + + /* counter data type */ + LPROCFS_TYPE_REQS = 0x0100, + LPROCFS_TYPE_BYTES = 0x0200, + LPROCFS_TYPE_PAGES = 0x0400, + LPROCFS_TYPE_USEC = 0x0800, + + LPROCFS_TYPE_LATENCY = LPROCFS_TYPE_USEC | + LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV, + LPROCFS_TYPE_BYTES_FULL = LPROCFS_TYPE_BYTES | + LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV, +}; +#define LC_MIN_INIT ((~(__u64)0) >> 1) + +struct lprocfs_counter_header { + unsigned int lc_config; + const char *lc_name; /* must be static */ + const char *lc_units; /* must be static */ +}; + +struct lprocfs_counter { + __s64 lc_count; + __s64 lc_min; + __s64 lc_max; + __s64 lc_sumsquare; + /* + * Every counter has lc_array_sum[0], while lc_array_sum[1] is only + * for irq context counter, i.e. stats with + * LPROCFS_STATS_FLAG_IRQ_SAFE flag, its counter need + * lc_array_sum[1] + */ + __s64 lc_array_sum[1]; +}; +#define lc_sum lc_array_sum[0] +#define lc_sum_irq lc_array_sum[1] + +struct lprocfs_percpu { + struct lprocfs_counter lp_cntr[0]; +}; + +enum lprocfs_stats_lock_ops { + LPROCFS_GET_NUM_CPU = 0x0001, /* number allocated per-CPU stats */ + LPROCFS_GET_SMP_ID = 0x0002, /* current stat to be updated */ +}; + +enum lprocfs_stats_flags { + LPROCFS_STATS_FLAG_NONE = 0x0000, /* per cpu counter */ + LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu + * area and need locking */ + LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */ +}; + +enum lprocfs_fields_flags { + LPROCFS_FIELDS_FLAGS_CONFIG = 0x0001, + LPROCFS_FIELDS_FLAGS_SUM = 0x0002, + LPROCFS_FIELDS_FLAGS_MIN = 0x0003, + LPROCFS_FIELDS_FLAGS_MAX = 0x0004, + LPROCFS_FIELDS_FLAGS_AVG = 0x0005, + LPROCFS_FIELDS_FLAGS_SUMSQUARE = 0x0006, + LPROCFS_FIELDS_FLAGS_COUNT = 0x0007, +}; + +struct lprocfs_stats { + /* # of counters */ + unsigned short ls_num; + /* 1 + the biggest cpu # whose ls_percpu slot has been allocated */ + unsigned short ls_biggest_alloc_num; + enum lprocfs_stats_flags ls_flags; + ktime_t ls_init; + /* Lock used when there are no percpu stats areas; For percpu stats, + * it is used to protect ls_biggest_alloc_num change */ + spinlock_t ls_lock; + + /* has ls_num of counter headers */ + struct lprocfs_counter_header *ls_cnt_header; + struct lprocfs_percpu *ls_percpu[0]; +}; + +#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC) + +/* Pack all opcodes down into a single monotonically increasing index */ +static inline int opcode_offset(__u32 opc) { + if (opc < OST_LAST_OPC) { + /* OST opcode */ + return (opc - OST_FIRST_OPC); + } else if (opc < MDS_LAST_OPC) { + /* MDS opcode */ + return (opc - MDS_FIRST_OPC + + OPC_RANGE(OST)); + } else if (opc < LDLM_LAST_OPC) { + /* LDLM Opcode */ + return (opc - LDLM_FIRST_OPC + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < MGS_LAST_OPC) { + /* MGS Opcode */ + return (opc - MGS_FIRST_OPC + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < OBD_LAST_OPC) { + /* OBD Ping */ + return (opc - OBD_FIRST_OPC + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < LLOG_LAST_OPC) { + /* LLOG Opcode */ + return (opc - LLOG_FIRST_OPC + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < QUOTA_LAST_OPC) { + /* LQUOTA Opcode */ + return (opc - QUOTA_FIRST_OPC + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + 
OPC_RANGE(OST)); + } else if (opc < SEQ_LAST_OPC) { + /* SEQ opcode */ + return (opc - SEQ_FIRST_OPC + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < SEC_LAST_OPC) { + /* SEC opcode */ + return (opc - SEC_FIRST_OPC + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < FLD_LAST_OPC) { + /* FLD opcode */ + return (opc - FLD_FIRST_OPC + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); +#ifdef HAVE_SERVER_SUPPORT + } else if (opc < OUT_UPDATE_LAST_OPC) { + /* update opcode */ + return (opc - OUT_UPDATE_FIRST_OPC + + OPC_RANGE(FLD) + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < LFSCK_LAST_OPC) { + /* LFSCK opcode */ + return (opc - LFSCK_FIRST_OPC + + OPC_RANGE(OUT_UPDATE) + + OPC_RANGE(FLD) + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); +#endif /* HAVE_SERVER_SUPPORT */ + } else { + /* Unknown Opcode */ + return -1; + } +} + +#define LUSTRE_MAX_OPCODES_CLIENT (OPC_RANGE(OST) + \ + OPC_RANGE(MDS) + \ + OPC_RANGE(LDLM) + \ + OPC_RANGE(MGS) + \ + OPC_RANGE(OBD) + \ + OPC_RANGE(LLOG) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(SEQ) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(FLD)) + +#ifdef HAVE_SERVER_SUPPORT +#define LUSTRE_MAX_OPCODES (LUSTRE_MAX_OPCODES_CLIENT + \ + OPC_RANGE(OUT_UPDATE) + \ + OPC_RANGE(LFSCK)) +#else +#define LUSTRE_MAX_OPCODES LUSTRE_MAX_OPCODES_CLIENT +#endif + +#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR) + \ + OPC_RANGE(EXTRA)) + +enum { + PTLRPC_REQWAIT_CNTR = 0, + PTLRPC_REQQDEPTH_CNTR, + PTLRPC_REQACTIVE_CNTR, + PTLRPC_TIMEOUT, + PTLRPC_REQBUF_AVAIL_CNTR, + PTLRPC_LAST_CNTR +}; + +#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR + +enum lprocfs_extra_opc { + LDLM_GLIMPSE_ENQUEUE = 0, + LDLM_PLAIN_ENQUEUE, + LDLM_EXTENT_ENQUEUE, + LDLM_FLOCK_ENQUEUE, + LDLM_IBITS_ENQUEUE, + MDS_REINT_SETATTR, + MDS_REINT_CREATE, + MDS_REINT_LINK, + MDS_REINT_UNLINK, + MDS_REINT_RENAME, + MDS_REINT_OPEN, + MDS_REINT_SETXATTR, + MDS_REINT_RESYNC, + BRW_READ_BYTES, + BRW_WRITE_BYTES, + EXTRA_LAST_OPC +}; + +#ifdef HAVE_SERVER_SUPPORT +enum brw_rw_stats { + BRW_R_PAGES = 0, + BRW_W_PAGES, + BRW_R_DISCONT_PAGES, + BRW_W_DISCONT_PAGES, + BRW_R_DISCONT_BLOCKS, + BRW_W_DISCONT_BLOCKS, + BRW_R_DIO_FRAGS, + BRW_W_DIO_FRAGS, + BRW_R_RPC_HIST, + BRW_W_RPC_HIST, + BRW_R_IO_TIME, + BRW_W_IO_TIME, + BRW_R_DISK_IOSIZE, + BRW_W_DISK_IOSIZE, + BRW_RW_STATS_NUM, +}; + +struct brw_stats_props { + const char *bsp_name; + const char *bsp_units; + bool bsp_scale; +}; + +struct brw_stats { + ktime_t bs_init; + struct obd_hist_pcpu bs_hist[BRW_RW_STATS_NUM]; + struct brw_stats_props bs_props[BRW_RW_STATS_NUM / 2]; +}; + +int lprocfs_init_brw_stats(struct brw_stats *brw_stats); +void lprocfs_fini_brw_stats(struct brw_stats *brw_stats); + +void ldebugfs_register_osd_stats(struct dentry *parent, + struct brw_stats *brw_stats, + struct lprocfs_stats *stats); +#endif /* HAVE_SERVER_SUPPORT */ + +#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE +/* class_obd.c */ +extern struct proc_dir_entry 
*proc_lustre_root; +extern struct dentry *debugfs_lustre_root; +extern struct kset *lustre_kset; + +struct obd_device; + +#define JOBSTATS_JOBID_VAR_MAX_LEN 20 +#define JOBSTATS_DISABLE "disable" +#define JOBSTATS_PROCNAME_UID "procname_uid" +#define JOBSTATS_NODELOCAL "nodelocal" +#define JOBSTATS_SESSION "session" + +typedef void (*cntr_init_callback)(struct lprocfs_stats *stats, + unsigned int offset); + +struct obd_job_stats { + struct cfs_hash *ojs_hash; /* hash of jobids */ + struct list_head ojs_list; /* list of job_stat structs */ + rwlock_t ojs_lock; /* protect ojs_list/js_list */ + ktime_t ojs_cleanup_interval;/* 1/2 expiry seconds */ + ktime_t ojs_cleanup_last;/* previous cleanup time */ + cntr_init_callback ojs_cntr_init_fn;/* lprocfs_stats initializer */ + unsigned short ojs_cntr_num; /* number of stats in struct */ + bool ojs_cleaning; /* currently expiring stats */ +}; + +#ifdef CONFIG_PROC_FS + +int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, + unsigned int cpuid); +int lprocfs_stats_lock(struct lprocfs_stats *stats, + enum lprocfs_stats_lock_ops opc, + unsigned long *flags); +void lprocfs_stats_unlock(struct lprocfs_stats *stats, + enum lprocfs_stats_lock_ops opc, + unsigned long *flags); + +static inline unsigned int +lprocfs_stats_counter_size(struct lprocfs_stats *stats) +{ + unsigned int percpusize; + + percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]); + + /* irq safe stats need lc_array_sum[1] */ + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpusize += stats->ls_num * sizeof(__s64); + + if ((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0) + percpusize = L1_CACHE_ALIGN(percpusize); + + return percpusize; +} + +static inline struct lprocfs_counter * +lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid, + int index) +{ + struct lprocfs_counter *cntr; + + cntr = &stats->ls_percpu[cpuid]->lp_cntr[index]; + + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + cntr = (void *)cntr + index * sizeof(__s64); + + return cntr; +} + +/* Two optimized LPROCFS counter increment functions are provided: + * lprocfs_counter_incr(cntr) - optimized for by-one counters + * lprocfs_counter_add(cntr, value) - use for multi-valued counters + * Counter data layout allows config flag, counter lock and the + * count itself to reside within a single cache line. 
+ */ + +extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, + long amount); +extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, + long amount); + +#define lprocfs_counter_incr(stats, idx) \ + lprocfs_counter_add(stats, idx, 1) +#define lprocfs_counter_decr(stats, idx) \ + lprocfs_counter_sub(stats, idx, 1) + +extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field); +u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field); + +extern struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags); +extern void lprocfs_clear_stats(struct lprocfs_stats *stats); +extern void lprocfs_free_stats(struct lprocfs_stats **stats); +extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats); +extern int lprocfs_alloc_obd_stats(struct obd_device *obd, + unsigned int num_stats); +extern int lprocfs_alloc_md_stats(struct obd_device *obd, + unsigned int num_private_stats); +extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, + const char *units); +extern void lprocfs_free_obd_stats(struct obd_device *obd); +extern void lprocfs_free_md_stats(struct obd_device *obd); +struct obd_export; +struct nid_stat; +extern int lprocfs_add_clear_entry(struct obd_device *obd, + struct proc_dir_entry *entry); +#ifdef HAVE_SERVER_SUPPORT +extern int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *peer_nid); +extern int lprocfs_exp_cleanup(struct obd_export *exp); +struct dentry *ldebugfs_add_symlink(const char *name, const char *target, + const char *format, ...); +#else +static inline int lprocfs_exp_cleanup(struct obd_export *exp) +{ return 0; } +#endif +extern struct proc_dir_entry * +lprocfs_add_simple(struct proc_dir_entry *root, char *name, + void *data, const struct proc_ops *ops); +extern struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...); +extern void lprocfs_free_per_client_stats(struct obd_device *obd); +#ifdef HAVE_SERVER_SUPPORT +extern ssize_t +lprocfs_nid_stats_clear_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_nid_stats_clear_seq_show(struct seq_file *file, void *data); +#endif +extern int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats); +extern const struct file_operations ldebugfs_stats_seq_fops; + +/* lprocfs_status.c */ +extern void ldebugfs_add_vars(struct dentry *parent, struct ldebugfs_vars *var, + void *data); +extern struct dentry *ldebugfs_register(const char *name, + struct dentry *parent, + struct ldebugfs_vars *list, + void *data); +extern int lprocfs_add_vars(struct proc_dir_entry *root, + struct lprocfs_vars *var, void *data); + +extern struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data); + +extern void lprocfs_remove(struct proc_dir_entry **root); +extern void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent); +extern int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only); +extern int lprocfs_obd_cleanup(struct obd_device *obd); + +extern int lprocfs_seq_create(struct proc_dir_entry *parent, const char *name, + mode_t mode, const struct proc_ops *seq_fops, + void *data); +extern int 
lprocfs_obd_seq_create(struct obd_device *obd, const char *name, + mode_t mode, const struct proc_ops *seq_fops, + void *data); +extern void lprocfs_stats_header(struct seq_file *seq, ktime_t now, + ktime_t ts_init, int width, const char *colon, + bool show_units, const char *prefix); + +/* Generic callbacks */ +extern int lprocfs_uuid_seq_show(struct seq_file *m, void *data); +extern int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data); +ssize_t conn_uuid_show(struct kobject *kobj, struct attribute *attr, char *buf); +extern int lprocfs_import_seq_show(struct seq_file *m, void *data); +extern int lprocfs_state_seq_show(struct seq_file *m, void *data); +extern int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data); +#ifdef HAVE_SERVER_SUPPORT +ssize_t num_exports_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t grant_check_threshold_show(struct kobject *kobj, + struct attribute *attr, char *buf); +ssize_t grant_check_threshold_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +#endif +struct adaptive_timeout; +extern int lprocfs_at_hist_helper(struct seq_file *m, + struct adaptive_timeout *at); +extern int lprocfs_timeouts_seq_show(struct seq_file *m, void *data); +extern ssize_t +lprocfs_timeouts_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +#ifdef HAVE_SERVER_SUPPORT +extern ssize_t +lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +#endif +ssize_t ping_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); +ssize_t ping_show(struct kobject *kobj, struct attribute *attr, + char *buffer); + +extern ssize_t +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +static inline ssize_t +lprocfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + return ldebugfs_import_seq_write(file, buffer, count, off); +} + +extern int lprocfs_pinger_recov_seq_show(struct seq_file *m, void *data); +extern ssize_t +lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); + +int string_to_size(u64 *size, const char *buffer, size_t count); +int sysfs_memparse(const char *buffer, size_t count, u64 *val, + const char *defunit); +char *lprocfs_strnstr(const char *s1, const char *s2, size_t len); +char *lprocfs_find_named_value(const char *buffer, const char *name, + size_t *count); +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_clear(struct obd_histogram *oh); +unsigned long lprocfs_oh_sum(struct obd_histogram *oh); + +void lprocfs_oh_tally_pcpu(struct obd_hist_pcpu *oh, unsigned int value); +void lprocfs_oh_tally_log2_pcpu(struct obd_hist_pcpu *oh, unsigned int value); +int lprocfs_oh_alloc_pcpu(struct obd_hist_pcpu *oh); +void lprocfs_oh_clear_pcpu(struct obd_hist_pcpu *oh); +void lprocfs_oh_release_pcpu(struct obd_hist_pcpu *oh); +unsigned long lprocfs_oh_sum_pcpu(struct obd_hist_pcpu *oh); +unsigned long lprocfs_oh_counter_pcpu(struct obd_hist_pcpu *oh, + unsigned int value); + +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt); + +#ifdef HAVE_SERVER_SUPPORT +/* lprocfs_status.c: recovery status */ +int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data); + +/* lprocfs_status.c: 
hash statistics */ +int lprocfs_hash_seq_show(struct seq_file *m, void *data); + +/* lprocfs_status.c: IR factor */ +ssize_t ir_factor_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t ir_factor_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); +#endif + +/* lprocfs_status.c: dump pages on cksum error */ +int lprocfs_checksum_dump_seq_show(struct seq_file *m, void *data); +ssize_t +lprocfs_checksum_dump_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); + +extern int lprocfs_single_release(struct inode *, struct file *); +extern int lprocfs_seq_release(struct inode *, struct file *); + +/* You must use these macros when you want to refer to + * the import in a client obd_device for a lprocfs entry + * Note that it is not safe to 'goto', 'return' or 'break' + * out of the body of this statement. It *IS* safe to + * 'goto' the a label inside the statement, or to 'continue' + * to get out of the statement. + */ + +#define with_imp_locked_nested(__obd, __imp, __rc, __nest) \ + for (down_read_nested(&(__obd)->u.cli.cl_sem, __nest), \ + __imp = (__obd)->u.cli.cl_import, \ + __rc = __imp ? 0 : -ENODEV; \ + __imp ? 1 : (up_read(&(__obd)->u.cli.cl_sem), 0); \ + __imp = NULL) + +#define with_imp_locked(__obd, __imp, __rc) \ + with_imp_locked_nested(__obd, __imp, __rc, 0) + +/* write the name##_seq_show function, call LDEBUGFS_SEQ_FOPS_RO for read-only + * debugfs entries; otherwise, you will define name##_seq_write function also + * for a read-write debugfs entry, and then call LDEBUGFS_SEQ_FOPS instead. + * Finally, call debugfs_create_file(filename, 0444, obd, data, &name#_fops); + */ +#define __LDEBUGFS_SEQ_FOPS(name, custom_seq_write) \ +static int name##_single_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, name##_seq_show, inode->i_private); \ +} \ +static const struct file_operations name##_fops = { \ + .owner = THIS_MODULE, \ + .open = name##_single_open, \ + .read = seq_read, \ + .write = custom_seq_write, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} + +#define LDEBUGFS_SEQ_FOPS_RO(name) __LDEBUGFS_SEQ_FOPS(name, NULL) +#define LDEBUGFS_SEQ_FOPS(name) __LDEBUGFS_SEQ_FOPS(name, \ + name##_seq_write) + +#define LDEBUGFS_SEQ_FOPS_RO_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + if (!m->private) \ + return -ENODEV; \ + return lprocfs_##type##_seq_show(m, m->private); \ + } \ + LDEBUGFS_SEQ_FOPS_RO(name##_##type) + +#define LDEBUGFS_SEQ_FOPS_RW_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + if (!m->private) \ + return -ENODEV; \ + return lprocfs_##type##_seq_show(m, m->private); \ + } \ + static ssize_t name##_##type##_seq_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + struct seq_file *seq = file->private_data; \ + \ + if (!seq->private) \ + return -ENODEV; \ + return ldebugfs_##type##_seq_write(file, buffer, count, \ + seq->private); \ + } \ + LDEBUGFS_SEQ_FOPS(name##_##type); + +#define LDEBUGFS_FOPS_WR_ONLY(name, type) \ + static ssize_t name##_##type##_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + return ldebugfs_##type##_seq_write(file, buffer, count, \ + off); \ + } \ + static int name##_##type##_open(struct inode *inode, \ + struct file *file) \ + { \ + return single_open(file, NULL, inode->i_private); \ + } \ + static const struct file_operations 
name##_##type##_fops = { \ + .open = name##_##type##_open, \ + .write = name##_##type##_write, \ + .release = single_release, \ + }; + +/* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only + * proc entries; otherwise, you will define name##_seq_write function also for + * a read-write proc entry, and then call LPROC_SEQ_FOPS instead. Finally, + * call ldebugfs_obd_seq_create(obd, filename, 0444, &name#_fops, data); + */ +#define __LPROC_SEQ_FOPS(name, custom_seq_write) \ +static int name##_single_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, name##_seq_show, \ + inode->i_private ? inode->i_private : \ + pde_data(inode)); \ +} \ +static const struct proc_ops name##_fops = { \ + PROC_OWNER(THIS_MODULE) \ + .proc_open = name##_single_open, \ + .proc_read = seq_read, \ + .proc_write = custom_seq_write, \ + .proc_lseek = seq_lseek, \ + .proc_release = lprocfs_single_release, \ +} + +#define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL) +#define LPROC_SEQ_FOPS(name) __LPROC_SEQ_FOPS(name, name##_seq_write) + +#define LPROC_SEQ_FOPS_RO_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_##type##_seq_show(m, m->private); \ + } \ + LPROC_SEQ_FOPS_RO(name##_##type) + +#define LPROC_SEQ_FOPS_RW_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_##type##_seq_show(m, m->private); \ + } \ + static ssize_t name##_##type##_seq_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + struct seq_file *seq = file->private_data; \ + return lprocfs_##type##_seq_write(file, buffer, \ + count, seq->private); \ + } \ + LPROC_SEQ_FOPS(name##_##type); + +#define LPROC_SEQ_FOPS_WR_ONLY(name, type) \ + static ssize_t name##_##type##_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + return lprocfs_##type##_seq_write(file, buffer, count, off);\ + } \ + static int name##_##type##_open(struct inode *inode, struct file *file)\ + { \ + return single_open(file, NULL, \ + inode->i_private ? 
inode->i_private : \ + pde_data(inode)); \ + } \ + static const struct proc_ops name##_##type##_fops = { \ + .proc_open = name##_##type##_open, \ + .proc_write = name##_##type##_write, \ + .proc_release = lprocfs_single_release, \ + }; + +struct lustre_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); +}; + +#define LUSTRE_ATTR(name, mode, show, store) \ +static struct lustre_attr lustre_attr_##name = __ATTR(name, mode, show, store) + +#define LUSTRE_WO_ATTR(name) LUSTRE_ATTR(name, 0200, NULL, name##_store) +#define LUSTRE_RO_ATTR(name) LUSTRE_ATTR(name, 0444, name##_show, NULL) +#define LUSTRE_RW_ATTR(name) LUSTRE_ATTR(name, 0644, name##_show, name##_store) + +ssize_t lustre_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t lustre_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); + +extern const struct sysfs_ops lustre_sysfs_ops; + +/* lproc_ptlrpc.c */ +struct ptlrpc_request; +extern void target_print_req(void *seq_file, struct ptlrpc_request *req); + +#ifdef HAVE_SERVER_SUPPORT +/* lprocfs_jobstats.c */ +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, + int event, long amount); +void lprocfs_job_stats_fini(struct obd_device *obd); +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback fn); +ssize_t job_cleanup_interval_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t job_cleanup_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +/* lproc_status_server.c */ +ssize_t recovery_time_soft_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t recovery_time_soft_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t recovery_time_hard_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t recovery_time_hard_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t instance_show(struct kobject *kobj, struct attribute *attr, + char *buf); +#endif +/* lproc_status.c */ +int lprocfs_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *data); +ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); +ssize_t short_io_bytes_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t short_io_bytes_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); + +struct root_squash_info; +int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name); +int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name); + +#else /* !CONFIG_PROC_FS */ + +#define proc_lustre_root NULL + +static inline void lprocfs_counter_add(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_incr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_sub(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_decr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_init(struct lprocfs_stats *stats, + int index, unsigned conf, + const char *name, const char 
*units) +{ return; } + +static inline __u64 lc_read_helper(struct lprocfs_counter *lc, + enum lprocfs_fields_flags field) +{ return 0; } + +/* NB: we return !NULL to satisfy error checker */ +static inline struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags) +{ return (struct lprocfs_stats *)1; } +static inline void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_free_stats(struct lprocfs_stats **stats) +{ return; } +static inline int lprocfs_register_stats(struct proc_dir_entry *root, + const char *name, + struct lprocfs_stats *stats) +{ return 0; } +static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ return; } +static inline int lprocfs_alloc_obd_stats(struct obd_device *obd, + unsigned int num_stats) +{ return 0; } +static inline int lprocfs_alloc_md_stats(struct obd_device *obd, + unsigned int num_private_stats) +{ return 0; } +static inline void lprocfs_free_obd_stats(struct obd_device *obd) +{ return; } +static inline void lprocfs_free_md_stats(struct obd_device *obd) +{ return; } + +struct obd_export; +static inline int lprocfs_add_clear_entry(struct obd_export *exp) +{ return 0; } +static inline void lprocfs_free_per_client_stats(struct obd_device *obd) +{ return; } +#ifdef HAVE_SERVER_SUPPORT +static inline +ssize_t lprocfs_nid_stats_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{return 0;} +static inline +int lprocfs_nid_stats_clear_seq_show(struct seq_file *m, void *data) +{return 0;} +static inline int lprocfs_exp_setup(struct obd_export *exp,lnet_nid_t *peer_nid) +{ return 0; } +#endif +static inline int lprocfs_exp_cleanup(struct obd_export *exp) +{ return 0; } +static inline struct proc_dir_entry * +lprocfs_add_simple(struct proc_dir_entry *root, char *name, + void *data, const struct file_operations *fops) +{return 0; } +static inline struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...) 
+{return NULL; } +static inline int lprocfs_add_vars(struct proc_dir_entry *root, + struct lprocfs_vars *var, void *data) +{ return 0; } +static inline struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) +{ return NULL; } +static inline void lprocfs_remove(struct proc_dir_entry **root) +{ return; } +static inline void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent) +{ return; } +static inline int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only) +{ return 0; } +static inline int lprocfs_obd_cleanup(struct obd_device *obd) +{ return 0; } +static inline int lprocfs_uuid_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_server_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_import_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_state_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data) +{ return 0; } +#ifdef HAVE_SERVER_SUPPORT +static inline int lprocfs_num_exports_seq_show(struct seq_file *m, void *data) +{ return 0; } +#endif +struct adaptive_timeout; +static inline int lprocfs_at_hist_helper(struct seq_file *m, + struct adaptive_timeout *at) +{ return 0; } +static inline int lprocfs_timeouts_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline ssize_t +lprocfs_timeouts_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +#ifdef HAVE_SERVER_SUPPORT +static inline ssize_t +lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +#endif +static inline ssize_t +lprocfs_ping_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline ssize_t +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline ssize_t +lprocfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline int +lprocfs_pinger_recov_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline ssize_t +lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } + +/* Statfs helpers */ +static inline +int lprocfs_blksize_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_kbytestotal_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_kbytesfree_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_kbytesavail_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_filestotal_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_filesfree_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_clear(struct obd_histogram *oh) +{ return; } +static inline +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ return 0; } +static inline +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ return; } +static inline +u64 
lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field) +{ return (__u64)0; } + +#define LPROC_SEQ_FOPS_RO(name) +#define LPROC_SEQ_FOPS(name) +#define LPROC_SEQ_FOPS_RO_TYPE(name, type) +#define LPROC_SEQ_FOPS_RW_TYPE(name, type) +#define LPROC_SEQ_FOPS_WR_ONLY(name, type) + +/* lprocfs_jobstats.c */ +static inline +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, int event, + long amount) +{ return 0; } +static inline +void lprocfs_job_stats_fini(struct obd_device *obd) +{ return; } +static inline +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback fn) +{ return 0; } + + +/* lproc_ptlrpc.c */ +#define target_print_req NULL + +#endif /* CONFIG_PROC_FS */ + +#endif /* LPROCFS_STATUS_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lu_object.h b/drivers/staging/lustrefsx/lustre/include/lu_object.h new file mode 100644 index 0000000000000..9ceabafb2636f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lu_object.h @@ -0,0 +1,1760 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LUSTRE_LU_OBJECT_H +#define __LUSTRE_LU_OBJECT_H + +#ifdef HAVE_LINUX_STDARG_HEADER +#include +#else +#include +#endif +#include +#include +#include +#include +#include +#include + +struct seq_file; +struct proc_dir_entry; +struct lustre_cfg; +struct lprocfs_stats; +struct obd_type; + +/** \defgroup lu lu + * lu_* data-types represent server-side entities shared by data and meta-data + * stacks. + * + * Design goals: + * + * -# support for layering. + * + * Server side object is split into layers, one per device in the + * corresponding device stack. Individual layer is represented by struct + * lu_object. Compound layered object --- by struct lu_object_header. Most + * interface functions take lu_object as an argument and operate on the + * whole compound object. This decision was made due to the following + * reasons: + * + * - it's envisaged that lu_object will be used much more often than + * lu_object_header; + * + * - we want lower (non-top) layers to be able to initiate operations + * on the whole object. + * + * Generic code supports layering more complex than simple stacking, e.g., + * it is possible that at some layer object "spawns" multiple sub-objects + * on the lower layer. + * + * -# fid-based identification. + * + * Compound object is uniquely identified by its fid. Objects are indexed + * by their fids (hash table is used for index). 
+ * + * -# caching and life-cycle management. + * + * Object's life-time is controlled by reference counting. When reference + * count drops to 0, object is returned to cache. Cached objects still + * retain their identity (i.e., fid), and can be recovered from cache. + * + * Objects are kept in the global LRU list, and lu_site_purge() function + * can be used to reclaim given number of unused objects from the tail of + * the LRU. + * + * -# avoiding recursion. + * + * Generic code tries to replace recursion through layers by iterations + * where possible. Additionally to the end of reducing stack consumption, + * data, when practically possible, are allocated through lu_context_key + * interface rather than on stack. + * @{ + */ + +struct lu_site; +struct lu_object; +struct lu_device; +struct lu_object_header; +struct lu_context; +struct lu_env; +struct lu_name; + +/** + * Operations common for data and meta-data devices. + */ +struct lu_device_operations { + /** + * Allocate object for the given device (without lower-layer + * parts). This is called by lu_object_operations::loo_object_init() + * from the parent layer, and should setup at least lu_object::lo_dev + * and lu_object::lo_ops fields of resulting lu_object. + * + * Object creation protocol. + * + * Due to design goal of avoiding recursion, object creation (see + * lu_object_alloc()) is somewhat involved: + * + * - first, lu_device_operations::ldo_object_alloc() method of the + * top-level device in the stack is called. It should allocate top + * level object (including lu_object_header), but without any + * lower-layer sub-object(s). + * + * - then lu_object_alloc() sets fid in the header of newly created + * object. + * + * - then lu_object_operations::loo_object_init() is called. It has + * to allocate lower-layer object(s). To do this, + * lu_object_operations::loo_object_init() calls ldo_object_alloc() + * of the lower-layer device(s). + * + * - for all new objects allocated by + * lu_object_operations::loo_object_init() (and inserted into object + * stack), lu_object_operations::loo_object_init() is called again + * repeatedly, until no new objects are created. + * + * \post ergo(!IS_ERR(result), result->lo_dev == d && + * result->lo_ops != NULL); + */ + struct lu_object *(*ldo_object_alloc)(const struct lu_env *env, + const struct lu_object_header *h, + struct lu_device *d); + /** + * process config specific for device. + */ + int (*ldo_process_config)(const struct lu_env *env, + struct lu_device *, struct lustre_cfg *); + int (*ldo_recovery_complete)(const struct lu_env *, + struct lu_device *); + + /** + * initialize local objects for device. this method called after layer has + * been initialized (after LCFG_SETUP stage) and before it starts serving + * user requests. + */ + + int (*ldo_prepare)(const struct lu_env *, + struct lu_device *parent, + struct lu_device *dev); + + + /** + * Allocate new FID for file with @name under @parent + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[out] fid new FID allocated + * \param[in] parent parent object + * \param[in] name lu_name + * + * \retval 0 0 FID allocated successfully. + * \retval 1 1 FID allocated successfully and new sequence + * requested from seq meta server + * \retval negative negative errno if FID allocation failed. 
+ */ + int (*ldo_fid_alloc)(const struct lu_env *env, + struct lu_device *dev, + struct lu_fid *fid, + struct lu_object *parent, + const struct lu_name *name); +}; + +/** + * For lu_object_conf flags + */ +typedef enum { + /* This is a new object to be allocated, or the file + * corresponding to the object does not exists. */ + LOC_F_NEW = 0x00000001, +} loc_flags_t; + +/** + * Object configuration, describing particulars of object being created. On + * server this is not used, as server objects are full identified by fid. On + * client configuration contains struct lustre_md. + */ +struct lu_object_conf { + /** + * Some hints for obj find and alloc. + */ + loc_flags_t loc_flags; +}; + +/** + * Type of "printer" function used by lu_object_operations::loo_object_print() + * method. + * + * Printer function is needed to provide some flexibility in (semi-)debugging + * output: possible implementations: printk, CDEBUG, sysfs/seq_file + */ +typedef int (*lu_printer_t)(const struct lu_env *env, + void *cookie, const char *format, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Operations specific for particular lu_object. + */ +struct lu_object_operations { + + /** + * Allocate lower-layer parts of the object by calling + * lu_device_operations::ldo_object_alloc() of the corresponding + * underlying device. + * + * This method is called once for each object inserted into object + * stack. It's responsibility of this method to insert lower-layer + * object(s) it create into appropriate places of object stack. + */ + int (*loo_object_init)(const struct lu_env *env, + struct lu_object *o, + const struct lu_object_conf *conf); + /** + * Called (in top-to-bottom order) during object allocation after all + * layers were allocated and initialized. Can be used to perform + * initialization depending on lower layers. + */ + int (*loo_object_start)(const struct lu_env *env, + struct lu_object *o); + /** + * Called before lu_object_operations::loo_object_free() to signal + * that object is being destroyed. Dual to + * lu_object_operations::loo_object_init(). + */ + void (*loo_object_delete)(const struct lu_env *env, + struct lu_object *o); + /** + * Dual to lu_device_operations::ldo_object_alloc(). Called when + * object is removed from memory. Must use call_rcu or kfree_rcu + * if the object contains an lu_object_header. + */ + void (*loo_object_free)(const struct lu_env *env, + struct lu_object *o); + /** + * Called when last active reference to the object is released (and + * object returns to the cache). This method is optional. + */ + void (*loo_object_release)(const struct lu_env *env, + struct lu_object *o); + /** + * Optional debugging helper. Print given object. + */ + int (*loo_object_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o); + /** + * Optional debugging method. Returns true iff method is internally + * consistent. + */ + int (*loo_object_invariant)(const struct lu_object *o); +}; + +/** + * Type of lu_device. + */ +struct lu_device_type; + +/** + * Device: a layer in the server side abstraction stacking. + */ +struct lu_device { + /** + * reference count. This is incremented, in particular, on each object + * created at this layer. + * + * \todo XXX which means that atomic_t is probably too small. + */ + atomic_t ld_ref; + /** + * Pointer to device type. Never modified once set. + */ + struct lu_device_type *ld_type; + /** + * Operation vector for this device. 
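/*
 * Illustrative sketch of the creation protocol described above: a middle
 * layer's ->loo_object_init() asks the next lower device to allocate its
 * slice for the same header and links it into the stack.  The "bar" names
 * and bar_next_device() are hypothetical, not part of this header.
 */
static int bar_object_init(const struct lu_env *env, struct lu_object *o,
			   const struct lu_object_conf *conf)
{
	struct lu_device *next = bar_next_device(o->lo_dev); /* hypothetical */
	struct lu_object *below;

	/* ask the lower device to allocate its slice for the same header */
	below = next->ld_ops->ldo_object_alloc(env, o->lo_header, next);
	if (IS_ERR_OR_NULL(below))
		return below == NULL ? -ENOMEM : PTR_ERR(below);

	/* lu_object_alloc() will call below->lo_ops->loo_object_init() on a
	 * further iteration, until no new slices are created. */
	lu_object_add(o, below);
	return 0;
}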
+ */ + const struct lu_device_operations *ld_ops; + /** + * Stack this device belongs to. + */ + struct lu_site *ld_site; + struct proc_dir_entry *ld_proc_entry; + + /** \todo XXX: temporary back pointer into obd. */ + struct obd_device *ld_obd; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref ld_reference; + /** + * Link the device to the site. + **/ + struct list_head ld_linkage; +}; + +struct lu_device_type_operations; + +/** + * Tag bits for device type. They are used to distinguish certain groups of + * device types. + */ +enum lu_device_tag { + /** this is meta-data device */ + LU_DEVICE_MD = BIT(0), + /** this is data device */ + LU_DEVICE_DT = BIT(1), + /** data device in the client stack */ + LU_DEVICE_CL = BIT(2) +}; + +/** + * Type of device. + */ +struct lu_device_type { + /** + * Tag bits. Taken from enum lu_device_tag. Never modified once set. + */ + __u32 ldt_tags; + /** + * Name of this class. Unique system-wide. Never modified once set. + */ + char *ldt_name; + /** + * Operations for this type. + */ + const struct lu_device_type_operations *ldt_ops; + /** + * \todo XXX: temporary: context tags used by obd_*() calls. + */ + __u32 ldt_ctx_tags; + /** + * Number of existing device type instances. + */ + atomic_t ldt_device_nr; +}; + +/** + * Operations on a device type. + */ +struct lu_device_type_operations { + /** + * Allocate new device. + */ + struct lu_device *(*ldto_device_alloc)(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *lcfg); + /** + * Free device. Dual to + * lu_device_type_operations::ldto_device_alloc(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_free)(const struct lu_env *, + struct lu_device *); + + /** + * Initialize the devices after allocation + */ + int (*ldto_device_init)(const struct lu_env *env, + struct lu_device *, const char *, + struct lu_device *); + /** + * Finalize device. Dual to + * lu_device_type_operations::ldto_device_init(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_fini)(const struct lu_env *env, + struct lu_device *); + /** + * Initialize device type. This is called on module load. + */ + int (*ldto_init)(struct lu_device_type *t); + /** + * Finalize device type. Dual to + * lu_device_type_operations::ldto_init(). Called on module unload. + */ + void (*ldto_fini)(struct lu_device_type *t); + /** + * Called when the first device is created. + */ + void (*ldto_start)(struct lu_device_type *t); + /** + * Called when number of devices drops to 0. + */ + void (*ldto_stop)(struct lu_device_type *t); +}; + +static inline int lu_device_is_md(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD); +} + +/** + * Common object attributes. 
+ */ +struct lu_attr { + /** + * valid bits + * + * \see enum la_valid + */ + __u64 la_valid; + /** size in bytes */ + __u64 la_size; + /** modification time in seconds since Epoch */ + s64 la_mtime; + /** access time in seconds since Epoch */ + s64 la_atime; + /** change time in seconds since Epoch */ + s64 la_ctime; + /** create time in seconds since Epoch */ + s64 la_btime; + /** 512-byte blocks allocated to object */ + __u64 la_blocks; + /** permission bits and file type */ + __u32 la_mode; + /** owner id */ + __u32 la_uid; + /** group id */ + __u32 la_gid; + /** object flags */ + __u32 la_flags; + /** number of persistent references to this object */ + __u32 la_nlink; + /** blk bits of the object*/ + __u32 la_blkbits; + /** blk size of the object*/ + __u32 la_blksize; + /** real device */ + __u32 la_rdev; + /** project id */ + __u32 la_projid; + /** set layout version to OST objects. */ + __u32 la_layout_version; + /** dirent count */ + __u64 la_dirent_count; +}; + +#define LU_DIRENT_COUNT_UNSET ~0ULL + +/** + * Layer in the layered object. + */ +struct lu_object { + /** + * Header for this object. + */ + struct lu_object_header *lo_header; + /** + * Device for this layer. + */ + struct lu_device *lo_dev; + /** + * Operations for this object. + */ + const struct lu_object_operations *lo_ops; + /** + * Linkage into list of all layers. + */ + struct list_head lo_linkage; + /** + * Link to the device, for debugging. + */ + struct lu_ref_link lo_dev_ref; +}; + +enum lu_object_header_flags { + /** + * Don't keep this object in cache. Object will be destroyed as soon + * as last reference to it is released. This flag cannot be cleared + * once set. + */ + LU_OBJECT_HEARD_BANSHEE = 0, + /** + * Mark this object has already been taken out of cache. + */ + LU_OBJECT_UNHASHED = 1, + /** + * Object is initialized, when object is found in cache, it may not be + * intialized yet, the object allocator will initialize it. + */ + LU_OBJECT_INITED = 2, +}; + +enum lu_object_header_attr { + LOHA_EXISTS = BIT(0), + LOHA_REMOTE = BIT(1), + LOHA_HAS_AGENT_ENTRY = BIT(2), + /** + * UNIX file type is stored in S_IFMT bits. + */ + LOHA_FT_START = 001 << 12, /**< S_IFIFO */ + LOHA_FT_END = 017 << 12, /**< S_IFMT */ +}; + +/** + * "Compound" object, consisting of multiple layers. + * + * Compound object with given fid is unique with given lu_site. + * + * Note, that object does *not* necessary correspond to the real object in the + * persistent storage: object is an anchor for locking and method calling, so + * it is created for things like not-yet-existing child created by mkdir or + * create calls. lu_object_operations::loo_exists() can be used to check + * whether object is backed by persistent storage entity. + * Any object containing this structre which might be placed in an + * rhashtable via loh_hash MUST be freed using call_rcu() or rcu_kfree(). + */ +struct lu_object_header { + /** + * Fid, uniquely identifying this object. + */ + struct lu_fid loh_fid; + /** + * Object flags from enum lu_object_header_flags. Set and checked + * atomically. + */ + unsigned long loh_flags; + /** + * Object reference count. Protected by lu_site::ls_guard. + */ + atomic_t loh_ref; + /** + * Common object attributes, cached for efficiency. From enum + * lu_object_header_attr. + */ + __u32 loh_attr; + /** + * Linkage into per-site hash table. + */ + struct rhash_head loh_hash; + /** + * Linkage into per-site LRU list. Protected by lu_site::ls_guard. + */ + struct list_head loh_lru; + /** + * Linkage into list of layers. 
Never modified once set (except lately + * during object destruction). No locking is necessary. + */ + struct list_head loh_layers; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref loh_reference; + /* + * Handle used for kfree_rcu() or similar. + */ + struct rcu_head loh_rcu; +}; + +struct fld; + +enum { + LU_SS_CREATED = 0, + LU_SS_CACHE_HIT, + LU_SS_CACHE_MISS, + LU_SS_CACHE_RACE, + LU_SS_CACHE_DEATH_RACE, + LU_SS_LRU_PURGED, + LU_SS_LAST_STAT +}; + +/** + * lu_site is a "compartment" within which objects are unique, and LRU + * discipline is maintained. + * + * lu_site exists so that multiple layered stacks can co-exist in the same + * address space. + * + * lu_site has the same relation to lu_device as lu_object_header to + * lu_object. + */ +struct lu_site { + /** + * objects hash table + */ + struct rhashtable ls_obj_hash; + /* + * buckets for summary data + */ + struct lu_site_bkt_data *ls_bkts; + int ls_bkt_cnt; + u32 ls_bkt_seed; + /** + * index of bucket on hash table while purging + */ + unsigned int ls_purge_start; + /** + * Top-level device for this stack. + */ + struct lu_device *ls_top_dev; + /** + * Bottom-level device for this stack + */ + struct lu_device *ls_bottom_dev; + /** + * Linkage into global list of sites. + */ + struct list_head ls_linkage; + /** + * List for lu device for this site, protected + * by ls_ld_lock. + **/ + struct list_head ls_ld_linkage; + spinlock_t ls_ld_lock; + /** + * Lock to serialize site purge. + */ + struct mutex ls_purge_mutex; + /** + * lu_site stats + */ + struct lprocfs_stats *ls_stats; + /** + * XXX: a hack! fld has to find md_site via site, remove when possible + */ + struct seq_server_site *ld_seq_site; + /** + * Pointer to the lu_target for this site. + */ + struct lu_target *ls_tgt; + + /** + * Number of objects in lsb_lru_lists - used for shrinking + */ + struct percpu_counter ls_lru_len_counter; +}; + +wait_queue_head_t * +lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid); + +static inline struct seq_server_site *lu_site2seq(const struct lu_site *s) +{ + return s->ld_seq_site; +} + +/** \name ctors + * Constructors/destructors. + * @{ + */ + +int lu_site_init (struct lu_site *s, struct lu_device *d); +void lu_site_fini (struct lu_site *s); +int lu_site_init_finish (struct lu_site *s); +void lu_stack_fini (const struct lu_env *env, struct lu_device *top); +void lu_device_get (struct lu_device *d); +void lu_device_put (struct lu_device *d); +int lu_device_init (struct lu_device *d, struct lu_device_type *t); +void lu_device_fini (struct lu_device *d); +int lu_object_header_init(struct lu_object_header *h); +void lu_object_header_fini(struct lu_object_header *h); +void lu_object_header_free(struct lu_object_header *h); +int lu_object_init (struct lu_object *o, + struct lu_object_header *h, struct lu_device *d); +void lu_object_fini (struct lu_object *o); +void lu_object_add_top (struct lu_object_header *h, struct lu_object *o); +void lu_object_add (struct lu_object *before, struct lu_object *o); +struct lu_object *lu_object_get_first(struct lu_object_header *h, + struct lu_device *dev); +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d); +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d); + +/** + * Helpers to initialize and finalize device types. + */ + +int lu_device_type_init(struct lu_device_type *ldt); +void lu_device_type_fini(struct lu_device_type *ldt); + +/** @} ctors */ + +/** \name caching + * Caching and reference counting. 
+ * @{ + */ + +/** + * Acquire additional reference to the given object. This function is used to + * attain additional reference. To acquire initial reference use + * lu_object_find(). + */ +static inline void lu_object_get(struct lu_object *o) +{ + LASSERT(atomic_read(&o->lo_header->loh_ref) > 0); + atomic_inc(&o->lo_header->loh_ref); +} + +/** + * Return true if object will not be cached after last reference to it is + * released. + */ +static inline int lu_object_is_dying(const struct lu_object_header *h) +{ + return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags); +} + +/** + * Return true if object is initialized. + */ +static inline int lu_object_is_inited(const struct lu_object_header *h) +{ + return test_bit(LU_OBJECT_INITED, &h->loh_flags); +} + +void lu_object_put(const struct lu_env *env, struct lu_object *o); +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o); +void lu_object_unhash(const struct lu_env *env, struct lu_object *o); +int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, int nr, + int canblock); + +static inline int lu_site_purge(const struct lu_env *env, struct lu_site *s, + int nr) +{ + return lu_site_purge_objects(env, s, nr, 1); +} + +void lu_site_print(const struct lu_env *env, struct lu_site *s, atomic_t *ref, + int msg_flags, lu_printer_t printer); +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +/** @} caching */ + +/** \name helpers + * Helpers. + * @{ + */ + +/** + * First (topmost) sub-object of given compound object + */ +static inline struct lu_object *lu_object_top(struct lu_object_header *h) +{ + LASSERT(!list_empty(&h->loh_layers)); + return container_of(h->loh_layers.next, struct lu_object, lo_linkage); +} + +/** + * Next sub-object in the layering + */ +static inline struct lu_object *lu_object_next(const struct lu_object *o) +{ + return container_of(o->lo_linkage.next, struct lu_object, lo_linkage); +} + +/** + * Pointer to the fid of this object. + */ +static inline const struct lu_fid *lu_object_fid(const struct lu_object *o) +{ + return &o->lo_header->loh_fid; +} + +/** + * return device operations vector for this object + */ +static const inline struct lu_device_operations * +lu_object_ops(const struct lu_object *o) +{ + return o->lo_dev->ld_ops; +} + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype); + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...); + +/** + * Print object description followed by a user-supplied message. + */ +#define LU_OBJECT_DEBUG(mask, env, object, format, ...) \ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + lu_object_print(env, &msgdata, lu_cdebug_printer, object);\ + CDEBUG(mask, format "\n", ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Print short object description followed by a user-supplied message. 
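/*
 * Illustrative sketch: the usual find/use/release cycle against the site
 * cache, using the interfaces declared above.  "example_lookup" is
 * hypothetical; lu_object_exists() is defined a little further below.
 */
static int example_lookup(const struct lu_env *env, struct lu_device *top,
			  const struct lu_fid *fid)
{
	struct lu_object *o;

	o = lu_object_find(env, top, fid, NULL);
	if (IS_ERR(o))
		return PTR_ERR(o);

	if (!lu_object_exists(o)) {
		/* anchor object only, nothing backing it on storage yet */
		lu_object_put(env, o);
		return -ENOENT;
	}

	/* ... operate on the compound object ... */

	lu_object_put(env, o);
	return 0;
}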
+ */ +#define LU_OBJECT_HEADER(mask, env, object, format, ...) \ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + lu_object_header_print(env, &msgdata, lu_cdebug_printer,\ + (object)->lo_header); \ + lu_cdebug_printer(env, &msgdata, "\n"); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +void lu_object_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o); +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr); + +/** + * Check object consistency. + */ +int lu_object_invariant(const struct lu_object *o); + + +/** + * Check whether object exists, no matter on local or remote storage. + * Note: LOHA_EXISTS will be set once some one created the object, + * and it does not needs to be committed to storage. + */ +#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS) + +/** + * Check whether object on the remote storage. + */ +#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE) + +/** + * Check whether the object as agent entry on current target + */ +#define lu_object_has_agent_entry(o) \ + unlikely((o)->lo_header->loh_attr & LOHA_HAS_AGENT_ENTRY) + +static inline void lu_object_set_agent_entry(struct lu_object *o) +{ + o->lo_header->loh_attr |= LOHA_HAS_AGENT_ENTRY; +} + +static inline void lu_object_clear_agent_entry(struct lu_object *o) +{ + o->lo_header->loh_attr &= ~LOHA_HAS_AGENT_ENTRY; +} + +static inline int lu_object_assert_exists(const struct lu_object *o) +{ + return lu_object_exists(o); +} + +static inline int lu_object_assert_not_exists(const struct lu_object *o) +{ + return !lu_object_exists(o); +} + +/** + * Attr of this object. 
+ */ +static inline __u32 lu_object_attr(const struct lu_object *o) +{ + LASSERT(lu_object_exists(o) != 0); + + return o->lo_header->loh_attr & S_IFMT; +} + +static inline void lu_object_ref_add_atomic(struct lu_object *o, + const char *scope, + const void *source) +{ + lu_ref_add_atomic(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_add(struct lu_object *o, + const char *scope, + const void *source) +{ + lu_ref_add(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_add_at(struct lu_object *o, + struct lu_ref_link *link, + const char *scope, + const void *source) +{ + lu_ref_add_at(&o->lo_header->loh_reference, link, scope, source); +} + +static inline void lu_object_ref_del(struct lu_object *o, + const char *scope, const void *source) +{ + lu_ref_del(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_del_at(struct lu_object *o, + struct lu_ref_link *link, + const char *scope, const void *source) +{ + lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source); +} + +/** input params, should be filled out by mdt */ +struct lu_rdpg { + /** hash */ + __u64 rp_hash; + /** count in bytes */ + unsigned int rp_count; + /** number of pages */ + unsigned int rp_npages; + /** requested attr */ + __u32 rp_attrs; + /** pointers to pages */ + struct page **rp_pages; +}; + +enum lu_xattr_flags { + LU_XATTR_REPLACE = BIT(0), + LU_XATTR_CREATE = BIT(1), + LU_XATTR_MERGE = BIT(2), + LU_XATTR_SPLIT = BIT(3), + LU_XATTR_PURGE = BIT(4), +}; + +/** @} helpers */ + +/** \name lu_context + * @{ */ + +/** For lu_context health-checks */ +enum lu_context_state { + LCS_INITIALIZED = 1, + LCS_ENTERED, + LCS_LEAVING, + LCS_LEFT, + LCS_FINALIZED +}; + +/** + * lu_context. Execution context for lu_object methods. Currently associated + * with thread. + * + * All lu_object methods, except device and device type methods (called during + * system initialization and shutdown) are executed "within" some + * lu_context. This means, that pointer to some "current" lu_context is passed + * as an argument to all methods. + * + * All service ptlrpc threads create lu_context as part of their + * initialization. It is possible to create "stand-alone" context for other + * execution environments (like system calls). + * + * lu_object methods mainly use lu_context through lu_context_key interface + * that allows each layer to associate arbitrary pieces of data with each + * context (see pthread_key_create(3) for similar interface). + * + * On a client, lu_context is bound to a thread, see cl_env_get(). + * + * \see lu_context_key + */ +struct lu_context { + /** + * lu_context is used on the client side too. Yet we don't want to + * allocate values of server-side keys for the client contexts and + * vice versa. + * + * To achieve this, set of tags in introduced. Contexts and keys are + * marked with tags. Key value are created only for context whose set + * of tags has non-empty intersection with one for key. Tags are taken + * from enum lu_context_tag. + */ + __u32 lc_tags; + enum lu_context_state lc_state; + /** + * Pointer to the home service thread. NULL for other execution + * contexts. + */ + struct ptlrpc_thread *lc_thread; + /** + * Pointer to an array with key values. Internal implementation + * detail. + */ + void **lc_value; + /** + * Linkage into a list of all remembered contexts. Only + * `non-transient' contexts, i.e., ones created for service threads + * are placed here. 
+ */ + struct list_head lc_remember; + /** + * Version counter used to skip calls to lu_context_refill() when no + * keys were registered. + */ + unsigned lc_version; + /** + * Debugging cookie. + */ + unsigned lc_cookie; +}; + +/** + * lu_context_key interface. Similar to pthread_key. + */ + +enum lu_context_tag { + /** + * Thread on md server + */ + LCT_MD_THREAD = BIT(0), + /** + * Thread on dt server + */ + LCT_DT_THREAD = BIT(1), + /** + * Thread on client + */ + LCT_CL_THREAD = BIT(3), + /** + * A per-request session on a server, and a per-system-call session on + * a client. + */ + LCT_SESSION = BIT(4), + /** + * A per-request data on OSP device + */ + LCT_OSP_THREAD = BIT(5), + /** + * MGS device thread + */ + LCT_MG_THREAD = BIT(6), + /** + * Context for local operations + */ + LCT_LOCAL = BIT(7), + /** + * session for server thread + **/ + LCT_SERVER_SESSION = BIT(8), + /** + * Set when at least one of keys, having values in this context has + * non-NULL lu_context_key::lct_exit() method. This is used to + * optimize lu_context_exit() call. + */ + LCT_HAS_EXIT = BIT(28), + /** + * Don't add references for modules creating key values in that context. + * This is only for contexts used internally by lu_object framework. + */ + LCT_NOREF = BIT(29), + /** + * Key is being prepared for retiring, don't create new values for it. + */ + LCT_QUIESCENT = BIT(30), + /** + * Context should be remembered. + */ + LCT_REMEMBER = BIT(31), + /** + * Contexts usable in cache shrinker thread. + */ + LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF, +}; + +/** + * Key. Represents per-context value slot. + * + * Keys are usually registered when module owning the key is initialized, and + * de-registered when module is unloaded. Once key is registered, all new + * contexts with matching tags, will get key value. "Old" contexts, already + * initialized at the time of key registration, can be forced to get key value + * by calling lu_context_refill(). + * + * Every key value is counted in lu_context_key::lct_used and acquires a + * reference on an owning module. This means, that all key values have to be + * destroyed before module can be unloaded. This is usually achieved by + * stopping threads started by the module, that created contexts in their + * entry functions. Situation is complicated by the threads shared by multiple + * modules, like ptlrpcd daemon on a client. To work around this problem, + * contexts, created in such threads, are `remembered' (see + * LCT_REMEMBER)---i.e., added into a global list. When module is preparing + * for unloading it does the following: + * + * - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT) + * preventing new key values from being allocated in the new contexts, + * and + * + * - scans a list of remembered contexts, destroying values of module + * keys, thus releasing references to the module. + * + * This is done by lu_context_key_quiesce(). If module is re-activated + * before key has been de-registered, lu_context_key_revive() call clears + * `quiescent' marker. + * + * lu_context code doesn't provide any internal synchronization for these + * activities---it's assumed that startup (including threads start-up) and + * shutdown are serialized by some external means. + * + * \see lu_context + */ +struct lu_context_key { + /** + * Set of tags for which values of this key are to be instantiated. + */ + __u32 lct_tags; + /** + * Value constructor. This is called when new value is created for a + * context. 
Returns pointer to new value of error pointer. + */ + void *(*lct_init)(const struct lu_context *ctx, + struct lu_context_key *key); + /** + * Value destructor. Called when context with previously allocated + * value of this slot is destroyed. \a data is a value that was returned + * by a matching call to lu_context_key::lct_init(). + */ + void (*lct_fini)(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + /** + * Optional method called on lu_context_exit() for all allocated + * keys. Can be used by debugging code checking that locks are + * released, etc. + */ + void (*lct_exit)(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + /** + * Internal implementation detail: index within lu_context::lc_value[] + * reserved for this key. + */ + int lct_index; + /** + * Internal implementation detail: number of values created for this + * key. + */ + atomic_t lct_used; + /** + * Internal implementation detail: module for this key. + */ + struct module *lct_owner; + /** + * References to this key. For debugging. + */ + struct lu_ref lct_reference; +}; + +#define LU_KEY_INIT(mod, type) \ + static void *mod##_key_init(const struct lu_context *ctx, \ + struct lu_context_key *key) \ + { \ + type *value; \ + \ + BUILD_BUG_ON(PAGE_SIZE < sizeof(*value)); \ + \ + OBD_ALLOC_PTR(value); \ + if (value == NULL) \ + value = ERR_PTR(-ENOMEM); \ + \ + return value; \ + } \ + struct __##mod##__dummy_init { ; } /* semicolon catcher */ + +#define LU_KEY_FINI(mod, type) \ + static void mod##_key_fini(const struct lu_context *ctx, \ + struct lu_context_key *key, void* data) \ + { \ + type *info = data; \ + \ + OBD_FREE_PTR(info); \ + } \ + struct __##mod##__dummy_fini {;} /* semicolon catcher */ + +#define LU_KEY_INIT_FINI(mod, type) \ + LU_KEY_INIT(mod,type); \ + LU_KEY_FINI(mod,type) + +#define LU_CONTEXT_KEY_DEFINE(mod, tags) \ + struct lu_context_key mod##_thread_key = { \ + .lct_tags = tags, \ + .lct_init = mod##_key_init, \ + .lct_fini = mod##_key_fini \ + } + +#define LU_CONTEXT_KEY_INIT(key) \ +do { \ + (key)->lct_owner = THIS_MODULE; \ +} while (0) + +int lu_context_key_register(struct lu_context_key *key); +void lu_context_key_degister(struct lu_context_key *key); +void *lu_context_key_get (const struct lu_context *ctx, + const struct lu_context_key *key); +void lu_context_key_quiesce(struct lu_device_type *t, + struct lu_context_key *key); +void lu_context_key_revive(struct lu_context_key *key); + + +/* + * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an + * owning module. + */ + +#define LU_KEY_INIT_GENERIC(mod) \ + static void mod##_key_init_generic(struct lu_context_key *k, ...) \ + { \ + struct lu_context_key *key = k; \ + va_list args; \ + \ + va_start(args, k); \ + do { \ + LU_CONTEXT_KEY_INIT(key); \ + key = va_arg(args, struct lu_context_key *); \ + } while (key != NULL); \ + va_end(args); \ + } + +#define LU_TYPE_INIT(mod, ...) \ + LU_KEY_INIT_GENERIC(mod) \ + static int mod##_type_init(struct lu_device_type *t) \ + { \ + mod##_key_init_generic(__VA_ARGS__, NULL); \ + return lu_context_key_register_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_init {;} + +#define LU_TYPE_FINI(mod, ...) \ + static void mod##_type_fini(struct lu_device_type *t) \ + { \ + lu_context_key_degister_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_fini {;} + +#define LU_TYPE_START(mod, ...) 
\ + static void mod##_type_start(struct lu_device_type *t) \ + { \ + lu_context_key_revive_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_start {;} + +#define LU_TYPE_STOP(mod, ...) \ + static void mod##_type_stop(struct lu_device_type *t) \ + { \ + lu_context_key_quiesce_many(t, __VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_stop { } + + + +#define LU_TYPE_INIT_FINI(mod, ...) \ + LU_TYPE_INIT(mod, __VA_ARGS__); \ + LU_TYPE_FINI(mod, __VA_ARGS__); \ + LU_TYPE_START(mod, __VA_ARGS__); \ + LU_TYPE_STOP(mod, __VA_ARGS__) + +int lu_context_init (struct lu_context *ctx, __u32 tags); +void lu_context_fini (struct lu_context *ctx); +void lu_context_enter (struct lu_context *ctx); +void lu_context_exit (struct lu_context *ctx); +int lu_context_refill(struct lu_context *ctx); + +/* + * Helper functions to operate on multiple keys. These are used by the default + * device type operations, defined by LU_TYPE_INIT_FINI(). + */ + +int lu_context_key_register_many(struct lu_context_key *k, ...); +void lu_context_key_degister_many(struct lu_context_key *k, ...); +void lu_context_key_revive_many (struct lu_context_key *k, ...); +void lu_context_key_quiesce_many(struct lu_device_type *t, + struct lu_context_key *k, ...); + +/* + * update/clear ctx/ses tags. + */ +void lu_context_tags_update(__u32 tags); +void lu_context_tags_clear(__u32 tags); +void lu_session_tags_update(__u32 tags); +void lu_session_tags_clear(__u32 tags); + +/** + * Environment. + */ +struct lu_env { + /** + * "Local" context, used to store data instead of stack. + */ + struct lu_context le_ctx; + /** + * "Session" context for per-request data. + */ + struct lu_context *le_ses; +}; + +int lu_env_init (struct lu_env *env, __u32 tags); +void lu_env_fini (struct lu_env *env); +int lu_env_refill(struct lu_env *env); +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags); + +static inline void* lu_env_info(const struct lu_env *env, + const struct lu_context_key *key) +{ + void *info; + info = lu_context_key_get(&env->le_ctx, key); + if (!info) { + if (!lu_env_refill((struct lu_env *)env)) + info = lu_context_key_get(&env->le_ctx, key); + } + LASSERT(info); + return info; +} + +struct lu_env *lu_env_find(void); +int lu_env_add(struct lu_env *env); +int lu_env_add_task(struct lu_env *env, struct task_struct *task); +void lu_env_remove(struct lu_env *env); + +/** @} lu_context */ + +/** + * Output site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int lu_site_stats_seq_print(const struct lu_site *s, struct seq_file *m); + +/** + * Common name structure to be passed around for various name related methods. + */ +struct lu_name { + const char *ln_name; + int ln_namelen; +}; + +static inline bool name_is_dot_or_dotdot(const char *name, int namelen) +{ + return name[0] == '.' 
&& + (namelen == 1 || (namelen == 2 && name[1] == '.')); +} + +static inline bool lu_name_is_dot_or_dotdot(const struct lu_name *lname) +{ + return name_is_dot_or_dotdot(lname->ln_name, lname->ln_namelen); +} + +static inline bool lu_name_is_temp_file(const char *name, int namelen, + bool dot_prefix, int suffixlen) +{ + int lower = 0; + int upper = 0; + int digit = 0; + int len = suffixlen; + + if (dot_prefix && name[0] != '.') + return false; + + if (namelen < dot_prefix + suffixlen + 2 || + name[namelen - suffixlen - 1] != '.') + return false; + + while (len) { + lower += islower(name[namelen - len]); + upper += isupper(name[namelen - len]); + digit += isdigit(name[namelen - len]); + len--; + } + /* mktemp() filename suffixes will have a mix of upper- and lower-case + * letters and/or numbers, not all numbers, or all upper or lower-case. + * About 0.07% of randomly-generated names will slip through, + * but this avoids 99.93% of cross-MDT renames for those files. + */ + if ((digit >= suffixlen - 1 && !isdigit(name[namelen - suffixlen])) || + upper == suffixlen || lower == suffixlen) + return false; + + return true; +} + +static inline bool lu_name_is_backup_file(const char *name, int namelen, + int *suffixlen) +{ + if (namelen > 1 && + name[namelen - 2] != '.' && name[namelen - 1] == '~') { + if (suffixlen) + *suffixlen = 1; + return true; + } + + if (namelen > 4 && name[namelen - 4] == '.' && + (!strncasecmp(name + namelen - 3, "bak", 3) || + !strncasecmp(name + namelen - 3, "sav", 3))) { + if (suffixlen) + *suffixlen = 4; + return true; + } + + if (namelen > 5 && name[namelen - 5] == '.' && + !strncasecmp(name + namelen - 4, "orig", 4)) { + if (suffixlen) + *suffixlen = 5; + return true; + } + + return false; +} + +static inline bool lu_name_is_valid_len(const char *name, size_t name_len) +{ + return name != NULL && + name_len > 0 && + name_len < INT_MAX && + strlen(name) == name_len && + memchr(name, '/', name_len) == NULL; +} + +/** + * Validate names (path components) + * + * To be valid \a name must be non-empty, '\0' terminated of length \a + * name_len, and not contain '/'. The maximum length of a name (before + * say -ENAMETOOLONG will be returned) is really controlled by llite + * and the server. We only check for something insane coming from bad + * integer handling here. + */ +static inline bool lu_name_is_valid_2(const char *name, size_t name_len) +{ + return lu_name_is_valid_len(name, name_len) && name[name_len] == '\0'; +} + +static inline bool lu_name_is_valid(const struct lu_name *ln) +{ + return lu_name_is_valid_2(ln->ln_name, ln->ln_namelen); +} + +#define DNAME "%.*s" +#define PNAME(ln) \ + (lu_name_is_valid(ln) ? (ln)->ln_namelen : 0), \ + (lu_name_is_valid(ln) ? (ln)->ln_name : "") + +/** + * Common buffer structure to be passed around for various xattr_{s,g}et() + * methods. + */ +struct lu_buf { + void *lb_buf; + size_t lb_len; +}; + +#define DLUBUF "(%p %zu)" +#define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len + +/* read buffer params, should be filled out by out */ +struct lu_rdbuf { + /** number of buffers */ + unsigned int rb_nbufs; + /** pointers to buffers */ + struct lu_buf rb_bufs[]; +}; + +/** + * One-time initializers, called at obdclass module initialization, not + * exported. + */ + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void); + +/** + * Dual to lu_global_init(). 
+ */ +void lu_global_fini(void); + +struct lu_kmem_descr { + struct kmem_cache **ckd_cache; + const char *ckd_name; + const size_t ckd_size; +}; + +int lu_kmem_init(struct lu_kmem_descr *caches); +void lu_kmem_fini(struct lu_kmem_descr *caches); + +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid); +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf); + +/** null buffer */ +extern struct lu_buf LU_BUF_NULL; + +void lu_buf_free(struct lu_buf *buf); +void lu_buf_alloc(struct lu_buf *buf, size_t size); +void lu_buf_realloc(struct lu_buf *buf, size_t size); + +int lu_buf_check_and_grow(struct lu_buf *buf, size_t len); +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, size_t len); + +extern __u32 lu_context_tags_default; +extern __u32 lu_session_tags_default; + +static inline bool lu_device_is_cl(const struct lu_device *d) +{ + return d->ld_type->ldt_tags & LU_DEVICE_CL; +} + +static inline bool lu_object_is_cl(const struct lu_object *o) +{ + return lu_device_is_cl(o->lo_dev); +} + +/* Generic subset of tgts */ +struct lu_tgt_pool { + __u32 *op_array; /* array of index of + * lov_obd->lov_tgts + */ + unsigned int op_count; /* number of tgts in the array */ + unsigned int op_size; /* allocated size of op_array */ + struct rw_semaphore op_rw_sem; /* to protect lu_tgt_pool use */ +}; + +int lu_tgt_pool_init(struct lu_tgt_pool *op, unsigned int count); +int lu_tgt_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count); +int lu_tgt_pool_remove(struct lu_tgt_pool *op, __u32 idx); +void lu_tgt_pool_free(struct lu_tgt_pool *op); +int lu_tgt_check_index(int idx, struct lu_tgt_pool *osts); +int lu_tgt_pool_extend(struct lu_tgt_pool *op, unsigned int min_count); + +/* bitflags used in rr / qos allocation */ +enum lq_flag { + LQ_DIRTY = 0, /* recalc qos data */ + LQ_SAME_SPACE, /* the OSTs all have approx. 
+ * the same space avail */ + LQ_RESET, /* zero current penalties */ + LQ_SF_PROGRESS, /* statfs op in progress */ +}; + +#ifdef HAVE_SERVER_SUPPORT +/* round-robin QoS data for LOD/LMV */ +struct lu_qos_rr { + spinlock_t lqr_alloc; /* protect allocation index */ + atomic_t lqr_start_idx; /* start index of new inode */ + __u32 lqr_offset_idx;/* aliasing for start_idx */ + int lqr_start_count;/* reseed counter */ + struct lu_tgt_pool lqr_pool; /* round-robin optimized list */ + unsigned long lqr_flags; +}; + +static inline void lu_qos_rr_init(struct lu_qos_rr *lqr) +{ + spin_lock_init(&lqr->lqr_alloc); + set_bit(LQ_DIRTY, &lqr->lqr_flags); +} + +#endif /* HAVE_SERVER_SUPPORT */ + +/* QoS data per MDS/OSS */ +struct lu_svr_qos { + struct obd_uuid lsq_uuid; /* ptlrpc's c_remote_uuid */ + struct list_head lsq_svr_list; /* link to lq_svr_list */ + __u64 lsq_bavail; /* total bytes avail on svr */ + __u64 lsq_iavail; /* total inode avail on svr */ + __u64 lsq_penalty; /* current penalty */ + __u64 lsq_penalty_per_obj; /* penalty decrease + * every obj*/ + time64_t lsq_used; /* last used time, seconds */ + __u32 lsq_tgt_count; /* number of tgts on this svr */ + __u32 lsq_id; /* unique svr id */ +}; + +/* QoS data per MDT/OST */ +struct lu_tgt_qos { + struct lu_svr_qos *ltq_svr; /* svr info */ + __u64 ltq_penalty; /* current penalty */ + __u64 ltq_penalty_per_obj; /* penalty decrease + * every obj*/ + __u64 ltq_avail; /* bytes/inode avail */ + __u64 ltq_weight; /* net weighting */ + time64_t ltq_used; /* last used time, seconds */ + bool ltq_usable:1; /* usable for striping */ +}; + +/* target descriptor */ +#define LOV_QOS_DEF_THRESHOLD_RR_PCT 17 +#define LMV_QOS_DEF_THRESHOLD_RR_PCT 5 + +#define LOV_QOS_DEF_PRIO_FREE 90 +#define LMV_QOS_DEF_PRIO_FREE 90 + +struct lu_tgt_desc { + union { + struct dt_device *ltd_tgt; + struct obd_device *ltd_obd; + }; + struct obd_export *ltd_exp; + struct obd_uuid ltd_uuid; + __u32 ltd_index; + __u32 ltd_gen; + struct list_head ltd_kill; + struct task_struct *ltd_recovery_task; + struct mutex ltd_fid_mutex; + struct lu_tgt_qos ltd_qos; /* qos info per target */ + struct obd_statfs ltd_statfs; + time64_t ltd_statfs_age; + unsigned long ltd_active:1,/* is this target up for requests */ + ltd_activate:1,/* should target be activated */ + ltd_reap:1, /* should this target be deleted */ + ltd_got_update_log:1, /* Already got update log */ + ltd_connecting:1; /* target is connecting */ +}; + +/* number of pointers at 2nd level */ +#define TGT_PTRS_PER_BLOCK (PAGE_SIZE / sizeof(void *)) +/* number of pointers at 1st level - only need as many as max OST/MDT count */ +#define TGT_PTRS ((LOV_ALL_STRIPES + 1) / TGT_PTRS_PER_BLOCK) + +struct lu_tgt_desc_idx { + struct lu_tgt_desc *ldi_tgt[TGT_PTRS_PER_BLOCK]; +}; + + +/* QoS data for LOD/LMV */ +#define QOS_THRESHOLD_MAX 256 /* should be power of two */ +struct lu_qos { + struct list_head lq_svr_list; /* lu_svr_qos list */ + struct rw_semaphore lq_rw_sem; + __u32 lq_active_svr_count; + unsigned int lq_prio_free; /* priority for free space */ + unsigned int lq_threshold_rr;/* priority for rr */ +#ifdef HAVE_SERVER_SUPPORT + struct lu_qos_rr lq_rr; /* round robin qos data */ +#endif + unsigned long lq_flags; +#if 0 + unsigned long lq_dirty:1, /* recalc qos data */ + lq_same_space:1,/* the servers all have approx. 
+ * the same space avail */ + lq_reset:1; /* zero current penalties */ +#endif +}; + +struct lu_tgt_descs { + union { + struct lov_desc ltd_lov_desc; + struct lmv_desc ltd_lmv_desc; + }; + /* list of known TGTs */ + struct lu_tgt_desc_idx *ltd_tgt_idx[TGT_PTRS]; + /* Size of the lu_tgts array, granted to be a power of 2 */ + __u32 ltd_tgts_size; + /* bitmap of TGTs available */ + unsigned long *ltd_tgt_bitmap; + /* TGTs scheduled to be deleted */ + __u32 ltd_death_row; + /* Table refcount used for delayed deletion */ + int ltd_refcount; + /* mutex to serialize concurrent updates to the tgt table */ + struct mutex ltd_mutex; + /* read/write semaphore used for array relocation */ + struct rw_semaphore ltd_rw_sem; + /* QoS */ + struct lu_qos ltd_qos; + /* all tgts in a packed array */ + struct lu_tgt_pool ltd_tgt_pool; + /* true if tgt is MDT */ + bool ltd_is_mdt; +}; + +#define LTD_TGT(ltd, index) \ + (ltd)->ltd_tgt_idx[(index) / TGT_PTRS_PER_BLOCK]-> \ + ldi_tgt[(index) % TGT_PTRS_PER_BLOCK] + +u64 lu_prandom_u64_max(u64 ep_ro); +int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd); +void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt); + +int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt); +void lu_tgt_descs_fini(struct lu_tgt_descs *ltd); +int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); +void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); +int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd); +int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt, + __u64 *total_wt); + +/** + * Whether MDT inode and space usages are balanced. + */ +static inline bool ltd_qos_is_balanced(struct lu_tgt_descs *ltd) +{ + return !test_bit(LQ_DIRTY, <d->ltd_qos.lq_flags) && + test_bit(LQ_SAME_SPACE, <d->ltd_qos.lq_flags); +} + +/** + * Whether QoS data is up-to-date and QoS can be applied. + */ +static inline bool ltd_qos_is_usable(struct lu_tgt_descs *ltd) +{ + if (ltd_qos_is_balanced(ltd)) + return false; + + if (ltd->ltd_lov_desc.ld_active_tgt_count < 2) + return false; + + return true; +} + +static inline struct lu_tgt_desc *ltd_first_tgt(struct lu_tgt_descs *ltd) +{ + int index; + + index = find_first_bit(ltd->ltd_tgt_bitmap, + ltd->ltd_tgts_size); + return (index < ltd->ltd_tgts_size) ? LTD_TGT(ltd, index) : NULL; +} + +static inline struct lu_tgt_desc *ltd_next_tgt(struct lu_tgt_descs *ltd, + struct lu_tgt_desc *tgt) +{ + int index; + + if (!tgt) + return NULL; + + index = tgt->ltd_index; + LASSERT(index < ltd->ltd_tgts_size); + index = find_next_bit(ltd->ltd_tgt_bitmap, + ltd->ltd_tgts_size, index + 1); + return (index < ltd->ltd_tgts_size) ? LTD_TGT(ltd, index) : NULL; +} + +#define ltd_foreach_tgt(ltd, tgt) \ + for (tgt = ltd_first_tgt(ltd); tgt; tgt = ltd_next_tgt(ltd, tgt)) + +#define ltd_foreach_tgt_safe(ltd, tgt, tmp) \ + for (tgt = ltd_first_tgt(ltd), tmp = ltd_next_tgt(ltd, tgt); tgt; \ + tgt = tmp, tmp = ltd_next_tgt(ltd, tgt)) + +/** @} lu */ +#endif /* __LUSTRE_LU_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lu_ref.h b/drivers/staging/lustrefsx/lustre/include/lu_ref.h new file mode 100644 index 0000000000000..7b368c297ff13 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lu_ref.h @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. 
+ * + * Author: Nikita Danilov + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef __LUSTRE_LU_REF_H +#define __LUSTRE_LU_REF_H + +#include + +/** \defgroup lu_ref lu_ref + * + * An interface to track references between objects. Mostly for debugging. + * + * Suppose there is a reference counted data-structure struct foo. To track + * who acquired references to instance of struct foo, add lu_ref field to it: + * + * \code + * struct foo { + * atomic_t foo_refcount; + * struct lu_ref foo_reference; + * ... + * }; + * \endcode + * + * foo::foo_reference has to be initialized by calling + * lu_ref_init(). Typically there will be functions or macros to increment and + * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo) + * and foo_put(struct foo *foo), respectively. + * + * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add() + * has to be called to insert into foo::foo_reference a record, describing + * acquired reference. Dually, lu_ref_del() removes matching record. Typical + * usages are: + * + * \code + * struct bar *bar; + * + * // bar owns a reference to foo. + * bar->bar_foo = foo_get(foo); + * lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del(&foo->foo_reference, "bar", bar); + * foo_put(bar->bar_foo); + * + * + * // current thread acquired a temporary reference to foo. + * foo_get(foo); + * lu_ref_add(&foo->reference, __func__, current); + * + * ... + * + * // temporary reference is released. + * lu_ref_del(&foo->reference, __func__, current); + * foo_put(foo); + * \endcode + * + * \e Et \e cetera. Often it makes sense to include lu_ref_add() and + * lu_ref_del() calls into foo_get() and foo_put(). When an instance of struct + * foo is destroyed, lu_ref_fini() has to be called that checks that no + * pending references remain. lu_ref_print() can be used to dump a list of + * pending references, while hunting down a leak. + * + * For objects to which a large number of references can be acquired, + * lu_ref_del() can become cpu consuming, as it has to scan the list of + * references. To work around this, remember result of lu_ref_add() (usually + * in the same place where pointer to struct foo is stored), and use + * lu_ref_del_at(): + * + * \code + * // There is a large number of bar's for a single foo. + * bar->bar_foo = foo_get(foo); + * bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar); + * foo_put(bar->bar_foo); + * \endcode + * + * lu_ref interface degrades gracefully in case of memory shortages. + * + * @{ + */ + +#ifdef CONFIG_LUSTRE_DEBUG_LU_REF + +/** + * Data-structure to keep track of references to a given object. This is used + * for debugging. + * + * lu_ref is embedded into an object which other entities (objects, threads, + * etc.) refer to. + */ +struct lu_ref { + /** + * Spin-lock protecting lu_ref::lf_list. 
+ */ + spinlock_t lf_guard; + /** + * List of all outstanding references (each represented by struct + * lu_ref_link), pointing to this object. + */ + struct list_head lf_list; + /** + * # of links. + */ + short lf_refs; + /** + * Flag set when lu_ref_add() failed to allocate lu_ref_link. It is + * used to mask spurious failure of the following lu_ref_del(). + */ + short lf_failed; + /** + * flags - attribute for the lu_ref, for pad and future use. + */ + short lf_flags; + /** + * Where was I initialized? + */ + short lf_line; + const char *lf_func; + /** + * Linkage into a global list of all lu_ref's (lu_ref_refs). + */ + struct list_head lf_linkage; +}; + +struct lu_ref_link { + struct lu_ref *ll_ref; + struct list_head ll_linkage; + const char *ll_scope; + const void *ll_source; +}; + +void lu_ref_init_loc(struct lu_ref *ref, const char *func, const int line); +void lu_ref_fini(struct lu_ref *ref); +#define lu_ref_init(ref) lu_ref_init_loc(ref, __func__, __LINE__) + +void lu_ref_add(struct lu_ref *ref, const char *scope, const void *source); + +void lu_ref_add_atomic(struct lu_ref *ref, const char *scope, + const void *source); + +void lu_ref_add_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source); + +void lu_ref_del(struct lu_ref *ref, const char *scope, const void *source); + +void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source0, const void *source1); + +void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source); + +void lu_ref_print(const struct lu_ref *ref); + +void lu_ref_print_all(void); + +int lu_ref_global_init(void); + +void lu_ref_global_fini(void); + +#else /* !CONFIG_LUSTRE_DEBUG_LU_REF */ + +struct lu_ref { +}; + +struct lu_ref_link { +}; + +static inline void lu_ref_init(struct lu_ref *ref) +{ +} + +static inline void lu_ref_fini(struct lu_ref *ref) +{ +} + +static inline void lu_ref_add(struct lu_ref *ref, + const char *scope, + const void *source) +{ +} + +static inline void lu_ref_add_atomic(struct lu_ref *ref, + const char *scope, + const void *source) +{ +} + +static inline void lu_ref_add_at(struct lu_ref *ref, + struct lu_ref_link *link, + const char *scope, + const void *source) +{ +} + +static inline void lu_ref_del(struct lu_ref *ref, const char *scope, + const void *source) +{ +} + +static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source0, + const void *source1) +{ +} + +static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ +} + +static inline int lu_ref_global_init(void) +{ + return 0; +} + +static inline void lu_ref_global_fini(void) +{ +} + +static inline void lu_ref_print(const struct lu_ref *ref) +{ +} + +static inline void lu_ref_print_all(void) +{ +} +#endif /* CONFIG_LUSTRE_DEBUG_LU_REF */ + +/** @} lu */ + +#endif /* __LUSTRE_LU_REF_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lu_target.h b/drivers/staging/lustrefsx/lustre/include/lu_target.h new file mode 100644 index 0000000000000..d061244d22322 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lu_target.h @@ -0,0 +1,740 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _LUSTRE_LU_TARGET_H +#define _LUSTRE_LU_TARGET_H + +#include +#include +#include +#include +#include +#include + +/* Each one represents a distribute transaction replay + * operation, and updates on each MDTs are linked to + * dtr_sub_list */ +struct distribute_txn_replay_req { + /* update record, may be vmalloc'd */ + struct llog_update_record *dtrq_lur; + int dtrq_lur_size; + + /* linked to the distribute transaction replay + * list (tdtd_replay_list) */ + struct list_head dtrq_list; + __u64 dtrq_master_transno; + __u64 dtrq_batchid; + __u64 dtrq_xid; + + /* all of sub updates are linked here */ + struct list_head dtrq_sub_list; + spinlock_t dtrq_sub_list_lock; + + /* If the local update has been executed during replay */ + __u32 dtrq_local_update_executed:1; +}; + +/* Each one represents a sub replay item under a distribute + * transaction. 
A distribute transaction will be operated in + * two or more MDTs, and updates on each MDT will be represented + * by this structure */ +struct distribute_txn_replay_req_sub { + __u32 dtrqs_mdt_index; + + /* All of cookies for the update will be linked here */ + spinlock_t dtrqs_cookie_list_lock; + struct list_head dtrqs_cookie_list; + struct list_head dtrqs_list; +}; + +struct target_distribute_txn_data; +typedef int (*distribute_txn_replay_handler_t)(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq); +typedef char *(*target_show_update_logs_retrievers_t)(void *data, int *size, + int *count); +struct target_distribute_txn_data { + /* Distribution ID is used to identify updates log on different + * MDTs for one operation */ + spinlock_t tdtd_batchid_lock; + __u64 tdtd_batchid; + struct lu_target *tdtd_lut; + struct dt_object *tdtd_batchid_obj; + struct dt_device *tdtd_dt; + + /* Committed batchid for distribute transaction */ + __u64 tdtd_committed_batchid; + + /* List for distribute transaction */ + struct list_head tdtd_list; + + /* Threads to manage distribute transaction */ + struct task_struct *tdtd_commit_task; + atomic_t tdtd_refcount; + struct lu_env tdtd_env; + + /* recovery update */ + distribute_txn_replay_handler_t tdtd_replay_handler; + struct list_head tdtd_replay_list; + struct list_head tdtd_replay_finish_list; + spinlock_t tdtd_replay_list_lock; + /* last replay update transno */ + __u32 tdtd_replay_ready:1; + + /* Manage the llog recovery threads */ + atomic_t tdtd_recovery_threads_count; + wait_queue_head_t tdtd_recovery_threads_waitq; + target_show_update_logs_retrievers_t + tdtd_show_update_logs_retrievers; + void *tdtd_show_retrievers_cbdata; +}; + +struct tg_grants_data { + /* grants: all values in bytes */ + /* grant lock to protect all grant counters */ + spinlock_t tgd_grant_lock; + /* total amount of dirty data reported by clients in incoming obdo */ + u64 tgd_tot_dirty; + /* sum of filesystem space granted to clients for async writes */ + u64 tgd_tot_granted; + /* grant used by I/Os in progress (between prepare and commit) */ + u64 tgd_tot_pending; + /* amount of available space in percentage that is never used for + * grants, used on MDT to always keep space for metadata. */ + u64 tgd_reserved_pcnt; + /* number of clients using grants */ + int tgd_tot_granted_clients; + /* shall we grant space to clients not + * supporting OBD_CONNECT_GRANT_PARAM? */ + int tgd_grant_compat_disable; + /* protect all statfs-related counters */ + spinlock_t tgd_osfs_lock; + time64_t tgd_osfs_age; + int tgd_blockbits; + /* counters used during statfs update, protected by ofd_osfs_lock. + * record when some statfs refresh are in progress */ + int tgd_statfs_inflight; + /* writes between prep & commit which might be accounted twice in + * ofd_osfs.os_bavail */ + u64 tgd_osfs_unstable; + /* track writes completed while statfs refresh is underway. 
+ * tracking is only effective when ofd_statfs_inflight > 1 */ + u64 tgd_osfs_inflight; + /* statfs optimization: we cache a bit */ + struct obd_statfs tgd_osfs; +}; + +struct lu_target { + struct obd_device *lut_obd; + struct dt_device *lut_bottom; + struct dt_device_param lut_dt_conf; + + struct target_distribute_txn_data *lut_tdtd; + + /* supported opcodes and handlers for this target */ + struct tgt_opc_slice *lut_slice; + __u32 lut_reply_fail_id; + __u32 lut_request_fail_id; + + /* sptlrpc rules */ + rwlock_t lut_sptlrpc_lock; + struct sptlrpc_rule_set lut_sptlrpc_rset; + spinlock_t lut_flags_lock; + unsigned int lut_syncjournal:1, + lut_sync_lock_cancel:2, + /* e.g. OST node */ + lut_no_reconstruct:1, + /* enforce recovery for local clients */ + lut_local_recovery:1, + lut_cksum_t10pi_enforce:1; + /* checksum types supported on this node */ + enum cksum_types lut_cksum_types_supported; + /** last_rcvd file */ + struct dt_object *lut_last_rcvd; + /* transaction callbacks */ + struct dt_txn_callback lut_txn_cb; + /** server data in last_rcvd file */ + struct lr_server_data lut_lsd; + /** Server last transaction number */ + __u64 lut_last_transno; + /** Lock protecting last transaction number */ + spinlock_t lut_translock; + /** Lock protecting client bitmap */ + spinlock_t lut_client_bitmap_lock; + /** Bitmap of known clients */ + unsigned long *lut_client_bitmap; + /* Number of clients supporting multiple modify RPCs + * recorded in the last_rcvd file + */ + atomic_t lut_num_clients; + /* Client generation to identify client slot reuse */ + atomic_t lut_client_generation; + /** reply_data file */ + struct dt_object *lut_reply_data; + /** Bitmap of used slots in the reply data file */ + unsigned long **lut_reply_bitmap; + /** target sync count, used for debug & test */ + atomic_t lut_sync_count; + + /** cross MDT locks which should trigger Sync-on-Lock-Cancel */ + spinlock_t lut_slc_locks_guard; + struct list_head lut_slc_locks; + + /* target grants fields */ + struct tg_grants_data lut_tgd; + + /* target tunables */ + const struct attribute **lut_attrs; + + /* FMD (file modification data) values */ + int lut_fmd_max_num; + time64_t lut_fmd_max_age; +}; + +#define LUT_FMD_MAX_NUM_DEFAULT 128 +#define LUT_FMD_MAX_AGE_DEFAULT (obd_timeout + 10) + +/* number of slots in reply bitmap */ +#define LUT_REPLY_SLOTS_PER_CHUNK (1<<20) +#define LUT_REPLY_SLOTS_MAX_CHUNKS 16 + +#define TRD_INDEX_MEMORY -1 + +/** + * Target reply data + */ +struct tg_reply_data { + /** chain of reply data anchored in tg_export_data */ + struct list_head trd_list; + /** copy of on-disk reply data */ + struct lsd_reply_data trd_reply; + /** versions for Version Based Recovery */ + __u64 trd_pre_versions[4]; + /** slot index in reply_data file */ + int trd_index; + /** tag the client used */ + __u16 trd_tag; + /** child fid to reconstruct open */ + struct lu_fid trd_object; +}; + +extern struct lu_context_key tgt_session_key; + +struct tgt_session_info { + /* + * The following members will be filled explicitly + * with specific data in tgt_ses_init(). + */ + struct req_capsule *tsi_pill; + + /* + * Lock request for "habeo clavis" operations. + */ + struct ldlm_request *tsi_dlm_req; + + /* although we have export in req, there are cases when it is not + * available, e.g. 
closing files upon export destroy */ + struct obd_export *tsi_exp; + const struct lu_env *tsi_env; + struct lu_target *tsi_tgt; + + const struct mdt_body *tsi_mdt_body; + struct ost_body *tsi_ost_body; + struct lu_object *tsi_corpus; + + struct lu_fid tsi_fid; + struct ldlm_res_id tsi_resid; + + /* object affected by VBR, for last_rcvd_update */ + struct dt_object *tsi_vbr_obj; + /* open child object, for last_rcvd_update */ + struct dt_object *tsi_open_obj; + /* opdata for mdt_reint_open(), has the same value as + * ldlm_reply:lock_policy_res1. The tgt_update_last_rcvd() stores + * this value onto disk for recovery when tgt_txn_stop_cb() is called. + */ + __u64 tsi_opdata; + + /* + * Additional fail id that can be set by handler. + */ + int tsi_reply_fail_id; + bool tsi_preprocessed; + /* request JobID */ + char *tsi_jobid; + + /* update replay */ + __u64 tsi_xid; + __u32 tsi_result; + __u32 tsi_client_gen; +}; + +static inline struct tgt_session_info *tgt_ses_info(const struct lu_env *env) +{ + struct tgt_session_info *tsi; + + LASSERT(env->le_ses != NULL); + tsi = lu_context_key_get(env->le_ses, &tgt_session_key); + LASSERT(tsi); + return tsi; +} + +static inline void tgt_vbr_obj_set(const struct lu_env *env, + struct dt_object *obj) +{ + struct tgt_session_info *tsi; + + if (env->le_ses != NULL) { + tsi = tgt_ses_info(env); + tsi->tsi_vbr_obj = obj; + } +} + +static inline void tgt_open_obj_set(const struct lu_env *env, + struct dt_object *obj) +{ + struct tgt_session_info *tsi; + + if (env->le_ses != NULL) { + tsi = tgt_ses_info(env); + tsi->tsi_open_obj = obj; + } +} + +static inline void tgt_opdata_set(const struct lu_env *env, __u64 flags) +{ + struct tgt_session_info *tsi; + + if (env->le_ses != NULL) { + tsi = tgt_ses_info(env); + tsi->tsi_opdata |= flags; + } +} + +static inline void tgt_opdata_clear(const struct lu_env *env, __u64 flags) +{ + struct tgt_session_info *tsi; + + if (env->le_ses != NULL) { + tsi = tgt_ses_info(env); + tsi->tsi_opdata &= ~flags; + } +} + +/* + * Generic unified target support. + */ +enum tgt_handler_flags { + /* + * struct *_body is passed in the incoming message, and object + * identified by this fid exists on disk. + */ + HAS_BODY = BIT(0), + /* + * struct ldlm_request is passed in the incoming message. + */ + HAS_KEY = BIT(1), + /* + * this request has fixed reply format, so that reply message can be + * packed by generic code. + */ + HAS_REPLY = BIT(2), + /* + * this request will modify something, so check whether the file system + * is readonly or not, then return -EROFS to client asap if necessary. + */ + IS_MUTABLE = BIT(3) +}; + +struct tgt_handler { + /* The name of this handler. */ + const char *th_name; + /* Fail id, check at the beginning */ + int th_fail_id; + /* Operation code */ + __u32 th_opc; + /* Flags in enum tgt_handler_flags */ + __u32 th_flags; + /* Request version for this opcode */ + enum lustre_msg_version th_version; + /* Handler function */ + int (*th_act)(struct tgt_session_info *tsi); + /* Handler function for high priority requests */ + void (*th_hp)(struct tgt_session_info *tsi); + /* Request format for this request */ + const struct req_format *th_fmt; +}; + +struct tgt_opc_slice { + __u32 tos_opc_start; /* First op code */ + __u32 tos_opc_end; /* Last op code */ + struct tgt_handler *tos_hs; /* Registered handler */ +}; + +static inline struct ptlrpc_request *tgt_ses_req(struct tgt_session_info *tsi) +{ + return tsi->tsi_pill ? 
tsi->tsi_pill->rc_req : NULL; +} + +static inline __u64 tgt_conn_flags(struct tgt_session_info *tsi) +{ + LASSERT(tsi->tsi_exp); + return exp_connect_flags(tsi->tsi_exp); +} + +static inline int req_is_replay(struct ptlrpc_request *req) +{ + LASSERT(req->rq_reqmsg); + return !!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY); +} + +static inline bool tgt_is_multimodrpcs_client(struct obd_export *exp) +{ + return exp_connect_flags(exp) & OBD_CONNECT_MULTIMODRPCS; +} + +static inline bool tgt_is_increasing_xid_client(struct obd_export *exp) +{ + return exp_connect_flags2(exp) & OBD_CONNECT2_INC_XID; +} + +/* target/tgt_handler.c */ +int tgt_request_handle(struct ptlrpc_request *req); +char *tgt_name(struct lu_target *tgt); +void tgt_counter_incr(struct obd_export *exp, int opcode); +int tgt_connect_check_sptlrpc(struct ptlrpc_request *req, + struct obd_export *exp); +int tgt_adapt_sptlrpc_conf(struct lu_target *tgt); +int tgt_connect(struct tgt_session_info *tsi); +int tgt_disconnect(struct tgt_session_info *uti); +int tgt_obd_ping(struct tgt_session_info *tsi); +int tgt_enqueue(struct tgt_session_info *tsi); +int tgt_convert(struct tgt_session_info *tsi); +int tgt_bl_callback(struct tgt_session_info *tsi); +int tgt_cp_callback(struct tgt_session_info *tsi); +int tgt_llog_open(struct tgt_session_info *tsi); +int tgt_llog_read_header(struct tgt_session_info *tsi); +int tgt_llog_next_block(struct tgt_session_info *tsi); +int tgt_llog_prev_block(struct tgt_session_info *tsi); +int tgt_sec_ctx_init(struct tgt_session_info *tsi); +int tgt_sec_ctx_init_cont(struct tgt_session_info *tsi); +int tgt_sec_ctx_fini(struct tgt_session_info *tsi); +int tgt_sendpage(struct tgt_session_info *tsi, struct lu_rdpg *rdpg, int nob); +int tgt_send_buffer(struct tgt_session_info *tsi, struct lu_rdbuf *rdbuf); +int tgt_validate_obdo(struct tgt_session_info *tsi, struct obdo *oa); +int tgt_sync(const struct lu_env *env, struct lu_target *tgt, + struct dt_object *obj, __u64 start, __u64 end); + +int tgt_io_thread_init(struct ptlrpc_thread *thread); +void tgt_io_thread_done(struct ptlrpc_thread *thread); + +int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + struct lustre_handle *lh, int mode, __u64 *flags); +int tgt_extent_lock(const struct lu_env *env, struct ldlm_namespace *ns, + struct ldlm_res_id *res_id, __u64 start, __u64 end, + struct lustre_handle *lh, int mode, __u64 *flags); +void tgt_data_unlock(struct lustre_handle *lh, enum ldlm_mode mode); +int tgt_brw_read(struct tgt_session_info *tsi); +int tgt_brw_write(struct tgt_session_info *tsi); +int tgt_lseek(struct tgt_session_info *tsi); +int tgt_hpreq_handler(struct ptlrpc_request *req); +void tgt_register_lfsck_in_notify_local(int (*notify)(const struct lu_env *, + struct dt_device *, + struct lfsck_req_local *, + struct thandle *)); +void tgt_register_lfsck_in_notify(int (*notify)(const struct lu_env *, + struct dt_device *, + struct lfsck_request *)); +void tgt_register_lfsck_query(int (*query)(const struct lu_env *, + struct dt_device *, + struct lfsck_request *, + struct lfsck_reply *, + struct lfsck_query *)); +int req_can_reconstruct(struct ptlrpc_request *req, struct tg_reply_data *trd); + +extern struct tgt_handler tgt_sec_ctx_handlers[]; +extern struct tgt_handler tgt_lfsck_handlers[]; +extern struct tgt_handler tgt_obd_handlers[]; +extern struct tgt_handler tgt_dlm_handlers[]; +extern struct tgt_handler tgt_llog_handlers[]; +extern struct tgt_handler tgt_out_handlers[]; +extern struct tgt_handler fld_handlers[]; 
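The struct tgt_handler / struct tgt_opc_slice pair declared above is the dispatch table that tgt_request_handle() consults for incoming RPCs. A minimal sketch of how a target driver might populate such a table follows; it is illustrative only and not part of the patch. The my_* names are hypothetical, the OBD_PING-related identifiers are pre-existing Lustre names used purely as an example, and real drivers normally build these entries with the TGT_*_HDL() convenience macros defined later in this header before handing the slice to tgt_init().

/*
 * Hypothetical sketch: one handler entry plus the opcode slice that
 * describes it.  tgt_obd_ping() and the th_ / tos_ fields are declared
 * above; OBD_PING, OBD_FIRST_OPC, OBD_LAST_OPC, OBD_FAIL_OBD_PING_NET,
 * RQF_OBD_PING and LUSTRE_OBD_VERSION are existing Lustre identifiers.
 */
static struct tgt_handler my_obd_handlers[] = {
	[OBD_PING - OBD_FIRST_OPC] = {
		.th_name	= "OBD_PING",
		.th_fail_id	= OBD_FAIL_OBD_PING_NET,
		.th_opc		= OBD_PING,
		.th_flags	= HAS_REPLY,
		.th_act		= tgt_obd_ping,
		.th_fmt		= &RQF_OBD_PING,
		.th_version	= LUSTRE_OBD_VERSION,
	},
};

static struct tgt_opc_slice my_slice[] = {
	{
		.tos_opc_start	= OBD_FIRST_OPC,
		.tos_opc_end	= OBD_LAST_OPC,
		.tos_hs		= my_obd_handlers,
	},
	{
		.tos_hs		= NULL	/* terminator */
	}
};

Such a slice array is what a driver would pass to tgt_init() (declared further down) so that tgt_request_handle() can route each request to the matching th_act callback.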
+extern struct tgt_handler seq_handlers[]; + +typedef void (*tgt_cb_t)(struct lu_target *lut, __u64 transno, + void *data, int err); +struct tgt_commit_cb { + tgt_cb_t tgt_cb_func; + void *tgt_cb_data; +}; + +int tgt_hpreq_handler(struct ptlrpc_request *req); + +/* target/tgt_main.c */ +void tgt_boot_epoch_update(struct lu_target *lut); +void tgt_save_slc_lock(struct lu_target *lut, struct ldlm_lock *lock, + __u64 transno); +void tgt_discard_slc_lock(struct lu_target *lut, struct ldlm_lock *lock); +int tgt_init(const struct lu_env *env, struct lu_target *lut, + struct obd_device *obd, struct dt_device *dt, + struct tgt_opc_slice *slice, + int request_fail_id, int reply_fail_id); +void tgt_fini(const struct lu_env *env, struct lu_target *lut); +int tgt_client_alloc(struct obd_export *exp); +void tgt_client_free(struct obd_export *exp); +int tgt_client_del(const struct lu_env *env, struct obd_export *exp); +int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int); +int tgt_client_new(const struct lu_env *env, struct obd_export *exp); +int tgt_server_data_update(const struct lu_env *env, struct lu_target *tg, + int sync); +int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt); +int tgt_lookup_reply(struct ptlrpc_request *req, struct tg_reply_data *trd); +int tgt_mk_reply_data(const struct lu_env *env, struct lu_target *tgt, + struct tg_export_data *ted, struct ptlrpc_request *req, + __u64 opdata, struct thandle *th, bool write_update, + __u64 transno); +struct tg_reply_data *tgt_lookup_reply_by_xid(struct tg_export_data *ted, + __u64 xid); +int tgt_tunables_init(struct lu_target *lut); +void tgt_tunables_fini(struct lu_target *lut); +void tgt_mask_cksum_types(struct lu_target *lut, enum cksum_types *cksum_types); + +/* target/tgt_grant.c */ +static inline int exp_grant_param_supp(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_GRANT_PARAM); +} + +/* Blocksize used for client not supporting OBD_CONNECT_GRANT_PARAM. + * That's 4KB=2^12 which is the biggest block size known to work whatever + * the client's page size is. 
*/ +#define COMPAT_BSIZE_SHIFT 12 + +void tgt_grant_sanity_check(struct obd_device *obd, const char *func); +void tgt_grant_connect(const struct lu_env *env, struct obd_export *exp, + struct obd_connect_data *data, bool new_conn); +void tgt_grant_discard(struct obd_export *exp); +void tgt_grant_prepare_read(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); +void tgt_grant_prepare_write(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct niobuf_remote *rnb, + int niocount); +void tgt_grant_commit(struct obd_export *exp, unsigned long grant_used, int rc); +int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp, + unsigned long grant); +long tgt_grant_create(const struct lu_env *env, struct obd_export *exp, + s64 *nr); +int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, + struct obd_statfs *osfs, time64_t max_age, + int *from_cache); +ssize_t tot_dirty_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t tot_granted_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t tot_pending_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t grant_compat_disable_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t grant_compat_disable_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 16, 53, 0) +ssize_t sync_lock_cancel_show(struct kobject *kobj, + struct attribute *attr, char *buf); +ssize_t sync_lock_cancel_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); +#endif + +/* FMD */ +void tgt_fmd_update(struct obd_export *exp, const struct lu_fid *fid, + __u64 xid); +bool tgt_fmd_check(struct obd_export *exp, const struct lu_fid *fid, + __u64 xid); +#ifdef DO_FMD_DROP +void tgt_fmd_drop(struct obd_export *exp, const struct lu_fid *fid); +#else +#define tgt_fmd_drop(exp, fid) do {} while (0) +#endif + +/* target/update_trans.c */ +int distribute_txn_init(const struct lu_env *env, + struct lu_target *lut, + struct target_distribute_txn_data *tdtd, + __u32 index); +void distribute_txn_fini(const struct lu_env *env, + struct target_distribute_txn_data *tdtd); + +/* target/update_recovery.c */ +int insert_update_records_to_replay_list(struct target_distribute_txn_data *, + struct llog_update_record *, + struct llog_cookie *, __u32); +void dtrq_list_dump(struct target_distribute_txn_data *tdtd, + unsigned int mask); +void dtrq_list_destroy(struct target_distribute_txn_data *tdtd); +int distribute_txn_replay_handle(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq); +__u64 distribute_txn_get_next_transno(struct target_distribute_txn_data *tdtd); +struct distribute_txn_replay_req * +distribute_txn_get_next_req(struct target_distribute_txn_data *tdtd); +void dtrq_destroy(struct distribute_txn_replay_req *dtrq); +struct distribute_txn_replay_req_sub * +dtrq_sub_lookup(struct distribute_txn_replay_req *dtrq, __u32 mdt_index); +struct distribute_txn_replay_req * +distribute_txn_lookup_finish_list(struct target_distribute_txn_data *tdtd, + __u64 transno); +bool is_req_replayed_by_update(struct ptlrpc_request *req); +enum { + ESERIOUS = 0x0001000 +}; + +static inline int err_serious(int rc) +{ + LASSERT(rc < 0); + return -(-rc | ESERIOUS); +} + +static inline int clear_serious(int rc) +{ + if (rc < 0) + rc = -(-rc & ~ESERIOUS); + return rc; +} + +static inline int is_serious(int 
rc) +{ + return (rc < 0 && -rc & ESERIOUS); +} + +/* + * Unified target generic handers macros and generic functions. + */ +#define TGT_RPC_HANDLER_HP(base, flags, opc, fn, hp, fmt, version) \ +[opc - base] = { \ + .th_name = #opc, \ + .th_fail_id = OBD_FAIL_ ## opc ## _NET, \ + .th_opc = opc, \ + .th_flags = flags, \ + .th_act = fn, \ + .th_fmt = fmt, \ + .th_version = version, \ + .th_hp = hp, \ +} +#define TGT_RPC_HANDLER(base, flags, opc, fn, fmt, version) \ + TGT_RPC_HANDLER_HP(base, flags, opc, fn, NULL, fmt, version) + +/* MDT Request with a format known in advance */ +#define TGT_MDT_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(MDS_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MDS_VERSION) +/* Request with a format we do not yet know */ +#define TGT_MDT_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(MDS_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_MDS_VERSION) + +/* OST Request with a format known in advance */ +#define TGT_OST_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(OST_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_OST_VERSION) +#define TGT_OST_HDL_HP(flags, name, fn, hp) \ + TGT_RPC_HANDLER_HP(OST_FIRST_OPC, flags, name, fn, hp, \ + &RQF_ ## name, LUSTRE_OST_VERSION) + +/* MGS request with a format known in advance */ +#define TGT_MGS_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(MGS_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MGS_VERSION) +#define TGT_MGS_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(MGS_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_MGS_VERSION) + +/* + * OBD handler macros and generic functions. + */ +#define TGT_OBD_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(OBD_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_OBD_VERSION) +#define TGT_OBD_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(OBD_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_OBD_VERSION) + +/* + * DLM handler macros and generic functions. + */ +#define TGT_DLM_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(LDLM_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_DLM_VERSION) +#define TGT_DLM_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(LDLM_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_DLM_VERSION) + +/* + * LLOG handler macros and generic functions. + */ +#define TGT_LLOG_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(LLOG_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_LOG_VERSION) +#define TGT_LLOG_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(LLOG_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_LOG_VERSION) + +/* + * Sec context handler macros and generic functions. 
+ */ +#define TGT_SEC_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(SEC_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_OBD_VERSION) + +#define TGT_QUOTA_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(QUOTA_DQACQ, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MDS_VERSION) + +/* Sequence service handlers */ +#define TGT_SEQ_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(SEQ_QUERY, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MDS_VERSION) + +/* FID Location Database handlers */ +#define TGT_FLD_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(FLD_QUERY, flags, name, fn, NULL, \ + LUSTRE_MDS_VERSION) + +/* LFSCK handlers */ +#define TGT_LFSCK_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(LFSCK_FIRST_OPC, flags, name, fn, \ + &RQF_ ## name, LUSTRE_OBD_VERSION) + +/* Request with a format known in advance */ +#define TGT_UPDATE_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(OUT_UPDATE, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MDS_VERSION) + +#endif /* __LUSTRE_LU_TARGET_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h b/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h new file mode 100644 index 0000000000000..b0f20f7b0a483 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h @@ -0,0 +1,140 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre/libiam.h + * + * iam user level library + * + * Author: Wang Di + * Author: Nikita Danilov + * Author: Fan Yong + */ + +/* + * lustre/libiam.h + */ + +#ifndef __IAM_ULIB_H__ +#define __IAM_ULIB_H__ + +/** \defgroup libiam libiam + * + * @{ + */ + + +#define DX_FMT_NAME_LEN 16 + +enum iam_fmt_t { + FMT_LFIX, + FMT_LVAR +}; + +struct iam_uapi_info { + __u16 iui_keysize; + __u16 iui_recsize; + __u16 iui_ptrsize; + __u16 iui_height; + char iui_fmt_name[DX_FMT_NAME_LEN]; +}; + +/* + * Creat an iam file, but do NOT open it. + * Return 0 if success, else -1. + */ +int iam_creat(char *filename, enum iam_fmt_t fmt, + int blocksize, int keysize, int recsize, int ptrsize); + +/* + * Open an iam file, but do NOT creat it if the file doesn't exist. + * Please use iam_creat for creating the file before use iam_open. + * Return file id (fd) if success, else -1. + */ +int iam_open(char *filename, struct iam_uapi_info *ua); + +/* + * Close file opened by iam_open. + */ +int iam_close(int fd); + +/* + * Please use iam_open before use this function. + */ +int iam_insert(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Please use iam_open before use this function. 
+ */ +int iam_lookup(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_delete(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Please use iam_open before use this function. + */ +int iam_it_start(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_it_next(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_it_stop(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Change iam file mode. + */ +int iam_polymorph(char *filename, unsigned long mode); + +/** @} libiam */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/liblustreapi.h b/drivers/staging/lustrefsx/lustre/include/lustre/liblustreapi.h new file mode 100644 index 0000000000000..2778a34149bfa --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/liblustreapi.h @@ -0,0 +1,38 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +/* + * NOTE: This file is DEPRECATED! Please include lustreapi.h directly + * instead of this file. This file will be removed from a future version + * of lustre! + */ + +#ifndef _LIBLUSTREAPI_H_ +#define _LIBLUSTREAPI_H_ + +#include +#warning "Including liblustreapi.h is deprecated. Include lustreapi.h directly." + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h new file mode 100644 index 0000000000000..083187b461269 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h @@ -0,0 +1,48 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre/ll_fiemap.h + * + * FIEMAP data structures and flags. This header file will be used until + * fiemap.h is available in the upstream kernel. + * + * Author: Kalpak Shah + * Author: Andreas Dilger + */ + +/* + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_fiemap.h + * directly instead of this file. This file will be removed from a + * future version of lustre! + */ + +#include + +#warning "Including ll_fiemap.h is deprecated. Include linux/lustre/lustre_fiemap.h directly." diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h new file mode 100644 index 0000000000000..f8489d55a3b44 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h @@ -0,0 +1,40 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + * + * lustre/include/lustre/lustre_barrier_user.h + * + * Lustre write barrier (on MDT) userspace interfaces. + * + * Author: Fan, Yong + */ + +/* + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_barrier_user.h + * directly instead of this file. This file will be removed from a + * future version of lustre! + */ + +#include + +#warning "Including lustre_barrier_user.h is deprecated. Include linux/lustre/lustre_barrier_user.h directly." diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h new file mode 100644 index 0000000000000..7b84426fa2750 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h @@ -0,0 +1,40 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/include/lustre/lustre_lfsck_user.h + * + * Lustre LFSCK userspace interfaces. + * + * Author: Fan, Yong + */ + +/* + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_lfsck_user.h + * directly instead of this file. This file will be removed from a + * future version of lustre! + */ + +#include +#warning "Including lustre_lfsck_user.h is deprecated. Include linux/lustre/lustre_lfsck_user.h directly." diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h new file mode 100644 index 0000000000000..81bcf6dc6697e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h @@ -0,0 +1,47 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre/lustre_user.h + * + * Lustre public user-space interface definitions. + */ + +/* + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_user.h + * directly instead of this file. This file will be removed from a + * future version of lustre! + */ + +#include + +/* Disable warning until 2.16 or 3.0, until new header is widely available. + * This gives apps time to move to the new header without spurious warnings. +#warning "Including lustre/lustre_user.h is deprecated. Include linux/lustre/lustre_user.h instead." +*/ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h new file mode 100644 index 0000000000000..02317112226a5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h @@ -0,0 +1,1245 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _LUSTREAPI_H_ +#define _LUSTREAPI_H_ + +/** \defgroup llapi llapi + * + * @{ + */ + +#include +#include +#include +#include +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifndef LL_MAXQUOTAS +#define LL_MAXQUOTAS 3 +#endif + +#ifndef SEL_UNIT_SIZE +#define SEL_UNIT_SIZE 1024llu +#endif + +#ifndef LOV_PATTERN_DEFAULT +#define LOV_PATTERN_DEFAULT 0xffffffff +#endif + +#ifndef fallthrough +# if defined(__GNUC__) && __GNUC__ >= 7 +# define fallthrough __attribute__((fallthrough)) /* fallthrough */ +# else +# define fallthrough do {} while (0) /* fallthrough */ +# endif +#endif + +typedef struct statx lstatx_t; + +#define lustre_fid struct lu_fid + +/* + * BUILD_BUG_ON() is Compile-time check which verifies correctness at + * compile-time rather than runtime. If "cond" is true, (1 - 2*!!(cond)) + * will be a negative value, which will cause the compiler to complain. + * + */ +#ifndef BUILD_BUG_ON +#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2*!!(cond)])) +#endif + +/* Currently external applications can access this but in the + * future this will no longer be exposed for the user. Instead + * if you want to know if the library is initialized just call + * llapi_liblustreapi_initialized() which is now available. 
*/ +extern bool liblustreapi_initialized; + +typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name, char *obd_uuid, + void *args); + +/* lustreapi message severity level */ +enum llapi_message_level { + LLAPI_MSG_OFF = 0, + LLAPI_MSG_FATAL = 1, + LLAPI_MSG_ERROR = 2, + LLAPI_MSG_WARN = 3, + LLAPI_MSG_NORMAL = 4, + LLAPI_MSG_INFO = 5, + LLAPI_MSG_DEBUG = 6, + LLAPI_MSG_MAX +}; + +typedef void (*llapi_log_callback_t)(enum llapi_message_level level, int err, + const char *fmt, va_list ap); + +static inline bool llapi_liblustreapi_initialized(void) +{ + return liblustreapi_initialized; +} + +/* the bottom three bits reserved for llapi_message_level */ +#define LLAPI_MSG_MASK 0x00000007 +#define LLAPI_MSG_NO_ERRNO 0x00000010 + +static inline const char *llapi_msg_level2str(enum llapi_message_level level) +{ + static const char *levels[LLAPI_MSG_MAX] = {"OFF", "FATAL", "ERROR", + "WARNING", "NORMAL", + "INFO", "DEBUG"}; + + if (level >= LLAPI_MSG_MAX) + return NULL; + + return levels[level]; +} + +void llapi_msg_set_level(int level); +int llapi_msg_get_level(void); +llapi_log_callback_t llapi_error_callback_set(llapi_log_callback_t cb); +llapi_log_callback_t llapi_info_callback_set(llapi_log_callback_t cb); + +void llapi_error(enum llapi_message_level level, int err, const char *fmt, ...) + __attribute__((__format__(__printf__, 3, 4))); +#define llapi_err_noerrno(level, fmt, a...) \ + llapi_error((level) | LLAPI_MSG_NO_ERRNO, 0, fmt, ## a) +void llapi_printf(enum llapi_message_level level, const char *fmt, ...) + __attribute__((__format__(__printf__, 2, 3))); + +struct llapi_stripe_param { + unsigned long long lsp_stripe_size; + char *lsp_pool; + int lsp_stripe_offset; + int lsp_stripe_pattern; + /* Number of stripes. Size of lsp_osts[] if lsp_specific is true.*/ + int lsp_stripe_count; + bool lsp_is_specific; + bool lsp_is_create; + __u8 lsp_max_inherit; + __u8 lsp_max_inherit_rr; + __u32 lsp_osts[0]; +}; + +#define lsp_tgts lsp_osts + +enum { + LLAPI_MIGRATION_NONBLOCK = 0x0001, + LLAPI_MIGRATION_MIRROR = 0x0002, + LLAPI_MIGRATION_NONDIRECT = 0x0004, + LLAPI_MIGRATION_VERBOSE = 0x0008, +}; + +__u32 llapi_pattern_to_lov(uint64_t pattern); + +int llapi_file_open_param(const char *name, int flags, mode_t mode, + const struct llapi_stripe_param *param); +int llapi_file_is_encrypted(int fd); +int llapi_file_create_foreign(const char *name, mode_t mode, __u32 type, + __u32 flags, char *foreign_lov); +int llapi_file_create(const char *name, unsigned long long stripe_size, + int stripe_offset, int stripe_count, int stripe_pattern); +int llapi_file_open(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern); +int llapi_file_create_pool(const char *name, unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name); +int llapi_file_open_pool(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, char *pool_name); +int llapi_poollist(const char *name); +int llapi_get_poolbuf(const char *name, char **buf, + char ***poolist, int *poolcount); +int llapi_get_poollist(const char *name, char **poollist, int list_size, + char *buffer, int buffer_size); +int llapi_get_poolmembers(const char *poolname, char **members, int list_size, + char *buffer, int buffer_size); +int llapi_file_get_stripe(const char *path, struct lov_user_md *lum); +int llapi_file_lookup(int dirfd, const char *name); +void 
llapi_set_command_name(const char *cmd); +void llapi_clear_command_name(void); + +enum llapi_layout_verbose { + VERBOSE_STRIPE_COUNT = 0x1, + VERBOSE_STRIPE_SIZE = 0x2, + VERBOSE_STRIPE_OFFSET = 0x4, + VERBOSE_POOL = 0x8, + VERBOSE_DETAIL = 0x10, + VERBOSE_OBJID = 0x20, + VERBOSE_GENERATION = 0x40, + VERBOSE_MDTINDEX = 0x80, + VERBOSE_PATTERN = 0x100, + VERBOSE_COMP_COUNT = 0x200, + VERBOSE_COMP_FLAGS = 0x400, + VERBOSE_COMP_START = 0x800, + VERBOSE_COMP_END = 0x1000, + VERBOSE_COMP_ID = 0x2000, + VERBOSE_DFID = 0x4000, + VERBOSE_HASH_TYPE = 0x8000, + VERBOSE_MIRROR_COUNT = 0x10000, + VERBOSE_MIRROR_ID = 0x20000, + VERBOSE_EXT_SIZE = 0x40000, + VERBOSE_INHERIT = 0x80000, + VERBOSE_INHERIT_RR = 0x100000, + VERBOSE_DEFAULT = VERBOSE_STRIPE_COUNT | VERBOSE_STRIPE_SIZE | + VERBOSE_STRIPE_OFFSET | VERBOSE_POOL | + VERBOSE_OBJID | VERBOSE_GENERATION | + VERBOSE_PATTERN | VERBOSE_HASH_TYPE | + VERBOSE_COMP_COUNT | VERBOSE_COMP_FLAGS | + VERBOSE_COMP_START | VERBOSE_COMP_END | + VERBOSE_COMP_ID | VERBOSE_MIRROR_COUNT | + VERBOSE_MIRROR_ID | VERBOSE_EXT_SIZE | + VERBOSE_INHERIT | VERBOSE_INHERIT_RR +}; +/* Compatibility with original names */ +#define VERBOSE_SIZE VERBOSE_STRIPE_SIZE +#define VERBOSE_COUNT VERBOSE_STRIPE_COUNT +#define VERBOSE_OFFSET VERBOSE_STRIPE_OFFSET +#define VERBOSE_LAYOUT VERBOSE_PATTERN + +enum { + NEWERXY_ATIME = 0, /* neweraY */ + NEWERXY_MTIME = 1, /* newermY */ + NEWERXY_CTIME = 2, /* newercY */ + NEWERXY_BTIME = 3, /* newerbY | newerBY */ + NEWERXY_MAX, +}; + +enum lfs_find_perm { + LFS_FIND_PERM_EXACT = -2, + LFS_FIND_PERM_ANY = -1, + LFS_FIND_PERM_OFF = 0, + LFS_FIND_PERM_ALL = 1, +}; + +struct find_param { + unsigned int fp_max_depth; + dev_t fp_dev; + mode_t fp_type; /* S_IFIFO,... */ + uid_t fp_uid; + gid_t fp_gid; + mode_t fp_perm; + time_t fp_atime; + time_t fp_mtime; + time_t fp_ctime; + /* {a,m,c,b}sign cannot be bitfields due to using pointers to + * access them during argument parsing. */ + int fp_asign; + int fp_msign; + int fp_csign; + /* these need to be signed values */ + int fp_size_sign:2, + fp_stripe_size_sign:2, + fp_stripe_count_sign:2, + fp_comp_start_sign:2, + fp_comp_end_sign:2, + fp_comp_count_sign:2, + fp_mirror_count_sign:2, + fp_mirror_index_sign:2, + fp_mirror_id_sign:2, + fp_mdt_count_sign:2, + fp_blocks_sign:2, + fp_ext_size_sign:2, + fp_perm_sign:2, + fp_unused2_sign:2, /* Once used we must add */ + fp_unused3_sign:2, /* a separate flag field */ + fp_unused4_sign:2; /* at end of the struct. 
*/ + unsigned long long fp_size; + unsigned long long fp_size_units; + + unsigned long long fp_zero_end:1, + fp_recursive:1, + fp_exclude_pattern:1, + fp_exclude_type:1, + fp_exclude_obd:1, + fp_exclude_mdt:1, + fp_exclude_gid:1, + fp_exclude_uid:1, + fp_check_gid:1, + fp_check_uid:1, + fp_check_pool:1, /* LOV pool name */ + fp_check_size:1, /* file size */ + fp_exclude_pool:1, + fp_exclude_size:1, + fp_exclude_atime:1, + fp_exclude_mtime:1, + fp_exclude_ctime:1, + fp_get_lmv:1, /* get MDT list from LMV */ + fp_raw:1, /* do not fill in defaults */ + fp_check_stripe_size:1, /* LOV stripe size */ + fp_exclude_stripe_size:1, + fp_check_stripe_count:1, /* LOV stripe count */ + fp_exclude_stripe_count:1, + fp_check_layout:1, + fp_exclude_layout:1, + fp_get_default_lmv:1, /* Get default LMV */ + fp_migrate:1, + fp_check_projid:1, + fp_exclude_projid:1, + fp_check_comp_count:1, + fp_exclude_comp_count:1, + fp_check_mirror_count:1, + fp_exclude_mirror_count:1, + fp_check_comp_flags:1, + fp_check_mirror_state:1, + fp_check_comp_start:1, + fp_exclude_comp_start:1, + fp_check_comp_end:1, + fp_exclude_comp_end:1, + fp_check_comp_id:1, + fp_exclude_comp_id:1, + fp_check_mirror_id:1, + fp_exclude_mirror_id:1, + fp_check_mirror_index:1, + fp_exclude_mirror_index:1, + fp_check_mdt_count:1, + fp_exclude_mdt_count:1, + fp_check_hash_flag:1, + fp_exclude_hash_type:1, + fp_yaml:1, /* output layout in YAML */ + fp_check_blocks:1, + fp_exclude_blocks:1, + fp_check_foreign:1, + fp_exclude_foreign:1, + fp_check_ext_size:1, /* extension size */ + fp_exclude_ext_size:1, + fp_lazy:1, + fp_newerxy:1, + fp_exclude_btime:1, + fp_exclude_perm:1, + fp_unused_bit4:1, /* Once all unused fields */ + fp_unused_bit5:1, /* are used we need to add */ + fp_unused_bit6:1, /* a separate flag field at*/ + fp_unused_bit7:1; /* the end of the struct. */ + + enum llapi_layout_verbose fp_verbose; + int fp_quiet; + + /* regular expression */ + char *fp_pattern; + + struct obd_uuid *fp_obd_uuid; + int fp_num_obds; + int fp_num_alloc_obds; + int fp_obd_index; + int *fp_obd_indexes; + + struct obd_uuid *fp_mdt_uuid; + int fp_num_mdts; + int fp_num_alloc_mdts; + int fp_mdt_index; + int *fp_mdt_indexes; + int fp_file_mdt_index; + + size_t fp_lum_size; + struct lov_user_mds_data *fp_lmd; + + char fp_poolname[LOV_MAXPOOLNAME + 1]; + + __u32 fp_lmv_stripe_count; + struct lmv_user_md *fp_lmv_md; + + unsigned long long fp_stripe_size; + unsigned long long fp_stripe_size_units; + unsigned long long fp_stripe_count; + __u32 fp_layout; + + __u32 fp_comp_count; + __u32 fp_mirror_count; + __u32 fp_comp_flags; + __u32 fp_comp_neg_flags; + __u16 fp_mirror_state; + __u16 fp_mirror_neg_state; + __u32 fp_comp_id; + __u16 fp_mirror_id; + __u16 fp_mirror_index; + unsigned long long fp_comp_start; + unsigned long long fp_comp_start_units; + unsigned long long fp_comp_end; + unsigned long long fp_comp_end_units; + unsigned long long fp_mdt_count; + unsigned int fp_projid; + unsigned long long fp_blocks; + unsigned long long fp_blocks_units; + + unsigned long fp_got_uuids:1, + fp_obds_printed:1; + unsigned int fp_depth; + unsigned int fp_hash_type; + unsigned int fp_time_margin; /* time margin in seconds */ + __u32 fp_foreign_type; + unsigned long long fp_ext_size; + unsigned long long fp_ext_size_units; + + /* + * fp_newery[NEWERXY_MAX][0]: --newerXY reference + * fp_newery[NEWERXY_MAX][1]: ! 
-- newerXY reference + */ + time_t fp_newery[NEWERXY_MAX][2]; + + time_t fp_btime; + int fp_bsign; + unsigned int fp_hash_inflags; + unsigned int fp_hash_exflags; + /* Print all information (lfs find only) */ + char *fp_format_printf_str; +}; + +int llapi_ostlist(char *path, struct find_param *param); +int llapi_uuid_match(char *real_uuid, char *search_uuid); +int llapi_getstripe(char *path, struct find_param *param); +int llapi_find(char *path, struct find_param *param); + +int llapi_file_fget_mdtidx(int fd, int *mdtidx); +int llapi_dir_set_default_lmv(const char *name, + const struct llapi_stripe_param *param); +int llapi_dir_set_default_lmv_stripe(const char *name, int stripe_offset, + int stripe_count, int stripe_pattern, + const char *pool_name); +int llapi_dir_create(const char *name, mode_t mode, + const struct llapi_stripe_param *param); +int llapi_dir_create_foreign(const char *name, mode_t mode, __u32 type, + __u32 flags, const char *value); +int llapi_dir_create_pool(const char *name, int flags, int stripe_offset, + int stripe_count, int stripe_pattern, + const char *poolname); +int llapi_direntry_remove(char *dname); +int llapi_unlink_foreign(char *dname); + +int llapi_obd_fstatfs(int fd, __u32 type, __u32 index, + struct obd_statfs *stat_buf, struct obd_uuid *uuid_buf); +int llapi_obd_statfs(char *path, __u32 type, __u32 index, + struct obd_statfs *stat_buf, struct obd_uuid *uuid_buf); +int llapi_ping(char *obd_type, char *obd_name); +int llapi_target_check(int num_types, char **obd_types, char *dir); +int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); +int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid); +int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid); +int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); +int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count); +int llapi_is_lustre_mnttype(const char *type); +int llapi_search_tgt(const char *fsname, const char *poolname, + const char *tgtname, bool is_mdt); +int llapi_search_mdt(const char *fsname, const char *poolname, + const char *mdtname); +int llapi_search_ost(const char *fsname, const char *poolname, + const char *ostname); +int llapi_get_obd_count(char *mnt, int *count, int is_mdt); +int llapi_parse_size(const char *optarg, unsigned long long *size, + unsigned long long *size_units, int bytes_spec); +int llapi_search_mounts(const char *pathname, int index, char *mntdir, + char *fsname); +int llapi_search_fsname(const char *pathname, char *fsname); +int llapi_get_fsname_instance(const char *path, char *fsname, size_t fsname_len, + char *instance, size_t instance_len); +int llapi_get_instance(const char *path, char *instance, size_t instance_len); +int llapi_get_fsname(const char *path, char *fsname, size_t fsname_len); +int llapi_getname(const char *path, char *name, size_t namelen); +int llapi_search_fileset(const char *pathname, char *fileset); + +int llapi_search_rootpath(char *pathname, const char *fsname); +int llapi_search_rootpath_by_dev(char *pathname, dev_t dev); +int llapi_nodemap_exists(const char *name); +int llapi_migrate_mdt(char *path, struct find_param *param); +int llapi_mv(char *path, struct find_param *param); + +struct mntent; + +#define HAVE_LLAPI_IS_LUSTRE_MNT +int llapi_is_lustre_mnt(struct mntent *mnt); +int llapi_quotactl(char *mnt, struct if_quotactl *qctl); +int llapi_target_iterate(int type_num, char **obd_type, void *args, + llapi_cb_t cb); +int llapi_get_connect_flags(const char *mnt, __u64 
*flags); +int llapi_cp(int argc, char *argv[]); +int llapi_ls(int argc, char *argv[]); +int llapi_fid_parse(const char *fidstr, struct lu_fid *fid, char **endptr); +int llapi_fid2path_at(int mnt_fd, const struct lu_fid *fid, char *path, + int pathlen, long long *recno, int *linkno); +int llapi_fid2path(const char *device, const char *fidstr, char *path, + int pathlen, long long *recno, int *linkno); +int llapi_path2fid(const char *path, struct lu_fid *fid); +int llapi_get_mdt_index_by_fid(int fd, const struct lu_fid *fid, + int *mdt_index); +int llapi_get_lum_file(const char *path, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); +int llapi_get_lum_dir(const char *path, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); +int llapi_get_lum_file_fd(int dir_fd, const char *fname, __u64 *valid, + lstatx_t *statx, struct lov_user_md *lum, + size_t lumsize); +int llapi_get_lum_dir_fd(int dir_fd, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); + +int llapi_fd2fid(int fd, struct lu_fid *fid); +/* get FID of parent dir + the related name of entry in this parent dir */ +int llapi_path2parent(const char *path, unsigned int linkno, + struct lu_fid *parent_fid, char *name, size_t name_size); +int llapi_fd2parent(int fd, unsigned int linkno, struct lu_fid *parent_fid, + char *name, size_t name_size); +int llapi_rmfid(const char *path, struct fid_array *fa); +int llapi_chomp_string(char *buf); +int llapi_open_by_fid(const char *dir, const struct lu_fid *fid, + int open_flags); +int llapi_get_version_string(char *version, unsigned int version_size); +/* llapi_get_version() is deprecated, use llapi_get_version_string() instead */ +int llapi_get_version(char *buffer, int buffer_size, char **version) + __attribute__((deprecated)); +int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags); +int llapi_file_flush(int fd); +extern int llapi_get_ost_layout_version(int fd, __u32 *layout_version); +int llapi_hsm_state_get_fd(int fd, struct hsm_user_state *hus); +int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus); +int llapi_hsm_state_set_fd(int fd, __u64 setmask, __u64 clearmask, + __u32 archive_id); +int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask, + __u32 archive_id); +int llapi_hsm_register_event_fifo(const char *path); +int llapi_hsm_unregister_event_fifo(const char *path); +void llapi_hsm_log_error(enum llapi_message_level level, int _rc, + const char *fmt, va_list args); + +int llapi_get_agent_uuid(char *path, char *buf, size_t bufsize); +int llapi_create_volatile_idx(const char *directory, int mdt_idx, + int open_flags); +int llapi_create_volatile_param(const char *directory, int mdt_idx, + int open_flags, mode_t mode, + const struct llapi_stripe_param *stripe_param); + +static inline int llapi_create_volatile(char *directory, int open_flags) +{ + return llapi_create_volatile_idx(directory, -1, open_flags); +} + + +int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2, + int gid, __u64 flags); +int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, __u64 flags); +int llapi_swap_layouts(const char *path1, const char *path2, __u64 dv1, + __u64 dv2, __u64 flags); + +/* Changelog interface. 
priv is private state, managed internally by these + * functions */ + +/* Records received are in extended format now, though most of them are still + * written in disk in changelog_rec format (to save space and time), it's + * converted to extended format in the lustre api to ease changelog analysis. + */ +#define HAVE_CHANGELOG_EXTEND_REC 1 + +int llapi_changelog_start(void **priv, enum changelog_send_flag flags, + const char *mdtname, long long startrec); +int llapi_changelog_fini(void **priv); +int llapi_changelog_recv(void *priv, struct changelog_rec **rech); +int llapi_changelog_in_buf(void *priv); +int llapi_changelog_free(struct changelog_rec **rech); +int llapi_changelog_get_fd(void *priv); +/* Allow records up to endrec to be destroyed; requires registered id. */ +int llapi_changelog_clear(const char *mdtname, const char *idstr, + long long endrec); +extern int llapi_changelog_set_xflags(void *priv, + enum changelog_send_extra_flag extra_flags); + +/* HSM copytool interface. + * priv is private state, managed internally by these functions + */ +struct hsm_copytool_private; +struct hsm_copyaction_private; + +int llapi_hsm_copytool_register(struct hsm_copytool_private **priv, + const char *mnt, int archive_count, + int *archives, int rfd_flags); +int llapi_hsm_copytool_unregister(struct hsm_copytool_private **priv); +int llapi_hsm_copytool_get_fd(struct hsm_copytool_private *ct); +int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv, + struct hsm_action_list **hal, int *msgsize); +int llapi_hsm_action_begin(struct hsm_copyaction_private **phcp, + const struct hsm_copytool_private *ct, + const struct hsm_action_item *hai, + int restore_mdt_index, int restore_open_flags, + bool is_error); +int llapi_hsm_action_end(struct hsm_copyaction_private **phcp, + const struct hsm_extent *he, int hp_flags, int errval); +int llapi_hsm_action_progress(struct hsm_copyaction_private *hcp, + const struct hsm_extent *he, __u64 total, + int hp_flags); +int llapi_hsm_action_get_dfid(const struct hsm_copyaction_private *hcp, + struct lu_fid *fid); +int llapi_hsm_action_get_fd(const struct hsm_copyaction_private *hcp); +int llapi_hsm_import(const char *dst, int archive, const struct stat *st, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, char *pool_name, + struct lu_fid *newfid); + +/* HSM user interface */ +struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount, + int data_len); +int llapi_hsm_request(const char *path, const struct hsm_user_request *request); +int llapi_hsm_current_action(const char *path, struct hsm_current_action *hca); + +/* JSON handling */ +enum llapi_json_types { + LLAPI_JSON_INTEGER = 1, + LLAPI_JSON_BIGNUM, + LLAPI_JSON_REAL, + LLAPI_JSON_STRING +}; + +struct llapi_json_item { + char *lji_key; + __u32 lji_type; + union { + int lji_integer; + __u64 lji_u64; + double lji_real; + char *lji_string; + }; + struct llapi_json_item *lji_next; +}; + +struct llapi_json_item_list { + int ljil_item_count; + struct llapi_json_item *ljil_items; +}; + +int llapi_json_init_list(struct llapi_json_item_list **item_list); +int llapi_json_destroy_list(struct llapi_json_item_list **item_list); +int llapi_json_add_item(struct llapi_json_item_list **item_list, char *key, + __u32 type, void *val); +int llapi_json_write_list(struct llapi_json_item_list **item_list, FILE *fp); + +/* File lease */ +int llapi_lease_acquire(int fd, enum ll_lease_mode mode); +int llapi_lease_release(int fd); +int llapi_lease_set(int fd, const struct 
ll_ioc_lease *data); +int llapi_lease_check(int fd); +int llapi_lease_get(int fd, int mode); /* obsoleted */ +int llapi_lease_put(int fd); /* obsoleted */ + +/* Group lock */ +int llapi_group_lock(int fd, int gid); +int llapi_group_unlock(int fd, int gid); +int llapi_group_lock64(int fd, __u64 gid); +int llapi_group_unlock64(int fd, __u64 gid); + +bool llapi_file_is_sparse(int fd); +off_t llapi_data_seek(int src_fd, off_t offset, size_t *length); +int llapi_hole_punch(int fd, off_t start, size_t length); + +/* Ladvise */ +int llapi_ladvise(int fd, unsigned long long flags, int num_advise, + struct llapi_lu_ladvise *ladvise); + +/* PCC */ +int llapi_pcc_attach(const char *path, __u32 id, enum lu_pcc_type type); +int llapi_pcc_attach_fid(const char *mntpath, const struct lu_fid *fid, + __u32 id, enum lu_pcc_type type); +int llapi_pcc_attach_fid_str(const char *mntpath, const char *fidstr, + __u32 id, enum lu_pcc_type type); +int llapi_pcc_detach_fd(int fd, __u32 option); +int llapi_pcc_detach_fid(const char *mntpath, const struct lu_fid *fid, + __u32 option); +int llapi_pcc_detach_fid_str(const char *mntpath, const char *fidstr, + __u32 option); +int llapi_pcc_detach_file(const char *path, __u32 option); +int llapi_pcc_state_get_fd(int fd, struct lu_pcc_state *state); +int llapi_pcc_state_get(const char *path, struct lu_pcc_state *state); +int llapi_pccdev_set(const char *mntpath, const char *cmd); +int llapi_pccdev_get(const char *mntpath); +/** @} llapi */ + +/* llapi_layout user interface */ + +/** + * An array element storing component info to be resynced during mirror + * resynchronization. + */ +struct llapi_resync_comp { + uint64_t lrc_start; + uint64_t lrc_end; + uint32_t lrc_mirror_id; + uint32_t lrc_id; /* component id */ + bool lrc_synced; +}; + +/** Opaque data type abstracting the layout of a Lustre file. */ +struct llapi_layout; + +int llapi_mirror_truncate(int fd, unsigned int id, off_t length); +ssize_t llapi_mirror_write(int fd, unsigned int id, const void *buf, + size_t count, off_t pos); +int llapi_mirror_find(struct llapi_layout *layout, uint64_t file_start, + uint64_t file_end, uint64_t *endp); +int llapi_layout_get_last_init_comp(struct llapi_layout *layout); +int llapi_layout_mirror_inherit(struct llapi_layout *f_layout, + struct llapi_layout *m_layout); +int llapi_mirror_find_stale(struct llapi_layout *layout, + struct llapi_resync_comp *comp, size_t comp_size, + __u16 *mirror_ids, int ids_nr); +int llapi_mirror_resync_many(int fd, struct llapi_layout *layout, + struct llapi_resync_comp *comp_array, + int comp_size, uint64_t start, uint64_t end); +/* + * Flags to control how layouts are retrieved. + */ + +enum llapi_layout_get_flags { + /** Replace non-specified values with expected inherited values. */ + LLAPI_LAYOUT_GET_EXPECTED = 0x0001, + /** Use a temporary buffer to swab and return xattrs. */ + LLAPI_LAYOUT_GET_COPY = 0x0002, + /** Verify xattr contains sane layout values. */ + LLAPI_LAYOUT_GET_CHECK = 0x0004, +}; +/* compatibility macros for old interfaces */ +#define LAYOUT_GET_EXPECTED LLAPI_LAYOUT_GET_EXPECTED +#define LLAPI_LXF_COPY LLAPI_LAYOUT_GET_COPY +#define LLAPI_LXF_CHECK LLAPI_LAYOUT_GET_CHECK + +/** + * Return a pointer to a newly-allocated opaque data structure containing + * the layout for the file at \a path. The pointer should be freed with + * llapi_layout_free() when it is no longer needed. Failure is indicated + * by a NULL return value and an appropriate error code stored in errno. 
+ */ +struct llapi_layout *llapi_layout_get_by_path(const char *path, + enum llapi_layout_get_flags flags); + +/** + * Return a pointer to a newly-allocated opaque data type containing the + * layout for the file referenced by open file descriptor \a fd. The + * pointer should be freed with llapi_layout_free() when it is no longer + * needed. Failure is indicated by a NULL return value and an + * appropriate error code stored in errno. + */ +struct llapi_layout *llapi_layout_get_by_fd(int fd, + enum llapi_layout_get_flags flags); + +/** + * Return a pointer to a newly-allocated opaque data type containing the + * layout for the file associated with Lustre file identifier + * \a fid. The string \a path must name a path within the + * filesystem that contains the file being looked up, such as the + * filesystem root. The returned pointer should be freed with + * llapi_layout_free() when it is no longer needed. Failure is + * indicated with a NULL return value and an appropriate error code + * stored in errno. + */ +struct llapi_layout *llapi_layout_get_by_fid(const char *path, + const struct lu_fid *fid, + enum llapi_layout_get_flags flags); + +/** + * Return a pointer to a newly-allocated opaque data type containing the + * layout for the file associated with extended attribute \a lov_xattr. The + * length of the extended attribute is \a lov_xattr_size. The \a lov_xattr + * should be raw xattr without being swapped, since this function will swap it + * properly. Thus, \a lov_xattr will be modified during the process. If the + * \a LLAPI_LXF_CHECK flag of \a flags is set, this function will check whether + * the objects count in lum is consistent with the stripe count in lum. This + * check only apply to regular file, so \a LLAPI_LAYOUT_GET_CHECK flag should + * be cleared if the xattr belongs to a directory. If the flag \a + * LLAPI_LAYOUT_GET_COPY is set, this function will use a temporary buffer for + * byte swapping when necessary, leaving \a lov_xattr untouched. Otherwise, the + * byte swapping will be done to the \a lov_xattr buffer directly. The returned + * pointer should be freed with llapi_layout_free() when it is no longer + * needed. Failure is * indicated with a NULL return value and an appropriate + * error code stored in errno. + */ +struct llapi_layout *llapi_layout_get_by_xattr(void *lov_xattr, + ssize_t lov_xattr_size, + enum llapi_layout_get_flags flags); + +/** + * Allocate a new layout. Use this when creating a new file with + * llapi_layout_file_create(). + */ +struct llapi_layout *llapi_layout_alloc(void); + +/** + * Free memory allocated for \a layout. + */ +void llapi_layout_free(struct llapi_layout *layout); + +/** + * llapi_layout_merge() - Merge a composite layout into another one. + * @dst_layout: Destination composite layout. + * @src_layout: Source composite layout. + * + * This function copies all of the components from @src_layout and + * appends them to @dst_layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_merge(struct llapi_layout **dst_layout, + const struct llapi_layout *src_layout); + +/** Not a valid stripe size, offset, or RAID pattern. */ +#define LLAPI_LAYOUT_INVALID 0x1000000000000001ULL + +/** + * When specified or returned as the value for stripe count, + * stripe size, offset, or RAID pattern, the filesystem-wide + * default behavior will apply. + */ +#define LLAPI_LAYOUT_DEFAULT (LLAPI_LAYOUT_INVALID + 1) + +/** + * When specified or returned as the value for stripe count, all + * available OSTs will be used. 
+ */ +#define LLAPI_LAYOUT_WIDE (LLAPI_LAYOUT_INVALID + 2) + +/** + * When specified as the value for layout pattern, file objects will be + * stored using RAID0. That is, data will be split evenly and without + * redundancy across all OSTs in the layout. + */ +#define LLAPI_LAYOUT_RAID0 0ULL +#define LLAPI_LAYOUT_MDT 2ULL +#define LLAPI_LAYOUT_OVERSTRIPING 4ULL + +/** + * The layout includes a specific set of OSTs on which to allocate. + */ +#define LLAPI_LAYOUT_SPECIFIC 0x2000000000000000ULL + +/** + * A valid ost index should be less than maximum valid OST index (UINT_MAX). + */ +#define LLAPI_LAYOUT_IDX_MAX 0x00000000FFFFFFFFULL + +/** + * Flags to modify how layouts are retrieved. + */ +/******************** Stripe Count ********************/ + +/** + * Store the stripe count of \a layout in \a count. + * + * \retval 0 Success + * \retval -1 Error with status code in errno. + */ +int llapi_layout_stripe_count_get(const struct llapi_layout *layout, + uint64_t *count); + +/** + * Set the stripe count of \a layout to \a count. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_stripe_count_set(struct llapi_layout *layout, uint64_t count); + +/** + * Check if the stripe count \a stripe_count \a is valid. + */ +bool llapi_layout_stripe_count_is_valid(int64_t stripe_count); +/******************** Stripe Size ********************/ + +/** + * Store the stripe size of \a layout in \a size. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_stripe_size_get(const struct llapi_layout *layout, + uint64_t *size); + +/** + * Set the stripe size of \a layout to \a stripe_size. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_stripe_size_set(struct llapi_layout *layout, uint64_t size); + + +/******************** Extension Size ********************/ + +/** + * Store the extension size of \a layout in \a size. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_extension_size_get(const struct llapi_layout *layout, + uint64_t *size); + +/** + * Set the extension size of \a layout to \a stripe_size. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_extension_size_set(struct llapi_layout *layout, uint64_t size); + + +/******************** Stripe Pattern ********************/ + +/** + * Store the stripe pattern of \a layout in \a pattern. + * + * \retval 0 Success. + * \retval -1 Error with status code in errno. + */ +int llapi_layout_pattern_get(const struct llapi_layout *layout, + uint64_t *pattern); + +/** + * Set the stripe pattern of \a layout to \a pattern. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_pattern_set(struct llapi_layout *layout, uint64_t pattern); + +/******************** OST Index ********************/ + +/** + * Store the index of the OST where stripe number \a stripe_number is stored + * in \a index. + * + * An error return value will result from a NULL layout, if \a + * stripe_number is out of range, or if \a layout was not initialized + * with llapi_layout_lookup_by{path,fd,fid}(). + * + * \retval 0 Success + * \retval -1 Invalid argument, errno set to EINVAL. 
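+ *
+ * Illustrative loop printing the OST index of every stripe (assumes a
+ * plain, instantiated layout obtained with llapi_layout_get_by_path();
+ * most error handling is omitted for brevity):
+ *
+ *    uint64_t count = 0, i, ost;
+ *
+ *    llapi_layout_stripe_count_get(layout, &count);
+ *    for (i = 0; i < count; i++)
+ *            if (llapi_layout_ost_index_get(layout, i, &ost) == 0)
+ *                    printf("stripe %ju on OST %ju\n",
+ *                           (uintmax_t)i, (uintmax_t)ost);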
+ */ +int llapi_layout_ost_index_get(const struct llapi_layout *layout, + uint64_t stripe_number, uint64_t *index); + +/** + * Set the OST index associated with stripe number \a stripe_number to + * \a ost_index. + * NB: This is currently supported only for \a stripe_number = 0 and + * other usage will return ENOTSUPP in errno. A NULL \a layout or + * out-of-range \a stripe_number will return EINVAL in errno. + * + * \retval 0 Success. + * \retval -1 Error with errno set to non-zero value. + */ +int llapi_layout_ost_index_set(struct llapi_layout *layout, int stripe_number, + uint64_t index); + +/******************** Pool Name ********************/ + +/** + * Store up to \a pool_name_len characters of the name of the pool of + * OSTs associated with \a layout into the buffer pointed to by + * \a pool_name. + * + * The correct calling form is: + * + * llapi_layout_pool_name_get(layout, pool_name, sizeof(pool_name)); + * + * A pool defines a set of OSTs from which file objects may be + * allocated for a file using \a layout. + * + * On success, the number of bytes stored is returned, excluding the + * terminating '\0' character (zero indicates that \a layout does not + * have an associated OST pool). On error, -1 is returned and errno is + * set appropriately. Possible sources of error include a NULL pointer + * argument or insufficient space in \a dest to store the pool name, + * in which cases errno will be set to EINVAL. + * + * \retval 0+ The number of bytes stored in \a dest. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_pool_name_get(const struct llapi_layout *layout, + char *pool_name, size_t pool_name_len); + +/** + * Set the name of the pool of OSTs from which file objects will be + * allocated to \a pool_name. + * + * If the pool name uses "fsname.pool" notation to qualify the pool name + * with a filesystem name, the "fsname." portion will be silently + * discarded before storing the value. No validation that \a pool_name + * is an existing non-empty pool in filesystem \a fsname will be + * performed. Such validation can be performed by the application if + * desired using the llapi_search_ost() function. The maximum length of + * the stored value is defined by the constant LOV_MAXPOOLNAME. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_pool_name_set(struct llapi_layout *layout, + const char *pool_name); + +/******************** File Creation ********************/ + +/** + * Open an existing file at \a path, or create it with the specified + * \a layout and \a mode. + * + * One access mode and zero or more file creation flags and file status + * flags May be bitwise-or'd in \a open_flags (see open(2)). Return an + * open file descriptor for the file. If \a layout is non-NULL and + * \a path is not on a Lustre filesystem this function will fail and set + * errno to ENOTTY. + * + * An already existing file may be opened with this function, but + * \a layout and \a mode will not be applied to it. Callers requiring a + * guarantee that the opened file is created with the specified + * \a layout and \a mode should use llapi_layout_file_create(). + * + * A NULL \a layout may be specified, in which case the standard Lustre + * behavior for assigning layouts to newly-created files will apply. + * + * \retval 0+ An open file descriptor. + * \retval -1 Error with status code in errno. 
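+ *
+ * Illustrative sketch that creates a file with an explicit four-stripe,
+ * 1 MiB-stripe-size layout (the path, mode and sizes are arbitrary
+ * example values; error handling is omitted):
+ *
+ *    struct llapi_layout *layout = llapi_layout_alloc();
+ *    int fd;
+ *
+ *    llapi_layout_stripe_count_set(layout, 4);
+ *    llapi_layout_stripe_size_set(layout, 1048576);
+ *    fd = llapi_layout_file_open("/mnt/lustre/newfile",
+ *                                O_CREAT | O_WRONLY, 0644, layout);
+ *    llapi_layout_free(layout);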
+ */ +int llapi_layout_file_open(const char *path, int open_flags, mode_t mode, + const struct llapi_layout *layout); + +/** + * Create a new file at \a path with the specified \a layout and \a mode. + * + * One access mode and zero or more file creation flags and file status + * flags May be bitwise-or'd in \a open_flags (see open(2)). Return an + * open file descriptor for the file. If \a layout is non-NULL and + * \a path is not on a Lustre filesystem this function will fail and set + * errno to ENOTTY. + * + * The function call + * + * llapi_layout_file_create(path, open_flags, mode, layout) + * + * shall be equivalent to: + * + * llapi_layout_file_open(path, open_flags|O_CREAT|O_EXCL, mode, layout) + * + * It is an error if \a path specifies an existing file. + * + * A NULL \a layout may be specified, in which the standard Lustre + * behavior for assigning layouts to newly-created files will apply. + * + * \retval 0+ An open file descriptor. + * \retval -1 Error with status code in errno. + */ +int llapi_layout_file_create(const char *path, int open_flags, int mode, + const struct llapi_layout *layout); + +/** + * Set flags to the header of component layout. + */ +int llapi_layout_flags_set(struct llapi_layout *layout, uint32_t flags); +int llapi_layout_flags_get(struct llapi_layout *layout, uint32_t *flags); +const char *llapi_layout_flags_string(uint32_t flags); +__u16 llapi_layout_string_flags(char *string); + +/** + * llapi_layout_mirror_count_get() - Get mirror count from the header of + * a layout. + * @layout: Layout to get mirror count from. + * @count: Returned mirror count value. + * + * This function gets mirror count from the header of a layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_mirror_count_get(struct llapi_layout *layout, + uint16_t *count); + +/** + * llapi_layout_mirror_count_set() - Set mirror count to the header of a layout. + * @layout: Layout to set mirror count in. + * @count: Mirror count value to be set. + * + * This function sets mirror count to the header of a layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_mirror_count_set(struct llapi_layout *layout, + uint16_t count); + +/** + * Fetch the start and end offset of the current layout component. + */ +int llapi_layout_comp_extent_get(const struct llapi_layout *layout, + uint64_t *start, uint64_t *end); +/** + * Set the extent of current layout component. + */ +int llapi_layout_comp_extent_set(struct llapi_layout *layout, + uint64_t start, uint64_t end); + +/* PFL component flags table */ +static const struct comp_flag_name { + enum lov_comp_md_entry_flags cfn_flag; + const char *cfn_name; +} comp_flags_table[] = { + { LCME_FL_INIT, "init" }, + { LCME_FL_STALE, "stale" }, + { LCME_FL_PREF_RW, "prefer" }, + { LCME_FL_OFFLINE, "offline" }, + { LCME_FL_NOSYNC, "nosync" }, + { LCME_FL_EXTENSION, "extension" }, +}; + +/** + * Gets the attribute flags of the current component. + */ +int llapi_layout_comp_flags_get(const struct llapi_layout *layout, + uint32_t *flags); +/** + * Sets the specified flags of the current component leaving other flags as-is. + */ +int llapi_layout_comp_flags_set(struct llapi_layout *layout, uint32_t flags); +/** + * Clears the flags specified in the flags leaving other flags as-is. + */ +int llapi_layout_comp_flags_clear(struct llapi_layout *layout, uint32_t flags); +/** + * Fetches the file-unique component ID of the current layout component. 
+ */ +int llapi_layout_comp_id_get(const struct llapi_layout *layout, uint32_t *id); +/** + * Fetches the mirror ID of the current layout component. + */ +int llapi_layout_mirror_id_get(const struct llapi_layout *layout, uint32_t *id); +/** + * Adds one component to the existing composite or plain layout. + */ +int llapi_layout_comp_add(struct llapi_layout *layout); +/** + * Adds a first component of a mirror to the existing composite layout. + */ +int llapi_layout_add_first_comp(struct llapi_layout *layout); +/** + * Deletes the current layout component from the composite layout. + */ +int llapi_layout_comp_del(struct llapi_layout *layout); + +enum llapi_layout_comp_use { + LLAPI_LAYOUT_COMP_USE_FIRST = 1, + LLAPI_LAYOUT_COMP_USE_LAST = 2, + LLAPI_LAYOUT_COMP_USE_NEXT = 3, + LLAPI_LAYOUT_COMP_USE_PREV = 4, +}; + +/** + * Set the currently active component to the specified component ID. + */ +int llapi_layout_comp_use_id(struct llapi_layout *layout, uint32_t id); +/** + * Select the currently active component at the specified position. + */ +int llapi_layout_comp_use(struct llapi_layout *layout, uint32_t pos); +/** + * Add layout components to an existing file. + */ +int llapi_layout_file_comp_add(const char *path, + const struct llapi_layout *layout); +/** + * Delete component(s) by the specified component id or flags. + */ +int llapi_layout_file_comp_del(const char *path, uint32_t id, uint32_t flags); +/** + * Change flags or other parameters of the component(s) by component ID of an + * existing file. The component to be modified is specified by the + * comp->lcme_id value, which must be an unique component ID. The new + * attributes are passed in by @comp and @valid is used to specify which + * attributes in the component are going to be changed. + */ +int llapi_layout_file_comp_set(const char *path, uint32_t *ids, uint32_t *flags, + size_t count); +/** + * Check if the file layout is composite. + */ +bool llapi_layout_is_composite(struct llapi_layout *layout); + +enum { + LLAPI_LAYOUT_ITER_CONT = 0, + LLAPI_LAYOUT_ITER_STOP = 1, +}; + +/** + * Iteration callback function. 
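+ *
+ * An illustrative callback that merely counts components and lets the
+ * iteration proceed (the names are placeholders; the return value
+ * semantics are listed below):
+ *
+ *    static int count_comp_cb(struct llapi_layout *layout, void *cbdata)
+ *    {
+ *            int *nr = cbdata;
+ *
+ *            (*nr)++;
+ *            return LLAPI_LAYOUT_ITER_CONT;
+ *    }
+ *
+ * It would then be passed to the iterator declared below as
+ * llapi_layout_comp_iterate(layout, count_comp_cb, &nr).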
+ * + * \retval LLAPI_LAYOUT_ITER_CONT Iteration proceeds + * \retval LLAPI_LAYOUT_ITER_STOP Stop iteration + * \retval < 0 error code + */ +typedef int (*llapi_layout_iter_cb)(struct llapi_layout *layout, void *cbdata); + +/** + * Iterate all components in the corresponding layout + */ +int llapi_layout_comp_iterate(struct llapi_layout *layout, + llapi_layout_iter_cb cb, void *cbdata); + +/** + * FLR: mirror operation APIs + */ +int llapi_mirror_set(int fd, unsigned int id); +int llapi_mirror_clear(int fd); +ssize_t llapi_mirror_read(int fd, unsigned int id, + void *buf, size_t count, off_t pos); +ssize_t llapi_mirror_copy_many(int fd, __u16 src, __u16 *dst, size_t count); +int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst, + off_t pos, size_t count); +off_t llapi_mirror_data_seek(int fd, unsigned int id, off_t pos, size_t *size); +int llapi_mirror_punch(int fd, unsigned int id, off_t start, size_t length); + +int llapi_heat_get(int fd, struct lu_heat *heat); +int llapi_heat_set(int fd, __u64 flags); + +int llapi_layout_sanity(struct llapi_layout *layout, bool incomplete, bool flr); +void llapi_layout_sanity_perror(int error); +int llapi_layout_dom_size(struct llapi_layout *layout, uint64_t *size); + +int llapi_param_get_paths(const char *pattern, glob_t *paths); +int llapi_param_get_value(const char *path, char **buf, size_t *buflen); +void llapi_param_paths_free(glob_t *paths); + +/* MDLL */ +int llapi_dir_open_pool(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, char *pool_name); + +void llapi_hsm_action_begin_restore_dir(struct hsm_copytool_private *ct); + +/** @} llapi */ + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_acl.h b/drivers/staging/lustrefsx/lustre/include/lustre_acl.h new file mode 100644 index 0000000000000..166e1bd10994a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_acl.h @@ -0,0 +1,51 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_acl.h + */ + +#ifndef _LUSTRE_ACL_H +#define _LUSTRE_ACL_H + +#include +#include +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +# include +# define LUSTRE_POSIX_ACL_MAX_ENTRIES 32 +# define LUSTRE_POSIX_ACL_MAX_SIZE_OLD \ + (sizeof(posix_acl_xattr_header) + \ + LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(posix_acl_xattr_entry)) +#endif /* CONFIG_LUSTRE_FS_POSIX_ACL */ + +#ifndef LUSTRE_POSIX_ACL_MAX_SIZE_OLD +# define LUSTRE_POSIX_ACL_MAX_SIZE_OLD 0 +#endif /* LUSTRE_POSIX_ACL_MAX_SIZE */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h b/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h new file mode 100644 index 0000000000000..df6f78bb4b29b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h @@ -0,0 +1,44 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + * + * lustre/include/lustre_barrier.h + * + * Lustre write barrier (on MDT) exported functions. + * + * Author: Fan, Yong + */ + +#ifndef _LUSTRE_BARRIER_H +# define _LUSTRE_BARRIER_H + +#include +#include + +bool barrier_entry(struct dt_device *key); +void barrier_exit(struct dt_device *key); +int barrier_handler(struct dt_device *key, struct ptlrpc_request *req); +int barrier_register(struct dt_device *key, struct dt_device *next); +void barrier_deregister(struct dt_device *key); + +#endif /* _LUSTRE_BARRIER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h new file mode 100644 index 0000000000000..3b269d4fd1d33 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -0,0 +1,637 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _LUSTRE_COMPAT_H +#define _LUSTRE_COMPAT_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_4ARGS_VFS_SYMLINK +#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \ + vfs_symlink(dir, dentry, path, mode) +#else +#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \ + vfs_symlink(dir, dentry, path) +#endif + +#ifdef HAVE_BVEC_ITER +#define bio_idx(bio) (bio->bi_iter.bi_idx) +#define bio_set_sector(bio, sector) (bio->bi_iter.bi_sector = sector) +#define bvl_to_page(bvl) (bvl->bv_page) +#else +#define bio_idx(bio) (bio->bi_idx) +#define bio_set_sector(bio, sector) (bio->bi_sector = sector) +#define bio_sectors(bio) ((bio)->bi_size >> 9) +#define bvl_to_page(bvl) (bvl->bv_page) +#endif + +#ifdef HAVE_BVEC_ITER +#define bio_start_sector(bio) (bio->bi_iter.bi_sector) +#else +#define bio_start_sector(bio) (bio->bi_sector) +#endif + +#ifndef HAVE_DENTRY_D_CHILD +#define d_child d_u.d_child +#endif + +#ifdef HAVE_DENTRY_D_U_D_ALIAS +#define d_alias d_u.d_alias +#endif + +#ifndef HAVE_D_IN_LOOKUP +static inline int d_in_lookup(struct dentry *dentry) +{ + return false; +} +#endif + +#ifndef HAVE_VM_FAULT_T +#define vm_fault_t int +#endif + +#ifndef HAVE_FOP_ITERATE_SHARED +#define iterate_shared iterate +#endif + +#ifdef HAVE_OLDSIZE_TRUNCATE_PAGECACHE +#define ll_truncate_pagecache(inode, size) truncate_pagecache(inode, 0, size) +#else +#define ll_truncate_pagecache(inode, size) truncate_pagecache(inode, size) +#endif + +#ifdef HAVE_VFS_RENAME_5ARGS +#define ll_vfs_rename(a, b, c, d) vfs_rename(a, b, c, d, NULL) +#elif defined HAVE_VFS_RENAME_6ARGS +#define ll_vfs_rename(a, b, c, d) vfs_rename(a, b, c, d, NULL, 0) +#else +#define ll_vfs_rename(a, b, c, d) vfs_rename(a, b, c, d) +#endif + +#ifdef HAVE_USER_NAMESPACE_ARG +#define vfs_unlink(ns, dir, de) vfs_unlink(ns, dir, de, NULL) +#elif defined HAVE_VFS_UNLINK_3ARGS +#define vfs_unlink(ns, dir, de) vfs_unlink(dir, de, NULL) +#else +#define vfs_unlink(ns, dir, de) vfs_unlink(dir, de) +#endif + +static inline int ll_vfs_getattr(struct path *path, struct kstat *st, + u32 request_mask, unsigned int flags) +{ + int rc; + +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR) + rc = vfs_getattr(path, st, request_mask, flags); +#else + rc = vfs_getattr(path, st); +#endif + return rc; +} + +#ifndef HAVE_D_IS_POSITIVE +static inline bool d_is_positive(const struct dentry *dentry) +{ + return dentry->d_inode != NULL; +} +#endif + +#ifndef HAVE_INODE_LOCK +# define inode_lock(inode) mutex_lock(&(inode)->i_mutex) +# define inode_unlock(inode) mutex_unlock(&(inode)->i_mutex) +# define inode_trylock(inode) mutex_trylock(&(inode)->i_mutex) +#endif + +/* Old kernels lacked both Xarray support and the page cache + * using Xarrays. Our back ported Xarray support introduces + * the real xa_is_value() but we need a wrapper as well for + * the page cache interaction. Lets keep xa_is_value() separate + * in old kernels for Xarray support and page cache handling. 
+ */ +#ifndef HAVE_XARRAY_SUPPORT +static inline bool ll_xa_is_value(void *entry) +{ + return radix_tree_exceptional_entry(entry); +} +#else +#define ll_xa_is_value xa_is_value +#endif + +#ifndef HAVE_TRUNCATE_INODE_PAGES_FINAL +static inline void truncate_inode_pages_final(struct address_space *map) +{ + truncate_inode_pages(map, 0); +} +#endif + +#ifndef HAVE_PTR_ERR_OR_ZERO +static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) +{ + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + else + return 0; +} +#endif + +#ifdef HAVE_PID_NS_FOR_CHILDREN +# define ll_task_pid_ns(task) \ + ((task)->nsproxy ? ((task)->nsproxy->pid_ns_for_children) : NULL) +#else +# define ll_task_pid_ns(task) \ + ((task)->nsproxy ? ((task)->nsproxy->pid_ns) : NULL) +#endif + +#ifdef HAVE_FULL_NAME_HASH_3ARGS +# define ll_full_name_hash(salt, name, len) full_name_hash(salt, name, len) +#else +# define ll_full_name_hash(salt, name, len) full_name_hash(name, len) +#endif + +#ifdef HAVE_STRUCT_POSIX_ACL_XATTR +# define posix_acl_xattr_header struct posix_acl_xattr_header +# define posix_acl_xattr_entry struct posix_acl_xattr_entry +# define GET_POSIX_ACL_XATTR_ENTRY(head) ((void *)((head) + 1)) +#else +# define GET_POSIX_ACL_XATTR_ENTRY(head) ((head)->a_entries) +#endif + +#ifdef HAVE_IOP_XATTR +#define ll_setxattr generic_setxattr +#define ll_getxattr generic_getxattr +#define ll_removexattr generic_removexattr +#endif /* HAVE_IOP_XATTR */ + +#ifndef HAVE_POSIX_ACL_VALID_USER_NS +#define posix_acl_valid(a,b) posix_acl_valid(b) +#endif + +#ifdef HAVE_IOP_SET_ACL +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +#if !defined(HAVE_USER_NAMESPACE_ARG) && !defined(HAVE_POSIX_ACL_UPDATE_MODE) +static inline int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} +#endif /* HAVE_POSIX_ACL_UPDATE_MODE */ +#endif +#endif + +#ifndef HAVE_IOV_ITER_TRUNCATE +static inline void iov_iter_truncate(struct iov_iter *i, u64 count) +{ + if (i->count > count) + i->count = count; +} +#endif + +/* + * mount MS_* flags split from superblock SB_* flags + * if the SB_* flags are not available use the MS_* flags + */ +#if !defined(SB_RDONLY) && defined(MS_RDONLY) +# define SB_RDONLY MS_RDONLY +#endif +#if !defined(SB_ACTIVE) && defined(MS_ACTIVE) +# define SB_ACTIVE MS_ACTIVE +#endif +#if !defined(SB_NOSEC) && defined(MS_NOSEC) +# define SB_NOSEC MS_NOSEC +#endif +#if !defined(SB_POSIXACL) && defined(MS_POSIXACL) +# define SB_POSIXACL MS_POSIXACL +#endif +#if !defined(SB_NODIRATIME) && defined(MS_NODIRATIME) +# define SB_NODIRATIME MS_NODIRATIME +#endif + +#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) +{ + i->count = count; +} + +static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) +{ + return (struct iovec) { + .iov_base = iter->iov->iov_base + iter->iov_offset, + .iov_len = min(iter->count, + iter->iov->iov_len - iter->iov_offset), + }; +} + +#define iov_for_each(iov, iter, start) \ + for (iter = (start); \ + (iter).count && ((iov = iov_iter_iovec(&(iter))), 1); \ + iov_iter_advance(&(iter), (iov).iov_len)) + +static inline ssize_t +generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct iovec iov; + struct 
iov_iter i; + ssize_t bytes = 0; + + iov_for_each(iov, i, *iter) { + ssize_t res; + + res = generic_file_aio_read(iocb, &iov, 1, iocb->ki_pos); + if (res <= 0) { + if (bytes == 0) + bytes = res; + break; + } + + bytes += res; + if (res < iov.iov_len) + break; + } + + if (bytes > 0) + iov_iter_advance(iter, bytes); + return bytes; +} + +static inline ssize_t +__generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct iovec iov; + struct iov_iter i; + ssize_t bytes = 0; + + /* Since LLITE updates file size at the end of I/O in + * vvp_io_commit_write(), append write has to be done in atomic when + * there are multiple segments because otherwise each iteration to + * __generic_file_aio_write() will see original file size */ + if (unlikely(iocb->ki_filp->f_flags & O_APPEND && iter->nr_segs > 1)) { + struct iovec *iov_copy; + int count = 0; + + OBD_ALLOC_PTR_ARRAY(iov_copy, iter->nr_segs); + if (!iov_copy) + return -ENOMEM; + + iov_for_each(iov, i, *iter) + iov_copy[count++] = iov; + + bytes = __generic_file_aio_write(iocb, iov_copy, count, + &iocb->ki_pos); + OBD_FREE_PTR_ARRAY(iov_copy, iter->nr_segs); + + if (bytes > 0) + iov_iter_advance(iter, bytes); + return bytes; + } + + iov_for_each(iov, i, *iter) { + ssize_t res; + + res = __generic_file_aio_write(iocb, &iov, 1, &iocb->ki_pos); + if (res <= 0) { + if (bytes == 0) + bytes = res; + break; + } + + bytes += res; + if (res < iov.iov_len) + break; + } + + if (bytes > 0) + iov_iter_advance(iter, bytes); + return bytes; +} +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + +static inline void __user *get_vmf_address(struct vm_fault *vmf) +{ +#ifdef HAVE_VM_FAULT_ADDRESS + return (void __user *)vmf->address; +#else + return vmf->virtual_address; +#endif +} + +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY +# define __ll_filemap_fault(vma, vmf) filemap_fault(vmf) +#else +# define __ll_filemap_fault(vma, vmf) filemap_fault(vma, vmf) +#endif + +#ifndef HAVE_CURRENT_TIME +static inline struct timespec current_time(struct inode *inode) +{ + return CURRENT_TIME; +} +#endif + +#ifndef time_after32 +/** + * time_after32 - compare two 32-bit relative times + * @a: the time which may be after @b + * @b: the time which may be before @a + * + * Needed for kernels earlier than v4.14-rc1~134^2 + * + * time_after32(a, b) returns true if the time @a is after time @b. + * time_before32(b, a) returns true if the time @b is before time @a. + * + * Similar to time_after(), compare two 32-bit timestamps for relative + * times. This is useful for comparing 32-bit seconds values that can't + * be converted to 64-bit values (e.g. due to disk format or wire protocol + * issues) when it is known that the times are less than 68 years apart. 
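+ *
+ * A worked example of the wrap-around behaviour (values chosen purely
+ * for illustration): with a = 10 and b = 0xFFFFFFF0, (u32)(b) - (u32)(a)
+ * is 0xFFFFFFE6, which is negative when read as an s32, so
+ * time_after32(10, 0xFFFFFFF0) is true, i.e. the time 10 is treated as
+ * coming after 0xFFFFFFF0 once the 32-bit counter has wrapped.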
+ */ +#define time_after32(a, b) ((s32)((u32)(b) - (u32)(a)) < 0) +#define time_before32(b, a) time_after32(a, b) + +#endif + +#ifndef smp_store_mb +#define smp_store_mb(var, value) set_mb(var, value) +#endif + +#ifdef HAVE_PAGEVEC_INIT_ONE_PARAM +#define ll_pagevec_init(pvec, n) pagevec_init(pvec) +#else +#define ll_pagevec_init(pvec, n) pagevec_init(pvec, n) +#endif + +#ifdef HAVE_D_COUNT +# define ll_d_count(d) d_count(d) +#else +# define ll_d_count(d) ((d)->d_count) +#endif /* HAVE_D_COUNT */ + +#ifndef HAVE_IN_COMPAT_SYSCALL +#define in_compat_syscall is_compat_task +#endif + +#ifdef HAVE_I_PAGES +#define page_tree i_pages +#define ll_xa_lock_irqsave(lockp, flags) xa_lock_irqsave(lockp, flags) +#define ll_xa_unlock_irqrestore(lockp, flags) xa_unlock_irqrestore(lockp, flags) +#else +#define i_pages tree_lock +#define ll_xa_lock_irqsave(lockp, flags) spin_lock_irqsave(lockp, flags) +#define ll_xa_unlock_irqrestore(lockp, flags) spin_unlock_irqrestore(lockp, flags) +#endif + +/* Linux commit v5.15-12273-gab2f9d2d3626 + * mm: unexport {,un}lock_page_memcg + * + * Note that the functions are still defined or declared breaking + * the simple approach of just defining the missing functions here + */ +#ifdef HAVE_LOCK_PAGE_MEMCG +#define vvp_lock_page_memcg(page) lock_page_memcg((page)) +#define vvp_unlock_page_memcg(page) unlock_page_memcg((page)) +#else +#define vvp_lock_page_memcg(page) +#define vvp_unlock_page_memcg(page) +#endif + +#ifndef KMEM_CACHE_USERCOPY +#define kmem_cache_create_usercopy(name, size, align, flags, useroffset, \ + usersize, ctor) \ + kmem_cache_create(name, size, align, flags, ctor) +#endif + +static inline bool ll_security_xattr_wanted(struct inode *in) +{ +#ifdef CONFIG_SECURITY + return in->i_security && in->i_sb->s_security; +#else + return false; +#endif +} + +static inline int ll_vfs_getxattr(struct dentry *dentry, struct inode *inode, + const char *name, + void *value, size_t size) +{ +#ifdef HAVE_USER_NAMESPACE_ARG + return vfs_getxattr(&init_user_ns, dentry, name, value, size); +#elif defined(HAVE_VFS_SETXATTR) + return __vfs_getxattr(dentry, inode, name, value, size); +#else + if (unlikely(!inode->i_op->getxattr)) + return -ENODATA; + + return inode->i_op->getxattr(dentry, name, value, size); +#endif +} + +static inline int ll_vfs_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, + const void *value, size_t size, int flags) +{ +#ifdef HAVE_USER_NAMESPACE_ARG + return vfs_setxattr(&init_user_ns, dentry, name, + VFS_SETXATTR_VALUE(value), size, flags); +#elif defined(HAVE_VFS_SETXATTR) + return __vfs_setxattr(dentry, inode, name, value, size, flags); +#else + if (unlikely(!inode->i_op->setxattr)) + return -EOPNOTSUPP; + + return inode->i_op->setxattr(dentry, name, value, size, flags); +#endif +} + +static inline int ll_vfs_removexattr(struct dentry *dentry, struct inode *inode, + const char *name) +{ +#ifdef HAVE_USER_NAMESPACE_ARG + return vfs_removexattr(&init_user_ns, dentry, name); +#elif defined(HAVE_VFS_SETXATTR) + return __vfs_removexattr(dentry, name); +#else + if (unlikely(!inode->i_op->setxattr)) + return -EOPNOTSUPP; + + return inode->i_op->removexattr(dentry, name); +#endif +} + +#ifndef FALLOC_FL_COLLAPSE_RANGE +#define FALLOC_FL_COLLAPSE_RANGE 0x08 /* remove a range of a file */ +#endif + +#ifndef FALLOC_FL_ZERO_RANGE +#define FALLOC_FL_ZERO_RANGE 0x10 /* convert range to zeros */ +#endif + +#ifndef FALLOC_FL_INSERT_RANGE +#define FALLOC_FL_INSERT_RANGE 0x20 /* insert space within file */ +#endif + +#ifndef raw_cpu_ptr 
+#define raw_cpu_ptr(p) __this_cpu_ptr(p) +#endif + +#ifndef HAVE_IS_ROOT_INODE +static inline bool is_root_inode(struct inode *inode) +{ + return inode == inode->i_sb->s_root->d_inode; +} +#endif + +#ifndef HAVE_IOV_ITER_GET_PAGES_ALLOC2 +#define iov_iter_get_pages_alloc2(i, p, m, s) \ + iov_iter_get_pages_alloc((i), (p), (m), (s)) +#endif + +#ifdef HAVE_AOPS_MIGRATE_FOLIO +#define folio_migr folio +#else +#define folio_migr page +#define migrate_folio migratepage +#endif + +#ifdef HAVE_REGISTER_SHRINKER_FORMAT_NAMED +#define register_shrinker(_s) register_shrinker((_s), "%ps", (_s)) +#elif !defined(HAVE_REGISTER_SHRINKER_RET) +#define register_shrinker(_s) (register_shrinker(_s), 0) +#endif + +#ifndef fallthrough +# if defined(__GNUC__) && __GNUC__ >= 7 +# define fallthrough __attribute__((fallthrough)) /* fallthrough */ +# else +# define fallthrough do {} while (0) /* fallthrough */ +# endif +#endif + +#ifdef HAVE_SEC_RELEASE_SECCTX_1ARG +#ifndef HAVE_LSMCONTEXT_INIT +/* Ubuntu 5.19 */ +static inline void lsmcontext_init(struct lsmcontext *cp, char *context, + u32 size, int slot) +{ + cp->slot = slot; + cp->context = context; + cp->len = size; +} +#endif +#endif + +static inline void ll_security_release_secctx(char *secdata, u32 seclen, + int slot) +{ +#ifdef HAVE_SEC_RELEASE_SECCTX_1ARG + struct lsmcontext context = { }; + + lsmcontext_init(&context, secdata, seclen, slot); + return security_release_secctx(&context); +#else + return security_release_secctx(secdata, seclen); +#endif +} + +#ifndef HAVE_USER_NAMESPACE_ARG +#define posix_acl_update_mode(ns, inode, mode, acl) \ + posix_acl_update_mode(inode, mode, acl) +#define notify_change(ns, de, attr, inode) notify_change(de, attr, inode) +#define inode_owner_or_capable(ns, inode) inode_owner_or_capable(inode) +#define vfs_create(ns, dir, de, mode, ex) vfs_create(dir, de, mode, ex) +#define vfs_mkdir(ns, dir, de, mode) vfs_mkdir(dir, de, mode) +#define ll_set_acl(ns, inode, acl, type) ll_set_acl(inode, acl, type) +#endif + +/** + * delete_from_page_cache is not exported anymore + */ +#ifdef HAVE_DELETE_FROM_PAGE_CACHE +#define cfs_delete_from_page_cache(page) delete_from_page_cache((page)) +#else +static inline void cfs_delete_from_page_cache(struct page *page) +{ + if (!page->mapping) + return; + LASSERT(PageLocked(page)); + get_page(page); + unlock_page(page); + /* on entry page is locked */ + if (S_ISREG(page->mapping->host->i_mode)) { + generic_error_remove_page(page->mapping, page); + } else { + loff_t lstart = page->index << PAGE_SHIFT; + loff_t lend = lstart + PAGE_SIZE - 1; + + truncate_inode_pages_range(page->mapping, lstart, lend); + } + lock_page(page); + put_page(page); +} +#endif + +static inline struct page *ll_read_cache_page(struct address_space *mapping, + pgoff_t index, filler_t *filler, + void *data) +{ +#ifdef HAVE_READ_CACHE_PAGE_WANTS_FILE + struct file dummy_file; + + dummy_file.f_ra.ra_pages = 32; /* unused, modified on ra error */ + dummy_file.private_data = data; + return read_cache_page(mapping, index, filler, &dummy_file); +#else + return read_cache_page(mapping, index, filler, data); +#endif /* HAVE_READ_CACHE_PAGE_WANTS_FILE */ +} + +#endif /* _LUSTRE_COMPAT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_crypto.h b/drivers/staging/lustrefsx/lustre/include/lustre_crypto.h new file mode 100644 index 0000000000000..d048470691a4b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_crypto.h @@ -0,0 +1,230 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT 
NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2019, 2020, Whamcloud. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _LUSTRE_CRYPTO_H_ +#define _LUSTRE_CRYPTO_H_ + +#if defined(HAVE_LUSTRE_CRYPTO) && !defined(CONFIG_LL_ENCRYPTION) +#define __FS_HAS_ENCRYPTION 1 +#include + +#define LL_CRYPTO_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE +#define llcrypt_name fscrypt_name +#define llcrypt_str fscrypt_str +#define LLTR_INIT FSTR_INIT +#define llcrypt_operations fscrypt_operations +#define llcrypt_symlink_data fscrypt_symlink_data +#define llcrypt_init() 0 +#define llcrypt_exit() {} +#ifndef HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED +#define llcrypt_context fscrypt_context +#define llcrypt_dummy_context fscrypt_dummy_context +#endif +#define llcrypt_require_key(inode) \ + fscrypt_require_key(inode) +#define llcrypt_has_encryption_key(inode) fscrypt_has_encryption_key(inode) +#define llcrypt_encrypt_pagecache_blocks(page, len, offs, gfp_flags) \ + fscrypt_encrypt_pagecache_blocks(page, len, offs, gfp_flags) +#define llcrypt_decrypt_pagecache_blocks(page, len, offs) \ + fscrypt_decrypt_pagecache_blocks(page, len, offs) +#define llcrypt_decrypt_block_inplace(inode, page, len, offs, lblk_num) \ + fscrypt_decrypt_block_inplace(inode, page, len, offs, lblk_num) +#define llcrypt_inherit_context(parent, child, fs_data, preload) \ + fscrypt_inherit_context(parent, child, fs_data, preload) +#define llcrypt_get_encryption_info(inode) fscrypt_get_encryption_info(inode) +#define llcrypt_put_encryption_info(inode) fscrypt_put_encryption_info(inode) +#define llcrypt_free_inode(inode) fscrypt_free_inode(inode) +#define llcrypt_finalize_bounce_page(pagep) fscrypt_finalize_bounce_page(pagep) +#define llcrypt_file_open(inode, filp) fscrypt_file_open(inode, filp) +#define llcrypt_ioctl_set_policy(filp, arg) fscrypt_ioctl_set_policy(filp, arg) +#define llcrypt_ioctl_get_policy_ex(filp, arg) \ + fscrypt_ioctl_get_policy_ex(filp, arg) +#define llcrypt_policy_has_filename_enc(inode) true +#define llcrypt_ioctl_add_key(filp, arg) fscrypt_ioctl_add_key(filp, arg) +#define llcrypt_ioctl_remove_key(filp, arg) fscrypt_ioctl_remove_key(filp, arg) +#define llcrypt_ioctl_remove_key_all_users(filp, arg) \ + fscrypt_ioctl_remove_key_all_users(filp, arg) +#define llcrypt_ioctl_get_key_status(filp, arg) \ + fscrypt_ioctl_get_key_status(filp, arg) +#define llcrypt_drop_inode(inode) fscrypt_drop_inode(inode) +#define llcrypt_prepare_rename(olddir, olddentry, newdir, newdentry, flags) \ + fscrypt_prepare_rename(olddir, olddentry, newdir, newdentry, flags) +#define llcrypt_prepare_link(old_dentry, dir, dentry) \ + fscrypt_prepare_link(old_dentry, dir, dentry) +#define llcrypt_prepare_setattr(dentry, attr) \ + fscrypt_prepare_setattr(dentry, attr) +#define __llcrypt_prepare_lookup(inode, dentry, 
fname) \ + __fscrypt_prepare_lookup(inode, dentry, fname) +#define llcrypt_set_ops(sb, cop) fscrypt_set_ops(sb, cop) +#define llcrypt_sb_free(sb) {} +#define llcrypt_fname_alloc_buffer(inode, max_encrypted_len, crypto_str) \ + fscrypt_fname_alloc_buffer(inode, max_encrypted_len, crypto_str) +#define llcrypt_fname_disk_to_usr(inode, hash, minor_hash, iname, oname) \ + fscrypt_fname_disk_to_usr(inode, hash, minor_hash, iname, oname) +#define llcrypt_fname_free_buffer(crypto_str) \ + fscrypt_fname_free_buffer(crypto_str) +#define llcrypt_setup_filename(dir, iname, lookup, fname) \ + fscrypt_setup_filename(dir, iname, lookup, fname) +#define llcrypt_free_filename(fname) \ + fscrypt_free_filename(fname) +#define llcrypt_match_name(fname, de_name, name_len) \ + fscrypt_match_name(fname, de_name, name_len) +#define llcrypt_prepare_lookup(dir, dentry, fname) \ + fscrypt_prepare_lookup(dir, dentry, fname) +#define llcrypt_encrypt_symlink(inode, target, len, disk_link) \ + fscrypt_encrypt_symlink(inode, target, len, disk_link) +#define __llcrypt_encrypt_symlink(inode, target, len, disk_link) \ + __fscrypt_encrypt_symlink(inode, target, len, disk_link) +#define llcrypt_prepare_symlink(dir, target, len, max_len, disk_link) \ + fscrypt_prepare_symlink(dir, target, len, max_len, disk_link) +#define llcrypt_get_symlink(inode, caddr, max_size, done) \ + fscrypt_get_symlink(inode, caddr, max_size, done) + +#define LL_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define LL_IOC_GET_ENCRYPTION_POLICY_EX FS_IOC_GET_ENCRYPTION_POLICY_EX +#define LL_IOC_ADD_ENCRYPTION_KEY FS_IOC_ADD_ENCRYPTION_KEY +#define LL_IOC_REMOVE_ENCRYPTION_KEY FS_IOC_REMOVE_ENCRYPTION_KEY +#define LL_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS \ + FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS +#define LL_IOC_GET_ENCRYPTION_KEY_STATUS FS_IOC_GET_ENCRYPTION_KEY_STATUS + +#else /* HAVE_LUSTRE_CRYPTO && !CONFIG_LL_ENCRYPTION */ +#include +#endif /* !HAVE_LUSTRE_CRYPTO || CONFIG_LL_ENCRYPTION */ + +#ifndef DCACHE_NOKEY_NAME +#define DCACHE_NOKEY_NAME 0x02000000 /* Enc name without key */ +#endif + +#if !defined(HAVE_FSCRYPT_IS_NOKEY_NAME) || defined(CONFIG_LL_ENCRYPTION) + +static inline bool llcrypt_is_nokey_name(const struct dentry *dentry) +{ + return dentry->d_flags & DCACHE_NOKEY_NAME; +} +#else +#define llcrypt_is_nokey_name(dentry) \ + fscrypt_is_nokey_name(dentry) +#endif + +#if defined(HAVE_LUSTRE_CRYPTO) && !defined(HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED) +#define llcrypt_show_test_dummy_encryption(seq, sep, sb) \ + fscrypt_show_test_dummy_encryption(seq, sep, sb) +#define llcrypt_set_test_dummy_encryption(sb, arg, ctx) \ + fscrypt_set_test_dummy_encryption(sb, arg, ctx) +#define llcrypt_free_dummy_context(ctx) \ + fscrypt_free_dummy_context(ctx) +#else +#define llcrypt_show_test_dummy_encryption(seq, sep, sb) {} +#define llcrypt_free_dummy_context(ctx) {} +#endif + +/* Macro to extract digest from Lustre specific structures */ +#if defined(HAVE_FSCRYPT_DIGESTED_NAME) && !defined(CONFIG_LL_ENCRYPTION) +#define LLCRYPT_EXTRACT_DIGEST FSCRYPT_FNAME_DIGEST +#else +#define LLCRYPT_EXTRACT_DIGEST(name, len) \ + ((name) + round_down((len) - LL_CRYPTO_BLOCK_SIZE - 1, \ + LL_CRYPTO_BLOCK_SIZE)) +#endif + +struct ll_sb_info; +int ll_set_encflags(struct inode *inode, void *encctx, __u32 encctxlen, + bool preload); +void llcrypt_free_ctx(void *encctx, __u32 size); +bool ll_sb_has_test_dummy_encryption(struct super_block *sb); +bool ll_sbi_has_encrypt(struct ll_sb_info *sbi); +void ll_sbi_set_encrypt(struct ll_sb_info *sbi, bool set); +bool 
ll_sbi_has_name_encrypt(struct ll_sb_info *sbi); +void ll_sbi_set_name_encrypt(struct ll_sb_info *sbi, bool set); +/* sizeof(struct fscrypt_context_v2) = 40 */ +#define LLCRYPT_ENC_CTX_SIZE 40 + +/* Encoding/decoding routines inspired from yEnc principles. + * We just take care of a few critical characters: + * NULL, LF, CR, /, DEL and =. + * If such a char is found, it is replaced with '=' followed by + * the char value + 64. + * All other chars are left untouched. + * Efficiency of this encoding depends on the occurences of the + * critical chars, but statistically on binary data it can be much higher + * than base64 for instance. + */ +static inline int critical_encode(const u8 *src, int len, char *dst) +{ + u8 *p = (u8 *)src, *q = dst; + + while (p - src < len) { + /* escape NULL, LF, CR, /, DEL and = */ + if (unlikely(*p == 0x0 || *p == 0xA || *p == 0xD || + *p == '/' || *p == 0x7F || *p == '=')) { + *(q++) = '='; + *(q++) = *(p++) + 64; + } else { + *(q++) = *(p++); + } + } + + return (char *)q - dst; +} + +/* returns the number of chars encoding would produce */ +static inline int critical_chars(const u8 *src, int len) +{ + u8 *p = (u8 *)src; + int newlen = len; + + while (p - src < len) { + /* NULL, LF, CR, /, DEL and = cost an additional '=' */ + if (unlikely(*p == 0x0 || *p == 0xA || *p == 0xD || + *p == '/' || *p == 0x7F || *p == '=')) + newlen++; + p++; + } + + return newlen; +} + +/* decoding routine - returns the number of chars in output */ +static inline int critical_decode(const u8 *src, int len, char *dst) +{ + u8 *p = (u8 *)src, *q = dst; + + while (p - src < len) { + if (unlikely(*p == '=')) { + *(q++) = *(++p) - 64; + p++; + } else { + *(q++) = *(p++); + } + } + + return (char *)q - dst; +} + +#endif /* _LUSTRE_CRYPTO_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h new file mode 100644 index 0000000000000..23fe796728c8f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h @@ -0,0 +1,383 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_disk.h + * + * Lustre disk format definitions. 
+ * + * Author: Nathan Rutman + */ + +#ifndef _LUSTRE_DISK_H +#define _LUSTRE_DISK_H + +/** \defgroup disk disk + * + * @{ + */ +#include +#include +#include +#include +#include +#include +#include +#if !defined(CONFIG_LL_ENCRYPTION) && defined(HAVE_LUSTRE_CRYPTO) +#include +#endif + +#define IS_MDT(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MDT) +#define IS_OST(data) ((data)->lsi_flags & LDD_F_SV_TYPE_OST) +#define IS_MGS(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MGS) +#define IS_SERVER(data) ((data)->lsi_flags & (LDD_F_SV_TYPE_MGS | \ + LDD_F_SV_TYPE_MDT | \ + LDD_F_SV_TYPE_OST)) +#define MT_STR(data) mt_str((data)->ldd_mount_type) + +/****************** mount command *********************/ + +/* The lmd is only used internally by Lustre; mount simply passes + * everything as string options + */ +#define LMD_MAGIC 0xbdacbd03 +#define LMD_PARAMS_MAXLEN 4096 + +/* gleaned from the mount command - no persistent info here */ +struct lustre_mount_data { + u32 lmd_magic; + u32 lmd_flags; /* lustre mount flags */ + int lmd_mgs_failnodes; /* mgs failover node count */ + int lmd_exclude_count; + int lmd_recovery_time_soft; + int lmd_recovery_time_hard; + char *lmd_dev; /* device name */ + char *lmd_profile; /* client only */ + char *lmd_fileset; /* mount fileset */ + char *lmd_mgssec; /* sptlrpc flavor to mgs */ + char *lmd_opts; /* lustre mount options (as opposed to + * device_ mount options) */ + char *lmd_params; /* lustre params */ + u32 *lmd_exclude; /* array of OSTs to ignore */ + char *lmd_mgs; /* MGS nid */ + char *lmd_osd_type; /* OSD type */ + char *lmd_nidnet; /* network to restrict this client to */ +}; + +#define LMD_FLG_SERVER 0x0001 /* Mounting a server */ +#define LMD_FLG_CLIENT 0x0002 /* Mounting a client */ +#define LMD_FLG_SKIP_LFSCK 0x0004 /* NOT auto resume LFSCK when mount */ +#define LMD_FLG_ABORT_RECOV 0x0008 /* Abort recovery */ +#define LMD_FLG_NOSVC 0x0010 /* Only start MGS/MGC for servers, + no other services */ +#define LMD_FLG_NOMGS 0x0020 /* Only start target for servers, reusing + existing MGS services */ +#define LMD_FLG_WRITECONF 0x0040 /* Rewrite config log */ +#define LMD_FLG_NOIR 0x0080 /* NO imperative recovery */ +#define LMD_FLG_NOSCRUB 0x0100 /* Do not trigger scrub automatically */ +#define LMD_FLG_MGS 0x0200 /* Also start MGS along with server */ +#define LMD_FLG_IAM 0x0400 /* IAM dir */ +#define LMD_FLG_NO_PRIMNODE 0x0800 /* all nodes are service nodes */ +#define LMD_FLG_VIRGIN 0x1000 /* the service registers first time */ +#define LMD_FLG_UPDATE 0x2000 /* update parameters */ +#define LMD_FLG_HSM 0x4000 /* Start coordinator */ +#define LMD_FLG_DEV_RDONLY 0x8000 /* discard modification quitely */ +#define LMD_FLG_NO_PRECREATE 0x10000 /* do not allow OST object creation */ +#define LMD_FLG_LOCAL_RECOV 0x20000 /* force recovery for local clients */ +#define LMD_FLG_ABORT_RECOV_MDT 0x40000 /* Abort recovery between MDTs */ +#define LMD_FLG_NO_LOCAL_LOGS 0x80000 /* Use config logs from MGS */ + +#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT) + +/****************** superblock additional info *********************/ +struct ll_sb_info; +struct kobject; + +struct lustre_sb_info { + int lsi_flags; + struct obd_device *lsi_mgc; /* mgc obd */ + struct lustre_mount_data *lsi_lmd; /* mount command info */ + struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */ + struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/ + atomic_t lsi_mounts; /* references to the srv_mnt */ + struct kobject *lsi_kobj; + char lsi_svname[MTI_NAME_MAXLEN]; + /* 
lsi_osd_obdname format = 'lsi->ls_svname'-osd */ + char lsi_osd_obdname[MTI_NAME_MAXLEN + 4]; + /* lsi_osd_uuid format = 'lsi->ls_osd_obdname'_UUID */ + char lsi_osd_uuid[MTI_NAME_MAXLEN + 9]; + struct obd_export *lsi_osd_exp; + char lsi_osd_type[16]; + char lsi_fstype[16]; + struct backing_dev_info lsi_bdi; /* each client mountpoint needs + own backing_dev_info */ + /* protect lsi_lwp_list */ + struct mutex lsi_lwp_mutex; + struct list_head lsi_lwp_list; + unsigned long lsi_lwp_started:1, + lsi_server_started:1; +#ifdef CONFIG_LL_ENCRYPTION + const struct llcrypt_operations *lsi_cop; + struct key *lsi_master_keys; /* master crypto keys used */ +#elif defined(HAVE_LUSTRE_CRYPTO) && !defined(HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED) + /* Encryption context for '-o test_dummy_encryption' */ + struct llcrypt_dummy_context lsi_dummy_enc_ctx; +#endif +}; + +#define LSI_UMOUNT_FAILOVER 0x00200000 +#ifndef HAVE_SUPER_SETUP_BDI_NAME +#define LSI_BDI_INITIALIZED 0x00400000 +#endif +#define LSI_FILENAME_ENC 0x00800000 /* enable name encryption */ +#define LSI_FILENAME_ENC_B64_OLD_CLI 0x01000000 /* use old style base64 */ + +#define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info)) +#define s2lsi_nocast(sb) ((sb)->s_fs_info) + +#define get_profile_name(sb) (s2lsi(sb)->lsi_lmd->lmd_profile) +#define get_mount_fileset(sb) (s2lsi(sb)->lsi_lmd->lmd_fileset) + +# ifdef HAVE_SERVER_SUPPORT +/* opc for target register */ +#define LDD_F_OPC_REG 0x10000000 +#define LDD_F_OPC_UNREG 0x20000000 +#define LDD_F_OPC_READY 0x40000000 +#define LDD_F_OPC_MASK 0xf0000000 + +#define LDD_F_MASK 0xFFFF + +/* + * This limit is arbitrary (131072 clients on x86), but it is convenient to use + * 2^n * PAGE_SIZE * 8 for the number of bits that fit an order-n allocation. + * If we need more than 131072 clients (order-2 allocation on x86) then this + * should become an array of single-page pointers that are allocated on demand. + */ +#if (128 * 1024UL) > (PAGE_SIZE * 8) +#define LR_MAX_CLIENTS (128 * 1024UL) +#else +#define LR_MAX_CLIENTS (PAGE_SIZE * 8) +#endif + +/** COMPAT_146: this is an OST (temporary) */ +#define OBD_COMPAT_OST 0x00000002 +/** COMPAT_146: this is an MDT (temporary) */ +#define OBD_COMPAT_MDT 0x00000004 +/** 2.0 server, interop flag to show server version is changed */ +#define OBD_COMPAT_20 0x00000008 + +/** MDS handles LOV_OBJID file */ +#define OBD_ROCOMPAT_LOVOBJID 0x00000001 +/** store OST index in the IDIF */ +#define OBD_ROCOMPAT_IDX_IN_IDIF 0x00000002 + +/** OST handles group subdirs */ +#define OBD_INCOMPAT_GROUPS 0x00000001 +/** this is an OST */ +#define OBD_INCOMPAT_OST 0x00000002 +/** this is an MDT */ +#define OBD_INCOMPAT_MDT 0x00000004 +/** common last_rvcd format */ +#define OBD_INCOMPAT_COMMON_LR 0x00000008 +/** FID is enabled */ +#define OBD_INCOMPAT_FID 0x00000010 +/** Size-on-MDS is enabled */ +#define OBD_INCOMPAT_SOM 0x00000020 +/** filesystem using iam format to store directory entries */ +#define OBD_INCOMPAT_IAM_DIR 0x00000040 +/** LMA attribute contains per-inode incompatible flags */ +#define OBD_INCOMPAT_LMA 0x00000080 +/** lmm_stripe_count has been shrunk from u32 to u16 and the remaining 16 + * bits are now used to store a generation. 
Once we start changing the layout + * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count + * will be confused by interpreting stripe_count | gen << 16 as the actual + * stripe count */ +#define OBD_INCOMPAT_LMM_VER 0x00000100 +/** multiple OI files for MDT */ +#define OBD_INCOMPAT_MULTI_OI 0x00000200 +/** multiple RPCs in flight */ +#define OBD_INCOMPAT_MULTI_RPCS 0x00000400 + +/* last_rcvd handling */ +static inline void lsd_le_to_cpu(struct lr_server_data *buf, + struct lr_server_data *lsd) +{ + int i; + + memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid)); + lsd->lsd_last_transno = le64_to_cpu(buf->lsd_last_transno); + lsd->lsd_compat14 = le64_to_cpu(buf->lsd_compat14); + lsd->lsd_mount_count = le64_to_cpu(buf->lsd_mount_count); + lsd->lsd_feature_compat = le32_to_cpu(buf->lsd_feature_compat); + lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat); + lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat); + lsd->lsd_server_size = le32_to_cpu(buf->lsd_server_size); + lsd->lsd_client_start = le32_to_cpu(buf->lsd_client_start); + lsd->lsd_client_size = le16_to_cpu(buf->lsd_client_size); + lsd->lsd_subdir_count = le16_to_cpu(buf->lsd_subdir_count); + lsd->lsd_catalog_oid = le64_to_cpu(buf->lsd_catalog_oid); + lsd->lsd_catalog_ogen = le32_to_cpu(buf->lsd_catalog_ogen); + memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid)); + lsd->lsd_osd_index = le32_to_cpu(buf->lsd_osd_index); + lsd->lsd_padding1 = le32_to_cpu(buf->lsd_padding1); + lsd->lsd_start_epoch = le32_to_cpu(buf->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; i++) + lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]); + lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time); + lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals); +} + +static inline void lsd_cpu_to_le(struct lr_server_data *lsd, + struct lr_server_data *buf) +{ + int i; + + memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid)); + buf->lsd_last_transno = cpu_to_le64(lsd->lsd_last_transno); + buf->lsd_compat14 = cpu_to_le64(lsd->lsd_compat14); + buf->lsd_mount_count = cpu_to_le64(lsd->lsd_mount_count); + buf->lsd_feature_compat = cpu_to_le32(lsd->lsd_feature_compat); + buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat); + buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat); + buf->lsd_server_size = cpu_to_le32(lsd->lsd_server_size); + buf->lsd_client_start = cpu_to_le32(lsd->lsd_client_start); + buf->lsd_client_size = cpu_to_le16(lsd->lsd_client_size); + buf->lsd_subdir_count = cpu_to_le16(lsd->lsd_subdir_count); + buf->lsd_catalog_oid = cpu_to_le64(lsd->lsd_catalog_oid); + buf->lsd_catalog_ogen = cpu_to_le32(lsd->lsd_catalog_ogen); + memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid)); + buf->lsd_osd_index = cpu_to_le32(lsd->lsd_osd_index); + buf->lsd_padding1 = cpu_to_le32(lsd->lsd_padding1); + buf->lsd_start_epoch = cpu_to_le32(lsd->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; i++) + buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]); + buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time); + buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals); +} + +static inline void lcd_le_to_cpu(struct lsd_client_data *buf, + struct lsd_client_data *lcd) +{ + memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof (lcd->lcd_uuid)); + lcd->lcd_last_transno = le64_to_cpu(buf->lcd_last_transno); + lcd->lcd_last_xid = le64_to_cpu(buf->lcd_last_xid); + 
lcd->lcd_last_result = le32_to_cpu(buf->lcd_last_result); + lcd->lcd_last_data = le32_to_cpu(buf->lcd_last_data); + lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno); + lcd->lcd_last_close_xid = le64_to_cpu(buf->lcd_last_close_xid); + lcd->lcd_last_close_result = le32_to_cpu(buf->lcd_last_close_result); + lcd->lcd_last_close_data = le32_to_cpu(buf->lcd_last_close_data); + lcd->lcd_pre_versions[0] = le64_to_cpu(buf->lcd_pre_versions[0]); + lcd->lcd_pre_versions[1] = le64_to_cpu(buf->lcd_pre_versions[1]); + lcd->lcd_pre_versions[2] = le64_to_cpu(buf->lcd_pre_versions[2]); + lcd->lcd_pre_versions[3] = le64_to_cpu(buf->lcd_pre_versions[3]); + lcd->lcd_last_epoch = le32_to_cpu(buf->lcd_last_epoch); + lcd->lcd_generation = le32_to_cpu(buf->lcd_generation); +} + +static inline void lcd_cpu_to_le(struct lsd_client_data *lcd, + struct lsd_client_data *buf) +{ + memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof (lcd->lcd_uuid)); + buf->lcd_last_transno = cpu_to_le64(lcd->lcd_last_transno); + buf->lcd_last_xid = cpu_to_le64(lcd->lcd_last_xid); + buf->lcd_last_result = cpu_to_le32(lcd->lcd_last_result); + buf->lcd_last_data = cpu_to_le32(lcd->lcd_last_data); + buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno); + buf->lcd_last_close_xid = cpu_to_le64(lcd->lcd_last_close_xid); + buf->lcd_last_close_result = cpu_to_le32(lcd->lcd_last_close_result); + buf->lcd_last_close_data = cpu_to_le32(lcd->lcd_last_close_data); + buf->lcd_pre_versions[0] = cpu_to_le64(lcd->lcd_pre_versions[0]); + buf->lcd_pre_versions[1] = cpu_to_le64(lcd->lcd_pre_versions[1]); + buf->lcd_pre_versions[2] = cpu_to_le64(lcd->lcd_pre_versions[2]); + buf->lcd_pre_versions[3] = cpu_to_le64(lcd->lcd_pre_versions[3]); + buf->lcd_last_epoch = cpu_to_le32(lcd->lcd_last_epoch); + buf->lcd_generation = cpu_to_le32(lcd->lcd_generation); +} + +static inline u64 lcd_last_transno(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_transno > lcd->lcd_last_close_transno ? + lcd->lcd_last_transno : lcd->lcd_last_close_transno); +} + +static inline u64 lcd_last_xid(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_xid > lcd->lcd_last_close_xid ? 
+ lcd->lcd_last_xid : lcd->lcd_last_close_xid); +} + +/****************** mount lookup info *********************/ + +struct lustre_mount_info { + char *lmi_name; + struct super_block *lmi_sb; + struct list_head lmi_list_chain; +}; + +/****************** prototypes *********************/ + +/* obd_mount_server.c */ +int server_fill_super(struct super_block *sb); +struct lustre_mount_info *server_get_mount(const char *name); +int server_put_mount(const char *name, bool dereg_mnt); +struct mgs_target_info; +int server_mti_print(const char *title, struct mgs_target_info *mti); +void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd); + +/* obd_mount.c */ +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize); + +int server_name_is_ost(const char *svname); +int target_name2index(const char *svname, u32 *idx, const char **endptr); + +int lustre_put_lsi(struct super_block *sb); +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4); +#endif /* HAVE_SERVER_SUPPORT */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr); +void obdname2fsname(const char *tgt, char *fsname, size_t fslen); + +int lustre_start_mgc(struct super_block *sb); +int lustre_common_put_super(struct super_block *sb); + +struct lustre_sb_info *lustre_init_lsi(struct super_block *sb); +int lustre_put_lsi(struct super_block *sb); +int lmd_parse(char *options, struct lustre_mount_data *lmd); + +/* mgc_request.c */ +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, + enum mgs_cfg_type type); +int mgc_logname2resid(char *fsname, struct ldlm_res_id *res_id, + enum mgs_cfg_type type); + +/** @} disk */ + +#endif /* _LUSTRE_DISK_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h new file mode 100644 index 0000000000000..dc738f4184c29 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h @@ -0,0 +1,1865 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +/** \defgroup LDLM Lustre Distributed Lock Manager + * + * Lustre DLM is based on VAX DLM. + * Its two main roles are: + * - To provide locking assuring consistency of data on all Lustre nodes. + * - To allow clients to cache state protected by a lock by holding the + * lock until a conflicting lock is requested or it is expired by the LRU. 
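As a concrete illustration of the cache-until-conflict model described above, the sketch below shows the general shape of a client-side blocking AST. It is not code from this patch: it only uses the ldlm_blocking_callback signature, the LDLM_CB_BLOCKING/LDLM_CB_CANCELING flags and the ldlm_lock2handle()/ldlm_cli_cancel() helpers declared later in this header, and drop_cached_state() is a hypothetical stand-in for whatever per-subsystem cleanup a real blocking AST would perform.

static int demo_blocking_ast(struct ldlm_lock *lock,
			     struct ldlm_lock_desc *desc,
			     void *data, int flag)
{
	struct lustre_handle lockh;

	switch (flag) {
	case LDLM_CB_BLOCKING:
		/* someone requested a conflicting lock: give ours back */
		ldlm_lock2handle(lock, &lockh);
		return ldlm_cli_cancel(&lockh, LCF_ASYNC);
	case LDLM_CB_CANCELING:
		/* lock is going away: drop the state it was protecting */
		drop_cached_state(lock->l_ast_data);	/* hypothetical helper */
		return 0;
	default:
		return 0;
	}
}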
+ * + * @{ + */ + +#ifndef _LUSTRE_DLM_H__ +#define _LUSTRE_DLM_H__ + +#include +#include +#include +#include +#include /* for interval_node{}, ldlm_extent */ +#include + +#include "lustre_dlm_flags.h" + +struct obd_ops; +struct obd_device; + +extern struct kset *ldlm_ns_kset; +extern struct kset *ldlm_svc_kset; + +#define OBD_LDLM_DEVICENAME "ldlm" + +#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus()) +#define LDLM_DEFAULT_MAX_ALIVE 3900 /* 3900 seconds ~65 min */ +#define LDLM_CTIME_AGE_LIMIT (10) +/* if client lock is unused for that time it can be cancelled if any other + * client shows interest in that lock, e.g. glimpse is occured. */ +#define LDLM_DIRTY_AGE_LIMIT (10) +#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024 +#define LDLM_DEFAULT_LRU_SHRINK_BATCH (16) +#define LDLM_DEFAULT_SLV_RECALC_PCT (10) + +/** + * LDLM non-error return states + */ +enum ldlm_error { + ELDLM_OK = 0, + ELDLM_LOCK_MATCHED = 1, + + ELDLM_LOCK_CHANGED = 300, + ELDLM_LOCK_ABORTED = 301, + ELDLM_LOCK_REPLACED = 302, + ELDLM_NO_LOCK_DATA = 303, + ELDLM_LOCK_WOULDBLOCK = 304, + + ELDLM_NAMESPACE_EXISTS = 400, + ELDLM_BAD_NAMESPACE = 401, +}; + +/** + * LDLM namespace type. + * The "client" type is actually an indication that this is a narrow local view + * into complete namespace on the server. Such namespaces cannot make any + * decisions about lack of conflicts or do any autonomous lock granting without + * first speaking to a server. + */ +enum ldlm_side { + LDLM_NAMESPACE_SERVER = 0x01, + LDLM_NAMESPACE_CLIENT = 0x02 +}; + +/** + * The blocking callback is overloaded to perform two functions. These flags + * indicate which operation should be performed. + */ +#define LDLM_CB_BLOCKING 1 +#define LDLM_CB_CANCELING 2 + +/** + * \name Lock Compatibility Matrix. + * + * A lock has both a type (extent, flock, inode bits, or plain) and a mode. + * Lock types are described in their respective implementation files: + * ldlm_{extent,flock,inodebits,plain}.c. + * + * There are six lock modes along with a compatibility matrix to indicate if + * two locks are compatible. + * + * - EX: Exclusive mode. Before a new file is created, MDS requests EX lock + * on the parent. + * - PW: Protective Write (normal write) mode. When a client requests a write + * lock from an OST, a lock with PW mode will be issued. + * - PR: Protective Read (normal read) mode. When a client requests a read from + * an OST, a lock with PR mode will be issued. Also, if the client opens a + * file for execution, it is granted a lock with PR mode. + * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client + * requests a write lock during a file open operation. + * - CR Concurrent Read mode. When a client performs a path lookup, MDS grants + * an inodebit lock with the CR mode on the intermediate path component. + * - NL Null mode. + * + *
+ *       NL  CR  CW  PR  PW  EX
+ *  NL    1   1   1   1   1   1
+ *  CR    1   1   1   1   1   0
+ *  CW    1   1   1   0   0   0
+ *  PR    1   1   0   1   0   0
+ *  PW    1   1   0   0   0   0
+ *  EX    1   0   0   0   0   0
+ * 
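The table above is exactly what the LCK_COMPAT_* masks and lockmode_compat() defined just below encode: compatibility is a per-mode bitmask that is ANDed with the requested mode. The standalone user-space sketch below mirrors the matrix with its own illustrative bit values (they are not the mode values Lustre actually uses); it exists only to show how a mask-based compatibility check behaves.

#include <stdio.h>

/* Illustrative mode bits; only the mask relationships mirror the table. */
enum { NL = 1, CR = 2, CW = 4, PR = 8, PW = 16, EX = 32 };

static int mode_compat(int existing, int requested)
{
	static const struct { int mode, mask; } tbl[] = {
		{ NL, NL | CR | CW | PR | PW | EX },
		{ CR, NL | CR | CW | PR | PW },
		{ CW, NL | CR | CW },
		{ PR, NL | CR | PR },
		{ PW, NL | CR },
		{ EX, NL },
	};
	unsigned int i;

	for (i = 0; i < sizeof(tbl) / sizeof(tbl[0]); i++)
		if (tbl[i].mode == existing)
			return (tbl[i].mask & requested) != 0;
	return 0;
}

int main(void)
{
	printf("PR vs PR: %d\n", mode_compat(PR, PR)); /* 1: readers share */
	printf("PW vs PR: %d\n", mode_compat(PW, PR)); /* 0: writer blocks reader */
	printf("EX vs NL: %d\n", mode_compat(EX, NL)); /* 1: NL conflicts with nothing */
	return 0;
}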
+ */ +/** @{ */ +#define LCK_COMPAT_EX LCK_NL +#define LCK_COMPAT_PW (LCK_COMPAT_EX | LCK_CR) +#define LCK_COMPAT_PR (LCK_COMPAT_PW | LCK_PR) +#define LCK_COMPAT_CW (LCK_COMPAT_PW | LCK_CW) +#define LCK_COMPAT_CR (LCK_COMPAT_CW | LCK_PR | LCK_PW) +#define LCK_COMPAT_NL (LCK_COMPAT_CR | LCK_EX | LCK_GROUP) +#define LCK_COMPAT_GROUP (LCK_GROUP | LCK_NL) +#define LCK_COMPAT_COS (LCK_COS) +/** @} Lock Compatibility Matrix */ + +extern enum ldlm_mode lck_compat_array[]; + +static inline void lockmode_verify(enum ldlm_mode mode) +{ + LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE); +} + +static inline int lockmode_compat(enum ldlm_mode exist_mode, + enum ldlm_mode new_mode) +{ + return lck_compat_array[exist_mode] & new_mode; +} + +/* + * + * cluster name spaces + * + */ + +#define DLM_OST_NAMESPACE 1 +#define DLM_MDS_NAMESPACE 2 + +/* XXX + - do we just separate this by security domains and use a prefix for + multiple namespaces in the same domain? + - +*/ + +/** + * Locking rules for LDLM: + * + * lr_lock + * + * lr_lock + * waiting_locks_spinlock + * + * lr_lock + * led_lock + * + * lr_lock + * ns_lock + * + * lr_lvb_mutex + * lr_lock + * + */ + +/* Cancel lru flag, it indicates we cancel aged locks. */ +enum ldlm_lru_flags { + LDLM_LRU_FLAG_NO_WAIT = 0x1, /* Cancel locks w/o blocking (neither + * sending nor waiting for any RPCs) */ + LDLM_LRU_FLAG_CLEANUP = 0x2, /* Used when clearing lru, tells + * prepare_lru_list to set discard flag + * on PR extent locks so we don't waste + * time saving pages that will be + * discarded momentarily */ +}; + +struct ldlm_pool; +struct ldlm_lock; +struct ldlm_resource; +struct ldlm_namespace; + +/** + * Operations on LDLM pools. + * LDLM pool is a pool of locks in the namespace without any implicitly + * specified limits. + * Locks in the pool are organized in LRU. + * Local memory pressure or server instructions (e.g. mempressure on server) + * can trigger freeing of locks from the pool + */ +struct ldlm_pool_ops { + /** Recalculate pool \a pl usage */ + int (*po_recalc)(struct ldlm_pool *pl, bool force); + /** Cancel at least \a nr locks from pool \a pl */ + int (*po_shrink)(struct ldlm_pool *pl, int nr, gfp_t gfp_mask); + int (*po_setup)(struct ldlm_pool *pl, int limit); +}; + +/** One second for pools thread check interval. Each pool has own period. */ +#define LDLM_POOLS_THREAD_PERIOD (1) + +/** ~6% margin for modest pools. See ldlm_pool.c for details. */ +#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4) + +/** Default recalc period for server side pools in sec. */ +#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1) + +/** Default recalc period for client side pools in sec. */ +#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10) + +/** + * LDLM pool structure to track granted locks. + * For purposes of determining when to release locks on e.g. memory pressure. + * This feature is commonly referred to as lru_resize. + */ +struct ldlm_pool { + /** Pool debugfs directory. */ + struct dentry *pl_debugfs_entry; + /** Pool name, must be long enough to hold compound proc entry name. */ + char pl_name[100]; + /** Lock for protecting SLV/CLV updates. */ + spinlock_t pl_lock; + /** Number of allowed locks in in pool, both, client and server side. */ + atomic_t pl_limit; + /** Number of granted locks in */ + atomic_t pl_granted; + /** Grant rate per T. */ + atomic_t pl_grant_rate; + /** Cancel rate per T. */ + atomic_t pl_cancel_rate; + /** Server lock volume (SLV). Protected by pl_lock. */ + __u64 pl_server_lock_volume; + /** Current biggest client lock volume. 
Protected by pl_lock. */ + __u64 pl_client_lock_volume; + /** Lock volume factor, shown in percents in procfs, but internally + * Client SLV calculated as: server_slv * lock_volume_factor >> 8. + */ + atomic_t pl_lock_volume_factor; + /** Time when last SLV from server was obtained. */ + time64_t pl_recalc_time; + /** Recalculation period for pool. */ + time64_t pl_recalc_period; + /** Recalculation and shrink operations. */ + struct ldlm_pool_ops *pl_ops; + /** Number of planned locks for next period. */ + int pl_grant_plan; + /** Pool statistics. */ + struct lprocfs_stats *pl_stats; + + /* sysfs object */ + struct kobject pl_kobj; + struct completion pl_kobj_unregister; +}; + +typedef int (*ldlm_res_policy)(const struct lu_env *env, + struct ldlm_namespace *, + struct ldlm_lock **, void *req_cookie, + enum ldlm_mode mode, __u64 flags, void *data); + +typedef int (*ldlm_cancel_cbt)(struct ldlm_lock *lock); + +/** + * LVB operations. + * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could + * be associated with an LDLM lock and transferred from client to server and + * back. + * + * Currently LVBs are used by: + * - OSC-OST code to maintain current object size/times + * - layout lock code to return the layout when the layout lock is granted + * + * To ensure delayed LVB initialization, it is highly recommended to use the set + * of ldlm_[res_]lvbo_[init,update,fill]() functions. + */ +struct ldlm_valblock_ops { + int (*lvbo_init)(struct ldlm_resource *res); + int (*lvbo_update)(struct ldlm_resource *res, struct ldlm_lock *lock, + struct ptlrpc_request *r, int increase); + int (*lvbo_free)(struct ldlm_resource *res); + /* Return size of lvb data appropriate RPC size can be reserved */ + int (*lvbo_size)(struct ldlm_lock *lock); + /* Called to fill in lvb data to RPC buffer @buf */ + int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int *buflen); +}; + +/** + * LDLM pools related, type of lock pool in the namespace. + * Greedy means release cached locks aggressively + */ +enum ldlm_appetite { + LDLM_NAMESPACE_GREEDY = BIT(0), + LDLM_NAMESPACE_MODEST = BIT(1), +}; + +/** + * Default values for the "max_nolock_size", "contention_time" and + * "contended_locks" namespace tunables. + */ +#define NS_DEFAULT_MAX_NOLOCK_BYTES 0 +#define NS_DEFAULT_CONTENTION_SECONDS 2 +#define NS_DEFAULT_CONTENDED_LOCKS 32 + +struct ldlm_ns_bucket { + /** back pointer to namespace */ + struct ldlm_namespace *nsb_namespace; + /** + * Estimated lock callback time. Used by adaptive timeout code to + * avoid spurious client evictions due to unresponsiveness when in + * fact the network or overall system load is at fault + */ + struct adaptive_timeout nsb_at_estimate; + /** + * Which res in the bucket should we start with the reclaim. + */ + int nsb_reclaim_start; + /* counter of entries in this bucket */ + atomic_t nsb_count; +}; + +enum { + /** LDLM namespace lock stats */ + LDLM_NSS_LOCKS = 0, + LDLM_NSS_LAST +}; + +enum ldlm_ns_type { + LDLM_NS_TYPE_UNKNOWN = 0, /**< invalid type */ + LDLM_NS_TYPE_MDC, /**< MDC namespace */ + LDLM_NS_TYPE_MDT, /**< MDT namespace */ + LDLM_NS_TYPE_OSC, /**< OSC namespace */ + LDLM_NS_TYPE_OST, /**< OST namespace */ + LDLM_NS_TYPE_MGC, /**< MGC namespace */ + LDLM_NS_TYPE_MGT, /**< MGT namespace */ +}; + +enum ldlm_namespace_flags { + /** + * Flag to indicate the LRU cancel is in progress. + * Used to limit the process by 1 thread only. + */ + LDLM_LRU_CANCEL = 0 +}; + +/** + * LDLM Namespace. 
+ * + * Namespace serves to contain locks related to a particular service. + * There are two kinds of namespaces: + * - Server namespace has knowledge of all locks and is therefore authoritative + * to make decisions like what locks could be granted and what conflicts + * exist during new lock enqueue. + * - Client namespace only has limited knowledge about locks in the namespace, + * only seeing locks held by the client. + * + * Every Lustre service has one server namespace present on the server serving + * that service. Every client connected to the service has a client namespace + * for it. + * Every lock obtained by client in that namespace is actually represented by + * two in-memory locks. One on the server and one on the client. The locks are + * linked by a special cookie by which one node can tell to the other which lock + * it actually means during communications. Such locks are called remote locks. + * The locks held by server only without any reference to a client are called + * local locks. + */ +struct ldlm_namespace { + /** Backward link to OBD, required for LDLM pool to store new SLV. */ + struct obd_device *ns_obd; + + /** Flag indicating if namespace is on client instead of server */ + enum ldlm_side ns_client; + + /** name of this namespace */ + char *ns_name; + + /** Resource hash table for namespace. */ + struct cfs_hash *ns_rs_hash; + struct ldlm_ns_bucket *ns_rs_buckets; + unsigned int ns_bucket_bits; + + /** serialize */ + spinlock_t ns_lock; + + /** big refcount (by bucket) */ + atomic_t ns_bref; + + /** + * Namespace connect flags supported by server (may be changed via + * /proc, LRU resize may be disabled/enabled). + */ + __u64 ns_connect_flags; + + /** Client side original connect flags supported by server. */ + __u64 ns_orig_connect_flags; + + /* namespace debugfs dir entry */ + struct dentry *ns_debugfs_entry; + + /** + * Position in global namespace list linking all namespaces on + * the node. + */ + struct list_head ns_list_chain; + + /** + * List of unused locks for this namespace. This list is also called + * LRU lock list. + * Unused locks are locks with zero reader/writer reference counts. + * This list is only used on clients for lock caching purposes. + * When we want to release some locks voluntarily or if server wants + * us to release some locks due to e.g. memory pressure, we take locks + * to release from the head of this list. + * Locks are linked via l_lru field in \see struct ldlm_lock. + */ + struct list_head ns_unused_list; + /** Number of locks in the LRU list above */ + int ns_nr_unused; + struct list_head *ns_last_pos; + + /** + * Maximum number of locks permitted in the LRU. If 0, means locks + * are managed by pools and there is no preset limit, rather it is all + * controlled by available memory on this client and on server. + */ + unsigned int ns_max_unused; + + /** + * Cancel batch, if unused lock count exceed lru_size + * Only be used if LRUR disable. + */ + unsigned int ns_cancel_batch; + + /** + * How much the SLV should decrease in %% to trigger LRU cancel urgently. + */ + unsigned int ns_recalc_pct; + + /** Maximum allowed age (last used time) for locks in the LRU. Set in + * seconds from userspace, but stored in ns to avoid repeat conversions. + */ + ktime_t ns_max_age; + + /** + * Server only: number of times we evicted clients due to lack of reply + * to ASTs. + */ + unsigned int ns_timeouts; + /** + * Number of seconds since the file change time after which + * the MDT will return an UPDATE lock along with a LOOKUP lock. 
+ * This allows the client to start caching negative dentries + * for a directory and may save an RPC for a later stat. + */ + timeout_t ns_ctime_age_limit; + /** + * Number of (nano)seconds since the lock was last used. The client + * may cancel the lock older than this age and flush related data if + * another client shows interest in this lock by doing glimpse request. + * This allows to cache stat data locally for such files early. Set in + * seconds from userspace, but stored in ns to avoid repeat conversions. + */ + ktime_t ns_dirty_age_limit; + /** + * Used to rate-limit ldlm_namespace_dump calls. + * \see ldlm_namespace_dump. Increased by 10 seconds every time + * it is called. + */ + time64_t ns_next_dump; + + /** "policy" function that does actual lock conflict determination */ + ldlm_res_policy ns_policy; + + /** + * LVB operations for this namespace. + * \see struct ldlm_valblock_ops + */ + struct ldlm_valblock_ops *ns_lvbo; + + /** + * Used by filter code to store pointer to OBD of the service. + * Should be dropped in favor of \a ns_obd + */ + void *ns_lvbp; + + /** + * Wait queue used by __ldlm_namespace_free. Gets woken up every time + * a resource is removed. + */ + wait_queue_head_t ns_waitq; + /** LDLM pool structure for this namespace */ + struct ldlm_pool ns_pool; + /** Definition of how eagerly unused locks will be released from LRU */ + enum ldlm_appetite ns_appetite; + + /** + * If more than \a ns_contended_locks are found, the resource is + * considered to be contended. Lock enqueues might specify that no + * contended locks should be granted + */ + unsigned ns_contended_locks; + + /** + * The resources in this namespace remember contended state during + * \a ns_contention_time, in seconds. + */ + timeout_t ns_contention_time; + + /** + * Limit size of contended extent locks, in bytes. + * If extended lock is requested for more then this many bytes and + * caller instructs us not to grant contended locks, we would disregard + * such a request. + */ + unsigned ns_max_nolock_size; + + /** Limit of parallel AST RPC count. */ + unsigned ns_max_parallel_ast; + + /** + * Callback to check if a lock is good to be canceled by ELC or + * during recovery. + */ + ldlm_cancel_cbt ns_cancel; + + /** LDLM lock stats */ + struct lprocfs_stats *ns_stats; + + /** + * Flag to indicate namespace is being freed. Used to determine if + * recalculation of LDLM pool statistics should be skipped. + */ + unsigned ns_stopping:1, + + /** + * Flag to indicate the LRU recalc on RPC reply is in progress. + * Used to limit the process by 1 thread only. + */ + ns_rpc_recalc:1; + + /** + * Which bucket should we start with the lock reclaim. + */ + int ns_reclaim_start; + + struct kobject ns_kobj; /* sysfs object */ + struct completion ns_kobj_unregister; + + /** + * To avoid another ns_lock usage, a separate bitops field. + */ + unsigned long ns_flags; +}; + +/** + * Returns 1 if namespace \a ns is a client namespace. + */ +static inline int ns_is_client(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_CLIENT; +} + +/** + * Returns 1 if namespace \a ns is a server namespace. 
+ */ +static inline int ns_is_server(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_SERVER; +} + +/** + * Returns 1 if namespace \a ns supports early lock cancel (ELC). + */ +static inline int ns_connect_cancelset(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_CANCELSET); +} + +/** + * Returns 1 if this namespace supports lru_resize. + */ +static inline int ns_connect_lru_resize(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline void ns_register_cancel(struct ldlm_namespace *ns, + ldlm_cancel_cbt arg) +{ + LASSERT(ns != NULL); + ns->ns_cancel = arg; +} + +struct ldlm_lock; + +/** Type for blocking callback function of a lock. */ +typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock, + struct ldlm_lock_desc *new, void *data, + int flag); +/** Type for completion callback function of a lock. */ +typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags, + void *data); +/** Type for glimpse callback function of a lock. */ +typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data); + +/** Type for created callback function of a lock. */ +typedef void (*ldlm_created_callback)(struct ldlm_lock *lock); + +/** Work list for sending GL ASTs to multiple locks. */ +struct ldlm_glimpse_work { + struct ldlm_lock *gl_lock; /* lock to glimpse */ + struct list_head gl_list; /* linkage to other gl work structs */ + __u32 gl_flags;/* see LDLM_GL_WORK_* below */ + union ldlm_gl_desc *gl_desc; /* glimpse descriptor to be packed in + * glimpse callback request */ + ptlrpc_interpterer_t gl_interpret_reply; + void *gl_interpret_data; +}; + +struct ldlm_bl_desc { + unsigned int bl_same_client:1, + bl_cos_incompat:1; +}; + +struct ldlm_cb_set_arg { + struct ptlrpc_request_set *set; + int type; /* LDLM_{CP,BL,GL}_CALLBACK */ + atomic_t restart; + struct list_head *list; + union ldlm_gl_desc *gl_desc; /* glimpse AST descriptor */ + ptlrpc_interpterer_t gl_interpret_reply; + void *gl_interpret_data; + struct ldlm_bl_desc *bl_desc; +}; + +struct ldlm_cb_async_args { + struct ldlm_cb_set_arg *ca_set_arg; + struct ldlm_lock *ca_lock; +}; + +/** The ldlm_glimpse_work was slab allocated & must be freed accordingly.*/ +#define LDLM_GL_WORK_SLAB_ALLOCATED 0x1 + +/** Interval node data for each LDLM_EXTENT lock. */ +struct ldlm_interval { + struct interval_node li_node; /* node for tree management */ + struct list_head li_group; /* the locks which have the same + * policy - group of the policy */ +}; +#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node) + +/** + * Interval tree for extent locks. + * The interval tree must be accessed under the resource lock. + * Interval trees are used for granted extent locks to speed up conflicts + * lookup. See ldlm/interval_tree.c for more details. + */ +struct ldlm_interval_tree { + /** Tree size. */ + int lit_size; + enum ldlm_mode lit_mode; /* lock mode */ + struct interval_node *lit_root; /* actual ldlm_interval */ +}; + +/** + * Lists of waiting locks for each inodebit type. + * A lock can be in several liq_waiting lists and it remains in lr_waiting. 
+ */ +struct ldlm_ibits_queues { + struct list_head liq_waiting[MDS_INODELOCK_NUMBITS]; +}; + +struct ldlm_ibits_node { + struct list_head lin_link[MDS_INODELOCK_NUMBITS]; + struct ldlm_lock *lock; +}; + +/** Whether to track references to exports by LDLM locks. */ +#define LUSTRE_TRACKS_LOCK_EXP_REFS (0) + +/** Cancel flags. */ +enum ldlm_cancel_flags { + LCF_ASYNC = 0x1, /* Cancel locks asynchronously. */ + LCF_LOCAL = 0x2, /* Cancel locks locally, not notifing server */ + LCF_BL_AST = 0x4, /* Cancel LDLM_FL_BL_AST locks in the same RPC */ +}; + +struct ldlm_flock { + __u64 start; + __u64 end; + __u64 owner; + __u64 blocking_owner; + struct obd_export *blocking_export; + atomic_t blocking_refs; + __u32 pid; +}; + +union ldlm_policy_data { + struct ldlm_extent l_extent; + struct ldlm_flock l_flock; + struct ldlm_inodebits l_inodebits; +}; + +void ldlm_convert_policy_to_wire(enum ldlm_type type, + const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_convert_policy_to_local(struct obd_export *exp, enum ldlm_type type, + const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); + +enum lvb_type { + LVB_T_NONE = 0, + LVB_T_OST = 1, + LVB_T_LQUOTA = 2, + LVB_T_LAYOUT = 3, +}; + +/** + * LDLM_GID_ANY is used to match any group id in ldlm_lock_match(). + */ +#define LDLM_GID_ANY ((__u64)-1) + +/** + * LDLM lock structure + * + * Represents a single LDLM lock and its state in memory. Each lock is + * associated with a single ldlm_resource, the object which is being + * locked. There may be multiple ldlm_locks on a single resource, + * depending on the lock type and whether the locks are conflicting or + * not. + */ +struct ldlm_lock { + /** + * Local lock handle. + * When remote side wants to tell us about a lock, they address + * it by this opaque handle. The handle does not hold a + * reference on the ldlm_lock, so it can be safely passed to + * other threads or nodes. When the lock needs to be accessed + * from the handle, it is looked up again in the lock table, and + * may no longer exist. + * + * Must be first in the structure. + */ + struct portals_handle l_handle; + /** + * Pointer to actual resource this lock is in. + * ldlm_lock_change_resource() can change this on the client. + * When this is possible, rcu must be used to stablise + * the resource while we lock and check it hasn't been changed. + */ + struct ldlm_resource *l_resource; + /** + * List item for client side LRU list. + * Protected by ns_lock in struct ldlm_namespace. + */ + struct list_head l_lru; + /** + * Linkage to resource's lock queues according to current lock state. + * (could be granted or waiting) + * Protected by lr_lock in struct ldlm_resource. + */ + struct list_head l_res_link; + /** + * Internal structures per lock type.. + */ + union { + struct ldlm_interval *l_tree_node; + struct ldlm_ibits_node *l_ibits_node; + }; + /** + * Per export hash of locks. + * Protected by per-bucket exp->exp_lock_hash locks. + */ + struct hlist_node l_exp_hash; + /** + * Per export hash of flock locks. + * Protected by per-bucket exp->exp_flock_hash locks. + */ + struct hlist_node l_exp_flock_hash; + /** + * Requested mode. + * Protected by lr_lock. + */ + enum ldlm_mode l_req_mode; + /** + * Granted mode, also protected by lr_lock. + */ + enum ldlm_mode l_granted_mode; + /** Lock completion handler pointer. Called when lock is granted. */ + ldlm_completion_callback l_completion_ast; + /** + * Lock blocking AST handler pointer. 
+ * It plays two roles: + * - as a notification of an attempt to queue a conflicting lock (once) + * - as a notification when the lock is being cancelled. + * + * As such it's typically called twice: once for the initial conflict + * and then once more when the last user went away and the lock is + * cancelled (could happen recursively). + */ + ldlm_blocking_callback l_blocking_ast; + /** + * Lock glimpse handler. + * Glimpse handler is used to obtain LVB updates from a client by + * server + */ + ldlm_glimpse_callback l_glimpse_ast; + + /** + * Lock export. + * This is a pointer to actual client export for locks that were granted + * to clients. Used server-side. + */ + struct obd_export *l_export; + /** + * Lock connection export. + * Pointer to server export on a client. + */ + struct obd_export *l_conn_export; + + /** + * Remote lock handle. + * If the lock is remote, this is the handle of the other side lock + * (l_handle) + */ + struct lustre_handle l_remote_handle; + + /** + * Representation of private data specific for a lock type. + * Examples are: extent range for extent lock or bitmask for ibits locks + */ + union ldlm_policy_data l_policy_data; + + /** + * Lock state flags. Protected by lr_lock. + * \see lustre_dlm_flags.h where the bits are defined. + */ + __u64 l_flags; + + /** + * Lock r/w usage counters. + * Protected by lr_lock. + */ + __u32 l_readers; + __u32 l_writers; + /** + * If the lock is granted, a process sleeps on this waitq to learn when + * it's no longer in use. If the lock is not granted, a process sleeps + * on this waitq to learn when it becomes granted. + */ + wait_queue_head_t l_waitq; + + /** + * Time, in nanoseconds, last used by e.g. being matched by lock match. + */ + ktime_t l_last_used; + + /** Originally requested extent for the extent lock. */ + struct ldlm_extent l_req_extent; + + /* + * Client-side-only members. + */ + + enum lvb_type l_lvb_type; + + /** + * Temporary storage for a LVB received during an enqueue operation. + * May be vmalloc'd, so needs to be freed with OBD_FREE_LARGE(). + */ + __u32 l_lvb_len; + void *l_lvb_data; + + /** Private storage for lock user. Opaque to LDLM. */ + void *l_ast_data; + + union { + /** + * Seconds. It will be updated if there is any activity related to + * the lock at client, e.g. enqueue the lock. For server it is the + * time when blocking ast was sent. + */ + time64_t l_activity; + time64_t l_blast_sent; + }; + + /* separate ost_lvb used mostly by Data-on-MDT for now. + * It is introduced to don't mix with layout lock data. */ + struct ost_lvb l_ost_lvb; + /* + * Server-side-only members. + */ + + /** + * Connection cookie for the client originating the operation. + * Used by Commit on Share (COS) code. Currently only used for + * inodebits locks on MDS. + */ + __u64 l_client_cookie; + + /** + * List item for locks waiting for cancellation from clients. + * The lists this could be linked into are: + * waiting_locks_list (protected by waiting_locks_spinlock), + * then if the lock timed out, it is moved to + * expired_lock_list for further processing. + */ + struct list_head l_pending_chain; + + /** + * Set when lock is sent a blocking AST. Time in seconds when timeout + * is reached and client holding this lock could be evicted. + * This timeout could be further extended by e.g. certain IO activity + * under this lock. + * \see ost_rw_prolong_locks + */ + time64_t l_callback_timestamp; + + /** Local PID of process which created this lock. 
*/ + __u32 l_pid; + + /** + * Number of times blocking AST was sent for this lock. + * This is for debugging. Valid values are 0 and 1, if there is an + * attempt to send blocking AST more than once, an assertion would be + * hit. \see ldlm_work_bl_ast_lock + */ + int l_bl_ast_run; + /** List item ldlm_add_ast_work_item() for case of blocking ASTs. */ + struct list_head l_bl_ast; + /** List item ldlm_add_ast_work_item() for case of completion ASTs. */ + struct list_head l_cp_ast; + /** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */ + struct list_head l_rk_ast; + + /** + * Pointer to a conflicting lock that caused blocking AST to be sent + * for this lock + */ + struct ldlm_lock *l_blocking_lock; + + /** + * Protected by lr_lock, linkages to "skip lists". + * For more explanations of skip lists see ldlm/ldlm_inodebits.c + */ + struct list_head l_sl_mode; + struct list_head l_sl_policy; + + /** Reference tracking structure to debug leaked locks. */ + struct lu_ref l_reference; +#if LUSTRE_TRACKS_LOCK_EXP_REFS + /* Debugging stuff for bug 20498, for tracking export references. */ + /** number of export references taken */ + int l_exp_refs_nr; + /** link all locks referencing one export */ + struct list_head l_exp_refs_link; + /** referenced export object */ + struct obd_export *l_exp_refs_target; +#endif + /** + * export blocking dlm lock list, protected by + * l_export->exp_bl_list_lock. + * Lock order of waiting_lists_spinlock, exp_bl_list_lock and res lock + * is: res lock -> exp_bl_list_lock -> wanting_lists_spinlock. + */ + struct list_head l_exp_list; +}; + +enum ldlm_match_flags { + LDLM_MATCH_UNREF = BIT(0), + LDLM_MATCH_AST = BIT(1), + LDLM_MATCH_AST_ANY = BIT(2), + LDLM_MATCH_RIGHT = BIT(3), +}; + +/** + * Describe the overlap between two locks. itree_overlap_cb data. + */ +struct ldlm_match_data { + struct ldlm_lock *lmd_old; + struct ldlm_lock *lmd_lock; + enum ldlm_mode *lmd_mode; + union ldlm_policy_data *lmd_policy; + __u64 lmd_flags; + __u64 lmd_skip_flags; + enum ldlm_match_flags lmd_match; +}; + +/** For uncommitted cross-MDT lock, store transno this lock belongs to */ +#define l_transno l_client_cookie + +/** For uncommitted cross-MDT lock, which is client lock, share with l_rk_ast + * which is for server. */ +#define l_slc_link l_rk_ast + +struct lustre_handle_array { + unsigned int ha_count; + /* ha_map is used as bit flag to indicate handle is remote or local */ + DECLARE_BITMAP(ha_map, LMV_MAX_STRIPE_COUNT); + struct lustre_handle ha_handles[0]; +}; + +/** + * LDLM resource description. + * Basically, resource is a representation for a single object. + * Object has a name which is currently 4 64-bit integers. LDLM user is + * responsible for creation of a mapping between objects it wants to be + * protected and resource names. + * + * A resource can only hold locks of a single lock type, though there may be + * multiple ldlm_locks on a single resource, depending on the lock type and + * whether the locks are conflicting or not. + */ +struct ldlm_resource { + struct ldlm_ns_bucket *lr_ns_bucket; + + /** + * List item for list in namespace hash. + * protected by ns_lock. + * Shared with linkage for RCU-delayed free. + */ + union { + struct hlist_node lr_hash; + struct rcu_head lr_rcu; + }; + + /** Reference count for this resource */ + atomic_t lr_refcount; + + /** Spinlock to protect locks under this resource. 
*/ + spinlock_t lr_lock; + + /** + * protected by lr_lock + * @{ */ + /** List of locks in granted state */ + struct list_head lr_granted; + /** + * List of locks that could not be granted due to conflicts and + * that are waiting for conflicts to go away */ + struct list_head lr_waiting; + /** @} */ + + /** Resource name */ + struct ldlm_res_id lr_name; + + union { + /** + * Interval trees (only for extent locks) for all modes of + * this resource + */ + struct ldlm_interval_tree *lr_itree; + struct ldlm_ibits_queues *lr_ibits_queues; + }; + + union { + /** + * When the resource was considered as contended, + * used only on server side. + */ + time64_t lr_contention_time; + /** + * Associated inode, used only on client side. + */ + struct inode *lr_lvb_inode; + }; + + /** Type of locks this resource can hold. Only one type per resource. */ + enum ldlm_type lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */ + + /** + * Server-side-only lock value block elements. + * To serialize lvbo_init. + */ + int lr_lvb_len; + struct mutex lr_lvb_mutex; + /** protected by lr_lock */ + void *lr_lvb_data; + /** is lvb initialized ? */ + bool lr_lvb_initialized; + + /** List of references to this resource. For debugging. */ + struct lu_ref lr_reference; +}; + +static inline int ldlm_is_granted(struct ldlm_lock *lock) +{ + return lock->l_req_mode == lock->l_granted_mode; +} + +static inline bool ldlm_has_layout(struct ldlm_lock *lock) +{ + return lock->l_resource->lr_type == LDLM_IBITS && + lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT; +} + +static inline bool ldlm_has_dom(struct ldlm_lock *lock) +{ + return lock->l_resource->lr_type == LDLM_IBITS && + lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_DOM; +} + +static inline char * +ldlm_ns_name(struct ldlm_namespace *ns) +{ + return ns->ns_name; +} + +static inline struct ldlm_namespace * +ldlm_res_to_ns(struct ldlm_resource *res) +{ + return res->lr_ns_bucket->nsb_namespace; +} + +static inline struct ldlm_namespace * +ldlm_lock_to_ns(struct ldlm_lock *lock) +{ + return ldlm_res_to_ns(lock->l_resource); +} + +static inline char * +ldlm_lock_to_ns_name(struct ldlm_lock *lock) +{ + return ldlm_ns_name(ldlm_lock_to_ns(lock)); +} + +static inline struct adaptive_timeout * +ldlm_lock_to_ns_at(struct ldlm_lock *lock) +{ + return &lock->l_resource->lr_ns_bucket->nsb_at_estimate; +} + +static inline int ldlm_lvbo_init(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + int rc = 0; + + if (ns->ns_lvbo == NULL || ns->ns_lvbo->lvbo_init == NULL || + res->lr_lvb_initialized) + return 0; + + mutex_lock(&res->lr_lvb_mutex); + /* Did we lose the race? 
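Another thread may have initialised the LVB between the unlocked check above and taking lr_lvb_mutex, so the flag is re-checked under the mutex to make sure lvbo_init() runs at most once per resource.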
*/ + if (res->lr_lvb_initialized) { + mutex_unlock(&res->lr_lvb_mutex); + return 0; + } + rc = ns->ns_lvbo->lvbo_init(res); + if (rc < 0) { + CDEBUG(D_DLMTRACE, "lvbo_init failed for resource : rc = %d\n", + rc); + if (res->lr_lvb_data != NULL) { + OBD_FREE(res->lr_lvb_data, res->lr_lvb_len); + res->lr_lvb_data = NULL; + } + res->lr_lvb_len = rc; + } else { + res->lr_lvb_initialized = true; + } + mutex_unlock(&res->lr_lvb_mutex); + return rc; +} + +static inline int ldlm_lvbo_size(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_size != NULL) + return ns->ns_lvbo->lvbo_size(lock); + + return 0; +} + +static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int *len) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + int rc; + + if (ns->ns_lvbo != NULL) { + LASSERT(ns->ns_lvbo->lvbo_fill != NULL); + /* init lvb now if not already */ + rc = ldlm_lvbo_init(lock->l_resource); + if (rc < 0) { + CERROR("lock %p: delayed lvb init failed (rc %d)\n", + lock, rc); + return rc; + } + return ns->ns_lvbo->lvbo_fill(lock, buf, len); + } + return 0; +} + +struct ldlm_ast_work { + struct ldlm_lock *w_lock; + int w_blocking; + struct ldlm_lock_desc w_desc; + struct list_head w_list; + int w_flags; + void *w_data; + int w_datalen; +}; + +/** + * Common ldlm_enqueue parameters + */ +struct ldlm_enqueue_info { + enum ldlm_type ei_type; /** Type of the lock being enqueued. */ + enum ldlm_mode ei_mode; /** Mode of the lock being enqueued. */ + void *ei_cb_bl; /** blocking lock callback */ + void *ei_cb_local_bl; /** blocking local lock callback */ + void *ei_cb_cp; /** lock completion callback */ + void *ei_cb_gl; /** lock glimpse callback */ + ldlm_created_callback ei_cb_created; /** lock created callback */ + void *ei_cbdata; /** Data to be passed into callbacks. */ + void *ei_namespace; /** lock namespace **/ + u64 ei_inodebits; /** lock inode bits **/ + unsigned int ei_enq_slave:1; /** whether enqueue slave stripes */ + unsigned int ei_req_slot:1; /** whether acquire rpc slot */ + unsigned int ei_mod_slot:1; /** whether acquire mod rpc slot */ +}; + +#define ei_res_id ei_cb_gl + +extern char *ldlm_lockname[]; +extern char *ldlm_typename[]; +extern const char *ldlm_it2str(enum ldlm_intent_flags it); + +/** + * Just a fancy CDEBUG call with log level preset to LDLM_DEBUG. + * For the cases where we do not have actual lock to print along + * with a debugging message that is ldlm-related + */ +#define LDLM_DEBUG_NOLOCK(format, a...) \ + CDEBUG(D_DLMTRACE, "### " format "\n" , ##a) + +/** + * Support function for lock information printing into debug logs. + * \see LDLM_DEBUG + */ +#ifdef LIBCFS_DEBUG +#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _ldlm_lock_debug(lock, msgdata, fmt, ##a); \ +} while(0) + +void _ldlm_lock_debug(struct ldlm_lock *lock, + struct libcfs_debug_msg_data *data, + const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Rate-limited version of lock printing function. + */ +#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do { \ + static struct cfs_debug_limit_state _ldlm_cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls); \ + ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock, "### " fmt , ##a);\ +} while (0) + +#define LDLM_ERROR(lock, fmt, a...) 
LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a) +#define LDLM_WARN(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a) + +/** Non-rate-limited lock printing function for debugging purposes. */ +#define LDLM_DEBUG(lock, fmt, a...) do { \ + if (likely(lock != NULL)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL); \ + ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, lock, \ + "### " fmt , ##a); \ + } else { \ + LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a); \ + } \ +} while (0) +#else /* !LIBCFS_DEBUG */ +# define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) ((void)0) +# define LDLM_DEBUG(lock, fmt, a...) ((void)0) +# define LDLM_ERROR(lock, fmt, a...) ((void)0) +#endif + +/* + * Three intentions can be used for the policy functions in + * ldlm_processing_policy. + * + * LDLM_PROCESS_RESCAN: + * + * It's used when policy functions are called from ldlm_reprocess_queue() to + * reprocess the wait list and try to grant locks, blocking ASTs + * have already been sent in this situation, completion ASTs need be sent for + * the locks being granted. + * + * LDLM_PROCESS_ENQUEUE: + * + * It's used when policy functions are called from ldlm_lock_enqueue() to + * process the wait list for handling an enqueue request, blocking + * ASTs have not been sent yet, so list of conflicting locks would be + * collected and ASTs sent. + * + * LDLM_PROCESS_RECOVERY: + * + * It's used when policy functions are called from ldlm_reprocess_queue() to + * reprocess the wait list when recovery done. In case of blocking + * ASTs are lost before recovery, it needs not only to grant locks if + * available, but also send blocking ASTs to the locks doesn't have AST sent + * flag. Completion ASTs need be sent for the locks being granted. + */ +enum ldlm_process_intention { + LDLM_PROCESS_RESCAN = 0, + LDLM_PROCESS_ENQUEUE = 1, + LDLM_PROCESS_RECOVERY = 2, +}; + +typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, + struct list_head *work_list); + +typedef int (*ldlm_reprocessing_policy)(struct ldlm_resource *res, + struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + __u64 hint); + +/** + * Return values for lock iterators. + * Also used during deciding of lock grants and cancellations. + */ +#define LDLM_ITER_CONTINUE 1 /* keep iterating */ +#define LDLM_ITER_STOP 2 /* stop iterating */ + +typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *); +typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *); + +/** \defgroup ldlm_iterator Lock iterators + * + * LDLM provides for a way to iterate through every lock on a resource or + * namespace or every resource in a namespace. 
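As a small illustration of the iterator interface declared below, here is what a callback of type ldlm_iterator_t might look like: it counts granted locks on a resource and keeps walking by returning LDLM_ITER_CONTINUE. This is a sketch built only from declarations in this header; a caller would pass a pointer to an int as the closure, e.g. ldlm_resource_foreach(res, count_granted_cb, &n).

static int count_granted_cb(struct ldlm_lock *lock, void *closure)
{
	int *count = closure;

	if (ldlm_is_granted(lock))	/* l_req_mode == l_granted_mode */
		(*count)++;

	return LDLM_ITER_CONTINUE;	/* LDLM_ITER_STOP would end the walk */
}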
+ * @{ */ +int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, + void *closure); +void ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter, + void *closure); +int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *, + ldlm_iterator_t iter, void *data); +/** @} ldlm_iterator */ + +int ldlm_replay_locks(struct obd_import *imp); + +/* ldlm_flock.c */ +int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); + +/* ldlm_extent.c */ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms); + +struct ldlm_prolong_args { + struct obd_export *lpa_export; + struct ldlm_res_id lpa_resid; + struct ldlm_extent lpa_extent; + enum ldlm_mode lpa_mode; + timeout_t lpa_timeout; + int lpa_locks_cnt; + int lpa_blocks_cnt; +}; +void ldlm_lock_prolong_one(struct ldlm_lock *lock, + struct ldlm_prolong_args *arg); +void ldlm_resource_prolong(struct ldlm_prolong_args *arg); + +struct ldlm_callback_suite { + ldlm_completion_callback lcs_completion; + ldlm_blocking_callback lcs_blocking; + ldlm_glimpse_callback lcs_glimpse; +}; + +/* ldlm_lockd.c */ +#ifdef HAVE_SERVER_SUPPORT +/** \defgroup ldlm_srv_ast Server AST handlers + * These are AST handlers used by server code. + * Their property is that they are just preparing RPCs to be sent to clients. + * @{ + */ +int ldlm_server_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *data, int flag); +int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); +int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data); +int ldlm_glimpse_locks(struct ldlm_resource *res, + struct list_head *gl_work_list); +/** @} ldlm_srv_ast */ + +/** \defgroup ldlm_handlers Server LDLM handlers + * These are handler functions that should be called by "frontends" such as + * MDT or OST to pass through LDLM requests to LDLM for handling + * @{ + */ +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); +int ldlm_handle_convert0(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req); +int ldlm_handle_cancel(struct ptlrpc_request *req); +int ldlm_request_cancel(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + int first, enum lustre_at_flags flags); +/** @} ldlm_handlers */ + +void ldlm_revoke_export_locks(struct obd_export *exp); +timeout_t ldlm_bl_timeout(struct ldlm_lock *lock); +#endif +int ldlm_del_waiting_lock(struct ldlm_lock *lock); +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, timeout_t timeout); +int ldlm_get_ref(void); +void ldlm_put_ref(void); +int ldlm_init_export(struct obd_export *exp); +void ldlm_destroy_export(struct obd_export *exp); +struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req); + +/* ldlm_lock.c */ +#ifdef HAVE_SERVER_SUPPORT +ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res); +ldlm_reprocessing_policy +ldlm_get_reprocessing_policy(struct ldlm_resource *res); +#endif +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg); +void ldlm_lock2handle(const struct ldlm_lock *lock, + struct lustre_handle *lockh); +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags); +void ldlm_cancel_callback(struct ldlm_lock *); +int ldlm_lock_remove_from_lru(struct ldlm_lock *); +int ldlm_lock_set_data(const struct lustre_handle *lockh, void *data); + +/** + * Obtain a lock reference by its handle. 
+ */ +static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h) +{ + return __ldlm_handle2lock(h, 0); +} + +#define LDLM_LOCK_REF_DEL(lock) \ + lu_ref_del(&lock->l_reference, "handle", lock) + +static inline struct ldlm_lock * +ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags) +{ + struct ldlm_lock *lock; + + lock = __ldlm_handle2lock(h, flags); + if (lock != NULL) + LDLM_LOCK_REF_DEL(lock); + return lock; +} + +/** + * Update Lock Value Block Operations (LVBO) on a resource taking into account + * data from request \a r + */ +static inline int ldlm_lvbo_update(struct ldlm_resource *res, + struct ldlm_lock *lock, + struct ptlrpc_request *req, int increase) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + int rc; + + /* delayed lvb init may be required */ + rc = ldlm_lvbo_init(res); + if (rc < 0) { + CERROR("delayed lvb init failed (rc %d)\n", rc); + return rc; + } + + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_update) + return ns->ns_lvbo->lvbo_update(res, lock, req, increase); + + return 0; +} + +static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, + struct ptlrpc_request *req, + int increase) +{ + return ldlm_lvbo_update(res, NULL, req, increase); +} + +int is_granted_or_cancelled_nolock(struct ldlm_lock *lock); + +int ldlm_error2errno(enum ldlm_error error); +enum ldlm_error ldlm_errno2error(int err_no); /* don't call it `errno': this + * confuses user-space. */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp); +#endif + +/** + * Release a temporary lock reference obtained by ldlm_handle2lock() or + * __ldlm_handle2lock(). + */ +#define LDLM_LOCK_PUT(lock) \ +do { \ + LDLM_LOCK_REF_DEL(lock); \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +/** + * Release a lock reference obtained by some other means (see + * LDLM_LOCK_PUT()). 
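Putting the helpers above together, the usual lookup-by-handle pattern is: resolve the opaque lustre_handle into a referenced ldlm_lock, use it, and drop the temporary reference with LDLM_LOCK_PUT(). The sketch below is illustrative only (demo_touch_lock is a hypothetical name) and uses only ldlm_handle2lock(), LDLM_DEBUG() and LDLM_LOCK_PUT() as declared in this header.

static void demo_touch_lock(const struct lustre_handle *lockh)
{
	struct ldlm_lock *lock = ldlm_handle2lock(lockh);

	if (lock == NULL)
		return;			/* lock has already gone away */

	LDLM_DEBUG(lock, "demo: resolved from handle");
	LDLM_LOCK_PUT(lock);		/* drop the reference handle2lock took */
}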
+ */ +#define LDLM_LOCK_RELEASE(lock) \ +do { \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +#define LDLM_LOCK_GET(lock) \ +({ \ + ldlm_lock_get(lock); \ + /*LDLM_DEBUG((lock), "get");*/ \ + lock; \ +}) + +#define ldlm_lock_list_put(head, member, count) \ +({ \ + struct ldlm_lock *_lock, *_next; \ + int c = count; \ + list_for_each_entry_safe(_lock, _next, head, member) { \ + if (c-- == 0) \ + break; \ + list_del_init(&_lock->member); \ + LDLM_LOCK_RELEASE(_lock); \ + } \ + LASSERT(c <= 0); \ +}) + +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); +void ldlm_lock_put(struct ldlm_lock *lock); +void ldlm_lock_destroy(struct ldlm_lock *lock); +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc); +void ldlm_lock_addref(const struct lustre_handle *lockh, enum ldlm_mode mode); +int ldlm_lock_addref_try(const struct lustre_handle *lockh, + enum ldlm_mode mode); +void ldlm_lock_decref(const struct lustre_handle *lockh, enum ldlm_mode mode); +void ldlm_lock_decref_and_cancel(const struct lustre_handle *lockh, + enum ldlm_mode mode); +void ldlm_lock_fail_match_locked(struct ldlm_lock *lock); +void ldlm_lock_fail_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock); + +enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns, + __u64 flags, __u64 skip_flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lh, + enum ldlm_match_flags match_flags); +static inline enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, + __u64 flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lh) +{ + return ldlm_lock_match_with_skip(ns, flags, 0, res_id, type, policy, + mode, lh, 0); +} +struct ldlm_lock *search_itree(struct ldlm_resource *res, + struct ldlm_match_data *data); +enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, + __u64 *bits); +void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode); +void ldlm_lock_cancel(struct ldlm_lock *lock); +void ldlm_reprocess_all(struct ldlm_resource *res, __u64 hint); +void ldlm_reprocess_recovery_done(struct ldlm_namespace *ns); +void ldlm_lock_dump_handle(int level, const struct lustre_handle *lockh); +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req); + +/* resource.c */ +struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, + enum ldlm_side client, + enum ldlm_appetite apt, + enum ldlm_ns_type ns_type); +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags); +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, + int force); +void ldlm_namespace_free_post(struct ldlm_namespace *ns); +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, int force); +void ldlm_namespace_register(struct ldlm_namespace *ns, enum ldlm_side client); +void ldlm_namespace_unregister(struct ldlm_namespace *ns, + enum ldlm_side client); +void ldlm_namespace_get(struct ldlm_namespace *ns); +void ldlm_namespace_put(struct ldlm_namespace *ns); + +int ldlm_debugfs_setup(void); +void ldlm_debugfs_cleanup(void); + +static inline void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req, + struct lprocfs_stats *srv_stats) +{ + int lock_type = 0, op = 0; + + lock_type = dlm_req->lock_desc.l_resource.lr_type; + + 
switch (lock_type) { + case LDLM_PLAIN: + op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE; + break; + case LDLM_EXTENT: + op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE; + break; + case LDLM_FLOCK: + op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE; + break; + case LDLM_IBITS: + op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE; + break; + default: + op = 0; + break; + } + + if (op != 0) + lprocfs_counter_incr(srv_stats, op); +} + +/* resource.c - internal */ +struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, + struct ldlm_resource *parent, + const struct ldlm_res_id *, + enum ldlm_type type, int create); +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res); +int ldlm_resource_putref(struct ldlm_resource *res); +void ldlm_resource_add_lock(struct ldlm_resource *res, + struct list_head *head, + struct ldlm_lock *lock); +void ldlm_resource_unlink_lock(struct ldlm_lock *lock); +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc); +void ldlm_dump_all_namespaces(enum ldlm_side client, int level); +void ldlm_namespace_dump(int level, struct ldlm_namespace *); +void ldlm_resource_dump(int level, struct ldlm_resource *); +int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *, + const struct ldlm_res_id *); + +#define LDLM_RESOURCE_ADDREF(res) do { \ + lu_ref_add_atomic(&(res)->lr_reference, __FUNCTION__, current); \ +} while (0) + +#define LDLM_RESOURCE_DELREF(res) do { \ + lu_ref_del(&(res)->lr_reference, __FUNCTION__, current); \ +} while (0) + +/* ldlm_request.c */ +/** \defgroup ldlm_local_ast Default AST handlers for local locks + * These AST handlers are typically used for server-side local locks and are + * also used by client-side lock handlers to perform minimum level base + * processing. + * @{ */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock); +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp); +int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data); +int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); +/** @} ldlm_local_ast */ + +/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users. + * These are typically used by client and server (*_local versions) + * to obtain and release locks. 
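Before going through a full ldlm_cli_enqueue(), a client normally first asks whether it already holds a compatible granted lock via ldlm_lock_match() (declared earlier in this header). In the usual usage pattern a successful match takes a reference in the returned mode, which the caller drops with ldlm_lock_decref() when done. The sketch below is illustrative only: the resource id and policy are assumed to be filled in by the caller, and the match flags are left at 0.

static bool demo_have_cached_pr_lock(struct ldlm_namespace *ns,
				     const struct ldlm_res_id *res_id,
				     union ldlm_policy_data *policy)
{
	struct lustre_handle lockh;
	enum ldlm_mode mode;

	mode = ldlm_lock_match(ns, 0, res_id, LDLM_EXTENT, policy,
			       LCK_PR, &lockh);
	if (mode == 0)
		return false;	/* nothing cached: caller would enqueue */

	/* use the matched lock ... then drop the match reference */
	ldlm_lock_decref(&lockh, mode);
	return true;
}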
+ * @{ */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + union ldlm_policy_data const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async); +int ldlm_prep_enqueue_req(struct obd_export *exp, + struct ptlrpc_request *req, + struct list_head *cancels, + int count); +int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len); +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + struct ldlm_enqueue_info *einfo, __u8 with_policy, + __u64 *flags, void *lvb, __u32 lvb_len, + const struct lustre_handle *lockh, int rc, + bool request_slot); +int ldlm_cli_enqueue_local(const struct lu_env *env, + struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_type type, union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh); +int ldlm_cli_convert_req(struct ldlm_lock *lock, __u32 *flags, __u64 new_bits); +int ldlm_cli_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags); +int ldlm_cli_update_pool(struct ptlrpc_request *req); +int ldlm_cli_cancel(const struct lustre_handle *lockh, + enum ldlm_cancel_flags cancel_flags); +int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *, + enum ldlm_cancel_flags flags, void *opaque); +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + enum ldlm_cancel_flags flags, void *opaque); +int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *head, + int count, enum ldlm_cancel_flags flags); +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 lock_flags, + enum ldlm_cancel_flags cancel_flags, + void *opaque); +int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, + enum ldlm_cancel_flags flags); +int ldlm_cli_cancel_list(struct list_head *head, int count, + struct ptlrpc_request *req, + enum ldlm_cancel_flags flags); + +int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop); +int ldlm_cli_inodebits_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags); + +/** @} ldlm_cli_api */ + +extern unsigned int ldlm_enqueue_min; + +/* mds/handler.c */ +/* This has to be here because recursive inclusion sucks. */ +int intent_disposition(struct ldlm_reply *rep, int flag); +void intent_set_disposition(struct ldlm_reply *rep, int flag); + +/** + * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more + * than one lock_res is dead-lock safe. + */ +enum lock_res_type { + LRT_NORMAL, + LRT_NEW +}; + +/** Lock resource. */ +static inline void lock_res(struct ldlm_resource *res) +{ + spin_lock(&res->lr_lock); +} + +/** Lock resource with a way to instruct lockdep code about nestedness-safe. 
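A resource's lr_granted/lr_waiting lists are protected by its lr_lock spinlock, which lock_res() above and unlock_res() just below wrap. A minimal sketch of walking the granted list under that lock, using only fields and helpers declared in this header (demo_count_granted is a hypothetical name):

static int demo_count_granted(struct ldlm_resource *res)
{
	struct ldlm_lock *lock;
	int n = 0;

	lock_res(res);
	list_for_each_entry(lock, &res->lr_granted, l_res_link)
		n++;
	unlock_res(res);

	return n;
}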
*/ +static inline void lock_res_nested(struct ldlm_resource *res, + enum lock_res_type mode) +{ + spin_lock_nested(&res->lr_lock, mode); +} + +/** Unlock resource. */ +static inline void unlock_res(struct ldlm_resource *res) +{ + spin_unlock(&res->lr_lock); +} + +/** Check if resource is already locked, assert if not. */ +static inline void check_res_locked(struct ldlm_resource *res) +{ + assert_spin_locked(&res->lr_lock); +} + +struct ldlm_resource * lock_res_and_lock(struct ldlm_lock *lock); +void unlock_res_and_lock(struct ldlm_lock *lock); + +/* ldlm_pool.c */ +/** \defgroup ldlm_pools Various LDLM pool related functions + * There are not used outside of ldlm. + * @{ + */ +int ldlm_pools_init(void); +void ldlm_pools_fini(void); + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, enum ldlm_side client); +int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, gfp_t gfp_mask); +void ldlm_pool_fini(struct ldlm_pool *pl); +int ldlm_pool_setup(struct ldlm_pool *pl, int limit); +time64_t ldlm_pool_recalc(struct ldlm_pool *pl, bool force); +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl); +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl); +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl); +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl); +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv); +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv); +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit); +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock); +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock); +/** @} */ + +static inline int ldlm_extent_overlap(const struct ldlm_extent *ex1, + const struct ldlm_extent *ex2) +{ + return ex1->start <= ex2->end && ex2->start <= ex1->end; +} + +/* check if @ex1 contains @ex2 */ +static inline int ldlm_extent_contain(const struct ldlm_extent *ex1, + const struct ldlm_extent *ex2) +{ + return ex1->start <= ex2->start && ex1->end >= ex2->end; +} + +int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop); + +#endif +/** @} LDLM */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h new file mode 100644 index 0000000000000..1fa049de2f567 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h @@ -0,0 +1,444 @@ +/* -*- buffer-read-only: t -*- vi: set ro: + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + */ +/** + * \file lustre_dlm_flags.h + * The flags and collections of flags (masks) for \see struct ldlm_lock. + * + * \addtogroup LDLM Lustre Distributed Lock Manager + * @{ + * + * \name flags + * The flags and collections of flags (masks) for \see struct ldlm_lock. 
+ * @{ + */ +#ifndef LDLM_ALL_FLAGS_MASK + +/** l_flags bits marked as "all_flags" bits */ +#define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC28F932FULL + +/** extent, mode, or resource changed */ +#define LDLM_FL_LOCK_CHANGED 0x0000000000000001ULL // bit 0 +#define ldlm_is_lock_changed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 0) +#define ldlm_set_lock_changed(_l) LDLM_SET_FLAG(( _l), 1ULL << 0) +#define ldlm_clear_lock_changed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 0) + +/** + * Server placed lock on granted list, or a recovering client wants the + * lock added to the granted list, no questions asked. */ +#define LDLM_FL_BLOCK_GRANTED 0x0000000000000002ULL // bit 1 +#define ldlm_is_block_granted(_l) LDLM_TEST_FLAG(( _l), 1ULL << 1) +#define ldlm_set_block_granted(_l) LDLM_SET_FLAG(( _l), 1ULL << 1) +#define ldlm_clear_block_granted(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 1) + +/** + * Server placed lock on conv list, or a recovering client wants the lock + * added to the conv list, no questions asked. (obsoleted) */ +#define LDLM_FL_BLOCK_CONV 0x0000000000000004ULL // bit 2 +#define ldlm_is_block_conv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 2) +#define ldlm_set_block_conv(_l) LDLM_SET_FLAG(( _l), 1ULL << 2) +#define ldlm_clear_block_conv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 2) + +/** + * Server placed lock on wait list, or a recovering client wants the lock + * added to the wait list, no questions asked. */ +#define LDLM_FL_BLOCK_WAIT 0x0000000000000008ULL // bit 3 +#define ldlm_is_block_wait(_l) LDLM_TEST_FLAG(( _l), 1ULL << 3) +#define ldlm_set_block_wait(_l) LDLM_SET_FLAG(( _l), 1ULL << 3) +#define ldlm_clear_block_wait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 3) + +/** + * Lock request is speculative/asynchronous, and cannot wait for any reason. + * Fail the lock request if any blocking locks are encountered. + * */ +#define LDLM_FL_SPECULATIVE 0x0000000000000010ULL /* bit 4 */ +#define ldlm_is_speculative(_l) LDLM_TEST_FLAG((_l), 1ULL << 4) +#define ldlm_set_speculative(_l) LDLM_SET_FLAG((_l), 1ULL << 4) +#define ldlm_clear_speculative(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 4) + +/** blocking or cancel packet was queued for sending. */ +#define LDLM_FL_AST_SENT 0x0000000000000020ULL // bit 5 +#define ldlm_is_ast_sent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 5) +#define ldlm_set_ast_sent(_l) LDLM_SET_FLAG(( _l), 1ULL << 5) +#define ldlm_clear_ast_sent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 5) + +/** + * Lock is being replayed. This could probably be implied by the fact that + * one of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous. */ +#define LDLM_FL_REPLAY 0x0000000000000100ULL // bit 8 +#define ldlm_is_replay(_l) LDLM_TEST_FLAG(( _l), 1ULL << 8) +#define ldlm_set_replay(_l) LDLM_SET_FLAG(( _l), 1ULL << 8) +#define ldlm_clear_replay(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 8) + +/** Don't grant lock, just do intent.
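+ * (As with the other flags in this header, the per-flag helpers are thin
+ * wrappers around the LDLM_*_FLAG macros defined at the end of the file;
+ * e.g. ldlm_set_intent_only(lock) is LDLM_SET_FLAG(lock, 1ULL << 9), which
+ * is simply lock->l_flags |= 0x200ULL, and ldlm_is_intent_only(lock) tests
+ * that same bit.)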
*/ +#define LDLM_FL_INTENT_ONLY 0x0000000000000200ULL // bit 9 +#define ldlm_is_intent_only(_l) LDLM_TEST_FLAG(( _l), 1ULL << 9) +#define ldlm_set_intent_only(_l) LDLM_SET_FLAG(( _l), 1ULL << 9) +#define ldlm_clear_intent_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 9) + +/** lock request has intent */ +#define LDLM_FL_HAS_INTENT 0x0000000000001000ULL // bit 12 +#define ldlm_is_has_intent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 12) +#define ldlm_set_has_intent(_l) LDLM_SET_FLAG(( _l), 1ULL << 12) +#define ldlm_clear_has_intent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 12) + +/** flock deadlock detected */ +#define LDLM_FL_FLOCK_DEADLOCK 0x0000000000008000ULL // bit 15 +#define ldlm_is_flock_deadlock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 15) +#define ldlm_set_flock_deadlock(_l) LDLM_SET_FLAG(( _l), 1ULL << 15) +#define ldlm_clear_flock_deadlock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 15) + +/** discard (no writeback (PW locks) or page retention (PR locks)) on cancel */ +#define LDLM_FL_DISCARD_DATA 0x0000000000010000ULL // bit 16 +#define ldlm_is_discard_data(_l) LDLM_TEST_FLAG(( _l), 1ULL << 16) +#define ldlm_set_discard_data(_l) LDLM_SET_FLAG(( _l), 1ULL << 16) +#define ldlm_clear_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 16) + +/** Blocked by group lock - wait indefinitely */ +#define LDLM_FL_NO_TIMEOUT 0x0000000000020000ULL // bit 17 +#define ldlm_is_no_timeout(_l) LDLM_TEST_FLAG(( _l), 1ULL << 17) +#define ldlm_set_no_timeout(_l) LDLM_SET_FLAG(( _l), 1ULL << 17) +#define ldlm_clear_no_timeout(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 17) + +/** + * Server told not to wait if blocked. For AGL, OST will not send glimpse + * callback. */ +#define LDLM_FL_BLOCK_NOWAIT 0x0000000000040000ULL // bit 18 +#define ldlm_is_block_nowait(_l) LDLM_TEST_FLAG(( _l), 1ULL << 18) +#define ldlm_set_block_nowait(_l) LDLM_SET_FLAG(( _l), 1ULL << 18) +#define ldlm_clear_block_nowait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 18) + +/** return blocking lock */ +#define LDLM_FL_TEST_LOCK 0x0000000000080000ULL // bit 19 +#define ldlm_is_test_lock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 19) +#define ldlm_set_test_lock(_l) LDLM_SET_FLAG(( _l), 1ULL << 19) +#define ldlm_clear_test_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 19) + +/** match lock only */ +#define LDLM_FL_MATCH_LOCK 0x0000000000100000ULL // bit 20 + +/** + * Immediately cancel such locks when they block some other locks. Send + * cancel notification to original lock holder, but expect no reply. This + * is for clients (like liblustre) that cannot be expected to reliably + * respond to blocking ASTs. */ +#define LDLM_FL_CANCEL_ON_BLOCK 0x0000000000800000ULL // bit 23 +#define ldlm_is_cancel_on_block(_l) LDLM_TEST_FLAG(( _l), 1ULL << 23) +#define ldlm_set_cancel_on_block(_l) LDLM_SET_FLAG(( _l), 1ULL << 23) +#define ldlm_clear_cancel_on_block(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 23) + +/** Flag whether a lock is enqueued from a distributed transaction, and the + * requesting lock mode is PW/EX; if so, it will check compatibility with COS + * locks, and, unlike the original COS semantics, transactions from the same + * client are also treated as lock conflicts. */ +#define LDLM_FL_COS_INCOMPAT 0x0000000001000000ULL /* bit 24 */ +#define ldlm_is_cos_incompat(_l) LDLM_TEST_FLAG((_l), 1ULL << 24) +#define ldlm_set_cos_incompat(_l) LDLM_SET_FLAG((_l), 1ULL << 24) +#define ldlm_clear_cos_incompat(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 24) + +/* + * Flag indicates that lock is being converted (downgraded) during the blocking + * AST instead of cancelling.
Used for IBITS locks now and drops conflicting + * bits only, keeping the others. + */ +#define LDLM_FL_CONVERTING 0x0000000002000000ULL /* bit 25 */ +#define ldlm_is_converting(_l) LDLM_TEST_FLAG((_l), 1ULL << 25) +#define ldlm_set_converting(_l) LDLM_SET_FLAG((_l), 1ULL << 25) +#define ldlm_clear_converting(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 25) + +/** + * Do not expand this lock. Grant it only on the extent requested. + * Used for manually requested locks from the client (LU_LADVISE_LOCKAHEAD). + * */ +#define LDLM_FL_NO_EXPANSION 0x0000000020000000ULL /* bit 29 */ +#define ldlm_is_do_not_expand(_l) LDLM_TEST_FLAG((_l), 1ULL << 29) +#define ldlm_set_do_not_expand(_l) LDLM_SET_FLAG((_l), 1ULL << 29) +#define ldlm_clear_do_not_expand(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 29) + +/** + * Measure lock contention and return -EUSERS if locking contention is high. */ +#define LDLM_FL_DENY_ON_CONTENTION 0x0000000040000000ULL // bit 30 +#define ldlm_is_deny_on_contention(_l) LDLM_TEST_FLAG(( _l), 1ULL << 30) +#define ldlm_set_deny_on_contention(_l) LDLM_SET_FLAG(( _l), 1ULL << 30) +#define ldlm_clear_deny_on_contention(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 30) + +/** + * These are flags that are mapped into the flags and ASTs of blocking + * locks. Add FL_DISCARD to blocking ASTs. */ +#define LDLM_FL_AST_DISCARD_DATA 0x0000000080000000ULL // bit 31 +#define ldlm_is_ast_discard_data(_l) LDLM_TEST_FLAG(( _l), 1ULL << 31) +#define ldlm_set_ast_discard_data(_l) LDLM_SET_FLAG(( _l), 1ULL << 31) +#define ldlm_clear_ast_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 31) + +/** + * Used for marking lock as a target for -EINTR while cp_ast sleep emulation + * + race with upcoming bl_ast. */ +#define LDLM_FL_FAIL_LOC 0x0000000100000000ULL // bit 32 +#define ldlm_is_fail_loc(_l) LDLM_TEST_FLAG(( _l), 1ULL << 32) +#define ldlm_set_fail_loc(_l) LDLM_SET_FLAG(( _l), 1ULL << 32) +#define ldlm_clear_fail_loc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 32) + +/** this lock is being destroyed */ +#define LDLM_FL_CBPENDING 0x0000000400000000ULL // bit 34 +#define ldlm_is_cbpending(_l) LDLM_TEST_FLAG(( _l), 1ULL << 34) +#define ldlm_set_cbpending(_l) LDLM_SET_FLAG(( _l), 1ULL << 34) +#define ldlm_clear_cbpending(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 34) + +/** not a real flag, not saved in lock */ +#define LDLM_FL_WAIT_NOREPROC 0x0000000800000000ULL // bit 35 +#define ldlm_is_wait_noreproc(_l) LDLM_TEST_FLAG(( _l), 1ULL << 35) +#define ldlm_set_wait_noreproc(_l) LDLM_SET_FLAG(( _l), 1ULL << 35) +#define ldlm_clear_wait_noreproc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 35) + +/** cancellation callback already run */ +#define LDLM_FL_CANCEL 0x0000001000000000ULL // bit 36 +#define ldlm_is_cancel(_l) LDLM_TEST_FLAG(( _l), 1ULL << 36) +#define ldlm_set_cancel(_l) LDLM_SET_FLAG(( _l), 1ULL << 36) +#define ldlm_clear_cancel(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 36) + +/** whatever it might mean -- never transmitted?
*/ +#define LDLM_FL_LOCAL_ONLY 0x0000002000000000ULL // bit 37 +#define ldlm_is_local_only(_l) LDLM_TEST_FLAG(( _l), 1ULL << 37) +#define ldlm_set_local_only(_l) LDLM_SET_FLAG(( _l), 1ULL << 37) +#define ldlm_clear_local_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 37) + +/** don't run the cancel callback under ldlm_cli_cancel_unused */ +#define LDLM_FL_FAILED 0x0000004000000000ULL // bit 38 +#define ldlm_is_failed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 38) +#define ldlm_set_failed(_l) LDLM_SET_FLAG(( _l), 1ULL << 38) +#define ldlm_clear_failed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 38) + +/** lock cancel has already been sent */ +#define LDLM_FL_CANCELING 0x0000008000000000ULL // bit 39 +#define ldlm_is_canceling(_l) LDLM_TEST_FLAG(( _l), 1ULL << 39) +#define ldlm_set_canceling(_l) LDLM_SET_FLAG(( _l), 1ULL << 39) +#define ldlm_clear_canceling(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 39) + +/** local lock (ie, no srv/cli split) */ +#define LDLM_FL_LOCAL 0x0000010000000000ULL // bit 40 +#define ldlm_is_local(_l) LDLM_TEST_FLAG(( _l), 1ULL << 40) +#define ldlm_set_local(_l) LDLM_SET_FLAG(( _l), 1ULL << 40) +#define ldlm_clear_local(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 40) + +/** + * XXX FIXME: This is being added to b_size as a low-risk fix to the + * fact that the LVB filling happens _after_ the lock has been granted, + * so another thread can match it before the LVB has been updated. As a + * dirty hack, we set LDLM_FL_LVB_READY only after we've done the LVB poop. + * this is only needed on LOV/OSC now, where LVB is actually used and + * callers must set it in input flags. + * + * The proper fix is to do the granting inside of the completion AST, + * which can be replaced with a LVB-aware wrapping function for OSC locks. + * That change is pretty high-risk, though, and would need a lot more + * testing. */ +#define LDLM_FL_LVB_READY 0x0000020000000000ULL // bit 41 +#define ldlm_is_lvb_ready(_l) LDLM_TEST_FLAG(( _l), 1ULL << 41) +#define ldlm_set_lvb_ready(_l) LDLM_SET_FLAG(( _l), 1ULL << 41) +#define ldlm_clear_lvb_ready(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 41) + +/** + * A lock contributes to the known minimum size (KMS) calculation until it + * has finished the part of its cancelation that performs write back on its + * dirty pages. It can remain on the granted list during this whole time. + * Threads racing to update the KMS after performing their writeback need + * to know to exclude each other's locks from the calculation as they walk + * the granted list. */ +#define LDLM_FL_KMS_IGNORE 0x0000040000000000ULL // bit 42 +#define ldlm_is_kms_ignore(_l) LDLM_TEST_FLAG(( _l), 1ULL << 42) +#define ldlm_set_kms_ignore(_l) LDLM_SET_FLAG(( _l), 1ULL << 42) +#define ldlm_clear_kms_ignore(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 42) + +/** completion AST to be executed */ +#define LDLM_FL_CP_REQD 0x0000080000000000ULL // bit 43 +#define ldlm_is_cp_reqd(_l) LDLM_TEST_FLAG(( _l), 1ULL << 43) +#define ldlm_set_cp_reqd(_l) LDLM_SET_FLAG(( _l), 1ULL << 43) +#define ldlm_clear_cp_reqd(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 43) + +/** cleanup_resource has already handled the lock */ +#define LDLM_FL_CLEANED 0x0000100000000000ULL // bit 44 +#define ldlm_is_cleaned(_l) LDLM_TEST_FLAG(( _l), 1ULL << 44) +#define ldlm_set_cleaned(_l) LDLM_SET_FLAG(( _l), 1ULL << 44) +#define ldlm_clear_cleaned(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 44) + +/** + * optimization hint: LDLM can run blocking callback from current context + * w/o involving separate thread. 
in order to decrease cs rate */ +#define LDLM_FL_ATOMIC_CB 0x0000200000000000ULL // bit 45 +#define ldlm_is_atomic_cb(_l) LDLM_TEST_FLAG(( _l), 1ULL << 45) +#define ldlm_set_atomic_cb(_l) LDLM_SET_FLAG(( _l), 1ULL << 45) +#define ldlm_clear_atomic_cb(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 45) + +/** + * It may happen that a client initiates two operations, e.g. unlink and + * mkdir, such that the server sends a blocking AST for conflicting locks + * to this client for the first operation, whereas the second operation + * has canceled this lock and is waiting for rpc_lock which is taken by + * the first operation. LDLM_FL_BL_AST is set by ldlm_callback_handler() in + * the lock to prevent the Early Lock Cancel (ELC) code from cancelling it. */ +#define LDLM_FL_BL_AST 0x0000400000000000ULL // bit 46 +#define ldlm_is_bl_ast(_l) LDLM_TEST_FLAG(( _l), 1ULL << 46) +#define ldlm_set_bl_ast(_l) LDLM_SET_FLAG(( _l), 1ULL << 46) +#define ldlm_clear_bl_ast(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 46) + +/** + * Set by ldlm_cancel_callback() when lock cache is dropped to let + * ldlm_callback_handler() return EINVAL to the server. It is used when + * ELC RPC is already prepared and is waiting for rpc_lock, too late to + * send a separate CANCEL RPC. */ +#define LDLM_FL_BL_DONE 0x0000800000000000ULL // bit 47 +#define ldlm_is_bl_done(_l) LDLM_TEST_FLAG(( _l), 1ULL << 47) +#define ldlm_set_bl_done(_l) LDLM_SET_FLAG(( _l), 1ULL << 47) +#define ldlm_clear_bl_done(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 47) + +/** + * Don't put lock into the LRU list, so that it is not canceled due + * to aging. Used by MGC locks, they are cancelled only at unmount or + * by callback. */ +#define LDLM_FL_NO_LRU 0x0001000000000000ULL // bit 48 +#define ldlm_is_no_lru(_l) LDLM_TEST_FLAG(( _l), 1ULL << 48) +#define ldlm_set_no_lru(_l) LDLM_SET_FLAG(( _l), 1ULL << 48) +#define ldlm_clear_no_lru(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 48) + +/** + * Set for locks that failed and where the server has been notified. + * + * Protected by lock and resource locks. */ +#define LDLM_FL_FAIL_NOTIFIED 0x0002000000000000ULL // bit 49 +#define ldlm_is_fail_notified(_l) LDLM_TEST_FLAG(( _l), 1ULL << 49) +#define ldlm_set_fail_notified(_l) LDLM_SET_FLAG(( _l), 1ULL << 49) +#define ldlm_clear_fail_notified(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 49) + +/** + * Set for locks that were removed from class hash table and will + * be destroyed when last reference to them is released. Set by + * ldlm_lock_destroy_internal(). + * + * Protected by lock and resource locks. */ +#define LDLM_FL_DESTROYED 0x0004000000000000ULL // bit 50 +#define ldlm_is_destroyed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 50) +#define ldlm_set_destroyed(_l) LDLM_SET_FLAG(( _l), 1ULL << 50) +#define ldlm_clear_destroyed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 50) + +/** flag whether this is a server namespace lock */ +#define LDLM_FL_SERVER_LOCK 0x0008000000000000ULL // bit 51 +#define ldlm_is_server_lock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 51) +#define ldlm_set_server_lock(_l) LDLM_SET_FLAG(( _l), 1ULL << 51) +#define ldlm_clear_server_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 51) + +/** + * It's set in lock_res_and_lock() and unset in unlock_res_and_lock(). + * + * NB: compared with check_res_locked(), checking this bit is cheaper. + * Also, spin_is_locked() is deprecated for kernel code; one reason is + * because it works only for SMP so user needs to add extra macros like + * LASSERT_SPIN_LOCKED for uniprocessor kernels. 
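+ * As an illustration, a hot path that must run with the resource lock held
+ * can therefore assert it cheaply through the lock itself, e.g.
+ *
+ *	LASSERT(ldlm_is_res_locked(lock));
+ *
+ * instead of calling check_res_locked() on the lock's resource.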
*/ +#define LDLM_FL_RES_LOCKED 0x0010000000000000ULL // bit 52 +#define ldlm_is_res_locked(_l) LDLM_TEST_FLAG(( _l), 1ULL << 52) +#define ldlm_set_res_locked(_l) LDLM_SET_FLAG(( _l), 1ULL << 52) +#define ldlm_clear_res_locked(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 52) + +/** + * It's set once we call ldlm_add_waiting_lock_res_locked() to start the + * lock-timeout timer and it will never be reset. + * + * Protected by lock and resource locks. */ +#define LDLM_FL_WAITED 0x0020000000000000ULL // bit 53 +#define ldlm_is_waited(_l) LDLM_TEST_FLAG(( _l), 1ULL << 53) +#define ldlm_set_waited(_l) LDLM_SET_FLAG(( _l), 1ULL << 53) +#define ldlm_clear_waited(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 53) + +/** Flag whether this is a server namespace lock. */ +#define LDLM_FL_NS_SRV 0x0040000000000000ULL // bit 54 +#define ldlm_is_ns_srv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 54) +#define ldlm_set_ns_srv(_l) LDLM_SET_FLAG(( _l), 1ULL << 54) +#define ldlm_clear_ns_srv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 54) + +/** Flag whether this lock can be reused. Used by exclusive open. */ +#define LDLM_FL_EXCL 0x0080000000000000ULL // bit 55 +#define ldlm_is_excl(_l) LDLM_TEST_FLAG(( _l), 1ULL << 55) +#define ldlm_set_excl(_l) LDLM_SET_FLAG(( _l), 1ULL << 55) +#define ldlm_clear_excl(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 55) + +/** Flag whether a lock is found on the server for a re-sent RPC. */ +#define LDLM_FL_RESENT 0x0100000000000000ULL // bit 56 + +/** Flag whether Commit-on-Sharing is enabled; if LDLM_FL_COS_INCOMPAT is set + * this flag may not be set, because once the former is set this flag is not + * checked, and for a cross-MDT lock COS_INCOMPAT is always set but the AST + * handling is in LDLM context, which does not know whether COS is enabled. */ +#define LDLM_FL_COS_ENABLED 0x0200000000000000ULL /* bit 57 */ +#define ldlm_is_cos_enabled(_l) LDLM_TEST_FLAG((_l), 1ULL << 57) +#define ldlm_set_cos_enabled(_l) LDLM_SET_FLAG((_l), 1ULL << 57) + +/** + * This flag means to use a non-delay RPC to send the DLM request. + */ +#define LDLM_FL_NDELAY 0x0400000000000000ULL /* bit 58 */ +#define ldlm_is_ndelay(_l) LDLM_TEST_FLAG((_l), 1ULL << 58) +#define ldlm_set_ndelay(_l) LDLM_SET_FLAG((_l), 1ULL << 58) + +/** + * LVB from this lock is cached in the OSC object. + */ +#define LDLM_FL_LVB_CACHED 0x0800000000000000ULL /* bit 59 */ +#define ldlm_is_lvb_cached(_l) LDLM_TEST_FLAG((_l), 1ULL << 59) +#define ldlm_set_lvb_cached(_l) LDLM_SET_FLAG((_l), 1ULL << 59) +#define ldlm_clear_lvb_cached(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 59) + +/** l_flags bits marked as "ast" bits */ +#define LDLM_FL_AST_MASK (LDLM_FL_FLOCK_DEADLOCK |\ + LDLM_FL_DISCARD_DATA) + +/** l_flags bits marked as "blocked" bits */ +#define LDLM_FL_BLOCKED_MASK (LDLM_FL_BLOCK_GRANTED |\ + LDLM_FL_BLOCK_WAIT) + +/** l_flags bits marked as "gone" bits */ +#define LDLM_FL_GONE_MASK (LDLM_FL_DESTROYED |\ + LDLM_FL_FAILED) + +/** l_flags bits marked as "inherit" bits + * Flags inherited from wire on enqueue/reply between client/server. + * CANCEL_ON_BLOCK so server will not grant if a blocking lock is found + * NO_TIMEOUT flag to force ldlm_lock_match() to wait with no timeout. + * TEST_LOCK flag to not let a TEST lock be granted.
+ * NO_EXPANSION to tell server not to expand extent of lock request */ +#define LDLM_FL_INHERIT_MASK (LDLM_FL_CANCEL_ON_BLOCK |\ + LDLM_FL_NO_TIMEOUT |\ + LDLM_FL_TEST_LOCK |\ + LDLM_FL_NO_EXPANSION) + +/** flags returned in @flags parameter on ldlm_lock_enqueue, + * to be re-constructed on re-send */ +#define LDLM_FL_SRV_ENQ_MASK (LDLM_FL_LOCK_CHANGED |\ + LDLM_FL_BLOCKED_MASK |\ + LDLM_FL_NO_TIMEOUT) + +/** test for ldlm_lock flag bit set */ +#define LDLM_TEST_FLAG(_l, _b) (((_l)->l_flags & (_b)) != 0) + +/** multi-bit test: are any of mask bits set? */ +#define LDLM_HAVE_MASK(_l, _m) (((_l)->l_flags & LDLM_FL_##_m##_MASK) != 0) + +/** set a ldlm_lock flag bit */ +#define LDLM_SET_FLAG(_l, _b) ((_l)->l_flags |= (_b)) + +/** clear a ldlm_lock flag bit */ +#define LDLM_CLEAR_FLAG(_l, _b) ((_l)->l_flags &= ~(_b)) + +/** @} subgroup */ +/** @} group */ +#endif /* LDLM_ALL_FLAGS_MASK */ + diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_errno.h b/drivers/staging/lustrefsx/lustre/include/lustre_errno.h new file mode 100644 index 0000000000000..fe9ccd2e07a82 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_errno.h @@ -0,0 +1,218 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * GPL HEADER END + */ +/* + * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved. + * + * Copyright (c) 2013, Intel Corporation. + */ + +#ifndef LUSTRE_ERRNO_H +#define LUSTRE_ERRNO_H + +/* + * Only "network" errnos, which are defined below, are allowed on wire (or on + * disk). Generic routines exist to help translate between these and a subset + * of the "host" errnos. Some host errnos (e.g., EDEADLOCK) are intentionally + * left out. See also the comment on lustre_errno_hton_mapping[]. + * + * To maintain compatibility with existing x86 clients and servers, each of + * these network errnos has the same numerical value as its corresponding host + * errno on x86. 
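+ * As an illustration (only the LUSTRE_* values below are definitive): on an
+ * architecture whose native EPROTO is not 71, a sender would put
+ * lustre_errno_hton(EPROTO) == LUSTRE_EPROTO (71) on the wire and the peer
+ * would map it back with lustre_errno_ntoh(); on x86 both helpers compile
+ * away to the identity, so values travel unchanged.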
+ */ +#define LUSTRE_EPERM 1 /* Operation not permitted */ +#define LUSTRE_ENOENT 2 /* No such file or directory */ +#define LUSTRE_ESRCH 3 /* No such process */ +#define LUSTRE_EINTR 4 /* Interrupted system call */ +#define LUSTRE_EIO 5 /* I/O error */ +#define LUSTRE_ENXIO 6 /* No such device or address */ +#define LUSTRE_E2BIG 7 /* Argument list too long */ +#define LUSTRE_ENOEXEC 8 /* Exec format error */ +#define LUSTRE_EBADF 9 /* Bad file number */ +#define LUSTRE_ECHILD 10 /* No child processes */ +#define LUSTRE_EAGAIN 11 /* Try again */ +#define LUSTRE_ENOMEM 12 /* Out of memory */ +#define LUSTRE_EACCES 13 /* Permission denied */ +#define LUSTRE_EFAULT 14 /* Bad address */ +#define LUSTRE_ENOTBLK 15 /* Block device required */ +#define LUSTRE_EBUSY 16 /* Device or resource busy */ +#define LUSTRE_EEXIST 17 /* File exists */ +#define LUSTRE_EXDEV 18 /* Cross-device link */ +#define LUSTRE_ENODEV 19 /* No such device */ +#define LUSTRE_ENOTDIR 20 /* Not a directory */ +#define LUSTRE_EISDIR 21 /* Is a directory */ +#define LUSTRE_EINVAL 22 /* Invalid argument */ +#define LUSTRE_ENFILE 23 /* File table overflow */ +#define LUSTRE_EMFILE 24 /* Too many open files */ +#define LUSTRE_ENOTTY 25 /* Not a typewriter */ +#define LUSTRE_ETXTBSY 26 /* Text file busy */ +#define LUSTRE_EFBIG 27 /* File too large */ +#define LUSTRE_ENOSPC 28 /* No space left on device */ +#define LUSTRE_ESPIPE 29 /* Illegal seek */ +#define LUSTRE_EROFS 30 /* Read-only file system */ +#define LUSTRE_EMLINK 31 /* Too many links */ +#define LUSTRE_EPIPE 32 /* Broken pipe */ +#define LUSTRE_EDOM 33 /* Math argument out of domain of + func */ +#define LUSTRE_ERANGE 34 /* Math result not representable */ +#define LUSTRE_EDEADLK 35 /* Resource deadlock would occur */ +#define LUSTRE_ENAMETOOLONG 36 /* File name too long */ +#define LUSTRE_ENOLCK 37 /* No record locks available */ +#define LUSTRE_ENOSYS 38 /* Function not implemented */ +#define LUSTRE_ENOTEMPTY 39 /* Directory not empty */ +#define LUSTRE_ELOOP 40 /* Too many symbolic links + encountered */ +#define LUSTRE_ENOMSG 42 /* No message of desired type */ +#define LUSTRE_EIDRM 43 /* Identifier removed */ +#define LUSTRE_ECHRNG 44 /* Channel number out of range */ +#define LUSTRE_EL2NSYNC 45 /* Level 2 not synchronized */ +#define LUSTRE_EL3HLT 46 /* Level 3 halted */ +#define LUSTRE_EL3RST 47 /* Level 3 reset */ +#define LUSTRE_ELNRNG 48 /* Link number out of range */ +#define LUSTRE_EUNATCH 49 /* Protocol driver not attached */ +#define LUSTRE_ENOCSI 50 /* No CSI structure available */ +#define LUSTRE_EL2HLT 51 /* Level 2 halted */ +#define LUSTRE_EBADE 52 /* Invalid exchange */ +#define LUSTRE_EBADR 53 /* Invalid request descriptor */ +#define LUSTRE_EXFULL 54 /* Exchange full */ +#define LUSTRE_ENOANO 55 /* No anode */ +#define LUSTRE_EBADRQC 56 /* Invalid request code */ +#define LUSTRE_EBADSLT 57 /* Invalid slot */ +#define LUSTRE_EBFONT 59 /* Bad font file format */ +#define LUSTRE_ENOSTR 60 /* Device not a stream */ +#define LUSTRE_ENODATA 61 /* No data available */ +#define LUSTRE_ETIME 62 /* Timer expired */ +#define LUSTRE_ENOSR 63 /* Out of streams resources */ +#define LUSTRE_ENONET 64 /* Machine is not on the network */ +#define LUSTRE_ENOPKG 65 /* Package not installed */ +#define LUSTRE_EREMOTE 66 /* Object is remote */ +#define LUSTRE_ENOLINK 67 /* Link has been severed */ +#define LUSTRE_EADV 68 /* Advertise error */ +#define LUSTRE_ESRMNT 69 /* Srmount error */ +#define LUSTRE_ECOMM 70 /* Communication error on send */ +#define 
LUSTRE_EPROTO 71 /* Protocol error */ +#define LUSTRE_EMULTIHOP 72 /* Multihop attempted */ +#define LUSTRE_EDOTDOT 73 /* RFS specific error */ +#define LUSTRE_EBADMSG 74 /* Not a data message */ +#define LUSTRE_EOVERFLOW 75 /* Value too large for defined data + type */ +#define LUSTRE_ENOTUNIQ 76 /* Name not unique on network */ +#define LUSTRE_EBADFD 77 /* File descriptor in bad state */ +#define LUSTRE_EREMCHG 78 /* Remote address changed */ +#define LUSTRE_ELIBACC 79 /* Can not access a needed shared + library */ +#define LUSTRE_ELIBBAD 80 /* Accessing a corrupted shared + library */ +#define LUSTRE_ELIBSCN 81 /* .lib section in a.out corrupted */ +#define LUSTRE_ELIBMAX 82 /* Attempting to link in too many shared + libraries */ +#define LUSTRE_ELIBEXEC 83 /* Cannot exec a shared library + directly */ +#define LUSTRE_EILSEQ 84 /* Illegal byte sequence */ +#define LUSTRE_ERESTART 85 /* Interrupted system call should be + restarted */ +#define LUSTRE_ESTRPIPE 86 /* Streams pipe error */ +#define LUSTRE_EUSERS 87 /* Too many users */ +#define LUSTRE_ENOTSOCK 88 /* Socket operation on non-socket */ +#define LUSTRE_EDESTADDRREQ 89 /* Destination address required */ +#define LUSTRE_EMSGSIZE 90 /* Message too long */ +#define LUSTRE_EPROTOTYPE 91 /* Protocol wrong type for socket */ +#define LUSTRE_ENOPROTOOPT 92 /* Protocol not available */ +#define LUSTRE_EPROTONOSUPPORT 93 /* Protocol not supported */ +#define LUSTRE_ESOCKTNOSUPPORT 94 /* Socket type not supported */ +#define LUSTRE_EOPNOTSUPP 95 /* Operation not supported on transport + endpoint */ +#define LUSTRE_EPFNOSUPPORT 96 /* Protocol family not supported */ +#define LUSTRE_EAFNOSUPPORT 97 /* Address family not supported by + protocol */ +#define LUSTRE_EADDRINUSE 98 /* Address already in use */ +#define LUSTRE_EADDRNOTAVAIL 99 /* Cannot assign requested address */ +#define LUSTRE_ENETDOWN 100 /* Network is down */ +#define LUSTRE_ENETUNREACH 101 /* Network is unreachable */ +#define LUSTRE_ENETRESET 102 /* Network dropped connection because of + reset */ +#define LUSTRE_ECONNABORTED 103 /* Software caused connection abort */ +#define LUSTRE_ECONNRESET 104 /* Connection reset by peer */ +#define LUSTRE_ENOBUFS 105 /* No buffer space available */ +#define LUSTRE_EISCONN 106 /* Transport endpoint is already + connected */ +#define LUSTRE_ENOTCONN 107 /* Transport endpoint is not + connected */ +#define LUSTRE_ESHUTDOWN 108 /* Cannot send after transport endpoint + shutdown */ +#define LUSTRE_ETOOMANYREFS 109 /* Too many references: cannot splice */ +#define LUSTRE_ETIMEDOUT 110 /* Connection timed out */ +#define LUSTRE_ECONNREFUSED 111 /* Connection refused */ +#define LUSTRE_EHOSTDOWN 112 /* Host is down */ +#define LUSTRE_EHOSTUNREACH 113 /* No route to host */ +#define LUSTRE_EALREADY 114 /* Operation already in progress */ +#define LUSTRE_EINPROGRESS 115 /* Operation now in progress */ +#define LUSTRE_ESTALE 116 /* Stale NFS file handle */ +#define LUSTRE_EUCLEAN 117 /* Structure needs cleaning */ +#define LUSTRE_ENOTNAM 118 /* Not a XENIX named type file */ +#define LUSTRE_ENAVAIL 119 /* No XENIX semaphores available */ +#define LUSTRE_EISNAM 120 /* Is a named type file */ +#define LUSTRE_EREMOTEIO 121 /* Remote I/O error */ +#define LUSTRE_EDQUOT 122 /* Quota exceeded */ +#define LUSTRE_ENOMEDIUM 123 /* No medium found */ +#define LUSTRE_EMEDIUMTYPE 124 /* Wrong medium type */ +#define LUSTRE_ECANCELED 125 /* Operation Canceled */ +#define LUSTRE_ENOKEY 126 /* Required key not available */ +#define LUSTRE_EKEYEXPIRED 127 /* 
Key has expired */ +#define LUSTRE_EKEYREVOKED 128 /* Key has been revoked */ +#define LUSTRE_EKEYREJECTED 129 /* Key was rejected by service */ +#define LUSTRE_EOWNERDEAD 130 /* Owner died */ +#define LUSTRE_ENOTRECOVERABLE 131 /* State not recoverable */ +#define LUSTRE_ERESTARTSYS 512 +#define LUSTRE_ERESTARTNOINTR 513 +#define LUSTRE_ERESTARTNOHAND 514 /* restart if no handler.. */ +#define LUSTRE_ENOIOCTLCMD 515 /* No ioctl command */ +#define LUSTRE_ERESTART_RESTARTBLOCK 516 /* restart by calling + sys_restart_syscall */ +#define LUSTRE_EBADHANDLE 521 /* Illegal NFS file handle */ +#define LUSTRE_ENOTSYNC 522 /* Update synchronization mismatch */ +#define LUSTRE_EBADCOOKIE 523 /* Cookie is stale */ +#define LUSTRE_ENOTSUPP 524 /* Operation is not supported */ +#define LUSTRE_ETOOSMALL 525 /* Buffer or request is too small */ +#define LUSTRE_ESERVERFAULT 526 /* An untranslatable error occurred */ +#define LUSTRE_EBADTYPE 527 /* Type not supported by server */ +#define LUSTRE_EJUKEBOX 528 /* Request initiated, but will not + complete before timeout */ +#define LUSTRE_EIOCBQUEUED 529 /* iocb queued, will get completion + event */ + +/* + * Translations are optimized away on x86. Host errnos that shouldn't be put + * on wire could leak through as a result. Do not count on this side effect. + */ +#if !defined(__x86_64__) && !defined(__i386__) +#define LUSTRE_TRANSLATE_ERRNOS +#endif + +#ifdef LUSTRE_TRANSLATE_ERRNOS +unsigned int lustre_errno_hton(unsigned int h); +unsigned int lustre_errno_ntoh(unsigned int n); +#else +#define lustre_errno_hton(h) (h) +#define lustre_errno_ntoh(n) (n) +#endif + +#endif /* LUSTRE_ERRNO_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_export.h b/drivers/staging/lustrefsx/lustre/include/lustre_export.h new file mode 100644 index 0000000000000..a0682d85620c4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_export.h @@ -0,0 +1,519 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +/** \defgroup obd_export PortalRPC export definitions + * + * @{ + */ + +#ifndef __EXPORT_H +#define __EXPORT_H + +/** \defgroup export export + * + * @{ + */ + +#include +#include + +#include +#include +#include + +struct mds_client_data; +struct mdt_client_data; +struct mds_idmap_table; +struct mdt_idmap_table; + +/** + * Target-specific export data + */ +struct tg_export_data { + /** Protects ted_lcd, ted_reply_* and + * ted_release_* fields below */ + struct mutex ted_lcd_lock; + /** Per-client data for each export */ + struct lsd_client_data *ted_lcd; + /** Offset of record in last_rcvd file */ + loff_t ted_lr_off; + /** Client index in last_rcvd file */ + int ted_lr_idx; + + /** + * ted_nodemap_lock is used to ensure that the nodemap is not destroyed + * between the time that ted_nodemap is checked for NULL, and a + * reference is taken. Modifications to ted_nodemap require that the + * active_config_lock and the nodemap(s)'s nm_member_list_lock be + * taken, as well as ted_nodemap_lock, so the export can be properly + * added to or removed from the nodemap's member list. When an export + * is added to a nodemap, a reference on that nodemap must be taken. + * That reference can be put only after ted_nodemap no longer refers to + * it. + */ + spinlock_t ted_nodemap_lock; + struct lu_nodemap *ted_nodemap; + struct list_head ted_nodemap_member; + + /** last version of nodemap config sent to client */ + __u64 ted_nodemap_version; + + /* Every reply data fields below are + * protected by ted_lcd_lock */ + /** List of reply data */ + struct list_head ted_reply_list; + int ted_reply_cnt; + /** Reply data with highest transno is retained */ + struct tg_reply_data *ted_reply_last; + /* Statistics */ + int ted_reply_max; /* high water mark */ + int ted_release_xid; + int ted_release_tag; + /* grants */ + long ted_dirty; /* in bytes */ + long ted_grant; /* in bytes */ + long ted_pending; /* bytes just being written */ + __u8 ted_pagebits; /* log2 of client page size */ + + /** + * File Modification Data (FMD) tracking + */ + spinlock_t ted_fmd_lock; /* protects ted_fmd_list */ + struct list_head ted_fmd_list; /* FIDs being modified */ + int ted_fmd_count;/* items in ted_fmd_list */ +}; + +/** + * MDT-specific export data + */ +struct mdt_export_data { + struct tg_export_data med_ted; + /** List of all files opened by client on this MDT */ + struct list_head med_open_head; + spinlock_t med_open_lock; /* med_open_head, mfd_list */ +}; + +struct ec_export_data { /* echo client */ + struct list_head eced_locks; +}; + +/* In-memory access to client data from OST struct */ +/** Filter (oss-side) specific import data */ +struct filter_export_data { + struct tg_export_data fed_ted; + __u64 fed_lastid_gen; + /* count of SOFT_SYNC RPCs, which will be reset after + * ofd_soft_sync_limit number of RPCs, and trigger a sync. */ + atomic_t fed_soft_sync_count; + __u32 fed_group; +}; + +struct mgs_export_data { + struct list_head med_clients; /* mgc fs client via this exp */ + spinlock_t med_lock; /* protect med_clients */ +}; + +/** + * per-NID statistics structure. 
+ * It tracks access patterns to this export on a per-client-NID basis + */ +struct nid_stat { + struct lnet_nid nid; + struct hlist_node nid_hash; + struct list_head nid_list; + struct obd_device *nid_obd; + struct proc_dir_entry *nid_proc; + struct lprocfs_stats *nid_stats; + struct lprocfs_stats *nid_ldlm_stats; + atomic_t nid_exp_ref_count; /* for obd_nid_stats_hash + exp_nid_stats */ +}; + +#define nidstat_getref(nidstat) \ +do { \ + atomic_inc(&(nidstat)->nid_exp_ref_count); \ +} while(0) + +#define nidstat_putref(nidstat) \ +do { \ + atomic_dec(&(nidstat)->nid_exp_ref_count); \ + LASSERTF(atomic_read(&(nidstat)->nid_exp_ref_count) >= 0, \ + "stat %p nid_exp_ref_count < 0\n", nidstat); \ +} while(0) + +enum obd_option { + OBD_OPT_FORCE = 0x0001, + OBD_OPT_FAILOVER = 0x0002, + OBD_OPT_ABORT_RECOV = 0x0004, +}; + +/** + * Export structure. Represents target-side of connection in portals. + * Also used in Lustre to connect between layers on the same node when + * there is no network-connection in-between. + * For every connected client there is an export structure on the server + * attached to the same obd device. + */ +struct obd_export { + /** + * Export handle, it's id is provided to client on connect + * Subsequent client RPCs contain this handle id to identify + * what export they are talking to. + */ + struct portals_handle exp_handle; + /** + * Set of counters below is to track where export references are + * kept. The exp_rpc_count is used for reconnect handling also, + * the cb_count and locks_count are for debug purposes only for now. + * The sum of them should be less than exp_handle.href by 3 + */ + atomic_t exp_rpc_count; /* RPC references */ + atomic_t exp_cb_count; /* Commit callback references */ + /** Number of queued replay requests to be processes */ + atomic_t exp_replay_count; + atomic_t exp_locks_count; /** Lock references */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS + struct list_head exp_locks_list; + spinlock_t exp_locks_list_guard; +#endif + /** UUID of client connected to this export */ + struct obd_uuid exp_client_uuid; + /** To link all exports on an obd device */ + struct list_head exp_obd_chain; + /** work_struct for destruction of export */ + struct work_struct exp_zombie_work; + /* Unlinked export list */ + struct list_head exp_stale_list; + struct rhash_head exp_uuid_hash; /** uuid-export hash */ + struct rhlist_head exp_nid_hash; /** nid-export hash */ + struct hlist_node exp_gen_hash; /** last_rcvd clt gen hash */ + /** + * All exports eligible for ping evictor are linked into a list + * through this field in "most time since last request on this export" + * order + * protected by obd_dev_lock + */ + struct list_head exp_obd_chain_timed; + /** Obd device of this export */ + struct obd_device *exp_obd; + /** + * "reverse" import to send requests (e.g. from ldlm) back to client + * exp_lock protect its change + */ + struct obd_import *exp_imp_reverse; + struct nid_stat *exp_nid_stats; + /** Active connetion */ + struct ptlrpc_connection *exp_connection; + /** Connection count value from last successful reconnect rpc */ + __u32 exp_conn_cnt; + /** Hash list of all ldlm locks granted on this export */ + struct cfs_hash *exp_lock_hash; + /** + * Hash list for Posix lock deadlock detection, added with + * ldlm_lock::l_exp_flock_hash. 
+ */ + struct cfs_hash *exp_flock_hash; + struct list_head exp_outstanding_replies; + struct list_head exp_uncommitted_replies; + spinlock_t exp_uncommitted_replies_lock; + /** Last committed transno for this export */ + __u64 exp_last_committed; + /** When was last request received */ + time64_t exp_last_request_time; + /** On replay all requests waiting for replay are linked here */ + struct list_head exp_req_replay_queue; + /** + * protects exp_flags, exp_outstanding_replies and the change + * of exp_imp_reverse + */ + spinlock_t exp_lock; + /** Compatibility flags for this export are embedded into + * exp_connect_data */ + struct obd_connect_data exp_connect_data; + enum obd_option exp_flags; + unsigned long exp_failed:1, + exp_in_recovery:1, + exp_disconnected:1, + exp_connecting:1, + /** VBR: export missed recovery */ + exp_delayed:1, + /** VBR: failed version checking */ + exp_vbr_failed:1, + exp_req_replay_needed:1, + exp_lock_replay_needed:1, + exp_need_sync:1, + exp_flvr_changed:1, + exp_flvr_adapt:1, + /* if to swap nidtbl entries for 2.2 clients. + * Only used by the MGS to fix LU-1644. */ + exp_need_mne_swab:1, + /* The export already got final replay ping + * request. */ + exp_replay_done:1, + /* local client with recovery disabled */ + exp_no_recovery:1, + exp_hashed:1; + /* also protected by exp_lock */ + enum lustre_sec_part exp_sp_peer; + struct sptlrpc_flavor exp_flvr; /* current */ + struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ + time64_t exp_flvr_expire[2]; /* seconds */ + + /** protects exp_hp_rpcs */ + spinlock_t exp_rpc_lock; + struct list_head exp_hp_rpcs; /* (potential) HP RPCs */ + struct list_head exp_reg_rpcs; /* RPC being handled */ + + /** blocking dlm lock list, protected by exp_bl_list_lock */ + struct list_head exp_bl_list; + spinlock_t exp_bl_list_lock; + + /** Target specific data */ + union { + struct tg_export_data eu_target_data; + struct mdt_export_data eu_mdt_data; + struct filter_export_data eu_filter_data; + struct ec_export_data eu_ec_data; + struct mgs_export_data eu_mgs_data; + } u; + + struct adaptive_timeout exp_bl_lock_at; + + /** highest XID received by export client that has no + * unreceived lower-numbered XID + */ + __u64 exp_last_xid; + long *exp_used_slots; +}; + +#define exp_target_data u.eu_target_data +#define exp_mdt_data u.eu_mdt_data +#define exp_filter_data u.eu_filter_data +#define exp_ec_data u.eu_ec_data + +static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp) +{ + return &exp->exp_connect_data.ocd_connect_flags; +} + +static inline __u64 exp_connect_flags(struct obd_export *exp) +{ + return *exp_connect_flags_ptr(exp); +} + +static inline __u64 *exp_connect_flags2_ptr(struct obd_export *exp) +{ + return &exp->exp_connect_data.ocd_connect_flags2; +} + +static inline __u64 exp_connect_flags2(struct obd_export *exp) +{ + if (exp_connect_flags(exp) & OBD_CONNECT_FLAGS2) + return *exp_connect_flags2_ptr(exp); + return 0; +} + +static inline int exp_max_brw_size(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE) + return exp->exp_connect_data.ocd_brw_size; + + return ONE_MB_BRW_SIZE; +} + +static inline int exp_connect_multibulk(struct obd_export *exp) +{ + return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE; +} + +static inline int exp_connect_cancelset(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_CANCELSET); +} + +static inline int exp_connect_lru_resize(struct obd_export *exp) +{ + LASSERT(exp != 
NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_vbr(struct obd_export *exp) +{ + LASSERT(exp != NULL); + LASSERT(exp->exp_connection); + return !!(exp_connect_flags(exp) & OBD_CONNECT_VBR); +} + +static inline int exp_connect_umask(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_UMASK); +} + +static inline int imp_connect_lru_resize(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + return !!(ocd->ocd_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_layout(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK); +} + +static inline bool exp_connect_lvb_type(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +static inline bool imp_connect_lvb_type(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + if (ocd->ocd_connect_flags & OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +static inline bool imp_connect_disp_stripe(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + return ocd->ocd_connect_flags & OBD_CONNECT_DISP_STRIPE; +} + +static inline bool imp_connect_shortio(struct obd_import *imp) +{ + struct obd_connect_data *ocd = &imp->imp_connect_data; + + return ocd->ocd_connect_flags & OBD_CONNECT_SHORTIO; +} + +static inline __u64 exp_connect_ibits(struct obd_export *exp) +{ + struct obd_connect_data *ocd; + + ocd = &exp->exp_connect_data; + return ocd->ocd_ibits_known; +} + +static inline int exp_connect_large_acl(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LARGE_ACL); +} + +static inline int exp_connect_lockahead(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_LOCKAHEAD); +} + +static inline int exp_connect_overstriping(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_OVERSTRIPING); +} + +static inline int exp_connect_flr(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_FLR); +} + +static inline int exp_bypass_mdll(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_MDLL_BYPASS); +} + +static inline int exp_mdll(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_MDLL); +} + +static inline int exp_connect_lock_convert(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_LOCK_CONVERT); +} + +extern struct obd_export *class_conn2export(struct lustre_handle *conn); + +static inline int exp_connect_archive_id_array(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_ARCHIVE_ID_ARRAY); +} + +static inline int exp_connect_sepol(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_SELINUX_POLICY); +} + +static inline int exp_connect_encrypt(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_ENCRYPT); +} + +static inline int exp_connect_lseek(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_LSEEK); +} + +static inline int exp_connect_dom_lvb(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_DOM_LVB); +} + +enum { + /* archive_ids in array format */ + KKUC_CT_DATA_ARRAY_MAGIC = 0x092013cea, + /* archive_ids in bitmap 
format */ + KKUC_CT_DATA_BITMAP_MAGIC = 0x082018cea, +}; + + +struct kkuc_ct_data { + __u32 kcd_magic; + __u32 kcd_nr_archives; + __u32 kcd_archives[0]; +}; + +/** @} export */ + +#endif /* __EXPORT_H */ +/** @} obd_export */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h new file mode 100644 index 0000000000000..e82b847885eac --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h @@ -0,0 +1,953 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_fid.h + * + * Author: Yury Umanets + */ + +#ifndef __LUSTRE_FID_H +#define __LUSTRE_FID_H + +/** \defgroup fid fid + * + * @{ + * + * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs + * describes the FID namespace and interoperability requirements for FIDs. + * The important parts of that document are included here for reference. + * + * FID + * File IDentifier generated by client from range allocated by the SEQuence + * service and stored in struct lu_fid. The FID is composed of three parts: + * SEQuence, ObjectID, and VERsion. The SEQ component is a filesystem + * unique 64-bit integer, and only one client is ever assigned any SEQ value. + * The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved + * for system use. The OID component is a 32-bit value generated by the + * client on a per-SEQ basis to allow creating many unique FIDs without + * communication with the server. The VER component is a 32-bit value that + * distinguishes between different FID instantiations, such as snapshots or + * separate subtrees within the filesystem. FIDs with the same VER field + * are considered part of the same namespace. + * + * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and + * MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while + * OSTs use 64-bit Lustre object IDs and generation numbers. + * + * NEW filesystems are those formatted since the introduction of FIDs. + * + * IGIF + * Inode and Generation In FID, a surrogate FID used to globally identify + * an existing object on OLD formatted MDT file system. This would only be + * used on MDT0 in a DNE filesystem, because there cannot be more than one + * MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1] + * range, where inode number is stored in SEQ, and inode generation is in OID. 
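+ * For example (illustrative values), an MDT0 inode number 0x2345 with
+ * generation 7 would be represented by the IGIF FID [0x2345:0x7:0x0],
+ * i.e. f_seq = 0x2345, f_oid = 7, f_ver = 0.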
+ * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem, + * which is the maximum possible for an ldiskfs backend. It also assumes + * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible + * to clients, which has always been true. + * + * IDIF + * object ID In FID, a surrogate FID used to globally identify an existing + * OST object on OLD formatted OST file system. Belongs to a sequence in + * [2^32, 2^33 - 1]. Sequence number is calculated as: + * + * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff) + * + * that is, SEQ consists of 16-bit OST index, and higher 16 bits of object + * ID. The generation of unique SEQ values per OST allows the IDIF FIDs to + * be identified in the FLD correctly. The OID field is calculated as: + * + * objid & 0xffffffff + * + * that is, it consists of lower 32 bits of object ID. For objects within + * the IDIF range, object ID extraction will be: + * + * o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid; + * o_seq = 0; // formerly group number + * + * NOTE: This assumes that no more than 2^48-1 objects have ever been created + * on any OST, and that no more than 65535 OSTs are in use. Both are very + * reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming + * a maximum creation rate of 1M objects per second for a maximum of 9 years, + * or combinations thereof. + * + * OST_MDT0 + * Surrogate FID used to identify an existing object on OLD formatted OST + * filesystem. Belongs to the reserved SEQuence 0, and is used prior to + * the introduction of FID-on-OST, at which point IDIF will be used to + * identify objects as residing on a specific OST. + * + * LLOG + * For Lustre Log objects the object sequence 1 is used. This is compatible + * with both OLD and NEW namespaces, as this SEQ number is in the + * ext3/ldiskfs reserved inode range and does not conflict with IGIF + * sequence numbers. + * + * ECHO + * For testing OST IO performance the object sequence 2 is used. This is + * compatible with both OLD and NEW namespaces, as this SEQ number is in + * the ext3/ldiskfs reserved inode range and does not conflict with IGIF + * sequence numbers. + * + * OST_MDT1 .. OST_MAX + * For testing with multiple MDTs the object sequence 3 through 9 is used, + * allowing direct mapping of MDTs 1 through 7 respectively, for a total + * of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group" + * mappings. However, this SEQ range is only for testing prior to any + * production DNE release, as the objects in this range conflict across all + * OSTs, as the OST index is not part of the FID. For production DNE usage, + * OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs. + * + * DLM OST objid to IDIF mapping + * For compatibility with existing OLD OST network protocol structures, the + * FID must map onto the o_id and o_seq in a manner that ensures existing + * objects are identified consistently for IO, as well as onto the LDLM + * namespace to ensure IDIFs there is only a single resource name for any + * object in the DLM. 
The OLD OST object DLM resource mapping is: + * + * resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases + * + * The NEW OST object DLM resource mapping is the same for both MDT and OST: + * + * resource[] = {SEQ, OID, VER, HASH}; + * + * NOTE: for mapping IDIF values to DLM resource names the o_id may be + * larger than the 2^33 reserved sequence numbers for IDIF, so it is possible + * for the o_id numbers to overlap FID SEQ numbers in the resource. However, + * in all production releases the OLD o_seq field is always zero, and all + * valid FID OID values are non-zero, so the lock resources will not collide. + * Even so, the MDT and OST resources are also in different LDLM namespaces. + */ + +#include +#include +#include +#include + +/* Lustre service names are following the format + * service name + MDT + seq name + */ +#define LUSTRE_MDT_MAXNAMELEN 80 + +struct lu_env; +struct lu_site; +struct lu_context; +struct obd_device; +struct obd_export; + +/* Whole sequences space range and zero range definitions */ +extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE; +extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE; +extern const struct lu_fid LUSTRE_BFL_FID; +extern const struct lu_fid LU_OBF_FID; +extern const struct lu_fid LU_LPF_FID; +extern const struct lu_fid LU_DOT_LUSTRE_FID; +extern const struct lu_fid LU_BACKEND_LPF_FID; + +enum { + /* + * This is how may metadata FIDs may be allocated in one sequence(128k) + */ + LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL, + + /* + * This is how many data FIDs could be allocated in one sequence(4B - 1) + */ + LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL, + + /* + * How many sequences to allocate to a client at once. + */ + LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL, + + /* + * seq allocation pool size. + */ + LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000, + + /* + * This is how many sequences may be in one super-sequence allocated to + * MDTs. 
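+ * (With the defaults above, LUSTRE_SEQ_META_WIDTH == 1 and 128k metadata
+ * FIDs per sequence, one super-sequence spans 2^30 sequences and therefore
+ * covers roughly 2^30 * 2^17 = 2^47 metadata FIDs; the figures are only
+ * illustrative.)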
+ */ + LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH) +}; + +/** special OID for local objects */ +enum local_oid { + /** \see fld_mod_init */ + FLD_INDEX_OID = 3UL, + /** \see fid_mod_init */ + FID_SEQ_CTL_OID = 4UL, + FID_SEQ_SRV_OID = 5UL, + /** \see mdd_mod_init */ + MDD_ROOT_INDEX_OID = 6UL, /* deprecated in 2.4 */ + MDD_ORPHAN_OID = 7UL, /* deprecated in 2.4 */ + MDD_LOV_OBJ_OID = 8UL, + MDD_CAPA_KEYS_OID = 9UL, + /** \see mdt_mod_init */ + LAST_RECV_OID = 11UL, + OSD_FS_ROOT_OID = 13UL, + ACCT_USER_OID = 15UL, + ACCT_GROUP_OID = 16UL, + LFSCK_BOOKMARK_OID = 17UL, + OTABLE_IT_OID = 18UL, + OSD_LPF_OID = 19UL, + REPLY_DATA_OID = 21UL, + ACCT_PROJECT_OID = 22UL, + INDEX_BACKUP_OID = 4116UL, + OFD_LAST_GROUP_OID = 4117UL, + LLOG_CATALOGS_OID = 4118UL, + MGS_CONFIGS_OID = 4119UL, + OFD_HEALTH_CHECK_OID = 4120UL, + MDD_LOV_OBJ_OSEQ = 4121UL, + LFSCK_NAMESPACE_OID = 4122UL, + REMOTE_PARENT_DIR_OID = 4123UL, + /* This definition is obsolete + * SLAVE_LLOG_CATALOGS_OID = 4124UL, + */ + BATCHID_COMMITTED_OID = 4125UL, +}; + +static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_FILE; + fid->f_oid = oid; + fid->f_ver = 0; +} + +static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_NAME; + fid->f_oid = oid; + fid->f_ver = 0; +} + +/* For new FS (>= 2.4), the root FID will be changed to + * [FID_SEQ_ROOT:1:0], for existing FS, (upgraded to 2.4), + * the root FID will still be IGIF */ +static inline int fid_is_root(const struct lu_fid *fid) +{ + return unlikely((fid_seq(fid) == FID_SEQ_ROOT && + fid_oid(fid) == FID_OID_ROOT)); +} + +static inline int fid_is_dot_lustre(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE); +} + +static inline int fid_is_obf(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF); +} + +static inline int fid_is_otable_it(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE && + fid_oid(fid) == OTABLE_IT_OID); +} + +static inline int fid_oid_is_quota(const struct lu_fid *fid) +{ + switch (fid_oid(fid)) { + case ACCT_USER_OID: + case ACCT_GROUP_OID: + case ACCT_PROJECT_OID: + return 1; + default: + return 0; + } +} + +static inline int fid_is_acct(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LOCAL_FILE && + fid_oid_is_quota(fid); +} + +static inline int fid_is_quota(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_QUOTA || + fid_seq(fid) == FID_SEQ_QUOTA_GLB; +} + +static inline int fid_is_name_llog(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LLOG_NAME; +} + +static inline int fid_seq_in_fldb(u64 seq) +{ + return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) || + fid_seq_is_root(seq) || fid_seq_is_dot(seq); +} + +#ifdef HAVE_SERVER_SUPPORT +static inline int fid_is_namespace_visible(const struct lu_fid *fid) +{ + const __u64 seq = fid_seq(fid); + + /* Here, we cannot distinguish whether the normal FID is for OST + * object or not. It is caller's duty to check more if needed. 
*/ + return (!fid_is_last_id(fid) && + (fid_seq_is_norm(seq) || fid_seq_is_igif(seq))) || + fid_is_root(fid) || fid_seq_is_dot(seq); +} + +static inline void ost_layout_cpu_to_le(struct ost_layout *dst, + const struct ost_layout *src) +{ + dst->ol_stripe_size = __cpu_to_le32(src->ol_stripe_size); + dst->ol_stripe_count = __cpu_to_le32(src->ol_stripe_count); + dst->ol_comp_start = __cpu_to_le64(src->ol_comp_start); + dst->ol_comp_end = __cpu_to_le64(src->ol_comp_end); + dst->ol_comp_id = __cpu_to_le32(src->ol_comp_id); +} + +static inline void ost_layout_le_to_cpu(struct ost_layout *dst, + const struct ost_layout *src) +{ + dst->ol_stripe_size = __le32_to_cpu(src->ol_stripe_size); + dst->ol_stripe_count = __le32_to_cpu(src->ol_stripe_count); + dst->ol_comp_start = __le64_to_cpu(src->ol_comp_start); + dst->ol_comp_end = __le64_to_cpu(src->ol_comp_end); + dst->ol_comp_id = __le32_to_cpu(src->ol_comp_id); +} + +static inline void filter_fid_cpu_to_le(struct filter_fid *dst, + const struct filter_fid *src, int size) +{ + fid_cpu_to_le(&dst->ff_parent, &src->ff_parent); + + if (size < sizeof(struct filter_fid)) { + memset(&dst->ff_layout, 0, sizeof(dst->ff_layout)); + } else { + ost_layout_cpu_to_le(&dst->ff_layout, &src->ff_layout); + dst->ff_layout_version = cpu_to_le32(src->ff_layout_version); + dst->ff_range = cpu_to_le32(src->ff_range); + } + + /* XXX: Add more if filter_fid is enlarged in the future. */ +} + +static inline void filter_fid_le_to_cpu(struct filter_fid *dst, + const struct filter_fid *src, int size) +{ + fid_le_to_cpu(&dst->ff_parent, &src->ff_parent); + + if (size < sizeof(struct filter_fid)) { + memset(&dst->ff_layout, 0, sizeof(dst->ff_layout)); + } else { + ost_layout_le_to_cpu(&dst->ff_layout, &src->ff_layout); + dst->ff_layout_version = le32_to_cpu(src->ff_layout_version); + dst->ff_range = le32_to_cpu(src->ff_range); + } + + /* XXX: Add more if filter_fid is enlarged in the future. */ +} +#endif /* HAVE_SERVER_SUPPORT */ + +static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq, __u32 ost_idx) +{ + if (fid_seq_is_mdt0(seq)) { + fid->f_seq = fid_idif_seq(0, ost_idx); + } else { + LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) || + fid_seq_is_idif(seq), "%#llx\n", seq); + fid->f_seq = seq; + } + fid->f_oid = 0; + fid->f_ver = 0; +} + +static inline bool fid_is_md_operative(const struct lu_fid *fid) +{ + return fid_is_mdt0(fid) || fid_is_igif(fid) || + fid_is_norm(fid) || fid_is_root(fid); +} + +/* seq client type */ +enum lu_cli_type { + LUSTRE_SEQ_METADATA = 1, + LUSTRE_SEQ_DATA +}; + +enum lu_mgr_type { + LUSTRE_SEQ_SERVER, + LUSTRE_SEQ_CONTROLLER +}; + +struct lu_server_seq; + +/* Client sequence manager interface. */ +struct lu_client_seq { + /* Sequence-controller export. */ + struct obd_export *lcs_exp; + struct mutex lcs_mutex; + + /* + * Range of allowed for allocation sequeces. When using lu_client_seq on + * clients, this contains meta-sequence range. And for servers this + * contains super-sequence range. + */ + struct lu_seq_range lcs_space; + + /* Seq related debugfs */ + struct dentry *lcs_debugfs_entry; + + /* This holds last allocated fid in last obtained seq */ + struct lu_fid lcs_fid; + + /* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */ + enum lu_cli_type lcs_type; + + /* + * Service uuid, passed from MDT + seq name to form unique seq name to + * use it with debugfs. + */ + char lcs_name[LUSTRE_MDT_MAXNAMELEN]; + + /* + * Sequence width, that is how many objects may be allocated in one + * sequence. 
Default value for it is LUSTRE_SEQ_MAX_WIDTH. + */ + __u64 lcs_width; + + /* Seq-server for direct talking */ + struct lu_server_seq *lcs_srv; +}; + +/* server sequence manager interface */ +struct lu_server_seq { + /* Available sequences space */ + struct lu_seq_range lss_space; + + /* keeps highwater in lsr_end for seq allocation algorithm */ + struct lu_seq_range lss_lowater_set; + struct lu_seq_range lss_hiwater_set; + + /* + * Device for server side seq manager needs (saving sequences to backing + * store). + */ + struct dt_device *lss_dev; + + /* /seq file object device */ + struct dt_object *lss_obj; + + /* Seq related debugfs */ + struct dentry *lss_debugfs_entry; + + /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */ + enum lu_mgr_type lss_type; + + /* Client interface to request controller */ + struct lu_client_seq *lss_cli; + + /* Mutex for protecting allocation */ + struct mutex lss_mutex; + + /* + * Service uuid, passed from MDT + seq name to form unique seq name to + * use it with debugfs. + */ + char lss_name[LUSTRE_MDT_MAXNAMELEN]; + + /* + * Allocation chunks for super and meta sequences. Default values are + * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH. + */ + __u64 lss_width; + + /* + * minimum lss_alloc_set size that should be allocated from + * lss_space + */ + __u64 lss_set_width; + + /* sync is needed for update operation */ + __u32 lss_need_sync; + + /** + * Pointer to site object, required to access site fld. + */ + struct seq_server_site *lss_site; +}; + +struct seq_server_site { + struct lu_site *ss_lu; + /** + * mds number of this site. + */ + u32 ss_node_id; + /** + * Fid location database + */ + struct lu_server_fld *ss_server_fld; + struct lu_client_fld *ss_client_fld; + + /** + * Server Seq Manager + */ + struct lu_server_seq *ss_server_seq; + + /** + * Controller Seq Manager + */ + struct lu_server_seq *ss_control_seq; + struct obd_export *ss_control_exp; + + /** + * Client Seq Manager + */ + struct lu_client_seq *ss_client_seq; +}; + +/* Server methods */ + +int seq_server_init(const struct lu_env *env, + struct lu_server_seq *seq, + struct dt_device *dev, + const char *prefix, + enum lu_mgr_type type, + struct seq_server_site *ss); + +void seq_server_fini(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_set_cli(const struct lu_env *env, + struct lu_server_seq *seq, + struct lu_client_seq *cli); + +int seq_server_check_and_alloc_super(const struct lu_env *env, + struct lu_server_seq *seq); +/* Client methods */ +void seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv); + +void seq_client_fini(struct lu_client_seq *seq); + +void seq_client_flush(struct lu_client_seq *seq); + +int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq, + struct lu_fid *fid); +int seq_client_get_seq(const struct lu_env *env, struct lu_client_seq *seq, + u64 *seqnr); +int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss); +/* Fids common stuff */ +int fid_is_local(const struct lu_env *env, + struct lu_site *site, const struct lu_fid *fid); + +enum lu_cli_type; +int client_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type); +int client_fid_fini(struct obd_device *obd); 
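/*
 * Editor's note: a minimal standalone sketch (not the in-tree
 * seq_client_alloc_fid() implementation) of the allocation idea described
 * for lu_client_seq above: OIDs are handed out from the current sequence
 * until lcs_width of them have been used, then a fresh sequence is obtained
 * from the server.  All names prefixed with "toy_" and the starting
 * sequence value are hypothetical.
 */
#include <stdint.h>

struct toy_fid {
	uint64_t f_seq;
	uint32_t f_oid;
	uint32_t f_ver;
};

struct toy_seq_client {
	struct toy_fid	cur;	/* last FID allocated in the current sequence */
	uint64_t	width;	/* how many OIDs one sequence may provide */
};

static uint64_t toy_get_new_seq(void)
{
	/* Illustrative stand-in: a real client requests this from the server. */
	static uint64_t next_seq = 0x200000400ULL;

	return next_seq++;
}

static void toy_alloc_fid(struct toy_seq_client *c, struct toy_fid *out)
{
	/* Roll over to a new sequence once the current one is exhausted. */
	if (c->cur.f_seq == 0 || (uint64_t)c->cur.f_oid >= c->width) {
		c->cur.f_seq = toy_get_new_seq();
		c->cur.f_oid = 0;
	}
	c->cur.f_oid++;
	c->cur.f_ver = 0;
	*out = c->cur;
}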
+ +/* fid locking */ + +struct ldlm_namespace; + +/* + * Build (DLM) resource name from FID. + * + * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * renaming name[2,3] fields that need to be used for the quota identifier. + */ +static inline void +fid_build_reg_res_name(const struct lu_fid *fid, struct ldlm_res_id *res) +{ + memset(res, 0, sizeof(*res)); + res->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(fid); + res->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(fid); +} + +/* + * Return true if resource is for object identified by FID. + */ +static inline int fid_res_name_eq(const struct lu_fid *fid, + const struct ldlm_res_id *res) +{ + return res->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(fid) && + res->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(fid); +} + +/* + * Extract FID from LDLM resource. Reverse of fid_build_reg_res_name(). + */ +static inline void +fid_extract_from_res_name(struct lu_fid *fid, const struct ldlm_res_id *res) +{ + fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF]; + fid->f_oid = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF]); + fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); + LASSERT(fid_res_name_eq(fid, res)); +} + +/* + * Build (DLM) resource identifier from global quota FID and quota ID. + */ +static inline void +fid_build_quota_res_name(const struct lu_fid *glb_fid, union lquota_id *qid, + struct ldlm_res_id *res) +{ + fid_build_reg_res_name(glb_fid, res); + res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid); + res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid); +} + +/* + * Extract global FID and quota ID from resource name + */ +static inline void fid_extract_from_quota_res(struct lu_fid *glb_fid, + union lquota_id *qid, + const struct ldlm_res_id *res) +{ + fid_extract_from_res_name(glb_fid, res); + qid->qid_fid.f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF]; + qid->qid_fid.f_oid = (__u32)res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF]; + qid->qid_fid.f_ver = + (__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32); +} + +static inline void +fid_build_pdo_res_name(const struct lu_fid *fid, unsigned int hash, + struct ldlm_res_id *res) +{ + fid_build_reg_res_name(fid, res); + res->name[LUSTRE_RES_ID_HSH_OFF] = hash; +} + +/** + * Build DLM resource name from object id & seq, which will be removed + * finally, when we replace ost_id with FID in data stack. + * + * Currently, resid from the old client, whose res[0] = object_id, + * res[1] = object_seq, is just oposite with Metatdata + * resid, where, res[0] = fid->f_seq, res[1] = fid->f_oid. + * To unifiy the resid identification, we will reverse the data + * resid to keep it same with Metadata resid, i.e. + * + * For resid from the old client, + * res[0] = objid, res[1] = 0, still keep the original order, + * for compatiblity. + * + * For new resid + * res will be built from normal FID directly, i.e. res[0] = f_seq, + * res[1] = f_oid + f_ver. + */ +static inline void ostid_build_res_name(const struct ost_id *oi, + struct ldlm_res_id *name) +{ + memset(name, 0, sizeof *name); + if (fid_seq_is_mdt0(ostid_seq(oi))) { + name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi); + name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi); + } else { + fid_build_reg_res_name(&oi->oi_fid, name); + } +} + +/** + * Return true if the resource is for the object identified by this id & group. 
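 *
 * Editor's illustration (assuming the resource[] layout documented at the
 * top of this header, i.e. name[0] holds SEQ/o_id and name[1] holds
 * OID/o_seq): an OLD-style MDT0 object with objid 0x123 maps to the
 * resource name {0x123, 0, 0, 0}, while a NEW-style FID such as
 * [0x200000400:0x1:0x0] maps to {0x200000400, 0x1, 0, 0}; the comparison
 * below simply matches whichever of the two layouts the ost_id uses.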
+ */ +static inline bool ostid_res_name_eq(const struct ost_id *oi, + const struct ldlm_res_id *name) +{ + /* Note: it is just a trick here to save some effort, probably the + * correct way would be turn them into the FID and compare */ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi); + } else { + return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi); + } +} + +/** + * Note: we need check oi_seq to decide where to set oi_id, + * so oi_seq should always be set ahead of oi_id. + */ +static inline int ostid_set_id(struct ost_id *oi, __u64 oid) +{ + if (fid_seq_is_mdt0(oi->oi.oi_seq)) { + if (oid >= IDIF_MAX_OID) + return -E2BIG; + oi->oi.oi_id = oid; + } else if (fid_is_idif(&oi->oi_fid)) { + if (oid >= IDIF_MAX_OID) + return -E2BIG; + oi->oi_fid.f_seq = fid_idif_seq(oid, + fid_idif_ost_idx(&oi->oi_fid)); + oi->oi_fid.f_oid = oid; + oi->oi_fid.f_ver = oid >> 48; + } else { + if (oid >= OBIF_MAX_OID) + return -E2BIG; + oi->oi_fid.f_oid = oid; + } + return 0; +} + +/* pack any OST FID into an ostid (id/seq) for the wire/disk */ +static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid) +{ + int rc = 0; + + if (fid_seq_is_igif(fid->f_seq)) + return -EBADF; + + if (fid_is_idif(fid)) { + ostid_set_seq_mdt0(ostid); + rc = ostid_set_id(ostid, fid_idif_id(fid_seq(fid), + fid_oid(fid), fid_ver(fid))); + } else { + ostid->oi_fid = *fid; + } + + return rc; +} + +/* The same as osc_build_res_name() */ +static inline void ost_fid_build_resid(const struct lu_fid *fid, + struct ldlm_res_id *resname) +{ + if (fid_is_mdt0(fid) || fid_is_idif(fid)) { + struct ost_id oi; + oi.oi.oi_id = 0; /* gcc 4.7.2 complains otherwise */ + if (fid_to_ostid(fid, &oi) != 0) + return; + ostid_build_res_name(&oi, resname); + } else { + fid_build_reg_res_name(fid, resname); + } +} + +static inline void ost_fid_from_resid(struct lu_fid *fid, + const struct ldlm_res_id *name, + int ost_idx) +{ + if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_VER_OID_OFF])) { + /* old resid */ + struct ost_id oi; + + memset(&oi, 0, sizeof(oi)); + ostid_set_seq(&oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]); + if (ostid_set_id(&oi, name->name[LUSTRE_RES_ID_SEQ_OFF])) { + CERROR("Bad %llu to set " DOSTID "\n", + name->name[LUSTRE_RES_ID_SEQ_OFF], POSTID(&oi)); + } + ostid_to_fid(fid, &oi, ost_idx); + } else { + /* new resid */ + fid_extract_from_res_name(fid, name); + } +} + +/** + * Flatten 128-bit FID values into a 64-bit value for use as an inode number. + * For non-IGIF FIDs this starts just over 2^32, and continues without + * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ + * into the range where there may not be many OID values in use, to minimize + * the risk of conflict. + * + * Suppose LUSTRE_SEQ_MAX_WIDTH less than (1 << 24) which is currently true, + * the time between re-used inode numbers is very long - 2^40 SEQ numbers, + * or about 2^40 client mounts, if clients create less than 2^24 files/mount. 
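 *
 * Editor's illustration of the packing below: for a FID such as
 * [0x200000400:0x5:0x0], seq << 24 is 0x0200000400000000, the wrapped term
 * ((seq >> 24) & 0xffffff0000) is still zero (it only becomes non-zero once
 * SEQ reaches 2^40), and adding the OID yields ino = 0x0200000400000005.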
+ */ +static inline __u64 fid_flatten(const struct lu_fid *fid) +{ + __u64 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + return ino; + } + + seq = fid_seq(fid); + + ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid); + + return ino ?: fid_oid(fid); +} + +static inline __u32 fid_hash(const struct lu_fid *f, int bits) +{ + /* all objects with same id and different versions will belong to same + * collisions list. */ + return hash_long(fid_flatten(f), bits); +} + +/** + * map fid to 32 bit value for ino on 32bit systems. */ +static inline __u32 fid_flatten32(const struct lu_fid *fid) +{ + __u32 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + return ino; + } + + seq = fid_seq(fid) - FID_SEQ_START; + + /* Map the high bits of the OID into higher bits of the inode number so + * that inodes generated at about the same time have a reduced chance + * of collisions. This will give a period of 2^12 = 1024 unique clients + * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects + * (from OID), or up to 128M inodes without collisions for new files. */ + ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) + + (seq >> (64 - (40-8)) & 0xffffff00) + + (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8); + + return ino ?: fid_oid(fid); +} + +static inline int +lu_fid_diff(const struct lu_fid *fid1, const struct lu_fid *fid2) +{ + LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n", + PFID(fid1), PFID(fid2)); + + if (fid_is_idif(fid1) && fid_is_idif(fid2)) + return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) - + fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver); + + return fid_oid(fid1) - fid_oid(fid2); +} + +static inline int fid_set_id(struct lu_fid *fid, u64 oid) +{ + if (unlikely(fid_seq_is_igif(fid->f_seq))) { + CERROR("bad IGIF, "DFID"\n", PFID(fid)); + return -EBADF; + } + + if (fid_is_idif(fid)) { + if (oid >= IDIF_MAX_OID) { + CERROR("Too large OID %#llx to set IDIF "DFID"\n", + (unsigned long long)oid, PFID(fid)); + return -EBADF; + } + fid->f_seq = fid_idif_seq(oid, fid_idif_ost_idx(fid)); + fid->f_oid = oid; + fid->f_ver = oid >> 48; + } else { + if (oid > OBIF_MAX_OID) { + CERROR("Too large OID %#llx to set REG "DFID"\n", + (unsigned long long)oid, PFID(fid)); + return -EBADF; + } + fid->f_oid = oid; + } + return 0; +} + +#define LUSTRE_SEQ_SRV_NAME "seq_srv" +#define LUSTRE_SEQ_CTL_NAME "seq_ctl" + +/* Range common stuff */ +static inline void +range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_le64(src->lsr_start); + dst->lsr_end = cpu_to_le64(src->lsr_end); + dst->lsr_index = cpu_to_le32(src->lsr_index); + dst->lsr_flags = cpu_to_le32(src->lsr_flags); +} + +static inline void +range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = le64_to_cpu(src->lsr_start); + dst->lsr_end = le64_to_cpu(src->lsr_end); + dst->lsr_index = le32_to_cpu(src->lsr_index); + dst->lsr_flags = le32_to_cpu(src->lsr_flags); +} + +static inline void +range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_be64(src->lsr_start); + dst->lsr_end = cpu_to_be64(src->lsr_end); + dst->lsr_index = cpu_to_be32(src->lsr_index); + dst->lsr_flags = cpu_to_be32(src->lsr_flags); +} + +static inline void +range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = be64_to_cpu(src->lsr_start); + dst->lsr_end = 
be64_to_cpu(src->lsr_end); + dst->lsr_index = be32_to_cpu(src->lsr_index); + dst->lsr_flags = be32_to_cpu(src->lsr_flags); +} + +static inline void range_array_cpu_to_le(struct lu_seq_range_array *dst, + const struct lu_seq_range_array *src) +{ + __u32 i; + + for (i = 0; i < src->lsra_count; i++) + range_cpu_to_le(&dst->lsra_lsr[i], &src->lsra_lsr[i]); + + dst->lsra_count = cpu_to_le32(src->lsra_count); +} + +static inline void range_array_le_to_cpu(struct lu_seq_range_array *dst, + const struct lu_seq_range_array *src) +{ + __u32 i; + + dst->lsra_count = le32_to_cpu(src->lsra_count); + for (i = 0; i < dst->lsra_count; i++) + range_le_to_cpu(&dst->lsra_lsr[i], &src->lsra_lsr[i]); +} + +/** @} fid */ + +#endif /* __LUSTRE_FID_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fld.h b/drivers/staging/lustrefsx/lustre/include/lustre_fld.h new file mode 100644 index 0000000000000..cd9036a32d344 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_fld.h @@ -0,0 +1,200 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LINUX_FLD_H +#define __LINUX_FLD_H + +/** \defgroup fld fld + * + * @{ + */ + +#include +#include +#include +#include + +struct lu_env; +struct lu_client_fld; +struct lu_server_fld; +struct lu_fld_hash; +struct fld_cache; +struct thandle; +struct dt_device; +struct dt_object; + +/* + * FLD (Fid Location Database) interface. + */ +enum { + LUSTRE_CLI_FLD_HASH_DHT = 0, + LUSTRE_CLI_FLD_HASH_RRB +}; + +struct lu_fld_target { + struct list_head ft_chain; + struct obd_export *ft_exp; + struct lu_server_fld *ft_srv; + __u64 ft_idx; +}; + +struct lu_server_fld { + /** + * Fld dir debugfs entry. + */ + struct dentry *lsf_debugfs_entry; + + /** + * /fld file object device */ + struct dt_object *lsf_obj; + + /** + * super sequence controller export, needed to forward fld + * lookup request. */ + struct obd_export *lsf_control_exp; + + /** + * Client FLD cache. */ + struct fld_cache *lsf_cache; + + /** + * Protect index modifications */ + struct mutex lsf_lock; + + /** + * Fld service name in form "fld-srv-lustre-MDTXXX" + */ + char lsf_name[LUSTRE_MDT_MAXNAMELEN]; + + int (*lsf_seq_lookup)(const struct lu_env *env, + struct lu_server_fld *fld, u64 seq, + struct lu_seq_range *range); + + /** + * Just reformatted or upgraded, and this flag is being + * used to check whether the local FLDB is needs to be + * synced with global FLDB(in MDT0), and it is only needed + * if the MDT is upgraded from < 2.6 to 2.6, i.e. 
when the + * local FLDB is being invited */ + unsigned int lsf_new:1; + +}; + +struct lu_client_fld { + /** + * Client side debugfs entry. + */ + struct dentry *lcf_debugfs_entry; + + /** + * List of exports client FLD knows about. */ + struct list_head lcf_targets; + + /** + * Current hash to be used to chose an export. */ + struct lu_fld_hash *lcf_hash; + + /** + * Exports count. */ + int lcf_count; + + /** + * Lock protecting exports list and fld_hash. */ + spinlock_t lcf_lock; + + /** + * Client FLD cache. */ + struct fld_cache *lcf_cache; + + /** + * Client fld debugfs entry name. + */ + char lcf_name[LUSTRE_MDT_MAXNAMELEN]; +}; + +/* Server methods */ +int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, const char *prefix, int type); + +void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld); + +int fld_declare_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range, + struct thandle *th); + +int fld_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *add_range, + struct thandle *th); + +int fld_insert_entry(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range); + +int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range); + +int fld_local_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range); + +int fld_update_from_controller(const struct lu_env *env, + struct lu_server_fld *fld); + +/* Client methods */ +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash); + +void fld_client_fini(struct lu_client_fld *fld); + +void fld_client_flush(struct lu_client_fld *fld); + +int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, + __u32 flags, const struct lu_env *env); + +int fld_client_create(struct lu_client_fld *fld, + struct lu_seq_range *range, + const struct lu_env *env); + +int fld_client_delete(struct lu_client_fld *fld, u64 seq, + const struct lu_env *env); + +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar); + +int fld_client_del_target(struct lu_client_fld *fld, + __u64 idx); + +void fld_client_debugfs_fini(struct lu_client_fld *fld); + +/** @} fld */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_ha.h b/drivers/staging/lustrefsx/lustre/include/lustre_ha.h new file mode 100644 index 0000000000000..282115ef67550 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_ha.h @@ -0,0 +1,59 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _LUSTRE_HA_H +#define _LUSTRE_HA_H + +/** \defgroup ha ha + * + * @{ + */ + +struct obd_import; +struct obd_export; +struct obd_device; +struct ptlrpc_request; + + +int ptlrpc_replay(struct obd_import *imp); +int ptlrpc_resend(struct obd_import *imp); +void ptlrpc_free_committed(struct obd_import *imp); +void ptlrpc_wake_delayed(struct obd_import *imp); +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async); +int ptlrpc_set_import_active(struct obd_import *imp, int active); +void ptlrpc_activate_import(struct obd_import *imp, bool set_state_full); +void ptlrpc_deactivate_import(struct obd_import *imp); +void ptlrpc_invalidate_import(struct obd_import *imp); +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt); +void ptlrpc_pinger_force(struct obd_import *imp); +/** @} ha */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_handles.h b/drivers/staging/lustrefsx/lustre/include/lustre_handles.h new file mode 100644 index 0000000000000..538f427683cbd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_handles.h @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LUSTRE_HANDLES_H_ +#define __LUSTRE_HANDLES_H_ + +/** \defgroup handles handles + * + * @{ + */ + +#include +#include +#include +#include + +/* These handles are most easily used by having them appear at the very top of + * whatever object that you want to make handles for. ie: + * + * struct ldlm_lock { + * struct portals_handle handle; + * ... + * }; + * + * Now you're able to assign the results of cookie2handle directly to an + * ldlm_lock. If it's not at the top, you'll want to use container_of() + * to compute the start of the structure based on the handle field. 
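 *
 * Editor's illustration of that last point: given a pointer to the embedded
 * handle field, the enclosing object from the example above is recovered as
 *
 *     struct portals_handle *ph = ...;
 *     struct ldlm_lock *lock = container_of(ph, struct ldlm_lock, handle);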
*/ +struct portals_handle { + struct hlist_node h_link; + __u64 h_cookie; + const char *h_owner; + refcount_t h_ref; + struct rcu_head h_rcu; +}; + +/* handles.c */ + +/* Add a handle to the hash table */ +void class_handle_hash(struct portals_handle *, const char *h_owner); +void class_handle_unhash(struct portals_handle *); +void *class_handle2object(u64 cookie, const char *h_owner); +int class_handle_init(void); +void class_handle_cleanup(void); + +/** @} handles */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h new file mode 100644 index 0000000000000..35f55eb755707 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h @@ -0,0 +1,70 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/lustre/include/lustre_idmap.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_IDMAP_H +#define _LUSTRE_IDMAP_H + +/** \defgroup idmap idmap + * + * @{ + */ + +#include + +#ifdef HAVE_GROUP_INFO_GID + +#define CFS_GROUP_AT(gi, i) ((gi)->gid[(i)]) + +#else /* !HAVE_GROUP_INFO_GID */ + +#define CFS_NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) + +#define CFS_GROUP_AT(gi, i) \ + ((gi)->blocks[(i) / CFS_NGROUPS_PER_BLOCK][(i) % CFS_NGROUPS_PER_BLOCK]) + +#endif /* HAVE_GROUP_INFO_GID */ + +#include + +struct lu_ucred; + +extern void lustre_groups_from_list(struct group_info *ginfo, gid_t *glist); +extern void lustre_groups_sort(struct group_info *group_info); +extern int lustre_in_group_p(struct lu_ucred *mu, gid_t grp); + +/** @} idmap */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_import.h b/drivers/staging/lustrefsx/lustre/include/lustre_import.h new file mode 100644 index 0000000000000..7b97c5555c327 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_import.h @@ -0,0 +1,430 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +/** \defgroup obd_import PtlRPC import definitions + * Imports are client-side representation of remote obd target. + * + * @{ + */ + +#ifndef __IMPORT_H +#define __IMPORT_H + +/** \defgroup export export + * + * @{ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Adaptive Timeout stuff + * + * @{ + */ +#define D_ADAPTTO D_OTHER +#define AT_BINS 4 /* "bin" means "N seconds of history" */ +#define AT_FLG_NOHIST 0x1 /* use last reported value only */ + +struct adaptive_timeout { + time64_t at_binstart; /* bin start time */ + unsigned int at_hist[AT_BINS]; /* timeout history bins */ + unsigned int at_flags; + timeout_t at_current_timeout; /* current timeout value */ + timeout_t at_worst_timeout_ever; /* worst-ever timeout delta + * value + */ + time64_t at_worst_timestamp; /* worst-ever timeout + * timestamp + */ + spinlock_t at_lock; +}; + +enum lustre_at_flags { + LATF_SKIP = 0x0, + LATF_STATS = 0x1, +}; + +struct ptlrpc_at_array { + struct list_head *paa_reqs_array; /** array to hold requests */ + __u32 paa_size; /** the size of array */ + __u32 paa_count; /** the total count of reqs */ + time64_t paa_deadline; /** the earliest deadline of reqs */ + __u32 *paa_reqs_count; /** the count of reqs in each entry */ +}; + +#define IMP_AT_MAX_PORTALS 8 +struct imp_at { + int iat_portal[IMP_AT_MAX_PORTALS]; + struct adaptive_timeout iat_net_latency; + struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS]; +}; + + +/** @} */ + +/** Possible import states */ +enum lustre_imp_state { + LUSTRE_IMP_CLOSED = 1, + LUSTRE_IMP_NEW = 2, + LUSTRE_IMP_DISCON = 3, + LUSTRE_IMP_CONNECTING = 4, + LUSTRE_IMP_REPLAY = 5, + LUSTRE_IMP_REPLAY_LOCKS = 6, + LUSTRE_IMP_REPLAY_WAIT = 7, + LUSTRE_IMP_RECOVER = 8, + LUSTRE_IMP_FULL = 9, + LUSTRE_IMP_EVICTED = 10, + LUSTRE_IMP_IDLE = 11, + LUSTRE_IMP_LAST +}; + +/** Returns test string representation of numeric import state \a state */ +static inline const char *ptlrpc_import_state_name(enum lustre_imp_state state) +{ + static const char * const import_state_names[] = { + "", "CLOSED", "NEW", "DISCONN", + "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", + "RECOVER", "FULL", "EVICTED", "IDLE", + }; + + LASSERT(state < LUSTRE_IMP_LAST); + return import_state_names[state]; +} + +/** + * List of import event types + */ +enum obd_import_event { + IMP_EVENT_DISCON = 0x808001, + IMP_EVENT_INACTIVE = 0x808002, + IMP_EVENT_INVALIDATE = 0x808003, + IMP_EVENT_ACTIVE = 0x808004, + IMP_EVENT_OCD = 0x808005, + IMP_EVENT_DEACTIVATE = 0x808006, + IMP_EVENT_ACTIVATE = 0x808007, +}; + +/** + * Definition of import connection structure + */ +struct obd_import_conn { + /** Item for linking connections together */ + struct list_head oic_item; + /** Pointer to actual PortalRPC connection */ + struct ptlrpc_connection *oic_conn; + /** uuid of remote side */ + struct obd_uuid oic_uuid; + /** + * Time (64 bit seconds) of last connection 
attempt on this connection + */ + time64_t oic_last_attempt; +}; + +/* state history */ +#define IMP_STATE_HIST_LEN 16 +struct import_state_hist { + enum lustre_imp_state ish_state; + time64_t ish_time; +}; + +/** + * Defintion of PortalRPC import structure. + * Imports are representing client-side view to remote target. + */ +struct obd_import { + /** Reference counter */ + refcount_t imp_refcount; + struct lustre_handle imp_dlm_handle; /* client's ldlm export */ + /** Currently active connection */ + struct ptlrpc_connection *imp_connection; + /** PortalRPC client structure for this import */ + struct ptlrpc_client *imp_client; + /** List element for linking into pinger chain */ + struct list_head imp_pinger_chain; + /** work struct for destruction of import */ + struct work_struct imp_zombie_work; + + /** + * Lists of requests that are retained for replay, waiting for a reply, + * or waiting for recovery to complete, respectively. + * @{ + */ + struct list_head imp_replay_list; + struct list_head imp_sending_list; + struct list_head imp_delayed_list; + /** @} */ + + /** + * List of requests that are retained for committed open replay. Once + * open is committed, open replay request will be moved from the + * imp_replay_list into the imp_committed_list. + * The imp_replay_cursor is for accelerating searching during replay. + * @{ + */ + struct list_head imp_committed_list; + struct list_head *imp_replay_cursor; + /** @} */ + + /** List of not replied requests */ + struct list_head imp_unreplied_list; + /** Known maximal replied XID */ + __u64 imp_known_replied_xid; + + /** obd device for this import */ + struct obd_device *imp_obd; + + /** + * some seciruty-related fields + * @{ + */ + struct ptlrpc_sec *imp_sec; + rwlock_t imp_sec_lock; + time64_t imp_sec_expire; + pid_t imp_sec_refpid; + /** @} */ + + /** Wait queue for those who need to wait for recovery completion */ + wait_queue_head_t imp_recovery_waitq; + + /** Number of requests allocated */ + atomic_t imp_reqs; + /** Number of requests currently in-flight */ + atomic_t imp_inflight; + /** Number of requests currently unregistering */ + atomic_t imp_unregistering; + /** Number of replay requests inflight */ + atomic_t imp_replay_inflight; + /** In-flight replays rate control */ + wait_queue_head_t imp_replay_waitq; + + /** Number of currently happening import invalidations */ + atomic_t imp_inval_count; + /** Numbner of request timeouts */ + atomic_t imp_timeouts; + /** Current import state */ + enum lustre_imp_state imp_state; + /** Last replay state */ + enum lustre_imp_state imp_replay_state; + /** History of import states */ + struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN]; + int imp_state_hist_idx; + /** Current import generation. 
Incremented on every reconnect */ + int imp_generation; + /** Idle connection initiated at this generation */ + int imp_initiated_at; + /** Incremented every time we send reconnection request */ + __u32 imp_conn_cnt; + /** + * \see ptlrpc_free_committed remembers imp_generation value here + * after a check to save on unnecessary replay list iterations + */ + int imp_last_generation_checked; + /** Last tranno we replayed */ + __u64 imp_last_replay_transno; + /** Last transno committed on remote side */ + __u64 imp_peer_committed_transno; + /** + * \see ptlrpc_free_committed remembers last_transno since its last + * check here and if last_transno did not change since last run of + * ptlrpc_free_committed and import generation is the same, we can + * skip looking for requests to remove from replay list as optimisation + */ + __u64 imp_last_transno_checked; + /** + * Remote export handle. This is how remote side knows what export + * we are talking to. Filled from response to connect request + */ + struct lustre_handle imp_remote_handle; + /** When to perform next ping. time in jiffies. */ + time64_t imp_next_ping; + /** When we last successfully connected. time in 64bit jiffies */ + time64_t imp_last_success_conn; + + /** List of all possible connection for import. */ + struct list_head imp_conn_list; + /** + * Current connection. \a imp_connection is imp_conn_current->oic_conn + */ + struct obd_import_conn *imp_conn_current; + + /** Protects flags, level, generation, conn_cnt, *_list */ + spinlock_t imp_lock; + + /** + * A "sentinel" value used to check if there are other threads + * waiting on the imp_lock. + */ + atomic_t imp_waiting; + + /* flags */ + unsigned long imp_invalid:1, /* evicted */ + /* administratively disabled */ + imp_deactive:1, + /* try to recover the import */ + imp_replayable:1, + /* don't run recovery (timeout instead) */ + imp_dlm_fake:1, + /* use 1/2 timeout on MDS' OSCs */ + imp_server_timeout:1, + /* VBR: imp in delayed recovery */ + imp_delayed_recovery:1, + /* recovery by versions was failed */ + imp_vbr_failed:1, + /* force an immidiate ping */ + imp_force_verify:1, + /* force a scheduled ping */ + imp_force_next_verify:1, + /* pingable */ + imp_pingable:1, + /* resend for replay */ + imp_resend_replay:1, + /* disable normal recovery, for test only. */ + imp_no_pinger_recover:1, + /* import must be reconnected instead of + * chouse new connection */ + imp_force_reconnect:1, + /* import has tried to connect with server */ + imp_connect_tried:1, + /* connected but not FULL yet */ + imp_connected:1, + /* grant shrink disabled */ + imp_grant_shrink_disabled:1, + /* to supress LCONSOLE() at conn.restore */ + imp_was_idle:1; + u32 imp_connect_op; + u32 imp_idle_timeout; + u32 imp_idle_debug; + struct obd_connect_data imp_connect_data; + __u64 imp_connect_flags_orig; + __u64 imp_connect_flags2_orig; + int imp_connect_error; + + enum lustre_msg_magic imp_msg_magic; + /* adjusted based on server capability */ + enum lustre_msghdr imp_msghdr_flags; + + /* adaptive timeout data */ + struct imp_at imp_at; + time64_t imp_last_reply_time; /* for health check */ + __u32 imp_conn_restricted_net; +}; + +/* import.c : adaptive timeout handling. + * + * Lustre tracks how long RPCs take to complete. This information is reported + * back to clients who utilize the information to estimate the time needed + * for future requests and set appropriate RPC timeouts. 
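 *
 * Editor's illustration: at_est2timeout() below pads an estimate by 25%
 * plus 5 seconds, so a measured service estimate of 20 seconds becomes a
 * timeout of 20 + 20/4 + 5 = 30 seconds, and at_timeout2est() approximately
 * inverts that mapping.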
Minimum and maximum + * service times can be configured via the at_min and at_max kernel module + * parameters, respectively. + * + * Since this information is transmitted between nodes the timeouts are in + * seconds not jiffies which can vary from node to node. To avoid confusion + * the timeout is handled in timeout_t (s32) instead of time64_t or + * long (jiffies). + */ +static inline timeout_t at_est2timeout(timeout_t timeout) +{ + /* add an arbitrary minimum: 125% +5 sec */ + return timeout + (timeout >> 2) + 5; +} + +static inline timeout_t at_timeout2est(timeout_t timeout) +{ + /* restore estimate value from timeout: e=4/5(t-5) */ + LASSERT(timeout > 0); + return max((timeout << 2) / 5, 5) - 4; +} + +static inline void at_reset_nolock(struct adaptive_timeout *at, + timeout_t timeout) +{ + at->at_current_timeout = timeout; + at->at_worst_timeout_ever = timeout; + at->at_worst_timestamp = ktime_get_real_seconds(); +} + +static inline void at_reset(struct adaptive_timeout *at, timeout_t timeout) +{ + spin_lock(&at->at_lock); + at_reset_nolock(at, timeout); + spin_unlock(&at->at_lock); +} + +static inline void at_init(struct adaptive_timeout *at, timeout_t timeout, + int flags) +{ + memset(at, 0, sizeof(*at)); + spin_lock_init(&at->at_lock); + at->at_flags = flags; + at_reset(at, timeout); +} + +static inline void at_reinit(struct adaptive_timeout *at, timeout_t timeout, + int flags) +{ + spin_lock(&at->at_lock); + at->at_binstart = 0; + memset(at->at_hist, 0, sizeof(at->at_hist)); + at->at_flags = flags; + at_reset_nolock(at, timeout); + spin_unlock(&at->at_lock); +} + +extern unsigned int at_min; +extern unsigned int at_max; +#define AT_OFF (at_max == 0) + +static inline timeout_t at_get(struct adaptive_timeout *at) +{ + return (at->at_current_timeout > at_min) ? + at->at_current_timeout : at_min; +} + +timeout_t at_measured(struct adaptive_timeout *at, timeout_t timeout); +int import_at_get_index(struct obd_import *imp, int portal); + +/* genops.c */ +struct obd_export; +extern struct obd_import *class_exp2cliimp(struct obd_export *); + +/** @} import */ + +#endif /* __IMPORT_H */ + +/** @} obd_import */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_intent.h b/drivers/staging/lustrefsx/lustre/include/lustre_intent.h new file mode 100644 index 0000000000000..5f3a717c9590b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_intent.h @@ -0,0 +1,67 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef LUSTRE_INTENT_H +#define LUSTRE_INTENT_H + +/* intent IT_XXX are defined in lustre/include/obd.h */ + +struct lookup_intent { + int it_op; + int it_create_mode; + __u64 it_flags; + int it_disposition; + int it_status; + __u64 it_lock_handle; + __u64 it_lock_bits; + int it_lock_mode; + int it_remote_lock_mode; + __u64 it_remote_lock_handle; + struct ptlrpc_request *it_request; + unsigned int it_lock_set:1; +}; + +static inline int it_disposition(const struct lookup_intent *it, int flag) +{ + return it->it_disposition & flag; +} + +static inline void it_set_disposition(struct lookup_intent *it, int flag) +{ + it->it_disposition |= flag; +} + +static inline void it_clear_disposition(struct lookup_intent *it, int flag) +{ + it->it_disposition &= ~flag; +} + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h b/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h new file mode 100644 index 0000000000000..4af88af0edf87 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * The definitions below are used in the kernel and userspace. + */ + +#ifndef __LUSTRE_KERNELCOMM_H__ +#define __LUSTRE_KERNELCOMM_H__ + +/* For declarations shared with userspace */ +#include + +/* prototype for callback function on kuc groups */ +typedef int (*libcfs_kkuc_cb_t)(void *data, void *cb_arg); + +/* Kernel methods */ +void libcfs_kkuc_init(void); +int libcfs_kkuc_msg_put(struct file *fp, void *payload); +int libcfs_kkuc_group_put(const struct obd_uuid *uuid, int group, void *data); +int libcfs_kkuc_group_add(struct file *fp, const struct obd_uuid *uuid, int uid, + int group, void *data, size_t data_len); +int libcfs_kkuc_group_rem(const struct obd_uuid *uuid, int uid, int group); +int libcfs_kkuc_group_foreach(const struct obd_uuid *uuid, int group, + libcfs_kkuc_cb_t cb_func, void *cb_arg); + +#endif /* __LUSTRE_KERNELCOMM_H__ */ + diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h b/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h new file mode 100644 index 0000000000000..64b0d55921897 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h @@ -0,0 +1,130 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * lustre/include/lustre_lfsck.h + * + * Lustre LFSCK exported functions. + * + * Author: Fan, Yong + */ + +#ifndef _LUSTRE_LFSCK_H +# define _LUSTRE_LFSCK_H + +#include +#include +#include +#include + +struct lfsck_start_param { + struct lfsck_start *lsp_start; + __u32 lsp_index; + unsigned int lsp_index_valid:1; +}; + +/* For LE_PAIRS_VERIFY returned status */ +enum lfsck_pv_status { + LPVS_INIT = 0, + LPVS_INCONSISTENT = 1, + LPVS_INCONSISTENT_TOFIX = 2, +}; + +enum lfsck_events_local { + LEL_FID_ACCESSED = 1, + LEL_PAIRS_VERIFY_LOCAL = 2, +}; + +struct lfsck_req_local { + __u32 lrl_event; + __u32 lrl_status; + __u16 lrl_active; + __u16 lrl_padding0; + __u32 lrl_padding1; + struct lu_fid lrl_fid; + struct filter_fid lrl_ff_client; + struct filter_fid lrl_ff_local; +}; + +struct lfsck_layout_dangling_key { + struct lu_fid lldk_fid; + __u32 lldk_comp_id; + __u32 lldk_ea_off; +}; + +typedef int (*lfsck_out_notify)(const struct lu_env *env, void *data, + enum lfsck_events event); + +int lfsck_register_namespace(const struct lu_env *env, struct dt_device *key, + struct ldlm_namespace *ns); +int lfsck_register(const struct lu_env *env, struct dt_device *key, + struct dt_device *next, struct obd_device *obd, + lfsck_out_notify notify, void *notify_data, bool master); +void lfsck_degister(const struct lu_env *env, struct dt_device *key); + +int lfsck_add_target(const struct lu_env *env, struct dt_device *key, + struct dt_device *tgt, struct obd_export *exp, + __u32 index, bool for_ost); +void lfsck_del_target(const struct lu_env *env, struct dt_device *key, + struct dt_device *tgt, __u32 index, bool for_ost); + +int lfsck_start(const struct lu_env *env, struct dt_device *key, + struct lfsck_start_param *lsp); +int lfsck_stop(const struct lu_env *env, struct dt_device *key, + struct lfsck_stop *stop); +int lfsck_in_notify_local(const struct lu_env *env, struct dt_device *key, + struct lfsck_req_local *lrl, struct thandle *th); +int lfsck_in_notify(const struct lu_env *env, struct dt_device *key, + struct lfsck_request *lr); +int lfsck_query(const struct lu_env *env, struct dt_device *key, + struct lfsck_request *req, struct lfsck_reply *rep, + struct lfsck_query *que); + +int lfsck_get_speed(char *buf, struct dt_device *key); +int lfsck_set_speed(struct dt_device *key, __u32 val); +int lfsck_get_windows(char *buf, struct dt_device *key); +int lfsck_set_windows(struct dt_device *key, unsigned int val); + +int lfsck_dump(struct seq_file *m, struct dt_device *key, enum lfsck_type type); + +static inline void lfsck_pack_rfa(struct lfsck_req_local *lrl, + const struct lu_fid *fid, + enum lfsck_events_local event, __u16 com) +{ + memset(lrl, 0, sizeof(*lrl)); + lrl->lrl_fid = *fid; 
+ lrl->lrl_event = event; + lrl->lrl_active = com; +} + +static inline bool lovea_slot_is_dummy(const struct lov_ost_data_v1 *obj) +{ + /* zero area does not care about the bytes-order. */ + if (obj->l_ost_oi.oi.oi_id == 0 && obj->l_ost_oi.oi.oi_seq == 0 && + obj->l_ost_idx == 0 && obj->l_ost_gen == 0) + return true; + + return false; +} +#endif /* _LUSTRE_LFSCK_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lib.h b/drivers/staging/lustrefsx/lustre/include/lustre_lib.h new file mode 100644 index 0000000000000..4d36fcfb3c000 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lib.h @@ -0,0 +1,99 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_lib.h + * + * Basic Lustre library routines. + */ + +#ifndef _LUSTRE_LIB_H +#define _LUSTRE_LIB_H + +/** \defgroup lib lib + * + * @{ + */ + +#ifdef HAVE_SCHED_HEADERS +#include +#include +#endif + +#include +#include +#include +#include + +/* target.c */ +struct ptlrpc_request; +struct obd_export; +struct lu_target; +#include +#include + +#define LI_POISON 0x5a5a5a5a +#if BITS_PER_LONG > 32 +# define LL_POISON 0x5a5a5a5a5a5a5a5aL +#else +# define LL_POISON 0x5a5a5a5aL +#endif +#define LP_POISON ((void *)LL_POISON) + +#ifdef HAVE_SERVER_SUPPORT +int rev_import_init(struct obd_export *exp); +int target_handle_connect(struct ptlrpc_request *req); +int target_handle_disconnect(struct ptlrpc_request *req); +void target_destroy_export(struct obd_export *exp); +void target_committed_to_req(struct ptlrpc_request *req); +void target_cancel_recovery_timer(struct obd_device *obd); +void target_stop_recovery_thread(struct obd_device *obd); +void target_cleanup_recovery(struct obd_device *obd); +int target_queue_recovery_request(struct ptlrpc_request *req, + struct obd_device *obd); +int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc); +#endif + +int target_pack_pool_reply(struct ptlrpc_request *req); +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + size_t keylen, void *key, + size_t vallen, void *val, + struct ptlrpc_request_set *set); + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id); + +#define LL_CDEBUG_PAGE(mask, page, fmt, arg...) 
\
+	CDEBUG(mask, "page %p map %p index %lu flags %lx count %u priv %0lx: "	\
+	       fmt, page, page->mapping, page->index, (long)page->flags,	\
+	       page_count(page), page_private(page), ## arg)
+
+/** @} lib */
+
+#endif /* _LUSTRE_LIB_H */
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h b/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h
new file mode 100644
index 0000000000000..f9deb4d28a4df
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h
@@ -0,0 +1,98 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2017, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: di wang
+ */
+
+/* There are several reasons to restrict the linkEA size:
+ *
+ * 1. Under DNE mode, if we do not restrict the linkEA size, and if there
+ *    are too many cross-MDT hard links to the same object, it will cause
+ *    the llog to overflow.
+ *
+ * 2. Some backends have a limited EA size. For example, without the large
+ *    EA feature enabled, ldiskfs makes all EAs share a single (4K) EA block.
+ *
+ * 3. Too many entries in the linkEA will seriously hurt linkEA performance,
+ *    because entries can only be located by scanning the EA consecutively. */
+#define MAX_LINKEA_SIZE	4096
+
+struct linkea_data {
+	/**
+	 * Buffer to keep link EA body.
+ */ + struct lu_buf *ld_buf; + /** + * The matched header, entry and its lenght in the EA + */ + struct link_ea_header *ld_leh; + struct link_ea_entry *ld_lee; + int ld_reclen; +}; + +int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf); +int linkea_init(struct linkea_data *ldata); +int linkea_init_with_rec(struct linkea_data *ldata); +void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, + struct lu_name *lname, struct lu_fid *pfid); +int linkea_entry_pack(struct link_ea_entry *lee, const struct lu_name *lname, + const struct lu_fid *pfid); +bool linkea_will_overflow(struct linkea_data *ldata, + const struct lu_name *lname); +int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid, bool err_on_overflow); +void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname, + bool is_encrypted); +int linkea_links_new(struct linkea_data *ldata, struct lu_buf *buf, + const struct lu_name *cname, const struct lu_fid *pfid); +int linkea_overflow_shrink(struct linkea_data *ldata); +int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid); + +static inline void linkea_first_entry(struct linkea_data *ldata) +{ + LASSERT(ldata != NULL); + LASSERT(ldata->ld_leh != NULL); + + if (ldata->ld_leh->leh_reccount == 0) + ldata->ld_lee = NULL; + else + ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1); +} + +static inline void linkea_next_entry(struct linkea_data *ldata) +{ + LASSERT(ldata != NULL); + LASSERT(ldata->ld_leh != NULL); + + if (ldata->ld_lee != NULL) { + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + if ((char *)ldata->ld_lee >= ((char *)ldata->ld_leh + + ldata->ld_leh->leh_len)) + ldata->ld_lee = NULL; + } +} diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h new file mode 100644 index 0000000000000..2f11379003e12 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h @@ -0,0 +1,539 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2016, Intel Corporation. + */ +/* + * lustre/include/lustre_lmv.h + * + * Lustre LMV structures and functions. 
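The linkea_first_entry()/linkea_next_entry() helpers declared just above can only walk the link EA consecutively, which is what the MAX_LINKEA_SIZE comment alludes to. A minimal sketch of that iteration pattern, using only the lustre_linkea.h declarations from this patch; the dumping helper itself is hypothetical and assumes @ldata was already set up by linkea_init():

static void linkea_dump_entries(struct linkea_data *ldata)
{
	struct lu_name lname;
	struct lu_fid pfid;

	linkea_first_entry(ldata);
	while (ldata->ld_lee != NULL) {
		/* decode one record: its length, the name and the parent FID */
		linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen,
				    &lname, &pfid);
		CDEBUG(D_INFO, "link %.*s under "DFID"\n",
		       lname.ln_namelen, lname.ln_name, PFID(&pfid));
		linkea_next_entry(ldata);
	}
}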
+ * + * Author: Di Wang + */ + +#ifndef _LUSTRE_LMV_H +#define _LUSTRE_LMV_H +#include + +struct lmv_oinfo { + struct lu_fid lmo_fid; + u32 lmo_mds; + struct inode *lmo_root; +}; + +struct lmv_stripe_md { + __u32 lsm_md_magic; + __u32 lsm_md_stripe_count; + __u32 lsm_md_master_mdt_index; + __u32 lsm_md_hash_type; + __u8 lsm_md_max_inherit; + __u8 lsm_md_max_inherit_rr; + __u32 lsm_md_layout_version; + __u32 lsm_md_migrate_offset; + __u32 lsm_md_migrate_hash; + char lsm_md_pool_name[LOV_MAXPOOLNAME + 1]; + struct lmv_oinfo lsm_md_oinfo[0]; +}; + +static inline bool lmv_dir_striped(const struct lmv_stripe_md *lsm) +{ + return lsm && lsm->lsm_md_magic == LMV_MAGIC; +} + +static inline bool lmv_dir_foreign(const struct lmv_stripe_md *lsm) +{ + return lsm && lsm->lsm_md_magic == LMV_MAGIC_FOREIGN; +} + +static inline bool lmv_dir_layout_changing(const struct lmv_stripe_md *lsm) +{ + return lmv_dir_striped(lsm) && + lmv_hash_is_layout_changing(lsm->lsm_md_hash_type); +} + +static inline bool lmv_dir_bad_hash(const struct lmv_stripe_md *lsm) +{ + if (!lmv_dir_striped(lsm)) + return false; + + if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_BAD_TYPE) + return true; + + return !lmv_is_known_hash_type(lsm->lsm_md_hash_type); +} + +static inline bool +lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) +{ + __u32 idx; + + if (lsm1->lsm_md_magic != lsm2->lsm_md_magic || + lsm1->lsm_md_stripe_count != lsm2->lsm_md_stripe_count || + lsm1->lsm_md_master_mdt_index != + lsm2->lsm_md_master_mdt_index || + lsm1->lsm_md_hash_type != lsm2->lsm_md_hash_type || + lsm1->lsm_md_max_inherit != lsm2->lsm_md_max_inherit || + lsm1->lsm_md_max_inherit_rr != lsm2->lsm_md_max_inherit_rr || + lsm1->lsm_md_layout_version != + lsm2->lsm_md_layout_version || + lsm1->lsm_md_migrate_offset != + lsm2->lsm_md_migrate_offset || + lsm1->lsm_md_migrate_hash != + lsm2->lsm_md_migrate_hash || + strncmp(lsm1->lsm_md_pool_name, lsm2->lsm_md_pool_name, + sizeof(lsm1->lsm_md_pool_name)) != 0) + return false; + + if (lmv_dir_striped(lsm1)) { + for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) { + if (!lu_fid_eq(&lsm1->lsm_md_oinfo[idx].lmo_fid, + &lsm2->lsm_md_oinfo[idx].lmo_fid)) + return false; + } + } else if (lsm1->lsm_md_magic == LMV_USER_MAGIC_SPECIFIC) { + for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) { + if (lsm1->lsm_md_oinfo[idx].lmo_mds != + lsm2->lsm_md_oinfo[idx].lmo_mds) + return false; + } + } + + return true; +} + +static inline void lsm_md_dump(int mask, const struct lmv_stripe_md *lsm) +{ + bool valid_hash = lmv_dir_bad_hash(lsm); + int i; + + /* If lsm_md_magic == LMV_MAGIC_FOREIGN pool_name may not be a null + * terminated string so only print LOV_MAXPOOLNAME bytes. + */ + CDEBUG(mask, + "magic %#x stripe count %d master mdt %d hash type %s:%#x max-inherit %hhu max-inherit-rr %hhu version %d migrate offset %d migrate hash %#x pool %.*s\n", + lsm->lsm_md_magic, lsm->lsm_md_stripe_count, + lsm->lsm_md_master_mdt_index, + valid_hash ? 
"invalid hash" : + mdt_hash_name[lsm->lsm_md_hash_type & (LMV_HASH_TYPE_MAX - 1)], + lsm->lsm_md_hash_type, lsm->lsm_md_max_inherit, + lsm->lsm_md_max_inherit_rr, lsm->lsm_md_layout_version, + lsm->lsm_md_migrate_offset, lsm->lsm_md_migrate_hash, + LOV_MAXPOOLNAME, lsm->lsm_md_pool_name); + + if (!lmv_dir_striped(lsm)) + return; + + for (i = 0; i < lsm->lsm_md_stripe_count; i++) + CDEBUG(mask, "stripe[%d] "DFID"\n", + i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); +} + +union lmv_mds_md; + +void lmv_free_memmd(struct lmv_stripe_md *lsm); + +static inline void lmv1_le_to_cpu(struct lmv_mds_md_v1 *lmv_dst, + const struct lmv_mds_md_v1 *lmv_src) +{ + __u32 i; + + lmv_dst->lmv_magic = le32_to_cpu(lmv_src->lmv_magic); + lmv_dst->lmv_stripe_count = le32_to_cpu(lmv_src->lmv_stripe_count); + lmv_dst->lmv_master_mdt_index = + le32_to_cpu(lmv_src->lmv_master_mdt_index); + lmv_dst->lmv_hash_type = le32_to_cpu(lmv_src->lmv_hash_type); + lmv_dst->lmv_layout_version = le32_to_cpu(lmv_src->lmv_layout_version); + if (lmv_src->lmv_stripe_count > LMV_MAX_STRIPE_COUNT) + return; + for (i = 0; i < lmv_src->lmv_stripe_count; i++) + fid_le_to_cpu(&lmv_dst->lmv_stripe_fids[i], + &lmv_src->lmv_stripe_fids[i]); +} + +static inline void lmv_le_to_cpu(union lmv_mds_md *lmv_dst, + const union lmv_mds_md *lmv_src) +{ + switch (le32_to_cpu(lmv_src->lmv_magic)) { + case LMV_MAGIC_V1: + lmv1_le_to_cpu(&lmv_dst->lmv_md_v1, &lmv_src->lmv_md_v1); + break; + default: + break; + } +} + +/* This hash is only for testing purpose */ +static inline unsigned int +lmv_hash_all_chars(unsigned int count, const char *name, int namelen) +{ + unsigned int c = 0; + const unsigned char *p = (const unsigned char *)name; + + while (--namelen >= 0) + c += p[namelen]; + + c = c % count; + + return c; +} + +static inline unsigned int +lmv_hash_fnv1a(unsigned int count, const char *name, int namelen) +{ + __u64 hash; + + hash = lustre_hash_fnv_1a_64(name, namelen); + + return do_div(hash, count); +} + +/* + * Robert Jenkins' function for mixing 32-bit values + * http://burtleburtle.net/bob/hash/evahash.html + * a, b = random bits, c = input and output + * + * Mixing inputs to generate an evenly distributed hash. + */ +#define crush_hashmix(a, b, c) \ +do { \ + a = a - b; a = a - c; a = a ^ (c >> 13); \ + b = b - c; b = b - a; b = b ^ (a << 8); \ + c = c - a; c = c - b; c = c ^ (b >> 13); \ + a = a - b; a = a - c; a = a ^ (c >> 12); \ + b = b - c; b = b - a; b = b ^ (a << 16); \ + c = c - a; c = c - b; c = c ^ (b >> 5); \ + a = a - b; a = a - c; a = a ^ (c >> 3); \ + b = b - c; b = b - a; b = b ^ (a << 10); \ + c = c - a; c = c - b; c = c ^ (b >> 15); \ +} while (0) + +#define crush_hash_seed 1315423911 + +static inline __u32 crush_hash(__u32 a, __u32 b) +{ + __u32 hash = crush_hash_seed ^ a ^ b; + __u32 x = 231232; + __u32 y = 1232; + + crush_hashmix(a, b, hash); + crush_hashmix(x, a, hash); + crush_hashmix(b, y, hash); + + return hash; +} + +/* refer to https://github.com/ceph/ceph/blob/master/src/crush/hash.c and + * https://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf for details of CRUSH + * algorithm. + */ +static inline unsigned int +lmv_hash_crush(unsigned int count, const char *name, int namelen) +{ + unsigned long long straw; + unsigned long long highest_straw = 0; + unsigned int pg_id; + unsigned int idx = 0; + int i; + + /* put temp and backup file on the same MDT where target is located. + * temporary file naming rule: + * 1. rsync: ..XXXXXX + * 2. 
dstripe: .XXXXXXXX + */ + if (lu_name_is_temp_file(name, namelen, true, 6)) { + name++; + namelen -= 8; + } else if (lu_name_is_temp_file(name, namelen, false, 8)) { + namelen -= 9; + } else if (lu_name_is_backup_file(name, namelen, &i)) { + LASSERT(i < namelen); + namelen -= i; + } + + pg_id = lmv_hash_fnv1a(LMV_CRUSH_PG_COUNT, name, namelen); + + /* distribute PG among all stripes pseudo-randomly, so they are almost + * evenly distributed, and when stripe count changes, only (delta / + * total) sub files need to be moved, herein 'delta' is added or removed + * stripe count, 'total' is total stripe count before change for + * removal, or count after change for addition. + */ + for (i = 0; i < count; i++) { + straw = crush_hash(pg_id, i); + if (straw > highest_straw) { + highest_straw = straw; + idx = i; + } + } + LASSERT(idx < count); + + return idx; +} + +/* directory layout may change in three ways: + * 1. directory migration, in its LMV source stripes are appended after + * target stripes, \a migrate_hash is source hash type, \a migrate_offset is + * target stripe count, + * 2. directory split, \a migrate_hash is hash type before split, + * \a migrate_offset is stripe count before split. + * 3. directory merge, \a migrate_hash is hash type after merge, + * \a migrate_offset is stripe count after merge. + */ +static inline int +__lmv_name_to_stripe_index(__u32 hash_type, __u32 stripe_count, + __u32 migrate_hash, __u32 migrate_offset, + const char *name, int namelen, bool new_layout) +{ + __u32 saved_hash = hash_type; + __u32 saved_count = stripe_count; + int stripe_index = 0; + + LASSERT(namelen > 0); + LASSERT(stripe_count > 0); + + if (lmv_hash_is_splitting(hash_type)) { + if (!new_layout) { + hash_type = migrate_hash; + stripe_count = migrate_offset; + } + } else if (lmv_hash_is_merging(hash_type)) { + if (new_layout) { + hash_type = migrate_hash; + stripe_count = migrate_offset; + } + } else if (lmv_hash_is_migrating(hash_type)) { + if (new_layout) { + stripe_count = migrate_offset; + } else { + hash_type = migrate_hash; + stripe_count -= migrate_offset; + } + } + + if (stripe_count > 1) { + switch (hash_type & LMV_HASH_TYPE_MASK) { + case LMV_HASH_TYPE_ALL_CHARS: + stripe_index = lmv_hash_all_chars(stripe_count, name, + namelen); + break; + case LMV_HASH_TYPE_FNV_1A_64: + stripe_index = lmv_hash_fnv1a(stripe_count, name, + namelen); + break; + case LMV_HASH_TYPE_CRUSH: + stripe_index = lmv_hash_crush(stripe_count, name, + namelen); + break; + default: + return -EBADFD; + } + } + + LASSERT(stripe_index < stripe_count); + + if (!new_layout && lmv_hash_is_migrating(saved_hash)) + stripe_index += migrate_offset; + + LASSERT(stripe_index < saved_count); + + CDEBUG(D_INFO, "name %.*s hash=%#x/%#x idx=%d/%u/%u under %s layout\n", + namelen, name, saved_hash, migrate_hash, stripe_index, + saved_count, migrate_offset, new_layout ? 
"new" : "old"); + + return stripe_index; +} + +static inline int lmv_name_to_stripe_index(struct lmv_mds_md_v1 *lmv, + const char *name, int namelen) +{ + if (lmv->lmv_magic == LMV_MAGIC_V1) + return __lmv_name_to_stripe_index(lmv->lmv_hash_type, + lmv->lmv_stripe_count, + lmv->lmv_migrate_hash, + lmv->lmv_migrate_offset, + name, namelen, true); + + if (lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_V1)) + return __lmv_name_to_stripe_index( + le32_to_cpu(lmv->lmv_hash_type), + le32_to_cpu(lmv->lmv_stripe_count), + le32_to_cpu(lmv->lmv_migrate_hash), + le32_to_cpu(lmv->lmv_migrate_offset), + name, namelen, true); + + return -EINVAL; +} + +static inline int lmv_name_to_stripe_index_old(struct lmv_mds_md_v1 *lmv, + const char *name, int namelen) +{ + if (lmv->lmv_magic == LMV_MAGIC_V1 || + lmv->lmv_magic == LMV_MAGIC_STRIPE) + return __lmv_name_to_stripe_index(lmv->lmv_hash_type, + lmv->lmv_stripe_count, + lmv->lmv_migrate_hash, + lmv->lmv_migrate_offset, + name, namelen, false); + + if (lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_V1) || + lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_STRIPE)) + return __lmv_name_to_stripe_index( + le32_to_cpu(lmv->lmv_hash_type), + le32_to_cpu(lmv->lmv_stripe_count), + le32_to_cpu(lmv->lmv_migrate_hash), + le32_to_cpu(lmv->lmv_migrate_offset), + name, namelen, false); + + return -EINVAL; +} + +static inline bool lmv_user_magic_supported(__u32 lum_magic) +{ + return lum_magic == LMV_USER_MAGIC || + lum_magic == LMV_USER_MAGIC_SPECIFIC || + lum_magic == LMV_MAGIC_FOREIGN; +} + +#define LMV_DEBUG(mask, lmv, msg) \ + CDEBUG(mask, \ + "%s LMV: magic=%#x count=%u index=%u hash=%s:%#x version=%u migrate offset=%u migrate hash=%s:%u.\n",\ + msg, (lmv)->lmv_magic, (lmv)->lmv_stripe_count, \ + (lmv)->lmv_master_mdt_index, \ + mdt_hash_name[(lmv)->lmv_hash_type & (LMV_HASH_TYPE_MAX - 1)],\ + (lmv)->lmv_hash_type, (lmv)->lmv_layout_version, \ + (lmv)->lmv_migrate_offset, \ + mdt_hash_name[(lmv)->lmv_migrate_hash & (LMV_HASH_TYPE_MAX - 1)],\ + (lmv)->lmv_migrate_hash) + +/* master LMV is sane */ +static inline bool lmv_is_sane(const struct lmv_mds_md_v1 *lmv) +{ + if (!lmv) + return false; + + if (le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1) + goto insane; + + if (le32_to_cpu(lmv->lmv_stripe_count) == 0) + goto insane; + + if (!lmv_is_known_hash_type(le32_to_cpu(lmv->lmv_hash_type))) + goto insane; + + return true; +insane: + LMV_DEBUG(D_ERROR, lmv, "insane"); + return false; +} + +/* LMV can be either master or stripe LMV */ +static inline bool lmv_is_sane2(const struct lmv_mds_md_v1 *lmv) +{ + if (!lmv) + return false; + + if (le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1 && + le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_STRIPE) + goto insane; + + if (le32_to_cpu(lmv->lmv_stripe_count) == 0) + goto insane; + + if (!lmv_is_known_hash_type(le32_to_cpu(lmv->lmv_hash_type))) + goto insane; + + return true; +insane: + LMV_DEBUG(D_ERROR, lmv, "insane"); + return false; +} + +static inline bool lmv_is_splitting(const struct lmv_mds_md_v1 *lmv) +{ + if (!lmv_is_sane2(lmv)) + return false; + + return lmv_hash_is_splitting(cpu_to_le32(lmv->lmv_hash_type)); +} + +static inline bool lmv_is_merging(const struct lmv_mds_md_v1 *lmv) +{ + if (!lmv_is_sane2(lmv)) + return false; + + return lmv_hash_is_merging(cpu_to_le32(lmv->lmv_hash_type)); +} + +static inline bool lmv_is_migrating(const struct lmv_mds_md_v1 *lmv) +{ + if (!lmv_is_sane(lmv)) + return false; + + return lmv_hash_is_migrating(cpu_to_le32(lmv->lmv_hash_type)); +} + +static inline bool lmv_is_restriping(const struct lmv_mds_md_v1 *lmv) +{ + if 
(!lmv_is_sane2(lmv)) + return false; + + return lmv_hash_is_splitting(cpu_to_le32(lmv->lmv_hash_type)) || + lmv_hash_is_merging(cpu_to_le32(lmv->lmv_hash_type)); +} + +static inline bool lmv_is_layout_changing(const struct lmv_mds_md_v1 *lmv) +{ + if (!lmv_is_sane2(lmv)) + return false; + + return lmv_hash_is_splitting(cpu_to_le32(lmv->lmv_hash_type)) || + lmv_hash_is_merging(cpu_to_le32(lmv->lmv_hash_type)) || + lmv_hash_is_migrating(cpu_to_le32(lmv->lmv_hash_type)); +} + +static inline bool lmv_is_fixed(const struct lmv_mds_md_v1 *lmv) +{ + return cpu_to_le32(lmv->lmv_hash_type) & LMV_HASH_FLAG_FIXED; +} + +static inline __u8 lmv_inherit_next(__u8 inherit) +{ + if (inherit == LMV_INHERIT_END || inherit == LMV_INHERIT_NONE) + return LMV_INHERIT_NONE; + + if (inherit == LMV_INHERIT_UNLIMITED || inherit > LMV_INHERIT_MAX) + return inherit; + + return inherit - 1; +} + +static inline __u8 lmv_inherit_rr_next(__u8 inherit_rr) +{ + if (inherit_rr == LMV_INHERIT_RR_NONE || + inherit_rr == LMV_INHERIT_RR_UNLIMITED || + inherit_rr > LMV_INHERIT_RR_MAX) + return inherit_rr; + + return inherit_rr - 1; +} + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_log.h b/drivers/staging/lustrefsx/lustre/include/lustre_log.h new file mode 100644 index 0000000000000..360ba26dd52c8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_log.h @@ -0,0 +1,572 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_log.h + * + * Generic infrastructure for managing a collection of logs. 
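Tying the lustre_lmv.h hashing helpers above together: for a steady-state striped directory, resolving a name amounts to "hash the name, take it modulo the stripe count, index into lsm_md_oinfo". A rough sketch under the assumption of a non-NULL, in-memory lmv_stripe_md; lmv_pick_mds() is a made-up name for illustration, not an API added by this patch:

static u32 lmv_pick_mds(const struct lmv_stripe_md *lsm,
			const char *name, int namelen)
{
	int idx;

	/* not striped (or a single stripe): use the master MDT index */
	if (!lmv_dir_striped(lsm) || lsm->lsm_md_stripe_count < 2)
		return lsm->lsm_md_master_mdt_index;

	idx = __lmv_name_to_stripe_index(lsm->lsm_md_hash_type,
					 lsm->lsm_md_stripe_count,
					 lsm->lsm_md_migrate_hash,
					 lsm->lsm_md_migrate_offset,
					 name, namelen, true);
	if (idx < 0)		/* unknown hash type: fall back to stripe 0 */
		idx = 0;

	/* each stripe records which MDT serves it */
	return lsm->lsm_md_oinfo[idx].lmo_mds;
}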
+ * These logs are used for: + * + * - orphan recovery: OST adds record on create + * - mtime/size consistency: the OST adds a record on first write + * - open/unlinked objects: OST adds a record on destroy + * + * - mds unlink log: the MDS adds an entry upon delete + * + * - raid1 replication log between OST's + * - MDS replication logs + */ + +#ifndef _LUSTRE_LOG_H +#define _LUSTRE_LOG_H + +/** \defgroup log log + * + * @{ + */ + +#include +#include +#include +#include + +#define LOG_NAME_LIMIT(logname, name) \ + snprintf(logname, sizeof(logname), "LOGS/%s", name) +#define LLOG_EEMPTY 4711 + +enum llog_open_param { + LLOG_OPEN_EXISTS = 0x0000, + LLOG_OPEN_NEW = 0x0001, +}; + +struct plain_handle_data { + struct list_head phd_entry; + struct llog_handle *phd_cat_handle; + /* cookie of this log in its cat */ + struct llog_cookie phd_cookie; +}; + +struct cat_handle_data { + struct list_head chd_head; + struct llog_handle *chd_current_log;/* currently open log */ + struct llog_handle *chd_next_log; /* llog to be used next */ +}; + +struct llog_handle; + +/* llog.c - general API */ +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct obd_uuid *uuid); +int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data); +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata); +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork); +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata); +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index); +int llog_cancel_arr_rec(const struct lu_env *env, struct llog_handle *loghandle, + int num, int *index); +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param); +int llog_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_is_empty(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name); +int llog_backup(const struct lu_env *env, struct obd_device *obd, + struct llog_ctxt *ctxt, struct llog_ctxt *bak_ctxt, + char *name, char *backup); +int llog_read_header(const struct lu_env *env, struct llog_handle *handle, + const struct obd_uuid *uuid); +__u64 llog_size(const struct lu_env *env, struct llog_handle *llh); + +/* llog_process flags */ +#define LLOG_FLAG_NODEAMON 0x0001 + +/* llog read mode, LLOG_READ_MODE_RAW will process llog canceled records */ +enum llog_read_mode { + LLOG_READ_MODE_NORMAL = 0x0000, + LLOG_READ_MODE_RAW = 0x0001, +}; + + +/* llog_cat.c - catalog api */ +struct llog_process_data { + /** + * Any useful data needed while processing catalog. This is + * passed later to process callback. + */ + void *lpd_data; + /** + * Catalog process callback function, called for each record + * in catalog. + */ + llog_cb_t lpd_cb; + /** + * Start processing the catalog from startcat/startidx + */ + int lpd_startcat; + int lpd_startidx; +}; + +struct llog_process_cat_data { + /** + * Temporary stored first_idx while scanning log. + */ + int lpcd_first_idx; + /** + * Temporary stored last_idx while scanning log. 
+ */ + int lpcd_last_idx; + /** + * llog read mode + */ + enum llog_read_mode lpcd_read_mode; +}; + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + struct thandle *th); +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th); +int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie); +int llog_cat_cancel_arr_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_logid *lgl, int count, int *index); +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies); +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cat_cb, + llog_cb_t cb, void *data, int startcat, + int startidx, bool fork); +int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, int startidx); +__u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh); +__u32 llog_cat_free_space(struct llog_handle *cat_llh); +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cb, + void *data); +/* llog_obd.c */ +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, const struct llog_operations *op); +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt); +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *); +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags); + +/* llog_ioctl.c */ +struct obd_ioctl_data; +int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, + struct obd_ioctl_data *data); +int llog_catalog_list(const struct lu_env *env, struct dt_device *d, + int count, struct obd_ioctl_data *data, + const struct lu_fid *fid); + +/* llog_net.c */ +int llog_initiator_connect(struct llog_ctxt *ctxt); + +struct llog_operations { + int (*lop_declare_destroy)(const struct lu_env *env, + struct llog_handle *handle, struct thandle *th); + int (*lop_destroy)(const struct lu_env *env, + struct llog_handle *handle, struct thandle *th); + int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h, + int *curr_idx, int next_idx, __u64 *offset, + void *buf, int len); + int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h, + int prev_idx, void *buf, int len); + int (*lop_read_header)(const struct lu_env *env, + struct llog_handle *handle); + int (*lop_setup)(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int ctxt_idx, + struct obd_device *disk_obd); + int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp, + int flags); + int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt); + int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid, + struct llog_gen *gen, struct obd_uuid *uuid); + /** + * Any llog file must be opened first using llog_open(). Llog can be + * opened by name, logid or without both, in last case the new logid + * will be generated. 
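The wrappers declared earlier in this header (llog_open(), llog_init_handle(), llog_process(), llog_close()) are intended to be used as a simple open/init/iterate/close sequence. A condensed sketch of that read-side pattern; the function is illustrative only, error handling is trimmed, the context is assumed to come from llog_get_context(), and LLOG_F_IS_PLAIN is assumed from the llog wire definitions elsewhere in this series rather than from this header:

static int example_dump_llog(const struct lu_env *env,
			     struct llog_ctxt *ctxt, char *name,
			     llog_cb_t cb, void *cbdata)
{
	struct llog_handle *llh;
	int rc;

	rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
	if (rc)
		return rc;

	rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
	if (rc == 0)
		rc = llog_process(env, llh, cb, cbdata, NULL);

	llog_close(env, llh);
	return rc;
}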
+ */ + int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh, + struct llog_logid *logid, char *name, + enum llog_open_param); + /** + * Opened llog may not exist and this must be checked where needed using + * the llog_exist() call. + */ + int (*lop_exist)(struct llog_handle *lgh); + /** + * Close llog file and calls llog_free_handle() implicitly. + * Any opened llog must be closed by llog_close() call. + */ + int (*lop_close)(const struct lu_env *env, struct llog_handle *handle); + /** + * Create new llog file. The llog must be opened. + * Must be used only for local llog operations. + */ + int (*lop_declare_create)(const struct lu_env *env, + struct llog_handle *handle, + struct thandle *th); + int (*lop_create)(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); + /** + * write new record in llog. It appends records usually but can edit + * existing records too. + */ + int (*lop_declare_write_rec)(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, + int idx, struct thandle *th); + int (*lop_write_rec)(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *cookie, + int idx, struct thandle *th); + /** + * Add new record in llog catalog. Does the same as llog_write_rec() + * but using llog catalog. + */ + int (*lop_declare_add)(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); + int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *cookie, + struct thandle *th); +}; + +/* In-memory descriptor for a log object or log catalog */ +struct llog_handle { + struct rw_semaphore lgh_lock; + struct mutex lgh_hdr_mutex; /* protect lgh_hdr data */ + struct llog_logid lgh_id; /* id of this log */ + struct llog_log_hdr *lgh_hdr; /* may be vmalloc'd */ + size_t lgh_hdr_size; + struct dt_object *lgh_obj; + /* For a Catalog, is the last/newest used index for a plain slot. + * Used in conjunction with llh_cat_idx to handle Catalog wrap-around + * case, after it will have reached LLOG_HDR_BITMAP_SIZE, llh_cat_idx + * will become its upper limit */ + int lgh_last_idx; + struct rw_semaphore lgh_last_sem; + __u64 lgh_cur_offset; /* used for test only */ + struct llog_ctxt *lgh_ctxt; + union { + struct plain_handle_data phd; + struct cat_handle_data chd; + } u; + char *lgh_name; + void *private_data; + const struct llog_operations *lgh_logops; + refcount_t lgh_refcount; + + int lgh_max_size; + bool lgh_destroyed; +}; + +/* llog_osd.c */ +extern const struct llog_operations llog_osd_ops; +extern const struct llog_operations llog_common_cat_ops; +int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid); +int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid); + +#define LLOG_CTXT_FLAG_UNINITIALIZED 0x00000001 +#define LLOG_CTXT_FLAG_STOP 0x00000002 + +/* Indicate the llog objects under this context are normal FID objects, + * instead of objects with local FID. */ +#define LLOG_CTXT_FLAG_NORMAL_FID 0x00000004 + +struct llog_ctxt { + int loc_idx; /* my index the obd array of ctxt's */ + struct obd_device *loc_obd; /* points back to the containing obd*/ + struct obd_llog_group *loc_olg; /* group containing that ctxt */ + struct obd_export *loc_exp; /* parent "disk" export (e.g. 
MDS) */ + struct obd_import *loc_imp; /* to use in RPC's: can be backward + * pointing import */ + const struct llog_operations *loc_logops; + struct llog_handle *loc_handle; + struct mutex loc_mutex; /* protect loc_imp */ + atomic_t loc_refcount; + long loc_flags; /* flags, see above defines */ + struct dt_object *loc_dir; + struct local_oid_storage *loc_los_nameless; + struct local_oid_storage *loc_los_named; + /* llog chunk size, and llog record size can not be bigger than + * loc_chunk_size */ + __u32 loc_chunk_size; +}; + +#define LLOG_PROC_BREAK 0x0001 +#define LLOG_DEL_RECORD 0x0002 +#define LLOG_DEL_PLAIN 0x0003 + +static inline int llog_obd2ops(struct llog_ctxt *ctxt, + const struct llog_operations **lop) +{ + if (ctxt == NULL) + return -ENOTCONN; + + *lop = ctxt->loc_logops; + if (*lop == NULL) + return -EOPNOTSUPP; + + return 0; +} + +static inline int llog_handle2ops(struct llog_handle *loghandle, + const struct llog_operations **lop) +{ + if (loghandle == NULL || loghandle->lgh_logops == NULL) + return -EINVAL; + + *lop = loghandle->lgh_logops; + return 0; +} + +static inline int llog_data_len(int len) +{ + return cfs_size_round(len); +} + +static inline int llog_get_size(struct llog_handle *loghandle) +{ + if (loghandle && loghandle->lgh_hdr) + return loghandle->lgh_hdr->llh_count; + return 0; +} + +static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt) +{ + atomic_inc(&ctxt->loc_refcount); + CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount)); + return ctxt; +} + +static inline void llog_ctxt_put(struct llog_ctxt *ctxt) +{ + if (ctxt == NULL) + return; + LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount) - 1); + __llog_ctxt_put(NULL, ctxt); +} + +static inline void llog_group_init(struct obd_llog_group *olg) +{ + init_waitqueue_head(&olg->olg_waitq); + spin_lock_init(&olg->olg_lock); +} + +static inline int llog_group_set_ctxt(struct obd_llog_group *olg, + struct llog_ctxt *ctxt, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] != NULL) { + spin_unlock(&olg->olg_lock); + return -EEXIST; + } + olg->olg_ctxts[index] = ctxt; + spin_unlock(&olg->olg_lock); + return 0; +} + +static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg, + int index) +{ + struct llog_ctxt *ctxt; + + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] == NULL) + ctxt = NULL; + else + ctxt = llog_ctxt_get(olg->olg_ctxts[index]); + spin_unlock(&olg->olg_lock); + return ctxt; +} + +static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + spin_lock(&olg->olg_lock); + olg->olg_ctxts[index] = NULL; + spin_unlock(&olg->olg_lock); +} + +static inline struct llog_ctxt *llog_get_context(struct obd_device *obd, + int index) +{ + return llog_group_get_ctxt(&obd->obd_olg, index); +} + +static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index) +{ + return (olg->olg_ctxts[index] == NULL); +} + +static inline int llog_ctxt_null(struct obd_device *obd, int index) +{ + return (llog_group_ctxt_null(&obd->obd_olg, index)); +} + +static inline int llog_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + const struct llog_operations 
*lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_next_block == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_next_block(env, loghandle, cur_idx, next_idx, + cur_offset, buf, len); + RETURN(rc); +} + +static inline int llog_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + const struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_prev_block == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_prev_block(env, loghandle, prev_idx, buf, len); + RETURN(rc); +} + +static inline int llog_connect(struct llog_ctxt *ctxt, + struct llog_logid *logid, struct llog_gen *gen, + struct obd_uuid *uuid) +{ + const struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_obd2ops(ctxt, &lop); + if (rc) + RETURN(rc); + if (lop->lop_connect == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_connect(ctxt, logid, gen, uuid); + RETURN(rc); +} + +static inline int llog_is_full(struct llog_handle *llh) +{ + return llh->lgh_last_idx >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1; +} + +struct llog_cfg_rec { + struct llog_rec_hdr lcr_hdr; + struct lustre_cfg lcr_cfg; + struct llog_rec_tail lcr_tail; +}; + +struct llog_cfg_rec *lustre_cfg_rec_new(int cmd, struct lustre_cfg_bufs *bufs); +void lustre_cfg_rec_free(struct llog_cfg_rec *lcr); + +enum { + LLOG_NEXT_IDX = -1, + LLOG_HEADER_IDX = 0, +}; + +/* llog.c */ +int llog_exist(struct llog_handle *loghandle); +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th); +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); +int llog_trans_destroy(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); +int llog_destroy(const struct lu_env *env, struct llog_handle *handle); + +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th); +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int idx, struct thandle *th); +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + struct thandle *th); +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name); +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name); +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, int idx); + +/** @} log */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h b/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h new file mode 100644 index 0000000000000..0f3d7592fc154 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h @@ -0,0 +1,126 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_mdc.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_MDC_H +#define _LUSTRE_MDC_H + +/** \defgroup mdc mdc + * + * @{ + */ + +#include +#include +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +# include +#endif /* CONFIG_LUSTRE_FS_POSIX_ACL */ +#include +#include +#include +#include +#include +#include +#include +#include + +struct ptlrpc_client; +struct obd_export; +struct ptlrpc_request; +struct obd_device; + +/** + * Update the maximum possible easize. + * + * This value is learned from ptlrpc replies sent by the MDT. The + * default easize is initialized to the minimum value but allowed to + * grow up to a single page in size if required to handle the common + * case. + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] exp export for MDC device + * \param[in] body body of ptlrpc reply from MDT + * + */ +static inline void mdc_update_max_ea_from_body(struct obd_export *exp, + struct mdt_body *body) +{ + if (body->mbo_valid & OBD_MD_FLMODEASIZE) { + struct client_obd *cli = &exp->exp_obd->u.cli; + __u32 def_easize; + + if (cli->cl_max_mds_easize < body->mbo_max_mdsize) + cli->cl_max_mds_easize = body->mbo_max_mdsize; + + def_easize = min_t(__u32, body->mbo_max_mdsize, + OBD_MAX_DEFAULT_EA_SIZE); + cli->cl_default_mds_easize = def_easize; + } +} + + +/* mdc/mdc_locks.c */ +int it_open_error(int phase, struct lookup_intent *it); + +static inline bool cl_is_lov_delay_create(unsigned int flags) +{ + return (flags & O_LOV_DELAY_CREATE_1_8) != 0 || + (flags & O_LOV_DELAY_CREATE_MASK) == O_LOV_DELAY_CREATE_MASK; +} + +static inline void cl_lov_delay_create_clear(unsigned int *flags) +{ + if ((*flags & O_LOV_DELAY_CREATE_1_8) != 0) + *flags &= ~O_LOV_DELAY_CREATE_1_8; + if ((*flags & O_LOV_DELAY_CREATE_MASK) == O_LOV_DELAY_CREATE_MASK) + *flags &= ~O_LOV_DELAY_CREATE_MASK; +} + +static inline bool cl_is_lu_noimport(unsigned int flags) +{ + return (flags & O_LU_NOIMPORT_MASK) == O_LU_NOIMPORT_MASK; +} + +static inline void cl_lu_noimport_clear(unsigned int *flags) +{ + if (cl_is_lu_noimport(*flags)) + *flags &= ~O_LU_NOIMPORT_MASK; +} + +/** @} mdc */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_mds.h b/drivers/staging/lustrefsx/lustre/include/lustre_mds.h new file mode 100644 index 0000000000000..8c3c010c8c49a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_mds.h @@ -0,0 +1,84 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_mds.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_MDS_H +#define _LUSTRE_MDS_H + +/** \defgroup mds mds + * + * @{ + */ + +#include +#include +#include +#include +#include +#include + +struct md_rejig_data { + struct md_object *mrd_obj; + __u16 mrd_mirror_id; +}; + +#define MDD_OBD_NAME "mdd_obd" +#define MDD_OBD_UUID "mdd_obd_uuid" + +static inline int md_should_create(u64 open_flags) +{ + return !(open_flags & MDS_OPEN_DELAY_CREATE) && + (open_flags & MDS_FMODE_WRITE) && + !(open_flags & MDS_OPEN_LEASE); +} + +/* do NOT or the MAY_*'s, you'll get the weakest */ +static inline int mds_accmode(u64 open_flags) +{ + unsigned int may_mask = 0; + + if (open_flags & MDS_FMODE_READ) + may_mask |= MAY_READ; + if (open_flags & (MDS_FMODE_WRITE | MDS_OPEN_TRUNC | MDS_OPEN_APPEND)) + may_mask |= MAY_WRITE; + if (open_flags & MDS_FMODE_EXEC) + may_mask = MAY_EXEC; + + return may_mask; +} + +/** @} mds */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_net.h b/drivers/staging/lustrefsx/lustre/include/lustre_net.h new file mode 100644 index 0000000000000..7fde30cfe18b3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_net.h @@ -0,0 +1,2673 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +/** \defgroup PtlRPC Portal RPC and networking module. 
+ * + * PortalRPC is the layer used by rest of lustre code to achieve network + * communications: establish connections with corresponding export and import + * states, listen for a service, send and receive RPCs. + * PortalRPC also includes base recovery framework: packet resending and + * replaying, reconnections, pinger. + * + * PortalRPC utilizes LNet as its transport layer. + * + * @{ + */ + + +#ifndef _LUSTRE_NET_H +#define _LUSTRE_NET_H + +/** \defgroup net net + * + * @{ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* MD flags we _always_ use */ +#define PTLRPC_MD_OPTIONS 0 + +/** + * log2 max # of bulk operations in one request: 2=4MB/RPC, 5=32MB/RPC, ... + * In order for the client and server to properly negotiate the maximum + * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two + * value. The client is free to limit the actual RPC size for any bulk + * transfer via cl_max_pages_per_rpc to some non-power-of-two value. + * NOTE: This is limited to 16 (=64GB RPCs) by IOOBJ_MAX_BRW_BITS. */ +#define PTLRPC_BULK_OPS_BITS 6 +#if PTLRPC_BULK_OPS_BITS > 16 +#error "More than 65536 BRW RPCs not allowed by IOOBJ_MAX_BRW_BITS." +#endif +#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS) +/** + * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and + * should not be used on the server at all. Otherwise, it imposes a + * protocol limitation on the maximum RPC size that can be used by any + * RPC sent to that server in the future. Instead, the server should + * use the negotiated per-client ocd_brw_size to determine the bulk + * RPC count. */ +#define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1)) + +/** + * Define maxima for bulk I/O. + * + * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT + * of LNET_MTU sized RDMA transfers. Clients and servers negotiate the + * currently supported maximum between peers at connect via ocd_brw_size. + */ +#define PTLRPC_MAX_BRW_BITS (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS) +#define PTLRPC_MAX_BRW_SIZE (1U << PTLRPC_MAX_BRW_BITS) +#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> PAGE_SHIFT) + +#define ONE_MB_BRW_SIZE (1U << LNET_MTU_BITS) +#define MD_MAX_BRW_SIZE (1U << LNET_MTU_BITS) +#define MD_MAX_BRW_PAGES (MD_MAX_BRW_SIZE >> PAGE_SHIFT) +#define DT_MAX_BRW_SIZE PTLRPC_MAX_BRW_SIZE +#define DT_DEF_BRW_SIZE (4 * ONE_MB_BRW_SIZE) +#define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> PAGE_SHIFT) +#define OFD_MAX_BRW_SIZE (1U << LNET_MTU_BITS) + +/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */ +#if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0) +# error "PTLRPC_MAX_BRW_PAGES isn't a power of two" +#endif +#if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_SIZE)) +# error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_SIZE" +#endif +#if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_SIZE too big" +#endif +#if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_PAGES too big" +#endif + +#define PTLRPC_NTHRS_INIT 2 + +/** + * Buffer Constants + * + * Constants determine how memory is used to buffer incoming service requests. 
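A quick worked example of the bulk limits defined above, assuming the usual 1 MiB LNet MTU (LNET_MTU_BITS = 20) and 4 KiB pages: PTLRPC_MAX_BRW_BITS = 20 + 6 = 26, so PTLRPC_MAX_BRW_SIZE is 64 MiB per BRW RPC (at most PTLRPC_BULK_OPS_COUNT = 64 transfers of one LNET_MTU each), and PTLRPC_MAX_BRW_PAGES is 64 MiB >> 12 = 16384 pages. As noted above, clients normally cap the effective size well below this via cl_max_pages_per_rpc.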
+ * + * ?_NBUFS # buffers to allocate when growing the pool + * ?_BUFSIZE # bytes in a single request buffer + * ?_MAXREQSIZE # maximum request service will receive + * + * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk + * of ?_NBUFS is added to the pool. + * + * Messages larger than ?_MAXREQSIZE are dropped. Request buffers are + * considered full when less than ?_MAXREQSIZE is left in them. + */ +/** + * Thread Constants + * + * Constants determine how threads are created for ptlrpc service. + * + * ?_NTHRS_INIT # threads to create for each service partition on + * initializing. If it's non-affinity service and + * there is only one partition, it's the overall # + * threads for the service while initializing. + * ?_NTHRS_BASE # threads should be created at least for each + * ptlrpc partition to keep the service healthy. + * It's the low-water mark of threads upper-limit + * for each partition. + * ?_THR_FACTOR # threads can be added on threads upper-limit for + * each CPU core. This factor is only for reference, + * we might decrease value of factor if number of cores + * per CPT is above a limit. + * ?_NTHRS_MAX # overall threads can be created for a service, + * it's a soft limit because if service is running + * on machine with hundreds of cores and tens of + * CPU partitions, we need to guarantee each partition + * has ?_NTHRS_BASE threads, which means total threads + * will be ?_NTHRS_BASE * number_of_cpts which can + * exceed ?_NTHRS_MAX. + * + * Examples + * + * #define MDS_NTHRS_INIT 2 + * #define MDS_NTHRS_BASE 64 + * #define MDS_NTHRS_FACTOR 8 + * #define MDS_NTHRS_MAX 1024 + * + * Example 1): + * --------------------------------------------------------------------- + * Server(A) has 16 cores, user configured it to 4 partitions so each + * partition has 4 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96 + * + * Total number of threads for the service is: + * 96 * partitions(4) = 384 + * + * Example 2): + * --------------------------------------------------------------------- + * Server(B) has 32 cores, user configured it to 4 partitions so each + * partition has 8 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128 + * + * Total number of threads for the service is: + * 128 * partitions(4) = 512 + * + * Example 3): + * --------------------------------------------------------------------- + * Server(B) has 96 cores, user configured it to 8 partitions so each + * partition has 12 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(12) * MDS_NTHRS_FACTOR(8) = 160 + * + * Total number of threads for the service is: + * 160 * partitions(8) = 1280 + * + * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number + * as upper limit of threads number for each partition: + * MDS_NTHRS_MAX(1024) / partitions(8) = 128 + * + * Example 4): + * --------------------------------------------------------------------- + * Server(C) have a thousand of cores and user configured it to 32 partitions + * MDS_NTHRS_BASE(64) * 32 = 2048 + * + * which is already above soft limit MDS_NTHRS_MAX(1024), but we still need + * to guarantee that each partition has at least MDS_NTHRS_BASE(64) threads + * to keep service healthy, so total number of threads will just be 2048. 
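The examples above all follow the same simplified rule: start from ?_NTHRS_BASE, add ?_THR_FACTOR threads per core in the partition, then clamp to the per-partition share of ?_NTHRS_MAX but never drop below the base. A small sketch of that rule (the real, more involved logic lives in ptlrpc_server_nthreads_check(), as the comment notes):

static unsigned int example_nthrs_per_cpt(unsigned int base,
					  unsigned int factor,
					  unsigned int max,
					  unsigned int ncpts,
					  unsigned int cores_per_cpt)
{
	unsigned int nthrs = base + cores_per_cpt * factor;

	/* share the soft limit between partitions (example 3: 1024 / 8) */
	if (nthrs > max / ncpts)
		nthrs = max / ncpts;

	/* but keep the per-partition low-water mark (example 4) */
	if (nthrs < base)
		nthrs = base;

	return nthrs;
}

With the MDS numbers from example 3 (base 64, factor 8, max 1024, 8 partitions of 12 cores) this yields 128 threads per partition, matching the figure above.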
+ * + * NB: we don't suggest to choose server with that many cores because backend + * filesystem itself, buffer cache, or underlying network stack might + * have some SMP scalability issues at that large scale. + * + * If user already has a fat machine with hundreds or thousands of cores, + * there are two choices for configuration: + * a) create CPU table from subset of all CPUs and run Lustre on + * top of this subset + * b) bind service threads on a few partitions, see modparameters of + * MDS and OSS for details +* + * NB: these calculations (and examples below) are simplified to help + * understanding, the real implementation is a little more complex, + * please see ptlrpc_server_nthreads_check() for details. + * + */ + + /* + * LDLM threads constants: + * + * Given 8 as factor and 24 as base threads number + * + * example 1) + * On 4-core machine we will have 24 + 8 * 4 = 56 threads. + * + * example 2) + * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56 + * threads for each partition and total threads number will be 112. + * + * example 3) + * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24) + * threads for each partition to keep service healthy, so total threads + * number should be 24 * 8 = 192. + * + * So with these constants, threads number will be at the similar level + * of old versions, unless target machine has over a hundred cores + */ +#define LDLM_THR_FACTOR 8 +#define LDLM_NTHRS_INIT PTLRPC_NTHRS_INIT +#define LDLM_NTHRS_BASE 24 +#define LDLM_NTHRS_MAX (num_online_cpus() == 1 ? 64 : 128) + +#define LDLM_BL_THREADS LDLM_NTHRS_AUTO_INIT +#define LDLM_CLIENT_NBUFS 1 +#define LDLM_SERVER_NBUFS 64 +#define LDLM_BUFSIZE (8 * 1024) +#define LDLM_MAXREQSIZE (5 * 1024) +#define LDLM_MAXREPSIZE (1024) + + /* + * MDS threads constants: + * + * Please see examples in "Thread Constants", MDS threads number will be at + * the comparable level of old versions, unless the server has many cores. + */ +#ifndef MDS_MAX_THREADS +#define MDS_MAX_THREADS 1024 +#define MDS_MAX_OTHR_THREADS 256 + +#else /* MDS_MAX_THREADS */ +#if MDS_MAX_THREADS < PTLRPC_NTHRS_INIT +#undef MDS_MAX_THREADS +#define MDS_MAX_THREADS PTLRPC_NTHRS_INIT +#endif +#define MDS_MAX_OTHR_THREADS max(PTLRPC_NTHRS_INIT, MDS_MAX_THREADS / 2) +#endif + +/* default service */ +#define MDS_THR_FACTOR 8 +#define MDS_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_NTHRS_MAX MDS_MAX_THREADS +#define MDS_NTHRS_BASE min(64, MDS_NTHRS_MAX) + +/* read-page service */ +#define MDS_RDPG_THR_FACTOR 4 +#define MDS_RDPG_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_RDPG_NTHRS_MAX MDS_MAX_OTHR_THREADS +#define MDS_RDPG_NTHRS_BASE min(48, MDS_RDPG_NTHRS_MAX) + +/* these should be removed when we remove setattr service in the future */ +#define MDS_SETA_THR_FACTOR 4 +#define MDS_SETA_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_SETA_NTHRS_MAX MDS_MAX_OTHR_THREADS +#define MDS_SETA_NTHRS_BASE min(48, MDS_SETA_NTHRS_MAX) + +/* non-affinity threads */ +#define MDS_OTHR_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_OTHR_NTHRS_MAX MDS_MAX_OTHR_THREADS + +#define MDS_NBUFS 64 + +/** + * Assume file name length = FNAME_MAX = 256 (true for ext3). 
+ * path name length = PATH_MAX = 4096 + * LOV MD size max = EA_MAX = 24 * 2000 + * (NB: 24 is size of lov_ost_data) + * LOV LOGCOOKIE size max = 32 * 2000 + * (NB: 32 is size of llog_cookie) + * symlink: FNAME_MAX + PATH_MAX <- largest + * link: FNAME_MAX + PATH_MAX (mds_rec_link < mds_rec_create) + * rename: FNAME_MAX + FNAME_MAX + * open: FNAME_MAX + EA_MAX + * + * MDS_MAXREQSIZE ~= 4736 bytes = + * lustre_msg + ldlm_request + mdt_body + mds_rec_create + FNAME_MAX + PATH_MAX + * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header + * + * Realistic size is about 512 bytes (20 character name + 128 char symlink), + * except in the open case where there are a large number of OSTs in a LOV. + */ +#define MDS_MAXREQSIZE (5 * 1024) /* >= 4736 */ +#define MDS_MAXREPSIZE (9 * 1024) /* >= 8300 */ + +/** + * MDS incoming request with LOV EA + * 24 = sizeof(struct lov_ost_data), i.e: replay of opencreate + */ +#define MDS_LOV_MAXREQSIZE max(MDS_MAXREQSIZE, \ + 362 + LOV_MAX_STRIPE_COUNT * 24) +/** + * MDS outgoing reply with LOV EA + * + * NB: max reply size Lustre 2.4+ client can get from old MDS is: + * LOV_MAX_STRIPE_COUNT * (llog_cookie + lov_ost_data) + extra bytes + * + * but 2.4 or later MDS will never send reply with llog_cookie to any + * version client. This macro is defined for server side reply buffer size. + */ +#define MDS_LOV_MAXREPSIZE MDS_LOV_MAXREQSIZE + +/** + * This is the size of a maximum REINT_SETXATTR request: + * + * lustre_msg 56 (32 + 4 x 5 + 4) + * ptlrpc_body 184 + * mdt_rec_setxattr 136 + * lustre_capa 120 + * name 256 (XATTR_NAME_MAX) + * value 65536 (XATTR_SIZE_MAX) + */ +#define MDS_EA_MAXREQSIZE 66288 + +/** + * These are the maximum request and reply sizes (rounded up to 1 KB + * boundaries) for the "regular" MDS_REQUEST_PORTAL and MDS_REPLY_PORTAL. + */ +#define MDS_REG_MAXREQSIZE (((max(MDS_EA_MAXREQSIZE, \ + MDS_LOV_MAXREQSIZE) + 1023) >> 10) << 10) +#define MDS_REG_MAXREPSIZE MDS_REG_MAXREQSIZE + +/** + * The update request includes all of updates from the create, which might + * include linkea (4K maxim), together with other updates, we set it to 1000K: + * lustre_msg + ptlrpc_body + OUT_UPDATE_BUFFER_SIZE_MAX + */ +#define OUT_MAXREQSIZE (1000 * 1024) +#define OUT_MAXREPSIZE MDS_MAXREPSIZE + +/** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */ +#define MDS_BUFSIZE max(MDS_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 8 * 1024) + +/** + * MDS_REG_BUFSIZE should at least be MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD. + * However, we need to allocate a much larger buffer for it because LNet + * requires each MD(rqbd) has at least MDS_REQ_MAXREQSIZE bytes left to avoid + * dropping of maximum-sized incoming request. So if MDS_REG_BUFSIZE is only a + * little larger than MDS_REG_MAXREQSIZE, then it can only fit in one request + * even there are about MDS_REG_MAX_REQSIZE bytes left in a rqbd, and memory + * utilization is very low. + * + * In the meanwhile, size of rqbd can't be too large, because rqbd can't be + * reused until all requests fit in it have been processed and released, + * which means one long blocked request can prevent the rqbd be reused. + * Now we set request buffer size to 160 KB, so even each rqbd is unlinked + * from LNet with unused 65 KB, buffer utilization will be about 59%. + * Please check LU-2432 for details. 
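Worked check of the sizes above: 56 + 184 + 136 + 120 + 256 + 65536 = 66288 bytes, hence MDS_EA_MAXREQSIZE; with the 2000-stripe figure used in the earlier comment, MDS_LOV_MAXREQSIZE works out to 362 + 2000 * 24 = 48362 bytes, so the setxattr case dominates and MDS_REG_MAXREQSIZE rounds 66288 up to the next 1 KB boundary, 66560 bytes (65 KB). That 65 KB is exactly the "unused 65 KB" in the 160 KB rqbd case: (160 - 65) / 160 is roughly the 59% utilization quoted above.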
+ */ +#define MDS_REG_BUFSIZE max(MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 160 * 1024) + +/** + * OUT_BUFSIZE = max_out_reqsize + max sptlrpc payload (~1K) which is + * about 10K, for the same reason as MDS_REG_BUFSIZE, we also give some + * extra bytes to each request buffer to improve buffer utilization rate. + */ +#define OUT_BUFSIZE max(OUT_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 24 * 1024) + +/** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */ +#define FLD_MAXREQSIZE (160) + +/** FLD_MAXREPSIZE == lustre_msg + ptlrpc_body */ +#define FLD_MAXREPSIZE (152) +#define FLD_BUFSIZE (1 << 12) + +/** + * SEQ_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc + lu_range + + * __u32 padding */ +#define SEQ_MAXREQSIZE (160) + +/** SEQ_MAXREPSIZE == lustre_msg + ptlrpc_body + lu_range */ +#define SEQ_MAXREPSIZE (152) +#define SEQ_BUFSIZE (1 << 12) + +/** MGS threads must be >= 3, see bug 22458 comment #28 */ +#define MGS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1) +#define MGS_NTHRS_MAX 32 + +#define MGS_NBUFS 64 +#define MGS_BUFSIZE (8 * 1024) +#define MGS_MAXREQSIZE (7 * 1024) +#define MGS_MAXREPSIZE (9 * 1024) + + /* + * OSS threads constants: + * + * Given 8 as factor and 64 as base threads number + * + * example 1): + * On 8-core server configured to 2 partitions, we will have + * 64 + 8 * 4 = 96 threads for each partition, 192 total threads. + * + * example 2): + * On 32-core machine configured to 4 partitions, we will have + * 64 + 8 * 8 = 112 threads for each partition, so total threads number + * will be 112 * 4 = 448. + * + * example 3): + * On 64-core machine configured to 4 partitions, we will have + * 64 + 16 * 8 = 192 threads for each partition, so total threads number + * will be 192 * 4 = 768 which is above limit OSS_NTHRS_MAX(512), so we + * cut off the value to OSS_NTHRS_MAX(512) / 4 which is 128 threads + * for each partition. + * + * So we can see that with these constants, threads number wil be at the + * similar level of old versions, unless the server has many cores. + */ + /* depress threads factor for VM with small memory size */ +#define OSS_THR_FACTOR min_t(int, 8, \ + NUM_CACHEPAGES >> (28 - PAGE_SHIFT)) +#define OSS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1) +#define OSS_NTHRS_BASE 64 + +/* threads for handling "create" request */ +#define OSS_CR_THR_FACTOR 1 +#define OSS_CR_NTHRS_INIT PTLRPC_NTHRS_INIT +#define OSS_CR_NTHRS_BASE 8 +#define OSS_CR_NTHRS_MAX 64 + +/** + * OST_IO_MAXREQSIZE ~= + * lustre_msg + ptlrpc_body + obdo + obd_ioobj + + * DT_MAX_BRW_PAGES * niobuf_remote + * + * - single object with 16 pages is 512 bytes + * - OST_IO_MAXREQSIZE must be at least 1 niobuf per page of data + * - Must be a multiple of 1024 + * - should allow a reasonably large SHORT_IO_BYTES size (64KB) + */ +#define _OST_MAXREQSIZE_BASE ((unsigned long)(sizeof(struct lustre_msg) + \ + /* lm_buflens */ sizeof(__u32) * 4 + \ + sizeof(struct ptlrpc_body) + \ + sizeof(struct obdo) + \ + sizeof(struct obd_ioobj) + \ + sizeof(struct niobuf_remote))) +#define _OST_MAXREQSIZE_SUM ((unsigned long)(_OST_MAXREQSIZE_BASE + \ + sizeof(struct niobuf_remote) * \ + DT_MAX_BRW_PAGES)) +/** + * FIEMAP request can be 4K+ for now + */ +#define OST_MAXREQSIZE (16UL * 1024UL) +#define OST_IO_MAXREQSIZE max(OST_MAXREQSIZE, \ + ((_OST_MAXREQSIZE_SUM - 1) | \ + (1024UL - 1)) + 1) +/* Safe estimate of free space in standard RPC, provides upper limit for # of + * bytes of i/o to pack in RPC (skipping bulk transfer). 
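+ *
+ * For illustration only (the numbers here are assumed, not derived from
+ * the macros): with 4 KiB pages, a request buffer of 18 KiB and a fixed
+ * header (_OST_MAXREQSIZE_BASE) of about 512 bytes, the estimate is
+ *
+ *        (18432 - 512) & PAGE_MASK = 16384
+ *
+ * i.e. 16 KiB of room for short i/o data, rounded down to a whole page.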
*/ +#define OST_MAX_SHORT_IO_BYTES ((OST_IO_MAXREQSIZE - _OST_MAXREQSIZE_BASE) & \ + PAGE_MASK) + +/* Actual size used for short i/o buffer. Calculation means this: + * At least one page (for large PAGE_SIZE), or 16 KiB, but not more + * than the available space aligned to a page boundary. */ +#define OBD_DEF_SHORT_IO_BYTES min(max(PAGE_SIZE, 16UL * 1024UL), \ + OST_MAX_SHORT_IO_BYTES) + +#define OST_MAXREPSIZE (9 * 1024) +#define OST_IO_MAXREPSIZE OST_MAXREPSIZE + +#define OST_NBUFS 64 +/** OST_BUFSIZE = max_reqsize + max sptlrpc payload size */ +#define OST_BUFSIZE max_t(int, OST_MAXREQSIZE + 1024, 32 * 1024) +/** + * OST_IO_MAXREQSIZE is 18K, giving extra 46K can increase buffer utilization + * rate of request buffer, please check comment of MDS_LOV_BUFSIZE for details. + */ +#define OST_IO_BUFSIZE max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024) + +/* Macro to hide a typecast and BUILD_BUG. */ +#define ptlrpc_req_async_args(_var, req) ({ \ + BUILD_BUG_ON(sizeof(*_var) > sizeof(req->rq_async_args)); \ + (typeof(_var))&req->rq_async_args; \ + }) + +struct ptlrpc_replay_async_args { + int praa_old_state; + int praa_old_status; +}; + +/** + * Structure to single define portal connection. + */ +struct ptlrpc_connection { + /** linkage for connections hash table */ + struct rhash_head c_hash; + /** Our own lnet nid for this connection */ + struct lnet_nid c_self; + /** Remote side nid for this connection */ + struct lnet_processid c_peer; + /** UUID of the other side */ + struct obd_uuid c_remote_uuid; + /** reference counter for this connection */ + atomic_t c_refcount; +}; + +/** Client definition for PortalRPC */ +struct ptlrpc_client { + /** What lnet portal does this client send messages to by default */ + __u32 cli_request_portal; + /** What portal do we expect replies on */ + __u32 cli_reply_portal; + /** Name of the client */ + const char *cli_name; +}; + +/** state flags of requests */ +/* XXX only ones left are those used by the bulk descs as well! */ +#define PTL_RPC_FL_INTR BIT(0) /* reply wait was interrupted by user */ +#define PTL_RPC_FL_TIMEOUT BIT(7) /* request timed out waiting for reply */ + +#define REQ_MAX_ACK_LOCKS 8 + +union ptlrpc_async_args { + /** + * Scratchpad for passing args to completion interpreter. Users + * cast to the struct of their choosing, and BUILD_BUG_ON that this is + * big enough. For _tons_ of context, OBD_ALLOC a struct and store + * a pointer to it here. The pointer_arg ensures this struct is at + * least big enough for that. + */ + void *pointer_arg[11]; + __u64 space[7]; +}; + +struct ptlrpc_request_set; +typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *); + +/** + * Definition of request set structure. + * Request set is a list of requests (not necessary to the same target) that + * once populated with RPCs could be sent in parallel. + * There are two kinds of request sets. General purpose and with dedicated + * serving thread. Example of the latter is ptlrpcd set. + * For general purpose sets once request set started sending it is impossible + * to add new requests to such set. + * Provides a way to call "completion callbacks" when all requests in the set + * returned. 
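+ *
+ * A minimal usage sketch (illustrative only; req1, req2, env and rc are
+ * placeholders, the functions are declared in the client API below):
+ *
+ *        struct ptlrpc_request_set *set = ptlrpc_prep_set();
+ *
+ *        if (set == NULL)
+ *                return -ENOMEM;
+ *        ptlrpc_set_add_req(set, req1);
+ *        ptlrpc_set_add_req(set, req2);
+ *        rc = ptlrpc_set_wait(env, set); /* send all, wait for completion */
+ *        ptlrpc_set_destroy(set);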
+ */ +struct ptlrpc_request_set { + atomic_t set_refcount; + /** number of in queue requests */ + atomic_t set_new_count; + /** number of uncompleted requests */ + atomic_t set_remaining; + /** wait queue to wait on for request events */ + wait_queue_head_t set_waitq; + /** List of requests in the set */ + struct list_head set_requests; + /** + * Lock for \a set_new_requests manipulations + * locked so that any old caller can communicate requests to + * the set holder who can then fold them into the lock-free set + */ + spinlock_t set_new_req_lock; + /** List of new yet unsent requests. Only used with ptlrpcd now. */ + struct list_head set_new_requests; + + /** rq_status of requests that have been freed already */ + int set_rc; + /** Additional fields used by the flow control extension */ + /** Maximum number of RPCs in flight */ + int set_max_inflight; + /** Callback function used to generate RPCs */ + set_producer_func set_producer; + /** opaq argument passed to the producer callback */ + void *set_producer_arg; + unsigned int set_allow_intr:1; +}; + +struct ptlrpc_bulk_desc; +struct ptlrpc_service_part; +struct ptlrpc_service; + +/** + * ptlrpc callback & work item stuff + */ +struct ptlrpc_cb_id { + void (*cbid_fn)(struct lnet_event *ev); /* specific callback fn */ + void *cbid_arg; /* additional arg */ +}; + +/** Maximum number of locks to fit into reply state */ +#define RS_MAX_LOCKS 8 +#define RS_DEBUG 0 + +/** + * Structure to define reply state on the server + * Reply state holds various reply message information. Also for "difficult" + * replies (rep-ack case) we store the state after sending reply and wait + * for the client to acknowledge the reception. In these cases locks could be + * added to the state for replay/failover consistency guarantees. + */ +struct ptlrpc_reply_state { + /** Callback description */ + struct ptlrpc_cb_id rs_cb_id; + /** Linkage for list of all reply states in a system */ + struct list_head rs_list; + /** Linkage for list of all reply states on same export */ + struct list_head rs_exp_list; + /** Linkage for list of all reply states for same obd */ + struct list_head rs_obd_list; +#if RS_DEBUG + struct list_head rs_debug_list; +#endif + /** A spinlock to protect the reply state flags */ + spinlock_t rs_lock; + /** Reply state flags */ + unsigned long rs_difficult:1; /* ACK/commit stuff */ + unsigned long rs_no_ack:1; /* no ACK, even for + difficult requests */ + unsigned long rs_scheduled:1; /* being handled? */ + unsigned long rs_scheduled_ever:1;/* any schedule attempts? */ + unsigned long rs_handled:1; /* been handled yet? */ + unsigned long rs_sent:1; /* Got LNET_EVENT_SEND? */ + unsigned long rs_unlinked:1; /* Reply MD unlinked? 
*/ + unsigned long rs_prealloc:1; /* rs from prealloc list */ + unsigned long rs_committed:1;/* the transaction was committed + and the rs was dispatched + by ptlrpc_commit_replies */ + unsigned long rs_convert_lock:1; /* need to convert saved + * locks to COS mode */ + atomic_t rs_refcount; /* number of users */ + /** Number of locks awaiting client ACK */ + int rs_nlocks; + + /** Size of the state */ + int rs_size; + /** opcode */ + __u32 rs_opc; + /** Transaction number */ + __u64 rs_transno; + /** xid */ + __u64 rs_xid; + struct obd_export *rs_export; + struct ptlrpc_service_part *rs_svcpt; + /** Lnet metadata handle for the reply */ + struct lnet_handle_md rs_md_h; + + /** Context for the sevice thread */ + struct ptlrpc_svc_ctx *rs_svc_ctx; + /** Reply buffer (actually sent to the client), encoded if needed */ + struct lustre_msg *rs_repbuf; /* wrapper */ + /** Size of the reply buffer */ + int rs_repbuf_len; /* wrapper buf length */ + /** Size of the reply message */ + int rs_repdata_len; /* wrapper msg length */ + /** + * Actual reply message. Its content is encrupted (if needed) to + * produce reply buffer for actual sending. In simple case + * of no network encryption we jus set \a rs_repbuf to \a rs_msg + */ + struct lustre_msg *rs_msg; /* reply message */ + + /** Handles of locks awaiting client reply ACK */ + struct lustre_handle rs_locks[RS_MAX_LOCKS]; + /** Lock modes of locks in \a rs_locks */ + enum ldlm_mode rs_modes[RS_MAX_LOCKS]; +}; + +struct ptlrpc_thread; + +/** RPC stages */ +enum rq_phase { + RQ_PHASE_NEW = 0xebc0de00, + RQ_PHASE_RPC = 0xebc0de01, + RQ_PHASE_BULK = 0xebc0de02, + RQ_PHASE_INTERPRET = 0xebc0de03, + RQ_PHASE_COMPLETE = 0xebc0de04, + RQ_PHASE_UNREG_RPC = 0xebc0de05, + RQ_PHASE_UNREG_BULK = 0xebc0de06, + RQ_PHASE_UNDEFINED = 0xebc0de07 +}; + +/** Type of request interpreter call-back */ +typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc); +/** Type of request resend call-back */ +typedef void (*ptlrpc_resend_cb_t)(struct ptlrpc_request *req, + void *arg); + +/** + * Definition of request pool structure. + * The pool is used to store empty preallocated requests for the case + * when we would actually need to send something without performing + * any allocations (to avoid e.g. OOM). + */ +struct ptlrpc_request_pool { + /** Locks the list */ + spinlock_t prp_lock; + /** list of ptlrpc_request structs */ + struct list_head prp_req_list; + /** Maximum message size that would fit into a rquest from this pool */ + int prp_rq_size; + /** Function to allocate more requests for this pool */ + int (*prp_populate)(struct ptlrpc_request_pool *, int); +}; + +struct lu_context; +struct lu_env; + +struct ldlm_lock; + +#include + +/** + * Basic request prioritization operations structure. + * The whole idea is centered around locks and RPCs that might affect locks. + * When a lock is contended we try to give priority to RPCs that might lead + * to fastest release of that lock. + * Currently only implemented for OSTs only in a way that makes all + * IO and truncate RPCs that are coming from a locked region where a lock is + * contended a priority over other requests. + */ +struct ptlrpc_hpreq_ops { + /** + * Check if the lock handle of the given lock is the same as + * taken from the request. + */ + int (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *); + /** + * Check if the request is a high priority one. 
+ */ + int (*hpreq_check)(struct ptlrpc_request *); + /** + * Called after the request has been handled. + */ + void (*hpreq_fini)(struct ptlrpc_request *); +}; + +struct ptlrpc_cli_req { + /** For bulk requests on client only: bulk descriptor */ + struct ptlrpc_bulk_desc *cr_bulk; + /** optional time limit for send attempts. This is a timeout + * not a timestamp so timeout_t (s32) is used instead of time64_t + */ + timeout_t cr_delay_limit; + /** time request was first queued */ + time64_t cr_queued_time; + /** request sent in nanoseconds */ + ktime_t cr_sent_ns; + /** time for request really sent out */ + time64_t cr_sent_out; + /** when req reply unlink must finish. */ + time64_t cr_reply_deadline; + /** when req bulk unlink must finish. */ + time64_t cr_bulk_deadline; + /** when req unlink must finish. */ + time64_t cr_req_deadline; + /** Portal to which this request would be sent */ + short cr_req_ptl; + /** Portal where to wait for reply and where reply would be sent */ + short cr_rep_ptl; + /** request resending number */ + unsigned int cr_resend_nr; + /** What was import generation when this request was sent */ + int cr_imp_gen; + enum lustre_imp_state cr_send_state; + /** Per-request waitq introduced by bug 21938 for recovery waiting */ + wait_queue_head_t cr_set_waitq; + /** Link item for request set lists */ + struct list_head cr_set_chain; + /** link to waited ctx */ + struct list_head cr_ctx_chain; + + /** client's half ctx */ + struct ptlrpc_cli_ctx *cr_cli_ctx; + /** Link back to the request set */ + struct ptlrpc_request_set *cr_set; + /** outgoing request MD handle */ + struct lnet_handle_md cr_req_md_h; + /** request-out callback parameter */ + struct ptlrpc_cb_id cr_req_cbid; + /** incoming reply MD handle */ + struct lnet_handle_md cr_reply_md_h; + wait_queue_head_t cr_reply_waitq; + /** reply callback parameter */ + struct ptlrpc_cb_id cr_reply_cbid; + /** Async completion handler, called when reply is received */ + ptlrpc_interpterer_t cr_reply_interp; + /** Resend handler, called when request is resend to update RPC data */ + ptlrpc_resend_cb_t cr_resend_cb; + /** Async completion context */ + union ptlrpc_async_args cr_async_args; + /** Opaq data for replay and commit callbacks. */ + void *cr_cb_data; + /** Link to the imp->imp_unreplied_list */ + struct list_head cr_unreplied_list; + /** + * Commit callback, called when request is committed and about to be + * freed. 
+ */
+ void (*cr_commit_cb)(struct ptlrpc_request *);
+ /** Replay callback, called after request is replayed at recovery */
+ void (*cr_replay_cb)(struct ptlrpc_request *);
+};
+
+/** client request member aliases */
+/* NB: these aliases should NOT be used by any new code; instead they should
+ * be removed step by step to avoid potential abuse */
+#define rq_bulk rq_cli.cr_bulk
+#define rq_delay_limit rq_cli.cr_delay_limit
+#define rq_queued_time rq_cli.cr_queued_time
+#define rq_sent_ns rq_cli.cr_sent_ns
+#define rq_real_sent rq_cli.cr_sent_out
+#define rq_reply_deadline rq_cli.cr_reply_deadline
+#define rq_bulk_deadline rq_cli.cr_bulk_deadline
+#define rq_req_deadline rq_cli.cr_req_deadline
+#define rq_nr_resend rq_cli.cr_resend_nr
+#define rq_request_portal rq_cli.cr_req_ptl
+#define rq_reply_portal rq_cli.cr_rep_ptl
+#define rq_import_generation rq_cli.cr_imp_gen
+#define rq_send_state rq_cli.cr_send_state
+#define rq_set_chain rq_cli.cr_set_chain
+#define rq_ctx_chain rq_cli.cr_ctx_chain
+#define rq_set rq_cli.cr_set
+#define rq_set_waitq rq_cli.cr_set_waitq
+#define rq_cli_ctx rq_cli.cr_cli_ctx
+#define rq_req_md_h rq_cli.cr_req_md_h
+#define rq_req_cbid rq_cli.cr_req_cbid
+#define rq_reply_md_h rq_cli.cr_reply_md_h
+#define rq_reply_waitq rq_cli.cr_reply_waitq
+#define rq_reply_cbid rq_cli.cr_reply_cbid
+#define rq_interpret_reply rq_cli.cr_reply_interp
+#define rq_resend_cb rq_cli.cr_resend_cb
+#define rq_async_args rq_cli.cr_async_args
+#define rq_cb_data rq_cli.cr_cb_data
+#define rq_unreplied_list rq_cli.cr_unreplied_list
+#define rq_commit_cb rq_cli.cr_commit_cb
+#define rq_replay_cb rq_cli.cr_replay_cb
+
+struct ptlrpc_srv_req {
+ /** initial thread servicing this request */
+ struct ptlrpc_thread *sr_svc_thread;
+ /**
+ * Server-side list of incoming unserved requests sorted by arrival
+ * time. Traversed from time to time to notice about-to-expire
+ * requests and send back "early replies" to clients to let them
+ * know the server is alive and well, just too busy to service their
+ * requests in time
+ */
+ struct list_head sr_timed_list;
+ /** server-side per-export list */
+ struct list_head sr_exp_list;
+ /** server-side history, used for debugging purposes.
*/ + struct list_head sr_hist_list; + /** history sequence # */ + __u64 sr_hist_seq; + /** the index of service's srv_at_array into which request is linked */ + __u32 sr_at_index; + /** authed uid */ + uid_t sr_auth_uid; + /** authed uid mapped to */ + uid_t sr_auth_mapped_uid; + /** RPC is generated from what part of Lustre */ + enum lustre_sec_part sr_sp_from; + /** request session context */ + struct lu_context sr_ses; + /** \addtogroup nrs + * @{ + */ + /** stub for NRS request */ + struct ptlrpc_nrs_request sr_nrq; + /** @} nrs */ + /** request arrival time */ + struct timespec64 sr_arrival_time; + /** server's half ctx */ + struct ptlrpc_svc_ctx *sr_svc_ctx; + /** (server side), pointed directly into req buffer */ + struct ptlrpc_user_desc *sr_user_desc; + /** separated reply state, may be vmalloc'd */ + struct ptlrpc_reply_state *sr_reply_state; + /** server-side hp handlers */ + struct ptlrpc_hpreq_ops *sr_ops; + /** incoming request buffer */ + struct ptlrpc_request_buffer_desc *sr_rqbd; +}; + +/** server request member alias */ +/* NB: these alias should NOT be used by any new code, instead they should + * be removed step by step to avoid potential abuse */ +#define rq_svc_thread rq_srv.sr_svc_thread +#define rq_timed_list rq_srv.sr_timed_list +#define rq_exp_list rq_srv.sr_exp_list +#define rq_history_list rq_srv.sr_hist_list +#define rq_history_seq rq_srv.sr_hist_seq +#define rq_at_index rq_srv.sr_at_index +#define rq_auth_uid rq_srv.sr_auth_uid +#define rq_auth_mapped_uid rq_srv.sr_auth_mapped_uid +#define rq_sp_from rq_srv.sr_sp_from +#define rq_session rq_srv.sr_ses +#define rq_nrq rq_srv.sr_nrq +#define rq_arrival_time rq_srv.sr_arrival_time +#define rq_reply_state rq_srv.sr_reply_state +#define rq_svc_ctx rq_srv.sr_svc_ctx +#define rq_user_desc rq_srv.sr_user_desc +#define rq_ops rq_srv.sr_ops +#define rq_rqbd rq_srv.sr_rqbd +#define rq_reqmsg rq_pill.rc_reqmsg +#define rq_repmsg rq_pill.rc_repmsg +#define rq_req_swab_mask rq_pill.rc_req_swab_mask +#define rq_rep_swab_mask rq_pill.rc_rep_swab_mask + +/** + * Represents remote procedure call. + * + * This is a staple structure used by everybody wanting to send a request + * in Lustre. + */ +struct ptlrpc_request { + /* Request type: one of PTL_RPC_MSG_* */ + int rq_type; + /** Result of request processing */ + int rq_status; + /** + * Linkage item through which this request is included into + * sending/delayed lists on client and into rqbd list on server + */ + struct list_head rq_list; + /** Lock to protect request flags and some other important bits, like + * rq_list + */ + spinlock_t rq_lock; + spinlock_t rq_early_free_lock; + /** client-side flags are serialized by rq_lock @{ */ + unsigned int rq_intr:1, rq_replied:1, rq_err:1, + rq_timedout:1, rq_resend:1, rq_restart:1, + /** + * when ->rq_replay is set, request is kept by the client even + * after server commits corresponding transaction. This is + * used for operations that require sequence of multiple + * requests to be replayed. The only example currently is file + * open/close. When last request in such a sequence is + * committed, ->rq_replay is cleared on all requests in the + * sequence. 
+ */ + rq_replay:1, + rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, + rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1, + rq_early:1, + rq_req_unlinked:1, /* unlinked request buffer from lnet */ + rq_reply_unlinked:1, /* unlinked reply buffer from lnet */ + rq_memalloc:1, /* req originated from "kswapd" */ + rq_committed:1, + rq_reply_truncated:1, + /** whether the "rq_set" is a valid one */ + rq_invalid_rqset:1, + rq_generation_set:1, + /** do not resend request on -EINPROGRESS */ + rq_no_retry_einprogress:1, + /* allow the req to be sent if the import is in recovery + * status */ + rq_allow_replay:1, + /* bulk request, sent to server, but uncommitted */ + rq_unstable:1, + rq_early_free_repbuf:1, /* free reply buffer in advance */ + rq_allow_intr:1; + /** @} */ + + /** server-side flags @{ */ + unsigned int + rq_hp:1, /**< high priority RPC */ + rq_at_linked:1, /**< link into service's srv_at_array */ + rq_packed_final:1, /**< packed final reply */ + rq_obsolete:1; /* aborted by a signal on a client */ + /** @} */ + + /** one of RQ_PHASE_* */ + enum rq_phase rq_phase; + /** one of RQ_PHASE_* to be used next */ + enum rq_phase rq_next_phase; + /** + * client-side refcount for SENT race, server-side refcounf + * for multiple replies + */ + atomic_t rq_refcount; + /** + * client-side: + * !rq_truncate : # reply bytes actually received, + * rq_truncate : required repbuf_len for resend + */ + int rq_nob_received; + /** Request length */ + int rq_reqlen; + /** Reply length */ + int rq_replen; + /** Pool if request is from preallocated list */ + struct ptlrpc_request_pool *rq_pool; + /** Transaction number */ + __u64 rq_transno; + /** xid */ + __u64 rq_xid; + /** bulk match bits */ + __u64 rq_mbits; + /** reply match bits */ + __u64 rq_rep_mbits; + /** + * List item to for replay list. Not yet committed requests get linked + * there. + * Also see \a rq_replay comment above. 
+ * It's also link chain on obd_export::exp_req_replay_queue + */ + struct list_head rq_replay_list; + /** non-shared members for client & server request*/ + union { + struct ptlrpc_cli_req rq_cli; + struct ptlrpc_srv_req rq_srv; + }; + /** + * security and encryption data + * @{ */ + /** description of flavors for client & server */ + struct sptlrpc_flavor rq_flvr; + + /** + * SELinux policy info at the time of the request + * sepol string format is: + * ::: + */ + char rq_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + 1]; + + /* client/server security flags */ + unsigned int + rq_ctx_init:1, /* context initiation */ + rq_ctx_fini:1, /* context destroy */ + rq_bulk_read:1, /* request bulk read */ + rq_bulk_write:1, /* request bulk write */ + /* server authentication flags */ + rq_auth_gss:1, /* authenticated by gss */ + rq_auth_usr_root:1, /* authed as root */ + rq_auth_usr_mdt:1, /* authed as mdt */ + rq_auth_usr_ost:1, /* authed as ost */ + /* security tfm flags */ + rq_pack_udesc:1, + rq_pack_bulk:1, + /* doesn't expect reply FIXME */ + rq_no_reply:1, + rq_pill_init:1, /* pill initialized */ + rq_srv_req:1; /* server request */ + + + /** various buffer pointers */ + struct lustre_msg *rq_reqbuf; /**< req wrapper, vmalloc*/ + char *rq_repbuf; /**< rep buffer, vmalloc */ + struct lustre_msg *rq_repdata; /**< rep wrapper msg */ + /** only in priv mode */ + struct lustre_msg *rq_clrbuf; + int rq_reqbuf_len; /* req wrapper buf len */ + int rq_reqdata_len; /* req wrapper msg len */ + int rq_repbuf_len; /* rep buffer len */ + int rq_repdata_len; /* rep wrapper msg len */ + int rq_clrbuf_len; /* only in priv mode */ + int rq_clrdata_len; /* only in priv mode */ + + /** early replies go to offset 0, regular replies go after that */ + unsigned int rq_reply_off; + /** @} */ + + /** how many early replies (for stats) */ + int rq_early_count; + /** Server-side, export on which request was received */ + struct obd_export *rq_export; + /** import where request is being sent */ + struct obd_import *rq_import; + /** our LNet NID */ + lnet_nid_t rq_self; + /** Peer description (the other side) */ + struct lnet_process_id rq_peer; + /** Descriptor for the NID from which the peer sent the request. */ + struct lnet_process_id rq_source; + /** + * service time estimate (secs) + * If the request is not served by this time, it is marked as timed out. + * Do not change to time64_t since this is transmitted over the wire. + * + * The linux kernel handles timestamps with time64_t and timeouts + * are normally done with jiffies. Lustre shares the rq_timeout between + * nodes. Since jiffies can vary from node to node Lustre instead + * will express the timeout value in seconds. To avoid confusion with + * timestamps (time64_t) and jiffy timeouts (long) Lustre timeouts + * are expressed in s32 (timeout_t). Also what is transmitted over + * the wire is 32 bits. + */ + timeout_t rq_timeout; + /** + * when request/reply sent (secs), or time when request should be sent + */ + time64_t rq_sent; + /** when request must finish. */ + time64_t rq_deadline; + /** request format description */ + struct req_capsule rq_pill; +}; + +/** + * Call completion handler for rpc if any, return it's status or original + * rc if there was no handler defined for this request. 
+ */ +static inline int ptlrpc_req_interpret(const struct lu_env *env, + struct ptlrpc_request *req, int rc) +{ + if (req->rq_interpret_reply != NULL) { + req->rq_status = req->rq_interpret_reply(env, req, + &req->rq_async_args, + rc); + return req->rq_status; + } + + return rc; +} + +/** \addtogroup nrs + * @{ + */ +void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req); + +/* + * Can the request be moved from the regular NRS head to the high-priority NRS + * head (of the same PTLRPC service partition), if any? + * + * For a reliable result, this should be checked under svcpt->scp_req lock. + */ +static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_request *nrq = &req->rq_nrq; + + /** + * LU-898: Check ptlrpc_nrs_request::nr_enqueued to make sure the + * request has been enqueued first, and ptlrpc_nrs_request::nr_started + * to make sure it has not been scheduled yet (analogous to previous + * (non-NRS) checking of !list_empty(&ptlrpc_request::rq_list). + */ + return nrq->nr_enqueued && !nrq->nr_started && !req->rq_hp; +} +/** @} nrs */ + +/** + * Convert numerical request phase value \a phase into text string description + */ +static inline const char * +ptlrpc_phase2str(enum rq_phase phase) +{ + switch (phase) { + case RQ_PHASE_NEW: + return "New"; + case RQ_PHASE_RPC: + return "Rpc"; + case RQ_PHASE_BULK: + return "Bulk"; + case RQ_PHASE_INTERPRET: + return "Interpret"; + case RQ_PHASE_COMPLETE: + return "Complete"; + case RQ_PHASE_UNREG_RPC: + return "UnregRPC"; + case RQ_PHASE_UNREG_BULK: + return "UnregBULK"; + default: + return "?Phase?"; + } +} + +/** + * Convert numerical request phase of the request \a req into text stringi + * description + */ +static inline const char * +ptlrpc_rqphase2str(struct ptlrpc_request *req) +{ + return ptlrpc_phase2str(req->rq_phase); +} + +/** + * Debugging functions and helpers to print request structure into debug log + * @{ + */ +/* Spare the preprocessor, spoil the bugs. */ +#define FLAG(field, str) (field ? str : "") + +/** Convert bit flags into a string */ +#define DEBUG_REQ_FLAGS(req) \ + ptlrpc_rqphase2str(req), \ + FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \ + FLAG(req->rq_err, "E"), FLAG(req->rq_net_err, "e"), \ + FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \ + FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \ + FLAG(req->rq_no_resend, "N"), FLAG(req->rq_no_reply, "n"), \ + FLAG(req->rq_waiting, "W"), \ + FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H"), \ + FLAG(req->rq_committed, "M"), \ + FLAG(req->rq_req_unlinked, "Q"), \ + FLAG(req->rq_reply_unlinked, "U"), \ + FLAG(req->rq_receiving_reply, "r") + +#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s" + +void _debug_req(struct ptlrpc_request *req, + struct libcfs_debug_msg_data *data, const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Helper that decides if we need to print request accordig to current debug + * level settings + */ +#define debug_req(msgdata, mask, cdls, req, fmt, a...) \ +do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _debug_req((req), msgdata, fmt, ##a); \ +} while(0) + +/** + * This is the debug print function you need to use to print request sturucture + * content into lustre debug log. + * for most callers (level is a constant) this is resolved at compile time */ +#define DEBUG_REQ(level, req, fmt, args...) 
\ +do { \ + if ((level) & (D_ERROR | D_WARNING)) { \ + static struct cfs_debug_limit_state cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \ + } \ +} while (0) +/** @} */ + +enum ptlrpc_bulk_op_type { + PTLRPC_BULK_OP_ACTIVE = 0x00000001, + PTLRPC_BULK_OP_PASSIVE = 0x00000002, + PTLRPC_BULK_OP_PUT = 0x00000004, + PTLRPC_BULK_OP_GET = 0x00000008, + PTLRPC_BULK_GET_SOURCE = PTLRPC_BULK_OP_PASSIVE | PTLRPC_BULK_OP_GET, + PTLRPC_BULK_PUT_SINK = PTLRPC_BULK_OP_PASSIVE | PTLRPC_BULK_OP_PUT, + PTLRPC_BULK_GET_SINK = PTLRPC_BULK_OP_ACTIVE | PTLRPC_BULK_OP_GET, + PTLRPC_BULK_PUT_SOURCE = PTLRPC_BULK_OP_ACTIVE | PTLRPC_BULK_OP_PUT, +}; + +static inline bool ptlrpc_is_bulk_op_get(enum ptlrpc_bulk_op_type type) +{ + return (type & PTLRPC_BULK_OP_GET) == PTLRPC_BULK_OP_GET; +} + +static inline bool ptlrpc_is_bulk_get_source(enum ptlrpc_bulk_op_type type) +{ + return (type & PTLRPC_BULK_GET_SOURCE) == PTLRPC_BULK_GET_SOURCE; +} + +static inline bool ptlrpc_is_bulk_put_sink(enum ptlrpc_bulk_op_type type) +{ + return (type & PTLRPC_BULK_PUT_SINK) == PTLRPC_BULK_PUT_SINK; +} + +static inline bool ptlrpc_is_bulk_get_sink(enum ptlrpc_bulk_op_type type) +{ + return (type & PTLRPC_BULK_GET_SINK) == PTLRPC_BULK_GET_SINK; +} + +static inline bool ptlrpc_is_bulk_put_source(enum ptlrpc_bulk_op_type type) +{ + return (type & PTLRPC_BULK_PUT_SOURCE) == PTLRPC_BULK_PUT_SOURCE; +} + +static inline bool ptlrpc_is_bulk_op_active(enum ptlrpc_bulk_op_type type) +{ + return ((type & PTLRPC_BULK_OP_ACTIVE) | + (type & PTLRPC_BULK_OP_PASSIVE)) + == PTLRPC_BULK_OP_ACTIVE; +} + +static inline bool ptlrpc_is_bulk_op_passive(enum ptlrpc_bulk_op_type type) +{ + return ((type & PTLRPC_BULK_OP_ACTIVE) | + (type & PTLRPC_BULK_OP_PASSIVE)) + == PTLRPC_BULK_OP_PASSIVE; +} + +struct ptlrpc_bulk_frag_ops { + /** + * Add a page \a page to the bulk descriptor \a desc + * Data to transfer in the page starts at offset \a pageoffset and + * amount of data to transfer from the page is \a len + */ + void (*add_kiov_frag)(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len); + + /* + * Add a \a fragment to the bulk descriptor \a desc. + * Data to transfer in the fragment is pointed to by \a frag + * The size of the fragment is \a len + */ + int (*add_iov_frag)(struct ptlrpc_bulk_desc *desc, void *frag, int len); + + /** + * Uninitialize and free bulk descriptor \a desc. + * Works on bulk descriptors both from server and client side. + */ + void (*release_frags)(struct ptlrpc_bulk_desc *desc); +}; + +extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_pin_ops; +extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_nopin_ops; + +/* + * Definition of bulk descriptor. + * Bulks are special "Two phase" RPCs where initial request message + * is sent first and it is followed bt a transfer (o receiving) of a large + * amount of data to be settled into pages referenced from the bulk descriptors. + * Bulks transfers (the actual data following the small requests) are done + * on separate LNet portals. + * In lustre we use bulk transfers for READ and WRITE transfers from/to OSTs. + * Another user is readpage for MDT. 
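+ *
+ * A client-side WRITE setup, as an illustrative sketch (npages, pages and
+ * the OST_BULK_PORTAL constant are placeholders, not defined here):
+ *
+ *        desc = ptlrpc_prep_bulk_imp(req, npages, 1,
+ *                                    PTLRPC_BULK_GET_SOURCE,
+ *                                    OST_BULK_PORTAL,
+ *                                    &ptlrpc_bulk_kiov_pin_ops);
+ *        for (i = 0; i < npages; i++)
+ *                desc->bd_frag_ops->add_kiov_frag(desc, pages[i], 0,
+ *                                                 PAGE_SIZE);
+ *
+ * For a WRITE the client is the source of the bulk and the server GETs
+ * the data, hence PTLRPC_BULK_GET_SOURCE on the client side.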
+ */ +struct ptlrpc_bulk_desc { + unsigned int bd_refs; /* number MD's assigned including zero-sends */ + /** completed with failure */ + unsigned long bd_failure:1; + /** client side */ + unsigned long bd_registered:1; + /** For serialization with callback */ + spinlock_t bd_lock; + /** {put,get}{source,sink}{kvec,kiov} */ + enum ptlrpc_bulk_op_type bd_type; + /** LNet portal for this bulk */ + __u32 bd_portal; + /** Server side - export this bulk created for */ + struct obd_export *bd_export; + /** Client side - import this bulk was sent on */ + struct obd_import *bd_import; + /** Back pointer to the request */ + struct ptlrpc_request *bd_req; + const struct ptlrpc_bulk_frag_ops *bd_frag_ops; + wait_queue_head_t bd_waitq; /* server side only WQ */ + int bd_iov_count; /* # entries in bd_iov */ + int bd_max_iov; /* allocated size of bd_iov */ + int bd_nob; /* # bytes covered */ + int bd_nob_transferred; /* # bytes GOT/PUT */ + unsigned int bd_nob_last; /* # bytes in last MD */ + + __u64 bd_last_mbits; + + struct ptlrpc_cb_id bd_cbid; /* network callback info */ + lnet_nid_t bd_sender; /* stash event::sender */ + int bd_md_count; /* # valid entries in bd_mds */ + int bd_md_max_brw; /* max entries in bd_mds */ + + /** array of offsets for each MD */ + unsigned int bd_mds_off[PTLRPC_BULK_OPS_COUNT]; + /** array of associated MDs */ + struct lnet_handle_md bd_mds[PTLRPC_BULK_OPS_COUNT]; + + /* encrypted iov, size is either 0 or bd_iov_count. */ + struct bio_vec *bd_enc_vec; + struct bio_vec *bd_vec; +}; + +enum { + SVC_INIT = 0, + SVC_STOPPED = BIT(0), + SVC_STOPPING = BIT(1), + SVC_STARTING = BIT(2), + SVC_RUNNING = BIT(3), +}; + +#define PTLRPC_THR_NAME_LEN 32 +/** + * Definition of server service thread structure + */ +struct ptlrpc_thread { + /** + * List of active threads in svcpt->scp_threads + */ + struct list_head t_link; + /** + * thread-private data (preallocated vmalloc'd memory) + */ + void *t_data; + __u32 t_flags; + /** + * service thread index, from ptlrpc_start_threads + */ + unsigned int t_id; + /** + * service thread + */ + struct task_struct *t_task; + pid_t t_pid; + ktime_t t_touched; + /** + * put watchdog in the structure per thread b=14840 + */ + struct delayed_work t_watchdog; + /** + * the svc this thread belonged to b=18582 + */ + struct ptlrpc_service_part *t_svcpt; + wait_queue_head_t t_ctl_waitq; + struct lu_env *t_env; + char t_name[PTLRPC_THR_NAME_LEN]; +}; + +static inline int thread_is_init(struct ptlrpc_thread *thread) +{ + return thread->t_flags == 0; +} + +static inline int thread_is_stopped(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPED); +} + +static inline int thread_is_stopping(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPING); +} + +static inline int thread_is_starting(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STARTING); +} + +static inline int thread_is_running(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_RUNNING); +} + +static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags &= ~flags; +} + +static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags = flags; +} + +static inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags |= flags; +} + +static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread, + __u32 flags) +{ + if (thread->t_flags & flags) { + thread->t_flags &= ~flags; + return 1; + } + return 0; +} + 
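+/*
+ * Illustrative sketch only (the exact transitions are an assumption; see
+ * the service main loop, ptlrpc_main(), for the authoritative state
+ * machine): the helpers above typically drive a simple thread lifecycle:
+ *
+ *        thread_add_flags(thread, SVC_STARTING);
+ *        ... allocate per-thread state ...
+ *        thread_clear_flags(thread, SVC_STARTING);
+ *        thread_add_flags(thread, SVC_RUNNING);
+ *        while (thread_is_running(thread))
+ *                ... handle incoming requests ...
+ *        thread_set_flags(thread, SVC_STOPPED);
+ */
+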
+/** + * Request buffer descriptor structure. + * This is a structure that contains one posted request buffer for service. + * Once data land into a buffer, event callback creates actual request and + * notifies wakes one of the service threads to process new incoming request. + * More than one request can fit into the buffer. + */ +struct ptlrpc_request_buffer_desc { + /** Link item for rqbds on a service */ + struct list_head rqbd_list; + /** History of requests for this buffer */ + struct list_head rqbd_reqs; + /** Back pointer to service for which this buffer is registered */ + struct ptlrpc_service_part *rqbd_svcpt; + /** LNet descriptor */ + struct lnet_handle_md rqbd_md_h; + int rqbd_refcount; + /** The buffer itself */ + char *rqbd_buffer; + struct ptlrpc_cb_id rqbd_cbid; + /** + * This "embedded" request structure is only used for the + * last request to fit into the buffer + */ + struct ptlrpc_request rqbd_req; +}; + +typedef int (*svc_handler_t)(struct ptlrpc_request *req); + +struct ptlrpc_service_ops { + /** + * if non-NULL called during thread creation (ptlrpc_start_thread()) + * to initialize service specific per-thread state. + */ + int (*so_thr_init)(struct ptlrpc_thread *thr); + /** + * if non-NULL called during thread shutdown (ptlrpc_main()) to + * destruct state created by ->srv_init(). + */ + void (*so_thr_done)(struct ptlrpc_thread *thr); + /** + * Handler function for incoming requests for this service + */ + int (*so_req_handler)(struct ptlrpc_request *req); + /** + * function to determine priority of the request, it's called + * on every new request + */ + int (*so_hpreq_handler)(struct ptlrpc_request *); + /** + * service-specific print fn + */ + void (*so_req_printer)(void *, struct ptlrpc_request *); +}; + +#ifndef __cfs_cacheline_aligned +/* NB: put it here for reducing patche dependence */ +# define __cfs_cacheline_aligned +#endif + +/** + * How many high priority requests to serve before serving one normal + * priority request + */ +#define PTLRPC_SVC_HP_RATIO 10 + +/** + * Definition of PortalRPC service. + * The service is listening on a particular portal (like tcp port) + * and perform actions for a specific server like IO service for OST + * or general metadata service for MDS. 
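+ *
+ * A registration sketch, for illustration only (the portal constants and
+ * my_req_handler are placeholders; the conf structures and
+ * ptlrpc_register_service() are declared further below):
+ *
+ *        struct ptlrpc_service_conf conf = {
+ *                .psc_name               = "ldlm_cbd",
+ *                .psc_watchdog_factor    = 2,
+ *                .psc_buf = {
+ *                        .bc_nbufs        = LDLM_CLIENT_NBUFS,
+ *                        .bc_buf_size     = LDLM_BUFSIZE,
+ *                        .bc_req_max_size = LDLM_MAXREQSIZE,
+ *                        .bc_rep_max_size = LDLM_MAXREPSIZE,
+ *                        .bc_req_portal   = LDLM_CB_REQUEST_PORTAL,
+ *                        .bc_rep_portal   = LDLM_CB_REPLY_PORTAL,
+ *                },
+ *                .psc_thr = {
+ *                        .tc_thr_name    = "ldlm_cb",
+ *                        .tc_nthrs_init  = LDLM_NTHRS_INIT,
+ *                        .tc_nthrs_max   = LDLM_NTHRS_MAX,
+ *                },
+ *                .psc_ops = {
+ *                        .so_req_handler = my_req_handler,
+ *                },
+ *        };
+ *
+ *        svc = ptlrpc_register_service(&conf, parent_kset, debugfs_entry);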
+ */ +struct ptlrpc_service { + /** serialize /proc operations */ + spinlock_t srv_lock; + /** most often accessed fields */ + /** chain thru all services */ + struct list_head srv_list; + /** service operations table */ + struct ptlrpc_service_ops srv_ops; + /** only statically allocated strings here; we don't clean them */ + char *srv_name; + /** only statically allocated strings here; we don't clean them */ + char *srv_thread_name; + /** threads # should be created for each partition on initializing */ + int srv_nthrs_cpt_init; + /** limit of threads number for each partition */ + int srv_nthrs_cpt_limit; + /** Root of debugfs dir tree for this service */ + struct dentry *srv_debugfs_entry; + /** Pointer to statistic data for this service */ + struct lprocfs_stats *srv_stats; + /** # hp per lp reqs to handle */ + int srv_hpreq_ratio; + /** biggest request to receive */ + int srv_max_req_size; + /** biggest reply to send */ + int srv_max_reply_size; + /** size of individual buffers */ + int srv_buf_size; + /** # buffers to allocate in 1 group */ + int srv_nbuf_per_group; + /** Local portal on which to receive requests */ + __u32 srv_req_portal; + /** Portal on the client to send replies to */ + __u32 srv_rep_portal; + /** + * Tags for lu_context associated with this thread, see struct + * lu_context. + */ + __u32 srv_ctx_tags; + /** soft watchdog timeout multiplier */ + int srv_watchdog_factor; + /** under unregister_service */ + unsigned srv_is_stopping:1; + /** Whether or not to restrict service threads to CPUs in this CPT */ + unsigned srv_cpt_bind:1; + + /** max # request buffers */ + int srv_nrqbds_max; + /** max # request buffers in history per partition */ + int srv_hist_nrqbds_cpt_max; + /** number of CPTs this service associated with */ + int srv_ncpts; + /** CPTs array this service associated with */ + __u32 *srv_cpts; + /** 2^srv_cptab_bits >= cfs_cpt_numbert(srv_cptable) */ + int srv_cpt_bits; + /** CPT table this service is running over */ + struct cfs_cpt_table *srv_cptable; + + /* sysfs object */ + struct kobject srv_kobj; + struct completion srv_kobj_unregister; + /** + * partition data for ptlrpc service + */ + struct ptlrpc_service_part *srv_parts[0]; +}; + +/** + * Definition of PortalRPC service partition data. + * Although a service only has one instance of it right now, but we + * will have multiple instances very soon (instance per CPT). + * + * it has four locks: + * \a scp_lock + * serialize operations on rqbd and requests waiting for preprocess + * \a scp_req_lock + * serialize operations active requests sent to this portal + * \a scp_at_lock + * serialize adaptive timeout stuff + * \a scp_rep_lock + * serialize operations on RS list (reply states) + * + * We don't have any use-case to take two or more locks at the same time + * for now, so there is no lock order issue. + */ +struct ptlrpc_service_part { + /** back reference to owner */ + struct ptlrpc_service *scp_service __cfs_cacheline_aligned; + /* CPT id, reserved */ + int scp_cpt; + /** always increasing number */ + int scp_thr_nextid; + /** # of starting threads */ + int scp_nthrs_starting; + /** # running threads */ + int scp_nthrs_running; + /** service threads list */ + struct list_head scp_threads; + + /** + * serialize the following fields, used for protecting + * rqbd list and incoming requests waiting for preprocess, + * threads starting & stopping are also protected by this lock. 
+ */ + spinlock_t scp_lock __cfs_cacheline_aligned; + /** userland serialization */ + struct mutex scp_mutex; + /** total # req buffer descs allocated */ + int scp_nrqbds_total; + /** # posted request buffers for receiving */ + int scp_nrqbds_posted; + /** in progress of allocating rqbd */ + int scp_rqbd_allocating; + /** # incoming reqs */ + int scp_nreqs_incoming; + /** request buffers to be reposted */ + struct list_head scp_rqbd_idle; + /** req buffers receiving */ + struct list_head scp_rqbd_posted; + /** incoming reqs */ + struct list_head scp_req_incoming; + /** timeout before re-posting reqs, in jiffies */ + long scp_rqbd_timeout; + /** + * all threads sleep on this. This wait-queue is signalled when new + * incoming request arrives and when difficult reply has to be handled. + */ + wait_queue_head_t scp_waitq; + + /** request history */ + struct list_head scp_hist_reqs; + /** request buffer history */ + struct list_head scp_hist_rqbds; + /** # request buffers in history */ + int scp_hist_nrqbds; + /** sequence number for request */ + __u64 scp_hist_seq; + /** highest seq culled from history */ + __u64 scp_hist_seq_culled; + + /** + * serialize the following fields, used for processing requests + * sent to this portal + */ + spinlock_t scp_req_lock __cfs_cacheline_aligned; + /** # reqs in either of the NRS heads below */ + /** # reqs being served */ + int scp_nreqs_active; + /** # HPreqs being served */ + int scp_nhreqs_active; + /** # hp requests handled */ + int scp_hreq_count; + + /** NRS head for regular requests */ + struct ptlrpc_nrs scp_nrs_reg; + /** NRS head for HP requests; this is only valid for services that can + * handle HP requests */ + struct ptlrpc_nrs *scp_nrs_hp; + + /** AT stuff */ + /** @{ */ + /** + * serialize the following fields, used for changes on + * adaptive timeout + */ + spinlock_t scp_at_lock __cfs_cacheline_aligned; + /** estimated rpc service time */ + struct adaptive_timeout scp_at_estimate; + /** reqs waiting for replies */ + struct ptlrpc_at_array scp_at_array; + /** early reply timer */ + struct timer_list scp_at_timer; + /** debug */ + ktime_t scp_at_checktime; + /** check early replies */ + unsigned scp_at_check; + /** @} */ + + /** + * serialize the following fields, used for processing + * replies for this portal + */ + spinlock_t scp_rep_lock __cfs_cacheline_aligned; + /** all the active replies */ + struct list_head scp_rep_active; + /** List of free reply_states */ + struct list_head scp_rep_idle; + /** waitq to run, when adding stuff to srv_free_rs_list */ + wait_queue_head_t scp_rep_waitq; + /** # 'difficult' replies */ + atomic_t scp_nreps_difficult; +}; + +#define ptlrpc_service_for_each_part(part, i, svc) \ + for (i = 0; \ + i < (svc)->srv_ncpts && \ + (svc)->srv_parts != NULL && \ + ((part) = (svc)->srv_parts[i]) != NULL; i++) + +/** + * Declaration of ptlrpcd control structure + */ +struct ptlrpcd_ctl { + /** + * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE) + */ + unsigned long pc_flags; + /** + * Thread lock protecting structure fields. + */ + spinlock_t pc_lock; + /** + * Start completion. + */ + struct completion pc_starting; + /** + * Stop completion. + */ + struct completion pc_finishing; + /** + * Thread requests set. + */ + struct ptlrpc_request_set *pc_set; + /** + * Thread name used in kthread_run() + */ + char pc_name[16]; + /** + * CPT the thread is bound on. + */ + int pc_cpt; + /** + * Index of ptlrpcd thread in the array. 
+ */ + int pc_index; + /** + * Pointer to the array of partners' ptlrpcd_ctl structure. + */ + struct ptlrpcd_ctl **pc_partners; + /** + * Number of the ptlrpcd's partners. + */ + int pc_npartners; + /** + * Record the partner index to be processed next. + */ + int pc_cursor; + /** + * Error code if the thread failed to fully start. + */ + int pc_error; +}; + +/* Bits for pc_flags */ +enum ptlrpcd_ctl_flags { + /** + * Ptlrpc thread start flag. + */ + LIOD_START = BIT(0), + /** + * Ptlrpc thread stop flag. + */ + LIOD_STOP = BIT(1), + /** + * Ptlrpc thread force flag (only stop force so far). + * This will cause aborting any inflight rpcs handled + * by thread if LIOD_STOP is specified. + */ + LIOD_FORCE = BIT(2), + /** + * This is a recovery ptlrpc thread. + */ + LIOD_RECOVERY = BIT(3), +}; + +/** + * \addtogroup nrs + * @{ + * + * Service compatibility function; the policy is compatible with all services. + * + * \param[in] svc The service the policy is attempting to register with. + * \param[in] desc The policy descriptor + * + * \retval true The policy is compatible with the service + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return true; +} + +/** + * Service compatibility function; the policy is compatible with only a specific + * service which is identified by its human-readable name at + * ptlrpc_service::srv_name. + * + * \param[in] svc The service the policy is attempting to register with. + * \param[in] desc The policy descriptor + * + * \retval false The policy is not compatible with the service + * \retval true The policy is compatible with the service + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + LASSERT(desc->pd_compat_svc_name != NULL); + return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0; +} + +/** @} nrs */ + +/* ptlrpc/events.c */ +extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, + struct lnet_process_id *peer, lnet_nid_t *self); +/** + * These callbacks are invoked by LNet when something happened to + * underlying buffer + * @{ + */ +extern void request_out_callback(struct lnet_event *ev); +extern void reply_in_callback(struct lnet_event *ev); +extern void client_bulk_callback(struct lnet_event *ev); +extern void request_in_callback(struct lnet_event *ev); +extern void reply_out_callback(struct lnet_event *ev); +#ifdef HAVE_SERVER_SUPPORT +extern void server_bulk_callback(struct lnet_event *ev); +#endif +/** @} */ + +/* ptlrpc/connection.c */ +struct ptlrpc_connection *ptlrpc_connection_get(struct lnet_process_id peer, + lnet_nid_t self, + struct obd_uuid *uuid); + +static inline void ptlrpc_connection_put(struct ptlrpc_connection *conn) +{ + if (!conn) + return; + + LASSERT(atomic_read(&conn->c_refcount) > 0); + + /* + * We do not remove connection from hashtable and + * do not free it even if last caller released ref, + * as we want to have it cached for the case it is + * needed again. + * + * Deallocating it and later creating new connection + * again would be wastful. This way we also avoid + * expensive locking to protect things from get/put + * race when found cached connection is freed by + * ptlrpc_connection_put(). + * + * It will be freed later in module unload time, + * when ptlrpc_connection_fini()->lh_exit->conn_exit() + * path is called. 
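+ *
+ * The intended usage is a plain get/put pairing, e.g. (illustrative
+ * sketch; peer, self and uuid are supplied by the caller):
+ *
+ *        conn = ptlrpc_connection_get(peer, self, uuid);
+ *        if (conn == NULL)
+ *                return -ENOMEM;
+ *        ... use conn ...
+ *        ptlrpc_connection_put(conn);
+ *
+ * where the final put only drops the refcount and the cached entry is
+ * reclaimed at module unload as described above.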
+ */ + atomic_dec(&conn->c_refcount); + + CDEBUG(D_INFO, "PUT conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nidstr(&conn->c_peer.nid)); +} + +struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *); +int ptlrpc_connection_init(void); +void ptlrpc_connection_fini(void); +extern lnet_pid_t ptl_get_pid(void); + +/* + * Check if the peer connection is on the local node. We need to use GFP_NOFS + * for requests from a local client to avoid recursing into the filesystem + * as we might end up waiting on a page sent in the request we're serving. + * + * Use __GFP_HIGHMEM so that the pages can use all of the available memory + * on 32-bit machines. Use more aggressive GFP_HIGHUSER flags from non-local + * clients to be able to generate more memory pressure on the OSS and allow + * inactive pages to be reclaimed, since it doesn't have any other processes + * or allocations that generate memory reclaim pressure. + * + * See b=17576 (bdf50dc9) and b=19529 (3dcf18d3) for details. + */ +static inline bool ptlrpc_connection_is_local(struct ptlrpc_connection *conn) +{ + if (!conn) + return false; + + if (nid_same(&conn->c_peer.nid, &conn->c_self)) + return true; + + RETURN(LNetIsPeerLocal(lnet_nid_to_nid4(&conn->c_peer.nid))); +} + +/* ptlrpc/niobuf.c */ +/** + * Actual interfacing with LNet to put/get/register/unregister stuff + * @{ + */ +#ifdef HAVE_SERVER_SUPPORT +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req, + unsigned nfrags, unsigned max_brw, + unsigned int type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops); +int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc); +void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc); + +static inline int ptlrpc_server_bulk_active(struct ptlrpc_bulk_desc *desc) +{ + int rc; + + LASSERT(desc != NULL); + + spin_lock(&desc->bd_lock); + rc = desc->bd_refs; + spin_unlock(&desc->bd_lock); + return rc; +} +#endif + +int ptlrpc_register_bulk(struct ptlrpc_request *req); +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async); + +static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc; + int rc; + + LASSERT(req != NULL); + desc = req->rq_bulk; + + if (!desc) + return 0; + + if (req->rq_bulk_deadline > ktime_get_real_seconds()) + return 1; + + + spin_lock(&desc->bd_lock); + rc = desc->bd_refs; + spin_unlock(&desc->bd_lock); + return rc; +} + +#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01 +#define PTLRPC_REPLY_EARLY 0x02 +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags); +int ptlrpc_reply(struct ptlrpc_request *req); +int ptlrpc_send_error(struct ptlrpc_request *req, int difficult); +int ptlrpc_error(struct ptlrpc_request *req); +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req); +int ptl_send_rpc(struct ptlrpc_request *request, int noreply); +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd); +/** @} */ + +/* ptlrpc/client.c */ +/** + * Client-side portals API. Everything to send requests, receive replies, + * request queues, request management, etc. 
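+ *
+ * The simplest synchronous call looks roughly like this (illustrative
+ * sketch; imp, the RQF_* request format and the opcode are placeholders):
+ *
+ *        req = ptlrpc_request_alloc_pack(imp, &RQF_SOME_FORMAT,
+ *                                        LUSTRE_MDS_VERSION, SOME_OPC);
+ *        if (req == NULL)
+ *                return -ENOMEM;
+ *        ... fill in the request body via the rq_pill capsule ...
+ *        rc = ptlrpc_queue_wait(req);
+ *        ptlrpc_req_finished(req);
+ *
+ * Asynchronous callers attach the request to a set (or to the ptlrpcd
+ * set) and rely on rq_interpret_reply for completion handling.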
+ * @{ + */ +void ptlrpc_request_committed(struct ptlrpc_request *req, int force); + +void ptlrpc_init_client(int req_portal, int rep_portal, const char *name, + struct ptlrpc_client *); +void ptlrpc_cleanup_client(struct obd_import *imp); +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid, + lnet_nid_t nid4refnet); + +int ptlrpc_queue_wait(struct ptlrpc_request *req); +int ptlrpc_replay_req(struct ptlrpc_request *req); +void ptlrpc_restart_req(struct ptlrpc_request *req); +void ptlrpc_abort_inflight(struct obd_import *imp); +void ptlrpc_cleanup_imp(struct obd_import *imp); +void ptlrpc_abort_set(struct ptlrpc_request_set *set); + +struct ptlrpc_request_set *ptlrpc_prep_set(void); +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg); +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set); +int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *); +void ptlrpc_set_destroy(struct ptlrpc_request_set *); +void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); +#define PTLRPCD_SET ((struct ptlrpc_request_set *)1) + +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool); +int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); + +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int, int, + int (*populate_pool)(struct ptlrpc_request_pool *, int)); + +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req); +struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, + const struct req_format *format); +struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, + struct ptlrpc_request_pool *, + const struct req_format *format); +void ptlrpc_request_free(struct ptlrpc_request *request); +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode); +struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, + const struct req_format *format, + __u32 version, int opcode); +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx); +void ptlrpc_req_finished(struct ptlrpc_request *request); +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request); +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req); +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + unsigned nfrags, unsigned max_brw, + unsigned int type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops); + +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len, + int pin); + +void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk); + +static inline void ptlrpc_release_bulk_noop(struct ptlrpc_bulk_desc *desc) +{ +} + +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp); +__u64 ptlrpc_next_xid(void); +__u64 ptlrpc_sample_next_xid(void); +__u64 ptlrpc_req_xid(struct ptlrpc_request *request); +void ptlrpc_get_mod_rpc_slot(struct ptlrpc_request *req); +void ptlrpc_put_mod_rpc_slot(struct ptlrpc_request *req); + +/* Set of routines to run a function in ptlrpcd context */ +void *ptlrpcd_alloc_work(struct obd_import *imp, + int (*cb)(const struct lu_env *, void *), void *data); +void ptlrpcd_destroy_work(void *handler); +int ptlrpcd_queue_work(void *handler); + +/** @} */ +struct ptlrpc_service_buf_conf { + /* nbufs is buffers # to allocate when growing the pool */ + unsigned int bc_nbufs; + /* buffer size to 
post */
+ unsigned int bc_buf_size;
+ /* portal to listen for requests on */
+ unsigned int bc_req_portal;
+ /* portal to send replies to */
+ unsigned int bc_rep_portal;
+ /* maximum request size to be accepted for this service */
+ unsigned int bc_req_max_size;
+ /* maximum reply size this service can ever send */
+ unsigned int bc_rep_max_size;
+};
+
+struct ptlrpc_service_thr_conf {
+ /* threadname should be 8 characters or less - 6 will be added on */
+ char *tc_thr_name;
+ /* threads increasing factor for each CPU */
+ unsigned int tc_thr_factor;
+ /* service threads # to start on each partition while initializing */
+ unsigned int tc_nthrs_init;
+ /*
+ * lower bound ("low water") of the per-partition upper limit on
+ * threads # while running; service availability may be impacted if
+ * the threads number is lower than this value. It can be ZERO if the
+ * service doesn't require CPU affinity or there is only one partition.
+ */
+ unsigned int tc_nthrs_base;
+ /* "soft" limit for total threads number */
+ unsigned int tc_nthrs_max;
+ /* user specified threads number; it will be validated against the
+ * other members of this structure. */
+ unsigned int tc_nthrs_user;
+ /* bind service threads to only CPUs in their associated CPT */
+ unsigned int tc_cpu_bind;
+ /* Tags for lu_context associated with service thread */
+ __u32 tc_ctx_tags;
+};
+
+struct ptlrpc_service_cpt_conf {
+ struct cfs_cpt_table *cc_cptable;
+ /* string pattern to describe CPTs for a service */
+ char *cc_pattern;
+ /* whether or not to have per-CPT service partitions */
+ bool cc_affinity;
+};
+
+struct ptlrpc_service_conf {
+ /* service name */
+ char *psc_name;
+ /* soft watchdog timeout multiplier to print stuck service traces */
+ unsigned int psc_watchdog_factor;
+ /* buffer information */
+ struct ptlrpc_service_buf_conf psc_buf;
+ /* thread information */
+ struct ptlrpc_service_thr_conf psc_thr;
+ /* CPU partition information */
+ struct ptlrpc_service_cpt_conf psc_cpt;
+ /* function table */
+ struct ptlrpc_service_ops psc_ops;
+};
+
+/* ptlrpc/service.c */
+/**
+ * Server-side services API.
Register/unregister service, request state + * management, service thread management + * + * @{ + */ +void ptlrpc_save_lock(struct ptlrpc_request *req, struct lustre_handle *lock, + int mode, bool no_ack, bool convert_lock); +void ptlrpc_commit_replies(struct obd_export *exp); +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs); +void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); +int ptlrpc_hpreq_handler(struct ptlrpc_request *req); +struct ptlrpc_service *ptlrpc_register_service( + struct ptlrpc_service_conf *conf, + struct kset *parent, + struct dentry *debugfs_entry); + +int ptlrpc_unregister_service(struct ptlrpc_service *service); +int ptlrpc_service_health_check(struct ptlrpc_service *); +void ptlrpc_server_drop_request(struct ptlrpc_request *req); +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export); +void ptlrpc_update_export_timer(struct obd_export *exp, + time64_t extra_delay); + +int ptlrpc_hr_init(void); +void ptlrpc_hr_fini(void); + +void ptlrpc_watchdog_init(struct delayed_work *work, timeout_t timeout); +void ptlrpc_watchdog_disable(struct delayed_work *work); +void ptlrpc_watchdog_touch(struct delayed_work *work, timeout_t timeout); + +/** @} */ + +/* ptlrpc/import.c */ +/** + * Import API + * @{ + */ +int ptlrpc_connect_import(struct obd_import *imp); +int ptlrpc_connect_import_locked(struct obd_import *imp); +int ptlrpc_init_import(struct obd_import *imp); +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); +int ptlrpc_disconnect_and_idle_import(struct obd_import *imp); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); +void deuuidify(char *uuid, const char *prefix, char **uuid_start, + int *uuid_len); +void ptlrpc_import_enter_resend(struct obd_import *imp); +/* ptlrpc/pack_generic.c */ +int ptlrpc_reconnect_import(struct obd_import *imp); +/** @} */ + +/** + * ptlrpc msg buffer and swab interface + * + * @{ + */ +#define PTLRPC_MAX_BUFCOUNT \ + (sizeof(((struct ptlrpc_request *)0)->rq_req_swab_mask) * 8) +#define MD_MAX_BUFLEN (MDS_REG_MAXREQSIZE > OUT_MAXREQSIZE ? \ + MDS_REG_MAXREQSIZE : OUT_MAXREQSIZE) +#define PTLRPC_MAX_BUFLEN (OST_IO_MAXREQSIZE > MD_MAX_BUFLEN ? 
\ + OST_IO_MAXREQSIZE : MD_MAX_BUFLEN) +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len); +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len); + +int lustre_msg_check_version(struct lustre_msg *msg, __u32 version); +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs); +int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count, + __u32 *lens, char **bufs); +int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs); +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags); +#define LPRFL_EARLY_REPLY 1 +int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs, int flags); +int lustre_shrink_msg(struct lustre_msg *msg, int segment, + unsigned int newlen, int move_data); +int lustre_grow_msg(struct lustre_msg *msg, int segment, unsigned int newlen); +void lustre_free_reply_state(struct ptlrpc_reply_state *rs); +int __lustre_unpack_msg(struct lustre_msg *m, int len); +__u32 lustre_msg_hdr_size(__u32 magic, __u32 count); +__u32 lustre_msg_size(__u32 magic, int count, __u32 *lengths); +__u32 lustre_msg_size_v2(int count, __u32 *lengths); +__u32 lustre_packed_msg_size(struct lustre_msg *msg); +extern __u32 lustre_msg_early_size; +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, __u32 n, __u32 min_size); +void *lustre_msg_buf(struct lustre_msg *m, __u32 n, __u32 minlen); +__u32 lustre_msg_buflen(struct lustre_msg *m, __u32 n); +void lustre_msg_set_buflen(struct lustre_msg *m, __u32 n, __u32 len); +__u32 lustre_msg_bufcount(struct lustre_msg *m); +char *lustre_msg_string(struct lustre_msg *m, __u32 n, __u32 max_len); +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg); +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags); +__u32 lustre_msg_get_flags(struct lustre_msg *msg); +void lustre_msg_add_flags(struct lustre_msg *msg, __u32 flags); +void lustre_msg_set_flags(struct lustre_msg *msg, __u32 flags); +void lustre_msg_clear_flags(struct lustre_msg *msg, __u32 flags); +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg); +void lustre_msg_add_op_flags(struct lustre_msg *msg, __u32 flags); +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg); +__u32 lustre_msg_get_type(struct lustre_msg *msg); +enum lustre_msg_version lustre_msg_get_version(struct lustre_msg *msg); +void lustre_msg_add_version(struct lustre_msg *msg, __u32 version); +__u32 lustre_msg_get_opc(struct lustre_msg *msg); +__u64 lustre_msg_get_last_xid(struct lustre_msg *msg); +__u16 lustre_msg_get_tag(struct lustre_msg *msg); +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg); +__u64 *lustre_msg_get_versions(struct lustre_msg *msg); +__u64 lustre_msg_get_transno(struct lustre_msg *msg); +__u64 lustre_msg_get_slv(struct lustre_msg *msg); +__u32 lustre_msg_get_limit(struct lustre_msg *msg); +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv); +void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit); +int lustre_msg_get_status(struct lustre_msg *msg); +__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg); +__u32 lustre_msg_get_magic(struct lustre_msg *msg); +timeout_t lustre_msg_get_timeout(struct lustre_msg *msg); +timeout_t lustre_msg_get_service_timeout(struct lustre_msg *msg); +char *lustre_msg_get_jobid(struct lustre_msg *msg); +__u32 lustre_msg_get_cksum(struct lustre_msg *msg); +__u64 lustre_msg_get_mbits(struct lustre_msg *msg); +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, __u32 buf); 
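[Editor's illustration - not part of the patch] The client-side helpers declared above combine into a fairly fixed RPC life cycle: allocate-and-pack a request against an import, set the expected reply length, send it synchronously, and then read fields out of the reply message with the lustre_msg_get_*() accessors. The sketch below only uses functions and fields declared in this header; the request format and opcode constants (RQF_OBD_PING, LUSTRE_OBD_VERSION, OBD_PING) are assumed from the wider Lustre tree and are not defined by this patch.

/*
 * Editor's sketch, not part of the patch: typical synchronous client RPC
 * flow using the helpers declared in this header. RQF_OBD_PING,
 * LUSTRE_OBD_VERSION and OBD_PING are assumed external definitions.
 */
static int example_sync_ping(struct obd_import *imp)
{
	struct ptlrpc_request *req;
	int rc;

	/* Allocate a request on the import and pack its message buffers. */
	req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
					LUSTRE_OBD_VERSION, OBD_PING);
	if (req == NULL)
		return -ENOMEM;

	/* Size the expected reply from the request format. */
	ptlrpc_request_set_replen(req);

	/* Send the request and block until the reply arrives (or fails). */
	rc = ptlrpc_queue_wait(req);
	if (rc == 0 && req->rq_repmsg != NULL)
		/*
		 * Purely to illustrate the lustre_msg_get_*() accessors:
		 * read the server-reported status from the reply header and
		 * convert it from wire to host errno representation.
		 */
		rc = ptlrpc_status_ntoh(lustre_msg_get_status(req->rq_repmsg));

	/* Drop our reference; the request is freed once unused. */
	ptlrpc_req_finished(req);
	return rc;
}

For asynchronous dispatch the same request would instead be queued on a request set (ptlrpc_set_add_req()/ptlrpc_set_wait()) or handed to the ptlrpcd threads with ptlrpcd_add_req(), all of which are declared in this header.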
+void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle); +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type); +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc); +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid); +void lustre_msg_set_tag(struct lustre_msg *msg, __u16 tag); +void lustre_msg_set_last_committed(struct lustre_msg *msg,__u64 last_committed); +void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions); +void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno); +void lustre_msg_set_status(struct lustre_msg *msg, __u32 status); +void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt); +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes); +void ptlrpc_request_set_replen(struct ptlrpc_request *req); +void lustre_msg_set_timeout(struct lustre_msg *msg, timeout_t timeout); +void lustre_msg_set_service_timeout(struct lustre_msg *msg, + timeout_t service_timeout); +void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid); +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum); +void lustre_msg_set_mbits(struct lustre_msg *msg, __u64 mbits); + +static inline void +lustre_shrink_reply(struct ptlrpc_request *req, int segment, + unsigned int newlen, int move_data) +{ + LASSERT(req->rq_reply_state); + LASSERT(req->rq_repmsg); + req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment, + newlen, move_data); +} + +#ifdef LUSTRE_TRANSLATE_ERRNOS + +static inline int ptlrpc_status_hton(int h) +{ + /* + * Positive errnos must be network errnos, such as LUSTRE_EDEADLK, + * ELDLM_LOCK_ABORTED, etc. + */ + if (h < 0) + return -lustre_errno_hton(-h); + else + return h; +} + +static inline int ptlrpc_status_ntoh(int n) +{ + /* + * See the comment in ptlrpc_status_hton(). 
+ */ + if (n < 0) + return -lustre_errno_ntoh(-n); + else + return n; +} + +#else + +#define ptlrpc_status_hton(h) (h) +#define ptlrpc_status_ntoh(n) (n) + +#endif +/** @} */ + +/** Change request phase of \a req to \a new_phase */ +static inline void +ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase) +{ + if (req->rq_phase == new_phase) + return; + + if (new_phase == RQ_PHASE_UNREG_RPC || + new_phase == RQ_PHASE_UNREG_BULK) { + /* No embedded unregistering phases */ + if (req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK) + return; + + req->rq_next_phase = req->rq_phase; + if (req->rq_import) + atomic_inc(&req->rq_import->imp_unregistering); + } + + if (req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK) { + if (req->rq_import) + atomic_dec(&req->rq_import->imp_unregistering); + } + + DEBUG_REQ(D_INFO, req, "move request phase from %s to %s", + ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase)); + + req->rq_phase = new_phase; +} + +/** + * Returns true if request \a req got early reply and hard deadline is not met + */ +static inline int +ptlrpc_client_early(struct ptlrpc_request *req) +{ + return req->rq_early; +} + +/** + * Returns true if we got real reply from server for this request + */ +static inline int +ptlrpc_client_replied(struct ptlrpc_request *req) +{ + if (req->rq_reply_deadline > ktime_get_real_seconds()) + return 0; + return req->rq_replied; +} + +/** Returns true if request \a req is in process of receiving server reply */ +static inline int +ptlrpc_client_recv(struct ptlrpc_request *req) +{ + if (req->rq_reply_deadline > ktime_get_real_seconds()) + return 1; + return req->rq_receiving_reply; +} + +#define ptlrpc_cli_wait_unlink(req) __ptlrpc_cli_wait_unlink(req, NULL) + +static inline int +__ptlrpc_cli_wait_unlink(struct ptlrpc_request *req, bool *discard) +{ + int rc; + + spin_lock(&req->rq_lock); + if (req->rq_reply_deadline > ktime_get_real_seconds()) { + spin_unlock(&req->rq_lock); + return 1; + } + if (req->rq_req_deadline > ktime_get_real_seconds()) { + spin_unlock(&req->rq_lock); + return 1; + } + + if (discard) { + *discard = false; + if (req->rq_reply_unlinked && req->rq_req_unlinked == 0) { + *discard = true; + spin_unlock(&req->rq_lock); + return 1; /* Should call again after LNetMDUnlink */ + } + } + + rc = !req->rq_req_unlinked || !req->rq_reply_unlinked || + req->rq_receiving_reply; + spin_unlock(&req->rq_lock); + return rc; +} + +static inline void +ptlrpc_client_wake_req(struct ptlrpc_request *req) +{ + smp_mb(); + if (req->rq_set == NULL) + wake_up(&req->rq_reply_waitq); + else + wake_up(&req->rq_set->set_waitq); +} + +static inline void +ptlrpc_rs_addref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + atomic_inc(&rs->rs_refcount); +} + +static inline void +ptlrpc_rs_decref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + if (atomic_dec_and_test(&rs->rs_refcount)) + lustre_free_reply_state(rs); +} + +/* Should only be called once per req */ +static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req) +{ + if (req->rq_reply_state == NULL) + return; /* shouldn't occur */ + + /* req_repmsg equals rq_reply_state->rs_msg, + * so set it to NULL before rq_reply_state is possibly freed + */ + spin_lock(&req->rq_early_free_lock); + req->rq_repmsg = NULL; + spin_unlock(&req->rq_early_free_lock); + + ptlrpc_rs_decref(req->rq_reply_state); + req->rq_reply_state = NULL; +} + +static inline __u32 
lustre_request_magic(struct ptlrpc_request *req) +{ + return lustre_msg_get_magic(req->rq_reqmsg); +} + +static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return req->rq_reqmsg->lm_repsize; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + req->rq_reqmsg->lm_magic); + return -EFAULT; + } +} + +static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req) +{ + if (req->rq_delay_limit != 0 && + req->rq_queued_time + req->rq_delay_limit < ktime_get_seconds()) + return 1; + return 0; +} + +static inline int ptlrpc_no_resend(struct ptlrpc_request *req) +{ + if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) { + spin_lock(&req->rq_lock); + req->rq_no_resend = 1; + spin_unlock(&req->rq_lock); + } + return req->rq_no_resend; +} + +static inline int +ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt) +{ + int at = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate); + + return svcpt->scp_service->srv_watchdog_factor * + max_t(int, at, obd_timeout); +} + +/** + * Calculate the amount of time for lock prolongation. + * + * This is helper function to get the timeout extra time. + * + * @req current request + * + * Return: amount of time to extend the timeout with + */ +static inline timeout_t prolong_timeout(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + timeout_t req_timeout = 0; + + if (AT_OFF) + return obd_timeout / 2; + + if (req->rq_deadline > req->rq_arrival_time.tv_sec) + req_timeout = req->rq_deadline - req->rq_arrival_time.tv_sec; + + return max(req_timeout, + at_est2timeout(at_get(&svcpt->scp_at_estimate))); +} + +static inline struct ptlrpc_service * +ptlrpc_req2svc(struct ptlrpc_request *req) +{ + LASSERT(req->rq_rqbd != NULL); + return req->rq_rqbd->rqbd_svcpt->scp_service; +} + +/* ldlm/ldlm_lib.c */ +/** + * Target client logic + * @{ + */ +int client_obd_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int client_obd_cleanup(struct obd_device *obd); +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *, + void *localdata); +int client_disconnect_export(struct obd_export *exp); +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority); +int client_import_dyn_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + lnet_nid_t prim_nid, int priority); +int client_import_add_nids_to_conn(struct obd_import *imp, lnet_nid_t *nids, + int nid_count, struct obd_uuid *uuid); +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid); +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid); +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid); +void client_destroy_import(struct obd_import *imp); +/** @} */ + +#ifdef HAVE_SERVER_SUPPORT +int server_disconnect_export(struct obd_export *exp); +#endif + +/* ptlrpc/pinger.c */ +/** + * Pinger API (client side only) + * @{ + */ +enum timeout_event { + TIMEOUT_GRANT = 1 +}; +struct timeout_item; +typedef int (*timeout_cb_t)(struct timeout_item *, void *); +int ptlrpc_pinger_add_import(struct obd_import *imp); +int ptlrpc_pinger_del_import(struct obd_import *imp); +struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp); +int ptlrpc_obd_ping(struct obd_device *obd); +void ping_evictor_start(void); +void ping_evictor_stop(void); +void 
ptlrpc_pinger_ir_up(void); +void ptlrpc_pinger_ir_down(void); +/** @} */ +int ptlrpc_pinger_suppress_pings(void); + +/* ptlrpc/ptlrpcd.c */ +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force); +void ptlrpcd_free(struct ptlrpcd_ctl *pc); +void ptlrpcd_wake(struct ptlrpc_request *req); +void ptlrpcd_add_req(struct ptlrpc_request *req); +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set); +int ptlrpcd_addref(void); +void ptlrpcd_decref(void); + +/* ptlrpc/lproc_ptlrpc.c */ +/** + * procfs output related functions + * @{ + */ +const char* ll_opcode2str(__u32 opcode); +const int ll_str2opcode(const char *ops); +#ifdef CONFIG_PROC_FS +void ptlrpc_lprocfs_register_obd(struct obd_device *obd); +void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd); +void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes); +#else +static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {} +#endif +/** @} */ + +/* ptlrpc/llog_server.c */ +int llog_origin_handle_open(struct ptlrpc_request *req); +int llog_origin_handle_prev_block(struct ptlrpc_request *req); +int llog_origin_handle_next_block(struct ptlrpc_request *req); +int llog_origin_handle_read_header(struct ptlrpc_request *req); + +/* ptlrpc/llog_client.c */ +extern const struct llog_operations llog_client_ops; +/** @} net */ + +#endif +/** @} PtlRPC */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h b/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h new file mode 100644 index 0000000000000..80f123a8e2277 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h @@ -0,0 +1,241 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, Trustees of Indiana University + * + * Copyright (c) 2017, Intel Corporation. 
+ * + * Author: Joshua Walgenbach + */ + +#ifndef _LUSTRE_NODEMAP_H +#define _LUSTRE_NODEMAP_H + +#include + +#define LUSTRE_NODEMAP_NAME "nodemap" + +#define LUSTRE_NODEMAP_DEFAULT_ID 0 + +/** enums containing the types of ids contained in a nodemap + * kept so other modules (mgs, mdt, etc) can define the type + * of search easily + */ + +enum nodemap_id_type { + NODEMAP_UID, + NODEMAP_GID, + NODEMAP_PROJID, +}; + +enum nodemap_tree_type { + NODEMAP_FS_TO_CLIENT, + NODEMAP_CLIENT_TO_FS, +}; + +enum nodemap_mapping_modes { + NODEMAP_MAP_BOTH_LEGACY = 0x0, /* for compatibility */ + NODEMAP_MAP_UID = 0x01, + NODEMAP_MAP_GID = 0x02, + NODEMAP_MAP_BOTH = 0x03, /* for compatibility */ + NODEMAP_MAP_PROJID = 0x04, + NODEMAP_MAP_ALL = NODEMAP_MAP_UID | + NODEMAP_MAP_GID | + NODEMAP_MAP_PROJID, +}; + +struct nodemap_pde { + char npe_name[LUSTRE_NODEMAP_NAME_LENGTH + 1]; + struct proc_dir_entry *npe_proc_entry; + struct list_head npe_list_member; +}; + +/** The nodemap id 0 will be the default nodemap. It will have a configuration + * set by the MGS, but no ranges will be allowed as all NIDs that do not map + * will be added to the default nodemap + */ + +struct lu_nodemap { + /* human readable ID */ + char nm_name[LUSTRE_NODEMAP_NAME_LENGTH + 1]; + /* flags to govern nodemap behavior */ + bool nmf_trust_client_ids:1, + nmf_deny_unknown:1, + nmf_allow_root_access:1, + nmf_enable_audit:1, + nmf_forbid_encryption:1; + /* bitmap for mapping type */ + enum nodemap_mapping_modes + nmf_map_mode; + /* unique ID set by MGS */ + unsigned int nm_id; + /* nodemap ref counter */ + atomic_t nm_refcount; + /* UID to squash unmapped UIDs */ + uid_t nm_squash_uid; + /* GID to squash unmapped GIDs */ + gid_t nm_squash_gid; + /* PROJID to squash unmapped PROJIDs */ + projid_t nm_squash_projid; + /* NID range list */ + struct list_head nm_ranges; + /* lock for idmap red/black trees */ + struct rw_semaphore nm_idmap_lock; + /* UID map keyed by local UID */ + struct rb_root nm_fs_to_client_uidmap; + /* UID map keyed by remote UID */ + struct rb_root nm_client_to_fs_uidmap; + /* GID map keyed by local UID */ + struct rb_root nm_fs_to_client_gidmap; + /* GID map keyed by remote UID */ + struct rb_root nm_client_to_fs_gidmap; + /* PROJID map keyed by local UID */ + struct rb_root nm_fs_to_client_projidmap; + /* PROJID map keyed by remote UID */ + struct rb_root nm_client_to_fs_projidmap; + /* attached client members of this nodemap */ + struct mutex nm_member_list_lock; + struct list_head nm_member_list; + /* access by nodemap name */ + struct hlist_node nm_hash; + struct nodemap_pde *nm_pde_data; + /* fileset the nodes of this nodemap are restricted to */ + char nm_fileset[PATH_MAX+1]; + /* information about the expected SELinux policy on the nodes */ + char nm_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + 1]; + + /* used when loading/unloading nodemaps */ + struct list_head nm_list; +}; + +/* Store handles to local MGC storage to save config locally. In future + * versions of nodemap, mgc will receive the config directly and so this might + * not be needed. 
+ */ +struct nm_config_file { + struct local_oid_storage *ncf_los; + struct dt_object *ncf_obj; + struct list_head ncf_list; +}; + +void nodemap_activate(const bool value); +int nodemap_add(const char *nodemap_name); +int nodemap_del(const char *nodemap_name); +int nodemap_add_member(lnet_nid_t nid, struct obd_export *exp); +void nodemap_del_member(struct obd_export *exp); +int nodemap_parse_range(const char *range_string, lnet_nid_t range[2]); +int nodemap_parse_idmap(char *idmap_string, __u32 idmap[2]); +int nodemap_add_range(const char *name, const lnet_nid_t nid[2]); +int nodemap_del_range(const char *name, const lnet_nid_t nid[2]); +int nodemap_set_allow_root(const char *name, bool allow_root); +int nodemap_set_trust_client_ids(const char *name, bool trust_client_ids); +int nodemap_set_deny_unknown(const char *name, bool deny_unknown); +int nodemap_set_mapping_mode(const char *name, + enum nodemap_mapping_modes map_mode); +int nodemap_set_squash_uid(const char *name, uid_t uid); +int nodemap_set_squash_gid(const char *name, gid_t gid); +int nodemap_set_squash_projid(const char *name, projid_t projid); +int nodemap_set_audit_mode(const char *name, bool enable_audit); +int nodemap_set_forbid_encryption(const char *name, bool forbid_encryption); +bool nodemap_can_setquota(struct lu_nodemap *nodemap, __u32 qc_type, __u32 id); +int nodemap_add_idmap(const char *name, enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_del_idmap(const char *name, enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_set_fileset(const char *name, const char *fileset); +char *nodemap_get_fileset(const struct lu_nodemap *nodemap); +int nodemap_set_sepol(const char *name, const char *sepol); +const char *nodemap_get_sepol(const struct lu_nodemap *nodemap); +__u32 nodemap_map_id(struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + enum nodemap_tree_type tree_type, __u32 id); +ssize_t nodemap_map_acl(struct lu_nodemap *nodemap, void *buf, size_t size, + enum nodemap_tree_type tree_type); +#ifdef HAVE_SERVER_SUPPORT +void nodemap_test_nid(lnet_nid_t nid, char *name_buf, size_t name_len); +#else +#define nodemap_test_nid(nid, name_buf, name_len) do {} while(0) +#endif +int nodemap_test_id(lnet_nid_t nid, enum nodemap_id_type idtype, + __u32 client_id, __u32 *fs_id); + +struct nm_config_file *nm_config_file_register_mgs(const struct lu_env *env, + struct dt_object *obj, + struct local_oid_storage *los); +struct dt_device; +struct nm_config_file *nm_config_file_register_tgt(const struct lu_env *env, + struct dt_device *dev, + struct local_oid_storage *los); +void nm_config_file_deregister_mgs(const struct lu_env *env, + struct nm_config_file *ncf); +void nm_config_file_deregister_tgt(const struct lu_env *env, + struct nm_config_file *ncf); +struct lu_nodemap *nodemap_get_from_exp(struct obd_export *exp); +void nodemap_putref(struct lu_nodemap *nodemap); + +#ifdef HAVE_SERVER_SUPPORT + +struct nodemap_range_tree { + struct interval_tree_root nmrt_range_interval_root; + unsigned int nmrt_range_highest_id; +}; + +struct nodemap_config { + /* Highest numerical lu_nodemap.nm_id defined */ + unsigned int nmc_nodemap_highest_id; + + /* Simple flag to determine if nodemaps are active */ + bool nmc_nodemap_is_active; + + /* Pointer to default nodemap as it is needed more often */ + struct lu_nodemap *nmc_default_nodemap; + + /** + * Lock required to access the range tree. 
+ */ + struct rw_semaphore nmc_range_tree_lock; + struct nodemap_range_tree nmc_range_tree; + + /** + * Hash keyed on nodemap name containing all + * nodemaps + */ + struct cfs_hash *nmc_nodemap_hash; +}; + +struct nodemap_config *nodemap_config_alloc(void); +void nodemap_config_dealloc(struct nodemap_config *config); +void nodemap_config_set_active_mgc(struct nodemap_config *config); + +int nodemap_process_idx_pages(struct nodemap_config *config, union lu_page *lip, + struct lu_nodemap **recent_nodemap); + +#else /* disable nodemap processing in MGC of non-servers */ +static inline int nodemap_process_idx_pages(void *config, + union lu_page *lip, + struct lu_nodemap **recent_nodemap) +{ return 0; } +#endif /* HAVE_SERVER_SUPPORT */ + +int nodemap_get_config_req(struct obd_device *mgs_obd, + struct ptlrpc_request *req); +#endif /* _LUSTRE_NODEMAP_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs.h new file mode 100644 index 0000000000000..80f9f6b4a2b3a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs.h @@ -0,0 +1,752 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) + * + */ + +#ifndef _LUSTRE_NRS_H +#define _LUSTRE_NRS_H + +/** + * \defgroup nrs Network Request Scheduler + * @{ + */ +struct ptlrpc_nrs_policy; +struct ptlrpc_nrs_resource; +struct ptlrpc_nrs_request; + +/** + * NRS control operations. + * + * These are common for all policies. + */ +enum ptlrpc_nrs_ctl { + /** + * Not a valid opcode. + */ + PTLRPC_NRS_CTL_INVALID, + /** + * Activate the policy. + */ + PTLRPC_NRS_CTL_START, + /** + * Reserved for multiple primary policies, which may be a possibility + * in the future. + */ + PTLRPC_NRS_CTL_STOP, + /** + * Policies can start using opcodes from this value and onwards for + * their own purposes; the assigned value itself is arbitrary. + */ + PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20, +}; + +/** + * NRS policy operations. + * + * These determine the behaviour of a policy, and are called in response to + * NRS core events. + */ +struct ptlrpc_nrs_pol_ops { + /** + * Called during policy registration; this operation is optional. + * + * \param[in,out] policy The policy being initialized + */ + int (*op_policy_init) (struct ptlrpc_nrs_policy *policy); + /** + * Called during policy unregistration; this operation is optional. + * + * \param[in,out] policy The policy being unregistered/finalized + */ + void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy); + /** + * Called when activating a policy via lprocfs; policies allocate and + * initialize their resources here; this operation is optional. 
+ * + * \param[in,out] policy The policy being started + * \param[in,out] arg A generic char buffer + * + * \see nrs_policy_start_locked() + */ + int (*op_policy_start) (struct ptlrpc_nrs_policy *policy, + char *arg); + /** + * Called when deactivating a policy via lprocfs; policies deallocate + * their resources here; this operation is optional + * + * \param[in,out] policy The policy being stopped + * + * \see nrs_policy_stop0() + */ + void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy); + /** + * Used for policy-specific operations; i.e. not generic ones like + * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous + * to an ioctl; this operation is optional. + * + * \param[in,out] policy The policy carrying out operation \a opc + * \param[in] opc The command operation being carried out + * \param[in,out] arg An generic buffer for communication between the + * user and the control operation + * + * \retval -ve error + * \retval 0 success + * + * \see ptlrpc_nrs_policy_control() + */ + int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg); + + /** + * Called when obtaining references to the resources of the resource + * hierarchy for a request that has arrived for handling at the PTLRPC + * service. Policies should return -ve for requests they do not wish + * to handle. This operation is mandatory. + * + * \param[in,out] policy The policy we're getting resources for. + * \param[in,out] nrq The request we are getting resources for. + * \param[in] parent The parent resource of the resource being + * requested; set to NULL if none. + * \param[out] resp The resource is to be returned here; the + * fallback policy in an NRS head should + * \e always return a non-NULL pointer value. + * \param[in] moving_req When set, signifies that this is an attempt + * to obtain resources for a request being moved + * to the high-priority NRS head by + * ldlm_lock_reorder_req(). + * This implies two things: + * 1. We are under obd_export::exp_rpc_lock and + * so should not sleep. + * 2. We should not perform non-idempotent or can + * skip performing idempotent operations that + * were carried out when resources were first + * taken for the request when it was initialized + * in ptlrpc_nrs_req_initialize(). + * + * \retval 0, +ve The level of the returned resource in the resource + * hierarchy; currently only 0 (for a non-leaf resource) + * and 1 (for a leaf resource) are supported by the + * framework. + * \retval -ve error + * + * \see ptlrpc_nrs_req_initialize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + int (*op_res_get) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, + bool moving_req); + /** + * Called when releasing references taken for resources in the resource + * hierarchy for the request; this operation is optional. + * + * \param[in,out] policy The policy the resource belongs to + * \param[in] res The resource to be freed + * + * \see ptlrpc_nrs_req_finalize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + void (*op_res_put) (struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res); + + /** + * Obtains a request for handling from the policy, and optionally + * removes the request from the policy; this operation is mandatory. 
+ * + * \param[in,out] policy The policy to poll + * \param[in] peek When set, signifies that we just want to + * examine the request, and not handle it, so the + * request is not removed from the policy. + * \param[in] force When set, it will force a policy to return a + * request if it has one queued. + * + * \retval NULL No request available for handling + * \retval valid-pointer The request polled for handling + * + * \see ptlrpc_nrs_req_get_nolock() + */ + struct ptlrpc_nrs_request * + (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek, + bool force); + /** + * Called when attempting to add a request to a policy for later + * handling; this operation is mandatory. + * + * \param[in,out] policy The policy on which to enqueue \a nrq + * \param[in,out] nrq The request to enqueue + * + * \retval 0 success + * \retval != 0 error + * + * \see ptlrpc_nrs_req_add_nolock() + */ + int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Removes a request from the policy's set of pending requests. Normally + * called after a request has been polled successfully from the policy + * for handling; this operation is mandatory. + * + * \param[in,out] policy The policy the request \a nrq belongs to + * \param[in,out] nrq The request to dequeue + * + * \see ptlrpc_nrs_req_del_nolock() + */ + void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Called after the request being carried out. Could be used for + * job/resource control; this operation is optional. + * + * \param[in,out] policy The policy which is stopping to handle request + * \a nrq + * \param[in,out] nrq The request + * + * \pre assert_spin_locked(&svcpt->scp_req_lock) + * + * \see ptlrpc_nrs_req_stop_nolock() + */ + void (*op_req_stop) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Registers the policy's lprocfs interface with a PTLRPC service. + * + * \param[in] svc The service + * + * \retval 0 success + * \retval != 0 error + */ + int (*op_lprocfs_init) (struct ptlrpc_service *svc); + /** + * Unegisters the policy's lprocfs interface with a PTLRPC service. + * + * In cases of failed policy registration in + * \e ptlrpc_nrs_policy_register(), this function may be called for a + * service which has not registered the policy successfully, so + * implementations of this method should make sure their operations are + * safe in such cases. + * + * \param[in] svc The service + */ + void (*op_lprocfs_fini) (struct ptlrpc_service *svc); +}; + +/** + * Policy flags + */ +enum nrs_policy_flags { + /** + * Fallback policy, use this flag only on a single supported policy per + * service. The flag cannot be used on policies that use + * \e PTLRPC_NRS_FL_REG_EXTERN + */ + PTLRPC_NRS_FL_FALLBACK = BIT(0), + /** + * Start policy immediately after registering. + */ + PTLRPC_NRS_FL_REG_START = BIT(1), + /** + * This is a policy registering from a module different to the one NRS + * core ships in (currently ptlrpc). + */ + PTLRPC_NRS_FL_REG_EXTERN = BIT(2), +}; + +/** + * NRS queue type. + * + * Denotes whether an NRS instance is for handling normal or high-priority + * RPCs, or whether an operation pertains to one or both of the NRS instances + * in a service. 
+ */ +enum ptlrpc_nrs_queue_type { + PTLRPC_NRS_QUEUE_REG = BIT(0), + PTLRPC_NRS_QUEUE_HP = BIT(1), + PTLRPC_NRS_QUEUE_BOTH = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP) +}; + +/** + * NRS head + * + * A PTLRPC service has at least one NRS head instance for handling normal + * priority RPCs, and may optionally have a second NRS head instance for + * handling high-priority RPCs. Each NRS head maintains a list of available + * policies, of which one and only one policy is acting as the fallback policy, + * and optionally a different policy may be acting as the primary policy. For + * all RPCs handled by this NRS head instance, NRS core will first attempt to + * enqueue the RPC using the primary policy (if any). The fallback policy is + * used in the following cases: + * - when there was no primary policy in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request + * was initialized. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, denoted it did not wish, or for some other reason was + * not able to handle the request, by returning a non-valid NRS resource + * reference. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, fails later during the request enqueueing stage. + * + * \see nrs_resource_get_safe() + * \see nrs_request_enqueue() + */ +struct ptlrpc_nrs { + spinlock_t nrs_lock; + /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */ + /** + * List of registered policies + */ + struct list_head nrs_policy_list; + /** + * List of policies with queued requests. Policies that have any + * outstanding requests are queued here, and this list is queried + * in a round-robin manner from NRS core when obtaining a request + * for handling. This ensures that requests from policies that at some + * point transition away from the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained. + */ + struct list_head nrs_policy_queued; + /** + * Service partition for this NRS head + */ + struct ptlrpc_service_part *nrs_svcpt; + /** + * Primary policy, which is the preferred policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_primary; + /** + * Fallback policy, which is the backup policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_fallback; + /** + * This NRS head handles either HP or regular requests + */ + enum ptlrpc_nrs_queue_type nrs_queue_type; + /** + * # queued requests from all policies in this NRS head + */ + unsigned long nrs_req_queued; + /** + * # scheduled requests from all policies in this NRS head + */ + unsigned long nrs_req_started; + /** + * # policies on this NRS + */ + unsigned nrs_num_pols; + /** + * This NRS head is in progress of starting a policy + */ + unsigned nrs_policy_starting:1; + /** + * In progress of shutting down the whole NRS head; used during + * unregistration + */ + unsigned nrs_stopping:1; + /** + * NRS policy is throttling reqeust + */ + unsigned nrs_throttling:1; +}; + +#define NRS_POL_NAME_MAX 16 +#define NRS_POL_ARG_MAX 16 + +struct ptlrpc_nrs_pol_desc; + +/** + * Service compatibility predicate; this determines whether a policy is adequate + * for handling RPCs of a particular PTLRPC service. 
+ * + * XXX:This should give the same result during policy registration and + * unregistration, and for all partitions of a service; so the result should not + * depend on temporal service or other properties, that may influence the + * result. + */ +typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc); + +struct ptlrpc_nrs_pol_conf { + /** + * Human-readable policy name + */ + char nc_name[NRS_POL_NAME_MAX]; + /** + * NRS operations for this policy + */ + const struct ptlrpc_nrs_pol_ops *nc_ops; + /** + * Service compatibility predicate + */ + nrs_pol_desc_compat_t nc_compat; + /** + * Set for policies that support a single ptlrpc service, i.e. ones that + * have \a pd_compat set to nrs_policy_compat_one(). The variable value + * depicts the name of the single service that such policies are + * compatible with. + */ + const char *nc_compat_svc_name; + /** + * Owner module for this policy descriptor; policies registering from a + * different module to the one the NRS framework is held within + * (currently ptlrpc), should set this field to THIS_MODULE. + */ + struct module *nc_owner; + /** + * Policy registration flags; a bitmast of \e nrs_policy_flags + */ + unsigned nc_flags; +}; + +/** + * NRS policy registering descriptor + * + * Is used to hold a description of a policy that can be passed to NRS core in + * order to register the policy with NRS heads in different PTLRPC services. + */ +struct ptlrpc_nrs_pol_desc { + /** + * Human-readable policy name + */ + char pd_name[NRS_POL_NAME_MAX]; + /** + * Link into nrs_core::nrs_policies + */ + struct list_head pd_list; + /** + * NRS operations for this policy + */ + const struct ptlrpc_nrs_pol_ops *pd_ops; + /** + * Service compatibility predicate + */ + nrs_pol_desc_compat_t pd_compat; + /** + * Set for policies that are compatible with only one PTLRPC service. + * + * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name + */ + const char *pd_compat_svc_name; + /** + * Owner module for this policy descriptor. + * + * We need to hold a reference to the module whenever we might make use + * of any of the module's contents, i.e. + * - If one or more instances of the policy are at a state where they + * might be handling a request, i.e. + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to + * call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference + * is taken on the module when + * \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it + * becomes 0, so that we hold only one reference to the module maximum + * at any time. + * + * We do not need to hold a reference to the module, even though we + * might use code and data from the module, in the following cases: + * - During external policy registration, because this should happen in + * the module's init() function, in which case the module is safe from + * removal because a reference is being held on the module by the + * kernel, and iirc kmod (and I guess module-init-tools also) will + * serialize any racing processes properly anyway. + * - During external policy unregistration, because this should happen + * in a module's exit() function, and any attempts to start a policy + * instance would need to take a reference on the module, and this is + * not possible once we have reached the point where the exit() + * handler is called. 
+ * - During service registration and unregistration, as service setup + * and cleanup, and policy registration, unregistration and policy + * instance starting, are serialized by \e nrs_core::nrs_mutex, so + * as long as users adhere to the convention of registering policies + * in init() and unregistering them in module exit() functions, there + * should not be a race between these operations. + * - During any policy-specific lprocfs operations, because a reference + * is held by the kernel on a proc entry that has been entered by a + * syscall, so as long as proc entries are removed during + * unregistration time, then unregistration and lprocfs operations + * will be properly serialized. + */ + struct module *pd_owner; + /** + * Bitmask of \e nrs_policy_flags + */ + unsigned pd_flags; + /** + * # of references on this descriptor + */ + atomic_t pd_refs; +}; + +/** + * NRS policy state + * + * Policies transition from one state to the other during their lifetime + */ +enum ptlrpc_nrs_pol_state { + /** + * Not a valid policy state. + */ + NRS_POL_STATE_INVALID, + /** + * Policies are at this state either at the start of their life, or + * transition here when the user selects a different policy to act + * as the primary one. + */ + NRS_POL_STATE_STOPPED, + /** + * Policy is progress of stopping + */ + NRS_POL_STATE_STOPPING, + /** + * Policy is in progress of starting + */ + NRS_POL_STATE_STARTING, + /** + * A policy is in this state in two cases: + * - it is the fallback policy, which is always in this state. + * - it has been activated by the user; i.e. it is the primary policy, + */ + NRS_POL_STATE_STARTED, +}; + +/** + * NRS policy information + * + * Used for obtaining information for the status of a policy via lprocfs + */ +struct ptlrpc_nrs_pol_info { + /** + * Policy name + */ + char pi_name[NRS_POL_NAME_MAX]; + /** + * Policy argument + */ + char pi_arg[NRS_POL_ARG_MAX]; + /** + * Current policy state + */ + enum ptlrpc_nrs_pol_state pi_state; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pi_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pi_req_started; + /** + * Is this a fallback policy? + */ + unsigned pi_fallback:1; +}; + +/** + * NRS policy + * + * There is one instance of this for each policy in each NRS head of each + * PTLRPC service partition. + */ +struct ptlrpc_nrs_policy { + /** + * Linkage into the NRS head's list of policies, + * ptlrpc_nrs:nrs_policy_list + */ + struct list_head pol_list; + /** + * Linkage into the NRS head's list of policies with enqueued + * requests ptlrpc_nrs:nrs_policy_queued + */ + struct list_head pol_list_queued; + /** + * Current state of this policy + */ + enum ptlrpc_nrs_pol_state pol_state; + /** + * Bitmask of nrs_policy_flags + */ + unsigned pol_flags; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pol_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pol_req_started; + /** + * Usage Reference count taken on the policy instance + */ + long pol_ref; + /** + * Human-readable policy argument + */ + char pol_arg[NRS_POL_ARG_MAX]; + /** + * The NRS head this policy has been created at + */ + struct ptlrpc_nrs *pol_nrs; + /** + * Private policy data; varies by policy type + */ + void *pol_private; + /** + * Policy descriptor for this policy instance. 
+ */ + struct ptlrpc_nrs_pol_desc *pol_desc; +}; + +/** + * NRS resource + * + * Resources are embedded into two types of NRS entities: + * - Inside NRS policies, in the policy's private data in + * ptlrpc_nrs_policy::pol_private + * - In objects that act as prime-level scheduling entities in different NRS + * policies; e.g. on a policy that performs round robin or similar order + * scheduling across client NIDs, there would be one NRS resource per unique + * client NID. On a policy which performs round robin scheduling across + * backend filesystem objects, there would be one resource associated with + * each of the backend filesystem objects partaking in the scheduling + * performed by the policy. + * + * NRS resources share a parent-child relationship, in which resources embedded + * in policy instances are the parent entities, with all scheduling entities + * a policy schedules across being the children, thus forming a simple resource + * hierarchy. This hierarchy may be extended with one or more levels in the + * future if the ability to have more than one primary policy is added. + * + * Upon request initialization, references to the then active NRS policies are + * taken and used to later handle the dispatching of the request with one of + * these policies. + * + * \see nrs_resource_get_safe() + * \see ptlrpc_nrs_req_add() + */ +struct ptlrpc_nrs_resource { + /** + * This NRS resource's parent; is NULL for resources embedded in NRS + * policy instances; i.e. those are top-level ones. + */ + struct ptlrpc_nrs_resource *res_parent; + /** + * The policy associated with this resource. + */ + struct ptlrpc_nrs_policy *res_policy; +}; + +enum { + NRS_RES_FALLBACK, + NRS_RES_PRIMARY, + NRS_RES_MAX +}; + +#include +/** + * Binary heap node. + * + * Objects of this type are embedded into objects of the ordered set that is to + * be maintained by a \e struct binheap instance. + */ +struct binheap_node { + /** Index into the binary tree */ + unsigned int chn_index; +}; +#ifdef HAVE_SERVER_SUPPORT +#include +#include +#include +#endif /* HAVE_SERVER_SUPPORT */ +#include + +/** + * NRS request + * + * Instances of this object exist embedded within ptlrpc_request; the main + * purpose of this object is to hold references to the request's resources + * for the lifetime of the request, and to hold properties that policies use + * use for determining the request's scheduling priority. + * */ +struct ptlrpc_nrs_request { + /** + * The request's resource hierarchy. + */ + struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX]; + /** + * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the + * policy that was used to enqueue the request. + * + * \see nrs_request_enqueue() + */ + unsigned nr_res_idx; + unsigned nr_initialized:1; + unsigned nr_enqueued:1; + unsigned nr_started:1; + unsigned nr_finalized:1; + struct binheap_node nr_node; + + /** + * Policy-specific fields, used for determining a request's scheduling + * priority, and other supporting functionality. 
+ */ + union { + /** + * Fields for the FIFO policy + */ + struct nrs_fifo_req fifo; +#ifdef HAVE_SERVER_SUPPORT + /** + * CRR-N request defintion + */ + struct nrs_crrn_req crr; + /** ORR and TRR share the same request definition */ + struct nrs_orr_req orr; + /** + * TBF request definition + */ + struct nrs_tbf_req tbf; +#endif /* HAVE_SERVER_SUPPORT */ + /** + * Fields for the delay policy + */ + struct nrs_delay_req delay; + } nr_u; + /** + * Externally-registering policies may want to use this to allocate + * their own request properties. + */ + void *ext; +}; + +/** @} nrs */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_crr.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_crr.h new file mode 100644 index 0000000000000..c4c217bd52679 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_crr.h @@ -0,0 +1,128 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) Client Round Robin over NIDs (CRR-N) policy + * + */ + +#ifndef _LUSTRE_NRS_CRR_H +#define _LUSTRE_NRS_CRR_H + +/** + * \name CRR-N + * + * CRR-N, Client Round Robin over NIDs + * @{ + */ +#include + +/** + * private data structure for CRR-N NRS + */ +struct nrs_crrn_net { + struct ptlrpc_nrs_resource cn_res; + struct binheap *cn_binheap; + /* CRR-N NRS - NID hash body */ + struct rhashtable cn_cli_hash; + /** + * Used when a new scheduling round commences, in order to synchronize + * all clients with the new round number. + */ + __u64 cn_round; + /** + * Determines the relevant ordering amongst request batches within a + * scheduling round. + */ + __u64 cn_sequence; + /** + * Round Robin quantum; the maximum number of RPCs that each request + * batch for each client can have in a scheduling round. + */ + __u16 cn_quantum; +}; + +/** + * Object representing a client in CRR-N, as identified by its NID + */ +struct nrs_crrn_client { + struct ptlrpc_nrs_resource cc_res; + struct rhash_head cc_rhead; + lnet_nid_t cc_nid; + /** + * The round number against which this client is currently scheduling + * requests. + */ + __u64 cc_round; + /** + * The sequence number used for requests scheduled by this client during + * the current round number. + */ + __u64 cc_sequence; + atomic_t cc_ref; + /** + * Round Robin quantum; the maximum number of RPCs the client is allowed + * to schedule in a single batch of each round. + */ + __u16 cc_quantum; + /** + * # of pending requests for this client, on all existing rounds + */ + __u16 cc_active; +}; + +/** + * CRR-N NRS request definition + */ +struct nrs_crrn_req { + /** + * Round number for this request; shared with all other requests in the + * same batch. 
+ */ + __u64 cr_round; + /** + * Sequence number for this request; shared with all other requests in + * the same batch. + */ + __u64 cr_sequence; +}; + +/** + * CRR-N policy operations. + */ +enum nrs_ctl_crr { + /** + * Read the RR quantum size of a CRR-N policy. + */ + NRS_CTL_CRRN_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC, + /** + * Write the RR quantum size of a CRR-N policy. + */ + NRS_CTL_CRRN_WR_QUANTUM, +}; + +/** @} CRR-N */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_delay.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_delay.h new file mode 100644 index 0000000000000..9ffbc51b988c0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_delay.h @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, Cray Inc. All Rights Reserved. + * + * Copyright (c) 2015, Intel Corporation. + */ +/* + * + * Network Request Scheduler (NRS) Delay policy + * + */ + +#ifndef _LUSTRE_NRS_DELAY_H +#define _LUSTRE_NRS_DELAY_H + +/* \name delay + * + * Delay policy + * @{ + */ + +/** + * Private data structure for the delay policy + */ +struct nrs_delay_data { + struct ptlrpc_nrs_resource delay_res; + + /** + * Delayed requests are stored in this binheap until they are + * removed for handling. + */ + struct binheap *delay_binheap; + + /** + * Minimum service time + */ + __u32 min_delay; + + /** + * Maximum service time + */ + __u32 max_delay; + + /** + * We'll delay this percent of requests + */ + __u32 delay_pct; +}; + +struct nrs_delay_req { + /** + * This is the time at which a request becomes eligible for handling + */ + time64_t req_start_time; +}; + +enum nrs_ctl_delay { + NRS_CTL_DELAY_RD_MIN = PTLRPC_NRS_CTL_1ST_POL_SPEC, + NRS_CTL_DELAY_WR_MIN, + NRS_CTL_DELAY_RD_MAX, + NRS_CTL_DELAY_WR_MAX, + NRS_CTL_DELAY_RD_PCT, + NRS_CTL_DELAY_WR_PCT, +}; + +/** @} delay */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_fifo.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_fifo.h new file mode 100644 index 0000000000000..3b5418eac6c44 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_fifo.h @@ -0,0 +1,70 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) First-in First-out (FIFO) policy + * + */ + +#ifndef _LUSTRE_NRS_FIFO_H +#define _LUSTRE_NRS_FIFO_H + +/* \name fifo + * + * FIFO policy + * + * This policy is a logical wrapper around previous, non-NRS functionality. + * It dispatches RPCs in the same order as they arrive from the network. This + * policy is currently used as the fallback policy, and the only enabled policy + * on all NRS heads of all PTLRPC service partitions. + * @{ + */ + +/** + * Private data structure for the FIFO policy + */ +struct nrs_fifo_head { + /** + * Resource object for policy instance. + */ + struct ptlrpc_nrs_resource fh_res; + /** + * List of queued requests. + */ + struct list_head fh_list; + /** + * For debugging purposes. + */ + __u64 fh_sequence; +}; + +struct nrs_fifo_req { + struct list_head fr_list; + __u64 fr_sequence; +}; + +/** @} fifo */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_orr.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_orr.h new file mode 100644 index 0000000000000..df3d16ab8b1c9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_orr.h @@ -0,0 +1,225 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) Object-based Round Robin and Target-based + * Round Robin (ORR and TRR) policies + * + */ + +#ifndef _LUSTRE_NRS_ORR_H +#define _LUSTRE_NRS_ORR_H + +/** + * ORR policy operations + */ +enum nrs_ctl_orr { + NRS_CTL_ORR_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC, + NRS_CTL_ORR_WR_QUANTUM, + NRS_CTL_ORR_RD_OFF_TYPE, + NRS_CTL_ORR_WR_OFF_TYPE, + NRS_CTL_ORR_RD_SUPP_REQ, + NRS_CTL_ORR_WR_SUPP_REQ, +}; + +/** + * \name ORR/TRR + * + * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies + * @{ + */ + +/** + * Lower and upper byte offsets of a brw RPC + */ +struct nrs_orr_req_range { + __u64 or_start; + __u64 or_end; +}; + +/** + * RPC types supported by the ORR/TRR policies + */ +enum nrs_orr_supp { + NOS_OST_READ = BIT(0), + NOS_OST_WRITE = BIT(1), + NOS_OST_RW = (NOS_OST_READ | NOS_OST_WRITE), + /** + * Default value for policies. + */ + NOS_DFLT = NOS_OST_READ +}; + +/** + * As unique keys for grouping RPCs together, we use the object's OST FID for + * the ORR policy, and the OST index for the TRR policy. 
+ * + * XXX: We waste some space for TRR policy instances by using a union, but it + * allows us to consolidate some of the code between ORR and TRR, and these + * policies will probably eventually merge into one anyway. + */ +struct nrs_orr_key { + union { + /** object FID for ORR */ + struct lu_fid ok_fid; + /** OST index for TRR */ + __u32 ok_idx; + }; +}; + +/** + * The largest base string for unique hash/slab object names is + * "nrs_orr_reg_", so 13 characters. We add 3 to this to be used for the CPT + * id number, so this _should_ be more than enough for the maximum number of + * CPTs on any system. If it does happen that this statement is incorrect, + * nrs_orr_genobjname() will inevitably yield a non-unique name and cause + * kmem_cache_create() to complain (on Linux), so the erroneous situation + * will hopefully not go unnoticed. + */ +#define NRS_ORR_OBJ_NAME_MAX (sizeof("nrs_orr_reg_") + 3) + +/** + * Private data structure for ORR and TRR NRS + */ +struct nrs_orr_data { + struct ptlrpc_nrs_resource od_res; + struct binheap *od_binheap; + struct cfs_hash *od_obj_hash; + struct kmem_cache *od_cache; + /** + * Used when a new scheduling round commences, in order to synchronize + * all object or OST batches with the new round number. + */ + __u64 od_round; + /** + * Determines the relevant ordering amongst request batches within a + * scheduling round. + */ + __u64 od_sequence; + /** + * RPC types that are currently supported. + */ + enum nrs_orr_supp od_supp; + /** + * Round Robin quantum; the maximum number of RPCs that each request + * batch for each object or OST can have in a scheduling round. + */ + __u16 od_quantum; + /** + * Whether to use physical disk offsets or logical file offsets. + */ + bool od_physical; + /** + * XXX: We need to provide a persistently allocated string to hold + * unique object names for this policy, since in currently supported + * versions of Linux by Lustre, kmem_cache_create() just sets a pointer + * to the name string provided. kstrdup() is used in the version of + * kmem_cache_create() in current Linux mainline, so we may be able to + * remove this in the future. + */ + char od_objname[NRS_ORR_OBJ_NAME_MAX]; +}; + +/** + * Represents a backend-fs object or OST in the ORR and TRR policies + * respectively + */ +struct nrs_orr_object { + struct ptlrpc_nrs_resource oo_res; + struct hlist_node oo_hnode; + /** + * The round number against which requests are being scheduled for this + * object or OST + */ + __u64 oo_round; + /** + * The sequence number used for requests scheduled for this object or + * OST during the current round number. + */ + __u64 oo_sequence; + /** + * The key of the object or OST for which this structure instance is + * scheduling RPCs + */ + struct nrs_orr_key oo_key; + long oo_ref; + /** + * Round Robin quantum; the maximum number of RPCs that are allowed to + * be scheduled for the object or OST in a single batch of each round. + */ + __u16 oo_quantum; + /** + * # of pending requests for this object or OST, on all existing rounds + */ + __u16 oo_active; +}; + +/** + * ORR/TRR NRS request definition + */ +struct nrs_orr_req { + /** + * The offset range this request covers + */ + struct nrs_orr_req_range or_range; + /** + * Round number for this request; shared with all other requests in the + * same batch. + */ + __u64 or_round; + /** + * Sequence number for this request; shared with all other requests in + * the same batch. + */ + __u64 or_sequence; + /** + * For debugging purposes.
+ */ + struct nrs_orr_key or_key; + /** + * An ORR policy instance has filled in request information while + * enqueueing the request on the service partition's regular NRS head. + */ + unsigned int or_orr_set:1; + /** + * A TRR policy instance has filled in request information while + * enqueueing the request on the service partition's regular NRS head. + */ + unsigned int or_trr_set:1; + /** + * Request offset ranges have been filled in with logical offset + * values. + */ + unsigned int or_logical_set:1; + /** + * Request offset ranges have been filled in with physical offset + * values. + */ + unsigned int or_physical_set:1; +}; + +/** @} ORR/TRR */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h new file mode 100644 index 0000000000000..feffa4eecee63 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h @@ -0,0 +1,380 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013 DataDirect Networks, Inc. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * + * Network Request Scheduler (NRS) Token Bucket Filter(TBF) policy + * + */ + +#ifndef _LUSTRE_NRS_TBF_H +#define _LUSTRE_NRS_TBF_H + +/* \name tbf + * + * TBF policy + * + * @{ + */ + +struct nrs_tbf_head; +struct nrs_tbf_cmd; + +#define NRS_TBF_MATCH_FULL 0x0000001 +#define NRS_TBF_MATCH_WILDCARD 0x0000002 + +struct nrs_tbf_jobid { + char *tj_id; + __u32 tj_match_flag; + struct list_head tj_linkage; +}; + +#define MAX_U32_STR_LEN 10 +#define NRS_TBF_KEY_LEN (LNET_NIDSTR_SIZE + LUSTRE_JOBID_SIZE + \ + MAX_U32_STR_LEN + MAX_U32_STR_LEN + 3 + 2) + +enum nrs_tbf_flag { + NRS_TBF_FLAG_INVALID = 0x0000000, + NRS_TBF_FLAG_JOBID = 0x0000001, + NRS_TBF_FLAG_NID = 0x0000002, + NRS_TBF_FLAG_OPCODE = 0x0000004, + NRS_TBF_FLAG_GENERIC = 0x0000008, + NRS_TBF_FLAG_UID = 0x0000010, + NRS_TBF_FLAG_GID = 0x0000020, +}; + +struct tbf_id { + enum nrs_tbf_flag ti_type; + u32 ti_uid; + u32 ti_gid; +}; + +struct nrs_tbf_id { + struct tbf_id nti_id; + struct list_head nti_linkage; +}; + +struct nrs_tbf_client { + /** Resource object for policy instance. */ + struct ptlrpc_nrs_resource tc_res; + /** Node in the hash table. */ + struct hlist_node tc_hnode; + /** NID of the client. */ + lnet_nid_t tc_nid; + /** Jobid of the client. */ + char tc_jobid[LUSTRE_JOBID_SIZE]; + /** opcode of the client. */ + __u32 tc_opcode; + /** gid or uid of the client. */ + struct tbf_id tc_id; + /** Hash key of the client. */ + char tc_key[NRS_TBF_KEY_LEN]; + /** Reference number of the client. */ + atomic_t tc_ref; + /** Lock to protect rule and linkage. */ + spinlock_t tc_rule_lock; + /** Linkage to rule. */ + struct list_head tc_linkage; + /** Pointer to rule. */ + struct nrs_tbf_rule *tc_rule; + /** Generation of the rule matched. 
*/ + __u64 tc_rule_generation; + /** Limit of RPC rate. */ + __u64 tc_rpc_rate; + /** Time to wait for next token. */ + __u64 tc_nsecs; + /** RPC token number. */ + __u64 tc_ntoken; + /** Token bucket depth. */ + __u64 tc_depth; + /** Time check-point. */ + __u64 tc_check_time; + /** Deadline of a class */ + __u64 tc_deadline; + /** + * Time residue: the remainder of elapsed time + * divided by nsecs when dequeue a request. + */ + __u64 tc_nsecs_resid; + /** List of queued requests. */ + struct list_head tc_list; + /** Node in binary heap. */ + struct binheap_node tc_node; + /** Whether the client is in heap. */ + bool tc_in_heap; + /** Sequence of the newest rule. */ + __u32 tc_rule_sequence; + /** + * Linkage into LRU list. Protected bucket lock of + * nrs_tbf_head::th_cli_hash. + */ + struct list_head tc_lru; +}; + +#define MAX_TBF_NAME (16) + +enum nrs_rule_flags { + NTRS_STOPPING = 0x00000001, + NTRS_DEFAULT = 0x00000002, + NTRS_REALTIME = 0x00000004, +}; + +struct nrs_tbf_rule { + /** Name of the rule. */ + char tr_name[MAX_TBF_NAME]; + /** Head belongs to. */ + struct nrs_tbf_head *tr_head; + /** Likage to head. */ + struct list_head tr_linkage; + /** Nid list of the rule. */ + struct list_head tr_nids; + /** Nid list string of the rule.*/ + char *tr_nids_str; + /** Jobid list of the rule. */ + struct list_head tr_jobids; + /** Jobid list string of the rule.*/ + char *tr_jobids_str; + /** uid/gid list of the rule. */ + struct list_head tr_ids; + /** uid/gid list string of the rule. */ + char *tr_ids_str; + /** Opcode bitmap of the rule. */ + struct cfs_bitmap *tr_opcodes; + /** Opcode list string of the rule.*/ + char *tr_opcodes_str; + /** Condition list of the rule.*/ + struct list_head tr_conds; + /** Generic condition string of the rule. */ + char *tr_conds_str; + /** RPC/s limit. */ + __u64 tr_rpc_rate; + /** Time to wait for next token. */ + u64 tr_nsecs_per_rpc; + /** Token bucket depth. */ + __u64 tr_depth; + /** Lock to protect the list of clients. */ + spinlock_t tr_rule_lock; + /** List of client. */ + struct list_head tr_cli_list; + /** Flags of the rule. */ + enum nrs_rule_flags tr_flags; + /** Usage Reference count taken on the rule. */ + atomic_t tr_ref; + /** Generation of the rule. */ + __u64 tr_generation; +}; + +struct nrs_tbf_ops { + char *o_name; + int (*o_startup)(struct ptlrpc_nrs_policy *, struct nrs_tbf_head *); + struct nrs_tbf_client *(*o_cli_find)(struct nrs_tbf_head *, + struct ptlrpc_request *); + struct nrs_tbf_client *(*o_cli_findadd)(struct nrs_tbf_head *, + struct nrs_tbf_client *); + void (*o_cli_put)(struct nrs_tbf_head *, struct nrs_tbf_client *); + void (*o_cli_init)(struct nrs_tbf_client *, struct ptlrpc_request *); + int (*o_rule_init)(struct ptlrpc_nrs_policy *, + struct nrs_tbf_rule *, + struct nrs_tbf_cmd *); + int (*o_rule_dump)(struct nrs_tbf_rule *, struct seq_file *); + int (*o_rule_match)(struct nrs_tbf_rule *, + struct nrs_tbf_client *); + void (*o_rule_fini)(struct nrs_tbf_rule *); +}; + +#define NRS_TBF_TYPE_JOBID "jobid" +#define NRS_TBF_TYPE_NID "nid" +#define NRS_TBF_TYPE_OPCODE "opcode" +#define NRS_TBF_TYPE_GENERIC "generic" +#define NRS_TBF_TYPE_UID "uid" +#define NRS_TBF_TYPE_GID "gid" +#define NRS_TBF_TYPE_MAX_LEN 20 + +struct nrs_tbf_type { + const char *ntt_name; + enum nrs_tbf_flag ntt_flag; + struct nrs_tbf_ops *ntt_ops; +}; + +struct nrs_tbf_bucket { + /** + * LRU list, updated on each access to client. Protected by + * bucket lock of nrs_tbf_head::th_cli_hash. 
+ */ + struct list_head ntb_lru; +}; + +/** + * Private data structure for the TBF policy + */ +struct nrs_tbf_head { + /** + * Resource object for policy instance. + */ + struct ptlrpc_nrs_resource th_res; + /** + * List of rules. + */ + struct list_head th_list; + /** + * Lock to protect the list of rules. + */ + spinlock_t th_rule_lock; + /** + * Generation of rules. + */ + atomic_t th_rule_sequence; + /** + * Default rule. + */ + struct nrs_tbf_rule *th_rule; + /** + * Timer for next token. + */ + struct hrtimer th_timer; + /** + * Deadline of the timer. + */ + __u64 th_deadline; + /** + * Sequence of requests. + */ + __u64 th_sequence; + /** + * Heap of queues. + */ + struct binheap *th_binheap; + /** + * Hash of clients. + */ + struct cfs_hash *th_cli_hash; + /** + * Type of TBF policy. + */ + char th_type[NRS_TBF_TYPE_MAX_LEN + 1]; + /** + * Rule operations. + */ + struct nrs_tbf_ops *th_ops; + /** + * Flag of type. + */ + __u32 th_type_flag; + /** + * Index of bucket on hash table while purging. + */ + int th_purge_start; +}; + +enum nrs_tbf_cmd_type { + NRS_CTL_TBF_START_RULE = 0, + NRS_CTL_TBF_STOP_RULE, + NRS_CTL_TBF_CHANGE_RULE, +}; + +struct nrs_tbf_cmd { + enum nrs_tbf_cmd_type tc_cmd; + char *tc_name; + union { + struct nrs_tbf_cmd_start { + __u64 ts_rpc_rate; + struct list_head ts_nids; + char *ts_nids_str; + struct list_head ts_jobids; + char *ts_jobids_str; + struct list_head ts_ids; + char *ts_ids_str; + char *ts_opcodes_str; + struct list_head ts_conds; + char *ts_conds_str; + __u32 ts_valid_type; + enum nrs_rule_flags ts_rule_flags; + char *ts_next_name; + } tc_start; + struct nrs_tbf_cmd_change { + __u64 tc_rpc_rate; + char *tc_next_name; + } tc_change; + } u; +}; + +enum nrs_tbf_field { + NRS_TBF_FIELD_NID, + NRS_TBF_FIELD_JOBID, + NRS_TBF_FIELD_OPCODE, + NRS_TBF_FIELD_UID, + NRS_TBF_FIELD_GID, + NRS_TBF_FIELD_MAX +}; + +struct nrs_tbf_expression { + enum nrs_tbf_field te_field; + struct list_head te_cond; + struct cfs_bitmap *te_opcodes; + struct list_head te_linkage; +}; + +struct nrs_tbf_conjunction { + /** + * link to disjunction. + */ + struct list_head tc_linkage; + /** + * list of logical conjunction + */ + struct list_head tc_expressions; +}; + +struct nrs_tbf_req { + /** + * Linkage to queue. + */ + struct list_head tr_list; + /** + * Sequence of the request. + */ + __u64 tr_sequence; +}; + +/** + * TBF policy operations. + */ +enum nrs_ctl_tbf { + /** + * Read the the data of a TBF policy. + */ + NRS_CTL_TBF_RD_RULE = PTLRPC_NRS_CTL_1ST_POL_SPEC, + /** + * Write the the data of a TBF policy. + */ + NRS_CTL_TBF_WR_RULE, + /** + * Read the TBF policy type preset by proc entry "nrs_policies". + */ + NRS_CTL_TBF_RD_TYPE_FLAG, +}; + +/** @} tbf */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h b/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h new file mode 100644 index 0000000000000..dd99eee5af714 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h @@ -0,0 +1,53 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * Define obdo associated functions + * obdo: OBject Device o... + */ + +#ifndef _LUSTRE_OBDO_H_ +#define _LUSTRE_OBDO_H_ + +#include + +/** + * Create an obdo to send over the wire + */ +void lustre_set_wire_obdo(const struct obd_connect_data *ocd, + struct obdo *wobdo, + const struct obdo *lobdo); + +/** + * Create a local obdo from a wire based odbo + */ +void lustre_get_wire_obdo(const struct obd_connect_data *ocd, + struct obdo *lobdo, + const struct obdo *wobdo); +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_osc.h b/drivers/staging/lustrefsx/lustre/include/lustre_osc.h new file mode 100644 index 0000000000000..60300fa2b970b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_osc.h @@ -0,0 +1,983 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +/* + * lustre/include/lustre_osc.h + * + * OSC layer structures and methods common for both OSC and MDC. + * + * This file contains OSC interfaces used by OSC and MDC. Most of them + * were just moved from lustre/osc/osc_cl_internal.h for Data-on-MDT + * purposes. 
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + * Author: Mikhail Pershin + */ + +#ifndef LUSTRE_OSC_H +#define LUSTRE_OSC_H + +#include +#include +#include +#include + +/** \defgroup osc osc + * @{ + */ + +struct osc_quota_info { + /** linkage for quota hash table */ + struct hlist_node oqi_hash; + __u32 oqi_id; +}; + +enum async_flags { + ASYNC_READY = 0x1, /* ap_make_ready will not be called before this + page is added to an rpc */ + ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ + ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called + to give the caller a chance to update + or cancel the size of the io */ + ASYNC_HP = 0x10, +}; + +struct osc_async_page { + int oap_magic; + unsigned short oap_cmd; + + struct list_head oap_pending_item; + struct list_head oap_rpc_item; + + loff_t oap_obj_off; + unsigned oap_page_off; + enum async_flags oap_async_flags; + + struct brw_page oap_brw_page; + + struct ptlrpc_request *oap_request; + struct client_obd *oap_cli; + struct osc_object *oap_obj; + + spinlock_t oap_lock; +}; + +#define oap_page oap_brw_page.pg +#define oap_count oap_brw_page.count +#define oap_brw_flags oap_brw_page.flag + +static inline struct osc_async_page *brw_page2oap(struct brw_page *pga) +{ + return container_of(pga, struct osc_async_page, oap_brw_page); +} + +struct osc_device { + struct cl_device osc_cl; + struct obd_export *osc_exp; + + /* Write stats is actually protected by client_obd's lock. */ + struct osc_stats { + ktime_t os_init; + uint64_t os_lockless_writes; /* by bytes */ + uint64_t os_lockless_reads; /* by bytes */ + } osc_stats; + + /* configuration item(s) */ + time64_t osc_contention_time; +}; + +struct osc_extent; + +/** + * State maintained by osc layer for each IO context. + */ +struct osc_io { + /** super class */ + struct cl_io_slice oi_cl; + /** true if this io is lockless. */ + unsigned int oi_lockless:1, + /** true if this io is counted as active IO */ + oi_is_active:1, + /** true if this io has CAP_SYS_RESOURCE */ + oi_cap_sys_resource:1, + /** true if this io issued by readahead */ + oi_is_readahead:1; + /** how many LRU pages are reserved for this IO */ + unsigned long oi_lru_reserved; + + /** active extents, we know how many bytes is going to be written, + * so having an active extent will prevent it from being fragmented */ + struct osc_extent *oi_active; + /** partially truncated extent, we need to hold this extent to prevent + * page writeback from happening. */ + struct osc_extent *oi_trunc; + /** write osc_lock for this IO, used by osc_extent_find(). */ + struct osc_lock *oi_write_osclock; + struct obdo oi_oa; + struct osc_async_cbargs { + bool opc_rpc_sent; + int opc_rc; + struct completion opc_sync; + } oi_cbarg; +}; + +/** + * State maintained by osc layer for the duration of a system call. + */ +struct osc_session { + struct osc_io os_io; +}; + +#define OTI_PVEC_SIZE 256 +struct osc_thread_info { + struct ldlm_res_id oti_resname; + union ldlm_policy_data oti_policy; + struct cl_attr oti_attr; + struct cl_io oti_io; + struct pagevec oti_pagevec; + void *oti_pvec[OTI_PVEC_SIZE]; + /** + * Fields used by cl_lock_discard_pages(). 
+ */ + pgoff_t oti_next_index; + pgoff_t oti_fn_index; /* first non-overlapped index */ + pgoff_t oti_ng_index; /* negative lock caching */ + struct cl_sync_io oti_anchor; + struct cl_req_attr oti_req_attr; + struct lu_buf oti_ladvise_buf; +}; + +static inline __u64 osc_enq2ldlm_flags(__u32 enqflags) +{ + __u64 result = 0; + + CDEBUG(D_DLMTRACE, "flags: %x\n", enqflags); + + LASSERT((enqflags & ~CEF_MASK) == 0); + + if (enqflags & CEF_NONBLOCK) + result |= LDLM_FL_BLOCK_NOWAIT; + if (enqflags & CEF_GLIMPSE) + result |= LDLM_FL_HAS_INTENT|LDLM_FL_CBPENDING; + if (enqflags & CEF_DISCARD_DATA) + result |= LDLM_FL_AST_DISCARD_DATA; + if (enqflags & CEF_PEEK) + result |= LDLM_FL_TEST_LOCK; + if (enqflags & CEF_LOCK_MATCH) + result |= LDLM_FL_MATCH_LOCK; + if (enqflags & CEF_LOCK_NO_EXPAND) + result |= LDLM_FL_NO_EXPANSION; + if (enqflags & CEF_SPECULATIVE) + result |= LDLM_FL_SPECULATIVE; + return result; +} + +typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh, + int rc); + +struct osc_enqueue_args { + struct obd_export *oa_exp; + enum ldlm_type oa_type; + enum ldlm_mode oa_mode; + __u64 *oa_flags; + osc_enqueue_upcall_f oa_upcall; + void *oa_cookie; + struct ost_lvb *oa_lvb; + struct lustre_handle oa_lockh; + bool oa_speculative; +}; + +/** + * Bit flags for osc_dlm_lock_at_pageoff(). + */ +enum osc_dap_flags { + /** + * Just check if the desired lock exists, it won't hold reference + * count on lock. + */ + OSC_DAP_FL_TEST_LOCK = BIT(0), + /** + * Return the lock even if it is being canceled. + */ + OSC_DAP_FL_CANCELING = BIT(1), + /** + * check ast data is present, requested to cancel cb + */ + OSC_DAP_FL_AST = BIT(2), + /** + * look at right region for the desired lock + */ + OSC_DAP_FL_RIGHT = BIT(3), +}; + +/* + * The set of operations which are different for MDC and OSC objects + */ +struct osc_object_operations { + void (*oto_build_res_name)(struct osc_object *osc, + struct ldlm_res_id *resname); + struct ldlm_lock* (*oto_dlmlock_at_pgoff)(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags dap_flags); +}; + +struct osc_object { + struct cl_object oo_cl; + struct lov_oinfo *oo_oinfo; + /** + * True if locking against this stripe got -EUSERS. + */ + int oo_contended; + ktime_t oo_contention_time; +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK + /** + * IO context used for invariant checks in osc_lock_has_pages(). + */ + struct cl_io oo_debug_io; + /** Serialization object for osc_object::oo_debug_io. */ + struct mutex oo_debug_mutex; +#endif + /** + * used by the osc to keep track of what objects to build into rpcs. + * Protected by client_obd->cli_loi_list_lock. + */ + struct list_head oo_ready_item; + struct list_head oo_hp_ready_item; + struct list_head oo_write_item; + struct list_head oo_read_item; + + /** + * extent is a red black tree to manage (async) dirty pages. + */ + struct rb_root oo_root; + /** + * Manage write(dirty) extents. + */ + struct list_head oo_hp_exts; /* list of hp extents */ + struct list_head oo_urgent_exts; /* list of writeback extents */ + struct list_head oo_full_exts; + + struct list_head oo_reading_exts; + + atomic_t oo_nr_reads; + atomic_t oo_nr_writes; + + /** Protect extent tree. Will be used to protect + * oo_{read|write}_pages soon. 
*/ + spinlock_t oo_lock; + + /** + * Radix tree for caching pages + */ + spinlock_t oo_tree_lock; + struct radix_tree_root oo_tree; + unsigned long oo_npages; + + /* Protect osc_lock this osc_object has */ + struct list_head oo_ol_list; + spinlock_t oo_ol_spin; + + /** number of active IOs of this object */ + atomic_t oo_nr_ios; + wait_queue_head_t oo_io_waitq; + + const struct osc_object_operations *oo_obj_ops; + bool oo_initialized; +}; + +static inline void osc_build_res_name(struct osc_object *osc, + struct ldlm_res_id *resname) +{ + return osc->oo_obj_ops->oto_build_res_name(osc, resname); +} + +static inline struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags flags) +{ + return obj->oo_obj_ops->oto_dlmlock_at_pgoff(env, obj, index, flags); +} + +static inline void osc_object_lock(struct osc_object *obj) +{ + spin_lock(&obj->oo_lock); +} + +static inline int osc_object_trylock(struct osc_object *obj) +{ + return spin_trylock(&obj->oo_lock); +} + +static inline void osc_object_unlock(struct osc_object *obj) +{ + spin_unlock(&obj->oo_lock); +} + +#define assert_osc_object_is_locked(obj) \ + assert_spin_locked(&obj->oo_lock) + +static inline void osc_object_set_contended(struct osc_object *obj) +{ + obj->oo_contention_time = ktime_get(); + /* mb(); */ + obj->oo_contended = 1; +} + +static inline void osc_object_clear_contended(struct osc_object *obj) +{ + obj->oo_contended = 0; +} + +/* + * Lock "micro-states" for osc layer. + */ +enum osc_lock_state { + OLS_NEW, + OLS_ENQUEUED, + OLS_UPCALL_RECEIVED, + OLS_GRANTED, + OLS_CANCELLED +}; + +/** + * osc-private state of cl_lock. + * + * Interaction with DLM. + * + * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in + * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_dlmlock. + * + * This pointer is protected through a reference, acquired by + * osc_lock_upcall0(). Also, an additional reference is acquired by + * ldlm_lock_addref() call protecting the lock from cancellation, until + * osc_lock_unuse() releases it. + * + * Below is a description of how lock references are acquired and released + * inside of DLM. + * + * - When new lock is created and enqueued to the server (ldlm_cli_enqueue()) + * - ldlm_lock_create() + * - ldlm_lock_new(): initializes a lock with 2 references. One for + * the caller (released when reply from the server is received, or on + * error), and another for the hash table. + * - ldlm_lock_addref_internal(): protects the lock from cancellation. + * + * - When reply is received from the server (osc_enqueue_interpret()) + * - ldlm_cli_enqueue_fini() + * - LDLM_LOCK_PUT(): releases caller reference acquired by + * ldlm_lock_new(). + * - if (rc != 0) + * ldlm_lock_decref(): error case: matches ldlm_cli_enqueue(). + * - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue(). + * + * - When lock is being cancelled (ldlm_lock_cancel()) + * - ldlm_lock_destroy() + * - LDLM_LOCK_PUT(): releases hash-table reference acquired by + * ldlm_lock_new(). + * + * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called + * either when lock is cancelled (osc_lock_blocking()), or when locks is + * deleted without cancellation (e.g., from cl_locks_prune()). In the latter + * case ldlm lock remains in memory, and can be re-attached to osc_lock in the + * future. + */ +struct osc_lock { + struct cl_lock_slice ols_cl; + /** Internal lock to protect states, etc. 
*/ + spinlock_t ols_lock; + /** Owner sleeps on this channel for state change */ + struct cl_sync_io *ols_owner; + /** waiting list for this lock to be cancelled */ + struct list_head ols_waiting_list; + /** wait entry of ols_waiting_list */ + struct list_head ols_wait_entry; + /** list entry for osc_object::oo_ol_list */ + struct list_head ols_nextlock_oscobj; + + /** underlying DLM lock */ + struct ldlm_lock *ols_dlmlock; + /** DLM flags with which osc_lock::ols_lock was enqueued */ + __u64 ols_flags; + /** osc_lock::ols_lock handle */ + struct lustre_handle ols_handle; + struct ldlm_enqueue_info ols_einfo; + enum osc_lock_state ols_state; + /** lock value block */ + struct ost_lvb ols_lvb; + /** Lockless operations to be used by lockless lock */ + const struct cl_lock_operations *ols_lockless_ops; + /** + * true, if ldlm_lock_addref() was called against + * osc_lock::ols_lock. This is used for sanity checking. + * + * \see osc_lock::ols_has_ref + */ + unsigned ols_hold :1, + /** + * this is much like osc_lock::ols_hold, except that this bit is + * cleared _after_ the reference is released in osc_lock_unuse(). This + * fine distinction is needed because: + * + * - if ldlm lock still has a reference, osc_ast_data_get() needs + * to return associated cl_lock (so that a flag is needed that is + * cleared after ldlm_lock_decref() returned), and + * + * - ldlm_lock_decref() can invoke blocking ast (for a + * LDLM_FL_CBPENDING lock), and osc_lock functions like + * osc_lock_cancel() called from there need to know whether to + * release lock reference (so that a flag is needed that is + * cleared before ldlm_lock_decref() is called). + */ + ols_has_ref:1, + /** + * inherit the lockless attribute from top level cl_io. + * If true, osc_lock_enqueue is able to tolerate the -EUSERS error. + */ + ols_locklessable:1, + /** + * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat + * the EVAVAIL error as tolerable; this will make the upper logic happy + * to wait for all glimpse locks to each OST to be completed. + * Glimpse lock converts to normal lock if the server lock is granted. + * Glimpse lock should be destroyed immediately after use. + */ + ols_glimpse:1, + /** + * For async glimpse lock. + */ + ols_agl:1, + /** + * for speculative locks - asynchronous glimpse locks and ladvise + * lockahead manual lock requests + * + * Used to tell osc layer to not wait for the ldlm reply from the + * server, so the osc lock will be short lived - it only exists to + * create the ldlm request and is not updated on request completion. + */ + ols_speculative:1; +}; + +static inline int osc_lock_is_lockless(const struct osc_lock *ols) +{ + return (ols->ols_cl.cls_ops == ols->ols_lockless_ops); +} + +/** + * Page state private for osc layer. + */ +struct osc_page { + struct cl_page_slice ops_cl; + /** + * Page queues used by osc to detect when RPC can be formed. + */ + struct osc_async_page ops_oap; + /** + * An offset within page from which next transfer starts. This is used + * by cl_page_clip() to submit partial page transfers. + */ + unsigned int ops_from:PAGE_SHIFT, + /** + * An offset within page at which next transfer ends (inclusive). + * + * \see osc_page::ops_from. + */ + ops_to:PAGE_SHIFT, + /** + * Boolean, true iff page is under transfer. Used for sanity checking. + */ + ops_transfer_pinned:1, + /** + * in LRU? + */ + ops_in_lru:1, + /** + * Set if the page must be transferred with OBD_BRW_SRVLOCK. + */ + ops_srvlock:1, + /** + * If the page is in osc_object::oo_tree.
+ */ + ops_intree:1; + /** + * lru page list. See osc_lru_{del|use}() in osc_page.c for usage. + */ + struct list_head ops_lru; + /** + * Submit time - the time when the page is starting RPC. For debugging. + */ + ktime_t ops_submit_time; +}; + +struct osc_brw_async_args { + struct obdo *aa_oa; + int aa_requested_nob; + int aa_nio_count; + u32 aa_page_count; + s32 aa_resends; + struct brw_page **aa_ppga; + struct client_obd *aa_cli; + struct list_head aa_oaps; + struct list_head aa_exts; +}; + +extern struct kmem_cache *osc_lock_kmem; +extern struct kmem_cache *osc_object_kmem; +extern struct kmem_cache *osc_thread_kmem; +extern struct kmem_cache *osc_session_kmem; +extern struct kmem_cache *osc_extent_kmem; +extern struct kmem_cache *osc_quota_kmem; +extern struct kmem_cache *osc_obdo_kmem; + +extern struct lu_context_key osc_key; +extern struct lu_context_key osc_session_key; + +#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY) + +/* osc_page.c */ +int osc_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t ind); +void osc_index2policy(union ldlm_policy_data *policy, const struct cl_object *obj, + pgoff_t start, pgoff_t end); +void osc_lru_add_batch(struct client_obd *cli, struct list_head *list); +void osc_page_submit(const struct lu_env *env, struct osc_page *opg, + enum cl_req_type crt, int brw_flags, ktime_t submit_time); +int lru_queue_work(const struct lu_env *env, void *data); +long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, + long target, bool force); + +/* osc_cache.c */ +int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg, + u32 async_flags); +int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, + struct cl_page *page, loff_t offset); +int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, cl_commit_cbt cb); +int osc_page_cache_add(const struct lu_env *env, struct osc_page *opg, + struct cl_io *io, cl_commit_cbt cb); +int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj, + struct osc_page *ops); +int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops); +int osc_queue_sync_pages(const struct lu_env *env, struct cl_io *io, + struct osc_object *obj, struct list_head *list, + int brw_flags); +int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, + __u64 size, struct osc_extent **extp); +void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext); +int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, int hp, int discard); +int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end); +int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, int async); +static inline void osc_wake_cache_waiters(struct client_obd *cli) +{ + wake_up(&cli->cl_cache_waiters); +} + +static inline int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, + struct osc_object *osc) +{ + return osc_io_unplug0(env, cli, osc, 1); +} + +static inline void osc_io_unplug(const struct lu_env *env, + struct client_obd *cli, + struct osc_object *osc) +{ + (void)osc_io_unplug0(env, cli, osc, 0); +} + +typedef bool (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *, + void**, int, void *); +bool osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, + struct osc_object *osc, pgoff_t start, pgoff_t end, + 
osc_page_gang_cbt cb, void *cbdata); +bool osc_discard_cb(const struct lu_env *env, struct cl_io *io, + void**, int, void *cbdata); + +/* osc_dev.c */ +int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next); +struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d); +struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d); + +/* osc_object.c */ +int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +void osc_object_free(const struct lu_env *env, struct lu_object *obj); +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb); +int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj); +int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +int osc_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj, + struct ost_lvb *lvb); +int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc); +int osc_object_find_cbdata(const struct lu_env *env, struct cl_object *obj, + ldlm_iterator_t iter, void *data); +int osc_object_prune(const struct lu_env *env, struct cl_object *obj); + +/* osc_request.c */ +void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd); +int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg); +int osc_precleanup_common(struct obd_device *obd); +int osc_cleanup_common(struct obd_device *obd); +int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, void *val, + struct ptlrpc_request_set *set); +int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg); +int osc_reconnect(const struct lu_env *env, struct obd_export *exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata); +int osc_disconnect(struct obd_export *exp); +int osc_punch_send(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie); +int osc_fallocate_base(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, int mode); +void osc_update_next_shrink(struct client_obd *cli); +void osc_schedule_grant_work(void); + +/* osc_io.c */ +int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue); +int osc_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *qin, int from, int to, + cl_commit_cbt cb); +void osc_io_extent_release(const struct lu_env *env, + const struct cl_io_slice *ios); +int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios); +void osc_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios); +void osc_io_rw_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios); +int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios); +void osc_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *slice); +int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice); +int osc_io_write_start(const struct lu_env *env, + const struct cl_io_slice *slice); +void osc_io_end(const struct lu_env *env, const struct cl_io_slice 
*slice); +int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, + struct cl_fsync_io *fio); +void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice); +void osc_read_ahead_release(const struct lu_env *env, struct cl_read_ahead *ra); +int osc_io_lseek_start(const struct lu_env *env, + const struct cl_io_slice *slice); +void osc_io_lseek_end(const struct lu_env *env, + const struct cl_io_slice *slice); +int osc_io_lru_reserve(const struct lu_env *env, const struct cl_io_slice *ios, + loff_t pos, size_t count); +int osc_punch_start(const struct lu_env *env, struct cl_io *io, + struct cl_object *obj); + +/* osc_lock.c */ +void osc_lock_to_lockless(const struct lu_env *env, struct osc_lock *ols, + int force); +void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc, + struct osc_lock *oscl); +int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj, + struct osc_lock *oscl); +void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io, + struct cl_object *obj, struct osc_lock *oscl); +int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice); +void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice); +void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice); +int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data); +unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); + +/***************************************************************************** + * + * Accessors and type conversions. + * + */ +static inline struct osc_thread_info *osc_env_info(const struct lu_env *env) +{ + struct osc_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &osc_key); + LASSERT(info != NULL); + return info; +} + +static inline struct osc_session *osc_env_session(const struct lu_env *env) +{ + struct osc_session *ses; + + ses = lu_context_key_get(env->le_ses, &osc_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct osc_io *osc_env_io(const struct lu_env *env) +{ + return &osc_env_session(env)->os_io; +} + +static inline struct osc_device *lu2osc_dev(const struct lu_device *d) +{ + return container_of_safe(d, struct osc_device, osc_cl.cd_lu_dev); +} + +static inline struct obd_export *osc_export(const struct osc_object *obj) +{ + return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->osc_exp; +} + +static inline struct client_obd *osc_cli(const struct osc_object *obj) +{ + return &osc_export(obj)->exp_obd->u.cli; +} + +static inline struct osc_object *cl2osc(const struct cl_object *obj) +{ + return container_of_safe(obj, struct osc_object, oo_cl); +} + +static inline struct cl_object *osc2cl(const struct osc_object *obj) +{ + return (struct cl_object *)&obj->oo_cl; +} + +static inline struct osc_device *obd2osc_dev(const struct obd_device *obd) +{ + return container_of_safe(obd->obd_lu_dev, struct osc_device, + osc_cl.cd_lu_dev); +} + +static inline struct lu_device *osc2lu_dev(struct osc_device *osc) +{ + return &osc->osc_cl.cd_lu_dev; +} + +static inline struct lu_object *osc2lu(struct osc_object *osc) +{ + return &osc->oo_cl.co_lu; +} + +static inline struct osc_object *lu2osc(const struct lu_object *obj) +{ + return container_of_safe(obj, struct osc_object, oo_cl.co_lu); +} + +static inline struct osc_io *cl2osc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = container_of(slice, struct osc_io, oi_cl); + + LINVRNT(oio == osc_env_io(env)); + 
return oio; +} + +static inline enum ldlm_mode osc_cl_lock2ldlm(enum cl_lock_mode mode) +{ + LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP); + if (mode == CLM_READ) + return LCK_PR; + if (mode == CLM_WRITE) + return LCK_PW; + return LCK_GROUP; +} + +static inline enum cl_lock_mode osc_ldlm2cl_lock(enum ldlm_mode mode) +{ + LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP); + if (mode == LCK_PR) + return CLM_READ; + if (mode == LCK_PW) + return CLM_WRITE; + return CLM_GROUP; +} + +static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice) +{ + return container_of_safe(slice, struct osc_page, ops_cl); +} + +static inline struct osc_page *oap2osc(struct osc_async_page *oap) +{ + return container_of_safe(oap, struct osc_page, ops_oap); +} + +static inline pgoff_t osc_index(struct osc_page *opg) +{ + return opg->ops_oap.oap_obj_off >> PAGE_SHIFT; +} + +static inline struct cl_page *oap2cl_page(struct osc_async_page *oap) +{ + return oap2osc(oap)->ops_cl.cpl_page; +} + +static inline struct osc_page *oap2osc_page(struct osc_async_page *oap) +{ + return (struct osc_page *)container_of(oap, struct osc_page, ops_oap); +} + +static inline struct osc_page * +osc_cl_page_osc(struct cl_page *page, struct osc_object *osc) +{ + const struct cl_page_slice *slice; + + LASSERT(osc != NULL); + slice = cl_object_page_slice(&osc->oo_cl, page); + return cl2osc_page(slice); +} + +static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice) +{ + return container_of_safe(slice, struct osc_lock, ols_cl); +} + +static inline int osc_io_srvlock(struct osc_io *oio) +{ + return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock); +} + +enum osc_extent_state { + OES_INV = 0, /** extent is just initialized or destroyed */ + OES_ACTIVE = 1, /** process is using this extent */ + OES_CACHE = 2, /** extent is ready for IO */ + OES_LOCKING = 3, /** locking page to prepare IO */ + OES_LOCK_DONE = 4, /** locking finished, ready to send */ + OES_RPC = 5, /** in RPC */ + OES_TRUNC = 6, /** being truncated */ + OES_STATE_MAX +}; + +/** + * osc_extent data to manage dirty pages. + * osc_extent has the following attributes: + * 1. all pages in the same must be in one RPC in write back; + * 2. # of pages must be less than max_pages_per_rpc - implied by 1; + * 3. must be covered by only 1 osc_lock; + * 4. exclusive. It's impossible to have overlapped osc_extent. + * + * The lifetime of an extent is from when the 1st page is dirtied to when + * all pages inside it are written out. + * + * LOCKING ORDER + * ============= + * page lock -> client_obd_list_lock -> object lock(osc_object::oo_lock) + */ +struct osc_extent { + /** red-black tree node */ + struct rb_node oe_node; + /** osc_object of this extent */ + struct osc_object *oe_obj; + /** refcount, removed from red-black tree if reaches zero. */ + struct kref oe_refc; + /** busy if non-zero */ + atomic_t oe_users; + /** link list of osc_object's oo_{hp|urgent|locking}_exts. */ + struct list_head oe_link; + /** state of this extent */ + enum osc_extent_state oe_state; + /** flags for this extent. */ + /** 0 is write, 1 is read */ + unsigned int oe_rw:1, + /** sync extent, queued by osc_queue_sync_pages() */ + oe_sync:1, + /** set if this extent has partial, sync pages. 
+ * Extents with partial page(s) can't merge with others in RPC */ + oe_no_merge:1, + oe_srvlock:1, + oe_memalloc:1, + /** an ACTIVE extent is going to be truncated, so when this extent + * is released, it will turn into TRUNC state instead of CACHE. */ + oe_trunc_pending:1, + /** this extent should be written asap and someone may wait for the + * write to finish. This bit is usually set along with urgent if + * the extent was CACHE state. + * fsync_wait extent can't be merged because new extent region may + * exceed fsync range. */ + oe_fsync_wait:1, + /** covering lock is being canceled */ + oe_hp:1, + /** this extent should be written back asap. set if one of pages is + * called by page WB daemon, or sync write or reading requests. */ + oe_urgent:1, + /** Non-delay RPC should be used for this extent. */ + oe_ndelay:1, + /** direct IO pages */ + oe_dio:1, + /** this extent consists of pages that are not directly accessible + * from the CPU */ + oe_is_rdma_only:1; + /** how many grants allocated for this extent. + * Grant allocated for this extent. There is no grant allocated + * for reading extents and sync write extents. */ + unsigned int oe_grants; + /** # of dirty pages in this extent */ + unsigned int oe_nr_pages; + /** list of pending oap pages. Pages in this list are NOT sorted. */ + struct list_head oe_pages; + /** start and end index of this extent, include start and end + * themselves. Page offset here is the page index of osc_pages. + * oe_start is used as keyword for red-black tree. */ + pgoff_t oe_start; + pgoff_t oe_end; + /** maximum ending index of this extent, this is limited by + * max_pages_per_rpc, lock extent and chunk size. */ + pgoff_t oe_max_end; + /** waitqueue - for those who want to be notified if this extent's + * state has changed. */ + wait_queue_head_t oe_waitq; + /** lock covering this extent */ + struct ldlm_lock *oe_dlmlock; + /** terminator of this extent. Must be true if this extent is in IO. */ + struct task_struct *oe_owner; + /** return value of writeback. If somebody is waiting for this extent, + * this value can be known by outside world. */ + int oe_rc; + /** max pages per rpc when this extent was created */ + unsigned int oe_mppr; + /** FLR: layout version when this osc_extent is publised */ + __u32 oe_layout_version; +}; + +/** @} osc */ + +#endif /* LUSTRE_OSC_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_quota.h b/drivers/staging/lustrefsx/lustre/include/lustre_quota.h new file mode 100644 index 0000000000000..4b674d8b1257b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_quota.h @@ -0,0 +1,279 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + * Use is subject to license terms. + */ + +#ifndef _LUSTRE_QUOTA_H +#define _LUSTRE_QUOTA_H + +/** \defgroup quota quota + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifndef MAX_IQ_TIME +#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#endif + +#ifndef MAX_DQ_TIME +#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ +#endif + +struct lquota_id_info; +struct lquota_trans; + +/* Gather all quota record type in an union that can be used to read any records + * from disk. All fields of these records must be 64-bit aligned, otherwise the + * OSD layer may swab them incorrectly. */ +union lquota_rec { + struct lquota_glb_rec lqr_glb_rec; + struct lquota_slv_rec lqr_slv_rec; + struct lquota_acct_rec lqr_acct_rec; +}; + +/* flags for inode/block quota accounting */ +enum osd_qid_declare_flags { + OSD_QID_INODE = BIT(0), + OSD_QID_BLK = BIT(1), + OSD_QID_FORCE = BIT(2), +}; + +/* Index features supported by the global index objects + * Only used for migration purpose and should be removed once on-disk migration + * is no longer needed */ +extern struct dt_index_features dt_quota_iusr_features; +extern struct dt_index_features dt_quota_busr_features; +extern struct dt_index_features dt_quota_igrp_features; +extern struct dt_index_features dt_quota_bgrp_features; + +/* Name used in the configuration logs to identify the default metadata pool + * (composed of all the MDTs, with pool ID 0) and the default data pool (all + * the OSTs, with pool ID 0 too). */ +#define QUOTA_METAPOOL_NAME "mdt=" +#define QUOTA_DATAPOOL_NAME "ost=" + +/* + * Quota Master Target support + */ + +/* Request handlers for quota master operations. + * This is used by the MDT to pass quota/lock requests to the quota master + * target. This won't be needed any more once the QMT is a real target and + * does not rely any more on the MDT service threads and namespace. */ +struct qmt_handlers { + /* Handle quotactl request from client. */ + int (*qmth_quotactl)(const struct lu_env *, struct lu_device *, + struct obd_quotactl *); + + /* Handle dqacq/dqrel request from slave. 
*/ + int (*qmth_dqacq)(const struct lu_env *, struct lu_device *, + struct ptlrpc_request *); + + /* LDLM intent policy associated with quota locks */ + int (*qmth_intent_policy)(const struct lu_env *, struct lu_device *, + struct ptlrpc_request *, struct ldlm_lock **, + int); + + /* Initialize LVB of ldlm resource associated with quota objects */ + int (*qmth_lvbo_init)(struct lu_device *, struct ldlm_resource *); + + /* Update LVB of ldlm resource associated with quota objects */ + int (*qmth_lvbo_update)(struct lu_device *, struct ldlm_resource *, + struct ptlrpc_request *, int); + + /* Return size of LVB to be packed in ldlm message */ + int (*qmth_lvbo_size)(struct lu_device *, struct ldlm_lock *); + + /* Fill request buffer with lvb */ + int (*qmth_lvbo_fill)(struct lu_device *, struct ldlm_lock *, void *, + int); + + /* Free lvb associated with ldlm resource */ + int (*qmth_lvbo_free)(struct lu_device *, struct ldlm_resource *); +}; + +/* actual handlers are defined in lustre/quota/qmt_handler.c */ +extern struct qmt_handlers qmt_hdls; + +/* + * Quota enforcement support on slaves + */ + +struct qsd_instance; + +/* The quota slave feature is implemented in the form of a library. + * The API is the following: + * + * - qsd_init(): the user (mostly the OSD layer) should first allocate a qsd + * instance via qsd_init(). This creates all required structures + * to manage quota enforcement for this target and performs all + * low-level initialization which does not involve any lustre + * object. qsd_init() should typically be called when the OSD + * is being set up. + * + * - qsd_prepare(): This sets up on-disk objects associated with the quota slave + * feature and initiates the quota reintegration procedure if + * needed. qsd_prepare() should typically be called when + * ->ldo_prepare is invoked. + * + * - qsd_start(): a qsd instance should be started once recovery is completed + * (i.e. when ->ldo_recovery_complete is called). This is used + * to notify the qsd layer that quota should now be enforced + * again via the qsd_op_begin/end functions. The last step of the + * reintegration procedure (namely usage reconciliation) will be + * completed during start. + * + * - qsd_fini(): is used to release a qsd_instance structure allocated with + * qsd_init(). This releases all quota slave objects and frees the + * structures associated with the qsd_instance. + * + * - qsd_op_begin(): is used to enforce quota; it must be called in the + * declaration of each operation. qsd_op_end() should then be + * invoked later once all operations have been completed in + * order to release/adjust the quota space. + * Running qsd_op_begin() before qsd_start() isn't fatal and + * will return success. + * Once qsd_start() has been run, qsd_op_begin() will block + * until the reintegration procedure is completed. + * + * - qsd_op_end(): performs the post operation quota processing. This must be + * called after the operation transaction has stopped. + * While qsd_op_begin() must be invoked each time a new + * operation is declared, qsd_op_end() should be called only + * once for the whole transaction. + * + * - qsd_op_adjust(): triggers pre-acquire/release if necessary. + * + * Below are the function prototypes to be used by the OSD layer to manage + * quota enforcement. Arguments are documented where each function is defined.
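+ *
+ * A minimal, illustrative call sequence for the API above (a sketch only:
+ * the names env, svname, dev, procroot, is_md, trans, qi and qflags are
+ * placeholders, and error handling plus the OSD's own transaction
+ * start/execute/stop are omitted):
+ *
+ *	struct qsd_instance *qsd;
+ *	struct lquota_trans trans;
+ *	struct lquota_id_info qi;
+ *	enum osd_quota_local_flags qflags = 0;
+ *	int rc;
+ *
+ *	qsd = qsd_init(env, svname, dev, procroot, is_md, false);
+ *	rc = qsd_prepare(env, qsd);		(from ->ldo_prepare)
+ *	rc = qsd_start(env, qsd);		(once recovery has completed)
+ *
+ *	rc = qsd_op_begin(env, qsd, &trans, &qi, &qflags);  (declare phase)
+ *	... start, execute and stop the transaction ...
+ *	qsd_op_end(env, qsd, &trans);		(after the transaction stops)
+ *
+ *	qsd_fini(env, qsd);			(on cleanup/umount)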
*/ + +/* flags for quota local enforcement */ +enum osd_quota_local_flags { + QUOTA_FL_OVER_USRQUOTA = BIT(0), + QUOTA_FL_OVER_GRPQUOTA = BIT(1), + QUOTA_FL_SYNC = BIT(2), + QUOTA_FL_OVER_PRJQUOTA = BIT(3), +}; + +struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *, + struct proc_dir_entry *, bool is_md, bool excl); +int qsd_prepare(const struct lu_env *, struct qsd_instance *); +int qsd_start(const struct lu_env *, struct qsd_instance *); +void qsd_fini(const struct lu_env *, struct qsd_instance *); +int qsd_op_begin(const struct lu_env *, struct qsd_instance *, + struct lquota_trans *, struct lquota_id_info *, + enum osd_quota_local_flags *); +void qsd_op_end(const struct lu_env *, struct qsd_instance *, + struct lquota_trans *); +void qsd_op_adjust(const struct lu_env *, struct qsd_instance *, + union lquota_id *, int); +int qsd_transfer(const struct lu_env *env, struct qsd_instance *qsd, + struct lquota_trans *trans, unsigned int qtype, + u64 orig_id, u64 new_id, u64 bspace, + struct lquota_id_info *qi); +int qsd_reserve_or_free_quota(const struct lu_env *env, + struct qsd_instance *qsd, + struct lquota_id_info *qi); + +/* + * Quota information attached to a transaction + */ + +struct lquota_entry; + +struct lquota_id_info { + /* quota identifier */ + union lquota_id lqi_id; + + /* USRQUOTA or GRPQUOTA for now, could be expanded for + * directory quota or other types later. */ + int lqi_type; + + /* inodes or kbytes to be consumed or released, it could + * be negative when releasing space. */ + long long lqi_space; + + /* quota slave entry structure associated with this ID */ + struct lquota_entry *lqi_qentry; + + /* whether we are reporting blocks or inodes */ + bool lqi_is_blk; +}; + +/* With the DoM, both inode quota in meta pool and block quota in data pool + * will be enforced at MDT, there are at most 4 quota ids being enforced in + * a single transaction for inode and block quota, which is chown transaction: + * original uid and gid, new uid and gid. + * + * This value might need to be revised when directory quota is added. 
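+ *
+ * For example, a chown involves at most four ids (the original uid and gid
+ * plus the new uid and gid), and each id may need one lquota_id_info entry
+ * for inode accounting in the meta pool and one for block accounting in the
+ * data pool, i.e. 4 * 2 = 8 entries, matching the number of slots reserved
+ * below.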
*/ +#define QUOTA_MAX_TRANSIDS 8 + +/* all qids involved in a single transaction */ +struct lquota_trans { + unsigned short lqt_id_cnt; + struct lquota_id_info lqt_ids[QUOTA_MAX_TRANSIDS]; +}; + +#define IS_LQUOTA_RES(res) \ + (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA || \ + res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB) + +/* helper function used by MDT & OFD to retrieve quota accounting information + * on slave */ +int lquotactl_slv(const struct lu_env *, struct dt_device *, + struct obd_quotactl *); + +static inline int quota_reserve_or_free(const struct lu_env *env, + struct qsd_instance *qsd, + struct lquota_id_info *qi, + enum quota_type type, __u64 uid, + __u64 gid, __s64 count, bool is_md) +{ + qi->lqi_type = type; + if (count > 0) + qi->lqi_space = toqb(count); + else + qi->lqi_space = -toqb(-count); + + if (is_md) + qi->lqi_is_blk = false; + else + qi->lqi_is_blk = true; + + qi->lqi_id.qid_uid = uid; + qi->lqi_id.qid_gid = gid; + + return qsd_reserve_or_free_quota(env, qsd, qi); +} + +/** @} quota */ +#endif /* _LUSTRE_QUOTA_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h new file mode 100644 index 0000000000000..57c74aa322fe1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h @@ -0,0 +1,428 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_req_layout.h + * + * Lustre Metadata Target (mdt) request handler + * + * Author: Nikita Danilov + */ + +#ifndef _LUSTRE_REQ_LAYOUT_H__ +#define _LUSTRE_REQ_LAYOUT_H__ + +#include + +/** \defgroup req_layout req_layout + * + * @{ + */ + +struct req_msg_field; +struct req_format; +struct req_capsule; + +struct ptlrpc_request; + +enum req_location { + RCL_CLIENT, + RCL_SERVER, + RCL_NR +}; + +/* Maximal number of fields (buffers) in a request message. 
*/ +#define REQ_MAX_FIELD_NR 12 + +struct req_capsule { + struct ptlrpc_request *rc_req; + /** Request message - what client sent */ + struct lustre_msg *rc_reqmsg; + /** Reply message - server response */ + struct lustre_msg *rc_repmsg; + /** Fields that help to see if request and reply were swabved or not */ + __u32 rc_req_swab_mask; + __u32 rc_rep_swab_mask; + const struct req_format *rc_fmt; + enum req_location rc_loc; + __u32 rc_area[RCL_NR][REQ_MAX_FIELD_NR]; +}; + +void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req, + enum req_location location); +void req_capsule_fini(struct req_capsule *pill); + +void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt); +void req_capsule_client_dump(struct req_capsule *pill); +void req_capsule_server_dump(struct req_capsule *pill); +void req_capsule_init_area(struct req_capsule *pill); +size_t req_capsule_filled_sizes(struct req_capsule *pill, + enum req_location loc); +int req_capsule_server_pack(struct req_capsule *pill); + +void *req_capsule_client_get(struct req_capsule *pill, + const struct req_msg_field *field); +void *req_capsule_client_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber); +void *req_capsule_client_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len); +void *req_capsule_server_get(struct req_capsule *pill, + const struct req_msg_field *field); +void *req_capsule_server_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len); +void *req_capsule_server_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber); +void *req_capsule_server_sized_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len, void *swabber); +const void *req_capsule_other_get(struct req_capsule *pill, + const struct req_msg_field *field); + +void req_capsule_set_size(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, __u32 size); +__u32 req_capsule_get_size(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +__u32 req_capsule_msg_size(struct req_capsule *pill, enum req_location loc); +__u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, + enum req_location loc); +void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt); + +int req_capsule_has_field(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +int req_capsule_field_present(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +void req_capsule_shrink(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 newlen, + enum req_location loc); +int req_capsule_server_grow(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 newlen); +bool req_capsule_need_swab(struct req_capsule *pill, enum req_location loc, + __u32 index); +void req_capsule_set_swabbed(struct req_capsule *pill, enum req_location loc, + __u32 index); + +/** + * Returns true if request buffer at offset \a index was already swabbed + */ +static inline bool req_capsule_req_swabbed(struct req_capsule *pill, + size_t index) +{ + LASSERT(index < sizeof(pill->rc_req_swab_mask) * 8); + return pill->rc_req_swab_mask & BIT(index); +} + +/** + * Returns true if request reply buffer at offset \a index was already swabbed + */ +static inline bool req_capsule_rep_swabbed(struct req_capsule *pill, + size_t index) +{ + 
LASSERT(index < sizeof(pill->rc_rep_swab_mask) * 8); + return pill->rc_rep_swab_mask & BIT(index); +} + +/** + * Returns true if request needs to be swabbed into local cpu byteorder + */ +static inline bool req_capsule_req_need_swab(struct req_capsule *pill) +{ + return req_capsule_req_swabbed(pill, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Returns true if request reply needs to be swabbed into local cpu byteorder + */ +static inline bool req_capsule_rep_need_swab(struct req_capsule *pill) +{ + return req_capsule_rep_swabbed(pill, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Mark request buffer at offset \a index that it was already swabbed + */ +static inline void req_capsule_set_req_swabbed(struct req_capsule *pill, + size_t index) +{ + LASSERT(index < sizeof(pill->rc_req_swab_mask) * 8); + LASSERT((pill->rc_req_swab_mask & BIT(index)) == 0); + pill->rc_req_swab_mask |= BIT(index); +} + +/** + * Mark request reply buffer at offset \a index that it was already swabbed + */ +static inline void req_capsule_set_rep_swabbed(struct req_capsule *pill, + size_t index) +{ + LASSERT(index < sizeof(pill->rc_rep_swab_mask) * 8); + LASSERT((pill->rc_rep_swab_mask & BIT(index)) == 0); + pill->rc_rep_swab_mask |= BIT(index); +} + +int req_layout_init(void); +void req_layout_fini(void); +#ifdef HAVE_SERVER_SUPPORT +int req_check_sepol(struct req_capsule *pill); +#else +static inline int req_check_sepol(struct req_capsule *pill) +{ + return 0; +} +#endif + +extern struct req_format RQF_OBD_PING; +extern struct req_format RQF_OBD_SET_INFO; +extern struct req_format RQF_MDT_SET_INFO; +extern struct req_format RQF_SEC_CTX; +extern struct req_format RQF_OBD_IDX_READ; +/* MGS req_format */ +extern struct req_format RQF_MGS_TARGET_REG; +extern struct req_format RQF_MGS_SET_INFO; +extern struct req_format RQF_MGS_CONFIG_READ; +/* fid/fld req_format */ +extern struct req_format RQF_SEQ_QUERY; +extern struct req_format RQF_FLD_QUERY; +extern struct req_format RQF_FLD_READ; +/* MDS req_format */ +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_STATFS; +extern struct req_format RQF_MDS_STATFS_NEW; +extern struct req_format RQF_MDS_GET_ROOT; +extern struct req_format RQF_MDS_SYNC; +extern struct req_format RQF_MDS_GETXATTR; +extern struct req_format RQF_MDS_GETATTR; +extern struct req_format RQF_OUT_UPDATE; + +/* + * This is format of direct (non-intent) MDS_GETATTR_NAME request. 
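+ * In other words, this is the getattr-by-name sent on its own rather than
+ * packed inside an LDLM intent lock request (compare RQF_LDLM_INTENT_GETATTR
+ * further below).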
+ */ +extern struct req_format RQF_MDS_GETATTR_NAME; +extern struct req_format RQF_MDS_CLOSE; +extern struct req_format RQF_MDS_CLOSE_INTENT; +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_GET_INFO; +extern struct req_format RQF_MDS_READPAGE; +extern struct req_format RQF_MDS_REINT; +extern struct req_format RQF_MDS_REINT_CREATE; +extern struct req_format RQF_MDS_REINT_CREATE_ACL; +extern struct req_format RQF_MDS_REINT_CREATE_SLAVE; +extern struct req_format RQF_MDS_REINT_CREATE_SYM; +extern struct req_format RQF_MDS_REINT_OPEN; +extern struct req_format RQF_MDS_REINT_UNLINK; +extern struct req_format RQF_MDS_REINT_LINK; +extern struct req_format RQF_MDS_REINT_RENAME; +extern struct req_format RQF_MDS_REINT_SETATTR; +extern struct req_format RQF_MDS_REINT_SETXATTR; +extern struct req_format RQF_MDS_QUOTACTL; +extern struct req_format RQF_QUOTA_DQACQ; +extern struct req_format RQF_MDS_SWAP_LAYOUTS; +extern struct req_format RQF_MDS_REINT_MIGRATE; +extern struct req_format RQF_MDS_REINT_RESYNC; +extern struct req_format RQF_MDS_RMFID; +/* MDS hsm formats */ +extern struct req_format RQF_MDS_HSM_STATE_GET; +extern struct req_format RQF_MDS_HSM_STATE_SET; +extern struct req_format RQF_MDS_HSM_ACTION; +extern struct req_format RQF_MDS_HSM_PROGRESS; +extern struct req_format RQF_MDS_HSM_CT_REGISTER; +extern struct req_format RQF_MDS_HSM_CT_UNREGISTER; +extern struct req_format RQF_MDS_HSM_REQUEST; +/* OST req_format */ +extern struct req_format RQF_OST_CONNECT; +extern struct req_format RQF_OST_DISCONNECT; +extern struct req_format RQF_OST_QUOTACTL; +extern struct req_format RQF_OST_GETATTR; +extern struct req_format RQF_OST_SETATTR; +extern struct req_format RQF_OST_CREATE; +extern struct req_format RQF_OST_PUNCH; +extern struct req_format RQF_OST_FALLOCATE; +extern struct req_format RQF_OST_SYNC; +extern struct req_format RQF_OST_DESTROY; +extern struct req_format RQF_OST_BRW_READ; +extern struct req_format RQF_OST_BRW_WRITE; +extern struct req_format RQF_OST_STATFS; +extern struct req_format RQF_OST_SET_GRANT_INFO; +extern struct req_format RQF_OST_GET_INFO; +extern struct req_format RQF_OST_GET_INFO_LAST_ID; +extern struct req_format RQF_OST_GET_INFO_LAST_FID; +extern struct req_format RQF_OST_SET_INFO_LAST_FID; +extern struct req_format RQF_OST_GET_INFO_FIEMAP; +extern struct req_format RQF_OST_LADVISE; +extern struct req_format RQF_OST_SEEK; + +/* LDLM req_format */ +extern struct req_format RQF_LDLM_ENQUEUE; +extern struct req_format RQF_LDLM_ENQUEUE_LVB; +extern struct req_format RQF_LDLM_CONVERT; +extern struct req_format RQF_LDLM_INTENT; +extern struct req_format RQF_LDLM_INTENT_BASIC; +extern struct req_format RQF_LDLM_INTENT_LAYOUT; +extern struct req_format RQF_LDLM_INTENT_GETATTR; +extern struct req_format RQF_LDLM_INTENT_OPEN; +extern struct req_format RQF_LDLM_INTENT_CREATE; +extern struct req_format RQF_LDLM_INTENT_GETXATTR; +extern struct req_format RQF_LDLM_INTENT_QUOTA; +extern struct req_format RQF_LDLM_CANCEL; +extern struct req_format RQF_LDLM_CALLBACK; +extern struct req_format RQF_LDLM_CP_CALLBACK; +extern struct req_format RQF_LDLM_BL_CALLBACK; +extern struct req_format RQF_LDLM_GL_CALLBACK; +extern struct req_format RQF_LDLM_GL_CALLBACK_DESC; +/* LOG req_format */ +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK; +extern struct req_format 
RQF_LLOG_ORIGIN_HANDLE_READ_HEADER; + +extern struct req_format RQF_CONNECT; + +/* LFSCK req_format */ +extern struct req_format RQF_LFSCK_NOTIFY; +extern struct req_format RQF_LFSCK_QUERY; + +extern struct req_msg_field RMF_GENERIC_DATA; +extern struct req_msg_field RMF_PTLRPC_BODY; +extern struct req_msg_field RMF_MDT_BODY; +extern struct req_msg_field RMF_MDT_EPOCH; +extern struct req_msg_field RMF_OBD_STATFS; +extern struct req_msg_field RMF_NAME; +extern struct req_msg_field RMF_SYMTGT; +extern struct req_msg_field RMF_TGTUUID; +extern struct req_msg_field RMF_CLUUID; +extern struct req_msg_field RMF_SETINFO_VAL; +extern struct req_msg_field RMF_SETINFO_KEY; +extern struct req_msg_field RMF_GETINFO_VAL; +extern struct req_msg_field RMF_GETINFO_VALLEN; +extern struct req_msg_field RMF_GETINFO_KEY; +extern struct req_msg_field RMF_IDX_INFO; +extern struct req_msg_field RMF_CLOSE_DATA; +extern struct req_msg_field RMF_FILE_SECCTX_NAME; +extern struct req_msg_field RMF_FILE_SECCTX; +extern struct req_msg_field RMF_FID_ARRAY; +extern struct req_msg_field RMF_FILE_ENCCTX; + +/* + * connection handle received in MDS_CONNECT request. + */ +extern struct req_msg_field RMF_CONN; +extern struct req_msg_field RMF_CONNECT_DATA; +extern struct req_msg_field RMF_DLM_REQ; +extern struct req_msg_field RMF_DLM_REP; +extern struct req_msg_field RMF_DLM_LVB; +extern struct req_msg_field RMF_DLM_GL_DESC; +extern struct req_msg_field RMF_LDLM_INTENT; +extern struct req_msg_field RMF_LAYOUT_INTENT; +extern struct req_msg_field RMF_MDT_MD; +extern struct req_msg_field RMF_DEFAULT_MDT_MD; +extern struct req_msg_field RMF_REC_REINT; +extern struct req_msg_field RMF_EADATA; +extern struct req_msg_field RMF_EAVALS; +extern struct req_msg_field RMF_EAVALS_LENS; +extern struct req_msg_field RMF_ACL; +extern struct req_msg_field RMF_LOGCOOKIES; +extern struct req_msg_field RMF_CAPA1; +extern struct req_msg_field RMF_CAPA2; +extern struct req_msg_field RMF_OBD_QUOTACHECK; +extern struct req_msg_field RMF_OBD_QUOTACTL; +extern struct req_msg_field RMF_OBD_QUOTACTL_POOL; +extern struct req_msg_field RMF_QUOTA_BODY; +extern struct req_msg_field RMF_STRING; +extern struct req_msg_field RMF_SWAP_LAYOUTS; +extern struct req_msg_field RMF_MDS_HSM_PROGRESS; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; +extern struct req_msg_field RMF_MDS_HSM_USER_ITEM; +extern struct req_msg_field RMF_MDS_HSM_ARCHIVE; +extern struct req_msg_field RMF_HSM_USER_STATE; +extern struct req_msg_field RMF_HSM_STATE_SET; +extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; +extern struct req_msg_field RMF_SELINUX_POL; + +/* seq-mgr fields */ +extern struct req_msg_field RMF_SEQ_OPC; +extern struct req_msg_field RMF_SEQ_RANGE; +extern struct req_msg_field RMF_FID_SPACE; + +/* FLD fields */ +extern struct req_msg_field RMF_FLD_OPC; +extern struct req_msg_field RMF_FLD_MDFLD; + +extern struct req_msg_field RMF_LLOGD_BODY; +extern struct req_msg_field RMF_LLOG_LOG_HDR; +extern struct req_msg_field RMF_LLOGD_CONN_BODY; + +extern struct req_msg_field RMF_MGS_TARGET_INFO; +extern struct req_msg_field RMF_MGS_SEND_PARAM; + +extern struct req_msg_field RMF_OST_BODY; +extern struct req_msg_field RMF_OBD_IOOBJ; +extern struct req_msg_field RMF_OBD_ID; +extern struct req_msg_field RMF_FID; +extern struct req_msg_field RMF_NIOBUF_REMOTE; +extern struct req_msg_field RMF_NIOBUF_INLINE; +extern struct req_msg_field RMF_RCS; +extern struct req_msg_field RMF_FIEMAP_KEY; +extern struct req_msg_field 
RMF_FIEMAP_VAL; +extern struct req_msg_field RMF_OST_ID; +extern struct req_msg_field RMF_SHORT_IO; + +/* MGS config read message format */ +extern struct req_msg_field RMF_MGS_CONFIG_BODY; +extern struct req_msg_field RMF_MGS_CONFIG_RES; + +/* generic uint32 */ +extern struct req_msg_field RMF_U32; + +/* OBJ update format */ +extern struct req_msg_field RMF_OUT_UPDATE; +extern struct req_msg_field RMF_OUT_UPDATE_REPLY; +extern struct req_msg_field RMF_OUT_UPDATE_HEADER; +extern struct req_msg_field RMF_OUT_UPDATE_BUF; + +/* LFSCK format */ +extern struct req_msg_field RMF_LFSCK_REQUEST; +extern struct req_msg_field RMF_LFSCK_REPLY; + +extern struct req_msg_field RMF_OST_LADVISE_HDR; +extern struct req_msg_field RMF_OST_LADVISE; +/** @} req_layout */ + +#endif /* _LUSTRE_REQ_LAYOUT_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h b/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h new file mode 100644 index 0000000000000..16249a3a65f2e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h @@ -0,0 +1,392 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + */ +/* + * lustre/include/lustre_scrub.h + * + * Shared definitions and declarations for Lustre OI scrub. 
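+ * (OI: Object Index, the on-disk tables mapping Lustre FIDs to local
+ * objects, which the scrub verifies and repairs when inconsistent.)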
+ * + * Author: Fan Yong + */ + +#ifndef _LUSTRE_SCRUB_H +# define _LUSTRE_SCRUB_H + +#include +#include +#include + +#define OSD_OI_FID_OID_BITS_MAX 10 +#define OSD_OI_FID_NR_MAX (1UL << OSD_OI_FID_OID_BITS_MAX) +#define SCRUB_OI_BITMAP_SIZE (OSD_OI_FID_NR_MAX >> 3) +#define PFID_STRIPE_IDX_BITS 16 +#define PFID_STRIPE_COUNT_MASK ((1 << PFID_STRIPE_IDX_BITS) - 1) + +#define SCRUB_MAGIC_V1 0x4C5FD252 +#define SCRUB_MAGIC_V2 0x4C5FE253 +#define SCRUB_CHECKPOINT_INTERVAL 60 +#define SCRUB_WINDOW_SIZE 1024 + +enum scrub_next_status { + /* exit current loop and process next group */ + SCRUB_NEXT_BREAK = 1, + + /* skip current object and process next bit */ + SCRUB_NEXT_CONTINUE = 2, + + /* exit all the loops */ + SCRUB_NEXT_EXIT = 3, + + /* wait for free cache slot */ + SCRUB_NEXT_WAIT = 4, + + /* simulate system crash during OI scrub */ + SCRUB_NEXT_CRASH = 5, + + /* simulate failure during OI scrub */ + SCRUB_NEXT_FATAL = 6, + + /* new created object, no scrub on it */ + SCRUB_NEXT_NOSCRUB = 7, + + /* the object has no FID-in-LMA */ + SCRUB_NEXT_NOLMA = 8, + + /* for OST-object */ + SCRUB_NEXT_OSTOBJ = 9, + + /* old OST-object, no LMA or no FID-on-OST flags in LMA */ + SCRUB_NEXT_OSTOBJ_OLD = 10, +}; + +enum scrub_local_file_flags { + SLFF_SCAN_SUBITEMS = 0x0001, + SLFF_HIDE_FID = 0x0002, + SLFF_SHOW_NAME = 0x0004, + SLFF_NO_OI = 0x0008, + SLFF_IDX_IN_FID = 0x0010, +}; + +enum scrub_status { + /* The scrub file is new created, for new MDT, upgrading from old disk, + * or re-creating the scrub file manually. */ + SS_INIT = 0, + + /* The scrub is checking/repairing the OI files. */ + SS_SCANNING = 1, + + /* The scrub checked/repaired the OI files successfully. */ + SS_COMPLETED = 2, + + /* The scrub failed to check/repair the OI files. */ + SS_FAILED = 3, + + /* The scrub is stopped manually, the OI files may be inconsistent. */ + SS_STOPPED = 4, + + /* The scrub is paused automatically when umount. */ + SS_PAUSED = 5, + + /* The scrub crashed during the scanning, should be restarted. */ + SS_CRASHED = 6, +}; + +enum scrub_flags { + /* OI files have been recreated, OI mappings should be re-inserted. */ + SF_RECREATED = 0x0000000000000001ULL, + + /* OI files are invalid, should be rebuild ASAP */ + SF_INCONSISTENT = 0x0000000000000002ULL, + + /* OI scrub is triggered automatically. */ + SF_AUTO = 0x0000000000000004ULL, + + /* The device is upgraded from 1.8 format. */ + SF_UPGRADE = 0x0000000000000008ULL, +}; + +enum scrub_param { + /* Exit when fail. */ + SP_FAILOUT = 0x0001, + + /* Check only without repairing. */ + SP_DRYRUN = 0x0002, +}; + +enum scrub_start { + /* Set failout flag. */ + SS_SET_FAILOUT = 0x00000001, + + /* Clear failout flag. */ + SS_CLEAR_FAILOUT = 0x00000002, + + /* Reset scrub start position. */ + SS_RESET = 0x00000004, + + /* Trigger full scrub automatically. */ + SS_AUTO_FULL = 0x00000008, + + /* Trigger partial scrub automatically. */ + SS_AUTO_PARTIAL = 0x00000010, + + /* Set dryrun flag. */ + SS_SET_DRYRUN = 0x00000020, + + /* Clear dryrun flag. */ + SS_CLEAR_DRYRUN = 0x00000040, +}; + +enum osd_lf_flags { + OLF_SCAN_SUBITEMS = 0x0001, + OLF_HIDE_FID = 0x0002, + OLF_SHOW_NAME = 0x0004, + OLF_NO_OI = 0x0008, + OLF_IDX_IN_FID = 0x0010, + OLF_NOT_BACKUP = 0x0020, +}; + +/* There are some overhead to detect OI inconsistency automatically + * during normal RPC handling. We do not want to always auto detect + * OI inconsistency especailly when OI scrub just done recently. 
+ * + * The 'auto_scrub' defines the time (united as second) interval to + * enable auto detect OI inconsistency since last OI scurb done. */ +enum auto_scrub { + /* Disable auto scrub. */ + AS_NEVER = 0, + + /* 1 second is too short interval, it is almost equal to always auto + * detect inconsistent OI, usually used for test. */ + AS_ALWAYS = 1, + + /* Enable auto detect OI inconsistency one month (60 * 60 * 24 * 30) + * after last OI scrub. */ + AS_DEFAULT = 2592000LL, +}; + +struct scrub_file { + /* 128-bit uuid for volume. */ + uuid_t sf_uuid; + + /* See 'enum scrub_flags'. */ + __u64 sf_flags; + + /* The scrub magic. */ + __u32 sf_magic; + + /* See 'enum scrub_status'. */ + __u16 sf_status; + + /* See 'enum scrub_param'. */ + __u16 sf_param; + + /* The time for the last OI scrub completed. */ + time64_t sf_time_last_complete; + + /* The ttime for the latest OI scrub ran. */ + time64_t sf_time_latest_start; + + /* The time for the last OI scrub checkpoint. */ + time64_t sf_time_last_checkpoint; + + /* The position for the latest OI scrub started from. */ + __u64 sf_pos_latest_start; + + /* The position for the last OI scrub checkpoint. */ + __u64 sf_pos_last_checkpoint; + + /* The position for the first should be updated object. */ + __u64 sf_pos_first_inconsistent; + + /* How many objects have been checked. */ + __u64 sf_items_checked; + + /* How many objects have been updated. */ + __u64 sf_items_updated; + + /* How many objects failed to be processed. */ + __u64 sf_items_failed; + + /* How many prior objects have been updated during scanning. */ + __u64 sf_items_updated_prior; + + /* How many objects marked as LDISKFS_STATE_LUSTRE_NOSCRUB. */ + __u64 sf_items_noscrub; + + /* How many IGIF objects. */ + __u64 sf_items_igif; + + /* How long the OI scrub has run in seconds. Do NOT change + * to time64_t since this breaks backwards compatibility. + * It shouldn't take more than 136 years to complete :-) + */ + s32 sf_run_time; + + /* How many completed OI scrub ran on the device. */ + __u32 sf_success_count; + + /* How many OI files. */ + __u16 sf_oi_count; + + /* Keep the flags after scrub reset. See 'enum scrub_internal_flags' */ + __u16 sf_internal_flags; + + __u32 sf_reserved_1; + __u64 sf_reserved_2[16]; + + /* Bitmap for OI files recreated case. */ + __u8 sf_oi_bitmap[SCRUB_OI_BITMAP_SIZE]; +}; + +struct lustre_scrub { + /* Object for the scrub file. */ + struct dt_object *os_obj; + + struct task_struct *os_task; + struct list_head os_inconsistent_items; + + /* write lock for scrub prep/update/post/checkpoint, + * read lock for scrub dump. */ + struct rw_semaphore os_rwsem; + spinlock_t os_lock; + + /* Scrub file in memory. */ + struct scrub_file os_file; + + /* Buffer for scrub file load/store. */ + struct scrub_file os_file_disk; + + const char *os_name; + + /* The time for last checkpoint, seconds */ + time64_t os_time_last_checkpoint; + + /* The time for next checkpoint, seconds */ + time64_t os_time_next_checkpoint; + + /* How long to wait to start scrubbing */ + time64_t os_auto_scrub_interval; + + /* How many objects have been checked since last checkpoint. */ + __u64 os_new_checked; + __u64 os_pos_current; + __u32 os_start_flags; + /* Some of these bits can be set by different threads so + * all updates must be protected by ->os_lock to avoid + * racing read-modify-write cycles causing corruption. + */ + unsigned int os_in_prior:1, /* process inconsistent item + * found by RPC prior */ + os_waiting:1, /* Waiting for scan window. 
*/ + os_full_speed:1, /* run w/o speed limit */ + os_paused:1, /* The scrub is paused. */ + os_convert_igif:1, + os_partial_scan:1, + os_in_join:1, + os_running:1, /* scrub thread is running */ + os_full_scrub:1, + os_has_ml_file:1; +}; + +#define INDEX_BACKUP_MAGIC_V1 0x1E41F208 +#define INDEX_BACKUP_BUFSIZE (4096 * 4) + +enum lustre_index_backup_policy { + /* By default, do not backup the index */ + LIBP_NONE = 0, + + /* Backup the dirty index objects when umount */ + LIBP_AUTO = 1, +}; + +struct lustre_index_backup_header { + __u32 libh_magic; + __u32 libh_count; + __u32 libh_keysize; + __u32 libh_recsize; + struct lu_fid libh_owner; + __u64 libh_pad[60]; /* keep header 512 bytes aligned */ +}; + +struct lustre_index_backup_unit { + struct list_head libu_link; + struct lu_fid libu_fid; + __u32 libu_keysize; + __u32 libu_recsize; +}; + +struct lustre_index_restore_unit { + struct list_head liru_link; + struct lu_fid liru_pfid; + struct lu_fid liru_cfid; + __u64 liru_clid; + int liru_len; + char liru_name[0]; +}; + +void scrub_file_init(struct lustre_scrub *scrub, uuid_t uuid); +void scrub_file_reset(struct lustre_scrub *scrub, uuid_t uuid, u64 flags); +int scrub_file_load(const struct lu_env *env, struct lustre_scrub *scrub); +int scrub_file_store(const struct lu_env *env, struct lustre_scrub *scrub); +bool scrub_needs_check(struct lustre_scrub *scrub, const struct lu_fid *fid, + u64 index); +int scrub_checkpoint(const struct lu_env *env, struct lustre_scrub *scrub); +int scrub_thread_prep(const struct lu_env *env, struct lustre_scrub *scrub, + uuid_t uuid, u64 start); +int scrub_thread_post(const struct lu_env *env, struct lustre_scrub *scrub, + int result); +int scrub_start(int (*threadfn)(void *data), struct lustre_scrub *scrub, + void *data, __u32 flags); +void scrub_stop(struct lustre_scrub *scrub); +void scrub_dump(struct seq_file *m, struct lustre_scrub *scrub); + +int lustre_liru_new(struct list_head *head, const struct lu_fid *pfid, + const struct lu_fid *cfid, __u64 child, + const char *name, int namelen); + +int lustre_index_register(struct dt_device *dev, const char *devname, + struct list_head *head, spinlock_t *lock, int *guard, + const struct lu_fid *fid, + __u32 keysize, __u32 recsize); + +void lustre_index_backup(const struct lu_env *env, struct dt_device *dev, + const char *devname, struct list_head *head, + spinlock_t *lock, int *guard, bool backup); +int lustre_index_restore(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *parent_fid, + const struct lu_fid *tgt_fid, + const struct lu_fid *bak_fid, const char *name, + struct list_head *head, spinlock_t *lock, + char *buf, int bufsize); + +static inline void lustre_fid2lbx(char *buf, const struct lu_fid *fid, int len) +{ + snprintf(buf, len, DFID_NOBRACE".lbx", PFID(fid)); +} + +static inline const char *osd_scrub2name(struct lustre_scrub *scrub) +{ + return scrub->os_name; +} +#endif /* _LUSTRE_SCRUB_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_sec.h b/drivers/staging/lustrefsx/lustre/include/lustre_sec.h new file mode 100644 index 0000000000000..831d35183247f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_sec.h @@ -0,0 +1,1208 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _LUSTRE_SEC_H_ +#define _LUSTRE_SEC_H_ + +/** \defgroup sptlrpc sptlrpc + * + * @{ + */ + +/* + * to avoid include + */ +struct obd_import; +struct obd_export; +struct ptlrpc_request; +struct ptlrpc_reply_state; +struct ptlrpc_bulk_desc; +struct brw_page; +struct lu_env; +/* Linux specific */ +struct key; +struct seq_file; +struct lustre_cfg; + +/* + * forward declaration + */ +struct ptlrpc_sec_policy; +struct ptlrpc_sec_cops; +struct ptlrpc_sec_sops; +struct ptlrpc_sec; +struct ptlrpc_svc_ctx; +struct ptlrpc_cli_ctx; +struct ptlrpc_ctx_ops; +struct req_msg_field; + +/** + * \addtogroup flavor flavor + * + * RPC flavor is represented by a 32 bits integer. Currently the high 12 bits + * are unused, must be set to 0 for future expansion. + *
+ * ------------------------------------------------------------------------
+ * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech)  | 4b (policy) |
+ * ------------------------------------------------------------------------
+ * 
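+ * As a worked example derived from the constants defined below, the krb5i
+ * flavor packs policy GSS (2), mechanism krb5 (1), service integrity (2),
+ * default bulk type (0) and bulk integrity service (2) as:
+ *
+ *	(2 << 16) | (0 << 12) | (2 << 8) | (1 << 4) | (2 << 0) == 0x20212
+ *
+ * which is exactly what MAKE_FLVR() produces for SPTLRPC_FLVR_KRB5I.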
+ * + * @{ + */ + +/* + * flavor constants + */ +enum sptlrpc_policy { + SPTLRPC_POLICY_NULL = 0, + SPTLRPC_POLICY_PLAIN = 1, + SPTLRPC_POLICY_GSS = 2, + SPTLRPC_POLICY_MAX, +}; + +enum sptlrpc_mech_null { + SPTLRPC_MECH_NULL = 0, + SPTLRPC_MECH_NULL_MAX, +}; + +enum sptlrpc_mech_plain { + SPTLRPC_MECH_PLAIN = 0, + SPTLRPC_MECH_PLAIN_MAX, +}; + +enum sptlrpc_mech_gss { + SPTLRPC_MECH_GSS_NULL = 0, + SPTLRPC_MECH_GSS_KRB5 = 1, + SPTLRPC_MECH_GSS_SK = 2, + SPTLRPC_MECH_GSS_MAX, +}; + +enum sptlrpc_service_type { + SPTLRPC_SVC_NULL = 0, /**< no security */ + SPTLRPC_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_SVC_INTG = 2, /**< integrity */ + SPTLRPC_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_SVC_MAX, +}; + +enum sptlrpc_bulk_type { + SPTLRPC_BULK_DEFAULT = 0, /**< follow rpc flavor */ + SPTLRPC_BULK_HASH = 1, /**< hash integrity */ + SPTLRPC_BULK_MAX, +}; + +enum sptlrpc_bulk_service { + SPTLRPC_BULK_SVC_NULL = 0, /**< no security */ + SPTLRPC_BULK_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_BULK_SVC_INTG = 2, /**< integrity */ + SPTLRPC_BULK_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_BULK_SVC_MAX, +}; + +/* + * compose/extract macros + */ +#define FLVR_POLICY_OFFSET (0) +#define FLVR_MECH_OFFSET (4) +#define FLVR_SVC_OFFSET (8) +#define FLVR_BULK_TYPE_OFFSET (12) +#define FLVR_BULK_SVC_OFFSET (16) + +#define MAKE_FLVR(policy, mech, svc, btype, bsvc) \ + (((__u32)(policy) << FLVR_POLICY_OFFSET) | \ + ((__u32)(mech) << FLVR_MECH_OFFSET) | \ + ((__u32)(svc) << FLVR_SVC_OFFSET) | \ + ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) | \ + ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET)) + +/* + * extraction + */ +#define SPTLRPC_FLVR_POLICY(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF) +#define SPTLRPC_FLVR_MECH(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF) +#define SPTLRPC_FLVR_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_TYPE(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF) + +#define SPTLRPC_FLVR_BASE(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF) +#define SPTLRPC_FLVR_BASE_SUB(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF) + +/* + * gss subflavors + */ +#define MAKE_BASE_SUBFLVR(mech, svc) \ + ((__u32)(mech) | \ + ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET))) + +#define SPTLRPC_SUBFLVR_GSSNULL \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_NULL, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_KRB5N \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_KRB5A \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH) +#define SPTLRPC_SUBFLVR_KRB5I \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG) +#define SPTLRPC_SUBFLVR_KRB5P \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV) +#define SPTLRPC_SUBFLVR_SKN \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_SKA \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_AUTH) +#define SPTLRPC_SUBFLVR_SKI \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_INTG) +#define SPTLRPC_SUBFLVR_SKPI \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_PRIV) + +/* + * "end user" flavors + */ +#define SPTLRPC_FLVR_NULL \ + MAKE_FLVR(SPTLRPC_POLICY_NULL, \ + SPTLRPC_MECH_NULL, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_PLAIN \ + MAKE_FLVR(SPTLRPC_POLICY_PLAIN, \ + SPTLRPC_MECH_PLAIN, \ + 
SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_HASH, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_GSSNULL \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_NULL, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5N \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5A \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_AUTH, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5I \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_INTG, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_KRB5P \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_PRIV, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_PRIV) +#define SPTLRPC_FLVR_SKN \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_SKA \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_AUTH, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_SKI \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_INTG, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_SKPI \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_PRIV, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_PRIV) + +#define SPTLRPC_FLVR_DEFAULT SPTLRPC_FLVR_NULL + +#define SPTLRPC_FLVR_INVALID ((__u32) 0xFFFFFFFF) +#define SPTLRPC_FLVR_ANY ((__u32) 0xFFF00000) + +/** + * extract the useful part from wire flavor + */ +#define WIRE_FLVR(wflvr) (((__u32) (wflvr)) & 0x000FFFFF) + +/** @} flavor */ + +static inline void flvr_set_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + svc, + SPTLRPC_FLVR_BULK_TYPE(*flvr), + SPTLRPC_FLVR_BULK_SVC(*flvr)); +} + +static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_BULK_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + SPTLRPC_FLVR_SVC(*flvr), + SPTLRPC_FLVR_BULK_TYPE(*flvr), + svc); +} + +struct bulk_spec_hash { + __u8 hash_alg; +}; + +/** + * Full description of flavors being used on a ptlrpc connection, include + * both regular RPC and bulk transfer parts. + */ +struct sptlrpc_flavor { + /** + * wire flavor, should be renamed to sf_wire. + */ + __u32 sf_rpc; + /** + * general flags of PTLRPC_SEC_FL_* + */ + __u32 sf_flags; + /** + * rpc flavor specification + */ + union { + /* nothing for now */ + } u_rpc; + /** + * bulk flavor specification + */ + union { + struct bulk_spec_hash hash; + } u_bulk; +}; + +/** + * identify the RPC is generated from what part of Lustre. It's encoded into + * RPC requests and to be checked by ptlrpc service. + */ +enum lustre_sec_part { + LUSTRE_SP_CLI = 0, + LUSTRE_SP_MDT, + LUSTRE_SP_OST, + LUSTRE_SP_MGC, + LUSTRE_SP_MGS, + LUSTRE_SP_ANY = 0xFF +}; + +const char *sptlrpc_part2name(enum lustre_sec_part sp); +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd); + +/** + * A rule specifies a flavor to be used by a ptlrpc connection between + * two Lustre parts. 
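+ *
+ * For example, a rule with sr_from = LUSTRE_SP_CLI, sr_to = LUSTRE_SP_MDT
+ * and a given sr_netid says "client-to-MDT RPCs on that LNET network use
+ * the flavor stored in sr_flvr"; LUSTRE_SP_ANY is the wildcard value for
+ * either end.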
+ */ +struct sptlrpc_rule { + __u32 sr_netid; /* LNET network ID */ + __u8 sr_from; /* sec_part */ + __u8 sr_to; /* sec_part */ + __u16 sr_padding; + struct sptlrpc_flavor sr_flvr; +}; + +/** + * A set of rules in memory. + * + * Rules are generated and stored on MGS, and propagated to MDT, OST, + * and client when needed. + */ +struct sptlrpc_rule_set { + int srs_nslot; + int srs_nrule; + struct sptlrpc_rule *srs_rules; +}; + +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr); +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr); + +static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set) +{ + memset(set, 0, sizeof(*set)); +} + +void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set, + struct sptlrpc_rule *rule); +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set); + +int sptlrpc_process_config(struct lustre_cfg *lcfg); +void sptlrpc_conf_log_start(const char *logname); +void sptlrpc_conf_log_stop(const char *logname); +void sptlrpc_conf_log_update_begin(const char *logname); +void sptlrpc_conf_log_update_end(const char *logname); +void sptlrpc_conf_client_adapt(struct obd_device *obd); +int sptlrpc_conf_target_get_rules(struct obd_device *obd, + struct sptlrpc_rule_set *rset); +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *flavor); + +/* The maximum length of security payload. 1024 is enough for Kerberos 5, + * and should be enough for other future mechanisms but not sure. + * Only used by pre-allocated request/reply pool. + */ +#define SPTLRPC_MAX_PAYLOAD (1024) + + +struct vfs_cred { + uint32_t vc_uid; + uint32_t vc_gid; +}; + +struct ptlrpc_ctx_ops { + /** + * To determine whether it's suitable to use the \a ctx for \a vcred. + */ + int (*match) (struct ptlrpc_cli_ctx *ctx, + struct vfs_cred *vcred); + + /** + * To bring the \a ctx uptodate. + */ + int (*refresh) (struct ptlrpc_cli_ctx *ctx); + + /** + * Validate the \a ctx. + */ + int (*validate) (struct ptlrpc_cli_ctx *ctx); + + /** + * Force the \a ctx to die. + */ + void (*die) (struct ptlrpc_cli_ctx *ctx, + int grace); + int (*display) (struct ptlrpc_cli_ctx *ctx, + char *buf, int bufsize); + + /** + * Sign the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message. + * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message with signature. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign(). + */ + int (*sign) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Verify the reply message using \a ctx. + * + * \pre req->rq_repdata point to reply message with signature. + * \pre req->rq_repdata_len is the total reply message length. + * \post req->rq_repmsg point to reply message without signature. + * \post req->rq_replen is the reply message length. + * + * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify(). + */ + int (*verify) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Encrypt the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message in clear text. 
+ * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see gss_cli_ctx_seal(). + */ + int (*seal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Decrypt the reply message using \a ctx. + * + * \pre req->rq_repdata point to encrypted reply message. + * \pre req->rq_repdata_len is the total cipher text length. + * \post req->rq_repmsg point to reply message in clear text. + * \post req->rq_replen is the reply message length in clear text. + * + * \see gss_cli_ctx_unseal(). + */ + int (*unseal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Wrap bulk request data. This is called before wrapping RPC + * request message. + * + * \pre bulk buffer is descripted by desc->bd_iov and + * desc->bd_iov_count. note for read it's just buffer, no data + * need to be sent; for write it contains data in clear text. + * \post when necessary, ptlrpc_bulk_sec_desc was properly prepared + * (usually inside of RPC request message). + * - encryption: cipher text bulk buffer is descripted by + * desc->bd_enc_iov and desc->bd_iov_count (currently assume iov + * count remains the same). + * - otherwise: bulk buffer is still desc->bd_iov and + * desc->bd_iov_count. + * + * \return 0: success. + * \return -ev: error code. + * + * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap bulk reply data. This is called after wrapping RPC + * reply message. + * + * \pre bulk buffer is descripted by desc->bd_iov/desc->bd_enc_iov and + * desc->bd_iov_count, according to wrap_bulk(). + * \post final bulk data in clear text is placed in buffer described + * by desc->bd_iov and desc->bd_iov_count. + * \return +ve nob of actual bulk data in clear text. + * \return -ve error code. + * + * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk(). + */ + int (*unwrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +#define PTLRPC_CTX_NEW_BIT (0) /* newly created */ +#define PTLRPC_CTX_UPTODATE_BIT (1) /* uptodate */ +#define PTLRPC_CTX_DEAD_BIT (2) /* mark expired gracefully */ +#define PTLRPC_CTX_ERROR_BIT (3) /* fatal error (refresh, etc.) */ +#define PTLRPC_CTX_CACHED_BIT (8) /* in ctx cache (hash etc.) */ +#define PTLRPC_CTX_ETERNAL_BIT (9) /* always valid */ + +#define PTLRPC_CTX_NEW BIT(PTLRPC_CTX_NEW_BIT) +#define PTLRPC_CTX_UPTODATE BIT(PTLRPC_CTX_UPTODATE_BIT) +#define PTLRPC_CTX_DEAD BIT(PTLRPC_CTX_DEAD_BIT) +#define PTLRPC_CTX_ERROR BIT(PTLRPC_CTX_ERROR_BIT) +#define PTLRPC_CTX_CACHED BIT(PTLRPC_CTX_CACHED_BIT) +#define PTLRPC_CTX_ETERNAL BIT(PTLRPC_CTX_ETERNAL_BIT) + +#define PTLRPC_CTX_STATUS_MASK (PTLRPC_CTX_NEW_BIT | \ + PTLRPC_CTX_UPTODATE | \ + PTLRPC_CTX_DEAD | \ + PTLRPC_CTX_ERROR) + +struct ptlrpc_cli_ctx { + struct hlist_node cc_cache; /* linked into ctx cache */ + atomic_t cc_refcount; + struct ptlrpc_sec *cc_sec; + struct ptlrpc_ctx_ops *cc_ops; + time64_t cc_expire; /* in seconds */ + unsigned int cc_early_expire:1; + unsigned long cc_flags; + struct vfs_cred cc_vcred; + spinlock_t cc_lock; + struct list_head cc_req_list; /* waiting reqs linked here */ + struct list_head cc_gc_chain; /* linked to gc chain */ +}; + +/** + * client side policy operation vector. 
+ */ +struct ptlrpc_sec_cops { + /** + * Given an \a imp, create and initialize a ptlrpc_sec structure. + * \param ctx service context: + * - regular import: \a ctx should be NULL; + * - reverse import: \a ctx is obtained from incoming request. + * \param flavor specify what flavor to use. + * + * When necessary, policy module is responsible for taking reference + * on the import. + * + * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr(). + */ + struct ptlrpc_sec * (*create_sec) (struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flavor); + + /** + * Destructor of ptlrpc_sec. When called, refcount has been dropped + * to 0 and all contexts has been destroyed. + * + * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr(). + */ + void (*destroy_sec) (struct ptlrpc_sec *sec); + + /** + * Notify that this ptlrpc_sec is going to die. Optionally, policy + * module is supposed to set sec->ps_dying and whatever necessary + * actions. + * + * \see plain_kill_sec(), gss_sec_kill(). + */ + void (*kill_sec) (struct ptlrpc_sec *sec); + + /** + * Given \a vcred, lookup and/or create its context. The policy module + * is supposed to maintain its own context cache. + * XXX currently \a create and \a remove_dead is always 1, perhaps + * should be removed completely. + * + * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr(). + */ + struct ptlrpc_cli_ctx * (*lookup_ctx) (struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, + int remove_dead); + + /** + * Called then the reference of \a ctx dropped to 0. The policy module + * is supposed to destroy this context or whatever else according to + * its cache maintainance mechamism. + * + * \param sync if zero, we shouldn't wait for the context being + * destroyed completely. + * + * \see plain_release_ctx(), gss_sec_release_ctx_kr(). + */ + void (*release_ctx) (struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync); + + /** + * Flush the context cache. + * + * \param uid context of which user, -1 means all contexts. + * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected + * contexts should be cleared immediately. + * \param force if zero, only idle contexts will be flushed. + * + * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr(). + */ + int (*flush_ctx_cache) + (struct ptlrpc_sec *sec, + uid_t uid, + int grace, + int force); + + /** + * Called periodically by garbage collector to remove dead contexts + * from cache. + * + * \see gss_sec_gc_ctx_kr(). + */ + void (*gc_ctx) (struct ptlrpc_sec *sec); + + /** + * Given an context \a ctx, install a corresponding reverse service + * context on client side. + * XXX currently it's only used by GSS module, maybe we should remove + * this from general API. + */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); + + /** + * To allocate request buffer for \a req. + * + * \pre req->rq_reqmsg == NULL. + * \pre req->rq_reqbuf == NULL, otherwise it must be pre-allocated, + * we are not supposed to free it. + * \post if success, req->rq_reqmsg point to a buffer with size + * at least \a lustre_msg_size. + * + * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf(). + */ + int (*alloc_reqbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free request buffer for \a req. + * + * \pre req->rq_reqbuf != NULL. + * + * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf(). 
+ */ + void (*free_reqbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To allocate reply buffer for \a req. + * + * \pre req->rq_repbuf == NULL. + * \post if success, req->rq_repbuf point to a buffer with size + * req->rq_repbuf_len, the size should be large enough to receive + * reply which be transformed from \a lustre_msg_size of clear text. + * + * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf(). + */ + int (*alloc_repbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free reply buffer for \a req. + * + * \pre req->rq_repbuf != NULL. + * \post req->rq_repbuf == NULL. + * \post req->rq_repbuf_len == 0. + * + * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf(). + */ + void (*free_repbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To expand the request buffer of \a req, thus the \a segment in + * the request message pointed by req->rq_reqmsg can accommodate + * at least \a newsize of data. + * + * \pre req->rq_reqmsg->lm_buflens[segment] < newsize. + * + * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(), + * gss_enlarge_reqbuf(). + */ + int (*enlarge_reqbuf) + (struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize); + /* + * misc + */ + int (*display) (struct ptlrpc_sec *sec, + struct seq_file *seq); +}; + +/** + * server side policy operation vector. + */ +struct ptlrpc_sec_sops { + /** + * verify an incoming request. + * + * \pre request message is pointed by req->rq_reqbuf, size is + * req->rq_reqdata_len; and the message has been unpacked to + * host byte order. + * + * \retval SECSVC_OK success, req->rq_reqmsg point to request message + * in clear text, size is req->rq_reqlen; req->rq_svc_ctx is set; + * req->rq_sp_from is decoded from request. + * \retval SECSVC_COMPLETE success, the request has been fully + * processed, and reply message has been prepared; req->rq_sp_from is + * decoded from request. + * \retval SECSVC_DROP failed, this request should be dropped. + * + * \see null_accept(), plain_accept(), gss_svc_accept_kr(). + */ + int (*accept) (struct ptlrpc_request *req); + + /** + * Perform security transformation upon reply message. + * + * \pre reply message is pointed by req->rq_reply_state->rs_msg, size + * is req->rq_replen. + * \post req->rs_repdata_len is the final message size. + * \post req->rq_reply_off is set. + * + * \see null_authorize(), plain_authorize(), gss_svc_authorize(). + */ + int (*authorize) (struct ptlrpc_request *req); + + /** + * Invalidate server context \a ctx. + * + * \see gss_svc_invalidate_ctx(). + */ + void (*invalidate_ctx) + (struct ptlrpc_svc_ctx *ctx); + + /** + * Allocate a ptlrpc_reply_state. + * + * \param msgsize size of the reply message in clear text. + * \pre if req->rq_reply_state != NULL, then it's pre-allocated, we + * should simply use it; otherwise we'll responsible for allocating + * a new one. + * \post req->rq_reply_state != NULL; + * \post req->rq_reply_state->rs_msg != NULL; + * + * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs(). + */ + int (*alloc_rs) (struct ptlrpc_request *req, + int msgsize); + + /** + * Free a ptlrpc_reply_state. + */ + void (*free_rs) (struct ptlrpc_reply_state *rs); + + /** + * Release the server context \a ctx. + * + * \see gss_svc_free_ctx(). + */ + void (*free_ctx) (struct ptlrpc_svc_ctx *ctx); + + /** + * Install a reverse context based on the server context \a ctx. + * + * \see gss_svc_install_rctx_kr(). 
+ */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); + + /** + * Prepare buffer for incoming bulk write. + * + * \pre desc->bd_iov and desc->bd_iov_count describes the buffer + * intended to receive the write. + * + * \see gss_svc_prep_bulk(). + */ + int (*prep_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap the bulk write data. + * + * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk(). + */ + int (*unwrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Wrap the bulk read data. + * + * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +struct ptlrpc_sec_policy { + struct module *sp_owner; + char *sp_name; + __u16 sp_policy; /* policy number */ + struct ptlrpc_sec_cops *sp_cops; /* client ops */ + struct ptlrpc_sec_sops *sp_sops; /* server ops */ +}; + +#define PTLRPC_SEC_FL_REVERSE 0x0001 /* reverse sec */ +#define PTLRPC_SEC_FL_ROOTONLY 0x0002 /* treat everyone as root */ +#define PTLRPC_SEC_FL_UDESC 0x0004 /* ship udesc */ +#define PTLRPC_SEC_FL_BULK 0x0008 /* intensive bulk i/o expected */ +#define PTLRPC_SEC_FL_PAG 0x0010 /* PAG mode */ + +/** + * The ptlrpc_sec represents the client side ptlrpc security facilities, + * each obd_import (both regular and reverse import) must associate with + * a ptlrpc_sec. + * + * \see sptlrpc_import_sec_adapt(). + */ +struct ptlrpc_sec { + struct ptlrpc_sec_policy *ps_policy; + atomic_t ps_refcount; + /** statistic only */ + atomic_t ps_nctx; + /** unique identifier */ + int ps_id; + struct sptlrpc_flavor ps_flvr; + enum lustre_sec_part ps_part; + /** after set, no more new context will be created */ + unsigned int ps_dying:1; + /** owning import */ + struct obd_import *ps_import; + spinlock_t ps_lock; + /** mtime of SELinux policy file */ + ktime_t ps_sepol_mtime; + /** next check time of SELinux policy file */ + ktime_t ps_sepol_checknext; + /** + * SELinux policy info + * sepol string format is: + * ::: + */ + char ps_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + + 1]; + + /* + * garbage collection + */ + struct list_head ps_gc_list; + time64_t ps_gc_interval; /* in seconds */ + time64_t ps_gc_next; /* in seconds */ +}; + +static inline int flvr_is_rootonly(__u32 flavor) +{ + return (SPTLRPC_FLVR_POLICY(flavor) == SPTLRPC_POLICY_GSS && + (SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_NULL || + SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_SK)); +} + +static inline int flvr_allows_user_desc(__u32 flavor) +{ + return (SPTLRPC_FLVR_POLICY(flavor) == SPTLRPC_POLICY_GSS && + (SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_NULL || + SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_SK)); +} + +static inline int sec_is_reverse(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE); +} + +static inline int sec_is_rootonly(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_ROOTONLY); +} + + +struct ptlrpc_svc_ctx { + atomic_t sc_refcount; + struct ptlrpc_sec_policy *sc_policy; +}; + +/* + * user identity descriptor + */ +#define LUSTRE_MAX_GROUPS (128) + +struct ptlrpc_user_desc { + __u32 pud_uid; + __u32 pud_gid; + __u32 pud_fsuid; + __u32 pud_fsgid; + __u32 pud_cap; + __u32 pud_ngroups; + __u32 pud_groups[0]; +}; + +/* + * bulk flavors + */ +enum sptlrpc_bulk_hash_alg { + BULK_HASH_ALG_NULL = 0, + BULK_HASH_ALG_ADLER32, + BULK_HASH_ALG_CRC32, + BULK_HASH_ALG_MD5, + BULK_HASH_ALG_SHA1, + BULK_HASH_ALG_SHA256, + 
BULK_HASH_ALG_SHA384, + BULK_HASH_ALG_SHA512, + BULK_HASH_ALG_MAX +}; + +const char * sptlrpc_get_hash_name(__u8 hash_alg); +__u8 sptlrpc_get_hash_alg(const char *algname); + +enum { + BSD_FL_ERR = 1, +}; + +struct ptlrpc_bulk_sec_desc { + __u8 bsd_version; /* 0 */ + __u8 bsd_type; /* SPTLRPC_BULK_XXX */ + __u8 bsd_svc; /* SPTLRPC_BULK_SVC_XXXX */ + __u8 bsd_flags; /* flags */ + __u32 bsd_nob; /* nob of bulk data */ + __u8 bsd_data[0]; /* policy-specific token */ +}; + +extern struct dentry *sptlrpc_debugfs_dir; +extern struct proc_dir_entry *sptlrpc_lprocfs_dir; + +/* + * round size up to next power of 2, for slab allocation. + * @size must be sane (can't overflow after round up) + */ +static inline int size_roundup_power2(int size) +{ + size--; + size |= size >> 1; + size |= size >> 2; + size |= size >> 4; + size |= size >> 8; + size |= size >> 16; + size++; + return size; +} + +/* + * internal support libraries + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize); + +/* + * security policies + */ +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy); +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy); + +__u32 sptlrpc_name2flavor_base(const char *name); +const char *sptlrpc_flavor2name_base(__u32 flvr); +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize); +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize); +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize); + +static inline struct ptlrpc_sec_policy * +sptlrpc_policy_get(struct ptlrpc_sec_policy *policy) +{ + __module_get(policy->sp_owner); + return policy; +} + +static inline void +sptlrpc_policy_put(struct ptlrpc_sec_policy *policy) +{ + module_put(policy->sp_owner); +} + +/* + * client credential + */ +static inline +unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx) +{ + return (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK); +} + +static inline +int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) == PTLRPC_CTX_UPTODATE); +} + +static inline +int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) != 0); +} + +static inline +int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_UPTODATE) != 0); +} + +static inline +int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ERROR) != 0); +} + +static inline +int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & (PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR)) != 0); +} + +static inline +int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ETERNAL) != 0); +} + +/* + * sec get/put + */ +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec); +void sptlrpc_sec_put(struct ptlrpc_sec *sec); + +/* + * internal apis which only used by policy impelentation + */ +int sptlrpc_get_next_secid(void); +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec); + +/* + * exported client context api + */ +struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync); +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx); +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize); + +/* + * exported client context wrap/buffers + */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req); +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request 
*req); +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req); +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req); +int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + const struct req_msg_field *field, + int newsize); +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret); +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req); + +void sptlrpc_request_out_callback(struct ptlrpc_request *req); +int sptlrpc_get_sepol(struct ptlrpc_request *req); + +/* + * exported higher interface of import & request + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flvr); +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp); +void sptlrpc_import_sec_put(struct obd_import *imp); + +int sptlrpc_import_check_ctx(struct obd_import *imp); +void sptlrpc_import_flush_root_ctx(struct obd_import *imp); +void sptlrpc_import_flush_my_ctx(struct obd_import *imp); +void sptlrpc_import_flush_all_ctx(struct obd_import *imp); +int sptlrpc_req_get_ctx(struct ptlrpc_request *req); +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync); +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout); +int sptlrpc_export_update_ctx(struct obd_export *exp); +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req); +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode); + +int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule); + +/* gc */ +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx); + +/* misc */ +const char * sec2target_str(struct ptlrpc_sec *sec); +int sptlrpc_lprocfs_cliobd_attach(struct obd_device *obd); + +/* + * server side + */ +enum secsvc_accept_res { + SECSVC_OK = 0, + SECSVC_COMPLETE, + SECSVC_DROP, +}; + +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req); +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen); +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req); +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs); +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req); + +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req); +void sptlrpc_target_update_exp_flavor(struct obd_device *obd, + struct sptlrpc_rule_set *rset); + +/* + * reverse context + */ +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx); + +/* bulk security api */ +int sptlrpc_enc_pool_add_user(void); +int sptlrpc_enc_pool_del_user(void); +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc); +int sptlrpc_enc_pool_get_pages_array(struct page **pa, unsigned int count); +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc); +void sptlrpc_enc_pool_put_pages_array(struct page **pa, unsigned int count); +int get_free_pages_in_pool(void); +int pool_is_at_full_capacity(void); + +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob); +int 
sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +#ifdef HAVE_SERVER_SUPPORT +int sptlrpc_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +#endif + +/* bulk helpers (internal use only by policies) */ +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen); + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed); + +/* user descriptor helpers */ +static inline int sptlrpc_user_desc_size(int ngroups) +{ + return sizeof(struct ptlrpc_user_desc) + ngroups * sizeof(__u32); +} + +int sptlrpc_current_user_desc_size(void); +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset); +int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed); + +/** @} sptlrpc */ + +#endif /* _LUSTRE_SEC_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_swab.h b/drivers/staging/lustrefsx/lustre/include/lustre_swab.h new file mode 100644 index 0000000000000..2e9d9f5cdbb99 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_swab.h @@ -0,0 +1,139 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2017, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * We assume all nodes are either little-endian or big-endian, and we + * always send messages in the sender's native format. The receiver + * detects the message format by checking the 'magic' field of the message + * (see lustre_msg_swabbed() below). + * + * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines + * are implemented in ptlrpc/lustre_swab.c. These 'swabbers' convert the + * type from "other" endian, in-place in the message buffer. + * + * A swabber takes a single pointer argument. The caller must already have + * verified that the length of the message buffer >= sizeof (type). + * + * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine + * may be defined that swabs just the variable part, after the caller has + * verified that the message buffer is large enough. 
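+ *
+ * Purely as an illustration of the pattern (this snippet is not part of
+ * the original header, and 'struct example' and its fields are made-up
+ * names), a fixed-size swabber usually just flips each field in place
+ * with the __swabXXs() helpers:
+ *
+ *	void lustre_swab_example(struct example *e)
+ *	{
+ *		__swab32s(&e->ex_count);
+ *		__swab64s(&e->ex_offset);
+ *	}
+ *
+ * Nested or variable parts are typically handled by calling further swab
+ * helpers after the fixed part has been converted.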
+ */ + +#ifndef _LUSTRE_SWAB_H_ +#define _LUSTRE_SWAB_H_ + +#include + +#ifdef HAVE_SERVER_SUPPORT +void lustre_swab_orphan_ent(struct lu_orphan_ent *ent); +void lustre_swab_orphan_ent_v2(struct lu_orphan_ent_v2 *ent); +void lustre_swab_orphan_ent_v3(struct lu_orphan_ent_v3 *ent); +void lustre_swab_gl_lquota_desc(struct ldlm_gl_lquota_desc *desc); +void lustre_swab_gl_barrier_desc(struct ldlm_gl_barrier_desc *desc); +void lustre_swab_object_update(struct object_update *ou); +int lustre_swab_object_update_request(struct object_update_request *our, + __u32 len); +void lustre_swab_out_update_header(struct out_update_header *ouh); +void lustre_swab_out_update_buffer(struct out_update_buffer *oub); +void lustre_swab_object_update_result(struct object_update_result *our); +int lustre_swab_object_update_reply(struct object_update_reply *our, __u32 len); +#endif /* HAVE_SERVER_SUPPORT */ +void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); +void lustre_swab_connect(struct obd_connect_data *ocd); +void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +void lustre_swab_hsm_state_set(struct hsm_state_set *hss); +void lustre_swab_obd_statfs(struct obd_statfs *os); +void lustre_swab_obd_ioobj(struct obd_ioobj *ioo); +void lustre_swab_niobuf_remote(struct niobuf_remote *nbr); +void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb); +void lustre_swab_ost_lvb(struct ost_lvb *lvb); +int lustre_swab_obd_quotactl(struct obd_quotactl *q, __u32 len); +void lustre_swab_quota_body(struct quota_body *b); +void lustre_swab_lquota_lvb(struct lquota_lvb *lvb); +void lustre_swab_barrier_lvb(struct barrier_lvb *lvb); +void lustre_swab_generic_32s(__u32 *val); +void lustre_swab_mdt_body(struct mdt_body *b); +void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b); +void lustre_swab_mdt_rec_setattr(struct mdt_rec_setattr *sa); +void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr); +void lustre_swab_lmv_desc(struct lmv_desc *ld); +void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm); +void lustre_swab_lov_desc(struct lov_desc *ld); +void lustre_swab_ldlm_res_id(struct ldlm_res_id *id); +void lustre_swab_ldlm_policy_data(union ldlm_wire_policy_data *d); +void lustre_swab_ldlm_intent(struct ldlm_intent *i); +void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r); +void lustre_swab_ldlm_lock_desc(struct ldlm_lock_desc *l); +void lustre_swab_ldlm_request(struct ldlm_request *rq); +void lustre_swab_ldlm_reply(struct ldlm_reply *r); +void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo); +void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo); +void lustre_swab_mgs_config_body(struct mgs_config_body *body); +void lustre_swab_mgs_config_res(struct mgs_config_res *body); +void lustre_swab_lfsck_request(struct lfsck_request *lr); +void lustre_swab_lfsck_reply(struct lfsck_reply *lr); +void lustre_swab_obdo(struct obdo *o); +void lustre_swab_ost_body(struct ost_body *b); +void lustre_swab_ost_last_id(__u64 *id); +int lustre_swab_fiemap(struct fiemap *fiemap, __u32 len); +void lustre_swab_fiemap_info_key(struct ll_fiemap_info_key *fiemap_info); +void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum); +void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum); +void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum); +void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count); +void lustre_swab_lov_user_md(struct lov_user_md *lum, size_t size); +void lustre_swab_lov_mds_md(struct lov_mds_md *lmm); +void lustre_swab_idx_info(struct idx_info *ii); +void 
lustre_swab_lip_header(struct lu_idxpage *lip); +void lustre_swab_fid2path(struct getinfo_fid2path *gf); +void lustre_swab_layout_intent(struct layout_intent *li); +void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +void lustre_swab_hsm_current_action(struct hsm_current_action *action); +void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk); +void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +void lustre_swab_hsm_user_item(struct hsm_user_item *hui); +void lustre_swab_hsm_request(struct hsm_request *hr); +void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl); +void lustre_swab_close_data(struct close_data *data); +void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync); +void lustre_swab_lmv_user_md(struct lmv_user_md *lum); +void lustre_swab_ladvise(struct lu_ladvise *ladvise); +void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr); + +/* Functions for dumping PTLRPC fields */ +void dump_rniobuf(struct niobuf_remote *rnb); +void dump_ioo(struct obd_ioobj *nb); +void dump_ost_body(struct ost_body *ob); +void dump_rcs(__u32 *rc); + +void lustre_print_user_md(unsigned int level, struct lov_user_md *lum, + const char *msg); + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_update.h b/drivers/staging/lustrefsx/lustre/include/lustre_update.h new file mode 100644 index 0000000000000..78cd3d4bfdd51 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_update.h @@ -0,0 +1,709 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.htm + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. 
+ */ +/* + * lustre/include/lustre_update.h + * + * Author: Di Wang + */ + +#ifndef _LUSTRE_UPDATE_H +#define _LUSTRE_UPDATE_H +#include +#include +#include + +#define OUT_UPDATE_REPLY_SIZE 4096 +#define OUT_BULK_BUFFER_SIZE 4096 + +struct dt_key; +struct dt_rec; +struct object_update_param; +struct llog_update_record; + +static inline size_t update_params_size(const struct update_params *params, + unsigned int param_count) +{ + struct object_update_param *param; + size_t total_size = sizeof(*params); + unsigned int i; + + param = (struct object_update_param *)¶ms->up_params[0]; + for (i = 0; i < param_count; i++) { + size_t size = object_update_param_size(param); + + param = (struct object_update_param *)((char *)param + size); + total_size += size; + } + + return total_size; +} + +static inline struct object_update_param * +update_params_get_param(const struct update_params *params, + unsigned int index, unsigned int param_count) +{ + struct object_update_param *param; + unsigned int i; + + if (index > param_count) + return NULL; + + param = (struct object_update_param *)¶ms->up_params[0]; + for (i = 0; i < index; i++) + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + + return param; +} + +static inline void* +update_params_get_param_buf(const struct update_params *params, __u16 index, + unsigned int param_count, __u16 *size) +{ + struct object_update_param *param; + + param = update_params_get_param(params, (unsigned int)index, + param_count); + if (param == NULL) + return NULL; + + if (size != NULL) + *size = param->oup_len; + + return param->oup_buf; +} + +static inline size_t +update_op_size(unsigned int param_count) +{ + return offsetof(struct update_op, uop_params_off[param_count]); +} + +static inline struct update_op * +update_op_next_op(const struct update_op *uop) +{ + return (struct update_op *)((char *)uop + + update_op_size(uop->uop_param_count)); +} + +static inline size_t update_ops_size(const struct update_ops *ops, + unsigned int update_count) +{ + struct update_op *op; + size_t total_size = sizeof(*ops); + unsigned int i; + + op = (struct update_op *)&ops->uops_op[0]; + for (i = 0; i < update_count; i++, op = update_op_next_op(op)) + total_size += update_op_size(op->uop_param_count); + + return total_size; +} + +static inline struct update_params * +update_records_get_params(const struct update_records *record) +{ + return (struct update_params *)((char *)record + + offsetof(struct update_records, ur_ops) + + update_ops_size(&record->ur_ops, record->ur_update_count)); +} + +static inline struct update_param * +update_param_next_param(const struct update_param *param) +{ + return (struct update_param *)((char *)param + + object_update_param_size( + (struct object_update_param *)param)); +} + +static inline size_t +__update_records_size(size_t raw_size) +{ + return cfs_size_round(offsetof(struct update_records, ur_ops) + + raw_size); +} + +static inline size_t +update_records_size(const struct update_records *record) +{ + size_t op_size = 0; + size_t param_size = 0; + + if (record->ur_update_count > 0) + op_size = update_ops_size(&record->ur_ops, + record->ur_update_count); + if (record->ur_param_count > 0) { + struct update_params *params; + + params = update_records_get_params(record); + param_size = update_params_size(params, record->ur_param_count); + } + + return __update_records_size(op_size + param_size); +} + +static inline size_t +__llog_update_record_size(size_t records_size) +{ + return cfs_size_round(sizeof(struct 
llog_rec_hdr) + records_size + + sizeof(struct llog_rec_tail)); +} + +static inline size_t +llog_update_record_size(const struct llog_update_record *lur) +{ + return __llog_update_record_size( + update_records_size(&lur->lur_update_rec)); +} + +static inline struct update_op * +update_ops_get_op(const struct update_ops *ops, unsigned int index, + unsigned int update_count) +{ + struct update_op *op; + unsigned int i; + + if (index > update_count) + return NULL; + + op = (struct update_op *)&ops->uops_op[0]; + for (i = 0; i < index; i++) + op = update_op_next_op(op); + + return op; +} + +static inline void +*object_update_param_get(const struct object_update *update, size_t index, + size_t *size) +{ + const struct object_update_param *param; + size_t i; + + if (index >= update->ou_params_count) + return ERR_PTR(-EINVAL); + + param = &update->ou_params[0]; + for (i = 0; i < index; i++) + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + + if (size != NULL) + *size = param->oup_len; + + if (param->oup_len == 0) + return ERR_PTR(-ENODATA); + + return (void *)¶m->oup_buf[0]; +} + +static inline unsigned long +object_update_request_size(const struct object_update_request *our) +{ + unsigned long size; + size_t i = 0; + + size = offsetof(struct object_update_request, ourq_updates[0]); + for (i = 0; i < our->ourq_count; i++) { + struct object_update *update; + + update = (struct object_update *)((char *)our + size); + size += object_update_size(update); + } + return size; +} + +static inline void +object_update_result_insert(struct object_update_reply *reply, + void *data, size_t data_len, size_t index, + int rc) +{ + struct object_update_result *update_result; + + update_result = object_update_result_get(reply, index, NULL); + LASSERT(update_result); + + update_result->our_rc = ptlrpc_status_hton(rc); + if (rc >= 0) { + if (data_len > 0 && data) + memcpy(update_result->our_data, data, data_len); + update_result->our_datalen = data_len; + } + + reply->ourp_lens[index] = cfs_size_round(data_len + + sizeof(struct object_update_result)); +} + +static inline int +object_update_result_data_get(const struct object_update_reply *reply, + struct lu_buf *lbuf, size_t index) +{ + struct object_update_result *update_result; + size_t size = 0; + int result; + + LASSERT(lbuf != NULL); + update_result = object_update_result_get(reply, index, &size); + if (update_result == NULL || + size < cfs_size_round(sizeof(struct object_update_reply)) || + update_result->our_datalen > size) + RETURN(-EFAULT); + + result = ptlrpc_status_ntoh(update_result->our_rc); + if (result < 0) + return result; + + lbuf->lb_buf = update_result->our_data; + lbuf->lb_len = update_result->our_datalen; + + return result; +} + +/** + * Attached in the thandle to record the updates for distribute + * distribution. + */ +struct thandle_update_records { + /* All of updates for the cross-MDT operation, vmalloc'd. */ + struct llog_update_record *tur_update_records; + size_t tur_update_records_buf_size; + + /* All of parameters for the cross-MDT operation, vmalloc'd */ + struct update_params *tur_update_params; + unsigned int tur_update_param_count; + size_t tur_update_params_buf_size; +}; + +#define TOP_THANDLE_MAGIC 0x20140917 +struct top_multiple_thandle { + struct dt_device *tmt_master_sub_dt; + atomic_t tmt_refcount; + /* Other sub transactions will be listed here. 
*/ + struct list_head tmt_sub_thandle_list; + spinlock_t tmt_sub_lock; + + struct list_head tmt_commit_list; + /* All of update records will packed here */ + struct thandle_update_records *tmt_update_records; + + wait_queue_head_t tmt_stop_waitq; + __u64 tmt_batchid; + int tmt_result; + __u32 tmt_magic; + size_t tmt_record_size; + __u32 tmt_committed:1; +}; + +/* {top,sub}_thandle are used to manage distributed transactions which + * include updates on several nodes. A top_handle represents the + * whole operation, and sub_thandle represents updates on each node. */ +struct top_thandle { + struct thandle tt_super; + /* The master sub transaction. */ + struct thandle *tt_master_sub_thandle; + + struct top_multiple_thandle *tt_multiple_thandle; +}; + +struct sub_thandle_cookie { + struct llog_cookie stc_cookie; + struct list_head stc_list; +}; + +/* Sub thandle is used to track multiple sub thandles under one parent + * thandle */ +struct sub_thandle { + struct thandle *st_sub_th; + struct dt_device *st_dt; + struct list_head st_cookie_list; + struct dt_txn_commit_cb st_commit_dcb; + struct dt_txn_commit_cb st_stop_dcb; + int st_result; + + /* linked to top_thandle */ + struct list_head st_sub_list; + + /* If this sub thandle is committed */ + bool st_committed:1, + st_stopped:1, + st_started:1; +}; + +struct tx_arg; +typedef int (*tx_exec_func_t)(const struct lu_env *env, struct thandle *th, + struct tx_arg *ta); + +/* Structure for holding one update execution */ +struct tx_arg { + tx_exec_func_t exec_fn; + tx_exec_func_t undo_fn; + struct dt_object *object; + const char *file; + struct object_update_reply *reply; + int line; + int index; + union { + struct { + struct dt_insert_rec rec; + const struct dt_key *key; + } insert; + struct { + } ref; + struct { + struct lu_attr attr; + } attr_set; + struct { + struct lu_buf buf; + const char *name; + int flags; + __u32 csum; + } xattr_set; + struct { + struct lu_attr attr; + struct dt_allocation_hint hint; + struct dt_object_format dof; + struct lu_fid fid; + } create; + struct { + struct lu_buf buf; + loff_t pos; + } write; + struct { + struct ost_body *body; + } destroy; + } u; +}; + +/* Structure for holding all update executations of one transaction */ +struct thandle_exec_args { + struct thandle *ta_handle; + int ta_argno; /* used args */ + int ta_alloc_args; /* allocated args count */ + struct tx_arg **ta_args; +}; + +/* target/out_lib.c */ +int out_update_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, enum update_type op, + const struct lu_fid *fid, unsigned int params_count, + __u16 *param_sizes, const void **param_bufs, + __u32 reply_size); +int out_create_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof); +int out_destroy_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid); +int out_index_delete_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct dt_key *key); +int out_index_insert_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct dt_rec *rec, + const struct dt_key *key); +int out_xattr_set_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct lu_buf 
*buf, + const char *name, __u32 flag); +int out_xattr_del_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const char *name); +int out_attr_set_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct lu_attr *attr); +int out_ref_add_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid); +int out_ref_del_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid); +int out_write_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct lu_buf *buf, + __u64 pos); +int out_attr_get_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid); +int out_index_lookup_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, struct dt_rec *rec, + const struct dt_key *key); +int out_xattr_get_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const char *name, + const int bufsize); +int out_xattr_list_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const int bufsize); +int out_read_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_length, const struct lu_fid *fid, + size_t size, loff_t pos); + +const char *update_op_str(__u16 opcode); + +/* target/update_trans.c */ +struct thandle *thandle_get_sub_by_dt(const struct lu_env *env, + struct thandle *th, + struct dt_device *sub_dt); + +static inline struct thandle * +thandle_get_sub(const struct lu_env *env, struct thandle *th, + const struct dt_object *sub_obj) +{ + return thandle_get_sub_by_dt(env, th, lu2dt_dev(sub_obj->do_lu.lo_dev)); +} + +struct thandle * +top_trans_create(const struct lu_env *env, struct dt_device *master_dev); +int top_trans_start(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th); +int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th); +void top_multiple_thandle_destroy(struct top_multiple_thandle *tmt); + +static inline void top_multiple_thandle_get(struct top_multiple_thandle *tmt) +{ + atomic_inc(&tmt->tmt_refcount); +} + +static inline void top_multiple_thandle_put(struct top_multiple_thandle *tmt) +{ + if (atomic_dec_and_test(&tmt->tmt_refcount)) + top_multiple_thandle_destroy(tmt); +} + +struct sub_thandle *lookup_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev); +int sub_thandle_trans_create(const struct lu_env *env, + struct top_thandle *top_th, + struct sub_thandle *st); + +/* update_records.c */ +size_t update_records_create_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof); +size_t update_records_attr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr); +size_t update_records_ref_add_size(const struct lu_env *env, + const struct lu_fid *fid); +size_t update_records_ref_del_size(const struct lu_env *env, + const struct lu_fid *fid); +size_t update_records_destroy_size(const struct lu_env *env, + const struct lu_fid *fid); +size_t update_records_index_insert_size(const struct 
lu_env *env, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key); +size_t update_records_index_delete_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct dt_key *key); +size_t update_records_xattr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_buf *buf, + const char *name, + __u32 flag); +size_t update_records_xattr_del_size(const struct lu_env *env, + const struct lu_fid *fid, + const char *name); +size_t update_records_write_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_buf *buf, + __u64 pos); +size_t update_records_punch_size(const struct lu_env *env, + const struct lu_fid *fid, + __u64 start, __u64 end); + +int update_records_create_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof); +int update_records_attr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr); +int update_records_ref_add_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); +int update_records_ref_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); +int update_records_destroy_pack(const struct lu_env *env, + struct update_ops *ops, unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); +int update_records_index_insert_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key); +int update_records_index_delete_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_key *key); +int update_records_xattr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_buf *buf, const char *name, + __u32 flag); +int update_records_xattr_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const char *name); +int update_records_write_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + 
const struct lu_buf *buf, + __u64 pos); +int update_records_punch_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + __u64 start, __u64 end); +int update_records_noop_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); + +int tur_update_records_extend(struct thandle_update_records *tur, + size_t new_size); +int tur_update_params_extend(struct thandle_update_records *tur, + size_t new_size); +int tur_update_extend(struct thandle_update_records *tur, + size_t new_op_size, size_t new_param_size); + +#define update_record_pack(name, th, ...) \ +({ \ + struct top_thandle *top_th; \ + struct top_multiple_thandle *tmt; \ + struct thandle_update_records *tur; \ + struct llog_update_record *lur; \ + size_t avail_param_size; \ + size_t avail_op_size; \ + int ret; \ + \ + while (1) { \ + top_th = container_of(th, struct top_thandle, tt_super);\ + tmt = top_th->tt_multiple_thandle; \ + tur = tmt->tmt_update_records; \ + lur = tur->tur_update_records; \ + avail_param_size = tur->tur_update_params_buf_size - \ + update_params_size(tur->tur_update_params, \ + tur->tur_update_param_count); \ + avail_op_size = tur->tur_update_records_buf_size - \ + llog_update_record_size(lur); \ + ret = update_records_##name##_pack(env, \ + &lur->lur_update_rec.ur_ops, \ + &lur->lur_update_rec.ur_update_count, \ + &avail_op_size, \ + tur->tur_update_params, \ + &tur->tur_update_param_count, \ + &avail_param_size, __VA_ARGS__); \ + if (ret == -E2BIG) { \ + ret = tur_update_extend(tur, avail_op_size, \ + avail_param_size); \ + if (ret != 0) \ + break; \ + continue; \ + } else { \ + break; \ + } \ + } \ + ret; \ +}) + +#define update_record_size(env, name, th, ...) \ +({ \ + struct top_thandle *top_th; \ + struct top_multiple_thandle *tmt; \ + \ + top_th = container_of(th, struct top_thandle, tt_super); \ + \ + LASSERT(top_th->tt_multiple_thandle != NULL); \ + tmt = top_th->tt_multiple_thandle; \ + tmt->tmt_record_size += \ + update_records_##name##_size(env, __VA_ARGS__); \ +}) +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lvfs.h b/drivers/staging/lustrefsx/lustre/include/lvfs.h new file mode 100644 index 0000000000000..2ca2f19bab7b3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lvfs.h @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lvfs.h + * + * lustre VFS/process permission interface + */ + +#ifndef __LVFS_H__ +#define __LVFS_H__ + +#include +#include +#include +#include +#include +#include + +#define OBD_RUN_CTXT_MAGIC 0xC0FFEEAA +#define OBD_CTXT_DEBUG /* development-only debugging */ + +struct dt_device; + +struct lvfs_run_ctxt { + struct vfsmount *pwdmnt; + struct dentry *pwd; + int umask; + struct dt_device *dt; +#ifdef OBD_CTXT_DEBUG + unsigned int magic; +#endif +}; + +static inline void OBD_SET_CTXT_MAGIC(struct lvfs_run_ctxt *ctxt) +{ +#ifdef OBD_CTXT_DEBUG + ctxt->magic = OBD_RUN_CTXT_MAGIC; +#endif +} + +/* ptlrpc_sec_ctx.c */ +void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx); +void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx); + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/md_object.h b/drivers/staging/lustrefsx/lustre/include/md_object.h new file mode 100644 index 0000000000000..daa62d86f9f29 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/md_object.h @@ -0,0 +1,733 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/md_object.h + * + * Extention of lu_object.h for metadata objects + */ + +#ifndef _LUSTRE_MD_OBJECT_H +#define _LUSTRE_MD_OBJECT_H + +#ifndef HAVE_SERVER_SUPPORT +# error "client code should not depend on md_object.h" +#endif /* !HAVE_SERVER_SUPPORT */ + +/** \defgroup md md + * Sub-class of lu_object with methods common for "meta-data" objects in MDT + * stack. + * + * Meta-data objects implement namespace operations: you can link, unlink + * them, and treat them as directories. + * + * Examples: mdt, cmm, and mdt are implementations of md interface. + * @{ + */ + + +/* + * super-class definitions. 
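+ *
+ * An md_object embeds an lu_object and carries two operation vectors:
+ * mo_ops for per-object methods and mo_dir_ops for directory methods.
+ * Server code is expected to go through the mo_*() and mdo_*() inline
+ * wrappers defined later in this header rather than dereferencing the
+ * vectors directly.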
+ */ +#include + +struct md_device; +struct md_device_operations; +struct md_object; +struct obd_export; + +/** metadata attributes */ +enum ma_valid { + MA_INODE = BIT(0), + MA_LOV = BIT(1), + MA_FLAGS = BIT(2), + MA_LMV = BIT(3), + MA_ACL_DEF = BIT(4), + MA_LOV_DEF = BIT(5), + MA_HSM = BIT(6), + MA_PFID = BIT(7), + MA_LMV_DEF = BIT(8), + MA_SOM = BIT(9), + MA_FORCE_LOG = BIT(10), /* forced close logged in mdt_mfd_close */ +}; + +typedef enum { + MDL_MINMODE = 0, + MDL_EX = 1, + MDL_PW = 2, + MDL_PR = 4, + MDL_CW = 8, + MDL_CR = 16, + MDL_NL = 32, + MDL_GROUP = 64, + MDL_MAXMODE +} mdl_mode_t; + +typedef enum { + MDT_NUL_LOCK = 0, + MDT_REG_LOCK = BIT(0), + MDT_PDO_LOCK = BIT(1), +} mdl_type_t; + +/* lfs rgetfacl permission check */ +#define MAY_RGETFACL BIT(14) + +/* memory structure for hsm attributes + * for fields description see the on disk structure hsm_attrs + * which is defined in lustre_idl.h + */ +struct md_hsm { + __u32 mh_compat; + __u32 mh_flags; + __u64 mh_arch_id; + __u64 mh_arch_ver; +}; + + +/* memory structure for SOM attributes + * for fields description see the on disk structure som_attrs + * which is defined in lustre_idl.h + */ +struct md_som { + __u16 ms_valid; + __u64 ms_size; + __u64 ms_blocks; +}; + +struct md_attr { + __u64 ma_valid; + __u64 ma_need; + __u64 ma_attr_flags; + struct lu_attr ma_attr; + struct lu_fid ma_pfid; + struct md_hsm ma_hsm; + struct md_som ma_som; + struct lov_mds_md *ma_lmm; + union lmv_mds_md *ma_lmv; + struct lmv_user_md *ma_default_lmv; + void *ma_acl; + int ma_lmm_size; + int ma_lmv_size; + int ma_default_lmv_size; + int ma_acl_size; + int ma_enable_chprojid_gid; +}; + +/** Additional parameters for create */ +struct md_op_spec { + union { + /** symlink target */ + struct lu_name sp_symname; + /** eadata for regular files */ + struct md_spec_reg { + void *eadata; + int eadatalen; + } sp_ea; + } u; + + /** Open flags from client: such as MDS_OPEN_CREAT, and others. */ + __u64 sp_cr_flags; + + /* File security context for creates. */ + const char *sp_cr_file_secctx_name; /* (security) xattr name */ + void *sp_cr_file_secctx; /* xattr value */ + size_t sp_cr_file_secctx_size; /* xattr value size */ + + /* File encryption context for creates. */ + void *sp_cr_file_encctx; /* enc ctx value */ + size_t sp_cr_file_encctx_size; /* enc ctx size */ + + /* Archive ID used for auto PCC attach when create newly files. */ + __u32 sp_archive_id; + + /** don't create lov objects or llog cookie - this replay */ + unsigned int no_create:1, + sp_cr_lookup:1, /* do lookup sanity check or not. */ + sp_rm_entry:1, /* only remove name entry */ + sp_permitted:1, /* do not check permission */ + sp_migrate_close:1, /* close the file during migrate */ + sp_migrate_nsonly:1; /* migrate dirent only */ + + /** to create directory */ + const struct dt_index_features *sp_feat; +}; + +enum md_layout_opc { + MD_LAYOUT_NOP = 0, + MD_LAYOUT_WRITE, /* FLR: write the file */ + MD_LAYOUT_RESYNC, /* FLR: resync starts */ + MD_LAYOUT_RESYNC_DONE, /* FLR: resync done */ + MD_LAYOUT_ATTACH, /* attach stripes */ + MD_LAYOUT_DETACH, /* detach stripes */ + MD_LAYOUT_SHRINK, /* shrink striped directory (destroy stripes) */ + MD_LAYOUT_SPLIT, /* split directory (allocate new stripes) */ + MD_LAYOUT_MAX, +}; + +/** + * Parameters for layout change API. 
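+ *
+ * Purely for illustration (this snippet is not part of the original
+ * header): a caller requesting an FLR resync would typically fill only
+ * the opcode and hand the descriptor to the object's layout-change
+ * method, roughly like
+ *
+ *	struct md_layout_change mlc = {
+ *		.mlc_opc = MD_LAYOUT_RESYNC,
+ *	};
+ *	rc = mo_layout_change(env, obj, &mlc);
+ *
+ * with the remaining fields filled in as the specific operation requires.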
+ */ +struct md_layout_change { + enum md_layout_opc mlc_opc; + struct lu_buf mlc_buf; + union { + struct { + __u16 mlc_mirror_id; + struct layout_intent *mlc_intent; + struct lustre_som_attrs mlc_som; + size_t mlc_resync_count; + __u32 *mlc_resync_ids; + }; /* file */ + struct { + /* parent obj in plain dir split */ + struct md_object *mlc_parent; + /* target obj in plain dir split */ + struct md_object *mlc_target; + /* target attr in plain dir split */ + struct lu_attr *mlc_attr; + /* target name in plain dir split */ + const struct lu_name *mlc_name; + /* dir split spec */ + struct md_op_spec *mlc_spec; + }; /* dir */ + }; +}; + +union ldlm_policy_data; +/** + * Operations implemented for each md object (both directory and leaf). + */ +struct md_object_operations { + int (*moo_permission)(const struct lu_env *env, + struct md_object *pobj, struct md_object *cobj, + struct md_attr *attr, unsigned int may_mask); + + int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj, + struct md_attr *attr); + + int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj, + const struct md_attr *attr); + + int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf, const char *name); + + int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); + + int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj, + const struct lu_buf *buf, const char *name, + int fl); + + int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj, + const char *name); + + /** This method is used to swap the layouts between 2 objects */ + int (*moo_swap_layouts)(const struct lu_env *env, + struct md_object *obj1, struct md_object *obj2, + __u64 flags); + + /** \retval number of bytes actually read upon success */ + int (*moo_readpage)(const struct lu_env *env, struct md_object *obj, + const struct lu_rdpg *rdpg); + + int (*moo_readlink)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); + + int (*moo_changelog)(const struct lu_env *env, + enum changelog_rec_type type, + enum changelog_rec_flags clf_flags, + struct md_device *m, const struct lu_fid *fid); + + int (*moo_open)(const struct lu_env *env, struct md_object *obj, + u64 open_flags, struct md_op_spec*); + + int (*moo_close)(const struct lu_env *env, struct md_object *obj, + struct md_attr *ma, u64 open_flags); + + int (*moo_object_sync)(const struct lu_env *, struct md_object *); + + int (*moo_object_lock)(const struct lu_env *env, struct md_object *obj, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy); + int (*moo_object_unlock)(const struct lu_env *env, + struct md_object *obj, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy); + + int (*moo_invalidate)(const struct lu_env *env, struct md_object *obj); + /** + * Trying to write to un-instantiated layout component. + * + * The caller should have held layout lock. + * + * This API can be extended to support every other layout changing + * operations, such as component {add,del,change}, layout swap, + * layout merge, etc. One of the benefits by doing this is that the MDT + * no longer needs to understand layout. + * + * However, layout creation, removal, and fetch should still use + * xattr_{get,set}() because they don't interpret layout on the + * MDT layer. 
+ * + * \param[in] env execution environment + * \param[in] obj MD object + * \param[in] layout data structure to describe the changes to + * the MD object's layout + * + * \retval 0 success + * \retval -ne error code + */ + int (*moo_layout_change)(const struct lu_env *env, + struct md_object *obj, + struct md_layout_change *layout); +}; + +/** + * Operations implemented for each directory object. + */ +struct md_dir_operations { + int (*mdo_is_subdir)(const struct lu_env *env, struct md_object *obj, + const struct lu_fid *fid); + + int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj, + const struct lu_name *lname, struct lu_fid *fid, + struct md_op_spec *spec); + + mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env, + struct md_object *obj, + mdl_mode_t mode); + + int (*mdo_create)(const struct lu_env *env, struct md_object *pobj, + const struct lu_name *lname, struct md_object *child, + struct md_op_spec *spec, + struct md_attr *ma); + + /** This method is used for creating data object for this meta object*/ + int (*mdo_create_data)(const struct lu_env *env, struct md_object *p, + struct md_object *o, + const struct md_op_spec *spec, + struct md_attr *ma); + + int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj, + struct md_object *tpobj, const struct lu_fid *lf, + const struct lu_name *lsname, struct md_object *tobj, + const struct lu_name *ltname, struct md_attr *ma); + + int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj, + struct md_object *src_obj, const struct lu_name *lname, + struct md_attr *ma); + + int (*mdo_unlink)(const struct lu_env *env, struct md_object *pobj, + struct md_object *cobj, const struct lu_name *lname, + struct md_attr *ma, int no_name); + + int (*mdo_migrate)(const struct lu_env *env, struct md_object *pobj, + struct md_object *sobj, const struct lu_name *lname, + struct md_object *tobj, struct md_op_spec *spec, + struct md_attr *ma); +}; + +struct md_device_operations { + /** meta-data device related handlers. 
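+ *
+ * (For example, the mo_changelog() helper further down resolves the root
+ * object via ->mdo_root_get() and md_object_find_slice() before invoking
+ * the root object's ->moo_changelog() method.)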
*/ + int (*mdo_root_get)(const struct lu_env *env, struct md_device *m, + struct lu_fid *f); + + const struct dt_device_param *(*mdo_dtconf_get)(const struct lu_env *e, + struct md_device *m); + + int (*mdo_statfs)(const struct lu_env *env, struct md_device *m, + struct obd_statfs *sfs); + + int (*mdo_llog_ctxt_get)(const struct lu_env *env, + struct md_device *m, int idx, void **h); + + int (*mdo_iocontrol)(const struct lu_env *env, struct md_device *m, + unsigned int cmd, int len, void *data); +}; + +struct md_device { + struct lu_device md_lu_dev; + const struct md_device_operations *md_ops; +}; + +struct md_object { + struct lu_object mo_lu; + const struct md_object_operations *mo_ops; + const struct md_dir_operations *mo_dir_ops; +}; + +static inline struct md_device *lu2md_dev(const struct lu_device *d) +{ + LASSERT(IS_ERR(d) || lu_device_is_md(d)); + return container_of_safe(d, struct md_device, md_lu_dev); +} + +static inline struct lu_device *md2lu_dev(struct md_device *d) +{ + return &d->md_lu_dev; +} + +static inline struct md_object *lu2md(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->lo_dev)); + return container_of_safe(o, struct md_object, mo_lu); +} + +static inline int md_device_init(struct md_device *md, struct lu_device_type *t) +{ + return lu_device_init(&md->md_lu_dev, t); +} + +static inline void md_device_fini(struct md_device *md) +{ + lu_device_fini(&md->md_lu_dev); +} + +static inline struct md_object *md_object_find_slice(const struct lu_env *env, + struct md_device *md, + const struct lu_fid *f) +{ + return lu2md(lu_object_find_slice(env, md2lu_dev(md), f, NULL)); +} + + +/** md operations */ +static inline int mo_permission(const struct lu_env *env, struct md_object *p, + struct md_object *c, struct md_attr *at, + unsigned int may_mask) +{ + LASSERT(c->mo_ops->moo_permission); + return c->mo_ops->moo_permission(env, p, c, at, may_mask); +} + +static inline int mo_attr_get(const struct lu_env *env, struct md_object *m, + struct md_attr *at) +{ + LASSERT(m->mo_ops->moo_attr_get); + return m->mo_ops->moo_attr_get(env, m, at); +} + +static inline int mo_readlink(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf) +{ + LASSERT(m->mo_ops->moo_readlink); + return m->mo_ops->moo_readlink(env, m, buf); +} + +static inline int mo_changelog(const struct lu_env *env, + enum changelog_rec_type type, + enum changelog_rec_flags clf_flags, + struct md_device *m, const struct lu_fid *fid) +{ + struct lu_fid rootfid; + struct md_object *root; + int rc; + + rc = m->md_ops->mdo_root_get(env, m, &rootfid); + if (rc) + return rc; + + root = md_object_find_slice(env, m, &rootfid); + if (IS_ERR(root)) + RETURN(PTR_ERR(root)); + + LASSERT(root->mo_ops->moo_changelog); + rc = root->mo_ops->moo_changelog(env, type, clf_flags, m, fid); + + lu_object_put(env, &root->mo_lu); + + return rc; +} + +static inline int mo_attr_set(const struct lu_env *env, + struct md_object *m, + const struct md_attr *at) +{ + LASSERT(m->mo_ops->moo_attr_set); + return m->mo_ops->moo_attr_set(env, m, at); +} + +static inline int mo_xattr_get(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf, + const char *name) +{ + LASSERT(m->mo_ops->moo_xattr_get); + return m->mo_ops->moo_xattr_get(env, m, buf, name); +} + +static inline int mo_xattr_del(const struct lu_env *env, + struct md_object *m, + const char *name) +{ + LASSERT(m->mo_ops->moo_xattr_del); + return m->mo_ops->moo_xattr_del(env, m, name); +} + +static inline int mo_xattr_set(const 
struct lu_env *env, + struct md_object *m, + const struct lu_buf *buf, + const char *name, + int flags) +{ + LASSERT(m->mo_ops->moo_xattr_set); + return m->mo_ops->moo_xattr_set(env, m, buf, name, flags); +} + +static inline int mo_xattr_list(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf) +{ + LASSERT(m->mo_ops->moo_xattr_list); + return m->mo_ops->moo_xattr_list(env, m, buf); +} + +static inline int mo_invalidate(const struct lu_env *env, struct md_object *m) +{ + LASSERT(m->mo_ops->moo_invalidate); + return m->mo_ops->moo_invalidate(env, m); +} + +static inline int mo_layout_change(const struct lu_env *env, + struct md_object *m, + struct md_layout_change *layout) +{ + /* need instantiate objects which in the access range */ + LASSERT(m->mo_ops->moo_layout_change); + return m->mo_ops->moo_layout_change(env, m, layout); +} + +static inline int mo_swap_layouts(const struct lu_env *env, + struct md_object *o1, + struct md_object *o2, __u64 flags) +{ + LASSERT(o1->mo_ops->moo_swap_layouts); + LASSERT(o2->mo_ops->moo_swap_layouts); + if (o1->mo_ops->moo_swap_layouts != o2->mo_ops->moo_swap_layouts) + return -EPERM; + return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags); +} + +static inline int mo_open(const struct lu_env *env, struct md_object *m, + u64 open_flags, struct md_op_spec *spec) +{ + LASSERT(m->mo_ops->moo_open); + return m->mo_ops->moo_open(env, m, open_flags, spec); +} + +static inline int mo_close(const struct lu_env *env, struct md_object *m, + struct md_attr *ma, u64 open_flags) +{ + LASSERT(m->mo_ops->moo_close); + return m->mo_ops->moo_close(env, m, ma, open_flags); +} + +static inline int mo_readpage(const struct lu_env *env, + struct md_object *m, + const struct lu_rdpg *rdpg) +{ + LASSERT(m->mo_ops->moo_readpage); + return m->mo_ops->moo_readpage(env, m, rdpg); +} + +static inline int mo_object_sync(const struct lu_env *env, struct md_object *m) +{ + LASSERT(m->mo_ops->moo_object_sync); + return m->mo_ops->moo_object_sync(env, m); +} + +static inline int mo_object_lock(const struct lu_env *env, + struct md_object *m, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(m->mo_ops->moo_object_lock); + return m->mo_ops->moo_object_lock(env, m, lh, einfo, policy); +} + +static inline int mo_object_unlock(const struct lu_env *env, + struct md_object *m, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(m->mo_ops->moo_object_unlock); + return m->mo_ops->moo_object_unlock(env, m, einfo, policy); +} + +static inline int mdo_lookup(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lname, + struct lu_fid *f, + struct md_op_spec *spec) +{ + LASSERT(p->mo_dir_ops->mdo_lookup); + return p->mo_dir_ops->mdo_lookup(env, p, lname, f, spec); +} + +static inline mdl_mode_t mdo_lock_mode(const struct lu_env *env, + struct md_object *mo, + mdl_mode_t lm) +{ + if (mo->mo_dir_ops->mdo_lock_mode == NULL) + return MDL_MINMODE; + return mo->mo_dir_ops->mdo_lock_mode(env, mo, lm); +} + +static inline int mdo_create(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lchild_name, + struct md_object *c, + struct md_op_spec *spc, + struct md_attr *at) +{ + LASSERT(p->mo_dir_ops->mdo_create); + return p->mo_dir_ops->mdo_create(env, p, lchild_name, c, spc, at); +} + +static inline int mdo_create_data(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + const struct md_op_spec *spec, + struct md_attr *ma) +{ + 
LASSERT(c->mo_dir_ops->mdo_create_data); + return c->mo_dir_ops->mdo_create_data(env, p, c, spec, ma); +} + +static inline int mdo_rename(const struct lu_env *env, + struct md_object *sp, + struct md_object *tp, + const struct lu_fid *lf, + const struct lu_name *lsname, + struct md_object *t, + const struct lu_name *ltname, + struct md_attr *ma) +{ + LASSERT(tp->mo_dir_ops->mdo_rename); + return tp->mo_dir_ops->mdo_rename(env, sp, tp, lf, lsname, t, ltname, + ma); +} + +static inline int mdo_migrate(const struct lu_env *env, + struct md_object *pobj, + struct md_object *sobj, + const struct lu_name *lname, + struct md_object *tobj, + struct md_op_spec *spec, + struct md_attr *ma) +{ + LASSERT(pobj->mo_dir_ops->mdo_migrate); + return pobj->mo_dir_ops->mdo_migrate(env, pobj, sobj, lname, tobj, spec, + ma); +} + +static inline int mdo_is_subdir(const struct lu_env *env, + struct md_object *mo, + const struct lu_fid *fid) +{ + LASSERT(mo->mo_dir_ops->mdo_is_subdir); + return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid); +} + +static inline int mdo_link(const struct lu_env *env, + struct md_object *p, + struct md_object *s, + const struct lu_name *lname, + struct md_attr *ma) +{ + LASSERT(s->mo_dir_ops->mdo_link); + return s->mo_dir_ops->mdo_link(env, p, s, lname, ma); +} + +static inline int mdo_unlink(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + const struct lu_name *lname, + struct md_attr *ma, int no_name) +{ + LASSERT(p->mo_dir_ops->mdo_unlink); + return p->mo_dir_ops->mdo_unlink(env, p, c, lname, ma, no_name); +} + +static inline int mdo_statfs(const struct lu_env *env, + struct md_device *m, + struct obd_statfs *sfs) +{ + LASSERT(m->md_ops->mdo_statfs); + return m->md_ops->mdo_statfs(env, m, sfs); +} + +struct dt_device; + +void lustre_som_swab(struct lustre_som_attrs *attrs); +int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh); +void lustre_hsm2buf(void *buf, const struct md_hsm *mh); + +enum { + UCRED_INVALID = -1, + UCRED_INIT = 0, + UCRED_OLD = 1, + UCRED_NEW = 2, +}; + +struct lu_ucred { + __u32 uc_valid; + __u32 uc_o_uid; + __u32 uc_o_gid; + __u32 uc_o_fsuid; + __u32 uc_o_fsgid; + __u32 uc_uid; + __u32 uc_gid; + __u32 uc_fsuid; + __u32 uc_fsgid; + __u32 uc_suppgids[2]; + kernel_cap_t uc_cap; + __u32 uc_umask; + struct group_info *uc_ginfo; + struct md_identity *uc_identity; + char uc_jobid[LUSTRE_JOBID_SIZE]; + lnet_nid_t uc_nid; + bool uc_enable_audit; +}; + +struct lu_ucred *lu_ucred(const struct lu_env *env); + +struct lu_ucred *lu_ucred_check(const struct lu_env *env); + +struct lu_ucred *lu_ucred_assert(const struct lu_env *env); + +int lu_ucred_global_init(void); + +void lu_ucred_global_fini(void); + +/** @} md */ +#endif /* _LINUX_MD_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd.h b/drivers/staging/lustrefsx/lustre/include/obd.h new file mode 100644 index 0000000000000..4a42feb690f35 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd.h @@ -0,0 +1,1376 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __OBD_H +#define __OBD_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#ifdef HAVE_SERVER_SUPPORT +# include +# include +# include +#endif +#include +#include +#include +#include +#include +#include +#include + +#define MAX_OBD_DEVICES 8192 + +struct osc_async_rc { + int ar_rc; + int ar_force_sync; + __u64 ar_min_xid; +}; + +struct lov_oinfo { /* per-stripe data structure */ + struct ost_id loi_oi; /* object ID/Sequence on the target OST */ + int loi_ost_idx; /* OST stripe index in lov_tgt_desc->tgts */ + int loi_ost_gen; /* generation of this loi_ost_idx */ + + unsigned long loi_kms_valid:1; + __u64 loi_kms; /* known minimum size */ + struct ost_lvb loi_lvb; + struct osc_async_rc loi_ar; +}; + +void lov_fix_ea_for_replay(void *lovea); + +static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms) +{ + oinfo->loi_kms = kms; + oinfo->loi_kms_valid = 1; +} + +struct lov_stripe_md; +struct obd_info; + +typedef int (*obd_enqueue_update_f)(void *cookie, int rc); + +/* obd info for a particular level (lov, osc). */ +struct obd_info { + /* OBD_STATFS_* flags */ + __u64 oi_flags; + struct obd_device *oi_obd; + struct lu_tgt_desc *oi_tgt; + /* statfs data specific for every OSC, if needed at all. */ + struct obd_statfs *oi_osfs; + /* An update callback which is called to update some data on upper + * level. E.g. it is used for update lsm->lsm_oinfo at every received + * request in osc level for enqueue requests. It is also possible to + * update some caller data from LOV layer if needed. */ + obd_enqueue_update_f oi_cb_up; +}; + +struct obd_type { + const struct obd_ops *typ_dt_ops; + const struct md_ops *typ_md_ops; + struct proc_dir_entry *typ_procroot; + struct dentry *typ_debugfs_entry; +#ifdef HAVE_SERVER_SUPPORT + bool typ_sym_filter; +#endif + atomic_t typ_refcnt; + struct lu_device_type *typ_lu; + struct kobject typ_kobj; +}; +#define typ_name typ_kobj.name +#define OBD_LU_TYPE_SETUP ((void *)0x01UL) + +struct brw_page { + u64 off; + struct page *pg; + u32 count; + u32 flag; + /* used for encryption: difference with offset in clear text page */ + u16 bp_off_diff; + /* used for encryption: difference with count in clear text page */ + u16 bp_count_diff; + u32 bp_padding; +}; + +struct timeout_item { + enum timeout_event ti_event; + time64_t ti_timeout; + timeout_cb_t ti_cb; + void *ti_cb_data; + struct list_head ti_obd_list; + struct list_head ti_chain; +}; + +#define OBD_MAX_RIF_DEFAULT 8 +#define OBD_MAX_RIF_MAX 512 +#define OSC_MAX_RIF_MAX 256 +#define OSC_MAX_DIRTY_DEFAULT 64 +#define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ +#define OSC_DEFAULT_RESENDS 10 + +/* possible values for lut_sync_lock_cancel */ +enum tgt_sync_lock_cancel { + SYNC_LOCK_CANCEL_NEVER = 0, + SYNC_LOCK_CANCEL_BLOCKING = 1, + SYNC_LOCK_CANCEL_ALWAYS = 2, +}; + +/* + * Limit reply buffer size for striping data to one x86_64 page. 
This + * value is chosen to fit the striping data for common use cases while + * staying well below the limit at which the buffer must be backed by + * vmalloc(). Excessive use of vmalloc() may cause spinlock contention + * on the MDS. + */ +#define OBD_MAX_DEFAULT_EA_SIZE 4096 + +/* + * Lustre can handle larger xattrs internally, but we must respect the Linux + * VFS limitation or tools like tar cannot interact with Lustre volumes + * correctly. + */ +#define OBD_MAX_EA_SIZE XATTR_SIZE_MAX + + +enum obd_cl_sem_lock_class { + OBD_CLI_SEM_NORMAL, + OBD_CLI_SEM_MGC, + OBD_CLI_SEM_MDCOSC, +}; + +struct mdc_rpc_lock; +struct obd_import; +struct client_obd { + struct rw_semaphore cl_sem; + struct obd_uuid cl_target_uuid; + struct obd_import *cl_import; /* ptlrpc connection state */ + size_t cl_conn_count; + + /* Cache maximum and default values for easize. This is + * strictly a performance optimization to minimize calls to + * obd_size_diskmd(). The default values are used to calculate the + * initial size of a request buffer. The ptlrpc layer will resize the + * buffer as needed to accommodate a larger reply from the + * server. The default values should be small enough to avoid wasted + * memory and excessive use of vmalloc(), yet large enough to avoid + * reallocating the buffer in the common use case. */ + + /* Default EA size for striping attributes. It is initialized at + * mount-time based on the default stripe width of the filesystem, + * then it tracks the largest observed EA size advertised by + * the MDT, up to a maximum value of OBD_MAX_DEFAULT_EA_SIZE. */ + __u32 cl_default_mds_easize; + + /* Maximum possible EA size computed at mount-time based on + * the number of OSTs in the filesystem. May be increased at + * run-time if a larger observed size is advertised by the MDT. */ + __u32 cl_max_mds_easize; + + /* Data-on-MDT specific value to set larger reply buffer for possible + * data read along with open/stat requests. By default it tries to use + * unused space in reply buffer. + * This value is used to ensure that reply buffer has at least as + * much free space as value indicates. That free space is gained from + * LOV EA buffer which is small for DoM files and on big systems can + * provide up to 32KB of extra space in reply buffer. + * Default value is 8K now. + */ + __u32 cl_dom_min_inline_repsize; + + unsigned int cl_checksum:1, /* 0 = disabled, 1 = enabled */ + cl_checksum_dump:1, /* same */ + cl_ocd_grant_param:1, + cl_lsom_update:1; /* send LSOM updates */ + enum lustre_sec_part cl_sp_me; + enum lustre_sec_part cl_sp_to; + struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ + + /* the grant values are protected by loi_list_lock below */ + unsigned long cl_dirty_pages; /* all _dirty_ in pages */ + unsigned long cl_dirty_max_pages; /* allowed w/o rpc */ + unsigned long cl_avail_grant; /* bytes of credit for ost */ + unsigned long cl_lost_grant; /* lost credits (trunc) */ + /* grant consumed for dirty pages */ + unsigned long cl_dirty_grant; + + /* since we allocate grant by blocks, we don't know how many grant will + * be used to add a page into cache. As a solution, we reserve maximum + * grant before trying to dirty a page and unreserve the rest. + * See osc_{reserve|unreserve}_grant for details. 
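+ *
+ * (Illustrative numbers only, not taken from the original comment: if the
+ * worst-case grant needed to dirty one page is a full 64KiB chunk but the
+ * write ends up consuming only 4KiB of it, the remaining 60KiB is handed
+ * back by the unreserve step.)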
*/ + long cl_reserved_grant; + wait_queue_head_t cl_cache_waiters; /* waiting for cache/grant */ + time64_t cl_next_shrink_grant; /* seconds */ + struct list_head cl_grant_chain; + time64_t cl_grant_shrink_interval; /* seconds */ + + int cl_root_squash; /* if root squash enabled*/ + + /* A chunk is an optimal size used by osc_extent to determine + * the extent size. A chunk is max(PAGE_SIZE, OST block size) */ + int cl_chunkbits; + /* extent insertion metadata overhead to be accounted in grant, + * in bytes */ + unsigned int cl_grant_extent_tax; + /* maximum extent size, in number of pages */ + unsigned int cl_max_extent_pages; + + /* keep track of objects that have lois that contain pages which + * have been queued for async brw. this lock also protects the + * lists of osc_client_pages that hang off of the loi */ + /* + * ->cl_loi_list_lock protects consistency of + * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and + * ->ap_completion() call-backs are executed under this lock. As we + * cannot guarantee that these call-backs never block on all platforms + * (as a matter of fact they do block on Mac OS X), type of + * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux + * and blocking mutex on Mac OS X. (Alternative is to make this lock + * blocking everywhere, but we don't want to slow down fast-path of + * our main platform.) + * + * NB by Jinshan: though field names are still _loi_, but actually + * osc_object{}s are in the list. + */ + spinlock_t cl_loi_list_lock; + struct list_head cl_loi_ready_list; + struct list_head cl_loi_hp_ready_list; + struct list_head cl_loi_write_list; + struct list_head cl_loi_read_list; + __u32 cl_r_in_flight; + __u32 cl_w_in_flight; + /* just a sum of the loi/lop pending numbers to be exported by /proc */ + atomic_t cl_pending_w_pages; + atomic_t cl_pending_r_pages; + u32 cl_max_pages_per_rpc; + u32 cl_max_rpcs_in_flight; + u32 cl_max_short_io_bytes; + ktime_t cl_stats_init; + struct obd_histogram cl_read_rpc_hist; + struct obd_histogram cl_write_rpc_hist; + struct obd_histogram cl_read_page_hist; + struct obd_histogram cl_write_page_hist; + struct obd_histogram cl_read_offset_hist; + struct obd_histogram cl_write_offset_hist; + + /** LRU for osc caching pages */ + struct cl_client_cache *cl_cache; + /** member of cl_cache->ccc_lru */ + struct list_head cl_lru_osc; + /** # of available LRU slots left in the per-OSC cache. + * Available LRU slots are shared by all OSCs of the same file system, + * therefore this is a pointer to cl_client_cache::ccc_lru_left. */ + atomic_long_t *cl_lru_left; + /** # of busy LRU pages. A page is considered busy if it's in writeback + * queue, or in transfer. Busy pages can't be discarded so they are not + * in LRU cache. */ + atomic_long_t cl_lru_busy; + /** # of LRU pages in the cache for this client_obd */ + atomic_long_t cl_lru_in_list; + /** # of threads are shrinking LRU cache. To avoid contention, it's not + * allowed to have multiple threads shrinking LRU cache. */ + atomic_t cl_lru_shrinkers; + /** The time when this LRU cache was last used. */ + time64_t cl_lru_last_used; + /** stats: how many reclaims have happened for this client_obd. + * reclaim and shrink - shrink is async, voluntarily rebalancing; + * reclaim is sync, initiated by IO thread when the LRU slots are + * in shortage. 
*/ + __u64 cl_lru_reclaim; + /** List of LRU pages for this client_obd */ + struct list_head cl_lru_list; + /** Lock for LRU page list */ + spinlock_t cl_lru_list_lock; + /** # of unstable pages in this client_obd. + * An unstable page is a page state that WRITE RPC has finished but + * the transaction has NOT yet committed. */ + atomic_long_t cl_unstable_count; + /** Link to osc_shrinker_list */ + struct list_head cl_shrink_list; + + /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */ + atomic_t cl_destroy_in_flight; + wait_queue_head_t cl_destroy_waitq; + + /* modify rpcs in flight + * currently used for metadata only */ + spinlock_t cl_mod_rpcs_lock; + __u16 cl_max_mod_rpcs_in_flight; + __u16 cl_mod_rpcs_in_flight; + __u16 cl_close_rpcs_in_flight; + wait_queue_head_t cl_mod_rpcs_waitq; + unsigned long *cl_mod_tag_bitmap; + ktime_t cl_mod_rpcs_init; + struct obd_histogram cl_mod_rpcs_hist; + + /* mgc datastruct */ + struct mutex cl_mgc_mutex; + struct local_oid_storage *cl_mgc_los; + struct dt_object *cl_mgc_configs_dir; + struct obd_export *cl_mgc_mgsexp; + atomic_t cl_mgc_refcount; + /* in-flight control list and total RPCs counter */ + struct list_head cl_flight_waiters; + __u32 cl_rpcs_in_flight; + + /* supported checksum types that are worked out at connect time */ + __u32 cl_supp_cksum_types; + /* checksum algorithm to be used */ + enum cksum_types cl_cksum_type; + /* preferred checksum algorithm to be used */ + enum cksum_types cl_preferred_cksum_type; + + /* also protected by the poorly named _loi_list_lock lock above */ + struct osc_async_rc cl_ar; + + /* sequence manager */ + struct lu_client_seq *cl_seq; + struct rw_semaphore cl_seq_rwsem; + + atomic_t cl_resends; /* resend count */ + + /* ptlrpc work for writeback in ptlrpcd context */ + void *cl_writeback_work; + void *cl_lru_work; + struct mutex cl_quota_mutex; + /* hash tables for osc_quota_info */ + struct cfs_hash *cl_quota_hash[LL_MAXQUOTAS]; + /* the xid of the request updating the hash tables */ + __u64 cl_quota_last_xid; + /* Links to the global list of registered changelog devices */ + struct list_head cl_chg_dev_linkage; +}; +#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid) + +struct obd_id_info { + u32 idx; + u64 *data; +}; + +struct echo_client_obd { + struct obd_export *ec_exp; /* the local connection to osc/lov */ + spinlock_t ec_lock; + struct list_head ec_objects; + struct list_head ec_locks; + __u64 ec_unique; +}; + +/* allow statfs data caching for 1 second */ +#define OBD_STATFS_CACHE_SECONDS 1 +/* arbitrary maximum. 
larger would be useless, allows catching bogus input */ +#define OBD_STATFS_CACHE_MAX_AGE 3600 /* seconds */ +/* By default, don't do time based negative cache invalidation */ +#define OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS (-1) /* seconds */ + +#define lov_tgt_desc lu_tgt_desc + +struct lov_md_tgt_desc { + struct obd_device *lmtd_mdc; + __u32 lmtd_index; +}; + +struct lov_obd { + struct lov_desc desc; + struct lov_tgt_desc **lov_tgts; /* sparse array */ + struct lu_tgt_pool lov_packed; /* all OSTs in a packed + array */ + struct mutex lov_lock; + struct obd_connect_data lov_ocd; + atomic_t lov_refcount; + __u32 lov_death_row; /* tgts scheduled to be deleted */ + __u32 lov_tgt_size; /* size of tgts array */ + int lov_connects; + int lov_pool_count; + struct rhashtable lov_pools_hash_body; /* used for key access */ + struct list_head lov_pool_list; /* used for sequential access */ + struct proc_dir_entry *lov_pool_proc_entry; + enum lustre_sec_part lov_sp_me; + + /* Cached LRU and unstable data from upper layer */ + struct cl_client_cache *lov_cache; + + struct rw_semaphore lov_notify_lock; + /* Data-on-MDT: MDC array */ + struct lov_md_tgt_desc *lov_mdc_tgts; + + struct kobject *lov_tgts_kobj; +}; + +#define lmv_tgt_desc lu_tgt_desc + +struct lmv_obd { + struct lu_client_fld lmv_fld; + spinlock_t lmv_lock; + + int connected; + int max_easize; + int max_def_easize; + u32 lmv_statfs_start; + + struct lu_tgt_descs lmv_mdt_descs; + + struct obd_connect_data conn_data; + struct kobject *lmv_tgts_kobj; + void *lmv_cache; + + __u32 lmv_qos_rr_index; +}; + +#define lmv_mdt_count lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count +#define lmv_qos lmv_mdt_descs.ltd_qos + +/* Minimum sector size is 512 */ +#define MAX_GUARD_NUMBER (PAGE_SIZE / 512) + +struct niobuf_local { + __u64 lnb_file_offset; + __u32 lnb_page_offset; + __u32 lnb_len; + __u32 lnb_flags; + int lnb_rc; + struct page *lnb_page; + void *lnb_data; + __be16 lnb_guards[MAX_GUARD_NUMBER]; + __u16 lnb_guard_rpc:1; + __u16 lnb_guard_disk:1; + /* separate unlock for read path to allow shared access */ + __u16 lnb_locked:1; +}; + +struct tgt_thread_big_cache { + struct niobuf_local local[PTLRPC_MAX_BRW_PAGES]; +}; + +#define LUSTRE_FLD_NAME "fld" +#define LUSTRE_SEQ_NAME "seq" + +#define LUSTRE_MDD_NAME "mdd" +#define LUSTRE_OSD_LDISKFS_NAME "osd-ldiskfs" +#define LUSTRE_OSD_ZFS_NAME "osd-zfs" +#define LUSTRE_VVP_NAME "vvp" +#define LUSTRE_LMV_NAME "lmv" +#define LUSTRE_SLP_NAME "slp" +#define LUSTRE_LOD_NAME "lod" +#define LUSTRE_OSP_NAME "osp" +#define LUSTRE_LWP_NAME "lwp" + +/* obd device type names */ + /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */ +#define LUSTRE_MDS_NAME "mds" +#define LUSTRE_MDT_NAME "mdt" +#define LUSTRE_MDC_NAME "mdc" +#define LUSTRE_OSS_NAME "ost" /* FIXME change name to oss */ +#define LUSTRE_OST_NAME "obdfilter" /* FIXME change name to ost */ +#define LUSTRE_OSC_NAME "osc" +#define LUSTRE_LOV_NAME "lov" +#define LUSTRE_MGS_NAME "mgs" +#define LUSTRE_MGC_NAME "mgc" + +#define LUSTRE_ECHO_NAME "obdecho" +#define LUSTRE_ECHO_CLIENT_NAME "echo_client" +#define LUSTRE_QMT_NAME "qmt" + +/* Constant obd names (post-rename) */ +#define LUSTRE_MDS_OBDNAME "MDS" +#define LUSTRE_OSS_OBDNAME "OSS" +#define LUSTRE_MGS_OBDNAME "MGS" +#define LUSTRE_MGC_OBDNAME "MGC" + +static inline int is_lwp_on_mdt(char *name) +{ + char *ptr; + + ptr = strrchr(name, '-'); + if (ptr == NULL) { + CERROR("%s is not a obdname\n", name); + return 0; + } + + /* LWP name on MDT is fsname-MDTxxxx-lwp-MDTxxxx */ + + if 
(strncmp(ptr + 1, "MDT", 3) != 0) + return 0; + + while (*(--ptr) != '-' && ptr != name); + + if (ptr == name) + return 0; + + if (strncmp(ptr + 1, LUSTRE_LWP_NAME, strlen(LUSTRE_LWP_NAME)) != 0) + return 0; + + return 1; +} + +static inline int is_lwp_on_ost(char *name) +{ + char *ptr; + + ptr = strrchr(name, '-'); + if (ptr == NULL) { + CERROR("%s is not a obdname\n", name); + return 0; + } + + /* LWP name on OST is fsname-MDTxxxx-lwp-OSTxxxx */ + + if (strncmp(ptr + 1, "OST", 3) != 0) + return 0; + + while (*(--ptr) != '-' && ptr != name); + + if (ptr == name) + return 0; + + if (strncmp(ptr + 1, LUSTRE_LWP_NAME, strlen(LUSTRE_LWP_NAME)) != 0) + return 0; + + return 1; +} + +/* + * Events signalled through obd_notify() upcall-chain. + */ +enum obd_notify_event { + /* Device connect start */ + OBD_NOTIFY_CONNECT, + /* Device activated */ + OBD_NOTIFY_ACTIVE, + /* Device deactivated */ + OBD_NOTIFY_INACTIVE, + /* Connect data for import were changed */ + OBD_NOTIFY_OCD, + /* Administratively deactivate/activate event */ + OBD_NOTIFY_DEACTIVATE, + OBD_NOTIFY_ACTIVATE +}; + +/* + * Data structure used to pass obd_notify()-event to non-obd listeners (llite + * being main example). + */ +struct obd_notify_upcall { + int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner); + /* Opaque datum supplied by upper layer listener */ + void *onu_owner; +}; + +struct target_recovery_data { + svc_handler_t trd_recovery_handler; + pid_t trd_processing_task; + struct completion trd_starting; + struct completion trd_finishing; +}; + +struct obd_llog_group { + struct llog_ctxt *olg_ctxts[LLOG_MAX_CTXTS]; + wait_queue_head_t olg_waitq; + spinlock_t olg_lock; +}; + +/* corresponds to one of the obd's */ +#define OBD_DEVICE_MAGIC 0XAB5CD6EF + +struct obd_device { + struct obd_type *obd_type; + __u32 obd_magic; /* OBD_DEVICE_MAGIC */ + int obd_minor; /* device number: lctl dl */ + struct lu_device *obd_lu_dev; + + /* common and UUID name of this device */ + struct obd_uuid obd_uuid; + char obd_name[MAX_OBD_NAME]; + + /* bitfield modification is protected by obd_dev_lock */ + unsigned long + obd_attached:1, /* finished attach */ + obd_set_up:1, /* finished setup */ + obd_recovering:1, /* there are recoverable clients */ + obd_abort_recovery:1, /* recovery expired */ + obd_abort_recov_mdt:1, /* only abort recovery between MDTs */ + obd_version_recov:1, /* obd uses version checking */ + obd_replayable:1, /* recovery enabled; inform clients */ + obd_no_recov:1, /* fail instead of retry messages */ + obd_stopping:1, /* started cleanup */ + obd_starting:1, /* started setup */ + obd_force:1, /* cleanup with > 0 obd refcount */ + obd_fail:1, /* cleanup with failover */ + obd_no_conn:1, /* deny new connections */ + obd_inactive:1, /* device active/inactive + * (for /proc/status only!!) */ + obd_no_ir:1, /* no imperative recovery. 
*/ + obd_process_conf:1, /* device is processing mgs config */ + obd_checksum_dump:1, /* dump pages upon cksum error */ + obd_dynamic_nids:1; /* Allow dynamic NIDs on device */ +#ifdef HAVE_SERVER_SUPPORT + /* no committed-transno notification */ + unsigned long obd_no_transno:1; +#endif + + /* use separate field as it is set in interrupt to don't mess with + * protection of other bits using _bh lock */ + unsigned long obd_recovery_expired:1; + /* uuid-export hash body */ + struct rhashtable obd_uuid_hash; + /* nid-export hash body */ + struct rhltable obd_nid_hash; + /* nid stats body */ + struct cfs_hash *obd_nid_stats_hash; + /* client_generation-export hash body */ + struct cfs_hash *obd_gen_hash; + struct list_head obd_nid_stats; + struct list_head obd_exports; + struct list_head obd_unlinked_exports; + struct list_head obd_delayed_exports; + struct list_head obd_lwp_list; + atomic_t obd_refcount; + int obd_num_exports; + int obd_grant_check_threshold; + spinlock_t obd_nid_lock; + struct ldlm_namespace *obd_namespace; + struct ptlrpc_client obd_ldlm_client; /* XXX OST/MDS only */ + /* a spinlock is OK for what we do now, may need a semaphore later */ + spinlock_t obd_dev_lock; /* protect OBD bitfield above */ + spinlock_t obd_osfs_lock; + struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */ + time64_t obd_osfs_age; + __u64 obd_last_committed; + struct mutex obd_dev_mutex; + struct lvfs_run_ctxt obd_lvfs_ctxt; + struct obd_llog_group obd_olg; /* default llog group */ + struct obd_device *obd_observer; + struct rw_semaphore obd_observer_link_sem; + struct obd_notify_upcall obd_upcall; + struct obd_export *obd_self_export; + struct obd_export *obd_lwp_export; + /* list of exports in LRU order, for ping evictor, with obd_dev_lock */ + struct list_head obd_exports_timed; + time64_t obd_eviction_timer; /* for ping evictor */ + + atomic_t obd_max_recoverable_clients; + atomic_t obd_connected_clients; + int obd_stale_clients; + /* this lock protects all recovery list_heads, timer and + * obd_next_recovery_transno value */ + spinlock_t obd_recovery_task_lock; + __u64 obd_next_recovery_transno; + int obd_replayed_requests; + int obd_requests_queued_for_recovery; + wait_queue_head_t obd_next_transno_waitq; + /* protected by obd_recovery_task_lock */ + struct hrtimer obd_recovery_timer; + /* seconds */ + time64_t obd_recovery_start; + /* seconds, for lprocfs_status */ + time64_t obd_recovery_end; + /* To tell timeouts from time stamps Lustre uses timeout_t + * instead of time64_t. 
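+ * For example, obd_recovery_start and obd_recovery_end above are
+ * absolute time64_t timestamps (in seconds), while
+ * obd_recovery_time_hard and obd_recovery_timeout below are relative
+ * intervals stored as timeout_t.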
+ */ + timeout_t obd_recovery_time_hard; + timeout_t obd_recovery_timeout; + int obd_recovery_ir_factor; + + /* new recovery stuff from CMD2 */ + int obd_replayed_locks; + atomic_t obd_req_replay_clients; + atomic_t obd_lock_replay_clients; + struct target_recovery_data obd_recovery_data; + + /* all lists are protected by obd_recovery_task_lock */ + struct list_head obd_req_replay_queue; + struct list_head obd_lock_replay_queue; + struct list_head obd_final_req_queue; + + union { +#ifdef HAVE_SERVER_SUPPORT + struct obd_device_target obt; + struct filter_obd filter; + struct ost_obd ost; + struct echo_obd echo; +#endif + struct client_obd cli; + struct echo_client_obd echo_client; + struct lov_obd lov; + struct lmv_obd lmv; + } u; + + /* Fields used by LProcFS */ + struct lprocfs_stats *obd_stats; + + struct lprocfs_stats *obd_md_stats; + + struct dentry *obd_debugfs_entry; + struct proc_dir_entry *obd_proc_entry; + struct proc_dir_entry *obd_proc_exports_entry; + struct dentry *obd_svc_debugfs_entry; + struct lprocfs_stats *obd_svc_stats; + const struct attribute **obd_attrs; + struct lprocfs_vars *obd_vars; + struct ldebugfs_vars *obd_debugfs_vars; + atomic_t obd_evict_inprogress; + wait_queue_head_t obd_evict_inprogress_waitq; + struct list_head obd_evict_list; /* protected with pet_lock */ + + /** + * LDLM pool part. Save last calculated SLV and Limit. + */ + rwlock_t obd_pool_lock; + __u64 obd_pool_slv; + int obd_pool_limit; + + int obd_conn_inprogress; + + /** + * List of outstanding class_incref()'s fo this OBD. For debugging. */ + struct lu_ref obd_reference; + + struct kset obd_kset; /* sysfs object collection */ + struct kobj_type obd_ktype; + struct completion obd_kobj_unregister; +}; + +int obd_uuid_add(struct obd_device *obd, struct obd_export *export); +void obd_uuid_del(struct obd_device *obd, struct obd_export *export); +#ifdef HAVE_SERVER_SUPPORT +struct obd_export *obd_uuid_lookup(struct obd_device *obd, + struct obd_uuid *uuid); + +int obd_nid_export_for_each(struct obd_device *obd, struct lnet_nid *nid, + int cb(struct obd_export *exp, void *data), + void *data); +int obd_nid_add(struct obd_device *obd, struct obd_export *exp); +void obd_nid_del(struct obd_device *obd, struct obd_export *exp); +#endif + +/* get/set_info keys */ +#define KEY_ASYNC "async" +#define KEY_CHANGELOG_CLEAR "changelog_clear" +#define KEY_FID2PATH "fid2path" +#define KEY_CHECKSUM "checksum" +#define KEY_CLEAR_FS "clear_fs" +#define KEY_CONN_DATA "conn_data" +#define KEY_EVICT_BY_NID "evict_by_nid" +#define KEY_FIEMAP "fiemap" +#define KEY_FLUSH_CTX "flush_ctx" +#define KEY_GRANT_SHRINK "grant_shrink" +#define KEY_HSM_COPYTOOL_SEND "hsm_send" +#define KEY_INIT_RECOV_BACKUP "init_recov_bk" +#define KEY_INTERMDS "inter_mds" +#define KEY_LAST_ID "last_id" +#define KEY_LAST_FID "last_fid" +#define KEY_MAX_EASIZE "max_easize" +#define KEY_DEFAULT_EASIZE "default_easize" +#define KEY_MGSSEC "mgssec" +#define KEY_READ_ONLY "read-only" +#define KEY_REGISTER_TARGET "register_target" +#define KEY_SET_FS "set_fs" +#define KEY_TGT_COUNT "tgt_count" +/* KEY_SET_INFO in lustre_idl.h */ +#define KEY_SPTLRPC_CONF "sptlrpc_conf" + +#define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" +#define KEY_OSP_CONNECTED "osp_connected" + +/* Flags for op_xvalid */ +enum op_xvalid { + OP_XVALID_CTIME_SET = BIT(0), /* 0x0001 */ + OP_XVALID_BLOCKS = BIT(1), /* 0x0002 */ + OP_XVALID_OWNEROVERRIDE = BIT(2), /* 0x0004 */ + OP_XVALID_FLAGS = BIT(3), /* 0x0008 */ + OP_XVALID_PROJID = BIT(4), /* 0x0010 */ + OP_XVALID_LAZYSIZE = BIT(5), 
/* 0x0020 */ + OP_XVALID_LAZYBLOCKS = BIT(6), /* 0x0040 */ +}; + +struct lu_context; + +static inline int it_to_lock_mode(struct lookup_intent *it) +{ + /* CREAT needs to be tested before open (both could be set) */ + if (it->it_op & IT_CREAT) + return LCK_CW; + else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP)) + return LCK_CR; + else if (it->it_op & IT_LAYOUT) + return (it->it_flags & FMODE_WRITE) ? LCK_EX : LCK_CR; + else if (it->it_op & IT_READDIR) + return LCK_PR; + else if (it->it_op & IT_GETXATTR) + return LCK_PR; + + LASSERTF(0, "Invalid it_op: %d\n", it->it_op); + return -EINVAL; +} + +enum md_op_flags { + MF_MDC_CANCEL_FID1 = BIT(0), + MF_MDC_CANCEL_FID2 = BIT(1), + MF_MDC_CANCEL_FID3 = BIT(2), + MF_MDC_CANCEL_FID4 = BIT(3), + MF_GET_MDT_IDX = BIT(4), + MF_GETATTR_BY_FID = BIT(5), + MF_QOS_MKDIR = BIT(6), + MF_RR_MKDIR = BIT(7), + MF_OPNAME_KMALLOCED = BIT(8), +}; + +enum md_cli_flags { + CLI_SET_MEA = BIT(0), + CLI_RM_ENTRY = BIT(1), + CLI_HASH64 = BIT(2), + CLI_API32 = BIT(3), + CLI_MIGRATE = BIT(4), + CLI_DIRTY_DATA = BIT(5), + CLI_NO_SLOT = BIT(6), +}; + +enum md_op_code { + LUSTRE_OPC_MKDIR = 1, + LUSTRE_OPC_SYMLINK, + LUSTRE_OPC_MKNOD, + LUSTRE_OPC_CREATE, + LUSTRE_OPC_ANY, + LUSTRE_OPC_LOOKUP, + LUSTRE_OPC_OPEN, +}; + +/** + * GETXATTR is not included as only a couple of fields in the reply body + * is filled, but not FID which is needed for common intent handling in + * mdc_finish_intent_lock() + */ +static inline bool it_has_reply_body(const struct lookup_intent *it) +{ + return it->it_op & (IT_OPEN | IT_LOOKUP | IT_GETATTR); +} + +struct md_op_data { + struct lu_fid op_fid1; /* operation fid1 (usualy parent) */ + struct lu_fid op_fid2; /* operation fid2 (usualy child) */ + struct lu_fid op_fid3; /* 2 extra fids to find conflicting */ + struct lu_fid op_fid4; /* to the operation locks. */ + u32 op_mds; /* what mds server open will go to */ + __u32 op_mode; + enum md_op_code op_code; + struct lustre_handle op_open_handle; + s64 op_mod_time; + const char *op_name; + size_t op_namelen; + struct rw_semaphore *op_mea1_sem; + struct rw_semaphore *op_mea2_sem; + struct lmv_stripe_md *op_mea1; + struct lmv_stripe_md *op_mea2; + struct lmv_stripe_md *op_default_mea1; /* default LMV */ + __u32 op_suppgids[2]; + __u32 op_fsuid; + __u32 op_fsgid; + kernel_cap_t op_cap; + void *op_data; + size_t op_data_size; + + /* iattr fields and blocks. */ + struct iattr op_attr; + enum op_xvalid op_xvalid; /* eXtra validity flags */ + loff_t op_attr_blocks; + u64 op_valid; /* OBD_MD_* */ + unsigned int op_attr_flags; /* LUSTRE_{SYNC,..}_FL */ + + enum md_op_flags op_flags; + + /* Various operation flags. */ + enum mds_op_bias op_bias; + + /* used to transfer info between the stacks of MD client + * see enum op_cli_flags */ + enum md_cli_flags op_cli_flags; + + /* File object data version for HSM release, on client */ + __u64 op_data_version; + struct lustre_handle op_lease_handle; + + /* File security context, for creates/metadata ops */ + const char *op_file_secctx_name; + __u32 op_file_secctx_name_size; + void *op_file_secctx; + __u32 op_file_secctx_size; + int op_file_secctx_slot; + + /* File encryption context, for creates/metadata ops */ + void *op_file_encctx; + __u32 op_file_encctx_size; + + __u32 op_projid; + + union { + /* Used by readdir */ + unsigned int op_max_pages; + /* mkdir */ + unsigned short op_dir_depth; + }; + + __u16 op_mirror_id; + + /* + * used to access dir that is changing layout: if it's set, access + * dir by new layout, otherwise old layout. 
+ * By default it's not set, because new files are created under new + * layout, if we can't find file with name under both old and new + * layout, we are sure file with name doesn't exist, but in reverse + * order there may be a race with creation by others. + */ + bool op_new_layout; + /* used to access dir with bash hash */ + __u32 op_stripe_index; + /* Archive ID for PCC attach */ + __u32 op_archive_id; +}; + +struct md_readdir_info { + int (*mr_blocking_ast)(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, + void *data, int flag); + /* if striped directory is partially read, the result is stored here */ + int mr_partial_readdir_rc; +}; + +struct md_enqueue_info; +/* metadata stat-ahead */ +typedef int (* md_enqueue_cb_t)(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, + int rc); + +struct md_enqueue_info { + struct md_op_data mi_data; + struct lookup_intent mi_it; + struct lustre_handle mi_lockh; + struct inode *mi_dir; + struct ldlm_enqueue_info mi_einfo; + md_enqueue_cb_t mi_cb; + void *mi_cbdata; +}; + +struct obd_ops { + struct module *o_owner; + int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg); + int (*o_get_info)(const struct lu_env *env, struct obd_export *, + __u32 keylen, void *key, __u32 *vallen, void *val); + int (*o_set_info_async)(const struct lu_env *, struct obd_export *, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set); + int (*o_setup) (struct obd_device *obd, struct lustre_cfg *cfg); + int (*o_precleanup)(struct obd_device *obd); + int (*o_cleanup)(struct obd_device *obd); + int (*o_process_config)(struct obd_device *obd, size_t len, void *data); + int (*o_postrecov)(struct obd_device *obd); + int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid, + int priority); + int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid); + /* connect to the target device with given connection + * data. @ocd->ocd_connect_flags is modified to reflect flags actually + * granted by the target, which are guaranteed to be a subset of flags + * asked for. If @ocd == NULL, use default parameters. */ + int (*o_connect)(const struct lu_env *env, + struct obd_export **exp, struct obd_device *src, + struct obd_uuid *cluuid, struct obd_connect_data *ocd, + void *localdata); + int (*o_reconnect)(const struct lu_env *env, + struct obd_export *exp, struct obd_device *src, + struct obd_uuid *cluuid, + struct obd_connect_data *ocd, + void *localdata); + int (*o_disconnect)(struct obd_export *exp); + + /* Initialize/finalize fids infrastructure. */ + int (*o_fid_init)(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type); + int (*o_fid_fini)(struct obd_device *obd); + + /* Allocate new fid according to passed @hint. */ + int (*o_fid_alloc)(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data); + + /* + * Object with @fid is getting deleted, we may want to do something + * about this. 
+ */ + int (*o_statfs)(const struct lu_env *, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, __u32 flags); + int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo, + time64_t max_age, struct ptlrpc_request_set *set); + int (*o_create)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); + int (*o_destroy)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); + int (*o_setattr)(const struct lu_env *, struct obd_export *exp, + struct obdo *oa); + int (*o_getattr)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); + int (*o_preprw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, int objcount, + struct obd_ioobj *obj, struct niobuf_remote *remote, + int *nr_pages, struct niobuf_local *local); + int (*o_commitrw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int pages, + struct niobuf_local *local, int rc, int nob, + ktime_t kstart); + int (*o_init_export)(struct obd_export *exp); + int (*o_destroy_export)(struct obd_export *exp); + + int (*o_import_event)(struct obd_device *, struct obd_import *, + enum obd_import_event); + + int (*o_notify)(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev); + + int (*o_health_check)(const struct lu_env *env, struct obd_device *); + struct obd_uuid *(*o_get_uuid) (struct obd_export *exp); + + /* quota methods */ + int (*o_quotactl)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + + /* pools methods */ + int (*o_pool_new)(struct obd_device *obd, char *poolname); + int (*o_pool_del)(struct obd_device *obd, char *poolname); + int (*o_pool_add)(struct obd_device *obd, char *poolname, + char *ostname); + int (*o_pool_rem)(struct obd_device *obd, char *poolname, + char *ostname); +}; + +/* lmv structures */ +struct lustre_md { + struct mdt_body *body; + struct lu_buf layout; + union { + struct lmv_stripe_md *lmv; + struct lmv_foreign_md *lfm; + }; + struct lmv_stripe_md *default_lmv; +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL + struct posix_acl *posix_acl; +#endif +}; + +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +static inline void lmd_clear_acl(struct lustre_md *md) +{ + if (md->posix_acl) { + posix_acl_release(md->posix_acl); + md->posix_acl = NULL; + } +} + +#define OBD_CONNECT_ACL_FLAGS \ + (OBD_CONNECT_ACL | OBD_CONNECT_UMASK | OBD_CONNECT_LARGE_ACL) +#else +static inline void lmd_clear_acl(struct lustre_md *md) +{ +} + +#define OBD_CONNECT_ACL_FLAGS (0) +#endif + +struct md_open_data { + struct obd_client_handle *mod_och; + struct ptlrpc_request *mod_open_req; + struct ptlrpc_request *mod_close_req; + atomic_t mod_refcount; + bool mod_is_create; +}; + +struct obd_client_handle { + struct lustre_handle och_open_handle; + struct lu_fid och_fid; + struct md_open_data *och_mod; + struct lustre_handle och_lease_handle; /* open lock for lease */ + __u32 och_magic; + int och_flags; +}; + +#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed + +struct lookup_intent; +struct cl_attr; + +struct md_ops { + int (*m_close)(struct obd_export *, struct md_op_data *, + struct md_open_data *, struct ptlrpc_request **); + + int (*m_create)(struct obd_export *, struct md_op_data *, + const void *, size_t, umode_t, uid_t, gid_t, + kernel_cap_t, __u64, struct ptlrpc_request **); + + int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *, + const union ldlm_policy_data *, struct md_op_data *, + struct lustre_handle *, __u64); + + 
int (*m_getattr)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_intent_lock)(struct obd_export *, struct md_op_data *, + struct lookup_intent *, + struct ptlrpc_request **, + ldlm_blocking_callback, __u64); + + int (*m_link)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_rename)(struct obd_export *, struct md_op_data *, + const char *, size_t, const char *, size_t, + struct ptlrpc_request **); + + int (*m_setattr)(struct obd_export *, struct md_op_data *, void *, + size_t , struct ptlrpc_request **); + + int (*m_fsync)(struct obd_export *, const struct lu_fid *, + struct ptlrpc_request **); + + int (*m_read_page)(struct obd_export *, struct md_op_data *, + struct md_readdir_info *mrinfo, __u64 hash_offset, + struct page **ppage); + + int (*m_unlink)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_setxattr)(struct obd_export *, const struct lu_fid *, + u64, const char *, const void *, size_t, unsigned int, + u32, struct ptlrpc_request **); + + int (*m_getxattr)(struct obd_export *, const struct lu_fid *, + u64, const char *, size_t, struct ptlrpc_request **); + + int (*m_intent_getattr_async)(struct obd_export *, + struct md_enqueue_info *); + + int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *, + struct lu_fid *, __u64 *bits); + + int (*m_file_resync)(struct obd_export *, struct md_op_data *); + + int (*m_get_root)(struct obd_export *, const char *, struct lu_fid *); + int (*m_null_inode)(struct obd_export *, const struct lu_fid *); + + int (*m_getattr_name)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_init_ea_size)(struct obd_export *, __u32, __u32); + + int (*m_get_lustre_md)(struct obd_export *, struct req_capsule *, + struct obd_export *, struct obd_export *, + struct lustre_md *); + + int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *); + + int (*m_merge_attr)(struct obd_export *, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, ldlm_blocking_callback); + + int (*m_set_open_replay_data)(struct obd_export *, + struct obd_client_handle *, + struct lookup_intent *); + + int (*m_clear_open_replay_data)(struct obd_export *, + struct obd_client_handle *); + + int (*m_set_lock_data)(struct obd_export *, + const struct lustre_handle *, void *, __u64 *); + + enum ldlm_mode (*m_lock_match)(struct obd_export *, __u64, + const struct lu_fid *, enum ldlm_type, + union ldlm_policy_data *, enum ldlm_mode, + struct lustre_handle *); + + int (*m_cancel_unused)(struct obd_export *, const struct lu_fid *, + union ldlm_policy_data *, enum ldlm_mode, + enum ldlm_cancel_flags flags, void *opaque); + + int (*m_get_fid_from_lsm)(struct obd_export *, + const struct lmv_stripe_md *, + const char *name, int namelen, + struct lu_fid *fid); + int (*m_unpackmd)(struct obd_export *exp, struct lmv_stripe_md **plsm, + const union lmv_mds_md *lmv, size_t lmv_size); + int (*m_rmfid)(struct obd_export *exp, struct fid_array *fa, int *rcs, + struct ptlrpc_request_set *set); +}; + +static inline struct md_open_data *obd_mod_alloc(void) +{ + struct md_open_data *mod; + OBD_ALLOC_PTR(mod); + if (mod == NULL) + return NULL; + atomic_set(&mod->mod_refcount, 1); + return mod; +} + +#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount) +#define obd_mod_put(mod) \ +({ \ + if (atomic_dec_and_test(&(mod)->mod_refcount)) { \ + if ((mod)->mod_open_req) \ + ptlrpc_req_finished((mod)->mod_open_req); \ + OBD_FREE_PTR(mod); \ + } \ +}) + +void 
obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid); +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent); +void obdo_set_o_projid(struct obdo *dst, u32 projid); + +/* return 1 if client should be resend request */ +static inline int client_should_resend(int resend, struct client_obd *cli) +{ + return atomic_read(&cli->cl_resends) ? + atomic_read(&cli->cl_resends) > resend : 1; +} + +/** + * Return device name for this device + * + * XXX: lu_device is declared before obd_device, while a pointer pointing + * back to obd_device in lu_device, so this helper function defines here + * instead of in lu_object.h + */ +static inline const char *lu_dev_name(const struct lu_device *lu_dev) +{ + return lu_dev->ld_obd->obd_name; +} + +static inline bool filename_is_volatile(const char *name, size_t namelen, + int *idx) +{ + const char *start; + char *end; + + if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0) + return false; + + /* caller does not care of idx */ + if (idx == NULL) + return true; + + /* volatile file, the MDT can be set from name */ + /* name format is LUSTRE_VOLATILE_HDR:[idx]: */ + /* if no MDT is specified, use std way */ + if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2) + goto bad_format; + /* test for no MDT idx case */ + if ((*(name + LUSTRE_VOLATILE_HDR_LEN) == ':') && + (*(name + LUSTRE_VOLATILE_HDR_LEN + 1) == ':')) { + *idx = -1; + return true; + } + /* we have an idx, read it */ + start = name + LUSTRE_VOLATILE_HDR_LEN + 1; + *idx = simple_strtoul(start, &end, 16); + /* error cases: + * no digit, no trailing :, negative value + */ + if (((*idx == 0) && (end == start)) || + (*end != ':') || (*idx < 0)) + goto bad_format; + + return true; +bad_format: + /* bad format of mdt idx, we cannot return an error + * to caller so we use hash algo */ + CERROR("Bad volatile file name format: %s\n", + name + LUSTRE_VOLATILE_HDR_LEN); + return false; +} + +static inline int cli_brw_size(struct obd_device *obd) +{ + LASSERT(obd != NULL); + return obd->u.cli.cl_max_pages_per_rpc << PAGE_SHIFT; +} + +/* + * When RPC size or the max RPCs in flight is increased, the max dirty pages + * of the client should be increased accordingly to avoid sending fragmented + * RPCs over the network when the client runs out of the maximum dirty space + * when so many RPCs are being generated. + */ +static inline void client_adjust_max_dirty(struct client_obd *cli) +{ + /* initializing */ + if (cli->cl_dirty_max_pages <= 0) { + cli->cl_dirty_max_pages = + (OSC_MAX_DIRTY_DEFAULT * 1024 * 1024) >> PAGE_SHIFT; + } else { + unsigned long dirty_max = cli->cl_max_rpcs_in_flight * + cli->cl_max_pages_per_rpc; + + if (dirty_max > cli->cl_dirty_max_pages) + cli->cl_dirty_max_pages = dirty_max; + } + + if (cli->cl_dirty_max_pages > cfs_totalram_pages() / 8) + cli->cl_dirty_max_pages = cfs_totalram_pages() / 8; + + /* This value is exported to userspace through the max_dirty_mb + * parameter. So we round up the number of pages to make it a round + * number of MBs. */ + cli->cl_dirty_max_pages = round_up(cli->cl_dirty_max_pages, + 1 << (20 - PAGE_SHIFT)); +} + +/* Must be used for page cache pages only, + * not safe otherwise (e.g. 
direct IO pages) + */ +static inline struct inode *page2inode(struct page *page) +{ + if (page->mapping) { + if (PageAnon(page)) + return NULL; + else + return page->mapping->host; + } else { + return NULL; + } +} + +#endif /* __OBD_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_cache.h b/drivers/staging/lustrefsx/lustre/include/obd_cache.h new file mode 100644 index 0000000000000..128fad781edcb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_cache.h @@ -0,0 +1,34 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _OBD_CACHE_H__ +#define _OBD_CACHE_H__ + + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/obd_cksum.h b/drivers/staging/lustrefsx/lustre/include/obd_cksum.h new file mode 100644 index 0000000000000..1f9f0b14a5975 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_cksum.h @@ -0,0 +1,193 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __OBD_CKSUM +#define __OBD_CKSUM +#include +#include +#include + +int obd_t10_cksum_speed(const char *obd_name, + enum cksum_types cksum_type); + +static inline unsigned char cksum_obd2cfs(enum cksum_types cksum_type) +{ + switch (cksum_type) { + case OBD_CKSUM_CRC32: + return CFS_HASH_ALG_CRC32; + case OBD_CKSUM_ADLER: + return CFS_HASH_ALG_ADLER32; + case OBD_CKSUM_CRC32C: + return CFS_HASH_ALG_CRC32C; + default: + CERROR("Unknown checksum type (%x)!!!\n", cksum_type); + LBUG(); + } + return 0; +} + +u32 obd_cksum_type_pack(const char *obd_name, enum cksum_types cksum_type); + +static inline enum cksum_types obd_cksum_type_unpack(u32 o_flags) +{ + switch (o_flags & OBD_FL_CKSUM_ALL) { + case OBD_FL_CKSUM_CRC32C: + return OBD_CKSUM_CRC32C; + case OBD_FL_CKSUM_CRC32: + return OBD_CKSUM_CRC32; + case OBD_FL_CKSUM_T10IP512: + return OBD_CKSUM_T10IP512; + case OBD_FL_CKSUM_T10IP4K: + return OBD_CKSUM_T10IP4K; + case OBD_FL_CKSUM_T10CRC512: + return OBD_CKSUM_T10CRC512; + case OBD_FL_CKSUM_T10CRC4K: + return OBD_CKSUM_T10CRC4K; + default: + break; + } + + return OBD_CKSUM_ADLER; +} + +/* Return a bitmask of the checksum types supported on this system. + * 1.8 supported ADLER it is base and not depend on hw + * Client uses all available local algos + */ +static inline enum cksum_types obd_cksum_types_supported_client(void) +{ + enum cksum_types ret = OBD_CKSUM_ADLER; + + CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) > 0) + ret |= OBD_CKSUM_CRC32C; + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0) + ret |= OBD_CKSUM_CRC32; + + /* Client support all kinds of T10 checksum */ + ret |= OBD_CKSUM_T10_ALL; + + return ret; +} + +enum cksum_types obd_cksum_types_supported_server(const char *obd_name); + +/* Select the best checksum algorithm among those supplied in the cksum_types + * input. + * + * Currently, calling cksum_type_pack() with a mask will return the fastest + * checksum type due to its benchmarking at libcfs module load. + * Caution is advised, however, since what is fastest on a single client may + * not be the fastest or most efficient algorithm on the server. */ +static inline +enum cksum_types obd_cksum_type_select(const char *obd_name, + enum cksum_types cksum_types, + enum cksum_types preferred) +{ + u32 flag; + + if (preferred & cksum_types) + return preferred; + + /* + * Server reporting a single T10 checksum type + * means the target actually supports T10-PI. + */ + if (hweight32(cksum_types & OBD_CKSUM_T10_ALL) == 1) + return cksum_types & OBD_CKSUM_T10_ALL; + + flag = obd_cksum_type_pack(obd_name, cksum_types); + + return obd_cksum_type_unpack(flag); +} + +/* Checksum algorithm names. Must be defined in the same order as the + * OBD_CKSUM_* flags. 
*/ +#define DECLARE_CKSUM_NAME const char *const cksum_name[] = {"crc32", "adler", \ + "crc32c", "reserved", "t10ip512", "t10ip4K", "t10crc512", "t10crc4K"} + +typedef __be16 (obd_dif_csum_fn) (void *, unsigned int); + +__be16 obd_dif_crc_fn(void *data, unsigned int len); +__be16 obd_dif_ip_fn(void *data, unsigned int len); +int obd_page_dif_generate_buffer(const char *obd_name, struct page *page, + __u32 offset, __u32 length, + __be16 *guard_start, int guard_number, + int *used_number, int sector_size, + obd_dif_csum_fn *fn); +/* + * If checksum type is one T10 checksum types, init the csum_fn and sector + * size. Otherwise, init them to NULL/zero. + */ +static inline void obd_t10_cksum2dif(enum cksum_types cksum_type, + obd_dif_csum_fn **fn, int *sector_size) +{ + *fn = NULL; + *sector_size = 0; + +#if IS_ENABLED(CONFIG_CRC_T10DIF) + switch (cksum_type) { + case OBD_CKSUM_T10IP512: + *fn = obd_dif_ip_fn; + *sector_size = 512; + break; + case OBD_CKSUM_T10IP4K: + *fn = obd_dif_ip_fn; + *sector_size = 4096; + break; + case OBD_CKSUM_T10CRC512: + *fn = obd_dif_crc_fn; + *sector_size = 512; + break; + case OBD_CKSUM_T10CRC4K: + *fn = obd_dif_crc_fn; + *sector_size = 4096; + break; + default: + break; + } +#endif /* CONFIG_CRC_T10DIF */ +} + +enum obd_t10_cksum_type { + OBD_T10_CKSUM_UNKNOWN = 0, + OBD_T10_CKSUM_IP512, + OBD_T10_CKSUM_IP4K, + OBD_T10_CKSUM_CRC512, + OBD_T10_CKSUM_CRC4K, + OBD_T10_CKSUM_MAX +}; + +#endif /* __OBD_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_class.h b/drivers/staging/lustrefsx/lustre/include/obd_class.h new file mode 100644 index 0000000000000..8d93466d61b5b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_class.h @@ -0,0 +1,1954 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#ifndef __CLASS_OBD_H +#define __CLASS_OBD_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#define OBD_STATFS_NODELAY 0x0001 /* requests should be send without delay + * and resends for avoid deadlocks */ +#define OBD_STATFS_FROM_CACHE 0x0002 /* the statfs callback should not update + * obd_osfs_age */ +#define OBD_STATFS_FOR_MDT0 0x0004 /* The statfs is only for retrieving + * information from MDT0. */ +#define OBD_STATFS_SUM 0x0008 /* get aggregated statfs from MDT */ +#define OBD_STATFS_NESTED 0x0010 /* Call while already holding + * obd_dev_mutex of a difference + * device. 
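+ *
+ * The OBD_STATFS_* values above form a bitmask and may be OR-ed
+ * together by callers; an illustrative call only (the exact wrapper
+ * arguments may differ from this sketch):
+ *
+ *   rc = obd_statfs(env, exp, osfs,
+ *                   ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS,
+ *                   OBD_STATFS_NODELAY | OBD_STATFS_SUM);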
+ */ + +extern rwlock_t obd_dev_lock; + +/* OBD Operations Declarations */ +extern struct obd_device *class_exp2obd(struct obd_export *); +extern int class_handle_ioctl(unsigned int cmd, unsigned long arg); +int lustre_get_jobid(char *jobid, size_t len); +void lustre_jobid_clear(const char *jobid); +void jobid_cache_fini(void); +int jobid_cache_init(void); +char *jobid_current(void); +int jobid_set_current(char *jobid); + +struct lu_device_type; + +/* genops.c */ +struct obd_export *class_conn2export(struct lustre_handle *); +#ifdef HAVE_SERVER_SUPPORT +struct obd_type *class_add_symlinks(const char *name, bool enable_proc); +#endif +int class_register_type(const struct obd_ops *dt_ops, + const struct md_ops *md_ops, bool enable_proc, + const char *nm, struct lu_device_type *ldt); +int class_unregister_type(const char *nm); + +struct obd_device *class_newdev(const char *type_name, const char *name, + const char *uuid); +int class_register_device(struct obd_device *obd); +void class_unregister_device(struct obd_device *obd); +void class_free_dev(struct obd_device *obd); + +struct obd_device *class_dev_by_str(const char *str); +int class_name2dev(const char *name); +struct obd_device *class_name2obd(const char *name); +int class_uuid2dev(struct obd_uuid *uuid); +struct obd_device *class_uuid2obd(struct obd_uuid *uuid); +void class_obd_list(void); +struct obd_device *class_find_client_obd(struct obd_uuid *tgt_uuid, + const char *type_name, + struct obd_uuid *grp_uuid); +struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid, + int *next); +struct obd_device *class_num2obd(int num); +int get_devices_count(void); + +int class_notify_sptlrpc_conf(const char *fsname, int namelen); + +static inline char *obd_export_nid2str(struct obd_export *exp) +{ + return exp->exp_connection == NULL ? + "" : libcfs_nidstr(&exp->exp_connection->c_peer.nid); +} + +static inline char *obd_import_nid2str(struct obd_import *imp) +{ + return imp->imp_connection == NULL ? 
+ "" : libcfs_nidstr(&imp->imp_connection->c_peer.nid); +} + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid); +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid); +int obd_connect_flags2str(char *page, int count, __u64 flags, __u64 flags2, + const char *sep); + +int obd_zombie_impexp_init(void); +void obd_zombie_impexp_stop(void); +void obd_zombie_impexp_cull(void); +void obd_zombie_barrier(void); +void obd_exports_barrier(struct obd_device *obd); +int kuc_len(int payload_len); +struct kuc_hdr * kuc_ptr(void *p); +void *kuc_alloc(int payload_len, int transport, int type); +void kuc_free(void *p, int payload_len); +int obd_get_request_slot(struct client_obd *cli); +void obd_put_request_slot(struct client_obd *cli); +__u32 obd_get_max_rpcs_in_flight(struct client_obd *cli); +int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max); +__u16 obd_get_max_mod_rpcs_in_flight(struct client_obd *cli); +int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max); +int obd_mod_rpc_stats_seq_show(struct client_obd *cli, struct seq_file *seq); + +__u16 obd_get_mod_rpc_slot(struct client_obd *cli, __u32 opc); +void obd_put_mod_rpc_slot(struct client_obd *cli, __u32 opc, __u16 tag); + +struct llog_handle; +struct llog_rec_hdr; +typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *, + struct llog_rec_hdr *, void *); + +struct obd_export *obd_stale_export_get(void); +void obd_stale_export_put(struct obd_export *exp); +void obd_stale_export_adjust(struct obd_export *exp); + +/* obd_config.c */ +/* For interoperability */ +struct cfg_interop_param { + char *old_param; + char *new_param; +}; + +char *lustre_cfg_string(struct lustre_cfg *lcfg, u32 index); +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name); +void print_lustre_cfg(struct lustre_cfg *lcfg); +int class_process_config(struct lustre_cfg *lcfg); +ssize_t class_set_global(const char *param); +ssize_t class_modify_config(struct lustre_cfg *lcfg, const char *prefix, + struct kobject *kobj); +int class_attach(struct lustre_cfg *lcfg); +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg); + +int class_find_param(char *buf, char *key, char **valp); +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr); +int class_get_next_param(char **params, char *copy); +int class_match_param(char *buf, const char *key, char **valp); +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_net(char *buf, u32 *net, char **endh); +int class_match_nid(char *buf, char *key, lnet_nid_t nid); +int class_match_net(char *buf, char *key, u32 net); + +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, const void *source); +void class_decref(struct obd_device *obd, + const char *scope, const void *source); +void dump_exports(struct obd_device *obd, int locks, int debug_level); +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data); +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg); + +#define CFG_F_START 0x01 /* Set when we start updating from a log */ +#define CFG_F_MARKER 0x02 /* We are within a maker */ +#define CFG_F_SKIP 0x04 /* We should ignore this cfg command */ 
+#define CFG_F_EXCLUDE 0x10 /* OST exclusion list */ + +/* Passed as data param to class_config_parse_llog */ +struct config_llog_instance { + unsigned long cfg_instance; + struct super_block *cfg_sb; + struct obd_uuid cfg_uuid; + llog_cb_t cfg_callback; + int cfg_last_idx; /* for partial llog processing */ + int cfg_flags; + __u32 cfg_lwp_idx; + __u32 cfg_sub_clds; +}; +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg); + +/** + * Generate a unique configuration instance for this mount + * + * Temporary hack to bypass ASLR in 4.15+ kernels, a better fix soon. + * For now, use the same value as before - the superblock pointer value. + * + * Using the client UUID would be an option, but it needs more testing. + */ +static inline unsigned long ll_get_cfg_instance(struct super_block *sb) +{ + return (unsigned long)sb; +} + +#define CONFIG_SUB_SPTLRPC 0x01 +#define CONFIG_SUB_RECOVER 0x02 +#define CONFIG_SUB_PARAMS 0x04 +#define CONFIG_SUB_NODEMAP 0x08 +#define CONFIG_SUB_BARRIER 0x10 + +/* Sub clds should be attached to the config_llog_data when processing + * config log for client or server target. */ +#define CONFIG_SUB_CLIENT (CONFIG_SUB_SPTLRPC | CONFIG_SUB_RECOVER | \ + CONFIG_SUB_PARAMS) +#define CONFIG_SUB_SERVER (CONFIG_SUB_CLIENT | CONFIG_SUB_NODEMAP | \ + CONFIG_SUB_BARRIER) + +#define PARAMS_FILENAME "params" +#define BARRIER_FILENAME "barrier" +#define LCTL_UPCALL "lctl" + +static inline bool logname_is_barrier(const char *logname) +{ + char *ptr; + + /* logname for barrier is "fsname-barrier" */ + ptr = strstr(logname, BARRIER_FILENAME); + if (ptr && (ptr - logname) >= 2 && + *(ptr - 1) == '-' && *(ptr + 7) == '\0') + return true; + + return false; +} + +/* list of active configuration logs */ +struct config_llog_data { + struct ldlm_res_id cld_resid; + struct lustre_handle cld_lockh; + struct config_llog_instance cld_cfg; + struct list_head cld_list_chain;/* on config_llog_list */ + atomic_t cld_refcount; + struct config_llog_data *cld_sptlrpc;/* depended sptlrpc log */ + struct config_llog_data *cld_params; /* common parameters log */ + struct config_llog_data *cld_recover;/* imperative recover log */ + struct config_llog_data *cld_nodemap;/* nodemap log */ + struct config_llog_data *cld_barrier;/* barrier log (for MDT only) */ + struct obd_export *cld_mgcexp; + struct mutex cld_lock; + enum mgs_cfg_type cld_type; + unsigned int cld_stopping:1, /* we were told to stop + * watching */ + cld_lostlock:1; /* lock not requeued */ + char cld_logname[0]; +}; + +struct lustre_profile { + struct list_head lp_list; + char *lp_profile; + char *lp_dt; + char *lp_md; + int lp_refs; + bool lp_list_deleted; +}; + +struct lustre_profile *class_get_profile(const char * prof); +void class_del_profile(const char *prof); +void class_put_profile(struct lustre_profile *lprof); +void class_del_profiles(void); + + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *, struct ldlm_lock *); +void __class_export_del_lock_ref(struct obd_export *, struct ldlm_lock *); +extern void (*class_export_dump_hook)(struct obd_export *); + +#else + +#define __class_export_add_lock_ref(exp, lock) do {} while(0) +#define __class_export_del_lock_ref(exp, lock) do {} while(0) + +#endif + +#define class_export_rpc_inc(exp) \ +({ \ + atomic_inc(&(exp)->exp_rpc_count); \ + CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n", \ + (exp), atomic_read(&(exp)->exp_rpc_count)); \ +}) + +#define 
class_export_rpc_dec(exp) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_rpc_count); \ + atomic_dec(&(exp)->exp_rpc_count); \ + CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n", \ + (exp), atomic_read(&(exp)->exp_rpc_count)); \ +}) + +#define class_export_lock_get(exp, lock) \ +({ \ + atomic_inc(&(exp)->exp_locks_count); \ + __class_export_add_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_get(exp); \ +}) + +#define class_export_lock_put(exp, lock) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_locks_count); \ + atomic_dec(&(exp)->exp_locks_count); \ + __class_export_del_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_put(exp); \ +}) + +#define class_export_cb_get(exp) \ +({ \ + atomic_inc(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback GETting export %p : new cb_count %d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_get(exp); \ +}) + +#define class_export_cb_put(exp) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_cb_count); \ + atomic_dec(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback PUTting export %p : new cb_count %d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_put(exp); \ +}) + +/* genops.c */ +struct obd_export *class_export_get(struct obd_export *exp); +void class_export_put(struct obd_export *exp); +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid); +struct obd_export *class_new_export_self(struct obd_device *obd, + struct obd_uuid *uuid); +void class_unlink_export(struct obd_export *exp); + +struct obd_import *class_import_get(struct obd_import *); +void class_import_put(struct obd_import *); +struct obd_import *class_new_import(struct obd_device *obd); +void class_destroy_import(struct obd_import *exp); + +#ifdef HAVE_SERVER_SUPPORT +struct obd_type *class_search_type(const char *name); +struct obd_type *class_get_type(const char *name); +#endif +void class_put_type(struct obd_type *type); +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid); +int class_disconnect(struct obd_export *exp); +void class_fail_export(struct obd_export *exp); +int class_connected_export(struct obd_export *exp); +void class_disconnect_exports(struct obd_device *obd); +int class_manual_cleanup(struct obd_device *obd); +void class_disconnect_stale_exports(struct obd_device *, + int (*test_export)(struct obd_export *)); + +static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) +{ + return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) | + (obd->obd_force ? OBD_OPT_FORCE : 0) | + (obd->obd_abort_recovery ? 
OBD_OPT_ABORT_RECOV : 0) | + 0); +} + +#ifdef HAVE_SERVER_SUPPORT +static inline struct lu_target *class_exp2tgt(struct obd_export *exp) +{ + LASSERT(exp->exp_obd); + if (exp->exp_obd->u.obt.obt_magic != OBT_MAGIC) + return NULL; + return exp->exp_obd->u.obt.obt_lut; +} + +static inline struct lr_server_data *class_server_data(struct obd_device *obd) +{ + LASSERT(obd->u.obt.obt_lut); + return &obd->u.obt.obt_lut->lut_lsd; +} +#endif + +/* obdo.c */ +struct lu_attr; +struct inode; + +void obdo_from_la(struct obdo *dst, const struct lu_attr *la, u64 valid); +void la_from_obdo(struct lu_attr *la, const struct obdo *dst, u64 valid); + +void obdo_cpy_md(struct obdo *dst, const struct obdo *src, u64 valid); +void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj); + +#define OBP(dev, op) (dev)->obd_type->typ_dt_ops->o_ ## op +#define MDP(dev, op) (dev)->obd_type->typ_md_ops->m_ ## op + +static inline int obd_check_dev(struct obd_device *obd) +{ + if (!obd) { + CERROR("NULL device\n"); + return -ENODEV; + } + return 0; +} + +/* ensure obd_setup and !obd_stopping */ +#define OBD_CHECK_DEV_ACTIVE(obd) \ +do { \ + rc = obd_check_dev(obd); \ + if (rc) \ + return rc; \ + \ + if (!(obd)->obd_set_up || (obd)->obd_stopping) { \ + CERROR("Device %d not setup\n", \ + (obd)->obd_minor); \ + RETURN(-ENODEV); \ + } \ +} while (0) + + +static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat* tmp) +{ + /* Always add in ldlm_stats */ + tmp->nid_ldlm_stats = + lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC, + LPROCFS_STATS_FLAG_NOPERCPU); + if (tmp->nid_ldlm_stats == NULL) + return -ENOMEM; + + lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats); + + return lprocfs_register_stats(tmp->nid_proc, "ldlm_stats", + tmp->nid_ldlm_stats); +} + +static inline int exp_check_ops(struct obd_export *exp) +{ + if (exp == NULL) { + RETURN(-ENODEV); + } + if (exp->exp_obd == NULL || !exp->exp_obd->obd_type) { + RETURN(-EOPNOTSUPP); + } + RETURN(0); +} + +static inline int class_devno_max(void) +{ + return MAX_OBD_DEVICES; +} + +static inline int obd_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, + __u32 *vallen, void *val) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_get_info) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val); + RETURN(rc); +} + +static inline int obd_set_info_async(const struct lu_env *env, + struct obd_export *exp, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_set_info_async) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen, + val, set); + RETURN(rc); +} + +/* + * obd-lu integration. + * + * Functionality is being moved into new lu_device-based layering, but some + * pieces of configuration process are still based on obd devices. + * + * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully + * subsume ->o_setup() methods of obd devices they replace. The same for + * lu_device_operations::ldo_process_config() and ->o_process_config(). As a + * result, obd_setup() and obd_process_config() branch and call one XOR + * another. 
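+ *
+ * In condensed form (see obd_setup() below for the real logic):
+ *
+ *   if (obd->obd_type->typ_lu != NULL)
+ *           obd->obd_lu_dev = ldt->ldt_ops->ldto_device_alloc(env, ldt, cfg);
+ *   else
+ *           rc = OBP(obd, setup)(obd, cfg);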
+ * + * Yet neither lu_device_type_operations::ldto_device_fini() nor + * lu_device_type_operations::ldto_device_free() fully implement the + * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence, + * obd_precleanup() and obd_cleanup() call both lu_device and obd operations. + */ +static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) +{ + int rc; + struct obd_type *type = obd->obd_type; + struct lu_device_type *ldt; + + ENTRY; + + wait_var_event(&type->typ_lu, + smp_load_acquire(&type->typ_lu) != OBD_LU_TYPE_SETUP); + ldt = type->typ_lu; + if (ldt != NULL) { + struct lu_context session_ctx; + struct lu_env env; + + lu_context_init(&session_ctx, LCT_SESSION | LCT_SERVER_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + struct lu_device *dev; + env.le_ses = &session_ctx; + dev = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg); + lu_env_fini(&env); + if (!IS_ERR(dev)) { + obd->obd_lu_dev = dev; + dev->ld_obd = obd; + rc = 0; + } else + rc = PTR_ERR(dev); + } + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + } else { + if (!obd->obd_type->typ_dt_ops->o_setup) { + CERROR("%s: no %s operation\n", obd->obd_name, + __func__); + RETURN(-EOPNOTSUPP); + } + rc = OBP(obd, setup)(obd, cfg); + } + RETURN(rc); +} + +static inline int obd_precleanup(struct obd_device *obd) +{ + int rc; + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d = obd->obd_lu_dev; + + ENTRY; + + if (ldt != NULL && d != NULL) { + struct lu_env *env = lu_env_find(); + struct lu_env _env; + + if (!env) { + env = &_env; + rc = lu_env_init(env, ldt->ldt_ctx_tags); + LASSERT(rc == 0); + lu_env_add(env); + } + ldt->ldt_ops->ldto_device_fini(env, d); + if (env == &_env) { + lu_env_remove(env); + lu_env_fini(env); + } + } + + if (!obd->obd_type->typ_dt_ops->o_precleanup) + RETURN(0); + + rc = OBP(obd, precleanup)(obd); + RETURN(rc); +} + +static inline int obd_cleanup(struct obd_device *obd) +{ + int rc; + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d = obd->obd_lu_dev; + + ENTRY; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + ldt->ldt_ops->ldto_device_free(&env, d); + lu_env_fini(&env); + obd->obd_lu_dev = NULL; + } + } + if (!obd->obd_type->typ_dt_ops->o_cleanup) + RETURN(0); + + rc = OBP(obd, cleanup)(obd); + RETURN(rc); +} + +static inline void obd_cleanup_client_import(struct obd_device *obd) +{ + ENTRY; + + /* If we set up but never connected, the client import will not + * have been cleaned. 
+ */ + down_write(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) { + struct obd_import *imp; + + imp = obd->u.cli.cl_import; + CDEBUG(D_CONFIG, "%s: client import never connected\n", + obd->obd_name); + ptlrpc_invalidate_import(imp); + client_destroy_import(imp); + obd->u.cli.cl_import = NULL; + } + up_write(&obd->u.cli.cl_sem); + + EXIT; +} + +static inline int obd_process_config(struct obd_device *obd, int datalen, + void *data) +{ + int rc; + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d = obd->obd_lu_dev; + + ENTRY; + + obd->obd_process_conf = 1; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + rc = d->ld_ops->ldo_process_config(&env, d, data); + lu_env_fini(&env); + } + } else { + if (!obd->obd_type->typ_dt_ops->o_process_config) { + CERROR("%s: no %s operation\n", + obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + rc = OBP(obd, process_config)(obd, datalen, data); + } + + obd->obd_process_conf = 0; + + RETURN(rc); +} + +static inline int obd_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_create) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, create)(env, exp, obdo); + RETURN(rc); +} + +static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_destroy) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, destroy)(env, exp, obdo); + RETURN(rc); +} + +static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + int rc; + + ENTRY; + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_getattr) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, getattr)(env, exp, oa); + + RETURN(rc); +} + +static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + int rc; + + ENTRY; + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_setattr) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, setattr)(env, exp, oa); + + RETURN(rc); +} + +static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_add_conn) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, add_conn)(imp, uuid, priority); + RETURN(rc); +} + +static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_del_conn) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, del_conn)(imp, uuid); + RETURN(rc); +} + +static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp) +{ + struct obd_uuid *uuid; + 
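+	/* Same guarded-dispatch pattern as the other obd_*() wrappers in this
+	 * header: look the method up in the type's dt_ops table via OBP() and
+	 * treat a missing ->o_get_uuid as "no UUID available" (return NULL). */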
ENTRY; + + if (!exp->exp_obd->obd_type || + !exp->exp_obd->obd_type->typ_dt_ops->o_get_uuid) + RETURN(NULL); + + uuid = OBP(exp->exp_obd, get_uuid)(exp); + RETURN(uuid); +} + +/** Create a new /a exp on device /a obd for the uuid /a cluuid + * @param exp New export handle + * @param d Connect data, supported flags are set, flags also understood + * by obd are returned. + */ +static inline int obd_connect(const struct lu_env *env, + struct obd_export **exp,struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data, + void *localdata) +{ + int rc; + __u64 ocf = data ? data->ocd_connect_flags : 0; /* for post-condition + * check */ + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_connect) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata); + /* check that only subset is granted */ + LASSERT(ergo(data != NULL, (data->ocd_connect_flags & ocf) == + data->ocd_connect_flags)); + RETURN(rc); +} + +static inline int obd_reconnect(const struct lu_env *env, + struct obd_export *exp, + struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *d, + void *localdata) +{ + int rc; + __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition + * check */ + + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_reconnect) + RETURN(0); + + rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata); + /* check that only subset is granted */ + LASSERT(ergo(d != NULL, + (d->ocd_connect_flags & ocf) == d->ocd_connect_flags)); + RETURN(rc); +} + +static inline int obd_disconnect(struct obd_export *exp) +{ + int rc; + ENTRY; + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_disconnect) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, disconnect)(exp); + RETURN(rc); +} + +static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type) +{ + int rc; + ENTRY; + + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_fid_init) + RETURN(0); + + rc = OBP(obd, fid_init)(obd, exp, type); + RETURN(rc); +} + +static inline int obd_fid_fini(struct obd_device *obd) +{ + int rc; + ENTRY; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_fid_fini) + RETURN(0); + + rc = OBP(obd, fid_fini)(obd); + RETURN(rc); +} + +static inline int obd_fid_alloc(const struct lu_env *env, + struct obd_export *exp, + struct lu_fid *fid, + struct md_op_data *op_data) +{ + int rc; + ENTRY; + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_fid_alloc) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, fid_alloc)(env, exp, fid, op_data); + RETURN(rc); +} + +static inline int obd_pool_new(struct obd_device *obd, char *poolname) +{ + int rc; + ENTRY; + + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_new) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, pool_new)(obd, poolname); + RETURN(rc); +} + +static inline int obd_pool_del(struct obd_device *obd, char *poolname) +{ + int rc; + ENTRY; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_del) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, 
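+		 /* OBP(obd, pool_del) pastes to
+		  * obd->obd_type->typ_dt_ops->o_pool_del (see OBP() above) */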
pool_del)(obd, poolname); + RETURN(rc); +} + +static inline int obd_pool_add(struct obd_device *obd, char *poolname, + char *ostname) +{ + int rc; + ENTRY; + + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_add) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, pool_add)(obd, poolname, ostname); + RETURN(rc); +} + +static inline int obd_pool_rem(struct obd_device *obd, char *poolname, + char *ostname) +{ + int rc; + + ENTRY; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_rem) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, pool_rem)(obd, poolname, ostname); + RETURN(rc); +} + +static inline int obd_init_export(struct obd_export *exp) +{ + int rc = 0; + + ENTRY; + if (exp->exp_obd != NULL && exp->exp_obd->obd_type && + OBP((exp)->exp_obd, init_export)) + rc = OBP(exp->exp_obd, init_export)(exp); + RETURN(rc); +} + +static inline int obd_destroy_export(struct obd_export *exp) +{ + ENTRY; + if (exp->exp_obd != NULL && exp->exp_obd->obd_type && + OBP(exp->exp_obd, destroy_export)) + OBP(exp->exp_obd, destroy_export)(exp); + RETURN(0); +} + +/* @max_age is the oldest time in seconds that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of 'ktime_get_seconds() + X' to guarantee freshness. + */ +static inline int obd_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, + time64_t max_age, + struct ptlrpc_request_set *rqset) +{ + struct obd_device *obd; + int rc = 0; + + ENTRY; + + if (exp == NULL || exp->exp_obd == NULL) + RETURN(-EINVAL); + + obd = exp->exp_obd; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_statfs) { + rc = -EOPNOTSUPP; + CERROR("%s: no statfs operation: rc = %d\n", obd->obd_name, rc); + RETURN(rc); + } + + CDEBUG(D_SUPER, "%s: age %lld, max_age %lld\n", + obd->obd_name, obd->obd_osfs_age, max_age); + rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); + + RETURN(rc); +} + +/* @max_age is the oldest time in seconds that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of 'ktime_get_seconds() + X' to guarantee freshness. + */ +static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, + __u32 flags) +{ + struct obd_device *obd; + int rc = 0; + + ENTRY; + if (unlikely(exp == NULL || exp->exp_obd == NULL)) + RETURN(-EINVAL); + + obd = exp->exp_obd; + OBD_CHECK_DEV_ACTIVE(obd); + + if (unlikely(!obd->obd_type || !obd->obd_type->typ_dt_ops->o_statfs)) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + CDEBUG(D_SUPER, "%s: age %lld, max_age %lld\n", + obd->obd_name, obd->obd_osfs_age, max_age); + /* ignore cache if aggregated isn't expected */ + if (obd->obd_osfs_age < max_age || + ((obd->obd_osfs.os_state & OS_STATFS_SUM) && + !(flags & OBD_STATFS_SUM))) { + /* the RPC will block anyway, so avoid sending many at once */ + rc = mutex_lock_interruptible_nested(&obd->obd_dev_mutex, + (flags & OBD_STATFS_NESTED) + ? 
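+					/* OBD_STATFS_NESTED: a nested statfs
+					 * may take a second obd_dev_mutex, so
+					 * tell lockdep by using a separate
+					 * subclass for this acquisition */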
SINGLE_DEPTH_NESTING : 0); + if (rc) + RETURN(rc); + if (obd->obd_osfs_age < max_age || + ((obd->obd_osfs.os_state & OS_STATFS_SUM) && + !(flags & OBD_STATFS_SUM))) { + rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); + } else { + mutex_unlock(&obd->obd_dev_mutex); + GOTO(cached, rc = 0); + } + if (rc == 0) { + CDEBUG(D_SUPER, + "%s: update %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + osfs->os_bavail, osfs->os_blocks, + osfs->os_ffree, osfs->os_files); + + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs)); + obd->obd_osfs_age = ktime_get_seconds(); + spin_unlock(&obd->obd_osfs_lock); + } + mutex_unlock(&obd->obd_dev_mutex); + } else { +cached: + CDEBUG(D_SUPER, + "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(osfs, &obd->obd_osfs, sizeof(*osfs)); + spin_unlock(&obd->obd_osfs_lock); + } + RETURN(rc); +} + +static inline int obd_preprw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int *pages, + struct niobuf_local *local) +{ + int rc; + + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_preprw) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote, + pages, local); + + RETURN(rc); +} + +static inline int obd_commitrw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *rnb, int pages, + struct niobuf_local *local, const int orig_rc, + int nob, ktime_t kstart) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_commitrw) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj, + rnb, pages, local, orig_rc, nob, + kstart); + + RETURN(rc); +} + +static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void __user *uarg) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_iocontrol) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg); + RETURN(rc); +} + +static inline void obd_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + ENTRY; + if (!obd) { + CERROR("NULL device\n"); + EXIT; + return; + } + + if (obd->obd_set_up && OBP(obd, import_event)) + OBP(obd, import_event)(obd, imp, event); + + EXIT; +} + +static inline int obd_notify(struct obd_device *obd, + struct obd_device *watched, + enum obd_notify_event ev) +{ + int rc; + ENTRY; + + rc = obd_check_dev(obd); + if (rc) + return rc; + + if (!obd->obd_set_up) { + CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name); + RETURN(-EINVAL); + } + + if (!OBP(obd, notify)) { + CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name); + RETURN(-ENOSYS); + } + + rc = OBP(obd, notify)(obd, watched, ev); + + RETURN(rc); +} + +static inline int obd_notify_observer(struct 
obd_device *observer, + struct obd_device *observed, + enum obd_notify_event ev) +{ + int rc = 0; + int rc2 = 0; + struct obd_notify_upcall *onu; + + if (observer->obd_observer) + rc = obd_notify(observer->obd_observer, observed, ev); + + /* + * Also, call non-obd listener, if any + */ + onu = &observer->obd_upcall; + if (onu->onu_upcall != NULL) + rc2 = onu->onu_upcall(observer, observed, ev, onu->onu_owner); + + return rc ? rc : rc2; +} + +static inline int obd_quotactl(struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_quotactl) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl); + RETURN(rc); +} + +static inline int obd_health_check(const struct lu_env *env, + struct obd_device *obd) +{ + /* returns: 0 on healthy + * >0 on unhealthy + reason code/flag + * however the only suppored reason == 1 right now + * We'll need to define some better reasons + * or flags in the future. + * <0 on error + */ + int rc; + + ENTRY; + + /* NULL method is normal here */ + if (obd == NULL || !obd->obd_type) { + CERROR("cleaned up obd\n"); + RETURN(-EOPNOTSUPP); + } + if (!obd->obd_set_up || obd->obd_stopping) + RETURN(0); + if (!OBP(obd, health_check)) + RETURN(0); + + rc = OBP(obd, health_check)(env, obd); + RETURN(rc); +} + +static inline int obd_register_observer(struct obd_device *obd, + struct obd_device *observer) +{ + int rc; + ENTRY; + + rc = obd_check_dev(obd); + if (rc) + return rc; + + down_write(&obd->obd_observer_link_sem); + if (obd->obd_observer && observer) { + up_write(&obd->obd_observer_link_sem); + RETURN(-EALREADY); + } + obd->obd_observer = observer; + up_write(&obd->obd_observer_link_sem); + RETURN(0); +} + +/* metadata helpers */ +enum mps_stat_idx { + LPROC_MD_CLOSE, + LPROC_MD_CREATE, + LPROC_MD_ENQUEUE, + LPROC_MD_GETATTR, + LPROC_MD_INTENT_LOCK, + LPROC_MD_LINK, + LPROC_MD_RENAME, + LPROC_MD_SETATTR, + LPROC_MD_FSYNC, + LPROC_MD_READ_PAGE, + LPROC_MD_UNLINK, + LPROC_MD_SETXATTR, + LPROC_MD_GETXATTR, + LPROC_MD_INTENT_GETATTR_ASYNC, + LPROC_MD_REVALIDATE_LOCK, + LPROC_MD_LAST_OPC, +}; + +static inline int md_get_root(struct obd_export *exp, const char *fileset, + struct lu_fid *fid) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, get_root)(exp, fileset, fid); +} + +static inline int md_getattr(struct obd_export *exp, + struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_GETATTR); + + return MDP(exp->exp_obd, getattr)(exp, op_data, request); +} + +static inline int md_null_inode(struct obd_export *exp, + const struct lu_fid *fid) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, null_inode)(exp, fid); +} + +static inline int md_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CLOSE); + + return MDP(exp->exp_obd, close)(exp, op_data, mod, request); +} + +static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, kernel_cap_t 
cap_effective, + __u64 rdev, struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CREATE); + + return MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, + uid, gid, cap_effective, rdev, + request); +} + +static inline int md_enqueue(struct obd_export *exp, + struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct md_op_data *op_data, + struct lustre_handle *lockh, + __u64 extra_lock_flags) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_ENQUEUE); + + return MDP(exp->exp_obd, enqueue)(exp, einfo, policy, op_data, lockh, + extra_lock_flags); +} + +static inline int md_getattr_name(struct obd_export *exp, + struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, getattr_name)(exp, op_data, request); +} + +static inline int md_intent_lock(struct obd_export *exp, + struct md_op_data *op_data, + struct lookup_intent *it, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_INTENT_LOCK); + + return MDP(exp->exp_obd, intent_lock)(exp, op_data, it, reqp, + cb_blocking, extra_lock_flags); +} + +static inline int md_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_LINK); + + return MDP(exp->exp_obd, link)(exp, op_data, request); +} + +static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old_name, size_t oldlen, + const char *new_name, size_t newlen, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_RENAME); + + return MDP(exp->exp_obd, rename)(exp, op_data, old_name, oldlen, + new_name, newlen, request); +} + +static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_SETATTR); + + return MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, request); +} + +static inline int md_fsync(struct obd_export *exp, const struct lu_fid *fid, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_FSYNC); + + return MDP(exp->exp_obd, fsync)(exp, fid, request); +} + +/* FLR: resync mirrored files. 
*/ +static inline int md_file_resync(struct obd_export *exp, + struct md_op_data *data) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, file_resync)(exp, data); +} + +static inline int md_read_page(struct obd_export *exp, + struct md_op_data *op_data, + struct md_readdir_info *mrinfo, + __u64 hash_offset, struct page **ppage) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_READ_PAGE); + + return MDP(exp->exp_obd, read_page)(exp, op_data, mrinfo, hash_offset, + ppage); +} + +static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_UNLINK); + + return MDP(exp->exp_obd, unlink)(exp, op_data, request); +} + +static inline int md_get_lustre_md(struct obd_export *exp, + struct req_capsule *pill, + struct obd_export *dt_exp, + struct obd_export *md_exp, + struct lustre_md *md) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, get_lustre_md)(exp, pill, dt_exp, md_exp, md); +} + +static inline int md_free_lustre_md(struct obd_export *exp, + struct lustre_md *md) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, free_lustre_md)(exp, md); +} + +static inline int md_merge_attr(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, + ldlm_blocking_callback cb) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, merge_attr)(exp, lsm, attr, cb); +} + +static inline int md_setxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, + const void *value, size_t value_size, + unsigned int xattr_flags, u32 suppgid, + struct ptlrpc_request **req) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_SETXATTR); + + return MDP(exp->exp_obd, setxattr)(exp, fid, obd_md_valid, name, + value, value_size, xattr_flags, + suppgid, req); +} + +static inline int md_getxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, + size_t buf_size, struct ptlrpc_request **req) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_GETXATTR); + + return MDP(exp->exp_obd, getxattr)(exp, fid, obd_md_valid, name, + buf_size, req); +} + +static inline int md_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, set_open_replay_data)(exp, och, it); +} + +static inline int md_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, clear_open_replay_data)(exp, och); +} + +static inline int md_set_lock_data(struct obd_export *exp, + const struct lustre_handle *lockh, + void *data, __u64 *bits) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits); +} + +static inline +int md_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, enum ldlm_mode mode, + enum ldlm_cancel_flags 
cancel_flags, void *opaque) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, + cancel_flags, opaque); +} + +static inline enum ldlm_mode md_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lockh) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, + policy, mode, lockh); +} + +static inline int md_init_ea_size(struct obd_export *exp, __u32 ea_size, + __u32 def_ea_size) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, init_ea_size)(exp, ea_size, def_ea_size); +} + +static inline int md_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_INTENT_GETATTR_ASYNC); + + return MDP(exp->exp_obd, intent_getattr_async)(exp, minfo); +} + +static inline int md_revalidate_lock(struct obd_export *exp, + struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_REVALIDATE_LOCK); + + return MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); +} + +static inline int md_get_fid_from_lsm(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + const char *name, int namelen, + struct lu_fid *fid) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, get_fid_from_lsm)(exp, lsm, name, namelen, + fid); +} + +/* Unpack an MD struct from disk to in-memory format. + * Returns +ve size of unpacked MD (0 for free), or -ve error. + * + * If *plsm != NULL and lmm == NULL then *lsm will be freed. + * If *plsm == NULL then it will be allocated. 
+ */ +static inline int md_unpackmd(struct obd_export *exp, + struct lmv_stripe_md **plsm, + const union lmv_mds_md *lmm, size_t lmm_size) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, unpackmd)(exp, plsm, lmm, lmm_size); +} + +static inline int md_rmfid(struct obd_export *exp, struct fid_array *fa, + int *rcs, struct ptlrpc_request_set *set) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, rmfid)(exp, fa, rcs, set); +} + +/* OBD Metadata Support */ + +extern int obd_init_caches(void); +extern void obd_cleanup_caches(void); + +typedef int (*register_lwp_cb)(void *data); + +struct lwp_register_item { + struct obd_export **lri_exp; + register_lwp_cb lri_cb_func; + void *lri_cb_data; + struct list_head lri_list; + atomic_t lri_ref; + char lri_name[MTI_NAME_MAXLEN]; +}; + +/* obd_mount.c */ +#ifdef HAVE_SERVER_SUPPORT +int lustre_register_lwp_item(const char *lwpname, struct obd_export **exp, + register_lwp_cb cb_func, void *cb_data); +void lustre_deregister_lwp_item(struct obd_export **exp); +struct obd_export *lustre_find_lwp_by_index(const char *dev, __u32 idx); +void lustre_notify_lwp_list(struct obd_export *exp); +int tgt_name2lwp_name(const char *tgt_name, char *lwp_name, int len, __u32 idx); +int lustre_tgt_register_fs(void); +void lustre_tgt_unregister_fs(void); +#endif /* HAVE_SERVER_SUPPORT */ +int lustre_check_exclusion(struct super_block *sb, char *svname); + +/* lustre_peer.c */ +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index); +int class_add_uuid(const char *uuid, __u64 nid); +int class_del_uuid (const char *uuid); +int class_add_nids_to_uuid(struct obd_uuid *uuid, lnet_nid_t *nids, + int nid_count); +int class_check_uuid(struct obd_uuid *uuid, __u64 nid); + +/* class_obd.c */ +extern char obd_jobid_name[]; + +extern unsigned int obd_lbug_on_eviction; +extern unsigned int obd_dump_on_eviction; + +static inline bool do_dump_on_eviction(struct obd_device *exp_obd) +{ + if (obd_lbug_on_eviction && + strncmp(exp_obd->obd_type->typ_name, LUSTRE_MGC_NAME, + strlen(LUSTRE_MGC_NAME))) { + CERROR("LBUG upon eviction\n"); + LBUG(); + } + + return obd_dump_on_eviction; +} + +/* statfs_pack.c */ +struct kstatfs; +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs); +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs); + +/* root squash info */ +struct root_squash_info { + uid_t rsi_uid; + gid_t rsi_gid; + struct list_head rsi_nosquash_nids; + spinlock_t rsi_lock; +}; + +int server_name2index(const char *svname, __u32 *idx, const char **endptr); + +/* linux-module.c */ +struct obd_ioctl_data; +int obd_ioctl_getdata(struct obd_ioctl_data **data, int *len, void __user *arg); +int class_procfs_init(void); +int class_procfs_clean(void); + +extern void obd_heat_add(struct obd_heat_instance *instance, + unsigned int time_second, __u64 count, + unsigned int weight, unsigned int period_second); +extern void obd_heat_decay(struct obd_heat_instance *instance, + __u64 time_second, unsigned int weight, + unsigned int period_second); +extern __u64 obd_heat_get(struct obd_heat_instance *instance, + unsigned int time_second, unsigned int weight, + unsigned int period_second); +extern void obd_heat_clear(struct obd_heat_instance *instance, int count); + +/* struct kobj_type */ +static inline +struct attribute *_get_attr_matches(const struct kobj_type *typ, + const char *key, size_t keylen, + int (*is_match)(const char *, const char *, + size_t)) +{ + int i; + +#ifdef 
HAVE_KOBJ_TYPE_DEFAULT_GROUPS + for (i = 0; typ->default_groups[i]; i++) { + int k; + struct attribute **attrs; + + attrs = (struct attribute **)typ->default_groups[i]->attrs; + for (k = 0; attrs[k]; k++) { + if (is_match(attrs[k]->name, key, keylen)) + return (struct attribute *)attrs[k]; + } + } +#else + for (i = 0; typ->default_attrs[i]; i++) { + if (is_match(typ->default_attrs[i]->name, key, keylen)) + return typ->default_attrs[i]; + } +#endif + return NULL; +} + +static inline +int _attr_name_exact(const char *attr_name, const char *key, size_t len) +{ + return !strcmp(attr_name, key); +} + +static inline +struct attribute *get_attr_by_name(const struct kobj_type *typ, + const char *name) +{ + return _get_attr_matches(typ, name, 0, _attr_name_exact); +} + +static inline +int _attr_name_starts_with(const char *attr_name, const char *name, size_t len) +{ + return !strncmp(attr_name, name, len); +} + +static inline +struct attribute *get_attr_starts_with(const struct kobj_type *typ, + const char *name, + size_t len) +{ + return _get_attr_matches(typ, name, len, _attr_name_starts_with); +} + +#endif /* __LINUX_OBD_CLASS_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_support.h b/drivers/staging/lustrefsx/lustre/include/obd_support.h new file mode 100644 index 0000000000000..28d2650e11b06 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_support.h @@ -0,0 +1,1055 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _OBD_SUPPORT +#define _OBD_SUPPORT + +#include +#include +#include +#include +#include + +#include +#include +#include + +/* global variables */ +extern struct lprocfs_stats *obd_memory; +enum { + OBD_MEMORY_STAT = 0, + OBD_STATS_NUM, +}; + +extern unsigned int obd_debug_peer_on_timeout; +extern unsigned int obd_dump_on_timeout; +extern unsigned int obd_dump_on_eviction; +extern unsigned int obd_lbug_on_eviction; +/* obd_timeout should only be used for recovery, not for + networking / disk / timings affected by load (use Adaptive Timeouts) */ +extern unsigned int obd_timeout; /* seconds */ +extern unsigned int ldlm_timeout; /* seconds */ +extern unsigned int obd_timeout_set; +extern unsigned int ldlm_timeout_set; +extern unsigned int bulk_timeout; +extern unsigned int at_min; +extern unsigned int at_max; +extern unsigned int at_history; +extern int at_early_margin; +extern int at_extra; +extern unsigned long obd_max_dirty_pages; +extern atomic_long_t obd_dirty_pages; +extern char obd_jobid_var[]; + +/* Some hash init argument constants */ +#define HASH_NID_STATS_BKT_BITS 5 +#define HASH_NID_STATS_CUR_BITS 7 +#define HASH_NID_STATS_MAX_BITS 12 +#define HASH_GEN_BKT_BITS 5 +#define HASH_GEN_CUR_BITS 7 +#define HASH_GEN_MAX_BITS 12 +#define HASH_LQE_BKT_BITS 5 +#define HASH_LQE_CUR_BITS 7 +#define HASH_LQE_MAX_BITS 12 +#define HASH_EXP_LOCK_BKT_BITS 5 +#define HASH_EXP_LOCK_CUR_BITS 7 +#define HASH_EXP_LOCK_MAX_BITS 16 +#define HASH_JOB_STATS_BKT_BITS 5 +#define HASH_JOB_STATS_CUR_BITS 7 +#define HASH_JOB_STATS_MAX_BITS 12 + +/* Timeout definitions */ +#define OBD_TIMEOUT_DEFAULT 100 +#define LDLM_TIMEOUT_DEFAULT 20 +#define MDS_LDLM_TIMEOUT_DEFAULT 6 +/* Time to wait for all clients to reconnect during recovery (hard limit) */ +#define OBD_RECOVERY_TIME_HARD (obd_timeout * 9) +/* Time to wait for all clients to reconnect during recovery (soft limit) */ +/* Should be very conservative; must catch the first reconnect after reboot */ +#define OBD_RECOVERY_TIME_SOFT (obd_timeout * 3) +/* Change recovery-small 26b time if you change this */ +#define PING_INTERVAL max(obd_timeout / 4, 1U) +/* a bit more than maximal journal commit time in seconds */ +#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U) +/* Client may skip 1 ping; we must wait at least 2.5. But for multiple + * failover targets the client only pings one server at a time, and pings + * can be lost on a loaded network. Since eviction has serious consequences, + * and there's no urgent need to evict a client just because it's idle, we + * should be very conservative here. */ +#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6) +#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */ +#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */ + /* Max connect interval for nonresponsive servers; ~50s to avoid building up + connect requests in the LND queues, but within obd_timeout so we don't + miss the recovery window */ +#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN,obd_timeout)) +#define CONNECTION_SWITCH_INC 5 /* Connection timeout backoff */ +/* In general this should be low to have quick detection of a system + running on a backup server. (If it's too low, import_select_connection + will increase the timeout anyhow.) 
*/ +#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/20) +/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */ +#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \ + INITIAL_CONNECT_TIMEOUT) +/* The min time a target should wait for clients to reconnect in recovery */ +#define OBD_RECOVERY_TIME_MIN (2*RECONNECT_DELAY_MAX) +#define OBD_IR_FACTOR_MIN 1 +#define OBD_IR_FACTOR_MAX 10 +#define OBD_IR_FACTOR_DEFAULT (OBD_IR_FACTOR_MAX/2) +/* default timeout for the MGS to become IR_FULL */ +#define OBD_IR_MGS_TIMEOUT (4*obd_timeout) +/* Unlink should happen within this many seconds. */ +#define PTLRPC_REQ_LONG_UNLINK 300 + +/** + * Time interval of shrink, if the client is "idle" more than this interval, + * then the ll_grant thread will return the requested grant space to filter + */ +#define GRANT_SHRINK_INTERVAL 1200/*20 minutes*/ + +#define OBD_FAIL_MDS 0x100 +#define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 +#define OBD_FAIL_MDS_GETATTR_NET 0x102 +#define OBD_FAIL_MDS_GETATTR_PACK 0x103 +#define OBD_FAIL_MDS_READPAGE_NET 0x104 +#define OBD_FAIL_MDS_READPAGE_PACK 0x105 +#define OBD_FAIL_MDS_SENDPAGE 0x106 +#define OBD_FAIL_MDS_REINT_NET 0x107 +#define OBD_FAIL_MDS_REINT_UNPACK 0x108 +#define OBD_FAIL_MDS_REINT_SETATTR 0x109 +#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a +#define OBD_FAIL_MDS_REINT_CREATE 0x10b +#define OBD_FAIL_MDS_REINT_CREATE_WRITE 0x10c +#define OBD_FAIL_MDS_REINT_UNLINK 0x10d +#define OBD_FAIL_MDS_REINT_UNLINK_WRITE 0x10e +#define OBD_FAIL_MDS_REINT_LINK 0x10f +#define OBD_FAIL_MDS_REINT_LINK_WRITE 0x110 +#define OBD_FAIL_MDS_REINT_RENAME 0x111 +#define OBD_FAIL_MDS_REINT_RENAME_WRITE 0x112 +#define OBD_FAIL_MDS_OPEN_NET 0x113 +#define OBD_FAIL_MDS_OPEN_PACK 0x114 +#define OBD_FAIL_MDS_CLOSE_NET 0x115 +#define OBD_FAIL_MDS_CLOSE_PACK 0x116 +#define OBD_FAIL_MDS_CONNECT_NET 0x117 +#define OBD_FAIL_MDS_CONNECT_PACK 0x118 +#define OBD_FAIL_MDS_REINT_NET_REP 0x119 +#define OBD_FAIL_MDS_DISCONNECT_NET 0x11a +#define OBD_FAIL_MDS_GET_ROOT_NET 0x11b +#define OBD_FAIL_MDS_GET_ROOT_PACK 0x11c +#define OBD_FAIL_MDS_STATFS_PACK 0x11d +#define OBD_FAIL_MDS_STATFS_SUM_PACK 0x11d +#define OBD_FAIL_MDS_STATFS_NET 0x11e +#define OBD_FAIL_MDS_STATFS_SUM_NET 0x11e +#define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f +#define OBD_FAIL_MDS_PIN_NET 0x120 +#define OBD_FAIL_MDS_UNPIN_NET 0x121 +#define OBD_FAIL_MDS_ALL_REPLY_NET 0x122 +#define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123 +#define OBD_FAIL_MDS_SYNC_NET 0x124 +#define OBD_FAIL_MDS_SYNC_PACK 0x125 +/* OBD_FAIL_MDS_DONE_WRITING_NET 0x126 obsolete since 2.8.0 */ +/* OBD_FAIL_MDS_DONE_WRITING_PACK 0x127 obsolete since 2.8.0 */ +#define OBD_FAIL_MDS_ALLOC_OBDO 0x128 +#define OBD_FAIL_MDS_PAUSE_OPEN 0x129 +#define OBD_FAIL_MDS_STATFS_LCW_SLEEP 0x12a +#define OBD_FAIL_MDS_OPEN_CREATE 0x12b +#define OBD_FAIL_MDS_OST_SETATTR 0x12c +/* OBD_FAIL_MDS_QUOTACHECK_NET 0x12d obsolete since 2.4 */ +#define OBD_FAIL_MDS_QUOTACTL_NET 0x12e +#define OBD_FAIL_MDS_CLIENT_ADD 0x12f +#define OBD_FAIL_MDS_GETXATTR_NET 0x130 +#define OBD_FAIL_MDS_GETXATTR_PACK 0x131 +#define OBD_FAIL_MDS_SETXATTR_NET 0x132 +#define OBD_FAIL_MDS_SETXATTR 0x133 +#define OBD_FAIL_MDS_SETXATTR_WRITE 0x134 +#define OBD_FAIL_MDS_FS_SETUP 0x135 +#define OBD_FAIL_MDS_RESEND 0x136 +#define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x137 +#define OBD_FAIL_MDS_LOV_SYNC_RACE 0x138 +#define OBD_FAIL_MDS_OSC_PRECREATE 0x139 +#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a +#define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b +#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ 
0x13c +#define OBD_FAIL_MDS_DROP_QUOTA_REQ 0x13d +#define OBD_FAIL_MDS_REMOVE_COMMON_EA 0x13e +#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING 0x13f +#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140 +#define OBD_FAIL_MDS_LOV_PREP_CREATE 0x141 +#define OBD_FAIL_MDS_REINT_DELAY 0x142 +#define OBD_FAIL_MDS_READLINK_EPROTO 0x143 +#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144 +#define OBD_FAIL_MDS_PDO_LOCK 0x145 +#define OBD_FAIL_MDS_PDO_LOCK2 0x146 +#define OBD_FAIL_MDS_OSC_CREATE_FAIL 0x147 +#define OBD_FAIL_MDS_NEGATIVE_POSITIVE 0x148 +#define OBD_FAIL_MDS_HSM_STATE_GET_NET 0x149 +#define OBD_FAIL_MDS_HSM_STATE_SET_NET 0x14a +#define OBD_FAIL_MDS_HSM_PROGRESS_NET 0x14b +#define OBD_FAIL_MDS_HSM_REQUEST_NET 0x14c +#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d +#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET 0x14e +#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET 0x14f +#define OBD_FAIL_MDS_HSM_ACTION_NET 0x150 +#define OBD_FAIL_MDS_CHANGELOG_INIT 0x151 +#define OBD_FAIL_MDS_HSM_SWAP_LAYOUTS 0x152 +#define OBD_FAIL_MDS_RENAME 0x153 +#define OBD_FAIL_MDS_RENAME2 0x154 +#define OBD_FAIL_MDS_RENAME3 0x155 +#define OBD_FAIL_MDS_RENAME4 0x156 +#define OBD_FAIL_MDS_LDLM_REPLY_NET 0x157 +#define OBD_FAIL_MDS_STALE_DIR_LAYOUT 0x158 +#define OBD_FAIL_MDS_REINT_MULTI_NET 0x159 +#define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a +#define OBD_FAIL_MDS_LLOG_CREATE_FAILED2 0x15b +#define OBD_FAIL_MDS_FLD_LOOKUP 0x15c +#define OBD_FAIL_MDS_CHANGELOG_REORDER 0x15d +#define OBD_FAIL_MDS_LLOG_UMOUNT_RACE 0x15e +#define OBD_FAIL_MDS_CHANGELOG_RACE 0x15f +#define OBD_FAIL_MDS_INTENT_DELAY 0x160 +#define OBD_FAIL_MDS_XATTR_REP 0x161 +#define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162 +#define OBD_FAIL_MDS_LOV_CREATE_RACE 0x163 +#define OBD_FAIL_MDS_HSM_CDT_DELAY 0x164 +#define OBD_FAIL_MDS_ORPHAN_DELETE 0x165 +#define OBD_FAIL_MDS_RMFID_NET 0x166 +#define OBD_FAIL_MDS_CREATE_RACE 0x167 +#define OBD_FAIL_MDS_STATFS_SPOOF 0x168 +#define OBD_FAIL_MDS_REINT_OPEN 0x169 +#define OBD_FAIL_MDS_REINT_OPEN2 0x16a +#define OBD_FAIL_MDS_COMMITRW_DELAY 0x16b +#define OBD_FAIL_MDS_CHANGELOG_DEL 0x16c +#define OBD_FAIL_MDS_CHANGELOG_IDX_PUMP 0x16d +#define OBD_FAIL_MDS_DELAY_DELORPHAN 0x16e + +/* layout lock */ +#define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 +#define OBD_FAIL_MDS_NO_LL_OPEN 0x171 +#define OBD_FAIL_MDS_LL_BLOCK 0x172 +#define OBD_FAIL_MDS_LOD_CREATE_PAUSE 0x173 + +/* CMD */ +#define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180 +#define OBD_FAIL_MDS_IS_SUBDIR_PACK 0x181 +#define OBD_FAIL_MDS_SET_INFO_NET 0x182 +#define OBD_FAIL_MDS_WRITEPAGE_NET 0x183 +#define OBD_FAIL_MDS_WRITEPAGE_PACK 0x184 +#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185 +#define OBD_FAIL_MDS_GET_INFO_NET 0x186 +#define OBD_FAIL_MDS_DQACQ_NET 0x187 +#define OBD_FAIL_MDS_STRIPE_CREATE 0x188 +#define OBD_FAIL_MDS_STRIPE_FID 0x189 +#define OBD_FAIL_MDS_LINK_RENAME_RACE 0x18a + +/* OI scrub */ +#define OBD_FAIL_OSD_SCRUB_DELAY 0x190 +#define OBD_FAIL_OSD_SCRUB_CRASH 0x191 +#define OBD_FAIL_OSD_SCRUB_FATAL 0x192 +#define OBD_FAIL_OSD_FID_MAPPING 0x193 +#define OBD_FAIL_OSD_LMA_INCOMPAT 0x194 +#define OBD_FAIL_OSD_COMPAT_INVALID_ENTRY 0x195 +#define OBD_FAIL_OSD_COMPAT_NO_ENTRY 0x196 +#define OBD_FAIL_OSD_OST_EA_FID_SET 0x197 +#define OBD_FAIL_OSD_NO_OI_ENTRY 0x198 +#define OBD_FAIL_OSD_INDEX_CRASH 0x199 +#define OBD_FAIL_OSD_TXN_START 0x19a +#define OBD_FAIL_OSD_DUPLICATE_MAP 0x19b +#define OBD_FAIL_OSD_REF_DEL 0x19c +#define OBD_FAIL_OSD_OI_ENOSPC 0x19d +#define OBD_FAIL_OSD_DOTDOT_ENOSPC 0x19e + +#define OBD_FAIL_OFD_SET_OID 0x1e0 + +#define OBD_FAIL_OST 0x200 +#define OBD_FAIL_OST_CONNECT_NET 0x201 
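+/* Note on usage (applies to all OBD_FAIL_* groups in this header): these are
+ * fault-injection points rather than error codes.  A test arms one at runtime,
+ * typically with "lctl set_param fail_loc=<value>", and the corresponding code
+ * path checks it through the OBD_FAIL_CHECK()/OBD_FAIL_TIMEOUT() wrappers
+ * defined near the end of this file, along the lines of (illustrative only):
+ *
+ *	if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOSPC))
+ *		RETURN(-ENOSPC);
+ */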
+#define OBD_FAIL_OST_DISCONNECT_NET 0x202 +#define OBD_FAIL_OST_GET_INFO_NET 0x203 +#define OBD_FAIL_OST_CREATE_NET 0x204 +#define OBD_FAIL_OST_DESTROY_NET 0x205 +#define OBD_FAIL_OST_GETATTR_NET 0x206 +#define OBD_FAIL_OST_SETATTR_NET 0x207 +#define OBD_FAIL_OST_OPEN_NET 0x208 +#define OBD_FAIL_OST_CLOSE_NET 0x209 +#define OBD_FAIL_OST_BRW_NET 0x20a +#define OBD_FAIL_OST_PUNCH_NET 0x20b +#define OBD_FAIL_OST_STATFS_NET 0x20c +#define OBD_FAIL_OST_HANDLE_UNPACK 0x20d +#define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e +#define OBD_FAIL_OST_BRW_READ_BULK 0x20f +#define OBD_FAIL_OST_SYNC_NET 0x210 +#define OBD_FAIL_OST_ALL_REPLY_NET 0x211 +#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212 +#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213 +#define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214 +#define OBD_FAIL_OST_ENOSPC 0x215 +#define OBD_FAIL_OST_EROFS 0x216 +#define OBD_FAIL_SRV_ENOENT 0x217 +/* OBD_FAIL_OST_QUOTACHECK_NET 0x218 obsolete since 2.4 */ +#define OBD_FAIL_OST_QUOTACTL_NET 0x219 +#define OBD_FAIL_OST_CHECKSUM_RECEIVE 0x21a +#define OBD_FAIL_OST_CHECKSUM_SEND 0x21b +#define OBD_FAIL_OST_BRW_SIZE 0x21c +#define OBD_FAIL_OST_DROP_REQ 0x21d +#define OBD_FAIL_OST_SETATTR_CREDITS 0x21e +#define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f +#define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220 +#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221 +#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 +#define OBD_FAIL_OST_PAUSE_CREATE 0x223 +#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 +#define OBD_FAIL_OST_CONNECT_NET2 0x225 +#define OBD_FAIL_OST_NOMEM 0x226 +#define OBD_FAIL_OST_BRW_PAUSE_BULK2 0x227 +#define OBD_FAIL_OST_MAPBLK_ENOSPC 0x228 +#define OBD_FAIL_OST_ENOINO 0x229 +#define OBD_FAIL_OST_DQACQ_NET 0x230 +#define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231 +#define OBD_FAIL_OST_SET_INFO_NET 0x232 +#define OBD_FAIL_OST_NODESTROY 0x233 +/* OBD_FAIL_OST_READ_SIZE 0x234 obsolete since 2.14 */ +#define OBD_FAIL_OST_LADVISE_NET 0x235 +#define OBD_FAIL_OST_PAUSE_PUNCH 0x236 +#define OBD_FAIL_OST_LADVISE_PAUSE 0x237 +#define OBD_FAIL_OST_FAKE_RW 0x238 +#define OBD_FAIL_OST_LIST_ASSERT 0x239 +#define OBD_FAIL_OST_GL_WORK_ALLOC 0x240 +#define OBD_FAIL_OST_SKIP_LV_CHECK 0x241 +#define OBD_FAIL_OST_STATFS_DELAY 0x242 +#define OBD_FAIL_OST_INTEGRITY_FAULT 0x243 +#define OBD_FAIL_OST_INTEGRITY_CMP 0x244 +#define OBD_FAIL_OST_DISCONNECT_DELAY 0x245 +#define OBD_FAIL_OST_PREPARE_DELAY 0x247 +#define OBD_FAIL_OST_2BIG_NIOBUF 0x248 +#define OBD_FAIL_OST_FALLOCATE_NET 0x249 +#define OBD_FAIL_OST_SEEK_NET 0x24a +#define OBD_FAIL_OST_WR_ATTR_DELAY 0x250 +#define OBD_FAIL_OST_RESTART_IO 0x251 +#define OBD_FAIL_OST_GET_LAST_FID 0x252 + +#define OBD_FAIL_LDLM 0x300 +#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 +#define OBD_FAIL_LDLM_ENQUEUE_NET 0x302 +#define OBD_FAIL_LDLM_CONVERT_NET 0x303 +#define OBD_FAIL_LDLM_CANCEL_NET 0x304 +#define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305 +#define OBD_FAIL_LDLM_CP_CALLBACK_NET 0x306 +#define OBD_FAIL_LDLM_GL_CALLBACK_NET 0x307 +#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 +#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309 +#define OBD_FAIL_LDLM_CREATE_RESOURCE 0x30a +#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b +#define OBD_FAIL_LDLM_REPLY 0x30c +#define OBD_FAIL_LDLM_RECOV_CLIENTS 0x30d +#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e +#define OBD_FAIL_LDLM_GLIMPSE 0x30f +#define OBD_FAIL_LDLM_CANCEL_RACE 0x310 +#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311 +#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 +#define OBD_FAIL_LDLM_CLOSE_THREAD 0x313 +#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 +#define OBD_FAIL_LDLM_CP_CB_WAIT 0x315 +#define 
OBD_FAIL_LDLM_OST_FAIL_RACE 0x316 +#define OBD_FAIL_LDLM_INTR_CP_AST 0x317 +#define OBD_FAIL_LDLM_CP_BL_RACE 0x318 +#define OBD_FAIL_LDLM_NEW_LOCK 0x319 +#define OBD_FAIL_LDLM_AGL_DELAY 0x31a +#define OBD_FAIL_LDLM_AGL_NOLOCK 0x31b +#define OBD_FAIL_LDLM_OST_LVB 0x31c +#define OBD_FAIL_LDLM_ENQUEUE_HANG 0x31d +#define OBD_FAIL_LDLM_BL_EVICT 0x31e +#define OBD_FAIL_LDLM_PAUSE_CANCEL2 0x31f +#define OBD_FAIL_LDLM_CP_CB_WAIT2 0x320 +#define OBD_FAIL_LDLM_CP_CB_WAIT3 0x321 +#define OBD_FAIL_LDLM_CP_CB_WAIT4 0x322 +#define OBD_FAIL_LDLM_CP_CB_WAIT5 0x323 +#define OBD_FAIL_LDLM_SRV_BL_AST 0x324 +#define OBD_FAIL_LDLM_SRV_CP_AST 0x325 +#define OBD_FAIL_LDLM_SRV_GL_AST 0x326 +#define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 +#define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 +#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329 + +#define OBD_FAIL_LDLM_GRANT_CHECK 0x32a +#define OBD_FAIL_LDLM_PROLONG_PAUSE 0x32b +#define OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE 0x32c +#define OBD_FAIL_LDLM_LOCK_REPLAY 0x32d +#define OBD_FAIL_LDLM_REPLAY_PAUSE 0x32e + +/* LOCKLESS IO */ +#define OBD_FAIL_LDLM_SET_CONTENTION 0x385 + +#define OBD_FAIL_OSC 0x400 +#define OBD_FAIL_OSC_BRW_READ_BULK 0x401 +#define OBD_FAIL_OSC_BRW_WRITE_BULK 0x402 +#define OBD_FAIL_OSC_LOCK_BL_AST 0x403 +#define OBD_FAIL_OSC_LOCK_CP_AST 0x404 +#define OBD_FAIL_OSC_MATCH 0x405 +#define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 +#define OBD_FAIL_OSC_SHUTDOWN 0x407 +#define OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408 +#define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 +#define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a +/* #define OBD_FAIL_OSC_CONNECT_CKSUM 0x40b Obsolete since 2.9 */ +#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c +#define OBD_FAIL_OSC_DIO_PAUSE 0x40d +#define OBD_FAIL_OSC_OBJECT_CONTENTION 0x40e +#define OBD_FAIL_OSC_CP_CANCEL_RACE 0x40f +#define OBD_FAIL_OSC_CP_ENQ_RACE 0x410 +#define OBD_FAIL_OSC_NO_GRANT 0x411 +#define OBD_FAIL_OSC_DELAY_SETTIME 0x412 +#define OBD_FAIL_OSC_CONNECT_GRANT_PARAM 0x413 +#define OBD_FAIL_OSC_DELAY_IO 0x414 +#define OBD_FAIL_OSC_NO_SIZE_DATA 0x415 +#define OBD_FAIL_OSC_DELAY_CANCEL 0x416 +#define OBD_FAIL_OSC_SLOW_PAGE_EVICT 0x417 + +#define OBD_FAIL_PTLRPC 0x500 +#define OBD_FAIL_PTLRPC_ACK 0x501 +#define OBD_FAIL_PTLRPC_RQBD 0x502 +#define OBD_FAIL_PTLRPC_BULK_GET_NET 0x503 +#define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 +#define OBD_FAIL_PTLRPC_DROP_RPC 0x505 +#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 +#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a +#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c +#define OBD_FAIL_PTLRPC_IMP_DEACTIVE 0x50d +#define OBD_FAIL_PTLRPC_DUMP_LOG 0x50e +#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f +#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510 +#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT 0x511 +#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512 +#define OBD_FAIL_PTLRPC_DROP_REQ_OPC 0x513 +#define OBD_FAIL_PTLRPC_FINISH_REPLAY 0x514 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515 +#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL 0x516 +#define OBD_FAIL_PTLRPC_CANCEL_RESEND 0x517 +#define OBD_FAIL_PTLRPC_DROP_BULK 0x51a +#define OBD_FAIL_PTLRPC_LONG_REQ_UNLINK 0x51b +#define OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK 0x51c +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB3 0x520 +#define OBD_FAIL_PTLRPC_BULK_ATTACH 0x521 +#define OBD_FAIL_PTLRPC_BULK_REPLY_ATTACH 0x522 +#define OBD_FAIL_PTLRPC_RESEND_RACE 0x525 +#define OBD_FAIL_PTLRPC_ROUND_XID 0x530 +#define OBD_FAIL_PTLRPC_CONNECT_RACE 0x531 +#define OBD_FAIL_NET_ERROR_RPC 0x532 +#define OBD_FAIL_PTLRPC_IDLE_RACE 0x533 +#define 
OBD_FAIL_PTLRPC_ENQ_RESEND 0x534 + +#define OBD_FAIL_OBD_PING_NET 0x600 +/* OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 obsolete since 1.5 */ +#define OBD_FAIL_OBD_LOGD_NET 0x602 +/* OBD_FAIL_OBD_QC_CALLBACK_NET 0x603 obsolete since 2.4 */ +#define OBD_FAIL_OBD_DQACQ 0x604 +#define OBD_FAIL_OBD_LLOG_SETUP 0x605 +/* OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 obsolete since 1.5 */ +#define OBD_FAIL_OBD_IDX_READ_NET 0x607 +#define OBD_FAIL_OBD_IDX_READ_BREAK 0x608 +#define OBD_FAIL_OBD_NO_LRU 0x609 +#define OBD_FAIL_OBDCLASS_MODULE_LOAD 0x60a +#define OBD_FAIL_OBD_ZERO_NLINK_RACE 0x60b +#define OBD_FAIL_OBD_STOP_MDS_RACE 0x60c +#define OBD_FAIL_OBD_SETUP 0x60d + +#define OBD_FAIL_TGT_REPLY_NET 0x700 +#define OBD_FAIL_TGT_CONN_RACE 0x701 +#define OBD_FAIL_TGT_FORCE_RECONNECT 0x702 +#define OBD_FAIL_TGT_DELAY_CONNECT 0x703 +#define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 +#define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 +#define OBD_FAIL_TGT_TOOMANY_THREADS 0x706 +#define OBD_FAIL_TGT_REPLAY_DROP 0x707 +#define OBD_FAIL_TGT_FAKE_EXP 0x708 +#define OBD_FAIL_TGT_REPLAY_DELAY 0x709 +/* #define OBD_FAIL_TGT_LAST_REPLAY 0x710 (obsoleted) */ +#define OBD_FAIL_TGT_CLIENT_ADD 0x711 +#define OBD_FAIL_TGT_RCVG_FLAG 0x712 +#define OBD_FAIL_TGT_DELAY_CONDITIONAL 0x713 +#define OBD_FAIL_TGT_REPLAY_DELAY2 0x714 +#define OBD_FAIL_TGT_REPLAY_RECONNECT 0x715 +#define OBD_FAIL_TGT_MOUNT_RACE 0x716 +#define OBD_FAIL_TGT_REPLAY_TIMEOUT 0x717 +#define OBD_FAIL_TGT_CLIENT_DEL 0x718 +#define OBD_FAIL_TGT_SLUGGISH_NET 0x719 +#define OBD_FAIL_TGT_RCVD_EIO 0x720 +#define OBD_FAIL_TGT_RECOVERY_REQ_RACE 0x721 +#define OBD_FAIL_TGT_REPLY_DATA_RACE 0x722 +#define OBD_FAIL_TGT_RECOVERY_CONNECT 0x724 +#define OBD_FAIL_TGT_NO_GRANT 0x725 +#define OBD_FAIL_TGT_TXN_NO_CANCEL 0x726 + +#define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 +#define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 +#define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 +#define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 +#define OBD_FAIL_MDC_RPCS_SEM 0x804 /* deprecated */ +#define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 +#define OBD_FAIL_MDC_CLOSE 0x806 +#define OBD_FAIL_MDC_MERGE 0x807 +#define OBD_FAIL_MDC_GLIMPSE_DDOS 0x808 + +#define OBD_FAIL_MGS 0x900 +#define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 +#define OBD_FAIL_MGS_ALL_REPLY_NET 0x902 +#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903 +#define OBD_FAIL_MGS_PAUSE_REQ 0x904 +#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905 +#define OBD_FAIL_MGS_CONNECT_NET 0x906 +#define OBD_FAIL_MGS_DISCONNECT_NET 0x907 +#define OBD_FAIL_MGS_SET_INFO_NET 0x908 +#define OBD_FAIL_MGS_EXCEPTION_NET 0x909 +#define OBD_FAIL_MGS_TARGET_REG_NET 0x90a +#define OBD_FAIL_MGS_TARGET_DEL_NET 0x90b +#define OBD_FAIL_MGS_CONFIG_READ_NET 0x90c +#define OBD_FAIL_MGS_LDLM_REPLY_NET 0x90d +#define OBD_FAIL_MGS_WRITE_TARGET_DELAY 0x90e + +#define OBD_FAIL_QUOTA_DQACQ_NET 0xA01 +#define OBD_FAIL_QUOTA_EDQUOT 0xA02 +#define OBD_FAIL_QUOTA_DELAY_REINT 0xA03 +#define OBD_FAIL_QUOTA_RECOVERABLE_ERR 0xA04 +#define OBD_FAIL_QUOTA_INIT 0xA05 +#define OBD_FAIL_QUOTA_PREACQ 0xA06 +#define OBD_FAIL_QUOTA_RECALC 0xA07 + +#define OBD_FAIL_LPROC_REMOVE 0xB00 + +#define OBD_FAIL_SEQ 0x1000 +#define OBD_FAIL_SEQ_QUERY_NET 0x1001 +#define OBD_FAIL_SEQ_EXHAUST 0x1002 + +#define OBD_FAIL_FLD 0x1100 +#define OBD_FAIL_FLD_QUERY_NET 0x1101 +#define OBD_FAIL_FLD_READ_NET 0x1102 +#define OBD_FAIL_FLD_QUERY_REQ 0x1103 + +#define OBD_FAIL_SEC_CTX 0x1200 +#define OBD_FAIL_SEC_CTX_INIT_NET 0x1201 +#define OBD_FAIL_SEC_CTX_INIT_CONT_NET 0x1202 +#define OBD_FAIL_SEC_CTX_FINI_NET 0x1203 +#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 + +#define OBD_FAIL_LLOG 
0x1300 +/* was OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 until 2.4 */ +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET 0x1302 +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 until 2.11 */ +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET 0x1305 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET 0x1306 +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 until 2.1 */ +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 until 1.8 */ +/* was OBD_FAIL_LLOG_CATINFO_NET 0x1309 until 2.3 */ +#define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310 +#define OBD_FAIL_SEQ_ALLOC 0x1311 +#define OBD_FAIL_CAT_RECORDS 0x1312 +#define OBD_FAIL_CAT_FREE_RECORDS 0x1313 +#define OBD_FAIL_TIME_IN_CHLOG_USER 0x1314 +#define CFS_FAIL_CHLOG_USER_REG_UNREG_RACE 0x1315 +#define OBD_FAIL_FORCE_GC_THREAD 0x1316 +#define OBD_FAIL_LLOG_PROCESS_TIMEOUT 0x1317 +#define OBD_FAIL_LLOG_PURGE_DELAY 0x1318 +#define OBD_FAIL_PLAIN_RECORDS 0x1319 +#define OBD_FAIL_CATALOG_FULL_CHECK 0x131a +#define OBD_FAIL_CATLIST 0x131b +#define OBD_FAIL_LLOG_PAUSE_AFTER_PAD 0x131c +#define OBD_FAIL_LLOG_ADD_GAP 0x131d + +#define OBD_FAIL_LLITE 0x1400 +#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 +#define OBD_FAIL_LOCK_STATE_WAIT_INTR 0x1402 +#define OBD_FAIL_LOV_INIT 0x1403 +#define OBD_FAIL_GLIMPSE_DELAY 0x1404 +#define OBD_FAIL_LLITE_XATTR_ENOMEM 0x1405 +#define OBD_FAIL_MAKE_LOVEA_HOLE 0x1406 +#define OBD_FAIL_LLITE_LOST_LAYOUT 0x1407 +#define OBD_FAIL_LLITE_NO_CHECK_DEAD 0x1408 +#define OBD_FAIL_GETATTR_DELAY 0x1409 +#define OBD_FAIL_LLITE_CREATE_FILE_PAUSE 0x1409 +#define OBD_FAIL_LLITE_NEWNODE_PAUSE 0x140a +#define OBD_FAIL_LLITE_SETDIRSTRIPE_PAUSE 0x140b +#define OBD_FAIL_LLITE_CREATE_NODE_PAUSE 0x140c +#define OBD_FAIL_LLITE_IMUTEX_SEC 0x140e +#define OBD_FAIL_LLITE_IMUTEX_NOSEC 0x140f +#define OBD_FAIL_LLITE_OPEN_BY_NAME 0x1410 +#define OBD_FAIL_LLITE_PCC_FAKE_ERROR 0x1411 +#define OBD_FAIL_LLITE_PCC_DETACH_MKWRITE 0x1412 +#define OBD_FAIL_LLITE_PCC_MKWRITE_PAUSE 0x1413 +#define OBD_FAIL_LLITE_PCC_ATTACH_PAUSE 0x1414 +#define OBD_FAIL_LLITE_SHORT_COMMIT 0x1415 +#define OBD_FAIL_LLITE_CREATE_FILE_PAUSE2 0x1416 +#define OBD_FAIL_LLITE_RACE_MOUNT 0x1417 +#define OBD_FAIL_LLITE_PAGE_ALLOC 0x1418 +#define OBD_FAIL_LLITE_OPEN_DELAY 0x1419 +#define OBD_FAIL_LLITE_XATTR_PAUSE 0x1420 +#define OBD_FAIL_LLITE_PAGE_INVALIDATE_PAUSE 0x1421 +#define OBD_FAIL_LLITE_READPAGE_PAUSE 0x1422 +#define OBD_FAIL_LLITE_READPAGE_PAUSE2 0x1424 + +#define OBD_FAIL_FID_INDIR 0x1501 +#define OBD_FAIL_FID_INLMA 0x1502 +#define OBD_FAIL_FID_IGIF 0x1504 +#define OBD_FAIL_FID_LOOKUP 0x1505 +#define OBD_FAIL_FID_NOLMA 0x1506 + +/* LFSCK */ +#define OBD_FAIL_LFSCK_DELAY1 0x1600 +#define OBD_FAIL_LFSCK_DELAY2 0x1601 +#define OBD_FAIL_LFSCK_DELAY3 0x1602 +#define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603 +#define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604 +#define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605 +#define OBD_FAIL_LFSCK_FATAL1 0x1608 +#define OBD_FAIL_LFSCK_FATAL2 0x1609 +#define OBD_FAIL_LFSCK_CRASH 0x160a +#define OBD_FAIL_LFSCK_NO_AUTO 0x160b +#define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c +#define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d +#define OBD_FAIL_LFSCK_DELAY4 0x160e +#define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f +#define OBD_FAIL_LFSCK_DANGLING 0x1610 +#define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611 +#define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612 +#define OBD_FAIL_LFSCK_BAD_OWNER 0x1613 +#define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614 +#define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615 +#define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616 +#define 
OBD_FAIL_LFSCK_NOPFID 0x1617 +#define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618 +#define OBD_FAIL_LFSCK_INVALID_PFID 0x1619 +#define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a +#define OBD_FAIL_LFSCK_DELAY5 0x161b +#define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c +#define OBD_FAIL_LFSCK_NO_LINKEA 0x161d +#define OBD_FAIL_LFSCK_BAD_PARENT 0x161e +#define OBD_FAIL_LFSCK_DANGLING2 0x1620 +#define OBD_FAIL_LFSCK_DANGLING3 0x1621 +#define OBD_FAIL_LFSCK_MUL_REF 0x1622 +#define OBD_FAIL_LFSCK_BAD_TYPE 0x1623 +#define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624 +#define OBD_FAIL_LFSCK_LESS_NLINK 0x1626 +#define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628 +#define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629 +#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a +#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b +#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c +#define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d +#define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e +#define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f +#define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630 +#define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631 +#define OBD_FAIL_LFSCK_NO_ENCFLAG 0x1632 + +#define OBD_FAIL_LFSCK_NOTIFY_NET 0x16f0 +#define OBD_FAIL_LFSCK_QUERY_NET 0x16f1 + +/* UPDATE */ +#define OBD_FAIL_OUT_UPDATE_NET 0x1700 +#define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701 +#define OBD_FAIL_SPLIT_UPDATE_REC 0x1702 +#define OBD_FAIL_LARGE_STRIPE 0x1703 +#define OBD_FAIL_OUT_ENOSPC 0x1704 +#define OBD_FAIL_INVALIDATE_UPDATE 0x1705 +#define OBD_FAIL_OUT_UPDATE_DROP 0x1707 +#define OBD_FAIL_OUT_OBJECT_MISS 0x1708 + +/* MIGRATE */ +#define OBD_FAIL_MIGRATE_ENTRIES 0x1801 + +/* LMV */ +#define OBD_FAIL_UNKNOWN_LMV_STRIPE 0x1901 + +/* FLR */ +#define OBD_FAIL_FLR_GLIMPSE_IMMUTABLE 0x1A00 +#define OBD_FAIL_FLR_LV_DELAY 0x1A01 +#define OBD_FAIL_FLR_LV_INC 0x1A02 +#define OBD_FAIL_FLR_RANDOM_PICK_MIRROR 0x1A03 + +/* DT */ +#define OBD_FAIL_DT_DECLARE_ATTR_GET 0x2000 +#define OBD_FAIL_DT_ATTR_GET 0x2001 +#define OBD_FAIL_DT_DECLARE_ATTR_SET 0x2002 +#define OBD_FAIL_DT_ATTR_SET 0x2003 +#define OBD_FAIL_DT_DECLARE_XATTR_GET 0x2004 +#define OBD_FAIL_DT_XATTR_GET 0x2005 +#define OBD_FAIL_DT_DECLARE_XATTR_SET 0x2006 +#define OBD_FAIL_DT_XATTR_SET 0x2007 +#define OBD_FAIL_DT_DECLARE_XATTR_DEL 0x2008 +#define OBD_FAIL_DT_XATTR_DEL 0x2009 +#define OBD_FAIL_DT_XATTR_LIST 0x200a +#define OBD_FAIL_DT_DECLARE_CREATE 0x200b +#define OBD_FAIL_DT_CREATE 0x200c +#define OBD_FAIL_DT_DECLARE_DESTROY 0x200d +#define OBD_FAIL_DT_DESTROY 0x200e +#define OBD_FAIL_DT_INDEX_TRY 0x200f +#define OBD_FAIL_DT_DECLARE_REF_ADD 0x2010 +#define OBD_FAIL_DT_REF_ADD 0x2011 +#define OBD_FAIL_DT_DECLARE_REF_DEL 0x2012 +#define OBD_FAIL_DT_REF_DEL 0x2013 +#define OBD_FAIL_DT_DECLARE_INSERT 0x2014 +#define OBD_FAIL_DT_INSERT 0x2015 +#define OBD_FAIL_DT_DECLARE_DELETE 0x2016 +#define OBD_FAIL_DT_DELETE 0x2017 +#define OBD_FAIL_DT_LOOKUP 0x2018 +#define OBD_FAIL_DT_TXN_STOP 0x2019 + +#define OBD_FAIL_OSP_CHECK_INVALID_REC 0x2100 +#define OBD_FAIL_OSP_CHECK_ENOMEM 0x2101 +#define OBD_FAIL_OSP_FAKE_PRECREATE 0x2102 +#define OBD_FAIL_OSP_RPCS_SEM 0x2104 +#define OBD_FAIL_OSP_CANT_PROCESS_LLOG 0x2105 +#define OBD_FAIL_OSP_INVALID_LOGID 0x2106 +#define OBD_FAIL_OSP_CON_EVENT_DELAY 0x2107 +#define OBD_FAIL_OSP_PRECREATE_PAUSE 0x2108 +#define OBD_FAIL_OSP_GET_LAST_FID 0x2109 + +/* barrier */ +#define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200 +#define OBD_FAIL_MGS_BARRIER_NOTIFY_NET 0x2201 + +#define OBD_FAIL_BARRIER_DELAY 0x2202 +#define OBD_FAIL_BARRIER_FAILURE 0x2203 + +#define OBD_FAIL_OSD_FAIL_AT_TRUNCATE 0x2301 + +/* LNet is allocated failure locations 0xe000 to 0xffff */ +/* Assign 
references to moved code to reduce code changes */ +#define OBD_FAIL_PRECHECK(id) (unlikely(CFS_FAIL_PRECHECK(id))) +#define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id) +#define OBD_FAIL_CHECK_QUIET(id) CFS_FAIL_CHECK_QUIET(id) +#define OBD_FAIL_CHECK_VALUE(id, value) CFS_FAIL_CHECK_VALUE(id, value) +#define OBD_FAIL_CHECK_ORSET(id, value) CFS_FAIL_CHECK_ORSET(id, value) +#define OBD_FAIL_CHECK_RESET(id, value) CFS_FAIL_CHECK_RESET(id, value) +#define OBD_FAIL_RETURN(id, ret) CFS_FAIL_RETURN(id, ret) +#define OBD_FAIL_TIMEOUT(id, secs) CFS_FAIL_TIMEOUT(id, secs) +#define OBD_FAIL_TIMEOUT_MS(id, ms) CFS_FAIL_TIMEOUT_MS(id, ms) +#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs) +#define OBD_RACE(id) CFS_RACE(id) +#define OBD_FAIL_ONCE CFS_FAIL_ONCE +#define OBD_FAILED CFS_FAILED + +#define LUT_FAIL_CLASS(fail_id) (((fail_id) >> 8) << 16) +#define LUT_FAIL_MGT LUT_FAIL_CLASS(OBD_FAIL_MGS) +#define LUT_FAIL_MDT LUT_FAIL_CLASS(OBD_FAIL_MDS) +#define LUT_FAIL_OST LUT_FAIL_CLASS(OBD_FAIL_OST) + +extern atomic64_t libcfs_kmem; + +#ifdef CONFIG_PROC_FS +#define obd_memory_add(size) \ + lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sub(size) \ + lprocfs_counter_sub(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sum() \ + lprocfs_stats_collector(obd_memory, OBD_MEMORY_STAT, \ + LPROCFS_FIELDS_FLAGS_SUM) + +extern void obd_update_maxusage(void); +extern __u64 obd_memory_max(void); + +#else /* CONFIG_PROC_FS */ + +extern __u64 obd_alloc; + +extern __u64 obd_max_alloc; + +static inline void obd_memory_add(long size) +{ + obd_alloc += size; + if (obd_alloc > obd_max_alloc) + obd_max_alloc = obd_alloc; +} + +static inline void obd_memory_sub(long size) +{ + obd_alloc -= size; +} + +#define obd_memory_sum() (obd_alloc) + +#define obd_memory_max() (obd_max_alloc) + +#endif /* !CONFIG_PROC_FS */ + +#define OBD_DEBUG_MEMUSAGE (1) + +#if OBD_DEBUG_MEMUSAGE +#define OBD_ALLOC_POST(ptr, size, name) \ + obd_memory_add(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr) + +#define OBD_FREE_PRE(ptr, size, name) \ + LASSERT(ptr); \ + obd_memory_sub(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr); + +#else /* !OBD_DEBUG_MEMUSAGE */ + +#define OBD_ALLOC_POST(ptr, size, name) ((void)0) +#define OBD_FREE_PRE(ptr, size, name) ((void)0) + +#endif /* !OBD_DEBUG_MEMUSAGE */ + +#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ +do { \ + if (cptab) \ + ptr = cfs_cpt_malloc((cptab), (cpt), (size), \ + (flags) | __GFP_ZERO | __GFP_NOWARN); \ + if (!(cptab) || unlikely(!(ptr))) /* retry without CPT if failure */ \ + ptr = kmalloc(size, (flags) | __GFP_ZERO); \ + if (likely((ptr) != NULL)) \ + OBD_ALLOC_POST((ptr), (size), "kmalloced"); \ +} while (0) + +#define OBD_ALLOC_GFP(ptr, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, NULL, 0, size, gfp_mask) + +#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_NOFS) +#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_KERNEL) +#define OBD_ALLOC_PTR(ptr) OBD_ALLOC(ptr, sizeof(*(ptr))) +#define OBD_ALLOC_PTR_WAIT(ptr) OBD_ALLOC_WAIT(ptr, sizeof(*(ptr))) +#define OBD_ALLOC_PTR_ARRAY(ptr, n) OBD_ALLOC(ptr, (n) * sizeof(*(ptr))) +#define OBD_ALLOC_PTR_ARRAY_WAIT(ptr, n) \ + OBD_ALLOC_WAIT(ptr, (n) * sizeof(*(ptr))) + +#define OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, gfp_mask) + +#define OBD_CPT_ALLOC(ptr, cptab, cpt, size) \ + OBD_CPT_ALLOC_GFP(ptr, cptab, 
cpt, size, GFP_NOFS) + +#define OBD_CPT_ALLOC_PTR(ptr, cptab, cpt) \ + OBD_CPT_ALLOC(ptr, cptab, cpt, sizeof(*(ptr))) + +/* Direct use of __vmalloc() allows for protection flag specification + * (and particularly to not set __GFP_FS, which is likely to cause some + * deadlock situations in our code). + */ +#define __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) \ +do { \ + (ptr) = cptab == NULL ? \ + __ll_vmalloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO) : \ + cfs_cpt_vzalloc(cptab, cpt, size); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \ + (int)(size)); \ + CERROR("%llu total bytes allocated by Lustre, %lld by LNET\n",\ + obd_memory_sum(), libcfs_kmem_read());\ + } else { \ + OBD_ALLOC_POST(ptr, size, "vmalloced"); \ + } \ +} while(0) + +#define OBD_VMALLOC(ptr, size) \ + __OBD_VMALLOC_VERBOSE(ptr, NULL, 0, size) +#define OBD_CPT_VMALLOC(ptr, cptab, cpt, size) \ + __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) + +#define OBD_ALLOC_LARGE(ptr, size) \ +do { \ + /* LU-8196 - force large allocations to use vmalloc, not kmalloc */ \ + if ((size) > KMALLOC_MAX_SIZE) \ + ptr = NULL; \ + else \ + OBD_ALLOC_GFP(ptr, size, GFP_NOFS | __GFP_NOWARN); \ + if (ptr == NULL) \ + OBD_VMALLOC(ptr, size); \ +} while (0) + +#define OBD_ALLOC_PTR_ARRAY_LARGE(ptr, n) \ + OBD_ALLOC_LARGE(ptr, (n) * sizeof(*(ptr))) + +#define OBD_CPT_ALLOC_LARGE(ptr, cptab, cpt, size) \ +do { \ + OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS | __GFP_NOWARN); \ + if (ptr == NULL) \ + OBD_CPT_VMALLOC(ptr, cptab, cpt, size); \ +} while (0) + +#ifdef CONFIG_DEBUG_SLAB +#define POISON(ptr, c, s) do {} while (0) +#define POISON_PTR(ptr) ((void)0) +#else +#define POISON(ptr, c, s) memset(ptr, c, s) +#define POISON_PTR(ptr) (ptr) = (void *)0xdeadbeef +#endif + +#ifdef POISON_BULK +#define POISON_PAGE(page, val) do { memset(kmap(page), val, PAGE_SIZE); \ + kunmap(page); } while (0) +#else +#define POISON_PAGE(page, val) do { } while (0) +#endif + +#define OBD_FREE(ptr, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "kfreed"); \ + POISON(ptr, 0x5a, size); \ + kfree(ptr); \ + POISON_PTR(ptr); \ +} while (0) + +#define OBD_FREE_LARGE(ptr, size) \ +do { \ + if (is_vmalloc_addr(ptr)) { \ + OBD_FREE_PRE(ptr, size, "vfreed"); \ + POISON(ptr, 0x5a, size); \ + libcfs_vfree_atomic(ptr); \ + POISON_PTR(ptr); \ + } else { \ + OBD_FREE(ptr, size); \ + } \ +} while (0) + +#define OBD_FREE_PTR_ARRAY_LARGE(ptr, n) \ + OBD_FREE_LARGE(ptr, (n) * sizeof(*(ptr))) + +/* we memset() the slab object to 0 when allocation succeeds, so DO NOT + * HAVE A CTOR THAT DOES ANYTHING. its work will be cleared here. we'd + * love to assert on that, but slab.c keeps kmem_cache_s all to itself. */ +#define OBD_SLAB_FREE_RTN0(ptr, slab) \ +({ \ + kmem_cache_free((slab), (ptr)); \ + (ptr) = NULL; \ + 0; \ +}) + +#define __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, type) \ +do { \ + LASSERT(ergo((type) != GFP_ATOMIC, !in_interrupt())); \ + (ptr) = (cptab) == NULL ? 
\ + kmem_cache_zalloc(slab, (type)) : \ + cfs_mem_cache_cpt_alloc(slab, cptab, cpt, (type) | __GFP_ZERO); \ + if (likely((ptr))) \ + OBD_ALLOC_POST(ptr, size, "slab-alloced"); \ +} while(0) + +#define OBD_SLAB_ALLOC_GFP(ptr, slab, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, NULL, 0, size, flags) +#define OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, flags) + +#define OBD_FREE_PTR(ptr) OBD_FREE(ptr, sizeof(*(ptr))) +#define OBD_FREE_PTR_ARRAY(ptr, n) OBD_FREE(ptr, (n) * sizeof(*(ptr))) + +#define OBD_SLAB_FREE(ptr, slab, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "slab-freed"); \ + POISON(ptr, 0x5a, size); \ + kmem_cache_free(slab, ptr); \ + POISON_PTR(ptr); \ +} while(0) + +#define OBD_SLAB_ALLOC(ptr, slab, size) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, size, GFP_NOFS) + +#define OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, size) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, GFP_NOFS) + +#define OBD_SLAB_ALLOC_PTR(ptr, slab) \ + OBD_SLAB_ALLOC(ptr, slab, sizeof(*(ptr))) + +#define OBD_SLAB_CPT_ALLOC_PTR(ptr, slab, cptab, cpt) \ + OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, sizeof(*(ptr))) + +#define OBD_SLAB_ALLOC_PTR_GFP(ptr, slab, flags) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, sizeof(*(ptr)), flags) + +#define OBD_SLAB_CPT_ALLOC_PTR_GFP(ptr, slab, cptab, cpt, flags) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, sizeof(*(ptr)), flags) + +#define OBD_SLAB_FREE_PTR(ptr, slab) \ + OBD_SLAB_FREE((ptr), (slab), sizeof(*(ptr))) + +#define KEY_IS(str) \ + (keylen >= (sizeof(str)-1) && memcmp(key, str, (sizeof(str)-1)) == 0) + +#ifdef HAVE_SERVER_SUPPORT +/* LUSTRE_LMA_FL_MASKS defines which flags will be stored in LMA */ + +static inline int lma_to_lustre_flags(__u32 lma_flags) +{ + return (((lma_flags & LMAI_ORPHAN) ? LUSTRE_ORPHAN_FL : 0) | + ((lma_flags & LMAI_ENCRYPT) ? LUSTRE_ENCRYPT_FL : 0)); +} + +static inline int lustre_to_lma_flags(__u32 la_flags) +{ + return (((la_flags & LUSTRE_ORPHAN_FL) ? LMAI_ORPHAN : 0) | + ((la_flags & LUSTRE_ENCRYPT_FL) ? LMAI_ENCRYPT : 0)); +} +#endif /* HAVE_SERVER_SUPPORT */ + +/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values + * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire + * protocol equivalents of LDISKFS_*_FL values stored on disk, while + * the S_* flags are kernel-internal values that change between kernel + * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. + * See b=16526 for a full history. + */ +static inline int ll_ext_to_inode_flags(int ext_flags) +{ + return (((ext_flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) | + ((ext_flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) | + ((ext_flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | + ((ext_flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | +#if defined(S_ENCRYPTED) + ((ext_flags & LUSTRE_ENCRYPT_FL) ? S_ENCRYPTED : 0) | +#endif + ((ext_flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0)); +} + +static inline int ll_inode_to_ext_flags(int inode_flags) +{ + return (((inode_flags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | + ((inode_flags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | + ((inode_flags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | + ((inode_flags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | +#if defined(S_ENCRYPTED) + ((inode_flags & S_ENCRYPTED) ? LUSTRE_ENCRYPT_FL : 0) | +#endif + ((inode_flags & S_IMMUTABLE) ? 
LUSTRE_IMMUTABLE_FL : 0)); +} + +struct obd_heat_instance { + __u64 ohi_heat; + __u64 ohi_time_second; + __u64 ohi_count; +}; + +/* Define a fixed 4096-byte encryption unit size */ +#define LUSTRE_ENCRYPTION_BLOCKBITS 12 +#define LUSTRE_ENCRYPTION_UNIT_SIZE ((size_t)1 << LUSTRE_ENCRYPTION_BLOCKBITS) +#define LUSTRE_ENCRYPTION_MASK (~(LUSTRE_ENCRYPTION_UNIT_SIZE - 1)) + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/obd_target.h b/drivers/staging/lustrefsx/lustre/include/obd_target.h new file mode 100644 index 0000000000000..60337ca659ba2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_target.h @@ -0,0 +1,73 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __OBD_TARGET_H +#define __OBD_TARGET_H +#include + +/* server-side individual type definitions */ + +#define OBT_MAGIC 0xBDDECEAE +/* hold common fields for "target" device */ +struct obd_device_target { + __u32 obt_magic; + __u32 obt_instance; + struct lu_target *obt_lut; + __u64 obt_mount_count; + struct obd_job_stats obt_jobstats; + struct nm_config_file *obt_nodemap_config_file; +}; + +#define OBJ_SUBDIR_COUNT 32 /* set to zero for no subdirs */ + +struct filter_obd { + /* NB this field MUST be first */ + struct obd_device_target fo_obt; +}; + +struct echo_obd { + struct obd_device_target eo_obt; + struct obdo eo_oa; + spinlock_t eo_lock; + u64 eo_lastino; + struct lustre_handle eo_nl_lock; + atomic_t eo_prep; +}; + +struct ost_obd { + struct ptlrpc_service *ost_service; + struct ptlrpc_service *ost_create_service; + struct ptlrpc_service *ost_io_service; + struct ptlrpc_service *ost_seq_service; + struct ptlrpc_service *ost_out_service; + struct mutex ost_health_mutex; +}; + +#endif /* __OBD_TARGET_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obj_update.h b/drivers/staging/lustrefsx/lustre/include/obj_update.h new file mode 100644 index 0000000000000..8c88de86005ea --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obj_update.h @@ -0,0 +1,115 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Selection of object_update and object_update_param handling functions + */ + +#ifndef _OBJ_UPDATE_H_ +#define _OBJ_UPDATE_H_ + +#include + +static inline size_t +object_update_param_size(const struct object_update_param *param) +{ + return cfs_size_round(sizeof(*param) + param->oup_len); +} + +static inline size_t +object_update_params_size(const struct object_update *update) +{ + const struct object_update_param *param; + size_t total_size = 0; + unsigned int i; + + param = &update->ou_params[0]; + for (i = 0; i < update->ou_params_count; i++) { + size_t size = object_update_param_size(param); + + param = (struct object_update_param *)((char *)param + size); + total_size += size; + } + + return total_size; +} + +static inline size_t +object_update_size(const struct object_update *update) +{ + return offsetof(struct object_update, ou_params[0]) + + object_update_params_size(update); +} + +static inline struct object_update * +object_update_request_get(const struct object_update_request *our, + unsigned int index, size_t *size) +{ + void *ptr; + unsigned int i; + + if (index >= our->ourq_count) + return NULL; + + ptr = (void *)&our->ourq_updates[0]; + for (i = 0; i < index; i++) + ptr += object_update_size(ptr); + + if (size != NULL) + *size = object_update_size(ptr); + + return ptr; +} + + + +static inline struct object_update_result * +object_update_result_get(const struct object_update_reply *reply, + unsigned int index, size_t *size) +{ + __u16 count = reply->ourp_count; + unsigned int i; + void *ptr; + + if (index >= count) + return NULL; + + ptr = (char *)reply + + cfs_size_round(offsetof(struct object_update_reply, + ourp_lens[count])); + for (i = 0; i < index; i++) { + if (reply->ourp_lens[i] == 0) + return NULL; + + ptr += cfs_size_round(reply->ourp_lens[i]); + } + + if (size != NULL) + *size = reply->ourp_lens[index]; + + return ptr; +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/range_lock.h b/drivers/staging/lustrefsx/lustre/include/range_lock.h new file mode 100644 index 0000000000000..674b27d52be75 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/range_lock.h @@ -0,0 +1,77 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Range lock is used to allow multiple threads writing a single shared + * file given each thread is writing to a non-overlapping portion of the + * file. + * + * Refer to the possible upstream kernel version of range lock by + * Jan Kara : https://lkml.org/lkml/2013/1/31/480 + * + * This file could later replaced by the upstream kernel version. + */ +/* + * Author: Prakash Surya + * Author: Bobi Jam + */ +#ifndef _RANGE_LOCK_H +#define _RANGE_LOCK_H + +#include + +#define RL_FMT "[%llu, %llu]" +#define RL_PARA(range) \ + (unsigned long long)(range)->rl_start, \ + (unsigned long long)(range)->rl_end + +struct range_lock { + __u64 rl_start, + rl_end, + rl_subtree_last; + struct rb_node rl_rb; + /** + * Process to enqueue this lock. + */ + struct task_struct *rl_task; + /** + * Number of ranges which are blocking acquisition of the lock + */ + unsigned int rl_blocking_ranges; + /** + * Sequence number of range lock. This number is used to get to know + * the order the locks are queued. One lock can only block another + * if it has a higher rl_sequence. + */ + __u64 rl_sequence; +}; + +struct range_lock_tree { + struct interval_tree_root rlt_root; + spinlock_t rlt_lock; + __u64 rlt_sequence; +}; + +void range_lock_tree_init(struct range_lock_tree *tree); +void range_lock_init(struct range_lock *lock, __u64 start, __u64 end); +int range_lock(struct range_lock_tree *tree, struct range_lock *lock); +void range_unlock(struct range_lock_tree *tree, struct range_lock *lock); +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/seq_range.h b/drivers/staging/lustrefsx/lustre/include/seq_range.h new file mode 100644 index 0000000000000..374d1932f0bdf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/seq_range.h @@ -0,0 +1,192 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * Define lu_seq_range associated functions + */ + +#ifndef _SEQ_RANGE_H_ +#define _SEQ_RANGE_H_ + +#include + +/** + * computes the sequence range type \a range + */ + +static inline unsigned fld_range_type(const struct lu_seq_range *range) +{ + return range->lsr_flags & LU_SEQ_RANGE_MASK; +} + +/** + * Is this sequence range an OST? 
\a range + */ + +static inline bool fld_range_is_ost(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_OST; +} + +/** + * Is this sequence range an MDT? \a range + */ + +static inline bool fld_range_is_mdt(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_MDT; +} + +/** + * ANY range is only used when the fld client sends a fld query request, + * but it does not know whether the seq is an MDT or OST, so it will send the + * request with ANY type, which means any seq type from the lookup can be + * expected. /a range + */ +static inline unsigned fld_range_is_any(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_ANY; +} + +/** + * Apply flags to range \a range \a flags + */ + +static inline void fld_range_set_type(struct lu_seq_range *range, + unsigned flags) +{ + range->lsr_flags |= flags; +} + +/** + * Add MDT to range type \a range + */ + +static inline void fld_range_set_mdt(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_MDT); +} + +/** + * Add OST to range type \a range + */ + +static inline void fld_range_set_ost(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_OST); +} + +/** + * Add ANY to range type \a range + */ + +static inline void fld_range_set_any(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_ANY); +} + +/** + * computes width of given sequence range \a range + */ + +static inline __u64 lu_seq_range_space(const struct lu_seq_range *range) +{ + return range->lsr_end - range->lsr_start; +} + +/** + * initialize range to zero \a range + */ + +static inline void lu_seq_range_init(struct lu_seq_range *range) +{ + memset(range, 0, sizeof(*range)); +} + +/** + * check if given seq id \a s is within given range \a range + */ + +static inline bool lu_seq_range_within(const struct lu_seq_range *range, + __u64 seq) +{ + return seq >= range->lsr_start && seq < range->lsr_end; +} + +/** + * Is the range sane? Is the end after the beginning? \a range + */ + +static inline bool lu_seq_range_is_sane(const struct lu_seq_range *range) +{ + return range->lsr_end >= range->lsr_start; +} + +/** + * Is the range 0? \a range + */ + +static inline bool lu_seq_range_is_zero(const struct lu_seq_range *range) +{ + return range->lsr_start == 0 && range->lsr_end == 0; +} + +/** + * Is the range out of space? \a range + */ + +static inline bool lu_seq_range_is_exhausted(const struct lu_seq_range *range) +{ + return lu_seq_range_space(range) == 0; +} + +/** + * return 0 if two ranges have the same location, nonzero if they are + * different \a r1 \a r2 + */ + +static inline int lu_seq_range_compare_loc(const struct lu_seq_range *r1, + const struct lu_seq_range *r2) +{ + return r1->lsr_index != r2->lsr_index || + r1->lsr_flags != r2->lsr_flags; +} + +/** + * printf string and argument list for sequence range + */ +#define DRANGE "[%#16.16llx-%#16.16llx]:%x:%s" + +#define PRANGE(range) \ + (unsigned long long)(range)->lsr_start, \ + (unsigned long long)(range)->lsr_end, \ + (range)->lsr_index, \ + fld_range_is_mdt(range) ? 
"mdt" : "ost" + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lgss.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lgss.h new file mode 100644 index 0000000000000..52c67fe981f60 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lgss.h @@ -0,0 +1,58 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2022, Whamcloud. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _LGSS_H +#define _LGSS_H + +#include + +/* + * sparse kernel source annotations + */ +#ifndef __user +#define __user +#endif + +struct lgssd_ioctl_param { + /* in */ + __u32 version; + __u32 secid; + char __user *uuid; + __u32 lustre_svc; + __kernel_uid_t uid; + __kernel_gid_t gid; + __u64 send_token_size; + char __user *send_token; + __u64 reply_buf_size; + char __user *reply_buf; + /* out */ + __u64 status; + __u64 reply_length; +}; + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_access_log.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_access_log.h new file mode 100644 index 0000000000000..4972976725ced --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_access_log.h @@ -0,0 +1,85 @@ +#ifndef _LUSTRE_ACCESS_LOG_H +# define _LUSTRE_ACCESS_LOG_H + +#include +#include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. + */ +#ifdef __KERNEL__ +# include +#else +# include +#endif + +enum ofd_access_flags { + OFD_ACCESS_READ = 0x1, + OFD_ACCESS_WRITE = 0x2, +}; + +struct ofd_access_entry_v1 { + struct lu_fid oae_parent_fid; /* 16 */ + __u64 oae_begin; /* 24 */ + __u64 oae_end; /* 32 */ + __u64 oae_time; /* 40 */ + __u32 oae_size; /* 44 */ + __u32 oae_segment_count; /* 48 */ + __u32 oae_flags; /* 52 enum ofd_access_flags */ + __u32 oae_reserved1; /* 56 */ + __u32 oae_reserved2; /* 60 */ + __u32 oae_reserved3; /* 64 */ +}; + +/* The name of the subdirectory of devtmpfs (/dev) containing the + * control and access log char devices. */ +#define LUSTRE_ACCESS_LOG_DIR_NAME "lustre-access-log" + +enum { + LUSTRE_ACCESS_LOG_VERSION_1 = 0x00010000, + LUSTRE_ACCESS_LOG_TYPE_OFD = 0x1, + LUSTRE_ACCESS_LOG_NAME_SIZE = 128, +}; + +struct lustre_access_log_info_v1 { + __u32 lali_version; /* LUSTRE_ACCESS_LOG_VERSION_1 */ + __u32 lali_type; /* LUSTRE_ACCESS_LOG_TYPE_OFD */ + char lali_name[LUSTRE_ACCESS_LOG_NAME_SIZE]; /* obd_name */ + __u32 lali_log_size; + __u32 lali_entry_size; + /* Underscore prefixed members are intended for test and debug + * purposes only. 
*/ + __u32 _lali_head; + __u32 _lali_tail; + __u32 _lali_entry_space; + __u32 _lali_entry_count; + __u32 _lali_drop_count; + __u32 _lali_is_closed; +}; + +enum { + /* /dev/lustre-access-log/control ioctl: return lustre access log + * interface version. */ + LUSTRE_ACCESS_LOG_IOCTL_VERSION = _IO('O', 0x81), + + /* /dev/lustre-access-log/control ioctl: return device major + * used for access log devices. (The major is dynamically + * allocated during ofd module initialization. */ + LUSTRE_ACCESS_LOG_IOCTL_MAJOR = _IO('O', 0x82), + + /* /dev/lustre-access-log/control ioctl: get global control event + * count and store it into file private_data. */ + LUSTRE_ACCESS_LOG_IOCTL_PRESCAN = _IO('O', 0x83), + + /* /dev/lustre-access-log/OBDNAME ioctl: populate struct + * lustre_access_log_info_v1 for the current device. */ + LUSTRE_ACCESS_LOG_IOCTL_INFO = _IOR('O', 0x84, struct lustre_access_log_info_v1), + + /* /dev/lustre-access-log/OBDNAME ioctl: only entries whose + * PFID MDT index is equal to arg will be added to the log. A + * value of 0xfffffffff ((__u32)-1) will disable filtering + * which is the default. Added in V2. */ + LUSTRE_ACCESS_LOG_IOCTL_FILTER = _IOW('O', 0x85, __u32), +}; + +#endif /* _LUSTRE_ACCESS_LOG_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h new file mode 100644 index 0000000000000..38084241d8998 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h @@ -0,0 +1,74 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + * + * lustre/include/lustre/lustre_barrier_user.h + * + * Lustre write barrier (on MDT) userspace interfaces. 
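+ *
+ * Illustrative sketch (editorial note, not part of the original header):
+ * a userspace tool would typically fill a struct barrier_ctl, defined
+ * further below, and hand it to the MGS, roughly as
+ *
+ *	struct barrier_ctl bc = {
+ *		.bc_version = BARRIER_VERSION_V1,
+ *		.bc_cmd     = BC_FREEZE,
+ *		.bc_timeout = BARRIER_TIMEOUT_DEFAULT,
+ *	};
+ *
+ * and then poll bc_status (BS_FREEZING_P1, BS_FROZEN, BS_FAILED, ...) with
+ * BC_STAT. The ioctl plumbing that carries the structure is outside this
+ * header and is assumed here.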
+ * + * Author: Fan, Yong + */ +#ifndef _LUSTRE_BARRIER_USER_H +# define _LUSTRE_BARRIER_USER_H + +#include +#include + +#define BARRIER_VERSION_V1 1 +#define BARRIER_TIMEOUT_DEFAULT 30 + +enum barrier_commands { + BC_FREEZE = 1, + BC_THAW = 2, + BC_STAT = 3, + BC_RESCAN = 4, +}; + +enum barrier_status { + BS_INIT = 0, + BS_FREEZING_P1 = 1, + BS_FREEZING_P2 = 2, + BS_FROZEN = 3, + BS_THAWING = 4, + BS_THAWED = 5, + BS_FAILED = 6, + BS_EXPIRED = 7, + BS_RESCAN = 8, +}; + +struct barrier_ctl { + __u32 bc_version; + __u32 bc_cmd; + union { + __s32 bc_timeout; + __u32 bc_total; + }; + union { + __u32 bc_status; + __u32 bc_absence; + }; + char bc_name[12]; + __u32 bc_padding; +}; + +#endif /* _LUSTRE_BARRIER_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h new file mode 100644 index 0000000000000..97bd28f188380 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h @@ -0,0 +1,346 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _UAPI_LUSTRE_CFG_H +#define _UAPI_LUSTRE_CFG_H + +#include +#include +#include +#include + +/** \defgroup cfg cfg + * + * @{ + */ + +/* + * 1cf6 + * lcfG + */ +#define LUSTRE_CFG_VERSION 0x1cf60001 +#define LUSTRE_CFG_MAX_BUFCOUNT 8 + +#define LCFG_HDR_SIZE(count) \ + __ALIGN_KERNEL(offsetof(struct lustre_cfg, lcfg_buflens[(count)]), 8) + +/** If the LCFG_REQUIRED bit is set in a configuration command, + * then the client is required to understand this parameter + * in order to mount the filesystem. If it does not understand + * a REQUIRED command the client mount will fail. 
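+ *
+ * Illustrative note (editorial addition, not from the original source):
+ * the flag is OR-ed into the command word of a config record, so a client
+ * could test it roughly as
+ *
+ *	if ((lcfg->lcfg_command & LCFG_REQUIRED) && !understands(lcfg))
+ *		return -EINVAL;
+ *
+ * where understands() stands in for whatever command dispatch the client
+ * implements; only the flag value itself is defined below.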
+ */ +#define LCFG_REQUIRED 0x0001000 + +enum lcfg_command_type { + LCFG_ATTACH = 0x00cf001, /**< create a new obd instance */ + LCFG_DETACH = 0x00cf002, /**< destroy obd instance */ + LCFG_SETUP = 0x00cf003, /**< call type-specific setup */ + LCFG_CLEANUP = 0x00cf004, /**< call type-specific cleanup + */ + LCFG_ADD_UUID = 0x00cf005, /**< add a nid to a niduuid */ + LCFG_DEL_UUID = 0x00cf006, /**< remove a nid from + * a niduuid + */ + LCFG_MOUNTOPT = 0x00cf007, /**< create a profile + * (mdc, osc) + */ + LCFG_DEL_MOUNTOPT = 0x00cf008, /**< destroy a profile */ + LCFG_SET_TIMEOUT = 0x00cf009, /**< set obd_timeout */ + LCFG_SET_UPCALL = 0x00cf00a, /**< deprecated */ + LCFG_ADD_CONN = 0x00cf00b, /**< add a failover niduuid to + * an obd + */ + LCFG_DEL_CONN = 0x00cf00c, /**< remove a failover niduuid */ + LCFG_LOV_ADD_OBD = 0x00cf00d, /**< add an osc to a lov */ + LCFG_LOV_DEL_OBD = 0x00cf00e, /**< remove an osc from a lov */ + LCFG_PARAM = 0x00cf00f, /**< set a proc parameter */ + LCFG_MARKER = 0x00cf010, /**< metadata about next + * cfg rec + */ + LCFG_LOG_START = 0x00ce011, /**< mgc only, process a + * cfg log + */ + LCFG_LOG_END = 0x00ce012, /**< stop processing updates */ + LCFG_LOV_ADD_INA = 0x00ce013, /**< like LOV_ADD_OBD, + * inactive + */ + LCFG_ADD_MDC = 0x00cf014, /**< add an mdc to a lmv */ + LCFG_DEL_MDC = 0x00cf015, /**< remove an mdc from a lmv */ + LCFG_SPTLRPC_CONF = 0x00ce016, /**< security */ + LCFG_POOL_NEW = 0x00ce020, /**< create an ost pool name */ + LCFG_POOL_ADD = 0x00ce021, /**< add an ost to a pool */ + LCFG_POOL_REM = 0x00ce022, /**< remove an ost from a pool */ + LCFG_POOL_DEL = 0x00ce023, /**< destroy an ost pool name */ + LCFG_SET_LDLM_TIMEOUT = 0x00ce030, /**< set ldlm_timeout */ + LCFG_PRE_CLEANUP = 0x00cf031, /**< call type-specific pre + * cleanup cleanup + */ + LCFG_SET_PARAM = 0x00ce032, /**< use set_param syntax to set + * a proc parameters + */ + LCFG_NODEMAP_ADD = 0x00ce040, /**< create a cluster */ + LCFG_NODEMAP_DEL = 0x00ce041, /**< destroy a cluster */ + LCFG_NODEMAP_ADD_RANGE = 0x00ce042, /**< add a nid range */ + LCFG_NODEMAP_DEL_RANGE = 0x00ce043, /**< delete an nid range */ + LCFG_NODEMAP_ADD_UIDMAP = 0x00ce044, /**< add a uidmap */ + LCFG_NODEMAP_DEL_UIDMAP = 0x00ce045, /**< delete a uidmap */ + LCFG_NODEMAP_ADD_GIDMAP = 0x00ce046, /**< add a gidmap */ + LCFG_NODEMAP_DEL_GIDMAP = 0x00ce047, /**< delete a gidmap */ + LCFG_NODEMAP_ACTIVATE = 0x00ce048, /**< activate cluster + * id mapping + */ + LCFG_NODEMAP_ADMIN = 0x00ce049, /**< allow cluster to use id 0 */ + LCFG_NODEMAP_ADD_PROJIDMAP = 0x00ce04a, /**< add a projidmap */ + LCFG_NODEMAP_DEL_PROJIDMAP = 0x00ce04b, /**< delete a projidmap */ + LCFG_NODEMAP_TRUSTED = 0x00ce050, /**< trust a clusters ids */ + LCFG_NODEMAP_SQUASH_UID = 0x00ce051, /**< default map uid */ + LCFG_NODEMAP_SQUASH_GID = 0x00ce052, /**< default map gid */ + LCFG_NODEMAP_ADD_SHKEY = 0x00ce053, /**< add shared key to cluster */ + LCFG_NODEMAP_DEL_SHKEY = 0x00ce054, /**< delete shared key from + * cluster + */ + LCFG_NODEMAP_TEST_NID = 0x00ce055, /**< test for nodemap + * membership + */ + LCFG_NODEMAP_TEST_ID = 0x00ce056, /**< test uid/gid mapping */ + LCFG_NODEMAP_SET_FILESET = 0x00ce057, /**< set fileset */ + LCFG_NODEMAP_DENY_UNKNOWN = 0x00ce058, /**< deny squashed nodemap + * users + */ + LCFG_NODEMAP_MAP_MODE = 0x00ce059, /**< set the mapping mode */ + LCFG_NODEMAP_AUDIT_MODE = 0x00ce05a, /**< set the audit mode */ + LCFG_NODEMAP_SET_SEPOL = 0x00ce05b, /**< set SELinux policy */ + LCFG_NODEMAP_FORBID_ENCRYPT = 
0x00ce05c, /**< forbid encryption */ + LCFG_NODEMAP_SQUASH_PROJID = 0x00ce05d, /**< default map projid */ +}; + +struct lustre_cfg_bufs { + void *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_bufcount; +}; + +struct lustre_cfg { + __u32 lcfg_version; + __u32 lcfg_command; + + __u32 lcfg_num; + __u32 lcfg_flags; + __u64 lcfg_nid; + __u32 lcfg_nal; /* not used any more */ + + __u32 lcfg_bufcount; + __u32 lcfg_buflens[0]; +}; + +struct lcfg_type_data { + __u32 ltd_type; + char *ltd_name; + char *ltd_bufs[4]; +}; + +static struct lcfg_type_data lcfg_data_table[] = { + { LCFG_ATTACH, "attach", { "type", "UUID", "3", "4" } }, + { LCFG_DETACH, "detach", { "1", "2", "3", "4" } }, + { LCFG_SETUP, "setup", { "UUID", "node", "options", "failout" } }, + { LCFG_CLEANUP, "cleanup", { "1", "2", "3", "4" } }, + { LCFG_ADD_UUID, "add_uuid", { "node", "2", "3", "4" } }, + { LCFG_DEL_UUID, "del_uuid", { "1", "2", "3", "4" } }, + { LCFG_MOUNTOPT, "new_profile", { "name", "lov", "lmv", "4" } }, + { LCFG_DEL_MOUNTOPT, "del_mountopt", { "1", "2", "3", "4" } }, + { LCFG_SET_TIMEOUT, "set_timeout", { "parameter", "2", "3", "4" } }, + { LCFG_SET_UPCALL, "set_upcall", { "1", "2", "3", "4" } }, + { LCFG_ADD_CONN, "add_conn", { "node", "2", "3", "4" } }, + { LCFG_DEL_CONN, "del_conn", { "1", "2", "3", "4" } }, + { LCFG_LOV_ADD_OBD, "add_osc", { "ost", "index", "gen", "UUID" } }, + { LCFG_LOV_DEL_OBD, "del_osc", { "1", "2", "3", "4" } }, + { LCFG_PARAM, "conf_param", { "parameter", "value", "3", "4" } }, + { LCFG_MARKER, "marker", { "1", "2", "3", "4" } }, + { LCFG_LOG_START, "log_start", { "1", "2", "3", "4" } }, + { LCFG_LOG_END, "log_end", { "1", "2", "3", "4" } }, + { LCFG_LOV_ADD_INA, "add_osc_inactive", { "1", "2", "3", "4" } }, + { LCFG_ADD_MDC, "add_mdc", { "mdt", "index", "gen", "UUID" } }, + { LCFG_DEL_MDC, "del_mdc", { "1", "2", "3", "4" } }, + { LCFG_SPTLRPC_CONF, "security", { "parameter", "2", "3", "4" } }, + { LCFG_POOL_NEW, "new_pool", { "fsname", "pool", "3", "4" } }, + { LCFG_POOL_ADD, "add_pool", { "fsname", "pool", "ost", "4" } }, + { LCFG_POOL_REM, "remove_pool", { "fsname", "pool", "ost", "4" } }, + { LCFG_POOL_DEL, "del_pool", { "fsname", "pool", "3", "4" } }, + { LCFG_SET_LDLM_TIMEOUT, "set_ldlm_timeout", + { "parameter", "2", "3", "4" } }, + { LCFG_SET_PARAM, "set_param", { "parameter", "value", "3", "4" } }, + { 0, NULL, { NULL, NULL, NULL, NULL } } +}; + +static inline struct lcfg_type_data *lcfg_cmd2data(__u32 cmd) +{ + int i = 0; + + while (lcfg_data_table[i].ltd_type != 0) { + if (lcfg_data_table[i].ltd_type == cmd) + return &lcfg_data_table[i]; + i++; + } + return NULL; +} + +enum cfg_record_type { + PORTALS_CFG_TYPE = 1, + LUSTRE_CFG_TYPE = 123, +}; + +#define LUSTRE_CFG_BUFLEN(lcfg, idx) \ + ((lcfg)->lcfg_bufcount <= (idx) ? 0 : (lcfg)->lcfg_buflens[(idx)]) + +static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs, + __u32 index, void *buf, __u32 buflen) +{ + if (index >= LUSTRE_CFG_MAX_BUFCOUNT) + return; + + if (!bufs) + return; + + if (bufs->lcfg_bufcount <= index) + bufs->lcfg_bufcount = index + 1; + + bufs->lcfg_buf[index] = buf; + bufs->lcfg_buflen[index] = buflen; +} + +static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs, + __u32 index, char *str) +{ + lustre_cfg_bufs_set(bufs, index, str, str ? 
strlen(str) + 1 : 0); +} + +static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, + char *name) +{ + memset((bufs), 0, sizeof(*bufs)); + if (name) + lustre_cfg_bufs_set_string(bufs, 0, name); +} + +static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, __u32 index) +{ + __u32 i; + __kernel_size_t offset; + __u32 bufcount; + + if (!lcfg) + return NULL; + + bufcount = lcfg->lcfg_bufcount; + if (index >= bufcount) + return NULL; + + offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < index; i++) + offset += __ALIGN_KERNEL(lcfg->lcfg_buflens[i], 8); + return (char *)lcfg + offset; +} + +static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs, + struct lustre_cfg *lcfg) +{ + __u32 i; + + bufs->lcfg_bufcount = lcfg->lcfg_bufcount; + for (i = 0; i < bufs->lcfg_bufcount; i++) { + bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i]; + bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i); + } +} + +static inline __u32 lustre_cfg_len(__u32 bufcount, __u32 *buflens) +{ + __u32 i; + __u32 len; + + len = LCFG_HDR_SIZE(bufcount); + for (i = 0; i < bufcount; i++) + len += __ALIGN_KERNEL(buflens[i], 8); + + return __ALIGN_KERNEL(len, 8); +} + +static inline void lustre_cfg_init(struct lustre_cfg *lcfg, int cmd, + struct lustre_cfg_bufs *bufs) +{ + char *ptr; + __u32 i; + + lcfg->lcfg_version = LUSTRE_CFG_VERSION; + lcfg->lcfg_command = cmd; + lcfg->lcfg_bufcount = bufs->lcfg_bufcount; + + ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i]; + if (bufs->lcfg_buf[i]) { + memcpy(ptr, bufs->lcfg_buf[i], bufs->lcfg_buflen[i]); + ptr += __ALIGN_KERNEL(bufs->lcfg_buflen[i], 8); + } + } +} + +static inline int lustre_cfg_sanity_check(void *buf, __kernel_size_t len) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; + + if (!lcfg) + return -EINVAL; + + /* check that the first bits of the struct are valid */ + if (len < LCFG_HDR_SIZE(0)) + return -EINVAL; + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) + return -EINVAL; + + if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT) + return -EINVAL; + + /* check that the buflens are valid */ + if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount)) + return -EINVAL; + + /* make sure all the pointers point inside the data */ + if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)) + return -EINVAL; + + return 0; +} + +/** @} cfg */ + +#endif /* _UAPI_LUSTRE_CFG_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h new file mode 100644 index 0000000000000..54f73fdcca9ce --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h @@ -0,0 +1,231 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Lustre disk format definitions. + * + * Author: Nathan Rutman + */ + +#ifndef _UAPI_LUSTRE_DISK_H +#define _UAPI_LUSTRE_DISK_H + +/** \defgroup disk disk + * + * @{ + */ +#include + +/****************** on-disk files ********************/ + +#define MDT_LOGS_DIR "LOGS" /* COMPAT_146 */ +#define MOUNT_CONFIGS_DIR "CONFIGS" +#define CONFIGS_FILE "mountdata" +/** Persistent mount data are stored on the disk in this file. */ +#define MOUNT_DATA_FILE MOUNT_CONFIGS_DIR"/"CONFIGS_FILE +#define LAST_RCVD "last_rcvd" +#define REPLY_DATA "reply_data" +#define LOV_OBJID "lov_objid" +#define LOV_OBJSEQ "lov_objseq" +#define HEALTH_CHECK "health_check" +#define CAPA_KEYS "capa_keys" +#define CHANGELOG_USERS "changelog_users" +#define MGS_NIDTBL_DIR "NIDTBL_VERSIONS" +#define QMT_DIR "quota_master" +#define QSD_DIR "quota_slave" +#define QSD_DIR_DT "quota_slave_dt" +#define QSD_DIR_MD "quota_slave_md" +#define HSM_ACTIONS "hsm_actions" +#define LFSCK_DIR "LFSCK" +#define LFSCK_BOOKMARK "lfsck_bookmark" +#define LFSCK_LAYOUT "lfsck_layout" +#define LFSCK_NAMESPACE "lfsck_namespace" +#define REMOTE_PARENT_DIR "REMOTE_PARENT_DIR" +#define INDEX_BACKUP_DIR "index_backup" +#define MDT_ORPHAN_DIR "PENDING" + +/* On-disk configuration file. In host-endian order. */ +struct lustre_disk_data { + __u32 ldd_magic; + __u32 ldd_feature_compat; /* compatible feature flags */ + __u32 ldd_feature_rocompat; /* read-only compatible feature flags */ + __u32 ldd_feature_incompat; /* incompatible feature flags */ + + __u32 ldd_config_ver; /* config rewrite count - not used */ + __u32 ldd_flags; /* LDD_SV_TYPE */ + __u32 ldd_svindex; /* server index (0001), must match + * svname + */ + __u32 ldd_mount_type; /* target fs type LDD_MT_* */ + char ldd_fsname[64]; /* filesystem this server is part of, + * MTI_NAME_MAXLEN + */ + char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/ + __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */ + + char ldd_userdata[1024 - 200]; /* arbitrary user string '200' */ + __u8 ldd_padding[4096 - 1024]; /* 1024 */ + char ldd_mount_opts[4096]; /* target fs mount opts '4096' */ + char ldd_params[4096]; /* key=value pairs '8192' */ +}; + +/****************** persistent mount data *********************/ + +#define LDD_F_SV_TYPE_MDT 0x0001 +#define LDD_F_SV_TYPE_OST 0x0002 +#define LDD_F_SV_TYPE_MGS 0x0004 +#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT | \ + LDD_F_SV_TYPE_OST | \ + LDD_F_SV_TYPE_MGS) +#define LDD_F_SV_ALL 0x0008 +/** need an index assignment */ +#define LDD_F_NEED_INDEX 0x0010 +/** never registered */ +#define LDD_F_VIRGIN 0x0020 +/** update the config logs for this server */ +#define LDD_F_UPDATE 0x0040 +/** rewrite the LDD */ +#define LDD_F_REWRITE_LDD 0x0080 +/** regenerate config logs for this fs or server */ +#define LDD_F_WRITECONF 0x0100 +/** COMPAT_14 */ +/*#define LDD_F_UPGRADE14 0x0200 deprecated since 1.8 */ +/** process as lctl conf_param */ +#define LDD_F_PARAM 0x0400 +/** all nodes are specified as service nodes */ +#define LDD_F_NO_PRIMNODE 0x1000 +/** IR enable flag */ +#define LDD_F_IR_CAPABLE 0x2000 +/** the MGS refused to 
register the target. */ +#define LDD_F_ERROR 0x4000 +/** process at lctl conf_param */ +#define LDD_F_PARAM2 0x8000 +/** the target shouldn't use local logs */ +#define LDD_F_NO_LOCAL_LOGS 0x10000 + +#define LDD_MAGIC 0x1dd00001 + +#define XATTR_TARGET_RENAME "trusted.rename_tgt" + +enum ldd_mount_type { + LDD_MT_EXT3 = 0, + LDD_MT_LDISKFS, + LDD_MT_SMFS, + LDD_MT_REISERFS, + LDD_MT_LDISKFS2, + LDD_MT_ZFS, + LDD_MT_LAST +}; + +/****************** last_rcvd file *********************/ + +#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */ +#define LR_SERVER_SIZE 512 +#define LR_CLIENT_START 8192 +#define LR_CLIENT_SIZE 128 +#if LR_CLIENT_START < LR_SERVER_SIZE +#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE" +#endif + +/* + * Data stored per server at the head of the last_rcvd file. In le32 order. + */ +struct lr_server_data { + __u8 lsd_uuid[40]; /* server UUID */ + __u64 lsd_last_transno; /* last completed transaction ID */ + __u64 lsd_compat14; /* reserved - compat with old last_rcvd */ + __u64 lsd_mount_count; /* incarnation number */ + __u32 lsd_feature_compat; /* compatible feature flags */ + __u32 lsd_feature_rocompat;/* read-only compatible feature flags */ + __u32 lsd_feature_incompat;/* incompatible feature flags */ + __u32 lsd_server_size; /* size of server data area */ + __u32 lsd_client_start; /* start of per-client data area */ + __u16 lsd_client_size; /* size of per-client data area */ + __u16 lsd_subdir_count; /* number of subdirectories for objects */ + __u64 lsd_catalog_oid; /* recovery catalog object id */ + __u32 lsd_catalog_ogen; /* recovery catalog inode generation */ + __u8 lsd_peeruuid[40]; /* UUID of MDS associated with this OST */ + __u32 lsd_osd_index; /* index number of OST in LOV */ + __u32 lsd_padding1; /* was lsd_mdt_index, unused in 2.4.0 */ + __u32 lsd_start_epoch; /* VBR: start epoch from last boot */ + /** transaction values since lsd_trans_table_time */ + __u64 lsd_trans_table[LR_EXPIRE_INTERVALS]; + /** start point of transno table below */ + __u32 lsd_trans_table_time; /* time of first slot in table above */ + __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */ + __u8 lsd_padding[LR_SERVER_SIZE - 288]; +}; + +/* Data stored per client in the last_rcvd file. In le32 order. */ +struct lsd_client_data { + __u8 lcd_uuid[40]; /* client UUID */ + __u64 lcd_last_transno; /* last completed transaction ID */ + __u64 lcd_last_xid; /* xid for the last transaction */ + __u32 lcd_last_result; /* result from last RPC */ + __u32 lcd_last_data; /* per-op data (disposition for + * open &c.) + */ + /* for MDS_CLOSE requests */ + __u64 lcd_last_close_transno; /* last completed transaction ID */ + __u64 lcd_last_close_xid; /* xid for the last transaction */ + __u32 lcd_last_close_result; /* result from last RPC */ + __u32 lcd_last_close_data; /* per-op data */ + /* VBR: last versions */ + __u64 lcd_pre_versions[4]; + __u32 lcd_last_epoch; + /* generation counter of client slot in last_rcvd */ + __u32 lcd_generation; + __u8 lcd_padding[LR_CLIENT_SIZE - 128]; +}; + +/* Data stored in each slot of the reply_data file. + * + * The lrd_client_gen field is assigned with lcd_generation value + * to allow identify which client the reply data belongs to. 
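+ *
+ * Illustrative note (editorial addition, not from the original source):
+ * when the server reloads reply_data, each slot is attached to a client
+ * slot roughly by checking
+ *
+ *	lrd->lrd_client_gen == lcd->lcd_generation
+ *
+ * with lcd taken from the corresponding last_rcvd client slot above; the
+ * actual recovery logic lives in the server-side target code, not here.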
+ */ +struct lsd_reply_data { + __u64 lrd_transno; /* transaction number */ + __u64 lrd_xid; /* transmission id */ + __u64 lrd_data; /* per-operation data */ + __u32 lrd_result; /* request result */ + __u32 lrd_client_gen; /* client generation */ +}; + +/* Header of the reply_data file */ +#define LRH_MAGIC 0xbdabda01 +struct lsd_reply_header { + __u32 lrh_magic; + __u32 lrh_header_size; + __u32 lrh_reply_size; + __u8 lrh_pad[sizeof(struct lsd_reply_data) - 12]; +}; + +/** @} disk */ + +#endif /* _UAPI_LUSTRE_DISK_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h new file mode 100644 index 0000000000000..f11ad3b3b2115 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h @@ -0,0 +1,364 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * Copyright 2016 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * all fid manipulation functions go here + * + * FIDS are globally unique within a Lustre filessytem, and are made up + * of three parts: sequence, Object ID, and version. 
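+ *
+ * Illustrative sketch (editorial addition, not from the original source):
+ * the helpers below are typically combined along the lines of
+ *
+ *	struct lu_fid fid = { .f_seq = FID_SEQ_NORMAL, .f_oid = 1, .f_ver = 0 };
+ *
+ *	if (fid_is_sane(&fid) && fid_is_norm(&fid))
+ *		use_fid(&fid);
+ *
+ * where use_fid() is a placeholder for the caller's own code.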
+ * + */ +#ifndef _UAPI_LUSTRE_FID_H_ +#define _UAPI_LUSTRE_FID_H_ + +#include +#include + +/** returns fid object sequence */ +static inline __u64 fid_seq(const struct lu_fid *fid) +{ + return fid->f_seq; +} + +/** returns fid object id */ +static inline __u32 fid_oid(const struct lu_fid *fid) +{ + return fid->f_oid; +} + +/** returns fid object version */ +static inline __u32 fid_ver(const struct lu_fid *fid) +{ + return fid->f_ver; +} + +static inline void fid_zero(struct lu_fid *fid) +{ + memset(fid, 0, sizeof(*fid)); +} + +static inline __u64 fid_ver_oid(const struct lu_fid *fid) +{ + return (__u64)fid_ver(fid) << 32 | fid_oid(fid); +} + +static inline bool fid_seq_is_mdt0(__u64 seq) +{ + return seq == FID_SEQ_OST_MDT0; +} + +static inline bool fid_seq_is_mdt(__u64 seq) +{ + return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL; +}; + +static inline bool fid_seq_is_echo(__u64 seq) +{ + return seq == FID_SEQ_ECHO; +} + +static inline bool fid_is_echo(const struct lu_fid *fid) +{ + return fid_seq_is_echo(fid_seq(fid)); +} + +static inline bool fid_seq_is_llog(__u64 seq) +{ + return seq == FID_SEQ_LLOG; +} + +static inline bool fid_is_llog(const struct lu_fid *fid) +{ + /* file with OID == 0 is not llog but contains last oid */ + return fid_seq_is_llog(fid_seq(fid)) && fid_oid(fid) > 0; +} + +static inline bool fid_seq_is_rsvd(__u64 seq) +{ + return seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD; +}; + +static inline bool fid_seq_is_special(__u64 seq) +{ + return seq == FID_SEQ_SPECIAL; +}; + +static inline bool fid_seq_is_local_file(__u64 seq) +{ + return seq == FID_SEQ_LOCAL_FILE || + seq == FID_SEQ_LOCAL_NAME; +}; + +static inline bool fid_seq_is_root(__u64 seq) +{ + return seq == FID_SEQ_ROOT; +} + +static inline bool fid_seq_is_dot(__u64 seq) +{ + return seq == FID_SEQ_DOT_LUSTRE; +} + +static inline bool fid_seq_is_default(__u64 seq) +{ + return seq == FID_SEQ_LOV_DEFAULT; +} + +static inline bool fid_is_mdt0(const struct lu_fid *fid) +{ + return fid_seq_is_mdt0(fid_seq(fid)); +} + +static inline void lu_root_fid(struct lu_fid *fid) +{ + fid->f_seq = FID_SEQ_ROOT; + fid->f_oid = FID_OID_ROOT; + fid->f_ver = 0; +} + +static inline void lu_echo_root_fid(struct lu_fid *fid) +{ + fid->f_seq = FID_SEQ_ROOT; + fid->f_oid = FID_OID_ECHO_ROOT; + fid->f_ver = 0; +} + +static inline void lu_update_log_fid(struct lu_fid *fid, __u32 index) +{ + fid->f_seq = FID_SEQ_UPDATE_LOG; + fid->f_oid = index; + fid->f_ver = 0; +} + +static inline void lu_update_log_dir_fid(struct lu_fid *fid, __u32 index) +{ + fid->f_seq = FID_SEQ_UPDATE_LOG_DIR; + fid->f_oid = index; + fid->f_ver = 0; +} + +/** + * Check if a fid is igif or not. + * \param fid the fid to be tested. + * \return true if the fid is an igif; otherwise false. + */ +static inline bool fid_seq_is_igif(__u64 seq) +{ + return seq >= FID_SEQ_IGIF && seq <= FID_SEQ_IGIF_MAX; +} + +static inline bool fid_is_igif(const struct lu_fid *fid) +{ + return fid_seq_is_igif(fid_seq(fid)); +} + +/** + * Check if a fid is idif or not. + * \param fid the fid to be tested. + * \return true if the fid is an idif; otherwise false. 
+ */ +static inline bool fid_seq_is_idif(__u64 seq) +{ + return seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX; +} + +static inline bool fid_is_idif(const struct lu_fid *fid) +{ + return fid_seq_is_idif(fid_seq(fid)); +} + +static inline bool fid_is_local_file(const struct lu_fid *fid) +{ + return fid_seq_is_local_file(fid_seq(fid)); +} + +static inline bool fid_seq_is_norm(__u64 seq) +{ + return (seq >= FID_SEQ_NORMAL); +} + +static inline bool fid_is_norm(const struct lu_fid *fid) +{ + return fid_seq_is_norm(fid_seq(fid)); +} + +static inline int fid_is_layout_rbtree(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LAYOUT_RBTREE; +} + +static inline bool fid_seq_is_update_log(__u64 seq) +{ + return seq == FID_SEQ_UPDATE_LOG; +} + +static inline bool fid_is_update_log(const struct lu_fid *fid) +{ + return fid_seq_is_update_log(fid_seq(fid)); +} + +static inline bool fid_seq_is_update_log_dir(__u64 seq) +{ + return seq == FID_SEQ_UPDATE_LOG_DIR; +} + +static inline bool fid_is_update_log_dir(const struct lu_fid *fid) +{ + return fid_seq_is_update_log_dir(fid_seq(fid)); +} + +/* convert an OST objid into an IDIF FID SEQ number */ +static inline __u64 fid_idif_seq(__u64 id, __u32 ost_idx) +{ + return FID_SEQ_IDIF | (ost_idx << 16) | ((id >> 32) & 0xffff); +} + +/* convert a packed IDIF FID into an OST objid */ +static inline __u64 fid_idif_id(__u64 seq, __u32 oid, __u32 ver) +{ + return ((__u64)ver << 48) | ((seq & 0xffff) << 32) | oid; +} + +static inline __u32 idif_ost_idx(__u64 seq) +{ + return (seq >> 16) & 0xffff; +} + +/* extract ost index from IDIF FID */ +static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid) +{ + return idif_ost_idx(fid_seq(fid)); +} + +/* Check whether the fid is for LAST_ID */ +static inline bool fid_is_last_id(const struct lu_fid *fid) +{ + if (fid_oid(fid) != 0) + return false; + + if (fid_is_idif(fid) && ((fid_seq(fid) & 0xFFFF) != 0)) + return false; + + if (fid_seq(fid) == FID_SEQ_UPDATE_LOG || + fid_seq(fid) == FID_SEQ_UPDATE_LOG_DIR || + fid_seq_is_igif(fid_seq(fid))) + return false; + + return true; +} + +/** + * Get inode number from an igif. + * \param fid an igif to get inode number from. + * \return inode number for the igif. + */ +static inline __kernel_ino_t lu_igif_ino(const struct lu_fid *fid) +{ + return fid_seq(fid); +} + +/** + * Get inode generation from an igif. + * \param fid an igif to get inode generation from. + * \return inode generation for the igif. + */ +static inline __u32 lu_igif_gen(const struct lu_fid *fid) +{ + return fid_oid(fid); +} + +/** + * Build igif from the inode number/generation. + */ +static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen) +{ + fid->f_seq = ino; + fid->f_oid = gen; + fid->f_ver = 0; +} + +/* + * Fids are transmitted across network (in the sender byte-ordering), + * and stored on disk in big-endian order. 
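Before the byte-order helpers below, the IDIF packing implemented by fid_idif_seq(), fid_idif_id() and idif_ost_idx() above can be exercised in isolation. A self-contained round trip (editorial sketch, not part of the patch; the ex_ mirrors restate the same bit layout, and FID_SEQ_IDIF is 0x100000000ULL as defined in lustre_idl.h):

#include <assert.h>
#include <stdint.h>

#define EX_FID_SEQ_IDIF 0x100000000ULL          /* matches FID_SEQ_IDIF */

/* Mirror of fid_idif_seq(): OST index in bits 16..31 of the sequence,
 * objid bits 32..47 in bits 0..15. */
static uint64_t ex_idif_seq(uint64_t id, uint32_t ost_idx)
{
        return EX_FID_SEQ_IDIF | ((uint64_t)ost_idx << 16) | ((id >> 32) & 0xffff);
}

/* Mirror of fid_idif_id(): rebuild the 48-bit objid from seq + oid (+ ver). */
static uint64_t ex_idif_id(uint64_t seq, uint32_t oid, uint32_t ver)
{
        return ((uint64_t)ver << 48) | ((seq & 0xffff) << 32) | oid;
}

/* Mirror of idif_ost_idx(). */
static uint32_t ex_idif_ost_idx(uint64_t seq)
{
        return (seq >> 16) & 0xffff;
}

int main(void)
{
        uint64_t objid = 0x123456789abULL;      /* any value below 2^48 */
        uint32_t ost_idx = 7;

        uint64_t seq = ex_idif_seq(objid, ost_idx);
        uint32_t oid = objid & 0xffffffff;      /* low 32 bits become f_oid */

        assert(ex_idif_id(seq, oid, 0) == objid);
        assert(ex_idif_ost_idx(seq) == ost_idx);
        return 0;
}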
+ */ +static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __cpu_to_le64(fid_seq(src)); + dst->f_oid = __cpu_to_le32(fid_oid(src)); + dst->f_ver = __cpu_to_le32(fid_ver(src)); +} + +static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __le64_to_cpu(fid_seq(src)); + dst->f_oid = __le32_to_cpu(fid_oid(src)); + dst->f_ver = __le32_to_cpu(fid_ver(src)); +} + +static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __cpu_to_be64(fid_seq(src)); + dst->f_oid = __cpu_to_be32(fid_oid(src)); + dst->f_ver = __cpu_to_be32(fid_ver(src)); +} + +static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __be64_to_cpu(fid_seq(src)); + dst->f_oid = __be32_to_cpu(fid_oid(src)); + dst->f_ver = __be32_to_cpu(fid_ver(src)); +} + +static inline bool fid_is_sane(const struct lu_fid *fid) +{ + return fid && ((fid_seq(fid) >= FID_SEQ_START && !fid_ver(fid)) || + fid_is_igif(fid) || fid_is_idif(fid) || + fid_seq_is_rsvd(fid_seq(fid))); +} + +static inline bool lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1) +{ + return !memcmp(f0, f1, sizeof(*f0)); +} + +static inline int lu_fid_cmp(const struct lu_fid *f0, + const struct lu_fid *f1) +{ + if (fid_seq(f0) != fid_seq(f1)) + return fid_seq(f0) > fid_seq(f1) ? 1 : -1; + + if (fid_oid(f0) != fid_oid(f1)) + return fid_oid(f0) > fid_oid(f1) ? 1 : -1; + + if (fid_ver(f0) != fid_ver(f1)) + return fid_ver(f0) > fid_ver(f1) ? 1 : -1; + + return 0; +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h new file mode 100644 index 0000000000000..d530794f4e9cb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h @@ -0,0 +1,99 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * FIEMAP data structures and flags. This header file will be used until + * fiemap.h is available in the upstream kernel. + * + * Author: Kalpak Shah + * Author: Andreas Dilger + */ + +#ifndef _LUSTRE_FIEMAP_H +#define _LUSTRE_FIEMAP_H + +#ifdef __KERNEL__ +# include +#else +# include +#endif +#include +#include + +/** + * XXX: We use fiemap_extent::fe_reserved[0], notice the high 16bits of it + * is used to locate the stripe number starting from the very beginning to + * resume the fiemap call. 
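The helpers just below implement this overload of fe_reserved[0]: device number in the low 16 bits, stripe number in the high 16 bits. As a quick standalone illustration of the same bit layout (editorial sketch, not part of the patch; a bare uint32_t stands in for the fe_device field of struct fiemap_extent and the ex_ names are illustrative):

#include <assert.h>
#include <stdint.h>

/* Mirror of set_fe_device_stripenr(): stripe number in the high 16 bits,
 * device number in the low 16 bits of one 32-bit field. */
static uint32_t ex_pack_device_stripenr(int devno, int stripenr)
{
        return ((uint32_t)stripenr << 16) | (devno & 0xffff);
}

static int ex_get_device(uint32_t packed)   { return packed & 0xffff; }
static int ex_get_stripenr(uint32_t packed) { return packed >> 16; }

int main(void)
{
        uint32_t packed = ex_pack_device_stripenr(12, 3);

        assert(ex_get_device(packed) == 12);
        assert(ex_get_stripenr(packed) == 3);
        return 0;
}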
+ */ +#define fe_device fe_reserved[0] + +static inline int get_fe_device(struct fiemap_extent *fe) +{ + return fe->fe_device & 0xffff; +} +static inline void set_fe_device(struct fiemap_extent *fe, int devno) +{ + fe->fe_device = (fe->fe_device & 0xffff0000) | (devno & 0xffff); +} +static inline int get_fe_stripenr(struct fiemap_extent *fe) +{ + return fe->fe_device >> 16; +} +static inline void set_fe_stripenr(struct fiemap_extent *fe, int nr) +{ + fe->fe_device = (fe->fe_device & 0xffff) | (nr << 16); +} +static inline void set_fe_device_stripenr(struct fiemap_extent *fe, int devno, + int nr) +{ + fe->fe_device = (nr << 16) | (devno & 0xffff); +} + +static inline __kernel_size_t fiemap_count_to_size(__kernel_size_t extent_count) +{ + return sizeof(struct fiemap) + extent_count * + sizeof(struct fiemap_extent); +} + +static inline unsigned int fiemap_size_to_count(__kernel_size_t array_size) +{ + return (array_size - sizeof(struct fiemap)) / + sizeof(struct fiemap_extent); +} + +#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ + +#ifdef FIEMAP_FLAGS_COMPAT +#undef FIEMAP_FLAGS_COMPAT +#endif + +#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. + * Sets NO_DIRECT flag */ + +#endif /* _LUSTRE_FIEMAP_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h new file mode 100644 index 0000000000000..5edfca121c4df --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h @@ -0,0 +1,3755 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Lustre wire protocol definitions. + */ + +/** \defgroup lustreidl lustreidl + * + * Lustre wire protocol definitions. + * + * ALL structs passing over the wire should be declared here. Structs + * that are used in interfaces with userspace should go in lustre_user.h. + * + * All structs being declared here should be built from simple fixed-size + * types defined in linux/types.h or be built from other types or + * structs also declared in this file. Similarly, all flags and magic + * values in those structs should also be declared here. This ensures + * that the Lustre wire protocol is not influenced by external dependencies. + * + * The only other acceptable items in this file are VERY SIMPLE accessor + * functions to avoid callers grubbing inside the structures. Nothing that + * depends on external functions or definitions should be in here. 
+ * + * Structs must be properly aligned to put 64-bit values on an 8-byte + * boundary. Any structs being added here must also be added to + * utils/wirecheck.c and "make newwiretest" run to regenerate the + * utils/wiretest.c sources. This allows us to verify that wire structs + * have the proper alignment/size on all architectures. + * + * DO NOT CHANGE any of the structs, flags, values declared here and used + * in released Lustre versions. Some structs may have padding fields that + * can be used. Some structs might allow addition at the end (verify this + * in the code to ensure that new/old clients that see this larger struct + * do not fail, otherwise you need to implement protocol compatibility). + * + * @{ + */ + +#ifndef _LUSTRE_IDL_H_ +#define _LUSTRE_IDL_H_ + +#include +#include +#include +#include +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * GENERAL STUFF + */ +/* FOO_REQUEST_PORTAL is for incoming requests on the FOO + * FOO_REPLY_PORTAL is for incoming replies on the FOO + * FOO_BULK_PORTAL is for incoming bulk on the FOO + */ + +#define CONNMGR_REQUEST_PORTAL 1 +#define CONNMGR_REPLY_PORTAL 2 +#define OSC_REPLY_PORTAL 4 +#define OST_IO_PORTAL 6 +#define OST_CREATE_PORTAL 7 +#define OST_BULK_PORTAL 8 +#define MDC_REPLY_PORTAL 10 +#define MDS_REQUEST_PORTAL 12 +#define MDS_IO_PORTAL 13 +#define MDS_BULK_PORTAL 14 +#define LDLM_CB_REQUEST_PORTAL 15 +#define LDLM_CB_REPLY_PORTAL 16 +#define LDLM_CANCEL_REQUEST_PORTAL 17 +#define LDLM_CANCEL_REPLY_PORTAL 18 +/* #define MDS_SETATTR_PORTAL 22 obsolete after 2.13 */ +#define MDS_READPAGE_PORTAL 23 +#define OUT_PORTAL 24 +#define MGC_REPLY_PORTAL 25 +#define MGS_REQUEST_PORTAL 26 +#define MGS_REPLY_PORTAL 27 +#define OST_REQUEST_PORTAL 28 +#define FLD_REQUEST_PORTAL 29 +#define SEQ_METADATA_PORTAL 30 +#define SEQ_DATA_PORTAL 31 +#define SEQ_CONTROLLER_PORTAL 32 +#define MGS_BULK_PORTAL 33 +/* #define DVS_PORTAL 63 */ +/* reserved for Cray DVS - spitzcor@cray.com, roe@cray.com, n8851@cray.com */ + +/** + * Describes a range of sequence, lsr_start is included but lsr_end is + * not in the range. + * Same structure is used in fld module where lsr_index field holds mdt id + * of the home mdt. + */ +struct lu_seq_range { + __u64 lsr_start; + __u64 lsr_end; + __u32 lsr_index; + __u32 lsr_flags; +}; + +struct lu_seq_range_array { + __u32 lsra_count; + __u32 lsra_padding; + struct lu_seq_range lsra_lsr[0]; +}; + +#define LU_SEQ_RANGE_MDT 0x0 +#define LU_SEQ_RANGE_OST 0x1 +#define LU_SEQ_RANGE_ANY 0x3 + +#define LU_SEQ_RANGE_MASK 0x3 + +/** \defgroup lu_fid lu_fid + * @{ */ + +extern void lustre_lma_swab(struct lustre_mdt_attrs *lma); +extern void lustre_lma_init(struct lustre_mdt_attrs *lma, + const struct lu_fid *fid, + __u32 compat, __u32 incompat); +extern void lustre_loa_swab(struct lustre_ost_attrs *loa, + bool to_cpu); +extern void lustre_loa_init(struct lustre_ost_attrs *loa, + const struct lu_fid *fid, + __u32 compat, __u32 incompat); + +/* copytool can use any nonnegative integer to represent archive-Ids during + * register with MDT thru kuc. + * archive num = 0 => all + * archive num from 1 to MAX_U32 + */ +#define LL_HSM_ORIGIN_MAX_ARCHIVE (sizeof(__u32) * 8) +/* the max count of archive ids that one agent can support */ +#define LL_HSM_MAX_ARCHIVES_PER_AGENT 1024 + +/** + * HSM on-disk attributes stored in a separate xattr. + */ +struct hsm_attrs { + /** Bitfield for supported data in this structure. For future use. 
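As the lu_seq_range comment above notes, lsr_start is inside the range while lsr_end is not, i.e. a range is the half-open interval [lsr_start, lsr_end). A minimal sketch of a membership check under that convention (editorial illustration only; the ex_ helper is hypothetical and not part of the patch):

#include <assert.h>
#include <stdint.h>

struct ex_seq_range {
        uint64_t lsr_start;     /* first sequence in the range (inclusive) */
        uint64_t lsr_end;       /* first sequence past the range (exclusive) */
};

/* Hypothetical helper: true if seq falls inside the half-open range. */
static int ex_range_contains(const struct ex_seq_range *r, uint64_t seq)
{
        return seq >= r->lsr_start && seq < r->lsr_end;
}

int main(void)
{
        struct ex_seq_range r = {
                .lsr_start = 0x200000400ULL,
                .lsr_end   = 0x200000800ULL,
        };

        assert(ex_range_contains(&r, 0x200000400ULL));  /* start is included */
        assert(!ex_range_contains(&r, 0x200000800ULL)); /* end is excluded */
        return 0;
}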
*/ + __u32 hsm_compat; + + /** HSM flags, see hsm_flags enum below */ + __u32 hsm_flags; + /** backend archive id associated with the file */ + __u64 hsm_arch_id; + /** version associated with the last archiving, if any */ + __u64 hsm_arch_ver; +}; +extern void lustre_hsm_swab(struct hsm_attrs *attrs); + +/** + * fid constants + */ +enum { + /** LASTID file has zero OID */ + LUSTRE_FID_LASTID_OID = 0UL, + /** initial fid id value */ + LUSTRE_FID_INIT_OID = 1UL +}; + +/** + * Different FID Format + * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs#NEW.0 + * + * FID: + * File IDentifier generated by client from range allocated by the seq service. + * First 0x400 sequences [2^33, 2^33 + 0x400] are reserved for system use. Note + * that on ldiskfs MDTs that IGIF FIDs can use inode numbers starting at 12, + * but this is in the IGIF SEQ rangeand does not conflict with assigned FIDs. + * + * IGIF: + * Inode and Generation In FID, a surrogate FID used to globally identify an + * existing object on OLD formatted MDT file system. This would only be used on + * MDT0 in a DNE filesystem, because there are not expected to be any OLD + * formatted DNE filesystems. Belongs to a sequence in [12, 2^32 - 1] range, + * where sequence number is inode number, and inode generation is used as OID. + * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem, + * which is the maximum possible for an ldiskfs backend. NOTE: This assumes + * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible + * to clients, which has always been true. + * + * IDIF: + * Object ID in FID, a surrogate FID used to globally identify an existing + * object on OLD formatted OST file system. Belongs to a sequence in + * [2^32, 2^33 - 1]. Sequence number is calculated as: + * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff) + * that is, SEQ consists of 16-bit OST index, and higher 16 bits of object ID. + * The generation of unique SEQ values per OST allows the IDIF FIDs to be + * identified in the FLD correctly. The OID field is calculated as: + * objid & 0xffffffff + * that is, it consists of lower 32 bits of object ID. NOTE This assumes that + * no more than 2^48-1 objects have ever been created on an OST, and that no + * more than 65535 OSTs are in use. Both are very reasonable assumptions (can + * uniquely map all objects on an OST that created 1M objects per second for 9 + * years, or combinations thereof). + * + * OST_MDT0: + * Surrogate FID used to identify an existing object on OLD formatted OST + * filesystem. Belongs to the reserved sequence 0, and is used internally prior + * to the introduction of FID-on-OST, at which point IDIF will be used to + * identify objects as residing on a specific OST. + * + * LLOG: + * For Lustre Log objects the object sequence 1 is used. This is compatible with + * both OLD and NEW.1 namespaces, as this SEQ number is in the ext3/ldiskfs + * reserved inode range and does not conflict with IGIF sequence numbers. + * + * ECHO: + * For testing OST IO performance the object sequence 2 is used. This is + * compatible with both OLD and NEW.1 namespaces, as this SEQ number is in the + * ext3/ldiskfs reserved inode range and does not conflict with IGIF sequence + * numbers. + * + * OST_MDT1 .. OST_MAX: + * For testing with multiple MDTs the object sequence 3 through 9 is used, + * allowing direct mapping of MDTs 1 through 7 respectively, for a total of 8 + * MDTs including OST_MDT0. This matches the legacy CMD project "group" + * mappings. 
However, this SEQ range is only for testing prior to any production + * DNE release, as the objects in this range conflict across all OSTs, as the + * OST index is not part of the FID. + * + * + * For compatibility with existing OLD OST network protocol structures, the FID + * must map onto the o_id and o_gr in a manner that ensures existing objects are + * identified consistently for IO, as well as onto the lock namespace to ensure + * both IDIFs map onto the same objects for IO as well as resources in the DLM. + * + * DLM OLD OBIF/IDIF: + * resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases + * + * DLM NEW.1 FID (this is the same for both the MDT and OST): + * resource[] = {SEQ, OID, VER, HASH}; + * + * Note that for mapping IDIF values to DLM resource names the o_id may be + * larger than the 2^33 reserved sequence numbers for IDIF, so it is possible + * for the o_id numbers to overlap FID SEQ numbers in the resource. However, in + * all production releases the OLD o_seq field is always zero, and all valid FID + * OID values are non-zero, so the lock resources will not collide. + * + * For objects within the IDIF range, group extraction (non-CMD) will be: + * o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid; + * o_seq = 0; // formerly group number + */ + +/** + * Note that reserved SEQ numbers below 12 will conflict with ldiskfs + * inodes in the IGIF namespace, so these reserved SEQ numbers can be + * used for other purposes and not risk collisions with existing inodes. + */ +enum fid_seq { + FID_SEQ_OST_MDT0 = 0, + FID_SEQ_LLOG = 1, /* unnamed llogs */ + FID_SEQ_ECHO = 2, + FID_SEQ_UNUSED_START = 3, /* Unused */ + FID_SEQ_UNUSED_END = 9, /* Unused */ + FID_SEQ_LLOG_NAME = 10, /* named llogs */ + FID_SEQ_RSVD = 11, + FID_SEQ_IGIF = 12, + FID_SEQ_IGIF_MAX = 0x0ffffffffULL, + FID_SEQ_IDIF = 0x100000000ULL, + FID_SEQ_IDIF_MAX = 0x1ffffffffULL, + /* Normal FID sequence starts from this value, i.e. 1<<33 */ + FID_SEQ_START = 0x200000000ULL, + /* sequence for local pre-defined FIDs listed in local_oid */ + FID_SEQ_LOCAL_FILE = 0x200000001ULL, + FID_SEQ_DOT_LUSTRE = 0x200000002ULL, + /* sequence is used for local named objects FIDs generated + * by local_object_storage library */ + FID_SEQ_LOCAL_NAME = 0x200000003ULL, + /* Because current FLD will only cache the fid sequence, instead + * of oid on the client side, if the FID needs to be exposed to + * clients sides, it needs to make sure all of fids under one + * sequence will be located in one MDT. */ + FID_SEQ_SPECIAL = 0x200000004ULL, + FID_SEQ_QUOTA = 0x200000005ULL, + FID_SEQ_QUOTA_GLB = 0x200000006ULL, + FID_SEQ_ROOT = 0x200000007ULL, /* Located on MDT0 */ + FID_SEQ_LAYOUT_RBTREE = 0x200000008ULL, + /* sequence is used for update logs of cross-MDT operation */ + FID_SEQ_UPDATE_LOG = 0x200000009ULL, + /* Sequence is used for the directory under which update logs + * are created. 
*/ + FID_SEQ_UPDATE_LOG_DIR = 0x20000000aULL, + FID_SEQ_NORMAL = 0x200000400ULL, + FID_SEQ_LOV_DEFAULT = 0xffffffffffffffffULL +}; + +#define OBIF_OID_MAX_BITS 32 +#define OBIF_MAX_OID (1ULL << OBIF_OID_MAX_BITS) +#define OBIF_OID_MASK ((1ULL << OBIF_OID_MAX_BITS) - 1) +#define IDIF_OID_MAX_BITS 48 +#define IDIF_MAX_OID (1ULL << IDIF_OID_MAX_BITS) +#define IDIF_OID_MASK ((1ULL << IDIF_OID_MAX_BITS) - 1) + +/** OID for FID_SEQ_SPECIAL */ +enum special_oid { + /* Big Filesystem Lock to serialize rename operations */ + FID_OID_SPECIAL_BFL = 1UL, +}; + +/** OID for FID_SEQ_DOT_LUSTRE */ +enum dot_lustre_oid { + FID_OID_DOT_LUSTRE = 1UL, + FID_OID_DOT_LUSTRE_OBF = 2UL, + FID_OID_DOT_LUSTRE_LPF = 3UL, +}; + +/** OID for FID_SEQ_ROOT */ +enum root_oid { + FID_OID_ROOT = 1UL, + FID_OID_ECHO_ROOT = 2UL, +}; + +struct lu_orphan_rec { + /* The MDT-object's FID referenced by the orphan OST-object */ + struct lu_fid lor_fid; + __u32 lor_uid; + __u32 lor_gid; +}; + +struct lu_orphan_ent { + /* The orphan OST-object's FID */ + struct lu_fid loe_key; + struct lu_orphan_rec loe_rec; +}; + +struct lu_orphan_rec_v2 { + struct lu_orphan_rec lor_rec; + struct ost_layout lor_layout; + __u32 lor_padding; +}; + +struct lu_orphan_ent_v2 { + /* The orphan OST-object's FID */ + struct lu_fid loe_key; + struct lu_orphan_rec_v2 loe_rec; +}; + +struct lu_orphan_rec_v3 { + struct lu_orphan_rec lor_rec; + struct ost_layout lor_layout; + /* The OST-object declared layout version in PFID EA.*/ + __u32 lor_layout_version; + /* The OST-object declared layout range (of version) in PFID EA.*/ + __u32 lor_range; + __u32 lor_padding_1; + __u64 lor_padding_2; +}; + +struct lu_orphan_ent_v3 { + /* The orphan OST-object's FID */ + struct lu_fid loe_key; + struct lu_orphan_rec_v3 loe_rec; +}; + +/** @} lu_fid */ + +/** \defgroup lu_dir lu_dir + * @{ */ + +/** + * Enumeration of possible directory entry attributes. + * + * Attributes follow directory entry header in the order they appear in this + * enumeration. + */ +enum lu_dirent_attrs { + LUDA_FID = 0x0001, + LUDA_TYPE = 0x0002, + LUDA_64BITHASH = 0x0004, + + /* The following attrs are used for MDT internal only, + * not visible to client */ + + /* Something in the record is unknown, to be verified in further. */ + LUDA_UNKNOWN = 0x0400, + /* Ignore this record, go to next directly. */ + LUDA_IGNORE = 0x0800, + /* The system is upgraded, has beed or to be repaired (dryrun). */ + LUDA_UPGRADE = 0x1000, + /* The dirent has been repaired, or to be repaired (dryrun). */ + LUDA_REPAIR = 0x2000, + /* Only check but not repair the dirent inconsistency */ + LUDA_VERIFY_DRYRUN = 0x4000, + /* Verify the dirent consistency */ + LUDA_VERIFY = 0x8000, +}; + +#define LU_DIRENT_ATTRS_MASK 0xff00 + +/** + * Layout of readdir pages, as transmitted on wire. + */ +struct lu_dirent { + /** valid if LUDA_FID is set. */ + struct lu_fid lde_fid; + /** a unique entry identifier: a hash or an offset. */ + __u64 lde_hash; + /** total record length, including all attributes. */ + __u16 lde_reclen; + /** name length */ + __u16 lde_namelen; + /** optional variable size attributes following this entry. + * taken from enum lu_dirent_attrs. + */ + __u32 lde_attrs; + /** name is followed by the attributes indicated in ->ldp_attrs, in + * their natural order. After the last attribute, padding bytes are + * added to make ->lde_reclen a multiple of 8. + */ + char lde_name[0]; +}; + +/* + * Definitions of optional directory entry attributes formats. 
+ * + * Individual attributes do not have their length encoded in a generic way. It + * is assumed that consumer of an attribute knows its format. This means that + * it is impossible to skip over an unknown attribute, except by skipping over all + * remaining attributes (by using ->lde_reclen), which is not too + * constraining, because new server versions will append new attributes at + * the end of an entry. + */ + +/** + * Fid directory attribute: a fid of an object referenced by the entry. This + * will be almost always requested by the client and supplied by the server. + * + * Aligned to 8 bytes. + */ +/* To have compatibility with 1.8, lets have fid in lu_dirent struct. */ + +/** + * File type. + * + * Aligned to 2 bytes. + */ +struct luda_type { + __u16 lt_type; +}; + +struct lu_dirpage { + __u64 ldp_hash_start; + __u64 ldp_hash_end; + __u32 ldp_flags; + __u32 ldp_pad0; + struct lu_dirent ldp_entries[0]; +}; + +enum lu_dirpage_flags { + /** + * dirpage contains no entry. + */ + LDF_EMPTY = 1 << 0, + /** + * last entry's lde_hash equals ldp_hash_end. + */ + LDF_COLLIDE = 1 << 1 +}; + +static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp) +{ + if (__le32_to_cpu(dp->ldp_flags) & LDF_EMPTY) + return NULL; + else + return dp->ldp_entries; +} + +static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent) +{ + struct lu_dirent *next; + + if (__le16_to_cpu(ent->lde_reclen) != 0) + next = ((void *)ent) + __le16_to_cpu(ent->lde_reclen); + else + next = NULL; + + return next; +} + +static inline __kernel_size_t lu_dirent_calc_size(size_t namelen, __u16 attr) +{ + __kernel_size_t size; + + if (attr & LUDA_TYPE) { + const __kernel_size_t align = sizeof(struct luda_type) - 1; + + size = (sizeof(struct lu_dirent) + namelen + 1 + align) & + ~align; + size += sizeof(struct luda_type); + } else { + size = sizeof(struct lu_dirent) + namelen + 1; + } + + return (size + 7) & ~7; +} + +static inline __u16 lu_dirent_type_get(struct lu_dirent *ent) +{ + __u16 type = 0; + struct luda_type *lt; + int len = 0; + + if (__le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) { + const unsigned int align = sizeof(struct luda_type) - 1; + + len = __le16_to_cpu(ent->lde_namelen); + len = (len + align) & ~align; + lt = (void *)ent->lde_name + len; + type = __le16_to_cpu(lt->lt_type); + } + + return type; +} + +#define MDS_DIR_END_OFF 0xfffffffffffffffeULL + +/** + * MDS_READPAGE page size + * + * This is the directory page size packed in MDS_READPAGE RPC. + * It's different than PAGE_SIZE because the client needs to + * access the struct lu_dirpage header packed at the beginning of + * the "page" and without this there isn't any way to know find the + * lu_dirpage header is if client and server PAGE_SIZE differ. + */ +#define LU_PAGE_SHIFT 12 +#define LU_PAGE_SIZE (1UL << LU_PAGE_SHIFT) +#define LU_PAGE_MASK (~(LU_PAGE_SIZE - 1)) + +#define LU_PAGE_COUNT (1 << (PAGE_SHIFT - LU_PAGE_SHIFT)) + +/** @} lu_dir */ + +struct lustre_handle { + __u64 cookie; +}; +#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL + +static inline bool lustre_handle_is_used(const struct lustre_handle *lh) +{ + return lh->cookie != 0; +} + +static inline bool lustre_handle_equal(const struct lustre_handle *lh1, + const struct lustre_handle *lh2) +{ + return lh1->cookie == lh2->cookie; +} + +static inline void lustre_handle_copy(struct lustre_handle *tgt, + const struct lustre_handle *src) +{ + tgt->cookie = src->cookie; +} + +/* lustre_msg struct magic. DON'T use swabbed values of MAGIC as magic! 
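Stepping back to lu_dirent_calc_size() above: it first rounds the name (plus its NUL) up so the optional struct luda_type lands on a 2-byte boundary, then rounds the whole record up to 8 bytes so consecutive lu_dirent records stay aligned. A standalone restatement of that arithmetic with concrete numbers (editorial sketch, not part of the patch; the 32-byte header size is the one implied by the lu_dirent field layout above):

#include <assert.h>
#include <stddef.h>

#define EX_DIRENT_HDR   32      /* lu_fid(16) + hash(8) + reclen/namelen(4) + attrs(4) */
#define EX_LUDA_TYPE    2       /* sizeof(struct luda_type) */

/* Mirror of lu_dirent_calc_size() for the LUDA_TYPE case. */
static size_t ex_dirent_size(size_t namelen)
{
        const size_t align = EX_LUDA_TYPE - 1;
        size_t size = (EX_DIRENT_HDR + namelen + 1 + align) & ~align;

        size += EX_LUDA_TYPE;                   /* room for struct luda_type */
        return (size + 7) & ~(size_t)7;         /* records are 8-byte aligned */
}

int main(void)
{
        /* "foo.c" (5 chars): 32 + 5 + 1 = 38, +2 for luda_type = 40, already 8-aligned. */
        assert(ex_dirent_size(5) == 40);
        /* "README" (6 chars): 32 + 6 + 1 = 39 -> 40 after 2-byte rounding, +2 = 42 -> 48. */
        assert(ex_dirent_size(6) == 48);
        return 0;
}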
*/ +enum lustre_msg_magic { + LUSTRE_MSG_MAGIC_V2 = 0x0BD00BD3, + LUSTRE_MSG_MAGIC_V2_SWABBED = 0xD30BD00B, + LUSTRE_MSG_MAGIC = LUSTRE_MSG_MAGIC_V2 +}; + +/* flags for lm_flags */ +enum lustre_msghdr { + MSGHDR_AT_SUPPORT = 0x1, /* adaptive timeouts, lm_cksum valid + * in early reply messages */ + MSGHDR_CKSUM_INCOMPAT18 = 0x2, /* compat for 1.8, needs to be set well + * beyond 2.8.0 for compatibility */ +}; + +#define lustre_msg lustre_msg_v2 +/* we depend on this structure to be 8-byte aligned */ +/* this type is only endian-adjusted in lustre_unpack_msg() */ +struct lustre_msg_v2 { + __u32 lm_bufcount; /* number of buffers in lm_buflens[] */ + __u32 lm_secflvr; /* 0 = no crypto, or sptlrpc security flavour */ + __u32 lm_magic; /* RPC version magic = LUSTRE_MSG_MAGIC_V2 */ + __u32 lm_repsize; /* size of preallocated reply buffer */ + __u32 lm_cksum; /* CRC32 of ptlrpc_body early reply messages */ + __u32 lm_flags; /* enum lustre_msghdr MSGHDR_* flags */ + __u32 lm_padding_2; /* unused */ + __u32 lm_padding_3; /* unused */ + __u32 lm_buflens[0]; /* length of additional buffers in bytes, + * padded to a multiple of 8 bytes. */ + /* + * message buffers are packed after padded lm_buflens[] array, + * padded to a multiple of 8 bytes each to align contents. + */ +}; + +/* ptlrpc_body packet pb_types */ +#define PTL_RPC_MSG_REQUEST 4711 /* normal RPC request message */ +#define PTL_RPC_MSG_ERR 4712 /* error reply if request unprocessed */ +#define PTL_RPC_MSG_REPLY 4713 /* normal RPC reply message */ + +/* ptlrpc_body pb_version ((target_version << 16) | rpc_version) */ +enum lustre_msg_version { + PTLRPC_MSG_VERSION = 0x00000003, + LUSTRE_VERSION_MASK = 0xffff0000, + LUSTRE_OBD_VERSION = 0x00010000, + LUSTRE_MDS_VERSION = 0x00020000, + LUSTRE_OST_VERSION = 0x00030000, + LUSTRE_DLM_VERSION = 0x00040000, + LUSTRE_LOG_VERSION = 0x00050000, + LUSTRE_MGS_VERSION = 0x00060000, +}; + +/* pb_flags that apply to all request messages */ +/* #define MSG_LAST_REPLAY 0x0001 obsolete 2.0 => {REQ,LOCK}_REPLAY_DONE */ +#define MSG_RESENT 0x0002 /* was previously sent, no reply seen */ +#define MSG_REPLAY 0x0004 /* was processed, got reply, recovery */ +/* #define MSG_AT_SUPPORT 0x0008 obsolete since 1.5, AT always enabled */ +/* #define MSG_DELAY_REPLAY 0x0010 obsolete since 2.0 */ +/* #define MSG_VERSION_REPLAY 0x0020 obsolete since 1.8.2, VBR always on */ +#define MSG_REQ_REPLAY_DONE 0x0040 /* request replay over, locks next */ +#define MSG_LOCK_REPLAY_DONE 0x0080 /* lock replay over, client done */ + +/* pb_op_flags for connect opcodes: MDS_CONNECT, OST_CONNECT, MGS_CONNECT */ +#define MSG_CONNECT_RECOVERING 0x00000001 /* target is in recovery */ +#define MSG_CONNECT_RECONNECT 0x00000002 /* tgt already has client import */ +#define MSG_CONNECT_REPLAYABLE 0x00000004 /* target supports RPC replay */ +/* #define MSG_CONNECT_PEER 0x00000008 obsolete since 1.2, removed in 1.5 */ +#define MSG_CONNECT_LIBCLIENT 0x00000010 /* obsolete since 2.3, removed 2.6 */ +#define MSG_CONNECT_INITIAL 0x00000020 /* first client connection attempt */ +/* #define MSG_CONNECT_ASYNC 0x00000040 obsolete since 1.5 */ +#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ +#define MSG_CONNECT_TRANSNO 0x00000100 /* client sent transno in replay */ + +/* number of previous object versions in pb_pre_versions[] */ +#define PTLRPC_NUM_VERSIONS 4 +/* without gss, ptlrpc_body is put at the first buffer. 
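The comments in struct lustre_msg_v2 above describe the on-wire layout: the fixed header is followed by lm_bufcount 32-bit buffer lengths, the lm_buflens[] array is padded to a multiple of 8 bytes, and each buffer is then packed 8-byte aligned, with ptlrpc_body in the first slot. The sketch below computes buffer offsets under exactly those rules; it illustrates the documented layout and is not a copy of the real packing helpers (the ex_ names and the example lengths are made up):

#include <stdint.h>
#include <stdio.h>

#define EX_ROUND8(x)    (((x) + 7) & ~(uint32_t)7)
#define EX_MSG_V2_HDR   32      /* eight fixed __u32 fields before lm_buflens[] */

/* Offset of message buffer n: padded lm_buflens[] array first, then
 * 8-byte aligned buffers packed back to back. Illustrative only. */
static uint32_t ex_msg_buf_offset(uint32_t bufcount, const uint32_t *buflens,
                                  uint32_t n)
{
        uint32_t off = EX_ROUND8(EX_MSG_V2_HDR + bufcount * 4u); /* 4 = sizeof(__u32) */
        uint32_t i;

        for (i = 0; i < n && i < bufcount; i++)
                off += EX_ROUND8(buflens[i]);
        return off;
}

int main(void)
{
        /* Two example buffer lengths: slot 0 would hold ptlrpc_body
         * (MSG_PTLRPC_BODY_OFF), slot 1 the request record (REQ_REC_OFF). */
        uint32_t lens[2] = { 184, 56 };

        printf("buf0 at %u, buf1 at %u\n",
               ex_msg_buf_offset(2, lens, 0), ex_msg_buf_offset(2, lens, 1));
        return 0;
}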
*/ +struct ptlrpc_body_v3 { + struct lustre_handle pb_handle; + __u32 pb_type; /* request/reply/err type: PTL_RPC_MSG_* */ + __u32 pb_version; /* LUSTRE_*_VERSION | PTLRPC_MSG_VERSION */ + __u32 pb_opc; /* RPC opcodes: MDS_*, OST_*, LDLM_, ... */ + __u32 pb_status; /* negative Linux x86 error number */ + __u64 pb_last_xid; /* highest replied XID w/o lower unreplied XID*/ + __u16 pb_tag; /* multiple modifying RPCs virtual slot index */ + __u16 pb_padding0; + __u32 pb_padding1; + __u64 pb_last_committed;/* rep: highest pb_transno committed to disk */ + __u64 pb_transno; /* server-assigned transno for modifying RPCs */ + __u32 pb_flags; /* req: MSG_* flags */ + __u32 pb_op_flags; /* req: MSG_CONNECT_* flags */ + __u32 pb_conn_cnt; /* connect instance of this client on server */ + __u32 pb_timeout; /* req: max wait time; rep: service estimate */ + __u32 pb_service_time; /* rep: server arrival to reply in seconds */ + __u32 pb_limit; /* rep: dynamic DLM LRU lock count limit */ + __u64 pb_slv; /* rep: dynamic DLM LRU server lock volume */ + /* VBR: rep: previous pb_version(s) of objects modified by this RPC */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + __u64 pb_mbits; /**< match bits for bulk request */ + /* padding for future needs - fix lustre_swab_ptlrpc_body() also */ + __u64 pb_padding64_0; + __u64 pb_padding64_1; + __u64 pb_padding64_2; + char pb_jobid[LUSTRE_JOBID_SIZE]; /* req: ASCII jobid from env + NUL */ +}; +#define ptlrpc_body ptlrpc_body_v3 + +struct ptlrpc_body_v2 { + struct lustre_handle pb_handle; + __u32 pb_type; + __u32 pb_version; + __u32 pb_opc; + __u32 pb_status; + __u64 pb_last_xid; /* highest replied XID without lower unreplied XID */ + __u16 pb_tag; /* virtual slot idx for multiple modifying RPCs */ + __u16 pb_padding0; + __u32 pb_padding1; + __u64 pb_last_committed; + __u64 pb_transno; + __u32 pb_flags; + __u32 pb_op_flags; + __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time, also used for + net_latency of req */ + __u32 pb_limit; + __u64 pb_slv; + /* VBR: pre-versions */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + __u64 pb_mbits; /**< unused in V2 */ + /* padding for future needs */ + __u64 pb_padding64_0; + __u64 pb_padding64_1; + __u64 pb_padding64_2; +}; + +/* message body offset for lustre_msg_v2 */ +/* ptlrpc body offset in all request/reply messages */ +#define MSG_PTLRPC_BODY_OFF 0 + +/* normal request/reply message record offset */ +#define REQ_REC_OFF 1 +#define REPLY_REC_OFF 1 + +/* ldlm request message body offset */ +#define DLM_LOCKREQ_OFF 1 /* lockreq offset */ +#define DLM_REQ_REC_OFF 2 /* normal dlm request record offset */ + +/* ldlm intent lock message body offset */ +#define DLM_INTENT_IT_OFF 2 /* intent lock it offset */ +#define DLM_INTENT_REC_OFF 3 /* intent lock record offset */ + +/* ldlm reply message body offset */ +#define DLM_LOCKREPLY_OFF 1 /* lockrep offset */ +#define DLM_REPLY_REC_OFF 2 /* reply record offset */ + +/** only use in req->rq_{req,rep}_swab_mask */ +#define MSG_PTLRPC_HEADER_OFF 31 + +/* Connect flags */ +#define OBD_CONNECT_RDONLY 0x1ULL /*client has read-only access*/ +#define OBD_CONNECT_INDEX 0x2ULL /*connect specific LOV idx */ +#define OBD_CONNECT_MDS 0x4ULL /*connect from MDT to OST */ +#define OBD_CONNECT_GRANT 0x8ULL /*OSC gets grant at connect */ +#define OBD_CONNECT_SRVLOCK 0x10ULL /*server takes locks for cli */ +#define OBD_CONNECT_VERSION 0x20ULL /*Lustre versions in ocd */ +#define OBD_CONNECT_REQPORTAL 
0x40ULL /*Separate non-IO req portal */ +#define OBD_CONNECT_ACL 0x80ULL /*access control lists */ +#define OBD_CONNECT_XATTR 0x100ULL /*client use extended attr */ +#define OBD_CONNECT_LARGE_ACL 0x200ULL /* more than 32 ACL entries */ +/* was OBD_CONNECT_TRUNCLOCK 0x400ULL *locks on server for punch */ +/* temporary reuse until 2.21.53 to indicate pre-2.15 client, see LU-15478 */ +#define OBD_CONNECT_OLD_FALLOC 0x400ULL /* missing o_valid flags */ +#define OBD_CONNECT_TRANSNO 0x800ULL /*replay sends init transno */ +#define OBD_CONNECT_IBITS 0x1000ULL /* not checked in 2.11+ */ +#define OBD_CONNECT_BARRIER 0x2000ULL /* write barrier. Resevered to + * avoid use on client. + */ +#define OBD_CONNECT_ATTRFID 0x4000ULL /*Server can GetAttr By Fid*/ +#define OBD_CONNECT_NODEVOH 0x8000ULL /*No open hndl on specl nodes*/ +#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /* Remote client, never used + * in production. Removed in + * 2.9. Keep this flag to + * avoid reusing. + */ +#define OBD_CONNECT_RMT_CLIENT_FORCE 0x20000ULL /* Remote client by force, + * never used in production. + * Removed in 2.9. Keep this + * flag to avoid reusing. + */ +#define OBD_CONNECT_BRW_SIZE 0x40000ULL /*Max bytes per rpc */ +#define OBD_CONNECT_QUOTA64 0x80000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_MDS_CAPA 0x100000ULL /*MDS capability */ +#define OBD_CONNECT_OSS_CAPA 0x200000ULL /*OSS capability */ +#define OBD_CONNECT_CANCELSET 0x400000ULL /*Early batched cancels. */ +#define OBD_CONNECT_SOM 0x800000ULL /*Size on MDS */ +#define OBD_CONNECT_AT 0x1000000ULL /*client uses AT */ +#define OBD_CONNECT_LRU_RESIZE 0x2000000ULL /*LRU resize feature. */ +#define OBD_CONNECT_MDS_MDS 0x4000000ULL /*MDS-MDS connection */ +#define OBD_CONNECT_REAL 0x8000000ULL /* obsolete since 2.8 */ +#define OBD_CONNECT_CHANGE_QS 0x10000000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_CKSUM 0x20000000ULL /*support several cksum algos*/ +#define OBD_CONNECT_FID 0x40000000ULL /*FID is supported by server */ +#define OBD_CONNECT_VBR 0x80000000ULL /*version based recovery */ +#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */ +#define OBD_CONNECT_GRANT_SHRINK 0x200000000ULL /* support grant shrink */ +#define OBD_CONNECT_SKIP_ORPHAN 0x400000000ULL /* don't reuse orphan objids */ +#define OBD_CONNECT_MAX_EASIZE 0x800000000ULL /* preserved for large EA */ +#define OBD_CONNECT_FULL20 0x1000000000ULL /* it is 2.0 client */ +#define OBD_CONNECT_LAYOUTLOCK 0x2000000000ULL /* client uses layout lock */ +#define OBD_CONNECT_64BITHASH 0x4000000000ULL /* client supports 64-bits + * directory hash */ +#define OBD_CONNECT_MAXBYTES 0x8000000000ULL /* max stripe size */ +#define OBD_CONNECT_IMP_RECOV 0x10000000000ULL /* imp recovery support */ +#define OBD_CONNECT_JOBSTATS 0x20000000000ULL /* jobid in ptlrpc_body */ +#define OBD_CONNECT_UMASK 0x40000000000ULL /* create uses client umask */ +#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS + * RPC error properly */ +#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for + * finer space reservation */ +#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8 + * policy and 2.x server */ +#define OBD_CONNECT_LVB_TYPE 0x400000000000ULL /* variable type of LVB */ +#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */ +#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */ +#define OBD_CONNECT_SHORTIO 0x2000000000000ULL/* short io */ +#define OBD_CONNECT_PINGLESS 
0x4000000000000ULL/* pings not required */ +#define OBD_CONNECT_FLOCK_DEAD 0x8000000000000ULL/* improved flock deadlock detection */ +#define OBD_CONNECT_DISP_STRIPE 0x10000000000000ULL/* create stripe disposition*/ +#define OBD_CONNECT_OPEN_BY_FID 0x20000000000000ULL /* open by fid won't pack + name in request */ +#define OBD_CONNECT_LFSCK 0x40000000000000ULL/* support online LFSCK */ +#define OBD_CONNECT_UNLINK_CLOSE 0x100000000000000ULL/* close file in unlink */ +#define OBD_CONNECT_MULTIMODRPCS 0x200000000000000ULL /* support multiple modify + RPCs in parallel */ +#define OBD_CONNECT_DIR_STRIPE 0x400000000000000ULL /* striped DNE dir */ +#define OBD_CONNECT_SUBTREE 0x800000000000000ULL /* fileset mount */ +/* was OBD_CONNECT_LOCKAHEAD_OLD 0x1000000000000000ULL old lockahead 2.12-2.13*/ + +/** bulk matchbits is sent within ptlrpc_body */ +#define OBD_CONNECT_BULK_MBITS 0x2000000000000000ULL +#define OBD_CONNECT_OBDOPACK 0x4000000000000000ULL /* compact OUT obdo */ +#define OBD_CONNECT_FLAGS2 0x8000000000000000ULL /* second flags word */ +/* ocd_connect_flags2 flags */ +#define OBD_CONNECT2_FILE_SECCTX 0x1ULL /* set file security context at create */ +#define OBD_CONNECT2_LOCKAHEAD 0x2ULL /* ladvise lockahead v2 */ +#define OBD_CONNECT2_DIR_MIGRATE 0x4ULL /* migrate striped dir */ +#define OBD_CONNECT2_SUM_STATFS 0x8ULL /* MDT return aggregated stats */ +#define OBD_CONNECT2_OVERSTRIPING 0x10ULL /* OST overstriping support */ +#define OBD_CONNECT2_FLR 0x20ULL /* FLR support */ +#define OBD_CONNECT2_WBC_INTENTS 0x40ULL /* create/unlink/... intents for wbc, also operations under client-held parent locks */ +#define OBD_CONNECT2_LOCK_CONVERT 0x80ULL /* IBITS lock convert support */ +#define OBD_CONNECT2_ARCHIVE_ID_ARRAY 0x100ULL /* store HSM archive_id in array */ +#define OBD_CONNECT2_INC_XID 0x200ULL /* Increasing xid */ +#define OBD_CONNECT2_SELINUX_POLICY 0x400ULL /* has client SELinux policy */ +#define OBD_CONNECT2_LSOM 0x800ULL /* LSOM support */ +#define OBD_CONNECT2_PCC 0x1000ULL /* Persistent Client Cache */ +#define OBD_CONNECT2_CRUSH 0x2000ULL /* crush hash striped directory */ +#define OBD_CONNECT2_ASYNC_DISCARD 0x4000ULL /* support async DoM data discard */ +#define OBD_CONNECT2_ENCRYPT 0x8000ULL /* client-to-disk encrypt */ +#define OBD_CONNECT2_FIDMAP 0x10000ULL /* FID map */ +#define OBD_CONNECT2_GETATTR_PFID 0x20000ULL /* pack parent FID in getattr */ +#define OBD_CONNECT2_LSEEK 0x40000ULL /* SEEK_HOLE/DATA RPC */ +#define OBD_CONNECT2_DOM_LVB 0x80000ULL /* pack DOM glimpse data in LVB */ +#define OBD_CONNECT2_REP_MBITS 0x100000ULL /* match reply mbits not xid*/ +#define OBD_CONNECT2_MODE_CONVERT 0x200000ULL /* LDLM mode convert */ +#define OBD_CONNECT2_BATCH_RPC 0x400000ULL /* Multi-RPC batch request */ +#define OBD_CONNECT2_PCCRO 0x800000ULL /* Read-only PCC */ +#define OBD_CONNECT2_ATOMIC_OPEN_LOCK 0x4000000ULL/* request lock on 1st open */ +#define OBD_CONNECT2_ENCRYPT_NAME 0x8000000ULL /* name encrypt */ +/* risk of forwards incompatibility with upstream - use high order bits to mitigate */ +#define OBD_CONNECT2_MDLL_BYPASS 0x800000000000000ULL /* disable metadata lazy load */ +#define OBD_CONNECT2_MDLL 0x1000000000000000ULL /* enable metadata lazy load */ +#define OBD_CONNECT2_MDLL_AUTO_REFRESH 0x2000000000000000ULL /* enable metadata lazy load auto-refresh */ +/* XXX README XXX: + * Please DO NOT add flag values here before first ensuring that this same + * flag value is not in use on some other branch. 
Please clear any such + * changes with senior engineers before starting to use a new flag. Then, + * submit a small patch against EVERY branch that ONLY adds the new flag, + * updates obd_connect_names[], adds the flag to check_obd_connect_data(), + * and updates wiretests accordingly, so it can be approved and landed easily + * to reserve the flag for future use. + */ + +#define OCD_HAS_FLAG(ocd, flg) \ + (!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg)) + + +#ifdef HAVE_LRU_RESIZE_SUPPORT +#define LRU_RESIZE_CONNECT_FLAG OBD_CONNECT_LRU_RESIZE +#else +#define LRU_RESIZE_CONNECT_FLAG 0 +#endif + +#define MDT_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \ + OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \ + OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | \ + OBD_CONNECT_ATTRFID | OBD_CONNECT_CANCELSET | \ + OBD_CONNECT_AT | OBD_CONNECT_BRW_SIZE | \ + OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | \ + LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_VBR | \ + OBD_CONNECT_LOV_V3 | OBD_CONNECT_FULL20 | \ + OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \ + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\ + OBD_CONNECT_PINGLESS | OBD_CONNECT_MAX_EASIZE |\ + OBD_CONNECT_FLOCK_DEAD | \ + OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | \ + OBD_CONNECT_OPEN_BY_FID | \ + OBD_CONNECT_DIR_STRIPE | OBD_CONNECT_GRANT | \ + OBD_CONNECT_SRVLOCK | OBD_CONNECT_BULK_MBITS |\ + OBD_CONNECT_CKSUM |\ + OBD_CONNECT_MULTIMODRPCS |\ + OBD_CONNECT_SUBTREE | OBD_CONNECT_LARGE_ACL |\ + OBD_CONNECT_GRANT_PARAM | \ + OBD_CONNECT_GRANT_SHRINK | \ + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2) + +#define MDT_CONNECT_SUPPORTED2 (OBD_CONNECT2_FILE_SECCTX | \ + OBD_CONNECT2_DIR_MIGRATE | \ + OBD_CONNECT2_SUM_STATFS | \ + OBD_CONNECT2_OVERSTRIPING | \ + OBD_CONNECT2_FLR |\ + OBD_CONNECT2_LOCK_CONVERT | \ + OBD_CONNECT2_ARCHIVE_ID_ARRAY | \ + OBD_CONNECT2_INC_XID | \ + OBD_CONNECT2_SELINUX_POLICY | \ + OBD_CONNECT2_LSOM | \ + OBD_CONNECT2_ASYNC_DISCARD | \ + OBD_CONNECT2_PCC | \ + OBD_CONNECT2_MDLL_BYPASS | \ + OBD_CONNECT2_MDLL | \ + OBD_CONNECT2_MDLL_AUTO_REFRESH | \ + OBD_CONNECT2_CRUSH | \ + OBD_CONNECT2_ENCRYPT | \ + OBD_CONNECT2_GETATTR_PFID |\ + OBD_CONNECT2_LSEEK | OBD_CONNECT2_DOM_LVB |\ + OBD_CONNECT2_REP_MBITS | \ + OBD_CONNECT2_ATOMIC_OPEN_LOCK | \ + OBD_CONNECT2_ENCRYPT_NAME) + +#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ + OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ + OBD_CONNECT_INDEX | \ + OBD_CONNECT_BRW_SIZE | OBD_CONNECT_CANCELSET | \ + OBD_CONNECT_AT | LRU_RESIZE_CONNECT_FLAG | \ + OBD_CONNECT_CKSUM | OBD_CONNECT_VBR | \ + OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \ + OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 |\ + OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \ + OBD_CONNECT_MAX_EASIZE | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\ + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \ + OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | \ + OBD_CONNECT_BULK_MBITS | \ + OBD_CONNECT_GRANT_PARAM | \ + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2) + +#define OST_CONNECT_SUPPORTED2 (OBD_CONNECT2_LOCKAHEAD | OBD_CONNECT2_INC_XID |\ + OBD_CONNECT2_ENCRYPT | OBD_CONNECT2_LSEEK |\ + OBD_CONNECT2_REP_MBITS) + +#define ECHO_CONNECT_SUPPORTED (OBD_CONNECT_FID | OBD_CONNECT_FLAGS2) +#define ECHO_CONNECT_SUPPORTED2 OBD_CONNECT2_REP_MBITS + +#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \ + 
OBD_CONNECT_PINGLESS |\ + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_BARRIER | \ + OBD_CONNECT_FLAGS2) + +#define MGS_CONNECT_SUPPORTED2 OBD_CONNECT2_REP_MBITS + +/* Features required for this version of the client to work with server */ +#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_FID | \ + OBD_CONNECT_ATTRFID | \ + OBD_CONNECT_FULL20) + +/* This structure is used for both request and reply. + * + * If we eventually have separate connect data for different types, which we + * almost certainly will, then perhaps we stick a union in here. */ +struct obd_connect_data { + __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */ + __u32 ocd_version; /* lustre release version number */ + __u32 ocd_grant; /* initial cache grant amount (bytes) */ + __u32 ocd_index; /* LOV index to connect to */ + __u32 ocd_brw_size; /* Maximum BRW size in bytes */ + __u64 ocd_ibits_known; /* inode bits this client understands */ + __u8 ocd_grant_blkbits; /* log2 of the backend filesystem blocksize */ + __u8 ocd_grant_inobits; /* log2 of the per-inode space consumption */ + __u16 ocd_grant_tax_kb; /* extent insertion overhead, in 1K blocks */ + __u32 ocd_grant_max_blks;/* maximum number of blocks per extent */ + __u64 ocd_transno; /* first transno from client to be replayed */ + __u32 ocd_group; /* MDS group on OST */ + __u32 ocd_cksum_types; /* supported checksum algorithms */ + __u32 ocd_max_easize; /* How big LOV EA can be on MDS */ + __u32 ocd_instance; /* instance # of this target */ + __u64 ocd_maxbytes; /* Maximum stripe size in bytes */ + /* Fields after ocd_maxbytes are only accessible by the receiver + * if the corresponding flag in ocd_connect_flags is set. Accessing + * any field after ocd_maxbytes on the receiver without a valid flag + * may result in out-of-bound memory access and kernel oops. */ + __u16 ocd_maxmodrpcs; /* Maximum modify RPCs in parallel */ + __u16 padding0; /* added 2.1.0. also fix lustre_swab_connect */ + __u32 padding1; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 ocd_connect_flags2; + __u64 padding3; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding4; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding5; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding6; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding7; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding8; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding9; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingA; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingB; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingC; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingD; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingE; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingF; /* added 2.1.0. also fix lustre_swab_connect */ +}; +/* XXX README XXX: + * Please DO NOT use any fields here before first ensuring that this same + * field is not in use on some other branch. Please clear any such changes + * with senior engineers before starting to use a new field. Then, submit + * a small patch against EVERY branch that ONLY adds the new field along with + * the matching OBD_CONNECT flag, so that can be approved and landed easily to + * reserve the flag for future use. */ + +/* + * Supported checksum algorithms. Up to 32 checksum types are supported. 
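The comment inside struct obd_connect_data above carries the important usage rule: fields past ocd_maxbytes may only be read when the matching bit is set in ocd_connect_flags, and ocd_connect_flags2 in particular is only meaningful when OBD_CONNECT_FLAGS2 is set. A hedged sketch of that check, modelled on OCD_HAS_FLAG (editorial illustration only; the ex_ names are not part of the patch, while the two flag values are the ones defined above):

#include <stdint.h>
#include <stdio.h>

#define EX_CONNECT_FLAGS2       0x8000000000000000ULL   /* OBD_CONNECT_FLAGS2 */
#define EX_CONNECT2_REP_MBITS   0x100000ULL             /* OBD_CONNECT2_REP_MBITS */

struct ex_connect_data {
        uint64_t ocd_connect_flags;
        uint64_t ocd_connect_flags2;    /* only valid if OBD_CONNECT_FLAGS2 is set */
};

/* Read ocd_connect_flags2 only when the peer advertised OBD_CONNECT_FLAGS2,
 * as required by the comment after ocd_maxbytes. */
static int ex_peer_supports_rep_mbits(const struct ex_connect_data *ocd)
{
        if (!(ocd->ocd_connect_flags & EX_CONNECT_FLAGS2))
                return 0;
        return !!(ocd->ocd_connect_flags2 & EX_CONNECT2_REP_MBITS);
}

int main(void)
{
        struct ex_connect_data ocd = {
                .ocd_connect_flags  = EX_CONNECT_FLAGS2,
                .ocd_connect_flags2 = EX_CONNECT2_REP_MBITS,
        };

        printf("rep_mbits supported: %d\n", ex_peer_supports_rep_mbits(&ocd));
        return 0;
}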
+ * (32-bit mask stored in obd_connect_data::ocd_cksum_types) + * Please update DECLARE_CKSUM_NAME in obd_cksum.h when adding a new + * algorithm and also the OBD_FL_CKSUM* flags, OBD_CKSUM_ALL flag, + * OBD_FL_CKSUM_ALL flag and potentially OBD_CKSUM_T10_ALL flag. + */ +enum cksum_types { + OBD_CKSUM_CRC32 = 0x00000001, + OBD_CKSUM_ADLER = 0x00000002, + OBD_CKSUM_CRC32C = 0x00000004, + OBD_CKSUM_RESERVED = 0x00000008, + OBD_CKSUM_T10IP512 = 0x00000010, + OBD_CKSUM_T10IP4K = 0x00000020, + OBD_CKSUM_T10CRC512 = 0x00000040, + OBD_CKSUM_T10CRC4K = 0x00000080, +}; + +#define OBD_CKSUM_T10_ALL (OBD_CKSUM_T10IP512 | OBD_CKSUM_T10IP4K | \ + OBD_CKSUM_T10CRC512 | OBD_CKSUM_T10CRC4K) + +#define OBD_CKSUM_ALL (OBD_CKSUM_CRC32 | OBD_CKSUM_ADLER | OBD_CKSUM_CRC32C | \ + OBD_CKSUM_T10_ALL) + +/* + * The default checksum algorithm used on top of T10PI GRD tags for RPC. + * Considering that the checksum-of-checksums is only computing CRC32 on a + * 4KB chunk of GRD tags for a 1MB RPC for 512B sectors, or 16KB of GRD + * tags for 16MB of 4KB sectors, this is only 1/256 or 1/1024 of the + * total data being checksummed, so the checksum type used here should not + * affect overall system performance noticeably. + */ +#define OBD_CKSUM_T10_TOP OBD_CKSUM_ADLER + +/* + * OST requests: OBDO & OBD request records + */ + +/* opcodes */ +enum ost_cmd { + OST_REPLY = 0, /* reply ? */ + OST_GETATTR = 1, + OST_SETATTR = 2, + OST_READ = 3, + OST_WRITE = 4, + OST_CREATE = 5, + OST_DESTROY = 6, + OST_GET_INFO = 7, + OST_CONNECT = 8, + OST_DISCONNECT = 9, + OST_PUNCH = 10, + OST_OPEN = 11, + OST_CLOSE = 12, + OST_STATFS = 13, + OST_SYNC = 16, + OST_SET_INFO = 17, + OST_QUOTACHECK = 18, /* not used since 2.4 */ + OST_QUOTACTL = 19, + OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */ + OST_LADVISE = 21, + OST_FALLOCATE = 22, + OST_SEEK = 23, + OST_LAST_OPC /* must be < 33 to avoid MDS_GETATTR */ +}; +#define OST_FIRST_OPC OST_REPLY + +enum obdo_flags { + OBD_FL_INLINEDATA = 0x00000001, + OBD_FL_OBDMDEXISTS = 0x00000002, + OBD_FL_DELORPHAN = 0x00000004, /* if set in o_flags delete orphans */ + OBD_FL_NORPC = 0x00000008, /* set in o_flags do in OSC not OST */ + OBD_FL_IDONLY = 0x00000010, /* set in o_flags only adjust obj id*/ + OBD_FL_RECREATE_OBJS= 0x00000020, /* recreate missing obj */ + OBD_FL_DEBUG_CHECK = 0x00000040, /* echo client/server debug check */ + OBD_FL_NO_PRJQUOTA = 0x00000080, /* the object's project is over + * quota */ + OBD_FL_NO_USRQUOTA = 0x00000100, /* the object's owner is over quota */ + OBD_FL_NO_GRPQUOTA = 0x00000200, /* the object's group is over quota */ + OBD_FL_CREATE_CROW = 0x00000400, /* object should be create on write */ + OBD_FL_SRVLOCK = 0x00000800, /* delegate DLM locking to server */ + OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */ + OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */ + OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */ + OBD_FL_CKSUM_T10IP512 = 0x00005000, /* T10PI IP cksum, 512B sector */ + OBD_FL_CKSUM_T10IP4K = 0x00006000, /* T10PI IP cksum, 4KB sector */ + OBD_FL_CKSUM_T10CRC512 = 0x00007000, /* T10PI CRC cksum, 512B sector */ + OBD_FL_CKSUM_T10CRC4K = 0x00008000, /* T10PI CRC cksum, 4KB sector */ + OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ + OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ + OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client. 
+ * XXX: obsoleted - reserved for old + * clients prior than 2.2 */ + OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */ + OBD_FL_NOSPC_BLK = 0x00100000, /* no more block space on OST */ + OBD_FL_FLUSH = 0x00200000, /* flush pages on the OST */ + OBD_FL_SHORT_IO = 0x00400000, /* short io request */ + OBD_FL_ROOT_SQUASH = 0x00800000, /* root squash */ + /* OBD_FL_LOCAL_MASK = 0xF0000000, was local-only flags until 2.10 */ + + /* + * Note that while the original checksum values were separate bits, + * in 2.x we can actually allow all values from 1-31. T10-PI checksum + * types already use values which are not separate bits. + */ + OBD_FL_CKSUM_ALL = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER | + OBD_FL_CKSUM_CRC32C | OBD_FL_CKSUM_T10IP512 | + OBD_FL_CKSUM_T10IP4K | OBD_FL_CKSUM_T10CRC512 | + OBD_FL_CKSUM_T10CRC4K, + + OBD_FL_NO_QUOTA_ALL = OBD_FL_NO_USRQUOTA | OBD_FL_NO_GRPQUOTA | + OBD_FL_NO_PRJQUOTA, +}; + +/* + * All LOV EA magics should have the same postfix, if some new version + * Lustre instroduces new LOV EA magic, then when down-grade to an old + * Lustre, even though the old version system does not recognizes such + * new magic, it still can distinguish the corrupted cases by checking + * the magic's postfix. + */ +#define LOV_MAGIC_MAGIC 0x0BD0 +#define LOV_MAGIC_MASK 0xFFFF + +#define LOV_MAGIC_V1 (0x0BD10000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC_JOIN_V1 (0x0BD20000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC_V3 (0x0BD30000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC_MIGRATE (0x0BD40000 | LOV_MAGIC_MAGIC) +/* reserved for specifying OSTs */ +#define LOV_MAGIC_SPECIFIC (0x0BD50000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC LOV_MAGIC_V1 +#define LOV_MAGIC_COMP_V1 (0x0BD60000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC_FOREIGN (0x0BD70000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC_SEL (0x0BD80000 | LOV_MAGIC_MAGIC) + +/* + * magic for fully defined striping + * the idea is that we should have different magics for striping "hints" + * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct + * lov_mds_md_v[13]). at the moment the magics are used in wire protocol, + * we can't just change it w/o long way preparation, but we still need a + * mechanism to allow LOD to differentiate hint versus ready striping. + * so, at the moment we do a trick: MDT knows what to expect from request + * depending on the case (replay uses ready striping, non-replay req uses + * hints), so MDT replaces magic with appropriate one and now LOD can + * easily understand what's inside -bzzz + * + * those *_DEF magics are only used on server side internally, they + * won't be put on wire or disk. 
+ */ +#define LOV_MAGIC_DEFINED 0x10000000 +#define LOV_MAGIC_V1_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_V1) +#define LOV_MAGIC_V3_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_V3) +#define LOV_MAGIC_COMP_V1_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1) + +#define lov_pattern(pattern) (pattern & ~LOV_PATTERN_F_MASK) +#define lov_pattern_flags(pattern) (pattern & LOV_PATTERN_F_MASK) + +#define lov_ost_data lov_ost_data_v1 +struct lov_ost_data_v1 { /* per-stripe data structure (little-endian)*/ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this l_ost_idx */ + __u32 l_ost_idx; /* OST index in LOV (lov_tgt_desc->tgts) */ +}; + +#define lov_mds_md lov_mds_md_v1 +struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +#define MAX_MD_SIZE_OLD (sizeof(struct lov_mds_md) + \ + 4 * sizeof(struct lov_ost_data)) +#define MAX_MD_SIZE (sizeof(struct lov_comp_md_v1) + \ + 4 * (sizeof(struct lov_comp_md_entry_v1) + \ + MAX_MD_SIZE_OLD)) +#define MIN_MD_SIZE (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data)) + +/* This is the default MDT reply size allocated, should the striping be bigger, + * it will be reallocated in mdt_fix_reply. + * 100 stripes is a bit less than 2.5k of data */ +#define DEF_REP_MD_SIZE (sizeof(struct lov_mds_md) + \ + 100 * sizeof(struct lov_ost_data)) + +#define XATTR_NAME_ACL_ACCESS "system.posix_acl_access" +#define XATTR_NAME_ACL_DEFAULT "system.posix_acl_default" +#define XATTR_USER_PREFIX "user." +#define XATTR_TRUSTED_PREFIX "trusted." +#define XATTR_SECURITY_PREFIX "security." +#define XATTR_ENCRYPTION_PREFIX "encryption." + +#define XATTR_NAME_SOM "trusted.som" +#define XATTR_NAME_LOV "trusted.lov" +#define XATTR_NAME_LMA "trusted.lma" +#define XATTR_NAME_LMV "trusted.lmv" +#define XATTR_NAME_DEFAULT_LMV "trusted.dmv" +#define XATTR_NAME_LINK "trusted.link" +#define XATTR_NAME_FID "trusted.fid" +#define XATTR_NAME_VERSION "trusted.version" +#define XATTR_NAME_SOM "trusted.som" +#define XATTR_NAME_HSM "trusted.hsm" +#define XATTR_NAME_LFSCK_BITMAP "trusted.lfsck_bitmap" +#define XATTR_NAME_DUMMY "trusted.dummy" +#define XATTR_NAME_PROJID "trusted.projid" + +#define LL_XATTR_NAME_ENCRYPTION_CONTEXT_OLD XATTR_SECURITY_PREFIX"c" +#define LL_XATTR_NAME_ENCRYPTION_CONTEXT XATTR_ENCRYPTION_PREFIX"c" + +#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_ns" +#define XATTR_NAME_MAX_LEN 32 /* increase this, if there is longer name. 
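Looking back at DEF_REP_MD_SIZE above, the "100 stripes is a bit less than 2.5k of data" remark follows directly from the layouts: lov_mds_md_v1 is 32 bytes and each lov_ost_data_v1 stripe entry is 24 bytes, assuming the usual 16-byte struct ost_id. A small standalone check of that arithmetic (editorial sketch, not part of the patch; the sizes are written out as constants rather than taken from the real structs):

#include <assert.h>

#define EX_LOV_MDS_MD_V1_SIZE   32      /* magic, pattern, 16-byte ost_id, stripe_size, count, gen */
#define EX_LOV_OST_DATA_V1_SIZE 24      /* 16-byte ost_id + gen + idx per stripe */

int main(void)
{
        /* DEF_REP_MD_SIZE: one lov_mds_md_v1 header plus 100 stripe entries. */
        int def_rep_md_size = EX_LOV_MDS_MD_V1_SIZE + 100 * EX_LOV_OST_DATA_V1_SIZE;

        assert(def_rep_md_size == 2432);        /* "a bit less than 2.5k of data" */
        return 0;
}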
*/ + +struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* must be 32bit aligned */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (stripes == (__u16)-1) + stripes = 0; + + if (lmm_magic == LOV_MAGIC_V3) + return sizeof(struct lov_mds_md_v3) + + stripes * sizeof(struct lov_ost_data_v1); + else + return sizeof(struct lov_mds_md_v1) + + stripes * sizeof(struct lov_ost_data_v1); +} + +static inline __u32 +lov_mds_md_max_stripe_count(__kernel_size_t buf_size, __u32 lmm_magic) +{ + switch (lmm_magic) { + case LOV_MAGIC_V1: { + struct lov_mds_md_v1 lmm; + + if (buf_size < sizeof(lmm)) + return 0; + + return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]); + } + case LOV_MAGIC_V3: { + struct lov_mds_md_v3 lmm; + + if (buf_size < sizeof(lmm)) + return 0; + + return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]); + } + default: + return 0; + } +} + +#define OBD_MD_FLID (0x00000001ULL) /* object ID */ +#define OBD_MD_FLATIME (0x00000002ULL) /* access time */ +#define OBD_MD_FLMTIME (0x00000004ULL) /* data modification time */ +#define OBD_MD_FLCTIME (0x00000008ULL) /* change time */ +#define OBD_MD_FLSIZE (0x00000010ULL) /* size */ +#define OBD_MD_FLBLOCKS (0x00000020ULL) /* allocated blocks count */ +#define OBD_MD_FLBLKSZ (0x00000040ULL) /* block size */ +#define OBD_MD_FLMODE (0x00000080ULL) /* access bits (mode & ~S_IFMT) */ +#define OBD_MD_FLTYPE (0x00000100ULL) /* object type (mode & S_IFMT) */ +#define OBD_MD_FLUID (0x00000200ULL) /* user ID */ +#define OBD_MD_FLGID (0x00000400ULL) /* group ID */ +#define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */ +#define OBD_MD_DOM_SIZE (0X00001000ULL) /* Data-on-MDT component size */ +#define OBD_MD_FLNLINK (0x00002000ULL) /* link count */ +#define OBD_MD_FLPARENT (0x00004000ULL) /* parent FID */ +#define OBD_MD_LAYOUT_VERSION (0x00008000ULL) /* OST object layout version */ +#define OBD_MD_FLRDEV (0x00010000ULL) /* device number */ +#define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */ +#define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */ +#define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */ +#define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */ +/* OBD_MD_FLQOS (0x00200000ULL) has never been used */ +/* OBD_MD_FLCOOKIE (0x00800000ULL) obsolete in 2.8 */ +#define OBD_MD_FLPRJQUOTA (0x00400000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLGROUP (0x01000000ULL) /* group */ +#define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */ +/* OBD_MD_FLEPOCH (0x04000000ULL) obsolete 2.7.50 */ + /* ->mds if epoch opens or closes */ +#define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */ +#define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */ +#define OBD_MD_FLUSRQUOTA (0x20000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLGRPQUOTA (0x40000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */ + +#define OBD_MD_MDS 
(0x0000000100000000ULL) /* where an inode lives on */ +/* OBD_MD_REINT (0x0000000200000000ULL) obsolete 1.8 */ +#define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */ +#define OBD_MD_TSTATE (0x0000000800000000ULL) /* transient state field */ + +#define OBD_MD_FLXATTR (0x0000001000000000ULL) /* xattr */ +#define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list */ +#define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */ +#define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */ +#define OBD_MD_FLAGSTATFS (0x0000010000000000ULL) /* aggregated statfs */ +/* OBD_MD_FLMDSCAPA (0x0000020000000000ULL) obsolete 2.7.54 */ +/* OBD_MD_FLOSSCAPA (0x0000040000000000ULL) obsolete 2.7.54 */ +/* OBD_MD_FLCKSPLIT (0x0000080000000000ULL) obsolete 2.3.58*/ +#define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */ +#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes + * under lock; for xattr + * requests means the + * client holds the lock */ +#define OBD_MD_FLOBJCOUNT (0x0000400000000000ULL) /* for multiple destroy */ + +#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */ +#define OBD_MD_CLOSE_INTENT_EXECED (0x0020000000000000ULL) /* close intent + executed */ + +#define OBD_MD_DEFAULT_MEA (0x0040000000000000ULL) /* default MEA */ +#define OBD_MD_FLOSTLAYOUT (0x0080000000000000ULL) /* contain ost_layout */ +#define OBD_MD_FLPROJID (0x0100000000000000ULL) /* project ID */ +#define OBD_MD_SECCTX (0x0200000000000000ULL) /* embed security xattr */ +#define OBD_MD_FLLAZYSIZE (0x0400000000000000ULL) /* Lazy size */ +#define OBD_MD_FLLAZYBLOCKS (0x0800000000000000ULL) /* Lazy blocks */ +#define OBD_MD_FLBTIME (0x1000000000000000ULL) /* birth time */ +#define OBD_MD_ENCCTX (0x2000000000000000ULL) /* embed encryption ctx */ +#define OBD_MD_NAMEHASH (0x4000000000000000ULL) /* use hash instead of name + * in case of encryption + */ + +#define OBD_MD_FLALLQUOTA (OBD_MD_FLUSRQUOTA | \ + OBD_MD_FLGRPQUOTA | \ + OBD_MD_FLPRJQUOTA) + +#define OBD_MD_FLGETATTR (OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | \ + OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \ + OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \ + OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \ + OBD_MD_FLPARENT | OBD_MD_FLRDEV | OBD_MD_FLGROUP | \ + OBD_MD_FLPROJID | OBD_MD_FLBTIME) + +#define OBD_MD_FLXATTRALL (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS) + +/* don't forget obdo_fid which is way down at the bottom so it can + * come after the definition of llog_cookie */ + +enum hss_valid { + HSS_SETMASK = 0x01, + HSS_CLEARMASK = 0x02, + HSS_ARCHIVE_ID = 0x04, +}; + +struct hsm_state_set { + __u32 hss_valid; + __u32 hss_archive_id; + __u64 hss_setmask; + __u64 hss_clearmask; +}; + +/* ost_body.data values for OST_BRW */ + +#define OBD_BRW_READ 0x01 +#define OBD_BRW_WRITE 0x02 +#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE) +#define OBD_BRW_NDELAY 0x04 /* Non-delay RPC should be issued for + * this page. Non-delay RPCs have bit + * rq_no_delay set. */ +#define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous + * transfer and is not accounted in + * the grant. 
*/ +#define OBD_BRW_CHECK 0x10 +#define OBD_BRW_FROM_GRANT 0x20 /* the osc manages this under llite */ +#define OBD_BRW_GRANTED 0x40 /* the ost manages this */ +/* OBD_BRW_NOCACHE is currently neither set nor tested */ +#define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */ +#define OBD_BRW_NOQUOTA 0x100 /* do not enforce quota */ +#define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */ +#define OBD_BRW_ASYNC 0x400 /* Server may delay commit to disk */ +#define OBD_BRW_MEMALLOC 0x800 /* Client runs in the "kswapd" context */ +#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */ +#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */ +#define OBD_BRW_SOFT_SYNC 0x4000 /* This flag notifies the server + * that the client is running low on + * space for unstable pages; asking + * it to sync quickly */ +#define OBD_BRW_OVER_PRJQUOTA 0x8000 /* Running out of project quota */ +#define OBD_BRW_RDMA_ONLY 0x20000 /* RPC contains RDMA-only pages*/ +#define OBD_BRW_SYS_RESOURCE 0x40000 /* page has CAP_SYS_RESOURCE */ + +#define OBD_BRW_OVER_ALLQUOTA (OBD_BRW_OVER_USRQUOTA | \ + OBD_BRW_OVER_GRPQUOTA | \ + OBD_BRW_OVER_PRJQUOTA) + +#define OBD_BRW_DONE 0x40000000UL /* + * osd-ldiskfs inernal, + * IO has been issued before + */ +#define OBD_BRW_LOCAL1 0x80000000UL /* + * osd-ldiskfs internal, + * page mapped to real block + */ + +#define OBD_BRW_LOCALS (OBD_BRW_LOCAL1 | OBD_BRW_DONE) + +#define OBD_MAX_GRANT 0x7fffffffUL /* Max grant allowed to one client: 2 GiB */ + +#define OBD_OBJECT_EOF LUSTRE_EOF + +#define OST_MIN_PRECREATE 32 +#define OST_MAX_PRECREATE 20000 + +struct obd_ioobj { + struct ost_id ioo_oid; /* object ID, if multi-obj BRW */ + __u32 ioo_max_brw; /* low 16 bits were o_mode before 2.4, + * now (PTLRPC_BULK_OPS_COUNT - 1) in + * high 16 bits in 2.4 and later */ + __u32 ioo_bufcnt; /* number of niobufs for this object */ +}; + +/* NOTE: IOOBJ_MAX_BRW_BITS defines the _offset_ of the max_brw field in + * ioo_max_brw, NOT the maximum number of bits in PTLRPC_BULK_OPS_BITS. + * That said, ioo_max_brw is a 32-bit field so the limit is also 16 bits. */ +#define IOOBJ_MAX_BRW_BITS 16 +#define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1) +#define ioobj_max_brw_set(ioo, num) \ +do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0) + +/* multiple of 8 bytes => can array */ +struct niobuf_remote { + __u64 rnb_offset; + __u32 rnb_len; + __u32 rnb_flags; +}; + +/* lock value block communicated between the filter and llite */ + +/* OST_LVB_ERR_INIT is needed because the return code in rc is + * negative, i.e. because ((MASK + rc) & MASK) != MASK. 
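+ *
+ * Usage sketch with the helpers defined below (illustrative only; "lvb"
+ * stands for a struct ost_lvb the caller already has):
+ *
+ *	OST_LVB_SET_ERR(lvb->lvb_blocks, rc);	     server encodes negative rc
+ *	if (OST_LVB_IS_ERR(lvb->lvb_blocks))	     client detects it ...
+ *		rc = OST_LVB_GET_ERR(lvb->lvb_blocks);  ... and decodes rc back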
*/ +#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL +#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL +#define OST_LVB_IS_ERR(blocks) \ + ((blocks & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK) +#define OST_LVB_SET_ERR(blocks, rc) \ + do { blocks = OST_LVB_ERR_INIT + rc; } while (0) +#define OST_LVB_GET_ERR(blocks) (int)(blocks - OST_LVB_ERR_INIT) + +struct ost_lvb_v1 { + __u64 lvb_size; + __s64 lvb_mtime; + __s64 lvb_atime; + __s64 lvb_ctime; + __u64 lvb_blocks; +}; + +struct ost_lvb { + __u64 lvb_size; + __s64 lvb_mtime; + __s64 lvb_atime; + __s64 lvb_ctime; + __u64 lvb_blocks; + __u32 lvb_mtime_ns; + __u32 lvb_atime_ns; + __u32 lvb_ctime_ns; + __u32 lvb_padding; +}; + +/* + * lquota data structures + */ + +/* The lquota_id structure is an union of all the possible identifier types that + * can be used with quota, this includes: + * - 64-bit user ID + * - 64-bit group ID + * - a FID which can be used for per-directory quota in the future */ +union lquota_id { + struct lu_fid qid_fid; /* FID for per-directory quota */ + __u64 qid_uid; /* user identifier */ + __u64 qid_gid; /* group identifier */ + __u64 qid_projid; /* project identifier */ +}; + +/* quotactl management */ +struct obd_quotactl { + __u32 qc_cmd; + __u32 qc_type; /* see Q_* flag below */ + __u32 qc_id; + __u32 qc_stat; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char qc_poolname[]; +}; + +#define Q_COPY(out, in, member) (out)->member = (in)->member + +/* NOTE: + * - in and out maybe a type of struct if_quotactl or struct obd_quotactl + * - in and out need not be of the same type. + */ +#define __QCTL_COPY(out, in, need_pname) \ +do { \ + Q_COPY(out, in, qc_cmd); \ + Q_COPY(out, in, qc_type); \ + Q_COPY(out, in, qc_id); \ + Q_COPY(out, in, qc_stat); \ + Q_COPY(out, in, qc_dqinfo); \ + Q_COPY(out, in, qc_dqblk); \ + if (need_pname && LUSTRE_Q_CMD_IS_POOL(in->qc_cmd)) { \ + size_t len = strnlen(in->qc_poolname, LOV_MAXPOOLNAME); \ + \ + memcpy(out->qc_poolname, in->qc_poolname, len); \ + out->qc_poolname[len] = '\0'; \ + } \ +} while (0) + +#define QCTL_COPY(out, in) __QCTL_COPY(out, in, true) +#define QCTL_COPY_NO_PNAME(out, in) __QCTL_COPY(out, in, false) + +/* Body of quota request used for quota acquire/release RPCs between quota + * master (aka QMT) and slaves (ak QSD). */ +struct quota_body { + struct lu_fid qb_fid; /* FID of global index packing the pool ID + * and type (data or metadata) as well as + * the quota type (user or group). */ + union lquota_id qb_id; /* uid or gid or directory FID */ + __u32 qb_flags; /* see below */ + __u32 qb_padding; + __u64 qb_count; /* acquire/release count (kbytes/inodes) */ + __u64 qb_usage; /* current slave usage (kbytes/inodes) */ + __u64 qb_slv_ver; /* slave index file version */ + struct lustre_handle qb_lockh; /* per-ID lock handle */ + struct lustre_handle qb_glb_lockh; /* global lock handle */ + __u64 qb_padding1[4]; +}; + +/* When the quota_body is used in the reply of quota global intent + * lock (IT_QUOTA_CONN) reply, qb_fid contains slave index file FID. 
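+ * The qb_slv_fid alias defined just below exists for exactly this case.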
*/ +#define qb_slv_fid qb_fid +/* qb_usage is the current qunit (in kbytes/inodes) when quota_body is used in + * quota reply */ +#define qb_qunit qb_usage + +#define QUOTA_DQACQ_FL_ACQ 0x1 /* acquire quota */ +#define QUOTA_DQACQ_FL_PREACQ 0x2 /* pre-acquire */ +#define QUOTA_DQACQ_FL_REL 0x4 /* release quota */ +#define QUOTA_DQACQ_FL_REPORT 0x8 /* report usage */ + +/* Quota types currently supported */ +enum { + LQUOTA_TYPE_USR = 0x00, /* maps to USRQUOTA */ + LQUOTA_TYPE_GRP = 0x01, /* maps to GRPQUOTA */ + LQUOTA_TYPE_PRJ = 0x02, /* maps to PRJQUOTA */ + LQUOTA_TYPE_MAX +}; + +/* There are 2 different resource types on which a quota limit can be enforced: + * - inodes on the MDTs + * - blocks on the OSTs */ +enum { + LQUOTA_RES_MD = 0x01, /* skip 0 to avoid null oid in FID */ + LQUOTA_RES_DT = 0x02, + LQUOTA_LAST_RES, + LQUOTA_FIRST_RES = LQUOTA_RES_MD +}; +#define LQUOTA_NR_RES (LQUOTA_LAST_RES - LQUOTA_FIRST_RES + 1) + +/* + * Space accounting support + * Format of an accounting record, providing disk usage information for a given + * user or group + */ +struct lquota_acct_rec { /* 16 bytes */ + __u64 bspace; /* current space in use */ + __u64 ispace; /* current # inodes in use */ +}; + +/* + * Global quota index support + * Format of a global record, providing global quota settings for a given quota + * identifier + */ +struct lquota_glb_rec { /* 32 bytes */ + __u64 qbr_hardlimit; /* quota hard limit, in #inodes or kbytes */ + __u64 qbr_softlimit; /* quota soft limit, in #inodes or kbytes */ + __u64 qbr_time; /* grace time, in seconds */ + __u64 qbr_granted; /* how much is granted to slaves, in #inodes or + * kbytes */ +}; + +/* + * Slave index support + * Format of a slave record, recording how much space is granted to a given + * slave + */ +struct lquota_slv_rec { /* 8 bytes */ + __u64 qsr_granted; /* space granted to the slave for the key=ID, + * in #inodes or kbytes */ +}; + +/* Data structures associated with the quota locks */ + +/* Glimpse descriptor used for the index & per-ID quota locks */ +struct ldlm_gl_lquota_desc { + union lquota_id gl_id; /* quota ID subject to the glimpse */ + __u64 gl_flags; /* see LQUOTA_FL* below */ + __u64 gl_ver; /* new index version */ + __u64 gl_hardlimit; /* new hardlimit or qunit value */ + __u64 gl_softlimit; /* new softlimit */ + __u64 gl_time; + __u64 gl_pad2; +}; +#define gl_qunit gl_hardlimit /* current qunit value used when + * glimpsing per-ID quota locks */ + +/* quota glimpse flags */ +#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */ + +/* LVB used with quota (global and per-ID) locks */ +struct lquota_lvb { + __u64 lvb_flags; /* see LQUOTA_FL* above */ + __u64 lvb_id_may_rel; /* space that might be released later */ + __u64 lvb_id_rel; /* space released by the slave for this ID */ + __u64 lvb_id_qunit; /* current qunit value */ + __u64 lvb_pad1; +}; + +/* LVB used with global quota lock */ +#define lvb_glb_ver lvb_id_may_rel /* current version of the global index */ + +/* op codes */ +enum quota_cmd { + QUOTA_DQACQ = 601, + QUOTA_DQREL = 602, + QUOTA_LAST_OPC +}; +#define QUOTA_FIRST_OPC QUOTA_DQACQ + +/* + * MDS REQ RECORDS + */ + +/* opcodes */ +enum mds_cmd { + MDS_GETATTR = 33, + MDS_GETATTR_NAME = 34, + MDS_CLOSE = 35, + MDS_REINT = 36, + MDS_READPAGE = 37, + MDS_CONNECT = 38, + MDS_DISCONNECT = 39, + MDS_GET_ROOT = 40, + MDS_STATFS = 41, + MDS_PIN = 42, /* obsolete, never used in a release */ + MDS_UNPIN = 43, /* obsolete, never used in a release */ + MDS_SYNC = 44, + MDS_DONE_WRITING = 45, /* 
obsolete since 2.8.0 */ + MDS_SET_INFO = 46, + MDS_QUOTACHECK = 47, /* not used since 2.4 */ + MDS_QUOTACTL = 48, + MDS_GETXATTR = 49, + MDS_SETXATTR = 50, /* obsolete, now it's MDS_REINT op */ + MDS_WRITEPAGE = 51, + MDS_IS_SUBDIR = 52, /* obsolete, never used in a release */ + MDS_GET_INFO = 53, + MDS_HSM_STATE_GET = 54, + MDS_HSM_STATE_SET = 55, + MDS_HSM_ACTION = 56, + MDS_HSM_PROGRESS = 57, + MDS_HSM_REQUEST = 58, + MDS_HSM_CT_REGISTER = 59, + MDS_HSM_CT_UNREGISTER = 60, + MDS_SWAP_LAYOUTS = 61, + MDS_RMFID = 62, + MDS_LAST_OPC +}; + +#define MDS_FIRST_OPC MDS_GETATTR + + +/* opcodes for object update */ +enum update_cmd { + OUT_UPDATE = 1000, + OUT_UPDATE_LAST_OPC +}; + +#define OUT_UPDATE_FIRST_OPC OUT_UPDATE + +/* + * Do not exceed 63 + */ + +enum mds_reint_op { + REINT_SETATTR = 1, + REINT_CREATE = 2, + REINT_LINK = 3, + REINT_UNLINK = 4, + REINT_RENAME = 5, + REINT_OPEN = 6, + REINT_SETXATTR = 7, + REINT_RMENTRY = 8, + REINT_MIGRATE = 9, + REINT_RESYNC = 10, + REINT_MAX +}; + +/* the disposition of the intent outlines what was executed */ +#define DISP_IT_EXECD 0x00000001 +#define DISP_LOOKUP_EXECD 0x00000002 +#define DISP_LOOKUP_NEG 0x00000004 +#define DISP_LOOKUP_POS 0x00000008 +#define DISP_OPEN_CREATE 0x00000010 +#define DISP_OPEN_OPEN 0x00000020 +#define DISP_ENQ_COMPLETE 0x00400000 /* obsolete and unused */ +#define DISP_ENQ_OPEN_REF 0x00800000 +#define DISP_ENQ_CREATE_REF 0x01000000 +#define DISP_OPEN_LOCK 0x02000000 +#define DISP_OPEN_LEASE 0x04000000 +#define DISP_OPEN_STRIPE 0x08000000 +#define DISP_OPEN_DENY 0x10000000 + +/* INODE LOCK PARTS */ +enum mds_ibits_locks { + MDS_INODELOCK_LOOKUP = 0x000001, /* For namespace, dentry etc. Was + * used to protect permission (mode, + * owner, group, etc) before 2.4. */ + MDS_INODELOCK_UPDATE = 0x000002, /* size, links, timestamps */ + MDS_INODELOCK_OPEN = 0x000004, /* For opened files */ + MDS_INODELOCK_LAYOUT = 0x000008, /* for layout */ + + /* The PERM bit is added in 2.4, and is used to protect permission + * (mode, owner, group, ACL, etc.) separate from LOOKUP lock. + * For remote directories (in DNE) these locks will be granted by + * different MDTs (different LDLM namespace). + * + * For local directory, the MDT always grants UPDATE|PERM together. + * For remote directory, master MDT (where remote directory is) grants + * UPDATE|PERM, and remote MDT (where name entry is) grants LOOKUP_LOCK. + */ + MDS_INODELOCK_PERM = 0x000010, + MDS_INODELOCK_XATTR = 0x000020, /* non-permission extended attrs */ + MDS_INODELOCK_DOM = 0x000040, /* Data for Data-on-MDT files */ + /* Do not forget to increase MDS_INODELOCK_NUMBITS when adding bits */ +}; +#define MDS_INODELOCK_NUMBITS 7 +/* This FULL lock is useful to take on unlink sort of operations */ +#define MDS_INODELOCK_FULL ((1 << MDS_INODELOCK_NUMBITS) - 1) +/* DOM lock shouldn't be canceled early, use this macro for ELC */ +#define MDS_INODELOCK_ELC (MDS_INODELOCK_FULL & ~MDS_INODELOCK_DOM) + +/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * name[2,3] fields that need to be used for the quota id (also a FID). 
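+ *
+ * Packing sketch (illustrative; fid_seq/fid_ver/fid_oid stand for the
+ * corresponding lu_fid fields), using the offsets defined below:
+ *
+ *	name[LUSTRE_RES_ID_SEQ_OFF]     = fid_seq
+ *	name[LUSTRE_RES_ID_VER_OID_OFF] = ((__u64)fid_ver << 32) | fid_oid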
*/ +enum { + LUSTRE_RES_ID_SEQ_OFF = 0, + LUSTRE_RES_ID_VER_OID_OFF = 1, + LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */ + LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2, + LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3, + LUSTRE_RES_ID_HSH_OFF = 3 +}; + +#define MDS_STATUS_CONN 1 +#define MDS_STATUS_LOV 2 + +enum { + /* these should be identical to their EXT4_*_FL counterparts, they are + * redefined here only to avoid dragging in fs/ext4/ext4.h */ + LUSTRE_SYNC_FL = 0x00000008, /* Synchronous updates */ + LUSTRE_IMMUTABLE_FL = 0x00000010, /* Immutable file */ + LUSTRE_APPEND_FL = 0x00000020, /* file writes may only append */ + LUSTRE_NODUMP_FL = 0x00000040, /* do not dump file */ + LUSTRE_NOATIME_FL = 0x00000080, /* do not update atime */ + LUSTRE_INDEX_FL = 0x00001000, /* hash-indexed directory */ + LUSTRE_DIRSYNC_FL = 0x00010000, /* dirsync behaviour (dir only) */ + LUSTRE_TOPDIR_FL = 0x00020000, /* Top of directory hierarchies*/ + LUSTRE_INLINE_DATA_FL = 0x10000000, /* Inode has inline data. */ + LUSTRE_PROJINHERIT_FL = 0x20000000, /* Create with parents projid */ + + /* These flags will not be identical to any EXT4_*_FL counterparts, + * and only reserved for lustre purpose. Note: these flags might + * be conflict with some of EXT4 flags, so + * 1. these conflict flags needs to be removed when the flag is + * wired by la_flags see osd_attr_get(). + * 2. If these flags needs to be stored into inode, they will be + * stored in LMA. see LMAI_XXXX */ + LUSTRE_ORPHAN_FL = 0x00002000, + LUSTRE_SET_SYNC_FL = 0x00040000, /* Synchronous setattr on OSTs */ + LUSTRE_ENCRYPT_FL = 0x00800000, /* encrypted file */ + + LUSTRE_LMA_FL_MASKS = LUSTRE_ENCRYPT_FL | LUSTRE_ORPHAN_FL, +}; + +#ifndef FS_XFLAG_SYNC +#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#endif +#ifndef FS_XFLAG_NOATIME +#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#endif +#ifndef FS_XFLAG_IMMUTABLE +#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#endif +#ifndef FS_XFLAG_APPEND +#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ +#endif +#ifndef FS_XFLAG_PROJINHERIT +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#endif + +/* 64 possible states */ +enum md_transient_state { + MS_RESTORE = (1 << 0), /* restore is running */ +}; + +struct mdt_body { + struct lu_fid mbo_fid1; + struct lu_fid mbo_fid2; + struct lustre_handle mbo_open_handle; + __u64 mbo_valid; + __u64 mbo_size; /* Offset, in the case of MDS_READPAGE */ + __s64 mbo_mtime; + __s64 mbo_atime; + __s64 mbo_ctime; + __u64 mbo_blocks; /* XID, in the case of MDS_READPAGE */ + __u64 mbo_version; /* was mbo_ioepoch before 2.11 */ + __u64 mbo_t_state; /* transient file state defined in + * enum md_transient_state + * was "ino" until 2.4.0 */ + __u32 mbo_fsuid; + __u32 mbo_fsgid; + __u32 mbo_capability; + __u32 mbo_mode; + __u32 mbo_uid; + __u32 mbo_gid; + __u32 mbo_flags; /* most replies: LUSTRE_*_FL file attributes, + * data_version: OBD_FL_* flags + */ + __u32 mbo_rdev; + __u32 mbo_nlink; /* #bytes to read in the case of MDS_READPAGE */ + __u32 mbo_layout_gen; /* was "generation" until 2.4.0 */ + __u32 mbo_suppgid; + __u32 mbo_eadatasize; + __u32 mbo_aclsize; + __u32 mbo_max_mdsize; + __u32 mbo_unused3; /* was max_cookiesize until 2.8 */ + __u32 mbo_uid_h; /* high 32-bits of uid, for FUID */ + __u32 mbo_gid_h; /* high 32-bits of gid, for FUID */ + __u32 mbo_projid; + __u64 mbo_dom_size; /* size of DOM component */ + __u64 mbo_dom_blocks; /* blocks consumed by DOM component */ + __u64 mbo_btime; + __u64 
mbo_padding_9; /* also fix lustre_swab_mdt_body */ + __u64 mbo_padding_10; +}; /* 216 */ + +struct mdt_ioepoch { + struct lustre_handle mio_open_handle; + __u64 mio_unused1; /* was ioepoch */ + __u32 mio_unused2; /* was flags */ + __u32 mio_padding; +}; + +/* permissions for md_perm.mp_perm */ +enum { + CFS_SETUID_PERM = 0x01, + CFS_SETGID_PERM = 0x02, + CFS_SETGRP_PERM = 0x04, +}; + +struct mdt_rec_setattr { + __u32 sa_opcode; + __u32 sa_cap; + __u32 sa_fsuid; + __u32 sa_fsuid_h; + __u32 sa_fsgid; + __u32 sa_fsgid_h; + __u32 sa_suppgid; + __u32 sa_suppgid_h; + __u32 sa_padding_1; + __u32 sa_padding_1_h; + struct lu_fid sa_fid; + __u64 sa_valid; + __u32 sa_uid; + __u32 sa_gid; + __u64 sa_size; + __u64 sa_blocks; + __s64 sa_mtime; + __s64 sa_atime; + __s64 sa_ctime; + __u32 sa_attr_flags; + __u32 sa_mode; + __u32 sa_bias; /* some operation flags */ + __u32 sa_projid; + __u32 sa_padding_4; + __u32 sa_padding_5; +}; + +/* + * Attribute flags used in mdt_rec_setattr::sa_valid. + * The kernel's #defines for ATTR_* should not be used over the network + * since the client and MDS may run different kernels (see bug 13828) + * Therefore, we should only use MDS_ATTR_* attributes for sa_valid. + */ +enum mds_attr_flags { + MDS_ATTR_MODE = 0x1ULL, /* = 1 */ + MDS_ATTR_UID = 0x2ULL, /* = 2 */ + MDS_ATTR_GID = 0x4ULL, /* = 4 */ + MDS_ATTR_SIZE = 0x8ULL, /* = 8 */ + MDS_ATTR_ATIME = 0x10ULL, /* = 16 */ + MDS_ATTR_MTIME = 0x20ULL, /* = 32 */ + MDS_ATTR_CTIME = 0x40ULL, /* = 64 */ + MDS_ATTR_ATIME_SET = 0x80ULL, /* = 128 */ + MDS_ATTR_MTIME_SET = 0x100ULL, /* = 256 */ + MDS_ATTR_FORCE = 0x200ULL, /* = 512, change it */ + MDS_ATTR_ATTR_FLAG = 0x400ULL, /* = 1024 */ + MDS_ATTR_KILL_SUID = 0x800ULL, /* = 2048 */ + MDS_ATTR_KILL_SGID = 0x1000ULL, /* = 4096 */ + MDS_ATTR_CTIME_SET = 0x2000ULL, /* = 8192 */ + MDS_ATTR_FROM_OPEN = 0x4000ULL, /* = 16384, from open O_TRUNC */ + MDS_ATTR_BLOCKS = 0x8000ULL, /* = 32768 */ + MDS_ATTR_PROJID = 0x10000ULL, /* = 65536 */ + MDS_ATTR_LSIZE = 0x20000ULL, /* = 131072 */ + MDS_ATTR_LBLOCKS = 0x40000ULL, /* = 262144 */ + MDS_ATTR_OVERRIDE = 0x2000000ULL, /* = 33554432 */ +}; + +enum mds_op_bias { +/* MDS_CHECK_SPLIT = 1 << 0, obsolete before 2.3.58 */ + /* used for remote object getattr/open by name: in the original + * getattr/open request, MDT found the object against name is on another + * MDT, then packed FID and LOOKUP lock in reply and returned -EREMOTE, + * and client knew it's a remote object, then set this flag in + * getattr/open request and sent to the corresponding MDT to finish + * getattr/open, which fetched attributes and UPDATE lock/opened file. 
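+ *
+ * These bias flags are carried in the sa_bias/cr_bias/lk_bias/ul_bias/
+ * rn_bias/rs_bias fields of the mdt_rec_* records in this file.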
+ */ + MDS_CROSS_REF = 1 << 1, +/* MDS_VTX_BYPASS = 1 << 2, obsolete since 2.3.54 */ + MDS_PERM_BYPASS = 1 << 3, +/* MDS_SOM = 1 << 4, obsolete since 2.8.0 */ + MDS_QUOTA_IGNORE = 1 << 5, +/* MDS_CLOSE_CLEANUP = 1 << 6, obsolete since 2.3.51 */ + MDS_KEEP_ORPHAN = 1 << 7, + MDS_RECOV_OPEN = 1 << 8, + MDS_DATA_MODIFIED = 1 << 9, + MDS_CREATE_VOLATILE = 1 << 10, + MDS_OWNEROVERRIDE = 1 << 11, + MDS_HSM_RELEASE = 1 << 12, + MDS_CLOSE_MIGRATE = 1 << 13, + MDS_CLOSE_LAYOUT_SWAP = 1 << 14, + MDS_CLOSE_LAYOUT_MERGE = 1 << 15, + MDS_CLOSE_RESYNC_DONE = 1 << 16, + MDS_CLOSE_LAYOUT_SPLIT = 1 << 17, + MDS_TRUNC_KEEP_LEASE = 1 << 18, + MDS_PCC_ATTACH = 1 << 19, + MDS_CLOSE_UPDATE_TIMES = 1 << 20, + /* setstripe create only, don't restripe if target exists */ + MDS_SETSTRIPE_CREATE = 1 << 21, + MDS_FID_OP = 1 << 22, + /* migrate dirent only */ + MDS_MIGRATE_NSONLY = 1 << 23, +}; + +#define MDS_CLOSE_INTENT (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP | \ + MDS_CLOSE_LAYOUT_MERGE | MDS_CLOSE_LAYOUT_SPLIT | \ + MDS_CLOSE_RESYNC_DONE) + +/* instance of mdt_reint_rec */ +struct mdt_rec_create { + __u32 cr_opcode; + __u32 cr_cap; + __u32 cr_fsuid; + __u32 cr_fsuid_h; + __u32 cr_fsgid; + __u32 cr_fsgid_h; + __u32 cr_suppgid1; + __u32 cr_suppgid1_h; + __u32 cr_suppgid2; + __u32 cr_suppgid2_h; + struct lu_fid cr_fid1; + struct lu_fid cr_fid2; + struct lustre_handle cr_open_handle_old; /* in case of open replay */ + __s64 cr_time; + union { + __u64 cr_rdev; + __u32 cr_archive_id; + }; + __u64 cr_ioepoch; + __u64 cr_padding_1; /* rr_blocks */ + __u32 cr_mode; + __u32 cr_bias; + /* use of helpers set/get_mrc_cr_flags() is needed to access + * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to + * extend cr_flags size without breaking 1.8 compat */ + __u32 cr_flags_l; /* for use with open, low 32 bits */ + __u32 cr_flags_h; /* for use with open, high 32 bits */ + __u32 cr_umask; /* umask for create */ + __u32 cr_padding_4; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_link { + __u32 lk_opcode; + __u32 lk_cap; + __u32 lk_fsuid; + __u32 lk_fsuid_h; + __u32 lk_fsgid; + __u32 lk_fsgid_h; + __u32 lk_suppgid1; + __u32 lk_suppgid1_h; + __u32 lk_suppgid2; + __u32 lk_suppgid2_h; + struct lu_fid lk_fid1; + struct lu_fid lk_fid2; + __s64 lk_time; + __u64 lk_padding_1; /* rr_atime */ + __u64 lk_padding_2; /* rr_ctime */ + __u64 lk_padding_3; /* rr_size */ + __u64 lk_padding_4; /* rr_blocks */ + __u32 lk_bias; + __u32 lk_padding_5; /* rr_mode */ + __u32 lk_padding_6; /* rr_flags */ + __u32 lk_padding_7; /* rr_padding_2 */ + __u32 lk_padding_8; /* rr_padding_3 */ + __u32 lk_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_unlink { + __u32 ul_opcode; + __u32 ul_cap; + __u32 ul_fsuid; + __u32 ul_fsuid_h; + __u32 ul_fsgid; + __u32 ul_fsgid_h; + __u32 ul_suppgid1; + __u32 ul_suppgid1_h; + __u32 ul_suppgid2; + __u32 ul_suppgid2_h; + struct lu_fid ul_fid1; + struct lu_fid ul_fid2; + __s64 ul_time; + __u64 ul_padding_2; /* rr_atime */ + __u64 ul_padding_3; /* rr_ctime */ + __u64 ul_padding_4; /* rr_size */ + __u64 ul_padding_5; /* rr_blocks */ + __u32 ul_bias; + __u32 ul_mode; + __u32 ul_padding_6; /* rr_flags */ + __u32 ul_padding_7; /* rr_padding_2 */ + __u32 ul_padding_8; /* rr_padding_3 */ + __u32 ul_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_rename { + __u32 rn_opcode; + __u32 rn_cap; + __u32 rn_fsuid; + __u32 rn_fsuid_h; + __u32 rn_fsgid; + __u32 rn_fsgid_h; + __u32 rn_suppgid1; + __u32 rn_suppgid1_h; + __u32 
rn_suppgid2; + __u32 rn_suppgid2_h; + struct lu_fid rn_fid1; + struct lu_fid rn_fid2; + __s64 rn_time; + __u64 rn_padding_1; /* rr_atime */ + __u64 rn_padding_2; /* rr_ctime */ + __u64 rn_padding_3; /* rr_size */ + __u64 rn_padding_4; /* rr_blocks */ + __u32 rn_bias; /* some operation flags */ + __u32 rn_mode; /* cross-ref rename has mode */ + __u32 rn_padding_5; /* rr_flags */ + __u32 rn_padding_6; /* rr_padding_2 */ + __u32 rn_padding_7; /* rr_padding_3 */ + __u32 rn_padding_8; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_setxattr { + __u32 sx_opcode; + __u32 sx_cap; + __u32 sx_fsuid; + __u32 sx_fsuid_h; + __u32 sx_fsgid; + __u32 sx_fsgid_h; + __u32 sx_suppgid1; + __u32 sx_suppgid1_h; + __u32 sx_suppgid2; + __u32 sx_suppgid2_h; + struct lu_fid sx_fid; + __u64 sx_padding_1; /* These three are rr_fid2 */ + __u32 sx_padding_2; + __u32 sx_padding_3; + __u64 sx_valid; + __s64 sx_time; + __u64 sx_padding_5; /* rr_ctime */ + __u64 sx_padding_6; /* rr_size */ + __u64 sx_padding_7; /* rr_blocks */ + __u32 sx_size; + __u32 sx_flags; + __u32 sx_padding_8; /* rr_flags */ + __u32 sx_padding_9; /* rr_padding_2 */ + __u32 sx_padding_10; /* rr_padding_3 */ + __u32 sx_padding_11; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec + * FLR: for file resync MDS_REINT_RESYNC RPC. */ +struct mdt_rec_resync { + __u32 rs_opcode; + __u32 rs_cap; + __u32 rs_fsuid; + __u32 rs_fsuid_h; + __u32 rs_fsgid; + __u32 rs_fsgid_h; + __u32 rs_suppgid1; + __u32 rs_suppgid1_h; + __u32 rs_suppgid2; + __u32 rs_suppgid2_h; + struct lu_fid rs_fid; + __u8 rs_padding0[sizeof(struct lu_fid)]; + struct lustre_handle rs_lease_handle; /* rr_mtime */ + __s64 rs_padding1; /* rr_atime */ + __s64 rs_padding2; /* rr_ctime */ + __u64 rs_padding3; /* rr_size */ + __u64 rs_padding4; /* rr_blocks */ + __u32 rs_bias; + __u32 rs_padding5; /* rr_mode */ + __u32 rs_padding6; /* rr_flags */ + __u32 rs_padding7; /* rr_flags_h */ + __u32 rs_padding8; /* rr_umask */ + __u16 rs_mirror_id; + __u16 rs_padding9; /* rr_padding_4 */ +}; + +/* + * mdt_rec_reint is the template for all mdt_reint_xxx structures. + * Do NOT change the size of various members, otherwise the value + * will be broken in lustre_swab_mdt_rec_reint(). + * + * If you add new members in other mdt_reint_xxx structres and need to use the + * rr_padding_x fields, then update lustre_swab_mdt_rec_reint() also. 
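+ *
+ * Every mdt_rec_* variant above shares this size and layout; the
+ * *_padding_* comments (e.g. "rr_atime", "rr_blocks") record which rr_
+ * field each padding slot overlays on the wire.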
+ */ +struct mdt_rec_reint { + __u32 rr_opcode; + __u32 rr_cap; + __u32 rr_fsuid; + __u32 rr_fsuid_h; + __u32 rr_fsgid; + __u32 rr_fsgid_h; + __u32 rr_suppgid1; + __u32 rr_suppgid1_h; + __u32 rr_suppgid2; + __u32 rr_suppgid2_h; + struct lu_fid rr_fid1; + struct lu_fid rr_fid2; + __s64 rr_mtime; + __s64 rr_atime; + __s64 rr_ctime; + __u64 rr_size; + __u64 rr_blocks; + __u32 rr_bias; + __u32 rr_mode; + __u32 rr_flags; + __u32 rr_flags_h; + __u32 rr_umask; + __u16 rr_mirror_id; + __u16 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ +}; + +#define LMV_DESC_QOS_MAXAGE_DEFAULT 60 /* Seconds */ + +/* lmv structures */ +struct lmv_desc { + __u32 ld_tgt_count; /* how many MDS's */ + __u32 ld_active_tgt_count; /* how many active */ + __u32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default hash pattern */ + __u64 ld_default_hash_size; + __u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_3; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_4; /* also fix lustre_swab_lmv_desc */ + struct obd_uuid ld_uuid; +}; + +/* LMV layout EA, and it will be stored both in master and slave object */ +struct lmv_mds_md_v1 { + __u32 lmv_magic; + __u32 lmv_stripe_count; + __u32 lmv_master_mdt_index; /* On master object, it is master + * MDT index, on slave object, it + * is stripe index of the slave obj */ + __u32 lmv_hash_type; /* dir stripe policy, i.e. indicate + * which hash function to be used, + * Note: only lower 16 bits is being + * used for now. Higher 16 bits will + * be used to mark the object status, + * for example migrating or dead. */ + __u32 lmv_layout_version; /* increased each time layout changed, + * by directory migration, restripe + * and LFSCK. */ + __u32 lmv_migrate_offset; /* once this is set, it means this + * directory is been migrated, stripes + * before this offset belong to target, + * from this to source. 
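+ * Directory split and merge reuse this same field through the
+ * lmv_split_offset/lmv_merge_offset aliases defined after this
+ * struct.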
*/ + __u32 lmv_migrate_hash; /* hash type of source stripes of + * migrating directory */ + __u32 lmv_padding2; + __u64 lmv_padding3; + char lmv_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ + struct lu_fid lmv_stripe_fids[0]; /* FIDs for each stripe */ +}; + +/* stripe count before directory split */ +#define lmv_split_offset lmv_migrate_offset +/* stripe count after directory merge */ +#define lmv_merge_offset lmv_migrate_offset +/* directory hash type after merge */ +#define lmv_merge_hash lmv_migrate_hash + +/* foreign LMV EA */ +struct lmv_foreign_md { + __u32 lfm_magic; /* magic number = LMV_MAGIC_FOREIGN */ + __u32 lfm_length; /* length of lfm_value */ + __u32 lfm_type; /* type, see LU_FOREIGN_TYPE_ */ + __u32 lfm_flags; /* flags, type specific */ + char lfm_value[]; /* free format value */ +}; + +#define LMV_MAGIC_V1 0x0CD20CD0 /* normal stripe lmv magic */ +#define LMV_MAGIC LMV_MAGIC_V1 + +/* #define LMV_USER_MAGIC 0x0CD30CD0 */ +#define LMV_MAGIC_STRIPE 0x0CD40CD0 /* magic for dir sub_stripe */ +#define LMV_MAGIC_FOREIGN 0x0CD50CD0 /* magic for lmv foreign */ + +/** + * The FNV-1a hash algorithm is as follows: + * hash = FNV_offset_basis + * for each octet_of_data to be hashed + * hash = hash XOR octet_of_data + * hash = hash × FNV_prime + * return hash + * http://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function#FNV-1a_hash + * + * http://www.isthe.com/chongo/tech/comp/fnv/index.html#FNV-reference-source + * FNV_prime is 2^40 + 2^8 + 0xb3 = 0x100000001b3ULL + **/ +#define LUSTRE_FNV_1A_64_PRIME 0x100000001b3ULL +#define LUSTRE_FNV_1A_64_OFFSET_BIAS 0xcbf29ce484222325ULL +static inline __u64 lustre_hash_fnv_1a_64(const void *buf, __kernel_size_t size) +{ + __u64 hash = LUSTRE_FNV_1A_64_OFFSET_BIAS; + const unsigned char *p = buf; + __kernel_size_t i; + + for (i = 0; i < size; i++) { + hash ^= p[i]; + hash *= LUSTRE_FNV_1A_64_PRIME; + } + + return hash; +} + +/* CRUSH placement group count */ +#define LMV_CRUSH_PG_COUNT 4096 + +union lmv_mds_md { + __u32 lmv_magic; + struct lmv_mds_md_v1 lmv_md_v1; + struct lmv_user_md lmv_user_md; + struct lmv_foreign_md lmv_foreign_md; +}; + +static inline __kernel_ssize_t lmv_mds_md_size(int stripe_count, + unsigned int lmm_magic) +{ + __kernel_ssize_t len = -EINVAL; + + switch (lmm_magic) { + case LMV_MAGIC_V1: { + struct lmv_mds_md_v1 *lmm1; + + len = sizeof(*lmm1); + len += stripe_count * sizeof(lmm1->lmv_stripe_fids[0]); + break; } + default: + break; + } + return len; +} + +static inline int lmv_mds_md_stripe_count_get(const union lmv_mds_md *lmm) +{ + switch (__le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + return __le32_to_cpu(lmm->lmv_md_v1.lmv_stripe_count); + case LMV_USER_MAGIC: + return __le32_to_cpu(lmm->lmv_user_md.lum_stripe_count); + default: + return -EINVAL; + } +} + +static inline int lmv_mds_md_hash_type_get(const union lmv_mds_md *lmm) +{ + switch (__le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + return __le32_to_cpu(lmm->lmv_md_v1.lmv_hash_type); + case LMV_USER_MAGIC: + return __le32_to_cpu(lmm->lmv_user_md.lum_hash_type); + default: + return -EINVAL; + } +} + +enum fld_rpc_opc { + FLD_QUERY = 900, + FLD_READ = 901, + FLD_LAST_OPC, + FLD_FIRST_OPC = FLD_QUERY +}; + +enum seq_rpc_opc { + SEQ_QUERY = 700, + SEQ_LAST_OPC, + SEQ_FIRST_OPC = SEQ_QUERY +}; + +enum seq_op { + SEQ_ALLOC_SUPER = 0, + SEQ_ALLOC_META = 1 +}; + +enum fld_op { + FLD_CREATE = 0, + FLD_DELETE = 1, + FLD_LOOKUP = 2, +}; + +/* LFSCK opcodes */ +enum lfsck_cmd { + LFSCK_NOTIFY = 1101, + LFSCK_QUERY = 1102, + LFSCK_LAST_OPC, + 
LFSCK_FIRST_OPC = LFSCK_NOTIFY +}; + +/* + * LOV data structures + */ + +#define LOV_MAX_UUID_BUFFER_SIZE 8192 +/* The size of the buffer the lov/mdc reserves for the + * array of UUIDs returned by the MDS. With the current + * protocol, this will limit the max number of OSTs per LOV */ + +#define LOV_DESC_MAGIC 0xB0CCDE5C +#define LOV_DESC_QOS_MAXAGE_DEFAULT 5 /* Seconds */ +#define LOV_DESC_STRIPE_SIZE_DEFAULT (1 << LNET_MTU_BITS) + +/* LOV settings descriptor (should only contain static info) */ +struct lov_desc { + __u32 ld_tgt_count; /* how many OBD's */ + __u32 ld_active_tgt_count; /* how many active */ + __s32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default PATTERN_RAID0 */ + __u64 ld_default_stripe_size; /* in bytes */ + __s64 ld_default_stripe_offset; /* starting OST index */ + __u32 ld_padding_0; /* unused */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lov_desc */ + struct obd_uuid ld_uuid; +}; + +#define ld_magic ld_active_tgt_count /* for swabbing from llogs */ + +/* + * LDLM requests: + */ +/* opcodes -- MUST be distinct from OST/MDS opcodes */ +enum ldlm_cmd { + LDLM_ENQUEUE = 101, + LDLM_CONVERT = 102, + LDLM_CANCEL = 103, + LDLM_BL_CALLBACK = 104, + LDLM_CP_CALLBACK = 105, + LDLM_GL_CALLBACK = 106, + LDLM_SET_INFO = 107, + LDLM_LAST_OPC +}; +#define LDLM_FIRST_OPC LDLM_ENQUEUE + +#define RES_NAME_SIZE 4 +struct ldlm_res_id { + __u64 name[RES_NAME_SIZE]; +}; + +#define DLDLMRES "[%#llx:%#llx:%#llx].%#llx" +#define PLDLMRES(res) (unsigned long long)(res)->lr_name.name[0], \ + (unsigned long long)(res)->lr_name.name[1], \ + (unsigned long long)(res)->lr_name.name[2], \ + (unsigned long long)(res)->lr_name.name[3] + +/* lock types */ +enum ldlm_mode { + LCK_MINMODE = 0, + LCK_EX = 1, + LCK_PW = 2, + LCK_PR = 4, + LCK_CW = 8, + LCK_CR = 16, + LCK_NL = 32, + LCK_GROUP = 64, + LCK_COS = 128, + LCK_MAXMODE +}; + +#define LCK_MODE_NUM 8 + +enum ldlm_type { + LDLM_PLAIN = 10, + LDLM_EXTENT = 11, + LDLM_FLOCK = 12, + LDLM_IBITS = 13, + LDLM_MAX_TYPE +}; + +#define LDLM_MIN_TYPE LDLM_PLAIN + +struct ldlm_extent { + __u64 start; + __u64 end; + __u64 gid; +}; + +static inline bool ldlm_extent_equal(const struct ldlm_extent *ex1, + const struct ldlm_extent *ex2) +{ + return ex1->start == ex2->start && ex1->end == ex2->end; +} + +struct ldlm_inodebits { + __u64 bits; + union { + __u64 try_bits; /* optional bits to try */ + __u64 cancel_bits; /* for lock convert */ + }; + __u64 li_gid; +}; + +struct ldlm_flock_wire { + __u64 lfw_start; + __u64 lfw_end; + __u64 lfw_owner; + __u32 lfw_padding; + __u32 lfw_pid; +}; + +/* it's important that the fields of the ldlm_extent structure match + * the first fields of the ldlm_flock structure because there is only + * one ldlm_swab routine to process the ldlm_policy_data_t union. if + * this ever changes we will need to swab the union differently based + * on the resource type. 
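+ * (Concretely, ldlm_extent's start/end correspond to ldlm_flock_wire's
+ * lfw_start/lfw_end above.)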
*/ + +union ldlm_wire_policy_data { + struct ldlm_extent l_extent; + struct ldlm_flock_wire l_flock; + struct ldlm_inodebits l_inodebits; +}; + +struct barrier_lvb { + __u32 lvb_status; + __u32 lvb_index; + __u64 lvb_padding; +}; + +struct ldlm_gl_barrier_desc { + __u32 lgbd_status; + __u32 lgbd_timeout; + __u64 lgbd_padding; +}; + +union ldlm_gl_desc { + struct ldlm_gl_lquota_desc lquota_desc; + struct ldlm_gl_barrier_desc barrier_desc; +}; + +enum ldlm_intent_flags { + IT_OPEN = 0x00000001, + IT_CREAT = 0x00000002, + IT_OPEN_CREAT = IT_OPEN | IT_CREAT, /* To allow case label. */ + IT_READDIR = 0x00000004, /* Used by mdc, not put on the wire. */ + IT_GETATTR = 0x00000008, + IT_LOOKUP = 0x00000010, +/* IT_UNLINK = 0x00000020, Obsolete. */ +/* IT_TRUNC = 0x00000040, Obsolete. */ + IT_GETXATTR = 0x00000080, +/* IT_EXEC = 0x00000100, Obsolete. */ +/* IT_PIN = 0x00000200, Obsolete. */ + IT_LAYOUT = 0x00000400, + IT_QUOTA_DQACQ = 0x00000800, + IT_QUOTA_CONN = 0x00001000, +/* IT_SETXATTR = 0x00002000, Obsolete. */ + IT_GLIMPSE = 0x00004000, + IT_BRW = 0x00008000, +}; + +struct ldlm_intent { + __u64 opc; +}; + +struct ldlm_resource_desc { + enum ldlm_type lr_type; + __u32 lr_pad; /* also fix lustre_swab_ldlm_resource_desc */ + struct ldlm_res_id lr_name; +}; + +struct ldlm_lock_desc { + struct ldlm_resource_desc l_resource; + enum ldlm_mode l_req_mode; + enum ldlm_mode l_granted_mode; + union ldlm_wire_policy_data l_policy_data; +}; + +#define LDLM_LOCKREQ_HANDLES 2 +#define LDLM_ENQUEUE_CANCEL_OFF 1 + +struct ldlm_request { + __u32 lock_flags; /* LDLM_FL_*, see lustre_dlm_flags.h */ + __u32 lock_count; /* number of locks in lock_handle[] */ + struct ldlm_lock_desc lock_desc;/* lock descriptor */ + struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; +}; + +struct ldlm_reply { + __u32 lock_flags; + __u32 lock_padding; /* also fix lustre_swab_ldlm_reply */ + struct ldlm_lock_desc lock_desc; + struct lustre_handle lock_handle; + __u64 lock_policy_res1; + __u64 lock_policy_res2; +}; + +#define ldlm_flags_to_wire(flags) ((__u32)(flags)) +#define ldlm_flags_from_wire(flags) ((__u64)(flags)) + +/* + * Opcodes for mountconf (mgs and mgc) + */ +enum mgs_cmd { + MGS_CONNECT = 250, + MGS_DISCONNECT = 251, + MGS_EXCEPTION = 252, /* node died, etc. */ + MGS_TARGET_REG = 253, /* whenever target starts up */ + MGS_TARGET_DEL = 254, + MGS_SET_INFO = 255, + MGS_CONFIG_READ = 256, + MGS_LAST_OPC, + MGS_FIRST_OPC = MGS_CONNECT +}; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 18, 53, 0) +#define MGS_PARAM_MAXLEN 1024 +#define KEY_SET_INFO "set_info" + +struct mgs_send_param { + char mgs_param[MGS_PARAM_MAXLEN]; +}; +#endif + +/* We pass this info to the MGS so it can write config logs */ +#define MTI_NAME_MAXLEN 64 +#define MTI_PARAM_MAXLEN 4096 +#define MTI_NIDS_MAX 32 +struct mgs_target_info { + __u32 mti_lustre_ver; + __u32 mti_stripe_index; + __u32 mti_config_ver; + __u32 mti_flags; /* LDD_F_* */ + __u32 mti_nid_count; + __u32 mti_instance; /* Running instance of target */ + char mti_fsname[MTI_NAME_MAXLEN]; + char mti_svname[MTI_NAME_MAXLEN]; + char mti_uuid[sizeof(struct obd_uuid)]; + __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t) */ + char mti_params[MTI_PARAM_MAXLEN]; +}; + +struct mgs_nidtbl_entry { + __u64 mne_version; /* table version of this entry */ + __u32 mne_instance; /* target instance # */ + __u32 mne_index; /* target index */ + __u32 mne_length; /* length of this entry - by bytes */ + __u8 mne_type; /* target type LDD_F_SV_TYPE_OST/MDT */ + __u8 mne_nid_type; /* type of nid(mbz). 
for ipv6. */ + __u8 mne_nid_size; /* size of each NID, by bytes */ + __u8 mne_nid_count; /* # of NIDs in buffer */ + union { + lnet_nid_t nids[0]; /* variable size buffer for NIDs. */ + } u; +}; + +enum mgs_cfg_type { + MGS_CFG_T_CONFIG = 0, + MGS_CFG_T_SPTLRPC = 1, + MGS_CFG_T_RECOVER = 2, + MGS_CFG_T_PARAMS = 3, + MGS_CFG_T_NODEMAP = 4, + MGS_CFG_T_BARRIER = 5, + MGS_CFG_T_MAX +}; + +struct mgs_config_body { + char mcb_name[MTI_NAME_MAXLEN]; /* logname */ + __u64 mcb_offset; /* next index of config log to request */ + __u16 mcb_type; /* type of log: MGS_CFG_T_[CONFIG|RECOVER] */ + __u8 mcb_nm_cur_pass; + __u8 mcb_bits; /* bits unit size of config log */ + __u32 mcb_units; /* # of units for bulk transfer */ +}; + +struct mgs_config_res { + __u64 mcr_offset; /* index of last config log */ + union { + __u64 mcr_size; /* size of the log */ + __u64 mcr_nm_cur_pass; /* current nodemap config pass */ + }; +}; + +/* Config marker flags (in config log) */ +#define CM_START 0x01 +#define CM_END 0x02 +#define CM_SKIP 0x04 +#define CM_UPGRADE146 0x08 +#define CM_EXCLUDE 0x10 +#define CM_START_SKIP (CM_START | CM_SKIP) + +struct cfg_marker { + __u32 cm_step; /* aka config version */ + __u32 cm_flags; + __u32 cm_vers; /* lustre release version number */ + __u32 cm_padding; /* 64 bit align */ + __s64 cm_createtime; /*when this record was first created */ + __s64 cm_canceltime; /*when this record is no longer valid*/ + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +/* + * Opcodes for multiple servers. + */ +enum obd_cmd { + OBD_PING = 400, +/* OBD_LOG_CANCEL = 401, obsolete since 1.5 */ +/* OBD_QC_CALLBACK = 402, obsolete since 2.4 */ + OBD_IDX_READ = 403, + OBD_LAST_OPC, + OBD_FIRST_OPC = OBD_PING +}; + +/** + * llog contexts indices. + * + * There is compatibility problem with indexes below, they are not + * continuous and must keep their numbers for compatibility needs. + * See LU-5218 for details. + */ +enum llog_ctxt_id { + LLOG_CONFIG_ORIG_CTXT = 0, + LLOG_CONFIG_REPL_CTXT = 1, + LLOG_MDS_OST_ORIG_CTXT = 2, + LLOG_MDS_OST_REPL_CTXT = 3, /* kept just to avoid re-assignment */ + LLOG_SIZE_ORIG_CTXT = 4, + LLOG_SIZE_REPL_CTXT = 5, + LLOG_TEST_ORIG_CTXT = 8, + LLOG_TEST_REPL_CTXT = 9, /* kept just to avoid re-assignment */ + LLOG_CHANGELOG_ORIG_CTXT = 12, /**< changelog generation on mdd */ + LLOG_CHANGELOG_REPL_CTXT = 13, /**< changelog access on clients */ + /* for multiple changelog consumers */ + LLOG_CHANGELOG_USER_ORIG_CTXT = 14, + LLOG_AGENT_ORIG_CTXT = 15, /**< agent requests generation on cdt */ + LLOG_UPDATELOG_ORIG_CTXT = 16, /* update log. reserve for the client */ + LLOG_UPDATELOG_REPL_CTXT = 17, /* update log. reserve for the client */ + LLOG_MAX_CTXTS +}; + +/** Identifier for a single log object */ +struct llog_logid { + struct ost_id lgl_oi; + __u32 lgl_ogen; +} __attribute__((packed)); + +/** Records written to the CATALOGS list */ +#define CATLIST "CATALOGS" +struct llog_catid { + struct llog_logid lci_logid; + __u32 lci_padding1; + __u32 lci_padding2; + __u32 lci_padding3; +} __attribute__((packed)); + +/* Log data record types - there is no specific reason that these need to + * be related to the RPC opcodes, but no reason not to (may be handy later?) 
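+ *
+ * All types below are built as LLOG_OP_MAGIC plus a type-specific low
+ * part, so (lrh_type & LLOG_OP_MASK) == LLOG_OP_MAGIC identifies a valid
+ * record (see LLOG_REC_HDR_NEEDS_SWABBING for the byte-swapped check);
+ * the REINT-derived types additionally embed (MDS_REINT << 8) | REINT_*.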
+ */ +#define LLOG_OP_MAGIC 0x10600000 +#define LLOG_OP_MASK 0xfff00000 + +enum llog_op_type { + LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000, + OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00, + /* OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, never used */ + MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | + REINT_UNLINK, /* obsolete after 2.5.0 */ + MDS_UNLINK64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_UNLINK, + /* MDS_SETATTR_REC = LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */ + MDS_SETATTR64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_SETATTR, + OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000, + /* PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */ + LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000, + /* LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, obsolete 1.8.0 */ + CHANGELOG_REC = LLOG_OP_MAGIC | 0x60000, + CHANGELOG_USER_REC = LLOG_OP_MAGIC | 0x70000, + CHANGELOG_USER_REC2 = LLOG_OP_MAGIC | 0x70002, + HSM_AGENT_REC = LLOG_OP_MAGIC | 0x80000, + UPDATE_REC = LLOG_OP_MAGIC | 0xa0000, /* Resevered to avoid + * use on client. + */ + LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539, + LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b, +}; + +#define LLOG_REC_HDR_NEEDS_SWABBING(r) \ + (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC)) + +/** Log record header - stored in little endian order. + * Each record must start with this struct, end with a llog_rec_tail, + * and be a multiple of 256 bits in size. + */ +struct llog_rec_hdr { + __u32 lrh_len; + __u32 lrh_index; + __u32 lrh_type; + __u32 lrh_id; +} __attribute__((packed)); + +struct llog_rec_tail { + __u32 lrt_len; + __u32 lrt_index; +} __attribute__((packed)); + +/* Where data follow just after header */ +#define REC_DATA(ptr) \ + ((void *)((char *)ptr + sizeof(struct llog_rec_hdr))) + +#define REC_DATA_LEN(rec) \ + (rec->lrh_len - sizeof(struct llog_rec_hdr) - \ + sizeof(struct llog_rec_tail)) + +struct llog_logid_rec { + struct llog_rec_hdr lid_hdr; + struct llog_logid lid_id; + __u32 lid_padding1; + __u64 lid_padding2; + __u64 lid_padding3; + struct llog_rec_tail lid_tail; +} __attribute__((packed)); + +struct llog_unlink_rec { + struct llog_rec_hdr lur_hdr; + __u64 lur_oid; + __u32 lur_oseq; + __u32 lur_count; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_unlink64_rec { + struct llog_rec_hdr lur_hdr; + struct lu_fid lur_fid; + __u32 lur_count; /* to destroy the lost precreated */ + __u32 lur_padding1; + __u64 lur_padding2; + __u64 lur_padding3; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_setattr64_rec { + struct llog_rec_hdr lsr_hdr; + struct ost_id lsr_oi; + __u32 lsr_uid; + __u32 lsr_uid_h; + __u32 lsr_gid; + __u32 lsr_gid_h; + __u64 lsr_valid; + struct llog_rec_tail lsr_tail; +} __attribute__((packed)); + +/* Extended to support project quota */ +struct llog_setattr64_rec_v2 { + struct llog_rec_hdr lsr_hdr; + struct ost_id lsr_oi; + __u32 lsr_uid; + __u32 lsr_uid_h; + __u32 lsr_gid; + __u32 lsr_gid_h; + __u64 lsr_valid; + __u32 lsr_projid; + __u32 lsr_layout_version; + __u64 lsr_padding2; + __u64 lsr_padding3; + struct llog_rec_tail lsr_tail; +} __attribute__((packed)); + +struct llog_size_change_rec { + struct llog_rec_hdr lsc_hdr; + struct ll_fid lsc_fid; + __u32 lsc_ioepoch; + __u32 lsc_padding1; + __u64 lsc_padding2; + __u64 lsc_padding3; + struct llog_rec_tail lsc_tail; +} __attribute__((packed)); + +#define CHANGELOG_MAGIC 0xca103000 + +/** \a changelog_rec_type's that can't be masked */ +#define CHANGELOG_MINMASK BIT(CL_MARK) +/** bits covering all 
\a changelog_rec_type's */ +#define CHANGELOG_ALLMASK (BIT(CL_LAST) - 1) +/** default \a changelog_rec_type mask. Allow all of them, except + * CL_ATIME since it can really be time consuming, and not necessary + * under normal use. + * Remove also CL_OPEN, CL_GETXATTR and CL_DN_OPEN from default list as it can + * be costly and only necessary for audit purpose. + */ +#define CHANGELOG_DEFMASK (CHANGELOG_ALLMASK & \ + ~(BIT(CL_ATIME) | BIT(CL_OPEN) | BIT(CL_GETXATTR) | BIT(CL_DN_OPEN))) + +/* changelog llog name, needed by client replicators */ +#define CHANGELOG_CATALOG "changelog_catalog" + +struct changelog_setinfo { + __u64 cs_recno; + __u32 cs_id; +} __attribute__((packed)); + +/** changelog record */ +struct llog_changelog_rec { + struct llog_rec_hdr cr_hdr; + struct changelog_rec cr; /**< Variable length field */ + struct llog_rec_tail cr_do_not_use; /**< for_sizeof_only */ +} __attribute__((packed)); + +#define CHANGELOG_USER_PREFIX "cl" +#define CHANGELOG_USER_NAMELEN 16 /* base name including NUL terminator */ +#define CHANGELOG_USER_NAMELEN_FULL 30 /* basename plus 'cl$ID-' prefix */ + +struct llog_changelog_user_rec { + struct llog_rec_hdr cur_hdr; + __u32 cur_id; + /* only intended to be used in relative time comparisons to + * detect idle users */ + __u32 cur_time; + __u64 cur_endrec; + struct llog_rec_tail cur_tail; +} __attribute__((packed)); + +/* this is twice the size of CHANGELOG_USER_REC */ +struct llog_changelog_user_rec2 { + struct llog_rec_hdr cur_hdr; + __u32 cur_id; + /* only for use in relative time comparisons to detect idle users */ + __u32 cur_time; + __u64 cur_endrec; + __u32 cur_mask; + __u32 cur_padding1; + char cur_name[CHANGELOG_USER_NAMELEN]; + __u64 cur_padding2; + __u64 cur_padding3; + struct llog_rec_tail cur_tail; +} __attribute__((packed)); + +enum agent_req_status { + ARS_WAITING, + ARS_STARTED, + ARS_FAILED, + ARS_CANCELED, + ARS_SUCCEED, +}; + +static inline const char *agent_req_status2name(enum agent_req_status ars) +{ + switch (ars) { + case ARS_WAITING: + return "WAITING"; + case ARS_STARTED: + return "STARTED"; + case ARS_FAILED: + return "FAILED"; + case ARS_CANCELED: + return "CANCELED"; + case ARS_SUCCEED: + return "SUCCEED"; + default: + return "UNKNOWN"; + } +} + +struct llog_agent_req_rec { + struct llog_rec_hdr arr_hdr; /**< record header */ + __u32 arr_status; /**< status of the request */ + /* must match enum + * agent_req_status */ + __u32 arr_archive_id; /**< backend archive number */ + __u64 arr_flags; /**< req flags */ + __u64 arr_compound_id; /**< compound cookie, ignored */ + __u64 arr_req_create; /**< req. creation time */ + __u64 arr_req_change; /**< req. status change time */ + struct hsm_action_item arr_hai; /**< req. 
to the agent */ + struct llog_rec_tail arr_tail; /**< record tail for_sizezof_only */ +} __attribute__((packed)); + +/* Old llog gen for compatibility */ +struct llog_gen { + __u64 mnt_cnt; + __u64 conn_cnt; +} __attribute__((packed)); + +struct llog_gen_rec { + struct llog_rec_hdr lgr_hdr; + struct llog_gen lgr_gen; + __u64 padding1; + __u64 padding2; + __u64 padding3; + struct llog_rec_tail lgr_tail; +}; + +/* flags for the logs */ +enum llog_flag { + LLOG_F_ZAP_WHEN_EMPTY = 0x1, + LLOG_F_IS_CAT = 0x2, + LLOG_F_IS_PLAIN = 0x4, + LLOG_F_EXT_JOBID = 0x8, + LLOG_F_IS_FIXSIZE = 0x10, + LLOG_F_EXT_EXTRA_FLAGS = 0x20, + LLOG_F_EXT_X_UIDGID = 0x40, + LLOG_F_EXT_X_NID = 0x80, + LLOG_F_EXT_X_OMODE = 0x100, + LLOG_F_EXT_X_XATTR = 0x200, + LLOG_F_RM_ON_ERR = 0x400, + + /* Note: Flags covered by LLOG_F_EXT_MASK will be inherited from + * catlog to plain log, so do not add LLOG_F_IS_FIXSIZE here, + * because the catlog record is usually fixed size, but its plain + * log record can be variable */ + LLOG_F_EXT_MASK = LLOG_F_EXT_JOBID | LLOG_F_EXT_EXTRA_FLAGS | + LLOG_F_EXT_X_UIDGID | LLOG_F_EXT_X_NID | + LLOG_F_EXT_X_OMODE | LLOG_F_EXT_X_XATTR, +}; + +/* means first record of catalog */ +enum { + LLOG_CAT_FIRST = -1, +}; + +/* On-disk header structure of each log object, stored in little endian order */ +#define LLOG_MIN_CHUNK_SIZE 8192 +#define LLOG_HEADER_SIZE (96) /* sizeof (llog_log_hdr) + sizeof(llh_tail) + * - sizeof(llh_bitmap) */ +#define LLOG_BITMAP_BYTES (LLOG_MIN_CHUNK_SIZE - LLOG_HEADER_SIZE) +#define LLOG_MIN_REC_SIZE (24) /* round(llog_rec_hdr + llog_rec_tail) */ + +struct llog_log_hdr { + struct llog_rec_hdr llh_hdr; + __s64 llh_timestamp; + __u32 llh_count; + __u32 llh_bitmap_offset; + __u32 llh_size; + __u32 llh_flags; + /* for a catalog the first/oldest and still in-use plain slot is just + * next to it. It will serve as the upper limit after Catalog has + * wrapped around */ + __u32 llh_cat_idx; + struct obd_uuid llh_tgtuuid; + __u32 llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32)-23]; + /* These fields must always be at the end of the llog_log_hdr. + * Note: llh_bitmap size is variable because llog chunk size could be + * bigger than LLOG_MIN_CHUNK_SIZE, i.e. sizeof(llog_log_hdr) > 8192 + * bytes, and the real size is stored in llh_hdr.lrh_len, which means + * llh_tail should only be refered by LLOG_HDR_TAIL(). + * But this structure is also used by client/server llog interface + * (see llog_client.c), it will be kept in its original way to avoid + * compatiblity issue. 
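+ * The LLOG_HDR_BITMAP_SIZE()/LLOG_HDR_BITMAP()/LLOG_HDR_TAIL() helpers
+ * defined below derive the real bitmap size and tail position from
+ * llh_hdr.lrh_len and llh_bitmap_offset for that reason.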
*/ + __u32 llh_bitmap[LLOG_BITMAP_BYTES / sizeof(__u32)]; + struct llog_rec_tail llh_tail; +} __attribute__((packed)); +#undef LLOG_HEADER_SIZE +#undef LLOG_BITMAP_BYTES + +#define LLOG_HDR_BITMAP_SIZE(llh) (__u32)((llh->llh_hdr.lrh_len - \ + llh->llh_bitmap_offset - \ + sizeof(llh->llh_tail)) * 8) +#define LLOG_HDR_BITMAP(llh) (__u32 *)((char *)(llh) + \ + (llh)->llh_bitmap_offset) +#define LLOG_HDR_TAIL(llh) ((struct llog_rec_tail *)((char *)llh + \ + llh->llh_hdr.lrh_len - \ + sizeof(llh->llh_tail))) + +/** log cookies are used to reference a specific log file and a record therein, + and pass record offset from llog_process_thread to llog_write */ +struct llog_cookie { + union { + struct llog_logid lgc_lgl; + __u64 lgc_offset; + }; + __u32 lgc_subsys; + __u32 lgc_index; + __u32 lgc_padding; +} __attribute__((packed)); + +/** llog protocol */ +enum llogd_rpc_ops { + LLOG_ORIGIN_HANDLE_CREATE = 501, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, + LLOG_ORIGIN_HANDLE_READ_HEADER = 503, +/* LLOG_ORIGIN_HANDLE_WRITE_REC = 504, Obsolete by 2.1. */ +/* LLOG_ORIGIN_HANDLE_CLOSE = 505, Obsolete by 1.8. */ +/* LLOG_ORIGIN_CONNECT = 506, Obsolete by 2.4. */ +/* LLOG_CATINFO = 507, Obsolete by 2.3. */ + LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, + LLOG_ORIGIN_HANDLE_DESTROY = 509, /* Obsolete by 2.11. */ + LLOG_LAST_OPC, + LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE +}; + +struct llogd_body { + struct llog_logid lgd_logid; + __u32 lgd_ctxt_idx; + __u32 lgd_llh_flags; + __u32 lgd_index; + __u32 lgd_saved_index; + __u32 lgd_len; + __u64 lgd_cur_offset; +} __attribute__((packed)); + +struct llogd_conn_body { + struct llog_gen lgdc_gen; + struct llog_logid lgdc_logid; + __u32 lgdc_ctxt_idx; +} __attribute__((packed)); + +/* Note: 64-bit types are 64-bit aligned in structure */ +struct obdo { + __u64 o_valid; /* hot fields in this obdo */ + struct ost_id o_oi; + __u64 o_parent_seq; + __u64 o_size; /* o_size-o_blocks == ost_lvb */ + __s64 o_mtime; + __s64 o_atime; + __s64 o_ctime; + __u64 o_blocks; /* brw: cli sent cached bytes */ + __u64 o_grant; + + /* 32-bit fields start here: keep an even number of them via padding */ + __u32 o_blksize; /* optimal IO blocksize */ + __u32 o_mode; /* brw: cli sent cache remain */ + __u32 o_uid; + __u32 o_gid; + __u32 o_flags; + __u32 o_nlink; /* brw: checksum */ + __u32 o_parent_oid; + __u32 o_misc; /* brw: o_dropped */ + + __u64 o_ioepoch; /* epoch in ost writes */ + __u32 o_stripe_idx; /* holds stripe idx */ + __u32 o_parent_ver; + struct lustre_handle o_handle; /* brw: lock handle to prolong + * locks */ + /* Originally, the field is llog_cookie for destroy with unlink cookie + * from MDS, it is obsolete in 2.8. Then reuse it by client to transfer + * layout and PFL information in IO, setattr RPCs. Since llog_cookie is + * not used on wire any longer, remove it from the obdo, then it can be + * enlarged freely in the further without affect related RPCs. + * + * sizeof(ost_layout) + sieof(__u32) == sizeof(llog_cookie). */ + struct ost_layout o_layout; + __u32 o_layout_version; + __u32 o_uid_h; + __u32 o_gid_h; + + __u64 o_data_version; /* getattr: sum of iversion for + * each stripe. 
+ * brw: grant space consumed on + * the client for the write */ + __u32 o_projid; + __u32 o_padding_4; /* also fix + * lustre_swab_obdo() */ + __u64 o_padding_5; + __u64 o_padding_6; +}; + +#define o_dirty o_blocks +#define o_undirty o_mode +#define o_dropped o_misc +#define o_cksum o_nlink +#define o_grant_used o_data_version +#define o_falloc_mode o_nlink + +struct lfsck_request { + __u32 lr_event; + __u32 lr_index; + __u32 lr_flags; + __u32 lr_valid; + union { + __u32 lr_speed; + __u32 lr_status; + }; + __u16 lr_version; + __u16 lr_active; + __u16 lr_param; + __u16 lr_async_windows; + __u32 lr_flags2; + struct lu_fid lr_fid; + struct lu_fid lr_fid2; + __u32 lr_comp_id; + __u32 lr_padding_0; + __u64 lr_padding_1; + __u64 lr_padding_2; + __u64 lr_padding_3; +}; + +struct lfsck_reply { + __u32 lr_status; + __u32 lr_padding_1; + __u64 lr_repaired; +}; + +enum lfsck_events { + LE_LASTID_REBUILDING = 1, + LE_LASTID_REBUILT = 2, + LE_PHASE1_DONE = 3, + LE_PHASE2_DONE = 4, + LE_START = 5, + LE_STOP = 6, + LE_QUERY = 7, + /* LE_FID_ACCESSED = 8, moved to lfsck_events_local */ + LE_PEER_EXIT = 9, + LE_CONDITIONAL_DESTROY = 10, + LE_PAIRS_VERIFY = 11, + LE_SET_LMV_MASTER = 15, + LE_SET_LMV_SLAVE = 16, +}; + +enum lfsck_event_flags { + LEF_TO_OST = 0x00000001, + LEF_FROM_OST = 0x00000002, + LEF_SET_LMV_HASH = 0x00000004, + LEF_SET_LMV_ALL = 0x00000008, + LEF_RECHECK_NAME_HASH = 0x00000010, + LEF_QUERY_ALL = 0x00000020, +}; + +/* request structure for OST's */ +struct ost_body { + struct obdo oa; +}; + +/* Key for FIEMAP to be used in get_info calls */ +struct ll_fiemap_info_key { + char lfik_name[8]; + struct obdo lfik_oa; + struct fiemap lfik_fiemap; +}; + +#define IDX_INFO_MAGIC 0x3D37CC37 + +/* Index file transfer through the network. The server serializes the index into + * a byte stream which is sent to the client via a bulk transfer */ +struct idx_info { + __u32 ii_magic; + + /* reply: see idx_info_flags below */ + __u32 ii_flags; + + /* request & reply: number of lu_idxpage (to be) transferred */ + __u16 ii_count; + __u16 ii_pad0; + + /* request: requested attributes passed down to the iterator API */ + __u32 ii_attrs; + + /* request & reply: index file identifier (FID) */ + struct lu_fid ii_fid; + + /* reply: version of the index file before starting to walk the index. 
+ * Please note that the version can be modified at any time during the + * transfer */ + __u64 ii_version; + + /* request: hash to start with: + * reply: hash of the first entry of the first lu_idxpage and hash + * of the entry to read next if any */ + __u64 ii_hash_start; + __u64 ii_hash_end; + + /* reply: size of keys in lu_idxpages, minimal one if II_FL_VARKEY is + * set */ + __u16 ii_keysize; + + /* reply: size of records in lu_idxpages, minimal one if II_FL_VARREC + * is set */ + __u16 ii_recsize; + + __u32 ii_pad1; + __u64 ii_pad2; + __u64 ii_pad3; +}; + +#define II_END_OFF MDS_DIR_END_OFF /* all entries have been read */ + +/* List of flags used in idx_info::ii_flags */ +enum idx_info_flags { + II_FL_NOHASH = 1 << 0, /* client doesn't care about hash value */ + II_FL_VARKEY = 1 << 1, /* keys can be of variable size */ + II_FL_VARREC = 1 << 2, /* records can be of variable size */ + II_FL_NONUNQ = 1 << 3, /* index supports non-unique keys */ + II_FL_NOKEY = 1 << 4, /* client doesn't care about key */ +}; + +#define LIP_MAGIC 0x8A6D6B6C + +/* 4KB (= LU_PAGE_SIZE) container gathering key/record pairs */ +struct lu_idxpage { + /* 16-byte header */ + __u32 lip_magic; + __u16 lip_flags; + __u16 lip_nr; /* number of entries in the container */ + __u64 lip_pad0; /* additional padding for future use */ + + /* key/record pairs are stored in the remaining 4080 bytes. + * depending upon the flags in idx_info::ii_flags, each key/record + * pair might be preceded by: + * - a hash value + * - the key size (II_FL_VARKEY is set) + * - the record size (II_FL_VARREC is set) + * + * For the time being, we only support fixed-size key & record. */ + char lip_entries[0]; +}; + +#define LIP_HDR_SIZE (offsetof(struct lu_idxpage, lip_entries)) + +/* Gather all possible type associated with a 4KB container */ +union lu_page { + struct lu_dirpage lp_dir; /* for MDS_READPAGE */ + struct lu_idxpage lp_idx; /* for OBD_IDX_READ */ + char lp_array[LU_PAGE_SIZE]; +}; + +/* security opcodes */ +enum sec_cmd { + SEC_CTX_INIT = 801, + SEC_CTX_INIT_CONT = 802, + SEC_CTX_FINI = 803, + SEC_LAST_OPC, + SEC_FIRST_OPC = SEC_CTX_INIT +}; + +/** The link ea holds 1 \a link_ea_entry for each hardlink */ +#define LINK_EA_MAGIC 0x11EAF1DFUL +struct link_ea_header { + __u32 leh_magic; + __u32 leh_reccount; + __u64 leh_len; /* total size */ + __u32 leh_overflow_time; + __u32 leh_padding; +}; + +/** Hardlink data is name and parent fid. 
+ * Stored in this crazy struct for maximum packing and endian-neutrality + */ +struct link_ea_entry { + /** __u16 stored big-endian, unaligned */ + unsigned char lee_reclen[2]; + unsigned char lee_parent_fid[sizeof(struct lu_fid)]; + char lee_name[0]; +} __attribute__((packed)); + +/** fid2path request/reply structure */ +struct getinfo_fid2path { + struct lu_fid gf_fid; + __u64 gf_recno; + __u32 gf_linkno; + __u32 gf_pathlen; + union { + char gf_path[0]; + struct lu_fid gf_root_fid[0]; + } gf_u; +} __attribute__((packed)); + +/** path2parent request/reply structures */ +struct getparent { + struct lu_fid gp_fid; /**< parent FID */ + __u32 gp_linkno; /**< hardlink number */ + __u32 gp_name_size; /**< size of the name field */ + char gp_name[0]; /**< zero-terminated link name */ +} __attribute__((packed)); + +enum layout_intent_opc { + LAYOUT_INTENT_ACCESS = 0, /** generic access */ + LAYOUT_INTENT_READ = 1, /** not used */ + LAYOUT_INTENT_WRITE = 2, /** write file, for comp layout */ + LAYOUT_INTENT_GLIMPSE = 3, /** not used */ + LAYOUT_INTENT_TRUNC = 4, /** truncate file, for comp layout */ + LAYOUT_INTENT_RELEASE = 5, /** reserved for HSM release */ + LAYOUT_INTENT_RESTORE = 6, /** reserved for HSM restore */ +}; + +/* enqueue layout lock with intent */ +struct layout_intent { + __u32 li_opc; /* intent operation for enqueue, read, write etc */ + __u32 li_flags; + struct lu_extent li_extent; +} __attribute__((packed)); + +/** + * On the wire version of hsm_progress structure. + * + * Contains the userspace hsm_progress and some internal fields. + */ +struct hsm_progress_kernel { + /* Field taken from struct hsm_progress */ + struct lu_fid hpk_fid; + __u64 hpk_cookie; + struct hsm_extent hpk_extent; + __u16 hpk_flags; + __u16 hpk_errval; /* positive val */ + __u32 hpk_padding1; + /* Additional fields */ + __u64 hpk_data_version; + __u64 hpk_padding2; +} __attribute__((packed)); + +/** + * OUT_UPDATE RPC Format + * + * During the cross-ref operation, the Master MDT, which the client send the + * request to, will disassembly the operation into object updates, then OSP + * will send these updates to the remote MDT to be executed. + * + * An UPDATE_OBJ RPC does a list of updates. Each update belongs to an + * operation and does a type of modification to an object. + * + * Request Format + * + * update_buf + * update (1st) + * update (2nd) + * ... + * update (ub_count-th) + * + * ub_count must be less than or equal to UPDATE_PER_RPC_MAX. + * + * Reply Format + * + * update_reply + * rc [+ buffers] (1st) + * rc [+ buffers] (2st) + * ... + * rc [+ buffers] (nr_count-th) + * + * ur_count must be less than or equal to UPDATE_PER_RPC_MAX and should usually + * be equal to ub_count. + */ + +/** + * Type of each update, if adding/deleting update, please also update + * update_opcode in lustre/target/out_lib.c. 
+ */ +enum update_type { + OUT_START = 0, + OUT_CREATE = 1, + OUT_DESTROY = 2, + OUT_REF_ADD = 3, + OUT_REF_DEL = 4, + OUT_ATTR_SET = 5, + OUT_ATTR_GET = 6, + OUT_XATTR_SET = 7, + OUT_XATTR_GET = 8, + OUT_INDEX_LOOKUP = 9, + OUT_INDEX_INSERT = 10, + OUT_INDEX_DELETE = 11, + OUT_WRITE = 12, + OUT_XATTR_DEL = 13, + OUT_PUNCH = 14, + OUT_READ = 15, + OUT_NOOP = 16, + OUT_XATTR_LIST = 17, + OUT_LAST +}; + +enum update_flag { + UPDATE_FL_OST = 0x00000001, /* op from OST (not MDT) */ + UPDATE_FL_SYNC = 0x00000002, /* commit before replying */ + UPDATE_FL_COMMITTED = 0x00000004, /* op committed globally */ + UPDATE_FL_NOLOG = 0x00000008 /* for idempotent updates */ +}; + +struct object_update_param { + __u16 oup_len; /* length of this parameter */ + __u16 oup_padding; + __u32 oup_padding2; + char oup_buf[0]; +} __attribute__((packed)); + +/* object update */ +struct object_update { + __u16 ou_type; /* enum update_type */ + __u16 ou_params_count; /* update parameters count */ + __u32 ou_result_size; /* how many bytes can return */ + __u32 ou_flags; /* enum update_flag */ + __u32 ou_padding1; /* padding 1 */ + __u64 ou_batchid; /* op transno on master */ + struct lu_fid ou_fid; /* object to be updated */ + struct object_update_param ou_params[0]; /* update params */ +}; + +#define UPDATE_REQUEST_MAGIC_V1 0xBDDE0001 +#define UPDATE_REQUEST_MAGIC_V2 0xBDDE0002 +#define UPDATE_REQUEST_MAGIC UPDATE_REQUEST_MAGIC_V2 +/* Hold object_updates sending to the remote OUT in single RPC */ +struct object_update_request { + __u32 ourq_magic; + __u16 ourq_count; /* number of ourq_updates[] */ + __u16 ourq_padding; + struct object_update ourq_updates[0]; +}; + +#define OUT_UPDATE_HEADER_MAGIC 0xBDDF0001 +#define OUT_UPDATE_MAX_INLINE_SIZE 4096 +/* Header for updates request between MDTs */ +struct out_update_header { + __u32 ouh_magic; + __u32 ouh_count; + __u32 ouh_inline_length; + __u32 ouh_reply_size; + __u32 ouh_inline_data[0]; +}; + +struct out_update_buffer { + __u32 oub_size; + __u32 oub_padding; +}; + +/* the result of object update */ +struct object_update_result { + __u32 our_rc; + __u16 our_datalen; + __u16 our_padding; + __u32 our_data[0]; +}; + +#define UPDATE_REPLY_MAGIC_V1 0x00BD0001 +#define UPDATE_REPLY_MAGIC_V2 0x00BD0002 +#define UPDATE_REPLY_MAGIC UPDATE_REPLY_MAGIC_V2 +/* Hold object_update_results being replied from the remote OUT. 
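+ * + * Approximate, illustrative sketch only (not a definitive parser, and not part of this header): ourp_lens[] below holds one length per result, and the packed object_update_result entries follow the length array with every offset rounded up to 8 bytes, roughly:
+ * + *	ptr = (char *)reply + __ALIGN_KERNEL(offsetof(struct object_update_reply, ourp_lens[reply->ourp_count]), 8); + *	for (i = 0; i < index; i++) + *		ptr += __ALIGN_KERNEL(reply->ourp_lens[i], 8);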
*/ +struct object_update_reply { + __u32 ourp_magic; + __u16 ourp_count; + __u16 ourp_padding; + __u16 ourp_lens[0]; +}; + +/* read update result */ +struct out_read_reply { + __u32 orr_size; + __u32 orr_padding; + __u64 orr_offset; + char orr_data[0]; +}; + +/** layout swap request structure + * fid1 and fid2 are in mdt_body + */ +struct mdc_swap_layouts { + __u64 msl_flags; +} __attribute__((packed)); + +#define INLINE_RESYNC_ARRAY_SIZE 15 +struct close_data_resync_done { + __u32 resync_count; + __u32 resync_ids_inline[INLINE_RESYNC_ARRAY_SIZE]; +}; + +struct close_data { + struct lustre_handle cd_handle; + struct lu_fid cd_fid; + __u64 cd_data_version; + union { + __u64 cd_reserved[8]; + struct close_data_resync_done cd_resync; + /* split close */ + __u16 cd_mirror_id; + /* PCC release */ + __u32 cd_archive_id; + }; +}; + +/* Update llog format */ +struct update_op { + struct lu_fid uop_fid; + __u16 uop_type; + __u16 uop_param_count; + __u16 uop_params_off[]; +} __attribute__((packed)); + +struct update_ops { + struct update_op uops_op[0]; +}; + +struct update_params { + struct object_update_param up_params[0]; +}; + +enum update_records_flag { + UPDATE_RECORD_CONTINUE = 1 >> 0, +}; +/* + * This is the update record format used to store the updates in + * disk. All updates of the operation will be stored in ur_ops. + * All of parameters for updates of the operation will be stored + * in ur_params. + * To save the space of the record, parameters in ur_ops will only + * remember their offset in ur_params, so to avoid storing duplicate + * parameters in ur_params, which can help us save a lot space for + * operation like creating striped directory. + */ +struct update_records { + __u64 ur_master_transno; + __u64 ur_batchid; + __u32 ur_flags; + /* If the operation includes multiple updates, then ur_index + * means the index of the update inside the whole updates. 
*/ + __u32 ur_index; + __u32 ur_update_count; + __u32 ur_param_count; + struct update_ops ur_ops; + /* Note ur_ops has a variable size, so comment out + * the following ur_params, in case some use it directly + * update_records->ur_params + * + * struct update_params ur_params; + */ +}; + +struct llog_update_record { + struct llog_rec_hdr lur_hdr; + struct update_records lur_update_rec; + /* Note ur_update_rec has a variable size, so comment out + * the following ur_tail, in case someone use it directly + * + * struct llog_rec_tail lur_tail; + */ +}; + +/* sepol string format is: + * <1-digit for SELinux status>::: + */ +/* Max length of the sepol string + * Should be large enough to contain a sha512sum of the policy + */ +#define SELINUX_MODE_LEN 1 +#define SELINUX_POLICY_VER_LEN 3 /* 3 chars to leave room for the future */ +#define SELINUX_POLICY_HASH_LEN 64 +#define LUSTRE_NODEMAP_SEPOL_LENGTH (SELINUX_MODE_LEN + NAME_MAX + \ + SELINUX_POLICY_VER_LEN + \ + SELINUX_POLICY_HASH_LEN + 3) + +/* nodemap records, uses 32 byte record length */ +#define LUSTRE_NODEMAP_NAME_LENGTH 16 +struct nodemap_cluster_rec { + char ncr_name[LUSTRE_NODEMAP_NAME_LENGTH + 1]; + __u8 ncr_flags; + __u16 ncr_padding1; + __u32 ncr_squash_projid; + __u32 ncr_squash_uid; + __u32 ncr_squash_gid; +}; + +/* lnet_nid_t is 8 bytes */ +struct nodemap_range_rec { + lnet_nid_t nrr_start_nid; + lnet_nid_t nrr_end_nid; + __u64 nrr_padding1; + __u64 nrr_padding2; +}; + +struct nodemap_id_rec { + __u32 nir_id_fs; + __u32 nir_padding1; + __u64 nir_padding2; + __u64 nir_padding3; + __u64 nir_padding4; +}; + +struct nodemap_global_rec { + __u8 ngr_is_active; + __u8 ngr_padding1; + __u16 ngr_padding2; + __u32 ngr_padding3; + __u64 ngr_padding4; + __u64 ngr_padding5; + __u64 ngr_padding6; +}; + +union nodemap_rec { + struct nodemap_cluster_rec ncr; + struct nodemap_range_rec nrr; + struct nodemap_id_rec nir; + struct nodemap_global_rec ngr; +}; + +/* + * rawobj stuff for GSS + */ +typedef struct netobj_s { + __u32 len; + __u8 data[0]; +} netobj_t; + +typedef struct rawobj_s { + __u32 len; + __u8 *data; +} rawobj_t; + +/* + * GSS headers + * following 3 headers must have the same sizes and offsets + */ +struct gss_header { + __u8 gh_version; /* gss version */ + __u8 gh_sp; /* sec part */ + __u16 gh_pad0; + __u32 gh_flags; /* wrap flags */ + __u32 gh_proc; /* proc */ + __u32 gh_seq; /* sequence */ + __u32 gh_svc; /* service */ + __u32 gh_pad1; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; /* context handle */ +}; + +struct gss_rep_header { + __u8 gh_version; + __u8 gh_sp; + __u16 gh_pad0; + __u32 gh_flags; + __u32 gh_proc; + __u32 gh_major; + __u32 gh_minor; + __u32 gh_seqwin; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; +}; + +struct gss_err_header { + __u8 gh_version; + __u8 gh_sp; + __u16 gh_pad0; + __u32 gh_flags; + __u32 gh_proc; + __u32 gh_major; + __u32 gh_minor; + __u32 gh_pad1; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; +}; + +/* + * GSS part of wire context information sent from client, saved and + * used later by server. + */ +struct gss_wire_ctx { + __u32 gw_flags; + __u32 gw_proc; + __u32 gw_seq; + __u32 gw_svc; + rawobj_t gw_handle; +}; + +/* This is the lu_ladvise struct which goes out on the wire. + * Corresponds to the userspace arg llapi_lu_ladvise. 
+ * value[1-4] are unspecified fields, used differently by different advices */ +struct lu_ladvise { + __u16 lla_advice; /* advice type */ + __u16 lla_value1; /* values for different advice types */ + __u32 lla_value2; + __u64 lla_start; /* first byte of extent for advice */ + __u64 lla_end; /* last byte of extent for advice */ + __u32 lla_value3; + __u32 lla_value4; +}; + +/* This is the ladvise_hdr which goes on the wire, corresponds to the userspace + * arg llapi_ladvise_hdr. + * value[1-3] are unspecified fields, used differently by different advices */ +struct ladvise_hdr { + __u32 lah_magic; /* LADVISE_MAGIC */ + __u32 lah_count; /* number of advices */ + __u64 lah_flags; /* from enum ladvise_flag */ + __u32 lah_value1; /* unused */ + __u32 lah_value2; /* unused */ + __u64 lah_value3; /* unused */ + struct lu_ladvise lah_advise[0]; /* advices in this header */ +}; + +#if defined(__cplusplus) +} +#endif + +#endif +/** @} lustreidl */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h new file mode 100644 index 0000000000000..bfd91cf52e6fb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h @@ -0,0 +1,231 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. 
+ */ +#ifndef _UAPI_LUSTRE_IOCTL_H +#define _UAPI_LUSTRE_IOCTL_H + +#include +#include +#include +#include + +/* + * sparse kernel source annotations + */ +#ifndef __user +#define __user +#endif + +enum md_echo_cmd { + ECHO_MD_CREATE = 1, /* Open/Create file on MDT */ + ECHO_MD_MKDIR = 2, /* Mkdir on MDT */ + ECHO_MD_DESTROY = 3, /* Unlink file on MDT */ + ECHO_MD_RMDIR = 4, /* Rmdir on MDT */ + ECHO_MD_LOOKUP = 5, /* Lookup on MDT */ + ECHO_MD_GETATTR = 6, /* Getattr on MDT */ + ECHO_MD_SETATTR = 7, /* Setattr on MDT */ + ECHO_MD_ALLOC_FID = 8, /* Get FIDs from MDT */ +}; + +#define OBD_DEV_ID 1 +#define OBD_DEV_NAME "obd" +#define OBD_DEV_PATH "/dev/" OBD_DEV_NAME + +#define OBD_IOCTL_VERSION 0x00010004 +#define OBD_DEV_BY_DEVNAME 0xffffd0de + +struct obd_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + + union { + __u64 ioc_cookie; + __u64 ioc_u64_1; + }; + union { + __u32 ioc_conn1; + __u32 ioc_u32_1; + }; + union { + __u32 ioc_conn2; + __u32 ioc_u32_2; + }; + + struct obdo ioc_obdo1; + struct obdo ioc_obdo2; + + __u64 ioc_count; + __u64 ioc_offset; + __u32 ioc_dev; + __u32 ioc_command; + + __u64 ioc_nid; + __u32 ioc_nal; + __u32 ioc_type; + + /* buffers the kernel will treat as user pointers */ + __u32 ioc_plen1; + char __user *ioc_pbuf1; + __u32 ioc_plen2; + char __user *ioc_pbuf2; + + /* inline buffers for various arguments */ + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + __u32 ioc_inllen3; + char *ioc_inlbuf3; + __u32 ioc_inllen4; + char *ioc_inlbuf4; + + char ioc_bulk[0]; +}; + +struct obd_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +static inline __u32 obd_ioctl_packlen(struct obd_ioctl_data *data) +{ + __u32 len = __ALIGN_KERNEL(sizeof(*data), 8); + + len += __ALIGN_KERNEL(data->ioc_inllen1, 8); + len += __ALIGN_KERNEL(data->ioc_inllen2, 8); + len += __ALIGN_KERNEL(data->ioc_inllen3, 8); + len += __ALIGN_KERNEL(data->ioc_inllen4, 8); + + return len; +} + +/* + * OBD_IOC_DATA_TYPE is only for compatibility reasons with older + * Linux Lustre user tools. New ioctls should NOT use this macro as + * the ioctl "size". Instead the ioctl should get a "size" argument + * which is the actual data type used by the ioctl, to ensure the + * ioctl interface is versioned correctly. 
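+ * + * Whatever the nominal ioctl "size", callers are expected to set ioc_len from obd_ioctl_packlen() above and pack the inline buffers behind the header before issuing the ioctl.
+ * A minimal, hypothetical sketch ("name" stands in for whatever inline argument a given ioctl takes): + * + *	struct obd_ioctl_data data = { 0 }; + * + *	data.ioc_version = OBD_IOCTL_VERSION; + *	data.ioc_inllen1 = strlen(name) + 1; + *	data.ioc_inlbuf1 = name; + *	data.ioc_len = obd_ioctl_packlen(&data);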
+ */ +#define OBD_IOC_DATA_TYPE long + +#define OBD_IOC_CREATE _IOWR('f', 101, OBD_IOC_DATA_TYPE) +#define OBD_IOC_DESTROY _IOW('f', 104, OBD_IOC_DATA_TYPE) +/* OBD_IOC_PREALLOCATE _IOWR('f', 105, OBD_IOC_DATA_TYPE) */ + +#define OBD_IOC_SETATTR _IOW('f', 107, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETATTR _IOWR('f', 108, OBD_IOC_DATA_TYPE) +#define OBD_IOC_READ _IOWR('f', 109, OBD_IOC_DATA_TYPE) +#define OBD_IOC_WRITE _IOWR('f', 110, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_STATFS _IOWR('f', 113, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SYNC _IOW('f', 114, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_BRW_READ _IOWR('f', 125, OBD_IOC_DATA_TYPE) +#define OBD_IOC_BRW_WRITE _IOWR('f', 126, OBD_IOC_DATA_TYPE) +#define OBD_IOC_NAME2DEV _IOWR('f', 127, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETDTNAME _IOR('f', 127, char[MAX_OBD_NAME]) +/* ioctl codes 128-143 are reserved for fsverity */ +#define OBD_IOC_UUID2DEV _IOWR('f', 130, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETNAME_OLD _IOWR('f', 131, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETMDNAME _IOR('f', 131, char[MAX_OBD_NAME]) +/* OBD_IOC_LOV_GET_CONFIG _IOWR('f', 132, OBD_IOC_DATA_TYPE) until 2.14 */ +#define OBD_IOC_CLIENT_RECOVER _IOW('f', 133, OBD_IOC_DATA_TYPE) +/* ioctl codes 128-143 are reserved for fsverity */ +/* FS_IOC_ENABLE_VERITY _IOW('f', 133, struct fsverity_enable_arg) */ +/* FS_IOC_MEASURE_VERITY _IOW('f', 134, struct fsverity_digest) */ +/* was OBD_IOC_NO_TRANSNO _IOW('f', 140, OBD_IOC_DATA_TYPE) until 2.14 */ +#define OBD_IOC_SET_READONLY _IOW('f', 141, OBD_IOC_DATA_TYPE) +#define OBD_IOC_ABORT_RECOVERY _IOR('f', 142, OBD_IOC_DATA_TYPE) +enum obd_abort_recovery_flags { + OBD_FLG_ABORT_RECOV_OST = 0x00008, /* LMD_FLG_ABORT_RECOV */ + OBD_FLG_ABORT_RECOV_MDT = 0x40000, /* LMD_FLG_ABORT_RECOV_MDT */ +}; +/* ioctl codes 128-143 are reserved for fsverity */ +#define OBD_GET_VERSION _IOWR('f', 144, OBD_IOC_DATA_TYPE) +/* OBD_IOC_GSS_SUPPORT _IOWR('f', 145, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_CLOSE_UUID _IOWR('f', 147, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_CHANGELOG_SEND _IOW('f', 148, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_GETDEVICE _IOWR('f', 149, OBD_IOC_DATA_TYPE) +#define OBD_IOC_FID2PATH _IOWR('f', 150, OBD_IOC_DATA_TYPE) +/* lustre/lustre_user.h 151-153 */ +/* OBD_IOC_LOV_SETSTRIPE 154 LL_IOC_LOV_SETSTRIPE */ +/* OBD_IOC_LOV_GETSTRIPE 155 LL_IOC_LOV_GETSTRIPE */ +/* OBD_IOC_LOV_SETEA 156 LL_IOC_LOV_SETEA */ +/* lustre/lustre_user.h 157-159 */ +/* OBD_IOC_QUOTACHECK _IOW('f', 160, int) */ +/* OBD_IOC_POLL_QUOTACHECK _IOR('f', 161, struct if_quotacheck *) */ +#define OBD_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) +/* lustre/lustre_user.h 163-176 */ +#define OBD_IOC_CHANGELOG_REG _IOW('f', 177, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_DEREG _IOW('f', 178, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_CLEAR _IOW('f', 179, struct obd_ioctl_data) +/* OBD_IOC_RECORD _IOWR('f', 180, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_ENDRECORD _IOWR('f', 181, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_PARSE _IOWR('f', 182, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_DORECORD _IOWR('f', 183, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_PROCESS_CFG _IOWR('f', 184, OBD_IOC_DATA_TYPE) +/* OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_PARAM _IOW('f', 187, OBD_IOC_DATA_TYPE) +#define OBD_IOC_POOL _IOWR('f', 188, OBD_IOC_DATA_TYPE) +#define OBD_IOC_REPLACE_NIDS _IOWR('f', 189, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_INFO _IOWR('f', 191, 
OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_PRINT _IOWR('f', 192, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CANCEL _IOWR('f', 193, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_REMOVE _IOWR('f', 194, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CHECK _IOWR('f', 195, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LLOG_CATINFO _IOWR('f', 196, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_NODEMAP _IOWR('f', 197, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLEAR_CONFIGS _IOWR('f', 198, OBD_IOC_DATA_TYPE) + +/* ECHO_IOC_GET_STRIPE _IOWR('f', 200, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_SET_STRIPE _IOWR('f', 201, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_ENQUEUE _IOWR('f', 202, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_CANCEL _IOWR('f', 203, OBD_IOC_DATA_TYPE) */ + +#define OBD_IOC_LCFG_FORK _IOWR('f', 208, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LCFG_ERASE _IOWR('f', 209, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE) + +/* lustre/lustre_user.h 211-220 */ +/* was #define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) until 2.11 */ +#define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data) +#define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data) +#define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) +#define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) +#define OBD_IOC_QUERY_LFSCK _IOR('f', 232, struct obd_ioctl_data) +#define OBD_IOC_CHLG_POLL _IOR('f', 233, long) +/* lustre/lustre_user.h 240-249 */ +/* was LIBCFS_IOC_DEBUG_MASK _IOWR('f', 250, long) until 2.11 */ + +#define OBD_IOC_BARRIER _IOWR('f', 261, OBD_IOC_DATA_TYPE) + +#define IOC_OSC_SET_ACTIVE _IOWR('h', 21, void *) + +#endif /* _UAPI_LUSTRE_IOCTL_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h new file mode 100644 index 0000000000000..26819ff7995cf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h @@ -0,0 +1,98 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. + * + * LGPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * The definitions below are used in the kernel and userspace. + */ + +#ifndef __UAPI_KERNELCOMM_H__ +#define __UAPI_KERNELCOMM_H__ + +#include + +/* KUC message header. + * All current and future KUC messages should use this header. + * To avoid having to include Lustre headers from libcfs, define this here. 
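+ * + * Illustrative sketch only (a hypothetical consumer, not part of this header): a reader pulling a message off the pipe is expected to sanity-check the header before dispatching on kuc_transport/kuc_msgtype:
+ * + *	struct kuc_hdr *kuc = (struct kuc_hdr *)buf; + * + *	if (kuc->kuc_magic != KUC_MAGIC || kuc->kuc_msglen < sizeof(*kuc)) + *		return -EPROTO;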
+ */ +struct kuc_hdr { + __u16 kuc_magic; + __u8 kuc_transport; /* Each new Lustre feature should use a different + transport */ + __u8 kuc_flags; + __u16 kuc_msgtype; /* Message type or opcode, transport-specific */ + __u16 kuc_msglen; /* Including header */ +} __attribute__((aligned(sizeof(__u64)))); + + +#define KUC_MAGIC 0x191C /*Lustre9etLinC */ + +/* kuc_msgtype values are defined in each transport */ +enum kuc_transport_type { + KUC_TRANSPORT_GENERIC = 1, + KUC_TRANSPORT_HSM = 2, +}; + +enum kuc_generic_message_type { + KUC_MSG_SHUTDOWN = 1, +}; + +/* KUC Broadcast Groups. This determines which userspace process hears which + * messages. Mutliple transports may be used within a group, or multiple + * groups may use the same transport. Broadcast + * groups need not be used if e.g. a UID is specified instead; + * use group 0 to signify unicast. + */ +#define KUC_GRP_HSM 0x02 +#define KUC_GRP_MAX KUC_GRP_HSM + +enum lk_flags { + LK_FLG_STOP = 0x0001, + LK_FLG_DATANR = 0x0002, +}; +#define LK_NOFD -1U + +/* kernelcomm control structure, passed from userspace to kernel. + * For compatibility with old copytools, users who pass ARCHIVE_IDs + * to kernel using lk_data_count and lk_data should fill lk_flags with + * LK_FLG_DATANR. Otherwise kernel will take lk_data_count as bitmap of + * ARCHIVE IDs. + */ +struct lustre_kernelcomm { + __u32 lk_wfd; + __u32 lk_rfd; + __u32 lk_uid; + __u32 lk_group; + __u32 lk_data_count; + __u32 lk_flags; + __u32 lk_data[0]; +} __attribute__((packed)); + +#endif /* __UAPI_KERNELCOMM_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h new file mode 100644 index 0000000000000..68c8d3a1009c4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h @@ -0,0 +1,238 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/include/lustre/lustre_lfsck_user.h + * + * Lustre LFSCK userspace interfaces. 
+ * + * Author: Fan, Yong + */ + +#ifndef _LUSTRE_LFSCK_USER_H +# define _LUSTRE_LFSCK_USER_H + +#include +#include + +/** + * state machine: + * + * LS_INIT + * | + * (lfsck|start) + * | + * v + * LS_SCANNING_PHASE1 + * | ^ + * | : + * | (lfsck:restart) + * | : + * v : + * ----------------------------------------------------------------- + * | |^ |^ |^ |^ |^ + * | |: |: |: |: |: + * v v: v: v: v: v: + * LS_SCANNING_PHASE2 LS_FAILED LS_STOPPED LS_PAUSED LS_CRASHED LS_PARTIAL + * (CO_) (CO_) (CO_) + * | ^ ^: ^: ^: ^: ^: + * | : |: |: |: |: |: + * | (lfsck:restart) |: |: |: |: |: + * v : |v |v |v |v |v + * ----------------------------------------------------------------- + * | + * v + * LS_COMPLETED + */ +enum lfsck_status { + /* The lfsck file is new created, for new MDT, upgrading from old disk, + * or re-creating the lfsck file manually. */ + LS_INIT = 0, + + /* The first-step system scanning. The checked items during the phase1 + * scanning depends on the LFSCK type. */ + LS_SCANNING_PHASE1 = 1, + + /* The second-step system scanning. The checked items during the phase2 + * scanning depends on the LFSCK type. */ + LS_SCANNING_PHASE2 = 2, + + /* The LFSCK processing has completed for all objects. */ + LS_COMPLETED = 3, + + /* The LFSCK exited automatically for failure, will not auto restart. */ + LS_FAILED = 4, + + /* The LFSCK is stopped manually, will not auto restart. */ + LS_STOPPED = 5, + + /* LFSCK is paused automatically when umount, + * will be restarted automatically when remount. */ + LS_PAUSED = 6, + + /* System crashed during the LFSCK, + * will be restarted automatically after recovery. */ + LS_CRASHED = 7, + + /* Some OST/MDT failed during the LFSCK, or not join the LFSCK. */ + LS_PARTIAL = 8, + + /* The LFSCK is failed because its controller is failed. */ + LS_CO_FAILED = 9, + + /* The LFSCK is stopped because its controller is stopped. */ + LS_CO_STOPPED = 10, + + /* The LFSCK is paused because its controller is paused. */ + LS_CO_PAUSED = 11, + + LS_MAX +}; + +static inline const char *lfsck_status2name(int status) +{ + static const char * const lfsck_status_names[] = { + [LS_INIT] = "init", + [LS_SCANNING_PHASE1] = "scanning-phase1", + [LS_SCANNING_PHASE2] = "scanning-phase2", + [LS_COMPLETED] = "completed", + [LS_FAILED] = "failed", + [LS_STOPPED] = "stopped", + [LS_PAUSED] = "paused", + [LS_CRASHED] = "crashed", + [LS_PARTIAL] = "partial", + [LS_CO_FAILED] = "co-failed", + [LS_CO_STOPPED] = "co-stopped", + [LS_CO_PAUSED] = "co-paused" + }; + + if (status < 0 || status >= LS_MAX) + return "unknown"; + + return lfsck_status_names[status]; +} + +enum lfsck_param_flags { + /* Reset LFSCK iterator position to the device beginning. */ + LPF_RESET = 0x0001, + + /* Exit when fail. */ + LPF_FAILOUT = 0x0002, + + /* Dryrun mode, only check without modification */ + LPF_DRYRUN = 0x0004, + + /* LFSCK runs on all targets. */ + LPF_ALL_TGT = 0x0008, + + /* Broadcast the command to other MDTs. Only valid on the sponsor MDT */ + LPF_BROADCAST = 0x0010, + + /* Handle orphan OST-objects. */ + LPF_OST_ORPHAN = 0x0020, + + /* Create OST-object for dangling LOV EA. */ + LPF_CREATE_OSTOBJ = 0x0040, + + /* Create MDT-object for dangling name entry. */ + LPF_CREATE_MDTOBJ = 0x0080, + + /* Do not return until the LFSCK not running. */ + LPF_WAIT = 0x0100, + + /* Delay to create OST-object for dangling LOV EA. */ + LPF_DELAY_CREATE_OSTOBJ = 0x0200, +}; + +enum lfsck_type { + /* For MDT and OST internal OSD consistency check/repair. 
*/ + LFSCK_TYPE_SCRUB = 0x0000, + + /* For MDT-OST (layout, object) consistency check/repair. */ + LFSCK_TYPE_LAYOUT = 0x0001, + + /* For MDT (FID-in-dirent, linkEA) consistency check/repair. */ + LFSCK_TYPE_NAMESPACE = 0x0004, + LFSCK_TYPES_SUPPORTED = (LFSCK_TYPE_SCRUB | LFSCK_TYPE_LAYOUT | + LFSCK_TYPE_NAMESPACE), + LFSCK_TYPES_DEF = LFSCK_TYPES_SUPPORTED, + LFSCK_TYPES_ALL = ((__u16)(~0)) +}; + +#define LFSCK_VERSION_V1 1 +#define LFSCK_VERSION_V2 2 + +#define LFSCK_SPEED_NO_LIMIT 0 +#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT +#define LFSCK_ASYNC_WIN_DEFAULT 1024 +#define LFSCK_ASYNC_WIN_MAX ((__u16)(~0)) +#define LFSCK_TYPE_BITS 16 + +enum lfsck_start_valid { + LSV_SPEED_LIMIT = 0x00000001, + LSV_ERROR_HANDLE = 0x00000002, + LSV_DRYRUN = 0x00000004, + LSV_ASYNC_WINDOWS = 0x00000008, + LSV_CREATE_OSTOBJ = 0x00000010, + LSV_CREATE_MDTOBJ = 0x00000020, + LSV_DELAY_CREATE_OSTOBJ = 0x00000040, +}; + +/* Arguments for starting lfsck. */ +struct lfsck_start { + /* Which arguments are valid, see 'enum lfsck_start_valid'. */ + __u32 ls_valid; + + /* How many items can be scanned at most per second. */ + __u32 ls_speed_limit; + + /* For compatibility between user space tools and kernel service. */ + __u16 ls_version; + + /* Which LFSCK components to be (have been) started. */ + __u16 ls_active; + + /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */ + __u16 ls_flags; + + /* The windows size for async requests pipeline. */ + __u16 ls_async_windows; +}; + +struct lfsck_stop { + __u32 ls_status; + __u16 ls_flags; + __u16 ls_padding_1; /* For 64-bits aligned. */ + __u64 ls_padding_2; +}; + +struct lfsck_query { + __u16 lu_types; + __u16 lu_flags; + __u32 lu_mdts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; + __u32 lu_osts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; + __u64 lu_repaired[LFSCK_TYPE_BITS]; +}; + +#endif /* _LUSTRE_LFSCK_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h new file mode 100644 index 0000000000000..bcf46eb21e6c2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h @@ -0,0 +1,80 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_log_user.h + * + * Userspace-usable portion of Generic infrastructure for managing + * a collection of logs. + * See lustre_log.h for more details. 
+ */ + +#ifndef _LUSTRE_LOG_USER_H +#define _LUSTRE_LOG_USER_H + +#include +#include + +/* Lustre logs use FIDs constructed from oi_id and oi_seq directly, + * without attempting to use the IGIF and IDIF ranges as is done + * elsewhere, because of compatibility concerns (see lu-2888). + */ + +static inline void logid_to_fid(struct llog_logid *id, struct lu_fid *fid) +{ + /* For compatibility purposes we identify pre-OSD (~< 2.3.51 MDS) + * logid's by non-zero ogen (inode generation) and convert them + * into IGIF */ + if (id->lgl_ogen == 0) { + fid->f_seq = id->lgl_oi.oi.oi_seq; + fid->f_oid = id->lgl_oi.oi.oi_id; + fid->f_ver = 0; + } else { + lu_igif_build(fid, id->lgl_oi.oi.oi_id, id->lgl_ogen); + } +} + +static inline void fid_to_logid(struct lu_fid *fid, struct llog_logid *id) +{ + id->lgl_oi.oi.oi_seq = fid->f_seq; + id->lgl_oi.oi.oi_id = fid->f_oid; + id->lgl_ogen = 0; +} + +static inline void logid_set_id(struct llog_logid *log_id, __u64 id) +{ + log_id->lgl_oi.oi.oi_id = id; +} + +static inline __u64 logid_id(struct llog_logid *log_id) +{ + return log_id->lgl_oi.oi.oi_id; +} + +#endif /* ifndef _LUSTRE_LOG_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h new file mode 100644 index 0000000000000..90fa213f83e90 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h @@ -0,0 +1,237 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. 
+ * + * Define ost_id associated functions + */ + +#ifndef _UAPI_LUSTRE_OSTID_H_ +#define _UAPI_LUSTRE_OSTID_H_ + +#include +#include +#include + +static inline __u64 lmm_oi_id(const struct ost_id *oi) +{ + return oi->oi.oi_id; +} + +static inline __u64 lmm_oi_seq(const struct ost_id *oi) +{ + return oi->oi.oi_seq; +} + +static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq) +{ + oi->oi.oi_seq = seq; +} + +static inline void lmm_oi_set_id(struct ost_id *oi, __u64 oid) +{ + oi->oi.oi_id = oid; +} + +static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi, + const struct ost_id *src_oi) +{ + dst_oi->oi.oi_id = __le64_to_cpu(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = __le64_to_cpu(src_oi->oi.oi_seq); +} + +static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi, + const struct ost_id *src_oi) +{ + dst_oi->oi.oi_id = __cpu_to_le64(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = __cpu_to_le64(src_oi->oi.oi_seq); +} + +/* extract OST sequence (group) from a wire ost_id (id/seq) pair */ +static inline __u64 ostid_seq(const struct ost_id *ostid) +{ + if (fid_seq_is_mdt0(ostid->oi.oi_seq)) + return FID_SEQ_OST_MDT0; + + if (fid_seq_is_default(ostid->oi.oi_seq)) + return FID_SEQ_LOV_DEFAULT; + + if (fid_is_idif(&ostid->oi_fid)) + return FID_SEQ_OST_MDT0; + + return fid_seq(&ostid->oi_fid); +} + +/* extract OST objid from a wire ost_id (id/seq) pair */ +static inline __u64 ostid_id(const struct ost_id *ostid) +{ + if (fid_seq_is_mdt0(ostid->oi.oi_seq)) + return ostid->oi.oi_id & IDIF_OID_MASK; + + if (fid_seq_is_default(ostid->oi.oi_seq)) + return ostid->oi.oi_id; + + if (fid_is_idif(&ostid->oi_fid)) + return fid_idif_id(fid_seq(&ostid->oi_fid), + fid_oid(&ostid->oi_fid), 0); + + return fid_oid(&ostid->oi_fid); +} + +static inline void ostid_set_seq(struct ost_id *oi, __u64 seq) +{ + if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) { + oi->oi.oi_seq = seq; + } else { + oi->oi_fid.f_seq = seq; + /* + * Note: if f_oid + f_ver is zero, we need init it + * to be 1, otherwise, ostid_seq will treat this + * as old ostid (oi_seq == 0) + */ + if (!oi->oi_fid.f_oid && !oi->oi_fid.f_ver) + oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID; + } +} + +static inline void ostid_set_seq_mdt0(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_OST_MDT0); +} + +static inline void ostid_set_seq_echo(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_ECHO); +} + +static inline void ostid_set_seq_llog(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_LLOG); +} + +static inline void ostid_cpu_to_le(const struct ost_id *src_oi, + struct ost_id *dst_oi) +{ + if (fid_seq_is_mdt0(src_oi->oi.oi_seq)) { + dst_oi->oi.oi_id = __cpu_to_le64(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = __cpu_to_le64(src_oi->oi.oi_seq); + } else { + fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid); + } +} + +static inline void ostid_le_to_cpu(const struct ost_id *src_oi, + struct ost_id *dst_oi) +{ + if (fid_seq_is_mdt0(src_oi->oi.oi_seq)) { + dst_oi->oi.oi_id = __le64_to_cpu(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = __le64_to_cpu(src_oi->oi.oi_seq); + } else { + fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid); + } +} + +/** + * Sigh, because pre-2.4 uses + * struct lov_mds_md_v1 { + * ........ + * __u64 lmm_object_id; + * __u64 lmm_object_seq; + * ...... + * } + * to identify the LOV(MDT) object, and lmm_object_seq will + * be normal_fid, which make it hard to combine these conversion + * to ostid_to FID. 
so we will do lmm_oi/fid conversion separately + * + * We can tell the lmm_oi by this way, + * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0 + * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL + * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k}, + * lmm_oi.f_ver = 0 + * + * But currently lmm_oi/lsm_oi does not have any "real" usages, + * except for printing some information, and the user can always + * get the real FID from LMA, besides this multiple case check might + * make swab more complicate. So we will keep using id/seq for lmm_oi. + */ + +static inline void fid_to_lmm_oi(const struct lu_fid *fid, + struct ost_id *oi) +{ + oi->oi.oi_id = fid_oid(fid); + oi->oi.oi_seq = fid_seq(fid); +} + +/** + * Unpack an OST object id/seq (group) into a FID. This is needed for + * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper + * FIDs. Note that if an id/seq is already in FID/IDIF format it will + * be passed through unchanged. Only legacy OST objects in "group 0" + * will be mapped into the IDIF namespace so that they can fit into the + * struct lu_fid fields without loss. + */ +static inline int ostid_to_fid(struct lu_fid *fid, const struct ost_id *ostid, + __u32 ost_idx) +{ + __u64 seq = ostid_seq(ostid); + + if (ost_idx > 0xffff) + return -EBADF; + + if (fid_seq_is_mdt0(seq)) { + __u64 oid = ostid_id(ostid); + + /* This is a "legacy" (old 1.x/2.early) OST object in "group 0" + * that we map into the IDIF namespace. It allows up to 2^48 + * objects per OST, as this is the object namespace that has + * been in production for years. This can handle create rates + * of 1M objects/s/OST for 9 years, or combinations thereof. + */ + if (oid >= IDIF_MAX_OID) + return -EBADF; + + fid->f_seq = fid_idif_seq(oid, ost_idx); + /* truncate to 32 bits by assignment */ + fid->f_oid = oid; + /* in theory, not currently used */ + fid->f_ver = oid >> 48; + } else if (!fid_seq_is_default(seq)) { + /* This is either an IDIF object, which identifies objects + * across all OSTs, or a regular FID. The IDIF namespace + * maps legacy OST objects into the FID namespace. In both + * cases, we just pass the FID through, no conversion needed. + */ + if (ostid->oi_fid.f_ver) + return -EBADF; + + *fid = ostid->oi_fid; + } + + return 0; +} +#endif /* _UAPI_LUSTRE_OSTID_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h new file mode 100644 index 0000000000000..8b9177046d999 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * User-settable parameter keys + * + * Author: Nathan Rutman + */ + +#ifndef _UAPI_LUSTRE_PARAM_H +#define _UAPI_LUSTRE_PARAM_H + +/** \defgroup param param + * + * @{ + */ + +/****************** User-settable parameter keys *********************/ +/* e.g. + * tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda + * lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0 + * ... testfs-MDT0000.lov.stripesize=4M + * ... testfs-OST0000.ost.client_cache_seconds=15 + * ... testfs.sys.timeout= + * ... testfs.llite.max_read_ahead_mb=16 + */ + +/* System global or special params not handled in obd's proc + * See mgs_write_log_sys() + */ +#define PARAM_TIMEOUT "timeout=" /* global */ +#define PARAM_LDLM_TIMEOUT "ldlm_timeout=" /* global */ +#define PARAM_AT_MIN "at_min=" /* global */ +#define PARAM_AT_MAX "at_max=" /* global */ +#define PARAM_AT_EXTRA "at_extra=" /* global */ +#define PARAM_AT_EARLY_MARGIN "at_early_margin=" /* global */ +#define PARAM_AT_HISTORY "at_history=" /* global */ +#define PARAM_JOBID_VAR "jobid_var=" /* global */ +#define PARAM_MGSNODE "mgsnode=" /* only at mounttime */ +#define PARAM_FAILNODE "failover.node=" /* add failover nid */ +#define PARAM_FAILMODE "failover.mode=" /* initial mount only */ +#define PARAM_ACTIVE "active=" /* activate/deactivate */ +#define PARAM_NETWORK "network=" /* bind on nid */ +#define PARAM_ID_UPCALL "identity_upcall=" /* identity upcall */ +#define PARAM_ROOTSQUASH "root_squash=" /* root squash */ +#define PARAM_NOSQUASHNIDS "nosquash_nids=" /* no squash nids */ + +/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */ +#define PARAM_OST "ost." +#define PARAM_OSD "osd." +#define PARAM_OSC "osc." +#define PARAM_MDT "mdt." +#define PARAM_HSM "mdt.hsm." +#define PARAM_MDD "mdd." +#define PARAM_MDC "mdc." +#define PARAM_LLITE "llite." +#define PARAM_LOV "lov." +#define PARAM_LOD "lod." +#define PARAM_OSP "osp." +#define PARAM_SYS "sys." /* global */ +#define PARAM_SRPC "srpc." +#define PARAM_SRPC_FLVR "srpc.flavor." +#define PARAM_SRPC_UDESC "srpc.udesc.cli2mdt" +#define PARAM_SEC "security." +#define PARAM_QUOTA "quota." /* global */ + +/** @} param */ + +#endif /* _UAPI_LUSTRE_PARAM_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h new file mode 100644 index 0000000000000..de4fe08aa2eac --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h @@ -0,0 +1,2795 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre/lustre_user.h + * + * Lustre public user-space interface definitions. + */ + +#ifndef _LUSTRE_USER_H +#define _LUSTRE_USER_H + +/** \defgroup lustreuser lustreuser + * + * @{ + */ +#ifndef __KERNEL__ +# define __USE_ISOC99 1 +# include +# include /* snprintf() */ +# include + +# define __USE_GNU 1 +# define __USE_XOPEN2K8 1 +# define FILEID_LUSTRE 0x97 /* for name_to_handle_at() (and llapi_fd2fid()) */ +#endif /* !__KERNEL__ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifdef __STRICT_ANSI__ +#define typeof __typeof__ +#endif + +/* + * This is a temporary solution of adding quota type. + * Should be removed as soon as system header is updated. + */ +#undef LL_MAXQUOTAS +#define LL_MAXQUOTAS 3 +#undef INITQFNAMES +#define INITQFNAMES { \ + "user", /* USRQUOTA */ \ + "group", /* GRPQUOTA */ \ + "project", /* PRJQUOTA */ \ + "undefined", \ +}; +#ifndef USRQUOTA +#define USRQUOTA 0 +#endif +#ifndef GRPQUOTA +#define GRPQUOTA 1 +#endif +#ifndef PRJQUOTA +#define PRJQUOTA 2 +#endif + +/* + * We need to always use 64bit version because the structure + * is shared across entire cluster where 32bit and 64bit machines + * are co-existing. + */ +#if __BITS_PER_LONG != 64 || defined(__ARCH_WANT_STAT64) +typedef struct stat64 lstat_t; +#define lstat_f lstat64 +#define fstat_f fstat64 +#define fstatat_f fstatat64 +#else +typedef struct stat lstat_t; +#define lstat_f lstat +#define fstat_f fstat +#define fstatat_f fstatat +#endif + +#ifndef STATX_BASIC_STATS +/* + * Timestamp structure for the timestamps in struct statx. + * + * tv_sec holds the number of seconds before (negative) or after (positive) + * 00:00:00 1st January 1970 UTC. + * + * tv_nsec holds a number of nanoseconds (0..999,999,999) after the tv_sec time. + * + * __reserved is held in case we need a yet finer resolution. + */ +struct statx_timestamp { + __s64 tv_sec; + __u32 tv_nsec; + __s32 __reserved; +}; + +/* + * Structures for the extended file attribute retrieval system call + * (statx()). + * + * The caller passes a mask of what they're specifically interested in as a + * parameter to statx(). What statx() actually got will be indicated in + * st_mask upon return. + * + * For each bit in the mask argument: + * + * - if the datum is not supported: + * + * - the bit will be cleared, and + * + * - the datum will be set to an appropriate fabricated value if one is + * available (eg. 
CIFS can take a default uid and gid), otherwise + * + * - the field will be cleared; + * + * - otherwise, if explicitly requested: + * + * - the datum will be synchronised to the server if AT_STATX_FORCE_SYNC is + * set or if the datum is considered out of date, and + * + * - the field will be filled in and the bit will be set; + * + * - otherwise, if not requested, but available in approximate form without any + * effort, it will be filled in anyway, and the bit will be set upon return + * (it might not be up to date, however, and no attempt will be made to + * synchronise the internal state first); + * + * - otherwise the field and the bit will be cleared before returning. + * + * Items in STATX_BASIC_STATS may be marked unavailable on return, but they + * will have values installed for compatibility purposes so that stat() and + * co. can be emulated in userspace. + */ +struct statx { + /* 0x00 */ + __u32 stx_mask; /* What results were written [uncond] */ + __u32 stx_blksize; /* Preferred general I/O size [uncond] */ + __u64 stx_attributes; /* Flags conveying information about the file [uncond] */ + /* 0x10 */ + __u32 stx_nlink; /* Number of hard links */ + __u32 stx_uid; /* User ID of owner */ + __u32 stx_gid; /* Group ID of owner */ + __u16 stx_mode; /* File mode */ + __u16 __spare0[1]; + /* 0x20 */ + __u64 stx_ino; /* Inode number */ + __u64 stx_size; /* File size */ + __u64 stx_blocks; /* Number of 512-byte blocks allocated */ + __u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */ + /* 0x40 */ + struct statx_timestamp stx_atime; /* Last access time */ + struct statx_timestamp stx_btime; /* File creation time */ + struct statx_timestamp stx_ctime; /* Last attribute change time */ + struct statx_timestamp stx_mtime; /* Last data modification time */ + /* 0x80 */ + __u32 stx_rdev_major; /* Device ID of special file [if bdev/cdev] */ + __u32 stx_rdev_minor; + __u32 stx_dev_major; /* ID of device containing file [uncond] */ + __u32 stx_dev_minor; + /* 0x90 */ + __u64 __spare2[14]; /* Spare space for future expansion */ + /* 0x100 */ +}; + +/* + * Flags to be stx_mask + * + * Query request/result mask for statx() and struct statx::stx_mask. + * + * These bits should be set in the mask argument of statx() to request + * particular items when calling statx(). + */ +#define STATX_TYPE 0x00000001U /* Want/got stx_mode & S_IFMT */ +#define STATX_MODE 0x00000002U /* Want/got stx_mode & ~S_IFMT */ +#define STATX_NLINK 0x00000004U /* Want/got stx_nlink */ +#define STATX_UID 0x00000008U /* Want/got stx_uid */ +#define STATX_GID 0x00000010U /* Want/got stx_gid */ +#define STATX_ATIME 0x00000020U /* Want/got stx_atime */ +#define STATX_MTIME 0x00000040U /* Want/got stx_mtime */ +#define STATX_CTIME 0x00000080U /* Want/got stx_ctime */ +#define STATX_INO 0x00000100U /* Want/got stx_ino */ +#define STATX_SIZE 0x00000200U /* Want/got stx_size */ +#define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ +#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ +#define STATX_BTIME 0x00000800U /* Want/got stx_btime */ +#define STATX_ALL 0x00000fffU /* All currently supported flags */ +#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ + +/* + * Attributes to be found in stx_attributes and masked in stx_attributes_mask. + * + * These give information about the features or the state of a file that might + * be of use to ordinary userspace programs such as GUIs or ls rather than + * specialised tools. 
+ * + * Note that the flags marked [I] correspond to generic FS_IOC_FLAGS + * semantically. Where possible, the numerical value is picked to correspond + * also. + */ +#define STATX_ATTR_COMPRESSED 0x00000004 /* [I] File is compressed by the fs */ +#define STATX_ATTR_IMMUTABLE 0x00000010 /* [I] File is marked immutable */ +#define STATX_ATTR_APPEND 0x00000020 /* [I] File is append-only */ +#define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */ +#define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ + +#define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ + +#define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation required from statx() */ +#define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */ +#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */ +#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */ + +#endif /* STATX_BASIC_STATS */ + +typedef struct statx lstatx_t; + +#define LUSTRE_EOF 0xffffffffffffffffULL + +/* for statfs() */ +#define LL_SUPER_MAGIC 0x0BD00BD0 + +#define FSFILT_IOC_GETVERSION _IOR('f', 3, long) + +/* FIEMAP flags supported by Lustre */ +#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER) + +enum obd_statfs_state { + OS_STATFS_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ + OS_STATFS_READONLY = 0x00000002, /**< filesystem is read-only */ + OS_STATFS_NOPRECREATE = 0x00000004, /**< no object precreation */ + OS_STATFS_UNUSED1 = 0x00000008, /**< obsolete 1.6, was EROFS=30 */ + OS_STATFS_UNUSED2 = 0x00000010, /**< obsolete 1.6, was EROFS=30 */ + OS_STATFS_ENOSPC = 0x00000020, /**< not enough free space */ + OS_STATFS_ENOINO = 0x00000040, /**< not enough inodes */ + OS_STATFS_SUM = 0x00000100, /**< aggregated for all tagrets */ + OS_STATFS_NONROT = 0x00000200, /**< non-rotational device */ +}; + +/** filesystem statistics/attributes for target device */ +struct obd_statfs { + __u64 os_type; /* EXT4_SUPER_MAGIC, UBERBLOCK_MAGIC */ + __u64 os_blocks; /* total size in #os_bsize blocks */ + __u64 os_bfree; /* number of unused blocks */ + __u64 os_bavail; /* blocks available for allocation */ + __u64 os_files; /* total number of objects */ + __u64 os_ffree; /* # objects that could be created */ + __u8 os_fsid[40]; /* identifier for filesystem */ + __u32 os_bsize; /* block size in bytes for os_blocks */ + __u32 os_namelen; /* maximum length of filename in bytes*/ + __u64 os_maxbytes; /* maximum object size in bytes */ + __u32 os_state; /**< obd_statfs_state OS_STATFS_* */ + __u32 os_fprecreated; /* objs available now to the caller */ + /* used in QoS code to find preferred + * OSTs */ + __u32 os_granted; /* space granted for MDS */ + __u32 os_spare3; /* Unused padding fields. Remember */ + __u32 os_spare4; /* to fix lustre_swab_obd_statfs() */ + __u32 os_spare5; + __u32 os_spare6; + __u32 os_spare7; + __u32 os_spare8; + __u32 os_spare9; +}; + +/** additional filesystem attributes for target device */ +struct obd_statfs_info { + __u32 os_reserved_mb_low; /* reserved mb low */ + __u32 os_reserved_mb_high; /* reserved mb high */ + bool os_enable_pre; /* enable pre create logic */ +}; + +/** + * File IDentifier. + * + * FID is a cluster-wide unique identifier of a file or an object (stripe). + * FIDs are never reused. + **/ +struct lu_fid { + /** + * FID sequence. Sequence is a unit of migration: all files (objects) + * with FIDs from a given sequence are stored on the same server. 
+ * Lustre should support 2^64 objects, so even if each sequence + * has only a single object we can still enumerate 2^64 objects. + **/ + __u64 f_seq; + /* FID number within sequence. */ + __u32 f_oid; + /** + * FID version, used to distinguish different versions (in the sense + * of snapshots, etc.) of the same file system object. Not currently + * used. + **/ + __u32 f_ver; +} __attribute__((packed)); + +static inline bool fid_is_zero(const struct lu_fid *fid) +{ + return fid->f_seq == 0 && fid->f_oid == 0; +} + +/* The data name_to_handle_at() places in a struct file_handle (at f_handle) */ +struct lustre_file_handle { + struct lu_fid lfh_child; + struct lu_fid lfh_parent; +}; + +/* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. */ +#define f_stripe_idx f_ver + +struct ost_layout { + __u32 ol_stripe_size; + __u32 ol_stripe_count; + __u64 ol_comp_start; + __u64 ol_comp_end; + __u32 ol_comp_id; +} __attribute__((packed)); + +/* The filter_fid structure has changed several times over its lifetime. + * For a long time "trusted.fid" held the MDT inode parent FID/IGIF and + * stripe_index and the "self FID" (objid/seq) to be able to recover the + * OST objects in case of corruption. With the move to 2.4 and OSD-API for + * the OST, the "trusted.lma" xattr was added to the OST objects to store + * the "self FID" to be consistent with the MDT on-disk format, and the + * filter_fid only stored the MDT inode parent FID and stripe index. + * + * In 2.10, the addition of PFL composite layouts required more information + * to be stored into the filter_fid in order to be able to identify which + * component the OST object belonged. As well, the stripe size may vary + * between components, so it was no longer safe to assume the stripe size + * or stripe_count of a file. This is also more robust for plain layouts. + * + * For ldiskfs OSTs that were formatted with 256-byte inodes, there is not + * enough space to store both the filter_fid and LMA in the inode, so they + * are packed into struct lustre_ost_attrs on disk in trusted.lma to avoid + * an extra seek for every OST object access. + * + * In 2.11, FLR mirror layouts also need to store the layout version and + * range so that writes to old versions of the layout are not allowed. + * That ensures that mirrored objects are not modified by evicted clients, + * and ensures that the components are correctly marked stale on the MDT. + */ +struct filter_fid_18_23 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + __u64 ff_objid; + __u64 ff_seq; +}; + +struct filter_fid_24_29 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ +}; + +struct filter_fid_210 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + struct ost_layout ff_layout; +}; + +struct filter_fid { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + struct ost_layout ff_layout; + __u32 ff_layout_version; + __u32 ff_range; /* range of layout version that + * write are allowed */ +} __attribute__((packed)); + +/* Userspace should treat lu_fid as opaque, and only use the following methods + * to print or parse them. Other functions (e.g. compare, swab) could be moved + * here from lustre_idl.h if needed. 
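Editor's illustrative sketch (not part of the patch): constructing a struct lu_fid by hand and using the fid_is_zero() helper above. The FID value and the <linux/lustre/lustre_user.h> include path are assumptions for the example.

#include <stdio.h>
#include <linux/lustre/lustre_user.h>

static void describe_fid(void)
{
	/* f_seq selects the server-side sequence, f_oid the object within
	 * it; f_ver is currently unused and stays 0. */
	struct lu_fid fid = {
		.f_seq = 0x200000401ULL,
		.f_oid = 0x1,
		.f_ver = 0,
	};

	if (fid_is_zero(&fid))
		printf("zero (invalid) FID\n");
	else
		printf("seq %#llx oid %#x ver %#x\n",
		       (unsigned long long)fid.f_seq, fid.f_oid, fid.f_ver);
}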
*/ +struct lu_fid; + +enum lma_compat { + LMAC_HSM = 0x00000001, +/* LMAC_SOM = 0x00000002, obsolete since 2.8.0 */ + LMAC_NOT_IN_OI = 0x00000004, /* the object does NOT need OI mapping */ + LMAC_FID_ON_OST = 0x00000008, /* For OST-object, its OI mapping is + * under /O//d. */ + LMAC_STRIPE_INFO = 0x00000010, /* stripe info in the LMA EA. */ + LMAC_COMP_INFO = 0x00000020, /* Component info in the LMA EA. */ + LMAC_IDX_BACKUP = 0x00000040, /* Has index backup. */ +}; + +/** + * Masks for all features that should be supported by a Lustre version to + * access a specific file. + * This information is stored in lustre_mdt_attrs::lma_incompat. + */ +enum lma_incompat { + LMAI_RELEASED = 0x00000001, /* file is released */ + LMAI_AGENT = 0x00000002, /* agent inode */ + LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object + is on the remote MDT */ + LMAI_STRIPED = 0x00000008, /* striped directory inode */ + LMAI_ORPHAN = 0x00000010, /* inode is orphan */ + LMAI_ENCRYPT = 0x00000020, /* inode is encrypted */ + LMA_INCOMPAT_SUPP = (LMAI_AGENT | LMAI_REMOTE_PARENT | \ + LMAI_STRIPED | LMAI_ORPHAN | LMAI_ENCRYPT) +}; + + +/** + * Following struct for object attributes, that will be kept inode's EA. + * Introduced in 2.0 release (please see b15993, for details) + * Added to all objects since Lustre 2.4 as contains self FID + */ +struct lustre_mdt_attrs { + /** + * Bitfield for supported data in this structure. From enum lma_compat. + * lma_self_fid and lma_flags are always available. + */ + __u32 lma_compat; + /** + * Per-file incompat feature list. Lustre version should support all + * flags set in this field. The supported feature mask is available in + * LMA_INCOMPAT_SUPP. + */ + __u32 lma_incompat; + /** FID of this inode */ + struct lu_fid lma_self_fid; +}; + +struct lustre_ost_attrs { + /* Use lustre_mdt_attrs directly for now, need a common header + * structure if want to change lustre_mdt_attrs in future. */ + struct lustre_mdt_attrs loa_lma; + + /* Below five elements are for OST-object's PFID EA, the + * lma_parent_fid::f_ver is composed of the stripe_count (high 16 bits) + * and the stripe_index (low 16 bits), the size should not exceed + * 5 * sizeof(__u64)) to be accessable by old Lustre. If the flag + * LMAC_STRIPE_INFO is set, then loa_parent_fid and loa_stripe_size + * are valid; if the flag LMAC_COMP_INFO is set, then the next three + * loa_comp_* elements are valid. */ + struct lu_fid loa_parent_fid; + __u32 loa_stripe_size; + __u32 loa_comp_id; + __u64 loa_comp_start; + __u64 loa_comp_end; +}; + +/** + * Prior to 2.4, the LMA structure also included SOM attributes which has since + * been moved to a dedicated xattr + * lma_flags was also removed because of lma_compat/incompat fields. + */ +#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64)) + +enum lustre_som_flags { + /* Unknow or no SoM data, must get size from OSTs. */ + SOM_FL_UNKNOWN = 0x0000, + /* Known strictly correct, FLR or DoM file (SoM guaranteed). */ + SOM_FL_STRICT = 0x0001, + /* Known stale - was right at some point in the past, but it is + * known (or likely) to be incorrect now (e.g. opened for write). */ + SOM_FL_STALE = 0x0002, + /* Approximate, may never have been strictly correct, + * need to sync SOM data to achieve eventual consistency. */ + SOM_FL_LAZY = 0x0004, +}; + +struct lustre_som_attrs { + __u16 lsa_valid; + __u16 lsa_reserved[3]; + __u64 lsa_size; + __u64 lsa_blocks; +}; + +/** + * OST object IDentifier. 
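Editor's illustrative sketch (not part of the patch): validating an LMA blob, for example one read from the "trusted.lma" xattr of an OST or MDT object, against the feature bits this code understands. The helper name lma_is_usable is hypothetical, and byte-swapping for big-endian hosts is deliberately ignored.

#include <stddef.h>
#include <linux/lustre/lustre_user.h>

static int lma_is_usable(const struct lustre_mdt_attrs *lma, size_t len)
{
	if (len < sizeof(*lma))
		return 0;			/* truncated xattr */

	/* Any incompat bit outside LMA_INCOMPAT_SUPP means a newer
	 * on-disk format that this code must not modify. */
	if (lma->lma_incompat & ~LMA_INCOMPAT_SUPP)
		return 0;

	return !fid_is_zero(&lma->lma_self_fid);
}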
+ */ +struct ost_id { + union { + struct { + __u64 oi_id; + __u64 oi_seq; + } oi; + struct lu_fid oi_fid; + }; +} __attribute__((packed)); + +#define DOSTID "%#llx:%llu" +#define POSTID(oi) ((unsigned long long)ostid_seq(oi)), \ + ((unsigned long long)ostid_id(oi)) + +struct ll_futimes_3 { + __u64 lfu_atime_sec; + __u64 lfu_atime_nsec; + __u64 lfu_mtime_sec; + __u64 lfu_mtime_nsec; + __u64 lfu_ctime_sec; + __u64 lfu_ctime_nsec; +}; + +/* + * Maximum number of mirrors currently implemented. + */ +#define LUSTRE_MIRROR_COUNT_MAX 16 + +/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. */ +enum ll_lease_mode { + LL_LEASE_RDLCK = 0x01, + LL_LEASE_WRLCK = 0x02, + LL_LEASE_UNLCK = 0x04, +}; + +enum ll_lease_flags { + LL_LEASE_RESYNC = 0x1, + LL_LEASE_RESYNC_DONE = 0x2, + LL_LEASE_LAYOUT_MERGE = 0x4, + LL_LEASE_LAYOUT_SPLIT = 0x8, + LL_LEASE_PCC_ATTACH = 0x10, +}; + +#define IOC_IDS_MAX 4096 +struct ll_ioc_lease { + __u32 lil_mode; + __u32 lil_flags; + __u32 lil_count; + __u32 lil_ids[0]; +}; + +struct ll_ioc_lease_id { + __u32 lil_mode; + __u32 lil_flags; + __u32 lil_count; + __u16 lil_mirror_id; + __u16 lil_padding1; + __u64 lil_padding2; + __u32 lil_ids[0]; +}; + +/* + * The ioctl naming rules: + * LL_* - works on the currently opened filehandle instead of parent dir + * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) + * *_MDC_* - gets/sets data related to MDC + * *_LOV_* - gets/sets data related to OSC/LOV + * *FILE* - called on parent dir and passes in a filename + * *STRIPE* - set/get lov_user_md + * *INFO - set/get lov_user_mds_data + */ +/* lustre_ioctl.h 101-150 */ +/* ioctl codes 128-143 are reserved for fsverity */ +#define LL_IOC_GETFLAGS _IOR ('f', 151, long) +#define LL_IOC_SETFLAGS _IOW ('f', 152, long) +#define LL_IOC_CLRFLAGS _IOW ('f', 153, long) +#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) +#define LL_IOC_LOV_SETSTRIPE_NEW _IOWR('f', 154, struct lov_user_md) +#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) +#define LL_IOC_LOV_GETSTRIPE_NEW _IOR('f', 155, struct lov_user_md) +#define LL_IOC_LOV_SETEA _IOW ('f', 156, long) +/* LL_IOC_RECREATE_OBJ 157 obsolete */ +/* LL_IOC_RECREATE_FID 157 obsolete */ +#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long) +#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long) +/* LL_IOC_QUOTACHECK 160 OBD_IOC_QUOTACHECK */ +/* LL_IOC_POLL_QUOTACHECK 161 OBD_IOC_POLL_QUOTACHECK */ +/* LL_IOC_QUOTACTL 162 OBD_IOC_QUOTACTL */ +#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) +/* IOC_LOV_GETINFO 165 obsolete */ +#define LL_IOC_FLUSHCTX _IOW ('f', 166, long) +/* LL_IOC_RMTACL 167 obsolete */ +#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) +#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) +#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) +#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) +#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) +#define LL_IOC_PATH2FID _IOR ('f', 173, long) +#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) +#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) +#define LL_IOC_FUTIMES_3 _IOWR('f', 176, struct ll_futimes_3) +#define LL_IOC_FLR_SET_MIRROR _IOW ('f', 177, long) +/* lustre_ioctl.h 177-210 */ +#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) +#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) +#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) +#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) +#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) +#define LL_IOC_HSM_PROGRESS 
_IOW('f', 216, struct hsm_user_request) +#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) +#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) +#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ + struct lustre_swap_layouts) +#define LL_IOC_HSM_ACTION _IOR('f', 220, \ + struct hsm_current_action) +/* lustre_ioctl.h 221-232 */ +#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) +#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) +#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64) +#define LL_IOC_RMFID _IOR('f', 242, struct fid_array) +#define LL_IOC_UNLOCK_FOREIGN _IO('f', 242) +#define LL_IOC_SET_LEASE _IOWR('f', 243, struct ll_ioc_lease) +#define LL_IOC_SET_LEASE_OLD _IOWR('f', 243, long) +#define LL_IOC_GET_LEASE _IO('f', 244) +#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import) +#define LL_IOC_LMV_SET_DEFAULT_STRIPE _IOWR('f', 246, struct lmv_user_md) +#define LL_IOC_MIGRATE _IOR('f', 247, int) +#define LL_IOC_FID2MDTIDX _IOWR('f', 248, struct lu_fid) +#define LL_IOC_GETPARENT _IOWR('f', 249, struct getparent) +#define LL_IOC_LADVISE _IOR('f', 250, struct llapi_lu_ladvise) +#define LL_IOC_HEAT_GET _IOWR('f', 251, struct lu_heat) +#define LL_IOC_HEAT_SET _IOW('f', 251, __u64) +#define LL_IOC_PCC_DETACH _IOW('f', 252, struct lu_pcc_detach) +#define LL_IOC_PCC_DETACH_BY_FID _IOW('f', 252, struct lu_pcc_detach_fid) +#define LL_IOC_PCC_STATE _IOR('f', 252, struct lu_pcc_state) +#define LL_IOC_PROJECT _IOW('f', 253, struct lu_project) + +#ifndef FS_IOC_FSGETXATTR +/* + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. +*/ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) +#endif +#ifndef FS_XFLAG_PROJINHERIT +#define FS_XFLAG_PROJINHERIT 0x00000200 +#endif + + +#define LL_STATFS_LMV 1 +#define LL_STATFS_LOV 2 +#define LL_STATFS_NODELAY 4 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) +#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) +#define IOC_MDC_GETFILEINFO_V1 _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data_v1 *) +#define IOC_MDC_GETFILEINFO_V2 _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data) +#define LL_IOC_MDC_GETINFO_V1 _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data_v1 *) +#define LL_IOC_MDC_GETINFO_V2 _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data) +#define IOC_MDC_GETFILEINFO IOC_MDC_GETFILEINFO_V1 +#define LL_IOC_MDC_GETINFO LL_IOC_MDC_GETINFO_V1 + +#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ + +/* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular + * files, but are unlikely to be used in practice and are not harmful if + * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character + * devices and are safe for use on new files. See LU-4209. */ +/* To be compatible with old statically linked binary we keep the check for + * the older 0100000000 flag. This is already removed upstream. LU-812. 
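Editor's illustrative sketch (not part of the patch): reading a file's project quota ID through the generic FS_IOC_FSGETXATTR ioctl that the fallback definitions above mirror. Recent kernels ship struct fsxattr in <linux/fs.h>; the function name print_projid is made up.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* struct fsxattr, FS_IOC_FSGETXATTR */

static int print_projid(const char *path)
{
	struct fsxattr fsx;
	int fd, rc;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	rc = ioctl(fd, FS_IOC_FSGETXATTR, &fsx);
	if (rc == 0)
		printf("%s: projid %u, inherit %s\n", path, fsx.fsx_projid,
		       (fsx.fsx_xflags & FS_XFLAG_PROJINHERIT) ? "yes" : "no");
	close(fd);
	return rc;
}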
*/ +#define O_LOV_DELAY_CREATE_1_8 0100000000 /* FMODE_NONOTIFY masked in 2.6.36 */ +#ifndef FASYNC +#define FASYNC 00020000 /* fcntl, for BSD compatibility */ +#endif +#define O_LOV_DELAY_CREATE_MASK (O_NOCTTY | FASYNC) +#define O_LOV_DELAY_CREATE (O_LOV_DELAY_CREATE_1_8 | \ + O_LOV_DELAY_CREATE_MASK) +/* O_FILE_ENC principle is similar to O_LOV_DELAY_CREATE above, + * for access to encrypted files without the encryption key. + */ +#define O_FILE_ENC (O_NOCTTY | O_NDELAY) + +#define O_LU_NOIMPORT_MASK (O_NOCTTY | O_DSYNC | O_DIRECT) +#define O_LU_NOIMPORT O_LU_NOIMPORT_MASK + +#define LL_FILE_IGNORE_LOCK 0x00000001 +#define LL_FILE_GROUP_LOCKED 0x00000002 +#define LL_FILE_READAHEA 0x00000004 +#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */ +#define LL_FILE_FLOCK_WARNING 0x00000020 /* warned about disabled flock */ + +#define LOV_USER_MAGIC_V1 0x0BD10BD0 +#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 +#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 +#define LOV_USER_MAGIC_V3 0x0BD30BD0 +/* 0x0BD40BD0 is occupied by LOV_MAGIC_MIGRATE */ +#define LOV_USER_MAGIC_SPECIFIC 0x0BD50BD0 /* for specific OSTs */ +#define LOV_USER_MAGIC_COMP_V1 0x0BD60BD0 +#define LOV_USER_MAGIC_FOREIGN 0x0BD70BD0 +#define LOV_USER_MAGIC_SEL 0x0BD80BD0 + +#define LMV_USER_MAGIC 0x0CD30CD0 /* default lmv magic */ +#define LMV_USER_MAGIC_V0 0x0CD20CD0 /* old default lmv magic*/ +#define LMV_USER_MAGIC_SPECIFIC 0x0CD40CD0 + +#define LOV_PATTERN_NONE 0x000 +#define LOV_PATTERN_RAID0 0x001 +#define LOV_PATTERN_RAID1 0x002 +#define LOV_PATTERN_MDT 0x100 +#define LOV_PATTERN_OVERSTRIPING 0x200 +#define LOV_PATTERN_FOREIGN 0x400 + +#define LOV_PATTERN_F_MASK 0xffff0000 +#define LOV_PATTERN_F_HOLE 0x40000000 /* there is hole in LOV EA */ +#define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */ +#define LOV_PATTERN_DEFAULT 0xffffffff + +#define LOV_OFFSET_DEFAULT ((__u16)-1) +#define LMV_OFFSET_DEFAULT ((__u32)-1) + +static inline bool lov_pattern_supported(__u32 pattern) +{ + return (pattern & ~LOV_PATTERN_F_RELEASED) == LOV_PATTERN_RAID0 || + (pattern & ~LOV_PATTERN_F_RELEASED) == + (LOV_PATTERN_RAID0 | LOV_PATTERN_OVERSTRIPING) || + (pattern & ~LOV_PATTERN_F_RELEASED) == LOV_PATTERN_MDT; +} + +/* RELEASED and MDT patterns are not valid in many places, so rather than + * having many extra checks on lov_pattern_supported, we have this separate + * check for non-released, non-DOM components + */ +static inline bool lov_pattern_supported_normal_comp(__u32 pattern) +{ + return pattern == LOV_PATTERN_RAID0 || + pattern == (LOV_PATTERN_RAID0 | LOV_PATTERN_OVERSTRIPING); + +} + +#define LOV_MAXPOOLNAME 15 +#define LOV_POOLNAMEF "%.15s" + +#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ +#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS) +#define LOV_MAX_STRIPE_COUNT_OLD 160 +/* This calculation is crafted so that input of 4096 will result in 160 + * which in turn is equal to old maximal stripe count. + * XXX: In fact this is too simpified for now, what it also need is to get + * ea_type argument to clearly know how much space each stripe consumes. + * + * The limit of 12 pages is somewhat arbitrary, but is a reasonably large + * allocation that is sufficient for the current generation of systems. 
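Editor's illustrative sketch (not part of the patch): classifying a layout pattern read from a LOV EA with the lov_pattern_supported() helper above; the function name pattern_name is hypothetical and this header is assumed to be included.

static const char *pattern_name(__u32 pattern)
{
	if (!lov_pattern_supported(pattern))
		return "unsupported";
	if ((pattern & ~LOV_PATTERN_F_RELEASED) == LOV_PATTERN_MDT)
		return "data-on-MDT";
	if (pattern & LOV_PATTERN_OVERSTRIPING)
		return "raid0+overstriping";
	return "raid0";
}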
+ * + * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */ +#define LOV_MAX_STRIPE_COUNT 2000 /* ~((12 * 4096 - 256) / 24) */ +#define LOV_ALL_STRIPES 0xffff /* only valid for directories */ +#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */ + +#define XATTR_LUSTRE_PREFIX "lustre." +#define XATTR_LUSTRE_LOV XATTR_LUSTRE_PREFIX"lov" + +/* Please update if XATTR_LUSTRE_LOV".set" groks more flags in the future */ +#define allowed_lustre_lov(att) (strcmp((att), XATTR_LUSTRE_LOV".add") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".set") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".set.flags") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".del") == 0) + +#define lov_user_ost_data lov_user_ost_data_v1 +struct lov_user_ost_data_v1 { /* per-stripe data structure */ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this OST index */ + __u32 l_ost_idx; /* OST index in LOV */ +} __attribute__((packed)); + +#define lov_user_md lov_user_md_v1 +struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lov_foreign_md { + __u32 lfm_magic; /* magic number = LOV_MAGIC_FOREIGN */ + __u32 lfm_length; /* length of lfm_value */ + __u32 lfm_type; /* type, see LU_FOREIGN_TYPE_ */ + __u32 lfm_flags; /* flags, type specific */ + char lfm_value[]; +} __attribute__((packed)); + +#define foreign_size(lfm) (((struct lov_foreign_md *)lfm)->lfm_length + \ + offsetof(struct lov_foreign_md, lfm_value)) + +#define foreign_size_le(lfm) \ + (le32_to_cpu(((struct lov_foreign_md *)lfm)->lfm_length) + \ + offsetof(struct lov_foreign_md, lfm_value)) + +/** + * The stripe size fields are shared for the extension size storage, however + * the extension size is stored in KB, not bytes. 
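Editor's illustrative sketch (not part of the patch): creating a file with an explicit plain layout, roughly what llapi_file_open() does internally, by delaying object creation with O_LOV_DELAY_CREATE and then passing a lov_user_md_v1 to LL_IOC_LOV_SETSTRIPE. The helper name create_striped and the include path are assumptions.

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/lustre/lustre_user.h>

static int create_striped(const char *path, __u32 stripe_size,
			  __u16 stripe_count)
{
	struct lov_user_md_v1 lum;
	int fd, rc = 0;

	fd = open(path, O_CREAT | O_EXCL | O_WRONLY | O_LOV_DELAY_CREATE, 0644);
	if (fd < 0)
		return -1;

	memset(&lum, 0, sizeof(lum));
	lum.lmm_magic = LOV_USER_MAGIC_V1;
	lum.lmm_pattern = LOV_PATTERN_RAID0;
	lum.lmm_stripe_size = stripe_size;		/* bytes, power of two */
	lum.lmm_stripe_count = stripe_count;		/* 0 = filesystem default */
	lum.lmm_stripe_offset = LOV_OFFSET_DEFAULT;	/* let the MDS choose */

	if (ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum) < 0)
		rc = -1;
	close(fd);
	return rc;
}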
+ */ +#define SEL_UNIT_SIZE 1024llu + +struct lu_extent { + __u64 e_start; + __u64 e_end; +} __attribute__((packed)); + +#define DEXT "[%#llx, %#llx)" +#define PEXT(ext) (unsigned long long)(ext)->e_start, (unsigned long long)(ext)->e_end + +static inline bool lu_extent_is_overlapped(struct lu_extent *e1, + struct lu_extent *e2) +{ + return e1->e_start < e2->e_end && e2->e_start < e1->e_end; +} + +static inline bool lu_extent_is_whole(struct lu_extent *e) +{ + return e->e_start == 0 && e->e_end == LUSTRE_EOF; +} + +enum lov_comp_md_entry_flags { + LCME_FL_STALE = 0x00000001, /* FLR: stale data */ + LCME_FL_PREF_RD = 0x00000002, /* FLR: preferred for reading */ + LCME_FL_PREF_WR = 0x00000004, /* FLR: preferred for writing */ + LCME_FL_PREF_RW = LCME_FL_PREF_RD | LCME_FL_PREF_WR, + LCME_FL_OFFLINE = 0x00000008, /* Not used */ + LCME_FL_INIT = 0x00000010, /* instantiated */ + LCME_FL_NOSYNC = 0x00000020, /* FLR: no sync for the mirror */ + LCME_FL_EXTENSION = 0x00000040, /* extension comp, never init */ + LCME_FL_NEG = 0x80000000 /* used to indicate a negative flag, + * won't be stored on disk + */ +}; + +#define LCME_KNOWN_FLAGS (LCME_FL_NEG | LCME_FL_INIT | LCME_FL_STALE | \ + LCME_FL_PREF_RW | LCME_FL_NOSYNC | \ + LCME_FL_EXTENSION) + +/* The component flags can be set by users at creation/modification time. */ +#define LCME_USER_COMP_FLAGS (LCME_FL_PREF_RW | LCME_FL_NOSYNC | \ + LCME_FL_EXTENSION) + +/* The mirror flags can be set by users at creation time. */ +#define LCME_USER_MIRROR_FLAGS (LCME_FL_PREF_RW) + +/* The allowed flags obtained from the client at component creation time. */ +#define LCME_CL_COMP_FLAGS (LCME_USER_MIRROR_FLAGS | LCME_FL_EXTENSION) + +/* The mirror flags sent by client */ +#define LCME_MIRROR_FLAGS (LCME_FL_NOSYNC) + +/* These flags have meaning when set in a default layout and will be inherited + * from the default/template layout set on a directory. + */ +#define LCME_TEMPLATE_FLAGS (LCME_FL_PREF_RW | LCME_FL_NOSYNC | \ + LCME_FL_EXTENSION) + +/* the highest bit in obdo::o_layout_version is used to mark if the file is + * being resynced. */ +#define LU_LAYOUT_RESYNC LCME_FL_NEG + +/* lcme_id can be specified as certain flags, and the the first + * bit of lcme_id is used to indicate that the ID is representing + * certain LCME_FL_* but not a real ID. Which implies we can have + * at most 31 flags (see LCME_FL_XXX). */ +enum lcme_id { + LCME_ID_INVAL = 0x0, + LCME_ID_MAX = 0x7FFFFFFF, + LCME_ID_ALL = 0xFFFFFFFF, + LCME_ID_NOT_ID = LCME_FL_NEG +}; + +#define LCME_ID_MASK LCME_ID_MAX + +struct lov_comp_md_entry_v1 { + __u32 lcme_id; /* unique id of component */ + __u32 lcme_flags; /* LCME_FL_XXX */ + struct lu_extent lcme_extent; /* file extent for component */ + __u32 lcme_offset; /* offset of component blob, + start from lov_comp_md_v1 */ + __u32 lcme_size; /* size of component blob */ + __u32 lcme_layout_gen; + __u64 lcme_timestamp; /* snapshot time if applicable*/ + __u32 lcme_padding_1; +} __attribute__((packed)); + +#define SEQ_ID_MAX 0x0000FFFF +#define SEQ_ID_MASK SEQ_ID_MAX +/* bit 30:16 of lcme_id is used to store mirror id */ +#define MIRROR_ID_MASK 0x7FFF0000 +#define MIRROR_ID_NEG 0x8000 +#define MIRROR_ID_SHIFT 16 + +static inline __u32 pflr_id(__u16 mirror_id, __u16 seqid) +{ + return ((mirror_id << MIRROR_ID_SHIFT) & MIRROR_ID_MASK) | seqid; +} + +static inline __u16 mirror_id_of(__u32 id) +{ + return (id & MIRROR_ID_MASK) >> MIRROR_ID_SHIFT; +} + +/** + * on-disk data for lcm_flags. Valid if lcm_magic is LOV_MAGIC_COMP_V1. 
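Editor's illustrative sketch (not part of the patch): how a composite-layout component ID packs the mirror ID and per-mirror sequence ID, together with the extent helpers above; the values are arbitrary.

#include <assert.h>
#include <linux/lustre/lustre_user.h>

static void component_id_demo(void)
{
	__u16 mirror_id = 3, seq_id = 7;
	__u32 lcme_id = pflr_id(mirror_id, seq_id);

	assert(mirror_id_of(lcme_id) == mirror_id);
	assert((lcme_id & SEQ_ID_MASK) == seq_id);

	/* Extents [0, 1M) and [512K, EOF) overlap; neither spans the
	 * whole file. */
	struct lu_extent a = { .e_start = 0, .e_end = 1 << 20 };
	struct lu_extent b = { .e_start = 512 << 10, .e_end = LUSTRE_EOF };

	assert(lu_extent_is_overlapped(&a, &b));
	assert(!lu_extent_is_whole(&a));
}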
+ */ +enum lov_comp_md_flags { + /* the least 4 bits are used by FLR to record file state */ + LCM_FL_NONE = 0x0, + LCM_FL_RDONLY = 0x1, + LCM_FL_WRITE_PENDING = 0x2, + LCM_FL_SYNC_PENDING = 0x3, + LCM_FL_PCC_RDONLY = 0x8, + LCM_FL_FLR_MASK = 0xB, +}; + +struct lov_comp_md_v1 { + __u32 lcm_magic; /* LOV_USER_MAGIC_COMP_V1 */ + __u32 lcm_size; /* overall size including this struct */ + __u32 lcm_layout_gen; + __u16 lcm_flags; + __u16 lcm_entry_count; + /* lcm_mirror_count stores the number of actual mirrors minus 1, + * so that non-flr files will have value 0 meaning 1 mirror. */ + __u16 lcm_mirror_count; + __u16 lcm_padding1[3]; + __u64 lcm_padding2; + struct lov_comp_md_entry_v1 lcm_entries[0]; +} __attribute__((packed)); + +static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (stripes == (__u16)-1) + stripes = 0; + + if (lmm_magic == LOV_USER_MAGIC_V1) + return sizeof(struct lov_user_md_v1) + + stripes * sizeof(struct lov_user_ost_data_v1); + return sizeof(struct lov_user_md_v3) + + stripes * sizeof(struct lov_user_ost_data_v1); +} + +/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to + * use this. It is unsafe to #define those values in this header as it + * is possible the application has already #included . */ +#define lov_user_mds_data lov_user_mds_data_v2 +struct lov_user_mds_data_v1 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ +} __attribute__((packed)); + +struct lov_user_mds_data_v2 { + struct lu_fid lmd_fid; /* Lustre FID */ + lstatx_t lmd_stx; /* MDS statx struct */ + __u64 lmd_flags; /* MDS stat flags */ + __u32 lmd_lmmsize; /* LOV EA size */ + __u32 lmd_padding; /* unused */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA user data */ +} __attribute__((packed)); + +struct lmv_user_mds_data { + struct lu_fid lum_fid; + __u32 lum_padding; + __u32 lum_mds; +} __attribute__((packed, __may_alias__)); + +enum lmv_hash_type { + LMV_HASH_TYPE_UNKNOWN = 0, /* 0 is reserved for testing purpose */ + LMV_HASH_TYPE_ALL_CHARS = 1, + LMV_HASH_TYPE_FNV_1A_64 = 2, + LMV_HASH_TYPE_CRUSH = 3, + LMV_HASH_TYPE_MAX, +}; + +static __attribute__((unused)) const char *mdt_hash_name[] = { + "none", + "all_char", + "fnv_1a_64", + "crush", +}; + +#define LMV_HASH_TYPE_DEFAULT LMV_HASH_TYPE_FNV_1A_64 + +/* Right now only the lower part(0-16bits) of lmv_hash_type is being used, + * and the higher part will be the flag to indicate the status of object, + * for example the object is being migrated. And the hash function + * might be interpreted differently with different flags. */ +#define LMV_HASH_TYPE_MASK 0x0000ffff + +static inline bool lmv_is_known_hash_type(__u32 type) +{ + return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 || + (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS || + (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_CRUSH; +} + +/* fixed layout, such directories won't split automatically */ +/* NB, update LMV_HASH_FLAG_KNOWN when adding new flag */ +#define LMV_HASH_FLAG_FIXED 0x02000000 +#define LMV_HASH_FLAG_MERGE 0x04000000 +#define LMV_HASH_FLAG_SPLIT 0x08000000 + +/* The striped directory has ever lost its master LMV EA, then LFSCK + * re-generated it. This flag is used to indicate such case. It is an + * on-disk flag. 
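Editor's illustrative sketch (not part of the patch): sizing a layout buffer with lov_user_md_size(). A buffer dimensioned for LOV_USER_MAGIC_V3 and LOV_MAX_STRIPE_COUNT is large enough for any plain V1/V3 layout returned by the GETSTRIPE ioctls; composite layouts are described by lov_comp_md_v1 and sized via lcm_size instead. The helper name alloc_layout_buf is hypothetical.

#include <stdlib.h>
#include <linux/lustre/lustre_user.h>

static struct lov_user_md *alloc_layout_buf(size_t *lenp)
{
	size_t len = lov_user_md_size(LOV_MAX_STRIPE_COUNT, LOV_USER_MAGIC_V3);
	struct lov_user_md *lum = calloc(1, len);

	if (lum != NULL && lenp != NULL)
		*lenp = len;
	return lum;
}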
*/ +#define LMV_HASH_FLAG_LOST_LMV 0x10000000 + +#define LMV_HASH_FLAG_BAD_TYPE 0x20000000 +#define LMV_HASH_FLAG_MIGRATION 0x80000000 + +#define LMV_HASH_FLAG_LAYOUT_CHANGE \ + (LMV_HASH_FLAG_MIGRATION | LMV_HASH_FLAG_SPLIT | LMV_HASH_FLAG_MERGE) + +#define LMV_HASH_FLAG_KNOWN 0xbe000000 + +/* both SPLIT and MIGRATION are set for directory split */ +static inline bool lmv_hash_is_splitting(__u32 hash) +{ + return (hash & LMV_HASH_FLAG_LAYOUT_CHANGE) == + (LMV_HASH_FLAG_SPLIT | LMV_HASH_FLAG_MIGRATION); +} + +/* both MERGE and MIGRATION are set for directory merge */ +static inline bool lmv_hash_is_merging(__u32 hash) +{ + return (hash & LMV_HASH_FLAG_LAYOUT_CHANGE) == + (LMV_HASH_FLAG_MERGE | LMV_HASH_FLAG_MIGRATION); +} + +/* only MIGRATION is set for directory migration */ +static inline bool lmv_hash_is_migrating(__u32 hash) +{ + return (hash & LMV_HASH_FLAG_LAYOUT_CHANGE) == LMV_HASH_FLAG_MIGRATION; +} + +static inline bool lmv_hash_is_restriping(__u32 hash) +{ + return lmv_hash_is_splitting(hash) || lmv_hash_is_merging(hash); +} + +static inline bool lmv_hash_is_layout_changing(__u32 hash) +{ + return lmv_hash_is_splitting(hash) || lmv_hash_is_merging(hash) || + lmv_hash_is_migrating(hash); +} + +struct lustre_foreign_type { + __u32 lft_type; + const char *lft_name; +}; + +/** + * LOV/LMV foreign types + **/ +enum lustre_foreign_types { + LU_FOREIGN_TYPE_NONE = 0, + LU_FOREIGN_TYPE_SYMLINK = 0xda05, + /* must be the max/last one */ + LU_FOREIGN_TYPE_UNKNOWN = 0xffffffff, +}; + +extern struct lustre_foreign_type lu_foreign_types[]; + +/* Got this according to how get LOV_MAX_STRIPE_COUNT, see above, + * (max buffer size - lmv+rpc header) / sizeof(struct lmv_user_mds_data) */ +#define LMV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define lmv_user_md lmv_user_md_v1 +struct lmv_user_md_v1 { + __u32 lum_magic; /* must be the first field */ + __u32 lum_stripe_count; /* dirstripe count */ + __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ + __u32 lum_hash_type; /* Dir stripe policy */ + __u32 lum_type; /* LMV type: default */ + __u8 lum_max_inherit; /* inherit depth of default LMV */ + __u8 lum_max_inherit_rr; /* inherit depth of default LMV to round-robin mkdir */ + __u16 lum_padding1; + __u32 lum_padding2; + __u32 lum_padding3; + char lum_pool_name[LOV_MAXPOOLNAME + 1]; + struct lmv_user_mds_data lum_objects[0]; +} __attribute__((packed)); + +static inline __u32 lmv_foreign_to_md_stripes(__u32 size) +{ + if (size <= sizeof(struct lmv_user_md)) + return 0; + + size -= sizeof(struct lmv_user_md); + return (size + sizeof(struct lmv_user_mds_data) - 1) / + sizeof(struct lmv_user_mds_data); +} + +/* + * NB, historically default layout didn't set type, but use XATTR name to differ + * from normal layout, for backward compatibility, define LMV_TYPE_DEFAULT 0x0, + * and still use the same method. + */ +enum lmv_type { + LMV_TYPE_DEFAULT = 0x0000, +}; + +/* lum_max_inherit will be decreased by 1 after each inheritance if it's not + * LMV_INHERIT_UNLIMITED or > LMV_INHERIT_MAX. 
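Editor's illustrative sketch (not part of the patch): interpreting the lum_hash_type of a striped directory whose layout may be changing, using the helpers above; the function name describe_dir_hash is made up.

#include <stdio.h>
#include <linux/lustre/lustre_user.h>

static void describe_dir_hash(__u32 hash)
{
	const char *state = "steady";

	if (lmv_hash_is_splitting(hash))
		state = "splitting";
	else if (lmv_hash_is_merging(hash))
		state = "merging";
	else if (lmv_hash_is_migrating(hash))
		state = "migrating";

	printf("hash %u (%s), state %s\n", hash & LMV_HASH_TYPE_MASK,
	       lmv_is_known_hash_type(hash) ?
			mdt_hash_name[hash & LMV_HASH_TYPE_MASK] : "unknown",
	       state);
}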
+ */ +enum { + /* for historical reason, 0 means unlimited inheritance */ + LMV_INHERIT_UNLIMITED = 0, + /* unlimited lum_max_inherit by default for plain stripe (0 or 1) */ + LMV_INHERIT_DEFAULT_PLAIN = LMV_INHERIT_UNLIMITED, + /* not inherit any more */ + LMV_INHERIT_END = 1, + /* for multiple stripes, the default lum_max_inherit is 3 */ + LMV_INHERIT_DEFAULT_STRIPED = 3, + /* max inherit depth */ + LMV_INHERIT_MAX = 250, + /* [251, 254] are reserved */ + /* not set, or when inherit depth goes beyond end, */ + LMV_INHERIT_NONE = 255, +}; + +enum { + /* not set, or when inherit_rr depth goes beyond end, */ + LMV_INHERIT_RR_NONE = 0, + /* disable lum_max_inherit_rr by default */ + LMV_INHERIT_RR_DEFAULT = LMV_INHERIT_RR_NONE, + /* not inherit any more */ + LMV_INHERIT_RR_END = 1, + /* default inherit_rr of ROOT */ + LMV_INHERIT_RR_ROOT = 3, + /* max inherit depth */ + LMV_INHERIT_RR_MAX = 250, + /* [251, 254] are reserved */ + /* unlimited inheritance */ + LMV_INHERIT_RR_UNLIMITED = 255, +}; + +static inline int lmv_user_md_size(int stripes, int lmm_magic) +{ + int size = sizeof(struct lmv_user_md); + + if (lmm_magic == LMV_USER_MAGIC_SPECIFIC) + size += stripes * sizeof(struct lmv_user_mds_data); + + return size; +} + +struct ll_recreate_obj { + __u64 lrc_id; + __u32 lrc_ost_idx; +}; + +struct ll_fid { + __u64 id; /* holds object id */ + __u32 generation; /* holds object generation */ + __u32 f_type; /* holds object type or stripe idx when passing it to + * OST for saving into EA. */ +}; + +#define UUID_MAX 40 +struct obd_uuid { + char uuid[UUID_MAX]; +}; + +static inline bool obd_uuid_equals(const struct obd_uuid *u1, + const struct obd_uuid *u2) +{ + return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0; +} + +static inline int obd_uuid_empty(struct obd_uuid *uuid) +{ + return uuid->uuid[0] == '\0'; +} + +static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp) +{ + strncpy((char *)uuid->uuid, tmp, sizeof(*uuid)); + uuid->uuid[sizeof(*uuid) - 1] = '\0'; +} + +/* For printf's only, make sure uuid is terminated */ +static inline char *obd_uuid2str(const struct obd_uuid *uuid) +{ + if (uuid == NULL) + return NULL; + + if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { + /* Obviously not safe, but for printfs, no real harm done... + we're always null-terminated, even in a race. */ + static char temp[sizeof(*uuid->uuid)]; + + memcpy(temp, uuid->uuid, sizeof(*uuid->uuid) - 1); + temp[sizeof(*uuid->uuid) - 1] = '\0'; + + return temp; + } + return (char *)(uuid->uuid); +} + +#define LUSTRE_MAXFSNAME 8 +#define LUSTRE_MAXINSTANCE 16 + +/* Extract fsname from uuid (or target name) of a target + e.g. (myfs-OST0007_UUID -> myfs) + see also deuuidify. */ +static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen) +{ + char *p; + + strncpy(buf, uuid, buflen - 1); + buf[buflen - 1] = '\0'; + p = strrchr(buf, '-'); + if (p != NULL) + *p = '\0'; +} + +/* printf display format for Lustre FIDs + * usage: printf("file FID is "DFID"\n", PFID(fid)); */ +#define FID_NOBRACE_LEN 40 +#define FID_LEN (FID_NOBRACE_LEN + 2) +#define DFID_NOBRACE "%#llx:0x%x:0x%x" +#define DFID "[" DFID_NOBRACE "]" +#define PFID(fid) (unsigned long long)(fid)->f_seq, (fid)->f_oid, (fid)->f_ver + +/* scanf input parse format for fids in DFID_NOBRACE format + * Need to strip '[' from DFID format first or use "["SFID"]" at caller. 
+ * usage: sscanf(fidstr, SFID, RFID(&fid)); */ +#define SFID "0x%llx:0x%x:0x%x" +#define RFID(fid) (unsigned long long *)&((fid)->f_seq), &((fid)->f_oid), &((fid)->f_ver) + +/********* Quotas **********/ + +/* From linux/fs/quota/quota.c */ +static inline __u64 toqb(__kernel_size_t space) +{ + return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; +} + +#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ +#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ +#define Q_GETOINFO 0x800102 /* get obd quota info */ +#define Q_GETOQUOTA 0x800103 /* get obd quotas */ +#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */ + +/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ +#define LUSTRE_Q_QUOTAON 0x800002 /* deprecated as of 2.4 */ +#define LUSTRE_Q_QUOTAOFF 0x800003 /* deprecated as of 2.4 */ +#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ +#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ +#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ +#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ +/* lustre-specific control commands */ +#define LUSTRE_Q_INVALIDATE 0x80000b /* deprecated as of 2.4 */ +#define LUSTRE_Q_FINVALIDATE 0x80000c /* deprecated as of 2.4 */ +#define LUSTRE_Q_GETDEFAULT 0x80000d /* get default quota */ +#define LUSTRE_Q_SETDEFAULT 0x80000e /* set default quota */ +#define LUSTRE_Q_GETQUOTAPOOL 0x80000f /* get user pool quota */ +#define LUSTRE_Q_SETQUOTAPOOL 0x800010 /* set user pool quota */ +#define LUSTRE_Q_GETINFOPOOL 0x800011 /* get pool quota info */ +#define LUSTRE_Q_SETINFOPOOL 0x800012 /* set pool quota info */ +#define LUSTRE_Q_GETDEFAULT_POOL 0x800013 /* get default pool quota*/ +#define LUSTRE_Q_SETDEFAULT_POOL 0x800014 /* set default pool quota */ +#define LUSTRE_Q_DELETEQID 0x800015 /* delete quota ID */ +/* In the current Lustre implementation, the grace time is either the time + * or the timestamp to be used after some quota ID exceeds the soft limt, + * 48 bits should be enough, its high 16 bits can be used as quota flags. + * */ +#define LQUOTA_GRACE_BITS 48 +#define LQUOTA_GRACE_MASK ((1ULL << LQUOTA_GRACE_BITS) - 1) +#define LQUOTA_GRACE_MAX LQUOTA_GRACE_MASK +#define LQUOTA_GRACE(t) (t & LQUOTA_GRACE_MASK) +#define LQUOTA_FLAG(t) (t >> LQUOTA_GRACE_BITS) +#define LQUOTA_GRACE_FLAG(t, f) ((__u64)t | (__u64)f << LQUOTA_GRACE_BITS) + +/* special grace time, only notify the user when its quota is over soft limit + * but doesn't block new writes until the hard limit is reached. */ +#define NOTIFY_GRACE "notify" +#define NOTIFY_GRACE_TIME LQUOTA_GRACE_MASK + +/* different quota flags */ + +/* the default quota flag, the corresponding quota ID will use the default + * quota setting, the hardlimit and softlimit of its quota record in the global + * quota file will be set to 0, the low 48 bits of the grace will be set to 0 + * and high 16 bits will contain this flag (see above comment). 
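Editor's illustrative sketch (not part of the patch): the documented DFID/PFID and SFID/RFID usage, printing a FID and parsing it back; the function name fid_roundtrip is hypothetical.

#include <stdio.h>
#include <linux/lustre/lustre_user.h>

static int fid_roundtrip(const struct lu_fid *fid)
{
	char buf[FID_LEN + 1];
	struct lu_fid parsed = { 0 };

	snprintf(buf, sizeof(buf), DFID, PFID(fid));
	printf("file FID is %s\n", buf);

	/* SFID has no brackets, so match the leading '[' explicitly. */
	if (sscanf(buf, "[" SFID "]", RFID(&parsed)) != 3)
		return -1;

	return (parsed.f_seq == fid->f_seq && parsed.f_oid == fid->f_oid &&
		parsed.f_ver == fid->f_ver) ? 0 : -1;
}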
+ * */ +#define LQUOTA_FLAG_DEFAULT 0x0001 +#define LQUOTA_FLAG_DELETED 0x0002 + +#define LUSTRE_Q_CMD_IS_POOL(cmd) \ + (cmd == LUSTRE_Q_GETQUOTAPOOL || \ + cmd == LUSTRE_Q_SETQUOTAPOOL || \ + cmd == LUSTRE_Q_SETINFOPOOL || \ + cmd == LUSTRE_Q_GETINFOPOOL || \ + cmd == LUSTRE_Q_SETDEFAULT_POOL || \ + cmd == LUSTRE_Q_GETDEFAULT_POOL) + +#define ALLQUOTA 255 /* set all quota */ +static inline const char *qtype_name(int qtype) +{ + switch (qtype) { + case USRQUOTA: + return "usr"; + case GRPQUOTA: + return "grp"; + case PRJQUOTA: + return "prj"; + } + return "unknown"; +} + +#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 + +/* permission */ +#define N_PERMS_MAX 64 + +struct perm_downcall_data { + __u64 pdd_nid; + __u32 pdd_perm; + __u32 pdd_padding; +}; + +struct identity_downcall_data { + __u32 idd_magic; + __u32 idd_err; + __u32 idd_uid; + __u32 idd_gid; + __u32 idd_nperms; + __u32 idd_ngroups; + struct perm_downcall_data idd_perms[N_PERMS_MAX]; + __u32 idd_groups[0]; +}; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 16, 53, 0) +/* old interface struct is deprecated in 2.14 */ +#define SEPOL_DOWNCALL_MAGIC_OLD 0x8b8bb842 +struct sepol_downcall_data_old { + __u32 sdd_magic; + __s64 sdd_sepol_mtime; + __u16 sdd_sepol_len; + char sdd_sepol[0]; +}; +#endif + +#define SEPOL_DOWNCALL_MAGIC 0x8b8bb843 +struct sepol_downcall_data { + __u32 sdd_magic; + __u16 sdd_sepol_len; + __u16 sdd_padding1; + __s64 sdd_sepol_mtime; + char sdd_sepol[0]; +}; + +#ifdef NEED_QUOTA_DEFS +#ifndef QIF_BLIMITS +#define QIF_BLIMITS 1 +#define QIF_SPACE 2 +#define QIF_ILIMITS 4 +#define QIF_INODES 8 +#define QIF_BTIME 16 +#define QIF_ITIME 32 +#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) +#define QIF_USAGE (QIF_SPACE | QIF_INODES) +#define QIF_TIMES (QIF_BTIME | QIF_ITIME) +#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES) +#endif + +#endif /* !__KERNEL__ */ + +/* lustre volatile file support + * file name header: ".^L^S^T^R:volatile" + */ +#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" +#define LUSTRE_VOLATILE_HDR_LEN 14 + +enum lustre_quota_version { + LUSTRE_QUOTA_V2 = 1 +}; + +/* XXX: same as if_dqinfo struct in kernel */ +struct obd_dqinfo { + __u64 dqi_bgrace; + __u64 dqi_igrace; + __u32 dqi_flags; + __u32 dqi_valid; +}; + +/* XXX: same as if_dqblk struct in kernel, plus one padding */ +struct obd_dqblk { + __u64 dqb_bhardlimit; /* kbytes unit */ + __u64 dqb_bsoftlimit; /* kbytes unit */ + __u64 dqb_curspace; /* bytes unit */ + __u64 dqb_ihardlimit; + __u64 dqb_isoftlimit; + __u64 dqb_curinodes; + __u64 dqb_btime; + __u64 dqb_itime; + __u32 dqb_valid; + __u32 dqb_padding; +}; + +enum { + QC_GENERAL = 0, + QC_MDTIDX = 1, + QC_OSTIDX = 2, + QC_UUID = 3 +}; + +struct if_quotactl { + __u32 qc_cmd; + __u32 qc_type; + __u32 qc_id; + __u32 qc_stat; + __u32 qc_valid; + __u32 qc_idx; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char obd_type[16]; + struct obd_uuid obd_uuid; + char qc_poolname[]; +}; + +/* swap layout flags */ +#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) +#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) +#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2) +#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) +#define SWAP_LAYOUTS_CLOSE (1 << 4) + +/* Swap XATTR_NAME_HSM as well, only on the MDT so far */ +#define SWAP_LAYOUTS_MDS_HSM (1 << 31) +struct lustre_swap_layouts { + __u64 sl_flags; + __u32 sl_fd; + __u32 sl_gid; + __u64 sl_dv1; + __u64 sl_dv2; +}; + +/** Bit-mask of valid attributes */ +/* The LA_* flags are written to disk as part of the ChangeLog records + * so they are part of the on-disk and network 
protocol, and cannot be changed. + * Only the first 12 bits are currently saved. + */ +enum la_valid { + LA_ATIME = 1 << 0, /* 0x00001 */ + LA_MTIME = 1 << 1, /* 0x00002 */ + LA_CTIME = 1 << 2, /* 0x00004 */ + LA_SIZE = 1 << 3, /* 0x00008 */ + LA_MODE = 1 << 4, /* 0x00010 */ + LA_UID = 1 << 5, /* 0x00020 */ + LA_GID = 1 << 6, /* 0x00040 */ + LA_BLOCKS = 1 << 7, /* 0x00080 */ + LA_TYPE = 1 << 8, /* 0x00100 */ + LA_FLAGS = 1 << 9, /* 0x00200 */ + LA_NLINK = 1 << 10, /* 0x00400 */ + LA_RDEV = 1 << 11, /* 0x00800 */ + LA_BLKSIZE = 1 << 12, /* 0x01000 */ + LA_KILL_SUID = 1 << 13, /* 0x02000 */ + LA_KILL_SGID = 1 << 14, /* 0x04000 */ + LA_PROJID = 1 << 15, /* 0x08000 */ + LA_LAYOUT_VERSION = 1 << 16, /* 0x10000 */ + LA_LSIZE = 1 << 17, /* 0x20000 */ + LA_LBLOCKS = 1 << 18, /* 0x40000 */ + LA_BTIME = 1 << 19, /* 0x80000 */ + /** + * Attributes must be transmitted to OST objects + */ + LA_REMOTE_ATTR_SET = (LA_UID | LA_GID | LA_PROJID | LA_LAYOUT_VERSION) +}; + +#define MDS_FMODE_READ 00000001 +#define MDS_FMODE_WRITE 00000002 + +#define MDS_FMODE_CLOSED 00000000 +#define MDS_FMODE_EXEC 00000004 +/* MDS_FMODE_EPOCH 01000000 obsolete since 2.8.0 */ +/* MDS_FMODE_TRUNC 02000000 obsolete since 2.8.0 */ +/* MDS_FMODE_SOM 04000000 obsolete since 2.8.0 */ + +#define MDS_OPEN_CREATED 00000010 +/* MDS_OPEN_CROSS 00000020 obsolete in 2.12, internal use only */ + +#define MDS_OPEN_CREAT 00000100 +#define MDS_OPEN_EXCL 00000200 +#define MDS_OPEN_TRUNC 00001000 +#define MDS_OPEN_APPEND 00002000 +#define MDS_OPEN_SYNC 00010000 +#define MDS_OPEN_DIRECTORY 00200000 + +#define MDS_OPEN_NOIMPORT 020000000 /* nocache object create */ +#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */ +#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */ +#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */ +#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file. + * We do not support JOIN FILE + * anymore, reserve this flags + * just for preventing such bit + * to be reused. */ + +#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */ +#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */ +#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA the obj exist */ +#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */ +#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or + * hsm restore) */ +#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created + unlinked */ +#define MDS_OPEN_LEASE 01000000000000ULL /* Open the file and grant lease + * delegation, succeed if it's not + * being opened with conflict mode. 
+ */ +#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */ + +#define MDS_OPEN_RESYNC 04000000000000ULL /* FLR: file resync */ +#define MDS_OPEN_PCC 010000000000000ULL /* PCC: auto RW-PCC cache attach + * for newly created file */ +#define MDS_OP_WITH_FID 020000000000000ULL /* operation carried out by FID */ +#define MDS_OPEN_DEFAULT_LMV 040000000000000ULL /* open fetches default LMV */ + +/* lustre internal open flags, which should not be set from user space */ +#define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | \ + MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | \ + MDS_OPEN_BY_FID | MDS_OPEN_LEASE | \ + MDS_OPEN_RELEASE | MDS_OPEN_RESYNC | \ + MDS_OPEN_PCC | MDS_OP_WITH_FID | \ + MDS_OPEN_DEFAULT_LMV) + + +/********* Changelogs **********/ +/** Changelog record types */ +enum changelog_rec_type { + CL_NONE = -1, + CL_MARK = 0, + CL_CREATE = 1, /* namespace */ + CL_MKDIR = 2, /* namespace */ + CL_HARDLINK = 3, /* namespace */ + CL_SOFTLINK = 4, /* namespace */ + CL_MKNOD = 5, /* namespace */ + CL_UNLINK = 6, /* namespace */ + CL_RMDIR = 7, /* namespace */ + CL_RENAME = 8, /* namespace */ + CL_EXT = 9, /* namespace extended record (2nd half of rename) */ + CL_OPEN = 10, /* not currently used */ + CL_CLOSE = 11, /* may be written to log only with mtime change */ + CL_LAYOUT = 12, /* file layout/striping modified */ + CL_TRUNC = 13, + CL_SETATTR = 14, + CL_SETXATTR = 15, + CL_XATTR = CL_SETXATTR, /* Deprecated name */ + CL_HSM = 16, /* HSM specific events, see flags */ + CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ + CL_CTIME = 18, + CL_ATIME = 19, + CL_MIGRATE = 20, + CL_FLRW = 21, /* FLR: file was firstly written */ + CL_RESYNC = 22, /* FLR: file was resync-ed */ + CL_GETXATTR = 23, + CL_DN_OPEN = 24, /* denied open */ + CL_LAST, +}; + +static inline const char *changelog_type2str(int type) { + static const char *const changelog_str[] = { + "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", + "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC", + "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", "MIGRT", + "FLRW", "RESYNC","GXATR", "NOPEN", + }; + + if (type >= 0 && type < CL_LAST) + return changelog_str[type]; + return NULL; +} + +/* 12 bits of per-record data can be stored in the bottom of the flags */ +#define CLF_FLAGSHIFT 12 +enum changelog_rec_flags { + CLF_VERSION = 0x1000, + CLF_RENAME = 0x2000, + CLF_JOBID = 0x4000, + CLF_EXTRA_FLAGS = 0x8000, + CLF_SUPPORTED = CLF_VERSION | CLF_RENAME | CLF_JOBID | + CLF_EXTRA_FLAGS, + CLF_FLAGMASK = (1U << CLF_FLAGSHIFT) - 1, + CLF_VERMASK = ~CLF_FLAGMASK, +}; + + +/* Anything under the flagmask may be per-type (if desired) */ +/* Flags for unlink */ +#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ +#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ + /* HSM cleaning needed */ +/* Flags for rename */ +#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink + * of target */ +#define CLF_RENAME_LAST_EXISTS 0x0002 /* rename unlink last hardlink of target + * has an archive in backend */ + +/* Flags for HSM */ +/* 12b used (from high weight to low weight): + * 2b for flags + * 3b for event + * 7b for error code + */ +#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ +#define CLF_HSM_ERR_H 6 +#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ +#define CLF_HSM_EVENT_H 9 +#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ +#define CLF_HSM_FLAG_H 11 +#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ 
+#define CLF_HSM_SPARE_H 15 +#define CLF_HSM_LAST 15 + +/* Remove bits higher than _h, then extract the value + * between _h and _l by shifting lower weigth to bit 0. */ +#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \ + >> (CLF_HSM_LAST - _h + _l)) + +#define CLF_HSM_SUCCESS 0x00 +#define CLF_HSM_MAXERROR 0x7E +#define CLF_HSM_ERROVERFLOW 0x7F + +#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */ + +/* 3 bits field => 8 values allowed */ +enum hsm_event { + HE_ARCHIVE = 0, + HE_RESTORE = 1, + HE_CANCEL = 2, + HE_RELEASE = 3, + HE_REMOVE = 4, + HE_STATE = 5, + HE_SPARE1 = 6, + HE_SPARE2 = 7, + /* Leaving HE_SPARE2 as is. Its referred in the Lemur code */ + HE_IMPORT = 7, +}; + +static inline enum hsm_event hsm_get_cl_event(__u16 flags) +{ + return (enum hsm_event)CLF_GET_BITS(flags, CLF_HSM_EVENT_H, + CLF_HSM_EVENT_L); +} + +static inline void hsm_set_cl_event(enum changelog_rec_flags *clf_flags, + enum hsm_event he) +{ + *clf_flags = (enum changelog_rec_flags) + (*clf_flags | (he << CLF_HSM_EVENT_L)); +} + +static inline __u16 hsm_get_cl_flags(enum changelog_rec_flags clf_flags) +{ + return CLF_GET_BITS(clf_flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L); +} + +static inline void hsm_set_cl_flags(enum changelog_rec_flags *clf_flags, + unsigned int bits) +{ + *clf_flags = (enum changelog_rec_flags) + (*clf_flags | (bits << CLF_HSM_FLAG_L)); +} + +static inline int hsm_get_cl_error(enum changelog_rec_flags clf_flags) +{ + return CLF_GET_BITS(clf_flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L); +} + +static inline void hsm_set_cl_error(enum changelog_rec_flags *clf_flags, + unsigned int error) +{ + *clf_flags = (enum changelog_rec_flags) + (*clf_flags | (error << CLF_HSM_ERR_L)); +} + +enum changelog_rec_extra_flags { + CLFE_INVALID = 0, + CLFE_UIDGID = 0x0001, + CLFE_NID = 0x0002, + CLFE_OPEN = 0x0004, + CLFE_XATTR = 0x0008, + CLFE_SUPPORTED = CLFE_UIDGID | CLFE_NID | CLFE_OPEN | CLFE_XATTR +}; + +enum changelog_send_flag { + /* Not yet implemented */ + CHANGELOG_FLAG_FOLLOW = 0x01, + /* Blocking IO makes sense in case of slow user parsing of the records, + * but it also prevents us from cleaning up if the records are not + * consumed. */ + CHANGELOG_FLAG_BLOCK = 0x02, + /* Pack jobid into the changelog records if available. */ + CHANGELOG_FLAG_JOBID = 0x04, + /* Pack additional flag bits into the changelog record */ + CHANGELOG_FLAG_EXTRA_FLAGS = 0x08, +}; + +enum changelog_send_extra_flag { + /* Pack uid/gid into the changelog record */ + CHANGELOG_EXTRA_FLAG_UIDGID = 0x01, + /* Pack nid into the changelog record */ + CHANGELOG_EXTRA_FLAG_NID = 0x02, + /* Pack open mode into the changelog record */ + CHANGELOG_EXTRA_FLAG_OMODE = 0x04, + /* Pack xattr name into the changelog record */ + CHANGELOG_EXTRA_FLAG_XATTR = 0x08, +}; + +/* unlink/rename/rmdir would log with the full path. + * Set the max to use PATH_MAX + */ +#define CR_MAXSIZE __ALIGN_KERNEL(NAME_MAX + PATH_MAX + 2 + \ + changelog_rec_offset(CLF_SUPPORTED, \ + CLFE_SUPPORTED), 8) + +/* 31 usable bytes string + null terminator. */ +#define LUSTRE_JOBID_SIZE 32 + +/* This is the minimal changelog record. It can contain extensions + * such as rename fields or process jobid. Its exact content is described + * by the cr_flags and cr_extra_flags. + * + * Extensions are packed in the same order as their corresponding flags, + * then in the same order as their corresponding extra flags. 
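Editor's illustrative sketch (not part of the patch): packing and unpacking the HSM sub-fields that live in the low 12 bits of a CL_HSM changelog record's flags, using the helpers above; the error value is arbitrary.

#include <assert.h>
#include <linux/lustre/lustre_user.h>

static void hsm_clf_demo(void)
{
	enum changelog_rec_flags clf = (enum changelog_rec_flags)0;

	hsm_set_cl_event(&clf, HE_RESTORE);
	hsm_set_cl_flags(&clf, CLF_HSM_DIRTY);
	hsm_set_cl_error(&clf, 2);

	assert(hsm_get_cl_event((__u16)clf) == HE_RESTORE);
	assert(hsm_get_cl_flags(clf) == CLF_HSM_DIRTY);
	assert(hsm_get_cl_error(clf) == 2);
}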
+ */ +struct changelog_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< \a changelog_rec_flags */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + struct lu_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + struct lu_fid cr_pfid; /**< parent fid */ +} __attribute__ ((packed)); + +/* Changelog extension for RENAME. */ +struct changelog_ext_rename { + struct lu_fid cr_sfid; /**< source fid, or zero */ + struct lu_fid cr_spfid; /**< source parent fid, or zero */ +}; + +/* Changelog extension to include JOBID. */ +struct changelog_ext_jobid { + char cr_jobid[LUSTRE_JOBID_SIZE]; /**< zero-terminated string. */ +}; + +/* Changelog extension to include additional flags. */ +struct changelog_ext_extra_flags { + __u64 cr_extra_flags; /* Additional CLFE_* flags */ +}; + +/* Changelog extra extension to include UID/GID. */ +struct changelog_ext_uidgid { + __u64 cr_uid; + __u64 cr_gid; +}; + +/* Changelog extra extension to include NID. */ +struct changelog_ext_nid { + /* have __u64 instead of lnet_nid_t type for use by client api */ + __u64 cr_nid; + /* for use when IPv6 support is added */ + __u64 extra; + __u32 padding; +}; + +/* Changelog extra extension to include low 32 bits of MDS_OPEN_* flags. */ +struct changelog_ext_openmode { + __u32 cr_openflags; +}; + +/* Changelog extra extension to include xattr */ +struct changelog_ext_xattr { + char cr_xattr[XATTR_NAME_MAX + 1]; /**< zero-terminated string. */ +}; + +static inline struct changelog_ext_extra_flags *changelog_rec_extra_flags( + const struct changelog_rec *rec); + +static inline __kernel_size_t changelog_rec_offset(enum changelog_rec_flags crf, + enum changelog_rec_extra_flags cref) +{ + __kernel_size_t size = sizeof(struct changelog_rec); + + if (crf & CLF_RENAME) + size += sizeof(struct changelog_ext_rename); + + if (crf & CLF_JOBID) + size += sizeof(struct changelog_ext_jobid); + + if (crf & CLF_EXTRA_FLAGS) { + size += sizeof(struct changelog_ext_extra_flags); + if (cref & CLFE_UIDGID) + size += sizeof(struct changelog_ext_uidgid); + if (cref & CLFE_NID) + size += sizeof(struct changelog_ext_nid); + if (cref & CLFE_OPEN) + size += sizeof(struct changelog_ext_openmode); + if (cref & CLFE_XATTR) + size += sizeof(struct changelog_ext_xattr); + } + + return size; +} + +static inline __kernel_size_t changelog_rec_size(const struct changelog_rec *rec) +{ + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = (enum changelog_rec_extra_flags) + changelog_rec_extra_flags(rec)->cr_extra_flags; + + return changelog_rec_offset( + (enum changelog_rec_flags)rec->cr_flags, cref); +} + +static inline __kernel_size_t changelog_rec_varsize(const struct changelog_rec *rec) +{ + return changelog_rec_size(rec) - sizeof(*rec) + rec->cr_namelen; +} + +static inline +struct changelog_ext_rename *changelog_rec_rename(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = (enum changelog_rec_flags) + (rec->cr_flags & CLF_VERSION); + + return (struct changelog_ext_rename *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The jobid follows the rename extension, if present */ +static inline +struct changelog_ext_jobid *changelog_rec_jobid(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = (enum changelog_rec_flags) + (rec->cr_flags & (CLF_VERSION | CLF_RENAME)); + + return (struct 
changelog_ext_jobid *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The additional flags follow the rename and jobid extensions, if present */ +static inline +struct changelog_ext_extra_flags *changelog_rec_extra_flags( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = (enum changelog_rec_flags) + (rec->cr_flags & (CLF_VERSION | CLF_RENAME | CLF_JOBID)); + + return (struct changelog_ext_extra_flags *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The uid/gid is the first extra extension */ +static inline +struct changelog_ext_uidgid *changelog_rec_uidgid( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = (enum changelog_rec_flags) + (rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS)); + + return (struct changelog_ext_uidgid *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The nid is the second extra extension */ +static inline +struct changelog_ext_nid *changelog_rec_nid(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = (enum changelog_rec_flags) + (rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS)); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = (enum changelog_rec_extra_flags) + (changelog_rec_extra_flags(rec)->cr_extra_flags & + CLFE_UIDGID); + + return (struct changelog_ext_nid *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The OPEN mode is the third extra extension */ +static inline +struct changelog_ext_openmode *changelog_rec_openmode( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = (enum changelog_rec_flags) + (rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS)); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) { + cref = (enum changelog_rec_extra_flags) + (changelog_rec_extra_flags(rec)->cr_extra_flags & + (CLFE_UIDGID | CLFE_NID)); + } + + return (struct changelog_ext_openmode *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The xattr name is the fourth extra extension */ +static inline +struct changelog_ext_xattr *changelog_rec_xattr( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = (enum changelog_rec_flags) + (rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS)); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = (enum changelog_rec_extra_flags) + (changelog_rec_extra_flags(rec)->cr_extra_flags & + (CLFE_UIDGID | CLFE_NID | CLFE_OPEN)); + + return (struct changelog_ext_xattr *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The name follows the rename, jobid and extra flags extns, if present */ +static inline char *changelog_rec_name(const struct changelog_rec *rec) +{ + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = (enum changelog_rec_extra_flags) + changelog_rec_extra_flags(rec)->cr_extra_flags; + + return (char *)rec + changelog_rec_offset( + (enum changelog_rec_flags)(rec->cr_flags & CLF_SUPPORTED), + (enum changelog_rec_extra_flags)(cref & CLFE_SUPPORTED)); +} + +static inline char *changelog_rec_sname(const struct changelog_rec *rec) +{ + char *str = changelog_rec_name(rec); + + while (*str != '\0') + str++; + return str + 1; +} + +static inline __kernel_size_t changelog_rec_snamelen(const struct changelog_rec *rec) +{ + return 
strlen(changelog_rec_sname(rec)); +} + +/** + * Remap a record to the desired format as specified by the crf flags. + * The record must be big enough to contain the final remapped version. + * Superfluous extension fields are removed and missing ones are added + * and zeroed. The flags of the record are updated accordingly. + * + * The jobid and rename extensions can be added to a record, to match the + * format an application expects, typically. In this case, the newly added + * fields will be zeroed. + * The Jobid field can be removed, to guarantee compatibility with older + * clients that don't expect this field in the records they process. + * + * The following assumptions are being made: + * - CLF_RENAME will not be removed + * - CLF_JOBID will not be added without CLF_RENAME being added too + * - CLF_EXTRA_FLAGS will not be added without CLF_JOBID being added too + * + * @param[in,out] rec The record to remap. + * @param[in] crf_wanted Flags describing the desired extensions. + * @param[in] cref_want Flags describing the desired extra extensions. + */ +static inline int changelog_remap_rec(struct changelog_rec *rec, + __kernel_size_t rec_size, + enum changelog_rec_flags crf_wanted, + enum changelog_rec_extra_flags cref_want) +{ + char *xattr_mov = NULL; + char *omd_mov = NULL; + char *nid_mov = NULL; + char *uidgid_mov = NULL; + char *ef_mov; + char *jid_mov; + char *rnm_mov; + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + crf_wanted = (enum changelog_rec_flags) + (crf_wanted & CLF_SUPPORTED); + cref_want = (enum changelog_rec_extra_flags) + (cref_want & CLFE_SUPPORTED); + + if ((rec->cr_flags & CLF_SUPPORTED) == crf_wanted) { + if (!(rec->cr_flags & CLF_EXTRA_FLAGS) || + (rec->cr_flags & CLF_EXTRA_FLAGS && + (changelog_rec_extra_flags(rec)->cr_extra_flags & + CLFE_SUPPORTED) == + cref_want)) + return 0; + } + + if ((changelog_rec_offset(crf_wanted, cref_want) + rec->cr_namelen) > + rec_size) + return -EOVERFLOW; + + /* First move the variable-length name field */ + memmove((char *)rec + changelog_rec_offset(crf_wanted, cref_want), + changelog_rec_name(rec), rec->cr_namelen); + + /* Locations of extensions in the remapped record */ + if (rec->cr_flags & CLF_EXTRA_FLAGS) { + xattr_mov = (char *)rec + + changelog_rec_offset( + (enum changelog_rec_flags) + (crf_wanted & CLF_SUPPORTED), + (enum changelog_rec_extra_flags) + (cref_want & ~CLFE_XATTR)); + omd_mov = (char *)rec + + changelog_rec_offset( + (enum changelog_rec_flags) + (crf_wanted & CLF_SUPPORTED), + (enum changelog_rec_extra_flags) + (cref_want & ~(CLFE_OPEN | CLFE_XATTR))); + nid_mov = (char *)rec + + changelog_rec_offset( + (enum changelog_rec_flags) + (crf_wanted & CLF_SUPPORTED), + (enum changelog_rec_extra_flags) + (cref_want & + ~(CLFE_NID | CLFE_OPEN | CLFE_XATTR))); + uidgid_mov = (char *)rec + + changelog_rec_offset( + (enum changelog_rec_flags) + (crf_wanted & CLF_SUPPORTED), + (enum changelog_rec_extra_flags) + (cref_want & ~(CLFE_UIDGID | + CLFE_NID | + CLFE_OPEN | + CLFE_XATTR))); + cref = (enum changelog_rec_extra_flags) + changelog_rec_extra_flags(rec)->cr_extra_flags; + } + + ef_mov = (char *)rec + + changelog_rec_offset( + (enum changelog_rec_flags) + (crf_wanted & ~CLF_EXTRA_FLAGS), CLFE_INVALID); + jid_mov = (char *)rec + + changelog_rec_offset((enum changelog_rec_flags)(crf_wanted & + ~(CLF_EXTRA_FLAGS | CLF_JOBID)), + CLFE_INVALID); + rnm_mov = (char *)rec + + changelog_rec_offset((enum changelog_rec_flags)(crf_wanted & + ~(CLF_EXTRA_FLAGS | + CLF_JOBID | + CLF_RENAME)), + CLFE_INVALID); + + /* 
Move the extension fields to the desired positions */ + if ((crf_wanted & CLF_EXTRA_FLAGS) && + (rec->cr_flags & CLF_EXTRA_FLAGS)) { + if ((cref_want & CLFE_XATTR) && (cref & CLFE_XATTR)) + memmove(xattr_mov, changelog_rec_xattr(rec), + sizeof(struct changelog_ext_xattr)); + + if ((cref_want & CLFE_OPEN) && (cref & CLFE_OPEN)) + memmove(omd_mov, changelog_rec_openmode(rec), + sizeof(struct changelog_ext_openmode)); + + if ((cref_want & CLFE_NID) && (cref & CLFE_NID)) + memmove(nid_mov, changelog_rec_nid(rec), + sizeof(struct changelog_ext_nid)); + + if ((cref_want & CLFE_UIDGID) && (cref & CLFE_UIDGID)) + memmove(uidgid_mov, changelog_rec_uidgid(rec), + sizeof(struct changelog_ext_uidgid)); + + memmove(ef_mov, changelog_rec_extra_flags(rec), + sizeof(struct changelog_ext_extra_flags)); + } + + if ((crf_wanted & CLF_JOBID) && (rec->cr_flags & CLF_JOBID)) + memmove(jid_mov, changelog_rec_jobid(rec), + sizeof(struct changelog_ext_jobid)); + + if ((crf_wanted & CLF_RENAME) && (rec->cr_flags & CLF_RENAME)) + memmove(rnm_mov, changelog_rec_rename(rec), + sizeof(struct changelog_ext_rename)); + + /* Clear newly added fields */ + if (xattr_mov && (cref_want & CLFE_XATTR) && + !(cref & CLFE_XATTR)) + memset(xattr_mov, 0, sizeof(struct changelog_ext_xattr)); + + if (omd_mov && (cref_want & CLFE_OPEN) && + !(cref & CLFE_OPEN)) + memset(omd_mov, 0, sizeof(struct changelog_ext_openmode)); + + if (nid_mov && (cref_want & CLFE_NID) && + !(cref & CLFE_NID)) + memset(nid_mov, 0, sizeof(struct changelog_ext_nid)); + + if (uidgid_mov && (cref_want & CLFE_UIDGID) && + !(cref & CLFE_UIDGID)) + memset(uidgid_mov, 0, sizeof(struct changelog_ext_uidgid)); + + if ((crf_wanted & CLF_EXTRA_FLAGS) && + !(rec->cr_flags & CLF_EXTRA_FLAGS)) + memset(ef_mov, 0, sizeof(struct changelog_ext_extra_flags)); + + if ((crf_wanted & CLF_JOBID) && !(rec->cr_flags & CLF_JOBID)) + memset(jid_mov, 0, sizeof(struct changelog_ext_jobid)); + + if ((crf_wanted & CLF_RENAME) && !(rec->cr_flags & CLF_RENAME)) + memset(rnm_mov, 0, sizeof(struct changelog_ext_rename)); + + /* Update the record's flags accordingly */ + rec->cr_flags = (rec->cr_flags & CLF_FLAGMASK) | crf_wanted; + if (rec->cr_flags & CLF_EXTRA_FLAGS) + changelog_rec_extra_flags(rec)->cr_extra_flags = + changelog_rec_extra_flags(rec)->cr_extra_flags | + cref_want; + + return 0; +} + +enum changelog_message_type { + CL_RECORD = 10, /* message is a changelog_rec */ + CL_EOF = 11, /* at end of current changelog */ +}; + +/********* Misc **********/ + +struct ioc_data_version { + __u64 idv_version; + __u32 idv_layout_version; /* FLR: layout version for OST objects */ + __u32 idv_flags; /* enum ioc_data_version_flags */ +}; + +enum ioc_data_version_flags { + LL_DV_RD_FLUSH = (1 << 0), /* Flush dirty pages from clients */ + LL_DV_WR_FLUSH = (1 << 1), /* Flush all caching pages from clients */ + LL_DV_SZ_UPDATE = (1 << 2), /* Update the file size on the client */ +}; + +#ifndef offsetof +#define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb))) +#endif + +#define dot_lustre_name ".lustre" +#define dot_fscrypt_name ".fscrypt" + + +/********* HSM **********/ + +/** HSM per-file state + * See HSM_FLAGS below. + */ +enum hsm_states { + HS_NONE = 0x00000000, + HS_EXISTS = 0x00000001, + HS_DIRTY = 0x00000002, + HS_RELEASED = 0x00000004, + HS_ARCHIVED = 0x00000008, + HS_NORELEASE = 0x00000010, + HS_NOARCHIVE = 0x00000020, + HS_LOST = 0x00000040, +}; + +/* HSM user-setable flags. */ +#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY) + +/* Other HSM flags. 
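As a rough usage sketch (editorial, not part of the patch), a changelog consumer that wants every record it parses to carry the rename and jobid extensions could remap records in place with changelog_remap_rec(); the wrapper name below is hypothetical and assumes the record buffer was allocated with enough headroom:

/* Hypothetical helper: force the rename and jobid extensions on a record. */
static int reader_normalize_rec(struct changelog_rec *rec,
				__kernel_size_t rec_size)
{
	/* Add CLF_RENAME and CLF_JOBID together, per the assumptions above. */
	enum changelog_rec_flags crf_wanted = (enum changelog_rec_flags)
		(rec->cr_flags | CLF_RENAME | CLF_JOBID);

	/* No extra (CLFE_*) extensions are requested here. */
	return changelog_remap_rec(rec, rec_size, crf_wanted,
				   (enum changelog_rec_extra_flags)0);
}

This returns 0 on success, or -EOVERFLOW when rec_size cannot hold the remapped record.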
*/ +#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED) + +/* + * All HSM-related possible flags that could be applied to a file. + * This should be kept in sync with hsm_states. + */ +#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK) + +/** + * HSM request progress state + */ +enum hsm_progress_states { + HPS_NONE = 0, + HPS_WAITING = 1, + HPS_RUNNING = 2, + HPS_DONE = 3, +}; + +static inline const char *hsm_progress_state2name(enum hsm_progress_states s) +{ + switch (s) { + case HPS_WAITING: return "waiting"; + case HPS_RUNNING: return "running"; + case HPS_DONE: return "done"; + default: return "unknown"; + } +} + +struct hsm_extent { + __u64 offset; + __u64 length; +} __attribute__((packed)); + +/** + * Current HSM states of a Lustre file. + * + * This structure purpose is to be sent to user-space mainly. It describes the + * current HSM flags and in-progress action. + */ +struct hsm_user_state { + /** Current HSM states, from enum hsm_states. */ + __u32 hus_states; + __u32 hus_archive_id; + /** The current undergoing action, if there is one */ + __u32 hus_in_progress_state; + __u32 hus_in_progress_action; + struct hsm_extent hus_in_progress_location; + char hus_extended_info[]; +}; + +struct hsm_state_set_ioc { + struct lu_fid hssi_fid; + __u64 hssi_setmask; + __u64 hssi_clearmask; +}; + +/* + * This structure describes the current in-progress action for a file. + * it is retuned to user space and send over the wire + */ +struct hsm_current_action { + /** The current undergoing action, if there is one */ + /* state is one of hsm_progress_states */ + __u32 hca_state; + /* action is one of hsm_user_action */ + __u32 hca_action; + struct hsm_extent hca_location; +}; + +/***** HSM user requests ******/ +/* User-generated (lfs/ioctl) request types */ +enum hsm_user_action { + HUA_NONE = 1, /* no action (noop) */ + HUA_ARCHIVE = 10, /* copy to hsm */ + HUA_RESTORE = 11, /* prestage */ + HUA_RELEASE = 12, /* drop ost objects */ + HUA_REMOVE = 13, /* remove from archive */ + HUA_CANCEL = 14, /* cancel a request */ + HUA_IMPORT = 15, /* add a new file */ +}; + +static inline const char *hsm_user_action2name(enum hsm_user_action a) +{ + switch (a) { + case HUA_NONE: return "NOOP"; + case HUA_ARCHIVE: return "ARCHIVE"; + case HUA_RESTORE: return "RESTORE"; + case HUA_RELEASE: return "RELEASE"; + case HUA_REMOVE: return "REMOVE"; + case HUA_CANCEL: return "CANCEL"; + case HUA_IMPORT: return "IMPORT"; + default: return "UNKNOWN"; + } +} + +/* + * List of hr_flags (bit field) + */ +#define HSM_FORCE_ACTION 0x0001 +/* used by CT, cannot be set by user */ +#define HSM_GHOST_COPY 0x0002 + +/** + * Contains all the fixed part of struct hsm_user_request. 
+ * + */ +struct hsm_request { + __u32 hr_action; /* enum hsm_user_action */ + __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */ + __u64 hr_flags; /* request flags */ + __u32 hr_itemcount; /* item count in hur_user_item vector */ + __u32 hr_data_len; +}; + +struct hsm_user_item { + struct lu_fid hui_fid; + struct hsm_extent hui_extent; +} __attribute__((packed)); + +struct hsm_user_request { + struct hsm_request hur_request; + struct hsm_user_item hur_user_item[0]; + /* extra data blob at end of struct (after all + * hur_user_items), only use helpers to access it + */ +} __attribute__((packed)); + +/** Return pointer to data field in a hsm user request */ +static inline void *hur_data(struct hsm_user_request *hur) +{ + return &(hur->hur_user_item[hur->hur_request.hr_itemcount]); +} + +/** + * Compute the current length of the provided hsm_user_request. This returns -1 + * instead of an errno because __kernel_ssize_t is defined to be only + * [ -1, SSIZE_MAX ] + * + * return -1 on bounds check error. + */ +static inline __kernel_size_t hur_len(struct hsm_user_request *hur) +{ + __u64 size; + + /* can't overflow a __u64 since hr_itemcount is only __u32 */ + size = offsetof(struct hsm_user_request, hur_user_item[0]) + + (__u64)hur->hur_request.hr_itemcount * + sizeof(hur->hur_user_item[0]) + hur->hur_request.hr_data_len; + + if ((__kernel_ssize_t)size < 0) + return -1; + + return size; +} + +/****** HSM RPCs to copytool *****/ +/* Message types the copytool may receive */ +enum hsm_message_type { + HMT_ACTION_LIST = 100, /* message is a hsm_action_list */ +}; + +/* Actions the copytool may be instructed to take for a given action_item */ +enum hsm_copytool_action { + HSMA_NONE = 10, /* no action */ + HSMA_ARCHIVE = 20, /* arbitrary offset */ + HSMA_RESTORE = 21, + HSMA_REMOVE = 22, + HSMA_CANCEL = 23, + HSMA_IMPORT = 24 +}; + +static inline const char *hsm_copytool_action2name(enum hsm_copytool_action a) +{ + switch (a) { + case HSMA_NONE: return "NOOP"; + case HSMA_ARCHIVE: return "ARCHIVE"; + case HSMA_RESTORE: return "RESTORE"; + case HSMA_REMOVE: return "REMOVE"; + case HSMA_CANCEL: return "CANCEL"; + case HSMA_IMPORT: return "IMPORT"; + default: return "UNKNOWN"; + } +} + +/* Copytool item action description */ +struct hsm_action_item { + __u32 hai_len; /* valid size of this struct */ + __u32 hai_action; /* hsm_copytool_action, but use known size */ + struct lu_fid hai_fid; /* Lustre FID to operate on */ + struct lu_fid hai_dfid; /* fid used for data access */ + struct hsm_extent hai_extent; /* byte range to operate on */ + __u64 hai_cookie; /* action cookie from coordinator */ + __u64 hai_gid; /* grouplock id */ + char hai_data[0]; /* variable length */ +} __attribute__((packed)); + +/** + * helper function which print in hexa the first bytes of + * hai opaque field + * + * \param hai [IN] record to print + * \param buffer [IN,OUT] buffer to write the hex string to + * \param len [IN] max buffer length + * + * \retval buffer + */ +static inline char *hai_dump_data_field(const struct hsm_action_item *hai, + char *buffer, __kernel_size_t len) +{ + int i; + int data_len; + char *ptr; + + ptr = buffer; + data_len = hai->hai_len - sizeof(*hai); + for (i = 0; (i < data_len) && (len > 2); i++) { + snprintf(ptr, 3, "%02X", (unsigned char)hai->hai_data[i]); + ptr += 2; + len -= 2; + } + + *ptr = '\0'; + + return buffer; +} + +/* Copytool action list */ +#define HAL_VERSION 1 +#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ +struct hsm_action_list { + __u32 
hal_version; + __u32 hal_count; /* number of hai's to follow */ + __u64 hal_compound_id; /* returned by coordinator, ignored */ + __u64 hal_flags; + __u32 hal_archive_id; /* which archive backend */ + __u32 padding1; + char hal_fsname[0]; /* null-terminated */ + /* struct hsm_action_item[hal_count] follows, aligned on 8-byte + boundaries. See hai_zero */ +} __attribute__((packed)); + +/* Return pointer to first hai in action list */ +static inline struct hsm_action_item *hai_first(struct hsm_action_list *hal) +{ + __kernel_size_t offset = __ALIGN_KERNEL(strlen(hal->hal_fsname) + 1, 8); + + return (struct hsm_action_item *)(hal->hal_fsname + offset); +} + +/* Return pointer to next hai */ +static inline struct hsm_action_item * hai_next(struct hsm_action_item *hai) +{ + __kernel_size_t offset = __ALIGN_KERNEL(hai->hai_len, 8); + + return (struct hsm_action_item *)((char *)hai + offset); +} + +/* Return size of an hsm_action_list */ +static inline __kernel_size_t hal_size(struct hsm_action_list *hal) +{ + __u32 i; + __kernel_size_t sz; + struct hsm_action_item *hai; + + sz = sizeof(*hal) + __ALIGN_KERNEL(strlen(hal->hal_fsname) + 1, 8); + hai = hai_first(hal); + for (i = 0; i < hal->hal_count ; i++, hai = hai_next(hai)) + sz += __ALIGN_KERNEL(hai->hai_len, 8); + + return sz; +} + +/* HSM file import + * describe the attributes to be set on imported file + */ +struct hsm_user_import { + __u64 hui_size; + __u64 hui_atime; + __u64 hui_mtime; + __u32 hui_atime_ns; + __u32 hui_mtime_ns; + __u32 hui_uid; + __u32 hui_gid; + __u32 hui_mode; + __u32 hui_archive_id; +}; + +/* Copytool progress reporting */ +#define HP_FLAG_COMPLETED 0x01 +#define HP_FLAG_RETRY 0x02 + +struct hsm_progress { + struct lu_fid hp_fid; + __u64 hp_cookie; + struct hsm_extent hp_extent; + __u16 hp_flags; + __u16 hp_errval; /* positive val */ + __u32 padding; +}; + +struct hsm_copy { + __u64 hc_data_version; + __u16 hc_flags; + __u16 hc_errval; /* positive val */ + __u32 padding; + struct hsm_action_item hc_hai; +}; + +enum lu_ladvise_type { + LU_LADVISE_INVALID = 0, + LU_LADVISE_WILLREAD = 1, + LU_LADVISE_DONTNEED = 2, + LU_LADVISE_LOCKNOEXPAND = 3, + LU_LADVISE_LOCKAHEAD = 4, + LU_LADVISE_MAX +}; + +#define LU_LADVISE_NAMES { \ + [LU_LADVISE_WILLREAD] = "willread", \ + [LU_LADVISE_DONTNEED] = "dontneed", \ + [LU_LADVISE_LOCKNOEXPAND] = "locknoexpand", \ + [LU_LADVISE_LOCKAHEAD] = "lockahead", \ +} + +/* This is the userspace argument for ladvise. It is currently the same as + * what goes on the wire (struct lu_ladvise), but is defined separately as we + * may need info which is only used locally. */ +struct llapi_lu_ladvise { + __u16 lla_advice; /* advice type */ + __u16 lla_value1; /* values for different advice types */ + __u32 lla_value2; + __u64 lla_start; /* first byte of extent for advice */ + __u64 lla_end; /* last byte of extent for advice */ + __u32 lla_value3; + __u32 lla_value4; +}; + +enum ladvise_flag { + LF_ASYNC = 0x00000001, + LF_UNSET = 0x00000002, +}; + +#define LADVISE_MAGIC 0x1ADF1CE0 +/* Masks of valid flags for each advice */ +#define LF_LOCKNOEXPAND_MASK LF_UNSET +/* Flags valid for all advices not explicitly specified */ +#define LF_DEFAULT_MASK LF_ASYNC +/* All flags */ +#define LF_MASK (LF_ASYNC | LF_UNSET) + +#define lla_lockahead_mode lla_value1 +#define lla_peradvice_flags lla_value2 +#define lla_lockahead_result lla_value3 + +/* This is the userspace argument for ladvise, corresponds to ladvise_hdr which + * is used on the wire. 
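For illustration only: assuming a userspace copytool that has already obtained a struct hsm_action_list (the delivery path is outside this header), the helpers above can be combined to walk the embedded items; the function name and output format below are made up:

#include <stdio.h>

/* Hypothetical consumer: iterate over every hsm_action_item in a list. */
static void walk_action_list(struct hsm_action_list *hal)
{
	struct hsm_action_item *hai = hai_first(hal);
	__u32 i;

	for (i = 0; i < hal->hal_count; i++, hai = hai_next(hai)) {
		char hex[128];

		/* hai_dump_data_field() renders hai_data as a hex string */
		printf("%s: %s cookie=%llu data=%s\n",
		       hal->hal_fsname,
		       hsm_copytool_action2name((enum hsm_copytool_action)
						hai->hai_action),
		       (unsigned long long)hai->hai_cookie,
		       hai_dump_data_field(hai, hex, sizeof(hex)));
	}
}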
It is defined separately as we may need info which is + * only used locally. */ +struct llapi_ladvise_hdr { + __u32 lah_magic; /* LADVISE_MAGIC */ + __u32 lah_count; /* number of advices */ + __u64 lah_flags; /* from enum ladvise_flag */ + __u32 lah_value1; /* unused */ + __u32 lah_value2; /* unused */ + __u64 lah_value3; /* unused */ + struct llapi_lu_ladvise lah_advise[0]; /* advices in this header */ +}; + +#define LAH_COUNT_MAX (1024) + +/* Shared key */ +enum sk_crypt_alg { + SK_CRYPT_INVALID = -1, + SK_CRYPT_EMPTY = 0, + SK_CRYPT_AES256_CTR = 1, +}; + +enum sk_hmac_alg { + SK_HMAC_INVALID = -1, + SK_HMAC_EMPTY = 0, + SK_HMAC_SHA256 = 1, + SK_HMAC_SHA512 = 2, +}; + +struct sk_crypt_type { + const char *sct_name; + int sct_type; +}; + +struct sk_hmac_type { + const char *sht_name; + int sht_type; +}; + +enum lock_mode_user { + MODE_READ_USER = 1, + MODE_WRITE_USER, + MODE_MAX_USER, +}; + +#define LOCK_MODE_NAMES { \ + [MODE_READ_USER] = "READ",\ + [MODE_WRITE_USER] = "WRITE"\ +} + +enum lockahead_results { + LLA_RESULT_SENT = 0, + LLA_RESULT_DIFFERENT, + LLA_RESULT_SAME, +}; + +enum lu_heat_flag_bit { + LU_HEAT_FLAG_BIT_INVALID = 0, + LU_HEAT_FLAG_BIT_OFF, + LU_HEAT_FLAG_BIT_CLEAR, +}; + +enum lu_heat_flag { + LU_HEAT_FLAG_OFF = 1ULL << LU_HEAT_FLAG_BIT_OFF, + LU_HEAT_FLAG_CLEAR = 1ULL << LU_HEAT_FLAG_BIT_CLEAR, +}; + +enum obd_heat_type { + OBD_HEAT_READSAMPLE = 0, + OBD_HEAT_WRITESAMPLE = 1, + OBD_HEAT_READBYTE = 2, + OBD_HEAT_WRITEBYTE = 3, + OBD_HEAT_COUNT +}; + +#define LU_HEAT_NAMES { \ + [OBD_HEAT_READSAMPLE] = "readsample", \ + [OBD_HEAT_WRITESAMPLE] = "writesample", \ + [OBD_HEAT_READBYTE] = "readbyte", \ + [OBD_HEAT_WRITEBYTE] = "writebyte", \ +} + +struct lu_heat { + __u32 lh_count; + __u32 lh_flags; + __u64 lh_heat[0]; +}; + +enum lu_pcc_type { + LU_PCC_NONE = 0, + LU_PCC_READWRITE, + LU_PCC_MAX +}; + +static inline const char *pcc_type2string(enum lu_pcc_type type) +{ + switch (type) { + case LU_PCC_NONE: + return "none"; + case LU_PCC_READWRITE: + return "readwrite"; + default: + return "fault"; + } +} + +struct lu_pcc_attach { + __u32 pcca_type; /* PCC type */ + __u32 pcca_id; /* archive ID for readwrite, group ID for readonly */ +}; + +enum lu_pcc_detach_opts { + PCC_DETACH_OPT_NONE = 0, /* Detach only, keep the PCC copy */ + PCC_DETACH_OPT_UNCACHE, /* Remove the cached file after detach */ +}; + +struct lu_pcc_detach_fid { + /* fid of the file to detach */ + struct lu_fid pccd_fid; + __u32 pccd_opt; +}; + +struct lu_pcc_detach { + __u32 pccd_opt; +}; + +enum lu_pcc_state_flags { + PCC_STATE_FL_NONE = 0x0, + /* The inode attr is cached locally */ + PCC_STATE_FL_ATTR_VALID = 0x01, + /* The file is being attached into PCC */ + PCC_STATE_FL_ATTACHING = 0x02, +}; + +struct lu_pcc_state { + __u32 pccs_type; /* enum lu_pcc_type */ + __u32 pccs_open_count; + __u32 pccs_flags; /* enum lu_pcc_state_flags */ + __u32 pccs_padding; + char pccs_path[PATH_MAX]; +}; + +enum lu_project_type { + LU_PROJECT_NONE = 0, + LU_PROJECT_SET, + LU_PROJECT_GET, + LU_PROJECT_MAX +}; + +struct lu_project { + __u32 project_type; /* enum lu_project_type */ + __u32 project_id; + __u32 project_xflags; + __u32 project_reserved; + char project_name[NAME_MAX + 1]; +}; + +struct fid_array { + __u32 fa_nr; + /* make header's size equal lu_fid */ + __u32 fa_padding0; + __u64 fa_padding1; + struct lu_fid fa_fids[0]; +}; +#define OBD_MAX_FIDS_IN_ARRAY 4096 + +/* more types could be defined upon need for more complex + * format to be used in foreign symlink LOV/LMV EAs, like + * one to describe a delimiter 
string and occurence number + * of delimited sub-string, ... + */ +enum ll_foreign_symlink_upcall_item_type { + EOB_TYPE = 1, + STRING_TYPE = 2, + POSLEN_TYPE = 3, +}; + +/* may need to be modified to allow for more format items to be defined, and + * like for ll_foreign_symlink_upcall_item_type enum + */ +struct ll_foreign_symlink_upcall_item { + __u32 type; + union { + struct { + __u32 pos; + __u32 len; + }; + struct { + size_t size; + union { + /* internal storage of constant string */ + char *string; + /* upcall stores constant string in a raw */ + char bytestring[0]; + }; + }; + }; +}; + +#define POSLEN_ITEM_SZ (offsetof(struct ll_foreign_symlink_upcall_item, len) + \ + sizeof(((struct ll_foreign_symlink_upcall_item *)0)->len)) +#define STRING_ITEM_SZ(sz) ( \ + offsetof(struct ll_foreign_symlink_upcall_item, bytestring) + \ + (sz + sizeof(__u32) - 1) / sizeof(__u32) * sizeof(__u32)) + +/* presently limited to not cause max stack frame size to be reached + * because of temporary automatic array of + * "struct ll_foreign_symlink_upcall_item" presently used in + * foreign_symlink_upcall_info_store() + */ +#define MAX_NB_UPCALL_ITEMS 32 + +#if defined(__cplusplus) +} +#endif + +/** @} lustreuser */ + +#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h new file mode 100644 index 0000000000000..5983a5ba1b366 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h @@ -0,0 +1,33 @@ +#ifndef _LUSTRE_VER_H_ +#define _LUSTRE_VER_H_ + +/* + * LUSTRE_VERSION_STRING + * + * Note that some files may seem to include this header unnecessarily. + * If the file uses LUSTRE_VERSION_STRING, it is likely doing the include + * for compatibility with the Lustre code in the Linux kernel. + * In the Linux kernel, they are likely hard coding LUSTRE_VERSION_STRING + * right here in this file. The out-of-kernel Lustre code generates + * LUSTRE_VERSION_STRING in autoconf with AC_DEFINE. + */ + +#define OBD_OCD_VERSION(major, minor, patch, fix) \ + (((major) << 24) + ((minor) << 16) + ((patch) << 8) + (fix)) + +#define OBD_OCD_VERSION_MAJOR(version) ((int)((version) >> 24) & 255) +#define OBD_OCD_VERSION_MINOR(version) ((int)((version) >> 16) & 255) +#define OBD_OCD_VERSION_PATCH(version) ((int)((version) >> 8) & 255) +#define OBD_OCD_VERSION_FIX(version) ((int)((version) >> 0) & 255) + +#define LUSTRE_VERSION_CODE \ + OBD_OCD_VERSION(LUSTRE_MAJOR, LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX) + +/* If lustre version of client and servers it connects to differs by more + * than this amount, client would issue a warning. + * (set in lustre/autoconf/lustre-version.ac) */ +#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 3, 50, 0) + +extern int allow_version_mismatch; + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/upcall_cache.h b/drivers/staging/lustrefsx/lustre/include/upcall_cache.h new file mode 100644 index 0000000000000..6ac4a40185b5e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/upcall_cache.h @@ -0,0 +1,153 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
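A small worked example of the version packing above (editorial sketch; assumes lustre_ver.h is included): each component occupies one byte, with the major number in the most significant position, so version 2.15.0.0 packs to 0x020f0000:

#include <assert.h>

static void lustre_version_packing_example(void)
{
	unsigned int v = OBD_OCD_VERSION(2, 15, 0, 0);	/* == 0x020f0000 */

	assert(OBD_OCD_VERSION_MAJOR(v) == 2);
	assert(OBD_OCD_VERSION_MINOR(v) == 15);
	assert(OBD_OCD_VERSION_PATCH(v) == 0);
	assert(OBD_OCD_VERSION_FIX(v) == 0);
}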
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _UPCALL_CACHE_H +#define _UPCALL_CACHE_H + +#include +#include + +/** \defgroup ucache ucache + * + * @{ + */ + +#define UC_CACHE_NEW 0x01 +#define UC_CACHE_ACQUIRING 0x02 +#define UC_CACHE_INVALID 0x04 +#define UC_CACHE_EXPIRED 0x08 + +#define UC_CACHE_IS_NEW(i) ((i)->ue_flags & UC_CACHE_NEW) +#define UC_CACHE_IS_INVALID(i) ((i)->ue_flags & UC_CACHE_INVALID) +#define UC_CACHE_IS_ACQUIRING(i) ((i)->ue_flags & UC_CACHE_ACQUIRING) +#define UC_CACHE_IS_EXPIRED(i) ((i)->ue_flags & UC_CACHE_EXPIRED) +#define UC_CACHE_IS_VALID(i) ((i)->ue_flags == 0) + +#define UC_CACHE_SET_NEW(i) ((i)->ue_flags |= UC_CACHE_NEW) +#define UC_CACHE_SET_INVALID(i) ((i)->ue_flags |= UC_CACHE_INVALID) +#define UC_CACHE_SET_ACQUIRING(i) ((i)->ue_flags |= UC_CACHE_ACQUIRING) +#define UC_CACHE_SET_EXPIRED(i) ((i)->ue_flags |= UC_CACHE_EXPIRED) +#define UC_CACHE_SET_VALID(i) ((i)->ue_flags = 0) + +#define UC_CACHE_CLEAR_NEW(i) ((i)->ue_flags &= ~UC_CACHE_NEW) +#define UC_CACHE_CLEAR_ACQUIRING(i) ((i)->ue_flags &= ~UC_CACHE_ACQUIRING) +#define UC_CACHE_CLEAR_INVALID(i) ((i)->ue_flags &= ~UC_CACHE_INVALID) +#define UC_CACHE_CLEAR_EXPIRED(i) ((i)->ue_flags &= ~UC_CACHE_EXPIRED) + +struct upcall_cache_entry; + +struct md_perm { + lnet_nid_t mp_nid; + uint32_t mp_perm; +}; + +struct md_identity { + struct upcall_cache_entry *mi_uc_entry; + uid_t mi_uid; + gid_t mi_gid; + struct group_info *mi_ginfo; + int mi_nperms; + struct md_perm *mi_perms; +}; + +struct upcall_cache_entry { + struct list_head ue_hash; + uint64_t ue_key; + atomic_t ue_refcount; + int ue_flags; + wait_queue_head_t ue_waitq; + time64_t ue_acquire_expire; + time64_t ue_expire; + union { + struct md_identity identity; + } u; +}; + +#define UC_CACHE_HASH_SIZE (128) +#define UC_CACHE_HASH_INDEX(id) ((id) & (UC_CACHE_HASH_SIZE - 1)) +#define UC_CACHE_UPCALL_MAXPATH (1024UL) + +struct upcall_cache; + +struct upcall_cache_ops { + void (*init_entry)(struct upcall_cache_entry *, void *args); + void (*free_entry)(struct upcall_cache *, + struct upcall_cache_entry *); + int (*upcall_compare)(struct upcall_cache *, + struct upcall_cache_entry *, + __u64 key, void *args); + int (*downcall_compare)(struct upcall_cache *, + struct upcall_cache_entry *, + __u64 key, void *args); + int (*do_upcall)(struct upcall_cache *, + struct upcall_cache_entry *); + int (*parse_downcall)(struct upcall_cache *, + struct upcall_cache_entry *, void *); +}; + +struct upcall_cache { + struct list_head uc_hashtable[UC_CACHE_HASH_SIZE]; + spinlock_t uc_lock; + struct rw_semaphore uc_upcall_rwsem; + + char uc_name[40]; /* for upcall */ + char uc_upcall[UC_CACHE_UPCALL_MAXPATH]; + time64_t uc_acquire_expire; /* seconds */ + time64_t uc_entry_expire; /* seconds */ + struct upcall_cache_ops *uc_ops; +}; + +struct 
upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache, + __u64 key, void *args); +void upcall_cache_put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry); +int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key, + void *args); +void upcall_cache_flush(struct upcall_cache *cache, int force); + +static inline void upcall_cache_flush_idle(struct upcall_cache *cache) +{ + upcall_cache_flush(cache, 0); +} + +static inline void upcall_cache_flush_all(struct upcall_cache *cache) +{ + upcall_cache_flush(cache, 1); +} + +void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args); +struct upcall_cache *upcall_cache_init(const char *name, const char *upcall, + struct upcall_cache_ops *ops); +void upcall_cache_cleanup(struct upcall_cache *cache); + +/** @} ucache */ + +#endif /* _UPCALL_CACHE_H */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/l_lock.c b/drivers/staging/lustrefsx/lustre/ldlm/l_lock.c new file mode 100644 index 0000000000000..0c10b3098276e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/l_lock.c @@ -0,0 +1,73 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include + +#include +#include + +/** + * Lock a lock and its resource. + * + * LDLM locking uses resource to serialize access to locks + * but there is a case when we change resource of lock upon + * enqueue reply. We rely on rcu_assign_pointer(lock->l_resource, new_res) + * being an atomic operation. + */ +struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock) +{ + struct ldlm_resource *res; + + rcu_read_lock(); + while (1) { + res = rcu_dereference(lock->l_resource); + lock_res(res); + if (res == lock->l_resource) { + ldlm_set_res_locked(lock); + rcu_read_unlock(); + return res; + } + unlock_res(res); + } +} +EXPORT_SYMBOL(lock_res_and_lock); + +/** + * Unlock a lock and its resource previously locked with lock_res_and_lock + */ +void unlock_res_and_lock(struct ldlm_lock *lock) +{ + ldlm_clear_res_locked(lock); + + unlock_res(lock->l_resource); +} +EXPORT_SYMBOL(unlock_res_and_lock); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c new file mode 100644 index 0000000000000..7d8e5e2de885a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c @@ -0,0 +1,1095 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
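A very rough kernel-side sketch of how the upcall cache API above is intended to be used (editorial; the cache name, upcall path and my_ops are placeholders, and the ERR_PTR-style error handling is an assumption about the implementation):

static struct upcall_cache *id_cache;

static int example_identity_cache_setup(struct upcall_cache_ops *my_ops)
{
	id_cache = upcall_cache_init("identity", "/usr/sbin/l_getidentity",
				     my_ops);
	return IS_ERR(id_cache) ? PTR_ERR(id_cache) : 0;
}

static int example_identity_lookup(__u64 uid)
{
	struct upcall_cache_entry *entry;

	/* May invoke ->do_upcall() and wait for the matching downcall. */
	entry = upcall_cache_get_entry(id_cache, uid, NULL);
	if (IS_ERR_OR_NULL(entry))
		return entry ? PTR_ERR(entry) : -ENOENT;

	/* ... consume entry->u.identity here ... */

	upcall_cache_put_entry(id_cache, entry);
	return 0;
}

static void example_identity_cache_teardown(void)
{
	upcall_cache_flush_all(id_cache);
	upcall_cache_cleanup(id_cache);
}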
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/ldlm_extent.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of EXTENT lock type + * + * EXTENT lock type is for locking a contiguous range of values, represented + * by 64-bit starting and ending offsets (inclusive). There are several extent + * lock modes, some of which may be mutually incompatible. Extent locks are + * considered incompatible if their modes are incompatible and their extents + * intersect. See the lock mode compatibility matrix in lustre_dlm.h. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include + +#include "ldlm_internal.h" + +#ifdef HAVE_SERVER_SUPPORT +# define LDLM_MAX_GROWN_EXTENT (32 * 1024 * 1024 - 1) + +/** + * Fix up the ldlm_extent after expanding it. + * + * After expansion has been done, we might still want to do certain adjusting + * based on overall contention of the resource and the like to avoid granting + * overly wide locks. + */ +static void ldlm_extent_internal_policy_fixup(struct ldlm_lock *req, + struct ldlm_extent *new_ex, + int conflicting) +{ + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + __u64 req_align, mask; + + if (conflicting > 32 && (req_mode == LCK_PW || req_mode == LCK_CW)) { + if (req_end < req_start + LDLM_MAX_GROWN_EXTENT) + new_ex->end = min(req_start + LDLM_MAX_GROWN_EXTENT, + new_ex->end); + } + + if (new_ex->start == 0 && new_ex->end == OBD_OBJECT_EOF) { + EXIT; + return; + } + + /* we need to ensure that the lock extent is properly aligned to what + * the client requested. Also we need to make sure it's also server + * page size aligned otherwise a server page can be covered by two + * write locks. */ + mask = PAGE_SIZE; + req_align = (req_end + 1) | req_start; + if (req_align != 0 && (req_align & (mask - 1)) == 0) { + while ((req_align & mask) == 0) + mask <<= 1; + } + mask -= 1; + /* We can only shrink the lock, not grow it. + * This should never cause lock to be smaller than requested, + * since requested lock was already aligned on these boundaries. 
*/ + new_ex->start = ((new_ex->start - 1) | mask) + 1; + new_ex->end = ((new_ex->end + 1) & ~mask) - 1; + LASSERTF(new_ex->start <= req_start, + "mask %#llx grant start %llu req start %llu\n", + mask, new_ex->start, req_start); + LASSERTF(new_ex->end >= req_end, + "mask %#llx grant end %llu req end %llu\n", + mask, new_ex->end, req_end); +} + +/** + * Return the maximum extent that: + * - contains the requested extent + * - does not overlap existing conflicting extents outside the requested one + * + * This allows clients to request a small required extent range, but if there + * is no contention on the lock the full lock can be granted to the client. + * This avoids the need for many smaller lock requests to be granted in the + * common (uncontended) case. + * + * Use interval tree to expand the lock extent for granted lock. + */ +static void ldlm_extent_internal_policy_granted(struct ldlm_lock *req, + struct ldlm_extent *new_ex) +{ + struct ldlm_resource *res = req->l_resource; + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + struct ldlm_interval_tree *tree; + struct interval_node_extent limiter = { + .start = new_ex->start, + .end = new_ex->end, + }; + int conflicting = 0; + int idx; + ENTRY; + + lockmode_verify(req_mode); + + /* Using interval tree to handle the LDLM extent granted locks. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + struct interval_node_extent ext = { + .start = req_start, + .end = req_end, + }; + + tree = &res->lr_itree[idx]; + if (lockmode_compat(tree->lit_mode, req_mode)) + continue; + + conflicting += tree->lit_size; + if (conflicting > 4) + limiter.start = req_start; + + if (interval_is_overlapped(tree->lit_root, &ext)) + CDEBUG(D_INFO, + "req_mode = %d, tree->lit_mode = %d, " + "tree->lit_size = %d\n", + req_mode, tree->lit_mode, tree->lit_size); + interval_expand(tree->lit_root, &ext, &limiter); + limiter.start = max(limiter.start, ext.start); + limiter.end = min(limiter.end, ext.end); + if (limiter.start == req_start && limiter.end == req_end) + break; + } + + new_ex->start = limiter.start; + new_ex->end = limiter.end; + LASSERT(new_ex->start <= req_start); + LASSERT(new_ex->end >= req_end); + + ldlm_extent_internal_policy_fixup(req, new_ex, conflicting); + EXIT; +} + +/* The purpose of this function is to return: + * - the maximum extent + * - containing the requested extent + * - and not overlapping existing conflicting extents outside the requested one + */ +static void +ldlm_extent_internal_policy_waiting(struct ldlm_lock *req, + struct ldlm_extent *new_ex) +{ + struct ldlm_resource *res = req->l_resource; + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + struct ldlm_lock *lock; + int conflicting = 0; + ENTRY; + + lockmode_verify(req_mode); + + /* for waiting locks */ + list_for_each_entry(lock, &res->lr_waiting, l_res_link) { + struct ldlm_extent *l_extent = &lock->l_policy_data.l_extent; + + /* We already hit the minimum requested size, search no more */ + if (new_ex->start == req_start && new_ex->end == req_end) { + EXIT; + return; + } + + /* Don't conflict with ourselves */ + if (req == lock) + continue; + + /* Locks are compatible, overlap doesn't matter */ + /* Until bug 20 is fixed, try to avoid granting overlapping + * locks on one client (they take a long time to cancel) */ + if (lockmode_compat(lock->l_req_mode, req_mode) && + lock->l_export != req->l_export) + continue; + + /* 
If this is a high-traffic lock, don't grow downwards at all + * or grow upwards too much */ + ++conflicting; + if (conflicting > 4) + new_ex->start = req_start; + + /* If lock doesn't overlap new_ex, skip it. */ + if (!ldlm_extent_overlap(l_extent, new_ex)) + continue; + + /* Locks conflicting in requested extents and we can't satisfy + * both locks, so ignore it. Either we will ping-pong this + * extent (we would regardless of what extent we granted) or + * lock is unused and it shouldn't limit our extent growth. */ + if (ldlm_extent_overlap(&lock->l_req_extent,&req->l_req_extent)) + continue; + + /* We grow extents downwards only as far as they don't overlap + * with already-granted locks, on the assumption that clients + * will be writing beyond the initial requested end and would + * then need to enqueue a new lock beyond previous request. + * l_req_extent->end strictly < req_start, checked above. */ + if (l_extent->start < req_start && new_ex->start != req_start) { + if (l_extent->end >= req_start) + new_ex->start = req_start; + else + new_ex->start = min(l_extent->end+1, req_start); + } + + /* If we need to cancel this lock anyways because our request + * overlaps the granted lock, we grow up to its requested + * extent start instead of limiting this extent, assuming that + * clients are writing forwards and the lock had over grown + * its extent downwards before we enqueued our request. */ + if (l_extent->end > req_end) { + if (l_extent->start <= req_end) + new_ex->end = max(lock->l_req_extent.start - 1, + req_end); + else + new_ex->end = max(l_extent->start - 1, req_end); + } + } + + ldlm_extent_internal_policy_fixup(req, new_ex, conflicting); + EXIT; +} + + +/* In order to determine the largest possible extent we can grant, we need + * to scan all of the queues. */ +static void ldlm_extent_policy(struct ldlm_resource *res, + struct ldlm_lock *lock, __u64 *flags) +{ + struct ldlm_extent new_ex = { .start = 0, .end = OBD_OBJECT_EOF }; + + if (lock->l_export == NULL) + /* + * this is a local lock taken by server (e.g., as a part of + * OST-side locking, or unlink handling). Expansion doesn't + * make a lot of sense for local locks, because they are + * dropped immediately on operation completion and would only + * conflict with other threads. + */ + return; + + if (lock->l_policy_data.l_extent.start == 0 && + lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) + /* fast-path whole file locks */ + return; + + /* Because reprocess_queue zeroes flags and uses it to return + * LDLM_FL_LOCK_CHANGED, we must check for the NO_EXPANSION flag + * in the lock flags rather than the 'flags' argument */ + if (likely(!(lock->l_flags & LDLM_FL_NO_EXPANSION))) { + ldlm_extent_internal_policy_granted(lock, &new_ex); + ldlm_extent_internal_policy_waiting(lock, &new_ex); + } else { + LDLM_DEBUG(lock, "Not expanding manually requested lock.\n"); + new_ex.start = lock->l_policy_data.l_extent.start; + new_ex.end = lock->l_policy_data.l_extent.end; + /* In case the request is not on correct boundaries, we call + * fixup. 
(normally called in ldlm_extent_internal_policy_*) */ + ldlm_extent_internal_policy_fixup(lock, &new_ex, 0); + } + + if (!ldlm_extent_equal(&new_ex, &lock->l_policy_data.l_extent)) { + *flags |= LDLM_FL_LOCK_CHANGED; + lock->l_policy_data.l_extent.start = new_ex.start; + lock->l_policy_data.l_extent.end = new_ex.end; + } +} + +static bool ldlm_check_contention(struct ldlm_lock *lock, int contended_locks) +{ + struct ldlm_resource *res = lock->l_resource; + time64_t now = ktime_get_seconds(); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_SET_CONTENTION)) + return true; + + CDEBUG(D_DLMTRACE, "contended locks = %d\n", contended_locks); + if (contended_locks > ldlm_res_to_ns(res)->ns_contended_locks) + res->lr_contention_time = now; + + return now < res->lr_contention_time + + ldlm_res_to_ns(res)->ns_contention_time; +} + +struct ldlm_extent_compat_args { + struct list_head *work_list; + struct ldlm_lock *lock; + enum ldlm_mode mode; + int *locks; + int *compat; +}; + +static enum interval_iter ldlm_extent_compat_cb(struct interval_node *n, + void *data) +{ + struct ldlm_extent_compat_args *priv = data; + struct ldlm_interval *node = to_ldlm_interval(n); + struct ldlm_extent *extent; + struct list_head *work_list = priv->work_list; + struct ldlm_lock *lock, *enq = priv->lock; + enum ldlm_mode mode = priv->mode; + int count = 0; + ENTRY; + + LASSERT(!list_empty(&node->li_group)); + + list_for_each_entry(lock, &node->li_group, l_sl_policy) { + /* interval tree is for granted lock */ + LASSERTF(mode == lock->l_granted_mode, + "mode = %s, lock->l_granted_mode = %s\n", + ldlm_lockname[mode], + ldlm_lockname[lock->l_granted_mode]); + count++; + if (lock->l_blocking_ast && + lock->l_granted_mode != LCK_GROUP) + ldlm_add_ast_work_item(lock, enq, work_list); + } + + /* don't count conflicting glimpse locks */ + extent = ldlm_interval_extent(node); + if (!(mode == LCK_PR && + extent->start == 0 && extent->end == OBD_OBJECT_EOF)) + *priv->locks += count; + + if (priv->compat) + *priv->compat = 0; + + RETURN(INTERVAL_ITER_CONT); +} + +/** + * Determine if the lock is compatible with all locks on the queue. + * + * If \a work_list is provided, conflicting locks are linked there. + * If \a work_list is not provided, we exit this function on first conflict. 
+ * + * \retval 0 if the lock is not compatible + * \retval 1 if the lock is compatible + * \retval 2 if \a req is a group lock and it is compatible and requires + * no further checking + * \retval negative error, such as EAGAIN for group locks + */ +static int +ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, + __u64 *flags, struct list_head *work_list, + int *contended_locks) +{ + struct ldlm_resource *res = req->l_resource; + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + struct ldlm_lock *lock; + int check_contention; + int compat = 1; + ENTRY; + + lockmode_verify(req_mode); + + /* Using interval tree for granted lock */ + if (queue == &res->lr_granted) { + struct ldlm_interval_tree *tree; + struct ldlm_extent_compat_args data = {.work_list = work_list, + .lock = req, + .locks = contended_locks, + .compat = &compat }; + struct interval_node_extent ex = { .start = req_start, + .end = req_end }; + int idx, rc; + + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + tree = &res->lr_itree[idx]; + if (tree->lit_root == NULL) /* empty tree, skipped */ + continue; + + data.mode = tree->lit_mode; + if (lockmode_compat(req_mode, tree->lit_mode)) { + struct ldlm_interval *node; + struct ldlm_extent *extent; + + if (req_mode != LCK_GROUP) + continue; + + /* group lock, grant it immediately if + * compatible */ + node = to_ldlm_interval(tree->lit_root); + extent = ldlm_interval_extent(node); + if (req->l_policy_data.l_extent.gid == + extent->gid) + RETURN(2); + } + + if (tree->lit_mode == LCK_GROUP) { + if (*flags & (LDLM_FL_BLOCK_NOWAIT | + LDLM_FL_SPECULATIVE)) { + compat = -EAGAIN; + goto destroylock; + } + + if (!work_list) + RETURN(0); + + /* if work list is not NULL,add all + locks in the tree to work list */ + compat = 0; + interval_iterate(tree->lit_root, + ldlm_extent_compat_cb, &data); + continue; + } + + /* We've found a potentially blocking lock, check + * compatibility. This handles locks other than GROUP + * locks, which are handled separately above. + * + * Locks with FL_SPECULATIVE are asynchronous requests + * which must never wait behind another lock, so they + * fail if any conflicting lock is found. */ + if (!work_list || (*flags & LDLM_FL_SPECULATIVE)) { + rc = interval_is_overlapped(tree->lit_root, + &ex); + if (rc) { + if (!work_list) { + RETURN(0); + } else { + compat = -EAGAIN; + goto destroylock; + } + } + } else { + interval_search(tree->lit_root, &ex, + ldlm_extent_compat_cb, &data); + if (!list_empty(work_list) && compat) + compat = 0; + } + } + } else { /* for waiting queue */ + list_for_each_entry(lock, queue, l_res_link) { + check_contention = 1; + + /* We stop walking the queue if we hit ourselves so + * we don't take conflicting locks enqueued after us + * into account, or we'd wait forever. */ + if (req == lock) + break; + + /* locks are compatible, overlap doesn't matter */ + if (lockmode_compat(lock->l_req_mode, req_mode)) { + if (req_mode == LCK_PR && + ((lock->l_policy_data.l_extent.start <= + req->l_policy_data.l_extent.start) && + (lock->l_policy_data.l_extent.end >= + req->l_policy_data.l_extent.end))) { + /* If we met a PR lock just like us or + wider, and nobody down the list + conflicted with it, that means we + can skip processing of the rest of + the list and safely place ourselves + at the end of the list, or grant + (dependent if we met an conflicting + locks before in the list). 
In case + of 1st enqueue only we continue + traversing if there is something + conflicting down the list because + we need to make sure that something + is marked as AST_SENT as well, in + cse of empy worklist we would exit + on first conflict met. */ + /* There IS a case where such flag is + not set for a lock, yet it blocks + something. Luckily for us this is + only during destroy, so lock is + exclusive. So here we are safe */ + if (!ldlm_is_ast_sent(lock)) + RETURN(compat); + } + + /* non-group locks are compatible, overlap doesn't + matter */ + if (likely(req_mode != LCK_GROUP)) + continue; + + /* If we are trying to get a GROUP lock and there is + another one of this kind, we need to compare gid */ + if (req->l_policy_data.l_extent.gid == + lock->l_policy_data.l_extent.gid) { + /* If existing lock with matched gid is granted, + we grant new one too. */ + if (ldlm_is_granted(lock)) + RETURN(2); + + /* Otherwise we are scanning queue of waiting + * locks and it means current request would + * block along with existing lock (that is + * already blocked. + * If we are in nonblocking mode - return + * immediately */ + if (*flags & (LDLM_FL_BLOCK_NOWAIT + | LDLM_FL_SPECULATIVE)) { + compat = -EAGAIN; + goto destroylock; + } + /* If this group lock is compatible with another + * group lock on the waiting list, they must be + * together in the list, so they can be granted + * at the same time. Otherwise the later lock + * can get stuck behind another, incompatible, + * lock. */ + ldlm_resource_insert_lock_after(lock, req); + /* Because 'lock' is not granted, we can stop + * processing this queue and return immediately. + * There is no need to check the rest of the + * list. */ + RETURN(0); + } + } + + if (unlikely(req_mode == LCK_GROUP && + !ldlm_is_granted(lock))) { + compat = 0; + if (lock->l_req_mode != LCK_GROUP) { + /* Ok, we hit non-GROUP lock, there should be no + more GROUP locks later on, queue in front of + first non-GROUP lock */ + + ldlm_resource_insert_lock_before(lock, req); + break; + } + LASSERT(req->l_policy_data.l_extent.gid != + lock->l_policy_data.l_extent.gid); + continue; + } + + if (unlikely(lock->l_req_mode == LCK_GROUP)) { + /* If compared lock is GROUP, then requested is + * PR/PW so this is not compatible; extent + * range does not matter */ + if (*flags & (LDLM_FL_BLOCK_NOWAIT + | LDLM_FL_SPECULATIVE)) { + compat = -EAGAIN; + goto destroylock; + } + } else if (lock->l_policy_data.l_extent.end < req_start || + lock->l_policy_data.l_extent.start > req_end) { + /* if a non group lock doesn't overlap skip it */ + continue; + } else if (lock->l_req_extent.end < req_start || + lock->l_req_extent.start > req_end) { + /* false contention, the requests doesn't really overlap */ + check_contention = 0; + } + + if (!work_list) + RETURN(0); + + if (*flags & LDLM_FL_SPECULATIVE) { + compat = -EAGAIN; + goto destroylock; + } + + /* don't count conflicting glimpse locks */ + if (lock->l_req_mode == LCK_PR && + lock->l_policy_data.l_extent.start == 0 && + lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) + check_contention = 0; + + *contended_locks += check_contention; + + compat = 0; + if (lock->l_blocking_ast && + lock->l_req_mode != LCK_GROUP) + ldlm_add_ast_work_item(lock, req, work_list); + } + } + + if (ldlm_check_contention(req, *contended_locks) && + compat == 0 && + (*flags & LDLM_FL_DENY_ON_CONTENTION) && + req->l_req_mode != LCK_GROUP && + req_end - req_start <= + ldlm_res_to_ns(req->l_resource)->ns_max_nolock_size) + GOTO(destroylock, compat = -EUSERS); + + 
RETURN(compat); +destroylock: + list_del_init(&req->l_res_link); + ldlm_lock_destroy_nolock(req); + RETURN(compat); +} + +/** + * This function refresh eviction timer for cancelled lock. + * \param[in] lock ldlm lock for refresh + * \param[in] arg ldlm prolong arguments, timeout, export, extent + * and counter are used + */ +void ldlm_lock_prolong_one(struct ldlm_lock *lock, + struct ldlm_prolong_args *arg) +{ + timeout_t timeout; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PROLONG_PAUSE, 3); + + if (arg->lpa_export != lock->l_export || + lock->l_flags & LDLM_FL_DESTROYED) + /* ignore unrelated locks */ + return; + + arg->lpa_locks_cnt++; + + if (!(lock->l_flags & LDLM_FL_AST_SENT)) + /* ignore locks not being cancelled */ + return; + + /* We are in the middle of the process - BL AST is sent, CANCEL + * is ahead. Take half of BL AT + IO AT process time. + */ + timeout = arg->lpa_timeout + (ldlm_bl_timeout(lock) >> 1); + + arg->lpa_blocks_cnt++; + + /* OK. this is a possible lock the user holds doing I/O + * let's refresh eviction timer for it. + */ + ldlm_refresh_waiting_lock(lock, timeout); +} +EXPORT_SYMBOL(ldlm_lock_prolong_one); + +static enum interval_iter ldlm_resource_prolong_cb(struct interval_node *n, + void *data) +{ + struct ldlm_prolong_args *arg = data; + struct ldlm_interval *node = to_ldlm_interval(n); + struct ldlm_lock *lock; + + ENTRY; + + LASSERT(!list_empty(&node->li_group)); + + list_for_each_entry(lock, &node->li_group, l_sl_policy) { + ldlm_lock_prolong_one(lock, arg); + } + + RETURN(INTERVAL_ITER_CONT); +} + +/** + * Walk through granted tree and prolong locks if they overlaps extent. + * + * \param[in] arg prolong args + */ +void ldlm_resource_prolong(struct ldlm_prolong_args *arg) +{ + struct ldlm_interval_tree *tree; + struct ldlm_resource *res; + struct interval_node_extent ex = { .start = arg->lpa_extent.start, + .end = arg->lpa_extent.end }; + int idx; + + ENTRY; + + res = ldlm_resource_get(arg->lpa_export->exp_obd->obd_namespace, NULL, + &arg->lpa_resid, LDLM_EXTENT, 0); + if (IS_ERR(res)) { + CDEBUG(D_DLMTRACE, "Failed to get resource for resid %llu/%llu\n", + arg->lpa_resid.name[0], arg->lpa_resid.name[1]); + RETURN_EXIT; + } + + lock_res(res); + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + tree = &res->lr_itree[idx]; + if (tree->lit_root == NULL) /* empty tree, skipped */ + continue; + + /* There is no possibility to check for the groupID + * so all the group locks are considered as valid + * here, especially because the client is supposed + * to check it has such a lock before sending an RPC. + */ + if (!(tree->lit_mode & arg->lpa_mode)) + continue; + + interval_search(tree->lit_root, &ex, + ldlm_resource_prolong_cb, arg); + } + + unlock_res(res); + ldlm_resource_putref(res); + + EXIT; +} +EXPORT_SYMBOL(ldlm_resource_prolong); + +/** + * Process a granting attempt for extent lock. + * Must be called with ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + */ +int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + int rc, rc2 = 0; + int contended_locks = 0; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? 
+ NULL : work_list; + ENTRY; + + LASSERT(!ldlm_is_granted(lock)); + LASSERT(!(*flags & LDLM_FL_DENY_ON_CONTENTION) || + !ldlm_is_ast_discard_data(lock)); + check_res_locked(res); + *err = ELDLM_OK; + + if (intention == LDLM_PROCESS_RESCAN) { + /* Careful observers will note that we don't handle -EAGAIN + * here, but it's ok for a non-obvious reason -- compat_queue + * can only return -EAGAIN if (flags & BLOCK_NOWAIT | + * SPECULATIVE). flags should always be zero here, and if that + * ever stops being true, we want to find out. */ + LASSERT(*flags == 0); + rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, + NULL, &contended_locks); + if (rc == 1) { + rc = ldlm_extent_compat_queue(&res->lr_waiting, lock, + flags, NULL, + &contended_locks); + } + if (rc == 0) + RETURN(LDLM_ITER_STOP); + + ldlm_resource_unlink_lock(lock); + + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_EVICT_RACE)) + ldlm_extent_policy(res, lock, flags); + ldlm_grant_lock(lock, grant_work); + RETURN(LDLM_ITER_CONTINUE); + } + + contended_locks = 0; + rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, + work_list, &contended_locks); + if (rc < 0) + GOTO(out, *err = rc); + + if (rc != 2) { + rc2 = ldlm_extent_compat_queue(&res->lr_waiting, lock, + flags, work_list, + &contended_locks); + if (rc2 < 0) + GOTO(out, *err = rc = rc2); + } + + if (rc + rc2 == 2) { + ldlm_extent_policy(res, lock, flags); + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); + } else { + /* Adding LDLM_FL_NO_TIMEOUT flag to granted lock to + * force client to wait for the lock endlessly once + * the lock is enqueued -bzzz */ + *flags |= LDLM_FL_NO_TIMEOUT; + } + + RETURN(LDLM_ITER_CONTINUE); +out: + return rc; +} +#endif /* HAVE_SERVER_SUPPORT */ + +struct ldlm_kms_shift_args { + __u64 old_kms; + __u64 kms; + bool complete; +}; + +/* Callback for interval_iterate functions, used by ldlm_extent_shift_Kms */ +static enum interval_iter ldlm_kms_shift_cb(struct interval_node *n, + void *args) +{ + struct ldlm_kms_shift_args *arg = args; + struct ldlm_interval *node = to_ldlm_interval(n); + struct ldlm_lock *tmplock; + struct ldlm_lock *lock = NULL; + + ENTRY; + + /* Since all locks in an interval have the same extent, we can just + * use the first lock without kms_ignore set. */ + list_for_each_entry(tmplock, &node->li_group, l_sl_policy) { + if (ldlm_is_kms_ignore(tmplock)) + continue; + + lock = tmplock; + + break; + } + + /* No locks in this interval without kms_ignore set */ + if (!lock) + RETURN(INTERVAL_ITER_CONT); + + /* If we find a lock with a greater or equal kms, we are not the + * highest lock (or we share that distinction with another lock), and + * don't need to update KMS. Return old_kms and stop looking. */ + if (lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF || + lock->l_policy_data.l_extent.end + 1 >= arg->old_kms) { + arg->kms = arg->old_kms; + arg->complete = true; + RETURN(INTERVAL_ITER_STOP); + } + + if (lock->l_policy_data.l_extent.end + 1 > arg->kms) + arg->kms = lock->l_policy_data.l_extent.end + 1; + + /* Since interval_iterate_reverse starts with the highest lock and + * works down, for PW locks, we only need to check if we should update + * the kms, then stop walking the tree. PR locks are not exclusive, so + * the highest start does not imply the highest end and we must + * continue. 
(Only one group lock is allowed per resource, so this is + * irrelevant for group locks.)*/ + if (lock->l_granted_mode == LCK_PW) + RETURN(INTERVAL_ITER_STOP); + else + RETURN(INTERVAL_ITER_CONT); +} + +/* When a lock is cancelled by a client, the KMS may undergo change if this + * is the "highest lock". This function returns the new KMS value, updating + * it only if we were the highest lock. + * + * Caller must hold lr_lock already. + * + * NB: A lock on [x,y] protects a KMS of up to y + 1 bytes! */ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms) +{ + struct ldlm_resource *res = lock->l_resource; + struct ldlm_interval_tree *tree; + struct ldlm_kms_shift_args args; + int idx = 0; + + ENTRY; + + args.old_kms = old_kms; + args.kms = 0; + args.complete = false; + + /* don't let another thread in ldlm_extent_shift_kms race in + * just after we finish and take our lock into account in its + * calculation of the kms */ + ldlm_set_kms_ignore(lock); + + /* We iterate over the lock trees, looking for the largest kms smaller + * than the current one. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + tree = &res->lr_itree[idx]; + + /* If our already known kms is >= than the highest 'end' in + * this tree, we don't need to check this tree, because + * the kms from a tree can be lower than in_max_high (due to + * kms_ignore), but it can never be higher. */ + if (!tree->lit_root || args.kms >= tree->lit_root->in_max_high) + continue; + + interval_iterate_reverse(tree->lit_root, ldlm_kms_shift_cb, + &args); + + /* this tells us we're not the highest lock, so we don't need + * to check the remaining trees */ + if (args.complete) + break; + } + + LASSERTF(args.kms <= args.old_kms, "kms %llu old_kms %llu\n", args.kms, + args.old_kms); + + RETURN(args.kms); +} +EXPORT_SYMBOL(ldlm_extent_shift_kms); + +struct kmem_cache *ldlm_interval_slab; +static struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) +{ + struct ldlm_interval *node; + ENTRY; + + LASSERT(lock->l_resource->lr_type == LDLM_EXTENT); + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + if (node == NULL) + RETURN(NULL); + + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + RETURN(node); +} + +void ldlm_interval_free(struct ldlm_interval *node) +{ + if (node) { + LASSERT(list_empty(&node->li_group)); + LASSERT(!interval_is_intree(&node->li_node)); + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + } +} + +/* interval tree, for LDLM_EXTENT. */ +void ldlm_interval_attach(struct ldlm_interval *n, + struct ldlm_lock *l) +{ + LASSERT(l->l_tree_node == NULL); + LASSERT(l->l_resource->lr_type == LDLM_EXTENT); + + list_add_tail(&l->l_sl_policy, &n->li_group); + l->l_tree_node = n; +} + +struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l) +{ + struct ldlm_interval *n = l->l_tree_node; + + if (n == NULL) + return NULL; + + LASSERT(!list_empty(&n->li_group)); + l->l_tree_node = NULL; + list_del_init(&l->l_sl_policy); + + return list_empty(&n->li_group) ? n : NULL; +} + +static inline int ldlm_mode_to_index(enum ldlm_mode mode) +{ + int index; + + LASSERT(mode != 0); + LASSERT(is_power_of_2(mode)); + index = ilog2(mode); + LASSERT(index < LCK_MODE_NUM); + return index; +} + +int ldlm_extent_alloc_lock(struct ldlm_lock *lock) +{ + lock->l_tree_node = NULL; + if (ldlm_interval_alloc(lock) == NULL) + return -ENOMEM; + return 0; +} + +/** Add newly granted lock into interval tree for the resource. 
*/ +void ldlm_extent_add_lock(struct ldlm_resource *res, + struct ldlm_lock *lock) +{ + struct interval_node *found, **root; + struct ldlm_interval *node; + struct ldlm_extent *extent; + int idx, rc; + + LASSERT(ldlm_is_granted(lock)); + + node = lock->l_tree_node; + LASSERT(node != NULL); + LASSERT(!interval_is_intree(&node->li_node)); + + idx = ldlm_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == BIT(idx)); + LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode); + + /* node extent initialize */ + extent = &lock->l_policy_data.l_extent; + + rc = interval_set(&node->li_node, extent->start, extent->end); + LASSERT(!rc); + + root = &res->lr_itree[idx].lit_root; + found = interval_insert(&node->li_node, root); + if (found) { /* The policy group found. */ + struct ldlm_interval *tmp = ldlm_interval_detach(lock); + LASSERT(tmp != NULL); + ldlm_interval_free(tmp); + ldlm_interval_attach(to_ldlm_interval(found), lock); + } + res->lr_itree[idx].lit_size++; + + /* even though we use interval tree to manage the extent lock, we also + * add the locks into grant list, for debug purpose, .. */ + ldlm_resource_add_lock(res, &res->lr_granted, lock); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GRANT_CHECK)) { + struct ldlm_lock *lck; + + list_for_each_entry_reverse(lck, &res->lr_granted, + l_res_link) { + if (lck == lock) + continue; + if (lockmode_compat(lck->l_granted_mode, + lock->l_granted_mode)) + continue; + if (ldlm_extent_overlap(&lck->l_req_extent, + &lock->l_req_extent)) { + CDEBUG(D_ERROR, "granting conflicting lock %p " + "%p\n", lck, lock); + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + } + } +} + +/** Remove cancelled lock from resource interval tree. */ +void ldlm_extent_unlink_lock(struct ldlm_lock *lock) +{ + struct ldlm_resource *res = lock->l_resource; + struct ldlm_interval *node = lock->l_tree_node; + struct ldlm_interval_tree *tree; + int idx; + + if (!node || !interval_is_intree(&node->li_node)) /* duplicate unlink */ + return; + + idx = ldlm_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == BIT(idx)); + tree = &res->lr_itree[idx]; + + LASSERT(tree->lit_root != NULL); /* assure the tree is not null */ + + tree->lit_size--; + node = ldlm_interval_detach(lock); + if (node) { + interval_erase(&node->li_node, &tree->lit_root); + ldlm_interval_free(node); + } +} + +void ldlm_extent_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + lpolicy->l_extent.start = wpolicy->l_extent.start; + lpolicy->l_extent.end = wpolicy->l_extent.end; + lpolicy->l_extent.gid = wpolicy->l_extent.gid; +} + +void ldlm_extent_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_extent.start = lpolicy->l_extent.start; + wpolicy->l_extent.end = lpolicy->l_extent.end; + wpolicy->l_extent.gid = lpolicy->l_extent.gid; +} + diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c new file mode 100644 index 0000000000000..745c1ea580fa8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c @@ -0,0 +1,958 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003 Hewlett-Packard Development Company LP. + * Developed under the sponsorship of the US Government under + * Subcontract No. B514193 + * + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +/** + * This file implements POSIX lock type for Lustre. + * Its policy properties are start and end of extent and PID. + * + * These locks are only done through MDS due to POSIX semantics requiring + * e.g. that locks could be only partially released and as such split into + * two parts, and also that two adjacent locks from the same process may be + * merged into a single wider lock. + * + * Lock modes are mapped like this: + * PR and PW for READ and WRITE locks + * NL to request a releasing of a portion of the lock + * + * These flock locks never timeout. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include + +#include "ldlm_internal.h" + +int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); + +/** + * list_for_remaining_safe - iterate over the remaining entries in a list + * and safeguard against removal of a list entry. + * \param pos the &struct list_head to use as a loop counter. pos MUST + * have been initialized prior to using it in this macro. + * \param n another &struct list_head to use as temporary storage + * \param head the head for your list. 
+ */ +#define list_for_remaining_safe(pos, n, head) \ + for (n = pos->next; pos != (head); pos = n, n = pos->next) + +static inline int +ldlm_same_flock_owner(struct ldlm_lock *lock, struct ldlm_lock *new) +{ + return ((new->l_policy_data.l_flock.owner == + lock->l_policy_data.l_flock.owner) && + (new->l_export == lock->l_export)); +} + +static inline int +ldlm_flocks_overlap(struct ldlm_lock *lock, struct ldlm_lock *new) +{ + return ((new->l_policy_data.l_flock.start <= + lock->l_policy_data.l_flock.end) && + (new->l_policy_data.l_flock.end >= + lock->l_policy_data.l_flock.start)); +} + +static inline void ldlm_flock_blocking_link(struct ldlm_lock *req, + struct ldlm_lock *lock) +{ + /* For server only */ + if (req->l_export == NULL) + return; + + LASSERT(hlist_unhashed(&req->l_exp_flock_hash)); + + req->l_policy_data.l_flock.blocking_owner = + lock->l_policy_data.l_flock.owner; + req->l_policy_data.l_flock.blocking_export = + lock->l_export; + atomic_set(&req->l_policy_data.l_flock.blocking_refs, 0); + + cfs_hash_add(req->l_export->exp_flock_hash, + &req->l_policy_data.l_flock.owner, + &req->l_exp_flock_hash); +} + +static inline void ldlm_flock_blocking_unlink(struct ldlm_lock *req) +{ + /* For server only */ + if (req->l_export == NULL) + return; + + check_res_locked(req->l_resource); + if (req->l_export->exp_flock_hash != NULL && + !hlist_unhashed(&req->l_exp_flock_hash)) + cfs_hash_del(req->l_export->exp_flock_hash, + &req->l_policy_data.l_flock.owner, + &req->l_exp_flock_hash); +} + +static inline void +ldlm_flock_destroy(struct ldlm_lock *lock, enum ldlm_mode mode, __u64 flags) +{ + ENTRY; + + LDLM_DEBUG(lock, "ldlm_flock_destroy(mode: %d, flags: %#llx)", + mode, flags); + + /* Safe to not lock here, since it should be empty anyway */ + LASSERT(hlist_unhashed(&lock->l_exp_flock_hash)); + + list_del_init(&lock->l_res_link); + if (flags == LDLM_FL_WAIT_NOREPROC) { + /* client side - set a flag to prevent sending a CANCEL */ + lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING; + + /* when reaching here, it is under lock_res_and_lock(). Thus, + * need call the nolock version of ldlm_lock_decref_internal + */ + ldlm_lock_decref_internal_nolock(lock, mode); + } + + ldlm_lock_destroy_nolock(lock); + EXIT; +} + +#ifdef HAVE_SERVER_SUPPORT +/** + * POSIX locks deadlock detection code. + * + * Given a new lock \a req and an existing lock \a bl_lock it conflicts + * with, we need to iterate through all blocked POSIX locks for this + * export and see if there is a deadlock condition arising. (i.e. when + * one client holds a lock on something and want a lock on something + * else and at the same time another client has the opposite situation). + */ + +struct ldlm_flock_lookup_cb_data { + __u64 *bl_owner; + struct ldlm_lock *lock; + struct obd_export *exp; +}; + +static int ldlm_flock_lookup_cb(struct obd_export *exp, void *data) +{ + struct ldlm_flock_lookup_cb_data *cb_data = data; + struct ldlm_lock *lock; + + if (exp->exp_failed) + return 0; + + lock = cfs_hash_lookup(exp->exp_flock_hash, cb_data->bl_owner); + if (lock == NULL) + return 0; + + /* Stop on first found lock. 
Same process can't sleep twice */ + cb_data->lock = lock; + cb_data->exp = class_export_get(exp); + + return 1; +} + +static int +ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *bl_lock) +{ + struct obd_export *req_exp = req->l_export; + struct obd_export *bl_exp = bl_lock->l_export; + __u64 req_owner = req->l_policy_data.l_flock.owner; + __u64 bl_owner = bl_lock->l_policy_data.l_flock.owner; + + /* For server only */ + if (req_exp == NULL) + return 0; + + class_export_get(bl_exp); + while (1) { + struct ldlm_flock_lookup_cb_data cb_data = { + .bl_owner = &bl_owner, + .lock = NULL, + .exp = NULL, + }; + struct ptlrpc_connection *bl_exp_conn; + struct obd_export *bl_exp_new; + struct ldlm_lock *lock = NULL; + struct ldlm_flock *flock; + + bl_exp_conn = bl_exp->exp_connection; + if (bl_exp->exp_flock_hash != NULL) { + int found; + + found = obd_nid_export_for_each(bl_exp->exp_obd, + &bl_exp_conn->c_peer.nid, + ldlm_flock_lookup_cb, + &cb_data); + if (found) + lock = cb_data.lock; + } + if (lock == NULL) + break; + + class_export_put(bl_exp); + bl_exp = cb_data.exp; + + LASSERT(req != lock); + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->owner == bl_owner); + bl_owner = flock->blocking_owner; + bl_exp_new = class_export_get(flock->blocking_export); + class_export_put(bl_exp); + + cfs_hash_put(bl_exp->exp_flock_hash, &lock->l_exp_flock_hash); + bl_exp = bl_exp_new; + + if (bl_exp->exp_failed) + break; + + if (bl_owner == req_owner && + nid_same(&bl_exp_conn->c_peer.nid, + &req_exp->exp_connection->c_peer.nid)) { + class_export_put(bl_exp); + return 1; + } + } + class_export_put(bl_exp); + + return 0; +} + +static void ldlm_flock_cancel_on_deadlock(struct ldlm_lock *lock, + struct list_head *work_list) +{ + CDEBUG(D_INFO, "reprocess deadlock req=%p\n", lock); + + if ((exp_connect_flags(lock->l_export) & + OBD_CONNECT_FLOCK_DEAD) == 0) { + CERROR("deadlock found, but client doesn't support flock cancellation\n"); + } else { + LASSERT(lock->l_completion_ast); + LASSERT(!ldlm_is_ast_sent(lock)); + lock->l_flags |= (LDLM_FL_AST_SENT | LDLM_FL_CANCEL_ON_BLOCK | + LDLM_FL_FLOCK_DEADLOCK); + ldlm_flock_blocking_unlink(lock); + ldlm_resource_unlink_lock(lock); + ldlm_add_ast_work_item(lock, NULL, work_list); + } +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Process a granting attempt for flock lock. + * Must be called with the ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + */ +int +ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list) +{ + struct ldlm_resource *res = req->l_resource; + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + struct list_head *tmp; + struct list_head *ownlocks = NULL; + struct ldlm_lock *lock = NULL; + struct ldlm_lock *new = req; + struct ldlm_lock *new2 = NULL; + enum ldlm_mode mode = req->l_req_mode; + int local = ns_is_client(ns); + int added = (mode == LCK_NL); + int overlaps = 0; + int splitted = 0; + const struct ldlm_callback_suite null_cbs = { NULL }; +#ifdef HAVE_SERVER_SUPPORT + struct list_head *grant_work = (intention == LDLM_PROCESS_ENQUEUE ?
+ NULL : work_list); +#endif + ENTRY; + + CDEBUG(D_DLMTRACE, "flags %#llx owner %llu pid %u mode %u start " + "%llu end %llu\n", *flags, + new->l_policy_data.l_flock.owner, + new->l_policy_data.l_flock.pid, mode, + req->l_policy_data.l_flock.start, + req->l_policy_data.l_flock.end); + + *err = ELDLM_OK; + + if (local) { + /* No blocking ASTs are sent to the clients for + * Posix file & record locks + */ + req->l_blocking_ast = NULL; + } else { + /* Called on the server for lock cancels. */ + req->l_blocking_ast = ldlm_flock_blocking_ast; + } + +reprocess: + if ((*flags == LDLM_FL_WAIT_NOREPROC) || (mode == LCK_NL)) { + /* This loop determines where this processes locks start + * in the resource lr_granted list. + */ + list_for_each(tmp, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + if (ldlm_same_flock_owner(lock, req)) { + ownlocks = tmp; + break; + } + } + } +#ifdef HAVE_SERVER_SUPPORT + else { + int reprocess_failed = 0; + lockmode_verify(mode); + + /* This loop determines if there are existing locks + * that conflict with the new lock request. + */ + list_for_each(tmp, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + + if (ldlm_same_flock_owner(lock, req)) { + if (!ownlocks) + ownlocks = tmp; + continue; + } + + if (req->l_req_mode == LCK_PR && + lock->l_granted_mode == LCK_PR && + lock->l_policy_data.l_flock.start <= + req->l_policy_data.l_flock.start && + lock->l_policy_data.l_flock.end >= + req->l_policy_data.l_flock.end) { + /* there can't be granted WR lock */ + break; + } + /* locks are compatible, overlap doesn't matter */ + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; + + if (!ldlm_flocks_overlap(lock, req)) + continue; + + if (intention != LDLM_PROCESS_ENQUEUE) { + if (ldlm_flock_deadlock(req, lock)) { + ldlm_flock_cancel_on_deadlock( + req, grant_work); + RETURN(LDLM_ITER_CONTINUE); + } + reprocess_failed = 1; + break; + } + + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + ldlm_flock_destroy(req, mode, *flags); + *err = -EAGAIN; + RETURN(LDLM_ITER_STOP); + } + + if (*flags & LDLM_FL_TEST_LOCK) { + ldlm_flock_destroy(req, mode, *flags); + req->l_req_mode = lock->l_granted_mode; + req->l_policy_data.l_flock.pid = + lock->l_policy_data.l_flock.pid; + req->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + req->l_policy_data.l_flock.end = + lock->l_policy_data.l_flock.end; + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(LDLM_ITER_STOP); + } + + /* add lock to blocking list before deadlock + * check to prevent race + */ + ldlm_flock_blocking_link(req, lock); + + if (ldlm_flock_deadlock(req, lock)) { + ldlm_flock_blocking_unlink(req); + ldlm_flock_destroy(req, mode, *flags); + *err = -EDEADLK; + RETURN(LDLM_ITER_STOP); + } + + ldlm_resource_add_lock(res, &res->lr_waiting, req); + *flags |= LDLM_FL_BLOCK_GRANTED; + RETURN(LDLM_ITER_STOP); + } + if (reprocess_failed) + RETURN(LDLM_ITER_CONTINUE); + } + + if (*flags & LDLM_FL_TEST_LOCK) { + ldlm_flock_destroy(req, mode, *flags); + req->l_req_mode = LCK_NL; + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(LDLM_ITER_STOP); + } + + /* In case we had slept on this lock request take it off of the + * deadlock detection hash list. + */ + ldlm_flock_blocking_unlink(req); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Scan the locks owned by this process that overlap this request. + * We may have to merge or split existing locks. 
+ */ + if (!ownlocks) + ownlocks = &res->lr_granted; + + list_for_remaining_safe(ownlocks, tmp, &res->lr_granted) { + lock = list_entry(ownlocks, struct ldlm_lock, l_res_link); + + if (!ldlm_same_flock_owner(lock, new)) + break; + + if (lock->l_granted_mode == mode) { + /* If the modes are the same then we need to process + * locks that overlap OR adjoin the new lock. The extra + * logic condition is necessary to deal with arithmetic + * overflow and underflow. + */ + if ((new->l_policy_data.l_flock.start > + (lock->l_policy_data.l_flock.end + 1)) + && (lock->l_policy_data.l_flock.end != + OBD_OBJECT_EOF)) + continue; + + if ((new->l_policy_data.l_flock.end < + (lock->l_policy_data.l_flock.start - 1)) + && (lock->l_policy_data.l_flock.start != 0)) + break; + + if (new->l_policy_data.l_flock.start < + lock->l_policy_data.l_flock.start) { + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.start; + } else { + new->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + } + + if (new->l_policy_data.l_flock.end > + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.end; + } else { + new->l_policy_data.l_flock.end = + lock->l_policy_data.l_flock.end; + } + + if (added) { + ldlm_flock_destroy(lock, mode, *flags); + } else { + new = lock; + added = 1; + } + continue; + } + + if (new->l_policy_data.l_flock.start > + lock->l_policy_data.l_flock.end) + continue; + + if (new->l_policy_data.l_flock.end < + lock->l_policy_data.l_flock.start) + break; + + ++overlaps; + + if (new->l_policy_data.l_flock.start <= + lock->l_policy_data.l_flock.start) { + if (new->l_policy_data.l_flock.end < + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.end + 1; + break; + } + ldlm_flock_destroy(lock, lock->l_req_mode, *flags); + continue; + } + if (new->l_policy_data.l_flock.end >= + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.start - 1; + continue; + } + + /* split the existing lock into two locks */ + + /* if this is an F_UNLCK operation then we could avoid + * allocating a new lock and use the req lock passed in + * with the request but this would complicate the reply + * processing since updates to req get reflected in the + * reply. The client side replays the lock request so + * it must see the original lock data in the reply. + */ + + /* XXX - if ldlm_lock_new() can sleep we should + * release the lr_lock, allocate the new lock, + * and restart processing this lock. 
+ */ + if (new2 == NULL) { + unlock_res_and_lock(req); + new2 = ldlm_lock_create(ns, &res->lr_name, LDLM_FLOCK, + lock->l_granted_mode, &null_cbs, + NULL, 0, LVB_T_NONE); + lock_res_and_lock(req); + if (IS_ERR(new2)) { + ldlm_flock_destroy(req, lock->l_granted_mode, + *flags); + *err = PTR_ERR(new2); + RETURN(LDLM_ITER_STOP); + } + goto reprocess; + } + + splitted = 1; + + new2->l_granted_mode = lock->l_granted_mode; + new2->l_policy_data.l_flock.pid = + new->l_policy_data.l_flock.pid; + new2->l_policy_data.l_flock.owner = + new->l_policy_data.l_flock.owner; + new2->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + new2->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.start - 1; + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.end + 1; + new2->l_conn_export = lock->l_conn_export; + if (lock->l_export != NULL) { + new2->l_export = class_export_lock_get(lock->l_export, + new2); + if (new2->l_export->exp_lock_hash && + hlist_unhashed(&new2->l_exp_hash)) + cfs_hash_add(new2->l_export->exp_lock_hash, + &new2->l_remote_handle, + &new2->l_exp_hash); + } + if (*flags == LDLM_FL_WAIT_NOREPROC) + ldlm_lock_addref_internal_nolock(new2, + lock->l_granted_mode); + + /* insert new2 at lock */ + ldlm_resource_add_lock(res, ownlocks, new2); + LDLM_LOCK_RELEASE(new2); + break; + } + + /* if new2 is created but never used, destroy it*/ + if (splitted == 0 && new2 != NULL) + ldlm_lock_destroy_nolock(new2); + + /* At this point we're granting the lock request. */ + req->l_granted_mode = req->l_req_mode; + + /* Add req to the granted queue before calling ldlm_reprocess_all(). */ + if (!added) { + list_del_init(&req->l_res_link); + /* insert new lock before ownlocks in list. */ + ldlm_resource_add_lock(res, ownlocks, req); + } + + if (*flags != LDLM_FL_WAIT_NOREPROC) { +#ifdef HAVE_SERVER_SUPPORT + if (intention == LDLM_PROCESS_ENQUEUE) { + /* If this is an unlock, reprocess the waitq and + * send completions ASTs for locks that can now be + * granted. The only problem with doing this + * reprocessing here is that the completion ASTs for + * newly granted locks will be sent before the unlock + * completion is sent. It shouldn't be an issue. Also + * note that ldlm_process_flock_lock() will recurse, + * but only once because 'intention' won't be + * LDLM_PROCESS_ENQUEUE from ldlm_reprocess_queue. + */ + if ((mode == LCK_NL) && overlaps) { + LIST_HEAD(rpc_list); + int rc; + +restart: + ldlm_reprocess_queue(res, &res->lr_waiting, + &rpc_list, + LDLM_PROCESS_RESCAN, 0); + + unlock_res_and_lock(req); + rc = ldlm_run_ast_work(ns, &rpc_list, + LDLM_WORK_CP_AST); + lock_res_and_lock(req); + if (rc == -ERESTART) + GOTO(restart, rc); + } + } else { + LASSERT(req->l_completion_ast); + ldlm_add_ast_work_item(req, NULL, grant_work); + } +#else /* !HAVE_SERVER_SUPPORT */ + /* The only one possible case for client-side calls flock + * policy function is ldlm_flock_completion_ast inside which + * carries LDLM_FL_WAIT_NOREPROC flag. + */ + CERROR("Illegal parameter for client-side-only module.\n"); + LBUG(); +#endif /* HAVE_SERVER_SUPPORT */ + } + + /* In case we're reprocessing the requested lock we can't destroy + * it until after calling ldlm_add_ast_work_item() above so that laawi() + * can bump the reference count on \a req. Otherwise \a req + * could be freed before the completion AST can be sent. + */ + if (added) + ldlm_flock_destroy(req, mode, *flags); + + ldlm_resource_dump(D_INFO, res); + RETURN(LDLM_ITER_CONTINUE); +} + +/** + * Flock completion callback function. 
+ * + * \param lock [in,out]: A lock to be handled + * \param flags [in]: flags + * \param *data [in]: ldlm_work_cp_ast_lock() will use ldlm_cb_set_arg + * + * \retval 0 : success + * \retval <0 : failure + */ +int +ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + struct file_lock *getlk = lock->l_ast_data; + struct obd_device *obd; + enum ldlm_error err; + int rc = 0; + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT2, 4); + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT3)) { + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_FAIL_LOC; + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT3, 4); + } + CDEBUG(D_DLMTRACE, "flags: %#llx data: %p getlk: %p\n", + flags, data, getlk); + + LASSERT(flags != LDLM_FL_WAIT_NOREPROC); + + if (flags & LDLM_FL_FAILED) + goto granted; + + if (!(flags & LDLM_FL_BLOCKED_MASK)) { + if (NULL == data) + /* mds granted the lock in the reply */ + goto granted; + /* CP AST RPC: lock get granted, wake it up */ + wake_up(&lock->l_waitq); + RETURN(0); + } + + LDLM_DEBUG(lock, + "client-side enqueue returned a blocked lock, sleeping"); + obd = class_exp2obd(lock->l_conn_export); + + /* Go to sleep until the lock is granted. */ + rc = l_wait_event_abortable(lock->l_waitq, + is_granted_or_cancelled(lock)); + if (rc < 0) { + /* take lock off the deadlock detection hash list. */ + lock_res_and_lock(lock); + ldlm_flock_blocking_unlink(lock); + + /* client side - set flag to prevent lock from being + * put on LRU list + */ + ldlm_set_cbpending(lock); + unlock_res_and_lock(lock); + + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + RETURN(rc); + } + +granted: + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT4)) { + lock_res_and_lock(lock); + /* DEADLOCK is always set with CBPENDING */ + lock->l_flags |= LDLM_FL_FLOCK_DEADLOCK | LDLM_FL_CBPENDING; + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT4, 4); + } + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT5)) { + lock_res_and_lock(lock); + /* DEADLOCK is always set with CBPENDING */ + lock->l_flags |= (LDLM_FL_FAIL_LOC | + LDLM_FL_FLOCK_DEADLOCK | LDLM_FL_CBPENDING); + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT5, 4); + } + + lock_res_and_lock(lock); + + + /* Protect against race where lock could have been just destroyed + * due to overlap in ldlm_process_flock_lock(). + */ + if (ldlm_is_destroyed(lock)) { + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed"); + + /* An error is still to be returned, to propagate it up to + * ldlm_cli_enqueue_fini() caller. */ + RETURN(-EIO); + } + + /* ldlm_lock_enqueue() has already placed lock on the granted list. */ + ldlm_resource_unlink_lock(lock); + + /* Import invalidation. We need to actually release the lock + * references being held, so that it can go away. No point in + * holding the lock even if app still believes it has it, since + * server already dropped it anyway. Only for granted locks too. + */ + /* Do the same for DEADLOCK'ed locks. 
*/ + if (ldlm_is_failed(lock) || ldlm_is_flock_deadlock(lock)) { + int mode; + + if (flags & LDLM_FL_TEST_LOCK) + LASSERT(ldlm_is_test_lock(lock)); + + if (ldlm_is_test_lock(lock) || ldlm_is_flock_deadlock(lock)) + mode = getlk->fl_type; + else + mode = lock->l_req_mode; + + if (ldlm_is_flock_deadlock(lock)) { + LDLM_DEBUG(lock, "client-side enqueue deadlock " + "received"); + rc = -EDEADLK; + } + ldlm_flock_destroy(lock, mode, LDLM_FL_WAIT_NOREPROC); + unlock_res_and_lock(lock); + + /* Need to wake up the waiter if we were evicted */ + wake_up(&lock->l_waitq); + + /* An error is still to be returned, to propagate it up to + * ldlm_cli_enqueue_fini() caller. + */ + RETURN(rc ? : -EIO); + } + + LDLM_DEBUG(lock, "client-side enqueue granted"); + + if (flags & LDLM_FL_TEST_LOCK) { + /* + * fcntl(F_GETLK) request + * The old mode was saved in getlk->fl_type so that if the mode + * in the lock changes we can decref the appropriate refcount. + */ + LASSERT(ldlm_is_test_lock(lock)); + ldlm_flock_destroy(lock, getlk->fl_type, LDLM_FL_WAIT_NOREPROC); + switch (lock->l_granted_mode) { + case LCK_PR: + getlk->fl_type = F_RDLCK; + break; + case LCK_PW: + getlk->fl_type = F_WRLCK; + break; + default: + getlk->fl_type = F_UNLCK; + } + getlk->fl_pid = (pid_t)lock->l_policy_data.l_flock.pid; + getlk->fl_start = (loff_t)lock->l_policy_data.l_flock.start; + getlk->fl_end = (loff_t)lock->l_policy_data.l_flock.end; + } else { + __u64 noreproc = LDLM_FL_WAIT_NOREPROC; + + /* We need to reprocess the lock to do merges or splits + * with existing locks owned by this process. + */ + ldlm_process_flock_lock(lock, &noreproc, 1, &err, NULL); + } + unlock_res_and_lock(lock); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_flock_completion_ast); + +int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + ENTRY; + + LASSERT(lock); + LASSERT(flag == LDLM_CB_CANCELING); + + /* take lock off the deadlock detection hash list. */ + lock_res_and_lock(lock); + ldlm_flock_blocking_unlink(lock); + unlock_res_and_lock(lock); + RETURN(0); +} + +void ldlm_flock_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + lpolicy->l_flock.start = wpolicy->l_flock.lfw_start; + lpolicy->l_flock.end = wpolicy->l_flock.lfw_end; + lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid; + lpolicy->l_flock.owner = wpolicy->l_flock.lfw_owner; +} + +void ldlm_flock_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_flock.lfw_start = lpolicy->l_flock.start; + wpolicy->l_flock.lfw_end = lpolicy->l_flock.end; + wpolicy->l_flock.lfw_pid = lpolicy->l_flock.pid; + wpolicy->l_flock.lfw_owner = lpolicy->l_flock.owner; +} + +/* + * Export handle<->flock hash operations. 
+ */ +static unsigned +ldlm_export_flock_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_u64_hash(*(__u64 *)key, mask); +} + +static void * +ldlm_export_flock_key(struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + return &lock->l_policy_data.l_flock.owner; +} + +static int +ldlm_export_flock_keycmp(const void *key, struct hlist_node *hnode) +{ + return !memcmp(ldlm_export_flock_key(hnode), key, sizeof(__u64)); +} + +static void * +ldlm_export_flock_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); +} + +static void +ldlm_export_flock_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + struct ldlm_flock *flock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + LDLM_LOCK_GET(lock); + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->blocking_export != NULL); + class_export_get(flock->blocking_export); + atomic_inc(&flock->blocking_refs); +} + +static void +ldlm_export_flock_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + struct ldlm_flock *flock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->blocking_export != NULL); + class_export_put(flock->blocking_export); + if (atomic_dec_and_test(&flock->blocking_refs)) { + flock->blocking_owner = 0; + flock->blocking_export = NULL; + } + LDLM_LOCK_RELEASE(lock); +} + +static struct cfs_hash_ops ldlm_export_flock_ops = { + .hs_hash = ldlm_export_flock_hash, + .hs_key = ldlm_export_flock_key, + .hs_keycmp = ldlm_export_flock_keycmp, + .hs_object = ldlm_export_flock_object, + .hs_get = ldlm_export_flock_get, + .hs_put = ldlm_export_flock_put, + .hs_put_locked = ldlm_export_flock_put, +}; + +int ldlm_init_flock_export(struct obd_export *exp) +{ + if( strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDT_NAME) != 0) + RETURN(0); + + exp->exp_flock_hash = + cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid), + HASH_EXP_LOCK_CUR_BITS, + HASH_EXP_LOCK_MAX_BITS, + HASH_EXP_LOCK_BKT_BITS, 0, + CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &ldlm_export_flock_ops, + CFS_HASH_DEFAULT | CFS_HASH_NBLK_CHANGE); + if (!exp->exp_flock_hash) + RETURN(-ENOMEM); + + RETURN(0); +} + +void ldlm_destroy_flock_export(struct obd_export *exp) +{ + ENTRY; + if (exp->exp_flock_hash) { + cfs_hash_putref(exp->exp_flock_hash); + exp->exp_flock_hash = NULL; + } + EXIT; +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c new file mode 100644 index 0000000000000..41257a97dc571 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c @@ -0,0 +1,667 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/ldlm_inodebits.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains the implementation of the IBITS lock type. + * + * The IBITS lock type contains a bit mask determining various properties of an + * object. The meanings of specific bits are specific to the caller and are + * opaque to LDLM code. + * + * Locks with intersecting bitmasks and conflicting lock modes (e.g. LCK_PW) + * are considered conflicting. See the lock mode compatibility matrix + * in lustre_dlm.h. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include + +#include "ldlm_internal.h" + +#ifdef HAVE_SERVER_SUPPORT + +/** + * Iterate through all waiting locks on a given resource queue and + * attempt to grant them. As an optimization, only the head waiting + * lock is checked for each inodebit type. + * + * Must be called with the resource lock held. + */ +int ldlm_reprocess_inodebits_queue(struct ldlm_resource *res, + struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + __u64 mask) +{ + __u64 flags; + int rc = LDLM_ITER_CONTINUE; + enum ldlm_error err; + LIST_HEAD(bl_ast_list); + struct ldlm_ibits_queues *queues = res->lr_ibits_queues; + int i; + + ENTRY; + + check_res_locked(res); + + LASSERT(res->lr_type == LDLM_IBITS); + LASSERT(intention == LDLM_PROCESS_RESCAN || + intention == LDLM_PROCESS_RECOVERY); + + if (intention == LDLM_PROCESS_RECOVERY) + return ldlm_reprocess_queue(res, queue, work_list, intention, + 0); + +restart: + CDEBUG(D_DLMTRACE, "--- Reprocess resource "DLDLMRES" (%p)\n", + PLDLMRES(res), res); + if (mask) + CDEBUG(D_DLMTRACE, "Hint %llx\n", mask); + else + mask = MDS_INODELOCK_FULL; + + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) { + LIST_HEAD(rpc_list); + struct list_head *head = &queues->liq_waiting[i]; + struct ldlm_lock *pending; + struct ldlm_ibits_node *node; + + if (list_empty(head) || !(mask & (1 << i))) + continue; + + node = list_entry(head->next, struct ldlm_ibits_node, + lin_link[i]); + + pending = node->lock; + LDLM_DEBUG(pending, "Reprocessing lock from queue %d", i); + + flags = 0; + rc = ldlm_process_inodebits_lock(pending, &flags, intention, + &err, &rpc_list); + if (ldlm_is_granted(pending)) { + list_splice(&rpc_list, work_list); + mask |= pending->l_policy_data.l_inodebits.bits; + i = ffs(pending->l_policy_data.l_inodebits.bits) - 2; + } else { + list_splice(&rpc_list, &bl_ast_list); + } + } + + if (!list_empty(&bl_ast_list)) { + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &bl_ast_list, + LDLM_WORK_BL_AST); + + lock_res(res); + if (rc == -ERESTART) { + mask = 0; + GOTO(restart, rc); + } + } + + if (!list_empty(&bl_ast_list)) + ldlm_discard_bl_list(&bl_ast_list); + + RETURN(rc); +} + +/** + * Determine if the lock is compatible with all locks on the queue. + * + * If \a work_list is provided, conflicting locks are linked there. + * If \a work_list is not provided, we exit this function on the first conflict.
+ * + * \retval 0 if there are conflicting locks in the \a queue + * \retval 1 if the lock is compatible to all locks in \a queue + * + * IBITS locks in granted queue are organized in bunches of + * same-mode/same-bits locks called "skip lists". The First lock in the + * bunch contains a pointer to the end of the bunch. This allows us to + * skip an entire bunch when iterating the list in search for conflicting + * locks if first lock of the bunch is not conflicting with us. + */ +static int +ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, + __u64 *ldlm_flags, struct list_head *work_list) +{ + enum ldlm_mode req_mode = req->l_req_mode; + struct list_head *tmp; + struct ldlm_lock *lock; + __u64 req_bits = req->l_policy_data.l_inodebits.bits; + __u64 *try_bits = &req->l_policy_data.l_inodebits.try_bits; + int compat = 1; + + ENTRY; + + lockmode_verify(req_mode); + + /* There is no sense in lock with no bits set. Also such a lock + * would be compatible with any other bit lock. + * Meanwhile that can be true if there were just try_bits and all + * are failed, so just exit gracefully and let the caller to care. + */ + if ((req_bits | *try_bits) == 0) + RETURN(0); + + /* Group lock could be only DOM */ + if (unlikely(req_mode == LCK_GROUP && + (req_bits | *try_bits) != MDS_INODELOCK_DOM)) + RETURN(-EPROTO); + + list_for_each(tmp, queue) { + struct list_head *mode_tail; + + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + /* We stop walking the queue if we hit ourselves so we don't + * take conflicting locks enqueued after us into account, + * or we'd wait forever. */ + if (req == lock) + RETURN(compat); + + /* last lock in mode group */ + LASSERT(lock->l_sl_mode.prev != NULL); + mode_tail = &list_entry(lock->l_sl_mode.prev, struct ldlm_lock, + l_sl_mode)->l_res_link; + + /* if request lock is not COS_INCOMPAT and COS is disabled, + * they are compatible, IOW this request is from a local + * transaction on a DNE system. */ + if (lock->l_req_mode == LCK_COS && !ldlm_is_cos_incompat(req) && + !ldlm_is_cos_enabled(req)) { + /* jump to last lock in mode group */ + tmp = mode_tail; + continue; + } + + if (lockmode_compat(lock->l_req_mode, req_mode)) { + /* non group locks are compatible, bits don't matter */ + if (likely(req_mode != LCK_GROUP)) { + /* jump to last lock in mode group */ + tmp = mode_tail; + continue; + } + + if (req->l_policy_data.l_inodebits.li_gid == + lock->l_policy_data.l_inodebits.li_gid) { + if (ldlm_is_granted(lock)) + RETURN(2); + + if (*ldlm_flags & LDLM_FL_BLOCK_NOWAIT) + RETURN(-EWOULDBLOCK); + + /* Place the same group together */ + ldlm_resource_insert_lock_after(lock, req); + RETURN(0); + } + } + + /* GROUP locks are placed to a head of the waiting list, but + * grouped by gid. */ + if (unlikely(req_mode == LCK_GROUP && !ldlm_is_granted(lock))) { + compat = 0; + if (lock->l_req_mode != LCK_GROUP) { + /* Already not a GROUP lock, insert before. */ + ldlm_resource_insert_lock_before(lock, req); + break; + } + /* Still GROUP but a different gid(the same gid would + * be handled above). Keep searching for the same gid */ + LASSERT(req->l_policy_data.l_inodebits.li_gid != + lock->l_policy_data.l_inodebits.li_gid); + continue; + } + + for (;;) { + struct list_head *head; + + /* Advance loop cursor to last lock in policy group. */ + tmp = &list_entry(lock->l_sl_policy.prev, + struct ldlm_lock, + l_sl_policy)->l_res_link; + + /* New lock's try_bits are filtered out by ibits + * of all locks in both granted and waiting queues. 
+ */ + *try_bits &= ~(lock->l_policy_data.l_inodebits.bits | + lock->l_policy_data.l_inodebits.try_bits); + + if ((req_bits | *try_bits) == 0) + RETURN(0); + + /* The new lock ibits is more preferable than try_bits + * of waiting locks so drop conflicting try_bits in + * the waiting queue. + * Notice that try_bits of granted locks must be zero. + */ + lock->l_policy_data.l_inodebits.try_bits &= ~req_bits; + + /* Locks with overlapping bits conflict. */ + if (lock->l_policy_data.l_inodebits.bits & req_bits) { + /* COS lock mode has a special compatibility + * requirement: it is only compatible with + * locks from the same client. */ + if (lock->l_req_mode == LCK_COS && + !ldlm_is_cos_incompat(req) && + ldlm_is_cos_enabled(req) && + lock->l_client_cookie == req->l_client_cookie) + goto skip_work_list; + + compat = 0; + + if (unlikely(lock->l_req_mode == LCK_GROUP)) { + LASSERT(ldlm_has_dom(lock)); + + if (*ldlm_flags & LDLM_FL_BLOCK_NOWAIT) + RETURN(-EWOULDBLOCK); + + /* Local combined DOM lock came across + * GROUP DOM lock, it makes the thread + * to be blocked for a long time, not + * allowed, the trybits to be used + * instead. + */ + if (!req->l_export && + (req_bits & MDS_INODELOCK_DOM) && + (req_bits & ~MDS_INODELOCK_DOM)) + LBUG(); + + goto skip_work_list; + } + + /* Found a conflicting policy group. */ + if (!work_list) + RETURN(0); + + /* Add locks of the policy group to @work_list + * as blocking locks for @req */ + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, + work_list); + head = &lock->l_sl_policy; + list_for_each_entry(lock, head, l_sl_policy) + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, + req, work_list); + } +skip_work_list: + if (tmp == mode_tail) + break; + + tmp = tmp->next; + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + } /* Loop over policy groups within one mode group. */ + } /* Loop over mode groups within @queue. */ + + RETURN(compat); +} + +/** + * Process a granting attempt for IBITS lock. + * Must be called with ns lock held + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + */ +int ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *ldlm_flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, + struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? + NULL : work_list; + int rc, rc2 = 0; + ENTRY; + + *err = ELDLM_LOCK_ABORTED; + LASSERT(!ldlm_is_granted(lock)); + check_res_locked(res); + + if (intention == LDLM_PROCESS_RESCAN) { + struct list_head *bl_list = + *ldlm_flags & LDLM_FL_BLOCK_NOWAIT ? NULL : work_list; + + LASSERT(lock->l_policy_data.l_inodebits.bits != 0); + + /* It is possible that some of granted locks was not canceled + * but converted and is kept in granted queue. So there is + * a window where lock with 'ast_sent' might become granted + * again. 
Meanwhile a new lock may appear in that window and + * conflicts with the converted lock so the following scenario + * is possible: + * + * 1) lock1 conflicts with lock2 + * 2) bl_ast was sent for lock2 + * 3) lock3 comes and conflicts with lock2 too + * 4) no bl_ast sent because lock2->l_bl_ast_sent is 1 + * 5) lock2 was converted for lock1 but not for lock3 + * 6) lock1 granted, lock3 still is waiting for lock2, but + * there will never be another bl_ast for that + * + * To avoid this scenario the work_list is used below to collect + * any blocked locks from granted queue during every reprocess + * and bl_ast will be sent if needed. + */ + *ldlm_flags = 0; + rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, + ldlm_flags, bl_list); + if (!rc) + RETURN(LDLM_ITER_STOP); + rc = ldlm_inodebits_compat_queue(&res->lr_waiting, lock, + ldlm_flags, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + + /* grant also try_bits if any */ + if (lock->l_policy_data.l_inodebits.try_bits != 0) { + lock->l_policy_data.l_inodebits.bits |= + lock->l_policy_data.l_inodebits.try_bits; + lock->l_policy_data.l_inodebits.try_bits = 0; + *ldlm_flags |= LDLM_FL_LOCK_CHANGED; + } + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); + + *err = ELDLM_OK; + RETURN(LDLM_ITER_CONTINUE); + } + + rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, + ldlm_flags, work_list); + if (rc < 0) + GOTO(out, *err = rc); + + if (rc != 2) { + rc2 = ldlm_inodebits_compat_queue(&res->lr_waiting, lock, + ldlm_flags, work_list); + if (rc2 < 0) + GOTO(out, *err = rc = rc2); + } + + if (rc + rc2 != 2) { + /* if there were only bits to try and all are conflicting */ + if ((lock->l_policy_data.l_inodebits.bits | + lock->l_policy_data.l_inodebits.try_bits)) { + /* There is no sense to set LDLM_FL_NO_TIMEOUT to + * @ldlm_flags for DOM lock while they are enqueued + * through intents, i.e. @lock here is local which does + * not timeout. */ + *err = ELDLM_OK; + } + } else { + /* grant also all remaining try_bits */ + if (lock->l_policy_data.l_inodebits.try_bits != 0) { + lock->l_policy_data.l_inodebits.bits |= + lock->l_policy_data.l_inodebits.try_bits; + lock->l_policy_data.l_inodebits.try_bits = 0; + *ldlm_flags |= LDLM_FL_LOCK_CHANGED; + } + LASSERT(lock->l_policy_data.l_inodebits.bits); + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); + *err = ELDLM_OK; + } + + RETURN(LDLM_ITER_CONTINUE); +out: + return rc; +} +#endif /* HAVE_SERVER_SUPPORT */ + +void ldlm_ibits_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits; + /** + * try_bits and li_gid are to be handled outside of generic + * write_to_local due to different behavior on a server and client. + */ +} + +void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits; + wpolicy->l_inodebits.try_bits = lpolicy->l_inodebits.try_bits; + wpolicy->l_inodebits.li_gid = lpolicy->l_inodebits.li_gid; +} + +/** + * Attempt to convert already granted IBITS lock with several bits set to + * a lock with less bits (downgrade). + * + * Such lock conversion is used to keep lock with non-blocking bits instead of + * cancelling it, introduced for better support of DoM files. 
+ */ +int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop) +{ + ENTRY; + + check_res_locked(lock->l_resource); + + /* Just return if there are no conflicting bits */ + if ((lock->l_policy_data.l_inodebits.bits & to_drop) == 0) { + LDLM_WARN(lock, "try to drop unset bits %#llx/%#llx", + lock->l_policy_data.l_inodebits.bits, to_drop); + /* nothing to do */ + RETURN(0); + } + + /* remove lock from a skiplist and put in the new place + * according with new inodebits */ + ldlm_resource_unlink_lock(lock); + lock->l_policy_data.l_inodebits.bits &= ~to_drop; + ldlm_grant_lock_with_skiplist(lock); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_inodebits_drop); + +/* convert single lock */ +int ldlm_cli_inodebits_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + struct ldlm_lock_desc ld = { { 0 } }; + __u64 drop_bits, new_bits; + __u32 flags = 0; + int rc; + + ENTRY; + + check_res_locked(lock->l_resource); + + /* Lock is being converted already */ + if (ldlm_is_converting(lock)) { + if (!(cancel_flags & LCF_ASYNC)) { + unlock_res_and_lock(lock); + wait_event_idle(lock->l_waitq, + is_lock_converted(lock)); + lock_res_and_lock(lock); + } + RETURN(0); + } + + /* lru_cancel may happen in parallel and call ldlm_cli_cancel_list() + * independently. + */ + if (ldlm_is_canceling(lock)) + RETURN(-EINVAL); + + /* no need in only local convert */ + if (lock->l_flags & (LDLM_FL_LOCAL_ONLY | LDLM_FL_CANCEL_ON_BLOCK)) + RETURN(-EINVAL); + + drop_bits = lock->l_policy_data.l_inodebits.cancel_bits; + /* no cancel bits - means that caller needs full cancel */ + if (drop_bits == 0) + RETURN(-EINVAL); + + new_bits = lock->l_policy_data.l_inodebits.bits & ~drop_bits; + /* check if all lock bits are dropped, proceed with cancel */ + if (!new_bits) + RETURN(-EINVAL); + + /* check if no dropped bits, consider this as successful convert */ + if (lock->l_policy_data.l_inodebits.bits == new_bits) + RETURN(0); + + ldlm_set_converting(lock); + /* Finally call cancel callback for remaining bits only. + * It is important to have converting flag during that + * so blocking_ast callback can distinguish convert from + * cancels. + */ + ld.l_policy_data.l_inodebits.cancel_bits = drop_bits; + unlock_res_and_lock(lock); + lock->l_blocking_ast(lock, &ld, lock->l_ast_data, LDLM_CB_CANCELING); + /* now notify server about convert */ + rc = ldlm_cli_convert_req(lock, &flags, new_bits); + lock_res_and_lock(lock); + if (rc) + GOTO(full_cancel, rc); + + /* Finally clear these bits in lock ibits */ + ldlm_inodebits_drop(lock, drop_bits); + + /* Being locked again check if lock was canceled, it is important + * to do and don't drop cbpending below + */ + if (ldlm_is_canceling(lock)) + GOTO(full_cancel, rc = -EINVAL); + + /* also check again if more bits to be cancelled appeared */ + if (drop_bits != lock->l_policy_data.l_inodebits.cancel_bits) + GOTO(clear_converting, rc = -EAGAIN); + + /* clear cbpending flag early, it is safe to match lock right after + * client convert because it is downgrade always. + */ + ldlm_clear_cbpending(lock); + ldlm_clear_bl_ast(lock); + spin_lock(&ns->ns_lock); + if (list_empty(&lock->l_lru)) + ldlm_lock_add_to_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + + /* the job is done, zero the cancel_bits. If more conflicts appear, + * it will result in another cycle of ldlm_cli_inodebits_convert(). 
+ */ +full_cancel: + lock->l_policy_data.l_inodebits.cancel_bits = 0; +clear_converting: + ldlm_clear_converting(lock); + RETURN(rc); +} + +int ldlm_inodebits_alloc_lock(struct ldlm_lock *lock) +{ + if (ldlm_is_ns_srv(lock)) { + int i; + + OBD_SLAB_ALLOC_PTR(lock->l_ibits_node, ldlm_inodebits_slab); + if (lock->l_ibits_node == NULL) + return -ENOMEM; + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + INIT_LIST_HEAD(&lock->l_ibits_node->lin_link[i]); + lock->l_ibits_node->lock = lock; + } else { + lock->l_ibits_node = NULL; + } + return 0; +} + +void ldlm_inodebits_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock, bool tail) +{ + int i; + + if (!ldlm_is_ns_srv(lock)) + return; + + if (head == &res->lr_waiting) { + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) { + if (!(lock->l_policy_data.l_inodebits.bits & BIT(i))) + continue; + if (tail) + list_add_tail(&lock->l_ibits_node->lin_link[i], + &res->lr_ibits_queues->liq_waiting[i]); + else + list_add(&lock->l_ibits_node->lin_link[i], + &res->lr_ibits_queues->liq_waiting[i]); + } + } else if (head == &res->lr_granted && lock->l_ibits_node != NULL) { + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + LASSERT(list_empty(&lock->l_ibits_node->lin_link[i])); + OBD_SLAB_FREE_PTR(lock->l_ibits_node, ldlm_inodebits_slab); + lock->l_ibits_node = NULL; + } else if (head != &res->lr_granted) { + /* we are inserting in the middle of a list, after @head */ + struct ldlm_lock *orig = list_entry(head, struct ldlm_lock, + l_res_link); + LASSERT(orig->l_policy_data.l_inodebits.bits == + lock->l_policy_data.l_inodebits.bits); + /* There is no use case for inserting before a lock with an + * exactly matching set of bits */ + LASSERT(tail == false); + + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) { + if (!(lock->l_policy_data.l_inodebits.bits & (1 << i))) + continue; + list_add(&lock->l_ibits_node->lin_link[i], + &orig->l_ibits_node->lin_link[i]); + } + } +} + +void ldlm_inodebits_unlink_lock(struct ldlm_lock *lock) +{ + int i; + + ldlm_unlink_lock_skiplist(lock); + if (!ldlm_is_ns_srv(lock)) + return; + + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + list_del_init(&lock->l_ibits_node->lin_link[i]); +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h new file mode 100644 index 0000000000000..517ab6091de5c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h @@ -0,0 +1,424 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation.
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +extern int ldlm_srv_namespace_nr; +extern int ldlm_cli_namespace_nr; +extern struct mutex ldlm_srv_namespace_lock; +extern struct list_head ldlm_srv_namespace_list; +extern struct mutex ldlm_cli_namespace_lock; +extern struct list_head ldlm_cli_active_namespace_list; +extern struct list_head ldlm_cli_inactive_namespace_list; +extern unsigned int ldlm_cancel_unused_locks_before_replay; +extern struct kmem_cache *ldlm_glimpse_work_kmem; + +static inline int ldlm_namespace_nr_read(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + ldlm_srv_namespace_nr : ldlm_cli_namespace_nr; +} + +static inline void ldlm_namespace_nr_inc(enum ldlm_side client) +{ + if (client == LDLM_NAMESPACE_SERVER) + ldlm_srv_namespace_nr++; + else + ldlm_cli_namespace_nr++; +} + +static inline void ldlm_namespace_nr_dec(enum ldlm_side client) +{ + if (client == LDLM_NAMESPACE_SERVER) + ldlm_srv_namespace_nr--; + else + ldlm_cli_namespace_nr--; +} + +static inline struct list_head *ldlm_namespace_list(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_list : &ldlm_cli_active_namespace_list; +} + +static inline +struct list_head *ldlm_namespace_inactive_list(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_list : &ldlm_cli_inactive_namespace_list; +} + +static inline struct mutex *ldlm_namespace_lock(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_lock : &ldlm_cli_namespace_lock; +} + +/* ns_bref is the number of resources in this namespace */ +static inline int ldlm_ns_empty(struct ldlm_namespace *ns) +{ + return atomic_read(&ns->ns_bref) == 0; +} + +void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *, + enum ldlm_side); +void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *, + enum ldlm_side); +struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side); + +/* ldlm_request.c */ +int ldlm_cancel_lru(struct ldlm_namespace *ns, int min, + enum ldlm_cancel_flags cancel_flags, + enum ldlm_lru_flags lru_flags); +int ldlm_cancel_lru_local(struct ldlm_namespace *ns, + struct list_head *cancels, int min, int max, + enum ldlm_cancel_flags cancel_flags, + enum ldlm_lru_flags lru_flags); +extern unsigned int ldlm_enqueue_min; +/* ldlm_resource.c */ +extern struct kmem_cache *ldlm_resource_slab; +extern struct kmem_cache *ldlm_lock_slab; +extern struct kmem_cache *ldlm_inodebits_slab; +extern struct kmem_cache *ldlm_interval_tree_slab; + +void ldlm_resource_insert_lock_after(struct ldlm_lock *original, + struct ldlm_lock *new); +void ldlm_resource_insert_lock_before(struct ldlm_lock *original, + struct ldlm_lock *new); + +/* ldlm_lock.c */ + +typedef enum { + LDLM_WORK_BL_AST, + LDLM_WORK_CP_AST, + LDLM_WORK_REVOKE_AST, + LDLM_WORK_GL_AST +} ldlm_desc_ast_t; + +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock); +void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list); +int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, + enum req_location loc, void *data, int size); +struct ldlm_lock * +ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *, + enum ldlm_type type, enum ldlm_mode mode, + const struct ldlm_callback_suite *cbs, + void *data, __u32 lvb_len, enum lvb_type lvb_type); +enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, + struct ldlm_namespace *, + struct ldlm_lock **, + void *cookie, __u64 *flags); +void 
ldlm_lock_addref_internal(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_lock_decref_internal(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_lock_decref_internal_nolock(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list); +#ifdef HAVE_SERVER_SUPPORT +int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, __u64 hint); +int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, + struct list_head *rpc_list); +void ldlm_discard_bl_list(struct list_head *bl_list); +void ldlm_clear_blocking_lock(struct ldlm_lock *lock); +void ldlm_clear_blocking_data(struct ldlm_lock *lock); +#endif +int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, + ldlm_desc_ast_t ast_type); +int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq); +int ldlm_lock_remove_from_lru_check(struct ldlm_lock *lock, ktime_t last_use); +#define ldlm_lock_remove_from_lru(lock) \ + ldlm_lock_remove_from_lru_check(lock, ktime_set(0, 0)) +int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock); +void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock); +void ldlm_lock_touch_in_lru(struct ldlm_lock *lock); +void ldlm_lock_destroy_nolock(struct ldlm_lock *lock); + +int ldlm_export_cancel_blocked_locks(struct obd_export *exp); +int ldlm_export_cancel_locks(struct obd_export *exp); +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock); + +/* ldlm_lockd.c */ +int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct ldlm_lock *lock); +int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags); +int ldlm_bl_to_thread_ns(struct ldlm_namespace *ns); +int ldlm_bl_thread_wakeup(void); + +void ldlm_handle_bl_callback(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, struct ldlm_lock *lock); +void ldlm_bl_desc2lock(const struct ldlm_lock_desc *ld, struct ldlm_lock *lock); + +#ifdef HAVE_SERVER_SUPPORT +/* ldlm_plain.c */ +int ldlm_process_plain_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list); + +/* ldlm_inodebits.c */ +int ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, + struct list_head *work_list); +int ldlm_reprocess_inodebits_queue(struct ldlm_resource *res, + struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + __u64 hint); +/* ldlm_extent.c */ +int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list); +#endif +int ldlm_extent_alloc_lock(struct ldlm_lock *lock); +void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock); +void ldlm_extent_unlink_lock(struct ldlm_lock *lock); + +int ldlm_inodebits_alloc_lock(struct ldlm_lock *lock); +void ldlm_inodebits_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock, bool tail); +void ldlm_inodebits_unlink_lock(struct ldlm_lock *lock); + +/* ldlm_flock.c */ +int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, + enum ldlm_process_intention 
intention, + enum ldlm_error *err, struct list_head *work_list); +int ldlm_init_flock_export(struct obd_export *exp); +void ldlm_destroy_flock_export(struct obd_export *exp); + +/* l_lock.c */ +void l_check_ns_lock(struct ldlm_namespace *ns); +void l_check_no_ns_lock(struct ldlm_namespace *ns); + +extern struct dentry *ldlm_svc_debugfs_dir; + +struct ldlm_state { + struct ptlrpc_service *ldlm_cb_service; + struct ptlrpc_service *ldlm_cancel_service; + struct ptlrpc_client *ldlm_client; + struct ldlm_bl_pool *ldlm_bl_pool; +}; + +/* interval tree, for LDLM_EXTENT. */ +extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */ +extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l); +extern struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l); +extern void ldlm_interval_free(struct ldlm_interval *node); +/* this function must be called with res lock held */ +static inline struct ldlm_extent * +ldlm_interval_extent(struct ldlm_interval *node) +{ + struct ldlm_lock *lock; + LASSERT(!list_empty(&node->li_group)); + + lock = list_entry(node->li_group.next, struct ldlm_lock, + l_sl_policy); + return &lock->l_policy_data.l_extent; +} + +int ldlm_init(void); +void ldlm_exit(void); + +enum ldlm_policy_res { + LDLM_POLICY_CANCEL_LOCK, + LDLM_POLICY_KEEP_LOCK, + LDLM_POLICY_SKIP_LOCK +}; + +#define LDLM_POOL_SYSFS_PRINT_int(v) sprintf(buf, "%d\n", v) +#define LDLM_POOL_SYSFS_SET_int(a, b) { a = b; } +#define LDLM_POOL_SYSFS_PRINT_u64(v) sprintf(buf, "%lld\n", v) +#define LDLM_POOL_SYSFS_SET_u64(a, b) { a = b; } +#define LDLM_POOL_SYSFS_PRINT_atomic(v) sprintf(buf, "%d\n", atomic_read(&v)) +#define LDLM_POOL_SYSFS_SET_atomic(a, b) atomic_set(&a, b) + +#define LDLM_POOL_SYSFS_READER_SHOW(var, type) \ + static ssize_t var##_show(struct kobject *kobj, \ + struct attribute *attr, \ + char *buf) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + type tmp; \ + \ + spin_lock(&pl->pl_lock); \ + tmp = pl->pl_##var; \ + spin_unlock(&pl->pl_lock); \ + \ + return LDLM_POOL_SYSFS_PRINT_##type(tmp); \ + } \ + struct __##var##__dummy_read {;} /* semicolon catcher */ + +#define LDLM_POOL_SYSFS_WRITER_STORE(var, type) \ + static ssize_t var##_store(struct kobject *kobj, \ + struct attribute *attr, \ + const char *buffer, \ + size_t count) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + unsigned long tmp; \ + int rc; \ + \ + rc = kstrtoul(buffer, 10, &tmp); \ + if (rc < 0) { \ + return rc; \ + } \ + \ + spin_lock(&pl->pl_lock); \ + LDLM_POOL_SYSFS_SET_##type(pl->pl_##var, tmp); \ + spin_unlock(&pl->pl_lock); \ + \ + return count; \ + } \ + struct __##var##__dummy_write {; } /* semicolon catcher */ + +#define LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(var, type) \ + static ssize_t var##_show(struct kobject *kobj, \ + struct attribute *attr, \ + char *buf) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + \ + return LDLM_POOL_SYSFS_PRINT_##type(pl->pl_##var); \ + } \ + struct __##var##__dummy_read {; } /* semicolon catcher */ + +#define LDLM_POOL_SYSFS_WRITER_NOLOCK_STORE(var, type) \ + static ssize_t var##_store(struct kobject *kobj, \ + struct attribute *attr, \ + const char *buffer, \ + size_t count) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + unsigned long tmp; \ + int rc; \ + \ + rc = kstrtoul(buffer, 10, &tmp); \ + if (rc < 0) { \ + return rc; \ + } \ + \ + LDLM_POOL_SYSFS_SET_##type(pl->pl_##var, tmp); \ + \ + 
return count; \ + } \ + struct __##var##__dummy_write {; } /* semicolon catcher */ + +static inline void +ldlm_add_var(struct ldebugfs_vars *vars, struct dentry *debugfs_entry, + const char *name, void *data, const struct file_operations *ops) +{ + vars->name = name; + vars->data = data; + vars->fops = ops; + ldebugfs_add_vars(debugfs_entry, vars, NULL); +} + +static inline int is_granted_or_cancelled(struct ldlm_lock *lock) +{ + int ret = 0; + + lock_res_and_lock(lock); + ret = is_granted_or_cancelled_nolock(lock); + unlock_res_and_lock(lock); + + return ret; +} + +static inline bool is_bl_done(struct ldlm_lock *lock) +{ + bool bl_done = true; + + if (!ldlm_is_bl_done(lock)) { + lock_res_and_lock(lock); + bl_done = ldlm_is_bl_done(lock); + unlock_res_and_lock(lock); + } + + return bl_done; +} + +static inline bool is_lock_converted(struct ldlm_lock *lock) +{ + bool ret = 0; + + lock_res_and_lock(lock); + ret = (lock->l_policy_data.l_inodebits.cancel_bits == 0); + unlock_res_and_lock(lock); + + return ret; +} + +typedef void (*ldlm_policy_wire_to_local_t)(const union ldlm_wire_policy_data *, + union ldlm_policy_data *); +typedef void (*ldlm_policy_local_to_wire_t)(const union ldlm_policy_data *, + union ldlm_wire_policy_data *); +void ldlm_plain_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_plain_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_ibits_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_extent_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_extent_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_flock_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_flock_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); + +/* ldlm_reclaim.c */ +#ifdef HAVE_SERVER_SUPPORT +extern __u64 ldlm_reclaim_threshold; +extern __u64 ldlm_lock_limit; +extern __u64 ldlm_reclaim_threshold_mb; +extern __u64 ldlm_lock_limit_mb; +extern struct percpu_counter ldlm_granted_total; +#endif +int ldlm_reclaim_setup(void); +void ldlm_reclaim_cleanup(void); +void ldlm_reclaim_add(struct ldlm_lock *lock); +void ldlm_reclaim_del(struct ldlm_lock *lock); +bool ldlm_reclaim_full(void); + +static inline bool ldlm_res_eq(const struct ldlm_res_id *res0, + const struct ldlm_res_id *res1) +{ + return memcmp(res0, res1, sizeof(*res0)) == 0; +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c new file mode 100644 index 0000000000000..bf61555c331da --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c @@ -0,0 +1,3569 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +/** + * This file deals with various client/target related logic including recovery. + * + * TODO: This code more logically belongs in the ptlrpc module than in ldlm and + * should be moved. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ldlm_internal.h" + +/* + * @priority: If non-zero, move the selected connection to the list head. + * @create: If zero, only search in existing connections. + */ +static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority, int create) +{ + struct ptlrpc_connection *ptlrpc_conn; + struct obd_import_conn *imp_conn = NULL, *item; + lnet_nid_t nid4refnet = LNET_NID_ANY; + u32 refnet = imp->imp_conn_restricted_net; + int rc = 0; + + ENTRY; + + if (!create && !priority) { + CDEBUG(D_HA, "Nothing to do\n"); + RETURN(-EINVAL); + } + + /* refnet is used to restrict network connections */ + if (refnet != LNET_NIDNET(LNET_NID_ANY)) { + CDEBUG(D_HA, "imp %s: restrict %s to %s net\n", + imp->imp_obd->obd_name, uuid->uuid, + libcfs_net2str(refnet)); + nid4refnet = LNET_MKNID(refnet, 0); + } + + ptlrpc_conn = ptlrpc_uuid_to_connection(uuid, nid4refnet); + if (!ptlrpc_conn) { + CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid); + RETURN(-ENOENT); + } + + if (create) { + OBD_ALLOC(imp_conn, sizeof(*imp_conn)); + if (!imp_conn) + GOTO(out_put, rc = -ENOMEM); + } + + spin_lock(&imp->imp_lock); + list_for_each_entry(item, &imp->imp_conn_list, oic_item) { + if (obd_uuid_equals(uuid, &item->oic_uuid)) { + if (priority) { + list_move(&item->oic_item, + &imp->imp_conn_list); + item->oic_last_attempt = 0; + } + CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? ", moved to head" : "")); + spin_unlock(&imp->imp_lock); + GOTO(out_free, rc = 0); + } + } + /* No existing import connection found for \a uuid. */ + if (create) { + imp_conn->oic_conn = ptlrpc_conn; + imp_conn->oic_uuid = *uuid; + imp_conn->oic_last_attempt = 0; + if (priority) + list_add(&imp_conn->oic_item, &imp->imp_conn_list); + else + list_add_tail(&imp_conn->oic_item, + &imp->imp_conn_list); + CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? 
"head" : "tail")); + } else { + spin_unlock(&imp->imp_lock); + GOTO(out_free, rc = -ENOENT); + } + + spin_unlock(&imp->imp_lock); + RETURN(0); +out_free: + if (imp_conn) + OBD_FREE(imp_conn, sizeof(*imp_conn)); +out_put: + ptlrpc_connection_put(ptlrpc_conn); + RETURN(rc); +} + +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid) +{ + return import_set_conn(imp, uuid, 1, 0); +} + +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + return import_set_conn(imp, uuid, priority, 1); +} +EXPORT_SYMBOL(client_import_add_conn); + +int client_import_dyn_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + lnet_nid_t prim_nid, int priority) +{ + struct ptlrpc_connection *ptlrpc_conn; + int rc; + + ptlrpc_conn = ptlrpc_uuid_to_connection(uuid, prim_nid); + if (!ptlrpc_conn) { + const char *str_uuid = obd_uuid2str(uuid); + + rc = class_add_uuid(str_uuid, prim_nid); + if (rc) { + CERROR("%s: failed to add UUID '%s': rc = %d\n", + imp->imp_obd->obd_name, str_uuid, rc); + return rc; + } + } + return import_set_conn(imp, uuid, priority, 1); +} +EXPORT_SYMBOL(client_import_dyn_add_conn); + +int client_import_add_nids_to_conn(struct obd_import *imp, lnet_nid_t *nids, + int nid_count, struct obd_uuid *uuid) +{ + struct obd_import_conn *conn; + int rc = -ENOENT; + + ENTRY; + if (nid_count <= 0 || !nids) + return rc; + + spin_lock(&imp->imp_lock); + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + if (class_check_uuid(&conn->oic_uuid, nids[0])) { + *uuid = conn->oic_uuid; + spin_unlock(&imp->imp_lock); + rc = class_add_nids_to_uuid(&conn->oic_uuid, nids, + nid_count); + RETURN(rc); + } + } + spin_unlock(&imp->imp_lock); + RETURN(rc); +} +EXPORT_SYMBOL(client_import_add_nids_to_conn); + +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_import_conn *imp_conn; + struct obd_export *dlmexp; + int rc = -ENOENT; + + ENTRY; + + spin_lock(&imp->imp_lock); + if (list_empty(&imp->imp_conn_list)) { + LASSERT(!imp->imp_connection); + GOTO(out, rc); + } + + list_for_each_entry(imp_conn, &imp->imp_conn_list, oic_item) { + if (!obd_uuid_equals(uuid, &imp_conn->oic_uuid)) + continue; + LASSERT(imp_conn->oic_conn); + + if (imp_conn == imp->imp_conn_current) { + LASSERT(imp_conn->oic_conn == imp->imp_connection); + + if (imp->imp_state != LUSTRE_IMP_CLOSED && + imp->imp_state != LUSTRE_IMP_DISCON) { + CERROR("can't remove current connection\n"); + GOTO(out, rc = -EBUSY); + } + + ptlrpc_connection_put(imp->imp_connection); + imp->imp_connection = NULL; + + dlmexp = class_conn2export(&imp->imp_dlm_handle); + if (dlmexp && dlmexp->exp_connection) { + LASSERT(dlmexp->exp_connection == + imp_conn->oic_conn); + ptlrpc_connection_put(dlmexp->exp_connection); + dlmexp->exp_connection = NULL; + } + + if (dlmexp != NULL) + class_export_put(dlmexp); + } + + list_del(&imp_conn->oic_item); + ptlrpc_connection_put(imp_conn->oic_conn); + OBD_FREE(imp_conn, sizeof(*imp_conn)); + CDEBUG(D_HA, "imp %p@%s: remove connection %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid); + rc = 0; + break; + } +out: + spin_unlock(&imp->imp_lock); + if (rc == -ENOENT) + CERROR("connection %s not found\n", uuid->uuid); + RETURN(rc); +} +EXPORT_SYMBOL(client_import_del_conn); + +/** + * Find conn UUID by peer NID. \a peer is a server NID. This function is used + * to find a conn uuid of \a imp which can reach \a peer. 
+ */ +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid) +{ + struct obd_import_conn *conn; + int rc = -ENOENT; + + ENTRY; + + spin_lock(&imp->imp_lock); + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + /* Check if conn UUID does have this peer NID. */ + if (class_check_uuid(&conn->oic_uuid, peer)) { + *uuid = conn->oic_uuid; + rc = 0; + break; + } + } + spin_unlock(&imp->imp_lock); + RETURN(rc); +} +EXPORT_SYMBOL(client_import_find_conn); + +void client_destroy_import(struct obd_import *imp) +{ + /* + * Drop security policy instance after all RPCs have finished/aborted + * to let all busy contexts be released. + */ + class_import_get(imp); + class_destroy_import(imp); + sptlrpc_import_sec_put(imp); + class_import_put(imp); +} +EXPORT_SYMBOL(client_destroy_import); + +/** + * Check whether or not the OSC is on MDT. + * In the config log, + * osc on MDT + * setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID + * osc on client + * setup 0:{fsname}-OSTxxxx-osc 1:lustre-OST0000_UUID 2:NID + * + **/ +static int osc_on_mdt(char *obdname) +{ + char *ptr; + + ptr = strrchr(obdname, '-'); + if (ptr == NULL) + return 0; + + if (strncmp(ptr + 1, "MDT", 3) == 0) + return 1; + + return 0; +} + +/* + * Configure an RPC client OBD device. + * + * lcfg parameters: + * 1 - client UUID + * 2 - server UUID + * 3 - inactive-on-startup + * 4 - restrictive net + */ +int client_obd_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp; + struct obd_uuid server_uuid; + int rq_portal, rp_portal, connect_op; + const char *name = obd->obd_type->typ_name; + enum ldlm_ns_type ns_type = LDLM_NS_TYPE_UNKNOWN; + char *cli_name = lustre_cfg_buf(lcfg, 0); + int rc; + + ENTRY; + + /* + * In a more perfect world, we would hang a ptlrpc_client off of + * obd_type and just use the values from there. 
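+ * Until then, the request/reply portals, the connect opcode, the LDLM
+ * namespace type and the sptlrpc peer types (cl_sp_me/cl_sp_to) are
+ * selected by matching the obd type name below.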
+ */ + if (!strcmp(name, LUSTRE_OSC_NAME)) { + rq_portal = OST_REQUEST_PORTAL; + rp_portal = OSC_REPLY_PORTAL; + connect_op = OST_CONNECT; + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_OST; + ns_type = LDLM_NS_TYPE_OSC; + } else if (!strcmp(name, LUSTRE_MDC_NAME) || + !strcmp(name, LUSTRE_LWP_NAME)) { + rq_portal = MDS_REQUEST_PORTAL; + rp_portal = MDC_REPLY_PORTAL; + connect_op = MDS_CONNECT; + if (is_lwp_on_ost(cli_name)) + cli->cl_sp_me = LUSTRE_SP_OST; + else if (is_lwp_on_mdt(cli_name)) + cli->cl_sp_me = LUSTRE_SP_MDT; + else + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_MDT; + ns_type = LDLM_NS_TYPE_MDC; + } else if (!strcmp(name, LUSTRE_OSP_NAME)) { + if (strstr(lustre_cfg_buf(lcfg, 1), "OST") == NULL) { + /* OSP_on_MDT for other MDTs */ + connect_op = MDS_CONNECT; + cli->cl_sp_to = LUSTRE_SP_MDT; + ns_type = LDLM_NS_TYPE_MDC; + rq_portal = OUT_PORTAL; + } else { + /* OSP on MDT for OST */ + connect_op = OST_CONNECT; + cli->cl_sp_to = LUSTRE_SP_OST; + ns_type = LDLM_NS_TYPE_OSC; + rq_portal = OST_REQUEST_PORTAL; + } + rp_portal = OSC_REPLY_PORTAL; + cli->cl_sp_me = LUSTRE_SP_MDT; + } else if (!strcmp(name, LUSTRE_MGC_NAME)) { + rq_portal = MGS_REQUEST_PORTAL; + rp_portal = MGC_REPLY_PORTAL; + connect_op = MGS_CONNECT; + cli->cl_sp_me = LUSTRE_SP_MGC; + cli->cl_sp_to = LUSTRE_SP_MGS; + cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID; + ns_type = LDLM_NS_TYPE_MGC; + } else { + CERROR("unknown client OBD type \"%s\", can't setup\n", + name); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("requires a TARGET UUID\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) { + CERROR("client UUID must be less than 38 characters\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) { + CERROR("setup requires a SERVER UUID\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) { + CERROR("target UUID must be less than 38 characters\n"); + RETURN(-EINVAL); + } + + init_rwsem(&cli->cl_sem); + mutex_init(&cli->cl_mgc_mutex); + cli->cl_seq = NULL; + init_rwsem(&cli->cl_seq_rwsem); + cli->cl_conn_count = 0; + memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2), + min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2), + sizeof(server_uuid))); + + cli->cl_dirty_pages = 0; + cli->cl_dirty_max_pages = 0; + cli->cl_avail_grant = 0; + /* FIXME: Should limit this for the sum of all cl_dirty_max_pages. */ + /* + * cl_dirty_max_pages may be changed at connect time in + * ptlrpc_connect_interpret(). + */ + client_adjust_max_dirty(cli); + init_waitqueue_head(&cli->cl_cache_waiters); + INIT_LIST_HEAD(&cli->cl_loi_ready_list); + INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list); + INIT_LIST_HEAD(&cli->cl_loi_write_list); + INIT_LIST_HEAD(&cli->cl_loi_read_list); + spin_lock_init(&cli->cl_loi_list_lock); + atomic_set(&cli->cl_pending_w_pages, 0); + atomic_set(&cli->cl_pending_r_pages, 0); + cli->cl_r_in_flight = 0; + cli->cl_w_in_flight = 0; + + spin_lock_init(&cli->cl_read_rpc_hist.oh_lock); + spin_lock_init(&cli->cl_write_rpc_hist.oh_lock); + spin_lock_init(&cli->cl_read_page_hist.oh_lock); + spin_lock_init(&cli->cl_write_page_hist.oh_lock); + spin_lock_init(&cli->cl_read_offset_hist.oh_lock); + spin_lock_init(&cli->cl_write_offset_hist.oh_lock); + + /* lru for osc. 
*/ + INIT_LIST_HEAD(&cli->cl_lru_osc); + atomic_set(&cli->cl_lru_shrinkers, 0); + atomic_long_set(&cli->cl_lru_busy, 0); + atomic_long_set(&cli->cl_lru_in_list, 0); + INIT_LIST_HEAD(&cli->cl_lru_list); + spin_lock_init(&cli->cl_lru_list_lock); + atomic_long_set(&cli->cl_unstable_count, 0); + INIT_LIST_HEAD(&cli->cl_shrink_list); + INIT_LIST_HEAD(&cli->cl_grant_chain); + + INIT_LIST_HEAD(&cli->cl_flight_waiters); + cli->cl_rpcs_in_flight = 0; + + init_waitqueue_head(&cli->cl_destroy_waitq); + atomic_set(&cli->cl_destroy_in_flight, 0); + + + cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; + cli->cl_preferred_cksum_type = 0; +#ifdef ENABLE_CHECKSUM + /* Turn on checksumming by default. */ + cli->cl_checksum = 1; + /* + * The supported checksum types will be worked out at connect time + * Set cl_chksum* to CRC32 for now to avoid returning screwed info + * through procfs. + */ + cli->cl_cksum_type = cli->cl_supp_cksum_types; +#endif + atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS); + + /* + * Set it to possible maximum size. It may be reduced by ocd_brw_size + * from OFD after connecting. + */ + cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES; + + cli->cl_max_short_io_bytes = OBD_DEF_SHORT_IO_BYTES; + + /* + * set cl_chunkbits default value to PAGE_SHIFT, + * it will be updated at OSC connection time. + */ + cli->cl_chunkbits = PAGE_SHIFT; + + if (!strcmp(name, LUSTRE_MDC_NAME)) { + cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; + } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 128 /* MB */) { + cli->cl_max_rpcs_in_flight = 2; + } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 256 /* MB */) { + cli->cl_max_rpcs_in_flight = 3; + } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 512 /* MB */) { + cli->cl_max_rpcs_in_flight = 4; + } else { + if (osc_on_mdt(obd->obd_name)) + cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_MAX; + else + cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; + } + + spin_lock_init(&cli->cl_mod_rpcs_lock); + spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock); + cli->cl_max_mod_rpcs_in_flight = 0; + cli->cl_mod_rpcs_in_flight = 0; + cli->cl_close_rpcs_in_flight = 0; + init_waitqueue_head(&cli->cl_mod_rpcs_waitq); + cli->cl_mod_rpcs_init = ktime_get_real(); + cli->cl_mod_tag_bitmap = NULL; + + INIT_LIST_HEAD(&cli->cl_chg_dev_linkage); + + if (connect_op == MDS_CONNECT) { + cli->cl_max_mod_rpcs_in_flight = cli->cl_max_rpcs_in_flight - 1; + OBD_ALLOC(cli->cl_mod_tag_bitmap, + BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long)); + if (cli->cl_mod_tag_bitmap == NULL) + GOTO(err, rc = -ENOMEM); + } + + rc = ldlm_get_ref(); + if (rc) { + CERROR("ldlm_get_ref failed: %d\n", rc); + GOTO(err, rc); + } + + ptlrpc_init_client(rq_portal, rp_portal, name, + &obd->obd_ldlm_client); + + imp = class_new_import(obd); + if (imp == NULL) + GOTO(err_ldlm, rc = -ENOENT); + imp->imp_client = &obd->obd_ldlm_client; + imp->imp_connect_op = connect_op; + memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 1)); + class_import_put(imp); + + if (lustre_cfg_buf(lcfg, 4)) { + __u32 refnet = libcfs_str2net(lustre_cfg_string(lcfg, 4)); + + if (refnet == LNET_NET_ANY) { + rc = -EINVAL; + CERROR("%s: bad mount option 'network=%s': rc = %d\n", + obd->obd_name, lustre_cfg_string(lcfg, 4), + rc); + GOTO(err_import, rc); + } + imp->imp_conn_restricted_net = refnet; + } else { + imp->imp_conn_restricted_net = LNET_NIDNET(LNET_NID_ANY); + } + + rc = client_import_add_conn(imp, &server_uuid, 1); + if (rc) { + CERROR("can't add initial connection\n"); + GOTO(err_import, rc); + 
} + imp->imp_connection = NULL; + + cli->cl_import = imp; + /* cli->cl_max_mds_easize updated by mdc_init_ea_size() */ + cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3); + + if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { + if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) { + CDEBUG(D_HA, "marking %s %s->%s as inactive\n", + name, obd->obd_name, + cli->cl_target_uuid.uuid); + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + } + } + + obd->obd_namespace = ldlm_namespace_new(obd, obd->obd_name, + LDLM_NAMESPACE_CLIENT, + LDLM_NAMESPACE_GREEDY, + ns_type); + if (IS_ERR(obd->obd_namespace)) { + rc = PTR_ERR(obd->obd_namespace); + CERROR("%s: unable to create client namespace: rc = %d\n", + obd->obd_name, rc); + obd->obd_namespace = NULL; + GOTO(err_import, rc); + } + + RETURN(rc); + +err_import: + class_destroy_import(imp); +err_ldlm: + ldlm_put_ref(); +err: + if (cli->cl_mod_tag_bitmap != NULL) + OBD_FREE(cli->cl_mod_tag_bitmap, + BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long)); + cli->cl_mod_tag_bitmap = NULL; + + RETURN(rc); +} +EXPORT_SYMBOL(client_obd_setup); + +int client_obd_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + + ENTRY; + + ldlm_namespace_free_post(obd->obd_namespace); + obd->obd_namespace = NULL; + + obd_cleanup_client_import(obd); + LASSERT(obd->u.cli.cl_import == NULL); + + ldlm_put_ref(); + + if (cli->cl_mod_tag_bitmap != NULL) + OBD_FREE(cli->cl_mod_tag_bitmap, + BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long)); + cli->cl_mod_tag_bitmap = NULL; + + RETURN(0); +} +EXPORT_SYMBOL(client_obd_cleanup); + +/* ->o_connect() method for client side (OSC and MDC and MGC) */ +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + struct obd_connect_data *ocd; + struct lustre_handle conn = { 0 }; + int rc; + + ENTRY; + + *exp = NULL; + down_write(&cli->cl_sem); + if (cli->cl_conn_count > 0) + GOTO(out_sem, rc = -EALREADY); + + rc = class_connect(&conn, obd, cluuid); + if (rc) + GOTO(out_sem, rc); + + cli->cl_conn_count++; + *exp = class_conn2export(&conn); + + LASSERT(obd->obd_namespace); + + imp->imp_dlm_handle = conn; + rc = ptlrpc_init_import(imp); + if (rc != 0) + GOTO(out_ldlm, rc); + + ocd = &imp->imp_connect_data; + if (data) { + *ocd = *data; + imp->imp_connect_flags_orig = data->ocd_connect_flags; + imp->imp_connect_flags2_orig = data->ocd_connect_flags2; + } + + rc = ptlrpc_connect_import(imp); + if (rc != 0) { + LASSERT(imp->imp_state == LUSTRE_IMP_DISCON); + GOTO(out_ldlm, rc); + } + LASSERT(*exp != NULL && (*exp)->exp_connection); + + if (data) { + LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) == + ocd->ocd_connect_flags, "old %#llx, new %#llx\n", + data->ocd_connect_flags, ocd->ocd_connect_flags); + data->ocd_connect_flags = ocd->ocd_connect_flags; + data->ocd_connect_flags2 = ocd->ocd_connect_flags2; + } + + ptlrpc_pinger_add_import(imp); + + EXIT; + + if (rc) { +out_ldlm: + cli->cl_conn_count--; + class_disconnect(*exp); + *exp = NULL; + } +out_sem: + up_write(&cli->cl_sem); + + if (!rc && localdata) { + LASSERT(cli->cl_cache == NULL); /* only once */ + cli->cl_cache = (struct cl_client_cache *)localdata; + cl_cache_incref(cli->cl_cache); + cli->cl_lru_left = &cli->cl_cache->ccc_lru_left; + + /* add this osc into entity list */ + LASSERT(list_empty(&cli->cl_lru_osc)); + 
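/*
+ * Each OSC sharing this cl_client_cache is linked on ccc_lru;
+ * presumably this is what lets cache-wide LRU accounting and
+ * shrinking reach the pages owned by this client.
+ */
+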
spin_lock(&cli->cl_cache->ccc_lru_lock); + list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + } + + return rc; +} +EXPORT_SYMBOL(client_connect_import); + +int client_disconnect_export(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct client_obd *cli; + struct obd_import *imp; + int rc = 0, err; + + ENTRY; + + if (!obd) { + CERROR("invalid export for disconnect: exp %p cookie %#llx\n", + exp, exp ? exp->exp_handle.h_cookie : -1); + RETURN(-EINVAL); + } + + cli = &obd->u.cli; + imp = cli->cl_import; + + down_write(&cli->cl_sem); + CDEBUG(D_INFO, "disconnect %s - %zu\n", obd->obd_name, + cli->cl_conn_count); + + if (cli->cl_conn_count == 0) { + CERROR("disconnecting disconnected device (%s)\n", + obd->obd_name); + GOTO(out_disconnect, rc = -EINVAL); + } + + cli->cl_conn_count--; + if (cli->cl_conn_count != 0) + GOTO(out_disconnect, rc = 0); + + /* + * Mark import deactivated now, so we don't try to reconnect if any + * of the cleanup RPCs fails (e.g. LDLM cancel, etc). We don't + * fully deactivate the import, or that would drop all requests. + */ + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + + /* + * Some non-replayable imports (MDS's OSCs) are pinged, so just + * delete it regardless. (It's safe to delete an import that was + * never added.) + */ + (void)ptlrpc_pinger_del_import(imp); + + if (obd->obd_namespace != NULL) { + /* obd_force == local only */ + ldlm_cli_cancel_unused(obd->obd_namespace, NULL, + obd->obd_force ? LCF_LOCAL : 0, NULL); + ldlm_namespace_free_prior(obd->obd_namespace, imp, + obd->obd_force); + } + + /* + * There's no need to hold sem while disconnecting an import, + * and it may actually cause deadlock in GSS. + */ + up_write(&cli->cl_sem); + rc = ptlrpc_disconnect_import(imp, 0); + down_write(&cli->cl_sem); + + ptlrpc_invalidate_import(imp); + + EXIT; + +out_disconnect: + /* + * Use server style - class_disconnect should be always called for + * o_disconnect. + */ + err = class_disconnect(exp); + if (!rc && err) + rc = err; + + up_write(&cli->cl_sem); + + RETURN(rc); +} +EXPORT_SYMBOL(client_disconnect_export); + +#ifdef HAVE_SERVER_SUPPORT +int server_disconnect_export(struct obd_export *exp) +{ + int rc; + + ENTRY; + + /* Disconnect early so that clients can't keep using export. */ + rc = class_disconnect(exp); + /* Close import to avoid sending any requests. 
*/ + if (exp->exp_imp_reverse) + ptlrpc_cleanup_imp(exp->exp_imp_reverse); + + ldlm_bl_thread_wakeup(); + + /* complete all outstanding replies */ + spin_lock(&exp->exp_lock); + while (!list_empty(&exp->exp_outstanding_replies)) { + struct ptlrpc_reply_state *rs = + list_entry(exp->exp_outstanding_replies.next, + struct ptlrpc_reply_state, rs_exp_list); + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + spin_lock(&svcpt->scp_rep_lock); + + list_del_init(&rs->rs_exp_list); + + spin_lock(&rs->rs_lock); + /* clear rs_convert_lock to make sure rs is handled and put */ + rs->rs_convert_lock = 0; + ptlrpc_schedule_difficult_reply(rs); + spin_unlock(&rs->rs_lock); + + spin_unlock(&svcpt->scp_rep_lock); + } + spin_unlock(&exp->exp_lock); + + RETURN(rc); +} +EXPORT_SYMBOL(server_disconnect_export); + +static inline int target_check_recovery_timer(struct obd_device *target) +{ + ktime_t remaining; + s64 timeout; + + if (!target->obd_recovering || target->obd_recovery_start == 0) + return 0; + + remaining = hrtimer_get_remaining(&target->obd_recovery_timer); + timeout = ktime_divns(remaining, NSEC_PER_SEC); + if (timeout > -30) + return 0; + + /* the recovery timer should expire, but it isn't triggered, + * it's better to abort the recovery of this target to speed up + * the recovery of the whole cluster. */ + spin_lock(&target->obd_dev_lock); + if (target->obd_recovering) { + CERROR("%s: Aborting recovery\n", target->obd_name); + target->obd_abort_recovery = 1; + wake_up(&target->obd_next_transno_waitq); + } + spin_unlock(&target->obd_dev_lock); + return 0; +} + +/* + * -------------------------------------------------------------------------- + * from old lib/target.c + * -------------------------------------------------------------------------- + */ +static int target_handle_reconnect(struct lustre_handle *conn, + struct obd_export *exp, + struct obd_uuid *cluuid) +{ + struct obd_device *target; + struct lustre_handle *hdl; + ktime_t remaining; + s64 timeout; + int rc = 0; + + ENTRY; + hdl = &exp->exp_imp_reverse->imp_remote_handle; + if (!exp->exp_connection || !lustre_handle_is_used(hdl)) { + conn->cookie = exp->exp_handle.h_cookie; + CDEBUG(D_HA, + "connect export for UUID '%s' at %p, cookie %#llx\n", + cluuid->uuid, exp, conn->cookie); + RETURN(0); + } + + target = exp->exp_obd; + + /* Might be a re-connect after a partition. */ + if (memcmp(&conn->cookie, &hdl->cookie, sizeof(conn->cookie))) { + LCONSOLE_WARN("%s: already connected client %s (at %s) with handle %#llx. Rejecting client with the same UUID trying to reconnect with handle %#llx\n", + target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + hdl->cookie, conn->cookie); + memset(conn, 0, sizeof(*conn)); + /* + * target_handle_connect() treats EALREADY and + * -EALREADY differently. -EALREADY is an error + * (same UUID, different handle). 
+ */
+ RETURN(-EALREADY);
+ }
+
+ if (!target->obd_recovering) {
+ LCONSOLE_WARN("%s: Client %s (at %s) reconnecting\n",
+ target->obd_name, obd_uuid2str(&exp->exp_client_uuid),
+ obd_export_nid2str(exp));
+ GOTO(out_already, rc);
+ }
+
+ remaining = hrtimer_get_remaining(&target->obd_recovery_timer);
+ timeout = ktime_divns(remaining, NSEC_PER_SEC);
+ if (timeout > 0) {
+ LCONSOLE_WARN("%s: Client %s (at %s) reconnected, waiting for %d clients in recovery for %lld:%.02lld\n",
+ target->obd_name,
+ obd_uuid2str(&exp->exp_client_uuid),
+ obd_export_nid2str(exp),
+ atomic_read(&target->obd_max_recoverable_clients),
+ timeout / 60, timeout % 60);
+ } else {
+ struct target_distribute_txn_data *tdtd;
+ int size = 0;
+ int count = 0;
+ char *buf = NULL;
+
+ target_check_recovery_timer(target);
+
+ tdtd = class_exp2tgt(exp)->lut_tdtd;
+ if (tdtd && tdtd->tdtd_show_update_logs_retrievers)
+ buf = tdtd->tdtd_show_update_logs_retrievers(
+ tdtd->tdtd_show_retrievers_cbdata,
+ &size, &count);
+
+ if (count > 0)
+ LCONSOLE_WARN("%s: Client %s (at %s) reconnecting, waiting for %d MDTs (%s) in recovery for %lld:%.02lld. Please wait until all MDTs are recovered or you may force MDT eviction via 'lctl --device %s abort_recovery'.\n",
+ target->obd_name,
+ obd_uuid2str(&exp->exp_client_uuid),
+ obd_export_nid2str(exp), count,
+ buf ? buf : "unknown (not enough RAM)",
+ (abs(timeout) + target->obd_recovery_timeout) / 60,
+ (abs(timeout) + target->obd_recovery_timeout) % 60,
+ target->obd_name);
+ else
+ LCONSOLE_WARN("%s: Recovery already passed deadline %lld:%.02lld. If you do not want to wait more, you may force target eviction via 'lctl --device %s abort_recovery'.\n",
+ target->obd_name, abs(timeout) / 60,
+ abs(timeout) % 60, target->obd_name);
+
+ if (buf != NULL)
+ OBD_FREE(buf, size);
+ }
+
+out_already:
+ conn->cookie = exp->exp_handle.h_cookie;
+ /*
+ * target_handle_connect() treats EALREADY and
+ * -EALREADY differently. EALREADY means we are
+ * doing a valid reconnect from the same client.
+ */
+ RETURN(EALREADY);
+}
+
+static void
+check_and_start_recovery_timer(struct obd_device *obd,
+ struct ptlrpc_request *req, int new_client);
+
+/**
+ * Update flags for an import during the reconnect process.
+ */
+static int rev_import_flags_update(struct obd_import *revimp,
+ struct ptlrpc_request *req)
+{
+ int rc;
+ struct obd_connect_data *data;
+
+ data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA);
+
+ if (data->ocd_connect_flags & OBD_CONNECT_AT)
+ revimp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
+ else
+ revimp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+
+ revimp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
+
+ revimp->imp_connect_data = *data;
+ rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx, &req->rq_flvr);
+ if (rc) {
+ CERROR("%s: cannot get reverse import %s security: rc = %d\n",
+ revimp->imp_client->cli_name,
+ libcfs_id2str(req->rq_peer), rc);
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * Allocate a new reverse import for an export.
+ *
+ * \retval -errno in case an error is hit
+ * \retval 0 if the reverse import was correctly initialized
+ **/
+int rev_import_init(struct obd_export *export)
+{
+ struct obd_device *obd = export->exp_obd;
+ struct obd_import *revimp;
+
+ LASSERT(export->exp_imp_reverse == NULL);
+
+ revimp = class_new_import(obd);
+ if (revimp == NULL)
+ return -ENOMEM;
+
+ revimp->imp_remote_handle.cookie = 0ULL;
+ revimp->imp_client = &obd->obd_ldlm_client;
+ revimp->imp_dlm_fake = 1;
+
+ /* it is safe to connect the import in the new state as no sends are possible */
+ spin_lock(&export->exp_lock);
+ export->exp_imp_reverse = revimp;
+ spin_unlock(&export->exp_lock);
+ class_import_put(revimp);
+
+ return 0;
+}
+EXPORT_SYMBOL(rev_import_init);
+
+/**
+ * Handle reconnect for an export.
+ *
+ * \param exp export to handle reconnect process
+ * \param req client reconnect request
+ *
+ * \retval -rc in case the security flavor can't be changed
+ * \retval 0 in case of no problems
+ */
+static int rev_import_reconnect(struct obd_export *exp,
+ struct ptlrpc_request *req)
+{
+ struct obd_import *revimp = exp->exp_imp_reverse;
+ struct lustre_handle *lh;
+ int rc;
+
+ /* avoid sending a request until import flags are changed */
+ ptlrpc_import_enter_resend(revimp);
+
+ ptlrpc_connection_put(revimp->imp_connection);
+
+ /*
+ * a client in recovery doesn't have a handle, so we need to take it
+ * from the request. This may let a wrong client connect during
+ * recovery, since we trust the client uuid
+ */
+ lh = req_capsule_client_get(&req->rq_pill, &RMF_CONN);
+ revimp->imp_remote_handle = *lh;
+
+ /*
+ * unknown versions will be caught in
+ * ptlrpc_handle_server_req_in->lustre_unpack_msg()
+ */
+ revimp->imp_msg_magic = req->rq_reqmsg->lm_magic;
+
+ revimp->imp_connection = ptlrpc_connection_addref(exp->exp_connection);
+
+ rc = rev_import_flags_update(revimp, req);
+ if (rc != 0) {
+ /*
+ * it is safe to stay in the RECOVERY phase: since we could not
+ * set up the correct security flavor, requests cannot be
+ * delivered correctly anyway
+ */
+ return rc;
+ }
+
+ /* resend all RPCs via the new connection */
+ return ptlrpc_import_recovery_state_machine(revimp);
+}
+
+int target_handle_connect(struct ptlrpc_request *req)
+{
+ struct obd_device *target = NULL;
+ struct obd_export *export = NULL;
+ /*
+ * connect handle - filled from target_handle_reconnect in
+ * reconnect case
+ */
+ struct lustre_handle conn;
+ struct lustre_handle *tmp;
+ struct obd_uuid cluuid;
+ char *str;
+ int rc = 0;
+ char *target_start;
+ int target_len;
+ bool mds_conn = false, lw_client = false, initial_conn = false;
+ bool mds_mds_conn = false;
+ bool new_mds_mds_conn = false;
+ struct obd_connect_data *data, *tmpdata;
+ int size, tmpsize;
+ lnet_nid_t *client_nid = NULL;
+ struct ptlrpc_connection *pcon = NULL;
+
+ ENTRY;
+
+ OBD_RACE(OBD_FAIL_TGT_CONN_RACE);
+
+ str = req_capsule_client_get(&req->rq_pill, &RMF_TGTUUID);
+ if (str == NULL) {
+ DEBUG_REQ(D_ERROR, req, "bad target UUID for connect");
+ GOTO(out, rc = -EINVAL);
+ }
+
+ target = class_dev_by_str(str);
+ if (!target) {
+ deuuidify(str, NULL, &target_start, &target_len);
+ LCONSOLE_ERROR_MSG(0x137,
+ "%s: not available for connect from %s (no target). 
If you are running an HA pair check that the target is mounted on the other server.\n", + str, libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -ENODEV); + } + + spin_lock(&target->obd_dev_lock); + + target->obd_conn_inprogress++; + + if (target->obd_stopping || !target->obd_set_up) { + spin_unlock(&target->obd_dev_lock); + + deuuidify(str, NULL, &target_start, &target_len); + LCONSOLE_INFO("%.*s: Not available for connect from %s (%s)\n", + target_len, target_start, + libcfs_nid2str(req->rq_peer.nid), + (target->obd_stopping ? + "stopping" : "not set up")); + GOTO(out, rc = -ENODEV); + } + + if (target->obd_no_conn) { + spin_unlock(&target->obd_dev_lock); + + CDEBUG(D_INFO, + "%s: Temporarily refusing client connection from %s\n", + target->obd_name, libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -EAGAIN); + } + + spin_unlock(&target->obd_dev_lock); + + str = req_capsule_client_get(&req->rq_pill, &RMF_CLUUID); + if (str == NULL) { + DEBUG_REQ(D_ERROR, req, "bad client UUID for connect"); + GOTO(out, rc = -EINVAL); + } + + obd_str2uuid(&cluuid, str); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_CONN); + if (tmp == NULL) + GOTO(out, rc = -EPROTO); + + conn = *tmp; + + size = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA, + RCL_CLIENT); + if (size < 0 || size > 8 * sizeof(struct obd_connect_data)) + GOTO(out, rc = -EPROTO); + data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA); + if (!data) + GOTO(out, rc = -EPROTO); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* + * Don't allow clients to connect that are using old 1.8 format + * protocol conventions (LUSTRE_MSG_MAGIC_v1, !MSGHDR_CKSUM_INCOMPAT18, + * ldlm_flock_policy_wire format, MDT_ATTR_xTIME_SET, etc). The + * FULL20 flag should be set on all connections since 2.0, but no + * longer affects behaviour. + * + * Later this check will be disabled and the flag can be retired + * completely once interop with 3.0 is no longer needed. + */ + if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20)) + GOTO(out, rc = -EPROTO); + + /* + * Don't allow liblustre clients to connect. + * - testing was disabled in v2_2_50_0-61-g6a75d65 + * - building was disabled in v2_5_58_0-28-g7277179 + * - client code was deleted in v2_6_50_0-101-gcdfbc72, + * - clients were refused connect for version difference > 0.0.1.32 + */ + if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { + DEBUG_REQ(D_WARNING, req, "Refusing libclient connection"); + GOTO(out, rc = -EPROTO); + } +#endif + + /* + * Note: lw_client is needed in MDS-MDS failover during update log + * processing, so we needs to allow lw_client to be connected at + * anytime, instead of only the initial connection + */ + lw_client = OCD_HAS_FLAG(data, LIGHTWEIGHT); + + if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) { + initial_conn = true; + mds_conn = OCD_HAS_FLAG(data, MDS); + mds_mds_conn = OCD_HAS_FLAG(data, MDS_MDS); + + /* + * OBD_CONNECT_MNE_SWAB is removed at 2.14 + * Checking OBD_CONNECT_FID can be removed in the future. + * + * Via check OBD_CONNECT_FID, we can distinguish whether + * the OBD_CONNECT_MDS_MDS/OBD_CONNECT_MNE_SWAB is from + * MGC or MDT, since MGC does not use OBD_CONNECT_FID. 
+ */ + if (!lw_client && + (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) && + (data->ocd_connect_flags & OBD_CONNECT_FID) && + (data->ocd_connect_flags & OBD_CONNECT_VERSION)) { + __u32 major = OBD_OCD_VERSION_MAJOR(data->ocd_version); + __u32 minor = OBD_OCD_VERSION_MINOR(data->ocd_version); + __u32 patch = OBD_OCD_VERSION_PATCH(data->ocd_version); + + /* + * We do not support the MDT-MDT interoperations with + * different version MDT because of protocol changes. + */ + if (unlikely(major != LUSTRE_MAJOR || + minor != LUSTRE_MINOR || + abs(patch - LUSTRE_PATCH) > 3)) { + LCONSOLE_WARN("%s (%u.%u.%u.%u) refused the connection from different version MDT (%d.%d.%d.%d) %s %s\n", + target->obd_name, LUSTRE_MAJOR, + LUSTRE_MINOR, LUSTRE_PATCH, + LUSTRE_FIX, major, minor, patch, + OBD_OCD_VERSION_FIX(data->ocd_version), + libcfs_nid2str(req->rq_peer.nid), + str); + GOTO(out, rc = -EPROTO); + } + } + } + + /* lctl gets a backstage, all-access pass. */ + if (obd_uuid_equals(&cluuid, &target->obd_uuid)) + goto dont_check_exports; + + export = obd_uuid_lookup(target, &cluuid); + if (!export) + goto no_export; + + /* We've found an export in the hash. */ + + spin_lock(&export->exp_lock); + + if (export->exp_connecting) { /* b=9635, et. al. */ + spin_unlock(&export->exp_lock); + LCONSOLE_WARN("%s: Export %p already connecting from %s\n", + export->exp_obd->obd_name, export, + libcfs_nid2str(req->rq_peer.nid)); + class_export_put(export); + export = NULL; + rc = -EALREADY; + } else if ((mds_conn || (lw_client && initial_conn) || + OCD_HAS_FLAG(data, MDS_MDS)) && export->exp_connection) { + spin_unlock(&export->exp_lock); + if (req->rq_peer.nid != + lnet_nid_to_nid4(&export->exp_connection->c_peer.nid)) { + /* MDS or LWP reconnected after failover. */ + LCONSOLE_WARN("%s: Received %s connection from %s, removing former export from %s\n", + target->obd_name, + lw_client ? "LWP" : "MDS", + libcfs_nid2str(req->rq_peer.nid), + libcfs_nidstr(&export->exp_connection->c_peer.nid)); + } else { + /* New connection from the same NID. */ + LCONSOLE_WARN("%s: Received new %s connection from %s, %s former export from same NID\n", + target->obd_name, + lw_client ? "LWP" : "MDS", + libcfs_nid2str(req->rq_peer.nid), + OCD_HAS_FLAG(data, MDS_MDS) ? + "keep" : "remove"); + } + + if (req->rq_peer.nid == + lnet_nid_to_nid4(&export->exp_connection->c_peer.nid) && + OCD_HAS_FLAG(data, MDS_MDS)) { + /* + * Because exports between MDTs will always be + * kept, let's do not fail such export if they + * come from the same NID, otherwise it might + * cause eviction between MDTs, which might + * cause namespace inconsistency + */ + spin_lock(&export->exp_lock); + export->exp_connecting = 1; + export->exp_conn_cnt = 0; + spin_unlock(&export->exp_lock); + conn.cookie = export->exp_handle.h_cookie; + rc = EALREADY; + } else { + class_fail_export(export); + class_export_put(export); + export = NULL; + rc = 0; + } + } else if (export->exp_connection != NULL && initial_conn && + req->rq_peer.nid != lnet_nid_to_nid4(&export->exp_connection->c_peer.nid)) { + spin_unlock(&export->exp_lock); + /* In MDS failover we have static UUID but NID can change. 
*/ + LCONSOLE_WARN("%s: Client %s seen on new nid %s when existing nid %s is already connected\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), + libcfs_nidstr( + &export->exp_connection->c_peer.nid)); + rc = -EALREADY; + class_export_put(export); + export = NULL; + } else if (OBD_FAIL_PRECHECK(OBD_FAIL_TGT_RECOVERY_CONNECT) && + !lw_client) { + spin_unlock(&export->exp_lock); + rc = -EAGAIN; + } else { + export->exp_connecting = 1; + spin_unlock(&export->exp_lock); + LASSERT(export->exp_obd == target); + + rc = target_handle_reconnect(&conn, export, &cluuid); + } + + /* If we found an export, we already unlocked. */ + if (!export) { +no_export: + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_CONNECT, 2 * obd_timeout); + } else if (req->rq_export == NULL && + atomic_read(&export->exp_rpc_count) > 0) { + LCONSOLE_WARN("%s: Client %s (at %s) refused connection, still busy with %d references\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), + refcount_read(&export->exp_handle.h_ref)); + GOTO(out, rc = -EBUSY); + } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1 && + rc != EALREADY) { + if (!strstr(cluuid.uuid, "mdt")) + LCONSOLE_WARN("%s: Rejecting reconnect from the known client %s (at %s) because it is indicating it is a new client\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -EALREADY); + } else { + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); + } + + if (rc < 0) + GOTO(out, rc); + + CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %lld last %lld\n", + target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + target->obd_recovering ? "recovering/" : "", data->ocd_transno, + export, ktime_get_seconds(), + export ? export->exp_last_request_time : 0); + + /* + * If this is the first time a client connects, reset the recovery + * timer. Discard lightweight connections which might be local. + */ + if (!lw_client && rc == 0 && target->obd_recovering) + check_and_start_recovery_timer(target, req, export == NULL); + + /* + * We want to handle EALREADY but *not* -EALREADY from + * target_handle_reconnect(), return reconnection state in a flag. + */ + if (rc == EALREADY) { + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT); + rc = 0; + } else { + LASSERT(rc == 0); + } + + /* Tell the client if we support replayable requests. 
*/ + if (target->obd_replayable) + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_REPLAYABLE); + client_nid = &req->rq_peer.nid; + + if (export == NULL) { + /* allow lightweight connections during recovery */ + /* + * allow "new" MDT to be connected during recovery, since we + * need retrieve recovery update records from it + */ + if (target->obd_recovering && !lw_client && !mds_mds_conn) { + struct hrtimer *timer = &target->obd_recovery_timer; + ktime_t remaining; + s64 timeout, left; + int in_progress; + int connected; + int known; + int stale; + char *msg; + + connected = atomic_read(&target->obd_connected_clients); + in_progress = atomic_read(&target->obd_lock_replay_clients); + known = + atomic_read(&target->obd_max_recoverable_clients); + stale = target->obd_stale_clients; + remaining = hrtimer_get_remaining(timer); + left = ktime_divns(remaining, NSEC_PER_SEC); + + if (ktime_to_ns(remaining) > 0) { + msg = "to recover in"; + timeout = left; + } else { + msg = "already passed deadline"; + timeout = -left; + + target_check_recovery_timer(target); + } + + LCONSOLE_WARN("%s: Denying connection for new client %s (at %s), waiting for %d known clients (%d recovered, %d in progress, and %d evicted) %s %lld:%.02lld\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), known, + connected - in_progress, in_progress, + stale, msg, timeout / 60, timeout % 60); + rc = -EBUSY; + } else { +dont_check_exports: + rc = obd_connect(req->rq_svc_thread->t_env, + &export, target, &cluuid, data, + client_nid); + if (mds_conn && OBD_FAIL_CHECK(OBD_FAIL_TGT_RCVG_FLAG)) + lustre_msg_add_op_flags(req->rq_repmsg, + MSG_CONNECT_RECOVERING); + if (rc == 0) { + conn.cookie = export->exp_handle.h_cookie; + rc = rev_import_init(export); + } + + if (mds_mds_conn) + new_mds_mds_conn = true; + } + } else { + rc = obd_reconnect(req->rq_svc_thread->t_env, + export, target, &cluuid, data, client_nid); + } + if (rc) + GOTO(out, rc); + + LASSERT(target->u.obt.obt_magic == OBT_MAGIC); + data->ocd_instance = target->u.obt.obt_instance; + + /* + * Return only the parts of obd_connect_data that we understand, so the + * client knows that we don't understand the rest. + */ + if (data) { + tmpsize = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA, + RCL_SERVER); + tmpdata = req_capsule_server_get(&req->rq_pill, + &RMF_CONNECT_DATA); + /* + * Don't use struct assignment here, because the client reply + * buffer may be smaller/larger than the local struct + * obd_connect_data. + */ + memcpy(tmpdata, data, min(tmpsize, size)); + } + + /* + * If the client and the server are the same node, we will already + * have an export that really points to the client's DLM export, + * because we have a shared handles table. 
+ * + * XXX this will go away when shaver stops sending the "connect" handle + * in the real "remote handle" field of the request --phik 24 Apr 2003 + */ + ptlrpc_request_change_export(req, export); + + pcon = ptlrpc_connection_get(req->rq_peer, req->rq_self, &cluuid); + if (pcon == NULL) + GOTO(out, rc = -ENOTCONN); + + spin_lock(&export->exp_lock); + + if (export->exp_disconnected) { + spin_unlock(&export->exp_lock); + GOTO(out, rc = -ENODEV); + } + if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) { + spin_unlock(&export->exp_lock); + CDEBUG(D_RPCTRACE, + "%s: %s already connected at greater or equal conn_cnt: %d >= %d\n", + cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + export->exp_conn_cnt, + lustre_msg_get_conn_cnt(req->rq_reqmsg)); + + GOTO(out, rc = -EALREADY); + } + LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0); + export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); + + /* Check to see if connection came from another NID. */ + if (export->exp_connection != NULL && + lnet_nid_to_nid4(&export->exp_connection->c_peer.nid) != + req->rq_peer.nid) { + obd_nid_del(export->exp_obd, export); + ptlrpc_connection_put(export->exp_connection); + export->exp_connection = NULL; + } + + if (export->exp_connection == NULL) { + export->exp_connection = pcon; + pcon = NULL; + } + obd_nid_add(export->exp_obd, export); + + spin_unlock(&export->exp_lock); + + lustre_msg_set_handle(req->rq_repmsg, &conn); + + rc = rev_import_reconnect(export, req); + if (rc != 0) + GOTO(out, rc); + + if (target->obd_recovering && !export->exp_in_recovery && !lw_client) { + int has_transno; + __u64 transno = data->ocd_transno; + + spin_lock(&export->exp_lock); + /* + * possible race with class_disconnect_stale_exports, + * export may be already in the eviction process + */ + if (export->exp_failed) { + spin_unlock(&export->exp_lock); + GOTO(out, rc = -ENODEV); + } + export->exp_in_recovery = 1; + export->exp_req_replay_needed = 1; + export->exp_lock_replay_needed = 1; + spin_unlock(&export->exp_lock); + + has_transno = !!(lustre_msg_get_op_flags(req->rq_reqmsg) & + MSG_CONNECT_TRANSNO); + if (has_transno && transno == 0) + CWARN("Connect with zero transno!\n"); + + if (has_transno && transno > 0 && + transno < target->obd_next_recovery_transno && + transno > target->obd_last_committed) { + /* Another way is to use cmpxchg() to be lock-free. */ + spin_lock(&target->obd_recovery_task_lock); + if (transno < target->obd_next_recovery_transno) + target->obd_next_recovery_transno = transno; + spin_unlock(&target->obd_recovery_task_lock); + } + + atomic_inc(&target->obd_req_replay_clients); + atomic_inc(&target->obd_lock_replay_clients); + /* + * Note: MDS-MDS connection is allowed to be connected during + * recovery, no matter if the exports needs to be recoveried. + * Because we need retrieve updates logs from all other MDTs. + * So if the MDS-MDS export is new, obd_max_recoverable_clients + * also needs to be increased to match other recovery checking + * condition. + */ + if (new_mds_mds_conn) + atomic_inc(&target->obd_max_recoverable_clients); + + if (atomic_inc_return(&target->obd_connected_clients) == + atomic_read(&target->obd_max_recoverable_clients)) + wake_up(&target->obd_next_transno_waitq); + } + + /* Tell the client we're in recovery, when client is involved in it. 
*/ + if (target->obd_recovering && !lw_client) + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); + +out: + if (export) { + spin_lock(&export->exp_lock); + export->exp_connecting = 0; + spin_unlock(&export->exp_lock); + + class_export_put(export); + } + if (target != NULL) { + spin_lock(&target->obd_dev_lock); + target->obd_conn_inprogress--; + spin_unlock(&target->obd_dev_lock); + class_decref(target, "find", current); + } + if (pcon) + ptlrpc_connection_put(pcon); + req->rq_status = rc; + RETURN(rc); +} + +int target_handle_disconnect(struct ptlrpc_request *req) +{ + int rc; + + ENTRY; + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); + + /* In case of target disconnect, updating sec ctx immediately is + * required in order to record latest sequence number used. + * Sequence is normally updated on export destroy, but this event + * can occur too late, ie after a new target connect request has + * been processed. + * Maintaining correct sequence when client connection becomes idle + * ensures that GSS does not erroneously consider requests as replays. + */ + rc = sptlrpc_export_update_ctx(req->rq_export); + if (rc) + RETURN(rc); + + /* Keep the rq_export around so we can send the reply. */ + req->rq_status = obd_disconnect(class_export_get(req->rq_export)); + + RETURN(0); +} + +void target_destroy_export(struct obd_export *exp) +{ + struct obd_import *imp = NULL; + /* + * exports created from last_rcvd data, and "fake" + * exports created by lctl don't have an import + */ + spin_lock(&exp->exp_lock); + if (exp->exp_imp_reverse != NULL) { + imp = exp->exp_imp_reverse; + exp->exp_imp_reverse = NULL; + } + spin_unlock(&exp->exp_lock); + if (imp != NULL) + client_destroy_import(imp); + + LASSERT_ATOMIC_ZERO(&exp->exp_locks_count); + LASSERT_ATOMIC_ZERO(&exp->exp_rpc_count); + LASSERT_ATOMIC_ZERO(&exp->exp_cb_count); + LASSERT_ATOMIC_ZERO(&exp->exp_replay_count); +} +EXPORT_SYMBOL(target_destroy_export); + +/* + * Recovery functions + */ +static void target_request_copy_get(struct ptlrpc_request *req) +{ + class_export_rpc_inc(req->rq_export); + LASSERT(list_empty(&req->rq_list)); + INIT_LIST_HEAD(&req->rq_replay_list); + + /* Increase refcount to keep request in queue. */ + atomic_inc(&req->rq_refcount); + /* Let export know it has replays to be handled. */ + atomic_inc(&req->rq_export->exp_replay_count); +} + +static void target_request_copy_put(struct ptlrpc_request *req) +{ + LASSERT(list_empty(&req->rq_replay_list)); + LASSERT_ATOMIC_POS(&req->rq_export->exp_replay_count); + + atomic_dec(&req->rq_export->exp_replay_count); + class_export_rpc_dec(req->rq_export); + ptlrpc_server_drop_request(req); +} + +static int target_exp_enqueue_req_replay(struct ptlrpc_request *req) +{ + __u64 transno = lustre_msg_get_transno(req->rq_reqmsg); + struct obd_export *exp = req->rq_export; + struct ptlrpc_request *reqiter; + struct ptlrpc_request *dup_req = NULL; + int dup = 0; + + LASSERT(exp); + + spin_lock(&exp->exp_lock); + list_for_each_entry(reqiter, &exp->exp_req_replay_queue, + rq_replay_list) { + if (lustre_msg_get_transno(reqiter->rq_reqmsg) == transno) { + dup_req = reqiter; + dup = 1; + break; + } + } + + if (dup) { + /* We expect it with RESENT and REPLAY flags. 
*/ + if ((lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY)) != (MSG_RESENT | MSG_REPLAY)) + CERROR("invalid flags %x of resent replay\n", + lustre_msg_get_flags(req->rq_reqmsg)); + + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { + __u32 new_conn; + + new_conn = lustre_msg_get_conn_cnt(req->rq_reqmsg); + if (new_conn > + lustre_msg_get_conn_cnt(dup_req->rq_reqmsg)) + lustre_msg_set_conn_cnt(dup_req->rq_reqmsg, + new_conn); + } + } else { + list_add_tail(&req->rq_replay_list, + &exp->exp_req_replay_queue); + } + + spin_unlock(&exp->exp_lock); + return dup; +} + +static void target_exp_dequeue_req_replay(struct ptlrpc_request *req) +{ + LASSERT(!list_empty(&req->rq_replay_list)); + LASSERT(req->rq_export); + + spin_lock(&req->rq_export->exp_lock); + list_del_init(&req->rq_replay_list); + spin_unlock(&req->rq_export->exp_lock); +} + +static void target_finish_recovery(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + + ENTRY; + + /* Only log a recovery message when recovery has occurred. */ + if (obd->obd_recovery_start) { + time64_t now = ktime_get_seconds(); + time64_t elapsed_time; + + elapsed_time = max_t(time64_t, now - obd->obd_recovery_start, + 1); + LCONSOLE_INFO("%s: Recovery over after %lld:%.02lld, of %d clients %d recovered and %d %s evicted.\n", + obd->obd_name, elapsed_time / 60, + elapsed_time % 60, + atomic_read(&obd->obd_max_recoverable_clients), + atomic_read(&obd->obd_connected_clients), + obd->obd_stale_clients, + obd->obd_stale_clients == 1 ? "was" : "were"); + if (obd->obd_stale_clients && do_dump_on_eviction(obd)) + libcfs_debug_dumplog(); + } + + ldlm_reprocess_recovery_done(obd->obd_namespace); + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_req_replay_queue) || + !list_empty(&obd->obd_lock_replay_queue) || + !list_empty(&obd->obd_final_req_queue)) { + CERROR("%s: Recovery queues ( %s%s%s) are not empty\n", + obd->obd_name, + list_empty(&obd->obd_req_replay_queue) ? "" : "req ", + list_empty(&obd->obd_lock_replay_queue) ? \ + "" : "lock ", + list_empty(&obd->obd_final_req_queue) ? \ + "" : "final "); + spin_unlock(&obd->obd_recovery_task_lock); + LBUG(); + } + spin_unlock(&obd->obd_recovery_task_lock); + + obd->obd_recovery_end = ktime_get_seconds(); + + /* When recovery finished, cleanup orphans on MDS and OST. 
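+ * The cleanup itself is done by the obd type's o_postrecov method,
+ * invoked through OBP(obd, postrecov) below.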
*/ + if (obd->obd_type && OBP(obd, postrecov)) { + int rc = OBP(obd, postrecov)(obd); + + if (rc < 0) + LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", + obd->obd_name, rc); + } + EXIT; +} + +static void abort_req_replay_queue(struct obd_device *obd) +{ + struct ptlrpc_request *req, *n; + LIST_HEAD(abort_list); + + spin_lock(&obd->obd_recovery_task_lock); + list_splice_init(&obd->obd_req_replay_queue, &abort_list); + spin_unlock(&obd->obd_recovery_task_lock); + list_for_each_entry_safe(req, n, &abort_list, rq_list) { + DEBUG_REQ(D_WARNING, req, "aborted:"); + req->rq_status = -ENOTCONN; + if (ptlrpc_error(req)) { + DEBUG_REQ(D_ERROR, req, + "failed abort_req_reply; skipping"); + } + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + } +} + +static void abort_lock_replay_queue(struct obd_device *obd) +{ + struct ptlrpc_request *req, *n; + LIST_HEAD(abort_list); + + spin_lock(&obd->obd_recovery_task_lock); + list_splice_init(&obd->obd_lock_replay_queue, &abort_list); + spin_unlock(&obd->obd_recovery_task_lock); + list_for_each_entry_safe(req, n, &abort_list, rq_list) { + DEBUG_REQ(D_ERROR, req, "aborted:"); + req->rq_status = -ENOTCONN; + if (ptlrpc_error(req)) { + DEBUG_REQ(D_ERROR, req, + "failed abort_lock_reply; skipping"); + } + target_request_copy_put(req); + } +} + +/* + * Called from a cleanup function if the device is being cleaned up + * forcefully. The exports should all have been disconnected already, + * the only thing left to do is + * - clear the recovery flags + * - cancel the timer + * - free queued requests and replies, but don't send replies + * Because the obd_stopping flag is set, no new requests should be received. + */ +void target_cleanup_recovery(struct obd_device *obd) +{ + struct ptlrpc_request *req, *n; + LIST_HEAD(clean_list); + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_recovering) { + spin_unlock(&obd->obd_dev_lock); + EXIT; + return; + } + obd->obd_recovering = obd->obd_abort_recovery = 0; + obd->obd_abort_recov_mdt = 0; + spin_unlock(&obd->obd_dev_lock); + + spin_lock(&obd->obd_recovery_task_lock); + target_cancel_recovery_timer(obd); + list_splice_init(&obd->obd_req_replay_queue, &clean_list); + spin_unlock(&obd->obd_recovery_task_lock); + + list_for_each_entry_safe(req, n, &clean_list, rq_list) { + LASSERT(req->rq_reply_state == NULL); + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + } + + spin_lock(&obd->obd_recovery_task_lock); + list_splice_init(&obd->obd_lock_replay_queue, &clean_list); + list_splice_init(&obd->obd_final_req_queue, &clean_list); + spin_unlock(&obd->obd_recovery_task_lock); + + list_for_each_entry_safe(req, n, &clean_list, rq_list) { + LASSERT(req->rq_reply_state == NULL); + target_request_copy_put(req); + } + + EXIT; +} +EXPORT_SYMBOL(target_cleanup_recovery); + +/* obd_recovery_task_lock should be held */ +void target_cancel_recovery_timer(struct obd_device *obd) +{ + CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name); + hrtimer_cancel(&obd->obd_recovery_timer); +} + +static void target_start_recovery_timer(struct obd_device *obd) +{ + ktime_t delay; + + if (obd->obd_recovery_start != 0) + return; + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_recovering || obd->obd_abort_recovery) { + spin_unlock(&obd->obd_dev_lock); + return; + } + + LASSERT(obd->obd_recovery_timeout != 0); + + if (obd->obd_recovery_start != 0) { + spin_unlock(&obd->obd_dev_lock); + return; + } + + obd->obd_recovery_start = ktime_get_seconds(); + delay = ktime_set(obd->obd_recovery_start + + 
obd->obd_recovery_timeout, 0); + hrtimer_start(&obd->obd_recovery_timer, delay, HRTIMER_MODE_ABS); + spin_unlock(&obd->obd_dev_lock); + + LCONSOLE_WARN("%s: Will be in recovery for at least %u:%02u, or until %d client%s reconnect%s\n", + obd->obd_name, + obd->obd_recovery_timeout / 60, + obd->obd_recovery_timeout % 60, + atomic_read(&obd->obd_max_recoverable_clients), + (atomic_read(&obd->obd_max_recoverable_clients) == 1) ? + "" : "s", + (atomic_read(&obd->obd_max_recoverable_clients) == 1) ? + "s" : ""); +} + +/** + * extend recovery window. + * + * if @extend is true, extend recovery window to have @dr_timeout remaining + * at least; otherwise, make sure the recovery timeout value is not less + * than @dr_timeout. + */ +static void extend_recovery_timer(struct obd_device *obd, timeout_t dr_timeout, + bool extend) +{ + ktime_t left_ns; + timeout_t timeout; + timeout_t left; + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_recovering || obd->obd_abort_recovery || + obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return; + } + LASSERT(obd->obd_recovery_start != 0); + + left_ns = hrtimer_get_remaining(&obd->obd_recovery_timer); + left = ktime_divns(left_ns, NSEC_PER_SEC); + + if (extend) { + timeout = obd->obd_recovery_timeout; + /* dr_timeout will happen after the hrtimer has expired. + * Add the excess time to the soft recovery timeout without + * exceeding the hard recovery timeout. + */ + if (dr_timeout > left) { + timeout += dr_timeout - left; + timeout = min_t(timeout_t, obd->obd_recovery_time_hard, + timeout); + } + } else { + timeout = clamp_t(timeout_t, dr_timeout, + obd->obd_recovery_timeout, + obd->obd_recovery_time_hard); + } + + if (timeout == obd->obd_recovery_time_hard) + CWARN("%s: extended recovery timer reached hard limit: %d, extend: %d\n", + obd->obd_name, timeout, extend); + + if (obd->obd_recovery_timeout < timeout) { + ktime_t end, now; + + obd->obd_recovery_timeout = timeout; + end = ktime_set(obd->obd_recovery_start + timeout, 0); + now = ktime_set(ktime_get_seconds(), 0); + left_ns = ktime_sub(end, now); + hrtimer_start(&obd->obd_recovery_timer, end, HRTIMER_MODE_ABS); + left = ktime_divns(left_ns, NSEC_PER_SEC); + } + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_HA, "%s: recovery timer will expire in %d seconds\n", + obd->obd_name, left); +} + +/* Reset the timer with each new client connection */ +/* + * This timer is actually reconnect_timer, which is for making sure + * the total recovery window is at least as big as my reconnect + * attempt timing. So the initial recovery time_out will be set to + * OBD_RECOVERY_FACTOR * obd_timeout. If the timeout coming + * from client is bigger than this, then the recovery time_out will + * be extended to make sure the client could be reconnected, in the + * process, the timeout from the new client should be ignored. + */ +static void +check_and_start_recovery_timer(struct obd_device *obd, + struct ptlrpc_request *req, + int new_client) +{ + timeout_t service_timeout = lustre_msg_get_service_timeout(req->rq_reqmsg); + struct obd_device_target *obt = &obd->u.obt; + + if (!new_client && service_timeout) + /* + * Teach server about old server's estimates, as first guess + * at how long new requests will take. + */ + at_measured(&req->rq_rqbd->rqbd_svcpt->scp_at_estimate, + service_timeout); + + target_start_recovery_timer(obd); + + /* + * Convert the service time to RPC timeout, + * and reuse service_timeout to limit stack usage. 
+ */ + service_timeout = at_est2timeout(service_timeout); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_SLUGGISH_NET) && + service_timeout < at_extra) + service_timeout = at_extra; + + /* + * We expect other clients to timeout within service_timeout, then try + * to reconnect, then try the failover server. The max delay between + * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL. + */ + service_timeout += 2 * INITIAL_CONNECT_TIMEOUT; + + LASSERT(obt->obt_magic == OBT_MAGIC); + service_timeout += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC); + if (service_timeout > obd->obd_recovery_timeout && !new_client) + extend_recovery_timer(obd, service_timeout, false); +} + +/** Health checking routines */ +static inline int exp_connect_healthy(struct obd_export *exp) +{ + return exp->exp_in_recovery; +} + +/** if export done req_replay or has replay in queue */ +static inline int exp_req_replay_healthy(struct obd_export *exp) +{ + return (!exp->exp_req_replay_needed || + atomic_read(&exp->exp_replay_count) > 0); +} + + +static inline int exp_req_replay_healthy_or_from_mdt(struct obd_export *exp) +{ + return (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) || + exp_req_replay_healthy(exp); +} + +/** if export done lock_replay or has replay in queue */ +static inline int exp_lock_replay_healthy(struct obd_export *exp) +{ + return (!exp->exp_lock_replay_needed || + atomic_read(&exp->exp_replay_count) > 0); +} + +static inline int exp_vbr_healthy(struct obd_export *exp) +{ + return !exp->exp_vbr_failed; +} + +static inline int exp_finished(struct obd_export *exp) +{ + return exp->exp_in_recovery && !exp->exp_lock_replay_needed; +} + +static inline int exp_finished_or_from_mdt(struct obd_export *exp) +{ + return (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) || + exp_finished(exp); +} + +static int check_for_next_transno(struct lu_target *lut) +{ + struct ptlrpc_request *req = NULL; + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + int wake_up = 0, connected, completed, queue_len; + __u64 req_transno = 0; + __u64 update_transno = 0; + __u64 next_transno = 0; + + ENTRY; + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_req_replay_queue)) { + req = list_entry(obd->obd_req_replay_queue.next, + struct ptlrpc_request, rq_list); + req_transno = lustre_msg_get_transno(req->rq_reqmsg); + } + + if (!obd->obd_abort_recov_mdt && tdtd) + update_transno = distribute_txn_get_next_transno(tdtd); + + connected = atomic_read(&obd->obd_connected_clients); + completed = connected - atomic_read(&obd->obd_req_replay_clients); + queue_len = obd->obd_requests_queued_for_recovery; + next_transno = obd->obd_next_recovery_transno; + + CDEBUG(D_HA, + "max: %d, connected: %d, completed: %d, queue_len: %d, req_transno: %llu, next_transno: %llu\n", + atomic_read(&obd->obd_max_recoverable_clients), + connected, completed, + queue_len, req_transno, next_transno); + + if (obd->obd_abort_recovery) { + CDEBUG(D_HA, "waking for aborted recovery\n"); + wake_up = 1; + } else if (obd->obd_recovery_expired) { + CDEBUG(D_HA, "waking for expired recovery\n"); + wake_up = 1; + } else if (!obd->obd_abort_recov_mdt && tdtd && req && + is_req_replayed_by_update(req)) { + LASSERTF(req_transno < next_transno, + "req_transno %llu next_transno%llu\n", req_transno, + next_transno); + CDEBUG(D_HA, "waking for duplicate req (%llu)\n", + req_transno); + wake_up = 1; + } else if (req_transno == next_transno || + (update_transno != 0 && update_transno <= next_transno)) { + 
CDEBUG(D_HA, "waking for next (%lld)\n", next_transno); + wake_up = 1; + } else if (queue_len > 0 && + queue_len == atomic_read(&obd->obd_req_replay_clients)) { + /** handle gaps occured due to lost reply or VBR */ + LASSERTF(req_transno >= next_transno, + "req_transno: %llu, next_transno: %llu\n", + req_transno, next_transno); + CDEBUG(D_HA, + "%s: waking for gap in transno, VBR is %s (skip: %lld, ql: %d, comp: %d, conn: %d, next: %lld, next_update %lld last_committed: %lld)\n", + obd->obd_name, obd->obd_version_recov ? "ON" : "OFF", + next_transno, queue_len, completed, connected, + req_transno, update_transno, obd->obd_last_committed); + obd->obd_next_recovery_transno = req_transno; + wake_up = 1; + } else if (atomic_read(&obd->obd_req_replay_clients) == 0) { + CDEBUG(D_HA, "waking for completed recovery\n"); + wake_up = 1; + } else if (OBD_FAIL_CHECK(OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS)) { + CDEBUG(D_HA, + "accepting transno gaps is explicitly allowed by fail_lock, waking up (%lld)\n", + next_transno); + obd->obd_next_recovery_transno = req_transno; + wake_up = 1; + } + spin_unlock(&obd->obd_recovery_task_lock); + return wake_up; +} + +static int check_for_next_lock(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + int wake_up = 0; + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_lock_replay_queue)) { + CDEBUG(D_HA, "waking for next lock\n"); + wake_up = 1; + } else if (atomic_read(&obd->obd_lock_replay_clients) == 0) { + CDEBUG(D_HA, "waking for completed lock replay\n"); + wake_up = 1; + } else if (obd->obd_abort_recovery) { + CDEBUG(D_HA, "waking for aborted recovery\n"); + wake_up = 1; + } else if (obd->obd_recovery_expired) { + CDEBUG(D_HA, "waking for expired recovery\n"); + wake_up = 1; + } + spin_unlock(&obd->obd_recovery_task_lock); + + return wake_up; +} + +static int check_update_llog(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + + if (obd->obd_abort_recovery) { + CDEBUG(D_HA, "waking for aborted recovery\n"); + return 1; + } + + if (atomic_read(&tdtd->tdtd_recovery_threads_count) == 0) { + CDEBUG(D_HA, "waking for completion of reading update log\n"); + return 1; + } + + return 0; +} + +/** + * wait for recovery events, + * check its status with help of check_routine + * evict dead clients via health_check + */ +static int target_recovery_overseer(struct lu_target *lut, + int (*check_routine)(struct lu_target *), + int (*health_check)(struct obd_export *)) +{ + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd; + time64_t last = 0; + time64_t now; +repeat: + if (obd->obd_recovering && obd->obd_recovery_start == 0) { + now = ktime_get_seconds(); + if (now - last > 600) { + LCONSOLE_INFO("%s: in recovery but waiting for the first client to connect\n", + obd->obd_name); + last = now; + } + } + if (obd->obd_recovery_start != 0 && ktime_get_seconds() >= + (obd->obd_recovery_start + obd->obd_recovery_time_hard)) { + __u64 next_update_transno = 0; + + /* + * Only abort the recovery if there are no update recovery + * left in the queue + */ + spin_lock(&obd->obd_recovery_task_lock); + if (!obd->obd_abort_recov_mdt && lut->lut_tdtd) { + next_update_transno = + distribute_txn_get_next_transno(lut->lut_tdtd); + + tdtd = lut->lut_tdtd; + /* + * If next_update_transno == 0, it probably because + * updatelog retrieve threads did not get any records + * yet, let's wait those threads stopped + */ + if (next_update_transno == 0) { + 
spin_unlock(&obd->obd_recovery_task_lock); + + while (wait_event_timeout( + tdtd->tdtd_recovery_threads_waitq, + check_update_llog(lut), + cfs_time_seconds(60)) == 0); + + spin_lock(&obd->obd_recovery_task_lock); + next_update_transno = + distribute_txn_get_next_transno(tdtd); + } + } + + if (next_update_transno != 0 && !obd->obd_abort_recovery) { + obd->obd_next_recovery_transno = next_update_transno; + spin_unlock(&obd->obd_recovery_task_lock); + /* + * Disconnect unfinished exports from clients, and + * keep the connection from the MDT to make sure the update + * recovery will still keep trying until someone + * manually aborts the recovery + */ + class_disconnect_stale_exports(obd, + exp_finished_or_from_mdt); + /* Abort all replay & lock replay reqs from clients */ + abort_req_replay_queue(obd); + abort_lock_replay_queue(obd); + CDEBUG(D_HA, + "%s: there are still update replays (%#llx) in the queue.\n", + obd->obd_name, next_update_transno); + } else { + obd->obd_abort_recovery = 1; + spin_unlock(&obd->obd_recovery_task_lock); + CWARN("%s recovery is aborted by hard timeout\n", + obd->obd_name); + } + } + + while (wait_event_timeout(obd->obd_next_transno_waitq, + check_routine(lut), + cfs_time_seconds(60)) == 0) + ; /* wait indefinitely for event, but don't trigger watchdog */ + + if (obd->obd_abort_recovery) { + CWARN("recovery is aborted, evict exports in recovery\n"); + if (lut->lut_tdtd != NULL) { + tdtd = lut->lut_tdtd; + /* + * Let's wait for all of the update log recovery threads + * to finish + */ + wait_event_idle( + tdtd->tdtd_recovery_threads_waitq, + atomic_read(&tdtd->tdtd_recovery_threads_count) + == 0); + /* Then abort the update recovery list */ + dtrq_list_destroy(lut->lut_tdtd); + } + + /** evict exports which didn't finish recovery yet */ + class_disconnect_stale_exports(obd, exp_finished); + return 1; + } else if (obd->obd_recovery_expired) { + obd->obd_recovery_expired = 0; + + /** If some clients died while being recovered, evict them */ + LCONSOLE_WARN("%s: recovery is timed out, evict stale exports\n", + obd->obd_name); + /** evict exports with no replay in queue, they are stalled */ + class_disconnect_stale_exports(obd, health_check); + + /** continue with VBR */ + spin_lock(&obd->obd_dev_lock); + obd->obd_version_recov = 1; + spin_unlock(&obd->obd_dev_lock); + /** + * reset timer, recovery will proceed with versions now, + * timeout is set just to handle reconnection delays + */ + extend_recovery_timer(obd, RECONNECT_DELAY_MAX, true); + /** + * Wait for recovery events again, after evicting bad clients + */ + goto repeat; + } + return 0; +} + +static struct ptlrpc_request *target_next_replay_lock(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + struct ptlrpc_request *req = NULL; + + CDEBUG(D_HA, "Waiting for lock\n"); + if (target_recovery_overseer(lut, check_for_next_lock, + exp_lock_replay_healthy)) + abort_lock_replay_queue(obd); + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_lock_replay_queue)) { + req = list_entry(obd->obd_lock_replay_queue.next, + struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); + spin_unlock(&obd->obd_recovery_task_lock); + } else { + spin_unlock(&obd->obd_recovery_task_lock); + LASSERT(list_empty(&obd->obd_lock_replay_queue)); + LASSERT(atomic_read(&obd->obd_lock_replay_clients) == 0); + /** evict exports that failed VBR */ + class_disconnect_stale_exports(obd, exp_vbr_healthy); + } + return req; +} + +static struct ptlrpc_request *target_next_final_ping(struct obd_device *obd) +{ + struct 
ptlrpc_request *req = NULL; + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_final_req_queue)) { + req = list_entry(obd->obd_final_req_queue.next, + struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); + spin_unlock(&obd->obd_recovery_task_lock); + if (req->rq_export->exp_in_recovery) { + spin_lock(&req->rq_export->exp_lock); + req->rq_export->exp_in_recovery = 0; + spin_unlock(&req->rq_export->exp_lock); + } + } else { + spin_unlock(&obd->obd_recovery_task_lock); + } + return req; +} + +static void handle_recovery_req(struct ptlrpc_thread *thread, + struct ptlrpc_request *req, + svc_handler_t handler) +{ + ENTRY; + + /** + * The export can be evicted during recovery; no need to handle replays + * for it after that, so discard such requests silently + */ + if (req->rq_export->exp_disconnected) + RETURN_EXIT; + + req->rq_session.lc_thread = thread; + req->rq_svc_thread = thread; + req->rq_svc_thread->t_env->le_ses = &req->rq_session; + + /* thread context */ + lu_context_enter(&thread->t_env->le_ctx); + (void)handler(req); + lu_context_exit(&thread->t_env->le_ctx); + + req->rq_svc_thread->t_env->le_ses = NULL; + + /* don't reset timer for final stage */ + if (!exp_finished(req->rq_export)) { + timeout_t timeout = obd_timeout; + + /** + * Add the request @timeout to the recovery time so the next + * request from this client can arrive within the recovery window + */ + if (!AT_OFF) { + struct ptlrpc_service_part *svcpt; + timeout_t est_timeout; + + svcpt = req->rq_rqbd->rqbd_svcpt; + /* + * If the server sent an early reply for this request, + * the client will recalculate the timeout according to + * the current server estimate of service time, so we + * will use the maximum timeout here while waiting for + * the client to send the next req + */ + est_timeout = at_get(&svcpt->scp_at_estimate); + timeout = max_t(timeout_t, at_est2timeout(est_timeout), + lustre_msg_get_timeout(req->rq_reqmsg)); + /* + * Add 2 net_latency: one to balance rq_deadline + * (see ptl_send_rpc), one to resend the req to the server. + * Note: the client will pack net_latency in the replay req + * (see ptlrpc_replay_req) + */ + timeout += 2 * lustre_msg_get_service_timeout(req->rq_reqmsg); + } + extend_recovery_timer(class_exp2obd(req->rq_export), timeout, + true); + } + EXIT; +} + +/** Checking routines for recovery */ +static int check_for_recovery_ready(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + unsigned int clnts = atomic_read(&obd->obd_connected_clients); + + CDEBUG(D_HA, + "connected %d stale %d max_recoverable_clients %d abort %d expired %d\n", + clnts, obd->obd_stale_clients, + atomic_read(&obd->obd_max_recoverable_clients), + obd->obd_abort_recovery, obd->obd_recovery_expired); + + if (!obd->obd_abort_recovery && !obd->obd_recovery_expired) { + LASSERT(clnts <= + atomic_read(&obd->obd_max_recoverable_clients)); + if (clnts + obd->obd_stale_clients < + atomic_read(&obd->obd_max_recoverable_clients)) + return 0; + } + + if (!obd->obd_abort_recov_mdt && lut->lut_tdtd != NULL) { + if (!lut->lut_tdtd->tdtd_replay_ready && + !obd->obd_abort_recovery && !obd->obd_stopping) { + /* + * Let's extend the recovery timer, in case the recovery + * timer expired and some clients got evicted + */ + extend_recovery_timer(obd, obd->obd_recovery_timeout, + true); + CDEBUG(D_HA, + "%s update recovery is not ready, extend recovery %d\n", + obd->obd_name, obd->obd_recovery_timeout); + return 0; + } + } + + return 1; +} + +enum { + REQUEST_RECOVERY = 1, + UPDATE_RECOVERY = 2, +}; + +static __u64 
get_next_replay_req_transno(struct obd_device *obd) +{ + __u64 transno = 0; + + if (!list_empty(&obd->obd_req_replay_queue)) { + struct ptlrpc_request *req; + + req = list_entry(obd->obd_req_replay_queue.next, + struct ptlrpc_request, rq_list); + transno = lustre_msg_get_transno(req->rq_reqmsg); + } + + return transno; +} + +static __u64 get_next_transno(struct lu_target *lut, int *type) +{ + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + __u64 transno = 0; + __u64 update_transno; + + ENTRY; + + transno = get_next_replay_req_transno(obd); + if (type != NULL) + *type = REQUEST_RECOVERY; + + if (!tdtd || obd->obd_abort_recov_mdt) + RETURN(transno); + + update_transno = distribute_txn_get_next_transno(tdtd); + if (transno == 0 || (transno >= update_transno && + update_transno != 0)) { + transno = update_transno; + if (type != NULL) + *type = UPDATE_RECOVERY; + } + + RETURN(transno); +} + +/** + * drop duplicate replay request + * + * Because the operation has been replayed by update recovery, the request + * with the same transno will be dropped and also notify the client to send + * next replay request. + * + * \param[in] env execution environment + * \param[in] obd failover obd device + * \param[in] req request to be dropped + */ +static void drop_duplicate_replay_req(struct lu_env *env, + struct obd_device *obd, + struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, + "remove t%lld from %s because duplicate update records found", + lustre_msg_get_transno(req->rq_reqmsg), + libcfs_nid2str(req->rq_peer.nid)); + + /* + * Right now, only for MDS reint operation update replay and + * normal request replay can have the same transno + */ + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT) { + req_capsule_set(&req->rq_pill, &RQF_MDS_REINT); + req->rq_status = req_capsule_server_pack(&req->rq_pill); + if (likely(req->rq_export)) + target_committed_to_req(req); + lustre_msg_set_transno(req->rq_repmsg, req->rq_transno); + target_send_reply(req, req->rq_status, 0); + } else { + DEBUG_REQ(D_ERROR, req, "wrong opc from %s", + libcfs_nid2str(req->rq_peer.nid)); + } + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + obd->obd_replayed_requests++; +} + +#define WATCHDOG_TIMEOUT (obd_timeout * 10) + +static void replay_request_or_update(struct lu_env *env, + struct lu_target *lut, + struct target_recovery_data *trd, + struct ptlrpc_thread *thread) +{ + struct obd_device *obd = lut->lut_obd; + struct ptlrpc_request *req = NULL; + int type; + __u64 transno; + + ENTRY; + + CDEBUG(D_HA, "Waiting for transno %lld\n", + obd->obd_next_recovery_transno); + + /* Replay all of request and update by transno */ + do { + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + + CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val); + + /** + * It is needed to extend recovery window above + * recovery_time_soft. Extending is possible only in the + * end of recovery window (see more details in + * handle_recovery_req()). 
+ */ + CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300); + + if (target_recovery_overseer(lut, check_for_next_transno, + exp_req_replay_healthy_or_from_mdt)) { + abort_req_replay_queue(obd); + abort_lock_replay_queue(obd); + goto abort; + } + + spin_lock(&obd->obd_recovery_task_lock); + transno = get_next_transno(lut, &type); + if (type == REQUEST_RECOVERY && transno != 0) { + /* + * Drop replay request from client side, if the + * replay has been executed by update with the + * same transno + */ + req = list_entry(obd->obd_req_replay_queue.next, + struct ptlrpc_request, rq_list); + + list_del_init(&req->rq_list); + obd->obd_requests_queued_for_recovery--; + spin_unlock(&obd->obd_recovery_task_lock); + + /* + * Let's check if the request has been redone by + * update replay + */ + if (is_req_replayed_by_update(req)) { + struct distribute_txn_replay_req *dtrq; + + dtrq = distribute_txn_lookup_finish_list(tdtd, + transno); + LASSERT(dtrq != NULL); + spin_lock(&tdtd->tdtd_replay_list_lock); + list_del_init(&dtrq->dtrq_list); + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq_destroy(dtrq); + + drop_duplicate_replay_req(env, obd, req); + + continue; + } + + LASSERT(trd->trd_processing_task == current->pid); + DEBUG_REQ(D_HA, req, "processing x%llu t%lld from %s", + req->rq_xid, + lustre_msg_get_transno(req->rq_reqmsg), + libcfs_nid2str(req->rq_peer.nid)); + + ptlrpc_watchdog_init(&thread->t_watchdog, + WATCHDOG_TIMEOUT); + handle_recovery_req(thread, req, + trd->trd_recovery_handler); + ptlrpc_watchdog_disable(&thread->t_watchdog); + + /** + * bz18031: increase next_recovery_transno before + * target_request_copy_put() will drop exp_rpc reference + */ + spin_lock(&obd->obd_recovery_task_lock); + obd->obd_next_recovery_transno++; + spin_unlock(&obd->obd_recovery_task_lock); + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + obd->obd_replayed_requests++; + } else if (type == UPDATE_RECOVERY && transno != 0) { + struct distribute_txn_replay_req *dtrq; + int rc; + + spin_unlock(&obd->obd_recovery_task_lock); + + LASSERT(tdtd != NULL); + dtrq = distribute_txn_get_next_req(tdtd); + lu_context_enter(&thread->t_env->le_ctx); + ptlrpc_watchdog_init(&thread->t_watchdog, + WATCHDOG_TIMEOUT); + rc = tdtd->tdtd_replay_handler(env, tdtd, dtrq); + ptlrpc_watchdog_disable(&thread->t_watchdog); + lu_context_exit(&thread->t_env->le_ctx); + extend_recovery_timer(obd, obd_timeout, true); + + if (rc == 0 && dtrq->dtrq_xid != 0) { + CDEBUG(D_HA, + "Move x%llu t%llu to finish list\n", + dtrq->dtrq_xid, + dtrq->dtrq_master_transno); + + /* Add it to the replay finish list */ + spin_lock(&tdtd->tdtd_replay_list_lock); + list_add(&dtrq->dtrq_list, + &tdtd->tdtd_replay_finish_list); + spin_unlock(&tdtd->tdtd_replay_list_lock); + + spin_lock(&obd->obd_recovery_task_lock); + if (transno == obd->obd_next_recovery_transno) + obd->obd_next_recovery_transno++; + else if (transno > + obd->obd_next_recovery_transno) + obd->obd_next_recovery_transno = + transno + 1; + spin_unlock(&obd->obd_recovery_task_lock); + } else { + dtrq_destroy(dtrq); + } + } else { + spin_unlock(&obd->obd_recovery_task_lock); +abort: + LASSERT(list_empty(&obd->obd_req_replay_queue)); + LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0); + /** evict exports failed VBR */ + class_disconnect_stale_exports(obd, exp_vbr_healthy); + break; + } + } while (1); +} + +static int target_recovery_thread(void *arg) +{ + struct lu_target *lut = arg; + struct obd_device *obd = lut->lut_obd; + struct ptlrpc_request *req; + struct 
target_recovery_data *trd = &obd->obd_recovery_data; + unsigned long delta; + struct lu_env *env; + struct ptlrpc_thread *thread = NULL; + int rc = 0; + + ENTRY; + unshare_fs_struct(); + OBD_ALLOC_PTR(thread); + if (thread == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC_PTR(env); + if (env == NULL) + GOTO(out_thread, rc = -ENOMEM); + rc = lu_env_add(env); + if (rc) + GOTO(out_env, rc); + + rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD); + if (rc) + GOTO(out_env_remove, rc); + + thread->t_env = env; + thread->t_id = -1; /* force filter_iobuf_get/put to use local buffers */ + thread->t_task = current; + env->le_ctx.lc_thread = thread; + tgt_io_thread_init(thread); /* init thread_big_cache for IO requests */ + + CDEBUG(D_HA, "%s: started recovery thread pid %d\n", obd->obd_name, + current->pid); + trd->trd_processing_task = current->pid; + + spin_lock(&obd->obd_dev_lock); + obd->obd_recovering = 1; + spin_unlock(&obd->obd_dev_lock); + complete(&trd->trd_starting); + + /* first of all, we have to know the first transno to replay */ + if (target_recovery_overseer(lut, check_for_recovery_ready, + exp_connect_healthy)) { + abort_req_replay_queue(obd); + abort_lock_replay_queue(obd); + if (lut->lut_tdtd != NULL) + dtrq_list_destroy(lut->lut_tdtd); + } + + /* next stage: replay requests or update */ + delta = jiffies; + CDEBUG(D_INFO, "1: request replay stage - %d clients from t%llu\n", + atomic_read(&obd->obd_req_replay_clients), + obd->obd_next_recovery_transno); + replay_request_or_update(env, lut, trd, thread); + + /** + * The second stage: replay locks + */ + CDEBUG(D_INFO, "2: lock replay stage - %d clients\n", + atomic_read(&obd->obd_lock_replay_clients)); + while ((req = target_next_replay_lock(lut))) { + LASSERT(trd->trd_processing_task == current->pid); + DEBUG_REQ(D_HA, req, "processing lock from %s:", + libcfs_nid2str(req->rq_peer.nid)); + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_LOCK_REPLAY)) { + req->rq_status = -ENODEV; + target_request_copy_put(req); + continue; + } + handle_recovery_req(thread, req, + trd->trd_recovery_handler); + target_request_copy_put(req); + obd->obd_replayed_locks++; + } + + /** + * The third stage: reply to final pings; at this moment all clients + * must have a request in the final queue + */ + CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_RECONNECT, cfs_fail_val); + CDEBUG(D_INFO, "3: final stage - process recovery completion pings\n"); + /** Update server last boot epoch */ + tgt_boot_epoch_update(lut); + /* + * We drop the recovering flag so that all new requests are forwarded + * to the regular mds_handle() from now on + */ + spin_lock(&obd->obd_dev_lock); + obd->obd_recovering = obd->obd_abort_recovery = 0; + obd->obd_abort_recov_mdt = 0; + spin_unlock(&obd->obd_dev_lock); + spin_lock(&obd->obd_recovery_task_lock); + target_cancel_recovery_timer(obd); + spin_unlock(&obd->obd_recovery_task_lock); + while ((req = target_next_final_ping(obd))) { + LASSERT(trd->trd_processing_task == current->pid); + DEBUG_REQ(D_HA, req, "processing final ping from %s:", + libcfs_nid2str(req->rq_peer.nid)); + handle_recovery_req(thread, req, + trd->trd_recovery_handler); + /* + * Because the waiting client cannot send a ping to the server, + * we need to refresh the last_request_time to avoid the + * export being evicted + */ + ptlrpc_update_export_timer(req->rq_export, 0); + target_request_copy_put(req); + } + + delta = jiffies_to_msecs(jiffies - delta) / MSEC_PER_SEC; + CDEBUG(D_INFO, "4: recovery completed in %lus - %d/%d reqs/locks\n", + delta, obd->obd_replayed_requests, obd->obd_replayed_locks); + 
if (delta > OBD_RECOVERY_TIME_SOFT) { + CWARN("too long recovery - read logs\n"); + libcfs_debug_dumplog(); + } + + target_finish_recovery(lut); + lu_context_fini(&env->le_ctx); + trd->trd_processing_task = 0; + complete_all(&trd->trd_finishing); + tgt_io_thread_done(thread); +out_env_remove: + lu_env_remove(env); +out_env: + OBD_FREE_PTR(env); +out_thread: + OBD_FREE_PTR(thread); + RETURN(rc); +} + +static int target_start_recovery_thread(struct lu_target *lut, + svc_handler_t handler) +{ + struct obd_device *obd = lut->lut_obd; + int rc = 0; + struct target_recovery_data *trd = &obd->obd_recovery_data; + int index; + + memset(trd, 0, sizeof(*trd)); + init_completion(&trd->trd_starting); + init_completion(&trd->trd_finishing); + trd->trd_recovery_handler = handler; + + rc = server_name2index(obd->obd_name, &index, NULL); + if (rc < 0) + return rc; + + if (!IS_ERR(kthread_run(target_recovery_thread, + lut, "tgt_recover_%d", index))) { + wait_for_completion(&trd->trd_starting); + LASSERT(obd->obd_recovering != 0); + } else { + rc = -ECHILD; + } + + return rc; +} + +void target_stop_recovery_thread(struct obd_device *obd) +{ + if (obd->obd_recovery_data.trd_processing_task > 0) { + struct target_recovery_data *trd = &obd->obd_recovery_data; + /** recovery can be done but postrecovery is not yet */ + spin_lock(&obd->obd_dev_lock); + if (obd->obd_recovering) { + CERROR("%s: Aborting recovery\n", obd->obd_name); + obd->obd_abort_recovery = 1; + wake_up(&obd->obd_next_transno_waitq); + } + spin_unlock(&obd->obd_dev_lock); + wait_for_completion(&trd->trd_finishing); + } +} +EXPORT_SYMBOL(target_stop_recovery_thread); + +void target_recovery_fini(struct obd_device *obd) +{ + class_disconnect_exports(obd); + target_stop_recovery_thread(obd); + target_cleanup_recovery(obd); +} +EXPORT_SYMBOL(target_recovery_fini); + +static enum hrtimer_restart target_recovery_expired(struct hrtimer *timer) +{ + struct obd_device *obd = container_of(timer, struct obd_device, + obd_recovery_timer); + + CDEBUG(D_HA, + "%s: recovery timed out; %d clients are still in recovery after %llu seconds (%d clients connected)\n", + obd->obd_name, atomic_read(&obd->obd_lock_replay_clients), + ktime_get_seconds() - obd->obd_recovery_start, + atomic_read(&obd->obd_connected_clients)); + + obd->obd_recovery_expired = 1; + wake_up(&obd->obd_next_transno_waitq); + return HRTIMER_NORESTART; +} + +void target_recovery_init(struct lu_target *lut, svc_handler_t handler) +{ + struct obd_device *obd = lut->lut_obd; + + if (lut->lut_bottom->dd_rdonly) + return; + + if (atomic_read(&obd->obd_max_recoverable_clients) == 0) { + /** Update server last boot epoch */ + tgt_boot_epoch_update(lut); + return; + } + + CDEBUG(D_HA, "RECOVERY: service %s, %d recoverable clients, " + "last_transno %llu\n", obd->obd_name, + atomic_read(&obd->obd_max_recoverable_clients), + obd->obd_last_committed); + LASSERT(obd->obd_stopping == 0); + obd->obd_next_recovery_transno = obd->obd_last_committed + 1; + obd->obd_recovery_start = 0; + obd->obd_recovery_end = 0; + + hrtimer_init(&obd->obd_recovery_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + obd->obd_recovery_timer.function = &target_recovery_expired; + target_start_recovery_thread(lut, handler); +} +EXPORT_SYMBOL(target_recovery_init); + +static int target_process_req_flags(struct obd_device *obd, + struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + + LASSERT(exp != NULL); + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REQ_REPLAY_DONE) { + /* client declares he's ready to replay 
locks */ + spin_lock(&exp->exp_lock); + if (exp->exp_req_replay_needed) { + exp->exp_req_replay_needed = 0; + spin_unlock(&exp->exp_lock); + + LASSERT_ATOMIC_POS(&obd->obd_req_replay_clients); + atomic_dec(&obd->obd_req_replay_clients); + } else { + spin_unlock(&exp->exp_lock); + } + } + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) { + /* + * client declares he's ready to complete recovery + * so, we put the request on the final queue + */ + spin_lock(&exp->exp_lock); + if (exp->exp_lock_replay_needed) { + exp->exp_lock_replay_needed = 0; + spin_unlock(&exp->exp_lock); + + LASSERT_ATOMIC_POS(&obd->obd_lock_replay_clients); + atomic_dec(&obd->obd_lock_replay_clients); + } else { + spin_unlock(&exp->exp_lock); + } + } + return 0; +} + +int target_queue_recovery_request(struct ptlrpc_request *req, + struct obd_device *obd) +{ + __u64 transno = lustre_msg_get_transno(req->rq_reqmsg); + struct ptlrpc_request *reqiter; + int inserted = 0; + + ENTRY; + + if (obd->obd_recovery_data.trd_processing_task == current->pid) { + /* Processing the queue right now, don't re-add. */ + RETURN(1); + } + + target_process_req_flags(obd, req); + + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) { + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) { + if (cfs_fail_val == 1) { + cfs_race_state = 1; + cfs_fail_val = 0; + wake_up(&cfs_race_waitq); + + schedule_timeout_interruptible( + cfs_time_seconds(1)); + } + } + + /* + * client declares he's ready to complete recovery + * so, we put the request on the final queue + */ + target_request_copy_get(req); + DEBUG_REQ(D_HA, req, "queue final req"); + wake_up(&obd->obd_next_transno_waitq); + spin_lock(&obd->obd_recovery_task_lock); + if (obd->obd_recovering) { + struct ptlrpc_request *tmp; + struct ptlrpc_request *duplicate = NULL; + + if (likely(!req->rq_export->exp_replay_done)) { + req->rq_export->exp_replay_done = 1; + list_add_tail(&req->rq_list, + &obd->obd_final_req_queue); + spin_unlock(&obd->obd_recovery_task_lock); + RETURN(0); + } + + /* + * XXX O(n), but only happens if final ping is + * timed out, probably reorganize the list as + * a hash list later + */ + list_for_each_entry_safe(reqiter, tmp, + &obd->obd_final_req_queue, + rq_list) { + if (reqiter->rq_export == req->rq_export) { + list_del_init(&reqiter->rq_list); + duplicate = reqiter; + break; + } + } + + list_add_tail(&req->rq_list, + &obd->obd_final_req_queue); + req->rq_export->exp_replay_done = 1; + spin_unlock(&obd->obd_recovery_task_lock); + + if (duplicate != NULL) { + DEBUG_REQ(D_HA, duplicate, + "put prev final req"); + target_request_copy_put(duplicate); + } + RETURN(0); + } else { + spin_unlock(&obd->obd_recovery_task_lock); + target_request_copy_put(req); + RETURN(obd->obd_stopping ? 
-ENOTCONN : 1); + } + } + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REQ_REPLAY_DONE) { + /* client declares he's ready to replay locks */ + target_request_copy_get(req); + DEBUG_REQ(D_HA, req, "queue lock replay req"); + wake_up(&obd->obd_next_transno_waitq); + spin_lock(&obd->obd_recovery_task_lock); + LASSERT(obd->obd_recovering); + /* usually due to recovery abort */ + if (!req->rq_export->exp_in_recovery) { + spin_unlock(&obd->obd_recovery_task_lock); + target_request_copy_put(req); + RETURN(-ENOTCONN); + } + LASSERT(req->rq_export->exp_lock_replay_needed); + list_add_tail(&req->rq_list, &obd->obd_lock_replay_queue); + spin_unlock(&obd->obd_recovery_task_lock); + RETURN(0); + } + + /* + * CAVEAT EMPTOR: The incoming request message has been swabbed + * (i.e. buflens etc are in my own byte order), but type-dependent + * buffers (eg mdt_body, ost_body etc) have NOT been swabbed. + */ + + if (!transno) { + INIT_LIST_HEAD(&req->rq_list); + DEBUG_REQ(D_HA, req, "not queueing"); + RETURN(1); + } + + /* + * If we're processing the queue, we don't want to queue this + * message. + * + * Also, if this request has a transno less than the one we're waiting + * for, we should process it now. It could (and currently always will) + * be an open request for a descriptor that was opened some time ago. + * + * Also, a resent, replayed request that has already been + * handled will pass through here and be processed immediately. + */ + CDEBUG(D_HA, + "Next recovery transno: %llu, current: %llu, replaying\n", + obd->obd_next_recovery_transno, transno); + + /* + * If the request has been replayed by update replay, then send this + * request to the recovery thread (replay_request_or_update()), where + * it will be handled + */ + spin_lock(&obd->obd_recovery_task_lock); + if (transno < obd->obd_next_recovery_transno && + !is_req_replayed_by_update(req)) { + /* Processing the queue right now, don't re-add. 
*/ + LASSERT(list_empty(&req->rq_list)); + spin_unlock(&obd->obd_recovery_task_lock); + RETURN(1); + } + spin_unlock(&obd->obd_recovery_task_lock); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_DROP)) + RETURN(0); + + target_request_copy_get(req); + if (!req->rq_export->exp_in_recovery) { + target_request_copy_put(req); + RETURN(-ENOTCONN); + } + LASSERT(req->rq_export->exp_req_replay_needed); + + if (target_exp_enqueue_req_replay(req)) { + DEBUG_REQ(D_ERROR, req, "dropping resent queued req"); + target_request_copy_put(req); + RETURN(0); + } + + /* XXX O(n^2) */ + spin_lock(&obd->obd_recovery_task_lock); + LASSERT(obd->obd_recovering); + list_for_each_entry(reqiter, &obd->obd_req_replay_queue, rq_list) { + if (lustre_msg_get_transno(reqiter->rq_reqmsg) > transno) { + list_add_tail(&req->rq_list, &reqiter->rq_list); + inserted = 1; + goto added; + } + + if (unlikely(lustre_msg_get_transno(reqiter->rq_reqmsg) == + transno)) { + DEBUG_REQ(D_ERROR, req, + "dropping replay: transno has been claimed by another client"); + spin_unlock(&obd->obd_recovery_task_lock); + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + RETURN(0); + } + } +added: + if (!inserted) + list_add_tail(&req->rq_list, &obd->obd_req_replay_queue); + + obd->obd_requests_queued_for_recovery++; + spin_unlock(&obd->obd_recovery_task_lock); + wake_up(&obd->obd_next_transno_waitq); + RETURN(0); +} + +void target_committed_to_req(struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + + if (!exp->exp_obd->obd_no_transno && req->rq_repmsg != NULL) + lustre_msg_set_last_committed(req->rq_repmsg, + exp->exp_last_committed); + else + DEBUG_REQ(D_IOCTL, req, + "not sending last_committed update (%d/%d)", + exp->exp_obd->obd_no_transno, + req->rq_repmsg == NULL); + + CDEBUG(D_INFO, "last_committed %llu, transno %llu, xid %llu\n", + exp->exp_last_committed, req->rq_transno, req->rq_xid); +} + +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Packs current SLV and Limit into \a req. + */ +int target_pack_pool_reply(struct ptlrpc_request *req) +{ + struct obd_device *obd; + + ENTRY; + + /* + * Check that we still have all structures alive as this may + * be some late RPC at shutdown time. + */ + if (unlikely(!req->rq_export || !req->rq_export->exp_obd || + !exp_connect_lru_resize(req->rq_export))) { + lustre_msg_set_slv(req->rq_repmsg, 0); + lustre_msg_set_limit(req->rq_repmsg, 0); + RETURN(0); + } + + /* OBD is alive here as export is alive, which we checked above. 
*/ + obd = req->rq_export->exp_obd; + + read_lock(&obd->obd_pool_lock); + lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv); + lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); + + RETURN(0); +} + +static int target_send_reply_msg(struct ptlrpc_request *req, + int rc, int fail_id) +{ + if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) { + DEBUG_REQ(D_ERROR, req, "dropping reply"); + return -ECOMM; + } + /* + * We can have a null rq_reqmsg in the event of bad signature or + * no context when unwrapping + */ + if (req->rq_reqmsg && + unlikely(lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT && + OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_MULTI_NET_REP))) + return -ECOMM; + + if (unlikely(rc)) { + DEBUG_REQ(D_NET, req, "processing error (%d)", rc); + req->rq_status = rc; + return ptlrpc_send_error(req, 1); + } + DEBUG_REQ(D_NET, req, "sending reply"); + + return ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT); +} + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) +{ + struct ptlrpc_service_part *svcpt; + int netrc; + struct ptlrpc_reply_state *rs; + struct obd_export *exp; + + ENTRY; + + if (req->rq_no_reply) { + EXIT; + return; + } + + svcpt = req->rq_rqbd->rqbd_svcpt; + rs = req->rq_reply_state; + if (rs == NULL || !rs->rs_difficult) { + /* no notifiers */ + target_send_reply_msg(req, rc, fail_id); + EXIT; + return; + } + + /* must be an export if locks saved */ + LASSERT(req->rq_export != NULL); + /* req/reply consistent */ + LASSERT(rs->rs_svcpt == svcpt); + + /* "fresh" reply */ + LASSERT(!rs->rs_scheduled); + LASSERT(!rs->rs_scheduled_ever); + LASSERT(!rs->rs_handled); + LASSERT(!rs->rs_sent); + LASSERT(!rs->rs_unlinked); + LASSERT(rs->rs_export == NULL); + LASSERT(list_empty(&rs->rs_obd_list)); + LASSERT(list_empty(&rs->rs_exp_list)); + + exp = class_export_get(req->rq_export); + + /* disable reply scheduling while I'm setting up */ + rs->rs_scheduled = 1; + rs->rs_sent = 0; + rs->rs_unlinked = 0; + rs->rs_xid = req->rq_xid; + rs->rs_transno = req->rq_transno; + rs->rs_export = exp; + rs->rs_opc = lustre_msg_get_opc(req->rq_reqmsg); + + spin_lock(&exp->exp_uncommitted_replies_lock); + CDEBUG(D_NET, "rs transno = %llu, last committed = %llu\n", + rs->rs_transno, exp->exp_last_committed); + if (rs->rs_transno > exp->exp_last_committed) { + /* not committed already */ + list_add_tail(&rs->rs_obd_list, + &exp->exp_uncommitted_replies); + } + spin_unlock(&exp->exp_uncommitted_replies_lock); + + spin_lock(&exp->exp_lock); + list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies); + spin_unlock(&exp->exp_lock); + + netrc = target_send_reply_msg(req, rc, fail_id); + + spin_lock(&svcpt->scp_rep_lock); + + atomic_inc(&svcpt->scp_nreps_difficult); + + if (netrc != 0) { + /* + * error sending: reply is off the net. 
Also we need +1 + * reply ref until ptlrpc_handle_rs() is done + * with the reply state (if the send was successful, there + * would have been +1 ref for the net, which + * reply_out_callback leaves alone) + */ + rs->rs_sent = 1; + rs->rs_unlinked = 1; + ptlrpc_rs_addref(rs); + } + + spin_lock(&rs->rs_lock); + if (rs->rs_transno <= exp->exp_last_committed || + (rs->rs_unlinked && !rs->rs_no_ack) || + list_empty(&rs->rs_exp_list) || /* completed already */ + list_empty(&rs->rs_obd_list)) { + CDEBUG(D_HA, "Schedule reply immediately\n"); + ptlrpc_dispatch_difficult_reply(rs); + } else { + list_add(&rs->rs_list, &svcpt->scp_rep_active); + rs->rs_scheduled = 0; /* allow notifier to schedule */ + } + spin_unlock(&rs->rs_lock); + spin_unlock(&svcpt->scp_rep_lock); + EXIT; +} + +enum ldlm_mode lck_compat_array[] = { + [LCK_EX] = LCK_COMPAT_EX, + [LCK_PW] = LCK_COMPAT_PW, + [LCK_PR] = LCK_COMPAT_PR, + [LCK_CW] = LCK_COMPAT_CW, + [LCK_CR] = LCK_COMPAT_CR, + [LCK_NL] = LCK_COMPAT_NL, + [LCK_GROUP] = LCK_COMPAT_GROUP, + [LCK_COS] = LCK_COMPAT_COS, +}; + +/** + * Rather arbitrary mapping from LDLM error codes to errno values. This should + * not escape to the user level. + */ +int ldlm_error2errno(enum ldlm_error error) +{ + int result; + + switch (error) { + case ELDLM_OK: + case ELDLM_LOCK_MATCHED: + result = 0; + break; + case ELDLM_LOCK_CHANGED: + result = -ESTALE; + break; + case ELDLM_LOCK_ABORTED: + result = -ENAVAIL; + break; + case ELDLM_LOCK_REPLACED: + result = -ESRCH; + break; + case ELDLM_NO_LOCK_DATA: + result = -ENOENT; + break; + case ELDLM_NAMESPACE_EXISTS: + result = -EEXIST; + break; + case ELDLM_BAD_NAMESPACE: + result = -EBADF; + break; + default: + if (((int)error) < 0) { /* cast to signed type */ + result = error; /* as ldlm_error can be unsigned */ + } else { + CERROR("Invalid DLM result code: %d\n", error); + result = -EPROTO; + } + } + return result; +} +EXPORT_SYMBOL(ldlm_error2errno); + +/** + * Dual to ldlm_error2errno(): maps errno values back to enum ldlm_error. + */ +enum ldlm_error ldlm_errno2error(int err_no) +{ + int error; + + switch (err_no) { + case 0: + error = ELDLM_OK; + break; + case -ESTALE: + error = ELDLM_LOCK_CHANGED; + break; + case -ENAVAIL: + error = ELDLM_LOCK_ABORTED; + break; + case -ESRCH: + error = ELDLM_LOCK_REPLACED; + break; + case -ENOENT: + error = ELDLM_NO_LOCK_DATA; + break; + case -EEXIST: + error = ELDLM_NAMESPACE_EXISTS; + break; + case -EBADF: + error = ELDLM_BAD_NAMESPACE; + break; + default: + error = err_no; + } + return error; +} + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp) +{ + spin_lock(&exp->exp_locks_list_guard); + if (!list_empty(&exp->exp_locks_list)) { + struct ldlm_lock *lock; + + CERROR("dumping locks for export %p, ignore if the unmount doesn't hang\n", + exp); + list_for_each_entry(lock, &exp->exp_locks_list, + l_exp_refs_link) + LDLM_ERROR(lock, "lock:"); + } + spin_unlock(&exp->exp_locks_list_guard); +} +#endif + +#ifdef HAVE_SERVER_SUPPORT +static inline const char *bulk2type(struct ptlrpc_request *req) +{ + if (req->rq_bulk_read) + return "READ"; + if (req->rq_bulk_write) + return "WRITE"; + return "UNKNOWN"; +} + +int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_request *req = desc->bd_req; + time64_t start = ktime_get_seconds(); + time64_t deadline; + int rc = 0; + + ENTRY; + + /* If there is eviction in progress, wait for it to finish. 
*/ + wait_event_idle( + exp->exp_obd->obd_evict_inprogress_waitq, + !atomic_read(&exp->exp_obd->obd_evict_inprogress)); + + /* Check if client was evicted or reconnected already. */ + if (exp->exp_failed || + exp->exp_conn_cnt > lustre_msg_get_conn_cnt(req->rq_reqmsg)) { + rc = -ENOTCONN; + } else { + if (req->rq_bulk_read) + rc = sptlrpc_svc_wrap_bulk(req, desc); + + if (OCD_HAS_FLAG(&exp->exp_connect_data, BULK_MBITS)) + req->rq_mbits = lustre_msg_get_mbits(req->rq_reqmsg); + else /* old version, bulk matchbits is rq_xid */ + req->rq_mbits = req->rq_xid; + + if (rc == 0) + rc = ptlrpc_start_bulk_transfer(desc); + } + + if (rc < 0) { + DEBUG_REQ(D_ERROR, req, "bulk %s failed: rc = %d", + bulk2type(req), rc); + RETURN(rc); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) { + ptlrpc_abort_bulk(desc); + RETURN(0); + } + + /* limit actual bulk transfer to bulk_timeout seconds */ + deadline = start + bulk_timeout; + if (deadline > req->rq_deadline) + deadline = req->rq_deadline; + + do { + time64_t timeoutl = deadline - ktime_get_seconds(); + time64_t rq_deadline; + + while (timeoutl >= 0 && + wait_event_idle_timeout( + desc->bd_waitq, + !ptlrpc_server_bulk_active(desc) || + exp->exp_failed || + exp->exp_conn_cnt > + lustre_msg_get_conn_cnt(req->rq_reqmsg), + timeoutl ? cfs_time_seconds(1) : 1) == 0) + timeoutl -= 1; + rc = timeoutl < 0 ? -ETIMEDOUT : 0; + + /* Wait again if we changed rq_deadline. */ + rq_deadline = READ_ONCE(req->rq_deadline); + deadline = start + bulk_timeout; + if (deadline > rq_deadline) + deadline = rq_deadline; + } while (rc == -ETIMEDOUT && + deadline > ktime_get_seconds()); + + if (rc == -ETIMEDOUT) { + DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %lld%+llds", + bulk2type(req), deadline - start, + ktime_get_real_seconds() - deadline); + ptlrpc_abort_bulk(desc); + } else if (exp->exp_failed) { + DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s", + bulk2type(req)); + rc = -ENOTCONN; + ptlrpc_abort_bulk(desc); + } else if (exp->exp_conn_cnt > + lustre_msg_get_conn_cnt(req->rq_reqmsg)) { + DEBUG_REQ(D_ERROR, req, "Reconnect on bulk %s", + bulk2type(req)); + /* We don't reply anyway. */ + rc = -ETIMEDOUT; + ptlrpc_abort_bulk(desc); + } else if (desc->bd_failure) { + DEBUG_REQ(D_ERROR, req, "network error on bulk %s", + bulk2type(req)); + /* XXX should this be a different errno? */ + rc = -ETIMEDOUT; + } else { + if (req->rq_bulk_write) + rc = sptlrpc_svc_unwrap_bulk(req, desc); + if (rc == 0 && desc->bd_nob_transferred != desc->bd_nob) { + DEBUG_REQ(D_ERROR, req, "truncated bulk %s %d(%d)", + bulk2type(req), desc->bd_nob_transferred, + desc->bd_nob); + /* XXX should this be a different errno? */ + rc = -ETIMEDOUT; + } + } + + RETURN(rc); +} +EXPORT_SYMBOL(target_bulk_io); + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c new file mode 100644 index 0000000000000..3ceadc9086a97 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c @@ -0,0 +1,2898 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/ldlm_lock.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include <libcfs/libcfs.h> + +#include <lustre_swab.h> +#include <obd_class.h> + +#include "ldlm_internal.h" + +struct kmem_cache *ldlm_glimpse_work_kmem; +EXPORT_SYMBOL(ldlm_glimpse_work_kmem); + +/* lock types */ +char *ldlm_lockname[] = { + [0] = "--", + [LCK_EX] = "EX", + [LCK_PW] = "PW", + [LCK_PR] = "PR", + [LCK_CW] = "CW", + [LCK_CR] = "CR", + [LCK_NL] = "NL", + [LCK_GROUP] = "GROUP", + [LCK_COS] = "COS" +}; +EXPORT_SYMBOL(ldlm_lockname); + +char *ldlm_typename[] = { + [LDLM_PLAIN] = "PLN", + [LDLM_EXTENT] = "EXT", + [LDLM_FLOCK] = "FLK", + [LDLM_IBITS] = "IBT", +}; + +static ldlm_policy_wire_to_local_t ldlm_policy_wire_to_local[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_wire_to_local, + [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local, + [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_wire_to_local, + [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_wire_to_local, +}; + +static ldlm_policy_local_to_wire_t ldlm_policy_local_to_wire[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_local_to_wire, + [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_local_to_wire, + [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_local_to_wire, + [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_local_to_wire, +}; + +/** + * Converts lock policy from local format to on the wire lock_desc format + */ +void ldlm_convert_policy_to_wire(enum ldlm_type type, + const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + ldlm_policy_local_to_wire_t convert; + + convert = ldlm_policy_local_to_wire[type - LDLM_MIN_TYPE]; + + convert(lpolicy, wpolicy); +} + +/** + * Converts lock policy from on the wire lock_desc format to local format + */ +void ldlm_convert_policy_to_local(struct obd_export *exp, enum ldlm_type type, + const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + ldlm_policy_wire_to_local_t convert; + + convert = ldlm_policy_wire_to_local[type - LDLM_MIN_TYPE]; + + convert(wpolicy, lpolicy); +} + +const char *ldlm_it2str(enum ldlm_intent_flags it) +{ + switch (it) { + case IT_OPEN: + return "open"; + case IT_CREAT: + return "creat"; + case (IT_OPEN | IT_CREAT): + return "open|creat"; + case IT_READDIR: + return "readdir"; + case IT_GETATTR: + return "getattr"; + case IT_LOOKUP: + return "lookup"; + case IT_GETXATTR: + return "getxattr"; + case IT_LAYOUT: + return "layout"; + default: + CERROR("Unknown intent 0x%08x\n", it); + return "UNKNOWN"; + } +} +EXPORT_SYMBOL(ldlm_it2str); + +#ifdef HAVE_SERVER_SUPPORT +static ldlm_processing_policy ldlm_processing_policy_table[] = { + [LDLM_PLAIN] = ldlm_process_plain_lock, + [LDLM_EXTENT] = ldlm_process_extent_lock, + [LDLM_FLOCK] = ldlm_process_flock_lock, + [LDLM_IBITS] = ldlm_process_inodebits_lock, +}; + +ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res) +{ + return 
ldlm_processing_policy_table[res->lr_type]; +} +EXPORT_SYMBOL(ldlm_get_processing_policy); + +static ldlm_reprocessing_policy ldlm_reprocessing_policy_table[] = { + [LDLM_PLAIN] = ldlm_reprocess_queue, + [LDLM_EXTENT] = ldlm_reprocess_queue, + [LDLM_FLOCK] = ldlm_reprocess_queue, + [LDLM_IBITS] = ldlm_reprocess_inodebits_queue, +}; + +ldlm_reprocessing_policy ldlm_get_reprocessing_policy(struct ldlm_resource *res) +{ + return ldlm_reprocessing_policy_table[res->lr_type]; +} + +#endif /* HAVE_SERVER_SUPPORT */ + +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg) +{ + ns->ns_policy = arg; +} +EXPORT_SYMBOL(ldlm_register_intent); + +/* + * REFCOUNTED LOCK OBJECTS + */ + + +/** + * Get a reference on a lock. + * + * Lock refcounts, during creation: + * - one special one for allocation, dec'd only once in destroy + * - one for being a lock that's in-use + * - one for the addref associated with a new lock + */ +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock) +{ + refcount_inc(&lock->l_handle.h_ref); + return lock; +} +EXPORT_SYMBOL(ldlm_lock_get); + +static void lock_handle_free(struct rcu_head *rcu) +{ + struct ldlm_lock *lock = container_of(rcu, struct ldlm_lock, + l_handle.h_rcu); + + OBD_FREE_PRE(lock, sizeof(*lock), "slab-freed"); + kmem_cache_free(ldlm_lock_slab, lock); +} + +/** + * Release lock reference. + * + * Also frees the lock if it was last reference. + */ +void ldlm_lock_put(struct ldlm_lock *lock) +{ + ENTRY; + + LASSERT(lock->l_resource != LP_POISON); + LASSERT(refcount_read(&lock->l_handle.h_ref) > 0); + if (refcount_dec_and_test(&lock->l_handle.h_ref)) { + struct ldlm_resource *res; + + LDLM_DEBUG(lock, + "final lock_put on destroyed lock, freeing it."); + + res = lock->l_resource; + LASSERT(ldlm_is_destroyed(lock)); + LASSERT(list_empty(&lock->l_exp_list)); + LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_pending_chain)); + + lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats, + LDLM_NSS_LOCKS); + lu_ref_del(&res->lr_reference, "lock", lock); + if (lock->l_export) { + class_export_lock_put(lock->l_export, lock); + lock->l_export = NULL; + } + + if (lock->l_lvb_data != NULL) + OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len); + + if (res->lr_type == LDLM_EXTENT) { + ldlm_interval_free(ldlm_interval_detach(lock)); + } else if (res->lr_type == LDLM_IBITS) { + if (lock->l_ibits_node != NULL) + OBD_SLAB_FREE_PTR(lock->l_ibits_node, + ldlm_inodebits_slab); + } + ldlm_resource_putref(res); + lock->l_resource = NULL; + lu_ref_fini(&lock->l_reference); + call_rcu(&lock->l_handle.h_rcu, lock_handle_free); + } + + EXIT; +} +EXPORT_SYMBOL(ldlm_lock_put); + +/** + * Removes LDLM lock \a lock from LRU. Assumes LRU is already locked. + */ +int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock) +{ + int rc = 0; + if (!list_empty(&lock->l_lru)) { + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + if (ns->ns_last_pos == &lock->l_lru) + ns->ns_last_pos = lock->l_lru.prev; + list_del_init(&lock->l_lru); + LASSERT(ns->ns_nr_unused > 0); + ns->ns_nr_unused--; + rc = 1; + } + return rc; +} + +/** + * Removes LDLM lock \a lock from LRU. Obtains the LRU lock first. + * + * If \a last_use is non-zero, it will remove the lock from LRU only if + * it matches lock's l_last_used. + * + * \retval 0 if \a last_use is set, the lock is not in LRU list or \a last_use + * doesn't match lock's l_last_used; + * otherwise, the lock hasn't been in the LRU list. 
+ * \retval 1 the lock was in LRU list and removed. + */ +int ldlm_lock_remove_from_lru_check(struct ldlm_lock *lock, ktime_t last_use) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + int rc = 0; + + ENTRY; + if (ldlm_is_ns_srv(lock)) { + LASSERT(list_empty(&lock->l_lru)); + RETURN(0); + } + + spin_lock(&ns->ns_lock); + if (!ktime_compare(last_use, ktime_set(0, 0)) || + !ktime_compare(last_use, lock->l_last_used)) + rc = ldlm_lock_remove_from_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + + RETURN(rc); +} + +/** + * Adds LDLM lock \a lock to namespace LRU. Assumes LRU is already locked. + */ +void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + lock->l_last_used = ktime_get(); + LASSERT(list_empty(&lock->l_lru)); + LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + list_add_tail(&lock->l_lru, &ns->ns_unused_list); + LASSERT(ns->ns_nr_unused >= 0); + ns->ns_nr_unused++; +} + +/** + * Adds LDLM lock \a lock to namespace LRU. Obtains necessary LRU locks + * first. + */ +void ldlm_lock_add_to_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + ENTRY; + spin_lock(&ns->ns_lock); + ldlm_lock_add_to_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + EXIT; +} + +/** + * Moves LDLM lock \a lock that is already in namespace LRU to the tail of + * the LRU. Performs necessary LRU locking + */ +void ldlm_lock_touch_in_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + ENTRY; + if (ldlm_is_ns_srv(lock)) { + LASSERT(list_empty(&lock->l_lru)); + EXIT; + return; + } + + spin_lock(&ns->ns_lock); + if (!list_empty(&lock->l_lru)) { + ldlm_lock_remove_from_lru_nolock(lock); + ldlm_lock_add_to_lru_nolock(lock); + } + spin_unlock(&ns->ns_lock); + EXIT; +} + +/** + * Helper to destroy a locked lock. + * + * Used by ldlm_lock_destroy and ldlm_lock_destroy_nolock + * Must be called with l_lock and lr_lock held. + * + * Does not actually free the lock data, but rather marks the lock as + * destroyed by setting l_destroyed field in the lock to 1. Destroys a + * handle->lock association too, so that the lock can no longer be found + * and removes the lock from LRU list. Actual lock freeing occurs when + * last lock reference goes away. + * + * Original comment (of some historical value): + * This used to have a 'strict' flag, which recovery would use to mark an + * in-use lock as needing-to-die. Lest I am ever tempted to put it back, I + * shall explain why it's gone: with the new hash table scheme, once you call + * ldlm_lock_destroy, you can never drop your final references on this lock. + * Because it's not in the hash table anymore. -phil + */ +static int ldlm_lock_destroy_internal(struct ldlm_lock *lock) +{ + ENTRY; + + if (lock->l_readers || lock->l_writers) { + LDLM_ERROR(lock, "lock still has references"); + LBUG(); + } + + if (!list_empty(&lock->l_res_link)) { + LDLM_ERROR(lock, "lock still on resource"); + LBUG(); + } + + if (ldlm_is_destroyed(lock)) { + LASSERT(list_empty(&lock->l_lru)); + EXIT; + return 0; + } + ldlm_set_destroyed(lock); + + if (lock->l_export && lock->l_export->exp_lock_hash) { + /* NB: it's safe to call cfs_hash_del() even lock isn't + * in exp_lock_hash. 
*/ + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_del(lock->l_export->exp_lock_hash, + &lock->l_remote_handle, &lock->l_exp_hash); + } + + ldlm_lock_remove_from_lru(lock); + class_handle_unhash(&lock->l_handle); + + EXIT; + return 1; +} + +/** + * Destroys a LDLM lock \a lock. Performs necessary locking first. + */ +void ldlm_lock_destroy(struct ldlm_lock *lock) +{ + int first; + ENTRY; + lock_res_and_lock(lock); + first = ldlm_lock_destroy_internal(lock); + unlock_res_and_lock(lock); + + /* drop reference from hashtable only for first destroy */ + if (first) { + lu_ref_del(&lock->l_reference, "hash", lock); + LDLM_LOCK_RELEASE(lock); + } + EXIT; +} + +/** + * Destroys a LDLM lock \a lock that is already locked. + */ +void ldlm_lock_destroy_nolock(struct ldlm_lock *lock) +{ + int first; + ENTRY; + first = ldlm_lock_destroy_internal(lock); + /* drop reference from hashtable only for first destroy */ + if (first) { + lu_ref_del(&lock->l_reference, "hash", lock); + LDLM_LOCK_RELEASE(lock); + } + EXIT; +} + +static const char lock_handle_owner[] = "ldlm"; + +/** + * + * Allocate and initialize new lock structure. + * + * usage: pass in a resource on which you have done ldlm_resource_get + * new lock will take over the refcount. + * returns: lock with refcount 2 - one for current caller and one for remote + */ +static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource) +{ + struct ldlm_lock *lock; + ENTRY; + + if (resource == NULL) + LBUG(); + + OBD_SLAB_ALLOC_PTR_GFP(lock, ldlm_lock_slab, GFP_NOFS); + if (lock == NULL) + RETURN(NULL); + + RCU_INIT_POINTER(lock->l_resource, resource); + lu_ref_add(&resource->lr_reference, "lock", lock); + + refcount_set(&lock->l_handle.h_ref, 2); + INIT_LIST_HEAD(&lock->l_res_link); + INIT_LIST_HEAD(&lock->l_lru); + INIT_LIST_HEAD(&lock->l_pending_chain); + INIT_LIST_HEAD(&lock->l_bl_ast); + INIT_LIST_HEAD(&lock->l_cp_ast); + INIT_LIST_HEAD(&lock->l_rk_ast); + init_waitqueue_head(&lock->l_waitq); + lock->l_blocking_lock = NULL; + INIT_LIST_HEAD(&lock->l_sl_mode); + INIT_LIST_HEAD(&lock->l_sl_policy); + INIT_HLIST_NODE(&lock->l_exp_hash); + INIT_HLIST_NODE(&lock->l_exp_flock_hash); + + lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats, + LDLM_NSS_LOCKS); + INIT_HLIST_NODE(&lock->l_handle.h_link); + class_handle_hash(&lock->l_handle, lock_handle_owner); + + lu_ref_init(&lock->l_reference); + lu_ref_add(&lock->l_reference, "hash", lock); + lock->l_callback_timestamp = 0; + lock->l_activity = 0; + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&lock->l_exp_refs_link); + lock->l_exp_refs_nr = 0; + lock->l_exp_refs_target = NULL; +#endif + INIT_LIST_HEAD(&lock->l_exp_list); + + RETURN(lock); +} + +/** + * Moves LDLM lock \a lock to another resource. 
+ * This is used on client when server returns some other lock than requested + * (typically as a result of intent operation) + */ +int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, + const struct ldlm_res_id *new_resid) +{ + struct ldlm_resource *oldres; + struct ldlm_resource *newres; + int type; + ENTRY; + + LASSERT(ns_is_client(ns)); + + oldres = lock_res_and_lock(lock); + if (memcmp(new_resid, &oldres->lr_name, + sizeof(oldres->lr_name)) == 0) { + /* Nothing to do */ + unlock_res_and_lock(lock); + RETURN(0); + } + + LASSERT(new_resid->name[0] != 0); + + /* This function assumes that the lock isn't on any lists */ + LASSERT(list_empty(&lock->l_res_link)); + + type = oldres->lr_type; + unlock_res_and_lock(lock); + + newres = ldlm_resource_get(ns, NULL, new_resid, type, 1); + if (IS_ERR(newres)) + RETURN(PTR_ERR(newres)); + + lu_ref_add(&newres->lr_reference, "lock", lock); + /* + * To flip the lock from the old to the new resource, oldres + * and newres have to be locked. Resource spin-locks are taken + * in the memory address order to avoid dead-locks. + * As this is the only circumstance where ->l_resource + * can change, and this cannot race with itself, it is safe + * to access lock->l_resource without being careful about locking. + */ + oldres = lock->l_resource; + if (oldres < newres) { + lock_res(oldres); + lock_res_nested(newres, LRT_NEW); + } else { + lock_res(newres); + lock_res_nested(oldres, LRT_NEW); + } + LASSERT(memcmp(new_resid, &oldres->lr_name, + sizeof oldres->lr_name) != 0); + rcu_assign_pointer(lock->l_resource, newres); + unlock_res(oldres); + unlock_res(newres); + + /* ...and the flowers are still standing! */ + lu_ref_del(&oldres->lr_reference, "lock", lock); + ldlm_resource_putref(oldres); + + RETURN(0); +} + +/** \defgroup ldlm_handles LDLM HANDLES + * Ways to get hold of locks without any addresses. + * @{ + */ + +/** + * Fills in handle for LDLM lock \a lock into supplied \a lockh + * Does not take any references. + */ +void ldlm_lock2handle(const struct ldlm_lock *lock, struct lustre_handle *lockh) +{ + lockh->cookie = lock->l_handle.h_cookie; +} +EXPORT_SYMBOL(ldlm_lock2handle); + +/** + * Obtain a lock reference by handle. + * + * if \a flags: atomically get the lock and set the flags. + * Return NULL if flag already set + */ +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle, + __u64 flags) +{ + struct ldlm_lock *lock; + ENTRY; + + LASSERT(handle); + + if (!lustre_handle_is_used(handle)) + RETURN(NULL); + + lock = class_handle2object(handle->cookie, lock_handle_owner); + if (lock == NULL) + RETURN(NULL); + + if (lock->l_export != NULL && lock->l_export->exp_failed) { + CDEBUG(D_INFO, "lock export failed: lock %p, exp %p\n", + lock, lock->l_export); + LDLM_LOCK_PUT(lock); + RETURN(NULL); + } + + /* It's unlikely but possible that someone marked the lock as + * destroyed after we did handle2object on it */ + if ((flags == 0) && !ldlm_is_destroyed(lock)) { + lu_ref_add_atomic(&lock->l_reference, "handle", lock); + RETURN(lock); + } + + lock_res_and_lock(lock); + + LASSERT(lock->l_resource != NULL); + + lu_ref_add_atomic(&lock->l_reference, "handle", lock); + if (unlikely(ldlm_is_destroyed(lock))) { + unlock_res_and_lock(lock); + CDEBUG(D_INFO, "lock already destroyed: lock %p\n", lock); + LDLM_LOCK_PUT(lock); + RETURN(NULL); + } + + /* If we're setting flags, make sure none of them are already set. 
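[Editor's illustration] ldlm_lock_change_resource() above avoids deadlock by always taking the old and new resource locks in memory-address order. A compact standalone sketch of that ordering discipline is below; pthread mutexes stand in for lr_lock and every name is hypothetical, so this is only the shape of the rule, not the Lustre implementation.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct res {
	pthread_mutex_t lock;
	const char *name;
};

/* always acquire the pair in address order, so two threads flipping a
 * lock between the same two resources can never lock them in opposite
 * orders and deadlock */
static void lock_pair(struct res *a, struct res *b)
{
	if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void unlock_pair(struct res *a, struct res *b)
{
	pthread_mutex_unlock(&a->lock);
	pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	static struct res oldres = { PTHREAD_MUTEX_INITIALIZER, "old" };
	static struct res newres = { PTHREAD_MUTEX_INITIALIZER, "new" };

	lock_pair(&oldres, &newres);
	printf("moved lock from %s to %s resource\n",
	       oldres.name, newres.name);
	unlock_pair(&oldres, &newres);
	return 0;
}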
*/ + if (flags != 0) { + if ((lock->l_flags & flags) != 0) { + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + RETURN(NULL); + } + + lock->l_flags |= flags; + } + + unlock_res_and_lock(lock); + RETURN(lock); +} +EXPORT_SYMBOL(__ldlm_handle2lock); +/** @} ldlm_handles */ + +/** + * Fill in "on the wire" representation for given LDLM lock into supplied + * lock descriptor \a desc structure. + */ +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc) +{ + ldlm_res2desc(lock->l_resource, &desc->l_resource); + desc->l_req_mode = lock->l_req_mode; + desc->l_granted_mode = lock->l_granted_mode; + ldlm_convert_policy_to_wire(lock->l_resource->lr_type, + &lock->l_policy_data, + &desc->l_policy_data); +} + +/** + * Add a lock to list of conflicting locks to send AST to. + * + * Only add if we have not sent a blocking AST to the lock yet. + */ +static void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list) +{ + if (!ldlm_is_ast_sent(lock)) { + LDLM_DEBUG(lock, "lock incompatible; sending blocking AST."); + ldlm_set_ast_sent(lock); + /* If the enqueuing client said so, tell the AST recipient to + * discard dirty data, rather than writing back. */ + if (ldlm_is_ast_discard_data(new)) + ldlm_set_discard_data(lock); + + /* Lock can be converted from a blocking state back to granted + * after lock convert or COS downgrade but still be in an + * older bl_list because it is controlled only by + * ldlm_work_bl_ast_lock(), let it be processed there. + */ + if (list_empty(&lock->l_bl_ast)) { + list_add(&lock->l_bl_ast, work_list); + LDLM_LOCK_GET(lock); + } + LASSERT(lock->l_blocking_lock == NULL); + lock->l_blocking_lock = LDLM_LOCK_GET(new); + } +} + +/** + * Add a lock to list of just granted locks to send completion AST to. + */ +static void ldlm_add_cp_work_item(struct ldlm_lock *lock, + struct list_head *work_list) +{ + if (!ldlm_is_cp_reqd(lock)) { + ldlm_set_cp_reqd(lock); + LDLM_DEBUG(lock, "lock granted; sending completion AST."); + LASSERT(list_empty(&lock->l_cp_ast)); + list_add(&lock->l_cp_ast, work_list); + LDLM_LOCK_GET(lock); + } +} + +/** + * Aggregator function to add AST work items into a list. Determines + * what sort of an AST work needs to be done and calls the proper + * adding function. + * Must be called with lr_lock held. + */ +void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list) +{ + ENTRY; + check_res_locked(lock->l_resource); + if (new) + ldlm_add_bl_work_item(lock, new, work_list); + else + ldlm_add_cp_work_item(lock, work_list); + EXIT; +} + +/** + * Add specified reader/writer reference to LDLM lock with handle \a lockh. + * r/w reference type is determined by \a mode + * Calls ldlm_lock_addref_internal. + */ +void ldlm_lock_addref(const struct lustre_handle *lockh, enum ldlm_mode mode) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(lockh); + LASSERTF(lock != NULL, "Non-existing lock: %#llx\n", lockh->cookie); + ldlm_lock_addref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_addref); + +/** + * Helper function. + * Add specified reader/writer reference to LDLM lock \a lock. + * r/w reference type is determined by \a mode + * Removes lock from LRU if it is there. + * Assumes the LDLM lock is already locked. 
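[Editor's illustration] The flags argument of __ldlm_handle2lock() gives callers an atomic "look the lock up and set these flags, but fail if any of them are already set" primitive, serialized by the resource lock. A standalone sketch of that test-and-set-under-a-lock idiom follows, with a pthread mutex standing in for lock_res_and_lock() and invented names throughout.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct flagged {
	pthread_mutex_t lock;
	uint64_t flags;
};

/* returns false (and changes nothing) if any requested flag was
 * already set; otherwise sets all of them while holding the lock */
static bool try_set_flags(struct flagged *f, uint64_t want)
{
	bool ok;

	pthread_mutex_lock(&f->lock);
	ok = (f->flags & want) == 0;
	if (ok)
		f->flags |= want;
	pthread_mutex_unlock(&f->lock);
	return ok;
}

int main(void)
{
	static struct flagged f = { PTHREAD_MUTEX_INITIALIZER, 0 };

	printf("first claim: %d\n", try_set_flags(&f, 0x1));	/* 1 */
	printf("second claim: %d\n", try_set_flags(&f, 0x1));	/* 0 */
	return 0;
}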
+ */ +void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, + enum ldlm_mode mode) +{ + ldlm_lock_remove_from_lru(lock); + if (mode & (LCK_NL | LCK_CR | LCK_PR)) { + lock->l_readers++; + lu_ref_add_atomic(&lock->l_reference, "reader", lock); + } + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { + lock->l_writers++; + lu_ref_add_atomic(&lock->l_reference, "writer", lock); + } + LDLM_LOCK_GET(lock); + lu_ref_add_atomic(&lock->l_reference, "user", lock); + LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]); +} + +/** + * Attempts to add reader/writer reference to a lock with handle \a lockh, and + * fails if lock is already LDLM_FL_CBPENDING or destroyed. + * + * \retval 0 success, lock was addref-ed + * + * \retval -EAGAIN lock is being canceled. + */ +int ldlm_lock_addref_try(const struct lustre_handle *lockh, enum ldlm_mode mode) +{ + struct ldlm_lock *lock; + int result; + + result = -EAGAIN; + lock = ldlm_handle2lock(lockh); + if (lock != NULL) { + lock_res_and_lock(lock); + if (lock->l_readers != 0 || lock->l_writers != 0 || + !ldlm_is_cbpending(lock)) { + ldlm_lock_addref_internal_nolock(lock, mode); + result = 0; + } + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + return result; +} +EXPORT_SYMBOL(ldlm_lock_addref_try); + +/** + * Add specified reader/writer reference to LDLM lock \a lock. + * Locks LDLM lock and calls ldlm_lock_addref_internal_nolock to do the work. + * Only called for local locks. + */ +void ldlm_lock_addref_internal(struct ldlm_lock *lock, enum ldlm_mode mode) +{ + lock_res_and_lock(lock); + ldlm_lock_addref_internal_nolock(lock, mode); + unlock_res_and_lock(lock); +} + +/** + * Removes reader/writer reference for LDLM lock \a lock. + * Assumes LDLM lock is already locked. + * only called in ldlm_flock_destroy and for local locks. + * Does NOT add lock to LRU if no r/w references left to accomodate flock locks + * that cannot be placed in LRU. + */ +void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, + enum ldlm_mode mode) +{ + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + if (mode & (LCK_NL | LCK_CR | LCK_PR)) { + LASSERT(lock->l_readers > 0); + lu_ref_del(&lock->l_reference, "reader", lock); + lock->l_readers--; + } + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { + LASSERT(lock->l_writers > 0); + lu_ref_del(&lock->l_reference, "writer", lock); + lock->l_writers--; + } + + lu_ref_del(&lock->l_reference, "user", lock); + LDLM_LOCK_RELEASE(lock); /* matches the LDLM_LOCK_GET() in addref */ +} + +/** + * Removes reader/writer reference for LDLM lock \a lock. + * Locks LDLM lock first. + * If the lock is determined to be client lock on a client and r/w refcount + * drops to zero and the lock is not blocked, the lock is added to LRU lock + * on the namespace. + * For blocked LDLM locks if r/w count drops to zero, blocking_ast is called. + */ +void ldlm_lock_decref_internal(struct ldlm_lock *lock, enum ldlm_mode mode) +{ + struct ldlm_namespace *ns; + + ENTRY; + + lock_res_and_lock(lock); + + ns = ldlm_lock_to_ns(lock); + + ldlm_lock_decref_internal_nolock(lock, mode); + + if ((ldlm_is_local(lock) || lock->l_req_mode == LCK_GROUP) && + !lock->l_readers && !lock->l_writers) { + /* If this is a local lock on a server namespace and this was + * the last reference, cancel the lock. + * + * Group locks are special: + * They must not go in LRU, but they are not called back + * like non-group locks, instead they are manually released. 
+ * They have an l_writers reference which they keep until + * they are manually released, so we remove them when they have + * no more reader or writer references. - LU-6368 */ + ldlm_set_cbpending(lock); + } + + if (!lock->l_readers && !lock->l_writers && ldlm_is_cbpending(lock)) { + unsigned int mask = D_DLMTRACE; + + /* If we received a blocked AST and this was the last reference, + * run the callback. */ + if (ldlm_is_ns_srv(lock) && lock->l_export) + mask |= D_WARNING; + LDLM_DEBUG_LIMIT(mask, lock, + "final decref done on %sCBPENDING lock", + mask & D_WARNING ? "non-local " : ""); + + LDLM_LOCK_GET(lock); /* dropped by bl thread */ + ldlm_lock_remove_from_lru(lock); + unlock_res_and_lock(lock); + + if (ldlm_is_fail_loc(lock)) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + if (ldlm_is_atomic_cb(lock) || + ldlm_bl_to_thread_lock(ns, NULL, lock) != 0) + ldlm_handle_bl_callback(ns, NULL, lock); + } else if (ns_is_client(ns) && + !lock->l_readers && !lock->l_writers && + !ldlm_is_no_lru(lock) && + !ldlm_is_bl_ast(lock) && + !ldlm_is_converting(lock)) { + + /* If this is a client-side namespace and this was the last + * reference, put it on the LRU. + */ + ldlm_lock_add_to_lru(lock); + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "add lock into lru list"); + + if (ldlm_is_fail_loc(lock)) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + ldlm_pool_recalc(&ns->ns_pool, true); + } else { + LDLM_DEBUG(lock, "do not add lock into lru list"); + unlock_res_and_lock(lock); + } + + EXIT; +} + +/** + * Decrease reader/writer refcount for LDLM lock with handle \a lockh + */ +void ldlm_lock_decref(const struct lustre_handle *lockh, enum ldlm_mode mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + LASSERTF(lock != NULL, "Non-existing lock: %#llx\n", lockh->cookie); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_decref); + +/** + * Decrease reader/writer refcount for LDLM lock with handle + * \a lockh and mark it for subsequent cancellation once r/w refcount + * drops to zero instead of putting into LRU. + * + */ +void ldlm_lock_decref_and_cancel(const struct lustre_handle *lockh, + enum ldlm_mode mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + ENTRY; + + LASSERT(lock != NULL); + + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + lock_res_and_lock(lock); + ldlm_set_cbpending(lock); + unlock_res_and_lock(lock); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_decref_and_cancel); + +struct sl_insert_point { + struct list_head *res_link; + struct list_head *mode_link; + struct list_head *policy_link; +}; + +/** + * Finds a position to insert the new lock into granted lock list. + * + * Used for locks eligible for skiplist optimization. 
+ * + * Parameters: + * queue [input]: the granted list where search acts on; + * req [input]: the lock whose position to be located; + * prev [output]: positions within 3 lists to insert @req to + * Return Value: + * filled @prev + * NOTE: called by + * - ldlm_grant_lock_with_skiplist + */ +static void search_granted_lock(struct list_head *queue, + struct ldlm_lock *req, + struct sl_insert_point *prev) +{ + struct list_head *tmp; + struct ldlm_lock *lock, *mode_end, *policy_end; + ENTRY; + + list_for_each(tmp, queue) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + mode_end = list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, l_sl_mode); + + if (lock->l_req_mode != req->l_req_mode) { + /* jump to last lock of mode group */ + tmp = &mode_end->l_res_link; + continue; + } + + /* suitable mode group is found */ + if (lock->l_resource->lr_type == LDLM_PLAIN) { + /* insert point is last lock of the mode group */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; + return; + } else if (lock->l_resource->lr_type == LDLM_IBITS) { + for (;;) { + policy_end = + list_entry(lock->l_sl_policy.prev, + struct ldlm_lock, + l_sl_policy); + + if (lock->l_policy_data.l_inodebits.bits == + req->l_policy_data.l_inodebits.bits) { + /* insert point is last lock of + * the policy group */ + prev->res_link = + &policy_end->l_res_link; + prev->mode_link = + &policy_end->l_sl_mode; + prev->policy_link = + &policy_end->l_sl_policy; + EXIT; + return; + } + + if (policy_end == mode_end) + /* done with mode group */ + break; + + /* go to next policy group within mode group */ + tmp = policy_end->l_res_link.next; + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + } /* loop over policy groups within the mode group */ + + /* insert point is last lock of the mode group, + * new policy group is started */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; + return; + } else { + LDLM_ERROR(lock,"is not LDLM_PLAIN or LDLM_IBITS lock"); + LBUG(); + } + } + + /* insert point is last lock on the queue, + * new mode group and new policy group are started */ + prev->res_link = queue->prev; + prev->mode_link = &req->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; +} + +/** + * Add a lock into resource granted list after a position described by + * \a prev. + */ +static void ldlm_granted_list_add_lock(struct ldlm_lock *lock, + struct sl_insert_point *prev) +{ + struct ldlm_resource *res = lock->l_resource; + ENTRY; + + check_res_locked(res); + + ldlm_resource_dump(D_INFO, res); + LDLM_DEBUG(lock, "About to add lock:"); + + if (ldlm_is_destroyed(lock)) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + + LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_sl_mode)); + LASSERT(list_empty(&lock->l_sl_policy)); + + /* + * lock->link == prev->link means lock is first starting the group. + * Don't re-add to itself to suppress kernel warnings. + */ + if (&lock->l_res_link != prev->res_link) + list_add(&lock->l_res_link, prev->res_link); + if (&lock->l_sl_mode != prev->mode_link) + list_add(&lock->l_sl_mode, prev->mode_link); + if (&lock->l_sl_policy != prev->policy_link) + list_add(&lock->l_sl_policy, prev->policy_link); + + EXIT; +} + +/** + * Add a lock to granted list on a resource maintaining skiplist + * correctness. 
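[Editor's illustration] search_granted_lock() never visits the granted list lock by lock: on the first lock of a mode group it jumps straight to the last member through l_sl_mode, so the scan cost scales with the number of groups rather than the number of granted locks. The array-based sketch below shows only that skipping idea; the structures and names are invented and much simpler than the real skiplists.

#include <stdio.h>

/* one granted lock, with the index of the last member of its mode
 * group precomputed, standing in for the l_sl_mode links */
struct granted {
	int mode;
	int group_end;	/* index of the last lock with the same mode */
};

/* return the index after which a lock of req_mode should be inserted:
 * the end of the matching mode group, or the end of the list */
static int find_insert_pos(const struct granted *q, int n, int req_mode)
{
	int i = 0;

	while (i < n) {
		if (q[i].mode == req_mode)
			return q[i].group_end;	/* end of our group */
		i = q[i].group_end + 1;		/* skip the whole group */
	}
	return n - 1;				/* new group at the tail */
}

int main(void)
{
	/* three locks of mode 1, then two locks of mode 2 */
	struct granted q[] = {
		{ 1, 2 }, { 1, 2 }, { 1, 2 },
		{ 2, 4 }, { 2, 4 },
	};

	printf("insert mode 2 after index %d\n",
	       find_insert_pos(q, 5, 2));	/* -> 4 */
	printf("insert mode 3 after index %d\n",
	       find_insert_pos(q, 5, 3));	/* -> 4, starts new group */
	return 0;
}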
+ */ +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) +{ + struct sl_insert_point prev; + + LASSERT(ldlm_is_granted(lock)); + + search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); + ldlm_granted_list_add_lock(lock, &prev); +} + +/** + * Perform lock granting bookkeeping. + * + * Includes putting the lock into granted list and updating lock mode. + * NOTE: called by + * - ldlm_lock_enqueue + * - ldlm_reprocess_queue + * + * must be called with lr_lock held + */ +void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + ENTRY; + + check_res_locked(res); + + lock->l_granted_mode = lock->l_req_mode; + + if (work_list && lock->l_completion_ast != NULL) + ldlm_add_ast_work_item(lock, NULL, work_list); + + if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) + ldlm_grant_lock_with_skiplist(lock); + else if (res->lr_type == LDLM_EXTENT) + ldlm_extent_add_lock(res, lock); + else if (res->lr_type == LDLM_FLOCK) { + /* We should not add locks to granted list in the following + * cases: + * - this is an UNLOCK but not a real lock; + * - this is a TEST lock; + * - this is a F_CANCELLK lock (async flock has req_mode == 0) + * - this is a deadlock (flock cannot be granted) */ + if (lock->l_req_mode == 0 || + lock->l_req_mode == LCK_NL || + ldlm_is_test_lock(lock) || + ldlm_is_flock_deadlock(lock)) + RETURN_EXIT; + ldlm_resource_add_lock(res, &res->lr_granted, lock); + } else { + LBUG(); + } + + ldlm_pool_add(&ldlm_res_to_ns(res)->ns_pool, lock); + EXIT; +} + +/** + * Check if the given @lock meets the criteria for a match. + * A reference on the lock is taken if matched. + * + * @lock test-against this lock + * @data parameters + * + * RETURN returns true if @lock matches @data, false otherwise + */ +static bool lock_matches(struct ldlm_lock *lock, struct ldlm_match_data *data) +{ + union ldlm_policy_data *lpol = &lock->l_policy_data; + enum ldlm_mode match = LCK_MINMODE; + + if (lock == data->lmd_old) + return true; + + /* Check if this lock can be matched. + * Used by LU-2919(exclusive open) for open lease lock */ + if (ldlm_is_excl(lock)) + return false; + + /* llite sometimes wants to match locks that will be + * canceled when their users drop, but we allow it to match + * if it passes in CBPENDING and the lock still has users. + * this is generally only going to be used by children + * whose parents already hold a lock so forward progress + * can still happen. */ + if (ldlm_is_cbpending(lock) && + !(data->lmd_flags & LDLM_FL_CBPENDING)) + return false; + + if (!(data->lmd_match & LDLM_MATCH_UNREF) && ldlm_is_cbpending(lock) && + lock->l_readers == 0 && lock->l_writers == 0) + return false; + + if (!(lock->l_req_mode & *data->lmd_mode)) + return false; + + /* When we search for ast_data, we are not doing a traditional match, + * so we don't worry about IBITS or extent matching. 
+ */ + if (data->lmd_match & (LDLM_MATCH_AST | LDLM_MATCH_AST_ANY)) { + if (!lock->l_ast_data) + return false; + + if (data->lmd_match & LDLM_MATCH_AST_ANY) + goto matched; + } + + match = lock->l_req_mode; + + switch (lock->l_resource->lr_type) { + case LDLM_EXTENT: + if (!(data->lmd_match & LDLM_MATCH_RIGHT) && + (lpol->l_extent.start > data->lmd_policy->l_extent.start || + lpol->l_extent.end < data->lmd_policy->l_extent.end)) + return false; + + if (unlikely(match == LCK_GROUP) && + data->lmd_policy->l_extent.gid != LDLM_GID_ANY && + lpol->l_extent.gid != data->lmd_policy->l_extent.gid) + return false; + break; + case LDLM_IBITS: + /* We match if we have existing lock with same or wider set + of bits. */ + if ((lpol->l_inodebits.bits & + data->lmd_policy->l_inodebits.bits) != + data->lmd_policy->l_inodebits.bits) + return false; + + if (unlikely(match == LCK_GROUP) && + data->lmd_policy->l_inodebits.li_gid != LDLM_GID_ANY && + lpol->l_inodebits.li_gid != + data->lmd_policy->l_inodebits.li_gid) + return false; + break; + default: + ; + } + + /* We match if we have existing lock with same or wider set + of bits. */ + if (!(data->lmd_match & LDLM_MATCH_UNREF) && LDLM_HAVE_MASK(lock, GONE)) + return false; + + if (!equi(data->lmd_flags & LDLM_FL_LOCAL_ONLY, ldlm_is_local(lock))) + return false; + + /* Filter locks by skipping flags */ + if (data->lmd_skip_flags & lock->l_flags) + return false; + +matched: + if (data->lmd_flags & LDLM_FL_TEST_LOCK) { + LDLM_LOCK_GET(lock); + ldlm_lock_touch_in_lru(lock); + } else { + ldlm_lock_addref_internal_nolock(lock, match); + } + + *data->lmd_mode = match; + data->lmd_lock = lock; + + return true; +} + +static unsigned int itree_overlap_cb(struct interval_node *in, void *args) +{ + struct ldlm_interval *node = to_ldlm_interval(in); + struct ldlm_match_data *data = args; + struct ldlm_lock *lock; + + list_for_each_entry(lock, &node->li_group, l_sl_policy) { + if (lock_matches(lock, data)) + return INTERVAL_ITER_STOP; + } + return INTERVAL_ITER_CONT; +} + +/** + * Search for a lock with given parameters in interval trees. + * + * \param res search for a lock in this resource + * \param data parameters + * + * \retval a referenced lock or NULL. + */ +struct ldlm_lock *search_itree(struct ldlm_resource *res, + struct ldlm_match_data *data) +{ + struct interval_node_extent ext = { + .start = data->lmd_policy->l_extent.start, + .end = data->lmd_policy->l_extent.end + }; + int idx; + + data->lmd_lock = NULL; + + if (data->lmd_match & LDLM_MATCH_RIGHT) + ext.end = OBD_OBJECT_EOF; + + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + struct ldlm_interval_tree *tree = &res->lr_itree[idx]; + + if (tree->lit_root == NULL) + continue; + + if (!(tree->lit_mode & *data->lmd_mode)) + continue; + + interval_search(tree->lit_root, &ext, + itree_overlap_cb, data); + if (data->lmd_lock) + return data->lmd_lock; + } + + return NULL; +} +EXPORT_SYMBOL(search_itree); + + +/** + * Search for a lock with given properties in a queue. + * + * \param queue search for a lock in this queue + * \param data parameters + * + * \retval a referenced lock or NULL. 
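[Editor's illustration] Two checks in lock_matches() are easy to misread: the extent case requires the existing lock to fully cover the requested range, and the IBITS case requires the existing bit set to be the same as or wider than the requested bits. Both reduce to small predicates like the hedged sketch below; the helper names are made up.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* existing extent must fully contain the requested range */
static bool extent_covers(uint64_t have_start, uint64_t have_end,
			  uint64_t want_start, uint64_t want_end)
{
	return have_start <= want_start && have_end >= want_end;
}

/* existing inodebits must be the same or a wider set than requested */
static bool ibits_cover(uint64_t have_bits, uint64_t want_bits)
{
	return (have_bits & want_bits) == want_bits;
}

int main(void)
{
	printf("%d\n", extent_covers(0, 4095, 0, 1023));	/* 1 */
	printf("%d\n", extent_covers(1024, 4095, 0, 1023));	/* 0 */
	printf("%d\n", ibits_cover(0x7, 0x5));			/* 1 */
	printf("%d\n", ibits_cover(0x4, 0x5));			/* 0 */
	return 0;
}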
+ */ +static struct ldlm_lock *search_queue(struct list_head *queue, + struct ldlm_match_data *data) +{ + struct ldlm_lock *lock; + + data->lmd_lock = NULL; + + list_for_each_entry(lock, queue, l_res_link) + if (lock_matches(lock, data)) + return data->lmd_lock; + + return NULL; +} + +void ldlm_lock_fail_match_locked(struct ldlm_lock *lock) +{ + if ((lock->l_flags & LDLM_FL_FAIL_NOTIFIED) == 0) { + lock->l_flags |= LDLM_FL_FAIL_NOTIFIED; + wake_up(&lock->l_waitq); + } +} +EXPORT_SYMBOL(ldlm_lock_fail_match_locked); + +void ldlm_lock_fail_match(struct ldlm_lock *lock) +{ + lock_res_and_lock(lock); + ldlm_lock_fail_match_locked(lock); + unlock_res_and_lock(lock); +} + +/** + * Mark lock as "matchable" by OST. + * + * Used to prevent certain races in LOV/OSC where the lock is granted, but LVB + * is not yet valid. + * Assumes LDLM lock is already locked. + */ +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock) +{ + ldlm_set_lvb_ready(lock); + wake_up(&lock->l_waitq); +} +EXPORT_SYMBOL(ldlm_lock_allow_match_locked); + +/** + * Mark lock as "matchable" by OST. + * Locks the lock and then \see ldlm_lock_allow_match_locked + */ +void ldlm_lock_allow_match(struct ldlm_lock *lock) +{ + lock_res_and_lock(lock); + ldlm_lock_allow_match_locked(lock); + unlock_res_and_lock(lock); +} +EXPORT_SYMBOL(ldlm_lock_allow_match); + +/** + * Attempt to find a lock with specified properties. + * + * Typically returns a reference to matched lock unless LDLM_FL_TEST_LOCK is + * set in \a flags + * + * Can be called in two ways: + * + * If 'ns' is NULL, then lockh describes an existing lock that we want to look + * for a duplicate of. + * + * Otherwise, all of the fields must be filled in, to match against. + * + * If 'flags' contains LDLM_FL_LOCAL_ONLY, then only match local locks on the + * server (ie, connh is NULL) + * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted + * list will be considered + * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked + * to be canceled can still be matched as long as they still have reader + * or writer refernces + * If 'flags' contains LDLM_FL_TEST_LOCK, then don't actually reference a lock, + * just tell us if we would have matched. + * + * \retval 1 if it finds an already-existing lock that is compatible; in this + * case, lockh is filled in with a addref()ed lock + * + * We also check security context, and if that fails we simply return 0 (to + * keep caller code unchanged), the context failure will be discovered by + * caller sometime later. 
+ */ +enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns, + __u64 flags, __u64 skip_flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lockh, + enum ldlm_match_flags match_flags) +{ + struct ldlm_match_data data = { + .lmd_old = NULL, + .lmd_lock = NULL, + .lmd_mode = &mode, + .lmd_policy = policy, + .lmd_flags = flags, + .lmd_skip_flags = skip_flags, + .lmd_match = match_flags, + }; + struct ldlm_resource *res; + struct ldlm_lock *lock; + int matched; + + ENTRY; + + if (ns == NULL) { + data.lmd_old = ldlm_handle2lock(lockh); + LASSERT(data.lmd_old != NULL); + + ns = ldlm_lock_to_ns(data.lmd_old); + res_id = &data.lmd_old->l_resource->lr_name; + type = data.lmd_old->l_resource->lr_type; + *data.lmd_mode = data.lmd_old->l_req_mode; + } + + res = ldlm_resource_get(ns, NULL, res_id, type, 0); + if (IS_ERR(res)) { + LASSERT(data.lmd_old == NULL); + RETURN(0); + } + + LDLM_RESOURCE_ADDREF(res); + lock_res(res); + if (res->lr_type == LDLM_EXTENT) + lock = search_itree(res, &data); + else + lock = search_queue(&res->lr_granted, &data); + if (!lock && !(flags & LDLM_FL_BLOCK_GRANTED)) + lock = search_queue(&res->lr_waiting, &data); + matched = lock ? mode : 0; + unlock_res(res); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + + if (lock) { + ldlm_lock2handle(lock, lockh); + if ((flags & LDLM_FL_LVB_READY) && + (!ldlm_is_lvb_ready(lock))) { + __u64 wait_flags = LDLM_FL_LVB_READY | + LDLM_FL_DESTROYED | LDLM_FL_FAIL_NOTIFIED; + + if (lock->l_completion_ast) { + int err = lock->l_completion_ast(lock, + LDLM_FL_WAIT_NOREPROC, + NULL); + if (err) + GOTO(out_fail_match, matched = 0); + } + + wait_event_idle_timeout( + lock->l_waitq, + lock->l_flags & wait_flags, + cfs_time_seconds(obd_timeout)); + + if (!ldlm_is_lvb_ready(lock)) + GOTO(out_fail_match, matched = 0); + } + + /* check user's security context */ + if (lock->l_conn_export && + sptlrpc_import_check_ctx( + class_exp2cliimp(lock->l_conn_export))) + GOTO(out_fail_match, matched = 0); + + LDLM_DEBUG(lock, "matched (%llu %llu)", + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + +out_fail_match: + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else if (!matched) + ldlm_lock_decref_internal(lock, mode); + } + + /* less verbose for test-only */ + if (!matched && !(flags & LDLM_FL_TEST_LOCK)) { + LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res " + "%llu/%llu (%llu %llu)", ns, + type, mode, res_id->name[0], res_id->name[1], + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? 
+ res_id->name[3] : policy->l_extent.end); + } + if (data.lmd_old != NULL) + LDLM_LOCK_PUT(data.lmd_old); + + return matched; +} +EXPORT_SYMBOL(ldlm_lock_match_with_skip); + +enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, + __u64 *bits) +{ + struct ldlm_lock *lock; + enum ldlm_mode mode = 0; + ENTRY; + + lock = ldlm_handle2lock(lockh); + if (lock != NULL) { + lock_res_and_lock(lock); + if (LDLM_HAVE_MASK(lock, GONE)) + GOTO(out, mode); + + if (ldlm_is_cbpending(lock) && + lock->l_readers == 0 && lock->l_writers == 0) + GOTO(out, mode); + + if (bits) + *bits = lock->l_policy_data.l_inodebits.bits; + mode = lock->l_granted_mode; + ldlm_lock_addref_internal_nolock(lock, mode); + } + + EXIT; + +out: + if (lock != NULL) { + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + return mode; +} +EXPORT_SYMBOL(ldlm_revalidate_lock_handle); + +/** The caller must guarantee that the buffer is large enough. */ +int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, + enum req_location loc, void *data, int size) +{ + void *lvb; + ENTRY; + + LASSERT(data != NULL); + LASSERT(size >= 0); + + switch (lock->l_lvb_type) { + case LVB_T_OST: + if (size == sizeof(struct ost_lvb)) { + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb); + else + lvb = req_capsule_server_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + } else if (size == sizeof(struct ost_lvb_v1)) { + struct ost_lvb *olvb = data; + + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb_v1); + else + lvb = req_capsule_server_sized_swab_get(pill, + &RMF_DLM_LVB, size, + lustre_swab_ost_lvb_v1); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + olvb->lvb_mtime_ns = 0; + olvb->lvb_atime_ns = 0; + olvb->lvb_ctime_ns = 0; + } else { + LDLM_ERROR(lock, "Replied unexpected ost LVB size %d", + size); + RETURN(-EINVAL); + } + break; + case LVB_T_LQUOTA: + if (size == sizeof(struct lquota_lvb)) { + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); + else + lvb = req_capsule_server_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + } else { + LDLM_ERROR(lock, "Replied unexpected lquota LVB size %d", + size); + RETURN(-EINVAL); + } + break; + case LVB_T_LAYOUT: + if (size == 0) + break; + + if (loc == RCL_CLIENT) + lvb = req_capsule_client_get(pill, &RMF_DLM_LVB); + else + lvb = req_capsule_server_get(pill, &RMF_DLM_LVB); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + break; + default: + LDLM_ERROR(lock, "Unknown LVB type: %d", lock->l_lvb_type); + libcfs_debug_dumpstack(NULL); + RETURN(-EINVAL); + } + + RETURN(0); +} + +/** + * Create and fill in new LDLM lock with specified properties. 
+ * Returns a referenced lock + */ +struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + enum ldlm_mode mode, + const struct ldlm_callback_suite *cbs, + void *data, __u32 lvb_len, + enum lvb_type lvb_type) +{ + struct ldlm_lock *lock; + struct ldlm_resource *res; + int rc; + ENTRY; + + res = ldlm_resource_get(ns, NULL, res_id, type, 1); + if (IS_ERR(res)) + RETURN(ERR_CAST(res)); + + lock = ldlm_lock_new(res); + if (!lock) { + ldlm_resource_putref(res); + RETURN(ERR_PTR(-ENOMEM)); + } + + lock->l_req_mode = mode; + lock->l_ast_data = data; + lock->l_pid = current->pid; + if (ns_is_server(ns)) + ldlm_set_ns_srv(lock); + if (cbs) { + lock->l_blocking_ast = cbs->lcs_blocking; + lock->l_completion_ast = cbs->lcs_completion; + lock->l_glimpse_ast = cbs->lcs_glimpse; + } + + switch (type) { + case LDLM_EXTENT: + rc = ldlm_extent_alloc_lock(lock); + break; + case LDLM_IBITS: + rc = ldlm_inodebits_alloc_lock(lock); + break; + default: + rc = 0; + } + if (rc) + GOTO(out, rc); + + if (lvb_len) { + lock->l_lvb_len = lvb_len; + OBD_ALLOC_LARGE(lock->l_lvb_data, lvb_len); + if (lock->l_lvb_data == NULL) + GOTO(out, rc = -ENOMEM); + } + + lock->l_lvb_type = lvb_type; + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK)) + GOTO(out, rc = -ENOENT); + + RETURN(lock); + +out: + ldlm_lock_destroy(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(ERR_PTR(rc)); +} + +#ifdef HAVE_SERVER_SUPPORT +static enum ldlm_error ldlm_lock_enqueue_helper(struct ldlm_lock *lock, + __u64 *flags) +{ + struct ldlm_resource *res = lock->l_resource; + enum ldlm_error rc = ELDLM_OK; + LIST_HEAD(rpc_list); + ldlm_processing_policy policy; + + ENTRY; + + policy = ldlm_get_processing_policy(res); + policy(lock, flags, LDLM_PROCESS_ENQUEUE, &rc, &rpc_list); + if (rc == ELDLM_OK && lock->l_granted_mode != lock->l_req_mode && + res->lr_type != LDLM_FLOCK) + rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list); + + if (!list_empty(&rpc_list)) + ldlm_discard_bl_list(&rpc_list); + + RETURN(rc); +} +#endif + +/** + * Enqueue (request) a lock. + * + * Does not block. As a result of enqueue the lock would be put + * into granted or waiting list. + * + * If namespace has intent policy sent and the lock has LDLM_FL_HAS_INTENT flag + * set, skip all the enqueueing and delegate lock processing to intent policy + * function. + */ +enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, + struct ldlm_namespace *ns, + struct ldlm_lock **lockp, + void *cookie, __u64 *flags) +{ + struct ldlm_lock *lock = *lockp; + struct ldlm_resource *res; + int local = ns_is_client(ns); + enum ldlm_error rc = ELDLM_OK; + struct ldlm_interval *node = NULL; +#ifdef HAVE_SERVER_SUPPORT + bool reconstruct = false; +#endif + ENTRY; + + /* policies are not executed on the client or during replay */ + if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT + && !local && ns->ns_policy) { + rc = ns->ns_policy(env, ns, lockp, cookie, lock->l_req_mode, + *flags, NULL); + if (rc == ELDLM_LOCK_REPLACED) { + /* The lock that was returned has already been granted, + * and placed into lockp. If it's not the same as the + * one we passed in, then destroy the old one and our + * work here is done. 
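[Editor's illustration] ldlm_lock_create() and ldlm_resource_get() report failure through the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() convention rather than returning NULL, so the caller recovers the errno from the returned pointer itself. The userspace re-implementation below only demonstrates the encoding; the helper bodies and MAX_ERRNO follow the usual kernel definitions in simplified form and are not meant as the authoritative ones.

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

/* fold a small negative errno into a pointer value */
static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

/* the topmost MAX_ERRNO pointer values are treated as error codes */
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *create_thing(int fail)
{
	static int thing = 42;

	if (fail)
		return ERR_PTR(-ENOMEM);
	return &thing;
}

int main(void)
{
	void *p = create_thing(1);

	if (IS_ERR(p))
		printf("creation failed: %ld\n", PTR_ERR(p));
	return 0;
}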
*/ + if (lock != *lockp) { + ldlm_lock_destroy(lock); + LDLM_LOCK_RELEASE(lock); + } + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(0); + } else if (rc != ELDLM_OK && + ldlm_is_granted(lock)) { + LASSERT(*flags & LDLM_FL_RESENT); + /* It may happen that ns_policy returns an error in + * resend case, object may be unlinked or just some + * error occurs. It is unclear if lock reached the + * client in the original reply, just leave the lock on + * server, not returning it again to client. Due to + * LU-6529, the server will not OOM. */ + RETURN(rc); + } else if (rc != ELDLM_OK || + (rc == ELDLM_OK && (*flags & LDLM_FL_INTENT_ONLY))) { + ldlm_lock_destroy(lock); + RETURN(rc); + } + } + + if (*flags & LDLM_FL_RESENT) { + /* Reconstruct LDLM_FL_SRV_ENQ_MASK @flags for reply. + * Set LOCK_CHANGED always. + * Check if the lock is granted for BLOCK_GRANTED. + * Take NO_TIMEOUT from the lock as it is inherited through + * LDLM_FL_INHERIT_MASK */ + *flags |= LDLM_FL_LOCK_CHANGED; + if (!ldlm_is_granted(lock)) + *flags |= LDLM_FL_BLOCK_GRANTED; + *flags |= lock->l_flags & LDLM_FL_NO_TIMEOUT; + RETURN(ELDLM_OK); + } + +#ifdef HAVE_SERVER_SUPPORT + /* For a replaying lock, it might be already in granted list. So + * unlinking the lock will cause the interval node to be freed, we + * have to allocate the interval node early otherwise we can't regrant + * this lock in the future. - jay + * + * The only time the ldlm_resource changes for the ldlm_lock is when + * ldlm_lock_change_resource() is called and that only happens for + * the Lustre client case. + */ + if (!local && (*flags & LDLM_FL_REPLAY) && + lock->l_resource->lr_type == LDLM_EXTENT) + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + + reconstruct = !local && lock->l_resource->lr_type == LDLM_FLOCK && + !(*flags & LDLM_FL_TEST_LOCK); + if (reconstruct) { + rc = req_can_reconstruct(cookie, NULL); + if (rc != 0) { + if (rc == 1) + rc = 0; + RETURN(rc); + } + } +#endif + res = lock_res_and_lock(lock); + if (local && ldlm_is_granted(lock)) { + /* The server returned a blocked lock, but it was granted + * before we got a chance to actually enqueue it. We don't + * need to do anything else. */ + *flags &= ~LDLM_FL_BLOCKED_MASK; + GOTO(out, rc = ELDLM_OK); + } + + ldlm_resource_unlink_lock(lock); + if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) { + if (node == NULL) { + ldlm_lock_destroy_nolock(lock); + GOTO(out, rc = -ENOMEM); + } + + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + node = NULL; + } + + /* Some flags from the enqueue want to make it into the AST, via the + * lock's l_flags. */ + if (*flags & LDLM_FL_AST_DISCARD_DATA) + ldlm_set_ast_discard_data(lock); + if (*flags & LDLM_FL_TEST_LOCK) + ldlm_set_test_lock(lock); + if (*flags & LDLM_FL_COS_INCOMPAT) + ldlm_set_cos_incompat(lock); + if (*flags & LDLM_FL_COS_ENABLED) + ldlm_set_cos_enabled(lock); + + /* This distinction between local lock trees is very important; a client + * namespace only has information about locks taken by that client, and + * thus doesn't have enough information to decide for itself if it can + * be granted (below). In this case, we do exactly what the server + * tells us to do, as dictated by the 'flags'. + * + * We do exactly the same thing during recovery, when the server is + * more or less trusting the clients not to lie. + * + * FIXME (bug 268): Detect obvious lies by checking compatibility in + * granted queue. 
*/ + if (local) { + if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED)) + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + else + ldlm_grant_lock(lock, NULL); + GOTO(out, rc = ELDLM_OK); +#ifdef HAVE_SERVER_SUPPORT + } else if (*flags & LDLM_FL_REPLAY) { + if (*flags & LDLM_FL_BLOCK_WAIT) { + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + GOTO(out, rc = ELDLM_OK); + } else if (*flags & LDLM_FL_BLOCK_GRANTED) { + ldlm_grant_lock(lock, NULL); + GOTO(out, rc = ELDLM_OK); + } + /* If no flags, fall through to normal enqueue path. */ + } + + rc = ldlm_lock_enqueue_helper(lock, flags); + GOTO(out, rc); +#else + } else { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } +#endif + +out: + unlock_res_and_lock(lock); + +#ifdef HAVE_SERVER_SUPPORT + if (reconstruct) { + struct ptlrpc_request *req = cookie; + + tgt_mk_reply_data(NULL, NULL, + &req->rq_export->exp_target_data, + req, 0, NULL, false, 0); + } +#endif + if (node) + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + return rc; +} + +#ifdef HAVE_SERVER_SUPPORT +/** + * Iterate through all waiting locks on a given resource queue and attempt to + * grant them. + * + * Must be called with resource lock held. + */ +int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, __u64 hint) +{ + struct list_head *tmp, *pos; + ldlm_processing_policy policy; + __u64 flags; + int rc = LDLM_ITER_CONTINUE; + enum ldlm_error err; + LIST_HEAD(bl_ast_list); + + ENTRY; + + check_res_locked(res); + + policy = ldlm_get_processing_policy(res); + LASSERT(policy); + LASSERT(intention == LDLM_PROCESS_RESCAN || + intention == LDLM_PROCESS_RECOVERY); + +restart: + list_for_each_safe(tmp, pos, queue) { + struct ldlm_lock *pending; + LIST_HEAD(rpc_list); + + pending = list_entry(tmp, struct ldlm_lock, l_res_link); + + CDEBUG(D_INFO, "Reprocessing lock %p\n", pending); + + flags = 0; + rc = policy(pending, &flags, intention, &err, &rpc_list); + if (pending->l_granted_mode == pending->l_req_mode || + res->lr_type == LDLM_FLOCK) { + list_splice(&rpc_list, work_list); + } else { + list_splice(&rpc_list, &bl_ast_list); + } + /* + * When this is called from recovery done, we always want + * to scan the whole list no matter what 'rc' is returned. + */ + if (rc != LDLM_ITER_CONTINUE && + intention == LDLM_PROCESS_RESCAN) + break; + } + + if (!list_empty(&bl_ast_list)) { + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &bl_ast_list, + LDLM_WORK_BL_AST); + + lock_res(res); + if (rc == -ERESTART) + GOTO(restart, rc); + } + + if (!list_empty(&bl_ast_list)) + ldlm_discard_bl_list(&bl_ast_list); + + RETURN(intention == LDLM_PROCESS_RESCAN ? rc : LDLM_ITER_CONTINUE); +} + +/** + * Conflicting locks are detected for a lock to be enqueued, add the lock + * into waiting list and send blocking ASTs to the conflicting locks. + * + * \param[in] lock The lock to be enqueued. + * \param[out] flags Lock flags for the lock to be enqueued. + * \param[in] rpc_list Conflicting locks list. + * + * \retval -ERESTART: Some lock was instantly canceled while sending + * blocking ASTs, caller needs to re-check conflicting + * locks. + * \retval -EAGAIN: Lock was destroyed, caller should return error. + * \reval 0: Lock is successfully added in waiting list. 
+ */ +int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, + struct list_head *rpc_list) +{ + struct ldlm_resource *res = lock->l_resource; + int rc; + ENTRY; + + check_res_locked(res); + + /* If either of the compat_queue()s returned failure, then we + * have ASTs to send and must go onto the waiting list. + * + * bug 2322: we used to unlink and re-add here, which was a + * terrible folly -- if we goto restart, we could get + * re-ordered! Causes deadlock, because ASTs aren't sent! */ + if (list_empty(&lock->l_res_link)) + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), rpc_list, + LDLM_WORK_BL_AST); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_OST_FAIL_RACE) && + !ns_is_client(ldlm_res_to_ns(res))) + class_fail_export(lock->l_export); + + if (rc == -ERESTART) + ldlm_reprocess_all(res, 0); + + lock_res(res); + if (rc == -ERESTART) { + /* 15715: The lock was granted and destroyed after + * resource lock was dropped. Interval node was freed + * in ldlm_lock_destroy. Anyway, this always happens + * when a client is being evicted. So it would be + * ok to return an error. -jay */ + if (ldlm_is_destroyed(lock)) + RETURN(-EAGAIN); + + /* lock was granted while resource was unlocked. */ + if (ldlm_is_granted(lock)) { + /* bug 11300: if the lock has been granted, + * break earlier because otherwise, we will go + * to restart and ldlm_resource_unlink will be + * called and it causes the interval node to be + * freed. Then we will fail at + * ldlm_extent_add_lock() */ + *flags &= ~LDLM_FL_BLOCKED_MASK; + } + + } + *flags |= LDLM_FL_BLOCK_GRANTED; + + RETURN(0); +} + +/** + * Discard all AST work items from list. + * + * If for whatever reason we do not want to send ASTs to conflicting locks + * anymore, disassemble the list with this function. + */ +void ldlm_discard_bl_list(struct list_head *bl_list) +{ + struct ldlm_lock *lock, *tmp; + + ENTRY; + + list_for_each_entry_safe(lock, tmp, bl_list, l_bl_ast) { + LASSERT(!list_empty(&lock->l_bl_ast)); + list_del_init(&lock->l_bl_ast); + ldlm_clear_ast_sent(lock); + LASSERT(lock->l_bl_ast_run == 0); + ldlm_clear_blocking_lock(lock); + LDLM_LOCK_RELEASE(lock); + } + EXIT; +} + +/** + * Process a call to blocking AST callback for a lock in ast_work list + */ +static int +ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock *lock; + struct ldlm_lock_desc d; + struct ldlm_bl_desc bld; + int rc; + + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_bl_ast); + + /* nobody should touch l_bl_ast but some locks in the list may become + * granted after lock convert or COS downgrade, these locks should be + * just skipped here and removed from the list. + */ + lock_res_and_lock(lock); + list_del_init(&lock->l_bl_ast); + + /* lock is not blocking lock anymore, but was kept in the list because + * it can managed only here. 
+ */ + if (!ldlm_is_ast_sent(lock)) { + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(0); + } + + LASSERT(lock->l_blocking_lock); + ldlm_lock2desc(lock->l_blocking_lock, &d); + /* copy blocking lock ibits in cancel_bits as well, + * new client may use them for lock convert and it is + * important to use new field to convert locks from + * new servers only + */ + d.l_policy_data.l_inodebits.cancel_bits = + lock->l_blocking_lock->l_policy_data.l_inodebits.bits; + + /* Blocking lock is being destroyed here but some information about it + * may be needed inside l_blocking_ast() function below, + * e.g. in mdt_blocking_ast(). So save needed data in bl_desc. + */ + bld.bl_same_client = lock->l_client_cookie == + lock->l_blocking_lock->l_client_cookie; + bld.bl_cos_incompat = ldlm_is_cos_incompat(lock->l_blocking_lock); + arg->bl_desc = &bld; + + LASSERT(ldlm_is_ast_sent(lock)); + LASSERT(lock->l_bl_ast_run == 0); + lock->l_bl_ast_run++; + ldlm_clear_blocking_lock(lock); + unlock_res_and_lock(lock); + + rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING); + + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process a call to revocation AST callback for a lock in ast_work list + */ +static int +ldlm_work_revoke_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock_desc desc; + int rc; + struct ldlm_lock *lock; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_rk_ast); + list_del_init(&lock->l_rk_ast); + + /* the desc just pretend to exclusive */ + ldlm_lock2desc(lock, &desc); + desc.l_req_mode = LCK_EX; + desc.l_granted_mode = 0; + + rc = lock->l_blocking_ast(lock, &desc, (void*)arg, LDLM_CB_BLOCKING); + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process a call to glimpse AST callback for a lock in ast_work list + */ +int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_glimpse_work *gl_work; + struct ldlm_lock *lock; + int rc = 0; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + gl_work = list_entry(arg->list->next, struct ldlm_glimpse_work, + gl_list); + list_del_init(&gl_work->gl_list); + + lock = gl_work->gl_lock; + + /* transfer the glimpse descriptor to ldlm_cb_set_arg */ + arg->gl_desc = gl_work->gl_desc; + arg->gl_interpret_reply = gl_work->gl_interpret_reply; + arg->gl_interpret_data = gl_work->gl_interpret_data; + + /* invoke the actual glimpse callback */ + rc = lock->l_glimpse_ast(lock, (void *)arg); + if (rc == 0) + rc = 1; /* update LVB if this is server lock */ + else if (rc == -ELDLM_NO_LOCK_DATA) + ldlm_lvbo_update(lock->l_resource, lock, NULL, 1); + + LDLM_LOCK_RELEASE(lock); + if (gl_work->gl_flags & LDLM_GL_WORK_SLAB_ALLOCATED) + OBD_SLAB_FREE_PTR(gl_work, ldlm_glimpse_work_kmem); + else + OBD_FREE_PTR(gl_work); + + RETURN(rc); +} +#endif + +/** + * Process a call to completion AST callback for a lock in ast_work list + */ +static int +ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock *lock; + ldlm_completion_callback completion_callback; + int rc = 0; + + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast); + + /* It's possible to receive a completion AST before we've set + * the l_completion_ast pointer: either because the AST arrived + * before the reply, or simply 
because there's a small race + * window between receiving the reply and finishing the local + * enqueue. (bug 842) + * + * This can't happen with the blocking_ast, however, because we + * will never call the local blocking_ast until we drop our + * reader/writer reference, which we won't do until we get the + * reply and finish enqueueing. */ + + /* nobody should touch l_cp_ast */ + lock_res_and_lock(lock); + list_del_init(&lock->l_cp_ast); + LASSERT(ldlm_is_cp_reqd(lock)); + /* save l_completion_ast since it can be changed by + * mds_intent_policy(), see bug 14225 */ + completion_callback = lock->l_completion_ast; + ldlm_clear_cp_reqd(lock); + unlock_res_and_lock(lock); + + if (completion_callback != NULL) + rc = completion_callback(lock, 0, (void *)arg); + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process list of locks in need of ASTs being sent. + * + * Used on server to send multiple ASTs together instead of sending one by + * one. + */ +int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, + ldlm_desc_ast_t ast_type) +{ + struct ldlm_cb_set_arg *arg; + set_producer_func work_ast_lock; + int rc; + + if (list_empty(rpc_list)) + RETURN(0); + + OBD_ALLOC_PTR(arg); + if (arg == NULL) + RETURN(-ENOMEM); + + atomic_set(&arg->restart, 0); + arg->list = rpc_list; + + switch (ast_type) { + case LDLM_WORK_CP_AST: + arg->type = LDLM_CP_CALLBACK; + work_ast_lock = ldlm_work_cp_ast_lock; + break; +#ifdef HAVE_SERVER_SUPPORT + case LDLM_WORK_BL_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_bl_ast_lock; + break; + case LDLM_WORK_REVOKE_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_revoke_ast_lock; + break; + case LDLM_WORK_GL_AST: + arg->type = LDLM_GL_CALLBACK; + work_ast_lock = ldlm_work_gl_ast_lock; + break; +#endif + default: + LBUG(); + } + + /* We create a ptlrpc request set with flow control extension. + * This request set will use the work_ast_lock function to produce new + * requests and will send a new request each time one completes in order + * to keep the number of requests in flight to ns_max_parallel_ast */ + arg->set = ptlrpc_prep_fcset(ns->ns_max_parallel_ast ? : UINT_MAX, + work_ast_lock, arg); + if (arg->set == NULL) + GOTO(out, rc = -ENOMEM); + + ptlrpc_set_wait(NULL, arg->set); + ptlrpc_set_destroy(arg->set); + + rc = atomic_read(&arg->restart) ? -ERESTART : 0; + GOTO(out, rc); +out: + OBD_FREE_PTR(arg); + return rc; +} + +/** + * Try to grant all waiting locks on a resource. + * + * Calls ldlm_reprocess_queue on waiting queue. + * + * Typically called after some resource locks are cancelled to see + * if anything could be granted as a result of the cancellation. + */ +static void __ldlm_reprocess_all(struct ldlm_resource *res, + enum ldlm_process_intention intention, + __u64 hint) +{ + LIST_HEAD(rpc_list); +#ifdef HAVE_SERVER_SUPPORT + ldlm_reprocessing_policy reprocess; + struct obd_device *obd; + int rc; + + ENTRY; + + /* Local lock trees don't get reprocessed. */ + if (ns_is_client(ldlm_res_to_ns(res))) { + EXIT; + return; + } + + /* Disable reprocess during lock replay stage but allow during + * request replay stage. 
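[Editor's illustration] ldlm_run_ast_work() does not fire the whole list at once: the request set is built with a producer callback that hands out one AST RPC per call, returns -ENOENT once the list is drained, and is asked for a replacement each time an RPC completes, keeping at most ns_max_parallel_ast requests in flight. The rough userspace sketch below captures only that flow-controlled producer idea; it uses invented names and is not the ptlrpc API.

#include <stdio.h>

#define MAX_IN_FLIGHT 2

/* hands out one work item per call and reports exhaustion with -1,
 * the way the ldlm_work_*_ast_lock() producers report -ENOENT */
static int produce(void *arg)
{
	int *remaining = arg;

	if (*remaining == 0)
		return -1;	/* nothing left to send */
	(*remaining)--;
	return 0;
}

/* crude stand-in for a flow-controlled request set: keep at most
 * MAX_IN_FLIGHT items outstanding, replacing each one that completes */
static void run_set(int (*producer)(void *), void *arg)
{
	int in_flight = 0, sent = 0;

	while (in_flight < MAX_IN_FLIGHT && producer(arg) == 0) {
		in_flight++;
		sent++;
	}
	while (in_flight > 0) {
		in_flight--;			/* one request completed */
		if (producer(arg) == 0) {	/* top the window back up */
			in_flight++;
			sent++;
		}
	}
	printf("sent %d requests, never more than %d in flight\n",
	       sent, MAX_IN_FLIGHT);
}

int main(void)
{
	int items = 5;

	run_set(produce, &items);
	return 0;
}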
+ */ + obd = ldlm_res_to_ns(res)->ns_obd; + if (obd->obd_recovering && + atomic_read(&obd->obd_req_replay_clients) == 0) + RETURN_EXIT; +restart: + lock_res(res); + reprocess = ldlm_get_reprocessing_policy(res); + reprocess(res, &res->lr_waiting, &rpc_list, intention, hint); + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &rpc_list, + LDLM_WORK_CP_AST); + if (rc == -ERESTART) { + LASSERT(list_empty(&rpc_list)); + hint = 0; + goto restart; + } +#else + ENTRY; + + if (!ns_is_client(ldlm_res_to_ns(res))) { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } +#endif + EXIT; +} + +void ldlm_reprocess_all(struct ldlm_resource *res, __u64 hint) +{ + __ldlm_reprocess_all(res, LDLM_PROCESS_RESCAN, hint); +} +EXPORT_SYMBOL(ldlm_reprocess_all); + +static int ldlm_reprocess_res(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + /* This is only called once after recovery done. LU-8306. */ + __ldlm_reprocess_all(res, LDLM_PROCESS_RECOVERY, 0); + return 0; +} + +/** + * Iterate through all resources on a namespace attempting to grant waiting + * locks. + */ +void ldlm_reprocess_recovery_done(struct ldlm_namespace *ns) +{ + ENTRY; + + if (ns != NULL) { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_reprocess_res, NULL, 0); + } + EXIT; +} + +/** + * Helper function to call blocking AST for LDLM lock \a lock in a + * "cancelling" mode. + */ +void ldlm_cancel_callback(struct ldlm_lock *lock) +{ + check_res_locked(lock->l_resource); + if (!ldlm_is_cancel(lock)) { + ldlm_set_cancel(lock); + if (lock->l_blocking_ast) { + unlock_res_and_lock(lock); + lock->l_blocking_ast(lock, NULL, lock->l_ast_data, + LDLM_CB_CANCELING); + lock_res_and_lock(lock); + } else { + LDLM_DEBUG(lock, "no blocking ast"); + } + + /* only canceller can set bl_done bit */ + ldlm_set_bl_done(lock); + wake_up(&lock->l_waitq); + } else if (!ldlm_is_bl_done(lock)) { + /* The lock is guaranteed to have been canceled once + * returning from this function. */ + unlock_res_and_lock(lock); + wait_event_idle(lock->l_waitq, is_bl_done(lock)); + lock_res_and_lock(lock); + } +} + +/** + * Remove skiplist-enabled LDLM lock \a req from granted list + */ +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req) +{ + if (req->l_resource->lr_type != LDLM_PLAIN && + req->l_resource->lr_type != LDLM_IBITS) + return; + + list_del_init(&req->l_sl_policy); + list_del_init(&req->l_sl_mode); +} + +/** + * Attempts to cancel LDLM lock \a lock that has no reader/writer references. + */ +void ldlm_lock_cancel(struct ldlm_lock *lock) +{ + struct ldlm_resource *res; + struct ldlm_namespace *ns; + ENTRY; + + lock_res_and_lock(lock); + + res = lock->l_resource; + ns = ldlm_res_to_ns(res); + + /* Please do not, no matter how tempting, remove this LBUG without + * talking to me first. -phik */ + if (lock->l_readers || lock->l_writers) { + LDLM_ERROR(lock, "lock still has references"); + unlock_res_and_lock(lock); + LBUG(); + } + + if (ldlm_is_waited(lock)) + ldlm_del_waiting_lock(lock); + + /* Releases cancel callback. 
*/ + ldlm_cancel_callback(lock); + + /* Yes, second time, just in case it was added again while we were + * running with no res lock in ldlm_cancel_callback */ + if (ldlm_is_waited(lock)) + ldlm_del_waiting_lock(lock); + + ldlm_resource_unlink_lock(lock); + ldlm_lock_destroy_nolock(lock); + + if (ldlm_is_granted(lock)) + ldlm_pool_del(&ns->ns_pool, lock); + + /* Make sure we will not be called again for same lock what is possible + * if not to zero out lock->l_granted_mode */ + lock->l_granted_mode = LCK_MINMODE; + unlock_res_and_lock(lock); + + EXIT; +} +EXPORT_SYMBOL(ldlm_lock_cancel); + +/** + * Set opaque data into the lock that only makes sense to upper layer. + */ +int ldlm_lock_set_data(const struct lustre_handle *lockh, void *data) +{ + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + int rc = -EINVAL; + ENTRY; + + if (lock) { + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + rc = 0; + LDLM_LOCK_PUT(lock); + } + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_lock_set_data); + +struct export_cl_data { + const struct lu_env *ecl_env; + struct obd_export *ecl_exp; + int ecl_loop; +}; + +static void ldlm_cancel_lock_for_export(struct obd_export *exp, + struct ldlm_lock *lock, + struct export_cl_data *ecl) +{ + struct ldlm_resource *res; + + res = ldlm_resource_getref(lock->l_resource); + + ldlm_lvbo_update(res, lock, NULL, 1); + ldlm_lock_cancel(lock); + if (!exp->exp_obd->obd_stopping) + ldlm_reprocess_all(res, lock->l_policy_data.l_inodebits.bits); + ldlm_resource_putref(res); + + ecl->ecl_loop++; + if ((ecl->ecl_loop & -ecl->ecl_loop) == ecl->ecl_loop) { + CDEBUG(D_INFO, "Export %p, %d locks cancelled.\n", + exp, ecl->ecl_loop); + } +} + +/** + * Iterator function for ldlm_export_cancel_locks. + * Cancels passed locks. + */ +static int +ldlm_cancel_locks_for_export_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) + +{ + struct export_cl_data *ecl = (struct export_cl_data *)data; + struct obd_export *exp = ecl->ecl_exp; + struct ldlm_lock *lock = cfs_hash_object(hs, hnode); + + LDLM_LOCK_GET(lock); + ldlm_cancel_lock_for_export(exp, lock, ecl); + LDLM_LOCK_RELEASE(lock); + + return 0; +} + +/** + * Cancel all blocked locks for given export. + * + * Typically called on client disconnection/eviction + */ +int ldlm_export_cancel_blocked_locks(struct obd_export *exp) +{ + struct lu_env env; + struct export_cl_data ecl = { + .ecl_exp = exp, + .ecl_loop = 0, + }; + int rc; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc) + RETURN(rc); + ecl.ecl_env = &env; + + while (!list_empty(&exp->exp_bl_list)) { + struct ldlm_lock *lock; + + spin_lock_bh(&exp->exp_bl_list_lock); + if (!list_empty(&exp->exp_bl_list)) { + lock = list_entry(exp->exp_bl_list.next, + struct ldlm_lock, l_exp_list); + LDLM_LOCK_GET(lock); + list_del_init(&lock->l_exp_list); + } else { + lock = NULL; + } + spin_unlock_bh(&exp->exp_bl_list_lock); + + if (lock == NULL) + break; + + ldlm_cancel_lock_for_export(exp, lock, &ecl); + LDLM_LOCK_RELEASE(lock); + } + + lu_env_fini(&env); + + CDEBUG(D_DLMTRACE, "Export %p, canceled %d locks, " + "left on hash table %d.\n", exp, ecl.ecl_loop, + atomic_read(&exp->exp_lock_hash->hs_count)); + + return ecl.ecl_loop; +} + +/** + * Cancel all locks for given export. 
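+ *
+ * Illustration (the wrapper below is hypothetical, not a function in
+ * this file): a caller that wants to drop everything a client holds can
+ * combine the two entry points, flushing the blocked list before the
+ * full per-export lock hash:
+ *
+ *	static void example_drop_client_locks(struct obd_export *exp)
+ *	{
+ *		ldlm_export_cancel_blocked_locks(exp);
+ *		ldlm_export_cancel_locks(exp);
+ *	}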
+ * + * Typically called after client disconnection/eviction + */ +int ldlm_export_cancel_locks(struct obd_export *exp) +{ + struct export_cl_data ecl; + struct lu_env env; + int rc; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc) + RETURN(rc); + ecl.ecl_env = &env; + ecl.ecl_exp = exp; + ecl.ecl_loop = 0; + + cfs_hash_for_each_empty(exp->exp_lock_hash, + ldlm_cancel_locks_for_export_cb, &ecl); + + CDEBUG(D_DLMTRACE, "Export %p, canceled %d locks, " + "left on hash table %d.\n", exp, ecl.ecl_loop, + atomic_read(&exp->exp_lock_hash->hs_count)); + + if (ecl.ecl_loop > 0 && + atomic_read(&exp->exp_lock_hash->hs_count) == 0 && + exp->exp_obd->obd_stopping) + ldlm_reprocess_recovery_done(exp->exp_obd->obd_namespace); + + lu_env_fini(&env); + + return ecl.ecl_loop; +} + +/** + * Downgrade an PW/EX lock to COS | CR mode. + * + * A lock mode convertion from PW/EX mode to less conflict mode. The + * convertion may fail if lock was canceled before downgrade, but it doesn't + * indicate any problem, because such lock has no reader or writer, and will + * be released soon. + * + * Used by Commit on Sharing (COS) code to force object changes commit in case + * of conflict. Converted lock is considered as new lock and all blocking AST + * things are cleared, so any pending or new blocked lock on that lock will + * cause new call to blocking_ast and force resource object commit. + * + * Also used by layout_change to replace EX lock to CR lock. + * + * \param lock A lock to convert + * \param new_mode new lock mode + */ +void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode) +{ +#ifdef HAVE_SERVER_SUPPORT + ENTRY; + + LASSERT(new_mode == LCK_COS || new_mode == LCK_CR); + + lock_res_and_lock(lock); + + if (!(lock->l_granted_mode & (LCK_PW | LCK_EX))) { + unlock_res_and_lock(lock); + + LASSERT(lock->l_granted_mode == LCK_MINMODE); + LDLM_DEBUG(lock, "lock was canceled before downgrade"); + RETURN_EXIT; + } + + ldlm_resource_unlink_lock(lock); + /* + * Remove the lock from pool as it will be added again in + * ldlm_grant_lock() called below. + */ + ldlm_pool_del(&ldlm_lock_to_ns(lock)->ns_pool, lock); + + /* Consider downgraded lock as a new lock and clear all states + * related to a previous blocking AST processing. + */ + ldlm_clear_blocking_data(lock); + + lock->l_req_mode = new_mode; + ldlm_grant_lock(lock, NULL); + unlock_res_and_lock(lock); + + ldlm_reprocess_all(lock->l_resource, + lock->l_policy_data.l_inodebits.bits); + + EXIT; +#endif +} +EXPORT_SYMBOL(ldlm_lock_mode_downgrade); + +/** + * Print lock with lock handle \a lockh description into debug log. + * + * Used when printing all locks on a resource for debug purposes. + */ +void ldlm_lock_dump_handle(int level, const struct lustre_handle *lockh) +{ + struct ldlm_lock *lock; + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + lock = ldlm_handle2lock(lockh); + if (lock == NULL) + return; + + LDLM_DEBUG_LIMIT(level, lock, "###"); + + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_dump_handle); + +/** + * Print lock information with custom message into debug log. + * Helper function. + */ +void _ldlm_lock_debug(struct ldlm_lock *lock, + struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) 
+{ + va_list args; + struct obd_export *exp = lock->l_export; + struct ldlm_resource *resource = NULL; + struct va_format vaf; + char *nid = "local"; + + rcu_read_lock(); + resource = rcu_dereference(lock->l_resource); + if (resource && !atomic_inc_not_zero(&resource->lr_refcount)) + resource = NULL; + rcu_read_unlock(); + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + if (exp && exp->exp_connection) { + nid = obd_export_nid2str(exp); + } else if (exp && exp->exp_obd != NULL) { + struct obd_import *imp = exp->exp_obd->u.cli.cl_import; + nid = obd_import_nid2str(imp); + } + + if (resource == NULL) { + libcfs_debug_msg(msgdata, + "%pV ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", + &vaf, + lock, + lock->l_handle.h_cookie, + refcount_read(&lock->l_handle.h_ref), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + lock->l_flags, nid, + lock->l_remote_handle.cookie, + exp ? refcount_read(&exp->exp_handle.h_ref) : -99, + lock->l_pid, lock->l_callback_timestamp, + lock->l_lvb_type); + va_end(args); + return; + } + + switch (resource->lr_type) { + case LDLM_EXTENT: + libcfs_debug_msg(msgdata, + "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s [%llu->%llu] (req %llu->%llu) gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", + &vaf, + ldlm_lock_to_ns_name(lock), lock, + lock->l_handle.h_cookie, + refcount_read(&lock->l_handle.h_ref), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_extent.start, + lock->l_policy_data.l_extent.end, + lock->l_req_extent.start, lock->l_req_extent.end, + lock->l_req_extent.gid, + lock->l_flags, nid, + lock->l_remote_handle.cookie, + exp ? refcount_read(&exp->exp_handle.h_ref) : -99, + lock->l_pid, lock->l_callback_timestamp, + lock->l_lvb_type); + break; + + case LDLM_FLOCK: + libcfs_debug_msg(msgdata, + "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s pid: %d [%llu->%llu] flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld\n", + &vaf, + ldlm_lock_to_ns_name(lock), lock, + lock->l_handle.h_cookie, + refcount_read(&lock->l_handle.h_ref), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_flock.pid, + lock->l_policy_data.l_flock.start, + lock->l_policy_data.l_flock.end, + lock->l_flags, nid, + lock->l_remote_handle.cookie, + exp ? 
refcount_read(&exp->exp_handle.h_ref) : -99, + lock->l_pid, lock->l_callback_timestamp); + break; + + case LDLM_IBITS: + libcfs_debug_msg(msgdata, + "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", + &vaf, + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + refcount_read(&lock->l_handle.h_ref), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + lock->l_policy_data.l_inodebits.bits, + lock->l_policy_data.l_inodebits.try_bits, + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_inodebits.li_gid, + lock->l_flags, nid, + lock->l_remote_handle.cookie, + exp ? refcount_read(&exp->exp_handle.h_ref) : -99, + lock->l_pid, lock->l_callback_timestamp, + lock->l_lvb_type); + break; + + default: + libcfs_debug_msg(msgdata, + "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", + &vaf, + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + refcount_read(&lock->l_handle.h_ref), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_flags, nid, + lock->l_remote_handle.cookie, + exp ? refcount_read(&exp->exp_handle.h_ref) : -99, + lock->l_pid, lock->l_callback_timestamp, + lock->l_lvb_type); + break; + } + va_end(args); + ldlm_resource_putref(resource); +} +EXPORT_SYMBOL(_ldlm_lock_debug); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c new file mode 100644 index 0000000000000..f82df7df0e444 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c @@ -0,0 +1,3488 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/ldlm_lockd.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include +#include +#include "ldlm_internal.h" + +static int ldlm_num_threads; +module_param(ldlm_num_threads, int, 0444); +MODULE_PARM_DESC(ldlm_num_threads, "number of DLM service threads to start"); + +static unsigned int ldlm_cpu_bind = 1; +module_param(ldlm_cpu_bind, uint, 0444); +MODULE_PARM_DESC(ldlm_cpu_bind, + "bind DLM service threads to particular CPU partitions"); + +static char *ldlm_cpts; +module_param(ldlm_cpts, charp, 0444); +MODULE_PARM_DESC(ldlm_cpts, "CPU partitions ldlm threads should run on"); + +static DEFINE_MUTEX(ldlm_ref_mutex); +static int ldlm_refcount; + +struct kobject *ldlm_kobj; +struct kset *ldlm_ns_kset; +struct kset *ldlm_svc_kset; + +/* LDLM state */ + +static struct ldlm_state *ldlm_state; + +/* + * timeout for initial callback (AST) reply (bz10399) + * Due to having to send a 32 bit time value over the + * wire return it as timeout_t instead of time64_t + */ +static inline timeout_t ldlm_get_rq_timeout(void) +{ + /* Non-AT value */ + timeout_t timeout = min(ldlm_timeout, obd_timeout / 3); + + return timeout < 1 ? 1 : timeout; +} + +struct ldlm_bl_pool { + spinlock_t blp_lock; + + /* + * blp_prio_list is used for callbacks that should be handled + * as a priority. It is used for LDLM_FL_DISCARD_DATA requests. + * see b=13843 + */ + struct list_head blp_prio_list; + + /* + * blp_list is used for all other callbacks which are likely + * to take longer to process. + */ + struct list_head blp_list; + + wait_queue_head_t blp_waitq; + struct completion blp_comp; + atomic_t blp_num_threads; + atomic_t blp_busy_threads; + int blp_min_threads; + int blp_max_threads; +}; + +struct ldlm_bl_work_item { + struct list_head blwi_entry; + struct ldlm_namespace *blwi_ns; + struct ldlm_lock_desc blwi_ld; + struct ldlm_lock *blwi_lock; + struct list_head blwi_head; + int blwi_count; + struct completion blwi_comp; + enum ldlm_cancel_flags blwi_flags; + int blwi_mem_pressure; +}; + +#ifdef HAVE_SERVER_SUPPORT + +/** + * Protects both waiting_locks_list and expired_lock_thread. + */ +static DEFINE_SPINLOCK(waiting_locks_spinlock); /* BH lock (timer) */ + +/** + * List for contended locks. + * + * As soon as a lock is contended, it gets placed on this list and + * expected time to get a response is filled in the lock. A special + * thread walks the list looking for locks that should be released and + * schedules client evictions for those that have not been released in + * time. + * + * All access to it should be under waiting_locks_spinlock. 
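+ *
+ * In outline, an entry moves through the machinery below as:
+ *
+ *	ldlm_add_waiting_lock()    adds l_pending_chain, (re)arms the timer
+ *	waiting_locks_callback()   moves expired entries to expired_lock_list
+ *	expired_lock_main()        prolongs busy locks or evicts the client
+ *	ldlm_del_waiting_lock()    drops the entry once the cancel arrives
+ *
+ * with every list manipulation done under waiting_locks_spinlock.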
+ */ +static LIST_HEAD(waiting_locks_list); +static void waiting_locks_callback(TIMER_DATA_TYPE unused); +static CFS_DEFINE_TIMER(waiting_locks_timer, waiting_locks_callback, 0, 0); + +enum elt_state { + ELT_STOPPED, + ELT_READY, + ELT_TERMINATE, +}; + +static DECLARE_WAIT_QUEUE_HEAD(expired_lock_wait_queue); +static enum elt_state expired_lock_thread_state = ELT_STOPPED; +static int expired_lock_dump; +static LIST_HEAD(expired_lock_list); + +static int ldlm_lock_busy(struct ldlm_lock *lock); +static int ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t timeout); +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t timeout); + +static inline int have_expired_locks(void) +{ + int need_to_run; + + ENTRY; + spin_lock_bh(&waiting_locks_spinlock); + need_to_run = !list_empty(&expired_lock_list); + spin_unlock_bh(&waiting_locks_spinlock); + + RETURN(need_to_run); +} + +/** + * Check expired lock list for expired locks and time them out. + */ +static int expired_lock_main(void *arg) +{ + struct list_head *expired = &expired_lock_list; + int do_dump; + + ENTRY; + + expired_lock_thread_state = ELT_READY; + wake_up(&expired_lock_wait_queue); + + while (1) { + wait_event_idle(expired_lock_wait_queue, + have_expired_locks() || + expired_lock_thread_state == ELT_TERMINATE); + + spin_lock_bh(&waiting_locks_spinlock); + if (expired_lock_dump) { + spin_unlock_bh(&waiting_locks_spinlock); + + /* from waiting_locks_callback, but not in timer */ + libcfs_debug_dumplog(); + + spin_lock_bh(&waiting_locks_spinlock); + expired_lock_dump = 0; + } + + do_dump = 0; + + while (!list_empty(expired)) { + struct obd_export *export; + struct ldlm_lock *lock; + + lock = list_entry(expired->next, struct ldlm_lock, + l_pending_chain); + if ((void *)lock < LP_POISON + PAGE_SIZE && + (void *)lock >= LP_POISON) { + spin_unlock_bh(&waiting_locks_spinlock); + CERROR("free lock on elt list %p\n", lock); + LBUG(); + } + list_del_init(&lock->l_pending_chain); + if ((void *)lock->l_export < + LP_POISON + PAGE_SIZE && + (void *)lock->l_export >= LP_POISON) { + CERROR("lock with free export on elt list %p\n", + lock->l_export); + lock->l_export = NULL; + LDLM_ERROR(lock, "free export"); + /* + * release extra ref grabbed by + * ldlm_add_waiting_lock() or + * ldlm_failed_ast() + */ + LDLM_LOCK_RELEASE(lock); + continue; + } + + if (ldlm_is_destroyed(lock)) { + /* + * release the lock refcount where + * waiting_locks_callback() founds + */ + LDLM_LOCK_RELEASE(lock); + continue; + } + export = class_export_lock_get(lock->l_export, lock); + spin_unlock_bh(&waiting_locks_spinlock); + + /* Check if we need to prolong timeout */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) && + lock->l_callback_timestamp != 0 && /* not AST error */ + ldlm_lock_busy(lock)) { + LDLM_DEBUG(lock, "prolong the busy lock"); + lock_res_and_lock(lock); + ldlm_add_waiting_lock(lock, + ldlm_bl_timeout(lock) >> 1); + unlock_res_and_lock(lock); + } else { + spin_lock_bh(&export->exp_bl_list_lock); + list_del_init(&lock->l_exp_list); + spin_unlock_bh(&export->exp_bl_list_lock); + + LDLM_ERROR(lock, + "lock callback timer expired after %llds: evicting client at %s ", + ktime_get_seconds() - + lock->l_blast_sent, + obd_export_nid2str(export)); + ldlm_lock_to_ns(lock)->ns_timeouts++; + if (do_dump_on_eviction(export->exp_obd)) + do_dump++; + class_fail_export(export); + } + class_export_lock_put(export, lock); + /* + * release extra ref grabbed by ldlm_add_waiting_lock() + * or ldlm_failed_ast() + */ + LDLM_LOCK_RELEASE(lock); + + 
spin_lock_bh(&waiting_locks_spinlock); + } + spin_unlock_bh(&waiting_locks_spinlock); + + if (do_dump) { + CERROR("dump the log upon eviction\n"); + libcfs_debug_dumplog(); + } + + if (expired_lock_thread_state == ELT_TERMINATE) + break; + } + + expired_lock_thread_state = ELT_STOPPED; + wake_up(&expired_lock_wait_queue); + RETURN(0); +} + +/** + * Check if there is a request in the export request list + * which prevents the lock canceling. + */ +static int ldlm_lock_busy(struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + int match = 0; + + ENTRY; + + if (lock->l_export == NULL) + return 0; + + spin_lock(&lock->l_export->exp_rpc_lock); + list_for_each_entry(req, &lock->l_export->exp_hp_rpcs, + rq_exp_list) { + if (req->rq_ops->hpreq_lock_match) { + match = req->rq_ops->hpreq_lock_match(req, lock); + if (match) + break; + } + } + spin_unlock(&lock->l_export->exp_rpc_lock); + RETURN(match); +} + +/* This is called from within a timer interrupt and cannot schedule */ +static void waiting_locks_callback(TIMER_DATA_TYPE unused) +{ + struct ldlm_lock *lock; + int need_dump = 0; + + spin_lock_bh(&waiting_locks_spinlock); + while (!list_empty(&waiting_locks_list)) { + lock = list_entry(waiting_locks_list.next, struct ldlm_lock, + l_pending_chain); + if (lock->l_callback_timestamp > ktime_get_seconds() || + lock->l_req_mode == LCK_GROUP) + break; + + /* + * no needs to take an extra ref on the lock since it was in + * the waiting_locks_list and ldlm_add_waiting_lock() + * already grabbed a ref + */ + list_move(&lock->l_pending_chain, &expired_lock_list); + need_dump = 1; + } + + if (!list_empty(&expired_lock_list)) { + if (obd_dump_on_timeout && need_dump) + expired_lock_dump = __LINE__; + + wake_up(&expired_lock_wait_queue); + } + + /* + * Make sure the timer will fire again if we have any locks + * left. + */ + if (!list_empty(&waiting_locks_list)) { + time64_t now = ktime_get_seconds(); + timeout_t delta = 0; + + lock = list_entry(waiting_locks_list.next, struct ldlm_lock, + l_pending_chain); + if (lock->l_callback_timestamp - now > 0) + delta = lock->l_callback_timestamp - now; + mod_timer(&waiting_locks_timer, + jiffies + cfs_time_seconds(delta)); + } + spin_unlock_bh(&waiting_locks_spinlock); +} + +/** + * Add lock to the list of contended locks. + * + * Indicate that we're waiting for a client to call us back cancelling a given + * lock. We add it to the pending-callback chain, and schedule the lock-timeout + * timer to fire appropriately. (We round up to the next second, to avoid + * floods of timer firings during periods of high lock contention and traffic). + * As done by ldlm_add_waiting_lock(), the caller must grab a lock reference + * if it has been added to the waiting list (1 is returned). + * + * Called with the namespace lock held. 
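+ *
+ * Worked example (numbers for illustration only): with l_blast_sent = T
+ * and delay = 30 seconds, the function below computes
+ *
+ *	deadline = T + 30;
+ *	l_callback_timestamp = max(l_callback_timestamp, deadline);
+ *	timeout = clamp(l_callback_timestamp - T, 0, 30);
+ *	mod_timer(&waiting_locks_timer, jiffies + cfs_time_seconds(timeout));
+ *
+ * i.e. the per-lock deadline is only ever pushed forward, while the
+ * timer is re-armed to fire no more than 30 seconds from now (and only
+ * if that is earlier than its current expiry).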
+ */ +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t delay) +{ + unsigned long timeout_jiffies = jiffies; + time64_t deadline; + timeout_t timeout; + + lock->l_blast_sent = ktime_get_seconds(); + if (!list_empty(&lock->l_pending_chain)) + return 0; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT) || + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) + delay = 1; + + deadline = lock->l_blast_sent + delay; + if (likely(deadline > lock->l_callback_timestamp)) + lock->l_callback_timestamp = deadline; + + timeout = clamp_t(timeout_t, + lock->l_callback_timestamp - lock->l_blast_sent, + 0, delay); + timeout_jiffies += cfs_time_seconds(timeout); + + if (time_before(timeout_jiffies, waiting_locks_timer.expires) || + !timer_pending(&waiting_locks_timer)) + mod_timer(&waiting_locks_timer, timeout_jiffies); + + /* + * if the new lock has a shorter timeout than something earlier on + * the list, we'll wait the longer amount of time; no big deal. + */ + /* FIFO */ + list_add_tail(&lock->l_pending_chain, &waiting_locks_list); + return 1; +} + +static void ldlm_add_blocked_lock(struct ldlm_lock *lock) +{ + spin_lock_bh(&lock->l_export->exp_bl_list_lock); + if (list_empty(&lock->l_exp_list)) { + if (!ldlm_is_granted(lock)) + list_add_tail(&lock->l_exp_list, + &lock->l_export->exp_bl_list); + else + list_add(&lock->l_exp_list, + &lock->l_export->exp_bl_list); + } + spin_unlock_bh(&lock->l_export->exp_bl_list_lock); + + /* + * A blocked lock is added. Adjust the position in + * the stale list if the export is in the list. + * If export is stale and not in the list - it is being + * processed and will be placed on the right position + * on obd_stale_export_put(). + */ + if (!list_empty(&lock->l_export->exp_stale_list)) + obd_stale_export_adjust(lock->l_export); +} + +static int ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t timeout) +{ + int ret; + + /* NB: must be called with hold of lock_res_and_lock() */ + LASSERT(ldlm_is_res_locked(lock)); + LASSERT(!ldlm_is_cancel_on_block(lock)); + + /* + * Do not put cross-MDT lock in the waiting list, since we + * will not evict it due to timeout for now + */ + if (lock->l_export != NULL && + (exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS)) + return 0; + + spin_lock_bh(&waiting_locks_spinlock); + if (ldlm_is_cancel(lock)) { + spin_unlock_bh(&waiting_locks_spinlock); + return 0; + } + + if (ldlm_is_destroyed(lock)) { + static time64_t next; + + spin_unlock_bh(&waiting_locks_spinlock); + LDLM_ERROR(lock, "not waiting on destroyed lock (b=5653)"); + if (ktime_get_seconds() > next) { + next = ktime_get_seconds() + 14400; + libcfs_debug_dumpstack(NULL); + } + return 0; + } + + ldlm_set_waited(lock); + ret = __ldlm_add_waiting_lock(lock, timeout); + if (ret) { + /* + * grab ref on the lock if it has been added to the + * waiting list + */ + LDLM_LOCK_GET(lock); + } + spin_unlock_bh(&waiting_locks_spinlock); + + if (ret) + ldlm_add_blocked_lock(lock); + + LDLM_DEBUG(lock, "%sadding to wait list(timeout: %d, AT: %s)", + ret == 0 ? "not re-" : "", timeout, + AT_OFF ? "off" : "on"); + return ret; +} + +/** + * Remove a lock from the pending list, likely because it had its cancellation + * callback arrive without incident. This adjusts the lock-timeout timer if + * needed. Returns 0 if the lock wasn't pending after all, 1 if it was. + * As done by ldlm_del_waiting_lock(), the caller must release the lock + * reference when the lock is removed from any list (1 is returned). + * + * Called with namespace lock held. 
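+ *
+ * In outline, removing the head of waiting_locks_list either cancels the
+ * timer (list now empty) or re-arms it for the next entry:
+ *
+ *	if (list_empty(&waiting_locks_list))
+ *		del_timer(&waiting_locks_timer);
+ *	else
+ *		mod_timer(&waiting_locks_timer,
+ *			  jiffies + cfs_time_seconds(next_deadline - now));
+ *
+ * where next_deadline stands for the l_callback_timestamp of the new
+ * list head (next_deadline and now are illustrative names, not
+ * variables in the function below).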
+ */ +static int __ldlm_del_waiting_lock(struct ldlm_lock *lock) +{ + struct list_head *list_next; + + if (list_empty(&lock->l_pending_chain)) + return 0; + + list_next = lock->l_pending_chain.next; + if (lock->l_pending_chain.prev == &waiting_locks_list) { + /* Removing the head of the list, adjust timer. */ + if (list_next == &waiting_locks_list) { + /* No more, just cancel. */ + del_timer(&waiting_locks_timer); + } else { + time64_t now = ktime_get_seconds(); + struct ldlm_lock *next; + timeout_t delta = 0; + + next = list_entry(list_next, struct ldlm_lock, + l_pending_chain); + if (next->l_callback_timestamp - now > 0) + delta = lock->l_callback_timestamp - now; + + mod_timer(&waiting_locks_timer, + jiffies + cfs_time_seconds(delta)); + } + } + list_del_init(&lock->l_pending_chain); + + return 1; +} + +int ldlm_del_waiting_lock(struct ldlm_lock *lock) +{ + int ret; + + if (lock->l_export == NULL) { + /* We don't have a "waiting locks list" on clients. */ + CDEBUG(D_DLMTRACE, "Client lock %p : no-op\n", lock); + return 0; + } + + spin_lock_bh(&waiting_locks_spinlock); + ret = __ldlm_del_waiting_lock(lock); + ldlm_clear_waited(lock); + spin_unlock_bh(&waiting_locks_spinlock); + + /* remove the lock out of export blocking list */ + spin_lock_bh(&lock->l_export->exp_bl_list_lock); + list_del_init(&lock->l_exp_list); + spin_unlock_bh(&lock->l_export->exp_bl_list_lock); + + if (ret) { + /* + * release lock ref if it has indeed been removed + * from a list + */ + LDLM_LOCK_RELEASE(lock); + } + + LDLM_DEBUG(lock, "%s", ret == 0 ? "wasn't waiting" : "removed"); + return ret; +} + +/** + * Prolong the contended lock waiting time. + * + * Called with namespace lock held. + */ +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, timeout_t timeout) +{ + if (lock->l_export == NULL) { + /* We don't have a "waiting locks list" on clients. */ + LDLM_DEBUG(lock, "client lock: no-op"); + return 0; + } + + if (exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS) { + /* We don't have a "waiting locks list" on OSP. */ + LDLM_DEBUG(lock, "MDS-MDS lock: no-op"); + return 0; + } + + spin_lock_bh(&waiting_locks_spinlock); + + if (list_empty(&lock->l_pending_chain)) { + spin_unlock_bh(&waiting_locks_spinlock); + LDLM_DEBUG(lock, "wasn't waiting"); + return 0; + } + + /* + * we remove/add the lock to the waiting list, so no needs to + * release/take a lock reference + */ + __ldlm_del_waiting_lock(lock); + __ldlm_add_waiting_lock(lock, timeout); + spin_unlock_bh(&waiting_locks_spinlock); + + LDLM_DEBUG(lock, "refreshed to %ds", timeout); + return 1; +} +EXPORT_SYMBOL(ldlm_refresh_waiting_lock); + +#else /* HAVE_SERVER_SUPPORT */ + +int ldlm_del_waiting_lock(struct ldlm_lock *lock) +{ + RETURN(0); +} + +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, timeout_t timeout) +{ + RETURN(0); +} + +#endif /* !HAVE_SERVER_SUPPORT */ + +#ifdef HAVE_SERVER_SUPPORT + +/** + * Calculate the per-export Blocking timeout (covering BL AST, data flush, + * lock cancel, and their replies). Used for lock callback timeout and AST + * re-send period. + * + * \param[in] lock lock which is getting the blocking callback + * + * \retval timeout in seconds to wait for the client reply + */ +timeout_t ldlm_bl_timeout(struct ldlm_lock *lock) +{ + timeout_t timeout; + + if (AT_OFF) + return obd_timeout / 2; + + /* + * Since these are non-updating timeouts, we should be conservative. + * Take more than usually, 150% + * It would be nice to have some kind of "early reply" mechanism for + * lock callbacks too... 
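+ *
+ * Worked example (figures for illustration only): if at_get() currently
+ * reports 20 seconds, the value returned below is
+ *
+ *	20 + (20 >> 1) = 30 seconds,
+ *
+ * floored at ldlm_enqueue_min; with adaptive timeouts disabled the
+ * function short-circuits to obd_timeout / 2 instead.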
+ */ + timeout = at_get(&lock->l_export->exp_bl_lock_at); + return max_t(timeout_t, timeout + (timeout >> 1), + (timeout_t)ldlm_enqueue_min); +} +EXPORT_SYMBOL(ldlm_bl_timeout); + +/** + * Perform lock cleanup if AST sending failed. + */ +static void ldlm_failed_ast(struct ldlm_lock *lock, int rc, + const char *ast_type) +{ + LCONSOLE_ERROR_MSG(0x138, + "%s: A client on nid %s was evicted due to a lock %s callback time out: rc %d\n", + lock->l_export->exp_obd->obd_name, + obd_export_nid2str(lock->l_export), ast_type, rc); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + spin_lock_bh(&waiting_locks_spinlock); + if (__ldlm_del_waiting_lock(lock) == 0) + /* + * the lock was not in any list, grab an extra ref before adding + * the lock to the expired list + */ + LDLM_LOCK_GET(lock); + /* differentiate it from expired locks */ + lock->l_callback_timestamp = 0; + list_add(&lock->l_pending_chain, &expired_lock_list); + wake_up(&expired_lock_wait_queue); + spin_unlock_bh(&waiting_locks_spinlock); +} + +/** + * Perform lock cleanup if AST reply came with error. + */ +static int ldlm_handle_ast_error(struct ldlm_lock *lock, + struct ptlrpc_request *req, int rc, + const char *ast_type) +{ + struct lnet_processid *peer = &req->rq_import->imp_connection->c_peer; + + if (!req->rq_replied || (rc && rc != -EINVAL)) { + if (ldlm_is_cancel(lock)) { + LDLM_DEBUG(lock, + "%s AST (req@%p x%llu) timeout from nid %s, but cancel was received (AST reply lost?)", + ast_type, req, req->rq_xid, + libcfs_nidstr(&peer->nid)); + ldlm_lock_cancel(lock); + rc = -ERESTART; + } else if (rc == -ENODEV || rc == -ESHUTDOWN || + (rc == -EIO && + req->rq_import->imp_state == LUSTRE_IMP_CLOSED)) { + /* + * Upon umount process the AST fails because cannot be + * sent. This shouldn't lead to the client eviction. + * -ENODEV error is returned by ptl_send_rpc() for + * new request in such import. + * -SHUTDOWN is returned by ptlrpc_import_delay_req() + * if imp_invalid is set or obd_no_recov. + * Meanwhile there is also check for LUSTRE_IMP_CLOSED + * in ptlrpc_import_delay_req() as well with -EIO code. + * In all such cases errors are ignored. + */ + LDLM_DEBUG(lock, + "%s AST can't be sent due to a server %s failure or umount process: rc = %d\n", + ast_type, + req->rq_import->imp_obd->obd_name, rc); + } else { + LDLM_ERROR(lock, + "client (nid %s) %s %s AST (req@%p x%llu status %d rc %d), evict it", + libcfs_nidstr(&peer->nid), + req->rq_replied ? "returned error from" : + "failed to reply to", + ast_type, req, req->rq_xid, + (req->rq_repmsg != NULL) ? + lustre_msg_get_status(req->rq_repmsg) : 0, + rc); + ldlm_failed_ast(lock, rc, ast_type); + } + return rc; + } + + if (rc == -EINVAL) { + struct ldlm_resource *res = lock->l_resource; + + LDLM_DEBUG(lock, + "client (nid %s) returned %d from %s AST (req@%p x%llu) - normal race", + libcfs_nidstr(&peer->nid), + req->rq_repmsg ? + lustre_msg_get_status(req->rq_repmsg) : -1, + ast_type, req, req->rq_xid); + if (res) { + /* + * update lvbo to return proper attributes. 
+ * see b=23174 + */ + ldlm_resource_getref(res); + ldlm_lvbo_update(res, lock, NULL, 1); + ldlm_resource_putref(res); + } + ldlm_lock_cancel(lock); + rc = -ERESTART; + } + + return rc; +} + +static int ldlm_cb_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct ldlm_cb_async_args *ca = args; + struct ldlm_lock *lock = ca->ca_lock; + struct ldlm_cb_set_arg *arg = ca->ca_set_arg; + + ENTRY; + + LASSERT(lock != NULL); + + switch (arg->type) { + case LDLM_GL_CALLBACK: + /* + * Update the LVB from disk if the AST failed + * (this is a legal race) + * + * - Glimpse callback of local lock just returns + * -ELDLM_NO_LOCK_DATA. + * - Glimpse callback of remote lock might return + * -ELDLM_NO_LOCK_DATA when inode is cleared. LU-274 + */ + if (unlikely(arg->gl_interpret_reply)) { + rc = arg->gl_interpret_reply(NULL, req, args, rc); + } else if (rc == -ELDLM_NO_LOCK_DATA) { + LDLM_DEBUG(lock, + "lost race - client has a lock but no inode"); + ldlm_lvbo_update(lock->l_resource, lock, NULL, 1); + } else if (rc != 0) { + rc = ldlm_handle_ast_error(lock, req, rc, "glimpse"); + } else { + rc = ldlm_lvbo_update(lock->l_resource, + lock, req, 1); + } + break; + case LDLM_BL_CALLBACK: + if (rc != 0) + rc = ldlm_handle_ast_error(lock, req, rc, "blocking"); + break; + case LDLM_CP_CALLBACK: + if (rc != 0) + rc = ldlm_handle_ast_error(lock, req, rc, "completion"); + break; + default: + LDLM_ERROR(lock, "invalid opcode for lock callback %d", + arg->type); + LBUG(); + } + + /* release extra reference taken in ldlm_ast_fini() */ + LDLM_LOCK_RELEASE(lock); + + if (rc == -ERESTART) + atomic_inc(&arg->restart); + + RETURN(0); +} + +static void ldlm_update_resend(struct ptlrpc_request *req, void *data) +{ + struct ldlm_cb_async_args *ca = data; + struct ldlm_lock *lock = ca->ca_lock; + + ldlm_refresh_waiting_lock(lock, ldlm_bl_timeout(lock)); +} + +static inline int ldlm_ast_fini(struct ptlrpc_request *req, + struct ldlm_cb_set_arg *arg, + struct ldlm_lock *lock, + int instant_cancel) +{ + int rc = 0; + + ENTRY; + + if (unlikely(instant_cancel)) { + rc = ptl_send_rpc(req, 1); + ptlrpc_req_finished(req); + if (rc == 0) + atomic_inc(&arg->restart); + } else { + LDLM_LOCK_GET(lock); + ptlrpc_set_add_req(arg->set, req); + } + + RETURN(rc); +} + +/** + * Check if there are requests in the export request list which prevent + * the lock canceling and make these requests high priority ones. + */ +static void ldlm_lock_reorder_req(struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + + ENTRY; + + if (lock->l_export == NULL) { + LDLM_DEBUG(lock, "client lock: no-op"); + RETURN_EXIT; + } + + spin_lock(&lock->l_export->exp_rpc_lock); + list_for_each_entry(req, &lock->l_export->exp_hp_rpcs, + rq_exp_list) { + /* + * Do not process requests that were not yet added to there + * incoming queue or were already removed from there for + * processing. We evaluate ptlrpc_nrs_req_can_move() without + * holding svcpt->scp_req_lock, and then redo the check with + * the lock held once we need to obtain a reliable result. + */ + if (ptlrpc_nrs_req_can_move(req) && + req->rq_ops->hpreq_lock_match && + req->rq_ops->hpreq_lock_match(req, lock)) + ptlrpc_nrs_req_hp_move(req); + } + spin_unlock(&lock->l_export->exp_rpc_lock); + EXIT; +} + +/** + * ->l_blocking_ast() method for server-side locks. This is invoked when newly + * enqueued server lock conflicts with given one. + * + * Sends blocking AST RPC to the client owning that lock; arms timeout timer + * to wait for client response. 
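+ *
+ * The prepared request is handed off through ldlm_ast_fini() above; in
+ * outline (simplified, the restart accounting is omitted):
+ *
+ *	if (instant_cancel) {
+ *		rc = ptl_send_rpc(req, 1);	(sent right away, no reply wait)
+ *		ptlrpc_req_finished(req);
+ *	} else {
+ *		LDLM_LOCK_GET(lock);		(released in ldlm_cb_interpret())
+ *		ptlrpc_set_add_req(arg->set, req);
+ *	}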
+ */ +int ldlm_server_blocking_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct ldlm_cb_async_args *ca; + struct ldlm_cb_set_arg *arg = data; + struct ldlm_request *body; + struct ptlrpc_request *req; + int instant_cancel = 0; + int rc = 0; + + ENTRY; + + if (flag == LDLM_CB_CANCELING) + /* Don't need to do anything here. */ + RETURN(0); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_SRV_BL_AST)) { + LDLM_DEBUG(lock, "dropping BL AST"); + RETURN(0); + } + + LASSERT(lock); + LASSERT(data != NULL); + if (lock->l_export->exp_obd->obd_recovering != 0) + LDLM_ERROR(lock, "BUG 6063: lock collide during recovery"); + + ldlm_lock_reorder_req(lock); + + req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, + &RQF_LDLM_BL_CALLBACK, + LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK); + if (req == NULL) + RETURN(-ENOMEM); + + ca = ptlrpc_req_async_args(ca, req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; + + req->rq_interpret_reply = ldlm_cb_interpret; + + lock_res_and_lock(lock); + if (ldlm_is_destroyed(lock)) { + /* What's the point? */ + unlock_res_and_lock(lock); + ptlrpc_req_finished(req); + RETURN(0); + } + + if (!ldlm_is_granted(lock)) { + /* + * this blocking AST will be communicated as part of the + * completion AST instead + */ + ldlm_add_blocked_lock(lock); + ldlm_set_waited(lock); + unlock_res_and_lock(lock); + + ptlrpc_req_finished(req); + LDLM_DEBUG(lock, "lock not granted, not sending blocking AST"); + RETURN(0); + } + + if (ldlm_is_cancel_on_block(lock)) + instant_cancel = 1; + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + body->lock_handle[1].cookie = lock->l_handle.h_cookie; + body->lock_desc = *desc; + body->lock_flags |= ldlm_flags_to_wire(lock->l_flags & LDLM_FL_AST_MASK); + + LDLM_DEBUG(lock, "server preparing blocking AST"); + + ptlrpc_request_set_replen(req); + ldlm_set_cbpending(lock); + if (instant_cancel) { + unlock_res_and_lock(lock); + ldlm_lock_cancel(lock); + + req->rq_no_resend = 1; + } else { + LASSERT(ldlm_is_granted(lock)); + ldlm_add_waiting_lock(lock, ldlm_bl_timeout(lock)); + unlock_res_and_lock(lock); + + /* Do not resend after lock callback timeout */ + req->rq_delay_limit = ldlm_bl_timeout(lock); + req->rq_resend_cb = ldlm_update_resend; + } + + req->rq_send_state = LUSTRE_IMP_FULL; + /* ptlrpc_request_alloc_pack already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); + + if (lock->l_export && lock->l_export->exp_nid_stats && + lock->l_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(lock->l_export->exp_nid_stats->nid_ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC); + + rc = ldlm_ast_fini(req, arg, lock, instant_cancel); + + RETURN(rc); +} + +/** + * ->l_completion_ast callback for a remote lock in server namespace. + * + * Sends AST to the client notifying it of lock granting. 
If initial + * lock response was not sent yet, instead of sending another RPC, just + * mark the lock as granted and client will understand + */ +int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + struct ldlm_cb_set_arg *arg = data; + struct ldlm_request *body; + struct ptlrpc_request *req; + struct ldlm_cb_async_args *ca; + int instant_cancel = 0; + int rc = 0; + int lvb_len; + + ENTRY; + + LASSERT(lock != NULL); + LASSERT(data != NULL); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_SRV_CP_AST)) { + LDLM_DEBUG(lock, "dropping CP AST"); + RETURN(0); + } + + req = ptlrpc_request_alloc(lock->l_export->exp_imp_reverse, + &RQF_LDLM_CP_CALLBACK); + if (req == NULL) + RETURN(-ENOMEM); + + /* server namespace, doesn't need lock */ + lvb_len = ldlm_lvbo_size(lock); + /* + * LU-3124 & LU-2187: to not return layout in completion AST because + * it may deadlock for LU-2187, or client may not have enough space + * for large layout. The layout will be returned to client with an + * extra RPC to fetch xattr.lov + */ + if (ldlm_has_layout(lock)) + lvb_len = 0; + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT, lvb_len); + rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + ca = ptlrpc_req_async_args(ca, req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; + + req->rq_interpret_reply = ldlm_cb_interpret; + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + + body->lock_handle[0] = lock->l_remote_handle; + body->lock_handle[1].cookie = lock->l_handle.h_cookie; + body->lock_flags = ldlm_flags_to_wire(flags); + ldlm_lock2desc(lock, &body->lock_desc); + if (lvb_len > 0) { + void *lvb = req_capsule_client_get(&req->rq_pill, &RMF_DLM_LVB); + lvb_len = ldlm_lvbo_fill(lock, lvb, &lvb_len); + if (lvb_len < 0) { + /* + * We still need to send the RPC to wake up the blocked + * enqueue thread on the client. + * + * Consider old client, there is no better way to notify + * the failure, just zero-sized the LVB, then the client + * will fail out as "-EPROTO". + */ + req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB, 0, + RCL_CLIENT); + instant_cancel = 1; + } else { + req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB, lvb_len, + RCL_CLIENT); + } + } + + LDLM_DEBUG(lock, "server preparing completion AST"); + + ptlrpc_request_set_replen(req); + + req->rq_send_state = LUSTRE_IMP_FULL; + /* ptlrpc_request_pack already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); + + /* We only send real blocking ASTs after the lock is granted */ + lock_res_and_lock(lock); + if (ldlm_is_ast_sent(lock)) { + body->lock_flags |= ldlm_flags_to_wire(LDLM_FL_AST_SENT); + /* Copy AST flags like LDLM_FL_DISCARD_DATA. */ + body->lock_flags |= ldlm_flags_to_wire(lock->l_flags & + LDLM_FL_AST_MASK); + + /* + * We might get here prior to ldlm_handle_enqueue setting + * LDLM_FL_CANCEL_ON_BLOCK flag. 
Then we will put this lock + * into waiting list, but this is safe and similar code in + * ldlm_handle_enqueue will call ldlm_lock_cancel() still, + * that would not only cancel the lock, but will also remove + * it from waiting list + */ + if (ldlm_is_cancel_on_block(lock)) { + unlock_res_and_lock(lock); + ldlm_lock_cancel(lock); + + instant_cancel = 1; + req->rq_no_resend = 1; + + lock_res_and_lock(lock); + } else { + /* start the lock-timeout clock */ + ldlm_add_waiting_lock(lock, ldlm_bl_timeout(lock)); + /* Do not resend after lock callback timeout */ + req->rq_delay_limit = ldlm_bl_timeout(lock); + req->rq_resend_cb = ldlm_update_resend; + } + } + unlock_res_and_lock(lock); + + if (lock->l_export && lock->l_export->exp_nid_stats && + lock->l_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(lock->l_export->exp_nid_stats->nid_ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC); + + rc = ldlm_ast_fini(req, arg, lock, instant_cancel); + + RETURN(lvb_len < 0 ? lvb_len : rc); +} + +/** + * Server side ->l_glimpse_ast handler for client locks. + * + * Sends glimpse AST to the client and waits for reply. Then updates + * lvbo with the result. + */ +int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) +{ + struct ldlm_cb_set_arg *arg = data; + struct ldlm_request *body; + struct ptlrpc_request *req; + struct ldlm_cb_async_args *ca; + int rc; + struct req_format *req_fmt; + + ENTRY; + + LASSERT(lock != NULL); + + if (arg->gl_desc != NULL) + /* There is a glimpse descriptor to pack */ + req_fmt = &RQF_LDLM_GL_CALLBACK_DESC; + else + req_fmt = &RQF_LDLM_GL_CALLBACK; + + req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, + req_fmt, LUSTRE_DLM_VERSION, + LDLM_GL_CALLBACK); + + if (req == NULL) + RETURN(-ENOMEM); + + if (arg->gl_desc != NULL) { + /* copy the GL descriptor */ + union ldlm_gl_desc *desc; + + desc = req_capsule_client_get(&req->rq_pill, &RMF_DLM_GL_DESC); + *desc = *arg->gl_desc; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + ldlm_lock2desc(lock, &body->lock_desc); + + ca = ptlrpc_req_async_args(ca, req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; + + /* server namespace, doesn't need lock */ + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + ldlm_lvbo_size(lock)); + ptlrpc_request_set_replen(req); + + req->rq_send_state = LUSTRE_IMP_FULL; + /* ptlrpc_request_alloc_pack already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); + + req->rq_interpret_reply = ldlm_cb_interpret; + + if (lock->l_export && lock->l_export->exp_nid_stats) { + struct nid_stat *nid_stats = lock->l_export->exp_nid_stats; + + lprocfs_counter_incr(nid_stats->nid_ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC); + } + + rc = ldlm_ast_fini(req, arg, lock, 0); + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_server_glimpse_ast); + +int ldlm_glimpse_locks(struct ldlm_resource *res, + struct list_head *gl_work_list) +{ + int rc; + + ENTRY; + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), gl_work_list, + LDLM_WORK_GL_AST); + if (rc == -ERESTART) + ldlm_reprocess_all(res, 0); + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_glimpse_locks); + +/* return LDLM lock associated with a lock callback request */ +struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req) +{ + struct ldlm_cb_async_args *ca; + struct ldlm_lock *lock; + + ENTRY; + + ca = ptlrpc_req_async_args(ca, req); + lock = ca->ca_lock; + if (lock == NULL) + RETURN(ERR_PTR(-EFAULT)); + + RETURN(lock); +} 
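+/*
+ * Illustration (the handler below is hypothetical, not a function in
+ * this file): a callback request handler can recover the LDLM lock a
+ * request refers to via ldlm_request_lock():
+ *
+ *	static int example_cb_handler(struct ptlrpc_request *req)
+ *	{
+ *		struct ldlm_lock *lock = ldlm_request_lock(req);
+ *
+ *		if (IS_ERR(lock))
+ *			return PTR_ERR(lock);
+ *		LDLM_DEBUG(lock, "lock attached to this callback request");
+ *		return 0;
+ *	}
+ */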
+EXPORT_SYMBOL(ldlm_request_lock); + +/** + * Main server-side entry point into LDLM for enqueue. This is called by ptlrpc + * service threads to carry out client lock enqueueing requests. + */ +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, + struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs) +{ + struct ldlm_reply *dlm_rep; + __u64 flags; + enum ldlm_error err = ELDLM_OK; + struct ldlm_lock *lock = NULL; + void *cookie = NULL; + int rc = 0; + struct ldlm_resource *res = NULL; + const struct lu_env *env = req->rq_svc_thread->t_env; + + ENTRY; + + LDLM_DEBUG_NOLOCK("server-side enqueue handler START"); + + ldlm_request_cancel(req, dlm_req, LDLM_ENQUEUE_CANCEL_OFF, LATF_SKIP); + flags = ldlm_flags_from_wire(dlm_req->lock_flags); + + LASSERT(req->rq_export); + + /* for intent enqueue the stat will be updated inside intent policy */ + if (ptlrpc_req2svc(req)->srv_stats != NULL && + !(dlm_req->lock_flags & LDLM_FL_HAS_INTENT)) + ldlm_svc_get_eopc(dlm_req, ptlrpc_req2svc(req)->srv_stats); + + if (req->rq_export && req->rq_export->exp_nid_stats && + req->rq_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC); + + if (unlikely(dlm_req->lock_desc.l_resource.lr_type < LDLM_MIN_TYPE || + dlm_req->lock_desc.l_resource.lr_type >= LDLM_MAX_TYPE)) { + DEBUG_REQ(D_ERROR, req, "invalid lock request type %d", + dlm_req->lock_desc.l_resource.lr_type); + GOTO(out, rc = -EFAULT); + } + + if (unlikely(dlm_req->lock_desc.l_req_mode <= LCK_MINMODE || + dlm_req->lock_desc.l_req_mode >= LCK_MAXMODE || + dlm_req->lock_desc.l_req_mode & + (dlm_req->lock_desc.l_req_mode-1))) { + DEBUG_REQ(D_ERROR, req, "invalid lock request mode %d", + dlm_req->lock_desc.l_req_mode); + GOTO(out, rc = -EFAULT); + } + + if (unlikely((flags & LDLM_FL_REPLAY) || + (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) { + /* Find an existing lock in the per-export lock hash */ + /* + * In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() + */ + /* coverity[overrun-buffer-val] */ + lock = cfs_hash_lookup(req->rq_export->exp_lock_hash, + (void *)&dlm_req->lock_handle[0]); + if (lock != NULL) { + DEBUG_REQ(D_DLMTRACE, req, + "found existing lock cookie %#llx", + lock->l_handle.h_cookie); + flags |= LDLM_FL_RESENT; + GOTO(existing_lock, rc = 0); + } + } else { + if (ldlm_reclaim_full()) { + DEBUG_REQ(D_DLMTRACE, req, + "Too many granted locks, reject current enqueue request and let the client retry later"); + GOTO(out, rc = -EINPROGRESS); + } + } + + /* The lock's callback data might be set in the policy function */ + lock = ldlm_lock_create(ns, &dlm_req->lock_desc.l_resource.lr_name, + dlm_req->lock_desc.l_resource.lr_type, + dlm_req->lock_desc.l_req_mode, + cbs, NULL, 0, LVB_T_NONE); + if (IS_ERR(lock)) { + rc = PTR_ERR(lock); + lock = NULL; + GOTO(out, rc); + } + + lock->l_remote_handle = dlm_req->lock_handle[0]; + LDLM_DEBUG(lock, "server-side enqueue handler, new lock created"); + + /* + * Initialize resource lvb but not for a lock being replayed since + * Client already got lvb sent in this case. + * This must occur early since some policy methods assume resource + * lvb is available (lr_lvb_data != NULL). 
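+ *
+ * In outline, LVB handling in this handler is: ldlm_lvbo_init() for the
+ * delayed init (skipped on replay), then
+ *
+ *	req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+ *			     ldlm_lvbo_size(lock));
+ *
+ * to reserve room in the reply before packing, and finally, once the
+ * lock has been enqueued,
+ *
+ *	rc2 = ldlm_lvbo_fill(lock, buf, &buflen);
+ *	req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB, rc2, RCL_SERVER);
+ *
+ * to trim the reply to the bytes actually filled, with
+ * req_capsule_server_grow() used to retry if the LVB outgrew the
+ * reserved buffer (-ERANGE).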
+ */ + res = lock->l_resource; + if (!(flags & LDLM_FL_REPLAY)) { + /* non-replayed lock, delayed lvb init may need to be done */ + rc = ldlm_lvbo_init(res); + if (rc < 0) { + LDLM_DEBUG(lock, "delayed lvb init failed (rc %d)", rc); + GOTO(out, rc); + } + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_BLOCKED, obd_timeout * 2); + /* + * Don't enqueue a lock onto the export if it is been disonnected + * due to eviction (b=3822) or server umount (b=24324). + * Cancel it now instead. + */ + if (req->rq_export->exp_disconnected) { + LDLM_ERROR(lock, "lock on disconnected export %p", + req->rq_export); + GOTO(out, rc = -ENOTCONN); + } + + lock->l_export = class_export_lock_get(req->rq_export, lock); + if (lock->l_export->exp_lock_hash) + cfs_hash_add(lock->l_export->exp_lock_hash, + &lock->l_remote_handle, + &lock->l_exp_hash); + + /* + * Inherit the enqueue flags before the operation, because we do not + * keep the res lock on return and next operations (BL AST) may proceed + * without them. + */ + lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags & + LDLM_FL_INHERIT_MASK); + + ldlm_convert_policy_to_local(req->rq_export, + dlm_req->lock_desc.l_resource.lr_type, + &dlm_req->lock_desc.l_policy_data, + &lock->l_policy_data); + if (dlm_req->lock_desc.l_resource.lr_type == LDLM_EXTENT) { + lock->l_req_extent = lock->l_policy_data.l_extent; + } else if (dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) { + lock->l_policy_data.l_inodebits.try_bits = + dlm_req->lock_desc.l_policy_data.l_inodebits.try_bits; + lock->l_policy_data.l_inodebits.li_gid = + dlm_req->lock_desc.l_policy_data.l_inodebits.li_gid; + } + +existing_lock: + cookie = req; + if (!(flags & LDLM_FL_HAS_INTENT)) { + /* based on the assumption that lvb size never changes during + * resource life time otherwise it need resource->lr_lock's + * protection */ + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, + RCL_SERVER, ldlm_lvbo_size(lock)); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR)) + GOTO(out, rc = -ENOMEM); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + } + + err = ldlm_lock_enqueue(env, ns, &lock, cookie, &flags); + if (err) { + if ((int)err < 0) + rc = (int)err; + GOTO(out, err); + } + + dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + + ldlm_lock2desc(lock, &dlm_rep->lock_desc); + ldlm_lock2handle(lock, &dlm_rep->lock_handle); + + if (lock && lock->l_resource->lr_type == LDLM_EXTENT) + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_BL_EVICT, 6); + + /* + * We never send a blocking AST until the lock is granted, but + * we can tell it right now + */ + lock_res_and_lock(lock); + + /* + * Now take into account flags to be inherited from original lock + * request both in reply to client and in our own lock flags. + */ + dlm_rep->lock_flags = ldlm_flags_to_wire(flags); + lock->l_flags |= flags & LDLM_FL_INHERIT_MASK; + + /* + * Don't move a pending lock onto the export if it has already been + * disconnected due to eviction (b=5683) or server umount (b=24324). + * Cancel it now instead. 
+ */ + if (unlikely(req->rq_export->exp_disconnected || + OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT))) { + LDLM_ERROR(lock, "lock on destroyed export %p", req->rq_export); + rc = -ENOTCONN; + } else if (ldlm_is_ast_sent(lock)) { + /* fill lock desc for possible lock convert */ + if (lock->l_blocking_lock && + lock->l_resource->lr_type == LDLM_IBITS) { + struct ldlm_lock *bl_lock = lock->l_blocking_lock; + struct ldlm_lock_desc *rep_desc = &dlm_rep->lock_desc; + + LDLM_DEBUG(lock, + "save blocking bits %llx in granted lock", + bl_lock->l_policy_data.l_inodebits.bits); + /* + * If lock is blocked then save blocking ibits + * in returned lock policy for the possible lock + * convert on a client. + */ + rep_desc->l_policy_data.l_inodebits.cancel_bits = + bl_lock->l_policy_data.l_inodebits.bits; + } + dlm_rep->lock_flags |= ldlm_flags_to_wire(LDLM_FL_AST_SENT); + if (ldlm_is_granted(lock)) { + /* + * Only cancel lock if it was granted, because it would + * be destroyed immediately and would never be granted + * in the future, causing timeouts on client. Not + * granted lock will be cancelled immediately after + * sending completion AST. + */ + if (ldlm_is_cancel_on_block(lock)) { + unlock_res_and_lock(lock); + ldlm_lock_cancel(lock); + lock_res_and_lock(lock); + } else { + ldlm_add_waiting_lock(lock, + ldlm_bl_timeout(lock)); + } + } + } + unlock_res_and_lock(lock); + + EXIT; +out: + req->rq_status = rc ?: err; /* return either error - b=11190 */ + if (!req->rq_packed_final) { + int rc1 = lustre_pack_reply(req, 1, NULL, NULL); + if (rc == 0) + rc = rc1; + } + + /* + * The LOCK_CHANGED code in ldlm_lock_enqueue depends on this + * ldlm_reprocess_all. If this moves, revisit that code. -phil + */ + if (lock != NULL) { + LDLM_DEBUG(lock, + "server-side enqueue handler, sending reply (err=%d, rc=%d)", + err, rc); + + if (rc == 0 && + req_capsule_has_field(&req->rq_pill, &RMF_DLM_LVB, + RCL_SERVER) && + ldlm_lvbo_size(lock) > 0) { + void *buf; + int buflen; + +retry: + buf = req_capsule_server_get(&req->rq_pill, + &RMF_DLM_LVB); + LASSERTF(buf != NULL, "req %p, lock %p\n", req, lock); + buflen = req_capsule_get_size(&req->rq_pill, + &RMF_DLM_LVB, RCL_SERVER); + /* + * non-replayed lock, delayed lvb init may + * need to be occur now + */ + if ((buflen > 0) && !(flags & LDLM_FL_REPLAY)) { + int rc2; + + rc2 = ldlm_lvbo_fill(lock, buf, &buflen); + if (rc2 >= 0) { + req_capsule_shrink(&req->rq_pill, + &RMF_DLM_LVB, + rc2, RCL_SERVER); + } else if (rc2 == -ERANGE) { + rc2 = req_capsule_server_grow( + &req->rq_pill, + &RMF_DLM_LVB, buflen); + if (!rc2) { + goto retry; + } else { + /* + * if we can't grow the buffer, + * it's ok to return empty lvb + * to client. 
+ */ + req_capsule_shrink( + &req->rq_pill, + &RMF_DLM_LVB, 0, + RCL_SERVER); + } + } else { + rc = rc2; + } + } else if (flags & LDLM_FL_REPLAY) { + /* no LVB resend upon replay */ + if (buflen > 0) + req_capsule_shrink(&req->rq_pill, + &RMF_DLM_LVB, + 0, RCL_SERVER); + else + rc = buflen; + } else { + rc = buflen; + } + } + + if (rc != 0 && !(flags & LDLM_FL_RESENT)) { + if (lock->l_export) { + ldlm_lock_cancel(lock); + } else { + lock_res_and_lock(lock); + ldlm_resource_unlink_lock(lock); + ldlm_lock_destroy_nolock(lock); + unlock_res_and_lock(lock); + } + ldlm_reprocess_all(lock->l_resource, lock->l_policy_data.l_inodebits.bits); + } + + if (!err && !ldlm_is_cbpending(lock) && + dlm_req->lock_desc.l_resource.lr_type != LDLM_FLOCK) + ldlm_reprocess_all(lock->l_resource, + lock->l_policy_data.l_inodebits.bits); + + LDLM_LOCK_RELEASE(lock); + } + + LDLM_DEBUG_NOLOCK("server-side enqueue handler END (lock %p, rc %d)", + lock, rc); + + return rc; +} + +/* + * Clear the blocking lock, the race is possible between ldlm_handle_convert0() + * and ldlm_work_bl_ast_lock(), so this is done under lock with check for NULL. + */ +void ldlm_clear_blocking_lock(struct ldlm_lock *lock) +{ + if (lock->l_blocking_lock) { + LDLM_LOCK_RELEASE(lock->l_blocking_lock); + lock->l_blocking_lock = NULL; + } +} + +/* A lock can be converted to new ibits or mode and should be considered + * as new lock. Clear all states related to a previous blocking AST + * processing so new conflicts will cause new blocking ASTs. + * + * This is used during lock convert below and lock downgrade to COS mode in + * ldlm_lock_mode_downgrade(). + */ +void ldlm_clear_blocking_data(struct ldlm_lock *lock) +{ + ldlm_clear_ast_sent(lock); + lock->l_bl_ast_run = 0; + ldlm_clear_blocking_lock(lock); +} + +/** + * Main LDLM entry point for server code to process lock conversion requests. + */ +int ldlm_handle_convert0(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req) +{ + struct obd_export *exp = req->rq_export; + struct ldlm_reply *dlm_rep; + struct ldlm_lock *lock; + __u64 bits; + __u64 new_bits; + int rc; + + ENTRY; + + if (exp && exp->exp_nid_stats && exp->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(exp->exp_nid_stats->nid_ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); + + dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + dlm_rep->lock_flags = dlm_req->lock_flags; + + lock = ldlm_handle2lock(&dlm_req->lock_handle[0]); + if (!lock) { + LDLM_DEBUG_NOLOCK("server lock is canceled already"); + req->rq_status = ELDLM_NO_LOCK_DATA; + RETURN(0); + } + + LDLM_DEBUG(lock, "server-side convert handler START"); + + lock_res_and_lock(lock); + bits = lock->l_policy_data.l_inodebits.bits; + new_bits = dlm_req->lock_desc.l_policy_data.l_inodebits.bits; + + if (ldlm_is_cancel(lock)) { + LDLM_DEBUG(lock, "convert on canceled lock!"); + unlock_res_and_lock(lock); + GOTO(out_put, rc = ELDLM_NO_LOCK_DATA); + } + + if (dlm_req->lock_desc.l_req_mode != lock->l_granted_mode) { + LDLM_ERROR(lock, "lock mode differs!"); + unlock_res_and_lock(lock); + GOTO(out_put, rc = -EPROTO); + } + + if (bits == new_bits) { + /* + * This can be valid situation if CONVERT RPCs are + * re-ordered. 
Just finish silently + */ + LDLM_DEBUG(lock, "lock is converted already!"); + unlock_res_and_lock(lock); + } else { + if (ldlm_is_waited(lock)) + ldlm_del_waiting_lock(lock); + + ldlm_clear_cbpending(lock); + lock->l_policy_data.l_inodebits.cancel_bits = 0; + ldlm_inodebits_drop(lock, bits & ~new_bits); + + ldlm_clear_blocking_data(lock); + unlock_res_and_lock(lock); + + /* All old bits should be reprocessed to send new BL AST if + * it wasn't sent earlier due to LDLM_FL_AST_SENT bit set. + * */ + ldlm_reprocess_all(lock->l_resource, bits); + } + + dlm_rep->lock_handle = lock->l_remote_handle; + ldlm_ibits_policy_local_to_wire(&lock->l_policy_data, + &dlm_rep->lock_desc.l_policy_data); + rc = ELDLM_OK; + EXIT; +out_put: + LDLM_DEBUG(lock, "server-side convert handler END, rc = %d", rc); + LDLM_LOCK_PUT(lock); + req->rq_status = rc; + return 0; +} + +/** + * Cancel all the locks whose handles are packed into ldlm_request + * + * Called by server code expecting such combined cancel activity + * requests. + */ +int ldlm_request_cancel(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + int first, enum lustre_at_flags flags) +{ + struct ldlm_resource *res, *pres = NULL; + struct ldlm_lock *lock; + int i, count, done = 0; + unsigned int size; + + ENTRY; + + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT); + if (size <= offsetof(struct ldlm_request, lock_handle) || + (size - offsetof(struct ldlm_request, lock_handle)) / + sizeof(struct lustre_handle) < dlm_req->lock_count) + RETURN(0); + + count = dlm_req->lock_count ? dlm_req->lock_count : 1; + if (first >= count) + RETURN(0); + + if (count == 1 && dlm_req->lock_handle[0].cookie == 0) + RETURN(0); + + /* + * There is no lock on the server at the replay time, + * skip lock cancelling to make replay tests to pass. + */ + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + RETURN(0); + + LDLM_DEBUG_NOLOCK("server-side cancel handler START: %d locks, starting at %d", + count, first); + + for (i = first; i < count; i++) { + lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); + if (!lock) { + /* below message checked in replay-single.sh test_36 */ + LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock (cookie %llu)", + dlm_req->lock_handle[i].cookie); + continue; + } + + res = lock->l_resource; + done++; + + /* + * This code is an optimization to only attempt lock + * granting on the resource (that could be CPU-expensive) + * after we are done cancelling lock in that resource. + */ + if (res != pres) { + if (pres != NULL) { + ldlm_reprocess_all(pres, 0); + LDLM_RESOURCE_DELREF(pres); + ldlm_resource_putref(pres); + } + if (res != NULL) { + ldlm_resource_getref(res); + LDLM_RESOURCE_ADDREF(res); + + if (!ldlm_is_discard_data(lock)) + ldlm_lvbo_update(res, lock, + NULL, 1); + } + pres = res; + } + + if ((flags & LATF_STATS) && ldlm_is_ast_sent(lock) && + lock->l_blast_sent != 0) { + timeout_t delay = 0; + + if (ktime_get_seconds() > lock->l_blast_sent) + delay = ktime_get_seconds() - + lock->l_blast_sent; + LDLM_DEBUG(lock, + "server cancels blocked lock after %ds", + delay); + at_measured(&lock->l_export->exp_bl_lock_at, delay); + } + ldlm_lock_cancel(lock); + LDLM_LOCK_PUT(lock); + } + if (pres != NULL) { + ldlm_reprocess_all(pres, 0); + LDLM_RESOURCE_DELREF(pres); + ldlm_resource_putref(pres); + } + LDLM_DEBUG_NOLOCK("server-side cancel handler END"); + RETURN(done); +} +EXPORT_SYMBOL(ldlm_request_cancel); + +/** + * Main LDLM entry point for server code to cancel locks. 
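+ *
+ * The heavy lifting is done by ldlm_request_cancel() above, which
+ * batches reprocessing per resource rather than per lock; in outline
+ * (simplified pseudo-code, refcounting omitted):
+ *
+ *	for each handle packed in the request {
+ *		lock = ldlm_handle2lock(handle);
+ *		if (lock->l_resource != pres) {
+ *			if (pres)
+ *				ldlm_reprocess_all(pres, 0);
+ *			pres = lock->l_resource;
+ *		}
+ *		ldlm_lock_cancel(lock);
+ *	}
+ *	if (pres)
+ *		ldlm_reprocess_all(pres, 0);
+ *
+ * so waiting locks are re-granted once per resource, not once per
+ * cancelled lock.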
+ * + * Typically gets called from service handler on LDLM_CANCEL opc. + */ +int ldlm_handle_cancel(struct ptlrpc_request *req) +{ + struct ldlm_request *dlm_req; + int rc; + + ENTRY; + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + CDEBUG(D_INFO, "bad request buffer for cancel\n"); + RETURN(-EFAULT); + } + + if (req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) < + offsetof(struct ldlm_request, lock_handle[1])) + RETURN(-EPROTO); + + if (req->rq_export && req->rq_export->exp_nid_stats && + req->rq_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); + + if (!ldlm_request_cancel(req, dlm_req, 0, LATF_STATS)) + req->rq_status = LUSTRE_ESTALE; + + RETURN(ptlrpc_reply(req)); +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Server may pass additional information about blocking lock. + * For IBITS locks it is conflicting bits which can be used for + * lock convert instead of cancel. + */ +void ldlm_bl_desc2lock(const struct ldlm_lock_desc *ld, struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + check_res_locked(lock->l_resource); + if (ns_is_client(ns) && ld && + (lock->l_resource->lr_type == LDLM_IBITS)) { + /* + * Lock description contains policy of blocking lock, + * and its cancel_bits is used to pass conflicting bits. + * NOTE: ld can be NULL or can be not NULL but zeroed if + * passed from ldlm_bl_thread_blwi(), check below used bits + * in ld to make sure it is valid description. + * + * If server may replace lock resource keeping the same cookie, + * never use cancel bits from different resource, full cancel + * is to be used. + */ + if (ld->l_policy_data.l_inodebits.cancel_bits && + ldlm_res_eq(&ld->l_resource.lr_name, + &lock->l_resource->lr_name) && + !(ldlm_is_cbpending(lock) && + lock->l_policy_data.l_inodebits.cancel_bits == 0)) { + /* always combine conflicting ibits */ + lock->l_policy_data.l_inodebits.cancel_bits |= + ld->l_policy_data.l_inodebits.cancel_bits; + } else { + /* If cancel_bits are not obtained or + * if the lock is already CBPENDING and + * has no cancel_bits set + * - the full lock is to be cancelled + */ + lock->l_policy_data.l_inodebits.cancel_bits = 0; + } + } +} + +/** + * Callback handler for receiving incoming blocking ASTs. + * + * This can only happen on client side. 
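Purely illustrative (not patch content, and the helper name is invented): ldlm_bl_desc2lock() above keeps and combines partial cancel_bits only when the description actually carries bits, the resource name matches, and the lock is not already CBPENDING with empty cancel_bits; otherwise it falls back to a full cancel. Restated as a small standalone function:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Illustration only: mirrors the rule in ldlm_bl_desc2lock() with plain
     * values instead of ldlm_lock/ldlm_lock_desc structures.  Returns the
     * cancel_bits the lock should end up with (0 means full cancel).
     */
    static uint64_t pick_cancel_bits(uint64_t desc_cancel_bits, bool same_resource,
                                     bool cbpending, uint64_t cur_cancel_bits)
    {
        if (desc_cancel_bits && same_resource &&
            !(cbpending && cur_cancel_bits == 0))
            return cur_cancel_bits | desc_cancel_bits;  /* combine conflicts */
        return 0;                                       /* full cancel */
    }

    int main(void)
    {
        printf("%#llx\n", (unsigned long long)
               pick_cancel_bits(0x8, true, false, 0x1));  /* -> 0x9            */
        printf("%#llx\n", (unsigned long long)
               pick_cancel_bits(0x8, false, false, 0x1)); /* -> 0: new resource */
        return 0;
    }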
+ */ +void ldlm_handle_bl_callback(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, struct ldlm_lock *lock) +{ + int do_ast; + + ENTRY; + + LDLM_DEBUG(lock, "client blocking AST callback handler"); + + lock_res_and_lock(lock); + + /* get extra information from desc if any */ + ldlm_bl_desc2lock(ld, lock); + ldlm_set_cbpending(lock); + + do_ast = (!lock->l_readers && !lock->l_writers); + unlock_res_and_lock(lock); + + if (do_ast) { + CDEBUG(D_DLMTRACE, + "Lock %p already unused, calling callback (%p)\n", + lock, lock->l_blocking_ast); + if (lock->l_blocking_ast != NULL) + lock->l_blocking_ast(lock, ld, lock->l_ast_data, + LDLM_CB_BLOCKING); + } else { + CDEBUG(D_DLMTRACE, + "Lock %p is referenced, will be cancelled later\n", + lock); + } + + LDLM_DEBUG(lock, "client blocking callback handler END"); + LDLM_LOCK_RELEASE(lock); + EXIT; +} + +static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) +{ + if (req->rq_no_reply) + return 0; + + req->rq_status = rc; + if (!req->rq_packed_final) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + return rc; + } + return ptlrpc_reply(req); +} + +/** + * Callback handler for receiving incoming completion ASTs. + * + * This only can happen on client side. + */ +static int ldlm_handle_cp_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) +{ + LIST_HEAD(ast_list); + int lvb_len; + int rc = 0; + + ENTRY; + + LDLM_DEBUG(lock, "client completion callback handler START"); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) { + long to = cfs_time_seconds(1); + + ldlm_callback_reply(req, 0); + + while (to > 0) { + to = schedule_timeout_interruptible(to); + if (ldlm_is_granted(lock) || + ldlm_is_destroyed(lock)) + break; + } + } + + lvb_len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT); + if (lvb_len < 0) { + LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", lvb_len); + GOTO(out, rc = lvb_len); + } else if (lvb_len > 0) { + if (lock->l_lvb_len > 0) { + /* for extent lock, lvb contains ost_lvb{}. */ + LASSERT(lock->l_lvb_data != NULL); + + if (unlikely(lock->l_lvb_len < lvb_len)) { + LDLM_ERROR(lock, + "Replied LVB is larger than expectation, expected = %d, replied = %d", + lock->l_lvb_len, lvb_len); + GOTO(out, rc = -EINVAL); + } + } + } + + lock_res_and_lock(lock); + + if (!ldlm_res_eq(&dlm_req->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name)) { + ldlm_resource_unlink_lock(lock); + unlock_res_and_lock(lock); + rc = ldlm_lock_change_resource(ns, lock, + &dlm_req->lock_desc.l_resource.lr_name); + if (rc < 0) { + LDLM_ERROR(lock, "Failed to allocate resource"); + GOTO(out, rc); + } + LDLM_DEBUG(lock, "completion AST, new resource"); + lock_res_and_lock(lock); + } + + if (ldlm_is_failed(lock)) { + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(-EINVAL); + } + + if (ldlm_is_destroyed(lock) || + ldlm_is_granted(lock)) { + /* b=11300: the lock has already been granted */ + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "Double grant race happened"); + GOTO(out, rc = 0); + } + + /* + * If we receive the completion AST before the actual enqueue returned, + * then we might need to switch lock modes, resources, or extents. 
+ */ + if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) { + lock->l_req_mode = dlm_req->lock_desc.l_granted_mode; + LDLM_DEBUG(lock, "completion AST, new lock mode"); + } + + if (lock->l_resource->lr_type != LDLM_PLAIN) { + ldlm_convert_policy_to_local(req->rq_export, + dlm_req->lock_desc.l_resource.lr_type, + &dlm_req->lock_desc.l_policy_data, + &lock->l_policy_data); + LDLM_DEBUG(lock, "completion AST, new policy data"); + } + + ldlm_resource_unlink_lock(lock); + + if (dlm_req->lock_flags & LDLM_FL_AST_SENT) { + /* + * BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. + */ + ldlm_lock_remove_from_lru(lock); + ldlm_bl_desc2lock(&dlm_req->lock_desc, lock); + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; + LDLM_DEBUG(lock, "completion AST includes blocking AST"); + } + + if (lock->l_lvb_len > 0) { + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_CLIENT, + lock->l_lvb_data, lvb_len); + if (rc < 0) { + unlock_res_and_lock(lock); + GOTO(out, rc); + } + } + + ldlm_grant_lock(lock, &ast_list); + unlock_res_and_lock(lock); + + LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work"); + + /* + * Let Enqueue to call osc_lock_upcall() and initialize + * l_ast_data + */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 2); + + ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST); + + LDLM_DEBUG_NOLOCK("client completion callback handler END (lock %p)", + lock); + GOTO(out, rc); + +out: + if (rc < 0) { + lock_res_and_lock(lock); + ldlm_set_failed(lock); + unlock_res_and_lock(lock); + wake_up(&lock->l_waitq); + } + LDLM_LOCK_RELEASE(lock); + + return 0; +} + +/** + * Callback handler for receiving incoming glimpse ASTs. + * + * This only can happen on client side. After handling the glimpse AST + * we also consider dropping the lock here if it is unused locally for a + * long time. 
+ */ +static void ldlm_handle_gl_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) +{ + struct ldlm_lock_desc *ld = &dlm_req->lock_desc; + int rc = -ENOSYS; + + ENTRY; + + LDLM_DEBUG(lock, "client glimpse AST callback handler"); + + if (lock->l_glimpse_ast != NULL) + rc = lock->l_glimpse_ast(lock, req); + + if (req->rq_repmsg != NULL) { + ptlrpc_reply(req); + } else { + req->rq_status = rc; + ptlrpc_error(req); + } + + lock_res_and_lock(lock); + if (lock->l_granted_mode == LCK_PW && + !lock->l_readers && !lock->l_writers && + ktime_after(ktime_get(), + ktime_add(lock->l_last_used, ns->ns_dirty_age_limit))) { + unlock_res_and_lock(lock); + + /* For MDS glimpse it is always DOM lock, set corresponding + * cancel_bits to perform lock convert if needed + */ + if (lock->l_resource->lr_type == LDLM_IBITS) + ld->l_policy_data.l_inodebits.cancel_bits = + MDS_INODELOCK_DOM; + if (ldlm_bl_to_thread_lock(ns, ld, lock)) + ldlm_handle_bl_callback(ns, ld, lock); + + EXIT; + return; + } + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + EXIT; +} + +static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi, + enum ldlm_cancel_flags cancel_flags) +{ + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; + + ENTRY; + + spin_lock(&blp->blp_lock); + if (blwi->blwi_lock && + ldlm_is_discard_data(blwi->blwi_lock)) { + /* add LDLM_FL_DISCARD_DATA requests to the priority list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_prio_list); + } else { + /* other blocking callbacks are added to the regular list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_list); + } + spin_unlock(&blp->blp_lock); + + wake_up(&blp->blp_waitq); + + /* + * can not check blwi->blwi_flags as blwi could be already freed in + * LCF_ASYNC mode + */ + if (!(cancel_flags & LCF_ASYNC)) + wait_for_completion(&blwi->blwi_comp); + + RETURN(0); +} + +static inline void init_blwi(struct ldlm_bl_work_item *blwi, + struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags) +{ + init_completion(&blwi->blwi_comp); + INIT_LIST_HEAD(&blwi->blwi_head); + + if (current->flags & PF_MEMALLOC) + blwi->blwi_mem_pressure = 1; + + blwi->blwi_ns = ns; + blwi->blwi_flags = cancel_flags; + if (ld != NULL) + blwi->blwi_ld = *ld; + if (count) { + list_splice_init(cancels, &blwi->blwi_head); + blwi->blwi_count = count; + } else { + blwi->blwi_lock = lock; + } +} + +/** + * Queues a list of locks \a cancels containing \a count locks + * for later processing by a blocking thread. If \a count is zero, + * then the lock referenced as \a lock is queued instead. + * + * The blocking thread would then call ->l_blocking_ast callback in the lock. + * If list addition fails an error is returned and caller is supposed to + * call ->l_blocking_ast itself. 
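A side illustration rather than patch code: __ldlm_bl_to_thread() above queues a work item (priority list for LDLM_FL_DISCARD_DATA locks) and, unless LCF_ASYNC is set, waits on its completion, while ldlm_bl_to_thread() below heap-allocates the item for async callers (the worker frees it) and keeps it on the caller's stack for sync callers to avoid allocation under memory pressure. A rough userspace pthread analogue of that ownership split, with all names invented and a single worker standing in for the shared queue:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct work_item {
        bool async;              /* async: worker owns and frees the item */
        pthread_mutex_t lock;
        pthread_cond_t done_cv;
        bool done;               /* completion, used only in sync mode    */
    };

    static void *worker(void *arg)
    {
        struct work_item *wi = arg;

        printf("worker: handling blocking callback\n");

        if (wi->async) {
            free(wi);            /* LCF_ASYNC analogue: worker frees it   */
        } else {
            pthread_mutex_lock(&wi->lock);
            wi->done = true;     /* complete(&blwi->blwi_comp) analogue   */
            pthread_cond_signal(&wi->done_cv);
            pthread_mutex_unlock(&wi->lock);
        }
        return NULL;
    }

    static void queue_work(bool async)
    {
        pthread_t th;
        struct work_item stack_wi;
        struct work_item *wi;

        if (async) {
            wi = malloc(sizeof(*wi));   /* OBD_ALLOC() analogue           */
            if (!wi)
                return;
        } else {
            wi = &stack_wi;             /* sync path avoids allocation    */
        }
        wi->async = async;
        wi->done = false;
        pthread_mutex_init(&wi->lock, NULL);
        pthread_cond_init(&wi->done_cv, NULL);

        pthread_create(&th, NULL, worker, wi);
        pthread_detach(th);

        if (!async) {                   /* wait_for_completion() analogue */
            pthread_mutex_lock(&wi->lock);
            while (!wi->done)
                pthread_cond_wait(&wi->done_cv, &wi->lock);
            pthread_mutex_unlock(&wi->lock);
        }
        /* in async mode wi may already be freed here - do not touch it */
    }

    int main(void)
    {
        queue_work(false);   /* synchronous: blocks until handled */
        queue_work(true);    /* asynchronous: fire and forget     */
        pthread_exit(NULL);  /* let the detached worker finish    */
    }

The point of the split is ownership: the async producer must never touch the item after queueing it, and the sync worker must never free it.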
+ */ +static int ldlm_bl_to_thread(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct ldlm_lock *lock, + struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags) +{ + ENTRY; + + if (cancels && count == 0) + RETURN(0); + + if (cancel_flags & LCF_ASYNC) { + struct ldlm_bl_work_item *blwi; + + OBD_ALLOC(blwi, sizeof(*blwi)); + if (blwi == NULL) + RETURN(-ENOMEM); + init_blwi(blwi, ns, ld, cancels, count, lock, cancel_flags); + + RETURN(__ldlm_bl_to_thread(blwi, cancel_flags)); + } else { + /* + * if it is synchronous call do minimum mem alloc, as it could + * be triggered from kernel shrinker + */ + struct ldlm_bl_work_item blwi; + + memset(&blwi, 0, sizeof(blwi)); + init_blwi(&blwi, ns, ld, cancels, count, lock, cancel_flags); + RETURN(__ldlm_bl_to_thread(&blwi, cancel_flags)); + } +} + + +int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct ldlm_lock *lock) +{ + return ldlm_bl_to_thread(ns, ld, lock, NULL, 0, LCF_ASYNC); +} + +int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags) +{ + return ldlm_bl_to_thread(ns, ld, NULL, cancels, count, cancel_flags); +} + +int ldlm_bl_to_thread_ns(struct ldlm_namespace *ns) +{ + return ldlm_bl_to_thread(ns, NULL, NULL, NULL, 0, LCF_ASYNC); +} + +int ldlm_bl_thread_wakeup(void) +{ + wake_up(&ldlm_state->ldlm_bl_pool->blp_waitq); + return 0; +} + +/* Setinfo coming from Server (eg MDT) to Client (eg MDC)! */ +static int ldlm_handle_setinfo(struct ptlrpc_request *req) +{ + struct obd_device *obd = req->rq_export->exp_obd; + char *key; + void *val; + int keylen, vallen; + int rc = -ENOSYS; + + ENTRY; + + DEBUG_REQ(D_HSM, req, "%s: handle setinfo", obd->obd_name); + + req_capsule_set(&req->rq_pill, &RQF_OBD_SET_INFO); + + key = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + if (key == NULL) { + DEBUG_REQ(D_IOCTL, req, "no set_info key"); + RETURN(-EFAULT); + } + keylen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT); + val = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + if (val == NULL) { + DEBUG_REQ(D_IOCTL, req, "no set_info val"); + RETURN(-EFAULT); + } + vallen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT); + + /* We are responsible for swabbing contents of val */ + + if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) + /* Pass it on to mdc (the "export" in this case) */ + rc = obd_set_info_async(req->rq_svc_thread->t_env, + req->rq_export, + sizeof(KEY_HSM_COPYTOOL_SEND), + KEY_HSM_COPYTOOL_SEND, + vallen, val, NULL); + else + DEBUG_REQ(D_WARNING, req, "ignoring unknown key '%s'", key); + + return rc; +} + +static inline void ldlm_callback_errmsg(struct ptlrpc_request *req, + const char *msg, int rc, + const struct lustre_handle *handle) +{ + DEBUG_REQ((req->rq_no_reply || rc) ? D_WARNING : D_DLMTRACE, req, + "%s, NID=%s lock=%#llx: rc = %d", + msg, libcfs_id2str(req->rq_peer), + handle ? handle->cookie : 0, rc); + if (req->rq_no_reply) + CWARN("No reply was sent, maybe cause b=21636.\n"); + else if (rc) + CWARN("Send reply failed, maybe cause b=21636.\n"); +} + +/* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */ +static int ldlm_callback_handler(struct ptlrpc_request *req) +{ + struct ldlm_namespace *ns; + struct ldlm_request *dlm_req; + struct ldlm_lock *lock; + int rc; + + ENTRY; + + /* + * Requests arrive in sender's byte order. 
The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. + */ + + /* do nothing for sec context finalize */ + if (lustre_msg_get_opc(req->rq_reqmsg) == SEC_CTX_FINI) + RETURN(0); + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) { + rc = ldlm_callback_reply(req, -ENOTCONN); + ldlm_callback_errmsg(req, "Operate on unconnected server", + rc, NULL); + RETURN(0); + } + + LASSERT(req->rq_export != NULL); + LASSERT(req->rq_export->exp_obd != NULL); + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case LDLM_BL_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) { + if (cfs_fail_err) + ldlm_callback_reply(req, -(int)cfs_fail_err); + RETURN(0); + } + break; + case LDLM_CP_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET)) + RETURN(0); + break; + case LDLM_GL_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GL_CALLBACK_NET)) + RETURN(0); + break; + case LDLM_SET_INFO: + rc = ldlm_handle_setinfo(req); + ldlm_callback_reply(req, rc); + RETURN(0); + default: + CERROR("unknown opcode %u\n", + lustre_msg_get_opc(req->rq_reqmsg)); + ldlm_callback_reply(req, -EPROTO); + RETURN(0); + } + + ns = req->rq_export->exp_obd->obd_namespace; + LASSERT(ns != NULL); + + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + rc = ldlm_callback_reply(req, -EPROTO); + ldlm_callback_errmsg(req, "Operate without parameter", rc, + NULL); + RETURN(0); + } + + /* + * Force a known safe race, send a cancel to the server for a lock + * which the server has already started a blocking callback on. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + rc = ldlm_cli_cancel(&dlm_req->lock_handle[0], 0); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } + + lock = ldlm_handle2lock_long(&dlm_req->lock_handle[0], 0); + if (!lock) { + CDEBUG(D_DLMTRACE, + "callback on lock %#llx - lock disappeared\n", + dlm_req->lock_handle[0].cookie); + rc = ldlm_callback_reply(req, -EINVAL); + ldlm_callback_errmsg(req, "Operate with invalid parameter", rc, + &dlm_req->lock_handle[0]); + RETURN(0); + } + + if (ldlm_is_fail_loc(lock) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + /* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. */ + lock_res_and_lock(lock); + lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags & + LDLM_FL_AST_MASK); + if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + /* + * If somebody cancels lock and cache is already dropped, + * or lock is failed before cp_ast received on client, + * we can tell the server we have no lock. Otherwise, we + * should send cancel after dropping the cache. + */ + if ((ldlm_is_canceling(lock) && ldlm_is_bl_done(lock)) || + ldlm_is_failed(lock)) { + LDLM_DEBUG(lock, + "callback on lock %llx - lock disappeared", + dlm_req->lock_handle[0].cookie); + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + rc = ldlm_callback_reply(req, -EINVAL); + ldlm_callback_errmsg(req, "Operate on stale lock", rc, + &dlm_req->lock_handle[0]); + RETURN(0); + } + /* + * BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. 
+ */ + ldlm_lock_remove_from_lru(lock); + ldlm_set_bl_ast(lock); + } + if (lock->l_remote_handle.cookie == 0) + lock->l_remote_handle = dlm_req->lock_handle[1]; + unlock_res_and_lock(lock); + + /* + * We want the ost thread to get this reply so that it can respond + * to ost requests (write cache writeback) that might be triggered + * in the callback. + * + * But we'd also like to be able to indicate in the reply that we're + * cancelling right now, because it's unused, or have an intent result + * in the reply, so we might have to push the responsibility for sending + * the reply down into the AST handlers, alas. + */ + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case LDLM_BL_CALLBACK: + CDEBUG(D_INODE, "blocking ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK); + if (!ldlm_is_cancel_on_block(lock)) { + rc = ldlm_callback_reply(req, 0); + if (req->rq_no_reply || rc) + ldlm_callback_errmsg(req, "Normal process", rc, + &dlm_req->lock_handle[0]); + } + if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) + ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); + break; + case LDLM_CP_CALLBACK: + CDEBUG(D_INODE, "completion ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); + rc = ldlm_handle_cp_callback(req, ns, dlm_req, lock); + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) + ldlm_callback_reply(req, rc); + break; + case LDLM_GL_CALLBACK: + CDEBUG(D_INODE, "glimpse ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); + ldlm_handle_gl_callback(req, ns, dlm_req, lock); + break; + default: + LBUG(); /* checked above */ + } + + RETURN(0); +} + +#ifdef HAVE_SERVER_SUPPORT +/** + * Main handler for canceld thread. + * + * Separated into its own thread to avoid deadlocks. + */ +static int ldlm_cancel_handler(struct ptlrpc_request *req) +{ + int rc; + + ENTRY; + + /* + * Requests arrive in sender's byte order. The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. 
+ */ + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) { + struct ldlm_request *dlm_req; + + CERROR("%s from %s arrived at %llu with bad export cookie %llu\n", + ll_opcode2str(lustre_msg_get_opc(req->rq_reqmsg)), + libcfs_nid2str(req->rq_peer.nid), + (unsigned long long)req->rq_arrival_time.tv_sec, + lustre_msg_get_handle(req->rq_reqmsg)->cookie); + + if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_CANCEL) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + dlm_req = req_capsule_client_get(&req->rq_pill, + &RMF_DLM_REQ); + if (dlm_req != NULL) + ldlm_lock_dump_handle(D_ERROR, + &dlm_req->lock_handle[0]); + } + ldlm_callback_reply(req, -ENOTCONN); + RETURN(0); + } + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + /* XXX FIXME move this back to mds/handler.c, b=249 */ + case LDLM_CANCEL: + req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); + CDEBUG(D_INODE, "cancel\n"); + if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_NET) || + CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND) || + CFS_FAIL_CHECK(OBD_FAIL_LDLM_BL_EVICT)) + RETURN(0); + rc = ldlm_handle_cancel(req); + break; + case LDLM_CONVERT: + { + struct ldlm_request *dlm_req; + + req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT); + CDEBUG(D_INODE, "convert\n"); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + CDEBUG(D_INFO, "bad request buffer for cancel\n"); + rc = ldlm_callback_reply(req, -EPROTO); + } else { + req->rq_status = ldlm_handle_convert0(req, dlm_req); + rc = ptlrpc_reply(req); + } + break; + } + default: + CERROR("invalid opcode %d\n", + lustre_msg_get_opc(req->rq_reqmsg)); + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + rc = ldlm_callback_reply(req, -EINVAL); + } + + RETURN(rc); +} + +static int ldlm_cancel_hpreq_lock_match(struct ptlrpc_request *req, + struct ldlm_lock *lock) +{ + struct ldlm_request *dlm_req; + struct lustre_handle lockh; + int rc = 0; + int i; + + ENTRY; + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) + RETURN(0); + + ldlm_lock2handle(lock, &lockh); + for (i = 0; i < dlm_req->lock_count; i++) { + if (lustre_handle_equal(&dlm_req->lock_handle[i], + &lockh)) { + DEBUG_REQ(D_RPCTRACE, req, + "Prio raised by lock %#llx", lockh.cookie); + rc = 1; + break; + } + } + + RETURN(rc); +} + +static int ldlm_cancel_hpreq_check(struct ptlrpc_request *req) +{ + struct ldlm_request *dlm_req; + int rc = 0; + int i; + unsigned int size; + + ENTRY; + + /* no prolong in recovery */ + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + RETURN(0); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) + RETURN(-EFAULT); + + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT); + if (size <= offsetof(struct ldlm_request, lock_handle) || + (size - offsetof(struct ldlm_request, lock_handle)) / + sizeof(struct lustre_handle) < dlm_req->lock_count) + RETURN(-EPROTO); + + for (i = 0; i < dlm_req->lock_count; i++) { + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); + if (lock == NULL) + continue; + + rc = ldlm_is_ast_sent(lock) ? 
1 : 0; + if (rc) + LDLM_DEBUG(lock, "hpreq cancel/convert lock"); + LDLM_LOCK_PUT(lock); + + if (rc) + break; + } + + RETURN(rc); +} + +static struct ptlrpc_hpreq_ops ldlm_cancel_hpreq_ops = { + .hpreq_lock_match = ldlm_cancel_hpreq_lock_match, + .hpreq_check = ldlm_cancel_hpreq_check, + .hpreq_fini = NULL, +}; + +static int ldlm_hpreq_handler(struct ptlrpc_request *req) +{ + ENTRY; + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) + RETURN(0); + + if (LDLM_CANCEL == lustre_msg_get_opc(req->rq_reqmsg)) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); + req->rq_ops = &ldlm_cancel_hpreq_ops; + } else if (LDLM_CONVERT == lustre_msg_get_opc(req->rq_reqmsg)) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT); + req->rq_ops = &ldlm_cancel_hpreq_ops; + } + RETURN(0); +} + +static int ldlm_revoke_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) + +{ + struct list_head *rpc_list = data; + struct ldlm_lock *lock = cfs_hash_object(hs, hnode); + + lock_res_and_lock(lock); + + if (!ldlm_is_granted(lock)) { + unlock_res_and_lock(lock); + return 0; + } + + LASSERT(lock->l_resource); + if (lock->l_resource->lr_type != LDLM_IBITS && + lock->l_resource->lr_type != LDLM_PLAIN) { + unlock_res_and_lock(lock); + return 0; + } + + if (ldlm_is_ast_sent(lock)) { + unlock_res_and_lock(lock); + return 0; + } + + LASSERT(lock->l_blocking_ast); + LASSERT(!lock->l_blocking_lock); + + ldlm_set_ast_sent(lock); + if (lock->l_export && lock->l_export->exp_lock_hash) { + /* + * NB: it's safe to call cfs_hash_del() even lock isn't + * in exp_lock_hash. + */ + /* + * In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() + */ + /* coverity[overrun-buffer-val] */ + cfs_hash_del(lock->l_export->exp_lock_hash, + &lock->l_remote_handle, &lock->l_exp_hash); + } + + list_add_tail(&lock->l_rk_ast, rpc_list); + LDLM_LOCK_GET(lock); + + unlock_res_and_lock(lock); + return 0; +} + +void ldlm_revoke_export_locks(struct obd_export *exp) +{ + int rc; + LIST_HEAD(rpc_list); + ENTRY; + + cfs_hash_for_each_nolock(exp->exp_lock_hash, + ldlm_revoke_lock_cb, &rpc_list, 0); + rc = ldlm_run_ast_work(exp->exp_obd->obd_namespace, &rpc_list, + LDLM_WORK_REVOKE_AST); + + if (rc == -ERESTART) + ldlm_reprocess_recovery_done(exp->exp_obd->obd_namespace); + + EXIT; +} +EXPORT_SYMBOL(ldlm_revoke_export_locks); +#endif /* HAVE_SERVER_SUPPORT */ + +static int ldlm_bl_get_work(struct ldlm_bl_pool *blp, + struct ldlm_bl_work_item **p_blwi, + struct obd_export **p_exp) +{ + struct ldlm_bl_work_item *blwi = NULL; + static unsigned int num_bl; + static unsigned int num_stale; + int num_th = atomic_read(&blp->blp_num_threads); + + *p_exp = obd_stale_export_get(); + + spin_lock(&blp->blp_lock); + if (*p_exp != NULL) { + if (num_th == 1 || ++num_stale < num_th) { + spin_unlock(&blp->blp_lock); + return 1; + } + num_stale = 0; + } + + /* process a request from the blp_list at least every blp_num_threads */ + if (!list_empty(&blp->blp_list) && + (list_empty(&blp->blp_prio_list) || num_bl == 0)) + blwi = list_entry(blp->blp_list.next, + struct ldlm_bl_work_item, blwi_entry); + else + if (!list_empty(&blp->blp_prio_list)) + blwi = list_entry(blp->blp_prio_list.next, + struct ldlm_bl_work_item, + blwi_entry); + + if (blwi) { + if (++num_bl >= num_th) + num_bl = 0; + list_del(&blwi->blwi_entry); + } + spin_unlock(&blp->blp_lock); + *p_blwi = blwi; + + if (*p_exp != NULL && *p_blwi != NULL) { + obd_stale_export_put(*p_exp); + *p_exp = NULL; + } + + return (*p_blwi 
!= NULL || *p_exp != NULL) ? 1 : 0; +} + +/* This only contains temporary data until the thread starts */ +struct ldlm_bl_thread_data { + struct ldlm_bl_pool *bltd_blp; + struct completion bltd_comp; + int bltd_num; +}; + +static int ldlm_bl_thread_main(void *arg); + +static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp, bool check_busy) +{ + struct ldlm_bl_thread_data bltd = { .bltd_blp = blp }; + struct task_struct *task; + + init_completion(&bltd.bltd_comp); + + bltd.bltd_num = atomic_inc_return(&blp->blp_num_threads); + if (bltd.bltd_num >= blp->blp_max_threads) { + atomic_dec(&blp->blp_num_threads); + return 0; + } + + LASSERTF(bltd.bltd_num > 0, "thread num:%d\n", bltd.bltd_num); + if (check_busy && + atomic_read(&blp->blp_busy_threads) < (bltd.bltd_num - 1)) { + atomic_dec(&blp->blp_num_threads); + return 0; + } + + task = kthread_run(ldlm_bl_thread_main, &bltd, "ldlm_bl_%02d", + bltd.bltd_num); + if (IS_ERR(task)) { + CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n", + bltd.bltd_num, PTR_ERR(task)); + atomic_dec(&blp->blp_num_threads); + return PTR_ERR(task); + } + wait_for_completion(&bltd.bltd_comp); + + return 0; +} + +/* Not fatal if racy and have a few too many threads */ +static int ldlm_bl_thread_need_create(struct ldlm_bl_pool *blp, + struct ldlm_bl_work_item *blwi) +{ + if (atomic_read(&blp->blp_num_threads) >= blp->blp_max_threads) + return 0; + + if (atomic_read(&blp->blp_busy_threads) < + atomic_read(&blp->blp_num_threads)) + return 0; + + if (blwi != NULL && (blwi->blwi_ns == NULL || + blwi->blwi_mem_pressure)) + return 0; + + return 1; +} + +static int ldlm_bl_thread_blwi(struct ldlm_bl_pool *blp, + struct ldlm_bl_work_item *blwi) +{ + /* '1' for consistency with code that checks !mpflag to restore */ + unsigned int mpflags = 1; + + ENTRY; + + if (blwi->blwi_ns == NULL) + /* added by ldlm_cleanup() */ + RETURN(LDLM_ITER_STOP); + + if (blwi->blwi_mem_pressure) + mpflags = memalloc_noreclaim_save(); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL2, 4); + + if (blwi->blwi_count) { + int count; + /* + * The special case when we cancel locks in lru + * asynchronously, we pass the list of locks here. + * Thus locks are marked LDLM_FL_CANCELING, but NOT + * canceled locally yet. + */ + count = ldlm_cli_cancel_list_local(&blwi->blwi_head, + blwi->blwi_count, + LCF_BL_AST); + ldlm_cli_cancel_list(&blwi->blwi_head, count, NULL, + blwi->blwi_flags); + } else if (blwi->blwi_lock) { + ldlm_handle_bl_callback(blwi->blwi_ns, &blwi->blwi_ld, + blwi->blwi_lock); + } else { + ldlm_pool_recalc(&blwi->blwi_ns->ns_pool, true); + spin_lock(&blwi->blwi_ns->ns_lock); + blwi->blwi_ns->ns_rpc_recalc = 0; + spin_unlock(&blwi->blwi_ns->ns_lock); + ldlm_namespace_put(blwi->blwi_ns); + } + + if (blwi->blwi_mem_pressure) + memalloc_noreclaim_restore(mpflags); + + if (blwi->blwi_flags & LCF_ASYNC) + OBD_FREE(blwi, sizeof(*blwi)); + else + complete(&blwi->blwi_comp); + + RETURN(0); +} + +/** + * Cancel stale locks on export. Cancel blocked locks first. + * If the given export has blocked locks, the next in the list may have + * them too, thus cancel not blocked locks only if the current export has + * no blocked locks. + **/ +static int ldlm_bl_thread_exports(struct ldlm_bl_pool *blp, + struct obd_export *exp) +{ + int num; + + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_BL_EVICT, 4); + + num = ldlm_export_cancel_blocked_locks(exp); + if (num == 0) + ldlm_export_cancel_locks(exp); + + obd_stale_export_put(exp); + + RETURN(0); +} + + +/** + * Main blocking requests processing thread. 
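An aside on ldlm_bl_get_work() above (illustration only, not part of the patch): the priority list of LDLM_FL_DISCARD_DATA items normally wins, but the regular list is guaranteed a turn once every blp_num_threads picks via the wrapping num_bl counter, so ordinary callbacks cannot starve. A tiny simulation of that selection rule, assuming both lists stay non-empty:

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustration only: returns which list would be served next. */
    static const char *pick(bool reg_empty, bool prio_empty,
                            unsigned int *num_bl, unsigned int num_th)
    {
        const char *who;

        if (!reg_empty && (prio_empty || *num_bl == 0))
            who = "regular";
        else if (!prio_empty)
            who = "priority";
        else
            return "none";

        if (++(*num_bl) >= num_th)  /* wrap: regular gets one pick per num_th */
            *num_bl = 0;
        return who;
    }

    int main(void)
    {
        unsigned int num_bl = 0, num_th = 4;

        for (int i = 0; i < 8; i++)
            printf("pick %d: %s\n", i, pick(false, false, &num_bl, num_th));
        /* prints: regular, priority, priority, priority, regular, ... */
        return 0;
    }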
+ * + * Callers put locks into its queue by calling ldlm_bl_to_thread. + * This thread in the end ends up doing actual call to ->l_blocking_ast + * for queued locks. + */ +static int ldlm_bl_thread_main(void *arg) +{ + struct lu_env *env; + struct ldlm_bl_pool *blp; + struct ldlm_bl_thread_data *bltd = arg; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(env); + if (!env) + RETURN(-ENOMEM); + rc = lu_env_init(env, LCT_DT_THREAD); + if (rc) + GOTO(out_env, rc); + rc = lu_env_add(env); + if (rc) + GOTO(out_env_fini, rc); + + blp = bltd->bltd_blp; + + complete(&bltd->bltd_comp); + /* cannot use bltd after this, it is only on caller's stack */ + + while (1) { + struct ldlm_bl_work_item *blwi = NULL; + struct obd_export *exp = NULL; + int rc; + + rc = ldlm_bl_get_work(blp, &blwi, &exp); + + if (rc == 0) + wait_event_idle_exclusive(blp->blp_waitq, + ldlm_bl_get_work(blp, &blwi, + &exp)); + atomic_inc(&blp->blp_busy_threads); + + if (ldlm_bl_thread_need_create(blp, blwi)) + /* discard the return value, we tried */ + ldlm_bl_thread_start(blp, true); + + if (exp) + rc = ldlm_bl_thread_exports(blp, exp); + else if (blwi) + rc = ldlm_bl_thread_blwi(blp, blwi); + + atomic_dec(&blp->blp_busy_threads); + + if (rc == LDLM_ITER_STOP) + break; + + /* + * If there are many namespaces, we will not sleep waiting for + * work, and must do a cond_resched to avoid holding the CPU + * for too long + */ + cond_resched(); + } + + atomic_dec(&blp->blp_num_threads); + complete(&blp->blp_comp); + + lu_env_remove(env); +out_env_fini: + lu_env_fini(env); +out_env: + OBD_FREE_PTR(env); + RETURN(rc); +} + + +static int ldlm_setup(void); +static int ldlm_cleanup(void); + +int ldlm_get_ref(void) +{ + int rc = 0; + + ENTRY; + mutex_lock(&ldlm_ref_mutex); + if (++ldlm_refcount == 1) { + rc = ldlm_setup(); + if (rc) + ldlm_refcount--; + } + mutex_unlock(&ldlm_ref_mutex); + + RETURN(rc); +} + +void ldlm_put_ref(void) +{ + ENTRY; + mutex_lock(&ldlm_ref_mutex); + if (ldlm_refcount == 1) { + int rc = ldlm_cleanup(); + + if (rc) + CERROR("ldlm_cleanup failed: %d\n", rc); + else + ldlm_refcount--; + } else { + ldlm_refcount--; + } + mutex_unlock(&ldlm_ref_mutex); + + EXIT; +} + +/* + * Export handle<->lock hash operations. 
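As a standalone illustration of the ldlm_get_ref()/ldlm_put_ref() idiom just above (not patch code, names invented): a mutex-guarded reference count runs setup for the first user and cleanup for the last one, and leaves the count untouched when setup fails or cleanup refuses, so a later caller can retry. A userspace sketch:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t ref_mutex = PTHREAD_MUTEX_INITIALIZER;
    static int refcount;

    static int setup(void)   { printf("setup\n");   return 0; }
    static int cleanup(void) { printf("cleanup\n"); return 0; }

    static int get_ref(void)
    {
        int rc = 0;

        pthread_mutex_lock(&ref_mutex);
        if (++refcount == 1) {          /* first user brings the service up */
            rc = setup();
            if (rc)
                refcount--;             /* setup failed: drop the reference */
        }
        pthread_mutex_unlock(&ref_mutex);
        return rc;
    }

    static void put_ref(void)
    {
        pthread_mutex_lock(&ref_mutex);
        if (refcount == 1) {            /* last user tears the service down */
            if (cleanup())
                fprintf(stderr, "cleanup failed, keeping reference\n");
            else
                refcount--;
        } else {
            refcount--;
        }
        pthread_mutex_unlock(&ref_mutex);
    }

    int main(void)
    {
        get_ref();   /* setup runs   */
        get_ref();   /* no-op        */
        put_ref();   /* no-op        */
        put_ref();   /* cleanup runs */
        return 0;
    }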
+ */ +static unsigned +ldlm_export_lock_hash(struct cfs_hash *hs, const void *key, unsigned int mask) +{ + return cfs_hash_u64_hash(((struct lustre_handle *)key)->cookie, mask); +} + +static void * +ldlm_export_lock_key(struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + return &lock->l_remote_handle; +} + +static void +ldlm_export_lock_keycpy(struct hlist_node *hnode, void *key) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + lock->l_remote_handle = *(struct lustre_handle *)key; +} + +static int +ldlm_export_lock_keycmp(const void *key, struct hlist_node *hnode) +{ + return lustre_handle_equal(ldlm_export_lock_key(hnode), key); +} + +static void * +ldlm_export_lock_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_lock, l_exp_hash); +} + +static void +ldlm_export_lock_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + LDLM_LOCK_GET(lock); +} + +static void +ldlm_export_lock_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + LDLM_LOCK_RELEASE(lock); +} + +static struct cfs_hash_ops ldlm_export_lock_ops = { + .hs_hash = ldlm_export_lock_hash, + .hs_key = ldlm_export_lock_key, + .hs_keycmp = ldlm_export_lock_keycmp, + .hs_keycpy = ldlm_export_lock_keycpy, + .hs_object = ldlm_export_lock_object, + .hs_get = ldlm_export_lock_get, + .hs_put = ldlm_export_lock_put, + .hs_put_locked = ldlm_export_lock_put, +}; + +int ldlm_init_export(struct obd_export *exp) +{ + int rc; + + ENTRY; + + exp->exp_lock_hash = + cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid), + HASH_EXP_LOCK_CUR_BITS, + HASH_EXP_LOCK_MAX_BITS, + HASH_EXP_LOCK_BKT_BITS, 0, + CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &ldlm_export_lock_ops, + CFS_HASH_DEFAULT | CFS_HASH_REHASH_KEY | + CFS_HASH_NBLK_CHANGE); + + if (!exp->exp_lock_hash) + RETURN(-ENOMEM); + + rc = ldlm_init_flock_export(exp); + if (rc) + GOTO(err, rc); + + RETURN(0); +err: + ldlm_destroy_export(exp); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_init_export); + +void ldlm_destroy_export(struct obd_export *exp) +{ + ENTRY; + cfs_hash_putref(exp->exp_lock_hash); + exp->exp_lock_hash = NULL; + + ldlm_destroy_flock_export(exp); + EXIT; +} +EXPORT_SYMBOL(ldlm_destroy_export); + +static ssize_t cancel_unused_locks_before_replay_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", ldlm_cancel_unused_locks_before_replay); +} + +static ssize_t cancel_unused_locks_before_replay_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + int rc; + unsigned long val; + + rc = kstrtoul(buffer, 10, &val); + if (rc) + return rc; + + ldlm_cancel_unused_locks_before_replay = val; + + return count; +} +LUSTRE_RW_ATTR(cancel_unused_locks_before_replay); + +static struct attribute *ldlm_attrs[] = { + &lustre_attr_cancel_unused_locks_before_replay.attr, + NULL, +}; + +static struct attribute_group ldlm_attr_group = { + .attrs = ldlm_attrs, +}; + +static int ldlm_setup(void) +{ + static struct ptlrpc_service_conf conf; + struct ldlm_bl_pool *blp = NULL; +#ifdef HAVE_SERVER_SUPPORT + struct task_struct *task; +#endif /* HAVE_SERVER_SUPPORT */ + int i; + int rc = 0; + + ENTRY; + + if (ldlm_state != NULL) + RETURN(-EALREADY); + + OBD_ALLOC(ldlm_state, sizeof(*ldlm_state)); + if (ldlm_state == NULL) + 
RETURN(-ENOMEM); + + ldlm_kobj = kobject_create_and_add("ldlm", &lustre_kset->kobj); + if (!ldlm_kobj) + GOTO(out, -ENOMEM); + + rc = sysfs_create_group(ldlm_kobj, &ldlm_attr_group); + if (rc) + GOTO(out, rc); + + ldlm_ns_kset = kset_create_and_add("namespaces", NULL, ldlm_kobj); + if (!ldlm_ns_kset) + GOTO(out, -ENOMEM); + + ldlm_svc_kset = kset_create_and_add("services", NULL, ldlm_kobj); + if (!ldlm_svc_kset) + GOTO(out, -ENOMEM); + + rc = ldlm_debugfs_setup(); + if (rc != 0) + GOTO(out, rc); + + memset(&conf, 0, sizeof(conf)); + conf = (typeof(conf)) { + .psc_name = "ldlm_cbd", + .psc_watchdog_factor = 2, + .psc_buf = { + .bc_nbufs = LDLM_CLIENT_NBUFS, + .bc_buf_size = LDLM_BUFSIZE, + .bc_req_max_size = LDLM_MAXREQSIZE, + .bc_rep_max_size = LDLM_MAXREPSIZE, + .bc_req_portal = LDLM_CB_REQUEST_PORTAL, + .bc_rep_portal = LDLM_CB_REPLY_PORTAL, + }, + .psc_thr = { + .tc_thr_name = "ldlm_cb", + .tc_thr_factor = LDLM_THR_FACTOR, + .tc_nthrs_init = LDLM_NTHRS_INIT, + .tc_nthrs_base = LDLM_NTHRS_BASE, + .tc_nthrs_max = LDLM_NTHRS_MAX, + .tc_nthrs_user = ldlm_num_threads, + .tc_cpu_bind = ldlm_cpu_bind, + .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD, + }, + .psc_cpt = { + .cc_pattern = ldlm_cpts, + .cc_affinity = true, + }, + .psc_ops = { + .so_req_handler = ldlm_callback_handler, + }, + }; + ldlm_state->ldlm_cb_service = \ + ptlrpc_register_service(&conf, ldlm_svc_kset, + ldlm_svc_debugfs_dir); + if (IS_ERR(ldlm_state->ldlm_cb_service)) { + CERROR("failed to start service\n"); + rc = PTR_ERR(ldlm_state->ldlm_cb_service); + ldlm_state->ldlm_cb_service = NULL; + GOTO(out, rc); + } + +#ifdef HAVE_SERVER_SUPPORT + memset(&conf, 0, sizeof(conf)); + conf = (typeof(conf)) { + .psc_name = "ldlm_canceld", + .psc_watchdog_factor = 6, + .psc_buf = { + .bc_nbufs = LDLM_SERVER_NBUFS, + .bc_buf_size = LDLM_BUFSIZE, + .bc_req_max_size = LDLM_MAXREQSIZE, + .bc_rep_max_size = LDLM_MAXREPSIZE, + .bc_req_portal = LDLM_CANCEL_REQUEST_PORTAL, + .bc_rep_portal = LDLM_CANCEL_REPLY_PORTAL, + + }, + .psc_thr = { + .tc_thr_name = "ldlm_cn", + .tc_thr_factor = LDLM_THR_FACTOR, + .tc_nthrs_init = LDLM_NTHRS_INIT, + .tc_nthrs_base = LDLM_NTHRS_BASE, + .tc_nthrs_max = LDLM_NTHRS_MAX, + .tc_nthrs_user = ldlm_num_threads, + .tc_cpu_bind = ldlm_cpu_bind, + .tc_ctx_tags = LCT_MD_THREAD | \ + LCT_DT_THREAD | \ + LCT_CL_THREAD, + }, + .psc_cpt = { + .cc_pattern = ldlm_cpts, + .cc_affinity = true, + }, + .psc_ops = { + .so_req_handler = ldlm_cancel_handler, + .so_hpreq_handler = ldlm_hpreq_handler, + }, + }; + ldlm_state->ldlm_cancel_service = \ + ptlrpc_register_service(&conf, ldlm_svc_kset, + ldlm_svc_debugfs_dir); + if (IS_ERR(ldlm_state->ldlm_cancel_service)) { + CERROR("failed to start service\n"); + rc = PTR_ERR(ldlm_state->ldlm_cancel_service); + ldlm_state->ldlm_cancel_service = NULL; + GOTO(out, rc); + } +#endif /* HAVE_SERVER_SUPPORT */ + + OBD_ALLOC(blp, sizeof(*blp)); + if (blp == NULL) + GOTO(out, rc = -ENOMEM); + ldlm_state->ldlm_bl_pool = blp; + + spin_lock_init(&blp->blp_lock); + INIT_LIST_HEAD(&blp->blp_list); + INIT_LIST_HEAD(&blp->blp_prio_list); + init_waitqueue_head(&blp->blp_waitq); + atomic_set(&blp->blp_num_threads, 0); + atomic_set(&blp->blp_busy_threads, 0); + + if (ldlm_num_threads == 0) { + blp->blp_min_threads = LDLM_NTHRS_INIT; + blp->blp_max_threads = LDLM_NTHRS_MAX; + } else { + blp->blp_min_threads = blp->blp_max_threads = \ + min_t(int, LDLM_NTHRS_MAX, max_t(int, LDLM_NTHRS_INIT, + ldlm_num_threads)); + } + + for (i = 0; i < blp->blp_min_threads; i++) { + rc = ldlm_bl_thread_start(blp, false); + 
if (rc < 0) + GOTO(out, rc); + } + +#ifdef HAVE_SERVER_SUPPORT + task = kthread_run(expired_lock_main, NULL, "ldlm_elt"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Cannot start ldlm expired-lock thread: %d\n", rc); + GOTO(out, rc); + } + + wait_event(expired_lock_wait_queue, + expired_lock_thread_state == ELT_READY); +#endif /* HAVE_SERVER_SUPPORT */ + + rc = ldlm_pools_init(); + if (rc) { + CERROR("Failed to initialize LDLM pools: %d\n", rc); + GOTO(out, rc); + } + + rc = ldlm_reclaim_setup(); + if (rc) { + CERROR("Failed to setup reclaim thread: rc = %d\n", rc); + GOTO(out, rc); + } + RETURN(0); + + out: + ldlm_cleanup(); + RETURN(rc); +} + +static int ldlm_cleanup(void) +{ + ENTRY; + + if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) || + !list_empty(ldlm_namespace_list(LDLM_NAMESPACE_CLIENT))) { + CERROR("ldlm still has namespaces; clean these up first.\n"); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); + RETURN(-EBUSY); + } + + ldlm_reclaim_cleanup(); + ldlm_pools_fini(); + + if (ldlm_state->ldlm_bl_pool != NULL) { + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; + + while (atomic_read(&blp->blp_num_threads) > 0) { + struct ldlm_bl_work_item blwi = { .blwi_ns = NULL }; + + init_completion(&blp->blp_comp); + + spin_lock(&blp->blp_lock); + list_add_tail(&blwi.blwi_entry, &blp->blp_list); + wake_up(&blp->blp_waitq); + spin_unlock(&blp->blp_lock); + + wait_for_completion(&blp->blp_comp); + } + + OBD_FREE(blp, sizeof(*blp)); + } + + if (ldlm_state->ldlm_cb_service != NULL) + ptlrpc_unregister_service(ldlm_state->ldlm_cb_service); +#ifdef HAVE_SERVER_SUPPORT + if (ldlm_state->ldlm_cancel_service != NULL) + ptlrpc_unregister_service(ldlm_state->ldlm_cancel_service); +#endif + + if (ldlm_ns_kset) + kset_unregister(ldlm_ns_kset); + if (ldlm_svc_kset) + kset_unregister(ldlm_svc_kset); + if (ldlm_kobj) { + sysfs_remove_group(ldlm_kobj, &ldlm_attr_group); + kobject_put(ldlm_kobj); + } + + ldlm_debugfs_cleanup(); + +#ifdef HAVE_SERVER_SUPPORT + if (expired_lock_thread_state != ELT_STOPPED) { + expired_lock_thread_state = ELT_TERMINATE; + wake_up(&expired_lock_wait_queue); + wait_event(expired_lock_wait_queue, + expired_lock_thread_state == ELT_STOPPED); + } +#endif + + OBD_FREE(ldlm_state, sizeof(*ldlm_state)); + ldlm_state = NULL; + + RETURN(0); +} + +int ldlm_init(void) +{ + ldlm_resource_slab = kmem_cache_create("ldlm_resources", + sizeof(struct ldlm_resource), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_resource_slab == NULL) + return -ENOMEM; + + ldlm_lock_slab = kmem_cache_create("ldlm_locks", + sizeof(struct ldlm_lock), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_lock_slab == NULL) + goto out_resource; + + ldlm_interval_slab = kmem_cache_create("interval_node", + sizeof(struct ldlm_interval), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_interval_slab == NULL) + goto out_lock; + + ldlm_interval_tree_slab = kmem_cache_create("interval_tree", + sizeof(struct ldlm_interval_tree) * LCK_MODE_NUM, + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_interval_tree_slab == NULL) + goto out_interval; + +#ifdef HAVE_SERVER_SUPPORT + ldlm_inodebits_slab = kmem_cache_create("ldlm_ibits_node", + sizeof(struct ldlm_ibits_node), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_inodebits_slab == NULL) + goto out_interval_tree; + + ldlm_glimpse_work_kmem = kmem_cache_create("ldlm_glimpse_work_kmem", + sizeof(struct ldlm_glimpse_work), + 0, 0, NULL); + if (ldlm_glimpse_work_kmem == NULL) + goto out_inodebits; +#endif + 
+#if LUSTRE_TRACKS_LOCK_EXP_REFS + class_export_dump_hook = ldlm_dump_export_locks; +#endif + return 0; +#ifdef HAVE_SERVER_SUPPORT +out_inodebits: + kmem_cache_destroy(ldlm_inodebits_slab); +out_interval_tree: + kmem_cache_destroy(ldlm_interval_tree_slab); +#endif +out_interval: + kmem_cache_destroy(ldlm_interval_slab); +out_lock: + kmem_cache_destroy(ldlm_lock_slab); +out_resource: + kmem_cache_destroy(ldlm_resource_slab); + + return -ENOMEM; +} + +void ldlm_exit(void) +{ + if (ldlm_refcount) + CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount); + synchronize_rcu(); + kmem_cache_destroy(ldlm_resource_slab); + /* + * ldlm_lock_put() use RCU to call ldlm_lock_free, so need call + * rcu_barrier() to wait all outstanding RCU callbacks to complete, + * so that ldlm_lock_free() get a chance to be called. + */ + rcu_barrier(); + kmem_cache_destroy(ldlm_lock_slab); + kmem_cache_destroy(ldlm_interval_slab); + kmem_cache_destroy(ldlm_interval_tree_slab); +#ifdef HAVE_SERVER_SUPPORT + kmem_cache_destroy(ldlm_inodebits_slab); + kmem_cache_destroy(ldlm_glimpse_work_kmem); +#endif +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c new file mode 100644 index 0000000000000..38a94b159000a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c @@ -0,0 +1,180 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/ldlm_plain.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of PLAIN lock type. + * + * PLAIN locks are the simplest form of LDLM locking, and are used when + * there only needs to be a single lock on a resource. This avoids some + * of the complexity of EXTENT and IBITS lock types, but doesn't allow + * different "parts" of a resource to be locked concurrently. Example + * use cases for PLAIN locks include locking of MGS configuration logs + * and (as of Lustre 2.4) quota records. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include + +#include "ldlm_internal.h" + +#ifdef HAVE_SERVER_SUPPORT +/** + * Determine if the lock is compatible with all locks on the queue. + * + * If \a work_list is provided, conflicting locks are linked there. + * If \a work_list is not provided, we exit this function on first conflict. 
+ * + * \retval 0 if there are conflicting locks in the \a queue + * \retval 1 if the lock is compatible to all locks in \a queue + */ +static inline int +ldlm_plain_compat_queue(struct list_head *queue, struct ldlm_lock *req, + struct list_head *work_list) +{ + enum ldlm_mode req_mode = req->l_req_mode; + struct ldlm_lock *lock, *next_lock; + int compat = 1; + + ENTRY; + lockmode_verify(req_mode); + + list_for_each_entry_safe(lock, next_lock, queue, l_res_link) { + + /* + * We stop walking the queue if we hit ourselves so we don't + * take conflicting locks enqueued after us into account, + * or we'd wait forever. + */ + if (req == lock) + RETURN(compat); + + /* Advance loop cursor to last lock of mode group. */ + next_lock = list_entry(list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, + l_sl_mode)->l_res_link.next, + struct ldlm_lock, l_res_link); + + if (lockmode_compat(lock->l_req_mode, req_mode)) + continue; + + if (!work_list) + RETURN(0); + + compat = 0; + + /* + * Add locks of the mode group to \a work_list as + * blocking locks for \a req. + */ + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, work_list); + + { + struct list_head *head; + + head = &lock->l_sl_mode; + list_for_each_entry(lock, head, l_sl_mode) + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, + work_list); + } + } + + RETURN(compat); +} + +/** + * Process a granting attempt for plain lock. + * Must be called with ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + */ +int ldlm_process_plain_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? + NULL : work_list; + int rc; + + ENTRY; + LASSERT(!ldlm_is_granted(lock)); + check_res_locked(res); + *err = ELDLM_OK; + + if (intention == LDLM_PROCESS_RESCAN) { + LASSERT(work_list != NULL); + rc = ldlm_plain_compat_queue(&res->lr_granted, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + rc = ldlm_plain_compat_queue(&res->lr_waiting, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); + RETURN(LDLM_ITER_CONTINUE); + } + + rc = ldlm_plain_compat_queue(&res->lr_granted, lock, work_list); + rc += ldlm_plain_compat_queue(&res->lr_waiting, lock, work_list); + + if (rc == 2) { + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); + } + + RETURN(LDLM_ITER_CONTINUE); +} +#endif /* HAVE_SERVER_SUPPORT */ + +void ldlm_plain_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + /* No policy for plain locks */ +} + +void ldlm_plain_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + /* No policy for plain locks */ +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c new file mode 100644 index 0000000000000..d23240c7f19c9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c @@ -0,0 +1,1568 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/ldlm_pool.c + * + * Author: Yury Umanets + */ + +/* + * Idea of this code is rather simple. Each second, for each server namespace + * we have SLV - server lock volume which is calculated on current number of + * granted locks, grant speed for past period, etc - that is, locking load. + * This SLV number may be thought as a flow definition for simplicity. It is + * sent to clients with each occasion to let them know what is current load + * situation on the server. By default, at the beginning, SLV on server is + * set max value which is calculated as the following: allow to one client + * have all locks of limit ->pl_limit for 10h. + * + * Next, on clients, number of cached locks is not limited artificially in any + * way as it was before. Instead, client calculates CLV, that is, client lock + * volume for each lock and compares it with last SLV from the server. CLV is + * calculated as the number of locks in LRU * lock live time in seconds. If + * CLV > SLV - lock is canceled. + * + * Client has LVF, that is, lock volume factor which regulates how much + * sensitive client should be about last SLV from server. The higher LVF is the + * more locks will be canceled on client. Default value for it is 1. Setting + * LVF to 2 means that client will cancel locks 2 times faster. + * + * Locks on a client will be canceled more intensively in these cases: + * (1) if SLV is smaller, that is, load is higher on the server; + * (2) client has a lot of locks (the more locks are held by client, the bigger + * chances that some of them should be canceled); + * (3) client has old locks (taken some time ago); + * + * Thus, according to flow paradigm that we use for better understanding SLV, + * CLV is the volume of particle in flow described by SLV. According to this, + * if flow is getting thinner, more and more particles become outside of it and + * as particles are locks, they should be canceled. + * + * General idea of this belongs to Vitaly Fertman (vitaly@clusterfs.com). + * Andreas Dilger(adilger@clusterfs.com) proposed few nice ideas like using LVF + * and many cleanups. Flow definition to allow more easy understanding of the + * logic belongs to Nikita Danilov(nikita@clusterfs.com) as well as many + * cleanups and fixes. And design and implementation are done by Yury Umanets + * (umka@clusterfs.com). + * + * Glossary for terms used: + * + * pl_limit - Number of allowed locks in pool. 
Applies to server and client + * side (tunable); + * + * pl_granted - Number of granted locks (calculated); + * pl_grant_rate - Number of granted locks for last T (calculated); + * pl_cancel_rate - Number of canceled locks for last T (calculated); + * pl_grant_speed - Grant speed (GR - CR) for last T (calculated); + * pl_grant_plan - Planned number of granted locks for next T (calculated); + * pl_server_lock_volume - Current server lock volume (calculated); + * + * As it may be seen from list above, we have few possible tunables which may + * affect behavior much. They all may be modified via sysfs. However, they also + * give a possibility for constructing few pre-defined behavior policies. If + * none of predefines is suitable for a working pattern being used, new one may + * be "constructed" via sysfs tunables. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include +#include "ldlm_internal.h" + +#ifdef HAVE_LRU_RESIZE_SUPPORT + +/* + * 50 ldlm locks for 1MB of RAM. + */ +#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_SHIFT)) * 50) + +/* + * Maximal possible grant step plan in %. + */ +#define LDLM_POOL_MAX_GSP (30) + +/* + * Minimal possible grant step plan in %. + */ +#define LDLM_POOL_MIN_GSP (1) + +/* + * This controls the speed of reaching LDLM_POOL_MAX_GSP + * with increasing thread period. + */ +#define LDLM_POOL_GSP_STEP_SHIFT (2) + +/* + * LDLM_POOL_GSP% of all locks is default GP. + */ +#define LDLM_POOL_GP(L) (((L) * LDLM_POOL_MAX_GSP) / 100) + +/* + * Max age for locks on clients. + */ +#define LDLM_POOL_MAX_AGE (36000) + +/* + * The granularity of SLV calculation. + */ +#define LDLM_POOL_SLV_SHIFT (10) + +static inline __u64 dru(__u64 val, __u32 shift, int round_up) +{ + return (val + (round_up ? (1 << shift) - 1 : 0)) >> shift; +} + +static inline __u64 ldlm_pool_slv_max(__u32 L) +{ + /* + * Allow to have all locks for 1 client for 10 hrs. + * Formula is the following: limit * 10h / 1 client. + */ + __u64 lim = (__u64)L * LDLM_POOL_MAX_AGE / 1; + return lim; +} + +static inline __u64 ldlm_pool_slv_min(__u32 L) +{ + return 1; +} + +enum { + LDLM_POOL_FIRST_STAT = 0, + LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT, + LDLM_POOL_GRANT_STAT, + LDLM_POOL_CANCEL_STAT, + LDLM_POOL_GRANT_RATE_STAT, + LDLM_POOL_CANCEL_RATE_STAT, + LDLM_POOL_GRANT_PLAN_STAT, + LDLM_POOL_SLV_STAT, + LDLM_POOL_SHRINK_REQTD_STAT, + LDLM_POOL_SHRINK_FREED_STAT, + LDLM_POOL_RECALC_STAT, + LDLM_POOL_TIMING_STAT, + LDLM_POOL_LAST_STAT +}; + +static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl) +{ + return container_of(pl, struct ldlm_namespace, ns_pool); +} + +/** + * Calculates suggested grant_step in % of available locks for passed + * \a period. This is later used in grant_plan calculations. + */ +static inline int ldlm_pool_t2gsp(unsigned int t) +{ + /* + * This yields 1% grant step for anything below LDLM_POOL_GSP_STEP + * and up to 30% for anything higher than LDLM_POOL_GSP_STEP. + * + * How this will affect execution is the following: + * + * - for thread period 1s we will have grant_step 1% which good from + * pov of taking some load off from server and push it out to clients. + * This is like that because 1% for grant_step means that server will + * not allow clients to get lots of locks in short period of time and + * keep all old locks in their caches. 
Clients will always have to + * get some locks back if they want to take some new; + * + * - for thread period 10s (which is default) we will have 23% which + * means that clients will have enough of room to take some new locks + * without getting some back. All locks from this 23% which were not + * taken by clients in current period will contribute in SLV growing. + * SLV growing means more locks cached on clients until limit or grant + * plan is reached. + */ + return LDLM_POOL_MAX_GSP - + ((LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) >> + (t >> LDLM_POOL_GSP_STEP_SHIFT)); +} + +static inline int ldlm_pool_granted(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_granted); +} + +/** + * Recalculates next grant limit on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl) +{ + int granted, grant_step, limit; + + limit = ldlm_pool_get_limit(pl); + granted = ldlm_pool_granted(pl); + + grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period); + grant_step = ((limit - granted) * grant_step) / 100; + pl->pl_grant_plan = granted + grant_step; + limit = (limit * 5) >> 2; + if (pl->pl_grant_plan > limit) + pl->pl_grant_plan = limit; +} + +/** + * Recalculates next SLV on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_slv(struct ldlm_pool *pl) +{ + int granted; + int grant_plan; + int round_up; + __u64 slv; + __u64 slv_factor; + __u64 grant_usage; + __u32 limit; + + slv = pl->pl_server_lock_volume; + grant_plan = pl->pl_grant_plan; + limit = ldlm_pool_get_limit(pl); + granted = ldlm_pool_granted(pl); + round_up = granted < limit; + + grant_usage = max_t(int, limit - (granted - grant_plan), 1); + + /* + * Find out SLV change factor which is the ratio of grant usage + * from limit. SLV changes as fast as the ratio of grant plan + * consumption. The more locks from grant plan are not consumed + * by clients in last interval (idle time), the faster grows + * SLV. And the opposite, the more grant plan is over-consumed + * (load time) the faster drops SLV. + */ + slv_factor = (grant_usage << LDLM_POOL_SLV_SHIFT); + do_div(slv_factor, limit); + slv = slv * slv_factor; + slv = dru(slv, LDLM_POOL_SLV_SHIFT, round_up); + + if (slv > ldlm_pool_slv_max(limit)) + slv = ldlm_pool_slv_max(limit); + else if (slv < ldlm_pool_slv_min(limit)) + slv = ldlm_pool_slv_min(limit); + + pl->pl_server_lock_volume = slv; +} + +/** + * Recalculates next stats on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_stats(struct ldlm_pool *pl, timeout_t period) +{ + int grant_plan = pl->pl_grant_plan; + __u64 slv = pl->pl_server_lock_volume; + int granted = ldlm_pool_granted(pl); + int grant_rate = atomic_read(&pl->pl_grant_rate) / period; + int cancel_rate = atomic_read(&pl->pl_cancel_rate) / period; + + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, + slv); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT, + granted); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, + grant_rate); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, + grant_plan); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, + cancel_rate); +} + +/** + * Sets current SLV into obd accessible via ldlm_pl2ns(pl)->ns_obd. + */ +static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Set new SLV in obd field for using it later without accessing the + * pool. 
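A quick numeric check of the comment above, outside the patch itself: plugging the constants LDLM_POOL_MAX_GSP = 30, LDLM_POOL_MIN_GSP = 1 and LDLM_POOL_GSP_STEP_SHIFT = 2 into the ldlm_pool_t2gsp() expression reproduces the quoted 1% step for a 1s recalc period and 23% for the default 10s period:

    #include <stdio.h>

    #define LDLM_POOL_MAX_GSP        (30)
    #define LDLM_POOL_MIN_GSP        (1)
    #define LDLM_POOL_GSP_STEP_SHIFT (2)

    /* Same expression as ldlm_pool_t2gsp() in the patch. */
    static int t2gsp(unsigned int t)
    {
        return LDLM_POOL_MAX_GSP -
               ((LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) >>
                (t >> LDLM_POOL_GSP_STEP_SHIFT));
    }

    int main(void)
    {
        /* t = 1s:  30 - (29 >> 0) = 1%   */
        /* t = 10s: 30 - (29 >> 2) = 23%  */
        printf("grant step for  1s period: %d%%\n", t2gsp(1));
        printf("grant step for 10s period: %d%%\n", t2gsp(10));
        return 0;
    }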
This is required to avoid race between sending reply to client + * with new SLV and cleanup server stack in which we can't guarantee + * that namespace is still alive. We know only that obd is alive as + * long as valid export is alive. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = pl->pl_server_lock_volume; + write_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates all pool fields on passed \a pl. + * + * \pre ->pl_lock is not locked. + */ +static int ldlm_srv_pool_recalc(struct ldlm_pool *pl, bool force) +{ + timeout_t recalc_interval_sec; + + ENTRY; + + recalc_interval_sec = ktime_get_seconds() - pl->pl_recalc_time; + if (!force && recalc_interval_sec < pl->pl_recalc_period) + RETURN(0); + + spin_lock(&pl->pl_lock); + recalc_interval_sec = ktime_get_seconds() - pl->pl_recalc_time; + if (!force && recalc_interval_sec < pl->pl_recalc_period) { + spin_unlock(&pl->pl_lock); + RETURN(0); + } + /* + * Recalc SLV after last period. This should be done + * _before_ recalculating new grant plan. + */ + ldlm_pool_recalc_slv(pl); + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + + /* + * Update grant_plan for new period. + */ + ldlm_pool_recalc_grant_plan(pl); + + pl->pl_recalc_time = ktime_get_seconds(); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + recalc_interval_sec); + spin_unlock(&pl->pl_lock); + RETURN(0); +} + +/** + * This function is used on server side as main entry point for memory + * pressure handling. It decreases SLV on \a pl according to passed + * \a nr and \a gfp_mask. + * + * Our goal here is to decrease SLV such a way that clients hold \a nr + * locks smaller in next 10h. + */ +static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + __u32 limit; + + /* + * VM is asking how many entries may be potentially freed. + */ + if (nr == 0) + return ldlm_pool_granted(pl); + + /* + * Client already canceled locks but server is already in shrinker + * and can't cancel anything. Let's catch this race. + */ + if (ldlm_pool_granted(pl) == 0) + RETURN(0); + + spin_lock(&pl->pl_lock); + + /* + * We want shrinker to possibly cause cancellation of @nr locks from + * clients or grant approximately @nr locks smaller next intervals. + * + * This is why we decreased SLV by @nr. This effect will only be as + * long as one re-calc interval (1s these days) and this should be + * enough to pass this decreased SLV to all clients. On next recalc + * interval pool will either increase SLV if locks load is not high + * or will keep on same level or even decrease again, thus, shrinker + * decreased SLV will affect next recalc intervals and this way will + * make locking load lower. + */ + if (nr < pl->pl_server_lock_volume) { + pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr; + } else { + limit = ldlm_pool_get_limit(pl); + pl->pl_server_lock_volume = ldlm_pool_slv_min(limit); + } + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + spin_unlock(&pl->pl_lock); + + /* + * We did not really free any memory here so far, it only will be + * freed later may be, so that we return 0 to not confuse VM. + */ + return 0; +} + +/** + * Setup server side pool \a pl with passed \a limit. 
+ */ +static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit) +{ + struct obd_device *obd; + + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL && obd != LP_POISON); + LASSERT(obd->obd_type != LP_POISON); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_limit = limit; + write_unlock(&obd->obd_pool_lock); + + ldlm_pool_set_limit(pl, limit); + return 0; +} + +/** + * Sets SLV and Limit from ldlm_pl2ns(pl)->ns_obd tp passed \a pl. + */ +static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Get new SLV and Limit from obd which is updated with coming + * RPCs. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + read_lock(&obd->obd_pool_lock); + pl->pl_server_lock_volume = obd->obd_pool_slv; + ldlm_pool_set_limit(pl, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates client size pool \a pl according to current SLV and Limit. + */ +static int ldlm_cli_pool_recalc(struct ldlm_pool *pl, bool force) +{ + timeout_t recalc_interval_sec; + int ret; + + ENTRY; + + recalc_interval_sec = ktime_get_seconds() - pl->pl_recalc_time; + if (!force && recalc_interval_sec < pl->pl_recalc_period) + RETURN(0); + + spin_lock(&pl->pl_lock); + /* + * Check if we need to recalc lists now. + */ + recalc_interval_sec = ktime_get_seconds() - pl->pl_recalc_time; + if (!force && recalc_interval_sec < pl->pl_recalc_period) { + spin_unlock(&pl->pl_lock); + RETURN(0); + } + + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + ldlm_cli_pool_pop_slv(pl); + spin_unlock(&pl->pl_lock); + + /* + * In the time of canceling locks on client we do not need to maintain + * sharp timing, we only want to cancel locks asap according to new SLV. + * It may be called when SLV has changed much, this is why we do not + * take into account pl->pl_recalc_time here. + */ + ret = ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC, 0); + + spin_lock(&pl->pl_lock); + /* + * Time of LRU resizing might be longer than period, + * so update after LRU resizing rather than before it. + */ + pl->pl_recalc_time = ktime_get_seconds(); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + recalc_interval_sec); + spin_unlock(&pl->pl_lock); + RETURN(ret); +} + +/** + * This function is main entry point for memory pressure handling on client + * side. Main goal of this function is to cancel some number of locks on + * passed \a pl according to \a nr and \a gfp_mask. + */ +static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + struct ldlm_namespace *ns; + int unused; + + ns = ldlm_pl2ns(pl); + + /* + * Do not cancel locks in case lru resize is disabled for this ns. + */ + if (!ns_connect_lru_resize(ns)) + RETURN(0); + + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + spin_lock(&pl->pl_lock); + ldlm_cli_pool_pop_slv(pl); + spin_unlock(&pl->pl_lock); + + spin_lock(&ns->ns_lock); + unused = ns->ns_nr_unused; + spin_unlock(&ns->ns_lock); + + if (nr == 0) + return (unused / 100) * sysctl_vfs_cache_pressure; + else + return ldlm_cancel_lru(ns, nr, LCF_ASYNC, 0); +} + +static struct ldlm_pool_ops ldlm_srv_pool_ops = { + .po_recalc = ldlm_srv_pool_recalc, + .po_shrink = ldlm_srv_pool_shrink, + .po_setup = ldlm_srv_pool_setup +}; + +static struct ldlm_pool_ops ldlm_cli_pool_ops = { + .po_recalc = ldlm_cli_pool_recalc, + .po_shrink = ldlm_cli_pool_shrink +}; + +/** + * Pool recalc wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. 
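+ * The dispatch goes through pl->pl_ops, which ldlm_pool_init() sets to
+ * either ldlm_srv_pool_ops or ldlm_cli_pool_ops (defined above), so the
+ * wrapper itself has no server/client specific logic.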
+ * + * \retval time in seconds for the next recalc of this pool + */ +time64_t ldlm_pool_recalc(struct ldlm_pool *pl, bool force) +{ + timeout_t recalc_interval_sec; + int count; + + recalc_interval_sec = ktime_get_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec > 0) { + spin_lock(&pl->pl_lock); + recalc_interval_sec = ktime_get_seconds() - pl->pl_recalc_time; + + if (recalc_interval_sec > 0) { + /* + * Update pool statistics every recalc interval. + */ + ldlm_pool_recalc_stats(pl, recalc_interval_sec); + + /* + * Zero out all rates and speed for the last period. + */ + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + } + spin_unlock(&pl->pl_lock); + } + + if (pl->pl_ops->po_recalc != NULL) { + count = pl->pl_ops->po_recalc(pl, force); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, + count); + } + + return pl->pl_recalc_time + pl->pl_recalc_period; +} + +/** + * Pool shrink wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + */ +int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, gfp_t gfp_mask) +{ + int cancel = 0; + + if (pl->pl_ops->po_shrink != NULL) { + cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask); + if (nr > 0) { + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_REQTD_STAT, + nr); + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_FREED_STAT, + cancel); + CDEBUG(D_DLMTRACE, + "%s: request to shrink %d locks, shrunk %d\n", + pl->pl_name, nr, cancel); + } + } + return cancel; +} + +/** + * Pool setup wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + * + * Sets passed \a limit into pool \a pl. + */ +int ldlm_pool_setup(struct ldlm_pool *pl, int limit) +{ + if (pl->pl_ops->po_setup != NULL) + return pl->pl_ops->po_setup(pl, limit); + return 0; +} + +static int lprocfs_pool_state_seq_show(struct seq_file *m, void *unused) +{ + int granted, grant_rate, cancel_rate, grant_step; + int grant_speed, grant_plan, lvf; + struct ldlm_pool *pl = m->private; + timeout_t period; + __u64 slv, clv; + __u32 limit; + + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + clv = pl->pl_client_lock_volume; + limit = ldlm_pool_get_limit(pl); + grant_plan = pl->pl_grant_plan; + granted = ldlm_pool_granted(pl); + period = ktime_get_seconds() - pl->pl_recalc_time; + if (period <= 0) + period = 1; + grant_rate = atomic_read(&pl->pl_grant_rate) / period; + cancel_rate = atomic_read(&pl->pl_cancel_rate) / period; + grant_speed = grant_rate - cancel_rate; + lvf = atomic_read(&pl->pl_lock_volume_factor); + grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period); + spin_unlock(&pl->pl_lock); + + seq_printf(m, "LDLM pool state (%s):\n" + " SLV: %llu\n" + " CLV: %llu\n" + " LVF: %d\n", + pl->pl_name, slv, clv, (lvf * 100) >> 8); + + if (ns_is_server(ldlm_pl2ns(pl))) { + seq_printf(m, " GSP: %d%%\n", grant_step); + seq_printf(m, " GP: %d\n", grant_plan); + } + + seq_printf(m, " GR: %d\n CR: %d\n GS: %d\n G: %d\n L: %d\n", + grant_rate, cancel_rate, grant_speed, + granted, limit); + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(lprocfs_pool_state); + +static ssize_t grant_speed_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, + pl_kobj); + int grant_speed; + timeout_t period; + + spin_lock(&pl->pl_lock); + /* serialize with ldlm_pool_recalc */ + period = ktime_get_seconds() - pl->pl_recalc_time; + if (period <= 0) + period = 1; + grant_speed = (atomic_read(&pl->pl_grant_rate) - + 
atomic_read(&pl->pl_cancel_rate)) / period; + spin_unlock(&pl->pl_lock); + return sprintf(buf, "%d\n", grant_speed); +} +LUSTRE_RO_ATTR(grant_speed); + +LDLM_POOL_SYSFS_READER_SHOW(grant_plan, int); +LUSTRE_RO_ATTR(grant_plan); + +LDLM_POOL_SYSFS_READER_SHOW(recalc_period, int); +LDLM_POOL_SYSFS_WRITER_STORE(recalc_period, int); +LUSTRE_RW_ATTR(recalc_period); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(server_lock_volume, u64); +LUSTRE_RO_ATTR(server_lock_volume); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(client_lock_volume, u64); +LUSTRE_RO_ATTR(client_lock_volume); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(limit, atomic); +LDLM_POOL_SYSFS_WRITER_NOLOCK_STORE(limit, atomic); +LUSTRE_RW_ATTR(limit); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(granted, atomic); +LUSTRE_RO_ATTR(granted); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(cancel_rate, atomic); +LUSTRE_RO_ATTR(cancel_rate); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(grant_rate, atomic); +LUSTRE_RO_ATTR(grant_rate); + +static ssize_t lock_volume_factor_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, pl_kobj); + unsigned long tmp; + + tmp = (atomic_read(&pl->pl_lock_volume_factor) * 100) >> 8; + return sprintf(buf, "%lu\n", tmp); +} + +static ssize_t lock_volume_factor_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, pl_kobj); + unsigned long tmp; + int rc; + + rc = kstrtoul(buffer, 10, &tmp); + if (rc < 0) { + return rc; + } + + tmp = (tmp << 8) / 100; + atomic_set(&pl->pl_lock_volume_factor, tmp); + + return count; + +} +LUSTRE_RW_ATTR(lock_volume_factor); + +static ssize_t recalc_time_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, pl_kobj); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + ktime_get_seconds() - pl->pl_recalc_time); +} +LUSTRE_RO_ATTR(recalc_time); + +/* These are for pools in /sys/fs/lustre/ldlm/namespaces/.../pool */ +static struct attribute *ldlm_pl_attrs[] = { + &lustre_attr_grant_speed.attr, + &lustre_attr_grant_plan.attr, + &lustre_attr_recalc_period.attr, + &lustre_attr_server_lock_volume.attr, + &lustre_attr_client_lock_volume.attr, + &lustre_attr_recalc_time.attr, + &lustre_attr_limit.attr, + &lustre_attr_granted.attr, + &lustre_attr_cancel_rate.attr, + &lustre_attr_grant_rate.attr, + &lustre_attr_lock_volume_factor.attr, + NULL, +}; + +KOBJ_ATTRIBUTE_GROUPS(ldlm_pl); + +static void ldlm_pl_release(struct kobject *kobj) +{ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, + pl_kobj); + complete(&pl->pl_kobj_unregister); +} + +static struct kobj_type ldlm_pl_ktype = { + .default_groups = KOBJ_ATTR_GROUPS(ldlm_pl), + .sysfs_ops = &lustre_sysfs_ops, + .release = ldlm_pl_release, +}; + +static int ldlm_pool_sysfs_init(struct ldlm_pool *pl) +{ + struct ldlm_namespace *ns = ldlm_pl2ns(pl); + int err; + + init_completion(&pl->pl_kobj_unregister); + err = kobject_init_and_add(&pl->pl_kobj, &ldlm_pl_ktype, &ns->ns_kobj, + "pool"); + + return err; +} + +static int ldlm_pool_debugfs_init(struct ldlm_pool *pl) +{ + struct ldlm_namespace *ns = ldlm_pl2ns(pl); + struct dentry *debugfs_ns_parent; + struct ldebugfs_vars pool_vars[2]; + int rc = 0; + + ENTRY; + + debugfs_ns_parent = ns->ns_debugfs_entry; + if (IS_ERR_OR_NULL(debugfs_ns_parent)) { + CERROR("%s: debugfs entry is not initialized\n", + ldlm_ns_name(ns)); + GOTO(out, rc = -EINVAL); + } + pl->pl_debugfs_entry = 
debugfs_create_dir("pool", debugfs_ns_parent); + + memset(pool_vars, 0, sizeof(pool_vars)); + + ldlm_add_var(&pool_vars[0], pl->pl_debugfs_entry, "state", pl, + &lprocfs_pool_state_fops); + + pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT - + LDLM_POOL_FIRST_STAT, 0); + if (!pl->pl_stats) + GOTO(out, rc = -ENOMEM); + + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT, + LPROCFS_CNTR_AVGMINMAX, "granted", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT, + LPROCFS_CNTR_AVGMINMAX, "grant", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT, + LPROCFS_CNTR_AVGMINMAX, "cancel", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, + LPROCFS_CNTR_AVGMINMAX, "grant_rate", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, + LPROCFS_CNTR_AVGMINMAX, "cancel_rate", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, + LPROCFS_CNTR_AVGMINMAX, "grant_plan", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT, + LPROCFS_CNTR_AVGMINMAX, "slv", "slv"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT, + LPROCFS_CNTR_AVGMINMAX, "shrink_request", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT, + LPROCFS_CNTR_AVGMINMAX, "shrink_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT, + LPROCFS_CNTR_AVGMINMAX, "recalc_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT, + LPROCFS_CNTR_AVGMINMAX, "recalc_timing", "sec"); + debugfs_create_file("stats", 0644, pl->pl_debugfs_entry, + pl->pl_stats, &ldebugfs_stats_seq_fops); + + EXIT; +out: + return rc; +} + +static void ldlm_pool_sysfs_fini(struct ldlm_pool *pl) +{ + kobject_put(&pl->pl_kobj); + wait_for_completion(&pl->pl_kobj_unregister); +} + +static void ldlm_pool_debugfs_fini(struct ldlm_pool *pl) +{ + if (pl->pl_stats != NULL) { + lprocfs_free_stats(&pl->pl_stats); + pl->pl_stats = NULL; + } + debugfs_remove_recursive(pl->pl_debugfs_entry); + pl->pl_debugfs_entry = NULL; +} + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, enum ldlm_side client) +{ + int rc; + + ENTRY; + + spin_lock_init(&pl->pl_lock); + atomic_set(&pl->pl_granted, 0); + pl->pl_recalc_time = ktime_get_seconds(); + atomic_set(&pl->pl_lock_volume_factor, 1 << 8); + + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L); + + snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d", + ldlm_ns_name(ns), idx); + + if (client == LDLM_NAMESPACE_SERVER) { + pl->pl_ops = &ldlm_srv_pool_ops; + ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L); + pl->pl_recalc_period = LDLM_POOL_SRV_DEF_RECALC_PERIOD; + pl->pl_server_lock_volume = ldlm_pool_slv_max(LDLM_POOL_HOST_L); + } else { + ldlm_pool_set_limit(pl, 1); + pl->pl_server_lock_volume = 0; + pl->pl_ops = &ldlm_cli_pool_ops; + pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD; + } + pl->pl_client_lock_volume = 0; + rc = ldlm_pool_debugfs_init(pl); + if (rc) + RETURN(rc); + + rc = ldlm_pool_sysfs_init(pl); + if (rc) + RETURN(rc); + + CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name); + + RETURN(rc); +} + +void ldlm_pool_fini(struct ldlm_pool *pl) +{ + ENTRY; + ldlm_pool_sysfs_fini(pl); + ldlm_pool_debugfs_fini(pl); + + /* + * Pool should not be used after this point. We can't free it here as + * it lives in struct ldlm_namespace, but still interested in catching + * any abnormal using cases. 
+ */ + POISON(pl, 0x5a, sizeof(*pl)); + EXIT; +} + +/** + * Add new taken ldlm lock \a lock into pool \a pl accounting. + */ +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + /* + * FLOCK locks are special in a sense that they are almost never + * cancelled, instead special kind of lock is used to drop them. + * also there is no LRU for flock locks, so no point in tracking + * them anyway. + * + * PLAIN locks are used by config and quota, the quantity is small + * and usually they are not in LRU. + */ + if (lock->l_resource->lr_type == LDLM_FLOCK || + lock->l_resource->lr_type == LDLM_PLAIN) + return; + + ldlm_reclaim_add(lock); + + atomic_inc(&pl->pl_granted); + atomic_inc(&pl->pl_grant_rate); + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT); + /* + * Do not do pool recalc for client side as all locks which + * potentially may be canceled has already been packed into + * enqueue/cancel rpc. Also we do not want to run out of stack + * with too long call paths. + */ + if (ns_is_server(ldlm_pl2ns(pl))) + ldlm_pool_recalc(pl, false); +} + +/** + * Remove ldlm lock \a lock from pool \a pl accounting. + */ +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + /* + * Filter out FLOCK & PLAIN locks. Read above comment in + * ldlm_pool_add(). + */ + if (lock->l_resource->lr_type == LDLM_FLOCK || + lock->l_resource->lr_type == LDLM_PLAIN) + return; + + ldlm_reclaim_del(lock); + + LASSERT(atomic_read(&pl->pl_granted) > 0); + atomic_dec(&pl->pl_granted); + atomic_inc(&pl->pl_cancel_rate); + + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT); + + if (ns_is_server(ldlm_pl2ns(pl))) + ldlm_pool_recalc(pl, false); +} + +/** + * Returns current \a pl SLV. + * + * \pre ->pl_lock is not locked. + */ +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl) +{ + __u64 slv; + + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} + +/** + * Sets passed \a slv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv) +{ + spin_lock(&pl->pl_lock); + pl->pl_server_lock_volume = slv; + spin_unlock(&pl->pl_lock); +} + +/** + * Returns current \a pl CLV. + * + * \pre ->pl_lock is not locked. + */ +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl) +{ + __u64 slv; + + spin_lock(&pl->pl_lock); + slv = pl->pl_client_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} + +/** + * Sets passed \a clv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) +{ + spin_lock(&pl->pl_lock); + pl->pl_client_lock_volume = clv; + spin_unlock(&pl->pl_lock); +} + +/** + * Returns current \a pl limit. + */ +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_limit); +} + +/** + * Sets passed \a limit to \a pl. + */ +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit) +{ + atomic_set(&pl->pl_limit, limit); +} + +/** + * Returns current LVF from \a pl. + */ +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_lock_volume_factor); +} + +/* + * count locks from all namespaces (if possible). Returns number of + * cached locks. + */ +static unsigned long ldlm_pools_count(enum ldlm_side client, gfp_t gfp_mask) +{ + unsigned long total = 0; + int nr_ns; + struct ldlm_namespace *ns; + struct ldlm_namespace *ns_old = NULL; /* loop detection */ + + if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS)) + return 0; + + /* + * Find out how many resources we may release. 
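+ * Note that ldlm_pool_shrink() is called with nr == 0 below, which only
+ * reports the number of freeable locks in each pool (server pools return
+ * their granted count, client pools a fraction of the unused LRU scaled
+ * by sysctl_vfs_cache_pressure) and does not cancel anything yet.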
+ */ + for (nr_ns = ldlm_namespace_nr_read(client); + nr_ns > 0; nr_ns--) { + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + return 0; + } + ns = ldlm_namespace_first_locked(client); + + if (ns == ns_old) { + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + + if (ldlm_ns_empty(ns)) { + ldlm_namespace_move_to_inactive_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + continue; + } + + if (ns_old == NULL) + ns_old = ns; + + ldlm_namespace_get(ns); + ldlm_namespace_move_to_active_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask); + ldlm_namespace_put(ns); + } + + return total; +} + +static unsigned long ldlm_pools_scan(enum ldlm_side client, int nr, + gfp_t gfp_mask) +{ + unsigned long freed = 0; + int tmp, nr_ns; + struct ldlm_namespace *ns; + + if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS)) + return -1; + + /* + * Shrink at least ldlm_namespace_nr_read(client) namespaces. + */ + for (tmp = nr_ns = ldlm_namespace_nr_read(client); + tmp > 0; tmp--) { + int cancel, nr_locks; + + /* + * Do not call shrink under ldlm_namespace_lock(client) + */ + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + ns = ldlm_namespace_first_locked(client); + ldlm_namespace_get(ns); + ldlm_namespace_move_to_active_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + + nr_locks = ldlm_pool_granted(&ns->ns_pool); + /* + * We use to shrink propotionally but with new shrinker API, + * we lost the total number of freeable locks. + */ + cancel = 1 + min_t(int, nr_locks, nr / nr_ns); + freed += ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask); + ldlm_namespace_put(ns); + } + /* + * we only decrease the SLV in server pools shrinker, return + * SHRINK_STOP to kernel to avoid needless loop. LU-1128 + */ + return (client == LDLM_NAMESPACE_SERVER) ? SHRINK_STOP : freed; +} + +#ifdef HAVE_SHRINKER_COUNT +static unsigned long ldlm_pools_srv_count(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_count(LDLM_NAMESPACE_SERVER, sc->gfp_mask); +} + +static unsigned long ldlm_pools_srv_scan(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_scan(LDLM_NAMESPACE_SERVER, sc->nr_to_scan, + sc->gfp_mask); +} + +static unsigned long ldlm_pools_cli_count(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_count(LDLM_NAMESPACE_CLIENT, sc->gfp_mask); +} + +static unsigned long ldlm_pools_cli_scan(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_scan(LDLM_NAMESPACE_CLIENT, sc->nr_to_scan, + sc->gfp_mask); +} + +static struct shrinker ldlm_pools_srv_shrinker = { + .count_objects = ldlm_pools_srv_count, + .scan_objects = ldlm_pools_srv_scan, + .seeks = DEFAULT_SEEKS, +}; + +static struct shrinker ldlm_pools_cli_shrinker = { + .count_objects = ldlm_pools_cli_count, + .scan_objects = ldlm_pools_cli_scan, + .seeks = DEFAULT_SEEKS, +}; +#else +/* + * Cancel \a nr locks from all namespaces (if possible). Returns number of + * cached locks after shrink is finished. All namespaces are asked to + * cancel approximately equal amount of locks to keep balancing. 
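+ * Each pool is asked for roughly 1 + min(granted, nr / nr_namespaces)
+ * cancels (see ldlm_pools_scan() above); e.g. a request to drop 128 locks
+ * spread over 4 namespaces asks each pool for about 33 locks, or fewer if
+ * a pool holds fewer granted locks than that.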
+ */ +static int ldlm_pools_shrink(enum ldlm_side client, int nr, gfp_t gfp_mask) +{ + unsigned long total = 0; + + if (client == LDLM_NAMESPACE_CLIENT && nr != 0 && + !(gfp_mask & __GFP_FS)) + return -1; + + total = ldlm_pools_count(client, gfp_mask); + + if (nr == 0 || total == 0) + return total; + + return ldlm_pools_scan(client, nr, gfp_mask); +} + +static int ldlm_pools_srv_shrink(struct shrinker *shrinker, + struct shrink_control *sc) +{ + return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER, + sc->nr_to_scan, sc->gfp_mask); +} + +static int ldlm_pools_cli_shrink(struct shrinker *shrinker, + struct shrink_control *sc) +{ + return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT, + sc->nr_to_scan, sc->gfp_mask); +} + +static struct shrinker ldlm_pools_srv_shrinker = { + .shrink = ldlm_pools_srv_shrink, + .seeks = DEFAULT_SEEKS, +}; + +static struct shrinker ldlm_pools_cli_shrinker = { + .shrink = ldlm_pools_cli_shrink, + .seeks = DEFAULT_SEEKS, +}; +#endif /* HAVE_SHRINKER_COUNT */ + +static time64_t ldlm_pools_recalc_delay(enum ldlm_side side) +{ + struct ldlm_namespace *ns; + struct ldlm_namespace *ns_old = NULL; + /* seconds of sleep if no active namespaces */ + time64_t delay = ktime_get_seconds() + + (side == LDLM_NAMESPACE_SERVER ? + LDLM_POOL_SRV_DEF_RECALC_PERIOD : + LDLM_POOL_CLI_DEF_RECALC_PERIOD); + int nr; + + /* Recalc at least ldlm_namespace_nr(side) namespaces. */ + for (nr = ldlm_namespace_nr_read(side); nr > 0; nr--) { + int skip; + /* + * Lock the list, get first @ns in the list, getref, move it + * to the tail, unlock and call pool recalc. This way we avoid + * calling recalc under @ns lock, which is really good as we + * get rid of potential deadlock on side nodes when canceling + * locks synchronously. + */ + mutex_lock(ldlm_namespace_lock(side)); + if (list_empty(ldlm_namespace_list(side))) { + mutex_unlock(ldlm_namespace_lock(side)); + break; + } + ns = ldlm_namespace_first_locked(side); + + if (ns_old == ns) { /* Full pass complete */ + mutex_unlock(ldlm_namespace_lock(side)); + break; + } + + /* We got an empty namespace, need to move it back to inactive + * list. + * The race with parallel resource creation is fine: + * - If they do namespace_get before our check, we fail the + * check and they move this item to the end of the list anyway + * - If we do the check and then they do namespace_get, then + * we move the namespace to inactive and they will move + * it back to active (synchronised by the lock, so no clash + * there). + */ + if (ldlm_ns_empty(ns)) { + ldlm_namespace_move_to_inactive_locked(ns, side); + mutex_unlock(ldlm_namespace_lock(side)); + continue; + } + + if (ns_old == NULL) + ns_old = ns; + + spin_lock(&ns->ns_lock); + /* + * skip ns which is being freed, and we don't want to increase + * its refcount again, not even temporarily. bz21519 & LU-499. + */ + if (ns->ns_stopping) { + skip = 1; + } else { + skip = 0; + ldlm_namespace_get(ns); + } + spin_unlock(&ns->ns_lock); + + ldlm_namespace_move_to_active_locked(ns, side); + mutex_unlock(ldlm_namespace_lock(side)); + + /* + * After setup is done - recalc the pool. 
+ */ + if (!skip) { + delay = min(delay, + ldlm_pool_recalc(&ns->ns_pool, false)); + ldlm_namespace_put(ns); + } + } + + return delay; +} + +static void ldlm_pools_recalc_task(struct work_struct *ws); +static DECLARE_DELAYED_WORK(ldlm_pools_recalc_work, ldlm_pools_recalc_task); + +static void ldlm_pools_recalc_task(struct work_struct *ws) +{ + /* seconds of sleep if no active namespaces */ + time64_t delay; +#ifdef HAVE_SERVER_SUPPORT + struct ldlm_namespace *ns; + unsigned long nr_l = 0, nr_p = 0, l; + int equal = 0; + + /* Check all modest namespaces first. */ + mutex_lock(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); + list_for_each_entry(ns, ldlm_namespace_list(LDLM_NAMESPACE_SERVER), + ns_list_chain) { + if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) + continue; + + l = ldlm_pool_granted(&ns->ns_pool); + if (l == 0) + l = 1; + + /* + * Set the modest pools limit equal to their avg granted + * locks + ~6%. + */ + l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0); + ldlm_pool_setup(&ns->ns_pool, l); + nr_l += l; + nr_p++; + } + + /* + * Make sure than modest namespaces did not eat more that 2/3 + * of limit. + */ + if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { + CWARN("'Modest' pools eat out 2/3 of server locks limit (%lu of %lu). This means that you have too many clients for this amount of server RAM. Upgrade server!\n", + nr_l, LDLM_POOL_HOST_L); + equal = 1; + } + + /* The rest is given to greedy namespaces. */ + list_for_each_entry(ns, ldlm_namespace_list(LDLM_NAMESPACE_SERVER), + ns_list_chain) { + if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) + continue; + + if (equal) { + /* + * In the case 2/3 locks are eaten out by + * modest pools, we re-setup equal limit + * for _all_ pools. + */ + l = LDLM_POOL_HOST_L / + ldlm_namespace_nr_read(LDLM_NAMESPACE_SERVER); + } else { + /* + * All the rest of greedy pools will have + * all locks in equal parts. + */ + l = (LDLM_POOL_HOST_L - nr_l) / + (ldlm_namespace_nr_read(LDLM_NAMESPACE_SERVER) - + nr_p); + } + ldlm_pool_setup(&ns->ns_pool, l); + } + mutex_unlock(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); + + delay = min(ldlm_pools_recalc_delay(LDLM_NAMESPACE_SERVER), + ldlm_pools_recalc_delay(LDLM_NAMESPACE_CLIENT)); +#else /* !HAVE_SERVER_SUPPORT */ + delay = ldlm_pools_recalc_delay(LDLM_NAMESPACE_CLIENT); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Wake up the blocking threads from time to time. */ + ldlm_bl_thread_wakeup(); + + delay -= ktime_get_seconds(); + if (delay <= 0) { + /* Prevent too frequent recalculation. 
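+ * If every pool already asked to be recalculated (the computed wake-up
+ * time is in the past), clamp the sleep to one second so the delayed
+ * work is not rearmed in a tight loop.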
*/ + CDEBUG(D_DLMTRACE, "Negative interval(%lld)\n", delay); + delay = 1; + } + + schedule_delayed_work(&ldlm_pools_recalc_work, cfs_time_seconds(delay)); +} + +static bool ldlm_pools_init_done; + +int ldlm_pools_init(void) +{ + time64_t delay; + int rc; + +#ifdef HAVE_SERVER_SUPPORT + delay = min(LDLM_POOL_SRV_DEF_RECALC_PERIOD, + LDLM_POOL_CLI_DEF_RECALC_PERIOD); +#else + delay = LDLM_POOL_CLI_DEF_RECALC_PERIOD; +#endif + + rc = register_shrinker(&ldlm_pools_srv_shrinker); + if (rc) + goto out; + + rc = register_shrinker(&ldlm_pools_cli_shrinker); + if (rc) + goto out_shrinker; + + schedule_delayed_work(&ldlm_pools_recalc_work, delay); + ldlm_pools_init_done = true; + return 0; + +out_shrinker: + unregister_shrinker(&ldlm_pools_cli_shrinker); +out: + return rc; +} + +void ldlm_pools_fini(void) +{ + if (ldlm_pools_init_done) { + unregister_shrinker(&ldlm_pools_srv_shrinker); + unregister_shrinker(&ldlm_pools_cli_shrinker); + + cancel_delayed_work_sync(&ldlm_pools_recalc_work); + } + + ldlm_pools_init_done = false; +} + +#else /* !HAVE_LRU_RESIZE_SUPPORT */ +int ldlm_pool_setup(struct ldlm_pool *pl, int limit) +{ + return 0; +} + +time64_t ldlm_pool_recalc(struct ldlm_pool *pl, bool force) +{ + return 0; +} + +int ldlm_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + return 0; +} + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, enum ldlm_side client) +{ + return 0; +} + +void ldlm_pool_fini(struct ldlm_pool *pl) +{ + return; +} + +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + return; +} + +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + return; +} + +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl) +{ + return 1; +} + +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv) +{ + return; +} + +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl) +{ + return 1; +} + +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) +{ + return; +} + +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl) +{ + return 0; +} + +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit) +{ + return; +} + +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) +{ + return 0; +} + +int ldlm_pools_init(void) +{ + return 0; +} + +void ldlm_pools_fini(void) +{ + return; +} + +#endif /* HAVE_LRU_RESIZE_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_reclaim.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_reclaim.c new file mode 100644 index 0000000000000..d371dc2cade21 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_reclaim.c @@ -0,0 +1,415 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, Intel Corporation. + * Use is subject to license terms. 
+ *
+ * Author: Niu Yawei
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include
+#include
+#include
+#include "ldlm_internal.h"
+
+/*
+ * To avoid ldlm locks exhausting server memory, two global parameters,
+ * ldlm_reclaim_threshold & ldlm_lock_limit, are used for reclaiming
+ * granted locks and rejecting incoming enqueue requests defensively.
+ *
+ * ldlm_reclaim_threshold: When the number of granted locks reaches this
+ * threshold, the server starts to revoke locks gradually.
+ *
+ * ldlm_lock_limit: When the number of granted locks reaches this
+ * threshold, the server will return -EINPROGRESS to any incoming enqueue
+ * request until the lock count is shrunk below the threshold again.
+ *
+ * ldlm_reclaim_threshold & ldlm_lock_limit are set to 20% & 30% of the
+ * total memory by default. They are tunable via proc entries; when set
+ * to 0, the feature is disabled.
+ */
+
+#ifdef HAVE_SERVER_SUPPORT
+
+/* Lock count is stored in ldlm_reclaim_threshold & ldlm_lock_limit */
+__u64 ldlm_reclaim_threshold;
+__u64 ldlm_lock_limit;
+
+/* Represents ldlm_reclaim_threshold & ldlm_lock_limit in MB, used for
+ * proc interface. */
+__u64 ldlm_reclaim_threshold_mb;
+__u64 ldlm_lock_limit_mb;
+
+struct percpu_counter ldlm_granted_total;
+static atomic_t ldlm_nr_reclaimer;
+static s64 ldlm_last_reclaim_age_ns;
+static ktime_t ldlm_last_reclaim_time;
+
+struct ldlm_reclaim_cb_data {
+	struct list_head rcd_rpc_list;
+	int rcd_added;
+	int rcd_total;
+	int rcd_cursor;
+	int rcd_start;
+	bool rcd_skip;
+	s64 rcd_age_ns;
+	struct cfs_hash_bd *rcd_prev_bd;
+};
+
+static inline bool ldlm_lock_reclaimable(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+	/* FLOCK & PLAIN locks are not reclaimable. FLOCK is
+	 * explicitly controlled by the application, and PLAIN locks
+	 * are used by the quota global lock and the config lock.
+	 */
+	if (ns->ns_client == LDLM_NAMESPACE_SERVER &&
+	    (lock->l_resource->lr_type == LDLM_IBITS ||
+	     lock->l_resource->lr_type == LDLM_EXTENT))
+		return true;
+	return false;
+}
+
+/**
+ * Callback function for revoking locks from a certain resource.
+ * + * \param [in] hs ns_rs_hash + * \param [in] bd current bucket of ns_rsh_hash + * \param [in] hnode hnode of the resource + * \param [in] arg opaque data + * + * \retval 0 continue the scan + * \retval 1 stop the iteration + */ +static int ldlm_reclaim_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) + +{ + struct ldlm_resource *res; + struct ldlm_reclaim_cb_data *data; + struct ldlm_lock *lock; + struct ldlm_ns_bucket *nsb; + int rc = 0; + + data = (struct ldlm_reclaim_cb_data *)arg; + + LASSERTF(data->rcd_added < data->rcd_total, "added:%d >= total:%d\n", + data->rcd_added, data->rcd_total); + + nsb = cfs_hash_bd_extra_get(hs, bd); + res = cfs_hash_object(hs, hnode); + + if (data->rcd_prev_bd != bd) { + if (data->rcd_prev_bd != NULL) + ldlm_res_to_ns(res)->ns_reclaim_start++; + data->rcd_prev_bd = bd; + data->rcd_cursor = 0; + data->rcd_start = nsb->nsb_reclaim_start % + cfs_hash_bd_count_get(bd); + } + + if (data->rcd_skip && data->rcd_cursor < data->rcd_start) { + data->rcd_cursor++; + return 0; + } + + nsb->nsb_reclaim_start++; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (!ldlm_lock_reclaimable(lock)) + continue; + + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW) && + ktime_before(ktime_get(), + ktime_add_ns(lock->l_last_used, + data->rcd_age_ns))) + continue; + + if (!ldlm_is_ast_sent(lock)) { + ldlm_set_ast_sent(lock); + LASSERT(list_empty(&lock->l_rk_ast)); + list_add(&lock->l_rk_ast, &data->rcd_rpc_list); + LDLM_LOCK_GET(lock); + if (++data->rcd_added == data->rcd_total) { + rc = 1; /* stop the iteration */ + break; + } + } + } + unlock_res(res); + + return rc; +} + +/** + * Revoke locks from the resources of a namespace in a roundrobin + * manner. + * + * \param[in] ns namespace to do the lock revoke on + * \param[in] count count of lock to be revoked + * \param[in] age only revoke locks older than the 'age' + * \param[in] skip scan from the first lock on resource if the + * 'skip' is false, otherwise, continue scan + * from the last scanned position + * \param[out] count count of lock still to be revoked + */ +static void ldlm_reclaim_res(struct ldlm_namespace *ns, int *count, + s64 age_ns, bool skip) +{ + struct ldlm_reclaim_cb_data data; + int idx, type, start; + int rc; + ENTRY; + + LASSERT(*count != 0); + + if (ns->ns_obd) { + type = server_name2index(ns->ns_obd->obd_name, &idx, NULL); + if (type != LDD_F_SV_TYPE_MDT && type != LDD_F_SV_TYPE_OST) { + EXIT; + return; + } + } + + if (atomic_read(&ns->ns_bref) == 0) { + EXIT; + return; + } + + INIT_LIST_HEAD(&data.rcd_rpc_list); + data.rcd_added = 0; + data.rcd_total = *count; + data.rcd_age_ns = age_ns; + data.rcd_skip = skip; + data.rcd_prev_bd = NULL; + start = ns->ns_reclaim_start % CFS_HASH_NBKT(ns->ns_rs_hash); + + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_reclaim_lock_cb, &data, + start); + + CDEBUG(D_DLMTRACE, "NS(%s): %d locks to be reclaimed, found %d/%d " + "locks.\n", ldlm_ns_name(ns), *count, data.rcd_added, + data.rcd_total); + + LASSERTF(*count >= data.rcd_added, "count:%d, added:%d\n", *count, + data.rcd_added); + + rc = ldlm_run_ast_work(ns, &data.rcd_rpc_list, LDLM_WORK_REVOKE_AST); + if (rc == -ERESTART) + ldlm_reprocess_recovery_done(ns); + + *count -= data.rcd_added; + EXIT; +} + +#define LDLM_RECLAIM_BATCH 512 +#define LDLM_RECLAIM_AGE_MIN (300 * NSEC_PER_SEC) +#define LDLM_RECLAIM_AGE_MAX (LDLM_DEFAULT_MAX_ALIVE * NSEC_PER_SEC * 3 / 4) + +static inline s64 ldlm_reclaim_age(void) +{ + s64 age_ns = 
ldlm_last_reclaim_age_ns; + ktime_t now = ktime_get(); + ktime_t diff; + + diff = ktime_sub(now, ldlm_last_reclaim_time); + age_ns += ktime_to_ns(diff); + if (age_ns > LDLM_RECLAIM_AGE_MAX) + age_ns = LDLM_RECLAIM_AGE_MAX; + else if (age_ns < (LDLM_RECLAIM_AGE_MIN * 2)) + age_ns = LDLM_RECLAIM_AGE_MIN; + return age_ns; +} + +/** + * Revoke certain amount of locks from all the server namespaces + * in a roundrobin manner. Lock age is used to avoid reclaim on + * the non-aged locks. + */ +static void ldlm_reclaim_ns(void) +{ + struct ldlm_namespace *ns; + int count = LDLM_RECLAIM_BATCH; + int ns_nr, nr_processed; + enum ldlm_side ns_cli = LDLM_NAMESPACE_SERVER; + s64 age_ns; + bool skip = true; + ENTRY; + + if (!atomic_add_unless(&ldlm_nr_reclaimer, 1, 1)) { + EXIT; + return; + } + + age_ns = ldlm_reclaim_age(); +again: + nr_processed = 0; + ns_nr = ldlm_namespace_nr_read(ns_cli); + while (count > 0 && nr_processed < ns_nr) { + mutex_lock(ldlm_namespace_lock(ns_cli)); + + if (list_empty(ldlm_namespace_list(ns_cli))) { + mutex_unlock(ldlm_namespace_lock(ns_cli)); + goto out; + } + + ns = ldlm_namespace_first_locked(ns_cli); + ldlm_namespace_move_to_active_locked(ns, ns_cli); + mutex_unlock(ldlm_namespace_lock(ns_cli)); + + ldlm_reclaim_res(ns, &count, age_ns, skip); + ldlm_namespace_put(ns); + nr_processed++; + } + + if (count > 0 && age_ns > LDLM_RECLAIM_AGE_MIN) { + age_ns >>= 1; + if (age_ns < (LDLM_RECLAIM_AGE_MIN * 2)) + age_ns = LDLM_RECLAIM_AGE_MIN; + skip = false; + goto again; + } + + ldlm_last_reclaim_age_ns = age_ns; + ldlm_last_reclaim_time = ktime_get(); +out: + atomic_add_unless(&ldlm_nr_reclaimer, -1, 0); + EXIT; +} + +void ldlm_reclaim_add(struct ldlm_lock *lock) +{ + if (!ldlm_lock_reclaimable(lock)) + return; + percpu_counter_add(&ldlm_granted_total, 1); + lock->l_last_used = ktime_get(); +} + +void ldlm_reclaim_del(struct ldlm_lock *lock) +{ + if (!ldlm_lock_reclaimable(lock)) + return; + percpu_counter_sub(&ldlm_granted_total, 1); +} + +/** + * Check on the total granted locks: return true if it reaches the + * high watermark (ldlm_lock_limit), otherwise return false; It also + * triggers lock reclaim if the low watermark (ldlm_reclaim_threshold) + * is reached. + * + * \retval true high watermark reached. + * \retval false high watermark not reached. 
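+ *
+ * A typical server-side caller is expected to gate new enqueues on this
+ * check, roughly along these lines (sketch only, the call site is not
+ * part of this file):
+ *
+ *	if (ldlm_reclaim_full())
+ *		RETURN(-EINPROGRESS);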
+ */ +bool ldlm_reclaim_full(void) +{ + __u64 high = ldlm_lock_limit; + __u64 low = ldlm_reclaim_threshold; + + if (low != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW)) + low = cfs_fail_val; + + if (low != 0 && + percpu_counter_sum_positive(&ldlm_granted_total) > low) + ldlm_reclaim_ns(); + + if (high != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_HIGH)) + high = cfs_fail_val; + + if (high != 0 && + percpu_counter_sum_positive(&ldlm_granted_total) > high) + return true; + + return false; +} + +static inline __u64 ldlm_ratio2locknr(int ratio) +{ + __u64 locknr; + + locknr = ((__u64)NUM_CACHEPAGES << PAGE_SHIFT) * ratio; + do_div(locknr, 100 * sizeof(struct ldlm_lock)); + + return locknr; +} + +static inline __u64 ldlm_locknr2mb(__u64 locknr) +{ + return (locknr * sizeof(struct ldlm_lock) + 512 * 1024) >> 20; +} + +#define LDLM_WM_RATIO_LOW_DEFAULT 20 +#define LDLM_WM_RATIO_HIGH_DEFAULT 30 + +int ldlm_reclaim_setup(void) +{ + atomic_set(&ldlm_nr_reclaimer, 0); + + ldlm_reclaim_threshold = ldlm_ratio2locknr(LDLM_WM_RATIO_LOW_DEFAULT); + ldlm_reclaim_threshold_mb = ldlm_locknr2mb(ldlm_reclaim_threshold); + ldlm_lock_limit = ldlm_ratio2locknr(LDLM_WM_RATIO_HIGH_DEFAULT); + ldlm_lock_limit_mb = ldlm_locknr2mb(ldlm_lock_limit); + + ldlm_last_reclaim_age_ns = LDLM_RECLAIM_AGE_MAX; + ldlm_last_reclaim_time = ktime_get(); + +#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + return percpu_counter_init(&ldlm_granted_total, 0, GFP_KERNEL); +#else + return percpu_counter_init(&ldlm_granted_total, 0); +#endif +} + +void ldlm_reclaim_cleanup(void) +{ + percpu_counter_destroy(&ldlm_granted_total); +} + +#else /* HAVE_SERVER_SUPPORT */ + +bool ldlm_reclaim_full(void) +{ + return false; +} + +void ldlm_reclaim_add(struct ldlm_lock *lock) +{ +} + +void ldlm_reclaim_del(struct ldlm_lock *lock) +{ +} + +int ldlm_reclaim_setup(void) +{ + return 0; +} + +void ldlm_reclaim_cleanup(void) +{ +} + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c new file mode 100644 index 0000000000000..38ca9ed9caad6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c @@ -0,0 +1,2650 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +/** + * This file contains Asynchronous System Trap (AST) handlers and related + * LDLM request-processing routines. + * + * An AST is a callback issued on a lock when its state is changed. 
There are + * several different types of ASTs (callbacks) registered for each lock: + * + * - completion AST: when a lock is enqueued by some process, but cannot be + * granted immediately due to other conflicting locks on the same resource, + * the completion AST is sent to notify the caller when the lock is + * eventually granted + * + * - blocking AST: when a lock is granted to some process, if another process + * enqueues a conflicting (blocking) lock on a resource, a blocking AST is + * sent to notify the holder(s) of the lock(s) of the conflicting lock + * request. The lock holder(s) must release their lock(s) on that resource in + * a timely manner or be evicted by the server. + * + * - glimpse AST: this is used when a process wants information about a lock + * (i.e. the lock value block (LVB)) but does not necessarily require holding + * the lock. If the resource is locked, the lock holder(s) are sent glimpse + * ASTs and the LVB is returned to the caller, and lock holder(s) may CANCEL + * their lock(s) if they are idle. If the resource is not locked, the server + * may grant the lock. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include + +#include "ldlm_internal.h" + +unsigned int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; +module_param(ldlm_enqueue_min, uint, 0644); +MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum"); +EXPORT_SYMBOL(ldlm_enqueue_min); + +/* in client side, whether the cached locks will be canceled before replay */ +unsigned int ldlm_cancel_unused_locks_before_replay = 1; + +struct lock_wait_data { + struct ldlm_lock *lwd_lock; + __u32 lwd_conn_cnt; +}; + +struct ldlm_async_args { + struct lustre_handle lock_handle; +}; + +/** + * ldlm_request_bufsize + * + * If opcode=LDLM_ENQUEUE, 1 slot is already occupied, + * LDLM_LOCKREQ_HANDLE -1 slots are available. + * Otherwise, LDLM_LOCKREQ_HANDLE slots are available. 
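+ *
+ * For example, with opcode LDLM_ENQUEUE and count cancel handles the
+ * buffer works out to roughly (see the body below):
+ *
+ *	sizeof(struct ldlm_request) +
+ *	max(0, count - (LDLM_LOCKREQ_HANDLES - LDLM_ENQUEUE_CANCEL_OFF)) *
+ *	sizeof(struct lustre_handle)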
+ * + * \param[in] count + * \param[in] type + * + * \retval size of the request buffer + */ +int ldlm_request_bufsize(int count, int type) +{ + int avail = LDLM_LOCKREQ_HANDLES; + + if (type == LDLM_ENQUEUE) + avail -= LDLM_ENQUEUE_CANCEL_OFF; + + if (count > avail) + avail = (count - avail) * sizeof(struct lustre_handle); + else + avail = 0; + + return sizeof(struct ldlm_request) + avail; +} + +void ldlm_expired_completion_wait(struct lock_wait_data *lwd) +{ + struct ldlm_lock *lock = lwd->lwd_lock; + struct obd_import *imp; + struct obd_device *obd; + + ENTRY; + if (lock->l_conn_export == NULL) { + static time64_t next_dump, last_dump; + + LDLM_ERROR(lock, + "lock timed out (enqueued at %lld, %llds ago); not entering recovery in server code, just going back to sleep", + lock->l_activity, + ktime_get_real_seconds() - lock->l_activity); + if (ktime_get_seconds() > next_dump) { + last_dump = next_dump; + next_dump = ktime_get_seconds() + 300; + ldlm_namespace_dump(D_DLMTRACE, + ldlm_lock_to_ns(lock)); + if (last_dump == 0) + libcfs_debug_dumplog(); + } + RETURN_EXIT; + } + + obd = lock->l_conn_export->exp_obd; + imp = obd->u.cli.cl_import; + ptlrpc_fail_import(imp, lwd->lwd_conn_cnt); + LDLM_ERROR(lock, + "lock timed out (enqueued at %lld, %llds ago), entering recovery for %s@%s", + lock->l_activity, + ktime_get_real_seconds() - lock->l_activity, + obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid); + + EXIT; +} + +int is_granted_or_cancelled_nolock(struct ldlm_lock *lock) +{ + int ret = 0; + + check_res_locked(lock->l_resource); + if (ldlm_is_granted(lock) && !ldlm_is_cp_reqd(lock)) + ret = 1; + else if (ldlm_is_failed(lock) || ldlm_is_cancel(lock)) + ret = 1; + return ret; +} +EXPORT_SYMBOL(is_granted_or_cancelled_nolock); + +/** + * Calculate the Completion timeout (covering enqueue, BL AST, data flush, + * lock cancel, and their replies). Used for lock completion timeout on the + * client side. + * + * \param[in] lock lock which is waiting the completion callback + * + * \retval timeout in seconds to wait for the server reply + */ +/* + * We use the same basis for both server side and client side functions + * from a single node. + */ +static timeout_t ldlm_cp_timeout(struct ldlm_lock *lock) +{ + timeout_t timeout; + + if (AT_OFF) + return obd_timeout; + + /* + * Wait a long time for enqueue - server may have to callback a + * lock from another client. Server will evict the other client if it + * doesn't respond reasonably, and then give us the lock. + */ + timeout = at_get(ldlm_lock_to_ns_at(lock)); + return max(3 * timeout, (timeout_t)ldlm_enqueue_min); +} + +/** + * Helper function for ldlm_completion_ast(), updating timings when lock is + * actually granted. + */ +static int ldlm_completion_tail(struct ldlm_lock *lock, void *data) +{ + int result = 0; + + if (ldlm_is_destroyed(lock) || ldlm_is_failed(lock)) { + LDLM_DEBUG(lock, "client-side enqueue: destroyed"); + result = -EIO; + } else if (data == NULL) { + LDLM_DEBUG(lock, "client-side enqueue: granted"); + } else { + /* Take into AT only CP RPC, not immediately granted locks */ + timeout_t delay = 0; + + /* Discard negative timeouts. 
We should also limit the + * maximum value of the timeout + */ + if (ktime_get_real_seconds() > lock->l_activity) + delay = ktime_get_real_seconds() - lock->l_activity; + + LDLM_DEBUG(lock, "client-side enqueue: granted after %ds", + delay); + /* Update our time estimate */ + at_measured(ldlm_lock_to_ns_at(lock), delay); + } + return result; +} + +/** + * Implementation of ->l_completion_ast() for a client, that doesn't wait + * until lock is granted. Suitable for locks enqueued through ptlrpcd, of + * other threads that cannot block for long. + */ +int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data) +{ + ENTRY; + + if (flags == LDLM_FL_WAIT_NOREPROC) { + LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock"); + RETURN(0); + } + + if (!(flags & LDLM_FL_BLOCKED_MASK)) { + wake_up(&lock->l_waitq); + RETURN(ldlm_completion_tail(lock, data)); + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, going forward"); + ldlm_reprocess_all(lock->l_resource, 0); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_completion_ast_async); + +/** + * Generic LDLM "completion" AST. This is called in several cases: + * + * - when a reply to an ENQUEUE RPC is received from the server + * (ldlm_cli_enqueue_fini()). Lock might be granted or not granted at + * this point (determined by flags); + * + * - when LDLM_CP_CALLBACK RPC comes to client to notify it that lock has + * been granted; + * + * - when ldlm_lock_match(LDLM_FL_LVB_READY) is about to wait until lock + * gets correct lvb; + * + * - to force all locks when resource is destroyed (cleanup_resource()); + * + * If lock is not granted in the first case, this function waits until second + * or penultimate cases happen in some other thread. + * + */ +int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + /* XXX ALLOCATE - 160 bytes */ + struct lock_wait_data lwd; + struct obd_device *obd; + struct obd_import *imp = NULL; + timeout_t timeout; + int rc = 0; + + ENTRY; + + if (flags == LDLM_FL_WAIT_NOREPROC) { + LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock"); + goto noreproc; + } + + if (!(flags & LDLM_FL_BLOCKED_MASK)) { + wake_up(&lock->l_waitq); + RETURN(0); + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked locksleeping"); + +noreproc: + + obd = class_exp2obd(lock->l_conn_export); + + /* if this is a local lock, then there is no import */ + if (obd != NULL) + imp = obd->u.cli.cl_import; + + timeout = ldlm_cp_timeout(lock); + + lwd.lwd_lock = lock; + lock->l_activity = ktime_get_real_seconds(); + + if (imp != NULL) { + spin_lock(&imp->imp_lock); + lwd.lwd_conn_cnt = imp->imp_conn_cnt; + spin_unlock(&imp->imp_lock); + } + + if (ns_is_client(ldlm_lock_to_ns(lock)) && + OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST, + OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) { + ldlm_set_fail_loc(lock); + rc = -EINTR; + } else { + /* Go to sleep until the lock is granted or cancelled. 
*/ + if (ldlm_is_no_timeout(lock)) { + LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); + rc = l_wait_event_abortable( + lock->l_waitq, + is_granted_or_cancelled(lock)); + } else { + if (wait_event_idle_timeout( + lock->l_waitq, + is_granted_or_cancelled(lock), + cfs_time_seconds(timeout)) == 0) { + ldlm_expired_completion_wait(&lwd); + rc = l_wait_event_abortable( + lock->l_waitq, + is_granted_or_cancelled(lock)); + } + } + } + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + RETURN(rc); + } + + RETURN(ldlm_completion_tail(lock, data)); +} +EXPORT_SYMBOL(ldlm_completion_ast); + +/** + * A helper to build a blocking AST function + * + * Perform a common operation for blocking ASTs: + * defferred lock cancellation. + * + * \param lock the lock blocking or canceling AST was called on + * \retval 0 + * \see mdt_blocking_ast + * \see ldlm_blocking_ast + */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock) +{ + int do_ast; + + ENTRY; + + ldlm_set_cbpending(lock); + do_ast = (!lock->l_readers && !lock->l_writers); + unlock_res_and_lock(lock); + + if (do_ast) { + struct lustre_handle lockh; + int rc; + + LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + LDLM_DEBUG(lock, + "Lock still has references, will be cancelled later"); + } + RETURN(0); +} +EXPORT_SYMBOL(ldlm_blocking_ast_nocheck); + +/** + * Server blocking AST + * + * ->l_blocking_ast() callback for LDLM locks acquired by server-side + * OBDs. + * + * \param lock the lock which blocks a request or cancelling lock + * \param desc unused + * \param data unused + * \param flag indicates whether this cancelling or blocking callback + * \retval 0 + * \see ldlm_blocking_ast_nocheck + */ +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + ENTRY; + + if (flag == LDLM_CB_CANCELING) { + /* Don't need to do anything here. */ + RETURN(0); + } + + lock_res_and_lock(lock); + /* + * Get this: if ldlm_blocking_ast is racing with intent_policy, such + * that ldlm_blocking_ast is called just before intent_policy method + * takes the lr_lock, then by the time we get the lock, we might not + * be the correct blocking function anymore. So check, and return + * early, if so. + */ + if (lock->l_blocking_ast != ldlm_blocking_ast) { + unlock_res_and_lock(lock); + RETURN(0); + } + RETURN(ldlm_blocking_ast_nocheck(lock)); +} +EXPORT_SYMBOL(ldlm_blocking_ast); + +/** + * Implements ldlm_lock::l_glimpse_ast for extent locks acquired on the server. + * + * Returning -ELDLM_NO_LOCK_DATA actually works, but the reason for that is + * rather subtle: with OST-side locking, it may so happen that _all_ extent + * locks are held by the OST. If client wants to obtain the current file size + * it calls ll_glimpse_size(), and (as all locks are held only on the server), + * this dummy glimpse callback fires and does nothing. The client still + * receives the correct file size due to the following fragment of code in + * ldlm_cb_interpret(): + * + * if (rc == -ELDLM_NO_LOCK_DATA) { + * LDLM_DEBUG(lock, "lost race - client has a lock but no" + * "inode"); + * ldlm_res_lvbo_update(lock->l_resource, NULL, 1); + * } + * + * That is, after the glimpse returns this error, ofd_lvbo_update() is called + * and returns the updated file attributes from the inode to the client. 
+ * + * See also comment in ofd_intent_policy() on why servers must set a non-NULL + * l_glimpse_ast when grabbing DLM locks. Otherwise, the server will assume + * that the object is in the process of being destroyed. + * + * \param[in] lock DLM lock being glimpsed, unused + * \param[in] reqp pointer to ptlrpc_request, unused + * + * \retval -ELDLM_NO_LOCK_DATA to get attributes from disk object + */ +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp) +{ + return -ELDLM_NO_LOCK_DATA; +} + +/** + * Enqueue a local lock (typically on a server). + */ +int ldlm_cli_enqueue_local(const struct lu_env *env, + struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_type type, union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh) +{ + struct ldlm_lock *lock; + int err; + const struct ldlm_callback_suite cbs = { .lcs_completion = completion, + .lcs_blocking = blocking, + .lcs_glimpse = glimpse, + }; + + ENTRY; + + LASSERT(!(*flags & LDLM_FL_REPLAY)); + if (unlikely(ns_is_client(ns))) { + CERROR("Trying to enqueue local lock in a shadow namespace\n"); + LBUG(); + } + + lock = ldlm_lock_create(ns, res_id, type, mode, &cbs, data, lvb_len, + lvb_type); + if (IS_ERR(lock)) + GOTO(out_nolock, err = PTR_ERR(lock)); + + err = ldlm_lvbo_init(lock->l_resource); + if (err < 0) { + LDLM_ERROR(lock, "delayed lvb init failed (rc %d)", err); + ldlm_lock_destroy_nolock(lock); + GOTO(out, err); + } + + ldlm_lock2handle(lock, lockh); + + /* + * NB: we don't have any lock now (lock_res_and_lock) + * because it's a new lock + */ + ldlm_lock_addref_internal_nolock(lock, mode); + ldlm_set_local(lock); + if (*flags & LDLM_FL_ATOMIC_CB) + ldlm_set_atomic_cb(lock); + + if (*flags & LDLM_FL_CANCEL_ON_BLOCK) + ldlm_set_cancel_on_block(lock); + + if (policy != NULL) + lock->l_policy_data = *policy; + if (client_cookie != NULL) + lock->l_client_cookie = *client_cookie; + if (type == LDLM_EXTENT) { + /* extent lock without policy is a bug */ + if (policy == NULL) + LBUG(); + + lock->l_req_extent = policy->l_extent; + } + + err = ldlm_lock_enqueue(env, ns, &lock, policy, flags); + if (unlikely(err != ELDLM_OK)) + GOTO(out, err); + + if (policy != NULL) + *policy = lock->l_policy_data; + + if (lock->l_completion_ast) + lock->l_completion_ast(lock, *flags, NULL); + + LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created"); + EXIT; + out: + LDLM_LOCK_RELEASE(lock); + out_nolock: + return err; +} +EXPORT_SYMBOL(ldlm_cli_enqueue_local); + +static void failed_lock_cleanup(struct ldlm_namespace *ns, + struct ldlm_lock *lock, int mode) +{ + int need_cancel = 0; + + /* Set a flag to prevent us from sending a CANCEL (b=407) */ + lock_res_and_lock(lock); + /* Check that lock is not granted or failed, we might race. */ + if (!ldlm_is_granted(lock) && !ldlm_is_failed(lock)) { + /* + * Make sure that this lock will not be found by raced + * bl_ast and -EINVAL reply is sent to server anyways. 
+ * b=17645
+ */
+ lock->l_flags |= LDLM_FL_FAILED |
+ LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING;
+ if (!(ldlm_is_bl_ast(lock) &&
+ lock->l_remote_handle.cookie != 0))
+ lock->l_flags |= LDLM_FL_LOCAL_ONLY;
+ need_cancel = 1;
+ }
+ unlock_res_and_lock(lock);
+
+ if (need_cancel)
+ LDLM_DEBUG(lock,
+ "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING");
+ else
+ LDLM_DEBUG(lock, "lock was granted or failed in race");
+
+ /*
+ * XXX - HACK because we shouldn't call ldlm_lock_destroy()
+ * from llite/file.c/ll_file_flock().
+ */
+ /*
+ * This code accounts for the fact that we do not have a blocking
+ * handler on the client for flock locks. As such, this is the place
+ * where we must completely kill failed locks (both interrupted ones
+ * and those that were waiting to be granted when the server evicted us).
+ */
+ if (lock->l_resource->lr_type == LDLM_FLOCK) {
+ lock_res_and_lock(lock);
+ if (!ldlm_is_destroyed(lock)) {
+ ldlm_resource_unlink_lock(lock);
+ ldlm_lock_decref_internal_nolock(lock, mode);
+ ldlm_lock_destroy_nolock(lock);
+ }
+ unlock_res_and_lock(lock);
+ } else {
+ ldlm_lock_decref_internal(lock, mode);
+ }
+}
+
+static bool ldlm_request_slot_needed(struct ldlm_enqueue_info *einfo)
+{
+ /* exclude EXTENT locks and DOM-only IBITS locks because they
+ * are asynchronous and don't wait on the server being blocked.
+ */
+ return einfo->ei_req_slot &&
+ (einfo->ei_type == LDLM_FLOCK ||
+ (einfo->ei_type == LDLM_IBITS &&
+ einfo->ei_inodebits != MDS_INODELOCK_DOM));
+}
+
+/**
+ * Finishing portion of client lock enqueue code.
+ *
+ * Called after receiving reply from server.
+ */
+int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
+ struct ldlm_enqueue_info *einfo,
+ __u8 with_policy, __u64 *ldlm_flags, void *lvb,
+ __u32 lvb_len, const struct lustre_handle *lockh,
+ int rc, bool request_slot)
+{
+ struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+ const struct lu_env *env = NULL;
+ int is_replay = *ldlm_flags & LDLM_FL_REPLAY;
+ struct ldlm_lock *lock;
+ struct ldlm_reply *reply;
+ int cleanup_phase = 1;
+
+ ENTRY;
+
+ if (request_slot)
+ obd_put_request_slot(&req->rq_import->imp_obd->u.cli);
+
+ ptlrpc_put_mod_rpc_slot(req);
+
+ if (req && req->rq_svc_thread)
+ env = req->rq_svc_thread->t_env;
+
+ lock = ldlm_handle2lock(lockh);
+ /* ldlm_cli_enqueue is holding a reference on this lock. */
+ if (!lock) {
+ LASSERT(einfo->ei_type == LDLM_FLOCK);
+ RETURN(-ENOLCK);
+ }
+
+ LASSERTF(ergo(lvb_len != 0, lvb_len == lock->l_lvb_len),
+ "lvb_len = %d, l_lvb_len = %d\n", lvb_len, lock->l_lvb_len);
+
+ if (rc != ELDLM_OK) {
+ LASSERT(!is_replay);
+ LDLM_DEBUG(lock, "client-side enqueue END (%s)",
+ rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED");
+
+ if (rc != ELDLM_LOCK_ABORTED)
+ GOTO(cleanup, rc);
+ }
+
+ /* Before we return, swab the reply */
+ reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+ if (reply == NULL)
+ GOTO(cleanup, rc = -EPROTO);
+
+ if (lvb_len > 0) {
+ int size = 0;
+
+ size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB,
+ RCL_SERVER);
+ if (size < 0) {
+ LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", size);
+ GOTO(cleanup, rc = size);
+ } else if (unlikely(size > lvb_len)) {
+ LDLM_ERROR(lock,
+ "Replied LVB is larger than expectation, expected = %d, replied = %d",
+ lvb_len, size);
+ GOTO(cleanup, rc = -EINVAL);
+ }
+ lvb_len = size;
+ }
+
+ if (rc == ELDLM_LOCK_ABORTED) {
+ if (lvb_len > 0 && lvb != NULL)
+ rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER,
+ lvb, lvb_len);
+ GOTO(cleanup, rc = rc ?
: ELDLM_LOCK_ABORTED); + } + + /* lock enqueued on the server */ + cleanup_phase = 0; + + lock_res_and_lock(lock); + /* Key change rehash lock in per-export hash with new key */ + if (exp->exp_lock_hash) { + /* + * In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() + */ + /* coverity[overrun-buffer-val] */ + cfs_hash_rehash_key(exp->exp_lock_hash, + &lock->l_remote_handle, + &reply->lock_handle, + &lock->l_exp_hash); + } else { + lock->l_remote_handle = reply->lock_handle; + } + + *ldlm_flags = ldlm_flags_from_wire(reply->lock_flags); + lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags & + LDLM_FL_INHERIT_MASK); + unlock_res_and_lock(lock); + + CDEBUG(D_INFO, "local: %p, remote cookie: %#llx, flags: %#llx\n", + lock, reply->lock_handle.cookie, *ldlm_flags); + + /* + * If enqueue returned a blocked lock but the completion handler has + * already run, then it fixed up the resource and we don't need to do it + * again. + */ + if ((*ldlm_flags) & LDLM_FL_LOCK_CHANGED) { + int newmode = reply->lock_desc.l_req_mode; + + LASSERT(!is_replay); + if (newmode && newmode != lock->l_req_mode) { + LDLM_DEBUG(lock, "server returned different mode %s", + ldlm_lockname[newmode]); + lock->l_req_mode = newmode; + } + + if (!ldlm_res_eq(&reply->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name)) { + CDEBUG(D_INFO, + "remote intent success, locking "DLDLMRES", instead of "DLDLMRES"\n", + PLDLMRES(&reply->lock_desc.l_resource), + PLDLMRES(lock->l_resource)); + + rc = ldlm_lock_change_resource(ns, lock, + &reply->lock_desc.l_resource.lr_name); + if (rc || lock->l_resource == NULL) + GOTO(cleanup, rc = -ENOMEM); + LDLM_DEBUG(lock, "client-side enqueue, new resource"); + } + + if (with_policy) { + /* We assume lock type cannot change on server*/ + ldlm_convert_policy_to_local(exp, + lock->l_resource->lr_type, + &reply->lock_desc.l_policy_data, + &lock->l_policy_data); + } + + if (einfo->ei_type != LDLM_PLAIN) + LDLM_DEBUG(lock, + "client-side enqueue, new policy data"); + } + + if ((*ldlm_flags) & LDLM_FL_AST_SENT) { + lock_res_and_lock(lock); + ldlm_bl_desc2lock(&reply->lock_desc, lock); + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "enqueue reply includes blocking AST"); + } + + /* + * If the lock has already been granted by a completion AST, don't + * clobber the LVB with an older one. + */ + if (lvb_len > 0) { + /* + * We must lock or a racing completion might update lvb without + * letting us know and we'll clobber the correct value. 
+ * Cannot unlock after the check either, a that still leaves + * a tiny window for completion to get in + */ + lock_res_and_lock(lock); + if (!ldlm_is_granted(lock)) + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, + lock->l_lvb_data, lvb_len); + unlock_res_and_lock(lock); + if (rc < 0) { + cleanup_phase = 1; + GOTO(cleanup, rc); + } + } + + if (!is_replay) { + rc = ldlm_lock_enqueue(env, ns, &lock, NULL, ldlm_flags); + if (lock->l_completion_ast != NULL) { + int err = lock->l_completion_ast(lock, *ldlm_flags, + NULL); + + if (!rc) + rc = err; + if (rc) + cleanup_phase = 1; + } + } + + if (lvb_len > 0 && lvb != NULL) { + /* + * Copy the LVB here, and not earlier, because the completion + * AST (if any) can override what we got in the reply + */ + memcpy(lvb, lock->l_lvb_data, lvb_len); + } + + LDLM_DEBUG(lock, "client-side enqueue END"); + EXIT; +cleanup: + if (cleanup_phase == 1 && rc) + failed_lock_cleanup(ns, lock, einfo->ei_mode); + /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */ + LDLM_LOCK_PUT(lock); + LDLM_LOCK_RELEASE(lock); + return rc; +} +EXPORT_SYMBOL(ldlm_cli_enqueue_fini); + +/** + * Estimate number of lock handles that would fit into request of given + * size. PAGE_SIZE-512 is to allow TCP/IP and LNET headers to fit into + * a single page on the send/receive side. XXX: 512 should be changed to + * more adequate value. + */ +static inline int ldlm_req_handles_avail(int req_size, int off) +{ + int avail; + + avail = min_t(int, LDLM_MAXREQSIZE, PAGE_SIZE - 512) - req_size; + if (likely(avail >= 0)) + avail /= (int)sizeof(struct lustre_handle); + else + avail = 0; + avail += LDLM_LOCKREQ_HANDLES - off; + + return avail; +} + +static inline int ldlm_capsule_handles_avail(struct req_capsule *pill, + enum req_location loc, + int off) +{ + __u32 size = req_capsule_msg_size(pill, loc); + + return ldlm_req_handles_avail(size, off); +} + +static inline int ldlm_format_handles_avail(struct obd_import *imp, + const struct req_format *fmt, + enum req_location loc, int off) +{ + __u32 size = req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc); + + return ldlm_req_handles_avail(size, off); +} + +/** + * Cancel LRU locks and pack them into the enqueue request. Pack there the given + * \a count locks in \a cancels. + * + * This is to be called by functions preparing their own requests that + * might contain lists of locks to cancel in addition to actual operation + * that needs to be performed. + */ +int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + struct req_capsule *pill = &req->rq_pill; + struct ldlm_request *dlm = NULL; + LIST_HEAD(head); + int avail, to_free = 0, pack = 0; + int rc; + + ENTRY; + + if (cancels == NULL) + cancels = &head; + if (ns_connect_cancelset(ns)) { + /* Estimate the amount of available space in the request. */ + req_capsule_filled_sizes(pill, RCL_CLIENT); + avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff); + + /* If we have reached the limit, free +1 slot for the new one */ + if (!ns_connect_lru_resize(ns) && opc == LDLM_ENQUEUE && + ns->ns_nr_unused >= ns->ns_max_unused) + to_free = 1; + + /* + * Cancel LRU locks here _only_ if the server supports + * EARLY_CANCEL. Otherwise we have to send extra CANCEL + * RPC, which will make us slower. 
+ */ + if (avail > count) + count += ldlm_cancel_lru_local(ns, cancels, to_free, + avail - count, 0, + LDLM_LRU_FLAG_NO_WAIT); + if (avail > count) + pack = count; + else + pack = avail; + req_capsule_set_size(pill, &RMF_DLM_REQ, RCL_CLIENT, + ldlm_request_bufsize(pack, opc)); + } + + rc = ptlrpc_request_pack(req, version, opc); + if (rc) { + ldlm_lock_list_put(cancels, l_bl_ast, count); + RETURN(rc); + } + + if (ns_connect_cancelset(ns)) { + if (canceloff) { + dlm = req_capsule_client_get(pill, &RMF_DLM_REQ); + LASSERT(dlm); + /* + * Skip first lock handler in ldlm_request_pack(), + * this method will increment @lock_count according + * to the lock handle amount actually written to + * the buffer. + */ + dlm->lock_count = canceloff; + } + /* Pack into the request @pack lock handles. */ + ldlm_cli_cancel_list(cancels, pack, req, 0); + /* Prepare and send separate cancel RPC for others. */ + ldlm_cli_cancel_list(cancels, count - pack, NULL, 0); + } else { + ldlm_lock_list_put(cancels, l_bl_ast, count); + } + RETURN(0); +} +EXPORT_SYMBOL(ldlm_prep_elc_req); + +int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req, + struct list_head *cancels, int count) +{ + return ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE, + LDLM_ENQUEUE_CANCEL_OFF, cancels, count); +} +EXPORT_SYMBOL(ldlm_prep_enqueue_req); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len) +{ + struct ptlrpc_request *req; + int rc; + + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); + ptlrpc_request_set_replen(req); + RETURN(req); +} +EXPORT_SYMBOL(ldlm_enqueue_pack); + +/** + * Client-side lock enqueue. + * + * If a request has some specific initialisation it is passed in \a reqp, + * otherwise it is created in ldlm_cli_enqueue. + * + * Supports sync and async requests, pass \a async flag accordingly. If a + * request was created in ldlm_cli_enqueue and it is the async request, + * pass it to the caller in \a reqp. + */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + union ldlm_policy_data const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async) +{ + struct ldlm_namespace *ns; + struct ldlm_lock *lock; + struct ldlm_request *body; + int is_replay = *flags & LDLM_FL_REPLAY; + int req_passed_in = 1; + int rc, err; + bool need_req_slot; + struct ptlrpc_request *req; + + ENTRY; + + LASSERT(exp != NULL); + + ns = exp->exp_obd->obd_namespace; + + /* + * If we're replaying this lock, just check some invariants. + * If we're creating a new lock, get everything all setup nice. 
+ */ + if (is_replay) { + lock = ldlm_handle2lock_long(lockh, 0); + LASSERT(lock != NULL); + LDLM_DEBUG(lock, "client-side enqueue START"); + LASSERT(exp == lock->l_conn_export); + } else { + const struct ldlm_callback_suite cbs = { + .lcs_completion = einfo->ei_cb_cp, + .lcs_blocking = einfo->ei_cb_bl, + .lcs_glimpse = einfo->ei_cb_gl + }; + lock = ldlm_lock_create(ns, res_id, einfo->ei_type, + einfo->ei_mode, &cbs, einfo->ei_cbdata, + lvb_len, lvb_type); + if (IS_ERR(lock)) + RETURN(PTR_ERR(lock)); + + if (einfo->ei_cb_created) + einfo->ei_cb_created(lock); + + /* for the local lock, add the reference */ + ldlm_lock_addref_internal(lock, einfo->ei_mode); + ldlm_lock2handle(lock, lockh); + if (policy != NULL) + lock->l_policy_data = *policy; + + if (einfo->ei_type == LDLM_EXTENT) { + /* extent lock without policy is a bug */ + if (policy == NULL) + LBUG(); + + lock->l_req_extent = policy->l_extent; + } + LDLM_DEBUG(lock, "client-side enqueue START, flags %#llx", + *flags); + } + + lock->l_conn_export = exp; + lock->l_export = NULL; + lock->l_blocking_ast = einfo->ei_cb_bl; + lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL | + LDLM_FL_ATOMIC_CB)); + lock->l_activity = ktime_get_real_seconds(); + + /* lock not sent to server yet */ + if (reqp == NULL || *reqp == NULL) { + req = ldlm_enqueue_pack(exp, lvb_len); + if (IS_ERR(req)) { + failed_lock_cleanup(ns, lock, einfo->ei_mode); + LDLM_LOCK_RELEASE(lock); + RETURN(PTR_ERR(req)); + } + + req_passed_in = 0; + if (reqp) + *reqp = req; + } else { + int len; + + req = *reqp; + len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, + RCL_CLIENT); + LASSERTF(len >= sizeof(*body), "buflen[%d] = %d, not %d\n", + DLM_LOCKREQ_OFF, len, (int)sizeof(*body)); + } + + if (*flags & LDLM_FL_NDELAY) { + DEBUG_REQ(D_DLMTRACE, req, "enqueue lock with no delay"); + req->rq_no_resend = req->rq_no_delay = 1; + /* + * probably set a shorter timeout value and handle ETIMEDOUT + * in osc_lock_upcall() correctly + */ + /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */ + } + + /* Dump lock data into the request buffer */ + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + ldlm_lock2desc(lock, &body->lock_desc); + body->lock_flags = ldlm_flags_to_wire(*flags); + body->lock_handle[0] = *lockh; + + /* extended LDLM opcodes in client stats */ + if (exp->exp_obd->obd_svc_stats != NULL) { + /* glimpse is intent with no intent buffer */ + if (*flags & LDLM_FL_HAS_INTENT && + !req_capsule_has_field(&req->rq_pill, &RMF_LDLM_INTENT, + RCL_CLIENT)) + lprocfs_counter_incr(exp->exp_obd->obd_svc_stats, + PTLRPC_LAST_CNTR + + LDLM_GLIMPSE_ENQUEUE); + else + ldlm_svc_get_eopc(body, exp->exp_obd->obd_svc_stats); + } + + /* It is important to obtain modify RPC slot first (if applicable), so + * that threads that are waiting for a modify RPC slot are not polluting + * our rpcs in flight counter. */ + + if (einfo->ei_mod_slot) + ptlrpc_get_mod_rpc_slot(req); + + need_req_slot = ldlm_request_slot_needed(einfo); + + if (need_req_slot) { + rc = obd_get_request_slot(&req->rq_import->imp_obd->u.cli); + if (rc) { + if (einfo->ei_mod_slot) + ptlrpc_put_mod_rpc_slot(req); + failed_lock_cleanup(ns, lock, einfo->ei_mode); + LDLM_LOCK_RELEASE(lock); + if (!req_passed_in) + ptlrpc_req_finished(req); + GOTO(out, rc); + } + } + + if (async) { + LASSERT(reqp != NULL); + RETURN(0); + } + + LDLM_DEBUG(lock, "sending request"); + + rc = ptlrpc_queue_wait(req); + + err = ldlm_cli_enqueue_fini(exp, req, einfo, policy ? 
1 : 0, flags, + lvb, lvb_len, lockh, rc, need_req_slot); + + /* + * If ldlm_cli_enqueue_fini did not find the lock, we need to free + * one reference that we took + */ + if (err == -ENOLCK) + LDLM_LOCK_RELEASE(lock); + else + rc = err; + +out: + if (!req_passed_in && req != NULL) { + ptlrpc_req_finished(req); + if (reqp) + *reqp = NULL; + } + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_cli_enqueue); + +/** + * Client-side IBITS lock convert. + * + * Inform server that lock has been converted instead of canceling. + * Server finishes convert on own side and does reprocess to grant + * all related waiting locks. + * + * Since convert means only ibits downgrading, client doesn't need to + * wait for server reply to finish local converting process so this request + * is made asynchronous. + * + */ +int ldlm_cli_convert_req(struct ldlm_lock *lock, __u32 *flags, __u64 new_bits) +{ + struct ldlm_request *body; + struct ptlrpc_request *req; + struct obd_export *exp = lock->l_conn_export; + + ENTRY; + + LASSERT(exp != NULL); + + /* + * this is better to check earlier and it is done so already, + * but this check is kept too as final one to issue an error + * if any new code will miss such check. + */ + if (!exp_connect_lock_convert(exp)) { + LDLM_ERROR(lock, "server doesn't support lock convert\n"); + RETURN(-EPROTO); + } + + if (lock->l_resource->lr_type != LDLM_IBITS) { + LDLM_ERROR(lock, "convert works with IBITS locks only."); + RETURN(-EINVAL); + } + + LDLM_DEBUG(lock, "client-side convert"); + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION, + LDLM_CONVERT); + if (req == NULL) + RETURN(-ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + + body->lock_desc.l_req_mode = lock->l_req_mode; + body->lock_desc.l_granted_mode = lock->l_granted_mode; + + body->lock_desc.l_policy_data.l_inodebits.bits = new_bits; + body->lock_desc.l_policy_data.l_inodebits.cancel_bits = 0; + + body->lock_flags = ldlm_flags_to_wire(*flags); + body->lock_count = 1; + + ptlrpc_request_set_replen(req); + + /* + * Use cancel portals for convert as well as high-priority handling. + */ + req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; + req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; + + ptlrpc_at_set_req_timeout(req); + + if (exp->exp_obd->obd_svc_stats != NULL) + lprocfs_counter_incr(exp->exp_obd->obd_svc_stats, + LDLM_CONVERT - LDLM_FIRST_OPC); + + ptlrpcd_add_req(req); + RETURN(0); +} + +/** + * Cancel locks locally. + * Returns: + * \retval LDLM_FL_LOCAL_ONLY if there is no need for a CANCEL RPC to the server + * \retval LDLM_FL_CANCELING otherwise; + * \retval LDLM_FL_BL_AST if there is a need for a separate CANCEL RPC. + */ +static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) +{ + __u64 rc = LDLM_FL_LOCAL_ONLY; + + ENTRY; + + if (lock->l_conn_export) { + bool local_only; + + LDLM_DEBUG(lock, "client-side cancel"); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL, + cfs_fail_val); + + /* Set this flag to prevent others from getting new references*/ + lock_res_and_lock(lock); + ldlm_set_cbpending(lock); + local_only = !!(lock->l_flags & + (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK)); + ldlm_cancel_callback(lock); + rc = (ldlm_is_bl_ast(lock)) ? 
+ LDLM_FL_BL_AST : LDLM_FL_CANCELING;
+ unlock_res_and_lock(lock);
+
+ if (local_only) {
+ CDEBUG(D_DLMTRACE,
+ "not sending request (at caller's instruction)\n");
+ rc = LDLM_FL_LOCAL_ONLY;
+ }
+ ldlm_lock_cancel(lock);
+ } else {
+ if (ns_is_client(ldlm_lock_to_ns(lock))) {
+ LDLM_ERROR(lock, "Trying to cancel local lock");
+ LBUG();
+ }
+ LDLM_DEBUG(lock, "server-side local cancel");
+ ldlm_lock_cancel(lock);
+ ldlm_reprocess_all(lock->l_resource,
+ lock->l_policy_data.l_inodebits.bits);
+ }
+
+ RETURN(rc);
+}
+
+/**
+ * Pack \a count locks in \a head into ldlm_request buffer of request \a req.
+ */
+static void ldlm_cancel_pack(struct ptlrpc_request *req,
+ struct list_head *head, int count)
+{
+ struct ldlm_request *dlm;
+ struct ldlm_lock *lock;
+ int max, packed = 0;
+
+ ENTRY;
+
+ dlm = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+ LASSERT(dlm != NULL);
+
+ /* Check the room in the request buffer. */
+ max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) -
+ sizeof(struct ldlm_request);
+ max /= sizeof(struct lustre_handle);
+ max += LDLM_LOCKREQ_HANDLES;
+ LASSERT(max >= dlm->lock_count + count);
+
+ /*
+ * XXX: it would be better to pack lock handles grouped by resource,
+ * so that the server cancel would call filter_lvbo_update() less
+ * frequently.
+ */
+ list_for_each_entry(lock, head, l_bl_ast) {
+ if (!count--)
+ break;
+ LASSERT(lock->l_conn_export);
+ /* Pack the lock handle to the given request buffer. */
+ LDLM_DEBUG(lock, "packing");
+ dlm->lock_handle[dlm->lock_count++] = lock->l_remote_handle;
+ packed++;
+ }
+ CDEBUG(D_DLMTRACE, "%d locks packed\n", packed);
+ EXIT;
+}
+
+/**
+ * Prepare and send a batched cancel RPC. It will include \a count lock
+ * handles of locks given in \a cancels list.
+ */
+int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
+ int count, enum ldlm_cancel_flags flags)
+{
+ struct ptlrpc_request *req = NULL;
+ struct obd_import *imp;
+ int free, sent = 0;
+ int rc = 0;
+
+ ENTRY;
+
+ LASSERT(exp != NULL);
+ LASSERT(count > 0);
+
+ CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val);
+
+ if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE))
+ RETURN(count);
+
+ free = ldlm_format_handles_avail(class_exp2cliimp(exp),
+ &RQF_LDLM_CANCEL, RCL_CLIENT, 0);
+ if (count > free)
+ count = free;
+
+ while (1) {
+ imp = class_exp2cliimp(exp);
+ if (imp == NULL || imp->imp_invalid) {
+ CDEBUG(D_DLMTRACE,
+ "skipping cancel on invalid import %p\n", imp);
+ RETURN(count);
+ }
+
+ req = ptlrpc_request_alloc(imp, &RQF_LDLM_CANCEL);
+ if (req == NULL)
+ GOTO(out, rc = -ENOMEM);
+
+ req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT);
+ req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT,
+ ldlm_request_bufsize(count, LDLM_CANCEL));
+
+ rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CANCEL);
+ if (rc) {
+ ptlrpc_request_free(req);
+ GOTO(out, rc);
+ }
+
+ /*
+ * If OSP wants to cancel a cross-MDT lock, do not block it
+ * during recovery; otherwise the lock will not be released. If
+ * the remote target is also in recovery and needs this lock
+ * as well, that could cause a deadlock.
+ */ + if (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS && + exp->exp_obd->obd_lu_dev != NULL && + exp->exp_obd->obd_lu_dev->ld_site != NULL) { + struct lu_device *top_dev; + + top_dev = exp->exp_obd->obd_lu_dev->ld_site->ls_top_dev; + if (top_dev != NULL && + top_dev->ld_obd->obd_recovering) + req->rq_allow_replay = 1; + } + + req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; + req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(req); + + ldlm_cancel_pack(req, cancels, count); + + ptlrpc_request_set_replen(req); + if (flags & LCF_ASYNC) { + ptlrpcd_add_req(req); + sent = count; + GOTO(out, 0); + } + + rc = ptlrpc_queue_wait(req); + if (rc == LUSTRE_ESTALE) { + CDEBUG(D_DLMTRACE, + "client/server (nid %s) out of sync -- not fatal\n", + libcfs_nidstr(&req->rq_import->imp_connection->c_peer.nid)); + rc = 0; + } else if (rc == -ETIMEDOUT && /* check there was no reconnect*/ + req->rq_import_generation == imp->imp_generation) { + ptlrpc_req_finished(req); + continue; + } else if (rc != ELDLM_OK) { + /* -ESHUTDOWN is common on umount */ + CDEBUG_LIMIT(rc == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, + "Got rc %d from cancel RPC: canceling anyway\n", + rc); + break; + } + sent = count; + break; + } + + ptlrpc_req_finished(req); + EXIT; +out: + return sent ? sent : rc; +} + +static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp) +{ + LASSERT(imp != NULL); + return &imp->imp_obd->obd_namespace->ns_pool; +} + +/** + * Update client's OBD pool related fields with new SLV and Limit from \a req. + */ +int ldlm_cli_update_pool(struct ptlrpc_request *req) +{ + struct ldlm_namespace *ns; + struct obd_device *obd; + __u64 new_slv, ratio; + __u32 new_limit; + + ENTRY; + if (unlikely(!req->rq_import || !req->rq_import->imp_obd || + !imp_connect_lru_resize(req->rq_import))) + /* Do nothing for corner cases. */ + RETURN(0); + + /* + * In some cases RPC may contain SLV and limit zeroed out. This + * is the case when server does not support LRU resize feature. + * This is also possible in some recovery cases when server-side + * reqs have no reference to the OBD export and thus access to + * server-side namespace is not possible. + */ + if (lustre_msg_get_slv(req->rq_repmsg) == 0 || + lustre_msg_get_limit(req->rq_repmsg) == 0) { + DEBUG_REQ(D_HA, req, + "Zero SLV or limit found (SLV=%llu, limit=%u)", + lustre_msg_get_slv(req->rq_repmsg), + lustre_msg_get_limit(req->rq_repmsg)); + RETURN(0); + } + + new_limit = lustre_msg_get_limit(req->rq_repmsg); + new_slv = lustre_msg_get_slv(req->rq_repmsg); + obd = req->rq_import->imp_obd; + + read_lock(&obd->obd_pool_lock); + if (obd->obd_pool_slv == new_slv && + obd->obd_pool_limit == new_limit) { + read_unlock(&obd->obd_pool_lock); + RETURN(0); + } + read_unlock(&obd->obd_pool_lock); + + /* + * OBD device keeps the new pool attributes before they are handled by + * the pool. + */ + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = new_slv; + obd->obd_pool_limit = new_limit; + write_unlock(&obd->obd_pool_lock); + + /* + * Check if an urgent pool recalc is needed, let it to be a change of + * SLV on 10%. It is applicable to LRU resize enabled case only. 
+ */ + ns = obd->obd_namespace; + if (!ns_connect_lru_resize(ns) || + ldlm_pool_get_slv(&ns->ns_pool) < new_slv) + RETURN(0); + + ratio = 100 * new_slv / ldlm_pool_get_slv(&ns->ns_pool); + if (100 - ratio >= ns->ns_recalc_pct && + !ns->ns_stopping && !ns->ns_rpc_recalc) { + bool recalc = false; + + spin_lock(&ns->ns_lock); + if (!ns->ns_stopping && !ns->ns_rpc_recalc) { + ldlm_namespace_get(ns); + recalc = true; + ns->ns_rpc_recalc = 1; + } + spin_unlock(&ns->ns_lock); + if (recalc) + ldlm_bl_to_thread_ns(ns); + } + + RETURN(0); +} + +int ldlm_cli_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags) +{ + int rc = -EINVAL; + + LASSERT(!lock->l_readers && !lock->l_writers); + LDLM_DEBUG(lock, "client lock convert START"); + + if (lock->l_resource->lr_type == LDLM_IBITS) { + lock_res_and_lock(lock); + do { + rc = ldlm_cli_inodebits_convert(lock, cancel_flags); + } while (rc == -EAGAIN); + unlock_res_and_lock(lock); + } + + LDLM_DEBUG(lock, "client lock convert END"); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_cli_convert); + +/** + * Client side lock cancel. + * + * Lock must not have any readers or writers by this time. + */ +int ldlm_cli_cancel(const struct lustre_handle *lockh, + enum ldlm_cancel_flags cancel_flags) +{ + struct obd_export *exp; + int avail, count = 1; + __u64 rc = 0; + struct ldlm_namespace *ns; + struct ldlm_lock *lock; + LIST_HEAD(cancels); + + ENTRY; + + lock = ldlm_handle2lock_long(lockh, 0); + if (lock == NULL) { + LDLM_DEBUG_NOLOCK("lock is already being destroyed"); + RETURN(0); + } + + lock_res_and_lock(lock); + LASSERT(!ldlm_is_converting(lock)); + + /* Lock is being canceled and the caller doesn't want to wait */ + if (ldlm_is_canceling(lock)) { + if (cancel_flags & LCF_ASYNC) { + unlock_res_and_lock(lock); + } else { + unlock_res_and_lock(lock); + wait_event_idle(lock->l_waitq, is_bl_done(lock)); + } + LDLM_LOCK_RELEASE(lock); + RETURN(0); + } + + ldlm_set_canceling(lock); + unlock_res_and_lock(lock); + + if (cancel_flags & LCF_LOCAL) + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE, + cfs_fail_val); + + rc = ldlm_cli_cancel_local(lock); + if (rc == LDLM_FL_LOCAL_ONLY || cancel_flags & LCF_LOCAL) { + LDLM_LOCK_RELEASE(lock); + RETURN(0); + } + /* + * Even if the lock is marked as LDLM_FL_BL_AST, this is a LDLM_CANCEL + * RPC which goes to canceld portal, so we can cancel other LRU locks + * here and send them all as one LDLM_CANCEL RPC. + */ + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, &cancels); + + exp = lock->l_conn_export; + if (exp_connect_cancelset(exp)) { + avail = ldlm_format_handles_avail(class_exp2cliimp(exp), + &RQF_LDLM_CANCEL, + RCL_CLIENT, 0); + LASSERT(avail > 0); + + ns = ldlm_lock_to_ns(lock); + count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1, + LCF_BL_AST, 0); + } + ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel); + +/** + * Locally cancel up to \a count locks in list \a cancels. + * Return the number of cancelled locks. 
+ */ +int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags) +{ + LIST_HEAD(head); + struct ldlm_lock *lock, *next; + int left = 0, bl_ast = 0; + __u64 rc; + + left = count; + list_for_each_entry_safe(lock, next, cancels, l_bl_ast) { + if (left-- == 0) + break; + + if (cancel_flags & LCF_LOCAL) { + rc = LDLM_FL_LOCAL_ONLY; + ldlm_lock_cancel(lock); + } else { + rc = ldlm_cli_cancel_local(lock); + } + /* + * Until we have compound requests and can send LDLM_CANCEL + * requests batched with generic RPCs, we need to send cancels + * with the LDLM_FL_BL_AST flag in a separate RPC from + * the one being generated now. + */ + if (!(cancel_flags & LCF_BL_AST) && (rc == LDLM_FL_BL_AST)) { + LDLM_DEBUG(lock, "Cancel lock separately"); + list_move(&lock->l_bl_ast, &head); + bl_ast++; + continue; + } + if (rc == LDLM_FL_LOCAL_ONLY) { + /* CANCEL RPC should not be sent to server. */ + list_del_init(&lock->l_bl_ast); + LDLM_LOCK_RELEASE(lock); + count--; + } + } + if (bl_ast > 0) { + count -= bl_ast; + ldlm_cli_cancel_list(&head, bl_ast, NULL, 0); + } + + RETURN(count); +} + +/** + * Cancel as many locks as possible w/o sending any RPCs (e.g. to write back + * dirty data, to close a file, ...) or waiting for any RPCs in-flight (e.g. + * readahead requests, ...) + */ +static enum ldlm_policy_res +ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, + int added, int min) +{ + enum ldlm_policy_res result = LDLM_POLICY_CANCEL_LOCK; + + /* + * don't check @added & @min since we want to process all locks + * from unused list. + * It's fine to not take lock to access lock->l_resource since + * the lock has already been granted so it won't change. + */ + switch (lock->l_resource->lr_type) { + case LDLM_EXTENT: + case LDLM_IBITS: + if (ns->ns_cancel != NULL && ns->ns_cancel(lock) != 0) + break; + fallthrough; + default: + result = LDLM_POLICY_SKIP_LOCK; + break; + } + + RETURN(result); +} + +/** + * Callback function for LRU-resize policy. Decides whether to keep + * \a lock in LRU for \a added in current scan and \a min number of locks + * to be preferably canceled. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static enum ldlm_policy_res ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int added, int min) +{ + ktime_t cur = ktime_get(); + struct ldlm_pool *pl = &ns->ns_pool; + u64 slv, lvf, lv; + s64 la; + + if (added < min) + return LDLM_POLICY_CANCEL_LOCK; + + /* + * Despite of the LV, It doesn't make sense to keep the lock which + * is unused for ns_max_age time. + */ + if (ktime_after(cur, ktime_add(lock->l_last_used, ns->ns_max_age))) + return LDLM_POLICY_CANCEL_LOCK; + + slv = ldlm_pool_get_slv(pl); + lvf = ldlm_pool_get_lvf(pl); + la = div_u64(ktime_to_ns(ktime_sub(cur, lock->l_last_used)), + NSEC_PER_SEC); + lv = lvf * la * ns->ns_nr_unused >> 8; + + /* Inform pool about current CLV to see it via debugfs. */ + ldlm_pool_set_clv(pl, lv); + + /* + * Stop when SLV is not yet come from server or lv is smaller than + * it is. 
+ */ + if (slv == 0 || lv < slv) + return LDLM_POLICY_KEEP_LOCK; + + return LDLM_POLICY_CANCEL_LOCK; +} + +static enum ldlm_policy_res +ldlm_cancel_lrur_no_wait_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int added, int min) +{ + enum ldlm_policy_res result; + + result = ldlm_cancel_lrur_policy(ns, lock, added, min); + if (result == LDLM_POLICY_KEEP_LOCK) + return result; + + return ldlm_cancel_no_wait_policy(ns, lock, added, min); +} + +/** + * Callback function for aged policy. Decides whether to keep + * \a lock in LRU for \a added in current scan and \a min number of locks + * to be preferably canceled. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static enum ldlm_policy_res ldlm_cancel_aged_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int added, int min) +{ + if ((added >= min) && + ktime_before(ktime_get(), + ktime_add(lock->l_last_used, ns->ns_max_age))) + return LDLM_POLICY_KEEP_LOCK; + + return LDLM_POLICY_CANCEL_LOCK; +} + +static enum ldlm_policy_res +ldlm_cancel_aged_no_wait_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int added, int min) +{ + enum ldlm_policy_res result; + + result = ldlm_cancel_aged_policy(ns, lock, added, min); + if (result == LDLM_POLICY_KEEP_LOCK) + return result; + + return ldlm_cancel_no_wait_policy(ns, lock, added, min); +} + +typedef enum ldlm_policy_res +(*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *ns, struct ldlm_lock *lock, + int added, int min); + +static ldlm_cancel_lru_policy_t +ldlm_cancel_lru_policy(struct ldlm_namespace *ns, enum ldlm_lru_flags lru_flags) +{ + if (ns_connect_lru_resize(ns)) { + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_lrur_no_wait_policy; + else + return ldlm_cancel_lrur_policy; + } else { + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_aged_no_wait_policy; + else + return ldlm_cancel_aged_policy; + } +} + +/** + * - Free space in LRU for \a min new locks, + * redundant unused locks are canceled locally; + * - also cancel locally unused aged locks; + * - do not cancel more than \a max locks; + * - if some locks are cancelled, try to cancel at least \a batch locks + * - GET the found locks and add them into the \a cancels list. + * + * A client lock can be added to the l_bl_ast list only when it is + * marked LDLM_FL_CANCELING. Otherwise, somebody is already doing + * CANCEL. There are the following use cases: + * ldlm_cancel_resource_local(), ldlm_cancel_lru_local() and + * ldlm_cli_cancel(), which check and set this flag properly. As any + * attempt to cancel a lock rely on this flag, l_bl_ast list is accessed + * later without any special locking. + * + * Locks are cancelled according to the LRU resize policy (SLV from server) + * if LRU resize is enabled; otherwise, the "aged policy" is used; + * + * LRU flags: + * ---------------------------------------- + * + * flags & LDLM_LRU_FLAG_NO_WAIT - cancel locks w/o sending any RPCs or waiting + * for any outstanding RPC to complete. + * + * flags & LDLM_CANCEL_CLEANUP - when cancelling read locks, do not check for + * other read locks covering the same pages, just + * discard those pages. + */ +static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, + struct list_head *cancels, + int min, int max, int batch, + enum ldlm_lru_flags lru_flags) +{ + ldlm_cancel_lru_policy_t pf; + int added = 0; + int no_wait = lru_flags & LDLM_LRU_FLAG_NO_WAIT; + ENTRY; + + /* + * Let only 1 thread to proceed. 
However, not for those which have the + * @max limit given (ELC), as LRU may be left not cleaned up in full. + */ + if (max == 0) { + if (test_and_set_bit(LDLM_LRU_CANCEL, &ns->ns_flags)) + RETURN(0); + } else if (test_bit(LDLM_LRU_CANCEL, &ns->ns_flags)) + RETURN(0); + + LASSERT(ergo(max, min <= max)); + /* No sense to give @batch for ELC */ + LASSERT(ergo(max, batch == 0)); + + if (!ns_connect_lru_resize(ns)) + min = max_t(int, min, ns->ns_nr_unused - ns->ns_max_unused); + + /* If at least 1 lock is to be cancelled, cancel at least @batch locks */ + if (min && min < batch) + min = batch; + + pf = ldlm_cancel_lru_policy(ns, lru_flags); + LASSERT(pf != NULL); + + /* For any flags, stop scanning if @max is reached. */ + while (!list_empty(&ns->ns_unused_list) && (max == 0 || added < max)) { + struct ldlm_lock *lock; + struct list_head *item, *next; + enum ldlm_policy_res result; + ktime_t last_use = ktime_set(0, 0); + + spin_lock(&ns->ns_lock); + item = no_wait ? ns->ns_last_pos : &ns->ns_unused_list; + for (item = item->next, next = item->next; + item != &ns->ns_unused_list; + item = next, next = item->next) { + lock = list_entry(item, struct ldlm_lock, l_lru); + + /* No locks which got blocking requests. */ + LASSERT(!ldlm_is_bl_ast(lock)); + + if (!ldlm_is_canceling(lock)) + break; + + /* + * Somebody is already doing CANCEL. No need for this + * lock in LRU, do not traverse it again. + */ + ldlm_lock_remove_from_lru_nolock(lock); + } + if (item == &ns->ns_unused_list) { + spin_unlock(&ns->ns_lock); + break; + } + + last_use = lock->l_last_used; + + LDLM_LOCK_GET(lock); + spin_unlock(&ns->ns_lock); + lu_ref_add(&lock->l_reference, __FUNCTION__, current); + + /* + * Pass the lock through the policy filter and see if it + * should stay in LRU. + * + * Even for shrinker policy we stop scanning if + * we find a lock that should stay in the cache. + * We should take into account lock age anyway + * as a new lock is a valuable resource even if + * it has a low weight. + * + * That is, for shrinker policy we drop only + * old locks, but additionally choose them by + * their weight. Big extent locks will stay in + * the cache. + */ + result = pf(ns, lock, added, min); + if (result == LDLM_POLICY_KEEP_LOCK) { + lu_ref_del(&lock->l_reference, __func__, current); + LDLM_LOCK_RELEASE(lock); + break; + } + + if (result == LDLM_POLICY_SKIP_LOCK) { + lu_ref_del(&lock->l_reference, __func__, current); + if (no_wait) { + spin_lock(&ns->ns_lock); + if (!list_empty(&lock->l_lru) && + lock->l_lru.prev == ns->ns_last_pos) + ns->ns_last_pos = &lock->l_lru; + spin_unlock(&ns->ns_lock); + } + + LDLM_LOCK_RELEASE(lock); + continue; + } + + lock_res_and_lock(lock); + /* Check flags again under the lock. */ + if (ldlm_is_canceling(lock) || + ldlm_lock_remove_from_lru_check(lock, last_use) == 0) { + /* + * Another thread is removing lock from LRU, or + * somebody is already doing CANCEL, or there + * is a blocking request which will send cancel + * by itself, or the lock is no longer unused or + * the lock has been used since the pf() call and + * pages could be put under it. + */ + unlock_res_and_lock(lock); + lu_ref_del(&lock->l_reference, __FUNCTION__, current); + LDLM_LOCK_RELEASE(lock); + continue; + } + LASSERT(!lock->l_readers && !lock->l_writers); + + /* + * If we have chosen to cancel this lock voluntarily, we + * better send cancel notification to server, so that it + * frees appropriate state. 
This might lead to a race + * where while we are doing cancel here, server is also + * silently cancelling this lock. + */ + ldlm_clear_cancel_on_block(lock); + + /* + * Setting the CBPENDING flag is a little misleading, + * but prevents an important race; namely, once + * CBPENDING is set, the lock can accumulate no more + * readers/writers. Since readers and writers are + * already zero here, ldlm_lock_decref() won't see + * this flag and call l_blocking_ast + */ + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING; + + if ((lru_flags & LDLM_LRU_FLAG_CLEANUP) && + (lock->l_resource->lr_type == LDLM_EXTENT || + ldlm_has_dom(lock)) && lock->l_granted_mode == LCK_PR) + ldlm_set_discard_data(lock); + + /* + * We can't re-add to l_lru as it confuses the + * refcounting in ldlm_lock_remove_from_lru() if an AST + * arrives after we drop lr_lock below. We use l_bl_ast + * and can't use l_pending_chain as it is used both on + * server and client nevertheless b=5666 says it is + * used only on server + */ + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, cancels); + unlock_res_and_lock(lock); + lu_ref_del(&lock->l_reference, __FUNCTION__, current); + added++; + /* Once a lock added, batch the requested amount */ + if (min == 0) + min = batch; + } + + if (max == 0) + clear_bit(LDLM_LRU_CANCEL, &ns->ns_flags); + + RETURN(added); +} + +int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, + int min, int max, + enum ldlm_cancel_flags cancel_flags, + enum ldlm_lru_flags lru_flags) +{ + int added; + + added = ldlm_prepare_lru_list(ns, cancels, min, max, 0, lru_flags); + if (added <= 0) + return added; + + return ldlm_cli_cancel_list_local(cancels, added, cancel_flags); +} + +/** + * Cancel at least \a min locks from given namespace LRU. + * + * When called with LCF_ASYNC the blocking callback will be handled + * in a thread and this function will return after the thread has been + * asked to call the callback. When called with LCF_ASYNC the blocking + * callback will be performed in this function. + */ +int ldlm_cancel_lru(struct ldlm_namespace *ns, int min, + enum ldlm_cancel_flags cancel_flags, + enum ldlm_lru_flags lru_flags) +{ + LIST_HEAD(cancels); + int count, rc; + + ENTRY; + + /* + * Just prepare the list of locks, do not actually cancel them yet. + * Locks are cancelled later in a separate thread. + */ + count = ldlm_prepare_lru_list(ns, &cancels, min, 0, + ns->ns_cancel_batch, lru_flags); + rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags); + if (rc == 0) + RETURN(count); + + RETURN(0); +} + +/** + * Find and cancel locally unused locks found on resource, matched to the + * given policy, mode. GET the found locks and add them into the \a cancels + * list. + */ +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 lock_flags, + enum ldlm_cancel_flags cancel_flags, + void *opaque) +{ + struct ldlm_lock *lock; + int count = 0; + + ENTRY; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (opaque != NULL && lock->l_ast_data != opaque) { + LDLM_ERROR(lock, "data %p doesn't match opaque %p", + lock->l_ast_data, opaque); + continue; + } + + if (lock->l_readers || lock->l_writers) + continue; + + /* + * If somebody is already doing CANCEL, or blocking AST came + * then skip this lock. 
+ */ + if (ldlm_is_bl_ast(lock) || ldlm_is_canceling(lock)) + continue; + + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; + + /* + * If policy is given and this is IBITS lock, add to list only + * those locks that match by policy. + */ + if (policy && (lock->l_resource->lr_type == LDLM_IBITS)) { + if (!(lock->l_policy_data.l_inodebits.bits & + policy->l_inodebits.bits)) + continue; + /* Skip locks with DoM bit if it is not set in policy + * to don't flush data by side-bits. Lock convert will + * drop those bits separately. + */ + if (ldlm_has_dom(lock) && + !(policy->l_inodebits.bits & MDS_INODELOCK_DOM)) + continue; + } + + /* See CBPENDING comment in ldlm_cancel_lru */ + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING | + lock_flags; + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, cancels); + LDLM_LOCK_GET(lock); + count++; + } + unlock_res(res); + + RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags)); +} +EXPORT_SYMBOL(ldlm_cancel_resource_local); + +/** + * Cancel client-side locks from a list and send/prepare cancel RPCs to the + * server. + * If \a req is NULL, send CANCEL request to server with handles of locks + * in the \a cancels. If EARLY_CANCEL is not supported, send CANCEL requests + * separately per lock. + * If \a req is not NULL, put handles of locks in \a cancels into the request + * buffer at the offset \a off. + * Destroy \a cancels at the end. + */ +int ldlm_cli_cancel_list(struct list_head *cancels, int count, + struct ptlrpc_request *req, + enum ldlm_cancel_flags flags) +{ + struct ldlm_lock *lock; + int res = 0; + + ENTRY; + + if (list_empty(cancels) || count == 0) + RETURN(0); + + /* + * XXX: requests (both batched and not) could be sent in parallel. + * Usually it is enough to have just 1 RPC, but it is possible that + * there are too many locks to be cancelled in LRU or on a resource. + * It would also speed up the case when the server does not support + * the feature. + */ + while (count > 0) { + LASSERT(!list_empty(cancels)); + lock = list_entry(cancels->next, struct ldlm_lock, + l_bl_ast); + LASSERT(lock->l_conn_export); + + if (exp_connect_cancelset(lock->l_conn_export)) { + res = count; + if (req) + ldlm_cancel_pack(req, cancels, count); + else + res = ldlm_cli_cancel_req(lock->l_conn_export, + cancels, count, + flags); + } else { + res = ldlm_cli_cancel_req(lock->l_conn_export, + cancels, 1, flags); + } + + if (res < 0) { + CDEBUG_LIMIT(res == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, + "ldlm_cli_cancel_list: %d\n", res); + res = count; + } + + count -= res; + ldlm_lock_list_put(cancels, l_bl_ast, res); + } + LASSERT(count == 0); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel_list); + +/** + * Cancel all locks on a resource that have 0 readers/writers. + * + * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying + * to notify the server. + */ +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + enum ldlm_cancel_flags flags, void *opaque) +{ + struct ldlm_resource *res; + LIST_HEAD(cancels); + int count; + int rc; + + ENTRY; + + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (IS_ERR(res)) { + /* This is not a problem. 
*/ + CDEBUG(D_INFO, "No resource %llu\n", res_id->name[0]); + RETURN(0); + } + + LDLM_RESOURCE_ADDREF(res); + count = ldlm_cancel_resource_local(res, &cancels, policy, mode, + 0, flags | LCF_BL_AST, opaque); + rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags); + if (rc != ELDLM_OK) + CERROR("canceling unused lock "DLDLMRES": rc = %d\n", + PLDLMRES(res), rc); + + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource); + +struct ldlm_cli_cancel_arg { + int lc_flags; + void *lc_opaque; +}; + +static int +ldlm_cli_hash_cancel_unused(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + struct ldlm_cli_cancel_arg *lc = arg; + + ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name, + NULL, LCK_MINMODE, lc->lc_flags, + lc->lc_opaque); + /* must return 0 for hash iteration */ + return 0; +} + +/** + * Cancel all locks on a namespace (or a specific resource, if given) + * that have 0 readers/writers. + * + * If flags & LCF_LOCAL, throw the locks away without trying + * to notify the server. + */ +int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_cancel_flags flags, void *opaque) +{ + struct ldlm_cli_cancel_arg arg = { + .lc_flags = flags, + .lc_opaque = opaque, + }; + + ENTRY; + + if (ns == NULL) + RETURN(ELDLM_OK); + + if (res_id != NULL) { + RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL, + LCK_MINMODE, flags, + opaque)); + } else { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_cli_hash_cancel_unused, &arg, 0); + RETURN(ELDLM_OK); + } +} + +/* Lock iterators. */ + +int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, + void *closure) +{ + struct list_head *tmp, *next; + struct ldlm_lock *lock; + int rc = LDLM_ITER_CONTINUE; + + ENTRY; + + if (!res) + RETURN(LDLM_ITER_CONTINUE); + + lock_res(res); + list_for_each_safe(tmp, next, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } + + list_for_each_safe(tmp, next, &res->lr_waiting) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } +out: + unlock_res(res); + RETURN(rc); +} + +struct iter_helper_data { + ldlm_iterator_t iter; + void *closure; +}; + +static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure) +{ + struct iter_helper_data *helper = closure; + + return helper->iter(lock, helper->closure); +} + +static int ldlm_res_iter_helper(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) + +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + return ldlm_resource_foreach(res, ldlm_iter_helper, arg) == + LDLM_ITER_STOP; +} + +void ldlm_namespace_foreach(struct ldlm_namespace *ns, + ldlm_iterator_t iter, void *closure) + +{ + struct iter_helper_data helper = { .iter = iter, .closure = closure }; + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_iter_helper, &helper, 0); + +} + +/* + * non-blocking function to manipulate a lock whose cb_data is being put away. + * return 0: find no resource + * > 0: must be LDLM_ITER_STOP/LDLM_ITER_CONTINUE. 
+ * < 0: errors + */ +int ldlm_resource_iterate(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_iterator_t iter, void *data) +{ + struct ldlm_resource *res; + int rc; + + ENTRY; + + LASSERTF(ns != NULL, "must pass in namespace\n"); + + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (IS_ERR(res)) + RETURN(0); + + LDLM_RESOURCE_ADDREF(res); + rc = ldlm_resource_foreach(res, iter, data); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_resource_iterate); + +/* Lock replay */ +static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure) +{ + struct list_head *list = closure; + + /* we use l_pending_chain here, because it's unused on clients. */ + LASSERTF(list_empty(&lock->l_pending_chain), + "lock %p next %p prev %p\n", + lock, &lock->l_pending_chain.next, + &lock->l_pending_chain.prev); + /* + * b=9573: don't replay locks left after eviction, or + * b=17614: locks being actively cancelled. Get a reference + * on a lock so that it does not disapear under us (e.g. due to cancel) + */ + if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_BL_DONE))) { + list_add(&lock->l_pending_chain, list); + LDLM_LOCK_GET(lock); + } + + return LDLM_ITER_CONTINUE; +} + +static int replay_lock_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct ldlm_async_args *aa = args; + struct ldlm_lock *lock; + struct ldlm_reply *reply; + struct obd_export *exp; + + ENTRY; + atomic_dec(&req->rq_import->imp_replay_inflight); + wake_up(&req->rq_import->imp_replay_waitq); + + if (rc != ELDLM_OK) + GOTO(out, rc); + + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) + GOTO(out, rc = -EPROTO); + + lock = ldlm_handle2lock(&aa->lock_handle); + if (!lock) { + CERROR("received replay ack for unknown local cookie %#llx remote cookie %#llx from server %s id %s\n", + aa->lock_handle.cookie, reply->lock_handle.cookie, + req->rq_export->exp_client_uuid.uuid, + libcfs_id2str(req->rq_peer)); + GOTO(out, rc = -ESTALE); + } + + /* Key change rehash lock in per-export hash with new key */ + exp = req->rq_export; + if (exp && exp->exp_lock_hash) { + /* + * In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() + */ + /* coverity[overrun-buffer-val] */ + cfs_hash_rehash_key(exp->exp_lock_hash, + &lock->l_remote_handle, + &reply->lock_handle, + &lock->l_exp_hash); + } else { + lock->l_remote_handle = reply->lock_handle; + } + + LDLM_DEBUG(lock, "replayed lock:"); + ptlrpc_import_recovery_state_machine(req->rq_import); + LDLM_LOCK_PUT(lock); +out: + if (rc != ELDLM_OK) + ptlrpc_connect_import(req->rq_import); + + RETURN(rc); +} + +static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + struct ldlm_async_args *aa; + struct ldlm_request *body; + int flags; + + ENTRY; + + + /* b=11974: Do not replay a lock which is actively being canceled */ + if (ldlm_is_bl_done(lock)) { + LDLM_DEBUG(lock, "Not replaying canceled lock:"); + RETURN(0); + } + + /* + * If this is reply-less callback lock, we cannot replay it, since + * server might have long dropped it, but notification of that event was + * lost by network. (and server granted conflicting lock already) + */ + if (ldlm_is_cancel_on_block(lock)) { + LDLM_DEBUG(lock, "Not replaying reply-less lock:"); + ldlm_lock_cancel(lock); + RETURN(0); + } + + /* + * If granted mode matches the requested mode, this lock is granted. 
+ * + * If we haven't been granted anything and are on a resource list, + * then we're blocked/waiting. + * + * If we haven't been granted anything and we're NOT on a resource list, + * then we haven't got a reply yet and don't have a known disposition. + * This happens whenever a lock enqueue is the request that triggers + * recovery. + */ + if (ldlm_is_granted(lock)) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED; + else if (!list_empty(&lock->l_res_link)) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT; + else + flags = LDLM_FL_REPLAY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE, + LUSTRE_DLM_VERSION, LDLM_ENQUEUE); + if (req == NULL) + RETURN(-ENOMEM); + + /* We're part of recovery, so don't wait for it. */ + req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS; + /* If the state changed while we were prepared, don't wait */ + req->rq_no_delay = 1; + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + ldlm_lock2desc(lock, &body->lock_desc); + body->lock_flags = ldlm_flags_to_wire(flags); + + ldlm_lock2handle(lock, &body->lock_handle[0]); + if (lock->l_lvb_len > 0) + req_capsule_extend(&req->rq_pill, &RQF_LDLM_ENQUEUE_LVB); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + lock->l_lvb_len); + ptlrpc_request_set_replen(req); + /* + * notify the server we've replayed all requests. + * also, we mark the request to be put on a dedicated + * queue to be processed after all request replayes. + * b=6063 + */ + lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE); + + LDLM_DEBUG(lock, "replaying lock:"); + + atomic_inc(&imp->imp_replay_inflight); + aa = ptlrpc_req_async_args(aa, req); + aa->lock_handle = body->lock_handle[0]; + req->rq_interpret_reply = replay_lock_interpret; + ptlrpcd_add_req(req); + + RETURN(0); +} + +/** + * Cancel as many unused locks as possible before replay. since we are + * in recovery, we can't wait for any outstanding RPCs to send any RPC + * to the server. + * + * Called only in recovery before replaying locks. there is no need to + * replay locks that are unused. since the clients may hold thousands of + * cached unused locks, dropping the unused locks can greatly reduce the + * load on the servers at recovery time. 
+ */ +static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns) +{ + int canceled; + LIST_HEAD(cancels); + + CDEBUG(D_DLMTRACE, + "Dropping as many unused locks as possible before replay for namespace %s (%d)\n", + ldlm_ns_name(ns), ns->ns_nr_unused); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_REPLAY_PAUSE, cfs_fail_val); + + /* + * We don't need to care whether or not LRU resize is enabled + * because the LDLM_LRU_FLAG_NO_WAIT policy doesn't use the + * count parameter + */ + canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0, + LCF_LOCAL, LDLM_LRU_FLAG_NO_WAIT); + + CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n", + canceled, ldlm_ns_name(ns)); +} + +static int lock_can_replay(struct obd_import *imp) +{ + struct client_obd *cli = &imp->imp_obd->u.cli; + + CDEBUG(D_HA, "check lock replay limit, inflights = %u(%u)\n", + atomic_read(&imp->imp_replay_inflight) - 1, + cli->cl_max_rpcs_in_flight); + + /* +1 due to ldlm_lock_replay() increment */ + return atomic_read(&imp->imp_replay_inflight) < + 1 + min_t(u32, cli->cl_max_rpcs_in_flight, 8); +} + +int __ldlm_replay_locks(struct obd_import *imp, bool rate_limit) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + LIST_HEAD(list); + struct ldlm_lock *lock, *next; + int rc = 0; + + ENTRY; + + while (atomic_read(&imp->imp_replay_inflight) != 1) + cond_resched(); + + /* don't replay locks if import failed recovery */ + if (imp->imp_vbr_failed) + RETURN(0); + + if (ldlm_cancel_unused_locks_before_replay) + ldlm_cancel_unused_locks_for_replay(ns); + + ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list); + + list_for_each_entry_safe(lock, next, &list, l_pending_chain) { + list_del_init(&lock->l_pending_chain); + /* If we disconnected in the middle - cleanup and let + * reconnection to happen again. 
LU-14027 */ + if (rc || (imp->imp_state != LUSTRE_IMP_REPLAY_LOCKS)) { + LDLM_LOCK_RELEASE(lock); + continue; + } + rc = replay_one_lock(imp, lock); + LDLM_LOCK_RELEASE(lock); + + if (rate_limit) + wait_event_idle_exclusive(imp->imp_replay_waitq, + lock_can_replay(imp)); + } + + RETURN(rc); +} + +/** + * Lock replay uses rate control and can sleep waiting so + * must be in separate thread from ptlrpcd itself + */ +static int ldlm_lock_replay_thread(void *data) +{ + struct obd_import *imp = data; + + unshare_fs_struct(); + CDEBUG(D_HA, "lock replay thread %s to %s@%s\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + __ldlm_replay_locks(imp, true); + atomic_dec(&imp->imp_replay_inflight); + ptlrpc_import_recovery_state_machine(imp); + class_import_put(imp); + + return 0; +} + +int ldlm_replay_locks(struct obd_import *imp) +{ + struct task_struct *task; + int rc = 0; + + /* ensure this doesn't fall to 0 before all have been queued */ + if (atomic_inc_return(&imp->imp_replay_inflight) > 1) { + atomic_dec(&imp->imp_replay_inflight); + return 0; + } + class_import_get(imp); + + task = kthread_run(ldlm_lock_replay_thread, imp, "ldlm_lock_replay"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CDEBUG(D_HA, "can't start lock replay thread: rc = %d\n", rc); + + /* run lock replay without rate control */ + rc = __ldlm_replay_locks(imp, false); + atomic_dec(&imp->imp_replay_inflight); + class_import_put(imp); + } + + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c new file mode 100644 index 0000000000000..92fef33fc9860 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c @@ -0,0 +1,1809 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/ldlm_resource.c + * + * Author: Phil Schwan + * Author: Peter Braam + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include +#include +#include +#include +#include "ldlm_internal.h" + +struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab; +struct kmem_cache *ldlm_interval_tree_slab; +struct kmem_cache *ldlm_inodebits_slab; + +int ldlm_srv_namespace_nr = 0; +int ldlm_cli_namespace_nr = 0; + +DEFINE_MUTEX(ldlm_srv_namespace_lock); +LIST_HEAD(ldlm_srv_namespace_list); + +DEFINE_MUTEX(ldlm_cli_namespace_lock); +/* Client Namespaces that have active resources in them. 
+ * Once all resources go away, ldlm_poold moves such namespaces to the + * inactive list */ +LIST_HEAD(ldlm_cli_active_namespace_list); +/* Client namespaces that don't have any locks in them */ +LIST_HEAD(ldlm_cli_inactive_namespace_list); + +static struct dentry *ldlm_debugfs_dir; +static struct dentry *ldlm_ns_debugfs_dir; +struct dentry *ldlm_svc_debugfs_dir; + +/* during debug dump certain amount of granted locks for one resource to avoid + * DDOS. */ +static unsigned int ldlm_dump_granted_max = 256; + +static ssize_t ldebugfs_dump_ns_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); + RETURN(count); +} + +LDEBUGFS_FOPS_WR_ONLY(ldlm, dump_ns); + +static int ldlm_rw_uint_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%u\n", *(unsigned int *)m->private); + return 0; +} + +static ssize_t +ldlm_rw_uint_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + + if (!count) + return 0; + + return kstrtouint_from_user(buffer, count, 0, + (unsigned int *)seq->private); +} + +LDEBUGFS_SEQ_FOPS(ldlm_rw_uint); + +#ifdef HAVE_SERVER_SUPPORT + +static int seq_watermark_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%llu\n", *(__u64 *)m->private); + return 0; +} + +static ssize_t seq_watermark_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + u64 value; + __u64 watermark; + __u64 *data = m->private; + bool wm_low = (data == &ldlm_reclaim_threshold_mb) ? true : false; + char kernbuf[22] = ""; + int rc; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + rc = sysfs_memparse(kernbuf, count, &value, "MiB"); + if (rc < 0) { + CERROR("Failed to set %s, rc = %d.\n", + wm_low ? "lock_reclaim_threshold_mb" : "lock_limit_mb", + rc); + return rc; + } else if (value != 0 && value < (1 << 20)) { + CERROR("%s should be greater than 1MB.\n", + wm_low ? 
"lock_reclaim_threshold_mb" : "lock_limit_mb"); + return -EINVAL; + } + watermark = value >> 20; + + if (wm_low) { + if (ldlm_lock_limit_mb != 0 && watermark > ldlm_lock_limit_mb) { + CERROR("lock_reclaim_threshold_mb must be smaller than " + "lock_limit_mb.\n"); + return -EINVAL; + } + + *data = watermark; + if (watermark != 0) { + watermark <<= 20; + do_div(watermark, sizeof(struct ldlm_lock)); + } + ldlm_reclaim_threshold = watermark; + } else { + if (ldlm_reclaim_threshold_mb != 0 && + watermark < ldlm_reclaim_threshold_mb) { + CERROR("lock_limit_mb must be greater than " + "lock_reclaim_threshold_mb.\n"); + return -EINVAL; + } + + *data = watermark; + if (watermark != 0) { + watermark <<= 20; + do_div(watermark, sizeof(struct ldlm_lock)); + } + ldlm_lock_limit = watermark; + } + + return count; +} + +static int seq_watermark_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_watermark_show, inode->i_private); +} + +static const struct file_operations ldlm_watermark_fops = { + .owner = THIS_MODULE, + .open = seq_watermark_open, + .read = seq_read, + .write = seq_watermark_write, + .llseek = seq_lseek, + .release = lprocfs_single_release, +}; + +static int seq_granted_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%llu\n", percpu_counter_sum_positive( + (struct percpu_counter *)m->private)); + return 0; +} + +static int seq_granted_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_granted_show, inode->i_private); +} + +static const struct file_operations ldlm_granted_fops = { + .owner = THIS_MODULE, + .open = seq_granted_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* HAVE_SERVER_SUPPORT */ + +static struct ldebugfs_vars ldlm_debugfs_list[] = { + { .name = "dump_namespaces", + .fops = &ldlm_dump_ns_fops, + .proc_mode = 0222 }, + { .name = "dump_granted_max", + .fops = &ldlm_rw_uint_fops, + .data = &ldlm_dump_granted_max }, +#ifdef HAVE_SERVER_SUPPORT + { .name = "lock_reclaim_threshold_mb", + .fops = &ldlm_watermark_fops, + .data = &ldlm_reclaim_threshold_mb }, + { .name = "lock_limit_mb", + .fops = &ldlm_watermark_fops, + .data = &ldlm_lock_limit_mb }, + { .name = "lock_granted_count", + .fops = &ldlm_granted_fops, + .data = &ldlm_granted_total }, +#endif + { NULL } +}; + +int ldlm_debugfs_setup(void) +{ + ENTRY; + ldlm_debugfs_dir = debugfs_create_dir(OBD_LDLM_DEVICENAME, + debugfs_lustre_root); + ldlm_ns_debugfs_dir = debugfs_create_dir("namespaces", + ldlm_debugfs_dir); + ldlm_svc_debugfs_dir = debugfs_create_dir("services", + ldlm_debugfs_dir); + + ldebugfs_add_vars(ldlm_debugfs_dir, ldlm_debugfs_list, NULL); + + RETURN(0); +} + +void ldlm_debugfs_cleanup(void) +{ + debugfs_remove_recursive(ldlm_debugfs_dir); + + ldlm_svc_debugfs_dir = NULL; + ldlm_ns_debugfs_dir = NULL; + ldlm_debugfs_dir = NULL; +} + +static ssize_t resource_count_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + __u64 res = 0; + int i; + + /* result is not strictly consistant */ + for (i = 0; i < (1 << ns->ns_bucket_bits); i++) + res += atomic_read(&ns->ns_rs_buckets[i].nsb_count); + return sprintf(buf, "%lld\n", res); +} +LUSTRE_RO_ATTR(resource_count); + +static ssize_t lock_count_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + __u64 locks; + + locks = lprocfs_stats_collector(ns->ns_stats, 
LDLM_NSS_LOCKS, + LPROCFS_FIELDS_FLAGS_SUM); + return sprintf(buf, "%lld\n", locks); +} +LUSTRE_RO_ATTR(lock_count); + +static ssize_t lock_unused_count_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%d\n", ns->ns_nr_unused); +} +LUSTRE_RO_ATTR(lock_unused_count); + +static ssize_t lru_size_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + __u32 *nr = &ns->ns_max_unused; + + if (ns_connect_lru_resize(ns)) + nr = &ns->ns_nr_unused; + return sprintf(buf, "%u\n", *nr); +} + +static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int lru_resize; + int err; + + if (strncmp(buffer, "clear", 5) == 0) { + CDEBUG(D_DLMTRACE, + "dropping all unused locks from namespace %s\n", + ldlm_ns_name(ns)); + /* Try to cancel all @ns_nr_unused locks. */ + ldlm_cancel_lru(ns, INT_MAX, 0, LDLM_LRU_FLAG_CLEANUP); + return count; + } + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) { + CERROR("lru_size: invalid value written\n"); + return -EINVAL; + } + lru_resize = (tmp == 0); + + if (ns_connect_lru_resize(ns)) { + if (!lru_resize) + ns->ns_max_unused = (unsigned int)tmp; + + if (tmp > ns->ns_nr_unused) + tmp = ns->ns_nr_unused; + tmp = ns->ns_nr_unused - tmp; + + CDEBUG(D_DLMTRACE, + "changing namespace %s unused locks from %u to %u\n", + ldlm_ns_name(ns), ns->ns_nr_unused, + (unsigned int)tmp); + + if (!lru_resize) { + CDEBUG(D_DLMTRACE, + "disable lru_resize for namespace %s\n", + ldlm_ns_name(ns)); + ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE; + } + ldlm_cancel_lru(ns, tmp, LCF_ASYNC, 0); + } else { + CDEBUG(D_DLMTRACE, + "changing namespace %s max_unused from %u to %u\n", + ldlm_ns_name(ns), ns->ns_max_unused, + (unsigned int)tmp); + + /* Make sure that LRU resize was originally supported before + * turning it on here. 
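A rough userspace rendering of the three inputs lru_size_store() distinguishes above: the literal "clear" cancels every unused lock, 0 re-enables LRU resize, and any other number becomes the new unused-lock limit. The function and enum names are made up for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum lru_action { LRU_CLEAR, LRU_ENABLE_RESIZE, LRU_SET_LIMIT, LRU_INVALID };

static enum lru_action parse_lru_size(const char *buf, unsigned long *limit)
{
        char *end;

        if (strncmp(buf, "clear", 5) == 0)
                return LRU_CLEAR;
        *limit = strtoul(buf, &end, 10);
        if (end == buf)
                return LRU_INVALID;             /* maps to -EINVAL above */
        return *limit == 0 ? LRU_ENABLE_RESIZE : LRU_SET_LIMIT;
}

int main(void)
{
        unsigned long limit = 0;

        printf("%d\n", parse_lru_size("clear", &limit));        /* 0: clear */
        printf("%d\n", parse_lru_size("0", &limit));            /* 1: resize on */
        printf("%d %lu\n", parse_lru_size("800", &limit), limit); /* 2 800 */
        return 0;
}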
+ */ + if (lru_resize && + (ns->ns_orig_connect_flags & OBD_CONNECT_LRU_RESIZE)) { + CDEBUG(D_DLMTRACE, + "enable lru_resize for namespace %s\n", + ldlm_ns_name(ns)); + ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE; + } + ns->ns_max_unused = (unsigned int)tmp; + ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0); + } + + return count; +} +LUSTRE_RW_ATTR(lru_size); + +static ssize_t lru_cancel_batch_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return snprintf(buf, sizeof(buf) - 1, "%u\n", ns->ns_cancel_batch); +} + +static ssize_t lru_cancel_batch_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + + if (kstrtoul(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_cancel_batch = (unsigned int)tmp; + + return count; +} +LUSTRE_RW_ATTR(lru_cancel_batch); + +static ssize_t ns_recalc_pct_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return snprintf(buf, sizeof(buf) - 1, "%u\n", ns->ns_recalc_pct); +} + +static ssize_t ns_recalc_pct_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + + if (kstrtoul(buffer, 10, &tmp)) + return -EINVAL; + + if (tmp > 100) + return -ERANGE; + + ns->ns_recalc_pct = (unsigned int)tmp; + + return count; +} +LUSTRE_RW_ATTR(ns_recalc_pct); + +static ssize_t lru_max_age_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%lld\n", ktime_to_ms(ns->ns_max_age)); +} + +static ssize_t lru_max_age_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + int scale = NSEC_PER_MSEC; + unsigned long long tmp; + char *buf; + + /* Did the user ask in seconds or milliseconds. 
Default is in ms */ + buf = strstr(buffer, "ms"); + if (!buf) { + buf = strchr(buffer, 's'); + if (buf) + scale = NSEC_PER_SEC; + } + + if (buf) + *buf = '\0'; + + if (kstrtoull(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_max_age = ktime_set(0, tmp * scale); + + return count; +} +LUSTRE_RW_ATTR(lru_max_age); + +static ssize_t early_lock_cancel_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%d\n", ns_connect_cancelset(ns)); +} + +static ssize_t early_lock_cancel_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long supp = -1; + int rc; + + rc = kstrtoul(buffer, 10, &supp); + if (rc < 0) + return rc; + + if (supp == 0) + ns->ns_connect_flags &= ~OBD_CONNECT_CANCELSET; + else if (ns->ns_orig_connect_flags & OBD_CONNECT_CANCELSET) + ns->ns_connect_flags |= OBD_CONNECT_CANCELSET; + return count; +} +LUSTRE_RW_ATTR(early_lock_cancel); + +static ssize_t dirty_age_limit_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", + ktime_divns(ns->ns_dirty_age_limit, NSEC_PER_SEC)); +} + +static ssize_t dirty_age_limit_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long long tmp; + + if (kstrtoull(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_dirty_age_limit = ktime_set(tmp, 0); + + return count; +} +LUSTRE_RW_ATTR(dirty_age_limit); + +#ifdef HAVE_SERVER_SUPPORT +static ssize_t ctime_age_limit_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", ns->ns_ctime_age_limit); +} + +static ssize_t ctime_age_limit_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + + if (kstrtoul(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_ctime_age_limit = tmp; + + return count; +} +LUSTRE_RW_ATTR(ctime_age_limit); + +static ssize_t lock_timeouts_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%d\n", ns->ns_timeouts); +} +LUSTRE_RO_ATTR(lock_timeouts); + +static ssize_t max_nolock_bytes_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%u\n", ns->ns_max_nolock_size); +} + +static ssize_t max_nolock_bytes_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int err; + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_max_nolock_size = tmp; + + return count; +} +LUSTRE_RW_ATTR(max_nolock_bytes); + +static ssize_t contention_seconds_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct 
ldlm_namespace, + ns_kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", ns->ns_contention_time); +} + +static ssize_t contention_seconds_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned int tmp; + + if (kstrtouint(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_contention_time = tmp; + + return count; +} +LUSTRE_RW_ATTR(contention_seconds); + +static ssize_t contended_locks_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%u\n", ns->ns_contended_locks); +} + +static ssize_t contended_locks_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int err; + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_contended_locks = tmp; + + return count; +} +LUSTRE_RW_ATTR(contended_locks); + +static ssize_t max_parallel_ast_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%u\n", ns->ns_max_parallel_ast); +} + +static ssize_t max_parallel_ast_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int err; + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_max_parallel_ast = tmp; + + return count; +} +LUSTRE_RW_ATTR(max_parallel_ast); + +#endif /* HAVE_SERVER_SUPPORT */ + +/* These are for namespaces in /sys/fs/lustre/ldlm/namespaces/ */ +static struct attribute *ldlm_ns_attrs[] = { + &lustre_attr_resource_count.attr, + &lustre_attr_lock_count.attr, + &lustre_attr_lock_unused_count.attr, + &lustre_attr_ns_recalc_pct.attr, + &lustre_attr_lru_size.attr, + &lustre_attr_lru_cancel_batch.attr, + &lustre_attr_lru_max_age.attr, + &lustre_attr_early_lock_cancel.attr, + &lustre_attr_dirty_age_limit.attr, +#ifdef HAVE_SERVER_SUPPORT + &lustre_attr_ctime_age_limit.attr, + &lustre_attr_lock_timeouts.attr, + &lustre_attr_max_nolock_bytes.attr, + &lustre_attr_contention_seconds.attr, + &lustre_attr_contended_locks.attr, + &lustre_attr_max_parallel_ast.attr, +#endif + NULL, +}; + +static void ldlm_ns_release(struct kobject *kobj) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + complete(&ns->ns_kobj_unregister); +} + +KOBJ_ATTRIBUTE_GROUPS(ldlm_ns); + +static struct kobj_type ldlm_ns_ktype = { + .default_groups = KOBJ_ATTR_GROUPS(ldlm_ns), + .sysfs_ops = &lustre_sysfs_ops, + .release = ldlm_ns_release, +}; + +static void ldlm_namespace_debugfs_unregister(struct ldlm_namespace *ns) +{ + if (IS_ERR_OR_NULL(ns->ns_debugfs_entry)) + CERROR("dlm namespace %s has no procfs dir?\n", + ldlm_ns_name(ns)); + else + debugfs_remove_recursive(ns->ns_debugfs_entry); + + if (ns->ns_stats != NULL) + lprocfs_free_stats(&ns->ns_stats); +} + +void ldlm_namespace_sysfs_unregister(struct ldlm_namespace *ns) +{ + kobject_put(&ns->ns_kobj); + wait_for_completion(&ns->ns_kobj_unregister); +} + +int ldlm_namespace_sysfs_register(struct ldlm_namespace *ns) +{ + int err; + + ns->ns_kobj.kset = ldlm_ns_kset; + init_completion(&ns->ns_kobj_unregister); + err = 
kobject_init_and_add(&ns->ns_kobj, &ldlm_ns_ktype, NULL, + "%s", ldlm_ns_name(ns)); + + ns->ns_stats = lprocfs_alloc_stats(LDLM_NSS_LAST, 0); + if (!ns->ns_stats) { + kobject_put(&ns->ns_kobj); + return -ENOMEM; + } + + lprocfs_counter_init(ns->ns_stats, LDLM_NSS_LOCKS, + LPROCFS_CNTR_AVGMINMAX, "locks", "locks"); + + return err; +} + +static int ldlm_namespace_debugfs_register(struct ldlm_namespace *ns) +{ + struct dentry *ns_entry; + + if (!IS_ERR_OR_NULL(ns->ns_debugfs_entry)) { + ns_entry = ns->ns_debugfs_entry; + } else { + ns_entry = debugfs_create_dir(ldlm_ns_name(ns), + ldlm_ns_debugfs_dir); + if (!ns_entry) + return -ENOMEM; + ns->ns_debugfs_entry = ns_entry; + } + + return 0; +} +#undef MAX_STRING_SIZE + +static unsigned ldlm_res_hop_hash(struct cfs_hash *hs, + const void *key, unsigned int mask) +{ + const struct ldlm_res_id *id = key; + unsigned int val = 0; + unsigned int i; + + for (i = 0; i < RES_NAME_SIZE; i++) + val += id->name[i]; + return val & mask; +} + +static unsigned int ldlm_res_hop_fid_hash(const struct ldlm_res_id *id, unsigned int bits) +{ + struct lu_fid fid; + __u32 hash; + __u32 val; + + fid.f_seq = id->name[LUSTRE_RES_ID_SEQ_OFF]; + fid.f_oid = (__u32)id->name[LUSTRE_RES_ID_VER_OID_OFF]; + fid.f_ver = (__u32)(id->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); + + hash = fid_flatten32(&fid); + hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ + if (id->name[LUSTRE_RES_ID_HSH_OFF] != 0) { + val = id->name[LUSTRE_RES_ID_HSH_OFF]; + } else { + val = fid_oid(&fid); + } + hash += (val >> 5) + (val << 11); + return cfs_hash_32(hash, bits); +} + +static void *ldlm_res_hop_key(struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return &res->lr_name; +} + +static int ldlm_res_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return ldlm_res_eq((const struct ldlm_res_id *)key, + (const struct ldlm_res_id *)&res->lr_name); +} + +static void *ldlm_res_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_resource, lr_hash); +} + +static void +ldlm_res_hop_get_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + ldlm_resource_getref(res); +} + +static void ldlm_res_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + ldlm_resource_putref(res); +} + +static struct cfs_hash_ops ldlm_ns_hash_ops = { + .hs_hash = ldlm_res_hop_hash, + .hs_key = ldlm_res_hop_key, + .hs_keycmp = ldlm_res_hop_keycmp, + .hs_keycpy = NULL, + .hs_object = ldlm_res_hop_object, + .hs_get = ldlm_res_hop_get_locked, + .hs_put = ldlm_res_hop_put +}; + +static struct { + /** hash bucket bits */ + unsigned nsd_bkt_bits; + /** hash bits */ + unsigned nsd_all_bits; +} ldlm_ns_hash_defs[] = { + [LDLM_NS_TYPE_MDC] = { + .nsd_bkt_bits = 11, + .nsd_all_bits = 16, + }, + [LDLM_NS_TYPE_MDT] = { + .nsd_bkt_bits = 14, + .nsd_all_bits = 21, + }, + [LDLM_NS_TYPE_OSC] = { + .nsd_bkt_bits = 8, + .nsd_all_bits = 12, + }, + [LDLM_NS_TYPE_OST] = { + .nsd_bkt_bits = 11, + .nsd_all_bits = 17, + }, + [LDLM_NS_TYPE_MGC] = { + .nsd_bkt_bits = 3, + .nsd_all_bits = 4, + }, + [LDLM_NS_TYPE_MGT] = { + .nsd_bkt_bits = 3, + .nsd_all_bits = 4, + }, +}; + +/** + * Create and initialize new empty namespace. 
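A quick sanity check of the hash sizing in ldlm_ns_hash_defs[] above: nsd_all_bits is the total table size, nsd_bkt_bits the per-bucket share, so a namespace allocates 1 << (all_bits - bkt_bits) resource buckets. The values below are copied from the MDC, MDT and OSC entries of the table; the helper struct and program are illustrative only:

#include <stdio.h>

struct ns_hash_def {
        const char *type;
        unsigned int all_bits;
        unsigned int bkt_bits;
};

int main(void)
{
        struct ns_hash_def defs[] = {
                { "MDC", 16, 11 },
                { "MDT", 21, 14 },
                { "OSC", 12,  8 },
        };
        unsigned int i;

        for (i = 0; i < sizeof(defs) / sizeof(defs[0]); i++)
                printf("%s: %u resource buckets\n", defs[i].type,
                       1u << (defs[i].all_bits - defs[i].bkt_bits));
        /* prints 32, 128 and 16 buckets respectively */
        return 0;
}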
+ */ +struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, + enum ldlm_side client, + enum ldlm_appetite apt, + enum ldlm_ns_type ns_type) +{ + struct ldlm_namespace *ns = NULL; + int idx; + int rc; + + ENTRY; + LASSERT(obd != NULL); + + rc = ldlm_get_ref(); + if (rc) { + CERROR("%s: ldlm_get_ref failed: rc = %d\n", name, rc); + RETURN(ERR_PTR(rc)); + } + + if (ns_type >= ARRAY_SIZE(ldlm_ns_hash_defs) || + ldlm_ns_hash_defs[ns_type].nsd_bkt_bits == 0) { + rc = -EINVAL; + CERROR("%s: unknown namespace type %d: rc = %d\n", + name, ns_type, rc); + GOTO(out_ref, rc); + } + + OBD_ALLOC_PTR(ns); + if (!ns) + GOTO(out_ref, rc = -ENOMEM); + + ns->ns_rs_hash = cfs_hash_create(name, + ldlm_ns_hash_defs[ns_type].nsd_all_bits, + ldlm_ns_hash_defs[ns_type].nsd_all_bits, + ldlm_ns_hash_defs[ns_type].nsd_bkt_bits, + 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &ldlm_ns_hash_ops, + CFS_HASH_DEPTH | + CFS_HASH_BIGNAME | + CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF); + if (!ns->ns_rs_hash) + GOTO(out_ns, rc = -ENOMEM); + + ns->ns_bucket_bits = ldlm_ns_hash_defs[ns_type].nsd_all_bits - + ldlm_ns_hash_defs[ns_type].nsd_bkt_bits; + + OBD_ALLOC_PTR_ARRAY_LARGE(ns->ns_rs_buckets, 1 << ns->ns_bucket_bits); + if (!ns->ns_rs_buckets) + GOTO(out_hash, rc = -ENOMEM); + + for (idx = 0; idx < (1 << ns->ns_bucket_bits); idx++) { + struct ldlm_ns_bucket *nsb = &ns->ns_rs_buckets[idx]; + + at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0); + nsb->nsb_namespace = ns; + nsb->nsb_reclaim_start = 0; + atomic_set(&nsb->nsb_count, 0); + } + + ns->ns_obd = obd; + ns->ns_appetite = apt; + ns->ns_client = client; + ns->ns_name = kstrdup(name, GFP_KERNEL); + if (!ns->ns_name) + GOTO(out_hash, rc = -ENOMEM); + + INIT_LIST_HEAD(&ns->ns_list_chain); + INIT_LIST_HEAD(&ns->ns_unused_list); + spin_lock_init(&ns->ns_lock); + atomic_set(&ns->ns_bref, 0); + init_waitqueue_head(&ns->ns_waitq); + + ns->ns_max_nolock_size = NS_DEFAULT_MAX_NOLOCK_BYTES; + ns->ns_contention_time = NS_DEFAULT_CONTENTION_SECONDS; + ns->ns_contended_locks = NS_DEFAULT_CONTENDED_LOCKS; + + ns->ns_max_parallel_ast = LDLM_DEFAULT_PARALLEL_AST_LIMIT; + ns->ns_nr_unused = 0; + ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; + ns->ns_cancel_batch = LDLM_DEFAULT_LRU_SHRINK_BATCH; + ns->ns_recalc_pct = LDLM_DEFAULT_SLV_RECALC_PCT; + ns->ns_max_age = ktime_set(LDLM_DEFAULT_MAX_ALIVE, 0); + ns->ns_ctime_age_limit = LDLM_CTIME_AGE_LIMIT; + ns->ns_dirty_age_limit = ktime_set(LDLM_DIRTY_AGE_LIMIT, 0); + ns->ns_timeouts = 0; + ns->ns_orig_connect_flags = 0; + ns->ns_connect_flags = 0; + ns->ns_stopping = 0; + ns->ns_reclaim_start = 0; + ns->ns_last_pos = &ns->ns_unused_list; + ns->ns_flags = 0; + + rc = ldlm_namespace_sysfs_register(ns); + if (rc) { + CERROR("%s: cannot initialize ns sysfs: rc = %d\n", name, rc); + GOTO(out_hash, rc); + } + + rc = ldlm_namespace_debugfs_register(ns); + if (rc) { + CERROR("%s: cannot initialize ns proc: rc = %d\n", name, rc); + GOTO(out_sysfs, rc); + } + + idx = ldlm_namespace_nr_read(client); + rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client); + if (rc) { + CERROR("%s: cannot initialize lock pool, rc = %d\n", name, rc); + GOTO(out_proc, rc); + } + + ldlm_namespace_register(ns, client); + RETURN(ns); +out_proc: + ldlm_namespace_debugfs_unregister(ns); +out_sysfs: + ldlm_namespace_sysfs_unregister(ns); + ldlm_namespace_cleanup(ns, 0); +out_hash: + OBD_FREE_PTR_ARRAY_LARGE(ns->ns_rs_buckets, 1 << ns->ns_bucket_bits); + kfree(ns->ns_name); + cfs_hash_putref(ns->ns_rs_hash); +out_ns: + OBD_FREE_PTR(ns); +out_ref: + 
ldlm_put_ref(); + RETURN(ERR_PTR(rc)); +} +EXPORT_SYMBOL(ldlm_namespace_new); + +/** + * Cancel and destroy all locks on a resource. + * + * If flags contains FL_LOCAL_ONLY, don't try to tell the server, just + * clean up. This is currently only used for recovery, and we make + * certain assumptions as a result--notably, that we shouldn't cancel + * locks with refs. + */ +static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, + __u64 flags) +{ + struct list_head *tmp; + int rc = 0, client = ns_is_client(ldlm_res_to_ns(res)); + bool local_only = !!(flags & LDLM_FL_LOCAL_ONLY); + + do { + struct ldlm_lock *lock = NULL; + + /* First, we look for non-cleaned-yet lock + * all cleaned locks are marked by CLEANED flag. */ + lock_res(res); + list_for_each(tmp, q) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + if (ldlm_is_cleaned(lock)) { + lock = NULL; + continue; + } + LDLM_LOCK_GET(lock); + ldlm_set_cleaned(lock); + break; + } + + if (lock == NULL) { + unlock_res(res); + break; + } + + /* Set CBPENDING so nothing in the cancellation path + * can match this lock. */ + ldlm_set_cbpending(lock); + ldlm_set_failed(lock); + lock->l_flags |= flags; + + /* ... without sending a CANCEL message for local_only. */ + if (local_only) + ldlm_set_local_only(lock); + + if (local_only && (lock->l_readers || lock->l_writers)) { + /* + * This is a little bit gross, but much better than the + * alternative: pretend that we got a blocking AST from + * the server, so that when the lock is decref'd, it + * will go away ... + */ + unlock_res(res); + LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY"); + if (lock->l_flags & LDLM_FL_FAIL_LOC) + schedule_timeout_uninterruptible( + cfs_time_seconds(4)); + + if (lock->l_completion_ast) + lock->l_completion_ast(lock, + LDLM_FL_FAILED, NULL); + LDLM_LOCK_RELEASE(lock); + continue; + } + + if (client) { + struct lustre_handle lockh; + + unlock_res(res); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_LOCAL); + if (rc) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + unlock_res(res); + LDLM_DEBUG(lock, + "Freeing a lock still held by a client node"); + ldlm_lock_cancel(lock); + } + LDLM_LOCK_RELEASE(lock); + } while (1); +} + +static int ldlm_resource_clean(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + __u64 flags = *(__u64 *)arg; + + cleanup_resource(res, &res->lr_granted, flags); + cleanup_resource(res, &res->lr_waiting, flags); + + return 0; +} + +static int ldlm_resource_complain(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + lock_res(res); + CERROR("%s: namespace resource "DLDLMRES" (%p) refcount nonzero " + "(%d) after lock cleanup; forcing cleanup.\n", + ldlm_ns_name(ldlm_res_to_ns(res)), PLDLMRES(res), res, + atomic_read(&res->lr_refcount) - 1); + + /* Use D_NETERROR since it is in the default mask */ + ldlm_resource_dump(D_NETERROR, res); + unlock_res(res); + return 0; +} + +/** + * Cancel and destroy all locks in the namespace. + * + * Typically used during evictions when server notified client that it was + * evicted and all of its state needs to be destroyed. + * Also used during shutdown. 
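The walk in cleanup_resource() above can be modelled as a small userspace loop: each pass rescans the queue from the head, skips entries already marked CLEANED, marks the next victim under the lock, then processes it with the lock dropped so cancellation may block or re-enter safely. The toy types below are illustrative only:

#include <stdio.h>

struct toylock {
        int cleaned;
};

/* done under the resource lock in the real code: find the first lock not yet
 * visited and mark it so it is never picked twice */
static int pick_next(struct toylock *locks, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                if (!locks[i].cleaned) {
                        locks[i].cleaned = 1;
                        return i;
                }
        }
        return -1;
}

int main(void)
{
        struct toylock locks[3] = { {0}, {0}, {0} };
        int idx;

        while ((idx = pick_next(locks, 3)) >= 0)
                printf("cancelling lock %d\n", idx);    /* done unlocked */
        return 0;
}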
+ */ +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags) +{ + if (ns == NULL) { + CDEBUG(D_INFO, "NULL ns, skipping cleanup\n"); + return ELDLM_OK; + } + + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, + &flags, 0); + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, + NULL, 0); + return ELDLM_OK; +} +EXPORT_SYMBOL(ldlm_namespace_cleanup); + +/** + * Attempts to free namespace. + * + * Only used when namespace goes away, like during an unmount. + */ +static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force) +{ + ENTRY; + + /* At shutdown time, don't call the cancellation callback */ + ldlm_namespace_cleanup(ns, force ? LDLM_FL_LOCAL_ONLY : 0); + + if (atomic_read(&ns->ns_bref) > 0) { + int rc; + CDEBUG(D_DLMTRACE, + "dlm namespace %s free waiting on refcount %d\n", + ldlm_ns_name(ns), atomic_read(&ns->ns_bref)); +force_wait: + if (force) + rc = wait_event_idle_timeout( + ns->ns_waitq, + atomic_read(&ns->ns_bref) == 0, + cfs_time_seconds(1) / 4); + else + rc = l_wait_event_abortable( + ns->ns_waitq, atomic_read(&ns->ns_bref) == 0); + + /* Forced cleanups should be able to reclaim all references, + * so it's safe to wait forever... we can't leak locks... */ + if (force && rc == 0) { + rc = -ETIMEDOUT; + LCONSOLE_ERROR("Forced cleanup waiting for %s " + "namespace with %d resources in use, " + "(rc=%d)\n", ldlm_ns_name(ns), + atomic_read(&ns->ns_bref), rc); + GOTO(force_wait, rc); + } + + if (atomic_read(&ns->ns_bref)) { + LCONSOLE_ERROR("Cleanup waiting for %s namespace " + "with %d resources in use, (rc=%d)\n", + ldlm_ns_name(ns), + atomic_read(&ns->ns_bref), rc); + RETURN(ELDLM_NAMESPACE_EXISTS); + } + CDEBUG(D_DLMTRACE, "dlm namespace %s free done waiting\n", + ldlm_ns_name(ns)); + } + + RETURN(ELDLM_OK); +} + +/** + * Performs various cleanups for passed \a ns to make it drop refc and be + * ready for freeing. Waits for refc == 0. + * + * The following is done: + * (0) Unregister \a ns from its list to make inaccessible for potential + * users like pools thread and others; + * (1) Clear all locks in \a ns. + */ +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) +{ + int rc; + + ENTRY; + if (!ns) { + EXIT; + return; + } + + spin_lock(&ns->ns_lock); + ns->ns_stopping = 1; + spin_unlock(&ns->ns_lock); + + /* + * Can fail with -EINTR when force == 0 in which case try harder. + */ + rc = __ldlm_namespace_free(ns, force); + if (rc != ELDLM_OK) { + if (imp) { + ptlrpc_disconnect_import(imp, 0); + ptlrpc_invalidate_import(imp); + } + + /* + * With all requests dropped and the import inactive + * we are gaurenteed all reference will be dropped. + */ + rc = __ldlm_namespace_free(ns, 1); + LASSERT(rc == 0); + } + EXIT; +} +EXPORT_SYMBOL(ldlm_namespace_free_prior); + +/** + * Performs freeing memory structures related to \a ns. This is only done + * when ldlm_namespce_free_prior() successfully removed all resources + * referencing \a ns and its refc == 0. + */ +void ldlm_namespace_free_post(struct ldlm_namespace *ns) +{ + ENTRY; + if (!ns) { + EXIT; + return; + } + + /* Make sure that nobody can find this ns in its list. */ + ldlm_namespace_unregister(ns, ns->ns_client); + /* Fini pool _before_ parent proc dir is removed. This is important as + * ldlm_pool_fini() removes own proc dir which is child to @dir. + * Removing it after @dir may cause oops. 
*/ + ldlm_pool_fini(&ns->ns_pool); + + ldlm_namespace_debugfs_unregister(ns); + ldlm_namespace_sysfs_unregister(ns); + cfs_hash_putref(ns->ns_rs_hash); + OBD_FREE_PTR_ARRAY_LARGE(ns->ns_rs_buckets, 1 << ns->ns_bucket_bits); + kfree(ns->ns_name); + /* Namespace \a ns should be not on list at this time, otherwise + * this will cause issues related to using freed \a ns in poold + * thread. + */ + LASSERT(list_empty(&ns->ns_list_chain)); + OBD_FREE_PTR(ns); + ldlm_put_ref(); + EXIT; +} +EXPORT_SYMBOL(ldlm_namespace_free_post); + +/** + * Cleanup the resource, and free namespace. + * bug 12864: + * Deadlock issue: + * proc1: destroy import + * class_disconnect_export(grab cl_sem) -> + * -> ldlm_namespace_free -> + * -> lprocfs_remove(grab _lprocfs_lock). + * proc2: read proc info + * lprocfs_fops_read(grab _lprocfs_lock) -> + * -> osc_rd_active, etc(grab cl_sem). + * + * So that I have to split the ldlm_namespace_free into two parts - the first + * part ldlm_namespace_free_prior is used to cleanup the resource which is + * being used; the 2nd part ldlm_namespace_free_post is used to unregister the + * lprocfs entries, and then free memory. It will be called w/o cli->cl_sem + * held. + */ +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) +{ + ldlm_namespace_free_prior(ns, imp, force); + ldlm_namespace_free_post(ns); +} +EXPORT_SYMBOL(ldlm_namespace_free); + +void ldlm_namespace_get(struct ldlm_namespace *ns) +{ + atomic_inc(&ns->ns_bref); +} + +/* This is only for callers that care about refcount */ +static int ldlm_namespace_get_return(struct ldlm_namespace *ns) +{ + return atomic_inc_return(&ns->ns_bref); +} + +void ldlm_namespace_put(struct ldlm_namespace *ns) +{ + if (atomic_dec_and_lock(&ns->ns_bref, &ns->ns_lock)) { + wake_up(&ns->ns_waitq); + spin_unlock(&ns->ns_lock); + } +} + +/** Register \a ns in the list of namespaces */ +void ldlm_namespace_register(struct ldlm_namespace *ns, enum ldlm_side client) +{ + mutex_lock(ldlm_namespace_lock(client)); + LASSERT(list_empty(&ns->ns_list_chain)); + list_add(&ns->ns_list_chain, ldlm_namespace_inactive_list(client)); + ldlm_namespace_nr_inc(client); + mutex_unlock(ldlm_namespace_lock(client)); +} + +/** Unregister \a ns from the list of namespaces. */ +void ldlm_namespace_unregister(struct ldlm_namespace *ns, enum ldlm_side client) +{ + mutex_lock(ldlm_namespace_lock(client)); + LASSERT(!list_empty(&ns->ns_list_chain)); + /* Some asserts and possibly other parts of the code are still + * using list_empty(&ns->ns_list_chain). This is why it is + * important to use list_del_init() here. */ + list_del_init(&ns->ns_list_chain); + ldlm_namespace_nr_dec(client); + mutex_unlock(ldlm_namespace_lock(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *ns, + enum ldlm_side client) +{ + LASSERT(!list_empty(&ns->ns_list_chain)); + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + list_move_tail(&ns->ns_list_chain, ldlm_namespace_list(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *ns, + enum ldlm_side client) +{ + LASSERT(!list_empty(&ns->ns_list_chain)); + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + list_move_tail(&ns->ns_list_chain, + ldlm_namespace_inactive_list(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. 
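ldlm_namespace_put() above wakes ns_waitq only when the reference count actually reaches zero, and does so while holding ns_lock so a concurrent waiter cannot miss the wakeup. A minimal pthread sketch of that pattern with made-up names follows; unlike atomic_dec_and_lock() it takes the lock on every put, which is a simplification:

#include <pthread.h>

struct nsref {
        int count;
        pthread_mutex_t lock;
        pthread_cond_t waitq;
};

/* drop one reference; waiters (e.g. __ldlm_namespace_free-style code) are
 * only woken when the count hits zero, and the wakeup happens under the
 * lock so it cannot race with a waiter checking the count */
static void nsref_put(struct nsref *r)
{
        pthread_mutex_lock(&r->lock);
        if (--r->count == 0)
                pthread_cond_broadcast(&r->waitq);
        pthread_mutex_unlock(&r->lock);
}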
*/ +struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side client) +{ + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + LASSERT(!list_empty(ldlm_namespace_list(client))); + return container_of(ldlm_namespace_list(client)->next, + struct ldlm_namespace, ns_list_chain); +} + +static bool ldlm_resource_extent_new(struct ldlm_resource *res) +{ + int idx; + + OBD_SLAB_ALLOC(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + if (res->lr_itree == NULL) + return false; + /* Initialize interval trees for each lock mode. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + res->lr_itree[idx].lit_size = 0; + res->lr_itree[idx].lit_mode = BIT(idx); + res->lr_itree[idx].lit_root = NULL; + } + return true; +} + +static bool ldlm_resource_inodebits_new(struct ldlm_resource *res) +{ + int i; + + OBD_ALLOC_PTR(res->lr_ibits_queues); + if (res->lr_ibits_queues == NULL) + return false; + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + INIT_LIST_HEAD(&res->lr_ibits_queues->liq_waiting[i]); + return true; +} + +/** Create and initialize new resource. */ +static struct ldlm_resource *ldlm_resource_new(enum ldlm_type ldlm_type) +{ + struct ldlm_resource *res; + bool rc; + + OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, GFP_NOFS); + if (res == NULL) + return NULL; + + switch (ldlm_type) { + case LDLM_EXTENT: + rc = ldlm_resource_extent_new(res); + break; + case LDLM_IBITS: + rc = ldlm_resource_inodebits_new(res); + break; + default: + rc = true; + break; + } + if (!rc) { + OBD_SLAB_FREE_PTR(res, ldlm_resource_slab); + return NULL; + } + + INIT_LIST_HEAD(&res->lr_granted); + INIT_LIST_HEAD(&res->lr_waiting); + + atomic_set(&res->lr_refcount, 1); + spin_lock_init(&res->lr_lock); + lu_ref_init(&res->lr_reference); + + /* Since LVB init can be delayed now, there is no longer need to + * immediatelly acquire mutex here. */ + mutex_init(&res->lr_lvb_mutex); + res->lr_lvb_initialized = false; + + return res; +} + +static void __ldlm_resource_free(struct rcu_head *head) +{ + struct ldlm_resource *res = container_of(head, struct ldlm_resource, + lr_rcu); + + OBD_SLAB_FREE_PTR(res, ldlm_resource_slab); +} + +static void ldlm_resource_free(struct ldlm_resource *res) +{ + if (res->lr_type == LDLM_EXTENT) { + if (res->lr_itree != NULL) + OBD_SLAB_FREE(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + } else if (res->lr_type == LDLM_IBITS) { + if (res->lr_ibits_queues != NULL) + OBD_FREE_PTR(res->lr_ibits_queues); + } + + call_rcu(&res->lr_rcu, __ldlm_resource_free); +} + +/** + * Return a reference to resource with given name, creating it if necessary. 
+ * Args: namespace with ns_lock unlocked + * Locks: takes and releases NS hash-lock and res->lr_lock + * Returns: referenced, unlocked ldlm_resource or ERR_PTR + */ +struct ldlm_resource * +ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, + const struct ldlm_res_id *name, enum ldlm_type type, + int create) +{ + struct hlist_node *hnode; + struct ldlm_resource *res = NULL; + struct cfs_hash_bd bd; + __u64 version; + int ns_refcount = 0; + int hash; + + LASSERT(ns != NULL); + LASSERT(parent == NULL); + LASSERT(ns->ns_rs_hash != NULL); + LASSERT(name->name[0] != 0); + + cfs_hash_bd_get_and_lock(ns->ns_rs_hash, (void *)name, &bd, 0); + hnode = cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); + if (hnode != NULL) { + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); + GOTO(found, res); + } + + version = cfs_hash_bd_version_get(&bd); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); + + if (create == 0) + return ERR_PTR(-ENOENT); + + LASSERTF(type >= LDLM_MIN_TYPE && type < LDLM_MAX_TYPE, + "type: %d\n", type); + res = ldlm_resource_new(type); + if (res == NULL) + return ERR_PTR(-ENOMEM); + + hash = ldlm_res_hop_fid_hash(name, ns->ns_bucket_bits); + res->lr_ns_bucket = &ns->ns_rs_buckets[hash]; + res->lr_name = *name; + res->lr_type = type; + + cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1); + hnode = (version == cfs_hash_bd_version_get(&bd)) ? NULL : + cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); + + if (hnode != NULL) { + /* Someone won the race and already added the resource. */ + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + /* Clean lu_ref for failed resource. */ + lu_ref_fini(&res->lr_reference); + ldlm_resource_free(res); +found: + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return res; + } + /* We won! Let's add the resource. */ + cfs_hash_bd_add_locked(ns->ns_rs_hash, &bd, &res->lr_hash); + if (atomic_inc_return(&res->lr_ns_bucket->nsb_count) == 1) + ns_refcount = ldlm_namespace_get_return(ns); + + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CREATE_RESOURCE, 2); + + /* Let's see if we happened to be the very first resource in this + * namespace. If so, and this is a client namespace, we need to move + * the namespace into the active namespaces list to be patrolled by + * the ldlm_poold. */ + if (ns_is_client(ns) && ns_refcount == 1) { + mutex_lock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + ldlm_namespace_move_to_active_locked(ns, LDLM_NAMESPACE_CLIENT); + mutex_unlock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + } + + return res; +} +EXPORT_SYMBOL(ldlm_resource_get); + +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res) +{ + LASSERT(res != NULL); + LASSERT(res != LP_POISON); + atomic_inc(&res->lr_refcount); + CDEBUG(D_INFO, "getref res: %p count: %d\n", res, + atomic_read(&res->lr_refcount)); + return res; +} + +static void __ldlm_resource_putref_final(struct cfs_hash_bd *bd, + struct ldlm_resource *res) +{ + struct ldlm_ns_bucket *nsb = res->lr_ns_bucket; + + if (!list_empty(&res->lr_granted)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + if (!list_empty(&res->lr_waiting)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash, + bd, &res->lr_hash); + lu_ref_fini(&res->lr_reference); + if (atomic_dec_and_test(&nsb->nsb_count)) + ldlm_namespace_put(nsb->nsb_namespace); +} + +/* Returns 1 if the resource was freed, 0 if it remains. 
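The lookup/create race handling in ldlm_resource_get() above, which records the bucket version, allocates with the lock dropped, then re-checks before inserting, reduces to the following userspace sketch; the one-slot bucket and all names are illustrative only:

#include <pthread.h>
#include <stddef.h>

struct bucket {
        pthread_mutex_t lock;
        unsigned long version;          /* bumped on every insert */
        const char *slot;               /* NULL == no resource yet */
};

/* lookup-or-create in the spirit of ldlm_resource_get(): the new entry is
 * prepared outside the bucket lock, and the saved version tells us whether
 * a racing thread inserted one while the lock was dropped */
static const char *resource_get(struct bucket *b, const char *newres)
{
        unsigned long seen;
        const char *res;

        pthread_mutex_lock(&b->lock);
        res = b->slot;
        if (res) {                              /* fast path: found it */
                pthread_mutex_unlock(&b->lock);
                return res;
        }
        seen = b->version;
        pthread_mutex_unlock(&b->lock);

        /* ... allocate and initialize newres here, unlocked ... */

        pthread_mutex_lock(&b->lock);
        if (b->version != seen && b->slot) {    /* lost the race */
                res = b->slot;
                pthread_mutex_unlock(&b->lock);
                return res;                     /* caller frees its own copy */
        }
        b->slot = newres;                       /* we won: publish our entry */
        b->version++;
        pthread_mutex_unlock(&b->lock);
        return newres;
}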
*/ +int ldlm_resource_putref(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + struct cfs_hash_bd bd; + + LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "putref res: %p count: %d\n", + res, atomic_read(&res->lr_refcount) - 1); + + cfs_hash_bd_get(ns->ns_rs_hash, &res->lr_name, &bd); + if (cfs_hash_bd_dec_and_lock(ns->ns_rs_hash, &bd, &res->lr_refcount)) { + __ldlm_resource_putref_final(&bd, res); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free) + ns->ns_lvbo->lvbo_free(res); + ldlm_resource_free(res); + return 1; + } + return 0; +} +EXPORT_SYMBOL(ldlm_resource_putref); + +static void __ldlm_resource_add_lock(struct ldlm_resource *res, + struct list_head *head, + struct ldlm_lock *lock, + bool tail) +{ + check_res_locked(res); + + if (ldlm_is_destroyed(lock)) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + + LASSERT(list_empty(&lock->l_res_link)); + + if (tail) + list_add_tail(&lock->l_res_link, head); + else + list_add(&lock->l_res_link, head); + + if (res->lr_type == LDLM_IBITS) + ldlm_inodebits_add_lock(res, head, lock, tail); + + ldlm_resource_dump(D_INFO, res); +} + +/** + * Add a lock into a given resource into specified lock list. + */ +void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock) +{ + LDLM_DEBUG(lock, "About to add this lock"); + + __ldlm_resource_add_lock(res, head, lock, true); +} + +/** + * Insert a lock into resource after specified lock. + */ +void ldlm_resource_insert_lock_after(struct ldlm_lock *original, + struct ldlm_lock *new) +{ + LASSERT(!list_empty(&original->l_res_link)); + + LDLM_DEBUG(new, "About to insert this lock after %p: ", original); + __ldlm_resource_add_lock(original->l_resource, + &original->l_res_link, + new, false); +} + +/** + * Insert a lock into resource before the specified lock. + * + * IBITS waiting locks are to be inserted to the ibit lists as well, and only + * the insert-after operation is supported for them, because the set of bits + * of the previous and the new locks must match. Therefore, get the previous + * lock and insert after. + */ +void ldlm_resource_insert_lock_before(struct ldlm_lock *original, + struct ldlm_lock *new) +{ + LASSERT(!list_empty(&original->l_res_link)); + + LDLM_DEBUG(new, "About to insert this lock before %p: ", original); + __ldlm_resource_add_lock(original->l_resource, + original->l_res_link.prev, new, false); +} + +void ldlm_resource_unlink_lock(struct ldlm_lock *lock) +{ + int type = lock->l_resource->lr_type; + + check_res_locked(lock->l_resource); + switch (type) { + case LDLM_PLAIN: + ldlm_unlink_lock_skiplist(lock); + break; + case LDLM_EXTENT: + ldlm_extent_unlink_lock(lock); + break; + case LDLM_IBITS: + ldlm_inodebits_unlink_lock(lock); + break; + } + list_del_init(&lock->l_res_link); +} +EXPORT_SYMBOL(ldlm_resource_unlink_lock); + +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc) +{ + desc->lr_type = res->lr_type; + desc->lr_name = res->lr_name; +} + +/** + * Print information about all locks in all namespaces on this node to debug + * log. 
+ */ +void ldlm_dump_all_namespaces(enum ldlm_side client, int level) +{ + struct list_head *tmp; + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + mutex_lock(ldlm_namespace_lock(client)); + + list_for_each(tmp, ldlm_namespace_list(client)) { + struct ldlm_namespace *ns; + + ns = list_entry(tmp, struct ldlm_namespace, ns_list_chain); + ldlm_namespace_dump(level, ns); + } + + mutex_unlock(ldlm_namespace_lock(client)); +} + +static int ldlm_res_hash_dump(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + int level = (int)(unsigned long)arg; + + lock_res(res); + ldlm_resource_dump(level, res); + unlock_res(res); + + return 0; +} + +/** + * Print information about all locks in this namespace on this node to debug + * log. + */ +void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) +{ + if (!((libcfs_debug | D_ERROR) & level)) + return; + + CDEBUG(level, "--- Namespace: %s (rc: %d, side: %s)\n", + ldlm_ns_name(ns), atomic_read(&ns->ns_bref), + ns_is_client(ns) ? "client" : "server"); + + if (ktime_get_seconds() < ns->ns_next_dump) + return; + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_hash_dump, + (void *)(unsigned long)level, 0); + spin_lock(&ns->ns_lock); + ns->ns_next_dump = ktime_get_seconds() + 10; + spin_unlock(&ns->ns_lock); +} + +/** + * Print information about all locks in this resource to debug log. + */ +void ldlm_resource_dump(int level, struct ldlm_resource *res) +{ + struct ldlm_lock *lock; + unsigned int granted = 0; + + BUILD_BUG_ON(RES_NAME_SIZE != 4); + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + CDEBUG(level, "--- Resource: "DLDLMRES" (%p) refcount = %d\n", + PLDLMRES(res), res, atomic_read(&res->lr_refcount)); + + if (!list_empty(&res->lr_granted)) { + CDEBUG(level, "Granted locks (in reverse order):\n"); + list_for_each_entry_reverse(lock, &res->lr_granted, + l_res_link) { + LDLM_DEBUG_LIMIT(level, lock, "###"); + if (!(level & D_CANTMASK) && + ++granted > ldlm_dump_granted_max) { + CDEBUG(level, + "only dump %d granted locks to avoid DDOS.\n", + granted); + break; + } + } + } + + if (!list_empty(&res->lr_waiting)) { + CDEBUG(level, "Waiting locks:\n"); + list_for_each_entry(lock, &res->lr_waiting, l_res_link) + LDLM_DEBUG_LIMIT(level, lock, "###"); + } +} +EXPORT_SYMBOL(ldlm_resource_dump); diff --git a/drivers/staging/lustrefsx/lustre/llite/Makefile b/drivers/staging/lustrefsx/lustre/llite/Makefile new file mode 100644 index 0000000000000..4650e91efc0df --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/Makefile @@ -0,0 +1,15 @@ +obj-$(CONFIG_LUSTREFSX_FS) += lustre.o + +lustre-y := dcache.o dir.o file.o llite_lib.o llite_nfs.o +lustre-y += rw.o lproc_llite.o namei.o symlink.o llite_mmap.o +lustre-y += xattr.o xattr_cache.o +lustre-y += rw26.o super25.o statahead.o xattr_security.o +lustre-y += glimpse.o +lustre-y += lcommon_cl.o +lustre-y += lcommon_misc.o +lustre-y += vvp_dev.o vvp_page.o vvp_io.o vvp_object.o +lustre-y += pcc.o crypto.o +lustre-y += llite_foreign.o llite_foreign_symlink.o +lustre-y += acl.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/llite/acl.c b/drivers/staging/lustrefsx/lustre/llite/acl.c new file mode 100644 index 0000000000000..bdd6841781409 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/acl.c @@ -0,0 +1,136 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/acl.c + * + * Author: Peter Braam + * Author: Phil Schwan + * Author: Andreas Dilger + */ + +#include "llite_internal.h" + +struct posix_acl *ll_get_acl( + #ifdef HAVE_ACL_WITH_DENTRY + struct user_namespace *ns, struct dentry *dentry, int type) + #elif defined HAVE_GET_ACL_RCU_ARG + struct inode *inode, int type, bool rcu) + #else + struct inode *inode, int type) + #endif /* HAVE_GET_ACL_RCU_ARG */ +{ +#ifdef HAVE_ACL_WITH_DENTRY + struct inode *inode = dentry->d_inode; +#endif + struct ll_inode_info *lli = ll_i2info(inode); + struct posix_acl *acl = NULL; + ENTRY; + +#ifdef HAVE_GET_ACL_RCU_ARG + if (rcu) + return ERR_PTR(-ECHILD); +#endif + + read_lock(&lli->lli_lock); + /* VFS' acl_permission_check->check_acl will release the refcount */ + acl = posix_acl_dup(lli->lli_posix_acl); + read_unlock(&lli->lli_lock); + + RETURN(acl); +} + +#ifdef HAVE_IOP_SET_ACL +int ll_set_acl(struct user_namespace *mnt_userns, +#ifdef HAVE_ACL_WITH_DENTRY + struct dentry *dentry, +#else + struct inode *inode, +#endif + struct posix_acl *acl, int type) +{ +#ifdef HAVE_ACL_WITH_DENTRY + struct inode *inode = dentry->d_inode; +#endif + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + const char *name = NULL; + char *value = NULL; + size_t value_size = 0; + int rc = 0; + ENTRY; + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_POSIX_ACL_ACCESS; + if (acl) + rc = posix_acl_update_mode(mnt_userns, inode, + &inode->i_mode, &acl); + break; + + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + rc = acl ? -EACCES : 0; + break; + + default: + rc = -EINVAL; + break; + } + if (rc) + return rc; + + if (acl) { + value_size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(value_size, GFP_NOFS); + if (value == NULL) + GOTO(out, rc = -ENOMEM); + + rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size); + if (rc < 0) + GOTO(out_value, rc); + } + + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), + value ? 
OBD_MD_FLXATTR : OBD_MD_FLXATTRRM, + name, value, value_size, 0, 0, &req); + + ptlrpc_req_finished(req); +out_value: + kfree(value); +out: + if (rc) + forget_cached_acl(inode, type); + else + set_cached_acl(inode, type, acl); + RETURN(rc); +} +#endif /* HAVE_IOP_SET_ACL */ diff --git a/drivers/staging/lustrefsx/lustre/llite/crypto.c b/drivers/staging/lustrefsx/lustre/llite/crypto.c new file mode 100644 index 0000000000000..a832d4d119d6e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/crypto.c @@ -0,0 +1,562 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2019, 2020, Whamcloud. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include "llite_internal.h" + +#ifdef HAVE_LUSTRE_CRYPTO +#include + +static int ll_get_context(struct inode *inode, void *ctx, size_t len) +{ + int rc; + + /* Get enc context xattr directly instead of going through the VFS, + * as there is no xattr handler for "encryption.". + */ + rc = ll_xattr_list(inode, xattr_for_enc(inode), + XATTR_ENCRYPTION_T, ctx, len, OBD_MD_FLXATTR); + + /* used as encryption unit size */ + if (S_ISREG(inode->i_mode)) + inode->i_blkbits = LUSTRE_ENCRYPTION_BLOCKBITS; + return rc; +} + +int ll_set_encflags(struct inode *inode, void *encctx, __u32 encctxlen, + bool preload) +{ + unsigned int ext_flags; + int rc = 0; + + /* used as encryption unit size */ + if (S_ISREG(inode->i_mode)) + inode->i_blkbits = LUSTRE_ENCRYPTION_BLOCKBITS; + ext_flags = ll_inode_to_ext_flags(inode->i_flags) | LUSTRE_ENCRYPT_FL; + ll_update_inode_flags(inode, ext_flags); + + if (encctx && encctxlen) + rc = ll_xattr_cache_insert(inode, + xattr_for_enc(inode), + encctx, encctxlen); + if (rc) + return rc; + + return preload ? llcrypt_get_encryption_info(inode) : 0; +} + +/* ll_set_context has 2 distinct behaviors, depending on the value of inode + * parameter: + * - inode is NULL: + * passed fs_data is a struct md_op_data *. We need to store enc ctx in + * op_data, so that it will be sent along to the server with the request that + * the caller is preparing, thus saving a setxattr request. + * - inode is not NULL: + * normal case, letting proceed with setxattr operation. + * This use case should only be used when explicitly setting a new encryption + * policy on an existing, empty directory. 
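A minimal sketch of the two .set_context() behaviours described above, either stashing the context so it rides along with the request being built or pushing it to the server right away; the pending_op struct and send_enc_xattr() are made-up stand-ins for md_op_data and md_setxattr():

#include <stdlib.h>
#include <string.h>

struct pending_op {
        void *encctx;
        size_t encctx_size;
};

static int send_enc_xattr(const void *ctx, size_t len)
{
        (void)ctx;
        (void)len;              /* placeholder for the real setxattr RPC */
        return 0;
}

/* no inode yet: copy the context into the pending request so it is sent
 * with the create; existing inode: send a dedicated setxattr immediately */
static int set_context(struct pending_op *op, const void *ctx, size_t len,
                       int have_inode)
{
        if (!have_inode) {
                op->encctx = malloc(len);
                if (!op->encctx)
                        return -1;
                memcpy(op->encctx, ctx, len);
                op->encctx_size = len;
                return 0;
        }
        return send_enc_xattr(ctx, len);
}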
+ */ +static int ll_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + struct ptlrpc_request *req = NULL; + struct ll_sb_info *sbi; + int rc; + + if (inode == NULL) { + struct md_op_data *op_data = (struct md_op_data *)fs_data; + + if (!op_data) + return -EINVAL; + + OBD_ALLOC(op_data->op_file_encctx, len); + if (op_data->op_file_encctx == NULL) + return -ENOMEM; + op_data->op_file_encctx_size = len; + memcpy(op_data->op_file_encctx, ctx, len); + return 0; + } + + /* Encrypting the root directory is not allowed */ + if (is_root_inode(inode)) + return -EPERM; + + sbi = ll_i2sbi(inode); + /* Send setxattr request to lower layers directly instead of going + * through the VFS, as there is no xattr handler for "encryption.". + */ + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), + OBD_MD_FLXATTR, xattr_for_enc(inode), + ctx, len, XATTR_CREATE, ll_i2suppgid(inode), &req); + if (rc) + return rc; + ptlrpc_req_finished(req); + + return ll_set_encflags(inode, (void *)ctx, len, false); +} + +/** + * ll_file_open_encrypt() - overlay to llcrypt_file_open + * @inode: the inode being opened + * @filp: the struct file being set up + * + * This overlay function is necessary to handle encrypted file open without + * the key. We allow this access pattern to applications that know what they + * are doing, by using the specific flag O_FILE_ENC. + * This flag is only compatible with O_DIRECT IOs, to make sure ciphertext + * data is wiped from page cache once IOs are finished. + */ +int ll_file_open_encrypt(struct inode *inode, struct file *filp) +{ + int rc; + + rc = llcrypt_file_open(inode, filp); + if (likely(rc != -ENOKEY)) + return rc; + + if (rc == -ENOKEY && + (filp->f_flags & O_FILE_ENC) == O_FILE_ENC && + filp->f_flags & O_DIRECT) + /* allow file open with O_FILE_ENC flag when we have O_DIRECT */ + rc = 0; + + return rc; +} + +void llcrypt_free_ctx(void *encctx, __u32 size) +{ + if (encctx) + OBD_FREE(encctx, size); +} + +#ifdef HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED +bool ll_sb_has_test_dummy_encryption(struct super_block *sb) +{ + struct ll_sb_info *sbi = s2lsi(sb)->lsi_llsbi; + + return sbi ? + unlikely(test_bit(LL_SBI_TEST_DUMMY_ENCRYPTION, sbi->ll_flags)) : + false; +} + +static bool ll_dummy_context(struct inode *inode) +{ + return ll_sb_has_test_dummy_encryption(inode->i_sb); +} +#else +static const union llcrypt_context * +ll_get_dummy_context(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + return lsi ? lsi->lsi_dummy_enc_ctx.ctx : NULL; +} + +bool ll_sb_has_test_dummy_encryption(struct super_block *sb) +{ + return ll_get_dummy_context(sb) != NULL; +} +#endif + +bool ll_sbi_has_encrypt(struct ll_sb_info *sbi) +{ + return test_bit(LL_SBI_ENCRYPT, sbi->ll_flags); +} + +void ll_sbi_set_encrypt(struct ll_sb_info *sbi, bool set) +{ + if (set) { + set_bit(LL_SBI_ENCRYPT, sbi->ll_flags); + } else { + clear_bit(LL_SBI_ENCRYPT, sbi->ll_flags); + clear_bit(LL_SBI_TEST_DUMMY_ENCRYPTION, sbi->ll_flags); + } +} + +bool ll_sbi_has_name_encrypt(struct ll_sb_info *sbi) +{ + return test_bit(LL_SBI_ENCRYPT_NAME, sbi->ll_flags); +} + +void ll_sbi_set_name_encrypt(struct ll_sb_info *sbi, bool set) +{ + if (set) + set_bit(LL_SBI_ENCRYPT_NAME, sbi->ll_flags); + else + clear_bit(LL_SBI_ENCRYPT_NAME, sbi->ll_flags); +} + +static bool ll_empty_dir(struct inode *inode) +{ + /* used by llcrypt_ioctl_set_policy(), because a policy can only be set + * on an empty dir. + */ + /* Here we choose to return true, meaning we always call .set_context. 
+ * Then we rely on server side, with mdd_fix_attr() that calls + * mdd_dir_is_empty() when setting encryption flag on directory. + */ + return true; +} + +/** + * ll_setup_filename() - overlay to llcrypt_setup_filename + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. + * @fname: the filename information to be filled in + * @fid: fid retrieved from user-provided filename + * + * This overlay function is necessary to properly encode @fname after + * encryption, as it will be sent over the wire. + * This overlay function is also necessary to handle the case of operations + * carried out without the key. Normally llcrypt makes use of digested names in + * that case. Having a digested name works for local file systems that can call + * llcrypt_match_name(), but Lustre server side is not aware of encryption. + * So for keyless @lookup operations on long names, for Lustre we choose to + * present to users the encoded struct ll_digest_filename, instead of a digested + * name. FID and name hash can then easily be extracted and put into the + * requests sent to servers. + */ +int ll_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct llcrypt_name *fname, + struct lu_fid *fid) +{ + int digested = 0; + struct qstr dname; + int rc; + + if (fid && IS_ENCRYPTED(dir) && llcrypt_policy_has_filename_enc(dir) && + !llcrypt_has_encryption_key(dir)) { + struct lustre_sb_info *lsi = s2lsi(dir->i_sb); + + if ((!(lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI) && + iname->name[0] == LLCRYPT_DIGESTED_CHAR) || + ((lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI) && + iname->name[0] == LLCRYPT_DIGESTED_CHAR_OLD)) + digested = 1; + } + + dname.name = iname->name + digested; + dname.len = iname->len - digested; + + if (fid) { + fid->f_seq = 0; + fid->f_oid = 0; + fid->f_ver = 0; + } + rc = llcrypt_setup_filename(dir, &dname, lookup, fname); + if (rc == -ENOENT && lookup) { + if (((is_root_inode(dir) && + iname->len == strlen(dot_fscrypt_name) && + strncmp(iname->name, dot_fscrypt_name, iname->len) == 0) || + (!llcrypt_has_encryption_key(dir) && + unlikely(filename_is_volatile(iname->name, + iname->len, NULL))))) { + /* In case of subdir mount of an encrypted directory, + * we allow lookup of /.fscrypt directory. + */ + /* For purpose of migration or mirroring without enc key + * we allow lookup of volatile file without enc context. + */ + memset(fname, 0, sizeof(struct llcrypt_name)); + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + rc = 0; + } else if (!llcrypt_has_encryption_key(dir)) { + rc = -ENOKEY; + } + } + if (rc) + return rc; + + if (digested) { + /* Without the key, for long names user should have struct + * ll_digest_filename representation of the dentry instead of + * the name. So make sure it is valid, return fid and put + * excerpt of cipher text name in disk_name. 
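+ *
+ * Rough shape of the structure as it is used here (the exact definition,
+ * including the size of the excerpt array, lives in the llite headers):
+ *
+ *   struct ll_digest_filename {
+ *           struct lu_fid ldf_fid;       - FID of the target file
+ *           char ldf_excerpt[...];       - fixed-size ciphertext excerpt
+ *   };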
+ */ + struct ll_digest_filename *digest; + + if (fname->crypto_buf.len < sizeof(struct ll_digest_filename)) { + rc = -EINVAL; + goto out_free; + } + digest = (struct ll_digest_filename *)fname->disk_name.name; + *fid = digest->ldf_fid; + if (!fid_is_sane(fid)) { + rc = -EINVAL; + goto out_free; + } + fname->disk_name.name = digest->ldf_excerpt; + fname->disk_name.len = sizeof(digest->ldf_excerpt); + } + if (IS_ENCRYPTED(dir) && + !name_is_dot_or_dotdot(fname->disk_name.name, + fname->disk_name.len)) { + int presented_len = critical_chars(fname->disk_name.name, + fname->disk_name.len); + char *buf; + + buf = kmalloc(presented_len + 1, GFP_NOFS); + if (!buf) { + rc = -ENOMEM; + goto out_free; + } + + if (presented_len == fname->disk_name.len) + memcpy(buf, fname->disk_name.name, presented_len); + else + critical_encode(fname->disk_name.name, + fname->disk_name.len, buf); + buf[presented_len] = '\0'; + kfree(fname->crypto_buf.name); + fname->crypto_buf.name = buf; + fname->crypto_buf.len = presented_len; + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + } + + return rc; + +out_free: + llcrypt_free_filename(fname); + return rc; +} + +/** + * ll_fname_disk_to_usr() - overlay to llcrypt_fname_disk_to_usr + * @inode: the inode to convert name + * @hash: major hash for inode + * @minor_hash: minor hash for inode + * @iname: the user-provided filename needing conversion + * @oname: the filename information to be filled in + * @fid: the user-provided fid for filename + * + * The caller must have allocated sufficient memory for the @oname string. + * + * This overlay function is necessary to properly decode @iname before + * decryption, as it comes from the wire. + * This overlay function is also necessary to handle the case of operations + * carried out without the key. Normally llcrypt makes use of digested names in + * that case. Having a digested name works for local file systems that can call + * llcrypt_match_name(), but Lustre server side is not aware of encryption. + * So for keyless @lookup operations on long names, for Lustre we choose to + * present to users the encoded struct ll_digest_filename, instead of a digested + * name. FID and name hash can then easily be extracted and put into the + * requests sent to servers. + */ +int ll_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + struct llcrypt_str *iname, struct llcrypt_str *oname, + struct lu_fid *fid) +{ + struct llcrypt_str lltr = LLTR_INIT(iname->name, iname->len); + struct ll_digest_filename digest; + int digested = 0; + char *buf = NULL; + int rc; + + if (IS_ENCRYPTED(inode)) { + if (!name_is_dot_or_dotdot(lltr.name, lltr.len) && + strnchr(lltr.name, lltr.len, '=')) { + /* Only proceed to critical decode if + * iname contains espace char '='. + */ + int len = lltr.len; + + buf = kmalloc(len, GFP_NOFS); + if (!buf) + return -ENOMEM; + + len = critical_decode(lltr.name, len, buf); + lltr.name = buf; + lltr.len = len; + } + if (lltr.len > LL_CRYPTO_BLOCK_SIZE * 2 && + !llcrypt_has_encryption_key(inode) && + llcrypt_policy_has_filename_enc(inode)) { + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + + digested = 1; + /* Without the key for long names, set the dentry name + * to the representing struct ll_digest_filename. It + * will be encoded by llcrypt for display, and will + * enable further lookup requests. 
+ */ + if (!fid) + return -EINVAL; + digest.ldf_fid = *fid; + memcpy(digest.ldf_excerpt, + LLCRYPT_EXTRACT_DIGEST(lltr.name, lltr.len), + sizeof(digest.ldf_excerpt)); + + lltr.name = (char *)&digest; + lltr.len = sizeof(digest); + + if (!(lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI)) + oname->name[0] = LLCRYPT_DIGESTED_CHAR; + else + oname->name[0] = LLCRYPT_DIGESTED_CHAR_OLD; + oname->name = oname->name + 1; + oname->len--; + } + } + + rc = llcrypt_fname_disk_to_usr(inode, hash, minor_hash, &lltr, oname); + + kfree(buf); + oname->name = oname->name - digested; + oname->len = oname->len + digested; + + return rc; +} + +/* Copied from llcrypt_d_revalidate, as it is not exported */ +/* + * Validate dentries in encrypted directories to make sure we aren't potentially + * caching stale dentries after a key has been added. + */ +int ll_revalidate_d_crypto(struct dentry *dentry, unsigned int flags) +{ + struct dentry *dir; + int err; + int valid; + + /* + * Plaintext names are always valid, since llcrypt doesn't support + * reverting to ciphertext names without evicting the directory's inode + * -- which implies eviction of the dentries in the directory. + */ + if (!llcrypt_is_nokey_name(dentry)) + return 1; + + /* + * Ciphertext name; valid if the directory's key is still unavailable. + * + * Although llcrypt forbids rename() on ciphertext names, we still must + * use dget_parent() here rather than use ->d_parent directly. That's + * because a corrupted fs image may contain directory hard links, which + * the VFS handles by moving the directory's dentry tree in the dcache + * each time ->lookup() finds the directory and it already has a dentry + * elsewhere. Thus ->d_parent can be changing, and we must safely grab + * a reference to some ->d_parent to prevent it from being freed. 
+ */ + + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dget_parent(dentry); + err = llcrypt_get_encryption_info(d_inode(dir)); + valid = !llcrypt_has_encryption_key(d_inode(dir)); + dput(dir); + + if (err < 0) + return err; + + return valid; +} + +const struct llcrypt_operations lustre_cryptops = { + .key_prefix = "lustre:", + .get_context = ll_get_context, + .set_context = ll_set_context, +#ifdef HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED + .dummy_context = ll_dummy_context, +#else + .get_dummy_context = ll_get_dummy_context, +#endif + .empty_dir = ll_empty_dir, + .max_namelen = NAME_MAX, +}; +#else /* !HAVE_LUSTRE_CRYPTO */ +int ll_set_encflags(struct inode *inode, void *encctx, __u32 encctxlen, + bool preload) +{ + return 0; +} + +int ll_file_open_encrypt(struct inode *inode, struct file *filp) +{ + return llcrypt_file_open(inode, filp); +} + +void llcrypt_free_ctx(void *encctx, __u32 size) +{ +} + +bool ll_sb_has_test_dummy_encryption(struct super_block *sb) +{ + return false; +} + +bool ll_sbi_has_encrypt(struct ll_sb_info *sbi) +{ + return false; +} + +void ll_sbi_set_encrypt(struct ll_sb_info *sbi, bool set) +{ +} + +bool ll_sbi_has_name_encrypt(struct ll_sb_info *sbi) +{ + return false; +} + +void ll_sbi_set_name_encrypt(struct ll_sb_info *sbi, bool set) +{ +} + +int ll_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct llcrypt_name *fname, + struct lu_fid *fid) +{ + if (fid) { + fid->f_seq = 0; + fid->f_oid = 0; + fid->f_ver = 0; + } + + return llcrypt_setup_filename(dir, iname, lookup, fname); +} + +int ll_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + struct llcrypt_str *iname, struct llcrypt_str *oname, + struct lu_fid *fid) +{ + return llcrypt_fname_disk_to_usr(inode, hash, minor_hash, iname, oname); +} + +int ll_revalidate_d_crypto(struct dentry *dentry, unsigned int flags) +{ + return 1; +} +#endif + diff --git a/drivers/staging/lustrefsx/lustre/llite/dcache.c b/drivers/staging/lustrefsx/lustre/llite/dcache.c new file mode 100644 index 0000000000000..b736bfc948ede --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/dcache.c @@ -0,0 +1,388 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include + +#include "llite_internal.h" + +static void free_dentry_data(struct rcu_head *head) +{ + struct ll_dentry_data *lld; + + lld = container_of(head, struct ll_dentry_data, lld_rcu_head); + OBD_FREE_PTR(lld); +} + +/* should NOT be called with the dcache lock, see fs/dcache.c */ +static void ll_release(struct dentry *de) +{ + struct ll_dentry_data *lld; + ENTRY; + LASSERT(de != NULL); + lld = ll_d2d(de); + if (lld == NULL) /* NFS copies the de->d_op methods (bug 4655) */ + RETURN_EXIT; + + de->d_fsdata = NULL; + call_rcu(&lld->lld_rcu_head, free_dentry_data); + + EXIT; +} + +/* Compare if two dentries are the same. Don't match if the existing dentry + * is marked invalid. Returns 1 if different, 0 if the same. + * + * This avoids a race where ll_lookup_it() instantiates a dentry, but we get + * an AST before calling d_revalidate_it(). The dentry still exists (marked + * INVALID) so d_lookup() matches it, but we have no lock on it (so + * lock_match() fails) and we spin around real_lookup(). + * + * This race doesn't apply to lookups in d_alloc_parallel(), and for + * those we want to ensure that only one dentry with a given name is + * in ll_lookup_nd() at a time. So allow invalid dentries to match + * while d_in_lookup(). We will be called again when the lookup + * completes, and can give a different answer then. + */ +#if defined(HAVE_D_COMPARE_5ARGS) +static int ll_dcompare(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, + const struct qstr *name) +#elif defined(HAVE_D_COMPARE_4ARGS) +static int ll_dcompare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) +#endif +{ + ENTRY; + + if (len != name->len) + RETURN(1); + + if (memcmp(str, name->name, len)) + RETURN(1); + + CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n", + name->len, name->name, dentry, dentry->d_flags, + ll_d_count(dentry)); + + /* mountpoint is always valid */ + if (d_mountpoint((struct dentry *)dentry)) + RETURN(0); + + /* ensure exclusion against parallel lookup of the same name */ + if (d_in_lookup((struct dentry *)dentry)) + return 0; + + if (d_lustre_invalid(dentry)) + RETURN(1); + + RETURN(0); +} + +/** + * Called when last reference to a dentry is dropped and dcache wants to know + * whether or not it should cache it: + * - return 1 to delete the dentry immediately + * - return 0 to cache the dentry + * Should NOT be called with the dcache lock, see fs/dcache.c + */ +static int ll_ddelete(const struct dentry *de) +{ + ENTRY; + LASSERT(de); + + CDEBUG(D_DENTRY, "%s dentry %pd (%p, parent %p, inode %p) %s%s\n", + d_lustre_invalid(de) ? "deleting" : "keeping", + de, de, de->d_parent, de->d_inode, + d_unhashed((struct dentry *)de) ? "" : "hashed,", + list_empty(&de->d_subdirs) ? "" : "subdirs"); + + /* kernel >= 2.6.38 last refcount is decreased after this function. 
*/ + LASSERT(ll_d_count(de) == 1); + + if (d_lustre_invalid(de)) + RETURN(1); + RETURN(0); +} + +#ifdef HAVE_D_INIT +static int ll_d_init(struct dentry *de) +{ + struct ll_dentry_data *lld; + + OBD_ALLOC_PTR(lld); + lld->lld_invalid = 1; + de->d_fsdata = lld; + return 0; +} +#else /* !HAVE_D_INIT */ + +bool ll_d_setup(struct dentry *de, bool do_put) +{ + struct ll_dentry_data *lld; + bool success = true; + + if (de->d_fsdata) + return success; + + OBD_ALLOC_PTR(lld); + if (likely(lld)) { + spin_lock(&de->d_lock); + /* Since the first d_fsdata test was not + * done under the spinlock it could have + * changed by time the memory is allocated. + */ + if (!de->d_fsdata) { + lld->lld_invalid = 1; + de->d_fsdata = lld; + } + spin_unlock(&de->d_lock); + /* See if we lost the race to set d_fsdata. */ + if (de->d_fsdata != lld) + OBD_FREE_PTR(lld); + } else { + success = false; + if (do_put) + dput(de); + } + + return success; +} +#endif /* !HAVE_D_INIT */ + +void ll_intent_drop_lock(struct lookup_intent *it) +{ + if (it->it_op && it->it_lock_mode) { + struct lustre_handle handle; + + handle.cookie = it->it_lock_handle; + + CDEBUG(D_DLMTRACE, "releasing lock with cookie %#llx from it %p\n", + handle.cookie, it); + ldlm_lock_decref(&handle, it->it_lock_mode); + + /* bug 494: intent_release may be called multiple times, from + * this thread and we don't want to double-decref this lock */ + it->it_lock_mode = 0; + if (it->it_remote_lock_mode != 0) { + handle.cookie = it->it_remote_lock_handle; + + CDEBUG(D_DLMTRACE, + "releasing remote lock with cookie %#llx from it %p\n", + handle.cookie, it); + ldlm_lock_decref(&handle, + it->it_remote_lock_mode); + it->it_remote_lock_mode = 0; + } + } +} + +void ll_intent_release(struct lookup_intent *it) +{ + ENTRY; + + CDEBUG(D_INFO, "intent %p released\n", it); + ll_intent_drop_lock(it); + /* We are still holding extra reference on a request, need to free it */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) + ptlrpc_req_finished(it->it_request); /* ll_file_open */ + + if (it_disposition(it, DISP_ENQ_CREATE_REF)) /* create rec */ + ptlrpc_req_finished(it->it_request); + + it->it_disposition = 0; + it->it_request = NULL; + EXIT; +} + +/* mark aliases invalid and prune unused aliases */ +void ll_prune_aliases(struct inode *inode) +{ + struct dentry *dentry; + ENTRY; + + LASSERT(inode != NULL); + + CDEBUG(D_INODE, "marking dentries for inode "DFID"(%p) invalid\n", + PFID(ll_inode2fid(inode)), inode); + + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) + d_lustre_invalidate(dentry); + spin_unlock(&inode->i_lock); + + d_prune_aliases(inode); + + EXIT; +} + +int ll_revalidate_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, + struct dentry *de) +{ + struct inode *inode = de->d_inode; + __u64 bits = 0; + int rc = 0; + + ENTRY; + + if (!request) + RETURN(0); + + if (it_disposition(it, DISP_LOOKUP_NEG)) + RETURN(-ENOENT); + + rc = ll_prep_inode(&inode, &request->rq_pill, NULL, it); + if (rc) + RETURN(rc); + + ll_set_lock_data(ll_i2sbi(inode)->ll_md_exp, inode, it, + &bits); + if (bits & MDS_INODELOCK_LOOKUP) { + ll_update_dir_depth(de->d_parent->d_inode, inode); + if (!ll_d_setup(de, true)) + RETURN(-ENOMEM); + d_lustre_revalidate(de); + } + + RETURN(rc); +} + +void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry) +{ + LASSERT(it != NULL); + LASSERT(dentry != NULL); + + if (it->it_lock_mode && dentry->d_inode != NULL) { + struct inode *inode = dentry->d_inode; + struct ll_sb_info *sbi = 
ll_i2sbi(inode); + + CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); + } + + /* drop lookup or getattr locks immediately */ + if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR) + ll_intent_drop_lock(it); +} + +static int ll_revalidate_dentry(struct dentry *dentry, + unsigned int lookup_flags) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct ll_dentry_data *lld = dentry->d_fsdata; + struct ll_sb_info *sbi; + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%s, flags=%u\n", + dentry->d_name.name, lookup_flags); + + rc = ll_revalidate_d_crypto(dentry, lookup_flags); + if (rc != 1) + return rc; + + /* If this is intermediate component path lookup and we were able to get + * to this dentry, then its lock has not been revoked and the + * path component is valid. */ + if (lookup_flags & (LOOKUP_CONTINUE | LOOKUP_PARENT)) + return 1; + + /* Symlink - always valid as long as the dentry was found */ + /* only special case is to prevent ELOOP error from VFS during open + * of a foreign symlink file/dir with O_NOFOLLOW, like it happens for + * real symlinks. This will allow to open foreign symlink file/dir + * for get[dir]stripe/unlock ioctl()s. + */ + if (d_is_symlink(dentry)) { + if (!S_ISLNK(dentry->d_inode->i_mode) && + !(lookup_flags & LOOKUP_FOLLOW)) + return 0; + else + return 1; + } + + /* + * VFS warns us that this is the second go around and previous + * operation failed (most likely open|creat), so this time + * we better talk to the server via the lookup path by name, + * not by fid. + */ + if (lookup_flags & LOOKUP_REVAL) + return 0; + + if (lookup_flags & LOOKUP_RCU) + return -ECHILD; + + /* + * To support metadata lazy load, we want to bypass negative lookup cache + * on the client. A negative dentry cache is a dentry node that does not + * have an inode associated with it. In these cases, return 0 here + * to force a lookup call to the server. + */ + sbi = ll_s2sbi(dentry->d_sb); + if (d_is_negative(dentry) && + sbi->ll_neg_dentry_timeout != OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS) { + LASSERT(lld != NULL); + if (!lld->lld_neg_cache_timeout) + lld->lld_neg_cache_timeout = jiffies + sbi->ll_neg_dentry_timeout * HZ; + + if (time_after(jiffies, lld->lld_neg_cache_timeout)) { + CDEBUG(D_VFSTRACE, + "negative dentry past timeout - flags: %u\n", lookup_flags); + return 0; + } + CDEBUG(D_VFSTRACE, + "negative dentry within timeout - flags: %u\n", lookup_flags); + } + + if (dentry_may_statahead(dir, dentry)) + ll_revalidate_statahead(dir, &dentry, dentry->d_inode == NULL); + + return 1; +} + +const struct dentry_operations ll_d_ops = { +#ifdef HAVE_D_INIT + .d_init = ll_d_init, +#endif + .d_revalidate = ll_revalidate_dentry, + .d_release = ll_release, + .d_delete = ll_ddelete, + .d_compare = ll_dcompare, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/dir.c b/drivers/staging/lustrefsx/lustre/llite/dir.c new file mode 100644 index 0000000000000..12125350ae7b0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/dir.c @@ -0,0 +1,2616 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/llite/dir.c + * + * Directory code for lustre client. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include // for wait_on_buffer +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llite_internal.h" + +static int ll_check_and_trigger_restore(struct inode *dir) +{ + struct ll_sb_info *sbi = ll_i2sbi(dir); + const int max_retry = atomic_read(&sbi->ll_dir_restore_max_retry_count); + int retry_count = 0; + u32 hus_states; + __u32 gen = 0; + int rc; + + /* Skip restore if server does not support or if disabled */ + if (!exp_mdll(sbi->ll_md_exp) || exp_bypass_mdll(sbi->ll_md_exp)) + return 0; + + /* + * TODO-MDLL: + * use API that does a cached read instead of + * going to the mdt for getting the hsm state. + * Tracked with Simba-21644 + */ +try_again: + rc = ll_get_hsm_state(dir, &hus_states); + if (rc == 0 && (hus_states & HS_RELEASED)) { + CDEBUG(D_HSM, + "MDLL Calling ll_layout_restore for dir "DFID" retry: %d" + "\n", PFID(ll_inode2fid(dir)), retry_count); + rc = ll_layout_restore(dir, 0, OBD_OBJECT_EOF); + if (rc) { + CERROR("MDLL ll_layout_restore ("DFID") error rc: %d\n", + PFID(ll_inode2fid(dir)), rc); + rc = -EAGAIN; + if (max_retry == 0) + goto out_exit; + } else { + CDEBUG(D_HSM, "MDLL Restore triggered for dir "DFID"\n", + PFID(ll_inode2fid(dir))); + ll_layout_refresh(dir, &gen); + CDEBUG(D_HSM, "MDLL Restore done for dir "DFID"\n", + PFID(ll_inode2fid(dir))); + } + /* If the max_retry is set to 0, then the behavior would be + * without a retry. There wont be any check for the hsm state + * after the completed restore. This case would be similar to + * the behaviour without this retry changes. The default + * value of the max_retry would be 1. + * A value of -1 would retry indefinitely. + */ + /* In case of an mdt restart, the ll_layout_refresh would + * return back only after the mdt has restarted and the + * existing network connection gets a reset. When the retry + * happens, the mdt would be up and running. + * Ideally the directory restore would be done with a single + * retry if the mdt does not crash/restart again. 
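+ *
+ * In short, as described above:
+ *   max_retry == 0   trigger the restore once, no re-check afterwards
+ *   max_retry == 1   (default) retry once, then re-check the HSM state
+ *   max_retry == -1  keep retrying until HS_RELEASED clears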
+ */ + if ((max_retry < 0) || + (max_retry >= 0 && retry_count < max_retry)) { + retry_count++; + goto try_again; + } else if (max_retry > 0 && retry_count >= max_retry) { + rc = ll_get_hsm_state(dir, &hus_states); + if (rc == 0 && (hus_states & HS_RELEASED)) { + CDEBUG(D_HSM, + "MDLL reached max retry %d for ("DFID")" + "hsm_state: %d\n", + retry_count, PFID(ll_inode2fid(dir)), + hus_states); + rc = -EAGAIN; + goto out_exit; + } + } + } + if (rc != 0) { + CDEBUG(D_HSM, + "MDLL error calling ll_get_hsm_state for dir "DFID" rc: " + "%d\n", PFID(ll_inode2fid(dir)), rc); + rc = -EAGAIN; + } + +out_exit: + return rc; +} + +/* + * (new) readdir implementation overview. + * + * Original lustre readdir implementation cached exact copy of raw directory + * pages on the client. These pages were indexed in client page cache by + * logical offset in the directory file. This design, while very simple and + * intuitive had some inherent problems: + * + * . it implies that byte offset to the directory entry serves as a + * telldir(3)/seekdir(3) cookie, but that offset is not stable: in + * ext3/htree directory entries may move due to splits, and more + * importantly, + * + * . it is incompatible with the design of split directories for cmd3, + * that assumes that names are distributed across nodes based on their + * hash, and so readdir should be done in hash order. + * + * New readdir implementation does readdir in hash order, and uses hash of a + * file name as a telldir/seekdir cookie. This led to number of complications: + * + * . hash is not unique, so it cannot be used to index cached directory + * pages on the client (note, that it requires a whole pageful of hash + * collided entries to cause two pages to have identical hashes); + * + * . hash is not unique, so it cannot, strictly speaking, be used as an + * entry cookie. ext3/htree has the same problem and lustre implementation + * mimics their solution: seekdir(hash) positions directory at the first + * entry with the given hash. + * + * Client side. + * + * 0. caching + * + * Client caches directory pages using hash of the first entry as an index. As + * noted above hash is not unique, so this solution doesn't work as is: + * special processing is needed for "page hash chains" (i.e., sequences of + * pages filled with entries all having the same hash value). + * + * First, such chains have to be detected. To this end, server returns to the + * client the hash of the first entry on the page next to one returned. When + * client detects that this hash is the same as hash of the first entry on the + * returned page, page hash collision has to be handled. Pages in the + * hash chain, except first one, are termed "overflow pages". + * + * Proposed (unimplimented) solution to index uniqueness problem is to + * not cache overflow pages. Instead, when page hash collision is + * detected, all overflow pages from emerging chain should be + * immediately requested from the server and placed in a special data + * structure. This data structure can be used by ll_readdir() to + * process entries from overflow pages. When readdir invocation + * finishes, overflow pages are discarded. If page hash collision chain + * weren't completely processed, next call to readdir will again detect + * page hash collision, again read overflow pages in, process next + * portion of entries and again discard the pages. This is not as + * wasteful as it looks, because, given reasonable hash, page hash + * collisions are extremely rare. + * + * 1. 
directory positioning + * + * When seekdir(hash) is called, original + * + * + * + * + * + * + * + * + * Server. + * + * identification of and access to overflow pages + * + * page format + * + * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains + * a header lu_dirpage which describes the start/end hash, and whether this + * page is empty (contains no dir entry) or hash collide with next page. + * After client receives reply, several pages will be integrated into dir page + * in PAGE_SIZE (if PAGE_SIZE greater than LU_PAGE_SIZE), and the + * lu_dirpage for this integrated page will be adjusted. See + * mdc_adjust_dirpages(). + * + */ +struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, + __u64 offset, int *partial_readdir_rc) +{ + struct md_readdir_info mrinfo = { + .mr_blocking_ast = ll_md_blocking_ast }; + struct page *page; + int rc; + + rc = ll_check_and_trigger_restore(dir); + if (rc != 0) + return ERR_PTR(rc); + + rc = md_read_page(ll_i2mdexp(dir), op_data, &mrinfo, offset, &page); + if (rc != 0) + return ERR_PTR(rc); + + if (partial_readdir_rc && mrinfo.mr_partial_readdir_rc) + *partial_readdir_rc = mrinfo.mr_partial_readdir_rc; + + return page; +} + +void ll_release_page(struct inode *inode, struct page *page, + bool remove) +{ + kunmap(page); + + /* Always remove the page for striped dir, because the page is + * built from temporarily in LMV layer */ + if (inode && ll_dir_striped(inode)) { + __free_page(page); + return; + } + + if (remove) { + lock_page(page); + if (likely(page->mapping != NULL)) + cfs_delete_from_page_cache(page); + unlock_page(page); + } + put_page(page); +} + +#ifdef HAVE_DIR_CONTEXT +int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data, + struct dir_context *ctx, int *partial_readdir_rc) +{ +#else +int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data, + void *cookie, filldir_t filldir, int *partial_readdir_rc) +{ +#endif + struct ll_sb_info *sbi = ll_i2sbi(inode); + __u64 pos = *ppos; + bool is_api32 = ll_need_32bit_api(sbi); + bool is_hash64 = test_bit(LL_SBI_64BIT_HASH, sbi->ll_flags); + struct page *page; + bool done = false; + struct llcrypt_str lltr = LLTR_INIT(NULL, 0); + int rc = 0; + ENTRY; + + if (IS_ENCRYPTED(inode)) { + rc = llcrypt_fname_alloc_buffer(inode, NAME_MAX, &lltr); + if (rc < 0) + RETURN(rc); + } + + page = ll_get_dir_page(inode, op_data, pos, partial_readdir_rc); + + while (rc == 0 && !done) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + __u64 hash; + __u64 next; + + if (IS_ERR(page)) { + rc = PTR_ERR(page); + break; + } + + hash = MDS_DIR_END_OFF; + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL && !done; + ent = lu_dirent_next(ent)) { + __u16 type; + int namelen; + struct lu_fid fid; + __u64 lhash; + __u64 ino; + + hash = le64_to_cpu(ent->lde_hash); + if (hash < pos) /* Skip until we find target hash */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (namelen == 0) /* Skip dummy record */ + continue; + + if (is_api32 && is_hash64) + lhash = hash >> 32; + else + lhash = hash; + fid_le_to_cpu(&fid, &ent->lde_fid); + ino = cl_fid_build_ino(&fid, is_api32); + type = S_DT(lu_dirent_type_get(ent)); + /* For ll_nfs_get_name_filldir(), it will try to access + * 'ent' through 'lde_name', so the parameter 'name' + * for 'filldir()' must be part of the 'ent'. 
*/ +#ifdef HAVE_DIR_CONTEXT + ctx->pos = lhash; + if (!IS_ENCRYPTED(inode)) { + done = !dir_emit(ctx, ent->lde_name, namelen, + ino, type); + } else { + /* Directory is encrypted */ + int save_len = lltr.len; + struct llcrypt_str de_name = + LLTR_INIT(ent->lde_name, namelen); + + rc = ll_fname_disk_to_usr(inode, 0, 0, &de_name, + &lltr, &fid); + de_name = lltr; + lltr.len = save_len; + if (rc) { + done = 1; + break; + } + done = !dir_emit(ctx, de_name.name, de_name.len, + ino, type); + } +#else + /* HAVE_DIR_CONTEXT is defined from kernel 3.11, whereas + * IS_ENCRYPTED is brought by kernel 4.14. + * So there is no need to handle encryption case here. + */ + done = filldir(cookie, ent->lde_name, namelen, lhash, + ino, type); +#endif + } + + if (done) { + pos = hash; + ll_release_page(inode, page, false); + break; + } + + next = le64_to_cpu(dp->ldp_hash_end); + pos = next; + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + done = 1; + ll_release_page(inode, page, false); + } else { + /* + * Normal case: continue to the next + * page. + */ + ll_release_page(inode, page, + le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + next = pos; + page = ll_get_dir_page(inode, op_data, pos, + partial_readdir_rc); + } + } +#ifdef HAVE_DIR_CONTEXT + ctx->pos = pos; +#else + *ppos = pos; +#endif + llcrypt_fname_free_buffer(&lltr); + RETURN(rc); +} + +#ifdef HAVE_DIR_CONTEXT +static int ll_iterate(struct file *filp, struct dir_context *ctx) +#else +static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) +#endif +{ + struct inode *inode = file_inode(filp); + struct ll_file_data *lfd = filp->private_data; + struct ll_sb_info *sbi = ll_i2sbi(inode); + bool hash64 = test_bit(LL_SBI_64BIT_HASH, sbi->ll_flags); + int api32 = ll_need_32bit_api(sbi); + struct md_op_data *op_data; + struct lu_fid pfid = { 0 }; + ktime_t kstart = ktime_get(); + /* result of possible partial readdir */ + int partial_readdir_rc = 0; + __u64 pos; + int rc; + + ENTRY; + + LASSERT(lfd != NULL); + pos = lfd->lfd_pos; + + CDEBUG(D_VFSTRACE, + "VFS Op:inode="DFID"(%p) pos/size%lu/%llu 32bit_api %d\n", + PFID(ll_inode2fid(inode)), + inode, (unsigned long)pos, i_size_read(inode), api32); + + if (IS_ENCRYPTED(inode)) { + rc = llcrypt_get_encryption_info(inode); + if (rc && rc != -ENOKEY) + GOTO(out, rc); + } + + if (pos == MDS_DIR_END_OFF) + /* + * end-of-file. + */ + GOTO(out, rc = 0); + + if (unlikely(ll_dir_striped(inode))) { + /* + * This is only needed for striped dir to fill .., + * see lmv_read_page() + */ + if (file_dentry(filp)->d_parent != NULL && + file_dentry(filp)->d_parent->d_inode != NULL) { + __u64 ibits = MDS_INODELOCK_LOOKUP; + struct inode *parent = + file_dentry(filp)->d_parent->d_inode; + + if (ll_have_md_lock(parent, &ibits, LCK_MINMODE)) + pfid = *ll_inode2fid(parent); + } + + /* If it can not find in cache, do lookup .. 
on the master + * object */ + if (fid_is_zero(&pfid)) { + rc = ll_dir_get_parent_fid(inode, &pfid); + if (rc != 0) + RETURN(rc); + } + } + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, + LUSTRE_OPC_ANY, inode); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + + /* foreign dirs are browsed out of Lustre */ + if (unlikely(op_data->op_mea1 != NULL && + op_data->op_mea1->lsm_md_magic == LMV_MAGIC_FOREIGN)) { + ll_finish_md_op_data(op_data); + RETURN(-ENODATA); + } + + op_data->op_fid3 = pfid; + +#ifdef HAVE_DIR_CONTEXT + ctx->pos = pos; + rc = ll_dir_read(inode, &pos, op_data, ctx, &partial_readdir_rc); + pos = ctx->pos; +#else + rc = ll_dir_read(inode, &pos, op_data, cookie, filldir, + &partial_readdir_rc); +#endif + lfd->lfd_pos = pos; + if (!lfd->fd_partial_readdir_rc) + lfd->fd_partial_readdir_rc = partial_readdir_rc; + + if (pos == MDS_DIR_END_OFF) { + if (api32) + pos = LL_DIR_END_OFF_32BIT; + else + pos = LL_DIR_END_OFF; + } else { + if (api32 && hash64) + pos = pos >> 32; + } +#ifdef HAVE_DIR_CONTEXT + ctx->pos = pos; +#else + filp->f_pos = pos; +#endif + ll_finish_md_op_data(op_data); + +out: + if (!rc) + ll_stats_ops_tally(sbi, LPROC_LL_READDIR, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(rc); +} + +/** + * Create striped directory with specified stripe(@lump) + * + * \param[in] dparent the parent of the directory. + * \param[in] lump the specified stripes. + * \param[in] dirname the name of the directory. + * \param[in] mode the specified mode of the directory. + * + * \retval =0 if striped directory is being created successfully. + * <0 if the creation is failed. + */ +static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, + size_t len, const char *dirname, umode_t mode, + bool createonly) +{ + struct inode *parent = dparent->d_inode; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct ll_sb_info *sbi = ll_i2sbi(parent); + struct inode *inode = NULL; + struct dentry dentry = { + .d_parent = dparent, + .d_name = { + .name = dirname, + .len = strlen(dirname), + .hash = ll_full_name_hash(dparent, dirname, + strlen(dirname)), + }, + .d_sb = dparent->d_sb, + }; + bool encrypt = false; + int hash_flags; + int err; + + ENTRY; + if (unlikely(!lmv_user_magic_supported(lump->lum_magic))) + RETURN(-EINVAL); + + if (lump->lum_magic != LMV_MAGIC_FOREIGN) { + CDEBUG(D_VFSTRACE, + "VFS Op:inode="DFID"(%p) name %s stripe_offset %d, stripe_count: %u\n", + PFID(ll_inode2fid(parent)), parent, dirname, + (int)lump->lum_stripe_offset, lump->lum_stripe_count); + } else { + struct lmv_foreign_md *lfm = (struct lmv_foreign_md *)lump; + + CDEBUG(D_VFSTRACE, + "VFS Op:inode="DFID"(%p) name %s foreign, length %u, value '%.*s'\n", + PFID(ll_inode2fid(parent)), parent, dirname, + lfm->lfm_length, lfm->lfm_length, lfm->lfm_value); + } + + if (lump->lum_stripe_count > 1 && + !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_DIR_STRIPE)) + RETURN(-EINVAL); + + if (IS_DEADDIR(parent) && + !OBD_FAIL_CHECK(OBD_FAIL_LLITE_NO_CHECK_DEAD)) + RETURN(-ENOENT); + + /* MDS < 2.14 doesn't support 'crush' hash type, and cannot handle + * unknown hash if client doesn't set a valid one. switch to fnv_1a_64. 
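+ *
+ * The update below keeps any hash flag bits and only replaces the type:
+ * XOR-ing lum_hash_type with its masked type clears the type bits, then
+ * LMV_HASH_TYPE_FNV_1A_64 is OR-ed back in.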
+ */ + if (!(exp_connect_flags2(sbi->ll_md_exp) & OBD_CONNECT2_CRUSH)) { + enum lmv_hash_type type = lump->lum_hash_type & + LMV_HASH_TYPE_MASK; + + if (type >= LMV_HASH_TYPE_CRUSH || + type == LMV_HASH_TYPE_UNKNOWN) + lump->lum_hash_type = (lump->lum_hash_type ^ type) | + LMV_HASH_TYPE_FNV_1A_64; + } + + hash_flags = lump->lum_hash_type & ~LMV_HASH_TYPE_MASK; + if (hash_flags & ~LMV_HASH_FLAG_KNOWN) + RETURN(-EINVAL); + + if (unlikely(!lmv_user_magic_supported(cpu_to_le32(lump->lum_magic)))) + lustre_swab_lmv_user_md(lump); + + if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) + mode &= ~current_umask(); + mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; + op_data = ll_prep_md_op_data(NULL, parent, NULL, dirname, + strlen(dirname), mode, LUSTRE_OPC_MKDIR, + lump); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_dir_depth = ll_i2info(parent)->lli_inherit_depth ?: + ll_i2info(parent)->lli_dir_depth; + + if (ll_sbi_has_encrypt(sbi) && + (IS_ENCRYPTED(parent) || + unlikely(ll_sb_has_test_dummy_encryption(parent->i_sb)))) { + err = llcrypt_get_encryption_info(parent); + if (err) + GOTO(out_op_data, err); + if (!llcrypt_has_encryption_key(parent)) + GOTO(out_op_data, err = -ENOKEY); + encrypt = true; + } + + if (test_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags)) { + /* selinux_dentry_init_security() uses dentry->d_parent and name + * to determine the security context for the file. So our fake + * dentry should be real enough for this purpose. */ + err = ll_dentry_init_security(&dentry, mode, &dentry.d_name, + &op_data->op_file_secctx_name, + &op_data->op_file_secctx_name_size, + &op_data->op_file_secctx, + &op_data->op_file_secctx_size, + &op_data->op_file_secctx_slot); + if (err < 0) + GOTO(out_op_data, err); + } + + if (encrypt) { + err = llcrypt_inherit_context(parent, NULL, op_data, false); + if (err) + GOTO(out_op_data, err); + } + + op_data->op_cli_flags |= CLI_SET_MEA; + if (createonly) + op_data->op_bias |= MDS_SETSTRIPE_CREATE; + + err = md_create(sbi->ll_md_exp, op_data, lump, len, mode, + from_kuid(&init_user_ns, current_fsuid()), + from_kgid(&init_user_ns, current_fsgid()), + current_cap(), 0, &request); + if (err) + GOTO(out_request, err); + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_SETDIRSTRIPE_PAUSE, cfs_fail_val); + + err = ll_prep_inode(&inode, &request->rq_pill, parent->i_sb, NULL); + if (err) + GOTO(out_inode, err); + + dentry.d_inode = inode; + + if (test_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags)) + err = ll_inode_notifysecctx(inode, op_data->op_file_secctx, + op_data->op_file_secctx_size); + else + err = ll_inode_init_security(&dentry, inode, parent); + + if (err) + GOTO(out_inode, err); + + if (encrypt) { + err = ll_set_encflags(inode, op_data->op_file_encctx, + op_data->op_file_encctx_size, false); + if (err) + GOTO(out_inode, err); + } + +out_inode: + iput(inode); +out_request: + ptlrpc_req_finished(request); +out_op_data: + ll_finish_md_op_data(op_data); + + return err; +} + +int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, + int set_default) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; + int lum_size; + ENTRY; + + if (lump != NULL) { + switch (lump->lmm_magic) { + case LOV_USER_MAGIC_V1: + lum_size = sizeof(struct lov_user_md_v1); + break; + case LOV_USER_MAGIC_V3: + lum_size = sizeof(struct lov_user_md_v3); + break; + case LOV_USER_MAGIC_COMP_V1: + lum_size = ((struct lov_comp_md_v1 *)lump)->lcm_size; + break; + case LMV_USER_MAGIC: { + struct 
lmv_user_md *lmv = (struct lmv_user_md *)lump; + + /* MDS < 2.14 doesn't support 'crush' hash type, and + * cannot handle unknown hash if client doesn't set a + * valid one. switch to fnv_1a_64. + */ + if (!(exp_connect_flags2(sbi->ll_md_exp) & + OBD_CONNECT2_CRUSH)) { + enum lmv_hash_type type = lmv->lum_hash_type & + LMV_HASH_TYPE_MASK; + + if (type >= LMV_HASH_TYPE_CRUSH || + type == LMV_HASH_TYPE_UNKNOWN) + lmv->lum_hash_type = + (lmv->lum_hash_type ^ type) | + LMV_HASH_TYPE_FNV_1A_64; + } + if (lmv->lum_magic != cpu_to_le32(LMV_USER_MAGIC)) + lustre_swab_lmv_user_md(lmv); + lum_size = sizeof(*lmv); + break; + } + case LOV_USER_MAGIC_SPECIFIC: { + struct lov_user_md_v3 *v3 = + (struct lov_user_md_v3 *)lump; + if (v3->lmm_stripe_count > LOV_MAX_STRIPE_COUNT) + RETURN(-EINVAL); + lum_size = lov_user_md_size(v3->lmm_stripe_count, + LOV_USER_MAGIC_SPECIFIC); + break; + } + default: + CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" + " %#08x != %#08x nor %#08x\n", + lump->lmm_magic, LOV_USER_MAGIC_V1, + LOV_USER_MAGIC_V3); + RETURN(-EINVAL); + } + + /* + * This is coming from userspace, so should be in + * local endian. But the MDS would like it in little + * endian, so we swab it before we send it. + */ + if ((__swab32(lump->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) == + le32_to_cpu(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md(lump, 0); + } else { + lum_size = sizeof(struct lov_user_md_v1); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + /* swabbing is done in lov_setstripe() on server side */ + rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, &req); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc) + RETURN(rc); + + RETURN(rc); +} + +int ll_dir_get_default_layout(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid, + enum get_default_layout_type type) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdt_body *body; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *req = NULL; + int lmm_size = OBD_MAX_DEFAULT_EA_SIZE; + struct md_op_data *op_data; + struct lu_fid fid; + int rc; + + ENTRY; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, lmm_size, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = valid | OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + + if (type == GET_DEFAULT_LAYOUT_ROOT) { + lu_root_fid(&op_data->op_fid1); + fid = op_data->op_fid1; + } else { + fid = *ll_inode2fid(inode); + } + + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr failed on inode "DFID": rc %d\n", + PFID(&fid), rc); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + lmm_size = body->mbo_eadatasize; + + if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + lmm_size == 0) { + GOTO(out, rc = -ENODATA); + } + + lmm = req_capsule_server_sized_get(&req->rq_pill, + &RMF_MDT_MD, lmm_size); + LASSERT(lmm != NULL); + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. 
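+ *
+ * The magic comparisons below (e.g. LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+ * are compile-time constant for a given endianness: they are false on
+ * little-endian clients, so the swab calls only run on big-endian ones.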
+ */ + /* We don't swab objects for directories */ + switch (le32_to_cpu(lmm->lmm_magic)) { + case LOV_MAGIC_V1: + case LOV_MAGIC_V3: + case LOV_MAGIC_COMP_V1: + case LOV_USER_MAGIC_SPECIFIC: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0); + break; + case LMV_MAGIC_V1: + if (LMV_MAGIC != cpu_to_le32(LMV_MAGIC)) + lustre_swab_lmv_mds_md((union lmv_mds_md *)lmm); + break; + case LMV_USER_MAGIC: + if (LMV_USER_MAGIC != cpu_to_le32(LMV_USER_MAGIC)) + lustre_swab_lmv_user_md((struct lmv_user_md *)lmm); + break; + case LMV_MAGIC_FOREIGN: { + struct lmv_foreign_md *lfm = (struct lmv_foreign_md *)lmm; + + if (LMV_MAGIC_FOREIGN != cpu_to_le32(LMV_MAGIC_FOREIGN)) { + __swab32s(&lfm->lfm_magic); + __swab32s(&lfm->lfm_length); + __swab32s(&lfm->lfm_type); + __swab32s(&lfm->lfm_flags); + } + break; + } + default: + CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic); + rc = -EPROTO; + } +out: + *plmm = lmm; + *plmm_size = lmm_size; + *request = req; + return rc; +} + +/** + * This function will be used to get default LOV/LMV/Default LMV + * @valid will be used to indicate which stripe it will retrieve. + * If the directory does not have its own default layout, then the + * function will request the default layout from root FID. + * OBD_MD_MEA LMV stripe EA + * OBD_MD_DEFAULT_MEA Default LMV stripe EA + * otherwise Default LOV EA. + * Each time, it can only retrieve 1 stripe EA + **/ +int ll_dir_getstripe_default(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, + struct ptlrpc_request **root_request, + u64 valid) +{ + struct ptlrpc_request *req = NULL; + struct ptlrpc_request *root_req = NULL; + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + int rc = 0; + ENTRY; + + rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size, + &req, valid, 0); + if (rc == -ENODATA && !fid_is_root(ll_inode2fid(inode)) && + !(valid & OBD_MD_MEA) && root_request != NULL) { + int rc2 = ll_dir_get_default_layout(inode, (void **)&lmm, + &lmm_size, &root_req, valid, + GET_DEFAULT_LAYOUT_ROOT); + if (rc2 == 0) + rc = 0; + } + + *plmm = lmm; + *plmm_size = lmm_size; + *request = req; + if (root_request != NULL) + *root_request = root_req; + + RETURN(rc); +} + +/** + * This function will be used to get default LOV/LMV/Default LMV + * @valid will be used to indicate which stripe it will retrieve + * OBD_MD_MEA LMV stripe EA + * OBD_MD_DEFAULT_MEA Default LMV stripe EA + * otherwise Default LOV EA. + * Each time, it can only retrieve 1 stripe EA + **/ +int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid) +{ + struct ptlrpc_request *req = NULL; + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + int rc = 0; + ENTRY; + + rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size, + &req, valid, 0); + + *plmm = lmm; + *plmm_size = lmm_size; + *request = req; + + RETURN(rc); +} + +int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid) +{ + struct md_op_data *op_data; + int rc; + int mdt_index; + ENTRY; + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + RETURN(-ENOMEM); + + op_data->op_flags |= MF_GET_MDT_IDX; + op_data->op_fid1 = *fid; + rc = md_getattr(sbi->ll_md_exp, op_data, NULL); + mdt_index = op_data->op_mds; + OBD_FREE_PTR(op_data); + if (rc < 0) + RETURN(rc); + + RETURN(mdt_index); +} + +/* + * Get MDT index for the inode. 
+ */ +int ll_get_mdt_idx(struct inode *inode) +{ + return ll_get_mdt_idx_by_fid(ll_i2sbi(inode), ll_inode2fid(inode)); +} + +/** + * Generic handler to do any pre-copy work. + * + * It sends a first hsm_progress (with extent length == 0) to coordinator as a + * first information for it that real work has started. + * + * Moreover, for a ARCHIVE request, it will sample the file data version and + * store it in \a copy. + * + * \return 0 on success. + */ +static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct hsm_progress_kernel hpk; + int rc = 0; + int rc2; + ENTRY; + + /* Forge a hsm_progress based on data from copy. */ + hpk.hpk_fid = copy->hc_hai.hai_fid; + hpk.hpk_cookie = copy->hc_hai.hai_cookie; + hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset; + hpk.hpk_extent.length = 0; + hpk.hpk_flags = 0; + hpk.hpk_errval = 0; + hpk.hpk_data_version = 0; + + + /* For archive request, we need to read the current file version. */ + if (copy->hc_hai.hai_action == HSMA_ARCHIVE) { + struct inode *inode; + __u64 data_version = 0; + + /* Get inode for this fid */ + inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); + if (IS_ERR(inode)) { + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval is >= 0 */ + hpk.hpk_errval = -PTR_ERR(inode); + GOTO(progress, rc = PTR_ERR(inode)); + } + + /* Read current file data version */ + rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH); + iput(inode); + if (rc != 0) { + CDEBUG(D_HSM, "Could not read file data version of " + DFID" (rc = %d). Archive request (" + "%#llx) could not be done.\n", + PFID(©->hc_hai.hai_fid), rc, + copy->hc_hai.hai_cookie); + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -rc; + GOTO(progress, rc); + } + + /* Store in the hsm_copy for later copytool use. + * Always modified even if no lsm. */ + copy->hc_data_version = data_version; + + } else if (copy->hc_hai.hai_action == HSMA_IMPORT) { + + /* IMPORT sends its progress using alloc fid when possible */ + hpk.hpk_fid = copy->hc_hai.hai_dfid; + } + +progress: + /* On error, the request should be considered as completed */ + if (hpk.hpk_errval > 0) + hpk.hpk_flags |= HP_FLAG_COMPLETED; + + rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), + &hpk, NULL); + + /* Return first error */ + RETURN(rc != 0 ? rc : rc2); +} + +/** + * Generic handler to do any post-copy work. + * + * It will send the last hsm_progress update to coordinator to inform it + * that copy is finished and whether it was successful or not. + * + * Moreover, + * - for ARCHIVE request, it will sample the file data version and compare it + * with the version saved in ll_ioc_copy_start(). If they do not match, copy + * will be considered as failed. + * - for RESTORE request, it will sample the file data version and send it to + * coordinator which is useful if the file was imported as 'released'. + * + * \return 0 on success. + */ +static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct hsm_progress_kernel hpk; + int rc = 0; + int rc2; + ENTRY; + + /* If you modify the logic here, also check llapi_hsm_copy_end(). */ + /* Take care: copy->hc_hai.hai_action, len, gid and data are not + * initialized if copy_end was called with copy == NULL. + */ + + /* Forge a hsm_progress based on data from copy. 
*/ + hpk.hpk_fid = copy->hc_hai.hai_fid; + hpk.hpk_cookie = copy->hc_hai.hai_cookie; + hpk.hpk_extent = copy->hc_hai.hai_extent; + hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED; + hpk.hpk_errval = copy->hc_errval; + hpk.hpk_data_version = 0; + + /* For archive request, we need to check the file data was not changed. + * + * For restore request, we need to send the file data version, this is + * useful when the file was created using hsm_import. + */ + if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) || + (copy->hc_hai.hai_action == HSMA_RESTORE)) && + (copy->hc_errval == 0)) { + struct inode *inode; + __u64 data_version = 0; + + /* Get lsm for this fid */ + inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); + if (IS_ERR(inode)) { + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -PTR_ERR(inode); + GOTO(progress, rc = PTR_ERR(inode)); + } + + rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH); + iput(inode); + if (rc) { + CDEBUG(D_HSM, "Could not read file data version. " + "Request could not be confirmed.\n"); + if (hpk.hpk_errval == 0) + hpk.hpk_errval = -rc; + GOTO(progress, rc); + } + + /* Store in the hsm_copy for later copytool use. + * Always modified even if no lsm. */ + hpk.hpk_data_version = data_version; + + /* File could have been stripped during archiving, so we need + * to check anyway. */ + if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) && + (copy->hc_data_version != data_version)) { + CDEBUG(D_HSM, "File data version mismatched. " + "File content was changed during archiving. " + DFID", start:%#llx current:%#llx\n", + PFID(©->hc_hai.hai_fid), + copy->hc_data_version, data_version); + /* File was changed, send error to cdt. Do not ask for + * retry because if a file is modified frequently, + * the cdt will loop on retried archive requests. + * The policy engine will ask for a new archive later + * when the file will not be modified for some tunable + * time */ + hpk.hpk_flags &= ~HP_FLAG_RETRY; + rc = -EBUSY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -rc; + GOTO(progress, rc); + } + + } else if (copy->hc_hai.hai_action == HSMA_IMPORT) { + + /* IMPORT sends its progress using alloc fid when possible */ + hpk.hpk_fid = copy->hc_hai.hai_dfid; + } + +progress: + rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), + &hpk, NULL); + + /* Return first error */ + RETURN(rc != 0 ? 
rc : rc2); +} + + +static int copy_and_ct_start(int cmd, struct obd_export *exp, + const struct lustre_kernelcomm __user *data) +{ + struct lustre_kernelcomm *lk; + struct lustre_kernelcomm *tmp; + size_t size = sizeof(*lk); + size_t new_size; + int i; + int rc; + + /* copy data from userspace to get numbers of archive_id */ + OBD_ALLOC(lk, size); + if (lk == NULL) + return -ENOMEM; + + if (copy_from_user(lk, data, size)) + GOTO(out_lk, rc = -EFAULT); + + if (lk->lk_flags & LK_FLG_STOP) + goto do_ioctl; + + if (!(lk->lk_flags & LK_FLG_DATANR)) { + __u32 archive_mask = lk->lk_data_count; + int count; + + /* old hsm agent to old MDS */ + if (!exp_connect_archive_id_array(exp)) + goto do_ioctl; + + /* old hsm agent to new MDS */ + lk->lk_flags |= LK_FLG_DATANR; + + if (archive_mask == 0) + goto do_ioctl; + + count = hweight32(archive_mask); + new_size = offsetof(struct lustre_kernelcomm, lk_data[count]); + OBD_ALLOC(tmp, new_size); + if (tmp == NULL) + GOTO(out_lk, rc = -ENOMEM); + + memcpy(tmp, lk, size); + tmp->lk_data_count = count; + OBD_FREE(lk, size); + lk = tmp; + size = new_size; + + count = 0; + for (i = 0; i < sizeof(archive_mask) * 8; i++) { + if (BIT(i) & archive_mask) { + lk->lk_data[count] = i + 1; + count++; + } + } + goto do_ioctl; + } + + /* new hsm agent to new mds */ + if (lk->lk_data_count > 0) { + new_size = offsetof(struct lustre_kernelcomm, + lk_data[lk->lk_data_count]); + OBD_ALLOC(tmp, new_size); + if (tmp == NULL) + GOTO(out_lk, rc = -ENOMEM); + + OBD_FREE(lk, size); + lk = tmp; + size = new_size; + + if (copy_from_user(lk, data, size)) + GOTO(out_lk, rc = -EFAULT); + } + + /* new hsm agent to old MDS */ + if (!exp_connect_archive_id_array(exp)) { + __u32 archives = 0; + + if (lk->lk_data_count > LL_HSM_ORIGIN_MAX_ARCHIVE) + GOTO(out_lk, rc = -EINVAL); + + for (i = 0; i < lk->lk_data_count; i++) { + if (lk->lk_data[i] > LL_HSM_ORIGIN_MAX_ARCHIVE) { + rc = -EINVAL; + CERROR("%s: archive id %d requested but only " + "[0 - %zu] supported: rc = %d\n", + exp->exp_obd->obd_name, lk->lk_data[i], + LL_HSM_ORIGIN_MAX_ARCHIVE, rc); + GOTO(out_lk, rc); + } + + if (lk->lk_data[i] == 0) { + archives = 0; + break; + } + + archives |= (1 << (lk->lk_data[i] - 1)); + } + lk->lk_flags &= ~LK_FLG_DATANR; + lk->lk_data_count = archives; + } +do_ioctl: + rc = obd_iocontrol(cmd, exp, size, lk, NULL); +out_lk: + OBD_FREE(lk, size); + return rc; +} + +static int check_owner(int type, int id) +{ + switch (type) { + case USRQUOTA: + if (!uid_eq(current_euid(), make_kuid(&init_user_ns, id))) + return -EPERM; + break; + case GRPQUOTA: + if (!in_egroup_p(make_kgid(&init_user_ns, id))) + return -EPERM; + break; + case PRJQUOTA: + break; + } + return 0; +} + +int quotactl_ioctl(struct super_block *sb, struct if_quotactl *qctl) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + int cmd = qctl->qc_cmd; + int type = qctl->qc_type; + int id = qctl->qc_id; + int valid = qctl->qc_valid; + int rc = 0; + + ENTRY; + + switch (cmd) { + case Q_SETQUOTA: + case Q_SETINFO: + case LUSTRE_Q_SETDEFAULT: + case LUSTRE_Q_SETQUOTAPOOL: + case LUSTRE_Q_SETINFOPOOL: + case LUSTRE_Q_SETDEFAULT_POOL: + case LUSTRE_Q_DELETEQID: + if (!capable(CAP_SYS_ADMIN)) + RETURN(-EPERM); + + if (sb->s_flags & SB_RDONLY) + RETURN(-EROFS); + break; + case Q_GETQUOTA: + case LUSTRE_Q_GETDEFAULT: + case LUSTRE_Q_GETQUOTAPOOL: + case LUSTRE_Q_GETDEFAULT_POOL: + if (check_owner(type, id) && + (!capable(CAP_SYS_ADMIN))) + RETURN(-EPERM); + break; + case Q_GETINFO: + case LUSTRE_Q_GETINFOPOOL: + break; + default: + CERROR("unsupported quotactl 
op: %#x\n", cmd); + RETURN(-ENOTSUPP); + } + + if (valid != QC_GENERAL) { + if (cmd == Q_GETINFO) + qctl->qc_cmd = Q_GETOINFO; + else if (cmd == Q_GETQUOTA || + cmd == LUSTRE_Q_GETQUOTAPOOL) + qctl->qc_cmd = Q_GETOQUOTA; + else + RETURN(-EINVAL); + + switch (valid) { + case QC_MDTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_OSTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_UUID: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + if (rc == -EAGAIN) + rc = obd_iocontrol(OBD_IOC_QUOTACTL, + sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + default: + rc = -EINVAL; + break; + } + + if (rc) + RETURN(rc); + + qctl->qc_cmd = cmd; + } else { + struct obd_quotactl *oqctl; + int oqctl_len = sizeof(*oqctl); + + if (LUSTRE_Q_CMD_IS_POOL(cmd)) + oqctl_len += LOV_MAXPOOLNAME + 1; + + OBD_ALLOC(oqctl, oqctl_len); + if (oqctl == NULL) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(sbi->ll_md_exp, oqctl); + if (rc) { + OBD_FREE(oqctl, oqctl_len); + RETURN(rc); + } + /* If QIF_SPACE is not set, client should collect the + * space usage from OSSs by itself */ + if ((cmd == Q_GETQUOTA || cmd == LUSTRE_Q_GETQUOTAPOOL) && + !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) && + !oqctl->qc_dqblk.dqb_curspace) { + struct obd_quotactl *oqctl_tmp; + int qctl_len = sizeof(*oqctl_tmp) + LOV_MAXPOOLNAME + 1; + + OBD_ALLOC(oqctl_tmp, qctl_len); + if (oqctl_tmp == NULL) + GOTO(out, rc = -ENOMEM); + + if (cmd == LUSTRE_Q_GETQUOTAPOOL) { + oqctl_tmp->qc_cmd = LUSTRE_Q_GETQUOTAPOOL; + memcpy(oqctl_tmp->qc_poolname, + qctl->qc_poolname, + LOV_MAXPOOLNAME + 1); + } else { + oqctl_tmp->qc_cmd = Q_GETOQUOTA; + } + oqctl_tmp->qc_id = oqctl->qc_id; + oqctl_tmp->qc_type = oqctl->qc_type; + + /* collect space usage from OSTs */ + oqctl_tmp->qc_dqblk.dqb_curspace = 0; + rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp); + if (!rc || rc == -EREMOTEIO) { + oqctl->qc_dqblk.dqb_curspace = + oqctl_tmp->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_valid |= QIF_SPACE; + } + + /* collect space & inode usage from MDTs */ + oqctl_tmp->qc_cmd = Q_GETOQUOTA; + oqctl_tmp->qc_dqblk.dqb_curspace = 0; + oqctl_tmp->qc_dqblk.dqb_curinodes = 0; + rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp); + if (!rc || rc == -EREMOTEIO) { + oqctl->qc_dqblk.dqb_curspace += + oqctl_tmp->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_curinodes = + oqctl_tmp->qc_dqblk.dqb_curinodes; + oqctl->qc_dqblk.dqb_valid |= QIF_INODES; + } else { + oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE; + } + + OBD_FREE(oqctl_tmp, qctl_len); + } +out: + QCTL_COPY(qctl, oqctl); + OBD_FREE(oqctl, oqctl_len); + } + + RETURN(rc); +} + +int ll_rmfid(struct file *file, void __user *arg) +{ + const struct fid_array __user *ufa = arg; + struct inode *inode = file_inode(file); + struct fid_array *lfa = NULL; + size_t size; + unsigned nr; + int i, rc, *rcs = NULL; + ENTRY; + + if (!capable(CAP_DAC_READ_SEARCH) && + !test_bit(LL_SBI_USER_FID2PATH, ll_i2sbi(inode)->ll_flags)) + RETURN(-EPERM); + /* Only need to get the buflen */ + if (get_user(nr, &ufa->fa_nr)) + RETURN(-EFAULT); + /* DoS protection */ + if (nr > OBD_MAX_FIDS_IN_ARRAY) + RETURN(-E2BIG); + + size = offsetof(struct fid_array, fa_fids[nr]); + OBD_ALLOC(lfa, size); + if (!lfa) + RETURN(-ENOMEM); + OBD_ALLOC_PTR_ARRAY(rcs, nr); + if (!rcs) + GOTO(free_lfa, rc = -ENOMEM); + + if (copy_from_user(lfa, arg, size)) + GOTO(free_rcs, rc = -EFAULT); + + /* Call mdc_iocontrol */ + rc 
= md_rmfid(ll_i2mdexp(file_inode(file)), lfa, rcs, NULL); + if (!rc) { + for (i = 0; i < nr; i++) + if (rcs[i]) + lfa->fa_fids[i].f_ver = rcs[i]; + if (copy_to_user(arg, lfa, size)) + rc = -EFAULT; + } + +free_rcs: + OBD_FREE_PTR_ARRAY(rcs, nr); +free_lfa: + OBD_FREE(lfa, size); + + RETURN(rc); +} + +/* This function tries to get a single name component, + * to send to the server. No actual path traversal involved, + * so we limit to NAME_MAX */ +static char *ll_getname(const char __user *filename) +{ + int ret = 0, len; + char *tmp; + + OBD_ALLOC(tmp, NAME_MAX + 1); + + if (!tmp) + return ERR_PTR(-ENOMEM); + + len = strncpy_from_user(tmp, filename, NAME_MAX + 1); + if (len < 0) + ret = -ENOENT; + else if (len > NAME_MAX) + ret = -ENAMETOOLONG; + + if (ret) { + OBD_FREE(tmp, NAME_MAX + 1); + tmp = ERR_PTR(ret); + } + return tmp; +} + +#define ll_putname(filename) OBD_FREE(filename, NAME_MAX + 1); + +static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct dentry *dentry = file_dentry(file); + struct inode *inode = file_inode(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_ioctl_data *data = NULL; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%#x\n", + PFID(ll_inode2fid(inode)), inode, cmd); + + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + return -ENOTTY; + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + switch (cmd) { + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION: + case FS_IOC_GETVERSION: + RETURN(put_user(inode->i_generation, (int __user *)arg)); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
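 + * Commands that are not special-cased below fall through to the default case at the end of this switch and are forwarded to obd_iocontrol() on the data export.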
*/ + case FS_IOC_SETVERSION: + RETURN(-ENOTSUPP); + + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + RETURN(mdtidx); + + if (put_user((int)mdtidx, (int __user *)arg)) + RETURN(-EFAULT); + + return 0; + } + case IOC_MDC_LOOKUP: { + int namelen, len = 0; + char *filename; + + rc = obd_ioctl_getdata(&data, &len, (void __user *)arg); + if (rc != 0) + RETURN(rc); + + filename = data->ioc_inlbuf1; + namelen = strlen(filename); + if (namelen < 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + GOTO(out_free, rc = -EINVAL); + } + + rc = ll_get_fid_by_name(inode, filename, namelen, NULL, NULL); + if (rc < 0) { + CERROR("%s: lookup %.*s failed: rc = %d\n", + sbi->ll_fsname, namelen, filename, rc); + GOTO(out_free, rc); + } +out_free: + OBD_FREE_LARGE(data, len); + return rc; + } + case LL_IOC_LMV_SETSTRIPE: { + struct lmv_user_md *lum; + char *filename; + int namelen = 0; + int lumlen = 0; + umode_t mode; + bool createonly = false; + int len; + int rc; + + rc = obd_ioctl_getdata(&data, &len, (void __user *)arg); + if (rc) + RETURN(rc); + + if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL || + data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) + GOTO(lmv_out_free, rc = -EINVAL); + + filename = data->ioc_inlbuf1; + namelen = data->ioc_inllen1; + + if (namelen < 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + GOTO(lmv_out_free, rc = -EINVAL); + } + lum = (struct lmv_user_md *)data->ioc_inlbuf2; + lumlen = data->ioc_inllen2; + + if (!lmv_user_magic_supported(lum->lum_magic)) { + CERROR("%s: wrong lum magic %x : rc = %d\n", filename, + lum->lum_magic, -EINVAL); + GOTO(lmv_out_free, rc = -EINVAL); + } + + if ((lum->lum_magic == LMV_USER_MAGIC || + lum->lum_magic == LMV_USER_MAGIC_SPECIFIC) && + lumlen < sizeof(*lum)) { + CERROR("%s: wrong lum size %d for magic %x : rc = %d\n", + filename, lumlen, lum->lum_magic, -EINVAL); + GOTO(lmv_out_free, rc = -EINVAL); + } + + if (lum->lum_magic == LMV_MAGIC_FOREIGN && + lumlen < sizeof(struct lmv_foreign_md)) { + CERROR("%s: wrong lum magic %x or size %d: rc = %d\n", + filename, lum->lum_magic, lumlen, -EFAULT); + GOTO(lmv_out_free, rc = -EINVAL); + } + + mode = data->ioc_type; + createonly = data->ioc_obdo1.o_flags & OBD_FL_OBDMDEXISTS; + rc = ll_dir_setdirstripe(dentry, lum, lumlen, filename, mode, + createonly); +lmv_out_free: + OBD_FREE_LARGE(data, len); + RETURN(rc); + + } + case LL_IOC_LMV_SET_DEFAULT_STRIPE: { + struct lmv_user_md lum; + struct lmv_user_md __user *ulump = + (struct lmv_user_md __user *)arg; + int rc; + + if (copy_from_user(&lum, ulump, sizeof(lum))) + RETURN(-EFAULT); + + if (lum.lum_magic != LMV_USER_MAGIC) + RETURN(-EINVAL); + + rc = ll_dir_setstripe(inode, (struct lov_user_md *)&lum, 0); + + RETURN(rc); + } + case LL_IOC_LOV_SETSTRIPE_NEW: + case LL_IOC_LOV_SETSTRIPE: { + struct lov_user_md_v3 *lumv3 = NULL; + struct lov_user_md_v1 lumv1; + struct lov_user_md_v1 *lumv1_ptr = &lumv1; + struct lov_user_md_v1 __user *lumv1p = + (struct lov_user_md_v1 __user *)arg; + struct lov_user_md_v3 __user *lumv3p = + (struct lov_user_md_v3 __user *)arg; + int lum_size = 0; + + int set_default = 0; + + BUILD_BUG_ON(sizeof(struct lov_user_md_v3) <= + sizeof(struct lov_comp_md_v1)); + BUILD_BUG_ON(sizeof(*lumv3) != sizeof(*lumv3p)); + /* first try with v1 which is smaller than v3 */ + if (copy_from_user(&lumv1, lumv1p, sizeof(lumv1))) + RETURN(-EFAULT); + + if (is_root_inode(inode)) + set_default = 1; + + switch (lumv1.lmm_magic) { + case LOV_USER_MAGIC_V3: + case 
LOV_USER_MAGIC_SPECIFIC: + lum_size = ll_lov_user_md_size(&lumv1); + if (lum_size < 0) + RETURN(lum_size); + OBD_ALLOC(lumv3, lum_size); + if (!lumv3) + RETURN(-ENOMEM); + if (copy_from_user(lumv3, lumv3p, lum_size)) + GOTO(out, rc = -EFAULT); + lumv1_ptr = (struct lov_user_md_v1 *)lumv3; + break; + case LOV_USER_MAGIC_V1: + break; + default: + GOTO(out, rc = -ENOTSUPP); + } + + /* in v1 and v3 cases lumv1 points to data */ + rc = ll_dir_setstripe(inode, lumv1_ptr, set_default); +out: + if (lumv3) + OBD_FREE(lumv3, lum_size); + RETURN(rc); + } + case LL_IOC_LMV_GETSTRIPE: { + struct lmv_user_md __user *ulmv = + (struct lmv_user_md __user *)arg; + struct lmv_user_md lum; + struct ptlrpc_request *request = NULL; + struct ptlrpc_request *root_request = NULL; + union lmv_mds_md *lmm = NULL; + int lmmsize; + u64 valid = 0; + struct lmv_user_md *tmp = NULL; + int mdt_index; + int lum_size; + int stripe_count; + int max_stripe_count; + int i; + int rc; + + if (copy_from_user(&lum, ulmv, sizeof(*ulmv))) + RETURN(-EFAULT); + + max_stripe_count = lum.lum_stripe_count; + /* lum_magic will indicate which stripe the ioctl will like + * to get, LMV_MAGIC_V1 is for normal LMV stripe, LMV_USER_MAGIC + * is for default LMV stripe */ + if (lum.lum_magic == LMV_MAGIC_V1) + valid |= OBD_MD_MEA; + else if (lum.lum_magic == LMV_USER_MAGIC) + valid |= OBD_MD_DEFAULT_MEA; + else + RETURN(-EINVAL); + + rc = ll_dir_getstripe_default(inode, (void **)&lmm, &lmmsize, + &request, &root_request, valid); + if (rc != 0) + GOTO(finish_req, rc); + + /* Get default LMV EA */ + if (lum.lum_magic == LMV_USER_MAGIC) { + struct lmv_user_md *lum; + struct ll_inode_info *lli; + + if (lmmsize > sizeof(*ulmv)) + GOTO(finish_req, rc = -EINVAL); + + lum = (struct lmv_user_md *)lmm; + if (lum->lum_max_inherit == LMV_INHERIT_NONE) + GOTO(finish_req, rc = -ENODATA); + + if (root_request != NULL) { + lli = ll_i2info(inode); + if (lum->lum_max_inherit != + LMV_INHERIT_UNLIMITED) { + if (lum->lum_max_inherit < + LMV_INHERIT_END || + lum->lum_max_inherit > + LMV_INHERIT_MAX || + lum->lum_max_inherit <= + lli->lli_dir_depth) + GOTO(finish_req, rc = -ENODATA); + + lum->lum_max_inherit -= + lli->lli_dir_depth; + } + + if (lum->lum_max_inherit_rr != + LMV_INHERIT_RR_UNLIMITED) { + if (lum->lum_max_inherit_rr == + LMV_INHERIT_NONE || + lum->lum_max_inherit_rr < + LMV_INHERIT_RR_END || + lum->lum_max_inherit_rr > + LMV_INHERIT_RR_MAX || + lum->lum_max_inherit_rr <= + lli->lli_dir_depth) { + lum->lum_max_inherit_rr = + LMV_INHERIT_RR_NONE; + goto out_copy; + } + + lum->lum_max_inherit_rr -= + lli->lli_dir_depth; + } + } +out_copy: + if (copy_to_user(ulmv, lmm, lmmsize)) + GOTO(finish_req, rc = -EFAULT); + + GOTO(finish_req, rc); + } + + /* if foreign LMV case, fake stripes number */ + if (lmm->lmv_magic == LMV_MAGIC_FOREIGN) { + struct lmv_foreign_md *lfm; + + lfm = (struct lmv_foreign_md *)lmm; + if (lfm->lfm_length < XATTR_SIZE_MAX - + offsetof(typeof(*lfm), lfm_value)) { + __u32 size = lfm->lfm_length + + offsetof(typeof(*lfm), lfm_value); + + stripe_count = lmv_foreign_to_md_stripes(size); + } else { + CERROR("invalid %d foreign size returned\n", + lfm->lfm_length); + return -EINVAL; + } + } else { + stripe_count = lmv_mds_md_stripe_count_get(lmm); + } + if (max_stripe_count < stripe_count) { + lum.lum_stripe_count = stripe_count; + if (copy_to_user(ulmv, &lum, sizeof(lum))) + GOTO(finish_req, rc = -EFAULT); + GOTO(finish_req, rc = -E2BIG); + } + + /* enough room on user side and foreign case */ + if (lmm->lmv_magic == LMV_MAGIC_FOREIGN) { + 
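 + /* Foreign layout: return the raw lmv_foreign_md to userspace; its size is the variable lfm_length payload plus the fixed header up to lfm_value. */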
struct lmv_foreign_md *lfm; + __u32 size; + + lfm = (struct lmv_foreign_md *)lmm; + size = lfm->lfm_length + + offsetof(struct lmv_foreign_md, lfm_value); + if (copy_to_user(ulmv, lfm, size)) + GOTO(finish_req, rc = -EFAULT); + GOTO(finish_req, rc); + } + + lum_size = lmv_user_md_size(stripe_count, + LMV_USER_MAGIC_SPECIFIC); + OBD_ALLOC(tmp, lum_size); + if (tmp == NULL) + GOTO(finish_req, rc = -ENOMEM); + + mdt_index = ll_get_mdt_idx(inode); + if (mdt_index < 0) + GOTO(out_tmp, rc = -ENOMEM); + + tmp->lum_magic = LMV_MAGIC_V1; + tmp->lum_stripe_count = 0; + tmp->lum_stripe_offset = mdt_index; + tmp->lum_hash_type = lmv_mds_md_hash_type_get(lmm); + for (i = 0; i < stripe_count; i++) { + struct lu_fid fid; + + fid_le_to_cpu(&fid, &lmm->lmv_md_v1.lmv_stripe_fids[i]); + if (fid_is_sane(&fid)) { + mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid); + if (mdt_index < 0) + GOTO(out_tmp, rc = mdt_index); + + tmp->lum_objects[i].lum_mds = mdt_index; + tmp->lum_objects[i].lum_fid = fid; + } + + tmp->lum_stripe_count++; + } + + if (copy_to_user(ulmv, tmp, lum_size)) + GOTO(out_tmp, rc = -EFAULT); +out_tmp: + OBD_FREE(tmp, lum_size); +finish_req: + ptlrpc_req_finished(request); + ptlrpc_req_finished(root_request); + return rc; + } + + case LL_IOC_UNLOCK_FOREIGN: + /* if not a foreign symlink do nothing */ + if (ll_foreign_is_removable(dentry, true)) { + CDEBUG(D_INFO, + "prevent rmdir of non-foreign dir ("DFID")\n", + PFID(ll_inode2fid(inode))); + RETURN(-EOPNOTSUPP); + } + RETURN(0); + + case LL_IOC_REMOVE_ENTRY: { + char *filename = NULL; + int namelen = 0; + int rc; + + /* Here is a little hack to avoid sending REINT_RMENTRY to + * unsupported server, which might crash the server(LU-2730), + * Because both LVB_TYPE and REINT_RMENTRY will be supported + * on 2.4, we use OBD_CONNECT_LVB_TYPE to detect whether the + * server will support REINT_RMENTRY XXX*/ + if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_LVB_TYPE)) + RETURN(-ENOTSUPP); + + filename = ll_getname((const char __user *)arg); + if (IS_ERR(filename)) + RETURN(PTR_ERR(filename)); + + namelen = strlen(filename); + if (namelen < 1) + GOTO(out_rmdir, rc = -EINVAL); + + rc = ll_rmdir_entry(inode, filename, namelen); +out_rmdir: + if (filename) + ll_putname(filename); + RETURN(rc); + } + case LL_IOC_RMFID: + RETURN(ll_rmfid(file, (void __user *)arg)); + case LL_IOC_LOV_SWAP_LAYOUTS: + RETURN(-EPERM); + case IOC_OBD_STATFS: + RETURN(ll_obd_statfs(inode, (void __user *)arg)); + case LL_IOC_LOV_GETSTRIPE: + case LL_IOC_LOV_GETSTRIPE_NEW: + case LL_IOC_MDC_GETINFO_V1: + case LL_IOC_MDC_GETINFO_V2: + case IOC_MDC_GETFILEINFO_V1: + case IOC_MDC_GETFILEINFO_V2: + case IOC_MDC_GETFILESTRIPE: { + struct ptlrpc_request *request = NULL; + struct ptlrpc_request *root_request = NULL; + struct lov_user_md __user *lump; + struct lov_mds_md *lmm = NULL; + struct mdt_body *body; + char *filename = NULL; + lstat_t __user *statp = NULL; + lstatx_t __user *stxp = NULL; + __u64 __user *flagsp = NULL; + __u32 __user *lmmsizep = NULL; + struct lu_fid __user *fidp = NULL; + int lmmsize; + bool api32; + + if (cmd == IOC_MDC_GETFILEINFO_V1 || + cmd == IOC_MDC_GETFILEINFO_V2 || + cmd == IOC_MDC_GETFILESTRIPE) { + filename = ll_getname((const char __user *)arg); + if (IS_ERR(filename)) + RETURN(PTR_ERR(filename)); + + rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, + &lmmsize, &request); + } else { + rc = ll_dir_getstripe_default(inode, (void **)&lmm, + &lmmsize, &request, + &root_request, 0); + } + + if (request) { + body = 
req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + LASSERT(body != NULL); + } else { + GOTO(out_req, rc); + } + + if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO_V1 || + cmd == LL_IOC_MDC_GETINFO_V1 || + cmd == IOC_MDC_GETFILEINFO_V2 || + cmd == LL_IOC_MDC_GETINFO_V2)) { + lmmsize = 0; + rc = 0; + } + + if (rc < 0) + GOTO(out_req, rc); + + if (cmd == IOC_MDC_GETFILESTRIPE || + cmd == LL_IOC_LOV_GETSTRIPE || + cmd == LL_IOC_LOV_GETSTRIPE_NEW) { + lump = (struct lov_user_md __user *)arg; + } else if (cmd == IOC_MDC_GETFILEINFO_V1 || + cmd == LL_IOC_MDC_GETINFO_V1){ + struct lov_user_mds_data_v1 __user *lmdp; + + lmdp = (struct lov_user_mds_data_v1 __user *)arg; + statp = &lmdp->lmd_st; + lump = &lmdp->lmd_lmm; + } else { + struct lov_user_mds_data __user *lmdp; + + lmdp = (struct lov_user_mds_data __user *)arg; + fidp = &lmdp->lmd_fid; + stxp = &lmdp->lmd_stx; + flagsp = &lmdp->lmd_flags; + lmmsizep = &lmdp->lmd_lmmsize; + lump = &lmdp->lmd_lmm; + } + + if (lmmsize == 0) { + /* If the file has no striping then zero out *lump so + * that the caller isn't confused by garbage. */ + if (clear_user(lump, sizeof(*lump))) + GOTO(out_req, rc = -EFAULT); + } else if (copy_to_user(lump, lmm, lmmsize)) { + if (copy_to_user(lump, lmm, sizeof(*lump))) + GOTO(out_req, rc = -EFAULT); + rc = -EOVERFLOW; + } + api32 = test_bit(LL_SBI_32BIT_API, sbi->ll_flags); + + if (cmd == IOC_MDC_GETFILEINFO_V1 || + cmd == LL_IOC_MDC_GETINFO_V1) { + lstat_t st = { 0 }; + + st.st_dev = inode->i_sb->s_dev; + st.st_mode = body->mbo_mode; + st.st_nlink = body->mbo_nlink; + st.st_uid = body->mbo_uid; + st.st_gid = body->mbo_gid; + st.st_rdev = body->mbo_rdev; + if (llcrypt_require_key(inode) == -ENOKEY) + st.st_size = round_up(st.st_size, + LUSTRE_ENCRYPTION_UNIT_SIZE); + else + st.st_size = body->mbo_size; + st.st_blksize = PAGE_SIZE; + st.st_blocks = body->mbo_blocks; + st.st_atime = body->mbo_atime; + st.st_mtime = body->mbo_mtime; + st.st_ctime = body->mbo_ctime; + st.st_ino = cl_fid_build_ino(&body->mbo_fid1, + api32); + + if (copy_to_user(statp, &st, sizeof(st))) + GOTO(out_req, rc = -EFAULT); + } else if (cmd == IOC_MDC_GETFILEINFO_V2 || + cmd == LL_IOC_MDC_GETINFO_V2) { + lstatx_t stx = { 0 }; + __u64 valid = body->mbo_valid; + + stx.stx_blksize = PAGE_SIZE; + stx.stx_nlink = body->mbo_nlink; + stx.stx_uid = body->mbo_uid; + stx.stx_gid = body->mbo_gid; + stx.stx_mode = body->mbo_mode; + stx.stx_ino = cl_fid_build_ino(&body->mbo_fid1, + api32); + if (llcrypt_require_key(inode) == -ENOKEY) + stx.stx_size = round_up(stx.stx_size, + LUSTRE_ENCRYPTION_UNIT_SIZE); + else + stx.stx_size = body->mbo_size; + stx.stx_blocks = body->mbo_blocks; + stx.stx_atime.tv_sec = body->mbo_atime; + stx.stx_ctime.tv_sec = body->mbo_ctime; + stx.stx_mtime.tv_sec = body->mbo_mtime; + stx.stx_btime.tv_sec = body->mbo_btime; + stx.stx_rdev_major = MAJOR(body->mbo_rdev); + stx.stx_rdev_minor = MINOR(body->mbo_rdev); + stx.stx_dev_major = MAJOR(inode->i_sb->s_dev); + stx.stx_dev_minor = MINOR(inode->i_sb->s_dev); + stx.stx_mask |= STATX_BASIC_STATS | STATX_BTIME; + + /* + * For a striped directory, the size and blocks returned + * from MDT are not correct. + * The size and blocks are aggregated by the client across + * all stripes. + * Thus for a striped directory, do not return the valid + * FLSIZE and FLBLOCKS flags to the caller. + * However, this would be better decided by the MDS + * instead of the client. 
+ */ + if (cmd == LL_IOC_MDC_GETINFO_V2 && + ll_i2info(inode)->lli_lsm_md != NULL) + valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); + + if (flagsp && copy_to_user(flagsp, &valid, + sizeof(*flagsp))) + GOTO(out_req, rc = -EFAULT); + + if (fidp && copy_to_user(fidp, &body->mbo_fid1, + sizeof(*fidp))) + GOTO(out_req, rc = -EFAULT); + + if (!(valid & OBD_MD_FLSIZE)) + stx.stx_mask &= ~STATX_SIZE; + if (!(valid & OBD_MD_FLBLOCKS)) + stx.stx_mask &= ~STATX_BLOCKS; + + if (stxp && copy_to_user(stxp, &stx, sizeof(stx))) + GOTO(out_req, rc = -EFAULT); + + if (lmmsizep && copy_to_user(lmmsizep, &lmmsize, + sizeof(*lmmsizep))) + GOTO(out_req, rc = -EFAULT); + } + + EXIT; +out_req: + ptlrpc_req_finished(request); + ptlrpc_req_finished(root_request); + if (filename) + ll_putname(filename); + return rc; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl; + int qctl_len = sizeof(*qctl) + LOV_MAXPOOLNAME + 1; + + OBD_ALLOC(qctl, qctl_len); + if (!qctl) + RETURN(-ENOMEM); + + if (copy_from_user(qctl, (void __user *)arg, sizeof(*qctl))) + GOTO(out_quotactl, rc = -EFAULT); + + if (LUSTRE_Q_CMD_IS_POOL(qctl->qc_cmd)) { + char __user *from = (char __user *)arg + + offsetof(typeof(*qctl), qc_poolname); + if (copy_from_user(qctl->qc_poolname, from, + LOV_MAXPOOLNAME + 1)) + GOTO(out_quotactl, rc = -EFAULT); + } + + rc = quotactl_ioctl(inode->i_sb, qctl); + if (rc == 0 && + copy_to_user((void __user *)arg, qctl, sizeof(*qctl))) + rc = -EFAULT; + +out_quotactl: + OBD_FREE(qctl, qctl_len); + RETURN(rc); + } + case OBD_IOC_GETNAME_OLD: + case OBD_IOC_GETDTNAME: + case OBD_IOC_GETMDNAME: + RETURN(ll_get_obd_name(inode, cmd, arg)); + case LL_IOC_HSM_STATE_GET: { + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + OBD_ALLOC_PTR(hus); + if (hus == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hus); + RETURN(PTR_ERR(op_data)); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((void __user *)arg, hus, sizeof(*hus))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hus); + RETURN(rc); + } + case LL_IOC_HSM_STATE_SET: { + struct hsm_state_set *hss; + int rc; + + OBD_ALLOC_PTR(hss); + if (hss == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) { + OBD_FREE_PTR(hss); + RETURN(-EFAULT); + } + + rc = ll_hsm_state_set(inode, hss); + + OBD_FREE_PTR(hss); + RETURN(rc); + } + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data; + struct hsm_current_action *hca; + int rc; + + OBD_ALLOC_PTR(hca); + if (hca == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hca); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hca); + RETURN(PTR_ERR(op_data)); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((char __user *)arg, hca, sizeof(*hca))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hca); + RETURN(rc); + } + case LL_IOC_FLUSHCTX: + RETURN(ll_flush_ctx(inode)); + case LL_IOC_GETOBDCOUNT: { + u32 count, vallen; + struct obd_export *exp; + + if (copy_from_user(&count, (int __user *)arg, sizeof(int))) + RETURN(-EFAULT); + + /* get ost count when count is zero, get mdt count otherwise */ + exp = count ? 
sbi->ll_md_exp : sbi->ll_dt_exp; + vallen = sizeof(count); + rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT), + KEY_TGT_COUNT, &vallen, &count); + if (rc) { + CERROR("get target count failed: %d\n", rc); + RETURN(rc); + } + + if (copy_to_user((int __user *)arg, &count, sizeof(int))) + RETURN(-EFAULT); + + RETURN(0); + } + case LL_IOC_PATH2FID: + if (copy_to_user((void __user *)arg, ll_inode2fid(inode), + sizeof(struct lu_fid))) + RETURN(-EFAULT); + RETURN(0); + case LL_IOC_GET_CONNECT_FLAGS: { + RETURN(obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, + (void __user *)arg)); + } + case OBD_IOC_FID2PATH: + RETURN(ll_fid2path(inode, (void __user *)arg)); + case LL_IOC_GETPARENT: + RETURN(ll_getparent(file, (void __user *)arg)); + case LL_IOC_FID2MDTIDX: { + struct obd_export *exp = ll_i2mdexp(inode); + struct lu_fid fid; + __u32 index; + + if (copy_from_user(&fid, (const struct lu_fid __user *)arg, + sizeof(fid))) + RETURN(-EFAULT); + + /* Call mdc_iocontrol */ + rc = obd_iocontrol(LL_IOC_FID2MDTIDX, exp, sizeof(fid), &fid, + (__u32 __user *)&index); + if (rc != 0) + RETURN(rc); + + RETURN(index); + } + case LL_IOC_HSM_REQUEST: { + struct hsm_user_request *hur; + ssize_t totalsize; + + OBD_ALLOC_PTR(hur); + if (hur == NULL) + RETURN(-ENOMEM); + + /* We don't know the true size yet; copy the fixed-size part */ + if (copy_from_user(hur, (void __user *)arg, sizeof(*hur))) { + OBD_FREE_PTR(hur); + RETURN(-EFAULT); + } + + /* Compute the whole struct size */ + totalsize = hur_len(hur); + OBD_FREE_PTR(hur); + if (totalsize < 0) + RETURN(-E2BIG); + + /* Final size will be more than double totalsize */ + if (totalsize >= MDS_MAXREQSIZE / 3) + RETURN(-E2BIG); + + OBD_ALLOC_LARGE(hur, totalsize); + if (hur == NULL) + RETURN(-ENOMEM); + + /* Copy the whole struct */ + if (copy_from_user(hur, (void __user *)arg, totalsize)) + GOTO(out_hur, rc = -EFAULT); + + if (hur->hur_request.hr_action == HUA_RELEASE) { + const struct lu_fid *fid; + struct inode *f; + int i; + + for (i = 0; i < hur->hur_request.hr_itemcount; i++) { + fid = &hur->hur_user_item[i].hui_fid; + f = search_inode_for_lustre(inode->i_sb, fid); + if (IS_ERR(f)) { + rc = PTR_ERR(f); + break; + } + + rc = ll_hsm_release(f); + iput(f); + if (rc != 0) + break; + } + } else { + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize, + hur, NULL); + } + +out_hur: + OBD_FREE_LARGE(hur, totalsize); + + RETURN(rc); + } + case LL_IOC_HSM_PROGRESS: { + struct hsm_progress_kernel hpk; + struct hsm_progress hp; + + if (copy_from_user(&hp, (void __user *)arg, sizeof(hp))) + RETURN(-EFAULT); + + hpk.hpk_fid = hp.hp_fid; + hpk.hpk_cookie = hp.hp_cookie; + hpk.hpk_extent = hp.hp_extent; + hpk.hpk_flags = hp.hp_flags; + hpk.hpk_errval = hp.hp_errval; + hpk.hpk_data_version = 0; + + /* File may not exist in Lustre; all progress + * reported to Lustre root */ + rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk, + NULL); + RETURN(rc); + } + case LL_IOC_HSM_CT_START: + if (!capable(CAP_SYS_ADMIN)) + RETURN(-EPERM); + + rc = copy_and_ct_start(cmd, sbi->ll_md_exp, + (struct lustre_kernelcomm __user *)arg); + RETURN(rc); + + case LL_IOC_HSM_COPY_START: { + struct hsm_copy *copy; + int rc; + + OBD_ALLOC_PTR(copy); + if (copy == NULL) + RETURN(-ENOMEM); + if (copy_from_user(copy, (char __user *)arg, sizeof(*copy))) { + OBD_FREE_PTR(copy); + RETURN(-EFAULT); + } + + rc = ll_ioc_copy_start(inode->i_sb, copy); + if (copy_to_user((char __user *)arg, copy, sizeof(*copy))) + rc = -EFAULT; + + OBD_FREE_PTR(copy); + RETURN(rc); + } + case LL_IOC_HSM_IMPORT: { + struct 
hsm_user_import *hui; + + OBD_ALLOC_PTR(hui); + if (hui == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) { + OBD_FREE_PTR(hui); + RETURN(-EFAULT); + } + + rc = ll_hsm_import(inode, file, hui); + + CDEBUG(D_HSM, "MDLL hsm_state import: %d\n", rc); + OBD_FREE_PTR(hui); + RETURN(rc); + } + case LL_IOC_HSM_COPY_END: { + struct hsm_copy *copy; + int rc; + + OBD_ALLOC_PTR(copy); + if (copy == NULL) + RETURN(-ENOMEM); + if (copy_from_user(copy, (char __user *)arg, sizeof(*copy))) { + OBD_FREE_PTR(copy); + RETURN(-EFAULT); + } + + rc = ll_ioc_copy_end(inode->i_sb, copy); + CDEBUG(D_HSM, "MDLL hsm_copy_end: %d\n", rc); + + if (copy_to_user((char __user *)arg, copy, sizeof(*copy))) + rc = -EFAULT; + + OBD_FREE_PTR(copy); + RETURN(rc); + } + case LL_IOC_MIGRATE: { + struct lmv_user_md *lum; + int len; + char *filename; + int namelen = 0; + __u32 flags; + int rc; + + rc = obd_ioctl_getdata(&data, &len, (void __user *)arg); + if (rc) + RETURN(rc); + + if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL || + data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) + GOTO(migrate_free, rc = -EINVAL); + + filename = data->ioc_inlbuf1; + namelen = data->ioc_inllen1; + flags = data->ioc_type; + + if (namelen < 1 || namelen != strlen(filename) + 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + GOTO(migrate_free, rc = -EINVAL); + } + + lum = (struct lmv_user_md *)data->ioc_inlbuf2; + if (lum->lum_magic != LMV_USER_MAGIC && + lum->lum_magic != LMV_USER_MAGIC_SPECIFIC) { + rc = -EINVAL; + CERROR("%s: wrong lum magic %x: rc = %d\n", + filename, lum->lum_magic, rc); + GOTO(migrate_free, rc); + } + + rc = ll_migrate(inode, file, lum, filename, flags); +migrate_free: + OBD_FREE_LARGE(data, len); + + RETURN(rc); + } + case FS_IOC_FSGETXATTR: + RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg)); + case FS_IOC_FSSETXATTR: + RETURN(ll_ioctl_fssetxattr(inode, cmd, arg)); + case LL_IOC_PROJECT: + RETURN(ll_ioctl_project(file, cmd, arg)); + case LL_IOC_PCC_DETACH_BY_FID: { + struct lu_pcc_detach_fid *detach; + struct lu_fid *fid; + struct inode *inode2; + unsigned long ino; + + OBD_ALLOC_PTR(detach); + if (detach == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(detach, + (const struct lu_pcc_detach_fid __user *)arg, + sizeof(*detach))) + GOTO(out_detach, rc = -EFAULT); + + fid = &detach->pccd_fid; + ino = cl_fid_build_ino(fid, ll_need_32bit_api(sbi)); + inode2 = ilookup5(inode->i_sb, ino, ll_test_inode_by_fid, fid); + if (inode2 == NULL) + /* Target inode is not in inode cache, and PCC file + * has already been released, return immediately. 
+ */ + GOTO(out_detach, rc = 0); + + if (!S_ISREG(inode2->i_mode)) + GOTO(out_iput, rc = -EINVAL); + + if (!inode_owner_or_capable(&init_user_ns, inode2)) + GOTO(out_iput, rc = -EPERM); + + rc = pcc_ioctl_detach(inode2, detach->pccd_opt); +out_iput: + iput(inode2); +out_detach: + OBD_FREE_PTR(detach); + RETURN(rc); + } +#ifdef HAVE_LUSTRE_CRYPTO + case LL_IOC_SET_ENCRYPTION_POLICY: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_set_policy(file, (const void __user *)arg); + case LL_IOC_GET_ENCRYPTION_POLICY_EX: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_get_policy_ex(file, (void __user *)arg); + case LL_IOC_ADD_ENCRYPTION_KEY: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + rc = llcrypt_ioctl_add_key(file, (void __user *)arg); +#ifdef CONFIG_LL_ENCRYPTION + if (!rc) + sptlrpc_enc_pool_add_user(); +#endif + return rc; + case LL_IOC_REMOVE_ENCRYPTION_KEY: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + rc = llcrypt_ioctl_remove_key(file, (void __user *)arg); +#ifdef CONFIG_LL_ENCRYPTION + if (!rc) + sptlrpc_enc_pool_del_user(); +#endif + return rc; + case LL_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + rc = llcrypt_ioctl_remove_key_all_users(file, + (void __user *)arg); +#ifdef CONFIG_LL_ENCRYPTION + if (!rc) + sptlrpc_enc_pool_del_user(); +#endif + return rc; + case LL_IOC_GET_ENCRYPTION_KEY_STATUS: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_get_key_status(file, (void __user *)arg); +#endif + default: + RETURN(obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL, + (void __user *)arg)); + } +} + +static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + struct ll_file_data *fd = file->private_data; + struct ll_sb_info *sbi = ll_i2sbi(inode); + int api32 = ll_need_32bit_api(sbi); + loff_t ret = -EINVAL; + ENTRY; + + inode_lock(inode); + switch (origin) { + case SEEK_SET: + break; + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_END: + if (offset > 0) + GOTO(out, ret); + if (api32) + offset += LL_DIR_END_OFF_32BIT; + else + offset += LL_DIR_END_OFF; + break; + default: + GOTO(out, ret); + } + + if (offset >= 0 && + ((api32 && offset <= LL_DIR_END_OFF_32BIT) || + (!api32 && offset <= LL_DIR_END_OFF))) { + if (offset != file->f_pos) { + bool hash64; + + hash64 = test_bit(LL_SBI_64BIT_HASH, sbi->ll_flags); + if ((api32 && offset == LL_DIR_END_OFF_32BIT) || + (!api32 && offset == LL_DIR_END_OFF)) + fd->lfd_pos = MDS_DIR_END_OFF; + else if (api32 && hash64) + fd->lfd_pos = offset << 32; + else + fd->lfd_pos = offset; + file->f_pos = offset; + file->f_version = 0; + } + ret = offset; + } + GOTO(out, ret); + +out: + inode_unlock(inode); + return ret; +} + +static int ll_dir_open(struct inode *inode, struct file *file) +{ + ENTRY; + RETURN(ll_file_open(inode, file)); +} + +static int ll_dir_release(struct inode *inode, struct file *file) +{ + ENTRY; + RETURN(ll_file_release(inode, file)); +} + +/* notify error if partially read striped directory */ +static int ll_dir_flush(struct file *file, fl_owner_t id) +{ + struct ll_file_data *lfd = file->private_data; + int rc = lfd->fd_partial_readdir_rc; + + lfd->fd_partial_readdir_rc = 0; + + return rc; +} + +const struct file_operations ll_dir_operations = { + .llseek = ll_dir_seek, + .open = ll_dir_open, + .release = ll_dir_release, + .read = 
generic_read_dir, +#ifdef HAVE_DIR_CONTEXT + .iterate_shared = ll_iterate, +#else + .readdir = ll_readdir, +#endif + .unlocked_ioctl = ll_dir_ioctl, + .fsync = ll_fsync, + .flush = ll_dir_flush, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/file.c b/drivers/staging/lustrefsx/lustre/llite/file.c new file mode 100644 index 0000000000000..1b3d8d90d193c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/file.c @@ -0,0 +1,6260 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/llite/file.c + * + * Author: Peter Braam + * Author: Phil Schwan + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LLITE +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cl_object.h" +#include "llite_internal.h" +#include "vvp_internal.h" + +struct split_param { + struct inode *sp_inode; + __u16 sp_mirror_id; +}; + +struct pcc_param { + __u64 pa_data_version; + __u32 pa_archive_id; + __u32 pa_layout_gen; +}; + +static int +ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); + +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, + bool *lease_broken); + +static struct ll_file_data *ll_file_data_get(void) +{ + struct ll_file_data *fd; + + OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS); + if (fd == NULL) + return NULL; + + fd->fd_write_failed = false; + pcc_file_init(&fd->fd_pcc_file); + + return fd; +} + +static void ll_file_data_put(struct ll_file_data *fd) +{ + if (fd != NULL) + OBD_SLAB_FREE_PTR(fd, ll_file_data_slab); +} + +/** + * Packs all the attributes into @op_data for the CLOSE rpc. + */ +static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data, + struct obd_client_handle *och) +{ + ENTRY; + + ll_prep_md_op_data(op_data, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); + + op_data->op_attr.ia_mode = inode->i_mode; + op_data->op_attr.ia_atime = inode->i_atime; + op_data->op_attr.ia_mtime = inode->i_mtime; + op_data->op_attr.ia_ctime = inode->i_ctime; + /* In case of encrypted file without the key, visible size was rounded + * up to next LUSTRE_ENCRYPTION_UNIT_SIZE, and clear text size was + * stored into lli_lazysize in ll_merge_attr(), so set proper file size + * now that we are closing. 
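 + * The lazy size is only trusted when OBD_MD_FLLAZYSIZE is set in lli_attr_valid; otherwise i_size_read() below provides the size.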
+ */ + if (llcrypt_require_key(inode) == -ENOKEY && + ll_i2info(inode)->lli_attr_valid & OBD_MD_FLLAZYSIZE) + op_data->op_attr.ia_size = ll_i2info(inode)->lli_lazysize; + else + op_data->op_attr.ia_size = i_size_read(inode); + op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME); + op_data->op_xvalid |= OP_XVALID_CTIME_SET; + op_data->op_attr_blocks = inode->i_blocks; + op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags); + if (test_bit(LLIF_PROJECT_INHERIT, &ll_i2info(inode)->lli_flags)) + op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL; + op_data->op_open_handle = och->och_open_handle; + + if (och->och_flags & FMODE_WRITE && + test_and_clear_bit(LLIF_DATA_MODIFIED, &ll_i2info(inode)->lli_flags)) + /* For HSM: if inode data has been modified, pack it so that + * MDT can set data dirty flag in the archive. */ + op_data->op_bias |= MDS_DATA_MODIFIED; + + EXIT; +} + +/** + * Perform a close, possibly with a bias. + * The meaning of "data" depends on the value of "bias". + * + * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version. + * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to + * swap layouts with. + */ +static int ll_close_inode_openhandle(struct inode *inode, + struct obd_client_handle *och, + enum mds_op_bias bias, void *data) +{ + struct obd_export *md_exp = ll_i2mdexp(inode); + const struct ll_inode_info *lli = ll_i2info(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + if (class_exp2obd(md_exp) == NULL) { + CERROR("%s: invalid MDC connection handle closing "DFID"\n", + ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid)); + GOTO(out, rc = 0); + } + + OBD_ALLOC_PTR(op_data); + /* We leak openhandle and request here on error, but not much to be + * done in OOM case since app won't retry close on error either. 
*/ + if (op_data == NULL) + GOTO(out, rc = -ENOMEM); + + ll_prepare_close(inode, op_data, och); + switch (bias) { + case MDS_CLOSE_LAYOUT_MERGE: + /* merge blocks from the victim inode */ + op_data->op_attr_blocks += ((struct inode *)data)->i_blocks; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; + fallthrough; + case MDS_CLOSE_LAYOUT_SPLIT: + case MDS_CLOSE_LAYOUT_SWAP: { + struct split_param *sp = data; + + LASSERT(data != NULL); + op_data->op_bias |= bias; + op_data->op_data_version = 0; + op_data->op_lease_handle = och->och_lease_handle; + if (bias == MDS_CLOSE_LAYOUT_SPLIT) { + op_data->op_fid2 = *ll_inode2fid(sp->sp_inode); + op_data->op_mirror_id = sp->sp_mirror_id; + } else { + op_data->op_fid2 = *ll_inode2fid(data); + } + break; + } + + case MDS_CLOSE_RESYNC_DONE: { + struct ll_ioc_lease *ioc = data; + + LASSERT(data != NULL); + op_data->op_attr_blocks += + ioc->lil_count * op_data->op_attr_blocks; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; + op_data->op_bias |= MDS_CLOSE_RESYNC_DONE; + + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_data = &ioc->lil_ids[0]; + op_data->op_data_size = + ioc->lil_count * sizeof(ioc->lil_ids[0]); + break; + } + + case MDS_PCC_ATTACH: { + struct pcc_param *param = data; + + LASSERT(data != NULL); + op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH; + op_data->op_archive_id = param->pa_archive_id; + op_data->op_data_version = param->pa_data_version; + op_data->op_lease_handle = och->och_lease_handle; + break; + } + + case MDS_HSM_RELEASE: + LASSERT(data != NULL); + op_data->op_bias |= MDS_HSM_RELEASE; + op_data->op_data_version = *(__u64 *)data; + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; + break; + + default: + LASSERT(data == NULL); + break; + } + + if (!(op_data->op_attr.ia_valid & ATTR_SIZE)) + op_data->op_xvalid |= OP_XVALID_LAZYSIZE; + if (!(op_data->op_xvalid & OP_XVALID_BLOCKS)) + op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS; + + rc = md_close(md_exp, op_data, och->och_mod, &req); + if (rc != 0 && rc != -EINTR) + CERROR("%s: inode "DFID" mdc close failed: rc = %d\n", + md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc); + + if (rc == 0 && op_data->op_bias & bias) { + struct mdt_body *body; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED)) + rc = -EBUSY; + + if (bias & MDS_PCC_ATTACH) { + struct pcc_param *param = data; + + param->pa_layout_gen = body->mbo_layout_gen; + } + } + + ll_finish_md_op_data(op_data); + EXIT; +out: + + md_clear_open_replay_data(md_exp, och); + och->och_open_handle.cookie = DEAD_HANDLE_MAGIC; + OBD_FREE_PTR(och); + + ptlrpc_req_finished(req); /* This is close request */ + return rc; +} + +int ll_md_real_close(struct inode *inode, fmode_t fmode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle **och_p; + struct obd_client_handle *och; + __u64 *och_usecount; + int rc = 0; + ENTRY; + + if (fmode & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (fmode & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + LASSERT(fmode & FMODE_READ); + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + mutex_lock(&lli->lli_och_mutex); + if (*och_usecount > 0) { + /* There are still users of this handle, so 
skip + * freeing it. */ + mutex_unlock(&lli->lli_och_mutex); + RETURN(0); + } + + och = *och_p; + *och_p = NULL; + mutex_unlock(&lli->lli_och_mutex); + + if (och != NULL) { + /* There might be a race and this handle may already + * be closed. */ + rc = ll_close_inode_openhandle(inode, och, 0, NULL); + } + + RETURN(rc); +} + +static int ll_md_close(struct inode *inode, struct file *file) +{ + union ldlm_policy_data policy = { + .l_inodebits = { MDS_INODELOCK_OPEN }, + }; + __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK; + struct ll_file_data *fd = file->private_data; + struct ll_inode_info *lli = ll_i2info(inode); + struct lustre_handle lockh; + enum ldlm_mode lockmode; + int rc = 0; + ENTRY; + + /* clear group lock, if present */ + if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) + ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid); + + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + bool lease_broken; + struct obd_client_handle *lease_och; + + lease_och = fd->fd_lease_och; + fd->fd_lease_och = NULL; + mutex_unlock(&lli->lli_och_mutex); + + /* Usually the lease is not released when the + * application crashes, so we need to release it here. */ + rc = ll_lease_close(lease_och, inode, &lease_broken); + + mutex_lock(&lli->lli_och_mutex); + + CDEBUG_LIMIT(rc ? D_ERROR : D_INODE, + "Clean up lease "DFID" %d/%d\n", + PFID(&lli->lli_fid), rc, lease_broken); + } + + if (fd->fd_och != NULL) { + struct obd_client_handle *och; + + och = fd->fd_och; + fd->fd_och = NULL; + mutex_unlock(&lli->lli_och_mutex); + + rc = ll_close_inode_openhandle(inode, och, 0, NULL); + GOTO(out, rc); + } + + /* Let's see if we have good enough OPEN lock on the file and if + we can skip talking to MDS */ + if (fd->fd_omode & FMODE_WRITE) { + lockmode = LCK_CW; + LASSERT(lli->lli_open_fd_write_count); + lli->lli_open_fd_write_count--; + } else if (fd->fd_omode & FMODE_EXEC) { + lockmode = LCK_PR; + LASSERT(lli->lli_open_fd_exec_count); + lli->lli_open_fd_exec_count--; + } else { + lockmode = LCK_CR; + LASSERT(lli->lli_open_fd_read_count); + lli->lli_open_fd_read_count--; + } + mutex_unlock(&lli->lli_och_mutex); + + /* LU-4398: do not cache write open lock if the file has exec bit */ + if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) || + !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode), + LDLM_IBITS, &policy, lockmode, &lockh)) + rc = ll_md_real_close(inode, fd->fd_omode); + +out: + file->private_data = NULL; + ll_file_data_put(fd); + + RETURN(rc); +} + +/* While this returns an error code, fput() (the caller) does not, so we need + * to make every effort to clean up all of our state here. Also, applications + * rarely check close errors and even if an error is returned they will not + * re-try the close call. + */ +int ll_file_release(struct inode *inode, struct file *file) +{ + struct ll_file_data *fd; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + ktime_t kstart = ktime_get(); + int rc; + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + fd = file->private_data; + LASSERT(fd != NULL); + + /* The last ref on @file, maybe not the owner pid of statahead, + * because parent and child process can share the same file handle. 
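 + * Only the file handle recorded in lli_opendir_key may deauthorize statahead below.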
*/ + if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd) + ll_deauthorize_statahead(inode, fd); + + if (is_root_inode(inode)) { + file->private_data = NULL; + ll_file_data_put(fd); + GOTO(out, rc = 0); + } + + pcc_file_release(inode, file); + + if (!S_ISDIR(inode->i_mode)) { + if (lli->lli_clob != NULL) + lov_read_and_clear_async_rc(lli->lli_clob); + lli->lli_async_rc = 0; + } + + lli->lli_close_fd_time = ktime_get(); + + rc = ll_md_close(inode, file); + + if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val)) + libcfs_debug_dumplog(); + +out: + if (!rc && !is_root_inode(inode)) + ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, + ktime_us_delta(ktime_get(), kstart)); + RETURN(rc); +} + +static inline int ll_dom_readpage(void *data, struct page *page) +{ + /* since ll_dom_readpage is a page cache helper, it is safe to assume + * mapping and host pointers are set here + */ + struct inode *inode; + struct niobuf_local *lnb = data; + void *kaddr; + int rc = 0; + + inode = page2inode(page); + + kaddr = kmap_atomic(page); + memcpy(kaddr, lnb->lnb_data, lnb->lnb_len); + if (lnb->lnb_len < PAGE_SIZE) + memset(kaddr + lnb->lnb_len, 0, + PAGE_SIZE - lnb->lnb_len); + flush_dcache_page(page); + SetPageUptodate(page); + kunmap_atomic(kaddr); + + if (inode && IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) { + if (!llcrypt_has_encryption_key(inode)) { + CDEBUG(D_SEC, "no enc key for "DFID"\n", + PFID(ll_inode2fid(inode))); + rc = -ENOKEY; + } else { + unsigned int offs = 0; + + while (offs < PAGE_SIZE) { + /* decrypt only if page is not empty */ + if (memcmp(page_address(page) + offs, + page_address(ZERO_PAGE(0)), + LUSTRE_ENCRYPTION_UNIT_SIZE) == 0) + break; + + rc = llcrypt_decrypt_pagecache_blocks(page, + LUSTRE_ENCRYPTION_UNIT_SIZE, + offs); + if (rc) + break; + + offs += LUSTRE_ENCRYPTION_UNIT_SIZE; + } + } + } + unlock_page(page); + + return rc; +} + +#ifdef HAVE_READ_CACHE_PAGE_WANTS_FILE +static inline int ll_dom_read_folio(struct file *file, struct folio *folio0) +{ + return ll_dom_readpage(file->private_data, folio_page(folio0, 0)); +} +#else +#define ll_dom_read_folio ll_dom_readpage +#endif + +void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req) +{ + struct lu_env *env; + struct cl_io *io; + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct address_space *mapping = inode->i_mapping; + struct page *vmpage; + struct niobuf_remote *rnb; + struct mdt_body *body; + char *data; + unsigned long index, start; + struct niobuf_local lnb; + __u16 refcheck; + int rc; + + ENTRY; + + if (obj == NULL) + RETURN_EXIT; + + if (!req_capsule_field_present(&req->rq_pill, &RMF_NIOBUF_INLINE, + RCL_SERVER)) + RETURN_EXIT; + + rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE); + if (rnb == NULL || rnb->rnb_len == 0) + RETURN_EXIT; + + /* LU-11595: Server may return whole file and that is OK always or + * it may return just file tail and its offset must be aligned with + * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is + * smaller then offset may be not aligned and that data is just ignored. + */ + if (rnb->rnb_offset & ~PAGE_MASK) + RETURN_EXIT; + + /* Server returns whole file or just file tail if it fills in reply + * buffer, in both cases total size should be equal to the file size. 
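 + * Encrypted files are exempted from the size check below, since their apparent size may be rounded up to LUSTRE_ENCRYPTION_UNIT_SIZE.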
+ */ + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size && + !(inode && IS_ENCRYPTED(inode))) { + CERROR("%s: server returns off/len %llu/%u but size %llu\n", + ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset, + rnb->rnb_len, body->mbo_dom_size); + RETURN_EXIT; + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN_EXIT; + io = vvp_env_thread_io(env); + io->ci_obj = obj; + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, obj); + if (rc) + GOTO(out_io, rc); + + CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n", + rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size); + + data = (char *)rnb + sizeof(*rnb); + + lnb.lnb_file_offset = rnb->rnb_offset; + start = lnb.lnb_file_offset >> PAGE_SHIFT; + index = 0; + LASSERT((lnb.lnb_file_offset & ~PAGE_MASK) == 0); + lnb.lnb_page_offset = 0; + do { + struct cl_page *page; + + lnb.lnb_data = data + (index << PAGE_SHIFT); + lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT); + if (lnb.lnb_len > PAGE_SIZE) + lnb.lnb_len = PAGE_SIZE; + + vmpage = ll_read_cache_page(mapping, index + start, + ll_dom_read_folio, &lnb); + if (IS_ERR(vmpage)) { + CWARN("%s: cannot fill page %lu for "DFID + " with data: rc = %li\n", + ll_i2sbi(inode)->ll_fsname, index + start, + PFID(lu_object_fid(&obj->co_lu)), + PTR_ERR(vmpage)); + break; + } + lock_page(vmpage); + if (vmpage->mapping == NULL) { + unlock_page(vmpage); + put_page(vmpage); + /* page was truncated */ + break; + } + /* attach VM page to CL page cache */ + page = cl_page_find(env, obj, vmpage->index, vmpage, + CPT_CACHEABLE); + if (IS_ERR(page)) { + ClearPageUptodate(vmpage); + unlock_page(vmpage); + put_page(vmpage); + break; + } + cl_page_export(env, page, 1); + cl_page_put(env, page); + unlock_page(vmpage); + put_page(vmpage); + index++; + } while (rnb->rnb_len > (index << PAGE_SHIFT)); + +out_io: + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + + EXIT; +} + +static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, + struct lookup_intent *itp) +{ + struct ll_sb_info *sbi = ll_i2sbi(de->d_inode); + struct dentry *parent = de->d_parent; + char *name = NULL; + int len = 0; + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + LASSERT(parent != NULL); + LASSERT(itp->it_flags & MDS_OPEN_BY_FID); + + /* if server supports open-by-fid, or file name is invalid, don't pack + * name in open request */ + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) || + !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) { +retry: + len = de->d_name.len; + name = kmalloc(len + 1, GFP_NOFS); + if (!name) + RETURN(-ENOMEM); + + /* race here */ + spin_lock(&de->d_lock); + if (len != de->d_name.len) { + spin_unlock(&de->d_lock); + kfree(name); + goto retry; + } + memcpy(name, de->d_name.name, len); + name[len] = '\0'; + spin_unlock(&de->d_lock); + + if (!lu_name_is_valid_2(name, len)) { + kfree(name); + RETURN(-ESTALE); + } + } + + op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode, + name, len, 0, LUSTRE_OPC_OPEN, NULL); + if (IS_ERR(op_data)) { + kfree(name); + RETURN(PTR_ERR(op_data)); + } + op_data->op_data = lmm; + op_data->op_data_size = lmmsize; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_OPEN_DELAY, cfs_fail_val); + + rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req, + &ll_md_blocking_ast, 0); + kfree(name); + ll_finish_md_op_data(op_data); + if (rc == -ESTALE) { + /* reason for keep own exit path - don`t flood log + * with messages with 
-ESTALE errors. + */ + if (!it_disposition(itp, DISP_OPEN_OPEN) || + it_open_error(DISP_OPEN_OPEN, itp)) + GOTO(out, rc); + ll_release_openhandle(de, itp); + GOTO(out, rc); + } + + if (it_disposition(itp, DISP_LOOKUP_NEG)) + GOTO(out, rc = -ENOENT); + + if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) { + rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp); + CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc); + GOTO(out, rc); + } + + rc = ll_prep_inode(&de->d_inode, &req->rq_pill, NULL, itp); + + if (!rc && itp->it_lock_mode) { + __u64 bits = 0; + + /* If we got a lock back and it has a LOOKUP bit set, + * make sure the dentry is marked as valid so we can find it. + * We don't need to care about actual hashing since other bits + * of kernel will deal with that later. + */ + ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, &bits); + if (bits & MDS_INODELOCK_LOOKUP) { + d_lustre_revalidate(de); + ll_update_dir_depth(parent->d_inode, de->d_inode); + } + + /* if DoM bit returned along with LAYOUT bit then there + * can be read-on-open data returned. + */ + if (bits & MDS_INODELOCK_DOM && bits & MDS_INODELOCK_LAYOUT) + ll_dom_finish_open(de->d_inode, req); + } + +out: + ptlrpc_req_finished(req); + ll_intent_drop_lock(itp); + + /* We did open by fid, but by the time we got to the server, the object + * disappeared. This is possible if the object was unlinked, but it's + * also possible if the object was unlinked by a rename. In the case + * of an object renamed over our existing one, we can't fail this open. + * O_CREAT also goes through this path if we had an existing dentry, + * and it's obviously wrong to return ENOENT for O_CREAT. + * + * Instead let's return -ESTALE, and the VFS will retry the open with + * LOOKUP_REVAL, which we catch in ll_revalidate_dentry and fail to + * revalidate, causing a lookup. This causes extra lookups in the case + * where we had a dentry in cache but the file is being unlinked and we + * lose the race with unlink, but this should be very rare. 
+ */ + if (rc == -ENOENT) + rc = -ESTALE; + + RETURN(rc); +} + +static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it, + struct obd_client_handle *och) +{ + struct mdt_body *body; + + body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY); + och->och_open_handle = body->mbo_open_handle; + och->och_fid = body->mbo_fid1; + och->och_lease_handle.cookie = it->it_lock_handle; + och->och_magic = OBD_CLIENT_HANDLE_MAGIC; + och->och_flags = it->it_flags; + + return md_set_open_replay_data(md_exp, och, it); +} + +static int ll_local_open(struct file *file, struct lookup_intent *it, + struct ll_file_data *fd, struct obd_client_handle *och) +{ + struct inode *inode = file_inode(file); + ENTRY; + + LASSERT(!file->private_data); + + LASSERT(fd != NULL); + + if (och) { + int rc; + + rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + if (rc != 0) + RETURN(rc); + } + + file->private_data = fd; + ll_readahead_init(inode, &fd->fd_ras); + fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); + + RETURN(0); +} + +void ll_track_file_opens(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + + /* do not skew results with delays from never-opened inodes */ + if (ktime_to_ns(lli->lli_close_fd_time)) + ll_stats_ops_tally(sbi, LPROC_LL_INODE_OPCLTM, + ktime_us_delta(ktime_get(), lli->lli_close_fd_time)); + + if (ktime_after(ktime_get(), + ktime_add_ms(lli->lli_close_fd_time, + sbi->ll_oc_max_ms))) { + lli->lli_open_fd_count = 1; + lli->lli_close_fd_time = ns_to_ktime(0); + } else { + lli->lli_open_fd_count++; + } + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_OCOUNT, + lli->lli_open_fd_count); +} + +/* Open a file, and (for the very first open) create objects on the OSTs at + * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object + * creation or open until ll_lov_setstripe() ioctl is called. + * + * If we already have the stripe MD locally then we don't request it in + * md_open(), by passing a lmm_size = 0. + * + * It is up to the application to ensure no other processes open this file + * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be + * used. We might be able to avoid races of that sort by getting lli_open_sem + * before returning in the O_LOV_DELAY_CREATE case and dropping it here + * or in ll_file_release(), but I'm not sure that is desirable/necessary. 
+ */ +int ll_file_open(struct inode *inode, struct file *file) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lookup_intent *it, oit = { .it_op = IT_OPEN, + .it_flags = file->f_flags }; + struct obd_client_handle **och_p = NULL; + __u64 *och_usecount = NULL; + struct ll_file_data *fd; + ktime_t kstart = ktime_get(); + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n", + PFID(ll_inode2fid(inode)), inode, file->f_flags); + + it = file->private_data; /* XXX: compat macro */ + file->private_data = NULL; /* prevent ll_local_open assertion */ + + if (S_ISREG(inode->i_mode)) { + rc = ll_file_open_encrypt(inode, file); + if (rc) { + if (it && it->it_disposition) + ll_release_openhandle(file_dentry(file), it); + GOTO(out_nofiledata, rc); + } + } + + fd = ll_file_data_get(); + if (fd == NULL) + GOTO(out_nofiledata, rc = -ENOMEM); + + fd->fd_file = file; + if (S_ISDIR(inode->i_mode)) + ll_authorize_statahead(inode, fd); + + ll_track_file_opens(inode); + if (is_root_inode(inode)) { + file->private_data = fd; + RETURN(0); + } + + if (!it || !it->it_disposition) { + CDEBUG(D_HSM, "MDLL file->f_flags=0x%x/0%o\n", + file->f_flags, file->f_flags); + /* Convert f_flags into access mode. We cannot use file->f_mode, + * because everything but O_ACCMODE mask was stripped from + * there */ + if ((oit.it_flags + 1) & O_ACCMODE) + oit.it_flags++; + if (file->f_flags & O_TRUNC) + oit.it_flags |= FMODE_WRITE; + + /* kernel only call f_op->open in dentry_open. filp_open calls + * dentry_open after call to open_namei that checks permissions. + * Only nfsd_open call dentry_open directly without checking + * permissions and because of that this code below is safe. + */ + if (oit.it_flags & (FMODE_WRITE | FMODE_READ)) + oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; + + /* We do not want O_EXCL here, presumably we opened the file + * already? XXX - NFS implications? */ + oit.it_flags &= ~O_EXCL; + + /* bug20584, if "it_flags" contains O_CREAT, the file will be + * created if necessary, then "IT_CREAT" should be set to keep + * consistent with it */ + if (oit.it_flags & O_CREAT) + oit.it_op |= IT_CREAT; + + it = &oit; + } + +restart: + /* Let's see if we have file open on MDS already. */ + if (it->it_flags & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (it->it_flags & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + mutex_lock(&lli->lli_och_mutex); + if (*och_p) { /* Open handle is present */ + if (it_disposition(it, DISP_OPEN_OPEN)) { + /* Well, there's extra open request that we do not need, + * let's close it somehow. This will decref request. 
*/ + rc = it_open_error(DISP_OPEN_OPEN, it); + if (rc) { + mutex_unlock(&lli->lli_och_mutex); + GOTO(out_openerr, rc); + } + + ll_release_openhandle(file_dentry(file), it); + } + (*och_usecount)++; + + rc = ll_local_open(file, it, fd, NULL); + if (rc) { + (*och_usecount)--; + mutex_unlock(&lli->lli_och_mutex); + GOTO(out_openerr, rc); + } + } else { + LASSERT(*och_usecount == 0); + if (!it->it_disposition) { + struct dentry *dentry = file_dentry(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_dentry_data *ldd; + + /* We cannot just request lock handle now, new ELC code + * means that one of other OPEN locks for this file + * could be cancelled, and since blocking ast handler + * would attempt to grab och_mutex as well, that would + * result in a deadlock + */ + mutex_unlock(&lli->lli_och_mutex); + /* + * Normally called under two situations: + * 1. NFS export. + * 2. A race/condition on MDS resulting in no open + * handle to be returned from LOOKUP|OPEN request, + * for example if the target entry was a symlink. + * + * In NFS path we know there's pathologic behavior + * so we always enable open lock caching when coming + * from there. It's detected by setting a flag in + * ll_iget_for_nfs. + * + * After reaching number of opens of this inode + * we always ask for an open lock on it to handle + * bad userspace actors that open and close files + * in a loop for absolutely no good reason + */ + + ldd = ll_d2d(dentry); + if (filename_is_volatile(dentry->d_name.name, + dentry->d_name.len, + NULL)) { + /* There really is nothing here, but this + * make this more readable I think. + * We do not want openlock for volatile + * files under any circumstances + */ + } else if (ldd && ldd->lld_nfs_dentry) { + /* NFS path. This also happens to catch + * open by fh files I guess + */ + it->it_flags |= MDS_OPEN_LOCK; + /* clear the flag for future lookups */ + ldd->lld_nfs_dentry = 0; + } else if (sbi->ll_oc_thrsh_count > 0) { + /* Take MDS_OPEN_LOCK with many opens */ + if (lli->lli_open_fd_count >= + sbi->ll_oc_thrsh_count) + it->it_flags |= MDS_OPEN_LOCK; + + /* If this is open after we just closed */ + else if (ktime_before(ktime_get(), + ktime_add_ms(lli->lli_close_fd_time, + sbi->ll_oc_thrsh_ms))) + it->it_flags |= MDS_OPEN_LOCK; + } + + /* + * Always specify MDS_OPEN_BY_FID because we don't want + * to get file with different fid. + */ + it->it_flags |= MDS_OPEN_BY_FID; + rc = ll_intent_file_open(dentry, NULL, 0, it); + if (rc) + GOTO(out_openerr, rc); + + goto restart; + } + OBD_ALLOC(*och_p, sizeof(struct obd_client_handle)); + if (!*och_p) + GOTO(out_och_free, rc = -ENOMEM); + + (*och_usecount)++; + + /* md_intent_lock() didn't get a request ref if there was an + * open error, so don't do cleanup on the request here + * (bug 3430) */ + /* XXX (green): Should not we bail out on any error here, not + * just open error? 
*/ + rc = it_open_error(DISP_OPEN_OPEN, it); + if (rc != 0) + GOTO(out_och_free, rc); + + LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF), + "inode %p: disposition %x, status %d\n", inode, + it_disposition(it, ~0), it->it_status); + + rc = ll_local_open(file, it, fd, *och_p); + if (rc) + GOTO(out_och_free, rc); + } + + rc = pcc_file_open(inode, file); + if (rc) + GOTO(out_och_free, rc); + + mutex_unlock(&lli->lli_och_mutex); + + fd = NULL; + + /* Must do this outside lli_och_mutex lock to prevent deadlock where + different kind of OPEN lock for this same inode gets cancelled + by ldlm_cancel_lru */ + if (!S_ISREG(inode->i_mode)) + GOTO(out_och_free, rc); + cl_lov_delay_create_clear(&file->f_flags); + cl_lu_noimport_clear(&file->f_flags); + GOTO(out_och_free, rc); + +out_och_free: + if (rc) { + if (och_p && *och_p) { + OBD_FREE(*och_p, sizeof(struct obd_client_handle)); + *och_p = NULL; /* OBD_FREE writes some magic there */ + (*och_usecount)--; + } + mutex_unlock(&lli->lli_och_mutex); + +out_openerr: + if (lli->lli_opendir_key == fd) + ll_deauthorize_statahead(inode, fd); + + if (fd != NULL) + ll_file_data_put(fd); + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, + ktime_us_delta(ktime_get(), kstart)); + } + +out_nofiledata: + if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) { + ptlrpc_req_finished(it->it_request); + it_clear_disposition(it, DISP_ENQ_OPEN_REF); + } + + return rc; +} + +static int ll_md_blocking_lease_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, void *data, int flag) +{ + int rc; + struct lustre_handle lockh; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc); + RETURN(rc); + } + break; + case LDLM_CB_CANCELING: + /* do nothing */ + break; + } + RETURN(0); +} + +/** + * When setting a lease on a file, we take ownership of the lli_mds_*_och + * and save it as fd->fd_och so as to force client to reopen the file even + * if it has an open lock in cache already. + */ +static int ll_lease_och_acquire(struct inode *inode, struct file *file, + struct lustre_handle *old_open_handle) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = file->private_data; + struct obd_client_handle **och_p; + __u64 *och_usecount; + int rc = 0; + ENTRY; + + /* Get the openhandle of the file */ + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) + GOTO(out_unlock, rc = -EBUSY); + + if (fd->fd_och == NULL) { + if (file->f_mode & FMODE_WRITE) { + LASSERT(lli->lli_mds_write_och != NULL); + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else { + LASSERT(lli->lli_mds_read_och != NULL); + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + if (*och_usecount > 1) + GOTO(out_unlock, rc = -EBUSY); + + fd->fd_och = *och_p; + *och_usecount = 0; + *och_p = NULL; + } + + *old_open_handle = fd->fd_och->och_open_handle; + + EXIT; +out_unlock: + mutex_unlock(&lli->lli_och_mutex); + return rc; +} + +/** + * Release ownership on lli_mds_*_och when putting back a file lease. 
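+ *
+ * Rough ownership flow between ll_lease_och_acquire() above and this
+ * function (illustration only):
+ *
+ *	acquire: fd->fd_och = *och_p; *och_p = NULL; *och_usecount = 0;
+ *	release: if *och_p is still NULL, hand fd_och back (*och_p = fd_och,
+ *	         *och_usecount = 1); otherwise another open re-filled *och_p
+ *	         (broken lease), so bump *och_usecount and close fd_och.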
+ */ +static int ll_lease_och_release(struct inode *inode, struct file *file) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = file->private_data; + struct obd_client_handle **och_p; + struct obd_client_handle *old_och = NULL; + __u64 *och_usecount; + int rc = 0; + ENTRY; + + mutex_lock(&lli->lli_och_mutex); + if (file->f_mode & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else { + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + /* The file may have been open by another process (broken lease) so + * *och_p is not NULL. In this case we should simply increase usecount + * and close fd_och. + */ + if (*och_p != NULL) { + old_och = fd->fd_och; + (*och_usecount)++; + } else { + *och_p = fd->fd_och; + *och_usecount = 1; + } + fd->fd_och = NULL; + mutex_unlock(&lli->lli_och_mutex); + + if (old_och != NULL) + rc = ll_close_inode_openhandle(inode, old_och, 0, NULL); + + RETURN(rc); +} + +/** + * Acquire a lease and open the file. + */ +static struct obd_client_handle * +ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, + __u64 open_flags) +{ + struct lookup_intent it = { .it_op = IT_OPEN }; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + struct lustre_handle old_open_handle = { 0 }; + struct obd_client_handle *och = NULL; + int rc; + int rc2; + ENTRY; + + if (fmode != FMODE_WRITE && fmode != FMODE_READ) + RETURN(ERR_PTR(-EINVAL)); + + if (file != NULL) { + if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC)) + RETURN(ERR_PTR(-EPERM)); + + rc = ll_lease_och_acquire(inode, file, &old_open_handle); + if (rc) + RETURN(ERR_PTR(rc)); + } + + OBD_ALLOC_PTR(och); + if (och == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + + /* To tell the MDT this openhandle is from the same owner */ + op_data->op_open_handle = old_open_handle; + + it.it_flags = fmode | open_flags; + it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE; + rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req, + &ll_md_blocking_lease_ast, + /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise + * it can be cancelled which may mislead applications that the lease is + * broken; + * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal + * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast + * doesn't deal with openhandle, so normal openhandle will be leaked. */ + LDLM_FL_NO_LRU | LDLM_FL_EXCL); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc < 0) + GOTO(out_release_it, rc); + + if (it_disposition(&it, DISP_LOOKUP_NEG)) + GOTO(out_release_it, rc = -ENOENT); + + rc = it_open_error(DISP_OPEN_OPEN, &it); + if (rc) + GOTO(out_release_it, rc); + + LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF)); + rc = ll_och_fill(sbi->ll_md_exp, &it, och); + if (rc) + GOTO(out_release_it, rc); + + if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? 
*/ + GOTO(out_close, rc = -EOPNOTSUPP); + + /* already get lease, handle lease lock */ + ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); + if (!it.it_lock_mode || + !(it.it_lock_bits & MDS_INODELOCK_OPEN)) { + /* open lock must return for lease */ + CERROR(DFID "lease granted but no open lock, %d/%llu.\n", + PFID(ll_inode2fid(inode)), it.it_lock_mode, + it.it_lock_bits); + GOTO(out_close, rc = -EPROTO); + } + + ll_intent_release(&it); + RETURN(och); + +out_close: + /* Cancel open lock */ + if (it.it_lock_mode != 0) { + ldlm_lock_decref_and_cancel(&och->och_lease_handle, + it.it_lock_mode); + it.it_lock_mode = 0; + och->och_lease_handle.cookie = 0ULL; + } + rc2 = ll_close_inode_openhandle(inode, och, 0, NULL); + if (rc2 < 0) + CERROR("%s: error closing file "DFID": %d\n", + sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2); + och = NULL; /* och has been freed in ll_close_inode_openhandle() */ +out_release_it: + ll_intent_release(&it); +out: + if (och != NULL) + OBD_FREE_PTR(och); + RETURN(ERR_PTR(rc)); +} + +/** + * Check whether a layout swap can be done between two inodes. + * + * \param[in] inode1 First inode to check + * \param[in] inode2 Second inode to check + * + * \retval 0 on success, layout swap can be performed between both inodes + * \retval negative error code if requirements are not met + */ +static int ll_check_swap_layouts_validity(struct inode *inode1, + struct inode *inode2) +{ + if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode)) + return -EINVAL; + + if (inode_permission(&init_user_ns, inode1, MAY_WRITE) || + inode_permission(&init_user_ns, inode2, MAY_WRITE)) + return -EPERM; + + if (inode1->i_sb != inode2->i_sb) + return -EXDEV; + + return 0; +} + +static int ll_swap_layouts_close(struct obd_client_handle *och, + struct inode *inode, struct inode *inode2) +{ + const struct lu_fid *fid1 = ll_inode2fid(inode); + const struct lu_fid *fid2; + int rc; + ENTRY; + + CDEBUG(D_INODE, "%s: biased close of file "DFID"\n", + ll_i2sbi(inode)->ll_fsname, PFID(fid1)); + + rc = ll_check_swap_layouts_validity(inode, inode2); + if (rc < 0) + GOTO(out_free_och, rc); + + /* We now know that inode2 is a lustre inode */ + fid2 = ll_inode2fid(inode2); + + rc = lu_fid_cmp(fid1, fid2); + if (rc == 0) + GOTO(out_free_och, rc = -EINVAL); + + /* Close the file and {swap,merge} layouts between inode & inode2. + * NB: lease lock handle is released in mdc_close_layout_swap_pack() + * because we still need it to pack l_remote_handle to MDT. */ + rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP, + inode2); + + och = NULL; /* freed in ll_close_inode_openhandle() */ + +out_free_och: + if (och != NULL) + OBD_FREE_PTR(och); + + RETURN(rc); +} + +/** + * Release lease and close the file. + * It will check if the lease has ever broken. + */ +static int ll_lease_close_intent(struct obd_client_handle *och, + struct inode *inode, + bool *lease_broken, enum mds_op_bias bias, + void *data) +{ + struct ldlm_lock *lock; + bool cancelled = true; + int rc; + ENTRY; + + lock = ldlm_handle2lock(&och->och_lease_handle); + if (lock != NULL) { + lock_res_and_lock(lock); + cancelled = ldlm_is_cancel(lock); + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + + CDEBUG(D_INODE, "lease for "DFID" broken? 
%d, bias: %x\n", + PFID(&ll_i2info(inode)->lli_fid), cancelled, bias); + + if (lease_broken != NULL) + *lease_broken = cancelled; + + if (!cancelled && !bias) + ldlm_cli_cancel(&och->och_lease_handle, 0); + + if (cancelled) { /* no need to excute intent */ + bias = 0; + data = NULL; + } + + rc = ll_close_inode_openhandle(inode, och, bias, data); + RETURN(rc); +} + +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, + bool *lease_broken) +{ + return ll_lease_close_intent(och, inode, lease_broken, 0, NULL); +} + +/** + * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT + */ +static int ll_lease_file_resync(struct obd_client_handle *och, + struct inode *inode, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ll_ioc_lease_id ioc; + __u64 data_version_unused; + int rc; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg, + sizeof(ioc))) + RETURN(-EFAULT); + + /* before starting file resync, it's necessary to clean up page cache + * in client memory, otherwise once the layout version is increased, + * writing back cached data will be denied the OSTs. */ + rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH); + if (rc) + GOTO(out, rc); + + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_mirror_id = ioc.lil_mirror_id; + rc = md_file_resync(sbi->ll_md_exp, op_data); + if (rc) + GOTO(out, rc); + + EXIT; +out: + ll_finish_md_op_data(op_data); + return rc; +} + +int ll_merge_attr(const struct lu_env *env, struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct cl_attr *attr = vvp_env_thread_attr(env); + s64 atime; + s64 mtime; + s64 ctime; + int rc = 0; + + ENTRY; + + ll_inode_size_lock(inode); + + /* Merge timestamps the most recently obtained from MDS with + * timestamps obtained from OSTs. + * + * Do not overwrite atime of inode because it may be refreshed + * by file_accessed() function. If the read was served by cache + * data, there is no RPC to be sent so that atime may not be + * transferred to OSTs at all. MDT only updates atime at close time + * if it's at least 'mdd.*.atime_diff' older. + * All in all, the atime in Lustre does not strictly comply with + * POSIX. Solving this problem needs to send an RPC to MDT for each + * read, this will hurt performance. + */ + if (test_and_clear_bit(LLIF_UPDATE_ATIME, &lli->lli_flags) || + inode->i_atime.tv_sec < lli->lli_atime) + inode->i_atime.tv_sec = lli->lli_atime; + + inode->i_mtime.tv_sec = lli->lli_mtime; + inode->i_ctime.tv_sec = lli->lli_ctime; + + mtime = inode->i_mtime.tv_sec; + atime = inode->i_atime.tv_sec; + ctime = inode->i_ctime.tv_sec; + + cl_object_attr_lock(obj); + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE)) + rc = -EINVAL; + else + rc = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + + if (rc != 0) + GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc)); + + if (atime < attr->cat_atime) + atime = attr->cat_atime; + + if (ctime < attr->cat_ctime) + ctime = attr->cat_ctime; + + if (mtime < attr->cat_mtime) + mtime = attr->cat_mtime; + + CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n", + PFID(&lli->lli_fid), attr->cat_size); + + if (llcrypt_require_key(inode) == -ENOKEY) { + /* Without the key, round up encrypted file size to next + * LUSTRE_ENCRYPTION_UNIT_SIZE. 
Clear text size is put in + * lli_lazysize for proper file size setting at close time. + */ + lli->lli_attr_valid |= OBD_MD_FLLAZYSIZE; + lli->lli_lazysize = attr->cat_size; + attr->cat_size = round_up(attr->cat_size, + LUSTRE_ENCRYPTION_UNIT_SIZE); + } + i_size_write(inode, attr->cat_size); + inode->i_blocks = attr->cat_blocks; + + inode->i_mtime.tv_sec = mtime; + inode->i_atime.tv_sec = atime; + inode->i_ctime.tv_sec = ctime; + +out_size_unlock: + ll_inode_size_unlock(inode); + + RETURN(rc); +} + +/** + * Set designated mirror for I/O. + * + * So far only read, write and truncate can issue I/O to a + * designated mirror. + */ +void ll_io_set_mirror(struct cl_io *io, const struct file *file) +{ + struct ll_file_data *fd = file->private_data; + + /* clear layout version for generic(non-resync) I/O in case it carries + * stale layout version due to I/O restart */ + io->ci_layout_version = 0; + + /* FLR: disable non-delay for designated mirror I/O because obviously + * only one mirror is available */ + if (fd->fd_designated_mirror > 0) { + io->ci_ndelay = 0; + io->ci_designated_mirror = fd->fd_designated_mirror; + io->ci_layout_version = fd->fd_layout_version; + } + + CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n", + file->f_path.dentry->d_name.name, io->ci_designated_mirror); +} + +/* + * This is relatime_need_update() from Linux 5.17, which is not exported. + */ +static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, + struct timespec64 now) +{ + + if (!(mnt->mnt_flags & MNT_RELATIME)) + return 1; + /* + * Is mtime younger than atime? If yes, update atime: + */ + if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0) + return 1; + /* + * Is ctime younger than atime? If yes, update atime: + */ + if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0) + return 1; + + /* + * Is the previous atime value older than 6 hours?
If yes, + * update atime: + */ + if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 6*60*60) + return 1; + /* + * Good, we can skip the atime update: + */ + return 0; +} + +/* + * Very similar to kernel function: !__atime_needs_update() + */ +static bool file_is_noatime(const struct file *file) +{ + struct vfsmount *mnt = file->f_path.mnt; + struct inode *inode = file_inode((struct file *)file); + struct timespec64 now; + + if (file->f_flags & O_NOATIME) + return true; + + if (inode->i_flags & S_NOATIME) + return true; + + if (IS_NOATIME(inode)) + return true; + + if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY)) + return true; + + if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) + return true; + + if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) + return true; + + now = current_time(inode); + + if (!relatime_need_update(mnt, inode, now)) + return true; + + return false; +} + +void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot, + struct vvp_io_args *args) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + + io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK; + io->ci_lock_no_expand = fd->ll_lock_no_expand; + + if (iot == CIT_WRITE) { + io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND); + io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC || + file->f_flags & O_DIRECT || + IS_SYNC(inode)); +#ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS + io->u.ci_wr.wr_sync |= !!(args && + (args->u.normal.via_iocb->ki_flags & + IOCB_DSYNC)); +#endif + } + + io->ci_obj = ll_i2info(inode)->lli_clob; + io->ci_lockreq = CILR_MAYBE; + if (ll_file_nolock(file)) { + io->ci_lockreq = CILR_NEVER; + io->ci_no_srvlock = 1; + } else if (file->f_flags & O_APPEND) { + io->ci_lockreq = CILR_MANDATORY; + } + io->ci_noatime = file_is_noatime(file); + io->ci_async_readahead = false; + + /* FLR: only use non-delay I/O for read as there is only one + * avaliable mirror for write. 
*/ + io->ci_ndelay = !(iot == CIT_WRITE); + + ll_io_set_mirror(io, file); +} + +static void ll_heat_add(struct inode *inode, enum cl_io_type iot, + __u64 count) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + enum obd_heat_type sample_type; + enum obd_heat_type iobyte_type; + __u64 now = ktime_get_real_seconds(); + + if (!ll_sbi_has_file_heat(sbi) || + lli->lli_heat_flags & LU_HEAT_FLAG_OFF) + return; + + if (iot == CIT_READ) { + sample_type = OBD_HEAT_READSAMPLE; + iobyte_type = OBD_HEAT_READBYTE; + } else if (iot == CIT_WRITE) { + sample_type = OBD_HEAT_WRITESAMPLE; + iobyte_type = OBD_HEAT_WRITEBYTE; + } else { + return; + } + + spin_lock(&lli->lli_heat_lock); + obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1, + sbi->ll_heat_decay_weight, sbi->ll_heat_period_second); + obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count, + sbi->ll_heat_decay_weight, sbi->ll_heat_period_second); + spin_unlock(&lli->lli_heat_lock); +} + +static ssize_t +ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, + struct file *file, enum cl_io_type iot, + loff_t *ppos, size_t count) +{ + struct vvp_io *vio = vvp_env_io(env); + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_file_data *fd = file->private_data; + struct range_lock range; + bool range_locked = false; + struct cl_io *io; + ssize_t result = 0; + int rc = 0; + int rc2 = 0; + unsigned int retried = 0, dio_lock = 0; + bool is_aio = false; + bool is_parallel_dio = false; + struct cl_dio_aio *ci_dio_aio = NULL; + size_t per_bytes; + bool partial_io = false; + size_t max_io_pages, max_cached_pages; + + ENTRY; + + CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n", + file_dentry(file)->d_name.name, + iot == CIT_READ ? "read" : "write", *ppos, count); + + max_io_pages = PTLRPC_MAX_BRW_PAGES * OBD_MAX_RIF_DEFAULT; + max_cached_pages = sbi->ll_cache->ccc_lru_max; + if (max_io_pages > (max_cached_pages >> 2)) + max_io_pages = max_cached_pages >> 2; + + io = vvp_env_thread_io(env); + if (file->f_flags & O_DIRECT) { + if (file->f_flags & O_APPEND) + dio_lock = 1; + if (!is_sync_kiocb(args->u.normal.via_iocb)) + is_aio = true; + + /* the kernel does not support AIO on pipes, and parallel DIO + * uses part of the AIO path, so we must not do parallel dio + * to pipes + */ + is_parallel_dio = !iov_iter_is_pipe(args->u.normal.via_iter) && + !is_aio; + + if (!ll_sbi_has_parallel_dio(sbi)) + is_parallel_dio = false; + + ci_dio_aio = cl_dio_aio_alloc(args->u.normal.via_iocb, + ll_i2info(inode)->lli_clob, is_aio); + if (!ci_dio_aio) + GOTO(out, rc = -ENOMEM); + } + +restart: + /** + * IO block size need be aware of cached page limit, otherwise + * if we have small max_cached_mb but large block IO issued, io + * could not be finished and blocked whole client. 
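+ *
+ * Worked example with hypothetical numbers (4 KiB pages): suppose the
+ * clamp above left max_io_pages at 16384 pages, i.e. 64 MiB; then a
+ * 1 GiB buffered write is carved into 64 MiB chunks, partial_io is set
+ * and the restart logic below resubmits until count is drained.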
+ */ + if (file->f_flags & O_DIRECT) + per_bytes = count; + else + per_bytes = min(max_io_pages << PAGE_SHIFT, count); + partial_io = per_bytes < count; + io = vvp_env_thread_io(env); + ll_io_init(io, file, iot, args); + io->ci_dio_aio = ci_dio_aio; + io->ci_dio_lock = dio_lock; + io->ci_ndelay_tried = retried; + io->ci_parallel_dio = is_parallel_dio; + + if (cl_io_rw_init(env, io, iot, *ppos, per_bytes) == 0) { + if (file->f_flags & O_APPEND) + range_lock_init(&range, 0, LUSTRE_EOF); + else + range_lock_init(&range, *ppos, *ppos + per_bytes - 1); + + vio->vui_fd = file->private_data; + vio->vui_iter = args->u.normal.via_iter; + vio->vui_iocb = args->u.normal.via_iocb; + /* Direct IO reads must also take range lock, + * or multiple reads will try to work on the same pages + * See LU-6227 for details. + */ + if (((iot == CIT_WRITE) || + (iot == CIT_READ && (file->f_flags & O_DIRECT))) && + !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n", + RL_PARA(&range)); + rc = range_lock(&lli->lli_write_tree, &range); + if (rc < 0) + GOTO(out, rc); + + range_locked = true; + } + + ll_cl_add(inode, env, io, LCC_RW); + rc = cl_io_loop(env, io); + ll_cl_remove(inode, env); + } else { + /* cl_io_rw_init() handled IO */ + rc = io->ci_result; + } + + if (io->ci_dio_aio && !is_aio) { + struct cl_sync_io *anchor = &io->ci_dio_aio->cda_sync; + + /* for dio, EIOCBQUEUED is an implementation detail, + * and we don't return it to userspace + */ + if (rc == -EIOCBQUEUED) + rc = 0; + + /* N/B: parallel DIO may be disabled during i/o submission; + * if that occurs, I/O shifts to sync, so it's all resolved + * before we get here, and this wait call completes + * immediately. + */ + rc2 = cl_sync_io_wait_recycle(env, anchor, 0, 0); + if (rc2 < 0) + rc = rc2; + } + + if (range_locked) { + CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n", + RL_PARA(&range)); + range_unlock(&lli->lli_write_tree, &range); + range_locked = false; + } + + /* + * In order to move forward AIO, ci_nob was increased, + * but that doesn't mean io have been finished, it just + * means io have been submited, we will always return + * EIOCBQUEUED to the caller, So we could only return + * number of bytes in non-AIO case. + */ + if (io->ci_nob > 0) { + if (!is_aio) { + if (rc2 == 0) { + result += io->ci_nob; + *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */ + } else if (rc2) { + result = 0; + } + } + count -= io->ci_nob; + + /* prepare IO restart */ + if (count > 0) + args->u.normal.via_iter = vio->vui_iter; + + if (partial_io) { + /** + * Reexpand iov count because it was zero + * after IO finish. + */ + iov_iter_reexpand(vio->vui_iter, count); + if (per_bytes == io->ci_nob) + io->ci_need_restart = 1; + } + } +out: + cl_io_fini(env, io); + + CDEBUG(D_VFSTRACE, + "%s: %d io complete with rc: %d, result: %zd, restart: %d\n", + file->f_path.dentry->d_name.name, + iot, rc, result, io->ci_need_restart); + + if ((rc == 0 || rc == -ENODATA || rc == -ENOLCK) && + count > 0 && io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n", + file_dentry(file)->d_name.name, + iot == CIT_READ ? "read" : "write", + *ppos, count, result, rc); + /* preserve the tried count for FLR */ + retried = io->ci_ndelay_tried; + dio_lock = io->ci_dio_lock; + goto restart; + } + + if (io->ci_dio_aio) { + /* + * VFS will call aio_complete() if no -EIOCBQUEUED + * is returned for AIO, so we can not call aio_complete() + * in our end_io(). 
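+ *
+ * In table form (illustration of the assignment below):
+ *
+ *	rc == -EIOCBQUEUED: the iocb is still ours, our end_io completes it
+ *	rc != -EIOCBQUEUED: the VFS completes the iocb, so cda_no_aio_complete
+ *	                    stops end_io from completing it a second time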
+ * + * NB: This is safe because the atomic_dec_and_lock in + * cl_sync_io_init has implicit memory barriers, so this will + * be seen by whichever thread completes the DIO/AIO, even if + * it's not this one + */ + if (rc != -EIOCBQUEUED) + io->ci_dio_aio->cda_no_aio_complete = 1; + /** + * Drop one extra reference so that end_io() could be + * called for this IO context, we could call it after + * we make sure all AIO requests have been proceed. + */ + cl_sync_io_note(env, &io->ci_dio_aio->cda_sync, + rc == -EIOCBQUEUED ? 0 : rc); + if (!is_aio) { + LASSERT(io->ci_dio_aio->cda_creator_free); + cl_dio_aio_free(env, io->ci_dio_aio); + io->ci_dio_aio = NULL; + } + } + + if (iot == CIT_READ) { + if (result > 0) + ll_stats_ops_tally(ll_i2sbi(inode), + LPROC_LL_READ_BYTES, result); + } else if (iot == CIT_WRITE) { + if (result > 0) { + ll_stats_ops_tally(ll_i2sbi(inode), + LPROC_LL_WRITE_BYTES, result); + fd->fd_write_failed = false; + } else if (result == 0 && rc == 0) { + rc = io->ci_result; + if (rc < 0) + fd->fd_write_failed = true; + else + fd->fd_write_failed = false; + } else if (rc != -ERESTARTSYS) { + fd->fd_write_failed = true; + } + } + + CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result); + if (result > 0) + ll_heat_add(inode, iot, result); + + RETURN(result > 0 ? result : rc); +} + +/** + * The purpose of fast read is to overcome per I/O overhead and improve IOPS + * especially for small I/O. + * + * To serve a read request, CLIO has to create and initialize a cl_io and + * then request DLM lock. This has turned out to have siginificant overhead + * and affects the performance of small I/O dramatically. + * + * It's not necessary to create a cl_io for each I/O. Under the help of read + * ahead, most of the pages being read are already in memory cache and we can + * read those pages directly because if the pages exist, the corresponding DLM + * lock must exist so that page content must be valid. + * + * In fast read implementation, the llite speculatively finds and reads pages + * in memory cache. There are three scenarios for fast read: + * - If the page exists and is uptodate, kernel VM will provide the data and + * CLIO won't be intervened; + * - If the page was brought into memory by read ahead, it will be exported + * and read ahead parameters will be updated; + * - Otherwise the page is not in memory, we can't do fast read. Therefore, + * it will go back and invoke normal read, i.e., a cl_io will be created + * and DLM lock will be requested. + * + * POSIX compliance: posix standard states that read is intended to be atomic. + * Lustre read implementation is in line with Linux kernel read implementation + * and neither of them complies with POSIX standard in this matter. Fast read + * doesn't make the situation worse on single node but it may interleave write + * results from multiple nodes due to short read handling in ll_file_aio_read(). + * + * \param env - lu_env + * \param iocb - kiocb from kernel + * \param iter - user space buffers where the data will be copied + * + * \retval - number of bytes have been read, or error code if error occurred. + */ +static ssize_t +ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t result; + + if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp)))) + return 0; + + /* NB: we can't do direct IO for fast read because it will need a lock + * to make IO engine happy. 
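+ *
+ * (As with every bail-out below, returning 0 simply means "no fast
+ * read"; ll_file_read_iter() then falls through to the regular cl_io
+ * read path, so correctness never depends on this shortcut.)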
*/ + if (iocb->ki_filp->f_flags & O_DIRECT) + return 0; + + result = generic_file_read_iter(iocb, iter); + + /* If the first page is not in cache, generic_file_aio_read() will be + * returned with -ENODATA. Fall back to full read path. + * See corresponding code in ll_readpage(). + * + * if we raced with page deletion, we might get EIO. Rather than add + * locking to the fast path for this rare case, fall back to the full + * read path. (See vvp_io_read_start() for rest of handling. + */ + if (result == -ENODATA || result == -EIO) + result = 0; + + if (result > 0) { + ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result); + ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)), + LPROC_LL_READ_BYTES, result); + } + + return result; +} + +/** + * Confine read iter lest read beyond the EOF + * + * \param iocb [in] kernel iocb + * \param to [in] reader iov_iter + * + * \retval <0 failure + * \retval 0 success + * \retval >0 @iocb->ki_pos has passed the EOF + */ +static int file_read_confine_iter(struct lu_env *env, struct kiocb *iocb, + struct iov_iter *to) +{ + struct cl_attr *attr = vvp_env_thread_attr(env); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + loff_t read_end = iocb->ki_pos + iov_iter_count(to); + loff_t kms; + loff_t size; + int rc; + + cl_object_attr_lock(lli->lli_clob); + rc = cl_object_attr_get(env, lli->lli_clob, attr); + cl_object_attr_unlock(lli->lli_clob); + if (rc != 0) + return rc; + + kms = attr->cat_kms; + /* if read beyond end-of-file, adjust read count */ + if (kms > 0 && (iocb->ki_pos >= kms || read_end > kms)) { + rc = ll_glimpse_size(inode); + if (rc != 0) + return rc; + + size = i_size_read(inode); + if (iocb->ki_pos >= size || read_end > size) { + CDEBUG(D_VFSTRACE, + "%s: read [%llu, %llu] over eof, kms %llu, file_size %llu.\n", + file_dentry(file)->d_name.name, + iocb->ki_pos, read_end, kms, size); + + if (iocb->ki_pos >= size) + return 1; + + if (read_end > size) + iov_iter_truncate(to, size - iocb->ki_pos); + } + } + + return rc; +} + +/* + * Read from a file (through the page cache). + */ +static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct lu_env *env; + struct vvp_io_args *args; + struct file *file = iocb->ki_filp; + ssize_t result; + ssize_t rc2; + __u16 refcheck; + ktime_t kstart = ktime_get(); + bool cached; + bool stale_data = false; + + ENTRY; + + CDEBUG(D_VFSTRACE|D_IOTRACE, "file %s:"DFID", ppos: %lld, count: %zu\n", + file_dentry(file)->d_name.name, + PFID(ll_inode2fid(file_inode(file))), iocb->ki_pos, + iov_iter_count(to)); + + if (!iov_iter_count(to)) + RETURN(0); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + result = file_read_confine_iter(env, iocb, to); + if (result < 0) + GOTO(out, result); + else if (result > 0) + stale_data = true; + + /** + * Currently when PCC read failed, we do not fall back to the + * normal read path, just return the error. + * The resaon is that: for RW-PCC, the file data may be modified + * in the PCC and inconsistent with the data on OSTs (or file + * data has been removed from the Lustre file system), at this + * time, fallback to the normal read path may read the wrong + * data. + * TODO: for RO-PCC (readonly PCC), fall back to normal read + * path: read data from data copy on OSTs. 
+ */ + result = pcc_file_read_iter(iocb, to, &cached); + if (cached) + GOTO(out, result); + + ll_ras_enter(file, iocb->ki_pos, iov_iter_count(to)); + + result = ll_do_fast_read(iocb, to); + if (result < 0 || iov_iter_count(to) == 0) + GOTO(out, result); + + args = ll_env_args(env); + args->u.normal.via_iter = to; + args->u.normal.via_iocb = iocb; + + rc2 = ll_file_io_generic(env, args, file, CIT_READ, + &iocb->ki_pos, iov_iter_count(to)); + if (rc2 > 0) + result += rc2; + else if (result == 0) + result = rc2; + +out: + cl_env_put(env, &refcheck); + + if (stale_data && result > 0) { + /** + * we've reached EOF before the read, the data read are cached + * stale data. + */ + iov_iter_truncate(to, 0); + result = 0; + } + + if (result > 0) { + ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid, + file->private_data, iocb->ki_pos, result, + READ); + ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_READ, + ktime_us_delta(ktime_get(), kstart)); + } + + RETURN(result); +} + +/** + * Similar trick to ll_do_fast_read, this improves write speed for tiny writes. + * If a page is already in the page cache and dirty (and some other things - + * See ll_tiny_write_begin for the instantiation of these rules), then we can + * write to it without doing a full I/O, because Lustre already knows about it + * and will write it out. This saves a lot of processing time. + * + * All writes here are within one page, so exclusion is handled by the page + * lock on the vm page. We do not do tiny writes for writes which touch + * multiple pages because it's very unlikely multiple sequential pages are + * are already dirty. + * + * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common + * and are unlikely to be to already dirty pages. + * + * Attribute updates are important here, we do them in ll_tiny_write_end. + */ +static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t count = iov_iter_count(iter); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + bool lock_inode = !IS_NOSEC(inode); + ssize_t result = 0; + + ENTRY; + + /* Restrict writes to single page and < PAGE_SIZE. See comment at top + * of function for why. + */ + if (count >= PAGE_SIZE || + (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE) + RETURN(0); + + if (unlikely(lock_inode)) + inode_lock(inode); + result = __generic_file_write_iter(iocb, iter); + + if (unlikely(lock_inode)) + inode_unlock(inode); + + /* If the page is not already dirty, ll_tiny_write_begin returns + * -ENODATA. We continue on to normal write. + */ + if (result == -ENODATA) + result = 0; + + if (result > 0) { + ll_heat_add(inode, CIT_WRITE, result); + set_bit(LLIF_DATA_MODIFIED, &ll_i2info(inode)->lli_flags); + } + + CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count); + + RETURN(result); +} + +/* + * Write to a file (through the page cache). 
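+ *
+ * How the tiny-write fast path and the normal path combine for a request
+ * of N bytes (illustration of the rc_tiny/rc_normal handling below):
+ *
+ *	tiny wrote all N bytes              -> return N, normal path skipped
+ *	tiny wrote K > 0, normal wrote M    -> return K + M
+ *	tiny wrote K > 0, normal path fails -> return K
+ *	tiny wrote nothing (0 or an error)  -> return the normal path result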
+ */ +static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct vvp_io_args *args; + struct lu_env *env; + ssize_t rc_tiny = 0, rc_normal; + struct file *file = iocb->ki_filp; + __u16 refcheck; + bool cached; + ktime_t kstart = ktime_get(); + int result; + + ENTRY; + + CDEBUG(D_VFSTRACE|D_IOTRACE, "file %s:"DFID", ppos: %lld, count: %zu\n", + file_dentry(file)->d_name.name, + PFID(ll_inode2fid(file_inode(file))), iocb->ki_pos, + iov_iter_count(from)); + + if (!iov_iter_count(from)) + GOTO(out, rc_normal = 0); + + /** + * When PCC write failed, we usually do not fall back to the normal + * write path, just return the error. But there is a special case when + * returned error code is -ENOSPC due to running out of space on PCC HSM + * bakcend. At this time, it will fall back to normal I/O path and + * retry the I/O. As the file is in HSM released state, it will restore + * the file data to OSTs first and redo the write again. And the + * restore process will revoke the layout lock and detach the file + * from PCC cache automatically. + */ + result = pcc_file_write_iter(iocb, from, &cached); + if (cached && result != -ENOSPC && result != -EDQUOT) + GOTO(out, rc_normal = result); + + /* NB: we can't do direct IO for tiny writes because they use the page + * cache, we can't do sync writes because tiny writes can't flush + * pages, and we can't do append writes because we can't guarantee the + * required DLM locks are held to protect file size. + */ + if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) && + !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND))) + rc_tiny = ll_do_tiny_write(iocb, from); + + /* In case of error, go on and try normal write - Only stop if tiny + * write completed I/O. + */ + if (iov_iter_count(from) == 0) + GOTO(out, rc_normal = rc_tiny); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + args = ll_env_args(env); + args->u.normal.via_iter = from; + args->u.normal.via_iocb = iocb; + + rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE, + &iocb->ki_pos, iov_iter_count(from)); + + /* On success, combine bytes written. */ + if (rc_tiny >= 0 && rc_normal > 0) + rc_normal += rc_tiny; + /* On error, only return error from normal write if tiny write did not + * write any bytes. Otherwise return bytes written by tiny write. + */ + else if (rc_tiny > 0) + rc_normal = rc_tiny; + + cl_env_put(env, &refcheck); +out: + if (rc_normal > 0) { + ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid, + file->private_data, iocb->ki_pos, + rc_normal, WRITE); + ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_WRITE, + ktime_us_delta(ktime_get(), kstart)); + } + + RETURN(rc_normal); +} + +#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +/* + * XXX: exact copy from kernel code (__generic_file_aio_write_nolock) + */ +static int ll_file_get_iov_count(const struct iovec *iov, + unsigned long *nr_segs, size_t *count, + int access_flags) +{ + size_t cnt = 0; + unsigned long seg; + + for (seg = 0; seg < *nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. 
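+ *
+ * A single compare below covers both cases: OR-ing the running total
+ * with the current length and casting to ssize_t goes negative as soon
+ * as either value has its top bit set, e.g. (hypothetical, 64-bit) an
+ * iov_len above SSIZE_MAX or a cnt that has grown past SSIZE_MAX.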
+ */ + cnt += iv->iov_len; + if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(access_flags, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + *nr_segs = seg; + cnt -= iv->iov_len; /* This segment is no good */ + break; + } + *count = cnt; + return 0; +} + +static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct iov_iter to; + size_t iov_count; + ssize_t result; + ENTRY; + + result = ll_file_get_iov_count(iov, &nr_segs, &iov_count, VERIFY_READ); + if (result) + RETURN(result); + + if (!iov_count) + RETURN(0); + +# ifdef HAVE_IOV_ITER_INIT_DIRECTION + iov_iter_init(&to, READ, iov, nr_segs, iov_count); +# else /* !HAVE_IOV_ITER_INIT_DIRECTION */ + iov_iter_init(&to, iov, nr_segs, iov_count, 0); +# endif /* HAVE_IOV_ITER_INIT_DIRECTION */ + + result = ll_file_read_iter(iocb, &to); + + RETURN(result); +} + +static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct iovec iov = { .iov_base = buf, .iov_len = count }; + struct kiocb kiocb; + ssize_t result; + + ENTRY; + + if (!count) + RETURN(0); + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; +#ifdef HAVE_KIOCB_KI_LEFT + kiocb.ki_left = count; +#elif defined(HAVE_KI_NBYTES) + kiocb.i_nbytes = count; +#endif + + result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos); + *ppos = kiocb.ki_pos; + + RETURN(result); +} + +/* + * Write to a file (through the page cache). + * AIO stuff + */ +static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct iov_iter from; + size_t iov_count; + ssize_t result; + ENTRY; + + result = ll_file_get_iov_count(iov, &nr_segs, &iov_count, VERIFY_WRITE); + if (result) + RETURN(result); + + if (!iov_count) + RETURN(0); + +# ifdef HAVE_IOV_ITER_INIT_DIRECTION + iov_iter_init(&from, WRITE, iov, nr_segs, iov_count); +# else /* !HAVE_IOV_ITER_INIT_DIRECTION */ + iov_iter_init(&from, iov, nr_segs, iov_count, 0); +# endif /* HAVE_IOV_ITER_INIT_DIRECTION */ + + result = ll_file_write_iter(iocb, &from); + + RETURN(result); +} + +static ssize_t ll_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct iovec iov = { .iov_base = (void __user *)buf, + .iov_len = count }; + struct kiocb kiocb; + ssize_t result; + + ENTRY; + + if (!count) + RETURN(0); + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; +#ifdef HAVE_KIOCB_KI_LEFT + kiocb.ki_left = count; +#elif defined(HAVE_KI_NBYTES) + kiocb.ki_nbytes = count; +#endif + + result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos); + *ppos = kiocb.ki_pos; + + RETURN(result); +} +#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + +int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, + __u64 flags, struct lov_user_md *lum, int lum_size) +{ + struct lookup_intent oit = { + .it_op = IT_OPEN, + .it_flags = flags | MDS_OPEN_BY_FID, + }; + int rc; + ENTRY; + + if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) == + le32_to_cpu(LOV_MAGIC_MAGIC)) { + /* this code will only exist for big-endian systems */ + lustre_swab_lov_user_md(lum, 0); + } + + ll_inode_size_lock(inode); + rc = ll_intent_file_open(dentry, lum, lum_size, &oit); + if (rc < 0) + GOTO(out_unlock, rc); + + ll_release_openhandle(dentry, &oit); + +out_unlock: + ll_inode_size_unlock(inode); + ll_intent_release(&oit); + + RETURN(rc); +} + +int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, 
+ struct lov_mds_md **lmmp, int *lmm_size, + struct ptlrpc_request **request) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdt_body *body; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data; + int rc, lmmsize; + + ENTRY; + + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, + strlen(filename), lmmsize, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n", + filename, rc); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); /* checked by mdc_getattr_name */ + + lmmsize = body->mbo_eadatasize; + + if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + lmmsize == 0) + GOTO(out, rc = -ENODATA); + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize); + LASSERT(lmm != NULL); + + if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) && + lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) && + lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) && + lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN)) + GOTO(out, rc = -EPROTO); + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. + */ + if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) { + int stripe_count = 0; + + if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) || + lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { + stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + if (le32_to_cpu(lmm->lmm_pattern) & + LOV_PATTERN_F_RELEASED) + stripe_count = 0; + lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0); + + /* if function called for directory - we should + * avoid swab not existent lsm objects + */ + if (lmm->lmm_magic == LOV_MAGIC_V1 && + S_ISREG(body->mbo_mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + stripe_count); + else if (lmm->lmm_magic == LOV_MAGIC_V3 && + S_ISREG(body->mbo_mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + stripe_count); + } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_COMP_V1)) { + lustre_swab_lov_comp_md_v1( + (struct lov_comp_md_v1 *)lmm); + } + } + + if (lmm->lmm_magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *comp_v1 = NULL; + struct lov_comp_md_entry_v1 *ent; + struct lov_user_md_v1 *v1; + __u32 off; + int i = 0; + + comp_v1 = (struct lov_comp_md_v1 *)lmm; + /* Dump the striping information */ + for (; i < comp_v1->lcm_entry_count; i++) { + ent = &comp_v1->lcm_entries[i]; + off = ent->lcme_offset; + v1 = (struct lov_user_md_v1 *)((char *)lmm + off); + CDEBUG(D_INFO, + "comp[%d]: stripe_count=%u, stripe_size=%u\n", + i, v1->lmm_stripe_count, v1->lmm_stripe_size); + } + + /** + * Return valid stripe_count and stripe_size instead of 0 for + * DoM files to avoid divide-by-zero for older userspace that + * calls this ioctl, e.g. lustre ADIO driver. + */ + if (lmm->lmm_stripe_count == 0) + lmm->lmm_stripe_count = 1; + if (lmm->lmm_stripe_size == 0) { + /* Since the first component of the file data is placed + * on the MDT for faster access, the stripe_size of the + * second one is always that applications which are + * doing large IOs. 
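+ *
+ * In other words: for a DoM layout the first component lives on the
+ * MDT, so the stripe_size reported here is taken from the following
+ * (OST) component, which is the one that large streaming I/O actually
+ * uses.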
+ */ + if (lmm->lmm_pattern == LOV_PATTERN_MDT) + i = comp_v1->lcm_entry_count > 1 ? 1 : 0; + else + i = comp_v1->lcm_entry_count > 1 ? + comp_v1->lcm_entry_count - 1 : 0; + ent = &comp_v1->lcm_entries[i]; + off = ent->lcme_offset; + v1 = (struct lov_user_md_v1 *)((char *)lmm + off); + lmm->lmm_stripe_size = v1->lmm_stripe_size; + } + } +out: + *lmmp = lmm; + *lmm_size = lmmsize; + *request = req; + RETURN(rc); +} + +static int ll_lov_setea(struct inode *inode, struct file *file, + void __user *arg) +{ + __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE; + struct lov_user_md *lump; + int lum_size = sizeof(struct lov_user_md) + + sizeof(struct lov_user_ost_data); + int rc; + ENTRY; + + if (!capable(CAP_SYS_ADMIN)) + RETURN(-EPERM); + + OBD_ALLOC_LARGE(lump, lum_size); + if (lump == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(lump, arg, lum_size)) + GOTO(out_lump, rc = -EFAULT); + + rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump, + lum_size); + cl_lov_delay_create_clear(&file->f_flags); + +out_lump: + OBD_FREE_LARGE(lump, lum_size); + RETURN(rc); +} + +static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size) +{ + struct lu_env *env; + __u16 refcheck; + int rc; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size); + cl_env_put(env, &refcheck); + RETURN(rc); +} + +static int ll_lov_setstripe(struct inode *inode, struct file *file, + void __user *arg) +{ + struct lov_user_md __user *lum = (struct lov_user_md __user *)arg; + struct lov_user_md *klum; + int lum_size, rc; + __u64 flags = FMODE_WRITE; + ENTRY; + + rc = ll_copy_user_md(lum, &klum); + if (rc < 0) + RETURN(rc); + + lum_size = rc; + rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum, + lum_size); + if (!rc) { + __u32 gen; + + rc = put_user(0, &lum->lmm_stripe_count); + if (rc) + GOTO(out, rc); + + rc = ll_layout_refresh(inode, &gen); + if (rc) + GOTO(out, rc); + + rc = ll_file_getstripe(inode, arg, lum_size); + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode) && + ll_i2info(inode)->lli_clob) { + struct iattr attr = { 0 }; + + rc = cl_setattr_ost(ll_i2info(inode)->lli_clob, &attr, + OP_XVALID_FLAGS, LUSTRE_ENCRYPT_FL); + } + } + cl_lov_delay_create_clear(&file->f_flags); + +out: + OBD_FREE_LARGE(klum, lum_size); + RETURN(rc); +} + + +static int +ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct ll_file_data *fd = file->private_data; + struct ll_grouplock grouplock; + int rc; + ENTRY; + + if (arg == 0) { + CWARN("group id for group lock must not be 0\n"); + RETURN(-EINVAL); + } + + if (ll_file_nolock(file)) + RETURN(-EOPNOTSUPP); +retry: + if (file->f_flags & O_NONBLOCK) { + if (!mutex_trylock(&lli->lli_group_mutex)) + RETURN(-EAGAIN); + } else + mutex_lock(&lli->lli_group_mutex); + + if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { + CWARN("group lock already existed with gid %lu\n", + fd->fd_grouplock.lg_gid); + GOTO(out, rc = -EINVAL); + } + if (arg != lli->lli_group_gid && lli->lli_group_users != 0) { + if (file->f_flags & O_NONBLOCK) + GOTO(out, rc = -EAGAIN); + mutex_unlock(&lli->lli_group_mutex); + wait_var_event(&lli->lli_group_users, !lli->lli_group_users); + GOTO(retry, rc = 0); + } + LASSERT(fd->fd_grouplock.lg_lock == NULL); + + /** + * XXX: group lock needs to protect all OST objects while PFL + * can add new OST objects during the IO, so 
we'd instantiate + * all OST objects before getting its group lock. + */ + if (obj) { + struct lu_env *env; + __u16 refcheck; + struct cl_layout cl = { + .cl_is_composite = false, + }; + struct lu_extent ext = { + .e_start = 0, + .e_end = OBD_OBJECT_EOF, + }; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, rc = PTR_ERR(env)); + + rc = cl_object_layout_get(env, obj, &cl); + if (rc >= 0 && cl.cl_is_composite) + rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, + &ext); + + cl_env_put(env, &refcheck); + if (rc < 0) + GOTO(out, rc); + } + + rc = cl_get_grouplock(ll_i2info(inode)->lli_clob, + arg, (file->f_flags & O_NONBLOCK), &grouplock); + + if (rc) + GOTO(out, rc); + + fd->fd_flags |= LL_FILE_GROUP_LOCKED; + fd->fd_grouplock = grouplock; + if (lli->lli_group_users == 0) + lli->lli_group_gid = grouplock.lg_gid; + lli->lli_group_users++; + + CDEBUG(D_INFO, "group lock %lu obtained\n", arg); +out: + mutex_unlock(&lli->lli_group_mutex); + + RETURN(rc); +} + +static int ll_put_grouplock(struct inode *inode, struct file *file, + unsigned long arg) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = file->private_data; + struct ll_grouplock grouplock; + int rc; + ENTRY; + + mutex_lock(&lli->lli_group_mutex); + if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + CWARN("no group lock held\n"); + GOTO(out, rc = -EINVAL); + } + + LASSERT(fd->fd_grouplock.lg_lock != NULL); + + if (fd->fd_grouplock.lg_gid != arg) { + CWARN("group lock %lu doesn't match current id %lu\n", + arg, fd->fd_grouplock.lg_gid); + GOTO(out, rc = -EINVAL); + } + + grouplock = fd->fd_grouplock; + memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock)); + fd->fd_flags &= ~LL_FILE_GROUP_LOCKED; + + cl_put_grouplock(&grouplock); + + lli->lli_group_users--; + if (lli->lli_group_users == 0) { + lli->lli_group_gid = 0; + wake_up_var(&lli->lli_group_users); + } + CDEBUG(D_INFO, "group lock %lu released\n", arg); + GOTO(out, rc = 0); +out: + mutex_unlock(&lli->lli_group_mutex); + + RETURN(rc); +} + +/** + * Close inode open handle + * + * \param dentry [in] dentry which contains the inode + * \param it [in,out] intent which contains open info and result + * + * \retval 0 success + * \retval <0 failure + */ +int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it) +{ + struct inode *inode = dentry->d_inode; + struct obd_client_handle *och; + int rc; + ENTRY; + + LASSERT(inode); + + /* Root ? Do nothing. */ + if (is_root_inode(inode)) + RETURN(0); + + /* No open handle to close? Move away */ + if (!it_disposition(it, DISP_OPEN_OPEN)) + RETURN(0); + + LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0); + + OBD_ALLOC(och, sizeof(*och)); + if (!och) + GOTO(out, rc = -ENOMEM); + + rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + if (rc) + GOTO(out, rc); + + rc = ll_close_inode_openhandle(inode, och, 0, NULL); +out: + /* this one is in place of ll_file_open */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) { + ptlrpc_req_finished(it->it_request); + it_clear_disposition(it, DISP_ENQ_OPEN_REF); + } + RETURN(rc); +} + +/** + * Get size for inode for which FIEMAP mapping is requested. + * Make the FIEMAP get_info call and returns the result. 
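+ * \param inode inode for which the FIEMAP mapping is requested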
+ * \param fiemap kernel buffer to hold extens + * \param num_bytes kernel buffer size + */ +static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap, + size_t num_bytes) +{ + struct lu_env *env; + __u16 refcheck; + int rc = 0; + struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, }; + ENTRY; + + /* Checks for fiemap flags */ + if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) { + fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT; + return -EBADR; + } + + /* Check for FIEMAP_FLAG_SYNC */ + if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) { + rc = filemap_fdatawrite(inode->i_mapping); + if (rc) + return rc; + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + if (i_size_read(inode) == 0) { + rc = ll_glimpse_size(inode); + if (rc) + GOTO(out, rc); + } + + fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE); + obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid); + + /* If filesize is 0, then there would be no objects for mapping */ + if (fmkey.lfik_oa.o_size == 0) { + fiemap->fm_mapped_extents = 0; + GOTO(out, rc = 0); + } + + fmkey.lfik_fiemap = *fiemap; + + rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob, + &fmkey, fiemap, &num_bytes); +out: + cl_env_put(env, &refcheck); + RETURN(rc); +} + +int ll_fid2path(struct inode *inode, void __user *arg) +{ + struct obd_export *exp = ll_i2mdexp(inode); + const struct getinfo_fid2path __user *gfin = arg; + __u32 pathlen; + struct getinfo_fid2path *gfout; + size_t outsize; + int rc; + + ENTRY; + + if (!capable(CAP_DAC_READ_SEARCH) && + !test_bit(LL_SBI_USER_FID2PATH, ll_i2sbi(inode)->ll_flags)) + RETURN(-EPERM); + + /* Only need to get the buflen */ + if (get_user(pathlen, &gfin->gf_pathlen)) + RETURN(-EFAULT); + + if (pathlen > PATH_MAX) + RETURN(-EINVAL); + + outsize = sizeof(*gfout) + pathlen; + OBD_ALLOC(gfout, outsize); + if (gfout == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(gfout, arg, sizeof(*gfout))) + GOTO(gf_free, rc = -EFAULT); + /* append root FID after gfout to let MDT know the root FID so that it + * can lookup the correct path, this is mainly for fileset. + * old server without fileset mount support will ignore this. */ + *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode); + + /* Call mdc_iocontrol */ + rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL); + if (rc != 0) + GOTO(gf_free, rc); + + if (copy_to_user(arg, gfout, outsize)) + rc = -EFAULT; + +gf_free: + OBD_FREE(gfout, outsize); + RETURN(rc); +} + +static int +ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc) +{ + struct cl_object *obj = ll_i2info(inode)->lli_clob; + struct lu_env *env; + struct cl_io *io; + __u16 refcheck; + int result; + + ENTRY; + + ioc->idv_version = 0; + ioc->idv_layout_version = UINT_MAX; + + /* If no file object initialized, we consider its version is 0. 
*/ + if (obj == NULL) + RETURN(0); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = obj; + io->u.ci_data_version.dv_data_version = 0; + io->u.ci_data_version.dv_layout_version = UINT_MAX; + io->u.ci_data_version.dv_flags = ioc->idv_flags; + +restart: + if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0) + result = cl_io_loop(env, io); + else + result = io->ci_result; + + ioc->idv_version = io->u.ci_data_version.dv_data_version; + ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version; + + cl_io_fini(env, io); + + if (unlikely(io->ci_need_restart)) + goto restart; + + cl_env_put(env, &refcheck); + + RETURN(result); +} + +/* + * Read the data_version for inode. + * + * This value is computed using stripe object version on OST. + * Version is computed using server side locking. + * + * @param flags if do sync on the OST side; + * 0: no sync + * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs + * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs + */ +int ll_data_version(struct inode *inode, __u64 *data_version, int flags) +{ + struct ioc_data_version ioc = { .idv_flags = flags }; + int rc; + + rc = ll_ioc_data_version(inode, &ioc); + if (!rc) + *data_version = ioc.idv_version; + + return rc; +} + +/* + * Trigger a HSM release request for the provided inode. + */ +int ll_hsm_release(struct inode *inode) +{ + struct lu_env *env; + struct obd_client_handle *och = NULL; + __u64 data_version = 0; + __u16 refcheck; + int rc; + + ENTRY; + + CDEBUG(D_INODE, "%s: Releasing file "DFID".\n", + ll_i2sbi(inode)->ll_fsname, + PFID(&ll_i2info(inode)->lli_fid)); + + /* + * For directory, this is not the right + * way to do the release. Ideally this should clean + * up the directory without triggering update to the backend. + * Right now, this just sets the RELEASED bit for the + * directory. This is left as is so as to have a way to set + * the RELEASED bit as a deug/recovery method + * instead of doing a rm on the directory. + * TODO-MDLL: Tracking SIM - Simba-21969 + */ + if (S_ISDIR(inode->i_mode)) + och = ll_lease_open(inode, NULL, FMODE_READ, MDS_OPEN_RELEASE); + else + och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE); + if (IS_ERR(och)) + GOTO(out, rc = PTR_ERR(och)); + + /* Grab latest data_version and [am]time values */ + rc = ll_data_version(inode, &data_version, + LL_DV_WR_FLUSH | LL_DV_SZ_UPDATE); + if (rc != 0) + GOTO(out, rc); + + /* Don't need to merge these attrs for directories */ + if (!S_ISDIR(inode->i_mode)) { + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, rc = PTR_ERR(env)); + + rc = ll_merge_attr(env, inode); + cl_env_put(env, &refcheck); + + /* If error happen, we have the wrong size for a file. + * Don't release it. + */ + if (rc != 0) + GOTO(out, rc); + } + + /* Release the file. + * NB: lease lock handle is released in mdc_hsm_release_pack() because + * we still need it to pack l_remote_handle to MDT. 
*/ + rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE, + &data_version); + och = NULL; + + EXIT; +out: + if (och != NULL && !IS_ERR(och)) /* close the file */ + ll_lease_close(och, inode, NULL); + + return rc; +} + +struct ll_swap_stack { + __u64 dv1; + __u64 dv2; + struct inode *inode1; + struct inode *inode2; + bool check_dv1; + bool check_dv2; +}; + +static int ll_swap_layouts(struct file *file1, struct file *file2, + struct lustre_swap_layouts *lsl) +{ + struct mdc_swap_layouts msl; + struct md_op_data *op_data; + __u32 gid; + __u64 dv; + struct ll_swap_stack *llss = NULL; + int rc; + + OBD_ALLOC_PTR(llss); + if (llss == NULL) + RETURN(-ENOMEM); + + llss->inode1 = file_inode(file1); + llss->inode2 = file_inode(file2); + + rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2); + if (rc < 0) + GOTO(free, rc); + + /* we use 2 bool because it is easier to swap than 2 bits */ + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1) + llss->check_dv1 = true; + + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2) + llss->check_dv2 = true; + + /* we cannot use lsl->sl_dvX directly because we may swap them */ + llss->dv1 = lsl->sl_dv1; + llss->dv2 = lsl->sl_dv2; + + rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2)); + if (rc == 0) /* same file, done! */ + GOTO(free, rc); + + if (rc < 0) { /* sequentialize it */ + swap(llss->inode1, llss->inode2); + swap(file1, file2); + swap(llss->dv1, llss->dv2); + swap(llss->check_dv1, llss->check_dv2); + } + + gid = lsl->sl_gid; + if (gid != 0) { /* application asks to flush dirty cache */ + rc = ll_get_grouplock(llss->inode1, file1, gid); + if (rc < 0) + GOTO(free, rc); + + rc = ll_get_grouplock(llss->inode2, file2, gid); + if (rc < 0) { + ll_put_grouplock(llss->inode1, file1, gid); + GOTO(free, rc); + } + } + + /* ultimate check, before swaping the layouts we check if + * dataversion has changed (if requested) */ + if (llss->check_dv1) { + rc = ll_data_version(llss->inode1, &dv, 0); + if (rc) + GOTO(putgl, rc); + if (dv != llss->dv1) + GOTO(putgl, rc = -EAGAIN); + } + + if (llss->check_dv2) { + rc = ll_data_version(llss->inode2, &dv, 0); + if (rc) + GOTO(putgl, rc); + if (dv != llss->dv2) + GOTO(putgl, rc = -EAGAIN); + } + + /* struct md_op_data is used to send the swap args to the mdt + * only flags is missing, so we use struct mdc_swap_layouts + * through the md_op_data->op_data */ + /* flags from user space have to be converted before they are send to + * server, no flag is sent today, they are only used on the client */ + msl.msl_flags = 0; + rc = -ENOMEM; + op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0, + 0, LUSTRE_OPC_ANY, &msl); + if (IS_ERR(op_data)) + GOTO(free, rc = PTR_ERR(op_data)); + + rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1), + sizeof(*op_data), op_data, NULL); + ll_finish_md_op_data(op_data); + + if (rc < 0) + GOTO(putgl, rc); + +putgl: + if (gid != 0) { + ll_put_grouplock(llss->inode2, file2, gid); + ll_put_grouplock(llss->inode1, file1, gid); + } + +free: + if (llss != NULL) + OBD_FREE_PTR(llss); + + RETURN(rc); +} + +int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss) +{ + struct obd_export *exp = ll_i2mdexp(inode); + struct md_op_data *op_data; + int rc; + ENTRY; + + /* Detect out-of range masks */ + if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK) + RETURN(-EINVAL); + + /* Non-root users are forbidden to set or clear flags which are + * NOT defined in HSM_USER_MASK. 
*/ + if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) && + !capable(CAP_SYS_ADMIN)) + RETURN(-EPERM); + + if (!exp_connect_archive_id_array(exp)) { + /* Detect out-of range archive id */ + if ((hss->hss_valid & HSS_ARCHIVE_ID) && + (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE)) + RETURN(-EINVAL); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hss); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data), + op_data, NULL); + + ll_finish_md_op_data(op_data); + + RETURN(rc); +} + +int ll_hsm_import(struct inode *inode, struct file *file, + struct hsm_user_import *hui) +{ + struct hsm_state_set *hss = NULL; + struct iattr *attr = NULL; + int rc; + ENTRY; + + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + RETURN(-EINVAL); + + /* set HSM flags */ + OBD_ALLOC_PTR(hss); + if (hss == NULL) + GOTO(out, rc = -ENOMEM); + + hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID; + hss->hss_archive_id = hui->hui_archive_id; + hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED; + rc = ll_hsm_state_set(inode, hss); + if (rc != 0) + GOTO(out, rc); + + OBD_ALLOC_PTR(attr); + if (attr == NULL) + GOTO(out, rc = -ENOMEM); + + attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO); + + if (S_ISDIR(inode->i_mode)) + attr->ia_mode |= S_IFDIR; + else + attr->ia_mode |= S_IFREG; + + attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid); + attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid); + attr->ia_size = hui->hui_size; + attr->ia_mtime.tv_sec = hui->hui_mtime; + attr->ia_mtime.tv_nsec = hui->hui_mtime_ns; + attr->ia_atime.tv_sec = hui->hui_atime; + attr->ia_atime.tv_nsec = hui->hui_atime_ns; + + attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE | + ATTR_UID | ATTR_GID | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_ATIME | ATTR_ATIME_SET; + + /* + * TODO-MDLL check if this needs to be done here + * or in ll_setattr_raw(). The ll_setattr_raw does a + * unlock() before it calls the ll_md_setattr() for + * regular files using S_ISREG(). Calling this for + * inodes other than files might result in a deadlock. + * Tracked with Simba-20393. + */ + if (S_ISREG(inode->i_mode)) + inode_lock(inode); + + rc = ll_setattr_raw(file_dentry(file), attr, 0, true); + if (rc == -ENODATA) + rc = 0; + + if (S_ISREG(inode->i_mode)) + inode_unlock(inode); + +out: + if (hss != NULL) + OBD_FREE_PTR(hss); + + if (attr != NULL) + OBD_FREE_PTR(attr); + + RETURN(rc); +} + +static inline long ll_lease_type_from_fmode(fmode_t fmode) +{ + return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) | + ((fmode & FMODE_WRITE) ? 
LL_LEASE_WRLCK : 0); +} + +static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu) +{ + struct inode *inode = file_inode(file); + struct iattr ia = { + .ia_valid = ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME, + .ia_atime = { + .tv_sec = lfu->lfu_atime_sec, + .tv_nsec = lfu->lfu_atime_nsec, + }, + .ia_mtime = { + .tv_sec = lfu->lfu_mtime_sec, + .tv_nsec = lfu->lfu_mtime_nsec, + }, + .ia_ctime = { + .tv_sec = lfu->lfu_ctime_sec, + .tv_nsec = lfu->lfu_ctime_nsec, + }, + }; + int rc; + ENTRY; + + if (!capable(CAP_SYS_ADMIN)) + RETURN(-EPERM); + + if (!S_ISREG(inode->i_mode)) + RETURN(-EINVAL); + + inode_lock(inode); + rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET, + false); + inode_unlock(inode); + + RETURN(rc); +} + +static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode) +{ + switch (mode) { + case MODE_READ_USER: + return CLM_READ; + case MODE_WRITE_USER: + return CLM_WRITE; + default: + return -EINVAL; + } +} + +static const char *const user_lockname[] = LOCK_MODE_NAMES; + +/* Used to allow the upper layers of the client to request an LDLM lock + * without doing an actual read or write. + * + * Used for ladvise lockahead to manually request specific locks. + * + * \param[in] file file this ladvise lock request is on + * \param[in] ladvise ladvise struct describing this lock request + * + * \retval 0 success, no detailed result available (sync requests + * and requests sent to the server [not handled locally] + * cannot return detailed results) + * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request, + * see definitions for details. + * \retval negative negative errno on error + */ +int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise) +{ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + struct cl_lock *lock = NULL; + struct cl_lock_descr *descr = NULL; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + enum cl_lock_mode cl_mode; + off_t start = ladvise->lla_start; + off_t end = ladvise->lla_end; + int result; + __u16 refcheck; + + ENTRY; + + CDEBUG(D_VFSTRACE, + "Lock request: file=%pd, inode=%p, mode=%s start=%llu, end=%llu\n", + dentry, dentry->d_inode, + user_lockname[ladvise->lla_lockahead_mode], (__u64) start, + (__u64) end); + + cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode); + if (cl_mode < 0) + GOTO(out, result = cl_mode); + + /* Get IO environment */ + result = cl_io_get(inode, &env, &io, &refcheck); + if (result <= 0) + GOTO(out, result); + + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result > 0) { + /* + * nothing to do for this io. This currently happens when + * stripe sub-object's are not yet created. 
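+ * cl_io_init() returned a positive value here, and io->ci_result carries
+ * the outcome that is handed back to the caller below.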
+ */ + result = io->ci_result; + } else if (result == 0) { + lock = vvp_env_lock(env); + descr = &lock->cll_descr; + + descr->cld_obj = io->ci_obj; + /* Convert byte offsets to pages */ + descr->cld_start = cl_index(io->ci_obj, start); + descr->cld_end = cl_index(io->ci_obj, end); + descr->cld_mode = cl_mode; + /* CEF_MUST is used because we do not want to convert a + * lockahead request to a lockless lock */ + descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND; + + if (ladvise->lla_peradvice_flags & LF_ASYNC) + descr->cld_enq_flags |= CEF_SPECULATIVE; + + result = cl_lock_request(env, io, lock); + + /* On success, we need to release the lock */ + if (result >= 0) + cl_lock_release(env, lock); + } + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + + /* -ECANCELED indicates a matching lock with a different extent + * was already present, and -EEXIST indicates a matching lock + * on exactly the same extent was already present. + * We convert them to positive values for userspace to make + * recognizing true errors easier. + * Note we can only return these detailed results on async requests, + * as sync requests look the same as i/o requests for locking. */ + if (result == -ECANCELED) + result = LLA_RESULT_DIFFERENT; + else if (result == -EEXIST) + result = LLA_RESULT_SAME; + +out: + RETURN(result); +} +static const char *const ladvise_names[] = LU_LADVISE_NAMES; + +static int ll_ladvise_sanity(struct inode *inode, + struct llapi_lu_ladvise *ladvise) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + enum lu_ladvise_type advice = ladvise->lla_advice; + /* Note the peradvice flags is a 32 bit field, so per advice flags must + * be in the first 32 bits of enum ladvise_flags */ + __u32 flags = ladvise->lla_peradvice_flags; + /* 3 lines at 80 characters per line, should be plenty */ + int rc = 0; + + if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, + "%s: advice with value '%d' not recognized, last supported advice is %s (value '%d'): rc = %d\n", + sbi->ll_fsname, advice, + ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc); + GOTO(out, rc); + } + + /* Per-advice checks */ + switch (advice) { + case LU_LADVISE_LOCKNOEXPAND: + if (flags & ~LF_LOCKNOEXPAND_MASK) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: " + "rc = %d\n", sbi->ll_fsname, flags, + ladvise_names[advice], rc); + GOTO(out, rc); + } + break; + case LU_LADVISE_LOCKAHEAD: + /* Currently only READ and WRITE modes can be requested */ + if (ladvise->lla_lockahead_mode >= MODE_MAX_USER || + ladvise->lla_lockahead_mode == 0) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: " + "rc = %d\n", sbi->ll_fsname, + ladvise->lla_lockahead_mode, + ladvise_names[advice], rc); + GOTO(out, rc); + } + fallthrough; + case LU_LADVISE_WILLREAD: + case LU_LADVISE_DONTNEED: + default: + /* Note fall through above - These checks apply to all advices + * except LOCKNOEXPAND */ + if (flags & ~LF_DEFAULT_MASK) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: " + "rc = %d\n", sbi->ll_fsname, flags, + ladvise_names[advice], rc); + GOTO(out, rc); + } + if (ladvise->lla_start >= ladvise->lla_end) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) " + "for %s: rc = %d\n", sbi->ll_fsname, + ladvise->lla_start, ladvise->lla_end, + ladvise_names[advice], rc); + GOTO(out, rc); + } + break; + } + +out: + return rc; +} +#undef ERRSIZE + +/* + * Give file access advices + * + * The ladvise interface is similar to Linux fadvise() 
system call, except it + * forwards the advices directly from Lustre client to server. The server side + * codes will apply appropriate read-ahead and caching techniques for the + * corresponding files. + * + * A typical workload for ladvise is e.g. a bunch of different clients are + * doing small random reads of a file, so prefetching pages into OSS cache + * with big linear reads before the random IO is a net benefit. Fetching + * all that data into each client cache with fadvise() may not be, due to + * much more data being sent to the client. + */ +static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags, + struct llapi_lu_ladvise *ladvise) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_ladvise_io *lio; + int rc; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + + /* initialize parameters for ladvise */ + lio = &io->u.ci_ladvise; + lio->li_start = ladvise->lla_start; + lio->li_end = ladvise->lla_end; + lio->li_fid = ll_inode2fid(inode); + lio->li_advice = ladvise->lla_advice; + lio->li_flags = flags; + + if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0) + rc = cl_io_loop(env, io); + else + rc = io->ci_result; + + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + RETURN(rc); +} + +static int ll_lock_noexpand(struct file *file, int flags) +{ + struct ll_file_data *fd = file->private_data; + + fd->ll_lock_no_expand = !(flags & LF_UNSET); + + return 0; +} + +int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg) +{ + struct fsxattr fsxattr; + + if (copy_from_user(&fsxattr, + (const struct fsxattr __user *)arg, + sizeof(fsxattr))) + RETURN(-EFAULT); + + fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags); + if (test_bit(LLIF_PROJECT_INHERIT, &ll_i2info(inode)->lli_flags)) + fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT; + fsxattr.fsx_projid = ll_i2info(inode)->lli_projid; + if (copy_to_user((struct fsxattr __user *)arg, + &fsxattr, sizeof(fsxattr))) + RETURN(-EFAULT); + + RETURN(0); +} + +int ll_ioctl_check_project(struct inode *inode, __u32 xflags, + __u32 projid) +{ + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() == &init_user_ns) { + /* + * Caller is allowed to change the project ID. if it is being + * changed, make sure that the new value is valid. 
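+ * The check below only runs when the project ID actually changes; the new
+ * ID is mapped through init_user_ns and validated with projid_valid().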
+ */ + if (ll_i2info(inode)->lli_projid != projid && + !projid_valid(make_kprojid(&init_user_ns, projid))) + return -EINVAL; + + return 0; + } + + if (ll_i2info(inode)->lli_projid != projid) + return -EINVAL; + + if (test_bit(LLIF_PROJECT_INHERIT, &ll_i2info(inode)->lli_flags)) { + if (!(xflags & FS_XFLAG_PROJINHERIT)) + return -EINVAL; + } else { + if (xflags & FS_XFLAG_PROJINHERIT) + return -EINVAL; + } + + return 0; +} + +static int ll_set_project(struct inode *inode, __u32 xflags, __u32 projid) +{ + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + struct cl_object *obj; + unsigned int inode_flags; + int rc = 0; + + rc = ll_ioctl_check_project(inode, xflags, projid); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + inode_flags = ll_xflags_to_inode_flags(xflags); + op_data->op_attr_flags = ll_inode_to_ext_flags(inode_flags); + if (xflags & FS_XFLAG_PROJINHERIT) + op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL; + op_data->op_projid = projid; + op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS; + rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL, 0, &req); + ptlrpc_req_finished(req); + if (rc) + GOTO(out_fsxattr, rc); + ll_update_inode_flags(inode, op_data->op_attr_flags); + + /* Avoid OST RPC if this is only ioctl setting project inherit flag */ + if (xflags == 0 || xflags == FS_XFLAG_PROJINHERIT) + GOTO(out_fsxattr, rc); + + obj = ll_i2info(inode)->lli_clob; + if (obj) { + struct iattr attr = { 0 }; + + rc = cl_setattr_ost(obj, &attr, OP_XVALID_FLAGS, xflags); + } + +out_fsxattr: + ll_finish_md_op_data(op_data); + RETURN(rc); +} + +int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg) +{ + struct fsxattr fsxattr; + + ENTRY; + + if (copy_from_user(&fsxattr, + (const struct fsxattr __user *)arg, + sizeof(fsxattr))) + RETURN(-EFAULT); + + RETURN(ll_set_project(inode, fsxattr.fsx_xflags, + fsxattr.fsx_projid)); +} + +int ll_ioctl_project(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct lu_project lu_project; + struct dentry *dentry = file_dentry(file); + struct inode *inode = file_inode(file); + struct dentry *child_dentry = NULL; + int rc = 0, name_len; + + if (copy_from_user(&lu_project, + (const struct lu_project __user *)arg, + sizeof(lu_project))) + RETURN(-EFAULT); + + /* apply child dentry if name is valid */ + name_len = strnlen(lu_project.project_name, NAME_MAX); + if (name_len > 0 && name_len <= NAME_MAX) { + inode_lock(inode); + child_dentry = lookup_one_len(lu_project.project_name, + dentry, name_len); + inode_unlock(inode); + if (IS_ERR(child_dentry)) { + rc = PTR_ERR(child_dentry); + goto out; + } + inode = child_dentry->d_inode; + if (!inode) { + rc = -ENOENT; + goto out; + } + } else if (name_len > NAME_MAX) { + rc = -EINVAL; + goto out; + } + + switch (lu_project.project_type) { + case LU_PROJECT_SET: + rc = ll_set_project(inode, lu_project.project_xflags, + lu_project.project_id); + break; + case LU_PROJECT_GET: + lu_project.project_xflags = + ll_inode_flags_to_xflags(inode->i_flags); + if (test_bit(LLIF_PROJECT_INHERIT, + &ll_i2info(inode)->lli_flags)) + lu_project.project_xflags |= FS_XFLAG_PROJINHERIT; + lu_project.project_id = ll_i2info(inode)->lli_projid; + if (copy_to_user((struct lu_project __user *)arg, + &lu_project, sizeof(lu_project))) { + rc = -EFAULT; + goto out; + } + break; + default: + rc = -EINVAL; + break; + } +out: + if (!IS_ERR_OR_NULL(child_dentry)) + 
dput(child_dentry); + RETURN(rc); +} + +static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle *och = NULL; + struct split_param sp; + struct pcc_param param; + bool lease_broken = false; + fmode_t fmode = 0; + enum mds_op_bias bias = 0; + struct file *layout_file = NULL; + void *data = NULL; + size_t data_size = 0; + bool attached = false; + long rc, rc2 = 0; + + ENTRY; + + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + och = fd->fd_lease_och; + fd->fd_lease_och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + + if (och == NULL) + RETURN(-ENOLCK); + + fmode = och->och_flags; + + switch (ioc->lil_flags) { + case LL_LEASE_RESYNC_DONE: + if (ioc->lil_count > IOC_IDS_MAX) + GOTO(out_lease_close, rc = -EINVAL); + + data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]); + OBD_ALLOC(data, data_size); + if (!data) + GOTO(out_lease_close, rc = -ENOMEM); + + if (copy_from_user(data, (void __user *)arg, data_size)) + GOTO(out_lease_close, rc = -EFAULT); + + bias = MDS_CLOSE_RESYNC_DONE; + break; + case LL_LEASE_LAYOUT_MERGE: { + int fd; + + if (ioc->lil_count != 1) + GOTO(out_lease_close, rc = -EINVAL); + + arg += sizeof(*ioc); + if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32))) + GOTO(out_lease_close, rc = -EFAULT); + + layout_file = fget(fd); + if (!layout_file) + GOTO(out_lease_close, rc = -EBADF); + + if ((file->f_flags & O_ACCMODE) == O_RDONLY || + (layout_file->f_flags & O_ACCMODE) == O_RDONLY) + GOTO(out_lease_close, rc = -EPERM); + + data = file_inode(layout_file); + bias = MDS_CLOSE_LAYOUT_MERGE; + break; + } + case LL_LEASE_LAYOUT_SPLIT: { + int fdv; + int mirror_id; + + if (ioc->lil_count != 2) + GOTO(out_lease_close, rc = -EINVAL); + + arg += sizeof(*ioc); + if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32))) + GOTO(out_lease_close, rc = -EFAULT); + + arg += sizeof(__u32); + if (copy_from_user(&mirror_id, (void __user *)arg, + sizeof(__u32))) + GOTO(out_lease_close, rc = -EFAULT); + + layout_file = fget(fdv); + if (!layout_file) + GOTO(out_lease_close, rc = -EBADF); + + /* if layout_file == file, it means to destroy the mirror */ + sp.sp_inode = file_inode(layout_file); + sp.sp_mirror_id = (__u16)mirror_id; + data = &sp; + bias = MDS_CLOSE_LAYOUT_SPLIT; + break; + } + case LL_LEASE_PCC_ATTACH: + if (ioc->lil_count != 1) + RETURN(-EINVAL); + + if (IS_ENCRYPTED(inode)) + RETURN(-EOPNOTSUPP); + + arg += sizeof(*ioc); + if (copy_from_user(¶m.pa_archive_id, (void __user *)arg, + sizeof(__u32))) + GOTO(out_lease_close, rc2 = -EFAULT); + + rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id); + if (rc2) + GOTO(out_lease_close, rc2); + + attached = true; + /* Grab latest data version */ + rc2 = ll_data_version(inode, ¶m.pa_data_version, + LL_DV_WR_FLUSH); + if (rc2) + GOTO(out_lease_close, rc2); + + data = ¶m; + bias = MDS_PCC_ATTACH; + break; + default: + /* without close intent */ + break; + } + +out_lease_close: + rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data); + if (rc < 0) + GOTO(out, rc); + + rc = ll_lease_och_release(inode, file); + if (rc < 0) + GOTO(out, rc); + + if (lease_broken) + fmode = 0; + EXIT; + +out: + switch (ioc->lil_flags) { + case LL_LEASE_RESYNC_DONE: + if (data) + OBD_FREE(data, data_size); + break; + case LL_LEASE_LAYOUT_MERGE: + case LL_LEASE_LAYOUT_SPLIT: + if (layout_file) + 
fput(layout_file); + + ll_layout_refresh(inode, &fd->fd_layout_version); + break; + case LL_LEASE_PCC_ATTACH: + if (!rc) + rc = rc2; + rc = pcc_readwrite_attach_fini(file, inode, + param.pa_layout_gen, + lease_broken, rc, + attached); + break; + } + + if (!rc) + rc = ll_lease_type_from_fmode(fmode); + RETURN(rc); +} + +static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = file->private_data; + struct obd_client_handle *och = NULL; + __u64 open_flags = 0; + bool lease_broken; + fmode_t fmode; + long rc; + ENTRY; + + switch (ioc->lil_mode) { + case LL_LEASE_WRLCK: + if (!(file->f_mode & FMODE_WRITE)) + RETURN(-EPERM); + fmode = FMODE_WRITE; + break; + case LL_LEASE_RDLCK: + if (!(file->f_mode & FMODE_READ)) + RETURN(-EPERM); + fmode = FMODE_READ; + break; + case LL_LEASE_UNLCK: + RETURN(ll_file_unlock_lease(file, ioc, arg)); + default: + RETURN(-EINVAL); + } + + CDEBUG(D_INODE, "Set lease with mode %u\n", fmode); + + /* apply for lease */ + if (ioc->lil_flags & LL_LEASE_RESYNC) + open_flags = MDS_OPEN_RESYNC; + och = ll_lease_open(inode, file, fmode, open_flags); + if (IS_ERR(och)) + RETURN(PTR_ERR(och)); + + if (ioc->lil_flags & LL_LEASE_RESYNC) { + rc = ll_lease_file_resync(och, inode, arg); + if (rc) { + ll_lease_close(och, inode, NULL); + RETURN(rc); + } + rc = ll_layout_refresh(inode, &fd->fd_layout_version); + if (rc) { + ll_lease_close(och, inode, NULL); + RETURN(rc); + } + } + + rc = 0; + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och == NULL) { + fd->fd_lease_och = och; + och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + if (och != NULL) { + /* impossible now that only excl is supported for now */ + ll_lease_close(och, inode, &lease_broken); + rc = -EBUSY; + } + RETURN(rc); +} + +static void ll_heat_get(struct inode *inode, struct lu_heat *heat) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + __u64 now = ktime_get_real_seconds(); + int i; + + spin_lock(&lli->lli_heat_lock); + heat->lh_flags = lli->lli_heat_flags; + for (i = 0; i < heat->lh_count; i++) + heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i], + now, sbi->ll_heat_decay_weight, + sbi->ll_heat_period_second); + spin_unlock(&lli->lli_heat_lock); +} + +static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc = 0; + + spin_lock(&lli->lli_heat_lock); + if (flags & LU_HEAT_FLAG_CLEAR) + obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT); + + if (flags & LU_HEAT_FLAG_OFF) + lli->lli_heat_flags |= LU_HEAT_FLAG_OFF; + else + lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF; + + spin_unlock(&lli->lli_heat_lock); + + RETURN(rc); +} + +static long +ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + int flags, rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n", + PFID(ll_inode2fid(inode)), inode, cmd); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + + /* asm-ppc{,64} declares TCGETS, et. al. 
as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + RETURN(-ENOTTY); + + switch (cmd) { + case LL_IOC_GETFLAGS: + /* Get the current value of the file flags */ + return put_user(fd->fd_flags, (int __user *)arg); + case LL_IOC_SETFLAGS: + case LL_IOC_CLRFLAGS: + /* Set or clear specific file flags */ + /* XXX This probably needs checks to ensure the flags are + * not abused, and to handle any flag side effects. + */ + if (get_user(flags, (int __user *) arg)) + RETURN(-EFAULT); + + if (cmd == LL_IOC_SETFLAGS) { + if ((flags & LL_FILE_IGNORE_LOCK) && + !(file->f_flags & O_DIRECT)) { + CERROR("%s: unable to disable locking on " + "non-O_DIRECT file\n", current->comm); + RETURN(-EINVAL); + } + + fd->fd_flags |= flags; + } else { + fd->fd_flags &= ~flags; + } + RETURN(0); + case LL_IOC_LOV_SETSTRIPE: + case LL_IOC_LOV_SETSTRIPE_NEW: + RETURN(ll_lov_setstripe(inode, file, (void __user *)arg)); + case LL_IOC_LOV_SETEA: + RETURN(ll_lov_setea(inode, file, (void __user *)arg)); + case LL_IOC_LOV_SWAP_LAYOUTS: { + struct file *file2; + struct lustre_swap_layouts lsl; + + if (copy_from_user(&lsl, (char __user *)arg, + sizeof(struct lustre_swap_layouts))) + RETURN(-EFAULT); + + if ((file->f_flags & O_ACCMODE) == O_RDONLY) + RETURN(-EPERM); + + file2 = fget(lsl.sl_fd); + if (file2 == NULL) + RETURN(-EBADF); + + /* O_WRONLY or O_RDWR */ + if ((file2->f_flags & O_ACCMODE) == O_RDONLY) + GOTO(out, rc = -EPERM); + + if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) { + struct inode *inode2; + struct ll_inode_info *lli; + struct obd_client_handle *och = NULL; + + lli = ll_i2info(inode); + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + och = fd->fd_lease_och; + fd->fd_lease_och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + if (och == NULL) + GOTO(out, rc = -ENOLCK); + inode2 = file_inode(file2); + rc = ll_swap_layouts_close(och, inode, inode2); + } else { + rc = ll_swap_layouts(file, file2, &lsl); + } +out: + fput(file2); + RETURN(rc); + } + case LL_IOC_LOV_GETSTRIPE: + case LL_IOC_LOV_GETSTRIPE_NEW: + RETURN(ll_file_getstripe(inode, (void __user *)arg, 0)); + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION: + case FS_IOC_GETVERSION: + RETURN(put_user(inode->i_generation, (int __user *)arg)); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
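+ * FS_IOC_SETVERSION below is one such ioctl and is currently rejected
+ * with -ENOTSUPP.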
*/ + case FS_IOC_SETVERSION: + RETURN(-ENOTSUPP); + + case LL_IOC_GROUP_LOCK: + RETURN(ll_get_grouplock(inode, file, arg)); + case LL_IOC_GROUP_UNLOCK: + RETURN(ll_put_grouplock(inode, file, arg)); + case IOC_OBD_STATFS: + RETURN(ll_obd_statfs(inode, (void __user *)arg)); + + case LL_IOC_FLUSHCTX: + RETURN(ll_flush_ctx(inode)); + case LL_IOC_PATH2FID: { + if (copy_to_user((void __user *)arg, ll_inode2fid(inode), + sizeof(struct lu_fid))) + RETURN(-EFAULT); + + RETURN(0); + } + case LL_IOC_GETPARENT: + RETURN(ll_getparent(file, (struct getparent __user *)arg)); + + case OBD_IOC_FID2PATH: + RETURN(ll_fid2path(inode, (void __user *)arg)); + case LL_IOC_DATA_VERSION: { + struct ioc_data_version idv; + int rc; + + if (copy_from_user(&idv, (char __user *)arg, sizeof(idv))) + RETURN(-EFAULT); + + idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH; + rc = ll_ioc_data_version(inode, &idv); + + if (rc == 0 && + copy_to_user((char __user *)arg, &idv, sizeof(idv))) + RETURN(-EFAULT); + + RETURN(rc); + } + + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + RETURN(mdtidx); + + if (put_user((int)mdtidx, (int __user *)arg)) + RETURN(-EFAULT); + + RETURN(0); + } + case OBD_IOC_GETNAME_OLD: + case OBD_IOC_GETDTNAME: + case OBD_IOC_GETMDNAME: + RETURN(ll_get_obd_name(inode, cmd, arg)); + case LL_IOC_HSM_STATE_GET: { + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + OBD_ALLOC_PTR(hus); + if (hus == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hus); + RETURN(PTR_ERR(op_data)); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((void __user *)arg, hus, sizeof(*hus))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hus); + RETURN(rc); + } + case LL_IOC_HSM_STATE_SET: { + struct hsm_state_set *hss; + int rc; + + OBD_ALLOC_PTR(hss); + if (hss == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) { + OBD_FREE_PTR(hss); + RETURN(-EFAULT); + } + + rc = ll_hsm_state_set(inode, hss); + + OBD_FREE_PTR(hss); + RETURN(rc); + } + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data; + struct hsm_current_action *hca; + const char *action; + int rc; + + OBD_ALLOC_PTR(hca); + if (hca == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hca); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hca); + RETURN(PTR_ERR(op_data)); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + if (rc < 0) + GOTO(skip_copy, rc); + + /* The hsm_current_action retreived from the server could + * contain corrupt information. If it is incorrect data collect + * debug information. We still send the data even if incorrect + * to user land to handle. 
+ */ + action = hsm_user_action2name(hca->hca_action); + if (strcmp(action, "UNKNOWN") == 0 || + hca->hca_state > HPS_DONE) { + CDEBUG(D_HSM, + "HSM current state %s action %s, offset = %llu, length %llu\n", + hsm_progress_state2name(hca->hca_state), action, + hca->hca_location.offset, hca->hca_location.length); + } + + if (copy_to_user((char __user *)arg, hca, sizeof(*hca))) + rc = -EFAULT; +skip_copy: + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hca); + RETURN(rc); + } + case LL_IOC_SET_LEASE_OLD: { + struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg }; + + RETURN(ll_file_set_lease(file, &ioc, 0)); + } + case LL_IOC_SET_LEASE: { + struct ll_ioc_lease ioc; + + if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc))) + RETURN(-EFAULT); + + RETURN(ll_file_set_lease(file, &ioc, arg)); + } + case LL_IOC_GET_LEASE: { + struct ll_inode_info *lli = ll_i2info(inode); + struct ldlm_lock *lock = NULL; + fmode_t fmode = 0; + + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + struct obd_client_handle *och = fd->fd_lease_och; + + lock = ldlm_handle2lock(&och->och_lease_handle); + if (lock != NULL) { + lock_res_and_lock(lock); + if (!ldlm_is_cancel(lock)) + fmode = och->och_flags; + + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + } + mutex_unlock(&lli->lli_och_mutex); + + RETURN(ll_lease_type_from_fmode(fmode)); + } + case LL_IOC_HSM_IMPORT: { + struct hsm_user_import *hui; + + OBD_ALLOC_PTR(hui); + if (hui == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) { + OBD_FREE_PTR(hui); + RETURN(-EFAULT); + } + + rc = ll_hsm_import(inode, file, hui); + + OBD_FREE_PTR(hui); + RETURN(rc); + } + case LL_IOC_FUTIMES_3: { + struct ll_futimes_3 lfu; + + if (copy_from_user(&lfu, + (const struct ll_futimes_3 __user *)arg, + sizeof(lfu))) + RETURN(-EFAULT); + + RETURN(ll_file_futimes_3(file, &lfu)); + } + case LL_IOC_LADVISE: { + struct llapi_ladvise_hdr *k_ladvise_hdr; + struct llapi_ladvise_hdr __user *u_ladvise_hdr; + int i; + int num_advise; + int alloc_size = sizeof(*k_ladvise_hdr); + + rc = 0; + u_ladvise_hdr = (void __user *)arg; + OBD_ALLOC_PTR(k_ladvise_hdr); + if (k_ladvise_hdr == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size)) + GOTO(out_ladvise, rc = -EFAULT); + + if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC || + k_ladvise_hdr->lah_count < 1) + GOTO(out_ladvise, rc = -EINVAL); + + num_advise = k_ladvise_hdr->lah_count; + if (num_advise >= LAH_COUNT_MAX) + GOTO(out_ladvise, rc = -EFBIG); + + OBD_FREE_PTR(k_ladvise_hdr); + alloc_size = offsetof(typeof(*k_ladvise_hdr), + lah_advise[num_advise]); + OBD_ALLOC(k_ladvise_hdr, alloc_size); + if (k_ladvise_hdr == NULL) + RETURN(-ENOMEM); + + /* + * TODO: submit multiple advices to one server in a single RPC + */ + if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size)) + GOTO(out_ladvise, rc = -EFAULT); + + for (i = 0; i < num_advise; i++) { + struct llapi_lu_ladvise *k_ladvise = + &k_ladvise_hdr->lah_advise[i]; + struct llapi_lu_ladvise __user *u_ladvise = + &u_ladvise_hdr->lah_advise[i]; + + rc = ll_ladvise_sanity(inode, k_ladvise); + if (rc) + GOTO(out_ladvise, rc); + + switch (k_ladvise->lla_advice) { + case LU_LADVISE_LOCKNOEXPAND: + rc = ll_lock_noexpand(file, + k_ladvise->lla_peradvice_flags); + GOTO(out_ladvise, rc); + case LU_LADVISE_LOCKAHEAD: + + rc = ll_file_lock_ahead(file, k_ladvise); + + if (rc < 0) + GOTO(out_ladvise, rc); + + if (put_user(rc, + &u_ladvise->lla_lockahead_result)) + GOTO(out_ladvise, rc = -EFAULT); + break; + 
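+ /* Any remaining advice types (e.g. WILLREAD, DONTNEED) are forwarded to
+ * the server through ll_ladvise() in the default branch. */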
default: + rc = ll_ladvise(inode, file, + k_ladvise_hdr->lah_flags, + k_ladvise); + if (rc) + GOTO(out_ladvise, rc); + break; + } + + } + +out_ladvise: + OBD_FREE(k_ladvise_hdr, alloc_size); + RETURN(rc); + } + case LL_IOC_FLR_SET_MIRROR: { + /* mirror I/O must be direct to avoid polluting page cache + * by stale data. */ + if (!(file->f_flags & O_DIRECT)) + RETURN(-EINVAL); + + fd->fd_designated_mirror = (__u32)arg; + RETURN(0); + } + case FS_IOC_FSGETXATTR: + RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg)); + case FS_IOC_FSSETXATTR: + RETURN(ll_ioctl_fssetxattr(inode, cmd, arg)); + case LL_IOC_PROJECT: + RETURN(ll_ioctl_project(file, cmd, arg)); + case BLKSSZGET: + RETURN(put_user(PAGE_SIZE, (int __user *)arg)); + case LL_IOC_HEAT_GET: { + struct lu_heat uheat; + struct lu_heat *heat; + int size; + + if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat))) + RETURN(-EFAULT); + + if (uheat.lh_count > OBD_HEAT_COUNT) + uheat.lh_count = OBD_HEAT_COUNT; + + size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]); + OBD_ALLOC(heat, size); + if (heat == NULL) + RETURN(-ENOMEM); + + heat->lh_count = uheat.lh_count; + ll_heat_get(inode, heat); + rc = copy_to_user((char __user *)arg, heat, size); + OBD_FREE(heat, size); + RETURN(rc ? -EFAULT : 0); + } + case LL_IOC_HEAT_SET: { + __u64 flags; + + if (copy_from_user(&flags, (void __user *)arg, sizeof(flags))) + RETURN(-EFAULT); + + rc = ll_heat_set(inode, flags); + RETURN(rc); + } + case LL_IOC_PCC_DETACH: { + struct lu_pcc_detach *detach; + + OBD_ALLOC_PTR(detach); + if (detach == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(detach, + (const struct lu_pcc_detach __user *)arg, + sizeof(*detach))) + GOTO(out_detach_free, rc = -EFAULT); + + if (!S_ISREG(inode->i_mode)) + GOTO(out_detach_free, rc = -EINVAL); + + if (!inode_owner_or_capable(&init_user_ns, inode)) + GOTO(out_detach_free, rc = -EPERM); + + rc = pcc_ioctl_detach(inode, detach->pccd_opt); +out_detach_free: + OBD_FREE_PTR(detach); + RETURN(rc); + } + case LL_IOC_PCC_STATE: { + struct lu_pcc_state __user *ustate = + (struct lu_pcc_state __user *)arg; + struct lu_pcc_state *state; + + OBD_ALLOC_PTR(state); + if (state == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(state, ustate, sizeof(*state))) + GOTO(out_state, rc = -EFAULT); + + rc = pcc_ioctl_state(file, inode, state); + if (rc) + GOTO(out_state, rc); + + if (copy_to_user(ustate, state, sizeof(*state))) + GOTO(out_state, rc = -EFAULT); + +out_state: + OBD_FREE_PTR(state); + RETURN(rc); + } +#ifdef HAVE_LUSTRE_CRYPTO + case LL_IOC_SET_ENCRYPTION_POLICY: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_set_policy(file, (const void __user *)arg); + case LL_IOC_GET_ENCRYPTION_POLICY_EX: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_get_policy_ex(file, (void __user *)arg); + case LL_IOC_ADD_ENCRYPTION_KEY: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_add_key(file, (void __user *)arg); + case LL_IOC_REMOVE_ENCRYPTION_KEY: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_remove_key(file, (void __user *)arg); + case LL_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_remove_key_all_users(file, + (void __user *)arg); + case LL_IOC_GET_ENCRYPTION_KEY_STATUS: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_get_key_status(file, (void __user *)arg); +#endif + + 
case LL_IOC_UNLOCK_FOREIGN: { + struct dentry *dentry = file_dentry(file); + + /* if not a foreign symlink do nothing */ + if (ll_foreign_is_removable(dentry, true)) { + CDEBUG(D_INFO, + "prevent unlink of non-foreign file ("DFID")\n", + PFID(ll_inode2fid(inode))); + RETURN(-EOPNOTSUPP); + } + RETURN(0); + } + + default: + RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL, + (void __user *)arg)); + } +} + +loff_t ll_lseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + struct lu_env *env; + struct cl_io *io; + struct cl_lseek_io *lsio; + __u16 refcheck; + int rc; + loff_t retval; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + ll_io_set_mirror(io, file); + + lsio = &io->u.ci_lseek; + lsio->ls_start = offset; + lsio->ls_whence = whence; + lsio->ls_result = -ENXIO; + + do { + rc = cl_io_init(env, io, CIT_LSEEK, io->ci_obj); + if (!rc) { + struct vvp_io *vio = vvp_env_io(env); + + vio->vui_fd = file->private_data; + rc = cl_io_loop(env, io); + } else { + rc = io->ci_result; + } + retval = rc ? : lsio->ls_result; + cl_io_fini(env, io); + } while (unlikely(io->ci_need_restart)); + + cl_env_put(env, &refcheck); + + /* Without the key, SEEK_HOLE return value has to be + * rounded up to next LUSTRE_ENCRYPTION_UNIT_SIZE. + */ + if (llcrypt_require_key(inode) == -ENOKEY && whence == SEEK_HOLE) + retval = round_up(retval, LUSTRE_ENCRYPTION_UNIT_SIZE); + + RETURN(retval); +} + +static loff_t ll_file_seek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file_inode(file); + loff_t retval = offset, eof = 0; + ktime_t kstart = ktime_get(); + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n", + PFID(ll_inode2fid(inode)), inode, retval, retval, + origin); + + if (origin == SEEK_END) { + retval = ll_glimpse_size(inode); + if (retval != 0) + RETURN(retval); + eof = i_size_read(inode); + } + + if (origin == SEEK_HOLE || origin == SEEK_DATA) { + if (offset < 0) + return -ENXIO; + + /* flush local cache first if any */ + cl_sync_file_range(inode, offset, OBD_OBJECT_EOF, + CL_FSYNC_LOCAL, 0); + + retval = ll_lseek(file, offset, origin); + if (retval < 0) + return retval; + retval = vfs_setpos(file, retval, ll_file_maxbytes(inode)); + } else { + retval = generic_file_llseek_size(file, offset, origin, + ll_file_maxbytes(inode), eof); + } + if (retval >= 0) + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, + ktime_us_delta(ktime_get(), kstart)); + RETURN(retval); +} + +static int ll_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = file->private_data; + int rc, err; + + LASSERT(!S_ISDIR(inode->i_mode)); + + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. */ + rc = lli->lli_async_rc; + lli->lli_async_rc = 0; + if (lli->lli_clob != NULL) { + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (rc == 0) + rc = err; + } + + /* The application has been told write failure already. + * Do not report failure again. */ + if (fd->fd_write_failed) + return 0; + return rc ? -EIO : 0; +} + +/** + * Called to make sure a portion of file has been written out. + * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST. + * + * Return how many pages have been written. 
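+ * A negative errno is returned if the sync could not be set up or failed.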
+ */ +int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, + enum cl_fsync_mode mode, int ignore_layout) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_fsync_io *fio; + int result; + __u16 refcheck; + ENTRY; + + if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL && + mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL) + RETURN(-EINVAL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + io->ci_ignore_layout = ignore_layout; + + /* initialize parameters for sync */ + fio = &io->u.ci_fsync; + fio->fi_start = start; + fio->fi_end = end; + fio->fi_fid = ll_inode2fid(inode); + fio->fi_mode = mode; + fio->fi_nr_written = 0; + + if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0) + result = cl_io_loop(env, io); + else + result = io->ci_result; + if (result == 0) + result = fio->fi_nr_written; + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + + RETURN(result); +} + +/* + * When dentry is provided (the 'else' case), file_dentry() may be + * null and dentry must be used directly rather than pulled from + * file_dentry() as is done otherwise. + */ + +int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct dentry *dentry = file_dentry(file); + struct inode *inode = dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + struct ptlrpc_request *req; + ktime_t kstart = ktime_get(); + int rc, err; + + ENTRY; + + CDEBUG(D_VFSTRACE, + "VFS Op:inode="DFID"(%p), start %lld, end %lld, datasync %d\n", + PFID(ll_inode2fid(inode)), inode, start, end, datasync); + + /* fsync's caller has already called _fdata{sync,write}, we want + * that IO to finish before calling the osc and mdc sync methods */ + rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. */ + if (!S_ISDIR(inode->i_mode)) { + err = lli->lli_async_rc; + lli->lli_async_rc = 0; + if (rc == 0) + rc = err; + if (lli->lli_clob != NULL) { + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (rc == 0) + rc = err; + } + } + + err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req); + if (!rc) + rc = err; + if (!err) + ptlrpc_req_finished(req); + + if (S_ISREG(inode->i_mode)) { + struct ll_file_data *fd = file->private_data; + bool cached; + + /* Sync metadata on MDT first, and then sync the cached data + * on PCC. 
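+ * If the file is not cached by PCC, fall back to syncing the Lustre
+ * objects with cl_sync_file_range().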
+ */ + err = pcc_fsync(file, start, end, datasync, &cached); + if (!cached) + err = cl_sync_file_range(inode, start, end, + CL_FSYNC_ALL, 0); + if (rc == 0 && err < 0) + rc = err; + if (rc < 0) + fd->fd_write_failed = true; + else + fd->fd_write_failed = false; + } + + if (!rc) + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, + ktime_us_delta(ktime_get(), kstart)); + RETURN(rc); +} + +static int +ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) +{ + struct inode *inode = file_inode(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_FLOCK, + .ei_cb_cp = ldlm_flock_completion_ast, + .ei_cbdata = file_lock, + }; + struct md_op_data *op_data; + struct lustre_handle lockh = { 0 }; + union ldlm_policy_data flock = { { 0 } }; + int fl_type = file_lock->fl_type; + ktime_t kstart = ktime_get(); + __u64 flags = 0; + int rc; + int rc2 = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n", + PFID(ll_inode2fid(inode)), file_lock); + + if (file_lock->fl_flags & FL_FLOCK) { + LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK)); + /* flocks are whole-file locks */ + flock.l_flock.end = OFFSET_MAX; + /* For flocks owner is determined by the local file desctiptor*/ + flock.l_flock.owner = (unsigned long)file_lock->fl_file; + } else if (file_lock->fl_flags & FL_POSIX) { + flock.l_flock.owner = (unsigned long)file_lock->fl_owner; + flock.l_flock.start = file_lock->fl_start; + flock.l_flock.end = file_lock->fl_end; + } else { + RETURN(-EINVAL); + } + flock.l_flock.pid = file_lock->fl_pid; + +#if defined(HAVE_LM_COMPARE_OWNER) || defined(lm_compare_owner) + /* Somewhat ugly workaround for svc lockd. + * lockd installs custom fl_lmops->lm_compare_owner that checks + * for the fl_owner to be the same (which it always is on local node + * I guess between lockd processes) and then compares pid. + * As such we assign pid to the owner field to make it all work, + * conflict with normal locks is unlikely since pid space and + * pointer space for current->files are not intersecting */ + if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner) + flock.l_flock.owner = (unsigned long)file_lock->fl_pid; +#endif + + switch (fl_type) { + case F_RDLCK: + einfo.ei_mode = LCK_PR; + break; + case F_UNLCK: + /* An unlock request may or may not have any relation to + * existing locks so we may not be able to pass a lock handle + * via a normal ldlm_lock_cancel() request. The request may even + * unlock a byte range in the middle of an existing lock. In + * order to process an unlock request we need all of the same + * information that is given with a normal read or write record + * lock request. To avoid creating another ldlm unlock (cancel) + * message we'll treat a LCK_NL flock request as an unlock. */ + einfo.ei_mode = LCK_NL; + break; + case F_WRLCK: + einfo.ei_mode = LCK_PW; + break; + default: + CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type); + RETURN (-ENOTSUPP); + } + + switch (cmd) { + case F_SETLKW: +#ifdef F_SETLKW64 + case F_SETLKW64: +#endif + flags = 0; + break; + case F_SETLK: +#ifdef F_SETLK64 + case F_SETLK64: +#endif + flags = LDLM_FL_BLOCK_NOWAIT; + break; + case F_GETLK: +#ifdef F_GETLK64 + case F_GETLK64: +#endif + flags = LDLM_FL_TEST_LOCK; + break; + default: + CERROR("unknown fcntl lock command: %d\n", cmd); + RETURN (-EINVAL); + } + + /* Save the old mode so that if the mode in the lock changes we + * can decrement the appropriate reader or writer refcount. 
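+ * The original fl_type saved above is restored after md_enqueue() unless
+ * this is a TEST lock request.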
*/ + file_lock->fl_type = einfo.ei_mode; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, " + "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)), + flock.l_flock.pid, flags, einfo.ei_mode, + flock.l_flock.start, flock.l_flock.end); + + rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh, + flags); + + /* Restore the file lock type if not TEST lock. */ + if (!(flags & LDLM_FL_TEST_LOCK)) + file_lock->fl_type = fl_type; + +#ifdef HAVE_LOCKS_LOCK_FILE_WAIT + if ((rc == 0 || file_lock->fl_type == F_UNLCK) && + !(flags & LDLM_FL_TEST_LOCK)) + rc2 = locks_lock_file_wait(file, file_lock); +#else + if ((file_lock->fl_flags & FL_FLOCK) && + (rc == 0 || file_lock->fl_type == F_UNLCK)) + rc2 = flock_lock_file_wait(file, file_lock); + if ((file_lock->fl_flags & FL_POSIX) && + (rc == 0 || file_lock->fl_type == F_UNLCK) && + !(flags & LDLM_FL_TEST_LOCK)) + rc2 = posix_lock_file_wait(file, file_lock); +#endif /* HAVE_LOCKS_LOCK_FILE_WAIT */ + + if (rc2 && file_lock->fl_type != F_UNLCK) { + einfo.ei_mode = LCK_NL; + md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, + &lockh, flags); + rc = rc2; + } + + ll_finish_md_op_data(op_data); + + if (!rc) + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, + ktime_us_delta(ktime_get(), kstart)); + RETURN(rc); +} + +int ll_get_fid_by_name(struct inode *parent, const char *name, + int namelen, struct lu_fid *fid, + struct inode **inode) +{ + struct md_op_data *op_data = NULL; + struct mdt_body *body; + struct ptlrpc_request *req; + int rc; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE; + rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) + RETURN(rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out_req, rc = -EFAULT); + if (fid != NULL) + *fid = body->mbo_fid1; + + if (inode != NULL) + rc = ll_prep_inode(inode, &req->rq_pill, parent->i_sb, NULL); +out_req: + ptlrpc_req_finished(req); + RETURN(rc); +} + +int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum, + const char *name, __u32 flags) +{ + struct dentry *dchild = NULL; + struct inode *child_inode = NULL; + struct md_op_data *op_data; + struct ptlrpc_request *request = NULL; + struct obd_client_handle *och = NULL; + struct qstr qstr; + struct mdt_body *body; + __u64 data_version = 0; + size_t namelen = strlen(name); + int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic); + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n", + PFID(ll_inode2fid(parent)), name, + lum->lum_stripe_offset, lum->lum_stripe_count); + + if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) && + lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC)) + lustre_swab_lmv_user_md(lum); + + /* Get child FID first */ + qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen); + qstr.name = name; + qstr.len = namelen; + dchild = d_lookup(file_dentry(file), &qstr); + if (dchild) { + if (dchild->d_inode) + child_inode = igrab(dchild->d_inode); + dput(dchild); + } + + if (!child_inode) { + rc = ll_get_fid_by_name(parent, name, namelen, NULL, + &child_inode); + if (rc) + RETURN(rc); + } + + if (!child_inode) + RETURN(-ENOENT); + 
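+ /* MDTs without OBD_CONNECT2_DIR_MIGRATE cannot migrate striped
+ * directories, so such requests are rejected below. */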
+ if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) & + OBD_CONNECT2_DIR_MIGRATE)) { + if (le32_to_cpu(lum->lum_stripe_count) > 1 || + ll_dir_striped(child_inode)) { + CERROR("%s: MDT doesn't support stripe directory " + "migration!\n", ll_i2sbi(parent)->ll_fsname); + GOTO(out_iput, rc = -EOPNOTSUPP); + } + } + + /* + * lfs migrate command needs to be blocked on the client + * by checking the migrate FID against the FID of the + * filesystem root. + */ + if (is_root_inode(child_inode)) + GOTO(out_iput, rc = -EINVAL); + + op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, + child_inode->i_mode, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + GOTO(out_iput, rc = PTR_ERR(op_data)); + + inode_lock(child_inode); + op_data->op_fid3 = *ll_inode2fid(child_inode); + if (!fid_is_sane(&op_data->op_fid3)) { + CERROR("%s: migrate %s, but FID "DFID" is insane\n", + ll_i2sbi(parent)->ll_fsname, name, + PFID(&op_data->op_fid3)); + GOTO(out_unlock, rc = -EINVAL); + } + + op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA; + op_data->op_data = lum; + op_data->op_data_size = lumlen; + + /* migrate dirent only for subdirs if MDS_MIGRATE_NSONLY set */ + if (S_ISDIR(child_inode->i_mode) && (flags & MDS_MIGRATE_NSONLY) && + lmv_dir_layout_changing(ll_i2info(parent)->lli_lsm_md)) + op_data->op_bias |= MDS_MIGRATE_NSONLY; + +again: + if (S_ISREG(child_inode->i_mode)) { + och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0); + if (IS_ERR(och)) { + rc = PTR_ERR(och); + och = NULL; + GOTO(out_unlock, rc); + } + + rc = ll_data_version(child_inode, &data_version, + LL_DV_WR_FLUSH); + if (rc != 0) + GOTO(out_close, rc); + + op_data->op_open_handle = och->och_open_handle; + op_data->op_data_version = data_version; + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_bias |= MDS_CLOSE_MIGRATE; + + spin_lock(&och->och_mod->mod_open_req->rq_lock); + och->och_mod->mod_open_req->rq_replay = 0; + spin_unlock(&och->och_mod->mod_open_req->rq_lock); + } + + rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, + op_data->op_name, op_data->op_namelen, + op_data->op_name, op_data->op_namelen, &request); + if (rc == 0) { + LASSERT(request != NULL); + ll_update_times(request, parent); + } + + if (rc == 0 || rc == -EAGAIN) { + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + /* If the server does release layout lock, then we cleanup + * the client och here, otherwise release it in out_close: */ + if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) { + obd_mod_put(och->och_mod); + md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp, + och); + och->och_open_handle.cookie = DEAD_HANDLE_MAGIC; + OBD_FREE_PTR(och); + och = NULL; + } + } + + if (request != NULL) { + ptlrpc_req_finished(request); + request = NULL; + } + + /* Try again if the lease has cancelled. */ + if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) + goto again; + +out_close: + if (och) + ll_lease_close(och, child_inode, NULL); + if (!rc) + clear_nlink(child_inode); +out_unlock: + inode_unlock(child_inode); + ll_finish_md_op_data(op_data); +out_iput: + iput(child_inode); + RETURN(rc); +} + +static int +ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock) +{ + struct ll_file_data *fd = file->private_data; + ENTRY; + + /* + * In order to avoid flood of warning messages, only print one message + * for one file. And the entire message rate on the client is limited + * by CDEBUG_LIMIT too. 
+ */ + if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) { + fd->fd_flags |= LL_FILE_FLOCK_WARNING; + CDEBUG_LIMIT(D_CONSOLE, + "flock disabled, mount with '-o [local]flock' to enable\r\n"); + } + RETURN(-ENOSYS); +} + +/** + * test if some locks matching bits and l_req_mode are acquired + * - bits can be in different locks + * - if found clear the common lock bits in *bits + * - the bits not found, are kept in *bits + * \param inode [IN] + * \param bits [IN] searched lock bits [IN] + * \param l_req_mode [IN] searched lock mode + * \retval boolean, true iff all bits are found + */ +int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode) +{ + struct lustre_handle lockh; + union ldlm_policy_data policy; + enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ? + (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode; + struct lu_fid *fid; + __u64 flags; + int i; + ENTRY; + + if (!inode) + RETURN(0); + + fid = &ll_i2info(inode)->lli_fid; + CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid), + ldlm_lockname[mode]); + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK; + for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) { + policy.l_inodebits.bits = *bits & BIT(i); + if (policy.l_inodebits.bits == 0) + continue; + + if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, + &policy, mode, &lockh)) { + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(&lockh); + if (lock) { + *bits &= + ~(lock->l_policy_data.l_inodebits.bits); + LDLM_LOCK_PUT(lock); + } else { + *bits &= ~policy.l_inodebits.bits; + } + } + } + RETURN(*bits == 0); +} + +enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits, + struct lustre_handle *lockh, __u64 flags, + enum ldlm_mode mode) +{ + union ldlm_policy_data policy = { .l_inodebits = { bits } }; + struct lu_fid *fid; + enum ldlm_mode rc; + ENTRY; + + fid = &ll_i2info(inode)->lli_fid; + CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid)); + + rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags, + fid, LDLM_IBITS, &policy, mode, lockh); + + RETURN(rc); +} + +static int ll_inode_revalidate_fini(struct inode *inode, int rc) +{ + /* Already unlinked. Just update nlink and return success */ + if (rc == -ENOENT) { + clear_nlink(inode); + /* If it is striped directory, and there is bad stripe + * Let's revalidate the dentry again, instead of returning + * error */ + if (ll_dir_striped(inode)) + return 0; + + /* This path cannot be hit for regular files unless in + * case of obscure races, so no need to to validate + * size. */ + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return 0; + } else if (rc != 0) { + CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? 
D_INFO : D_ERROR, + "%s: revalidate FID "DFID" error: rc = %d\n", + ll_i2sbi(inode)->ll_fsname, + PFID(ll_inode2fid(inode)), rc); + } + + return rc; +} + +static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op) +{ + struct inode *parent; + struct inode *inode = dentry->d_inode; + struct obd_export *exp = ll_i2mdexp(inode); + struct lookup_intent oit = { + .it_op = op, + }; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data; + const char *name = NULL; + size_t namelen = 0; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n", + PFID(ll_inode2fid(inode)), inode, dentry->d_name.name); + + if (exp_connect_flags2(exp) & OBD_CONNECT2_GETATTR_PFID) { + parent = dentry->d_parent->d_inode; + name = dentry->d_name.name; + namelen = dentry->d_name.len; + } else { + parent = inode; + } + + op_data = ll_prep_md_op_data(NULL, parent, inode, name, namelen, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + /* Call getattr by fid */ + if (exp_connect_flags2(exp) & OBD_CONNECT2_GETATTR_PFID) + op_data->op_flags = MF_GETATTR_BY_FID; + rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + if (rc < 0) { + rc = ll_inode_revalidate_fini(inode, rc); + GOTO(out, rc); + } + + rc = ll_revalidate_it_finish(req, &oit, dentry); + if (rc != 0) { + ll_intent_release(&oit); + GOTO(out, rc); + } + + /* Unlinked? Unhash dentry, so it is not picked up later by + * do_lookup() -> ll_revalidate_it(). We cannot use d_drop + * here to preserve get_cwd functionality on 2.6. + * Bug 10503 */ + if (!dentry->d_inode->i_nlink) + d_lustre_invalidate(dentry); + + ll_lookup_finish_locks(&oit, dentry); +out: + ptlrpc_req_finished(req); + + return rc; +} + +static int ll_merge_md_attr(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_attr attr = { 0 }; + int rc; + + LASSERT(lli->lli_lsm_md != NULL); + + if (!lmv_dir_striped(lli->lli_lsm_md)) + RETURN(0); + + down_read(&lli->lli_lsm_sem); + rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md, + &attr, ll_md_blocking_ast); + up_read(&lli->lli_lsm_sem); + if (rc != 0) + RETURN(rc); + + spin_lock(&inode->i_lock); + set_nlink(inode, attr.cat_nlink); + spin_unlock(&inode->i_lock); + + inode->i_blocks = attr.cat_blocks; + i_size_write(inode, attr.cat_size); + + ll_i2info(inode)->lli_atime = attr.cat_atime; + ll_i2info(inode)->lli_mtime = attr.cat_mtime; + ll_i2info(inode)->lli_ctime = attr.cat_ctime; + + RETURN(0); +} + +int ll_getattr_dentry(struct dentry *de, struct kstat *stat, u32 request_mask, + unsigned int flags, bool foreign) +{ + struct inode *inode = de->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + struct inode *dir = de->d_parent->d_inode; + bool need_glimpse = true; + ktime_t kstart = ktime_get(); + int rc; + + /* The OST object(s) determine the file size, blocks and mtime. */ + if (!(request_mask & STATX_SIZE || request_mask & STATX_BLOCKS || + request_mask & STATX_MTIME)) + need_glimpse = false; + + if (dentry_may_statahead(dir, de)) + ll_start_statahead(dir, de, need_glimpse && + !(flags & AT_STATX_DONT_SYNC)); + + if (flags & AT_STATX_DONT_SYNC) + GOTO(fill_attr, rc = 0); + + rc = ll_inode_revalidate(de, IT_GETATTR); + if (rc < 0) + RETURN(rc); + + /* foreign file/dir are always of zero length, so don't + * need to validate size. 
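+	 * Put differently, the glimpse/PCC path below is only taken for
+	 * non-foreign regular files; foreign files and directories simply
+	 * report the attributes already cached from the MDT.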
+ */ + if (S_ISREG(inode->i_mode) && !foreign) { + bool cached; + + if (!need_glimpse) + GOTO(fill_attr, rc); + + rc = pcc_inode_getattr(inode, request_mask, flags, &cached); + if (cached && rc < 0) + RETURN(rc); + + if (cached) + GOTO(fill_attr, rc); + + /* + * If the returned attr is masked with OBD_MD_FLSIZE & + * OBD_MD_FLBLOCKS & OBD_MD_FLMTIME, it means that the file size + * or blocks obtained from MDT is strictly correct, and the file + * is usually not being modified by clients, and the [a|m|c]time + * got from MDT is also strictly correct. + * Under this circumstance, it does not need to send glimpse + * RPCs to OSTs for file attributes such as the size and blocks. + */ + if (lli->lli_attr_valid & OBD_MD_FLSIZE && + lli->lli_attr_valid & OBD_MD_FLBLOCKS && + lli->lli_attr_valid & OBD_MD_FLMTIME) { + inode->i_mtime.tv_sec = lli->lli_mtime; + if (lli->lli_attr_valid & OBD_MD_FLATIME) + inode->i_atime.tv_sec = lli->lli_atime; + if (lli->lli_attr_valid & OBD_MD_FLCTIME) + inode->i_ctime.tv_sec = lli->lli_ctime; + GOTO(fill_attr, rc); + } + + /* In case of restore, the MDT has the right size and has + * already send it back without granting the layout lock, + * inode is up-to-date so glimpse is useless. + * Also to glimpse we need the layout, in case of a running + * restore the MDT holds the layout lock so the glimpse will + * block up to the end of restore (getattr will block) + */ + if (!test_bit(LLIF_FILE_RESTORING, &lli->lli_flags)) { + rc = ll_glimpse_size(inode); + if (rc < 0) + RETURN(rc); + } + } else { + /* If object isn't regular a file then don't validate size. */ + /* foreign dir is not striped dir */ + if (ll_dir_striped(inode) && !foreign) { + rc = ll_merge_md_attr(inode); + if (rc < 0) + RETURN(rc); + } + + if (lli->lli_attr_valid & OBD_MD_FLATIME) + inode->i_atime.tv_sec = lli->lli_atime; + if (lli->lli_attr_valid & OBD_MD_FLMTIME) + inode->i_mtime.tv_sec = lli->lli_mtime; + if (lli->lli_attr_valid & OBD_MD_FLCTIME) + inode->i_ctime.tv_sec = lli->lli_ctime; + } + +fill_attr: + OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30); + + if (ll_need_32bit_api(sbi)) { + stat->ino = cl_fid_build_ino(&lli->lli_fid, 1); + stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev); + stat->rdev = ll_compat_encode_dev(inode->i_rdev); + } else { + stat->ino = inode->i_ino; + stat->dev = inode->i_sb->s_dev; + stat->rdev = inode->i_rdev; + } + + /* foreign symlink to be exposed as a real symlink */ + if (!foreign) + stat->mode = inode->i_mode; + else + stat->mode = (inode->i_mode & ~S_IFMT) | S_IFLNK; + + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + /* stat->blksize is used to tell about preferred IO size */ + if (sbi->ll_stat_blksize) + stat->blksize = sbi->ll_stat_blksize; + else if (S_ISREG(inode->i_mode)) + stat->blksize = 1 << min(PTLRPC_MAX_BRW_BITS + 1, + LL_MAX_BLKSIZE_BITS); + else + stat->blksize = 1 << inode->i_sb->s_blocksize_bits; + + stat->nlink = inode->i_nlink; + stat->size = i_size_read(inode); + stat->blocks = inode->i_blocks; + +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR) + if (flags & AT_STATX_DONT_SYNC) { + if (stat->size == 0 && + lli->lli_attr_valid & OBD_MD_FLLAZYSIZE) + stat->size = lli->lli_lazysize; + if (stat->blocks == 0 && + lli->lli_attr_valid & OBD_MD_FLLAZYBLOCKS) + stat->blocks = lli->lli_lazyblocks; + } + + if (lli->lli_attr_valid & OBD_MD_FLBTIME) { + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = 
lli->lli_btime; + } + + stat->attributes_mask = STATX_ATTR_IMMUTABLE | STATX_ATTR_APPEND; +#ifdef HAVE_LUSTRE_CRYPTO + stat->attributes_mask |= STATX_ATTR_ENCRYPTED; +#endif + stat->attributes |= ll_inode_to_ext_flags(inode->i_flags); + /* if Lustre specific LUSTRE_ENCRYPT_FL flag is set, also set + * ext4 equivalent to please statx + */ + if (stat->attributes & LUSTRE_ENCRYPT_FL) + stat->attributes |= STATX_ATTR_ENCRYPTED; + stat->result_mask &= request_mask; +#endif + + ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, + ktime_us_delta(ktime_get(), kstart)); + + return 0; +} + +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR) +int ll_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) +{ + return ll_getattr_dentry(path->dentry, stat, request_mask, flags, + false); +} +#else +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) +{ + return ll_getattr_dentry(de, stat, STATX_BASIC_STATS, + AT_STATX_SYNC_AS_STAT, false); +} +#endif + +int cl_falloc(struct file *file, struct inode *inode, int mode, loff_t offset, + loff_t len) +{ + struct lu_env *env; + struct cl_io *io; + __u16 refcheck; + int rc; + loff_t size = i_size_read(inode); + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + ll_io_set_mirror(io, file); + + io->ci_verify_layout = 1; + io->u.ci_setattr.sa_parent_fid = lu_object_fid(&io->ci_obj->co_lu); + io->u.ci_setattr.sa_falloc_mode = mode; + io->u.ci_setattr.sa_falloc_offset = offset; + io->u.ci_setattr.sa_falloc_end = offset + len; + io->u.ci_setattr.sa_subtype = CL_SETATTR_FALLOCATE; + + CDEBUG(D_INODE, "UID %u GID %u\n", + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); + + io->u.ci_setattr.sa_falloc_uid = from_kuid(&init_user_ns, inode->i_uid); + io->u.ci_setattr.sa_falloc_gid = from_kgid(&init_user_ns, inode->i_gid); + + if (io->u.ci_setattr.sa_falloc_end > size) { + loff_t newsize = io->u.ci_setattr.sa_falloc_end; + + /* Check new size against VFS/VM file size limit and rlimit */ + rc = inode_newsize_ok(inode, newsize); + if (rc) + goto out; + if (newsize > ll_file_maxbytes(inode)) { + CDEBUG(D_INODE, "file size too large %llu > %llu\n", + (unsigned long long)newsize, + ll_file_maxbytes(inode)); + rc = -EFBIG; + goto out; + } + } + + do { + rc = cl_io_init(env, io, CIT_SETATTR, io->ci_obj); + if (!rc) + rc = cl_io_loop(env, io); + else + rc = io->ci_result; + cl_io_fini(env, io); + } while (unlikely(io->ci_need_restart)); + +out: + cl_env_put(env, &refcheck); + RETURN(rc); +} + +long ll_fallocate(struct file *filp, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(filp); + int rc; + + if (offset < 0 || len <= 0) + RETURN(-EINVAL); + /* + * Encrypted inodes can't handle collapse range or zero range or insert + * range since we would need to re-encrypt blocks with a different IV or + * XTS tweak (which are based on the logical block number). + * Similar to what ext4 does. + */ + if (IS_ENCRYPTED(inode) && + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | + FALLOC_FL_ZERO_RANGE))) + RETURN(-EOPNOTSUPP); + + /* + * mode == 0 (which is standard prealloc) and PUNCH is supported + * Rest of mode options are not supported yet. 
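+	 *
+	 * e.g. fallocate(fd, 0, offset, len) and
+	 * fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len)
+	 * are accepted, while FALLOC_FL_COLLAPSE_RANGE, FALLOC_FL_INSERT_RANGE
+	 * and FALLOC_FL_ZERO_RANGE are rejected with -EOPNOTSUPP below.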
+ */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + RETURN(-EOPNOTSUPP); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FALLOCATE, 1); + + rc = cl_falloc(filp, inode, mode, offset, len); + /* + * ENOTSUPP (524) is an NFSv3 specific error code erroneously + * used by Lustre in several places. Retuning it here would + * confuse applications that explicity test for EOPNOTSUPP + * (95) and fall back to ftruncate(). + */ + if (rc == -ENOTSUPP) + rc = -EOPNOTSUPP; + + RETURN(rc); +} + +static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + int rc; + size_t num_bytes; + struct fiemap *fiemap; + unsigned int extent_count = fieinfo->fi_extents_max; + + num_bytes = sizeof(*fiemap) + (extent_count * + sizeof(struct fiemap_extent)); + OBD_ALLOC_LARGE(fiemap, num_bytes); + + if (fiemap == NULL) + RETURN(-ENOMEM); + + fiemap->fm_flags = fieinfo->fi_flags; + fiemap->fm_extent_count = fieinfo->fi_extents_max; + fiemap->fm_start = start; + fiemap->fm_length = len; + if (extent_count > 0 && + copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start, + sizeof(struct fiemap_extent)) != 0) + GOTO(out, rc = -EFAULT); + + rc = ll_do_fiemap(inode, fiemap, num_bytes); + + if (IS_ENCRYPTED(inode)) { + int i; + + for (i = 0; i < fiemap->fm_mapped_extents; i++) + fiemap->fm_extents[i].fe_flags |= + FIEMAP_EXTENT_DATA_ENCRYPTED | + FIEMAP_EXTENT_ENCODED; + } + + fieinfo->fi_flags = fiemap->fm_flags; + fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents; + if (extent_count > 0 && + copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0], + fiemap->fm_mapped_extents * + sizeof(struct fiemap_extent)) != 0) + GOTO(out, rc = -EFAULT); +out: + OBD_FREE_LARGE(fiemap, num_bytes); + return rc; +} + +int ll_inode_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) +{ + int rc = 0; + struct ll_sb_info *sbi; + struct root_squash_info *squash; + struct cred *cred = NULL; + const struct cred *old_cred = NULL; + bool squash_id = false; + ktime_t kstart = ktime_get(); + + ENTRY; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + /* + * as root inode are NOT getting validated in lookup operation, + * need to do it before permission check. 
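+	 * (ll_inode_revalidate() below issues an IT_LOOKUP intent so the MDT
+	 * attributes of the root inode are refreshed before
+	 * generic_permission() is consulted.)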
+ */ + + if (is_root_inode(inode)) { + rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP); + if (rc) + RETURN(rc); + } + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n", + PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask); + + /* squash fsuid/fsgid if needed */ + sbi = ll_i2sbi(inode); + squash = &sbi->ll_squash; + if (unlikely(squash->rsi_uid != 0 && + uid_eq(current_fsuid(), GLOBAL_ROOT_UID) && + !test_bit(LL_SBI_NOROOTSQUASH, sbi->ll_flags))) { + squash_id = true; + } + if (squash_id) { + CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n", + __kuid_val(current_fsuid()), __kgid_val(current_fsgid()), + squash->rsi_uid, squash->rsi_gid); + + /* update current process's credentials + * and FS capability */ + cred = prepare_creds(); + if (cred == NULL) + RETURN(-ENOMEM); + + cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid); + cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid); + cred->cap_effective = cap_drop_nfsd_set(cred->cap_effective); + cred->cap_effective = cap_drop_fs_set(cred->cap_effective); + + old_cred = override_creds(cred); + } + + rc = generic_permission(mnt_userns, inode, mask); + /* restore current process's credentials and FS capability */ + if (squash_id) { + revert_creds(old_cred); + put_cred(cred); + } + + if (!rc) + ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(rc); +} + +/* -o localflock - only provides locally consistent flock locks */ +static const struct file_operations ll_file_operations = { +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +# ifdef HAVE_SYNC_READ_WRITE + .read = new_sync_read, + .write = new_sync_write, +# endif + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, +#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .read = ll_file_read, + .aio_read = ll_file_aio_read, + .write = ll_file_write, + .aio_write = ll_file_aio_write, +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, +#ifndef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT + .splice_read = generic_file_splice_read, +#else + .splice_read = pcc_file_splice_read, +#endif +#ifdef HAVE_ITER_FILE_SPLICE_WRITE + .splice_write = iter_file_splice_write, +#endif + .fsync = ll_fsync, + .flush = ll_flush, + .fallocate = ll_fallocate, +}; + +static const struct file_operations ll_file_operations_flock = { +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +# ifdef HAVE_SYNC_READ_WRITE + .read = new_sync_read, + .write = new_sync_write, +# endif /* HAVE_SYNC_READ_WRITE */ + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, +#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .read = ll_file_read, + .aio_read = ll_file_aio_read, + .write = ll_file_write, + .aio_write = ll_file_aio_write, +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, +#ifndef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT + .splice_read = generic_file_splice_read, +#else + .splice_read = pcc_file_splice_read, +#endif +#ifdef HAVE_ITER_FILE_SPLICE_WRITE + .splice_write = iter_file_splice_write, +#endif + .fsync = ll_fsync, + .flush = ll_flush, + .flock = ll_file_flock, + .lock = ll_file_flock, + .fallocate = ll_fallocate, +}; + +/* These are for -o noflock - to return ENOSYS on flock calls */ +static const struct file_operations ll_file_operations_noflock = { 
+#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +# ifdef HAVE_SYNC_READ_WRITE + .read = new_sync_read, + .write = new_sync_write, +# endif /* HAVE_SYNC_READ_WRITE */ + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, +#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .read = ll_file_read, + .aio_read = ll_file_aio_read, + .write = ll_file_write, + .aio_write = ll_file_aio_write, +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, +#ifndef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT + .splice_read = generic_file_splice_read, +#else + .splice_read = pcc_file_splice_read, +#endif +#ifdef HAVE_ITER_FILE_SPLICE_WRITE + .splice_write = iter_file_splice_write, +#endif + .fsync = ll_fsync, + .flush = ll_flush, + .flock = ll_file_noflock, + .lock = ll_file_noflock, + .fallocate = ll_fallocate, +}; + +const struct inode_operations ll_file_inode_operations = { + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, + .fiemap = ll_fiemap, + .get_acl = ll_get_acl, +#ifdef HAVE_IOP_SET_ACL + .set_acl = ll_set_acl, +#endif +}; + +const struct file_operations *ll_select_file_operations(struct ll_sb_info *sbi) +{ + const struct file_operations *fops = &ll_file_operations_noflock; + + if (test_bit(LL_SBI_FLOCK, sbi->ll_flags)) + fops = &ll_file_operations_flock; + else if (test_bit(LL_SBI_LOCALFLOCK, sbi->ll_flags)) + fops = &ll_file_operations; + + return fops; +} + +int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct lu_env *env; + int rc; + __u16 refcheck; + ENTRY; + + if (obj == NULL) + RETURN(0); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_conf_set(env, lli->lli_clob, conf); + if (rc < 0) + GOTO(out, rc); + + if (conf->coc_opc == OBJECT_CONF_SET) { + struct ldlm_lock *lock = conf->coc_lock; + struct cl_layout cl = { + .cl_layout_gen = 0, + }; + + LASSERT(lock != NULL); + LASSERT(ldlm_has_layout(lock)); + + /* it can only be allowed to match after layout is + * applied to inode otherwise false layout would be + * seen. Applying layout shoud happen before dropping + * the intent lock. */ + ldlm_lock_allow_match(lock); + + rc = cl_object_layout_get(env, obj, &cl); + if (rc < 0) + GOTO(out, rc); + + CDEBUG(D_VFSTRACE, + DFID": layout version change: %u -> %u\n", + PFID(&lli->lli_fid), ll_layout_version_get(lli), + cl.cl_layout_gen); + ll_layout_version_set(lli, cl.cl_layout_gen); + } + +out: + cl_env_put(env, &refcheck); + + RETURN(rc < 0 ? 
rc : 0); +} + +/* Fetch layout from MDT with getxattr request, if it's not ready yet */ +static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) + +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req; + void *lvbdata; + void *lmm; + int lmmsize; + int rc; + ENTRY; + + CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n", + PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock), + lock->l_lvb_data, lock->l_lvb_len); + + if (lock->l_lvb_data != NULL) + RETURN(0); + + /* if layout lock was granted right away, the layout is returned + * within DLM_LVB of dlm reply; otherwise if the lock was ever + * blocked and then granted via completion ast, we have to fetch + * layout here. Please note that we can't use the LVB buffer in + * completion AST because it doesn't have a large enough buffer */ + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc < 0) + RETURN(rc); + + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR, + XATTR_NAME_LOV, lmmsize, &req); + if (rc < 0) { + if (rc == -ENODATA) + GOTO(out, rc = 0); /* empty layout */ + else + RETURN(rc); + } + + lmmsize = rc; + rc = 0; + if (lmmsize == 0) /* empty layout */ + GOTO(out, rc = 0); + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize); + if (lmm == NULL) + GOTO(out, rc = -EFAULT); + + OBD_ALLOC_LARGE(lvbdata, lmmsize); + if (lvbdata == NULL) + GOTO(out, rc = -ENOMEM); + + memcpy(lvbdata, lmm, lmmsize); + lock_res_and_lock(lock); + if (unlikely(lock->l_lvb_data == NULL)) { + lock->l_lvb_type = LVB_T_LAYOUT; + lock->l_lvb_data = lvbdata; + lock->l_lvb_len = lmmsize; + lvbdata = NULL; + } + unlock_res_and_lock(lock); + + if (lvbdata) + OBD_FREE_LARGE(lvbdata, lmmsize); + + EXIT; + +out: + ptlrpc_req_finished(req); + return rc; +} + +/** + * Apply the layout to the inode. Layout lock is held and will be released + * in this function. + */ +static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode, + struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ldlm_lock *lock; + struct cl_object_conf conf; + int rc = 0; + bool lvb_ready; + bool wait_layout = false; + ENTRY; + + LASSERT(lustre_handle_is_used(lockh)); + + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + + if (!ldlm_has_layout(lock)) + GOTO(out, rc = -EAGAIN); + + LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured", + PFID(&lli->lli_fid), inode); + + /* in case this is a caching lock and reinstate with new inode */ + md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL); + + lock_res_and_lock(lock); + lvb_ready = ldlm_is_lvb_ready(lock); + unlock_res_and_lock(lock); + + /* checking lvb_ready is racy but this is okay. The worst case is + * that multi processes may configure the file on the same time. */ + if (lvb_ready) + GOTO(out, rc = 0); + + rc = ll_layout_fetch(inode, lock); + if (rc < 0) + GOTO(out, rc); + + /* for layout lock, lmm is stored in lock's lvb. + * lvb_data is immutable if the lock is held so it's safe to access it + * without res lock. + * + * set layout to file. 
Unlikely this will fail as old layout was + * surely eliminated */ + memset(&conf, 0, sizeof conf); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = inode; + conf.coc_lock = lock; + conf.u.coc_layout.lb_buf = lock->l_lvb_data; + conf.u.coc_layout.lb_len = lock->l_lvb_len; + rc = ll_layout_conf(inode, &conf); + + /* refresh layout failed, need to wait */ + wait_layout = rc == -EBUSY; + EXIT; +out: + LDLM_LOCK_PUT(lock); + ldlm_lock_decref(lockh, mode); + + /* wait for IO to complete if it's still being used. */ + if (wait_layout) { + CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n", + sbi->ll_fsname, PFID(&lli->lli_fid), inode); + + memset(&conf, 0, sizeof conf); + conf.coc_opc = OBJECT_CONF_WAIT; + conf.coc_inode = inode; + rc = ll_layout_conf(inode, &conf); + if (rc == 0) + rc = -EAGAIN; + + CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n", + sbi->ll_fsname, PFID(&lli->lli_fid), rc); + } + RETURN(rc); +} + +/** + * Issue layout intent RPC to MDS. + * \param inode [in] file inode + * \param intent [in] layout intent + * + * \retval 0 on success + * \retval < 0 error code + */ +static int ll_layout_intent(struct inode *inode, struct layout_intent *intent) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct lookup_intent it; + struct ptlrpc_request *req; + int rc; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_data = intent; + op_data->op_data_size = sizeof(*intent); + + memset(&it, 0, sizeof(it)); + it.it_op = IT_LAYOUT; + if (intent->li_opc == LAYOUT_INTENT_WRITE || + intent->li_opc == LAYOUT_INTENT_TRUNC) + it.it_flags = FMODE_WRITE; + + LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)", + sbi->ll_fsname, PFID(&lli->lli_fid), inode); + + rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req, + &ll_md_blocking_ast, 0); + if (it.it_request != NULL) + ptlrpc_req_finished(it.it_request); + it.it_request = NULL; + + ll_finish_md_op_data(op_data); + + /* set lock data in case this is a new lock */ + if (!rc) + ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); + + ll_intent_drop_lock(&it); + + RETURN(rc); +} + +/** + * This function checks if there exists a LAYOUT lock on the client side, + * or enqueues it if it doesn't have one in cache. + * + * This function will not hold layout lock so it may be revoked any time after + * this function returns. Any operations depend on layout should be redone + * in that case. + * + * This function should be called before lov_io_init() to get an uptodate + * layout version, the caller should save the version number and after IO + * is finished, this function should be called again to verify that layout + * is not changed during IO time. + */ +int ll_layout_refresh(struct inode *inode, __u32 *gen) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct lustre_handle lockh; + struct layout_intent intent = { + .li_opc = LAYOUT_INTENT_ACCESS, + }; + enum ldlm_mode mode; + int rc; + ENTRY; + + *gen = ll_layout_version_get(lli); + if (!test_bit(LL_SBI_LAYOUT_LOCK, sbi->ll_flags) || + *gen != CL_LAYOUT_GEN_NONE) + RETURN(0); + + /* sanity checks */ + LASSERT(fid_is_sane(ll_inode2fid(inode))); + + /* take layout lock mutex to enqueue layout lock exclusively. 
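	 * Only one thread per inode issues the LAYOUT intent RPC at a time;
	 * later callers usually find the lock already cached and match it in
	 * the loop below without another round trip to the MDT.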
*/ + mutex_lock(&lli->lli_layout_mutex); + + while (1) { + /* mostly layout lock is caching on the local side, so try to + * match it before grabbing layout lock mutex. */ + mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0, + LCK_CR | LCK_CW | LCK_PR | + LCK_PW | LCK_EX); + if (mode != 0) { /* hit cached lock */ + rc = ll_layout_lock_set(&lockh, mode, inode); + if (rc == -EAGAIN) + continue; + break; + } + + rc = ll_layout_intent(inode, &intent); + if (rc != 0) + break; + } + + if (rc == 0) + *gen = ll_layout_version_get(lli); + mutex_unlock(&lli->lli_layout_mutex); + + RETURN(rc); +} + +/** + * Issue layout intent RPC indicating where in a file an IO is about to write. + * + * \param[in] inode file inode. + * \param[in] ext write range with start offset of fille in bytes where + * an IO is about to write, and exclusive end offset in + * bytes. + * + * \retval 0 on success + * \retval < 0 error code + */ +int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc, + struct lu_extent *ext) +{ + struct layout_intent intent = { + .li_opc = opc, + .li_extent.e_start = ext->e_start, + .li_extent.e_end = ext->e_end, + }; + int rc; + ENTRY; + + rc = ll_layout_intent(inode, &intent); + + RETURN(rc); +} + +/** + * This function send a restore request to the MDT + */ +int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length) +{ + struct hsm_user_request *hur; + int len, rc; + ENTRY; + + len = sizeof(struct hsm_user_request) + + sizeof(struct hsm_user_item); + OBD_ALLOC(hur, len); + if (hur == NULL) + RETURN(-ENOMEM); + + hur->hur_request.hr_action = HUA_RESTORE; + hur->hur_request.hr_archive_id = 0; + hur->hur_request.hr_flags = 0; + memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid, + sizeof(hur->hur_user_item[0].hui_fid)); + hur->hur_user_item[0].hui_extent.offset = offset; + hur->hur_user_item[0].hui_extent.length = length; + hur->hur_request.hr_itemcount = 1; + rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp, + len, hur, NULL); + OBD_FREE(hur, len); + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/foreign_symlink.h b/drivers/staging/lustrefsx/lustre/llite/foreign_symlink.h new file mode 100644 index 0000000000000..b1c10e4c6156e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/foreign_symlink.h @@ -0,0 +1,48 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef LLITE_FOREIGN_SYMLINK_H +#define LLITE_FOREIGN_SYMLINK_H + +/* llite/llite_foreign_symlink.c */ +ssize_t foreign_symlink_enable_show(struct kobject *kobj, + struct attribute *attr, char *buf); +ssize_t foreign_symlink_enable_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t foreign_symlink_prefix_show(struct kobject *kobj, + struct attribute *attr, char *buf); +ssize_t foreign_symlink_prefix_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t foreign_symlink_upcall_show(struct kobject *kobj, + struct attribute *attr, char *buf); +ssize_t foreign_symlink_upcall_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t foreign_symlink_upcall_info_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +extern struct inode_operations ll_foreign_file_symlink_inode_operations; +extern struct inode_operations ll_foreign_dir_symlink_inode_operations; + +#endif /* LLITE_FOREIGN_SYMLINK_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/glimpse.c b/drivers/staging/lustrefsx/lustre/llite/glimpse.c new file mode 100644 index 0000000000000..bd5e6b691cff0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/glimpse.c @@ -0,0 +1,228 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * glimpse code used by vvp (and other Lustre clients in the future). + * + * Author: Nikita Danilov + * Author: Oleg Drokin + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "cl_object.h" +#include "llite_internal.h" +#include "vvp_internal.h" + +static const struct cl_lock_descr whole_file = { + .cld_start = 0, + .cld_end = CL_PAGE_EOF, + .cld_mode = CLM_READ +}; + +/* + * Check whether file has possible unwritten pages. + * + * \retval 1 file is mmap-ed or has dirty pages + * 0 otherwise + */ +blkcnt_t dirty_cnt(struct inode *inode) +{ + blkcnt_t cnt = 0; + struct vvp_object *vob = cl_inode2vvp(inode); + void *results[1]; + + if (inode->i_mapping != NULL) + cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree, + results, 0, 1, + PAGECACHE_TAG_DIRTY); + if (cnt == 0 && atomic_read(&vob->vob_mmap_cnt) > 0) + cnt = 1; + + return (cnt > 0) ? 
1 : 0; +} + +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl) +{ + const struct lu_fid *fid = lu_object_fid(&clob->co_lu); + struct cl_lock *lock = vvp_env_lock(env); + struct cl_lock_descr *descr = &lock->cll_descr; + int result; + + ENTRY; + result = 0; + + CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid)); + + /* NOTE: this looks like DLM lock request, but it may + * not be one. Due to CEF_GLIMPSE flag (translated + * to LDLM_FL_HAS_INTENT by osc), this is + * glimpse request, that won't revoke any + * conflicting DLM locks held. Instead, + * ll_glimpse_callback() will be called on each + * client holding a DLM lock against this file, + * and resulting size will be returned for each + * stripe. DLM lock on [0, EOF] is acquired only + * if there were no conflicting locks. If there + * were conflicting locks, enqueuing or waiting + * fails with -ENAVAIL, but valid inode + * attributes are returned anyway. */ + *descr = whole_file; + descr->cld_obj = clob; + descr->cld_mode = CLM_READ; + descr->cld_enq_flags = CEF_GLIMPSE | CEF_MUST; + if (agl) + descr->cld_enq_flags |= CEF_SPECULATIVE | CEF_NONBLOCK; + /* + * CEF_MUST protects glimpse lock from conversion into + * a lockless mode. + */ + result = cl_lock_request(env, io, lock); + if (result < 0) + RETURN(result); + + if (!agl) { + ll_merge_attr(env, inode); + if (i_size_read(inode) > 0 && inode->i_blocks == 0) { + /* + * LU-417: Add dirty pages block count + * lest i_blocks reports 0, some "cp" or + * "tar" may think it's a completely + * sparse file and skip it. + */ + inode->i_blocks = dirty_cnt(inode); + } + } + + cl_lock_release(env, lock); + + RETURN(result); +} + +/** + * Get an IO environment for special operations such as glimpse locks and + * manually requested locks (ladvise lockahead) + * + * \param[in] inode inode the operation is being performed on + * \param[out] envout thread specific execution environment + * \param[out] ioout client io description + * \param[out] refcheck reference check + * + * \retval 1 on success + * \retval 0 not a regular file, cannot get environment + * \retval negative negative errno on error + */ +int cl_io_get(struct inode *inode, struct lu_env **envout, + struct cl_io **ioout, u16 *refcheck) +{ + struct lu_env *env; + struct cl_io *io; + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *clob = lli->lli_clob; + int result; + + if (S_ISREG(inode->i_mode)) { + env = cl_env_get(refcheck); + if (!IS_ERR(env)) { + io = vvp_env_thread_io(env); + io->ci_obj = clob; + *envout = env; + *ioout = io; + result = 1; + } else { + result = PTR_ERR(env); + } + } else { + result = 0; + } + return result; +} + +int cl_glimpse_size0(struct inode *inode, int agl) +{ + /* + * We don't need ast_flags argument to cl_glimpse_size(), because + * osc_lock_enqueue() takes care of the possible deadlock that said + * argument was introduced to avoid. + */ + /* + * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to + * cl_glimpse_size(), which doesn't make sense: glimpse locks are not + * blocking anyway. + */ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + u16 refcheck; + int retried = 0; + int result; + + ENTRY; + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result <= 0) + RETURN(result); + + do { + io->ci_ndelay_tried = retried++; + io->ci_ndelay = io->ci_verify_layout = 1; + result = cl_io_init(env, io, CIT_GLIMPSE, io->ci_obj); + if (result > 0) { + /* + * nothing to do for this io. 
This currently happens + * when stripe sub-object's are not yet created. + */ + result = io->ci_result; + } else if (result == 0) { + result = cl_glimpse_lock(env, io, inode, io->ci_obj, + agl); + /** + * need to limit retries for FLR mirrors if fast read + * is short because of concurrent truncate. + */ + if (!agl && result == -EAGAIN && + !io->ci_tried_all_mirrors) + io->ci_need_restart = 1; + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, cfs_fail_val ?: 4); + cl_io_fini(env, io); + } while (unlikely(io->ci_need_restart)); + + cl_env_put(env, &refcheck); + RETURN(result); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c new file mode 100644 index 0000000000000..dfc7edf29e81a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c @@ -0,0 +1,287 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "llite_internal.h" +#include "vvp_internal.h" + +/** + * An `emergency' environment used by cl_inode_fini() when cl_env_get() + * fails. Access to this environment is serialized by cl_inode_fini_guard + * mutex. + */ +struct lu_env *cl_inode_fini_env; +__u16 cl_inode_fini_refcheck; + +/** + * A mutex serializing calls to slp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. 
+ */ +static DEFINE_MUTEX(cl_inode_fini_guard); + +int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, + enum op_xvalid xvalid, unsigned int attr_flags) +{ + struct lu_env *env; + struct cl_io *io; + int result; + __u16 refcheck; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = obj; + io->ci_verify_layout = 1; + + io->u.ci_setattr.sa_attr.lvb_atime = attr->ia_atime.tv_sec; + io->u.ci_setattr.sa_attr.lvb_mtime = attr->ia_mtime.tv_sec; + io->u.ci_setattr.sa_attr.lvb_ctime = attr->ia_ctime.tv_sec; + io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size; + io->u.ci_setattr.sa_attr_flags = attr_flags; + io->u.ci_setattr.sa_avalid = attr->ia_valid; + io->u.ci_setattr.sa_xvalid = xvalid; + io->u.ci_setattr.sa_parent_fid = lu_object_fid(&obj->co_lu); + if (attr->ia_valid & ATTR_SIZE) + io->u.ci_setattr.sa_subtype = CL_SETATTR_TRUNC; +again: + if (attr->ia_valid & ATTR_FILE) + ll_io_set_mirror(io, attr->ia_file); + + if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) { + struct vvp_io *vio = vvp_env_io(env); + + if (attr->ia_valid & ATTR_FILE) + /* + * populate the file descriptor for ftruncate to honor + * group lock - see LU-787 + */ + vio->vui_fd = attr->ia_file->private_data; + + result = cl_io_loop(env, io); + } else { + result = io->ci_result; + } + cl_io_fini(env, io); + if (unlikely(io->ci_need_restart)) + goto again; + + cl_env_put(env, &refcheck); + RETURN(result); +} + +/** + * Initialize or update CLIO structures for regular files when new + * meta-data arrives from the server. + * + * \param inode regular file inode + * \param md new file metadata from MDS + * - allocates cl_object if necessary, + * - updated layout, if object was already here. + */ +int cl_file_inode_init(struct inode *inode, struct lustre_md *md) +{ + struct lu_env *env; + struct ll_inode_info *lli; + struct cl_object *clob; + struct lu_site *site; + struct lu_fid *fid; + struct cl_object_conf conf = { + .coc_inode = inode, + .u = { + .coc_layout = md->layout, + } + }; + int result = 0; + __u16 refcheck; + + LASSERT(md->body->mbo_valid & OBD_MD_FLID); + LASSERT(S_ISREG(inode->i_mode)); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + site = ll_i2sbi(inode)->ll_site; + lli = ll_i2info(inode); + fid = &lli->lli_fid; + LASSERT(fid_is_sane(fid)); + + if (lli->lli_clob == NULL) { + /* clob is slave of inode, empty lli_clob means for new inode, + * there is no clob in cache with the given fid, so it is + * unnecessary to perform lookup-alloc-lookup-insert, just + * alloc and insert directly. + */ + if (!(inode->i_state & I_NEW)) { + result = -EIO; + CERROR("%s: unexpected not-NEW inode "DFID": rc = %d\n", + ll_i2sbi(inode)->ll_fsname, PFID(fid), result); + goto out; + } + + conf.coc_lu.loc_flags = LOC_F_NEW; + clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev), + fid, &conf); + if (!IS_ERR(clob)) { + /* + * No locking is necessary, as new inode is + * locked by I_NEW bit. 
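+		 * (Other lookups of this inode are expected to wait until
+		 * I_NEW is cleared, so publishing lli_clob here without a
+		 * lock is safe.)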
+ */ + lli->lli_clob = clob; + lu_object_ref_add(&clob->co_lu, "inode", inode); + } else { + result = PTR_ERR(clob); + } + } else { + result = cl_conf_set(env, lli->lli_clob, &conf); + if (result == -EBUSY) { + /* ignore the error since I/O will handle it later */ + result = 0; + } + } + + if (result != 0) + CERROR("%s: failed to initialize cl_object "DFID": rc = %d\n", + ll_i2sbi(inode)->ll_fsname, PFID(fid), result); + +out: + cl_env_put(env, &refcheck); + + return result; +} + +/** + * Wait for others drop their references of the object at first, then we drop + * the last one, which will lead to the object be destroyed immediately. + * Must be called after cl_object_kill() against this object. + * + * The reason we want to do this is: destroying top object will wait for sub + * objects being destroyed first, so we can't let bottom layer (e.g. from ASTs) + * to initiate top object destroying which may deadlock. See bz22520. + */ +static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) +{ + struct lu_object_header *header = obj->co_lu.lo_header; + + if (unlikely(atomic_read(&header->loh_ref) != 1)) { + struct lu_site *site = obj->co_lu.lo_dev->ld_site; + wait_queue_head_t *wq; + + wq = lu_site_wq_from_fid(site, &header->loh_fid); + + wait_event(*wq, atomic_read(&header->loh_ref) == 1); + } + + cl_object_put(env, obj); +} + +void cl_inode_fini(struct inode *inode) +{ + struct lu_env *env; + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *clob = lli->lli_clob; + __u16 refcheck; + int emergency; + + if (clob != NULL) { + env = cl_env_get(&refcheck); + emergency = IS_ERR(env); + if (emergency) { + mutex_lock(&cl_inode_fini_guard); + LASSERT(cl_inode_fini_env != NULL); + env = cl_inode_fini_env; + } + + /* + * cl_object cache is a slave to inode cache (which, in turn + * is a slave to dentry cache), don't keep cl_object in memory + * when its master is evicted. + */ + cl_object_kill(env, clob); + lu_object_ref_del(&clob->co_lu, "inode", inode); + cl_object_put_last(env, clob); + lli->lli_clob = NULL; + if (emergency) + mutex_unlock(&cl_inode_fini_guard); + else + cl_env_put(env, &refcheck); + } +} + +/** + * build inode number from passed @fid. + * + * For 32-bit systems or syscalls limit the inode number to a 32-bit value + * to avoid EOVERFLOW errors. This will inevitably result in inode number + * collisions, but fid_flatten32() tries hard to avoid this if possible. + */ +__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32) +{ + if (BITS_PER_LONG == 32 || api32) + RETURN(fid_flatten32(fid)); + + RETURN(fid_flatten(fid)); +} + +/** + * build inode generation from passed @fid. If our FID overflows the 32-bit + * inode number then return a non-zero generation to distinguish them. + */ +__u32 cl_fid_build_gen(const struct lu_fid *fid) +{ + if (fid_is_igif(fid)) + RETURN(lu_igif_gen(fid)); + + RETURN(fid_flatten(fid) >> 32); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c new file mode 100644 index 0000000000000..70290ad705018 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c @@ -0,0 +1,189 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * cl code used by vvp (and other Lustre clients in the future). + * + */ +#define DEBUG_SUBSYSTEM S_LLITE +#include +#include +#include +#include + +#include "llite_internal.h" + +/* + * Initialize the default and maximum LOV EA and cookie sizes. This allows + * us to make MDS RPCs with large enough reply buffers to hold the + * maximum-sized (= maximum striped) EA and cookie without having to + * calculate this (via a call into the LOV + OSCs) each time we make an RPC. + */ +static int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp) +{ + u32 val_size; + u32 max_easize; + u32 def_easize; + int rc; + + ENTRY; + + val_size = sizeof(max_easize); + rc = obd_get_info(NULL, dt_exp, sizeof(KEY_MAX_EASIZE), KEY_MAX_EASIZE, + &val_size, &max_easize); + if (rc != 0) + RETURN(rc); + + val_size = sizeof(def_easize); + rc = obd_get_info(NULL, dt_exp, sizeof(KEY_DEFAULT_EASIZE), + KEY_DEFAULT_EASIZE, &val_size, &def_easize); + if (rc != 0) + RETURN(rc); + + /* + * default cookiesize is 0 because from 2.4 server doesn't send + * llog cookies to client. + */ + CDEBUG(D_HA, "updating def/max_easize: %d/%d\n", + def_easize, max_easize); + + rc = md_init_ea_size(md_exp, max_easize, def_easize); + RETURN(rc); +} + +/** + * This function is used as an upcall-callback hooked llite clients + * into obd_notify() listeners chain to handle notifications about + * change of import connect_flags. See lustre_common_fill_super(). 
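+ * On each OSC setup event the connect flags cached in lco_flags are
+ * narrowed to the intersection with that OSC's flags, and the default and
+ * maximum LOV EA sizes are refreshed via cl_init_ea_size().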
+ */ +int cl_ocd_update(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner) +{ + struct lustre_client_ocd *lco; + struct client_obd *cli; + u64 flags; + int result; + + ENTRY; + + if (!strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME) && + watched->obd_set_up && !watched->obd_stopping) { + cli = &watched->u.cli; + lco = owner; + flags = cli->cl_import->imp_connect_data.ocd_connect_flags; + CDEBUG(D_SUPER, "Changing connect_flags: %#llx -> %#llx\n", + lco->lco_flags, flags); + mutex_lock(&lco->lco_lock); + lco->lco_flags &= flags; + /* for each osc event update ea size */ + if (lco->lco_dt_exp) + cl_init_ea_size(lco->lco_md_exp, lco->lco_dt_exp); + + mutex_unlock(&lco->lco_lock); + result = 0; + } else { + CERROR("unexpected notification from %s %s" + "(setup:%d,stopping:%d)!\n", + watched->obd_type->typ_name, + watched->obd_name, watched->obd_set_up, + watched->obd_stopping); + result = -EINVAL; + } + RETURN(result); +} + +#define GROUPLOCK_SCOPE "grouplock" + +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ll_grouplock *lg) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_lock *lock; + struct cl_lock_descr *descr; + u32 enqflags; + u16 refcheck; + int rc; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + io = vvp_env_thread_io(env); + io->ci_obj = obj; + + rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (rc != 0) { + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + /* Does not make sense to take GL for released layout */ + if (rc > 0) + rc = -ENOTSUPP; + return rc; + } + + lock = vvp_env_lock(env); + descr = &lock->cll_descr; + descr->cld_obj = obj; + descr->cld_start = 0; + descr->cld_end = CL_PAGE_EOF; + descr->cld_gid = gid; + descr->cld_mode = CLM_GROUP; + + enqflags = CEF_MUST | (nonblock ? CEF_NONBLOCK : 0); + descr->cld_enq_flags = enqflags; + + rc = cl_lock_request(env, io, lock); + if (rc < 0) { + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + return rc; + } + + lg->lg_env = env; + lg->lg_io = io; + lg->lg_lock = lock; + lg->lg_gid = gid; + + return 0; +} + +void cl_put_grouplock(struct ll_grouplock *lg) +{ + struct lu_env *env = lg->lg_env; + struct cl_io *io = lg->lg_io; + struct cl_lock *lock = lg->lg_lock; + + LASSERT(lg->lg_env != NULL); + LASSERT(lg->lg_gid != 0); + + cl_lock_release(env, lock); + cl_io_fini(env, io); + cl_env_put(env, NULL); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_foreign.c b/drivers/staging/lustrefsx/lustre/llite/llite_foreign.c new file mode 100644 index 0000000000000..9e2a7cbd44c08 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_foreign.c @@ -0,0 +1,281 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2020 Intel Corporation. + */ +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" + +static void ll_manage_foreign_file(struct inode *inode, + struct lov_foreign_md *lfm) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + + if (le32_to_cpu(lfm->lfm_type) == LU_FOREIGN_TYPE_SYMLINK) { + CDEBUG(D_INFO, + "%s: inode %p of fid "DFID": Foreign file of type symlink, faking a symlink\n", + sbi->ll_fsname, inode, PFID(ll_inode2fid(inode))); + /* change inode_operations to add symlink methods, and clear + * IOP_NOFOLLOW to ensure file will be treated as a symlink + * by Kernel (see in * d_flags_for_inode()). + */ + inode->i_op = &ll_foreign_file_symlink_inode_operations; + inode->i_opflags &= ~IOP_NOFOLLOW; + } else { + CDEBUG(D_INFO, + "%s: inode %p of fid "DFID": Foreign file of type %ux, nothing special to do\n", + sbi->ll_fsname, inode, PFID(ll_inode2fid(inode)), + le32_to_cpu(lfm->lfm_type)); + } +} + +static void ll_manage_foreign_dir(struct inode *inode, + struct lmv_foreign_md *lfm) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + + if (lfm->lfm_type == LU_FOREIGN_TYPE_SYMLINK) { + CDEBUG(D_INFO, + "%s: inode %p of fid "DFID": Foreign dir of type symlink, faking a symlink\n", + sbi->ll_fsname, inode, PFID(ll_inode2fid(inode))); + /* change inode_operations to add symlink methods + * IOP_NOFOLLOW should not be set for dirs + */ + inode->i_op = &ll_foreign_dir_symlink_inode_operations; + } else { + CDEBUG(D_INFO, + "%s: inode %p of fid "DFID": Foreign dir of type %ux, nothing special to do\n", + sbi->ll_fsname, inode, PFID(ll_inode2fid(inode)), + le32_to_cpu(lfm->lfm_type)); + } +} + +int ll_manage_foreign(struct inode *inode, struct lustre_md *lmd) +{ + int rc = 0; + + ENTRY; + /* apply any foreign file/dir policy */ + if (S_ISREG((inode)->i_mode)) { + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + + if (lmd->layout.lb_buf != NULL && lmd->layout.lb_len != 0) { + struct lov_foreign_md *lfm = lmd->layout.lb_buf; + + if (lfm->lfm_magic == LOV_MAGIC_FOREIGN) + ll_manage_foreign_file(inode, lfm); + GOTO(out, rc); + } + + if (obj) { + struct lov_foreign_md lfm = { + .lfm_magic = LOV_MAGIC, + }; + struct cl_layout cl = { + .cl_buf.lb_buf = &lfm, + .cl_buf.lb_len = sizeof(lfm), + }; + struct lu_env *env; + u16 refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, rc = PTR_ERR(env)); + rc = cl_object_layout_get(env, obj, &cl); + /* error is likely to be -ERANGE because of the small + * buffer we use, only the content is significant here + */ + if (rc < 0 && rc != -ERANGE) { + cl_env_put(env, &refcheck); + GOTO(out, rc); + } + if (lfm.lfm_magic == LOV_MAGIC_FOREIGN) + ll_manage_foreign_file(inode, &lfm); + cl_env_put(env, &refcheck); + } + } else if (S_ISDIR((inode)->i_mode)) { + if (lmd->lfm != NULL && + lmd->lfm->lfm_magic == LMV_MAGIC_FOREIGN) { + ll_manage_foreign_dir(inode, lmd->lfm); + } else { + struct ll_inode_info *lli = ll_i2info(inode); + struct lmv_foreign_md *lfm; + + down_read(&lli->lli_lsm_sem); + lfm = (struct lmv_foreign_md *)(lli->lli_lsm_md); + if (lfm && lfm->lfm_magic == LMV_MAGIC_FOREIGN) + ll_manage_foreign_dir(inode, lfm); + up_read(&lli->lli_lsm_sem); + } + } +out: + RETURN(rc); +} + +/* dentry must be spliced to inode (dentry->d_inode != NULL) !!! 
*/ +bool ll_foreign_is_openable(struct dentry *dentry, unsigned int flags) +{ + /* check for faked symlink here as they should not be opened (unless + * O_NOFOLLOW!) and thus wants ll_atomic_open() to return 1 from + * finish_no_open() in order to get follow_link() to be called in both + * path_lookupat() and path_openupat(). + * This will not break regular symlink handling as they have + * been treated/filtered upstream. + */ + if (d_is_symlink(dentry) && !S_ISLNK(dentry->d_inode->i_mode) && + !(flags & O_NOFOLLOW)) + return false; + + return true; +} + +static bool should_preserve_foreign_file(struct lov_foreign_md *lfm, + struct ll_inode_info *lli, bool unset) +{ + /* for now, only avoid foreign fake symlink file removal */ + + if (unset) + if (lfm->lfm_type == LU_FOREIGN_TYPE_SYMLINK) { + set_bit(LLIF_FOREIGN_REMOVABLE, &lli->lli_flags); + return true; + } else { + return false; + } + else + return lfm->lfm_type == LU_FOREIGN_TYPE_SYMLINK && + !test_bit(LLIF_FOREIGN_REMOVABLE, &lli->lli_flags); +} + +static bool should_preserve_foreign_dir(struct lmv_foreign_md *lfm, + struct ll_inode_info *lli, bool unset) +{ + /* for now, only avoid foreign fake symlink dir removal */ + + if (unset) + if (lfm->lfm_type == LU_FOREIGN_TYPE_SYMLINK) { + set_bit(LLIF_FOREIGN_REMOVABLE, &lli->lli_flags); + return true; + } else { + return false; + } + else + return lfm->lfm_type == LU_FOREIGN_TYPE_SYMLINK && + !test_bit(LLIF_FOREIGN_REMOVABLE, &lli->lli_flags); +} + +/* XXX + * instead of fetching type from foreign LOV/LMV, we may simply + * check (d_is_symlink(dentry) && !S_ISLNK(dentry->d_inode->i_mode)) + * to identify a fake symlink + */ +bool ll_foreign_is_removable(struct dentry *dentry, bool unset) +{ + struct inode *inode = dentry->d_inode; + struct qstr *name = &dentry->d_name; + bool preserve_foreign = false; + int rc = 0; + + ENTRY; + if (inode == NULL) + return 0; + + /* some foreign types may not be allowed to be unlinked in order to + * keep references with external objects + */ + if (S_ISREG(inode->i_mode)) { + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + + if (obj) { + struct lov_foreign_md lfm = { + .lfm_magic = LOV_MAGIC, + }; + struct cl_layout cl = { + .cl_buf.lb_buf = &lfm, + .cl_buf.lb_len = sizeof(lfm), + }; + struct lu_env *env; + u16 refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, rc = PTR_ERR(env)); + rc = cl_object_layout_get(env, obj, &cl); + /* error is likely to be -ERANGE because of the small + * buffer we use, only the content is significant here + */ + if (rc < 0 && rc != -ERANGE) { + cl_env_put(env, &refcheck); + goto out; + } else { + rc = 0; + } + if (lfm.lfm_magic == LOV_MAGIC_FOREIGN) + preserve_foreign = + should_preserve_foreign_file(&lfm, lli, + unset); + cl_env_put(env, &refcheck); + if (preserve_foreign) { + CDEBUG(D_INFO, + "%s unlink of foreign file (%.*s, "DFID")\n", + unset ? "allow" : "prevent", + name->len, name->name, + PFID(ll_inode2fid(inode))); + RETURN(false); + } + } else { + CDEBUG(D_INFO, + "unable to check if file (%.*s, "DFID") is foreign...\n", + name->len, name->name, + PFID(ll_inode2fid(inode))); + /* XXX should we prevent removal ?? 
+			 */
+		}
+	} else if (S_ISDIR(inode->i_mode)) {
+		struct ll_inode_info *lli = ll_i2info(inode);
+		struct lmv_foreign_md *lfm;
+
+		down_read(&lli->lli_lsm_sem);
+		lfm = (struct lmv_foreign_md *)(lli->lli_lsm_md);
+		if (!lfm)
+			CDEBUG(D_INFO,
+			       "unable to check if dir (%.*s, "DFID") is foreign...\n",
+			       name->len, name->name,
+			       PFID(ll_inode2fid(inode)));
+		else if (lfm->lfm_magic == LMV_MAGIC_FOREIGN)
+			preserve_foreign = should_preserve_foreign_dir(lfm, lli,
+								       unset);
+		up_read(&lli->lli_lsm_sem);
+		if (preserve_foreign) {
+			CDEBUG(D_INFO,
+			       "%s unlink of foreign dir (%.*s, "DFID")\n",
+			       unset ? "allow" : "prevent",
+			       name->len, name->name,
+			       PFID(ll_inode2fid(inode)));
+			RETURN(false);
+		}
+	}
+
+out:
+	RETURN(true);
+}
diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_foreign_symlink.c b/drivers/staging/lustrefsx/lustre/llite/llite_foreign_symlink.c
new file mode 100644
index 0000000000000..b9ee7daf2e3ca
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/llite/llite_foreign_symlink.c
@@ -0,0 +1,865 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2020 Intel Corporation.
+ */
+/*
+ * Foreign symlink implementation.
+ *
+ * The methods in this source file make it possible to construct a relative
+ * path from the LOV/LMV foreign content, to complement it with a prefix, and
+ * then to expose it to the VFS as a symlink destination.
+ * The default/internal mechanism simply takes the full foreign free string
+ * as the relative path. For more complex internal formats, an upcall has
+ * been implemented to provide the format's details (presently just in terms
+ * of constant strings and sub-string positions in the EA, but this can be
+ * enhanced) to the llite layer.
+ */ + +#include +#include +#include +#include +#include +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" + +/* allocate space for "//'\0'" and copy prefix in, + * returns start position for suffix in *destname + * must be called with ll_foreign_symlink_sem locked for read, to + * protect against sbi->ll_foreign_symlink_prefix change + * on output, provides position where to start prefix complement + */ +static int foreign_symlink_alloc_and_copy_prefix(struct ll_sb_info *sbi, + struct inode *inode, + char **destname, + size_t suffix_size) +{ + size_t prefix_size, full_size; + + ENTRY; + + /* allocate enough for "//'\0'" */ + prefix_size = sbi->ll_foreign_symlink_prefix_size - 1; + full_size = suffix_size + prefix_size + 3; + if (full_size > PATH_MAX) { + CERROR("%s: inode "DFID": resolved destination path too long\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode))); + RETURN(-EINVAL); + } + OBD_ALLOC(*destname, full_size); + if (*destname == NULL) + RETURN(-ENOMEM); + + memcpy(*destname + 1, sbi->ll_foreign_symlink_prefix, + prefix_size); + (*destname)[0] = '/'; + (*destname)[prefix_size + 1] = '/'; + + RETURN(prefix_size + 2); +} + +/* if no upcall registered, default foreign symlink parsing method + * is to use the full lfm_value as a relative path to complement + * foreign_prefix + */ +static int ll_foreign_symlink_default_parse(struct ll_sb_info *sbi, + struct inode *inode, + struct lov_foreign_md *lfm, + char **destname) +{ + int suffix_pos; + + down_read(&sbi->ll_foreign_symlink_sem); + suffix_pos = foreign_symlink_alloc_and_copy_prefix(sbi, inode, + destname, + lfm->lfm_length); + up_read(&sbi->ll_foreign_symlink_sem); + + if (suffix_pos < 0) + RETURN(suffix_pos); + + memcpy(*destname + suffix_pos, lfm->lfm_value, + lfm->lfm_length); + (*destname)[suffix_pos + lfm->lfm_length] = '\0'; + + RETURN(0); +} + +/* if an upcall has been registered, foreign symlink will be + * constructed as per upcall provided format + * presently we only support a serie of constant strings and sub-strings + * to be taken from lfm_value content + */ +static int ll_foreign_symlink_upcall_parse(struct ll_sb_info *sbi, + struct inode *inode, + struct lov_foreign_md *lfm, + char **destname) +{ + int pos = 0, suffix_pos = -1, items_size = 0; + struct ll_foreign_symlink_upcall_item *foreign_symlink_items = + sbi->ll_foreign_symlink_upcall_items; + int i = 0, rc = 0; + + ENTRY; + + down_read(&sbi->ll_foreign_symlink_sem); + + /* compute size of relative path of destination path + * could be done once during upcall items/infos reading + * and stored as new ll_sb_info field + */ + for (i = 0; i < sbi->ll_foreign_symlink_upcall_nb_items; i++) { + switch (foreign_symlink_items[i].type) { + case STRING_TYPE: + items_size += foreign_symlink_items[i].size; + break; + case POSLEN_TYPE: + items_size += foreign_symlink_items[i].len; + break; + case EOB_TYPE: + /* should be the last item */ + break; + default: + CERROR("%s: unexpected type '%u' found in items\n", + sbi->ll_fsname, foreign_symlink_items[i].type); + GOTO(failed, rc = -EINVAL); + } + } + + suffix_pos = foreign_symlink_alloc_and_copy_prefix(sbi, inode, destname, + items_size); + if (suffix_pos < 0) + GOTO(failed, rc = suffix_pos); + + /* rescan foreign_symlink_items[] to create faked symlink dest path */ + i = 0; + while (foreign_symlink_items[i].type != EOB_TYPE) { + if (foreign_symlink_items[i].type == STRING_TYPE) { + memcpy(*destname + suffix_pos + pos, + foreign_symlink_items[i].string, + foreign_symlink_items[i].size); + pos += 
foreign_symlink_items[i].size; + } else if (foreign_symlink_items[i].type == POSLEN_TYPE) { + if (lfm->lfm_length < foreign_symlink_items[i].pos + + foreign_symlink_items[i].len) { + CERROR("%s: "DFID" foreign EA too short to find (%u,%u) item\n", + sbi->ll_fsname, + PFID(ll_inode2fid(inode)), + foreign_symlink_items[i].pos, + foreign_symlink_items[i].len); + GOTO(failed, rc = -EINVAL); + } + memcpy(*destname + suffix_pos + pos, + lfm->lfm_value + foreign_symlink_items[i].pos, + foreign_symlink_items[i].len); + pos += foreign_symlink_items[i].len; + } else { + CERROR("%s: unexpected type '%u' found in items\n", + sbi->ll_fsname, foreign_symlink_items[i].type); + GOTO(failed, rc = -EINVAL); + } + i++; + } +failed: + up_read(&sbi->ll_foreign_symlink_sem); + + if (rc != 0 && suffix_pos >= 0) { + OBD_FREE_LARGE(*destname, suffix_pos + items_size); + *destname = NULL; + } + + RETURN(rc); +} + +static int ll_foreign_symlink_parse(struct ll_sb_info *sbi, + struct inode *inode, + struct lov_foreign_md *lfm, + char **destname) +{ + int rc; + + /* if no user-land upcall registered, assuming whole free field + * of foreign LOV is relative path of faked symlink destination, + * to be completed by prefix + */ + if (!test_bit(LL_SBI_FOREIGN_SYMLINK_UPCALL, sbi->ll_flags)) + rc = ll_foreign_symlink_default_parse(sbi, inode, lfm, + destname); + else /* upcall is available */ + rc = ll_foreign_symlink_upcall_parse(sbi, inode, lfm, + destname); + return rc; +} + +/* Don't need lli_size_mutex locked as LOV/LMV are EAs + * and should not be stored in data blocks + */ +static int ll_foreign_readlink_internal(struct inode *inode, char **symname) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct lov_foreign_md *lfm = NULL; + char *destname = NULL; + size_t lfm_size = 0; + int rc; + + ENTRY; + + if (S_ISREG(inode->i_mode)) { + struct cl_object *obj = lli->lli_clob; + struct cl_layout cl = { + .cl_buf.lb_len = 0, /* to get real size */ + }; + struct lu_env *env; + u16 refcheck; + + if (!obj) { + CERROR("%s: inode "DFID": can not get layout, no cl_object\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode))); + GOTO(failed, rc = -EINVAL); + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + /* get layout size */ + rc = cl_object_layout_get(env, obj, &cl); + if (rc <= 0) { + CERROR("%s: inode "DFID": error trying to get layout size : %d\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode)), rc); + cl_env_put(env, &refcheck); + RETURN(rc); + } + OBD_ALLOC(lfm, rc); + if (!lfm) { + CERROR("%s: inode "DFID": can not allocate enough mem to get layout\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode))); + cl_env_put(env, &refcheck); + RETURN(-ENOMEM); + } + cl.cl_buf.lb_len = rc; + cl.cl_buf.lb_buf = lfm; + /* get layout */ + rc = cl_object_layout_get(env, obj, &cl); + if (rc <= 0) { + CERROR("%s: inode "DFID": error trying to get layout : %d\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode)), rc); + OBD_FREE(lfm, cl.cl_buf.lb_len); + cl_env_put(env, &refcheck); + RETURN(rc); + } + lfm_size = cl.cl_buf.lb_len; + cl_env_put(env, &refcheck); + } else if (S_ISDIR(inode->i_mode)) { + down_read(&lli->lli_lsm_sem); + + /* should be casted lmv_foreign_md, but it is ok as both foreign LOV + * and LMV formats are identical, and then we also only need + * one set of parsing routines for both foreign files and dirs! 
+ */ + lfm = (struct lov_foreign_md *)(lli->lli_lsm_md); + if (lfm != NULL) { + CDEBUG(D_INFO, "%s: inode "DFID": LMV cached found\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode))); + } else { + CERROR("%s: inode "DFID": cannot get layout, no LMV cached\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode))); + GOTO(failed, rc = -EINVAL); + } + } else { + CERROR("%s: inode "DFID": not a regular file nor directory\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode))); + GOTO(failed, rc = -EINVAL); + } + + /* XXX no assert nor double check of magic, length and type ? */ + + rc = ll_foreign_symlink_parse(sbi, inode, lfm, &destname); + +failed: + if (S_ISDIR(inode->i_mode)) + up_read(&lli->lli_lsm_sem); + + if (S_ISREG(inode->i_mode) && lfm) + OBD_FREE(lfm, lfm_size); + + if (!rc) { + *symname = destname; + CDEBUG(D_INFO, + "%s: inode "DFID": faking symlink to dest '%s'\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode)), destname); + } + + RETURN(rc); +} + +#ifdef HAVE_SYMLINK_OPS_USE_NAMEIDATA +static void ll_foreign_put_link(struct dentry *dentry, + struct nameidata *nd, void *cookie) +#else +# ifdef HAVE_IOP_GET_LINK +static void ll_foreign_put_link(void *cookie) +# else +static void ll_foreign_put_link(struct inode *unused, void *cookie) +# endif +#endif +{ + /* to avoid allocating an unnecessary big buffer, and since ways to + * build the symlink path from foreign LOV/LMV can be multiple and + * not constant. So it size is not known and we need to use + * strlen(cookie)+1 to determine its size and to avoid false positive + * to be reported by memory leak check code + */ + OBD_FREE_LARGE(cookie, strlen(cookie) + 1); +} + +#ifdef HAVE_SYMLINK_OPS_USE_NAMEIDATA +static void *ll_foreign_follow_link(struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + int rc; + char *symname = NULL; + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op\n"); + /* + * Limit the recursive symlink depth to 5 instead of default + * 8 links when kernel has 4k stack to prevent stack overflow. + * For 8k stacks we need to limit it to 7 for local servers. + */ + if (THREAD_SIZE < 8192 && current->link_count >= 6) + rc = -ELOOP; + else if (THREAD_SIZE == 8192 && current->link_count >= 8) + rc = -ELOOP; + else + rc = ll_foreign_readlink_internal(inode, &symname); + + if (rc) + symname = ERR_PTR(rc); + + nd_set_link(nd, symname); + RETURN(symname); +} + +#elif defined(HAVE_IOP_GET_LINK) +static const char *ll_foreign_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + char *symname = NULL; + int rc; + + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op\n"); + if (!dentry) + RETURN(ERR_PTR(-ECHILD)); + rc = ll_foreign_readlink_internal(inode, &symname); + + /* + * symname must be freed when we are done + * + * XXX we may avoid the need to do so if we use + * lli_symlink_name cache to retain symname and + * let ll_clear_inode free it... + */ + set_delayed_call(done, ll_foreign_put_link, symname); + RETURN(rc ? ERR_PTR(rc) : symname); +} + +# else /* !HAVE_IOP_GET_LINK */ +static const char *ll_foreign_follow_link(struct dentry *dentry, + void **cookie) +{ + struct inode *inode = d_inode(dentry); + char *symname = NULL; + int rc; + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op\n"); + rc = ll_foreign_readlink_internal(inode, &symname); + if (rc < 0) + return ERR_PTR(rc); + + /* XXX need to also return symname in cookie in order to delay + * its release ?? 
+ */ + + RETURN(symname); +} + +#endif /* HAVE_SYMLINK_OPS_USE_NAMEIDATA, HAVE_IOP_GET_LINK */ + +/* + * Should only be called for already in-use/cache foreign dir inode + * when foreign fake-symlink behaviour has been enabled afterward + */ +static struct dentry *ll_foreign_dir_lookup(struct inode *parent, + struct dentry *dentry, + unsigned int flags) +{ + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n", + dentry->d_name.len, dentry->d_name.name, + PFID(ll_inode2fid(parent)), parent); + + return ERR_PTR(-ENODATA); +} + +static bool has_same_mount_namespace(struct ll_sb_info *sbi) +{ + bool same; + + same = (sbi->ll_mnt_ns == current->nsproxy->mnt_ns); + if (!same) + LCONSOLE_WARN("%s: client mount %s and '%s.%d' not in same mnt-namespace\n", + sbi->ll_fsname, sbi->ll_kset.kobj.name, + current->comm, current->pid); + + return same; +} + +ssize_t foreign_symlink_enable_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%d\n", + test_bit(LL_SBI_FOREIGN_SYMLINK, sbi->ll_flags)); +} + +/* + * XXX + * There should be already in-use/cached inodes of foreign files/dirs who + * will not-be/continue-to-be handled as fake-symlink, depending if + * feature is being enabled/disabled, until being revalidated. + * Also, does it require sbi->ll_lock protection ? + */ +ssize_t foreign_symlink_enable_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + if (!has_same_mount_namespace(sbi)) + return -EINVAL; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + if (val) + set_bit(LL_SBI_FOREIGN_SYMLINK, sbi->ll_flags); + else + clear_bit(LL_SBI_FOREIGN_SYMLINK, sbi->ll_flags); + + return count; +} + +ssize_t foreign_symlink_prefix_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + ssize_t size; + + down_read(&sbi->ll_foreign_symlink_sem); + size = snprintf(buf, PAGE_SIZE, "%s\n", sbi->ll_foreign_symlink_prefix); + up_read(&sbi->ll_foreign_symlink_sem); + + return size; +} + +ssize_t foreign_symlink_prefix_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + char *new, *old; + size_t new_len, old_len; + + if (!has_same_mount_namespace(sbi)) + return -EINVAL; + + /* XXX strip buffer of any CR/LF,space,... ?? 
+	 */
+
+	/* check buffer looks like a valid absolute path */
+	if (*buffer != '/') {
+		CERROR("foreign symlink prefix must be an absolute path\n");
+		return -EINVAL;
+	}
+	new_len = strnlen(buffer, count);
+	if (new_len < count)
+		CDEBUG(D_INFO, "NUL byte found in %zu bytes\n", count);
+	if (new_len > PATH_MAX) {
+		CERROR("%s: foreign symlink prefix length %zu > PATH_MAX\n",
+		       sbi->ll_fsname, new_len);
+		return -EINVAL;
+	}
+	OBD_ALLOC(new, new_len + 1);
+	if (new == NULL) {
+		CERROR("%s: can not allocate space for foreign path prefix\n",
+		       sbi->ll_fsname);
+		return -ENOSPC;
+	}
+
+	down_write(&sbi->ll_foreign_symlink_sem);
+	old_len = sbi->ll_foreign_symlink_prefix_size;
+	old = sbi->ll_foreign_symlink_prefix;
+	memcpy(new, buffer, new_len);
+	*(new + new_len) = '\0';
+
+	sbi->ll_foreign_symlink_prefix = new;
+	sbi->ll_foreign_symlink_prefix_size = new_len + 1;
+	up_write(&sbi->ll_foreign_symlink_sem);
+
+	if (old)
+		OBD_FREE(old, old_len);
+
+	return new_len;
+}
+
+ssize_t foreign_symlink_upcall_show(struct kobject *kobj,
+				    struct attribute *attr, char *buf)
+{
+	ssize_t size;
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+
+	down_read(&sbi->ll_foreign_symlink_sem);
+	size = snprintf(buf, PAGE_SIZE, "%s\n", sbi->ll_foreign_symlink_upcall);
+	up_read(&sbi->ll_foreign_symlink_sem);
+
+	return size;
+}
+
+ssize_t foreign_symlink_upcall_store(struct kobject *kobj,
+				     struct attribute *attr,
+				     const char *buffer, size_t count)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+	char *old = NULL, *new = NULL;
+	size_t new_len;
+
+	if (!has_same_mount_namespace(sbi))
+		return -EINVAL;
+
+	/* XXX strip buffer of any CR/LF,space,... ?? */
+
+	/* check buffer looks like a valid absolute path */
+	if (*buffer != '/' && strcmp(buffer, "none")) {
+		CERROR("foreign symlink upcall must be an absolute path\n");
+		return -EINVAL;
+	}
+	new_len = strnlen(buffer, count);
+	if (new_len < count)
+		CDEBUG(D_INFO, "NULL byte found in %zu bytes\n", count);
+	if (new_len > PATH_MAX) {
+		CERROR("%s: foreign symlink upcall path length %zu > PATH_MAX\n",
+		       sbi->ll_fsname, new_len);
+		return -EINVAL;
+	}
+
+	OBD_ALLOC(new, new_len + 1);
+	if (new == NULL) {
+		CERROR("%s: can not allocate space for foreign symlink upcall path\n",
+		       sbi->ll_fsname);
+		return -ENOSPC;
+	}
+	memcpy(new, buffer, new_len);
+	*(new + new_len) = '\0';
+
+	down_write(&sbi->ll_foreign_symlink_sem);
+	old = sbi->ll_foreign_symlink_upcall;
+
+	sbi->ll_foreign_symlink_upcall = new;
+	/* LL_SBI_FOREIGN_SYMLINK_UPCALL will be set by
+	 * foreign_symlink_upcall_info_store() once a valid format has been
+	 * provided by the upcall.
+	 * XXX there is a potential race if there are multiple concurrent
+	 * attempts to set the upcall path and execution occurs in a different
+	 * order; we may end up using the format provided by a different
+	 * upcall than the one set in ll_foreign_symlink_upcall.
+	 */
+	clear_bit(LL_SBI_FOREIGN_SYMLINK_UPCALL, sbi->ll_flags);
+	up_write(&sbi->ll_foreign_symlink_sem);
+
+	if (strcmp(new, "none")) {
+		char *argv[] = {
+			[0] = new,
+			/* sbi sysfs object name */
+			[1] = (char *)sbi->ll_kset.kobj.name,
+			[2] = NULL
+		};
+		char *envp[] = {
+			[0] = "HOME=/",
+			[1] = "PATH=/sbin:/usr/sbin",
+			[2] = NULL
+		};
+		int rc;
+
+		rc = call_usermodehelper(new, argv, envp, UMH_WAIT_EXEC);
+		if (rc < 0)
+			CERROR("%s: error invoking foreign symlink upcall %s: rc %d\n",
+			       sbi->ll_fsname, new, rc);
+		else
+			CDEBUG(D_INFO, "%s: invoked upcall %s\n",
+			       sbi->ll_fsname, new);
+	}
+
+	if (old)
OBD_FREE_LARGE(old, strlen(old) + 1); + + return new_len; +} + +/* foreign_symlink_upcall_info_store() stores format items in + * foreign_symlink_items[], and foreign_symlink_upcall_parse() + * uses it to parse each foreign symlink LOV/LMV EAs + */ +ssize_t foreign_symlink_upcall_info_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct ll_foreign_symlink_upcall_item items[MAX_NB_UPCALL_ITEMS], *item; + struct ll_foreign_symlink_upcall_item *new_items, *old_items; + size_t remaining = count; + int nb_items = 0, old_nb_items, i, rc = 0; + + ENTRY; + + if (!has_same_mount_namespace(sbi)) + return -EINVAL; + + /* parse buffer to check validity of infos and fill symlink format + * descriptors + */ + + if (count % sizeof(__u32) != 0) { + CERROR("%s: invalid size '%zu' of infos buffer returned by foreign symlink upcall\n", + sbi->ll_fsname, count); + RETURN(-EINVAL); + } + + /* evaluate number of items provided */ + while (remaining > 0) { + item = (struct ll_foreign_symlink_upcall_item *) + &buffer[count - remaining]; + switch (item->type) { + case STRING_TYPE: { + /* a constant string following */ + if (item->size >= remaining - + offsetof(struct ll_foreign_symlink_upcall_item, + bytestring) - sizeof(item->type)) { + /* size of string must not overflow remaining + * bytes minus EOB_TYPE item + */ + CERROR("%s: constant string too long in infos buffer returned by foreign symlink upcall\n", + sbi->ll_fsname); + GOTO(failed, rc = -EINVAL); + } + OBD_ALLOC(items[nb_items].string, + item->size); + if (items[nb_items].string == NULL) { + CERROR("%s: constant string allocation has failed for constant string of size %zu\n", + sbi->ll_fsname, item->size); + GOTO(failed, rc = -ENOMEM); + } + memcpy(items[nb_items].string, + item->bytestring, item->size); + items[nb_items].size = item->size; + /* string items to fit on __u32 boundary */ + remaining = remaining - STRING_ITEM_SZ(item->size); + break; + } + case POSLEN_TYPE: { + /* a tuple (pos,len) following to delimit a sub-string + * in lfm_value + */ + items[nb_items].pos = item->pos; + items[nb_items].len = item->len; + remaining -= POSLEN_ITEM_SZ; + break; + } + case EOB_TYPE: + if (remaining != sizeof(item->type)) { + CERROR("%s: early end of infos buffer returned by foreign symlink upcall\n", + sbi->ll_fsname); + GOTO(failed, rc = -EINVAL); + } + remaining -= sizeof(item->type); + break; + default: + CERROR("%s: wrong type '%u' encountered at pos %zu , with %zu remaining bytes, in infos buffer returned by foreign symlink upcall\n", + sbi->ll_fsname, (__u32)buffer[count - remaining], + count - remaining, remaining); + GOTO(failed, rc = -EINVAL); + } + + items[nb_items].type = item->type; + nb_items++; + if (nb_items >= MAX_NB_UPCALL_ITEMS) { + CERROR("%s: too many items in infos buffer returned by foreign symlink upcall\n", + sbi->ll_fsname); + GOTO(failed, rc = -EINVAL); + } + } + /* valid format has been provided by foreign symlink user upcall */ + OBD_ALLOC_LARGE(new_items, nb_items * + sizeof(struct ll_foreign_symlink_upcall_item)); + if (new_items == NULL) { + CERROR("%s: constant string allocation has failed for constant string of size %zu\n", + sbi->ll_fsname, nb_items * + sizeof(struct ll_foreign_symlink_upcall_item)); + GOTO(failed, rc = -ENOMEM); + } + for (i = 0; i < nb_items; i++) + *((struct ll_foreign_symlink_upcall_item *)new_items + i) = + items[i]; + + down_write(&sbi->ll_foreign_symlink_sem); + old_items 
= sbi->ll_foreign_symlink_upcall_items; + old_nb_items = sbi->ll_foreign_symlink_upcall_nb_items; + sbi->ll_foreign_symlink_upcall_items = new_items; + sbi->ll_foreign_symlink_upcall_nb_items = nb_items; + set_bit(LL_SBI_FOREIGN_SYMLINK_UPCALL, sbi->ll_flags); + up_write(&sbi->ll_foreign_symlink_sem); + + /* free old_items */ + if (old_items != NULL) { + for (i = 0 ; i < old_nb_items; i++) + if (old_items[i].type == STRING_TYPE) + OBD_FREE(old_items[i].string, + old_items[i].size); + + OBD_FREE_LARGE(old_items, old_nb_items * + sizeof(struct ll_foreign_symlink_upcall_item)); + } + +failed: + /* clean items[] and free any strings */ + if (rc != 0) { + for (i = 0; i < nb_items; i++) { + switch (items[i].type) { + case STRING_TYPE: + OBD_FREE(items[i].string, items[i].size); + items[i].string = NULL; + items[i].size = 0; + break; + case POSLEN_TYPE: + items[i].pos = 0; + items[i].len = 0; + break; + case EOB_TYPE: + break; + default: + CERROR("%s: wrong '%u'type encountered in foreign symlink upcall items\n", + sbi->ll_fsname, items[i].type); + GOTO(failed, rc = -EINVAL); + break; + } + items[i].type = 0; + } + } + + RETURN(rc == 0 ? count : rc); +} + +/* foreign fake-symlink version of ll_getattr() */ +#if defined(HAVE_USER_NAMESPACE_ARG) +int ll_foreign_symlink_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + return ll_getattr_dentry(path->dentry, stat, request_mask, flags, + true); +} +#elif defined(HAVE_INODEOPS_ENHANCED_GETATTR) +int ll_foreign_symlink_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + return ll_getattr_dentry(path->dentry, stat, request_mask, flags, + true); +} +#else +int ll_foreign_symlink_getattr(struct vfsmount *mnt, struct dentry *de, + struct kstat *stat) +{ + return ll_getattr_dentry(de, stat, STATX_BASIC_STATS, + AT_STATX_SYNC_AS_STAT, true); +} +#endif + +struct inode_operations ll_foreign_file_symlink_inode_operations = { +#ifdef HAVE_IOP_GENERIC_READLINK + .readlink = generic_readlink, +#endif + .setattr = ll_setattr, +#ifdef HAVE_IOP_GET_LINK + .get_link = ll_foreign_get_link, +#else + .follow_link = ll_foreign_follow_link, + /* .put_link method required since need to release symlink copy buf */ + .put_link = ll_foreign_put_link, +#endif + .getattr = ll_foreign_symlink_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, +}; + +struct inode_operations ll_foreign_dir_symlink_inode_operations = { + .lookup = ll_foreign_dir_lookup, +#ifdef HAVE_IOP_GENERIC_READLINK + .readlink = generic_readlink, +#endif + .setattr = ll_setattr, +#ifdef HAVE_IOP_GET_LINK + .get_link = ll_foreign_get_link, +#else + .follow_link = ll_foreign_follow_link, + .put_link = ll_foreign_put_link, +#endif + .getattr = ll_foreign_symlink_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h new file mode 100644 index 0000000000000..a52f12abf289b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h @@ -0,0 +1,1860 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef LLITE_INTERNAL_H +#define LLITE_INTERNAL_H +#include +#include /* for s2sbi */ +#include + +/* for struct cl_lock_descr and struct cl_io */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vvp_internal.h" +#include "pcc.h" +#include "foreign_symlink.h" + +#ifndef FMODE_EXEC +#define FMODE_EXEC 0 +#endif + +#ifndef HAVE_VM_FAULT_RETRY +#define VM_FAULT_RETRY 0 +#endif + +/* Kernel 3.1 kills LOOKUP_CONTINUE, LOOKUP_PARENT is equivalent to it. + * seem kernel commit 49084c3bb2055c401f3493c13edae14d49128ca0 */ +#ifndef LOOKUP_CONTINUE +#define LOOKUP_CONTINUE LOOKUP_PARENT +#endif + +/** Only used on client-side for indicating the tail of dir hash/offset. */ +#define LL_DIR_END_OFF 0x7fffffffffffffffULL +#define LL_DIR_END_OFF_32BIT 0x7fffffffUL + +/* 4UL * 1024 * 1024 */ +#define LL_MAX_BLKSIZE_BITS 22 + +#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0") + +#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) + +struct ll_dentry_data { + unsigned int lld_sa_generation; + unsigned int lld_invalid:1; + unsigned int lld_nfs_dentry:1; + struct rcu_head lld_rcu_head; + unsigned long lld_neg_cache_timeout; +}; + +#define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata)) + +#define LLI_INODE_MAGIC 0x111d0de5 +#define LLI_INODE_DEAD 0xdeadd00d + +struct ll_getname_data { +#ifdef HAVE_DIR_CONTEXT + struct dir_context ctx; +#endif + char *lgd_name; /* points to a buffer with NAME_MAX+1 size */ + struct lu_fid lgd_fid; /* target fid we are looking for */ + int lgd_found; /* inode matched? */ +}; + +struct ll_grouplock { + struct lu_env *lg_env; + struct cl_io *lg_io; + struct cl_lock *lg_lock; + unsigned long lg_gid; +}; + +/* See comment on trunc_sem_down_read_nowait */ +struct ll_trunc_sem { + /* when positive, this is a count of readers, when -1, it indicates + * the semaphore is held for write, and 0 is unlocked + */ + atomic_t ll_trunc_readers; + /* this tracks a count of waiting writers */ + atomic_t ll_trunc_waiters; +}; + +struct ll_inode_info { + __u32 lli_inode_magic; + rwlock_t lli_lock; + + volatile unsigned long lli_flags; + struct posix_acl *lli_posix_acl; + + /* identifying fields for both metadata and data stacks. 
*/ + struct lu_fid lli_fid; + /* master inode fid for stripe directory */ + struct lu_fid lli_pfid; + + /* We need all three because every inode may be opened in different + * modes */ + struct obd_client_handle *lli_mds_read_och; + struct obd_client_handle *lli_mds_write_och; + struct obd_client_handle *lli_mds_exec_och; + __u64 lli_open_fd_read_count; + __u64 lli_open_fd_write_count; + __u64 lli_open_fd_exec_count; + + /* Number of times this inode was opened */ + u64 lli_open_fd_count; + /* When last close was performed on this inode */ + ktime_t lli_close_fd_time; + + /* Protects access to och pointers and their usage counters */ + struct mutex lli_och_mutex; + + struct inode lli_vfs_inode; + + /* the most recent timestamps obtained from mds */ + s64 lli_atime; + s64 lli_mtime; + s64 lli_ctime; + s64 lli_btime; + spinlock_t lli_agl_lock; + + /* Try to make the d::member and f::member are aligned. Before using + * these members, make clear whether it is directory or not. */ + union { + /* for directory */ + struct { + /* metadata statahead */ + /* since parent-child threads can share the same @file + * struct, "opendir_key" is the token when dir close for + * case of parent exit before child -- it is me should + * cleanup the dir readahead. */ + void *lli_opendir_key; + struct ll_statahead_info *lli_sai; + /* protect statahead stuff. */ + spinlock_t lli_sa_lock; + /* "opendir_pid" is the token when lookup/revalid + * -- I am the owner of dir statahead. */ + pid_t lli_opendir_pid; + /* directory depth to ROOT */ + unsigned short lli_dir_depth; + /* directory depth to ancestor whose default LMV is + * inherited. + */ + unsigned short lli_inherit_depth; + /* stat will try to access statahead entries or start + * statahead if this flag is set, and this flag will be + * set upon dir open, and cleared when dir is closed, + * statahead hit ratio is too low, or start statahead + * thread failed. */ + unsigned short lli_sa_enabled:1; + /* generation for statahead */ + unsigned int lli_sa_generation; + /* rw lock protects lli_lsm_md */ + struct rw_semaphore lli_lsm_sem; + /* directory stripe information */ + struct lmv_stripe_md *lli_lsm_md; + /* directory default LMV */ + struct lmv_stripe_md *lli_default_lsm_md; + }; + + /* for non-directory */ + struct { + struct mutex lli_size_mutex; + char *lli_symlink_name; + struct ll_trunc_sem lli_trunc_sem; + struct range_lock_tree lli_write_tree; + struct mutex lli_setattr_mutex; + + struct rw_semaphore lli_glimpse_sem; + ktime_t lli_glimpse_time; + struct list_head lli_agl_list; + __u64 lli_agl_index; + + /* for writepage() only to communicate to fsync */ + int lli_async_rc; + + /* protect the file heat fields */ + spinlock_t lli_heat_lock; + __u32 lli_heat_flags; + struct obd_heat_instance lli_heat_instances[OBD_HEAT_COUNT]; + + /* + * Whenever a process try to read/write the file, the + * jobid of the process will be saved here, and it'll + * be packed into the write PRC when flush later. + * + * So the read/write statistics for jobid will not be + * accurate if the file is shared by different jobs. + */ + char lli_jobid[LUSTRE_JOBID_SIZE]; + + struct mutex lli_pcc_lock; + enum lu_pcc_state_flags lli_pcc_state; + /* + * @lli_pcc_generation saves the gobal PCC generation + * when the file was successfully attached into PCC. + * The flags of the PCC dataset are saved in + * @lli_pcc_dsflags. + * The gobal PCC generation will be increased when add + * or delete a PCC backend, or change the configuration + * parameters for PCC. 
+ * If @lli_pcc_generation is same as the gobal PCC + * generation, we can use the saved flags of the PCC + * dataset to determine whether need to try auto attach + * safely. + */ + __u64 lli_pcc_generation; + enum pcc_dataset_flags lli_pcc_dsflags; + struct pcc_inode *lli_pcc_inode; + + struct mutex lli_group_mutex; + __u64 lli_group_users; + unsigned long lli_group_gid; + + __u64 lli_attr_valid; + __u64 lli_lazysize; + __u64 lli_lazyblocks; + }; + }; + + /* XXX: For following frequent used members, although they maybe special + * used for non-directory object, it is some time-wasting to check + * whether the object is directory or not before using them. On the + * other hand, currently, sizeof(f) > sizeof(d), it cannot reduce + * the "ll_inode_info" size even if moving those members into u.f. + * So keep them out side. + * + * In the future, if more members are added only for directory, + * some of the following members can be moved into u.f. + */ + struct cl_object *lli_clob; + + /* mutex to request for layout lock exclusively. */ + struct mutex lli_layout_mutex; + /* Layout version, protected by lli_layout_lock */ + __u32 lli_layout_gen; + spinlock_t lli_layout_lock; + + __u32 lli_projid; /* project id */ + + struct rw_semaphore lli_xattrs_list_rwsem; + struct mutex lli_xattrs_enq_lock; + struct list_head lli_xattrs; /* ll_xattr_entry->xe_list */ + struct list_head lli_lccs; /* list of ll_cl_context */ + seqlock_t lli_page_inv_lock; +}; + +#ifndef HAVE_USER_NAMESPACE_ARG +#define inode_permission(ns, inode, mask) inode_permission(inode, mask) +#define generic_permission(ns, inode, mask) generic_permission(inode, mask) +#define simple_setattr(ns, de, iattr) simple_setattr(de, iattr) +#define ll_inode_permission(ns, inode, mask) ll_inode_permission(inode, mask) +#ifdef HAVE_INODEOPS_ENHANCED_GETATTR +#define ll_getattr(ns, path, stat, mask, fl) ll_getattr(path, stat, mask, fl) +#endif /* HAVE_INODEOPS_ENHANCED_GETATTR */ +#define ll_setattr(ns, de, attr) ll_setattr(de, attr) +#endif + +static inline void ll_trunc_sem_init(struct ll_trunc_sem *sem) +{ + atomic_set(&sem->ll_trunc_readers, 0); + atomic_set(&sem->ll_trunc_waiters, 0); +} + +/* This version of down read ignores waiting writers, meaning if the semaphore + * is already held for read, this down_read will 'join' that reader and also + * take the semaphore. + * + * This lets us avoid an unusual deadlock. + * + * We must take lli_trunc_sem in read mode on entry in to various i/o paths + * in Lustre, in order to exclude truncates. Some of these paths then need to + * take the mmap_lock, while still holding the trunc_sem. The problem is that + * page faults hold the mmap_lock when calling in to Lustre, and then must also + * take the trunc_sem to exclude truncate. + * + * This means the locking order for trunc_sem and mmap_lock is sometimes AB, + * sometimes BA. This is almost OK because in both cases, we take the trunc + * sem for read, so it doesn't block. + * + * However, if a write mode user (truncate, a setattr op) arrives in the + * middle of this, the second reader on the truncate_sem will wait behind that + * writer. + * + * So we have, on our truncate sem, in order (where 'reader' and 'writer' refer + * to the mode in which they take the semaphore): + * reader (holding mmap_lock, needs truncate_sem) + * writer + * reader (holding truncate sem, waiting for mmap_lock) + * + * And so the readers deadlock. 
+ * + * The solution is this modified semaphore, where this down_read ignores + * waiting write operations, and all waiters are woken up at once, so readers + * using down_read_nowait cannot get stuck behind waiting writers, regardless + * of the order they arrived in. + * + * down_read_nowait is only used in the page fault case, where we already hold + * the mmap_lock. This is because otherwise repeated read and write operations + * (which take the truncate sem) could prevent a truncate from ever starting. + * This could still happen with page faults, but without an even more complex + * mechanism, this is unavoidable. + * + * LU-12460 + */ +static inline void trunc_sem_down_read_nowait(struct ll_trunc_sem *sem) +{ + wait_var_event(&sem->ll_trunc_readers, + atomic_inc_unless_negative(&sem->ll_trunc_readers)); +} + +static inline void trunc_sem_down_read(struct ll_trunc_sem *sem) +{ + wait_var_event(&sem->ll_trunc_readers, + atomic_read(&sem->ll_trunc_waiters) == 0 && + atomic_inc_unless_negative(&sem->ll_trunc_readers)); +} + +static inline void trunc_sem_up_read(struct ll_trunc_sem *sem) +{ + if (atomic_dec_return(&sem->ll_trunc_readers) == 0 && + atomic_read(&sem->ll_trunc_waiters)) + wake_up_var(&sem->ll_trunc_readers); +} + +static inline void trunc_sem_down_write(struct ll_trunc_sem *sem) +{ + atomic_inc(&sem->ll_trunc_waiters); + wait_var_event(&sem->ll_trunc_readers, + atomic_cmpxchg(&sem->ll_trunc_readers, 0, -1) == 0); + atomic_dec(&sem->ll_trunc_waiters); +} + +static inline void trunc_sem_up_write(struct ll_trunc_sem *sem) +{ + atomic_set(&sem->ll_trunc_readers, 0); + /* match the smp_mb() in wait_var_event()->prepare_to_wait() */ + smp_mb(); + wake_up_var(&sem->ll_trunc_readers); +} + +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +static inline void lli_clear_acl(struct ll_inode_info *lli) +{ + if (lli->lli_posix_acl) { + posix_acl_release(lli->lli_posix_acl); + lli->lli_posix_acl = NULL; + } +} + +static inline void lli_replace_acl(struct ll_inode_info *lli, + struct lustre_md *md) +{ + write_lock(&lli->lli_lock); + if (lli->lli_posix_acl) + posix_acl_release(lli->lli_posix_acl); + lli->lli_posix_acl = md->posix_acl; + write_unlock(&lli->lli_lock); +} +#else +static inline void lli_clear_acl(struct ll_inode_info *lli) +{ +} + +static inline void lli_replace_acl(struct ll_inode_info *lli, + struct lustre_md *md) +{ +} +#endif + +static inline __u32 ll_layout_version_get(struct ll_inode_info *lli) +{ + __u32 gen; + + spin_lock(&lli->lli_layout_lock); + gen = lli->lli_layout_gen; + spin_unlock(&lli->lli_layout_lock); + + return gen; +} + +static inline void ll_layout_version_set(struct ll_inode_info *lli, __u32 gen) +{ + spin_lock(&lli->lli_layout_lock); + lli->lli_layout_gen = gen; + spin_unlock(&lli->lli_layout_lock); +} + +enum ll_file_flags { + /* File data is modified. */ + LLIF_DATA_MODIFIED = 0, + /* File is being restored */ + LLIF_FILE_RESTORING = 1, + /* Xattr cache is attached to the file */ + LLIF_XATTR_CACHE = 2, + /* Project inherit */ + LLIF_PROJECT_INHERIT = 3, + /* update atime from MDS even if it's older than local inode atime. 
*/ + LLIF_UPDATE_ATIME = 4, + /* foreign file/dir can be unlinked unconditionnaly */ + LLIF_FOREIGN_REMOVABLE = 5, + /* Xattr cache is filled */ + LLIF_XATTR_CACHE_FILLED = 7, + +}; + +int ll_xattr_cache_destroy(struct inode *inode); +int ll_xattr_cache_empty(struct inode *inode); + +int ll_xattr_cache_get(struct inode *inode, + const char *name, + char *buffer, + size_t size, + __u64 valid); + +int ll_xattr_cache_insert(struct inode *inode, + const char *name, + char *buffer, + size_t size); + +static inline bool obd_connect_has_secctx(struct obd_connect_data *data) +{ +#ifdef CONFIG_SECURITY + return data->ocd_connect_flags & OBD_CONNECT_FLAGS2 && + data->ocd_connect_flags2 & OBD_CONNECT2_FILE_SECCTX; +#else + return false; +#endif +} + +static inline void obd_connect_set_secctx(struct obd_connect_data *data) +{ +#ifdef CONFIG_SECURITY + data->ocd_connect_flags2 |= OBD_CONNECT2_FILE_SECCTX; +#endif +} + +/* Only smack and selinux is known to use security contexts */ +static inline bool ll_xattr_is_seclabel(const char *name) +{ + return !strcmp(name, XATTR_NAME_SELINUX) || + !strcmp(name, XATTR_NAME_SMACK); +} + +static inline bool ll_xattr_suffix_is_seclabel(const char *suffix) +{ + return !strcmp(suffix, XATTR_SELINUX_SUFFIX) || + !strcmp(suffix, XATTR_SMACK_SUFFIX); +} + +int ll_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, + const char **secctx_name, __u32 *secctx_name_size, + void **secctx, __u32 *secctx_size, + int *secctx_slot); + +int ll_inode_init_security(struct dentry *dentry, struct inode *inode, + struct inode *dir); + +int ll_inode_notifysecctx(struct inode *inode, + void *secctx, __u32 secctxlen); + +void ll_secctx_name_free(struct ll_sb_info *sbi); + +int ll_secctx_name_store(struct inode *in); + +__u32 ll_secctx_name_get(struct ll_sb_info *sbi, const char **secctx_name); + +int ll_security_secctx_name_filter(struct ll_sb_info *sbi, int xattr_type, + const char *suffix); + +static inline bool obd_connect_has_enc(struct obd_connect_data *data) +{ +#ifdef HAVE_LUSTRE_CRYPTO + return data->ocd_connect_flags & OBD_CONNECT_FLAGS2 && + data->ocd_connect_flags2 & OBD_CONNECT2_ENCRYPT; +#else + return false; +#endif +} + +static inline void obd_connect_set_enc(struct obd_connect_data *data) +{ +#ifdef HAVE_LUSTRE_CRYPTO + data->ocd_connect_flags2 |= OBD_CONNECT2_ENCRYPT; +#endif +} + +static inline bool obd_connect_has_name_enc(struct obd_connect_data *data) +{ +#ifdef HAVE_LUSTRE_CRYPTO + return data->ocd_connect_flags & OBD_CONNECT_FLAGS2 && + data->ocd_connect_flags2 & OBD_CONNECT2_ENCRYPT_NAME; +#else + return false; +#endif +} + +static inline void obd_connect_set_name_enc(struct obd_connect_data *data) +{ +#ifdef HAVE_LUSTRE_CRYPTO + data->ocd_connect_flags2 |= OBD_CONNECT2_ENCRYPT_NAME; +#endif +} + +/* + * Locking to guarantee consistency of non-atomic updates to long long i_size, + * consistency between file size and KMS. + * + * Implemented by ->lli_size_mutex and ->lsm_lock, nested in that order. + */ + +void ll_inode_size_lock(struct inode *inode); +void ll_inode_size_unlock(struct inode *inode); + +static inline struct ll_inode_info *ll_i2info(struct inode *inode) +{ + return container_of(inode, struct ll_inode_info, lli_vfs_inode); +} + +static inline struct pcc_inode *ll_i2pcci(struct inode *inode) +{ + return ll_i2info(inode)->lli_pcc_inode; +} + +/* default to use at least 16M for fast read if possible */ +#define RA_REMAIN_WINDOW_MIN MiB_TO_PAGES(16UL) + +/* default read-ahead on a given client mountpoint. 
*/ +#define SBI_DEFAULT_READ_AHEAD_MAX MiB_TO_PAGES(1024UL) + +/* default read-ahead for a single file descriptor */ +#define SBI_DEFAULT_READ_AHEAD_PER_FILE_MAX MiB_TO_PAGES(256UL) + +/* default read-ahead full files smaller than limit on the second read */ +#define SBI_DEFAULT_READ_AHEAD_WHOLE_MAX MiB_TO_PAGES(2UL) + +/* default range pages */ +#define SBI_DEFAULT_RA_RANGE_PAGES MiB_TO_PAGES(1ULL) + +/* Min range pages */ +#define RA_MIN_MMAP_RANGE_PAGES 16UL + +enum ra_stat { + RA_STAT_HIT = 0, + RA_STAT_MISS, + RA_STAT_DISTANT_READPAGE, + RA_STAT_MISS_IN_WINDOW, + RA_STAT_FAILED_GRAB_PAGE, + RA_STAT_FAILED_MATCH, + RA_STAT_DISCARDED, + RA_STAT_ZERO_LEN, + RA_STAT_ZERO_WINDOW, + RA_STAT_EOF, + RA_STAT_MAX_IN_FLIGHT, + RA_STAT_WRONG_GRAB_PAGE, + RA_STAT_FAILED_REACH_END, + RA_STAT_ASYNC, + RA_STAT_FAILED_FAST_READ, + RA_STAT_MMAP_RANGE_READ, + _NR_RA_STAT, +}; + +struct ll_ra_info { + atomic_t ra_cur_pages; + unsigned long ra_max_pages; + unsigned long ra_max_pages_per_file; + unsigned long ra_range_pages; + unsigned long ra_max_read_ahead_whole_pages; + struct workqueue_struct *ll_readahead_wq; + /* + * Max number of active works could be triggered + * for async readahead. + */ + unsigned int ra_async_max_active; + /* how many async readahead triggered in flight */ + atomic_t ra_async_inflight; + /* Threshold to control when to trigger async readahead */ + unsigned long ra_async_pages_per_file_threshold; +}; + +/* ra_io_arg will be filled in the beginning of ll_readahead with + * ras_lock, then the following ll_read_ahead_pages will read RA + * pages according to this arg, all the items in this structure are + * counted by page index. + */ +struct ra_io_arg { + pgoff_t ria_start_idx; /* start offset of read-ahead*/ + pgoff_t ria_end_idx; /* end offset of read-ahead*/ + unsigned long ria_reserved; /* reserved pages for read-ahead */ + pgoff_t ria_end_idx_min;/* minimum end to cover current read */ + bool ria_eof; /* reach end of file */ + /* If stride read pattern is detected, ria_stoff is the byte offset + * where stride read is started. Note: for normal read-ahead, the + * value here is meaningless, and also it will not be accessed*/ + loff_t ria_stoff; + /* ria_length and ria_bytes are the length and pages length in the + * stride I/O mode. 
And they will also be used to check whether + * it is stride I/O read-ahead in the read-ahead pages*/ + loff_t ria_length; + loff_t ria_bytes; +}; + +/* LL_HIST_MAX=32 causes an overflow */ +#define LL_HIST_MAX 28 +#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */ +#define LL_PROCESS_HIST_MAX 10 +struct per_process_info { + pid_t pid; + struct obd_histogram pp_r_hist; + struct obd_histogram pp_w_hist; +}; + +/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */ +struct ll_rw_extents_info { + ktime_t pp_init; + struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1]; +}; + +#define LL_OFFSET_HIST_MAX 100 +struct ll_rw_process_info { + pid_t rw_pid; + int rw_op; + loff_t rw_range_start; + loff_t rw_range_end; + loff_t rw_last_file_pos; + loff_t rw_offset; + size_t rw_smallest_extent; + size_t rw_largest_extent; + struct ll_file_data *rw_last_file; +}; + +enum stats_track_type { + STATS_TRACK_ALL = 0, /* track all processes */ + STATS_TRACK_PID, /* track process with this pid */ + STATS_TRACK_PPID, /* track processes with this ppid */ + STATS_TRACK_GID, /* track processes with this gid */ + STATS_TRACK_LAST, +}; + +/* flags for sbi->ll_flags */ +enum ll_sbi_flags { + LL_SBI_NOLCK, /* DLM locking disabled directio-only */ + LL_SBI_CHECKSUM, /* checksum each page as it's written */ + LL_SBI_LOCALFLOCK, /* local flocks instead of fs-wide */ + LL_SBI_FLOCK, /* flock enabled */ + LL_SBI_USER_XATTR, /* support user xattr */ + LL_SBI_LRU_RESIZE, /* lru resize support */ + LL_SBI_LAZYSTATFS, /* lazystatfs mount option */ + LL_SBI_32BIT_API, /* generate 32 bit inodes. */ + LL_SBI_USER_FID2PATH, /* fid2path by unprivileged users */ + LL_SBI_VERBOSE, /* verbose mount/umount */ + LL_SBI_ALWAYS_PING, /* ping even if server suppress_pings */ + LL_SBI_TEST_DUMMY_ENCRYPTION, /* test dummy encryption */ + LL_SBI_ENCRYPT, /* client side encryption */ + LL_SBI_FOREIGN_SYMLINK, /* foreign fake-symlink support */ + LL_SBI_FOREIGN_SYMLINK_UPCALL, /* foreign fake-symlink upcall set */ + LL_SBI_ALLOW_VERSION_MISMATCH, /* allow client/server version mismatch */ + LL_SBI_MDLL_BYPASS, /* disable metadata lazy load */ + LL_SBI_NUM_MOUNT_OPT, + + LL_SBI_ACL, /* support ACL */ + LL_SBI_AGL_ENABLED, /* enable agl */ + LL_SBI_64BIT_HASH, /* support 64-bits dir hash/offset */ + LL_SBI_LAYOUT_LOCK, /* layout lock support */ + LL_SBI_XATTR_CACHE, /* support for xattr cache */ + LL_SBI_NOROOTSQUASH, /* do not apply root squash */ + LL_SBI_FAST_READ, /* fast read support */ + LL_SBI_FILE_SECCTX, /* file security context at create */ + LL_SBI_TINY_WRITE, /* tiny write support */ + LL_SBI_FILE_HEAT, /* file heat support */ + LL_SBI_PARALLEL_DIO, /* parallel (async) O_DIRECT RPCs */ + LL_SBI_ENCRYPT_NAME, /* name encryption */ + LL_SBI_MDLL_AUTO_REFRESH, /* enable metadata lazy load */ + LL_SBI_MDLL, /* enable metadata lazy load auto-refresh */ + LL_SBI_NUM_FLAGS +}; + +int ll_sbi_flags_seq_show(struct seq_file *m, void *v); + +/* This is embedded into llite super-blocks to keep track of connect + * flags (capabilities) supported by all imports given mount is + * connected to. */ +struct lustre_client_ocd { + /* This is conjunction of connect_flags across all imports + * (LOVs) this mount is connected to. This field is updated by + * cl_ocd_update() under ->lco_lock. */ + __u64 lco_flags; + struct mutex lco_lock; + struct obd_export *lco_md_exp; + struct obd_export *lco_dt_exp; +}; + +struct ll_sb_info { + /* this protects pglist and ra_info. 
It isn't safe to + * grab from interrupt contexts */ + spinlock_t ll_lock; + spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ + spinlock_t ll_process_lock; /* ll_rw_process_info */ + struct obd_uuid ll_sb_uuid; + struct obd_export *ll_md_exp; + struct obd_export *ll_dt_exp; + struct obd_device *ll_md_obd; + struct obd_device *ll_dt_obd; + struct dentry *ll_debugfs_entry; + struct lu_fid ll_root_fid; /* root object fid */ + struct mnt_namespace *ll_mnt_ns; + + DECLARE_BITMAP(ll_flags, LL_SBI_NUM_FLAGS); /* enum ll_sbi_flags */ + unsigned int ll_xattr_cache_enabled:1, + ll_xattr_cache_set:1, /* already set to 0/1 */ + ll_client_common_fill_super_succeeded:1, + ll_checksum_set:1, + ll_inode_cache_enabled:1; + + struct lustre_client_ocd ll_lco; + + struct lprocfs_stats *ll_stats; /* lprocfs stats counter */ + + /* Used to track "unstable" pages on a client, and maintain a + * LRU list of clean pages. An "unstable" page is defined as + * any page which is sent to a server as part of a bulk request, + * but is uncommitted to stable storage. */ + struct cl_client_cache *ll_cache; + + struct lprocfs_stats *ll_ra_stats; + + struct ll_ra_info ll_ra_info; + unsigned int ll_namelen; + const struct file_operations *ll_fop; + + struct lu_site *ll_site; + struct cl_device *ll_cl; + + /* Statistics */ + struct ll_rw_extents_info *ll_rw_extents_info; + int ll_extent_process_count; + unsigned int ll_offset_process_count; + struct ll_rw_process_info *ll_rw_process_info; + struct ll_rw_process_info *ll_rw_offset_info; + ktime_t ll_process_stats_init; + unsigned int ll_rw_offset_entry_count; + int ll_stats_track_id; + enum stats_track_type ll_stats_track_type; + int ll_rw_stats_on; + + /* metadata stat-ahead */ + unsigned int ll_sa_running_max;/* max concurrent + * statahead instances */ + unsigned int ll_sa_max; /* max statahead RPCs */ + atomic_t ll_sa_total; /* statahead thread started + * count */ + atomic_t ll_sa_wrong; /* statahead thread stopped for + * low hit ratio */ + atomic_t ll_sa_running; /* running statahead thread + * count */ + atomic_t ll_agl_total; /* AGL thread started count */ + + dev_t ll_sdev_orig; /* save s_dev before assign for + * clustred nfs */ + /* root squash */ + struct root_squash_info ll_squash; + struct path ll_mnt; + + /* st_blksize returned by stat(2), when non-zero */ + unsigned int ll_stat_blksize; + + /* maximum relative age of cached statfs results */ + unsigned int ll_statfs_max_age; + + /* + * seconds after which negative dentries should be invalidated. + * -1 disables invalidation of negative entries based on timeout + * 0 always triggers serverside validation + */ + int ll_neg_dentry_timeout; + + /* + * MDLL directory restore retry count + * This would determine the number of times the restore would be + * retried before returning error to the client. The retry would + * be based on the released bit of the directory. + * A value of -1 would retry indefinitely. 
+ */ +#define LL_MDLL_DIR_RESTORE_DEF_RETRY_COUNT 1 + atomic_t ll_dir_restore_max_retry_count; + + struct kset ll_kset; /* sysfs object */ + struct completion ll_kobj_unregister; + + /* File heat */ + unsigned int ll_heat_decay_weight; + unsigned int ll_heat_period_second; + + /* Opens of the same inode before we start requesting open lock */ + u32 ll_oc_thrsh_count; + + /* Time in ms between last inode close and next open to be considered + * instant back to back and would trigger an open lock request + */ + u32 ll_oc_thrsh_ms; + + /* Time in ms after last file close that we no longer count prior opens*/ + u32 ll_oc_max_ms; + + /* filesystem fsname */ + char ll_fsname[LUSTRE_MAXFSNAME + 1]; + + /* Persistent Client Cache */ + struct pcc_super ll_pcc_super; + + /* to protect vs updates in all following foreign symlink fields */ + struct rw_semaphore ll_foreign_symlink_sem; + /* foreign symlink path prefix */ + char *ll_foreign_symlink_prefix; + /* full prefix size including leading '\0' */ + size_t ll_foreign_symlink_prefix_size; + /* foreign symlink path upcall */ + char *ll_foreign_symlink_upcall; + /* foreign symlink path upcall infos */ + struct ll_foreign_symlink_upcall_item *ll_foreign_symlink_upcall_items; + /* foreign symlink path upcall nb infos */ + unsigned int ll_foreign_symlink_upcall_nb_items; + + /* cached file security context xattr name. e.g: security.selinux */ + char *ll_secctx_name; + __u32 ll_secctx_name_size; +}; + +#define SBI_DEFAULT_HEAT_DECAY_WEIGHT ((80 * 256 + 50) / 100) +#define SBI_DEFAULT_HEAT_PERIOD_SECOND (60) + +#define SBI_DEFAULT_OPENCACHE_THRESHOLD_COUNT (5) +#define SBI_DEFAULT_OPENCACHE_THRESHOLD_MS (100) /* 0.1 second */ +#define SBI_DEFAULT_OPENCACHE_THRESHOLD_MAX_MS (60000) /* 1 minute */ + +/* + * per file-descriptor read-ahead data. + */ +struct ll_readahead_state { + spinlock_t ras_lock; + /* End byte that read(2) try to read. */ + loff_t ras_last_read_end_bytes; + /* + * number of bytes read after last read-ahead window reset. As window + * is reset on each seek, this is effectively a number of consecutive + * accesses. Maybe ->ras_accessed_in_window is better name. + * + * XXX nikita: window is also reset (by ras_update()) when Lustre + * believes that memory pressure evicts read-ahead pages. In that + * case, it probably doesn't make sense to expand window to + * PTLRPC_MAX_BRW_PAGES on the third access. + */ + loff_t ras_consecutive_bytes; + /* + * number of read requests after the last read-ahead window reset + * As window is reset on each seek, this is effectively the number + * on consecutive read request and is used to trigger read-ahead. + */ + unsigned long ras_consecutive_requests; + /* + * Parameters of current read-ahead window. Handled by + * ras_update(). On the initial access to the file or after a seek, + * window is reset to 0. After 3 consecutive accesses, window is + * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by + * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages. + */ + pgoff_t ras_window_start_idx; + pgoff_t ras_window_pages; + + /* Page index where min range read starts */ + pgoff_t ras_range_min_start_idx; + /* Page index where mmap range read ends */ + pgoff_t ras_range_max_end_idx; + /* number of mmap pages where last time detected */ + pgoff_t ras_last_range_pages; + /* number of mmap range requests */ + pgoff_t ras_range_requests; + + /* + * Optimal RPC size in pages. + * It decides how many pages will be sent for each read-ahead. 
+ */ + unsigned long ras_rpc_pages; + /* + * Where next read-ahead should start at. This lies within read-ahead + * window. Read-ahead window is read in pieces rather than at once + * because: 1. lustre limits total number of pages under read-ahead by + * ->ra_max_pages (see ll_ra_count_get()), 2. client cannot read pages + * not covered by DLM lock. + */ + pgoff_t ras_next_readahead_idx; + /* + * Total number of ll_file_read requests issued, reads originating + * due to mmap are not counted in this total. This value is used to + * trigger full file read-ahead after multiple reads to a small file. + */ + unsigned long ras_requests; + /* + * The following 3 items are used for detecting the stride I/O + * mode. + * In stride I/O mode, + * ...............|-----data-----|****gap*****|--------|******|.... + * offset |-stride_bytes-|-stride_gap-| + * ras_stride_offset = offset; + * ras_stride_length = stride_bytes + stride_gap; + * ras_stride_bytes = stride_bytes; + * Note: all these three items are counted by bytes. + */ + loff_t ras_stride_offset; + loff_t ras_stride_length; + loff_t ras_stride_bytes; + /* + * number of consecutive stride request count, and it is similar as + * ras_consecutive_requests, but used for stride I/O mode. + * Note: only more than 2 consecutive stride request are detected, + * stride read-ahead will be enable + */ + unsigned long ras_consecutive_stride_requests; + /* index of the last page that async readahead starts */ + pgoff_t ras_async_last_readpage_idx; + /* whether we should increase readahead window */ + bool ras_need_increase_window; + /* whether ra miss check should be skipped */ + bool ras_no_miss_check; +}; + +struct ll_readahead_work { + /** File to readahead */ + struct file *lrw_file; + pgoff_t lrw_start_idx; + pgoff_t lrw_end_idx; + pid_t lrw_user_pid; + + /* async worker to handler read */ + struct work_struct lrw_readahead_work; + char lrw_jobid[LUSTRE_JOBID_SIZE]; +}; + +extern struct kmem_cache *ll_file_data_slab; +struct lustre_handle; +struct ll_file_data { + struct ll_readahead_state fd_ras; + struct ll_grouplock fd_grouplock; + __u64 lfd_pos; + __u32 fd_flags; + fmode_t fd_omode; + /* openhandle if lease exists for this file. + * Borrow lli->lli_och_mutex to protect assignment */ + struct obd_client_handle *fd_lease_och; + struct obd_client_handle *fd_och; + struct file *fd_file; + /* Indicate whether need to report failure when close. + * true: failure is known, not report again. + * false: unknown failure, should report. */ + bool fd_write_failed; + bool ll_lock_no_expand; + /* Used by mirrored file to lead IOs to a specific mirror, usually + * for mirror resync. 0 means default. */ + __u32 fd_designated_mirror; + /* The layout version when resync starts. Resync I/O should carry this + * layout version for verification to OST objects */ + __u32 fd_layout_version; + struct pcc_file fd_pcc_file; + /* striped directory may read partially if some stripe inaccessible, + * -errno is saved here, and will return to user in close(). 
+ */ + int fd_partial_readdir_rc; +}; + +void llite_tunables_unregister(void); +int llite_tunables_register(void); + +static inline struct inode *ll_info2i(struct ll_inode_info *lli) +{ + return &lli->lli_vfs_inode; +} + +__u32 ll_i2suppgid(struct inode *i); +void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2); + +static inline int ll_need_32bit_api(struct ll_sb_info *sbi) +{ +#if BITS_PER_LONG == 32 + return 1; +#elif defined(CONFIG_COMPAT) + if (unlikely(test_bit(LL_SBI_32BIT_API, sbi->ll_flags))) + return true; + +# ifdef CONFIG_X86_X32 + /* in_compat_syscall() returns true when called from a kthread + * and CONFIG_X86_X32 is enabled, which is wrong. So check + * whether the caller comes from a syscall (ie. not a kthread) + * before calling in_compat_syscall(). */ + if (current->flags & PF_KTHREAD) + return false; +# endif + + return unlikely(in_compat_syscall()); +#else + return unlikely(test_bit(LL_SBI_32BIT_API, sbi->ll_flags)); +#endif +} + +static inline bool ll_sbi_has_fast_read(struct ll_sb_info *sbi) +{ + return test_bit(LL_SBI_FAST_READ, sbi->ll_flags); +} + +static inline bool ll_sbi_has_tiny_write(struct ll_sb_info *sbi) +{ + return test_bit(LL_SBI_TINY_WRITE, sbi->ll_flags); +} + +static inline bool ll_sbi_has_file_heat(struct ll_sb_info *sbi) +{ + return test_bit(LL_SBI_FILE_HEAT, sbi->ll_flags); +} + +static inline bool ll_sbi_has_foreign_symlink(struct ll_sb_info *sbi) +{ + return test_bit(LL_SBI_FOREIGN_SYMLINK, sbi->ll_flags); +} + +static inline bool ll_sbi_has_parallel_dio(struct ll_sb_info *sbi) +{ + return test_bit(LL_SBI_PARALLEL_DIO, sbi->ll_flags); +} + +void ll_ras_enter(struct file *f, loff_t pos, size_t count); + +/* llite/lcommon_misc.c */ +int cl_ocd_update(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner); +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ll_grouplock *lg); +void cl_put_grouplock(struct ll_grouplock *lg); + +/* llite/lproc_llite.c */ +int ll_debugfs_register_super(struct super_block *sb, const char *name); +void ll_debugfs_unregister_super(struct super_block *sb); +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, long count); +void ll_free_rw_stats_info(struct ll_sb_info *sbi); + +enum { + LPROC_LL_READ_BYTES, + LPROC_LL_WRITE_BYTES, + LPROC_LL_READ, + LPROC_LL_WRITE, + LPROC_LL_IOCTL, + LPROC_LL_OPEN, + LPROC_LL_RELEASE, + LPROC_LL_MMAP, + LPROC_LL_FAULT, + LPROC_LL_MKWRITE, + LPROC_LL_LLSEEK, + LPROC_LL_FSYNC, + LPROC_LL_READDIR, + LPROC_LL_SETATTR, + LPROC_LL_TRUNC, + LPROC_LL_FLOCK, + LPROC_LL_GETATTR, + LPROC_LL_CREATE, + LPROC_LL_LINK, + LPROC_LL_UNLINK, + LPROC_LL_SYMLINK, + LPROC_LL_MKDIR, + LPROC_LL_RMDIR, + LPROC_LL_MKNOD, + LPROC_LL_RENAME, + LPROC_LL_STATFS, + LPROC_LL_SETXATTR, + LPROC_LL_GETXATTR, + LPROC_LL_GETXATTR_HITS, + LPROC_LL_LISTXATTR, + LPROC_LL_REMOVEXATTR, + LPROC_LL_INODE_PERM, + LPROC_LL_FALLOCATE, + LPROC_LL_INODE_OCOUNT, + LPROC_LL_INODE_OPCLTM, + LPROC_LL_FILE_OPCODES +}; + +/* llite/dir.c */ +enum get_default_layout_type { + GET_DEFAULT_LAYOUT_ROOT = 1, +}; + +extern const struct file_operations ll_dir_operations; +extern const struct inode_operations ll_dir_inode_operations; +#ifdef HAVE_DIR_CONTEXT +int ll_dir_read(struct inode *inode, __u64 *pos, struct md_op_data *op_data, + struct dir_context *ctx, int *partial_readdir_rc); +#else +int ll_dir_read(struct inode *inode, __u64 *pos, struct md_op_data *op_data, + void *cookie, filldir_t filldir, int *partial_readdir_rc); +#endif +int ll_get_mdt_idx(struct 
inode *inode); +int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid); +struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, + __u64 offset, int *partial_readdir_rc); +void ll_release_page(struct inode *inode, struct page *page, bool remove); +int quotactl_ioctl(struct super_block *sb, struct if_quotactl *qctl); + +/* llite/namei.c */ +extern const struct inode_operations ll_special_inode_operations; + +struct inode *ll_iget(struct super_block *sb, ino_t hash, + struct lustre_md *lic); +int ll_test_inode_by_fid(struct inode *inode, void *opaque); +int ll_md_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *data, int flag); +struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de); +int ll_rmdir_entry(struct inode *dir, char *name, int namelen); +void ll_update_times(struct ptlrpc_request *request, struct inode *inode); + +/* llite/rw.c */ +int ll_writepage(struct page *page, struct writeback_control *wbc); +int ll_writepages(struct address_space *, struct writeback_control *wbc); +int ll_readpage(struct file *file, struct page *page); +#ifdef HAVE_AOPS_READ_FOLIO +int ll_read_folio(struct file *file, struct folio *folio); +#endif +int ll_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, struct file *file); +void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras); +int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); + +enum lcc_type; +void ll_cl_add(struct inode *inode, const struct lu_env *env, struct cl_io *io, + enum lcc_type type); +void ll_cl_remove(struct inode *inode, const struct lu_env *env); +struct ll_cl_context *ll_cl_find(struct inode *inode); + +extern const struct address_space_operations ll_aops; + +/* llite/file.c */ +extern const struct inode_operations ll_file_inode_operations; +const struct file_operations *ll_select_file_operations(struct ll_sb_info *sbi); +extern int ll_have_md_lock(struct inode *inode, __u64 *bits, + enum ldlm_mode l_req_mode); +extern enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits, + struct lustre_handle *lockh, __u64 flags, + enum ldlm_mode mode); + +int ll_file_open(struct inode *inode, struct file *file); +int ll_file_release(struct inode *inode, struct file *file); +int ll_release_openhandle(struct dentry *, struct lookup_intent *); +int ll_md_real_close(struct inode *inode, fmode_t fmode); +void ll_track_file_opens(struct inode *inode); +extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw); +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR) +int ll_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +#else +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat); +#endif /* HAVE_USER_NAMESPACE_ARG */ +int ll_getattr_dentry(struct dentry *de, struct kstat *stat, u32 request_mask, + unsigned int flags, bool foreign); +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +struct posix_acl *ll_get_acl( + #ifdef HAVE_ACL_WITH_DENTRY + struct user_namespace *, struct dentry *, int); + #elif defined HAVE_GET_ACL_RCU_ARG + struct inode *inode, int type, bool rcu); + #else + struct inode *inode, int type); + #endif /* HAVE_GET_ACL_RCU_ARG */ + +int ll_set_acl(struct user_namespace *mnt_userns, + #ifdef HAVE_ACL_WITH_DENTRY + struct dentry *dentry, + #else + struct inode *inode, + #endif + struct posix_acl *acl, int 
type); +#else /* !CONFIG_LUSTRE_FS_POSIX_ACL */ +#define ll_get_acl NULL +#define ll_set_acl NULL +#endif /* CONFIG_LUSTRE_FS_POSIX_ACL */ + +static inline int ll_xflags_to_inode_flags(int xflags) +{ + return ((xflags & FS_XFLAG_SYNC) ? S_SYNC : 0) | + ((xflags & FS_XFLAG_NOATIME) ? S_NOATIME : 0) | + ((xflags & FS_XFLAG_APPEND) ? S_APPEND : 0) | + ((xflags & FS_XFLAG_IMMUTABLE) ? S_IMMUTABLE : 0); +} + +static inline int ll_inode_flags_to_xflags(int inode_flags) +{ + return ((inode_flags & S_SYNC) ? FS_XFLAG_SYNC : 0) | + ((inode_flags & S_NOATIME) ? FS_XFLAG_NOATIME : 0) | + ((inode_flags & S_APPEND) ? FS_XFLAG_APPEND : 0) | + ((inode_flags & S_IMMUTABLE) ? FS_XFLAG_IMMUTABLE : 0); +} + +int ll_migrate(struct inode *parent, struct file *file, + struct lmv_user_md *lum, const char *name, __u32 flags); +int ll_get_fid_by_name(struct inode *parent, const char *name, + int namelen, struct lu_fid *fid, struct inode **inode); +int ll_inode_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask); +int ll_ioctl_check_project(struct inode *inode, __u32 xflags, __u32 projid); +int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg); +int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg); +int ll_ioctl_project(struct file *file, unsigned int cmd, + unsigned long arg); + +int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, + __u64 flags, struct lov_user_md *lum, + int lum_size); +int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, + struct lov_mds_md **lmm, int *lmm_size, + struct ptlrpc_request **request); +int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, + int set_default); +int ll_dir_get_default_layout(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid, + enum get_default_layout_type type); +int ll_dir_getstripe_default(struct inode *inode, void **lmmp, + int *lmm_size, struct ptlrpc_request **request, + struct ptlrpc_request **root_request, u64 valid); +int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid); +int ll_fsync(struct file *file, loff_t start, loff_t end, int data); +int ll_merge_attr(const struct lu_env *env, struct inode *inode); +int ll_fid2path(struct inode *inode, void __user *arg); +int ll_data_version(struct inode *inode, __u64 *data_version, int flags); +int ll_hsm_release(struct inode *inode); +int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss); +void ll_io_set_mirror(struct cl_io *io, const struct file *file); +int ll_hsm_import(struct inode *inode, struct file *file, + struct hsm_user_import *hui); + +/* llite/dcache.c */ + +extern const struct dentry_operations ll_d_ops; +#ifndef HAVE_D_INIT +bool ll_d_setup(struct dentry *de, bool do_put); + +static inline bool lld_is_init(struct dentry *dentry) +{ + return ll_d2d(dentry); +} +#else +#define ll_d_setup(de, do_put) (true) +#define lld_is_init(dentry) (true) +#endif + +void ll_intent_drop_lock(struct lookup_intent *); +void ll_intent_release(struct lookup_intent *); +void ll_prune_aliases(struct inode *inode); +void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry); +int ll_revalidate_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, struct dentry *de); + +/* llite/llite_lib.c */ +extern const struct super_operations lustre_super_operations; + +void ll_lli_init(struct ll_inode_info *lli); +int ll_fill_super(struct 
super_block *sb); +void ll_put_super(struct super_block *sb); +void ll_kill_super(struct super_block *sb); +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock); +void ll_dir_clear_lsm_md(struct inode *inode); +void ll_clear_inode(struct inode *inode); +int volatile_ref_file(const char *volatile_name, int volatile_len, + struct file **ref_file); +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, + enum op_xvalid xvalid, bool hsm_import); +int ll_setattr(struct user_namespace *mnt_userns, struct dentry *de, + struct iattr *attr); +int ll_statfs(struct dentry *de, struct kstatfs *sfs); +int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs, + u32 flags); +int ll_update_inode(struct inode *inode, struct lustre_md *md); +void ll_update_inode_flags(struct inode *inode, unsigned int ext_flags); +void ll_update_dir_depth(struct inode *dir, struct inode *inode); +int ll_read_inode2(struct inode *inode, void *opaque); +void ll_truncate_inode_pages_final(struct inode *inode); +void ll_delete_inode(struct inode *inode); +int ll_iocontrol(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); +int ll_flush_ctx(struct inode *inode); +void ll_umount_begin(struct super_block *sb); +int ll_remount_fs(struct super_block *sb, int *flags, char *data); +int ll_show_options(struct seq_file *seq, struct dentry *dentry); +void ll_dirty_page_discard_warn(struct inode *inode, int ioret); +int ll_prep_inode(struct inode **inode, struct req_capsule *pill, + struct super_block *sb, struct lookup_intent *it); +int ll_obd_statfs(struct inode *inode, void __user *arg); +int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); +int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize); +int ll_set_default_mdsize(struct ll_sb_info *sbi, int default_mdsize); + +void ll_unlock_md_op_lsm(struct md_op_data *op_data); +struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, + struct inode *i1, struct inode *i2, + const char *name, size_t namelen, + __u32 mode, enum md_op_code opc, + void *data); +void ll_finish_md_op_data(struct md_op_data *op_data); +int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg); +void ll_compute_rootsquash_state(struct ll_sb_info *sbi); +ssize_t ll_copy_user_md(const struct lov_user_md __user *md, + struct lov_user_md **kbuf); +void ll_open_cleanup(struct super_block *sb, struct req_capsule *pill); + +void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req); + +/* Compute expected user md size when passing in a md from user space */ +static inline ssize_t ll_lov_user_md_size(const struct lov_user_md *lum) +{ + switch (lum->lmm_magic) { + case LOV_USER_MAGIC_V1: + return sizeof(struct lov_user_md_v1); + case LOV_USER_MAGIC_V3: + return sizeof(struct lov_user_md_v3); + case LOV_USER_MAGIC_SPECIFIC: + if (lum->lmm_stripe_count > LOV_MAX_STRIPE_COUNT) + return -EINVAL; + + return lov_user_md_size(lum->lmm_stripe_count, + LOV_USER_MAGIC_SPECIFIC); + case LOV_USER_MAGIC_COMP_V1: + return ((struct lov_comp_md_v1 *)lum)->lcm_size; + case LOV_USER_MAGIC_FOREIGN: + return foreign_size(lum); + } + + return -EINVAL; +} + +/* llite/llite_nfs.c */ +extern const struct export_operations lustre_export_operations; +__u32 get_uuid2int(const char *name, int len); +struct inode *search_inode_for_lustre(struct super_block *sb, + const struct lu_fid *fid); +int ll_dir_get_parent_fid(struct inode *dir, struct lu_fid *parent_fid); + +/* llite/symlink.c */ +extern const struct 
inode_operations ll_fast_symlink_inode_operations; + +/** + * IO arguments for various VFS I/O interfaces. + */ +struct vvp_io_args { + /** normal/sendfile/splice */ + union { + struct { + struct kiocb *via_iocb; + struct iov_iter *via_iter; + } normal; + } u; +}; + +enum lcc_type { + LCC_RW = 1, + LCC_MMAP +}; + +struct ll_cl_context { + struct list_head lcc_list; + void *lcc_cookie; + const struct lu_env *lcc_env; + struct cl_io *lcc_io; + struct cl_page *lcc_page; + enum lcc_type lcc_type; + pgoff_t lcc_end_index; +}; + +struct ll_thread_info { + struct vvp_io_args lti_args; + struct ra_io_arg lti_ria; + struct ll_cl_context lti_io_ctx; +}; + +extern struct lu_context_key ll_thread_key; + +static inline struct ll_thread_info *ll_env_info(const struct lu_env *env) +{ + struct ll_thread_info *lti; + + lti = lu_context_key_get(&env->le_ctx, &ll_thread_key); + LASSERT(lti != NULL); + + return lti; +} + +static inline struct vvp_io_args *ll_env_args(const struct lu_env *env) +{ + return &ll_env_info(env)->lti_args; +} + +void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot, + struct vvp_io_args *args); + +/* llite/llite_mmap.c */ + +int ll_file_mmap(struct file * file, struct vm_area_struct * vma); +void policy_from_vma(union ldlm_policy_data *policy, struct vm_area_struct *vma, + unsigned long addr, size_t count); +struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, + size_t count); + +#define ll_s2sbi(sb) (s2lsi(sb)->lsi_llsbi) + +/* don't need an addref as the sb_info should be holding one */ +static inline struct obd_export *ll_s2dtexp(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_dt_exp; +} + +/* don't need an addref as the sb_info should be holding one */ +static inline struct obd_export *ll_s2mdexp(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_md_exp; +} + +static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi) +{ + struct obd_device *obd = sbi->ll_md_exp->exp_obd; + if (obd == NULL) + LBUG(); + return &obd->u.cli; +} + +// FIXME: replace the name of this with LL_SB to conform to kernel stuff +static inline struct ll_sb_info *ll_i2sbi(struct inode *inode) +{ + return ll_s2sbi(inode->i_sb); +} + +static inline struct obd_export *ll_i2dtexp(struct inode *inode) +{ + return ll_s2dtexp(inode->i_sb); +} + +static inline struct obd_export *ll_i2mdexp(struct inode *inode) +{ + return ll_s2mdexp(inode->i_sb); +} + +static inline struct lu_fid *ll_inode2fid(struct inode *inode) +{ + struct lu_fid *fid; + + LASSERT(inode != NULL); + fid = &ll_i2info(inode)->lli_fid; + + return fid; +} + +static inline bool ll_dir_striped(struct inode *inode) +{ + LASSERT(inode); + return S_ISDIR(inode->i_mode) && + lmv_dir_striped(ll_i2info(inode)->lli_lsm_md); +} + +static inline loff_t ll_file_maxbytes(struct inode *inode) +{ + struct cl_object *obj = ll_i2info(inode)->lli_clob; + + if (obj == NULL) + return MAX_LFS_FILESIZE; + + return min_t(loff_t, cl_object_maxbytes(obj), MAX_LFS_FILESIZE); +} + +/* llite/xattr.c */ +extern const struct xattr_handler *ll_xattr_handlers[]; + +#define XATTR_USER_T 1 +#define XATTR_TRUSTED_T 2 +#define XATTR_SECURITY_T 3 +#define XATTR_ACL_ACCESS_T 4 +#define XATTR_ACL_DEFAULT_T 5 +#define XATTR_LUSTRE_T 6 +#define XATTR_OTHER_T 7 +#define XATTR_ENCRYPTION_T 9 + +ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size); +int ll_xattr_list(struct inode *inode, const char *name, int type, + void *buffer, size_t size, u64 valid); +const struct xattr_handler *get_xattr_type(const char *name); +int 
ll_get_hsm_state(struct inode *inode, u32 *hus_states); + +/** + * Common IO arguments for various VFS I/O interfaces. + */ +int cl_sb_init(struct super_block *sb); +int cl_sb_fini(struct super_block *sb); + +enum ras_update_flags { + LL_RAS_HIT = 0x1, + LL_RAS_MMAP = 0x2 +}; +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len); +void ll_ra_stats_inc(struct inode *inode, enum ra_stat which); + +/* statahead.c */ + +#define LL_SA_RPC_MIN 2 +#define LL_SA_RPC_DEF 32 +#define LL_SA_RPC_MAX 512 + +/* XXX: If want to support more concurrent statahead instances, + * please consider to decentralize the RPC lists attached + * on related import, such as imp_{sending,delayed}_list. + * LU-11079 */ +#define LL_SA_RUNNING_MAX 256 +#define LL_SA_RUNNING_DEF 16 + +#define LL_SA_CACHE_BIT 5 +#define LL_SA_CACHE_SIZE (1 << LL_SA_CACHE_BIT) +#define LL_SA_CACHE_MASK (LL_SA_CACHE_SIZE - 1) + +/* per inode struct, for dir only */ +struct ll_statahead_info { + struct dentry *sai_dentry; + atomic_t sai_refcount; /* when access this struct, hold + * refcount */ + unsigned int sai_max; /* max ahead of lookup */ + __u64 sai_sent; /* stat requests sent count */ + __u64 sai_replied; /* stat requests which received + * reply */ + __u64 sai_index; /* index of statahead entry */ + __u64 sai_index_wait; /* index of entry which is the + * caller is waiting for */ + __u64 sai_hit; /* hit count */ + __u64 sai_miss; /* miss count: + * for "ls -al" case, includes + * hidden dentry miss; + * for "ls -l" case, it does not + * include hidden dentry miss. + * "sai_miss_hidden" is used for + * the later case. + */ + unsigned int sai_consecutive_miss; /* consecutive miss */ + unsigned int sai_miss_hidden;/* "ls -al", but first dentry + * is not a hidden one */ + unsigned int sai_skip_hidden;/* skipped hidden dentry count + */ + unsigned int sai_ls_all:1, /* "ls -al", do stat-ahead for + * hidden entries */ + sai_in_readpage:1;/* statahead is in readdir()*/ + wait_queue_head_t sai_waitq; /* stat-ahead wait queue */ + struct task_struct *sai_task; /* stat-ahead thread */ + struct task_struct *sai_agl_task; /* AGL thread */ + struct list_head sai_interim_entries; /* entries which got async + * stat reply, but not + * instantiated */ + struct list_head sai_entries; /* completed entries */ + struct list_head sai_agls; /* AGLs to be sent */ + struct list_head sai_cache[LL_SA_CACHE_SIZE]; + spinlock_t sai_cache_lock[LL_SA_CACHE_SIZE]; + atomic_t sai_cache_count; /* entry count in cache */ +}; + +int ll_revalidate_statahead(struct inode *dir, struct dentry **dentry, + bool unplug); +int ll_start_statahead(struct inode *dir, struct dentry *dentry, bool agl); +void ll_authorize_statahead(struct inode *dir, void *key); +void ll_deauthorize_statahead(struct inode *dir, void *key); + +/* glimpse.c */ +blkcnt_t dirty_cnt(struct inode *inode); + +int cl_glimpse_size0(struct inode *inode, int agl); +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl); + +static inline int cl_glimpse_size(struct inode *inode) +{ + return cl_glimpse_size0(inode, 0); +} + +/* AGL is 'asychronous glimpse lock', which is a speculative lock taken as + * part of statahead */ +static inline int cl_agl(struct inode *inode) +{ + return cl_glimpse_size0(inode, 1); +} + +int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise); + +int cl_io_get(struct inode *inode, struct lu_env **envout, + struct cl_io **ioout, __u16 *refcheck); + +static inline int ll_glimpse_size(struct 
inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + down_read(&lli->lli_glimpse_sem); + rc = cl_glimpse_size(inode); + lli->lli_glimpse_time = ktime_get(); + up_read(&lli->lli_glimpse_sem); + return rc; +} + +/* dentry may statahead when statahead is enabled and current process has opened + * parent directory, and this dentry hasn't accessed statahead cache before */ +static inline bool +dentry_may_statahead(struct inode *dir, struct dentry *dentry) +{ + struct ll_inode_info *lli; + struct ll_dentry_data *ldd; + + if (ll_i2sbi(dir)->ll_sa_max == 0) + return false; + + lli = ll_i2info(dir); + + /* statahead is not allowed for this dir, there may be three causes: + * 1. dir is not opened. + * 2. statahead hit ratio is too low. + * 3. previous stat started statahead thread failed. */ + if (!lli->lli_sa_enabled) + return false; + + /* not the same process, don't statahead */ + if (lli->lli_opendir_pid != current->pid) + return false; + + /* + * When stating a dentry, kernel may trigger 'revalidate' or 'lookup' + * multiple times, eg. for 'getattr', 'getxattr' and etc. + * For patchless client, lookup intent is not accurate, which may + * misguide statahead. For example: + * The 'revalidate' call for 'getattr' and 'getxattr' of a dentry will + * have the same intent -- IT_GETATTR, while one dentry should access + * statahead cache once, otherwise statahead windows is messed up. + * The solution is as following: + * Assign 'lld_sa_generation' with 'lli_sa_generation' when a dentry + * IT_GETATTR for the first time, and subsequent IT_GETATTR will + * bypass interacting with statahead cache by checking + * 'lld_sa_generation == lli->lli_sa_generation'. + */ + ldd = ll_d2d(dentry); + if (ldd != NULL && lli->lli_sa_generation && + ldd->lld_sa_generation == lli->lli_sa_generation) + return false; + + return true; +} + +int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, + enum cl_fsync_mode mode, int ignore_layout); + +static inline int ll_file_nolock(const struct file *file) +{ + struct ll_file_data *fd = file->private_data; + struct inode *inode = file_inode((struct file *)file); + + LASSERT(fd != NULL); + return ((fd->fd_flags & LL_FILE_IGNORE_LOCK) || + test_bit(LL_SBI_NOLCK, ll_i2sbi(inode)->ll_flags)); +} + +static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode, + struct lookup_intent *it, __u64 *bits) +{ + if (!it->it_lock_set) { + struct lustre_handle handle; + + /* If this inode is a remote object, it will get two + * separate locks in different namespaces, Master MDT, + * where the name entry is, will grant LOOKUP lock, + * remote MDT, where the object is, will grant + * UPDATE|PERM lock. 
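The dentry_may_statahead() comment above explains that a dentry records the statahead generation it last consulted so repeated IT_GETATTR revalidations do not hit the statahead cache twice. Here is a minimal, self-contained sketch of that generation check (plain C, structure names invented for the sketch, not part of the patch), mirroring the "lld_sa_generation == lli_sa_generation" bypass described there.

#include <stdbool.h>
#include <stdio.h>

struct dir_sa    { unsigned long sa_generation; }; /* cf. lli_sa_generation */
struct dentry_sa { unsigned long sa_generation; }; /* cf. lld_sa_generation */

static bool may_consult_statahead(const struct dir_sa *dir,
				  const struct dentry_sa *de)
{
	/* already served within the current statahead window: bypass */
	if (dir->sa_generation && de->sa_generation == dir->sa_generation)
		return false;
	return true;
}

int main(void)
{
	struct dir_sa dir = { .sa_generation = 7 };
	struct dentry_sa de = { .sa_generation = 0 };

	printf("%d\n", may_consult_statahead(&dir, &de)); /* 1: first access */
	de.sa_generation = dir.sa_generation;              /* record it */
	printf("%d\n", may_consult_statahead(&dir, &de)); /* 0: bypassed */
	return 0;
}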
The inode will be attched to both + * LOOKUP and PERM locks, so revoking either locks will + * case the dcache being cleared */ + if (it->it_remote_lock_mode) { + handle.cookie = it->it_remote_lock_handle; + CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID + "(%p) for remote lock %#llx\n", + PFID(ll_inode2fid(inode)), inode, + handle.cookie); + md_set_lock_data(exp, &handle, inode, NULL); + } + + handle.cookie = it->it_lock_handle; + + CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID"(%p)" + " for lock %#llx\n", + PFID(ll_inode2fid(inode)), inode, handle.cookie); + + md_set_lock_data(exp, &handle, inode, &it->it_lock_bits); + it->it_lock_set = 1; + } + + if (bits != NULL) + *bits = it->it_lock_bits; +} + +static inline int d_lustre_invalid(const struct dentry *dentry) +{ + return !ll_d2d(dentry) || ll_d2d(dentry)->lld_invalid; +} + +/* + * Mark dentry INVALID, if dentry refcount is zero (this is normally case for + * ll_md_blocking_ast), it will be pruned by ll_prune_aliases() and + * ll_prune_negative_children(); otherwise dput() of the last refcount will + * unhash this dentry and kill it. + */ +static inline void d_lustre_invalidate(struct dentry *dentry) +{ + struct ll_sb_info *sbi = ll_s2sbi(dentry->d_sb); + + CDEBUG(D_DENTRY, "invalidate dentry %pd (%p) parent %p inode %p refc %d\n", + dentry, dentry, + dentry->d_parent, dentry->d_inode, ll_d_count(dentry)); + + spin_lock(&dentry->d_lock); + if (lld_is_init(dentry)) { + if (sbi->ll_neg_dentry_timeout != OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS) + ll_d2d(dentry)->lld_neg_cache_timeout = + jiffies + sbi->ll_neg_dentry_timeout * HZ; + ll_d2d(dentry)->lld_invalid = 1; + } + spin_unlock(&dentry->d_lock); +} + +static inline void d_lustre_revalidate(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + LASSERT(ll_d2d(dentry)); + ll_d2d(dentry)->lld_invalid = 0; + spin_unlock(&dentry->d_lock); +} + +static inline dev_t ll_compat_encode_dev(dev_t dev) +{ + /* The compat_sys_*stat*() syscalls will fail unless the + * device majors and minors are both less than 256. Note that + * the value returned here will be passed through + * old_encode_dev() in cp_compat_stat(). And so we are not + * trying to return a valid compat (u16) device number, just + * one that will pass the old_valid_dev() check. 
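To make the compat device-number masking described above concrete, here is a small standalone example (plain C, with local stand-ins for the kernel's MAJOR/MINOR/MKDEV macros, not part of the patch). It only demonstrates that masking both fields to 8 bits yields a value an old_valid_dev()-style check would accept, as the comment states.

#include <stdio.h>

/* Local stand-ins for the kernel macros (new encoding: 20-bit minor). */
#define MINORBITS	20
#define MINORMASK	((1U << MINORBITS) - 1)
#define MAJOR(dev)	((unsigned int)((dev) >> MINORBITS))
#define MINOR(dev)	((unsigned int)((dev) & MINORMASK))
#define MKDEV(ma, mi)	(((ma) << MINORBITS) | (mi))

int main(void)
{
	unsigned int dev = MKDEV(259, 300);	/* major and minor both >= 256 */
	unsigned int compat = MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);

	/* both fields now fit in 8 bits */
	printf("major %u -> %u, minor %u -> %u\n",
	       MAJOR(dev), MAJOR(compat), MINOR(dev), MINOR(compat));
	return 0;
}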
*/ + + return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff); +} + +int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf); +int ll_layout_refresh(struct inode *inode, __u32 *gen); +int ll_layout_restore(struct inode *inode, loff_t start, __u64 length); +int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc, + struct lu_extent *ext); + +int ll_xattr_init(void); +void ll_xattr_fini(void); + +int ll_page_sync_io(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, enum cl_req_type crt); + +int ll_getparent(struct file *file, struct getparent __user *arg); + +/* lcommon_cl.c */ +int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, + enum op_xvalid xvalid, unsigned int attr_flags); + +extern struct lu_env *cl_inode_fini_env; +extern __u16 cl_inode_fini_refcheck; + +int cl_file_inode_init(struct inode *inode, struct lustre_md *md); +void cl_inode_fini(struct inode *inode); + +u64 cl_fid_build_ino(const struct lu_fid *fid, int api32); +u32 cl_fid_build_gen(const struct lu_fid *fid); + +static inline struct pcc_super *ll_i2pccs(struct inode *inode) +{ + return &ll_i2sbi(inode)->ll_pcc_super; +} + +static inline struct pcc_super *ll_info2pccs(struct ll_inode_info *lli) +{ + return ll_i2pccs(ll_info2i(lli)); +} + +/* crypto.c */ +/* The digested form is made of a FID (16 bytes) followed by the second-to-last + * ciphertext block (16 bytes), so a total length of 32 bytes. + * That way, llcrypt does not compute a digested form of this digest. + */ +struct ll_digest_filename { + struct lu_fid ldf_fid; + char ldf_excerpt[LL_CRYPTO_BLOCK_SIZE]; +}; + +int ll_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct llcrypt_name *fname, + struct lu_fid *fid); +int ll_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + struct llcrypt_str *iname, struct llcrypt_str *oname, + struct lu_fid *fid); +int ll_revalidate_d_crypto(struct dentry *dentry, unsigned int flags); +int ll_file_open_encrypt(struct inode *inode, struct file *filp); +static inline char *xattr_for_enc(struct inode *inode) +{ + if (ll_sbi_has_name_encrypt(ll_i2sbi(inode))) + return LL_XATTR_NAME_ENCRYPTION_CONTEXT; + + return LL_XATTR_NAME_ENCRYPTION_CONTEXT_OLD; +} +#ifdef HAVE_LUSTRE_CRYPTO +extern const struct llcrypt_operations lustre_cryptops; +#endif + +/* llite/llite_foreign.c */ +int ll_manage_foreign(struct inode *inode, struct lustre_md *lmd); +bool ll_foreign_is_openable(struct dentry *dentry, unsigned int flags); +bool ll_foreign_is_removable(struct dentry *dentry, bool unset); + +int ll_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); + +#endif /* LLITE_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c new file mode 100644 index 0000000000000..0623294d9f4c0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c @@ -0,0 +1,3909 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/llite/llite_lib.c + * + * Lustre Light Super operations + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef HAVE_CPUS_READ_LOCK +#include +#endif +#include +#include +#ifdef HAVE_UAPI_LINUX_MOUNT_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include "llite_internal.h" + +struct kmem_cache *ll_file_data_slab; + +#ifndef log2 +#define log2(n) ffz(~(n)) +#endif + +/** + * If there is only one number of core visible to Lustre, + * async readahead will be disabled, to avoid massive over + * subscription, we use 1/2 of active cores as default max + * async readahead requests. + */ +static inline unsigned int ll_get_ra_async_max_active(void) +{ + return cfs_cpt_weight(cfs_cpt_tab, CFS_CPT_ANY) >> 1; +} + +static struct ll_sb_info *ll_init_sbi(void) +{ + struct ll_sb_info *sbi = NULL; + unsigned long pages; + unsigned long lru_page_max; + struct sysinfo si; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(sbi); + if (sbi == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = pcc_super_init(&sbi->ll_pcc_super); + if (rc < 0) + GOTO(out_sbi, rc); + + spin_lock_init(&sbi->ll_lock); + mutex_init(&sbi->ll_lco.lco_lock); + spin_lock_init(&sbi->ll_pp_extent_lock); + spin_lock_init(&sbi->ll_process_lock); + sbi->ll_rw_stats_on = 0; + sbi->ll_statfs_max_age = OBD_STATFS_CACHE_SECONDS; + sbi->ll_neg_dentry_timeout = OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS; + + si_meminfo(&si); + pages = si.totalram - si.totalhigh; + lru_page_max = pages / 2; + + sbi->ll_ra_info.ra_async_max_active = ll_get_ra_async_max_active(); + sbi->ll_ra_info.ll_readahead_wq = + cfs_cpt_bind_workqueue("ll-readahead-wq", cfs_cpt_tab, + 0, CFS_CPT_ANY, + sbi->ll_ra_info.ra_async_max_active); + if (IS_ERR(sbi->ll_ra_info.ll_readahead_wq)) + GOTO(out_pcc, rc = PTR_ERR(sbi->ll_ra_info.ll_readahead_wq)); + + /* initialize ll_cache data */ + sbi->ll_cache = cl_cache_init(lru_page_max); + if (sbi->ll_cache == NULL) + GOTO(out_destroy_ra, rc = -ENOMEM); + + /* initialize foreign symlink prefix path */ + OBD_ALLOC(sbi->ll_foreign_symlink_prefix, sizeof("/mnt/")); + if (sbi->ll_foreign_symlink_prefix == NULL) + GOTO(out_destroy_ra, rc = -ENOMEM); + memcpy(sbi->ll_foreign_symlink_prefix, "/mnt/", sizeof("/mnt/")); + sbi->ll_foreign_symlink_prefix_size = sizeof("/mnt/"); + + /* initialize foreign symlink upcall path, none by default */ + OBD_ALLOC(sbi->ll_foreign_symlink_upcall, sizeof("none")); + if (sbi->ll_foreign_symlink_upcall == NULL) + GOTO(out_destroy_ra, rc = -ENOMEM); + memcpy(sbi->ll_foreign_symlink_upcall, "none", sizeof("none")); + sbi->ll_foreign_symlink_upcall_items = NULL; + sbi->ll_foreign_symlink_upcall_nb_items = 0; + init_rwsem(&sbi->ll_foreign_symlink_sem); + /* foreign symlink support (LL_SBI_FOREIGN_SYMLINK in ll_flags) + * not enabled by default + */ + + sbi->ll_secctx_name 
= NULL; + sbi->ll_secctx_name_size = 0; + + sbi->ll_ra_info.ra_max_pages = + min(pages / 32, SBI_DEFAULT_READ_AHEAD_MAX); + sbi->ll_ra_info.ra_max_pages_per_file = + min(sbi->ll_ra_info.ra_max_pages / 4, + SBI_DEFAULT_READ_AHEAD_PER_FILE_MAX); + sbi->ll_ra_info.ra_async_pages_per_file_threshold = + sbi->ll_ra_info.ra_max_pages_per_file; + sbi->ll_ra_info.ra_range_pages = SBI_DEFAULT_RA_RANGE_PAGES; + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = -1; + atomic_set(&sbi->ll_ra_info.ra_async_inflight, 0); + + set_bit(LL_SBI_VERBOSE, sbi->ll_flags); +#ifdef ENABLE_CHECKSUM + set_bit(LL_SBI_CHECKSUM, sbi->ll_flags); +#endif +#ifdef ENABLE_FLOCK + set_bit(LL_SBI_FLOCK, sbi->ll_flags); +#endif + +#ifdef HAVE_LRU_RESIZE_SUPPORT + set_bit(LL_SBI_LRU_RESIZE, sbi->ll_flags); +#endif + set_bit(LL_SBI_LAZYSTATFS, sbi->ll_flags); + + /* metadata statahead is enabled by default */ + sbi->ll_sa_running_max = LL_SA_RUNNING_DEF; + sbi->ll_sa_max = LL_SA_RPC_DEF; + atomic_set(&sbi->ll_sa_total, 0); + atomic_set(&sbi->ll_sa_wrong, 0); + atomic_set(&sbi->ll_sa_running, 0); + atomic_set(&sbi->ll_agl_total, 0); + set_bit(LL_SBI_AGL_ENABLED, sbi->ll_flags); + set_bit(LL_SBI_FAST_READ, sbi->ll_flags); + set_bit(LL_SBI_TINY_WRITE, sbi->ll_flags); + set_bit(LL_SBI_PARALLEL_DIO, sbi->ll_flags); + ll_sbi_set_encrypt(sbi, true); + ll_sbi_set_name_encrypt(sbi, true); + + /* root squash */ + sbi->ll_squash.rsi_uid = 0; + sbi->ll_squash.rsi_gid = 0; + INIT_LIST_HEAD(&sbi->ll_squash.rsi_nosquash_nids); + spin_lock_init(&sbi->ll_squash.rsi_lock); + + /* Per-filesystem file heat */ + sbi->ll_heat_decay_weight = SBI_DEFAULT_HEAT_DECAY_WEIGHT; + sbi->ll_heat_period_second = SBI_DEFAULT_HEAT_PERIOD_SECOND; + + /* Per-fs open heat level before requesting open lock */ + sbi->ll_oc_thrsh_count = SBI_DEFAULT_OPENCACHE_THRESHOLD_COUNT; + sbi->ll_oc_max_ms = SBI_DEFAULT_OPENCACHE_THRESHOLD_MAX_MS; + sbi->ll_oc_thrsh_ms = SBI_DEFAULT_OPENCACHE_THRESHOLD_MS; + + /* MDLL */ + atomic_set(&sbi->ll_dir_restore_max_retry_count, + LL_MDLL_DIR_RESTORE_DEF_RETRY_COUNT); + + RETURN(sbi); +out_destroy_ra: + if (sbi->ll_foreign_symlink_prefix) + OBD_FREE(sbi->ll_foreign_symlink_prefix, sizeof("/mnt/")); + if (sbi->ll_cache) { + cl_cache_decref(sbi->ll_cache); + sbi->ll_cache = NULL; + } + destroy_workqueue(sbi->ll_ra_info.ll_readahead_wq); +out_pcc: + pcc_super_fini(&sbi->ll_pcc_super); +out_sbi: + OBD_FREE_PTR(sbi); + RETURN(ERR_PTR(rc)); +} + +static void ll_free_sbi(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + ENTRY; + + if (sbi != NULL) { + if (!list_empty(&sbi->ll_squash.rsi_nosquash_nids)) + cfs_free_nidlist(&sbi->ll_squash.rsi_nosquash_nids); + if (sbi->ll_ra_info.ll_readahead_wq) + destroy_workqueue(sbi->ll_ra_info.ll_readahead_wq); + if (sbi->ll_cache != NULL) { + cl_cache_decref(sbi->ll_cache); + sbi->ll_cache = NULL; + } + if (sbi->ll_foreign_symlink_prefix) { + OBD_FREE(sbi->ll_foreign_symlink_prefix, + sbi->ll_foreign_symlink_prefix_size); + sbi->ll_foreign_symlink_prefix = NULL; + } + if (sbi->ll_foreign_symlink_upcall) { + OBD_FREE(sbi->ll_foreign_symlink_upcall, + strlen(sbi->ll_foreign_symlink_upcall) + + 1); + sbi->ll_foreign_symlink_upcall = NULL; + } + if (sbi->ll_foreign_symlink_upcall_items) { + int i; + int nb_items = sbi->ll_foreign_symlink_upcall_nb_items; + struct ll_foreign_symlink_upcall_item *items = + sbi->ll_foreign_symlink_upcall_items; + + for (i = 0 ; i < nb_items; i++) + if (items[i].type == STRING_TYPE) + OBD_FREE(items[i].string, + items[i].size); + + OBD_FREE_LARGE(items, nb_items * 
+ sizeof(struct ll_foreign_symlink_upcall_item)); + sbi->ll_foreign_symlink_upcall_items = NULL; + } + if (sbi->ll_secctx_name) + ll_secctx_name_free(sbi); + + ll_free_rw_stats_info(sbi); + pcc_super_fini(&sbi->ll_pcc_super); + OBD_FREE(sbi, sizeof(*sbi)); + } + EXIT; +} + +static int client_common_fill_super(struct super_block *sb, char *md, char *dt) +{ + struct inode *root = NULL; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_statfs *osfs = NULL; + struct ptlrpc_request *request = NULL; + struct obd_connect_data *data = NULL; + struct obd_uuid *uuid; + struct md_op_data *op_data; + struct lustre_md lmd; + u64 valid; + int size, err, checksum; + bool api32; + void *encctx; + int encctxlen; + + ENTRY; + sbi->ll_md_obd = class_name2obd(md); + if (!sbi->ll_md_obd) { + CERROR("MD %s: not setup or attached\n", md); + RETURN(-EINVAL); + } + + OBD_ALLOC_PTR(data); + if (data == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC_PTR(osfs); + if (osfs == NULL) { + OBD_FREE_PTR(data); + RETURN(-ENOMEM); + } + + /* pass client page size via ocd_grant_blkbits, the server should report + * back its backend blocksize for grant calculation purpose */ + data->ocd_grant_blkbits = PAGE_SHIFT; + + /* indicate MDT features supported by this client */ + data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | + OBD_CONNECT_ATTRFID | OBD_CONNECT_GRANT | + OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_SRVLOCK | + OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | + OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | + OBD_CONNECT_EINPROGRESS | + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS| + OBD_CONNECT_MAX_EASIZE | + OBD_CONNECT_FLOCK_DEAD | + OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | + OBD_CONNECT_OPEN_BY_FID | + OBD_CONNECT_DIR_STRIPE | + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | + OBD_CONNECT_SUBTREE | + OBD_CONNECT_MULTIMODRPCS | + OBD_CONNECT_GRANT_PARAM | + OBD_CONNECT_GRANT_SHRINK | + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2; + + data->ocd_connect_flags2 = OBD_CONNECT2_DIR_MIGRATE | + OBD_CONNECT2_SUM_STATFS | + OBD_CONNECT2_OVERSTRIPING | + OBD_CONNECT2_FLR | + OBD_CONNECT2_LOCK_CONVERT | + OBD_CONNECT2_ARCHIVE_ID_ARRAY | + OBD_CONNECT2_INC_XID | + OBD_CONNECT2_LSOM | + OBD_CONNECT2_ASYNC_DISCARD | + OBD_CONNECT2_PCC | + OBD_CONNECT2_CRUSH | OBD_CONNECT2_LSEEK | + OBD_CONNECT2_GETATTR_PFID | + OBD_CONNECT2_DOM_LVB | + OBD_CONNECT2_REP_MBITS | + OBD_CONNECT2_ATOMIC_OPEN_LOCK | + OBD_CONNECT2_MDLL; + + if (test_bit(LL_SBI_MDLL, sbi->ll_flags)) + data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL; + + if (test_bit(LL_SBI_MDLL_BYPASS, sbi->ll_flags)) + data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL_BYPASS; + + if (test_bit(LL_SBI_MDLL_AUTO_REFRESH, sbi->ll_flags)) + data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL_AUTO_REFRESH; + +#ifdef HAVE_LRU_RESIZE_SUPPORT + if (test_bit(LL_SBI_LRU_RESIZE, sbi->ll_flags)) + data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; +#endif + data->ocd_connect_flags |= OBD_CONNECT_ACL_FLAGS; + + data->ocd_cksum_types = obd_cksum_types_supported_client(); + + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT)) + /* flag mdc connection as lightweight, only used for test + * purpose, use with care */ + data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT; + + data->ocd_ibits_known = MDS_INODELOCK_FULL; + data->ocd_version = LUSTRE_VERSION_CODE; + + if (sb->s_flags & SB_RDONLY) + data->ocd_connect_flags |= OBD_CONNECT_RDONLY; + if 
(test_bit(LL_SBI_USER_XATTR, sbi->ll_flags)) + data->ocd_connect_flags |= OBD_CONNECT_XATTR; + +#ifdef SB_NOSEC + /* Setting this indicates we correctly support S_NOSEC (See kernel + * commit 9e1f1de02c2275d7172e18dc4e7c2065777611bf) + */ + sb->s_flags |= SB_NOSEC; +#endif + sbi->ll_fop = ll_select_file_operations(sbi); + + /* always ping even if server suppress_pings */ + if (test_bit(LL_SBI_ALWAYS_PING, sbi->ll_flags)) + data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; + + obd_connect_set_secctx(data); + if (ll_sbi_has_encrypt(sbi)) { + obd_connect_set_name_enc(data); + obd_connect_set_enc(data); + } + +#if defined(CONFIG_SECURITY) + data->ocd_connect_flags2 |= OBD_CONNECT2_SELINUX_POLICY; +#endif + + data->ocd_brw_size = MD_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_md_exp, sbi->ll_md_obd, + &sbi->ll_sb_uuid, data, sbi->ll_cache); + if (err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing " + "recovery, of which this client is not a " + "part. Please wait for recovery to complete," + " abort, or time out.\n", md); + GOTO(out, err); + } else if (err) { + CERROR("cannot connect to %s: rc = %d\n", md, err); + GOTO(out, err); + } + + sbi->ll_md_exp->exp_connect_data = *data; + + err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init metadata layer FID infrastructure, " + "rc = %d\n", sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_md, err); + } + + /* For mount, we only need fs info from MDT0, and also in DNE, it + * can make sure the client can be mounted as long as MDT0 is + * avaible */ + err = obd_statfs(NULL, sbi->ll_md_exp, osfs, + ktime_get_seconds() - sbi->ll_statfs_max_age, + OBD_STATFS_FOR_MDT0); + if (err) + GOTO(out_md_fid, err); + + /* This needs to be after statfs to ensure connect has finished. + * Note that "data" does NOT contain the valid connect reply. + * If connecting to a 1.8 server there will be no LMV device, so + * we can access the MDC export directly and exp_connect_flags will + * be non-zero, but if accessing an upgraded 2.1 server it will + * have the correct flags filled in. + * XXX: fill in the LMV exp_connect_flags from MDC(s). */ + valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD; + if (exp_connect_flags(sbi->ll_md_exp) != 0 && + valid != CLIENT_CONNECT_MDT_REQD) { + char *buf; + + OBD_ALLOC_WAIT(buf, PAGE_SIZE); + obd_connect_flags2str(buf, PAGE_SIZE, + valid ^ CLIENT_CONNECT_MDT_REQD, 0, ","); + LCONSOLE_ERROR_MSG(0x170, "Server %s does not support " + "feature(s) needed for correct operation " + "of this client (%s). 
Please upgrade " + "server or downgrade client.\n", + sbi->ll_md_exp->exp_obd->obd_name, buf); + OBD_FREE(buf, PAGE_SIZE); + GOTO(out_md_fid, err = -EPROTO); + } + + size = sizeof(*data); + err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA), + KEY_CONN_DATA, &size, data); + if (err) { + CERROR("%s: Get connect data failed: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_md_fid, err); + } + + LASSERT(osfs->os_bsize); + sb->s_blocksize = osfs->os_bsize; + sb->s_blocksize_bits = log2(osfs->os_bsize); + sb->s_magic = LL_SUPER_MAGIC; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sbi->ll_inode_cache_enabled = 1; + sbi->ll_namelen = osfs->os_namelen; + sbi->ll_mnt.mnt = current->fs->root.mnt; + sbi->ll_mnt_ns = current->nsproxy->mnt_ns; + + if (test_bit(LL_SBI_USER_XATTR, sbi->ll_flags) && + !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) { + LCONSOLE_INFO("Disabling user_xattr feature because " + "it is not supported on the server\n"); + clear_bit(LL_SBI_USER_XATTR, sbi->ll_flags); + } + + if (data->ocd_connect_flags & OBD_CONNECT_ACL) { +#ifdef SB_POSIXACL + sb->s_flags |= SB_POSIXACL; +#endif + set_bit(LL_SBI_ACL, sbi->ll_flags); + } else { + LCONSOLE_INFO("client wants to enable acl, but mdt not!\n"); +#ifdef SB_POSIXACL + sb->s_flags &= ~SB_POSIXACL; +#endif + clear_bit(LL_SBI_ACL, sbi->ll_flags); + } + + if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH) + set_bit(LL_SBI_64BIT_HASH, sbi->ll_flags); + + if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) + set_bit(LL_SBI_LAYOUT_LOCK, sbi->ll_flags); + + if (obd_connect_has_secctx(data)) + set_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags); + + if (ll_sbi_has_encrypt(sbi) && !obd_connect_has_enc(data)) { + if (ll_sb_has_test_dummy_encryption(sb)) + LCONSOLE_WARN("%s: server %s does not support encryption feature, encryption deactivated.\n", + sbi->ll_fsname, + sbi->ll_md_exp->exp_obd->obd_name); + ll_sbi_set_encrypt(sbi, false); + } + + if (ll_sbi_has_name_encrypt(sbi) && !obd_connect_has_name_enc(data)) { + struct lustre_sb_info *lsi = s2lsi(sb); + + if (ll_sb_has_test_dummy_encryption(sb)) + LCONSOLE_WARN("%s: server %s does not support name encryption, not using it.\n", + sbi->ll_fsname, + sbi->ll_md_exp->exp_obd->obd_name); + lsi->lsi_flags &= ~LSI_FILENAME_ENC; + lsi->lsi_flags &= ~LSI_FILENAME_ENC_B64_OLD_CLI; + ll_sbi_set_name_encrypt(sbi, false); + } + + if (data->ocd_ibits_known & MDS_INODELOCK_XATTR) { + if (!(data->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)) { + LCONSOLE_INFO("%s: disabling xattr cache due to " + "unknown maximum xattr size.\n", dt); + } else if (!sbi->ll_xattr_cache_set) { + /* If xattr_cache is already set (no matter 0 or 1) + * during processing llog, it won't be enabled here. 
*/ + set_bit(LL_SBI_XATTR_CACHE, sbi->ll_flags); + sbi->ll_xattr_cache_enabled = 1; + } + } + + sbi->ll_dt_obd = class_name2obd(dt); + if (!sbi->ll_dt_obd) { + CERROR("DT %s: not setup or attached\n", dt); + GOTO(out_md_fid, err = -ENODEV); + } + + /* pass client page size via ocd_grant_blkbits, the server should report + * back its backend blocksize for grant calculation purpose */ + data->ocd_grant_blkbits = PAGE_SHIFT; + + /* indicate OST features supported by this client */ + data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | + OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_SRVLOCK | + OBD_CONNECT_AT | OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | + OBD_CONNECT_EINPROGRESS | + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | + OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_SHORTIO | + OBD_CONNECT_FLAGS2 | OBD_CONNECT_GRANT_SHRINK; + data->ocd_connect_flags2 = OBD_CONNECT2_LOCKAHEAD | + OBD_CONNECT2_INC_XID | OBD_CONNECT2_LSEEK | + OBD_CONNECT2_REP_MBITS; + + if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_GRANT_PARAM)) + data->ocd_connect_flags |= OBD_CONNECT_GRANT_PARAM; + + /* OBD_CONNECT_CKSUM should always be set, even if checksums are + * disabled by default, because it can still be enabled on the + * fly via /sys. As a consequence, we still need to come to an + * agreement on the supported algorithms at connect time + */ + data->ocd_connect_flags |= OBD_CONNECT_CKSUM; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY)) + data->ocd_cksum_types = OBD_CKSUM_ADLER; + else + data->ocd_cksum_types = obd_cksum_types_supported_client(); + +#ifdef HAVE_LRU_RESIZE_SUPPORT + data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; +#endif + /* always ping even if server suppress_pings */ + if (test_bit(LL_SBI_ALWAYS_PING, sbi->ll_flags)) + data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; + + if (ll_sbi_has_encrypt(sbi)) + obd_connect_set_enc(data); + + CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d " + "ocd_grant: %d\n", data->ocd_connect_flags, + data->ocd_version, data->ocd_grant); + + sbi->ll_dt_obd->obd_upcall.onu_owner = &sbi->ll_lco; + sbi->ll_dt_obd->obd_upcall.onu_upcall = cl_ocd_update; + + data->ocd_brw_size = DT_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_dt_exp, sbi->ll_dt_obd, + &sbi->ll_sb_uuid, data, sbi->ll_cache); + if (err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing " + "recovery, of which this client is not a " + "part. 
Please wait for recovery to " + "complete, abort, or time out.\n", dt); + GOTO(out_md, err); + } else if (err) { + CERROR("%s: Cannot connect to %s: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, dt, err); + GOTO(out_md, err); + } + + if (ll_sbi_has_encrypt(sbi) && + !obd_connect_has_enc(&sbi->ll_dt_obd->u.lov.lov_ocd)) { + if (ll_sb_has_test_dummy_encryption(sb)) + LCONSOLE_WARN("%s: server %s does not support encryption feature, encryption deactivated.\n", + sbi->ll_fsname, dt); + ll_sbi_set_encrypt(sbi, false); + } else if (ll_sb_has_test_dummy_encryption(sb)) { + LCONSOLE_WARN("Test dummy encryption mode enabled\n"); + } + + sbi->ll_dt_exp->exp_connect_data = *data; + + /* Don't change value if it was specified in the config log */ + if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages == -1) { + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = + max_t(unsigned long, SBI_DEFAULT_READ_AHEAD_WHOLE_MAX, + (data->ocd_brw_size >> PAGE_SHIFT)); + if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages > + sbi->ll_ra_info.ra_max_pages_per_file) + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = + sbi->ll_ra_info.ra_max_pages_per_file; + } + + err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init data layer FID infrastructure, " + "rc = %d\n", sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_dt, err); + } + + mutex_lock(&sbi->ll_lco.lco_lock); + sbi->ll_lco.lco_flags = data->ocd_connect_flags; + sbi->ll_lco.lco_md_exp = sbi->ll_md_exp; + sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp; + mutex_unlock(&sbi->ll_lco.lco_lock); + + fid_zero(&sbi->ll_root_fid); + err = md_get_root(sbi->ll_md_exp, get_mount_fileset(sb), + &sbi->ll_root_fid); + if (err) { + CERROR("cannot mds_connect: rc = %d\n", err); + GOTO(out_lock_cn_cb, err); + } + if (!fid_is_sane(&sbi->ll_root_fid)) { + CERROR("%s: Invalid root fid "DFID" during mount\n", + sbi->ll_md_exp->exp_obd->obd_name, + PFID(&sbi->ll_root_fid)); + GOTO(out_lock_cn_cb, err = -EINVAL); + } + CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid)); + + sb->s_op = &lustre_super_operations; + sb->s_xattr = ll_xattr_handlers; +#if THREAD_SIZE >= 8192 /*b=17630*/ + sb->s_export_op = &lustre_export_operations; +#endif +#ifdef HAVE_LUSTRE_CRYPTO + llcrypt_set_ops(sb, &lustre_cryptops); +#endif + + /* make root inode + * XXX: move this to after cbd setup? */ + valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMODEASIZE | + OBD_MD_ENCCTX; + if (test_bit(LL_SBI_ACL, sbi->ll_flags)) + valid |= OBD_MD_FLACL; + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out_lock_cn_cb, err = -ENOMEM); + + op_data->op_fid1 = sbi->ll_root_fid; + op_data->op_mode = 0; + op_data->op_valid = valid; + + err = md_getattr(sbi->ll_md_exp, op_data, &request); + + /* We need enc ctx info, so reset it in op_data to + * prevent it from being freed. 
+ */ + encctx = op_data->op_file_encctx; + encctxlen = op_data->op_file_encctx_size; + op_data->op_file_encctx = NULL; + op_data->op_file_encctx_size = 0; + OBD_FREE_PTR(op_data); + if (err) { + CERROR("%s: md_getattr failed for root: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_lock_cn_cb, err); + } + + err = md_get_lustre_md(sbi->ll_md_exp, &request->rq_pill, + sbi->ll_dt_exp, sbi->ll_md_exp, &lmd); + if (err) { + CERROR("failed to understand root inode md: rc = %d\n", err); + ptlrpc_req_finished(request); + GOTO(out_lock_cn_cb, err); + } + + LASSERT(fid_is_sane(&sbi->ll_root_fid)); + api32 = test_bit(LL_SBI_32BIT_API, sbi->ll_flags); + root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid, api32), &lmd); + md_free_lustre_md(sbi->ll_md_exp, &lmd); + + if (IS_ERR(root)) { + lmd_clear_acl(&lmd); + err = IS_ERR(root) ? PTR_ERR(root) : -EBADF; + root = NULL; + CERROR("%s: bad ll_iget() for root: rc = %d\n", + sbi->ll_fsname, err); + ptlrpc_req_finished(request); + GOTO(out_root, err); + } + + err = ll_secctx_name_store(root); + if (err < 0 && ll_security_xattr_wanted(root)) + CWARN("%s: file security contextes not supported: rc = %d\n", + sbi->ll_fsname, err); + + err = 0; + if (encctxlen) { + CDEBUG(D_SEC, + "server returned encryption ctx for root inode "DFID"\n", + PFID(&sbi->ll_root_fid)); + err = ll_set_encflags(root, encctx, encctxlen, true); + if (err) + CWARN("%s: cannot set enc ctx for "DFID": rc = %d\n", + sbi->ll_fsname, + PFID(&sbi->ll_root_fid), err); + } + ptlrpc_req_finished(request); + + checksum = test_bit(LL_SBI_CHECKSUM, sbi->ll_flags); + if (sbi->ll_checksum_set) { + err = obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_CHECKSUM), KEY_CHECKSUM, + sizeof(checksum), &checksum, NULL); + if (err) { + CERROR("%s: Set checksum failed: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_root, err); + } + } + cl_sb_init(sb); + + sb->s_root = d_make_root(root); + if (sb->s_root == NULL) { + err = -ENOMEM; + CERROR("%s: can't make root dentry: rc = %d\n", + sbi->ll_fsname, err); + GOTO(out_root, err); + } + + sbi->ll_sdev_orig = sb->s_dev; + + /* We set sb->s_dev equal on all lustre clients in order to support + * NFS export clustering. NFSD requires that the FSID be the same + * on all clients. */ + /* s_dev is also used in lt_compare() to compare two fs, but that is + * only a node-local comparison. 
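The code just below derives sb->s_dev from the MDC UUID so every client computes the same FSID for NFS export clustering. The sketch here (plain C) only illustrates the property relied on, namely that a deterministic hash of the same UUID string gives the same value on every node; the FNV-1a routine and names are invented for the sketch and are not the real get_uuid2int().

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t uuid_to_id(const char *name, size_t len)
{
	uint32_t hash = 2166136261u;	/* FNV-1a, for illustration only */
	size_t i;

	for (i = 0; i < len; i++) {
		hash ^= (unsigned char)name[i];
		hash *= 16777619u;
	}
	return hash;
}

int main(void)
{
	const char *uuid = "example-mdc-uuid";

	/* two clients hashing the same UUID string agree on the id */
	printf("%#x\n", (unsigned int)uuid_to_id(uuid, strlen(uuid)));
	printf("%#x\n", (unsigned int)uuid_to_id(uuid, strlen(uuid)));
	return 0;
}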
*/ + uuid = obd_get_uuid(sbi->ll_md_exp); + if (uuid != NULL) + sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid)); + + if (data != NULL) + OBD_FREE_PTR(data); + if (osfs != NULL) + OBD_FREE_PTR(osfs); + + if (sbi->ll_dt_obd) { + err = sysfs_create_link(&sbi->ll_kset.kobj, + &sbi->ll_dt_obd->obd_kset.kobj, + sbi->ll_dt_obd->obd_type->typ_name); + if (err < 0) { + CERROR("%s: could not register %s in llite: rc = %d\n", + dt, sbi->ll_fsname, err); + err = 0; + } + } + + if (sbi->ll_md_obd) { + err = sysfs_create_link(&sbi->ll_kset.kobj, + &sbi->ll_md_obd->obd_kset.kobj, + sbi->ll_md_obd->obd_type->typ_name); + if (err < 0) { + CERROR("%s: could not register %s in llite: rc = %d\n", + md, sbi->ll_fsname, err); + err = 0; + } + } + + RETURN(err); +out_root: + iput(root); +out_lock_cn_cb: + obd_fid_fini(sbi->ll_dt_exp->exp_obd); +out_dt: + obd_disconnect(sbi->ll_dt_exp); + sbi->ll_dt_exp = NULL; + sbi->ll_dt_obd = NULL; +out_md_fid: + obd_fid_fini(sbi->ll_md_exp->exp_obd); +out_md: + obd_disconnect(sbi->ll_md_exp); + sbi->ll_md_exp = NULL; + sbi->ll_md_obd = NULL; +out: + if (data != NULL) + OBD_FREE_PTR(data); + if (osfs != NULL) + OBD_FREE_PTR(osfs); + return err; +} + +int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + size = sizeof(*lmmsize); + rc = obd_get_info(NULL, sbi->ll_dt_exp, sizeof(KEY_MAX_EASIZE), + KEY_MAX_EASIZE, &size, lmmsize); + if (rc != 0) { + CERROR("%s: cannot get max LOV EA size: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, rc); + RETURN(rc); + } + + CDEBUG(D_INFO, "max LOV ea size: %d\n", *lmmsize); + + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE), + KEY_MAX_EASIZE, &size, lmmsize); + if (rc) + CERROR("Get max mdsize error rc %d\n", rc); + + CDEBUG(D_INFO, "max LMV ea size: %d\n", *lmmsize); + + RETURN(rc); +} + +/** + * Get the value of the default_easize parameter. + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] sbi superblock info for this filesystem + * \param[out] lmmsize pointer to storage location for value + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +int ll_get_default_mdsize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_DEFAULT_EASIZE), + KEY_DEFAULT_EASIZE, &size, lmmsize); + if (rc) + CERROR("Get default mdsize error rc %d\n", rc); + + RETURN(rc); +} + +/** + * Set the default_easize parameter to the given value. 
+ * + * \see client_obd::cl_default_mds_easize + * + * \param[in] sbi superblock info for this filesystem + * \param[in] lmmsize the size to set + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +int ll_set_default_mdsize(struct ll_sb_info *sbi, int lmmsize) +{ + int rc; + + if (lmmsize < sizeof(struct lov_mds_md) || + lmmsize > OBD_MAX_DEFAULT_EA_SIZE) + return -EINVAL; + + rc = obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_DEFAULT_EASIZE), KEY_DEFAULT_EASIZE, + sizeof(int), &lmmsize, NULL); + + RETURN(rc); +} + +static void client_common_put_super(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + ENTRY; + + cl_sb_fini(sb); + + obd_fid_fini(sbi->ll_dt_exp->exp_obd); + obd_disconnect(sbi->ll_dt_exp); + sbi->ll_dt_exp = NULL; + + ll_debugfs_unregister_super(sb); + + obd_fid_fini(sbi->ll_md_exp->exp_obd); + obd_disconnect(sbi->ll_md_exp); + sbi->ll_md_exp = NULL; + + EXIT; +} + +void ll_kill_super(struct super_block *sb) +{ + struct ll_sb_info *sbi; + ENTRY; + + /* not init sb ?*/ + if (!(sb->s_flags & SB_ACTIVE)) + return; + + sbi = ll_s2sbi(sb); + /* we need restore s_dev from changed for clustred NFS before put_super + * because new kernels have cached s_dev and change sb->s_dev in + * put_super not affected real removing devices */ + if (sbi) { + sb->s_dev = sbi->ll_sdev_orig; + + /* wait running statahead threads to quit */ + while (atomic_read(&sbi->ll_sa_running) > 0) + schedule_timeout_uninterruptible( + cfs_time_seconds(1) >> 3); + } + + EXIT; +} + +/* Since we use this table for ll_sbi_flags_seq_show make + * sure what you want displayed for a specific token that + * is listed more than once below be listed first. For + * example we want "checksum" displayed, not "nochecksum" + * for the sbi_flags. 
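The comment above (and the table that follows) relies on the first entry for a token being the one displayed when the flag is set. This standalone sketch (plain C, simplified table and names invented here, not part of the patch) shows that "first matching pattern wins" behaviour.

#include <stdio.h>

struct flag_name { int token; const char *pattern; };

/* For a set flag, a linear scan prints the first entry with that token,
 * which is why "checksum" must precede "nochecksum" in such a table. */
static const struct flag_name names[] = {
	{ 1, "checksum" },
	{ 1, "nochecksum" },
	{ 2, "flock" },
	{ 2, "noflock" },
};

static const char *first_pattern(int token)
{
	size_t i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		if (names[i].token == token)
			return names[i].pattern;
	return NULL;
}

int main(void)
{
	printf("%s\n", first_pattern(1));	/* prints "checksum" */
	return 0;
}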
+ */ +static const match_table_t ll_sbi_flags_name = { + {LL_SBI_NOLCK, "nolock"}, + {LL_SBI_CHECKSUM, "checksum"}, + {LL_SBI_CHECKSUM, "nochecksum"}, + {LL_SBI_LOCALFLOCK, "localflock"}, + {LL_SBI_FLOCK, "flock"}, + {LL_SBI_FLOCK, "noflock"}, + {LL_SBI_USER_XATTR, "user_xattr"}, + {LL_SBI_USER_XATTR, "nouser_xattr"}, + {LL_SBI_LRU_RESIZE, "lruresize"}, + {LL_SBI_LRU_RESIZE, "nolruresize"}, + {LL_SBI_LAZYSTATFS, "lazystatfs"}, + {LL_SBI_LAZYSTATFS, "nolazystatfs"}, + {LL_SBI_32BIT_API, "32bitapi"}, + {LL_SBI_USER_FID2PATH, "user_fid2path"}, + {LL_SBI_USER_FID2PATH, "nouser_fid2path"}, + {LL_SBI_VERBOSE, "verbose"}, + {LL_SBI_VERBOSE, "noverbose"}, + {LL_SBI_ALWAYS_PING, "always_ping"}, + {LL_SBI_TEST_DUMMY_ENCRYPTION, "test_dummy_encryption=%s"}, + {LL_SBI_TEST_DUMMY_ENCRYPTION, "test_dummy_encryption"}, + {LL_SBI_ENCRYPT, "encrypt"}, + {LL_SBI_ENCRYPT, "noencrypt"}, + {LL_SBI_FOREIGN_SYMLINK, "foreign_symlink=%s"}, + {LL_SBI_ALLOW_VERSION_MISMATCH, "allow_version_mismatch"}, + {LL_SBI_MDLL_BYPASS, "mdll_bypass"}, + {LL_SBI_NUM_MOUNT_OPT, NULL}, + + {LL_SBI_ACL, "acl"}, + {LL_SBI_AGL_ENABLED, "agl"}, + {LL_SBI_64BIT_HASH, "64bit_hash"}, + {LL_SBI_LAYOUT_LOCK, "layout"}, + {LL_SBI_XATTR_CACHE, "xattr_cache"}, + {LL_SBI_NOROOTSQUASH, "norootsquash"}, + {LL_SBI_FAST_READ, "fast_read"}, + {LL_SBI_FILE_SECCTX, "file_secctx"}, + {LL_SBI_TINY_WRITE, "tiny_write"}, + {LL_SBI_FILE_HEAT, "file_heat"}, + {LL_SBI_PARALLEL_DIO, "parallel_dio"}, + {LL_SBI_ENCRYPT_NAME, "name_encrypt"}, + {LL_SBI_MDLL_AUTO_REFRESH, "mdll_auto_refresh"}, + {LL_SBI_MDLL, "mdll"}, +}; + +int ll_sbi_flags_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + int i; + + for (i = 0; i < LL_SBI_NUM_FLAGS; i++) { + int j; + + if (!test_bit(i, ll_s2sbi(sb)->ll_flags)) + continue; + + for (j = 0; j < ARRAY_SIZE(ll_sbi_flags_name); j++) { + if (ll_sbi_flags_name[j].token == i && + ll_sbi_flags_name[j].pattern) { + seq_printf(m, "%s ", + ll_sbi_flags_name[j].pattern); + break; + } + } + } + seq_puts(m, "\b\n"); + return 0; +} + +/* non-client-specific mount options are parsed in lmd_parse */ +static int ll_options(char *options, struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *s2, *s1, *opts; + int err = 0; + + ENTRY; + if (!options) + RETURN(0); + + /* Disallow version mismatch by default */ + allow_version_mismatch = 0; + + /* Don't stomp on lmd_opts */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + RETURN(-ENOMEM); + s1 = opts; + s2 = opts; + + CDEBUG(D_CONFIG, "Parsing opts %s\n", options); + + while ((s1 = strsep(&opts, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + bool turn_off = false; + int token; + + if (!*s1) + continue; + + CDEBUG(D_SUPER, "next opt=%s\n", s1); + + if (strncmp(s1, "no", 2) == 0) + turn_off = true; + + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. 
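+ * The option handlers below rely on this: e.g. the foreign_symlink case
+ * checks args->from to see whether a prefix value was actually supplied.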
+ */ + args[0].to = NULL; + args[0].from = NULL; + token = match_token(s1, ll_sbi_flags_name, args); + if (token == LL_SBI_NUM_MOUNT_OPT) { + if (match_wildcard("context", s1) || + match_wildcard("fscontext", s1) || + match_wildcard("defcontext", s1) || + match_wildcard("rootcontext",s1)) + continue; + + LCONSOLE_ERROR_MSG(0x152, + "Unknown option '%s', won't mount.\n", + s1); + RETURN(-EINVAL); + } + + switch (token) { + case LL_SBI_NOLCK: + case LL_SBI_32BIT_API: + case LL_SBI_64BIT_HASH: + case LL_SBI_ALWAYS_PING: + case LL_SBI_MDLL_AUTO_REFRESH: + case LL_SBI_MDLL: + case LL_SBI_MDLL_BYPASS: + set_bit(token, sbi->ll_flags); + break; + + case LL_SBI_FLOCK: + clear_bit(LL_SBI_LOCALFLOCK, sbi->ll_flags); + if (turn_off) + clear_bit(LL_SBI_FLOCK, sbi->ll_flags); + else + set_bit(token, sbi->ll_flags); + break; + + case LL_SBI_LOCALFLOCK: + clear_bit(LL_SBI_FLOCK, sbi->ll_flags); + set_bit(token, sbi->ll_flags); + break; + + case LL_SBI_CHECKSUM: + sbi->ll_checksum_set = 1; + fallthrough; + case LL_SBI_USER_XATTR: + case LL_SBI_USER_FID2PATH: + case LL_SBI_LRU_RESIZE: + case LL_SBI_LAZYSTATFS: + case LL_SBI_VERBOSE: + if (turn_off) + clear_bit(token, sbi->ll_flags); + else + set_bit(token, sbi->ll_flags); + break; + case LL_SBI_TEST_DUMMY_ENCRYPTION: { +#ifdef HAVE_LUSTRE_CRYPTO +#ifdef HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED + set_bit(token, sbi->ll_flags); +#else + struct lustre_sb_info *lsi = s2lsi(sb); + + err = llcrypt_set_test_dummy_encryption(sb, &args[0], + &lsi->lsi_dummy_enc_ctx); + if (!err) + break; + + if (err == -EEXIST) + LCONSOLE_WARN( + "Can't change test_dummy_encryption"); + else if (err == -EINVAL) + LCONSOLE_WARN( + "Value of option \"%s\" unrecognized", + options); + else + LCONSOLE_WARN( + "Error processing option \"%s\" [%d]", + options, err); + err = -1; +#endif +#else + LCONSOLE_WARN("Test dummy encryption mount option ignored: encryption not supported\n"); +#endif + break; + } + case LL_SBI_ENCRYPT: +#ifdef HAVE_LUSTRE_CRYPTO + if (turn_off) + clear_bit(token, sbi->ll_flags); + else + set_bit(token, sbi->ll_flags); +#else + LCONSOLE_WARN("noencrypt or encrypt mount option ignored: encryption not supported\n"); +#endif + break; + case LL_SBI_ALLOW_VERSION_MISMATCH: + allow_version_mismatch = 1; + break; + case LL_SBI_FOREIGN_SYMLINK: + /* non-default prefix provided ? 
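+ * (a supplied prefix must be an absolute path, otherwise -EINVAL is
+ * returned below)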
*/ + if (args->from) { + size_t old_len; + char *old; + + /* path must be absolute */ + if (args->from[0] != '/') { + LCONSOLE_ERROR_MSG(0x152, + "foreign prefix '%s' must be an absolute path\n", + args->from); + RETURN(-EINVAL); + } + + old_len = sbi->ll_foreign_symlink_prefix_size; + old = sbi->ll_foreign_symlink_prefix; + /* alloc for path length and '\0' */ + sbi->ll_foreign_symlink_prefix = match_strdup(args); + if (!sbi->ll_foreign_symlink_prefix) { + /* restore previous */ + sbi->ll_foreign_symlink_prefix = old; + sbi->ll_foreign_symlink_prefix_size = + old_len; + RETURN(-ENOMEM); + } + sbi->ll_foreign_symlink_prefix_size = + args->to - args->from + 1; + OBD_ALLOC_POST(sbi->ll_foreign_symlink_prefix, + sbi->ll_foreign_symlink_prefix_size, + "kmalloced"); + if (old) + OBD_FREE(old, old_len); + + /* enable foreign symlink support */ + set_bit(token, sbi->ll_flags); + } else { + LCONSOLE_ERROR_MSG(0x152, + "invalid %s option\n", s1); + } + fallthrough; + default: + break; + } + } + kfree(opts); + RETURN(err); +} + +void ll_lli_init(struct ll_inode_info *lli) +{ + lli->lli_inode_magic = LLI_INODE_MAGIC; + lli->lli_flags = 0; + rwlock_init(&lli->lli_lock); + lli->lli_posix_acl = NULL; + /* Do not set lli_fid, it has been initialized already. */ + fid_zero(&lli->lli_pfid); + lli->lli_mds_read_och = NULL; + lli->lli_mds_write_och = NULL; + lli->lli_mds_exec_och = NULL; + lli->lli_open_fd_read_count = 0; + lli->lli_open_fd_write_count = 0; + lli->lli_open_fd_exec_count = 0; + mutex_init(&lli->lli_och_mutex); + spin_lock_init(&lli->lli_agl_lock); + spin_lock_init(&lli->lli_layout_lock); + ll_layout_version_set(lli, CL_LAYOUT_GEN_NONE); + lli->lli_clob = NULL; + + init_rwsem(&lli->lli_xattrs_list_rwsem); + mutex_init(&lli->lli_xattrs_enq_lock); + + LASSERT(lli->lli_vfs_inode.i_mode != 0); + if (S_ISDIR(lli->lli_vfs_inode.i_mode)) { + lli->lli_opendir_key = NULL; + lli->lli_sai = NULL; + spin_lock_init(&lli->lli_sa_lock); + lli->lli_opendir_pid = 0; + lli->lli_sa_enabled = 0; + init_rwsem(&lli->lli_lsm_sem); + } else { + mutex_init(&lli->lli_size_mutex); + mutex_init(&lli->lli_setattr_mutex); + lli->lli_symlink_name = NULL; + ll_trunc_sem_init(&lli->lli_trunc_sem); + range_lock_tree_init(&lli->lli_write_tree); + init_rwsem(&lli->lli_glimpse_sem); + lli->lli_glimpse_time = ktime_set(0, 0); + INIT_LIST_HEAD(&lli->lli_agl_list); + lli->lli_agl_index = 0; + lli->lli_async_rc = 0; + spin_lock_init(&lli->lli_heat_lock); + obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT); + lli->lli_heat_flags = 0; + mutex_init(&lli->lli_pcc_lock); + lli->lli_pcc_state = PCC_STATE_FL_NONE; + lli->lli_pcc_inode = NULL; + lli->lli_pcc_dsflags = PCC_DATASET_INVALID; + lli->lli_pcc_generation = 0; + mutex_init(&lli->lli_group_mutex); + lli->lli_group_users = 0; + lli->lli_group_gid = 0; + } + mutex_init(&lli->lli_layout_mutex); + memset(lli->lli_jobid, 0, sizeof(lli->lli_jobid)); + /* ll_cl_context initialize */ + INIT_LIST_HEAD(&lli->lli_lccs); + seqlock_init(&lli->lli_page_inv_lock); +} + +#define MAX_STRING_SIZE 128 + +#ifndef HAVE_SUPER_SETUP_BDI_NAME +#ifndef HAVE_BDI_CAP_MAP_COPY +# define BDI_CAP_MAP_COPY 0 +#endif + +static int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) 
+{ + struct lustre_sb_info *lsi = s2lsi(sb); + char buf[MAX_STRING_SIZE]; + va_list args; + int err; + + err = bdi_init(&lsi->lsi_bdi); + if (err) + return err; + + lsi->lsi_flags |= LSI_BDI_INITIALIZED; + lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY; + lsi->lsi_bdi.name = "lustre"; + va_start(args, fmt); + vsnprintf(buf, MAX_STRING_SIZE, fmt, args); + va_end(args); + err = bdi_register(&lsi->lsi_bdi, NULL, "%s", buf); + va_end(args); + if (!err) + sb->s_bdi = &lsi->lsi_bdi; + + return err; +} +#endif /* !HAVE_SUPER_SETUP_BDI_NAME */ + +int ll_fill_super(struct super_block *sb) +{ + struct lustre_profile *lprof = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = NULL; + char *dt = NULL, *md = NULL; + char *profilenm = get_profile_name(sb); + struct config_llog_instance *cfg; + /* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */ + const int instlen = LUSTRE_MAXINSTANCE + 2; + unsigned long cfg_instance = ll_get_cfg_instance(sb); + char name[MAX_STRING_SIZE]; + int md_len = 0; + int dt_len = 0; + uuid_t uuid; + char *ptr; + int len; + int err; + + ENTRY; + /* for ASLR, to map between cfg_instance and hashed ptr */ + CDEBUG(D_VFSTRACE, "VFS Op: cfg_instance %s-%016lx (sb %p)\n", + profilenm, cfg_instance, sb); + + OBD_RACE(OBD_FAIL_LLITE_RACE_MOUNT); + + OBD_ALLOC_PTR(cfg); + if (cfg == NULL) + GOTO(out_free_cfg, err = -ENOMEM); + + /* client additional sb info */ + lsi->lsi_llsbi = sbi = ll_init_sbi(); + if (IS_ERR(sbi)) + GOTO(out_free_cfg, err = PTR_ERR(sbi)); + + err = ll_options(lsi->lsi_lmd->lmd_opts, sb); + if (err) + GOTO(out_free_cfg, err); + + /* LSI_FILENAME_ENC is only used by embedded llcrypt */ +#ifdef CONFIG_LL_ENCRYPTION + if (ll_sb_has_test_dummy_encryption(sb)) + /* enable filename encryption by default for dummy enc mode */ + lsi->lsi_flags |= LSI_FILENAME_ENC; + else + /* filename encryption is disabled by default */ + lsi->lsi_flags &= ~LSI_FILENAME_ENC; + /* Lustre 2.15 uses old-style base64 encoding by default */ + lsi->lsi_flags |= LSI_FILENAME_ENC_B64_OLD_CLI; +#endif + + /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */ + sb->s_d_op = &ll_d_ops; + + /* UUID handling */ + generate_random_uuid(uuid.b); + snprintf(sbi->ll_sb_uuid.uuid, sizeof(sbi->ll_sb_uuid), "%pU", uuid.b); + + CDEBUG(D_CONFIG, "llite sb uuid: %s\n", sbi->ll_sb_uuid.uuid); + + /* Get fsname */ + len = strlen(profilenm); + ptr = strrchr(profilenm, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; + + if (len > LUSTRE_MAXFSNAME) { + if (unlikely(len >= MAX_STRING_SIZE)) + len = MAX_STRING_SIZE - 1; + strncpy(name, profilenm, len); + name[len] = '\0'; + err = -ENAMETOOLONG; + CERROR("%s: fsname longer than %u characters: rc = %d\n", + name, LUSTRE_MAXFSNAME, err); + GOTO(out_free_cfg, err); + } + strncpy(sbi->ll_fsname, profilenm, len); + sbi->ll_fsname[len] = '\0'; + + /* Mount info */ + snprintf(name, sizeof(name), "%.*s-%016lx", len, + profilenm, cfg_instance); + + err = super_setup_bdi_name(sb, "%s", name); + if (err) + GOTO(out_free_cfg, err); + + /* disable kernel readahead */ + sb->s_bdi->ra_pages = 0; +#ifdef HAVE_BDI_IO_PAGES + sb->s_bdi->io_pages = 0; +#endif + + /* Call ll_debugfs_register_super() before lustre_process_log() + * so that "llite.*.*" params can be processed correctly. 
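+ * A registration failure is logged below but deliberately not treated as
+ * fatal: err is reset to 0 and the mount continues.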
+ */ + err = ll_debugfs_register_super(sb, name); + if (err < 0) { + CERROR("%s: could not register mountpoint in llite: rc = %d\n", + sbi->ll_fsname, err); + err = 0; + } + + /* The cfg_instance is a value unique to this super, in case some + * joker tries to mount the same fs at two mount points. + */ + cfg->cfg_instance = cfg_instance; + cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid; + cfg->cfg_callback = class_config_llog_handler; + cfg->cfg_sub_clds = CONFIG_SUB_CLIENT; + /* set up client obds */ + err = lustre_process_log(sb, profilenm, cfg); + if (err < 0) + GOTO(out_debugfs, err); + + /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */ + lprof = class_get_profile(profilenm); + if (lprof == NULL) { + LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be" + " read from the MGS. Does that filesystem " + "exist?\n", profilenm); + GOTO(out_debugfs, err = -EINVAL); + } + CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, + lprof->lp_md, lprof->lp_dt); + + dt_len = strlen(lprof->lp_dt) + instlen + 2; + OBD_ALLOC(dt, dt_len); + if (!dt) + GOTO(out_profile, err = -ENOMEM); + snprintf(dt, dt_len - 1, "%s-%016lx", lprof->lp_dt, cfg_instance); + + md_len = strlen(lprof->lp_md) + instlen + 2; + OBD_ALLOC(md, md_len); + if (!md) + GOTO(out_free_dt, err = -ENOMEM); + snprintf(md, md_len - 1, "%s-%016lx", lprof->lp_md, cfg_instance); + + /* connections, registrations, sb setup */ + err = client_common_fill_super(sb, md, dt); + if (err < 0) + GOTO(out_free_md, err); + + sbi->ll_client_common_fill_super_succeeded = 1; + +out_free_md: + if (md) + OBD_FREE(md, md_len); +out_free_dt: + if (dt) + OBD_FREE(dt, dt_len); +out_profile: + if (lprof) + class_put_profile(lprof); +out_debugfs: + if (err < 0) + ll_debugfs_unregister_super(sb); +out_free_cfg: + if (cfg) + OBD_FREE_PTR(cfg); + + if (err) + ll_put_super(sb); + else if (test_bit(LL_SBI_VERBOSE, sbi->ll_flags)) + LCONSOLE_WARN("Mounted %s\n", profilenm); + RETURN(err); +} /* ll_fill_super */ + +void ll_put_super(struct super_block *sb) +{ + struct config_llog_instance cfg, params_cfg; + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + unsigned long cfg_instance = ll_get_cfg_instance(sb); + long ccc_count; + int next, force = 1, rc = 0; + ENTRY; + + if (IS_ERR(sbi)) + GOTO(out_no_sbi, 0); + + /* Should replace instance_id with something better for ASLR */ + CDEBUG(D_VFSTRACE, "VFS Op: cfg_instance %s-%016lx (sb %p)\n", + profilenm, cfg_instance, sb); + + cfg.cfg_instance = cfg_instance; + lustre_end_log(sb, profilenm, &cfg); + + params_cfg.cfg_instance = cfg_instance; + lustre_end_log(sb, PARAMS_FILENAME, ¶ms_cfg); + + if (sbi->ll_md_exp) { + obd = class_exp2obd(sbi->ll_md_exp); + if (obd) + force = obd->obd_force; + } + + /* Wait for unstable pages to be committed to stable storage */ + if (force == 0) { + rc = l_wait_event_abortable( + sbi->ll_cache->ccc_unstable_waitq, + atomic_long_read(&sbi->ll_cache->ccc_unstable_nr) == 0); + } + + ccc_count = atomic_long_read(&sbi->ll_cache->ccc_unstable_nr); + if (force == 0 && rc != -ERESTARTSYS) + LASSERTF(ccc_count == 0, "count: %li\n", ccc_count); + + /* We need to set force before the lov_disconnect in + * lustre_common_put_super, since l_d cleans up osc's as well. 
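+ * The loop below propagates obd_force to every obd in this superblock's
+ * uuid group before the manual cleanup.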
+ */ + if (force) { + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, + &next)) != NULL) { + obd->obd_force = force; + } + } + + if (sbi->ll_client_common_fill_super_succeeded) { + /* Only if client_common_fill_super succeeded */ + client_common_put_super(sb); + } + + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next))) + class_manual_cleanup(obd); + + if (test_bit(LL_SBI_VERBOSE, sbi->ll_flags)) + LCONSOLE_WARN("Unmounted %s\n", profilenm ? profilenm : ""); + + if (profilenm) + class_del_profile(profilenm); + +#ifndef HAVE_SUPER_SETUP_BDI_NAME + if (lsi->lsi_flags & LSI_BDI_INITIALIZED) { + bdi_destroy(&lsi->lsi_bdi); + lsi->lsi_flags &= ~LSI_BDI_INITIALIZED; + } +#endif + + llcrypt_free_dummy_context(&lsi->lsi_dummy_enc_ctx); + ll_free_sbi(sb); + lsi->lsi_llsbi = NULL; +out_no_sbi: + lustre_common_put_super(sb); + + cl_env_cache_purge(~0); + + EXIT; +} /* client_put_super */ + +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock) +{ + struct inode *inode = NULL; + + /* NOTE: we depend on atomic igrab() -bzzz */ + lock_res_and_lock(lock); + if (lock->l_resource->lr_lvb_inode) { + struct ll_inode_info * lli; + lli = ll_i2info(lock->l_resource->lr_lvb_inode); + if (lli->lli_inode_magic == LLI_INODE_MAGIC) { + inode = igrab(lock->l_resource->lr_lvb_inode); + } else { + inode = lock->l_resource->lr_lvb_inode; + LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ? D_INFO : + D_WARNING, lock, "lr_lvb_inode %p is " + "bogus: magic %08x", + lock->l_resource->lr_lvb_inode, + lli->lli_inode_magic); + inode = NULL; + } + } + unlock_res_and_lock(lock); + return inode; +} + +void ll_dir_clear_lsm_md(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + LASSERT(S_ISDIR(inode->i_mode)); + + if (lli->lli_lsm_md) { + lmv_free_memmd(lli->lli_lsm_md); + lli->lli_lsm_md = NULL; + } + + if (lli->lli_default_lsm_md) { + lmv_free_memmd(lli->lli_default_lsm_md); + lli->lli_default_lsm_md = NULL; + } +} + +static struct inode *ll_iget_anon_dir(struct super_block *sb, + const struct lu_fid *fid, + struct lustre_md *md) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_inode_info *lli; + struct mdt_body *body = md->body; + struct inode *inode; + ino_t ino; + + ENTRY; + + LASSERT(md->lmv); + ino = cl_fid_build_ino(fid, test_bit(LL_SBI_32BIT_API, sbi->ll_flags)); + inode = iget_locked(sb, ino); + if (inode == NULL) { + CERROR("%s: failed get simple inode "DFID": rc = -ENOENT\n", + sbi->ll_fsname, PFID(fid)); + RETURN(ERR_PTR(-ENOENT)); + } + + lli = ll_i2info(inode); + if (inode->i_state & I_NEW) { + inode->i_mode = (inode->i_mode & ~S_IFMT) | + (body->mbo_mode & S_IFMT); + LASSERTF(S_ISDIR(inode->i_mode), "Not slave inode "DFID"\n", + PFID(fid)); + + inode->i_mtime.tv_sec = 0; + inode->i_atime.tv_sec = 0; + inode->i_ctime.tv_sec = 0; + inode->i_rdev = 0; + +#ifdef HAVE_BACKING_DEV_INFO + /* initializing backing dev info. */ + inode->i_mapping->backing_dev_info = + &s2lsi(inode->i_sb)->lsi_bdi; +#endif + inode->i_op = &ll_dir_inode_operations; + inode->i_fop = &ll_dir_operations; + lli->lli_fid = *fid; + ll_lli_init(lli); + + /* master object FID */ + lli->lli_pfid = body->mbo_fid1; + CDEBUG(D_INODE, "lli %p slave "DFID" master "DFID"\n", + lli, PFID(fid), PFID(&lli->lli_pfid)); + unlock_new_inode(inode); + } else { + /* in directory restripe/auto-split, a directory will be + * transformed to a stripe if it's plain, set its pfid here, + * otherwise ll_lock_cancel_bits() can't find the master inode. 
+ */ + lli->lli_pfid = body->mbo_fid1; + } + + RETURN(inode); +} + +static int ll_init_lsm_md(struct inode *inode, struct lustre_md *md) +{ + struct lu_fid *fid; + struct lmv_stripe_md *lsm = md->lmv; + struct ll_inode_info *lli = ll_i2info(inode); + int i; + + LASSERT(lsm != NULL); + + CDEBUG(D_INODE, "%s: "DFID" set dir layout:\n", + ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid)); + lsm_md_dump(D_INODE, lsm); + + if (!lmv_dir_striped(lsm)) + goto out; + + /* XXX sigh, this lsm_root initialization should be in + * LMV layer, but it needs ll_iget right now, so we + * put this here right now. */ + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + fid = &lsm->lsm_md_oinfo[i].lmo_fid; + LASSERT(lsm->lsm_md_oinfo[i].lmo_root == NULL); + + if (!fid_is_sane(fid)) + continue; + + /* Unfortunately ll_iget will call ll_update_inode, + * where the initialization of slave inode is slightly + * different, so it reset lsm_md to NULL to avoid + * initializing lsm for slave inode. */ + lsm->lsm_md_oinfo[i].lmo_root = + ll_iget_anon_dir(inode->i_sb, fid, md); + if (IS_ERR(lsm->lsm_md_oinfo[i].lmo_root)) { + int rc = PTR_ERR(lsm->lsm_md_oinfo[i].lmo_root); + + lsm->lsm_md_oinfo[i].lmo_root = NULL; + while (i-- > 0) { + iput(lsm->lsm_md_oinfo[i].lmo_root); + lsm->lsm_md_oinfo[i].lmo_root = NULL; + } + return rc; + } + } +out: + lli->lli_lsm_md = lsm; + + return 0; +} + +static void ll_update_default_lsm_md(struct inode *inode, struct lustre_md *md) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + ENTRY; + + if (!md->default_lmv) { + /* clear default lsm */ + if (lli->lli_default_lsm_md) { + down_write(&lli->lli_lsm_sem); + if (lli->lli_default_lsm_md) { + lmv_free_memmd(lli->lli_default_lsm_md); + lli->lli_default_lsm_md = NULL; + } + lli->lli_inherit_depth = 0; + up_write(&lli->lli_lsm_sem); + } + RETURN_EXIT; + } + + if (lli->lli_default_lsm_md) { + /* do nonthing if default lsm isn't changed */ + down_read(&lli->lli_lsm_sem); + if (lli->lli_default_lsm_md && + lsm_md_eq(lli->lli_default_lsm_md, md->default_lmv)) { + up_read(&lli->lli_lsm_sem); + RETURN_EXIT; + } + up_read(&lli->lli_lsm_sem); + } + + down_write(&lli->lli_lsm_sem); + if (lli->lli_default_lsm_md) + lmv_free_memmd(lli->lli_default_lsm_md); + lli->lli_default_lsm_md = md->default_lmv; + lsm_md_dump(D_INODE, md->default_lmv); + md->default_lmv = NULL; + up_write(&lli->lli_lsm_sem); + RETURN_EXIT; +} + +static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lmv_stripe_md *lsm = md->lmv; + struct cl_attr *attr; + int rc = 0; + + ENTRY; + + LASSERT(S_ISDIR(inode->i_mode)); + CDEBUG(D_INODE, "update lsm %p of "DFID"\n", lli->lli_lsm_md, + PFID(ll_inode2fid(inode))); + + /* update default LMV */ + if (md->default_lmv) + ll_update_default_lsm_md(inode, md); + + /* after dir migration/restripe, a stripe may be turned into a + * directory, in this case, zero out its lli_pfid. + */ + if (unlikely(fid_is_norm(&lli->lli_pfid))) + fid_zero(&lli->lli_pfid); + + /* + * no striped information from request, lustre_md from req does not + * include stripeEA, see ll_md_setattr() + */ + if (!lsm) + RETURN(0); + + /* + * normally dir layout doesn't change, only take read lock to check + * that to avoid blocking other MD operations. 
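+ * The read lock is dropped and re-taken as a write lock further down only
+ * when the striping actually has to be replaced.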
+ */ + down_read(&lli->lli_lsm_sem); + + /* some current lookup initialized lsm, and unchanged */ + if (lli->lli_lsm_md && lsm_md_eq(lli->lli_lsm_md, lsm)) + GOTO(unlock, rc = 0); + + /* if dir layout doesn't match, check whether version is increased, + * which means layout is changed, this happens in dir split/merge and + * lfsck. + * + * foreign LMV should not change. + */ + if (lli->lli_lsm_md && lmv_dir_striped(lli->lli_lsm_md) && + lsm->lsm_md_layout_version <= + lli->lli_lsm_md->lsm_md_layout_version) { + CERROR("%s: "DFID" dir layout mismatch:\n", + ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid)); + lsm_md_dump(D_ERROR, lli->lli_lsm_md); + lsm_md_dump(D_ERROR, lsm); + GOTO(unlock, rc = -EINVAL); + } + + up_read(&lli->lli_lsm_sem); + down_write(&lli->lli_lsm_sem); + /* clear existing lsm */ + if (lli->lli_lsm_md) { + lmv_free_memmd(lli->lli_lsm_md); + lli->lli_lsm_md = NULL; + } + + rc = ll_init_lsm_md(inode, md); + up_write(&lli->lli_lsm_sem); + + if (rc) + RETURN(rc); + + /* set md->lmv to NULL, so the following free lustre_md will not free + * this lsm. + */ + md->lmv = NULL; + + /* md_merge_attr() may take long, since lsm is already set, switch to + * read lock. + */ + down_read(&lli->lli_lsm_sem); + + if (!lmv_dir_striped(lli->lli_lsm_md)) + GOTO(unlock, rc = 0); + + OBD_ALLOC_PTR(attr); + if (!attr) + GOTO(unlock, rc = -ENOMEM); + + /* validate the lsm */ + rc = md_merge_attr(ll_i2mdexp(inode), lli->lli_lsm_md, attr, + ll_md_blocking_ast); + if (!rc) { + if (md->body->mbo_valid & OBD_MD_FLNLINK) + md->body->mbo_nlink = attr->cat_nlink; + if (md->body->mbo_valid & OBD_MD_FLSIZE) + md->body->mbo_size = attr->cat_size; + if (md->body->mbo_valid & OBD_MD_FLATIME) + md->body->mbo_atime = attr->cat_atime; + if (md->body->mbo_valid & OBD_MD_FLCTIME) + md->body->mbo_ctime = attr->cat_ctime; + if (md->body->mbo_valid & OBD_MD_FLMTIME) + md->body->mbo_mtime = attr->cat_mtime; + } + + OBD_FREE_PTR(attr); + GOTO(unlock, rc); +unlock: + up_read(&lli->lli_lsm_sem); + + return rc; +} + +void ll_clear_inode(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + if (S_ISDIR(inode->i_mode)) { + /* these should have been cleared in ll_file_release */ + LASSERT(lli->lli_opendir_key == NULL); + LASSERT(lli->lli_sai == NULL); + LASSERT(lli->lli_opendir_pid == 0); + } else { + pcc_inode_free(inode); + } + + md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode)); + + LASSERT(!lli->lli_open_fd_write_count); + LASSERT(!lli->lli_open_fd_read_count); + LASSERT(!lli->lli_open_fd_exec_count); + + if (lli->lli_mds_write_och) + ll_md_real_close(inode, FMODE_WRITE); + if (lli->lli_mds_exec_och) + ll_md_real_close(inode, FMODE_EXEC); + if (lli->lli_mds_read_och) + ll_md_real_close(inode, FMODE_READ); + + if (S_ISLNK(inode->i_mode) && lli->lli_symlink_name) { + OBD_FREE(lli->lli_symlink_name, + strlen(lli->lli_symlink_name) + 1); + lli->lli_symlink_name = NULL; + } + + ll_xattr_cache_destroy(inode); + + forget_all_cached_acls(inode); + lli_clear_acl(lli); + lli->lli_inode_magic = LLI_INODE_DEAD; + + if (S_ISDIR(inode->i_mode)) + ll_dir_clear_lsm_md(inode); + else if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) + LASSERT(list_empty(&lli->lli_agl_list)); + + /* + * XXX This has to be done before lsm is freed below, because + * cl_object still uses inode lsm. 
+ */ + cl_inode_fini(inode); + + llcrypt_put_encryption_info(inode); + + EXIT; +} + +static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data) +{ + struct lustre_md md; + struct inode *inode = dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *request = NULL; + int rc, ia_valid; + + ENTRY; + + op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + /* If this is a chgrp of a regular file, we want to reserve enough + * quota to cover the entire file size. + */ + if (S_ISREG(inode->i_mode) && op_data->op_attr.ia_valid & ATTR_GID && + from_kgid(&init_user_ns, op_data->op_attr.ia_gid) != + from_kgid(&init_user_ns, inode->i_gid)) { + op_data->op_xvalid |= OP_XVALID_BLOCKS; + op_data->op_attr_blocks = inode->i_blocks; + } + + + rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &request); + if (rc) { + ptlrpc_req_finished(request); + if (rc == -ENOENT) { + clear_nlink(inode); + /* Unlinked special device node? Or just a race? + * Pretend we done everything. */ + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) { + ia_valid = op_data->op_attr.ia_valid; + op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS; + rc = simple_setattr(&init_user_ns, dentry, + &op_data->op_attr); + op_data->op_attr.ia_valid = ia_valid; + } + } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) { + CERROR("md_setattr fails: rc = %d\n", rc); + } + RETURN(rc); + } + + rc = md_get_lustre_md(sbi->ll_md_exp, &request->rq_pill, sbi->ll_dt_exp, + sbi->ll_md_exp, &md); + if (rc) { + ptlrpc_req_finished(request); + RETURN(rc); + } + + ia_valid = op_data->op_attr.ia_valid; + /* inode size will be in ll_setattr_ost, can't do it now since dirty + * cache is not cleared yet. */ + op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE); + if (S_ISREG(inode->i_mode)) + inode_lock(inode); + rc = simple_setattr(&init_user_ns, dentry, &op_data->op_attr); + if (S_ISREG(inode->i_mode)) + inode_unlock(inode); + op_data->op_attr.ia_valid = ia_valid; + + rc = ll_update_inode(inode, &md); + ptlrpc_req_finished(request); + + RETURN(rc); +} + +/** + * Zero portion of page that is part of @inode. 
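+ * (Used by ll_setattr_raw() below when an encrypted file is truncated to a
+ * size that is not PAGE_SIZE aligned.)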
+ * This implies, if necessary: + * - taking cl_lock on range corresponding to concerned page + * - grabbing vm page + * - associating cl_page + * - proceeding to clio read + * - zeroing range in page + * - proceeding to cl_page flush + * - releasing cl_lock + * + * \param[in] inode inode + * \param[in] index page index + * \param[in] offset offset in page to start zero from + * \param[in] len len to zero + * + * \retval 0 on success + * \retval negative errno on failure + */ +int ll_io_zero_page(struct inode *inode, pgoff_t index, pgoff_t offset, + unsigned len) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *clob = lli->lli_clob; + __u16 refcheck; + struct lu_env *env = NULL; + struct cl_io *io = NULL; + struct cl_page *clpage = NULL; + struct page *vmpage = NULL; + unsigned from = index << PAGE_SHIFT; + struct cl_lock *lock = NULL; + struct cl_lock_descr *descr = NULL; + struct cl_2queue *queue = NULL; + struct cl_sync_io *anchor = NULL; + bool holdinglock = false; + int rc; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = clob; + rc = cl_io_rw_init(env, io, CIT_WRITE, from, PAGE_SIZE); + if (rc) + GOTO(putenv, rc); + + lock = vvp_env_lock(env); + descr = &lock->cll_descr; + descr->cld_obj = io->ci_obj; + descr->cld_start = cl_index(io->ci_obj, from); + descr->cld_end = cl_index(io->ci_obj, from + PAGE_SIZE - 1); + descr->cld_mode = CLM_WRITE; + descr->cld_enq_flags = CEF_MUST | CEF_NONBLOCK; + + /* request lock for page */ + rc = cl_lock_request(env, io, lock); + /* -ECANCELED indicates a matching lock with a different extent + * was already present, and -EEXIST indicates a matching lock + * on exactly the same extent was already present. + * In both cases it means we are covered. + */ + if (rc == -ECANCELED || rc == -EEXIST) + rc = 0; + else if (rc < 0) + GOTO(iofini, rc); + else + holdinglock = true; + + /* grab page */ + vmpage = grab_cache_page_nowait(inode->i_mapping, index); + if (vmpage == NULL) + GOTO(rellock, rc = -EOPNOTSUPP); + + if (!PageDirty(vmpage)) { + /* associate cl_page */ + clpage = cl_page_find(env, clob, vmpage->index, + vmpage, CPT_CACHEABLE); + if (IS_ERR(clpage)) + GOTO(pagefini, rc = PTR_ERR(clpage)); + + cl_page_assume(env, io, clpage); + } + + if (!PageUptodate(vmpage) && !PageDirty(vmpage) && + !PageWriteback(vmpage)) { + /* read page */ + /* Set PagePrivate2 to detect special case of empty page + * in osc_brw_fini_request(). + * It is also used to tell ll_io_read_page() that we do not + * want the vmpage to be unlocked. + */ + SetPagePrivate2(vmpage); + rc = ll_io_read_page(env, io, clpage, NULL); + if (!PagePrivate2(vmpage)) { + /* PagePrivate2 was cleared in osc_brw_fini_request() + * meaning we read an empty page. In this case, in order + * to avoid allocating unnecessary block in truncated + * file, we must not zero and write as below. Subsequent + * server-side truncate will handle things correctly. + */ + cl_page_unassume(env, io, clpage); + GOTO(clpfini, rc = 0); + } + ClearPagePrivate2(vmpage); + if (rc) + GOTO(clpfini, rc); + } + + /* Thanks to PagePrivate2 flag, ll_io_read_page() did not unlock + * the vmpage, so we are good to proceed and zero range in page. 
+ */ + zero_user(vmpage, offset, len); + + if (holdinglock && clpage) { + /* explicitly write newly modified page */ + queue = &io->ci_queue; + cl_2queue_init(queue); + anchor = &vvp_env_info(env)->vti_anchor; + cl_sync_io_init(anchor, 1); + clpage->cp_sync_io = anchor; + cl_2queue_add(queue, clpage, true); + rc = cl_io_submit_rw(env, io, CRT_WRITE, queue); + if (rc) + GOTO(queuefini1, rc); + rc = cl_sync_io_wait(env, anchor, 0); + if (rc) + GOTO(queuefini2, rc); + cl_page_assume(env, io, clpage); + +queuefini2: + cl_2queue_discard(env, io, queue); +queuefini1: + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + } + +clpfini: + if (clpage) + cl_page_put(env, clpage); +pagefini: + unlock_page(vmpage); + put_page(vmpage); +rellock: + if (holdinglock) + cl_lock_release(env, lock); +iofini: + cl_io_fini(env, io); +putenv: + if (env) + cl_env_put(env, &refcheck); + + RETURN(rc); +} + +/** + * Get reference file from volatile file name. + * Volatile file name may look like: + * /LUSTRE_VOLATILE_HDR:::fd= + * where fd is opened descriptor of reference file. + * + * \param[in] volatile_name volatile file name + * \param[in] volatile_len volatile file name length + * \param[out] ref_file pointer to struct file of reference file + * + * \retval 0 on success + * \retval negative errno on failure + */ +int volatile_ref_file(const char *volatile_name, int volatile_len, + struct file **ref_file) +{ + char *p, *q, *fd_str; + int fd, rc; + + p = strnstr(volatile_name, ":fd=", volatile_len); + if (!p || strlen(p + 4) == 0) + return -EINVAL; + + q = strchrnul(p + 4, ':'); + fd_str = kstrndup(p + 4, q - p - 4, GFP_NOFS); + if (!fd_str) + return -ENOMEM; + rc = kstrtouint(fd_str, 10, &fd); + kfree(fd_str); + if (rc) + return -EINVAL; + + *ref_file = fget(fd); + if (!(*ref_file)) + return -EINVAL; + return 0; +} + +/* If this inode has objects allocated to it (lsm != NULL), then the OST + * object(s) determine the file size and mtime. Otherwise, the MDS will + * keep these values until such a time that objects are allocated for it. + * We do the MDS operations first, as it is checking permissions for us. + * We don't to the MDS RPC if there is nothing that we want to store there, + * otherwise there is no harm in updating mtime/atime on the MDS if we are + * going to do an RPC anyways. + * + * If we are doing a truncate, we will send the mtime and ctime updates + * to the OST with the punch RPC, otherwise we do an explicit setattr RPC. + * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE + * at the same time. + * + * In case of HSMimport, we only set attr on MDS. + */ +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, + enum op_xvalid xvalid, bool hsm_import) +{ + struct inode *inode = dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + struct md_op_data *op_data = NULL; + ktime_t kstart = ktime_get(); + int rc = 0; + + ENTRY; + + CDEBUG(D_VFSTRACE, "%s: setattr inode "DFID"(%p) from %llu to %llu, " + "valid %x, hsm_import %d\n", + ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid), + inode, i_size_read(inode), attr->ia_size, attr->ia_valid, + hsm_import); + + if (attr->ia_valid & ATTR_SIZE) { + /* Check new size against VFS/VM file size limit and rlimit */ + rc = inode_newsize_ok(inode, attr->ia_size); + if (rc) + RETURN(rc); + + /* The maximum Lustre file size is variable, based on the + * OST maximum object size and number of stripes. This + * needs another check in addition to the VFS check above. 
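+ * (ll_file_maxbytes() below reflects that per-file limit; exceeding it
+ * fails with -EFBIG.)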
*/ + if (attr->ia_size > ll_file_maxbytes(inode)) { + CDEBUG(D_INODE,"file "DFID" too large %llu > %llu\n", + PFID(&lli->lli_fid), attr->ia_size, + ll_file_maxbytes(inode)); + RETURN(-EFBIG); + } + + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + + /* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */ + if (attr->ia_valid & TIMES_SET_FLAGS) { + if ((!uid_eq(current_fsuid(), inode->i_uid)) && + !capable(CAP_FOWNER)) + RETURN(-EPERM); + } + + /* We mark all of the fields "set" so MDS/OST does not re-set them */ + if (!(xvalid & OP_XVALID_CTIME_SET) && + (attr->ia_valid & ATTR_CTIME)) { + attr->ia_ctime = current_time(inode); + xvalid |= OP_XVALID_CTIME_SET; + } + if (!(attr->ia_valid & ATTR_ATIME_SET) && + (attr->ia_valid & ATTR_ATIME)) { + attr->ia_atime = current_time(inode); + attr->ia_valid |= ATTR_ATIME_SET; + } + if (!(attr->ia_valid & ATTR_MTIME_SET) && + (attr->ia_valid & ATTR_MTIME)) { + attr->ia_mtime = current_time(inode); + attr->ia_valid |= ATTR_MTIME_SET; + } + + if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime %lld, ctime %lld, now = %lld\n", + (s64)attr->ia_mtime.tv_sec, (s64)attr->ia_ctime.tv_sec, + ktime_get_real_seconds()); + + if (S_ISREG(inode->i_mode)) + inode_unlock(inode); + + /* We always do an MDS RPC, even if we're only changing the size; + * only the MDS knows whether truncate() should fail with -ETXTBUSY */ + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out, rc = -ENOMEM); + + if (!hsm_import && attr->ia_valid & ATTR_SIZE) { + /* If we are changing file size, file content is + * modified, flag it. + */ + xvalid |= OP_XVALID_OWNEROVERRIDE; + op_data->op_bias |= MDS_DATA_MODIFIED; + clear_bit(LLIF_DATA_MODIFIED, &lli->lli_flags); + } + + if (attr->ia_valid & ATTR_FILE) { + struct ll_file_data *fd = attr->ia_file->private_data; + + if (fd->fd_lease_och) + op_data->op_bias |= MDS_TRUNC_KEEP_LEASE; + } + + op_data->op_attr = *attr; + op_data->op_xvalid = xvalid; + + rc = ll_md_setattr(dentry, op_data); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(inode->i_mode) || hsm_import) + GOTO(out, rc = 0); + + if (attr->ia_valid & (ATTR_SIZE | ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME) || + xvalid & OP_XVALID_CTIME_SET) { + bool cached = false; + + rc = pcc_inode_setattr(inode, attr, &cached); + if (cached) { + if (rc) { + CERROR("%s: PCC inode "DFID" setattr failed: " + "rc = %d\n", + ll_i2sbi(inode)->ll_fsname, + PFID(&lli->lli_fid), rc); + GOTO(out, rc); + } + } else { + unsigned int flags = 0; + + /* For truncate and utimes sending attributes to OSTs, + * setting mtime/atime to the past will be performed + * under PW [0:EOF] extent lock (new_size:EOF for + * truncate). It may seem excessive to send mtime/atime + * updates to OSTs when not setting times to past, but + * it is necessary due to possible time + * de-synchronization between MDT inode and OST objects + */ + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) { + xvalid |= OP_XVALID_FLAGS; + flags = LUSTRE_ENCRYPT_FL; + /* Call to ll_io_zero_page is not necessary if + * truncating on PAGE_SIZE boundary, because + * whole pages will be wiped. + * In case of Direct IO, all we need is to set + * new size. 
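+ * The condition below therefore skips ll_io_zero_page() for page-aligned
+ * sizes and for O_DIRECT opens.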
+ */ + if (attr->ia_valid & ATTR_SIZE && + attr->ia_size & ~PAGE_MASK && + !(attr->ia_valid & ATTR_FILE && + attr->ia_file->f_flags & O_DIRECT)) { + pgoff_t offset = + attr->ia_size & (PAGE_SIZE - 1); + + rc = ll_io_zero_page(inode, + attr->ia_size >> PAGE_SHIFT, + offset, PAGE_SIZE - offset); + if (rc) + GOTO(out, rc); + } + /* If encrypted volatile file without the key, + * we need to fetch size from reference file, + * and set it on OST objects. This happens when + * migrating or extending an encrypted file + * without the key. + */ + if (filename_is_volatile(dentry->d_name.name, + dentry->d_name.len, + NULL) && + llcrypt_require_key(inode) == -ENOKEY) { + struct file *ref_file; + struct inode *ref_inode; + struct ll_inode_info *ref_lli; + struct cl_object *ref_obj; + struct cl_attr ref_attr = { 0 }; + struct lu_env *env; + __u16 refcheck; + + rc = volatile_ref_file( + dentry->d_name.name, + dentry->d_name.len, + &ref_file); + if (rc) + GOTO(out, rc); + + ref_inode = file_inode(ref_file); + if (!ref_inode) { + fput(ref_file); + GOTO(out, rc = -EINVAL); + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, rc = PTR_ERR(env)); + + ref_lli = ll_i2info(ref_inode); + ref_obj = ref_lli->lli_clob; + cl_object_attr_lock(ref_obj); + rc = cl_object_attr_get(env, ref_obj, + &ref_attr); + cl_object_attr_unlock(ref_obj); + cl_env_put(env, &refcheck); + fput(ref_file); + if (rc) + GOTO(out, rc); + + attr->ia_valid |= ATTR_SIZE; + attr->ia_size = ref_attr.cat_size; + } + } + rc = cl_setattr_ost(lli->lli_clob, attr, xvalid, flags); + } + } + + /* If the file was restored, it needs to set dirty flag. + * + * We've already sent MDS_DATA_MODIFIED flag in + * ll_md_setattr() for truncate. However, the MDT refuses to + * set the HS_DIRTY flag on released files, so we have to set + * it again if the file has been restored. Please check how + * LLIF_DATA_MODIFIED is set in vvp_io_setattr_fini(). + * + * Please notice that if the file is not released, the previous + * MDS_DATA_MODIFIED has taken effect and usually + * LLIF_DATA_MODIFIED is not set(see vvp_io_setattr_fini()). + * This way we can save an RPC for common open + trunc + * operation. */ + if (test_and_clear_bit(LLIF_DATA_MODIFIED, &lli->lli_flags)) { + struct hsm_state_set hss = { + .hss_valid = HSS_SETMASK, + .hss_setmask = HS_DIRTY, + }; + int rc2; + + rc2 = ll_hsm_state_set(inode, &hss); + /* truncate and write can happen at the same time, so that + * the file can be set modified even though the file is not + * restored from released state, and ll_hsm_state_set() is + * not applicable for the file, and rc2 < 0 is normal in this + * case. */ + if (rc2 < 0) + CDEBUG(D_INFO, DFID "HSM set dirty failed: rc2 = %d\n", + PFID(ll_inode2fid(inode)), rc2); + } + + EXIT; +out: + if (op_data != NULL) + ll_finish_md_op_data(op_data); + + if (S_ISREG(inode->i_mode)) { + inode_lock(inode); + if ((attr->ia_valid & ATTR_SIZE) && !hsm_import) + inode_dio_wait(inode); + /* Once we've got the i_mutex, it's safe to set the S_NOSEC + * flag. ll_update_inode (called from ll_md_setattr), clears + * inode flags, so there is a gap where S_NOSEC is not set. + * This can cause a writer to take the i_mutex unnecessarily, + * but this is safe to do and should be rare. */ + inode_has_no_xattr(inode); + } + + if (!rc) + ll_stats_ops_tally(ll_i2sbi(inode), attr->ia_valid & ATTR_SIZE ? 
+ LPROC_LL_TRUNC : LPROC_LL_SETATTR, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(rc); +} + +int ll_setattr(struct user_namespace *mnt_userns, struct dentry *de, + struct iattr *attr) +{ + int mode = de->d_inode->i_mode; + enum op_xvalid xvalid = 0; + int rc; + + rc = llcrypt_prepare_setattr(de, attr); + if (rc) + return rc; + + if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) == + (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) + xvalid |= OP_XVALID_OWNEROVERRIDE; + + if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) == + (ATTR_SIZE|ATTR_MODE)) && + (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) || + (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + !(attr->ia_mode & S_ISGID)))) + attr->ia_valid |= ATTR_FORCE; + + if ((attr->ia_valid & ATTR_MODE) && + (mode & S_ISUID) && + !(attr->ia_mode & S_ISUID) && + !(attr->ia_valid & ATTR_KILL_SUID)) + attr->ia_valid |= ATTR_KILL_SUID; + + if ((attr->ia_valid & ATTR_MODE) && + ((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + !(attr->ia_mode & S_ISGID) && + !(attr->ia_valid & ATTR_KILL_SGID)) + attr->ia_valid |= ATTR_KILL_SGID; + + return ll_setattr_raw(de, attr, xvalid, false); +} + +int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs, + u32 flags) +{ + struct obd_statfs obd_osfs = { 0 }; + time64_t max_age; + int rc; + + ENTRY; + max_age = ktime_get_seconds() - sbi->ll_statfs_max_age; + + if (test_bit(LL_SBI_LAZYSTATFS, sbi->ll_flags)) + flags |= OBD_STATFS_NODELAY; + + rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); + if (rc) + RETURN(rc); + + osfs->os_type = LL_SUPER_MAGIC; + + CDEBUG(D_SUPER, "MDC blocks %llu/%llu objects %llu/%llu\n", + osfs->os_bavail, osfs->os_blocks, osfs->os_ffree, osfs->os_files); + + if (osfs->os_state & OS_STATFS_SUM) + GOTO(out, rc); + + rc = obd_statfs(NULL, sbi->ll_dt_exp, &obd_osfs, max_age, flags); + if (rc) /* Possibly a filesystem with no OSTs. Report MDT totals. */ + GOTO(out, rc = 0); + + CDEBUG(D_SUPER, "OSC blocks %llu/%llu objects %llu/%llu\n", + obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, + obd_osfs.os_files); + + osfs->os_bsize = obd_osfs.os_bsize; + osfs->os_blocks = obd_osfs.os_blocks; + osfs->os_bfree = obd_osfs.os_bfree; + osfs->os_bavail = obd_osfs.os_bavail; + + /* If we have _some_ OSTs, but don't have as many free objects on the + * OSTs as inodes on the MDTs, reduce the reported number of inodes + * to compensate, so that the "inodes in use" number is correct. + * This should be kept in sync with lod_statfs() behaviour. + */ + if (obd_osfs.os_files && obd_osfs.os_ffree < osfs->os_ffree) { + osfs->os_files = (osfs->os_files - osfs->os_ffree) + + obd_osfs.os_ffree; + osfs->os_ffree = obd_osfs.os_ffree; + } + +out: + RETURN(rc); +} + +static int ll_statfs_project(struct inode *inode, struct kstatfs *sfs) +{ + struct if_quotactl qctl = { + .qc_cmd = LUSTRE_Q_GETQUOTA, + .qc_type = PRJQUOTA, + .qc_valid = QC_GENERAL, + }; + u64 limit, curblock; + int ret; + + qctl.qc_id = ll_i2info(inode)->lli_projid; + ret = quotactl_ioctl(inode->i_sb, &qctl); + if (ret) { + /* ignore errors if project ID does not have + * a quota limit or feature unsupported. + */ + if (ret == -ESRCH || ret == -EOPNOTSUPP) + ret = 0; + return ret; + } + + limit = ((qctl.qc_dqblk.dqb_bsoftlimit ? 
+ qctl.qc_dqblk.dqb_bsoftlimit : + qctl.qc_dqblk.dqb_bhardlimit) * 1024) / sfs->f_bsize; + if (limit && sfs->f_blocks > limit) { + curblock = (qctl.qc_dqblk.dqb_curspace + + sfs->f_bsize - 1) / sfs->f_bsize; + sfs->f_blocks = limit; + sfs->f_bfree = sfs->f_bavail = + (sfs->f_blocks > curblock) ? + (sfs->f_blocks - curblock) : 0; + } + + limit = qctl.qc_dqblk.dqb_isoftlimit ? + qctl.qc_dqblk.dqb_isoftlimit : + qctl.qc_dqblk.dqb_ihardlimit; + if (limit && sfs->f_files > limit) { + sfs->f_files = limit; + sfs->f_ffree = (sfs->f_files > + qctl.qc_dqblk.dqb_curinodes) ? + (sfs->f_files - qctl.qc_dqblk.dqb_curinodes) : 0; + } + + return 0; +} + +int ll_statfs(struct dentry *de, struct kstatfs *sfs) +{ + struct super_block *sb = de->d_sb; + struct obd_statfs osfs; + __u64 fsid = huge_encode_dev(sb->s_dev); + ktime_t kstart = ktime_get(); + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:sb=%s (%p)\n", sb->s_id, sb); + + /* Some amount of caching on the client is allowed */ + rc = ll_statfs_internal(ll_s2sbi(sb), &osfs, OBD_STATFS_SUM); + if (rc) + return rc; + + statfs_unpack(sfs, &osfs); + + /* We need to downshift for all 32-bit kernels, because we can't + * tell if the kernel is being called via sys_statfs64() or not. + * Stop before overflowing f_bsize - in which case it is better + * to just risk EOVERFLOW if caller is using old sys_statfs(). */ + if (sizeof(long) < 8) { + while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) { + sfs->f_bsize <<= 1; + + osfs.os_blocks >>= 1; + osfs.os_bfree >>= 1; + osfs.os_bavail >>= 1; + } + } + + sfs->f_blocks = osfs.os_blocks; + sfs->f_bfree = osfs.os_bfree; + sfs->f_bavail = osfs.os_bavail; + sfs->f_fsid.val[0] = (__u32)fsid; + sfs->f_fsid.val[1] = (__u32)(fsid >> 32); + if (ll_i2info(de->d_inode)->lli_projid) + return ll_statfs_project(de->d_inode, sfs); + + ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STATFS, + ktime_us_delta(ktime_get(), kstart)); + + return 0; +} + +void ll_inode_size_lock(struct inode *inode) +{ + struct ll_inode_info *lli; + + LASSERT(!S_ISDIR(inode->i_mode)); + + lli = ll_i2info(inode); + mutex_lock(&lli->lli_size_mutex); +} + +void ll_inode_size_unlock(struct inode *inode) +{ + struct ll_inode_info *lli; + + lli = ll_i2info(inode); + mutex_unlock(&lli->lli_size_mutex); +} + +void ll_update_inode_flags(struct inode *inode, unsigned int ext_flags) +{ + /* do not clear encryption flag */ + ext_flags |= ll_inode_to_ext_flags(inode->i_flags) & LUSTRE_ENCRYPT_FL; + inode->i_flags = ll_ext_to_inode_flags(ext_flags); + if (ext_flags & LUSTRE_PROJINHERIT_FL) + set_bit(LLIF_PROJECT_INHERIT, &ll_i2info(inode)->lli_flags); + else + clear_bit(LLIF_PROJECT_INHERIT, &ll_i2info(inode)->lli_flags); +} + +int ll_update_inode(struct inode *inode, struct lustre_md *md) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body = md->body; + struct ll_sb_info *sbi = ll_i2sbi(inode); + bool api32; + int rc = 0; + + if (body->mbo_valid & OBD_MD_FLEASIZE) { + rc = cl_file_inode_init(inode, md); + if (rc) + return rc; + } + + if (S_ISDIR(inode->i_mode)) { + rc = ll_update_lsm_md(inode, md); + if (rc != 0) + return rc; + } + + if (body->mbo_valid & OBD_MD_FLACL) + lli_replace_acl(lli, md); + + api32 = test_bit(LL_SBI_32BIT_API, sbi->ll_flags); + inode->i_ino = cl_fid_build_ino(&body->mbo_fid1, api32); + inode->i_generation = cl_fid_build_gen(&body->mbo_fid1); + + if (body->mbo_valid & OBD_MD_FLATIME) { + if (body->mbo_atime > inode->i_atime.tv_sec) + inode->i_atime.tv_sec = body->mbo_atime; + lli->lli_atime = body->mbo_atime; + } + + if 
(body->mbo_valid & OBD_MD_FLMTIME) { + if (body->mbo_mtime > inode->i_mtime.tv_sec) { + CDEBUG(D_INODE, + "setting ino %lu mtime from %lld to %llu\n", + inode->i_ino, (s64)inode->i_mtime.tv_sec, + body->mbo_mtime); + inode->i_mtime.tv_sec = body->mbo_mtime; + } + lli->lli_mtime = body->mbo_mtime; + } + + if (body->mbo_valid & OBD_MD_FLCTIME) { + if (body->mbo_ctime > inode->i_ctime.tv_sec) + inode->i_ctime.tv_sec = body->mbo_ctime; + lli->lli_ctime = body->mbo_ctime; + } + + if (body->mbo_valid & OBD_MD_FLBTIME) + lli->lli_btime = body->mbo_btime; + + /* Clear i_flags to remove S_NOSEC before permissions are updated */ + if (body->mbo_valid & OBD_MD_FLFLAGS) + ll_update_inode_flags(inode, body->mbo_flags); + if (body->mbo_valid & OBD_MD_FLMODE) + inode->i_mode = (inode->i_mode & S_IFMT) | + (body->mbo_mode & ~S_IFMT); + + if (body->mbo_valid & OBD_MD_FLTYPE) + inode->i_mode = (inode->i_mode & ~S_IFMT) | + (body->mbo_mode & S_IFMT); + + LASSERT(inode->i_mode != 0); + if (body->mbo_valid & OBD_MD_FLUID) + inode->i_uid = make_kuid(&init_user_ns, body->mbo_uid); + if (body->mbo_valid & OBD_MD_FLGID) + inode->i_gid = make_kgid(&init_user_ns, body->mbo_gid); + if (body->mbo_valid & OBD_MD_FLPROJID) + lli->lli_projid = body->mbo_projid; + if (body->mbo_valid & OBD_MD_FLNLINK) { + spin_lock(&inode->i_lock); + set_nlink(inode, body->mbo_nlink); + spin_unlock(&inode->i_lock); + } + if (body->mbo_valid & OBD_MD_FLRDEV) + inode->i_rdev = old_decode_dev(body->mbo_rdev); + + if (body->mbo_valid & OBD_MD_FLID) { + /* FID shouldn't be changed! */ + if (fid_is_sane(&lli->lli_fid)) { + LASSERTF(lu_fid_eq(&lli->lli_fid, &body->mbo_fid1), + "Trying to change FID "DFID + " to the "DFID", inode "DFID"(%p)\n", + PFID(&lli->lli_fid), PFID(&body->mbo_fid1), + PFID(ll_inode2fid(inode)), inode); + } else { + lli->lli_fid = body->mbo_fid1; + } + } + + LASSERT(fid_seq(&lli->lli_fid) != 0); + + /* In case of encrypted file without the key, please do not lose + * clear text size stored into lli_lazysize in ll_merge_attr(), + * we will need it in ll_prepare_close(). 
+ */ + if (lli->lli_attr_valid & OBD_MD_FLLAZYSIZE && lli->lli_lazysize && + llcrypt_require_key(inode) == -ENOKEY) + lli->lli_attr_valid = body->mbo_valid | OBD_MD_FLLAZYSIZE; + else + lli->lli_attr_valid = body->mbo_valid; + if (body->mbo_valid & OBD_MD_FLSIZE) { + i_size_write(inode, body->mbo_size); + + CDEBUG(D_VFSTRACE, "inode="DFID", updating i_size %llu\n", + PFID(ll_inode2fid(inode)), + (unsigned long long)body->mbo_size); + + if (body->mbo_valid & OBD_MD_FLBLOCKS) + inode->i_blocks = body->mbo_blocks; + } else { + if (body->mbo_valid & OBD_MD_FLLAZYSIZE) + lli->lli_lazysize = body->mbo_size; + if (body->mbo_valid & OBD_MD_FLLAZYBLOCKS) + lli->lli_lazyblocks = body->mbo_blocks; + } + + if (body->mbo_valid & OBD_MD_TSTATE) { + /* Set LLIF_FILE_RESTORING if restore ongoing and + * clear it when done to ensure to start again + * glimpsing updated attrs + */ + if (body->mbo_t_state & MS_RESTORE) + set_bit(LLIF_FILE_RESTORING, &lli->lli_flags); + else + clear_bit(LLIF_FILE_RESTORING, &lli->lli_flags); + } + + return 0; +} + +/* child default LMV is inherited from parent */ +static inline bool ll_default_lmv_inherited(struct lmv_stripe_md *pdmv, + struct lmv_stripe_md *cdmv) +{ + if (!pdmv || !cdmv) + return false; + + if (pdmv->lsm_md_magic != cdmv->lsm_md_magic || + pdmv->lsm_md_stripe_count != cdmv->lsm_md_stripe_count || + pdmv->lsm_md_master_mdt_index != cdmv->lsm_md_master_mdt_index || + pdmv->lsm_md_hash_type != cdmv->lsm_md_hash_type) + return false; + + if (cdmv->lsm_md_max_inherit != + lmv_inherit_next(pdmv->lsm_md_max_inherit)) + return false; + + if (cdmv->lsm_md_max_inherit_rr != + lmv_inherit_rr_next(pdmv->lsm_md_max_inherit_rr)) + return false; + + return true; +} + +/* update directory depth to ROOT, called after LOOKUP lock is fetched. */ +void ll_update_dir_depth(struct inode *dir, struct inode *inode) +{ + struct ll_inode_info *plli; + struct ll_inode_info *lli; + + if (!S_ISDIR(inode->i_mode)) + return; + + if (inode == dir) + return; + + plli = ll_i2info(dir); + lli = ll_i2info(inode); + lli->lli_dir_depth = plli->lli_dir_depth + 1; + if (plli->lli_default_lsm_md && lli->lli_default_lsm_md) { + down_read(&plli->lli_lsm_sem); + down_read(&lli->lli_lsm_sem); + if (ll_default_lmv_inherited(plli->lli_default_lsm_md, + lli->lli_default_lsm_md)) + lli->lli_inherit_depth = + plli->lli_inherit_depth + 1; + else + lli->lli_inherit_depth = 0; + up_read(&lli->lli_lsm_sem); + up_read(&plli->lli_lsm_sem); + } else { + lli->lli_inherit_depth = 0; + } + + CDEBUG(D_INODE, DFID" depth %hu default LMV depth %hu\n", + PFID(&lli->lli_fid), lli->lli_dir_depth, lli->lli_inherit_depth); +} + +void ll_truncate_inode_pages_final(struct inode *inode) +{ + struct address_space *mapping = &inode->i_data; + unsigned long nrpages; + unsigned long flags; + + truncate_inode_pages_final(mapping); + + /* Workaround for LU-118: Note nrpages may not be totally updated when + * truncate_inode_pages() returns, as there can be a page in the process + * of deletion (inside __delete_from_page_cache()) in the specified + * range. Thus mapping->nrpages can be non-zero when this function + * returns even after truncation of the whole mapping. Only do this if + * npages isn't already zero. 
+ */ + nrpages = mapping->nrpages; + if (nrpages) { + ll_xa_lock_irqsave(&mapping->i_pages, flags); + nrpages = mapping->nrpages; + ll_xa_unlock_irqrestore(&mapping->i_pages, flags); + } /* Workaround end */ + + LASSERTF(nrpages == 0, "%s: inode="DFID"(%p) nrpages=%lu, " + "see https://jira.whamcloud.com/browse/LU-118\n", + ll_i2sbi(inode)->ll_fsname, + PFID(ll_inode2fid(inode)), inode, nrpages); +} + +int ll_read_inode2(struct inode *inode, void *opaque) +{ + struct lustre_md *md = opaque; + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(&lli->lli_fid), inode); + + /* Core attributes from the MDS first. This is a new inode, and + * the VFS doesn't zero times in the core inode so we have to do + * it ourselves. They will be overwritten by either MDS or OST + * attributes - we just need to make sure they aren't newer. + */ + inode->i_mtime.tv_sec = 0; + inode->i_atime.tv_sec = 0; + inode->i_ctime.tv_sec = 0; + inode->i_rdev = 0; + rc = ll_update_inode(inode, md); + if (rc != 0) + RETURN(rc); + + /* OIDEBUG(inode); */ + +#ifdef HAVE_BACKING_DEV_INFO + /* initializing backing dev info. */ + inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi; +#endif + if (S_ISREG(inode->i_mode)) { + struct ll_sb_info *sbi = ll_i2sbi(inode); + inode->i_op = &ll_file_inode_operations; + inode->i_fop = sbi->ll_fop; + inode->i_mapping->a_ops = (struct address_space_operations *)&ll_aops; + EXIT; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ll_dir_inode_operations; + inode->i_fop = &ll_dir_operations; + EXIT; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &ll_fast_symlink_inode_operations; + EXIT; + } else { + inode->i_op = &ll_special_inode_operations; + + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + + EXIT; + } + + return 0; +} + +void ll_delete_inode(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + ENTRY; + + if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) { + /* It is last chance to write out dirty pages, + * otherwise we may lose data while umount. + * + * If i_nlink is 0 then just discard data. This is safe because + * local inode gets i_nlink 0 from server only for the last + * unlink, so that file is not opened somewhere else + */ + cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, inode->i_nlink ? 
+ CL_FSYNC_LOCAL : CL_FSYNC_DISCARD, 1); + } + + ll_truncate_inode_pages_final(inode); + ll_clear_inode(inode); + clear_inode(inode); + + EXIT; +} + +int ll_iocontrol(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int rc, flags = 0; + ENTRY; + + switch (cmd) { + case FS_IOC_GETFLAGS: { + struct mdt_body *body; + struct md_op_data *op_data; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLFLAGS; + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + CERROR("%s: failure inode "DFID": rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, + PFID(ll_inode2fid(inode)), rc); + RETURN(-abs(rc)); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + flags = body->mbo_flags; + /* if Lustre specific LUSTRE_ENCRYPT_FL flag is set, also set + * ext4 equivalent to please lsattr and other e2fsprogs tools + */ + if (flags & LUSTRE_ENCRYPT_FL) + flags |= STATX_ATTR_ENCRYPTED; + + ptlrpc_req_finished(req); + + RETURN(put_user(flags, (int __user *)arg)); + } + case FS_IOC_SETFLAGS: { + struct iattr *attr; + struct md_op_data *op_data; + struct cl_object *obj; + struct fsxattr fa = { 0 }; + + if (get_user(flags, (int __user *)arg)) + RETURN(-EFAULT); + + fa.fsx_projid = ll_i2info(inode)->lli_projid; + if (flags & LUSTRE_PROJINHERIT_FL) + fa.fsx_xflags = FS_XFLAG_PROJINHERIT; + + rc = ll_ioctl_check_project(inode, fa.fsx_xflags, + fa.fsx_projid); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_attr_flags = flags; + op_data->op_xvalid |= OP_XVALID_FLAGS; + rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &req); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc) + RETURN(rc); + + ll_update_inode_flags(inode, flags); + + obj = ll_i2info(inode)->lli_clob; + if (obj == NULL) + RETURN(0); + + OBD_ALLOC_PTR(attr); + if (attr == NULL) + RETURN(-ENOMEM); + + rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS, flags); + + OBD_FREE_PTR(attr); + RETURN(rc); + } + default: + RETURN(-ENOSYS); + } + + RETURN(0); +} + +int ll_flush_ctx(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + + CDEBUG(D_SEC, "flush context for user %d\n", + from_kuid(&init_user_ns, current_uid())); + + obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, + 0, NULL, NULL); + obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, + 0, NULL, NULL); + return 0; +} + +/* umount -f client means force down, don't save state */ +void ll_umount_begin(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct obd_ioctl_data *ioc_data; + int cnt; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb, + sb->s_count, atomic_read(&sb->s_active)); + + obd = class_exp2obd(sbi->ll_md_exp); + if (obd == NULL) { + CERROR("Invalid MDC connection handle %#llx\n", + sbi->ll_md_exp->exp_handle.h_cookie); + EXIT; + return; + } + obd->obd_force = 1; + + obd = class_exp2obd(sbi->ll_dt_exp); + if (obd == NULL) { + CERROR("Invalid LOV connection handle %#llx\n", + sbi->ll_dt_exp->exp_handle.h_cookie); + EXIT; + return; + } + obd->obd_force = 1; + + OBD_ALLOC_PTR(ioc_data); + if (ioc_data) { + 
obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp, + sizeof *ioc_data, ioc_data, NULL); + + obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp, + sizeof *ioc_data, ioc_data, NULL); + + OBD_FREE_PTR(ioc_data); + } + + /* Really, we'd like to wait until there are no requests outstanding, + * and then continue. For now, we just periodically checking for vfs + * to decrement mnt_cnt and hope to finish it within 10sec. + */ + cnt = 10; + while (cnt > 0 && + !may_umount(sbi->ll_mnt.mnt)) { + ssleep(1); + cnt -= 1; + } + + EXIT; +} + +int ll_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + int err; + __u32 read_only; + + if ((*flags & MS_RDONLY) != (sb->s_flags & SB_RDONLY)) { + read_only = *flags & MS_RDONLY; + err = obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_READ_ONLY), + KEY_READ_ONLY, sizeof(read_only), + &read_only, NULL); + if (err) { + LCONSOLE_WARN("Failed to remount %s %s (%d)\n", + profilenm, read_only ? + "read-only" : "read-write", err); + return err; + } + + if (read_only) + sb->s_flags |= SB_RDONLY; + else + sb->s_flags &= ~SB_RDONLY; + + if (test_bit(LL_SBI_VERBOSE, sbi->ll_flags)) + LCONSOLE_WARN("Remounted %s %s\n", profilenm, + read_only ? "read-only" : "read-write"); + } + return 0; +} + +/** + * Cleanup the open handle that is cached on MDT-side. + * + * For open case, the client side open handling thread may hit error + * after the MDT grant the open. Under such case, the client should + * send close RPC to the MDT as cleanup; otherwise, the open handle + * on the MDT will be leaked there until the client umount or evicted. + * + * In further, if someone unlinked the file, because the open handle + * holds the reference on such file/object, then it will block the + * subsequent threads that want to locate such object via FID. + * + * \param[in] sb super block for this file-system + * \param[in] open_req pointer to the original open request + */ +void ll_open_cleanup(struct super_block *sb, struct req_capsule *pill) +{ + struct mdt_body *body; + struct md_op_data *op_data; + struct ptlrpc_request *close_req = NULL; + struct obd_export *exp = ll_s2sbi(sb)->ll_md_exp; + ENTRY; + + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) { + CWARN("%s: cannot allocate op_data to release open handle for " + DFID"\n", ll_s2sbi(sb)->ll_fsname, PFID(&body->mbo_fid1)); + + RETURN_EXIT; + } + + op_data->op_fid1 = body->mbo_fid1; + op_data->op_open_handle = body->mbo_open_handle; + op_data->op_mod_time = ktime_get_real_seconds(); + md_close(exp, op_data, NULL, &close_req); + ptlrpc_req_finished(close_req); + ll_finish_md_op_data(op_data); + + EXIT; +} + +/* set filesystem-wide default LMV for subdir mount if it's enabled on ROOT. 
*/ +static int ll_fileset_default_lmv_fixup(struct inode *inode, + struct lustre_md *md) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + union lmv_mds_md *lmm = NULL; + int size = 0; + int rc; + + LASSERT(is_root_inode(inode)); + LASSERT(!fid_is_root(&sbi->ll_root_fid)); + LASSERT(!md->default_lmv); + + rc = ll_dir_get_default_layout(inode, (void **)&lmm, &size, &req, + OBD_MD_DEFAULT_MEA, + GET_DEFAULT_LAYOUT_ROOT); + if (rc && rc != -ENODATA) + GOTO(out, rc); + + rc = 0; + if (lmm && size) { + rc = md_unpackmd(sbi->ll_md_exp, &md->default_lmv, lmm, size); + if (rc < 0) + GOTO(out, rc); + + rc = 0; + } + EXIT; +out: + if (req) + ptlrpc_req_finished(req); + return rc; +} + +int ll_prep_inode(struct inode **inode, struct req_capsule *pill, + struct super_block *sb, struct lookup_intent *it) +{ + struct ll_sb_info *sbi = NULL; + struct lustre_md md = { NULL }; + bool default_lmv_deleted = false; + int rc; + + ENTRY; + + LASSERT(*inode || sb); + sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode); + rc = md_get_lustre_md(sbi->ll_md_exp, pill, sbi->ll_dt_exp, + sbi->ll_md_exp, &md); + if (rc != 0) + GOTO(out, rc); + + /* + * clear default_lmv only if intent_getattr reply doesn't contain it. + * but it needs to be done after iget, check this early because + * ll_update_lsm_md() may change md. + */ + if (it && (it->it_op & (IT_LOOKUP | IT_GETATTR)) && + S_ISDIR(md.body->mbo_mode) && !md.default_lmv) { + if (unlikely(*inode && is_root_inode(*inode) && + !fid_is_root(&sbi->ll_root_fid))) { + rc = ll_fileset_default_lmv_fixup(*inode, &md); + if (rc) + GOTO(out, rc); + } + + if (!md.default_lmv) + default_lmv_deleted = true; + } + + if (*inode) { + rc = ll_update_inode(*inode, &md); + if (rc != 0) + GOTO(out, rc); + } else { + bool api32 = test_bit(LL_SBI_32BIT_API, sbi->ll_flags); + struct lu_fid *fid1 = &md.body->mbo_fid1; + + LASSERT(sb != NULL); + + /* + * At this point server returns to client's same fid as client + * generated for creating. So using ->fid1 is okay here. + */ + if (!fid_is_sane(fid1)) { + CERROR("%s: Fid is insane "DFID"\n", + sbi->ll_fsname, PFID(fid1)); + GOTO(out, rc = -EINVAL); + } + + *inode = ll_iget(sb, cl_fid_build_ino(fid1, api32), &md); + if (IS_ERR(*inode)) { + lmd_clear_acl(&md); + rc = IS_ERR(*inode) ? PTR_ERR(*inode) : -ENOMEM; + *inode = NULL; + CERROR("new_inode -fatal: rc %d\n", rc); + GOTO(out, rc); + } + } + + /* Handling piggyback layout lock. + * Layout lock can be piggybacked by getattr and open request. + * The lsm can be applied to inode only if it comes with a layout lock + * otherwise correct layout may be overwritten, for example: + * 1. proc1: mdt returns a lsm but not granting layout + * 2. layout was changed by another client + * 3. proc2: refresh layout and layout lock granted + * 4. 
proc1: to apply a stale layout */ + if (it != NULL && it->it_lock_mode != 0) { + struct lustre_handle lockh; + struct ldlm_lock *lock; + + lockh.cookie = it->it_lock_handle; + lock = ldlm_handle2lock(&lockh); + LASSERT(lock != NULL); + if (ldlm_has_layout(lock)) { + struct cl_object_conf conf; + + memset(&conf, 0, sizeof(conf)); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = *inode; + conf.coc_lock = lock; + conf.u.coc_layout = md.layout; + (void)ll_layout_conf(*inode, &conf); + } + LDLM_LOCK_PUT(lock); + } + + if (default_lmv_deleted) + ll_update_default_lsm_md(*inode, &md); + + /* we may want to apply some policy for foreign file/dir */ + if (ll_sbi_has_foreign_symlink(sbi)) { + rc = ll_manage_foreign(*inode, &md); + if (rc < 0) + GOTO(out, rc); + } + + GOTO(out, rc = 0); + +out: + /* cleanup will be done if necessary */ + md_free_lustre_md(sbi->ll_md_exp, &md); + + if (rc != 0 && it != NULL && it->it_op & IT_OPEN) { + ll_intent_drop_lock(it); + ll_open_cleanup(sb != NULL ? sb : (*inode)->i_sb, pill); + } + + return rc; +} + +int ll_obd_statfs(struct inode *inode, void __user *arg) +{ + struct ll_sb_info *sbi = NULL; + struct obd_export *exp; + struct obd_ioctl_data *data = NULL; + __u32 type; + int len = 0, rc; + + if (inode) + sbi = ll_i2sbi(inode); + if (!sbi) + GOTO(out_statfs, rc = -EINVAL); + + rc = obd_ioctl_getdata(&data, &len, arg); + if (rc) + GOTO(out_statfs, rc); + + if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 || + !data->ioc_pbuf1 || !data->ioc_pbuf2) + GOTO(out_statfs, rc = -EINVAL); + + if (data->ioc_inllen1 != sizeof(__u32) || + data->ioc_inllen2 != sizeof(__u32) || + data->ioc_plen1 != sizeof(struct obd_statfs) || + data->ioc_plen2 != sizeof(struct obd_uuid)) + GOTO(out_statfs, rc = -EINVAL); + + memcpy(&type, data->ioc_inlbuf1, sizeof(__u32)); + if (type & LL_STATFS_LMV) + exp = sbi->ll_md_exp; + else if (type & LL_STATFS_LOV) + exp = sbi->ll_dt_exp; + else + GOTO(out_statfs, rc = -ENODEV); + + rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, data, NULL); + if (rc) + GOTO(out_statfs, rc); +out_statfs: + OBD_FREE_LARGE(data, len); + return rc; +} + +/* + * this is normally called in ll_fini_md_op_data(), but sometimes it needs to + * be called early to avoid deadlock. + */ +void ll_unlock_md_op_lsm(struct md_op_data *op_data) +{ + if (op_data->op_mea2_sem) { + up_read_non_owner(op_data->op_mea2_sem); + op_data->op_mea2_sem = NULL; + } + + if (op_data->op_mea1_sem) { + up_read_non_owner(op_data->op_mea1_sem); + op_data->op_mea1_sem = NULL; + } +} + +/* this function prepares md_op_data hint for passing it down to MD stack. */ +struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, + struct inode *i1, struct inode *i2, + const char *name, size_t namelen, + __u32 mode, enum md_op_code opc, + void *data) +{ + struct llcrypt_name fname = { 0 }; + int rc; + + LASSERT(i1 != NULL); + + if (name == NULL) { + /* Do not reuse namelen for something else. 
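+ * A NULL name must be accompanied by namelen == 0; any other
+ * combination is rejected as invalid.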
*/ + if (namelen != 0) + return ERR_PTR(-EINVAL); + } else { + if ((!IS_ENCRYPTED(i1) || + (opc != LUSTRE_OPC_LOOKUP && opc != LUSTRE_OPC_CREATE)) && + namelen > ll_i2sbi(i1)->ll_namelen) + return ERR_PTR(-ENAMETOOLONG); + + /* "/" is not valid name, but it's allowed */ + if (!lu_name_is_valid_2(name, namelen) && + strncmp("/", name, namelen) != 0) + return ERR_PTR(-EINVAL); + } + + if (op_data == NULL) + OBD_ALLOC_PTR(op_data); + + if (op_data == NULL) + return ERR_PTR(-ENOMEM); + + ll_i2gids(op_data->op_suppgids, i1, i2); + /* If the client is using a subdir mount and looks at what it sees as + * /.fscrypt, interpret it as the .fscrypt dir at the root of the fs. + */ + if (unlikely(i1->i_sb && i1->i_sb->s_root && is_root_inode(i1) && + !fid_is_root(ll_inode2fid(i1)) && + name && namelen == strlen(dot_fscrypt_name) && + strncmp(name, dot_fscrypt_name, namelen) == 0)) + lu_root_fid(&op_data->op_fid1); + else + op_data->op_fid1 = *ll_inode2fid(i1); + + if (S_ISDIR(i1->i_mode)) { + down_read_non_owner(&ll_i2info(i1)->lli_lsm_sem); + op_data->op_mea1_sem = &ll_i2info(i1)->lli_lsm_sem; + op_data->op_mea1 = ll_i2info(i1)->lli_lsm_md; + op_data->op_default_mea1 = ll_i2info(i1)->lli_default_lsm_md; + } + + if (i2) { + op_data->op_fid2 = *ll_inode2fid(i2); + if (S_ISDIR(i2->i_mode)) { + if (i2 != i1) { + /* i2 is typically a child of i1, and MUST be + * further from the root to avoid deadlocks. + */ + down_read_non_owner(&ll_i2info(i2)->lli_lsm_sem); + op_data->op_mea2_sem = + &ll_i2info(i2)->lli_lsm_sem; + } + op_data->op_mea2 = ll_i2info(i2)->lli_lsm_md; + } + } else { + fid_zero(&op_data->op_fid2); + } + + if (test_bit(LL_SBI_64BIT_HASH, ll_i2sbi(i1)->ll_flags)) + op_data->op_cli_flags |= CLI_HASH64; + + if (ll_need_32bit_api(ll_i2sbi(i1))) + op_data->op_cli_flags |= CLI_API32; + + if ((i2 && is_root_inode(i2)) || + opc == LUSTRE_OPC_LOOKUP || opc == LUSTRE_OPC_CREATE) { + /* In case of lookup, ll_setup_filename() has already been + * called in ll_lookup_it(), so just take provided name. + * Also take provided name if we are dealing with root inode. 
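+ * In these cases the provided name is used verbatim as the disk name,
+ * without calling ll_setup_filename() again.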
+ */ + fname.disk_name.name = (unsigned char *)name; + fname.disk_name.len = namelen; + } else if (name && namelen) { + struct qstr dname = QSTR_INIT(name, namelen); + struct inode *dir; + struct lu_fid *pfid = NULL; + struct lu_fid fid; + int lookup; + + if (!S_ISDIR(i1->i_mode) && i2 && S_ISDIR(i2->i_mode)) { + /* special case when called from ll_link() */ + dir = i2; + lookup = 0; + } else { + dir = i1; + lookup = (int)(opc == LUSTRE_OPC_ANY); + } + if (opc == LUSTRE_OPC_ANY && lookup) + pfid = &fid; + rc = ll_setup_filename(dir, &dname, lookup, &fname, pfid); + if (rc) { + ll_finish_md_op_data(op_data); + return ERR_PTR(rc); + } + if (pfid && !fid_is_zero(pfid)) { + if (i2 == NULL) + op_data->op_fid2 = fid; + op_data->op_bias = MDS_FID_OP; + } + if (fname.disk_name.name && + fname.disk_name.name != (unsigned char *)name) { + /* op_data->op_name must be freed after use */ + op_data->op_flags |= MF_OPNAME_KMALLOCED; + } + } + + /* In fact LUSTRE_OPC_LOOKUP, LUSTRE_OPC_OPEN + * are LUSTRE_OPC_ANY + */ + if (opc == LUSTRE_OPC_LOOKUP || opc == LUSTRE_OPC_OPEN) + op_data->op_code = LUSTRE_OPC_ANY; + else + op_data->op_code = opc; + op_data->op_name = fname.disk_name.name; + op_data->op_namelen = fname.disk_name.len; + op_data->op_mode = mode; + op_data->op_mod_time = ktime_get_real_seconds(); + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = current_cap(); + op_data->op_mds = 0; + if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) && + filename_is_volatile(name, namelen, &op_data->op_mds)) { + op_data->op_bias |= MDS_CREATE_VOLATILE; + } + op_data->op_data = data; + + return op_data; +} + +void ll_finish_md_op_data(struct md_op_data *op_data) +{ + ll_unlock_md_op_lsm(op_data); + ll_security_release_secctx(op_data->op_file_secctx, + op_data->op_file_secctx_size, + op_data->op_file_secctx_slot); + if (op_data->op_flags & MF_OPNAME_KMALLOCED) + /* allocated via ll_setup_filename called + * from ll_prep_md_op_data + */ + kfree(op_data->op_name); + llcrypt_free_ctx(op_data->op_file_encctx, op_data->op_file_encctx_size); + OBD_FREE_PTR(op_data); +} + +int ll_show_options(struct seq_file *seq, struct dentry *dentry) +{ + struct ll_sb_info *sbi; + int i; + + LASSERT(seq && dentry); + sbi = ll_s2sbi(dentry->d_sb); + + if (test_bit(LL_SBI_NOLCK, sbi->ll_flags)) + seq_puts(seq, "nolock"); + + for (i = 1; ll_sbi_flags_name[i].token != LL_SBI_NUM_MOUNT_OPT; i++) { + /* match_table in some cases has patterns for both enabled and + * disabled cases. Ignore 'no'xxx versions if bit is set. + */ + if (test_bit(ll_sbi_flags_name[i].token, sbi->ll_flags) && + strncmp(ll_sbi_flags_name[i].pattern, "no", 2)) { + if (ll_sbi_flags_name[i].token == + LL_SBI_FOREIGN_SYMLINK) { + seq_show_option(seq, "foreign_symlink", + sbi->ll_foreign_symlink_prefix); + } else { + seq_printf(seq, ",%s", + ll_sbi_flags_name[i].pattern); + } + + /* You can have either localflock or flock but not + * both. If localflock is set don't print flock or + * noflock. 
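+ * This is why the loop index is advanced past the next two table
+ * entries below.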
+ */ + if (ll_sbi_flags_name[i].token == LL_SBI_LOCALFLOCK) + i += 2; + } else if (!test_bit(ll_sbi_flags_name[i].token, sbi->ll_flags) && + !strncmp(ll_sbi_flags_name[i].pattern, "no", 2)) { + seq_printf(seq, ",%s", + ll_sbi_flags_name[i].pattern); + } + } + + llcrypt_show_test_dummy_encryption(seq, ',', dentry->d_sb); + + if (test_bit(LL_SBI_MDLL, sbi->ll_flags)) + seq_puts(seq, ",mdll"); + + if (test_bit(LL_SBI_MDLL_BYPASS, sbi->ll_flags)) + seq_puts(seq, ",mdll_bypass"); + + if (test_bit(LL_SBI_MDLL_AUTO_REFRESH, sbi->ll_flags)) + seq_puts(seq, ",mdll_auto_refresh"); + + RETURN(0); +} + +/** + * Get obd name by cmd, and copy out to user space + */ +int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_device *obd; + ENTRY; + + if (cmd == OBD_IOC_GETNAME_OLD || cmd == OBD_IOC_GETDTNAME) + obd = class_exp2obd(sbi->ll_dt_exp); + else if (cmd == OBD_IOC_GETMDNAME) + obd = class_exp2obd(sbi->ll_md_exp); + else + RETURN(-EINVAL); + + if (!obd) + RETURN(-ENOENT); + + if (copy_to_user((void __user *)arg, obd->obd_name, + strlen(obd->obd_name) + 1)) + RETURN(-EFAULT); + + RETURN(0); +} + +struct dname_buf { + struct work_struct db_work; + struct dentry *db_dentry; + /* Let's hope the path is not too long, 32 bytes for the work struct + * on my kernel + */ + char buf[PAGE_SIZE - sizeof(struct work_struct) - sizeof(void *)]; +}; + +static void ll_dput_later(struct work_struct *work) +{ + struct dname_buf *db = container_of(work, struct dname_buf, db_work); + + dput(db->db_dentry); + free_page((unsigned long)db); +} + +static char* ll_d_path(struct dentry *dentry, char *buf, int bufsize) +{ + char *path = NULL; + + struct path p; + + p.dentry = dentry; + p.mnt = current->fs->root.mnt; + path_get(&p); + path = d_path(&p, buf, bufsize); + path_put(&p); + return path; +} + +void ll_dirty_page_discard_warn(struct inode *inode, int ioret) +{ + struct dname_buf *db; + char *path = NULL; + struct dentry *dentry = NULL; + + /* this can be called inside spin lock so use GFP_ATOMIC. */ + db = (struct dname_buf *)__get_free_page(GFP_ATOMIC); + if (db != NULL) { + + dentry = d_find_alias(inode); + if (dentry != NULL) + path = ll_d_path(dentry, db->buf, sizeof(db->buf)); + } + + /* The below message is checked in recovery-small.sh test_24b */ + CDEBUG(D_WARNING, + "%s: dirty page discard: %s/fid: "DFID"/%s may get corrupted " + "(rc %d)\n", ll_i2sbi(inode)->ll_fsname, + s2lsi(inode->i_sb)->lsi_lmd->lmd_dev, + PFID(ll_inode2fid(inode)), + (path && !IS_ERR(path)) ? 
path : "", ioret); + + if (dentry != NULL) { + /* We cannot dput here since if we happen to be the last holder + * then we can end up waiting for page evictions that + * in turn wait for RPCs that need this instance of ptlrpcd + * (callng brw_interpret->*page_completion*->vmpage_error->here) + * LU-15340 + */ + INIT_WORK(&db->db_work, ll_dput_later); + db->db_dentry = dentry; + schedule_work(&db->db_work); + } else { + if (db != NULL) + free_page((unsigned long)db); + } +} + +ssize_t ll_copy_user_md(const struct lov_user_md __user *md, + struct lov_user_md **kbuf) +{ + struct lov_user_md lum; + ssize_t lum_size; + ENTRY; + + if (copy_from_user(&lum, md, sizeof(lum))) + RETURN(-EFAULT); + + lum_size = ll_lov_user_md_size(&lum); + if (lum_size < 0) + RETURN(lum_size); + + OBD_ALLOC_LARGE(*kbuf, lum_size); + if (*kbuf == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(*kbuf, md, lum_size) != 0) { + OBD_FREE_LARGE(*kbuf, lum_size); + RETURN(-EFAULT); + } + + RETURN(lum_size); +} + +/* + * Compute llite root squash state after a change of root squash + * configuration setting or add/remove of a lnet nid + */ +void ll_compute_rootsquash_state(struct ll_sb_info *sbi) +{ + struct root_squash_info *squash = &sbi->ll_squash; + int i; + bool matched; + struct lnet_processid id; + + /* Update norootsquash flag */ + spin_lock(&squash->rsi_lock); + if (list_empty(&squash->rsi_nosquash_nids)) + clear_bit(LL_SBI_NOROOTSQUASH, sbi->ll_flags); + else { + /* Do not apply root squash as soon as one of our NIDs is + * in the nosquash_nids list */ + matched = false; + i = 0; + while (LNetGetId(i++, &id) != -ENOENT) { + if (nid_is_lo0(&id.nid)) + continue; + if (cfs_match_nid(lnet_nid_to_nid4(&id.nid), + &squash->rsi_nosquash_nids)) { + matched = true; + break; + } + } + if (matched) + set_bit(LL_SBI_NOROOTSQUASH, sbi->ll_flags); + else + clear_bit(LL_SBI_NOROOTSQUASH, sbi->ll_flags); + } + spin_unlock(&squash->rsi_lock); +} + +/** + * Parse linkea content to extract information about a given hardlink + * + * \param[in] ldata - Initialized linkea data + * \param[in] linkno - Link identifier + * \param[out] parent_fid - The entry's parent FID + * \param[out] ln - Entry name destination buffer + * + * \retval 0 on success + * \retval Appropriate negative error code on failure + */ +static int ll_linkea_decode(struct linkea_data *ldata, unsigned int linkno, + struct lu_fid *parent_fid, struct lu_name *ln) +{ + unsigned int idx; + int rc; + ENTRY; + + rc = linkea_init_with_rec(ldata); + if (rc < 0) + RETURN(rc); + + if (linkno >= ldata->ld_leh->leh_reccount) + /* beyond last link */ + RETURN(-ENODATA); + + linkea_first_entry(ldata); + for (idx = 0; ldata->ld_lee != NULL; idx++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, ln, + parent_fid); + if (idx == linkno) + break; + + linkea_next_entry(ldata); + } + + if (idx < linkno) + RETURN(-ENODATA); + + RETURN(0); +} + +/** + * Get parent FID and name of an identified link. Operation is performed for + * a given link number, letting the caller iterate over linkno to list one or + * all links of an entry. + * + * \param[in] file - File descriptor against which to perform the operation + * \param[in,out] arg - User-filled structure containing the linkno to operate + * on and the available size. 
It is eventually filled with + * the requested information or left untouched on error + * + * \retval - 0 on success + * \retval - Appropriate negative error code on failure + */ +int ll_getparent(struct file *file, struct getparent __user *arg) +{ + struct inode *inode = file_inode(file); + struct linkea_data *ldata; + struct lu_buf buf = LU_BUF_NULL; + struct lu_name ln; + struct lu_fid parent_fid; + __u32 linkno; + __u32 name_size; + int rc; + + ENTRY; + + if (!capable(CAP_DAC_READ_SEARCH) && + !test_bit(LL_SBI_USER_FID2PATH, ll_i2sbi(inode)->ll_flags)) + RETURN(-EPERM); + + if (get_user(name_size, &arg->gp_name_size)) + RETURN(-EFAULT); + + if (get_user(linkno, &arg->gp_linkno)) + RETURN(-EFAULT); + + if (name_size > PATH_MAX) + RETURN(-EINVAL); + + OBD_ALLOC(ldata, sizeof(*ldata)); + if (ldata == NULL) + RETURN(-ENOMEM); + + rc = linkea_data_new(ldata, &buf); + if (rc < 0) + GOTO(ldata_free, rc); + + rc = ll_xattr_list(inode, XATTR_NAME_LINK, XATTR_TRUSTED_T, buf.lb_buf, + buf.lb_len, OBD_MD_FLXATTR); + if (rc < 0) + GOTO(lb_free, rc); + + rc = ll_linkea_decode(ldata, linkno, &parent_fid, &ln); + if (rc < 0) + GOTO(lb_free, rc); + + if (ln.ln_namelen >= name_size) + GOTO(lb_free, rc = -EOVERFLOW); + + if (copy_to_user(&arg->gp_fid, &parent_fid, sizeof(arg->gp_fid))) + GOTO(lb_free, rc = -EFAULT); + + if (copy_to_user(&arg->gp_name, ln.ln_name, ln.ln_namelen)) + GOTO(lb_free, rc = -EFAULT); + + if (put_user('\0', arg->gp_name + ln.ln_namelen)) + GOTO(lb_free, rc = -EFAULT); + +lb_free: + lu_buf_free(&buf); +ldata_free: + OBD_FREE(ldata, sizeof(*ldata)); + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c new file mode 100644 index 0000000000000..16d73ebd71146 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c @@ -0,0 +1,616 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" +#include + +static const struct vm_operations_struct ll_file_vm_ops; + +void policy_from_vma(union ldlm_policy_data *policy, struct vm_area_struct *vma, + unsigned long addr, size_t count) +{ + policy->l_extent.start = ((addr - vma->vm_start) & PAGE_MASK) + + (vma->vm_pgoff << PAGE_SHIFT); + policy->l_extent.end = (policy->l_extent.start + count - 1) | + ~PAGE_MASK; +} + +/* + * Linux commit v6.0-rc3-225-gf39af05949a4 + * mm: add VMA iterator + */ +#ifndef VMA_ITERATOR +#define vma_iterator vm_area_struct * +#define vma_iter_init(vmip, mm, addr) *(vmip) = find_vma(mm, addr) +#define for_each_vma(vmi, vma) \ + for (vma = vmi; vma != NULL; vma = vma->vm_next) +#endif + +struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, + size_t count) +{ + struct vm_area_struct *vma, *ret = NULL; + struct vma_iterator vmi; + + ENTRY; + + /* mmap_lock must have been held by caller. */ + LASSERT(!mmap_write_trylock(mm)); + + vma_iter_init(&vmi, mm, addr); + for_each_vma(vmi, vma) { + if (vma->vm_start < (addr + count)) + break; + if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops && + vma->vm_flags & VM_SHARED) { + ret = vma; + break; + } + } + RETURN(ret); +} + +/** + * API independent part for page fault initialization. + * \param env - corespondent lu_env to processing + * \param vma - virtual memory area addressed to page fault + * \param index - page index corespondent to fault. + * \param mkwrite - whether it is mmap write. + * + * \return error codes from cl_io_init. + */ +static struct cl_io * +ll_fault_io_init(struct lu_env *env, struct vm_area_struct *vma, + pgoff_t index, bool mkwrite) +{ + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct cl_io *io; + struct cl_fault_io *fio; + int rc; + ENTRY; + + if (ll_file_nolock(file)) + RETURN(ERR_PTR(-EOPNOTSUPP)); + +restart: + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + LASSERT(io->ci_obj != NULL); + + fio = &io->u.ci_fault; + fio->ft_index = index; + fio->ft_executable = vma->vm_flags & VM_EXEC; + + if (mkwrite) { + fio->ft_mkwrite = 1; + fio->ft_writable = 1; + } + + CDEBUG(D_MMAP, + DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu\n", + PFID(&ll_i2info(inode)->lli_fid), vma, vma->vm_start, + vma->vm_end, vma->vm_flags, fio->ft_index); + + if (vma->vm_flags & VM_SEQ_READ) + io->ci_seq_read = 1; + else if (vma->vm_flags & VM_RAND_READ) + io->ci_rand_read = 1; + + rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj); + if (rc == 0) { + struct vvp_io *vio = vvp_env_io(env); + struct ll_file_data *fd = file->private_data; + + LASSERT(vio->vui_cl.cis_io == io); + + /* mmap lock must be MANDATORY it has to cache + * pages. 
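+ * (enforced by setting io->ci_lockreq to CILR_MANDATORY just below)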
*/ + io->ci_lockreq = CILR_MANDATORY; + vio->vui_fd = fd; + } else { + cl_io_fini(env, io); + if (io->ci_need_restart) + goto restart; + + io = ERR_PTR(rc); + } + + RETURN(io); +} + +/* Sharing code of page_mkwrite method for rhel5 and rhel6 */ +static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, + bool *retry) +{ + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio; + int result; + __u16 refcheck; + sigset_t old, new; + struct inode *inode = NULL; + struct ll_inode_info *lli; + ENTRY; + + LASSERT(vmpage != NULL); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = ll_fault_io_init(env, vma, vmpage->index, true); + if (IS_ERR(io)) + GOTO(out, result = PTR_ERR(io)); + + result = io->ci_result; + if (result < 0) + GOTO(out_io, result); + + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = vmpage; + + siginitsetinv(&new, sigmask(SIGKILL) | sigmask(SIGTERM)); + sigprocmask(SIG_BLOCK, &new, &old); + + inode = vvp_object_inode(io->ci_obj); + lli = ll_i2info(inode); + + result = cl_io_loop(env, io); + + sigprocmask(SIG_SETMASK, &old, NULL); + + if (result == 0) { + lock_page(vmpage); + if (vmpage->mapping == NULL) { + unlock_page(vmpage); + + /* page was truncated and lock was cancelled, return + * ENODATA so that VM_FAULT_NOPAGE will be returned + * to handle_mm_fault(). */ + if (result == 0) + result = -ENODATA; + } else if (!PageDirty(vmpage)) { + /* race, the page has been cleaned by ptlrpcd after + * it was unlocked, it has to be added into dirty + * cache again otherwise this soon-to-dirty page won't + * consume any grants, even worse if this page is being + * transferred because it will break RPC checksum. + */ + unlock_page(vmpage); + + CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has " + "been written out, retry.\n", + vmpage, vmpage->index); + + *retry = true; + result = -EAGAIN; + } + + if (result == 0) + set_bit(LLIF_DATA_MODIFIED, &lli->lli_flags); + } + EXIT; + +out_io: + cl_io_fini(env, io); +out: + cl_env_put(env, &refcheck); + CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result); + LASSERT(ergo(result == 0, PageLocked(vmpage))); + + /* if page has been unmapped, presumably due to lock reclaim for + * concurrent usage, add some delay before retrying to prevent + * entering live-lock situation with competitors + */ + if (result == -ENODATA && inode != NULL) { + CDEBUG(D_MMAP, "delaying new page-fault for inode %p to " + "prevent live-lock\n", inode); + msleep(10); + } + + return result; +} + +static inline int to_fault_error(int result) +{ + switch(result) { + case 0: + result = VM_FAULT_LOCKED; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + return result; +} + +int ll_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int ret; + unsigned int seq; + + /* this seqlock lets us notice if a page has been deleted on this inode + * during the fault process, allowing us to catch an erroneous SIGBUS + * See LU-16160 + */ + do { + seq = read_seqbegin(&ll_i2info(inode)->lli_page_inv_lock); + ret = __ll_filemap_fault(vma, vmf); + } while (read_seqretry(&ll_i2info(inode)->lli_page_inv_lock, seq) && + (ret & VM_FAULT_SIGBUS)); + + return ret; +} + +/** + * Lustre implementation of a vm_operations_struct::fault() method, called by + * VM to server page fault (both in kernel and user space). 
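+ * When fast reads are enabled, the fault is first tried through the
+ * generic filemap path without taking a DLM lock; only if the page is
+ * not cached, or cannot be locked, does it fall back to a full cl_io
+ * fault.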
+ * + * \param vma - is virtiual area struct related to page fault + * \param vmf - structure which describe type and address where hit fault + * + * \return allocated and filled _locked_ page for address + * \retval VM_FAULT_ERROR on general error + * \retval NOPAGE_OOM not have memory for allocate new page + */ +static vm_fault_t ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio = NULL; + struct page *vmpage; + int result = 0; + int fault_ret = 0; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + if (ll_sbi_has_fast_read(ll_i2sbi(inode))) { + /* do fast fault */ + bool allow_retry = vmf->flags & FAULT_FLAG_ALLOW_RETRY; + bool has_retry = vmf->flags & FAULT_FLAG_RETRY_NOWAIT; + + /* To avoid loops, instruct downstream to not drop mmap_sem */ + /** + * only need FAULT_FLAG_ALLOW_RETRY prior to Linux 5.1 + * (6b4c9f4469819), where FAULT_FLAG_RETRY_NOWAIT is enough + * to not drop mmap_sem when failed to lock the page. + */ + vmf->flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; + ll_cl_add(inode, env, NULL, LCC_MMAP); + fault_ret = ll_filemap_fault(vma, vmf); + ll_cl_remove(inode, env); + if (!has_retry) + vmf->flags &= ~FAULT_FLAG_RETRY_NOWAIT; + if (!allow_retry) + vmf->flags &= ~FAULT_FLAG_ALLOW_RETRY; + + /* - If there is no error, then the page was found in cache and + * uptodate; + * - If VM_FAULT_RETRY is set, the page existed but failed to + * lock. We will try slow path to avoid loops. + * - Otherwise, it should try normal fault under DLM lock. */ + if (!(fault_ret & VM_FAULT_RETRY) && + !(fault_ret & VM_FAULT_ERROR)) + GOTO(out, result = 0); + + fault_ret = 0; + } + + io = ll_fault_io_init(env, vma, vmf->pgoff, false); + if (IS_ERR(io)) + GOTO(out, result = PTR_ERR(io)); + + result = io->ci_result; + if (result == 0) { + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = NULL; + vio->u.fault.ft_vmf = vmf; + vio->u.fault.ft_flags = 0; + vio->u.fault.ft_flags_valid = 0; + + /* May call ll_readpage() */ + ll_cl_add(inode, env, io, LCC_MMAP); + + result = cl_io_loop(env, io); + + ll_cl_remove(inode, env); + + /* ft_flags are only valid if we reached + * the call to filemap_fault */ + if (vio->u.fault.ft_flags_valid) + fault_ret = vio->u.fault.ft_flags; + + vmpage = vio->u.fault.ft_vmpage; + if (result != 0 && vmpage != NULL) { + put_page(vmpage); + vmf->page = NULL; + } + } + cl_io_fini(env, io); + +out: + cl_env_put(env, &refcheck); + if (result != 0 && !(fault_ret & VM_FAULT_RETRY)) + fault_ret |= to_fault_error(result); + + CDEBUG(D_MMAP, "%s fault %d/%d\n", current->comm, fault_ret, result); + RETURN(fault_ret); +} + +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY +static vm_fault_t ll_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; +#else +static vm_fault_t ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +#endif + int count = 0; + bool printed = false; + bool cached; + vm_fault_t result; + ktime_t kstart = ktime_get(); + sigset_t old, new; + + result = pcc_fault(vma, vmf, &cached); + if (cached) + goto out; + + CDEBUG(D_MMAP|D_IOTRACE, + DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu\n", + PFID(&ll_i2info(file_inode(vma->vm_file))->lli_fid), + vma, vma->vm_start, vma->vm_end, vma->vm_flags, vmf->pgoff); + + /* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite + * so that it can be killed by admin but not cause 
segfault by + * other signals. + */ + siginitsetinv(&new, sigmask(SIGKILL) | sigmask(SIGTERM)); + sigprocmask(SIG_BLOCK, &new, &old); + + /* make sure offset is not a negative number */ + if (vmf->pgoff > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) + return VM_FAULT_SIGBUS; + +restart: + result = ll_fault0(vma, vmf); + if (vmf->page && + !(result & (VM_FAULT_RETRY | VM_FAULT_ERROR | VM_FAULT_LOCKED))) { + struct page *vmpage = vmf->page; + + /* lock the page, then check if this page has been truncated + * or deleted from Lustre and retry if so + */ + lock_page(vmpage); + if (unlikely(vmpage->mapping == NULL) || + vmpage->private == 0) { /* unlucky */ + unlock_page(vmpage); + put_page(vmpage); + vmf->page = NULL; + + if (!printed && ++count > 16) { + struct inode *inode = file_inode(vma->vm_file); + + CWARN("%s: FID "DFID" under heavy mmap contention by '%s', consider revising IO pattern\n", + ll_i2sbi(inode)->ll_fsname, + PFID(&ll_i2info(inode)->lli_fid), + current->comm); + printed = true; + } + + goto restart; + } + + result |= VM_FAULT_LOCKED; + } + sigprocmask(SIG_SETMASK, &old, NULL); + +out: + if (vmf->page && result == VM_FAULT_LOCKED) { + ll_rw_stats_tally(ll_i2sbi(file_inode(vma->vm_file)), + current->pid, vma->vm_file->private_data, + cl_offset(NULL, vmf->page->index), PAGE_SIZE, + READ); + ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)), + LPROC_LL_FAULT, + ktime_us_delta(ktime_get(), kstart)); + } + + return result; +} + +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY +static vm_fault_t ll_page_mkwrite(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; +#else +static vm_fault_t ll_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ +#endif + int count = 0; + bool printed = false; + bool retry; + bool cached; + ktime_t kstart = ktime_get(); + vm_fault_t result; + + CDEBUG(D_MMAP|D_IOTRACE, + DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu\n", + PFID(&ll_i2info(file_inode(vma->vm_file))->lli_fid), + vma, vma->vm_start, vma->vm_end, vma->vm_flags, + vmf->page->index); + + result = pcc_page_mkwrite(vma, vmf, &cached); + if (cached) + goto out; + + file_update_time(vma->vm_file); + do { + retry = false; + result = ll_page_mkwrite0(vma, vmf->page, &retry); + + if (!printed && ++count > 16) { + const struct dentry *de = file_dentry(vma->vm_file); + + CWARN("app(%s): the page %lu of file "DFID" is under heavy contention\n", + current->comm, vmf->pgoff, + PFID(ll_inode2fid(de->d_inode))); + printed = true; + } + } while (retry); + + switch (result) { + case 0: + LASSERT(PageLocked(vmf->page)); + result = VM_FAULT_LOCKED; + break; + case -ENODATA: + case -EFAULT: + result = VM_FAULT_NOPAGE; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + case -EAGAIN: + result = VM_FAULT_RETRY; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + +out: + if (result == VM_FAULT_LOCKED) { + ll_rw_stats_tally(ll_i2sbi(file_inode(vma->vm_file)), + current->pid, vma->vm_file->private_data, + cl_offset(NULL, vmf->page->index), PAGE_SIZE, + WRITE); + ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)), + LPROC_LL_MKWRITE, + ktime_us_delta(ktime_get(), kstart)); + } + + return result; +} + +/** + * To avoid cancel the locks covering mmapped region for lock cache pressure, + * we track the mapped vma count in vvp_object::vob_mmap_cnt. 
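+ * ll_vm_open() increments the counter and ll_vm_close() decrements it.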
+ */ +static void ll_vm_open(struct vm_area_struct * vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct vvp_object *vob = cl_inode2vvp(inode); + + ENTRY; + LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); + atomic_inc(&vob->vob_mmap_cnt); + pcc_vm_open(vma); + EXIT; +} + +/** + * Dual to ll_vm_open(). + */ +static void ll_vm_close(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct vvp_object *vob = cl_inode2vvp(inode); + + ENTRY; + atomic_dec(&vob->vob_mmap_cnt); + LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); + pcc_vm_close(vma); + EXIT; +} + +static const struct vm_operations_struct ll_file_vm_ops = { + .fault = ll_fault, + .page_mkwrite = ll_page_mkwrite, + .open = ll_vm_open, + .close = ll_vm_close, +}; + +int ll_file_mmap(struct file *file, struct vm_area_struct * vma) +{ + struct inode *inode = file_inode(file); + ktime_t kstart = ktime_get(); + bool cached; + int rc; + + ENTRY; + CDEBUG(D_VFSTRACE | D_MMAP, + "VFS_Op: fid="DFID" vma=%p start=%#lx end=%#lx vm_flags=%#lx\n", + PFID(&ll_i2info(inode)->lli_fid), + vma, vma->vm_start, vma->vm_end, vma->vm_flags); + + if (ll_file_nolock(file)) + RETURN(-EOPNOTSUPP); + + rc = pcc_file_mmap(file, vma, &cached); + if (cached && rc != 0) + RETURN(rc); + + rc = generic_file_mmap(file, vma); + if (rc == 0) { + vma->vm_ops = &ll_file_vm_ops; + vma->vm_ops->open(vma); + /* update the inode's size and mtime */ + if (!cached) + rc = ll_glimpse_size(inode); + } + + if (!rc) + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MMAP, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c b/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c new file mode 100644 index 0000000000000..7d16d9d165506 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c @@ -0,0 +1,401 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/lustre/llite/llite_nfs.c + * + * NFS export of Lustre Light File System + * + * Author: Yury Umanets + * Author: Huang Hua + */ + +#define DEBUG_SUBSYSTEM S_LLITE +#include "llite_internal.h" +#include + +u32 get_uuid2int(const char *name, int len) +{ + u32 key0 = 0x12a3fe2d, key1 = 0x37abe8f9; + + while (len--) { + u32 key = key1 + (key0 ^ (*name++ * 7152373)); + + if (key & 0x80000000) + key -= 0x7fffffff; + + key1 = key0; + key0 = key; + } + return (key0 << 1); +} + +struct inode *search_inode_for_lustre(struct super_block *sb, + const struct lu_fid *fid) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ptlrpc_request *req = NULL; + struct inode *inode = NULL; + int eadatalen = 0; + unsigned long hash = cl_fid_build_ino(fid, ll_need_32bit_api(sbi)); + struct md_op_data *op_data; + int rc; + + ENTRY; + + CDEBUG(D_INFO, "searching inode for:(%lu,"DFID")\n", hash, PFID(fid)); + + inode = ilookup5(sb, hash, ll_test_inode_by_fid, (void *)fid); + if (inode) + RETURN(inode); + + rc = ll_get_default_mdsize(sbi, &eadatalen); + if (rc) + RETURN(ERR_PTR(rc)); + + /* + * Because inode is NULL, ll_prep_md_op_data can not + * be used here. So we allocate op_data ourselves + */ + OBD_ALLOC_PTR(op_data); + if (!op_data) + return ERR_PTR(-ENOMEM); + + op_data->op_fid1 = *fid; + op_data->op_mode = eadatalen; + op_data->op_valid = OBD_MD_FLEASIZE; + + /* mds_fid2dentry ignores f_type */ + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + OBD_FREE_PTR(op_data); + if (rc) { + /* + * Suppress erroneous/confusing messages when NFS + * is out of sync and requests old data. + */ + CDEBUG(D_INFO, "can't get object attrs, fid "DFID", rc %d\n", + PFID(fid), rc); + RETURN(ERR_PTR(rc)); + } + rc = ll_prep_inode(&inode, &req->rq_pill, sb, NULL); + ptlrpc_req_finished(req); + if (rc) + RETURN(ERR_PTR(rc)); + + RETURN(inode); +} + +static struct dentry * +ll_iget_for_nfs(struct super_block *sb, struct lu_fid *fid, struct lu_fid *parent) +{ + struct inode *inode; + struct dentry *result; + + ENTRY; + + if (!fid_is_sane(fid)) + RETURN(ERR_PTR(-ESTALE)); + + CDEBUG(D_INFO, "Get dentry for fid: "DFID"\n", PFID(fid)); + + inode = search_inode_for_lustre(sb, fid); + if (IS_ERR(inode)) + RETURN(ERR_CAST(inode)); + + if (is_bad_inode(inode)) { + /* we didn't find the right inode.. */ + iput(inode); + RETURN(ERR_PTR(-ESTALE)); + } + + /* N.B. d_obtain_alias() drops inode ref on error */ + result = d_obtain_alias(inode); + if (!IS_ERR(result)) { + struct ll_dentry_data *ldd; + + if (!ll_d_setup(result, true)) + RETURN(ERR_PTR(-ENOMEM)); + ldd = ll_d2d(result); + /* + * Need to signal to the ll_file_open that + * we came from NFS and so opencache needs to be + * enabled for this one + */ + spin_lock(&result->d_lock); + ldd->lld_nfs_dentry = 1; + spin_unlock(&result->d_lock); + } + + RETURN(result); +} + +#ifndef FILEID_INVALID +#define FILEID_INVALID 0xff +#endif +#ifndef FILEID_LUSTRE +#define FILEID_LUSTRE 0x97 +#endif + +/** + * \a connectable - is nfsd will connect himself or this should be done + * at lustre + * + * The return value is file handle type: + * 1 -- contains child file handle; + * 2 -- contains child file handle and parent file handle; + * 255 -- error. 
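+ * In this implementation the handle always carries the child FID plus the
+ * parent FID (zeroed when no parent is supplied), so FILEID_LUSTRE is
+ * returned on success and FILEID_INVALID (255) when the buffer is too
+ * small.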
+ */ +static int ll_encode_fh(struct inode *inode, u32 *fh, int *plen, + struct inode *parent) +{ + int fileid_len = sizeof(struct lustre_file_handle) / 4; + struct lustre_file_handle *lfh = (void *)fh; + + ENTRY; + + CDEBUG(D_INFO, "%s: encoding for ("DFID") maxlen=%d minlen=%d\n", + ll_i2sbi(inode)->ll_fsname, + PFID(ll_inode2fid(inode)), *plen, fileid_len); + + if (*plen < fileid_len) { + *plen = fileid_len; + RETURN(FILEID_INVALID); + } + + lfh->lfh_child = *ll_inode2fid(inode); + if (parent) + lfh->lfh_parent = *ll_inode2fid(parent); + else + fid_zero(&lfh->lfh_parent); + *plen = fileid_len; + + RETURN(FILEID_LUSTRE); +} + +static inline int +do_nfs_get_name_filldir(struct ll_getname_data *lgd, const char *name, + int namelen, loff_t hash, u64 ino, unsigned int type) +{ + /* + * It is hack to access lde_fid for comparison with lgd_fid. + * So the input 'name' must be part of the 'lu_dirent', and + * so must appear to be a non-const pointer to an empty array. + */ + char (*n)[0] = (void *)name; + /* NOTE: This should be container_of(). However container_of() in + * kernels earlier than v4.13-rc1~37^2~94 cause this to generate a + * warning, which fails when we compile with -Werror. Those earlier + * kernels don't have container_of_safe, calling that instead will use + * the lustre-local version which doesn't generate the warning. + */ + struct lu_dirent *lde = container_of_safe(n, struct lu_dirent, lde_name); + struct lu_fid fid; + + fid_le_to_cpu(&fid, &lde->lde_fid); + if (lu_fid_eq(&fid, &lgd->lgd_fid)) { + memcpy(lgd->lgd_name, name, namelen); + lgd->lgd_name[namelen] = 0; + lgd->lgd_found = 1; + } + return lgd->lgd_found; +} + +#ifdef HAVE_FILLDIR_USE_CTX_RETURN_BOOL +static bool +ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name, int namelen, + loff_t hash, u64 ino, unsigned int type) +{ + struct ll_getname_data *lgd = + container_of(ctx, struct ll_getname_data, ctx); + int err = do_nfs_get_name_filldir(lgd, name, namelen, hash, ino, type); + + return err == 0; +} +#elif defined(HAVE_FILLDIR_USE_CTX) +static int +ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name, int namelen, + loff_t hash, u64 ino, unsigned int type) +{ + struct ll_getname_data *lgd = + container_of(ctx, struct ll_getname_data, ctx); + + return do_nfs_get_name_filldir(lgd, name, namelen, hash, ino, type); +} +#else +static int ll_nfs_get_name_filldir(void *cookie, const char *name, int namelen, + loff_t hash, u64 ino, unsigned int type) +{ + struct ll_getname_data *lgd = cookie; + + return do_nfs_get_name_filldir(lgd, name, namelen, hash, ino, type); +} +#endif /* HAVE_FILLDIR_USE_CTX */ + +static int ll_get_name(struct dentry *dentry, char *name, struct dentry *child) +{ + struct inode *dir = dentry->d_inode; + struct ll_getname_data lgd = { + .lgd_name = name, + .lgd_fid = ll_i2info(child->d_inode)->lli_fid, +#ifdef HAVE_DIR_CONTEXT + .ctx.actor = ll_nfs_get_name_filldir, +#endif + .lgd_found = 0, + }; + struct md_op_data *op_data; + u64 pos = 0; + int rc; + + ENTRY; + + if (!dir || !S_ISDIR(dir->i_mode)) + GOTO(out, rc = -ENOTDIR); + + if (!dir->i_fop) + GOTO(out, rc = -EINVAL); + + op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + + inode_lock(dir); +#ifdef HAVE_DIR_CONTEXT + rc = ll_dir_read(dir, &pos, op_data, &lgd.ctx, NULL); +#else + rc = ll_dir_read(dir, &pos, op_data, &lgd, ll_nfs_get_name_filldir, + NULL); +#endif + inode_unlock(dir); + ll_finish_md_op_data(op_data); + if (!rc && 
!lgd.lgd_found) + rc = -ENOENT; + EXIT; +out: + return rc; +} + +static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct lustre_file_handle *lfh = (struct lustre_file_handle *)fid; + + if (fh_type != FILEID_LUSTRE) + RETURN(ERR_PTR(-EPROTO)); + + RETURN(ll_iget_for_nfs(sb, &lfh->lfh_child, &lfh->lfh_parent)); +} + +static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct lustre_file_handle *lfh = (struct lustre_file_handle *)fid; + + if (fh_type != FILEID_LUSTRE) + RETURN(ERR_PTR(-EPROTO)); + + RETURN(ll_iget_for_nfs(sb, &lfh->lfh_parent, NULL)); +} + +int ll_dir_get_parent_fid(struct inode *dir, struct lu_fid *parent_fid) +{ + struct ptlrpc_request *req = NULL; + struct ll_sb_info *sbi; + struct mdt_body *body; + static const char dotdot[] = ".."; + struct md_op_data *op_data; + int rc; + int lmmsize; + + ENTRY; + + LASSERT(dir && S_ISDIR(dir->i_mode)); + + sbi = ll_s2sbi(dir->i_sb); + + CDEBUG(D_INFO, "%s: getting parent for ("DFID")\n", + sbi->ll_fsname, PFID(ll_inode2fid(dir))); + + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc != 0) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot, + strlen(dotdot), lmmsize, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc != 0) { + CERROR("%s: failure inode "DFID" get parent: rc = %d\n", + sbi->ll_fsname, PFID(ll_inode2fid(dir)), rc); + RETURN(rc); + } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + /* + * LU-3952: MDT may lost the FID of its parent, we should not crash + * the NFS server, ll_iget_for_nfs() will handle the error. + */ + if (body->mbo_valid & OBD_MD_FLID) { + CDEBUG(D_INFO, "parent for "DFID" is "DFID"\n", + PFID(ll_inode2fid(dir)), PFID(&body->mbo_fid1)); + *parent_fid = body->mbo_fid1; + } + + ptlrpc_req_finished(req); + RETURN(0); +} + +static struct dentry *ll_get_parent(struct dentry *dchild) +{ + struct lu_fid parent_fid = { 0 }; + int rc; + struct dentry *dentry; + + ENTRY; + + rc = ll_dir_get_parent_fid(dchild->d_inode, &parent_fid); + if (rc != 0) + RETURN(ERR_PTR(rc)); + + dentry = ll_iget_for_nfs(dchild->d_inode->i_sb, &parent_fid, NULL); + + RETURN(dentry); +} + +const struct export_operations lustre_export_operations = { + .get_parent = ll_get_parent, + .encode_fh = ll_encode_fh, + .get_name = ll_get_name, + .fh_to_dentry = ll_fh_to_dentry, + .fh_to_parent = ll_fh_to_parent, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c new file mode 100644 index 0000000000000..af2629f1e9c32 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c @@ -0,0 +1,2585 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include + +#include +#include +#include + +#include "llite_internal.h" +#include "vvp_internal.h" + +static struct kobject *llite_kobj; +static struct dentry *llite_root; + +static void llite_kobj_release(struct kobject *kobj) +{ + if (!IS_ERR_OR_NULL(llite_root)) { + debugfs_remove(llite_root); + llite_root = NULL; + } + + kfree(kobj); +} + +static struct kobj_type llite_kobj_ktype = { + .release = llite_kobj_release, + .sysfs_ops = &lustre_sysfs_ops, +}; + +int llite_tunables_register(void) +{ + int rc; + + llite_kobj = kzalloc(sizeof(*llite_kobj), GFP_KERNEL); + if (!llite_kobj) + return -ENOMEM; + + llite_kobj->kset = lustre_kset; + rc = kobject_init_and_add(llite_kobj, &llite_kobj_ktype, + &lustre_kset->kobj, "%s", "llite"); + if (rc) + goto free_kobj; + + llite_root = debugfs_create_dir("llite", debugfs_lustre_root); + return 0; + +free_kobj: + kobject_put(llite_kobj); + llite_kobj = NULL; + + return rc; +} + +void llite_tunables_unregister(void) +{ + kobject_put(llite_kobj); + llite_kobj = NULL; +} + +/* /lustre/llite mount point registration */ +static const struct file_operations ll_rw_extents_stats_fops; +static const struct file_operations ll_rw_extents_stats_pp_fops; +static const struct file_operations ll_rw_offset_stats_fops; + +/** + * ll_stats_pid_write() - Determine if stats collection should be enabled + * @buf: Buffer containing the data written + * @len: Number of bytes in the buffer + * + * Several proc files begin collecting stats when a value is written, and stop + * collecting when either '0' or 'disable' is written. This function checks the + * written value to see if collection should be enabled or disabled. + * + * Return: If '0' or 'disable' is provided, 0 is returned. If the text + * equivalent of a number is written, that number is returned. Otherwise, + * 1 is returned. Non-zero return values indicate collection should be enabled. 
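+ * For example, writing "disable" or "0" yields 0, writing "1234" yields
+ * 1234, and writing any other non-numeric string yields 1.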
+ */ +static s64 ll_stats_pid_write(const char __user *buf, size_t len) +{ + unsigned long long value = 1; + char kernbuf[16]; + int rc; + + rc = kstrtoull_from_user(buf, len, 0, &value); + if (rc < 0 && len < sizeof(kernbuf)) { + if (copy_from_user(kernbuf, buf, len)) + return -EFAULT; + kernbuf[len] = 0; + + if (kernbuf[len - 1] == '\n') + kernbuf[len - 1] = 0; + + if (strncasecmp(kernbuf, "disable", 7) == 0) + value = 0; + } + + return value; +} + +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + return sprintf(buf, "%u\n", osfs.os_bsize); +} +LUSTRE_RO_ATTR(blocksize); + +static ssize_t stat_blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", sbi->ll_stat_blksize); +} + +static ssize_t stat_blocksize_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + if (val != 0 && (val < PAGE_SIZE || (val & (val - 1))) != 0) + return -ERANGE; + + sbi->ll_stat_blksize = val; + + return count; +} +LUSTRE_RW_ATTR(stat_blocksize); + +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytestotal); + +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytesfree); + +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytesavail); + +static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_files); +} +LUSTRE_RO_ATTR(filestotal); + +static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct 
ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_ffree); +} +LUSTRE_RO_ATTR(filesfree); + +static ssize_t client_type_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "local client\n"); +} +LUSTRE_RO_ATTR(client_type); + +LUSTRE_RW_ATTR(foreign_symlink_enable); + +LUSTRE_RW_ATTR(foreign_symlink_prefix); + +LUSTRE_RW_ATTR(foreign_symlink_upcall); + +LUSTRE_WO_ATTR(foreign_symlink_upcall_info); + +static ssize_t fstype_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "lustre\n"); +} +LUSTRE_RO_ATTR(fstype); + +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%s\n", sbi->ll_sb_uuid.uuid); +} +LUSTRE_RO_ATTR(uuid); + +static int ll_site_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + + /* + * See description of statistical counters in struct cl_site, and + * struct lu_site. + */ + return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), m); +} + +LDEBUGFS_SEQ_FOPS_RO(ll_site_stats); + +static ssize_t max_read_ahead_mb_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%lu\n", + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages)); +} + +static ssize_t max_read_ahead_mb_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + u64 ra_max_mb, pages_number; + int rc; + + rc = sysfs_memparse(buffer, count, &ra_max_mb, "MiB"); + if (rc) + return rc; + + pages_number = round_up(ra_max_mb, 1024 * 1024) >> PAGE_SHIFT; + CDEBUG(D_INFO, "%s: set max_read_ahead_mb=%llu (%llu pages)\n", + sbi->ll_fsname, PAGES_TO_MiB(pages_number), pages_number); + if (pages_number > cfs_totalram_pages() / 2) { + /* 1/2 of RAM */ + CERROR("%s: cannot set max_read_ahead_mb=%llu > totalram/2=%luMB\n", + sbi->ll_fsname, PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(cfs_totalram_pages() / 2)); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(max_read_ahead_mb); + +static ssize_t max_read_ahead_per_file_mb_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%lu\n", + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages_per_file)); +} + +static ssize_t max_read_ahead_per_file_mb_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + u64 ra_max_file_mb, pages_number; + int rc; + + rc = sysfs_memparse(buffer, count, &ra_max_file_mb, "MiB"); + if (rc) + return rc; + + pages_number = round_up(ra_max_file_mb, 1024 * 1024) >> PAGE_SHIFT; + if (pages_number > sbi->ll_ra_info.ra_max_pages) { + CERROR("%s: cannot set max_read_ahead_per_file_mb=%llu > max_read_ahead_mb=%lu\n", + sbi->ll_fsname, PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages)); + return -ERANGE; + } + 
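+ /* Publish the new per-file readahead limit, in pages, under ll_lock. */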
+ spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages_per_file = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(max_read_ahead_per_file_mb); + +static ssize_t max_read_ahead_whole_mb_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%lu\n", + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_read_ahead_whole_pages)); +} + +static ssize_t max_read_ahead_whole_mb_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + u64 ra_max_whole_mb, pages_number; + int rc; + + rc = sysfs_memparse(buffer, count, &ra_max_whole_mb, "MiB"); + if (rc) + return rc; + + pages_number = round_up(ra_max_whole_mb, 1024 * 1024) >> PAGE_SHIFT; + /* Cap this at the current max readahead window size, the readahead + * algorithm does this anyway so it's pointless to set it larger. + */ + if (pages_number > sbi->ll_ra_info.ra_max_pages_per_file) { + CERROR("%s: cannot set max_read_ahead_whole_mb=%llu > max_read_ahead_per_file_mb=%lu\n", + sbi->ll_fsname, PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages_per_file)); + + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(max_read_ahead_whole_mb); + +static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = sbi->ll_cache; + struct ll_ra_info *ra = &sbi->ll_ra_info; + long max_cached_mb; + long unused_mb; + + mutex_lock(&cache->ccc_max_cache_mb_lock); + max_cached_mb = PAGES_TO_MiB(cache->ccc_lru_max); + unused_mb = PAGES_TO_MiB(atomic_long_read(&cache->ccc_lru_left)); + mutex_unlock(&cache->ccc_max_cache_mb_lock); + + seq_printf(m, "users: %d\n" + "max_cached_mb: %ld\n" + "used_mb: %ld\n" + "unused_mb: %ld\n" + "reclaim_count: %u\n" + "max_read_ahead_mb: %lu\n" + "used_read_ahead_mb: %d\n", + atomic_read(&cache->ccc_users), + max_cached_mb, + max_cached_mb - unused_mb, + unused_mb, + cache->ccc_lru_shrinkers, + PAGES_TO_MiB(ra->ra_max_pages), + PAGES_TO_MiB(atomic_read(&ra->ra_cur_pages))); + return 0; +} + +static ssize_t ll_max_cached_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = sbi->ll_cache; + struct lu_env *env; + long diff = 0; + long nrpages = 0; + __u16 refcheck; + u64 pages_number; + int rc; + char kernbuf[128], *ptr; + + ENTRY; + if (count >= sizeof(kernbuf)) + RETURN(-EINVAL); + + if (copy_from_user(kernbuf, buffer, count)) + RETURN(-EFAULT); + kernbuf[count] = '\0'; + + ptr = lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count); + rc = sysfs_memparse(ptr, count, &pages_number, "MiB"); + if (rc) + RETURN(rc); + + pages_number >>= PAGE_SHIFT; + + if (pages_number < 0 || pages_number > cfs_totalram_pages()) { + CERROR("%s: can't set max cache more than %lu MB\n", + sbi->ll_fsname, + PAGES_TO_MiB(cfs_totalram_pages())); + RETURN(-ERANGE); + } + /* Allow enough cache so clients can make well-formed RPCs */ + pages_number = max_t(long, pages_number, PTLRPC_MAX_BRW_PAGES); + + 
mutex_lock(&cache->ccc_max_cache_mb_lock); + diff = pages_number - cache->ccc_lru_max; + + /* easy - add more LRU slots. */ + if (diff >= 0) { + atomic_long_add(diff, &cache->ccc_lru_left); + GOTO(out, rc = 0); + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out_unlock, rc = PTR_ERR(env)); + + diff = -diff; + while (diff > 0) { + long tmp; + + /* reduce LRU budget from free slots. */ + do { + long lru_left_old, lru_left_new, lru_left_ret; + + lru_left_old = atomic_long_read(&cache->ccc_lru_left); + if (lru_left_old == 0) + break; + + lru_left_new = lru_left_old > diff ? + lru_left_old - diff : 0; + lru_left_ret = + atomic_long_cmpxchg(&cache->ccc_lru_left, + lru_left_old, + lru_left_new); + if (likely(lru_left_old == lru_left_ret)) { + diff -= lru_left_old - lru_left_new; + nrpages += lru_left_old - lru_left_new; + break; + } + } while (1); + + if (diff <= 0) + break; + + if (sbi->ll_dt_exp == NULL) { /* being initialized */ + rc = -ENODEV; + break; + } + + /* Request extra free slots to avoid them all being used + * by other processes before this can continue shrinking. + */ + tmp = diff + min_t(long, diff, MiB_TO_PAGES(1024)); + /* difficult - have to ask OSCs to drop LRU slots. */ + rc = obd_set_info_async(env, sbi->ll_dt_exp, + sizeof(KEY_CACHE_LRU_SHRINK), + KEY_CACHE_LRU_SHRINK, + sizeof(tmp), &tmp, NULL); + if (rc < 0) + break; + } + cl_env_put(env, &refcheck); + +out: + if (rc >= 0) { + cache->ccc_lru_max = pages_number; + rc = count; + } else { + atomic_long_add(nrpages, &cache->ccc_lru_left); + } +out_unlock: + mutex_unlock(&cache->ccc_max_cache_mb_lock); + return rc; +} +LDEBUGFS_SEQ_FOPS(ll_max_cached_mb); + +static ssize_t checksums_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + test_bit(LL_SBI_CHECKSUM, sbi->ll_flags)); +} + +static ssize_t checksums_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int tmp; + int rc; + + if (!sbi->ll_dt_exp) + /* Not set up yet */ + return -EAGAIN; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + if (val) + set_bit(LL_SBI_CHECKSUM, sbi->ll_flags); + else + clear_bit(LL_SBI_CHECKSUM, sbi->ll_flags); + tmp = val; + + rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), + KEY_CHECKSUM, sizeof(tmp), &tmp, NULL); + if (rc) + CWARN("Failed to set OSC checksum flags: %d\n", rc); + + return count; +} +LUSTRE_RW_ATTR(checksums); + +LUSTRE_ATTR(checksum_pages, 0644, checksums_show, checksums_store); + +static ssize_t ll_rd_track_id(struct kobject *kobj, char *buf, + enum stats_track_type type) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + if (sbi->ll_stats_track_type == type) + return sprintf(buf, "%d\n", sbi->ll_stats_track_id); + else if (sbi->ll_stats_track_type == STATS_TRACK_ALL) + return sprintf(buf, "0 (all)\n"); + + return sprintf(buf, "untracked\n"); +} + +static ssize_t ll_wr_track_id(struct kobject *kobj, const char *buffer, + size_t count, enum stats_track_type type) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long pid; + int rc; + + rc = kstrtoul(buffer, 10, &pid); + if (rc) + return rc; + + sbi->ll_stats_track_id = pid; + if (pid == 0) + sbi->ll_stats_track_type = STATS_TRACK_ALL; + else + sbi->ll_stats_track_type = type; + 
lprocfs_clear_stats(sbi->ll_stats); + return count; +} + +static ssize_t stats_track_pid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return ll_rd_track_id(kobj, buf, STATS_TRACK_PID); +} + +static ssize_t stats_track_pid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PID); +} +LUSTRE_RW_ATTR(stats_track_pid); + +static ssize_t stats_track_ppid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return ll_rd_track_id(kobj, buf, STATS_TRACK_PPID); +} + +static ssize_t stats_track_ppid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PPID); +} +LUSTRE_RW_ATTR(stats_track_ppid); + +static ssize_t stats_track_gid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return ll_rd_track_id(kobj, buf, STATS_TRACK_GID); +} + +static ssize_t stats_track_gid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_GID); +} +LUSTRE_RW_ATTR(stats_track_gid); + +static ssize_t statahead_running_max_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_sa_running_max); +} + +static ssize_t statahead_running_max_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 0, &val); + if (rc) + return rc; + + if (val <= LL_SA_RUNNING_MAX) { + sbi->ll_sa_running_max = val; + return count; + } + + CERROR("Bad statahead_running_max value %lu. Valid values " + "are in the range [0, %d]\n", val, LL_SA_RUNNING_MAX); + + return -ERANGE; +} +LUSTRE_RW_ATTR(statahead_running_max); + +static ssize_t statahead_max_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", sbi->ll_sa_max); +} + +static ssize_t statahead_max_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 0, &val); + if (rc) + return rc; + + if (val <= LL_SA_RPC_MAX) + sbi->ll_sa_max = val; + else + CERROR("Bad statahead_max value %lu. 
Valid values are in the range [0, %d]\n", + val, LL_SA_RPC_MAX); + + return count; +} +LUSTRE_RW_ATTR(statahead_max); + +static ssize_t statahead_agl_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + test_bit(LL_SBI_AGL_ENABLED, sbi->ll_flags)); +} + +static ssize_t statahead_agl_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + if (val) + set_bit(LL_SBI_AGL_ENABLED, sbi->ll_flags); + else + clear_bit(LL_SBI_AGL_ENABLED, sbi->ll_flags); + + return count; +} +LUSTRE_RW_ATTR(statahead_agl); + +static int ll_statahead_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "statahead total: %u\n" + "statahead wrong: %u\n" + "agl total: %u\n", + atomic_read(&sbi->ll_sa_total), + atomic_read(&sbi->ll_sa_wrong), + atomic_read(&sbi->ll_agl_total)); + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(ll_statahead_stats); + +static ssize_t lazystatfs_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + test_bit(LL_SBI_LAZYSTATFS, sbi->ll_flags)); +} + +static ssize_t lazystatfs_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + if (val) + set_bit(LL_SBI_LAZYSTATFS, sbi->ll_flags); + else + clear_bit(LL_SBI_LAZYSTATFS, sbi->ll_flags); + + return count; +} +LUSTRE_RW_ATTR(lazystatfs); + +static ssize_t statfs_max_age_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_statfs_max_age); +} + +static ssize_t statfs_max_age_store(struct kobject *kobj, + struct attribute *attr, const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + if (val > OBD_STATFS_CACHE_MAX_AGE) + return -EINVAL; + + sbi->ll_statfs_max_age = val; + + return count; +} +LUSTRE_RW_ATTR(statfs_max_age); + +static ssize_t neg_dentry_timeout_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%d\n", sbi->ll_neg_dentry_timeout); +} + +static ssize_t neg_dentry_timeout_store(struct kobject *kobj, + struct attribute *attr, const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + int val; + int rc; + + rc = kstrtoint(buffer, 10, &val); + if (rc) + return rc; + if (val < OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS) + return -EINVAL; + + sbi->ll_neg_dentry_timeout = val; + + return count; +} +LUSTRE_RW_ATTR(neg_dentry_timeout); + +static ssize_t max_easize_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct 
ll_sb_info, + ll_kset.kobj); + unsigned int ealen; + int rc; + + rc = ll_get_max_mdsize(sbi, &ealen); + if (rc) + return rc; + + /* Limit xattr size returned to userspace based on kernel maximum */ + return scnprintf(buf, PAGE_SIZE, "%u\n", + ealen > XATTR_SIZE_MAX ? XATTR_SIZE_MAX : ealen); +} +LUSTRE_RO_ATTR(max_easize); + +/** + * Get default_easize. + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] kobj kobject of the sysfs entry + * \param[in] attr attribute of this kobject + * \param[in] buf output buffer + * + * \retval positive size of string written to \a buf + * \retval negative negated errno on failure + */ +static ssize_t default_easize_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int ealen; + int rc; + + rc = ll_get_default_mdsize(sbi, &ealen); + if (rc) + return rc; + + /* Limit xattr size returned to userspace based on kernel maximum */ + return scnprintf(buf, PAGE_SIZE, "%u\n", + ealen > XATTR_SIZE_MAX ? XATTR_SIZE_MAX : ealen); +} + +/** + * Set default_easize. + * + * Range checking on the passed value is handled by + * ll_set_default_mdsize(). + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] kobj kobject of the sysfs entry + * \param[in] attr attribute of this kobject + * \param[in] buffer string passed from user space + * \param[in] count \a buffer length + * + * \retval positive \a count on success + * \retval negative negated errno on failure + */ +static ssize_t default_easize_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + if (count == 0) + return 0; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + rc = ll_set_default_mdsize(sbi, val); + if (rc) + return rc; + + return count; +} +LUSTRE_RW_ATTR(default_easize); + +LDEBUGFS_SEQ_FOPS_RO(ll_sbi_flags); + +static ssize_t xattr_cache_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", sbi->ll_xattr_cache_enabled); +} + +static ssize_t xattr_cache_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + if (val && !test_bit(LL_SBI_XATTR_CACHE, sbi->ll_flags)) + return -ENOTSUPP; + + sbi->ll_xattr_cache_enabled = val; + sbi->ll_xattr_cache_set = 1; + + return count; +} +LUSTRE_RW_ATTR(xattr_cache); + +static ssize_t tiny_write_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + test_bit(LL_SBI_TINY_WRITE, sbi->ll_flags)); +} + +static ssize_t tiny_write_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + spin_lock(&sbi->ll_lock); + if (val) + set_bit(LL_SBI_TINY_WRITE, sbi->ll_flags); + else + clear_bit(LL_SBI_TINY_WRITE, sbi->ll_flags); + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(tiny_write); + +static ssize_t parallel_dio_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ +
struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", + test_bit(LL_SBI_PARALLEL_DIO, sbi->ll_flags)); +} + +static ssize_t parallel_dio_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + spin_lock(&sbi->ll_lock); + if (val) + set_bit(LL_SBI_PARALLEL_DIO, sbi->ll_flags); + else + clear_bit(LL_SBI_PARALLEL_DIO, sbi->ll_flags); + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(parallel_dio); + +static ssize_t max_read_ahead_async_active_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + sbi->ll_ra_info.ra_async_max_active); +} + +static ssize_t max_read_ahead_async_active_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + /** + * It doesn't make any sense to make it exceed what the + * workqueue could actually support. This can easily + * oversubscribe the cores but Lustre internally + * throttles to avoid those impacts. + */ + if (val > WQ_UNBOUND_MAX_ACTIVE) { + CERROR("%s: cannot set max_read_ahead_async_active=%u larger than %u\n", + sbi->ll_fsname, val, WQ_UNBOUND_MAX_ACTIVE); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_async_max_active = val; + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(max_read_ahead_async_active); + +static ssize_t read_ahead_async_file_threshold_mb_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%lu\n", PAGES_TO_MiB( + sbi->ll_ra_info.ra_async_pages_per_file_threshold)); +} + +static ssize_t +read_ahead_async_file_threshold_mb_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + unsigned long pages_number; + unsigned long max_ra_per_file; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + int rc; + + rc = kstrtoul(buffer, 10, &pages_number); + if (rc) + return rc; + + pages_number = MiB_TO_PAGES(pages_number); + max_ra_per_file = sbi->ll_ra_info.ra_max_pages_per_file; + if (pages_number < 0 || pages_number > max_ra_per_file) { + CERROR("%s: can't set read_ahead_async_file_threshold_mb=%lu > " "max_read_ahead_per_file_mb=%lu\n", sbi->ll_fsname, + PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(max_ra_per_file)); + return -ERANGE; + } + sbi->ll_ra_info.ra_async_pages_per_file_threshold = pages_number; + + return count; +} +LUSTRE_RW_ATTR(read_ahead_async_file_threshold_mb); + +static ssize_t read_ahead_range_kb_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%lu\n", + sbi->ll_ra_info.ra_range_pages << (PAGE_SHIFT - 10)); +} + +static ssize_t +read_ahead_range_kb_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + unsigned long pages_number; + unsigned long max_ra_per_file; + u64
val; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + int rc; + + rc = sysfs_memparse(buffer, count, &val, "KiB"); + if (rc < 0) + return rc; + + pages_number = val >> PAGE_SHIFT; + /* Disable mmap range read */ + if (pages_number == 0) + goto out; + + max_ra_per_file = sbi->ll_ra_info.ra_max_pages_per_file; + if (pages_number > max_ra_per_file || + pages_number < RA_MIN_MMAP_RANGE_PAGES) + return -ERANGE; + +out: + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_range_pages = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(read_ahead_range_kb); + +static ssize_t fast_read_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + test_bit(LL_SBI_FAST_READ, sbi->ll_flags)); +} + +static ssize_t fast_read_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + spin_lock(&sbi->ll_lock); + if (val) + set_bit(LL_SBI_FAST_READ, sbi->ll_flags); + else + clear_bit(LL_SBI_FAST_READ, sbi->ll_flags); + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(fast_read); + +static ssize_t file_heat_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + test_bit(LL_SBI_FILE_HEAT, sbi->ll_flags)); +} + +static ssize_t file_heat_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + spin_lock(&sbi->ll_lock); + if (val) + set_bit(LL_SBI_FILE_HEAT, sbi->ll_flags); + else + clear_bit(LL_SBI_FILE_HEAT, sbi->ll_flags); + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(file_heat); + +static ssize_t heat_decay_percentage_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + (sbi->ll_heat_decay_weight * 100 + 128) / 256); +} + +static ssize_t heat_decay_percentage_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc) + return rc; + + if (val < 0 || val > 100) + return -ERANGE; + + sbi->ll_heat_decay_weight = (val * 256 + 50) / 100; + + return count; +} +LUSTRE_RW_ATTR(heat_decay_percentage); + +static ssize_t heat_period_second_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_heat_period_second); +} + +static ssize_t heat_period_second_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc) + return rc; + + if (val <= 0) + return -ERANGE; + + 
sbi->ll_heat_period_second = val; + + return count; +} +LUSTRE_RW_ATTR(heat_period_second); + +static ssize_t opencache_threshold_count_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + if (sbi->ll_oc_thrsh_count) + return snprintf(buf, PAGE_SIZE, "%u\n", + sbi->ll_oc_thrsh_count); + else + return snprintf(buf, PAGE_SIZE, "off\n"); +} + +static ssize_t opencache_threshold_count_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) { + bool enable; + /* also accept "off" to disable and "on" to always cache */ + rc = kstrtobool(buffer, &enable); + if (rc) + return rc; + val = enable; + } + sbi->ll_oc_thrsh_count = val; + + return count; +} +LUSTRE_RW_ATTR(opencache_threshold_count); + +static ssize_t opencache_threshold_ms_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_oc_thrsh_ms); +} + +static ssize_t opencache_threshold_ms_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + sbi->ll_oc_thrsh_ms = val; + + return count; +} +LUSTRE_RW_ATTR(opencache_threshold_ms); + +static ssize_t opencache_max_ms_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_oc_max_ms); +} + +static ssize_t opencache_max_ms_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + sbi->ll_oc_max_ms = val; + + return count; +} +LUSTRE_RW_ATTR(opencache_max_ms); + +static ssize_t inode_cache_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_inode_cache_enabled); +} + +static ssize_t inode_cache_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + sbi->ll_inode_cache_enabled = val; + + return count; +} +LUSTRE_RW_ATTR(inode_cache); + +static int ll_unstable_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = sbi->ll_cache; + long pages; + int mb; + + pages = atomic_long_read(&cache->ccc_unstable_nr); + mb = (pages * PAGE_SIZE) >> 20; + + seq_printf(m, "unstable_check: %8d\n" + "unstable_pages: %12ld\n" + "unstable_mb: %8d\n", + cache->ccc_unstable_check, pages, mb); + return 0; +} + +static ssize_t ll_unstable_stats_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *unused) +{ + struct seq_file *seq 
= file->private_data; + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)seq->private); + char kernbuf[128]; + bool val; + int rc; + + if (count == 0) + return 0; + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + buffer += lprocfs_find_named_value(kernbuf, "unstable_check:", &count) - + kernbuf; + rc = kstrtobool_from_user(buffer, count, &val); + if (rc < 0) + return rc; + + /* borrow lru lock to set the value */ + spin_lock(&sbi->ll_cache->ccc_lru_lock); + sbi->ll_cache->ccc_unstable_check = val; + spin_unlock(&sbi->ll_cache->ccc_lru_lock); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ll_unstable_stats); + +static int ll_root_squash_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + + seq_printf(m, "%u:%u\n", squash->rsi_uid, squash->rsi_gid); + return 0; +} + +static ssize_t ll_root_squash_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + + return lprocfs_wr_root_squash(buffer, count, squash, sbi->ll_fsname); +} + +LDEBUGFS_SEQ_FOPS(ll_root_squash); + +static int ll_nosquash_nids_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + int len; + + spin_lock(&squash->rsi_lock); + if (!list_empty(&squash->rsi_nosquash_nids)) { + len = cfs_print_nidlist(m->buf + m->count, m->size - m->count, + &squash->rsi_nosquash_nids); + m->count += len; + seq_putc(m, '\n'); + } else { + seq_puts(m, "NONE\n"); + } + spin_unlock(&squash->rsi_lock); + + return 0; +} + +static ssize_t ll_nosquash_nids_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + int rc; + + rc = lprocfs_wr_nosquash_nids(buffer, count, squash, sbi->ll_fsname); + if (rc < 0) + return rc; + + ll_compute_rootsquash_state(sbi); + + return rc; +} + +LDEBUGFS_SEQ_FOPS(ll_nosquash_nids); + +#ifdef CONFIG_LL_ENCRYPTION +static int ll_filename_enc_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct lustre_sb_info *lsi = s2lsi(sb); + + seq_printf(m, "%u\n", lsi->lsi_flags & LSI_FILENAME_ENC ? 
1 : 0); + return 0; +} + +static ssize_t ll_filename_enc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + if (val) { + if (!ll_sbi_has_name_encrypt(sbi)) { + /* server does not support name encryption, + * so force it to NULL on client + */ + CDEBUG(D_SEC, "%s: server does not support name encryption\n", + sbi->ll_fsname); + lsi->lsi_flags &= ~LSI_FILENAME_ENC; + return -EOPNOTSUPP; + } + + lsi->lsi_flags |= LSI_FILENAME_ENC; + } else { + lsi->lsi_flags &= ~LSI_FILENAME_ENC; + } + + return count; +} + +LDEBUGFS_SEQ_FOPS(ll_filename_enc); + +static int ll_old_b64_enc_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct lustre_sb_info *lsi = s2lsi(sb); + + seq_printf(m, "%u\n", + lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI ? 1 : 0); + return 0; +} + +static ssize_t ll_old_b64_enc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + if (val) { + if (!ll_sbi_has_name_encrypt(sbi)) { + /* server does not support name encryption, + * so force it to NULL on client + */ + CDEBUG(D_SEC, + "%s: server does not support name encryption\n", + sbi->ll_fsname); + lsi->lsi_flags &= ~LSI_FILENAME_ENC_B64_OLD_CLI; + return -EOPNOTSUPP; + } + + lsi->lsi_flags |= LSI_FILENAME_ENC_B64_OLD_CLI; + } else { + lsi->lsi_flags &= ~LSI_FILENAME_ENC_B64_OLD_CLI; + } + + return count; +} + +LDEBUGFS_SEQ_FOPS(ll_old_b64_enc); +#endif /* CONFIG_LL_ENCRYPTION */ + +static int ll_pcc_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + return pcc_super_dump(&sbi->ll_pcc_super, m); +} + +static ssize_t ll_pcc_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int rc; + char *kernbuf; + + if (count >= LPROCFS_WR_PCC_MAX_CMD) + return -EINVAL; + + if (!(exp_connect_flags2(sbi->ll_md_exp) & OBD_CONNECT2_PCC)) + return -EOPNOTSUPP; + + OBD_ALLOC(kernbuf, count + 1); + if (kernbuf == NULL) + return -ENOMEM; + + if (copy_from_user(kernbuf, buffer, count)) + GOTO(out_free_kernbuff, rc = -EFAULT); + + rc = pcc_cmd_handle(kernbuf, count, &sbi->ll_pcc_super); +out_free_kernbuff: + OBD_FREE(kernbuf, count + 1); + return rc ? 
rc : count; +} +LDEBUGFS_SEQ_FOPS(ll_pcc); + +static int ll_mdll_dir_restore_max_retry_count_seq_show(struct seq_file *m, + void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%d\n", + atomic_read(&sbi->ll_dir_restore_max_retry_count)); + + return 0; +} + +static ssize_t +ll_mdll_dir_restore_max_retry_count_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = kstrtoint_from_user(buffer, count, 0, &val); + if (rc) + return rc; + + /* + * Right now there is no limitation set on the retry count. + * This is done as we dont know what the right max limit + * would be. The max value would depend on the number of + * files in the directory that is being restored and as well + * if the mdt keeps restarting. The client calls are + * interruptible and can be used to break from long retries. + */ + if (val < -1) + return -EINVAL; + + atomic_set(&sbi->ll_dir_restore_max_retry_count, val); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ll_mdll_dir_restore_max_retry_count); + +struct ldebugfs_vars lprocfs_llite_obd_vars[] = { + { .name = "site", + .fops = &ll_site_stats_fops }, + { .name = "max_cached_mb", + .fops = &ll_max_cached_mb_fops }, + { .name = "statahead_stats", + .fops = &ll_statahead_stats_fops }, + { .name = "unstable_stats", + .fops = &ll_unstable_stats_fops }, + { .name = "sbi_flags", + .fops = &ll_sbi_flags_fops }, + { .name = "root_squash", + .fops = &ll_root_squash_fops }, + { .name = "nosquash_nids", + .fops = &ll_nosquash_nids_fops }, + { .name = "pcc", + .fops = &ll_pcc_fops, }, +#ifdef CONFIG_LL_ENCRYPTION + { .name = "enable_filename_encryption", + .fops = &ll_filename_enc_fops, }, + { .name = "filename_enc_use_old_base64", + .fops = &ll_old_b64_enc_fops, }, +#endif + { .name = "mdll_dir_restore_max_retry_count", + .fops = &ll_mdll_dir_restore_max_retry_count_fops }, + { NULL } +}; + +#define MAX_STRING_SIZE 128 + +static struct attribute *llite_attrs[] = { + &lustre_attr_blocksize.attr, + &lustre_attr_stat_blocksize.attr, + &lustre_attr_kbytestotal.attr, + &lustre_attr_kbytesfree.attr, + &lustre_attr_kbytesavail.attr, + &lustre_attr_filestotal.attr, + &lustre_attr_filesfree.attr, + &lustre_attr_client_type.attr, + &lustre_attr_foreign_symlink_enable.attr, + &lustre_attr_foreign_symlink_prefix.attr, + &lustre_attr_foreign_symlink_upcall.attr, + &lustre_attr_foreign_symlink_upcall_info.attr, + &lustre_attr_fstype.attr, + &lustre_attr_uuid.attr, + &lustre_attr_checksums.attr, + &lustre_attr_checksum_pages.attr, + &lustre_attr_max_read_ahead_mb.attr, + &lustre_attr_max_read_ahead_per_file_mb.attr, + &lustre_attr_max_read_ahead_whole_mb.attr, + &lustre_attr_max_read_ahead_async_active.attr, + &lustre_attr_read_ahead_async_file_threshold_mb.attr, + &lustre_attr_read_ahead_range_kb.attr, + &lustre_attr_stats_track_pid.attr, + &lustre_attr_stats_track_ppid.attr, + &lustre_attr_stats_track_gid.attr, + &lustre_attr_statahead_running_max.attr, + &lustre_attr_statahead_max.attr, + &lustre_attr_statahead_agl.attr, + &lustre_attr_lazystatfs.attr, + &lustre_attr_statfs_max_age.attr, + &lustre_attr_max_easize.attr, + &lustre_attr_default_easize.attr, + &lustre_attr_xattr_cache.attr, + &lustre_attr_fast_read.attr, + &lustre_attr_tiny_write.attr, + &lustre_attr_neg_dentry_timeout.attr, + &lustre_attr_parallel_dio.attr, + &lustre_attr_file_heat.attr, + 
&lustre_attr_heat_decay_percentage.attr, + &lustre_attr_heat_period_second.attr, + &lustre_attr_opencache_threshold_count.attr, + &lustre_attr_opencache_threshold_ms.attr, + &lustre_attr_opencache_max_ms.attr, + &lustre_attr_inode_cache.attr, + NULL, +}; + +KOBJ_ATTRIBUTE_GROUPS(llite); /* creates llite_groups */ + +static void sbi_kobj_release(struct kobject *kobj) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + complete(&sbi->ll_kobj_unregister); +} + +static struct kobj_type sbi_ktype = { + .default_groups = KOBJ_ATTR_GROUPS(llite), + .sysfs_ops = &lustre_sysfs_ops, + .release = sbi_kobj_release, +}; + +static const struct llite_file_opcode { + __u32 opcode; + __u32 type; + const char *opname; +} llite_opcode_table[LPROC_LL_FILE_OPCODES] = { + /* file operation */ + { LPROC_LL_READ_BYTES, LPROCFS_TYPE_BYTES_FULL, "read_bytes" }, + { LPROC_LL_WRITE_BYTES, LPROCFS_TYPE_BYTES_FULL, "write_bytes" }, + { LPROC_LL_READ, LPROCFS_TYPE_LATENCY, "read" }, + { LPROC_LL_WRITE, LPROCFS_TYPE_LATENCY, "write" }, + { LPROC_LL_IOCTL, LPROCFS_TYPE_REQS, "ioctl" }, + { LPROC_LL_OPEN, LPROCFS_TYPE_LATENCY, "open" }, + { LPROC_LL_RELEASE, LPROCFS_TYPE_LATENCY, "close" }, + { LPROC_LL_MMAP, LPROCFS_TYPE_LATENCY, "mmap" }, + { LPROC_LL_FAULT, LPROCFS_TYPE_LATENCY, "page_fault" }, + { LPROC_LL_MKWRITE, LPROCFS_TYPE_LATENCY, "page_mkwrite" }, + { LPROC_LL_LLSEEK, LPROCFS_TYPE_LATENCY, "seek" }, + { LPROC_LL_FSYNC, LPROCFS_TYPE_LATENCY, "fsync" }, + { LPROC_LL_READDIR, LPROCFS_TYPE_LATENCY, "readdir" }, + { LPROC_LL_INODE_OCOUNT,LPROCFS_TYPE_REQS | + LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV, "opencount" }, + { LPROC_LL_INODE_OPCLTM,LPROCFS_TYPE_LATENCY, "openclosetime" }, + /* inode operation */ + { LPROC_LL_SETATTR, LPROCFS_TYPE_LATENCY, "setattr" }, + { LPROC_LL_TRUNC, LPROCFS_TYPE_LATENCY, "truncate" }, + { LPROC_LL_FLOCK, LPROCFS_TYPE_LATENCY, "flock" }, + { LPROC_LL_GETATTR, LPROCFS_TYPE_LATENCY, "getattr" }, + { LPROC_LL_FALLOCATE, LPROCFS_TYPE_LATENCY, "fallocate"}, + /* dir inode operation */ + { LPROC_LL_CREATE, LPROCFS_TYPE_LATENCY, "create" }, + { LPROC_LL_LINK, LPROCFS_TYPE_LATENCY, "link" }, + { LPROC_LL_UNLINK, LPROCFS_TYPE_LATENCY, "unlink" }, + { LPROC_LL_SYMLINK, LPROCFS_TYPE_LATENCY, "symlink" }, + { LPROC_LL_MKDIR, LPROCFS_TYPE_LATENCY, "mkdir" }, + { LPROC_LL_RMDIR, LPROCFS_TYPE_LATENCY, "rmdir" }, + { LPROC_LL_MKNOD, LPROCFS_TYPE_LATENCY, "mknod" }, + { LPROC_LL_RENAME, LPROCFS_TYPE_LATENCY, "rename" }, + /* special inode operation */ + { LPROC_LL_STATFS, LPROCFS_TYPE_LATENCY, "statfs" }, + { LPROC_LL_SETXATTR, LPROCFS_TYPE_LATENCY, "setxattr" }, + { LPROC_LL_GETXATTR, LPROCFS_TYPE_LATENCY, "getxattr" }, + { LPROC_LL_GETXATTR_HITS, LPROCFS_TYPE_REQS, "getxattr_hits" }, + { LPROC_LL_LISTXATTR, LPROCFS_TYPE_LATENCY, "listxattr" }, + { LPROC_LL_REMOVEXATTR, LPROCFS_TYPE_LATENCY, "removexattr" }, + { LPROC_LL_INODE_PERM, LPROCFS_TYPE_LATENCY, "inode_permission" }, +}; + +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, long count) +{ + if (!sbi->ll_stats) + return; + + if (sbi->ll_stats_track_type == STATS_TRACK_ALL) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_PID && + sbi->ll_stats_track_id == current->pid) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_PPID && + sbi->ll_stats_track_id == current->real_parent->pid) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_GID && + 
sbi->ll_stats_track_id == + from_kgid(&init_user_ns, current_gid())) + lprocfs_counter_add(sbi->ll_stats, op, count); +} +EXPORT_SYMBOL(ll_stats_ops_tally); + +static const char *const ra_stat_string[] = { + [RA_STAT_HIT] = "hits", + [RA_STAT_MISS] = "misses", + [RA_STAT_DISTANT_READPAGE] = "readpage_not_consecutive", + [RA_STAT_MISS_IN_WINDOW] = "miss_inside_window", + [RA_STAT_FAILED_GRAB_PAGE] = "failed_grab_cache_page", + [RA_STAT_FAILED_MATCH] = "failed_lock_match", + [RA_STAT_DISCARDED] = "read_but_discarded", + [RA_STAT_ZERO_LEN] = "zero_length_file", + [RA_STAT_ZERO_WINDOW] = "zero_size_window", + [RA_STAT_EOF] = "readahead_to_eof", + [RA_STAT_MAX_IN_FLIGHT] = "hit_max_readahead_issue", + [RA_STAT_WRONG_GRAB_PAGE] = "wrong_page_from_grab_cache_page", + [RA_STAT_FAILED_REACH_END] = "failed_to_reach_end", + [RA_STAT_ASYNC] = "async_readahead", + [RA_STAT_FAILED_FAST_READ] = "failed_to_fast_read", + [RA_STAT_MMAP_RANGE_READ] = "mmap_range_read", +}; + +int ll_debugfs_register_super(struct super_block *sb, const char *name) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + int err, id; + + ENTRY; + LASSERT(sbi); + + if (IS_ERR_OR_NULL(llite_root)) + goto out_ll_kset; + + sbi->ll_debugfs_entry = debugfs_create_dir(name, llite_root); + ldebugfs_add_vars(sbi->ll_debugfs_entry, lprocfs_llite_obd_vars, sb); + + debugfs_create_file("dump_page_cache", 0444, sbi->ll_debugfs_entry, sbi, + &vvp_dump_pgcache_file_ops); + + debugfs_create_file("extents_stats", 0644, sbi->ll_debugfs_entry, sbi, + &ll_rw_extents_stats_fops); + + debugfs_create_file("extents_stats_per_process", 0644, + sbi->ll_debugfs_entry, sbi, + &ll_rw_extents_stats_pp_fops); + + debugfs_create_file("offset_stats", 0644, sbi->ll_debugfs_entry, sbi, + &ll_rw_offset_stats_fops); + + /* File operations stats */ + sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES, + LPROCFS_STATS_FLAG_NONE); + if (sbi->ll_stats == NULL) + GOTO(out_debugfs, err = -ENOMEM); + + /* do counter init */ + for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) { + u32 type = llite_opcode_table[id].type; + void *ptr = "unknown"; + + if (type & LPROCFS_TYPE_REQS) + ptr = "reqs"; + else if (type & LPROCFS_TYPE_BYTES) + ptr = "bytes"; + else if (type & LPROCFS_TYPE_USEC) + ptr = "usec"; + lprocfs_counter_init(sbi->ll_stats, + llite_opcode_table[id].opcode, type, + llite_opcode_table[id].opname, ptr); + } + + debugfs_create_file("stats", 0644, sbi->ll_debugfs_entry, + sbi->ll_stats, &ldebugfs_stats_seq_fops); + + sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string), + LPROCFS_STATS_FLAG_NONE); + if (sbi->ll_ra_stats == NULL) + GOTO(out_stats, err = -ENOMEM); + + for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++) + lprocfs_counter_init(sbi->ll_ra_stats, id, 0, + ra_stat_string[id], "pages"); + + debugfs_create_file("read_ahead_stats", 0644, sbi->ll_debugfs_entry, + sbi->ll_ra_stats, &ldebugfs_stats_seq_fops); + +out_ll_kset: + /* Yes we also register sysfs mount kset here as well */ + sbi->ll_kset.kobj.parent = llite_kobj; + sbi->ll_kset.kobj.ktype = &sbi_ktype; + init_completion(&sbi->ll_kobj_unregister); + err = kobject_set_name(&sbi->ll_kset.kobj, "%s", name); + if (err) + GOTO(out_ra_stats, err); + + err = kset_register(&sbi->ll_kset); + if (err) + GOTO(out_ra_stats, err); + + lsi->lsi_kobj = kobject_get(&sbi->ll_kset.kobj); + + RETURN(0); +out_ra_stats: + lprocfs_free_stats(&sbi->ll_ra_stats); +out_stats: + lprocfs_free_stats(&sbi->ll_stats); +out_debugfs: + debugfs_remove_recursive(sbi->ll_debugfs_entry); + 
+ RETURN(err); +} + +void ll_debugfs_unregister_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + + debugfs_remove_recursive(sbi->ll_debugfs_entry); + + if (sbi->ll_dt_obd) + sysfs_remove_link(&sbi->ll_kset.kobj, + sbi->ll_dt_obd->obd_type->typ_name); + + if (sbi->ll_md_obd) + sysfs_remove_link(&sbi->ll_kset.kobj, + sbi->ll_md_obd->obd_type->typ_name); + + kobject_put(lsi->lsi_kobj); + + kset_unregister(&sbi->ll_kset); + wait_for_completion(&sbi->ll_kobj_unregister); + + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); +} +#undef MAX_STRING_SIZE + +static void ll_display_extents_info(struct ll_rw_extents_info *rw_extents, + struct seq_file *seq, int which) +{ + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + unsigned long start, end, r, w; + char *unitp = "KMGTPEZY"; + int i, units = 10; + struct per_process_info *pp_info; + + pp_info = &rw_extents->pp_extents[which]; + read_cum = 0; + write_cum = 0; + start = 0; + + for (i = 0; i < LL_HIST_MAX; i++) { + read_tot += pp_info->pp_r_hist.oh_buckets[i]; + write_tot += pp_info->pp_w_hist.oh_buckets[i]; + } + + for (i = 0; i < LL_HIST_MAX; i++) { + r = pp_info->pp_r_hist.oh_buckets[i]; + w = pp_info->pp_w_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + end = 1 << (i + LL_HIST_START - units); + seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4u %4u | " + "%14lu %4u %4u\n", start, *unitp, end, *unitp, + (i == LL_HIST_MAX - 1) ? '+' : ' ', + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + start = end; + if (start == (1 << 10)) { + start = 1; + units += 10; + unitp++; + } + if (read_cum == read_tot && write_cum == write_tot) + break; + } +} + +static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v) +{ + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *rw_extents = sbi->ll_rw_extents_info; + int k; + + if (!sbi->ll_rw_stats_on || !rw_extents) { + seq_puts(seq, "disabled\n write anything to this file to activate, then '0' or 'disable' to deactivate\n"); + return 0; + } + + spin_lock(&sbi->ll_pp_extent_lock); + lprocfs_stats_header(seq, ktime_get_real(), rw_extents->pp_init, 25, + ":", true, ""); + seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); + seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", + "extents", "calls", "%", "cum%", "calls", "%", "cum%"); + + for (k = 0; k < LL_PROCESS_HIST_MAX; k++) { + if (rw_extents->pp_extents[k].pid != 0) { + seq_printf(seq, "\nPID: %d\n", + rw_extents->pp_extents[k].pid); + ll_display_extents_info(rw_extents, seq, k); + } + } + spin_unlock(&sbi->ll_pp_extent_lock); + return 0; +} + +static int alloc_rw_stats_info(struct ll_sb_info *sbi) +{ + struct ll_rw_extents_info *rw_extents; + struct ll_rw_process_info *offset; + struct ll_rw_process_info *process; + int i, rc = 0; + + OBD_ALLOC(rw_extents, sizeof(*rw_extents)); + if (!rw_extents) + return -ENOMEM; + + for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { + spin_lock_init(&rw_extents->pp_extents[i].pp_r_hist.oh_lock); + spin_lock_init(&rw_extents->pp_extents[i].pp_w_hist.oh_lock); + } + rw_extents->pp_init = ktime_get_real(); + + spin_lock(&sbi->ll_pp_extent_lock); + if (!sbi->ll_rw_extents_info) + sbi->ll_rw_extents_info = rw_extents; + spin_unlock(&sbi->ll_pp_extent_lock); + /* another writer allocated the struct before we got the lock */ + if (sbi->ll_rw_extents_info != rw_extents) + OBD_FREE(rw_extents, sizeof(*rw_extents)); + + 
OBD_ALLOC(process, sizeof(*process) * LL_PROCESS_HIST_MAX); + if (!process) + GOTO(out, rc = -ENOMEM); + OBD_ALLOC(offset, sizeof(*offset) * LL_OFFSET_HIST_MAX); + if (!offset) + GOTO(out_free, rc = -ENOMEM); + + spin_lock(&sbi->ll_process_lock); + if (!sbi->ll_rw_process_info) + sbi->ll_rw_process_info = process; + if (!sbi->ll_rw_offset_info) + sbi->ll_rw_offset_info = offset; + spin_unlock(&sbi->ll_process_lock); + sbi->ll_process_stats_init = ktime_get_real(); + + /* another writer allocated the structs before we got the lock */ + if (sbi->ll_rw_offset_info != offset) + OBD_FREE(offset, sizeof(*offset) * LL_OFFSET_HIST_MAX); + if (sbi->ll_rw_process_info != process) { +out_free: + OBD_FREE(process, sizeof(*process) * LL_PROCESS_HIST_MAX); + } + +out: + return rc; +} + +void ll_free_rw_stats_info(struct ll_sb_info *sbi) +{ + if (sbi->ll_rw_extents_info) { + OBD_FREE(sbi->ll_rw_extents_info, + sizeof(*sbi->ll_rw_extents_info)); + sbi->ll_rw_extents_info = NULL; + } + if (sbi->ll_rw_offset_info) { + OBD_FREE(sbi->ll_rw_offset_info, + sizeof(*sbi->ll_rw_offset_info) * LL_OFFSET_HIST_MAX); + sbi->ll_rw_offset_info = NULL; + } + if (sbi->ll_rw_process_info) { + OBD_FREE(sbi->ll_rw_process_info, + sizeof(*sbi->ll_rw_process_info) * LL_PROCESS_HIST_MAX); + sbi->ll_rw_process_info = NULL; + } +} + +static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *rw_extents; + int i; + __s64 value; + + if (len == 0) + return -EINVAL; + + value = ll_stats_pid_write(buf, len); + + if (value == 0) { + sbi->ll_rw_stats_on = 0; + } else { + if (!sbi->ll_rw_extents_info) { + int rc = alloc_rw_stats_info(sbi); + + if (rc) + return rc; + } + sbi->ll_rw_stats_on = 1; + } + + + spin_lock(&sbi->ll_pp_extent_lock); + rw_extents = sbi->ll_rw_extents_info; + if (rw_extents) { + rw_extents->pp_init = ktime_get_real(); + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + rw_extents->pp_extents[i].pid = 0; + lprocfs_oh_clear(&rw_extents->pp_extents[i].pp_r_hist); + lprocfs_oh_clear(&rw_extents->pp_extents[i].pp_w_hist); + } + } + spin_unlock(&sbi->ll_pp_extent_lock); + + return len; +} + +LDEBUGFS_SEQ_FOPS(ll_rw_extents_stats_pp); + +static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v) +{ + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *rw_extents = sbi->ll_rw_extents_info; + + if (!sbi->ll_rw_stats_on || !rw_extents) { + seq_puts(seq, "disabled\n write anything to this file to activate, then '0' or 'disable' to deactivate\n"); + return 0; + } + + spin_lock(&sbi->ll_lock); + lprocfs_stats_header(seq, ktime_get_real(), rw_extents->pp_init, 25, + ":", true, ""); + + seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); + seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", + "extents", "calls", "%", "cum%", + "calls", "%", "cum%"); + + ll_display_extents_info(rw_extents, seq, LL_PROCESS_HIST_MAX); + spin_unlock(&sbi->ll_lock); + + return 0; +} + +static ssize_t ll_rw_extents_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *rw_extents; + int i; + __s64 value; + + if (len == 0) + return -EINVAL; + + value = ll_stats_pid_write(buf, len); + + if (value == 0) { + sbi->ll_rw_stats_on = 0; + } else { + if (!sbi->ll_rw_extents_info) { + int rc = 
alloc_rw_stats_info(sbi); + + if (rc) + return rc; + } + sbi->ll_rw_stats_on = 1; + } + + spin_lock(&sbi->ll_pp_extent_lock); + rw_extents = sbi->ll_rw_extents_info; + if (rw_extents) { + rw_extents->pp_init = ktime_get_real(); + for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { + rw_extents->pp_extents[i].pid = 0; + lprocfs_oh_clear(&rw_extents->pp_extents[i].pp_r_hist); + lprocfs_oh_clear(&rw_extents->pp_extents[i].pp_w_hist); + } + } + spin_unlock(&sbi->ll_pp_extent_lock); + + return len; +} + +LDEBUGFS_SEQ_FOPS(ll_rw_extents_stats); + +void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw) +{ + int i, cur = -1; + struct ll_rw_process_info *process; + struct ll_rw_process_info *offset; + int *off_count = &sbi->ll_rw_offset_entry_count; + int *process_count = &sbi->ll_offset_process_count; + struct ll_rw_extents_info *rw_extents; + + if (!sbi->ll_rw_stats_on) + return; + + spin_lock(&sbi->ll_pp_extent_lock); + rw_extents = sbi->ll_rw_extents_info; + if (!rw_extents) { + spin_unlock(&sbi->ll_pp_extent_lock); + return; + } + + /* Extent statistics */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (rw_extents->pp_extents[i].pid == pid) { + cur = i; + break; + } + } + + if (cur == -1) { + /* new process */ + sbi->ll_extent_process_count = + (sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX; + cur = sbi->ll_extent_process_count; + rw_extents->pp_extents[cur].pid = pid; + lprocfs_oh_clear(&rw_extents->pp_extents[cur].pp_r_hist); + lprocfs_oh_clear(&rw_extents->pp_extents[cur].pp_w_hist); + } + + for (i = 0; (count >= 1 << (LL_HIST_START + i)) && + (i < (LL_HIST_MAX - 1)); i++); + if (rw == 0) { + rw_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; + rw_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; + } else { + rw_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; + rw_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; + } + spin_unlock(&sbi->ll_pp_extent_lock); + + spin_lock(&sbi->ll_process_lock); + process = sbi->ll_rw_process_info; + offset = sbi->ll_rw_offset_info; + if (!process || !offset) + goto out_unlock; + + /* Offset statistics */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (process[i].rw_pid == pid) { + if (process[i].rw_last_file != file) { + process[i].rw_range_start = pos; + process[i].rw_last_file_pos = pos + count; + process[i].rw_smallest_extent = count; + process[i].rw_largest_extent = count; + process[i].rw_offset = 0; + process[i].rw_last_file = file; + goto out_unlock; + } + if (process[i].rw_last_file_pos != pos) { + *off_count = + (*off_count + 1) % LL_OFFSET_HIST_MAX; + offset[*off_count].rw_op = process[i].rw_op; + offset[*off_count].rw_pid = pid; + offset[*off_count].rw_range_start = + process[i].rw_range_start; + offset[*off_count].rw_range_end = + process[i].rw_last_file_pos; + offset[*off_count].rw_smallest_extent = + process[i].rw_smallest_extent; + offset[*off_count].rw_largest_extent = + process[i].rw_largest_extent; + offset[*off_count].rw_offset = + process[i].rw_offset; + process[i].rw_op = rw; + process[i].rw_range_start = pos; + process[i].rw_smallest_extent = count; + process[i].rw_largest_extent = count; + process[i].rw_offset = pos - + process[i].rw_last_file_pos; + } + if (process[i].rw_smallest_extent > count) + process[i].rw_smallest_extent = count; + if (process[i].rw_largest_extent < count) + process[i].rw_largest_extent = count; + process[i].rw_last_file_pos = pos + count; + goto out_unlock; + } + } + *process_count = 
(*process_count + 1) % LL_PROCESS_HIST_MAX; + process[*process_count].rw_pid = pid; + process[*process_count].rw_op = rw; + process[*process_count].rw_range_start = pos; + process[*process_count].rw_last_file_pos = pos + count; + process[*process_count].rw_smallest_extent = count; + process[*process_count].rw_largest_extent = count; + process[*process_count].rw_offset = 0; + process[*process_count].rw_last_file = file; + +out_unlock: + spin_unlock(&sbi->ll_process_lock); +} + +static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) +{ + struct ll_sb_info *sbi = seq->private; + struct ll_rw_process_info *offset; + struct ll_rw_process_info *process; + int i; + + if (!sbi->ll_rw_stats_on) { + seq_puts(seq, "disabled\n write anything to this file to activate, then '0' or 'disable' to deactivate\n"); + return 0; + } + + spin_lock(&sbi->ll_process_lock); + lprocfs_stats_header(seq, ktime_get_real(), sbi->ll_process_stats_init, + 25, ":", true, ""); + seq_printf(seq, "%3s %10s %14s %14s %17s %17s %14s\n", + "R/W", "PID", "RANGE START", "RANGE END", + "SMALLEST EXTENT", "LARGEST EXTENT", "OFFSET"); + + /* We stored the discontiguous offsets here; print them first */ + offset = sbi->ll_rw_offset_info; + for (i = 0; offset && i < LL_OFFSET_HIST_MAX; i++) { + if (offset[i].rw_pid != 0) + seq_printf(seq, + "%3c %10d %14llu %14llu %17lu %17lu %14lld\n", + offset[i].rw_op == READ ? 'R' : 'W', + offset[i].rw_pid, + offset[i].rw_range_start, + offset[i].rw_range_end, + (unsigned long)offset[i].rw_smallest_extent, + (unsigned long)offset[i].rw_largest_extent, + offset[i].rw_offset); + } + + /* Then print the current offsets for each process */ + process = sbi->ll_rw_process_info; + for (i = 0; process && i < LL_PROCESS_HIST_MAX; i++) { + if (process[i].rw_pid != 0) + seq_printf(seq, + "%3c %10d %14llu %14llu %17lu %17lu %14lld\n", + process[i].rw_op == READ ? 'R' : 'W', + process[i].rw_pid, + process[i].rw_range_start, + process[i].rw_last_file_pos, + (unsigned long)process[i].rw_smallest_extent, + (unsigned long)process[i].rw_largest_extent, + process[i].rw_offset); + } + spin_unlock(&sbi->ll_process_lock); + + return 0; +} + +static ssize_t ll_rw_offset_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + __s64 value; + + if (len == 0) + return -EINVAL; + + value = ll_stats_pid_write(buf, len); + + if (value == 0) { + sbi->ll_rw_stats_on = 0; + } else { + if (!sbi->ll_rw_process_info || !sbi->ll_rw_offset_info) { + int rc = alloc_rw_stats_info(sbi); + + if (rc) + return rc; + } + sbi->ll_rw_stats_on = 1; + } + + spin_lock(&sbi->ll_process_lock); + sbi->ll_offset_process_count = 0; + sbi->ll_rw_offset_entry_count = 0; + sbi->ll_process_stats_init = ktime_get_real(); + if (sbi->ll_rw_process_info) + memset(sbi->ll_rw_process_info, 0, + sizeof(struct ll_rw_process_info) * LL_PROCESS_HIST_MAX); + if (sbi->ll_rw_offset_info) + memset(sbi->ll_rw_offset_info, 0, + sizeof(struct ll_rw_process_info) * LL_OFFSET_HIST_MAX); + spin_unlock(&sbi->ll_process_lock); + + return len; +} + +LDEBUGFS_SEQ_FOPS(ll_rw_offset_stats); diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c new file mode 100644 index 0000000000000..5af9ab7477a93 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -0,0 +1,2222 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include "llite_internal.h" + +#ifndef HAVE_USER_NAMESPACE_ARG +#define ll_create_nd(ns, dir, de, mode, ex) ll_create_nd(dir, de, mode, ex) +#define ll_mkdir(ns, dir, dch, mode) ll_mkdir(dir, dch, mode) +#define ll_mknod(ns, dir, dch, mode, rd) ll_mknod(dir, dch, mode, rd) +#ifdef HAVE_IOPS_RENAME_WITH_FLAGS +#define ll_rename(ns, src, sdc, tgt, tdc, fl) ll_rename(src, sdc, tgt, tdc, fl) +#else +#define ll_rename(ns, src, sdc, tgt, tdc) ll_rename(src, sdc, tgt, tdc) +#endif /* HAVE_IOPS_RENAME_WITH_FLAGS */ +#define ll_symlink(nd, dir, dch, old) ll_symlink(dir, dch, old) +#endif + +static int ll_create_it(struct inode *dir, struct dentry *dentry, + struct lookup_intent *it, + void *secctx, __u32 secctxlen, bool encrypt, + void *encctx, __u32 encctxlen, unsigned int open_flags); + +/* called from iget5_locked->find_inode() under inode_lock spinlock */ +static int ll_test_inode(struct inode *inode, void *opaque) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lustre_md *md = opaque; + + if (unlikely(!(md->body->mbo_valid & OBD_MD_FLID))) { + CERROR("MDS body missing FID\n"); + return 0; + } + + if (!lu_fid_eq(&lli->lli_fid, &md->body->mbo_fid1)) + return 0; + + return 1; +} + +static int ll_set_inode(struct inode *inode, void *opaque) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body = ((struct lustre_md *)opaque)->body; + + if (unlikely(!(body->mbo_valid & OBD_MD_FLID))) { + CERROR("MDS body missing FID\n"); + return -EINVAL; + } + + lli->lli_fid = body->mbo_fid1; + if (unlikely(!(body->mbo_valid & OBD_MD_FLTYPE))) { + CERROR("Can not initialize inode "DFID" without object type: " + "valid = %#llx\n", + PFID(&lli->lli_fid), body->mbo_valid); + return -EINVAL; + } + + inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mbo_mode & S_IFMT); + if (unlikely(inode->i_mode == 0)) { + CERROR("Invalid inode "DFID" type\n", PFID(&lli->lli_fid)); + return -EINVAL; + } + + ll_lli_init(lli); + + return 0; +} + + +/** + * Get an inode by inode number(@hash), which is already instantiated by + * the intent lookup). 
+ */ +struct inode *ll_iget(struct super_block *sb, ino_t hash, + struct lustre_md *md) +{ + struct inode *inode; + int rc = 0; + + ENTRY; + + LASSERT(hash != 0); + inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md); + if (inode == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + if (inode->i_state & I_NEW) { + rc = ll_read_inode2(inode, md); + if (rc == 0 && S_ISREG(inode->i_mode) && + ll_i2info(inode)->lli_clob == NULL) + rc = cl_file_inode_init(inode, md); + + if (rc != 0) { + /* Let's clear directory lsm here, otherwise + * make_bad_inode() will reset the inode mode + * to regular, then ll_clear_inode will not + * be able to clear lsm_md */ + if (S_ISDIR(inode->i_mode)) + ll_dir_clear_lsm_md(inode); + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + inode = ERR_PTR(rc); + } else { + inode_has_no_xattr(inode); + unlock_new_inode(inode); + } + } else if (is_bad_inode(inode)) { + iput(inode); + inode = ERR_PTR(-ESTALE); + } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) { + rc = ll_update_inode(inode, md); + CDEBUG(D_VFSTRACE, "got inode: "DFID"(%p): rc = %d\n", + PFID(&md->body->mbo_fid1), inode, rc); + if (rc != 0) { + if (S_ISDIR(inode->i_mode)) + ll_dir_clear_lsm_md(inode); + iput(inode); + inode = ERR_PTR(rc); + } + } + + RETURN(inode); +} + +/* mark negative sub file dentries invalid and prune unused dentries */ +static void ll_prune_negative_children(struct inode *dir) +{ + struct dentry *dentry; + struct dentry *child; + + ENTRY; + +restart: + spin_lock(&dir->i_lock); + hlist_for_each_entry(dentry, &dir->i_dentry, d_alias) { + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_child) { + if (child->d_inode) + continue; + + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + if (lld_is_init(child)) + ll_d2d(child)->lld_invalid = 1; + if (!ll_d_count(child)) { + dget_dlock(child); + __d_drop(child); + spin_unlock(&child->d_lock); + spin_unlock(&dentry->d_lock); + spin_unlock(&dir->i_lock); + + CDEBUG(D_DENTRY, "prune negative dentry %pd\n", + child); + + dput(child); + goto restart; + } + spin_unlock(&child->d_lock); + } + spin_unlock(&dentry->d_lock); + } + spin_unlock(&dir->i_lock); + + EXIT; +} + +int ll_test_inode_by_fid(struct inode *inode, void *opaque) +{ + return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque); +} + +static int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock) +{ + struct lu_env *env; + struct ll_inode_info *lli = ll_i2info(inode); + __u16 refcheck; + int rc; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_REPLAY_PAUSE, cfs_fail_val); + + /* reach MDC layer to flush data under the DoM ldlm lock */ + rc = cl_object_flush(env, lli->lli_clob, lock); + if (rc == -ENODATA) { + CDEBUG(D_INODE, "inode "DFID" layout has no DoM stripe\n", + PFID(ll_inode2fid(inode))); + /* most likely result of layout change, do nothing */ + rc = 0; + } + + cl_env_put(env, &refcheck); + RETURN(rc); +} + +static void ll_lock_cancel_bits(struct ldlm_lock *lock, __u64 to_cancel) +{ + struct inode *inode = ll_inode_from_resource_lock(lock); + struct ll_inode_info *lli; + __u64 bits = to_cancel; + int rc; + + ENTRY; + + if (!inode) { + /* That means the inode is evicted most likely and may cause + * the skipping of lock cleanups below, so print the message + * about that in log. + */ + if (lock->l_resource->lr_lvb_inode) + LDLM_DEBUG(lock, + "can't take inode for the lock (%sevicted)\n", + lock->l_resource->lr_lvb_inode->i_state & + I_FREEING ? 
"" : "not "); + RETURN_EXIT; + } + + if (!fid_res_name_eq(ll_inode2fid(inode), + &lock->l_resource->lr_name)) { + LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)", + PFID(ll_inode2fid(inode)), inode); + LBUG(); + } + + if (bits & MDS_INODELOCK_XATTR) { + ll_xattr_cache_empty(inode); + bits &= ~MDS_INODELOCK_XATTR; + } + + /* For OPEN locks we differentiate between lock modes + * LCK_CR, LCK_CW, LCK_PR - bug 22891 */ + if (bits & MDS_INODELOCK_OPEN) + ll_have_md_lock(inode, &bits, lock->l_req_mode); + + if (bits & MDS_INODELOCK_OPEN) { + fmode_t fmode; + + switch (lock->l_req_mode) { + case LCK_CW: + fmode = FMODE_WRITE; + break; + case LCK_PR: + fmode = FMODE_EXEC; + break; + case LCK_CR: + fmode = FMODE_READ; + break; + default: + LDLM_ERROR(lock, "bad lock mode for OPEN lock"); + LBUG(); + } + + ll_md_real_close(inode, fmode); + + bits &= ~MDS_INODELOCK_OPEN; + } + + if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM | + MDS_INODELOCK_DOM)) + ll_have_md_lock(inode, &bits, LCK_MINMODE); + + if (bits & MDS_INODELOCK_DOM) { + rc = ll_dom_lock_cancel(inode, lock); + if (rc < 0) + CDEBUG(D_INODE, "cannot flush DoM data " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } + + if (bits & MDS_INODELOCK_LAYOUT) { + struct cl_object_conf conf = { + .coc_opc = OBJECT_CONF_INVALIDATE, + .coc_inode = inode, + }; + + rc = ll_layout_conf(inode, &conf); + if (rc < 0) + CDEBUG(D_INODE, "cannot invalidate layout of " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } + + lli = ll_i2info(inode); + + if (bits & MDS_INODELOCK_UPDATE) + set_bit(LLIF_UPDATE_ATIME, &lli->lli_flags); + + if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { + CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, " + "pfid = "DFID"\n", PFID(ll_inode2fid(inode)), + lli, PFID(&lli->lli_pfid)); + truncate_inode_pages(inode->i_mapping, 0); + + if (unlikely(!fid_is_zero(&lli->lli_pfid))) { + struct inode *master_inode = NULL; + unsigned long hash; + + /* This is slave inode, since all of the child dentry + * is connected on the master inode, so we have to + * invalidate the negative children on master inode */ + CDEBUG(D_INODE, "Invalidate s"DFID" m"DFID"\n", + PFID(ll_inode2fid(inode)), PFID(&lli->lli_pfid)); + + hash = cl_fid_build_ino(&lli->lli_pfid, + ll_need_32bit_api(ll_i2sbi(inode))); + + /* Do not lookup the inode with ilookup5, otherwise + * it will cause dead lock, + * 1. Client1 send chmod req to the MDT0, then on MDT0, + * it enqueues master and all of its slaves lock, + * (mdt_attr_set() -> mdt_lock_slaves()), after gets + * master and stripe0 lock, it will send the enqueue + * req (for stripe1) to MDT1, then MDT1 finds the lock + * has been granted to client2. Then MDT1 sends blocking + * ast to client2. + * 2. At the same time, client2 tries to unlink + * the striped dir (rm -rf striped_dir), and during + * lookup, it will hold the master inode of the striped + * directory, whose inode state is NEW, then tries to + * revalidate all of its slaves, (ll_prep_inode()-> + * ll_iget()->ll_read_inode2()-> ll_update_inode().). + * And it will be blocked on the server side because + * of 1. + * 3. Then the client get the blocking_ast req, cancel + * the lock, but being blocked if using ->ilookup5()), + * because master inode state is NEW. 
*/ + master_inode = ilookup5_nowait(inode->i_sb, hash, + ll_test_inode_by_fid, + (void *)&lli->lli_pfid); + if (master_inode) { + ll_prune_negative_children(master_inode); + iput(master_inode); + } + } else { + ll_prune_negative_children(inode); + } + } + + /* at umount s_root becomes NULL */ + if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && + inode->i_sb->s_root && !is_root_inode(inode)) + ll_prune_aliases(inode); + + if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) + forget_all_cached_acls(inode); + + iput(inode); + RETURN_EXIT; +} + +/* Check if the given lock may be downgraded instead of canceling and + * that convert is really needed. */ +int ll_md_need_convert(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + struct inode *inode; + __u64 wanted = lock->l_policy_data.l_inodebits.cancel_bits; + __u64 bits = lock->l_policy_data.l_inodebits.bits & ~wanted; + enum ldlm_mode mode = LCK_MINMODE; + + if (!lock->l_conn_export || + !exp_connect_lock_convert(lock->l_conn_export)) + return 0; + + if (!wanted || !bits || ldlm_is_cancel(lock)) + return 0; + + /* do not convert locks other than DOM for now */ + if (!((bits | wanted) & MDS_INODELOCK_DOM)) + return 0; + + /* We may have already remaining bits in some other lock so + * lock convert will leave us just extra lock for the same bit. + * Check if client has other lock with the same bits and the same + * or lower mode and don't convert if any. + */ + switch (lock->l_req_mode) { + case LCK_PR: + mode = LCK_PR; + fallthrough; + case LCK_PW: + mode |= LCK_CR; + break; + case LCK_CW: + mode = LCK_CW; + fallthrough; + case LCK_CR: + mode |= LCK_CR; + break; + default: + /* do not convert other modes */ + return 0; + } + + /* is lock is too old to be converted? */ + lock_res_and_lock(lock); + if (ktime_after(ktime_get(), + ktime_add(lock->l_last_used, ns->ns_dirty_age_limit))) { + unlock_res_and_lock(lock); + return 0; + } + unlock_res_and_lock(lock); + + inode = ll_inode_from_resource_lock(lock); + ll_have_md_lock(inode, &bits, mode); + iput(inode); + return !!(bits); +} + +int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *ld, + void *data, int flag) +{ + struct lustre_handle lockh; + int rc; + + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + { + __u64 cancel_flags = LCF_ASYNC; + + /* if lock convert is not needed then still have to + * pass lock via ldlm_cli_convert() to keep all states + * correct, set cancel_bits to full lock bits to cause + * full cancel to happen. + */ + if (!ll_md_need_convert(lock)) { + lock_res_and_lock(lock); + lock->l_policy_data.l_inodebits.cancel_bits = + lock->l_policy_data.l_inodebits.bits; + unlock_res_and_lock(lock); + } + rc = ldlm_cli_convert(lock, cancel_flags); + if (!rc) + RETURN(0); + /* continue with cancel otherwise */ + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, cancel_flags); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc); + RETURN(rc); + } + break; + } + case LDLM_CB_CANCELING: + { + __u64 to_cancel = lock->l_policy_data.l_inodebits.bits; + + /* Nothing to do for non-granted locks */ + if (!ldlm_is_granted(lock)) + break; + + /* If 'ld' is supplied then bits to be cancelled are passed + * implicitly by lock converting and cancel_bits from 'ld' + * should be used. Otherwise full cancel is being performed + * and lock inodebits are used. 
+ * + * Note: we cannot rely on cancel_bits in lock itself at this + * moment because they can be changed by concurrent thread, + * so ldlm_cli_inodebits_convert() pass cancel bits implicitly + * in 'ld' parameter. + */ + if (ld) { + /* partial bits cancel allowed only during convert */ + LASSERT(ldlm_is_converting(lock)); + /* mask cancel bits by lock bits so only no any unused + * bits are passed to ll_lock_cancel_bits() + */ + to_cancel &= ld->l_policy_data.l_inodebits.cancel_bits; + } + ll_lock_cancel_bits(lock, to_cancel); + break; + } + default: + LBUG(); + } + + RETURN(0); +} + +__u32 ll_i2suppgid(struct inode *i) +{ + if (in_group_p(i->i_gid)) + return (__u32)from_kgid(&init_user_ns, i->i_gid); + else + return (__u32) __kgid_val(INVALID_GID); +} + +/* Pack the required supplementary groups into the supplied groups array. + * If we don't need to use the groups from the target inode(s) then we + * instead pack one or more groups from the user's supplementary group + * array in case it might be useful. Not needed if doing an MDS-side upcall. */ +void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2) +{ + LASSERT(i1 != NULL); + LASSERT(suppgids != NULL); + + suppgids[0] = ll_i2suppgid(i1); + + if (i2) + suppgids[1] = ll_i2suppgid(i2); + else + suppgids[1] = -1; +} + +/* + * try to reuse three types of dentry: + * 1. unhashed alias, this one is unhashed by d_invalidate (but it may be valid + * by concurrent .revalidate). + * 2. INVALID alias (common case for no valid ldlm lock held, but this flag may + * be cleared by others calling d_lustre_revalidate). + * 3. DISCONNECTED alias. + */ +static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry) +{ + struct dentry *alias, *discon_alias, *invalid_alias; + + if (hlist_empty(&inode->i_dentry)) + return NULL; + + discon_alias = invalid_alias = NULL; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(alias, &inode->i_dentry, d_alias) { + LASSERT(alias != dentry); + + spin_lock(&alias->d_lock); + if ((alias->d_flags & DCACHE_DISCONNECTED) && + S_ISDIR(inode->i_mode)) + /* LASSERT(last_discon == NULL); LU-405, bz 20055 */ + discon_alias = alias; + else if (alias->d_parent == dentry->d_parent && + alias->d_name.hash == dentry->d_name.hash && + alias->d_name.len == dentry->d_name.len && + memcmp(alias->d_name.name, dentry->d_name.name, + dentry->d_name.len) == 0) + invalid_alias = alias; + spin_unlock(&alias->d_lock); + + if (invalid_alias) + break; + } + alias = invalid_alias ?: discon_alias ?: NULL; + if (alias) { + spin_lock(&alias->d_lock); + dget_dlock(alias); + spin_unlock(&alias->d_lock); + } + spin_unlock(&inode->i_lock); + + return alias; +} + +/* + * Similar to d_splice_alias(), but lustre treats invalid alias + * similar to DCACHE_DISCONNECTED, and tries to use it anyway. 
+ */ +struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de) +{ + struct dentry *new; + + if (inode) { + new = ll_find_alias(inode, de); + if (new) { + if (!ll_d_setup(new, true)) + return ERR_PTR(-ENOMEM); + d_move(new, de); + iput(inode); + CDEBUG(D_DENTRY, + "Reuse dentry %p inode %p refc %d flags %#x\n", + new, new->d_inode, ll_d_count(new), new->d_flags); + return new; + } + } + if (!ll_d_setup(de, false)) + return ERR_PTR(-ENOMEM); + d_add(de, inode); + + /* this needs only to be done for foreign symlink dirs as + * DCACHE_SYMLINK_TYPE is already set by d_flags_for_inode() + * kernel routine for files with symlink ops (ie, real symlink) + */ + if (inode && S_ISDIR(inode->i_mode) && + ll_sbi_has_foreign_symlink(ll_i2sbi(inode)) && +#ifdef HAVE_IOP_GET_LINK + inode->i_op->get_link) { +#else + inode->i_op->follow_link) { +#endif + CDEBUG(D_INFO, "%s: inode "DFID": faking foreign dir as a symlink\n", + ll_i2sbi(inode)->ll_fsname, PFID(ll_inode2fid(inode))); + spin_lock(&de->d_lock); + /* like d_flags_for_inode() already does for files */ + de->d_flags = (de->d_flags & ~DCACHE_ENTRY_TYPE) | + DCACHE_SYMLINK_TYPE; + spin_unlock(&de->d_lock); + } + + CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n", + de, de->d_inode, ll_d_count(de), de->d_flags); + return de; +} + +static int ll_lookup_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, + struct inode *parent, struct dentry **de, + void *secctx, __u32 secctxlen, + void *encctx, __u32 encctxlen, + ktime_t kstart, bool encrypt) +{ + struct inode *inode = NULL; + __u64 bits = 0; + int rc; + struct dentry *alias; + ENTRY; + + /* NB 1 request reference will be taken away by ll_intent_lock() + * when I return */ + CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it, + it->it_disposition); + if (!it_disposition(it, DISP_LOOKUP_NEG)) { + struct req_capsule *pill = &request->rq_pill; + struct mdt_body *body = req_capsule_server_get(pill, + &RMF_MDT_BODY); + + rc = ll_prep_inode(&inode, &request->rq_pill, (*de)->d_sb, it); + if (rc) + RETURN(rc); + + /* If encryption context was returned by MDT, put it in + * inode now to save an extra getxattr and avoid deadlock. + */ + if (body->mbo_valid & OBD_MD_ENCCTX) { + encctx = req_capsule_server_get(pill, &RMF_FILE_ENCCTX); + encctxlen = req_capsule_get_size(pill, + &RMF_FILE_ENCCTX, + RCL_SERVER); + + if (encctxlen) { + CDEBUG(D_SEC, + "server returned encryption ctx for "DFID"\n", + PFID(ll_inode2fid(inode))); + rc = ll_xattr_cache_insert(inode, + xattr_for_enc(inode), + encctx, encctxlen); + if (rc) + CWARN("%s: cannot set enc ctx for "DFID": rc = %d\n", + ll_i2sbi(inode)->ll_fsname, + PFID(ll_inode2fid(inode)), rc); + else if (encrypt) { + rc = llcrypt_get_encryption_info(inode); + if (rc) + CDEBUG(D_SEC, + "cannot get enc info for "DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } + } + } + + ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); + /* OPEN can return data if lock has DoM+LAYOUT bits set */ + if (it->it_op & IT_OPEN && + bits & MDS_INODELOCK_DOM && bits & MDS_INODELOCK_LAYOUT) + ll_dom_finish_open(inode, request); + + /* We used to query real size from OSTs here, but actually + * this is not needed. For stat() calls size would be updated + * from subsequent do_revalidate()->ll_inode_revalidate_it() in + * 2.4 and + * vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 + * Everybody else who needs correct file size would call + * ll_glimpse_size or some equivalent themselves anyway. + * Also see bug 7198. 
+ */ + + /* If security context was returned by MDT, put it in + * inode now to save an extra getxattr from security hooks, + * and avoid deadlock. + */ + if (body->mbo_valid & OBD_MD_SECCTX) { + secctx = req_capsule_server_get(pill, &RMF_FILE_SECCTX); + secctxlen = req_capsule_get_size(pill, + &RMF_FILE_SECCTX, + RCL_SERVER); + + if (secctxlen) + CDEBUG(D_SEC, "server returned security context" + " for "DFID"\n", + PFID(ll_inode2fid(inode))); + } + + /* resume normally on error */ + ll_inode_notifysecctx(inode, secctx, secctxlen); + } + + /* Only hash *de if it is unhashed (new dentry). + * Atoimc_open may passin hashed dentries for open. + */ + alias = ll_splice_alias(inode, *de); + if (IS_ERR(alias)) + GOTO(out, rc = PTR_ERR(alias)); + + *de = alias; + + if (!it_disposition(it, DISP_LOOKUP_NEG)) { + /* we have lookup look - unhide dentry */ + if (bits & MDS_INODELOCK_LOOKUP) { + d_lustre_revalidate(*de); + ll_update_dir_depth(parent, (*de)->d_inode); + } + + if (encrypt) { + rc = llcrypt_get_encryption_info(inode); + if (rc) + GOTO(out, rc); + } + } else if (!it_disposition(it, DISP_OPEN_CREATE)) { + /* + * If file was created on the server, the dentry is revalidated + * in ll_create_it if the lock allows for it. + */ + /* Check that parent has UPDATE lock. */ + struct lookup_intent parent_it = { + .it_op = IT_GETATTR, + .it_lock_handle = 0 }; + struct lu_fid fid = ll_i2info(parent)->lli_fid; + + /* If it is striped directory, get the real stripe parent */ + if (unlikely(ll_dir_striped(parent))) { + rc = md_get_fid_from_lsm(ll_i2mdexp(parent), + ll_i2info(parent)->lli_lsm_md, + (*de)->d_name.name, + (*de)->d_name.len, &fid); + if (rc != 0) + GOTO(out, rc); + } + + if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it, &fid, + NULL)) { + d_lustre_revalidate(*de); + ll_intent_release(&parent_it); + } + } + + if (it_disposition(it, DISP_OPEN_CREATE)) { + ll_stats_ops_tally(ll_i2sbi(parent), LPROC_LL_MKNOD, + ktime_us_delta(ktime_get(), kstart)); + } + + GOTO(out, rc = 0); + +out: + if (rc != 0 && it->it_op & IT_OPEN) { + ll_intent_drop_lock(it); + ll_open_cleanup((*de)->d_sb, &request->rq_pill); + } + + return rc; +} + +static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, + struct lookup_intent *it, + void **secctx, __u32 *secctxlen, + int *secctxslot, + struct pcc_create_attach *pca, + bool encrypt, + void **encctx, __u32 *encctxlen) +{ + ktime_t kstart = ktime_get(); + struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; + struct dentry *save = dentry, *retval; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data = NULL; + struct lov_user_md *lum = NULL; + struct ll_sb_info *sbi = ll_i2sbi(parent); + __u32 opc; + int rc; + struct llcrypt_name fname; + struct lu_fid fid; + ENTRY; + + if (dentry->d_name.len > sbi->ll_namelen) + RETURN(ERR_PTR(-ENAMETOOLONG)); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p), intent=%s\n", + dentry, PFID(ll_inode2fid(parent)), parent, LL_IT2STR(it)); + + if (d_mountpoint(dentry)) + CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it)); + + if (it == NULL || it->it_op == IT_GETXATTR) + it = &lookup_it; + + if (it->it_op == IT_GETATTR && dentry_may_statahead(parent, dentry)) { + rc = ll_revalidate_statahead(parent, &dentry, 0); + if (rc == 1) + RETURN(dentry == save ? 
NULL : dentry); + } + + if (it->it_op & IT_OPEN && it->it_flags & FMODE_WRITE && + dentry->d_sb->s_flags & SB_RDONLY) + RETURN(ERR_PTR(-EROFS)); + + if (it->it_op & IT_CREAT) + opc = LUSTRE_OPC_CREATE; + else + opc = LUSTRE_OPC_LOOKUP; + + /* Here we should be calling llcrypt_prepare_lookup(). But it installs a + * custom ->d_revalidate() method, so we lose ll_d_ops. + * To workaround this, call ll_setup_filename() and do the rest + * manually. Also make a copy of llcrypt_d_revalidate() (unfortunately + * not exported function) and call it from ll_revalidate_dentry(), to + * ensure we do not cache stale dentries after a key has been added. + */ + rc = ll_setup_filename(parent, &dentry->d_name, 1, &fname, &fid); + if ((!rc || rc == -ENOENT) && fname.is_ciphertext_name) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dentry->d_lock); + } + if (rc == -ENOENT) + RETURN(NULL); + if (rc) + RETURN(ERR_PTR(rc)); + + op_data = ll_prep_md_op_data(NULL, parent, NULL, fname.disk_name.name, + fname.disk_name.len, 0, opc, NULL); + if (IS_ERR(op_data)) { + llcrypt_free_filename(&fname); + RETURN(ERR_CAST(op_data)); + } + if (!fid_is_zero(&fid)) { + op_data->op_fid2 = fid; + op_data->op_bias = MDS_FID_OP; + if (it->it_op & IT_OPEN) + it->it_flags |= MDS_OPEN_BY_FID; + } + + /* enforce umask if acl disabled or MDS doesn't support umask */ + if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) + it->it_create_mode &= ~current_umask(); + + if (it->it_op & IT_CREAT && + test_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags)) { + rc = ll_dentry_init_security(dentry, it->it_create_mode, + &dentry->d_name, + &op_data->op_file_secctx_name, + &op_data->op_file_secctx_name_size, + &op_data->op_file_secctx, + &op_data->op_file_secctx_size, + &op_data->op_file_secctx_slot); + if (rc < 0) + GOTO(out, retval = ERR_PTR(rc)); + if (secctx != NULL) + *secctx = op_data->op_file_secctx; + if (secctxlen != NULL) + *secctxlen = op_data->op_file_secctx_size; + if (secctxslot != NULL) + *secctxslot = op_data->op_file_secctx_slot; + } else { + if (secctx != NULL) + *secctx = NULL; + if (secctxlen != NULL) + *secctxlen = 0; + if (secctxslot != NULL) + *secctxslot = 0; + } + if (it->it_op & IT_CREAT && encrypt) { + if (unlikely(filename_is_volatile(dentry->d_name.name, + dentry->d_name.len, NULL))) { + /* get encryption context from reference file */ + int ctx_size = LLCRYPT_ENC_CTX_SIZE; + struct lustre_sb_info *lsi; + struct file *ref_file; + struct inode *ref_inode; + void *ctx; + + rc = volatile_ref_file(dentry->d_name.name, + dentry->d_name.len, + &ref_file); + if (rc) + GOTO(out, retval = ERR_PTR(rc)); + + ref_inode = file_inode(ref_file); + if (!ref_inode) { + fput(ref_file); + GOTO(inherit, rc = -EINVAL); + } + + lsi = s2lsi(ref_inode->i_sb); + +getctx: + OBD_ALLOC(ctx, ctx_size); + if (!ctx) + GOTO(out, retval = ERR_PTR(-ENOMEM)); + +#ifdef CONFIG_LL_ENCRYPTION + rc = lsi->lsi_cop->get_context(ref_inode, + ctx, ctx_size); +#elif defined(HAVE_LUSTRE_CRYPTO) + rc = ref_inode->i_sb->s_cop->get_context(ref_inode, + ctx, ctx_size); +#else + rc = -ENODATA; +#endif + if (rc == -ERANGE) { + OBD_FREE(ctx, ctx_size); + ctx_size *= 2; + goto getctx; + } + fput(ref_file); + if (rc < 0) { + OBD_FREE(ctx, ctx_size); + GOTO(inherit, rc); + } + + op_data->op_file_encctx_size = rc; + if (rc == ctx_size) { + op_data->op_file_encctx = ctx; + } else { + OBD_ALLOC(op_data->op_file_encctx, + op_data->op_file_encctx_size); + if (!op_data->op_file_encctx) { + OBD_FREE(ctx, ctx_size); + GOTO(out, retval 
= ERR_PTR(-ENOMEM)); + } + memcpy(op_data->op_file_encctx, ctx, + op_data->op_file_encctx_size); + OBD_FREE(ctx, ctx_size); + } + } else { +inherit: + rc = llcrypt_inherit_context(parent, NULL, op_data, + false); + if (rc) + GOTO(out, retval = ERR_PTR(rc)); + } + if (encctx != NULL) + *encctx = op_data->op_file_encctx; + if (encctxlen != NULL) + *encctxlen = op_data->op_file_encctx_size; + } else { + if (encctx != NULL) + *encctx = NULL; + if (encctxlen != NULL) + *encctxlen = 0; + } + + /* ask for security context upon intent: + * get name of security xattr to request to server + */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN)) + op_data->op_file_secctx_name_size = + ll_secctx_name_get(sbi, &op_data->op_file_secctx_name); + + if (pca && pca->pca_dataset) { + OBD_ALLOC_PTR(lum); + if (lum == NULL) + GOTO(out, retval = ERR_PTR(-ENOMEM)); + + lum->lmm_magic = LOV_USER_MAGIC_V1; + lum->lmm_pattern = LOV_PATTERN_F_RELEASED | LOV_PATTERN_RAID0; + op_data->op_data = lum; + op_data->op_data_size = sizeof(*lum); + op_data->op_archive_id = pca->pca_dataset->pccd_rwid; + it->it_flags |= MDS_OPEN_PCC; + } + + rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, + &ll_md_blocking_ast, 0); + /* If the MDS allows the client to chgrp (CFS_SETGRP_PERM), but the + * client does not know which suppgid should be sent to the MDS, or + * some other(s) changed the target file's GID after this RPC sent + * to the MDS with the suppgid as the original GID, then we should + * try again with right suppgid. */ + if (rc == -EACCES && it->it_op & IT_OPEN && + it_disposition(it, DISP_OPEN_DENY)) { + struct mdt_body *body; + + LASSERT(req != NULL); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (op_data->op_suppgids[0] == body->mbo_gid || + op_data->op_suppgids[1] == body->mbo_gid || + !in_group_p(make_kgid(&init_user_ns, body->mbo_gid))) + GOTO(out, retval = ERR_PTR(-EACCES)); + + fid_zero(&op_data->op_fid2); + op_data->op_suppgids[1] = body->mbo_gid; + ptlrpc_req_finished(req); + req = NULL; + ll_intent_release(it); + rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, + &ll_md_blocking_ast, 0); + } + + if (rc < 0) + GOTO(out, retval = ERR_PTR(rc)); + + if (pca && pca->pca_dataset) { + rc = pcc_inode_create(parent->i_sb, pca->pca_dataset, + &op_data->op_fid2, + &pca->pca_dentry); + if (rc) + GOTO(out, retval = ERR_PTR(rc)); + } + + /* dir layout may change */ + ll_unlock_md_op_lsm(op_data); + rc = ll_lookup_it_finish(req, it, parent, &dentry, + secctx != NULL ? *secctx : NULL, + secctxlen != NULL ? *secctxlen : 0, + encctx != NULL ? *encctx : NULL, + encctxlen != NULL ? *encctxlen : 0, + kstart, encrypt); + if (rc != 0) { + ll_intent_release(it); + GOTO(out, retval = ERR_PTR(rc)); + } + + if ((it->it_op & IT_OPEN) && dentry->d_inode && + !S_ISREG(dentry->d_inode->i_mode) && + !S_ISDIR(dentry->d_inode->i_mode)) { + ll_release_openhandle(dentry, it); + } + ll_lookup_finish_locks(it, dentry); + + GOTO(out, retval = (dentry == save) ? 
NULL : dentry); + +out: + if (op_data != NULL && !IS_ERR(op_data)) { + if (secctx != NULL && secctxlen != NULL) { + /* caller needs sec ctx info, so reset it in op_data to + * prevent it from being freed */ + op_data->op_file_secctx = NULL; + op_data->op_file_secctx_size = 0; + } + if (encctx != NULL && encctxlen != NULL && + it->it_op & IT_CREAT && encrypt) { + /* caller needs enc ctx info, so reset it in op_data to + * prevent it from being freed + */ + op_data->op_file_encctx = NULL; + op_data->op_file_encctx_size = 0; + } + llcrypt_free_filename(&fname); + ll_finish_md_op_data(op_data); + } + + if (lum != NULL) + OBD_FREE_PTR(lum); + + ptlrpc_req_finished(req); + return retval; +} + +static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, + unsigned int flags) +{ + struct lookup_intent *itp, it = { .it_op = IT_GETATTR }; + struct dentry *de; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p), flags=%u\n", + dentry, PFID(ll_inode2fid(parent)), parent, flags); + + /* + * Optimize away (CREATE && !OPEN). Let .create handle the race. + * but only if we have write permissions there, otherwise we need + * to proceed with lookup. LU-4185 + */ + if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN) && + (inode_permission(&init_user_ns, + parent, MAY_WRITE | MAY_EXEC) == 0)) + return NULL; + + if (flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE)) + itp = NULL; + else + itp = ⁢ + de = ll_lookup_it(parent, dentry, itp, NULL, NULL, NULL, NULL, false, + NULL, NULL); + + if (itp != NULL) + ll_intent_release(itp); + + return de; +} + +#ifdef FMODE_CREATED /* added in Linux v4.18-rc1-20-g73a09dd */ +# define ll_is_opened(o, f) ((f)->f_mode & FMODE_OPENED) +# define ll_finish_open(f, d, o) finish_open((f), (d), NULL) +# define ll_last_arg +# define ll_set_created(o, f) \ +do { \ + (f)->f_mode |= FMODE_CREATED; \ +} while (0) + +#else +# define ll_is_opened(o, f) (*(o)) +# define ll_finish_open(f, d, o) finish_open((f), (d), NULL, (o)) +# define ll_last_arg , int *opened +# define ll_set_created(o, f) \ +do { \ + *(o) |= FILE_CREATED; \ +} while (0) + +#endif + +/* + * For cached negative dentry and new dentry, handle lookup/create/open + * together. + */ +static int ll_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned open_flags, + umode_t mode ll_last_arg) +{ + struct lookup_intent *it; + struct dentry *de; + long long lookup_flags = LOOKUP_OPEN; + void *secctx = NULL; + __u32 secctxlen = 0; + int secctxslot = 0; + void *encctx = NULL; + __u32 encctxlen = 0; + struct ll_sb_info *sbi = NULL; + struct pcc_create_attach pca = { NULL, NULL }; + bool encrypt = false; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, + "VFS Op:name=%pd, dir="DFID"(%p), file %p, open_flags %x, mode %x opened %d\n", + dentry, PFID(ll_inode2fid(dir)), dir, file, open_flags, mode, + ll_is_opened(opened, file)); + + /* Only negative dentries enter here */ + LASSERT(dentry->d_inode == NULL); + + if (!d_unhashed(dentry)) { + /* A valid negative dentry that just passed revalidation, + * there's little point to try and open it server-side, + * even though there's a minuscule chance it might succeed. + * Either way it's a valid race to just return -ENOENT here. 
+ */ + if (!(open_flags & O_CREAT)) + return -ENOENT; + + /* Otherwise we just unhash it to be rehashed afresh via + * lookup if necessary + */ + d_drop(dentry); + } + + OBD_ALLOC(it, sizeof(*it)); + if (!it) + RETURN(-ENOMEM); + + it->it_op = IT_OPEN; + if (open_flags & O_CREAT) { + it->it_op |= IT_CREAT; + lookup_flags |= LOOKUP_CREATE; + sbi = ll_i2sbi(dir); + /* Volatile file is used for HSM restore, so do not use PCC */ + if (!filename_is_volatile(dentry->d_name.name, + dentry->d_name.len, NULL)) { + struct pcc_matcher item; + struct pcc_dataset *dataset; + + item.pm_uid = from_kuid(&init_user_ns, current_uid()); + item.pm_gid = from_kgid(&init_user_ns, current_gid()); + item.pm_projid = ll_i2info(dir)->lli_projid; + item.pm_name = &dentry->d_name; + dataset = pcc_dataset_match_get(&sbi->ll_pcc_super, + &item); + pca.pca_dataset = dataset; + } + } + it->it_create_mode = (mode & S_IALLUGO) | S_IFREG; + it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags); + it->it_flags &= ~MDS_OPEN_FL_INTERNAL; + + if (ll_sbi_has_encrypt(ll_i2sbi(dir)) && IS_ENCRYPTED(dir)) { + /* in case of create, this is going to be a regular file because + * we set S_IFREG bit on it->it_create_mode above + */ + rc = llcrypt_get_encryption_info(dir); + if (rc) + GOTO(out_release, rc); + encrypt = true; + if (open_flags & O_CREAT) { + /* For migration or mirroring without enc key, we still + * need to be able to create a volatile file. + */ + if (!llcrypt_has_encryption_key(dir) && + (!filename_is_volatile(dentry->d_name.name, + dentry->d_name.len, NULL) || + (open_flags & O_FILE_ENC) != O_FILE_ENC || + !(open_flags & O_DIRECT))) + GOTO(out_release, rc = -ENOKEY); + } + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_FILE_PAUSE2, cfs_fail_val); + + /* We can only arrive at this path when we have no inode, so + * we only need to request open lock if it was requested + * for every open + */ + if (ll_i2sbi(dir)->ll_oc_thrsh_count == 1 && + exp_connect_flags2(ll_i2mdexp(dir)) & + OBD_CONNECT2_ATOMIC_OPEN_LOCK) + it->it_flags |= MDS_OPEN_LOCK; + + /* Dentry added to dcache tree in ll_lookup_it */ + de = ll_lookup_it(dir, dentry, it, &secctx, &secctxlen, &secctxslot, + &pca, encrypt, &encctx, &encctxlen); + if (IS_ERR(de)) + rc = PTR_ERR(de); + else if (de != NULL) + dentry = de; + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_FILE_PAUSE, cfs_fail_val); + + if (!rc) { + if (it_disposition(it, DISP_OPEN_CREATE)) { + /* Dentry instantiated in ll_create_it. */ + rc = ll_create_it(dir, dentry, it, secctx, secctxlen, + encrypt, encctx, encctxlen, + open_flags); + ll_security_release_secctx(secctx, secctxlen, + secctxslot); + llcrypt_free_ctx(encctx, encctxlen); + if (rc) { + /* We dget in ll_splice_alias. */ + if (de != NULL) + dput(de); + goto out_release; + } + + rc = pcc_inode_create_fini(dentry->d_inode, &pca); + if (rc) { + if (de != NULL) + dput(de); + GOTO(out_release, rc); + } + + ll_set_created(opened, file); + } else { + /* Open the file with O_CREAT, but the file already + * existed on MDT. This may happend in the case that + * the LOOKUP ibits lock is revoked and the + * corresponding dentry cache is deleted. + * i.e. In the current Lustre, the truncate operation + * will revoke the LOOKUP ibits lock, and the file + * dentry cache will be invalidated. The following open + * with O_CREAT flag will call into ->atomic_open, the + * file was wrongly though as newly created file and + * try to auto cache the file. 
So after client knows it + * is not a DISP_OPEN_CREATE, it should cleanup the + * already created PCC copy. + */ + pcc_create_attach_cleanup(dir->i_sb, &pca); + + if (open_flags & O_CREAT && encrypt && + dentry->d_inode) { + rc = ll_set_encflags(dentry->d_inode, encctx, + encctxlen, true); + llcrypt_free_ctx(encctx, encctxlen); + if (rc) + GOTO(out_release, rc); + } + } + + /* check also if a foreign file is openable */ + if (dentry->d_inode && it_disposition(it, DISP_OPEN_OPEN) && + ll_foreign_is_openable(dentry, open_flags)) { + /* Open dentry. */ + if (S_ISFIFO(dentry->d_inode->i_mode)) { + /* We cannot call open here as it might + * deadlock. This case is unreachable in + * practice because of OBD_CONNECT_NODEVOH. */ + rc = finish_no_open(file, de); + } else { + file->private_data = it; + rc = ll_finish_open(file, dentry, opened); + /* We dget in ll_splice_alias. finish_open takes + * care of dget for fd open. + */ + if (de != NULL) + dput(de); + } + } else { + rc = finish_no_open(file, de); + } + } else { + pcc_create_attach_cleanup(dir->i_sb, &pca); + } + +out_release: + ll_intent_release(it); + OBD_FREE(it, sizeof(*it)); + + RETURN(rc); +} + +/* We depend on "mode" being set with the proper file type/umask by now */ +static struct inode *ll_create_node(struct inode *dir, struct lookup_intent *it) +{ + struct inode *inode = NULL; + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int rc; + ENTRY; + + LASSERT(it && it->it_disposition); + + LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF)); + request = it->it_request; + it_clear_disposition(it, DISP_ENQ_CREATE_REF); + rc = ll_prep_inode(&inode, &request->rq_pill, dir->i_sb, it); + if (rc) + GOTO(out, inode = ERR_PTR(rc)); + + /* Pause to allow for a race with concurrent access by fid */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_NODE_PAUSE, cfs_fail_val); + + /* We asked for a lock on the directory, but were granted a + * lock on the inode. Since we finally have an inode pointer, + * stuff it in the lock. */ + CDEBUG(D_DLMTRACE, "setting l_ast_data to inode "DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); + EXIT; + out: + ptlrpc_req_finished(request); + return inode; +} + +/* + * By the time this is called, we already have created the directory cache + * entry for the new file, but it is so far negative - it has no inode. + * + * We defer creating the OBD object(s) until open, to keep the intent and + * non-intent code paths similar, and also because we do not have the MDS + * inode number before calling ll_create_node() (which is needed for LOV), + * so we would need to do yet another RPC to the MDS to store the LOV EA + * data on the MDS. If needed, we would pass the PACKED lmm as data and + * lmm_size in datalen (the MDS still has code which will handle that). + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
+ */ +static int ll_create_it(struct inode *dir, struct dentry *dentry, + struct lookup_intent *it, + void *secctx, __u32 secctxlen, bool encrypt, + void *encctx, __u32 encctxlen, unsigned int open_flags) +{ + struct inode *inode; + __u64 bits = 0; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p), intent=%s\n", + dentry, PFID(ll_inode2fid(dir)), dir, LL_IT2STR(it)); + + rc = it_open_error(DISP_OPEN_CREATE, it); + if (rc) + RETURN(rc); + + inode = ll_create_node(dir, it); + if (IS_ERR(inode)) + RETURN(PTR_ERR(inode)); + + /* must be done before d_instantiate, because it calls + * security_d_instantiate, which means a getxattr if security + * context is not set yet + */ + rc = ll_inode_notifysecctx(inode, secctx, secctxlen); + if (rc) + RETURN(rc); + + d_instantiate(dentry, inode); + + if (encrypt) { + bool preload = true; + + /* For migration or mirroring without enc key, we + * create a volatile file without enc context. + */ + if (!llcrypt_has_encryption_key(dir) && + filename_is_volatile(dentry->d_name.name, + dentry->d_name.len, NULL) && + (open_flags & O_FILE_ENC) == O_FILE_ENC && + open_flags & O_DIRECT) + preload = false; + rc = ll_set_encflags(inode, encctx, encctxlen, preload); + if (rc) + RETURN(rc); + } + + if (!test_bit(LL_SBI_FILE_SECCTX, ll_i2sbi(inode)->ll_flags)) { + rc = ll_inode_init_security(dentry, inode, dir); + if (rc) + RETURN(rc); + } + + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, inode, it, &bits); + if (bits & MDS_INODELOCK_LOOKUP) { + d_lustre_revalidate(dentry); + ll_update_dir_depth(dir, inode); + } + + RETURN(0); +} + +void ll_update_times(struct ptlrpc_request *request, struct inode *inode) +{ + struct mdt_body *body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + + LASSERT(body); + if (body->mbo_valid & OBD_MD_FLMTIME && + body->mbo_mtime > inode->i_mtime.tv_sec) { + CDEBUG(D_INODE, + "setting fid " DFID " mtime from %lld to %llu\n", + PFID(ll_inode2fid(inode)), + (s64)inode->i_mtime.tv_sec, body->mbo_mtime); + inode->i_mtime.tv_sec = body->mbo_mtime; + } + + if (body->mbo_valid & OBD_MD_FLCTIME && + body->mbo_ctime > inode->i_ctime.tv_sec) + inode->i_ctime.tv_sec = body->mbo_ctime; +} + +/* once default LMV (space balanced) is set on ROOT, it should take effect if + * default LMV is not set on parent directory. + */ +static void ll_qos_mkdir_prep(struct md_op_data *op_data, struct inode *dir) +{ + struct inode *root = dir->i_sb->s_root->d_inode; + struct ll_inode_info *rlli = ll_i2info(root); + struct ll_inode_info *lli = ll_i2info(dir); + struct lmv_stripe_md *lsm; + unsigned short depth; + + op_data->op_dir_depth = lli->lli_inherit_depth ?: lli->lli_dir_depth; + depth = lli->lli_dir_depth; + + /* parent directory is striped */ + if (unlikely(lli->lli_lsm_md)) + return; + + /* default LMV set on parent directory */ + if (unlikely(lli->lli_default_lsm_md)) + return; + + /* parent is ROOT */ + if (unlikely(dir == root)) + return; + + /* default LMV not set on ROOT */ + if (!rlli->lli_default_lsm_md) + return; + + down_read(&rlli->lli_lsm_sem); + lsm = rlli->lli_default_lsm_md; + if (!lsm) + goto unlock; + + /* not space balanced */ + if (lsm->lsm_md_master_mdt_index != LMV_OFFSET_DEFAULT) + goto unlock; + + /** + * Check if the fs default is to be applied. + * depth == 0 means 'not inited' for not root dir. 
+ */ + if (lsm->lsm_md_max_inherit != LMV_INHERIT_NONE && + (lsm->lsm_md_max_inherit == LMV_INHERIT_UNLIMITED || + (depth && lsm->lsm_md_max_inherit > depth))) { + op_data->op_flags |= MF_QOS_MKDIR; + if (lsm->lsm_md_max_inherit_rr != LMV_INHERIT_RR_NONE && + (lsm->lsm_md_max_inherit_rr == LMV_INHERIT_RR_UNLIMITED || + (depth && lsm->lsm_md_max_inherit_rr > depth))) + op_data->op_flags |= MF_RR_MKDIR; + CDEBUG(D_INODE, DFID" requests qos mkdir %#x\n", + PFID(&lli->lli_fid), op_data->op_flags); + } +unlock: + up_read(&rlli->lli_lsm_sem); +} + +static int ll_new_node(struct inode *dir, struct dentry *dchild, + const char *tgt, umode_t mode, __u64 rdev, __u32 opc) +{ + struct qstr *name = &dchild->d_name; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data = NULL; + struct inode *inode = NULL; + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct llcrypt_str *disk_link = NULL; + bool encrypt = false; + int err; + + ENTRY; + if (unlikely(tgt != NULL)) { + disk_link = (struct llcrypt_str *)rdev; + rdev = 0; + if (!disk_link) + RETURN(-EINVAL); + } + +again: + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, + name->len, 0, opc, NULL); + if (IS_ERR(op_data)) + GOTO(err_exit, err = PTR_ERR(op_data)); + + if (S_ISDIR(mode)) + ll_qos_mkdir_prep(op_data, dir); + + if (test_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags)) { + err = ll_dentry_init_security(dchild, mode, &dchild->d_name, + &op_data->op_file_secctx_name, + &op_data->op_file_secctx_name_size, + &op_data->op_file_secctx, + &op_data->op_file_secctx_size, + &op_data->op_file_secctx_slot); + if (err < 0) + GOTO(err_exit, err); + } + + if (ll_sbi_has_encrypt(sbi) && + ((IS_ENCRYPTED(dir) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) || + (unlikely(ll_sb_has_test_dummy_encryption(dir->i_sb)) && + S_ISDIR(mode)))) { + err = llcrypt_get_encryption_info(dir); + if (err) + GOTO(err_exit, err); + if (!llcrypt_has_encryption_key(dir)) + GOTO(err_exit, err = -ENOKEY); + encrypt = true; + } + + if (encrypt) { + err = llcrypt_inherit_context(dir, NULL, op_data, false); + if (err) + GOTO(err_exit, err); + + if (S_ISLNK(mode)) { + /* llcrypt needs inode to encrypt target name, so create + * a fake inode and associate encryption context got + * from llcrypt_inherit_context. + */ + struct inode *fakeinode = + dchild->d_sb->s_op->alloc_inode(dchild->d_sb); + + if (!fakeinode) + GOTO(err_exit, err = -ENOMEM); + fakeinode->i_sb = dchild->d_sb; + fakeinode->i_mode |= S_IFLNK; +#ifdef IOP_XATTR + fakeinode->i_opflags |= IOP_XATTR; +#endif + ll_lli_init(ll_i2info(fakeinode)); + err = ll_set_encflags(fakeinode, + op_data->op_file_encctx, + op_data->op_file_encctx_size, + true); + if (!err) + err = __llcrypt_encrypt_symlink(fakeinode, tgt, + strlen(tgt), + disk_link); + + ll_xattr_cache_destroy(fakeinode); + llcrypt_put_encryption_info(fakeinode); + dchild->d_sb->s_op->destroy_inode(fakeinode); + if (err) + GOTO(err_exit, err); + } + } + + err = md_create(sbi->ll_md_exp, op_data, tgt ? disk_link->name : NULL, + tgt ? disk_link->len : 0, mode, + from_kuid(&init_user_ns, current_fsuid()), + from_kgid(&init_user_ns, current_fsgid()), + current_cap(), rdev, &request); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 15, 58, 0) + /* + * server < 2.12.58 doesn't pack default LMV in intent_getattr reply, + * fetch default LMV here. 
+ */ + if (unlikely(err == -EREMOTE)) { + struct ll_inode_info *lli = ll_i2info(dir); + struct lmv_user_md *lum; + int lumsize; + int err2; + + ptlrpc_req_finished(request); + request = NULL; + ll_finish_md_op_data(op_data); + op_data = NULL; + + err2 = ll_dir_getstripe(dir, (void **)&lum, &lumsize, &request, + OBD_MD_DEFAULT_MEA); + if (err2 == 0) { + struct lustre_md md = { NULL }; + + md.body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + if (!md.body) + GOTO(err_exit, err = -EPROTO); + + OBD_ALLOC_PTR(md.default_lmv); + if (!md.default_lmv) + GOTO(err_exit, err = -ENOMEM); + + md.default_lmv->lsm_md_magic = lum->lum_magic; + md.default_lmv->lsm_md_stripe_count = + lum->lum_stripe_count; + md.default_lmv->lsm_md_master_mdt_index = + lum->lum_stripe_offset; + md.default_lmv->lsm_md_hash_type = lum->lum_hash_type; + md.default_lmv->lsm_md_max_inherit = + lum->lum_max_inherit; + md.default_lmv->lsm_md_max_inherit_rr = + lum->lum_max_inherit_rr; + + err = ll_update_inode(dir, &md); + md_free_lustre_md(sbi->ll_md_exp, &md); + if (err) + GOTO(err_exit, err); + } else if (err2 == -ENODATA && lli->lli_default_lsm_md) { + /* + * If there are no default stripe EA on the MDT, but the + * client has default stripe, then it probably means + * default stripe EA has just been deleted. + */ + down_write(&lli->lli_lsm_sem); + if (lli->lli_default_lsm_md) + OBD_FREE_PTR(lli->lli_default_lsm_md); + lli->lli_default_lsm_md = NULL; + up_write(&lli->lli_lsm_sem); + } else { + GOTO(err_exit, err); + } + + ptlrpc_req_finished(request); + request = NULL; + goto again; + } +#endif + + if (err < 0) + GOTO(err_exit, err); + + ll_update_times(request, dir); + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_NEWNODE_PAUSE, cfs_fail_val); + + err = ll_prep_inode(&inode, &request->rq_pill, dchild->d_sb, NULL); + if (err) + GOTO(err_exit, err); + + /* must be done before d_instantiate, because it calls + * security_d_instantiate, which means a getxattr if security + * context is not set yet + */ + err = ll_inode_notifysecctx(inode, + op_data->op_file_secctx, + op_data->op_file_secctx_size); + if (err) + GOTO(err_exit, err); + + d_instantiate(dchild, inode); + + if (encrypt) { + err = ll_set_encflags(inode, op_data->op_file_encctx, + op_data->op_file_encctx_size, true); + if (err) + GOTO(err_exit, err); + + if (S_ISLNK(mode)) { + struct ll_inode_info *lli = ll_i2info(inode); + + /* Cache the plaintext symlink target + * for later use by get_link() + */ + OBD_ALLOC(lli->lli_symlink_name, strlen(tgt) + 1); + /* do not return an error if we cannot + * cache the symlink locally + */ + if (lli->lli_symlink_name) + memcpy(lli->lli_symlink_name, + tgt, strlen(tgt) + 1); + } + } + + if (!test_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags)) { + err = ll_inode_init_security(dchild, inode, dir); + if (err) + GOTO(err_exit, err); + } + + EXIT; +err_exit: + if (request != NULL) + ptlrpc_req_finished(request); + + if (!IS_ERR_OR_NULL(op_data)) + ll_finish_md_op_data(op_data); + + RETURN(err); +} + +static int ll_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dchild, umode_t mode, dev_t rdev) +{ + ktime_t kstart = ktime_get(); + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p) mode %o dev %x\n", + dchild, PFID(ll_inode2fid(dir)), dir, mode, rdev); + + if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) + mode &= ~current_umask(); + + switch (mode & S_IFMT) { + case 0: + mode |= S_IFREG; + fallthrough; + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case 
S_IFSOCK: + err = ll_new_node(dir, dchild, NULL, mode, old_encode_dev(rdev), + LUSTRE_OPC_MKNOD); + break; + case S_IFDIR: + err = -EPERM; + break; + default: + err = -EINVAL; + } + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(err); +} + +/* + * Plain create. Intent create is handled in atomic_open. + */ +static int ll_create_nd(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + umode_t mode, bool want_excl) +{ + ktime_t kstart = ktime_get(); + int rc; + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_FILE_PAUSE, cfs_fail_val); + + CDEBUG(D_VFSTRACE, + "VFS Op:name=%pd, dir="DFID"(%p), flags=%u, excl=%d\n", + dentry, PFID(ll_inode2fid(dir)), dir, mode, want_excl); + + /* Using mknod(2) to create a regular file is designed to not recognize + * volatile file name, so we use ll_mknod() here. */ + rc = ll_mknod(mnt_userns, dir, dentry, mode, 0); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, unhashed %d\n", + dentry, d_unhashed(dentry)); + + if (!rc) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, + ktime_us_delta(ktime_get(), kstart)); + + return rc; +} + +static int ll_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dchild, const char *oldpath) +{ + ktime_t kstart = ktime_get(); + int len = strlen(oldpath); + struct llcrypt_str disk_link; + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p), target=%.*s\n", + dchild, PFID(ll_inode2fid(dir)), dir, 3000, oldpath); + + err = llcrypt_prepare_symlink(dir, oldpath, len, dir->i_sb->s_blocksize, + &disk_link); + if (err) + RETURN(err); + + err = ll_new_node(dir, dchild, oldpath, S_IFLNK | S_IRWXUGO, + (__u64)&disk_link, LUSTRE_OPC_SYMLINK); + + if (disk_link.name != (unsigned char *)oldpath) + kfree(disk_link.name); + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(err); +} + +static int ll_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + struct inode *src = old_dentry->d_inode; + struct qstr *name = &new_dentry->d_name; + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + ktime_t kstart = ktime_get(); + int err; + + ENTRY; + CDEBUG(D_VFSTRACE, + "VFS Op: inode="DFID"(%p), dir="DFID"(%p), target=%pd\n", + PFID(ll_inode2fid(src)), src, + PFID(ll_inode2fid(dir)), dir, new_dentry); + + err = llcrypt_prepare_link(old_dentry, dir, new_dentry); + if (err) + RETURN(err); + + op_data = ll_prep_md_op_data(NULL, src, dir, name->name, name->len, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + err = md_link(sbi->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (err) + GOTO(out, err); + + ll_update_times(request, dir); + ll_stats_ops_tally(sbi, LPROC_LL_LINK, + ktime_us_delta(ktime_get(), kstart)); + EXIT; +out: + ptlrpc_req_finished(request); + RETURN(err); +} + +static int ll_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dchild, umode_t mode) +{ + ktime_t kstart = ktime_get(); + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p)\n", + dchild, PFID(ll_inode2fid(dir)), dir); + + if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) + mode &= ~current_umask(); + + mode = (mode & (S_IRWXUGO|S_ISVTX)) | S_IFDIR; + + err = ll_new_node(dir, dchild, NULL, mode, 0, LUSTRE_OPC_MKDIR); + if (err == 0) + ll_stats_ops_tally(ll_i2sbi(dir), 
LPROC_LL_MKDIR, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(err); +} + +static int ll_rmdir(struct inode *dir, struct dentry *dchild) +{ + struct qstr *name = &dchild->d_name; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + ktime_t kstart = ktime_get(); + int rc; + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p)\n", + dchild, PFID(ll_inode2fid(dir)), dir); + + if (unlikely(d_mountpoint(dchild))) + RETURN(-EBUSY); + + /* some foreign dir may not be allowed to be removed */ + if (!ll_foreign_is_removable(dchild, false)) + RETURN(-EPERM); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, name->len, + S_IFDIR, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (dchild->d_inode != NULL) + op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); + + if (fid_is_zero(&op_data->op_fid2)) + op_data->op_fid2 = op_data->op_fid3; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (!rc) { + struct mdt_body *body; + + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, + ktime_us_delta(ktime_get(), kstart)); + + /* + * The server puts attributes in on the last unlink, use them + * to update the link count so the inode can be freed + * immediately. + */ + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + if (body->mbo_valid & OBD_MD_FLNLINK) { + spin_lock(&dchild->d_inode->i_lock); + set_nlink(dchild->d_inode, body->mbo_nlink); + spin_unlock(&dchild->d_inode->i_lock); + } + } + + ptlrpc_req_finished(request); + + RETURN(rc); +} + +/** + * Remove dir entry + **/ +int ll_rmdir_entry(struct inode *dir, char *name, int namelen) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + ktime_t kstart = ktime_get(); + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n", + namelen, name, PFID(ll_inode2fid(dir)), dir); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name, strlen(name), + S_IFDIR, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + op_data->op_cli_flags |= CLI_RM_ENTRY; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (!rc) + ll_update_times(request, dir); + + ptlrpc_req_finished(request); + if (!rc) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, + ktime_us_delta(ktime_get(), kstart)); + RETURN(rc); +} + +static int ll_unlink(struct inode *dir, struct dentry *dchild) +{ + struct qstr *name = &dchild->d_name; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct mdt_body *body; + ktime_t kstart = ktime_get(); + int rc; + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p)\n", + dchild, PFID(ll_inode2fid(dir)), dir); + + /* + * XXX: unlink bind mountpoint maybe call to here, + * just check it as vfs_unlink does. 
+ */ + if (unlikely(d_mountpoint(dchild))) + RETURN(-EBUSY); + + /* some foreign file/dir may not be allowed to be unlinked */ + if (!ll_foreign_is_removable(dchild, false)) + RETURN(-EPERM); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, name->len, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); + /* notify lower layer if inode has dirty pages */ + if (S_ISREG(dchild->d_inode->i_mode) && + ll_i2info(dchild->d_inode)->lli_clob && + dirty_cnt(dchild->d_inode)) + op_data->op_cli_flags |= CLI_DIRTY_DATA; + if (fid_is_zero(&op_data->op_fid2)) + op_data->op_fid2 = op_data->op_fid3; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc) + GOTO(out, rc); + + /* + * The server puts attributes in on the last unlink, use them to update + * the link count so the inode can be freed immediately. + */ + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + if (body->mbo_valid & OBD_MD_FLNLINK) { + spin_lock(&dchild->d_inode->i_lock); + set_nlink(dchild->d_inode, body->mbo_nlink); + spin_unlock(&dchild->d_inode->i_lock); + } + + ll_update_times(request, dir); + +out: + ptlrpc_req_finished(request); + if (!rc) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, + ktime_us_delta(ktime_get(), kstart)); + RETURN(rc); +} + +static int ll_rename(struct user_namespace *mnt_userns, + struct inode *src, struct dentry *src_dchild, + struct inode *tgt, struct dentry *tgt_dchild +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_IOPS_RENAME_WITH_FLAGS) + , unsigned int flags +#endif + ) +{ + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(src); + struct md_op_data *op_data; + ktime_t kstart = ktime_get(); + umode_t mode = 0; + struct llcrypt_name foldname, fnewname; + int err; + ENTRY; + +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_IOPS_RENAME_WITH_FLAGS) + if (flags) + return -EINVAL; +#endif + + CDEBUG(D_VFSTRACE, + "VFS Op:oldname=%pd, src_dir="DFID"(%p), newname=%pd, tgt_dir="DFID"(%p)\n", + src_dchild, PFID(ll_inode2fid(src)), src, + tgt_dchild, PFID(ll_inode2fid(tgt)), tgt); + + if (unlikely(d_mountpoint(src_dchild) || d_mountpoint(tgt_dchild))) + RETURN(-EBUSY); + +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_IOPS_RENAME_WITH_FLAGS) + err = llcrypt_prepare_rename(src, src_dchild, tgt, tgt_dchild, flags); +#else + err = llcrypt_prepare_rename(src, src_dchild, tgt, tgt_dchild, 0); +#endif + if (err) + RETURN(err); + /* we prevent an encrypted file from being renamed + * into an unencrypted dir + */ + if (IS_ENCRYPTED(src) && !IS_ENCRYPTED(tgt)) + RETURN(-EXDEV); + + if (src_dchild->d_inode) + mode = src_dchild->d_inode->i_mode; + + if (tgt_dchild->d_inode) + mode = tgt_dchild->d_inode->i_mode; + + op_data = ll_prep_md_op_data(NULL, src, tgt, NULL, 0, mode, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + /* If the client is using a subdir mount and does a rename to what it + * sees as /.fscrypt, interpret it as the .fscrypt dir at fs root. 
+ */ + if (unlikely(is_root_inode(tgt) && !fid_is_root(ll_inode2fid(tgt)) && + tgt_dchild->d_name.len == strlen(dot_fscrypt_name) && + strncmp(tgt_dchild->d_name.name, dot_fscrypt_name, + tgt_dchild->d_name.len) == 0)) + lu_root_fid(&op_data->op_fid2); + + if (src_dchild->d_inode) + op_data->op_fid3 = *ll_inode2fid(src_dchild->d_inode); + + if (tgt_dchild->d_inode) + op_data->op_fid4 = *ll_inode2fid(tgt_dchild->d_inode); + + err = ll_setup_filename(src, &src_dchild->d_name, 1, &foldname, NULL); + if (err) + RETURN(err); + err = ll_setup_filename(tgt, &tgt_dchild->d_name, 1, &fnewname, NULL); + if (err) { + llcrypt_free_filename(&foldname); + RETURN(err); + } + err = md_rename(sbi->ll_md_exp, op_data, + foldname.disk_name.name, foldname.disk_name.len, + fnewname.disk_name.name, fnewname.disk_name.len, + &request); + llcrypt_free_filename(&foldname); + llcrypt_free_filename(&fnewname); + ll_finish_md_op_data(op_data); + if (!err) { + ll_update_times(request, src); + ll_update_times(request, tgt); + } + + ptlrpc_req_finished(request); + + if (!err) { + d_move(src_dchild, tgt_dchild); + ll_stats_ops_tally(sbi, LPROC_LL_RENAME, + ktime_us_delta(ktime_get(), kstart)); + } + + RETURN(err); +} + +const struct inode_operations ll_dir_inode_operations = { + .mknod = ll_mknod, + .atomic_open = ll_atomic_open, + .lookup = ll_lookup_nd, + .create = ll_create_nd, + /* We need all these non-raw things for NFSD, to not patch it. */ + .unlink = ll_unlink, + .mkdir = ll_mkdir, + .rmdir = ll_rmdir, + .symlink = ll_symlink, + .link = ll_link, + .rename = ll_rename, + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, + .get_acl = ll_get_acl, +#ifdef HAVE_IOP_SET_ACL + .set_acl = ll_set_acl, +#endif +}; + +const struct inode_operations ll_special_inode_operations = { + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, + .get_acl = ll_get_acl, +#ifdef HAVE_IOP_SET_ACL + .set_acl = ll_set_acl, +#endif +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/pcc.c b/drivers/staging/lustrefsx/lustre/llite/pcc.c new file mode 100644 index 0000000000000..9f176b5ea92fa --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/pcc.c @@ -0,0 +1,2748 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, DDN Storage Corporation. + */ +/* + * Persistent Client Cache + * + * PCC is a new framework which provides a group of local cache on Lustre + * client side. 
It works in two modes: RW-PCC enables a read-write cache on the + * local SSDs of a single client; RO-PCC provides a read-only cache on the + * local SSDs of multiple clients. Less overhead is visible to the applications, + * and network latencies and lock conflicts can be significantly reduced. + * + * For RW-PCC, no global namespace will be provided. Each client uses its own + * local storage as a cache for itself. A local file system is used to manage + * the data on local caches. Cached I/O is directed to the local file system while + * normal I/O is directed to OSTs. RW-PCC uses HSM for data synchronization. + * It uses the HSM copytool to restore files from local caches to Lustre OSTs. Each + * PCC has a copytool instance running with a unique archive number. Any remote + * access from another Lustre client would trigger the data synchronization. If + * a client with RW-PCC goes offline, the cached data becomes inaccessible for + * other clients temporarily. After the RW-PCC client reboots and the + * copytool restarts, the data will be accessible again. + * + * The following is what will happen under different conditions for RW-PCC: + * + * > When a file is being created on RW-PCC + * + * A normal HSM released file is created on MDT; + * An empty mirror file is created on the local cache; + * The HSM status of the Lustre file will be set to archived and released; + * The archive number will be set to the proper value. + * + * > When a file is being prefetched to RW-PCC + * + * A file is copied to the local cache; + * The HSM status of the Lustre file will be set to archived and released; + * The archive number will be set to the proper value. + * + * > When a file is being accessed from PCC + * + * Data will be read directly from the local cache; + * Metadata will be read from MDT, except file size; + * File size will be obtained from the local cache. + * + * > When a PCC cached file is being accessed on another client + * + * RW-PCC cached files are automatically restored when a process on another + * client tries to read or modify them. The corresponding I/O will block + * waiting for the released file to be restored. This is transparent to the + * process. + * + * For RW-PCC, when a file is being created, a rule-based policy is used to + * determine whether it will be cached. Rule-based caching of newly created + * files can determine which files can use a cache on PCC directly without any + * admission control. + * + * The RW-PCC design can accelerate I/O intensive applications with one-to-one + * mappings between files and accessing clients. However, in several use cases, + * files will never be updated, but need to be read simultaneously from many + * clients. RO-PCC implements read-only caching on Lustre clients using + * SSDs. RO-PCC is based on the same framework as RW-PCC, except + * that no HSM mechanism is used. + * + * The main advantages of using this SSD cache on the Lustre clients via PCC + * are that: + * - The I/O stack becomes much simpler for the cached data, as there is no + * interference with I/Os from other clients, which enables easier + * performance optimizations; + * - The requirements on the HW inside the client nodes are small: any kind of + * SSDs or even HDDs can be used as cache devices; + * - Caching reduces the pressure on the object storage targets (OSTs), as + * small or random I/Os can be regularized to big sequential I/Os and + * temporary files do not even need to be flushed to OSTs.
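+ *
+ * For illustration, assuming only the rule grammar and key=value pairs
+ * implemented below in this file (the administrative front end used to
+ * deliver the command string is outside this file), a dataset could be
+ * registered with a command such as:
+ *
+ *   add /mnt/pcc projid={500}&fname={*.h5},uid={1001} rwid=2 auto_attach=0
+ *
+ * i.e. cache newly created files whose project ID is 500 AND whose name
+ * matches "*.h5", OR whose owner is uid 1001, using archive ID 2 for the
+ * RW-PCC copytool and with auto attach disabled.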
+ * + * PCC can accelerate applications with certain I/O patterns: + * - small-sized random writes (< 1MB) from a single client + * - repeated read of data that is larger than RAM + * - clients with high network latency + * + * Author: Li Xi + * Author: Qian Yingjin + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "pcc.h" +#include +#include +#include +#include "llite_internal.h" + +struct kmem_cache *pcc_inode_slab; + +int pcc_super_init(struct pcc_super *super) +{ + struct cred *cred; + + super->pccs_cred = cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + /* Never override disk quota limits or use reserved space */ + cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); + init_rwsem(&super->pccs_rw_sem); + INIT_LIST_HEAD(&super->pccs_datasets); + super->pccs_generation = 1; + + return 0; +} + +/* Rule based auto caching */ +static void pcc_id_list_free(struct list_head *id_list) +{ + struct pcc_match_id *id, *n; + + list_for_each_entry_safe(id, n, id_list, pmi_linkage) { + list_del_init(&id->pmi_linkage); + OBD_FREE_PTR(id); + } +} + +static void pcc_fname_list_free(struct list_head *fname_list) +{ + struct pcc_match_fname *fname, *n; + + list_for_each_entry_safe(fname, n, fname_list, pmf_linkage) { + OBD_FREE(fname->pmf_name, strlen(fname->pmf_name) + 1); + list_del_init(&fname->pmf_linkage); + OBD_FREE_PTR(fname); + } +} + +static void pcc_expression_free(struct pcc_expression *expr) +{ + LASSERT(expr->pe_field >= PCC_FIELD_UID && + expr->pe_field < PCC_FIELD_MAX); + switch (expr->pe_field) { + case PCC_FIELD_UID: + case PCC_FIELD_GID: + case PCC_FIELD_PROJID: + pcc_id_list_free(&expr->pe_cond); + break; + case PCC_FIELD_FNAME: + pcc_fname_list_free(&expr->pe_cond); + break; + default: + LBUG(); + } + OBD_FREE_PTR(expr); +} + +static void pcc_conjunction_free(struct pcc_conjunction *conjunction) +{ + struct pcc_expression *expression, *n; + + LASSERT(list_empty(&conjunction->pc_linkage)); + list_for_each_entry_safe(expression, n, + &conjunction->pc_expressions, + pe_linkage) { + list_del_init(&expression->pe_linkage); + pcc_expression_free(expression); + } + OBD_FREE_PTR(conjunction); +} + +static void pcc_rule_conds_free(struct list_head *cond_list) +{ + struct pcc_conjunction *conjunction, *n; + + list_for_each_entry_safe(conjunction, n, cond_list, pc_linkage) { + list_del_init(&conjunction->pc_linkage); + pcc_conjunction_free(conjunction); + } +} + +static void pcc_cmd_fini(struct pcc_cmd *cmd) +{ + if (cmd->pccc_cmd == PCC_ADD_DATASET) { + if (!list_empty(&cmd->u.pccc_add.pccc_conds)) + pcc_rule_conds_free(&cmd->u.pccc_add.pccc_conds); + if (cmd->u.pccc_add.pccc_conds_str) + OBD_FREE(cmd->u.pccc_add.pccc_conds_str, + strlen(cmd->u.pccc_add.pccc_conds_str) + 1); + } +} + +#define PCC_DISJUNCTION_DELIM (',') +#define PCC_CONJUNCTION_DELIM ('&') +#define PCC_EXPRESSION_DELIM ('=') + +static int +pcc_fname_list_add(struct cfs_lstr *id, struct list_head *fname_list) +{ + struct pcc_match_fname *fname; + + OBD_ALLOC_PTR(fname); + if (fname == NULL) + return -ENOMEM; + + OBD_ALLOC(fname->pmf_name, id->ls_len + 1); + if (fname->pmf_name == NULL) { + OBD_FREE_PTR(fname); + return -ENOMEM; + } + + memcpy(fname->pmf_name, id->ls_str, id->ls_len); + list_add_tail(&fname->pmf_linkage, fname_list); + return 0; +} + +static int +pcc_fname_list_parse(char *str, int len, struct list_head *fname_list) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + + ENTRY; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(fname_list); + while (src.ls_str) { + rc = cfs_gettok(&src, 
' ', &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = pcc_fname_list_add(&res, fname_list); + if (rc) + break; + } + if (rc) + pcc_fname_list_free(fname_list); + RETURN(rc); +} + +static int +pcc_id_list_parse(char *str, int len, struct list_head *id_list, + enum pcc_field type) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + + ENTRY; + + if (type != PCC_FIELD_UID && type != PCC_FIELD_GID && + type != PCC_FIELD_PROJID) + RETURN(-EINVAL); + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(id_list); + while (src.ls_str) { + struct pcc_match_id *id; + __u32 id_val; + + if (cfs_gettok(&src, ' ', &res) == 0) + GOTO(out, rc = -EINVAL); + + if (!cfs_str2num_check(res.ls_str, res.ls_len, + &id_val, 0, (u32)~0U)) + GOTO(out, rc = -EINVAL); + + OBD_ALLOC_PTR(id); + if (id == NULL) + GOTO(out, rc = -ENOMEM); + + id->pmi_id = id_val; + list_add_tail(&id->pmi_linkage, id_list); + } +out: + if (rc) + pcc_id_list_free(id_list); + RETURN(rc); +} + +static inline bool +pcc_check_field(struct cfs_lstr *field, char *str) +{ + int len = strlen(str); + + return (field->ls_len == len && + strncmp(field->ls_str, str, len) == 0); +} + +static int +pcc_expression_parse(struct cfs_lstr *src, struct list_head *cond_list) +{ + struct pcc_expression *expr; + struct cfs_lstr field; + int rc = 0; + + OBD_ALLOC_PTR(expr); + if (expr == NULL) + return -ENOMEM; + + rc = cfs_gettok(src, PCC_EXPRESSION_DELIM, &field); + if (rc == 0 || src->ls_len <= 2 || src->ls_str[0] != '{' || + src->ls_str[src->ls_len - 1] != '}') + GOTO(out, rc = -EINVAL); + + /* Skip '{' and '}' */ + src->ls_str++; + src->ls_len -= 2; + + if (pcc_check_field(&field, "uid")) { + if (pcc_id_list_parse(src->ls_str, + src->ls_len, + &expr->pe_cond, + PCC_FIELD_UID) < 0) + GOTO(out, rc = -EINVAL); + expr->pe_field = PCC_FIELD_UID; + } else if (pcc_check_field(&field, "gid")) { + if (pcc_id_list_parse(src->ls_str, + src->ls_len, + &expr->pe_cond, + PCC_FIELD_GID) < 0) + GOTO(out, rc = -EINVAL); + expr->pe_field = PCC_FIELD_GID; + } else if (pcc_check_field(&field, "projid")) { + if (pcc_id_list_parse(src->ls_str, + src->ls_len, + &expr->pe_cond, + PCC_FIELD_PROJID) < 0) + GOTO(out, rc = -EINVAL); + expr->pe_field = PCC_FIELD_PROJID; + } else if (pcc_check_field(&field, "fname")) { + if (pcc_fname_list_parse(src->ls_str, + src->ls_len, + &expr->pe_cond) < 0) + GOTO(out, rc = -EINVAL); + expr->pe_field = PCC_FIELD_FNAME; + } else { + GOTO(out, rc = -EINVAL); + } + + list_add_tail(&expr->pe_linkage, cond_list); + return 0; +out: + OBD_FREE_PTR(expr); + return rc; +} + +static int +pcc_conjunction_parse(struct cfs_lstr *src, struct list_head *cond_list) +{ + struct pcc_conjunction *conjunction; + struct cfs_lstr expr; + int rc = 0; + + OBD_ALLOC_PTR(conjunction); + if (conjunction == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&conjunction->pc_expressions); + list_add_tail(&conjunction->pc_linkage, cond_list); + + while (src->ls_str) { + rc = cfs_gettok(src, PCC_CONJUNCTION_DELIM, &expr); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = pcc_expression_parse(&expr, + &conjunction->pc_expressions); + if (rc) + break; + } + return rc; +} + +static int pcc_conds_parse(char *str, int len, struct list_head *cond_list) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(cond_list); + while (src.ls_str) { + rc = cfs_gettok(&src, PCC_DISJUNCTION_DELIM, &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = pcc_conjunction_parse(&res, cond_list); + if (rc) + 
break; + } + return rc; +} + +static int pcc_id_parse(struct pcc_cmd *cmd, const char *id) +{ + int rc; + + OBD_ALLOC(cmd->u.pccc_add.pccc_conds_str, strlen(id) + 1); + if (cmd->u.pccc_add.pccc_conds_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.pccc_add.pccc_conds_str, id, strlen(id)); + + rc = pcc_conds_parse(cmd->u.pccc_add.pccc_conds_str, + strlen(cmd->u.pccc_add.pccc_conds_str), + &cmd->u.pccc_add.pccc_conds); + if (rc) + pcc_cmd_fini(cmd); + + return rc; +} + +static int +pcc_parse_value_pair(struct pcc_cmd *cmd, char *buffer) +{ + char *key, *val; + unsigned long id; + int rc; + + val = buffer; + key = strsep(&val, "="); + if (val == NULL || strlen(val) == 0) + return -EINVAL; + + /* Key of the value pair */ + if (strcmp(key, "rwid") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id <= 0) + return -EINVAL; + cmd->u.pccc_add.pccc_rwid = id; + } else if (strcmp(key, "roid") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id <= 0) + return -EINVAL; + cmd->u.pccc_add.pccc_roid = id; + } else if (strcmp(key, "auto_attach") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id == 0) + cmd->u.pccc_add.pccc_flags &= ~PCC_DATASET_AUTO_ATTACH; + } else if (strcmp(key, "open_attach") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id == 0) + cmd->u.pccc_add.pccc_flags &= ~PCC_DATASET_OPEN_ATTACH; + } else if (strcmp(key, "io_attach") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id == 0) + cmd->u.pccc_add.pccc_flags &= ~PCC_DATASET_IO_ATTACH; + } else if (strcmp(key, "stat_attach") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id == 0) + cmd->u.pccc_add.pccc_flags &= ~PCC_DATASET_STAT_ATTACH; + } else if (strcmp(key, "rwpcc") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id > 0) + cmd->u.pccc_add.pccc_flags |= PCC_DATASET_RWPCC; + } else if (strcmp(key, "ropcc") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id > 0) + cmd->u.pccc_add.pccc_flags |= PCC_DATASET_ROPCC; + } else { + return -EINVAL; + } + + return 0; +} + +static int +pcc_parse_value_pairs(struct pcc_cmd *cmd, char *buffer) +{ + char *val; + char *token; + int rc; + + switch (cmd->pccc_cmd) { + case PCC_ADD_DATASET: + /* Enable auto attach by default */ + cmd->u.pccc_add.pccc_flags |= PCC_DATASET_AUTO_ATTACH; + break; + case PCC_DEL_DATASET: + case PCC_CLEAR_ALL: + break; + default: + return -EINVAL; + } + + val = buffer; + while (val != NULL && strlen(val) != 0) { + token = strsep(&val, " "); + rc = pcc_parse_value_pair(cmd, token); + if (rc) + return rc; + } + + switch (cmd->pccc_cmd) { + case PCC_ADD_DATASET: + if (cmd->u.pccc_add.pccc_flags & PCC_DATASET_RWPCC && + cmd->u.pccc_add.pccc_flags & PCC_DATASET_ROPCC) + return -EINVAL; + /* + * By default, a PCC backend can provide caching service for + * both RW-PCC and RO-PCC. + */ + if ((cmd->u.pccc_add.pccc_flags & PCC_DATASET_PCC_ALL) == 0) + cmd->u.pccc_add.pccc_flags |= PCC_DATASET_PCC_ALL; + + /* For RW-PCC, the value of @rwid must be non zero. 
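+ * For illustration, given the pairs parsed above: a string such as
+ * "rwid=2 auto_attach=0" is accepted for an RW-PCC dataset, while
+ * "rwpcc=1" without a non-zero rwid is rejected with -EINVAL by the
+ * check below.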
*/ + if (cmd->u.pccc_add.pccc_flags & PCC_DATASET_RWPCC && + cmd->u.pccc_add.pccc_rwid == 0) + return -EINVAL; + + break; + case PCC_DEL_DATASET: + case PCC_CLEAR_ALL: + break; + default: + return -EINVAL; + } + return 0; +} + +static void +pcc_dataset_rule_fini(struct pcc_match_rule *rule) +{ + if (!list_empty(&rule->pmr_conds)) + pcc_rule_conds_free(&rule->pmr_conds); + LASSERT(rule->pmr_conds_str != NULL); + OBD_FREE(rule->pmr_conds_str, strlen(rule->pmr_conds_str) + 1); +} + +static int +pcc_dataset_rule_init(struct pcc_match_rule *rule, struct pcc_cmd *cmd) +{ + int rc = 0; + + LASSERT(cmd->u.pccc_add.pccc_conds_str); + OBD_ALLOC(rule->pmr_conds_str, + strlen(cmd->u.pccc_add.pccc_conds_str) + 1); + if (rule->pmr_conds_str == NULL) + return -ENOMEM; + + memcpy(rule->pmr_conds_str, + cmd->u.pccc_add.pccc_conds_str, + strlen(cmd->u.pccc_add.pccc_conds_str)); + + INIT_LIST_HEAD(&rule->pmr_conds); + if (!list_empty(&cmd->u.pccc_add.pccc_conds)) + rc = pcc_conds_parse(rule->pmr_conds_str, + strlen(rule->pmr_conds_str), + &rule->pmr_conds); + + if (rc) + pcc_dataset_rule_fini(rule); + + return rc; +} + +/* Rule Matching */ +static int +pcc_id_list_match(struct list_head *id_list, __u32 id_val) +{ + struct pcc_match_id *id; + + list_for_each_entry(id, id_list, pmi_linkage) { + if (id->pmi_id == id_val) + return 1; + } + return 0; +} + +static bool +cfs_match_wildcard(const char *pattern, const char *content) +{ + if (*pattern == '\0' && *content == '\0') + return true; + + if (*pattern == '*' && *(pattern + 1) != '\0' && *content == '\0') + return false; + + while (*pattern == *content) { + pattern++; + content++; + if (*pattern == '\0' && *content == '\0') + return true; + + if (*pattern == '*' && *(pattern + 1) != '\0' && + *content == '\0') + return false; + } + + if (*pattern == '*') + return (cfs_match_wildcard(pattern + 1, content) || + cfs_match_wildcard(pattern, content + 1)); + + return false; +} + +static int +pcc_fname_list_match(struct list_head *fname_list, const char *name) +{ + struct pcc_match_fname *fname; + + list_for_each_entry(fname, fname_list, pmf_linkage) { + if (cfs_match_wildcard(fname->pmf_name, name)) + return 1; + } + return 0; +} + +static int +pcc_expression_match(struct pcc_expression *expr, struct pcc_matcher *matcher) +{ + switch (expr->pe_field) { + case PCC_FIELD_UID: + return pcc_id_list_match(&expr->pe_cond, matcher->pm_uid); + case PCC_FIELD_GID: + return pcc_id_list_match(&expr->pe_cond, matcher->pm_gid); + case PCC_FIELD_PROJID: + return pcc_id_list_match(&expr->pe_cond, matcher->pm_projid); + case PCC_FIELD_FNAME: + return pcc_fname_list_match(&expr->pe_cond, + matcher->pm_name->name); + default: + return 0; + } +} + +static int +pcc_conjunction_match(struct pcc_conjunction *conjunction, + struct pcc_matcher *matcher) +{ + struct pcc_expression *expr; + int matched; + + list_for_each_entry(expr, &conjunction->pc_expressions, pe_linkage) { + matched = pcc_expression_match(expr, matcher); + if (!matched) + return 0; + } + + return 1; +} + +static int +pcc_cond_match(struct pcc_match_rule *rule, struct pcc_matcher *matcher) +{ + struct pcc_conjunction *conjunction; + int matched; + + list_for_each_entry(conjunction, &rule->pmr_conds, pc_linkage) { + matched = pcc_conjunction_match(conjunction, matcher); + if (matched) + return 1; + } + + return 0; +} + +struct pcc_dataset* +pcc_dataset_match_get(struct pcc_super *super, struct pcc_matcher *matcher) +{ + struct pcc_dataset *dataset; + struct pcc_dataset *selected = NULL; + + down_read(&super->pccs_rw_sem); 
+ list_for_each_entry(dataset, &super->pccs_datasets, pccd_linkage) { + if (!(dataset->pccd_flags & PCC_DATASET_RWPCC)) + continue; + + if (pcc_cond_match(&dataset->pccd_rule, matcher)) { + atomic_inc(&dataset->pccd_refcount); + selected = dataset; + break; + } + } + up_read(&super->pccs_rw_sem); + if (selected) + CDEBUG(D_CACHE, "PCC create, matched %s - %d:%d:%d:%s\n", + dataset->pccd_rule.pmr_conds_str, + matcher->pm_uid, matcher->pm_gid, + matcher->pm_projid, matcher->pm_name->name); + + return selected; +} + +/** + * pcc_dataset_add - Add a Cache policy to control which files need be + * cached and where it will be cached. + * + * @super: superblock of pcc + * @cmd: pcc command + */ +static int +pcc_dataset_add(struct pcc_super *super, struct pcc_cmd *cmd) +{ + char *pathname = cmd->pccc_pathname; + struct pcc_dataset *dataset; + struct pcc_dataset *tmp; + bool found = false; + int rc; + + OBD_ALLOC_PTR(dataset); + if (dataset == NULL) + return -ENOMEM; + + rc = kern_path(pathname, LOOKUP_DIRECTORY, &dataset->pccd_path); + if (unlikely(rc)) { + OBD_FREE_PTR(dataset); + return rc; + } + strncpy(dataset->pccd_pathname, pathname, PATH_MAX); + dataset->pccd_rwid = cmd->u.pccc_add.pccc_rwid; + dataset->pccd_roid = cmd->u.pccc_add.pccc_roid; + dataset->pccd_flags = cmd->u.pccc_add.pccc_flags; + atomic_set(&dataset->pccd_refcount, 1); + + rc = pcc_dataset_rule_init(&dataset->pccd_rule, cmd); + if (rc) { + pcc_dataset_put(dataset); + return rc; + } + + down_write(&super->pccs_rw_sem); + list_for_each_entry(tmp, &super->pccs_datasets, pccd_linkage) { + if (strcmp(tmp->pccd_pathname, pathname) == 0 || + (dataset->pccd_rwid != 0 && + dataset->pccd_rwid == tmp->pccd_rwid) || + (dataset->pccd_roid != 0 && + dataset->pccd_roid == tmp->pccd_roid)) { + found = true; + break; + } + } + if (!found) + list_add(&dataset->pccd_linkage, &super->pccs_datasets); + up_write(&super->pccs_rw_sem); + + if (found) { + pcc_dataset_put(dataset); + rc = -EEXIST; + } + + return rc; +} + +struct pcc_dataset * +pcc_dataset_get(struct pcc_super *super, enum lu_pcc_type type, __u32 id) +{ + struct pcc_dataset *dataset; + struct pcc_dataset *selected = NULL; + + if (id == 0) + return NULL; + + /* + * archive ID (read-write ID) or read-only ID is unique in the list, + * we just return last added one as first priority. 
+ */ + down_read(&super->pccs_rw_sem); + list_for_each_entry(dataset, &super->pccs_datasets, pccd_linkage) { + if (type == LU_PCC_READWRITE && (dataset->pccd_rwid != id || + !(dataset->pccd_flags & PCC_DATASET_RWPCC))) + continue; + atomic_inc(&dataset->pccd_refcount); + selected = dataset; + break; + } + up_read(&super->pccs_rw_sem); + if (selected) + CDEBUG(D_CACHE, "matched id %u, PCC mode %d\n", id, type); + + return selected; +} + +void +pcc_dataset_put(struct pcc_dataset *dataset) +{ + if (atomic_dec_and_test(&dataset->pccd_refcount)) { + pcc_dataset_rule_fini(&dataset->pccd_rule); + path_put(&dataset->pccd_path); + OBD_FREE_PTR(dataset); + } +} + +static int +pcc_dataset_del(struct pcc_super *super, char *pathname) +{ + struct list_head *l, *tmp; + struct pcc_dataset *dataset; + int rc = -ENOENT; + + down_write(&super->pccs_rw_sem); + list_for_each_safe(l, tmp, &super->pccs_datasets) { + dataset = list_entry(l, struct pcc_dataset, pccd_linkage); + if (strcmp(dataset->pccd_pathname, pathname) == 0) { + list_del_init(&dataset->pccd_linkage); + pcc_dataset_put(dataset); + super->pccs_generation++; + rc = 0; + break; + } + } + up_write(&super->pccs_rw_sem); + return rc; +} + +static void +pcc_dataset_dump(struct pcc_dataset *dataset, struct seq_file *m) +{ + seq_printf(m, "%s:\n", dataset->pccd_pathname); + seq_printf(m, " rwid: %u\n", dataset->pccd_rwid); + seq_printf(m, " flags: %x\n", dataset->pccd_flags); + seq_printf(m, " autocache: %s\n", dataset->pccd_rule.pmr_conds_str); +} + +int +pcc_super_dump(struct pcc_super *super, struct seq_file *m) +{ + struct pcc_dataset *dataset; + + down_read(&super->pccs_rw_sem); + list_for_each_entry(dataset, &super->pccs_datasets, pccd_linkage) { + pcc_dataset_dump(dataset, m); + } + up_read(&super->pccs_rw_sem); + return 0; +} + +static void pcc_remove_datasets(struct pcc_super *super) +{ + struct pcc_dataset *dataset, *tmp; + + down_write(&super->pccs_rw_sem); + list_for_each_entry_safe(dataset, tmp, + &super->pccs_datasets, pccd_linkage) { + list_del(&dataset->pccd_linkage); + pcc_dataset_put(dataset); + } + super->pccs_generation++; + up_write(&super->pccs_rw_sem); +} + +void pcc_super_fini(struct pcc_super *super) +{ + pcc_remove_datasets(super); + put_cred(super->pccs_cred); +} + +static bool pathname_is_valid(const char *pathname) +{ + /* Needs to be absolute path */ + if (pathname == NULL || strlen(pathname) == 0 || + strlen(pathname) >= PATH_MAX || pathname[0] != '/') + return false; + return true; +} + +static struct pcc_cmd * +pcc_cmd_parse(char *buffer, unsigned long count) +{ + static struct pcc_cmd *cmd; + char *token; + char *val; + int rc = 0; + + OBD_ALLOC_PTR(cmd); + if (cmd == NULL) + GOTO(out, rc = -ENOMEM); + + /* clear all setting */ + if (strncmp(buffer, "clear", 5) == 0) { + cmd->pccc_cmd = PCC_CLEAR_ALL; + GOTO(out, rc = 0); + } + + val = buffer; + token = strsep(&val, " "); + if (val == NULL || strlen(val) == 0) + GOTO(out_free_cmd, rc = -EINVAL); + + /* Type of the command */ + if (strcmp(token, "add") == 0) + cmd->pccc_cmd = PCC_ADD_DATASET; + else if (strcmp(token, "del") == 0) + cmd->pccc_cmd = PCC_DEL_DATASET; + else + GOTO(out_free_cmd, rc = -EINVAL); + + /* Pathname of the dataset */ + token = strsep(&val, " "); + if ((val == NULL && cmd->pccc_cmd != PCC_DEL_DATASET) || + !pathname_is_valid(token)) + GOTO(out_free_cmd, rc = -EINVAL); + cmd->pccc_pathname = token; + + if (cmd->pccc_cmd == PCC_ADD_DATASET) { + /* List of ID */ + LASSERT(val); + token = val; + val = strrchr(token, '}'); + if (!val) + 
GOTO(out_free_cmd, rc = -EINVAL); + + /* Skip '}' */ + val++; + if (*val == '\0') { + val = NULL; + } else if (*val == ' ') { + *val = '\0'; + val++; + } else { + GOTO(out_free_cmd, rc = -EINVAL); + } + + rc = pcc_id_parse(cmd, token); + if (rc) + GOTO(out_free_cmd, rc); + + rc = pcc_parse_value_pairs(cmd, val); + if (rc) + GOTO(out_cmd_fini, rc = -EINVAL); + } + goto out; +out_cmd_fini: + pcc_cmd_fini(cmd); +out_free_cmd: + OBD_FREE_PTR(cmd); +out: + if (rc) + cmd = ERR_PTR(rc); + return cmd; +} + +int pcc_cmd_handle(char *buffer, unsigned long count, + struct pcc_super *super) +{ + int rc = 0; + struct pcc_cmd *cmd; + + cmd = pcc_cmd_parse(buffer, count); + if (IS_ERR(cmd)) + return PTR_ERR(cmd); + + switch (cmd->pccc_cmd) { + case PCC_ADD_DATASET: + rc = pcc_dataset_add(super, cmd); + break; + case PCC_DEL_DATASET: + rc = pcc_dataset_del(super, cmd->pccc_pathname); + break; + case PCC_CLEAR_ALL: + pcc_remove_datasets(super); + break; + default: + rc = -EINVAL; + break; + } + + pcc_cmd_fini(cmd); + OBD_FREE_PTR(cmd); + return rc; +} + +static inline void pcc_inode_lock(struct inode *inode) +{ + mutex_lock(&ll_i2info(inode)->lli_pcc_lock); +} + +static inline void pcc_inode_unlock(struct inode *inode) +{ + mutex_unlock(&ll_i2info(inode)->lli_pcc_lock); +} + +static void pcc_inode_init(struct pcc_inode *pcci, struct ll_inode_info *lli) +{ + pcci->pcci_lli = lli; + lli->lli_pcc_inode = pcci; + atomic_set(&pcci->pcci_refcount, 0); + pcci->pcci_type = LU_PCC_NONE; + pcci->pcci_layout_gen = CL_LAYOUT_GEN_NONE; + atomic_set(&pcci->pcci_active_ios, 0); + init_waitqueue_head(&pcci->pcci_waitq); +} + +static void pcc_inode_fini(struct pcc_inode *pcci) +{ + struct ll_inode_info *lli = pcci->pcci_lli; + + path_put(&pcci->pcci_path); + pcci->pcci_type = LU_PCC_NONE; + OBD_SLAB_FREE_PTR(pcci, pcc_inode_slab); + lli->lli_pcc_inode = NULL; +} + +static void pcc_inode_get(struct pcc_inode *pcci) +{ + atomic_inc(&pcci->pcci_refcount); +} + +static void pcc_inode_put(struct pcc_inode *pcci) +{ + if (atomic_dec_and_test(&pcci->pcci_refcount)) + pcc_inode_fini(pcci); +} + +void pcc_inode_free(struct inode *inode) +{ + struct pcc_inode *pcci = ll_i2pcci(inode); + + if (pcci) { + WARN_ON(atomic_read(&pcci->pcci_refcount) > 1); + pcc_inode_put(pcci); + } +} + +/* + * TODO: + * As Andreas suggested, we'd better use new layout to + * reduce overhead: + * (fid->f_oid >> 16 & oxFFFF)/FID + */ +#define PCC_DATASET_MAX_PATH (6 * 5 + FID_NOBRACE_LEN + 1) +static int pcc_fid2dataset_path(char *buf, int sz, struct lu_fid *fid) +{ + return scnprintf(buf, sz, "%04x/%04x/%04x/%04x/%04x/%04x/" + DFID_NOBRACE, + (fid)->f_oid & 0xFFFF, + (fid)->f_oid >> 16 & 0xFFFF, + (unsigned int)((fid)->f_seq & 0xFFFF), + (unsigned int)((fid)->f_seq >> 16 & 0xFFFF), + (unsigned int)((fid)->f_seq >> 32 & 0xFFFF), + (unsigned int)((fid)->f_seq >> 48 & 0xFFFF), + PFID(fid)); +} + +static inline const struct cred *pcc_super_cred(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_pcc_super.pccs_cred; +} + +void pcc_file_init(struct pcc_file *pccf) +{ + pccf->pccf_file = NULL; + pccf->pccf_type = LU_PCC_NONE; +} + +static inline bool pcc_auto_attach_enabled(enum pcc_dataset_flags flags, + enum pcc_io_type iot) +{ + if (iot == PIT_OPEN) + return flags & PCC_DATASET_OPEN_ATTACH; + if (iot == PIT_GETATTR) + return flags & PCC_DATASET_STAT_ATTACH; + else + return flags & PCC_DATASET_AUTO_ATTACH; +} + +static const char pcc_xattr_layout[] = XATTR_USER_PREFIX "PCC.layout"; + +static int pcc_layout_xattr_set(struct pcc_inode *pcci, __u32 gen) +{ + 
struct dentry *pcc_dentry = pcci->pcci_path.dentry; + struct ll_inode_info *lli = pcci->pcci_lli; + int rc; + + ENTRY; + + if (!(lli->lli_pcc_dsflags & PCC_DATASET_AUTO_ATTACH)) + RETURN(0); + + rc = ll_vfs_setxattr(pcc_dentry, pcc_dentry->d_inode, pcc_xattr_layout, + &gen, sizeof(gen), 0); + + RETURN(rc); +} + +static int pcc_get_layout_info(struct inode *inode, struct cl_layout *clt) +{ + struct lu_env *env; + struct ll_inode_info *lli = ll_i2info(inode); + __u16 refcheck; + int rc; + + ENTRY; + + if (!lli->lli_clob) + RETURN(-EINVAL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_object_layout_get(env, lli->lli_clob, clt); + if (rc < 0) + CDEBUG(D_INODE, "Cannot get layout for "DFID"\n", + PFID(ll_inode2fid(inode))); + + cl_env_put(env, &refcheck); + RETURN(rc < 0 ? rc : 0); +} + +/* Must be called with pcci->pcci_lock held */ +static void pcc_inode_attach_init(struct pcc_dataset *dataset, + struct pcc_inode *pcci, + struct dentry *dentry, + enum lu_pcc_type type) +{ + pcci->pcci_path.mnt = mntget(dataset->pccd_path.mnt); + pcci->pcci_path.dentry = dentry; + LASSERT(atomic_read(&pcci->pcci_refcount) == 0); + atomic_set(&pcci->pcci_refcount, 1); + pcci->pcci_type = type; + pcci->pcci_attr_valid = false; +} + +static inline void pcc_inode_dsflags_set(struct ll_inode_info *lli, + struct pcc_dataset *dataset) +{ + lli->lli_pcc_generation = ll_info2pccs(lli)->pccs_generation; + lli->lli_pcc_dsflags = dataset->pccd_flags; +} + +static void pcc_inode_attach_set(struct pcc_super *super, + struct pcc_dataset *dataset, + struct ll_inode_info *lli, + struct pcc_inode *pcci, + struct dentry *dentry, + enum lu_pcc_type type) +{ + pcc_inode_init(pcci, lli); + pcc_inode_attach_init(dataset, pcci, dentry, type); + down_read(&super->pccs_rw_sem); + pcc_inode_dsflags_set(lli, dataset); + up_read(&super->pccs_rw_sem); +} + +static inline void pcc_layout_gen_set(struct pcc_inode *pcci, + __u32 gen) +{ + pcci->pcci_layout_gen = gen; +} + +static inline bool pcc_inode_has_layout(struct pcc_inode *pcci) +{ + return pcci->pcci_layout_gen != CL_LAYOUT_GEN_NONE; +} + +static struct dentry *pcc_lookup(struct dentry *base, char *pathname) +{ + char *ptr = NULL, *component; + struct dentry *parent; + struct dentry *child = ERR_PTR(-ENOENT); + + ptr = pathname; + + /* move past any initial '/' to the start of the first path component*/ + while (*ptr == '/') + ptr++; + + /* store the start of the first path component */ + component = ptr; + + parent = dget(base); + while (ptr) { + /* find the start of the next component - if we don't find it, + * the current component is the last component + */ + ptr = strchr(ptr, '/'); + /* put a NUL char in place of the '/' before the next compnent + * so we can treat this component as a string; note the full + * path string is NUL terminated to this is not needed for the + * last component + */ + if (ptr) + *ptr = '\0'; + + /* look up the current component */ + inode_lock(parent->d_inode); + child = lookup_one_len(component, parent, strlen(component)); + inode_unlock(parent->d_inode); + + /* repair the path string: put '/' back in place of the NUL */ + if (ptr) + *ptr = '/'; + + dput(parent); + + if (IS_ERR_OR_NULL(child)) + break; + + /* we may find a cached negative dentry */ + if (!d_is_positive(child)) { + dput(child); + child = NULL; + break; + } + + /* descend in to the next level of the path */ + parent = child; + + /* move the pointer past the '/' to the next component */ + if (ptr) + ptr++; + component = ptr; + } + + /* NULL child 
means we didn't find anything */ + if (!child) + child = ERR_PTR(-ENOENT); + + return child; +} + +static int pcc_try_dataset_attach(struct inode *inode, __u32 gen, + enum lu_pcc_type type, + struct pcc_dataset *dataset, + bool *cached) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct pcc_inode *pcci = lli->lli_pcc_inode; + const struct cred *old_cred; + struct dentry *pcc_dentry = NULL; + char pathname[PCC_DATASET_MAX_PATH]; + __u32 pcc_gen; + int rc; + + ENTRY; + + if (type == LU_PCC_READWRITE && + !(dataset->pccd_flags & PCC_DATASET_RWPCC)) + RETURN(0); + + rc = pcc_fid2dataset_path(pathname, PCC_DATASET_MAX_PATH, + &lli->lli_fid); + + old_cred = override_creds(pcc_super_cred(inode->i_sb)); + pcc_dentry = pcc_lookup(dataset->pccd_path.dentry, pathname); + if (IS_ERR(pcc_dentry)) { + rc = PTR_ERR(pcc_dentry); + CDEBUG(D_CACHE, "%s: path lookup error on "DFID":%s: rc = %d\n", + ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid), + pathname, rc); + /* ignore this error */ + GOTO(out, rc = 0); + } + + rc = ll_vfs_getxattr(pcc_dentry, pcc_dentry->d_inode, pcc_xattr_layout, + &pcc_gen, sizeof(pcc_gen)); + if (rc < 0) + /* ignore this error */ + GOTO(out_put_pcc_dentry, rc = 0); + + rc = 0; + /* The file is still valid cached in PCC, attach it immediately. */ + if (pcc_gen == gen) { + CDEBUG(D_CACHE, DFID" L.Gen (%d) consistent, auto attached.\n", + PFID(&lli->lli_fid), gen); + if (!pcci) { + OBD_SLAB_ALLOC_PTR_GFP(pcci, pcc_inode_slab, GFP_NOFS); + if (pcci == NULL) + GOTO(out_put_pcc_dentry, rc = -ENOMEM); + + pcc_inode_init(pcci, lli); + dget(pcc_dentry); + pcc_inode_attach_init(dataset, pcci, pcc_dentry, type); + } else { + /* + * This happened when a file was once attached into + * PCC, and some processes keep this file opened + * (pcci->refcount > 1) and corresponding PCC file + * without any I/O activity, and then this file was + * detached by the manual detach command or the + * revocation of the layout lock (i.e. cached LRU lock + * shrinking). + */ + pcc_inode_get(pcci); + pcci->pcci_type = type; + } + pcc_inode_dsflags_set(lli, dataset); + pcc_layout_gen_set(pcci, gen); + *cached = true; + } +out_put_pcc_dentry: + dput(pcc_dentry); +out: + revert_creds(old_cred); + RETURN(rc); +} + +static int pcc_try_datasets_attach(struct inode *inode, enum pcc_io_type iot, + __u32 gen, enum lu_pcc_type type, + bool *cached) +{ + struct pcc_super *super = &ll_i2sbi(inode)->ll_pcc_super; + struct ll_inode_info *lli = ll_i2info(inode); + struct pcc_dataset *dataset = NULL, *tmp; + int rc = 0; + + ENTRY; + + down_read(&super->pccs_rw_sem); + list_for_each_entry_safe(dataset, tmp, + &super->pccs_datasets, pccd_linkage) { + if (!pcc_auto_attach_enabled(dataset->pccd_flags, iot)) + break; + + rc = pcc_try_dataset_attach(inode, gen, type, dataset, cached); + if (rc < 0 || (!rc && *cached)) + break; + } + + /* + * Update the saved dataset flags for the inode accordingly if failed. + */ + if (!rc && !*cached) { + /* + * Currently auto attach strategy for a PCC backend is + * unchangeable once once it was added into the PCC datasets on + * a client as the support to change auto attach strategy is + * not implemented yet. 
+ */ + /* + * If we tried to attach from one PCC backend: + * @lli_pcc_generation > 0: + * 1) The file was once attached into PCC, but the + * corresponding PCC backend has since been removed from the client; + * 2) The layout generation was changed and the data has been + * restored; + * 3) The corresponding PCC copy does not exist on PCC; + * @lli_pcc_generation == 0: + * The file was never attached into PCC but is in an HSM released + * state, or once attached into PCC but the inode was evicted + * from icache later. + * Set the saved dataset flags to PCC_DATASET_NONE. Then this + * file will be skipped as a candidate for auto attach until + * the file is attached into PCC again. + * + * If the file was never attached into PCC, or once attached but + * its inode was evicted from icache (lli_pcc_generation == 0), + * or the corresponding dataset was removed from the client, + * set the saved dataset flags to PCC_DATASET_NONE. + * + * TODO: If the file was once attached into PCC but does not try to + * auto attach due to a change of the configuration parameters + * for this dataset (i.e. a change from auto attach enabled to + * auto attach disabled for this dataset), update the saved + * dataset flags with the found one. + */ + lli->lli_pcc_dsflags = PCC_DATASET_NONE; + } + up_read(&super->pccs_rw_sem); + + RETURN(rc); +} + +/* + * TODO: For RW-PCC, it is desirable to store HSM info as a layout (LU-10606). + * Thus the client can get archive ID from the layout directly. When trying to + * attach a file automatically which is in HSM released state (according to + * LOV_PATTERN_F_RELEASED in the layout), it can determine whether the file is + * validly cached on PCC more precisely according to the @rwid (archive ID) in + * the PCC dataset and the archive ID in HSM attrs. + */ +static int pcc_try_auto_attach(struct inode *inode, bool *cached, + enum pcc_io_type iot) +{ + struct pcc_super *super = &ll_i2sbi(inode)->ll_pcc_super; + struct cl_layout clt = { + .cl_layout_gen = 0, + .cl_is_released = false, + }; + struct ll_inode_info *lli = ll_i2info(inode); + __u32 gen; + int rc; + + ENTRY; + + /* + * Quick check whether there is a PCC device. + */ + if (list_empty(&super->pccs_datasets)) + RETURN(0); + + /* + * The file layout lock was cancelled, and this open does not + * obtain a valid layout lock from MDT (i.e. the file is being + * HSM restored). + */ + if (iot == PIT_OPEN) { + if (ll_layout_version_get(lli) == CL_LAYOUT_GEN_NONE) + RETURN(0); + } else { + rc = ll_layout_refresh(inode, &gen); + if (rc) + RETURN(rc); + } + + rc = pcc_get_layout_info(inode, &clt); + if (rc) + RETURN(rc); + + if (iot != PIT_OPEN && gen != clt.cl_layout_gen) { + CDEBUG(D_CACHE, DFID" layout changed from %d to %d.\n", + PFID(ll_inode2fid(inode)), gen, clt.cl_layout_gen); + RETURN(-EINVAL); + } + + if (clt.cl_is_released) + rc = pcc_try_datasets_attach(inode, iot, clt.cl_layout_gen, + LU_PCC_READWRITE, cached); + + RETURN(rc); +} + +static inline bool pcc_may_auto_attach(struct inode *inode, + enum pcc_io_type iot) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct pcc_super *super = ll_i2pccs(inode); + + /* The file is known not to be in any PCC backend. */ + if (lli->lli_pcc_dsflags & PCC_DATASET_NONE) + return false; + + /* + * lli_pcc_generation == 0 means that the file was never attached into + * PCC, or may have been attached into PCC once but detached as the inode was + * evicted from icache (i.e.
"echo 3 > /proc/sys/vm/drop_caches" or + * icache shrinking due to the memory pressure), which will cause the + * file detach from PCC when releasing the inode from icache. + * In either case, we still try to attach. + */ + /* lli_pcc_generation == 0, or the PCC setting was changed, + * or there is no PCC setup on the client and the try will return + * immediately in pcc_try_auto_attach(). + */ + if (super->pccs_generation != lli->lli_pcc_generation) + return true; + + /* The cached setting @lli_pcc_dsflags is valid */ + if (iot == PIT_OPEN) + return lli->lli_pcc_dsflags & PCC_DATASET_OPEN_ATTACH; + + if (iot == PIT_GETATTR) + return lli->lli_pcc_dsflags & PCC_DATASET_STAT_ATTACH; + + return lli->lli_pcc_dsflags & PCC_DATASET_IO_ATTACH; +} + +int pcc_file_open(struct inode *inode, struct file *file) +{ + struct pcc_inode *pcci; + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = file->private_data; + struct pcc_file *pccf = &fd->fd_pcc_file; + struct file *pcc_file; + struct path *path; + bool cached = false; + int rc = 0; + + ENTRY; + + if (!S_ISREG(inode->i_mode)) + RETURN(0); + + if (IS_ENCRYPTED(inode)) + RETURN(0); + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + + if (lli->lli_pcc_state & PCC_STATE_FL_ATTACHING) + GOTO(out_unlock, rc = 0); + + if (!pcci || !pcc_inode_has_layout(pcci)) { + if (pcc_may_auto_attach(inode, PIT_OPEN)) + rc = pcc_try_auto_attach(inode, &cached, PIT_OPEN); + + if (rc < 0 || !cached) + GOTO(out_unlock, rc); + + if (!pcci) + pcci = ll_i2pcci(inode); + } + + pcc_inode_get(pcci); + WARN_ON(pccf->pccf_file); + + path = &pcci->pcci_path; + CDEBUG(D_CACHE, "opening pcc file '%pd'\n", path->dentry); + + pcc_file = dentry_open(path, file->f_flags, + pcc_super_cred(inode->i_sb)); + if (IS_ERR_OR_NULL(pcc_file)) { + rc = pcc_file == NULL ? 
-EINVAL : PTR_ERR(pcc_file); + pcc_inode_put(pcci); + } else { + pccf->pccf_file = pcc_file; + pccf->pccf_type = pcci->pcci_type; + } + +out_unlock: + pcc_inode_unlock(inode); + RETURN(rc); +} + +void pcc_file_release(struct inode *inode, struct file *file) +{ + struct pcc_inode *pcci; + struct ll_file_data *fd = file->private_data; + struct pcc_file *pccf; + struct path *path; + + ENTRY; + + if (!S_ISREG(inode->i_mode) || fd == NULL) + RETURN_EXIT; + + pccf = &fd->fd_pcc_file; + pcc_inode_lock(inode); + if (pccf->pccf_file == NULL) + goto out; + + pcci = ll_i2pcci(inode); + LASSERT(pcci); + path = &pcci->pcci_path; + CDEBUG(D_CACHE, "releasing pcc file \"%pd\"\n", path->dentry); + pcc_inode_put(pcci); + fput(pccf->pccf_file); + pccf->pccf_file = NULL; +out: + pcc_inode_unlock(inode); + RETURN_EXIT; +} + +static void pcc_io_init(struct inode *inode, enum pcc_io_type iot, bool *cached) +{ + struct pcc_inode *pcci; + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) { + LASSERT(atomic_read(&pcci->pcci_refcount) > 0); + atomic_inc(&pcci->pcci_active_ios); + *cached = true; + } else { + *cached = false; + if (pcc_may_auto_attach(inode, iot)) { + (void) pcc_try_auto_attach(inode, cached, iot); + if (*cached) { + pcci = ll_i2pcci(inode); + LASSERT(atomic_read(&pcci->pcci_refcount) > 0); + atomic_inc(&pcci->pcci_active_ios); + } + } + } + pcc_inode_unlock(inode); +} + +static void pcc_io_fini(struct inode *inode) +{ + struct pcc_inode *pcci = ll_i2pcci(inode); + + LASSERT(pcci && atomic_read(&pcci->pcci_active_ios) > 0); + if (atomic_dec_and_test(&pcci->pcci_active_ios)) + wake_up(&pcci->pcci_waitq); +} + + +static ssize_t +__pcc_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER + return file->f_op->read_iter(iocb, iter); +#else + struct iovec iov; + struct iov_iter i; + ssize_t bytes = 0; + + iov_for_each(iov, i, *iter) { + ssize_t res; + + res = file->f_op->aio_read(iocb, &iov, 1, iocb->ki_pos); + if (-EIOCBQUEUED == res) + res = wait_on_sync_kiocb(iocb); + if (res <= 0) { + if (bytes == 0) + bytes = res; + break; + } + + bytes += res; + if (res < iov.iov_len) + break; + } + + if (bytes > 0) + iov_iter_advance(iter, bytes); + return bytes; +#endif +} + +ssize_t pcc_file_read_iter(struct kiocb *iocb, + struct iov_iter *iter, bool *cached) +{ + struct file *file = iocb->ki_filp; + struct ll_file_data *fd = file->private_data; + struct pcc_file *pccf = &fd->fd_pcc_file; + struct inode *inode = file_inode(file); + ssize_t result; + + ENTRY; + + if (pccf->pccf_file == NULL) { + *cached = false; + RETURN(0); + } + + pcc_io_init(inode, PIT_READ, cached); + if (!*cached) + RETURN(0); + + iocb->ki_filp = pccf->pccf_file; + /* generic_file_aio_read does not support ext4-dax, + * __pcc_file_read_iter uses ->aio_read hook directly + * to add support for ext4-dax. 
+ */ + result = __pcc_file_read_iter(iocb, iter); + iocb->ki_filp = file; + + pcc_io_fini(inode); + RETURN(result); +} + +static ssize_t +__pcc_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER + return file->f_op->write_iter(iocb, iter); +#else + struct iovec iov; + struct iov_iter i; + ssize_t bytes = 0; + + iov_for_each(iov, i, *iter) { + ssize_t res; + + res = file->f_op->aio_write(iocb, &iov, 1, iocb->ki_pos); + if (-EIOCBQUEUED == res) + res = wait_on_sync_kiocb(iocb); + if (res <= 0) { + if (bytes == 0) + bytes = res; + break; + } + + bytes += res; + if (res < iov.iov_len) + break; + } + + if (bytes > 0) + iov_iter_advance(iter, bytes); + return bytes; +#endif +} + +ssize_t pcc_file_write_iter(struct kiocb *iocb, + struct iov_iter *iter, bool *cached) +{ + struct file *file = iocb->ki_filp; + struct ll_file_data *fd = file->private_data; + struct pcc_file *pccf = &fd->fd_pcc_file; + struct inode *inode = file_inode(file); + ssize_t result; + + ENTRY; + + if (pccf->pccf_file == NULL) { + *cached = false; + RETURN(0); + } + + if (pccf->pccf_type != LU_PCC_READWRITE) { + *cached = false; + RETURN(-EAGAIN); + } + + pcc_io_init(inode, PIT_WRITE, cached); + if (!*cached) + RETURN(0); + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_PCC_FAKE_ERROR)) + GOTO(out, result = -ENOSPC); + + iocb->ki_filp = pccf->pccf_file; + + /* Since __pcc_file_write_iter makes write calls via + * the normal vfs interface to the local PCC file system, + * the inode lock is not needed. + */ + result = __pcc_file_write_iter(iocb, iter); + iocb->ki_filp = file; +out: + pcc_io_fini(inode); + RETURN(result); +} + +int pcc_inode_setattr(struct inode *inode, struct iattr *attr, + bool *cached) +{ + int rc; + const struct cred *old_cred; + struct iattr attr2 = *attr; + struct dentry *pcc_dentry; + struct pcc_inode *pcci; + + ENTRY; + + if (!S_ISREG(inode->i_mode)) { + *cached = false; + RETURN(0); + } + + pcc_io_init(inode, PIT_SETATTR, cached); + if (!*cached) + RETURN(0); + + attr2.ia_valid = attr->ia_valid & (ATTR_SIZE | ATTR_ATIME | + ATTR_ATIME_SET | ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME | ATTR_UID | ATTR_GID); + pcci = ll_i2pcci(inode); + pcc_dentry = pcci->pcci_path.dentry; + inode_lock(pcc_dentry->d_inode); + old_cred = override_creds(pcc_super_cred(inode->i_sb)); +#ifdef HAVE_USER_NAMESPACE_ARG + rc = pcc_dentry->d_inode->i_op->setattr(&init_user_ns, pcc_dentry, + &attr2); +#else + rc = pcc_dentry->d_inode->i_op->setattr(pcc_dentry, &attr2); +#endif + revert_creds(old_cred); + inode_unlock(pcc_dentry->d_inode); + + pcc_io_fini(inode); + RETURN(rc); +} + +int pcc_inode_getattr(struct inode *inode, u32 request_mask, + unsigned int flags, bool *cached) +{ + struct ll_inode_info *lli = ll_i2info(inode); + const struct cred *old_cred; + struct kstat stat; + s64 atime; + s64 mtime; + s64 ctime; + int rc; + + ENTRY; + + if (!S_ISREG(inode->i_mode)) { + *cached = false; + RETURN(0); + } + + pcc_io_init(inode, PIT_GETATTR, cached); + if (!*cached) + RETURN(0); + + old_cred = override_creds(pcc_super_cred(inode->i_sb)); + rc = ll_vfs_getattr(&ll_i2pcci(inode)->pcci_path, &stat, request_mask, + flags); + revert_creds(old_cred); + if (rc) + GOTO(out, rc); + + ll_inode_size_lock(inode); + if (test_and_clear_bit(LLIF_UPDATE_ATIME, &lli->lli_flags) || + inode->i_atime.tv_sec < lli->lli_atime) + inode->i_atime.tv_sec = lli->lli_atime; + + inode->i_mtime.tv_sec = lli->lli_mtime; + inode->i_ctime.tv_sec = lli->lli_ctime; + + atime = 
inode->i_atime.tv_sec; + mtime = inode->i_mtime.tv_sec; + ctime = inode->i_ctime.tv_sec; + + if (atime < stat.atime.tv_sec) + atime = stat.atime.tv_sec; + + if (ctime < stat.ctime.tv_sec) + ctime = stat.ctime.tv_sec; + + if (mtime < stat.mtime.tv_sec) + mtime = stat.mtime.tv_sec; + + i_size_write(inode, stat.size); + inode->i_blocks = stat.blocks; + + inode->i_atime.tv_sec = atime; + inode->i_mtime.tv_sec = mtime; + inode->i_ctime.tv_sec = ctime; + + ll_inode_size_unlock(inode); +out: + pcc_io_fini(inode); + RETURN(rc); +} + +#ifdef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT +ssize_t pcc_file_splice_read(struct file *in_file, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t count, unsigned int flags) +{ + struct inode *inode = file_inode(in_file); + struct ll_file_data *fd = in_file->private_data; + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + bool cached = false; + ssize_t result; + + ENTRY; + + if (!pcc_file) + RETURN(default_file_splice_read(in_file, ppos, pipe, + count, flags)); + + pcc_io_init(inode, PIT_SPLICE_READ, &cached); + if (!cached) + RETURN(default_file_splice_read(in_file, ppos, pipe, + count, flags)); + + result = default_file_splice_read(pcc_file, ppos, pipe, count, flags); + + pcc_io_fini(inode); + RETURN(result); +} +#endif /* HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT */ + +int pcc_fsync(struct file *file, loff_t start, loff_t end, + int datasync, bool *cached) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + int rc; + + ENTRY; + + if (!pcc_file) { + *cached = false; + RETURN(0); + } + + pcc_io_init(inode, PIT_FSYNC, cached); + if (!*cached) + RETURN(0); + + rc = file_inode(pcc_file)->i_fop->fsync(pcc_file, + start, end, datasync); + + pcc_io_fini(inode); + RETURN(rc); +} + +int pcc_file_mmap(struct file *file, struct vm_area_struct *vma, + bool *cached) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct pcc_inode *pcci; + int rc = 0; + + ENTRY; + + if (!pcc_file || !file_inode(pcc_file)->i_fop->mmap) { + *cached = false; + RETURN(0); + } + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) { + LASSERT(atomic_read(&pcci->pcci_refcount) > 1); + *cached = true; + vma->vm_file = pcc_file; + rc = file_inode(pcc_file)->i_fop->mmap(pcc_file, vma); + vma->vm_file = file; + /* Save the vm ops of backend PCC */ + vma->vm_private_data = (void *)vma->vm_ops; + } else { + *cached = false; + } + pcc_inode_unlock(inode); + + RETURN(rc); +} + +void pcc_vm_open(struct vm_area_struct *vma) +{ + struct pcc_inode *pcci; + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct vm_operations_struct *pcc_vm_ops = vma->vm_private_data; + + ENTRY; + + if (!pcc_file || !pcc_vm_ops || !pcc_vm_ops->open) + RETURN_EXIT; + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) { + vma->vm_file = pcc_file; + pcc_vm_ops->open(vma); + vma->vm_file = file; + } + pcc_inode_unlock(inode); + EXIT; +} + +void pcc_vm_close(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct vm_operations_struct *pcc_vm_ops = vma->vm_private_data; + 
+ ENTRY; + + if (!pcc_file || !pcc_vm_ops || !pcc_vm_ops->close) + RETURN_EXIT; + + pcc_inode_lock(inode); + /* Layout lock may be revoked here */ + vma->vm_file = pcc_file; + pcc_vm_ops->close(vma); + vma->vm_file = file; + pcc_inode_unlock(inode); + EXIT; +} + +int pcc_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + bool *cached) +{ + struct page *page = vmf->page; + struct mm_struct *mm = vma->vm_mm; + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct vm_operations_struct *pcc_vm_ops = vma->vm_private_data; + int rc; + + ENTRY; + + if (!pcc_file || !pcc_vm_ops) { + *cached = false; + RETURN(0); + } + + if (!pcc_vm_ops->page_mkwrite && + page->mapping == pcc_file->f_mapping) { + CDEBUG(D_MMAP, + "%s: PCC backend fs not support ->page_mkwrite()\n", + ll_i2sbi(inode)->ll_fsname); + pcc_ioctl_detach(inode, PCC_DETACH_OPT_UNCACHE); + mmap_read_unlock(mm); + *cached = true; + RETURN(VM_FAULT_RETRY | VM_FAULT_NOPAGE); + } + /* Pause to allow for a race with concurrent detach */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_PCC_MKWRITE_PAUSE, cfs_fail_val); + + pcc_io_init(inode, PIT_PAGE_MKWRITE, cached); + if (!*cached) { + /* This happens when the file is detached from PCC after getting + * the fault page via ->fault() on the inode of the PCC copy. + * Here it cannot simply fall back to the normal Lustre I/O path. + * The reason is that the address space of the fault page used by + * ->page_mkwrite() is still the one of the PCC inode. In the + * normal Lustre ->page_mkwrite() I/O path, it will be wrongly + * handled as the address space of the fault page is not + * consistent with the one of the Lustre inode (though the + * fault page was truncated). + * As the file is detached from PCC, the fault page must + * be released first and the mmap write (->fault() and + * ->page_mkwrite()) retried. + * We use an ugly and tricky method: return + * VM_FAULT_NOPAGE | VM_FAULT_RETRY to the caller + * __do_page_fault, which then retries the memory fault handling. + */ + if (page->mapping == pcc_file->f_mapping) { + *cached = true; + mmap_read_unlock(mm); + RETURN(VM_FAULT_RETRY | VM_FAULT_NOPAGE); + } + + RETURN(0); + } + + /* + * This fault injection can also be used to simulate -ENOSPC and + * -EDQUOT failures of the underlying PCC backend fs.
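+ * As an illustration (an assumption based on the usual OBD fault injection
+ * convention, not something added by this file): a test would arm this
+ * path with something like
+ *   lctl set_param fail_loc=<value of OBD_FAIL_LLITE_PCC_DETACH_MKWRITE>
+ * before dirtying an mmap'ed PCC-attached file, and then verify that the
+ * file was detached and the write retried on the normal Lustre path.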
+ */ + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_PCC_DETACH_MKWRITE)) { + pcc_io_fini(inode); + pcc_ioctl_detach(inode, PCC_DETACH_OPT_UNCACHE); + mmap_read_unlock(mm); + RETURN(VM_FAULT_RETRY | VM_FAULT_NOPAGE); + } + + vma->vm_file = pcc_file; +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY + rc = pcc_vm_ops->page_mkwrite(vmf); +#else + rc = pcc_vm_ops->page_mkwrite(vma, vmf); +#endif + vma->vm_file = file; + + pcc_io_fini(inode); + RETURN(rc); +} + +int pcc_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + bool *cached) +{ + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct vm_operations_struct *pcc_vm_ops = vma->vm_private_data; + int rc; + + ENTRY; + + if (!pcc_file || !pcc_vm_ops || !pcc_vm_ops->fault) { + *cached = false; + RETURN(0); + } + + pcc_io_init(inode, PIT_FAULT, cached); + if (!*cached) + RETURN(0); + + vma->vm_file = pcc_file; +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY + rc = pcc_vm_ops->fault(vmf); +#else + rc = pcc_vm_ops->fault(vma, vmf); +#endif + vma->vm_file = file; + + pcc_io_fini(inode); + RETURN(rc); +} + +static void __pcc_layout_invalidate(struct pcc_inode *pcci) +{ + pcci->pcci_type = LU_PCC_NONE; + pcc_layout_gen_set(pcci, CL_LAYOUT_GEN_NONE); + if (atomic_read(&pcci->pcci_active_ios) == 0) + return; + + CDEBUG(D_CACHE, "Waiting for IO completion: %d\n", + atomic_read(&pcci->pcci_active_ios)); + wait_event_idle(pcci->pcci_waitq, + atomic_read(&pcci->pcci_active_ios) == 0); +} + +void pcc_layout_invalidate(struct inode *inode) +{ + struct pcc_inode *pcci; + + ENTRY; + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) { + LASSERT(atomic_read(&pcci->pcci_refcount) > 0); + __pcc_layout_invalidate(pcci); + + CDEBUG(D_CACHE, "Invalidate "DFID" layout gen %d\n", + PFID(&ll_i2info(inode)->lli_fid), pcci->pcci_layout_gen); + + pcc_inode_put(pcci); + } + pcc_inode_unlock(inode); + + EXIT; +} + +static int pcc_inode_remove(struct inode *inode, struct dentry *pcc_dentry) +{ + int rc; + + rc = vfs_unlink(&init_user_ns, + pcc_dentry->d_parent->d_inode, pcc_dentry); + if (rc) + CWARN("%s: failed to unlink PCC file %pd, rc = %d\n", + ll_i2sbi(inode)->ll_fsname, pcc_dentry, rc); + + return rc; +} + +/* Create directory under base if directory does not exist */ +static struct dentry * +pcc_mkdir(struct dentry *base, const char *name, umode_t mode) +{ + int rc; + struct dentry *dentry; + struct inode *dir = base->d_inode; + + inode_lock(dir); + dentry = lookup_one_len(name, base, strlen(name)); + if (IS_ERR(dentry)) + goto out; + + if (d_is_positive(dentry)) + goto out; + + rc = vfs_mkdir(&init_user_ns, dir, dentry, mode); + if (rc) { + dput(dentry); + dentry = ERR_PTR(rc); + goto out; + } +out: + inode_unlock(dir); + return dentry; +} + +static struct dentry * +pcc_mkdir_p(struct dentry *root, char *path, umode_t mode) +{ + char *ptr, *entry_name; + struct dentry *parent; + struct dentry *child = ERR_PTR(-EINVAL); + + ptr = path; + while (*ptr == '/') + ptr++; + + entry_name = ptr; + parent = dget(root); + while ((ptr = strchr(ptr, '/')) != NULL) { + *ptr = '\0'; + child = pcc_mkdir(parent, entry_name, mode); + *ptr = '/'; + dput(parent); + if (IS_ERR(child)) + break; + + parent = child; + ptr++; + entry_name = ptr; + } + + return child; +} + +/* Create file under base. 
If file already exist, return failure */ +static struct dentry * +pcc_create(struct dentry *base, const char *name, umode_t mode) +{ + int rc; + struct dentry *dentry; + struct inode *dir = base->d_inode; + + inode_lock(dir); + dentry = lookup_one_len(name, base, strlen(name)); + if (IS_ERR(dentry)) + goto out; + + if (d_is_positive(dentry)) + goto out; + + rc = vfs_create(&init_user_ns, dir, dentry, mode, false); + if (rc) { + dput(dentry); + dentry = ERR_PTR(rc); + goto out; + } +out: + inode_unlock(dir); + return dentry; +} + +static int __pcc_inode_create(struct pcc_dataset *dataset, + struct lu_fid *fid, + struct dentry **dentry) +{ + char *path; + struct dentry *base; + struct dentry *child; + int rc = 0; + + OBD_ALLOC(path, PCC_DATASET_MAX_PATH); + if (path == NULL) + return -ENOMEM; + + pcc_fid2dataset_path(path, PCC_DATASET_MAX_PATH, fid); + + base = pcc_mkdir_p(dataset->pccd_path.dentry, path, 0); + if (IS_ERR(base)) { + rc = PTR_ERR(base); + GOTO(out, rc); + } + + snprintf(path, PCC_DATASET_MAX_PATH, DFID_NOBRACE, PFID(fid)); + child = pcc_create(base, path, 0); + if (IS_ERR(child)) { + rc = PTR_ERR(child); + GOTO(out_base, rc); + } + *dentry = child; + +out_base: + dput(base); +out: + OBD_FREE(path, PCC_DATASET_MAX_PATH); + return rc; +} + +/* + * Reset uid, gid or size for the PCC copy masked by @valid. + * TODO: Set the project ID for PCC copy. + */ +int pcc_inode_reset_iattr(struct dentry *dentry, unsigned int valid, + kuid_t uid, kgid_t gid, loff_t size) +{ + struct inode *inode = dentry->d_inode; + struct iattr attr; + int rc; + + ENTRY; + + attr.ia_valid = valid; + attr.ia_uid = uid; + attr.ia_gid = gid; + attr.ia_size = size; + + inode_lock(inode); + rc = notify_change(&init_user_ns, dentry, &attr, NULL); + inode_unlock(inode); + + RETURN(rc); +} + +int pcc_inode_create(struct super_block *sb, struct pcc_dataset *dataset, + struct lu_fid *fid, struct dentry **pcc_dentry) +{ + const struct cred *old_cred; + int rc; + + old_cred = override_creds(pcc_super_cred(sb)); + rc = __pcc_inode_create(dataset, fid, pcc_dentry); + revert_creds(old_cred); + return rc; +} + +int pcc_inode_create_fini(struct inode *inode, struct pcc_create_attach *pca) +{ + struct dentry *pcc_dentry = pca->pca_dentry; + struct pcc_super *super = ll_i2pccs(inode); + const struct cred *old_cred; + struct pcc_inode *pcci; + int rc; + + ENTRY; + + if (!pca->pca_dataset) + RETURN(0); + + if (!inode) + GOTO(out_dataset_put, rc = 0); + + LASSERT(pcc_dentry); + + old_cred = override_creds(super->pccs_cred); + pcc_inode_lock(inode); + LASSERT(ll_i2pcci(inode) == NULL); + OBD_SLAB_ALLOC_PTR_GFP(pcci, pcc_inode_slab, GFP_NOFS); + if (pcci == NULL) + GOTO(out_put, rc = -ENOMEM); + + rc = pcc_inode_reset_iattr(pcc_dentry, ATTR_UID | ATTR_GID, + old_cred->suid, old_cred->sgid, 0); + if (rc) + GOTO(out_put, rc); + + pcc_inode_attach_set(super, pca->pca_dataset, ll_i2info(inode), + pcci, pcc_dentry, LU_PCC_READWRITE); + + rc = pcc_layout_xattr_set(pcci, 0); + if (rc) { + (void) pcc_inode_remove(inode, pcci->pcci_path.dentry); + pcc_inode_put(pcci); + GOTO(out_unlock, rc); + } + + /* Set the layout generation of newly created file with 0 */ + pcc_layout_gen_set(pcci, 0); + +out_put: + if (rc) { + (void) pcc_inode_remove(inode, pcc_dentry); + dput(pcc_dentry); + + if (pcci) + OBD_SLAB_FREE_PTR(pcci, pcc_inode_slab); + } +out_unlock: + pcc_inode_unlock(inode); + revert_creds(old_cred); +out_dataset_put: + pcc_dataset_put(pca->pca_dataset); + RETURN(rc); +} + +void pcc_create_attach_cleanup(struct super_block *sb, + struct 
pcc_create_attach *pca) +{ + if (!pca->pca_dataset) + return; + + if (pca->pca_dentry) { + const struct cred *old_cred; + int rc; + + old_cred = override_creds(pcc_super_cred(sb)); + rc = vfs_unlink(&init_user_ns, + pca->pca_dentry->d_parent->d_inode, + pca->pca_dentry); + if (rc) + CWARN("%s: failed to unlink PCC file %pd: rc = %d\n", + ll_s2sbi(sb)->ll_fsname, pca->pca_dentry, rc); + /* ignore the unlink failure */ + revert_creds(old_cred); + dput(pca->pca_dentry); + } + + pcc_dataset_put(pca->pca_dataset); +} + +static int pcc_filp_write(struct file *filp, const void *buf, ssize_t count, + loff_t *offset) +{ + while (count > 0) { + ssize_t size; + + size = cfs_kernel_write(filp, buf, count, offset); + if (size < 0) + return size; + count -= size; + buf += size; + } + return 0; +} + +static ssize_t pcc_copy_data(struct file *src, struct file *dst) +{ + ssize_t rc = 0; + ssize_t rc2; + loff_t pos, offset = 0; + size_t buf_len = 1048576; + void *buf; + + ENTRY; + + OBD_ALLOC_LARGE(buf, buf_len); + if (buf == NULL) + RETURN(-ENOMEM); + + while (1) { + if (signal_pending(current)) + GOTO(out_free, rc = -EINTR); + + pos = offset; + rc2 = cfs_kernel_read(src, buf, buf_len, &pos); + if (rc2 < 0) + GOTO(out_free, rc = rc2); + else if (rc2 == 0) + break; + + pos = offset; + rc = pcc_filp_write(dst, buf, rc2, &pos); + if (rc < 0) + GOTO(out_free, rc); + offset += rc2; + } + + rc = offset; +out_free: + OBD_FREE_LARGE(buf, buf_len); + RETURN(rc); +} + +static int pcc_attach_allowed_check(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct pcc_inode *pcci; + int rc = 0; + + ENTRY; + + pcc_inode_lock(inode); + if (lli->lli_pcc_state & PCC_STATE_FL_ATTACHING) + GOTO(out_unlock, rc = -EBUSY); + + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) + GOTO(out_unlock, rc = -EEXIST); + + lli->lli_pcc_state |= PCC_STATE_FL_ATTACHING; +out_unlock: + pcc_inode_unlock(inode); + RETURN(rc); +} + +int pcc_readwrite_attach(struct file *file, struct inode *inode, + __u32 archive_id) +{ + struct pcc_dataset *dataset; + struct ll_inode_info *lli = ll_i2info(inode); + struct pcc_super *super = ll_i2pccs(inode); + struct pcc_inode *pcci; + const struct cred *old_cred; + struct dentry *dentry; + struct file *pcc_filp; + struct path path; + ssize_t ret; + int rc; + + ENTRY; + + rc = pcc_attach_allowed_check(inode); + if (rc) + RETURN(rc); + + dataset = pcc_dataset_get(&ll_i2sbi(inode)->ll_pcc_super, + LU_PCC_READWRITE, archive_id); + if (dataset == NULL) + RETURN(-ENOENT); + + old_cred = override_creds(super->pccs_cred); + rc = __pcc_inode_create(dataset, &lli->lli_fid, &dentry); + if (rc) + GOTO(out_dataset_put, rc); + + path.mnt = dataset->pccd_path.mnt; + path.dentry = dentry; + pcc_filp = dentry_open(&path, O_WRONLY | O_LARGEFILE, current_cred()); + if (IS_ERR_OR_NULL(pcc_filp)) { + rc = pcc_filp == NULL ? -EINVAL : PTR_ERR(pcc_filp); + GOTO(out_dentry, rc); + } + + rc = pcc_inode_reset_iattr(dentry, ATTR_UID | ATTR_GID, + old_cred->uid, old_cred->gid, 0); + if (rc) + GOTO(out_fput, rc); + + ret = pcc_copy_data(file, pcc_filp); + if (ret < 0) + GOTO(out_fput, rc = ret); + + /* + * It must to truncate the PCC copy to the same size of the Lustre + * copy after copy data. Otherwise, it may get wrong file size after + * re-attach a file. See LU-13023 for details. 
+ */ + rc = pcc_inode_reset_iattr(dentry, ATTR_SIZE, KUIDT_INIT(0), + KGIDT_INIT(0), ret); + if (rc) + GOTO(out_fput, rc); + + /* Pause to allow for a race with concurrent HSM remove */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_PCC_ATTACH_PAUSE, cfs_fail_val); + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + LASSERT(!pcci); + OBD_SLAB_ALLOC_PTR_GFP(pcci, pcc_inode_slab, GFP_NOFS); + if (pcci == NULL) + GOTO(out_unlock, rc = -ENOMEM); + + pcc_inode_attach_set(super, dataset, lli, pcci, + dentry, LU_PCC_READWRITE); +out_unlock: + pcc_inode_unlock(inode); +out_fput: + fput(pcc_filp); +out_dentry: + if (rc) { + (void) pcc_inode_remove(inode, dentry); + dput(dentry); + } +out_dataset_put: + pcc_dataset_put(dataset); + revert_creds(old_cred); + + RETURN(rc); +} + +int pcc_readwrite_attach_fini(struct file *file, struct inode *inode, + __u32 gen, bool lease_broken, int rc, + bool attached) +{ + struct ll_inode_info *lli = ll_i2info(inode); + const struct cred *old_cred; + struct pcc_inode *pcci; + __u32 gen2; + + ENTRY; + + old_cred = override_creds(pcc_super_cred(inode->i_sb)); + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (rc || lease_broken) { + if (attached && pcci) + pcc_inode_put(pcci); + + GOTO(out_unlock, rc); + } + + /* PCC inode may be released due to layout lock revocatioin */ + if (!pcci) + GOTO(out_unlock, rc = -ESTALE); + + LASSERT(attached); + rc = pcc_layout_xattr_set(pcci, gen); + if (rc) + GOTO(out_put, rc); + + LASSERT(lli->lli_pcc_state & PCC_STATE_FL_ATTACHING); + rc = ll_layout_refresh(inode, &gen2); + if (!rc) { + if (gen2 == gen) { + pcc_layout_gen_set(pcci, gen); + } else { + CDEBUG(D_CACHE, + DFID" layout changed from %d to %d.\n", + PFID(ll_inode2fid(inode)), gen, gen2); + GOTO(out_put, rc = -ESTALE); + } + } + +out_put: + if (rc) { + (void) pcc_inode_remove(inode, pcci->pcci_path.dentry); + pcc_inode_put(pcci); + } +out_unlock: + lli->lli_pcc_state &= ~PCC_STATE_FL_ATTACHING; + pcc_inode_unlock(inode); + revert_creds(old_cred); + RETURN(rc); +} + +static int pcc_hsm_remove(struct inode *inode) +{ + struct hsm_user_request *hur; + __u32 gen; + int len; + int rc; + + ENTRY; + + rc = ll_layout_restore(inode, 0, OBD_OBJECT_EOF); + if (rc) { + CDEBUG(D_CACHE, DFID" RESTORE failure: %d\n", + PFID(&ll_i2info(inode)->lli_fid), rc); + RETURN(rc); + } + + ll_layout_refresh(inode, &gen); + + len = sizeof(struct hsm_user_request) + + sizeof(struct hsm_user_item); + OBD_ALLOC(hur, len); + if (hur == NULL) + RETURN(-ENOMEM); + + hur->hur_request.hr_action = HUA_REMOVE; + hur->hur_request.hr_archive_id = 0; + hur->hur_request.hr_flags = 0; + memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid, + sizeof(hur->hur_user_item[0].hui_fid)); + hur->hur_user_item[0].hui_extent.offset = 0; + hur->hur_user_item[0].hui_extent.length = OBD_OBJECT_EOF; + hur->hur_request.hr_itemcount = 1; + rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp, + len, hur, NULL); + if (rc) + CDEBUG(D_CACHE, DFID" HSM REMOVE failure: %d\n", + PFID(&ll_i2info(inode)->lli_fid), rc); + + OBD_FREE(hur, len); + RETURN(rc); +} + +int pcc_ioctl_detach(struct inode *inode, __u32 opt) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct pcc_inode *pcci; + bool hsm_remove = false; + int rc = 0; + + ENTRY; + + pcc_inode_lock(inode); + pcci = lli->lli_pcc_inode; + if (!pcci || lli->lli_pcc_state & PCC_STATE_FL_ATTACHING || + !pcc_inode_has_layout(pcci)) + GOTO(out_unlock, rc = 0); + + LASSERT(atomic_read(&pcci->pcci_refcount) > 0); + + if (pcci->pcci_type == LU_PCC_READWRITE) { + if 
(opt == PCC_DETACH_OPT_UNCACHE) { + hsm_remove = true; + /* + * The file will be removed from PCC, set the flags + * with PCC_DATASET_NONE even the later removal of the + * PCC copy fails. + */ + lli->lli_pcc_dsflags = PCC_DATASET_NONE; + } + + __pcc_layout_invalidate(pcci); + pcc_inode_put(pcci); + } + +out_unlock: + pcc_inode_unlock(inode); + if (hsm_remove) { + const struct cred *old_cred; + + old_cred = override_creds(pcc_super_cred(inode->i_sb)); + rc = pcc_hsm_remove(inode); + revert_creds(old_cred); + } + + RETURN(rc); +} + +int pcc_ioctl_state(struct file *file, struct inode *inode, + struct lu_pcc_state *state) +{ + int rc = 0; + int count; + char *buf; + char *path; + int buf_len = sizeof(state->pccs_path); + struct ll_file_data *fd = file->private_data; + struct pcc_file *pccf = &fd->fd_pcc_file; + struct pcc_inode *pcci; + + ENTRY; + + if (buf_len <= 0) + RETURN(-EINVAL); + + OBD_ALLOC(buf, buf_len); + if (buf == NULL) + RETURN(-ENOMEM); + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (pcci == NULL) { + state->pccs_type = LU_PCC_NONE; + GOTO(out_unlock, rc = 0); + } + + count = atomic_read(&pcci->pcci_refcount); + if (count == 0) { + state->pccs_type = LU_PCC_NONE; + state->pccs_open_count = 0; + GOTO(out_unlock, rc = 0); + } + + if (pcc_inode_has_layout(pcci)) + count--; + if (pccf->pccf_file != NULL) + count--; + state->pccs_type = pcci->pcci_type; + state->pccs_open_count = count; + state->pccs_flags = ll_i2info(inode)->lli_pcc_state; + path = dentry_path_raw(pcci->pcci_path.dentry, buf, buf_len); + if (IS_ERR(path)) + GOTO(out_unlock, rc = PTR_ERR(path)); + + if (strlcpy(state->pccs_path, path, buf_len) >= buf_len) + GOTO(out_unlock, rc = -ENAMETOOLONG); + +out_unlock: + pcc_inode_unlock(inode); + OBD_FREE(buf, buf_len); + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/pcc.h b/drivers/staging/lustrefsx/lustre/llite/pcc.h new file mode 100644 index 0000000000000..067daefb939c6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/pcc.h @@ -0,0 +1,268 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, DDN Storage Corporation. 
+ */ +/* + * + * Persistent Client Cache + * + * Author: Li Xi + */ + +#ifndef LLITE_PCC_H +#define LLITE_PCC_H + +#include +#include +#include +#include +#include + +extern struct kmem_cache *pcc_inode_slab; + +#define LPROCFS_WR_PCC_MAX_CMD 4096 + +/* User/Group/Project ID */ +struct pcc_match_id { + __u32 pmi_id; + struct list_head pmi_linkage; +}; + +/* wildcard file name */ +struct pcc_match_fname { + char *pmf_name; + struct list_head pmf_linkage; +}; + +enum pcc_field { + PCC_FIELD_UID, + PCC_FIELD_GID, + PCC_FIELD_PROJID, + PCC_FIELD_FNAME, + PCC_FIELD_MAX +}; + +struct pcc_expression { + enum pcc_field pe_field; + struct list_head pe_cond; + struct list_head pe_linkage; +}; + +struct pcc_conjunction { + /* link to disjunction */ + struct list_head pc_linkage; + /* list of logical conjunction */ + struct list_head pc_expressions; +}; + +/** + * Match rule for auto PCC-cached files. + */ +struct pcc_match_rule { + char *pmr_conds_str; + struct list_head pmr_conds; +}; + +struct pcc_matcher { + __u32 pm_uid; + __u32 pm_gid; + __u32 pm_projid; + struct qstr *pm_name; +}; + +enum pcc_dataset_flags { + PCC_DATASET_INVALID = 0x0, + /* Indicate that known the file is not in PCC. */ + PCC_DATASET_NONE = 0x01, + /* Try auto attach at open, enabled by default */ + PCC_DATASET_OPEN_ATTACH = 0x02, + /* Try auto attach during IO when layout refresh, enabled by default */ + PCC_DATASET_IO_ATTACH = 0x04, + /* Try auto attach at stat */ + PCC_DATASET_STAT_ATTACH = 0x08, + PCC_DATASET_AUTO_ATTACH = PCC_DATASET_OPEN_ATTACH | + PCC_DATASET_IO_ATTACH | + PCC_DATASET_STAT_ATTACH, + /* PCC backend is only used for RW-PCC */ + PCC_DATASET_RWPCC = 0x10, + /* PCC backend is only used for RO-PCC */ + PCC_DATASET_ROPCC = 0x20, + /* PCC backend provides caching services for both RW-PCC and RO-PCC */ + PCC_DATASET_PCC_ALL = PCC_DATASET_RWPCC | PCC_DATASET_ROPCC, +}; + +struct pcc_dataset { + __u32 pccd_rwid; /* Archive ID */ + __u32 pccd_roid; /* Readonly ID */ + struct pcc_match_rule pccd_rule; /* Match rule */ + enum pcc_dataset_flags pccd_flags; /* Flags of PCC backend */ + char pccd_pathname[PATH_MAX]; /* full path */ + struct path pccd_path; /* Root path */ + struct list_head pccd_linkage; /* Linked to pccs_datasets */ + atomic_t pccd_refcount; /* Reference count */ +}; + +struct pcc_super { + /* Protect pccs_datasets */ + struct rw_semaphore pccs_rw_sem; + /* List of datasets */ + struct list_head pccs_datasets; + /* creds of process who forced instantiation of super block */ + const struct cred *pccs_cred; + /* + * Gobal PCC Generation: it will be increased once the configuration + * for PCC is changed, i.e. add or delete a PCC backend, modify the + * parameters for PCC. + */ + __u64 pccs_generation; +}; + +struct pcc_inode { + struct ll_inode_info *pcci_lli; + /* Cache path on local file system */ + struct path pcci_path; + /* + * If reference count is 0, then the cache is not inited, if 1, then + * no one is using it. + */ + atomic_t pcci_refcount; + /* Whether readonly or readwrite PCC */ + enum lu_pcc_type pcci_type; + /* Whether the inode attr is cached locally */ + bool pcci_attr_valid; + /* Layout generation */ + __u32 pcci_layout_gen; + /* + * How many IOs are on going on this cached object. Layout can be + * changed only if there is no active IO. + */ + atomic_t pcci_active_ios; + /* Waitq - wait for PCC I/O completion. 
*/ + wait_queue_head_t pcci_waitq; +}; + +struct pcc_file { + /* Opened cache file */ + struct file *pccf_file; + /* Whether readonly or readwrite PCC */ + enum lu_pcc_type pccf_type; +}; + +enum pcc_io_type { + /* read system call */ + PIT_READ = 1, + /* write system call */ + PIT_WRITE, + /* truncate, utime system calls */ + PIT_SETATTR, + /* stat system call */ + PIT_GETATTR, + /* mmap write handling */ + PIT_PAGE_MKWRITE, + /* page fault handling */ + PIT_FAULT, + /* fsync system call handling */ + PIT_FSYNC, +#ifdef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT + /* splice_read system call */ + PIT_SPLICE_READ, +#endif + /* open system call */ + PIT_OPEN +}; + +enum pcc_cmd_type { + PCC_ADD_DATASET = 0, + PCC_DEL_DATASET, + PCC_CLEAR_ALL, +}; + +struct pcc_cmd { + enum pcc_cmd_type pccc_cmd; + char *pccc_pathname; + union { + struct pcc_cmd_add { + __u32 pccc_rwid; + __u32 pccc_roid; + struct list_head pccc_conds; + char *pccc_conds_str; + enum pcc_dataset_flags pccc_flags; + } pccc_add; + struct pcc_cmd_del { + __u32 pccc_pad; + } pccc_del; + } u; +}; + +struct pcc_create_attach { + struct pcc_dataset *pca_dataset; + struct dentry *pca_dentry; +}; + +int pcc_super_init(struct pcc_super *super); +void pcc_super_fini(struct pcc_super *super); +int pcc_cmd_handle(char *buffer, unsigned long count, + struct pcc_super *super); +int pcc_super_dump(struct pcc_super *super, struct seq_file *m); +int pcc_readwrite_attach(struct file *file, struct inode *inode, + __u32 arch_id); +int pcc_readwrite_attach_fini(struct file *file, struct inode *inode, + __u32 gen, bool lease_broken, int rc, + bool attached); +int pcc_ioctl_detach(struct inode *inode, __u32 opt); +int pcc_ioctl_state(struct file *file, struct inode *inode, + struct lu_pcc_state *state); +void pcc_file_init(struct pcc_file *pccf); +int pcc_file_open(struct inode *inode, struct file *file); +void pcc_file_release(struct inode *inode, struct file *file); +ssize_t pcc_file_read_iter(struct kiocb *iocb, struct iov_iter *iter, + bool *cached); +ssize_t pcc_file_write_iter(struct kiocb *iocb, struct iov_iter *iter, + bool *cached); +int pcc_inode_getattr(struct inode *inode, u32 request_mask, + unsigned int flags, bool *cached); +int pcc_inode_setattr(struct inode *inode, struct iattr *attr, bool *cached); +#ifdef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT +ssize_t pcc_file_splice_read(struct file *in_file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags); +#endif +int pcc_fsync(struct file *file, loff_t start, loff_t end, + int datasync, bool *cached); +int pcc_file_mmap(struct file *file, struct vm_area_struct *vma, bool *cached); +void pcc_vm_open(struct vm_area_struct *vma); +void pcc_vm_close(struct vm_area_struct *vma); +int pcc_fault(struct vm_area_struct *mva, struct vm_fault *vmf, bool *cached); +int pcc_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + bool *cached); +int pcc_inode_create(struct super_block *sb, struct pcc_dataset *dataset, + struct lu_fid *fid, struct dentry **pcc_dentry); +int pcc_inode_create_fini(struct inode *inode, struct pcc_create_attach *pca); +void pcc_create_attach_cleanup(struct super_block *sb, + struct pcc_create_attach *pca); +struct pcc_dataset *pcc_dataset_match_get(struct pcc_super *super, + struct pcc_matcher *matcher); +void pcc_dataset_put(struct pcc_dataset *dataset); +void pcc_inode_free(struct inode *inode); +void pcc_layout_invalidate(struct inode *inode); +#endif /* LLITE_PCC_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/rw.c 
b/drivers/staging/lustrefsx/lustre/llite/rw.c new file mode 100644 index 0000000000000..7a48518c6c22c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/rw.c @@ -0,0 +1,2046 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/llite/rw.c + * + * Lustre Lite I/O page cache routines shared by different kernel revs + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +/* current_is_kswapd() */ +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include "llite_internal.h" +#include + +static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which); + +/** + * Get readahead pages from the filesystem readahead pool of the client for a + * thread. + * + * /param sbi superblock for filesystem readahead state ll_ra_info + * /param ria per-thread readahead state + * /param pages number of pages requested for readahead for the thread. + * + * WARNING: This algorithm is used to reduce contention on sbi->ll_lock. + * It should work well if the ra_max_pages is much greater than the single + * file's read-ahead window, and not too many threads contending for + * these readahead pages. + * + * TODO: There may be a 'global sync problem' if many threads are trying + * to get an ra budget that is larger than the remaining readahead pages + * and reach here at exactly the same time. They will compute /a ret to + * consume the remaining pages, but will fail at atomic_add_return() and + * get a zero ra window, although there is still ra space remaining. - Jay */ + +static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, + struct ra_io_arg *ria, + unsigned long pages, + unsigned long pages_min) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + long ret; + + ENTRY; + + WARN_ON_ONCE(pages_min > pages); + /** + * Don't try readahead aggresively if we are limited + * LRU pages, otherwise, it could cause deadlock. + */ + pages = min(sbi->ll_cache->ccc_lru_max >> 2, pages); + /** + * if this happen, we reserve more pages than needed, + * this will make us leak @ra_cur_pages, because + * ll_ra_count_put() acutally freed @pages. + */ + if (unlikely(pages_min > pages)) + pages_min = pages; + + /* + * If read-ahead pages left are less than 1M, do not do read-ahead, + * otherwise it will form small read RPC(< 1M), which hurt server + * performance a lot. 
+ */ + ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), + pages); + if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages)) + GOTO(out, ret = 0); + + if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) { + atomic_sub(ret, &ra->ra_cur_pages); + ret = 0; + } + +out: + if (ret < pages_min) { + /* override ra limit for maximum performance */ + atomic_add(pages_min - ret, &ra->ra_cur_pages); + ret = pages_min; + } + RETURN(ret); +} + +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long pages) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + atomic_sub(pages, &ra->ra_cur_pages); +} + +static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which) +{ + LASSERTF(which < _NR_RA_STAT, "which: %u\n", which); + lprocfs_counter_incr(sbi->ll_ra_stats, which); +} + +static inline bool ll_readahead_enabled(struct ll_sb_info *sbi) +{ + return sbi->ll_ra_info.ra_max_pages_per_file > 0 && + sbi->ll_ra_info.ra_max_pages > 0; +} + +void ll_ra_stats_inc(struct inode *inode, enum ra_stat which) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + ll_ra_stats_inc_sbi(sbi, which); +} + +#define RAS_CDEBUG(ras) \ + CDEBUG(D_READA, \ + "lre %llu cr %lu cb %llu wsi %lu wp %lu nra %lu rpc %lu " \ + "r %lu csr %lu so %llu sb %llu sl %llu lr %lu\n", \ + ras->ras_last_read_end_bytes, ras->ras_consecutive_requests, \ + ras->ras_consecutive_bytes, ras->ras_window_start_idx, \ + ras->ras_window_pages, ras->ras_next_readahead_idx, \ + ras->ras_rpc_pages, ras->ras_requests, \ + ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \ + ras->ras_stride_bytes, ras->ras_stride_length, \ + ras->ras_async_last_readpage_idx) + +static bool pos_in_window(loff_t pos, loff_t point, + unsigned long before, unsigned long after) +{ + loff_t start = point - before; + loff_t end = point + after; + + if (start > point) + start = 0; + if (end < point) + end = ~0; + + return start <= pos && pos <= end; +} + +enum ll_ra_page_hint { + MAYNEED = 0, /* this page possibly accessed soon */ + WILLNEED /* this page is gurateed to be needed */ +}; + +/** + * Initiates read-ahead of a page with given index. + * + * \retval +ve: page was already uptodate so it will be skipped + * from being added; + * \retval -ve: page wasn't added to \a queue for error; + * \retval 0: page was added into \a queue for read ahead. 
+ */ +static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, pgoff_t index, + enum ll_ra_page_hint hint) +{ + struct cl_object *clob = io->ci_obj; + struct inode *inode = vvp_object_inode(clob); + struct page *vmpage = NULL; + struct cl_page *page; + struct vvp_page *vpg; + enum ra_stat which = _NR_RA_STAT; /* keep gcc happy */ + int rc = 0; + const char *msg = NULL; + + ENTRY; + + switch (hint) { + case MAYNEED: + vmpage = grab_cache_page_nowait(inode->i_mapping, index); + if (vmpage == NULL) { + which = RA_STAT_FAILED_GRAB_PAGE; + msg = "g_c_p_n failed"; + GOTO(out, rc = -EBUSY); + } + break; + case WILLNEED: + vmpage = find_or_create_page(inode->i_mapping, index, + GFP_NOFS); + if (vmpage == NULL) + GOTO(out, rc = -ENOMEM); + break; + default: + /* should not come here */ + GOTO(out, rc = -EINVAL); + } + + /* Check if vmpage was truncated or reclaimed */ + if (vmpage->mapping != inode->i_mapping) { + which = RA_STAT_WRONG_GRAB_PAGE; + msg = "g_c_p_n returned invalid page"; + GOTO(out, rc = -EBUSY); + } + + page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); + if (IS_ERR(page)) { + which = RA_STAT_FAILED_GRAB_PAGE; + msg = "cl_page_find failed"; + GOTO(out, rc = PTR_ERR(page)); + } + + lu_ref_add(&page->cp_reference, "ra", current); + cl_page_assume(env, io, page); + vpg = cl2vvp_page(cl_object_page_slice(clob, page)); + if (!vpg->vpg_defer_uptodate && !PageUptodate(vmpage)) { + if (hint == MAYNEED) { + vpg->vpg_defer_uptodate = 1; + vpg->vpg_ra_used = 0; + } + cl_page_list_add(queue, page, true); + } else { + /* skip completed pages */ + cl_page_unassume(env, io, page); + /* This page is already uptodate, returning a positive number + * to tell the callers about this */ + rc = 1; + } + + lu_ref_del(&page->cp_reference, "ra", current); + cl_page_put(env, page); + +out: + if (vmpage != NULL) { + if (rc != 0) + unlock_page(vmpage); + put_page(vmpage); + } + if (msg != NULL && hint == MAYNEED) { + ll_ra_stats_inc(inode, which); + CDEBUG(D_READA, "%s\n", msg); + + } + + RETURN(rc); +} + +#define RIA_DEBUG(ria) \ + CDEBUG(D_READA, "rs %lu re %lu ro %llu rl %llu rb %llu\n", \ + ria->ria_start_idx, ria->ria_end_idx, ria->ria_stoff, \ + ria->ria_length, ria->ria_bytes) + +static inline int stride_io_mode(struct ll_readahead_state *ras) +{ + return ras->ras_consecutive_stride_requests > 1; +} + +/* The function calculates how many bytes will be read in + * [off, off + length], in such stride IO area, + * stride_offset = st_off, stride_lengh = st_len, + * stride_bytes = st_bytes + * + * |------------------|*****|------------------|*****|------------|*****|.... + * st_off + * |--- st_bytes ---| + * |----- st_len -----| + * + * How many bytes it should read in such pattern + * |-------------------------------------------------------------| + * off + * |<------ length ------->| + * + * = |<----->| + |-------------------------------------| + |---| + * start_left st_bytes * i end_left + */ +static loff_t stride_byte_count(loff_t st_off, loff_t st_len, loff_t st_bytes, + loff_t off, loff_t length) +{ + u64 start = off > st_off ? off - st_off : 0; + u64 end = off + length > st_off ? 
off + length - st_off : 0;
+	u64 start_left;
+	u64 end_left;
+	u64 bytes_count;
+
+	if (st_len == 0 || length == 0 || end == 0)
+		return length;
+
+	start = div64_u64_rem(start, st_len, &start_left);
+	if (start_left < st_bytes)
+		start_left = st_bytes - start_left;
+	else
+		start_left = 0;
+
+	end = div64_u64_rem(end, st_len, &end_left);
+	if (end_left > st_bytes)
+		end_left = st_bytes;
+
+	CDEBUG(D_READA, "start %llu, end %llu start_left %llu end_left %llu\n",
+	       start, end, start_left, end_left);
+
+	if (start == end)
+		bytes_count = end_left - (st_bytes - start_left);
+	else
+		bytes_count = start_left +
+			st_bytes * (end - start - 1) + end_left;
+
+	CDEBUG(D_READA,
+	       "st_off %llu, st_len %llu st_bytes %llu off %llu length %llu bytescount %llu\n",
+	       st_off, st_len, st_bytes, off, length, bytes_count);
+
+	return bytes_count;
+}
+
+static unsigned long ria_page_count(struct ra_io_arg *ria)
+{
+	loff_t length_bytes = ria->ria_end_idx >= ria->ria_start_idx ?
+		(loff_t)(ria->ria_end_idx -
+			 ria->ria_start_idx + 1) << PAGE_SHIFT : 0;
+	loff_t bytes_count;
+
+	if (ria->ria_length > ria->ria_bytes && ria->ria_bytes &&
+	    (ria->ria_length & ~PAGE_MASK || ria->ria_bytes & ~PAGE_MASK ||
+	     ria->ria_stoff & ~PAGE_MASK)) {
+		/* Over-estimate un-aligned page stride read */
+		unsigned long pg_count = ((ria->ria_bytes +
+					   PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
+		pg_count *= length_bytes / ria->ria_length + 1;
+
+		return pg_count;
+	}
+	bytes_count = stride_byte_count(ria->ria_stoff, ria->ria_length,
+					ria->ria_bytes,
+					(loff_t)ria->ria_start_idx << PAGE_SHIFT,
+					length_bytes);
+	return (bytes_count + PAGE_SIZE - 1) >> PAGE_SHIFT;
+}
+
+static pgoff_t ras_align(struct ll_readahead_state *ras, pgoff_t index)
+{
+	unsigned opt_size = min(ras->ras_window_pages, ras->ras_rpc_pages);
+
+	if (opt_size == 0)
+		opt_size = 1;
+	return index - (index % opt_size);
+}
+
+/* Check whether the index is in the defined ra-window */
+static bool ras_inside_ra_window(pgoff_t idx, struct ra_io_arg *ria)
+{
+	loff_t pos = (loff_t)idx << PAGE_SHIFT;
+
+	/* If ria_length == ria_bytes, it means non-stride I/O mode;
+	 * idx should always be inside the read-ahead window in this case.
+	 * For stride I/O mode, just check whether the idx is inside
+	 * the ria_bytes.
+	 */
+	if (ria->ria_length == 0 || ria->ria_length == ria->ria_bytes)
+		return true;
+
+	if (pos >= ria->ria_stoff) {
+		u64 offset;
+
+		div64_u64_rem(pos - ria->ria_stoff, ria->ria_length, &offset);
+
+		if (offset < ria->ria_bytes ||
+		    (ria->ria_length - offset) < PAGE_SIZE)
+			return true;
+	} else if (pos + PAGE_SIZE > ria->ria_stoff) {
+		return true;
+	}
+
+	return false;
+}
+
+static unsigned long
+ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page_list *queue, struct ll_readahead_state *ras,
+		    struct ra_io_arg *ria, pgoff_t *ra_end, pgoff_t skip_index)
+{
+	struct cl_read_ahead ra = { 0 };
+	/* busy page count is per stride */
+	int rc = 0, count = 0, busy_page_count = 0;
+	pgoff_t page_idx;
+
+	LASSERT(ria != NULL);
+	RIA_DEBUG(ria);
+
+	for (page_idx = ria->ria_start_idx;
+	     page_idx <= ria->ria_end_idx && ria->ria_reserved > 0;
+	     page_idx++) {
+		if (skip_index && page_idx == skip_index)
+			continue;
+		if (ras_inside_ra_window(page_idx, ria)) {
+			if (ra.cra_end_idx == 0 || ra.cra_end_idx < page_idx) {
+				pgoff_t end_idx;
+
+				/*
+				 * Do not shrink ria_end_idx in any case until
+				 * the minimum end of the current read is
+				 * covered.
+				 *
+				 * Do not extend the read lock across stripes
+				 * if lock contention is detected.
+ */ + if (ra.cra_contention && + page_idx > ria->ria_end_idx_min) { + ria->ria_end_idx = *ra_end; + break; + } + + cl_read_ahead_release(env, &ra); + + rc = cl_io_read_ahead(env, io, page_idx, &ra); + if (rc < 0) + break; + + /* + * Only shrink ria_end_idx if the matched + * LDLM lock doesn't cover more. + */ + if (page_idx > ra.cra_end_idx) { + ria->ria_end_idx = ra.cra_end_idx; + break; + } + + CDEBUG(D_READA, "idx: %lu, ra: %lu, rpc: %lu\n", + page_idx, ra.cra_end_idx, + ra.cra_rpc_pages); + LASSERTF(ra.cra_end_idx >= page_idx, + "object: %p, indcies %lu / %lu\n", + io->ci_obj, ra.cra_end_idx, page_idx); + /* update read ahead RPC size. + * NB: it's racy but doesn't matter */ + if (ras->ras_rpc_pages != ra.cra_rpc_pages && + ra.cra_rpc_pages > 0) + ras->ras_rpc_pages = ra.cra_rpc_pages; + if (!skip_index) { + /* trim it to align with optimal RPC size */ + end_idx = ras_align(ras, ria->ria_end_idx + 1); + if (end_idx > 0 && !ria->ria_eof) + ria->ria_end_idx = end_idx - 1; + } + if (ria->ria_end_idx < ria->ria_end_idx_min) + ria->ria_end_idx = ria->ria_end_idx_min; + } + if (page_idx > ria->ria_end_idx) + break; + + /* If the page is inside the read-ahead window */ + rc = ll_read_ahead_page(env, io, queue, page_idx, + MAYNEED); + if (rc < 0 && rc != -EBUSY) + break; + if (rc == -EBUSY) { + busy_page_count++; + CDEBUG(D_READA, + "skip busy page: %lu\n", page_idx); + /* For page unaligned readahead the first + * last pages of each region can be read by + * another reader on the same node, and so + * may be busy. So only stop for > 2 busy + * pages. */ + if (busy_page_count > 2) + break; + } + + *ra_end = page_idx; + /* Only subtract from reserve & count the page if we + * really did readahead on that page. */ + if (rc == 0) { + ria->ria_reserved--; + count++; + } + } else if (stride_io_mode(ras)) { + /* If it is not in the read-ahead window, and it is + * read-ahead mode, then check whether it should skip + * the stride gap. 
+ */ + loff_t pos = (loff_t)page_idx << PAGE_SHIFT; + u64 offset; + + div64_u64_rem(pos - ria->ria_stoff, ria->ria_length, + &offset); + if (offset >= ria->ria_bytes) { + pos += (ria->ria_length - offset); + if ((pos >> PAGE_SHIFT) >= page_idx + 1) + page_idx = (pos >> PAGE_SHIFT) - 1; + busy_page_count = 0; + CDEBUG(D_READA, + "Stride: jump %llu pages to %lu\n", + ria->ria_length - offset, page_idx); + continue; + } + } + } + + cl_read_ahead_release(env, &ra); + + return count; +} + +static void ll_readahead_work_free(struct ll_readahead_work *work) +{ + fput(work->lrw_file); + OBD_FREE_PTR(work); +} + +static void ll_readahead_handle_work(struct work_struct *wq); +static void ll_readahead_work_add(struct inode *inode, + struct ll_readahead_work *work) +{ + INIT_WORK(&work->lrw_readahead_work, ll_readahead_handle_work); + queue_work(ll_i2sbi(inode)->ll_ra_info.ll_readahead_wq, + &work->lrw_readahead_work); +} + +static int ll_readahead_file_kms(const struct lu_env *env, + struct cl_io *io, __u64 *kms) +{ + struct cl_object *clob; + struct inode *inode; + struct cl_attr *attr = vvp_env_thread_attr(env); + int ret; + + clob = io->ci_obj; + inode = vvp_object_inode(clob); + + cl_object_attr_lock(clob); + ret = cl_object_attr_get(env, clob, attr); + cl_object_attr_unlock(clob); + + if (ret != 0) + RETURN(ret); + + *kms = attr->cat_kms; + return 0; +} + +static void ll_readahead_handle_work(struct work_struct *wq) +{ + struct ll_readahead_work *work; + struct lu_env *env; + __u16 refcheck; + struct ra_io_arg *ria; + struct inode *inode; + struct ll_file_data *fd; + struct ll_readahead_state *ras; + struct cl_io *io; + struct cl_2queue *queue; + pgoff_t ra_end_idx = 0; + unsigned long pages, pages_min = 0; + struct file *file; + __u64 kms; + int rc; + pgoff_t eof_index; + struct ll_sb_info *sbi; + + work = container_of(wq, struct ll_readahead_work, + lrw_readahead_work); + fd = work->lrw_file->private_data; + ras = &fd->fd_ras; + file = work->lrw_file; + inode = file_inode(file); + sbi = ll_i2sbi(inode); + + CDEBUG(D_READA|D_IOTRACE, + "%s: async ra from %lu to %lu triggered by user pid %d\n", + file_dentry(file)->d_name.name, work->lrw_start_idx, + work->lrw_end_idx, work->lrw_user_pid); + + env = cl_env_alloc(&refcheck, LCT_NOREF); + if (IS_ERR(env)) + GOTO(out_free_work, rc = PTR_ERR(env)); + + io = vvp_env_thread_io(env); + ll_io_init(io, file, CIT_READ, NULL); + + rc = ll_readahead_file_kms(env, io, &kms); + if (rc != 0) + GOTO(out_put_env, rc); + + if (kms == 0) { + ll_ra_stats_inc(inode, RA_STAT_ZERO_LEN); + GOTO(out_put_env, rc = 0); + } + + ria = &ll_env_info(env)->lti_ria; + memset(ria, 0, sizeof(*ria)); + + ria->ria_start_idx = work->lrw_start_idx; + /* Truncate RA window to end of file */ + eof_index = (pgoff_t)(kms - 1) >> PAGE_SHIFT; + if (eof_index <= work->lrw_end_idx) { + work->lrw_end_idx = eof_index; + ria->ria_eof = true; + } + if (work->lrw_end_idx <= work->lrw_start_idx) + GOTO(out_put_env, rc = 0); + + ria->ria_end_idx = work->lrw_end_idx; + pages = ria->ria_end_idx - ria->ria_start_idx + 1; + ria->ria_reserved = ll_ra_count_get(sbi, ria, + ria_page_count(ria), pages_min); + + CDEBUG(D_READA, + "async reserved pages: %lu/%lu/%lu, ra_cur %d, ra_max %lu\n", + ria->ria_reserved, pages, pages_min, + atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages), + ll_i2sbi(inode)->ll_ra_info.ra_max_pages); + + if (ria->ria_reserved < pages) { + ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT); + if (PAGES_TO_MiB(ria->ria_reserved) < 1) { + ll_ra_count_put(ll_i2sbi(inode), 
ria->ria_reserved);
+			GOTO(out_put_env, rc = 0);
+		}
+	}
+
+	rc = cl_io_rw_init(env, io, CIT_READ, ria->ria_start_idx, pages);
+	if (rc)
+		GOTO(out_put_env, rc);
+
+	/* overwrite jobid inited in vvp_io_init() */
+	if (strncmp(ll_i2info(inode)->lli_jobid, work->lrw_jobid,
+		    sizeof(work->lrw_jobid)))
+		memcpy(ll_i2info(inode)->lli_jobid, work->lrw_jobid,
+		       sizeof(work->lrw_jobid));
+
+	vvp_env_io(env)->vui_fd = fd;
+	io->ci_state = CIS_LOCKED;
+	io->ci_async_readahead = true;
+	rc = cl_io_start(env, io);
+	if (rc)
+		GOTO(out_io_fini, rc);
+
+	queue = &io->ci_queue;
+	cl_2queue_init(queue);
+
+	rc = ll_read_ahead_pages(env, io, &queue->c2_qin, ras, ria,
+				 &ra_end_idx, 0);
+	if (ria->ria_reserved != 0)
+		ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
+	if (queue->c2_qin.pl_nr > 0) {
+		int count = queue->c2_qin.pl_nr;
+
+		rc = cl_io_submit_rw(env, io, CRT_READ, queue);
+		if (rc == 0)
+			task_io_account_read(PAGE_SIZE * count);
+	}
+	if (ria->ria_end_idx == ra_end_idx && ra_end_idx == (kms >> PAGE_SHIFT))
+		ll_ra_stats_inc(inode, RA_STAT_EOF);
+
+	if (ra_end_idx != ria->ria_end_idx)
+		ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END);
+
+	/* TODO: discard all pages until page reinit route is implemented */
+	cl_page_list_discard(env, io, &queue->c2_qin);
+
+	/* Unlock unsent read pages in case of error. */
+	cl_page_list_disown(env, io, &queue->c2_qin);
+
+	cl_2queue_fini(env, queue);
+out_io_fini:
+	cl_io_end(env, io);
+	cl_io_fini(env, io);
+out_put_env:
+	cl_env_put(env, &refcheck);
+out_free_work:
+	if (ra_end_idx > 0)
+		ll_ra_stats_inc_sbi(ll_i2sbi(inode), RA_STAT_ASYNC);
+	atomic_dec(&sbi->ll_ra_info.ra_async_inflight);
+	ll_readahead_work_free(work);
+}
+
+static int ll_readahead(const struct lu_env *env, struct cl_io *io,
+			struct cl_page_list *queue,
+			struct ll_readahead_state *ras, bool hit,
+			struct file *file, pgoff_t skip_index,
+			pgoff_t *start_idx)
+{
+	struct vvp_io *vio = vvp_env_io(env);
+	struct ll_thread_info *lti = ll_env_info(env);
+	unsigned long pages, pages_min = 0;
+	pgoff_t ra_end_idx = 0, end_idx = 0;
+	struct inode *inode;
+	struct ra_io_arg *ria = &lti->lti_ria;
+	struct cl_object *clob;
+	int ret = 0;
+	__u64 kms;
+	struct ll_sb_info *sbi;
+	struct ll_ra_info *ra;
+
+	ENTRY;
+
+	clob = io->ci_obj;
+	inode = vvp_object_inode(clob);
+	sbi = ll_i2sbi(inode);
+	ra = &sbi->ll_ra_info;
+
+	/**
+	 * In case we have a limited max_cached_mb, readahead
+	 * should be stopped if it has run out of all LRU slots.
+	 */
+	if (atomic_read(&ra->ra_cur_pages) >= sbi->ll_cache->ccc_lru_max) {
+		ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT);
+		RETURN(0);
+	}
+
+	memset(ria, 0, sizeof(*ria));
+	ret = ll_readahead_file_kms(env, io, &kms);
+	if (ret != 0)
+		RETURN(ret);
+
+	if (kms == 0) {
+		ll_ra_stats_inc(inode, RA_STAT_ZERO_LEN);
+		RETURN(0);
+	}
+
+	spin_lock(&ras->ras_lock);
+
+	/**
+	 * Note: another thread might roll back the ras_next_readahead_idx
+	 * if it cannot get the full size of prepared pages; see the
+	 * end of this function. For stride read ahead, it needs to
+	 * make sure the offset is no less than ras_stride_offset,
+	 * so that stride read ahead can work correctly.
+ */ + if (stride_io_mode(ras)) + *start_idx = max_t(pgoff_t, ras->ras_next_readahead_idx, + ras->ras_stride_offset >> PAGE_SHIFT); + else + *start_idx = ras->ras_next_readahead_idx; + + if (ras->ras_window_pages > 0) + end_idx = ras->ras_window_start_idx + ras->ras_window_pages - 1; + + if (skip_index) + end_idx = *start_idx + ras->ras_window_pages - 1; + + /* Enlarge the RA window to encompass the full read */ + if (vio->vui_ra_valid && + end_idx < vio->vui_ra_start_idx + vio->vui_ra_pages - 1) + end_idx = vio->vui_ra_start_idx + vio->vui_ra_pages - 1; + + if (end_idx != 0) { + pgoff_t eof_index; + + /* Truncate RA window to end of file */ + eof_index = (pgoff_t)((kms - 1) >> PAGE_SHIFT); + if (eof_index <= end_idx) { + end_idx = eof_index; + ria->ria_eof = true; + } + } + ria->ria_start_idx = *start_idx; + ria->ria_end_idx = end_idx; + /* If stride I/O mode is detected, get stride window*/ + if (stride_io_mode(ras)) { + ria->ria_stoff = ras->ras_stride_offset; + ria->ria_length = ras->ras_stride_length; + ria->ria_bytes = ras->ras_stride_bytes; + } + spin_unlock(&ras->ras_lock); + + if (end_idx == 0) { + ll_ra_stats_inc(inode, RA_STAT_ZERO_WINDOW); + RETURN(0); + } + pages = ria_page_count(ria); + if (pages == 0) { + ll_ra_stats_inc(inode, RA_STAT_ZERO_WINDOW); + RETURN(0); + } + + RAS_CDEBUG(ras); + CDEBUG(D_READA, DFID": ria: %lu/%lu, bead: %lu/%lu, hit: %d\n", + PFID(lu_object_fid(&clob->co_lu)), + ria->ria_start_idx, ria->ria_end_idx, + vio->vui_ra_valid ? vio->vui_ra_start_idx : 0, + vio->vui_ra_valid ? vio->vui_ra_pages : 0, + hit); + + /* at least to extend the readahead window to cover current read */ + if (!hit && vio->vui_ra_valid && + vio->vui_ra_start_idx + vio->vui_ra_pages > ria->ria_start_idx) { + ria->ria_end_idx_min = + vio->vui_ra_start_idx + vio->vui_ra_pages - 1; + pages_min = vio->vui_ra_start_idx + vio->vui_ra_pages - + ria->ria_start_idx; + /** + * For performance reason, exceeding @ra_max_pages + * are allowed, but this should be limited with RPC + * size in case a large block size read issued. Trim + * to RPC boundary. + */ + pages_min = min(pages_min, ras->ras_rpc_pages - + (ria->ria_start_idx % ras->ras_rpc_pages)); + } + + /* don't over reserved for mmap range read */ + if (skip_index) + pages_min = 0; + if (pages_min > pages) + pages = pages_min; + ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, pages, + pages_min); + if (ria->ria_reserved < pages) + ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT); + + CDEBUG(D_READA, "reserved pages: %lu/%lu/%lu, ra_cur %d, ra_max %lu\n", + ria->ria_reserved, pages, pages_min, + atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages), + ll_i2sbi(inode)->ll_ra_info.ra_max_pages); + + ret = ll_read_ahead_pages(env, io, queue, ras, ria, &ra_end_idx, + skip_index); + if (ria->ria_reserved != 0) + ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved); + + if (ra_end_idx == end_idx && ra_end_idx == (kms >> PAGE_SHIFT)) + ll_ra_stats_inc(inode, RA_STAT_EOF); + + CDEBUG(D_READA, + "ra_end_idx = %lu end_idx = %lu stride end = %lu pages = %d\n", + ra_end_idx, end_idx, ria->ria_end_idx, ret); + + if (ra_end_idx != end_idx) + ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END); + if (ra_end_idx > 0) { + /* update the ras so that the next read-ahead tries from + * where we left off. 
*/ + spin_lock(&ras->ras_lock); + ras->ras_next_readahead_idx = ra_end_idx + 1; + spin_unlock(&ras->ras_lock); + RAS_CDEBUG(ras); + } + + RETURN(ret); +} + +static int ll_readpages(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, + pgoff_t start, pgoff_t end) +{ + int ret = 0; + __u64 kms; + pgoff_t page_idx; + int count = 0; + + ENTRY; + + ret = ll_readahead_file_kms(env, io, &kms); + if (ret != 0) + RETURN(ret); + + if (kms == 0) + RETURN(0); + + if (end != 0) { + unsigned long end_index; + + end_index = (unsigned long)((kms - 1) >> PAGE_SHIFT); + if (end_index <= end) + end = end_index; + } + + for (page_idx = start; page_idx <= end; page_idx++) { + ret= ll_read_ahead_page(env, io, queue, page_idx, + WILLNEED); + if (ret < 0) + break; + else if (ret == 0) /* ret 1 is already uptodate */ + count++; + } + + RETURN(count > 0 ? count : ret); +} + +static void ras_set_start(struct ll_readahead_state *ras, pgoff_t index) +{ + ras->ras_window_start_idx = ras_align(ras, index); +} + +/* called with the ras_lock held or from places where it doesn't matter */ +static void ras_reset(struct ll_readahead_state *ras, pgoff_t index) +{ + ras->ras_consecutive_requests = 0; + ras->ras_consecutive_bytes = 0; + ras->ras_window_pages = 0; + ras_set_start(ras, index); + ras->ras_next_readahead_idx = max(ras->ras_window_start_idx, index + 1); + + RAS_CDEBUG(ras); +} + +/* called with the ras_lock held or from places where it doesn't matter */ +static void ras_stride_reset(struct ll_readahead_state *ras) +{ + ras->ras_consecutive_stride_requests = 0; + ras->ras_stride_length = 0; + ras->ras_stride_bytes = 0; + RAS_CDEBUG(ras); +} + +void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras) +{ + spin_lock_init(&ras->ras_lock); + ras->ras_rpc_pages = PTLRPC_MAX_BRW_PAGES; + ras_reset(ras, 0); + ras->ras_last_read_end_bytes = 0; + ras->ras_requests = 0; + ras->ras_range_min_start_idx = 0; + ras->ras_range_max_end_idx = 0; + ras->ras_range_requests = 0; + ras->ras_last_range_pages = 0; +} + +/* + * Check whether the read request is in the stride window. + * If it is in the stride window, return true, otherwise return false. 
+ */ +static bool read_in_stride_window(struct ll_readahead_state *ras, + loff_t pos, loff_t count) +{ + loff_t stride_gap; + + if (ras->ras_stride_length == 0 || ras->ras_stride_bytes == 0 || + ras->ras_stride_bytes == ras->ras_stride_length) + return false; + + stride_gap = pos - ras->ras_last_read_end_bytes - 1; + + /* If it is contiguous read */ + if (stride_gap == 0) + return ras->ras_consecutive_bytes + count <= + ras->ras_stride_bytes; + + /* Otherwise check the stride by itself */ + return (ras->ras_stride_length - ras->ras_stride_bytes) == stride_gap && + ras->ras_consecutive_bytes == ras->ras_stride_bytes && + count <= ras->ras_stride_bytes; +} + +static void ras_init_stride_detector(struct ll_readahead_state *ras, + loff_t pos, loff_t count) +{ + loff_t stride_gap = pos - ras->ras_last_read_end_bytes - 1; + + LASSERT(ras->ras_consecutive_stride_requests == 0); + + if (pos <= ras->ras_last_read_end_bytes) { + /*Reset stride window for forward read*/ + ras_stride_reset(ras); + return; + } + + ras->ras_stride_bytes = ras->ras_consecutive_bytes; + ras->ras_stride_length = stride_gap + ras->ras_consecutive_bytes; + ras->ras_consecutive_stride_requests++; + ras->ras_stride_offset = pos; + + RAS_CDEBUG(ras); +} + +static unsigned long +stride_page_count(struct ll_readahead_state *ras, loff_t len) +{ + loff_t bytes_count = + stride_byte_count(ras->ras_stride_offset, + ras->ras_stride_length, ras->ras_stride_bytes, + ras->ras_window_start_idx << PAGE_SHIFT, len); + + return (bytes_count + PAGE_SIZE - 1) >> PAGE_SHIFT; +} + +/* Stride Read-ahead window will be increased inc_len according to + * stride I/O pattern */ +static void ras_stride_increase_window(struct ll_readahead_state *ras, + struct ll_ra_info *ra, loff_t inc_bytes) +{ + loff_t window_bytes, stride_bytes; + u64 left_bytes; + u64 step; + loff_t end; + + /* temporarily store in page units to reduce LASSERT() cost below */ + end = ras->ras_window_start_idx + ras->ras_window_pages; + + LASSERT(ras->ras_stride_length > 0); + LASSERTF(end >= (ras->ras_stride_offset >> PAGE_SHIFT), + "window_start_idx %lu, window_pages %lu stride_offset %llu\n", + ras->ras_window_start_idx, ras->ras_window_pages, + ras->ras_stride_offset); + + end <<= PAGE_SHIFT; + if (end <= ras->ras_stride_offset) + stride_bytes = 0; + else + stride_bytes = end - ras->ras_stride_offset; + + div64_u64_rem(stride_bytes, ras->ras_stride_length, &left_bytes); + window_bytes = (ras->ras_window_pages << PAGE_SHIFT); + if (left_bytes < ras->ras_stride_bytes) { + if (ras->ras_stride_bytes - left_bytes >= inc_bytes) { + window_bytes += inc_bytes; + goto out; + } else { + window_bytes += (ras->ras_stride_bytes - left_bytes); + inc_bytes -= (ras->ras_stride_bytes - left_bytes); + } + } else { + window_bytes += (ras->ras_stride_length - left_bytes); + } + + LASSERT(ras->ras_stride_bytes != 0); + + step = div64_u64_rem(inc_bytes, ras->ras_stride_bytes, &left_bytes); + + window_bytes += step * ras->ras_stride_length + left_bytes; + LASSERT(window_bytes > 0); + +out: + if (stride_page_count(ras, window_bytes) <= + ra->ra_max_pages_per_file || ras->ras_window_pages == 0) + ras->ras_window_pages = (window_bytes >> PAGE_SHIFT); + + LASSERT(ras->ras_window_pages > 0); + + RAS_CDEBUG(ras); +} + +static void ras_increase_window(struct inode *inode, + struct ll_readahead_state *ras, + struct ll_ra_info *ra) +{ + /* The stretch of ra-window should be aligned with max rpc_size + * but current clio architecture does not support retrieve such + * information from lower layer. 
FIXME later + */ + if (stride_io_mode(ras)) { + ras_stride_increase_window(ras, ra, + (loff_t)ras->ras_rpc_pages << PAGE_SHIFT); + } else { + pgoff_t window_pages; + + window_pages = min(ras->ras_window_pages + ras->ras_rpc_pages, + ra->ra_max_pages_per_file); + if (window_pages < ras->ras_rpc_pages) + ras->ras_window_pages = window_pages; + else + ras->ras_window_pages = ras_align(ras, window_pages); + } +} + +/** + * Seek within 8 pages are considered as sequential read for now. + */ +static inline bool is_loose_seq_read(struct ll_readahead_state *ras, loff_t pos) +{ + return pos_in_window(pos, ras->ras_last_read_end_bytes, + 8UL << PAGE_SHIFT, 8UL << PAGE_SHIFT); +} + +static inline bool is_loose_mmap_read(struct ll_sb_info *sbi, + struct ll_readahead_state *ras, + unsigned long pos) +{ + unsigned long range_pages = sbi->ll_ra_info.ra_range_pages; + + return pos_in_window(pos, ras->ras_last_read_end_bytes, + range_pages << PAGE_SHIFT, + range_pages << PAGE_SHIFT); +} + +/** + * We have observed slow mmap read performances for some + * applications. The problem is if access pattern is neither + * sequential nor stride, but could be still adjacent in a + * small range and then seek a random position. + * + * So the pattern could be something like this: + * + * [1M data] [hole] [0.5M data] [hole] [0.7M data] [1M data] + * + * + * Every time an application reads mmap data, it may not only + * read a single 4KB page, but aslo a cluster of nearby pages in + * a range(e.g. 1MB) of the first page after a cache miss. + * + * The readahead engine is modified to track the range size of + * a cluster of mmap reads, so that after a seek and/or cache miss, + * the range size is used to efficiently prefetch multiple pages + * in a single RPC rather than many small RPCs. + */ +static void ras_detect_cluster_range(struct ll_readahead_state *ras, + struct ll_sb_info *sbi, + unsigned long pos, unsigned long count) +{ + pgoff_t last_pages, pages; + pgoff_t end_idx = (pos + count - 1) >> PAGE_SHIFT; + + last_pages = ras->ras_range_max_end_idx - + ras->ras_range_min_start_idx + 1; + /* First time come here */ + if (!ras->ras_range_max_end_idx) + goto out; + + /* Random or Stride read */ + if (!is_loose_mmap_read(sbi, ras, pos)) + goto out; + + ras->ras_range_requests++; + if (ras->ras_range_max_end_idx < end_idx) + ras->ras_range_max_end_idx = end_idx; + + if (ras->ras_range_min_start_idx > (pos >> PAGE_SHIFT)) + ras->ras_range_min_start_idx = pos >> PAGE_SHIFT; + + /* Out of range, consider it as random or stride */ + pages = ras->ras_range_max_end_idx - + ras->ras_range_min_start_idx + 1; + if (pages <= sbi->ll_ra_info.ra_range_pages) + return; +out: + ras->ras_last_range_pages = last_pages; + ras->ras_range_requests = 0; + ras->ras_range_min_start_idx = pos >> PAGE_SHIFT; + ras->ras_range_max_end_idx = end_idx; +} + +static void ras_detect_read_pattern(struct ll_readahead_state *ras, + struct ll_sb_info *sbi, + loff_t pos, size_t count, bool mmap) +{ + bool stride_detect = false; + pgoff_t index = pos >> PAGE_SHIFT; + + /* + * Reset the read-ahead window in two cases. First when the app seeks + * or reads to some other part of the file. Secondly if we get a + * read-ahead miss that we think we've previously issued. This can + * be a symptom of there being so many read-ahead pages that the VM + * is reclaiming it before we get to it. 
+ */ + if (!is_loose_seq_read(ras, pos)) { + /* Check whether it is in stride I/O mode */ + if (!read_in_stride_window(ras, pos, count)) { + if (ras->ras_consecutive_stride_requests == 0) + ras_init_stride_detector(ras, pos, count); + else + ras_stride_reset(ras); + ras->ras_consecutive_bytes = 0; + ras_reset(ras, index); + } else { + ras->ras_consecutive_bytes = 0; + ras->ras_consecutive_requests = 0; + if (++ras->ras_consecutive_stride_requests > 1) + stride_detect = true; + RAS_CDEBUG(ras); + } + ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE); + } else if (stride_io_mode(ras)) { + /* + * If this is contiguous read but in stride I/O mode + * currently, check whether stride step still is valid, + * if invalid, it will reset the stride ra window to + * be zero. + */ + if (!read_in_stride_window(ras, pos, count)) { + ras_stride_reset(ras); + ras->ras_window_pages = 0; + ras->ras_next_readahead_idx = index; + } + } + + ras->ras_consecutive_bytes += count; + if (mmap) { + pgoff_t idx = ras->ras_consecutive_bytes >> PAGE_SHIFT; + unsigned long ra_range_pages = + max_t(unsigned long, RA_MIN_MMAP_RANGE_PAGES, + sbi->ll_ra_info.ra_range_pages); + + if ((idx >= ra_range_pages && + idx % ra_range_pages == 0) || stride_detect) + ras->ras_need_increase_window = true; + } else if ((ras->ras_consecutive_requests > 1 || stride_detect)) { + ras->ras_need_increase_window = true; + } + + ras->ras_last_read_end_bytes = pos + count - 1; +} + +void ll_ras_enter(struct file *f, loff_t pos, size_t count) +{ + struct ll_file_data *fd = f->private_data; + struct ll_readahead_state *ras = &fd->fd_ras; + struct inode *inode = file_inode(f); + unsigned long index = pos >> PAGE_SHIFT; + struct ll_sb_info *sbi = ll_i2sbi(inode); + + spin_lock(&ras->ras_lock); + ras->ras_requests++; + ras->ras_consecutive_requests++; + ras->ras_need_increase_window = false; + ras->ras_no_miss_check = false; + /* + * On the second access to a file smaller than the tunable + * ra_max_read_ahead_whole_pages trigger RA on all pages in the + * file up to ra_max_pages_per_file. This is simply a best effort + * and only occurs once per open file. Normal RA behavior is reverted + * to for subsequent IO. 
+ */ + if (ras->ras_requests >= 2) { + __u64 kms_pages; + struct ll_ra_info *ra = &sbi->ll_ra_info; + + kms_pages = (i_size_read(inode) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + + CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages, + ra->ra_max_read_ahead_whole_pages, + ra->ra_max_pages_per_file); + + if (kms_pages && + kms_pages <= ra->ra_max_read_ahead_whole_pages) { + ras->ras_window_start_idx = 0; + ras->ras_next_readahead_idx = index + 1; + ras->ras_window_pages = min(ra->ra_max_pages_per_file, + ra->ra_max_read_ahead_whole_pages); + ras->ras_no_miss_check = true; + GOTO(out_unlock, 0); + } + } + ras_detect_read_pattern(ras, sbi, pos, count, false); +out_unlock: + spin_unlock(&ras->ras_lock); +} + +static bool index_in_stride_window(struct ll_readahead_state *ras, + pgoff_t index) +{ + loff_t pos = (loff_t)index << PAGE_SHIFT; + + if (ras->ras_stride_length == 0 || ras->ras_stride_bytes == 0 || + ras->ras_stride_bytes == ras->ras_stride_length) + return false; + + if (pos >= ras->ras_stride_offset) { + u64 offset; + + div64_u64_rem(pos - ras->ras_stride_offset, + ras->ras_stride_length, &offset); + if (offset < ras->ras_stride_bytes || + ras->ras_stride_length - offset < PAGE_SIZE) + return true; + } else if (ras->ras_stride_offset - pos < PAGE_SIZE) { + return true; + } + + return false; +} + +/* + * ll_ras_enter() is used to detect read pattern according to pos and count. + * + * ras_update() is used to detect cache miss and + * reset window or increase window accordingly + */ +static void ras_update(struct ll_sb_info *sbi, struct inode *inode, + struct ll_readahead_state *ras, pgoff_t index, + enum ras_update_flags flags, struct cl_io *io) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + bool hit = flags & LL_RAS_HIT; + + ENTRY; + spin_lock(&ras->ras_lock); + + if (!hit) + CDEBUG(D_READA|D_IOTRACE, DFID " pages at %lu miss.\n", + PFID(ll_inode2fid(inode)), index); + ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS); + + /* + * The readahead window has been expanded to cover whole + * file size, we don't care whether ra miss happen or not. + * Because we will read whole file to page cache even if + * some pages missed. 
+ */ + if (ras->ras_no_miss_check) + GOTO(out_unlock, 0); + + if (io && io->ci_rand_read) + GOTO(out_unlock, 0); + + if (io && io->ci_seq_read) { + if (!hit) { + /* to avoid many small read RPC here */ + ras->ras_window_pages = sbi->ll_ra_info.ra_range_pages; + ll_ra_stats_inc_sbi(sbi, RA_STAT_MMAP_RANGE_READ); + } + goto skip; + } + + if (flags & LL_RAS_MMAP) { + unsigned long ra_pages; + + ras_detect_cluster_range(ras, sbi, index << PAGE_SHIFT, + PAGE_SIZE); + ras_detect_read_pattern(ras, sbi, (loff_t)index << PAGE_SHIFT, + PAGE_SIZE, true); + + /* we did not detect anything but we could prefetch */ + if (!ras->ras_need_increase_window && + ras->ras_window_pages <= sbi->ll_ra_info.ra_range_pages && + ras->ras_range_requests >= 2) { + if (!hit) { + ra_pages = max_t(unsigned long, + RA_MIN_MMAP_RANGE_PAGES, + ras->ras_last_range_pages); + if (index < ra_pages / 2) + index = 0; + else + index -= ra_pages / 2; + ras->ras_window_pages = ra_pages; + ll_ra_stats_inc_sbi(sbi, + RA_STAT_MMAP_RANGE_READ); + } else { + ras->ras_window_pages = 0; + } + goto skip; + } + } + + if (!hit && ras->ras_window_pages && + index < ras->ras_next_readahead_idx && + pos_in_window(index, ras->ras_window_start_idx, 0, + ras->ras_window_pages)) { + ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW); + ras->ras_need_increase_window = false; + + if (index_in_stride_window(ras, index) && + stride_io_mode(ras)) { + /* + * if (index != ras->ras_last_readpage + 1) + * ras->ras_consecutive_pages = 0; + */ + ras_reset(ras, index); + + /* + * If stride-RA hit cache miss, the stride + * detector will not be reset to avoid the + * overhead of redetecting read-ahead mode, + * but on the condition that the stride window + * is still intersect with normal sequential + * read-ahead window. + */ + if (ras->ras_window_start_idx < ras->ras_stride_offset) + ras_stride_reset(ras); + RAS_CDEBUG(ras); + } else { + /* + * Reset both stride window and normal RA + * window. + */ + ras_reset(ras, index); + /* ras->ras_consecutive_pages++; */ + ras->ras_consecutive_bytes = 0; + ras_stride_reset(ras); + GOTO(out_unlock, 0); + } + } + +skip: + ras_set_start(ras, index); + + if (stride_io_mode(ras)) { + /* Since stride readahead is sentivite to the offset + * of read-ahead, so we use original offset here, + * instead of ras_window_start_idx, which is RPC aligned. 
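
A cache miss is handled differently in ras_update() depending on whether the missed page should already have been covered by the current window; only in that case is the window reset rather than grown. A small model of that containment test, with illustrative names and sample values:

    #include <stdbool.h>
    #include <stdio.h>

    /* Was the missed page one the current window should already have
     * covered?  If so, the readahead window gets reset instead of grown. */
    static bool miss_inside_window(unsigned long index,
                                   unsigned long win_start,
                                   unsigned long win_pages,
                                   unsigned long next_readahead)
    {
        return win_pages != 0 &&
               index < next_readahead &&
               index >= win_start &&
               index < win_start + win_pages;
    }

    int main(void)
    {
        /* window covers pages [100, 164), readahead has advanced to 164 */
        printf("%d\n", miss_inside_window(120, 100, 64, 164)); /* 1: reset window */
        printf("%d\n", miss_inside_window(200, 100, 64, 164)); /* 0: ordinary miss */
        return 0;
    }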
+ */ + ras->ras_next_readahead_idx = max(index + 1, + ras->ras_next_readahead_idx); + ras->ras_window_start_idx = + max_t(pgoff_t, ras->ras_window_start_idx, + ras->ras_stride_offset >> PAGE_SHIFT); + } else { + if (ras->ras_next_readahead_idx < ras->ras_window_start_idx) + ras->ras_next_readahead_idx = ras->ras_window_start_idx; + if (!hit) + ras->ras_next_readahead_idx = index + 1; + } + + if (ras->ras_need_increase_window) { + ras_increase_window(inode, ras, ra); + ras->ras_need_increase_window = false; + } + + EXIT; +out_unlock: + spin_unlock(&ras->ras_lock); +} + +int ll_writepage(struct page *vmpage, struct writeback_control *wbc) +{ + struct inode *inode = vmpage->mapping->host; + struct ll_inode_info *lli = ll_i2info(inode); + struct lu_env *env; + struct cl_io *io; + struct cl_page *page; + struct cl_object *clob; + bool redirtied = false; + bool unlocked = false; + int result; + __u16 refcheck; + ENTRY; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + + LASSERT(ll_i2dtexp(inode) != NULL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, result = PTR_ERR(env)); + + clob = ll_i2info(inode)->lli_clob; + LASSERT(clob != NULL); + + io = vvp_env_thread_io(env); + io->ci_obj = clob; + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, clob); + if (result == 0) { + page = cl_page_find(env, clob, vmpage->index, + vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + lu_ref_add(&page->cp_reference, "writepage", + current); + cl_page_assume(env, io, page); + result = cl_page_flush(env, io, page); + if (result != 0) { + /* + * Re-dirty page on error so it retries write, + * but not in case when IO has actually + * occurred and completed with an error. + */ + if (!PageError(vmpage)) { + redirty_page_for_writepage(wbc, vmpage); + result = 0; + redirtied = true; + } + } + cl_page_disown(env, io, page); + unlocked = true; + lu_ref_del(&page->cp_reference, + "writepage", current); + cl_page_put(env, page); + } else { + result = PTR_ERR(page); + } + } + cl_io_fini(env, io); + + if (redirtied && wbc->sync_mode == WB_SYNC_ALL) { + loff_t offset = cl_offset(clob, vmpage->index); + + /* Flush page failed because the extent is being written out. + * Wait for the write of extent to be finished to avoid + * breaking kernel which assumes ->writepage should mark + * PageWriteback or clean the page. */ + result = cl_sync_file_range(inode, offset, + offset + PAGE_SIZE - 1, + CL_FSYNC_LOCAL, 1); + if (result > 0) { + /* actually we may have written more than one page. + * decreasing this page because the caller will count + * it. 
*/ + wbc->nr_to_write -= result - 1; + result = 0; + } + } + + cl_env_put(env, &refcheck); + GOTO(out, result); + +out: + if (result < 0) { + if (!lli->lli_async_rc) + lli->lli_async_rc = result; + SetPageError(vmpage); + if (!unlocked) + unlock_page(vmpage); + } + return result; +} + +int ll_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + loff_t start; + loff_t end; + enum cl_fsync_mode mode; + int range_whole = 0; + int result; + ENTRY; + + if (wbc->range_cyclic) { + start = (loff_t)mapping->writeback_index << PAGE_SHIFT; + end = OBD_OBJECT_EOF; + } else { + start = wbc->range_start; + end = wbc->range_end; + if (end == LLONG_MAX) { + end = OBD_OBJECT_EOF; + range_whole = start == 0; + } + } + + mode = CL_FSYNC_NONE; + if (wbc->sync_mode == WB_SYNC_ALL) + mode = CL_FSYNC_LOCAL; + + if (ll_i2info(inode)->lli_clob == NULL) + RETURN(0); + + /* for directio, it would call writepages() to evict cached pages + * inside the IO context of write, which will cause deadlock at + * layout_conf since it waits for active IOs to complete. */ + result = cl_sync_file_range(inode, start, end, mode, 1); + if (result > 0) { + wbc->nr_to_write -= result; + result = 0; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) { + if (end == OBD_OBJECT_EOF) + mapping->writeback_index = 0; + else + mapping->writeback_index = (end >> PAGE_SHIFT) + 1; + } + RETURN(result); +} + +struct ll_cl_context *ll_cl_find(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_cl_context *lcc; + struct ll_cl_context *found = NULL; + + read_lock(&lli->lli_lock); + list_for_each_entry(lcc, &lli->lli_lccs, lcc_list) { + if (lcc->lcc_cookie == current) { + found = lcc; + break; + } + } + read_unlock(&lli->lli_lock); + + return found; +} + +void ll_cl_add(struct inode *inode, const struct lu_env *env, struct cl_io *io, + enum lcc_type type) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx; + + memset(lcc, 0, sizeof(*lcc)); + INIT_LIST_HEAD(&lcc->lcc_list); + lcc->lcc_cookie = current; + lcc->lcc_env = env; + lcc->lcc_io = io; + lcc->lcc_type = type; + + write_lock(&lli->lli_lock); + list_add(&lcc->lcc_list, &lli->lli_lccs); + write_unlock(&lli->lli_lock); +} + +void ll_cl_remove(struct inode *inode, const struct lu_env *env) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx; + + write_lock(&lli->lli_lock); + list_del_init(&lcc->lcc_list); + write_unlock(&lli->lli_lock); +} + +int ll_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, struct file *file) +{ + struct inode *inode = vvp_object_inode(page->cp_obj); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_file_data *fd = NULL; + struct ll_readahead_state *ras = NULL; + struct cl_2queue *queue = &io->ci_queue; + struct cl_sync_io *anchor = NULL; + struct vvp_page *vpg; + int rc = 0, rc2 = 0; + bool uptodate; + struct vvp_io *vio = vvp_env_io(env); + bool mmap = !vio->vui_ra_valid; + pgoff_t ra_start_index = 0; + pgoff_t io_start_index; + pgoff_t io_end_index; + bool unlockpage = true; + ENTRY; + + if (file) { + fd = file->private_data; + ras = &fd->fd_ras; + } + + /* PagePrivate2 is set in ll_io_zero_page() to tell us the vmpage + * must not be unlocked after processing. 
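
ll_cl_find()/ll_cl_add()/ll_cl_remove() above implement a small registry: each thread publishes its current I/O context on a per-inode list and later looks it up by comparing against current. A rough userspace equivalent of the same pattern, using a pthread rwlock and thread IDs instead of the kernel primitives (names are illustrative, not the kernel structures):

    #include <pthread.h>
    #include <stddef.h>

    struct io_ctx {
        pthread_t owner;        /* plays the role of lcc_cookie == current */
        void *io;               /* whatever state the I/O path needs */
        struct io_ctx *next;
    };

    static pthread_rwlock_t ctx_lock = PTHREAD_RWLOCK_INITIALIZER;
    static struct io_ctx *ctx_head;

    /* publish the calling thread's context (compare ll_cl_add) */
    static void ctx_add(struct io_ctx *c)
    {
        c->owner = pthread_self();
        pthread_rwlock_wrlock(&ctx_lock);
        c->next = ctx_head;
        ctx_head = c;
        pthread_rwlock_unlock(&ctx_lock);
    }

    /* look up the context registered by the calling thread (compare ll_cl_find) */
    static struct io_ctx *ctx_find(void)
    {
        struct io_ctx *c, *found = NULL;

        pthread_rwlock_rdlock(&ctx_lock);
        for (c = ctx_head; c != NULL; c = c->next) {
            if (pthread_equal(c->owner, pthread_self())) {
                found = c;
                break;
            }
        }
        pthread_rwlock_unlock(&ctx_lock);
        return found;
    }

    /* unlink a context once the I/O is done (compare ll_cl_remove) */
    static void ctx_remove(struct io_ctx *c)
    {
        struct io_ctx **p;

        pthread_rwlock_wrlock(&ctx_lock);
        for (p = &ctx_head; *p != NULL; p = &(*p)->next) {
            if (*p == c) {
                *p = c->next;
                break;
            }
        }
        pthread_rwlock_unlock(&ctx_lock);
    }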
+ */ + if (page->cp_vmpage && PagePrivate2(page->cp_vmpage)) + unlockpage = false; + + vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page)); + uptodate = vpg->vpg_defer_uptodate; + + if (ll_readahead_enabled(sbi) && !vpg->vpg_ra_updated && ras) { + enum ras_update_flags flags = 0; + + if (uptodate) + flags |= LL_RAS_HIT; + if (mmap) + flags |= LL_RAS_MMAP; + ras_update(sbi, inode, ras, vvp_index(vpg), flags, io); + } + + cl_2queue_init(queue); + if (uptodate) { + vpg->vpg_ra_used = 1; + cl_page_export(env, page, 1); + cl_page_disown(env, io, page); + } else { + anchor = &vvp_env_info(env)->vti_anchor; + cl_sync_io_init(anchor, 1); + page->cp_sync_io = anchor; + + cl_2queue_add(queue, page, true); + } + + /* mmap does not set the ci_rw fields */ + if (!mmap) { + io_start_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos); + io_end_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos + + io->u.ci_rw.crw_count - 1); + } else { + io_start_index = vvp_index(vpg); + io_end_index = vvp_index(vpg); + } + + if (ll_readahead_enabled(sbi) && ras && !io->ci_rand_read) { + pgoff_t skip_index = 0; + + if (ras->ras_next_readahead_idx < vvp_index(vpg)) + skip_index = vvp_index(vpg); + rc2 = ll_readahead(env, io, &queue->c2_qin, ras, + uptodate, file, skip_index, + &ra_start_index); + CDEBUG(D_READA|D_IOTRACE, + DFID " %d pages read ahead at %lu, triggered by user read at %lu\n", + PFID(ll_inode2fid(inode)), rc2, ra_start_index, + vvp_index(vpg)); + } else if (vvp_index(vpg) == io_start_index && + io_end_index - io_start_index > 0) { + rc2 = ll_readpages(env, io, &queue->c2_qin, io_start_index + 1, + io_end_index); + CDEBUG(D_READA, DFID " %d pages read at %lu\n", + PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg)); + } + + if (queue->c2_qin.pl_nr > 0) { + int count = queue->c2_qin.pl_nr; + rc = cl_io_submit_rw(env, io, CRT_READ, queue); + if (rc == 0) + task_io_account_read(PAGE_SIZE * count); + } + + + if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent */ + rc = cl_sync_io_wait(env, anchor, 0); + + cl_page_assume(env, io, page); + cl_page_list_del(env, &queue->c2_qout, page); + + if (!PageUptodate(cl_page_vmpage(page))) { + /* Failed to read a mirror, discard this page so that + * new page can be created with new mirror. + * + * TODO: this is not needed after page reinit + * route is implemented */ + cl_page_discard(env, io, page); + } + if (unlockpage) + cl_page_disown(env, io, page); + } + + /* TODO: discard all pages until page reinit route is implemented */ + cl_page_list_discard(env, io, &queue->c2_qin); + + /* Unlock unsent read pages in case of error. */ + cl_page_list_disown(env, io, &queue->c2_qin); + + cl_2queue_fini(env, queue); + + RETURN(rc); +} + +/* + * Possible return value: + * 0 no async readahead triggered and fast read could not be used. + * 1 no async readahead, but fast read could be used. + * 2 async readahead triggered and fast read could be used too. + * < 0 on error. + */ +static int kickoff_async_readahead(struct file *file, unsigned long pages) +{ + struct ll_readahead_work *lrw; + struct inode *inode = file_inode(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_file_data *fd = file->private_data; + struct ll_readahead_state *ras = &fd->fd_ras; + struct ll_ra_info *ra = &sbi->ll_ra_info; + unsigned long throttle; + pgoff_t start_idx = ras_align(ras, ras->ras_next_readahead_idx); + pgoff_t end_idx = start_idx + pages - 1; + + /** + * In case we have a limited max_cached_mb, readahead + * should be stopped if it have run out of all LRU slots. 
+ */ + if (atomic_read(&ra->ra_cur_pages) >= sbi->ll_cache->ccc_lru_max) { + ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT); + return 0; + } + + throttle = min(ra->ra_async_pages_per_file_threshold, + ra->ra_max_pages_per_file); + /* + * If this is strided i/o or the window is smaller than the + * throttle limit, we do not do async readahead. Otherwise, + * we do async readahead, allowing the user thread to do fast i/o. + */ + if (stride_io_mode(ras) || !throttle || + ras->ras_window_pages < throttle || + atomic_read(&ra->ra_async_inflight) > ra->ra_async_max_active) + return 0; + + if ((atomic_read(&ra->ra_cur_pages) + pages) > ra->ra_max_pages) + return 0; + + if (ras->ras_async_last_readpage_idx == start_idx) + return 1; + + /* ll_readahead_work_free() free it */ + OBD_ALLOC_PTR(lrw); + if (lrw) { + atomic_inc(&sbi->ll_ra_info.ra_async_inflight); + lrw->lrw_file = get_file(file); + lrw->lrw_start_idx = start_idx; + lrw->lrw_end_idx = end_idx; + lrw->lrw_user_pid = current->pid; + spin_lock(&ras->ras_lock); + ras->ras_next_readahead_idx = end_idx + 1; + ras->ras_async_last_readpage_idx = start_idx; + spin_unlock(&ras->ras_lock); + memcpy(lrw->lrw_jobid, ll_i2info(inode)->lli_jobid, + sizeof(lrw->lrw_jobid)); + ll_readahead_work_add(inode, lrw); + } else { + return -ENOMEM; + } + + return 2; +} + +/* + * Check if we can issue a readahead RPC, if that is + * the case, we can't do fast IO because we will need + * a cl_io to issue the RPC. + */ +static bool ll_use_fast_io(struct file *file, + struct ll_readahead_state *ras, pgoff_t index) +{ + unsigned long fast_read_pages = + max(RA_REMAIN_WINDOW_MIN, ras->ras_rpc_pages); + loff_t skip_pages; + loff_t stride_bytes = ras->ras_stride_bytes; + + if (stride_io_mode(ras) && stride_bytes) { + skip_pages = (ras->ras_stride_length + + ras->ras_stride_bytes - 1) / stride_bytes; + skip_pages *= fast_read_pages; + } else { + skip_pages = fast_read_pages; + } + + if (ras->ras_window_start_idx + ras->ras_window_pages < + ras->ras_next_readahead_idx + skip_pages || + kickoff_async_readahead(file, fast_read_pages) > 0) + return true; + + return false; +} + +int ll_readpage(struct file *file, struct page *vmpage) +{ + struct inode *inode = file_inode(file); + struct cl_object *clob = ll_i2info(inode)->lli_clob; + struct ll_sb_info *sbi = ll_i2sbi(inode); + const struct lu_env *env = NULL; + struct cl_read_ahead ra = { 0 }; + struct ll_cl_context *lcc; + struct cl_io *io = NULL; + struct cl_page *page; + int result; + ENTRY; + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LLITE_READPAGE_PAUSE)) { + unlock_page(vmpage); + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_READPAGE_PAUSE, cfs_fail_val); + lock_page(vmpage); + } + + /* + * The @vmpage got truncated. + * This is a kernel bug introduced since kernel 5.12: + * comment: cbd59c48ae2bcadc4a7599c29cf32fd3f9b78251 + * ("mm/filemap: use head pages in generic_file_buffered_read") + * + * The page end offset calculation in filemap_get_read_batch() was off + * by one. When a read is submitted with end offset 1048575, then it + * calculates the end page for read of 256 where it should be 255. This + * results in the readpage() for the page with index 256 is over stripe + * boundary and may not covered by a DLM extent lock. + * + * This happens in a corner race case: filemap_get_read_batch() adds + * the page with index 256 for read which is not in the current read + * I/O context, and this page is being invalidated and will be removed + * from page cache due to the lock protected it being revoken. 
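
kickoff_async_readahead() above only hands work to a background thread when it is likely to pay off: not for stride I/O, not for small windows, and not when too much readahead is already in flight or cached. A compact model of that gating logic, with illustrative field names and sample numbers:

    #include <stdbool.h>
    #include <stdio.h>

    struct ra_state {
        bool stride_io;
        unsigned long window_pages;
        unsigned long async_threshold;      /* per-file async threshold */
        unsigned long max_pages_per_file;
        unsigned long cur_pages, max_pages; /* pages already cached / global cap */
        unsigned long async_inflight, async_max_active;
    };

    static bool should_kickoff_async(const struct ra_state *ra, unsigned long pages)
    {
        unsigned long throttle = ra->async_threshold < ra->max_pages_per_file ?
                                 ra->async_threshold : ra->max_pages_per_file;

        if (ra->stride_io || throttle == 0 || ra->window_pages < throttle)
            return false;                    /* not a big sequential window */
        if (ra->async_inflight > ra->async_max_active)
            return false;                    /* workers already saturated */
        if (ra->cur_pages + pages > ra->max_pages)
            return false;                    /* would blow the cache budget */
        return true;
    }

    int main(void)
    {
        struct ra_state ra = {
            .stride_io = false, .window_pages = 4096,
            .async_threshold = 1024, .max_pages_per_file = 8192,
            .cur_pages = 0, .max_pages = 1 << 18,
            .async_inflight = 0, .async_max_active = 2,
        };

        printf("%d\n", should_kickoff_async(&ra, 1024));  /* 1: worth a worker */
        ra.window_pages = 256;
        printf("%d\n", should_kickoff_async(&ra, 1024));  /* 0: window too small */
        return 0;
    }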
This + * results in this page in the read path not covered by any DLM lock. + * + * The solution is simple. Check whether the page was truncated in + * ->readpage(). If so, just return AOP_TRUNCATED_PAGE to the upper + * caller. Then the kernel will retry to batch pages, and it will not + * add the truncated page into batches as it was removed from page + * cache of the file. + */ + if (vmpage->mapping != inode->i_mapping) { + unlock_page(vmpage); + RETURN(AOP_TRUNCATED_PAGE); + } + + lcc = ll_cl_find(inode); + if (lcc != NULL) { + env = lcc->lcc_env; + io = lcc->lcc_io; + } + + if (io == NULL) { /* fast read */ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct ll_readahead_state *ras = &fd->fd_ras; + struct lu_env *local_env = NULL; + struct vvp_page *vpg; + + CDEBUG(D_VFSTRACE, "fast read pgno: %ld\n", vmpage->index); + + result = -ENODATA; + + /* TODO: need to verify the layout version to make sure + * the page is not invalid due to layout change. */ + page = cl_vmpage_page(vmpage, clob); + if (page == NULL) { + unlock_page(vmpage); + ll_ra_stats_inc_sbi(sbi, RA_STAT_FAILED_FAST_READ); + RETURN(result); + } + + vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page)); + if (vpg->vpg_defer_uptodate) { + enum ras_update_flags flags = LL_RAS_HIT; + + if (lcc && lcc->lcc_type == LCC_MMAP) + flags |= LL_RAS_MMAP; + + /* For fast read, it updates read ahead state only + * if the page is hit in cache because non cache page + * case will be handled by slow read later. */ + ras_update(sbi, inode, ras, vvp_index(vpg), flags, io); + /* avoid duplicate ras_update() call */ + vpg->vpg_ra_updated = 1; + + if (ll_use_fast_io(file, ras, vvp_index(vpg))) + result = 0; + } + + if (!env) { + local_env = cl_env_percpu_get(); + env = local_env; + } + + /* export the page and skip io stack */ + if (result == 0) { + vpg->vpg_ra_used = 1; + cl_page_export(env, page, 1); + } else { + ll_ra_stats_inc_sbi(sbi, RA_STAT_FAILED_FAST_READ); + } + /* release page refcount before unlocking the page to ensure + * the object won't be destroyed in the calling path of + * cl_page_put(). Please see comment in ll_releasepage(). */ + cl_page_put(env, page); + unlock_page(vmpage); + if (local_env) + cl_env_percpu_put(local_env); + + RETURN(result); + } + + if (lcc && lcc->lcc_type != LCC_MMAP) { + CDEBUG(D_VFSTRACE, "pgno:%ld, beyond read end_index:%ld\n", + vmpage->index, lcc->lcc_end_index); + + /* + * This handles a kernel bug introduced in kernel 5.12: + * comment: cbd59c48ae2bcadc4a7599c29cf32fd3f9b78251 + * ("mm/filemap: use head pages in generic_file_buffered_read") + * + * See above in this function for a full description of the + * bug. Briefly, the kernel will try to read 1 more page than + * was actually requested *if that page is already in cache*. + * + * Because this page is beyond the boundary of the requested + * read, Lustre does not lock it as part of the read. This + * means we must check if there is a valid dlmlock on this + * this page and reference it before we attempt to read in the + * page. If there is not a valid dlmlock, then we are racing + * with dlmlock cancellation and the page is being removed + * from the cache. + * + * That means we should return AOP_TRUNCATED_PAGE, which will + * cause the kernel to retry the read, which should allow the + * page to be removed from cache as the lock is cancelled. + * + * This should never occur except in kernels with the bug + * mentioned above. 
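
The retry protocol described above is simple in shape: if the page no longer belongs to the file's mapping, return a sentinel and let the caller rebuild its page batch. A toy model of that contract (the sentinel and names are illustrative stand-ins, not the kernel's AOP_TRUNCATED_PAGE machinery):

    #include <stdio.h>

    #define READPAGE_RETRY 1

    struct fake_page { void *mapping; };

    static int model_readpage(struct fake_page *pg, void *file_mapping)
    {
        if (pg->mapping != file_mapping)
            return READPAGE_RETRY;   /* dropped from cache, caller must retry */
        return 0;                    /* normal read path */
    }

    int main(void)
    {
        struct fake_page pg = { .mapping = (void *)0x1 };

        printf("%d\n", model_readpage(&pg, (void *)0x1)); /* 0: still attached */
        pg.mapping = NULL;                                /* page was truncated */
        printf("%d\n", model_readpage(&pg, (void *)0x1)); /* 1: caller re-batches */
        return 0;
    }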
+ */ + if (vmpage->index >= lcc->lcc_end_index) { + result = cl_io_read_ahead(env, io, vmpage->index, &ra); + if (result < 0 || vmpage->index > ra.cra_end_idx) { + cl_read_ahead_release(env, &ra); + unlock_page(vmpage); + RETURN(AOP_TRUNCATED_PAGE); + } + } + } + + /** + * Direct read can fall back to buffered read, but DIO is done + * with lockless i/o, and buffered requires LDLM locking, so in + * this case we must restart without lockless. + */ + if (file->f_flags & O_DIRECT && + lcc && lcc->lcc_type == LCC_RW && + !io->ci_dio_lock) { + unlock_page(vmpage); + io->ci_dio_lock = 1; + io->ci_need_restart = 1; + GOTO(out, result = -ENOLCK); + } + + LASSERT(io->ci_state == CIS_IO_GOING); + page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + LASSERT(page->cp_type == CPT_CACHEABLE); + if (likely(!PageUptodate(vmpage))) { + cl_page_assume(env, io, page); + + result = ll_io_read_page(env, io, page, file); + } else { + /* Page from a non-object file. */ + unlock_page(vmpage); + result = 0; + } + cl_page_put(env, page); + } else { + unlock_page(vmpage); + result = PTR_ERR(page); + } + +out: + if (ra.cra_release != NULL) + cl_read_ahead_release(env, &ra); + + /* this delay gives time for the actual read of the page to finish and + * unlock the page in vvp_page_completion_read before we return to our + * caller and the caller tries to use the page, allowing us to test + * races with the page being unlocked after readpage() but before it's + * used by the caller + */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_READPAGE_PAUSE2, cfs_fail_val); + + RETURN(result); +} + +#ifdef HAVE_AOPS_READ_FOLIO +int ll_read_folio(struct file *file, struct folio *folio) +{ + return ll_readpage(file, folio_page(folio, 0)); +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/llite/rw26.c b/drivers/staging/lustrefsx/lustre/llite/rw26.c new file mode 100644 index 0000000000000..792e88003fb3d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/rw26.c @@ -0,0 +1,1023 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/lustre/llite/rw26.c + * + * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" +#include + +#ifdef HAVE_INVALIDATE_FOLIO +/** + * Implements Linux VM address_space::invalidate_folio() method. 
This method is + * called when the folio is truncated from a file, either as a result of + * explicit truncate, or when inode is removed from memory (as a result of + * final iput(), umount, or memory pressure induced icache shrinking). + * + * [0, off] bytes of the folio remain valid (this is for a case of non-page + * aligned truncate). Lustre leaves partially truncated folios in the cache, + * relying on struct inode::i_size to limit further accesses. + */ +static void ll_invalidate_folio(struct folio *folio, size_t offset, size_t len) +{ + struct inode *inode; + struct lu_env *env; + struct cl_page *page; + struct cl_object *obj; + + LASSERT(!folio_test_writeback(folio)); + LASSERT(folio_test_locked(folio)); + + if (!(offset == 0 && len == folio_size(folio)) && + !folio_test_large(folio)) + return; + + /* Drop the pages from the folio */ + env = cl_env_percpu_get(); + LASSERT(!IS_ERR(env)); + + inode = folio_inode(folio); + obj = ll_i2info(inode)->lli_clob; + if (obj != NULL) { + int n, npgs = folio_nr_pages(folio); + + for (n = 0; n < npgs; n++) { + struct page *vmpage = folio_page(folio, n); + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + + page = cl_vmpage_page(vmpage, obj); + if (page != NULL) { + cl_page_delete(env, page); + cl_page_put(env, page); + } + } + } else { + LASSERT(!folio_get_private(folio)); + } + cl_env_percpu_put(env); +} +#else + +/** + * Implements Linux VM address_space::invalidatepage() method. This method is + * called when the page is truncated from a file, either as a result of + * explicit truncate, or when inode is removed from memory (as a result of + * final iput(), umount, or memory pressure induced icache shrinking). + * + * [0, offset] bytes of the page remain valid (this is for a case of non-page + * aligned truncate). Lustre leaves partially truncated pages in the cache, + * relying on struct inode::i_size to limit further accesses.
+ */ +static void ll_invalidatepage(struct page *vmpage, +#ifdef HAVE_INVALIDATE_RANGE + unsigned int offset, unsigned int length +#else + unsigned long offset +#endif + ) +{ + struct inode *inode; + struct lu_env *env; + struct cl_page *page; + struct cl_object *obj; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + + /* + * It is safe to not check anything in invalidatepage/releasepage + * below because they are run with page locked and all our io is + * happening with locked page too + */ +#ifdef HAVE_INVALIDATE_RANGE + if (offset == 0 && length == PAGE_SIZE) { +#else + if (offset == 0) { +#endif + /* See the comment in ll_releasepage() */ + env = cl_env_percpu_get(); + LASSERT(!IS_ERR(env)); + + inode = vmpage->mapping->host; + obj = ll_i2info(inode)->lli_clob; + if (obj != NULL) { + page = cl_vmpage_page(vmpage, obj); + if (page != NULL) { + cl_page_delete(env, page); + cl_page_put(env, page); + } + } else + LASSERT(vmpage->private == 0); + + cl_env_percpu_put(env); + } + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LLITE_PAGE_INVALIDATE_PAUSE)) { + unlock_page(vmpage); + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_PAGE_INVALIDATE_PAUSE, + cfs_fail_val); + lock_page(vmpage); + } +} +#endif + +static bool do_release_page(struct page *vmpage, gfp_t wait) +{ + struct lu_env *env; + struct cl_object *obj; + struct cl_page *page; + struct address_space *mapping; + int result = 0; + + ENTRY; + + LASSERT(PageLocked(vmpage)); + if (PageWriteback(vmpage) || PageDirty(vmpage)) + RETURN(0); + + mapping = vmpage->mapping; + if (mapping == NULL) + RETURN(1); + + obj = ll_i2info(mapping->host)->lli_clob; + if (obj == NULL) + RETURN(1); + + page = cl_vmpage_page(vmpage, obj); + if (page == NULL) + RETURN(1); + + env = cl_env_percpu_get(); + LASSERT(!IS_ERR(env)); + + if (!cl_page_in_use(page)) { + result = 1; + cl_page_delete(env, page); + } + + /* To use percpu env array, the call path can not be rescheduled; + * otherwise percpu array will be messed if ll_releaspage() called + * again on the same CPU. + * + * If this page holds the last refc of cl_object, the following + * call path may cause reschedule: + * cl_page_put -> cl_page_free -> cl_object_put -> + * lu_object_put -> lu_object_free -> lov_delete_raid0. + * + * However, the kernel can't get rid of this inode until all pages have + * been cleaned up. Now that we hold page lock here, it's pretty safe + * that we won't get into object delete path. 
+ */ + LASSERT(cl_object_refc(obj) > 1); + cl_page_put(env, page); + + cl_env_percpu_put(env); + RETURN(result); +} + +#ifdef HAVE_AOPS_RELEASE_FOLIO +static bool ll_release_folio(struct folio *folio, gfp_t wait) +{ + struct page *vmpage = folio_page(folio, 0); + + /* folio_nr_pages(folio) == 1 is fixed with grab_cache_page* */ + BUG_ON(folio_nr_pages(folio) != 1); + + return do_release_page(vmpage, wait); +} +#else /* !HAVE_AOPS_RELEASE_FOLIO */ +#ifdef HAVE_RELEASEPAGE_WITH_INT +#define RELEASEPAGE_ARG_TYPE int +#else +#define RELEASEPAGE_ARG_TYPE gfp_t +#endif +static int ll_releasepage(struct page *vmpage, RELEASEPAGE_ARG_TYPE gfp_mask) +{ + return do_release_page(vmpage, gfp_mask); +} +#endif /* HAVE_AOPS_RELEASE_FOLIO */ + +static ssize_t ll_get_user_pages(int rw, struct iov_iter *iter, + struct page ***pages, ssize_t *npages, + size_t maxsize) +{ +#if defined(HAVE_DIO_ITER) + size_t start; + size_t result; + + result = iov_iter_get_pages_alloc2(iter, pages, maxsize, &start); + if (result > 0) + *npages = DIV_ROUND_UP(result + start, PAGE_SIZE); + + return result; +#else + unsigned long addr; + size_t page_count; + size_t size; + long result; + + if (!maxsize) + return 0; + + if (!iter->nr_segs) + return 0; + + addr = (unsigned long)iter->iov->iov_base + iter->iov_offset; + if (addr & ~PAGE_MASK) + return -EINVAL; + + size = min_t(size_t, maxsize, iter->iov->iov_len); + page_count = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + OBD_ALLOC_PTR_ARRAY_LARGE(*pages, page_count); + if (*pages == NULL) + return -ENOMEM; + + mmap_read_lock(current->mm); + result = get_user_pages(current, current->mm, addr, page_count, + rw == READ, 0, *pages, NULL); + mmap_read_unlock(current->mm); + + if (unlikely(result != page_count)) { + ll_release_user_pages(*pages, page_count); + *pages = NULL; + + if (result >= 0) + return -EFAULT; + + return result; + } + *npages = page_count; + + return size; +#endif +} + +/* iov_iter_alignment() is introduced in 3.16 similar to HAVE_DIO_ITER */ +#if defined(HAVE_DIO_ITER) +static unsigned long iov_iter_alignment_vfs(const struct iov_iter *i) +{ + return iov_iter_alignment(i); +} +#else /* copied from alignment_iovec() */ +static unsigned long iov_iter_alignment_vfs(const struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + unsigned long res; + size_t size = i->count; + size_t n; + + if (!size) + return 0; + + res = (unsigned long)iov->iov_base + i->iov_offset; + n = iov->iov_len - i->iov_offset; + if (n >= size) + return res | size; + + size -= n; + res |= n; + while (size > (++iov)->iov_len) { + res |= (unsigned long)iov->iov_base | iov->iov_len; + size -= iov->iov_len; + } + res |= (unsigned long)iov->iov_base | size; + + return res; +} +#endif + +/* + * Lustre could relax a bit for alignment, io count is not + * necessary page alignment. 
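
The fallback above accumulates misalignment by OR-ing together segment addresses and lengths, so any low bit left set reveals an unaligned buffer. A simplified, runnable restatement of that idea (it ignores iov_offset and the other details of the kernel helper):

    #include <stdio.h>
    #include <sys/uio.h>

    /* OR together every segment's start address and length: any bit set
     * below the page size means at least one buffer is misaligned. */
    static unsigned long iovec_alignment(const struct iovec *iov, int nr, size_t count)
    {
        unsigned long res = 0;
        int i;

        for (i = 0; i < nr && count > 0; i++) {
            size_t n = iov[i].iov_len < count ? iov[i].iov_len : count;

            res |= (unsigned long)iov[i].iov_base | n;
            count -= n;
        }
        return res;
    }

    int main(void)
    {
        static char buf[8192] __attribute__((aligned(4096)));
        struct iovec iov[2] = {
            { .iov_base = buf,        .iov_len = 4096 },
            { .iov_base = buf + 4096, .iov_len = 4096 },
        };

        printf("aligned: %s\n",
               (iovec_alignment(iov, 2, 8192) & 4095) ? "no" : "yes");
        return 0;
    }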
+ */ +static unsigned long ll_iov_iter_alignment(struct iov_iter *i) +{ + size_t orig_size = i->count; + size_t count = orig_size & ~PAGE_MASK; + unsigned long res; + + if (!count) + return iov_iter_alignment_vfs(i); + + if (orig_size > PAGE_SIZE) { + iov_iter_truncate(i, orig_size - count); + res = iov_iter_alignment_vfs(i); + iov_iter_reexpand(i, orig_size); + + return res; + } + + res = iov_iter_alignment_vfs(i); + /* start address is page aligned */ + if ((res & ~PAGE_MASK) == orig_size) + return PAGE_SIZE; + + return res; +} + +static int +ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, size_t size, + int rw, struct inode *inode, struct cl_sub_dio *sdio) +{ + struct ll_dio_pages *pv = &sdio->csd_dio_pages; + struct cl_page *page; + struct cl_2queue *queue = &io->ci_queue; + struct cl_object *obj = io->ci_obj; + struct cl_sync_io *anchor = &sdio->csd_sync; + loff_t offset = pv->ldp_file_offset; + int io_pages = 0; + size_t page_size = cl_page_size(obj); + int i; + ssize_t rc = 0; + + ENTRY; + + cl_2queue_init(queue); + for (i = 0; i < pv->ldp_count; i++) { + LASSERT(!(offset & (PAGE_SIZE - 1))); + page = cl_page_find(env, obj, cl_index(obj, offset), + pv->ldp_pages[i], CPT_TRANSIENT); + if (IS_ERR(page)) { + rc = PTR_ERR(page); + break; + } + LASSERT(page->cp_type == CPT_TRANSIENT); + rc = cl_page_own(env, io, page); + if (rc) { + cl_page_put(env, page); + break; + } + + page->cp_sync_io = anchor; + if (inode && IS_ENCRYPTED(inode)) { + /* In case of Direct IO on encrypted file, we need to + * add a reference to the inode on the cl_page. + * This info is required by llcrypt to proceed + * to encryption/decryption. + * This is safe because we know these pages are private + * to the thread doing the Direct IO. + */ + page->cp_inode = inode; + } + /* We keep the refcount from cl_page_find, so we don't need + * another one here + */ + cl_2queue_add(queue, page, false); + /* + * Set page clip to tell transfer formation engine + * that page has to be sent even if it is beyond KMS. + */ + if (size < page_size) + cl_page_clip(env, page, 0, size); + ++io_pages; + + offset += page_size; + size -= page_size; + } + if (rc == 0 && io_pages > 0) { + int iot = rw == READ ? CRT_READ : CRT_WRITE; + + atomic_add(io_pages, &anchor->csi_sync_nr); + /* + * Avoid out-of-order execution of adding inflight + * modifications count and io submit. + */ + smp_mb(); + rc = cl_io_submit_rw(env, io, iot, queue); + if (rc == 0) { + cl_page_list_splice(&queue->c2_qout, &sdio->csd_pages); + } else { + atomic_add(-queue->c2_qin.pl_nr, + &anchor->csi_sync_nr); + cl_page_list_for_each(page, &queue->c2_qin) + page->cp_sync_io = NULL; + } + /* handle partially submitted reqs */ + if (queue->c2_qin.pl_nr > 0) { + CERROR(DFID " failed to submit %d dio pages: %zd\n", + PFID(lu_object_fid(&obj->co_lu)), + queue->c2_qin.pl_nr, rc); + if (rc == 0) + rc = -EIO; + } + } + + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + RETURN(rc); +} + +#ifdef KMALLOC_MAX_SIZE +#define MAX_MALLOC KMALLOC_MAX_SIZE +#else +#define MAX_MALLOC (128 * 1024) +#endif + +/* This is the maximum size of a single O_DIRECT request, based on the + * kmalloc limit. We need to fit all of the brw_page structs, each one + * representing PAGE_SIZE worth of user data, into a single buffer, and + * then truncate this to be a full-sized RPC. For 4kB PAGE_SIZE this is + * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. 
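
The sizing comment above can be sanity-checked with a few lines of arithmetic. The sketch below assumes a 4kB page, a 4MB RPC size and roughly 24 bytes per brw_page descriptor; the exact sizeof depends on the build, so treat these numbers as assumptions rather than values quoted from the headers:

    #include <stdio.h>

    int main(void)
    {
        unsigned long page_size = 4096;
        unsigned long brw_page_size = 24;       /* assumed descriptor size */
        unsigned long max_malloc = 128 * 1024;  /* 128kB kmalloc limit */
        unsigned long max_brw = 4UL << 20;      /* 4MB RPC */
        unsigned long raw = max_malloc / brw_page_size * page_size;
        unsigned long max_dio = raw & ~(max_brw - 1);

        /* prints roughly "raw 22.4 MB, rounded down to whole RPCs: 21.0 MB" */
        printf("raw %.1f MB, rounded down to whole RPCs: %.1f MB\n",
               raw / 1e6, max_dio / 1e6);
        return 0;
    }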
*/ +#define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_SIZE) & \ + ~((size_t)DT_MAX_BRW_SIZE - 1)) + +static ssize_t +ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw) +{ + struct ll_cl_context *lcc; + const struct lu_env *env; + struct cl_io *io; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct cl_dio_aio *ll_dio_aio; + struct cl_sub_dio *ldp_aio; + size_t count = iov_iter_count(iter); + ssize_t tot_bytes = 0, result = 0; + loff_t file_offset = iocb->ki_pos; + bool sync_submit = false; + struct vvp_io *vio; + ssize_t rc2; + + /* Check EOF by ourselves */ + if (rw == READ && file_offset >= i_size_read(inode)) + return 0; + + /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */ + if (file_offset & ~PAGE_MASK) + RETURN(-EINVAL); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), " + "offset=%lld=%llx, pages %zd (max %lu)\n", + PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE, + file_offset, file_offset, count >> PAGE_SHIFT, + MAX_DIO_SIZE >> PAGE_SHIFT); + + /* Check that all user buffers are aligned as well */ + if (ll_iov_iter_alignment(iter) & ~PAGE_MASK) + RETURN(-EINVAL); + + lcc = ll_cl_find(inode); + if (lcc == NULL) + RETURN(-EIO); + + env = lcc->lcc_env; + LASSERT(!IS_ERR(env)); + vio = vvp_env_io(env); + io = lcc->lcc_io; + LASSERT(io != NULL); + + ll_dio_aio = io->ci_dio_aio; + LASSERT(ll_dio_aio); + LASSERT(ll_dio_aio->cda_iocb == iocb); + + /* We cannot do parallel submission of sub-I/Os - for AIO or regular + * DIO - unless lockless because it causes us to release the lock + * early. + * + * There are also several circumstances in which we must disable + * parallel DIO, so we check if it is enabled. + * + * The check for "is_sync_kiocb" excludes AIO, which does not need to + * be disabled in these situations. 
+ */ + if (io->ci_dio_lock || (is_sync_kiocb(iocb) && !io->ci_parallel_dio)) + sync_submit = true; + + while (iov_iter_count(iter)) { + struct ll_dio_pages *pvec; + struct page **pages; + + count = min_t(size_t, iov_iter_count(iter), MAX_DIO_SIZE); + if (rw == READ) { + if (file_offset >= i_size_read(inode)) + break; + + if (file_offset + count > i_size_read(inode)) + count = i_size_read(inode) - file_offset; + } + + /* if we are doing sync_submit, then we free this below, + * otherwise it is freed on the final call to cl_sync_io_note + * (either in this function or from a ptlrpcd daemon) + */ + ldp_aio = cl_sub_dio_alloc(ll_dio_aio, sync_submit); + if (!ldp_aio) + GOTO(out, result = -ENOMEM); + + pvec = &ldp_aio->csd_dio_pages; + + result = ll_get_user_pages(rw, iter, &pages, + &pvec->ldp_count, count); + if (unlikely(result <= 0)) { + cl_sync_io_note(env, &ldp_aio->csd_sync, result); + if (sync_submit) { + LASSERT(ldp_aio->csd_creator_free); + cl_sub_dio_free(ldp_aio); + } + GOTO(out, result); + } + + count = result; + pvec->ldp_file_offset = file_offset; + pvec->ldp_pages = pages; + + result = ll_direct_rw_pages(env, io, count, + rw, inode, ldp_aio); + /* We've submitted pages and can now remove the extra + * reference for that + */ + cl_sync_io_note(env, &ldp_aio->csd_sync, result); + + if (sync_submit) { + rc2 = cl_sync_io_wait(env, &ldp_aio->csd_sync, + 0); + if (result == 0 && rc2) + result = rc2; + LASSERT(ldp_aio->csd_creator_free); + cl_sub_dio_free(ldp_aio); + } + if (unlikely(result < 0)) + GOTO(out, result); + + iov_iter_advance(iter, count); + tot_bytes += count; + file_offset += count; + } + +out: + ll_dio_aio->cda_bytes += tot_bytes; + + if (rw == WRITE) + vio->u.readwrite.vui_written += tot_bytes; + else + vio->u.readwrite.vui_read += tot_bytes; + + /* AIO is not supported on pipes, so we cannot return EIOCBQEUED like + * we normally would for both DIO and AIO here + */ + if (result == 0 && !iov_iter_is_pipe(iter)) + result = -EIOCBQUEUED; + + return result; +} + +#if defined(HAVE_DIO_ITER) +static ssize_t ll_direct_IO( +#ifndef HAVE_IOV_ITER_RW + int rw, +#endif + struct kiocb *iocb, struct iov_iter *iter +#ifndef HAVE_DIRECTIO_2ARGS + , loff_t file_offset +#endif + ) +{ + int nrw; + +#ifndef HAVE_IOV_ITER_RW + nrw = rw; +#else + nrw = iov_iter_rw(iter); +#endif + + return ll_direct_IO_impl(iocb, iter, nrw); +} + +#else /* !defined(HAVE_DIO_ITER) */ + +static ssize_t +ll_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t file_offset, unsigned long nr_segs) +{ + struct iov_iter iter; + + iov_iter_init(&iter, iov, nr_segs, iov_length(iov, nr_segs), 0); + return ll_direct_IO_impl(iocb, &iter, rw); +} + +#endif /* !defined(HAVE_DIO_ITER) */ + +/** + * Prepare partially written-to page for a write. + * @pg is owned when passed in and disowned when it returns non-zero result to + * the caller. + */ +static int ll_prepare_partial_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, struct file *file) +{ + struct cl_attr *attr = vvp_env_thread_attr(env); + struct cl_object *obj = io->ci_obj; + struct vvp_page *vpg = cl_object_page_slice(obj, pg); + loff_t offset = cl_offset(obj, vvp_index(vpg)); + int result; + ENTRY; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result) { + cl_page_disown(env, io, pg); + GOTO(out, result); + } + + /* + * If are writing to a new page, no need to read old data. 
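
Stripped of the Lustre plumbing, the submission loop above carves the request into MAX_DIO_SIZE chunks and clamps reads at EOF before pinning and submitting each piece. A skeleton of that control flow with the submit step stubbed out (names and the chunk size are illustrative):

    #include <stdio.h>

    #define MAX_DIO (16UL << 20)    /* stand-in for MAX_DIO_SIZE */

    static long submit_chunk(long long off, unsigned long len, int rw)
    {
        return (long)len;           /* pretend every byte was transferred */
    }

    static long long dio_loop(long long off, unsigned long count,
                              long long i_size, int rw)
    {
        long long total = 0;

        while (count > 0) {
            unsigned long chunk = count < MAX_DIO ? count : MAX_DIO;
            long n;

            if (rw == 0) {                          /* READ */
                if (off >= i_size)
                    break;                          /* nothing past EOF */
                if (off + (long long)chunk > i_size)
                    chunk = i_size - off;
            }
            n = submit_chunk(off, chunk, rw);
            if (n < 0)
                return total ? total : n;
            total += n;
            off += n;
            count -= n;
        }
        return total;
    }

    int main(void)
    {
        /* a 40MB read from a file that is only 8MB long stops at EOF */
        printf("%lld\n", dio_loop(0, 40UL << 20, 8LL << 20, 0)); /* 8388608 */
        return 0;
    }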
+ * The extent locking will have updated the KMS, and for our + * purposes here we can treat it like i_size. + */ + if (attr->cat_kms <= offset) { + char *kaddr = kmap_atomic(vpg->vpg_page); + + memset(kaddr, 0, cl_page_size(obj)); + kunmap_atomic(kaddr); + GOTO(out, result = 0); + } + + if (vpg->vpg_defer_uptodate) { + vpg->vpg_ra_used = 1; + GOTO(out, result = 0); + } + + result = ll_io_read_page(env, io, pg, file); + if (result) + GOTO(out, result); + + /* ll_io_read_page() disowns the page */ + result = cl_page_own(env, io, pg); + if (!result) { + if (!PageUptodate(cl_page_vmpage(pg))) { + cl_page_disown(env, io, pg); + result = -EIO; + } + } else if (result == -ENOENT) { + /* page was truncated */ + result = -EAGAIN; + } + EXIT; + +out: + return result; +} + +static int ll_tiny_write_begin(struct page *vmpage, struct address_space *mapping) +{ + /* Page must be present, up to date, dirty, and not in writeback. */ + if (!vmpage || !PageUptodate(vmpage) || !PageDirty(vmpage) || + PageWriteback(vmpage) || vmpage->mapping != mapping) + return -ENODATA; + + return 0; +} + +static int ll_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, +#ifdef HAVE_GRAB_CACHE_PAGE_WRITE_BEGIN_WITH_FLAGS + unsigned int flags, +#endif + struct page **pagep, void **fsdata) +{ + struct ll_cl_context *lcc = NULL; + const struct lu_env *env = NULL; + struct cl_io *io = NULL; + struct cl_page *page = NULL; + struct inode *inode = file_inode(file); + struct cl_object *clob = ll_i2info(mapping->host)->lli_clob; + pgoff_t index = pos >> PAGE_SHIFT; + struct page *vmpage = NULL; + unsigned from = pos & (PAGE_SIZE - 1); + unsigned to = from + len; + int result = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "Writing %lu of %d to %d bytes\n", index, from, len); + + lcc = ll_cl_find(inode); + if (lcc == NULL) { + vmpage = grab_cache_page_nowait(mapping, index); + result = ll_tiny_write_begin(vmpage, mapping); + GOTO(out, result); + } + + env = lcc->lcc_env; + io = lcc->lcc_io; + + if (file->f_flags & O_DIRECT) { + /* direct IO failed because it couldn't clean up cached pages, + * this causes a problem for mirror write because the cached + * page may belong to another mirror, which will result in + * problem submitting the I/O. */ + if (io->ci_designated_mirror > 0) + GOTO(out, result = -EBUSY); + + /** + * Direct write can fall back to buffered read, but DIO is done + * with lockless i/o, and buffered requires LDLM locking, so + * in this case we must restart without lockless. + */ + if (!io->ci_dio_lock) { + io->ci_dio_lock = 1; + io->ci_need_restart = 1; + GOTO(out, result = -ENOLCK); + } + } +again: + /* To avoid deadlock, try to lock page first. */ + vmpage = grab_cache_page_nowait(mapping, index); + + if (unlikely(vmpage == NULL || + PageDirty(vmpage) || PageWriteback(vmpage))) { + struct vvp_io *vio = vvp_env_io(env); + struct cl_page_list *plist = &vio->u.readwrite.vui_queue; + + /* if the page is already in dirty cache, we have to commit + * the pages right now; otherwise, it may cause deadlock + * because it holds page lock of a dirty page and request for + * more grants. It's okay for the dirty page to be the first + * one in commit page list, though. 
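
The decision in ll_prepare_partial_page() above reduces to: read the old contents only when the write is partial and the page starts below the size the client already knows about (the KMS); a page entirely beyond that size is simply zero-filled. A small model of that test, with illustrative names:

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE_SZ 4096UL

    static bool need_read_before_write(unsigned long long page_off,
                                       unsigned from, unsigned to,
                                       unsigned long long known_size)
    {
        if (from == 0 && to == PAGE_SZ)
            return false;            /* full overwrite, old data irrelevant */
        if (known_size <= page_off)
            return false;            /* brand new page: zero-fill instead */
        return true;
    }

    int main(void)
    {
        printf("%d\n", need_read_before_write(0, 100, 200, 1 << 20));     /* 1 */
        printf("%d\n", need_read_before_write(1 << 20, 100, 200, 4096));  /* 0: beyond KMS */
        printf("%d\n", need_read_before_write(0, 0, PAGE_SZ, 1 << 20));   /* 0: full page */
        return 0;
    }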
*/ + if (vmpage != NULL && plist->pl_nr > 0) { + unlock_page(vmpage); + put_page(vmpage); + vmpage = NULL; + } + + /* commit pages and then wait for page lock */ + result = vvp_io_write_commit(env, io); + if (result < 0) + GOTO(out, result); + + if (vmpage == NULL) { + vmpage = grab_cache_page_write_begin(mapping, index +#ifdef HAVE_GRAB_CACHE_PAGE_WRITE_BEGIN_WITH_FLAGS + , flags +#endif + ); + if (vmpage == NULL) + GOTO(out, result = -ENOMEM); + } + } + + /* page was truncated */ + if (mapping != vmpage->mapping) { + CDEBUG(D_VFSTRACE, "page: %lu was truncated\n", index); + unlock_page(vmpage); + put_page(vmpage); + vmpage = NULL; + goto again; + } + + page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); + if (IS_ERR(page)) + GOTO(out, result = PTR_ERR(page)); + + lcc->lcc_page = page; + lu_ref_add(&page->cp_reference, "cl_io", io); + + cl_page_assume(env, io, page); + if (!PageUptodate(vmpage)) { + /* + * We're completely overwriting an existing page, + * so _don't_ set it up to date until commit_write + */ + if (from == 0 && to == PAGE_SIZE) { + CL_PAGE_HEADER(D_PAGE, env, page, "full page write\n"); + POISON_PAGE(vmpage, 0x11); + } else { + /* TODO: can be optimized at OSC layer to check if it + * is a lockless IO. In that case, it's not necessary + * to read the data. */ + result = ll_prepare_partial_page(env, io, page, file); + if (result) { + /* vmpage should have been unlocked */ + put_page(vmpage); + vmpage = NULL; + + if (result == -EAGAIN) + goto again; + GOTO(out, result); + } + } + } + EXIT; +out: + if (result < 0) { + if (vmpage != NULL) { + unlock_page(vmpage); + put_page(vmpage); + } + /* On tiny_write failure, page and io are always null. */ + if (!IS_ERR_OR_NULL(page)) { + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + if (io) + io->ci_result = result; + } else { + *pagep = vmpage; + *fsdata = lcc; + } + RETURN(result); +} + +static int ll_tiny_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct page *vmpage) +{ + struct cl_page *clpage = (struct cl_page *) vmpage->private; + loff_t kms = pos+copied; + loff_t to = kms & (PAGE_SIZE-1) ? kms & (PAGE_SIZE-1) : PAGE_SIZE; + __u16 refcheck; + struct lu_env *env = cl_env_get(&refcheck); + int rc = 0; + + ENTRY; + + if (IS_ERR(env)) { + rc = PTR_ERR(env); + goto out; + } + + /* This page is dirty in cache, so it should have a cl_page pointer + * set in vmpage->private. + */ + LASSERT(clpage != NULL); + + if (copied == 0) + goto out_env; + + /* Update the underlying size information in the OSC/LOV objects this + * page is part of. + */ + cl_page_touch(env, clpage, to); + +out_env: + cl_env_put(env, &refcheck); + +out: + /* Must return page unlocked. 
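
ll_write_end() just below queues each copied page and only pushes the queue to the servers once it amounts to a full RPC, the page list stops being contiguous, or the write is synchronous. A toy model of that flush policy (256 pages per RPC is only an example figure):

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGES_PER_RPC 256

    struct write_queue { unsigned int nr_pages; };

    /* returns true when the caller should commit the queued pages now */
    static bool queue_page(struct write_queue *q, bool sync, bool contiguous)
    {
        if (!contiguous)
            return true;             /* broken sequence: commit what we have */
        q->nr_pages++;
        return sync || q->nr_pages >= PAGES_PER_RPC;
    }

    int main(void)
    {
        struct write_queue q = { 0 };
        unsigned int i, flushes = 0;

        for (i = 0; i < 1024; i++) {
            if (queue_page(&q, false, true)) {
                flushes++;
                q.nr_pages = 0;
            }
        }
        printf("%u flushes for 1024 queued pages\n", flushes); /* 4 */
        return 0;
    }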
*/ + unlock_page(vmpage); + + RETURN(rc); +} + +static int ll_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *vmpage, void *fsdata) +{ + struct ll_cl_context *lcc = fsdata; + const struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio; + struct cl_page *page; + unsigned from = pos & (PAGE_SIZE - 1); + bool unplug = false; + int result = 0; + ENTRY; + + put_page(vmpage); + + CDEBUG(D_VFSTRACE, "pos %llu, len %u, copied %u\n", pos, len, copied); + + if (lcc == NULL) { + result = ll_tiny_write_end(file, mapping, pos, len, copied, + vmpage); + GOTO(out, result); + } + + LASSERT(lcc != NULL); + env = lcc->lcc_env; + page = lcc->lcc_page; + io = lcc->lcc_io; + vio = vvp_env_io(env); + + LASSERT(cl_page_is_owned(page, io)); + if (copied > 0) { + struct cl_page_list *plist = &vio->u.readwrite.vui_queue; + + lcc->lcc_page = NULL; /* page will be queued */ + + /* Add it into write queue */ + cl_page_list_add(plist, page, true); + if (plist->pl_nr == 1) /* first page */ + vio->u.readwrite.vui_from = from; + else + LASSERT(from == 0); + vio->u.readwrite.vui_to = from + copied; + + /* To address the deadlock in balance_dirty_pages() where + * this dirty page may be written back in the same thread. */ + if (PageDirty(vmpage)) + unplug = true; + + /* We may have one full RPC, commit it soon */ + if (plist->pl_nr >= PTLRPC_MAX_BRW_PAGES) + unplug = true; + + CL_PAGE_DEBUG(D_VFSTRACE, env, page, + "queued page: %d.\n", plist->pl_nr); + } else { + cl_page_disown(env, io, page); + + lcc->lcc_page = NULL; + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + + /* page list is not contiguous now, commit it now */ + unplug = true; + } + if (unplug || io->u.ci_wr.wr_sync) + result = vvp_io_write_commit(env, io); + + if (result < 0) + io->ci_result = result; + + +out: + RETURN(result >= 0 ? copied : result); +} + +#ifdef CONFIG_MIGRATION +static int ll_migrate_folio(struct address_space *mapping, + struct folio_migr *newpage, struct folio_migr *page, + enum migrate_mode mode) +{ + /* Always fail page migration until we have a proper implementation */ + return -EIO; +} +#endif + +const struct address_space_operations ll_aops = { +#ifdef HAVE_DIRTY_FOLIO + .dirty_folio = filemap_dirty_folio, +#else + .set_page_dirty = __set_page_dirty_nobuffers, +#endif +#ifdef HAVE_INVALIDATE_FOLIO + .invalidate_folio = ll_invalidate_folio, +#else + .invalidatepage = ll_invalidatepage, +#endif +#ifdef HAVE_AOPS_READ_FOLIO + .read_folio = ll_read_folio, +#else + .readpage = ll_readpage, +#endif +#ifdef HAVE_AOPS_RELEASE_FOLIO + .release_folio = ll_release_folio, +#else + .releasepage = (void *)ll_releasepage, +#endif + .direct_IO = ll_direct_IO, + .writepage = ll_writepage, + .writepages = ll_writepages, + .write_begin = ll_write_begin, + .write_end = ll_write_end, +#ifdef CONFIG_MIGRATION + .migrate_folio = ll_migrate_folio, +#endif +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/statahead.c b/drivers/staging/lustrefsx/lustre/llite/statahead.c new file mode 100644 index 0000000000000..faf2860c8b481 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/statahead.c @@ -0,0 +1,1790 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include "llite_internal.h" + +#define SA_OMITTED_ENTRY_MAX 8ULL + +typedef enum { + /** negative values are for error cases */ + SA_ENTRY_INIT = 0, /** init entry */ + SA_ENTRY_SUCC = 1, /** stat succeed */ + SA_ENTRY_INVA = 2, /** invalid entry */ +} se_state_t; + +/* + * sa_entry is not refcounted: statahead thread allocates it and do async stat, + * and in async stat callback ll_statahead_interpret() will add it into + * sai_interim_entries, later statahead thread will call sa_handle_callback() to + * instantiate entry and move it into sai_entries, and then only scanner process + * can access and free it. + */ +struct sa_entry { + /* link into sai_interim_entries or sai_entries */ + struct list_head se_list; + /* link into sai hash table locally */ + struct list_head se_hash; + /* entry index in the sai */ + __u64 se_index; + /* low layer ldlm lock handle */ + __u64 se_handle; + /* entry status */ + se_state_t se_state; + /* entry size, contains name */ + int se_size; + /* pointer to async getattr enqueue info */ + struct md_enqueue_info *se_minfo; + /* pointer to the async getattr request */ + struct ptlrpc_request *se_req; + /* pointer to the target inode */ + struct inode *se_inode; + /* entry name */ + struct qstr se_qstr; + /* entry fid */ + struct lu_fid se_fid; +}; + +static unsigned int sai_generation; +static DEFINE_SPINLOCK(sai_generation_lock); + +static inline int sa_unhashed(struct sa_entry *entry) +{ + return list_empty(&entry->se_hash); +} + +/* sa_entry is ready to use */ +static inline int sa_ready(struct sa_entry *entry) +{ + /* Make sure sa_entry is updated and ready to use */ + smp_rmb(); + return (entry->se_state != SA_ENTRY_INIT); +} + +/* hash value to put in sai_cache */ +static inline int sa_hash(int val) +{ + return val & LL_SA_CACHE_MASK; +} + +/* hash entry into sai_cache */ +static inline void +sa_rehash(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + int i = sa_hash(entry->se_qstr.hash); + + spin_lock(&sai->sai_cache_lock[i]); + list_add_tail(&entry->se_hash, &sai->sai_cache[i]); + spin_unlock(&sai->sai_cache_lock[i]); +} + +/* unhash entry from sai_cache */ +static inline void +sa_unhash(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + int i = sa_hash(entry->se_qstr.hash); + + spin_lock(&sai->sai_cache_lock[i]); + list_del_init(&entry->se_hash); + spin_unlock(&sai->sai_cache_lock[i]); +} + +static inline int agl_should_run(struct ll_statahead_info *sai, + struct inode *inode) +{ + return inode && S_ISREG(inode->i_mode) && sai->sai_agl_task; +} + +static inline struct ll_inode_info * +agl_first_entry(struct ll_statahead_info *sai) +{ + return 
list_first_entry(&sai->sai_agls, struct ll_inode_info, + lli_agl_list); +} + +/* statahead window is full */ +static inline int sa_sent_full(struct ll_statahead_info *sai) +{ + return atomic_read(&sai->sai_cache_count) >= sai->sai_max; +} + +/* got async stat replies */ +static inline int sa_has_callback(struct ll_statahead_info *sai) +{ + return !list_empty(&sai->sai_interim_entries); +} + +static inline int agl_list_empty(struct ll_statahead_info *sai) +{ + return list_empty(&sai->sai_agls); +} + +/** + * (1) hit ratio less than 80% + * or + * (2) consecutive miss more than 8 + * then means low hit. + */ +static inline int sa_low_hit(struct ll_statahead_info *sai) +{ + return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) || + (sai->sai_consecutive_miss > 8)); +} + +/* + * if the given index is behind of statahead window more than + * SA_OMITTED_ENTRY_MAX, then it is old. + */ +static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) +{ + return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX < + sai->sai_index); +} + +/* allocate sa_entry and hash it to allow scanner process to find it */ +static struct sa_entry * +sa_alloc(struct dentry *parent, struct ll_statahead_info *sai, __u64 index, + const char *name, int len, const struct lu_fid *fid) +{ + struct ll_inode_info *lli; + struct sa_entry *entry; + int entry_size; + char *dname; + + ENTRY; + + entry_size = sizeof(struct sa_entry) + (len & ~3) + 4; + OBD_ALLOC(entry, entry_size); + if (unlikely(!entry)) + RETURN(ERR_PTR(-ENOMEM)); + + CDEBUG(D_READA, "alloc sa entry %.*s(%p) index %llu\n", + len, name, entry, index); + + entry->se_index = index; + + entry->se_state = SA_ENTRY_INIT; + entry->se_size = entry_size; + dname = (char *)entry + sizeof(struct sa_entry); + memcpy(dname, name, len); + dname[len] = 0; + entry->se_qstr.hash = ll_full_name_hash(parent, name, len); + entry->se_qstr.len = len; + entry->se_qstr.name = dname; + entry->se_fid = *fid; + + lli = ll_i2info(sai->sai_dentry->d_inode); + + spin_lock(&lli->lli_sa_lock); + INIT_LIST_HEAD(&entry->se_list); + sa_rehash(sai, entry); + spin_unlock(&lli->lli_sa_lock); + + atomic_inc(&sai->sai_cache_count); + + RETURN(entry); +} + +/* free sa_entry, which should have been unhashed and not in any list */ +static void sa_free(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + CDEBUG(D_READA, "free sa entry %.*s(%p) index %llu\n", + entry->se_qstr.len, entry->se_qstr.name, entry, + entry->se_index); + + LASSERT(list_empty(&entry->se_list)); + LASSERT(sa_unhashed(entry)); + + OBD_FREE(entry, entry->se_size); + atomic_dec(&sai->sai_cache_count); +} + +/* + * find sa_entry by name, used by directory scanner, lock is not needed because + * only scanner can remove the entry from cache. 
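
The sai_cache used by sa_rehash()/sa_unhash() above is a small power-of-two bucket array indexed by masking the name hash, with one lock per bucket. A userspace sketch of the same layout (sizes and names are illustrative; note that the real lookup in sa_get() can skip the lock because only the scanner removes entries):

    #include <pthread.h>
    #include <string.h>

    #define CACHE_SIZE 32                  /* stands in for LL_SA_CACHE_SIZE */
    #define CACHE_MASK (CACHE_SIZE - 1)

    struct sa_node {
        unsigned int hash;
        char name[64];
        struct sa_node *next;
    };

    struct sa_cache {
        pthread_mutex_t lock[CACHE_SIZE];  /* one lock per bucket */
        struct sa_node *bucket[CACHE_SIZE];
    };

    static void cache_init(struct sa_cache *c)
    {
        int i;

        memset(c, 0, sizeof(*c));
        for (i = 0; i < CACHE_SIZE; i++)
            pthread_mutex_init(&c->lock[i], NULL);
    }

    static void cache_add(struct sa_cache *c, struct sa_node *n)
    {
        unsigned int i = n->hash & CACHE_MASK;

        pthread_mutex_lock(&c->lock[i]);
        n->next = c->bucket[i];
        c->bucket[i] = n;
        pthread_mutex_unlock(&c->lock[i]);
    }

    static struct sa_node *cache_find(struct sa_cache *c, unsigned int hash,
                                      const char *name)
    {
        unsigned int i = hash & CACHE_MASK;
        struct sa_node *n;

        pthread_mutex_lock(&c->lock[i]);
        for (n = c->bucket[i]; n != NULL; n = n->next)
            if (n->hash == hash && strcmp(n->name, name) == 0)
                break;
        pthread_mutex_unlock(&c->lock[i]);
        return n;
    }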
+ */ +static struct sa_entry * +sa_get(struct ll_statahead_info *sai, const struct qstr *qstr) +{ + struct sa_entry *entry; + int i = sa_hash(qstr->hash); + + list_for_each_entry(entry, &sai->sai_cache[i], se_hash) { + if (entry->se_qstr.hash == qstr->hash && + entry->se_qstr.len == qstr->len && + memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0) + return entry; + } + return NULL; +} + +/* unhash and unlink sa_entry, and then free it */ +static inline void +sa_kill(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); + + LASSERT(!sa_unhashed(entry)); + LASSERT(!list_empty(&entry->se_list)); + LASSERT(sa_ready(entry)); + + sa_unhash(sai, entry); + + spin_lock(&lli->lli_sa_lock); + list_del_init(&entry->se_list); + spin_unlock(&lli->lli_sa_lock); + + iput(entry->se_inode); + + sa_free(sai, entry); +} + +/* called by scanner after use, sa_entry will be killed */ +static void +sa_put(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + struct sa_entry *tmp, *next; + + if (entry && entry->se_state == SA_ENTRY_SUCC) { + struct ll_sb_info *sbi = ll_i2sbi(sai->sai_dentry->d_inode); + + sai->sai_hit++; + sai->sai_consecutive_miss = 0; + sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max); + } else { + sai->sai_miss++; + sai->sai_consecutive_miss++; + } + + if (entry) + sa_kill(sai, entry); + + /* + * kill old completed entries, only scanner process does this, no need + * to lock + */ + list_for_each_entry_safe(tmp, next, &sai->sai_entries, se_list) { + if (!is_omitted_entry(sai, tmp->se_index)) + break; + sa_kill(sai, tmp); + } +} + +/* + * update state and sort add entry to sai_entries by index, return true if + * scanner is waiting on this entry. + */ +static bool +__sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) +{ + struct sa_entry *se; + struct list_head *pos = &sai->sai_entries; + __u64 index = entry->se_index; + + LASSERT(!sa_ready(entry)); + LASSERT(list_empty(&entry->se_list)); + + list_for_each_entry_reverse(se, &sai->sai_entries, se_list) { + if (se->se_index < entry->se_index) { + pos = &se->se_list; + break; + } + } + list_add(&entry->se_list, pos); + /* + * LU-9210: ll_statahead_interpet must be able to see this before + * we wake it up + */ + smp_store_release(&entry->se_state, + ret < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC); + + return (index == sai->sai_index_wait); +} + +/* finish async stat RPC arguments */ +static void sa_fini_data(struct md_enqueue_info *minfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + + if (op_data->op_flags & MF_OPNAME_KMALLOCED) + /* allocated via ll_setup_filename called from sa_prep_data */ + kfree(op_data->op_name); + ll_unlock_md_op_lsm(&minfo->mi_data); + iput(minfo->mi_dir); + OBD_FREE_PTR(minfo); +} + +static int ll_statahead_interpret(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, int rc); + +/* + * prepare arguments for async stat RPC. 
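
__sa_make_ready() above inserts each completed entry into sai_entries sorted by its request index, so the scanner can consume replies in submission order even though the RPCs complete out of order. A minimal model of that ordered insert:

    #include <stdio.h>

    struct entry {
        unsigned long long index;
        struct entry *next;
    };

    /* keep the list sorted by ascending index */
    static void insert_sorted(struct entry **head, struct entry *e)
    {
        struct entry **p = head;

        while (*p && (*p)->index < e->index)
            p = &(*p)->next;
        e->next = *p;
        *p = e;
    }

    int main(void)
    {
        struct entry a = { 3 }, b = { 1 }, c = { 2 };
        struct entry *head = NULL, *p;

        insert_sorted(&head, &a);
        insert_sorted(&head, &b);
        insert_sorted(&head, &c);
        for (p = head; p; p = p->next)
            printf("%llu ", p->index);   /* 1 2 3 */
        printf("\n");
        return 0;
    }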
+ */ +static struct md_enqueue_info * +sa_prep_data(struct inode *dir, struct inode *child, struct sa_entry *entry) +{ + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct md_op_data *op_data; + + OBD_ALLOC_PTR(minfo); + if (!minfo) + return ERR_PTR(-ENOMEM); + + op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, + entry->se_qstr.name, entry->se_qstr.len, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(minfo); + return (struct md_enqueue_info *)op_data; + } + + if (!child) + op_data->op_fid2 = entry->se_fid; + + minfo->mi_it.it_op = IT_GETATTR; + minfo->mi_dir = igrab(dir); + minfo->mi_cb = ll_statahead_interpret; + minfo->mi_cbdata = entry; + + einfo = &minfo->mi_einfo; + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); + einfo->ei_cb_bl = ll_md_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = NULL; + einfo->ei_cbdata = NULL; + einfo->ei_req_slot = 1; + + return minfo; +} + +/* + * release resources used in async stat RPC, update entry state and wakeup if + * scanner process it waiting on this entry. + */ +static void +sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); + struct md_enqueue_info *minfo = entry->se_minfo; + struct ptlrpc_request *req = entry->se_req; + bool wakeup; + + /* release resources used in RPC */ + if (minfo) { + entry->se_minfo = NULL; + ll_intent_release(&minfo->mi_it); + sa_fini_data(minfo); + } + + if (req) { + entry->se_req = NULL; + ptlrpc_req_finished(req); + } + + spin_lock(&lli->lli_sa_lock); + wakeup = __sa_make_ready(sai, entry, ret); + spin_unlock(&lli->lli_sa_lock); + + if (wakeup) + wake_up(&sai->sai_waitq); +} + +/* insert inode into the list of sai_agls */ +static void ll_agl_add(struct ll_statahead_info *sai, + struct inode *inode, int index) +{ + struct ll_inode_info *child = ll_i2info(inode); + struct ll_inode_info *parent = ll_i2info(sai->sai_dentry->d_inode); + + spin_lock(&child->lli_agl_lock); + if (child->lli_agl_index == 0) { + child->lli_agl_index = index; + spin_unlock(&child->lli_agl_lock); + + LASSERT(list_empty(&child->lli_agl_list)); + + spin_lock(&parent->lli_agl_lock); + /* Re-check under the lock */ + if (agl_should_run(sai, inode)) { + if (agl_list_empty(sai)) + wake_up_process(sai->sai_agl_task); + igrab(inode); + list_add_tail(&child->lli_agl_list, &sai->sai_agls); + } else + child->lli_agl_index = 0; + spin_unlock(&parent->lli_agl_lock); + } else { + spin_unlock(&child->lli_agl_lock); + } +} + +/* allocate sai */ +static struct ll_statahead_info *ll_sai_alloc(struct dentry *dentry) +{ + struct ll_statahead_info *sai; + struct ll_inode_info *lli = ll_i2info(dentry->d_inode); + int i; + + ENTRY; + + OBD_ALLOC_PTR(sai); + if (!sai) + RETURN(NULL); + + sai->sai_dentry = dget(dentry); + atomic_set(&sai->sai_refcount, 1); + sai->sai_max = LL_SA_RPC_MIN; + sai->sai_index = 1; + init_waitqueue_head(&sai->sai_waitq); + + INIT_LIST_HEAD(&sai->sai_interim_entries); + INIT_LIST_HEAD(&sai->sai_entries); + INIT_LIST_HEAD(&sai->sai_agls); + + for (i = 0; i < LL_SA_CACHE_SIZE; i++) { + INIT_LIST_HEAD(&sai->sai_cache[i]); + spin_lock_init(&sai->sai_cache_lock[i]); + } + atomic_set(&sai->sai_cache_count, 0); + + spin_lock(&sai_generation_lock); + lli->lli_sa_generation = ++sai_generation; + if (unlikely(sai_generation == 0)) + lli->lli_sa_generation = ++sai_generation; + spin_unlock(&sai_generation_lock); + + RETURN(sai); +} + +/* free sai */ 
+static inline void ll_sai_free(struct ll_statahead_info *sai) +{ + LASSERT(sai->sai_dentry != NULL); + dput(sai->sai_dentry); + OBD_FREE_PTR(sai); +} + +/* + * take refcount of sai if sai for @dir exists, which means statahead is on for + * this directory. + */ +static inline struct ll_statahead_info *ll_sai_get(struct inode *dir) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = NULL; + + spin_lock(&lli->lli_sa_lock); + sai = lli->lli_sai; + if (sai) + atomic_inc(&sai->sai_refcount); + spin_unlock(&lli->lli_sa_lock); + + return sai; +} + +/* + * put sai refcount after use, if refcount reaches zero, free sai and sa_entries + * attached to it. + */ +static void ll_sai_put(struct ll_statahead_info *sai) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); + + if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) { + struct sa_entry *entry, *next; + struct ll_sb_info *sbi = ll_i2sbi(sai->sai_dentry->d_inode); + + lli->lli_sai = NULL; + spin_unlock(&lli->lli_sa_lock); + + LASSERT(!sai->sai_task); + LASSERT(!sai->sai_agl_task); + LASSERT(sai->sai_sent == sai->sai_replied); + LASSERT(!sa_has_callback(sai)); + + list_for_each_entry_safe(entry, next, &sai->sai_entries, + se_list) + sa_kill(sai, entry); + + LASSERT(atomic_read(&sai->sai_cache_count) == 0); + LASSERT(agl_list_empty(sai)); + + ll_sai_free(sai); + atomic_dec(&sbi->ll_sa_running); + } +} + +/* Do NOT forget to drop the inode refcount taken when it was added into sai_agls. */ +static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) +{ + struct ll_inode_info *lli = ll_i2info(inode); + u64 index = lli->lli_agl_index; + ktime_t expire; + int rc; + + ENTRY; + + LASSERT(list_empty(&lli->lli_agl_list)); + + /* AGL may fall behind statahead by one entry */ + if (is_omitted_entry(sai, index + 1)) { + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + /* + * In case of restore, the MDT has the right size and has already + * sent it back without granting the layout lock, inode is up-to-date. + * Then AGL (async glimpse lock) is useless. + * Also to glimpse we need the layout, in case of a running restore + * the MDT holds the layout lock so the glimpse will block up to the + * end of restore (statahead/agl will block) + */ + if (test_bit(LLIF_FILE_RESTORING, &lli->lli_flags)) { + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + /* Someone is in glimpse (sync or async), do nothing. */ + rc = down_write_trylock(&lli->lli_glimpse_sem); + if (rc == 0) { + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + /* + * Someone triggered glimpse within 1 sec before. + * 1) The former glimpse succeeded with glimpse lock granted by OST, and + * if the lock is still cached on client, AGL needs to do nothing. If + * it is cancelled by another client, AGL may not be able to obtain a + * new lock since no glimpse callback is triggered by AGL. + * 2) The former glimpse succeeded, but OST did not grant glimpse lock. + * In such a case, it is quite possible that the OST will not grant + * glimpse lock for AGL also. + * 3) The former glimpse failed; compared with the other two cases it is + * relatively rare. AGL can ignore such a case, and it will not much + * affect the performance.
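/*
 * Editor's illustrative sketch (not part of the patch): the one-second AGL
 * throttle that the ktime check below implements.  The helper name is
 * hypothetical; the policy is "only issue a new async glimpse if the previous
 * one is more than a second old, or never happened at all".
 */
static inline bool example_agl_glimpse_due(ktime_t last_glimpse)
{
	ktime_t one_second_ago = ktime_sub_ns(ktime_get(), NSEC_PER_SEC);

	/* never glimpsed before, or the last glimpse is at least 1s old */
	return !ktime_to_ns(last_glimpse) ||
	       !ktime_after(last_glimpse, one_second_ago);
}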
+ */ + expire = ktime_sub_ns(ktime_get(), NSEC_PER_SEC); + if (ktime_to_ns(lli->lli_glimpse_time) && + ktime_before(expire, lli->lli_glimpse_time)) { + up_write(&lli->lli_glimpse_sem); + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + CDEBUG(D_READA, + "Handling (init) async glimpse: inode = " DFID", idx = %llu\n", + PFID(&lli->lli_fid), index); + + cl_agl(inode); + lli->lli_agl_index = 0; + lli->lli_glimpse_time = ktime_get(); + up_write(&lli->lli_glimpse_sem); + + CDEBUG(D_READA, + "Handled (init) async glimpse: inode= " DFID", idx = %llu, rc = %d\n", + PFID(&lli->lli_fid), index, rc); + + iput(inode); + + EXIT; +} + +/* + * prepare inode for sa entry, add it into agl list, now sa_entry is ready + * to be used by scanner process. + */ +static void sa_instantiate(struct ll_statahead_info *sai, + struct sa_entry *entry) +{ + struct inode *dir = sai->sai_dentry->d_inode; + struct inode *child; + struct md_enqueue_info *minfo; + struct lookup_intent *it; + struct ptlrpc_request *req; + struct mdt_body *body; + int rc = 0; + + ENTRY; + + LASSERT(entry->se_handle != 0); + + minfo = entry->se_minfo; + it = &minfo->mi_it; + req = entry->se_req; + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (!body) + GOTO(out, rc = -EFAULT); + + child = entry->se_inode; + /* revalidate; unlinked and re-created with the same name */ + if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->mbo_fid1))) { + if (child) { + entry->se_inode = NULL; + iput(child); + } + /* The mdt_body is invalid. Skip this entry */ + GOTO(out, rc = -EAGAIN); + } + + it->it_lock_handle = entry->se_handle; + rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL); + if (rc != 1) + GOTO(out, rc = -EAGAIN); + + rc = ll_prep_inode(&child, &req->rq_pill, dir->i_sb, it); + if (rc) + GOTO(out, rc); + + /* If encryption context was returned by MDT, put it in + * inode now to save an extra getxattr. + */ + if (body->mbo_valid & OBD_MD_ENCCTX) { + void *encctx = req_capsule_server_get(&req->rq_pill, + &RMF_FILE_ENCCTX); + __u32 encctxlen = req_capsule_get_size(&req->rq_pill, + &RMF_FILE_ENCCTX, + RCL_SERVER); + + if (encctxlen) { + CDEBUG(D_SEC, + "server returned encryption ctx for "DFID"\n", + PFID(ll_inode2fid(child))); + rc = ll_xattr_cache_insert(child, + xattr_for_enc(child), + encctx, encctxlen); + if (rc) + CWARN("%s: cannot set enc ctx for "DFID": rc = %d\n", + ll_i2sbi(child)->ll_fsname, + PFID(ll_inode2fid(child)), rc); + } + } + + CDEBUG(D_READA, "%s: setting %.*s"DFID" l_data to inode %p\n", + ll_i2sbi(dir)->ll_fsname, entry->se_qstr.len, + entry->se_qstr.name, PFID(ll_inode2fid(child)), child); + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); + + entry->se_inode = child; + + if (agl_should_run(sai, child)) + ll_agl_add(sai, child, entry->se_index); + + EXIT; + +out: + /* + * sa_make_ready() will drop ldlm ibits lock refcount by calling + * ll_intent_drop_lock() in spite of failures. Do not worry about + * calling ll_intent_drop_lock() more than once. 
+ */ + sa_make_ready(sai, entry, rc); +} + +/* once there are async stat replies, instantiate sa_entry from replies */ +static void sa_handle_callback(struct ll_statahead_info *sai) +{ + struct ll_inode_info *lli; + + lli = ll_i2info(sai->sai_dentry->d_inode); + + spin_lock(&lli->lli_sa_lock); + while (sa_has_callback(sai)) { + struct sa_entry *entry; + + entry = list_entry(sai->sai_interim_entries.next, + struct sa_entry, se_list); + list_del_init(&entry->se_list); + spin_unlock(&lli->lli_sa_lock); + + sa_instantiate(sai, entry); + spin_lock(&lli->lli_sa_lock); + } + spin_unlock(&lli->lli_sa_lock); +} + +/* + * callback for async stat RPC, because this is called in ptlrpcd context, we + * only put sa_entry in sai_interim_entries, and wake up statahead thread to + * really prepare inode and instantiate sa_entry later. + */ +static int ll_statahead_interpret(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, int rc) +{ + struct lookup_intent *it = &minfo->mi_it; + struct inode *dir = minfo->mi_dir; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct sa_entry *entry = (struct sa_entry *)minfo->mi_cbdata; + __u64 handle = 0; + + ENTRY; + + if (it_disposition(it, DISP_LOOKUP_NEG)) + rc = -ENOENT; + + /* + * because statahead thread will wait for all inflight RPC to finish, + * sai should be always valid, no need to refcount + */ + LASSERT(sai != NULL); + LASSERT(entry != NULL); + + CDEBUG(D_READA, "sa_entry %.*s rc %d\n", + entry->se_qstr.len, entry->se_qstr.name, rc); + + if (rc != 0) { + ll_intent_release(it); + sa_fini_data(minfo); + } else { + /* + * release ibits lock ASAP to avoid deadlock when statahead + * thread enqueues lock on parent in readdir and another + * process enqueues lock on child with parent lock held, eg. + * unlink. + */ + handle = it->it_lock_handle; + ll_intent_drop_lock(it); + ll_unlock_md_op_lsm(&minfo->mi_data); + } + + spin_lock(&lli->lli_sa_lock); + if (rc != 0) { + if (__sa_make_ready(sai, entry, rc)) + wake_up(&sai->sai_waitq); + } else { + int first = 0; + + entry->se_minfo = minfo; + entry->se_req = ptlrpc_request_addref(req); + /* + * Release the async ibits lock ASAP to avoid deadlock + * when statahead thread tries to enqueue lock on parent + * for readpage and other tries to enqueue lock on child + * with parent's lock held, for example: unlink. 
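/*
 * Editor's illustrative sketch (not part of the patch): the handoff that
 * ll_statahead_interpret() performs.  The ptlrpcd callback only parks the
 * reply on sai_interim_entries and wakes the statahead thread; the heavy
 * work (ll_prep_inode() etc.) happens later in sa_handle_callback() in
 * process context.  All names below are hypothetical stand-ins for that
 * producer/consumer pattern.
 */
struct example_reply {
	struct list_head	link;
};

static void example_rpc_done(struct example_reply *reply,
			     struct list_head *pending, spinlock_t *lock,
			     struct task_struct *worker)
{
	bool first;

	spin_lock(lock);
	first = list_empty(pending);		/* worker may already be awake */
	list_add_tail(&reply->link, pending);
	if (first && worker)
		wake_up_process(worker);	/* worker drains the list later */
	spin_unlock(lock);
}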
+ */ + entry->se_handle = handle; + if (!sa_has_callback(sai)) + first = 1; + + list_add_tail(&entry->se_list, &sai->sai_interim_entries); + if (first && sai->sai_task) + wake_up_process(sai->sai_task); + } + sai->sai_replied++; + + spin_unlock(&lli->lli_sa_lock); + + RETURN(rc); +} + +/* async stat for file not found in dcache */ +static int sa_lookup(struct inode *dir, struct sa_entry *entry) +{ + struct md_enqueue_info *minfo; + int rc; + + ENTRY; + + minfo = sa_prep_data(dir, NULL, entry); + if (IS_ERR(minfo)) + RETURN(PTR_ERR(minfo)); + + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); + if (rc < 0) + sa_fini_data(minfo); + + RETURN(rc); +} + +/** + * async stat for file found in dcache, similar to .revalidate + * + * \retval 1 dentry valid, no RPC sent + * \retval 0 dentry invalid, will send async stat RPC + * \retval negative number upon error + */ +static int sa_revalidate(struct inode *dir, struct sa_entry *entry, + struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct lookup_intent it = { .it_op = IT_GETATTR, + .it_lock_handle = 0 }; + struct md_enqueue_info *minfo; + int rc; + + ENTRY; + + if (unlikely(!inode)) + RETURN(1); + + if (d_mountpoint(dentry)) + RETURN(1); + + minfo = sa_prep_data(dir, inode, entry); + if (IS_ERR(minfo)) + RETURN(PTR_ERR(minfo)); + + entry->se_inode = igrab(inode); + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode), + NULL); + if (rc == 1) { + entry->se_handle = it.it_lock_handle; + ll_intent_release(&it); + sa_fini_data(minfo); + RETURN(1); + } + + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); + if (rc < 0) { + entry->se_inode = NULL; + iput(inode); + sa_fini_data(minfo); + } + + RETURN(rc); +} + +/* async stat for file with @name */ +static void sa_statahead(struct dentry *parent, const char *name, int len, + const struct lu_fid *fid) +{ + struct inode *dir = parent->d_inode; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct dentry *dentry = NULL; + struct sa_entry *entry; + int rc; + + ENTRY; + + entry = sa_alloc(parent, sai, sai->sai_index, name, len, fid); + if (IS_ERR(entry)) + RETURN_EXIT; + + dentry = d_lookup(parent, &entry->se_qstr); + if (!dentry) { + rc = sa_lookup(dir, entry); + } else { + rc = sa_revalidate(dir, entry, dentry); + if (rc == 1 && agl_should_run(sai, dentry->d_inode)) + ll_agl_add(sai, dentry->d_inode, entry->se_index); + } + + if (dentry) + dput(dentry); + + if (rc != 0) + sa_make_ready(sai, entry, rc); + else + sai->sai_sent++; + + sai->sai_index++; + + EXIT; +} + +/* async glimpse (agl) thread main function */ +static int ll_agl_thread(void *arg) +{ + struct dentry *parent = (struct dentry *)arg; + struct inode *dir = parent->d_inode; + struct ll_inode_info *plli = ll_i2info(dir); + struct ll_inode_info *clli; + /* + * We already own this reference, so it is safe to take it + * without a lock. 
+ */ + struct ll_statahead_info *sai = plli->lli_sai; + + ENTRY; + + CDEBUG(D_READA, "agl thread started: sai %p, parent %pd\n", + sai, parent); + + while (({set_current_state(TASK_IDLE); + !kthread_should_stop(); })) { + spin_lock(&plli->lli_agl_lock); + clli = list_first_entry_or_null(&sai->sai_agls, + struct ll_inode_info, + lli_agl_list); + if (clli) { + __set_current_state(TASK_RUNNING); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + ll_agl_trigger(&clli->lli_vfs_inode, sai); + cond_resched(); + } else { + spin_unlock(&plli->lli_agl_lock); + schedule(); + } + } + __set_current_state(TASK_RUNNING); + RETURN(0); +} + +static void ll_stop_agl(struct ll_statahead_info *sai) +{ + struct dentry *parent = sai->sai_dentry; + struct ll_inode_info *plli = ll_i2info(parent->d_inode); + struct ll_inode_info *clli; + struct task_struct *agl_task; + + spin_lock(&plli->lli_agl_lock); + agl_task = sai->sai_agl_task; + sai->sai_agl_task = NULL; + spin_unlock(&plli->lli_agl_lock); + if (!agl_task) + return; + + CDEBUG(D_READA, "stop agl thread: sai %p pid %u\n", + sai, (unsigned int)agl_task->pid); + kthread_stop(agl_task); + + spin_lock(&plli->lli_agl_lock); + while ((clli = list_first_entry_or_null(&sai->sai_agls, + struct ll_inode_info, + lli_agl_list)) != NULL) { + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + clli->lli_agl_index = 0; + iput(&clli->lli_vfs_inode); + spin_lock(&plli->lli_agl_lock); + } + spin_unlock(&plli->lli_agl_lock); + CDEBUG(D_READA, "agl thread stopped: sai %p, parent %pd\n", + sai, parent); + ll_sai_put(sai); +} + +/* start agl thread */ +static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) +{ + int node = cfs_cpt_spread_node(cfs_cpt_tab, CFS_CPT_ANY); + struct ll_inode_info *plli; + struct task_struct *task; + + ENTRY; + + CDEBUG(D_READA, "start agl thread: sai %p, parent %pd\n", + sai, parent); + + plli = ll_i2info(parent->d_inode); + task = kthread_create_on_node(ll_agl_thread, parent, node, "ll_agl_%d", + plli->lli_opendir_pid); + if (IS_ERR(task)) { + CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task)); + RETURN_EXIT; + } + sai->sai_agl_task = task; + atomic_inc(&ll_i2sbi(d_inode(parent))->ll_agl_total); + /* Get an extra reference that the thread holds */ + ll_sai_get(d_inode(parent)); + + wake_up_process(task); + + EXIT; +} + +/* statahead thread main function */ +static int ll_statahead_thread(void *arg) +{ + struct dentry *parent = (struct dentry *)arg; + struct inode *dir = parent->d_inode; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ll_statahead_info *sai = lli->lli_sai; + int first = 0; + struct md_op_data *op_data; + struct page *page = NULL; + __u64 pos = 0; + int rc = 0; + + ENTRY; + + CDEBUG(D_READA, "statahead thread starting: sai %p, parent %pd\n", + sai, parent); + + OBD_ALLOC_PTR(op_data); + if (!op_data) + GOTO(out, rc = -ENOMEM); + + while (pos != MDS_DIR_END_OFF && sai->sai_task) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + op_data = ll_prep_md_op_data(op_data, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) { + rc = PTR_ERR(op_data); + break; + } + + sai->sai_in_readpage = 1; + page = ll_get_dir_page(dir, op_data, pos, NULL); + ll_unlock_md_op_lsm(op_data); + sai->sai_in_readpage = 0; + if (IS_ERR(page)) { + rc = PTR_ERR(page); + CDEBUG(D_READA, + "error reading dir "DFID" at %llu /%llu opendir_pid = %u: rc = %d\n", + PFID(ll_inode2fid(dir)), pos, sai->sai_index, + 
lli->lli_opendir_pid, rc); + break; + } + + dp = page_address(page); + for (ent = lu_dirent_start(dp); + ent != NULL && sai->sai_task && + !sa_low_hit(sai); + ent = lu_dirent_next(ent)) { + __u64 hash; + int namelen; + char *name; + struct lu_fid fid; + struct llcrypt_str lltr = LLTR_INIT(NULL, 0); + + hash = le64_to_cpu(ent->lde_hash); + if (unlikely(hash < pos)) + /* + * Skip until we find target hash value. + */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (unlikely(namelen == 0)) + /* + * Skip dummy record. + */ + continue; + + name = ent->lde_name; + if (name[0] == '.') { + if (namelen == 1) { + /* + * skip "." + */ + continue; + } else if (name[1] == '.' && namelen == 2) { + /* + * skip ".." + */ + continue; + } else if (!sai->sai_ls_all) { + /* + * skip hidden files. + */ + sai->sai_skip_hidden++; + continue; + } + } + + /* + * don't stat-ahead first entry. + */ + if (unlikely(++first == 1)) + continue; + + fid_le_to_cpu(&fid, &ent->lde_fid); + + while (({set_current_state(TASK_IDLE); + sai->sai_task; })) { + if (sa_has_callback(sai)) { + __set_current_state(TASK_RUNNING); + sa_handle_callback(sai); + } + + spin_lock(&lli->lli_agl_lock); + while (sa_sent_full(sai) && + !agl_list_empty(sai)) { + struct ll_inode_info *clli; + + __set_current_state(TASK_RUNNING); + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&lli->lli_agl_lock); + + ll_agl_trigger(&clli->lli_vfs_inode, + sai); + cond_resched(); + spin_lock(&lli->lli_agl_lock); + } + spin_unlock(&lli->lli_agl_lock); + + if (!sa_sent_full(sai)) + break; + schedule(); + } + __set_current_state(TASK_RUNNING); + + if (IS_ENCRYPTED(dir)) { + struct llcrypt_str de_name = + LLTR_INIT(ent->lde_name, namelen); + struct lu_fid fid; + + rc = llcrypt_fname_alloc_buffer(dir, NAME_MAX, + &lltr); + if (rc < 0) + continue; + + fid_le_to_cpu(&fid, &ent->lde_fid); + if (ll_fname_disk_to_usr(dir, 0, 0, &de_name, + &lltr, &fid)) { + llcrypt_fname_free_buffer(&lltr); + continue; + } + + name = lltr.name; + namelen = lltr.len; + } + + sa_statahead(parent, name, namelen, &fid); + llcrypt_fname_free_buffer(&lltr); + } + + pos = le64_to_cpu(dp->ldp_hash_end); + ll_release_page(dir, page, + le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + + if (sa_low_hit(sai)) { + rc = -EFAULT; + atomic_inc(&sbi->ll_sa_wrong); + CDEBUG(D_READA, + "Statahead for dir "DFID" hit ratio too low: hit/miss %llu/%llu, sent/replied %llu/%llu, stoppingstatahead thread: pid %d\n", + PFID(&lli->lli_fid), sai->sai_hit, + sai->sai_miss, sai->sai_sent, + sai->sai_replied, current->pid); + break; + } + } + ll_finish_md_op_data(op_data); + + if (rc < 0) { + spin_lock(&lli->lli_sa_lock); + sai->sai_task = NULL; + lli->lli_sa_enabled = 0; + spin_unlock(&lli->lli_sa_lock); + } + + /* + * statahead is finished, but statahead entries need to be cached, wait + * for file release to stop me. 
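/*
 * Editor's illustrative sketch (not part of the patch): the sleep/wake idiom
 * used by the AGL thread above and by the wait loop that follows below.  The
 * task state is set *before* checking for work, so a wake_up_process() from
 * the producer can never be lost between the check and schedule().
 * work_available() and do_work() are hypothetical placeholders.
 */
extern bool work_available(void);	/* hypothetical */
extern void do_work(void);		/* hypothetical */

static int example_worker(void *arg)
{
	while (({ set_current_state(TASK_IDLE);
		  !kthread_should_stop(); })) {
		if (work_available()) {
			__set_current_state(TASK_RUNNING);
			do_work();
			cond_resched();
		} else {
			schedule();	/* sleep until woken or stopped */
		}
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}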
+ */ + while (({set_current_state(TASK_IDLE); + sai->sai_task; })) { + if (sa_has_callback(sai)) { + __set_current_state(TASK_RUNNING); + sa_handle_callback(sai); + } else { + schedule(); + } + } + __set_current_state(TASK_RUNNING); + + EXIT; +out: + ll_stop_agl(sai); + + /* + * wait for inflight statahead RPCs to finish, and then we can free sai + * safely because statahead RPC will access sai data + */ + while (sai->sai_sent != sai->sai_replied) + /* in case we're not woken up, timeout wait */ + msleep(125); + + /* release resources held by statahead RPCs */ + sa_handle_callback(sai); + + CDEBUG(D_READA, "%s: statahead thread stopped: sai %p, parent %pd\n", + sbi->ll_fsname, sai, parent); + + spin_lock(&lli->lli_sa_lock); + sai->sai_task = NULL; + spin_unlock(&lli->lli_sa_lock); + wake_up(&sai->sai_waitq); + + ll_sai_put(sai); + + return rc; +} + +/* authorize opened dir handle @key to statahead */ +void ll_authorize_statahead(struct inode *dir, void *key) +{ + struct ll_inode_info *lli = ll_i2info(dir); + + spin_lock(&lli->lli_sa_lock); + if (!lli->lli_opendir_key && !lli->lli_sai) { + /* + * if lli_sai is not NULL, it means previous statahead is not + * finished yet, we'd better not start a new statahead for now. + */ + LASSERT(lli->lli_opendir_pid == 0); + lli->lli_opendir_key = key; + lli->lli_opendir_pid = current->pid; + lli->lli_sa_enabled = 1; + } + spin_unlock(&lli->lli_sa_lock); +} + +/* + * deauthorize opened dir handle @key to statahead, and notify statahead thread + * to quit if it's running. + */ +void ll_deauthorize_statahead(struct inode *dir, void *key) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai; + + LASSERT(lli->lli_opendir_key == key); + LASSERT(lli->lli_opendir_pid != 0); + + CDEBUG(D_READA, "deauthorize statahead for "DFID"\n", + PFID(&lli->lli_fid)); + + spin_lock(&lli->lli_sa_lock); + lli->lli_opendir_key = NULL; + lli->lli_opendir_pid = 0; + lli->lli_sa_enabled = 0; + sai = lli->lli_sai; + if (sai && sai->sai_task) { + /* + * statahead thread may not have quit yet because it needs to + * cache entries, now it's time to tell it to quit. + * + * wake_up_process() provides the necessary barriers + * to pair with set_current_state(). + */ + struct task_struct *task = sai->sai_task; + + sai->sai_task = NULL; + wake_up_process(task); + } + spin_unlock(&lli->lli_sa_lock); +} + +enum { + /** + * not first dirent, or is "." + */ + LS_NOT_FIRST_DE = 0, + /** + * the first non-hidden dirent + */ + LS_FIRST_DE, + /** + * the first hidden dirent, that is "." 
+ */ + LS_FIRST_DOT_DE +}; + +/* file is first dirent under @dir */ +static int is_first_dirent(struct inode *dir, struct dentry *dentry) +{ + struct qstr *target = &dentry->d_name; + struct md_op_data *op_data; + int dot_de; + struct page *page = NULL; + int rc = LS_NOT_FIRST_DE; + __u64 pos = 0; + struct llcrypt_str lltr = LLTR_INIT(NULL, 0); + + ENTRY; + + op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (IS_ENCRYPTED(dir)) { + int rc2 = llcrypt_fname_alloc_buffer(dir, NAME_MAX, &lltr); + + if (rc2 < 0) + RETURN(rc2); + } + + /** + *FIXME choose the start offset of the readdir + */ + + page = ll_get_dir_page(dir, op_data, 0, NULL); + + while (1) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + if (IS_ERR(page)) { + struct ll_inode_info *lli = ll_i2info(dir); + + rc = PTR_ERR(page); + CERROR("%s: reading dir "DFID" at %llu opendir_pid = %u : rc = %d\n", + ll_i2sbi(dir)->ll_fsname, + PFID(ll_inode2fid(dir)), pos, + lli->lli_opendir_pid, rc); + break; + } + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL; + ent = lu_dirent_next(ent)) { + __u64 hash; + int namelen; + char *name; + + hash = le64_to_cpu(ent->lde_hash); + /* + * The ll_get_dir_page() can return any page containing + * the given hash which may be not the start hash. + */ + if (unlikely(hash < pos)) + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (unlikely(namelen == 0)) + /* + * skip dummy record. + */ + continue; + + name = ent->lde_name; + if (name[0] == '.') { + if (namelen == 1) + /* + * skip "." + */ + continue; + else if (name[1] == '.' && namelen == 2) + /* + * skip ".." + */ + continue; + else + dot_de = 1; + } else { + dot_de = 0; + } + + if (dot_de && target->name[0] != '.') { + CDEBUG(D_READA, "%.*s skip hidden file %.*s\n", + target->len, target->name, + namelen, name); + continue; + } + + if (IS_ENCRYPTED(dir)) { + struct llcrypt_str de_name = + LLTR_INIT(ent->lde_name, namelen); + struct lu_fid fid; + + fid_le_to_cpu(&fid, &ent->lde_fid); + if (ll_fname_disk_to_usr(dir, 0, 0, &de_name, + &lltr, &fid)) + continue; + name = lltr.name; + namelen = lltr.len; + } + + if (target->len != namelen || + memcmp(target->name, name, namelen) != 0) + rc = LS_NOT_FIRST_DE; + else if (!dot_de) + rc = LS_FIRST_DE; + else + rc = LS_FIRST_DOT_DE; + + ll_release_page(dir, page, false); + GOTO(out, rc); + } + pos = le64_to_cpu(dp->ldp_hash_end); + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + ll_release_page(dir, page, false); + GOTO(out, rc); + } else { + /* + * chain is exhausted + * Normal case: continue to the next page. 
+ */ + ll_release_page(dir, page, le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + page = ll_get_dir_page(dir, op_data, pos, NULL); + } + } + EXIT; +out: + llcrypt_fname_free_buffer(&lltr); + ll_finish_md_op_data(op_data); + + return rc; +} + +/** + * revalidate @dentryp from statahead cache + * + * \param[in] dir parent directory + * \param[in] sai sai structure + * \param[out] dentryp pointer to dentry which will be revalidated + * \param[in] unplug unplug statahead window only (normally for negative + * dentry) + * \retval 1 on success, dentry is saved in @dentryp + * \retval 0 if revalidation failed (no proper lock on client) + * \retval negative number upon error + */ +static int revalidate_statahead_dentry(struct inode *dir, + struct ll_statahead_info *sai, + struct dentry **dentryp, + bool unplug) +{ + struct sa_entry *entry = NULL; + struct ll_inode_info *lli = ll_i2info(dir); + int rc = 0; + + ENTRY; + + if ((*dentryp)->d_name.name[0] == '.') { + if (sai->sai_ls_all || + sai->sai_miss_hidden >= sai->sai_skip_hidden) { + /* + * Hidden dentry is the first one, or statahead + * thread does not skip so many hidden dentries + * before "sai_ls_all" enabled as below. + */ + } else { + if (!sai->sai_ls_all) + /* + * It maybe because hidden dentry is not + * the first one, "sai_ls_all" was not + * set, then "ls -al" missed. Enable + * "sai_ls_all" for such case. + */ + sai->sai_ls_all = 1; + + /* + * Such "getattr" has been skipped before + * "sai_ls_all" enabled as above. + */ + sai->sai_miss_hidden++; + RETURN(-EAGAIN); + } + } + + if (unplug) + GOTO(out, rc = 1); + + entry = sa_get(sai, &(*dentryp)->d_name); + if (!entry) + GOTO(out, rc = -EAGAIN); + + /* if statahead is busy in readdir, help it do post-work */ + if (!sa_ready(entry) && sai->sai_in_readpage) + sa_handle_callback(sai); + + if (!sa_ready(entry)) { + spin_lock(&lli->lli_sa_lock); + sai->sai_index_wait = entry->se_index; + spin_unlock(&lli->lli_sa_lock); + rc = wait_event_idle_timeout(sai->sai_waitq, sa_ready(entry), + cfs_time_seconds(30)); + if (rc == 0) { + /* + * entry may not be ready, so it may be used by inflight + * statahead RPC, don't free it. + */ + entry = NULL; + GOTO(out, rc = -EAGAIN); + } + } + + /* + * We need to see the value that was set immediately before we + * were woken up. 
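/*
 * Editor's illustrative sketch (not part of the patch): the publish/consume
 * pairing behind sa_entry::se_state.  __sa_make_ready() publishes the final
 * state with smp_store_release(), and the check below reads it with
 * smp_load_acquire(), so everything written to the entry before the state
 * flip is guaranteed visible to the scanner.  Names are hypothetical.
 */
struct example_result {
	int	data;		/* filled in before publishing */
	int	state;		/* 0 = not ready, 1 = ready */
};

static void example_publish(struct example_result *r, int data)
{
	r->data = data;
	smp_store_release(&r->state, 1);	/* pairs with acquire below */
}

static int example_consume(struct example_result *r, int *out)
{
	if (smp_load_acquire(&r->state) != 1)
		return -EAGAIN;
	*out = r->data;				/* guaranteed to be visible */
	return 0;
}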
+ */ + if (smp_load_acquire(&entry->se_state) == SA_ENTRY_SUCC && + entry->se_inode) { + struct inode *inode = entry->se_inode; + struct lookup_intent it = { .it_op = IT_GETATTR, + .it_lock_handle = + entry->se_handle }; + __u64 bits; + + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, + ll_inode2fid(inode), &bits); + if (rc == 1) { + if (!(*dentryp)->d_inode) { + struct dentry *alias; + + alias = ll_splice_alias(inode, *dentryp); + if (IS_ERR(alias)) { + ll_intent_release(&it); + GOTO(out, rc = PTR_ERR(alias)); + } + *dentryp = alias; + /* + * statahead prepared this inode, transfer inode + * refcount from sa_entry to dentry + */ + entry->se_inode = NULL; + } else if ((*dentryp)->d_inode != inode) { + /* revalidate, but inode is recreated */ + CDEBUG(D_READA, + "%s: stale dentry %pd inode " DFID", statahead inode "DFID "\n", + ll_i2sbi(inode)->ll_fsname, *dentryp, + PFID(ll_inode2fid((*dentryp)->d_inode)), + PFID(ll_inode2fid(inode))); + ll_intent_release(&it); + GOTO(out, rc = -ESTALE); + } + + if ((bits & MDS_INODELOCK_LOOKUP) && + d_lustre_invalid(*dentryp)) { + d_lustre_revalidate(*dentryp); + ll_update_dir_depth(dir, (*dentryp)->d_inode); + } + + ll_intent_release(&it); + } + } +out: + /* + * statahead cached sa_entry can be used only once, and will be killed + * right after use, so if lookup/revalidate accessed statahead cache, + * set dentry ldd_sa_generation to parent lli_sa_generation, later if we + * stat this file again, we know we've done statahead before, see + * dentry_may_statahead(). + */ + if (lld_is_init(*dentryp)) + ll_d2d(*dentryp)->lld_sa_generation = lli->lli_sa_generation; + sa_put(sai, entry); + spin_lock(&lli->lli_sa_lock); + if (sai->sai_task) + wake_up_process(sai->sai_task); + spin_unlock(&lli->lli_sa_lock); + + RETURN(rc); +} + +/** + * start statahead thread + * + * \param[in] dir parent directory + * \param[in] dentry dentry that triggers statahead, normally the first + * dirent under @dir + * \param[in] agl indicate whether AGL is needed + * \retval -EAGAIN on success, because when this function is + * called, it's already in lookup call, so client should + * do it itself instead of waiting for statahead thread + * to do it asynchronously. + * \retval negative number upon error + */ +static int start_statahead_thread(struct inode *dir, struct dentry *dentry, + bool agl) +{ + int node = cfs_cpt_spread_node(cfs_cpt_tab, CFS_CPT_ANY); + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = NULL; + struct dentry *parent = dentry->d_parent; + struct task_struct *task; + struct ll_sb_info *sbi = ll_i2sbi(parent->d_inode); + int first = LS_FIRST_DE; + int rc = 0; + + ENTRY; + + /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */ + first = is_first_dirent(dir, dentry); + if (first == LS_NOT_FIRST_DE) + /* It is not "ls -{a}l" operation, no need statahead for it. */ + GOTO(out, rc = -EFAULT); + + if (unlikely(atomic_inc_return(&sbi->ll_sa_running) > + sbi->ll_sa_running_max)) { + CDEBUG(D_READA, + "Too many concurrent statahead instances, avoid new statahead instance temporarily.\n"); + GOTO(out, rc = -EMFILE); + } + + sai = ll_sai_alloc(parent); + if (!sai) + GOTO(out, rc = -ENOMEM); + + sai->sai_ls_all = (first == LS_FIRST_DOT_DE); + + /* + * if current lli_opendir_key was deauthorized, or dir re-opened by + * another process, don't start statahead, otherwise the newly spawned + * statahead thread won't be notified to quit. 
+ */ + spin_lock(&lli->lli_sa_lock); + if (unlikely(lli->lli_sai || !lli->lli_opendir_key || + lli->lli_opendir_pid != current->pid)) { + spin_unlock(&lli->lli_sa_lock); + GOTO(out, rc = -EPERM); + } + lli->lli_sai = sai; + spin_unlock(&lli->lli_sa_lock); + + CDEBUG(D_READA, "start statahead thread: [pid %d] [parent %pd]\n", + current->pid, parent); + + task = kthread_create_on_node(ll_statahead_thread, parent, node, + "ll_sa_%u", lli->lli_opendir_pid); + if (IS_ERR(task)) { + spin_lock(&lli->lli_sa_lock); + lli->lli_sai = NULL; + spin_unlock(&lli->lli_sa_lock); + rc = PTR_ERR(task); + CERROR("can't start ll_sa thread, rc: %d\n", rc); + GOTO(out, rc); + } + + if (test_bit(LL_SBI_AGL_ENABLED, ll_i2sbi(parent->d_inode)->ll_flags) && + agl) + ll_start_agl(parent, sai); + + atomic_inc(&ll_i2sbi(parent->d_inode)->ll_sa_total); + sai->sai_task = task; + + wake_up_process(task); + /* + * We don't stat-ahead for the first dirent since we are already in + * lookup. + */ + RETURN(-EAGAIN); + +out: + /* + * once we start statahead thread failed, disable statahead so that + * subsequent stat won't waste time to try it. + */ + spin_lock(&lli->lli_sa_lock); + if (lli->lli_opendir_pid == current->pid) + lli->lli_sa_enabled = 0; + spin_unlock(&lli->lli_sa_lock); + + if (sai) + ll_sai_free(sai); + if (first != LS_NOT_FIRST_DE) + atomic_dec(&sbi->ll_sa_running); + + RETURN(rc); +} + +/* + * Check whether statahead for @dir was started. + */ +static inline bool ll_statahead_started(struct inode *dir, bool agl) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai; + + spin_lock(&lli->lli_sa_lock); + sai = lli->lli_sai; + if (sai && (sai->sai_agl_task != NULL) != agl) + CDEBUG(D_READA, + "%s: Statahead AGL hint changed from %d to %d\n", + ll_i2sbi(dir)->ll_fsname, + sai->sai_agl_task != NULL, agl); + spin_unlock(&lli->lli_sa_lock); + + return !!sai; +} + +/** + * statahead entry function, this is called when client getattr on a file, it + * will start statahead thread if this is the first dir entry, else revalidate + * dentry from statahead cache. + * + * \param[in] dir parent directory + * \param[out] dentryp dentry to getattr + * \param[in] agl whether start the agl thread + * + * \retval 1 on success + * \retval 0 revalidation from statahead cache failed, caller needs + * to getattr from server directly + * \retval negative number on error, caller often ignores this and + * then getattr from server + */ +int ll_start_statahead(struct inode *dir, struct dentry *dentry, bool agl) +{ + if (!ll_statahead_started(dir, agl)) + return start_statahead_thread(dir, dentry, agl); + return 0; +} + +/** + * revalidate dentry from statahead cache. 
+ * + * \param[in] dir parent directory + * \param[out] dentryp dentry to getattr + * \param[in] unplug unplug statahead window only (normally for negative + * dentry) + * \retval 1 on success + * \retval 0 revalidation from statahead cache failed, caller needs + * to getattr from server directly + * \retval negative number on error, caller often ignores this and + * then getattr from server + */ +int ll_revalidate_statahead(struct inode *dir, struct dentry **dentryp, + bool unplug) +{ + struct ll_statahead_info *sai; + int rc = 0; + + sai = ll_sai_get(dir); + if (sai) { + rc = revalidate_statahead_dentry(dir, sai, dentryp, unplug); + CDEBUG(D_READA, "revalidate statahead %pd: rc = %d.\n", + *dentryp, rc); + ll_sai_put(sai); + } + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/llite/super25.c b/drivers/staging/lustrefsx/lustre/llite/super25.c new file mode 100644 index 0000000000000..3238621d3ef62 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/super25.c @@ -0,0 +1,340 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#define D_MOUNT (D_SUPER | D_CONFIG/*|D_WARNING */) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "llite_internal.h" +#include "vvp_internal.h" + +static struct kmem_cache *ll_inode_cachep; + +static struct inode *ll_alloc_inode(struct super_block *sb) +{ + struct ll_inode_info *lli; +#ifdef HAVE_ALLOC_INODE_SB + lli = alloc_inode_sb(sb, ll_inode_cachep, GFP_NOFS); + if (!lli) + return NULL; + OBD_ALLOC_POST(lli, sizeof(*lli), "slab-alloced"); + memset(lli, 0, sizeof(*lli)); +#else + OBD_SLAB_ALLOC_PTR_GFP(lli, ll_inode_cachep, GFP_NOFS); + if (!lli) + return NULL; +#endif + inode_init_once(&lli->lli_vfs_inode); + return &lli->lli_vfs_inode; +} + +static void ll_inode_destroy_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ll_inode_info *ptr = ll_i2info(inode); + llcrypt_free_inode(inode); + OBD_SLAB_FREE_PTR(ptr, ll_inode_cachep); +} + +static void ll_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ll_inode_destroy_callback); +} + +static int ll_drop_inode(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + int drop; + + if (!sbi->ll_inode_cache_enabled) + return 1; + + drop = generic_drop_inode(inode); + if (!drop) + drop = llcrypt_drop_inode(inode); + + return drop; +} + +/* exported operations */ +const struct super_operations lustre_super_operations = +{ + .alloc_inode = ll_alloc_inode, + .destroy_inode = ll_destroy_inode, + .drop_inode = ll_drop_inode, + .evict_inode = ll_delete_inode, + .put_super = ll_put_super, + .statfs = ll_statfs, + .umount_begin = ll_umount_begin, + .remount_fs = ll_remount_fs, + .show_options = ll_show_options, +}; + +/** + * This is the entry point for the mount call into Lustre. + * This is called when a client is mounted, and this is + * where we start setting things up. + * + * @lmd2data data Mount options (e.g. -o flock,abort_recov) + */ +static int lustre_fill_super(struct super_block *sb, void *lmd2_data, + int silent) +{ + struct lustre_mount_data *lmd; + struct lustre_sb_info *lsi; + int rc; + + ENTRY; + + CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb); + + lsi = lustre_init_lsi(sb); + if (!lsi) + RETURN(-ENOMEM); + lmd = lsi->lsi_lmd; + + /* + * Disable lockdep during mount, because mount locking patterns are + * 'special'. + */ + lockdep_off(); + + /* + * LU-639: the OBD cleanup of last mount may not finish yet, wait here. 
+ */ + obd_zombie_barrier(); + + /* Figure out the lmd from the mount options */ + if (lmd_parse(lmd2_data, lmd)) { + lustre_put_lsi(sb); + GOTO(out, rc = -EINVAL); + } + + if (!lmd_is_client(lmd)) { +#ifdef HAVE_SERVER_SUPPORT +#if LUSTRE_VERSION_CODE > OBD_OCD_VERSION(2, 15, 51, 0) + static bool printed; + + if (!printed) { + LCONSOLE_WARN("%s: mounting server target with '-t lustre' deprecated, use '-t lustre_tgt'\n", + lmd->lmd_profile); + printed = true; + } +#endif + rc = server_fill_super(sb); +#else + rc = -ENODEV; + CERROR("%s: This is client-side-only module, cannot handle server mount: rc = %d\n", + lmd->lmd_profile, rc); + lustre_put_lsi(sb); +#endif + GOTO(out, rc); + } + + CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile); + rc = lustre_start_mgc(sb); + if (rc) { + lustre_common_put_super(sb); + GOTO(out, rc); + } + /* Connect and start */ + rc = ll_fill_super(sb); + /* ll_file_super will call lustre_common_put_super on failure, + * which takes care of the module reference. + * + * If error happens in fill_super() call, @lsi will be killed there. + * This is why we do not put it here. + */ +out: + if (rc) { + CERROR("llite: Unable to mount %s: rc = %d\n", + s2lsi(sb) ? lmd->lmd_dev : "", rc); + } else { + CDEBUG(D_SUPER, "%s: Mount complete\n", + lmd->lmd_dev); + } + lockdep_on(); + return rc; +} + +/***************** FS registration ******************/ +static struct dentry *lustre_mount(struct file_system_type *fs_type, int flags, + const char *devname, void *data) +{ + return mount_nodev(fs_type, flags, data, lustre_fill_super); +} + +static void lustre_kill_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + if (lsi && !IS_SERVER(lsi)) + ll_kill_super(sb); + + kill_anon_super(sb); +} + +/** Register the "lustre" fs type + */ +static struct file_system_type lustre_fs_type = { + .owner = THIS_MODULE, + .name = "lustre", + .mount = lustre_mount, + .kill_sb = lustre_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE, +}; +MODULE_ALIAS_FS("lustre"); + +static int __init lustre_init(void) +{ + struct lnet_processid lnet_id; + int i, rc; + unsigned long lustre_inode_cache_flags; + + BUILD_BUG_ON(sizeof(LUSTRE_VOLATILE_HDR) != + LUSTRE_VOLATILE_HDR_LEN + 1); + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre client module (%p).\n", + &lustre_super_operations); + + lustre_inode_cache_flags = SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD; +#ifdef SLAB_ACCOUNT + lustre_inode_cache_flags |= SLAB_ACCOUNT; +#endif + + ll_inode_cachep = kmem_cache_create("lustre_inode_cache", + sizeof(struct ll_inode_info), + 0, lustre_inode_cache_flags, NULL); + if (ll_inode_cachep == NULL) + GOTO(out_cache, rc = -ENOMEM); + + ll_file_data_slab = kmem_cache_create("ll_file_data", + sizeof(struct ll_file_data), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ll_file_data_slab == NULL) + GOTO(out_cache, rc = -ENOMEM); + + pcc_inode_slab = kmem_cache_create("ll_pcc_inode", + sizeof(struct pcc_inode), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (pcc_inode_slab == NULL) + GOTO(out_cache, rc = -ENOMEM); + + rc = llite_tunables_register(); + if (rc) + GOTO(out_cache, rc); + + /* Nodes with small feet have little entropy. The NID for this + * node gives the most entropy in the low bits. 
*/ + for (i = 0;; i++) { + if (LNetGetId(i, &lnet_id) == -ENOENT) + break; + + add_device_randomness(&lnet_id.nid, sizeof(lnet_id.nid)); + } + + rc = vvp_global_init(); + if (rc != 0) + GOTO(out_tunables, rc); + + cl_inode_fini_env = cl_env_alloc(&cl_inode_fini_refcheck, + LCT_REMEMBER | LCT_NOREF); + if (IS_ERR(cl_inode_fini_env)) + GOTO(out_vvp, rc = PTR_ERR(cl_inode_fini_env)); + + cl_inode_fini_env->le_ctx.lc_cookie = 0x4; + + rc = ll_xattr_init(); + if (rc != 0) + GOTO(out_inode_fini_env, rc); + + rc = register_filesystem(&lustre_fs_type); + if (rc) + GOTO(out_xattr, rc); + + RETURN(0); + +out_xattr: + ll_xattr_fini(); +out_inode_fini_env: + cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); +out_vvp: + vvp_global_fini(); +out_tunables: + llite_tunables_unregister(); +out_cache: + kmem_cache_destroy(ll_inode_cachep); + kmem_cache_destroy(ll_file_data_slab); + kmem_cache_destroy(pcc_inode_slab); + return rc; +} + +static void __exit lustre_exit(void) +{ + unregister_filesystem(&lustre_fs_type); + + llite_tunables_unregister(); + + ll_xattr_fini(); + cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); + vvp_global_fini(); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + + kmem_cache_destroy(ll_inode_cachep); + kmem_cache_destroy(ll_file_data_slab); + kmem_cache_destroy(pcc_inode_slab); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Client File System"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(lustre_init); +module_exit(lustre_exit); diff --git a/drivers/staging/lustrefsx/lustre/llite/symlink.c b/drivers/staging/lustrefsx/lustre/llite/symlink.c new file mode 100644 index 0000000000000..1a4bf5f9aa5db --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/symlink.c @@ -0,0 +1,338 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include +#include +#include +#include +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" + +/* Must be called with lli_size_mutex locked */ +/* HAVE_IOP_GET_LINK is defined from kernel 4.5, whereas + * IS_ENCRYPTED is brought by kernel 4.14. + * So there is no need to handle encryption case otherwise. 
+ */ +#ifdef HAVE_IOP_GET_LINK +static int ll_readlink_internal(struct inode *inode, + struct ptlrpc_request **request, + char **symname, struct delayed_call *done) +#else +static int ll_readlink_internal(struct inode *inode, + struct ptlrpc_request **request, char **symname) +#endif +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int rc, symlen = i_size_read(inode) + 1; + struct mdt_body *body; + struct md_op_data *op_data; + + ENTRY; + + *request = NULL; + + if (lli->lli_symlink_name) { + int print_limit = min_t(int, PAGE_SIZE - 128, symlen); + + *symname = lli->lli_symlink_name; + /* + * If the total CDEBUG() size is larger than a page, it + * will print a warning to the console, avoid this by + * printing just the last part of the symlink. + */ + CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n", + print_limit < symlen ? "..." : "", print_limit, + (*symname) + symlen - print_limit, symlen); + RETURN(0); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, symlen, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_LINKNAME; + rc = md_getattr(sbi->ll_md_exp, op_data, request); + ll_finish_md_op_data(op_data); + if (rc) { + if (rc != -ENOENT) + CERROR("%s: inode "DFID": rc = %d\n", + ll_i2sbi(inode)->ll_fsname, + PFID(ll_inode2fid(inode)), rc); + GOTO(failed, rc); + } + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + if ((body->mbo_valid & OBD_MD_LINKNAME) == 0) { + CERROR("OBD_MD_LINKNAME not set on reply\n"); + GOTO(failed, rc = -EPROTO); + } + + LASSERT(symlen != 0); + if (body->mbo_eadatasize != symlen) { + CERROR("%s: inode "DFID": symlink length %d not expected %d\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode)), + body->mbo_eadatasize - 1, symlen - 1); + GOTO(failed, rc = -EPROTO); + } + + *symname = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_MD); + if (!*symname || + (!IS_ENCRYPTED(inode) && + strnlen(*symname, symlen) != symlen - 1)) { + /* not full/NULL terminated */ + CERROR("%s: inode "DFID": symlink not NULL terminated string of length %d\n", + sbi->ll_fsname, + PFID(ll_inode2fid(inode)), symlen - 1); + GOTO(failed, rc = -EPROTO); + } + +#ifdef HAVE_IOP_GET_LINK + if (IS_ENCRYPTED(inode)) { + const char *target = llcrypt_get_symlink(inode, *symname, + symlen, done); + if (IS_ERR(target)) + RETURN(PTR_ERR(target)); + symlen = strlen(target) + 1; + *symname = (char *)target; + + /* Do not cache symlink targets encoded without the key, + * since those become outdated once the key is added. 
+ */ + if (!llcrypt_has_encryption_key(inode)) + RETURN(0); + } +#endif + + OBD_ALLOC(lli->lli_symlink_name, symlen); + /* do not return an error if we cannot cache the symlink locally */ + if (lli->lli_symlink_name) { + memcpy(lli->lli_symlink_name, *symname, symlen); + *symname = lli->lli_symlink_name; + } + RETURN(0); + +failed: + RETURN(rc); +} + +#ifdef HAVE_SYMLINK_OPS_USE_NAMEIDATA +static void ll_put_link(struct dentry *dentry, + struct nameidata *nd, void *cookie) +#else +# ifdef HAVE_IOP_GET_LINK +static void ll_put_link(void *cookie) +# else +static void ll_put_link(struct inode *unused, void *cookie) +# endif +#endif +{ + ptlrpc_req_finished(cookie); +} + +#ifdef HAVE_SYMLINK_OPS_USE_NAMEIDATA +static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct ptlrpc_request *request = NULL; + int rc; + char *symname = NULL; + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op\n"); + /* + * Limit the recursive symlink depth to 5 instead of default + * 8 links when kernel has 4k stack to prevent stack overflow. + * For 8k stacks we need to limit it to 7 for local servers. + */ + if (THREAD_SIZE < 8192 && current->link_count >= 6) { + rc = -ELOOP; + } else if (THREAD_SIZE == 8192 && current->link_count >= 8) { + rc = -ELOOP; + } else { + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname); + ll_inode_size_unlock(inode); + } + if (rc) { + ptlrpc_req_finished(request); + request = NULL; + symname = ERR_PTR(rc); + } + + nd_set_link(nd, symname); + /* + * symname may contain a pointer to the request message buffer, + * we delay request releasing until ll_put_link then. + */ + RETURN(request); +} +#else +# ifdef HAVE_IOP_GET_LINK +static const char *ll_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct ptlrpc_request *request; + char *symname = NULL; + int rc; + + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, inode="DFID"(%p)\n", + dentry, PFID(ll_inode2fid(inode)), inode); + if (!dentry) + RETURN(ERR_PTR(-ECHILD)); + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname, done); + ll_inode_size_unlock(inode); + if (rc < 0) { + ptlrpc_req_finished(request); + return ERR_PTR(rc); + } + + /* + * symname may contain a pointer to the request message buffer, + * we delay request releasing then. + */ + set_delayed_call(done, ll_put_link, request); + RETURN(symname); +} +# else +static const char *ll_follow_link(struct dentry *dentry, void **cookie) +{ + struct inode *inode = d_inode(dentry); + struct ptlrpc_request *request; + char *symname = NULL; + int rc; + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op\n"); + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname); + ll_inode_size_unlock(inode); + if (rc < 0) { + ptlrpc_req_finished(request); + return ERR_PTR(rc); + } + + /* + * symname may contain a pointer to the request message buffer, + * we delay request releasing until ll_put_link then. + */ + *cookie = request; + RETURN(symname); +} +# endif /* HAVE_IOP_GET_LINK */ +#endif /* HAVE_SYMLINK_OPS_USE_NAMEIDATA */ + +/** + * ll_getattr_link() - link-specific getattr to set the correct st_size + * for encrypted symlinks + * + * Override st_size of encrypted symlinks to be the length of the decrypted + * symlink target (or the no-key encoded symlink target, if the key is + * unavailable) rather than the length of the encrypted symlink target. 
This is + * necessary for st_size to match the symlink target that userspace actually + * sees. POSIX requires this, and some userspace programs depend on it. + * + * For non encrypted symlinks, this is a just calling ll_getattr(). + * For encrypted symlinks, this additionally requires reading the symlink target + * from disk if needed, setting up the inode's encryption key if possible, and + * then decrypting or encoding the symlink target. This makes lstat() more + * heavyweight than is normally the case. However, decrypted symlink targets + * will be cached in ->i_link, so usually the symlink won't have to be read and + * decrypted again later if/when it is actually followed, readlink() is called, + * or lstat() is called again. + * + * Return: 0 on success, -errno on failure + */ +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR) +static int ll_getattr_link( +#if defined(HAVE_USER_NAMESPACE_ARG) + struct user_namespace *mnt_userns, +#endif + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = d_inode(dentry); + DEFINE_DELAYED_CALL(done); + const char *link; + int rc; + + rc = ll_getattr(mnt_userns, path, stat, request_mask, flags); + if (rc || !IS_ENCRYPTED(inode)) + return rc; + + /* + * To get the symlink target that userspace will see (whether it's the + * decrypted target or the no-key encoded target), we can just get it + * in the same way the VFS does during path resolution and readlink(). + */ + link = READ_ONCE(inode->i_link); + if (!link) { + link = inode->i_op->get_link(dentry, inode, &done); + if (IS_ERR(link)) + return PTR_ERR(link); + } + stat->size = strlen(link); + do_delayed_call(&done); + return 0; +} +#else /* HAVE_INODEOPS_ENHANCED_GETATTR */ +#define ll_getattr_link ll_getattr +#endif + +const struct inode_operations ll_fast_symlink_inode_operations = { +#ifdef HAVE_IOP_GENERIC_READLINK + .readlink = generic_readlink, +#endif + .setattr = ll_setattr, +#ifdef HAVE_IOP_GET_LINK + .get_link = ll_get_link, +#else + .follow_link = ll_follow_link, + .put_link = ll_put_link, +#endif + .getattr = ll_getattr_link, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c new file mode 100644 index 0000000000000..d5bb6c18e22ed --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c @@ -0,0 +1,623 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * cl_device and cl_device_type implementation for VVP layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include "llite_internal.h" +#include "vvp_internal.h" +#include + +/***************************************************************************** + * + * Vvp device and device type functions. + * + */ + +/* + * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to historical + * "llite_" (var. "ll_") prefix. + */ + +static struct kmem_cache *ll_thread_kmem; +struct kmem_cache *vvp_object_kmem; +static struct kmem_cache *vvp_session_kmem; +static struct kmem_cache *vvp_thread_kmem; + +static struct lu_kmem_descr vvp_caches[] = { + { + .ckd_cache = &ll_thread_kmem, + .ckd_name = "ll_thread_kmem", + .ckd_size = sizeof(struct ll_thread_info), + }, + { + .ckd_cache = &vvp_object_kmem, + .ckd_name = "vvp_object_kmem", + .ckd_size = sizeof(struct vvp_object), + }, + { + .ckd_cache = &vvp_session_kmem, + .ckd_name = "vvp_session_kmem", + .ckd_size = sizeof (struct vvp_session) + }, + { + .ckd_cache = &vvp_thread_kmem, + .ckd_name = "vvp_thread_kmem", + .ckd_size = sizeof(struct vvp_thread_info), + }, + { + .ckd_cache = NULL + } +}; + +static void *ll_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct ll_thread_info *lti; + + OBD_SLAB_ALLOC_PTR_GFP(lti, ll_thread_kmem, GFP_NOFS); + if (lti == NULL) + lti = ERR_PTR(-ENOMEM); + + return lti; +} + +static void ll_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct ll_thread_info *lti = data; + + OBD_SLAB_FREE_PTR(lti, ll_thread_kmem); +} + +struct lu_context_key ll_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = ll_thread_key_init, + .lct_fini = ll_thread_key_fini, +}; + +static void *vvp_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_session *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, vvp_session_kmem, GFP_NOFS); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void vvp_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_session *session = data; + OBD_SLAB_FREE_PTR(session, vvp_session_kmem); +} + +struct lu_context_key vvp_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = vvp_session_key_init, + .lct_fini = vvp_session_key_fini +}; + +static void *vvp_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_thread_info *vti; + + OBD_SLAB_ALLOC_PTR_GFP(vti, vvp_thread_kmem, GFP_NOFS); + if (vti == NULL) + vti = ERR_PTR(-ENOMEM); + return vti; +} + +static void vvp_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_thread_info *vti = data; + OBD_SLAB_FREE_PTR(vti, vvp_thread_kmem); +} + +struct lu_context_key vvp_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = vvp_thread_key_init, + .lct_fini = vvp_thread_key_fini, +}; + +/* type constructor/destructor: vvp_type_{init,fini,start,stop}(). 
*/ +LU_TYPE_INIT_FINI(vvp, &ll_thread_key, &vvp_session_key, &vvp_thread_key); + +static const struct lu_device_operations vvp_lu_ops = { + .ldo_object_alloc = vvp_object_alloc +}; + +static struct lu_device *vvp_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct vvp_device *vdv = lu2vvp_dev(d); + struct cl_site *site = lu2cl_site(d->ld_site); + struct lu_device *next = cl2lu_dev(vdv->vdv_next); + + if (d->ld_site != NULL) { + cl_site_fini(site); + OBD_FREE_PTR(site); + } + + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(vdv); + return next; +} + +static struct lu_device *vvp_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct vvp_device *vdv; + struct lu_device *lud; + struct cl_site *site; + int rc; + ENTRY; + + OBD_ALLOC_PTR(vdv); + if (vdv == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + lud = &vdv->vdv_cl.cd_lu_dev; + cl_device_init(&vdv->vdv_cl, t); + vvp2lu_dev(vdv)->ld_ops = &vvp_lu_ops; + + OBD_ALLOC_PTR(site); + if (site != NULL) { + rc = cl_site_init(site, &vdv->vdv_cl); + if (rc == 0) + rc = lu_site_init_finish(&site->cs_lu); + else { + LASSERT(lud->ld_site == NULL); + CERROR("Cannot init lu_site, rc %d.\n", rc); + OBD_FREE_PTR(site); + } + } else + rc = -ENOMEM; + if (rc != 0) { + vvp_device_free(env, lud); + lud = ERR_PTR(rc); + } + RETURN(lud); +} + +static int vvp_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct vvp_device *vdv; + int rc; + ENTRY; + + vdv = lu2vvp_dev(d); + vdv->vdv_next = lu2cl_dev(next); + + LASSERT(d->ld_site != NULL && next->ld_type != NULL); + next->ld_site = d->ld_site; + rc = next->ld_type->ldt_ops->ldto_device_init( + env, next, next->ld_type->ldt_name, NULL); + if (rc == 0) { + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + } + RETURN(rc); +} + +static struct lu_device *vvp_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return cl2lu_dev(lu2vvp_dev(d)->vdv_next); +} + +static const struct lu_device_type_operations vvp_device_type_ops = { + .ldto_init = vvp_type_init, + .ldto_fini = vvp_type_fini, + + .ldto_start = vvp_type_start, + .ldto_stop = vvp_type_stop, + + .ldto_device_alloc = vvp_device_alloc, + .ldto_device_free = vvp_device_free, + .ldto_device_init = vvp_device_init, + .ldto_device_fini = vvp_device_fini, +}; + +struct lu_device_type vvp_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_VVP_NAME, + .ldt_ops = &vvp_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +unsigned int (*vvp_account_page_dirtied)(struct page *page, + struct address_space *mapping); + +/** + * A mutex serializing calls to vvp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. + */ +int vvp_global_init(void) +{ + int rc; + + rc = lu_kmem_init(vvp_caches); + if (rc != 0) + return rc; + + rc = lu_device_type_init(&vvp_device_type); + if (rc != 0) + goto out_kmem; + +#ifndef HAVE_ACCOUNT_PAGE_DIRTIED_EXPORT +#ifdef HAVE_KALLSYMS_LOOKUP_NAME + /* + * Kernel v5.2-5678-gac1c3e4 no longer exports account_page_dirtied + */ + vvp_account_page_dirtied = (void *) + cfs_kallsyms_lookup_name("account_page_dirtied"); +#endif +#endif + + return 0; + +out_kmem: + lu_kmem_fini(vvp_caches); + + return rc; +} + +void vvp_global_fini(void) +{ + lu_device_type_fini(&vvp_device_type); + lu_kmem_fini(vvp_caches); +} + +/***************************************************************************** + * + * mirror obd-devices into cl devices. 
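+ *
+ * cl_sb_init() builds the client-side cl_device stack for a newly mounted
+ * super block, with vvp_device_type stacked on top of the data export's
+ * lu_device, and caches the result in ll_sb_info (ll_cl/ll_site);
+ * cl_sb_fini() tears that stack down again. Both are presumably called
+ * from the llite mount and umount paths.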
+ * + */ + +int cl_sb_init(struct super_block *sb) +{ + struct ll_sb_info *sbi; + struct cl_device *cl; + struct lu_env *env; + int rc = 0; + __u16 refcheck; + + sbi = ll_s2sbi(sb); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + cl = cl_type_setup(env, NULL, &vvp_device_type, + sbi->ll_dt_exp->exp_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + sbi->ll_cl = cl; + sbi->ll_site = cl2lu_dev(cl)->ld_site; + } + cl_env_put(env, &refcheck); + } else + rc = PTR_ERR(env); + RETURN(rc); +} + +int cl_sb_fini(struct super_block *sb) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + struct cl_device *cld; + __u16 refcheck; + int result; + + ENTRY; + sbi = ll_s2sbi(sb); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + cld = sbi->ll_cl; + + if (cld != NULL) { + cl_stack_fini(env, cld); + sbi->ll_cl = NULL; + sbi->ll_site = NULL; + } + cl_env_put(env, &refcheck); + result = 0; + } else { + CERROR("Cannot cleanup cl-stack due to memory shortage.\n"); + result = PTR_ERR(env); + } + + RETURN(result); +} + +/**************************************************************************** + * + * debugfs/lustre/llite/$MNT/dump_page_cache + * + ****************************************************************************/ + +struct vvp_seq_private { + struct ll_sb_info *vsp_sbi; + struct lu_env *vsp_env; + u16 vsp_refcheck; + struct cl_object *vsp_clob; + struct rhashtable_iter vsp_iter; + u32 vsp_page_index; + /* + * prev_pos is the 'pos' of the last object returned + * by ->start of ->next. + */ + loff_t vvp_prev_pos; +}; + +unsigned int ll_filemap_get_one_page_contig(struct address_space *mapping, + pgoff_t start, struct page **pg) +{ +#ifdef HAVE_FILEMAP_GET_FOLIOS_CONTIG + struct folio_batch fbatch; + int nr; + + folio_batch_init(&fbatch); + *pg = NULL; + + nr = filemap_get_folios_contig(mapping, &start, start, &fbatch); + if (nr == PAGEVEC_SIZE) { + --nr; + *pg = folio_page(fbatch.folios[nr], 0); + return 1; + } + return 0; +#else /* !HAVE_FILEMAP_GET_FOLIOS_CONTIG */ + return find_get_pages_contig(mapping, start, 1, pg); +#endif +} + +static struct page *vvp_pgcache_current(struct vvp_seq_private *priv) +{ + struct lu_device *dev = &priv->vsp_sbi->ll_cl->cd_lu_dev; + struct lu_object_header *h; + struct page *vmpage = NULL; + + rhashtable_walk_start(&priv->vsp_iter); + while ((h = rhashtable_walk_next(&priv->vsp_iter)) != NULL) { + struct inode *inode; + int nr; + + if (IS_ERR(h)) { + if (PTR_ERR(h) == -EAGAIN) + continue; + break; + } + + if (!priv->vsp_clob) { + struct lu_object *lu_obj; + + lu_obj = lu_object_get_first(h, dev); + if (!lu_obj) + continue; + + priv->vsp_clob = lu2cl(lu_obj); + lu_object_ref_add_atomic(lu_obj, "dump", current); + priv->vsp_page_index = 0; + } + + inode = vvp_object_inode(priv->vsp_clob); + nr = ll_filemap_get_one_page_contig(inode->i_mapping, + priv->vsp_page_index, + &vmpage); + if (nr > 0) { + priv->vsp_page_index = vmpage->index; + break; + } + lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", current); + cl_object_put(priv->vsp_env, priv->vsp_clob); + priv->vsp_clob = NULL; + priv->vsp_page_index = 0; + } + rhashtable_walk_stop(&priv->vsp_iter); + return vmpage; +} + +#define seq_page_flag(seq, page, flag, has_flags) do { \ + if (test_bit(PG_##flag, &(page)->flags)) { \ + seq_printf(seq, "%s"#flag, has_flags ? 
"|" : ""); \ + has_flags = 1; \ + } \ +} while(0) + +static void vvp_pgcache_page_show(const struct lu_env *env, + struct seq_file *seq, struct cl_page *page) +{ + struct vvp_page *vpg; + struct page *vmpage; + int has_flags; + + vpg = cl2vvp_page(cl_page_at(page, &vvp_device_type)); + vmpage = vpg->vpg_page; + seq_printf(seq, " %5i | %p %p %s %s %s | %p "DFID"(%p) %lu %u [", + 0 /* gen */, + vpg, page, + "none", + vpg->vpg_defer_uptodate ? "du" : "- ", + PageWriteback(vmpage) ? "wb" : "-", + vmpage, + PFID(ll_inode2fid(vmpage->mapping->host)), + vmpage->mapping->host, vmpage->index, + page_count(vmpage)); + has_flags = 0; + seq_page_flag(seq, vmpage, locked, has_flags); + seq_page_flag(seq, vmpage, error, has_flags); + seq_page_flag(seq, vmpage, referenced, has_flags); + seq_page_flag(seq, vmpage, uptodate, has_flags); + seq_page_flag(seq, vmpage, dirty, has_flags); + seq_page_flag(seq, vmpage, writeback, has_flags); + seq_printf(seq, "%s]\n", has_flags ? "" : "-"); +} + +static int vvp_pgcache_show(struct seq_file *f, void *v) +{ + struct vvp_seq_private *priv = f->private; + struct page *vmpage = v; + struct cl_page *page; + + seq_printf(f, "%8lx@" DFID ": ", vmpage->index, + PFID(lu_object_fid(&priv->vsp_clob->co_lu))); + lock_page(vmpage); + page = cl_vmpage_page(vmpage, priv->vsp_clob); + unlock_page(vmpage); + put_page(vmpage); + + if (page) { + vvp_pgcache_page_show(priv->vsp_env, f, page); + cl_page_put(priv->vsp_env, page); + } else { + seq_puts(f, "missing\n"); + } + + return 0; +} + +static void vvp_pgcache_rewind(struct vvp_seq_private *priv) +{ + if (priv->vvp_prev_pos) { + struct lu_site *s = priv->vsp_sbi->ll_cl->cd_lu_dev.ld_site; + + rhashtable_walk_exit(&priv->vsp_iter); + rhashtable_walk_enter(&s->ls_obj_hash, &priv->vsp_iter); + priv->vvp_prev_pos = 0; + if (priv->vsp_clob) { + lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", + current); + cl_object_put(priv->vsp_env, priv->vsp_clob); + } + priv->vsp_clob = NULL; + } +} + +static struct page *vvp_pgcache_next_page(struct vvp_seq_private *priv) +{ + priv->vsp_page_index += 1; + return vvp_pgcache_current(priv); +} + +static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) +{ + struct vvp_seq_private *priv = f->private; + + if (*pos == 0) { + vvp_pgcache_rewind(priv); + } else if (*pos == priv->vvp_prev_pos) { + /* Return the current item */; + } else { + WARN_ON(*pos != priv->vvp_prev_pos + 1); + priv->vsp_page_index += 1; + } + + priv->vvp_prev_pos = *pos; + return vvp_pgcache_current(priv); +} + +static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos) +{ + struct vvp_seq_private *priv = f->private; + + WARN_ON(*pos != priv->vvp_prev_pos); + *pos += 1; + priv->vvp_prev_pos = *pos; + return vvp_pgcache_next_page(priv); +} + +static void vvp_pgcache_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations vvp_pgcache_ops = { + .start = vvp_pgcache_start, + .next = vvp_pgcache_next, + .stop = vvp_pgcache_stop, + .show = vvp_pgcache_show +}; + +static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) +{ + struct vvp_seq_private *priv; + struct lu_site *s; + + priv = __seq_open_private(filp, &vvp_pgcache_ops, sizeof(*priv)); + if (!priv) + return -ENOMEM; + + priv->vsp_sbi = inode->i_private; + priv->vsp_env = cl_env_get(&priv->vsp_refcheck); + priv->vsp_clob = NULL; + if (IS_ERR(priv->vsp_env)) { + int err = PTR_ERR(priv->vsp_env); + + seq_release_private(inode, filp); + return err; + } + + s = priv->vsp_sbi->ll_cl->cd_lu_dev.ld_site; + 
rhashtable_walk_enter(&s->ls_obj_hash, &priv->vsp_iter); + + return 0; +} + +static int vvp_dump_pgcache_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct vvp_seq_private *priv = seq->private; + + if (priv->vsp_clob) { + lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", current); + cl_object_put(priv->vsp_env, priv->vsp_clob); + } + cl_env_put(priv->vsp_env, &priv->vsp_refcheck); + rhashtable_walk_exit(&priv->vsp_iter); + return seq_release_private(inode, file); +} + +const struct file_operations vvp_dump_pgcache_file_ops = { + .owner = THIS_MODULE, + .open = vvp_dump_pgcache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = vvp_dump_pgcache_seq_release, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h new file mode 100644 index 0000000000000..1511c320522d8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h @@ -0,0 +1,311 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Internal definitions for VVP layer. + * + * Author: Nikita Danilov + */ + +#ifndef VVP_INTERNAL_H +#define VVP_INTERNAL_H + +#include + +enum obd_notify_event; +struct inode; +struct lustre_md; +struct obd_device; +struct obd_export; +struct page; + +/** + * IO state private to VVP layer. + */ +struct vvp_io { + /** super class */ + struct cl_io_slice vui_cl; + struct cl_io_lock_link vui_link; + /** + * I/O vector information to or from which read/write is going. + */ + struct iov_iter *vui_iter; + /** + * Total size for the left IO. + */ + size_t vui_tot_count; + + union { + struct vvp_fault_io { + struct vm_area_struct *ft_vma; + /** + * locked page returned from vvp_io + */ + struct page *ft_vmpage; + /** + * kernel fault info + */ + struct vm_fault *ft_vmf; + /** + * fault API used bitflags for return code. + */ + unsigned int ft_flags; + /** + * check that flags are from filemap_fault + */ + bool ft_flags_valid; + struct cl_page_list ft_queue; + } fault; + struct { + struct cl_page_list vui_queue; + unsigned long vui_written; + unsigned long vui_read; + int vui_from; + int vui_to; + } readwrite; /* normal io */ + } u; + + /** + * Layout version when this IO is initialized + */ + __u32 vui_layout_gen; + /** + * File descriptor against which IO is done. + */ + struct ll_file_data *vui_fd; + struct kiocb *vui_iocb; + + /* Readahead state. 
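+ * The window below is filled in once per read syscall by
+ * vvp_io_read_start(): vui_ra_start_idx is the file page index where the
+ * read begins, vui_ra_pages is the number of pages the request spans, and
+ * vui_ra_valid records that both have been initialized.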
*/ + pgoff_t vui_ra_start_idx; + pgoff_t vui_ra_pages; + /* Set when vui_ra_{start,count} have been initialized. */ + bool vui_ra_valid; +}; + +extern struct lu_device_type vvp_device_type; + +extern struct lu_context_key vvp_session_key; +extern struct lu_context_key vvp_thread_key; + +extern struct kmem_cache *vvp_object_kmem; + +struct vvp_thread_info { + struct cl_lock vti_lock; + struct cl_lock_descr vti_descr; + struct cl_io vti_io; + struct cl_attr vti_attr; + struct cl_sync_io vti_anchor; +}; + +static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env) +{ + struct vvp_thread_info *vti; + + vti = lu_context_key_get(&env->le_ctx, &vvp_thread_key); + LASSERT(vti != NULL); + + return vti; +} + +static inline struct cl_lock *vvp_env_lock(const struct lu_env *env) +{ + struct cl_lock *lock = &vvp_env_info(env)->vti_lock; + + memset(lock, 0, sizeof(*lock)); + + return lock; +} + +static inline struct cl_attr *vvp_env_thread_attr(const struct lu_env *env) +{ + struct cl_attr *attr = &vvp_env_info(env)->vti_attr; + + memset(attr, 0, sizeof(*attr)); + + return attr; +} + +static inline struct cl_io *vvp_env_thread_io(const struct lu_env *env) +{ + struct cl_io *io = &vvp_env_info(env)->vti_io; + + memset(io, 0, sizeof(*io)); + + return io; +} + +struct vvp_session { + struct vvp_io vs_ios; +}; + +static inline struct vvp_session *vvp_env_session(const struct lu_env *env) +{ + struct vvp_session *ses; + + ses = lu_context_key_get(env->le_ses, &vvp_session_key); + LASSERT(ses != NULL); + + return ses; +} + +static inline struct vvp_io *vvp_env_io(const struct lu_env *env) +{ + return &vvp_env_session(env)->vs_ios; +} + +/** + * VPP-private object state. + */ +struct vvp_object { + struct cl_object_header vob_header; + struct cl_object vob_cl; + struct inode *vob_inode; + + /** + * Number of outstanding mmaps on this file. + * + * \see ll_vm_open(), ll_vm_close(). + */ + atomic_t vob_mmap_cnt; + + /** + * various flags + * vob_discard_page_warned + * if pages belonging to this object are discarded when a client + * is evicted, some debug info will be printed, this flag will be set + * during processing the first discarded page, then avoid flooding + * debug message for lots of discarded pages. + * + * \see ll_dirty_page_discard_warn. + */ + unsigned int vob_discard_page_warned:1; +}; + +/** + * VVP-private page state. 
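+ *
+ * This is the VVP slice of a cl_page: vpg_page points at the backing VM
+ * page, while the vpg_defer_uptodate/vpg_ra_updated/vpg_ra_used bits track
+ * read-ahead handling for the page.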
+ */ +struct vvp_page { + struct cl_page_slice vpg_cl; + unsigned vpg_defer_uptodate:1, + vpg_ra_updated:1, + vpg_ra_used:1; + /** VM page */ + struct page *vpg_page; +}; + +static inline struct vvp_page *cl2vvp_page(const struct cl_page_slice *slice) +{ + return container_of(slice, struct vvp_page, vpg_cl); +} + +static inline pgoff_t vvp_index(struct vvp_page *vpg) +{ + return vpg->vpg_page->index; +} + +struct vvp_device { + struct cl_device vdv_cl; + struct cl_device *vdv_next; +}; + +static inline struct lu_device *vvp2lu_dev(struct vvp_device *vdv) +{ + return &vdv->vdv_cl.cd_lu_dev; +} + +static inline struct vvp_device *lu2vvp_dev(const struct lu_device *d) +{ + return container_of_safe(d, struct vvp_device, vdv_cl.cd_lu_dev); +} + +static inline struct vvp_device *cl2vvp_dev(const struct cl_device *d) +{ + return container_of_safe(d, struct vvp_device, vdv_cl); +} + +static inline struct vvp_object *cl2vvp(const struct cl_object *obj) +{ + return container_of_safe(obj, struct vvp_object, vob_cl); +} + +static inline struct vvp_object *lu2vvp(const struct lu_object *obj) +{ + return container_of_safe(obj, struct vvp_object, vob_cl.co_lu); +} + +static inline struct inode *vvp_object_inode(const struct cl_object *obj) +{ + return cl2vvp(obj)->vob_inode; +} + +int vvp_object_invariant(const struct cl_object *obj); +struct vvp_object *cl_inode2vvp(struct inode *inode); + +static inline struct page *cl2vm_page(const struct cl_page_slice *slice) +{ + return cl2vvp_page(slice)->vpg_page; +} + +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK +# define CLOBINVRNT(env, clob, expr) \ + do { \ + if (unlikely(!(expr))) { \ + LU_OBJECT_DEBUG(D_ERROR, (env), &(clob)->co_lu, \ + #expr); \ + LINVRNT(0); \ + } \ + } while (0) +#else /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ +# define CLOBINVRNT(env, clob, expr) \ + ((void)sizeof(env), (void)sizeof(clob), (void)sizeof !!(expr)) +#endif /* CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ + +int lov_read_and_clear_async_rc(struct cl_object *clob); + +int vvp_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); +int vvp_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +int vvp_global_init(void); +void vvp_global_fini(void); + +#if !defined(HAVE_ACCOUNT_PAGE_DIRTIED_EXPORT) || \ +defined(HAVE_KALLSYMS_LOOKUP_NAME) +extern unsigned int (*vvp_account_page_dirtied)(struct page *page, + struct address_space *mapping); +#endif + +extern const struct file_operations vvp_dump_pgcache_file_ops; + +#endif /* VVP_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c new file mode 100644 index 0000000000000..37421981c2e10 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c @@ -0,0 +1,1853 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_io for VVP layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include + +#include "llite_internal.h" +#include "vvp_internal.h" +#include + +static struct vvp_io *cl2vvp_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct vvp_io *vio; + + vio = container_of(slice, struct vvp_io, vui_cl); + LASSERT(vio == vvp_env_io(env)); + + return vio; +} + +/** + * For swapping layout. The file's layout may have changed. + * To avoid populating pages to a wrong stripe, we have to verify the + * correctness of layout. It works because swapping layout processes + * have to acquire group lock. + */ +static bool can_populate_pages(const struct lu_env *env, struct cl_io *io, + struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct vvp_io *vio = vvp_env_io(env); + bool rc = true; + + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + /* don't need lock here to check lli_layout_gen as we have held + * extent lock and GROUP lock has to hold to swap layout */ + if (ll_layout_version_get(lli) != vio->vui_layout_gen || + OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_LOST_LAYOUT, 0)) { + io->ci_need_restart = 1; + /* this will cause a short read/write */ + io->ci_continue = 0; + rc = false; + } + case CIT_FAULT: + /* fault is okay because we've already had a page. */ + default: + break; + } + + return rc; +} + +static void vvp_object_size_lock(struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + + ll_inode_size_lock(inode); + cl_object_attr_lock(obj); +} + +static void vvp_object_size_unlock(struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + + cl_object_attr_unlock(obj); + ll_inode_size_unlock(inode); +} + +/** + * Helper function that if necessary adjusts file size (inode->i_size), when + * position at the offset \a pos is accessed. File size can be arbitrary stale + * on a Lustre client, but client at least knows KMS. If accessed area is + * inside [0, KMS], set file size to KMS, otherwise glimpse file size. + * + * Locking: i_size_lock is used to serialize changes to inode size and to + * protect consistency between inode size and cl_object + * attributes. cl_object_size_lock() protects consistency between cl_attr's of + * top-object and sub-objects. 
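+ *
+ * Returns 0 on success or a negative errno. When \a exceed is non-NULL it
+ * is set if the accessed range starts beyond the last page of the file, so
+ * that the caller can cut the read short instead of relying on the kernel
+ * to notice the out-of-range page index.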
+ */ +static int vvp_prep_size(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, loff_t start, size_t count, + int *exceed) +{ + struct cl_attr *attr = vvp_env_thread_attr(env); + struct inode *inode = vvp_object_inode(obj); + loff_t pos = start + count - 1; + loff_t kms; + int result; + + /* + * Consistency guarantees: following possibilities exist for the + * relation between region being accessed and real file size at this + * moment: + * + * (A): the region is completely inside of the file; + * + * (B-x): x bytes of region are inside of the file, the rest is + * outside; + * + * (C): the region is completely outside of the file. + * + * This classification is stable under DLM lock already acquired by + * the caller, because to change the class, other client has to take + * DLM lock conflicting with our lock. Also, any updates to ->i_size + * by other threads on this client are serialized by + * ll_inode_size_lock(). This guarantees that short reads are handled + * correctly in the face of concurrent writes and truncates. + */ + vvp_object_size_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + kms = attr->cat_kms; + if (pos > kms) { + /* + * A glimpse is necessary to determine whether we + * return a short read (B) or some zeroes at the end + * of the buffer (C) + */ + vvp_object_size_unlock(obj); + result = cl_glimpse_lock(env, io, inode, obj, 0); + if (result == 0 && exceed != NULL) { + /* If objective page index exceed end-of-file + * page index, return directly. Do not expect + * kernel will check such case correctly. + * linux-2.6.18-128.1.1 miss to do that. + * --bug 17336 */ + loff_t size = i_size_read(inode); + unsigned long cur_index = start >> + PAGE_SHIFT; + + if ((size == 0 && cur_index != 0) || + (((size - 1) >> PAGE_SHIFT) < + cur_index)) + *exceed = 1; + } + + return result; + } else { + /* + * region is within kms and, hence, within real file + * size (A). We need to increase i_size to cover the + * read region so that generic_file_read() will do its + * job, but that doesn't mean the kms size is + * _correct_, it is only the _minimum_ size. If + * someone does a stat they will get the correct size + * which will always be >= the kms value here. + * b=11081 + */ + if (i_size_read(inode) < kms) { + i_size_write(inode, kms); + CDEBUG(D_VFSTRACE, + DFID" updating i_size %llu\n", + PFID(lu_object_fid(&obj->co_lu)), + (__u64)i_size_read(inode)); + } + } + } + + vvp_object_size_unlock(obj); + + return result; +} + +/***************************************************************************** + * + * io operations. 
+ * + */ + +static int vvp_io_one_lock_index(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + pgoff_t start, pgoff_t end) +{ + struct vvp_io *vio = vvp_env_io(env); + struct cl_lock_descr *descr = &vio->vui_link.cill_descr; + struct cl_object *obj = io->ci_obj; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + ENTRY; + + CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end); + + memset(&vio->vui_link, 0, sizeof vio->vui_link); + + if (vio->vui_fd && (vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + descr->cld_mode = CLM_GROUP; + descr->cld_gid = vio->vui_fd->fd_grouplock.lg_gid; + enqflags |= CEF_LOCK_MATCH; + } else { + descr->cld_mode = mode; + } + + descr->cld_obj = obj; + descr->cld_start = start; + descr->cld_end = end; + descr->cld_enq_flags = enqflags; + + cl_io_lock_add(env, io, &vio->vui_link); + + RETURN(0); +} + +static int vvp_io_one_lock(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + loff_t start, loff_t end) +{ + struct cl_object *obj = io->ci_obj; + + return vvp_io_one_lock_index(env, io, enqflags, mode, + cl_index(obj, start), cl_index(obj, end)); +} + +static int vvp_io_write_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + + cl_page_list_init(&vio->u.readwrite.vui_queue); + vio->u.readwrite.vui_written = 0; + vio->u.readwrite.vui_from = 0; + vio->u.readwrite.vui_to = PAGE_SIZE; + + return 0; +} + +static int vvp_io_read_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + + vio->u.readwrite.vui_read = 0; + + return 0; +} + +static void vvp_io_write_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + + LASSERT(vio->u.readwrite.vui_queue.pl_nr == 0); +} + +static int vvp_io_fault_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct inode *inode = vvp_object_inode(ios->cis_obj); + + LASSERT(inode == file_inode(vio->vui_fd->fd_file)); + + return 0; +} + +static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct vvp_io *vio = cl2vvp_io(env, ios); + struct inode *inode = vvp_object_inode(obj); + __u32 gen = 0; + int rc; + ENTRY; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d " + "need write layout %d, restore needed %d, invalidate_lock %d\n", + PFID(lu_object_fid(&obj->co_lu)), + io->ci_ignore_layout, io->ci_verify_layout, + vio->vui_layout_gen, io->ci_need_write_intent, + io->ci_restore_needed, io->ci_invalidate_page_cache); + +#ifdef HAVE_INVALIDATE_LOCK + if (io->ci_invalidate_page_cache) { + filemap_invalidate_unlock(inode->i_mapping); + io->ci_invalidate_page_cache = 0; + } +#endif /* HAVE_INVALIDATE_LOCK */ + + if (io->ci_restore_needed) { + /* file was detected release, we need to restore it + * before finishing the io + */ + rc = ll_layout_restore(inode, 0, OBD_OBJECT_EOF); + /* if restore registration failed, no restart, + * we will return -ENODATA */ + /* The layout will change after restore, so we need to + * block on layout lock held by the MDT + * as MDT will not send new layout in lvb (see LU-3124) + * we have to explicitly fetch it, all this will be done + * by ll_layout_refresh(). 
+ * Even if ll_layout_restore() returns zero, it doesn't mean + * that restore has been successful. Therefore it sets + * ci_verify_layout so that it will check layout at the end + * of this function. + */ + if (rc) { + io->ci_restore_needed = 1; + io->ci_need_restart = 0; + io->ci_verify_layout = 0; + io->ci_result = rc; + GOTO(out, rc); + } + + io->ci_restore_needed = 0; + + /* Even if ll_layout_restore() returns zero, it doesn't mean + * that restore has been successful. Therefore it should verify + * if there was layout change and restart I/O correspondingly. + */ + ll_layout_refresh(inode, &gen); + io->ci_need_restart = vio->vui_layout_gen != gen; + if (io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + DFID" layout changed from %d to %d.\n", + PFID(lu_object_fid(&obj->co_lu)), + vio->vui_layout_gen, gen); + /* today successful restore is the only possible + * case */ + /* restore was done, clear restoring state */ + clear_bit(LLIF_FILE_RESTORING, + &ll_i2info(vvp_object_inode(obj))->lli_flags); + } + GOTO(out, 0); + } + + /** + * dynamic layout change needed, send layout intent + * RPC. + */ + if (io->ci_need_write_intent) { + enum layout_intent_opc opc = LAYOUT_INTENT_WRITE; + + io->ci_need_write_intent = 0; + + LASSERT(io->ci_type == CIT_WRITE || cl_io_is_fallocate(io) || + cl_io_is_trunc(io) || cl_io_is_mkwrite(io)); + + CDEBUG(D_VFSTRACE, DFID" write layout, type %u "DEXT"\n", + PFID(lu_object_fid(&obj->co_lu)), io->ci_type, + PEXT(&io->ci_write_intent)); + + if (cl_io_is_trunc(io)) + opc = LAYOUT_INTENT_TRUNC; + + rc = ll_layout_write_intent(inode, opc, &io->ci_write_intent); + io->ci_result = rc; + if (!rc) + io->ci_need_restart = 1; + GOTO(out, rc); + } + + if (!io->ci_need_restart && + !io->ci_ignore_layout && io->ci_verify_layout) { + /* check layout version */ + ll_layout_refresh(inode, &gen); + io->ci_need_restart = vio->vui_layout_gen != gen; + if (io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + DFID" layout changed from %d to %d.\n", + PFID(lu_object_fid(&obj->co_lu)), + vio->vui_layout_gen, gen); + } + GOTO(out, 0); + } +out: + EXIT; +} + +static void vvp_io_fault_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_page *page = io->u.ci_fault.ft_page; + + CLOBINVRNT(env, io->ci_obj, vvp_object_invariant(io->ci_obj)); + + if (page != NULL) { + lu_ref_del(&page->cp_reference, "fault", io); + cl_page_put(env, page); + io->u.ci_fault.ft_page = NULL; + } + vvp_io_fini(env, ios); +} + +static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma) +{ + /* + * we only want to hold PW locks if the mmap() can generate + * writes back to the file and that only happens in shared + * writable vmas + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return CLM_WRITE; + return CLM_READ; +} + +static int vvp_mmap_locks(const struct lu_env *env, + struct vvp_io *vio, struct cl_io *io) +{ + struct vvp_thread_info *vti = vvp_env_info(env); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct cl_lock_descr *descr = &vti->vti_descr; + union ldlm_policy_data policy; + struct iovec iov; + struct iov_iter i; + unsigned long addr; + ssize_t count; + int result = 0; + ENTRY; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + /* nfs or loop back device write */ + if (vio->vui_iter == NULL) + RETURN(0); + + /* No MM (e.g. NFS)? No vmas too. 
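+ * Kernel threads issuing I/O on behalf of a client run with
+ * current->mm == NULL, so there are no user-space mappings that could
+ * require additional cl locks here.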
*/ + if (mm == NULL) + RETURN(0); + + if (!iter_is_iovec(vio->vui_iter) && !iov_iter_is_kvec(vio->vui_iter)) + RETURN(0); + + for (i = *vio->vui_iter; + iov_iter_count(&i); + iov_iter_advance(&i, iov.iov_len)) { + iov = iov_iter_iovec(&i); + addr = (unsigned long)iov.iov_base; + count = iov.iov_len; + + if (count == 0) + continue; + + count += addr & ~PAGE_MASK; + addr &= PAGE_MASK; + + mmap_read_lock(mm); + while ((vma = our_vma(mm, addr, count)) != NULL) { + struct dentry *de = file_dentry(vma->vm_file); + struct inode *inode = de->d_inode; + int flags = CEF_MUST; + + if (ll_file_nolock(vma->vm_file)) { + /* + * For no lock case is not allowed for mmap + */ + result = -EINVAL; + break; + } + + /* + * XXX: Required lock mode can be weakened: CIT_WRITE + * io only ever reads user level buffer, and CIT_READ + * only writes on it. + */ + policy_from_vma(&policy, vma, addr, count); + descr->cld_mode = vvp_mode_from_vma(vma); + descr->cld_obj = ll_i2info(inode)->lli_clob; + descr->cld_start = cl_index(descr->cld_obj, + policy.l_extent.start); + descr->cld_end = cl_index(descr->cld_obj, + policy.l_extent.end); + descr->cld_enq_flags = flags; + result = cl_io_lock_alloc_add(env, io, descr); + + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + descr->cld_mode, descr->cld_start, + descr->cld_end); + + if (result < 0) + break; + + if (vma->vm_end - addr >= count) + break; + + count -= vma->vm_end - addr; + addr = vma->vm_end; + } + mmap_read_unlock(mm); + if (result < 0) + break; + } + RETURN(result); +} + +static void vvp_io_advance(const struct lu_env *env, + const struct cl_io_slice *ios, + size_t nob) +{ + struct cl_object *obj = ios->cis_io->ci_obj; + struct vvp_io *vio = cl2vvp_io(env, ios); + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + /* + * Since 3.16(26978b8b4) vfs revert iov iter to + * original position even io succeed, so instead + * of relying on VFS, we move iov iter by ourselves. 
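+ * The iov_iter_advance()/iov_iter_reexpand() pair below consumes the nob
+ * bytes just transferred and then restores the iterator's capacity to the
+ * remaining vui_tot_count, presumably so that the next chunk of the same
+ * cl_io sees a correctly sized iterator.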
+ */ + iov_iter_advance(vio->vui_iter, nob); + CDEBUG(D_VFSTRACE, "advancing %ld bytes\n", nob); + vio->vui_tot_count -= nob; + iov_iter_reexpand(vio->vui_iter, vio->vui_tot_count); +} + +static void vvp_io_update_iov(const struct lu_env *env, + struct vvp_io *vio, struct cl_io *io) +{ + size_t size = io->u.ci_rw.crw_count; + + if (!vio->vui_iter) + return; + + iov_iter_truncate(vio->vui_iter, size); +} + +static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io, + enum cl_lock_mode mode, loff_t start, loff_t end) +{ + struct vvp_io *vio = vvp_env_io(env); + int result; + int ast_flags = 0; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + ENTRY; + + vvp_io_update_iov(env, vio, io); + + if (io->u.ci_rw.crw_nonblock) + ast_flags |= CEF_NONBLOCK; + if (io->ci_lock_no_expand) + ast_flags |= CEF_LOCK_NO_EXPAND; + if (vio->vui_fd) { + /* Group lock held means no lockless any more */ + if (vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED) + io->ci_dio_lock = 1; + + if (ll_file_nolock(vio->vui_fd->fd_file) || + (vio->vui_fd->fd_file->f_flags & O_DIRECT && + !io->ci_dio_lock)) + ast_flags |= CEF_NEVER; + } + + result = vvp_mmap_locks(env, vio, io); + if (result == 0) + result = vvp_io_one_lock(env, io, ast_flags, mode, start, end); + + RETURN(result); +} + +static int vvp_io_read_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_io_rw_common *rd = &io->u.ci_rd.rd; + int result; + + ENTRY; + result = vvp_io_rw_lock(env, io, CLM_READ, rd->crw_pos, + rd->crw_pos + rd->crw_count - 1); + RETURN(result); +} + +static int vvp_io_fault_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct vvp_io *vio = cl2vvp_io(env, ios); + /* + * XXX LDLM_FL_CBPENDING + */ + return vvp_io_one_lock_index(env, + io, 0, + vvp_mode_from_vma(vio->u.fault.ft_vma), + io->u.ci_fault.ft_index, + io->u.ci_fault.ft_index); +} + +static int vvp_io_write_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + loff_t start; + loff_t end; + + if (io->u.ci_wr.wr_append) { + start = 0; + end = OBD_OBJECT_EOF; + } else { + start = io->u.ci_wr.wr.crw_pos; + end = start + io->u.ci_wr.wr.crw_count - 1; + } + + RETURN(vvp_io_rw_lock(env, io, CLM_WRITE, start, end)); +} + +static int vvp_io_setattr_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) + +{ + return 0; +} + +/** + * Implementation of cl_io_operations::cio_lock() method for CIT_SETATTR io. + * + * Handles "lockless io" mode when extent locking is done by server. 
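+ *
+ * In short: truncate takes a whole-file CLM_WRITE lock (CEF_MUST for
+ * encrypted regular files, CEF_DISCARD_DATA when truncating to zero),
+ * fallocate locks only the affected byte range, and pure timestamp updates
+ * take the lock only when a new atime/mtime is older than the new ctime.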
+ */ +static int vvp_io_setattr_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + __u64 lock_start = 0; + __u64 lock_end = OBD_OBJECT_EOF; + __u32 enqflags = 0; + + if (cl_io_is_trunc(io)) { + struct inode *inode = vvp_object_inode(io->ci_obj); + + /* set enqueue flags to CEF_MUST in case of encrypted file, + * to prevent lockless truncate + */ + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) + enqflags = CEF_MUST; + else if (io->u.ci_setattr.sa_attr.lvb_size == 0) + enqflags = CEF_DISCARD_DATA; + } else if (cl_io_is_fallocate(io)) { + lock_start = io->u.ci_setattr.sa_falloc_offset; + lock_end = io->u.ci_setattr.sa_falloc_end - 1; + } else { + unsigned int valid = io->u.ci_setattr.sa_avalid; + + if (!(valid & TIMES_SET_FLAGS)) + return 0; + + if ((!(valid & ATTR_MTIME) || + io->u.ci_setattr.sa_attr.lvb_mtime >= + io->u.ci_setattr.sa_attr.lvb_ctime) && + (!(valid & ATTR_ATIME) || + io->u.ci_setattr.sa_attr.lvb_atime >= + io->u.ci_setattr.sa_attr.lvb_ctime)) + return 0; + } + + return vvp_io_one_lock(env, io, enqflags, CLM_WRITE, + lock_start, lock_end); +} + +static int vvp_do_vmtruncate(struct inode *inode, size_t size) +{ + int result; + + /* + * Only ll_inode_size_lock is taken at this level. + */ + ll_inode_size_lock(inode); + result = inode_newsize_ok(inode, size); + if (result < 0) { + ll_inode_size_unlock(inode); + return result; + } + i_size_write(inode, size); + + ll_truncate_pagecache(inode, size); + ll_inode_size_unlock(inode); + return result; +} + +static int vvp_io_setattr_time(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct cl_attr *attr = vvp_env_thread_attr(env); + int result; + unsigned valid = CAT_CTIME; + + cl_object_attr_lock(obj); + attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; + if (io->u.ci_setattr.sa_avalid & ATTR_ATIME_SET) { + attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; + valid |= CAT_ATIME; + } + if (io->u.ci_setattr.sa_avalid & ATTR_MTIME_SET) { + attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; + valid |= CAT_MTIME; + } + result = cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + + return result; +} + +static int vvp_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = vvp_object_inode(io->ci_obj); + struct ll_inode_info *lli = ll_i2info(inode); + int mode = io->u.ci_setattr.sa_falloc_mode; + + if (cl_io_is_trunc(io)) { + trunc_sem_down_write(&lli->lli_trunc_sem); + mutex_lock(&lli->lli_setattr_mutex); + inode_dio_wait(inode); + } else if (cl_io_is_fallocate(io)) { + loff_t size; + + trunc_sem_down_write(&lli->lli_trunc_sem); + mutex_lock(&lli->lli_setattr_mutex); + inode_dio_wait(inode); + + ll_merge_attr(env, inode); + size = i_size_read(inode); + if (io->u.ci_setattr.sa_falloc_end > size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + size = io->u.ci_setattr.sa_falloc_end; + io->u.ci_setattr.sa_avalid |= ATTR_SIZE; + } + io->u.ci_setattr.sa_attr.lvb_size = size; + } else { + mutex_lock(&lli->lli_setattr_mutex); + } + + if (io->u.ci_setattr.sa_avalid & TIMES_SET_FLAGS) + return vvp_io_setattr_time(env, ios); + + return 0; +} + +static void vvp_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = vvp_object_inode(io->ci_obj); + struct ll_inode_info *lli = ll_i2info(inode); + + if (cl_io_is_trunc(io)) { + 
/* Truncate in memory pages - they must be clean pages + * because osc has already notified to destroy osc_extents. */ + vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size); + mutex_unlock(&lli->lli_setattr_mutex); + trunc_sem_up_write(&lli->lli_trunc_sem); + } else if (cl_io_is_fallocate(io)) { + mutex_unlock(&lli->lli_setattr_mutex); + trunc_sem_up_write(&lli->lli_trunc_sem); + } else { + mutex_unlock(&lli->lli_setattr_mutex); + } +} + +static void vvp_io_setattr_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + bool restore_needed = ios->cis_io->ci_restore_needed; + struct inode *inode = vvp_object_inode(ios->cis_obj); + + vvp_io_fini(env, ios); + + if (restore_needed && !ios->cis_io->ci_restore_needed) { + /* restore finished, set data modified flag for HSM */ + set_bit(LLIF_DATA_MODIFIED, &ll_i2info(inode)->lli_flags); + } +} + +static int vvp_io_read_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = vvp_object_inode(obj); + struct ll_inode_info *lli = ll_i2info(inode); + struct file *file = vio->vui_fd->fd_file; + loff_t pos = io->u.ci_rd.rd.crw_pos; + size_t cnt = io->u.ci_rd.rd.crw_count; + size_t tot = vio->vui_tot_count; + struct ll_cl_context *lcc; + unsigned int seq; + int exceed = 0; + int result; + int total_bytes_read = 0; + struct iov_iter iter; + pgoff_t page_offset; + + ENTRY; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, "%s: read [%llu, %llu)\n", + file_dentry(file)->d_name.name, + pos, pos + cnt); + + trunc_sem_down_read(&lli->lli_trunc_sem); + + if (io->ci_async_readahead) { + file_accessed(file); + RETURN(0); + } + + if (!can_populate_pages(env, io, inode)) + RETURN(0); + + if (!(file->f_flags & O_DIRECT)) { + result = cl_io_lru_reserve(env, io, pos, cnt); + if (result) + RETURN(result); + } + + /* Unless this is reading a sparse file, otherwise the lock has already + * been acquired so vvp_prep_size() is an empty op. 
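+ * If exceed comes back set, the read starts past end-of-file and we bail
+ * out early with a zero-byte short read instead of calling into
+ * generic_file_read_iter().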
*/ + result = vvp_prep_size(env, obj, io, pos, cnt, &exceed); + if (result != 0) + RETURN(result); + else if (exceed != 0) + GOTO(out, result); + + LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, + "Read ino %lu, %zu bytes, offset %lld, size %llu\n", + inode->i_ino, cnt, pos, i_size_read(inode)); + + /* initialize read-ahead window once per syscall */ + if (!vio->vui_ra_valid) { + vio->vui_ra_valid = true; + vio->vui_ra_start_idx = cl_index(obj, pos); + vio->vui_ra_pages = 0; + page_offset = pos & ~PAGE_MASK; + if (page_offset) { + vio->vui_ra_pages++; + if (tot > PAGE_SIZE - page_offset) + tot -= (PAGE_SIZE - page_offset); + else + tot = 0; + } + vio->vui_ra_pages += (tot + PAGE_SIZE - 1) >> PAGE_SHIFT; + + CDEBUG(D_READA, "tot %zu, ra_start %lu, ra_count %lu\n", + vio->vui_tot_count, vio->vui_ra_start_idx, + vio->vui_ra_pages); + } + + /* BUG: 5972 */ + file_accessed(file); + LASSERT(vio->vui_iocb->ki_pos == pos); + iter = *vio->vui_iter; + + lcc = ll_cl_find(inode); + lcc->lcc_end_index = DIV_ROUND_UP(pos + iter.count, PAGE_SIZE); + CDEBUG(D_VFSTRACE, "count:%ld iocb pos:%lld\n", iter.count, pos); + + /* this seqlock lets us notice if a page has been deleted on this inode + * during the fault process, allowing us to catch an erroneous short + * read or EIO + * See LU-16160 + */ + do { + seq = read_seqbegin(&ll_i2info(inode)->lli_page_inv_lock); + result = generic_file_read_iter(vio->vui_iocb, &iter); + if (result >= 0) { + io->ci_nob += result; + total_bytes_read += result; + } + /* if we got a short read or -EIO and we raced with page invalidation, + * retry + */ + } while (read_seqretry(&ll_i2info(inode)->lli_page_inv_lock, seq) && + ((result >= 0 && iov_iter_count(&iter) > 0) + || result == -EIO)); + +out: + if (result >= 0) { + if (total_bytes_read < cnt) + io->ci_continue = 0; + result = 0; + } else if (result == -EIOCBQUEUED) { + io->ci_nob += vio->u.readwrite.vui_read; + vio->vui_iocb->ki_pos = pos + vio->u.readwrite.vui_read; + } + + return result; +} + +static int vvp_io_commit_sync(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist, int from, int to) +{ + struct cl_2queue *queue = &io->ci_queue; + struct cl_page *page; + unsigned int bytes = 0; + int rc = 0; + ENTRY; + + if (plist->pl_nr == 0) + RETURN(0); + + if (from > 0 || to != PAGE_SIZE) { + page = cl_page_list_first(plist); + if (plist->pl_nr == 1) { + cl_page_clip(env, page, from, to); + } else { + if (from > 0) + cl_page_clip(env, page, from, PAGE_SIZE); + if (to != PAGE_SIZE) { + page = cl_page_list_last(plist); + cl_page_clip(env, page, 0, to); + } + } + } + + cl_2queue_init(queue); + cl_page_list_splice(plist, &queue->c2_qin); + rc = cl_io_submit_sync(env, io, CRT_WRITE, queue, 0); + + /* plist is not sorted any more */ + cl_page_list_splice(&queue->c2_qin, plist); + cl_page_list_splice(&queue->c2_qout, plist); + cl_2queue_fini(env, queue); + + if (rc == 0) { + /* calculate bytes */ + bytes = plist->pl_nr << PAGE_SHIFT; + bytes -= from + PAGE_SIZE - to; + + while (plist->pl_nr > 0) { + page = cl_page_list_first(plist); + cl_page_list_del(env, plist, page); + + cl_page_clip(env, page, 0, PAGE_SIZE); + + SetPageUptodate(cl_page_vmpage(page)); + cl_page_disown(env, io, page); + + /* held in ll_cl_init() */ + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + } + + RETURN(bytes > 0 ? 
bytes : rc); +} + +/* + * From kernel v4.19-rc5-248-g9b89a0355144 use XArrary + * Prior kernels use radix_tree for tags + */ +static inline void ll_page_tag_dirty(struct page *page, + struct address_space *mapping) +{ +#ifndef HAVE_RADIX_TREE_TAG_SET + __xa_set_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); +#else + radix_tree_tag_set(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); +#endif +} + +/* + * Kernels 4.2 - 4.5 pass memcg argument to account_page_dirtied() + * Kernel v5.2-5678-gac1c3e4 no longer exports account_page_dirtied + */ +static inline void ll_account_page_dirtied(struct page *page, + struct address_space *mapping) +{ +#ifdef HAVE_ACCOUNT_PAGE_DIRTIED_3ARGS + struct mem_cgroup *memcg = mem_cgroup_begin_page_stat(page); + + account_page_dirtied(page, mapping, memcg); + mem_cgroup_end_page_stat(memcg); +#elif defined(HAVE_ACCOUNT_PAGE_DIRTIED_EXPORT) + account_page_dirtied(page, mapping); +#else + vvp_account_page_dirtied(page, mapping); +#endif + ll_page_tag_dirty(page, mapping); +} + +/* Taken from kernel set_page_dirty, __set_page_dirty_nobuffers + * Last change to this area: b93b016313b3ba8003c3b8bb71f569af91f19fc7 + * + * Current with Linus tip of tree (7/13/2019): + * v5.2-rc4-224-ge01e060fe0 + * + * Backwards compat for 3.x, 5.x kernels relating to memcg handling + * & rename of radix tree to xarray. + */ +void vvp_set_pagevec_dirty(struct pagevec *pvec) +{ + struct page *page = pvec->pages[0]; + int count = pagevec_count(pvec); + int i; +#ifdef HAVE_KALLSYMS_LOOKUP_NAME + struct address_space *mapping = page->mapping; + unsigned long flags; + unsigned long skip_pages = 0; + int dirtied = 0; +#endif + + ENTRY; + + BUILD_BUG_ON(PAGEVEC_SIZE > BITS_PER_LONG); + LASSERTF(page->mapping, + "mapping must be set. page %p, page->private (cl_page) %p\n", + page, (void *) page->private); + + /* + * kernels without HAVE_KALLSYMS_LOOKUP_NAME also don't have + * account_dirty_page exported, and if we can't access that symbol, + * we can't do page dirtying in batch (taking the xarray lock only once) + * so we just fall back to a looped call to __set_page_dirty_nobuffers + */ +#ifndef HAVE_ACCOUNT_PAGE_DIRTIED_EXPORT + if (!vvp_account_page_dirtied) { + for (i = 0; i < count; i++) + __set_page_dirty_nobuffers(pvec->pages[i]); + EXIT; + } +#endif + +#ifdef HAVE_KALLSYMS_LOOKUP_NAME + for (i = 0; i < count; i++) { + page = pvec->pages[i]; + + ClearPageReclaim(page); + + vvp_lock_page_memcg(page); + if (TestSetPageDirty(page)) { + /* page is already dirty .. no extra work needed + * set a flag for the i'th page to be skipped + */ + vvp_unlock_page_memcg(page); + skip_pages |= (1 << i); + } + } + + ll_xa_lock_irqsave(&mapping->i_pages, flags); + + /* Notes on differences with __set_page_dirty_nobuffers: + * 1. We don't need to call page_mapping because we know this is a page + * cache page. + * 2. We have the pages locked, so there is no need for the careful + * mapping/mapping2 dance. + * 3. No mapping is impossible. (Race w/truncate mentioned in + * dirty_nobuffers should be impossible because we hold the page lock.) + * 4. All mappings are the same because i/o is only to one file. + */ + for (i = 0; i < count; i++) { + page = pvec->pages[i]; + /* if the i'th page was unlocked above, skip it here */ + if ((skip_pages >> i) & 1) + continue; + + LASSERTF(page->mapping == mapping, + "all pages must have the same mapping. 
page %p, mapping %p, first mapping %p\n", + page, page->mapping, mapping); + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); + ll_account_page_dirtied(page, mapping); + dirtied++; + vvp_unlock_page_memcg(page); + } + ll_xa_unlock_irqrestore(&mapping->i_pages, flags); + + CDEBUG(D_VFSTRACE, "mapping %p, count %d, dirtied %d\n", mapping, + count, dirtied); + + if (mapping->host && dirtied) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } +#endif + EXIT; +} + +static void write_commit_callback(const struct lu_env *env, struct cl_io *io, + struct pagevec *pvec) +{ + int count = 0; + int i = 0; + + ENTRY; + + count = pagevec_count(pvec); + LASSERT(count > 0); + + for (i = 0; i < count; i++) { + struct page *vmpage = pvec->pages[i]; + SetPageUptodate(vmpage); + } + + vvp_set_pagevec_dirty(pvec); + + for (i = 0; i < count; i++) { + struct page *vmpage = pvec->pages[i]; + struct cl_page *page = (struct cl_page *) vmpage->private; + cl_page_disown(env, io, page); + lu_ref_del(&page->cp_reference, "cl_io", cl_io_top(io)); + cl_page_put(env, page); + } + + EXIT; +} + +/* make sure the page list is contiguous */ +static bool page_list_sanity_check(struct cl_object *obj, + struct cl_page_list *plist) +{ + struct cl_page *page; + pgoff_t index = CL_PAGE_EOF; + + cl_page_list_for_each(page, plist) { + struct vvp_page *vpg = cl_object_page_slice(obj, page); + + if (index == CL_PAGE_EOF) { + index = vvp_index(vpg); + continue; + } + + ++index; + if (index == vvp_index(vpg)) + continue; + + return false; + } + return true; +} + +/* Return how many bytes have queued or written */ +int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io) +{ + struct cl_object *obj = io->ci_obj; + struct inode *inode = vvp_object_inode(obj); + struct vvp_io *vio = vvp_env_io(env); + struct cl_page_list *queue = &vio->u.readwrite.vui_queue; + struct cl_page *page; + int rc = 0; + int bytes = 0; + unsigned int npages = vio->u.readwrite.vui_queue.pl_nr; + ENTRY; + + if (npages == 0) + RETURN(0); + + CDEBUG(D_VFSTRACE, "commit async pages: %d, from %d, to %d\n", + npages, vio->u.readwrite.vui_from, vio->u.readwrite.vui_to); + + LASSERT(page_list_sanity_check(obj, queue)); + + /* submit IO with async write */ + rc = cl_io_commit_async(env, io, queue, + vio->u.readwrite.vui_from, + vio->u.readwrite.vui_to, + write_commit_callback); + npages -= queue->pl_nr; /* already committed pages */ + if (npages > 0) { + /* calculate how many bytes were written */ + bytes = npages << PAGE_SHIFT; + + /* first page */ + bytes -= vio->u.readwrite.vui_from; + if (queue->pl_nr == 0) /* last page */ + bytes -= PAGE_SIZE - vio->u.readwrite.vui_to; + LASSERTF(bytes > 0, "bytes = %d, pages = %d\n", bytes, npages); + + vio->u.readwrite.vui_written += bytes; + + CDEBUG(D_VFSTRACE, "Committed %d pages %d bytes, tot: %ld\n", + npages, bytes, vio->u.readwrite.vui_written); + + /* the first page must have been written. 
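+ * The partial first-page offset therefore only applies to the first
+ * batch; later batches of the same write start page-aligned, which is why
+ * vui_from is reset to zero below.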
*/ + vio->u.readwrite.vui_from = 0; + } + LASSERT(page_list_sanity_check(obj, queue)); + LASSERT(ergo(rc == 0, queue->pl_nr == 0)); + + /* out of quota, try sync write */ + if (rc == -EDQUOT && !cl_io_is_mkwrite(io)) { + struct ll_inode_info *lli = ll_i2info(inode); + + rc = vvp_io_commit_sync(env, io, queue, + vio->u.readwrite.vui_from, + vio->u.readwrite.vui_to); + if (rc > 0) { + vio->u.readwrite.vui_written += rc; + rc = 0; + } + if (lli->lli_clob != NULL) + lov_read_and_clear_async_rc(lli->lli_clob); + lli->lli_async_rc = 0; + } + + /* update inode size */ + ll_merge_attr(env, inode); + + /* Now the pages in queue were failed to commit, discard them + * unless they were dirtied before. */ + while (queue->pl_nr > 0) { + page = cl_page_list_first(queue); + cl_page_list_del(env, queue, page); + + if (!PageDirty(cl_page_vmpage(page))) + cl_page_discard(env, io, page); + + cl_page_disown(env, io, page); + + /* held in ll_cl_init() */ + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + cl_page_list_fini(env, queue); + + RETURN(rc); +} + +static int vvp_io_write_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = vvp_object_inode(obj); + struct ll_inode_info *lli = ll_i2info(inode); + struct file *file = vio->vui_fd->fd_file; + ssize_t result = 0; + loff_t pos = io->u.ci_wr.wr.crw_pos; + size_t cnt = io->u.ci_wr.wr.crw_count; + bool lock_inode = !IS_NOSEC(inode); + size_t nob = io->ci_nob; + struct iov_iter iter; + size_t written = 0; + + ENTRY; + + trunc_sem_down_read(&lli->lli_trunc_sem); + + if (!can_populate_pages(env, io, inode)) + RETURN(0); + + if (cl_io_is_append(io)) { + /* + * PARALLEL IO This has to be changed for parallel IO doing + * out-of-order writes. + */ + ll_merge_attr(env, inode); + pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode); + vio->vui_iocb->ki_pos = pos; + } else { + LASSERTF(vio->vui_iocb->ki_pos == pos, + "ki_pos %lld [%lld, %lld)\n", + vio->vui_iocb->ki_pos, + pos, pos + cnt); + } + + CDEBUG(D_VFSTRACE, "%s: write [%llu, %llu)\n", + file_dentry(file)->d_name.name, + pos, pos + cnt); + + /* The maximum Lustre file size is variable, based on the OST maximum + * object size and number of stripes. This needs another check in + * addition to the VFS checks earlier. */ + if (pos + cnt > ll_file_maxbytes(inode)) { + CDEBUG(D_INODE, + "%s: file %s ("DFID") offset %llu > maxbytes %llu\n", + ll_i2sbi(inode)->ll_fsname, + file_dentry(file)->d_name.name, + PFID(ll_inode2fid(inode)), pos + cnt, + ll_file_maxbytes(inode)); + RETURN(-EFBIG); + } + + /* Tests to verify we take the i_mutex correctly */ + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_IMUTEX_SEC) && !lock_inode) + RETURN(-EINVAL); + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_IMUTEX_NOSEC) && lock_inode) + RETURN(-EINVAL); + + if (!(file->f_flags & O_DIRECT)) { + result = cl_io_lru_reserve(env, io, pos, cnt); + if (result) + RETURN(result); + } + + if (vio->vui_iter == NULL) { + /* from a temp io in ll_cl_init(). */ + result = 0; + } else { + /* + * When using the locked AIO function (generic_file_aio_write()) + * testing has shown the inode mutex to be a limiting factor + * with multi-threaded single shared file performance. To get + * around this, we now use the lockless version. To maintain + * consistency, proper locking to protect against writes, + * trucates, etc. is handled in the higher layers of lustre. 
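+ *
+ * The inode lock is still taken below for !IS_NOSEC() inodes, presumably
+ * because __generic_file_write_iter() may need to strip SUID/SGID bits
+ * via file_remove_privs(), which expects the inode to be locked.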
+ */ + lock_inode = !IS_NOSEC(inode); + iter = *vio->vui_iter; + + if (unlikely(lock_inode)) + inode_lock(inode); + result = __generic_file_write_iter(vio->vui_iocb, &iter); + if (unlikely(lock_inode)) + inode_unlock(inode); + + written = result; + if (result > 0) +#ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS + result = generic_write_sync(vio->vui_iocb, result); +#else + { + ssize_t err; + + err = generic_write_sync(vio->vui_iocb->ki_filp, pos, + result); + if (err < 0 && result > 0) + result = err; + } +#endif + } + + if (result > 0) { + result = vvp_io_write_commit(env, io); + /* Simulate short commit */ + if (CFS_FAULT_CHECK(OBD_FAIL_LLITE_SHORT_COMMIT)) { + vio->u.readwrite.vui_written >>= 1; + if (vio->u.readwrite.vui_written > 0) + io->ci_need_restart = 1; + } + if (vio->u.readwrite.vui_written > 0) { + result = vio->u.readwrite.vui_written; + CDEBUG(D_VFSTRACE, "%s: write nob %zd, result: %zd\n", + file_dentry(file)->d_name.name, + io->ci_nob, result); + io->ci_nob += result; + } else { + io->ci_continue = 0; + } + } + if (vio->vui_iocb->ki_pos != (pos + io->ci_nob - nob)) { + CDEBUG(D_VFSTRACE, + "%s: write position mismatch: ki_pos %lld vs. pos %lld, written %zd, commit %zd: rc = %zd\n", + file_dentry(file)->d_name.name, + vio->vui_iocb->ki_pos, pos + io->ci_nob - nob, + written, io->ci_nob - nob, result); + /* + * Rewind ki_pos and vui_iter to where it has + * successfully committed. + */ + vio->vui_iocb->ki_pos = pos + io->ci_nob - nob; + } + if (result > 0 || result == -EIOCBQUEUED) { + set_bit(LLIF_DATA_MODIFIED, &ll_i2info(inode)->lli_flags); + + if (result != -EIOCBQUEUED && result < cnt) + io->ci_continue = 0; + if (result > 0) + result = 0; + /* move forward */ + if (result == -EIOCBQUEUED) { + io->ci_nob += vio->u.readwrite.vui_written; + vio->vui_iocb->ki_pos = pos + + vio->u.readwrite.vui_written; + } + } + + RETURN(result); +} + +static void vvp_io_rw_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct inode *inode = vvp_object_inode(ios->cis_obj); + struct ll_inode_info *lli = ll_i2info(inode); + + trunc_sem_up_read(&lli->lli_trunc_sem); +} + +static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) +{ + struct vm_fault *vmf = cfio->ft_vmf; + + cfio->ft_flags = ll_filemap_fault(cfio->ft_vma, vmf); + cfio->ft_flags_valid = 1; + + if (vmf->page) { + LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n", + get_vmf_address(vmf)); + if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { + lock_page(vmf->page); + cfio->ft_flags |= VM_FAULT_LOCKED; + } + + cfio->ft_vmpage = vmf->page; + + return 0; + } + + if (cfio->ft_flags & VM_FAULT_SIGBUS) { + CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", get_vmf_address(vmf)); + return -EFAULT; + } + + if (cfio->ft_flags & VM_FAULT_OOM) { + CDEBUG(D_PAGE, "got addr %p - OOM\n", get_vmf_address(vmf)); + return -ENOMEM; + } + + if (cfio->ft_flags & VM_FAULT_RETRY) + return -EAGAIN; + + CERROR("unknown error in page fault %d\n", cfio->ft_flags); + + return -EINVAL; +} + +static void mkwrite_commit_callback(const struct lu_env *env, struct cl_io *io, + struct pagevec *pvec) +{ + vvp_set_pagevec_dirty(pvec); +} + +static int vvp_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = vvp_object_inode(obj); + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_fault_io *fio = &io->u.ci_fault; + struct vvp_fault_io *cfio = &vio->u.fault; + loff_t offset; + int 
result = 0;
+	struct page		*vmpage = NULL;
+	struct cl_page		*page;
+	loff_t			 size;
+	pgoff_t			 last_index;
+	ENTRY;
+
+	trunc_sem_down_read_nowait(&lli->lli_trunc_sem);
+
+	/* offset of the last byte on the page */
+	offset = cl_offset(obj, fio->ft_index + 1) - 1;
+	LASSERT(cl_index(obj, offset) == fio->ft_index);
+	result = vvp_prep_size(env, obj, io, 0, offset + 1, NULL);
+	if (result != 0)
+		RETURN(result);
+
+	/* must return locked page */
+	if (fio->ft_mkwrite) {
+		LASSERT(cfio->ft_vmpage != NULL);
+		lock_page(cfio->ft_vmpage);
+	} else {
+		result = vvp_io_kernel_fault(cfio);
+		if (result != 0)
+			RETURN(result);
+	}
+
+	vmpage = cfio->ft_vmpage;
+	LASSERT(PageLocked(vmpage));
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE))
+		generic_error_remove_page(vmpage->mapping, vmpage);
+
+	size = i_size_read(inode);
+	/* Although we already hold a cl_lock on this page, it can still be
+	 * truncated locally. */
+	if (unlikely((vmpage->mapping != inode->i_mapping) ||
+		     (page_offset(vmpage) > size))) {
+		CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n");
+
+		/* return +1 to stop cl_io_loop() and ll_fault() will catch
+		 * and retry. */
+		GOTO(out, result = +1);
+	}
+
+	last_index = cl_index(obj, size - 1);
+
+	if (fio->ft_mkwrite) {
+		/*
+		 * Capture the size while holding the lli_trunc_sem from
+		 * above; we want to make sure that we complete the mkwrite
+		 * action while holding this lock. We need to make sure that
+		 * we are not past the end of the file.
+		 */
+		if (last_index < fio->ft_index) {
+			CDEBUG(D_PAGE,
+			       "llite: mkwrite and truncate race happened: "
+			       "%p: 0x%lx 0x%lx\n",
+			       vmpage->mapping, fio->ft_index, last_index);
+			/*
+			 * We need to return if we are past the end of the
+			 * file. This will propagate up the call stack to
+			 * ll_page_mkwrite where we will return
+			 * VM_FAULT_NOPAGE. Any non-negative value returned
+			 * here will be silently converted to 0. If
+			 * vmpage->mapping is NULL the error code would be
+			 * converted back to ENODATA in ll_page_mkwrite0.
+			 * Thus we return -ENODATA to handle both cases.
+			 */
+			GOTO(out, result = -ENODATA);
+		}
+	}
+
+	page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE);
+	if (IS_ERR(page))
+		GOTO(out, result = PTR_ERR(page));
+
+	/* if the page is going to be written, we should add it into cache
+	 * earlier. */
+	if (fio->ft_mkwrite) {
+		wait_on_page_writeback(vmpage);
+		if (!PageDirty(vmpage)) {
+			struct cl_page_list *plist = &vio->u.fault.ft_queue;
+			struct vvp_page *vpg = cl_object_page_slice(obj, page);
+			int to = PAGE_SIZE;
+
+			/* vvp_page_assume() calls wait_on_page_writeback(). */
+			cl_page_assume(env, io, page);
+
+			cl_page_list_init(plist);
+			cl_page_list_add(plist, page, true);
+
+			/* size fixup */
+			if (last_index == vvp_index(vpg))
+				to = ((size - 1) & ~PAGE_MASK) + 1;
+
+			/* Do not set the Dirty bit here so that if IO is
+			 * started before the page is really made dirty, we
+			 * still have a chance to detect it; the dirty bit is
+			 * instead set from mkwrite_commit_callback(), which
+			 * is passed to cl_io_commit_async() below.
+			 
*/ + result = cl_io_commit_async(env, io, plist, 0, to, + mkwrite_commit_callback); + /* Have overquota flag, trying sync write to check + * whether indeed out of quota */ + if (result == -EDQUOT) { + cl_page_get(page); + result = vvp_io_commit_sync(env, io, + plist, 0, to); + if (result >= 0) { + io->ci_noquota = 1; + cl_page_own(env, io, page); + cl_page_list_add(plist, page, true); + lu_ref_add(&page->cp_reference, + "cl_io", io); + result = cl_io_commit_async(env, io, + plist, 0, to, + mkwrite_commit_callback); + io->ci_noquota = 0; + } else { + cl_page_put(env, page); + } + } + + LASSERT(cl_page_is_owned(page, io)); + cl_page_list_fini(env, plist); + + vmpage = NULL; + if (result < 0) { + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + + cl_page_put(env, page); + + /* we're in big trouble, what can we do now? */ + if (result == -EDQUOT) + result = -ENOSPC; + GOTO(out, result); + } else { + cl_page_disown(env, io, page); + } + } + } + + /* + * The ft_index is only used in the case of + * a mkwrite action. We need to check + * our assertions are correct, since + * we should have caught this above + */ + LASSERT(!fio->ft_mkwrite || fio->ft_index <= last_index); + if (fio->ft_index == last_index) + /* + * Last page is mapped partially. + */ + fio->ft_nob = size - cl_offset(obj, fio->ft_index); + else + fio->ft_nob = cl_page_size(obj); + + lu_ref_add(&page->cp_reference, "fault", io); + fio->ft_page = page; + EXIT; + +out: + /* return unlocked vmpage to avoid deadlocking */ + if (vmpage != NULL) + unlock_page(vmpage); + + cfio->ft_flags &= ~VM_FAULT_LOCKED; + + return result; +} + +static void vvp_io_fault_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct inode *inode = vvp_object_inode(ios->cis_obj); + struct ll_inode_info *lli = ll_i2info(inode); + + CLOBINVRNT(env, ios->cis_io->ci_obj, + vvp_object_invariant(ios->cis_io->ci_obj)); + trunc_sem_up_read(&lli->lli_trunc_sem); +} + +static int vvp_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + /* we should mark TOWRITE bit to each dirty page in radix tree to + * verify pages have been written, but this is difficult because of + * race. */ + return 0; +} + +static int vvp_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + int result = 0; + ENTRY; + + if (ios->cis_io->ci_type == CIT_READ || + ios->cis_io->ci_type == CIT_FAULT) { + struct vvp_io *vio = cl2vvp_io(env, ios); + + if (unlikely(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + ra->cra_end_idx = CL_PAGE_EOF; + result = +1; /* no need to call down */ + } + } + + RETURN(result); +} + +static int vvp_io_lseek_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + __u64 lock_start = io->u.ci_lseek.ls_start; + __u64 lock_end = OBD_OBJECT_EOF; + __u32 enqflags = CEF_MUST; /* always take client lock */ + + return vvp_io_one_lock(env, io, enqflags, CLM_READ, + lock_start, lock_end); +} + +static int vvp_io_lseek_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = vvp_object_inode(io->ci_obj); + __u64 start = io->u.ci_lseek.ls_start; + + inode_lock(inode); + inode_dio_wait(inode); + + /* At the moment we have DLM lock so just update inode + * to know the file size. 
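+	 * Returning -ENXIO when the start offset is at or beyond EOF matches
+	 * the SEEK_HOLE/SEEK_DATA semantics of lseek(2).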
+ */ + ll_merge_attr(env, inode); + if (start >= i_size_read(inode)) { + io->u.ci_lseek.ls_result = -ENXIO; + return -ENXIO; + } + return 0; +} + +static void vvp_io_lseek_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = vvp_object_inode(io->ci_obj); + + if (io->u.ci_lseek.ls_result > i_size_read(inode)) + io->u.ci_lseek.ls_result = -ENXIO; + + inode_unlock(inode); +} + +static const struct cl_io_operations vvp_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = vvp_io_fini, + .cio_iter_init = vvp_io_read_iter_init, + .cio_lock = vvp_io_read_lock, + .cio_start = vvp_io_read_start, + .cio_end = vvp_io_rw_end, + .cio_advance = vvp_io_advance, + }, + [CIT_WRITE] = { + .cio_fini = vvp_io_fini, + .cio_iter_init = vvp_io_write_iter_init, + .cio_iter_fini = vvp_io_write_iter_fini, + .cio_lock = vvp_io_write_lock, + .cio_start = vvp_io_write_start, + .cio_end = vvp_io_rw_end, + .cio_advance = vvp_io_advance, + }, + [CIT_SETATTR] = { + .cio_fini = vvp_io_setattr_fini, + .cio_iter_init = vvp_io_setattr_iter_init, + .cio_lock = vvp_io_setattr_lock, + .cio_start = vvp_io_setattr_start, + .cio_end = vvp_io_setattr_end + }, + [CIT_FAULT] = { + .cio_fini = vvp_io_fault_fini, + .cio_iter_init = vvp_io_fault_iter_init, + .cio_lock = vvp_io_fault_lock, + .cio_start = vvp_io_fault_start, + .cio_end = vvp_io_fault_end, + }, + [CIT_FSYNC] = { + .cio_start = vvp_io_fsync_start, + .cio_fini = vvp_io_fini + }, + [CIT_GLIMPSE] = { + .cio_fini = vvp_io_fini + }, + [CIT_MISC] = { + .cio_fini = vvp_io_fini + }, + [CIT_LADVISE] = { + .cio_fini = vvp_io_fini + }, + [CIT_LSEEK] = { + .cio_fini = vvp_io_fini, + .cio_lock = vvp_io_lseek_lock, + .cio_start = vvp_io_lseek_start, + .cio_end = vvp_io_lseek_end, + }, + }, + .cio_read_ahead = vvp_io_read_ahead +}; + +int vvp_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct vvp_io *vio = vvp_env_io(env); + struct inode *inode = vvp_object_inode(obj); + int result; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + ENTRY; + + CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d " + "restore needed %d\n", + PFID(lu_object_fid(&obj->co_lu)), + io->ci_ignore_layout, io->ci_verify_layout, + vio->vui_layout_gen, io->ci_restore_needed); + + CL_IO_SLICE_CLEAN(vio, vui_cl); + cl_io_slice_add(io, &vio->vui_cl, obj, &vvp_io_ops); + vio->vui_ra_valid = false; + result = 0; + if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { + size_t count; + struct ll_inode_info *lli = ll_i2info(inode); + + count = io->u.ci_rw.crw_count; + /* "If nbyte is 0, read() will return 0 and have no other + * results." -- Single Unix Spec */ + if (count == 0) + result = 1; + else + vio->vui_tot_count = count; + + /* for read/write, we store the jobid in the inode, and + * it'll be fetched by osc when building RPC. + * + * it's not accurate if the file is shared by different + * jobs. + */ + lustre_get_jobid(lli->lli_jobid, sizeof(lli->lli_jobid)); + } else if (io->ci_type == CIT_SETATTR) { + if (!cl_io_is_trunc(io)) + io->ci_lockreq = CILR_MANDATORY; + } + + /* Enqueue layout lock and get layout version. We need to do this + * even for operations requiring to open file, such as read and write, + * because it might not grant layout lock in IT_OPEN. 
*/ + if (result == 0 && !io->ci_ignore_layout) { + result = ll_layout_refresh(inode, &vio->vui_layout_gen); + if (result == -ENOENT) + /* If the inode on MDS has been removed, but the objects + * on OSTs haven't been destroyed (async unlink), layout + * fetch will return -ENOENT, we'd ingore this error + * and continue with dirty flush. LU-3230. */ + result = 0; + if (result < 0) + CERROR("%s: refresh file layout " DFID " error %d.\n", + ll_i2sbi(inode)->ll_fsname, + PFID(lu_object_fid(&obj->co_lu)), result); + } + +#ifdef HAVE_INVALIDATE_LOCK + if (io->ci_invalidate_page_cache) + filemap_invalidate_lock(inode->i_mapping); +#endif /* HAVE_INVALIDATE_LOCK */ + + io->ci_result = result < 0 ? result : 0; + RETURN(result); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_object.c b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c new file mode 100644 index 0000000000000..2413da9498cd3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c @@ -0,0 +1,324 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * cl_object implementation for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include + +#include + +#include +#include "llite_internal.h" +#include "vvp_internal.h" + +/***************************************************************************** + * + * Object operations. + * + */ + +int vvp_object_invariant(const struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + struct ll_inode_info *lli = ll_i2info(inode); + + return (S_ISREG(inode->i_mode) || inode->i_mode == 0) && + lli->lli_clob == obj; +} + +static int vvp_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct vvp_object *obj = lu2vvp(o); + struct inode *inode = obj->vob_inode; + struct ll_inode_info *lli; + + (*p)(env, cookie, "(%d) inode: %p ", + atomic_read(&obj->vob_mmap_cnt), + inode); + if (inode) { + lli = ll_i2info(inode); + (*p)(env, cookie, "%lu/%u %o %u %d %p "DFID, + inode->i_ino, inode->i_generation, inode->i_mode, + inode->i_nlink, atomic_read(&inode->i_count), + lli->lli_clob, PFID(&lli->lli_fid)); + } + return 0; +} + +static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct inode *inode = vvp_object_inode(obj); + + /* + * lov overwrites most of these fields in + * lov_attr_get()->...lov_merge_lvb_kms(), except when inode + * attributes are newer. 
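+	 * The values filled in here are therefore only a starting point;
+	 * KMS is unknown at this layer, so the layers below must complete
+	 * the rest (see the comment at the end of this function).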
+ */ + + attr->cat_size = i_size_read(inode); + attr->cat_mtime = inode->i_mtime.tv_sec; + attr->cat_atime = inode->i_atime.tv_sec; + attr->cat_ctime = inode->i_ctime.tv_sec; + attr->cat_blocks = inode->i_blocks; + attr->cat_uid = from_kuid(&init_user_ns, inode->i_uid); + attr->cat_gid = from_kgid(&init_user_ns, inode->i_gid); + attr->cat_projid = ll_i2info(inode)->lli_projid; + /* KMS is not known by this layer */ + return 0; /* layers below have to fill in the rest */ +} + +static int vvp_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct inode *inode = vvp_object_inode(obj); + + if (valid & CAT_UID) + inode->i_uid = make_kuid(&init_user_ns, attr->cat_uid); + if (valid & CAT_GID) + inode->i_gid = make_kgid(&init_user_ns, attr->cat_gid); + if (valid & CAT_ATIME) + inode->i_atime.tv_sec = attr->cat_atime; + if (valid & CAT_MTIME) + inode->i_mtime.tv_sec = attr->cat_mtime; + if (valid & CAT_CTIME) + inode->i_ctime.tv_sec = attr->cat_ctime; + if (0 && valid & CAT_SIZE) + i_size_write(inode, attr->cat_size); + if (valid & CAT_PROJID) + ll_i2info(inode)->lli_projid = attr->cat_projid; + /* not currently necessary */ + if (0 && valid & (CAT_UID|CAT_GID|CAT_SIZE|CAT_PROJID)) + mark_inode_dirty(inode); + return 0; +} + +static int vvp_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct ll_inode_info *lli = ll_i2info(conf->coc_inode); + + if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { + CDEBUG(D_VFSTRACE, DFID ": losing layout lock\n", + PFID(&lli->lli_fid)); + + ll_layout_version_set(lli, CL_LAYOUT_GEN_NONE); + + /* Clean up page mmap for this inode. + * The reason for us to do this is that if the page has + * already been installed into memory space, the process + * can access it without interacting with lustre, so this + * page may be stale due to layout change, and the process + * will never be notified. + * This operation is expensive but mmap processes have to pay + * a price themselves. */ + unmap_mapping_range(conf->coc_inode->i_mapping, + 0, OBD_OBJECT_EOF, 0); + pcc_layout_invalidate(conf->coc_inode); + } + return 0; +} + +static int vvp_prune(const struct lu_env *env, struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + int rc; + ENTRY; + + rc = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1); + if (rc < 0) { + CDEBUG(D_VFSTRACE, DFID ": writeback failed: %d\n", + PFID(lu_object_fid(&obj->co_lu)), rc); + RETURN(rc); + } + + ll_truncate_inode_pages_final(inode); + mapping_clear_exiting(inode->i_mapping); + + RETURN(0); +} + +static int vvp_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb) +{ + struct inode *inode = vvp_object_inode(obj); + + ENTRY; + lvb->lvb_mtime = inode->i_mtime.tv_sec; + lvb->lvb_atime = inode->i_atime.tv_sec; + lvb->lvb_ctime = inode->i_ctime.tv_sec; + + /* + * LU-417: Add dirty pages block count lest i_blocks reports 0, some + * "cp" or "tar" on remote node may think it's a completely sparse file + * and skip it. 
+ */ + if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0) + lvb->lvb_blocks = dirty_cnt(inode); + + RETURN(0); +} + +static void vvp_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct inode *inode; + struct obdo *oa; + u64 valid_flags = OBD_MD_FLTYPE | OBD_MD_FLUID | OBD_MD_FLGID; + + oa = attr->cra_oa; + inode = vvp_object_inode(obj); + + if (attr->cra_type == CRT_WRITE) { + valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME; + obdo_set_o_projid(oa, ll_i2info(inode)->lli_projid); + } else if (attr->cra_type == CRT_READ) { + valid_flags |= OBD_MD_FLATIME; + } + obdo_from_inode(oa, inode, valid_flags & attr->cra_flags); + obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_INVALID_PFID)) + oa->o_parent_oid++; + memcpy(attr->cra_jobid, ll_i2info(inode)->lli_jobid, + sizeof(attr->cra_jobid)); +} + +static const struct cl_object_operations vvp_ops = { + .coo_page_init = vvp_page_init, + .coo_io_init = vvp_io_init, + .coo_attr_get = vvp_attr_get, + .coo_attr_update = vvp_attr_update, + .coo_conf_set = vvp_conf_set, + .coo_prune = vvp_prune, + .coo_glimpse = vvp_object_glimpse, + .coo_req_attr_set = vvp_req_attr_set +}; + +static int vvp_object_init0(const struct lu_env *env, + struct vvp_object *vob, + const struct cl_object_conf *conf) +{ + vob->vob_inode = conf->coc_inode; + cl_object_page_init(&vob->vob_cl, sizeof(struct vvp_page)); + return 0; +} + +static int vvp_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct vvp_device *dev = lu2vvp_dev(obj->lo_dev); + struct vvp_object *vob = lu2vvp(obj); + struct lu_object *below; + struct lu_device *under; + int result; + + under = &dev->vdv_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below != NULL) { + const struct cl_object_conf *cconf; + + cconf = lu2cl_conf(conf); + lu_object_add(obj, below); + result = vvp_object_init0(env, vob, cconf); + } else + result = -ENOMEM; + + return result; +} + +static void vvp_object_free_rcu(struct rcu_head *head) +{ + struct vvp_object *vob = container_of(head, struct vvp_object, + vob_header.coh_lu.loh_rcu); + + kmem_cache_free(vvp_object_kmem, vob); +} + +static void vvp_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct vvp_object *vob = lu2vvp(obj); + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + OBD_FREE_PRE(vob, sizeof(*vob), "slab-freed"); + call_rcu(&vob->vob_header.coh_lu.loh_rcu, vvp_object_free_rcu); +} + +static const struct lu_object_operations vvp_lu_obj_ops = { + .loo_object_init = vvp_object_init, + .loo_object_free = vvp_object_free, + .loo_object_print = vvp_object_print, +}; + +struct vvp_object *cl_inode2vvp(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct lu_object *lu; + + LASSERT(obj != NULL); + lu = lu_object_locate(obj->co_lu.lo_header, &vvp_device_type); + LASSERT(lu != NULL); + + return lu2vvp(lu); +} + +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct vvp_object *vob; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(vob, vvp_object_kmem, GFP_NOFS); + if (vob != NULL) { + struct cl_object_header *hdr; + + obj = &vob->vob_cl.co_lu; + hdr = &vob->vob_header; + cl_object_header_init(hdr); + hdr->coh_page_bufsize = cfs_size_round(sizeof(struct cl_page)); + + lu_object_init(obj, &hdr->coh_lu, dev); + 
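/* make this slice the top layer of the composite lu_object */
+		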
lu_object_add_top(&hdr->coh_lu, obj); + + vob->vob_cl.co_ops = &vvp_ops; + obj->lo_ops = &vvp_lu_obj_ops; + } else + obj = NULL; + return obj; +} diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_page.c b/drivers/staging/lustrefsx/lustre/llite/vvp_page.c new file mode 100644 index 0000000000000..5ee33e5c78b3e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_page.c @@ -0,0 +1,485 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_page for VVP layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include + +#include +#include "llite_internal.h" +#include "vvp_internal.h" + +/***************************************************************************** + * + * Page operations. + * + */ +static void vvp_page_fini(const struct lu_env *env, + struct cl_page_slice *slice, + struct pagevec *pvec) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct page *vmpage = vpg->vpg_page; + + /* + * vmpage->private was already cleared when page was moved into + * VPG_FREEING state. 
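+	 * The LASSERT below checks only that the back-pointer no longer
+	 * refers to this cl_page; the reference on the vmpage itself is
+	 * dropped via the pagevec (or put_page()) further down.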
+ */ + LASSERT((struct cl_page *)vmpage->private != slice->cpl_page); + LASSERT(vmpage != NULL); + if (pvec) { + if (!pagevec_add(pvec, vmpage)) + pagevec_release(pvec); + } else { + put_page(vmpage); + } +} + +static int vvp_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io, + int nonblock) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct page *vmpage = vpg->vpg_page; + + ENTRY; + + LASSERT(vmpage != NULL); + if (nonblock) { + if (!trylock_page(vmpage)) + return -EAGAIN; + + if (unlikely(PageWriteback(vmpage))) { + unlock_page(vmpage); + return -EAGAIN; + } + + return 0; + } + + lock_page(vmpage); + wait_on_page_writeback(vmpage); + + RETURN(0); +} + +static void vvp_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + wait_on_page_writeback(vmpage); +} + +static void vvp_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); +} + +static void vvp_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + struct page *vmpage = cl2vm_page(slice); + + ENTRY; + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + unlock_page(cl2vm_page(slice)); + + EXIT; +} + +static void vvp_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + struct vvp_page *vpg = cl2vvp_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + if (vpg->vpg_defer_uptodate && !vpg->vpg_ra_used && vmpage->mapping) + ll_ra_stats_inc(vmpage->mapping->host, RA_STAT_DISCARDED); + + generic_error_remove_page(vmpage->mapping, vmpage); +} + +static void vvp_page_export(const struct lu_env *env, + const struct cl_page_slice *slice, + int uptodate) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + if (uptodate) + SetPageUptodate(vmpage); + else + ClearPageUptodate(vmpage); +} + +static int vvp_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return PageLocked(cl2vm_page(slice)) ? -EBUSY : -ENODATA; +} + +static int vvp_page_prep_read(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ENTRY; + /* Skip the page already marked as PG_uptodate. */ + RETURN(PageUptodate(cl2vm_page(slice)) ? 
-EALREADY : 0); +} + +static int vvp_page_prep_write(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + struct cl_page *pg = slice->cpl_page; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageDirty(vmpage)); + + /* ll_writepage path is not a sync write, so need to set page writeback + * flag + */ + if (pg->cp_sync_io == NULL) + set_page_writeback(vmpage); + + return 0; +} + +static void vvp_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct cl_page *cp = slice->cpl_page; + + if (cp->cp_type == CPT_CACHEABLE) { + struct page *vmpage = cp->cp_vmpage; + struct inode *inode = vmpage->mapping->host; + + LASSERT(PageLocked(vmpage)); + LASSERT((struct cl_page *)vmpage->private == cp); + + /* Drop the reference count held in vvp_page_init */ + atomic_dec(&cp->cp_ref); + + ClearPagePrivate(vmpage); + vmpage->private = 0; + + /* clearpageuptodate prevents the page being read by the + * kernel after it has been deleted from Lustre, which avoids + * potential stale data reads. The seqlock allows us to see + * that a page was potentially deleted and catch the resulting + * SIGBUS - see ll_filemap_fault() (LU-16160) */ + write_seqlock(&ll_i2info(inode)->lli_page_inv_lock); + ClearPageUptodate(vmpage); + write_sequnlock(&ll_i2info(inode)->lli_page_inv_lock); + + /* + * The reference from vmpage to cl_page is removed, + * but the reference back is still here. It is removed + * later in cl_page_free(). + */ + } +} + +/** + * Handles page transfer errors at VM level. + * + * This takes inode as a separate argument, because inode on which error is to + * be set can be different from \a vmpage inode in case of direct-io. + */ +static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, + int ioret) +{ + struct vvp_object *obj = cl_inode2vvp(inode); + + if (ioret == 0) { + ClearPageError(vmpage); + obj->vob_discard_page_warned = 0; + } else { + SetPageError(vmpage); + if (ioret == -ENOSPC) + set_bit(AS_ENOSPC, &inode->i_mapping->flags); + else + set_bit(AS_EIO, &inode->i_mapping->flags); + + if ((ioret == -ESHUTDOWN || ioret == -EINTR || + ioret == -EIO) && obj->vob_discard_page_warned == 0) { + obj->vob_discard_page_warned = 1; + ll_dirty_page_discard_warn(inode, ioret); + } + } +} + +static void vvp_page_completion_read(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct page *vmpage = vpg->vpg_page; + struct cl_page *page = slice->cpl_page; + struct inode *inode = vvp_object_inode(page->cp_obj); + + ENTRY; + LASSERT(PageLocked(vmpage)); + CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret); + + if (vpg->vpg_defer_uptodate) + ll_ra_count_put(ll_i2sbi(inode), 1); + + if (ioret == 0) { + if (!vpg->vpg_defer_uptodate) + cl_page_export(env, page, 1); + } else if (vpg->vpg_defer_uptodate) { + vpg->vpg_defer_uptodate = 0; + if (ioret == -EAGAIN) { + /* mirror read failed, it needs to destroy the page + * because subpage would be from wrong osc when trying + * to read from a new mirror + */ + generic_error_remove_page(vmpage->mapping, vmpage); + } + } + + if (page->cp_sync_io == NULL) + unlock_page(vmpage); + + EXIT; +} + +static void vvp_page_completion_write(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct cl_page *pg = slice->cpl_page; + struct page *vmpage = vpg->vpg_page; + + ENTRY; + 
CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret);
+
+	if (pg->cp_sync_io != NULL) {
+		LASSERT(PageLocked(vmpage));
+		LASSERT(!PageWriteback(vmpage));
+	} else {
+		LASSERT(PageWriteback(vmpage));
+		/*
+		 * Only mark the page with an error when it is an async
+		 * write, because applications won't wait for IO to finish.
+		 */
+		vvp_vmpage_error(vvp_object_inode(pg->cp_obj), vmpage, ioret);
+
+		end_page_writeback(vmpage);
+	}
+	EXIT;
+}
+
+/**
+ * Implements cl_page_operations::cpo_make_ready() method.
+ *
+ * This is called to yank a page from the transfer cache and to send it out as
+ * a part of transfer. This function try-locks the page. If the try-lock
+ * fails, the page is owned by some concurrent IO and should be skipped (this
+ * is bad but hopefully a rare situation, as it usually results in the
+ * transfer being shorter than possible).
+ *
+ * \retval 0	success, page can be placed into transfer
+ *
+ * \retval -EAGAIN page is either used by concurrent IO or has been
+ * truncated. Skip it.
+ */
+static int vvp_page_make_ready(const struct lu_env *env,
+			       const struct cl_page_slice *slice)
+{
+	struct page *vmpage = cl2vm_page(slice);
+	struct cl_page *pg = slice->cpl_page;
+	int result = 0;
+
+	lock_page(vmpage);
+	if (clear_page_dirty_for_io(vmpage)) {
+		LASSERT(pg->cp_state == CPS_CACHED);
+		/* This actually clears the dirty bit in the radix
+		 * tree.
+		 */
+		set_page_writeback(vmpage);
+		CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
+	} else if (pg->cp_state == CPS_PAGEOUT) {
+		/* is it possible for osc_flush_async_page() to already
+		 * make it ready?
+		 */
+		result = -EALREADY;
+	} else {
+		CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpected page state %d.\n",
+			      pg->cp_state);
+		LBUG();
+	}
+	unlock_page(vmpage);
+	RETURN(result);
+}
+
+static int vvp_page_print(const struct lu_env *env,
+			  const struct cl_page_slice *slice,
+			  void *cookie, lu_printer_t printer)
+{
+	struct vvp_page *vpg = cl2vvp_page(slice);
+	struct page *vmpage = vpg->vpg_page;
+
+	(*printer)(env, cookie,
+		   LUSTRE_VVP_NAME"-page@%p(%d:%d) vm@%p ",
+		   vpg, vpg->vpg_defer_uptodate, vpg->vpg_ra_used, vmpage);
+
+	if (vmpage != NULL) {
+		(*printer)(env, cookie, "%lx %d:%d %lx %lu %slru",
+			   (long)vmpage->flags, page_count(vmpage),
+			   page_mapcount(vmpage), vmpage->private,
+			   page_index(vmpage),
+			   list_empty(&vmpage->lru) ? "not-" : "");
+	}
+
+	(*printer)(env, cookie, "\n");
+
+	return 0;
+}
+
+static int vvp_page_fail(const struct lu_env *env,
+			 const struct cl_page_slice *slice)
+{
+	/*
+	 * Cached read?
+	 */
+	LBUG();
+
+	return 0;
+}
+
+static const struct cl_page_operations vvp_page_ops = {
+	.cpo_own = vvp_page_own,
+	.cpo_assume = vvp_page_assume,
+	.cpo_unassume = vvp_page_unassume,
+	.cpo_disown = vvp_page_disown,
+	.cpo_discard = vvp_page_discard,
+	.cpo_delete = vvp_page_delete,
+	.cpo_export = vvp_page_export,
+	.cpo_is_vmlocked = vvp_page_is_vmlocked,
+	.cpo_fini = vvp_page_fini,
+	.cpo_print = vvp_page_print,
+	.io = {
+		[CRT_READ] = {
+			.cpo_prep = vvp_page_prep_read,
+			.cpo_completion = vvp_page_completion_read,
+			.cpo_make_ready = vvp_page_fail,
+		},
+		[CRT_WRITE] = {
+			.cpo_prep = vvp_page_prep_write,
+			.cpo_completion = vvp_page_completion_write,
+			.cpo_make_ready = vvp_page_make_ready,
+		},
+	},
+};
+
+static void vvp_transient_page_discard(const struct lu_env *env,
+				       const struct cl_page_slice *slice,
+				       struct cl_io *unused)
+{
+	struct cl_page *page = slice->cpl_page;
+
+	/*
+	 * For a transient page, remove it from the radix tree.
+ */ + cl_page_delete(env, page); +} + +static int vvp_transient_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return -EBUSY; +} + +static const struct cl_page_operations vvp_transient_page_ops = { + .cpo_discard = vvp_transient_page_discard, + .cpo_is_vmlocked = vvp_transient_page_is_vmlocked, + .cpo_print = vvp_page_print, +}; + +int vvp_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct vvp_page *vpg = cl_object_page_slice(obj, page); + struct page *vmpage = page->cp_vmpage; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + vpg->vpg_page = vmpage; + + if (page->cp_type == CPT_TRANSIENT) { + /* DIO pages are referenced by userspace, we don't need to take + * a reference on them. (contrast with get_page() call above) + */ + cl_page_slice_add(page, &vpg->vpg_cl, obj, + &vvp_transient_page_ops); + } else { + get_page(vmpage); + /* in cache, decref in vvp_page_delete */ + atomic_inc(&page->cp_ref); + SetPagePrivate(vmpage); + vmpage->private = (unsigned long)page; + cl_page_slice_add(page, &vpg->vpg_cl, obj, + &vvp_page_ops); + } + + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr.c b/drivers/staging/lustrefsx/lustre/llite/xattr.c new file mode 100644 index 0000000000000..0f04ab22f61ec --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/xattr.c @@ -0,0 +1,934 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include +#include +#include +#include +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED +#include +#endif + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include + +#include "llite_internal.h" + +#ifndef HAVE_XATTR_HANDLER_NAME +static inline const char *xattr_prefix(const struct xattr_handler *handler) +{ + return handler->prefix; +} +#endif + +const struct xattr_handler *get_xattr_type(const char *name) +{ + int i; + + for (i = 0; ll_xattr_handlers[i]; i++) { + const char *prefix = xattr_prefix(ll_xattr_handlers[i]); + size_t prefix_len = strlen(prefix); + + if (!strncmp(prefix, name, prefix_len)) + return ll_xattr_handlers[i]; + } + + return NULL; +} + +static int xattr_type_filter(struct ll_sb_info *sbi, + const struct xattr_handler *handler) +{ + /* No handler means XATTR_OTHER_T */ + if (!handler) + return -EOPNOTSUPP; + + if ((handler->flags == XATTR_ACL_ACCESS_T || + handler->flags == XATTR_ACL_DEFAULT_T) && + !test_bit(LL_SBI_ACL, sbi->ll_flags)) + return -EOPNOTSUPP; + + if (handler->flags == XATTR_USER_T && + !test_bit(LL_SBI_USER_XATTR, sbi->ll_flags)) + return -EOPNOTSUPP; + + if (handler->flags == XATTR_TRUSTED_T && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return 0; +} + +#ifndef HAVE_USER_NAMESPACE_ARG +#define ll_xattr_set_common(hd, ns, de, inode, name, value, size, flags) \ + ll_xattr_set_common(hd, de, inode, name, value, size, flags) +#endif + +static int ll_xattr_set_common(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + const char *pv = value; + char *fullname; + ktime_t kstart = ktime_get(); + u64 valid; + int rc; + ENTRY; + + /* When setxattr() is called with a size of 0 the value is + * unconditionally replaced by "". When removexattr() is + * called we get a NULL value and XATTR_REPLACE for flags. */ + if (!value && flags == XATTR_REPLACE) + valid = OBD_MD_FLXATTRRM; + else + valid = OBD_MD_FLXATTR; + + /* FIXME: enable IMA when the conditions are ready */ + if (handler->flags == XATTR_SECURITY_T && + (!strcmp(name, "ima") || !strcmp(name, "evm"))) + RETURN(-EOPNOTSUPP); + + rc = xattr_type_filter(sbi, handler); + if (rc) + RETURN(rc); + + if ((handler->flags == XATTR_ACL_ACCESS_T || + handler->flags == XATTR_ACL_DEFAULT_T) && + !inode_owner_or_capable(mnt_userns, inode)) + RETURN(-EPERM); + + /* b10667: ignore lustre special xattr for now */ + if (!strcmp(name, "hsm") || + ((handler->flags == XATTR_TRUSTED_T && !strcmp(name, "lov")) || + (handler->flags == XATTR_LUSTRE_T && !strcmp(name, "lov")))) + RETURN(0); + + rc = ll_security_secctx_name_filter(sbi, handler->flags, name); + if (rc) + RETURN(rc); + + /* + * In user.* namespace, only regular files and directories can have + * extended attributes. + */ + if (handler->flags == XATTR_USER_T) { + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + RETURN(-EPERM); + } + + /* This check is required for compatibility with 2.14, in which + * encryption context is stored in security.c xattr. + * Setting the encryption context should only be possible by llcrypt + * when defining an encryption policy on a directory. + * When new files/dirs are created in an encrypted dir, the enc + * context is set directly in the create request. 
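+	 * Note the comparison below is against "c" because the "security."
+	 * prefix has already been stripped by the xattr handler machinery
+	 * (the full name is rebuilt with xattr_prefix() further down).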
+ */ + if (handler->flags == XATTR_SECURITY_T && strcmp(name, "c") == 0) + RETURN(-EPERM); + + fullname = kasprintf(GFP_KERNEL, "%s%s", xattr_prefix(handler), name); + if (!fullname) + RETURN(-ENOMEM); + + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, fullname, + pv, size, flags, ll_i2suppgid(inode), &req); + kfree(fullname); + if (rc) { + if (rc == -EOPNOTSUPP && handler->flags == XATTR_USER_T) { + LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n"); + clear_bit(LL_SBI_USER_XATTR, sbi->ll_flags); + } + RETURN(rc); + } + + ptlrpc_req_finished(req); + + ll_stats_ops_tally(ll_i2sbi(inode), valid == OBD_MD_FLXATTRRM ? + LPROC_LL_REMOVEXATTR : LPROC_LL_SETXATTR, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(0); +} + +int ll_get_hsm_state(struct inode *inode, u32 *hus_states) +{ + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + OBD_ALLOC_PTR(hus); + if (!hus) + return -ENOMEM; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (!IS_ERR(op_data)) { + rc = obd_iocontrol(LL_IOC_HSM_STATE_GET, ll_i2mdexp(inode), + sizeof(*op_data), op_data, NULL); + if (!rc) + *hus_states = hus->hus_states; + else + CDEBUG(D_VFSTRACE, "obd_iocontrol failed. rc = %d\n", + rc); + + ll_finish_md_op_data(op_data); + } else { + rc = PTR_ERR(op_data); + CDEBUG(D_VFSTRACE, "Could not prepare the opdata. rc = %d\n", + rc); + } + OBD_FREE_PTR(hus); + return rc; +} + +static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump, size_t size) +{ + struct lov_comp_md_v1 *comp_v1 = (struct lov_comp_md_v1 *)lump; + struct lov_user_md *v1 = lump; + bool need_clear_release = false; + bool release_checked = false; + bool is_composite = false; + u16 entry_count = 1; + int rc = 0; + int i; + + if (!lump) + return 0; + + if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { + if (size < sizeof(*comp_v1)) + return -ERANGE; + + entry_count = comp_v1->lcm_entry_count; + if (size < offsetof(typeof(*comp_v1), lcm_entries[entry_count])) + return -ERANGE; + is_composite = true; + } + + for (i = 0; i < entry_count; i++) { + if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { + void *ptr = comp_v1; + + if (comp_v1->lcm_entries[i].lcme_offset + sizeof(*v1) > + size) + return -ERANGE; + + ptr += comp_v1->lcm_entries[i].lcme_offset; + v1 = (struct lov_user_md *)ptr; + } + + /* + * Attributes that are saved via getxattr will always + * have the stripe_offset as 0. Instead, the MDS + * should be allowed to pick the starting OST index. + * b=17846 + */ + if (!is_composite && v1->lmm_stripe_offset == 0) + v1->lmm_stripe_offset = -1; + + /* Avoid anyone directly setting the RELEASED flag. */ + if (v1->lmm_pattern & LOV_PATTERN_F_RELEASED) { + if (!release_checked) { + u32 state = HS_NONE; + + rc = ll_get_hsm_state(inode, &state); + if (rc) + return rc; + + if (!(state & HS_ARCHIVED)) + need_clear_release = true; + release_checked = true; + } + if (need_clear_release) + v1->lmm_pattern ^= LOV_PATTERN_F_RELEASED; + } + } + + return rc; +} + +static int ll_setstripe_ea(struct dentry *dentry, struct lov_user_md *lump, + size_t size) +{ + struct inode *inode = dentry->d_inode; + int rc = 0; + + /* + * It is possible to set an xattr to a "" value of zero size. + * For this case we are going to treat it as a removal. + */ + if (!size && lump) + lump = NULL; + + if (size && size < sizeof(*lump)) { + /* ll_adjust_lum() or ll_lov_user_md_size() might access + * before size - just give up now. 
+ */ + return -ERANGE; + } + rc = ll_adjust_lum(inode, lump, size); + if (rc) + return rc; + + if (lump && S_ISREG(inode->i_mode)) { + u64 it_flags = FMODE_WRITE; + ssize_t lum_size; + + lum_size = ll_lov_user_md_size(lump); + if (lum_size < 0 || size < lum_size) + return -ERANGE; + + rc = ll_lov_setstripe_ea_info(inode, dentry, it_flags, lump, + lum_size); + /** + * b=10667: ignore -EEXIST. + * Silently eat error on setting trusted.lov/lustre.lov + * attribute for platforms that added the default option + * to copy all attributes in 'cp' command. Both rsync and + * tar --xattrs also will try to set LOVEA for existing + * files. + */ + if (rc == -EEXIST) + rc = 0; + } else if (S_ISDIR(inode->i_mode)) { + if (size != 0 && size < sizeof(struct lov_user_md)) + return -EINVAL; + + rc = ll_dir_setstripe(inode, lump, 0); + } + + return rc; +} + +#ifndef HAVE_USER_NAMESPACE_ARG +#define ll_xattr_set(hd, ns, de, inode, name, value, size, flags) \ + ll_xattr_set(hd, de, inode, name, value, size, flags) +#endif + +static int ll_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) +{ + ktime_t kstart = ktime_get(); + int op_type = flags == XATTR_REPLACE ? LPROC_LL_REMOVEXATTR : + LPROC_LL_SETXATTR; + int rc; + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), xattr %s\n", + PFID(ll_inode2fid(inode)), inode, name); + + /* lustre/trusted.lov.xxx would be passed through xattr API */ + if (!strcmp(name, "lov")) { + rc = ll_setstripe_ea(dentry, (struct lov_user_md *)value, + size); + ll_stats_ops_tally(ll_i2sbi(inode), op_type, + ktime_us_delta(ktime_get(), kstart)); + return rc; + } else if (!strcmp(name, "lma") || !strcmp(name, "link")) { + ll_stats_ops_tally(ll_i2sbi(inode), op_type, + ktime_us_delta(ktime_get(), kstart)); + return 0; + } + + if (strncmp(name, "lov.", 4) == 0 && + (__swab32(((struct lov_user_md *)value)->lmm_magic) & + le32_to_cpu(LOV_MAGIC_MASK)) == le32_to_cpu(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md((struct lov_user_md *)value, 0); + + return ll_xattr_set_common(handler, mnt_userns, dentry, inode, name, + value, size, flags); +} + +int ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, + size_t size, u64 valid) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + void *xdata; + int rc; + ENTRY; + + /* This check is required for compatibility with 2.14, in which + * encryption context is stored in security.c xattr. Accessing the + * encryption context should only be possible by llcrypt. 
+ */ + if (type == XATTR_SECURITY_T && strcmp(name, "security.c") == 0) + GOTO(out_xattr, rc = -EPERM); + + if (sbi->ll_xattr_cache_enabled && type != XATTR_ACL_ACCESS_T && + (type != XATTR_SECURITY_T || !ll_xattr_is_seclabel(name)) && + (type != XATTR_TRUSTED_T || strcmp(name, XATTR_NAME_SOM))) { + rc = ll_xattr_cache_get(inode, name, buffer, size, valid); + if (rc == -EAGAIN) + goto getxattr_nocache; + if (rc < 0) + GOTO(out_xattr, rc); + + /* Add "system.posix_acl_access" to the list */ + if (lli->lli_posix_acl && valid & OBD_MD_FLXATTRLS) { + if (size == 0) { + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else if (size - rc >= sizeof(XATTR_NAME_ACL_ACCESS)) { + memcpy(buffer + rc, XATTR_NAME_ACL_ACCESS, + sizeof(XATTR_NAME_ACL_ACCESS)); + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else { + GOTO(out_xattr, rc = -ERANGE); + } + } + } else { +getxattr_nocache: + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, + name, size, &req); + if (rc < 0) + GOTO(out_xattr, rc); + + /* only detect the xattr size */ + if (size == 0) + GOTO(out, rc); + + if (size < rc) + GOTO(out, rc = -ERANGE); + + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + rc); + if (!xdata) + GOTO(out, rc = -EPROTO); + + memcpy(buffer, xdata, rc); + } + + EXIT; + +out_xattr: + if (rc == -EOPNOTSUPP && type == XATTR_USER_T) { + LCONSOLE_INFO("%s: disabling user_xattr feature because " + "it is not supported on the server: rc = %d\n", + sbi->ll_fsname, rc); + clear_bit(LL_SBI_USER_XATTR, sbi->ll_flags); + } +out: + ptlrpc_req_finished(req); + RETURN(rc); +} + +static int ll_xattr_get_common(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, + const char *name, void *buffer, size_t size) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + ktime_t kstart = ktime_get(); + char *fullname; + int rc; + + ENTRY; + + rc = xattr_type_filter(sbi, handler); + if (rc) + RETURN(rc); + + rc = ll_security_secctx_name_filter(sbi, handler->flags, name); + if (rc) + RETURN(rc); + +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL + /* posix acl is under protection of LOOKUP lock. when calling to this, + * we just have path resolution to the target inode, so we have great + * chance that cached ACL is uptodate. 
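+	 * Serve it directly from lli_posix_acl under lli_lock below rather
+	 * than issuing a getxattr request to the MDS.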
+ */ + if (handler->flags == XATTR_ACL_ACCESS_T) { + struct ll_inode_info *lli = ll_i2info(inode); + struct posix_acl *acl; + + read_lock(&lli->lli_lock); + acl = posix_acl_dup(lli->lli_posix_acl); + read_unlock(&lli->lli_lock); + + if (!acl) + RETURN(-ENODATA); + + rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + RETURN(rc); + } + if (handler->flags == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode)) + RETURN(-ENODATA); +#endif + + fullname = kasprintf(GFP_KERNEL, "%s%s", xattr_prefix(handler), name); + if (!fullname) + RETURN(-ENOMEM); + + rc = ll_xattr_list(inode, fullname, handler->flags, buffer, size, + OBD_MD_FLXATTR); + kfree(fullname); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(rc); +} + +static ssize_t ll_getxattr_lov(struct inode *inode, void *buf, size_t buf_size) +{ + ssize_t rc; + + if (S_ISREG(inode->i_mode)) { + struct cl_object *obj = ll_i2info(inode)->lli_clob; + struct cl_layout cl = { + .cl_buf.lb_buf = buf, + .cl_buf.lb_len = buf_size, + }; + struct lu_env *env; + u16 refcheck; + + if (!obj) + RETURN(-ENODATA); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_object_layout_get(env, obj, &cl); + if (rc < 0) + GOTO(out_env, rc); + + if (!cl.cl_size) + GOTO(out_env, rc = -ENODATA); + + rc = cl.cl_size; + + if (!buf_size) + GOTO(out_env, rc); + + LASSERT(buf && rc <= buf_size); + + /* + * Do not return layout gen for getxattr() since + * otherwise it would confuse tar --xattr by + * recognizing layout gen as stripe offset when the + * file is restored. See LU-2809. + */ + if ((((struct lov_mds_md *)buf)->lmm_magic & + __swab32(LOV_MAGIC_MAGIC)) == __swab32(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md((struct lov_user_md *)buf, + cl.cl_size); + + switch (((struct lov_mds_md *)buf)->lmm_magic) { + case LOV_MAGIC_V1: + case LOV_MAGIC_V3: + case LOV_MAGIC_SPECIFIC: + ((struct lov_mds_md *)buf)->lmm_layout_gen = 0; + break; + case LOV_MAGIC_COMP_V1: + case LOV_MAGIC_FOREIGN: + goto out_env; + default: + CERROR("Invalid LOV magic %08x\n", + ((struct lov_mds_md *)buf)->lmm_magic); + GOTO(out_env, rc = -EINVAL); + } + +out_env: + cl_env_put(env, &refcheck); + + RETURN(rc); + } else if (S_ISDIR(inode->i_mode)) { + struct ptlrpc_request *req = NULL; + struct ptlrpc_request *root_req = NULL; + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + + rc = ll_dir_getstripe_default(inode, (void **)&lmm, &lmm_size, + &req, &root_req, 0); + if (rc < 0) + GOTO(out_req, rc); + + if (!buf_size) + GOTO(out_req, rc = lmm_size); + + if (buf_size < lmm_size) + GOTO(out_req, rc = -ERANGE); + + memcpy(buf, lmm, lmm_size); + GOTO(out_req, rc = lmm_size); +out_req: + if (req) + ptlrpc_req_finished(req); + if (root_req) + ptlrpc_req_finished(root_req); + + RETURN(rc); + } else { + RETURN(-ENODATA); + } +} + +static int ll_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), xattr %s\n", + PFID(ll_inode2fid(inode)), inode, name); + + if (!strcmp(name, "lov")) { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); + + return ll_getxattr_lov(inode, buffer, size); + } + + return ll_xattr_get_common(handler, dentry, inode, name, buffer, size); +} + +ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct ll_sb_info *sbi = 
ll_i2sbi(inode); + ktime_t kstart = ktime_get(); + char *xattr_name; + ssize_t rc, rc2; + size_t len, rem; + + LASSERT(inode); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + rc = ll_xattr_list(inode, NULL, XATTR_OTHER_T, buffer, size, + OBD_MD_FLXATTRLS); + if (rc < 0) + RETURN(rc); + + /* + * If we're being called to get the size of the xattr list + * (size == 0) then just assume that a lustre.lov xattr + * exists. + */ + if (!size) + goto out; + + xattr_name = buffer; + rem = rc; + + while (rem > 0) { + const struct xattr_handler *xh = get_xattr_type(xattr_name); + bool hide_xattr = false; + + /* Hide virtual project id xattr from the list when + * parent has the inherit flag and the same project id, + * so project id won't be messed up by copying the xattrs + * when mv to a tree with different project id. + */ + if (xh && xh->flags == XATTR_TRUSTED_T && + strcmp(xattr_name, XATTR_NAME_PROJID) == 0) { + struct inode *dir = d_inode(dentry->d_parent); + + if ((ll_i2info(inode)->lli_projid == + ll_i2info(dir)->lli_projid) && + test_bit(LLIF_PROJECT_INHERIT, + &ll_i2info(dir)->lli_flags)) + hide_xattr = true; + } else if (xh && xh->flags == XATTR_SECURITY_T && + strcmp(xattr_name, "security.c") == 0) { + /* Listing xattrs should not expose encryption + * context. There is no handler defined for + * XATTR_ENCRYPTION_PREFIX, so this test is just + * needed for compatibility with 2.14, in which + * encryption context is stored in security.c xattr. + */ + hide_xattr = true; + } + + len = strnlen(xattr_name, rem - 1) + 1; + rem -= len; + if (!xattr_type_filter(sbi, hide_xattr ? NULL : xh)) { + /* Skip OK xattr type, leave it in buffer. */ + xattr_name += len; + continue; + } + + /* + * Move up remaining xattrs in buffer + * removing the xattr that is not OK. 
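+		 * The list is compacted in place with memmove() and the
+		 * returned length is reduced accordingly.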
+ */ + memmove(xattr_name, xattr_name + len, rem); + rc -= len; + } + + rc2 = ll_getxattr_lov(inode, NULL, 0); + if (rc2 == -ENODATA) + RETURN(rc); + + if (rc2 < 0) + RETURN(rc2); + + if (size < rc + sizeof(XATTR_LUSTRE_LOV)) + RETURN(-ERANGE); + + memcpy(buffer + rc, XATTR_LUSTRE_LOV, sizeof(XATTR_LUSTRE_LOV)); + +out: + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(rc + sizeof(XATTR_LUSTRE_LOV)); +} + +#ifdef HAVE_XATTR_HANDLER_SIMPLIFIED +static int ll_xattr_get_common_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + return ll_xattr_get_common(handler, dentry, dentry->d_inode, name, + buffer, size); +} + +static int ll_xattr_get_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + return ll_xattr_get(handler, dentry, dentry->d_inode, name, buffer, + size); +} + +static int ll_xattr_set_common_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return ll_xattr_set_common(handler, dentry, dentry->d_inode, name, + value, size, flags); +} + +static int ll_xattr_set_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return ll_xattr_set(handler, dentry, dentry->d_inode, name, value, + size, flags); +} + +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) +const struct xattr_handler *get_xattr_handler(int handler_flag) +{ + int i = 0; + + while (ll_xattr_handlers[i]) { + if (ll_xattr_handlers[i]->flags == handler_flag) + return ll_xattr_handlers[i]; + i++; + } + return NULL; +} + +static int ll_xattr_get_common_3_11(struct dentry *dentry, const char *name, + void *buffer, size_t size, int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_get_common(handler, dentry, dentry->d_inode, name, + buffer, size); +} + +static int ll_xattr_get_3_11(struct dentry *dentry, const char *name, + void *buffer, size_t size, int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_get(handler, dentry, dentry->d_inode, name, buffer, + size); +} + +static int ll_xattr_set_common_3_11(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_set_common(handler, NULL, dentry, dentry->d_inode, name, + value, size, flags); +} + +static int ll_xattr_set_3_11(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_set(handler, NULL, dentry, dentry->d_inode, name, value, + size, flags); +} +#endif + +static const struct xattr_handler ll_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = XATTR_USER_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + 
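/* kernels where the xattr handler receives the inode directly
+	 * (and, with HAVE_USER_NAMESPACE_ARG, the mount user namespace) */
+	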
.get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = XATTR_TRUSTED_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_4_3, + .set = ll_xattr_set_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_3_11, + .set = ll_xattr_set_3_11, +#else + .get = ll_xattr_get, + .set = ll_xattr_set, +#endif +}; + +static const struct xattr_handler ll_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = XATTR_SECURITY_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_acl_access_xattr_handler = { +#ifdef HAVE_XATTR_HANDLER_NAME + .name = XATTR_NAME_POSIX_ACL_ACCESS, +#else + .prefix = XATTR_NAME_POSIX_ACL_ACCESS, +#endif + .flags = XATTR_ACL_ACCESS_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_acl_default_xattr_handler = { +#ifdef HAVE_XATTR_HANDLER_NAME + .name = XATTR_NAME_POSIX_ACL_DEFAULT, +#else + .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, +#endif + .flags = XATTR_ACL_DEFAULT_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_lustre_xattr_handler = { + .prefix = XATTR_LUSTRE_PREFIX, + .flags = XATTR_LUSTRE_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_4_3, + .set = ll_xattr_set_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_3_11, + .set = ll_xattr_set_3_11, +#else + .get = ll_xattr_get, + .set = ll_xattr_set, +#endif +}; + +const struct xattr_handler *ll_xattr_handlers[] = { + &ll_user_xattr_handler, + &ll_trusted_xattr_handler, + &ll_security_xattr_handler, +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL + &ll_acl_access_xattr_handler, + &ll_acl_default_xattr_handler, +#endif + &ll_lustre_xattr_handler, + NULL, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c b/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c new file mode 100644 index 0000000000000..0a751744e4f20 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c @@ -0,0 +1,671 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Copyright (c) 2013, 2017, Intel Corporation. + * + * Author: Andrew Perepechko + * + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include "llite_internal.h" + +/* If we ever have hundreds of extended attributes, we might want to consider + * using a hash or a tree structure instead of list for faster lookups. + */ +struct ll_xattr_entry { + struct list_head xe_list; /* protected with + * lli_xattrs_list_rwsem */ + char *xe_name; /* xattr name, \0-terminated */ + char *xe_value; /* xattr value */ + unsigned xe_namelen; /* strlen(xe_name) + 1 */ + unsigned xe_vallen; /* xattr value length */ +}; + +static struct kmem_cache *xattr_kmem; +static struct lu_kmem_descr xattr_caches[] = { + { + .ckd_cache = &xattr_kmem, + .ckd_name = "xattr_kmem", + .ckd_size = sizeof(struct ll_xattr_entry) + }, + { + .ckd_cache = NULL + } +}; + +int ll_xattr_init(void) +{ + return lu_kmem_init(xattr_caches); +} + +void ll_xattr_fini(void) +{ + lu_kmem_fini(xattr_caches); +} + +/** + * Initializes xattr cache for an inode. + * + * This initializes the xattr list and marks cache presence. + */ +static void ll_xattr_cache_init(struct ll_inode_info *lli) +{ + ENTRY; + + LASSERT(lli != NULL); + + INIT_LIST_HEAD(&lli->lli_xattrs); + set_bit(LLIF_XATTR_CACHE, &lli->lli_flags); +} + +/** + * This looks for a specific extended attribute. + * + * Find in @cache and return @xattr_name attribute in @xattr, + * for the NULL @xattr_name return the first cached @xattr. + * + * \retval 0 success + * \retval -ENODATA if not found + */ +static int ll_xattr_cache_find(struct list_head *cache, + const char *xattr_name, + struct ll_xattr_entry **xattr) +{ + struct ll_xattr_entry *entry; + + ENTRY; + + list_for_each_entry(entry, cache, xe_list) { + /* xattr_name == NULL means look for any entry */ + if (xattr_name == NULL || + strcmp(xattr_name, entry->xe_name) == 0) { + *xattr = entry; + CDEBUG(D_CACHE, "find: [%s]=%.*s\n", + entry->xe_name, entry->xe_vallen, + entry->xe_value); + RETURN(0); + } + } + + RETURN(-ENODATA); +} + +/** + * This adds an xattr. 
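+ *
+ * For illustration only (the attribute here is made up, the sizes are what
+ * the code below computes): after
+ *
+ *     ll_xattr_cache_add(cache, "user.foo", "bar", 3);
+ *
+ * the new entry holds xe_name = "user.foo" with xe_namelen = 9 (strlen()
+ * plus the trailing NUL) and a separately allocated 3-byte copy of the
+ * value with xe_vallen = 3, linked at the head of @cache.  Adding a name
+ * that is already cached fails with -EPROTO, except for the encryption
+ * context xattr which is silently kept as is.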
+ * + * Add @xattr_name attr with @xattr_val value and @xattr_val_len length, + * + * \retval 0 success + * \retval -ENOMEM if no memory could be allocated for the cached attr + * \retval -EPROTO if duplicate xattr is being added + */ +static int ll_xattr_cache_add(struct list_head *cache, + const char *xattr_name, + const char *xattr_val, + unsigned xattr_val_len) +{ + struct ll_xattr_entry *xattr; + + ENTRY; + + if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { + if (!strcmp(xattr_name, LL_XATTR_NAME_ENCRYPTION_CONTEXT) || + !strcmp(xattr_name, LL_XATTR_NAME_ENCRYPTION_CONTEXT_OLD)) + /* it means enc ctx was already in cache, + * ignore error as it cannot be modified + */ + RETURN(0); + + CDEBUG(D_CACHE, "duplicate xattr: [%s]\n", xattr_name); + RETURN(-EPROTO); + } + + OBD_SLAB_ALLOC_PTR_GFP(xattr, xattr_kmem, GFP_NOFS); + if (xattr == NULL) { + CDEBUG(D_CACHE, "failed to allocate xattr\n"); + RETURN(-ENOMEM); + } + + xattr->xe_namelen = strlen(xattr_name) + 1; + + OBD_ALLOC(xattr->xe_name, xattr->xe_namelen); + if (!xattr->xe_name) { + CDEBUG(D_CACHE, "failed to alloc xattr name %u\n", + xattr->xe_namelen); + goto err_name; + } + OBD_ALLOC(xattr->xe_value, xattr_val_len); + if (!xattr->xe_value) { + CDEBUG(D_CACHE, "failed to alloc xattr value %d\n", + xattr_val_len); + goto err_value; + } + + memcpy(xattr->xe_name, xattr_name, xattr->xe_namelen); + memcpy(xattr->xe_value, xattr_val, xattr_val_len); + xattr->xe_vallen = xattr_val_len; + list_add(&xattr->xe_list, cache); + + CDEBUG(D_CACHE, "set: [%s]=%.*s\n", xattr_name, + xattr_val_len, xattr_val); + + RETURN(0); +err_value: + OBD_FREE(xattr->xe_name, xattr->xe_namelen); +err_name: + OBD_SLAB_FREE_PTR(xattr, xattr_kmem); + + RETURN(-ENOMEM); +} + +/** + * This removes an extended attribute from cache. + * + * Remove @xattr_name attribute from @cache. + * + * \retval 0 success + * \retval -ENODATA if @xattr_name is not cached + */ +static int ll_xattr_cache_del(struct list_head *cache, + const char *xattr_name) +{ + struct ll_xattr_entry *xattr; + + ENTRY; + + CDEBUG(D_CACHE, "del xattr: %s\n", xattr_name); + + if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { + list_del(&xattr->xe_list); + OBD_FREE(xattr->xe_name, xattr->xe_namelen); + OBD_FREE(xattr->xe_value, xattr->xe_vallen); + OBD_SLAB_FREE_PTR(xattr, xattr_kmem); + + RETURN(0); + } + + RETURN(-ENODATA); +} + +/** + * This iterates cached extended attributes. + * + * Walk over cached attributes in @cache and + * fill in @xld_buffer or only calculate buffer + * size if @xld_buffer is NULL. + * + * \retval >= 0 buffer list size + * \retval -ENODATA if the list cannot fit @xld_size buffer + */ +static int ll_xattr_cache_list(struct list_head *cache, + char *xld_buffer, + int xld_size) +{ + struct ll_xattr_entry *xattr, *tmp; + int xld_tail = 0; + + ENTRY; + + list_for_each_entry_safe(xattr, tmp, cache, xe_list) { + CDEBUG(D_CACHE, "list: buffer=%p[%d] name=%s\n", + xld_buffer, xld_tail, xattr->xe_name); + + if (xld_buffer) { + xld_size -= xattr->xe_namelen; + if (xld_size < 0) + break; + memcpy(&xld_buffer[xld_tail], + xattr->xe_name, xattr->xe_namelen); + } + xld_tail += xattr->xe_namelen; + } + + if (xld_size < 0) + RETURN(-ERANGE); + + RETURN(xld_tail); +} + +/** + * Check if the xattr cache is initialized. + * + * \retval 0 @cache is not initialized + * \retval 1 @cache is initialized + */ +static int ll_xattr_cache_valid(struct ll_inode_info *lli) +{ + return test_bit(LLIF_XATTR_CACHE, &lli->lli_flags); +} + +/** + * Check if the xattr cache is filled. 
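+ *
+ * Note the two flags differ: LLIF_XATTR_CACHE (tested by
+ * ll_xattr_cache_valid()) only says the lli_xattrs list has been
+ * initialized, while LLIF_XATTR_CACHE_FILLED says it was actually
+ * populated from the MDT by ll_xattr_cache_refill().  A simplified check
+ * for "the cache can serve data" (not a real helper, shown for
+ * illustration) would be:
+ *
+ *     return test_bit(LLIF_XATTR_CACHE, &lli->lli_flags) &&
+ *            test_bit(LLIF_XATTR_CACHE_FILLED, &lli->lli_flags);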
+ * + * \retval 0 @cache is not filled + * \retval 1 @cache is filled + */ +static int ll_xattr_cache_filled(struct ll_inode_info *lli) +{ + return test_bit(LLIF_XATTR_CACHE_FILLED, &lli->lli_flags); +} + +/** + * This finalizes the xattr cache. + * + * Free all xattr memory. @lli is the inode info pointer. + * + * \retval 0 no error occured + */ +static int ll_xattr_cache_destroy_locked(struct ll_inode_info *lli) +{ + ENTRY; + + if (!ll_xattr_cache_valid(lli)) + RETURN(0); + + while (ll_xattr_cache_del(&lli->lli_xattrs, NULL) == 0) + /* empty loop */ ; + + clear_bit(LLIF_XATTR_CACHE_FILLED, &lli->lli_flags); + clear_bit(LLIF_XATTR_CACHE, &lli->lli_flags); + + RETURN(0); +} + +int ll_xattr_cache_destroy(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + ENTRY; + + down_write(&lli->lli_xattrs_list_rwsem); + rc = ll_xattr_cache_destroy_locked(lli); + up_write(&lli->lli_xattrs_list_rwsem); + + RETURN(rc); +} + +/** + * ll_xattr_cache_empty - empty xattr cache for @ino + * + * Similar to ll_xattr_cache_destroy(), but preserves encryption context. + * So only LLIF_XATTR_CACHE_FILLED flag is cleared, but not LLIF_XATTR_CACHE. + */ +int ll_xattr_cache_empty(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_xattr_entry *entry, *n; + + ENTRY; + + down_write(&lli->lli_xattrs_list_rwsem); + if (!ll_xattr_cache_valid(lli) || + !ll_xattr_cache_filled(lli)) + GOTO(out_empty, 0); + + list_for_each_entry_safe(entry, n, &lli->lli_xattrs, xe_list) { + if (strcmp(entry->xe_name, xattr_for_enc(inode)) == 0) + continue; + + CDEBUG(D_CACHE, "delete: %s\n", entry->xe_name); + list_del(&entry->xe_list); + OBD_FREE(entry->xe_name, entry->xe_namelen); + OBD_FREE(entry->xe_value, entry->xe_vallen); + OBD_SLAB_FREE_PTR(entry, xattr_kmem); + } + clear_bit(LLIF_XATTR_CACHE_FILLED, &lli->lli_flags); + +out_empty: + up_write(&lli->lli_xattrs_list_rwsem); + RETURN(0); +} + +/** + * Match or enqueue a PR lock. + * + * Find or request an LDLM lock with xattr data. + * Since LDLM does not provide API for atomic match_or_enqueue, + * the function handles it with a separate enq lock. + * If successful, the function exits with a write lock held + * on lli_xattrs_list_rwsem. + * + * \retval 0 no error occured + * \retval -ENOMEM not enough memory + */ +static int ll_xattr_find_get_lock(struct inode *inode, + struct lookup_intent *oit, + struct ptlrpc_request **req) +{ + enum ldlm_mode mode; + struct lustre_handle lockh = { 0 }; + struct md_op_data *op_data; + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_export *exp = sbi->ll_md_exp; + int rc; + + ENTRY; + + mutex_lock(&lli->lli_xattrs_enq_lock); + /* inode may have been shrunk and recreated, so data is gone, match lock + * only when data exists. */ + if (ll_xattr_cache_filled(lli)) { + /* Try matching first. */ + mode = ll_take_md_lock(inode, MDS_INODELOCK_XATTR, &lockh, 0, + LCK_PR); + if (mode != 0) { + /* fake oit in mdc_revalidate_lock() manner */ + oit->it_lock_handle = lockh.cookie; + oit->it_lock_mode = mode; + goto out; + } + } + + /* Enqueue if the lock isn't cached locally. 
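+	 *
+	 * Overall this implements "match or enqueue" under
+	 * lli_xattrs_enq_lock, roughly (simplified, error handling left
+	 * out):
+	 *
+	 *     mutex_lock(&lli->lli_xattrs_enq_lock);
+	 *     if (ll_xattr_cache_filled(lli) &&
+	 *         ll_take_md_lock(inode, MDS_INODELOCK_XATTR, &lockh, 0,
+	 *                         LCK_PR))
+	 *             reuse the matched PR lock;
+	 *     else
+	 *             md_intent_lock(exp, op_data, oit, req, ...);
+	 *     down_write(&lli->lli_xattrs_list_rwsem);
+	 *     mutex_unlock(&lli->lli_xattrs_enq_lock);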
*/ + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + mutex_unlock(&lli->lli_xattrs_enq_lock); + RETURN(PTR_ERR(op_data)); + } + + op_data->op_valid = OBD_MD_FLXATTR | OBD_MD_FLXATTRLS; + + rc = md_intent_lock(exp, op_data, oit, req, &ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + *req = oit->it_request; + + if (rc < 0) { + CDEBUG(D_CACHE, "md_intent_lock failed with %d for fid "DFID"\n", + rc, PFID(ll_inode2fid(inode))); + mutex_unlock(&lli->lli_xattrs_enq_lock); + RETURN(rc); + } + +out: + down_write(&lli->lli_xattrs_list_rwsem); + mutex_unlock(&lli->lli_xattrs_enq_lock); + + RETURN(0); +} + +/** + * Refill the xattr cache. + * + * Fetch and cache the whole of xattrs for @inode, thanks to the write lock + * on lli_xattrs_list_rwsem obtained from ll_xattr_find_get_lock(). + * If successful, this write lock is kept. + * + * \retval 0 no error occured + * \retval -EPROTO network protocol error + * \retval -ENOMEM not enough memory for the cache + */ +static int ll_xattr_cache_refill(struct inode *inode) +{ + struct lookup_intent oit = { .it_op = IT_GETXATTR }; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + const char *xdata, *xval, *xtail, *xvtail; + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body; + __u32 *xsizes; + int rc = 0, i; + + ENTRY; + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_XATTR_PAUSE, cfs_fail_val ?: 2); + + rc = ll_xattr_find_get_lock(inode, &oit, &req); + if (rc) + GOTO(err_req, rc); + + /* Do we have the data at this point? */ + if (ll_xattr_cache_filled(lli)) { + ll_stats_ops_tally(sbi, LPROC_LL_GETXATTR_HITS, 1); + ll_intent_drop_lock(&oit); + GOTO(err_req, rc = 0); + } + + /* Matched but no cache? Cancelled on error by a parallel refill. 
*/ + if (unlikely(req == NULL)) { + CDEBUG(D_CACHE, "cancelled by a parallel getxattr\n"); + ll_intent_drop_lock(&oit); + GOTO(err_unlock, rc = -EAGAIN); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + CERROR("no MDT BODY in the refill xattr reply\n"); + GOTO(err_cancel, rc = -EPROTO); + } + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + body->mbo_eadatasize); + xval = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS, + body->mbo_aclsize); + xsizes = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS_LENS, + body->mbo_max_mdsize * + sizeof(__u32)); + if (xdata == NULL || xval == NULL || xsizes == NULL) { + CERROR("wrong setxattr reply\n"); + GOTO(err_cancel, rc = -EPROTO); + } + + xtail = xdata + body->mbo_eadatasize; + xvtail = xval + body->mbo_aclsize; + + CDEBUG(D_CACHE, "caching: xdata=%p xtail=%p\n", xdata, xtail); + + if (!ll_xattr_cache_valid(lli)) + ll_xattr_cache_init(lli); + + for (i = 0; i < body->mbo_max_mdsize; i++) { + CDEBUG(D_CACHE, "caching [%s]=%.*s\n", xdata, *xsizes, xval); + /* Perform consistency checks: attr names and vals in pill */ + if (memchr(xdata, 0, xtail - xdata) == NULL) { + CERROR("xattr protocol violation (names are broken)\n"); + rc = -EPROTO; + } else if (xval + *xsizes > xvtail) { + CERROR("xattr protocol violation (vals are broken)\n"); + rc = -EPROTO; + } else if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_XATTR_ENOMEM)) { + rc = -ENOMEM; + } else if (!strcmp(xdata, XATTR_NAME_ACL_ACCESS)) { + /* Filter out ACL ACCESS since it's cached separately */ + CDEBUG(D_CACHE, "not caching %s\n", + XATTR_NAME_ACL_ACCESS); + rc = 0; + } else if (ll_xattr_is_seclabel(xdata)) { + /* Filter out security label, it is cached in slab */ + CDEBUG(D_CACHE, "not caching %s\n", xdata); + rc = 0; + } else if (!strcmp(xdata, XATTR_NAME_SOM)) { + /* Filter out trusted.som, it is not cached on client */ + CDEBUG(D_CACHE, "not caching trusted.som\n"); + rc = 0; + } else { + rc = ll_xattr_cache_add(&lli->lli_xattrs, xdata, xval, + *xsizes); + } + if (rc < 0) { + ll_xattr_cache_destroy_locked(lli); + GOTO(err_cancel, rc); + } + xdata += strlen(xdata) + 1; + xval += *xsizes; + xsizes++; + } + + if (xdata != xtail || xval != xvtail) + CERROR("a hole in xattr data\n"); + else + set_bit(LLIF_XATTR_CACHE_FILLED, &lli->lli_flags); + + ll_set_lock_data(sbi->ll_md_exp, inode, &oit, NULL); + ll_intent_drop_lock(&oit); + + ptlrpc_req_finished(req); + RETURN(0); + +err_cancel: + ldlm_lock_decref_and_cancel((struct lustre_handle *) + &oit.it_lock_handle, + oit.it_lock_mode); +err_unlock: + up_write(&lli->lli_xattrs_list_rwsem); +err_req: + if (rc == -ERANGE) + rc = -EAGAIN; + + ptlrpc_req_finished(req); + RETURN(rc); +} + +/** + * Get an xattr value or list xattrs using the write-through cache. + * + * Get the xattr value (@valid has OBD_MD_FLXATTR set) of @name or + * list xattr names (@valid has OBD_MD_FLXATTRLS set) for @inode. + * The resulting value/list is stored in @buffer if the former + * is not larger than @size. 
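+ *
+ * Exactly one of OBD_MD_FLXATTR and OBD_MD_FLXATTRLS must be set in @valid.
+ * A zero @size only probes for the required buffer size, so the usual
+ * calling pattern is (illustration only, error handling and freeing
+ * omitted):
+ *
+ *     rc = ll_xattr_cache_get(inode, name, NULL, 0, OBD_MD_FLXATTR);
+ *     OBD_ALLOC(buf, rc);
+ *     rc = ll_xattr_cache_get(inode, name, buf, rc, OBD_MD_FLXATTR);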
+ * + * \retval 0 no error occured + * \retval -EPROTO network protocol error + * \retval -ENOMEM not enough memory for the cache + * \retval -ERANGE the buffer is not large enough + * \retval -ENODATA no such attr or the list is empty + */ +int ll_xattr_cache_get(struct inode *inode, + const char *name, + char *buffer, + size_t size, + __u64 valid) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc = 0; + + ENTRY; + + LASSERT(!!(valid & OBD_MD_FLXATTR) ^ !!(valid & OBD_MD_FLXATTRLS)); + + down_read(&lli->lli_xattrs_list_rwsem); + /* For performance reasons, we do not want to refill complete xattr + * cache if we are just interested in encryption context. + */ + if ((valid & OBD_MD_FLXATTRLS || + strcmp(name, xattr_for_enc(inode)) != 0) && + !ll_xattr_cache_filled(lli)) { + up_read(&lli->lli_xattrs_list_rwsem); + rc = ll_xattr_cache_refill(inode); + if (rc) + RETURN(rc); + /* Turn the write lock obtained in ll_xattr_cache_refill() + * into a read lock. + */ + downgrade_write(&lli->lli_xattrs_list_rwsem); + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR_HITS, 1); + } + + if (!ll_xattr_cache_valid(lli)) + GOTO(out, rc = -ENODATA); + + if (valid & OBD_MD_FLXATTR) { + struct ll_xattr_entry *xattr; + + rc = ll_xattr_cache_find(&lli->lli_xattrs, name, &xattr); + if (rc == 0) { + rc = xattr->xe_vallen; + /* zero size means we are only requested size in rc */ + if (size != 0) { + if (size >= xattr->xe_vallen) + memcpy(buffer, xattr->xe_value, + xattr->xe_vallen); + else + rc = -ERANGE; + } + /* Return the project id when the virtual project id xattr + * is explicitly asked. + */ + } else if (strcmp(name, XATTR_NAME_PROJID) == 0) { + /* 10 chars to hold u32 in decimal, plus ending \0 */ + char projid[11]; + + rc = snprintf(projid, sizeof(projid), + "%u", lli->lli_projid); + if (size != 0) { + if (rc <= size) + memcpy(buffer, projid, rc); + else + rc = -ERANGE; + } + } + } else if (valid & OBD_MD_FLXATTRLS) { + rc = ll_xattr_cache_list(&lli->lli_xattrs, + size ? buffer : NULL, size); + } + + GOTO(out, rc); +out: + up_read(&lli->lli_xattrs_list_rwsem); + + RETURN(rc); +} + +/** + * Insert an xattr value into the cache. + * + * Add @name xattr with @buffer value and @size length for @inode. + * Init cache for @inode if necessary. + * + * \retval 0 success + * \retval < 0 from ll_xattr_cache_add(), except -EPROTO is ignored for + * LL_XATTR_NAME_ENCRYPTION_CONTEXT xattr + */ +int ll_xattr_cache_insert(struct inode *inode, + const char *name, + char *buffer, + size_t size) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + ENTRY; + + down_write(&lli->lli_xattrs_list_rwsem); + if (!ll_xattr_cache_valid(lli)) + ll_xattr_cache_init(lli); + rc = ll_xattr_cache_add(&lli->lli_xattrs, name, buffer, size); + up_write(&lli->lli_xattrs_list_rwsem); + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c new file mode 100644 index 0000000000000..df34ab353efb3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c @@ -0,0 +1,328 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * GPL HEADER END + */ + +/* + * Copyright (c) 2014 Bull SAS + * + * Copyright (c) 2015, 2016, Intel Corporation. + * Author: Sebastien Buisson sebastien.buisson@bull.net + */ + +/* + * lustre/llite/xattr_security.c + * Handler for storing security labels as extended attributes. + */ + +#include +#include +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED +#include +#endif +#include +#include "llite_internal.h" + +#ifndef XATTR_SELINUX_SUFFIX +# define XATTR_SELINUX_SUFFIX "selinux" +#endif + +#ifndef XATTR_NAME_SELINUX +# define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX +#endif + +#ifdef HAVE_SECURITY_DENTRY_INIT_SECURTY_WITH_CTX +#define HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG 1 +#endif + +/* + * Check for LL_SBI_FILE_SECCTX before calling. + */ +int ll_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, + const char **secctx_name, __u32 *secctx_name_size, + void **secctx, __u32 *secctx_size, int *secctx_slot) +{ + struct ll_sb_info *sbi = ll_s2sbi(dentry->d_sb); +#ifdef HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG + const char *secctx_name_lsm = NULL; +#endif +#ifdef HAVE_SECURITY_DENTRY_INIT_SECURTY_WITH_CTX + struct lsmcontext ctx = {}; +#endif + int rc; + + /* + * Before kernel 5.15-rc1-20-g15bf32398ad4, + * security_inode_init_security() does not return to us the name of the + * extended attribute to store the context under (for example + * "security.selinux"). So we only call it when we think we know what + * the name of the extended attribute will be. This is OK-ish since + * SELinux is the only module that implements + * security_dentry_init_security(). Note that the NFS client code just + * calls it and assumes that if anything is returned then it must come + * from SELinux. + */ + + *secctx_name_size = ll_secctx_name_get(sbi, secctx_name); + /* xattr name length == 0 means no LSM module manage file contexts */ + if (*secctx_name_size == 0) + return 0; + + rc = security_dentry_init_security(dentry, mode, name, +#ifdef HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG + &secctx_name_lsm, +#endif +#ifdef HAVE_SECURITY_DENTRY_INIT_SECURTY_WITH_CTX + &ctx); +#else + secctx, secctx_size); +#endif + /* ignore error if the hook is not supported by the LSM module */ + if (rc == -EOPNOTSUPP) + return 0; + if (rc < 0) + return rc; + +#ifdef HAVE_SECURITY_DENTRY_INIT_SECURTY_WITH_CTX + *secctx = ctx.context; + *secctx_size = ctx.len; + *secctx_slot = ctx.slot; +#endif + +#ifdef HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG + if (strncmp(*secctx_name, secctx_name_lsm, *secctx_name_size) != 0) { + CERROR("%s: LSM secctx_name '%s' does not match the one stored by Lustre '%s'\n", + sbi->ll_fsname, secctx_name_lsm, *secctx_name); + return -EOPNOTSUPP; + } +#endif + + return 0; +} + +/** + * A helper function for security_inode_init_security() + * that takes care of setting xattrs + * + * Get security context of @inode from @xattr_array, + * and put it in 'security.xxx' xattr of dentry + * stored in @fs_info. 
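+ *
+ * security_inode_init_security() invokes this callback with a
+ * NULL-terminated array of struct xattr whose names carry only the suffix
+ * (e.g. "selinux" for "security.selinux"); each entry is stored after
+ * rebuilding the full name, conceptually:
+ *
+ *     for (xattr = xattr_array; xattr->name; xattr++)
+ *             set "security.<xattr->name>" = xattr->value
+ *                                            (xattr->value_len bytes);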
+ * + * \retval 0 success + * \retval -ENOMEM if no memory could be allocated for xattr name + * \retval < 0 failure to set xattr + */ +static int +ll_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + struct dentry *dentry = fs_info; + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name; xattr++) { + char *full_name; + + full_name = kasprintf(GFP_KERNEL, "%s%s", + XATTR_SECURITY_PREFIX, xattr->name); + if (!full_name) { + err = -ENOMEM; + break; + } + + err = ll_vfs_setxattr(dentry, inode, full_name, xattr->value, + xattr->value_len, XATTR_CREATE); + kfree(full_name); + if (err < 0) + break; + } + return err; +} + +/** + * Initializes security context + * + * Get security context of @inode in @dir, + * and put it in 'security.xxx' xattr of @dentry. + * + * \retval 0 success, or SELinux is disabled + * \retval -ENOMEM if no memory could be allocated for xattr name + * \retval < 0 failure to get security context or set xattr + */ +int +ll_inode_init_security(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + int rc; + + if (!ll_security_xattr_wanted(dir)) + return 0; + + rc = security_inode_init_security(inode, dir, NULL, + &ll_initxattrs, dentry); + if (rc == -EOPNOTSUPP) + return 0; + + return rc; +} + +/** + * Notify security context to the security layer + * + * Notify security context @secctx of inode @inode to the security layer. + * + * \retval 0 success, or SELinux is disabled or not supported by the fs + * \retval < 0 failure to set the security context + */ +int ll_inode_notifysecctx(struct inode *inode, + void *secctx, __u32 secctxlen) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + int rc; + + if (!test_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags) || + !ll_security_xattr_wanted(inode) || + !secctx || !secctxlen) + return 0; + + /* no need to protect selinux_inode_setsecurity() by + * inode_lock. Taking it would lead to a client deadlock + * LU-13617 + */ + rc = security_inode_notifysecctx(inode, secctx, secctxlen); + if (rc) + CWARN("%s: cannot set security context for "DFID": rc = %d\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode)), rc); + + return rc; +} + +/** + * Free the security context xattr name used by policy + */ +void ll_secctx_name_free(struct ll_sb_info *sbi) +{ + OBD_FREE(sbi->ll_secctx_name, sbi->ll_secctx_name_size + 1); + sbi->ll_secctx_name = NULL; + sbi->ll_secctx_name_size = 0; +} + +/** + * Get security context xattr name used by policy and save it. 
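+ *
+ * security_inode_listsecurity() is used in the common two step way: a
+ * first call with a NULL buffer returns the size of the LSM xattr name
+ * (e.g. "security.selinux"), then the name is copied into the freshly
+ * allocated sbi->ll_secctx_name and checked for the "security." prefix.
+ * In outline (error handling omitted):
+ *
+ *     len = security_inode_listsecurity(inode, NULL, 0);
+ *     OBD_ALLOC(name, len + 1);
+ *     security_inode_listsecurity(inode, name, len);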
+ * + * \retval > 0 length of xattr name + * \retval == 0 no LSM module registered supporting security contexts + * \retval <= 0 failure to get xattr name or xattr is not supported + */ +int ll_secctx_name_store(struct inode *in) +{ + struct ll_sb_info *sbi = ll_i2sbi(in); + int rc = 0; + + if (!ll_security_xattr_wanted(in)) + return 0; + + /* get size of xattr name */ + rc = security_inode_listsecurity(in, NULL, 0); + if (rc <= 0) + return rc; + + if (sbi->ll_secctx_name) + ll_secctx_name_free(sbi); + + OBD_ALLOC(sbi->ll_secctx_name, rc + 1); + if (!sbi->ll_secctx_name) + return -ENOMEM; + + /* save the xattr name */ + sbi->ll_secctx_name_size = rc; + rc = security_inode_listsecurity(in, sbi->ll_secctx_name, + sbi->ll_secctx_name_size); + if (rc <= 0) + goto err_free; + + if (rc > sbi->ll_secctx_name_size) { + rc = -ERANGE; + goto err_free; + } + + /* sanity check */ + sbi->ll_secctx_name[rc] = '\0'; + if (rc < sizeof(XATTR_SECURITY_PREFIX)) { + rc = -EINVAL; + goto err_free; + } + if (strncmp(sbi->ll_secctx_name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1) != 0) { + rc = -EOPNOTSUPP; + goto err_free; + } + + return rc; + +err_free: + ll_secctx_name_free(sbi); + return rc; +} + +/** + * Retrieved file security context xattr name stored. + * + * \retval security context xattr name size stored. + * \retval 0 no xattr name stored. + */ +__u32 ll_secctx_name_get(struct ll_sb_info *sbi, const char **secctx_name) +{ + if (!sbi->ll_secctx_name || !sbi->ll_secctx_name_size) + return 0; + + *secctx_name = sbi->ll_secctx_name; + + return sbi->ll_secctx_name_size; +} + +/** + * Filter out xattr file security context if not managed by LSM + * + * This is done to improve performance for application that blindly try to get + * file context (like "ls -l" for security.linux). + * See LU-549 for more information. + * + * \retval 0 xattr not filtered + * \retval -EOPNOTSUPP no enabled LSM security module supports the xattr + */ +int ll_security_secctx_name_filter(struct ll_sb_info *sbi, int xattr_type, + const char *suffix) +{ + const char *cached_suffix = NULL; + + if (xattr_type != XATTR_SECURITY_T || + !ll_xattr_suffix_is_seclabel(suffix)) + return 0; + + /* is the xattr label used by lsm ? */ + if (!ll_secctx_name_get(sbi, &cached_suffix)) + return -EOPNOTSUPP; + + cached_suffix += sizeof(XATTR_SECURITY_PREFIX) - 1; + if (strcmp(suffix, cached_suffix) != 0) + return -EOPNOTSUPP; + + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/lmv/Makefile b/drivers/staging/lustrefsx/lustre/lmv/Makefile new file mode 100644 index 0000000000000..40626f49283fb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_FS) += lmv.o + +lmv-y := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c new file mode 100644 index 0000000000000..0b76f7b028835 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds) +{ + struct obd_device *obd = lmv2obd_dev(lmv); + int rc; + + ENTRY; + + /* + * FIXME: Currently ZFS still use local seq for ROOT unfortunately, and + * this fid_is_local check should be removed once LU-2240 is fixed + */ + if (!fid_is_sane(fid) || !(fid_seq_in_fldb(fid_seq(fid)) || + fid_seq_is_local_file(fid_seq(fid)))) { + rc = -EINVAL; + CERROR("%s: invalid FID "DFID": rc = %d\n", obd->obd_name, + PFID(fid), rc); + RETURN(rc); + } + + rc = fld_client_lookup(&lmv->lmv_fld, fid_seq(fid), mds, + LU_SEQ_RANGE_MDT, NULL); + if (rc) { + CERROR("%s: Error while looking for mds number. Seq %#llx: rc = %d\n", + obd->obd_name, fid_seq(fid), rc); + RETURN(rc); + } + + CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", + *mds, PFID(fid)); + + if (*mds >= lmv->lmv_mdt_descs.ltd_tgts_size) { + rc = -EINVAL; + CERROR("%s: FLD lookup got invalid mds #%x (max: %x) for fid="DFID": rc = %d\n", + obd->obd_name, *mds, lmv->lmv_mdt_descs.ltd_tgts_size, + PFID(fid), rc); + } + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c new file mode 100644 index 0000000000000..97f1d9f592de0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c @@ -0,0 +1,595 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, + const struct lu_fid *parent_fid, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags, + const char *secctx_name, __u32 secctx_name_size) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct ptlrpc_request *req = NULL; + struct lustre_handle plock; + struct md_op_data *op_data; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int pmode; + int rc = 0; + ENTRY; + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + LASSERT((body->mbo_valid & OBD_MD_MDS)); + + /* + * We got LOOKUP lock, but we really need attrs. + */ + pmode = it->it_lock_mode; + if (pmode) { + plock.cookie = it->it_lock_handle; + it->it_lock_mode = 0; + it->it_request = NULL; + } + + LASSERT(fid_is_sane(&body->mbo_fid1)); + + tgt = lmv_fid2tgt(lmv, &body->mbo_fid1); + if (IS_ERR(tgt)) + GOTO(out, rc = PTR_ERR(tgt)); + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out, rc = -ENOMEM); + + op_data->op_fid1 = body->mbo_fid1; + /* Sent the parent FID to the remote MDT */ + if (parent_fid != NULL) { + /* The parent fid is only for remote open to + * check whether the open is from OBF, + * see mdt_cross_open */ + LASSERT(it->it_op & IT_OPEN); + op_data->op_fid2 = *parent_fid; + } + + op_data->op_bias = MDS_CROSS_REF; + op_data->op_cli_flags = CLI_NO_SLOT; + CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%u\n", + PFID(&body->mbo_fid1), tgt->ltd_index); + + /* ask for security context upon intent */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN) && + secctx_name_size != 0 && secctx_name != NULL) { + op_data->op_file_secctx_name = secctx_name; + op_data->op_file_secctx_name_size = secctx_name_size; + CDEBUG(D_SEC, "'%.*s' is security xattr to fetch for " + DFID"\n", + secctx_name_size, secctx_name, PFID(&body->mbo_fid1)); + } + + rc = md_intent_lock(tgt->ltd_exp, op_data, it, &req, cb_blocking, + extra_lock_flags); + if (rc) + GOTO(out_free_op_data, rc); + + /* + * LLite needs LOOKUP lock to track dentry revocation in order to + * maintain dcache consistency. Thus drop UPDATE|PERM lock here + * and put LOOKUP in request. 
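+	 *
+	 * After the second intent the two lock handles end up as follows
+	 * (simplified; see the assignments below):
+	 *
+	 *     it->it_remote_lock_handle/mode = lock granted by the remote MDT
+	 *     it->it_lock_handle = plock.cookie   saved LOOKUP lock from the
+	 *     it->it_lock_mode   = pmode          original (parent) MDT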
+ */ + if (it->it_lock_mode != 0) { + it->it_remote_lock_handle = + it->it_lock_handle; + it->it_remote_lock_mode = it->it_lock_mode; + } + + if (pmode) { + it->it_lock_handle = plock.cookie; + it->it_lock_mode = pmode; + } + + EXIT; +out_free_op_data: + OBD_FREE_PTR(op_data); +out: + if (rc && pmode) + ldlm_lock_decref(&plock, pmode); + + ptlrpc_req_finished(*reqp); + *reqp = req; + return rc; +} + +int lmv_revalidate_slaves(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + ldlm_blocking_callback cb_blocking, + int extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct ptlrpc_request *req = NULL; + struct mdt_body *body; + struct md_op_data *op_data; + int i; + int valid_stripe_count = 0; + int rc = 0; + + ENTRY; + + /** + * revalidate slaves has some problems, temporarily return, + * we may not need that + */ + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + RETURN(-ENOMEM); + + /** + * Loop over the stripe information, check validity and update them + * from MDS if needed. + */ + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + struct lu_fid fid; + struct lookup_intent it = { .it_op = IT_GETATTR }; + struct lustre_handle *lockh = NULL; + struct lmv_tgt_desc *tgt = NULL; + struct inode *inode; + + fid = lsm->lsm_md_oinfo[i].lmo_fid; + inode = lsm->lsm_md_oinfo[i].lmo_root; + + if (!inode) + continue; + + /* + * Prepare op_data for revalidating. Note that @fid2 shluld be + * defined otherwise it will go to server and take new lock + * which is not needed here. + */ + memset(op_data, 0, sizeof(*op_data)); + op_data->op_fid1 = fid; + op_data->op_fid2 = fid; + /* shard revalidate only needs to fetch attributes and UPDATE + * lock, which is similar to the bottom half of remote object + * getattr, set this flag so that MDT skips checking whether + * it's remote object. 
+ */ + op_data->op_bias = MDS_CROSS_REF; + op_data->op_cli_flags = CLI_NO_SLOT; + + tgt = lmv_tgt(lmv, lsm->lsm_md_oinfo[i].lmo_mds); + if (!tgt) + GOTO(cleanup, rc = -ENODEV); + + CDEBUG(D_INODE, "Revalidate slave "DFID" -> mds #%u\n", + PFID(&fid), tgt->ltd_index); + + if (req != NULL) { + ptlrpc_req_finished(req); + req = NULL; + } + + rc = md_intent_lock(tgt->ltd_exp, op_data, &it, &req, + cb_blocking, extra_lock_flags); + if (rc == -ENOENT || rc == -ESHUTDOWN) { + /* skip stripe that doesn't exist or is inaccessible */ + rc = 0; + continue; + } + + if (rc < 0) + GOTO(cleanup, rc); + + lockh = (struct lustre_handle *)&it.it_lock_handle; + if (rc > 0 && req == NULL) { + /* slave inode is still valid */ + CDEBUG(D_INODE, "slave "DFID" is still valid.\n", + PFID(&fid)); + rc = 0; + } else { + /* refresh slave from server */ + body = req_capsule_server_get(&req->rq_pill, + &RMF_MDT_BODY); + if (body == NULL) { + if (it.it_lock_mode && lockh) { + ldlm_lock_decref(lockh, + it.it_lock_mode); + it.it_lock_mode = 0; + } + GOTO(cleanup, rc = -ENOENT); + } + + i_size_write(inode, body->mbo_size); + inode->i_blocks = body->mbo_blocks; + spin_lock(&inode->i_lock); + set_nlink(inode, body->mbo_nlink); + spin_unlock(&inode->i_lock); + inode->i_atime.tv_sec = body->mbo_atime; + inode->i_ctime.tv_sec = body->mbo_ctime; + inode->i_mtime.tv_sec = body->mbo_mtime; + } + + md_set_lock_data(tgt->ltd_exp, lockh, inode, NULL); + if (it.it_lock_mode != 0 && lockh != NULL) { + ldlm_lock_decref(lockh, it.it_lock_mode); + it.it_lock_mode = 0; + } + + valid_stripe_count++; + } + +cleanup: + if (req != NULL) + ptlrpc_req_finished(req); + + /* if all stripes are invalid, return -ENOENT to notify user */ + if (!rc && !valid_stripe_count) + rc = -ENOENT; + + OBD_FREE_PTR(op_data); + RETURN(rc); +} + +/* + * IT_OPEN is intended to open (and create, possible) an object. Parent (pid) + * may be split dir. + */ +static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + __u64 flags = it->it_flags; + int rc; + + ENTRY; + + /* do not allow file creation in foreign dir */ + if ((it->it_op & IT_CREAT) && lmv_dir_foreign(op_data->op_mea1)) + RETURN(-ENODATA); + + if ((it->it_op & IT_CREAT) && !(flags & MDS_OPEN_BY_FID)) { + /* don't allow create under dir with bad hash */ + if (lmv_dir_bad_hash(op_data->op_mea1)) + RETURN(-EBADF); + + if (lmv_dir_layout_changing(op_data->op_mea1)) { + if (flags & O_EXCL) { + /* + * open(O_CREAT | O_EXCL) needs to check + * existing name, which should be done on both + * old and new layout, check old layout on + * client side. + */ + rc = lmv_old_layout_lookup(lmv, op_data); + if (rc != -ENOENT) + RETURN(rc); + + op_data->op_new_layout = true; + } else { + /* + * open(O_CREAT) will be sent to MDT in old + * layout first, to avoid creating new file + * under old layout, clear O_CREAT. + */ + it->it_flags &= ~O_CREAT; + } + } + } + +retry: + if (it->it_flags & MDS_OPEN_BY_FID) { + LASSERT(fid_is_sane(&op_data->op_fid2)); + + /* for striped directory, we can't know parent stripe fid + * without name, but we can set it to child fid, and MDT + * will obtain it from linkea in open in such case. 
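+	 *
+	 * Target selection therefore differs between the two open modes
+	 * (simplified sketch of the code below):
+	 *
+	 *     if (it->it_flags & MDS_OPEN_BY_FID)
+	 *             tgt = lmv_fid2tgt(lmv, &op_data->op_fid2);  (FLD lookup)
+	 *     else
+	 *             tgt = lmv_locate_tgt(lmv, op_data);  (by name; a striped
+	 *                                                   parent picks the
+	 *                                                   stripe by name hash)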
*/ + if (lmv_dir_striped(op_data->op_mea1)) + op_data->op_fid1 = op_data->op_fid2; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_mds = tgt->ltd_index; + } else { + LASSERT(fid_is_sane(&op_data->op_fid1)); + LASSERT(fid_is_zero(&op_data->op_fid2)); + LASSERT(op_data->op_name != NULL); + + tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } + + /* If it is ready to open the file by FID, do not need + * allocate FID at all, otherwise it will confuse MDT */ + if ((it->it_op & IT_CREAT) && !(it->it_flags & MDS_OPEN_BY_FID)) { + /* + * For lookup(IT_CREATE) cases allocate new fid and setup FLD + * for it. + */ + rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc != 0) + RETURN(rc); + } + + CDEBUG(D_INODE, "OPEN_INTENT with fid1="DFID", fid2="DFID"," + " name='%s' -> mds #%u\n", PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_index); + + rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + if (rc != 0) + RETURN(rc); + /* + * Nothing is found, do not access body->fid1 as it is zero and thus + * pointless. + */ + if ((it->it_disposition & DISP_LOOKUP_NEG) && + !(it->it_disposition & DISP_OPEN_CREATE) && + !(it->it_disposition & DISP_OPEN_OPEN)) { + if (!(it->it_flags & MDS_OPEN_BY_FID) && + lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*reqp); + it->it_request = NULL; + it->it_disposition = 0; + *reqp = NULL; + + it->it_flags = flags; + fid_zero(&op_data->op_fid2); + goto retry; + } + + RETURN(rc); + } + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. */ + if (unlikely((body->mbo_valid & OBD_MD_MDS))) { + rc = lmv_intent_remote(exp, it, &op_data->op_fid1, reqp, + cb_blocking, extra_lock_flags, + op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + if (rc != 0) + RETURN(rc); + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + } + + RETURN(rc); +} + +/* + * Handler for: getattr, lookup and revalidate cases. + */ +static int +lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct mdt_body *body; + int rc; + ENTRY; + + /* foreign dir is not striped */ + if (lmv_dir_foreign(op_data->op_mea1)) { + /* only allow getattr/lookup for itself */ + if (op_data->op_name != NULL) + RETURN(-ENODATA); + RETURN(0); + } + +retry: + if (op_data->op_flags & MF_GETATTR_BY_FID) { + /* getattr by FID, replace fid1 with stripe FID, + * NB, don't replace if name is "/", because it may be a subtree + * mount, and if it's a striped directory, fid1 will be replaced + * to stripe FID by hash, while fid2 is master object FID, which + * will be treated as a remote object if the two FIDs are + * located on different MDTs, and LOOKUP lock can't be fetched. + */ + LASSERT(op_data->op_name); + if (op_data->op_namelen != 1 || + strncmp(op_data->op_name, "/", 1) != 0) { + tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } + + /* name is used to locate stripe target, clear it here + * to avoid packing name in request, so that MDS knows + * it's getattr by FID. 
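+	 *
+	 * So for MF_GETATTR_BY_FID the name is only used to locate the
+	 * parent stripe (so the LOOKUP lock can be granted) and is then
+	 * dropped, roughly:
+	 *
+	 *     tgt = lmv_locate_tgt(lmv, op_data);         (by name, for the lock)
+	 *     op_data->op_name = NULL;                    (getattr itself is by FID)
+	 *     tgt = lmv_fid2tgt(lmv, &op_data->op_fid2);  (MDT owning fid2)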
+ */ + op_data->op_name = NULL; + op_data->op_namelen = 0; + + /* getattr request is sent to MDT where fid2 inode is */ + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + } else if (op_data->op_name) { + /* getattr by name */ + tgt = lmv_locate_tgt(lmv, op_data); + if (!fid_is_sane(&op_data->op_fid2)) + fid_zero(&op_data->op_fid2); + } else { + /* old way to getattr by FID, parent FID not packed */ + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + } + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID + ", name='%s' -> mds #%u\n", + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), + op_data->op_name ? op_data->op_name : "", + tgt->ltd_index); + + op_data->op_bias &= ~MDS_CROSS_REF; + + rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + if (rc < 0) + RETURN(rc); + + if (*reqp == NULL) { + /* If RPC happens, lsm information will be revalidated + * during update_inode process (see ll_update_lsm_md) */ + if (lmv_dir_striped(op_data->op_mea2)) { + rc = lmv_revalidate_slaves(exp, op_data->op_mea2, + cb_blocking, + extra_lock_flags); + if (rc != 0) + RETURN(rc); + } + RETURN(rc); + } else if (it_disposition(it, DISP_LOOKUP_NEG) && + lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*reqp); + it->it_request = NULL; + it->it_disposition = 0; + *reqp = NULL; + + goto retry; + } + + if (!it_has_reply_body(it)) + RETURN(0); + + /* + * MDS has returned success. Probably name has been resolved in + * remote inode. Let's check this. + */ + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. */ + if (unlikely((body->mbo_valid & OBD_MD_MDS))) { + rc = lmv_intent_remote(exp, it, NULL, reqp, cb_blocking, + extra_lock_flags, + op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + if (rc != 0) + RETURN(rc); + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + } + + RETURN(rc); +} + +int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + int rc; + ENTRY; + + LASSERT(it != NULL); + LASSERT(fid_is_sane(&op_data->op_fid1)); + + CDEBUG(D_INODE, "INTENT LOCK '%s' for "DFID" '%.*s' on "DFID"\n", + LL_IT2STR(it), PFID(&op_data->op_fid2), + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1)); + + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT | IT_GETXATTR)) + rc = lmv_intent_lookup(exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + else if (it->it_op & IT_OPEN) + rc = lmv_intent_open(exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + else + LBUG(); + + if (rc < 0) { + struct lustre_handle lock_handle; + + if (it->it_lock_mode != 0) { + lock_handle.cookie = it->it_lock_handle; + ldlm_lock_decref_and_cancel(&lock_handle, + it->it_lock_mode); + } + + it->it_lock_handle = 0; + it->it_lock_mode = 0; + + if (it->it_remote_lock_mode != 0) { + lock_handle.cookie = it->it_remote_lock_handle; + ldlm_lock_decref_and_cancel(&lock_handle, + it->it_remote_lock_mode); + } + + it->it_remote_lock_handle = 0; + it->it_remote_lock_mode = 0; + } + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h new file mode 100644 index 0000000000000..a1d4436b6af80 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h @@ -0,0 +1,202 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _LMV_INTERNAL_H_ +#define _LMV_INTERNAL_H_ + +#include +#include + +#define LMV_MAX_TGT_COUNT 128 + +#define LL_IT2STR(it) \ + ((it) ? ldlm_it2str((it)->it_op) : "0") + +int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *, int); +int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds); +int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data); + +int lmv_revalidate_slaves(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + ldlm_blocking_callback cb_blocking, + int extra_lock_flags); + +int lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **preq); +void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt, + int activate); + +int lmv_statfs_check_update(struct obd_device *obd, struct lmv_tgt_desc *tgt); + +static inline struct obd_device *lmv2obd_dev(struct lmv_obd *lmv) +{ + return container_of_safe(lmv, struct obd_device, u.lmv); +} + +static inline struct lu_tgt_desc * +lmv_tgt(struct lmv_obd *lmv, __u32 index) +{ + return index < lmv->lmv_mdt_descs.ltd_tgts_size ? 
+ LTD_TGT(&lmv->lmv_mdt_descs, index) : NULL; +} + +static inline bool +lmv_mdt0_inited(struct lmv_obd *lmv) +{ + return lmv->lmv_mdt_descs.ltd_tgts_size > 0 && + test_bit(0, lmv->lmv_mdt_descs.ltd_tgt_bitmap); +} + +#define lmv_foreach_tgt(lmv, tgt) ltd_foreach_tgt(&(lmv)->lmv_mdt_descs, tgt) + +#define lmv_foreach_tgt_safe(lmv, tgt, tmp) \ + ltd_foreach_tgt_safe(&(lmv)->lmv_mdt_descs, tgt, tmp) + +static inline +struct lu_tgt_desc *lmv_first_connected_tgt(struct lmv_obd *lmv) +{ + struct lu_tgt_desc *tgt; + + tgt = ltd_first_tgt(&lmv->lmv_mdt_descs); + while (tgt && !tgt->ltd_exp) + tgt = ltd_next_tgt(&lmv->lmv_mdt_descs, tgt); + + return tgt; +} + +static inline +struct lu_tgt_desc *lmv_next_connected_tgt(struct lmv_obd *lmv, + struct lu_tgt_desc *tgt) +{ + do { + tgt = ltd_next_tgt(&lmv->lmv_mdt_descs, tgt); + } while (tgt && !tgt->ltd_exp); + + return tgt; +} + +#define lmv_foreach_connected_tgt(lmv, tgt) \ + for (tgt = lmv_first_connected_tgt(lmv); tgt; \ + tgt = lmv_next_connected_tgt(lmv, tgt)) + +static inline int +lmv_fid2tgt_index(struct lmv_obd *lmv, const struct lu_fid *fid) +{ + u32 mdt_idx; + int rc; + + if (lmv->lmv_mdt_count < 2) + return 0; + + rc = lmv_fld_lookup(lmv, fid, &mdt_idx); + if (rc < 0) + return rc; + + return mdt_idx; +} + +static inline struct lmv_tgt_desc * +lmv_fid2tgt(struct lmv_obd *lmv, const struct lu_fid *fid) +{ + struct lu_tgt_desc *tgt; + int index; + + index = lmv_fid2tgt_index(lmv, fid); + if (index < 0) + return ERR_PTR(index); + + tgt = lmv_tgt(lmv, index); + + return tgt ? tgt : ERR_PTR(-ENODEV); +} + +static inline int lmv_stripe_md_size(int stripe_count) +{ + struct lmv_stripe_md *lsm; + + return sizeof(*lsm) + stripe_count * sizeof(lsm->lsm_md_oinfo[0]); +} + +/* for file under migrating directory, return the target stripe info */ +static inline const struct lmv_oinfo * +lsm_name_to_stripe_info(const struct lmv_stripe_md *lsm, const char *name, + int namelen, bool new_layout) +{ + int stripe_index; + + LASSERT(lmv_dir_striped(lsm)); + + stripe_index = __lmv_name_to_stripe_index(lsm->lsm_md_hash_type, + lsm->lsm_md_stripe_count, + lsm->lsm_md_migrate_hash, + lsm->lsm_md_migrate_offset, + name, namelen, new_layout); + if (stripe_index < 0) + return ERR_PTR(stripe_index); + + return &lsm->lsm_md_oinfo[stripe_index]; +} + +static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data) +{ + const struct lmv_stripe_md *lsm = op_data->op_mea1; + + if (!lsm) + return false; + + if (lmv_dir_layout_changing(lsm) && !op_data->op_new_layout) { + op_data->op_new_layout = true; + return true; + } + + if (lmv_dir_bad_hash(lsm) && + op_data->op_stripe_index < lsm->lsm_md_stripe_count - 1) { + op_data->op_stripe_index++; + return true; + } + + return false; +} + +struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv, + struct md_op_data *op_data); +int lmv_old_layout_lookup(struct lmv_obd *lmv, struct md_op_data *op_data); + +/* lproc_lmv.c */ +int lmv_tunables_init(struct obd_device *obd); +#endif diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c new file mode 100644 index 0000000000000..88ed384beb47d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c @@ -0,0 +1,3915 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LMV + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +static int lmv_check_connect(struct obd_device *obd); +static inline bool lmv_op_default_rr_mkdir(const struct md_op_data *op_data); + +void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt, + int activate) +{ + if (tgt->ltd_active == activate) + return; + + tgt->ltd_active = activate; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count += + (activate ? 1 : -1); + + tgt->ltd_exp->exp_obd->obd_inactive = !activate; +} + +/** + * Error codes: + * + * -EINVAL : UUID can't be found in the LMV's target list + * -ENOTCONN: The UUID is found, but the target connection is bad (!) + * -EBADF : The UUID is found, but the OBD of the wrong type (!) + */ +static int lmv_set_mdc_active(struct lmv_obd *lmv, + const struct obd_uuid *uuid, + int activate) +{ + struct lu_tgt_desc *tgt = NULL; + struct obd_device *obd; + int rc = 0; + + ENTRY; + + CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n", + lmv, uuid->uuid, activate); + + spin_lock(&lmv->lmv_lock); + lmv_foreach_connected_tgt(lmv, tgt) { + CDEBUG(D_INFO, "Target idx %d is %s conn %#llx\n", + tgt->ltd_index, tgt->ltd_uuid.uuid, + tgt->ltd_exp->exp_handle.h_cookie); + + if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; + } + + if (!tgt) + GOTO(out_lmv_lock, rc = -EINVAL); + + obd = class_exp2obd(tgt->ltd_exp); + if (obd == NULL) + GOTO(out_lmv_lock, rc = -ENOTCONN); + + CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n", + obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd, + obd->obd_type->typ_name, tgt->ltd_index); + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0); + + if (tgt->ltd_active == activate) { + CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd, + activate ? "" : "in"); + GOTO(out_lmv_lock, rc); + } + + CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, + activate ? "" : "in"); + lmv_activate_target(lmv, tgt, activate); + EXIT; + + out_lmv_lock: + spin_unlock(&lmv->lmv_lock); + return rc; +} + +static struct obd_uuid *lmv_get_uuid(struct obd_export *exp) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + + return tgt ? 
obd_get_uuid(tgt->ltd_exp) : NULL; +} + +static int lmv_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev) +{ + struct obd_connect_data *conn_data; + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_uuid *uuid; + int rc = 0; + ENTRY; + + if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) { + CERROR("unexpected notification of %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + RETURN(-EINVAL); + } + + uuid = &watched->u.cli.cl_target_uuid; + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) { + /* + * Set MDC as active before notifying the observer, so the + * observer can use the MDC normally. + */ + rc = lmv_set_mdc_active(lmv, uuid, + ev == OBD_NOTIFY_ACTIVE); + if (rc) { + CERROR("%sactivation of %s failed: %d\n", + ev == OBD_NOTIFY_ACTIVE ? "" : "de", + uuid->uuid, rc); + RETURN(rc); + } + } else if (ev == OBD_NOTIFY_OCD) { + conn_data = &watched->u.cli.cl_import->imp_connect_data; + /* + * XXX: Make sure that ocd_connect_flags from all targets are + * the same. Otherwise one of MDTs runs wrong version or + * something like this. --umka + */ + obd->obd_self_export->exp_connect_data = *conn_data; + } + + /* + * Pass the notification up the chain. + */ + if (obd->obd_observer) + rc = obd_notify(obd->obd_observer, watched, ev); + + RETURN(rc); +} + +static int lmv_connect(const struct lu_env *env, + struct obd_export **pexp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lustre_handle conn = { 0 }; + struct obd_export *exp; + int rc; + ENTRY; + + rc = class_connect(&conn, obd, cluuid); + if (rc) { + CERROR("class_connection() returned %d\n", rc); + RETURN(rc); + } + + exp = class_conn2export(&conn); + + lmv->connected = 0; + lmv->conn_data = *data; + lmv->lmv_cache = localdata; + + lmv->lmv_tgts_kobj = kobject_create_and_add("target_obds", + &obd->obd_kset.kobj); + if (!lmv->lmv_tgts_kobj) { + CERROR("%s: cannot create /sys/fs/lustre/%s/%s/target_obds\n", + obd->obd_name, obd->obd_type->typ_name, obd->obd_name); + } + + rc = lmv_check_connect(obd); + if (rc != 0) + GOTO(out_sysfs, rc); + + *pexp = exp; + + RETURN(rc); + +out_sysfs: + if (lmv->lmv_tgts_kobj) + kobject_put(lmv->lmv_tgts_kobj); + + class_disconnect(exp); + + return rc; +} + +static int lmv_init_ea_size(struct obd_export *exp, __u32 easize, + __u32 def_easize) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int change = 0; + int rc = 0; + + ENTRY; + + if (lmv->max_easize < easize) { + lmv->max_easize = easize; + change = 1; + } + if (lmv->max_def_easize < def_easize) { + lmv->max_def_easize = def_easize; + change = 1; + } + + if (change == 0) + RETURN(0); + + if (lmv->connected == 0) + RETURN(0); + + lmv_foreach_connected_tgt(lmv, tgt) { + if (!tgt->ltd_active) + continue; + + rc = md_init_ea_size(tgt->ltd_exp, easize, def_easize); + if (rc) { + CERROR("%s: obd_init_ea_size() failed on MDT target %d:" + " rc = %d\n", obd->obd_name, tgt->ltd_index, rc); + break; + } + } + RETURN(rc); +} + +#define MAX_STRING_SIZE 128 + +static int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_device *mdc_obd; + struct obd_export *mdc_exp; + struct lu_fld_target target; + int rc; + ENTRY; + + mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + CERROR("target %s not attached\n", 
tgt->ltd_uuid.uuid); + RETURN(-EINVAL); + } + + CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s\n", + mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, + tgt->ltd_uuid.uuid, obd->obd_uuid.uuid); + + if (!mdc_obd->obd_set_up) { + CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid); + RETURN(-EINVAL); + } + + rc = obd_connect(NULL, &mdc_exp, mdc_obd, &obd->obd_uuid, + &lmv->conn_data, lmv->lmv_cache); + if (rc) { + CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc); + RETURN(rc); + } + + /* + * Init fid sequence client for this mdc and add new fld target. + */ + rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA); + if (rc) + RETURN(rc); + + target.ft_srv = NULL; + target.ft_exp = mdc_exp; + target.ft_idx = tgt->ltd_index; + + fld_client_add_target(&lmv->lmv_fld, &target); + + rc = obd_register_observer(mdc_obd, obd); + if (rc) { + obd_disconnect(mdc_exp); + CERROR("target %s register_observer error %d\n", + tgt->ltd_uuid.uuid, rc); + RETURN(rc); + } + + if (obd->obd_observer) { + /* + * Tell the observer about the new target. + */ + rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd, + OBD_NOTIFY_ACTIVE); + if (rc) { + obd_disconnect(mdc_exp); + RETURN(rc); + } + } + + tgt->ltd_active = 1; + tgt->ltd_exp = mdc_exp; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count++; + + md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize); + + rc = lu_qos_add_tgt(&lmv->lmv_qos, tgt); + if (rc) { + obd_disconnect(mdc_exp); + RETURN(rc); + } + + CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n", + mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + + lmv_statfs_check_update(obd, tgt); + + if (lmv->lmv_tgts_kobj) + /* Even if we failed to create the link, that's fine */ + rc = sysfs_create_link(lmv->lmv_tgts_kobj, + &mdc_obd->obd_kset.kobj, + mdc_obd->obd_name); + RETURN(0); +} + +static void lmv_del_target(struct lmv_obd *lmv, struct lu_tgt_desc *tgt) +{ + LASSERT(tgt); + ltd_del_tgt(&lmv->lmv_mdt_descs, tgt); + OBD_FREE_PTR(tgt); +} + +static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + __u32 index, int gen) +{ + struct obd_device *mdc_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct lu_tgt_descs *ltd = &lmv->lmv_mdt_descs; + int rc = 0; + + ENTRY; + + CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index); + mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + CERROR("%s: Target %s not attached: rc = %d\n", + obd->obd_name, uuidp->uuid, -EINVAL); + RETURN(-EINVAL); + } + + OBD_ALLOC_PTR(tgt); + if (!tgt) + RETURN(-ENOMEM); + + mutex_init(&tgt->ltd_fid_mutex); + tgt->ltd_index = index; + tgt->ltd_uuid = *uuidp; + tgt->ltd_active = 0; + + mutex_lock(&ltd->ltd_mutex); + rc = ltd_add_tgt(ltd, tgt); + mutex_unlock(&ltd->ltd_mutex); + + if (rc) + GOTO(out_tgt, rc); + + if (!lmv->connected) + /* lmv_check_connect() will connect this target. 
*/ + RETURN(0); + + rc = lmv_connect_mdc(obd, tgt); + if (!rc) { + int easize = sizeof(struct lmv_stripe_md) + + lmv->lmv_mdt_count * sizeof(struct lu_fid); + + lmv_init_ea_size(obd->obd_self_export, easize, 0); + } + + RETURN(rc); + +out_tgt: + OBD_FREE_PTR(tgt); + return rc; +} + +static int lmv_check_connect(struct obd_device *obd) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int easize; + int rc; + + ENTRY; + + if (lmv->connected) + RETURN(0); + + mutex_lock(&lmv->lmv_mdt_descs.ltd_mutex); + if (lmv->connected) + GOTO(unlock, rc = 0); + + if (!lmv->lmv_mdt_count) { + CERROR("%s: no targets configured: rc = -EINVAL\n", + obd->obd_name); + GOTO(unlock, rc = -EINVAL); + } + + if (!lmv_mdt0_inited(lmv)) { + CERROR("%s: no target configured for index 0: rc = -EINVAL.\n", + obd->obd_name); + GOTO(unlock, rc = -EINVAL); + } + + CDEBUG(D_CONFIG, "Time to connect %s to %s\n", + obd->obd_uuid.uuid, obd->obd_name); + + lmv_foreach_tgt(lmv, tgt) { + rc = lmv_connect_mdc(obd, tgt); + if (rc) + GOTO(out_disc, rc); + } + + lmv->connected = 1; + easize = lmv_mds_md_size(lmv->lmv_mdt_count, LMV_MAGIC); + lmv_init_ea_size(obd->obd_self_export, easize, 0); + EXIT; +unlock: + mutex_unlock(&lmv->lmv_mdt_descs.ltd_mutex); + + return rc; + +out_disc: + lmv_foreach_tgt(lmv, tgt) { + tgt->ltd_active = 0; + if (!tgt->ltd_exp) + continue; + + --lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count; + obd_disconnect(tgt->ltd_exp); + } + + goto unlock; +} + +static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_device *mdc_obd; + int rc; + ENTRY; + + LASSERT(tgt != NULL); + LASSERT(obd != NULL); + + mdc_obd = class_exp2obd(tgt->ltd_exp); + + if (mdc_obd) { + mdc_obd->obd_force = obd->obd_force; + mdc_obd->obd_fail = obd->obd_fail; + mdc_obd->obd_no_recov = obd->obd_no_recov; + + if (lmv->lmv_tgts_kobj) + sysfs_remove_link(lmv->lmv_tgts_kobj, + mdc_obd->obd_name); + } + + rc = obd_fid_fini(tgt->ltd_exp->exp_obd); + if (rc) + CERROR("Can't finalize fids factory\n"); + + CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n", + tgt->ltd_exp->exp_obd->obd_name, + tgt->ltd_exp->exp_obd->obd_uuid.uuid); + + obd_register_observer(tgt->ltd_exp->exp_obd, NULL); + rc = obd_disconnect(tgt->ltd_exp); + if (rc) { + if (tgt->ltd_active) { + CERROR("Target %s disconnect error %d\n", + tgt->ltd_uuid.uuid, rc); + } + } + + lmv_activate_target(lmv, tgt, 0); + tgt->ltd_exp = NULL; + RETURN(0); +} + +static int lmv_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + lmv_foreach_connected_tgt(lmv, tgt) + lmv_disconnect_mdc(obd, tgt); + + if (lmv->lmv_tgts_kobj) + kobject_put(lmv->lmv_tgts_kobj); + + if (!lmv->connected) + class_export_put(exp); + rc = class_disconnect(exp); + lmv->connected = 0; + + RETURN(rc); +} + +static int lmv_fid2path(struct obd_export *exp, int len, void *karg, + void __user *uarg) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct getinfo_fid2path *gf; + struct lmv_tgt_desc *tgt; + struct getinfo_fid2path *remote_gf = NULL; + struct lu_fid root_fid; + int remote_gf_size = 0; + int rc; + + gf = karg; + tgt = lmv_fid2tgt(lmv, &gf->gf_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + root_fid = *gf->gf_u.gf_root_fid; + LASSERT(fid_is_sane(&root_fid)); + +repeat_fid2path: + rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, uarg); + if 
(rc != 0 && rc != -EREMOTE) + GOTO(out_fid2path, rc); + + /* If remote_gf != NULL, it means just building the + * path on the remote MDT, copy this path segement to gf */ + if (remote_gf != NULL) { + struct getinfo_fid2path *ori_gf; + char *ptr; + int len; + + ori_gf = (struct getinfo_fid2path *)karg; + if (strlen(ori_gf->gf_u.gf_path) + 1 + + strlen(gf->gf_u.gf_path) + 1 > ori_gf->gf_pathlen) + GOTO(out_fid2path, rc = -EOVERFLOW); + + ptr = ori_gf->gf_u.gf_path; + + len = strlen(gf->gf_u.gf_path); + /* move the current path to the right to release space + * for closer-to-root part */ + memmove(ptr + len + 1, ptr, strlen(ori_gf->gf_u.gf_path)); + memcpy(ptr, gf->gf_u.gf_path, len); + ptr[len] = '/'; + } + + CDEBUG(D_INFO, "%s: get path %s "DFID" rec: %llu ln: %u\n", + tgt->ltd_exp->exp_obd->obd_name, + gf->gf_u.gf_path, PFID(&gf->gf_fid), gf->gf_recno, + gf->gf_linkno); + + if (rc == 0) + GOTO(out_fid2path, rc); + + /* sigh, has to go to another MDT to do path building further */ + if (remote_gf == NULL) { + remote_gf_size = sizeof(*remote_gf) + PATH_MAX; + OBD_ALLOC(remote_gf, remote_gf_size); + if (remote_gf == NULL) + GOTO(out_fid2path, rc = -ENOMEM); + remote_gf->gf_pathlen = PATH_MAX; + } + + if (!fid_is_sane(&gf->gf_fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt->ltd_exp->exp_obd->obd_name, + PFID(&gf->gf_fid), -EINVAL); + GOTO(out_fid2path, rc = -EINVAL); + } + + tgt = lmv_fid2tgt(lmv, &gf->gf_fid); + if (IS_ERR(tgt)) + GOTO(out_fid2path, rc = -EINVAL); + + remote_gf->gf_fid = gf->gf_fid; + remote_gf->gf_recno = -1; + remote_gf->gf_linkno = -1; + memset(remote_gf->gf_u.gf_path, 0, remote_gf->gf_pathlen); + *remote_gf->gf_u.gf_root_fid = root_fid; + gf = remote_gf; + goto repeat_fid2path; + +out_fid2path: + if (remote_gf != NULL) + OBD_FREE(remote_gf, remote_gf_size); + RETURN(rc); +} + +static int lmv_hsm_req_count(struct lmv_obd *lmv, + const struct hsm_user_request *hur, + const struct lmv_tgt_desc *tgt_mds) +{ + struct lmv_tgt_desc *curr_tgt; + __u32 i; + int nr = 0; + + /* count how many requests must be sent to the given target */ + for (i = 0; i < hur->hur_request.hr_itemcount; i++) { + curr_tgt = lmv_fid2tgt(lmv, &hur->hur_user_item[i].hui_fid); + if (IS_ERR(curr_tgt)) + RETURN(PTR_ERR(curr_tgt)); + if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) + nr++; + } + return nr; +} + +static int lmv_hsm_req_build(struct lmv_obd *lmv, + struct hsm_user_request *hur_in, + const struct lmv_tgt_desc *tgt_mds, + struct hsm_user_request *hur_out) +{ + __u32 i, nr_out; + struct lmv_tgt_desc *curr_tgt; + + /* build the hsm_user_request for the given target */ + hur_out->hur_request = hur_in->hur_request; + nr_out = 0; + for (i = 0; i < hur_in->hur_request.hr_itemcount; i++) { + curr_tgt = lmv_fid2tgt(lmv, &hur_in->hur_user_item[i].hui_fid); + if (IS_ERR(curr_tgt)) + RETURN(PTR_ERR(curr_tgt)); + if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) { + hur_out->hur_user_item[nr_out] = + hur_in->hur_user_item[i]; + nr_out++; + } + } + hur_out->hur_request.hr_itemcount = nr_out; + memcpy(hur_data(hur_out), hur_data(hur_in), + hur_in->hur_request.hr_data_len); + + RETURN(0); +} + +static int lmv_hsm_ct_unregister(struct obd_device *obd, unsigned int cmd, + int len, struct lustre_kernelcomm *lk, + void __user *uarg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt; + int rc; + + ENTRY; + + /* unregister request (call from llapi_hsm_copytool_fini) */ + lmv_foreach_connected_tgt(lmv, tgt) + /* best effort: try to clean as much as possible + * 
(continue on error) */ + obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg); + + /* Whatever the result, remove copytool from kuc groups. + * Unreached coordinators will get EPIPE on next requests + * and will unregister automatically. + */ + rc = libcfs_kkuc_group_rem(&obd->obd_uuid, lk->lk_uid, lk->lk_group); + + RETURN(rc); +} + +static int lmv_hsm_ct_register(struct obd_device *obd, unsigned int cmd, + int len, struct lustre_kernelcomm *lk, + void __user *uarg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct file *filp; + bool any_set = false; + struct kkuc_ct_data *kcd; + size_t kcd_size; + struct lu_tgt_desc *tgt; + __u32 i; + int err; + int rc = 0; + + ENTRY; + + filp = fget(lk->lk_wfd); + if (!filp) + RETURN(-EBADF); + + if (lk->lk_flags & LK_FLG_DATANR) + kcd_size = offsetof(struct kkuc_ct_data, + kcd_archives[lk->lk_data_count]); + else + kcd_size = sizeof(*kcd); + + OBD_ALLOC(kcd, kcd_size); + if (kcd == NULL) + GOTO(err_fput, rc = -ENOMEM); + + kcd->kcd_nr_archives = lk->lk_data_count; + if (lk->lk_flags & LK_FLG_DATANR) { + kcd->kcd_magic = KKUC_CT_DATA_ARRAY_MAGIC; + if (lk->lk_data_count > 0) + memcpy(kcd->kcd_archives, lk->lk_data, + sizeof(*kcd->kcd_archives) * lk->lk_data_count); + } else { + kcd->kcd_magic = KKUC_CT_DATA_BITMAP_MAGIC; + } + + rc = libcfs_kkuc_group_add(filp, &obd->obd_uuid, lk->lk_uid, + lk->lk_group, kcd, kcd_size); + OBD_FREE(kcd, kcd_size); + if (rc) + GOTO(err_fput, rc); + + /* All or nothing: try to register to all MDS. + * In case of failure, unregister from previous MDS, + * except if it because of inactive target. */ + lmv_foreach_connected_tgt(lmv, tgt) { + err = obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg); + if (err) { + if (tgt->ltd_active) { + /* permanent error */ + CERROR("%s: iocontrol MDC %s on MDT" + " idx %d cmd %x: err = %d\n", + lmv2obd_dev(lmv)->obd_name, + tgt->ltd_uuid.uuid, tgt->ltd_index, cmd, + err); + rc = err; + lk->lk_flags |= LK_FLG_STOP; + i = tgt->ltd_index; + /* unregister from previous MDS */ + lmv_foreach_connected_tgt(lmv, tgt) { + if (tgt->ltd_index >= i) + break; + + obd_iocontrol(cmd, tgt->ltd_exp, len, + lk, uarg); + } + GOTO(err_kkuc_rem, rc); + } + /* else: transient error. 
+ * kuc will register to the missing MDT + * when it is back */ + } else { + any_set = true; + } + } + + if (!any_set) + /* no registration done: return error */ + GOTO(err_kkuc_rem, rc = -ENOTCONN); + + RETURN(0); + +err_kkuc_rem: + libcfs_kkuc_group_rem(&obd->obd_uuid, lk->lk_uid, lk->lk_group); + +err_fput: + fput(filp); + return rc; +} + +static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void __user *uarg) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt = NULL; + int set = 0; + __u32 count = lmv->lmv_mdt_count; + int rc = 0; + + ENTRY; + + if (count == 0) + RETURN(-ENOTTY); + + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *mdc_obd; + struct obd_statfs stat_buf = {0}; + __u32 index; + + memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); + + if (index >= lmv->lmv_mdt_descs.ltd_tgts_size) + RETURN(-ENODEV); + + tgt = lmv_tgt(lmv, index); + if (!tgt) + RETURN(-EAGAIN); + + if (!tgt->ltd_active) + RETURN(-ENODATA); + + mdc_obd = class_exp2obd(tgt->ltd_exp); + if (!mdc_obd) + RETURN(-EINVAL); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd), + min((int) data->ioc_plen2, + (int) sizeof(struct obd_uuid)))) + RETURN(-EFAULT); + + rc = obd_statfs(NULL, tgt->ltd_exp, &stat_buf, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + 0); + if (rc) + RETURN(rc); + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + RETURN(-EFAULT); + break; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_MDTIDX) { + tgt = lmv_tgt(lmv, qctl->qc_idx); + } else if (qctl->qc_valid == QC_UUID) { + lmv_foreach_tgt(lmv, tgt) { + if (!obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (!tgt->ltd_exp) + RETURN(-EINVAL); + + break; + } + } else { + RETURN(-EINVAL); + } + + if (!tgt || !tgt->ltd_exp) + RETURN(-EINVAL); + + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + case LL_IOC_GET_CONNECT_FLAGS: { + tgt = lmv_tgt(lmv, 0); + rc = -ENODATA; + if (tgt && tgt->ltd_exp) + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_FID2MDTIDX: { + struct lu_fid *fid = karg; + int mdt_index; + + rc = lmv_fld_lookup(lmv, fid, &mdt_index); + if (rc != 0) + RETURN(rc); + + /* Note: this is from llite(see ll_dir_ioctl()), @uarg does not + * point to user space memory for FID2MDTIDX. 
*/ + *(__u32 *)uarg = mdt_index; + break; + } + case OBD_IOC_FID2PATH: { + rc = lmv_fid2path(exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_STATE_GET: + case LL_IOC_HSM_STATE_SET: + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data = karg; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (tgt->ltd_exp == NULL) + RETURN(-EINVAL); + + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_PROGRESS: { + const struct hsm_progress_kernel *hpk = karg; + + tgt = lmv_fid2tgt(lmv, &hpk->hpk_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_REQUEST: { + struct hsm_user_request *hur = karg; + unsigned int reqcount = hur->hur_request.hr_itemcount; + + if (reqcount == 0) + RETURN(0); + + /* if the request is about a single fid + * or if there is a single MDS, no need to split + * the request. */ + if (reqcount == 1 || count == 1) { + tgt = lmv_fid2tgt(lmv, &hur->hur_user_item[0].hui_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + } else { + /* split fid list to their respective MDS */ + lmv_foreach_connected_tgt(lmv, tgt) { + int nr, rc1; + size_t reqlen; + struct hsm_user_request *req; + + nr = lmv_hsm_req_count(lmv, hur, tgt); + if (nr < 0) + RETURN(nr); + if (nr == 0) /* nothing for this MDS */ + continue; + + /* build a request with fids for this MDS */ + reqlen = offsetof(typeof(*hur), + hur_user_item[nr]) + + hur->hur_request.hr_data_len; + OBD_ALLOC_LARGE(req, reqlen); + if (req == NULL) + RETURN(-ENOMEM); + rc1 = lmv_hsm_req_build(lmv, hur, tgt, req); + if (rc1 < 0) + GOTO(hsm_req_err, rc1); + rc1 = obd_iocontrol(cmd, tgt->ltd_exp, reqlen, + req, uarg); +hsm_req_err: + if (rc1 != 0 && rc == 0) + rc = rc1; + OBD_FREE_LARGE(req, reqlen); + } + } + break; + } + case LL_IOC_LOV_SWAP_LAYOUTS: { + struct md_op_data *op_data = karg; + struct lmv_tgt_desc *tgt1, *tgt2; + + tgt1 = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt1)) + RETURN(PTR_ERR(tgt1)); + + tgt2 = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(tgt2)) + RETURN(PTR_ERR(tgt2)); + + if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL)) + RETURN(-EINVAL); + + /* only files on same MDT can have their layouts swapped */ + if (tgt1->ltd_index != tgt2->ltd_index) + RETURN(-EPERM); + + rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_CT_START: { + struct lustre_kernelcomm *lk = karg; + if (lk->lk_flags & LK_FLG_STOP) + rc = lmv_hsm_ct_unregister(obd, cmd, len, lk, uarg); + else + rc = lmv_hsm_ct_register(obd, cmd, len, lk, uarg); + break; + } + default: + lmv_foreach_connected_tgt(lmv, tgt) { + struct obd_device *mdc_obd; + int err; + + /* ll_umount_begin() sets force flag but for lmv, not + * mdc. 
Let's pass it through */ + mdc_obd = class_exp2obd(tgt->ltd_exp); + mdc_obd->obd_force = obd->obd_force; + err = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + if (err) { + if (tgt->ltd_active) { + CERROR("error: iocontrol MDC %s on MDT" + " idx %d cmd %x: err = %d\n", + tgt->ltd_uuid.uuid, + tgt->ltd_index, cmd, err); + if (!rc) + rc = err; + } + } else + set = 1; + } + if (!set && !rc) + rc = -EIO; + } + RETURN(rc); +} + +int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + LASSERT(op_data); + LASSERT(fid); + + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + RETURN(-ENODEV); + + if (!tgt->ltd_active || !tgt->ltd_exp) + RETURN(-ENODEV); + + /* + * New seq alloc and FLD setup should be atomic. Otherwise we may find + * on server that seq in new allocated fid is not yet known. + */ + mutex_lock(&tgt->ltd_fid_mutex); + rc = obd_fid_alloc(NULL, tgt->ltd_exp, fid, NULL); + mutex_unlock(&tgt->ltd_fid_mutex); + if (rc > 0) { + LASSERT(fid_is_sane(fid)); + rc = 0; + } + + RETURN(rc); +} + +static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_desc *desc; + struct lnet_processid lnet_id; + int i = 0; + int rc; + + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LMV setup requires a descriptor\n"); + RETURN(-EINVAL); + } + + desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1); + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("Lmv descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + RETURN(-EINVAL); + } + + obd_str2uuid(&lmv->lmv_mdt_descs.ltd_lmv_desc.ld_uuid, + desc->ld_uuid.uuid); + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count = 0; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count = 0; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage = + LMV_DESC_QOS_MAXAGE_DEFAULT; + lmv->max_def_easize = 0; + lmv->max_easize = 0; + + spin_lock_init(&lmv->lmv_lock); + + /* + * initialize rr_index to lower 32bit of netid, so that client + * can distribute subdirs evenly from the beginning. 
+ */ + while (LNetGetId(i++, &lnet_id) != -ENOENT) { + if (!nid_is_lo0(&lnet_id.nid)) { + lmv->lmv_qos_rr_index = ntohl(lnet_id.nid.nid_addr[0]); + break; + } + } + + rc = lmv_tunables_init(obd); + if (rc) + CWARN("%s: error adding LMV sysfs/debugfs files: rc = %d\n", + obd->obd_name, rc); + + rc = fld_client_init(&lmv->lmv_fld, obd->obd_name, + LUSTRE_CLI_FLD_HASH_DHT); + if (rc) + CERROR("Can't init FLD, err %d\n", rc); + + rc = lu_tgt_descs_init(&lmv->lmv_mdt_descs, true); + if (rc) + CWARN("%s: error initialize target table: rc = %d\n", + obd->obd_name, rc); + + RETURN(rc); +} + +static int lmv_cleanup(struct obd_device *obd) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt; + struct lu_tgt_desc *tmp; + + ENTRY; + + fld_client_fini(&lmv->lmv_fld); + lmv_foreach_tgt_safe(lmv, tgt, tmp) + lmv_del_target(lmv, tgt); + lu_tgt_descs_fini(&lmv->lmv_mdt_descs); + + RETURN(0); +} + +static int lmv_process_config(struct obd_device *obd, size_t len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct obd_uuid obd_uuid; + int gen; + __u32 index; + int rc; + ENTRY; + + switch (lcfg->lcfg_command) { + case LCFG_ADD_MDC: + /* modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID + * 2:0 3:1 4:lustre-MDT0000-mdc_UUID */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", &index) != 1) + GOTO(out, rc = -EINVAL); + if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1) + GOTO(out, rc = -EINVAL); + rc = lmv_add_target(obd, &obd_uuid, index, gen); + GOTO(out, rc); + default: + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + } +out: + RETURN(rc); +} + +static int lmv_select_statfs_mdt(struct lmv_obd *lmv, __u32 flags) +{ + int i; + + if (flags & OBD_STATFS_FOR_MDT0) + return 0; + + if (lmv->lmv_statfs_start || lmv->lmv_mdt_count == 1) + return lmv->lmv_statfs_start; + + /* choose initial MDT for this client */ + for (i = 0;; i++) { + struct lnet_processid lnet_id; + if (LNetGetId(i, &lnet_id) == -ENOENT) + break; + + if (!nid_is_lo0(&lnet_id.nid)) { + /* We dont need a full 64-bit modulus, just enough + * to distribute the requests across MDTs evenly. + */ + lmv->lmv_statfs_start = nidhash(&lnet_id.nid) % + lmv->lmv_mdt_count; + break; + } + } + + return lmv->lmv_statfs_start; +} + +static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_statfs *temp; + struct lu_tgt_desc *tgt; + __u32 i; + __u32 idx; + int rc = 0; + + ENTRY; + + OBD_ALLOC(temp, sizeof(*temp)); + if (temp == NULL) + RETURN(-ENOMEM); + + /* distribute statfs among MDTs */ + idx = lmv_select_statfs_mdt(lmv, flags); + + for (i = 0; i < lmv->lmv_mdt_descs.ltd_tgts_size; i++, idx++) { + idx = idx % lmv->lmv_mdt_descs.ltd_tgts_size; + tgt = lmv_tgt(lmv, idx); + if (!tgt || !tgt->ltd_exp) + continue; + + rc = obd_statfs(env, tgt->ltd_exp, temp, max_age, + flags | OBD_STATFS_NESTED); + if (rc) { + CERROR("%s: can't stat MDS #%d: rc = %d\n", + tgt->ltd_exp->exp_obd->obd_name, i, rc); + GOTO(out_free_temp, rc); + } + + if (temp->os_state & OS_STATFS_SUM || + flags == OBD_STATFS_FOR_MDT0) { + /* reset to the last aggregated values + * and don't sum with non-aggrated data */ + /* If the statfs is from mount, it needs to retrieve + * necessary information from MDT0. i.e. 
mount does + * not need the merged osfs from all of MDT. Also + * clients can be mounted as long as MDT0 is in + * service */ + *osfs = *temp; + break; + } + + if (i == 0) { + *osfs = *temp; + } else { + osfs->os_bavail += temp->os_bavail; + osfs->os_blocks += temp->os_blocks; + osfs->os_ffree += temp->os_ffree; + osfs->os_files += temp->os_files; + osfs->os_granted += temp->os_granted; + } + } + + EXIT; +out_free_temp: + OBD_FREE(temp, sizeof(*temp)); + return rc; +} + +static int lmv_statfs_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct obd_device *obd = oinfo->oi_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = oinfo->oi_tgt; + struct obd_statfs *osfs = oinfo->oi_osfs; + + /* + * NB: don't deactivate TGT upon error, because we may not trigger async + * statfs any longer, then there is no chance to activate TGT. + */ + if (!rc) { + spin_lock(&lmv->lmv_lock); + tgt->ltd_statfs = *osfs; + tgt->ltd_statfs_age = ktime_get_seconds(); + spin_unlock(&lmv->lmv_lock); + set_bit(LQ_DIRTY, &lmv->lmv_qos.lq_flags); + } + + return rc; +} + +/* update tgt statfs async if it's ld_qos_maxage old */ +int lmv_statfs_check_update(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct obd_info oinfo = { + .oi_obd = obd, + .oi_tgt = tgt, + .oi_cb_up = lmv_statfs_update, + }; + int rc; + + if (ktime_get_seconds() - tgt->ltd_statfs_age < + obd->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage) + return 0; + + rc = obd_statfs_async(tgt->ltd_exp, &oinfo, 0, NULL); + + return rc; +} + +static int lmv_get_root(struct obd_export *exp, const char *fileset, + struct lu_fid *fid) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt = lmv_tgt(lmv, 0); + int rc; + + ENTRY; + + if (!tgt) + RETURN(-ENODEV); + + rc = md_get_root(tgt->ltd_exp, fileset, fid); + RETURN(rc); +} + +static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, size_t buf_size, + struct ptlrpc_request **req) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_getxattr(tgt->ltd_exp, fid, obd_md_valid, name, buf_size, req); + + RETURN(rc); +} + +static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, + const void *value, size_t value_size, + unsigned int xattr_flags, u32 suppgid, + struct ptlrpc_request **req) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_setxattr(tgt->ltd_exp, fid, obd_md_valid, name, + value, value_size, xattr_flags, suppgid, req); + + RETURN(rc); +} + +static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (op_data->op_flags & MF_GET_MDT_IDX) { + op_data->op_mds = tgt->ltd_index; + RETURN(0); + } + + rc = md_getattr(tgt->ltd_exp, op_data, request); + + RETURN(rc); +} + +static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv 
= &obd->u.lmv; + struct lu_tgt_desc *tgt; + + ENTRY; + + CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid)); + + /* + * With DNE every object can have two locks in different namespaces: + * lookup lock in space of MDT storing direntry and update/open lock in + * space of MDT storing inode. + */ + lmv_foreach_connected_tgt(lmv, tgt) + md_null_inode(tgt->ltd_exp, fid); + + RETURN(0); +} + +static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1)); + rc = md_close(tgt->ltd_exp, op_data, mod, request); + RETURN(rc); +} + +static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, + struct md_op_data *op_data) +{ + struct lu_tgt_desc *tgt, *cur = NULL; + __u64 total_avail = 0; + __u64 total_weight = 0; + __u64 cur_weight = 0; + int total_usable = 0; + __u64 rand; + int rc; + + ENTRY; + + if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs)) + RETURN(ERR_PTR(-EAGAIN)); + + down_write(&lmv->lmv_qos.lq_rw_sem); + + if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs)) + GOTO(unlock, tgt = ERR_PTR(-EAGAIN)); + + rc = ltd_qos_penalties_calc(&lmv->lmv_mdt_descs); + if (rc) + GOTO(unlock, tgt = ERR_PTR(rc)); + + lmv_foreach_tgt(lmv, tgt) { + if (!tgt->ltd_exp || !tgt->ltd_active) { + tgt->ltd_qos.ltq_usable = 0; + continue; + } + + tgt->ltd_qos.ltq_usable = 1; + lu_tgt_qos_weight_calc(tgt); + if (tgt->ltd_index == op_data->op_mds) + cur = tgt; + total_avail += tgt->ltd_qos.ltq_avail; + total_weight += tgt->ltd_qos.ltq_weight; + total_usable++; + } + + /* If current MDT has above-average space and dir is not aleady using + * round-robin to spread across more MDTs, stay on the parent MDT + * to avoid creating needless remote MDT directories. Remote dirs + * close to the root balance space more effectively than bottom dirs, + * so prefer to create remote dirs at top level of directory tree. + * "16 / (dir_depth + 10)" is the factor to make it less likely + * for top-level directories to stay local unless they have more than + * average free space, while deep dirs prefer local until more full. 
+ * depth=0 -> 160%, depth=3 -> 123%, depth=6 -> 100%, + * depth=9 -> 84%, depth=12 -> 73%, depth=15 -> 64% + */ + if (!lmv_op_default_rr_mkdir(op_data)) { + rand = total_avail * 16 / + (total_usable * (op_data->op_dir_depth + 10)); + if (cur && cur->ltd_qos.ltq_avail >= rand) { + tgt = cur; + GOTO(unlock, tgt); + } + } + + rand = lu_prandom_u64_max(total_weight); + + lmv_foreach_connected_tgt(lmv, tgt) { + if (!tgt->ltd_qos.ltq_usable) + continue; + + cur_weight += tgt->ltd_qos.ltq_weight; + if (cur_weight < rand) + continue; + + ltd_qos_update(&lmv->lmv_mdt_descs, tgt, &total_weight); + GOTO(unlock, tgt); + } + + /* no proper target found */ + GOTO(unlock, tgt = ERR_PTR(-EAGAIN)); +unlock: + up_write(&lmv->lmv_qos.lq_rw_sem); + + return tgt; +} + +static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv) +{ + struct lu_tgt_desc *tgt; + int i; + int index; + + ENTRY; + + spin_lock(&lmv->lmv_lock); + for (i = 0; i < lmv->lmv_mdt_descs.ltd_tgts_size; i++) { + index = (i + lmv->lmv_qos_rr_index) % + lmv->lmv_mdt_descs.ltd_tgts_size; + tgt = lmv_tgt(lmv, index); + if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) + continue; + + lmv->lmv_qos_rr_index = (tgt->ltd_index + 1) % + lmv->lmv_mdt_descs.ltd_tgts_size; + spin_unlock(&lmv->lmv_lock); + + RETURN(tgt); + } + spin_unlock(&lmv->lmv_lock); + + RETURN(ERR_PTR(-ENODEV)); +} + +/* locate MDT which is less full (avoid the most full MDT) */ +static struct lu_tgt_desc *lmv_locate_tgt_lf(struct lmv_obd *lmv) +{ + struct lu_tgt_desc *min = NULL; + struct lu_tgt_desc *tgt; + __u64 avail = 0; + __u64 rand; + + ENTRY; + + if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs)) + RETURN(ERR_PTR(-EAGAIN)); + + down_write(&lmv->lmv_qos.lq_rw_sem); + + if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs)) + GOTO(unlock, tgt = ERR_PTR(-EAGAIN)); + + lmv_foreach_tgt(lmv, tgt) { + if (!tgt->ltd_exp || !tgt->ltd_active) { + tgt->ltd_qos.ltq_usable = 0; + continue; + } + + tgt->ltd_qos.ltq_usable = 1; + lu_tgt_qos_weight_calc(tgt); + avail += tgt->ltd_qos.ltq_avail; + if (!min || min->ltd_qos.ltq_avail > tgt->ltd_qos.ltq_avail) + min = tgt; + } + + /* avoid the most full MDT */ + if (min) + avail -= min->ltd_qos.ltq_avail; + + rand = lu_prandom_u64_max(avail); + avail = 0; + lmv_foreach_connected_tgt(lmv, tgt) { + if (!tgt->ltd_qos.ltq_usable) + continue; + + if (tgt == min) + continue; + + avail += tgt->ltd_qos.ltq_avail; + if (avail < rand) + continue; + + GOTO(unlock, tgt); + } + + /* no proper target found */ + GOTO(unlock, tgt = ERR_PTR(-EAGAIN)); +unlock: + up_write(&lmv->lmv_qos.lq_rw_sem); + + RETURN(tgt); +} + +/* locate MDT by file name, for striped directory, the file name hash decides + * which stripe its dirent is stored. 
+ */ +static struct lmv_tgt_desc * +lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, + const char *name, int namelen, struct lu_fid *fid, + __u32 *mds, bool new_layout) +{ + struct lmv_tgt_desc *tgt; + const struct lmv_oinfo *oinfo; + + if (!lmv_dir_striped(lsm) || !namelen) { + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + return tgt; + + *mds = tgt->ltd_index; + return tgt; + } + + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_NAME_HASH)) { + if (cfs_fail_val >= lsm->lsm_md_stripe_count) + return ERR_PTR(-EBADF); + oinfo = &lsm->lsm_md_oinfo[cfs_fail_val]; + } else { + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, new_layout); + if (IS_ERR(oinfo)) + return ERR_CAST(oinfo); + } + + /* check stripe FID is sane */ + if (!fid_is_sane(&oinfo->lmo_fid)) + return ERR_PTR(-ENODEV); + + *fid = oinfo->lmo_fid; + *mds = oinfo->lmo_mds; + tgt = lmv_tgt(lmv, oinfo->lmo_mds); + + CDEBUG(D_INODE, "locate MDT %u parent "DFID"\n", *mds, PFID(fid)); + + return tgt ? tgt : ERR_PTR(-ENODEV); +} + +/** + * Locate MDT of op_data->op_fid1 + * + * For striped directory, it will locate the stripe by name hash, if hash_type + * is unknown, it will return the stripe specified by 'op_data->op_stripe_index' + * which is set outside, and if dir is migrating, 'op_data->op_new_layout' + * indicates whether old or new layout is used to locate. + * + * For plain direcotry, it just locate the MDT of op_data->op_fid1. + * + * \param[in] lmv LMV device + * \param[in/out] op_data client MD stack parameters, name, namelen etc, + * op_mds and op_fid1 will be updated if op_mea1 + * indicates fid1 represents a striped directory. + * + * retval pointer to the lmv_tgt_desc if succeed. + * ERR_PTR(errno) if failed. + */ +struct lmv_tgt_desc * +lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data) +{ + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_oinfo *oinfo; + struct lmv_tgt_desc *tgt; + + if (lmv_dir_foreign(lsm)) + return ERR_PTR(-ENODATA); + + /* During creating VOLATILE file, it should honor the mdt + * index if the file under striped dir is being restored, see + * ct_restore(). 
*/ + if (op_data->op_bias & MDS_CREATE_VOLATILE && + op_data->op_mds != LMV_OFFSET_DEFAULT) { + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + return ERR_PTR(-ENODEV); + + if (lmv_dir_striped(lsm)) { + int i; + + /* refill the right parent fid */ + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + oinfo = &lsm->lsm_md_oinfo[i]; + if (oinfo->lmo_mds == op_data->op_mds) { + op_data->op_fid1 = oinfo->lmo_fid; + break; + } + } + + if (i == lsm->lsm_md_stripe_count) + op_data->op_fid1 = lsm->lsm_md_oinfo[0].lmo_fid; + } + } else if (lmv_dir_bad_hash(lsm)) { + LASSERT(op_data->op_stripe_index < lsm->lsm_md_stripe_count); + oinfo = &lsm->lsm_md_oinfo[op_data->op_stripe_index]; + + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_mds = oinfo->lmo_mds; + tgt = lmv_tgt(lmv, oinfo->lmo_mds); + if (!tgt) + return ERR_PTR(-ENODEV); + } else { + tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea1, + op_data->op_name, op_data->op_namelen, + &op_data->op_fid1, &op_data->op_mds, + op_data->op_new_layout); + } + + return tgt; +} + +/* Locate MDT of op_data->op_fid2 for link/rename */ +static struct lmv_tgt_desc * +lmv_locate_tgt2(struct lmv_obd *lmv, struct md_op_data *op_data) +{ + struct lmv_tgt_desc *tgt; + int rc; + + LASSERT(op_data->op_name); + if (lmv_dir_layout_changing(op_data->op_mea2)) { + struct lu_fid fid1 = op_data->op_fid1; + struct lmv_stripe_md *lsm1 = op_data->op_mea1; + struct ptlrpc_request *request = NULL; + + /* + * avoid creating new file under old layout of migrating + * directory, check it here. + */ + tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea2, + op_data->op_name, op_data->op_namelen, + &op_data->op_fid2, &op_data->op_mds, false); + if (IS_ERR(tgt)) + RETURN(tgt); + + op_data->op_fid1 = op_data->op_fid2; + op_data->op_mea1 = op_data->op_mea2; + rc = md_getattr_name(tgt->ltd_exp, op_data, &request); + op_data->op_fid1 = fid1; + op_data->op_mea1 = lsm1; + if (!rc) { + ptlrpc_req_finished(request); + RETURN(ERR_PTR(-EEXIST)); + } + + if (rc != -ENOENT) + RETURN(ERR_PTR(rc)); + } + + return lmv_locate_tgt_by_name(lmv, op_data->op_mea2, op_data->op_name, + op_data->op_namelen, &op_data->op_fid2, + &op_data->op_mds, true); +} + +int lmv_old_layout_lookup(struct lmv_obd *lmv, struct md_op_data *op_data) +{ + struct lu_tgt_desc *tgt; + struct ptlrpc_request *request; + int rc; + + LASSERT(lmv_dir_layout_changing(op_data->op_mea1)); + LASSERT(!op_data->op_new_layout); + + tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_getattr_name(tgt->ltd_exp, op_data, &request); + if (!rc) { + ptlrpc_req_finished(request); + return -EEXIST; + } + + return rc; +} + +/* mkdir by QoS upon 'lfs mkdir -i -1'. + * + * NB, mkdir by QoS only if parent is not striped, this is to avoid remote + * directories under striped directory. + */ +static inline bool lmv_op_user_qos_mkdir(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + if (op_data->op_code != LUSTRE_OPC_MKDIR) + return false; + + if (lmv_dir_striped(op_data->op_mea1)) + return false; + + return (op_data->op_cli_flags & CLI_SET_MEA) && lum && + le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC && + le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT; +} + +/* mkdir by QoS if either ROOT or parent default LMV is space balanced. 
*/ +static inline bool lmv_op_default_qos_mkdir(const struct md_op_data *op_data) +{ + const struct lmv_stripe_md *lsm = op_data->op_default_mea1; + + if (op_data->op_code != LUSTRE_OPC_MKDIR) + return false; + + if (lmv_dir_striped(op_data->op_mea1)) + return false; + + return (op_data->op_flags & MF_QOS_MKDIR) || + (lsm && lsm->lsm_md_master_mdt_index == LMV_OFFSET_DEFAULT); +} + +/* if parent default LMV is space balanced, and + * 1. max_inherit_rr is set + * 2. or parent is ROOT + * mkdir roundrobin. Or if parent doesn't have default LMV, while ROOT default + * LMV requests roundrobin mkdir, do the same. + * NB, this needs to check server is balanced, which is done by caller. + */ +static inline bool lmv_op_default_rr_mkdir(const struct md_op_data *op_data) +{ + const struct lmv_stripe_md *lsm = op_data->op_default_mea1; + + return (op_data->op_flags & MF_RR_MKDIR) || + (lsm && lsm->lsm_md_max_inherit_rr != LMV_INHERIT_RR_NONE) || + fid_is_root(&op_data->op_fid1); +} + +/* 'lfs mkdir -i ' */ +static inline bool lmv_op_user_specific_mkdir(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + return op_data->op_code == LUSTRE_OPC_MKDIR && + op_data->op_cli_flags & CLI_SET_MEA && lum && + (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC || + le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) && + le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT; +} + +/* parent default LMV master_mdt_index is not -1. */ +static inline bool +lmv_op_default_specific_mkdir(const struct md_op_data *op_data) +{ + return op_data->op_code == LUSTRE_OPC_MKDIR && + op_data->op_default_mea1 && + op_data->op_default_mea1->lsm_md_master_mdt_index != + LMV_OFFSET_DEFAULT; +} + +/* locate MDT by space usage */ +static struct lu_tgt_desc *lmv_locate_tgt_by_space(struct lmv_obd *lmv, + struct md_op_data *op_data, + struct lmv_tgt_desc *tgt) +{ + struct lmv_tgt_desc *tmp = tgt; + + tgt = lmv_locate_tgt_qos(lmv, op_data); + if (tgt == ERR_PTR(-EAGAIN)) { + if (ltd_qos_is_balanced(&lmv->lmv_mdt_descs) && + !lmv_op_default_rr_mkdir(op_data) && + !lmv_op_user_qos_mkdir(op_data)) + /* if not necessary, don't create remote directory. */ + tgt = tmp; + else + tgt = lmv_locate_tgt_rr(lmv); + } + + /* + * only update statfs after QoS mkdir, this means the cached statfs may + * be stale, and current mkdir may not follow QoS accurately, but it's + * not serious, and avoids periodic statfs when client doesn't mkdir by + * QoS. + */ + if (!IS_ERR(tgt)) { + op_data->op_mds = tgt->ltd_index; + lmv_statfs_check_update(lmv2obd_dev(lmv), tgt); + } + + return tgt; +} + +int lmv_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, size_t datalen, umode_t mode, uid_t uid, + gid_t gid, kernel_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *repbody; + int rc; + + ENTRY; + + if (!lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count) + RETURN(-EIO); + + if (lmv_dir_bad_hash(op_data->op_mea1)) + RETURN(-EBADF); + + if (lmv_dir_layout_changing(op_data->op_mea1)) { + /* + * if parent is migrating, create() needs to lookup existing + * name in both old and new layout, check old layout on client. 
+ */ + rc = lmv_old_layout_lookup(lmv, op_data); + if (rc != -ENOENT) + RETURN(rc); + + op_data->op_new_layout = true; + } + + tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + /* the order to apply policy in mkdir: + * 1. is "lfs mkdir -i N"? mkdir on MDT N. + * 2. is "lfs mkdir -i -1"? mkdir by space usage. + * 3. is starting MDT specified in default LMV? mkdir on MDT N. + * 4. is default LMV space balanced? mkdir by space usage. + */ + if (lmv_op_user_specific_mkdir(op_data)) { + struct lmv_user_md *lum = op_data->op_data; + + op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset); + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + RETURN(-ENODEV); + } else if (lmv_op_user_qos_mkdir(op_data)) { + tgt = lmv_locate_tgt_by_space(lmv, op_data, tgt); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } else if (lmv_op_default_specific_mkdir(op_data)) { + op_data->op_mds = + op_data->op_default_mea1->lsm_md_master_mdt_index; + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + RETURN(-ENODEV); + } else if (lmv_op_default_qos_mkdir(op_data)) { + tgt = lmv_locate_tgt_by_space(lmv, op_data, tgt); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } + +retry: + rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc) + RETURN(rc); + + CDEBUG(D_INODE, "CREATE name '%.*s' "DFID" on "DFID" -> mds #%x\n", + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid2), PFID(&op_data->op_fid1), + op_data->op_mds); + + op_data->op_flags |= MF_MDC_CANCEL_FID1; + rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid, + cap_effective, rdev, request); + if (rc == 0) { + if (*request == NULL) + RETURN(rc); + CDEBUG(D_INODE, "Created - "DFID"\n", PFID(&op_data->op_fid2)); + } + + /* dir restripe needs to send to MDT where dir is located */ + if (rc != -EREMOTE || + !(exp_connect_flags2(exp) & OBD_CONNECT2_CRUSH)) + RETURN(rc); + + repbody = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + if (repbody == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. */ + if (likely(!(repbody->mbo_valid & OBD_MD_MDS))) + RETURN(rc); + + op_data->op_fid2 = repbody->mbo_fid1; + ptlrpc_req_finished(*request); + *request = NULL; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_mds = tgt->ltd_index; + goto retry; +} + +static int +lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, struct md_op_data *op_data, + struct lustre_handle *lockh, __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + CDEBUG(D_INODE, "ENQUEUE on "DFID"\n", PFID(&op_data->op_fid1)); + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "ENQUEUE on "DFID" -> mds #%u\n", + PFID(&op_data->op_fid1), tgt->ltd_index); + + rc = md_enqueue(tgt->ltd_exp, einfo, policy, op_data, lockh, + extra_lock_flags); + + RETURN(rc); +} + +int +lmv_getattr_name(struct obd_export *exp,struct md_op_data *op_data, + struct ptlrpc_request **preq) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + + ENTRY; + +retry: + if (op_data->op_namelen == 2 && + op_data->op_name[0] == '.' 
&& op_data->op_name[1] == '.') + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + else + tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n", + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1), tgt->ltd_index); + + rc = md_getattr_name(tgt->ltd_exp, op_data, preq); + if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*preq); + *preq = NULL; + goto retry; + } + + if (rc) + RETURN(rc); + + body = req_capsule_server_get(&(*preq)->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + if (body->mbo_valid & OBD_MD_MDS) { + op_data->op_fid1 = body->mbo_fid1; + op_data->op_valid |= OBD_MD_FLCROSSREF; + op_data->op_namelen = 0; + op_data->op_name = NULL; + + ptlrpc_req_finished(*preq); + *preq = NULL; + + goto retry; + } + + RETURN(rc); +} + +#define md_op_data_fid(op_data, fl) \ + (fl == MF_MDC_CANCEL_FID1 ? &op_data->op_fid1 : \ + fl == MF_MDC_CANCEL_FID2 ? &op_data->op_fid2 : \ + fl == MF_MDC_CANCEL_FID3 ? &op_data->op_fid3 : \ + fl == MF_MDC_CANCEL_FID4 ? &op_data->op_fid4 : \ + NULL) + +static int lmv_early_cancel(struct obd_export *exp, struct lmv_tgt_desc *tgt, + struct md_op_data *op_data, __u32 op_tgt, + enum ldlm_mode mode, int bits, int flag) +{ + struct lu_fid *fid = md_op_data_fid(op_data, flag); + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + union ldlm_policy_data policy = { { 0 } }; + int rc = 0; + ENTRY; + + if (!fid_is_sane(fid)) + RETURN(0); + + if (tgt == NULL) { + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } + + if (tgt->ltd_index != op_tgt) { + CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid)); + policy.l_inodebits.bits = bits; + rc = md_cancel_unused(tgt->ltd_exp, fid, &policy, + mode, LCF_ASYNC, NULL); + } else { + CDEBUG(D_INODE, + "EARLY_CANCEL skip operation target %d on "DFID"\n", + op_tgt, PFID(fid)); + op_data->op_flags |= flag; + rc = 0; + } + + RETURN(rc); +} + +/* + * llite passes fid of an target inode in op_data->op_fid1 and id of directory in + * op_data->op_fid2 + */ +static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + LASSERT(op_data->op_namelen != 0); + + CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n", + PFID(&op_data->op_fid2), (int)op_data->op_namelen, + op_data->op_name, PFID(&op_data->op_fid1)); + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = current_cap(); + + tgt = lmv_locate_tgt2(lmv, op_data); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + /* + * Cancel UPDATE lock on child (fid1). 
+ */ + op_data->op_flags |= MF_MDC_CANCEL_FID2; + rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc != 0) + RETURN(rc); + + rc = md_link(tgt->ltd_exp, op_data, request); + + RETURN(rc); +} + +/* migrate the top directory */ +static inline bool lmv_op_topdir_migrate(const struct md_op_data *op_data) +{ + if (!S_ISDIR(op_data->op_mode)) + return false; + + if (lmv_dir_layout_changing(op_data->op_mea1)) + return false; + + return true; +} + +/* migrate top dir to specific MDTs */ +static inline bool lmv_topdir_specific_migrate(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + if (!lmv_op_topdir_migrate(op_data)) + return false; + + return le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT; +} + +/* migrate top dir in QoS mode if user issued "lfs migrate -m -1..." */ +static inline bool lmv_topdir_qos_migrate(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + if (!lmv_op_topdir_migrate(op_data)) + return false; + + return le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT; +} + +static inline bool lmv_subdir_specific_migrate(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + if (!S_ISDIR(op_data->op_mode)) + return false; + + if (!lmv_dir_layout_changing(op_data->op_mea1)) + return false; + + return le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT; +} + +static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, + const char *name, size_t namelen, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_tgt_desc *parent_tgt; + struct lmv_tgt_desc *sp_tgt; + struct lmv_tgt_desc *tp_tgt = NULL; + struct lmv_tgt_desc *child_tgt; + struct lmv_tgt_desc *tgt; + struct lu_fid target_fid = { 0 }; + int rc; + + ENTRY; + + LASSERT(op_data->op_cli_flags & CLI_MIGRATE); + + CDEBUG(D_INODE, "MIGRATE "DFID"/%.*s\n", + PFID(&op_data->op_fid1), (int)namelen, name); + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = current_cap(); + + parent_tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(parent_tgt)) + RETURN(PTR_ERR(parent_tgt)); + + if (lmv_dir_striped(lsm)) { + const struct lmv_oinfo *oinfo; + + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, false); + if (IS_ERR(oinfo)) + RETURN(PTR_ERR(oinfo)); + + /* save source stripe FID in fid4 temporarily for ELC */ + op_data->op_fid4 = oinfo->lmo_fid; + sp_tgt = lmv_tgt(lmv, oinfo->lmo_mds); + if (!sp_tgt) + RETURN(-ENODEV); + + /* + * if parent is being migrated too, fill op_fid2 with target + * stripe fid, otherwise the target stripe is not created yet. 
+ */ + if (lmv_dir_layout_changing(lsm)) { + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, + true); + if (IS_ERR(oinfo)) + RETURN(PTR_ERR(oinfo)); + + op_data->op_fid2 = oinfo->lmo_fid; + tp_tgt = lmv_tgt(lmv, oinfo->lmo_mds); + if (!tp_tgt) + RETURN(-ENODEV); + + /* parent unchanged and update namespace only */ + if (lu_fid_eq(&op_data->op_fid4, &op_data->op_fid2) && + op_data->op_bias & MDS_MIGRATE_NSONLY) + RETURN(-EALREADY); + } + } else { + sp_tgt = parent_tgt; + } + + child_tgt = lmv_fid2tgt(lmv, &op_data->op_fid3); + if (IS_ERR(child_tgt)) + RETURN(PTR_ERR(child_tgt)); + + if (lmv_topdir_specific_migrate(op_data)) { + struct lmv_user_md *lum = op_data->op_data; + + op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset); + } else if (lmv_topdir_qos_migrate(op_data)) { + tgt = lmv_locate_tgt_lf(lmv); + if (tgt == ERR_PTR(-EAGAIN)) + tgt = lmv_locate_tgt_rr(lmv); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_mds = tgt->ltd_index; + } else if (lmv_subdir_specific_migrate(op_data)) { + struct lmv_user_md *lum = op_data->op_data; + __u32 i; + + LASSERT(tp_tgt); + if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) { + /* adjust MDTs in lum, since subdir is located on where + * its parent stripe is, not the first specified MDT. + */ + for (i = 0; i < le32_to_cpu(lum->lum_stripe_count); + i++) { + if (le32_to_cpu(lum->lum_objects[i].lum_mds) == + tp_tgt->ltd_index) + break; + } + + if (i == le32_to_cpu(lum->lum_stripe_count)) + RETURN(-ENODEV); + + lum->lum_objects[i].lum_mds = + lum->lum_objects[0].lum_mds; + lum->lum_objects[0].lum_mds = + cpu_to_le32(tp_tgt->ltd_index); + } + /* NB, the above adjusts subdir migration for command like + * "lfs migrate -m 0,1,2 ...", but for migration like + * "lfs migrate -m 0 -c 2 ...", the top dir is migrated to MDT0 + * and MDT1, however its subdir may be migrated to MDT1 and MDT2 + */ + + lum->lum_stripe_offset = cpu_to_le32(tp_tgt->ltd_index); + op_data->op_mds = tp_tgt->ltd_index; + } else if (tp_tgt) { + op_data->op_mds = tp_tgt->ltd_index; + } else { + op_data->op_mds = sp_tgt->ltd_index; + } + + rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data); + if (rc) + RETURN(rc); + + /* + * for directory, send migrate request to the MDT where the object will + * be migrated to, because we can't create a striped directory remotely. + * + * otherwise, send to the MDT where source is located because regular + * file may open lease. + * + * NB. if MDT doesn't support DIR_MIGRATE, send to source MDT too for + * backward compatibility. + */ + if (S_ISDIR(op_data->op_mode) && + (exp_connect_flags2(exp) & OBD_CONNECT2_DIR_MIGRATE)) { + tgt = lmv_fid2tgt(lmv, &target_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } else { + tgt = child_tgt; + } + + /* cancel UPDATE lock of parent master object */ + rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc) + RETURN(rc); + + /* cancel UPDATE lock of source parent */ + if (sp_tgt != parent_tgt) { + /* + * migrate RPC packs master object FID, because we can only pack + * two FIDs in reint RPC, but MDS needs to know both source + * parent and target parent, and it will obtain them from master + * FID and LMV, the other FID in RPC is kept for target. + * + * since this FID is not passed to MDC, cancel it anyway. 
+ */ + rc = lmv_early_cancel(exp, sp_tgt, op_data, -1, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID4); + if (rc) + RETURN(rc); + + op_data->op_flags &= ~MF_MDC_CANCEL_FID4; + } + op_data->op_fid4 = target_fid; + + /* cancel UPDATE locks of target parent */ + rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); + if (rc) + RETURN(rc); + + /* cancel LOOKUP lock of source if source is remote object */ + if (child_tgt != sp_tgt) { + rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_index, + LCK_EX, MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + if (rc) + RETURN(rc); + } + + /* cancel ELC locks of source */ + rc = lmv_early_cancel(exp, child_tgt, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); + if (rc) + RETURN(rc); + + rc = md_rename(tgt->ltd_exp, op_data, name, namelen, NULL, 0, request); + + RETURN(rc); +} + +static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, size_t oldlen, + const char *new, size_t newlen, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *sp_tgt; + struct lmv_tgt_desc *tp_tgt = NULL; + struct lmv_tgt_desc *src_tgt = NULL; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + + ENTRY; + + LASSERT(oldlen != 0); + + if (op_data->op_cli_flags & CLI_MIGRATE) { + rc = lmv_migrate(exp, op_data, old, oldlen, request); + RETURN(rc); + } + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = current_cap(); + + op_data->op_name = new; + op_data->op_namelen = newlen; + + tp_tgt = lmv_locate_tgt2(lmv, op_data); + if (IS_ERR(tp_tgt)) + RETURN(PTR_ERR(tp_tgt)); + + /* Since the target child might be destroyed, and it might become + * orphan, and we can only check orphan on the local MDT right now, so + * we send rename request to the MDT where target child is located. 
If + * target child does not exist, then it will send the request to the + * target parent */ + if (fid_is_sane(&op_data->op_fid4)) { + tgt = lmv_fid2tgt(lmv, &op_data->op_fid4); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } else { + tgt = tp_tgt; + } + + op_data->op_flags |= MF_MDC_CANCEL_FID4; + + /* cancel UPDATE locks of target parent */ + rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); + if (rc != 0) + RETURN(rc); + + if (fid_is_sane(&op_data->op_fid4)) { + /* cancel LOOKUP lock of target on target parent */ + if (tgt != tp_tgt) { + rc = lmv_early_cancel(exp, tp_tgt, op_data, + tgt->ltd_index, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID4); + if (rc != 0) + RETURN(rc); + } + } + + if (fid_is_sane(&op_data->op_fid3)) { + src_tgt = lmv_fid2tgt(lmv, &op_data->op_fid3); + if (IS_ERR(src_tgt)) + RETURN(PTR_ERR(src_tgt)); + + /* cancel ELC locks of source */ + rc = lmv_early_cancel(exp, src_tgt, op_data, tgt->ltd_index, + LCK_EX, MDS_INODELOCK_ELC, + MF_MDC_CANCEL_FID3); + if (rc != 0) + RETURN(rc); + } + + op_data->op_name = old; + op_data->op_namelen = oldlen; +retry: + sp_tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(sp_tgt)) + RETURN(PTR_ERR(sp_tgt)); + + /* cancel UPDATE locks of source parent */ + rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc != 0) + RETURN(rc); + + if (fid_is_sane(&op_data->op_fid3)) { + /* cancel LOOKUP lock of source on source parent */ + if (src_tgt != sp_tgt) { + rc = lmv_early_cancel(exp, sp_tgt, op_data, + tgt->ltd_index, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + if (rc != 0) + RETURN(rc); + } + } + +rename: + CDEBUG(D_INODE, "RENAME "DFID"/%.*s to "DFID"/%.*s\n", + PFID(&op_data->op_fid1), (int)oldlen, old, + PFID(&op_data->op_fid2), (int)newlen, new); + + rc = md_rename(tgt->ltd_exp, op_data, old, oldlen, new, newlen, + request); + if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*request); + *request = NULL; + goto retry; + } + + if (rc && rc != -EXDEV) + RETURN(rc); + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. 
*/ + if (likely(!(body->mbo_valid & OBD_MD_MDS))) + RETURN(rc); + + op_data->op_fid4 = body->mbo_fid1; + + ptlrpc_req_finished(*request); + *request = NULL; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid4); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (fid_is_sane(&op_data->op_fid4)) { + /* cancel LOOKUP lock of target on target parent */ + if (tgt != tp_tgt) { + rc = lmv_early_cancel(exp, tp_tgt, op_data, + tgt->ltd_index, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID4); + if (rc != 0) + RETURN(rc); + } + } + + goto rename; +} + +static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc = 0; + + ENTRY; + + CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x/0x%x\n", + PFID(&op_data->op_fid1), op_data->op_attr.ia_valid, + op_data->op_xvalid); + + op_data->op_flags |= MF_MDC_CANCEL_FID1; + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_setattr(tgt->ltd_exp, op_data, ea, ealen, request); + + RETURN(rc); +} + +static int lmv_fsync(struct obd_export *exp, const struct lu_fid *fid, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_fsync(tgt->ltd_exp, fid, request); + RETURN(rc); +} + +struct stripe_dirent { + struct page *sd_page; + struct lu_dirpage *sd_dp; + struct lu_dirent *sd_ent; + bool sd_eof; +}; + +struct lmv_dir_ctxt { + struct lmv_obd *ldc_lmv; + struct md_op_data *ldc_op_data; + struct md_readdir_info *ldc_mrinfo; + __u64 ldc_hash; + int ldc_count; + struct stripe_dirent ldc_stripes[0]; +}; + +static inline void stripe_dirent_unload(struct stripe_dirent *stripe) +{ + if (stripe->sd_page) { + kunmap(stripe->sd_page); + put_page(stripe->sd_page); + stripe->sd_page = NULL; + stripe->sd_ent = NULL; + } +} + +static inline void put_lmv_dir_ctxt(struct lmv_dir_ctxt *ctxt) +{ + int i; + + for (i = 0; i < ctxt->ldc_count; i++) + stripe_dirent_unload(&ctxt->ldc_stripes[i]); +} + +/* if @ent is dummy, or . .., get next */ +static struct lu_dirent *stripe_dirent_get(struct lmv_dir_ctxt *ctxt, + struct lu_dirent *ent, + int stripe_index) +{ + for (; ent; ent = lu_dirent_next(ent)) { + /* Skip dummy entry */ + if (le16_to_cpu(ent->lde_namelen) == 0) + continue; + + /* skip . and .. 
for other stripes */ + if (stripe_index && + (strncmp(ent->lde_name, ".", + le16_to_cpu(ent->lde_namelen)) == 0 || + strncmp(ent->lde_name, "..", + le16_to_cpu(ent->lde_namelen)) == 0)) + continue; + + if (le64_to_cpu(ent->lde_hash) >= ctxt->ldc_hash) + break; + } + + return ent; +} + +static struct lu_dirent *stripe_dirent_load(struct lmv_dir_ctxt *ctxt, + struct stripe_dirent *stripe, + int stripe_index) +{ + struct md_op_data *op_data = ctxt->ldc_op_data; + struct lmv_oinfo *oinfo; + struct lu_fid fid = op_data->op_fid1; + struct inode *inode = op_data->op_data; + struct lmv_tgt_desc *tgt; + struct lu_dirent *ent = stripe->sd_ent; + __u64 hash = ctxt->ldc_hash; + int rc = 0; + + ENTRY; + + LASSERT(stripe == &ctxt->ldc_stripes[stripe_index]); + LASSERT(!ent); + + do { + if (stripe->sd_page) { + __u64 end = le64_to_cpu(stripe->sd_dp->ldp_hash_end); + + /* @hash should be the last dirent hash */ + LASSERTF(hash <= end, + "ctxt@%p stripe@%p hash %llx end %llx\n", + ctxt, stripe, hash, end); + /* unload last page */ + stripe_dirent_unload(stripe); + /* eof */ + if (end == MDS_DIR_END_OFF) { + stripe->sd_eof = true; + break; + } + hash = end; + } + + oinfo = &op_data->op_mea1->lsm_md_oinfo[stripe_index]; + if (!oinfo->lmo_root) { + rc = -ENOENT; + break; + } + + tgt = lmv_tgt(ctxt->ldc_lmv, oinfo->lmo_mds); + if (!tgt) { + rc = -ENODEV; + break; + } + + /* op_data is shared by stripes, reset after use */ + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_fid2 = oinfo->lmo_fid; + op_data->op_data = oinfo->lmo_root; + + rc = md_read_page(tgt->ltd_exp, op_data, ctxt->ldc_mrinfo, hash, + &stripe->sd_page); + + op_data->op_fid1 = fid; + op_data->op_fid2 = fid; + op_data->op_data = inode; + + if (rc) + break; + + stripe->sd_dp = page_address(stripe->sd_page); + ent = stripe_dirent_get(ctxt, lu_dirent_start(stripe->sd_dp), + stripe_index); + /* in case a page filled with ., .. and dummy, read next */ + } while (!ent); + + stripe->sd_ent = ent; + if (rc) { + LASSERT(!ent); + /* treat error as eof, so dir can be partially accessed */ + stripe->sd_eof = true; + ctxt->ldc_mrinfo->mr_partial_readdir_rc = rc; + LCONSOLE_WARN("dir "DFID" stripe %d readdir failed: %d, " + "directory is partially accessed!\n", + PFID(&ctxt->ldc_op_data->op_fid1), stripe_index, + rc); + } + + RETURN(ent); +} + +static int lmv_file_resync(struct obd_export *exp, struct md_op_data *data) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + rc = lmv_check_connect(obd); + if (rc != 0) + RETURN(rc); + + tgt = lmv_fid2tgt(lmv, &data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + data->op_flags |= MF_MDC_CANCEL_FID1; + rc = md_file_resync(tgt->ltd_exp, data); + RETURN(rc); +} + +/** + * Get dirent with the closest hash for striped directory + * + * This function will search the dir entry, whose hash value is the + * closest(>=) to hash from all of sub-stripes, and it is only being called + * for striped directory. + * + * \param[in] ctxt dir read context + * + * \retval dirent get the entry successfully + * NULL does not get the entry, normally it means + * it reaches the end of the directory, while read + * stripe dirent error is ignored to allow partial + * access. 
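+ * + * For example, if the current stripe heads carry hashes 0x30 and 0x20 and + * ldc_hash is 0x15, the entry with hash 0x20 is returned and that stripe + * then advances to its next entry.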
+ */ +static struct lu_dirent *lmv_dirent_next(struct lmv_dir_ctxt *ctxt) +{ + struct stripe_dirent *stripe; + struct lu_dirent *ent = NULL; + int i; + int min = -1; + + /* TODO: optimize with k-way merge sort */ + for (i = 0; i < ctxt->ldc_count; i++) { + stripe = &ctxt->ldc_stripes[i]; + if (stripe->sd_eof) + continue; + + if (!stripe->sd_ent) { + stripe_dirent_load(ctxt, stripe, i); + if (!stripe->sd_ent) { + LASSERT(stripe->sd_eof); + continue; + } + } + + if (min == -1 || + le64_to_cpu(ctxt->ldc_stripes[min].sd_ent->lde_hash) > + le64_to_cpu(stripe->sd_ent->lde_hash)) { + min = i; + if (le64_to_cpu(stripe->sd_ent->lde_hash) == + ctxt->ldc_hash) + break; + } + } + + if (min != -1) { + stripe = &ctxt->ldc_stripes[min]; + ent = stripe->sd_ent; + /* pop found dirent */ + stripe->sd_ent = stripe_dirent_get(ctxt, lu_dirent_next(ent), + min); + } + + return ent; +} + +/** + * Build dir entry page for striped directory + * + * This function gets one entry by @offset from a striped directory. It will + * read entries from all of stripes, and choose one closest to the required + * offset(&offset). A few notes + * 1. skip . and .. for non-zero stripes, because there can only have one . + * and .. in a directory. + * 2. op_data will be shared by all of stripes, instead of allocating new + * one, so need to restore before reusing. + * + * \param[in] exp obd export refer to LMV + * \param[in] op_data hold those MD parameters of read_entry + * \param[in] mrinfo ldlm callback being used in enqueue in mdc_read_entry, + * and partial readdir result will be stored in it. + * \param[in] offset starting hash offset + * \param[out] ppage the page holding the entry. Note: because the entry + * will be accessed in upper layer, so we need hold the + * page until the usages of entry is finished, see + * ll_dir_entry_next. 
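+ * + * The merged page covers the hash range [offset, ldp_hash_end): entries from + * all stripes are copied in ascending hash order until the page is full, and + * ldp_hash_end tells the caller where the next call should resume (it is set + * to MDS_DIR_END_OFF at the end of the directory).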
+ * + * retval 0 if the entry is retrieved successfully + * <0 if the entry cannot be retrieved + */ +static int lmv_striped_read_page(struct obd_export *exp, + struct md_op_data *op_data, + struct md_readdir_info *mrinfo, __u64 offset, + struct page **ppage) +{ + struct page *page = NULL; + struct lu_dirpage *dp; + void *start; + struct lu_dirent *ent; + struct lu_dirent *last_ent; + int stripe_count; + struct lmv_dir_ctxt *ctxt; + struct lu_dirent *next = NULL; + __u16 ent_size; + size_t left_bytes; + int rc = 0; + ENTRY; + + /* Allocate a page, read entries from all stripes and fill + * the page in hash order */ + page = alloc_page(GFP_KERNEL); + if (!page) + RETURN(-ENOMEM); + + /* Initialize the entry page */ + dp = kmap(page); + memset(dp, 0, sizeof(*dp)); + dp->ldp_hash_start = cpu_to_le64(offset); + + start = dp + 1; + left_bytes = PAGE_SIZE - sizeof(*dp); + ent = start; + last_ent = ent; + + /* initialize dir read context */ + stripe_count = op_data->op_mea1->lsm_md_stripe_count; + OBD_ALLOC(ctxt, offsetof(typeof(*ctxt), ldc_stripes[stripe_count])); + if (!ctxt) + GOTO(free_page, rc = -ENOMEM); + ctxt->ldc_lmv = &exp->exp_obd->u.lmv; + ctxt->ldc_op_data = op_data; + ctxt->ldc_mrinfo = mrinfo; + ctxt->ldc_hash = offset; + ctxt->ldc_count = stripe_count; + + while (1) { + next = lmv_dirent_next(ctxt); + + /* end of directory */ + if (!next) { + ctxt->ldc_hash = MDS_DIR_END_OFF; + break; + } + ctxt->ldc_hash = le64_to_cpu(next->lde_hash); + + ent_size = le16_to_cpu(next->lde_reclen); + + /* the last entry's lde_reclen is 0, but it might not be the last + * one of this temporary dir page */ + if (!ent_size) + ent_size = lu_dirent_calc_size( + le16_to_cpu(next->lde_namelen), + le32_to_cpu(next->lde_attrs)); + /* page full */ + if (ent_size > left_bytes) + break; + + memcpy(ent, next, ent_size); + + /* Replace . with the master FID and replace .. 
with the parent FID + * of the master object */ + if (strncmp(ent->lde_name, ".", + le16_to_cpu(ent->lde_namelen)) == 0 && + le16_to_cpu(ent->lde_namelen) == 1) + fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid1); + else if (strncmp(ent->lde_name, "..", + le16_to_cpu(ent->lde_namelen)) == 0 && + le16_to_cpu(ent->lde_namelen) == 2) + fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid3); + + CDEBUG(D_INODE, "entry %.*s hash %#llx\n", + le16_to_cpu(ent->lde_namelen), ent->lde_name, + le64_to_cpu(ent->lde_hash)); + + left_bytes -= ent_size; + ent->lde_reclen = cpu_to_le16(ent_size); + last_ent = ent; + ent = (void *)ent + ent_size; + } + + last_ent->lde_reclen = 0; + + if (ent == start) + dp->ldp_flags |= LDF_EMPTY; + else if (ctxt->ldc_hash == le64_to_cpu(last_ent->lde_hash)) + dp->ldp_flags |= LDF_COLLIDE; + dp->ldp_flags = cpu_to_le32(dp->ldp_flags); + dp->ldp_hash_end = cpu_to_le64(ctxt->ldc_hash); + + put_lmv_dir_ctxt(ctxt); + OBD_FREE(ctxt, offsetof(typeof(*ctxt), ldc_stripes[stripe_count])); + + *ppage = page; + + RETURN(0); + +free_page: + kunmap(page); + __free_page(page); + + return rc; +} + +static int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, + struct md_readdir_info *mrinfo, __u64 offset, + struct page **ppage) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + if (unlikely(lmv_dir_foreign(op_data->op_mea1))) + RETURN(-ENODATA); + + if (unlikely(lmv_dir_striped(op_data->op_mea1))) { + rc = lmv_striped_read_page(exp, op_data, mrinfo, offset, ppage); + RETURN(rc); + } + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_read_page(tgt->ltd_exp, op_data, mrinfo, offset, ppage); + + RETURN(rc); +} + +/** + * Unlink a file/directory + * + * Unlink a file or directory under the parent dir. The unlink request + * usually will be sent to the MDT where the child is located, but if + * the client does not have the child FID then the request will be sent to + * the MDT where the parent is located. + * + * If the parent is a striped directory, it also needs to locate the + * stripe in which the child name is located, and replace the parent FID + * (@op->op_fid1) with the stripe FID. Note: if the stripe is unknown, + * it will walk through all sub-stripes until the child is finally + * unlinked. + * + * \param[in] exp export referring to LMV + * \param[in] op_data different parameters transferred between client + * MD stacks, name, namelen, FIDs etc. + * op_fid1 is the parent FID, op_fid2 is the child + * FID. + * \param[out] request pointer to the unlink request. + * + * retval 0 if successful + * negative errno if failed. 
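+ * + * If the child turns out to be a remote object, md_unlink() returns -EREMOTE + * with the real FID in the reply body, and the request is resent to the MDT + * that actually holds the object.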
+ */ +static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct lmv_tgt_desc *parent_tgt; + struct mdt_body *body; + int rc; + + ENTRY; + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = current_cap(); + +retry: + parent_tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(parent_tgt)) + RETURN(PTR_ERR(parent_tgt)); + + if (likely(!fid_is_zero(&op_data->op_fid2))) { + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } else { + tgt = parent_tgt; + } + + /* + * If child's fid is given, cancel unused locks for it if it is from + * another export than parent. + * + * LOOKUP lock for child (fid3) should also be cancelled on parent + * tgt_tgt in mdc_unlink(). + */ + op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; + + if (parent_tgt != tgt) + rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_index, + LCK_EX, MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + + rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); + if (rc) + RETURN(rc); + + CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%u\n", + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), + tgt->ltd_index); + + rc = md_unlink(tgt->ltd_exp, op_data, request); + if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*request); + *request = NULL; + goto retry; + } + + if (rc != -EREMOTE) + RETURN(rc); + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. */ + if (likely(!(body->mbo_valid & OBD_MD_MDS))) + RETURN(rc); + + /* This is a remote object, try remote MDT. */ + op_data->op_fid2 = body->mbo_fid1; + ptlrpc_req_finished(*request); + *request = NULL; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + goto retry; +} + +static int lmv_precleanup(struct obd_device *obd) +{ + ENTRY; + libcfs_kkuc_group_rem(&obd->obd_uuid, 0, KUC_GRP_HSM); + fld_client_debugfs_fini(&obd->u.lmv.lmv_fld); + lprocfs_obd_cleanup(obd); + lprocfs_free_md_stats(obd); + RETURN(0); +} + +/** + * Get by key a value associated with a LMV device. + * + * Dispatch request to lower-layer devices as needed. 
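+ * Keys handled here are "remote_flag" (probed on each connected MDT until + * one answers), KEY_MAX_EASIZE, KEY_DEFAULT_EASIZE and KEY_CONN_DATA + * (forwarded to the first MDT), and KEY_TGT_COUNT (answered from the local + * target count).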
+ * + * \param[in] env execution environment for this thread + * \param[in] exp export for the LMV device + * \param[in] keylen length of key identifier + * \param[in] key identifier of key to get value for + * \param[in] vallen size of \a val + * \param[out] val pointer to storage location for value + * \param[in] lsm optional striping metadata of object + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + struct obd_device *obd; + struct lmv_obd *lmv; + struct lu_tgt_desc *tgt; + int rc = 0; + + ENTRY; + + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + + lmv = &obd->u.lmv; + if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) { + LASSERT(*vallen == sizeof(__u32)); + lmv_foreach_connected_tgt(lmv, tgt) { + if (!obd_get_info(env, tgt->ltd_exp, keylen, key, + vallen, val)) + RETURN(0); + } + RETURN(-EINVAL); + } else if (KEY_IS(KEY_MAX_EASIZE) || + KEY_IS(KEY_DEFAULT_EASIZE) || + KEY_IS(KEY_CONN_DATA)) { + /* + * Forwarding this request to first MDS, it should know LOV + * desc. + */ + tgt = lmv_tgt(lmv, 0); + if (!tgt) + RETURN(-ENODEV); + + rc = obd_get_info(env, tgt->ltd_exp, keylen, key, vallen, val); + if (!rc && KEY_IS(KEY_CONN_DATA)) + exp->exp_connect_data = *(struct obd_connect_data *)val; + RETURN(rc); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lmv->lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count; + RETURN(0); + } + + CDEBUG(D_IOCTL, "Invalid key\n"); + RETURN(-EINVAL); +} + +static int lmv_rmfid(struct obd_export *exp, struct fid_array *fa, + int *__rcs, struct ptlrpc_request_set *_set) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request_set *set = _set; + struct lmv_obd *lmv = &obd->u.lmv; + int tgt_count = lmv->lmv_mdt_count; + struct lu_tgt_desc *tgt; + struct fid_array *fat, **fas = NULL; + int i, rc, **rcs = NULL; + + if (!set) { + set = ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } + + /* split FIDs by targets */ + OBD_ALLOC_PTR_ARRAY(fas, tgt_count); + if (fas == NULL) + GOTO(out, rc = -ENOMEM); + OBD_ALLOC_PTR_ARRAY(rcs, tgt_count); + if (rcs == NULL) + GOTO(out_fas, rc = -ENOMEM); + + for (i = 0; i < fa->fa_nr; i++) { + unsigned int idx; + + rc = lmv_fld_lookup(lmv, &fa->fa_fids[i], &idx); + if (rc) { + CDEBUG(D_OTHER, "can't lookup "DFID": rc = %d\n", + PFID(&fa->fa_fids[i]), rc); + continue; + } + LASSERT(idx < tgt_count); + if (!fas[idx]) + OBD_ALLOC(fas[idx], offsetof(struct fid_array, + fa_fids[fa->fa_nr])); + if (!fas[idx]) + GOTO(out, rc = -ENOMEM); + if (!rcs[idx]) + OBD_ALLOC_PTR_ARRAY(rcs[idx], fa->fa_nr); + if (!rcs[idx]) + GOTO(out, rc = -ENOMEM); + + fat = fas[idx]; + fat->fa_fids[fat->fa_nr++] = fa->fa_fids[i]; + } + + lmv_foreach_connected_tgt(lmv, tgt) { + fat = fas[tgt->ltd_index]; + if (!fat || fat->fa_nr == 0) + continue; + rc = md_rmfid(tgt->ltd_exp, fat, rcs[tgt->ltd_index], set); + } + + rc = ptlrpc_set_wait(NULL, set); + if (rc == 0) { + int j = 0; + for (i = 0; i < tgt_count; i++) { + fat = fas[i]; + if (!fat || fat->fa_nr == 0) + continue; + /* copy FIDs back */ + memcpy(fa->fa_fids + j, fat->fa_fids, + fat->fa_nr * sizeof(struct lu_fid)); + /* copy rcs back */ + memcpy(__rcs + j, rcs[i], fat->fa_nr * sizeof(**rcs)); + j += fat->fa_nr; + } + } + if (set != _set) + ptlrpc_set_destroy(set); + +out: + for (i = 0; i < tgt_count; i++) { + if 
(fas && fas[i]) + OBD_FREE(fas[i], offsetof(struct fid_array, + fa_fids[fa->fa_nr])); + if (rcs && rcs[i]) + OBD_FREE_PTR_ARRAY(rcs[i], fa->fa_nr); + } + if (rcs) + OBD_FREE_PTR_ARRAY(rcs, tgt_count); +out_fas: + if (fas) + OBD_FREE_PTR_ARRAY(fas, tgt_count); + + RETURN(rc); +} + +/** + * Asynchronously set by key a value associated with a LMV device. + * + * Dispatch request to lower-layer devices as needed. + * + * \param[in] env execution environment for this thread + * \param[in] exp export for the LMV device + * \param[in] keylen length of key identifier + * \param[in] key identifier of key to store value for + * \param[in] vallen size of value to store + * \param[in] val pointer to data to be stored + * \param[in] set optional list of related ptlrpc requests + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +static int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct lmv_tgt_desc *tgt; + struct obd_device *obd; + struct lmv_obd *lmv; + int rc = 0; + ENTRY; + + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + lmv = &obd->u.lmv; + + if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX) || + KEY_IS(KEY_DEFAULT_EASIZE)) { + int err = 0; + + lmv_foreach_connected_tgt(lmv, tgt) { + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, vallen, val, set); + if (err && rc == 0) + rc = err; + } + + RETURN(rc); + } + + RETURN(-EINVAL); +} + +static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, + const struct lmv_mds_md_v1 *lmm1) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + int stripe_count; + int cplen; + int i; + int rc = 0; + ENTRY; + + lsm->lsm_md_magic = le32_to_cpu(lmm1->lmv_magic); + lsm->lsm_md_stripe_count = le32_to_cpu(lmm1->lmv_stripe_count); + lsm->lsm_md_master_mdt_index = le32_to_cpu(lmm1->lmv_master_mdt_index); + if (OBD_FAIL_CHECK(OBD_FAIL_UNKNOWN_LMV_STRIPE)) + lsm->lsm_md_hash_type = LMV_HASH_TYPE_UNKNOWN; + else + lsm->lsm_md_hash_type = le32_to_cpu(lmm1->lmv_hash_type); + lsm->lsm_md_layout_version = le32_to_cpu(lmm1->lmv_layout_version); + lsm->lsm_md_migrate_offset = le32_to_cpu(lmm1->lmv_migrate_offset); + lsm->lsm_md_migrate_hash = le32_to_cpu(lmm1->lmv_migrate_hash); + cplen = strlcpy(lsm->lsm_md_pool_name, lmm1->lmv_pool_name, + sizeof(lsm->lsm_md_pool_name)); + + if (cplen >= sizeof(lsm->lsm_md_pool_name)) + RETURN(-E2BIG); + + CDEBUG(D_INFO, "unpack lsm count %d/%d, master %d hash_type %#x/%#x " + "layout_version %d\n", lsm->lsm_md_stripe_count, + lsm->lsm_md_migrate_offset, lsm->lsm_md_master_mdt_index, + lsm->lsm_md_hash_type, lsm->lsm_md_migrate_hash, + lsm->lsm_md_layout_version); + + stripe_count = le32_to_cpu(lmm1->lmv_stripe_count); + for (i = 0; i < stripe_count; i++) { + fid_le_to_cpu(&lsm->lsm_md_oinfo[i].lmo_fid, + &lmm1->lmv_stripe_fids[i]); + /* + * set default value -1, so lmv_locate_tgt() knows this stripe + * target is not initialized. 
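+ * The placeholder used is LMV_OFFSET_DEFAULT; lmv_fld_lookup() below fills + * in the real MDT index for every sane FID.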
+ */ + lsm->lsm_md_oinfo[i].lmo_mds = LMV_OFFSET_DEFAULT; + if (!fid_is_sane(&lsm->lsm_md_oinfo[i].lmo_fid)) + continue; + + rc = lmv_fld_lookup(lmv, &lsm->lsm_md_oinfo[i].lmo_fid, + &lsm->lsm_md_oinfo[i].lmo_mds); + if (rc == -ENOENT) + continue; + + if (rc) + RETURN(rc); + + CDEBUG(D_INFO, "unpack fid #%d "DFID"\n", i, + PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); + } + + RETURN(rc); +} + +static inline int lmv_unpack_user_md(struct obd_export *exp, + struct lmv_stripe_md *lsm, + const struct lmv_user_md *lmu) +{ + lsm->lsm_md_magic = le32_to_cpu(lmu->lum_magic); + lsm->lsm_md_stripe_count = le32_to_cpu(lmu->lum_stripe_count); + lsm->lsm_md_master_mdt_index = le32_to_cpu(lmu->lum_stripe_offset); + lsm->lsm_md_hash_type = le32_to_cpu(lmu->lum_hash_type); + lsm->lsm_md_max_inherit = lmu->lum_max_inherit; + lsm->lsm_md_max_inherit_rr = lmu->lum_max_inherit_rr; + lsm->lsm_md_pool_name[LOV_MAXPOOLNAME] = 0; + + return 0; +} + +static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, + const union lmv_mds_md *lmm, size_t lmm_size) +{ + struct lmv_stripe_md *lsm; + int lsm_size; + int rc; + bool allocated = false; + ENTRY; + + LASSERT(lsmp != NULL); + + lsm = *lsmp; + /* Free memmd */ + if (lsm != NULL && lmm == NULL) { + int i; + struct lmv_foreign_md *lfm = (struct lmv_foreign_md *)lsm; + + if (lfm->lfm_magic == LMV_MAGIC_FOREIGN) { + size_t lfm_size; + + lfm_size = lfm->lfm_length + offsetof(typeof(*lfm), + lfm_value[0]); + OBD_FREE_LARGE(lfm, lfm_size); + RETURN(0); + } + + if (lmv_dir_striped(lsm)) { + for (i = 0; i < lsm->lsm_md_stripe_count; i++) + iput(lsm->lsm_md_oinfo[i].lmo_root); + lsm_size = lmv_stripe_md_size(lsm->lsm_md_stripe_count); + } else { + lsm_size = lmv_stripe_md_size(0); + } + OBD_FREE(lsm, lsm_size); + *lsmp = NULL; + RETURN(0); + } + + /* foreign lmv case */ + if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_FOREIGN) { + struct lmv_foreign_md *lfm = (struct lmv_foreign_md *)lsm; + + if (lfm == NULL) { + OBD_ALLOC_LARGE(lfm, lmm_size); + if (lfm == NULL) + RETURN(-ENOMEM); + *lsmp = (struct lmv_stripe_md *)lfm; + } + lfm->lfm_magic = le32_to_cpu(lmm->lmv_foreign_md.lfm_magic); + lfm->lfm_length = le32_to_cpu(lmm->lmv_foreign_md.lfm_length); + lfm->lfm_type = le32_to_cpu(lmm->lmv_foreign_md.lfm_type); + lfm->lfm_flags = le32_to_cpu(lmm->lmv_foreign_md.lfm_flags); + memcpy(&lfm->lfm_value, &lmm->lmv_foreign_md.lfm_value, + lfm->lfm_length); + RETURN(lmm_size); + } + + if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_STRIPE) + RETURN(-EPERM); + + /* Unpack memmd */ + if (le32_to_cpu(lmm->lmv_magic) != LMV_MAGIC_V1 && + le32_to_cpu(lmm->lmv_magic) != LMV_USER_MAGIC) { + CERROR("%s: invalid lmv magic %x: rc = %d\n", + exp->exp_obd->obd_name, le32_to_cpu(lmm->lmv_magic), + -EIO); + RETURN(-EIO); + } + + if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_V1) + lsm_size = lmv_stripe_md_size(lmv_mds_md_stripe_count_get(lmm)); + else + /** + * Unpack default dirstripe(lmv_user_md) to lmv_stripe_md, + * stripecount should be 0 then. 
+ */ + lsm_size = lmv_stripe_md_size(0); + + if (lsm == NULL) { + OBD_ALLOC(lsm, lsm_size); + if (lsm == NULL) + RETURN(-ENOMEM); + allocated = true; + *lsmp = lsm; + } + + switch (le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + rc = lmv_unpack_md_v1(exp, lsm, &lmm->lmv_md_v1); + break; + case LMV_USER_MAGIC: + rc = lmv_unpack_user_md(exp, lsm, &lmm->lmv_user_md); + break; + default: + CERROR("%s: unrecognized magic %x\n", exp->exp_obd->obd_name, + le32_to_cpu(lmm->lmv_magic)); + rc = -EINVAL; + break; + } + + if (rc != 0 && allocated) { + OBD_FREE(lsm, lsm_size); + *lsmp = NULL; + lsm_size = rc; + } + RETURN(lsm_size); +} + +void lmv_free_memmd(struct lmv_stripe_md *lsm) +{ + lmv_unpackmd(NULL, &lsm, NULL, 0); +} +EXPORT_SYMBOL(lmv_free_memmd); + +static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, + enum ldlm_mode mode, enum ldlm_cancel_flags flags, + void *opaque) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lu_tgt_desc *tgt; + int err; + int rc = 0; + + ENTRY; + + LASSERT(fid != NULL); + + lmv_foreach_connected_tgt(lmv, tgt) { + if (!tgt->ltd_active) + continue; + + err = md_cancel_unused(tgt->ltd_exp, fid, policy, mode, flags, + opaque); + if (!rc) + rc = err; + } + RETURN(rc); +} + +static int lmv_set_lock_data(struct obd_export *exp, + const struct lustre_handle *lockh, + void *data, __u64 *bits) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + int rc; + + ENTRY; + + if (tgt == NULL || tgt->ltd_exp == NULL) + RETURN(-EINVAL); + rc = md_set_lock_data(tgt->ltd_exp, lockh, data, bits); + RETURN(rc); +} + +static enum ldlm_mode +lmv_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, struct lustre_handle *lockh) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + enum ldlm_mode rc; + struct lu_tgt_desc *tgt; + int i; + int index; + + ENTRY; + + CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid)); + + /* + * With DNE every object can have two locks in different namespaces: + * lookup lock in space of MDT storing direntry and update/open lock in + * space of MDT storing inode. Try the MDT that the FID maps to first, + * since this can be easily found, and only try others if that fails. 
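+ * The loop below therefore starts at lmv_fid2tgt_index() and walks the + * remaining targets round-robin, skipping inactive or unconnected ones.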
+ */ + for (i = 0, index = lmv_fid2tgt_index(lmv, fid); + i < lmv->lmv_mdt_descs.ltd_tgts_size; + i++, index = (index + 1) % lmv->lmv_mdt_descs.ltd_tgts_size) { + if (index < 0) { + CDEBUG(D_HA, "%s: "DFID" is inaccessible: rc = %d\n", + obd->obd_name, PFID(fid), index); + index = 0; + } + + tgt = lmv_tgt(lmv, index); + if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) + continue; + + rc = md_lock_match(tgt->ltd_exp, flags, fid, type, policy, mode, + lockh); + if (rc) + RETURN(rc); + } + + RETURN(0); +} + +static int +lmv_get_lustre_md(struct obd_export *exp, struct req_capsule *pill, + struct obd_export *dt_exp, struct obd_export *md_exp, + struct lustre_md *md) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + + if (!tgt || !tgt->ltd_exp) + return -EINVAL; + + return md_get_lustre_md(tgt->ltd_exp, pill, dt_exp, md_exp, md); +} + +static int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + + ENTRY; + + if (md->default_lmv) { + lmv_free_memmd(md->default_lmv); + md->default_lmv = NULL; + } + if (md->lmv != NULL) { + lmv_free_memmd(md->lmv); + md->lmv = NULL; + } + if (!tgt || !tgt->ltd_exp) + RETURN(-EINVAL); + RETURN(md_free_lustre_md(tgt->ltd_exp, md)); +} + +static int lmv_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, &och->och_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + RETURN(md_set_open_replay_data(tgt->ltd_exp, och, it)); +} + +static int lmv_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, &och->och_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + RETURN(md_clear_open_replay_data(tgt->ltd_exp, och)); +} + +static int lmv_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *ptgt; + struct lmv_tgt_desc *ctgt; + int rc; + + ENTRY; + + if (!fid_is_sane(&op_data->op_fid2)) + RETURN(-EINVAL); + + ptgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(ptgt)) + RETURN(PTR_ERR(ptgt)); + + ctgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(ctgt)) + RETURN(PTR_ERR(ctgt)); + + /* + * remote object needs two RPCs to lookup and getattr, considering the + * complexity don't support statahead for now. 
+ */ + if (ctgt != ptgt) + RETURN(-EREMOTE); + + rc = md_intent_getattr_async(ptgt->ltd_exp, minfo); + + RETURN(rc); +} + +static int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_revalidate_lock(tgt->ltd_exp, it, fid, bits); + RETURN(rc); +} + +static int lmv_get_fid_from_lsm(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + const char *name, int namelen, + struct lu_fid *fid) +{ + const struct lmv_oinfo *oinfo; + + LASSERT(lmv_dir_striped(lsm)); + + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, false); + if (IS_ERR(oinfo)) + return PTR_ERR(oinfo); + + *fid = oinfo->lmo_fid; + + RETURN(0); +} + +/** + * For lmv, only need to send request to master MDT, and the master MDT will + * process with other slave MDTs. The only exception is Q_GETOQUOTA for which + * we directly fetch data from the slave MDTs. + */ +static int lmv_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + __u64 curspace, curinodes; + int rc = 0; + + ENTRY; + + if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) { + CERROR("master lmv inactive\n"); + RETURN(-EIO); + } + + if (oqctl->qc_cmd != Q_GETOQUOTA) { + rc = obd_quotactl(tgt->ltd_exp, oqctl); + RETURN(rc); + } + + curspace = curinodes = 0; + lmv_foreach_connected_tgt(lmv, tgt) { + int err; + + if (!tgt->ltd_active) + continue; + + err = obd_quotactl(tgt->ltd_exp, oqctl); + if (err) { + CERROR("getquota on mdt %d failed. %d\n", + tgt->ltd_index, err); + if (!rc) + rc = err; + } else { + curspace += oqctl->qc_dqblk.dqb_curspace; + curinodes += oqctl->qc_dqblk.dqb_curinodes; + } + } + oqctl->qc_dqblk.dqb_curspace = curspace; + oqctl->qc_dqblk.dqb_curinodes = curinodes; + + RETURN(rc); +} + +static int lmv_merge_attr(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, + ldlm_blocking_callback cb_blocking) +{ + int rc; + int i; + + if (!lmv_dir_striped(lsm)) + return 0; + + rc = lmv_revalidate_slaves(exp, lsm, cb_blocking, 0); + if (rc < 0) + return rc; + + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + struct inode *inode = lsm->lsm_md_oinfo[i].lmo_root; + + if (!inode) + continue; + + CDEBUG(D_INFO, + "" DFID " size %llu, blocks %llu nlink %u, atime %lld ctime %lld, mtime %lld.\n", + PFID(&lsm->lsm_md_oinfo[i].lmo_fid), + i_size_read(inode), (unsigned long long)inode->i_blocks, + inode->i_nlink, (s64)inode->i_atime.tv_sec, + (s64)inode->i_ctime.tv_sec, (s64)inode->i_mtime.tv_sec); + + /* for slave stripe, it needs to subtract nlink for . and .. 
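(each slave stripe counts its own . and .. entries, which must not be double-counted in the merged nlink) 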
*/ + if (i != 0) + attr->cat_nlink += inode->i_nlink - 2; + else + attr->cat_nlink = inode->i_nlink; + + attr->cat_size += i_size_read(inode); + attr->cat_blocks += inode->i_blocks; + + if (attr->cat_atime < inode->i_atime.tv_sec) + attr->cat_atime = inode->i_atime.tv_sec; + + if (attr->cat_ctime < inode->i_ctime.tv_sec) + attr->cat_ctime = inode->i_ctime.tv_sec; + + if (attr->cat_mtime < inode->i_mtime.tv_sec) + attr->cat_mtime = inode->i_mtime.tv_sec; + } + return 0; +} + +static const struct obd_ops lmv_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = lmv_setup, + .o_cleanup = lmv_cleanup, + .o_precleanup = lmv_precleanup, + .o_process_config = lmv_process_config, + .o_connect = lmv_connect, + .o_disconnect = lmv_disconnect, + .o_statfs = lmv_statfs, + .o_get_info = lmv_get_info, + .o_set_info_async = lmv_set_info_async, + .o_notify = lmv_notify, + .o_get_uuid = lmv_get_uuid, + .o_fid_alloc = lmv_fid_alloc, + .o_iocontrol = lmv_iocontrol, + .o_quotactl = lmv_quotactl +}; + +static const struct md_ops lmv_md_ops = { + .m_get_root = lmv_get_root, + .m_null_inode = lmv_null_inode, + .m_close = lmv_close, + .m_create = lmv_create, + .m_enqueue = lmv_enqueue, + .m_getattr = lmv_getattr, + .m_getxattr = lmv_getxattr, + .m_getattr_name = lmv_getattr_name, + .m_intent_lock = lmv_intent_lock, + .m_link = lmv_link, + .m_rename = lmv_rename, + .m_setattr = lmv_setattr, + .m_setxattr = lmv_setxattr, + .m_fsync = lmv_fsync, + .m_file_resync = lmv_file_resync, + .m_read_page = lmv_read_page, + .m_unlink = lmv_unlink, + .m_init_ea_size = lmv_init_ea_size, + .m_cancel_unused = lmv_cancel_unused, + .m_set_lock_data = lmv_set_lock_data, + .m_lock_match = lmv_lock_match, + .m_get_lustre_md = lmv_get_lustre_md, + .m_free_lustre_md = lmv_free_lustre_md, + .m_merge_attr = lmv_merge_attr, + .m_set_open_replay_data = lmv_set_open_replay_data, + .m_clear_open_replay_data = lmv_clear_open_replay_data, + .m_intent_getattr_async = lmv_intent_getattr_async, + .m_revalidate_lock = lmv_revalidate_lock, + .m_get_fid_from_lsm = lmv_get_fid_from_lsm, + .m_unpackmd = lmv_unpackmd, + .m_rmfid = lmv_rmfid, +}; + +static int __init lmv_init(void) +{ + return class_register_type(&lmv_obd_ops, &lmv_md_ops, true, + LUSTRE_LMV_NAME, NULL); +} + +static void __exit lmv_exit(void) +{ + class_unregister_type(LUSTRE_LMV_NAME); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Logical Metadata Volume"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(lmv_init); +module_exit(lmv_exit); diff --git a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c new file mode 100644 index 0000000000000..c2a5db2f9daf1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c @@ -0,0 +1,322 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +#include "lmv_internal.h" + +static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", obd->u.lmv.lmv_mdt_count); +} +LUSTRE_RO_ATTR(numobd); + +static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + obd->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count); +} +LUSTRE_RO_ATTR(activeobd); + +static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%s\n", + obd->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_uuid.uuid); +} +LUSTRE_RO_ATTR(desc_uuid); + +static ssize_t qos_maxage_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + obd->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage); +} + +static ssize_t qos_maxage_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + obd->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage = val; + + return count; +} +LUSTRE_RW_ATTR(qos_maxage); + +static ssize_t qos_prio_free_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u%%\n", + (obd->u.lmv.lmv_qos.lq_prio_free * 100 + 255) >> 8); +} + +static ssize_t qos_prio_free_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lmv_obd *lmv = &obd->u.lmv; + char buf[6], *tmp; + unsigned int val; + int rc; + + /* "100%\n\0" should be largest string */ + if (count >= sizeof(buf)) + return -ERANGE; + + strncpy(buf, buffer, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + tmp = strchr(buf, '%'); + if (tmp) + *tmp = '\0'; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + + if (val > 100) + return -EINVAL; + + lmv->lmv_qos.lq_prio_free = (val << 8) / 100; + set_bit(LQ_DIRTY, &lmv->lmv_qos.lq_flags); + set_bit(LQ_RESET, &lmv->lmv_qos.lq_flags); + + return count; +} +LUSTRE_RW_ATTR(qos_prio_free); + +static ssize_t qos_threshold_rr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u%%\n", + (obd->u.lmv.lmv_qos.lq_threshold_rr * 100 + + (QOS_THRESHOLD_MAX - 1)) / 
QOS_THRESHOLD_MAX); +} + +static ssize_t qos_threshold_rr_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lmv_obd *lmv = &obd->u.lmv; + char buf[6], *tmp; + unsigned int val; + int rc; + + /* "100%\n\0" should be largest string */ + if (count >= sizeof(buf)) + return -ERANGE; + + strncpy(buf, buffer, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + tmp = strchr(buf, '%'); + if (tmp) + *tmp = '\0'; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + + if (val > 100) + return -EINVAL; + + lmv->lmv_qos.lq_threshold_rr = (val * QOS_THRESHOLD_MAX) / 100; + set_bit(LQ_DIRTY, &lmv->lmv_qos.lq_flags); + + return count; +} +LUSTRE_RW_ATTR(qos_threshold_rr); + +#ifdef CONFIG_PROC_FS +static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_device *obd = p->private; + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt; + + while (*pos < lmv->lmv_mdt_descs.ltd_tgts_size) { + tgt = lmv_tgt(lmv, (__u32)*pos); + if (tgt) + return tgt; + + ++*pos; + } + + return NULL; +} + +static void lmv_tgt_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_device *obd = p->private; + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt; + + ++*pos; + while (*pos < lmv->lmv_mdt_descs.ltd_tgts_size) { + tgt = lmv_tgt(lmv, (__u32)*pos); + if (tgt) + return tgt; + + ++*pos; + } + + return NULL; +} + +static int lmv_tgt_seq_show(struct seq_file *p, void *v) +{ + struct lmv_tgt_desc *tgt = v; + + if (!tgt) + return 0; + + seq_printf(p, "%u: %s %sACTIVE\n", + tgt->ltd_index, tgt->ltd_uuid.uuid, + tgt->ltd_active ? "" : "IN"); + return 0; +} + +static const struct seq_operations lmv_tgt_sops = { + .start = lmv_tgt_seq_start, + .stop = lmv_tgt_seq_stop, + .next = lmv_tgt_seq_next, + .show = lmv_tgt_seq_show, +}; + +static int lmv_target_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lmv_tgt_sops); + if (rc) + return rc; + + seq = file->private_data; + seq->private = pde_data(inode); + return 0; +} + +static const struct proc_ops lmv_proc_target_fops = { + PROC_OWNER(THIS_MODULE) + .proc_open = lmv_target_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +static struct attribute *lmv_attrs[] = { + &lustre_attr_activeobd.attr, + &lustre_attr_desc_uuid.attr, + &lustre_attr_numobd.attr, + &lustre_attr_qos_maxage.attr, + &lustre_attr_qos_prio_free.attr, + &lustre_attr_qos_threshold_rr.attr, + NULL, +}; + +KOBJ_ATTRIBUTE_GROUPS(lmv); /* creates lmv_groups */ + +int lmv_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_ktype.default_groups = KOBJ_ATTR_GROUPS(lmv); + rc = lprocfs_obd_setup(obd, true); + if (rc) + goto out_failed; +#ifdef CONFIG_PROC_FS + rc = lprocfs_alloc_md_stats(obd, 0); + if (rc) { + lprocfs_obd_cleanup(obd); + goto out_failed; + } + + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lmv_proc_target_fops, obd); + if (rc) { + lprocfs_free_md_stats(obd); + lprocfs_obd_cleanup(obd); + CWARN("%s: error adding LMV target_obd file: rc = %d\n", + obd->obd_name, rc); + rc = 0; + } +#endif /* CONFIG_PROC_FS */ +out_failed: + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/lov/Makefile b/drivers/staging/lustrefsx/lustre/lov/Makefile new file mode 100644 index 
0000000000000..dae11b1647cbe --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTREFSX_FS) += lov.o + +lov-y := lov_dev.o lov_ea.o lov_io.o lov_lock.o lov_merge.o lov_obd.o +lov-y += lov_object.o lov_offset.o lov_pack.o lov_page.o lov_pool.o +lov-y += lov_request.o lovsub_dev.o lovsub_object.o +lov-y += lproc_lov.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h new file mode 100644 index 0000000000000..1f5669800d62f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h @@ -0,0 +1,815 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Internal interfaces of LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#ifndef LOV_CL_INTERNAL_H +#define LOV_CL_INTERNAL_H + +#include +#include +#include +#include "lov_internal.h" + +/** \defgroup lov lov + * Logical object volume layer. This layer implements data striping (raid0). + * + * At the lov layer top-entity (object, page, lock, io) is connected to one or + * more sub-entities: top-object, representing a file is connected to a set of + * sub-objects, each representing a stripe, file-level top-lock is connected + * to a set of per-stripe sub-locks, top-page is connected to a (single) + * sub-page, and a top-level IO is connected to a set of (potentially + * concurrent) sub-IO's. + * + * Sub-object, sub-page, and sub-io have well-defined top-object and top-page + * respectively, while a single sub-lock can be part of multiple top-locks. + * + * Reference counting models are different for different types of entities: + * + * - top-object keeps a reference to its sub-objects, and destroys them + * when it is destroyed. + * + * - top-page keeps a reference to its sub-page, and destroys it when it + * is destroyed. + * + * - IO's are not reference counted. + * + * To implement a connection between top and sub entities, lov layer is split + * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both + * implementing full set of cl-interfaces. For example, top-object has vvp and + * lov layers, and it's sub-object has lovsub and osc layers. lovsub layer is + * used to track child-parent relationship. 
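+ * + * For example, a file striped over three OSTs is represented by one top-level + * lov_object connected to three lovsub_object stripes, and a file-level lock + * is backed by per-stripe sub-locks on those stripes.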
+ * + * @{ + */ + +struct lovsub_device; +struct lovsub_object; + +enum lov_device_flags { + LOV_DEV_INITIALIZED = BIT(0), +}; + +/* + * Upper half. + */ + +/* Data-on-MDT array item in lov_device::ld_md_tgts[] */ +struct lovdom_device { + struct cl_device *ldm_mdc; + int ldm_idx; +}; + +struct lov_device { + /* + * XXX Locking of lov-private data is missing. + */ + struct cl_device ld_cl; + struct lov_obd *ld_lov; + /** size of lov_device::ld_target[] array */ + __u32 ld_target_nr; + struct lovsub_device **ld_target; + __u32 ld_flags; + + /* Data-on-MDT devices */ + __u32 ld_md_tgts_nr; + struct lovdom_device *ld_md_tgts; + struct obd_device *ld_lmv; + /* LU site for subdevices */ + struct lu_site ld_site; +}; + +/** + * Layout type. + */ +enum lov_layout_type { + LLT_EMPTY, /** empty file without body (mknod + truncate) */ + LLT_RELEASED, /** file with no objects (data in HSM) */ + LLT_COMP, /** support composite layout */ + LLT_FOREIGN, /** foreign layout */ + LLT_NR +}; + +static inline char *llt2str(enum lov_layout_type llt) +{ + switch (llt) { + case LLT_EMPTY: + return "EMPTY"; + case LLT_RELEASED: + return "RELEASED"; + case LLT_COMP: + return "COMPOSITE"; + case LLT_FOREIGN: + return "FOREIGN"; + case LLT_NR: + LBUG(); + } + LBUG(); + return ""; +} + +/** + * Return lov_layout_entry_type associated with a given composite layout + * entry. + */ +static inline __u32 lov_entry_type(struct lov_stripe_md_entry *lsme) +{ + if ((lov_pattern(lsme->lsme_pattern) & LOV_PATTERN_RAID0) || + (lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_MDT) || + (lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_FOREIGN)) + return lov_pattern(lsme->lsme_pattern & + ~LOV_PATTERN_OVERSTRIPING); + return 0; +} + +struct lov_layout_entry; +struct lov_object; +struct lov_lock_sub; + +struct lov_comp_layout_entry_ops { + int (*lco_init)(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, unsigned int index, + const struct cl_object_conf *conf, + struct lov_layout_entry *lle); + void (*lco_fini)(const struct lu_env *env, + struct lov_layout_entry *lle); + int (*lco_getattr)(const struct lu_env *env, struct lov_object *obj, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **attr); +}; + +struct lov_layout_raid0 { + unsigned lo_nr; + /** + * When this is true, lov_object::lo_attr contains + * valid up to date attributes for a top-level + * object. This field is reset to 0 when attributes of + * any sub-object change. + */ + bool lo_attr_valid; + /** + * Array of sub-objects. Allocated when top-object is + * created (lov_init_raid0()). + * + * Top-object is a strict master of its sub-objects: + * it is created before them, and outlives its + * children (this later is necessary so that basic + * functions like cl_object_top() always + * work). Top-object keeps a reference on every + * sub-object. + * + * When top-object is destroyed (lov_delete_raid0()) + * it releases its reference to a sub-object and waits + * until the latter is finally destroyed. + */ + struct lovsub_object **lo_sub; + /** + * protect lo_sub + */ + spinlock_t lo_sub_lock; + /** + * Cached object attribute, built from sub-object + * attributes. + */ + struct cl_attr lo_attr; +}; + +struct lov_layout_dom { + /* keep this always at first place so DOM layout entry + * can be addressed also as RAID0 after initialization. 
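+ * This lets generic code access the entry through lle_raid0 regardless of + * whether it is a DOM or plain RAID0 entry.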
+ */ + struct lov_layout_raid0 lo_dom_r0; + struct lovsub_object *lo_dom; + struct lov_oinfo *lo_loi; +}; + +struct lov_layout_entry { + __u32 lle_type; + unsigned int lle_valid:1; + unsigned int lle_preference; + struct lu_extent *lle_extent; + struct lov_stripe_md_entry *lle_lsme; + struct lov_comp_layout_entry_ops *lle_comp_ops; + union { + struct lov_layout_raid0 lle_raid0; + struct lov_layout_dom lle_dom; + }; +}; + +struct lov_mirror_entry { + unsigned short lre_mirror_id; + unsigned short lre_stale:1, /* set if any components is stale */ + lre_valid:1, /* set if at least one of components + * in this mirror is valid */ + lre_foreign:1; /* set if it is a foreign component */ + int lre_preference; /* overall preference of this mirror */ + + unsigned short lre_start; /* index to lo_entries, start index of + * this mirror */ + unsigned short lre_end; /* end index of this mirror */ +}; + +enum lov_object_flags { + /* Layout is invalid, set when layout lock is lost */ + LO_LAYOUT_INVALID = 0x1, +}; + +/** + * lov-specific file state. + * + * lov object has particular layout type, determining how top-object is built + * on top of sub-objects. Layout type can change dynamically. When this + * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode, + * all state pertaining to the old layout type is destroyed, and new state is + * constructed. All object methods take said semaphore in the shared mode, + * providing serialization against transition between layout types. + * + * To avoid multiple `if' or `switch' statements, selecting behavior for the + * current layout type, object methods perform double-dispatch, invoking + * function corresponding to the current layout type. + */ +struct lov_object { + struct cl_object lo_cl; + /** + * Serializes object operations with transitions between layout types. + * + * This semaphore is taken in shared mode by all object methods, and + * is taken in exclusive mode when object type is changed. + * + * \see lov_object::lo_type + */ + struct rw_semaphore lo_type_guard; + /** + * Type of an object. Protected by lov_object::lo_type_guard. + */ + enum lov_layout_type lo_type; + /** + * Object flags. + */ + unsigned long lo_obj_flags; + /** + * How many IOs are on going on this object. Layout can be changed + * only if there is no active IO. + */ + atomic_t lo_active_ios; + /** + * Waitq - wait for no one else is using lo_lsm + */ + wait_queue_head_t lo_waitq; + /** + * Layout metadata. NULL if empty layout. + */ + struct lov_stripe_md *lo_lsm; + + union lov_layout_state { + struct lov_layout_state_empty { + } empty; + struct lov_layout_state_released { + } released; + struct lov_layout_composite { + /** + * flags of lov_comp_md_v1::lcm_flags. Mainly used + * by FLR. + */ + uint32_t lo_flags; + /** + * For FLR: index of preferred mirror to read. + * Preferred mirror is initialized by the preferred + * bit of lsme. It can be changed when the preferred + * is inaccessible. + * In order to make lov_lsm_entry() return the same + * mirror in the same IO context, it's only possible + * to change the preferred mirror when the + * lo_active_ios reaches zero. + */ + int lo_preferred_mirror; + /** + * For FLR: Number of (valid) mirrors. + */ + unsigned lo_mirror_count; + struct lov_mirror_entry *lo_mirrors; + /** + * Current entry count of lo_entries, include + * invalid entries. 
+ */ + unsigned int lo_entry_count; + struct lov_layout_entry *lo_entries; + } composite; + } u; + /** + * Thread that acquired lov_object::lo_type_guard in an exclusive + * mode. + */ + struct task_struct *lo_owner; +}; + +static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_type == LLT_COMP); + LASSERTF(i < lov->u.composite.lo_entry_count, + "entry %d entry_count %d\n", i, + lov->u.composite.lo_entry_count); + + return &lov->u.composite.lo_entries[i].lle_raid0; +} + +static inline struct lov_stripe_md_entry *lov_lse(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_lsm != NULL); + LASSERT(i < lov->lo_lsm->lsm_entry_count); + + return lov->lo_lsm->lsm_entries[i]; +} + +static inline unsigned lov_flr_state(const struct lov_object *lov) +{ + if (lov->lo_type != LLT_COMP) + return LCM_FL_NONE; + + return lov->u.composite.lo_flags & LCM_FL_FLR_MASK; +} + +static inline bool lov_is_flr(const struct lov_object *lov) +{ + return lov_flr_state(lov) != LCM_FL_NONE; +} + +static inline struct lov_layout_entry *lov_entry(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_type == LLT_COMP); + LASSERTF(i < lov->u.composite.lo_entry_count, + "entry %d entry_count %d\n", i, + lov->u.composite.lo_entry_count); + + return &lov->u.composite.lo_entries[i]; +} + +#define lov_for_layout_entry(lov, entry, start, end) \ + if (lov->u.composite.lo_entries && \ + lov->u.composite.lo_entry_count > 0) \ + for (entry = lov_entry(lov, start); \ + entry <= lov_entry(lov, end); entry++) + +#define lov_foreach_layout_entry(lov, entry) \ + lov_for_layout_entry(lov, entry, 0, \ + (lov)->u.composite.lo_entry_count - 1) + +#define lov_foreach_mirror_layout_entry(lov, entry, lre) \ + lov_for_layout_entry(lov, entry, (lre)->lre_start, (lre)->lre_end) + +static inline struct lov_mirror_entry * +lov_mirror_entry(struct lov_object *lov, int i) +{ + LASSERT(i < lov->u.composite.lo_mirror_count); + return &lov->u.composite.lo_mirrors[i]; +} + +#define lov_foreach_mirror_entry(lov, lre) \ + for (lre = lov_mirror_entry(lov, 0); \ + lre <= lov_mirror_entry(lov, \ + lov->u.composite.lo_mirror_count - 1); \ + lre++) + +static inline unsigned +lov_layout_entry_index(struct lov_object *lov, struct lov_layout_entry *entry) +{ + struct lov_layout_entry *first = &lov->u.composite.lo_entries[0]; + unsigned index = (unsigned)(entry - first); + + LASSERT(entry >= first); + LASSERT(index < lov->u.composite.lo_entry_count); + + return index; +} + +/** + * State lov_lock keeps for each sub-lock. + */ +struct lov_lock_sub { + /** sub-lock itself */ + struct cl_lock sub_lock; + /** Set if the sublock has ever been enqueued, meaning it may + * hold resources of underlying layers */ + unsigned int sub_is_enqueued:1, + sub_initialized:1; + int sub_index; +}; + +/** + * lov-specific lock state. + */ +struct lov_lock { + struct cl_lock_slice lls_cl; + /** Number of sub-locks in this lock */ + int lls_nr; + /** sublock array */ + struct lov_lock_sub lls_sub[0]; +}; + +struct lov_page { + struct cl_page_slice lps_cl; +}; + +/* + * Bottom half. + */ + +struct lovsub_device { + struct cl_device acid_cl; + struct cl_device *acid_next; +}; + +struct lovsub_object { + struct cl_object_header lso_header; + struct cl_object lso_cl; + struct lov_object *lso_super; + int lso_index; +}; + +/** + * Describe the environment settings for sublocks. 
+ */ +struct lov_sublock_env { + const struct lu_env *lse_env; + struct cl_io *lse_io; +}; + +struct lov_thread_info { + struct cl_object_conf lti_stripe_conf; + struct lu_fid lti_fid; + struct ost_lvb lti_lvb; + struct cl_2queue lti_cl2q; + struct cl_page_list lti_plist; +}; + +/** + * State that lov_io maintains for every sub-io. + */ +struct lov_io_sub { + /** + * Linkage into a list (hanging off lov_io::lis_subios) + */ + struct list_head sub_list; + /** + * Linkage into a list (hanging off lov_io::lis_active) of all + * sub-io's active for the current IO iteration. + */ + struct list_head sub_linkage; + unsigned int sub_subio_index; + /** + * sub-io for a stripe. Ideally sub-io's can be stopped and resumed + * independently, with lov acting as a scheduler to maximize overall + * throughput. + */ + struct cl_io sub_io; + /** + * environment, in which sub-io executes. + */ + struct lu_env *sub_env; + /** + * environment's refcheck. + * + * \see cl_env_get() + */ + __u16 sub_refcheck; +}; + +/** + * IO state private for LOV. + */ +#define LIS_CACHE_ENTRY_NONE -ENOENT +struct lov_io { + /** super-class */ + struct cl_io_slice lis_cl; + + /** + * FLR: index to lo_mirrors. Valid only if lov_is_flr() returns true. + * + * The mirror index of this io. Preserved over cl_io_init() + * if io->ci_ndelay_tried is greater than zero. + */ + int lis_mirror_index; + /** + * FLR: the layout gen when lis_mirror_index was cached. The + * mirror index makes sense only when the layout gen doesn't + * change. + */ + int lis_mirror_layout_gen; + + /** + * fields below this will be initialized in lov_io_init(). + */ + unsigned lis_preserved; + + /** + * Pointer to the object slice. This is a duplicate of + * lov_io::lis_cl::cis_object. + */ + struct lov_object *lis_object; + /** + * Original end-of-io position for this IO, set by the upper layer as + * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this, + * changes pos and count to fit IO into a single stripe and uses saved + * value to determine when IO iterations have to stop. + * + * This is used only for CIT_READ and CIT_WRITE io's. + */ + loff_t lis_io_endpos; + + /** + * Record the stripe index before the truncate size, used for setting OST + * object size for truncate. LU-14128. lis_trunc_stripe_index[i] refers to + * lov_object.u.composite.lo_entries[i]. + */ + int *lis_trunc_stripe_index; + + /** + * starting position within a file, for the current io loop iteration + * (stripe), used by ci_io_loop(). + */ + loff_t lis_pos; + /** + * end position with in a file, for the current stripe io. This is + * exclusive (i.e., next offset after last byte affected by io). + */ + loff_t lis_endpos; + int lis_nr_subios; + + /** + * the index of ls_single_subio in ls_subios array + */ + int lis_single_subio_index; + struct lov_io_sub lis_single_subio; + + /** + * List of active sub-io's. Active sub-io's are under the range + * of [lis_pos, lis_endpos). + */ + struct list_head lis_active; + /** + * All sub-io's created in this lov_io. 
+ */ + struct list_head lis_subios; + /* Cached results from stripe & offset calculations for page init */ + int lis_cached_entry; + int lis_cached_stripe; + loff_t lis_cached_off; + loff_t lis_cached_suboff; + struct lov_io_sub *lis_cached_sub; +}; + +struct lov_session { + struct lov_io ls_io; + struct lov_sublock_env ls_subenv; +}; + +extern struct lu_device_type lov_device_type; +extern struct lu_device_type lovsub_device_type; + +extern struct lu_context_key lov_key; +extern struct lu_context_key lov_session_key; + +extern struct kmem_cache *lov_lock_kmem; +extern struct kmem_cache *lov_object_kmem; +extern struct kmem_cache *lov_thread_kmem; +extern struct kmem_cache *lov_session_kmem; + +extern struct kmem_cache *lovsub_object_kmem; + +int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_lock_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_io_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lov_io_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); + +struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio, + int stripe); + +int lov_page_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); +int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); +int lov_page_init_foreign(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); +struct lu_object *lov_object_alloc (const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +int lov_page_stripe(const struct cl_page *page); +bool lov_page_is_empty(const struct cl_page *page); +int lov_lsm_entry(const struct lov_stripe_md *lsm, __u64 offset); +int lov_io_layout_at(struct lov_io *lio, __u64 offset); + +#define lov_foreach_target(lov, var) \ + for (var = 0; var < lov_targets_nr(lov); ++var) + +static inline struct lu_extent *lov_io_extent(struct lov_io *io, int i) +{ + return &lov_lse(io->lis_object, i)->lsme_extent; +} + +/** + * For layout entries within @ext. + */ +#define lov_foreach_io_layout(ind, lio, ext) \ + for (ind = lov_io_layout_at(lio, (ext)->e_start); \ + ind >= 0 && \ + lu_extent_is_overlapped(lov_io_extent(lio, ind), ext); \ + ind = lov_io_layout_at(lio, lov_io_extent(lio, ind)->e_end)) + +/***************************************************************************** + * + * Type conversions. + * + * Accessors. 
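lov_foreach_io_layout() above advances through the layout by looking up the component that covers the current offset and then jumping to that component's e_end. A self-contained userspace sketch of the same walk, with an invented toy_extent table and toy_layout_at helper standing in for the real lu_extent/lov_io_layout_at machinery:

#include <stdio.h>

struct toy_extent {
    unsigned long long e_start;
    unsigned long long e_end;    // exclusive
};

static const struct toy_extent layout[] = {
    { 0, 1024 }, { 1024, 4096 }, { 4096, 16384 },
};

static int toy_layout_at(unsigned long long offset)
{
    int i;

    for (i = 0; i < 3; i++)
        if (offset >= layout[i].e_start && offset < layout[i].e_end)
            return i;
    return -1;    // no component covers this offset
}

static int overlaps(const struct toy_extent *a, const struct toy_extent *b)
{
    return a->e_start < b->e_end && b->e_start < a->e_end;
}

int main(void)
{
    struct toy_extent io = { 512, 5000 };
    int ind;

    for (ind = toy_layout_at(io.e_start);
         ind >= 0 && overlaps(&layout[ind], &io);
         ind = toy_layout_at(layout[ind].e_end))
        printf("io touches component %d [%llu, %llu)\n",
               ind, layout[ind].e_start, layout[ind].e_end);
    return 0;
}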
+ * + */ + +static inline struct lov_session *lov_env_session(const struct lu_env *env) +{ + struct lov_session *ses; + + ses = lu_context_key_get(env->le_ses, &lov_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct lov_io *lov_env_io(const struct lu_env *env) +{ + return &lov_env_session(env)->ls_io; +} + +static inline int lov_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lov_device_type; +} + +static inline int lovsub_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lovsub_device_type; +} + +static inline struct lu_device *lov2lu_dev(struct lov_device *lov) +{ + return &lov->ld_cl.cd_lu_dev; +} + +static inline struct lov_device *lu2lov_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lov_device_type); + return container_of(d, struct lov_device, ld_cl.cd_lu_dev); +} + +static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub) +{ + return &lovsub->acid_cl; +} + +static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub) +{ + return &lovsub2cl_dev(lovsub)->cd_lu_dev; +} + +static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lovsub_device_type); + return container_of(d, struct lovsub_device, acid_cl.cd_lu_dev); +} + +static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d) +{ + LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type); + return container_of(d, struct lovsub_device, acid_cl); +} + +static inline struct lu_object *lov2lu(struct lov_object *lov) +{ + return &lov->lo_cl.co_lu; +} + +static inline struct cl_object *lov2cl(struct lov_object *lov) +{ + return &lov->lo_cl; +} + +static inline struct lov_object *lu2lov(const struct lu_object *obj) +{ + LINVRNT(lov_is_object(obj)); + return container_of(obj, struct lov_object, lo_cl.co_lu); +} + +static inline struct lov_object *cl2lov(const struct cl_object *obj) +{ + LINVRNT(lov_is_object(&obj->co_lu)); + return container_of(obj, struct lov_object, lo_cl); +} + +static inline struct lu_object *lovsub2lu(struct lovsub_object *los) +{ + return &los->lso_cl.co_lu; +} + +static inline struct cl_object *lovsub2cl(struct lovsub_object *los) +{ + return &los->lso_cl; +} + +static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj) +{ + LINVRNT(lovsub_is_object(&obj->co_lu)); + return container_of(obj, struct lovsub_object, lso_cl); +} + +static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj) +{ + LINVRNT(lovsub_is_object(obj)); + return container_of(obj, struct lovsub_object, lso_cl.co_lu); +} + +static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cls_obj->co_lu)); + return container_of(slice, struct lov_lock, lls_cl); +} + +static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cpl_obj->co_lu)); + return container_of(slice, struct lov_page, lps_cl); +} + +static inline struct lov_io *cl2lov_io(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio; + + lio = container_of(ios, struct lov_io, lis_cl); + LASSERT(lio == lov_env_io(env)); + return lio; +} + +static inline int lov_targets_nr(const struct lov_device *lov) +{ + return lov->ld_lov->desc.ld_tgt_count; +} + +static inline struct lov_thread_info *lov_env_info(const struct lu_env *env) +{ + struct lov_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &lov_key); + LASSERT(info != NULL); + return 
info; +} + +/* lov_pack.c */ +int lov_getstripe(const struct lu_env *env, struct lov_object *obj, + struct lov_stripe_md *lsm, struct lov_user_md __user *lump, + size_t size); + +/** @} lov */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_dev.c b/drivers/staging/lustrefsx/lustre/lov/lov_dev.c new file mode 100644 index 0000000000000..e83ee157fd7ff --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_dev.c @@ -0,0 +1,592 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_device and cl_device_type for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +/* class_name2obd() */ +#include + +#include "lov_cl_internal.h" + +struct kmem_cache *lov_lock_kmem; +struct kmem_cache *lov_object_kmem; +struct kmem_cache *lov_thread_kmem; +struct kmem_cache *lov_session_kmem; + +struct kmem_cache *lovsub_object_kmem; + +struct lu_kmem_descr lov_caches[] = { + { + .ckd_cache = &lov_lock_kmem, + .ckd_name = "lov_lock_kmem", + .ckd_size = sizeof(struct lov_lock) + }, + { + .ckd_cache = &lov_object_kmem, + .ckd_name = "lov_object_kmem", + .ckd_size = sizeof(struct lov_object) + }, + { + .ckd_cache = &lov_thread_kmem, + .ckd_name = "lov_thread_kmem", + .ckd_size = sizeof(struct lov_thread_info) + }, + { + .ckd_cache = &lov_session_kmem, + .ckd_name = "lov_session_kmem", + .ckd_size = sizeof(struct lov_session) + }, + { + .ckd_cache = &lovsub_object_kmem, + .ckd_name = "lovsub_object_kmem", + .ckd_size = sizeof(struct lovsub_object) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Lov device and device type functions. 
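lov_caches[] above is a NULL-terminated descriptor table that can be walked once at module init to create every slab cache and once at cleanup to destroy them. A userspace analogue of that table-driven setup; toy_cache and toy_kmem_descr are made-up stand-ins, not the lu_object API:

#include <stdio.h>
#include <stdlib.h>

struct toy_cache {
    const char *name;
    size_t obj_size;
};

struct toy_kmem_descr {
    struct toy_cache **ckd_cache;    // where to store the created cache
    const char *ckd_name;
    size_t ckd_size;
};

static struct toy_cache *lock_cache, *object_cache;

static struct toy_kmem_descr caches[] = {
    { &lock_cache,   "toy_lock_cache",   64  },
    { &object_cache, "toy_object_cache", 128 },
    { NULL }    // sentinel, like the .ckd_cache = NULL entry above
};

int main(void)
{
    struct toy_kmem_descr *d;

    for (d = caches; d->ckd_cache != NULL; d++) {
        *d->ckd_cache = malloc(sizeof(struct toy_cache));
        if (*d->ckd_cache == NULL)
            return 1;
        (*d->ckd_cache)->name = d->ckd_name;
        (*d->ckd_cache)->obj_size = d->ckd_size;
        printf("created %s (%zu bytes per object)\n",
               d->ckd_name, d->ckd_size);
    }
    for (d = caches; d->ckd_cache != NULL; d++)
        free(*d->ckd_cache);
    return 0;
}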
+ * + */ + +static void *lov_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, GFP_NOFS); + if (!info) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_thread_info *info = data; + OBD_SLAB_FREE_PTR(info, lov_thread_kmem); +} + +struct lu_context_key lov_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = lov_key_init, + .lct_fini = lov_key_fini +}; + +static void *lov_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_session *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, GFP_NOFS); + if (!info) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_session *info = data; + + OBD_SLAB_FREE_PTR(info, lov_session_kmem); +} + +struct lu_context_key lov_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = lov_session_key_init, + .lct_fini = lov_session_key_fini +}; + +/* type constructor/destructor: lov_type_{init,fini,start,stop}() */ +LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key); + + +static int lov_mdc_dev_init(const struct lu_env *env, struct lov_device *ld, + struct lu_device *mdc_dev, __u32 idx, __u32 nr) +{ + struct cl_device *cl; + + ENTRY; + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, + mdc_dev); + if (IS_ERR(cl)) + RETURN(PTR_ERR(cl)); + + ld->ld_md_tgts[nr].ldm_mdc = cl; + ld->ld_md_tgts[nr].ldm_idx = idx; + RETURN(0); +} + +static struct lu_device *lov_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct lov_device *ld = lu2lov_dev(d); + int i; + + LASSERT(ld->ld_lov != NULL); + + if (ld->ld_lmv) { + class_decref(ld->ld_lmv, "lov", d); + ld->ld_lmv = NULL; + } + + if (ld->ld_md_tgts) { + for (i = 0; i < ld->ld_md_tgts_nr; i++) { + if (!ld->ld_md_tgts[i].ldm_mdc) + continue; + + cl_stack_fini(env, ld->ld_md_tgts[i].ldm_mdc); + ld->ld_md_tgts[i].ldm_mdc = NULL; + ld->ld_lov->lov_mdc_tgts[i].lmtd_mdc = NULL; + } + } + + if (ld->ld_target) { + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + + lsd = ld->ld_target[i]; + if (lsd) { + cl_stack_fini(env, lovsub2cl_dev(lsd)); + ld->ld_target[i] = NULL; + } + } + } + RETURN(NULL); +} + +static int lov_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lov_device *ld = lu2lov_dev(d); + int i; + int rc = 0; + + /* check all added already MDC subdevices and initialize them */ + for (i = 0; i < ld->ld_md_tgts_nr; i++) { + struct obd_device *mdc; + __u32 idx; + + mdc = ld->ld_lov->lov_mdc_tgts[i].lmtd_mdc; + idx = ld->ld_lov->lov_mdc_tgts[i].lmtd_index; + + if (!mdc) + continue; + + rc = lov_mdc_dev_init(env, ld, mdc->obd_lu_dev, idx, i); + if (rc) { + CERROR("%s: failed to add MDC %s as target: rc = %d\n", + d->ld_obd->obd_name, + obd_uuid2str(&mdc->obd_uuid), rc); + GOTO(out_err, rc); + } + } + + if (!ld->ld_target) + RETURN(0); + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + struct cl_device *cl; + struct lov_tgt_desc *desc; + + desc = ld->ld_lov->lov_tgts[i]; + if (!desc) + continue; + + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, + desc->ltd_obd->obd_lu_dev); + if (IS_ERR(cl)) + GOTO(out_err, rc = PTR_ERR(cl)); + + lsd = cl2lovsub_dev(cl); + ld->ld_target[i] = lsd; + } + ld->ld_flags |= LOV_DEV_INITIALIZED; + RETURN(0); + 
+out_err: + lu_device_fini(d); + RETURN(rc); +} + +/* Free the lov specific data created for the back end lu_device. */ +static struct lu_device *lov_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lov_device *ld = lu2lov_dev(d); + const int nr = ld->ld_target_nr; + + lu_site_fini(&ld->ld_site); + + cl_device_fini(lu2cl_dev(d)); + if (ld->ld_target) { + OBD_FREE_PTR_ARRAY(ld->ld_target, nr); + ld->ld_target = NULL; + } + if (ld->ld_md_tgts) { + OBD_FREE_PTR_ARRAY(ld->ld_md_tgts, LOV_MDC_TGT_MAX); + ld->ld_md_tgts = NULL; + } + /* free array of MDCs */ + if (ld->ld_lov->lov_mdc_tgts) { + OBD_FREE_PTR_ARRAY(ld->ld_lov->lov_mdc_tgts, LOV_MDC_TGT_MAX); + ld->ld_lov->lov_mdc_tgts = NULL; + } + + OBD_FREE_PTR(ld); + return NULL; +} + +static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct lov_device *ld = lu2lov_dev(dev); + + ENTRY; + + if (ld->ld_target[index]) { + cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); + ld->ld_target[index] = NULL; + } + EXIT; +} + +static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) +{ + int result; + __u32 tgt_size; + __u32 sub_size; + + ENTRY; + result = 0; + tgt_size = dev->ld_lov->lov_tgt_size; + sub_size = dev->ld_target_nr; + if (sub_size < tgt_size) { + struct lovsub_device **newd; + const size_t sz = sizeof(newd[0]); + + OBD_ALLOC_PTR_ARRAY(newd, tgt_size); + if (newd) { + if (sub_size > 0) { + memcpy(newd, dev->ld_target, sub_size * sz); + OBD_FREE(dev->ld_target, sub_size * sz); + } + + dev->ld_target = newd; + dev->ld_target_nr = tgt_size; + } else { + result = -ENOMEM; + } + } + + RETURN(result); +} + +static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct obd_device *obd = dev->ld_obd; + struct lov_device *ld = lu2lov_dev(dev); + struct lov_tgt_desc *tgt; + struct lovsub_device *lsd; + struct cl_device *cl; + int rc; + + ENTRY; + + lov_tgts_getref(obd); + + tgt = obd->u.lov.lov_tgts[index]; + LASSERT(tgt != NULL); + LASSERT(tgt->ltd_obd != NULL); + + if (!tgt->ltd_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid)); + RETURN(-EINVAL); + } + + rc = lov_expand_targets(env, ld); + if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, + tgt->ltd_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + lsd = cl2lovsub_dev(cl); + ld->ld_target[index] = lsd; + } else { + CERROR("add failed (%d), deleting %s\n", rc, + obd_uuid2str(&tgt->ltd_uuid)); + lov_cl_del_target(env, dev, index); + rc = PTR_ERR(cl); + } + } + + lov_tgts_putref(obd); + + RETURN(rc); +} + +/** + * Add new MDC target device in LOV. + * + * This function is part of the configuration log processing. It adds new MDC + * device to the MDC device array indexed by their indexes. 
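lov_expand_targets() above grows the per-target pointer array by allocating the larger array, copying the old contents and freeing the old buffer. A standalone sketch of that grow-and-copy pattern with plain malloc/free in place of the OBD_ALLOC wrappers; grow_array is an invented helper name:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int grow_array(int **arrp, unsigned *curp, unsigned want)
{
    int *newd;

    if (want <= *curp)
        return 0;         // already big enough
    newd = calloc(want, sizeof(*newd));
    if (newd == NULL)
        return -1;        // -ENOMEM in the kernel version
    if (*curp > 0) {
        memcpy(newd, *arrp, *curp * sizeof(*newd));
        free(*arrp);
    }
    *arrp = newd;
    *curp = want;
    return 0;
}

int main(void)
{
    int *targets = NULL;
    unsigned nr = 0;

    if (grow_array(&targets, &nr, 4) == 0 && grow_array(&targets, &nr, 8) == 0)
        printf("target array now holds %u slots\n", nr);
    free(targets);
    return 0;
}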
+ * + * \param[in] env execution environment + * \param[in] d LU device of LOV device + * \param[in] mdc MDC device to add + * \param[in] idx MDC device index + * + * \retval 0 if successful + * \retval negative value on error + */ +static int lov_add_mdc_target(const struct lu_env *env, struct lu_device *d, + struct obd_device *mdc, __u32 idx) +{ + struct lov_device *ld = lu2lov_dev(d); + struct obd_device *lov_obd = d->ld_obd; + struct obd_device *lmv_obd; + int next; + int rc = 0; + + ENTRY; + + LASSERT(mdc != NULL); + if (ld->ld_md_tgts_nr == LOV_MDC_TGT_MAX) { + /* + * If the maximum value of LOV_MDC_TGT_MAX will become too + * small then all MD target handling must be rewritten in LOD + * manner, check lod_add_device() and related functionality. + */ + CERROR("%s: cannot serve more than %d MDC devices\n", + lov_obd->obd_name, LOV_MDC_TGT_MAX); + RETURN(-ERANGE); + } + + /* + * grab FLD from lmv, do that here, when first MDC is added + * to be sure LMV is set up and can be found + */ + if (!ld->ld_lmv) { + next = 0; + while ((lmv_obd = class_devices_in_group(&lov_obd->obd_uuid, + &next)) != NULL) { + if ((strncmp(lmv_obd->obd_type->typ_name, + LUSTRE_LMV_NAME, + strlen(LUSTRE_LMV_NAME)) == 0)) + break; + } + if (!lmv_obd) { + CERROR("%s: cannot find LMV OBD by UUID (%s)\n", + lov_obd->obd_name, + obd_uuid2str(&lmv_obd->obd_uuid)); + RETURN(-ENODEV); + } + spin_lock(&lmv_obd->obd_dev_lock); + class_incref(lmv_obd, "lov", ld); + spin_unlock(&lmv_obd->obd_dev_lock); + ld->ld_lmv = lmv_obd; + } + + LASSERT(lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_mdc == + NULL); + + if (ld->ld_flags & LOV_DEV_INITIALIZED) { + rc = lov_mdc_dev_init(env, ld, mdc->obd_lu_dev, idx, + ld->ld_md_tgts_nr); + if (rc) { + CERROR("%s: failed to add MDC %s as target: rc = %d\n", + lov_obd->obd_name, obd_uuid2str(&mdc->obd_uuid), + rc); + RETURN(rc); + } + } + + lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_mdc = mdc; + lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_index = idx; + ld->ld_md_tgts_nr++; + + RETURN(rc); +} + +static int lov_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + struct obd_device *obd = d->ld_obd; + int cmd; + int rc; + int gen; + u32 index; + + lov_tgts_getref(obd); + + cmd = cfg->lcfg_command; + + rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen); + if (rc < 0) + GOTO(out, rc); + + switch (cmd) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + rc = lov_cl_add_target(env, d, index); + if (rc != 0) + lov_del_target(d->ld_obd, index, NULL, 0); + break; + case LCFG_LOV_DEL_OBD: + lov_cl_del_target(env, d, index); + break; + case LCFG_ADD_MDC: + { + struct obd_device *mdc; + struct obd_uuid tgt_uuid; + + /* + * modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID + * 2:0 3:1 4:lustre-MDT0000-mdc_UUID + */ + if (LUSTRE_CFG_BUFLEN(cfg, 1) > sizeof(tgt_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&tgt_uuid, lustre_cfg_buf(cfg, 1)); + + rc = kstrtou32(lustre_cfg_buf(cfg, 2), 10, &index); + if (rc) + GOTO(out, rc); + + mdc = class_find_client_obd(&tgt_uuid, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc) + GOTO(out, rc = -ENODEV); + rc = lov_add_mdc_target(env, d, mdc, index); + break; + } + } +out: + lov_tgts_putref(obd); + RETURN(rc); +} + +static const struct lu_device_operations lov_lu_ops = { + .ldo_object_alloc = lov_object_alloc, + .ldo_process_config = lov_process_config, +}; + +static struct lu_device *lov_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) 
+{ + struct lu_device *d; + struct lov_device *ld; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(ld); + if (!ld) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&ld->ld_cl, t); + d = lov2lu_dev(ld); + d->ld_ops = &lov_lu_ops; + + /* setup the LOV OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = lov_setup(obd, cfg); + if (rc) + GOTO(out, rc); + + /* Alloc MDC devices array */ + /* XXX: need dynamic allocation at some moment */ + OBD_ALLOC_PTR_ARRAY(ld->ld_md_tgts, LOV_MDC_TGT_MAX); + if (!ld->ld_md_tgts) + GOTO(out, rc = -ENOMEM); + + ld->ld_md_tgts_nr = 0; + + ld->ld_lov = &obd->u.lov; + OBD_ALLOC_PTR_ARRAY(ld->ld_lov->lov_mdc_tgts, LOV_MDC_TGT_MAX); + if (!ld->ld_lov->lov_mdc_tgts) + GOTO(out_md_tgts, rc = -ENOMEM); + + rc = lu_site_init(&ld->ld_site, d); + if (rc != 0) + GOTO(out_mdc_tgts, rc); + + rc = lu_site_init_finish(&ld->ld_site); + if (rc != 0) + GOTO(out_site, rc); + + RETURN(d); +out_site: + lu_site_fini(&ld->ld_site); +out_mdc_tgts: + OBD_FREE_PTR_ARRAY(ld->ld_lov->lov_mdc_tgts, LOV_MDC_TGT_MAX); + ld->ld_lov->lov_mdc_tgts = NULL; +out_md_tgts: + OBD_FREE_PTR_ARRAY(ld->ld_md_tgts, LOV_MDC_TGT_MAX); + ld->ld_md_tgts = NULL; +out: + OBD_FREE_PTR(ld); + + return ERR_PTR(rc); +} + +static const struct lu_device_type_operations lov_device_type_ops = { + .ldto_init = lov_type_init, + .ldto_fini = lov_type_fini, + + .ldto_start = lov_type_start, + .ldto_stop = lov_type_stop, + + .ldto_device_alloc = lov_device_alloc, + .ldto_device_free = lov_device_free, + + .ldto_device_init = lov_device_init, + .ldto_device_fini = lov_device_fini +}; + +struct lu_device_type lov_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOV_NAME, + .ldt_ops = &lov_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_ea.c b/drivers/staging/lustrefsx/lustre/lov/lov_ea.c new file mode 100644 index 0000000000000..beb0f63df28e9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_ea.c @@ -0,0 +1,716 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/lov/lov_ea.c + * + * Author: Wang Di + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include +#include + +#include +#include "lov_internal.h" + +static inline void +lu_extent_le_to_cpu(struct lu_extent *dst, const struct lu_extent *src) +{ + dst->e_start = le64_to_cpu(src->e_start); + dst->e_end = le64_to_cpu(src->e_end); +} + +/* + * Find minimum stripe maxbytes value. 
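lov_device_alloc() above relies on staged goto-based unwinding: each setup step gets a matching label, and a failure jumps to the label that releases exactly what has been set up so far, in reverse order. A compact standalone sketch of the idiom, with malloc/free placeholders for the md_tgts/mdc_tgts/site steps:

#include <stdio.h>
#include <stdlib.h>

static int setup_device(void)
{
    void *md_tgts, *mdc_tgts, *site;
    int rc;

    md_tgts = malloc(64);
    if (md_tgts == NULL) {
        rc = -1;
        goto out;
    }
    mdc_tgts = malloc(64);
    if (mdc_tgts == NULL) {
        rc = -1;
        goto out_md_tgts;
    }
    site = malloc(64);
    if (site == NULL) {
        rc = -1;
        goto out_mdc_tgts;
    }
    printf("device set up\n");
    return 0;    // success: the device keeps ownership of the buffers

out_mdc_tgts:
    free(mdc_tgts);
out_md_tgts:
    free(md_tgts);
out:
    return rc;
}

int main(void)
{
    return setup_device() ? 1 : 0;
}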
For inactive or + * reconnecting targets use LUSTRE_EXT3_STRIPE_MAXBYTES. + */ +static loff_t lov_tgt_maxbytes(struct lov_tgt_desc *tgt) +{ + struct obd_import *imp; + loff_t maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; + + if (!tgt->ltd_active) + return maxbytes; + + imp = tgt->ltd_obd->u.cli.cl_import; + if (!imp) + return maxbytes; + + spin_lock(&imp->imp_lock); + if ((imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_IDLE) && + (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) && + imp->imp_connect_data.ocd_maxbytes > 0) + maxbytes = imp->imp_connect_data.ocd_maxbytes; + + spin_unlock(&imp->imp_lock); + + return maxbytes; +} + +static int lsm_lmm_verify_v1v3(struct lov_mds_md *lmm, size_t lmm_size, + u16 stripe_count) +{ + u32 pattern = le32_to_cpu(lmm->lmm_pattern); + int rc = 0; + + if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { + rc = -EINVAL; + CERROR("lov: bad stripe count %d: rc = %d\n", + stripe_count, rc); + lov_dump_lmm_common(D_WARNING, lmm); + goto out; + } + + if (lmm_oi_id(&lmm->lmm_oi) == 0) { + rc = -EINVAL; + CERROR("lov: zero object id: rc = %d\n", rc); + lov_dump_lmm_common(D_WARNING, lmm); + goto out; + } + + if (!lov_pattern_supported(lov_pattern(pattern))) { + rc = -EINVAL; + CERROR("lov: unrecognized striping pattern: rc = %d\n", rc); + lov_dump_lmm_common(D_WARNING, lmm); + goto out; + } + + if (lmm->lmm_stripe_size == 0 || + (le32_to_cpu(lmm->lmm_stripe_size)&(LOV_MIN_STRIPE_SIZE-1)) != 0) { + rc = -EINVAL; + CERROR("lov: bad stripe size %u: rc = %d\n", + le32_to_cpu(lmm->lmm_stripe_size), rc); + lov_dump_lmm_common(D_WARNING, lmm); + goto out; + } + +out: + return rc; +} + +static void lsme_free(struct lov_stripe_md_entry *lsme) +{ + unsigned int stripe_count; + unsigned int i; + size_t lsme_size; + + if (lsme->lsme_magic == LOV_MAGIC_FOREIGN) { + /* + * TODO: In addition to HSM foreign layout, It needs to add + * support for other kinds of foreign layout types such as + * DAOS, S3. When add these supports, it will use non-inline + * @lov_hsm_base to store layout information, and need to + * free extra allocated buffer. + */ + OBD_FREE_LARGE(lsme, sizeof(*lsme)); + return; + } + + stripe_count = lsme->lsme_stripe_count; + if (!lsme_inited(lsme) || + lsme->lsme_pattern & LOV_PATTERN_F_RELEASED) + stripe_count = 0; + for (i = 0; i < stripe_count; i++) + OBD_SLAB_FREE_PTR(lsme->lsme_oinfo[i], lov_oinfo_slab); + + lsme_size = offsetof(typeof(*lsme), lsme_oinfo[stripe_count]); + OBD_FREE_LARGE(lsme, lsme_size); +} + +void lsm_free(struct lov_stripe_md *lsm) +{ + unsigned int entry_count = lsm->lsm_entry_count; + unsigned int i; + size_t lsm_size; + + if (lsm->lsm_magic == LOV_MAGIC_FOREIGN) { + OBD_FREE_LARGE(lsm_foreign(lsm), lsm->lsm_foreign_size); + } else { + for (i = 0; i < entry_count; i++) + lsme_free(lsm->lsm_entries[i]); + } + + lsm_size = lsm->lsm_magic == LOV_MAGIC_FOREIGN ? + offsetof(typeof(*lsm), lsm_entries[1]) : + offsetof(typeof(*lsm), lsm_entries[entry_count]); + OBD_FREE(lsm, lsm_size); +} + +/** + * Unpack a struct lov_mds_md into a struct lov_stripe_md_entry. + * + * The caller should set id and extent. 
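One of the checks in lsm_lmm_verify_v1v3() above rejects a stripe size that is zero or not a multiple of LOV_MIN_STRIPE_SIZE, using the usual power-of-two mask test. A tiny standalone sketch of that test; the 64 KiB TOY_MIN_STRIPE_SIZE is an assumed value for illustration, not taken from the patch:

#include <stdio.h>
#include <stdint.h>

#define TOY_MIN_STRIPE_SIZE (1U << 16)    // assumed 64 KiB minimum

static int stripe_size_ok(uint32_t size)
{
    return size != 0 && (size & (TOY_MIN_STRIPE_SIZE - 1)) == 0;
}

int main(void)
{
    printf("1 MiB ok?  %d\n", stripe_size_ok(1U << 20));     // 1
    printf("96 KiB ok? %d\n", stripe_size_ok(96U << 10));    // not 64 KiB aligned -> 0
    printf("zero ok?   %d\n", stripe_size_ok(0));            // 0
    return 0;
}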
+ */ +static struct lov_stripe_md_entry * +lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, + const char *pool_name, bool inited, struct lov_ost_data_v1 *objects, + loff_t *maxbytes) +{ + struct lov_stripe_md_entry *lsme; + size_t lsme_size; + loff_t min_stripe_maxbytes = 0; + loff_t lov_bytes; + u32 magic; + u32 pattern; + unsigned int stripe_count; + unsigned int i; + int rc; + + magic = le32_to_cpu(lmm->lmm_magic); + if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) + RETURN(ERR_PTR(-EINVAL)); + + pattern = le32_to_cpu(lmm->lmm_pattern); + if (pattern & LOV_PATTERN_F_RELEASED || !inited) + stripe_count = 0; + else + stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + + if (buf_size < lov_mds_md_size(stripe_count, magic)) { + CERROR("LOV EA %s too small: %zu, need %u\n", + magic == LOV_MAGIC_V1 ? "V1" : "V3", buf_size, + lov_mds_md_size(stripe_count, magic == LOV_MAGIC_V1 ? + LOV_MAGIC_V1 : LOV_MAGIC_V3)); + lov_dump_lmm_common(D_WARNING, lmm); + return ERR_PTR(-EINVAL); + } + + rc = lsm_lmm_verify_v1v3(lmm, buf_size, stripe_count); + if (rc < 0) + return ERR_PTR(rc); + + lsme_size = offsetof(typeof(*lsme), lsme_oinfo[stripe_count]); + OBD_ALLOC_LARGE(lsme, lsme_size); + if (!lsme) + RETURN(ERR_PTR(-ENOMEM)); + + lsme->lsme_magic = magic; + lsme->lsme_pattern = pattern; + lsme->lsme_flags = 0; + lsme->lsme_stripe_size = le32_to_cpu(lmm->lmm_stripe_size); + /* preserve the possible -1 stripe count for uninstantiated component */ + lsme->lsme_stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + lsme->lsme_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); + + if (pool_name) { + size_t pool_name_len; + + pool_name_len = strlcpy(lsme->lsme_pool_name, pool_name, + sizeof(lsme->lsme_pool_name)); + if (pool_name_len >= sizeof(lsme->lsme_pool_name)) + GOTO(out_lsme, rc = -E2BIG); + } + + /* with Data-on-MDT set maxbytes to stripe size */ + if (lsme_is_dom(lsme)) { + if (maxbytes) { + lov_bytes = lsme->lsme_stripe_size; + goto out_dom1; + } else { + goto out_dom2; + } + } + + for (i = 0; i < stripe_count; i++) { + struct lov_oinfo *loi; + struct lov_tgt_desc *ltd; + + OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS); + if (!loi) + GOTO(out_lsme, rc = -ENOMEM); + + lsme->lsme_oinfo[i] = loi; + + ostid_le_to_cpu(&objects[i].l_ost_oi, &loi->loi_oi); + loi->loi_ost_idx = le32_to_cpu(objects[i].l_ost_idx); + loi->loi_ost_gen = le32_to_cpu(objects[i].l_ost_gen); + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ost_idx >= lov->desc.ld_tgt_count && + !lov2obd(lov)->obd_process_conf) { + CERROR("%s: OST index %d more than OST count %d\n", + (char*)lov->desc.ld_uuid.uuid, + loi->loi_ost_idx, lov->desc.ld_tgt_count); + lov_dump_lmm_v1(D_WARNING, lmm); + GOTO(out_lsme, rc = -EINVAL); + } + + ltd = lov->lov_tgts[loi->loi_ost_idx]; + if (!ltd) { + CERROR("%s: OST index %d missing\n", + (char*)lov->desc.ld_uuid.uuid, loi->loi_ost_idx); + lov_dump_lmm_v1(D_WARNING, lmm); + continue; + } + + lov_bytes = lov_tgt_maxbytes(ltd); + if (min_stripe_maxbytes == 0 || lov_bytes < min_stripe_maxbytes) + min_stripe_maxbytes = lov_bytes; + } + + if (maxbytes) { + if (min_stripe_maxbytes == 0) + min_stripe_maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; + + if (stripe_count == 0) + stripe_count = lov->desc.ld_tgt_count; + + if (min_stripe_maxbytes <= LLONG_MAX / stripe_count) + lov_bytes = min_stripe_maxbytes * stripe_count; + else + lov_bytes = MAX_LFS_FILESIZE; +out_dom1: + *maxbytes = min_t(loff_t, lov_bytes, MAX_LFS_FILESIZE); + } +out_dom2: + + return lsme; + +out_lsme: + for (i = 0; i < 
stripe_count; i++) { + struct lov_oinfo *loi = lsme->lsme_oinfo[i]; + + if (loi) + OBD_SLAB_FREE_PTR(lsme->lsme_oinfo[i], lov_oinfo_slab); + } + OBD_FREE_LARGE(lsme, lsme_size); + + return ERR_PTR(rc); +} + +static struct +lov_stripe_md *lsm_unpackmd_v1v3(struct lov_obd *lov, struct lov_mds_md *lmm, + size_t buf_size, const char *pool_name, + struct lov_ost_data_v1 *objects) +{ + struct lov_stripe_md *lsm; + struct lov_stripe_md_entry *lsme; + size_t lsm_size; + loff_t maxbytes; + u32 pattern; + int rc; + + pattern = le32_to_cpu(lmm->lmm_pattern); + + lsme = lsme_unpack(lov, lmm, buf_size, pool_name, true, objects, + &maxbytes); + if (IS_ERR(lsme)) + RETURN(ERR_CAST(lsme)); + + lsme->lsme_flags = LCME_FL_INIT; + lsme->lsme_extent.e_start = 0; + lsme->lsme_extent.e_end = LUSTRE_EOF; + + lsm_size = offsetof(typeof(*lsm), lsm_entries[1]); + OBD_ALLOC(lsm, lsm_size); + if (!lsm) + GOTO(out_lsme, rc = -ENOMEM); + + atomic_set(&lsm->lsm_refc, 1); + spin_lock_init(&lsm->lsm_lock); + lsm->lsm_maxbytes = maxbytes; + lmm_oi_le_to_cpu(&lsm->lsm_oi, &lmm->lmm_oi); + lsm->lsm_magic = le32_to_cpu(lmm->lmm_magic); + lsm->lsm_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); + lsm->lsm_entry_count = 1; + lsm->lsm_is_released = pattern & LOV_PATTERN_F_RELEASED; + lsm->lsm_entries[0] = lsme; + + return lsm; + +out_lsme: + lsme_free(lsme); + + return ERR_PTR(rc); +} + +static struct lov_stripe_md * +lsm_unpackmd_v1(struct lov_obd *lov, void *buf, size_t buf_size) +{ + struct lov_mds_md_v1 *lmm = buf; + + return lsm_unpackmd_v1v3(lov, buf, buf_size, NULL, lmm->lmm_objects); +} + +static const struct lsm_operations lsm_v1_ops = { + .lsm_unpackmd = lsm_unpackmd_v1, +}; + +static struct lov_stripe_md * +lsm_unpackmd_v3(struct lov_obd *lov, void *buf, size_t buf_size) +{ + struct lov_mds_md_v3 *lmm = buf; + + return lsm_unpackmd_v1v3(lov, buf, buf_size, lmm->lmm_pool_name, + lmm->lmm_objects); +} + +static const struct lsm_operations lsm_v3_ops = { + .lsm_unpackmd = lsm_unpackmd_v3, +}; + +static int lsm_verify_comp_md_v1(struct lov_comp_md_v1 *lcm, + size_t lcm_buf_size) +{ + unsigned int entry_count; + unsigned int i; + size_t lcm_size; + + lcm_size = le32_to_cpu(lcm->lcm_size); + if (lcm_buf_size < lcm_size) { + CERROR("bad LCM buffer size %zu, expected %zu\n", + lcm_buf_size, lcm_size); + RETURN(-EINVAL); + } + + entry_count = le16_to_cpu(lcm->lcm_entry_count); + for (i = 0; i < entry_count; i++) { + struct lov_comp_md_entry_v1 *lcme = &lcm->lcm_entries[i]; + size_t blob_offset; + size_t blob_size; + + blob_offset = le32_to_cpu(lcme->lcme_offset); + blob_size = le32_to_cpu(lcme->lcme_size); + + if (lcm_size < blob_offset || lcm_size < blob_size || + lcm_size < blob_offset + blob_size) { + CERROR("LCM entry %u has invalid blob: " + "LCM size = %zu, offset = %zu, size = %zu\n", + le32_to_cpu(lcme->lcme_id), + lcm_size, blob_offset, blob_size); + RETURN(-EINVAL); + } + } + + return 0; +} + +static struct lov_stripe_md_entry * +lsme_unpack_foreign(struct lov_obd *lov, void *buf, size_t buf_size, + bool inited, loff_t *maxbytes) +{ + struct lov_stripe_md_entry *lsme; + struct lov_foreign_md *lfm = buf; + __u32 magic; + + ENTRY; + + magic = le32_to_cpu(lfm->lfm_magic); + if (magic != LOV_MAGIC_FOREIGN) + RETURN(ERR_PTR(-EINVAL)); + + OBD_ALLOC_LARGE(lsme, sizeof(*lsme)); + if (!lsme) + RETURN(ERR_PTR(-ENOMEM)); + + lsme->lsme_magic = magic; + lsme->lsme_pattern = LOV_PATTERN_FOREIGN; + lsme->lsme_flags = 0; + + if (maxbytes) + *maxbytes = MAX_LFS_FILESIZE; + + RETURN(lsme); +} + +static struct lov_stripe_md_entry * 
+lsme_unpack_comp(struct lov_obd *lov, struct lov_mds_md *lmm, + size_t lmm_buf_size, bool inited, loff_t *maxbytes) +{ + unsigned int magic; + + magic = le32_to_cpu(lmm->lmm_magic); + if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3 && + magic != LOV_MAGIC_FOREIGN) + RETURN(ERR_PTR(-EINVAL)); + + if (magic != LOV_MAGIC_FOREIGN && + le16_to_cpu(lmm->lmm_stripe_count) == 0 && + lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_MDT) + RETURN(ERR_PTR(-EINVAL)); + + if (magic == LOV_MAGIC_V1) { + return lsme_unpack(lov, lmm, lmm_buf_size, NULL, + inited, lmm->lmm_objects, maxbytes); + } else if (magic == LOV_MAGIC_V3) { + struct lov_mds_md_v3 *lmm3 = (struct lov_mds_md_v3 *)lmm; + + return lsme_unpack(lov, lmm, lmm_buf_size, lmm3->lmm_pool_name, + inited, lmm3->lmm_objects, maxbytes); + } else { /* LOV_MAGIC_FOREIGN */ + return lsme_unpack_foreign(lov, lmm, lmm_buf_size, + inited, maxbytes); + } +} + +static struct lov_stripe_md * +lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) +{ + struct lov_comp_md_v1 *lcm = buf; + struct lov_stripe_md *lsm; + size_t lsm_size; + unsigned int entry_count = 0; + unsigned int i; + loff_t maxbytes; + int rc; + + rc = lsm_verify_comp_md_v1(buf, buf_size); + if (rc < 0) + return ERR_PTR(rc); + + entry_count = le16_to_cpu(lcm->lcm_entry_count); + + lsm_size = offsetof(typeof(*lsm), lsm_entries[entry_count]); + OBD_ALLOC(lsm, lsm_size); + if (!lsm) + return ERR_PTR(-ENOMEM); + + atomic_set(&lsm->lsm_refc, 1); + spin_lock_init(&lsm->lsm_lock); + lsm->lsm_magic = le32_to_cpu(lcm->lcm_magic); + lsm->lsm_layout_gen = le32_to_cpu(lcm->lcm_layout_gen); + lsm->lsm_entry_count = entry_count; + lsm->lsm_mirror_count = le16_to_cpu(lcm->lcm_mirror_count); + lsm->lsm_flags = le16_to_cpu(lcm->lcm_flags); + lsm->lsm_is_released = true; + lsm->lsm_maxbytes = LLONG_MIN; + + for (i = 0; i < entry_count; i++) { + struct lov_comp_md_entry_v1 *lcme = &lcm->lcm_entries[i]; + struct lov_stripe_md_entry *lsme; + size_t blob_offset; + size_t blob_size; + void *blob; + + blob_offset = le32_to_cpu(lcme->lcme_offset); + blob_size = le32_to_cpu(lcme->lcme_size); + blob = (char *)lcm + blob_offset; + + lsme = lsme_unpack_comp(lov, blob, blob_size, + le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT, + (i == entry_count - 1) ? &maxbytes : + NULL); + if (IS_ERR(lsme)) + GOTO(out_lsm, rc = PTR_ERR(lsme)); + + if (!(lsme->lsme_pattern & LOV_PATTERN_F_RELEASED)) + lsm->lsm_is_released = false; + + lsm->lsm_entries[i] = lsme; + lsme->lsme_id = le32_to_cpu(lcme->lcme_id); + lsme->lsme_flags = le32_to_cpu(lcme->lcme_flags); + if (lsme->lsme_flags & LCME_FL_NOSYNC) + lsme->lsme_timestamp = + le64_to_cpu(lcme->lcme_timestamp); + lu_extent_le_to_cpu(&lsme->lsme_extent, &lcme->lcme_extent); + + if (i == entry_count - 1) { + lsm->lsm_maxbytes = (loff_t)lsme->lsme_extent.e_start + + maxbytes; + /* + * the last component hasn't been defined, or + * lsm_maxbytes overflowed. 
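The maxbytes handling this comment refers to adds the last component's start offset to that component's own byte limit, and falls back to a filesystem-wide maximum if the extent is still open-ended or the sum wraps. A hedged standalone sketch of that clamping, with TOY_MAX_FILESIZE standing in for MAX_LFS_FILESIZE:

#include <stdio.h>
#include <stdint.h>

#define TOY_MAX_FILESIZE ((int64_t)1 << 62)    // placeholder upper bound

static int64_t clamp_maxbytes(uint64_t comp_start, int64_t comp_maxbytes)
{
    // do the sum in unsigned arithmetic so a wrap is well defined, then clamp
    uint64_t sum = comp_start + (uint64_t)comp_maxbytes;

    if (sum < comp_start || sum > (uint64_t)TOY_MAX_FILESIZE)
        return TOY_MAX_FILESIZE;
    return (int64_t)sum;
}

int main(void)
{
    printf("%lld\n", (long long)clamp_maxbytes(1ULL << 30, 1LL << 40));
    printf("%lld\n", (long long)clamp_maxbytes(1ULL << 62, INT64_MAX));
    return 0;
}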
+ */ + if (!lsme_is_dom(lsme) && + (lsme->lsme_extent.e_end != LUSTRE_EOF || + lsm->lsm_maxbytes < + (loff_t)lsme->lsme_extent.e_start)) + lsm->lsm_maxbytes = MAX_LFS_FILESIZE; + } + } + + RETURN(lsm); + +out_lsm: + for (i = 0; i < entry_count; i++) + if (lsm->lsm_entries[i]) + lsme_free(lsm->lsm_entries[i]); + + OBD_FREE(lsm, lsm_size); + + RETURN(ERR_PTR(rc)); +} + +static const struct lsm_operations lsm_comp_md_v1_ops = { + .lsm_unpackmd = lsm_unpackmd_comp_md_v1, +}; + +static struct +lov_stripe_md *lsm_unpackmd_foreign(struct lov_obd *lov, void *buf, + size_t buf_size) +{ + struct lov_foreign_md *lfm = buf; + struct lov_stripe_md *lsm; + size_t lsm_size; + struct lov_stripe_md_entry *lsme; + + lsm_size = offsetof(typeof(*lsm), lsm_entries[1]); + OBD_ALLOC(lsm, lsm_size); + if (lsm == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + atomic_set(&lsm->lsm_refc, 1); + spin_lock_init(&lsm->lsm_lock); + lsm->lsm_magic = le32_to_cpu(lfm->lfm_magic); + lsm->lsm_foreign_size = foreign_size_le(lfm); + + /* alloc for full foreign EA including format fields */ + OBD_ALLOC_LARGE(lsme, lsm->lsm_foreign_size); + if (lsme == NULL) { + OBD_FREE(lsm, lsm_size); + RETURN(ERR_PTR(-ENOMEM)); + } + + /* copy full foreign EA including format fields */ + memcpy(lsme, buf, lsm->lsm_foreign_size); + + lsm_foreign(lsm) = lsme; + + return lsm; +} + +static const struct lsm_operations lsm_foreign_ops = { + .lsm_unpackmd = lsm_unpackmd_foreign, +}; + +const struct lsm_operations *lsm_op_find(int magic) +{ + switch (magic) { + case LOV_MAGIC_V1: + return &lsm_v1_ops; + case LOV_MAGIC_V3: + return &lsm_v3_ops; + case LOV_MAGIC_COMP_V1: + return &lsm_comp_md_v1_ops; + case LOV_MAGIC_FOREIGN: + return &lsm_foreign_ops; + default: + CERROR("unrecognized lsm_magic %08x\n", magic); + return NULL; + } +} + +void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm) +{ + int i, j; + + CDEBUG_LIMIT(level, + "lsm %p, objid "DOSTID", maxbytes %#llx, magic 0x%08X, refc: %d, entry: %u, mirror: %u, flags: %u,layout_gen %u\n", + lsm, POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic, + atomic_read(&lsm->lsm_refc), lsm->lsm_entry_count, + lsm->lsm_mirror_count, lsm->lsm_flags, lsm->lsm_layout_gen); + + if (lsm->lsm_magic == LOV_MAGIC_FOREIGN) { + struct lov_foreign_md *lfm = (void *)lsm_foreign(lsm); + + CDEBUG_LIMIT(level, + "foreign LOV EA, magic %x, length %u, type %x, flags %x, value '%.*s'\n", + lfm->lfm_magic, lfm->lfm_length, lfm->lfm_type, + lfm->lfm_flags, lfm->lfm_length, lfm->lfm_value); + return; + } + + for (i = 0; i < lsm->lsm_entry_count; i++) { + struct lov_stripe_md_entry *lse = lsm->lsm_entries[i]; + + CDEBUG(level, DEXT ": id: %u, flags: %x, " + "magic 0x%08X, layout_gen %u, " + "stripe count %u, sstripe size %u, " + "pool: ["LOV_POOLNAMEF"]\n", + PEXT(&lse->lsme_extent), lse->lsme_id, lse->lsme_flags, + lse->lsme_magic, lse->lsme_layout_gen, + lse->lsme_stripe_count, lse->lsme_stripe_size, + lse->lsme_pool_name); + if (!lsme_inited(lse) || + lse->lsme_pattern & LOV_PATTERN_F_RELEASED) + continue; + for (j = 0; j < lse->lsme_stripe_count; j++) { + CDEBUG(level, " oinfo:%p: ostid: "DOSTID + " ost idx: %d gen: %d\n", + lse->lsme_oinfo[j], + POSTID(&lse->lsme_oinfo[j]->loi_oi), + lse->lsme_oinfo[j]->loi_ost_idx, + lse->lsme_oinfo[j]->loi_ost_gen); + } + } +} + +int lov_lsm_entry(const struct lov_stripe_md *lsm, __u64 offset) +{ + int i; + + for (i = 0; i < lsm->lsm_entry_count; i++) { + struct lov_stripe_md_entry *lse = lsm->lsm_entries[i]; + + if ((offset >= lse->lsme_extent.e_start && + offset < 
lse->lsme_extent.e_end) || + (offset == OBD_OBJECT_EOF && + lse->lsme_extent.e_end == OBD_OBJECT_EOF)) + return i; + } + + return -1; +} + +/** + * lmm_layout_gen overlaps stripe_offset field, it needs to be reset back when + * sending to MDT for passing striping checks + */ +void lov_fix_ea_for_replay(void *lovea) +{ + struct lov_user_md *lmm = lovea; + struct lov_comp_md_v1 *c1; + int i; + + switch (le32_to_cpu(lmm->lmm_magic)) { + case LOV_USER_MAGIC_V1: + case LOV_USER_MAGIC_V3: + lmm->lmm_stripe_offset = LOV_OFFSET_DEFAULT; + break; + + case LOV_USER_MAGIC_COMP_V1: + c1 = (void *)lmm; + for (i = 0; i < le16_to_cpu(c1->lcm_entry_count); i++) { + struct lov_comp_md_entry_v1 *ent = &c1->lcm_entries[i]; + + if (le32_to_cpu(ent->lcme_flags) & LCME_FL_INIT) { + lmm = (void *)((char *)c1 + + le32_to_cpu(ent->lcme_offset)); + lmm->lmm_stripe_offset = LOV_OFFSET_DEFAULT; + } + } + } +} +EXPORT_SYMBOL(lov_fix_ea_for_replay); diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h new file mode 100644 index 0000000000000..9341d2e80b7c4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h @@ -0,0 +1,375 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef LOV_INTERNAL_H +#define LOV_INTERNAL_H + +#include +#include + +/* If we are unable to get the maximum object size from the OST in + * ocd_maxbytes using OBD_CONNECT_MAXBYTES, then we fall back to using + * the old maximum object size from ext3. */ +#define LUSTRE_EXT3_STRIPE_MAXBYTES 0x1fffffff000ULL + +struct lov_stripe_md_entry { + struct lu_extent lsme_extent; + u32 lsme_id; + u32 lsme_magic; + u32 lsme_flags; + u32 lsme_pattern; + u64 lsme_timestamp; + u32 lsme_stripe_size; + u16 lsme_stripe_count; + u16 lsme_layout_gen; + char lsme_pool_name[LOV_MAXPOOLNAME + 1]; + struct lov_oinfo *lsme_oinfo[]; +}; + +static inline bool lsme_is_dom(struct lov_stripe_md_entry *lsme) +{ + return (lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_MDT); +} + +static inline void copy_lsm_entry(struct lov_stripe_md_entry *dst, + struct lov_stripe_md_entry *src) +{ + unsigned i; + + for (i = 0; i < src->lsme_stripe_count; i++) + *dst->lsme_oinfo[i] = *src->lsme_oinfo[i]; + memcpy(dst, src, offsetof(typeof(*src), lsme_oinfo)); +} + +struct lov_stripe_md { + atomic_t lsm_refc; + spinlock_t lsm_lock; + pid_t lsm_lock_owner; /* debugging */ + + union { + /* maximum possible file size, might change as OSTs status + * changes, e.g. 
disconnected, deactivated + */ + loff_t lsm_maxbytes; + /* size of full foreign LOV */ + size_t lsm_foreign_size; + }; + struct ost_id lsm_oi; + u32 lsm_magic; + u32 lsm_layout_gen; + u16 lsm_flags; + bool lsm_is_released; + u16 lsm_mirror_count; + u16 lsm_entry_count; + struct lov_stripe_md_entry *lsm_entries[]; +}; + +#define lsm_foreign(lsm) (lsm->lsm_entries[0]) + +static inline bool lsme_is_foreign(const struct lov_stripe_md_entry *lsme) +{ + return lsme->lsme_magic == LOV_MAGIC_FOREIGN; +} + +static inline bool lsm_entry_is_foreign(const struct lov_stripe_md *lsm, + int index) +{ + return lsme_is_foreign(lsm->lsm_entries[index]); +} + +static inline bool lsme_inited(const struct lov_stripe_md_entry *lsme) +{ + return lsme->lsme_flags & LCME_FL_INIT; +} + +static inline bool lsm_entry_inited(const struct lov_stripe_md *lsm, int index) +{ + return lsme_inited(lsm->lsm_entries[index]); +} + +static inline bool lsm_is_composite(__u32 magic) +{ + return magic == LOV_MAGIC_COMP_V1; +} + +static inline size_t lov_comp_md_size(const struct lov_stripe_md *lsm) +{ + struct lov_stripe_md_entry *lsme; + size_t size; + int entry; + + if (lsm->lsm_magic == LOV_MAGIC_V1 || lsm->lsm_magic == LOV_MAGIC_V3) + return lov_mds_md_size(lsm->lsm_entries[0]->lsme_stripe_count, + lsm->lsm_entries[0]->lsme_magic); + + if (lsm->lsm_magic == LOV_MAGIC_FOREIGN) + return lsm->lsm_foreign_size; + + LASSERT(lsm->lsm_magic == LOV_MAGIC_COMP_V1); + + size = sizeof(struct lov_comp_md_v1) + + sizeof(struct lov_comp_md_entry_v1) * lsm->lsm_entry_count; + for (entry = 0; entry < lsm->lsm_entry_count; entry++) { + u16 stripe_count; + + lsme = lsm->lsm_entries[entry]; + + if (lsme_inited(lsme)) + stripe_count = lsme->lsme_stripe_count; + else + stripe_count = 0; + + size += lov_mds_md_size(stripe_count, + lsme->lsme_magic); + } + + return size; +} + +static inline bool lsm_has_objects(struct lov_stripe_md *lsm) +{ + return lsm != NULL && !lsm->lsm_is_released; +} + +static inline unsigned int lov_comp_index(int entry, int stripe) +{ + LASSERT(entry >= 0 && entry <= SHRT_MAX); + LASSERT(stripe >= 0 && stripe < USHRT_MAX); + + return entry << 16 | stripe; +} + +static inline int lov_comp_stripe(int index) +{ + return index & 0xffff; +} + +static inline int lov_comp_entry(int index) +{ + return index >> 16; +} + +struct lsm_operations { + struct lov_stripe_md *(*lsm_unpackmd)(struct lov_obd *, void *, size_t); +}; + +const struct lsm_operations *lsm_op_find(int magic); +void lsm_free(struct lov_stripe_md *lsm); + +/* lov_do_div64(a, b) returns a % b, and a = a / b. + * The 32-bit code is LOV-specific due to knowing about stripe limits in + * order to reduce the divisor to a 32-bit number. If the divisor is + * already a 32-bit value the compiler handles this directly. 
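A worked standalone example of the reduction this comment describes: when the 64-bit divisor is a multiple of the (power-of-two) minimum stripe size, shifting both operands right makes the divisor fit in 32 bits, and the true remainder is rebuilt from the shifted remainder plus the low bits that were shifted out. The constants below are assumptions for illustration, not values from the patch:

#include <stdio.h>
#include <stdint.h>

#define TOY_MIN_STRIPE_BITS 16
#define TOY_MIN_STRIPE_SIZE (1ULL << TOY_MIN_STRIPE_BITS)

int main(void)
{
    uint64_t num  = 123456789012345ULL;                // e.g. a file offset
    uint64_t base = 6ULL * (1ULL << 32);               // 64-bit, stripe-aligned divisor

    uint64_t low  = num & (TOY_MIN_STRIPE_SIZE - 1);   // bits lost by the shift
    uint64_t q    = (num >> TOY_MIN_STRIPE_BITS) / (base >> TOY_MIN_STRIPE_BITS);
    uint64_t r    = (num >> TOY_MIN_STRIPE_BITS) % (base >> TOY_MIN_STRIPE_BITS);
    uint64_t rem  = (r << TOY_MIN_STRIPE_BITS) + low;

    printf("reduced:  q=%llu rem=%llu\n",
           (unsigned long long)q, (unsigned long long)rem);
    printf("direct:   q=%llu rem=%llu\n",
           (unsigned long long)(num / base), (unsigned long long)(num % base));
    return 0;
}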
*/ +#if BITS_PER_LONG == 64 +# define lov_do_div64(n, base) ({ \ + uint64_t __base = (base); \ + uint64_t __rem; \ + __rem = ((uint64_t)(n)) % __base; \ + (n) = ((uint64_t)(n)) / __base; \ + __rem; \ +}) +#elif BITS_PER_LONG == 32 +# define lov_do_div64(n, base) ({ \ + uint64_t __num = (n); \ + uint64_t __rem; \ + if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) { \ + int __remainder; \ + LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), \ + "64 bit lov division %llu / %llu\n", \ + __num, (uint64_t)(base)); \ + __remainder = __num & (LOV_MIN_STRIPE_SIZE - 1); \ + __num >>= LOV_MIN_STRIPE_BITS; \ + __rem = do_div(__num, (base) >> LOV_MIN_STRIPE_BITS); \ + __rem <<= LOV_MIN_STRIPE_BITS; \ + __rem += __remainder; \ + } else { \ + __rem = do_div(__num, base); \ + } \ + (n) = __num; \ + __rem; \ +}) +#endif + +#define pool_tgt_count(p) ((p)->pool_obds.op_count) +#define pool_tgt_array(p) ((p)->pool_obds.op_array) +#define pool_tgt_rw_sem(p) ((p)->pool_obds.op_rw_sem) + +struct pool_desc { + char pool_name[LOV_MAXPOOLNAME + 1]; + struct lu_tgt_pool pool_obds; + atomic_t pool_refcount; + struct rhash_head pool_hash; /* access by poolname */ + struct list_head pool_list; /* serial access */ + struct rcu_head pool_rcu; + struct proc_dir_entry *pool_proc_entry; + struct obd_device *pool_lobd; /* owner */ +}; + +int lov_pool_hash_init(struct rhashtable *tbl); +void lov_pool_hash_destroy(struct rhashtable *tbl); + +struct lov_request { + struct obd_info rq_oi; + struct lov_request_set *rq_rqset; + struct list_head rq_link; + int rq_idx; /* index in lov->tgts array */ +}; + +struct lov_request_set { + struct obd_info *set_oi; + struct obd_device *set_obd; + int set_count; + atomic_t set_completes; + atomic_t set_success; + struct list_head set_list; +}; + +extern struct kmem_cache *lov_oinfo_slab; + +extern struct lu_kmem_descr lov_caches[]; + +#define lov_uuid2str(lv, index) \ + (char *)((lv)->lov_tgts[index]->ltd_uuid.uuid) + +/* lov_merge.c */ +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, int index, + struct ost_lvb *lvb, __u64 *kms_place); + +/* lov_offset.c */ +loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index); +u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, + u64 ost_size, int stripeno); +int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, + int stripeno, loff_t *obd_off); +loff_t lov_size_to_stripe(struct lov_stripe_md *lsm, int index, u64 file_size, + int stripeno); +int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno, + struct lu_extent *ext, u64 *obd_start, u64 *obd_end); +int lov_stripe_number(struct lov_stripe_md *lsm, int index, loff_t lov_off); +pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, int index, + pgoff_t stripe_index, int stripe); + +/* lov_request.c */ +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset); +int lov_fini_statfs_set(struct lov_request_set *set); + +/* lov_obd.c */ +void lov_tgts_getref(struct obd_device *obd); +void lov_tgts_putref(struct obd_device *obd); +void lov_stripe_lock(struct lov_stripe_md *md); +void lov_stripe_unlock(struct lov_stripe_md *md); +void lov_fix_desc(struct lov_desc *desc); +void lov_fix_desc_stripe_size(__u64 *val); +void lov_fix_desc_stripe_count(__u32 *val); +void lov_fix_desc_pattern(__u32 *val); +void lov_fix_desc_qos_maxage(__u32 *val); +__u16 lov_get_stripe_count(struct lov_obd *lov, __u32 magic, + __u16 stripe_count); +int lov_connect_obd(struct obd_device *obd, u32 index, int 
activate, + struct obd_connect_data *data); +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, + u32 *indexp, int *genp); +int lov_del_target(struct obd_device *obd, u32 index, + struct obd_uuid *uuidp, int gen); + +/* lov_pack.c */ +ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, + size_t buf_size); +struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, + size_t buf_size); +int lov_free_memmd(struct lov_stripe_md **lsmp); + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm); +void lov_dump_lmm_common(int level, void *lmmp); + +/* lov_ea.c */ +void lsm_free_plain(struct lov_stripe_md *lsm); +void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm); + +/* lproc_lov.c */ +int lov_tunables_init(struct obd_device *obd); + +/* lov_cl.c */ +extern struct lu_device_type lov_device_type; + +#define LOV_MDC_TGT_MAX 256 + +/* high level pool methods */ +int lov_pool_new(struct obd_device *obd, char *poolname); +int lov_pool_del(struct obd_device *obd, char *poolname); +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname); +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname); + +static inline struct lov_stripe_md *lsm_addref(struct lov_stripe_md *lsm) +{ + LASSERT(atomic_read(&lsm->lsm_refc) > 0); + atomic_inc(&lsm->lsm_refc); + return lsm; +} + +static inline bool lov_oinfo_is_dummy(const struct lov_oinfo *loi) +{ + if (unlikely(loi->loi_oi.oi.oi_id == 0 && + loi->loi_oi.oi.oi_seq == 0 && + loi->loi_ost_idx == 0 && + loi->loi_ost_gen == 0)) + return true; + + return false; +} + +static inline struct obd_device *lov2obd(const struct lov_obd *lov) +{ + return container_of_safe(lov, struct obd_device, u.lov); +} + +static inline void lov_lsm2layout(struct lov_stripe_md *lsm, + struct lov_stripe_md_entry *lsme, + struct ost_layout *ol) +{ + ol->ol_stripe_size = lsme->lsme_stripe_size; + ol->ol_stripe_count = lsme->lsme_stripe_count; + if (lsm->lsm_magic == LOV_MAGIC_COMP_V1) { + ol->ol_comp_start = lsme->lsme_extent.e_start; + ol->ol_comp_end = lsme->lsme_extent.e_end; + ol->ol_comp_id = lsme->lsme_id; + } else { + ol->ol_comp_start = 0; + ol->ol_comp_end = 0; + ol->ol_comp_id = 0; + } +} + +struct pool_desc *lov_pool_find(struct obd_device *obd, char *poolname); +void lov_pool_putref(struct pool_desc *pool); +#endif diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_io.c b/drivers/staging/lustrefsx/lustre/lov/lov_io.c new file mode 100644 index 0000000000000..ce4fa30b84b6f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_io.c @@ -0,0 +1,1987 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_io for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +static inline struct lov_io_sub *lov_sub_alloc(struct lov_io *lio, int index) +{ + struct lov_io_sub *sub; + + if (lio->lis_nr_subios == 0) { + LASSERT(lio->lis_single_subio_index == -1); + sub = &lio->lis_single_subio; + lio->lis_single_subio_index = index; + memset(sub, 0, sizeof(*sub)); + } else { + OBD_ALLOC_PTR(sub); + } + + if (sub) { + INIT_LIST_HEAD(&sub->sub_list); + INIT_LIST_HEAD(&sub->sub_linkage); + sub->sub_subio_index = index; + } + + return sub; +} + +static inline void lov_sub_free(struct lov_io *lio, struct lov_io_sub *sub) +{ + if (sub->sub_subio_index == lio->lis_single_subio_index) { + LASSERT(sub == &lio->lis_single_subio); + lio->lis_single_subio_index = -1; + } else { + OBD_FREE_PTR(sub); + } +} + +static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + ENTRY; + + cl_io_fini(sub->sub_env, &sub->sub_io); + + if (sub->sub_env && !IS_ERR(sub->sub_env)) { + cl_env_put(sub->sub_env, &sub->sub_refcheck); + sub->sub_env = NULL; + } + EXIT; +} + +static inline bool +is_index_within_mirror(struct lov_object *lov, int index, int mirror_index) +{ + struct lov_layout_composite *comp = &lov->u.composite; + struct lov_mirror_entry *lre = &comp->lo_mirrors[mirror_index]; + + return (index >= lre->lre_start && index <= lre->lre_end); +} + +static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + struct lov_object *lov = lio->lis_object; + struct cl_io *sub_io; + struct cl_object *sub_obj; + struct cl_io *io = lio->lis_cl.cis_io; + int index = lov_comp_entry(sub->sub_subio_index); + int stripe = lov_comp_stripe(sub->sub_subio_index); + int result = 0; + LASSERT(sub->sub_env == NULL); + ENTRY; + + if (unlikely(!lov_r0(lov, index)->lo_sub || + !lov_r0(lov, index)->lo_sub[stripe])) + RETURN(-EIO); + + LASSERTF(ergo(lov_is_flr(lov), + is_index_within_mirror(lov, index, + lio->lis_mirror_index)), + DFID "iot = %d, index = %d, mirror = %d\n", + PFID(lu_object_fid(lov2lu(lov))), io->ci_type, index, + lio->lis_mirror_index); + + /* obtain new environment */ + sub->sub_env = cl_env_get(&sub->sub_refcheck); + if (IS_ERR(sub->sub_env)) { + result = PTR_ERR(sub->sub_env); + RETURN(result); + } + + sub_obj = lovsub2cl(lov_r0(lov, index)->lo_sub[stripe]); + sub_io = &sub->sub_io; + + sub_io->ci_obj = sub_obj; + sub_io->ci_result = 0; + + sub_io->ci_parent = io; + sub_io->ci_lockreq = io->ci_lockreq; + sub_io->ci_type = io->ci_type; + sub_io->ci_no_srvlock = io->ci_no_srvlock; + sub_io->ci_noatime = io->ci_noatime; + sub_io->ci_async_readahead = io->ci_async_readahead; + sub_io->ci_lock_no_expand = io->ci_lock_no_expand; + sub_io->ci_ndelay = io->ci_ndelay; + sub_io->ci_layout_version = io->ci_layout_version; + sub_io->ci_tried_all_mirrors = io->ci_tried_all_mirrors; + + result = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj); + + if (result < 0) + 
lov_io_sub_fini(env, lio, sub); + + RETURN(result); +} + +struct lov_io_sub *lov_sub_get(const struct lu_env *env, + struct lov_io *lio, int index) +{ + struct lov_io_sub *sub; + int rc = 0; + + ENTRY; + + list_for_each_entry(sub, &lio->lis_subios, sub_list) { + if (sub->sub_subio_index == index) { + rc = 1; + break; + } + } + + if (rc == 0) { + sub = lov_sub_alloc(lio, index); + if (!sub) + GOTO(out, rc = -ENOMEM); + + rc = lov_io_sub_init(env, lio, sub); + if (rc < 0) { + lov_sub_free(lio, sub); + GOTO(out, rc); + } + + list_add_tail(&sub->sub_list, &lio->lis_subios); + lio->lis_nr_subios++; + } +out: + if (rc < 0) + sub = ERR_PTR(rc); + else + sub->sub_io.ci_noquota = lio->lis_cl.cis_io->ci_noquota; + RETURN(sub); +} + +/***************************************************************************** + * + * Lov io operations. + * + */ +static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, + struct cl_io *io) +{ + ENTRY; + + LASSERT(lio->lis_object != NULL); + + INIT_LIST_HEAD(&lio->lis_subios); + lio->lis_single_subio_index = -1; + lio->lis_nr_subios = 0; + + RETURN(0); +} + +/** + * Decide if it will need write intent RPC + */ +static int lov_io_mirror_write_intent(struct lov_io *lio, + struct lov_object *obj, struct cl_io *io) +{ + struct lov_layout_composite *comp = &obj->u.composite; + struct lu_extent *ext = &io->ci_write_intent; + struct lov_mirror_entry *lre; + struct lov_mirror_entry *primary; + struct lov_layout_entry *lle; + size_t count = 0; + ENTRY; + + *ext = (typeof(*ext)) { lio->lis_pos, lio->lis_endpos }; + io->ci_need_write_intent = 0; + + if (!(io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io) || + cl_io_is_fallocate(io) || cl_io_is_trunc(io))) + RETURN(0); + + /* + * FLR: check if it needs to send a write intent RPC to server. + * Writing to sync_pending file needs write intent RPC to change + * the file state back to write_pending, so that the layout version + * can be increased when the state changes to sync_pending at a later + * time. Otherwise there exists a chance that an evicted client may + * dirty the file data while resync client is working on it. + * Designated I/O is allowed for resync workload. 
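The decision this comment explains boils down to a small predicate: a write intent is needed when the layout is read-only, or when it is sync_pending and the io is not a designated resync io. A standalone sketch of that test, with invented TOY_* names in place of the LCM_FL_* flags:

#include <stdio.h>

enum toy_flr_state { TOY_RDONLY, TOY_WRITE_PENDING, TOY_SYNC_PENDING };

static int need_write_intent(enum toy_flr_state state, int designated_mirror)
{
    return state == TOY_RDONLY ||
           (state == TOY_SYNC_PENDING && designated_mirror == 0);
}

int main(void)
{
    printf("%d %d %d\n",
           need_write_intent(TOY_RDONLY, 0),          // 1: must go write_pending first
           need_write_intent(TOY_SYNC_PENDING, 0),    // 1: ordinary client dirtying data
           need_write_intent(TOY_SYNC_PENDING, 2));   // 0: designated resync io
    return 0;
}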
+ */ + if (lov_flr_state(obj) == LCM_FL_RDONLY || + (lov_flr_state(obj) == LCM_FL_SYNC_PENDING && + io->ci_designated_mirror == 0)) { + io->ci_need_write_intent = 1; + RETURN(0); + } + + LASSERT((lov_flr_state(obj) == LCM_FL_WRITE_PENDING)); + LASSERT(comp->lo_preferred_mirror >= 0); + + /* + * need to iterate all components to see if there are + * multiple components covering the writing component + */ + primary = &comp->lo_mirrors[comp->lo_preferred_mirror]; + LASSERT(!primary->lre_stale); + lov_foreach_mirror_layout_entry(obj, lle, primary) { + LASSERT(lle->lle_valid); + if (!lu_extent_is_overlapped(ext, lle->lle_extent)) + continue; + + ext->e_start = min(ext->e_start, lle->lle_extent->e_start); + ext->e_end = max(ext->e_end, lle->lle_extent->e_end); + ++count; + } + if (count == 0) { + CERROR(DFID ": cannot find any valid components covering " + "file extent "DEXT", mirror: %d\n", + PFID(lu_object_fid(lov2lu(obj))), PEXT(ext), + primary->lre_mirror_id); + RETURN(-EIO); + } + + count = 0; + lov_foreach_mirror_entry(obj, lre) { + if (lre == primary) + continue; + + lov_foreach_mirror_layout_entry(obj, lle, lre) { + if (!lle->lle_valid) + continue; + + if (lu_extent_is_overlapped(ext, lle->lle_extent)) { + ++count; + break; + } + } + } + + CDEBUG(D_VFSTRACE, DFID "there are %zd components to be staled to " + "modify file extent "DEXT", iot: %d\n", + PFID(lu_object_fid(lov2lu(obj))), count, PEXT(ext), io->ci_type); + + io->ci_need_write_intent = count > 0; + + RETURN(0); +} + +static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj, + struct cl_io *io) +{ + struct lov_layout_composite *comp = &obj->u.composite; + int index; + int i; + int result; + ENTRY; + + if (!lov_is_flr(obj)) { + /* only locks/pages are manipulated for CIT_MISC op, no + * cl_io_loop() will be called, don't check/set mirror info. + */ + if (io->ci_type != CIT_MISC) { + LASSERT(comp->lo_preferred_mirror == 0); + lio->lis_mirror_index = comp->lo_preferred_mirror; + } + io->ci_ndelay = 0; + RETURN(0); + } + + /* transfer the layout version for verification */ + if (io->ci_layout_version == 0) + io->ci_layout_version = obj->lo_lsm->lsm_layout_gen; + + /* find the corresponding mirror for designated mirror IO */ + if (io->ci_designated_mirror > 0) { + struct lov_mirror_entry *entry; + + LASSERT(!io->ci_ndelay); + + CDEBUG(D_LAYOUT, "designated I/O mirror state: %d\n", + lov_flr_state(obj)); + + if ((cl_io_is_trunc(io) || io->ci_type == CIT_WRITE || + cl_io_is_fallocate(io)) && + (io->ci_layout_version != obj->lo_lsm->lsm_layout_gen)) { + /* + * For resync I/O, the ci_layout_version was the layout + * version when resync starts. If it doesn't match the + * current object layout version, it means the layout + * has been changed + */ + RETURN(-ESTALE); + } + + io->ci_layout_version |= LU_LAYOUT_RESYNC; + + index = 0; + lio->lis_mirror_index = -1; + lov_foreach_mirror_entry(obj, entry) { + if (entry->lre_mirror_id == + io->ci_designated_mirror) { + lio->lis_mirror_index = index; + break; + } + + index++; + } + + RETURN(lio->lis_mirror_index < 0 ? 
-EINVAL : 0);
+}
+
+ result = lov_io_mirror_write_intent(lio, obj, io);
+ if (result)
+ RETURN(result);
+
+ if (io->ci_need_write_intent) {
+ CDEBUG(D_VFSTRACE, DFID " need write intent for [%llu, %llu)\n",
+ PFID(lu_object_fid(lov2lu(obj))),
+ lio->lis_pos, lio->lis_endpos);
+
+ if (cl_io_is_trunc(io)) {
+ /**
+ * for truncate, we use [size, EOF) to judge whether
+ * a write intent needs to be sent, but we need to
+ * restore the write extent to [0, size] because, in
+ * truncate, the byte at the size position is accessed.
+ */
+ io->ci_write_intent.e_start = 0;
+ io->ci_write_intent.e_end =
+ io->u.ci_setattr.sa_attr.lvb_size + 1;
+ }
+ /* stop cl_io_init() loop */
+ RETURN(1);
+ }
+
+ if (io->ci_ndelay_tried == 0 || /* first time to try */
+ /* reset the mirror index if layout has changed */
+ lio->lis_mirror_layout_gen != obj->lo_lsm->lsm_layout_gen) {
+ lio->lis_mirror_layout_gen = obj->lo_lsm->lsm_layout_gen;
+ index = lio->lis_mirror_index = comp->lo_preferred_mirror;
+ } else {
+ index = lio->lis_mirror_index;
+ LASSERT(index >= 0);
+
+ /* move mirror index to the next one */
+ index = (index + 1) % comp->lo_mirror_count;
+ }
+
+ for (i = 0; i < comp->lo_mirror_count; i++) {
+ struct lu_extent ext = { .e_start = lio->lis_pos,
+ .e_end = lio->lis_pos + 1 };
+ struct lov_mirror_entry *lre;
+ struct lov_layout_entry *lle;
+ bool found = false;
+
+ lre = &comp->lo_mirrors[(index + i) % comp->lo_mirror_count];
+ if (!lre->lre_valid)
+ continue;
+
+ if (lre->lre_foreign)
+ continue;
+
+ lov_foreach_mirror_layout_entry(obj, lle, lre) {
+ if (!lle->lle_valid)
+ continue;
+
+ if (lu_extent_is_overlapped(&ext, lle->lle_extent)) {
+ found = true;
+ break;
+ }
+ } /* each component of the mirror */
+ if (found) {
+ index = (index + i) % comp->lo_mirror_count;
+ break;
+ }
+ } /* each mirror */
+
+ if (i == comp->lo_mirror_count) {
+ CERROR(DFID": failed to find a component covering "
+ "I/O region at %llu\n",
+ PFID(lu_object_fid(lov2lu(obj))), lio->lis_pos);
+
+ dump_lsm(D_ERROR, obj->lo_lsm);
+
+ RETURN(-EIO);
+ }
+
+ CDEBUG(D_VFSTRACE, DFID ": flr state: %d, move mirror from %d to %d, "
+ "have retried: %d, mirror count: %d\n",
+ PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj),
+ lio->lis_mirror_index, index, io->ci_ndelay_tried,
+ comp->lo_mirror_count);
+
+ lio->lis_mirror_index = index;
+
+ /*
+ * FLR: if all mirrors have been tried once, most likely the network
+ * of this client has been partitioned. We should relinquish CPU for
+ * a while before trying again.
+ */
+ if (io->ci_ndelay && io->ci_ndelay_tried > 0 &&
+ (io->ci_ndelay_tried % comp->lo_mirror_count == 0)) {
+ schedule_timeout_interruptible(cfs_time_seconds(1) / 100);
+ if (signal_pending(current))
+ RETURN(-EINTR);
+
+ /**
+ * we'd set ci_tried_all_mirrors to turn off fast mirror
+ * switching for read after we've tried all mirrors several
+ * rounds.
+ */
+ io->ci_tried_all_mirrors = io->ci_ndelay_tried %
+ (comp->lo_mirror_count * 4) == 0;
+ }
+ ++io->ci_ndelay_tried;
+
+ CDEBUG(D_VFSTRACE, "use %sdelayed RPC state for this IO\n",
+ io->ci_ndelay ? "non-" : "");
+
+ RETURN(0);
+}
+
+static int lov_io_slice_init(struct lov_io *lio,
+ struct lov_object *obj, struct cl_io *io)
+{
+ int index;
+ int result = 0;
+ ENTRY;
+
+ io->ci_result = 0;
+ lio->lis_object = obj;
+ lio->lis_cached_entry = LIS_CACHE_ENTRY_NONE;
+
+ switch (io->ci_type) {
+ case CIT_READ:
+ case CIT_WRITE:
+ lio->lis_pos = io->u.ci_rw.crw_pos;
+ lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
+ lio->lis_io_endpos = lio->lis_endpos;
+ if (cl_io_is_append(io)) {
+ LASSERT(io->ci_type == CIT_WRITE);
+
+ /*
+ * If there is a LOV EA hole, then we may not be able
+ * to locate the current file-tail exactly.
+ */
+ if (unlikely(obj->lo_lsm->lsm_entries[0]->lsme_pattern &
+ LOV_PATTERN_F_HOLE))
+ GOTO(out, result = -EIO);
+
+ lio->lis_pos = 0;
+ lio->lis_endpos = OBD_OBJECT_EOF;
+ }
+ break;
+
+ case CIT_SETATTR:
+ if (cl_io_is_fallocate(io)) {
+ lio->lis_pos = io->u.ci_setattr.sa_falloc_offset;
+ lio->lis_endpos = io->u.ci_setattr.sa_falloc_end;
+ } else if (cl_io_is_trunc(io)) {
+ lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size;
+ lio->lis_endpos = OBD_OBJECT_EOF;
+ } else {
+ lio->lis_pos = 0;
+ lio->lis_endpos = OBD_OBJECT_EOF;
+ }
+ break;
+
+ case CIT_DATA_VERSION:
+ lio->lis_pos = 0;
+ lio->lis_endpos = OBD_OBJECT_EOF;
+ break;
+
+ case CIT_FAULT: {
+ pgoff_t index = io->u.ci_fault.ft_index;
+
+ lio->lis_pos = cl_offset(io->ci_obj, index);
+ lio->lis_endpos = cl_offset(io->ci_obj, index + 1);
+ break;
+ }
+
+ case CIT_FSYNC: {
+ lio->lis_pos = io->u.ci_fsync.fi_start;
+ lio->lis_endpos = io->u.ci_fsync.fi_end;
+ break;
+ }
+
+ case CIT_LADVISE: {
+ lio->lis_pos = io->u.ci_ladvise.li_start;
+ lio->lis_endpos = io->u.ci_ladvise.li_end;
+ break;
+ }
+
+ case CIT_LSEEK: {
+ lio->lis_pos = io->u.ci_lseek.ls_start;
+ lio->lis_endpos = OBD_OBJECT_EOF;
+ break;
+ }
+
+ case CIT_GLIMPSE:
+ lio->lis_pos = 0;
+ lio->lis_endpos = OBD_OBJECT_EOF;
+
+ if (lov_flr_state(obj) == LCM_FL_RDONLY &&
+ !OBD_FAIL_CHECK(OBD_FAIL_FLR_GLIMPSE_IMMUTABLE))
+ /* SoM is accurate, no need to glimpse */
+ GOTO(out, result = 1);
+ break;
+
+ case CIT_MISC:
+ lio->lis_pos = 0;
+ lio->lis_endpos = OBD_OBJECT_EOF;
+ break;
+
+ default:
+ LBUG();
+ }
+
+ /*
+ * CIT_MISC + ci_ignore_layout can identify the I/O from the OSC layer;
+ * it won't care about or access lov layout related info.
+ */
+ if (io->ci_ignore_layout && io->ci_type == CIT_MISC)
+ GOTO(out, result = 0);
+
+ LASSERT(obj->lo_lsm != NULL);
+
+ result = lov_io_mirror_init(lio, obj, io);
+ if (result)
+ GOTO(out, result);
+
+ /* check if it needs to instantiate layout */
+ if (!(io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io) ||
+ cl_io_is_fallocate(io) ||
+ (cl_io_is_trunc(io) && io->u.ci_setattr.sa_attr.lvb_size > 0)))
+ GOTO(out, result = 0);
+
+ /*
+ * for truncate, it only needs to instantiate the components
+ * before the truncated size.
+ */
+ if (cl_io_is_trunc(io)) {
+ io->ci_write_intent.e_start = 0;
+ /* for writes, e_end is endpos, the location of the file
+ * pointer after the write is completed, so it is not accessed.
+ * For truncate, 'end' is the size, and *is* accessed.
+ * In other words, writes are [start, end), but truncate is
+ * [start, size], where both are included. So add 1 to the
+ * size when creating the write intent to account for this.
+ */ + io->ci_write_intent.e_end = + io->u.ci_setattr.sa_attr.lvb_size + 1; + } else { + io->ci_write_intent.e_start = lio->lis_pos; + io->ci_write_intent.e_end = lio->lis_endpos; + } + + index = 0; + lov_foreach_io_layout(index, lio, &io->ci_write_intent) { + if (!lsm_entry_inited(obj->lo_lsm, index)) { + io->ci_need_write_intent = 1; + break; + } + } + + if (io->ci_need_write_intent && io->ci_designated_mirror > 0) { + /* + * REINT_SYNC RPC has already tried to instantiate all of the + * components involved, obviously it didn't succeed. Skip this + * mirror for now. The server won't be able to figure out + * which mirror it should instantiate components + */ + CERROR(DFID": trying to instantiate components for designated " + "I/O, file state: %d\n", + PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj)); + + io->ci_need_write_intent = 0; + GOTO(out, result = -EIO); + } + + if (io->ci_need_write_intent) + GOTO(out, result = 1); + + EXIT; + +out: + return result; +} + +static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_object *lov = cl2lov(ios->cis_obj); + struct lov_io_sub *sub; + + ENTRY; + LASSERT(list_empty(&lio->lis_active)); + + while ((sub = list_first_entry_or_null(&lio->lis_subios, + struct lov_io_sub, + sub_list)) != NULL) { + list_del_init(&sub->sub_list); + lio->lis_nr_subios--; + + lov_io_sub_fini(env, lio, sub); + lov_sub_free(lio, sub); + } + LASSERT(lio->lis_nr_subios == 0); + + LASSERT(atomic_read(&lov->lo_active_ios) > 0); + if (atomic_dec_and_test(&lov->lo_active_ios)) + wake_up(&lov->lo_waitq); + EXIT; +} + +static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, + loff_t start, loff_t end) +{ + struct cl_io *io = &sub->sub_io; + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct cl_io *parent = lio->lis_cl.cis_io; + int index = lov_comp_entry(sub->sub_subio_index); + int stripe = lov_comp_stripe(sub->sub_subio_index); + + switch (io->ci_type) { + case CIT_SETATTR: { + io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr; + io->u.ci_setattr.sa_attr_flags = + parent->u.ci_setattr.sa_attr_flags; + io->u.ci_setattr.sa_avalid = parent->u.ci_setattr.sa_avalid; + io->u.ci_setattr.sa_xvalid = parent->u.ci_setattr.sa_xvalid; + io->u.ci_setattr.sa_falloc_mode = + parent->u.ci_setattr.sa_falloc_mode; + io->u.ci_setattr.sa_stripe_index = stripe; + io->u.ci_setattr.sa_parent_fid = + parent->u.ci_setattr.sa_parent_fid; + /* For SETATTR(fallocate) pass the subtype to lower IO */ + io->u.ci_setattr.sa_subtype = parent->u.ci_setattr.sa_subtype; + if (cl_io_is_fallocate(io)) { + io->u.ci_setattr.sa_falloc_offset = start; + io->u.ci_setattr.sa_falloc_end = end; + io->u.ci_setattr.sa_falloc_uid = + parent->u.ci_setattr.sa_falloc_uid; + io->u.ci_setattr.sa_falloc_gid = + parent->u.ci_setattr.sa_falloc_gid; + } + if (cl_io_is_trunc(io)) { + loff_t new_size = parent->u.ci_setattr.sa_attr.lvb_size; + + new_size = lov_size_to_stripe(lsm, index, new_size, + stripe); + io->u.ci_setattr.sa_attr.lvb_size = new_size; + } + lov_lsm2layout(lsm, lsm->lsm_entries[index], + &io->u.ci_setattr.sa_layout); + break; + } + case CIT_DATA_VERSION: { + io->u.ci_data_version.dv_data_version = 0; + io->u.ci_data_version.dv_flags = + parent->u.ci_data_version.dv_flags; + break; + } + case CIT_FAULT: { + struct cl_object *obj = parent->ci_obj; + loff_t off = cl_offset(obj, parent->u.ci_fault.ft_index); + + io->u.ci_fault = parent->u.ci_fault; + off = lov_size_to_stripe(lsm, index, off, 
stripe); + io->u.ci_fault.ft_index = cl_index(obj, off); + break; + } + case CIT_FSYNC: { + io->u.ci_fsync.fi_start = start; + io->u.ci_fsync.fi_end = end; + io->u.ci_fsync.fi_fid = parent->u.ci_fsync.fi_fid; + io->u.ci_fsync.fi_mode = parent->u.ci_fsync.fi_mode; + break; + } + case CIT_READ: + case CIT_WRITE: { + io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent); + io->ci_tried_all_mirrors = parent->ci_tried_all_mirrors; + if (cl_io_is_append(parent)) { + io->u.ci_wr.wr_append = 1; + } else { + io->u.ci_rw.crw_pos = start; + io->u.ci_rw.crw_count = end - start; + } + break; + } + case CIT_LADVISE: { + io->u.ci_ladvise.li_start = start; + io->u.ci_ladvise.li_end = end; + io->u.ci_ladvise.li_fid = parent->u.ci_ladvise.li_fid; + io->u.ci_ladvise.li_advice = parent->u.ci_ladvise.li_advice; + io->u.ci_ladvise.li_flags = parent->u.ci_ladvise.li_flags; + break; + } + case CIT_LSEEK: { + io->u.ci_lseek.ls_start = start; + io->u.ci_lseek.ls_whence = parent->u.ci_lseek.ls_whence; + io->u.ci_lseek.ls_result = parent->u.ci_lseek.ls_result; + break; + } + case CIT_GLIMPSE: + case CIT_MISC: + default: + break; + } +} + +static loff_t lov_offset_mod(loff_t val, int delta) +{ + if (val != OBD_OBJECT_EOF) + val += delta; + return val; +} + +static int lov_io_add_sub(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub, u64 start, u64 end) +{ + int rc; + + end = lov_offset_mod(end, 1); + lov_io_sub_inherit(sub, lio, start, end); + rc = cl_io_iter_init(sub->sub_env, &sub->sub_io); + if (rc != 0) { + cl_io_iter_fini(sub->sub_env, &sub->sub_io); + return rc; + } + + list_add_tail(&sub->sub_linkage, &lio->lis_active); + + return rc; +} +static int lov_io_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + bool is_trunc = cl_io_is_trunc(ios->cis_io); + struct lov_io_sub *sub; + struct lu_extent ext; + int index; + int rc = 0; + + ENTRY; + + ext.e_start = lio->lis_pos; + ext.e_end = lio->lis_endpos; + + if (is_trunc) { + OBD_ALLOC_PTR_ARRAY(lio->lis_trunc_stripe_index, + lio->lis_object->u.composite.lo_entry_count); + if (lio->lis_trunc_stripe_index == NULL) + RETURN(-ENOMEM); + } + + lov_foreach_io_layout(index, lio, &ext) { + struct lov_layout_entry *le = lov_entry(lio->lis_object, index); + struct lov_layout_raid0 *r0 = &le->lle_raid0; + u64 start; + u64 end; + int stripe; + bool tested_trunc_stripe = false; + + if (is_trunc) + lio->lis_trunc_stripe_index[index] = -1; + + CDEBUG(D_VFSTRACE, "component[%d] flags %#x\n", + index, lsm->lsm_entries[index]->lsme_flags); + if (!lsm_entry_inited(lsm, index)) { + /* + * Read from uninitialized components should return + * zero filled pages. + */ + continue; + } + + if (lsm_entry_is_foreign(lsm, index)) + continue; + + if (!le->lle_valid && !ios->cis_io->ci_designated_mirror) { + CERROR("I/O to invalid component: %d, mirror: %d\n", + index, lio->lis_mirror_index); + RETURN(-EIO); + } + + for (stripe = 0; stripe < r0->lo_nr; stripe++) { + if (!lov_stripe_intersects(lsm, index, stripe, + &ext, &start, &end)) + continue; + + if (unlikely(!r0->lo_sub[stripe])) { + if (ios->cis_io->ci_type == CIT_READ || + ios->cis_io->ci_type == CIT_WRITE || + ios->cis_io->ci_type == CIT_FAULT) + RETURN(-EIO); + + continue; + } + + if (is_trunc && !tested_trunc_stripe) { + int prev; + u64 tr_start; + + prev = (stripe == 0) ? 
r0->lo_nr - 1 :
+ stripe - 1;
+ /**
+ * Only involve the previous stripe if the
+ * truncate in this component is at the
+ * beginning of this stripe.
+ */
+ tested_trunc_stripe = true;
+ if (ext.e_start < lsm->lsm_entries[index]->
+ lsme_extent.e_start) {
+ /* need previous stripe involvement */
+ lio->lis_trunc_stripe_index[index] = prev;
+ } else {
+ tr_start = ext.e_start;
+ tr_start = lov_do_div64(tr_start,
+ stripe_width(lsm, index));
+ /* tr_start %= stripe_swidth */
+ if (tr_start == stripe * lsm->
+ lsm_entries[index]->
+ lsme_stripe_size)
+ lio->lis_trunc_stripe_index[index] = prev;
+ }
+ }
+
+ /* if the last stripe is the trunc stripeno */
+ if (is_trunc && lio->lis_trunc_stripe_index[index] == stripe)
+ lio->lis_trunc_stripe_index[index] = -1;
+
+ sub = lov_sub_get(env, lio,
+ lov_comp_index(index, stripe));
+ if (IS_ERR(sub))
+ return PTR_ERR(sub);
+
+ rc = lov_io_add_sub(env, lio, sub, start, end);
+ if (rc != 0)
+ break;
+ }
+ if (rc != 0)
+ break;
+
+ if (is_trunc && lio->lis_trunc_stripe_index[index] != -1) {
+ stripe = lio->lis_trunc_stripe_index[index];
+ if (unlikely(!r0->lo_sub[stripe])) {
+ lio->lis_trunc_stripe_index[index] = -1;
+ continue;
+ }
+ sub = lov_sub_get(env, lio,
+ lov_comp_index(index, stripe));
+ if (IS_ERR(sub))
+ return PTR_ERR(sub);
+
+ /**
+ * the prev sub could be used by another truncate, so we'd
+ * skip it. LU-14128 happens when an expanding truncate +
+ * read gets a wrong kms.
+ */
+ if (!list_empty(&sub->sub_linkage)) {
+ lio->lis_trunc_stripe_index[index] = -1;
+ continue;
+ }
+
+ (void)lov_stripe_intersects(lsm, index, stripe, &ext,
+ &start, &end);
+ rc = lov_io_add_sub(env, lio, sub, start, end);
+ if (rc != 0)
+ break;
+
+ }
+ }
+ RETURN(rc);
+}
+
+static int lov_io_rw_iter_init(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct lov_io *lio = cl2lov_io(env, ios);
+ struct cl_io *io = ios->cis_io;
+ struct lov_stripe_md_entry *lse;
+ loff_t start = io->u.ci_rw.crw_pos;
+ loff_t next;
+ int index;
+
+ LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+ ENTRY;
+
+ if (cl_io_is_append(io))
+ RETURN(lov_io_iter_init(env, ios));
+
+ index = lov_io_layout_at(lio, io->u.ci_rw.crw_pos);
+ if (index < 0) { /* non-existing layout component */
+ if (io->ci_type == CIT_READ) {
+ /*
+ * TODO: it needs to detect the next component and
+ * then set the next pos
+ */
+ io->ci_continue = 0;
+
+ RETURN(lov_io_iter_init(env, ios));
+ }
+
+ RETURN(-ENODATA);
+ }
+
+ if (!lov_entry(lio->lis_object, index)->lle_valid &&
+ !io->ci_designated_mirror)
+ RETURN(io->ci_type == CIT_READ ?
-EAGAIN : -EIO); + + lse = lov_lse(lio->lis_object, index); + + if (lsme_is_foreign(lse)) + RETURN(-EINVAL); + + next = MAX_LFS_FILESIZE; + if (lse->lsme_stripe_count > 1) { + unsigned long ssize = lse->lsme_stripe_size; + + lov_do_div64(start, ssize); + next = (start + 1) * ssize; + if (next <= start * ssize) + next = MAX_LFS_FILESIZE; + } + + LASSERTF(io->u.ci_rw.crw_pos >= lse->lsme_extent.e_start, + "pos %lld, [%lld, %lld)\n", io->u.ci_rw.crw_pos, + lse->lsme_extent.e_start, lse->lsme_extent.e_end); + next = min_t(__u64, next, lse->lsme_extent.e_end); + next = min_t(loff_t, next, lio->lis_io_endpos); + + io->ci_continue = next < lio->lis_io_endpos; + io->u.ci_rw.crw_count = next - io->u.ci_rw.crw_pos; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + CDEBUG(D_VFSTRACE, + "stripe: %llu chunk: [%llu, %llu) %llu, %zd\n", + (__u64)start, lio->lis_pos, lio->lis_endpos, + (__u64)lio->lis_io_endpos, io->u.ci_rw.crw_count); + + /* + * XXX The following call should be optimized: we know, that + * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe. + */ + RETURN(lov_io_iter_init(env, ios)); +} + +static int lov_io_setattr_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *io = ios->cis_io; + int index; + ENTRY; + + if (cl_io_is_trunc(io) && lio->lis_pos > 0) { + index = lov_io_layout_at(lio, lio->lis_pos - 1); + /* no entry found for such offset */ + if (index < 0) + RETURN(io->ci_result = -ENODATA); + } + + RETURN(lov_io_iter_init(env, ios)); +} + +static int lov_io_call(const struct lu_env *env, struct lov_io *lio, + int (*iofunc)(const struct lu_env *, struct cl_io *)) +{ + struct cl_io *parent = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + int rc = 0; + + ENTRY; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + rc = iofunc(sub->sub_env, &sub->sub_io); + if (rc) + break; + + if (parent->ci_result == 0) + parent->ci_result = sub->sub_io.ci_result; + } + RETURN(rc); +} + +static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios) +{ + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock)); +} + +static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios) +{ + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start)); +} + +static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io) +{ + ENTRY; + /* + * It's possible that lov_io_start() wasn't called against this + * sub-io, either because previous sub-io failed, or upper layer + * completed IO. + */ + if (io->ci_state == CIS_IO_GOING) + cl_io_end(env, io); + else + io->ci_state = CIS_IO_FINISHED; + RETURN(0); +} + +static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_iter_fini(env, io); + RETURN(0); +} + +static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_unlock(env, io); + RETURN(0); +} + +static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + int rc; + + /* Before ending each i/o, we must set lis_cached_entry to tell the + * next i/o not to use stale cached lis information. 
+ */ + cl2lov_io(env, ios)->lis_cached_entry = LIS_CACHE_ENTRY_NONE; + + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); + LASSERT(rc == 0); +} + +static void +lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *parent = lio->lis_cl.cis_io; + struct cl_data_version_io *pdv = &parent->u.ci_data_version; + struct lov_io_sub *sub; + + ENTRY; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + struct cl_data_version_io *sdv = &sub->sub_io.u.ci_data_version; + + lov_io_end_wrapper(sub->sub_env, &sub->sub_io); + + pdv->dv_data_version += sdv->dv_data_version; + if (pdv->dv_layout_version > sdv->dv_layout_version) + pdv->dv_layout_version = sdv->dv_layout_version; + + if (parent->ci_result == 0) + parent->ci_result = sub->sub_io.ci_result; + } + + EXIT; +} + +static void lov_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + int rc; + + ENTRY; + + if (lio->lis_trunc_stripe_index != NULL) + OBD_FREE_PTR_ARRAY(lio->lis_trunc_stripe_index, + lio->lis_object->u.composite.lo_entry_count); + lio->lis_trunc_stripe_index = NULL; + + rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); + LASSERT(rc == 0); + while (!list_empty(&lio->lis_active)) + list_del_init(lio->lis_active.next); + EXIT; +} + +static void lov_io_unlock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + int rc; + + ENTRY; + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); + LASSERT(rc == 0); + EXIT; +} + +static int lov_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_object *loo = lio->lis_object; + struct cl_object *obj = lov2cl(loo); + struct lov_layout_raid0 *r0; + struct lov_io_sub *sub; + loff_t offset; + loff_t suboff; + pgoff_t ra_end; + unsigned int pps; /* pages per stripe */ + int stripe; + int index; + int rc; + ENTRY; + + offset = cl_offset(obj, start); + index = lov_io_layout_at(lio, offset); + if (index < 0 || !lsm_entry_inited(loo->lo_lsm, index) || + lsm_entry_is_foreign(loo->lo_lsm, index)) + RETURN(-ENODATA); + + /* avoid readahead to expand to stale components */ + if (!lov_entry(loo, index)->lle_valid) + RETURN(-EIO); + + stripe = lov_stripe_number(loo->lo_lsm, index, offset); + + r0 = lov_r0(loo, index); + if (unlikely(!r0->lo_sub[stripe])) + RETURN(-EIO); + + sub = lov_sub_get(env, lio, lov_comp_index(index, stripe)); + if (IS_ERR(sub)) + RETURN(PTR_ERR(sub)); + + lov_stripe_offset(loo->lo_lsm, index, offset, stripe, &suboff); + rc = cl_io_read_ahead(sub->sub_env, &sub->sub_io, + cl_index(lovsub2cl(r0->lo_sub[stripe]), suboff), + ra); + + CDEBUG(D_READA, DFID " cra_end = %lu, stripes = %d, rc = %d\n", + PFID(lu_object_fid(lov2lu(loo))), ra->cra_end_idx, + r0->lo_nr, rc); + if (rc != 0) + RETURN(rc); + + /** + * Adjust the stripe index by layout of comp. ra->cra_end is the + * maximum page index covered by an underlying DLM lock. + * This function converts cra_end from stripe level to file level, and + * make sure it's not beyond stripe and component boundary. 
+ */ + + /* cra_end is stripe level, convert it into file level */ + ra_end = ra->cra_end_idx; + if (ra_end != CL_PAGE_EOF) + ra->cra_end_idx = lov_stripe_pgoff(loo->lo_lsm, index, + ra_end, stripe); + + /* boundary of current component */ + ra_end = cl_index(obj, (loff_t)lov_io_extent(lio, index)->e_end); + if (ra_end != CL_PAGE_EOF && ra->cra_end_idx >= ra_end) + ra->cra_end_idx = ra_end - 1; + + if (r0->lo_nr == 1) /* single stripe file */ + RETURN(0); + + pps = lov_lse(loo, index)->lsme_stripe_size >> PAGE_SHIFT; + + CDEBUG(D_READA, DFID " max_index = %lu, pps = %u, index = %d, " + "stripe_size = %u, stripe no = %u, start index = %lu\n", + PFID(lu_object_fid(lov2lu(loo))), ra->cra_end_idx, pps, index, + lov_lse(loo, index)->lsme_stripe_size, stripe, start); + + /* never exceed the end of the stripe */ + ra->cra_end_idx = min_t(pgoff_t, ra->cra_end_idx, + start + pps - start % pps - 1); + RETURN(0); +} + +int lov_io_lru_reserve(const struct lu_env *env, + const struct cl_io_slice *ios, loff_t pos, size_t bytes) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct lov_io_sub *sub; + struct lu_extent ext; + int index; + int rc = 0; + + ENTRY; + + ext.e_start = pos; + ext.e_end = pos + bytes; + lov_foreach_io_layout(index, lio, &ext) { + struct lov_layout_entry *le = lov_entry(lio->lis_object, index); + struct lov_layout_raid0 *r0 = &le->lle_raid0; + u64 start; + u64 end; + int stripe; + + if (!lsm_entry_inited(lsm, index)) + continue; + + if (!le->lle_valid && !ios->cis_io->ci_designated_mirror) { + CERROR(DFID": I/O to invalid component: %d, mirror: %d\n", + PFID(lu_object_fid(lov2lu(lio->lis_object))), + index, lio->lis_mirror_index); + RETURN(-EIO); + } + + for (stripe = 0; stripe < r0->lo_nr; stripe++) { + if (!lov_stripe_intersects(lsm, index, stripe, + &ext, &start, &end)) + continue; + + if (unlikely(!r0->lo_sub[stripe])) + RETURN(-EIO); + + sub = lov_sub_get(env, lio, + lov_comp_index(index, stripe)); + if (IS_ERR(sub)) + return PTR_ERR(sub); + + rc = cl_io_lru_reserve(sub->sub_env, &sub->sub_io, start, + end - start + 1); + if (rc != 0) + RETURN(rc); + } + } + + RETURN(0); +} + +/** + * lov implementation of cl_operations::cio_submit() method. It takes a list + * of pages in \a queue, splits it into per-stripe sub-lists, invokes + * cl_io_submit() on underlying devices to submit sub-lists, and then splices + * everything back. + * + * Major complication of this function is a need to handle memory cleansing: + * cl_io_submit() is called to write out pages as a part of VM memory + * reclamation, and hence it may not fail due to memory shortages (system + * dead-locks otherwise). To deal with this, some resources (sub-lists, + * sub-environment, etc.) are allocated per-device on "startup" (i.e., in a + * not-memory cleansing context), and in case of memory shortage, these + * pre-allocated resources are used by lov_io_submit() under + * lov_device::ld_mutex mutex. 
+ */ +static int lov_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct cl_page_list *qin = &queue->c2_qin; + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + struct cl_page_list *plist = &lov_env_info(env)->lti_plist; + struct cl_page *page = cl_page_list_first(qin); + struct cl_page *tmp; + bool dio = false; + int index; + int rc = 0; + ENTRY; + + if (page->cp_type == CPT_TRANSIENT) + dio = true; + + cl_page_list_init(plist); + while (qin->pl_nr > 0) { + struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q; + + page = cl_page_list_first(qin); + if (lov_page_is_empty(page)) { + cl_page_list_move(&queue->c2_qout, qin, page); + + /* + * it could only be mirror read to get here therefore + * the pages will be transient. We don't care about + * the return code of cl_page_prep() at all. + */ + (void) cl_page_prep(env, ios->cis_io, page, crt); + cl_page_completion(env, page, crt, 0); + continue; + } + + cl_2queue_init(cl2q); + cl_page_list_move(&cl2q->c2_qin, qin, page); + + index = page->cp_lov_index; + /* DIO is already split by stripe */ + if (!dio) { + cl_page_list_for_each_safe(page, tmp, qin) { + /* this page is not on this stripe */ + if (index != page->cp_lov_index) + continue; + + cl_page_list_move(&cl2q->c2_qin, qin, page); + } + } else { + cl_page_list_splice(qin, &cl2q->c2_qin); + } + + sub = lov_sub_get(env, lio, index); + if (!IS_ERR(sub)) { + rc = cl_io_submit_rw(sub->sub_env, &sub->sub_io, + crt, cl2q); + } else { + rc = PTR_ERR(sub); + } + + cl_page_list_splice(&cl2q->c2_qin, plist); + cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout); + cl_2queue_fini(env, cl2q); + + if (rc != 0) + break; + } + + cl_page_list_splice(plist, qin); + cl_page_list_fini(env, plist); + + RETURN(rc); +} + +static int lov_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb) +{ + struct cl_page_list *plist = &lov_env_info(env)->lti_plist; + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + struct cl_page *page; + int rc = 0; + ENTRY; + + if (lio->lis_nr_subios == 1) { + int idx = lio->lis_single_subio_index; + + LASSERT(!lov_page_is_empty(cl_page_list_first(queue))); + + sub = lov_sub_get(env, lio, idx); + LASSERT(!IS_ERR(sub)); + LASSERT(sub == &lio->lis_single_subio); + rc = cl_io_commit_async(sub->sub_env, &sub->sub_io, queue, + from, to, cb); + RETURN(rc); + } + + cl_page_list_init(plist); + while (queue->pl_nr > 0) { + int stripe_to = to; + int index; + + LASSERT(plist->pl_nr == 0); + page = cl_page_list_first(queue); + LASSERT(!lov_page_is_empty(page)); + + cl_page_list_move(plist, queue, page); + + index = page->cp_lov_index; + while (queue->pl_nr > 0) { + page = cl_page_list_first(queue); + if (index != page->cp_lov_index) + break; + + cl_page_list_move(plist, queue, page); + } + + if (queue->pl_nr > 0) /* still has more pages */ + stripe_to = PAGE_SIZE; + + sub = lov_sub_get(env, lio, index); + if (!IS_ERR(sub)) { + rc = cl_io_commit_async(sub->sub_env, &sub->sub_io, + plist, from, stripe_to, cb); + } else { + rc = PTR_ERR(sub); + break; + } + + if (plist->pl_nr > 0) /* short write */ + break; + + from = 0; + + if (lov_comp_entry(index) != + lov_comp_entry(page->cp_lov_index)) + cl_io_extent_release(sub->sub_env, &sub->sub_io); + } + + /* for error case, add the page back into the qin list */ + LASSERT(ergo(rc == 0, plist->pl_nr == 0)); + while (plist->pl_nr > 0) { + /* error 
occurred, add the uncommitted pages back into queue */ + page = cl_page_list_last(plist); + cl_page_list_move_head(queue, plist, page); + } + + RETURN(rc); +} + +static int lov_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_fault_io *fio; + struct lov_io *lio; + struct lov_io_sub *sub; + loff_t offset; + int entry; + int stripe; + + ENTRY; + + fio = &ios->cis_io->u.ci_fault; + lio = cl2lov_io(env, ios); + + /** + * LU-14502: ft_page could be an existing cl_page associated with + * the vmpage covering the fault index, and the page may still + * refer to another mirror of an old IO. + */ + if (lov_is_flr(lio->lis_object)) { + offset = cl_offset(ios->cis_obj, fio->ft_index); + entry = lov_io_layout_at(lio, offset); + if (entry < 0) { + CERROR(DFID": page fault index %lu invalid component: " + "%d, mirror: %d\n", + PFID(lu_object_fid(&ios->cis_obj->co_lu)), + fio->ft_index, entry, + lio->lis_mirror_index); + RETURN(-EIO); + } + stripe = lov_stripe_number(lio->lis_object->lo_lsm, + entry, offset); + + if (fio->ft_page->cp_lov_index != + lov_comp_index(entry, stripe)) { + CDEBUG(D_INFO, DFID": page fault at index %lu, " + "at mirror %u comp entry %u stripe %u, " + "been used with comp entry %u stripe %u\n", + PFID(lu_object_fid(&ios->cis_obj->co_lu)), + fio->ft_index, lio->lis_mirror_index, + entry, stripe, + lov_comp_entry(fio->ft_page->cp_lov_index), + lov_comp_stripe(fio->ft_page->cp_lov_index)); + + fio->ft_page->cp_lov_index = + lov_comp_index(entry, stripe); + } + } + + sub = lov_sub_get(env, lio, fio->ft_page->cp_lov_index); + sub->sub_io.u.ci_fault.ft_nob = fio->ft_nob; + + RETURN(lov_io_start(env, ios)); +} + +static int lov_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *parent = ios->cis_io; + struct lov_io_sub *sub; + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + + ENTRY; + + if (cl_io_is_fallocate(parent)) { + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + loff_t size = parent->u.ci_setattr.sa_attr.lvb_size; + int index = lov_comp_entry(sub->sub_subio_index); + int stripe = lov_comp_stripe(sub->sub_subio_index); + + size = lov_size_to_stripe(lsm, index, size, stripe); + sub->sub_io.u.ci_setattr.sa_attr.lvb_size = size; + sub->sub_io.u.ci_setattr.sa_avalid = + parent->u.ci_setattr.sa_avalid; + } + } + + RETURN(lov_io_start(env, ios)); +} + +static void lov_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + unsigned int *written = &ios->cis_io->u.ci_fsync.fi_nr_written; + ENTRY; + + *written = 0; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + struct cl_io *subio = &sub->sub_io; + + lov_io_end_wrapper(sub->sub_env, subio); + + if (subio->ci_result == 0) + *written += subio->u.ci_fsync.fi_nr_written; + } + RETURN_EXIT; +} + +static void lov_io_lseek_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *io = lio->lis_cl.cis_io; + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct lov_io_sub *sub; + loff_t offset = -ENXIO; + __u64 hole_off = 0; + bool seek_hole = io->u.ci_lseek.ls_whence == SEEK_HOLE; + + ENTRY; + + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + struct cl_io *subio = &sub->sub_io; + int index = lov_comp_entry(sub->sub_subio_index); + int stripe = lov_comp_stripe(sub->sub_subio_index); + loff_t sub_off, lov_off; + 
__u64 comp_end = lsm->lsm_entries[index]->lsme_extent.e_end;
+
+ lov_io_end_wrapper(sub->sub_env, subio);
+
+ if (io->ci_result == 0)
+ io->ci_result = sub->sub_io.ci_result;
+
+ if (io->ci_result)
+ continue;
+
+ CDEBUG(D_INFO, DFID": entry %x stripe %u: SEEK_%s from %lld\n",
+ PFID(lu_object_fid(lov2lu(lio->lis_object))),
+ index, stripe, seek_hole ? "HOLE" : "DATA",
+ subio->u.ci_lseek.ls_start);
+
+ /* first subio with positive result is what we need */
+ sub_off = subio->u.ci_lseek.ls_result;
+ /* Expected error, offset is out of stripe file size */
+ if (sub_off == -ENXIO)
+ continue;
+ /* Any other errors are not expected with ci_result == 0 */
+ if (sub_off < 0) {
+ CDEBUG(D_INFO, "unexpected error: rc = %lld\n",
+ sub_off);
+ io->ci_result = sub_off;
+ continue;
+ }
+ lov_off = lov_stripe_size(lsm, index, sub_off + 1, stripe) - 1;
+ if (lov_off < 0) {
+ /* the only way to get a negative lov_off here is a result
+ * that is too big. Return -EOVERFLOW then.
+ */
+ io->ci_result = -EOVERFLOW;
+ CDEBUG(D_INFO, "offset %llu is too big: rc = %d\n",
+ (u64)lov_off, io->ci_result);
+ continue;
+ }
+ if (lov_off < io->u.ci_lseek.ls_start) {
+ io->ci_result = -EINVAL;
+ CDEBUG(D_INFO, "offset %lld < start %lld: rc = %d\n",
+ sub_off, io->u.ci_lseek.ls_start, io->ci_result);
+ continue;
+ }
+ /* resulting offset can be out of component range if stripe
+ * object is full and its file size was returned as virtual
+ * hole start. Skip this result; the next component will give
+ * us the correct lseek result, but keep the possible hole
+ * offset in case there are no more components ahead.
+ */
+ if (lov_off >= comp_end) {
+ /* must be SEEK_HOLE case */
+ if (likely(seek_hole)) {
+ /* save comp end as potential hole offset */
+ hole_off = max_t(__u64, comp_end, hole_off);
+ } else {
+ io->ci_result = -EINVAL;
+ CDEBUG(D_INFO,
+ "off %lld >= comp_end %llu: rc = %d\n",
+ lov_off, comp_end, io->ci_result);
+ }
+ continue;
+ }
+
+ CDEBUG(D_INFO, "SEEK_%s: %lld->%lld/%lld: rc = %d\n",
+ seek_hole ?
"HOLE" : "DATA", + subio->u.ci_lseek.ls_start, sub_off, lov_off, + sub->sub_io.ci_result); + offset = min_t(__u64, offset, lov_off); + } + /* no result but some component returns hole as component end */ + if (seek_hole && offset == -ENXIO && hole_off > 0) + offset = hole_off; + + io->u.ci_lseek.ls_result = offset; + RETURN_EXIT; +} + +static const struct cl_io_operations lov_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_WRITE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_SETATTR] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_setattr_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_setattr_start, + .cio_end = lov_io_end + }, + [CIT_DATA_VERSION] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_fault_start, + .cio_end = lov_io_end + }, + [CIT_FSYNC] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_fsync_end + }, + [CIT_LADVISE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_LSEEK] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_lseek_end + }, + [CIT_GLIMPSE] = { + .cio_fini = lov_io_fini, + }, + [CIT_MISC] = { + .cio_fini = lov_io_fini + } + }, + .cio_read_ahead = lov_io_read_ahead, + .cio_lru_reserve = lov_io_lru_reserve, + .cio_submit = lov_io_submit, + .cio_commit_async = lov_io_commit_async, +}; + +/***************************************************************************** + * + * Empty lov io operations. + * + */ + +static void lov_empty_io_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_object *lov = cl2lov(ios->cis_obj); + ENTRY; + + if (atomic_dec_and_test(&lov->lo_active_ios)) + wake_up(&lov->lo_waitq); + EXIT; +} + +static int lov_empty_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + return -EBADF; +} + +static void lov_empty_impossible(const struct lu_env *env, + struct cl_io_slice *ios) +{ + LBUG(); +} + +#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible) + +/** + * An io operation vector for files without stripes. 
+ */ +static const struct cl_io_operations lov_empty_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_empty_io_fini, +#if 0 + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE +#endif + }, + [CIT_WRITE] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_SETATTR] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FAULT] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FSYNC] = { + .cio_fini = lov_empty_io_fini + }, + [CIT_LADVISE] = { + .cio_fini = lov_empty_io_fini + }, + [CIT_GLIMPSE] = { + .cio_fini = lov_empty_io_fini + }, + [CIT_MISC] = { + .cio_fini = lov_empty_io_fini + } + }, + .cio_submit = lov_empty_io_submit, + .cio_commit_async = LOV_EMPTY_IMPOSSIBLE +}; + +int lov_io_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_object *lov = cl2lov(obj); + int result; + + ENTRY; + + INIT_LIST_HEAD(&lio->lis_active); + result = lov_io_slice_init(lio, lov, io); + if (result) + GOTO(out, result); + + result = lov_io_subio_init(env, lio, io); + if (!result) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); + atomic_inc(&lov->lo_active_ios); + } + EXIT; +out: + io->ci_result = result < 0 ? result : 0; + return result; +} + +int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + int result; + ENTRY; + + lio->lis_object = lov; + switch (io->ci_type) { + default: + LBUG(); + case CIT_MISC: + case CIT_GLIMPSE: + case CIT_READ: + result = 0; + break; + case CIT_FSYNC: + case CIT_LADVISE: + case CIT_LSEEK: + case CIT_SETATTR: + case CIT_DATA_VERSION: + result = +1; + break; + case CIT_WRITE: + result = -EBADF; + break; + case CIT_FAULT: + result = -EFAULT; + CERROR("Page fault on a file without stripes: "DFID"\n", + PFID(lu_object_fid(&obj->co_lu))); + break; + } + if (result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); + atomic_inc(&lov->lo_active_ios); + } + + io->ci_result = result < 0 ? 
result : 0; + RETURN(result); +} + +int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + int result; + ENTRY; + + LASSERT(lov->lo_lsm != NULL); + lio->lis_object = lov; + + switch (io->ci_type) { + default: + LASSERTF(0, "invalid type %d\n", io->ci_type); + result = -EOPNOTSUPP; + break; + case CIT_GLIMPSE: + case CIT_MISC: + case CIT_FSYNC: + case CIT_LADVISE: + case CIT_DATA_VERSION: + result = 1; + break; + case CIT_SETATTR: + /* + * the truncate to 0 is managed by MDT: + * - in open, for open O_TRUNC + * - in setattr, for truncate + */ + /* + * the truncate is for size > 0 so triggers a restore, + * also trigger a restore for prealloc/punch + */ + if (cl_io_is_trunc(io) || cl_io_is_fallocate(io)) { + io->ci_restore_needed = 1; + result = -ENODATA; + } else + result = 1; + break; + case CIT_READ: + case CIT_WRITE: + case CIT_FAULT: + case CIT_LSEEK: + io->ci_restore_needed = 1; + result = -ENODATA; + break; + } + + if (result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); + atomic_inc(&lov->lo_active_ios); + } + + io->ci_result = result < 0 ? result : 0; + RETURN(result); +} + +/** + * Return the index in composite:lo_entries by the file offset + */ +int lov_io_layout_at(struct lov_io *lio, __u64 offset) +{ + struct lov_object *lov = lio->lis_object; + struct lov_layout_composite *comp = &lov->u.composite; + int start_index = 0; + int end_index = comp->lo_entry_count - 1; + int i; + + LASSERT(lov->lo_type == LLT_COMP); + + /* This is actual file offset so nothing can cover eof. */ + if (offset == LUSTRE_EOF) + return -1; + + if (lov_is_flr(lov)) { + struct lov_mirror_entry *lre; + + LASSERT(lio->lis_mirror_index >= 0); + + lre = &comp->lo_mirrors[lio->lis_mirror_index]; + start_index = lre->lre_start; + end_index = lre->lre_end; + } + + for (i = start_index; i <= end_index; i++) { + struct lov_layout_entry *lle = lov_entry(lov, i); + + LASSERT(!lsme_is_foreign(lle->lle_lsme)); + + if ((offset >= lle->lle_extent->e_start && + offset < lle->lle_extent->e_end) || + (offset == OBD_OBJECT_EOF && + lle->lle_extent->e_end == OBD_OBJECT_EOF)) + return i; + } + + return -1; +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_lock.c b/drivers/staging/lustrefsx/lustre/lov/lov_lock.c new file mode 100644 index 0000000000000..40777f3921586 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_lock.c @@ -0,0 +1,382 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_lock for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lov lock operations. + * + */ + +static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env, + const struct cl_lock *parent, + struct lov_lock_sub *lls) +{ + struct lov_sublock_env *subenv; + struct lov_io *lio = lov_env_io(env); + struct cl_io *io = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + + subenv = &lov_env_session(env)->ls_subenv; + + /* + * FIXME: We tend to use the subio's env & io to call the sublock + * lock operations because osc lock sometimes stores some control + * variables in thread's IO infomation(Now only lockless information). + * However, if the lock's host(object) is different from the object + * for current IO, we have no way to get the subenv and subio because + * they are not initialized at all. As a temp fix, in this case, + * we still borrow the parent's env to call sublock operations. + */ + if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) { + subenv->lse_env = env; + subenv->lse_io = io; + } else { + sub = lov_sub_get(env, lio, lls->sub_index); + if (!IS_ERR(sub)) { + subenv->lse_env = sub->sub_env; + subenv->lse_io = &sub->sub_io; + } else { + subenv = (void *)sub; + } + } + return subenv; +} + +static int lov_sublock_init(const struct lu_env *env, + const struct cl_lock *parent, + struct lov_lock_sub *lls) +{ + struct lov_sublock_env *subenv; + int result; + + ENTRY; + + subenv = lov_sublock_env_get(env, parent, lls); + if (!IS_ERR(subenv)) { + result = cl_lock_init(subenv->lse_env, &lls->sub_lock, + subenv->lse_io); + } else { + /* error occurs. */ + result = PTR_ERR(subenv); + } + RETURN(result); +} + +/** + * Creates sub-locks for a given lov_lock for the first time. + * + * Goes through all sub-objects of top-object, and creates sub-locks on every + * sub-object intersecting with top-lock extent. This is complicated by the + * fact that top-lock (that is being created) can be accessed concurrently + * through already created sub-locks (possibly shared with other top-locks). + */ +static struct lov_lock *lov_lock_sub_init(const struct lu_env *env, + const struct cl_io *io, + const struct cl_object *obj, + struct cl_lock *lock) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + bool is_trunc = cl_io_is_trunc(io); + struct lov_lock *lovlck; + struct lu_extent ext; + loff_t start; + loff_t end; + int result = 0; + int i; + int index; + int nr; + + ENTRY; + + LASSERT(ergo(is_trunc, lio->lis_trunc_stripe_index != NULL)); + + ext.e_start = cl_offset(obj, lock->cll_descr.cld_start); + if (lock->cll_descr.cld_end == CL_PAGE_EOF) + ext.e_end = OBD_OBJECT_EOF; + else + ext.e_end = cl_offset(obj, lock->cll_descr.cld_end + 1); + + nr = 0; + lov_foreach_io_layout(index, lio, &ext) { + struct lov_layout_raid0 *r0 = lov_r0(lov, index); + + for (i = 0; i < r0->lo_nr; i++) { + if (likely(r0->lo_sub[i])) {/* spare layout */ + if (lov_stripe_intersects(lov->lo_lsm, index, i, &ext, &start, &end) || + (is_trunc && i == lio->lis_trunc_stripe_index[index])) + nr++; + } + } + } + /** + * Aggressive lock request (from cl_setattr_ost) which asks for + * [eof, -1) lock, could come across uninstantiated layout extent, + * hence a 0 nr is possible. 
+ */ + + OBD_ALLOC_LARGE(lovlck, offsetof(struct lov_lock, lls_sub[nr])); + if (!lovlck) + RETURN(ERR_PTR(-ENOMEM)); + + lovlck->lls_nr = nr; + nr = 0; + lov_foreach_io_layout(index, lov_env_io(env), &ext) { + struct lov_layout_raid0 *r0 = lov_r0(lov, index); + + for (i = 0; i < r0->lo_nr; ++i) { + struct lov_lock_sub *lls; + struct cl_lock_descr *descr; + + if (unlikely(!r0->lo_sub[i])) + continue; + + if (lov_stripe_intersects(lov->lo_lsm, index, i, &ext, &start, &end) || + (is_trunc && i == lio->lis_trunc_stripe_index[index])) + goto init_sublock; + + continue; +init_sublock: + LASSERT(nr < lovlck->lls_nr); + lls = &lovlck->lls_sub[nr]; + descr = &lls->sub_lock.cll_descr; + LASSERT(descr->cld_obj == NULL); + descr->cld_obj = lovsub2cl(r0->lo_sub[i]); + descr->cld_start = cl_index(descr->cld_obj, start); + descr->cld_end = cl_index(descr->cld_obj, end); + descr->cld_mode = lock->cll_descr.cld_mode; + descr->cld_gid = lock->cll_descr.cld_gid; + descr->cld_enq_flags = lock->cll_descr.cld_enq_flags; + + lls->sub_index = lov_comp_index(index, i); + + /* initialize sub lock */ + result = lov_sublock_init(env, lock, lls); + if (result < 0) + break; + + lls->sub_initialized = 1; + nr++; + } + if (result < 0) + break; + } + LASSERT(ergo(result == 0, nr == lovlck->lls_nr)); + + if (result != 0) { + for (i = 0; i < nr; ++i) { + if (!lovlck->lls_sub[i].sub_initialized) + break; + + cl_lock_fini(env, &lovlck->lls_sub[i].sub_lock); + } + + OBD_FREE_LARGE(lovlck, + offsetof(struct lov_lock, lls_sub[nr])); + lovlck = ERR_PTR(result); + } + + RETURN(lovlck); +} + +static void lov_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lovlck; + int i; + + ENTRY; + lovlck = cl2lov_lock(slice); + for (i = 0; i < lovlck->lls_nr; ++i) { + LASSERT(!lovlck->lls_sub[i].sub_is_enqueued); + if (lovlck->lls_sub[i].sub_initialized) + cl_lock_fini(env, &lovlck->lls_sub[i].sub_lock); + } + OBD_FREE_LARGE(lovlck, + offsetof(struct lov_lock, lls_sub[lovlck->lls_nr])); + EXIT; +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This + * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock + * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock + * state machines in the face of sub-locks sharing (by multiple top-locks), + * and concurrent sub-lock cancellations. 
+ */ +static int lov_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, struct cl_sync_io *anchor) +{ + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lovlck = cl2lov_lock(slice); + int i; + int rc = 0; + + ENTRY; + + for (i = 0; i < lovlck->lls_nr; ++i) { + struct lov_lock_sub *lls = &lovlck->lls_sub[i]; + struct lov_sublock_env *subenv; + + subenv = lov_sublock_env_get(env, lock, lls); + if (IS_ERR(subenv)) { + rc = PTR_ERR(subenv); + break; + } + + rc = cl_lock_enqueue(subenv->lse_env, subenv->lse_io, + &lls->sub_lock, anchor); + if (rc != 0) + break; + + lls->sub_is_enqueued = 1; + } + RETURN(rc); +} + +static void lov_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lovlck = cl2lov_lock(slice); + int i; + + ENTRY; + + for (i = 0; i < lovlck->lls_nr; ++i) { + struct lov_lock_sub *lls = &lovlck->lls_sub[i]; + struct cl_lock *sublock = &lls->sub_lock; + struct lov_sublock_env *subenv; + + if (!lls->sub_is_enqueued) + continue; + + lls->sub_is_enqueued = 0; + subenv = lov_sublock_env_get(env, lock, lls); + if (!IS_ERR(subenv)) { + cl_lock_cancel(subenv->lse_env, sublock); + } else { + CL_LOCK_DEBUG(D_ERROR, env, slice->cls_lock, + "lov_lock_cancel fails with %ld.\n", + PTR_ERR(subenv)); + } + } +} + +static int lov_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + int i; + + (*p)(env, cookie, "%d\n", lck->lls_nr); + for (i = 0; i < lck->lls_nr; ++i) { + struct lov_lock_sub *sub; + + sub = &lck->lls_sub[i]; + (*p)(env, cookie, " %d %x: ", i, sub->sub_is_enqueued); + cl_lock_print(env, cookie, p, &sub->sub_lock); + } + return 0; +} + +static const struct cl_lock_operations lov_lock_ops = { + .clo_fini = lov_lock_fini, + .clo_enqueue = lov_lock_enqueue, + .clo_cancel = lov_lock_cancel, + .clo_print = lov_lock_print +}; + +int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result = 0; + + ENTRY; + lck = lov_lock_sub_init(env, io, obj, lock); + if (!IS_ERR(lck)) + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops); + else + result = PTR_ERR(lck); + RETURN(result); +} + +static void lov_empty_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + + OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); +} + +static int lov_empty_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + (*p)(env, cookie, "empty\n"); + return 0; +} + +/* XXX: more methods will be added later. 
*/ +static const struct cl_lock_operations lov_empty_lock_ops = { + .clo_fini = lov_empty_lock_fini, + .clo_print = lov_empty_lock_print +}; + +int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result = -ENOMEM; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, GFP_NOFS); + if (lck) { + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_empty_lock_ops); + result = 0; + } + RETURN(result); +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_merge.c b/drivers/staging/lustrefsx/lustre/lov/lov_merge.c new file mode 100644 index 0000000000000..30fb5b42ac656 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_merge.c @@ -0,0 +1,108 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include +#include "lov_internal.h" + +/** Merge the lock value block(&lvb) attributes and KMS from each of the + * stripes in a file into a single lvb. It is expected that the caller + * initializes the current atime, mtime, ctime to avoid regressing a more + * uptodate time on the local client. 
+ */ +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, int index, + struct ost_lvb *lvb, __u64 *kms_place) +{ + struct lov_stripe_md_entry *lse = lsm->lsm_entries[index]; + u64 size = 0; + u64 kms = 0; + u64 blocks = 0; + s64 current_mtime = lvb->lvb_mtime; + s64 current_atime = lvb->lvb_atime; + s64 current_ctime = lvb->lvb_ctime; + int i; + int rc = 0; + + assert_spin_locked(&lsm->lsm_lock); + LASSERT(lsm->lsm_lock_owner == current->pid); + + CDEBUG(D_INODE, "MDT ID "DOSTID" initial value: s=%llu m=%llu" + " a=%llu c=%llu b=%llu\n", POSTID(&lsm->lsm_oi), + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime, + lvb->lvb_blocks); + for (i = 0; i < lse->lsme_stripe_count; i++) { + struct lov_oinfo *loi = lse->lsme_oinfo[i]; + u64 lov_size; + u64 tmpsize; + + if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) { + rc = OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks); + continue; + } + + tmpsize = loi->loi_kms; + lov_size = lov_stripe_size(lsm, index, tmpsize, i); + if (lov_size > kms) + kms = lov_size; + + if (loi->loi_lvb.lvb_size > tmpsize) + tmpsize = loi->loi_lvb.lvb_size; + + lov_size = lov_stripe_size(lsm, index, tmpsize, i); + if (lov_size > size) + size = lov_size; + /* merge blocks, mtime, atime */ + blocks += loi->loi_lvb.lvb_blocks; + if (loi->loi_lvb.lvb_mtime > current_mtime) + current_mtime = loi->loi_lvb.lvb_mtime; + if (loi->loi_lvb.lvb_atime > current_atime) + current_atime = loi->loi_lvb.lvb_atime; + if (loi->loi_lvb.lvb_ctime > current_ctime) + current_ctime = loi->loi_lvb.lvb_ctime; + + CDEBUG(D_INODE, "MDT ID "DOSTID" on OST[%u]: s=%llu m=%llu" + " a=%llu c=%llu b=%llu\n", POSTID(&lsm->lsm_oi), + loi->loi_ost_idx, loi->loi_lvb.lvb_size, + loi->loi_lvb.lvb_mtime, loi->loi_lvb.lvb_atime, + loi->loi_lvb.lvb_ctime, loi->loi_lvb.lvb_blocks); + } + + *kms_place = kms; + lvb->lvb_size = size; + lvb->lvb_blocks = blocks; + lvb->lvb_mtime = current_mtime; + lvb->lvb_atime = current_atime; + lvb->lvb_ctime = current_ctime; + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_obd.c b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c new file mode 100644 index 0000000000000..3efba72ddee2b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c @@ -0,0 +1,1350 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/lov/lov_obd.c + * + * Author: Phil Schwan + * Author: Peter Braam + * Author: Mike Shaver + * Author: Nathan Rutman + */ + +#define DEBUG_SUBSYSTEM S_LOV +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lov_internal.h" + +/* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion. + Any function that expects lov_tgts to remain stationary must take a ref. */ +void lov_tgts_getref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + /* nobody gets through here until lov_putref is done */ + mutex_lock(&lov->lov_lock); + atomic_inc(&lov->lov_refcount); + mutex_unlock(&lov->lov_lock); +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt); + +void lov_tgts_putref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + mutex_lock(&lov->lov_lock); + /* ok to dec to 0 more than once -- ltd_exp's will be null */ + if (atomic_dec_and_test(&lov->lov_refcount) && lov->lov_death_row) { + LIST_HEAD(kill); + struct lov_tgt_desc *tgt, *n; + int i; + + CDEBUG(D_CONFIG, "destroying %d lov targets\n", + lov->lov_death_row); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + + if (!tgt || !tgt->ltd_reap) + continue; + list_add(&tgt->ltd_kill, &kill); + /* XXX - right now there is a dependency on ld_tgt_count + * being the maximum tgt index for computing the + * mds_max_easize. So we can't shrink it. */ + lu_tgt_pool_remove(&lov->lov_packed, i); + lov->lov_tgts[i] = NULL; + lov->lov_death_row--; + } + mutex_unlock(&lov->lov_lock); + + list_for_each_entry_safe(tgt, n, &kill, ltd_kill) { + list_del(&tgt->ltd_kill); + /* Disconnect */ + __lov_del_obd(obd, tgt); + } + } else { + mutex_unlock(&lov->lov_lock); + } +} + +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev); + +int lov_connect_osc(struct obd_device *obd, u32 index, int activate, + struct obd_connect_data *data) +{ + struct lov_obd *lov = &obd->u.lov; + struct obd_uuid *tgt_uuid; + struct obd_device *tgt_obd; + static struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" }; + struct obd_import *imp; + int rc; + ENTRY; + + if (lov->lov_tgts[index] == NULL) + RETURN(-EINVAL); + + tgt_uuid = &lov->lov_tgts[index]->ltd_uuid; + tgt_obd = lov->lov_tgts[index]->ltd_obd; + + if (!tgt_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid)); + RETURN(-EINVAL); + } + + /* override the sp_me from lov */ + tgt_obd->u.cli.cl_sp_me = lov->lov_sp_me; + + if (data && (data->ocd_connect_flags & OBD_CONNECT_INDEX)) + data->ocd_index = index; + + /* + * Divine LOV knows that OBDs under it are OSCs. + */ + imp = tgt_obd->u.cli.cl_import; + + if (activate) { + tgt_obd->obd_no_recov = 0; + /* FIXME this is probably supposed to be + ptlrpc_set_import_active. Horrible naming. 
*/ + ptlrpc_activate_import(imp, false); + } + + rc = obd_register_observer(tgt_obd, obd); + if (rc) { + CERROR("Target %s register_observer error %d\n", + obd_uuid2str(tgt_uuid), rc); + RETURN(rc); + } + + if (imp->imp_invalid) { + CDEBUG(D_CONFIG, "%s: not connecting - administratively disabled\n", + obd_uuid2str(tgt_uuid)); + RETURN(0); + } + + rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd, + &lov_osc_uuid, data, lov->lov_cache); + if (rc || !lov->lov_tgts[index]->ltd_exp) { + CERROR("Target %s connect error %d\n", + obd_uuid2str(tgt_uuid), rc); + RETURN(-ENODEV); + } + + lov->lov_tgts[index]->ltd_reap = 0; + + CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index, + obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? "":"in"); + + if (lov->lov_tgts_kobj) { + /* Even if we failed, that's ok */ + rc = sysfs_create_link(lov->lov_tgts_kobj, + &tgt_obd->obd_kset.kobj, + tgt_obd->obd_name); + if (rc) { + CERROR("%s: can't register LOV target /sys/fs/lustre/%s/%s/target_obds/%s : rc = %d\n", + obd->obd_name, obd->obd_type->typ_name, + obd->obd_name, + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_name, + rc); + } + } + RETURN(0); +} + +static int lov_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct lustre_handle conn; + int i, rc; + ENTRY; + + CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects); + + rc = class_connect(&conn, obd, cluuid); + if (rc) + RETURN(rc); + + *exp = class_conn2export(&conn); + + /* Why should there ever be more than 1 connect? */ + lov->lov_connects++; + LASSERT(lov->lov_connects == 1); + + memset(&lov->lov_ocd, 0, sizeof(lov->lov_ocd)); + if (data) + lov->lov_ocd = *data; + + lov_tgts_getref(obd); + + if (localdata) { + lov->lov_cache = localdata; + cl_cache_incref(lov->lov_cache); + } + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || obd_uuid_empty(&tgt->ltd_uuid)) + continue; + /* Flags will be lowest common denominator */ + rc = lov_connect_osc(obd, i, tgt->ltd_activate, &lov->lov_ocd); + if (rc) { + CERROR("%s: lov connect tgt %d failed: %d\n", + obd->obd_name, i, rc); + continue; + } + /* connect to administrative disabled ost */ + if (!lov->lov_tgts[i]->ltd_exp) + continue; + + rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd, + OBD_NOTIFY_CONNECT); + if (rc) { + CERROR("%s error sending notify %d\n", + obd->obd_name, rc); + } + } + + lov_tgts_putref(obd); + + RETURN(0); +} + +static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct lov_obd *lov = &obd->u.lov; + struct obd_device *osc_obd; + int rc; + ENTRY; + + osc_obd = class_exp2obd(tgt->ltd_exp); + CDEBUG(D_CONFIG, "%s: disconnecting target %s\n", obd->obd_name, + osc_obd ? osc_obd->obd_name : ""); + + if (tgt->ltd_active) { + tgt->ltd_active = 0; + lov->desc.ld_active_tgt_count--; + tgt->ltd_exp->exp_obd->obd_inactive = 1; + } + + if (osc_obd) { + if (lov->lov_tgts_kobj) + sysfs_remove_link(lov->lov_tgts_kobj, + osc_obd->obd_name); + + /* Pass it on to our clients. + * XXX This should be an argument to disconnect, + * XXX not a back-door flag on the OBD. Ah well. 
+ */ + osc_obd->obd_force = obd->obd_force; + osc_obd->obd_fail = obd->obd_fail; + osc_obd->obd_no_recov = obd->obd_no_recov; + } + + obd_register_observer(osc_obd, NULL); + + rc = obd_disconnect(tgt->ltd_exp); + if (rc) { + CERROR("Target %s disconnect error %d\n", + tgt->ltd_uuid.uuid, rc); + rc = 0; + } + + tgt->ltd_exp = NULL; + RETURN(0); +} + +static int lov_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + u32 index; + int rc; + + ENTRY; + if (!lov->lov_tgts) + goto out; + + /* Only disconnect the underlying layers on the final disconnect. */ + lov->lov_connects--; + if (lov->lov_connects != 0) { + /* why should there be more than 1 connect? */ + CWARN("%s: unexpected disconnect #%d\n", + obd->obd_name, lov->lov_connects); + goto out; + } + + /* hold another ref so lov_del_obd() doesn't spin in putref each time */ + lov_tgts_getref(obd); + + for (index = 0; index < lov->desc.ld_tgt_count; index++) { + if (lov->lov_tgts[index] && lov->lov_tgts[index]->ltd_exp) { + /* Disconnection is the last we know about an OBD */ + lov_del_target(obd, index, NULL, + lov->lov_tgts[index]->ltd_gen); + } + } + lov_tgts_putref(obd); + +out: + rc = class_disconnect(exp); /* bz 9811 */ + RETURN(rc); +} + +/* Error codes: + * + * -EINVAL : UUID can't be found in the LOV's target list + * -ENOTCONN: The UUID is found, but the target connection is bad (!) + * -EBADF : The UUID is found, but the OBD is the wrong type (!) + * any >= 0 : is log target index + */ +static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, + enum obd_notify_event ev) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + int index; + bool activate, active; + ENTRY; + + CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n", + lov, uuid->uuid, ev); + + lov_tgts_getref(obd); + for (index = 0; index < lov->desc.ld_tgt_count; index++) { + tgt = lov->lov_tgts[index]; + if (tgt && obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; + } + + if (index == lov->desc.ld_tgt_count) + GOTO(out, index = -EINVAL); + + if (ev == OBD_NOTIFY_DEACTIVATE || ev == OBD_NOTIFY_ACTIVATE) { + activate = (ev == OBD_NOTIFY_ACTIVATE); + + /* + * LU-642, initially inactive OSC could miss the obd_connect, + * we make up for it here. + */ + if (activate && !tgt->ltd_exp) { + int rc; + struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"}; + + rc = obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd, + &lov_osc_uuid, &lov->lov_ocd, + lov->lov_cache); + if (rc || !tgt->ltd_exp) + GOTO(out, index = rc); + } + + if (lov->lov_tgts[index]->ltd_activate == activate) { + CDEBUG(D_INFO, "OSC %s already %sactivate!\n", + uuid->uuid, activate ? "" : "de"); + } else { + lov->lov_tgts[index]->ltd_activate = activate; + CDEBUG(D_CONFIG, "%sactivate OSC %s\n", + activate ? "" : "de", obd_uuid2str(uuid)); + } + } else if (ev == OBD_NOTIFY_INACTIVE || ev == OBD_NOTIFY_ACTIVE) { + active = (ev == OBD_NOTIFY_ACTIVE); + + if (lov->lov_tgts[index]->ltd_active == active) { + CDEBUG(D_INFO, "OSC %s already %sactive!\n", + uuid->uuid, active ? "" : "in"); + GOTO(out, index); + } + CDEBUG(D_CONFIG, "Marking OSC %s %sactive\n", + obd_uuid2str(uuid), active ? 
"" : "in"); + + lov->lov_tgts[index]->ltd_active = active; + if (active) { + lov->desc.ld_active_tgt_count++; + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 0; + } else { + lov->desc.ld_active_tgt_count--; + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1; + } + } else { + CERROR("%s: unknown event %d for uuid %s\n", obd->obd_name, + ev, uuid->uuid); + } + + if (tgt->ltd_exp) + CDEBUG(D_INFO, "%s: lov idx %d conn %llx\n", obd_uuid2str(uuid), + index, tgt->ltd_exp->exp_handle.h_cookie); + + out: + lov_tgts_putref(obd); + RETURN(index); +} + +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev) +{ + int rc = 0; + struct lov_obd *lov = &obd->u.lov; + ENTRY; + + down_read(&lov->lov_notify_lock); + if (!lov->lov_connects) + GOTO(out_notify_lock, rc = 0); + + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE || + ev == OBD_NOTIFY_ACTIVATE || ev == OBD_NOTIFY_DEACTIVATE) { + struct obd_uuid *uuid; + + LASSERT(watched); + + if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("unexpected notification of %s %s\n", + watched->obd_type->typ_name, watched->obd_name); + GOTO(out_notify_lock, rc = -EINVAL); + } + + uuid = &watched->u.cli.cl_target_uuid; + + /* Set OSC as active before notifying the observer, so the + * observer can use the OSC normally. + */ + rc = lov_set_osc_active(obd, uuid, ev); + if (rc < 0) { + CERROR("%s: event %d failed: rc = %d\n", obd->obd_name, + ev, rc); + GOTO(out_notify_lock, rc); + } + } + + /* Pass the notification up the chain. */ + rc = obd_notify_observer(obd, watched, ev); + +out_notify_lock: + up_read(&lov->lov_notify_lock); + + RETURN(rc); +} + +static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + u32 index, int gen, int active) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct obd_device *tgt_obd; + int rc; + + ENTRY; + CDEBUG(D_CONFIG, "uuid:%s idx:%u gen:%d active:%d\n", + uuidp->uuid, index, gen, active); + + if (gen <= 0) { + CERROR("%s: request to add '%s' with invalid generation: %d\n", + obd->obd_name, uuidp->uuid, gen); + RETURN(-EINVAL); + } + + tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME, &obd->obd_uuid); + if (tgt_obd == NULL) + RETURN(-EINVAL); + + mutex_lock(&lov->lov_lock); + + if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) { + tgt = lov->lov_tgts[index]; + rc = -EEXIST; + CERROR("%s: UUID %s already assigned at index %d: rc = %d\n", + obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), index, rc); + mutex_unlock(&lov->lov_lock); + RETURN(rc); + } + + if (index >= lov->lov_tgt_size) { + /* We need to reallocate the lov target array. 
*/ + struct lov_tgt_desc **newtgts, **old = NULL; + __u32 newsize, oldsize = 0; + + newsize = max(lov->lov_tgt_size, 2U); + while (newsize < index + 1) + newsize = newsize << 1; + OBD_ALLOC_PTR_ARRAY(newtgts, newsize); + if (newtgts == NULL) { + mutex_unlock(&lov->lov_lock); + RETURN(-ENOMEM); + } + + if (lov->lov_tgt_size) { + memcpy(newtgts, lov->lov_tgts, sizeof(*newtgts) * + lov->lov_tgt_size); + old = lov->lov_tgts; + oldsize = lov->lov_tgt_size; + } + + lov->lov_tgts = newtgts; + lov->lov_tgt_size = newsize; + smp_rmb(); + if (old) + OBD_FREE_PTR_ARRAY(old, oldsize); + + CDEBUG(D_CONFIG, "tgts: %p size: %d\n", + lov->lov_tgts, lov->lov_tgt_size); + } + + OBD_ALLOC_PTR(tgt); + if (!tgt) { + mutex_unlock(&lov->lov_lock); + RETURN(-ENOMEM); + } + + rc = lu_tgt_pool_add(&lov->lov_packed, index, lov->lov_tgt_size); + if (rc) { + mutex_unlock(&lov->lov_lock); + OBD_FREE_PTR(tgt); + RETURN(rc); + } + + tgt->ltd_uuid = *uuidp; + tgt->ltd_obd = tgt_obd; + /* XXX - add a sanity check on the generation number. */ + tgt->ltd_gen = gen; + tgt->ltd_index = index; + tgt->ltd_activate = active; + lov->lov_tgts[index] = tgt; + if (index >= lov->desc.ld_tgt_count) + lov->desc.ld_tgt_count = index + 1; + + mutex_unlock(&lov->lov_lock); + + CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n", + index, tgt->ltd_gen, lov->desc.ld_tgt_count); + + if (lov->lov_connects == 0) { + /* lov_connect hasn't been called yet. We'll do the + lov_connect_osc on this target when that fn first runs, + because we don't know the connect flags yet. */ + RETURN(0); + } + + lov_tgts_getref(obd); + + rc = lov_connect_osc(obd, index, active, &lov->lov_ocd); + if (rc) + GOTO(out, rc); + + /* connect to administrative disabled ost */ + if (!tgt->ltd_exp) + GOTO(out, rc = 0); + + rc = lov_notify(obd, tgt->ltd_exp->exp_obd, + active ? 
OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE); + +out: + if (rc) { + CERROR("%s: add failed, deleting %s: rc = %d\n", + obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), rc); + lov_del_target(obd, index, NULL, 0); + } + lov_tgts_putref(obd); + RETURN(rc); +} + +/* Schedule a target for deletion */ +int lov_del_target(struct obd_device *obd, u32 index, + struct obd_uuid *uuidp, int gen) +{ + struct lov_obd *lov = &obd->u.lov; + int count = lov->desc.ld_tgt_count; + int rc = 0; + ENTRY; + + if (index >= count) { + CERROR("LOV target index %d >= number of LOV OBDs %d.\n", + index, count); + RETURN(-EINVAL); + } + + /* to make sure there's no ongoing lov_notify() now */ + down_write(&lov->lov_notify_lock); + lov_tgts_getref(obd); + + if (!lov->lov_tgts[index]) { + CERROR("LOV target at index %d is not setup.\n", index); + GOTO(out, rc = -EINVAL); + } + + if (uuidp && !obd_uuid_equals(uuidp, &lov->lov_tgts[index]->ltd_uuid)) { + CERROR("LOV target UUID %s at index %d doesn't match %s.\n", + lov_uuid2str(lov, index), index, + obd_uuid2str(uuidp)); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n", + lov_uuid2str(lov, index), index, + lov->lov_tgts[index]->ltd_gen, lov->lov_tgts[index]->ltd_exp, + lov->lov_tgts[index]->ltd_active); + + lov->lov_tgts[index]->ltd_reap = 1; + lov->lov_death_row++; + /* we really delete it from lov_tgts_putref() */ +out: + lov_tgts_putref(obd); + up_write(&lov->lov_notify_lock); + + RETURN(rc); +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct obd_device *osc_obd; + + LASSERT(tgt); + LASSERT(tgt->ltd_reap); + + osc_obd = class_exp2obd(tgt->ltd_exp); + + CDEBUG(D_CONFIG, "Removing tgt %s : %s\n", + tgt->ltd_uuid.uuid, + osc_obd ? osc_obd->obd_name : ""); + + if (tgt->ltd_exp) + lov_disconnect_obd(obd, tgt); + + OBD_FREE_PTR(tgt); + + /* Manual cleanup - no cleanup logs to clean up the osc's. We must + do it ourselves. And we can't do it from lov_cleanup, + because we just lost our only reference to it. 
*/ + if (osc_obd) + class_manual_cleanup(osc_obd); +} + +void lov_fix_desc_stripe_size(__u64 *val) +{ + if (*val < LOV_MIN_STRIPE_SIZE) { + if (*val != 0) + LCONSOLE_INFO("Increasing default stripe size to " + "minimum %u\n", + LOV_DESC_STRIPE_SIZE_DEFAULT); + *val = LOV_DESC_STRIPE_SIZE_DEFAULT; + } else if (*val & (LOV_MIN_STRIPE_SIZE - 1)) { + *val &= ~(LOV_MIN_STRIPE_SIZE - 1); + LCONSOLE_WARN("Changing default stripe size to %llu (a " + "multiple of %u)\n", + *val, LOV_MIN_STRIPE_SIZE); + } +} + +void lov_fix_desc_stripe_count(__u32 *val) +{ + if (*val == 0) + *val = 1; +} + +void lov_fix_desc_pattern(__u32 *val) +{ + /* from lov_setstripe */ + if ((*val != 0) && !lov_pattern_supported_normal_comp(*val)) { + LCONSOLE_WARN("lov: Unknown stripe pattern: %#x\n", *val); + *val = 0; + } +} + +void lov_fix_desc_qos_maxage(__u32 *val) +{ + if (*val == 0) + *val = LOV_DESC_QOS_MAXAGE_DEFAULT; +} + +void lov_fix_desc(struct lov_desc *desc) +{ + lov_fix_desc_stripe_size(&desc->ld_default_stripe_size); + lov_fix_desc_stripe_count(&desc->ld_default_stripe_count); + lov_fix_desc_pattern(&desc->ld_pattern); + lov_fix_desc_qos_maxage(&desc->ld_qos_maxage); +} + +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lov_desc *desc; + struct lov_obd *lov = &obd->u.lov; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LOV setup requires a descriptor\n"); + RETURN(-EINVAL); + } + + desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1); + + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + RETURN(-EINVAL); + } + + if (desc->ld_magic != LOV_DESC_MAGIC) { + if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) { + CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n", + obd->obd_name, desc); + lustre_swab_lov_desc(desc); + } else { + CERROR("%s: Bad lov desc magic: %#x\n", + obd->obd_name, desc->ld_magic); + RETURN(-EINVAL); + } + } + + lov_fix_desc(desc); + + desc->ld_active_tgt_count = 0; + lov->desc = *desc; + lov->lov_tgt_size = 0; + + mutex_init(&lov->lov_lock); + atomic_set(&lov->lov_refcount, 0); + lov->lov_sp_me = LUSTRE_SP_CLI; + + init_rwsem(&lov->lov_notify_lock); + + INIT_LIST_HEAD(&lov->lov_pool_list); + lov->lov_pool_count = 0; + rc = lov_pool_hash_init(&lov->lov_pools_hash_body); + if (rc) + GOTO(out, rc); + + rc = lu_tgt_pool_init(&lov->lov_packed, 0); + if (rc) + GOTO(out, rc); + + rc = lov_tunables_init(obd); + if (rc) + GOTO(out, rc); + + lov->lov_tgts_kobj = kobject_create_and_add("target_obds", + &obd->obd_kset.kobj); + +out: + return rc; +} + +static int lov_cleanup(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + struct list_head *pos, *tmp; + struct pool_desc *pool; + ENTRY; + + if (lov->lov_tgts_kobj) { + kobject_put(lov->lov_tgts_kobj); + lov->lov_tgts_kobj = NULL; + } + + list_for_each_safe(pos, tmp, &lov->lov_pool_list) { + pool = list_entry(pos, struct pool_desc, pool_list); + /* free pool structs */ + CDEBUG(D_INFO, "delete pool %p\n", pool); + /* In the function below, .hs_keycmp resolves to + * pool_hashkey_keycmp() */ + /* coverity[overrun-buffer-val] */ + lov_pool_del(obd, pool->pool_name); + } + lov_pool_hash_destroy(&lov->lov_pools_hash_body); + lu_tgt_pool_free(&lov->lov_packed); + + lprocfs_obd_cleanup(obd); + if (lov->lov_tgts) { + int i; + lov_tgts_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + /* Inactive targets may never have connected */ + if 
(lov->lov_tgts[i]->ltd_active) + /* We should never get here - these + * should have been removed in the + * disconnect. */ + CERROR("%s: lov tgt %d not cleaned! " + "deathrow=%d, lovrc=%d\n", + obd->obd_name, i, lov->lov_death_row, + atomic_read(&lov->lov_refcount)); + lov_del_target(obd, i, NULL, 0); + } + lov_tgts_putref(obd); + OBD_FREE_PTR_ARRAY(lov->lov_tgts, lov->lov_tgt_size); + lov->lov_tgt_size = 0; + } + + if (lov->lov_cache != NULL) { + cl_cache_decref(lov->lov_cache); + lov->lov_cache = NULL; + } + + RETURN(0); +} + +int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, + u32 *indexp, int *genp) +{ + struct obd_uuid obd_uuid; + int cmd; + int rc = 0; + + ENTRY; + switch (cmd = lcfg->lcfg_command) { + case LCFG_ADD_MDC: + case LCFG_DEL_MDC: + break; + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + case LCFG_LOV_DEL_OBD: { + u32 index; + int gen; + + /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + rc = kstrtou32(lustre_cfg_buf(lcfg, 2), 10, indexp); + if (rc) + GOTO(out, rc); + rc = kstrtoint(lustre_cfg_buf(lcfg, 3), 10, genp); + if (rc) + GOTO(out, rc); + index = *indexp; + gen = *genp; + if (cmd == LCFG_LOV_ADD_OBD) + rc = lov_add_target(obd, &obd_uuid, index, gen, 1); + else if (cmd == LCFG_LOV_ADD_INA) + rc = lov_add_target(obd, &obd_uuid, index, gen, 0); + else + rc = lov_del_target(obd, index, &obd_uuid, gen); + + GOTO(out, rc); + } + case LCFG_PARAM: { + struct lov_desc *desc = &(obd->u.lov.desc); + ssize_t count; + + if (!desc) + GOTO(out, rc = -EINVAL); + + count = class_modify_config(lcfg, PARAM_LOV, + &obd->obd_kset.kobj); + GOTO(out, rc = count < 0 ? count : 0); + } + case LCFG_POOL_NEW: + case LCFG_POOL_ADD: + case LCFG_POOL_DEL: + case LCFG_POOL_REM: + GOTO(out, rc); + + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + + } + } +out: + RETURN(rc); +} + +static int lov_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + struct obd_info oinfo = { + .oi_osfs = osfs, + .oi_flags = flags, + }; + struct ptlrpc_request_set *rqset; + struct lov_request_set *set = NULL; + struct lov_request *req; + int rc = 0; + int rc2; + + ENTRY; + + rqset = ptlrpc_prep_set(); + if (rqset == NULL) + RETURN(-ENOMEM); + + rc = lov_prep_statfs_set(obd, &oinfo, &set); + if (rc < 0) + GOTO(out_rqset, rc); + + list_for_each_entry(req, &set->set_list, rq_link) { + rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, max_age, rqset); + if (rc < 0) + GOTO(out_set, rc); + } + + rc = ptlrpc_set_wait(env, rqset); + +out_set: + if (rc < 0) + atomic_set(&set->set_completes, 0); + + rc2 = lov_fini_statfs_set(set); + if (rc == 0) + rc = rc2; + +out_rqset: + ptlrpc_set_destroy(rqset); + + RETURN(rc); +} + +static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + int i = 0, rc = 0, count = lov->desc.ld_tgt_count; + + ENTRY; + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *osc_obd; + struct obd_statfs stat_buf = {0}; + struct obd_import *imp; + __u32 index; + __u32 flags; + + memcpy(&index, data->ioc_inlbuf2, sizeof(index)); + if 
(index >= count) + RETURN(-ENODEV); + + if (!lov->lov_tgts[index]) + /* Try again with the next index */ + RETURN(-EAGAIN); + + osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp); + if (!osc_obd) + RETURN(-EINVAL); + + imp = osc_obd->u.cli.cl_import; + if (!lov->lov_tgts[index]->ltd_active && + imp->imp_state != LUSTRE_IMP_IDLE) + RETURN(-ENODATA); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd), + min_t(unsigned long, data->ioc_plen2, + sizeof(struct obd_uuid)))) + RETURN(-EFAULT); + + memcpy(&flags, data->ioc_inlbuf1, sizeof(flags)); + flags = flags & LL_STATFS_NODELAY ? OBD_STATFS_NODELAY : 0; + + /* got statfs data */ + rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + flags); + if (rc) + RETURN(rc); + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min_t(unsigned long, data->ioc_plen1, + sizeof(struct obd_statfs)))) + RETURN(-EFAULT); + break; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct lov_tgt_desc *tgt = NULL; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_OSTIDX) { + if (count <= qctl->qc_idx) + RETURN(-EINVAL); + + tgt = lov->lov_tgts[qctl->qc_idx]; + if (!tgt || !tgt->ltd_exp) + RETURN(-EINVAL); + } else if (qctl->qc_valid == QC_UUID) { + for (i = 0; i < count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || + !obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (tgt->ltd_exp == NULL) + RETURN(-EINVAL); + + break; + } + } else { + RETURN(-EINVAL); + } + + if (i >= count) + RETURN(-EAGAIN); + + LASSERT(tgt && tgt->ltd_exp); + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_OSTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + default: { + int set = 0; + + if (count == 0) + RETURN(-ENOTTY); + + for (i = 0; i < count; i++) { + int err; + struct obd_device *osc_obd; + + /* OST was disconnected */ + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) + continue; + + /* ll_umount_begin() sets force on lov, pass to osc */ + osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp); + if (osc_obd) + osc_obd->obd_force = obd->obd_force; + err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp, + len, karg, uarg); + if (err) { + if (lov->lov_tgts[i]->ltd_active) { + CDEBUG_LIMIT(err == -ENOTTY ? 
+ D_IOCTL : D_WARNING, + "iocontrol OSC %s on OST idx %d cmd %x: err = %d\n", + lov_uuid2str(lov, i), + i, cmd, err); + if (!rc) + rc = err; + } + } else { + set = 1; + } + } + if (!set && !rc) + rc = -EIO; + } + } + + RETURN(rc); +} + +static int lov_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + struct lov_desc *ld = &lov->desc; + int rc = 0; + ENTRY; + + if (vallen == NULL || val == NULL) + RETURN(-EFAULT); + + lov_tgts_getref(obd); + + if (KEY_IS(KEY_MAX_EASIZE)) { + *((u32 *)val) = exp->exp_connect_data.ocd_max_easize; + } else if (KEY_IS(KEY_DEFAULT_EASIZE)) { + u32 def_stripe_count = min_t(u32, ld->ld_default_stripe_count, + LOV_MAX_STRIPE_COUNT); + + *((u32 *)val) = lov_mds_md_size(def_stripe_count, LOV_MAGIC_V3); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lov->desc.ld_tgt_count; + } else { + rc = -EINVAL; + } + + lov_tgts_putref(obd); + + RETURN(rc); +} + +static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + bool do_inactive = false, no_set = false; + u32 i; + int rc = 0; + int err; + + ENTRY; + + if (set == NULL) { + no_set = true; + set = ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } + + lov_tgts_getref(obd); + + if (KEY_IS(KEY_CHECKSUM)) + do_inactive = true; + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + + /* OST was disconnected */ + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + /* OST is inactive and we don't want inactive OSCs */ + if (!tgt->ltd_active && !do_inactive) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, keylen, key, + vallen, val, set); + + if (rc == 0) + rc = err; + } + + /* cycle through MDC target for Data-on-MDT */ + for (i = 0; i < LOV_MDC_TGT_MAX; i++) { + struct obd_device *mdc; + + mdc = lov->lov_mdc_tgts[i].lmtd_mdc; + if (mdc == NULL) + continue; + + err = obd_set_info_async(env, mdc->obd_self_export, + keylen, key, vallen, val, set); + if (rc == 0) + rc = err; + } + + lov_tgts_putref(obd); + if (no_set) { + err = ptlrpc_set_wait(env, set); + if (rc == 0) + rc = err; + ptlrpc_set_destroy(set); + } + RETURN(rc); +} + +void lov_stripe_lock(struct lov_stripe_md *md) +__acquires(&md->lsm_lock) +{ + LASSERT(md->lsm_lock_owner != current->pid); + spin_lock(&md->lsm_lock); + LASSERT(md->lsm_lock_owner == 0); + md->lsm_lock_owner = current->pid; +} + +void lov_stripe_unlock(struct lov_stripe_md *md) +__releases(&md->lsm_lock) +{ + LASSERT(md->lsm_lock_owner == current->pid); + md->lsm_lock_owner = 0; + spin_unlock(&md->lsm_lock); +} + +static int lov_quotactl(struct obd_device *obd, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct pool_desc *pool = NULL; + __u64 curspace = 0; + __u64 bhardlimit = 0; + int i, rc = 0; + + ENTRY; + if (oqctl->qc_cmd != Q_GETOQUOTA && + oqctl->qc_cmd != LUSTRE_Q_SETQUOTA && + oqctl->qc_cmd != LUSTRE_Q_GETQUOTAPOOL) { + rc = -EFAULT; + CERROR("%s: bad quota opc %x for lov obd: rc = %d\n", + obd->obd_name, oqctl->qc_cmd, rc); + RETURN(rc); + } + + if (oqctl->qc_cmd == LUSTRE_Q_GETQUOTAPOOL) { + pool = lov_pool_find(obd, oqctl->qc_poolname); + if (!pool) + RETURN(-ENOENT); + /* Set Q_GETOQUOTA back 
as targets report it's own + * usage and doesn't care about pools */ + oqctl->qc_cmd = Q_GETOQUOTA; + } + + /* for lov tgt */ + lov_tgts_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + int err; + + tgt = lov->lov_tgts[i]; + + if (!tgt) + continue; + + if (pool && + lu_tgt_check_index(tgt->ltd_index, &pool->pool_obds)) + continue; + + if (!tgt->ltd_active || tgt->ltd_reap) { + if (oqctl->qc_cmd == Q_GETOQUOTA && + lov->lov_tgts[i]->ltd_activate) { + rc = -ENETDOWN; + CERROR("%s: ost %d is inactive: rc = %d\n", + obd->obd_name, i, rc); + } else { + CDEBUG(D_HA, "ost %d is inactive\n", i); + } + continue; + } + + err = obd_quotactl(tgt->ltd_exp, oqctl); + if (err) { + if (tgt->ltd_active && !rc) + rc = err; + continue; + } + + if (oqctl->qc_cmd == Q_GETOQUOTA) { + curspace += oqctl->qc_dqblk.dqb_curspace; + bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit; + } + } + lov_tgts_putref(obd); + if (pool) + lov_pool_putref(pool); + + if (oqctl->qc_cmd == Q_GETOQUOTA) { + oqctl->qc_dqblk.dqb_curspace = curspace; + oqctl->qc_dqblk.dqb_bhardlimit = bhardlimit; + } + RETURN(rc); +} + +static const struct obd_ops lov_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = lov_setup, + .o_cleanup = lov_cleanup, + .o_connect = lov_connect, + .o_disconnect = lov_disconnect, + .o_statfs = lov_statfs, + .o_iocontrol = lov_iocontrol, + .o_get_info = lov_get_info, + .o_set_info_async = lov_set_info_async, + .o_notify = lov_notify, + .o_pool_new = lov_pool_new, + .o_pool_rem = lov_pool_remove, + .o_pool_add = lov_pool_add, + .o_pool_del = lov_pool_del, + .o_quotactl = lov_quotactl, +}; + +struct kmem_cache *lov_oinfo_slab; + +static int __init lov_init(void) +{ + int rc; + ENTRY; + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre LOV module (%p).\n", &lov_caches); + + rc = lu_kmem_init(lov_caches); + if (rc) + return rc; + + lov_oinfo_slab = kmem_cache_create("lov_oinfo", + sizeof(struct lov_oinfo), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (lov_oinfo_slab == NULL) { + lu_kmem_fini(lov_caches); + return -ENOMEM; + } + + rc = class_register_type(&lov_obd_ops, NULL, true, + LUSTRE_LOV_NAME, &lov_device_type); + if (rc) { + kmem_cache_destroy(lov_oinfo_slab); + lu_kmem_fini(lov_caches); + } + + RETURN(rc); +} + +static void __exit lov_exit(void) +{ + class_unregister_type(LUSTRE_LOV_NAME); + kmem_cache_destroy(lov_oinfo_slab); + lu_kmem_fini(lov_caches); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Logical Object Volume"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(lov_init); +module_exit(lov_exit); diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_object.c b/drivers/staging/lustrefsx/lustre/lov/lov_object.c new file mode 100644 index 0000000000000..5aac191d64c4e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_object.c @@ -0,0 +1,2336 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_object for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include "lov_cl_internal.h" + +static inline struct lov_device *lov_object_dev(struct lov_object *obj) +{ + return lu2lov_dev(obj->lo_cl.co_lu.lo_dev); +} + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Layout operations. + * + */ + +struct lov_layout_operations { + int (*llo_init)(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state); + int (*llo_delete)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + void (*llo_fini)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + int (*llo_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o); + int (*llo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); + int (*llo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + int (*llo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + int (*llo_getattr)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); + int (*llo_flush)(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock); +}; + +static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov); +static struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov); + +static void lov_lsm_put(struct lov_stripe_md *lsm) +{ + if (lsm != NULL) + lov_free_memmd(&lsm); +} + +/***************************************************************************** + * + * Lov object layout operations. 
+ * + */ + +static struct cl_object *lov_sub_find(const struct lu_env *env, + struct cl_device *dev, + const struct lu_fid *fid, + const struct cl_object_conf *conf) +{ + struct lu_object *o; + + ENTRY; + + o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu); + LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type)); + RETURN(lu2cl(o)); +} + +static int lov_page_slice_fixup(struct lov_object *lov, + struct cl_object *stripe) +{ + struct cl_object_header *hdr = cl_object_header(&lov->lo_cl); + struct cl_object *o; + + if (stripe == NULL) + return hdr->coh_page_bufsize - lov->lo_cl.co_slice_off - + cfs_size_round(sizeof(struct lov_page)); + + cl_object_for_each(o, stripe) + o->co_slice_off += hdr->coh_page_bufsize; + + return cl_object_header(stripe)->coh_page_bufsize; +} + +static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, + struct cl_object *subobj, struct lov_oinfo *oinfo, + int idx) +{ + struct cl_object_header *hdr; + struct cl_object_header *subhdr; + struct cl_object_header *parent; + int entry = lov_comp_entry(idx); + int stripe = lov_comp_stripe(idx); + int result; + + if (OBD_FAIL_CHECK(OBD_FAIL_LOV_INIT)) { + /* For sanity:test_206. + * Do not leave the object in cache to avoid accessing + * freed memory. This is because osc_object is referring to + * lov_oinfo of lsm_stripe_data which will be freed due to + * this failure. */ + cl_object_kill(env, subobj); + cl_object_put(env, subobj); + return -EIO; + } + + hdr = cl_object_header(lov2cl(lov)); + subhdr = cl_object_header(subobj); + + CDEBUG(D_INODE, DFID"@%p[%d:%d] -> "DFID"@%p: ostid: "DOSTID + " ost idx: %d gen: %d\n", + PFID(lu_object_fid(&subobj->co_lu)), subhdr, entry, stripe, + PFID(lu_object_fid(lov2lu(lov))), hdr, POSTID(&oinfo->loi_oi), + oinfo->loi_ost_idx, oinfo->loi_ost_gen); + + /* reuse ->coh_attr_guard to protect coh_parent change */ + spin_lock(&subhdr->coh_attr_guard); + parent = subhdr->coh_parent; + if (parent == NULL) { + struct lovsub_object *lso = cl2lovsub(subobj); + + subhdr->coh_parent = hdr; + spin_unlock(&subhdr->coh_attr_guard); + subhdr->coh_nesting = hdr->coh_nesting + 1; + lu_object_ref_add(&subobj->co_lu, "lov-parent", lov); + lso->lso_super = lov; + lso->lso_index = idx; + result = 0; + } else { + struct lu_object *old_obj; + struct lov_object *old_lov; + unsigned int mask = D_INODE; + + spin_unlock(&subhdr->coh_attr_guard); + old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type); + LASSERT(old_obj != NULL); + old_lov = cl2lov(lu2cl(old_obj)); + if (test_bit(LO_LAYOUT_INVALID, &old_lov->lo_obj_flags)) { + /* the object's layout has already changed but isn't + * refreshed */ + lu_object_unhash(env, &subobj->co_lu); + result = -EAGAIN; + } else { + mask = D_ERROR; + result = -EIO; + } + + LU_OBJECT_DEBUG(mask, env, &subobj->co_lu, + "stripe %d is already owned.", idx); + LU_OBJECT_DEBUG(mask, env, old_obj, "owned."); + LU_OBJECT_HEADER(mask, env, lov2lu(lov), "try to own.\n"); + cl_object_put(env, subobj); + } + return result; +} + +static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, unsigned int index, + const struct cl_object_conf *conf, + struct lov_layout_entry *lle) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + struct lov_thread_info *lti = lov_env_info(env); + struct cl_object_conf *subconf = <i->lti_stripe_conf; + struct lu_fid *ofid = <i->lti_fid; + struct cl_object *stripe; + struct lov_stripe_md_entry *lse = lov_lse(lov, index); + int result; + int psz, sz; + int i; + + ENTRY; 
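+	/*
+	 * The loop below instantiates one sub-object per stripe: each
+	 * stripe's object id is mapped to a FID, the matching cl_object is
+	 * found (or created) on that stripe's OSC device via lov_sub_find(),
+	 * and lov_init_sub() attaches it to this LOV object.  If a stripe is
+	 * still owned by a previous, now-invalid layout, lov_init_sub()
+	 * returns -EAGAIN and the stripe is retried.  OSTs reporting
+	 * non-rotational storage bump the entry's read preference.  Every
+	 * stripe must report the same per-page slice size (psz), which is
+	 * what this function returns on success.
+	 */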
+ + spin_lock_init(&r0->lo_sub_lock); + r0->lo_nr = lse->lsme_stripe_count; + + OBD_ALLOC_PTR_ARRAY_LARGE(r0->lo_sub, r0->lo_nr); + if (r0->lo_sub == NULL) + GOTO(out, result = -ENOMEM); + + psz = 0; + result = 0; + memset(subconf, 0, sizeof(*subconf)); + + /* + * Create stripe cl_objects. + */ + for (i = 0; i < r0->lo_nr; ++i) { + struct cl_device *subdev; + struct lov_oinfo *oinfo = lse->lsme_oinfo[i]; + int ost_idx = oinfo->loi_ost_idx; + struct obd_export *exp; + + if (lov_oinfo_is_dummy(oinfo)) + continue; + + result = ostid_to_fid(ofid, &oinfo->loi_oi, oinfo->loi_ost_idx); + if (result != 0) + GOTO(out, result); + + if (dev->ld_target[ost_idx] == NULL) { + CERROR("%s: OST %04x is not initialized\n", + lov2obd(dev->ld_lov)->obd_name, ost_idx); + GOTO(out, result = -EIO); + } + + exp = dev->ld_lov->lov_tgts[ost_idx]->ltd_exp; + if (likely(exp)) { + /* the more fast OSTs the better */ + if (exp->exp_obd->obd_osfs.os_state & OS_STATFS_NONROT) + lle->lle_preference++; + } + + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); + subconf->u.coc_oinfo = oinfo; + LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx); + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + stripe = lov_sub_find(env, subdev, ofid, subconf); + if (IS_ERR(stripe)) + GOTO(out, result = PTR_ERR(stripe)); + + result = lov_init_sub(env, lov, stripe, oinfo, + lov_comp_index(index, i)); + if (result == -EAGAIN) { /* try again */ + --i; + result = 0; + continue; + } + + if (result == 0) { + r0->lo_sub[i] = cl2lovsub(stripe); + + sz = lov_page_slice_fixup(lov, stripe); + LASSERT(ergo(psz > 0, psz == sz)); + psz = sz; + } + } + if (result == 0) + result = psz; +out: + RETURN(result); +} + +static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, + struct lov_layout_raid0 *r0, + struct lovsub_object *los, int idx) +{ + struct cl_object *sub; + struct lu_site *site; + wait_queue_head_t *wq; + + LASSERT(r0->lo_sub[idx] == los); + + sub = lovsub2cl(los); + site = sub->co_lu.lo_dev->ld_site; + wq = lu_site_wq_from_fid(site, &sub->co_lu.lo_header->loh_fid); + + cl_object_kill(env, sub); + /* release a reference to the sub-object and ... */ + lu_object_ref_del(&sub->co_lu, "lov-parent", lov); + cl_object_put(env, sub); + + /* ... wait until it is actually destroyed---sub-object clears its + * ->lo_sub[] slot in lovsub_object_free() */ + wait_event(*wq, r0->lo_sub[idx] != los); + LASSERT(r0->lo_sub[idx] == NULL); +} + +static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov, + struct lov_layout_entry *lle) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + + ENTRY; + + if (r0->lo_sub != NULL) { + int i; + + for (i = 0; i < r0->lo_nr; ++i) { + struct lovsub_object *los = r0->lo_sub[i]; + + if (los != NULL) { + cl_object_prune(env, &los->lso_cl); + /* + * If top-level object is to be evicted from + * the cache, so are its sub-objects. 
+ */ + lov_subobject_kill(env, lov, r0, los, i); + } + } + } + + EXIT; +} + +static void lov_fini_raid0(const struct lu_env *env, + struct lov_layout_entry *lle) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + + if (r0->lo_sub != NULL) { + OBD_FREE_PTR_ARRAY_LARGE(r0->lo_sub, r0->lo_nr); + r0->lo_sub = NULL; + } +} + +static int lov_print_raid0(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lov_layout_entry *lle) +{ + const struct lov_layout_raid0 *r0 = &lle->lle_raid0; + int i; + + for (i = 0; i < r0->lo_nr; ++i) { + struct lu_object *sub; + + if (r0->lo_sub[i] != NULL) { + sub = lovsub2lu(r0->lo_sub[i]); + lu_object_print(env, cookie, p, sub); + } else { + (*p)(env, cookie, "sub %d absent\n", i); + } + } + return 0; +} + +static int lov_attr_get_raid0(const struct lu_env *env, struct lov_object *lov, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **lov_attr) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + struct lov_stripe_md *lsm = lov->lo_lsm; + struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; + struct cl_attr *attr = &r0->lo_attr; + __u64 kms = 0; + int result = 0; + + if (r0->lo_attr_valid) { + *lov_attr = attr; + return 0; + } + + memset(lvb, 0, sizeof(*lvb)); + + /* XXX: timestamps can be negative by sanity:test_39m, + * how can it be? */ + lvb->lvb_atime = LLONG_MIN; + lvb->lvb_ctime = LLONG_MIN; + lvb->lvb_mtime = LLONG_MIN; + + /* + * XXX that should be replaced with a loop over sub-objects, + * doing cl_object_attr_get() on them. But for now, let's + * reuse old lov code. + */ + + /* + * XXX take lsm spin-lock to keep lov_merge_lvb_kms() + * happy. It's not needed, because new code uses + * ->coh_attr_guard spin-lock to protect consistency of + * sub-object attributes. + */ + lov_stripe_lock(lsm); + result = lov_merge_lvb_kms(lsm, index, lvb, &kms); + lov_stripe_unlock(lsm); + if (result == 0) { + cl_lvb2attr(attr, lvb); + attr->cat_kms = kms; + r0->lo_attr_valid = 1; + *lov_attr = attr; + } + + return result; +} + +static struct lov_comp_layout_entry_ops raid0_ops = { + .lco_init = lov_init_raid0, + .lco_fini = lov_fini_raid0, + .lco_getattr = lov_attr_get_raid0, +}; + +static int lov_attr_get_dom(const struct lu_env *env, struct lov_object *lov, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **lov_attr) +{ + struct lov_layout_dom *dom = &lle->lle_dom; + struct lov_oinfo *loi = dom->lo_loi; + struct cl_attr *attr = &dom->lo_dom_r0.lo_attr; + + if (dom->lo_dom_r0.lo_attr_valid) { + *lov_attr = attr; + return 0; + } + + if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) + return OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks); + + cl_lvb2attr(attr, &loi->loi_lvb); + + /* DoM component size can be bigger than stripe size after + * client's setattr RPC, so do not count anything beyond + * component end. Alternatively, check that limit on server + * and do not allow size overflow there. */ + if (attr->cat_size > lle->lle_extent->e_end) + attr->cat_size = lle->lle_extent->e_end; + + attr->cat_kms = attr->cat_size; + + dom->lo_dom_r0.lo_attr_valid = 1; + *lov_attr = attr; + + return 0; +} + +/** + * Lookup FLD to get MDS index of the given DOM object FID. 
+ * + * \param[in] ld LOV device + * \param[in] fid FID to lookup + * \param[out] nr index in MDC array to return back + * + * \retval 0 and \a mds filled with MDS index if successful + * \retval negative value on error + */ +static int lov_fld_lookup(struct lov_device *ld, const struct lu_fid *fid, + __u32 *nr) +{ + __u32 mds_idx; + int i, rc; + + ENTRY; + + rc = fld_client_lookup(&ld->ld_lmv->u.lmv.lmv_fld, fid_seq(fid), + &mds_idx, LU_SEQ_RANGE_MDT, NULL); + if (rc) { + CERROR("%s: error while looking for mds number. Seq %#llx" + ", err = %d\n", lu_dev_name(cl2lu_dev(&ld->ld_cl)), + fid_seq(fid), rc); + RETURN(rc); + } + + CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", + mds_idx, PFID(fid)); + + /* find proper MDC device in the array */ + for (i = 0; i < ld->ld_md_tgts_nr; i++) { + if (ld->ld_md_tgts[i].ldm_mdc != NULL && + ld->ld_md_tgts[i].ldm_idx == mds_idx) + break; + } + + if (i == ld->ld_md_tgts_nr) { + CERROR("%s: cannot find corresponding MDC device for mds #%x " + "for fid="DFID"\n", lu_dev_name(cl2lu_dev(&ld->ld_cl)), + mds_idx, PFID(fid)); + rc = -EINVAL; + } else { + *nr = i; + } + RETURN(rc); +} + +/** + * Implementation of lov_comp_layout_entry_ops::lco_init for DOM object. + * + * Init the DOM object for the first time. It prepares also RAID0 entry + * for it to use in common methods with ordinary RAID0 layout entries. + * + * \param[in] env execution environment + * \param[in] dev LOV device + * \param[in] lov LOV object + * \param[in] index Composite layout entry index in LSM + * \param[in] lle Composite LOV layout entry + */ +static int lov_init_dom(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, unsigned int index, + const struct cl_object_conf *conf, + struct lov_layout_entry *lle) +{ + struct lov_thread_info *lti = lov_env_info(env); + struct lov_stripe_md_entry *lsme = lov_lse(lov, index); + struct cl_object *clo; + struct lu_object *o = lov2lu(lov); + const struct lu_fid *fid = lu_object_fid(o); + struct cl_device *mdcdev; + struct lov_oinfo *loi = NULL; + struct cl_object_conf *sconf = <i->lti_stripe_conf; + int rc; + __u32 idx = 0; + + ENTRY; + + /* DOM entry may be not zero index due to FLR but must start from 0 */ + if (unlikely(lle->lle_extent->e_start != 0)) { + CERROR("%s: DOM entry must be the first stripe in a mirror\n", + lov2obd(dev->ld_lov)->obd_name); + dump_lsm(D_ERROR, lov->lo_lsm); + RETURN(-EINVAL); + } + + /* find proper MDS device */ + rc = lov_fld_lookup(dev, fid, &idx); + if (rc) + RETURN(rc); + + LASSERTF(dev->ld_md_tgts[idx].ldm_mdc != NULL, + "LOV md target[%u] is NULL\n", idx); + + /* check lsm is DOM, more checks are needed */ + LASSERT(lsme->lsme_stripe_count == 0); + + /* + * Create lower cl_objects. 
+ */ + mdcdev = dev->ld_md_tgts[idx].ldm_mdc; + + LASSERTF(mdcdev != NULL, "non-initialized mdc subdev\n"); + + /* DoM object has no oinfo in LSM entry, create it exclusively */ + OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS); + if (loi == NULL) + RETURN(-ENOMEM); + + fid_to_ostid(lu_object_fid(lov2lu(lov)), &loi->loi_oi); + + sconf->u.coc_oinfo = loi; +again: + clo = lov_sub_find(env, mdcdev, fid, sconf); + if (IS_ERR(clo)) + GOTO(out, rc = PTR_ERR(clo)); + + rc = lov_init_sub(env, lov, clo, loi, lov_comp_index(index, 0)); + if (rc == -EAGAIN) /* try again */ + goto again; + else if (rc != 0) + GOTO(out, rc); + + lle->lle_dom.lo_dom = cl2lovsub(clo); + spin_lock_init(&lle->lle_dom.lo_dom_r0.lo_sub_lock); + lle->lle_dom.lo_dom_r0.lo_nr = 1; + lle->lle_dom.lo_dom_r0.lo_sub = &lle->lle_dom.lo_dom; + lle->lle_dom.lo_loi = loi; + + rc = lov_page_slice_fixup(lov, clo); + RETURN(rc); + +out: + if (loi != NULL) + OBD_SLAB_FREE_PTR(loi, lov_oinfo_slab); + return rc; +} + +/** + * Implementation of lov_layout_operations::llo_fini for DOM object. + * + * Finish the DOM object and free related memory. + * + * \param[in] env execution environment + * \param[in] lov LOV object + * \param[in] state LOV layout state + */ +static void lov_fini_dom(const struct lu_env *env, + struct lov_layout_entry *lle) +{ + if (lle->lle_dom.lo_dom != NULL) + lle->lle_dom.lo_dom = NULL; + if (lle->lle_dom.lo_loi != NULL) + OBD_SLAB_FREE_PTR(lle->lle_dom.lo_loi, lov_oinfo_slab); +} + +static struct lov_comp_layout_entry_ops dom_ops = { + .lco_init = lov_init_dom, + .lco_fini = lov_fini_dom, + .lco_getattr = lov_attr_get_dom, +}; + +static int lov_init_composite(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + struct lov_layout_composite *comp = &state->composite; + struct lov_layout_entry *lle; + struct lov_mirror_entry *lre; + unsigned int entry_count; + unsigned int psz = 0; + unsigned int mirror_count; + int flr_state = lsm->lsm_flags & LCM_FL_FLR_MASK; + int result = 0; + unsigned int seq; + int i, j, preference; + bool dom_size = 0; + + ENTRY; + + LASSERT(lsm->lsm_entry_count > 0); + LASSERT(lov->lo_lsm == NULL); + lov->lo_lsm = lsm_addref(lsm); + set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags); + + dump_lsm(D_INODE, lsm); + + entry_count = lsm->lsm_entry_count; + + comp->lo_flags = lsm->lsm_flags; + comp->lo_mirror_count = lsm->lsm_mirror_count + 1; + comp->lo_entry_count = lsm->lsm_entry_count; + comp->lo_preferred_mirror = -1; + + if (equi(flr_state == LCM_FL_NONE, comp->lo_mirror_count > 1)) + RETURN(-EINVAL); + + OBD_ALLOC_PTR_ARRAY(comp->lo_mirrors, comp->lo_mirror_count); + if (comp->lo_mirrors == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC_PTR_ARRAY(comp->lo_entries, entry_count); + if (comp->lo_entries == NULL) + RETURN(-ENOMEM); + + /* Initiate all entry types and extents data at first */ + for (i = 0, j = 0, mirror_count = 1; i < entry_count; i++) { + int mirror_id = 0; + + lle = &comp->lo_entries[i]; + + lle->lle_lsme = lsm->lsm_entries[i]; + lle->lle_type = lov_entry_type(lle->lle_lsme); + lle->lle_preference = 0; + switch (lle->lle_type) { + case LOV_PATTERN_RAID0: + lle->lle_comp_ops = &raid0_ops; + break; + case LOV_PATTERN_MDT: + /* Allowed to have several DOM stripes in different + * mirrors with the same DoM size. 
+ */ + if (!dom_size) { + dom_size = lle->lle_lsme->lsme_extent.e_end; + } else if (dom_size != + lle->lle_lsme->lsme_extent.e_end) { + CERROR("%s: DOM entries with different sizes\n", + lov2obd(dev->ld_lov)->obd_name); + dump_lsm(D_ERROR, lsm); + RETURN(-EINVAL); + } + lle->lle_comp_ops = &dom_ops; + break; + case LOV_PATTERN_FOREIGN: + lle->lle_comp_ops = NULL; + break; + default: + CERROR("%s: unknown composite layout entry type %i\n", + lov2obd(dev->ld_lov)->obd_name, + lsm->lsm_entries[i]->lsme_pattern); + dump_lsm(D_ERROR, lsm); + RETURN(-EIO); + } + + lle->lle_extent = &lle->lle_lsme->lsme_extent; + lle->lle_valid = !(lle->lle_lsme->lsme_flags & LCME_FL_STALE); + + if (flr_state != LCM_FL_NONE) + mirror_id = mirror_id_of(lle->lle_lsme->lsme_id); + + lre = &comp->lo_mirrors[j]; + if (i > 0) { + if (mirror_id == lre->lre_mirror_id) { + lre->lre_valid |= lle->lle_valid; + lre->lre_stale |= !lle->lle_valid; + lre->lre_foreign |= + lsme_is_foreign(lle->lle_lsme); + lre->lre_end = i; + continue; + } + + /* new mirror detected, assume that the mirrors + * are shorted in layout */ + ++mirror_count; + ++j; + if (j >= comp->lo_mirror_count) + break; + + lre = &comp->lo_mirrors[j]; + } + + /* entries must be sorted by mirrors */ + lre->lre_mirror_id = mirror_id; + lre->lre_start = lre->lre_end = i; + lre->lre_preference = lle->lle_lsme->lsme_flags & + LCME_FL_PREF_RD ? 1000 : 0; + lre->lre_valid = lle->lle_valid; + lre->lre_stale = !lle->lle_valid; + lre->lre_foreign = lsme_is_foreign(lle->lle_lsme); + } + + /* sanity check for FLR */ + if (mirror_count != comp->lo_mirror_count) { + CDEBUG(D_INODE, DFID + " doesn't have the # of mirrors it claims, %u/%u\n", + PFID(lu_object_fid(lov2lu(lov))), mirror_count, + comp->lo_mirror_count + 1); + + GOTO(out, result = -EINVAL); + } + + lov_foreach_layout_entry(lov, lle) { + int index = lov_layout_entry_index(lov, lle); + + /** + * If the component has not been init-ed on MDS side, for + * PFL layout, we'd know that the components beyond this one + * will be dynamically init-ed later on file write/trunc ops. + */ + if (!lsme_inited(lle->lle_lsme)) + continue; + + if (lsme_is_foreign(lle->lle_lsme)) + continue; + + result = lle->lle_comp_ops->lco_init(env, dev, lov, index, + conf, lle); + if (result < 0) + break; + + LASSERT(ergo(psz > 0, psz == result)); + psz = result; + } + + if (psz > 0) + cl_object_header(&lov->lo_cl)->coh_page_bufsize += psz; + + /* decide the preferred mirror. It uses the hash value of lov_object + * so that different clients would use different mirrors for read. */ + mirror_count = 0; + preference = -1; + seq = hash_long((unsigned long)lov, 8); + for (i = 0; i < comp->lo_mirror_count; i++) { + unsigned int idx = (i + seq) % comp->lo_mirror_count; + + lre = lov_mirror_entry(lov, idx); + if (lre->lre_stale) + continue; + + if (lre->lre_foreign) + continue; + + mirror_count++; /* valid mirror */ + + /* aggregated preference of all involved OSTs */ + for (j = lre->lre_start; j <= lre->lre_end; j++) { + lre->lre_preference += + comp->lo_entries[j].lle_preference; + } + + if (lre->lre_preference > preference) { + preference = lre->lre_preference; + comp->lo_preferred_mirror = idx; + } + } + if (!mirror_count) { + CDEBUG(D_INODE, DFID + " doesn't have any valid mirrors\n", + PFID(lu_object_fid(lov2lu(lov)))); + + comp->lo_preferred_mirror = 0; + } + + LASSERT(comp->lo_preferred_mirror >= 0); + + EXIT; +out: + return result > 0 ? 
0 : result; +} + +static int lov_init_empty(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + return 0; +} + +static int lov_init_released(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + LASSERT(lsm != NULL); + LASSERT(lsm->lsm_is_released); + LASSERT(lov->lo_lsm == NULL); + + lov->lo_lsm = lsm_addref(lsm); + return 0; +} + +static int lov_init_foreign(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + LASSERT(lsm != NULL); + LASSERT(lov->lo_type == LLT_FOREIGN); + LASSERT(lov->lo_lsm == NULL); + + lov->lo_lsm = lsm_addref(lsm); + return 0; +} + +static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED || + lov->lo_type == LLT_FOREIGN); + + lov_layout_wait(env, lov); + return 0; +} + +static int lov_delete_composite(const struct lu_env *env, + struct lov_object *lov, + union lov_layout_state *state) +{ + struct lov_layout_entry *entry; + + ENTRY; + + dump_lsm(D_INODE, lov->lo_lsm); + + lov_layout_wait(env, lov); + lov_foreach_layout_entry(lov, entry) { + if (entry->lle_lsme && lsme_is_foreign(entry->lle_lsme)) + continue; + + lov_delete_raid0(env, lov, entry); + } + + RETURN(0); +} + +static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED); +} + +static void lov_fini_composite(const struct lu_env *env, + struct lov_object *lov, + union lov_layout_state *state) +{ + struct lov_layout_composite *comp = &state->composite; + ENTRY; + + if (comp->lo_entries != NULL) { + struct lov_layout_entry *entry; + + lov_foreach_layout_entry(lov, entry) + if (entry->lle_comp_ops) + entry->lle_comp_ops->lco_fini(env, entry); + + OBD_FREE_PTR_ARRAY(comp->lo_entries, comp->lo_entry_count); + comp->lo_entries = NULL; + } + + if (comp->lo_mirrors != NULL) { + OBD_FREE_PTR_ARRAY(comp->lo_mirrors, comp->lo_mirror_count); + comp->lo_mirrors = NULL; + } + + memset(comp, 0, sizeof(*comp)); + + dump_lsm(D_INODE, lov->lo_lsm); + lov_free_memmd(&lov->lo_lsm); + + EXIT; +} + +static void lov_fini_released(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + ENTRY; + dump_lsm(D_INODE, lov->lo_lsm); + lov_free_memmd(&lov->lo_lsm); + EXIT; +} + +static int lov_print_empty(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + (*p)(env, cookie, "empty %d\n", + test_bit(LO_LAYOUT_INVALID, &lu2lov(o)->lo_obj_flags)); + return 0; +} + +static int lov_print_composite(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_stripe_md *lsm = lov->lo_lsm; + int i; + + (*p)(env, cookie, "entries: %d, %s, lsm{%p 0x%08X %d %u}:\n", + lsm->lsm_entry_count, + test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ? 
"invalid" : + "valid", lsm, lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_layout_gen); + + for (i = 0; i < lsm->lsm_entry_count; i++) { + struct lov_stripe_md_entry *lse = lsm->lsm_entries[i]; + struct lov_layout_entry *lle = lov_entry(lov, i); + + (*p)(env, cookie, + DEXT ": { 0x%08X, %u, %#x, %u, %#x, %u, %u }\n", + PEXT(&lse->lsme_extent), lse->lsme_magic, + lse->lsme_id, lse->lsme_pattern, lse->lsme_layout_gen, + lse->lsme_flags, lse->lsme_stripe_count, + lse->lsme_stripe_size); + + if (!lsme_is_foreign(lse)) + lov_print_raid0(env, cookie, p, lle); + } + + return 0; +} + +static int lov_print_released(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_stripe_md *lsm = lov->lo_lsm; + + (*p)(env, cookie, + "released: %s, lsm{%p 0x%08X %d %u}:\n", + test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ? "invalid" : + "valid", lsm, lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_layout_gen); + return 0; +} + +static int lov_print_foreign(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_stripe_md *lsm = lov->lo_lsm; + + (*p)(env, cookie, + "foreign: %s, lsm{%p 0x%08X %d %u}:\n", + test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ? + "invalid" : "valid", lsm, + lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_layout_gen); + (*p)(env, cookie, + "raw_ea_content '%.*s'\n", + (int)lsm->lsm_foreign_size, (char *)lsm_foreign(lsm)); + return 0; +} + +/** + * Implements cl_object_operations::coo_attr_get() method for an object + * without stripes (LLT_EMPTY layout type). + * + * The only attributes this layer is authoritative in this case is + * cl_attr::cat_blocks---it's 0. + */ +static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + attr->cat_blocks = 0; + return 0; +} + +static int lov_attr_get_composite(const struct lu_env *env, + struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_layout_entry *entry; + int result = 0; + + ENTRY; + + attr->cat_size = 0; + attr->cat_blocks = 0; + attr->cat_kms = 0; + + lov_foreach_layout_entry(lov, entry) { + struct cl_attr *lov_attr = NULL; + int index = lov_layout_entry_index(lov, entry); + + if (!entry->lle_valid) + continue; + + /* PFL: This component has not been init-ed. 
*/ + if (!lsm_entry_inited(lov->lo_lsm, index)) + continue; + + result = entry->lle_comp_ops->lco_getattr(env, lov, index, + entry, &lov_attr); + if (result < 0) + RETURN(result); + + if (lov_attr == NULL) + continue; + + CDEBUG(D_INODE, "COMP ID #%i: s=%llu m=%llu a=%llu c=%llu " + "b=%llu\n", index - 1, lov_attr->cat_size, + lov_attr->cat_mtime, lov_attr->cat_atime, + lov_attr->cat_ctime, lov_attr->cat_blocks); + + /* merge results */ + attr->cat_blocks += lov_attr->cat_blocks; + if (attr->cat_size < lov_attr->cat_size) + attr->cat_size = lov_attr->cat_size; + if (attr->cat_kms < lov_attr->cat_kms) + attr->cat_kms = lov_attr->cat_kms; + if (attr->cat_atime < lov_attr->cat_atime) + attr->cat_atime = lov_attr->cat_atime; + if (attr->cat_ctime < lov_attr->cat_ctime) + attr->cat_ctime = lov_attr->cat_ctime; + if (attr->cat_mtime < lov_attr->cat_mtime) + attr->cat_mtime = lov_attr->cat_mtime; + } + + RETURN(0); +} + +static int lov_flush_composite(const struct lu_env *env, + struct cl_object *obj, + struct ldlm_lock *lock) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_layout_entry *lle; + int rc = -ENODATA; + + ENTRY; + + lov_foreach_layout_entry(lov, lle) { + if (!lsme_is_dom(lle->lle_lsme)) + continue; + rc = cl_object_flush(env, lovsub2cl(lle->lle_dom.lo_dom), lock); + break; + } + + RETURN(rc); +} + +static int lov_flush_empty(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + return 0; +} + +const static struct lov_layout_operations lov_dispatch[] = { + [LLT_EMPTY] = { + .llo_init = lov_init_empty, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_empty, + .llo_print = lov_print_empty, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_empty, + .llo_getattr = lov_attr_get_empty, + .llo_flush = lov_flush_empty, + }, + [LLT_RELEASED] = { + .llo_init = lov_init_released, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_released, + .llo_print = lov_print_released, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_released, + .llo_getattr = lov_attr_get_empty, + .llo_flush = lov_flush_empty, + }, + [LLT_COMP] = { + .llo_init = lov_init_composite, + .llo_delete = lov_delete_composite, + .llo_fini = lov_fini_composite, + .llo_print = lov_print_composite, + .llo_page_init = lov_page_init_composite, + .llo_lock_init = lov_lock_init_composite, + .llo_io_init = lov_io_init_composite, + .llo_getattr = lov_attr_get_composite, + .llo_flush = lov_flush_composite, + }, + [LLT_FOREIGN] = { + .llo_init = lov_init_foreign, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_released, + .llo_print = lov_print_foreign, + .llo_page_init = lov_page_init_foreign, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_empty, + .llo_getattr = lov_attr_get_empty, + .llo_flush = lov_flush_empty, + }, +}; + +/** + * Performs a double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH_NOLOCK(obj, op, ...) 
\ +({ \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + __llt = __obj->lo_type; \ + LASSERT(__llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ +}) + +/** + * Return lov_layout_type associated with a given lsm + */ +static enum lov_layout_type lov_type(struct lov_stripe_md *lsm) +{ + if (lsm == NULL) + return LLT_EMPTY; + + if (lsm->lsm_is_released) + return LLT_RELEASED; + + if (lsm->lsm_magic == LOV_MAGIC_V1 || + lsm->lsm_magic == LOV_MAGIC_V3 || + lsm->lsm_magic == LOV_MAGIC_COMP_V1) + return LLT_COMP; + + if (lsm->lsm_magic == LOV_MAGIC_FOREIGN) + return LLT_FOREIGN; + + return LLT_EMPTY; +} + +static inline void lov_conf_freeze(struct lov_object *lov) +{ + CDEBUG(D_INODE, "To take share lov(%p) owner %p/%p\n", + lov, lov->lo_owner, current); + if (lov->lo_owner != current) + down_read(&lov->lo_type_guard); +} + +static inline void lov_conf_thaw(struct lov_object *lov) +{ + CDEBUG(D_INODE, "To release share lov(%p) owner %p/%p\n", + lov, lov->lo_owner, current); + if (lov->lo_owner != current) + up_read(&lov->lo_type_guard); +} + +#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...) \ +({ \ + struct lov_object *__obj = (obj); \ + int __lock = !!(lock); \ + typeof(lov_dispatch[0].op(__VA_ARGS__)) __result; \ + \ + if (__lock) \ + lov_conf_freeze(__obj); \ + __result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__); \ + if (__lock) \ + lov_conf_thaw(__obj); \ + __result; \ +}) + +/** + * Performs a locked double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH(obj, op, ...) \ + LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__) + +#define LOV_2DISPATCH_VOID(obj, op, ...) \ +do { \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + lov_conf_freeze(__obj); \ + __llt = __obj->lo_type; \ + LASSERT(__llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ + lov_conf_thaw(__obj); \ +} while (0) + +static void lov_conf_lock(struct lov_object *lov) +{ + LASSERT(lov->lo_owner != current); + down_write(&lov->lo_type_guard); + LASSERT(lov->lo_owner == NULL); + lov->lo_owner = current; + CDEBUG(D_INODE, "Took exclusive lov(%p) owner %p\n", + lov, lov->lo_owner); +} + +static void lov_conf_unlock(struct lov_object *lov) +{ + CDEBUG(D_INODE, "To release exclusive lov(%p) owner %p\n", + lov, lov->lo_owner); + lov->lo_owner = NULL; + up_write(&lov->lo_type_guard); +} + +static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov) +{ + ENTRY; + + while (atomic_read(&lov->lo_active_ios) > 0) { + CDEBUG(D_INODE, "file:"DFID" wait for active IO, now: %d.\n", + PFID(lu_object_fid(lov2lu(lov))), + atomic_read(&lov->lo_active_ios)); + + wait_event_idle(lov->lo_waitq, + atomic_read(&lov->lo_active_ios) == 0); + } + RETURN(0); +} + +static int lov_layout_change(const struct lu_env *unused, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf) +{ + enum lov_layout_type llt = lov_type(lsm); + union lov_layout_state *state = &lov->u; + const struct lov_layout_operations *old_ops; + const struct lov_layout_operations *new_ops; + struct lov_device *lov_dev = lov_object_dev(lov); + struct lu_env *env; + __u16 refcheck; + int rc; + ENTRY; + + LASSERT(lov->lo_type < ARRAY_SIZE(lov_dispatch)); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + LASSERT(llt < ARRAY_SIZE(lov_dispatch)); + + CDEBUG(D_INODE, DFID" from %s to %s\n", + PFID(lu_object_fid(lov2lu(lov))), + llt2str(lov->lo_type), llt2str(llt)); + + old_ops = 
&lov_dispatch[lov->lo_type]; + new_ops = &lov_dispatch[llt]; + + rc = cl_object_prune(env, &lov->lo_cl); + if (rc != 0) + GOTO(out, rc); + + rc = old_ops->llo_delete(env, lov, &lov->u); + if (rc != 0) + GOTO(out, rc); + + old_ops->llo_fini(env, lov, &lov->u); + + LASSERT(atomic_read(&lov->lo_active_ios) == 0); + + CDEBUG(D_INODE, DFID "Apply new layout lov %p, type %d\n", + PFID(lu_object_fid(lov2lu(lov))), lov, llt); + + /* page bufsize fixup */ + cl_object_header(&lov->lo_cl)->coh_page_bufsize -= + lov_page_slice_fixup(lov, NULL); + + lov->lo_type = llt; + rc = new_ops->llo_init(env, lov_dev, lov, lsm, conf, state); + if (rc != 0) { + struct obd_device *obd = lov2obd(lov_dev->ld_lov); + + CERROR("%s: cannot apply new layout on "DFID" : rc = %d\n", + obd->obd_name, PFID(lu_object_fid(lov2lu(lov))), rc); + new_ops->llo_delete(env, lov, state); + new_ops->llo_fini(env, lov, state); + /* this file becomes an EMPTY file. */ + lov->lo_type = LLT_EMPTY; + GOTO(out, rc); + } + +out: + cl_env_put(env, &refcheck); + RETURN(rc); +} + +/***************************************************************************** + * + * Lov object operations. + * + */ +static int lov_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lov_object *lov = lu2lov(obj); + struct lov_device *dev = lov_object_dev(lov); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + union lov_layout_state *set = &lov->u; + const struct lov_layout_operations *ops; + struct lov_stripe_md *lsm = NULL; + int rc; + ENTRY; + + init_rwsem(&lov->lo_type_guard); + atomic_set(&lov->lo_active_ios, 0); + init_waitqueue_head(&lov->lo_waitq); + cl_object_page_init(lu2cl(obj), sizeof(struct lov_page)); + + lov->lo_type = LLT_EMPTY; + if (cconf->u.coc_layout.lb_buf != NULL) { + lsm = lov_unpackmd(dev->ld_lov, + cconf->u.coc_layout.lb_buf, + cconf->u.coc_layout.lb_len); + if (IS_ERR(lsm)) + RETURN(PTR_ERR(lsm)); + + dump_lsm(D_INODE, lsm); + } + + /* no locking is necessary, as object is being created */ + lov->lo_type = lov_type(lsm); + ops = &lov_dispatch[lov->lo_type]; + rc = ops->llo_init(env, dev, lov, lsm, cconf, set); + if (rc != 0) + GOTO(out_lsm, rc); + +out_lsm: + lov_lsm_put(lsm); + + RETURN(rc); +} + +static int lov_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lov_stripe_md *lsm = NULL; + struct lov_object *lov = cl2lov(obj); + int result = 0; + ENTRY; + + if (conf->coc_opc == OBJECT_CONF_SET && + conf->u.coc_layout.lb_buf != NULL) { + lsm = lov_unpackmd(lov_object_dev(lov)->ld_lov, + conf->u.coc_layout.lb_buf, + conf->u.coc_layout.lb_len); + if (IS_ERR(lsm)) + RETURN(PTR_ERR(lsm)); + dump_lsm(D_INODE, lsm); + } + + if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { + set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags); + GOTO(out_lsm, result = 0); + } + + lov_conf_lock(lov); + if (conf->coc_opc == OBJECT_CONF_WAIT) { + if (test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) && + atomic_read(&lov->lo_active_ios) > 0) { + lov_conf_unlock(lov); + result = lov_layout_wait(env, lov); + lov_conf_lock(lov); + } + GOTO(out, result); + } + + LASSERT(conf->coc_opc == OBJECT_CONF_SET); + + if ((lsm == NULL && lov->lo_lsm == NULL) || + ((lsm != NULL && lov->lo_lsm != NULL) && + (lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen) && + (lov->lo_lsm->lsm_flags == lsm->lsm_flags) && + (lov->lo_lsm->lsm_entries[0]->lsme_pattern == + lsm->lsm_entries[0]->lsme_pattern))) { + /* same version of layout */ + clear_bit(LO_LAYOUT_INVALID, 
&lov->lo_obj_flags); + GOTO(out, result = 0); + } + + /* will change layout - check if there still exists active IO. */ + if (atomic_read(&lov->lo_active_ios) > 0) { + set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags); + GOTO(out, result = -EBUSY); + } + + result = lov_layout_change(env, lov, lsm, conf); + if (result) + set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags); + else + clear_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags); + EXIT; + +out: + lov_conf_unlock(lov); +out_lsm: + lov_lsm_put(lsm); + CDEBUG(D_INODE, DFID" lo_layout_invalid=%u\n", + PFID(lu_object_fid(lov2lu(lov))), + test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags)); + RETURN(result); +} + +static void lov_object_delete(const struct lu_env *env, struct lu_object *obj) +{ + struct lov_object *lov = lu2lov(obj); + + ENTRY; + LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, &lov->u); + EXIT; +} + +static void lov_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lov_object *lov = lu2lov(obj); + + ENTRY; + LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, &lov->u); + lu_object_fini(obj); + OBD_SLAB_FREE_PTR(lov, lov_object_kmem); + EXIT; +} + +static int lov_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + return LOV_2DISPATCH_NOLOCK(lu2lov(o), llo_print, env, cookie, p, o); +} + +static int lov_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_page_init, env, obj, page, + index); +} + +/** + * Implements cl_object_operations::clo_io_init() method for lov + * layer. Dispatches to the appropriate layout io initialization method. + */ +static int lov_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + CL_IO_SLICE_CLEAN(lov_env_io(env), lis_preserved); + + CDEBUG(D_INODE, DFID "io %p type %d ignore/verify layout %d/%d\n", + PFID(lu_object_fid(&obj->co_lu)), io, io->ci_type, + io->ci_ignore_layout, io->ci_verify_layout); + + /* IO type CIT_MISC with ci_ignore_layout set are usually invoked from + * the OSC layer. It shouldn't take lov layout conf lock in that case, + * because as long as the OSC object exists, the layout can't be + * reconfigured. */ + return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init, + !(io->ci_ignore_layout && io->ci_type == CIT_MISC), + env, obj, io); +} + +/** + * An implementation of cl_object_operations::clo_attr_get() method for lov + * layer. For raid0 layout this collects and merges attributes of all + * sub-objects. + */ +static int lov_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + /* do not take lock, as this function is called under a + * spin-lock. Layout is protected from changing by ongoing IO. */ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr); +} + +static int lov_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + /* + * No dispatch is required here, as no layout implements this. + */ + return 0; +} + +static int lov_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + /* No need to lock because we've taken one refcount of layout. */ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_lock_init, env, obj, lock, + io); +} + +/** + * We calculate on which OST the mapping will end. If the length of mapping + * is greater than (stripe_size * stripe_count) then the last_stripe will + * will be one just before start_stripe. 
Else we check if the mapping + * intersects each OST and find last_stripe. + * This function returns the last_stripe and also sets the stripe_count + * over which the mapping is spread + * + * \param lsm [in] striping information for the file + * \param index [in] stripe component index + * \param ext [in] logical extent of mapping + * \param start_stripe [in] starting stripe of the mapping + * \param stripe_count [out] the number of stripes across which to map is + * returned + * + * \retval last_stripe return the last stripe of the mapping + */ +static int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, int index, + struct lu_extent *ext, + int start_stripe, int *stripe_count) +{ + struct lov_stripe_md_entry *lsme = lsm->lsm_entries[index]; + int init_stripe; + int last_stripe; + int i, j; + + init_stripe = lov_stripe_number(lsm, index, ext->e_start); + + if (ext->e_end - ext->e_start > + lsme->lsme_stripe_size * lsme->lsme_stripe_count) { + if (init_stripe == start_stripe) { + last_stripe = (start_stripe < 1) ? + lsme->lsme_stripe_count - 1 : start_stripe - 1; + *stripe_count = lsme->lsme_stripe_count; + } else if (init_stripe < start_stripe) { + last_stripe = (init_stripe < 1) ? + lsme->lsme_stripe_count - 1 : init_stripe - 1; + *stripe_count = lsme->lsme_stripe_count - + (start_stripe - init_stripe); + } else { + last_stripe = init_stripe - 1; + *stripe_count = init_stripe - start_stripe; + } + } else { + for (j = 0, i = start_stripe; j < lsme->lsme_stripe_count; + i = (i + 1) % lsme->lsme_stripe_count, j++) { + if (!lov_stripe_intersects(lsm, index, i, ext, NULL, + NULL)) + break; + if ((start_stripe != init_stripe) && (i == init_stripe)) + break; + } + *stripe_count = j; + last_stripe = (start_stripe + j - 1) % lsme->lsme_stripe_count; + } + + return last_stripe; +} + +/** + * Set fe_device and copy extents from local buffer into main return buffer. + * + * \param fiemap [out] fiemap to hold all extents + * \param lcl_fm_ext [in] array of fiemap extents get from OSC layer + * \param ost_index [in] OST index to be written into the fm_device + * field for each extent + * \param ext_count [in] number of extents to be copied + * \param current_extent [in] where to start copying in the extent array + */ +static void fiemap_prepare_and_copy_exts(struct fiemap *fiemap, + struct fiemap_extent *lcl_fm_ext, + int ost_index, unsigned int ext_count, + int current_extent, int abs_stripeno) +{ + char *to; + unsigned int ext; + + for (ext = 0; ext < ext_count; ext++) { + set_fe_device_stripenr(&lcl_fm_ext[ext], ost_index, + abs_stripeno); + lcl_fm_ext[ext].fe_flags |= FIEMAP_EXTENT_NET; + } + + /* Copy fm_extent's from fm_local to return buffer */ + to = (char *)fiemap + fiemap_count_to_size(current_extent); + memcpy(to, lcl_fm_ext, ext_count * sizeof(struct fiemap_extent)); +} + +#define FIEMAP_BUFFER_SIZE 4096 + +/** + * Non-zero fe_logical indicates that this is a continuation FIEMAP + * call. The local end offset and the device are sent in the first + * fm_extent. This function calculates the stripe number from the index. + * This function returns a stripe_no on which mapping is to be restarted. + * + * This function returns fm_end_offset which is the in-OST offset at which + * mapping should be restarted. If fm_end_offset=0 is returned then caller + * will re-calculate proper offset in next stripe. + * Note that the first extent is passed to lov_get_info via the value field. 
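In other words, a continuation call carries its resume point inside the first extent: the last in-stripe end offset in fe_logical and the stripe number folded into the upper bits of fe_device (the set_fe_device_stripenr()/get_fe_stripenr() helpers used elsewhere in this file). A hedged sketch of that encoding, assuming the stripe number occupies the upper 16 bits of a 32-bit device field as the comments in this file describe; the helper names below are illustrative, not the driver's.

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Illustrative only: pack an OST/device index and a stripe number into
	 * one 32-bit fe_device-style field, stripe number in the upper 16 bits.
	 */
	static uint32_t pack_device_stripenr(uint16_t dev_idx, uint16_t stripenr)
	{
		return ((uint32_t)stripenr << 16) | dev_idx;
	}

	static uint16_t unpack_stripenr(uint32_t fe_device)
	{
		return (uint16_t)(fe_device >> 16);
	}

	static uint16_t unpack_device(uint32_t fe_device)
	{
		return (uint16_t)(fe_device & 0xffff);
	}

	int main(void)
	{
		uint32_t fe_device = pack_device_stripenr(7, 42);

		printf("device=%u stripe=%u\n",
		       unpack_device(fe_device), unpack_stripenr(fe_device));
		return 0;
	}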
+ * + * \param fiemap [in] fiemap request header + * \param lsm [in] striping information for the file + * \param index [in] stripe component index + * \param ext [in] logical extent of mapping + * \param start_stripe [out] starting stripe will be returned in this + */ +static u64 fiemap_calc_fm_end_offset(struct fiemap *fiemap, + struct lov_stripe_md *lsm, + int index, struct lu_extent *ext, + int *start_stripe) +{ + struct lov_stripe_md_entry *lsme = lsm->lsm_entries[index]; + u64 local_end = fiemap->fm_extents[0].fe_logical; + u64 lun_end; + u64 fm_end_offset; + int stripe_no = -1; + + if (fiemap->fm_extent_count == 0 || + fiemap->fm_extents[0].fe_logical == 0) + return 0; + + stripe_no = *start_stripe; + + if (stripe_no == -1) + return -EINVAL; + + /* If we have finished mapping on previous device, shift logical + * offset to start of next device */ + if (lov_stripe_intersects(lsm, index, stripe_no, ext, NULL, &lun_end) && + local_end < lun_end) { + fm_end_offset = local_end; + } else { + /* This is a special value to indicate that caller should + * calculate offset in next stripe. */ + fm_end_offset = 0; + *start_stripe = (stripe_no + 1) % lsme->lsme_stripe_count; + } + + return fm_end_offset; +} + +struct fiemap_state { + struct fiemap *fs_fm; + struct lu_extent fs_ext; /* current entry extent */ + u64 fs_length; + u64 fs_end_offset; /* last iteration offset */ + int fs_cur_extent; /* collected exts so far */ + int fs_cnt_need; /* # of extents buf can hold */ + int fs_start_stripe; + int fs_last_stripe; + bool fs_device_done; /* enough for this OST */ + bool fs_finish_stripe; /* reached fs_last_stripe */ + bool fs_enough; /* enough for this call */ +}; + +static struct cl_object *lov_find_subobj(const struct lu_env *env, + struct lov_object *lov, + struct lov_stripe_md *lsm, + int index) +{ + struct lov_device *dev = lu2lov_dev(lov2lu(lov)->lo_dev); + struct lov_thread_info *lti = lov_env_info(env); + struct lu_fid *ofid = <i->lti_fid; + struct lov_oinfo *oinfo; + struct cl_device *subdev; + int entry = lov_comp_entry(index); + int stripe = lov_comp_stripe(index); + int ost_idx; + int rc; + struct cl_object *result; + + if (lov->lo_type != LLT_COMP) + GOTO(out, result = NULL); + + if (entry >= lsm->lsm_entry_count || + stripe >= lsm->lsm_entries[entry]->lsme_stripe_count) + GOTO(out, result = NULL); + + oinfo = lsm->lsm_entries[entry]->lsme_oinfo[stripe]; + ost_idx = oinfo->loi_ost_idx; + rc = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx); + if (rc != 0) + GOTO(out, result = NULL); + + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); + result = lov_sub_find(env, subdev, ofid, NULL); +out: + if (result == NULL) + result = ERR_PTR(-EINVAL); + return result; +} + +static int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj, + struct lov_stripe_md *lsm, struct fiemap *fiemap, + size_t *buflen, struct ll_fiemap_info_key *fmkey, + int index, int stripe_last, int stripeno, + struct fiemap_state *fs) +{ + struct lov_stripe_md_entry *lsme = lsm->lsm_entries[index]; + struct cl_object *subobj; + struct lov_obd *lov = lu2lov_dev(obj->co_lu.lo_dev)->ld_lov; + struct fiemap_extent *fm_ext = &fs->fs_fm->fm_extents[0]; + u64 req_fm_len; /* max requested extent coverage */ + u64 len_mapped_single_call; + u64 obd_start; + u64 obd_end; + unsigned int ext_count; + /* EOF for object */ + bool ost_eof = false; + /* done with required mapping for this OST? 
*/ + bool ost_done = false; + int ost_index; + int rc = 0; + + fs->fs_device_done = false; + /* Find out range of mapping on this stripe */ + if ((lov_stripe_intersects(lsm, index, stripeno, &fs->fs_ext, + &obd_start, &obd_end)) == 0) + return 0; + + if (lov_oinfo_is_dummy(lsme->lsme_oinfo[stripeno])) + return -EIO; + + /* If this is a continuation FIEMAP call and we are on + * starting stripe then obd_start needs to be set to + * end_offset */ + if (fs->fs_end_offset != 0 && stripeno == fs->fs_start_stripe) + obd_start = fs->fs_end_offset; + + if (lov_size_to_stripe(lsm, index, fs->fs_ext.e_end, stripeno) == + obd_start) + return 0; + + req_fm_len = obd_end - obd_start + 1; + fs->fs_fm->fm_length = 0; + len_mapped_single_call = 0; + + /* find lobsub object */ + subobj = lov_find_subobj(env, cl2lov(obj), lsm, + lov_comp_index(index, stripeno)); + if (IS_ERR(subobj)) + return PTR_ERR(subobj); + /* If the output buffer is very large and the objects have many + * extents we may need to loop on a single OST repeatedly */ + do { + if (fiemap->fm_extent_count > 0) { + /* Don't get too many extents. */ + if (fs->fs_cur_extent + fs->fs_cnt_need > + fiemap->fm_extent_count) + fs->fs_cnt_need = fiemap->fm_extent_count - + fs->fs_cur_extent; + } + + obd_start += len_mapped_single_call; + fs->fs_fm->fm_length = req_fm_len - len_mapped_single_call; + req_fm_len = fs->fs_fm->fm_length; + /** + * If we've collected enough extent map, we'd request 1 more, + * to see whether we coincidentally finished all available + * extent map, so that FIEMAP_EXTENT_LAST would be set. + */ + fs->fs_fm->fm_extent_count = fs->fs_enough ? + 1 : fs->fs_cnt_need; + fs->fs_fm->fm_mapped_extents = 0; + fs->fs_fm->fm_flags = fiemap->fm_flags; + + ost_index = lsme->lsme_oinfo[stripeno]->loi_ost_idx; + + if (ost_index < 0 || ost_index >= lov->desc.ld_tgt_count) + GOTO(obj_put, rc = -EINVAL); + /* If OST is inactive, return extent with UNKNOWN flag. */ + if (!lov->lov_tgts[ost_index]->ltd_active) { + fs->fs_fm->fm_flags |= FIEMAP_EXTENT_LAST; + fs->fs_fm->fm_mapped_extents = 1; + + fm_ext[0].fe_logical = obd_start; + fm_ext[0].fe_length = obd_end - obd_start + 1; + fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN; + + goto inactive_tgt; + } + + fs->fs_fm->fm_start = obd_start; + fs->fs_fm->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER; + memcpy(&fmkey->lfik_fiemap, fs->fs_fm, sizeof(*fs->fs_fm)); + *buflen = fiemap_count_to_size(fs->fs_fm->fm_extent_count); + + rc = cl_object_fiemap(env, subobj, fmkey, fs->fs_fm, buflen); + if (rc != 0) + GOTO(obj_put, rc); +inactive_tgt: + ext_count = fs->fs_fm->fm_mapped_extents; + if (ext_count == 0) { + ost_done = true; + fs->fs_device_done = true; + /* If last stripe has hold at the end, + * we need to return */ + if (stripeno == fs->fs_last_stripe) { + fiemap->fm_mapped_extents = 0; + fs->fs_finish_stripe = true; + GOTO(obj_put, rc); + } + break; + } else if (fs->fs_enough) { + /* + * We've collected enough extents and there are + * more extents after it. + */ + GOTO(obj_put, rc); + } + + /* If we just need num of extents, got to next device */ + if (fiemap->fm_extent_count == 0) { + fs->fs_cur_extent += ext_count; + break; + } + + /* prepare to copy retrived map extents */ + len_mapped_single_call = fm_ext[ext_count - 1].fe_logical + + fm_ext[ext_count - 1].fe_length - + obd_start; + + /* Have we finished mapping on this device? 
*/ + if (req_fm_len <= len_mapped_single_call) { + ost_done = true; + fs->fs_device_done = true; + } + + /* Clear the EXTENT_LAST flag which can be present on + * the last extent */ + if (fm_ext[ext_count - 1].fe_flags & FIEMAP_EXTENT_LAST) + fm_ext[ext_count - 1].fe_flags &= ~FIEMAP_EXTENT_LAST; + if (lov_stripe_size(lsm, index, + fm_ext[ext_count - 1].fe_logical + + fm_ext[ext_count - 1].fe_length, + stripeno) >= fmkey->lfik_oa.o_size) { + ost_eof = true; + fs->fs_device_done = true; + } + + fiemap_prepare_and_copy_exts(fiemap, fm_ext, ost_index, + ext_count, fs->fs_cur_extent, + stripe_last + stripeno); + fs->fs_cur_extent += ext_count; + + /* Ran out of available extents? */ + if (fs->fs_cur_extent >= fiemap->fm_extent_count) + fs->fs_enough = true; + } while (!ost_done && !ost_eof); + + if (stripeno == fs->fs_last_stripe) + fs->fs_finish_stripe = true; +obj_put: + cl_object_put(env, subobj); + + return rc; +} + +/** + * Break down the FIEMAP request and send appropriate calls to individual OSTs. + * This also handles the restarting of FIEMAP calls in case mapping overflows + * the available number of extents in single call. + * + * \param env [in] lustre environment + * \param obj [in] file object + * \param fmkey [in] fiemap request header and other info + * \param fiemap [out] fiemap buffer holding retrived map extents + * \param buflen [in/out] max buffer length of @fiemap, when iterate + * each OST, it is used to limit max map needed + * \retval 0 success + * \retval < 0 error + */ +static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, + struct fiemap *fiemap, size_t *buflen) +{ + struct lov_stripe_md_entry *lsme; + struct lov_stripe_md *lsm; + struct fiemap *fm_local = NULL; + loff_t whole_start; + loff_t whole_end; + int entry; + int start_entry = -1; + int end_entry; + int cur_stripe = 0; + int stripe_count; + unsigned int buffer_size = FIEMAP_BUFFER_SIZE; + int rc = 0; + struct fiemap_state fs = { 0 }; + struct lu_extent range; + int cur_ext; + int stripe_last; + int start_stripe = 0; + bool resume = false; + ENTRY; + + lsm = lov_lsm_addref(cl2lov(obj)); + if (lsm == NULL) { + /* no extent: there is no object for mapping */ + fiemap->fm_mapped_extents = 0; + return 0; + } + + if (!(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) { + /** + * If the entry count > 1 or stripe_count > 1 and the + * application does not understand DEVICE_ORDER flag, + * it cannot interpret the extents correctly. + */ + if (lsm->lsm_entry_count > 1 || + (lsm->lsm_entry_count == 1 && + lsm->lsm_entries[0]->lsme_stripe_count > 1)) + GOTO(out_lsm, rc = -ENOTSUPP); + } + + /* No support for DOM layout yet. */ + if (lsme_is_dom(lsm->lsm_entries[0])) + GOTO(out_lsm, rc = -ENOTSUPP); + + if (lsm->lsm_is_released) { + if (fiemap->fm_start < fmkey->lfik_oa.o_size) { + /** + * released file, return a minimal FIEMAP if + * request fits in file-size. + */ + fiemap->fm_mapped_extents = 1; + fiemap->fm_extents[0].fe_logical = fiemap->fm_start; + if (fiemap->fm_start + fiemap->fm_length < + fmkey->lfik_oa.o_size) + fiemap->fm_extents[0].fe_length = + fiemap->fm_length; + else + fiemap->fm_extents[0].fe_length = + fmkey->lfik_oa.o_size - + fiemap->fm_start; + fiemap->fm_extents[0].fe_flags |= + FIEMAP_EXTENT_UNKNOWN | FIEMAP_EXTENT_LAST; + } + GOTO(out_lsm, rc = 0); + } + + /* buffer_size is small to hold fm_extent_count of extents. 
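The buffer bookkeeping just below converts between a byte count and a number of fiemap extents. A minimal sketch of that relation, assuming the conventional fiemap layout of a fixed header followed by an array of struct fiemap_extent, which is what the fiemap_count_to_size()/fiemap_size_to_count() helpers are presumed to compute.

	#include <linux/fiemap.h>   /* struct fiemap, struct fiemap_extent */
	#include <stddef.h>
	#include <stdio.h>

	/* bytes needed to hold a fiemap header plus extent_count extents */
	static size_t count_to_size(unsigned int extent_count)
	{
		return sizeof(struct fiemap) +
		       extent_count * sizeof(struct fiemap_extent);
	}

	/* how many extents fit into a buffer of buf_size bytes */
	static unsigned int size_to_count(size_t buf_size)
	{
		if (buf_size < sizeof(struct fiemap))
			return 0;
		return (buf_size - sizeof(struct fiemap)) /
		       sizeof(struct fiemap_extent);
	}

	int main(void)
	{
		printf("4096 bytes hold %u extents\n", size_to_count(4096));
		printf("64 extents need %zu bytes\n", count_to_size(64));
		return 0;
	}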
*/ + if (fiemap_count_to_size(fiemap->fm_extent_count) < buffer_size) + buffer_size = fiemap_count_to_size(fiemap->fm_extent_count); + + OBD_ALLOC_LARGE(fm_local, buffer_size); + if (fm_local == NULL) + GOTO(out_lsm, rc = -ENOMEM); + + /** + * Requested extent count exceeds the fiemap buffer size, shrink our + * ambition. + */ + if (fiemap_count_to_size(fiemap->fm_extent_count) > *buflen) + fiemap->fm_extent_count = fiemap_size_to_count(*buflen); + + fs.fs_enough = false; + fs.fs_cur_extent = 0; + fs.fs_fm = fm_local; + fs.fs_cnt_need = fiemap_size_to_count(buffer_size); + + whole_start = fiemap->fm_start; + /* whole_start is beyond the end of the file */ + if (whole_start > fmkey->lfik_oa.o_size) + GOTO(out_fm_local, rc = -EINVAL); + whole_end = (fiemap->fm_length == OBD_OBJECT_EOF) ? + fmkey->lfik_oa.o_size + 1 : + whole_start + fiemap->fm_length; + /** + * If fiemap->fm_length != OBD_OBJECT_EOF but whole_end exceeds file + * size + */ + if (whole_end > fmkey->lfik_oa.o_size + 1) + whole_end = fmkey->lfik_oa.o_size + 1; + + /** + * the high 16bits of fe_device remember which stripe the last + * call has been arrived, we'd continue from there in this call. + */ + if (fiemap->fm_extent_count && fiemap->fm_extents[0].fe_logical) + resume = true; + stripe_last = get_fe_stripenr(&fiemap->fm_extents[0]); + /** + * stripe_last records stripe number we've been processed in the last + * call + */ + end_entry = lsm->lsm_entry_count - 1; + cur_stripe = 0; + for (entry = 0; entry <= end_entry; entry++) { + lsme = lsm->lsm_entries[entry]; + if (cur_stripe + lsme->lsme_stripe_count >= stripe_last) { + start_entry = entry; + start_stripe = stripe_last - cur_stripe; + break; + } + + cur_stripe += lsme->lsme_stripe_count; + } + if (start_entry == -1) { + CERROR(DFID": FIEMAP does not init start entry, cur_stripe=%d, " + "stripe_last=%d\n", PFID(lu_object_fid(&obj->co_lu)), + cur_stripe, stripe_last); + GOTO(out_fm_local, rc = -EINVAL); + } + /** + * @start_entry & @start_stripe records the position of fiemap + * resumption @stripe_last keeps recording the absolution position + * we'are processing. @resume indicates we'd honor @start_stripe. 
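Resumption therefore has to translate the absolute stripe number saved by the previous call back into a (layout entry, stripe within that entry) pair by walking the per-entry stripe counts, which is what the scan above does. A simplified stand-alone version of that translation; the exact boundary handling in the driver differs slightly.

	#include <stdio.h>

	/*
	 * Map an absolute stripe number (counted across all layout entries)
	 * to an entry index and a stripe index inside that entry.
	 * Returns 0 on success, -1 if the stripe number is out of range.
	 */
	static int abs_stripe_to_entry(const unsigned int *stripes_per_entry,
				       int entry_count, unsigned int abs_stripe,
				       int *entry, unsigned int *stripe_in_entry)
	{
		unsigned int base = 0;
		int i;

		for (i = 0; i < entry_count; i++) {
			if (abs_stripe < base + stripes_per_entry[i]) {
				*entry = i;
				*stripe_in_entry = abs_stripe - base;
				return 0;
			}
			base += stripes_per_entry[i];
		}
		return -1;
	}

	int main(void)
	{
		/* e.g. a three-component layout with 1, 4 and 8 stripes */
		unsigned int counts[] = { 1, 4, 8 };
		int entry;
		unsigned int stripe;

		if (!abs_stripe_to_entry(counts, 3, 6, &entry, &stripe))
			printf("absolute stripe 6 -> entry %d, stripe %u\n",
			       entry, stripe);
		return 0;
	}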
+ */ + + range.e_start = whole_start; + range.e_end = whole_end; + + for (entry = start_entry; entry <= end_entry; entry++) { + /* remeber to update stripe_last accordingly */ + lsme = lsm->lsm_entries[entry]; + + /* FLR could contain component holes between entries */ + if (!lsme_inited(lsme)) { + stripe_last += lsme->lsme_stripe_count; + resume = false; + continue; + } + + if (!lu_extent_is_overlapped(&range, &lsme->lsme_extent)) { + stripe_last += lsme->lsme_stripe_count; + resume = false; + continue; + } + + /* prepare for a component entry iteration */ + if (lsme->lsme_extent.e_start > whole_start) + fs.fs_ext.e_start = lsme->lsme_extent.e_start; + else + fs.fs_ext.e_start = whole_start; + if (lsme->lsme_extent.e_end > whole_end) + fs.fs_ext.e_end = whole_end; + else + fs.fs_ext.e_end = lsme->lsme_extent.e_end; + + /* Calculate start stripe, last stripe and length of mapping */ + if (resume) { + fs.fs_start_stripe = start_stripe; + /* put stripe_last to the first stripe of the comp */ + stripe_last -= start_stripe; + resume = false; + } else { + fs.fs_start_stripe = lov_stripe_number(lsm, entry, + fs.fs_ext.e_start); + } + fs.fs_last_stripe = fiemap_calc_last_stripe(lsm, entry, + &fs.fs_ext, fs.fs_start_stripe, + &stripe_count); + /** + * A new mirror component is under process, reset + * fs.fs_end_offset and then fiemap_for_stripe() starts from + * the overlapping extent, otherwise starts from + * fs.fs_end_offset. + */ + if (entry > start_entry && lsme->lsme_extent.e_start == 0) { + /* new mirror */ + fs.fs_end_offset = 0; + } else { + fs.fs_end_offset = fiemap_calc_fm_end_offset(fiemap, + lsm, entry, &fs.fs_ext, + &fs.fs_start_stripe); + } + + /* Check each stripe */ + for (cur_stripe = fs.fs_start_stripe; stripe_count > 0; + --stripe_count, + cur_stripe = (cur_stripe + 1) % lsme->lsme_stripe_count) { + /* reset fs_finish_stripe */ + fs.fs_finish_stripe = false; + rc = fiemap_for_stripe(env, obj, lsm, fiemap, buflen, + fmkey, entry, stripe_last, + cur_stripe, &fs); + if (rc < 0) + GOTO(out_fm_local, rc); + if (fs.fs_enough) { + stripe_last += cur_stripe; + GOTO(finish, rc); + } + if (fs.fs_finish_stripe) + break; + } /* for each stripe */ + stripe_last += lsme->lsme_stripe_count; + } /* for covering layout component entry */ + +finish: + if (fs.fs_cur_extent > 0) + cur_ext = fs.fs_cur_extent - 1; + else + cur_ext = 0; + + /* done all the processing */ + if (entry > end_entry) + fiemap->fm_extents[cur_ext].fe_flags |= FIEMAP_EXTENT_LAST; + + /* Indicate that we are returning device offsets unless file just has + * single stripe */ + if (lsm->lsm_entry_count > 1 || + (lsm->lsm_entry_count == 1 && + lsm->lsm_entries[0]->lsme_stripe_count > 1)) + fiemap->fm_flags |= FIEMAP_FLAG_DEVICE_ORDER; + + if (fiemap->fm_extent_count == 0) + goto skip_last_device_calc; + +skip_last_device_calc: + fiemap->fm_mapped_extents = fs.fs_cur_extent; +out_fm_local: + OBD_FREE_LARGE(fm_local, buffer_size); + +out_lsm: + lov_lsm_put(lsm); + return rc; +} + +static int lov_object_getstripe(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum, size_t size) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_stripe_md *lsm; + int rc = 0; + ENTRY; + + lsm = lov_lsm_addref(lov); + if (lsm == NULL) + RETURN(-ENODATA); + + rc = lov_getstripe(env, cl2lov(obj), lsm, lum, size); + lov_lsm_put(lsm); + RETURN(rc); +} + +static int lov_object_layout_get(const struct lu_env *env, + struct cl_object *obj, + struct cl_layout *cl) +{ + struct lov_object *lov = cl2lov(obj); + struct 
lov_stripe_md *lsm = lov_lsm_addref(lov); + struct lu_buf *buf = &cl->cl_buf; + ssize_t rc; + ENTRY; + + if (lsm == NULL) { + cl->cl_size = 0; + cl->cl_layout_gen = CL_LAYOUT_GEN_EMPTY; + + RETURN(0); + } + + cl->cl_size = lov_comp_md_size(lsm); + cl->cl_layout_gen = lsm->lsm_layout_gen; + cl->cl_is_released = lsm->lsm_is_released; + cl->cl_is_composite = lsm_is_composite(lsm->lsm_magic); + + rc = lov_lsm_pack(lsm, buf->lb_buf, buf->lb_len); + lov_lsm_put(lsm); + + /* return error or number of bytes */ + RETURN(rc); +} + +static loff_t lov_object_maxbytes(struct cl_object *obj) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_stripe_md *lsm = lov_lsm_addref(lov); + loff_t maxbytes; + + if (lsm == NULL) + return LLONG_MAX; + + maxbytes = lsm->lsm_maxbytes; + + lov_lsm_put(lsm); + + return maxbytes; +} + +static int lov_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_flush, true, env, obj, + lock); +} + +static const struct cl_object_operations lov_ops = { + .coo_page_init = lov_page_init, + .coo_lock_init = lov_lock_init, + .coo_io_init = lov_io_init, + .coo_attr_get = lov_attr_get, + .coo_attr_update = lov_attr_update, + .coo_conf_set = lov_conf_set, + .coo_getstripe = lov_object_getstripe, + .coo_layout_get = lov_object_layout_get, + .coo_maxbytes = lov_object_maxbytes, + .coo_fiemap = lov_object_fiemap, + .coo_object_flush = lov_object_flush +}; + +static const struct lu_object_operations lov_lu_obj_ops = { + .loo_object_init = lov_object_init, + .loo_object_delete = lov_object_delete, + .loo_object_release = NULL, + .loo_object_free = lov_object_free, + .loo_object_print = lov_object_print, + .loo_object_invariant = NULL, +}; + +struct lu_object *lov_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct lov_object *lov; + struct lu_object *obj; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lov, lov_object_kmem, GFP_NOFS); + if (lov != NULL) { + obj = lov2lu(lov); + lu_object_init(obj, NULL, dev); + lov->lo_cl.co_ops = &lov_ops; + lov->lo_type = -1; /* invalid, to catch uninitialized type */ + /* + * object io operation vector (cl_object::co_iop) is installed + * later in lov_object_init(), as different vectors are used + * for object with different layouts. 
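This is the same pattern used throughout the file: lov_dispatch[] holds one operation vector per layout type and the LOV_2DISPATCH_* macros index it by lov_object::lo_type. A stripped-down sketch of that double dispatch, with invented type and function names.

	#include <assert.h>
	#include <stdio.h>

	enum layout_type { LT_EMPTY, LT_COMP, LT_NR };

	struct obj { enum layout_type type; };

	struct layout_ops {
		int (*init)(struct obj *o);
		void (*print)(const struct obj *o);
	};

	static int empty_init(struct obj *o) { (void)o; return 0; }
	static void empty_print(const struct obj *o) { (void)o; puts("empty"); }
	static int comp_init(struct obj *o) { (void)o; return 0; }
	static void comp_print(const struct obj *o) { (void)o; puts("composite"); }

	static const struct layout_ops dispatch[LT_NR] = {
		[LT_EMPTY] = { .init = empty_init, .print = empty_print },
		[LT_COMP]  = { .init = comp_init,  .print = comp_print  },
	};

	/* double dispatch: pick the vector by the object's current layout type */
	#define OBJ_DISPATCH(o, op, ...) \
		(assert((o)->type < LT_NR), dispatch[(o)->type].op(__VA_ARGS__))

	int main(void)
	{
		struct obj o = { .type = LT_COMP };

		OBJ_DISPATCH(&o, init, &o);
		OBJ_DISPATCH(&o, print, &o);
		return 0;
	}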
+ */ + obj->lo_ops = &lov_lu_obj_ops; + } else + obj = NULL; + RETURN(obj); +} + +static struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov) +{ + struct lov_stripe_md *lsm = NULL; + + lov_conf_freeze(lov); + if (lov->lo_lsm != NULL) { + lsm = lsm_addref(lov->lo_lsm); + CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n", + lsm, atomic_read(&lsm->lsm_refc), + test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags), + current); + } + lov_conf_thaw(lov); + return lsm; +} + +int lov_read_and_clear_async_rc(struct cl_object *clob) +{ + struct lu_object *luobj; + int rc = 0; + ENTRY; + + luobj = lu_object_locate(&cl_object_header(clob)->coh_lu, + &lov_device_type); + if (luobj != NULL) { + struct lov_object *lov = lu2lov(luobj); + + lov_conf_freeze(lov); + switch (lov->lo_type) { + case LLT_COMP: { + struct lov_stripe_md *lsm; + int i; + + lsm = lov->lo_lsm; + LASSERT(lsm != NULL); + for (i = 0; i < lsm->lsm_entry_count; i++) { + struct lov_stripe_md_entry *lse = + lsm->lsm_entries[i]; + int j; + + if (!lsme_inited(lse)) + break; + + for (j = 0; j < lse->lsme_stripe_count; j++) { + struct lov_oinfo *loi = + lse->lsme_oinfo[j]; + + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ar.ar_rc && !rc) + rc = loi->loi_ar.ar_rc; + loi->loi_ar.ar_rc = 0; + } + } + } + fallthrough; + case LLT_RELEASED: + case LLT_EMPTY: + case LLT_FOREIGN: + break; + default: + LBUG(); + } + lov_conf_thaw(lov); + } + RETURN(rc); +} +EXPORT_SYMBOL(lov_read_and_clear_async_rc); + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_offset.c b/drivers/staging/lustrefsx/lustre/lov/lov_offset.c new file mode 100644 index 0000000000000..86d4ae9745e07 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_offset.c @@ -0,0 +1,308 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include + +#include "lov_internal.h" + +loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index) +{ + struct lov_stripe_md_entry *entry = lsm->lsm_entries[index]; + + LASSERT(index < lsm->lsm_entry_count); + + if (lsme_is_dom(entry)) + return (loff_t)entry->lsme_stripe_size; + + return (loff_t)entry->lsme_stripe_size * entry->lsme_stripe_count; +} + +/* compute object size given "stripeno" and the ost size */ +u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, u64 ost_size, + int stripeno) +{ + unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size; + unsigned long stripe_size; + loff_t swidth; + loff_t lov_size; + + ENTRY; + + if (ost_size == 0) + RETURN(0); + + swidth = stripe_width(lsm, index); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_size = lov_do_div64(ost_size, ssize); + if (stripe_size) + lov_size = ost_size * swidth + stripeno * ssize + stripe_size; + else + lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize; + + RETURN(lov_size); +} + +/** + * Compute file level page index by stripe level page offset + */ +pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, int index, + pgoff_t stripe_index, int stripe) +{ + loff_t offset; + + offset = lov_stripe_size(lsm, index, + (stripe_index << PAGE_SHIFT) + 1, + stripe); + return offset >> PAGE_SHIFT; +} + +/* + * we have an offset in file backed by an lov and want to find out where + * that offset lands in our given stripe of the file. for the easy + * case where the offset is within the stripe, we just have to scale the + * offset down to make it relative to the stripe instead of the lov. + * + * the harder case is what to do when the offset doesn't intersect the + * stripe. callers will want start offsets clamped ahead to the start + * of the nearest stripe in the file. end offsets similarly clamped to the + * nearest ending byte of a stripe in the file: + * + * all this function does is move offsets to the nearest region of the + * stripe, and it does its work "mod" the full length of all the stripes. + * consider a file with 3 stripes: + * + * S E + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + * + * to find stripe 1's offsets for S and E, it divides by the full stripe + * width and does its math in the context of a single set of stripes: + * + * S E + * ----------------------------------- + * | 0 | 1 | 2 | + * ----------------------------------- + * + * it'll notice that E is outside stripe 1 and clamp it to the end of the + * stripe, then multiply it back out by lov_off to give the real offsets in + * the stripe: + * + * S E + * --------------------------------------------------------------------- + * | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------------------------------------------- + * + * it would have done similarly and pulled S forward to the start of a 1 + * stripe if, say, S had landed in a 0 stripe. + * + * this rounding isn't always correct. consider an E lov offset that lands + * on a 0 stripe, the "mod stripe width" math will pull it forward to the + * start of a 1 stripe, when in fact it wanted to be rounded back to the end + * of a previous 1 stripe. 
this logic is handled by callers and this is why: + * + * this function returns < 0 when the offset was "before" the stripe and + * was moved forward to the start of the stripe in question; 0 when it + * falls in the stripe and no shifting was done; > 0 when the offset + * was outside the stripe and was pulled back to its final byte. + */ +int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, + int stripeno, loff_t *obdoff) +{ + unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size; + loff_t stripe_off; + loff_t this_stripe; + loff_t swidth; + int ret = 0; + + if (lov_off == OBD_OBJECT_EOF) { + *obdoff = OBD_OBJECT_EOF; + return 0; + } + + swidth = stripe_width(lsm, index); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_off = lov_do_div64(lov_off, swidth); + + this_stripe = (loff_t)stripeno * ssize; + if (stripe_off < this_stripe) { + stripe_off = 0; + ret = -1; + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + stripe_off = ssize; + ret = 1; + } + } + + *obdoff = lov_off * ssize + stripe_off; + return ret; +} + +/* + * Given a whole-file size and a stripe number, give the file size which + * corresponds to the individual object of that stripe. + * + * This behaves basically in the same was as lov_stripe_offset, except that + * file sizes falling before the beginning of a stripe are clamped to the end + * of the previous stripe, not the beginning of the next: + * + * S + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + * + * if clamped to stripe 2 becomes: + * + * S + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + */ +loff_t lov_size_to_stripe(struct lov_stripe_md *lsm, int index, u64 file_size, + int stripeno) +{ + unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size; + loff_t stripe_off; + loff_t this_stripe; + loff_t swidth; + + if (file_size == OBD_OBJECT_EOF) + return OBD_OBJECT_EOF; + + swidth = stripe_width(lsm, index); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_off = lov_do_div64(file_size, swidth); + + this_stripe = (loff_t)stripeno * ssize; + if (stripe_off < this_stripe) { + /* Move to end of previous stripe, or zero */ + if (file_size > 0) { + file_size--; + stripe_off = ssize; + } else { + stripe_off = 0; + } + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + /* Clamp to end of this stripe */ + stripe_off = ssize; + } + } + + return (file_size * ssize + stripe_off); +} + +/* + * given an extent in an lov and a stripe, calculate the extent of the stripe + * that is contained within the lov extent. this returns true if the given + * stripe does intersect with the lov extent. + * + * Closed interval [@obd_start, @obd_end] will be returned if caller needs them. 
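To make the clamping and modulo rules above concrete, here is a small user-space version of the core arithmetic: a file offset is reduced modulo the stripe width (stripe_size * stripe_count) to find the stripe it lands in, and the in-object offset is rebuilt from the number of complete widths already passed. This is an illustrative sketch of the math described above, not the kernel helpers themselves; it omits the clamping that lov_stripe_offset() applies when the offset falls outside the requested stripe.

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Given a file offset, a stripe size and a stripe count, compute which
	 * stripe the offset lands in and the corresponding offset inside that
	 * stripe's backing object.
	 */
	static void file_off_to_stripe(uint64_t file_off, uint64_t stripe_size,
				       unsigned int stripe_count,
				       unsigned int *stripeno, uint64_t *obj_off)
	{
		uint64_t swidth = stripe_size * stripe_count;   /* full stripe width */
		uint64_t widths = file_off / swidth;            /* complete widths   */
		uint64_t in_width = file_off % swidth;          /* offset in width   */

		*stripeno = (unsigned int)(in_width / stripe_size);
		*obj_off = widths * stripe_size + in_width % stripe_size;
	}

	int main(void)
	{
		unsigned int stripe;
		uint64_t obj_off;

		/* 1 MiB stripes over 3 OSTs, file offset 7 MiB + 4096 */
		file_off_to_stripe(7 * (1ULL << 20) + 4096, 1ULL << 20, 3,
				   &stripe, &obj_off);
		printf("stripe %u, object offset %llu\n",
		       stripe, (unsigned long long)obj_off);
		return 0;
	}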
+ */ +int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno, + struct lu_extent *ext, u64 *obd_start, u64 *obd_end) +{ + struct lov_stripe_md_entry *entry = lsm->lsm_entries[index]; + u64 start, end; + int start_side, end_side; + u64 loc_start, loc_end; + + if (!lu_extent_is_overlapped(ext, &entry->lsme_extent)) + return 0; + + if (!obd_start) + obd_start = &loc_start; + if (!obd_end) + obd_end = &loc_end; + + start = max_t(__u64, ext->e_start, entry->lsme_extent.e_start); + end = min_t(__u64, ext->e_end, entry->lsme_extent.e_end); + if (end != OBD_OBJECT_EOF) + end--; + + start_side = lov_stripe_offset(lsm, index, start, stripeno, obd_start); + end_side = lov_stripe_offset(lsm, index, end, stripeno, obd_end); + + CDEBUG(D_INODE, "[%lld->%lld] -> [(%d) %lld->%lld (%d)]\n", + start, end, start_side, *obd_start, *obd_end, end_side); + + /* + * this stripe doesn't intersect the file extent when neither + * start or the end intersected the stripe and obd_start and + * obd_end got rounded up to the save value. + */ + if (start_side != 0 && end_side != 0 && *obd_start == *obd_end) + return 0; + + /* + * as mentioned in the lov_stripe_offset commentary, end + * might have been shifted in the wrong direction. This + * happens when an end offset is before the stripe when viewed + * through the "mod stripe size" math. we detect it being shifted + * in the wrong direction and touch it up. + * interestingly, this can't underflow since end must be > start + * if we passed through the previous check. + * (should we assert for that somewhere?) + */ + if (end_side != 0) + (*obd_end)--; + + return 1; +} + +/* compute which stripe number "lov_off" will be written into */ +int lov_stripe_number(struct lov_stripe_md *lsm, int index, loff_t lov_off) +{ + unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size; + loff_t stripe_off; + loff_t swidth; + + swidth = stripe_width(lsm, index); + + stripe_off = lov_do_div64(lov_off, swidth); + + /* Puts stripe_off/ssize result into stripe_off */ + lov_do_div64(stripe_off, ssize); + + return stripe_off; +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pack.c b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c new file mode 100644 index 0000000000000..42f1446f046da --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c @@ -0,0 +1,483 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/lov/lov_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include +#include +#include +#include + +#include "lov_cl_internal.h" +#include "lov_internal.h" + +void lov_dump_lmm_common(int level, void *lmmp) +{ + struct lov_mds_md *lmm = lmmp; + struct ost_id oi; + + lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi); + CDEBUG_LIMIT(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", + POSTID(&oi), le32_to_cpu(lmm->lmm_magic), + le32_to_cpu(lmm->lmm_pattern)); + CDEBUG_LIMIT(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", + le32_to_cpu(lmm->lmm_stripe_size), + le16_to_cpu(lmm->lmm_stripe_count), + le16_to_cpu(lmm->lmm_layout_gen)); +} + +static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, + int stripe_count) +{ + int i; + + if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { + CDEBUG_LIMIT(level, + "bad stripe_count %u > max_stripe_count %u\n", + stripe_count, LOV_V1_INSANE_STRIPE_COUNT); + return; + } + + for (i = 0; i < stripe_count; ++i, ++lod) { + struct ost_id oi; + + ostid_le_to_cpu(&lod->l_ost_oi, &oi); + CDEBUG_LIMIT(level, "stripe %u idx %u subobj "DOSTID"\n", i, + le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); + } +} + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm) +{ + lov_dump_lmm_common(level, lmm); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); +} + +/** + * Pack LOV striping metadata for disk storage format (in little + * endian byte order). + * + * This follows the getxattr() conventions. If \a buf_size is zero + * then return the size needed. If \a buf_size is too small then + * return -ERANGE. Otherwise return the size of the result. 
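In practice a caller follows the usual two-step getxattr() convention: probe with a zero-sized buffer to learn the required size, allocate, then pack for real and treat -ERANGE as "buffer too small". A hedged usage sketch, with pack_layout() standing in for the packing routine described here.

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/types.h>

	/*
	 * Stand-in for the packing routine: with buf_size == 0 it returns the
	 * number of bytes needed, with a too-small buffer it returns -ERANGE,
	 * otherwise it fills the buffer and returns the size written.
	 */
	static ssize_t pack_layout(void *buf, size_t buf_size)
	{
		const char blob[] = "packed-layout-bytes";   /* pretend payload */

		if (buf_size == 0)
			return sizeof(blob);
		if (buf_size < sizeof(blob))
			return -ERANGE;
		memcpy(buf, blob, sizeof(blob));
		return sizeof(blob);
	}

	int main(void)
	{
		ssize_t need = pack_layout(NULL, 0);   /* step 1: size probe */
		void *buf;
		ssize_t rc;

		if (need < 0)
			return 1;
		buf = malloc(need);
		if (!buf)
			return 1;
		rc = pack_layout(buf, need);           /* step 2: real pack  */
		printf("packed %zd bytes\n", rc);
		free(buf);
		return 0;
	}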
+ */ +static ssize_t lov_lsm_pack_v1v3(const struct lov_stripe_md *lsm, void *buf, + size_t buf_size) +{ + struct lov_mds_md_v1 *lmmv1 = buf; + struct lov_mds_md_v3 *lmmv3 = buf; + struct lov_ost_data_v1 *lmm_objects; + size_t lmm_size; + unsigned int i; + + ENTRY; + + lmm_size = lov_mds_md_size(lsm->lsm_entries[0]->lsme_stripe_count, + lsm->lsm_magic); + if (buf_size == 0) + RETURN(lmm_size); + + if (buf_size < lmm_size) + RETURN(-ERANGE); + + /* + * lmmv1 and lmmv3 point to the same struct and have the + * same first fields + */ + lmmv1->lmm_magic = cpu_to_le32(lsm->lsm_magic); + lmm_oi_cpu_to_le(&lmmv1->lmm_oi, &lsm->lsm_oi); + lmmv1->lmm_stripe_size = cpu_to_le32( + lsm->lsm_entries[0]->lsme_stripe_size); + lmmv1->lmm_stripe_count = cpu_to_le16( + lsm->lsm_entries[0]->lsme_stripe_count); + lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_entries[0]->lsme_pattern); + lmmv1->lmm_layout_gen = cpu_to_le16(lsm->lsm_layout_gen); + + if (lsm->lsm_magic == LOV_MAGIC_V3) { + BUILD_BUG_ON(sizeof(lsm->lsm_entries[0]->lsme_pool_name) != + sizeof(lmmv3->lmm_pool_name)); + strlcpy(lmmv3->lmm_pool_name, + lsm->lsm_entries[0]->lsme_pool_name, + sizeof(lmmv3->lmm_pool_name)); + lmm_objects = lmmv3->lmm_objects; + } else { + lmm_objects = lmmv1->lmm_objects; + } + + if (lsm->lsm_is_released) + RETURN(lmm_size); + + for (i = 0; i < lsm->lsm_entries[0]->lsme_stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_entries[0]->lsme_oinfo[i]; + + ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi); + lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen); + lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx); + } + + RETURN(lmm_size); +} + +static ssize_t lov_lsm_pack_foreign(const struct lov_stripe_md *lsm, void *buf, + size_t buf_size) +{ + struct lov_foreign_md *lfm = buf; + size_t lfm_size; + + lfm_size = lsm->lsm_foreign_size; + + if (buf_size == 0) + RETURN(lfm_size); + + /* if buffer too small return ERANGE but copy the size the + * caller has requested anyway. 
This may be useful to get + * only the header without the need to alloc the full size + */ + if (buf_size < lfm_size) { + memcpy(lfm, lsm_foreign(lsm), buf_size); + RETURN(-ERANGE); + } + + /* full foreign LOV is already avail in its cache + * no need to translate format fields to little-endian + */ + memcpy(lfm, lsm_foreign(lsm), lsm->lsm_foreign_size); + + RETURN(lfm_size); +} + +ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, + size_t buf_size) +{ + struct lov_comp_md_v1 *lcmv1 = buf; + struct lov_comp_md_entry_v1 *lcme; + struct lov_ost_data_v1 *lmm_objects; + size_t lmm_size; + unsigned int entry; + unsigned int offset; + unsigned int size; + unsigned int i; + + ENTRY; + + if (lsm->lsm_magic == LOV_MAGIC_V1 || lsm->lsm_magic == LOV_MAGIC_V3) + return lov_lsm_pack_v1v3(lsm, buf, buf_size); + + if (lsm->lsm_magic == LOV_MAGIC_FOREIGN) + return lov_lsm_pack_foreign(lsm, buf, buf_size); + + lmm_size = lov_comp_md_size(lsm); + if (buf_size == 0) + RETURN(lmm_size); + + if (buf_size < lmm_size) + RETURN(-ERANGE); + + lcmv1->lcm_magic = cpu_to_le32(lsm->lsm_magic); + lcmv1->lcm_size = cpu_to_le32(lmm_size); + lcmv1->lcm_layout_gen = cpu_to_le32(lsm->lsm_layout_gen); + lcmv1->lcm_flags = cpu_to_le16(lsm->lsm_flags); + lcmv1->lcm_mirror_count = cpu_to_le16(lsm->lsm_mirror_count); + lcmv1->lcm_entry_count = cpu_to_le16(lsm->lsm_entry_count); + + offset = sizeof(*lcmv1) + sizeof(*lcme) * lsm->lsm_entry_count; + + for (entry = 0; entry < lsm->lsm_entry_count; entry++) { + struct lov_stripe_md_entry *lsme; + struct lov_mds_md *lmm; + __u16 stripe_count; + + lsme = lsm->lsm_entries[entry]; + lcme = &lcmv1->lcm_entries[entry]; + + lcme->lcme_id = cpu_to_le32(lsme->lsme_id); + lcme->lcme_flags = cpu_to_le32(lsme->lsme_flags); + if (lsme->lsme_flags & LCME_FL_NOSYNC) + lcme->lcme_timestamp = + cpu_to_le64(lsme->lsme_timestamp); + lcme->lcme_extent.e_start = + cpu_to_le64(lsme->lsme_extent.e_start); + lcme->lcme_extent.e_end = + cpu_to_le64(lsme->lsme_extent.e_end); + lcme->lcme_offset = cpu_to_le32(offset); + + lmm = (struct lov_mds_md *)((char *)lcmv1 + offset); + lmm->lmm_magic = cpu_to_le32(lsme->lsme_magic); + /* lmm->lmm_oi not set */ + lmm->lmm_pattern = cpu_to_le32(lsme->lsme_pattern); + lmm->lmm_stripe_size = cpu_to_le32(lsme->lsme_stripe_size); + lmm->lmm_stripe_count = cpu_to_le16(lsme->lsme_stripe_count); + lmm->lmm_layout_gen = cpu_to_le16(lsme->lsme_layout_gen); + + if (lsme->lsme_magic == LOV_MAGIC_V3) { + struct lov_mds_md_v3 *lmmv3 = + (struct lov_mds_md_v3 *)lmm; + + strlcpy(lmmv3->lmm_pool_name, lsme->lsme_pool_name, + sizeof(lmmv3->lmm_pool_name)); + lmm_objects = lmmv3->lmm_objects; + } else { + lmm_objects = + ((struct lov_mds_md_v1 *)lmm)->lmm_objects; + } + + if (lsme_inited(lsme) && + !(lsme->lsme_pattern & LOV_PATTERN_F_RELEASED)) + stripe_count = lsme->lsme_stripe_count; + else + stripe_count = 0; + + for (i = 0; i < stripe_count; i++) { + struct lov_oinfo *loi = lsme->lsme_oinfo[i]; + + ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi); + lmm_objects[i].l_ost_gen = + cpu_to_le32(loi->loi_ost_gen); + lmm_objects[i].l_ost_idx = + cpu_to_le32(loi->loi_ost_idx); + } + + size = lov_mds_md_size(stripe_count, lsme->lsme_magic); + lcme->lcme_size = cpu_to_le32(size); + offset += size; + } /* for each layout component */ + + RETURN(lmm_size); +} + +/* Find the max stripecount we should use */ +__u16 lov_get_stripe_count(struct lov_obd *lov, __u32 magic, __u16 stripe_count) +{ + __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD; + + if (!stripe_count) + stripe_count 
= lov->desc.ld_default_stripe_count; + if (stripe_count > lov->desc.ld_active_tgt_count) + stripe_count = lov->desc.ld_active_tgt_count; + if (!stripe_count) + stripe_count = 1; + + /* + * stripe count is based on whether ldiskfs can handle + * larger EA sizes + */ + if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE && + lov->lov_ocd.ocd_max_easize) + max_stripes = lov_mds_md_max_stripe_count( + lov->lov_ocd.ocd_max_easize, magic); + + if (stripe_count > max_stripes) + stripe_count = max_stripes; + + return stripe_count; +} + +int lov_free_memmd(struct lov_stripe_md **lsmp) +{ + struct lov_stripe_md *lsm = *lsmp; + int refc; + + *lsmp = NULL; + refc = atomic_dec_return(&lsm->lsm_refc); + LASSERT(refc >= 0); + if (refc == 0) + lsm_free(lsm); + + return refc; +} + +/* + * Unpack LOV object metadata from disk storage. It is packed in LE byte + * order and is opaque to the networking layer. + */ +struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, + size_t buf_size) +{ + const struct lsm_operations *op; + struct lov_stripe_md *lsm; + u32 magic; + + ENTRY; + + if (buf_size < sizeof(magic)) + RETURN(ERR_PTR(-EINVAL)); + + magic = le32_to_cpu(*(u32 *)buf); + op = lsm_op_find(magic); + if (!op) + RETURN(ERR_PTR(-EINVAL)); + + lsm = op->lsm_unpackmd(lov, buf, buf_size); + + RETURN(lsm); +} + +/* + * Retrieve object striping information. + * + * @lump is a pointer to an in-core struct with lmm_ost_count indicating + * the maximum number of OST indices which will fit in the user buffer. + * lmm_magic must be LOV_USER_MAGIC. + * + * If @size > 0, User specified limited buffer size, usually the buffer is from + * ll_lov_setstripe(), and the buffer can only hold basic layout template info. + */ +int lov_getstripe(const struct lu_env *env, struct lov_object *obj, + struct lov_stripe_md *lsm, struct lov_user_md __user *lump, + size_t size) +{ + /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ + struct lov_mds_md *lmmk, *lmm; + struct lov_foreign_md *lfm; + struct lov_user_md_v1 lum; + size_t lmmk_size, lum_size = 0; + ssize_t lmm_size; + int rc = 0; + + ENTRY; + + if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3 && + lsm->lsm_magic != LOV_MAGIC_COMP_V1 && + lsm->lsm_magic != LOV_MAGIC_FOREIGN) { + CERROR("bad LSM MAGIC: 0x%08X != 0x%08X nor 0x%08X\n", + lsm->lsm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3); + GOTO(out, rc = -EIO); + } + + lmmk_size = lov_comp_md_size(lsm); + + OBD_ALLOC_LARGE(lmmk, lmmk_size); + if (!lmmk) + GOTO(out, rc = -ENOMEM); + + lmm_size = lov_lsm_pack(lsm, lmmk, lmmk_size); + if (lmm_size < 0) + GOTO(out_free, rc = lmm_size); + + if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) { + if (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) || + lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { + lustre_swab_lov_mds_md(lmmk); + lustre_swab_lov_user_md_objects( + (struct lov_user_ost_data *)lmmk->lmm_objects, + lmmk->lmm_stripe_count); + } else if (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_COMP_V1)) { + lustre_swab_lov_comp_md_v1( + (struct lov_comp_md_v1 *)lmmk); + } else if (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_FOREIGN)) { + lfm = (struct lov_foreign_md *)lmmk; + __swab32s(&lfm->lfm_magic); + __swab32s(&lfm->lfm_length); + __swab32s(&lfm->lfm_type); + __swab32s(&lfm->lfm_flags); + } + } + + /* + * Legacy appication passes limited buffer, we need to figure out + * the user buffer size by the passed in lmm_stripe_count. 
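The comment above notes that the user buffer size is recovered from the lmm_stripe_count the legacy application passed in. For the two magics accepted here that arithmetic, performed by lov_user_md_size(), amounts to a fixed header plus one lov_user_ost_data_v1 per stripe. The sketch below restates it for orientation only; the real helper covers additional layout magics as well.

/*
 * Illustrative restatement of the size computed by lov_user_md_size()
 * for LOV_USER_MAGIC_V1 / LOV_USER_MAGIC_V3 (sketch, not the real helper).
 */
static size_t legacy_lum_size_sketch(__u16 stripe_count, __u32 magic)
{
	size_t hdr = (magic == LOV_USER_MAGIC_V3) ?
		     sizeof(struct lov_user_md_v3) :
		     sizeof(struct lov_user_md_v1);

	return hdr + (size_t)stripe_count * sizeof(struct lov_user_ost_data_v1);
}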
+ */ + if (lsm->lsm_magic != LOV_MAGIC_FOREIGN) + if (copy_from_user(&lum, lump, sizeof(struct lov_user_md_v1))) + GOTO(out_free, rc = -EFAULT); + + if (lum.lmm_magic == LOV_USER_MAGIC_V1 || + lum.lmm_magic == LOV_USER_MAGIC_V3) + lum_size = lov_user_md_size(lum.lmm_stripe_count, + lum.lmm_magic); + + if (lum_size != 0) { + struct lov_mds_md *comp_md = lmmk; + + /* + * Legacy app (ADIO for instance) treats the layout as V1/V3 + * blindly, we'd return a reasonable V1/V3 for them. + */ + if (lmmk->lmm_magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *comp_v1; + struct cl_object *cl_obj; + struct cl_attr attr; + int i; + + attr.cat_size = 0; + cl_obj = cl_object_top(&obj->lo_cl); + cl_object_attr_lock(cl_obj); + cl_object_attr_get(env, cl_obj, &attr); + cl_object_attr_unlock(cl_obj); + + /* + * return the last instantiated component if file size + * is non-zero, otherwise, return the last component. + */ + comp_v1 = (struct lov_comp_md_v1 *)lmmk; + i = attr.cat_size == 0 ? comp_v1->lcm_entry_count : 0; + for (; i < comp_v1->lcm_entry_count; i++) { + if (!(comp_v1->lcm_entries[i].lcme_flags & + LCME_FL_INIT)) + break; + } + if (i > 0) + i--; + comp_md = (struct lov_mds_md *)((char *)comp_v1 + + comp_v1->lcm_entries[i].lcme_offset); + lum_size = comp_v1->lcm_entries[i].lcme_size; + } + + lmm = comp_md; + lmm_size = min(lum_size, lmmk_size); + } else { + lmm = lmmk; + lmm_size = lmmk_size; + } + + /** + * User specified limited buffer size, usually the buffer is + * from ll_lov_setstripe(), and the buffer can only hold basic + * layout template info. + */ + if (size == 0 || size > lmm_size) + size = lmm_size; + if (copy_to_user(lump, lmm, size)) + GOTO(out_free, rc = -EFAULT); + +out_free: + OBD_FREE_LARGE(lmmk, lmmk_size); +out: + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_page.c b/drivers/staging/lustrefsx/lustre/lov/lov_page.c new file mode 100644 index 0000000000000..887b304e81d6e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_page.c @@ -0,0 +1,197 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_page for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lov page operations. 
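lov_page_init_composite() below maps a file offset to a layout component entry, a stripe within that entry, and an offset inside the stripe sub-object. For a plain RAID0 entry the mapping reduces to simple modular arithmetic; the sketch below shows that arithmetic with 64-bit divisions written naively for readability. The real code goes through lov_stripe_number() and lov_stripe_offset(), which also handle component extents and non-default patterns.

/*
 * RAID0 offset mapping, illustrative only.  Example with
 * stripe_size = 1 MiB, stripe_count = 4, offset = 5 MiB:
 * chunk = 5, stripe = 1, suboff = 1 MiB.
 */
static void raid0_map_sketch(u64 offset, u64 stripe_size, u32 stripe_count,
			     u32 *stripe, u64 *suboff)
{
	u64 chunk = offset / stripe_size;	/* stripe-sized chunk index */

	*stripe = chunk % stripe_count;		/* which sub-object */
	*suboff = (chunk / stripe_count) * stripe_size +
		  offset % stripe_size;		/* offset inside it */
}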
+ * + */ + +static int lov_comp_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct lov_page *lp = cl2lov_page(slice); + + return (*printer)(env, cookie, + LUSTRE_LOV_NAME"-page@%p\n", lp); +} + +static const struct cl_page_operations lov_comp_page_ops = { + .cpo_print = lov_comp_page_print +}; + +int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct lov_object *loo = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + struct cl_object *subobj; + struct cl_object *o; + struct lov_io_sub *sub; + struct lov_page *lpg = cl_object_page_slice(obj, page); + struct lov_layout_raid0 *r0; + loff_t offset; + loff_t suboff; + bool stripe_cached = false; + int entry; + int stripe; + int rc; + + ENTRY; + + /* Direct i/o (CPT_TRANSIENT) is split strictly to stripes, so we can + * cache the stripe information. Buffered i/o is differently + * organized, and stripe calculation isn't a significant cost for + * buffered i/o, so we only cache this for direct i/o. + */ + stripe_cached = lio->lis_cached_entry != LIS_CACHE_ENTRY_NONE && + page->cp_type == CPT_TRANSIENT; + + offset = cl_offset(obj, index); + + if (stripe_cached) { + entry = lio->lis_cached_entry; + stripe = lio->lis_cached_stripe; + /* Offset can never go backwards in an i/o, so this is valid */ + suboff = lio->lis_cached_suboff + offset - lio->lis_cached_off; + } else { + entry = lov_io_layout_at(lio, offset); + + stripe = lov_stripe_number(loo->lo_lsm, entry, offset); + rc = lov_stripe_offset(loo->lo_lsm, entry, offset, stripe, + &suboff); + LASSERT(rc == 0); + lio->lis_cached_entry = entry; + lio->lis_cached_stripe = stripe; + lio->lis_cached_off = offset; + lio->lis_cached_suboff = suboff; + } + + if (entry < 0 || !lsm_entry_inited(loo->lo_lsm, entry)) { + /* non-existing layout component */ + lov_page_init_empty(env, obj, page, index); + RETURN(0); + } + + CDEBUG(D_PAGE, "offset %llu, entry %d, stripe %d, suboff %llu\n", + offset, entry, stripe, suboff); + + page->cp_lov_index = lov_comp_index(entry, stripe); + cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_comp_page_ops); + + if (!stripe_cached) { + sub = lov_sub_get(env, lio, page->cp_lov_index); + if (IS_ERR(sub)) + RETURN(PTR_ERR(sub)); + } else { + sub = lio->lis_cached_sub; + } + + lio->lis_cached_sub = sub; + + r0 = lov_r0(loo, entry); + LASSERT(stripe < r0->lo_nr); + + subobj = lovsub2cl(r0->lo_sub[stripe]); + cl_object_for_each(o, subobj) { + if (o->co_ops->coo_page_init) { + rc = o->co_ops->coo_page_init(sub->sub_env, o, page, + cl_index(subobj, suboff)); + if (rc != 0) + break; + } + } + + RETURN(rc); +} + +static int lov_empty_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct lov_page *lp = cl2lov_page(slice); + + return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p, empty.\n", lp); +} + +static const struct cl_page_operations lov_empty_page_ops = { + .cpo_print = lov_empty_page_print +}; + +int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct lov_page *lpg = cl_object_page_slice(obj, page); + void *addr; + + ENTRY; + + page->cp_lov_index = ~0; + cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_empty_page_ops); + addr = kmap(page->cp_vmpage); + memset(addr, 0, cl_page_size(obj)); + kunmap(page->cp_vmpage); + cl_page_export(env, page, 1); + RETURN(0); +} + +int lov_page_init_foreign(const 
struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + CDEBUG(D_PAGE, DFID" has no data\n", PFID(lu_object_fid(&obj->co_lu))); + RETURN(-ENODATA); +} + +bool lov_page_is_empty(const struct cl_page *page) +{ + const struct cl_page_slice *slice = cl_page_at(page, &lov_device_type); + + LASSERT(slice != NULL); + return slice->cpl_ops == &lov_empty_page_ops; +} + + +/** @} lov */ + diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c new file mode 100644 index 0000000000000..afccd0523c2c9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c @@ -0,0 +1,484 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/lov/lov_pool.c + * + * OST pool methods + * + * Author: Jacques-Charles LAFOUCRIERE + * Author: Alex Lyashkov + * Author: Nathaniel Rutman + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include +#include + +#include +#include "lov_internal.h" + +#define pool_tgt(_p, _i) \ + _p->pool_lobd->u.lov.lov_tgts[_p->pool_obds.op_array[_i]] + +static u32 pool_hashfh(const void *data, u32 len, u32 seed) +{ + const char *pool_name = data; + + return hashlen_hash(cfs_hashlen_string((void *)(unsigned long)seed, + pool_name)); +} + +static int pool_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct pool_desc *pool = obj; + const char *pool_name = arg->key; + + return strcmp(pool_name, pool->pool_name); +} + +static const struct rhashtable_params pools_hash_params = { + .key_len = 1, /* actually variable */ + .key_offset = offsetof(struct pool_desc, pool_name), + .head_offset = offsetof(struct pool_desc, pool_hash), + .hashfn = pool_hashfh, + .obj_cmpfn = pool_cmpfn, + .automatic_shrinking = true, +}; + +static void lov_pool_getref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + atomic_inc(&pool->pool_refcount); +} + +void lov_pool_putref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + if (atomic_dec_and_test(&pool->pool_refcount)) { + LASSERT(list_empty(&pool->pool_list)); + LASSERT(pool->pool_proc_entry == NULL); + lu_tgt_pool_free(&(pool->pool_obds)); + kfree_rcu(pool, pool_rcu); + EXIT; + } +} + +#ifdef CONFIG_PROC_FS +/* + * pool /proc seq_file methods + */ +/* + * iterator is used to go through the target pool entries + * index is the current entry index in the lp_array[] array + * index >= pos returned to the seq_file interface + * pos is from 0 to (pool->pool_obds.op_count - 1) + */ +#define POOL_IT_MAGIC 0xB001CEA0 
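The iterator comment above relies on the standard seq_file contract: the core calls start() once, then alternates show() and next() until next() returns NULL, and always finishes with stop(). A compressed sketch of that driving loop follows; it is a simplification for orientation, not the actual fs/seq_file.c code, and it ignores buffer management entirely.

/* Simplified model of how seq_read() drives pool_proc_{start,next,stop,show}. */
static void seq_walk_sketch(struct seq_file *m,
			    const struct seq_operations *ops)
{
	loff_t pos = 0;
	void *v = ops->start(m, &pos);

	while (v != NULL && !IS_ERR(v)) {
		if (ops->show(m, v) != 0)
			break;
		v = ops->next(m, v, &pos);
	}
	ops->stop(m, v);
}

That contract is why pool_proc_stop() below must tolerate being called when s->private still points at the pool rather than at an allocated iterator.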
+struct pool_iterator { + int magic; + struct pool_desc *pool; + int idx; /* from 0 to pool_tgt_size - 1 */ +}; + +static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + int prev_idx; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X\n", iter->magic); + + (*pos)++; + /* test if end of file */ + if (*pos > pool_tgt_count(iter->pool)) + return NULL; + + /* iterate to find a non empty entry */ + prev_idx = iter->idx; + iter->idx++; + if (iter->idx >= pool_tgt_count(iter->pool)) { + iter->idx = prev_idx; /* we stay on the last entry */ + return NULL; + } + /* return != NULL to continue */ + return iter; +} + +static void *pool_proc_start(struct seq_file *s, loff_t *pos) +{ + struct pool_desc *pool = (struct pool_desc *)s->private; + struct pool_iterator *iter; + + lov_pool_getref(pool); + if ((pool_tgt_count(pool) == 0) || + (*pos >= pool_tgt_count(pool))) { + /* iter is not created, so stop() has no way to + * find pool to dec ref */ + lov_pool_putref(pool); + return NULL; + } + + OBD_ALLOC_PTR(iter); + if (!iter) + return ERR_PTR(-ENOMEM); + iter->magic = POOL_IT_MAGIC; + iter->pool = pool; + iter->idx = 0; + + /* we use seq_file private field to memorized iterator so + * we can free it at stop() */ + /* /!\ do not forget to restore it to pool before freeing it */ + s->private = iter; + down_read(&pool_tgt_rw_sem(pool)); + if (*pos > 0) { + loff_t i; + void *ptr; + + i = 0; + do { + ptr = pool_proc_next(s, &iter, &i); + } while ((i < *pos) && (ptr != NULL)); + return ptr; + } + return iter; +} + +static void pool_proc_stop(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + + /* in some cases stop() method is called 2 times, without + * calling start() method (see seq_read() from fs/seq_file.c) + * we have to free only if s->private is an iterator */ + if ((iter) && (iter->magic == POOL_IT_MAGIC)) { + up_read(&pool_tgt_rw_sem(iter->pool)); + /* we restore s->private so next call to pool_proc_start() + * will work */ + s->private = iter->pool; + lov_pool_putref(iter->pool); + OBD_FREE_PTR(iter); + } +} + +static int pool_proc_show(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)v; + struct lov_tgt_desc *tgt; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X\n", iter->magic); + LASSERT(iter->pool != NULL); + LASSERT(iter->idx <= pool_tgt_count(iter->pool)); + + tgt = pool_tgt(iter->pool, iter->idx); + if (tgt) + seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid))); + + return 0; +} + +static const struct seq_operations pool_proc_ops = { + .start = pool_proc_start, + .next = pool_proc_next, + .stop = pool_proc_stop, + .show = pool_proc_show, +}; + +static int pool_proc_open(struct inode *inode, struct file *file) +{ + int rc; + + rc = seq_open(file, &pool_proc_ops); + if (!rc) { + struct seq_file *s = file->private_data; + s->private = pde_data(inode); + } + return rc; +} + +const static struct proc_ops pool_proc_operations = { + .proc_open = pool_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +static void pools_hash_exit(void *vpool, void *data) +{ + struct pool_desc *pool = vpool; + + lov_pool_putref(pool); +} + +int lov_pool_hash_init(struct rhashtable *tbl) +{ + return rhashtable_init(tbl, &pools_hash_params); +} + +void lov_pool_hash_destroy(struct rhashtable *tbl) +{ + rhashtable_free_and_destroy(tbl, pools_hash_exit, NULL); +} + +int 
lov_pool_new(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *new_pool; + int rc; + ENTRY; + + lov = &(obd->u.lov); + + if (strlen(poolname) > LOV_MAXPOOLNAME) + RETURN(-ENAMETOOLONG); + + /* OBD_ALLOC doesn't work with direct use of kfree_rcu */ + new_pool = kmalloc(sizeof(*new_pool), GFP_KERNEL); + if (new_pool == NULL) + RETURN(-ENOMEM); + + strlcpy(new_pool->pool_name, poolname, sizeof(new_pool->pool_name)); + new_pool->pool_lobd = obd; + /* ref count init to 1 because when created a pool is always used + * up to deletion + */ + atomic_set(&new_pool->pool_refcount, 1); + rc = lu_tgt_pool_init(&new_pool->pool_obds, 0); + if (rc) + GOTO(out_free_pool, rc); + +#ifdef CONFIG_PROC_FS + /* get ref for /proc file */ + lov_pool_getref(new_pool); + new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry, + poolname, new_pool, + &pool_proc_operations); + if (IS_ERR(new_pool->pool_proc_entry)) { + CWARN("Cannot add proc pool entry "LOV_POOLNAMEF"\n", poolname); + new_pool->pool_proc_entry = NULL; + lov_pool_putref(new_pool); + } + CDEBUG(D_INFO, "pool %p - proc %p\n", + new_pool, new_pool->pool_proc_entry); +#endif + + spin_lock(&obd->obd_dev_lock); + list_add_tail(&new_pool->pool_list, &lov->lov_pool_list); + lov->lov_pool_count++; + spin_unlock(&obd->obd_dev_lock); + + /* Add to hash table only when it is fully ready. */ + rc = rhashtable_lookup_insert_fast(&lov->lov_pools_hash_body, + &new_pool->pool_hash, + pools_hash_params); + if (rc) { + if (rc != -EEXIST) + /* + * Hide -E2BIG and -EBUSY which + * are not helpful. + */ + rc = -ENOMEM; + GOTO(out_err, rc); + } + + CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n", + poolname, lov->lov_pool_count); + + RETURN(0); + +out_err: + spin_lock(&obd->obd_dev_lock); + list_del_init(&new_pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + lprocfs_remove(&new_pool->pool_proc_entry); + lu_tgt_pool_free(&new_pool->pool_obds); +out_free_pool: + OBD_FREE_PTR(new_pool); + + return rc; +} + +struct pool_desc *lov_pool_find(struct obd_device *obd, char *poolname) +{ + struct pool_desc *pool; + struct lov_obd *lov = &obd->u.lov; + + rcu_read_lock(); + pool = rhashtable_lookup(&lov->lov_pools_hash_body, + poolname, + pools_hash_params); + if (pool && !atomic_inc_not_zero(&pool->pool_refcount)) + pool = NULL; + rcu_read_unlock(); + + return pool; +} + +int lov_pool_del(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *pool; + ENTRY; + + lov = &(obd->u.lov); + + /* lookup and kill hash reference */ + rcu_read_lock(); + pool = rhashtable_lookup(&lov->lov_pools_hash_body, poolname, + pools_hash_params); + if (pool && rhashtable_remove_fast(&lov->lov_pools_hash_body, + &pool->pool_hash, + pools_hash_params) != 0) + pool = NULL; + rcu_read_unlock(); + if (!pool) + RETURN(-ENOENT); + + if (pool->pool_proc_entry != NULL) { + CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry); + lprocfs_remove(&pool->pool_proc_entry); + lov_pool_putref(pool); + } + + spin_lock(&obd->obd_dev_lock); + list_del_init(&pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + + /* release last reference */ + lov_pool_putref(pool); + + RETURN(0); +} + + +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int lov_idx; + int rc; + ENTRY; + + lov = &(obd->u.lov); + + rcu_read_lock(); + pool = rhashtable_lookup(&lov->lov_pools_hash_body, poolname, 
+ pools_hash_params); + if (pool && !atomic_inc_not_zero(&pool->pool_refcount)) + pool = NULL; + rcu_read_unlock(); + if (!pool) + RETURN(-ENOENT); + + obd_str2uuid(&ost_uuid, ostname); + + + /* search ost in lov array */ + lov_tgts_getref(obd); + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) + continue; + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) + break; + } + /* test if ost found in lov */ + if (lov_idx == lov->desc.ld_tgt_count) + GOTO(out, rc = -EINVAL); + + rc = lu_tgt_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size); + if (rc) + GOTO(out, rc); + + CDEBUG(D_CONFIG, "Added %s to "LOV_POOLNAMEF" as member %d\n", + ostname, poolname, pool_tgt_count(pool)); + + EXIT; +out: + lov_tgts_putref(obd); + lov_pool_putref(pool); + + return rc; +} + +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int lov_idx; + int rc = 0; + ENTRY; + + lov = &(obd->u.lov); + + /* lookup and kill hash reference */ + rcu_read_lock(); + pool = rhashtable_lookup(&lov->lov_pools_hash_body, poolname, + pools_hash_params); + if (pool && !atomic_inc_not_zero(&pool->pool_refcount)) + pool = NULL; + rcu_read_unlock(); + if (!pool) + RETURN(-ENOENT); + + obd_str2uuid(&ost_uuid, ostname); + + lov_tgts_getref(obd); + /* search ost in lov array, to get index */ + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) + continue; + + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) + break; + } + + /* test if ost found in lov */ + if (lov_idx == lov->desc.ld_tgt_count) + GOTO(out, rc = -EINVAL); + + lu_tgt_pool_remove(&pool->pool_obds, lov_idx); + + CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname, + poolname); + + EXIT; +out: + lov_tgts_putref(obd); + lov_pool_putref(pool); + + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_request.c b/drivers/staging/lustrefsx/lustre/lov/lov_request.c new file mode 100644 index 0000000000000..4994011a7895b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_request.c @@ -0,0 +1,392 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
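The pool lookups above (lov_pool_find(), lov_pool_del(), lov_pool_add()/lov_pool_remove()) all use the same RCU idiom: find the pool under rcu_read_lock() and only keep it if atomic_inc_not_zero() can still take a reference, because a pool whose last reference has been dropped may remain visible to concurrent readers until kfree_rcu() runs. A typical caller of the find/putref pair is sketched below; the function name and the work done under the reference are hypothetical.

/*
 * Illustrative caller of the pool reference helpers above.
 */
static int pool_use_sketch(struct obd_device *obd, char *poolname)
{
	struct pool_desc *pool;

	pool = lov_pool_find(obd, poolname);	/* NULL or +1 reference */
	if (pool == NULL)
		return -ENOENT;

	/* ... read pool->pool_obds while holding the reference ... */

	lov_pool_putref(pool);			/* last ref frees via kfree_rcu */
	return 0;
}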
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include + +#include +#include "lov_internal.h" + +static void lov_init_set(struct lov_request_set *set) +{ + set->set_count = 0; + atomic_set(&set->set_completes, 0); + atomic_set(&set->set_success, 0); + INIT_LIST_HEAD(&set->set_list); +} + +static void lov_finish_set(struct lov_request_set *set) +{ + struct list_head *pos, *n; + struct lov_request *req; + + ENTRY; + + LASSERT(set != NULL); + list_for_each_safe(pos, n, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + list_del_init(&req->rq_link); + + if (req->rq_oi.oi_osfs) + OBD_FREE_PTR(req->rq_oi.oi_osfs); + + OBD_FREE_PTR(req); + } + + OBD_FREE_PTR(set); + EXIT; +} + +static void +lov_update_set(struct lov_request_set *set, struct lov_request *req, int rc) +{ + atomic_inc(&set->set_completes); + if (rc == 0) + atomic_inc(&set->set_success); +} + +static void +lov_set_add_req(struct lov_request *req, struct lov_request_set *set) +{ + list_add_tail(&req->rq_link, &set->set_list); + set->set_count++; + req->rq_rqset = set; +} + +static int lov_check_set(struct lov_obd *lov, int idx) +{ + int rc = 0; + + mutex_lock(&lov->lov_lock); + + if (!lov->lov_tgts[idx] || lov->lov_tgts[idx]->ltd_active || + (lov->lov_tgts[idx]->ltd_exp && + class_exp2cliimp(lov->lov_tgts[idx]->ltd_exp)->imp_connect_tried)) + rc = 1; + + mutex_unlock(&lov->lov_lock); + return rc; +} + +/* + * Check if the OSC connection exists and is active. + * If the OSC has not yet had a chance to connect to the OST the first time, + * wait once for it to connect instead of returning an error. + */ +static int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx) +{ + struct lov_tgt_desc *tgt; + struct obd_import *imp = NULL; + int rc = 0; + int cnt; + + mutex_lock(&lov->lov_lock); + + tgt = lov->lov_tgts[ost_idx]; + + if (unlikely(!tgt)) + GOTO(out, rc = 0); + + if (likely(tgt->ltd_active)) + GOTO(out, rc = 1); + + if (tgt->ltd_exp) + imp = class_exp2cliimp(tgt->ltd_exp); + if (imp && imp->imp_connect_tried) + GOTO(out, rc = 0); + if (imp && imp->imp_state == LUSTRE_IMP_IDLE) + GOTO(out, rc = 0); + + mutex_unlock(&lov->lov_lock); + + cnt = obd_timeout; + while (cnt > 0 && + !lov_check_set(lov, ost_idx)) { + ssleep(1); + cnt -= 1; + } + if (tgt->ltd_active) + return 1; + + return 0; + +out: + mutex_unlock(&lov->lov_lock); + return rc; +} + +#define LOV_U64_MAX ((__u64)~0ULL) +#define LOV_SUM_MAX(tot, add) \ + do { \ + if ((tot) + (add) < (tot)) \ + (tot) = LOV_U64_MAX; \ + else \ + (tot) += (add); \ + } while (0) + +static int +lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, int success) +{ + ENTRY; + + if (success) { + __u32 expected_stripes = lov_get_stripe_count(&obd->u.lov, + LOV_MAGIC, 0); + if (osfs->os_files != LOV_U64_MAX) + lov_do_div64(osfs->os_files, expected_stripes); + if (osfs->os_ffree != LOV_U64_MAX) + lov_do_div64(osfs->os_ffree, expected_stripes); + + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(*osfs)); + obd->obd_osfs_age = ktime_get_seconds(); + spin_unlock(&obd->obd_osfs_lock); + RETURN(0); + } + + RETURN(-EIO); +} + +int lov_fini_statfs_set(struct lov_request_set *set) +{ + int rc = 0; + ENTRY; + + if (!set) + RETURN(0); + + if (atomic_read(&set->set_completes)) { + rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs, + atomic_read(&set->set_success)); + } + + lov_finish_set(set); + + RETURN(rc); +} + +static void +lov_update_statfs(struct obd_statfs *osfs, struct 
obd_statfs *lov_sfs, + int success) +{ + int shift = 0, quit = 0; + __u64 tmp; + + if (success == 0) { + memcpy(osfs, lov_sfs, sizeof(*lov_sfs)); + } else { + if (osfs->os_bsize != lov_sfs->os_bsize) { + /* assume all block sizes are always powers of 2 */ + /* get the bits difference */ + tmp = osfs->os_bsize | lov_sfs->os_bsize; + for (shift = 0; shift <= 64; ++shift) { + if (tmp & 1) { + if (quit) + break; + quit = 1; + shift = 0; + } + tmp >>= 1; + } + } + + if (osfs->os_bsize < lov_sfs->os_bsize) { + osfs->os_bsize = lov_sfs->os_bsize; + + osfs->os_bfree >>= shift; + osfs->os_bavail >>= shift; + osfs->os_blocks >>= shift; + } else if (shift != 0) { + lov_sfs->os_bfree >>= shift; + lov_sfs->os_bavail >>= shift; + lov_sfs->os_blocks >>= shift; + } +#ifdef MIN_DF + /* + * Sandia requested that df (and so, statfs) only + * returned minimal available space on + * a single OST, so people would be able to + * write this much data guaranteed. + */ + if (osfs->os_bavail > lov_sfs->os_bavail) { + /* + * Presumably if new bavail is smaller, + * new bfree is bigger as well + */ + osfs->os_bfree = lov_sfs->os_bfree; + osfs->os_bavail = lov_sfs->os_bavail; + } +#else + osfs->os_bfree += lov_sfs->os_bfree; + osfs->os_bavail += lov_sfs->os_bavail; +#endif + osfs->os_blocks += lov_sfs->os_blocks; + /* + * XXX not sure about this one - depends on policy. + * - could be minimum if we always stripe on all OBDs + * (but that would be wrong for any other policy, + * if one of the OBDs has no more objects left) + * - could be sum if we stripe whole objects + * - could be average, just to give a nice number + * + * To give a "reasonable" (if not wholly accurate) + * number, we divide the total number of free objects + * by expected stripe count (watch out for overflow). + */ + LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files); + LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree); + } +} + +/* + * The callback for osc_statfs_async that finilizes a request info when a + * response is received. + */ +static int cb_statfs_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + struct lov_request_set *set; + struct obd_statfs *osfs, *lov_sfs; + struct lov_obd *lov; + struct lov_tgt_desc *tgt; + struct obd_device *lovobd, *tgtobd; + int success; + + ENTRY; + + lovreq = container_of(oinfo, struct lov_request, rq_oi); + set = lovreq->rq_rqset; + lovobd = set->set_obd; + lov = &lovobd->u.lov; + osfs = set->set_oi->oi_osfs; + lov_sfs = oinfo->oi_osfs; + success = atomic_read(&set->set_success); + /* + * XXX: the same is done in lov_update_common_set, however + * lovset->set_exp is not initialized. 
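Two details of the statfs aggregation above are worth spelling out. First, LOV_SUM_MAX() is a saturating add, so per-OST object counts that would overflow simply pin at LOV_U64_MAX before lov_fini_statfs() divides them by the expected stripe count. Second, the shift loop in lov_update_statfs() computes the bit distance between two power-of-two block sizes so the smaller-block counters can be rescaled: merging a 4096-byte (2^12) and a 65536-byte (2^16) OST gives shift = 4, and the 4 KiB counts are shifted right by 4 so everything is expressed in 64 KiB blocks. A more direct restatement of that computation, as a hedged sketch:

/* Equivalent of the shift loop, assuming both sizes are powers of two. */
static inline int bsize_shift_sketch(u32 bsize_a, u32 bsize_b)
{
	return abs(ilog2(bsize_a) - ilog2(bsize_b));
}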
+ */ + lov_update_set(set, lovreq, rc); + if (rc) + GOTO(out, rc); + + lov_tgts_getref(lovobd); + tgt = lov->lov_tgts[lovreq->rq_idx]; + if (!tgt || !tgt->ltd_active) + GOTO(out_update, rc); + + tgtobd = class_exp2obd(tgt->ltd_exp); + spin_lock(&tgtobd->obd_osfs_lock); + memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs)); + if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0) + tgtobd->obd_osfs_age = ktime_get_seconds(); + spin_unlock(&tgtobd->obd_osfs_lock); + +out_update: + lov_update_statfs(osfs, lov_sfs, success); + lov_tgts_putref(lovobd); +out: + RETURN(0); +} + +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &obd->u.lov; + int rc = 0, i; + + ENTRY; + + OBD_ALLOC(set, sizeof(*set)); + if (!set) + RETURN(-ENOMEM); + lov_init_set(set); + + set->set_obd = obd; + set->set_oi = oinfo; + + /* We only get block data from the OBD */ + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + struct lov_tgt_desc *ltd = lov->lov_tgts[i]; + struct lov_request *req; + + if (!ltd) { + CDEBUG(D_HA, "lov idx %d inactive\n", i); + continue; + } + + /* + * skip targets that have been explicitely disabled by the + * administrator + */ + if (!ltd->ltd_exp) { + CDEBUG(D_HA, "lov idx %d administratively disabled\n", + i); + continue; + } + + if (oinfo->oi_flags & OBD_STATFS_NODELAY && + class_exp2cliimp(ltd->ltd_exp)->imp_state != + LUSTRE_IMP_IDLE && !ltd->ltd_active) { + CDEBUG(D_HA, "lov idx %d inactive\n", i); + continue; + } + + if (!ltd->ltd_active) + lov_check_and_wait_active(lov, i); + + OBD_ALLOC(req, sizeof(*req)); + if (!req) + GOTO(out_set, rc = -ENOMEM); + + OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs)); + if (!req->rq_oi.oi_osfs) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + + req->rq_idx = i; + req->rq_oi.oi_cb_up = cb_statfs_update; + req->rq_oi.oi_flags = oinfo->oi_flags; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); +out_set: + lov_fini_statfs_set(set); + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c new file mode 100644 index 0000000000000..4f2640bc7c530 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c @@ -0,0 +1,145 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_device and cl_device_type for LOVSUB layer. 
+ * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lov-sub device and device type functions. + * + */ + +static int lovsub_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device_type *ldt; + int rc; + + ENTRY; + next->ld_site = d->ld_site; + ldt = next->ld_type; + LASSERT(ldt != NULL); + rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); + if (rc) { + next->ld_site = NULL; + RETURN(rc); + } + + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + lsd->acid_next = lu2cl_dev(next); + RETURN(rc); +} + +static struct lu_device *lovsub_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct lu_device *next; + struct lovsub_device *lsd; + + ENTRY; + lsd = lu2lovsub_dev(d); + next = cl2lu_dev(lsd->acid_next); + lsd->acid_next = NULL; + RETURN(next); +} + +static struct lu_device *lovsub_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device *next = cl2lu_dev(lsd->acid_next); + + lu_site_print(env, d->ld_site, &d->ld_ref, D_ERROR, lu_cdebug_printer); + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(lsd); + return next; +} + +static const struct lu_device_operations lovsub_lu_ops = { + .ldo_object_alloc = lovsub_object_alloc, + .ldo_process_config = NULL, + .ldo_recovery_complete = NULL +}; + +static struct lu_device *lovsub_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct lovsub_device *lsd; + + OBD_ALLOC_PTR(lsd); + if (lsd) { + int result; + + result = cl_device_init(&lsd->acid_cl, t); + if (result == 0) { + d = lovsub2lu_dev(lsd); + d->ld_ops = &lovsub_lu_ops; + } else + d = ERR_PTR(result); + } else + d = ERR_PTR(-ENOMEM); + return d; +} + +static const struct lu_device_type_operations lovsub_device_type_ops = { + .ldto_device_alloc = lovsub_device_alloc, + .ldto_device_free = lovsub_device_free, + + .ldto_device_init = lovsub_device_init, + .ldto_device_fini = lovsub_device_fini +}; + +#define LUSTRE_LOVSUB_NAME "lovsub" + +struct lu_device_type lovsub_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOVSUB_NAME, + .ldt_ops = &lovsub_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + + +/** @} lov */ + diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c new file mode 100644 index 0000000000000..cd239733270ef --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c @@ -0,0 +1,202 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_object for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub object operations. + * + */ + +static int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); + struct lu_object *below; + struct lu_device *under; + int result; + + ENTRY; + under = &dev->acid_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below) { + lu_object_add(obj, below); + cl_object_page_init(lu2cl(obj), 0); + result = 0; + } else + result = -ENOMEM; + RETURN(result); + +} + +static void lovsub_object_free_rcu(struct rcu_head *head) +{ + struct lovsub_object *los = container_of(head, struct lovsub_object, + lso_header.coh_lu.loh_rcu); + + kmem_cache_free(lovsub_object_kmem, los); +} + +static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + struct lov_object *lov = los->lso_super; + + ENTRY; + + /* + * We can't assume lov was assigned here, because of the shadow + * object handling in lu_object_find. + */ + if (lov) { + int index = lov_comp_entry(los->lso_index); + int stripe = lov_comp_stripe(los->lso_index); + struct lov_layout_raid0 *r0 = lov_r0(lov, index); + + LASSERT(lov->lo_type == LLT_COMP); + LASSERT(r0->lo_sub[stripe] == los); + spin_lock(&r0->lo_sub_lock); + r0->lo_sub[stripe] = NULL; + spin_unlock(&r0->lo_sub_lock); + } + + lu_object_fini(obj); + lu_object_header_fini(&los->lso_header.coh_lu); + OBD_FREE_PRE(los, sizeof(*los), "slab-freed"); + call_rcu(&los->lso_header.coh_lu.loh_rcu, lovsub_object_free_rcu); + EXIT; +} + +static int lovsub_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + + return (*p)(env, cookie, "[%d]", los->lso_index); +} + +static int lovsub_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct lovsub_object *los = cl2lovsub(obj); + struct lov_object *lov = cl2lovsub(obj)->lso_super; + + ENTRY; + lov_r0(lov, lov_comp_entry(los->lso_index))->lo_attr_valid = 0; + RETURN(0); +} + +static int lovsub_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lovsub_object *los = cl2lovsub(obj); + + ENTRY; + RETURN(cl_object_glimpse(env, &los->lso_super->lo_cl, lvb)); +} + +/** + * Implementation of struct cl_object_operations::coo_req_attr_set() for lovsub + * layer. Lov and lovsub are responsible only for struct obdo::o_stripe_idx + * field, which is filled there. 
+ */ +static void lovsub_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct lovsub_object *subobj = cl2lovsub(obj); + struct lov_stripe_md *lsm = subobj->lso_super->lo_lsm; + + ENTRY; + cl_req_attr_set(env, &subobj->lso_super->lo_cl, attr); + + /* + * There is no OBD_MD_* flag for obdo::o_stripe_idx, so set it + * unconditionally. It never changes anyway. + */ + attr->cra_oa->o_stripe_idx = lov_comp_stripe(subobj->lso_index); + lov_lsm2layout(lsm, lsm->lsm_entries[lov_comp_entry(subobj->lso_index)], + &attr->cra_oa->o_layout); + attr->cra_oa->o_valid |= OBD_MD_FLOSTLAYOUT; + EXIT; +} + +static const struct cl_object_operations lovsub_ops = { + .coo_attr_update = lovsub_attr_update, + .coo_glimpse = lovsub_object_glimpse, + .coo_req_attr_set = lovsub_req_attr_set +}; + +static const struct lu_object_operations lovsub_lu_obj_ops = { + .loo_object_init = lovsub_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = lovsub_object_free, + .loo_object_print = lovsub_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct lovsub_object *los; + struct lu_object *obj; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(los, lovsub_object_kmem, GFP_NOFS); + if (los) { + struct cl_object_header *hdr; + + obj = lovsub2lu(los); + hdr = &los->lso_header; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + los->lso_cl.co_ops = &lovsub_ops; + obj->lo_ops = &lovsub_lu_obj_ops; + } else + obj = NULL; + RETURN(obj); +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c new file mode 100644 index 0000000000000..ac7358100a3e4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c @@ -0,0 +1,310 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
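Every tunable in the lproc_lov.c file that follows uses the same shape: a *_show()/*_store() pair that recovers the obd_device from the embedded kobject via container_of(), a LUSTRE_RW_ATTR() (or LUSTRE_RO_ATTR()) declaration, and an entry in the lov_attrs[] table near the end of the file. The sketch below condenses that pattern; the attribute name "example" is hypothetical, desc.ld_pattern is reused purely as a stand-in field, and real code validates the value before storing it.

/* Hypothetical attribute, shown only to illustrate the pattern used below. */
static ssize_t example_show(struct kobject *kobj, struct attribute *attr,
			    char *buf)
{
	struct obd_device *obd = container_of(kobj, struct obd_device,
					      obd_kset.kobj);

	return scnprintf(buf, PAGE_SIZE, "%u\n", obd->u.lov.desc.ld_pattern);
}

static ssize_t example_store(struct kobject *kobj, struct attribute *attr,
			     const char *buffer, size_t count)
{
	struct obd_device *obd = container_of(kobj, struct obd_device,
					      obd_kset.kobj);
	u32 val;
	int rc;

	rc = kstrtouint(buffer, 0, &val);
	if (rc)
		return rc;

	obd->u.lov.desc.ld_pattern = val;	/* real code validates first */
	return count;
}
LUSTRE_RW_ATTR(example);

A real attribute would also be listed in lov_attrs[] so that lov_tunables_init() registers it with the kobject type.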
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include "lov_internal.h" + +static ssize_t stripesize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &obd->u.lov.desc; + + return scnprintf(buf, PAGE_SIZE, "%llu\n", desc->ld_default_stripe_size); +} + +static ssize_t stripesize_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &obd->u.lov.desc; + u64 val; + int rc; + + rc = sysfs_memparse(buf, count, &val, "B"); + if (rc < 0) + return rc; + + lov_fix_desc_stripe_size(&val); + desc->ld_default_stripe_size = val; + + return count; +} +LUSTRE_RW_ATTR(stripesize); + +static ssize_t stripeoffset_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &obd->u.lov.desc; + + return sprintf(buf, "%lld\n", desc->ld_default_stripe_offset); +} + +static ssize_t stripeoffset_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &obd->u.lov.desc; + long val; + int rc; + + rc = kstrtol(buf, 0, &val); + if (rc) + return rc; + if (val < -1 || val > LOV_MAX_STRIPE_COUNT) + return -ERANGE; + + desc->ld_default_stripe_offset = val; + + return count; +} +LUSTRE_RW_ATTR(stripeoffset); + +static ssize_t stripetype_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &obd->u.lov.desc; + + return sprintf(buf, "%u\n", desc->ld_pattern); +} + +static ssize_t stripetype_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &obd->u.lov.desc; + u32 pattern; + int rc; + + rc = kstrtouint(buffer, 0, &pattern); + if (rc) + return rc; + + lov_fix_desc_pattern(&pattern); + desc->ld_pattern = pattern; + + return count; +} +LUSTRE_RW_ATTR(stripetype); + +static ssize_t stripecount_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &obd->u.lov.desc; + + return sprintf(buf, "%d\n", + (__s16)(desc->ld_default_stripe_count + 1) - 1); +} + +static ssize_t stripecount_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &obd->u.lov.desc; + int stripe_count; + int rc; + + rc = kstrtoint(buffer, 0, &stripe_count); + if (rc) + return rc; + + if (stripe_count < -1) + return -ERANGE; + + lov_fix_desc_stripe_count(&stripe_count); + desc->ld_default_stripe_count = stripe_count; + + return count; +} +LUSTRE_RW_ATTR(stripecount); + +static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &obd->u.lov.desc; + + return sprintf(buf, "%u\n", 
desc->ld_tgt_count); +} +LUSTRE_RO_ATTR(numobd); + +static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &obd->u.lov.desc; + + return sprintf(buf, "%u\n", desc->ld_active_tgt_count); +} +LUSTRE_RO_ATTR(activeobd); + +static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &obd->u.lov.desc; + + return sprintf(buf, "%s\n", desc->ld_uuid.uuid); +} +LUSTRE_RO_ATTR(desc_uuid); + +#ifdef CONFIG_PROC_FS +static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_device *obd = p->private; + struct lov_obd *lov = &obd->u.lov; + + while (*pos < lov->desc.ld_tgt_count) { + if (lov->lov_tgts[*pos]) + return lov->lov_tgts[*pos]; + ++*pos; + } + return NULL; +} + +static void lov_tgt_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_device *obd = p->private; + struct lov_obd *lov = &obd->u.lov; + + while (++*pos < lov->desc.ld_tgt_count) { + if (lov->lov_tgts[*pos]) + return lov->lov_tgts[*pos]; + } + return NULL; +} + +static int lov_tgt_seq_show(struct seq_file *p, void *v) +{ + struct lov_tgt_desc *tgt = v; + + seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_index, + obd_uuid2str(&tgt->ltd_uuid), + tgt->ltd_active ? "" : "IN"); + return 0; +} + +static const struct seq_operations lov_tgt_sops = { + .start = lov_tgt_seq_start, + .stop = lov_tgt_seq_stop, + .next = lov_tgt_seq_next, + .show = lov_tgt_seq_show, +}; + +static int lov_target_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lov_tgt_sops); + if (rc) + return rc; + + seq = file->private_data; + seq->private = pde_data(inode); + return 0; +} + +static const struct proc_ops lov_proc_target_fops = { + PROC_OWNER(THIS_MODULE) + .proc_open = lov_target_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +static struct attribute *lov_attrs[] = { + &lustre_attr_activeobd.attr, + &lustre_attr_numobd.attr, + &lustre_attr_desc_uuid.attr, + &lustre_attr_stripesize.attr, + &lustre_attr_stripeoffset.attr, + &lustre_attr_stripetype.attr, + &lustre_attr_stripecount.attr, + NULL, +}; + +KOBJ_ATTRIBUTE_GROUPS(lov); /* creates lov_groups */ + +int lov_tunables_init(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + int rc; + + obd->obd_ktype.default_groups = KOBJ_ATTR_GROUPS(lov); + rc = lprocfs_obd_setup(obd, false); + if (rc) + GOTO(out, rc); + +#ifdef CONFIG_PROC_FS + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", 0444, + &lov_proc_target_fops, obd); + if (rc) + CWARN("%s: Error adding the target_obd file : rc %d\n", + obd->obd_name, rc); + + lov->lov_pool_proc_entry = lprocfs_register("pools", + obd->obd_proc_entry, + NULL, NULL); + if (IS_ERR(lov->lov_pool_proc_entry)) { + rc = PTR_ERR(lov->lov_pool_proc_entry); + CERROR("%s: error setting up debugfs for pools : rc %d\n", + obd->obd_name, rc); + lov->lov_pool_proc_entry = NULL; + } +#endif /* CONFIG_FS_PROC */ +out: + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/Makefile b/drivers/staging/lustrefsx/lustre/mdc/Makefile new file mode 100644 index 0000000000000..5e997efd3b33a --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/mdc/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_LUSTREFSX_FS) += mdc.o + +mdc-y := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o +mdc-y += mdc_changelog.o mdc_dev.o +mdc-y += mdc_acl.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c new file mode 100644 index 0000000000000..91d9aade97d96 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c @@ -0,0 +1,795 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include "mdc_internal.h" + +static ssize_t active_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + ssize_t len; + + with_imp_locked(obd, imp, len) + len = sprintf(buf, "%d\n", !imp->imp_deactive); + return len; +} + +static ssize_t active_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp, *imp0; + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + with_imp_locked(obd, imp0, rc) + imp = class_import_get(imp0); + if (rc) + return rc; + /* opposite senses */ + if (imp->imp_deactive == val) + rc = ptlrpc_set_import_active(imp, val); + else + CDEBUG(D_CONFIG, "activate %u: ignoring repeat request\n", + val); + class_import_put(imp); + return rc ?: count; +} +LUSTRE_RW_ATTR(active); + +static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + ssize_t len; + u32 max; + + max = obd_get_max_rpcs_in_flight(&obd->u.cli); + len = sprintf(buf, "%u\n", max); + + return len; +} + +static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + rc = obd_set_max_rpcs_in_flight(&obd->u.cli, val); + if (rc) + count = rc; + + return count; +} +LUSTRE_RW_ATTR(max_rpcs_in_flight); + +static ssize_t max_mod_rpcs_in_flight_show(struct kobject *kobj, + struct 
attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + u16 max; + + max = obd_get_max_mod_rpcs_in_flight(&obd->u.cli); + return sprintf(buf, "%hu\n", max); +} + +static ssize_t max_mod_rpcs_in_flight_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + u16 val; + int rc; + + rc = kstrtou16(buffer, 10, &val); + if (rc) + return rc; + + rc = obd_set_max_mod_rpcs_in_flight(&obd->u.cli, val); + if (rc) + count = rc; + + return count; +} +LUSTRE_RW_ATTR(max_mod_rpcs_in_flight); + +static int mdc_max_dirty_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + + seq_printf(m, "%lu\n", PAGES_TO_MiB(cli->cl_dirty_max_pages)); + return 0; +} + +static ssize_t mdc_max_dirty_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *sfl = file->private_data; + struct obd_device *obd = sfl->private; + struct client_obd *cli = &obd->u.cli; + char kernbuf[22] = ""; + u64 pages_number; + int rc; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + rc = sysfs_memparse(kernbuf, count, &pages_number, "MiB"); + if (rc < 0) + return rc; + + /* MB -> pages */ + pages_number = round_up(pages_number, 1024 * 1024) >> PAGE_SHIFT; + if (pages_number <= 0 || + pages_number >= MiB_TO_PAGES(OSC_MAX_DIRTY_MB_MAX) || + pages_number > cfs_totalram_pages() / 4) /* 1/4 of RAM */ + return -ERANGE; + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_dirty_max_pages = pages_number; + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + + return count; +} +LPROC_SEQ_FOPS(mdc_max_dirty_mb); + +DECLARE_CKSUM_NAME; + +static int mdc_checksum_type_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + int i; + + if (obd == NULL) + return 0; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if ((BIT(i) & obd->u.cli.cl_supp_cksum_types) == 0) + continue; + if (obd->u.cli.cl_cksum_type == BIT(i)) + seq_printf(m, "[%s] ", cksum_name[i]); + else + seq_printf(m, "%s ", cksum_name[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +static ssize_t mdc_checksum_type_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + char kernbuf[10]; + int rc = -EINVAL; + int i; + + if (obd == NULL) + return 0; + + if (count > sizeof(kernbuf) - 1) + return -EINVAL; + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + if (count > 0 && kernbuf[count - 1] == '\n') + kernbuf[count - 1] = '\0'; + else + kernbuf[count] = '\0'; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if (strcasecmp(kernbuf, cksum_name[i]) == 0) { + obd->u.cli.cl_preferred_cksum_type = BIT(i); + if (obd->u.cli.cl_supp_cksum_types & BIT(i)) { + obd->u.cli.cl_cksum_type = BIT(i); + rc = count; + } else { + rc = -ENOTSUPP; + } + break; + } + } + + return rc; +} +LPROC_SEQ_FOPS(mdc_checksum_type); + +static ssize_t checksums_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", !!obd->u.cli.cl_checksum); +} + +static ssize_t checksums_store(struct kobject *kobj, + struct 
attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum = val; + + return count; +} +LUSTRE_RW_ATTR(checksums); + +static ssize_t checksum_dump_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", !!obd->u.cli.cl_checksum_dump); +} + +static ssize_t checksum_dump_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum_dump = val; + + return count; +} +LUSTRE_RW_ATTR(checksum_dump); + +LUSTRE_ATTR(mds_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); + +static int mdc_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + int shift = 20 - PAGE_SHIFT; + + seq_printf(m, "used_mb: %ld\n" + "busy_cnt: %ld\n" + "reclaim: %llu\n", + (atomic_long_read(&cli->cl_lru_in_list) + + atomic_long_read(&cli->cl_lru_busy)) >> shift, + atomic_long_read(&cli->cl_lru_busy), + cli->cl_lru_reclaim); + + return 0; +} + +/* shrink the number of caching pages to a specific number */ +static ssize_t +mdc_cached_mb_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *sfl = file->private_data; + struct obd_device *obd = sfl->private; + struct client_obd *cli = &obd->u.cli; + u64 pages_number; + const char *tmp; + long rc; + char kernbuf[128]; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + tmp = lprocfs_find_named_value(kernbuf, "used_mb:", &count); + rc = sysfs_memparse(tmp, count, &pages_number, "MiB"); + if (rc < 0) + return rc; + + pages_number >>= PAGE_SHIFT; + + rc = atomic_long_read(&cli->cl_lru_in_list) - pages_number; + if (rc > 0) { + struct lu_env *env; + __u16 refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + (void)osc_lru_shrink(env, cli, rc, true); + cl_env_put(env, &refcheck); + } + } + + return count; +} +LPROC_SEQ_FOPS(mdc_cached_mb); + +static int mdc_unstable_stats_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + long pages; + int mb; + + pages = atomic_long_read(&cli->cl_unstable_count); + mb = (pages * PAGE_SIZE) >> 20; + + seq_printf(m, "unstable_pages: %20ld\n" + "unstable_mb: %10d\n", pages, mb); + return 0; +} +LPROC_SEQ_FOPS_RO(mdc_unstable_stats); + +static ssize_t mdc_rpc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *obd = seq->private; + struct client_obd *cli = &obd->u.cli; + + lprocfs_oh_clear(&cli->cl_mod_rpcs_hist); + + lprocfs_oh_clear(&cli->cl_read_rpc_hist); + lprocfs_oh_clear(&cli->cl_write_rpc_hist); + lprocfs_oh_clear(&cli->cl_read_page_hist); + lprocfs_oh_clear(&cli->cl_write_page_hist); + lprocfs_oh_clear(&cli->cl_read_offset_hist); + lprocfs_oh_clear(&cli->cl_write_offset_hist); + cli->cl_mod_rpcs_init = ktime_get_real(); + + return len; +} + +static int 
mdc_rpc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *obd = seq->private; + struct client_obd *cli = &obd->u.cli; + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + int i; + + obd_mod_rpc_stats_seq_show(cli, seq); + + spin_lock(&cli->cl_loi_list_lock); + + seq_printf(seq, "\nread RPCs in flight: %d\n", + cli->cl_r_in_flight); + seq_printf(seq, "write RPCs in flight: %d\n", + cli->cl_w_in_flight); + seq_printf(seq, "pending write pages: %d\n", + atomic_read(&cli->cl_pending_w_pages)); + seq_printf(seq, "pending read pages: %d\n", + atomic_read(&cli->cl_pending_r_pages)); + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "pages per rpc rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + 1 << i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "rpcs in flight rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); + + read_cum = 0; + write_cum = 0; + for (i = 1; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + i, r, pct(r, read_tot), pct(read_cum, read_tot), w, + pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "offset rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + (i == 0) ? 
0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + spin_unlock(&cli->cl_loi_list_lock); + + return 0; +} +LPROC_SEQ_FOPS(mdc_rpc_stats); + +static int mdc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *obd = seq->private; + struct osc_stats *stats = &obd2osc_dev(obd)->osc_stats; + + lprocfs_stats_header(seq, ktime_get_real(), stats->os_init, 25, ":", + true, ""); + seq_printf(seq, "lockless_write_bytes\t\t%llu\n", + stats->os_lockless_writes); + seq_printf(seq, "lockless_read_bytes\t\t%llu\n", + stats->os_lockless_reads); + return 0; +} + +static ssize_t mdc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *obd = seq->private; + struct osc_stats *stats = &obd2osc_dev(obd)->osc_stats; + + memset(stats, 0, sizeof(*stats)); + stats->os_init = ktime_get_real(); + + return len; +} +LPROC_SEQ_FOPS(mdc_stats); + +static int mdc_dom_min_repsize_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + + seq_printf(m, "%u\n", obd->u.cli.cl_dom_min_inline_repsize); + + return 0; +} + +static ssize_t mdc_dom_min_repsize_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + unsigned int val; + int rc; + + rc = kstrtouint_from_user(buffer, count, 0, &val); + if (rc) + return rc; + + if (val > MDC_DOM_MAX_INLINE_REPSIZE) + return -ERANGE; + + obd->u.cli.cl_dom_min_inline_repsize = val; + return count; +} +LPROC_SEQ_FOPS(mdc_dom_min_repsize); + +static int mdc_lsom_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + + seq_printf(m, "%s\n", dev->u.cli.cl_lsom_update ? 
"On" : "Off"); + + return 0; +} + +static ssize_t mdc_lsom_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev; + bool val; + int rc; + + dev = ((struct seq_file *)file->private_data)->private; + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + dev->u.cli.cl_lsom_update = val; + return count; +} +LPROC_SEQ_FOPS(mdc_lsom); + + +LPROC_SEQ_FOPS_RO_TYPE(mdc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(mdc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mdc, timeouts); +LPROC_SEQ_FOPS_RO_TYPE(mdc, state); +LPROC_SEQ_FOPS_RW_TYPE(mdc, obd_max_pages_per_rpc); +LPROC_SEQ_FOPS_RW_TYPE(mdc, import); +LPROC_SEQ_FOPS_RW_TYPE(mdc, pinger_recov); + +struct lprocfs_vars lprocfs_mdc_obd_vars[] = { + { .name = "connect_flags", + .fops = &mdc_connect_flags_fops }, + { .name = "mds_server_uuid", + .fops = &mdc_server_uuid_fops }, + { .name = "max_pages_per_rpc", + .fops = &mdc_obd_max_pages_per_rpc_fops }, + { .name = "max_dirty_mb", + .fops = &mdc_max_dirty_mb_fops }, + { .name = "mdc_cached_mb", + .fops = &mdc_cached_mb_fops }, + { .name = "checksum_type", + .fops = &mdc_checksum_type_fops }, + { .name = "timeouts", + .fops = &mdc_timeouts_fops }, + { .name = "import", + .fops = &mdc_import_fops }, + { .name = "state", + .fops = &mdc_state_fops }, + { .name = "pinger_recov", + .fops = &mdc_pinger_recov_fops }, + { .name = "rpc_stats", + .fops = &mdc_rpc_stats_fops }, + { .name = "unstable_stats", + .fops = &mdc_unstable_stats_fops }, + { .name = "mdc_stats", + .fops = &mdc_stats_fops }, + { .name = "mdc_dom_min_repsize", + .fops = &mdc_dom_min_repsize_fops }, + { .name = "mdc_lsom", + .fops = &mdc_lsom_fops }, + { NULL } +}; + +static ssize_t cur_lost_grant_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + + return scnprintf(buf, PAGE_SIZE, "%lu\n", cli->cl_lost_grant); +} +LUSTRE_RO_ATTR(cur_lost_grant_bytes); + +static ssize_t cur_dirty_grant_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + + return scnprintf(buf, PAGE_SIZE, "%lu\n", cli->cl_dirty_grant); +} +LUSTRE_RO_ATTR(cur_dirty_grant_bytes); + +static ssize_t grant_shrink_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + ssize_t len; + + with_imp_locked(obd, imp, len) + len = scnprintf(buf, PAGE_SIZE, "%d\n", + !imp->imp_grant_shrink_disabled && + OCD_HAS_FLAG(&imp->imp_connect_data, + GRANT_SHRINK)); + + return len; +} + +static ssize_t grant_shrink_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + bool val; + int rc; + + if (obd == NULL) + return 0; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + with_imp_locked(obd, imp, rc) { + spin_lock(&imp->imp_lock); + imp->imp_grant_shrink_disabled = !val; + spin_unlock(&imp->imp_lock); + } + + return rc ?: count; +} +LUSTRE_RW_ATTR(grant_shrink); + +static ssize_t grant_shrink_interval_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return 
sprintf(buf, "%lld\n", obd->u.cli.cl_grant_shrink_interval); +} + +static ssize_t grant_shrink_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + if (val == 0) + return -ERANGE; + + obd->u.cli.cl_grant_shrink_interval = val; + osc_update_next_shrink(&obd->u.cli); + osc_schedule_grant_work(); + + return count; +} +LUSTRE_RW_ATTR(grant_shrink_interval); + +static struct attribute *mdc_attrs[] = { + &lustre_attr_active.attr, + &lustre_attr_checksums.attr, + &lustre_attr_checksum_dump.attr, + &lustre_attr_max_rpcs_in_flight.attr, + &lustre_attr_max_mod_rpcs_in_flight.attr, + &lustre_attr_mds_conn_uuid.attr, + &lustre_attr_conn_uuid.attr, + &lustre_attr_ping.attr, + &lustre_attr_grant_shrink.attr, + &lustre_attr_grant_shrink_interval.attr, + &lustre_attr_cur_lost_grant_bytes.attr, + &lustre_attr_cur_dirty_grant_bytes.attr, + NULL, +}; + +KOBJ_ATTRIBUTE_GROUPS(mdc); /* creates mdc_groups */ + +int mdc_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_ktype.default_groups = KOBJ_ATTR_GROUPS(mdc); + obd->obd_vars = lprocfs_mdc_obd_vars; + + rc = lprocfs_obd_setup(obd, false); + if (rc) + goto out_failed; +#ifdef CONFIG_PROC_FS + rc = lprocfs_alloc_md_stats(obd, 0); + if (rc) { + lprocfs_obd_cleanup(obd); + goto out_failed; + } +#endif + rc = sptlrpc_lprocfs_cliobd_attach(obd); + if (rc) { +#ifdef CONFIG_PROC_FS + lprocfs_free_md_stats(obd); +#endif + lprocfs_obd_cleanup(obd); + goto out_failed; + } + ptlrpc_lprocfs_register_obd(obd); + +out_failed: + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_acl.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_acl.c new file mode 100644 index 0000000000000..81263900fc1fc --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_acl.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +#include + +#include "mdc_internal.h" + +int mdc_unpack_acl(struct req_capsule *pill, struct lustre_md *md) +{ + struct mdt_body *body = md->body; + struct posix_acl *acl; + void *buf; + int rc; + + /* for ACL, it's possible that FLACL is set but aclsize is zero. + * only when aclsize != 0 there's an actual segment for ACL + * in reply buffer. + */ + if (!body->mbo_aclsize) { + md->posix_acl = NULL; + return 0; + } + + buf = req_capsule_server_sized_get(pill, &RMF_ACL, body->mbo_aclsize); + if (!buf) + return -EPROTO; + + acl = posix_acl_from_xattr(&init_user_ns, buf, body->mbo_aclsize); + if (IS_ERR_OR_NULL(acl)) { + rc = acl ? 
PTR_ERR(acl) : 0; + CERROR("convert xattr to acl: %d\n", rc); + return rc; + } + + rc = posix_acl_valid(&init_user_ns, acl); + if (rc) { + CERROR("validate acl: %d\n", rc); + posix_acl_release(acl); + return rc; + } + + md->posix_acl = acl; + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c new file mode 100644 index 0000000000000..843c4de8a43b7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c @@ -0,0 +1,881 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Commissariat a l'Energie Atomique et aux Energies + * Alternatives. + * + * Copyright (c) 2017, Intel Corporation. + * + * Author: Henri Doreau + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "mdc_internal.h" + + +/* + * -- Changelog delivery through character device -- + */ + +/** + * Mutex to protect chlg_registered_devices below + */ +static DEFINE_MUTEX(chlg_registered_dev_lock); + +/** + * Global linked list of all registered devices (one per MDT). 
+ */ +static LIST_HEAD(chlg_registered_devices); + + +struct chlg_registered_dev { + /* Device name of the form "changelog-{MDTNAME}" */ + char ced_name[32]; + /* changelog char device */ + struct cdev ced_cdev; + struct device ced_device; + /* OBDs referencing this device (multiple mount point) */ + struct list_head ced_obds; + /* Reference counter for proper deregistration */ + struct kref ced_refs; + /* Link within the global chlg_registered_devices */ + struct list_head ced_link; +}; + +struct chlg_reader_state { + /* Shortcut to the corresponding OBD device */ + struct obd_device *crs_obd; + /* the corresponding chlg_registered_dev */ + struct chlg_registered_dev *crs_ced; + /* Producer thread (if any) */ + struct task_struct *crs_prod_task; + /* An error occurred that prevents from reading further */ + int crs_err; + /* EOF, no more records available */ + bool crs_eof; + /* Desired start position */ + __u64 crs_start_offset; + /* Wait queue for the catalog processing thread */ + wait_queue_head_t crs_waitq_prod; + /* Wait queue for the record copy threads */ + wait_queue_head_t crs_waitq_cons; + /* Mutex protecting crs_rec_count and crs_rec_queue */ + struct mutex crs_lock; + /* Number of item in the list */ + __u64 crs_rec_count; + /* List of prefetched enqueued_record::enq_linkage_items */ + struct list_head crs_rec_queue; + unsigned int crs_last_catidx; + unsigned int crs_last_idx; + bool crs_poll; +}; + +struct chlg_rec_entry { + /* Link within the chlg_reader_state::crs_rec_queue list */ + struct list_head enq_linkage; + /* Data (enq_record) field length */ + __u64 enq_length; + /* Copy of a changelog record (see struct llog_changelog_rec) */ + struct changelog_rec enq_record[]; +}; + +enum { + /* Number of records to prefetch locally. */ + CDEV_CHLG_MAX_PREFETCH = 1024, +}; + +DEFINE_IDR(mdc_changelog_minor_idr); +static DEFINE_SPINLOCK(chlg_minor_lock); + +static int chlg_minor_alloc(int *pminor) +{ + void *minor_allocated = (void *)-1; + int minor; + + idr_preload(GFP_KERNEL); + spin_lock(&chlg_minor_lock); + minor = idr_alloc(&mdc_changelog_minor_idr, minor_allocated, 0, + MDC_CHANGELOG_DEV_COUNT, GFP_NOWAIT); + spin_unlock(&chlg_minor_lock); + idr_preload_end(); + + if (minor < 0) + return minor; + + *pminor = minor; + return 0; +} + +static void chlg_minor_free(int minor) +{ + spin_lock(&chlg_minor_lock); + idr_remove(&mdc_changelog_minor_idr, minor); + spin_unlock(&chlg_minor_lock); +} + +static void chlg_device_release(struct device *dev) +{ + struct chlg_registered_dev *entry = dev_get_drvdata(dev); + + chlg_minor_free(MINOR(entry->ced_cdev.dev)); + OBD_FREE_PTR(entry); +} + +/** + * Deregister a changelog character device whose refcount has reached zero. 
+ */ +static void chlg_dev_clear(struct kref *kref) +{ + struct chlg_registered_dev *entry; + + ENTRY; + entry = container_of(kref, struct chlg_registered_dev, + ced_refs); + + list_del(&entry->ced_link); + cdev_device_del(&entry->ced_cdev, &entry->ced_device); + put_device(&entry->ced_device); + EXIT; +} + +static inline struct obd_device* chlg_obd_get(struct chlg_registered_dev *dev) +{ + struct obd_device *obd; + + mutex_lock(&chlg_registered_dev_lock); + if (list_empty(&dev->ced_obds)) + return NULL; + + obd = list_first_entry(&dev->ced_obds, struct obd_device, + u.cli.cl_chg_dev_linkage); + class_incref(obd, "changelog", dev); + mutex_unlock(&chlg_registered_dev_lock); + return obd; +} + +static inline void chlg_obd_put(struct chlg_registered_dev *dev, + struct obd_device *obd) +{ + class_decref(obd, "changelog", dev); +} + +/** + * ChangeLog catalog processing callback invoked on each record. + * If the current record is eligible to userland delivery, push + * it into the crs_rec_queue where the consumer code will fetch it. + * + * @param[in] env (unused) + * @param[in] llh Client-side handle used to identify the llog + * @param[in] hdr Header of the current llog record + * @param[in,out] data chlg_reader_state passed from caller + * + * @return 0 or LLOG_PROC_* control code on success, negated error on failure. + */ +static int chlg_read_cat_process_cb(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *hdr, void *data) +{ + struct llog_changelog_rec *rec; + struct chlg_reader_state *crs = data; + struct chlg_rec_entry *enq; + size_t len; + int rc; + ENTRY; + + LASSERT(crs != NULL); + LASSERT(hdr != NULL); + + rec = container_of(hdr, struct llog_changelog_rec, cr_hdr); + + crs->crs_last_catidx = llh->lgh_hdr->llh_cat_idx; + crs->crs_last_idx = hdr->lrh_index; + + if (rec->cr_hdr.lrh_type != CHANGELOG_REC) { + rc = -EINVAL; + CERROR("%s: not a changelog rec %x/%d in llog : rc = %d\n", + crs->crs_obd->obd_name, rec->cr_hdr.lrh_type, + rec->cr.cr_type, rc); + RETURN(rc); + } + + /* Skip undesired records */ + if (rec->cr.cr_index < crs->crs_start_offset) + RETURN(0); + + CDEBUG(D_HSM, "%llu %02d%-5s %llu 0x%x t="DFID" p="DFID" %.*s\n", + rec->cr.cr_index, rec->cr.cr_type, + changelog_type2str(rec->cr.cr_type), rec->cr.cr_time, + rec->cr.cr_flags & CLF_FLAGMASK, + PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid), + rec->cr.cr_namelen, changelog_rec_name(&rec->cr)); + + wait_event_interruptible(crs->crs_waitq_prod, + crs->crs_rec_count < CDEV_CHLG_MAX_PREFETCH || + kthread_should_stop()); + + if (kthread_should_stop()) + RETURN(LLOG_PROC_BREAK); + + len = changelog_rec_size(&rec->cr) + rec->cr.cr_namelen; + OBD_ALLOC(enq, sizeof(*enq) + len); + if (enq == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&enq->enq_linkage); + enq->enq_length = len; + memcpy(enq->enq_record, &rec->cr, len); + + mutex_lock(&crs->crs_lock); + list_add_tail(&enq->enq_linkage, &crs->crs_rec_queue); + crs->crs_rec_count++; + mutex_unlock(&crs->crs_lock); + + wake_up(&crs->crs_waitq_cons); + + RETURN(0); +} + +/** + * Remove record from the list it is attached to and free it. + */ +static void enq_record_delete(struct chlg_rec_entry *rec) +{ + list_del(&rec->enq_linkage); + OBD_FREE(rec, sizeof(*rec) + rec->enq_length); +} + +/** + * Record prefetch thread entry point. Opens the changelog catalog and starts + * reading records. + * + * @param[in,out] args chlg_reader_state passed from caller. + * @return 0 on success, negated error code on failure. 
+ */ +static int chlg_load(void *args) +{ + struct chlg_reader_state *crs = args; + struct chlg_registered_dev *ced = crs->crs_ced; + struct obd_device *obd = NULL; + struct llog_ctxt *ctx = NULL; + struct llog_handle *llh = NULL; + int rc; + ENTRY; + + crs->crs_last_catidx = 0; + crs->crs_last_idx = 0; + +again: + obd = chlg_obd_get(ced); + if (obd == NULL) + RETURN(-ENODEV); + + crs->crs_obd = obd; + + ctx = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT); + if (ctx == NULL) + GOTO(err_out, rc = -ENOENT); + + rc = llog_open(NULL, ctx, &llh, NULL, CHANGELOG_CATALOG, + LLOG_OPEN_EXISTS); + if (rc) { + CERROR("%s: fail to open changelog catalog: rc = %d\n", + obd->obd_name, rc); + GOTO(err_out, rc); + } + + + rc = llog_init_handle(NULL, llh, + LLOG_F_IS_CAT | + LLOG_F_EXT_JOBID | + LLOG_F_EXT_EXTRA_FLAGS | + LLOG_F_EXT_X_UIDGID | + LLOG_F_EXT_X_NID | + LLOG_F_EXT_X_OMODE | + LLOG_F_EXT_X_XATTR, + NULL); + if (rc) { + CERROR("%s: fail to init llog handle: rc = %d\n", + obd->obd_name, rc); + GOTO(err_out, rc); + } + + rc = llog_cat_process(NULL, llh, chlg_read_cat_process_cb, crs, + crs->crs_last_catidx, crs->crs_last_idx); + if (rc < 0) { + CERROR("%s: fail to process llog: rc = %d\n", obd->obd_name, rc); + GOTO(err_out, rc); + } + if (!kthread_should_stop() && crs->crs_poll) { + llog_cat_close(NULL, llh); + llog_ctxt_put(ctx); + class_decref(obd, "changelog", crs); + schedule_timeout_interruptible(cfs_time_seconds(1)); + goto again; + } + + crs->crs_eof = true; + +err_out: + if (rc < 0) + crs->crs_err = rc; + + wake_up(&crs->crs_waitq_cons); + + if (llh != NULL) + llog_cat_close(NULL, llh); + + if (ctx != NULL) + llog_ctxt_put(ctx); + + crs->crs_obd = NULL; + chlg_obd_put(ced, obd); + wait_event_interruptible(crs->crs_waitq_prod, kthread_should_stop()); + + RETURN(rc); +} + +static int chlg_start_thread(struct file *file) +{ + struct chlg_reader_state *crs = file->private_data; + struct task_struct *task; + int rc = 0; + + if (likely(crs->crs_prod_task)) + return 0; + if (unlikely(file->f_mode & FMODE_READ) == 0) + return 0; + + mutex_lock(&crs->crs_lock); + if (crs->crs_prod_task == NULL) { + task = kthread_run(chlg_load, crs, "chlg_load_thread"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start changelog thread: rc = %d\n", + crs->crs_ced->ced_name, rc); + GOTO(out, rc); + } + crs->crs_prod_task = task; + } +out: + mutex_unlock(&crs->crs_lock); + return rc; +} + +/** + * Read handler, dequeues records from the chlg_reader_state if any. + * No partial records are copied to userland so this function can return less + * data than required (short read). + * + * @param[in] file File pointer to the character device. + * @param[out] buff Userland buffer where to copy the records. + * @param[in] count Userland buffer size. + * @param[out] ppos File position, updated with the index number of the next + * record to read. + * @return number of copied bytes on success, negated error code on failure. 
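+ *
+ * Illustrative userspace sketch (not part of this patch): a consumer
+ * draining records from the character device. The device path follows the
+ * "changelog-<fsname>-MDTxxxx" naming set up by mdc_changelog_cdev_init()
+ * and is an assumption here; parsing the returned bytes would use the
+ * changelog_rec layout from the Lustre userspace headers, and
+ * consume_records() below is a hypothetical helper.
+ *
+ *	int fd = open("/dev/changelog-testfs-MDT0000", O_RDONLY);
+ *	char buf[1 << 16];
+ *	ssize_t n;
+ *
+ *	while ((n = read(fd, buf, sizeof(buf))) > 0) {
+ *		// buf holds a whole number of changelog records; a short
+ *		// read only means fewer complete records fit in the buffer.
+ *		consume_records(buf, n);
+ *	}
+ *	// n == 0 means EOF was reached, n < 0 reports an error (e.g.
+ *	// -EAGAIN for O_NONBLOCK with nothing queued yet).
+ *	close(fd);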
+ */ +static ssize_t chlg_read(struct file *file, char __user *buff, size_t count, + loff_t *ppos) +{ + struct chlg_reader_state *crs = file->private_data; + struct chlg_rec_entry *rec; + struct chlg_rec_entry *tmp; + size_t written_total = 0; + ssize_t rc; + LIST_HEAD(consumed); + ENTRY; + + if (file->f_flags & O_NONBLOCK && crs->crs_rec_count == 0) { + if (crs->crs_err < 0) + RETURN(crs->crs_err); + else if (crs->crs_eof) + RETURN(0); + else + RETURN(-EAGAIN); + } + + rc = chlg_start_thread(file); + if (rc) + RETURN(rc); + + rc = wait_event_interruptible(crs->crs_waitq_cons, + crs->crs_rec_count > 0 || crs->crs_eof || crs->crs_err); + + mutex_lock(&crs->crs_lock); + list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) { + if (written_total + rec->enq_length > count) + break; + + if (copy_to_user(buff, rec->enq_record, rec->enq_length)) { + rc = -EFAULT; + break; + } + + buff += rec->enq_length; + written_total += rec->enq_length; + + crs->crs_rec_count--; + list_move_tail(&rec->enq_linkage, &consumed); + + crs->crs_start_offset = rec->enq_record->cr_index + 1; + } + mutex_unlock(&crs->crs_lock); + + if (written_total > 0) { + rc = written_total; + wake_up(&crs->crs_waitq_prod); + } else if (rc == 0) { + rc = crs->crs_err; + } + + list_for_each_entry_safe(rec, tmp, &consumed, enq_linkage) + enq_record_delete(rec); + + *ppos = crs->crs_start_offset; + + RETURN(rc); +} + +/** + * Jump to a given record index. Helper for chlg_llseek(). + * + * @param[in,out] crs Internal reader state. + * @param[in] offset Desired offset (index record). + * @return 0 on success, negated error code on failure. + */ +static int chlg_set_start_offset(struct chlg_reader_state *crs, __u64 offset) +{ + struct chlg_rec_entry *rec; + struct chlg_rec_entry *tmp; + + mutex_lock(&crs->crs_lock); + if (offset < crs->crs_start_offset) { + mutex_unlock(&crs->crs_lock); + return -ERANGE; + } + + crs->crs_start_offset = offset; + list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) { + struct changelog_rec *cr = rec->enq_record; + + if (cr->cr_index >= crs->crs_start_offset) + break; + + crs->crs_rec_count--; + enq_record_delete(rec); + } + + mutex_unlock(&crs->crs_lock); + wake_up(&crs->crs_waitq_prod); + return 0; +} + +/** + * Move read pointer to a certain record index, encoded as an offset. + * + * @param[in,out] file File pointer to the changelog character device + * @param[in] off Offset to skip, actually a record index, not byte count + * @param[in] whence Relative/Absolute interpretation of the offset + * @return the resulting position on success or negated error code on failure. + */ +static loff_t chlg_llseek(struct file *file, loff_t off, int whence) +{ + struct chlg_reader_state *crs = file->private_data; + loff_t pos; + int rc; + + switch (whence) { + case SEEK_SET: + pos = off; + break; + case SEEK_CUR: + pos = file->f_pos + off; + break; + case SEEK_END: + default: + return -EINVAL; + } + + /* We cannot go backward */ + if (pos < file->f_pos) + return -EINVAL; + + rc = chlg_set_start_offset(crs, pos); + if (rc != 0) + return rc; + + file->f_pos = pos; + return pos; +} + +/** + * Clear record range for a given changelog reader. + * + * @param[in] crs Current internal state. + * @param[in] reader Changelog reader ID (cl1, cl2...) + * @param[in] record Record index up which to clear + * @return 0 on success, negated error code on failure. 
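+ *
+ * Userspace reaches this through write() on the character device (see
+ * chlg_write() below). A minimal sketch, assuming reader "cl1" is
+ * registered on this MDT, records up to index 12345 have been consumed,
+ * and the same example device path as above:
+ *
+ *	const char cmd[] = "clear:cl1:12345";
+ *	int fd = open("/dev/changelog-testfs-MDT0000", O_WRONLY);
+ *
+ *	if (write(fd, cmd, strlen(cmd)) < 0)
+ *		perror("changelog clear");
+ *	close(fd);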
+ */ +static int chlg_clear(struct chlg_reader_state *crs, __u32 reader, __u64 record) +{ + struct obd_device *obd = NULL; + struct changelog_setinfo cs = { + .cs_recno = record, + .cs_id = reader + }; + int rc; + + obd = chlg_obd_get(crs->crs_ced); + if (obd == NULL) + return -ENODEV; + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_CHANGELOG_CLEAR), + KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, NULL); + + chlg_obd_put(crs->crs_ced, obd); + return rc; +} + +/** Maximum changelog control command size */ +#define CHLG_CONTROL_CMD_MAX 64 + +/** + * Handle writes() into the changelog character device. Write() can be used + * to request special control operations. + * + * @param[in] file File pointer to the changelog character device + * @param[in] buff User supplied data (written data) + * @param[in] count Number of written bytes + * @param[in] off (unused) + * @return number of written bytes on success, negated error code on failure. + */ +static ssize_t chlg_write(struct file *file, const char __user *buff, + size_t count, loff_t *off) +{ + struct chlg_reader_state *crs = file->private_data; + char *kbuf; + __u64 record; + __u32 reader; + int rc = 0; + ENTRY; + + if (count > CHLG_CONTROL_CMD_MAX) + RETURN(-EINVAL); + + OBD_ALLOC(kbuf, CHLG_CONTROL_CMD_MAX); + if (kbuf == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(kbuf, buff, count)) + GOTO(out_kbuf, rc = -EFAULT); + + kbuf[CHLG_CONTROL_CMD_MAX - 1] = '\0'; + + if (sscanf(kbuf, "clear:cl%u:%llu", &reader, &record) == 2) + rc = chlg_clear(crs, reader, record); + else + rc = -EINVAL; + + EXIT; +out_kbuf: + OBD_FREE(kbuf, CHLG_CONTROL_CMD_MAX); + return rc < 0 ? rc : count; +} + +/** + * Open handler, initialize internal CRS state and spawn prefetch thread if + * needed. + * @param[in] inode Inode struct for the open character device. + * @param[in] file Corresponding file pointer. + * @return 0 on success, negated error code on failure. + */ +static int chlg_open(struct inode *inode, struct file *file) +{ + struct chlg_reader_state *crs; + struct chlg_registered_dev *dev; + ENTRY; + + dev = container_of(inode->i_cdev, struct chlg_registered_dev, ced_cdev); + + OBD_ALLOC_PTR(crs); + if (!crs) + RETURN(-ENOMEM); + + kref_get(&dev->ced_refs); + crs->crs_ced = dev; + crs->crs_err = false; + crs->crs_eof = false; + + mutex_init(&crs->crs_lock); + INIT_LIST_HEAD(&crs->crs_rec_queue); + init_waitqueue_head(&crs->crs_waitq_prod); + init_waitqueue_head(&crs->crs_waitq_cons); + crs->crs_prod_task = NULL; + + file->private_data = crs; + RETURN(0); +} + +/** + * Close handler, release resources. + * + * @param[in] inode Inode struct for the open character device. + * @param[in] file Corresponding file pointer. + * @return 0 on success, negated error code on failure. + */ +static int chlg_release(struct inode *inode, struct file *file) +{ + struct chlg_reader_state *crs = file->private_data; + struct chlg_rec_entry *rec; + struct chlg_rec_entry *tmp; + int rc = 0; + + if (crs->crs_prod_task) + rc = kthread_stop(crs->crs_prod_task); + + list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) + enq_record_delete(rec); + + kref_put(&crs->crs_ced->ced_refs, chlg_dev_clear); + OBD_FREE_PTR(crs); + + return rc; +} + +/** + * Poll handler, indicates whether the device is readable (new records) and + * writable (always). + * + * @param[in] file Device file pointer. + * @param[in] wait (opaque) + * @return combination of the poll status flags. 
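+ *
+ * Illustrative use from userspace (a sketch, not part of this patch;
+ * fd, buf and n are as in the read sketch above): after switching the
+ * device to follow mode with the OBD_IOC_CHLG_POLL ioctl handled below,
+ * poll() blocks until new records arrive, while POLLHUP/POLLERR report
+ * end of file or a producer error.
+ *
+ *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
+ *
+ *	ioctl(fd, OBD_IOC_CHLG_POLL, 1);	// keep following new records
+ *	while (poll(&pfd, 1, -1) > 0) {
+ *		if (pfd.revents & POLLIN)
+ *			n = read(fd, buf, sizeof(buf));
+ *		if (pfd.revents & (POLLHUP | POLLERR))
+ *			break;
+ *	}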
+ */ +static unsigned int chlg_poll(struct file *file, poll_table *wait) +{ + struct chlg_reader_state *crs = file->private_data; + unsigned int mask = 0; + int rc; + + rc = chlg_start_thread(file); + if (rc) + RETURN(rc); + + mutex_lock(&crs->crs_lock); + poll_wait(file, &crs->crs_waitq_cons, wait); + if (crs->crs_rec_count > 0) + mask |= POLLIN | POLLRDNORM; + if (crs->crs_err) + mask |= POLLERR; + if (crs->crs_eof) + mask |= POLLHUP; + mutex_unlock(&crs->crs_lock); + return mask; +} + +static long chlg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc; + + struct chlg_reader_state *crs = file->private_data; + switch (cmd) { + case OBD_IOC_CHLG_POLL: + crs->crs_poll = !!arg; + rc = 0; + break; + default: + rc = -EINVAL; + break; + } + return rc; +} + +static const struct file_operations chlg_fops = { + .owner = THIS_MODULE, + .llseek = chlg_llseek, + .read = chlg_read, + .write = chlg_write, + .open = chlg_open, + .release = chlg_release, + .poll = chlg_poll, + .unlocked_ioctl = chlg_ioctl, +}; + +/** + * This uses obd_name of the form: "testfs-MDT0000-mdc-ffff88006501600" + * and returns a name of the form: "changelog-testfs-MDT0000". + */ +static void get_target_name(char *name, size_t name_len, struct obd_device *obd) +{ + int i; + + snprintf(name, name_len, "%s", obd->obd_name); + + /* Find the 2nd '-' from the end and truncate on it */ + for (i = 0; i < 2; i++) { + char *p = strrchr(name, '-'); + + if (p == NULL) + return; + *p = '\0'; + } +} + +/** + * Find a changelog character device by name. + * All devices registered during MDC setup are listed in a global list with + * their names attached. + */ +static struct chlg_registered_dev * +chlg_registered_dev_find_by_name(const char *name) +{ + struct chlg_registered_dev *dit; + + LASSERT(mutex_is_locked(&chlg_registered_dev_lock)); + list_for_each_entry(dit, &chlg_registered_devices, ced_link) + if (strcmp(name, dit->ced_name) == 0) + return dit; + return NULL; +} + +/** + * Find chlg_registered_dev structure for a given OBD device. + * This is bad O(n^2) but for each filesystem: + * - N is # of MDTs times # of mount points + * - this only runs at shutdown + */ +static struct chlg_registered_dev * +chlg_registered_dev_find_by_obd(const struct obd_device *obd) +{ + struct chlg_registered_dev *dit; + struct obd_device *oit; + + LASSERT(mutex_is_locked(&chlg_registered_dev_lock)); + list_for_each_entry(dit, &chlg_registered_devices, ced_link) + list_for_each_entry(oit, &dit->ced_obds, + u.cli.cl_chg_dev_linkage) + if (oit == obd) + return dit; + return NULL; +} + +/** + * Changelog character device initialization. + * Register a misc character device with a dynamic minor number, under a name + * of the form: 'changelog-fsname-MDTxxxx'. Reference this OBD device with it. + * + * @param[in] obd This MDC obd_device. + * @return 0 on success, negated error code on failure. 
+ */ +int mdc_changelog_cdev_init(struct obd_device *obd) +{ + struct chlg_registered_dev *exist; + struct chlg_registered_dev *entry; + int minor, rc; + ENTRY; + + OBD_ALLOC_PTR(entry); + if (entry == NULL) + RETURN(-ENOMEM); + + get_target_name(entry->ced_name, sizeof(entry->ced_name), obd); + + kref_init(&entry->ced_refs); + INIT_LIST_HEAD(&entry->ced_obds); + INIT_LIST_HEAD(&entry->ced_link); + + mutex_lock(&chlg_registered_dev_lock); + exist = chlg_registered_dev_find_by_name(entry->ced_name); + if (exist != NULL) { + kref_get(&exist->ced_refs); + list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &exist->ced_obds); + GOTO(out_unlock, rc = 0); + } + + list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &entry->ced_obds); + list_add_tail(&entry->ced_link, &chlg_registered_devices); + + rc = chlg_minor_alloc(&minor); + if (rc) + GOTO(out_unlock, rc); + + device_initialize(&entry->ced_device); + entry->ced_device.devt = MKDEV(MAJOR(mdc_changelog_dev), minor); + entry->ced_device.class = mdc_changelog_class; + entry->ced_device.release = chlg_device_release; + dev_set_drvdata(&entry->ced_device, entry); + rc = dev_set_name(&entry->ced_device, "%s-%s", MDC_CHANGELOG_DEV_NAME, + entry->ced_name); + if (rc) + GOTO(out_minor, rc); + + /* Register new character device */ + cdev_init(&entry->ced_cdev, &chlg_fops); + entry->ced_cdev.owner = THIS_MODULE; + rc = cdev_device_add(&entry->ced_cdev, &entry->ced_device); + if (rc) + GOTO(out_device_name, rc); + + entry = NULL; /* prevent it from being freed below */ + GOTO(out_unlock, rc = 0); + +out_device_name: + kfree_const(entry->ced_device.kobj.name); + +out_minor: + chlg_minor_free(minor); + + list_del_init(&obd->u.cli.cl_chg_dev_linkage); + list_del(&entry->ced_link); + +out_unlock: + mutex_unlock(&chlg_registered_dev_lock); + if (entry) + OBD_FREE_PTR(entry); + RETURN(rc); +} + +/** + * Release OBD, decrease reference count of the corresponding changelog device. + */ +void mdc_changelog_cdev_finish(struct obd_device *obd) +{ + struct chlg_registered_dev *dev; + + ENTRY; + mutex_lock(&chlg_registered_dev_lock); + dev = chlg_registered_dev_find_by_obd(obd); + list_del_init(&obd->u.cli.cl_chg_dev_linkage); + kref_put(&dev->ced_refs, chlg_dev_clear); + mutex_unlock(&chlg_registered_dev_lock); + EXIT; +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c new file mode 100644 index 0000000000000..45c0f3d20fcd1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c @@ -0,0 +1,1627 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_device, cl_req for MDC layer. 
+ * + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include +#include +#include + +#include "mdc_internal.h" + +static void mdc_lock_build_policy(const struct lu_env *env, + const struct cl_lock *lock, + union ldlm_policy_data *policy) +{ + memset(policy, 0, sizeof *policy); + policy->l_inodebits.bits = MDS_INODELOCK_DOM; + if (lock) { + policy->l_inodebits.li_gid = lock->cll_descr.cld_gid; + } +} + +int mdc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +{ + return osc_ldlm_glimpse_ast(dlmlock, data); +} + +static void mdc_lock_build_einfo(const struct lu_env *env, + const struct cl_lock *lock, + struct osc_object *osc, + struct ldlm_enqueue_info *einfo) +{ + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = osc_cl_lock2ldlm(lock->cll_descr.cld_mode); + einfo->ei_cb_bl = mdc_ldlm_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = mdc_ldlm_glimpse_ast; + einfo->ei_cbdata = osc; /* value to be put into ->l_ast_data */ + einfo->ei_req_slot = 1; +} + +static void mdc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb); + +static int mdc_set_dom_lock_data(struct ldlm_lock *lock, void *data) +{ + int set = 0; + + LASSERT(lock != NULL); + LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast); + + lock_res_and_lock(lock); + + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + set = 1; + + unlock_res_and_lock(lock); + + return set; +} + +int mdc_dom_lock_match(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, + struct lustre_handle *lockh, + enum ldlm_match_flags match_flags) +{ + struct obd_device *obd = exp->exp_obd; + __u64 lflags = *flags; + enum ldlm_mode rc; + + ENTRY; + + rc = ldlm_lock_match_with_skip(obd->obd_namespace, lflags, 0, + res_id, type, policy, mode, lockh, match_flags); + if (rc == 0 || lflags & LDLM_FL_TEST_LOCK) + RETURN(rc); + + if (obj != NULL) { + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + + LASSERT(lock != NULL); + if (mdc_set_dom_lock_data(lock, obj)) { + lock_res_and_lock(lock); + if (!ldlm_is_lvb_cached(lock)) { + LASSERT(lock->l_ast_data == obj); + mdc_lock_lvb_update(env, obj, lock, NULL); + ldlm_set_lvb_cached(lock); + } + unlock_res_and_lock(lock); + } else { + ldlm_lock_decref(lockh, rc); + rc = 0; + } + LDLM_LOCK_PUT(lock); + } + RETURN(rc); +} + +/** + * Finds an existing lock covering a page with given index. + * Copy of osc_obj_dlmlock_at_pgoff() but for DoM IBITS lock. 
+ */ +struct ldlm_lock *mdc_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + enum osc_dap_flags dap_flags) +{ + struct osc_thread_info *info = osc_env_info(env); + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + struct lustre_handle lockh; + struct ldlm_lock *lock = NULL; + enum ldlm_mode mode; + __u64 flags; + enum ldlm_match_flags match_flags = 0; + + ENTRY; + + fid_build_reg_res_name(lu_object_fid(osc2lu(obj)), resname); + mdc_lock_build_policy(env, NULL, policy); + policy->l_inodebits.li_gid = LDLM_GID_ANY; + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING; + if (dap_flags & OSC_DAP_FL_TEST_LOCK) + flags |= LDLM_FL_TEST_LOCK; + + if (dap_flags & OSC_DAP_FL_AST) + match_flags |= LDLM_MATCH_AST; + + if (dap_flags & OSC_DAP_FL_CANCELING) + match_flags |= LDLM_MATCH_UNREF; + +again: + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. */ + mode = mdc_dom_lock_match(env, osc_export(obj), resname, LDLM_IBITS, + policy, LCK_PR | LCK_PW | LCK_GROUP, &flags, + obj, &lockh, match_flags); + if (mode != 0) { + lock = ldlm_handle2lock(&lockh); + /* RACE: the lock is cancelled so let's try again */ + if (unlikely(lock == NULL)) + goto again; + } + + RETURN(lock); +} + +/** + * Check if page @page is covered by an extra lock or discard it. + */ +static bool mdc_check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + void **pvec, int count, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_object *osc = cbdata; + pgoff_t index; + int i; + + for (i = 0; i < count; i++) { + struct osc_page *ops = pvec[i]; + + index = osc_index(ops); + if (index >= info->oti_fn_index) { + struct ldlm_lock *tmp; + struct cl_page *page = ops->ops_cl.cpl_page; + + /* refresh non-overlapped index */ + tmp = mdc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK | OSC_DAP_FL_AST); + if (tmp != NULL) { + info->oti_fn_index = CL_PAGE_EOF; + LDLM_LOCK_PUT(tmp); + } else if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->oti_next_index = index + 1; + } + return true; +} + +/** + * Discard pages protected by the given lock. This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). + */ +static int mdc_lock_discard_pages(const struct lu_env *env, + struct osc_object *osc, + pgoff_t start, pgoff_t end, + bool discard) +{ + struct osc_thread_info *info = osc_env_info(env); + struct cl_io *io = &info->oti_io; + osc_page_gang_cbt cb; + int result; + + ENTRY; + + io->ci_obj = cl_object_top(osc2cl(osc)); + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + GOTO(out, result); + + cb = discard ? 
osc_discard_cb : mdc_check_and_discard_cb; + info->oti_fn_index = info->oti_next_index = start; + + osc_page_gang_lookup(env, io, osc, info->oti_next_index, + end, cb, (void *)osc); +out: + cl_io_fini(env, io); + RETURN(result); +} + +static int mdc_lock_flush(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, enum cl_lock_mode mode, + bool discard) +{ + int result = 0; + int rc; + + ENTRY; + + if (mode == CLM_WRITE) { + result = osc_cache_writeback_range(env, obj, start, end, 1, + discard); + CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n", + obj, start, end, result, + discard ? "discarded" : "written back"); + if (result > 0) + result = 0; + } + + /* Avoid lock matching with CLM_WRITE, there can be no other locks */ + rc = mdc_lock_discard_pages(env, obj, start, end, + mode == CLM_WRITE || discard); + if (result == 0 && rc < 0) + result = rc; + + RETURN(result); +} + +void mdc_lock_lockless_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct cl_lock_descr *descr = &slice->cls_lock->cll_descr; + int rc; + + LASSERT(ols->ols_dlmlock == NULL); + rc = mdc_lock_flush(env, osc, descr->cld_start, descr->cld_end, + descr->cld_mode, 0); + if (rc != 0) + CERROR("Pages for lockless lock %p were not purged(%d)\n", + ols, rc); + + osc_lock_wake_waiters(env, osc, ols); +} + +/** + * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock + * and ldlm_lock caches. + */ +static int mdc_dlm_canceling(const struct lu_env *env, + struct ldlm_lock *dlmlock) +{ + struct cl_object *obj = NULL; + int result = 0; + bool discard; + enum cl_lock_mode mode = CLM_READ; + + ENTRY; + + lock_res_and_lock(dlmlock); + if (!ldlm_is_granted(dlmlock)) { + dlmlock->l_ast_data = NULL; + unlock_res_and_lock(dlmlock); + RETURN(0); + } + + discard = ldlm_is_discard_data(dlmlock); + if (dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP)) + mode = CLM_WRITE; + + if (dlmlock->l_ast_data != NULL) { + obj = osc2cl(dlmlock->l_ast_data); + cl_object_get(obj); + } + unlock_res_and_lock(dlmlock); + + /* if l_ast_data is NULL, the dlmlock was enqueued by AGL or + * the object has been destroyed. */ + if (obj != NULL) { + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + + /* Destroy pages covered by the extent of the DLM lock */ + result = mdc_lock_flush(env, cl2osc(obj), cl_index(obj, 0), + CL_PAGE_EOF, mode, discard); + /* Losing a lock, set KMS to 0. + * NB: assumed that DOM lock covers whole data on MDT. + */ + /* losing a lock, update kms */ + lock_res_and_lock(dlmlock); + dlmlock->l_ast_data = NULL; + cl_object_attr_lock(obj); + attr->cat_kms = 0; + cl_object_attr_update(env, obj, attr, CAT_KMS); + cl_object_attr_unlock(obj); + unlock_res_and_lock(dlmlock); + cl_object_put(env, obj); + } + RETURN(result); +} + +int mdc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, int reason) +{ + int rc = 0; + + ENTRY; + + switch (reason) { + case LDLM_CB_BLOCKING: { + struct lustre_handle lockh; + + ldlm_lock2handle(dlmlock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc == -ENODATA) + rc = 0; + break; + } + case LDLM_CB_CANCELING: { + struct lu_env *env; + __u16 refcheck; + + /* + * This can be called in the context of outer IO, e.g., + * + * osc_enqueue_base()->... + * ->ldlm_prep_elc_req()->... + * ->ldlm_cancel_callback()->... 
+ * ->osc_ldlm_blocking_ast() + * + * new environment has to be created to not corrupt outer + * context. + */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + rc = PTR_ERR(env); + break; + } + + rc = mdc_dlm_canceling(env, dlmlock); + cl_env_put(env, &refcheck); + break; + } + default: + LBUG(); + } + RETURN(rc); +} + +/** + * Updates object attributes from a lock value block (lvb) received together + * with the DLM lock reply from the server. + * This can be optimized to not update attributes when lock is a result of a + * local match. + * + * Called under lock and resource spin-locks. + */ +void mdc_lock_lvb_update(const struct lu_env *env, struct osc_object *osc, + struct ldlm_lock *dlmlock, struct ost_lvb *lvb) +{ + struct cl_object *obj = osc2cl(osc); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | + CAT_SIZE; + unsigned int setkms = 0; + + ENTRY; + + if (lvb == NULL) { + LASSERT(dlmlock != NULL); + lvb = &dlmlock->l_ost_lvb; + } + cl_lvb2attr(attr, lvb); + + cl_object_attr_lock(obj); + if (dlmlock != NULL) { + __u64 size; + + check_res_locked(dlmlock->l_resource); + size = lvb->lvb_size; + + if (size >= oinfo->loi_kms) { + valid |= CAT_KMS; + attr->cat_kms = size; + setkms = 1; + } + ldlm_lock_allow_match_locked(dlmlock); + } + + /* The size should not be less than the kms */ + if (attr->cat_size < oinfo->loi_kms) + attr->cat_size = oinfo->loi_kms; + + LDLM_DEBUG(dlmlock, "acquired size %llu, setting rss=%llu;%s " + "kms=%llu, end=%llu", lvb->lvb_size, attr->cat_size, + setkms ? "" : " leaving", + setkms ? attr->cat_kms : oinfo->loi_kms, + dlmlock ? dlmlock->l_policy_data.l_extent.end : -1ull); + + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + EXIT; +} + +static void mdc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, + struct lustre_handle *lockh) +{ + struct osc_object *osc = cl2osc(oscl->ols_cl.cls_obj); + struct ldlm_lock *dlmlock; + + ENTRY; + + dlmlock = ldlm_handle2lock_long(lockh, 0); + LASSERT(dlmlock != NULL); + + /* lock reference taken by ldlm_handle2lock_long() is + * owned by osc_lock and released in osc_lock_detach() + */ + lu_ref_add_atomic(&dlmlock->l_reference, "osc_lock", oscl); + oscl->ols_has_ref = 1; + + LASSERT(oscl->ols_dlmlock == NULL); + oscl->ols_dlmlock = dlmlock; + + /* This may be a matched lock for glimpse request, do not hold + * lock reference in that case. */ + if (!oscl->ols_glimpse) { + /* hold a refc for non glimpse lock which will + * be released in osc_lock_cancel() */ + lustre_handle_copy(&oscl->ols_handle, lockh); + ldlm_lock_addref(lockh, oscl->ols_einfo.ei_mode); + oscl->ols_hold = 1; + } + + /* Lock must have been granted. */ + lock_res_and_lock(dlmlock); + if (ldlm_is_granted(dlmlock)) { + struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; + + /* extend the lock extent, otherwise it will have problem when + * we decide whether to grant a lockless lock. 
*/ + descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); + descr->cld_start = cl_index(descr->cld_obj, 0); + descr->cld_end = CL_PAGE_EOF; + + /* no lvb update for matched lock */ + if (!ldlm_is_lvb_cached(dlmlock)) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + LASSERT(osc == dlmlock->l_ast_data); + mdc_lock_lvb_update(env, osc, dlmlock, NULL); + ldlm_set_lvb_cached(dlmlock); + } + } + unlock_res_and_lock(dlmlock); + + LASSERT(oscl->ols_state != OLS_GRANTED); + oscl->ols_state = OLS_GRANTED; + EXIT; +} + +/** + * Lock upcall function that is executed either when a reply to ENQUEUE rpc is + * received from a server, or after mdc_enqueue_send() matched a local DLM + * lock. + */ +static int mdc_lock_upcall(void *cookie, struct lustre_handle *lockh, + int errcode) +{ + struct osc_lock *oscl = cookie; + struct cl_lock_slice *slice = &oscl->ols_cl; + struct lu_env *env; + int rc; + + ENTRY; + + env = cl_env_percpu_get(); + /* should never happen, similar to osc_ldlm_blocking_ast(). */ + LASSERT(!IS_ERR(env)); + + rc = ldlm_error2errno(errcode); + if (oscl->ols_state == OLS_ENQUEUED) { + oscl->ols_state = OLS_UPCALL_RECEIVED; + } else if (oscl->ols_state == OLS_CANCELLED) { + rc = -EIO; + } else { + CERROR("Impossible state: %d\n", oscl->ols_state); + LBUG(); + } + + CDEBUG(D_INODE, "rc %d, err %d\n", rc, errcode); + if (rc == 0) + mdc_lock_granted(env, oscl, lockh); + + /* Error handling, some errors are tolerable. */ + if (oscl->ols_glimpse && rc == -ENAVAIL) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + mdc_lock_lvb_update(env, cl2osc(slice->cls_obj), + NULL, &oscl->ols_lvb); + /* Hide the error. */ + rc = 0; + } + + if (oscl->ols_owner != NULL) + cl_sync_io_note(env, oscl->ols_owner, rc); + cl_env_percpu_put(env); + + RETURN(rc); +} + +/* This is needed only for old servers (before 2.14) support */ +int mdc_fill_lvb(struct req_capsule *pill, struct ost_lvb *lvb) +{ + struct mdt_body *body; + + /* get LVB data from mdt_body otherwise */ + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (!body) + RETURN(-EPROTO); + + if (!(body->mbo_valid & OBD_MD_DOM_SIZE)) + RETURN(-EPROTO); + + mdc_body2lvb(body, lvb); + RETURN(0); +} + +int mdc_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + osc_enqueue_upcall_f upcall, void *cookie, + struct lustre_handle *lockh, enum ldlm_mode mode, + __u64 *flags, int errcode) +{ + struct osc_lock *ols = cookie; + bool glimpse = *flags & LDLM_FL_HAS_INTENT; + int rc = 0; + + ENTRY; + + /* needed only for glimpse from an old server (< 2.14) */ + if (glimpse && !exp_connect_dom_lvb(exp)) + rc = mdc_fill_lvb(&req->rq_pill, &ols->ols_lvb); + + if (glimpse && errcode == ELDLM_LOCK_ABORTED) { + struct ldlm_reply *rep; + + rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (likely(rep)) { + rep->lock_policy_res2 = + ptlrpc_status_ntoh(rep->lock_policy_res2); + if (rep->lock_policy_res2) + errcode = rep->lock_policy_res2; + } else { + rc = -EPROTO; + } + *flags |= LDLM_FL_LVB_READY; + } else if (errcode == ELDLM_OK) { + struct ldlm_lock *lock; + + /* Callers have references, should be valid always */ + lock = ldlm_handle2lock(lockh); + + /* At this point ols_lvb must be filled with correct LVB either + * by mdc_fill_lvb() above or by ldlm_cli_enqueue_fini(). + * DoM uses l_ost_lvb to store LVB data, so copy it here from + * just updated ols_lvb. 
+ */ + lock_res_and_lock(lock); + memcpy(&lock->l_ost_lvb, &ols->ols_lvb, + sizeof(lock->l_ost_lvb)); + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + *flags |= LDLM_FL_LVB_READY; + } + + /* Call the update callback. */ + rc = (*upcall)(cookie, lockh, rc < 0 ? rc : errcode); + + /* release the reference taken in ldlm_cli_enqueue() */ + if (errcode == ELDLM_LOCK_MATCHED) + errcode = ELDLM_OK; + if (errcode == ELDLM_OK && lustre_handle_is_used(lockh)) + ldlm_lock_decref(lockh, mode); + + RETURN(rc); +} + +int mdc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *args, int rc) +{ + struct osc_enqueue_args *aa = args; + struct ldlm_lock *lock; + struct lustre_handle *lockh = &aa->oa_lockh; + enum ldlm_mode mode = aa->oa_mode; + struct ldlm_enqueue_info einfo = { + .ei_type = aa->oa_type, + .ei_mode = mode, + }; + + ENTRY; + + LASSERT(!aa->oa_speculative); + + /* ldlm_cli_enqueue is holding a reference on the lock, so it must + * be valid. */ + lock = ldlm_handle2lock(lockh); + LASSERTF(lock != NULL, + "lockh %#llx, req %p, aa %p - client evicted?\n", + lockh->cookie, req, aa); + + /* Take an additional reference so that a blocking AST that + * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed + * to arrive after an upcall has been executed by + * mdc_enqueue_fini(). + */ + ldlm_lock_addref(lockh, mode); + + /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2); + + /* Let the CP AST grant the lock first. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); + + /* Complete obtaining the lock procedure. */ + rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, &einfo, 1, aa->oa_flags, + aa->oa_lvb, aa->oa_lvb ? + sizeof(*aa->oa_lvb) : 0, lockh, rc, true); + /* Complete mdc stuff. */ + rc = mdc_enqueue_fini(aa->oa_exp, req, aa->oa_upcall, aa->oa_cookie, + lockh, mode, aa->oa_flags, rc); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); + + ldlm_lock_decref(lockh, mode); + LDLM_LOCK_PUT(lock); + RETURN(rc); +} + +/* When enqueuing asynchronously, locks are not ordered, so we can obtain a lock + * from the 2nd OSC before a lock from the 1st one. This does not deadlock with + * other synchronous requests; however, keeping some locks while trying to obtain + * others may take a considerable amount of time in case of OST failure, and + * when other sync requests do not get a lock released by a client, the client + * is excluded from the cluster -- such scenarios make life difficult, so + * release locks just after they are obtained. 
*/ +int mdc_enqueue_send(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, __u64 *flags, + union ldlm_policy_data *policy, struct ost_lvb *lvb, + osc_enqueue_upcall_f upcall, void *cookie, + struct ldlm_enqueue_info *einfo, int async) +{ + struct obd_device *obd = exp->exp_obd; + struct lustre_handle lockh = { 0 }; + struct ptlrpc_request *req = NULL; + struct ldlm_intent *lit; + enum ldlm_mode mode; + bool glimpse = *flags & LDLM_FL_HAS_INTENT; + __u64 match_flags = *flags; + LIST_HEAD(cancels); + int rc, count; + int lvb_size; + bool compat_glimpse = glimpse && !exp_connect_dom_lvb(exp); + + ENTRY; + + mode = einfo->ei_mode; + if (einfo->ei_mode == LCK_PR) + mode |= LCK_PW; + + match_flags |= LDLM_FL_LVB_READY; + if (glimpse) + match_flags |= LDLM_FL_BLOCK_GRANTED; + mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id, + einfo->ei_type, policy, mode, &lockh); + if (mode) { + struct ldlm_lock *matched; + + if (*flags & LDLM_FL_TEST_LOCK) + RETURN(ELDLM_OK); + + matched = ldlm_handle2lock(&lockh); + + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GLIMPSE_DDOS)) + ldlm_set_kms_ignore(matched); + + if (mdc_set_dom_lock_data(matched, einfo->ei_cbdata)) { + *flags |= LDLM_FL_LVB_READY; + + /* We already have a lock, and it's referenced. */ + (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED); + + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + RETURN(ELDLM_OK); + } + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + } + + if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK)) + RETURN(-ENOLCK); + + /* Glimpse is intent on old server */ + req = ptlrpc_request_alloc(class_exp2cliimp(exp), compat_glimpse ? + &RQF_LDLM_INTENT : &RQF_LDLM_ENQUEUE); + if (req == NULL) + RETURN(-ENOMEM); + + /* For WRITE lock cancel other locks on resource early if any */ + if (einfo->ei_mode & LCK_PW) + count = mdc_resource_get_unused_res(exp, res_id, &cancels, + einfo->ei_mode, + MDS_INODELOCK_DOM); + else + count = 0; + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (compat_glimpse) { + /* pack the glimpse intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = IT_GLIMPSE; + } + + /* users of mdc_enqueue() can pass this flag for ldlm_lock_match() */ + *flags &= ~LDLM_FL_BLOCK_GRANTED; + + if (compat_glimpse) { + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, 0); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + lvb_size = 0; + } else { + lvb_size = sizeof(*lvb); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + lvb_size); + } + ptlrpc_request_set_replen(req); + + rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb, + lvb_size, LVB_T_OST, &lockh, async); + if (async) { + if (!rc) { + struct osc_enqueue_args *aa; + + aa = ptlrpc_req_async_args(aa, req); + aa->oa_exp = exp; + aa->oa_mode = einfo->ei_mode; + aa->oa_type = einfo->ei_type; + lustre_handle_copy(&aa->oa_lockh, &lockh); + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_speculative = false; + aa->oa_flags = flags; + aa->oa_lvb = compat_glimpse ? NULL : lvb; + + req->rq_interpret_reply = mdc_enqueue_interpret; + ptlrpcd_add_req(req); + } else { + ptlrpc_req_finished(req); + } + RETURN(rc); + } + + rc = mdc_enqueue_fini(exp, req, upcall, cookie, &lockh, einfo->ei_mode, + flags, rc); + ptlrpc_req_finished(req); + RETURN(rc); +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() method for osc + * layer. 
This initiates ldlm enqueue: + * + * - cancels conflicting locks early (osc_lock_enqueue_wait()); + * + * - calls osc_enqueue_base() to do actual enqueue. + * + * osc_enqueue_base() is supplied with an upcall function that is executed + * when lock is received either after a local cached ldlm lock is matched, or + * when a reply from the server is received. + * + * This function does not wait for the network communication to complete. + */ +static int mdc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, struct cl_sync_io *anchor) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct osc_lock *oscl = cl2osc_lock(slice); + struct cl_lock *lock = slice->cls_lock; + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + osc_enqueue_upcall_f upcall = mdc_lock_upcall; + void *cookie = (void *)oscl; + bool async = false; + int result; + + ENTRY; + + LASSERTF(ergo(oscl->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), + "lock = %p, ols = %p\n", lock, oscl); + + if (oscl->ols_state == OLS_GRANTED) + RETURN(0); + + /* Lockahead is not supported on MDT yet */ + if (oscl->ols_flags & LDLM_FL_NO_EXPANSION) { + result = -EOPNOTSUPP; + RETURN(result); + } + + if (oscl->ols_flags & LDLM_FL_TEST_LOCK) + GOTO(enqueue_base, 0); + + if (oscl->ols_glimpse) { + LASSERT(equi(oscl->ols_speculative, anchor == NULL)); + async = true; + GOTO(enqueue_base, 0); + } + + result = osc_lock_enqueue_wait(env, osc, oscl); + if (result < 0) + GOTO(out, result); + + /* we can grant lockless lock right after all conflicting locks + * are canceled. */ + if (osc_lock_is_lockless(oscl)) { + oscl->ols_state = OLS_GRANTED; + oio->oi_lockless = 1; + RETURN(0); + } + +enqueue_base: + oscl->ols_state = OLS_ENQUEUED; + if (anchor != NULL) { + atomic_inc(&anchor->csi_sync_nr); + oscl->ols_owner = anchor; + } + + /** + * DLM lock's ast data must be osc_object; + * DLM's enqueue callback set to osc_lock_upcall() with cookie as + * osc_lock. 
+ */ + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); + mdc_lock_build_policy(env, lock, policy); + LASSERT(!oscl->ols_speculative); + result = mdc_enqueue_send(env, osc_export(osc), resname, + &oscl->ols_flags, policy, &oscl->ols_lvb, + upcall, cookie, &oscl->ols_einfo, async); + if (result == 0) { + if (osc_lock_is_lockless(oscl)) { + oio->oi_lockless = 1; + } else if (!async) { + LASSERT(oscl->ols_state == OLS_GRANTED); + LASSERT(oscl->ols_hold); + LASSERT(oscl->ols_dlmlock != NULL); + } + } +out: + if (result < 0) { + oscl->ols_state = OLS_CANCELLED; + osc_lock_wake_waiters(env, osc, oscl); + + if (anchor != NULL) + cl_sync_io_note(env, anchor, result); + } + RETURN(result); +} + +static const struct cl_lock_operations mdc_lock_lockless_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = mdc_lock_enqueue, + .clo_cancel = mdc_lock_lockless_cancel, + .clo_print = osc_lock_print +}; + +static const struct cl_lock_operations mdc_lock_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = mdc_lock_enqueue, + .clo_cancel = osc_lock_cancel, + .clo_print = osc_lock_print, +}; + +int mdc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct osc_lock *ols; + __u32 enqflags = lock->cll_descr.cld_enq_flags; + __u64 flags = osc_enq2ldlm_flags(enqflags); + + ENTRY; + + /* Ignore AGL for Data-on-MDT, stat returns size data */ + if ((enqflags & CEF_SPECULATIVE) != 0) + RETURN(0); + + OBD_SLAB_ALLOC_PTR_GFP(ols, osc_lock_kmem, GFP_NOFS); + if (unlikely(ols == NULL)) + RETURN(-ENOMEM); + + ols->ols_state = OLS_NEW; + spin_lock_init(&ols->ols_lock); + INIT_LIST_HEAD(&ols->ols_waiting_list); + INIT_LIST_HEAD(&ols->ols_wait_entry); + INIT_LIST_HEAD(&ols->ols_nextlock_oscobj); + ols->ols_lockless_ops = &mdc_lock_lockless_ops; + + ols->ols_flags = flags; + ols->ols_speculative = !!(enqflags & CEF_SPECULATIVE); + if (lock->cll_descr.cld_mode == CLM_GROUP) + ols->ols_flags |= LDLM_FL_ATOMIC_CB; + + if (ols->ols_flags & LDLM_FL_HAS_INTENT) { + ols->ols_flags |= LDLM_FL_BLOCK_GRANTED; + ols->ols_glimpse = 1; + } + mdc_lock_build_einfo(env, lock, cl2osc(obj), &ols->ols_einfo); + + cl_lock_slice_add(lock, &ols->ols_cl, obj, &mdc_lock_ops); + + if (!(enqflags & CEF_MUST)) + osc_lock_to_lockless(env, ols, (enqflags & CEF_NEVER)); + + if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) + osc_lock_set_writer(env, io, obj, ols); + + LDLM_DEBUG_NOLOCK("lock %p, mdc lock %p, flags %llx\n", + lock, ols, ols->ols_flags); + RETURN(0); +} + +/** + * IO operations. + * + * An implementation of cl_io_operations specific methods for MDC layer. 
+ * + */ +static int mdc_async_upcall(void *a, int rc) +{ + struct osc_async_cbargs *args = a; + + args->opc_rc = rc; + complete(&args->opc_sync); + return 0; +} + +static int mdc_get_lock_handle(const struct lu_env *env, struct osc_object *osc, + pgoff_t index, struct lustre_handle *lh) +{ + struct ldlm_lock *lock; + + /* find DOM lock protecting object */ + lock = mdc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK | + OSC_DAP_FL_CANCELING); + if (lock == NULL) { + struct ldlm_resource *res; + struct ldlm_res_id *resname; + + resname = &osc_env_info(env)->oti_resname; + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); + res = ldlm_resource_get(osc_export(osc)->exp_obd->obd_namespace, + NULL, resname, LDLM_IBITS, 0); + if (IS_ERR(res)) + CERROR("No lock resource for "DFID"\n", + PFID(lu_object_fid(osc2lu(osc)))); + else + ldlm_resource_dump(D_ERROR, res); + libcfs_debug_dumpstack(NULL); + return -ENOENT; + } else { + *lh = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } + return 0; +} + +static int mdc_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + unsigned int ia_avalid = io->u.ci_setattr.sa_avalid; + enum op_xvalid ia_xvalid = io->u.ci_setattr.sa_xvalid; + int rc; + + /* silently ignore non-truncate setattr for Data-on-MDT object */ + if (cl_io_is_trunc(io)) { + /* truncate cache dirty pages first */ + rc = osc_cache_truncate_start(env, cl2osc(obj), size, + &oio->oi_trunc); + if (rc < 0) + return rc; + } else if (cl_io_is_fallocate(io) && + io->u.ci_setattr.sa_falloc_mode & FALLOC_FL_PUNCH_HOLE) { + rc = osc_punch_start(env, io, obj); + if (rc < 0) + return rc; + } + + if (oio->oi_lockless == 0) { + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + if (rc == 0) { + struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; + unsigned int cl_valid = 0; + + if (ia_avalid & ATTR_SIZE) { + attr->cat_size = size; + attr->cat_kms = size; + cl_valid = (CAT_SIZE | CAT_KMS); + } + if (ia_avalid & ATTR_MTIME_SET) { + attr->cat_mtime = lvb->lvb_mtime; + cl_valid |= CAT_MTIME; + } + if (ia_avalid & ATTR_ATIME_SET) { + attr->cat_atime = lvb->lvb_atime; + cl_valid |= CAT_ATIME; + } + if (ia_xvalid & OP_XVALID_CTIME_SET) { + attr->cat_ctime = lvb->lvb_ctime; + cl_valid |= CAT_CTIME; + } + rc = cl_object_attr_update(env, obj, attr, cl_valid); + } + cl_object_attr_unlock(obj); + if (rc < 0) + return rc; + } + + if (!(ia_avalid & ATTR_SIZE) && !cl_io_is_fallocate(io)) + return 0; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_mtime = attr->cat_mtime; + oa->o_atime = attr->cat_atime; + oa->o_ctime = attr->cat_ctime; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME | + OBD_MD_FLCTIME | OBD_MD_FLMTIME | OBD_MD_FLSIZE | + OBD_MD_FLBLOCKS; + + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } else { + rc = mdc_get_lock_handle(env, cl2osc(obj), CL_PAGE_EOF, + &oa->o_handle); + if (!rc) + oa->o_valid |= OBD_MD_FLHANDLE; + } + + init_completion(&cbargs->opc_sync); + if (cl_io_is_fallocate(io)) { + int falloc_mode = io->u.ci_setattr.sa_falloc_mode; + + oa->o_size = io->u.ci_setattr.sa_falloc_offset; + oa->o_blocks 
= io->u.ci_setattr.sa_falloc_end; + rc = osc_fallocate_base(osc_export(cl2osc(obj)), oa, + mdc_async_upcall, cbargs, falloc_mode); + } else { + oa->o_size = size; + oa->o_blocks = OBD_OBJECT_EOF; + rc = osc_punch_send(osc_export(cl2osc(obj)), oa, + mdc_async_upcall, cbargs); + } + cbargs->opc_rpc_sent = rc == 0; + return rc; +} + +static int mdc_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + struct osc_object *osc = cl2osc(ios->cis_obj); + struct osc_io *oio = cl2osc_io(env, ios); + struct ldlm_lock *dlmlock; + + ENTRY; + + dlmlock = mdc_dlmlock_at_pgoff(env, osc, start, 0); + if (dlmlock == NULL) + RETURN(-ENODATA); + + oio->oi_is_readahead = 1; + if (dlmlock->l_req_mode != LCK_PR) { + struct lustre_handle lockh; + + ldlm_lock2handle(dlmlock, &lockh); + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, dlmlock->l_req_mode); + } + + ra->cra_rpc_pages = osc_cli(osc)->cl_max_pages_per_rpc; + ra->cra_end_idx = CL_PAGE_EOF; + ra->cra_release = osc_read_ahead_release; + ra->cra_dlmlock = dlmlock; + ra->cra_oio = oio; + + RETURN(0); +} + +int mdc_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct cl_fsync_io *fio = &io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + struct osc_object *osc = cl2osc(obj); + int result = 0; + + ENTRY; + + /* an MDC lock always covers the whole object, so sync the whole + * possible range regardless of the supplied start/end values. + */ + result = osc_cache_writeback_range(env, osc, 0, CL_PAGE_EOF, 0, + fio->fi_mode == CL_FSYNC_DISCARD); + if (result > 0) { + fio->fi_nr_written += result; + result = 0; + } + if (fio->fi_mode == CL_FSYNC_ALL) { + int rc; + + rc = osc_cache_wait_range(env, osc, 0, CL_PAGE_EOF); + if (result == 0) + result = rc; + /* Use the OSC sync code because it is asynchronous. + * It is to be added into MDC to avoid using + * OST_SYNC at both MDC and MDT. 
+ */ + rc = osc_fsync_ost(env, osc, fio); + if (result == 0) + result = rc; + } + + RETURN(result); +} + +struct mdc_data_version_args { + struct osc_io *dva_oio; +}; + +static int +mdc_data_version_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *args, int rc) +{ + struct mdc_data_version_args *dva = args; + struct osc_io *oio = dva->dva_oio; + const struct mdt_body *body; + + ENTRY; + if (rc < 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + /* Prepare OBDO from mdt_body for CLIO */ + oio->oi_oa.o_valid = body->mbo_valid; + oio->oi_oa.o_flags = body->mbo_flags; + oio->oi_oa.o_data_version = body->mbo_version; + oio->oi_oa.o_layout_version = body->mbo_layout_gen; + EXIT; +out: + oio->oi_cbarg.opc_rc = rc; + complete(&oio->oi_cbarg.opc_sync); + return 0; +} + +static int mdc_io_data_version_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct osc_object *obj = cl2osc(slice->cis_obj); + struct obd_export *exp = osc_export(obj); + struct ptlrpc_request *req; + struct mdt_body *body; + struct mdc_data_version_args *dva; + int rc; + + ENTRY; + + memset(&oio->oi_oa, 0, sizeof(oio->oi_oa)); + oio->oi_oa.o_oi.oi_fid = *lu_object_fid(osc2lu(obj)); + oio->oi_oa.o_valid = OBD_MD_FLID; + + init_completion(&cbargs->opc_sync); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + body->mbo_fid1 = *lu_object_fid(osc2lu(obj)); + body->mbo_valid = OBD_MD_FLID; + /* Indicate that data version is needed */ + body->mbo_valid |= OBD_MD_FLDATAVERSION; + body->mbo_flags = 0; + + if (dv->dv_flags & (LL_DV_RD_FLUSH | LL_DV_WR_FLUSH)) { + body->mbo_valid |= OBD_MD_FLFLAGS; + body->mbo_flags |= OBD_FL_SRVLOCK; + if (dv->dv_flags & LL_DV_WR_FLUSH) + body->mbo_flags |= OBD_FL_FLUSH; + } + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, 0); + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = mdc_data_version_interpret; + dva = ptlrpc_req_async_args(dva, req); + dva->dva_oio = oio; + + ptlrpcd_add_req(req); + + RETURN(0); +} + +static void mdc_io_data_version_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + + ENTRY; + wait_for_completion(&cbargs->opc_sync); + + if (cbargs->opc_rc != 0) { + slice->cis_io->ci_result = cbargs->opc_rc; + } else { + slice->cis_io->ci_result = 0; + if (!(oio->oi_oa.o_valid & + (OBD_MD_LAYOUT_VERSION | OBD_MD_FLDATAVERSION))) + slice->cis_io->ci_result = -ENOTSUPP; + + if (oio->oi_oa.o_valid & OBD_MD_LAYOUT_VERSION) + dv->dv_layout_version = oio->oi_oa.o_layout_version; + if (oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION) + dv->dv_data_version = oio->oi_oa.o_data_version; + } + + EXIT; +} + +static const struct cl_io_operations mdc_io_ops = { + .op = { + [CIT_READ] = { + .cio_iter_init = osc_io_iter_init, 
+ .cio_iter_fini = osc_io_rw_iter_fini, + .cio_start = osc_io_read_start, + }, + [CIT_WRITE] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_rw_iter_fini, + .cio_start = osc_io_write_start, + .cio_end = osc_io_end, + }, + [CIT_SETATTR] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = mdc_io_setattr_start, + .cio_end = osc_io_setattr_end, + }, + [CIT_DATA_VERSION] = { + .cio_start = mdc_io_data_version_start, + .cio_end = mdc_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_fault_start, + .cio_end = osc_io_end, + }, + [CIT_FSYNC] = { + .cio_start = mdc_io_fsync_start, + .cio_end = osc_io_fsync_end, + }, + [CIT_LSEEK] = { + .cio_start = osc_io_lseek_start, + .cio_end = osc_io_lseek_end, + }, + }, + .cio_read_ahead = mdc_io_read_ahead, + .cio_lru_reserve = osc_io_lru_reserve, + .cio_submit = osc_io_submit, + .cio_commit_async = osc_io_commit_async, + .cio_extent_release = osc_io_extent_release, +}; + +int mdc_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + + CL_IO_SLICE_CLEAN(oio, oi_cl); + cl_io_slice_add(io, &oio->oi_cl, obj, &mdc_io_ops); + return 0; +} + +static void mdc_build_res_name(struct osc_object *osc, + struct ldlm_res_id *resname) +{ + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for MDC + * layer. MDC is responsible for struct obdo::o_id and struct obdo::o_seq + * fields. + */ +static void mdc_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + u64 flags = attr->cra_flags; + + /* Copy object FID to cl_attr */ + attr->cra_oa->o_oi.oi_fid = *lu_object_fid(&obj->co_lu); + + if (flags & OBD_MD_FLGROUP) + attr->cra_oa->o_valid |= OBD_MD_FLGROUP; + + if (flags & OBD_MD_FLID) + attr->cra_oa->o_valid |= OBD_MD_FLID; + + if (flags & OBD_MD_FLHANDLE) { + struct osc_page *opg; + + opg = osc_cl_page_osc(attr->cra_page, cl2osc(obj)); + if (!opg->ops_srvlock) { + int rc; + + rc = mdc_get_lock_handle(env, cl2osc(obj), + osc_index(opg), + &attr->cra_oa->o_handle); + if (rc) { + CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page, + "uncovered page!\n"); + LBUG(); + } else { + attr->cra_oa->o_valid |= OBD_MD_FLHANDLE; + } + } + } +} + +static int mdc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + if (OST_LVB_IS_ERR(oinfo->loi_lvb.lvb_blocks)) + return OST_LVB_GET_ERR(oinfo->loi_lvb.lvb_blocks); + + return osc_attr_get(env, obj, attr); +} + +static int mdc_object_ast_clear(struct ldlm_lock *lock, void *data) +{ + struct osc_object *osc = (struct osc_object *)data; + struct ost_lvb *lvb = &lock->l_ost_lvb; + struct lov_oinfo *oinfo; + ENTRY; + + if (lock->l_ast_data == data) { + lock->l_ast_data = NULL; + + LASSERT(osc != NULL); + LASSERT(osc->oo_oinfo != NULL); + LASSERT(lvb != NULL); + + /* Updates lvb in lock by the cached oinfo */ + oinfo = osc->oo_oinfo; + + LDLM_DEBUG(lock, "update lock size %llu blocks %llu [cma]time: " + "%llu %llu %llu by oinfo size %llu blocks %llu " + "[cma]time %llu %llu %llu", lvb->lvb_size, + lvb->lvb_blocks, lvb->lvb_ctime, lvb->lvb_mtime, + lvb->lvb_atime, oinfo->loi_lvb.lvb_size, + oinfo->loi_lvb.lvb_blocks, oinfo->loi_lvb.lvb_ctime, + oinfo->loi_lvb.lvb_mtime, oinfo->loi_lvb.lvb_atime); + LASSERT(oinfo->loi_lvb.lvb_size >= 
oinfo->loi_kms); + + cl_object_attr_lock(&osc->oo_cl); + memcpy(lvb, &oinfo->loi_lvb, sizeof(oinfo->loi_lvb)); + cl_object_attr_unlock(&osc->oo_cl); + ldlm_clear_lvb_cached(lock); + } + RETURN(LDLM_ITER_CONTINUE); +} + +int mdc_object_prune(const struct lu_env *env, struct cl_object *obj) +{ + struct osc_object *osc = cl2osc(obj); + struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; + + /* DLM locks don't hold a reference of osc_object so we have to + * clear it before the object is being destroyed. */ + osc_build_res_name(osc, resname); + ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, + mdc_object_ast_clear, osc); + return 0; +} + +static int mdc_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + /* if lock cancel is initiated from llite then it is combined + * lock with DOM bit and it may have no l_ast_data initialized yet, + * so init it here with given osc_object. + */ + mdc_set_dom_lock_data(lock, cl2osc(obj)); + RETURN(mdc_dlm_canceling(env, lock)); +} + +static const struct cl_object_operations mdc_ops = { + .coo_page_init = osc_page_init, + .coo_lock_init = mdc_lock_init, + .coo_io_init = mdc_io_init, + .coo_attr_get = mdc_attr_get, + .coo_attr_update = osc_attr_update, + .coo_glimpse = osc_object_glimpse, + .coo_req_attr_set = mdc_req_attr_set, + .coo_prune = mdc_object_prune, + .coo_object_flush = mdc_object_flush +}; + +static const struct osc_object_operations mdc_object_ops = { + .oto_build_res_name = mdc_build_res_name, + .oto_dlmlock_at_pgoff = mdc_dlmlock_at_pgoff, +}; + +static int mdc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct osc_object *osc = lu2osc(obj); + + if (osc->oo_initialized) + return 0; + + osc->oo_initialized = true; + + return osc_object_init(env, obj, conf); +} + +static void mdc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + osc_object_free(env, obj); +} + +static const struct lu_object_operations mdc_lu_obj_ops = { + .loo_object_init = mdc_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = mdc_object_free, + .loo_object_print = osc_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *mdc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct osc_object *osc; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, GFP_NOFS); + if (osc != NULL) { + obj = osc2lu(osc); + lu_object_init(obj, NULL, dev); + osc->oo_cl.co_ops = &mdc_ops; + obj->lo_ops = &mdc_lu_obj_ops; + osc->oo_obj_ops = &mdc_object_ops; + osc->oo_initialized = false; + } else { + obj = NULL; + } + return obj; +} + +static int mdc_process_config(const struct lu_env *env, struct lu_device *d, + struct lustre_cfg *cfg) +{ + size_t count = class_modify_config(cfg, PARAM_MDC, + &d->ld_obd->obd_kset.kobj); + return count > 0 ? 
0 : count; +} + +const struct lu_device_operations mdc_lu_ops = { + .ldo_object_alloc = mdc_object_alloc, + .ldo_process_config = mdc_process_config, + .ldo_recovery_complete = NULL, +}; + +static struct lu_device *mdc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct osc_device *oc; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(oc); + if (oc == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&oc->osc_cl, t); + d = osc2lu_dev(oc); + d->ld_ops = &mdc_lu_ops; + + /* Setup MDC OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + if (obd == NULL) + RETURN(ERR_PTR(-ENODEV)); + + rc = mdc_setup(obd, cfg); + if (rc < 0) { + osc_device_free(env, d); + RETURN(ERR_PTR(rc)); + } + oc->osc_exp = obd->obd_self_export; + oc->osc_stats.os_init = ktime_get_real(); + RETURN(d); +} + +static const struct lu_device_type_operations mdc_device_type_ops = { + .ldto_device_alloc = mdc_device_alloc, + .ldto_device_free = osc_device_free, + .ldto_device_init = osc_device_init, + .ldto_device_fini = osc_device_fini +}; + +struct lu_device_type mdc_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_MDC_NAME, + .ldt_ops = &mdc_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h b/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h new file mode 100644 index 0000000000000..20a81bf4d294a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h @@ -0,0 +1,198 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _MDC_INTERNAL_H +#define _MDC_INTERNAL_H + +#include + +int mdc_tunables_init(struct obd_device *obd); + +void mdc_pack_body(struct req_capsule *pill, const struct lu_fid *fid, + u64 valid, size_t ea_size, u32 suppgid, u32 flags); +void mdc_swap_layouts_pack(struct req_capsule *pill, + struct md_op_data *op_data); +void mdc_readdir_pack(struct req_capsule *pill, __u64 pgoff, size_t size, + const struct lu_fid *fid); +void mdc_getattr_pack(struct req_capsule *pill, __u64 valid, __u32 flags, + struct md_op_data *data, size_t ea_size); +void mdc_setattr_pack(struct req_capsule *pill, struct md_op_data *op_data, + void *ea, size_t ealen); +void mdc_create_pack(struct req_capsule *pill, struct md_op_data *op_data, + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, kernel_cap_t capability, u64 rdev); +void mdc_open_pack(struct req_capsule *pill, struct md_op_data *op_data, + umode_t mode, __u64 rdev, __u64 flags, + const void *data, size_t datalen); +void mdc_file_secctx_pack(struct req_capsule *pill, + const char *secctx_name, + const void *secctx, size_t secctx_size); +void mdc_file_encctx_pack(struct req_capsule *pill, + const void *encctx, size_t encctx_size); +void mdc_file_sepol_pack(struct req_capsule *pill); + +void mdc_unlink_pack(struct req_capsule *pill, struct md_op_data *op_data); +void mdc_link_pack(struct req_capsule *pill, struct md_op_data *op_data); +void mdc_rename_pack(struct req_capsule *pill, struct md_op_data *op_data, + const char *old, size_t oldlen, + const char *new, size_t newlen); +void mdc_migrate_pack(struct req_capsule *pill, struct md_op_data *op_data, + const char *name, size_t namelen); +void mdc_close_pack(struct req_capsule *pill, struct md_op_data *op_data); + +/* mdc/mdc_locks.c */ +int mdc_set_lock_data(struct obd_export *exp, + const struct lustre_handle *lockh, + void *data, __u64 *bits); + +int mdc_null_inode(struct obd_export *exp, const struct lu_fid *fid); + +int mdc_intent_lock(struct obd_export *exp, + struct md_op_data *op_data, + struct lookup_intent *it, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct md_op_data *op_data, + struct lustre_handle *lockh, __u64 extra_lock_flags); +int mdc_resource_get_unused_res(struct obd_export *exp, + struct ldlm_res_id *res_id, + struct list_head *cancels, + enum ldlm_mode mode, __u64 bits); +int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, + struct list_head *cancels, enum ldlm_mode mode, + __u64 bits); +/* mdc/mdc_request.c */ +int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data); +int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg); + +struct obd_client_handle; + +int mdc_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it); + +void mdc_commit_open(struct ptlrpc_request *req); +void mdc_replay_open(struct ptlrpc_request *req); + +int mdc_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, size_t datalen, + umode_t mode, uid_t uid, gid_t gid, + kernel_cap_t capability, __u64 rdev, + struct ptlrpc_request **request); +int mdc_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request); +int mdc_rename(struct obd_export *exp, 
struct md_op_data *op_data, + const char *old, size_t oldlen, const char *new, size_t newlen, + struct ptlrpc_request **request); +int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, struct ptlrpc_request **request); +int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request); +int mdc_file_resync(struct obd_export *exp, struct md_op_data *data); +int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, enum ldlm_mode mode, + enum ldlm_cancel_flags flags, void *opaque); + +int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits); + +int mdc_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo); + +enum ldlm_mode mdc_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, struct lustre_handle *lockh); + + +#define MDC_CHANGELOG_DEV_COUNT LMV_MAX_STRIPE_COUNT +#define MDC_CHANGELOG_DEV_NAME "changelog" +extern struct class *mdc_changelog_class; +extern dev_t mdc_changelog_dev; +extern struct idr mdc_changelog_minor_idr; + +int mdc_changelog_cdev_init(struct obd_device *obd); + +void mdc_changelog_cdev_finish(struct obd_device *obd); + +static inline int mdc_prep_elc_req(struct obd_export *exp, + struct ptlrpc_request *req, int opc, + struct list_head *cancels, int count) +{ + return ldlm_prep_elc_req(exp, req, LUSTRE_MDS_VERSION, opc, 0, cancels, + count); +} + +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +int mdc_unpack_acl(struct req_capsule *pill, struct lustre_md *md); +#else +static inline +int mdc_unpack_acl(struct req_capsule *pill, struct lustre_md *md) +{ + return 0; +} +#endif + +static inline void mdc_body2lvb(struct mdt_body *body, struct ost_lvb *lvb) +{ + LASSERT(body->mbo_valid & OBD_MD_DOM_SIZE); + lvb->lvb_mtime = body->mbo_mtime; + lvb->lvb_atime = body->mbo_atime; + lvb->lvb_ctime = body->mbo_ctime; + lvb->lvb_blocks = body->mbo_dom_blocks; + lvb->lvb_size = body->mbo_dom_size; +} + +static inline unsigned long hash_x_index(__u64 hash, int hash64) +{ + if (BITS_PER_LONG == 32 && hash64) + hash >>= 32; + /* save hash 0 with hash 1 */ + return ~0UL - (hash + !hash); +} + +/* mdc_dev.c */ +extern struct lu_device_type mdc_device_type; +int mdc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, int flag); +int mdc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data); +int mdc_fill_lvb(struct req_capsule *pill, struct ost_lvb *lvb); + +/* the minimum inline repsize should be PAGE_SIZE at least */ +#define MDC_DOM_DEF_INLINE_REPSIZE max(8192UL, PAGE_SIZE) +#define MDC_DOM_MAX_INLINE_REPSIZE XATTR_SIZE_MAX + +#endif diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c new file mode 100644 index 0000000000000..5d571d3c76e4c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c @@ -0,0 +1,674 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_MDC +#include +#include + +#include +#include +#include +#include +#include "mdc_internal.h" + +static void set_mrc_cr_flags(struct mdt_rec_create *mrc, __u64 flags) +{ + mrc->cr_flags_l = (__u32)(flags & 0xFFFFFFFFUll); + mrc->cr_flags_h = (__u32)(flags >> 32); +} + +static void __mdc_pack_body(struct mdt_body *b, __u32 suppgid) +{ + LASSERT(b); + + b->mbo_suppgid = suppgid; + b->mbo_uid = from_kuid(&init_user_ns, current_uid()); + b->mbo_gid = from_kgid(&init_user_ns, current_gid()); + b->mbo_fsuid = from_kuid(&init_user_ns, current_fsuid()); + b->mbo_fsgid = from_kgid(&init_user_ns, current_fsgid()); + b->mbo_capability = current_cap().cap[0]; +} + +void mdc_swap_layouts_pack(struct req_capsule *pill, + struct md_op_data *op_data) +{ + struct mdt_body *b = req_capsule_client_get(pill, &RMF_MDT_BODY); + + __mdc_pack_body(b, op_data->op_suppgids[0]); + b->mbo_fid1 = op_data->op_fid1; + b->mbo_fid2 = op_data->op_fid2; + b->mbo_valid |= OBD_MD_FLID; +} + +void mdc_pack_body(struct req_capsule *pill, const struct lu_fid *fid, + u64 valid, size_t ea_size, u32 suppgid, u32 flags) +{ + struct mdt_body *b = req_capsule_client_get(pill, &RMF_MDT_BODY); + LASSERT(b); + b->mbo_valid = valid; + b->mbo_eadatasize = ea_size; + b->mbo_flags = flags; + __mdc_pack_body(b, suppgid); + if (fid) { + b->mbo_fid1 = *fid; + b->mbo_valid |= OBD_MD_FLID; + } +} + +/** + * Pack a name (path component) into a request + * + * \param[in] pill request pill + * \param[in] field request field (usually RMF_NAME) + * \param[in] name path component + * \param[in] name_len length of path component + * + * \a field must be present in \a req and of size \a name_len + 1. + * + * \a name must be '\0' terminated of length \a name_len and represent + * a single path component (not contain '/'). 
+ */ +static void mdc_pack_name(struct req_capsule *pill, + const struct req_msg_field *field, + const char *name, size_t name_len) +{ + char *buf; + size_t buf_size; + size_t cpy_len; + + buf = req_capsule_client_get(pill, field); + buf_size = req_capsule_get_size(pill, field, RCL_CLIENT); + + LASSERT(buf != NULL && buf_size == name_len + 1); + + if (!name) { + buf[name_len] = '\0'; + return; + } + cpy_len = strlcpy(buf, name, buf_size); + + LASSERT(lu_name_is_valid_2(buf, cpy_len)); + if (cpy_len != name_len) + CDEBUG(D_DENTRY, "%s len %zd != %zd, concurrent rename?\n", + buf, name_len, cpy_len); +} + +void mdc_file_secctx_pack(struct req_capsule *pill, const char *secctx_name, + const void *secctx, size_t secctx_size) +{ + void *buf; + size_t buf_size; + + if (secctx_name == NULL) + return; + + buf = req_capsule_client_get(pill, &RMF_FILE_SECCTX_NAME); + buf_size = req_capsule_get_size(pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT); + + LASSERT(buf_size == strlen(secctx_name) + 1); + memcpy(buf, secctx_name, buf_size); + + buf = req_capsule_client_get(pill, &RMF_FILE_SECCTX); + buf_size = req_capsule_get_size(pill, &RMF_FILE_SECCTX, + RCL_CLIENT); + + LASSERT(buf_size == secctx_size); + memcpy(buf, secctx, buf_size); +} + +void mdc_file_encctx_pack(struct req_capsule *pill, + const void *encctx, size_t encctx_size) +{ + void *buf; + size_t buf_size; + + if (encctx == NULL) + return; + + buf = req_capsule_client_get(pill, &RMF_FILE_ENCCTX); + buf_size = req_capsule_get_size(pill, &RMF_FILE_ENCCTX, + RCL_CLIENT); + + LASSERT(buf_size == encctx_size); + memcpy(buf, encctx, buf_size); +} + +void mdc_file_sepol_pack(struct req_capsule *pill) +{ + void *buf; + size_t buf_size; + struct ptlrpc_request *req = pill->rc_req; + + if (strlen(req->rq_sepol) == 0) + return; + + buf = req_capsule_client_get(pill, &RMF_SELINUX_POL); + buf_size = req_capsule_get_size(pill, &RMF_SELINUX_POL, + RCL_CLIENT); + + LASSERT(buf_size == strlen(req->rq_sepol) + 1); + snprintf(buf, strlen(req->rq_sepol) + 1, "%s", req->rq_sepol); +} + +void mdc_readdir_pack(struct req_capsule *pill, __u64 pgoff, size_t size, + const struct lu_fid *fid) +{ + struct mdt_body *b = req_capsule_client_get(pill, &RMF_MDT_BODY); + + b->mbo_fid1 = *fid; + b->mbo_valid |= OBD_MD_FLID; + b->mbo_size = pgoff; /* !! */ + b->mbo_nlink = size; /* !! */ + __mdc_pack_body(b, -1); + b->mbo_mode = LUDA_FID | LUDA_TYPE; +} + +/* packing of MDS records */ +void mdc_create_pack(struct req_capsule *pill, struct md_op_data *op_data, + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, kernel_cap_t cap_effective, u64 rdev) +{ + struct mdt_rec_create *rec; + char *tmp; + __u64 flags; + + BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != + sizeof(struct mdt_rec_create)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + + rec->cr_opcode = REINT_CREATE; + rec->cr_fsuid = uid; + rec->cr_fsgid = gid; + rec->cr_cap = cap_effective.cap[0]; + rec->cr_fid1 = op_data->op_fid1; + rec->cr_fid2 = op_data->op_fid2; + rec->cr_mode = mode; + rec->cr_rdev = rdev; + rec->cr_time = op_data->op_mod_time; + rec->cr_suppgid1 = op_data->op_suppgids[0]; + rec->cr_suppgid2 = op_data->op_suppgids[1]; + flags = 0; + if (op_data->op_bias & MDS_CREATE_VOLATILE) + flags |= MDS_OPEN_VOLATILE; + if (op_data->op_bias & MDS_SETSTRIPE_CREATE) + /* borrow MDS_OPEN_CREATE flag to indicate current setstripe + * create only, and don't restripe if object exists. 
+ */ + flags |= MDS_OPEN_CREAT; + set_mrc_cr_flags(rec, flags); + rec->cr_bias = op_data->op_bias; + rec->cr_umask = current_umask(); + + mdc_pack_name(pill, &RMF_NAME, op_data->op_name, op_data->op_namelen); + if (data) { + tmp = req_capsule_client_get(pill, &RMF_EADATA); + memcpy(tmp, data, datalen); + } + + mdc_file_secctx_pack(pill, op_data->op_file_secctx_name, + op_data->op_file_secctx, + op_data->op_file_secctx_size); + + mdc_file_encctx_pack(pill, op_data->op_file_encctx, + op_data->op_file_encctx_size); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(pill); +} + +static inline __u64 mds_pack_open_flags(__u64 flags) +{ + __u64 cr_flags = (flags & MDS_OPEN_FL_INTERNAL); + + if (flags & FMODE_READ) + cr_flags |= MDS_FMODE_READ; + if (flags & FMODE_WRITE) + cr_flags |= MDS_FMODE_WRITE; + if (flags & O_CREAT) + cr_flags |= MDS_OPEN_CREAT; + if (flags & O_EXCL) + cr_flags |= MDS_OPEN_EXCL; + if (flags & O_TRUNC) + cr_flags |= MDS_OPEN_TRUNC; + if (flags & O_APPEND) + cr_flags |= MDS_OPEN_APPEND; + if (flags & O_SYNC) + cr_flags |= MDS_OPEN_SYNC; + if (flags & O_DIRECTORY) + cr_flags |= MDS_OPEN_DIRECTORY; +#ifdef FMODE_EXEC + if (flags & FMODE_EXEC) + cr_flags |= MDS_FMODE_EXEC; +#endif + if (cl_is_lov_delay_create(flags)) + cr_flags |= MDS_OPEN_DELAY_CREATE; + + if (flags & O_NONBLOCK) + cr_flags |= MDS_OPEN_NORESTORE; + + return cr_flags; +} + +/* packing of MDS records */ +void mdc_open_pack(struct req_capsule *pill, struct md_op_data *op_data, + umode_t mode, __u64 rdev, __u64 flags, const void *lmm, + size_t lmmlen) +{ + struct mdt_rec_create *rec; + char *tmp; + __u64 cr_flags; + + BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != + sizeof(struct mdt_rec_create)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + + /* XXX do something about time, uid, gid */ + rec->cr_opcode = REINT_OPEN; + rec->cr_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->cr_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->cr_cap = current_cap().cap[0]; + rec->cr_mode = mode; + cr_flags = mds_pack_open_flags(flags); + rec->cr_rdev = rdev; + rec->cr_umask = current_umask(); + if (op_data != NULL) { + rec->cr_fid1 = op_data->op_fid1; + rec->cr_fid2 = op_data->op_fid2; + rec->cr_time = op_data->op_mod_time; + rec->cr_suppgid1 = op_data->op_suppgids[0]; + rec->cr_suppgid2 = op_data->op_suppgids[1]; + rec->cr_bias = op_data->op_bias; + rec->cr_open_handle_old = op_data->op_open_handle; + + if (op_data->op_name) { + mdc_pack_name(pill, &RMF_NAME, op_data->op_name, + op_data->op_namelen); + + if (op_data->op_bias & MDS_CREATE_VOLATILE) + cr_flags |= MDS_OPEN_VOLATILE; + } + + mdc_file_secctx_pack(pill, op_data->op_file_secctx_name, + op_data->op_file_secctx, + op_data->op_file_secctx_size); + + mdc_file_encctx_pack(pill, op_data->op_file_encctx, + op_data->op_file_encctx_size); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(pill); + } + + if (lmm) { + cr_flags |= MDS_OPEN_HAS_EA; + tmp = req_capsule_client_get(pill, &RMF_EADATA); + memcpy(tmp, lmm, lmmlen); + if (cr_flags & MDS_OPEN_PCC) { + LASSERT(op_data != NULL); + rec->cr_archive_id = op_data->op_archive_id; + } + } + cr_flags |= MDS_OPEN_DEFAULT_LMV; + set_mrc_cr_flags(rec, cr_flags); +} + +static inline enum mds_attr_flags mdc_attr_pack(unsigned int ia_valid, + enum op_xvalid ia_xvalid) +{ + enum mds_attr_flags sa_valid = 0; + + if (ia_valid & ATTR_MODE) + sa_valid |= MDS_ATTR_MODE; + if (ia_valid & ATTR_UID) + sa_valid |= MDS_ATTR_UID; + if (ia_valid & ATTR_GID) + sa_valid |= MDS_ATTR_GID; + if (ia_valid 
& ATTR_SIZE) + sa_valid |= MDS_ATTR_SIZE; + if (ia_valid & ATTR_ATIME) + sa_valid |= MDS_ATTR_ATIME; + if (ia_valid & ATTR_MTIME) + sa_valid |= MDS_ATTR_MTIME; + if (ia_valid & ATTR_CTIME) + sa_valid |= MDS_ATTR_CTIME; + if (ia_valid & ATTR_ATIME_SET) + sa_valid |= MDS_ATTR_ATIME_SET; + if (ia_valid & ATTR_MTIME_SET) + sa_valid |= MDS_ATTR_MTIME_SET; + if (ia_valid & ATTR_FORCE) + sa_valid |= MDS_ATTR_FORCE; + if (ia_xvalid & OP_XVALID_FLAGS) + sa_valid |= MDS_ATTR_ATTR_FLAG; + if (ia_valid & ATTR_KILL_SUID) + sa_valid |= MDS_ATTR_KILL_SUID; + if (ia_valid & ATTR_KILL_SGID) + sa_valid |= MDS_ATTR_KILL_SGID; + if (ia_xvalid & OP_XVALID_CTIME_SET) + sa_valid |= MDS_ATTR_CTIME_SET; + if (ia_valid & ATTR_OPEN) + sa_valid |= MDS_ATTR_FROM_OPEN; + if (ia_xvalid & OP_XVALID_BLOCKS) + sa_valid |= MDS_ATTR_BLOCKS; + if (ia_xvalid & OP_XVALID_OWNEROVERRIDE) + /* NFSD hack (see bug 5781) */ + sa_valid |= MDS_OPEN_OWNEROVERRIDE; + if (ia_xvalid & OP_XVALID_PROJID) + sa_valid |= MDS_ATTR_PROJID; + if (ia_xvalid & OP_XVALID_LAZYSIZE) + sa_valid |= MDS_ATTR_LSIZE; + if (ia_xvalid & OP_XVALID_LAZYBLOCKS) + sa_valid |= MDS_ATTR_LBLOCKS; + + return sa_valid; +} + +static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, + struct md_op_data *op_data) +{ + rec->sa_opcode = REINT_SETATTR; + rec->sa_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->sa_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->sa_cap = current_cap().cap[0]; + rec->sa_suppgid = -1; + + rec->sa_fid = op_data->op_fid1; + rec->sa_valid = mdc_attr_pack(op_data->op_attr.ia_valid, + op_data->op_xvalid); + rec->sa_mode = op_data->op_attr.ia_mode; + rec->sa_uid = from_kuid(&init_user_ns, op_data->op_attr.ia_uid); + rec->sa_gid = from_kgid(&init_user_ns, op_data->op_attr.ia_gid); + rec->sa_projid = op_data->op_projid; + rec->sa_size = op_data->op_attr.ia_size; + rec->sa_blocks = op_data->op_attr_blocks; + rec->sa_atime = op_data->op_attr.ia_atime.tv_sec; + rec->sa_mtime = op_data->op_attr.ia_mtime.tv_sec; + rec->sa_ctime = op_data->op_attr.ia_ctime.tv_sec; + rec->sa_attr_flags = op_data->op_attr_flags; + if ((op_data->op_attr.ia_valid & ATTR_GID) && + in_group_p(op_data->op_attr.ia_gid)) + rec->sa_suppgid = + from_kgid(&init_user_ns, op_data->op_attr.ia_gid); + else + rec->sa_suppgid = op_data->op_suppgids[0]; + + rec->sa_bias = op_data->op_bias; +} + +static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch, + struct md_op_data *op_data) +{ + epoch->mio_open_handle = op_data->op_open_handle; + epoch->mio_unused1 = 0; + epoch->mio_unused2 = 0; + epoch->mio_padding = 0; +} + +void mdc_setattr_pack(struct req_capsule *pill, struct md_op_data *op_data, + void *ea, size_t ealen) +{ + struct mdt_rec_setattr *rec; + struct lov_user_md *lum = NULL; + + BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != + sizeof(struct mdt_rec_setattr)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + mdc_setattr_pack_rec(rec, op_data); + + if (ealen == 0) + return; + + lum = req_capsule_client_get(pill, &RMF_EADATA); + if (ea == NULL) { /* Remove LOV EA */ + lum->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1); + lum->lmm_stripe_size = 0; + lum->lmm_stripe_count = 0; + lum->lmm_stripe_offset = + (typeof(lum->lmm_stripe_offset))LOV_OFFSET_DEFAULT; + } else { + memcpy(lum, ea, ealen); + } +} + +void mdc_unlink_pack(struct req_capsule *pill, struct md_op_data *op_data) +{ + struct mdt_rec_unlink *rec; + + BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != + sizeof(struct mdt_rec_unlink)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + LASSERT(rec != 
NULL); + + rec->ul_opcode = op_data->op_cli_flags & CLI_RM_ENTRY ? + REINT_RMENTRY : REINT_UNLINK; + rec->ul_fsuid = op_data->op_fsuid; + rec->ul_fsgid = op_data->op_fsgid; + rec->ul_cap = op_data->op_cap.cap[0]; + rec->ul_mode = op_data->op_mode; + rec->ul_suppgid1 = op_data->op_suppgids[0]; + rec->ul_suppgid2 = -1; + rec->ul_fid1 = op_data->op_fid1; + rec->ul_fid2 = op_data->op_fid2; + rec->ul_time = op_data->op_mod_time; + rec->ul_bias = op_data->op_bias; + + mdc_pack_name(pill, &RMF_NAME, op_data->op_name, op_data->op_namelen); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(pill); +} + +void mdc_link_pack(struct req_capsule *pill, struct md_op_data *op_data) +{ + struct mdt_rec_link *rec; + + BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != + sizeof(struct mdt_rec_link)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + LASSERT(rec != NULL); + + rec->lk_opcode = REINT_LINK; + rec->lk_fsuid = op_data->op_fsuid; /* current->fsuid; */ + rec->lk_fsgid = op_data->op_fsgid; /* current->fsgid; */ + rec->lk_cap = op_data->op_cap.cap[0]; /* current->cap_effective; */ + rec->lk_suppgid1 = op_data->op_suppgids[0]; + rec->lk_suppgid2 = op_data->op_suppgids[1]; + rec->lk_fid1 = op_data->op_fid1; + rec->lk_fid2 = op_data->op_fid2; + rec->lk_time = op_data->op_mod_time; + rec->lk_bias = op_data->op_bias; + + mdc_pack_name(pill, &RMF_NAME, op_data->op_name, op_data->op_namelen); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(pill); +} + +static void mdc_close_intent_pack(struct req_capsule *pill, + struct md_op_data *op_data) +{ + struct close_data *data; + struct ldlm_lock *lock; + enum mds_op_bias bias = op_data->op_bias; + + if (!(bias & (MDS_CLOSE_INTENT | MDS_CLOSE_MIGRATE))) + return; + + data = req_capsule_client_get(pill, &RMF_CLOSE_DATA); + LASSERT(data != NULL); + + lock = ldlm_handle2lock(&op_data->op_lease_handle); + if (lock != NULL) { + data->cd_handle = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } + ldlm_cli_cancel(&op_data->op_lease_handle, LCF_LOCAL); + + data->cd_data_version = op_data->op_data_version; + data->cd_fid = op_data->op_fid2; + + if (bias & MDS_CLOSE_LAYOUT_SPLIT) { + data->cd_mirror_id = op_data->op_mirror_id; + } else if (bias & MDS_CLOSE_RESYNC_DONE) { + struct close_data_resync_done *sync = &data->cd_resync; + + BUILD_BUG_ON(sizeof(data->cd_resync) > + sizeof(data->cd_reserved)); + sync->resync_count = op_data->op_data_size / sizeof(__u32); + if (sync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) { + memcpy(sync->resync_ids_inline, op_data->op_data, + op_data->op_data_size); + } else { + size_t count = sync->resync_count; + + memcpy(req_capsule_client_get(pill, &RMF_U32), + op_data->op_data, count * sizeof(__u32)); + } + } else if (bias & MDS_PCC_ATTACH) { + data->cd_archive_id = op_data->op_archive_id; + } +} + +void mdc_rename_pack(struct req_capsule *pill, struct md_op_data *op_data, + const char *old, size_t oldlen, + const char *new, size_t newlen) +{ + struct mdt_rec_rename *rec; + + BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != + sizeof(struct mdt_rec_rename)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + + /* XXX do something about time, uid, gid */ + rec->rn_opcode = REINT_RENAME; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap.cap[0]; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid2; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = 
op_data->op_mode; + rec->rn_bias = op_data->op_bias; + + mdc_pack_name(pill, &RMF_NAME, old, oldlen); + + if (new != NULL) + mdc_pack_name(pill, &RMF_SYMTGT, new, newlen); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(pill); +} + +void mdc_migrate_pack(struct req_capsule *pill, struct md_op_data *op_data, + const char *name, size_t namelen) +{ + struct mdt_rec_rename *rec; + char *ea; + + BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != + sizeof(struct mdt_rec_rename)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + + rec->rn_opcode = REINT_MIGRATE; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap.cap[0]; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid4; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = op_data->op_mode; + rec->rn_bias = op_data->op_bias; + + mdc_pack_name(pill, &RMF_NAME, name, namelen); + + if (op_data->op_bias & MDS_CLOSE_MIGRATE) { + struct mdt_ioepoch *epoch; + + mdc_close_intent_pack(pill, op_data); + epoch = req_capsule_client_get(pill, &RMF_MDT_EPOCH); + mdc_ioepoch_pack(epoch, op_data); + } + + ea = req_capsule_client_get(pill, &RMF_EADATA); + memcpy(ea, op_data->op_data, op_data->op_data_size); +} + +void mdc_getattr_pack(struct req_capsule *pill, __u64 valid, __u32 flags, + struct md_op_data *op_data, size_t ea_size) +{ + struct mdt_body *b = req_capsule_client_get(pill, &RMF_MDT_BODY); + + b->mbo_valid = valid; + if (op_data->op_bias & MDS_CROSS_REF) + b->mbo_valid |= OBD_MD_FLCROSSREF; + if (op_data->op_bias & MDS_FID_OP) + b->mbo_valid |= OBD_MD_NAMEHASH; + b->mbo_eadatasize = ea_size; + b->mbo_flags = flags; + __mdc_pack_body(b, op_data->op_suppgids[0]); + + b->mbo_fid1 = op_data->op_fid1; + b->mbo_fid2 = op_data->op_fid2; + b->mbo_valid |= OBD_MD_FLID; + + if (op_data->op_name != NULL) + mdc_pack_name(pill, &RMF_NAME, op_data->op_name, + op_data->op_namelen); +} + +void mdc_close_pack(struct req_capsule *pill, struct md_op_data *op_data) +{ + struct mdt_ioepoch *epoch; + struct mdt_rec_setattr *rec; + + epoch = req_capsule_client_get(pill, &RMF_MDT_EPOCH); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + + mdc_setattr_pack_rec(rec, op_data); + /* + * The client will zero out local timestamps when losing the IBITS lock + * so any new RPC timestamps will update the client inode's timestamps. + * There was a defect on the server side which allowed the atime to be + * overwritten by a zeroed-out atime packed into the close RPC. + * + * Proactively clear the MDS_ATTR_ATIME flag in the RPC in this case + * to avoid zeroing the atime on old unpatched servers. See LU-8041. + */ + if (rec->sa_atime == 0) + rec->sa_valid &= ~MDS_ATTR_ATIME; + + mdc_ioepoch_pack(epoch, op_data); + mdc_close_intent_pack(pill, op_data); +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c new file mode 100644 index 0000000000000..41692b39eb909 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c @@ -0,0 +1,1466 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mdc_internal.h" + +struct mdc_getattr_args { + struct obd_export *ga_exp; + struct md_enqueue_info *ga_minfo; +}; + +int it_open_error(int phase, struct lookup_intent *it) +{ + if (it_disposition(it, DISP_OPEN_LEASE)) { + if (phase >= DISP_OPEN_LEASE) + return it->it_status; + else + return 0; + } + if (it_disposition(it, DISP_OPEN_OPEN)) { + if (phase >= DISP_OPEN_OPEN) + return it->it_status; + else + return 0; + } + + if (it_disposition(it, DISP_OPEN_CREATE)) { + if (phase >= DISP_OPEN_CREATE) + return it->it_status; + else + return 0; + } + + if (it_disposition(it, DISP_LOOKUP_EXECD)) { + if (phase >= DISP_LOOKUP_EXECD) + return it->it_status; + else + return 0; + } + + if (it_disposition(it, DISP_IT_EXECD)) { + if (phase >= DISP_IT_EXECD) + return it->it_status; + else + return 0; + } + + CERROR("it disp: %X, status: %d\n", it->it_disposition, it->it_status); + LBUG(); + + return 0; +} +EXPORT_SYMBOL(it_open_error); + +/* this must be called on a lockh that is known to have a referenced lock */ +int mdc_set_lock_data(struct obd_export *exp, const struct lustre_handle *lockh, + void *data, __u64 *bits) +{ + struct ldlm_lock *lock; + struct inode *new_inode = data; + + ENTRY; + if (bits) + *bits = 0; + + if (!lustre_handle_is_used(lockh)) + RETURN(0); + + lock = ldlm_handle2lock(lockh); + + LASSERT(lock != NULL); + lock_res_and_lock(lock); + if (lock->l_resource->lr_lvb_inode && + lock->l_resource->lr_lvb_inode != data) { + struct inode *old_inode = lock->l_resource->lr_lvb_inode; + + LASSERTF(old_inode->i_state & I_FREEING, + "Found existing inode %p/%lu/%u state %lu in lock: setting data to %p/%lu/%u\n", + old_inode, old_inode->i_ino, old_inode->i_generation, + old_inode->i_state, + new_inode, new_inode->i_ino, new_inode->i_generation); + } + lock->l_resource->lr_lvb_inode = new_inode; + if (bits) + *bits = lock->l_policy_data.l_inodebits.bits; + + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + + RETURN(0); +} + +enum ldlm_mode mdc_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, struct lustre_handle *lockh) +{ + struct ldlm_res_id res_id; + enum ldlm_mode rc; + + ENTRY; + fid_build_reg_res_name(fid, &res_id); + /* LU-4405: Clear bits not supported by server */ + policy->l_inodebits.bits &= exp_connect_ibits(exp); + rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags, + &res_id, type, policy, mode, lockh); + RETURN(rc); +} + +int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, enum ldlm_mode 
mode, + enum ldlm_cancel_flags flags, void *opaque) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ldlm_res_id res_id; + int rc; + + ENTRY; + fid_build_reg_res_name(fid, &res_id); + rc = ldlm_cli_cancel_unused_resource(obd->obd_namespace, &res_id, + policy, mode, flags, opaque); + RETURN(rc); +} + +int mdc_null_inode(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct ldlm_res_id res_id; + struct ldlm_resource *res; + struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace; + + ENTRY; + LASSERTF(ns != NULL, "no namespace passed\n"); + + fid_build_reg_res_name(fid, &res_id); + + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if (IS_ERR(res)) + RETURN(0); + + lock_res(res); + res->lr_lvb_inode = NULL; + unlock_res(res); + + ldlm_resource_putref(res); + RETURN(0); +} + +static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc) +{ + /* Don't hold error requests for replay. */ + if (req->rq_replay) { + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + } + if (rc && req->rq_transno != 0) { + DEBUG_REQ(D_ERROR, req, "transno returned on error: rc = %d", + rc); + LBUG(); + } +} + +/** + * Save a large LOV EA into the request buffer so that it is available + * for replay. We don't do this in the initial request because the + * original request doesn't need this buffer (at most it sends just the + * lov_mds_md) and it is a waste of RAM/bandwidth to send the empty + * buffer and may also be difficult to allocate and save a very large + * request buffer for each open. (b=5707) + * + * OOM here may cause recovery failure if lmm is needed (only for the + * original open if the MDS crashed just when this client also OOM'd) + * but this is incredibly unlikely, and questionable whether the client + * could do MDS recovery under OOM anyways... + */ +static int mdc_save_lovea(struct ptlrpc_request *req, void *data, u32 size) +{ + struct req_capsule *pill = &req->rq_pill; + void *lovea; + int rc = 0; + + if (req_capsule_get_size(pill, &RMF_EADATA, RCL_CLIENT) < size) { + rc = sptlrpc_cli_enlarge_reqbuf(req, &RMF_EADATA, size); + if (rc) { + CERROR("%s: Can't enlarge ea size to %d: rc = %d\n", + req->rq_export->exp_obd->obd_name, + size, rc); + return rc; + } + } else { + req_capsule_shrink(pill, &RMF_EADATA, size, RCL_CLIENT); + } + + req_capsule_set_size(pill, &RMF_EADATA, RCL_CLIENT, size); + lovea = req_capsule_client_get(pill, &RMF_EADATA); + if (lovea) { + memcpy(lovea, data, size); + lov_fix_ea_for_replay(lovea); + } + + return rc; +} + +static struct ptlrpc_request * +mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, + struct md_op_data *op_data, __u32 acl_bufsize) +{ + struct ptlrpc_request *req; + struct obd_device *obd = class_exp2obd(exp); + struct ldlm_intent *lit; + const void *lmm = op_data->op_data; + __u32 lmmsize = op_data->op_data_size; + __u32 mdt_md_capsule_size; + LIST_HEAD(cancels); + int count = 0; + enum ldlm_mode mode; + int repsize, repsize_estimate; + int rc; + + ENTRY; + + mdt_md_capsule_size = obd->u.cli.cl_default_mds_easize; + + it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG; + + /* XXX: openlock is not cancelled for cross-refs. */ + /* If inode is known, cancel conflicting OPEN locks. 
*/ + if (fid_is_sane(&op_data->op_fid2)) { + if (it->it_flags & MDS_OPEN_LEASE) { /* try to get lease */ + if (it->it_flags & MDS_FMODE_WRITE) + mode = LCK_EX; + else + mode = LCK_PR; + } else { + if (it->it_flags & (MDS_FMODE_WRITE | MDS_OPEN_TRUNC)) + mode = LCK_CW; +#ifdef FMODE_EXEC + else if (it->it_flags & FMODE_EXEC) + mode = LCK_PR; +#endif + else + mode = LCK_CR; + } + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, mode, + MDS_INODELOCK_OPEN); + } + + /* If CREATE, cancel parent's UPDATE lock. */ + if (it->it_op & IT_CREAT) + mode = LCK_EX; + else + mode = LCK_CR; + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, mode, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_OPEN); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(ERR_PTR(-ENOMEM)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + if (cl_is_lov_delay_create(it->it_flags)) { + /* open(O_LOV_DELAY_CREATE) won't pack lmm */ + LASSERT(lmmsize == 0); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0); + } else { + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + max(lmmsize, obd->u.cli.cl_default_mds_easize)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT, op_data->op_file_secctx_name != NULL ? + op_data->op_file_secctx_name_size : 0); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, RCL_CLIENT, + op_data->op_file_secctx_size); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, RCL_CLIENT, + op_data->op_file_encctx_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? 
+ strlen(req->rq_sepol) + 1 : 0); + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + spin_lock(&req->rq_lock); + req->rq_replay = req->rq_import->imp_replayable; + spin_unlock(&req->rq_lock); + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_open_pack(&req->rq_pill, op_data, it->it_create_mode, 0, + it->it_flags, lmm, lmmsize); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + mdt_md_capsule_size); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + + if (!(it->it_op & IT_CREAT) && it->it_op & IT_OPEN && + req_capsule_has_field(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT) && + op_data->op_file_secctx_name_size > 0 && + op_data->op_file_secctx_name != NULL) { + char *secctx_name; + + secctx_name = req_capsule_client_get(&req->rq_pill, + &RMF_FILE_SECCTX_NAME); + memcpy(secctx_name, op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, + obd->u.cli.cl_max_mds_easize); + + CDEBUG(D_SEC, "packed '%.*s' as security xattr name\n", + op_data->op_file_secctx_name_size, + op_data->op_file_secctx_name); + + } else { + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, 0); + } + + if (exp_connect_encrypt(exp) && !(it->it_op & IT_CREAT) && + it->it_op & IT_OPEN) + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, + RCL_SERVER, + obd->u.cli.cl_max_mds_easize); + else + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, + RCL_SERVER, 0); + + /** + * Inline buffer for possible data from Data-on-MDT files. + */ + req_capsule_set_size(&req->rq_pill, &RMF_NIOBUF_INLINE, RCL_SERVER, + sizeof(struct niobuf_remote)); + req_capsule_set_size(&req->rq_pill, &RMF_DEFAULT_MDT_MD, RCL_SERVER, + sizeof(struct lmv_user_md)); + ptlrpc_request_set_replen(req); + + /* Get real repbuf allocated size as rounded up power of 2 */ + repsize = size_roundup_power2(req->rq_replen + + lustre_msg_early_size); + /* Estimate free space for DoM files in repbuf */ + repsize_estimate = repsize - (req->rq_replen - + mdt_md_capsule_size + + sizeof(struct lov_comp_md_v1) + + sizeof(struct lov_comp_md_entry_v1) + + lov_mds_md_size(0, LOV_MAGIC_V3)); + + if (repsize_estimate < obd->u.cli.cl_dom_min_inline_repsize) { + repsize = obd->u.cli.cl_dom_min_inline_repsize - + repsize_estimate + sizeof(struct niobuf_remote); + req_capsule_set_size(&req->rq_pill, &RMF_NIOBUF_INLINE, + RCL_SERVER, + sizeof(struct niobuf_remote) + repsize); + ptlrpc_request_set_replen(req); + CDEBUG(D_INFO, "Increase repbuf by %d bytes, total: %d\n", + repsize, req->rq_replen); + repsize = size_roundup_power2(req->rq_replen + + lustre_msg_early_size); + } + /* The only way to report real allocated repbuf size to the server + * is the lm_repsize but it must be set prior buffer allocation itself + * due to security reasons - it is part of buffer used in signature + * calculation (see LU-11414). Therefore the saved size is predicted + * value as rq_replen rounded to the next higher power of 2. + * Such estimation is safe. Though the final allocated buffer might + * be even larger, it is not possible to know that at this point. 
+ */ + req->rq_reqmsg->lm_repsize = repsize; + RETURN(req); +} + +#define GA_DEFAULT_EA_NAME_LEN 20 +#define GA_DEFAULT_EA_VAL_LEN 250 +#define GA_DEFAULT_EA_NUM 10 + +static struct ptlrpc_request * +mdc_intent_getxattr_pack(struct obd_export *exp, struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct ldlm_intent *lit; + int rc, count = 0; + LIST_HEAD(cancels); + u32 ea_vals_buf_size = GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM; + + ENTRY; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETXATTR); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = IT_GETXATTR; + /* Message below is checked in sanity-selinux test_20d + * and sanity-sec test_49 + */ + CDEBUG(D_INFO, "%s: get xattrs for "DFID"\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1)); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* If the supplied buffer is too small then the server will return + * -ERANGE and llite will fallback to using non cached xattr + * operations. On servers before 2.10.1 a (non-cached) listxattr RPC + * for an orphan or dead file causes an oops. So let's try to avoid + * sending too small a buffer to too old a server. This is effectively + * undoing the memory conservation of LU-9417 when it would be *more* + * likely to crash the server. See LU-9856. 
+ */ + if (exp->exp_connect_data.ocd_version < OBD_OCD_VERSION(2, 10, 1, 0)) + ea_vals_buf_size = max_t(u32, ea_vals_buf_size, + exp->exp_connect_data.ocd_max_easize); +#endif + + /* pack the intended request */ + mdc_pack_body(&req->rq_pill, &op_data->op_fid1, op_data->op_valid, + ea_vals_buf_size, -1, 0); + + /* get SELinux policy info if any */ + mdc_file_sepol_pack(&req->rq_pill); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, + GA_DEFAULT_EA_NAME_LEN * GA_DEFAULT_EA_NUM); + + req_capsule_set_size(&req->rq_pill, &RMF_EAVALS, RCL_SERVER, + ea_vals_buf_size); + + req_capsule_set_size(&req->rq_pill, &RMF_EAVALS_LENS, RCL_SERVER, + sizeof(u32) * GA_DEFAULT_EA_NUM); + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + + ptlrpc_request_set_replen(req); + + RETURN(req); +} + +static struct ptlrpc_request * +mdc_intent_getattr_pack(struct obd_export *exp, struct lookup_intent *it, + struct md_op_data *op_data, __u32 acl_bufsize) +{ + struct ptlrpc_request *req; + struct obd_device *obd = class_exp2obd(exp); + u64 valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE | + OBD_MD_FLDIREA | OBD_MD_MEA | OBD_MD_FLACL | + OBD_MD_DEFAULT_MEA; + struct ldlm_intent *lit; + __u32 easize; + bool have_secctx = false; + int rc; + + ENTRY; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETATTR); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + /* send name of security xattr to get upon intent */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR) && + req_capsule_has_field(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT) && + op_data->op_file_secctx_name_size > 0 && + op_data->op_file_secctx_name != NULL) { + have_secctx = true; + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT, + op_data->op_file_secctx_name_size); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + if (obd->u.cli.cl_default_mds_easize > 0) + easize = obd->u.cli.cl_default_mds_easize; + else + easize = obd->u.cli.cl_max_mds_easize; + + /* pack the intended request */ + mdc_getattr_pack(&req->rq_pill, valid, it->it_flags, op_data, easize); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, easize); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + req_capsule_set_size(&req->rq_pill, &RMF_DEFAULT_MDT_MD, RCL_SERVER, + sizeof(struct lmv_user_md)); + + if (have_secctx) { + char *secctx_name; + + secctx_name = req_capsule_client_get(&req->rq_pill, + &RMF_FILE_SECCTX_NAME); + memcpy(secctx_name, op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, easize); + + CDEBUG(D_SEC, "packed '%.*s' as security xattr name\n", + op_data->op_file_secctx_name_size, + op_data->op_file_secctx_name); + } else { + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, 0); + } + + if (exp_connect_encrypt(exp) && it->it_op & (IT_LOOKUP | IT_GETATTR)) + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, + RCL_SERVER, easize); + else + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, + RCL_SERVER, 0); + + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static struct ptlrpc_request *mdc_intent_layout_pack(struct obd_export *exp, + struct lookup_intent 
*it, + struct md_op_data *op_data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct ldlm_intent *lit; + struct layout_intent *layout; + LIST_HEAD(cancels); + int count = 0, rc; + + ENTRY; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_LAYOUT); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + if (fid_is_sane(&op_data->op_fid2) && (it->it_op & IT_LAYOUT) && + (it->it_flags & FMODE_WRITE)) { + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_LAYOUT); + } + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0); + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the layout intent request */ + layout = req_capsule_client_get(&req->rq_pill, &RMF_LAYOUT_INTENT); + LASSERT(op_data->op_data != NULL); + LASSERT(op_data->op_data_size == sizeof(*layout)); + memcpy(layout, op_data->op_data, sizeof(*layout)); + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static struct ptlrpc_request *mdc_enqueue_pack(struct obd_export *exp, + int lvb_len) +{ + struct ptlrpc_request *req; + int rc; + + ENTRY; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static int mdc_finish_enqueue(struct obd_export *exp, + struct ptlrpc_request *req, + struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, + struct lustre_handle *lockh, int rc) +{ + struct req_capsule *pill = &req->rq_pill; + struct ldlm_request *lockreq; + struct ldlm_reply *lockrep; + struct ldlm_lock *lock; + struct mdt_body *body = NULL; + void *lvb_data = NULL; + __u32 lvb_len = 0; + + ENTRY; + LASSERT(rc >= 0); + /* Similarly, if we're going to replay this request, we don't want to + * actually get a lock, just perform the intent. + */ + if (req->rq_transno || req->rq_replay) { + lockreq = req_capsule_client_get(pill, &RMF_DLM_REQ); + lockreq->lock_flags |= ldlm_flags_to_wire(LDLM_FL_INTENT_ONLY); + } + + if (rc == ELDLM_LOCK_ABORTED) { + einfo->ei_mode = 0; + memset(lockh, 0, sizeof(*lockh)); + rc = 0; + } else { /* rc = 0 */ + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + + /* If server returned a different lock mode, fix up variables */ + if (lock->l_req_mode != einfo->ei_mode) { + ldlm_lock_addref(lockh, lock->l_req_mode); + ldlm_lock_decref(lockh, einfo->ei_mode); + einfo->ei_mode = lock->l_req_mode; + } + LDLM_LOCK_PUT(lock); + } + + lockrep = req_capsule_server_get(pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */ + + it->it_disposition = (int)lockrep->lock_policy_res1; + it->it_status = (int)lockrep->lock_policy_res2; + it->it_lock_mode = einfo->ei_mode; + it->it_lock_handle = lockh->cookie; + it->it_request = req; + + /* Technically speaking rq_transno must already be zero if + * it_status is in error, so the check is a bit redundant. 
+ */ + if ((!req->rq_transno || it->it_status < 0) && req->rq_replay) + mdc_clear_replay_flag(req, it->it_status); + + /* If we're doing an IT_OPEN which did not result in an actual + * successful open, then we need to remove the bit which saves + * this request for unconditional replay. + * + * It's important that we do this first! Otherwise we might exit the + * function without doing so, and try to replay a failed create. + * (b=3440) + */ + if (it->it_op & IT_OPEN && req->rq_replay && + (!it_disposition(it, DISP_OPEN_OPEN) || it->it_status != 0)) + mdc_clear_replay_flag(req, it->it_status); + + DEBUG_REQ(D_RPCTRACE, req, "op=%x disposition=%x, status=%d", + it->it_op, it->it_disposition, it->it_status); + + /* We know what to expect, so we do any byte flipping required here */ + if (it_has_reply_body(it)) { + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (body == NULL) { + rc = -EPROTO; + CERROR("%s: cannot swab mdt_body: rc = %d\n", + exp->exp_obd->obd_name, rc); + RETURN(rc); + } + + if (it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) { + /* + * If this is a successful OPEN request, we need to set + * replay handler and data early, so that if replay + * happens immediately after swabbing below, new reply + * is swabbed by that handler correctly. + */ + mdc_set_open_replay_data(NULL, NULL, it); + } + + if (it_disposition(it, DISP_OPEN_CREATE) && + !it_open_error(DISP_OPEN_CREATE, it)) { + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CREATE); + } + + if (body->mbo_valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) { + void *eadata; + + mdc_update_max_ea_from_body(exp, body); + + /* + * The eadata is opaque; just check that it is there. + * Eventually, obd_unpackmd() will check the contents. + */ + eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + body->mbo_eadatasize); + if (eadata == NULL) + RETURN(-EPROTO); + + /* save LVB data and length if for layout lock */ + lvb_data = eadata; + lvb_len = body->mbo_eadatasize; + + /* + * We save the reply LOV EA in case we have to replay a + * create for recovery. If we didn't allocate a large + * enough request buffer above we need to reallocate it + * here to hold the actual LOV EA. + * + * To not save LOV EA if request is not going to replay + * (for example error one). + */ + if ((it->it_op & IT_OPEN) && req->rq_replay) { + rc = mdc_save_lovea(req, eadata, + body->mbo_eadatasize); + if (rc) { + body->mbo_valid &= ~OBD_MD_FLEASIZE; + body->mbo_eadatasize = 0; + rc = 0; + } + } + } + } else if (it->it_op & IT_LAYOUT) { + /* maybe the lock was granted right away and layout + * is packed into RMF_DLM_LVB of req + */ + lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER); + CDEBUG(D_INFO, "%s: layout return lvb %d transno %lld\n", + class_exp2obd(exp)->obd_name, lvb_len, req->rq_transno); + if (lvb_len > 0) { + lvb_data = req_capsule_server_sized_get(pill, + &RMF_DLM_LVB, lvb_len); + if (lvb_data == NULL) + RETURN(-EPROTO); + + /** + * save replied layout data to the request buffer for + * recovery consideration (lest MDS reinitialize + * another set of OST objects). + */ + if (req->rq_transno) + (void)mdc_save_lovea(req, lvb_data, lvb_len); + } + } + + /* fill in stripe data for layout lock. + * LU-6581: trust layout data only if layout lock is granted. The MDT + * has stopped sending layout unless the layout lock is granted. The + * client still does this checking in case it's talking with an old + * server. 
- Jinshan + */ + lock = ldlm_handle2lock(lockh); + if (lock == NULL) + RETURN(rc); + + if (ldlm_has_layout(lock) && lvb_data != NULL && + !(lockrep->lock_flags & LDLM_FL_BLOCKED_MASK)) { + void *lmm; + + LDLM_DEBUG(lock, "layout lock returned by: %s, lvb_len: %d", + ldlm_it2str(it->it_op), lvb_len); + + OBD_ALLOC_LARGE(lmm, lvb_len); + if (lmm == NULL) + GOTO(out_lock, rc = -ENOMEM); + + memcpy(lmm, lvb_data, lvb_len); + + /* install lvb_data */ + lock_res_and_lock(lock); + if (lock->l_lvb_data == NULL) { + lock->l_lvb_type = LVB_T_LAYOUT; + lock->l_lvb_data = lmm; + lock->l_lvb_len = lvb_len; + lmm = NULL; + } + unlock_res_and_lock(lock); + if (lmm != NULL) + OBD_FREE_LARGE(lmm, lvb_len); + } + + if (ldlm_has_dom(lock)) { + LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast); + + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (!(body->mbo_valid & OBD_MD_DOM_SIZE)) { + LDLM_ERROR(lock, "%s: DoM lock without size.", + exp->exp_obd->obd_name); + GOTO(out_lock, rc = -EPROTO); + } + + LDLM_DEBUG(lock, "DoM lock is returned by: %s, size: %llu", + ldlm_it2str(it->it_op), body->mbo_dom_size); + + lock_res_and_lock(lock); + mdc_body2lvb(body, &lock->l_ost_lvb); + ldlm_lock_allow_match_locked(lock); + unlock_res_and_lock(lock); + } +out_lock: + LDLM_LOCK_PUT(lock); + + RETURN(rc); +} + +static inline bool mdc_skip_mod_rpc_slot(const struct lookup_intent *it) +{ + if (it != NULL && + (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || + it->it_op == IT_READDIR || it->it_op == IT_GETXATTR || + (it->it_op == IT_LAYOUT && !(it->it_flags & MDS_FMODE_WRITE)))) + return true; + return false; +} + +/* We always reserve enough space in the reply packet for a stripe MD, because + * we don't know in advance the file type. + */ +static int mdc_enqueue_base(struct obd_export *exp, + struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct lookup_intent *it, + struct md_op_data *op_data, + struct lustre_handle *lockh, + __u64 extra_lock_flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + __u64 flags, saved_flags = extra_lock_flags; + struct ldlm_res_id res_id; + static const union ldlm_policy_data lookup_policy = { + .l_inodebits = { MDS_INODELOCK_LOOKUP } }; + static const union ldlm_policy_data update_policy = { + .l_inodebits = { MDS_INODELOCK_UPDATE } }; + static const union ldlm_policy_data layout_policy = { + .l_inodebits = { MDS_INODELOCK_LAYOUT } }; + static const union ldlm_policy_data getxattr_policy = { + .l_inodebits = { MDS_INODELOCK_XATTR } }; + int generation, resends = 0; + struct ldlm_reply *lockrep; + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize; + enum lvb_type lvb_type = 0; + int rc; + + ENTRY; + LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n", + einfo->ei_type); + fid_build_reg_res_name(&op_data->op_fid1, &res_id); + + if (it != NULL) { + LASSERT(policy == NULL); + + saved_flags |= LDLM_FL_HAS_INTENT; + if (it->it_op & (IT_GETATTR | IT_READDIR)) + policy = &update_policy; + else if (it->it_op & IT_LAYOUT) + policy = &layout_policy; + else if (it->it_op & IT_GETXATTR) + policy = &getxattr_policy; + else + policy = &lookup_policy; + } + + generation = obd->u.cli.cl_import->imp_generation; + if (!it || (it->it_op & (IT_OPEN | IT_CREAT))) + acl_bufsize = min_t(__u32, imp->imp_connect_data.ocd_max_easize, + XATTR_SIZE_MAX); + else + acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + +resend: + flags = saved_flags; + if (it == NULL) { + /* The only way right now is FLOCK. 
*/ + LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n", + einfo->ei_type); + res_id.name[3] = LDLM_FLOCK; + req = ldlm_enqueue_pack(exp, 0); + } else if (it->it_op & IT_OPEN) { + req = mdc_intent_open_pack(exp, it, op_data, acl_bufsize); + } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { + req = mdc_intent_getattr_pack(exp, it, op_data, acl_bufsize); + } else if (it->it_op & IT_READDIR) { + req = mdc_enqueue_pack(exp, 0); + } else if (it->it_op & IT_LAYOUT) { + if (!imp_connect_lvb_type(imp)) + RETURN(-EOPNOTSUPP); + req = mdc_intent_layout_pack(exp, it, op_data); + lvb_type = LVB_T_LAYOUT; + } else if (it->it_op & IT_GETXATTR) { + req = mdc_intent_getxattr_pack(exp, it, op_data); + } else { + LBUG(); + RETURN(-EINVAL); + } + + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = ktime_get_real_seconds() + resends; + } + + einfo->ei_req_slot = !(op_data->op_cli_flags & CLI_NO_SLOT); + einfo->ei_mod_slot = !mdc_skip_mod_rpc_slot(it); + + /* With Data-on-MDT the glimpse callback is needed too. + * It is set here in advance but not in mdc_finish_enqueue() + * to avoid possible races. It is safe to have glimpse handler + * for non-DOM locks and costs nothing. + */ + if (einfo->ei_cb_gl == NULL) + einfo->ei_cb_gl = mdc_ldlm_glimpse_ast; + + rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL, + 0, lvb_type, lockh, 0); + + if (!it) { + /* For flock requests we immediately return without further + * delay and let the caller deal with the rest, since the rest of + * this function's metadata processing makes no sense for flock + * requests anyway. But in case of a problem during comms with the + * server (-ETIMEDOUT) or any signal/kill attempt (-EINTR), + * we cannot rely on the caller; this mainly matters for F_UNLCKs + * (explicit, or automatically generated by the kernel to clean up + * current flocks upon exit) that must not be lost. + */ + ptlrpc_req_finished(req); + if (((rc == -EINTR) || (rc == -ETIMEDOUT)) && + (einfo->ei_type == LDLM_FLOCK) && + (einfo->ei_mode == LCK_NL)) + goto resend; + RETURN(rc); + } + + if (rc < 0) { + CDEBUG(D_INFO, + "%s: ldlm_cli_enqueue "DFID":"DFID"=%s failed: rc = %d\n", + obd->obd_name, PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2), op_data->op_name ?: "", rc); + + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + RETURN(rc); + } + + lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); + + lockrep->lock_policy_res2 = + ptlrpc_status_ntoh(lockrep->lock_policy_res2); + + /* Retry infinitely when the server returns -EINPROGRESS for the + * intent operation; when the server returns -EINPROGRESS for acquiring the + * intent lock, we'll retry in after_reply().
+ */ + if (it && (int)lockrep->lock_policy_res2 == -EINPROGRESS) { + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + if (generation == obd->u.cli.cl_import->imp_generation) { + if (signal_pending(current)) + RETURN(-EINTR); + + resends++; + CDEBUG(D_HA, "%s: resend:%d op:%d "DFID"/"DFID"\n", + obd->obd_name, resends, it->it_op, + PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2)); + goto resend; + } else { + CDEBUG(D_HA, "resend cross eviction\n"); + RETURN(-EIO); + } + } + + if ((int)lockrep->lock_policy_res2 == -ERANGE && + it->it_op & (IT_OPEN | IT_GETATTR | IT_LOOKUP) && + acl_bufsize == LUSTRE_POSIX_ACL_MAX_SIZE_OLD) { + mdc_clear_replay_flag(req, -ERANGE); + ptlrpc_req_finished(req); + acl_bufsize = min_t(__u32, imp->imp_connect_data.ocd_max_easize, + XATTR_SIZE_MAX); + goto resend; + } + + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); + if (rc < 0) { + if (lustre_handle_is_used(lockh)) { + ldlm_lock_decref(lockh, einfo->ei_mode); + memset(lockh, 0, sizeof(*lockh)); + } + ptlrpc_req_finished(req); + + it->it_lock_handle = 0; + it->it_lock_mode = 0; + it->it_request = NULL; + } + + RETURN(rc); +} + +int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct md_op_data *op_data, + struct lustre_handle *lockh, __u64 extra_lock_flags) +{ + return mdc_enqueue_base(exp, einfo, policy, NULL, + op_data, lockh, extra_lock_flags); +} + +static int mdc_finish_intent_lock(struct obd_export *exp, + struct ptlrpc_request *request, + struct md_op_data *op_data, + struct lookup_intent *it, + struct lustre_handle *lockh) +{ + struct lustre_handle old_lock; + struct ldlm_lock *lock; + int rc = 0; + + ENTRY; + LASSERT(request != NULL); + LASSERT(request != LP_POISON); + LASSERT(request->rq_repmsg != LP_POISON); + + if (it->it_op & IT_READDIR) + RETURN(0); + + if (it->it_op & (IT_GETXATTR | IT_LAYOUT)) { + if (it->it_status != 0) + GOTO(out, rc = it->it_status); + } else { + if (!it_disposition(it, DISP_IT_EXECD)) { + /* The server failed before it even started executing + * the intent, i.e. because it couldn't unpack the + * request. + */ + LASSERT(it->it_status != 0); + GOTO(out, rc = it->it_status); + } + rc = it_open_error(DISP_IT_EXECD, it); + if (rc) + GOTO(out, rc); + + rc = it_open_error(DISP_LOOKUP_EXECD, it); + if (rc) + GOTO(out, rc); + + /* keep requests around for the multiple phases of the call + * this shows the DISP_XX must guarantee we make it into the + * call + */ + if (!it_disposition(it, DISP_ENQ_CREATE_REF) && + it_disposition(it, DISP_OPEN_CREATE) && + !it_open_error(DISP_OPEN_CREATE, it)) { + it_set_disposition(it, DISP_ENQ_CREATE_REF); + /* balanced in ll_create_node */ + ptlrpc_request_addref(request); + } + if (!it_disposition(it, DISP_ENQ_OPEN_REF) && + it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) { + it_set_disposition(it, DISP_ENQ_OPEN_REF); + /* balanced in ll_file_open */ + ptlrpc_request_addref(request); + /* eviction in middle of open RPC processing b=11546 */ + OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_ENQUEUE_PAUSE, + obd_timeout); + } + + if (it->it_op & IT_CREAT) { + /* XXX this belongs in ll_create_it */ + } else if (it->it_op == IT_OPEN) { + LASSERT(!it_disposition(it, DISP_OPEN_CREATE)); + } else { + LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP)); + } + } + + /* If we already have a matching lock, then cancel the new + * one. 
We have to set the data here instead of in + * mdc_enqueue, because we need to use the child's inode as + * the l_ast_data to match, and that's not available until + * intent_finish has performed the iget(). + */ + lock = ldlm_handle2lock(lockh); + if (lock) { + union ldlm_policy_data policy = lock->l_policy_data; + + LDLM_DEBUG(lock, "matching against this"); + + if (it_has_reply_body(it)) { + struct mdt_body *body; + + body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + /* mdc_enqueue checked */ + LASSERT(body != NULL); + LASSERTF(fid_res_name_eq(&body->mbo_fid1, + &lock->l_resource->lr_name), + "Lock res_id: "DLDLMRES", fid: "DFID"\n", + PLDLMRES(lock->l_resource), + PFID(&body->mbo_fid1)); + } + LDLM_LOCK_PUT(lock); + + memcpy(&old_lock, lockh, sizeof(*lockh)); + if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL, + LDLM_IBITS, &policy, LCK_NL, &old_lock)) { + ldlm_lock_decref_and_cancel(lockh, it->it_lock_mode); + memcpy(lockh, &old_lock, sizeof(old_lock)); + it->it_lock_handle = lockh->cookie; + } + } + + EXIT; +out: + CDEBUG(D_DENTRY, + "D_IT dentry=%.*s intent=%s status=%d disp=%x: rc = %d\n", + (int)op_data->op_namelen, op_data->op_name, + ldlm_it2str(it->it_op), it->it_status, it->it_disposition, rc); + + return rc; +} + +int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + /* We could just return 1 immediately, but as we should only be called + * in revalidate_it if we already have a lock, let's verify that. + */ + struct ldlm_res_id res_id; + struct lustre_handle lockh; + union ldlm_policy_data policy; + enum ldlm_mode mode; + + ENTRY; + if (it->it_lock_handle) { + lockh.cookie = it->it_lock_handle; + mode = ldlm_revalidate_lock_handle(&lockh, bits); + } else { + fid_build_reg_res_name(fid, &res_id); + switch (it->it_op) { + case IT_GETATTR: + /* File attributes are held under multiple bits: + * nlink is under lookup lock, size and times are + * under UPDATE lock and recently we've also got + * a separate permissions lock for owner/group/acl that + * were protected by lookup lock before. + * Getattr must provide all of that information, + * so we need to ensure we have all of those locks. + * Unfortunately, if the bits are split across multiple + * locks, there's no easy way to match all of them here, + * so an extra RPC would be performed to fetch all + * of those bits at once for now. + */ + /* For new MDTs(> 2.4), UPDATE|PERM should be enough, + * but for old MDTs (< 2.4), permission is covered + * by LOOKUP lock, so it needs to match all bits here. + */ + policy.l_inodebits.bits = MDS_INODELOCK_UPDATE | + MDS_INODELOCK_PERM; + break; + case IT_READDIR: + policy.l_inodebits.bits = MDS_INODELOCK_UPDATE; + break; + case IT_LAYOUT: + policy.l_inodebits.bits = MDS_INODELOCK_LAYOUT; + break; + default: + policy.l_inodebits.bits = MDS_INODELOCK_LOOKUP; + break; + } + + mode = mdc_lock_match(exp, LDLM_FL_BLOCK_GRANTED, fid, + LDLM_IBITS, &policy, + LCK_CR | LCK_CW | LCK_PR | LCK_PW, + &lockh); + } + + if (mode) { + it->it_lock_handle = lockh.cookie; + it->it_lock_mode = mode; + } else { + it->it_lock_handle = 0; + it->it_lock_mode = 0; + } + + RETURN(!!mode); +} + +/* + * This long block is all about fixing up the lock and request state + * so that it is correct as of the moment _before_ the operation was + * applied; that way, the VFS will think that everything is normal and + * call Lustre's regular VFS methods. 
+ * + * If we're performing a creation, that means that unless the creation + * failed with EEXIST, we should fake up a negative dentry. + * + * For everything else, we want to lookup to succeed. + * + * One additional note: if CREATE or OPEN succeeded, we add an extra + * reference to the request because we need to keep it around until + * ll_create/ll_open gets called. + * + * The server will return to us, in it_disposition, an indication of + * exactly what it_status refers to. + * + * If DISP_OPEN_OPEN is set, then it_status refers to the open() call, + * otherwise if DISP_OPEN_CREATE is set, then it status is the + * creation failure mode. In either case, one of DISP_LOOKUP_NEG or + * DISP_LOOKUP_POS will be set, indicating whether the child lookup + * was successful. + * + * Else, if DISP_LOOKUP_EXECD then it_status is the rc of the + * child lookup. + */ +int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) +{ + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_IBITS, + .ei_mode = it_to_lock_mode(it), + .ei_cb_bl = cb_blocking, + .ei_cb_cp = ldlm_completion_ast, + .ei_cb_gl = mdc_ldlm_glimpse_ast, + }; + struct lustre_handle lockh; + int rc = 0; + + ENTRY; + LASSERT(it); + CDEBUG(D_DLMTRACE, "(name: %.*s,"DFID") in obj "DFID + ", intent: %s flags %#llo\n", (int)op_data->op_namelen, + op_data->op_name, PFID(&op_data->op_fid2), + PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), + it->it_flags); + + lockh.cookie = 0; + /* MDS_FID_OP is not a revalidate case */ + if (fid_is_sane(&op_data->op_fid2) && + (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_READDIR)) && + !(op_data->op_bias & MDS_FID_OP)) { + /* We could just return 1 immediately, but since we should only + * be called in revalidate_it if we already have a lock, let's + * verify that. + */ + it->it_lock_handle = 0; + rc = mdc_revalidate_lock(exp, it, &op_data->op_fid2, NULL); + /* Only return failure if it was not GETATTR by cfid + * (from inode_revalidate()). + */ + if (rc || op_data->op_namelen != 0) + RETURN(rc); + } + + /* For case if upper layer did not alloc fid, do it now. 
*/ + if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) { + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc < 0) { + CERROR("%s: cannot allocate new FID: rc=%d\n", + exp->exp_obd->obd_name, rc); + RETURN(rc); + } + } + + rc = mdc_enqueue_base(exp, &einfo, NULL, it, op_data, &lockh, + extra_lock_flags); + if (rc < 0) + RETURN(rc); + + *reqp = it->it_request; + rc = mdc_finish_intent_lock(exp, *reqp, op_data, it, &lockh); + RETURN(rc); +} + +static int mdc_intent_getattr_async_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + struct mdc_getattr_args *ga = args; + struct obd_export *exp = ga->ga_exp; + struct md_enqueue_info *minfo = ga->ga_minfo; + struct ldlm_enqueue_info *einfo = &minfo->mi_einfo; + struct lookup_intent *it = &minfo->mi_it; + struct lustre_handle *lockh = &minfo->mi_lockh; + struct ldlm_reply *lockrep; + __u64 flags = LDLM_FL_HAS_INTENT; + + ENTRY; + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE)) + rc = -ETIMEDOUT; + + rc = ldlm_cli_enqueue_fini(exp, req, einfo, 1, &flags, NULL, 0, + lockh, rc, true); + if (rc < 0) { + CERROR("%s: ldlm_cli_enqueue_fini() failed: rc = %d\n", + exp->exp_obd->obd_name, rc); + mdc_clear_replay_flag(req, rc); + GOTO(out, rc); + } + + lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); + + lockrep->lock_policy_res2 = + ptlrpc_status_ntoh(lockrep->lock_policy_res2); + + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); + if (rc) + GOTO(out, rc); + + rc = mdc_finish_intent_lock(exp, req, &minfo->mi_data, it, lockh); + EXIT; + +out: + minfo->mi_cb(req, minfo, rc); + return 0; +} + +int mdc_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + struct lookup_intent *it = &minfo->mi_it; + struct ptlrpc_request *req; + struct mdc_getattr_args *ga; + struct ldlm_res_id res_id; + union ldlm_policy_data policy = { + .l_inodebits = { MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE } + }; + __u64 flags = LDLM_FL_HAS_INTENT; + int rc = 0; + + ENTRY; + CDEBUG(D_DLMTRACE, + "name: %.*s in inode "DFID", intent: %s flags %#llo\n", + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), it->it_flags); + + fid_build_reg_res_name(&op_data->op_fid1, &res_id); + /* If the MDT return -ERANGE because of large ACL, then the sponsor + * of the async getattr RPC will handle that by itself. + */ + req = mdc_intent_getattr_pack(exp, it, op_data, + LUSTRE_POSIX_ACL_MAX_SIZE_OLD); + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + /* With Data-on-MDT the glimpse callback is needed too. + * It is set here in advance but not in mdc_finish_enqueue() + * to avoid possible races. It is safe to have glimpse handler + * for non-DOM locks and costs nothing. 
+ */ + if (minfo->mi_einfo.ei_cb_gl == NULL) + minfo->mi_einfo.ei_cb_gl = mdc_ldlm_glimpse_ast; + + rc = ldlm_cli_enqueue(exp, &req, &minfo->mi_einfo, &res_id, &policy, + &flags, NULL, 0, LVB_T_NONE, &minfo->mi_lockh, 1); + if (rc < 0) { + ptlrpc_req_finished(req); + RETURN(rc); + } + + ga = ptlrpc_req_async_args(ga, req); + ga->ga_exp = exp; + ga->ga_minfo = minfo; + + req->rq_interpret_reply = mdc_intent_getattr_async_interpret; + ptlrpcd_add_req(req); + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c new file mode 100644 index 0000000000000..f75d559981d5a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c @@ -0,0 +1,536 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include + +#include +#include "mdc_internal.h" +#include + +/* mdc_setattr does its own semaphore handling */ +static int mdc_reint(struct ptlrpc_request *request, int level) +{ + int rc; + + request->rq_send_state = level; + + ptlrpc_get_mod_rpc_slot(request); + rc = ptlrpc_queue_wait(request); + ptlrpc_put_mod_rpc_slot(request); + if (rc) + CDEBUG(D_INFO, "error in handling %d\n", rc); + else if (!req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY)) { + rc = -EPROTO; + } + return rc; +} + +/* Find and cancel locally locks matched by inode @bits & @mode in the resource + * found by @fid. Found locks are added into @cancel list. Returns the amount of + * locks added to @cancels list. */ +int mdc_resource_get_unused_res(struct obd_export *exp, + struct ldlm_res_id *res_id, + struct list_head *cancels, + enum ldlm_mode mode, __u64 bits) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + union ldlm_policy_data policy = { { 0 } }; + struct ldlm_resource *res; + int count; + + ENTRY; + + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. */ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + RETURN(0); + + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (IS_ERR(res)) + RETURN(0); + LDLM_RESOURCE_ADDREF(res); + /* Initialize ibits lock policy. 
*/ + policy.l_inodebits.bits = bits; + count = ldlm_cancel_resource_local(res, cancels, &policy, mode, 0, 0, + NULL); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(count); +} + +int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, + struct list_head *cancels, enum ldlm_mode mode, + __u64 bits) +{ + struct ldlm_res_id res_id; + + fid_build_reg_res_name(fid, &res_id); + return mdc_resource_get_unused_res(exp, &res_id, cancels, mode, bits); +} + +int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct ptlrpc_request *req; + int count = 0, rc; + __u64 bits; + ENTRY; + + LASSERT(op_data != NULL); + + bits = MDS_INODELOCK_UPDATE; + if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) + bits |= MDS_INODELOCK_LOOKUP; + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, bits); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_SETATTR); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_EPOCH, RCL_CLIENT, 0); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, ealen); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_CLIENT, 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime %lld, ctime %lld\n", + (s64)op_data->op_attr.ia_mtime.tv_sec, + (s64)op_data->op_attr.ia_ctime.tv_sec); + mdc_setattr_pack(&req->rq_pill, op_data, ea, ealen); + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; + + *request = req; + + RETURN(rc); +} + +int mdc_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, size_t datalen, + umode_t mode, uid_t uid, gid_t gid, + kernel_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int level, rc; + int count, resends = 0; + struct obd_import *import = exp->exp_obd->u.cli.cl_import; + int generation = import->imp_generation; + LIST_HEAD(cancels); + ENTRY; + + /* For case if upper layer did not alloc fid, do it now. */ + if (!fid_is_sane(&op_data->op_fid2)) { + /* + * mdc_fid_alloc() may return errno 1 in case of switch to new + * sequence, handle this. + */ + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc < 0) + RETURN(rc); + } + +rebuild: + count = 0; + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_CREATE_ACL); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + data && datalen ? datalen : 0); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT, op_data->op_file_secctx_name != NULL ? 
+ strlen(op_data->op_file_secctx_name) + 1 : 0); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, RCL_CLIENT, + op_data->op_file_secctx_size); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, RCL_CLIENT, + op_data->op_file_encctx_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* + * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with + * tgt, for symlinks or lov MD data. + */ + mdc_create_pack(&req->rq_pill, op_data, data, datalen, mode, uid, + gid, cap_effective, rdev); + + ptlrpc_request_set_replen(req); + + /* ask ptlrpc not to resend on EINPROGRESS since we have our own retry + * logic here */ + req->rq_no_retry_einprogress = 1; + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = ktime_get_real_seconds() + resends; + } + level = LUSTRE_IMP_FULL; + resend: + rc = mdc_reint(req, level); + + /* Resend if we were told to. */ + if (rc == -ERESTARTSYS) { + level = LUSTRE_IMP_RECOVER; + goto resend; + } else if (rc == -EINPROGRESS) { + /* Retry create infinitely until succeed or get other + * error code or interrupted. */ + ptlrpc_req_finished(req); + if (generation == import->imp_generation) { + if (signal_pending(current)) + RETURN(-EINTR); + + resends++; + CDEBUG(D_HA, "%s: resend:%d create on "DFID"/"DFID"\n", + exp->exp_obd->obd_name, resends, + PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2)); + goto rebuild; + } else { + CDEBUG(D_HA, "resend cross eviction\n"); + RETURN(-EIO); + } + } + + *request = req; + RETURN(rc); +} + +int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req = *request; + int count = 0, rc; + ENTRY; + + LASSERT(req == NULL); + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3))) + /* cancel DOM lock only if it has no data to flush */ + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + op_data->op_cli_flags & + CLI_DIRTY_DATA ? + MDS_INODELOCK_ELC : + MDS_INODELOCK_FULL); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_UNLINK); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? 
+ strlen(req->rq_sepol) + 1 : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_unlink_pack(&req->rq_pill, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + + *request = req; + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; + RETURN(rc); +} + +int mdc_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct ptlrpc_request *req; + int count = 0, rc; + ENTRY; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_LINK); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_link_pack(&req->rq_pill, op_data); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; + + RETURN(rc); +} + +int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, size_t oldlen, const char *new, size_t newlen, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req; + int count = 0, rc; + + ENTRY; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count += mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3))) + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_LOOKUP); + if ((op_data->op_flags & MF_MDC_CANCEL_FID4) && + (fid_is_sane(&op_data->op_fid4))) + count += mdc_resource_get_unused(exp, &op_data->op_fid4, + &cancels, LCK_EX, + MDS_INODELOCK_ELC); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + op_data->op_cli_flags & CLI_MIGRATE ? 
+ &RQF_MDS_REINT_MIGRATE : &RQF_MDS_REINT_RENAME); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1); + if (op_data->op_cli_flags & CLI_MIGRATE) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + op_data->op_data_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (exp_connect_cancelset(exp) && req) + ldlm_cli_cancel_list(&cancels, count, req, 0); + + if (op_data->op_cli_flags & CLI_MIGRATE) + mdc_migrate_pack(&req->rq_pill, op_data, old, oldlen); + else + mdc_rename_pack(&req->rq_pill, op_data, old, oldlen, + new, newlen); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; + + RETURN(rc); +} + +int mdc_file_resync(struct obd_export *exp, struct md_op_data *op_data) +{ + LIST_HEAD(cancels); + struct ptlrpc_request *req; + struct ldlm_lock *lock; + struct mdt_rec_resync *rec; + int count = 0, rc; + ENTRY; + + if (op_data->op_flags & MF_MDC_CANCEL_FID1 && + fid_is_sane(&op_data->op_fid1)) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_LAYOUT); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_RESYNC); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + BUILD_BUG_ON(sizeof(*rec) != sizeof(struct mdt_rec_reint)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + rec->rs_opcode = REINT_RESYNC; + rec->rs_fsuid = op_data->op_fsuid; + rec->rs_fsgid = op_data->op_fsgid; + rec->rs_cap = op_data->op_cap.cap[0]; + rec->rs_fid = op_data->op_fid1; + rec->rs_bias = op_data->op_bias; + rec->rs_mirror_id = op_data->op_mirror_id; + + lock = ldlm_handle2lock(&op_data->op_lease_handle); + if (lock != NULL) { + rec->rs_lease_handle = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } + + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; + + ptlrpc_req_finished(req); + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c new file mode 100644 index 0000000000000..93db70c6dc229 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c @@ -0,0 +1,3077 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mdc_internal.h" + +#define REQUEST_MINOR 244 + +static int mdc_cleanup(struct obd_device *obd); + +static inline int mdc_queue_wait(struct ptlrpc_request *req) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + int rc; + + /* obd_get_request_slot() ensures that this client has no more + * than cl_max_rpcs_in_flight RPCs simultaneously in flight + * against an MDT. */ + rc = obd_get_request_slot(cli); + if (rc != 0) + return rc; + + rc = ptlrpc_queue_wait(req); + obd_put_request_slot(cli); + + return rc; +} + +/* + * Send MDS_GET_ROOT RPC to fetch root FID. + * + * If \a fileset is not NULL it should contain a subdirectory off + * the ROOT/ directory to be mounted on the client. Return the FID + * of the subdirectory to the client to mount onto its mountpoint. + * + * \param[in] imp MDC import + * \param[in] fileset fileset name, which could be NULL + * \param[out] rootfid root FID of this mountpoint + * \param[out] pc root capa will be unpacked and saved in this pointer + * + * \retval 0 on success, negative errno on failure + */ +static int mdc_get_root(struct obd_export *exp, const char *fileset, + struct lu_fid *rootfid) +{ + struct ptlrpc_request *req; + struct mdt_body *body; + int rc; + + ENTRY; + + if (fileset && !(exp_connect_flags(exp) & OBD_CONNECT_SUBTREE)) + RETURN(-ENOTSUPP); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_GET_ROOT); + if (req == NULL) + RETURN(-ENOMEM); + + if (fileset != NULL) + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + strlen(fileset) + 1); + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_ROOT); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + mdc_pack_body(&req->rq_pill, NULL, 0, 0, -1, 0); + if (fileset != NULL) { + char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + + memcpy(name, fileset, strlen(fileset)); + } + lustre_msg_add_flags(req->rq_reqmsg, LUSTRE_IMP_FULL); + req->rq_send_state = LUSTRE_IMP_FULL; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + *rootfid = body->mbo_fid1; + CDEBUG(D_NET, "root fid="DFID", last_committed=%llu\n", + PFID(rootfid), lustre_msg_get_last_committed(req->rq_repmsg)); + EXIT; +out: + ptlrpc_req_finished(req); + + return rc; +} + +/* + * This function is known to always say that it will receive 4 buffers + * from the server. Even when acl_size and md_size are zero, the RPC header + * will contain 4 fields and the RPC itself will contain zero-size fields.
This is + * because mdt_getattr*() _always_ returns 4 fields, but if acl is not needed + * and thus zero, it shrinks it, making zero size. The same story about + * md_size. And this is course of problem when client waits for smaller number + * of fields. This issue will be fixed later when client gets aware of RPC + * layouts. --umka + */ +static int mdc_getattr_common(struct obd_export *exp, + struct ptlrpc_request *req, + struct md_op_data *op_data) +{ + struct req_capsule *pill = &req->rq_pill; + struct mdt_body *body; + void *eadata; + int rc; + ENTRY; + + /* Request message already built. */ + rc = ptlrpc_queue_wait(req); + if (rc != 0) + RETURN(rc); + + /* sanity check for the reply */ + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + CDEBUG(D_NET, "mode: %o\n", body->mbo_mode); + + mdc_update_max_ea_from_body(exp, body); + if (body->mbo_eadatasize != 0) { + eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + body->mbo_eadatasize); + if (eadata == NULL) + RETURN(-EPROTO); + } + + /* If encryption context was returned by MDT, put it in op_data + * so that caller can set it on inode and save an extra getxattr. + */ + if (op_data && op_data->op_valid & OBD_MD_ENCCTX && + body->mbo_valid & OBD_MD_ENCCTX) { + op_data->op_file_encctx = + req_capsule_server_get(pill, &RMF_FILE_ENCCTX); + op_data->op_file_encctx_size = + req_capsule_get_size(pill, &RMF_FILE_ENCCTX, + RCL_SERVER); + } + + RETURN(0); +} + +static void mdc_reset_acl_req(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_early_free_lock); + sptlrpc_cli_free_repbuf(req); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + req->rq_repdata = NULL; + req->rq_reqdata_len = 0; + spin_unlock(&req->rq_early_free_lock); +} + +static int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + struct obd_device *obd = class_exp2obd(exp); + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + int rc; + ENTRY; + + /* Single MDS without an LMV case */ + if (op_data->op_flags & MF_GET_MDT_IDX) { + op_data->op_mds = 0; + RETURN(0); + } + + *request = NULL; + req = ptlrpc_request_alloc(imp, &RQF_MDS_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* LU-15245: avoid deadlock with modifying RPCs on MDS_REQUEST_PORTAL */ + req->rq_request_portal = MDS_READPAGE_PORTAL; + +again: + mdc_pack_body(&req->rq_pill, &op_data->op_fid1, op_data->op_valid, + op_data->op_mode, -1, 0); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + if (exp_connect_encrypt(exp) && op_data->op_valid & OBD_MD_ENCCTX) + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, + RCL_SERVER, + obd->u.cli.cl_max_mds_easize); + else + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, + RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + rc = mdc_getattr_common(exp, req, op_data); + if (rc) { + if (rc == -ERANGE) { + acl_bufsize = min_t(__u32, + imp->imp_connect_data.ocd_max_easize, + XATTR_SIZE_MAX); + mdc_reset_acl_req(req); + goto again; + } + + ptlrpc_req_finished(req); + } else { + *request = req; + } + + RETURN(rc); +} + +static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct 
ptlrpc_request *req; + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(imp, &RQF_MDS_GETATTR_NAME); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR_NAME); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (op_data->op_name) { + char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LASSERT(strnlen(op_data->op_name, op_data->op_namelen) == + op_data->op_namelen); + memcpy(name, op_data->op_name, op_data->op_namelen); + } + +again: + mdc_pack_body(&req->rq_pill, &op_data->op_fid1, op_data->op_valid, + op_data->op_mode, op_data->op_suppgids[0], 0); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + if (op_data->op_bias & MDS_FID_OP) { + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + if (b) { + b->mbo_valid |= OBD_MD_NAMEHASH; + b->mbo_fid2 = op_data->op_fid2; + } + } + + rc = mdc_getattr_common(exp, req, NULL); + if (rc) { + if (rc == -ERANGE) { + acl_bufsize = min_t(__u32, + imp->imp_connect_data.ocd_max_easize, + XATTR_SIZE_MAX); + mdc_reset_acl_req(req); + goto again; + } + + ptlrpc_req_finished(req); + } else { + *request = req; + } + + RETURN(rc); +} + +static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, + const struct lu_fid *fid, int opcode, u64 valid, + const char *xattr_name, const char *input, + int input_size, int output_size, int flags, + __u32 suppgid, struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int xattr_namelen = 0; + char *tmp; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt); + if (req == NULL) + RETURN(-ENOMEM); + + if (xattr_name) { + xattr_namelen = strlen(xattr_name) + 1; + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + xattr_namelen); + } + if (input_size) + LASSERT(input); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + input_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? 
+ strlen(req->rq_sepol) + 1 : 0); + + /* Flush local XATTR locks to get rid of a possible cancel RPC */ + if (opcode == MDS_REINT && fid_is_sane(fid) && + exp->exp_connect_data.ocd_ibits_known & MDS_INODELOCK_XATTR) { + LIST_HEAD(cancels); + int count; + + /* Without that packing would fail */ + if (input_size == 0) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_CLIENT, 0); + + count = mdc_resource_get_unused(exp, fid, + &cancels, LCK_EX, + MDS_INODELOCK_XATTR); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + } else { + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, opcode); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + } + + if (opcode == MDS_REINT) { + struct mdt_rec_setxattr *rec; + + BUILD_BUG_ON(sizeof(struct mdt_rec_setxattr) != + sizeof(struct mdt_rec_reint)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + rec->sx_opcode = REINT_SETXATTR; + rec->sx_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->sx_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->sx_cap = current_cap().cap[0]; + rec->sx_suppgid1 = suppgid; + rec->sx_suppgid2 = -1; + rec->sx_fid = *fid; + rec->sx_valid = valid | OBD_MD_FLCTIME; + rec->sx_time = ktime_get_real_seconds(); + rec->sx_size = output_size; + rec->sx_flags = flags; + } else { + mdc_pack_body(&req->rq_pill, fid, valid, output_size, + suppgid, flags); + /* Avoid deadlock with modifying RPCs on MDS_REQUEST_PORTAL. + * See LU-15245. + */ + req->rq_request_portal = MDS_READPAGE_PORTAL; + } + + if (xattr_name) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + memcpy(tmp, xattr_name, xattr_namelen); + } + if (input_size) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, input, input_size); + } + + mdc_file_sepol_pack(&req->rq_pill); + + if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER)) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_SERVER, output_size); + ptlrpc_request_set_replen(req); + + /* make rpc */ + if (opcode == MDS_REINT) + ptlrpc_get_mod_rpc_slot(req); + + rc = ptlrpc_queue_wait(req); + + if (opcode == MDS_REINT) + ptlrpc_put_mod_rpc_slot(req); + + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +static int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, + const void *value, size_t value_size, + unsigned int xattr_flags, u32 suppgid, + struct ptlrpc_request **req) +{ + LASSERT(obd_md_valid == OBD_MD_FLXATTR || + obd_md_valid == OBD_MD_FLXATTRRM); + + return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR, + fid, MDS_REINT, obd_md_valid, name, + value, value_size, 0, xattr_flags, suppgid, + req); +} + +static int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, size_t buf_size, + struct ptlrpc_request **req) +{ + struct mdt_body *body; + int rc; + + LASSERT(obd_md_valid == OBD_MD_FLXATTR || + obd_md_valid == OBD_MD_FLXATTRLS); + + /* Message below is checked in sanity-selinux test_20d + * and sanity-sec test_49 + */ + CDEBUG(D_INFO, "%s: get xattr '%s' for "DFID"\n", + exp->exp_obd->obd_name, name, PFID(fid)); + rc = mdc_xattr_common(exp, &RQF_MDS_GETXATTR, fid, MDS_GETXATTR, + obd_md_valid, name, NULL, 0, buf_size, 0, -1, + req); + if (rc < 0) + GOTO(out, rc); + + body = req_capsule_server_get(&(*req)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + /* only detect the xattr size */ + if 
(buf_size == 0) { + /* LU-11109: Older MDTs do not distinguish + * between nonexistent xattrs and zero length + * values in this case. Newer MDTs will return + * -ENODATA or set OBD_MD_FLXATTR. */ + GOTO(out, rc = body->mbo_eadatasize); + } + + if (body->mbo_eadatasize == 0) { + /* LU-11109: Newer MDTs set OBD_MD_FLXATTR on + * success so that we can distinguish between + * zero length value and nonexistent xattr. + * + * If OBD_MD_FLXATTR is not set then we keep + * the old behavior and return -ENODATA for + * getxattr() when mbo_eadatasize is 0. But + * -ENODATA only makes sense for getxattr() + * and not for listxattr(). */ + if (body->mbo_valid & OBD_MD_FLXATTR) + GOTO(out, rc = 0); + else if (obd_md_valid == OBD_MD_FLXATTR) + GOTO(out, rc = -ENODATA); + else + GOTO(out, rc = 0); + } + + GOTO(out, rc = body->mbo_eadatasize); +out: + if (rc < 0) { + ptlrpc_req_finished(*req); + *req = NULL; + } + + return rc; +} + +static int mdc_get_lustre_md(struct obd_export *exp, struct req_capsule *pill, + struct obd_export *dt_exp, + struct obd_export *md_exp, + struct lustre_md *md) +{ + int rc; + ENTRY; + + LASSERT(md); + memset(md, 0, sizeof(*md)); + + md->body = req_capsule_server_get(pill, &RMF_MDT_BODY); + LASSERT(md->body != NULL); + + if (md->body->mbo_valid & OBD_MD_FLEASIZE) { + if (!S_ISREG(md->body->mbo_mode)) { + CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, should be a " + "regular file, but is not\n"); + GOTO(out, rc = -EPROTO); + } + + if (md->body->mbo_eadatasize == 0) { + CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, " + "but eadatasize 0\n"); + GOTO(out, rc = -EPROTO); + } + + md->layout.lb_len = md->body->mbo_eadatasize; + md->layout.lb_buf = req_capsule_server_sized_get(pill, + &RMF_MDT_MD, + md->layout.lb_len); + if (md->layout.lb_buf == NULL) + GOTO(out, rc = -EPROTO); + } else if (md->body->mbo_valid & OBD_MD_FLDIREA) { + const union lmv_mds_md *lmv; + size_t lmv_size; + + if (!S_ISDIR(md->body->mbo_mode)) { + CDEBUG(D_INFO, "OBD_MD_FLDIREA set, should be a " + "directory, but is not\n"); + GOTO(out, rc = -EPROTO); + } + + if (md_exp->exp_obd->obd_type->typ_lu == &mdc_device_type) { + CERROR("%s: no LMV, upgrading from old version?\n", + md_exp->exp_obd->obd_name); + + GOTO(out_acl, rc = 0); + } + + if (md->body->mbo_valid & OBD_MD_MEA) { + lmv_size = md->body->mbo_eadatasize; + if (lmv_size == 0) { + CDEBUG(D_INFO, "OBD_MD_FLDIREA is set, " + "but eadatasize 0\n"); + RETURN(-EPROTO); + } + + lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + lmv_size); + if (lmv == NULL) + GOTO(out, rc = -EPROTO); + + rc = md_unpackmd(md_exp, &md->lmv, lmv, lmv_size); + if (rc < 0) + GOTO(out, rc); + + if (rc < (int)sizeof(*md->lmv)) { + struct lmv_foreign_md *lfm = md->lfm; + + /* short (< sizeof(struct lmv_stripe_md)) + * foreign LMV case + */ + if (lfm->lfm_magic != LMV_MAGIC_FOREIGN) { + CDEBUG(D_INFO, + "lmv size too small: %d < %d\n", + rc, (int)sizeof(*md->lmv)); + GOTO(out, rc = -EPROTO); + } + } + } + + /* since 2.12.58 intent_getattr fetches default LMV */ + if (md->body->mbo_valid & OBD_MD_DEFAULT_MEA) { + lmv_size = sizeof(struct lmv_user_md); + lmv = req_capsule_server_sized_get(pill, + &RMF_DEFAULT_MDT_MD, + lmv_size); + if (!lmv) + GOTO(out, rc = -EPROTO); + + rc = md_unpackmd(md_exp, &md->default_lmv, lmv, + lmv_size); + if (rc < 0) + GOTO(out, rc); + + if (rc < (int)sizeof(*md->default_lmv)) { + CDEBUG(D_INFO, + "default lmv size too small: %d < %d\n", + rc, (int)sizeof(*md->default_lmv)); + GOTO(out, rc = -EPROTO); + } + } + } + rc = 0; + +out_acl: + if (md->body->mbo_valid & 
OBD_MD_FLACL) { + /* for ACL, it's possible that FLACL is set but aclsize is zero. + * only when aclsize != 0 there's an actual segment for ACL + * in reply buffer. + */ + rc = mdc_unpack_acl(pill, md); + if (rc) + GOTO(out, rc); + } + + EXIT; +out: + if (rc) + lmd_clear_acl(md); + + return rc; +} + +static int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md) +{ + ENTRY; + RETURN(0); +} + +void mdc_replay_open(struct ptlrpc_request *req) +{ + struct md_open_data *mod = req->rq_cb_data; + struct ptlrpc_request *close_req; + struct obd_client_handle *och; + struct lustre_handle old_open_handle = { }; + struct mdt_body *body; + struct ldlm_reply *rep; + ENTRY; + + if (mod == NULL) { + DEBUG_REQ(D_ERROR, req, + "cannot properly replay without open data"); + EXIT; + return; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (rep != NULL && rep->lock_policy_res2 != 0) + DEBUG_REQ(D_ERROR, req, "Open request replay failed with %ld ", + (long int)rep->lock_policy_res2); + + spin_lock(&req->rq_lock); + och = mod->mod_och; + if (och && och->och_open_handle.cookie) + req->rq_early_free_repbuf = 1; + else + req->rq_early_free_repbuf = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_early_free_repbuf) { + struct lustre_handle *file_open_handle; + + LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC); + + file_open_handle = &och->och_open_handle; + CDEBUG(D_HA, "updating handle from %#llx to %#llx\n", + file_open_handle->cookie, body->mbo_open_handle.cookie); + old_open_handle = *file_open_handle; + *file_open_handle = body->mbo_open_handle; + } + + close_req = mod->mod_close_req; + if (close_req) { + __u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg); + struct mdt_ioepoch *epoch; + + LASSERT(opc == MDS_CLOSE); + epoch = req_capsule_client_get(&close_req->rq_pill, + &RMF_MDT_EPOCH); + LASSERT(epoch); + + if (req->rq_early_free_repbuf) + LASSERT(old_open_handle.cookie == + epoch->mio_open_handle.cookie); + + DEBUG_REQ(D_HA, close_req, "updating close body with new fh"); + epoch->mio_open_handle = body->mbo_open_handle; + } + EXIT; +} + +void mdc_commit_open(struct ptlrpc_request *req) +{ + struct md_open_data *mod = req->rq_cb_data; + if (mod == NULL) + return; + + /** + * No need to touch md_open_data::mod_och, it holds a reference on + * \var mod and will zero references to each other, \var mod will be + * freed after that when md_open_data::mod_och will put the reference. + */ + + /** + * Do not let open request to disappear as it still may be needed + * for close rpc to happen (it may happen on evict only, otherwise + * ptlrpc_request::rq_replay does not let mdc_commit_open() to be + * called), just mark this rpc as committed to distinguish these 2 + * cases, see mdc_close() for details. The open request reference will + * be put along with freeing \var mod. 
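+ *
+ * The "2 cases" above are told apart in mdc_close(): when a close
+ * returns -ESTALE, mdc_close() checks rq_committed on the matching
+ * open request and clears the error for an already-committed open.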
+ */ + ptlrpc_request_addref(req); + spin_lock(&req->rq_lock); + req->rq_committed = 1; + spin_unlock(&req->rq_lock); + req->rq_cb_data = NULL; + obd_mod_put(mod); +} + +int mdc_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + struct md_open_data *mod; + struct mdt_rec_create *rec; + struct mdt_body *body; + struct ptlrpc_request *open_req = it->it_request; + struct obd_import *imp = open_req->rq_import; + ENTRY; + + if (!open_req->rq_replay) + RETURN(0); + + rec = req_capsule_client_get(&open_req->rq_pill, &RMF_REC_REINT); + body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY); + LASSERT(rec != NULL); + /* Incoming message in my byte order (it's been swabbed). */ + /* Outgoing messages always in my byte order. */ + LASSERT(body != NULL); + + /* Only if the import is replayable, we set replay_open data */ + if (och && imp->imp_replayable) { + mod = obd_mod_alloc(); + if (mod == NULL) { + DEBUG_REQ(D_ERROR, open_req, + "cannot allocate md_open_data"); + RETURN(0); + } + + /** + * Take a reference on \var mod, to be freed on mdc_close(). + * It protects \var mod from being freed on eviction (commit + * callback is called despite rq_replay flag). + * Another reference for \var och. + */ + obd_mod_get(mod); + obd_mod_get(mod); + + spin_lock(&open_req->rq_lock); + och->och_mod = mod; + mod->mod_och = och; + mod->mod_is_create = it_disposition(it, DISP_OPEN_CREATE) || + it_disposition(it, DISP_OPEN_STRIPE); + mod->mod_open_req = open_req; + open_req->rq_cb_data = mod; + open_req->rq_commit_cb = mdc_commit_open; + open_req->rq_early_free_repbuf = 1; + spin_unlock(&open_req->rq_lock); + } + + rec->cr_fid2 = body->mbo_fid1; + rec->cr_open_handle_old = body->mbo_open_handle; + open_req->rq_replay_cb = mdc_replay_open; + if (!fid_is_sane(&body->mbo_fid1)) { + DEBUG_REQ(D_ERROR, open_req, + "saving replay request with insane FID " DFID, + PFID(&body->mbo_fid1)); + LBUG(); + } + + DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data"); + RETURN(0); +} + +static void mdc_free_open(struct md_open_data *mod) +{ + int committed = 0; + + if (mod->mod_is_create == 0 && + imp_connect_disp_stripe(mod->mod_open_req->rq_import)) + committed = 1; + + /** + * No reason to asssert here if the open request has + * rq_replay == 1. It means that mdc_close failed, and + * close request wasn`t sent. It is not fatal to client. + * The worst thing is eviction if the client gets open lock + **/ + + DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, + "free open request, rq_replay=%d", + mod->mod_open_req->rq_replay); + + ptlrpc_request_committed(mod->mod_open_req, committed); + if (mod->mod_close_req) + ptlrpc_request_committed(mod->mod_close_req, committed); +} + +static int mdc_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + struct md_open_data *mod = och->och_mod; + ENTRY; + + /** + * It is possible to not have \var mod in a case of eviction between + * lookup and ll_file_open(). 
+ **/ + if (mod == NULL) + RETURN(0); + + LASSERT(mod != LP_POISON); + LASSERT(mod->mod_open_req != NULL); + + spin_lock(&mod->mod_open_req->rq_lock); + if (mod->mod_och) + mod->mod_och->och_open_handle.cookie = 0; + mod->mod_open_req->rq_early_free_repbuf = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + mdc_free_open(mod); + + mod->mod_och = NULL; + och->och_mod = NULL; + obd_mod_put(mod); + + RETURN(0); +} + +static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, struct ptlrpc_request **request) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct req_format *req_fmt; + size_t u32_count = 0; + int rc; + int saved_rc = 0; + ENTRY; + + CDEBUG(D_INODE, "%s: "DFID" file closed with intent: %x\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + op_data->op_bias); + + if (op_data->op_bias & MDS_CLOSE_INTENT) { + req_fmt = &RQF_MDS_CLOSE_INTENT; + if (op_data->op_bias & MDS_HSM_RELEASE) { + /* allocate a FID for volatile file */ + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, + op_data); + if (rc < 0) { + CERROR("%s: "DFID" allocating FID: rc = %d\n", + obd->obd_name, PFID(&op_data->op_fid1), + rc); + /* save the errcode and proceed to close */ + saved_rc = rc; + } + } + if (op_data->op_bias & MDS_CLOSE_RESYNC_DONE) { + size_t count = op_data->op_data_size / sizeof(__u32); + + if (count > INLINE_RESYNC_ARRAY_SIZE) + u32_count = count; + } + } else { + req_fmt = &RQF_MDS_CLOSE; + } + + *request = NULL; + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_CLOSE)) + req = NULL; + else + req = ptlrpc_request_alloc(class_exp2cliimp(exp), req_fmt); + + /* Ensure that this close's handle is fixed up during replay. */ + if (likely(mod != NULL)) { + LASSERTF(mod->mod_open_req != NULL && + mod->mod_open_req->rq_type != LI_POISON, + "POISONED open %p!\n", mod->mod_open_req); + + mod->mod_close_req = req; + + DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, "matched open"); + /* We no longer want to preserve this open for replay even + * though the open was committed. 
b=3632, b=3633 */ + spin_lock(&mod->mod_open_req->rq_lock); + mod->mod_open_req->rq_replay = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + } else { + CDEBUG(D_HA, "couldn't find open req; expecting close error\n"); + } + if (req == NULL) { + /** + * TODO: repeat close after errors + */ + CWARN("%s: close of FID "DFID" failed, file reference will be " + "dropped when this client unmounts or is evicted\n", + obd->obd_name, PFID(&op_data->op_fid1)); + GOTO(out, rc = -ENOMEM); + } + + if (u32_count > 0) + req_capsule_set_size(&req->rq_pill, &RMF_U32, RCL_CLIENT, + u32_count * sizeof(__u32)); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + GOTO(out, rc); + } + + /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a + * portal whose threads are not taking any DLM locks and are therefore + * always progressing */ + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (!obd->u.cli.cl_lsom_update || + !(exp_connect_flags2(exp) & OBD_CONNECT2_LSOM)) + op_data->op_xvalid &= ~(OP_XVALID_LAZYSIZE | + OP_XVALID_LAZYBLOCKS); + + mdc_close_pack(&req->rq_pill, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + + ptlrpc_request_set_replen(req); + + ptlrpc_get_mod_rpc_slot(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_put_mod_rpc_slot(req); + + if (req->rq_repmsg == NULL) { + CDEBUG(D_RPCTRACE, "request %p failed to send: rc = %d\n", req, + req->rq_status); + if (rc == 0) + rc = req->rq_status ?: -EIO; + } else if (rc == 0 || rc == -EAGAIN) { + struct mdt_body *body; + + rc = lustre_msg_get_status(req->rq_repmsg); + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, + "type = PTL_RPC_MSG_ERR: rc = %d", rc); + if (rc > 0) + rc = -rc; + } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + rc = -EPROTO; + } else if (rc == -ESTALE) { + /** + * it can be allowed error after 3633 if open was committed and + * server failed before close was sent. Let's check if mod + * exists and return no error in that case + */ + if (mod) { + DEBUG_REQ(D_HA, req, "Reset ESTALE = %d", rc); + LASSERT(mod->mod_open_req != NULL); + if (mod->mod_open_req->rq_committed) + rc = 0; + } + } + +out: + if (mod) { + if (rc != 0) + mod->mod_close_req = NULL; + /* Since now, mod is accessed through open_req only, + * thus close req does not keep a reference on mod anymore. */ + obd_mod_put(mod); + } + *request = req; + + RETURN(rc < 0 ? 
rc : saved_rc); +} + +static int mdc_getpage(struct obd_export *exp, const struct lu_fid *fid, + u64 offset, struct page **pages, int npages, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + int i; + int resends = 0; + int rc; + ENTRY; + + *request = NULL; + +restart_bulk: + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_READPAGE); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_READPAGE); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + desc = ptlrpc_prep_bulk_imp(req, npages, 1, + PTLRPC_BULK_PUT_SINK, + MDS_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + if (desc == NULL) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + + /* NB req now owns desc and will free it when it gets freed */ + for (i = 0; i < npages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, pages[i], 0, + PAGE_SIZE); + + mdc_readdir_pack(&req->rq_pill, offset, PAGE_SIZE * npages, fid); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) { + ptlrpc_req_finished(req); + if (rc != -ETIMEDOUT) + RETURN(rc); + + resends++; + if (!client_should_resend(resends, &exp->exp_obd->u.cli)) { + CERROR("%s: too many resend retries: rc = %d\n", + exp->exp_obd->obd_name, -EIO); + RETURN(-EIO); + } + + /* If a signal interrupts then the timeout returned will + * not be zero. In that case return -EINTR + */ + if (msleep_interruptible(resends * 1000)) + RETURN(-EINTR); + + goto restart_bulk; + } + + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, + req->rq_bulk->bd_nob_transferred); + if (rc < 0) { + ptlrpc_req_finished(req); + RETURN(rc); + } + + if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) { + CERROR("%s: unexpected bytes transferred: %d (%ld expected)\n", + exp->exp_obd->obd_name, req->rq_bulk->bd_nob_transferred, + PAGE_SIZE * npages); + ptlrpc_req_finished(req); + RETURN(-EPROTO); + } + + *request = req; + RETURN(0); +} + +static void mdc_release_page(struct page *page, int remove) +{ + if (remove) { + lock_page(page); + if (likely(page->mapping != NULL)) + cfs_delete_from_page_cache(page); + unlock_page(page); + } + put_page(page); +} + +static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, + __u64 *start, __u64 *end, int hash64) +{ + /* + * Complement of hash is used as an index so that + * radix_tree_gang_lookup() can be used to find a page with starting + * hash _smaller_ than one we are looking for. + */ + unsigned long offset = hash_x_index(*hash, hash64); + struct page *page; + unsigned long flags; + int found; + + ll_xa_lock_irqsave(&mapping->i_pages, flags); + found = radix_tree_gang_lookup(&mapping->page_tree, + (void **)&page, offset, 1); + if (found > 0 && !ll_xa_is_value(page)) { + struct lu_dirpage *dp; + + get_page(page); + ll_xa_unlock_irqrestore(&mapping->i_pages, flags); + /* + * In contrast to find_lock_page() we are sure that directory + * page cannot be truncated (while DLM lock is held) and, + * hence, can avoid restart. + * + * In fact, page cannot be locked here at all, because + * mdc_read_page_remote does synchronous io. 
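+ *
+ * (Pages are indexed by the complement of their starting hash, so the
+ * gang lookup above returns the cached page whose ldp_hash_start is
+ * the largest value not exceeding *hash.)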
+ */ + wait_on_page_locked(page); + if (PageUptodate(page)) { + dp = kmap(page); + if (BITS_PER_LONG == 32 && hash64) { + *start = le64_to_cpu(dp->ldp_hash_start) >> 32; + *end = le64_to_cpu(dp->ldp_hash_end) >> 32; + *hash = *hash >> 32; + } else { + *start = le64_to_cpu(dp->ldp_hash_start); + *end = le64_to_cpu(dp->ldp_hash_end); + } + if (unlikely(*start == 1 && *hash == 0)) + *hash = *start; + else + LASSERTF(*start <= *hash, "start = %#llx" + ",end = %#llx,hash = %#llx\n", + *start, *end, *hash); + CDEBUG(D_VFSTRACE, "offset %lx [%#llx %#llx]," + " hash %#llx\n", offset, *start, *end, *hash); + if (*hash > *end) { + kunmap(page); + mdc_release_page(page, 0); + page = NULL; + } else if (*end != *start && *hash == *end) { + /* + * upon hash collision, remove this page, + * otherwise put page reference, and + * mdc_read_page_remote() will issue RPC to + * fetch the page we want. + */ + kunmap(page); + mdc_release_page(page, + le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + page = NULL; + } + } else { + put_page(page); + page = ERR_PTR(-EIO); + } + } else { + ll_xa_unlock_irqrestore(&mapping->i_pages, flags); + page = NULL; + } + return page; +} + +/* + * Adjust a set of pages, each page containing an array of lu_dirpages, + * so that each page can be used as a single logical lu_dirpage. + * + * A lu_dirpage is laid out as follows, where s = ldp_hash_start, + * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a + * struct lu_dirent. It has size up to LU_PAGE_SIZE. The ldp_hash_end + * value is used as a cookie to request the next lu_dirpage in a + * directory listing that spans multiple pages (two in this example): + * ________ + * | | + * .|--------v------- -----. + * |s|e|f|p|ent|ent| ... |ent| + * '--|-------------- -----' Each PAGE contains a single + * '------. lu_dirpage. + * .---------v------- -----. + * |s|e|f|p|ent| 0 | ... | 0 | + * '----------------- -----' + * + * However, on hosts where the native VM page size (PAGE_SIZE) is + * larger than LU_PAGE_SIZE, a single host page may contain multiple + * lu_dirpages. After reading the lu_dirpages from the MDS, the + * ldp_hash_end of the first lu_dirpage refers to the one immediately + * after it in the same PAGE (arrows simplified for brevity, but + * in general e0==s1, e1==s2, etc.): + * + * .-------------------- -----. + * |s0|e0|f0|p|ent|ent| ... |ent| + * |---v---------------- -----| + * |s1|e1|f1|p|ent|ent| ... |ent| + * |---v---------------- -----| Here, each PAGE contains + * ... multiple lu_dirpages. + * |---v---------------- -----| + * |s'|e'|f'|p|ent|ent| ... |ent| + * '---|---------------- -----' + * v + * .----------------------------. + * | next PAGE | + * + * This structure is transformed into a single logical lu_dirpage as follows: + * + * - Replace e0 with e' so the request for the next lu_dirpage gets the page + * labeled 'next PAGE'. + * + * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether + * a hash collision with the next page exists. + * + * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span + * to the first entry of the next lu_dirpage. 
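+ *
+ * For example, on a host with 64KB pages (and the usual 4KB
+ * LU_PAGE_SIZE) each PAGE holds up to 16 lu_dirpages; after this
+ * transformation readdir can treat the whole PAGE as a single
+ * lu_dirpage covering the hash range [s0, e'].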
+ */ +#if PAGE_SIZE > LU_PAGE_SIZE +static void mdc_adjust_dirpages(struct page **pages, int cfs_pgs, int lu_pgs) +{ + int i; + + for (i = 0; i < cfs_pgs; i++) { + struct lu_dirpage *dp = kmap(pages[i]); + struct lu_dirpage *first = dp; + struct lu_dirent *end_dirent = NULL; + struct lu_dirent *ent; + __u64 hash_end = dp->ldp_hash_end; + __u32 flags = dp->ldp_flags; + + while (--lu_pgs > 0) { + ent = lu_dirent_start(dp); + for (end_dirent = ent; ent != NULL; + end_dirent = ent, ent = lu_dirent_next(ent)); + + /* Advance dp to next lu_dirpage. */ + dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE); + + /* Check if we've reached the end of the PAGE. */ + if (!((unsigned long)dp & ~PAGE_MASK)) + break; + + /* Save the hash and flags of this lu_dirpage. */ + hash_end = dp->ldp_hash_end; + flags = dp->ldp_flags; + + /* Check if lu_dirpage contains no entries. */ + if (end_dirent == NULL) + break; + + /* Enlarge the end entry lde_reclen from 0 to + * first entry of next lu_dirpage. */ + LASSERT(le16_to_cpu(end_dirent->lde_reclen) == 0); + end_dirent->lde_reclen = + cpu_to_le16((char *)(dp->ldp_entries) - + (char *)end_dirent); + } + + first->ldp_hash_end = hash_end; + first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE); + first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE); + + kunmap(pages[i]); + } + LASSERTF(lu_pgs == 0, "left = %d\n", lu_pgs); +} +#else +#define mdc_adjust_dirpages(pages, cfs_pgs, lu_pgs) do {} while (0) +#endif /* PAGE_SIZE > LU_PAGE_SIZE */ + +/* parameters for readdir page */ +struct readpage_param { + struct md_op_data *rp_mod; + __u64 rp_off; + int rp_hash64; + struct obd_export *rp_exp; +}; + +/** + * Read pages from server. + * + * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains + * a header lu_dirpage which describes the start/end hash, and whether this + * page is empty (contains no dir entry) or hash collide with next page. + * After client receives reply, several pages will be integrated into dir page + * in PAGE_SIZE (if PAGE_SIZE greater than LU_PAGE_SIZE), and the + * lu_dirpage for this integrated page will be adjusted. 
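+ *
+ * For example, a reply carrying 20480 bytes fills five 4KB lu_dirpages,
+ * which is five PAGEs on a 4KB-page host but a single PAGE on a
+ * 64KB-page host; this is why the code below tracks the two counts
+ * (rd_pgs and lu_pgs) separately.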
+ **/ +static int ll_mdc_read_page_remote(void *data, struct page *page0) +{ + struct readpage_param *rp = data; + struct page **page_pool; + struct page *page; + struct lu_dirpage *dp; + struct md_op_data *op_data = rp->rp_mod; + struct ptlrpc_request *req; + int max_pages; + struct inode *inode; + struct lu_fid *fid; + int rd_pgs = 0; /* number of pages actually read */ + int npages; + int i; + int rc; + ENTRY; + + max_pages = rp->rp_exp->exp_obd->u.cli.cl_max_pages_per_rpc; + inode = op_data->op_data; + fid = &op_data->op_fid1; + LASSERT(inode != NULL); + + OBD_ALLOC_PTR_ARRAY_LARGE(page_pool, max_pages); + if (page_pool != NULL) { + page_pool[0] = page0; + } else { + page_pool = &page0; + max_pages = 1; + } + + for (npages = 1; npages < max_pages; npages++) { + page = page_cache_alloc(inode->i_mapping); + if (page == NULL) + break; + page_pool[npages] = page; + } + + rc = mdc_getpage(rp->rp_exp, fid, rp->rp_off, page_pool, npages, &req); + if (rc < 0) { + /* page0 is special, which was added into page cache early */ + cfs_delete_from_page_cache(page0); + } else { + int lu_pgs; + + rd_pgs = (req->rq_bulk->bd_nob_transferred + PAGE_SIZE - 1) >> + PAGE_SHIFT; + lu_pgs = req->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT; + LASSERT(!(req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK)); + + CDEBUG(D_INODE, "read %d(%d) pages\n", rd_pgs, lu_pgs); + + mdc_adjust_dirpages(page_pool, rd_pgs, lu_pgs); + + SetPageUptodate(page0); + } + unlock_page(page0); + + ptlrpc_req_finished(req); + CDEBUG(D_CACHE, "read %d/%d pages\n", rd_pgs, npages); + for (i = 1; i < npages; i++) { + unsigned long offset; + __u64 hash; + int ret; + + page = page_pool[i]; + + if (rc < 0 || i >= rd_pgs) { + put_page(page); + continue; + } + + SetPageUptodate(page); + + dp = kmap(page); + hash = le64_to_cpu(dp->ldp_hash_start); + kunmap(page); + + offset = hash_x_index(hash, rp->rp_hash64); + + prefetchw(&page->flags); + ret = add_to_page_cache_lru(page, inode->i_mapping, offset, + GFP_KERNEL); + if (ret == 0) + unlock_page(page); + else + CDEBUG(D_VFSTRACE, "page %lu add to page cache failed:" + " rc = %d\n", offset, ret); + put_page(page); + } + + if (page_pool != &page0) + OBD_FREE_PTR_ARRAY_LARGE(page_pool, max_pages); + + RETURN(rc); +} + +#ifdef HAVE_READ_CACHE_PAGE_WANTS_FILE +static inline int mdc_read_folio_remote(struct file *file, struct folio *folio) +{ + return ll_mdc_read_page_remote(file->private_data, + folio_page(folio, 0)); +} +#else +#define mdc_read_folio_remote ll_mdc_read_page_remote +#endif + +/** + * Read dir page from cache first, if it can not find it, read it from + * server and add into the cache. + * + * \param[in] exp MDC export + * \param[in] op_data client MD stack parameters, transfering parameters + * between different layers on client MD stack. 
+ * \param[in] mrinfo callback required for ldlm lock enqueue during + * read page + * \param[in] hash_offset the hash offset of the page to be read + * \param[in] ppage the page to be read + * + * retval = 0 get the page successfully + * errno(<0) get the page failed + */ +static int mdc_read_page(struct obd_export *exp, struct md_op_data *op_data, + struct md_readdir_info *mrinfo, __u64 hash_offset, + struct page **ppage) +{ + struct lookup_intent it = { .it_op = IT_READDIR }; + struct page *page; + struct inode *dir = op_data->op_data; + struct address_space *mapping; + struct lu_dirpage *dp; + __u64 start = 0; + __u64 end = 0; + struct lustre_handle lockh; + struct ptlrpc_request *enq_req = NULL; + struct readpage_param rp_param; + int rc; + + ENTRY; + + *ppage = NULL; + + LASSERT(dir != NULL); + mapping = dir->i_mapping; + + rc = mdc_intent_lock(exp, op_data, &it, &enq_req, + mrinfo->mr_blocking_ast, 0); + if (enq_req != NULL) + ptlrpc_req_finished(enq_req); + + if (rc < 0) { + CERROR("%s: "DFID" lock enqueue fails: rc = %d\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), rc); + RETURN(rc); + } + + rc = 0; + lockh.cookie = it.it_lock_handle; + mdc_set_lock_data(exp, &lockh, dir, NULL); + + rp_param.rp_off = hash_offset; + rp_param.rp_hash64 = op_data->op_cli_flags & CLI_HASH64; + page = mdc_page_locate(mapping, &rp_param.rp_off, &start, &end, + rp_param.rp_hash64); + if (IS_ERR(page)) { + CERROR("%s: dir page locate: "DFID" at %llu: rc %ld\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, PTR_ERR(page)); + GOTO(out_unlock, rc = PTR_ERR(page)); + } else if (page != NULL) { + /* + * XXX nikita: not entirely correct handling of a corner case: + * suppose hash chain of entries with hash value HASH crosses + * border between pages P0 and P1. First both P0 and P1 are + * cached, seekdir() is called for some entry from the P0 part + * of the chain. Later P0 goes out of cache. telldir(HASH) + * happens and finds P1, as it starts with matching hash + * value. Remaining entries from P0 part of the chain are + * skipped. (Is that really a bug?) + * + * Possible solutions: 0. don't cache P1 is such case, handle + * it as an "overflow" page. 1. invalidate all pages at + * once. 2. use HASH|1 as an index for P1. 
+ */ + GOTO(hash_collision, page); + } + + rp_param.rp_exp = exp; + rp_param.rp_mod = op_data; + page = ll_read_cache_page(mapping, + hash_x_index(rp_param.rp_off, + rp_param.rp_hash64), + mdc_read_folio_remote, &rp_param); + if (IS_ERR(page)) { + CDEBUG(D_INFO, "%s: read cache page: "DFID" at %llu: %ld\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, PTR_ERR(page)); + GOTO(out_unlock, rc = PTR_ERR(page)); + } + + wait_on_page_locked(page); + (void)kmap(page); + if (!PageUptodate(page)) { + CERROR("%s: page not updated: "DFID" at %llu: rc %d\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, -5); + goto fail; + } + if (!PageChecked(page)) + SetPageChecked(page); + if (PageError(page)) { + CERROR("%s: page error: "DFID" at %llu: rc %d\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, -5); + goto fail; + } + +hash_collision: + dp = page_address(page); + if (BITS_PER_LONG == 32 && rp_param.rp_hash64) { + start = le64_to_cpu(dp->ldp_hash_start) >> 32; + end = le64_to_cpu(dp->ldp_hash_end) >> 32; + rp_param.rp_off = hash_offset >> 32; + } else { + start = le64_to_cpu(dp->ldp_hash_start); + end = le64_to_cpu(dp->ldp_hash_end); + rp_param.rp_off = hash_offset; + } + if (end == start) { + LASSERT(start == rp_param.rp_off); + CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end); +#if BITS_PER_LONG == 32 + CWARN("Real page-wide hash collision at [%llu %llu] with " + "hash %llu\n", le64_to_cpu(dp->ldp_hash_start), + le64_to_cpu(dp->ldp_hash_end), hash_offset); +#endif + + /* + * Fetch whole overflow chain... + * + * XXX not yet. + */ + goto fail; + } + *ppage = page; +out_unlock: + ldlm_lock_decref(&lockh, it.it_lock_mode); + return rc; +fail: + kunmap(page); + mdc_release_page(page, 1); + rc = -EIO; + goto out_unlock; +} + +static int mdc_statfs_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct obd_info *oinfo = args; + struct obd_statfs *osfs; + + if (!rc) { + osfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (!osfs) + return -EPROTO; + + oinfo->oi_osfs = osfs; + + CDEBUG(D_CACHE, "blocks=%llu free=%llu avail=%llu " + "objects=%llu free=%llu state=%x\n", + osfs->os_blocks, osfs->os_bfree, osfs->os_bavail, + osfs->os_files, osfs->os_ffree, osfs->os_state); + } + + oinfo->oi_cb_up(oinfo, rc); + + return rc; +} + +static int mdc_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, time64_t max_age, + struct ptlrpc_request_set *unused) +{ + struct ptlrpc_request *req; + struct obd_info *aa; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_STATFS, + LUSTRE_MDS_VERSION, MDS_STATFS); + if (req == NULL) + return -ENOMEM; + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = mdc_statfs_interpret; + + aa = ptlrpc_req_async_args(aa, req); + *aa = *oinfo; + + ptlrpcd_add_req(req); + + return 0; +} + +static int mdc_statfs(const struct lu_env *env, + struct obd_export *exp, struct obd_statfs *osfs, + time64_t max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct req_format *fmt; + struct ptlrpc_request *req; + struct obd_statfs *msfs; + struct obd_import *imp, *imp0; + int rc; + ENTRY; + + /* + * Since the request might also come from lprocfs, so we need + * sync this with client_disconnect_export Bug15684 + */ + with_imp_locked(obd, imp0, rc) + imp = class_import_get(imp0); + if (rc) + RETURN(rc); + + fmt = &RQF_MDS_STATFS; + if ((exp_connect_flags2(exp) & OBD_CONNECT2_SUM_STATFS) && + (flags & 
OBD_STATFS_SUM)) + fmt = &RQF_MDS_STATFS_NEW; + req = ptlrpc_request_alloc_pack(imp, fmt, LUSTRE_MDS_VERSION, + MDS_STATFS); + if (req == NULL) + GOTO(output, rc = -ENOMEM); + req->rq_allow_intr = 1; + + if ((flags & OBD_STATFS_SUM) && + (exp_connect_flags2(exp) & OBD_CONNECT2_SUM_STATFS)) { + /* request aggregated states */ + struct mdt_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + body->mbo_valid = OBD_MD_FLAGSTATFS; + } + + ptlrpc_request_set_replen(req); + + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stay in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + rc = ptlrpc_queue_wait(req); + if (rc) { + /* check connection error first */ + if (imp->imp_connect_error) + rc = imp->imp_connect_error; + GOTO(out, rc); + } + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) + GOTO(out, rc = -EPROTO); + + *osfs = *msfs; + EXIT; +out: + ptlrpc_req_finished(req); +output: + class_import_put(imp); + return rc; +} + +static int mdc_ioc_fid2path(struct obd_export *exp, struct getinfo_fid2path *gf) +{ + __u32 keylen, vallen; + void *key; + int rc; + + if (gf->gf_pathlen > PATH_MAX) + RETURN(-ENAMETOOLONG); + if (gf->gf_pathlen < 2) + RETURN(-EOVERFLOW); + + /* Key is KEY_FID2PATH + getinfo_fid2path description */ + keylen = cfs_size_round(sizeof(KEY_FID2PATH) + sizeof(*gf) + + sizeof(struct lu_fid)); + OBD_ALLOC(key, keylen); + if (key == NULL) + RETURN(-ENOMEM); + memcpy(key, KEY_FID2PATH, sizeof(KEY_FID2PATH)); + memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)), gf, sizeof(*gf)); + memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)) + sizeof(*gf), + gf->gf_u.gf_root_fid, sizeof(struct lu_fid)); + CDEBUG(D_IOCTL, "path get "DFID" from %llu #%d\n", + PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno); + + if (!fid_is_sane(&gf->gf_fid)) + GOTO(out, rc = -EINVAL); + + /* Val is struct getinfo_fid2path result plus path */ + vallen = sizeof(*gf) + gf->gf_pathlen; + + rc = obd_get_info(NULL, exp, keylen, key, &vallen, gf); + if (rc != 0 && rc != -EREMOTE) + GOTO(out, rc); + + if (vallen <= sizeof(*gf)) + GOTO(out, rc = -EPROTO); + if (vallen > sizeof(*gf) + gf->gf_pathlen) + GOTO(out, rc = -EOVERFLOW); + + CDEBUG(D_IOCTL, "path got "DFID" from %llu #%d: %s\n", + PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno, + gf->gf_pathlen < 512 ? 
gf->gf_u.gf_path : + /* only log the last 512 characters of the path */ + gf->gf_u.gf_path + gf->gf_pathlen - 512); + +out: + OBD_FREE(key, keylen); + return rc; +} + +static int mdc_ioc_hsm_progress(struct obd_export *exp, + struct hsm_progress_kernel *hpk) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct hsm_progress_kernel *req_hpk; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_PROGRESS, + LUSTRE_MDS_VERSION, MDS_HSM_PROGRESS); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + mdc_pack_body(&req->rq_pill, NULL, 0, 0, -1, 0); + + /* Copy hsm_progress struct */ + req_hpk = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_PROGRESS); + if (req_hpk == NULL) + GOTO(out, rc = -EPROTO); + + *req_hpk = *hpk; + req_hpk->hpk_errval = lustre_errno_hton(hpk->hpk_errval); + + ptlrpc_request_set_replen(req); + + ptlrpc_get_mod_rpc_slot(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_put_mod_rpc_slot(req); + + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} +/** + * Send hsm_ct_register to MDS + * + * \param[in] imp import + * \param[in] archive_count if in bitmap format, it is the bitmap, + * else it is the count of archive_ids + * \param[in] archives if in bitmap format, it is NULL, + * else it is archive_id lists + */ +static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archive_count, + __u32 *archives) +{ + struct ptlrpc_request *req; + __u32 *archive_array; + size_t archives_size; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_CT_REGISTER); + if (req == NULL) + RETURN(-ENOMEM); + + if (archives != NULL) + archives_size = sizeof(*archive_array) * archive_count; + else + archives_size = sizeof(archive_count); + + req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_ARCHIVE, + RCL_CLIENT, archives_size); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_CT_REGISTER); + if (rc) { + ptlrpc_request_free(req); + RETURN(-ENOMEM); + } + + mdc_pack_body(&req->rq_pill, NULL, 0, 0, -1, 0); + + archive_array = req_capsule_client_get(&req->rq_pill, + &RMF_MDS_HSM_ARCHIVE); + if (archive_array == NULL) + GOTO(out, rc = -EPROTO); + + if (archives != NULL) + memcpy(archive_array, archives, archives_size); + else + *archive_array = archive_count; + + ptlrpc_request_set_replen(req); + req->rq_no_resend = 1; + + rc = mdc_queue_wait(req); + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_current_action(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_current_action *hca = op_data->op_data; + struct hsm_current_action *req_hca; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_ACTION); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_ACTION); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(&req->rq_pill, &op_data->op_fid1, 0, 0, + op_data->op_suppgids[0], 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + if (rc) + GOTO(out, rc); + + req_hca = req_capsule_server_get(&req->rq_pill, + &RMF_MDS_HSM_CURRENT_ACTION); + if (req_hca == NULL) + GOTO(out, rc = -EPROTO); + + *hca = *req_hca; + + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_ct_unregister(struct obd_import *imp) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_UNREGISTER, + LUSTRE_MDS_VERSION, + 
MDS_HSM_CT_UNREGISTER); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + mdc_pack_body(&req->rq_pill, NULL, 0, 0, -1, 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_state_get(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_user_state *hus = op_data->op_data; + struct hsm_user_state *req_hus; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_STATE_GET); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_GET); + if (rc != 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(&req->rq_pill, &op_data->op_fid1, 0, 0, + op_data->op_suppgids[0], 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + if (rc) + GOTO(out, rc); + + req_hus = req_capsule_server_get(&req->rq_pill, &RMF_HSM_USER_STATE); + if (req_hus == NULL) + GOTO(out, rc = -EPROTO); + + *hus = *req_hus; + + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_state_set(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_state_set *hss = op_data->op_data; + struct hsm_state_set *req_hss; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_STATE_SET); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_SET); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(&req->rq_pill, &op_data->op_fid1, 0, 0, + op_data->op_suppgids[0], 0); + + /* Copy states */ + req_hss = req_capsule_client_get(&req->rq_pill, &RMF_HSM_STATE_SET); + if (req_hss == NULL) + GOTO(out, rc = -EPROTO); + *req_hss = *hss; + + ptlrpc_request_set_replen(req); + + ptlrpc_get_mod_rpc_slot(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_put_mod_rpc_slot(req); + + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +/* For RESTORE and RELEASE the mdt will take EX lock on the file layout. + * So we can use early cancel on client side locks for that resource. 
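+ * Cancelling those client-side locks up front, before the HSM request
+ * is sent, typically saves the MDT a blocking callback round trip per
+ * affected file when it takes that EX layout lock.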
+ */ +static inline int mdc_hsm_request_lock_to_cancel(struct obd_export *exp, + struct hsm_user_request *hur, + struct list_head *cancels) +{ + struct hsm_user_item *hui = &hur->hur_user_item[0]; + struct hsm_request *req_hr = &hur->hur_request; + int count = 0; + int i; + + if (req_hr->hr_action != HUA_RESTORE && + req_hr->hr_action != HUA_RELEASE) + return 0; + + for (i = 0; i < req_hr->hr_itemcount; i++, hui++) { + if (!fid_is_sane(&hui->hui_fid)) + continue; + count += mdc_resource_get_unused(exp, &hui->hui_fid, cancels, + LCK_EX, MDS_INODELOCK_LAYOUT); + } + + return count; +} + +static int mdc_ioc_hsm_request(struct obd_export *exp, + struct hsm_user_request *hur) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct ptlrpc_request *req; + struct hsm_request *req_hr; + struct hsm_user_item *req_hui; + char *req_opaque; + LIST_HEAD(cancels); + int count; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_REQUEST); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM, RCL_CLIENT, + hur->hur_request.hr_itemcount + * sizeof(struct hsm_user_item)); + req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, RCL_CLIENT, + hur->hur_request.hr_data_len); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_REQUEST); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* Cancel existing locks */ + count = mdc_hsm_request_lock_to_cancel(exp, hur, &cancels); + ldlm_cli_cancel_list(&cancels, count, NULL, 0); + mdc_pack_body(&req->rq_pill, NULL, 0, 0, -1, 0); + + /* Copy hsm_request struct */ + req_hr = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_REQUEST); + if (req_hr == NULL) + GOTO(out, rc = -EPROTO); + *req_hr = hur->hur_request; + + /* Copy hsm_user_item structs */ + req_hui = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM); + if (req_hui == NULL) + GOTO(out, rc = -EPROTO); + memcpy(req_hui, hur->hur_user_item, + hur->hur_request.hr_itemcount * sizeof(struct hsm_user_item)); + + /* Copy opaque field */ + req_opaque = req_capsule_client_get(&req->rq_pill, &RMF_GENERIC_DATA); + if (req_opaque == NULL) + GOTO(out, rc = -EPROTO); + memcpy(req_opaque, hur_data(hur), hur->hur_request.hr_data_len); + + ptlrpc_request_set_replen(req); + + ptlrpc_get_mod_rpc_slot(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_put_mod_rpc_slot(req); + + GOTO(out, rc); + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_ct_start(struct obd_export *exp, + struct lustre_kernelcomm *lk); + +static int mdc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct ptlrpc_request *req; + struct obd_quotactl *oqc; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_QUOTACTL); + if (req == NULL) + RETURN(-ENOMEM); + + + if (LUSTRE_Q_CMD_IS_POOL(oqctl->qc_cmd)) + req_capsule_set_size(&req->rq_pill, + &RMF_OBD_QUOTACTL, + RCL_CLIENT, + sizeof(*oqc) + LOV_MAXPOOLNAME + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, + MDS_QUOTACTL); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + QCTL_COPY(oqc, oqctl); + + ptlrpc_request_set_replen(req); + ptlrpc_at_set_req_timeout(req); + + rc = ptlrpc_queue_wait(req); + if (rc) { + CERROR("%s: ptlrpc_queue_wait failed: rc = %d\n", + exp->exp_obd->obd_name, rc); + GOTO(out, rc); + } + + if (req->rq_repmsg && + (oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) { + QCTL_COPY(oqctl, oqc); 
+ } else if (!rc) { + rc = -EPROTO; + CERROR("%s: cannot unpack obd_quotactl: rc = %d\n", + exp->exp_obd->obd_name, rc); + } +out: + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static int mdc_ioc_swap_layouts(struct obd_export *exp, + struct md_op_data *op_data) +{ + LIST_HEAD(cancels); + struct ptlrpc_request *req; + int rc, count; + struct mdc_swap_layouts *msl, *payload; + ENTRY; + + msl = op_data->op_data; + + /* When the MDT will get the MDS_SWAP_LAYOUTS RPC the + * first thing it will do is to cancel the 2 layout + * locks held by this client. + * So the client must cancel its layout locks on the 2 fids + * with the request RPC to avoid extra RPC round trips. + */ + count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels, + LCK_EX, MDS_INODELOCK_LAYOUT | + MDS_INODELOCK_XATTR); + count += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels, + LCK_EX, MDS_INODELOCK_LAYOUT | + MDS_INODELOCK_XATTR); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_SWAP_LAYOUTS); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + rc = mdc_prep_elc_req(exp, req, MDS_SWAP_LAYOUTS, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_swap_layouts_pack(&req->rq_pill, op_data); + + payload = req_capsule_client_get(&req->rq_pill, &RMF_SWAP_LAYOUTS); + LASSERT(payload); + + *payload = *msl; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + EXIT; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct obd_ioctl_data *data = karg; + struct obd_import *imp = obd->u.cli.cl_import; + int rc; + ENTRY; + + if (!try_module_get(THIS_MODULE)) { + CERROR("%s: cannot get module '%s'\n", obd->obd_name, + module_name(THIS_MODULE)); + return -EINVAL; + } + switch (cmd) { + case OBD_IOC_FID2PATH: + rc = mdc_ioc_fid2path(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_CT_START: + rc = mdc_ioc_hsm_ct_start(exp, karg); + /* ignore if it was already registered on this MDS. */ + if (rc == -EEXIST) + rc = 0; + GOTO(out, rc); + case LL_IOC_HSM_PROGRESS: + rc = mdc_ioc_hsm_progress(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_STATE_GET: + rc = mdc_ioc_hsm_state_get(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_STATE_SET: + rc = mdc_ioc_hsm_state_set(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_ACTION: + rc = mdc_ioc_hsm_current_action(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_REQUEST: + rc = mdc_ioc_hsm_request(exp, karg); + GOTO(out, rc); + case OBD_IOC_CLIENT_RECOVER: + rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1, 0); + if (rc < 0) + GOTO(out, rc); + GOTO(out, rc = 0); + case IOC_OSC_SET_ACTIVE: + rc = ptlrpc_set_import_active(imp, data->ioc_offset); + GOTO(out, rc); + /* + * Normally IOC_OBD_STATFS, OBD_IOC_QUOTACTL iocontrol are handled by + * LMV instead of MDC. But when the cluster is upgraded from 1.8, + * there'd be no LMV layer thus we might be called here. Eventually + * this code should be removed. + * bz20731, LU-592. 
+ */ + case IOC_OBD_STATFS: { + struct obd_statfs stat_buf = {0}; + + if (*((__u32 *) data->ioc_inlbuf2) != 0) + GOTO(out, rc = -ENODEV); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(obd), + min((int)data->ioc_plen2, + (int)sizeof(struct obd_uuid)))) + GOTO(out, rc = -EFAULT); + + rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + 0); + if (rc != 0) + GOTO(out, rc); + + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + GOTO(out, rc = -EFAULT); + + GOTO(out, rc = 0); + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct obd_quotactl *oqctl; + + OBD_ALLOC_PTR(oqctl); + if (oqctl == NULL) + GOTO(out, rc = -ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = obd->u.cli.cl_target_uuid; + } + + OBD_FREE_PTR(oqctl); + GOTO(out, rc); + } + case LL_IOC_GET_CONNECT_FLAGS: + if (copy_to_user(uarg, exp_connect_flags_ptr(exp), + sizeof(*exp_connect_flags_ptr(exp)))) + GOTO(out, rc = -EFAULT); + + GOTO(out, rc = 0); + case LL_IOC_LOV_SWAP_LAYOUTS: + rc = mdc_ioc_swap_layouts(exp, karg); + GOTO(out, rc); + default: + CERROR("unrecognised ioctl: cmd = %#x\n", cmd); + GOTO(out, rc = -ENOTTY); + } +out: + module_put(THIS_MODULE); + + return rc; +} + +static int mdc_get_info_rpc(struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct ptlrpc_request *req; + char *tmp; + int rc = -EINVAL; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN, + RCL_CLIENT, sizeof(vallen)); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN); + memcpy(tmp, &vallen, sizeof(vallen)); + + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL, + RCL_SERVER, vallen); + ptlrpc_request_set_replen(req); + + /* if server failed to resolve FID, and OI scrub not able to fix it, it + * will return -EINPROGRESS, ptlrpc_queue_wait() will keep retrying, + * set request interruptible to avoid deadlock. 
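+ * (rq_allow_intr lets a signal interrupt that retry loop.)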
+ */ + if (KEY_IS(KEY_FID2PATH)) + req->rq_allow_intr = 1; + + rc = ptlrpc_queue_wait(req); + /* -EREMOTE means the get_info result is partial, and it needs to + * continue on another MDT, see fid2path part in lmv_iocontrol */ + if (rc == 0 || rc == -EREMOTE) { + tmp = req_capsule_server_get(&req->rq_pill, &RMF_GETINFO_VAL); + memcpy(val, tmp, vallen); + if (req_capsule_rep_need_swab(&req->rq_pill)) { + if (KEY_IS(KEY_FID2PATH)) + lustre_swab_fid2path(val); + } + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static void lustre_swab_hai(struct hsm_action_item *h) +{ + __swab32s(&h->hai_len); + __swab32s(&h->hai_action); + lustre_swab_lu_fid(&h->hai_fid); + lustre_swab_lu_fid(&h->hai_dfid); + __swab64s(&h->hai_cookie); + __swab64s(&h->hai_extent.offset); + __swab64s(&h->hai_extent.length); + __swab64s(&h->hai_gid); +} + +static void lustre_swab_hal(struct hsm_action_list *h) +{ + struct hsm_action_item *hai; + __u32 i; + + __swab32s(&h->hal_version); + __swab32s(&h->hal_count); + __swab32s(&h->hal_archive_id); + __swab64s(&h->hal_flags); + hai = hai_first(h); + for (i = 0; i < h->hal_count; i++, hai = hai_next(hai)) + lustre_swab_hai(hai); +} + +static void lustre_swab_kuch(struct kuc_hdr *l) +{ + __swab16s(&l->kuc_magic); + /* __u8 l->kuc_transport */ + __swab16s(&l->kuc_msgtype); + __swab16s(&l->kuc_msglen); +} + +static int mdc_ioc_hsm_ct_start(struct obd_export *exp, + struct lustre_kernelcomm *lk) +{ + struct obd_import *imp = class_exp2cliimp(exp); + int rc = 0; + + if (lk->lk_group != KUC_GRP_HSM) { + CERROR("Bad copytool group %d\n", lk->lk_group); + return -EINVAL; + } + + CDEBUG(D_HSM, "CT start r%d w%d u%d g%d f%#x\n", lk->lk_rfd, lk->lk_wfd, + lk->lk_uid, lk->lk_group, lk->lk_flags); + + if (lk->lk_flags & LK_FLG_STOP) { + /* Unregister with the coordinator */ + rc = mdc_ioc_hsm_ct_unregister(imp); + } else { + __u32 *archives = NULL; + + if ((lk->lk_flags & LK_FLG_DATANR) && lk->lk_data_count > 0) + archives = lk->lk_data; + + rc = mdc_ioc_hsm_ct_register(imp, lk->lk_data_count, archives); + } + + return rc; +} + +/** + * Send a message to any listening copytools + * @param val KUC message (kuc_hdr + hsm_action_list) + * @param len total length of message + */ +static int mdc_hsm_copytool_send(const struct obd_uuid *uuid, + size_t len, void *val) +{ + struct kuc_hdr *lh = (struct kuc_hdr *)val; + struct hsm_action_list *hal = (struct hsm_action_list *)(lh + 1); + int rc; + ENTRY; + + if (len < sizeof(*lh) + sizeof(*hal)) { + CERROR("Short HSM message %zu < %zu\n", len, + sizeof(*lh) + sizeof(*hal)); + RETURN(-EPROTO); + } + if (lh->kuc_magic == __swab16(KUC_MAGIC)) { + lustre_swab_kuch(lh); + lustre_swab_hal(hal); + } else if (lh->kuc_magic != KUC_MAGIC) { + CERROR("Bad magic %x!=%x\n", lh->kuc_magic, KUC_MAGIC); + RETURN(-EPROTO); + } + + CDEBUG(D_HSM, " Received message mg=%x t=%d m=%d l=%d actions=%d " + "on %s\n", + lh->kuc_magic, lh->kuc_transport, lh->kuc_msgtype, + lh->kuc_msglen, hal->hal_count, hal->hal_fsname); + + /* Broadcast to HSM listeners */ + rc = libcfs_kkuc_group_put(uuid, KUC_GRP_HSM, lh); + + RETURN(rc); +} + +/** + * callback function passed to kuc for re-registering each HSM copytool + * running on MDC, after MDT shutdown/recovery. 
+ * @param data copytool registration data + * @param cb_arg callback argument (obd_import) + */ +static int mdc_hsm_ct_reregister(void *data, void *cb_arg) +{ + struct obd_import *imp = (struct obd_import *)cb_arg; + struct kkuc_ct_data *kcd = data; + __u32 *archives = NULL; + int rc; + + if (kcd == NULL || + (kcd->kcd_magic != KKUC_CT_DATA_ARRAY_MAGIC && + kcd->kcd_magic != KKUC_CT_DATA_BITMAP_MAGIC)) + return -EPROTO; + + if (kcd->kcd_magic == KKUC_CT_DATA_BITMAP_MAGIC) { + CDEBUG(D_HA, "%s: recover copytool registration to MDT " + "(archive=%#x)\n", imp->imp_obd->obd_name, + kcd->kcd_nr_archives); + } else { + CDEBUG(D_HA, "%s: recover copytool registration to MDT " + "(archive nr = %u)\n", + imp->imp_obd->obd_name, kcd->kcd_nr_archives); + if (kcd->kcd_nr_archives != 0) + archives = kcd->kcd_archives; + } + + rc = mdc_ioc_hsm_ct_register(imp, kcd->kcd_nr_archives, archives); + /* ignore error if the copytool is already registered */ + return (rc == -EEXIST) ? 0 : rc; +} + +/** + * Re-establish all kuc contexts with MDT + * after MDT shutdown/recovery. + */ +static int mdc_kuc_reregister(struct obd_import *imp) +{ + /* re-register HSM agents */ + return libcfs_kkuc_group_foreach(&imp->imp_obd->obd_uuid, KUC_GRP_HSM, + mdc_hsm_ct_reregister, imp); +} + +static int mdc_set_info_async(const struct lu_env *env, + struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct obd_import *imp = class_exp2cliimp(exp); + int rc; + ENTRY; + + if (KEY_IS(KEY_READ_ONLY)) { + if (vallen != sizeof(int)) + RETURN(-EINVAL); + + spin_lock(&imp->imp_lock); + if (*((int *)val)) { + imp->imp_connect_flags_orig |= OBD_CONNECT_RDONLY; + imp->imp_connect_data.ocd_connect_flags |= + OBD_CONNECT_RDONLY; + } else { + imp->imp_connect_flags_orig &= ~OBD_CONNECT_RDONLY; + imp->imp_connect_data.ocd_connect_flags &= + ~OBD_CONNECT_RDONLY; + } + spin_unlock(&imp->imp_lock); + + rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); + RETURN(rc); + } + if (KEY_IS(KEY_CHANGELOG_CLEAR)) { + rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); + RETURN(rc); + } + if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) { + rc = mdc_hsm_copytool_send(&imp->imp_obd->obd_uuid, vallen, + val); + RETURN(rc); + } + + if (KEY_IS(KEY_DEFAULT_EASIZE)) { + __u32 *default_easize = val; + + exp->exp_obd->u.cli.cl_default_mds_easize = *default_easize; + RETURN(0); + } + + rc = osc_set_info_async(env, exp, keylen, key, vallen, val, set); + RETURN(rc); +} + +static int mdc_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + int rc = -EINVAL; + + if (KEY_IS(KEY_MAX_EASIZE)) { + __u32 mdsize, *max_easize; + + if (*vallen != sizeof(int)) + RETURN(-EINVAL); + mdsize = *(__u32 *)val; + if (mdsize > exp->exp_obd->u.cli.cl_max_mds_easize) + exp->exp_obd->u.cli.cl_max_mds_easize = mdsize; + max_easize = val; + *max_easize = exp->exp_obd->u.cli.cl_max_mds_easize; + RETURN(0); + } else if (KEY_IS(KEY_DEFAULT_EASIZE)) { + __u32 *default_easize; + + if (*vallen != sizeof(int)) + RETURN(-EINVAL); + default_easize = val; + *default_easize = exp->exp_obd->u.cli.cl_default_mds_easize; + RETURN(0); + } else if (KEY_IS(KEY_CONN_DATA)) { + struct obd_import *imp = class_exp2cliimp(exp); + struct obd_connect_data *data = val; + + if (*vallen != sizeof(*data)) + RETURN(-EINVAL); + + *data = imp->imp_connect_data; + RETURN(0); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((__u32 
*)val) = 1; + RETURN(0); + } + + rc = mdc_get_info_rpc(exp, keylen, key, *vallen, val); + + RETURN(rc); +} + +static int mdc_fsync(struct obd_export *exp, const struct lu_fid *fid, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_SYNC); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_SYNC); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(&req->rq_pill, fid, 0, 0, -1, 0); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +struct mdc_rmfid_args { + int *mra_rcs; + int mra_nr; +}; + +int mdc_rmfid_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *args, int rc) +{ + struct mdc_rmfid_args *aa; + int *rcs, size; + ENTRY; + + if (!rc) { + aa = ptlrpc_req_async_args(aa, req); + + size = req_capsule_get_size(&req->rq_pill, &RMF_RCS, + RCL_SERVER); + LASSERT(size == sizeof(int) * aa->mra_nr); + rcs = req_capsule_server_get(&req->rq_pill, &RMF_RCS); + LASSERT(rcs); + LASSERT(aa->mra_rcs); + LASSERT(aa->mra_nr); + memcpy(aa->mra_rcs, rcs, size); + } + + RETURN(rc); +} + +static int mdc_rmfid(struct obd_export *exp, struct fid_array *fa, + int *rcs, struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + struct mdc_rmfid_args *aa; + struct mdt_body *b; + struct lu_fid *tmp; + int rc, flen; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_RMFID); + if (req == NULL) + RETURN(-ENOMEM); + + flen = fa->fa_nr * sizeof(struct lu_fid); + req_capsule_set_size(&req->rq_pill, &RMF_FID_ARRAY, + RCL_CLIENT, flen); + req_capsule_set_size(&req->rq_pill, &RMF_FID_ARRAY, + RCL_SERVER, flen); + req_capsule_set_size(&req->rq_pill, &RMF_RCS, + RCL_SERVER, fa->fa_nr * sizeof(__u32)); + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_RMFID); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FID_ARRAY); + memcpy(tmp, fa->fa_fids, flen); + + mdc_pack_body(&req->rq_pill, NULL, 0, 0, -1, 0); + b = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + b->mbo_ctime = ktime_get_real_seconds(); + + ptlrpc_request_set_replen(req); + + LASSERT(rcs); + aa = ptlrpc_req_async_args(aa, req); + aa->mra_rcs = rcs; + aa->mra_nr = fa->fa_nr; + req->rq_interpret_reply = mdc_rmfid_interpret; + + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + + RETURN(rc); +} + +static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, + enum obd_import_event event) +{ + struct client_obd *cli = &obd->u.cli; + int rc = 0; + + LASSERT(imp->imp_obd == obd); + + switch (event) { + case IMP_EVENT_DISCON: + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = 0; + cli->cl_lost_grant = 0; + spin_unlock(&cli->cl_loi_list_lock); + break; + case IMP_EVENT_INACTIVE: + /* + * Flush current sequence to make client obtain new one + * from server in case of disconnect/reconnect. + */ + down_read(&cli->cl_seq_rwsem); + if (cli->cl_seq) + seq_client_flush(cli->cl_seq); + up_read(&cli->cl_seq_rwsem); + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE); + break; + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + struct lu_env *env; + __u16 refcheck; + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + /* Reset grants. 
All pages go to failing rpcs due to + * the invalid import. + */ + osc_io_unplug(env, cli, NULL); + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + osc_ldlm_resource_invalidate, + env, 0); + cl_env_put(env, &refcheck); + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + } else { + rc = PTR_ERR(env); + } + break; + } + case IMP_EVENT_ACTIVE: + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE); + /* redo the kuc registration after reconnecting */ + if (rc == 0) + rc = mdc_kuc_reregister(imp); + break; + case IMP_EVENT_OCD: { + struct obd_connect_data *ocd = &imp->imp_connect_data; + + if (OCD_HAS_FLAG(ocd, GRANT)) + osc_init_grant(cli, ocd); + + md_init_ea_size(obd->obd_self_export, ocd->ocd_max_easize, 0); + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD); + break; + } + case IMP_EVENT_DEACTIVATE: + case IMP_EVENT_ACTIVATE: + break; + default: + CERROR("Unknown import event %x\n", event); + LBUG(); + } + RETURN(rc); +} + +int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + int rc = -EIO; + + ENTRY; + + down_read(&cli->cl_seq_rwsem); + if (cli->cl_seq) + rc = seq_client_alloc_fid(env, cli->cl_seq, fid); + up_read(&cli->cl_seq_rwsem); + + RETURN(rc); +} + +static struct obd_uuid *mdc_get_uuid(struct obd_export *exp) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + return &cli->cl_target_uuid; +} + +/** + * Determine whether the lock can be canceled before replaying it during + * recovery, non zero value will be return if the lock can be canceled, + * or zero returned for not + */ +static int mdc_cancel_weight(struct ldlm_lock *lock) +{ + if (lock->l_resource->lr_type != LDLM_IBITS) + RETURN(0); + + /* FIXME: if we ever get into a situation where there are too many + * opened files with open locks on a single node, then we really + * should replay these open locks to reget it */ + if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN) + RETURN(0); + + /* Special case for DoM locks, cancel only unused and granted locks */ + if (ldlm_has_dom(lock) && + (lock->l_granted_mode != lock->l_req_mode || + osc_ldlm_weigh_ast(lock) != 0)) + RETURN(0); + + RETURN(1); +} + +static int mdc_resource_inode_free(struct ldlm_resource *res) +{ + if (res->lr_lvb_inode) + res->lr_lvb_inode = NULL; + + return 0; +} + +static struct ldlm_valblock_ops inode_lvbo = { + .lvbo_free = mdc_resource_inode_free +}; + +static int mdc_llog_init(struct obd_device *obd) +{ + struct obd_llog_group *olg = &obd->obd_olg; + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + rc = llog_setup(NULL, obd, olg, LLOG_CHANGELOG_REPL_CTXT, obd, + &llog_client_ops); + if (rc < 0) + RETURN(rc); + + ctxt = llog_group_get_ctxt(olg, LLOG_CHANGELOG_REPL_CTXT); + llog_initiator_connect(ctxt); + llog_ctxt_put(ctxt); + + RETURN(0); +} + +static void mdc_llog_finish(struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT); + if (ctxt != NULL) + llog_cleanup(NULL, ctxt); + + EXIT; +} + +int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) +{ + int rc; + + ENTRY; + + rc = osc_setup_common(obd, cfg); + if (rc < 0) + RETURN(rc); + + rc = mdc_tunables_init(obd); + if (rc) + GOTO(err_osc_cleanup, rc); + + obd->u.cli.cl_dom_min_inline_repsize = MDC_DOM_DEF_INLINE_REPSIZE; + obd->u.cli.cl_lsom_update = true; + + ns_register_cancel(obd->obd_namespace, mdc_cancel_weight); + + obd->obd_namespace->ns_lvbo = &inode_lvbo; + + rc = mdc_llog_init(obd); + if 
(rc) { + CERROR("%s: failed to setup llogging subsystems: rc = %d\n", + obd->obd_name, rc); + GOTO(err_llog_cleanup, rc); + } + + rc = mdc_changelog_cdev_init(obd); + if (rc) { + CERROR("%s: failed to setup changelog char device: rc = %d\n", + obd->obd_name, rc); + GOTO(err_changelog_cleanup, rc); + } + + RETURN(rc); + +err_changelog_cleanup: + mdc_llog_finish(obd); +err_llog_cleanup: + lprocfs_free_md_stats(obd); + ptlrpc_lprocfs_unregister_obd(obd); +err_osc_cleanup: + osc_cleanup_common(obd); + return rc; +} + +/* Initialize the default and maximum LOV EA sizes. This allows + * us to make MDS RPCs with large enough reply buffers to hold a default + * sized EA without having to calculate this (via a call into the + * LOV + OSCs) each time we make an RPC. The maximum size is also tracked + * but not used to avoid wastefully vmalloc()'ing large reply buffers when + * a large number of stripes is possible. If a larger reply buffer is + * required it will be reallocated in the ptlrpc layer due to overflow. + */ +static int mdc_init_ea_size(struct obd_export *exp, __u32 easize, + __u32 def_easize) +{ + struct obd_device *obd = exp->exp_obd; + struct client_obd *cli = &obd->u.cli; + ENTRY; + + if (cli->cl_max_mds_easize < easize) + cli->cl_max_mds_easize = easize; + + if (cli->cl_default_mds_easize < def_easize) + cli->cl_default_mds_easize = def_easize; + + RETURN(0); +} + +static int mdc_precleanup(struct obd_device *obd) +{ + ENTRY; + + osc_precleanup_common(obd); + mdc_changelog_cdev_finish(obd); + + obd_cleanup_client_import(obd); + ptlrpc_lprocfs_unregister_obd(obd); + lprocfs_free_md_stats(obd); + mdc_llog_finish(obd); + RETURN(0); +} + +static int mdc_cleanup(struct obd_device *obd) +{ + return osc_cleanup_common(obd); +} + +static const struct obd_ops mdc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mdc_setup, + .o_precleanup = mdc_precleanup, + .o_cleanup = mdc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_reconnect = osc_reconnect, + .o_disconnect = osc_disconnect, + .o_iocontrol = mdc_iocontrol, + .o_set_info_async = mdc_set_info_async, + .o_statfs = mdc_statfs, + .o_statfs_async = mdc_statfs_async, + .o_fid_init = client_fid_init, + .o_fid_fini = client_fid_fini, + .o_fid_alloc = mdc_fid_alloc, + .o_import_event = mdc_import_event, + .o_get_info = mdc_get_info, + .o_get_uuid = mdc_get_uuid, + .o_quotactl = mdc_quotactl, +}; + +static const struct md_ops mdc_md_ops = { + .m_get_root = mdc_get_root, + .m_null_inode = mdc_null_inode, + .m_close = mdc_close, + .m_create = mdc_create, + .m_enqueue = mdc_enqueue, + .m_getattr = mdc_getattr, + .m_getattr_name = mdc_getattr_name, + .m_intent_lock = mdc_intent_lock, + .m_link = mdc_link, + .m_rename = mdc_rename, + .m_setattr = mdc_setattr, + .m_setxattr = mdc_setxattr, + .m_getxattr = mdc_getxattr, + .m_fsync = mdc_fsync, + .m_file_resync = mdc_file_resync, + .m_read_page = mdc_read_page, + .m_unlink = mdc_unlink, + .m_cancel_unused = mdc_cancel_unused, + .m_init_ea_size = mdc_init_ea_size, + .m_set_lock_data = mdc_set_lock_data, + .m_lock_match = mdc_lock_match, + .m_get_lustre_md = mdc_get_lustre_md, + .m_free_lustre_md = mdc_free_lustre_md, + .m_set_open_replay_data = mdc_set_open_replay_data, + .m_clear_open_replay_data = mdc_clear_open_replay_data, + .m_intent_getattr_async = mdc_intent_getattr_async, + .m_revalidate_lock = mdc_revalidate_lock, + .m_rmfid = mdc_rmfid, +}; + +dev_t mdc_changelog_dev; +struct class *mdc_changelog_class; +static 
int __init mdc_init(void) +{ + int rc = 0; + rc = alloc_chrdev_region(&mdc_changelog_dev, 0, + MDC_CHANGELOG_DEV_COUNT, + MDC_CHANGELOG_DEV_NAME); + if (rc) + return rc; + + mdc_changelog_class = class_create(THIS_MODULE, MDC_CHANGELOG_DEV_NAME); + if (IS_ERR(mdc_changelog_class)) { + rc = PTR_ERR(mdc_changelog_class); + goto out_dev; + } + + rc = class_register_type(&mdc_obd_ops, &mdc_md_ops, true, + LUSTRE_MDC_NAME, &mdc_device_type); + if (rc) + goto out_class; + + return 0; + +out_class: + class_destroy(mdc_changelog_class); +out_dev: + unregister_chrdev_region(mdc_changelog_dev, MDC_CHANGELOG_DEV_COUNT); + return rc; +} + +static void __exit mdc_exit(void) +{ + class_unregister_type(LUSTRE_MDC_NAME); + class_destroy(mdc_changelog_class); + unregister_chrdev_region(mdc_changelog_dev, MDC_CHANGELOG_DEV_COUNT); + idr_destroy(&mdc_changelog_minor_idr); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Metadata Client"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(mdc_init); +module_exit(mdc_exit); diff --git a/drivers/staging/lustrefsx/lustre/mgc/Makefile b/drivers/staging/lustrefsx/lustre/mgc/Makefile new file mode 100644 index 0000000000000..7353c95e42cca --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_FS) += mgc.o + +mgc-y := mgc_request.o lproc_mgc.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c b/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c new file mode 100644 index 0000000000000..051e31559c647 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c @@ -0,0 +1,132 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include "mgc_internal.h" + +#ifdef CONFIG_PROC_FS + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, connect_flags); + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, server_uuid); + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, import); + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, state); + +static int mgc_ir_state_seq_show(struct seq_file *m, void *v) +{ + return lprocfs_mgc_rd_ir_state(m, m->private); +} + +LDEBUGFS_SEQ_FOPS_RO(mgc_ir_state); + +struct ldebugfs_vars ldebugfs_mgc_obd_vars[] = { + { .name = "connect_flags", + .fops = &mgc_connect_flags_fops }, + { .name = "mgs_server_uuid", + .fops = &mgc_server_uuid_fops }, + { .name = "import", + .fops = &mgc_import_fops }, + { .name = "state", + .fops = &mgc_state_fops }, + { .name = "ir_state", + .fops = &mgc_ir_state_fops }, + { NULL } +}; +#endif /* CONFIG_PROC_FS */ + +LUSTRE_ATTR(mgs_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); + +ssize_t dynamic_nids_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + ssize_t count; + + ENTRY; + count = snprintf(buf, PAGE_SIZE, "%u\n", obd->obd_dynamic_nids); + + RETURN(count); +} + +ssize_t dynamic_nids_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + ENTRY; + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + spin_lock(&obd->obd_dev_lock); + obd->obd_dynamic_nids = val; + spin_unlock(&obd->obd_dev_lock); + + RETURN(count); +} + +LUSTRE_RW_ATTR(dynamic_nids); + +static struct attribute *mgc_attrs[] = { + &lustre_attr_mgs_conn_uuid.attr, + &lustre_attr_conn_uuid.attr, + &lustre_attr_ping.attr, + &lustre_attr_dynamic_nids.attr, + NULL, +}; + +KOBJ_ATTRIBUTE_GROUPS(mgc); + +int mgc_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_ktype.default_groups = KOBJ_ATTR_GROUPS(mgc); + obd->obd_debugfs_vars = ldebugfs_mgc_obd_vars; + rc = lprocfs_obd_setup(obd, true); + if (rc) + return rc; + + return sptlrpc_lprocfs_cliobd_attach(obd); +} diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h new file mode 100644 index 0000000000000..2289972d1a82c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h @@ -0,0 +1,74 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _MGC_INTERNAL_H +#define _MGC_INTERNAL_H + +#include +#include +#include +#include +#include + +int mgc_tunables_init(struct obd_device *obd); +int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data); + +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); + +/* this timeout represents how many seconds MGC should wait before + * requeue config and recover lock to the MGS. We need to randomize this + * in order to not flood the MGS. + */ +#define MGC_TIMEOUT_MIN_SECONDS 5 + +extern unsigned int mgc_requeue_timeout_min; + +static inline bool cld_is_sptlrpc(struct config_llog_data *cld) +{ + return cld->cld_type == MGS_CFG_T_SPTLRPC; +} + +static inline bool cld_is_recover(struct config_llog_data *cld) +{ + return cld->cld_type == MGS_CFG_T_RECOVER; +} + +static inline bool cld_is_nodemap(struct config_llog_data *cld) +{ + return cld->cld_type == MGS_CFG_T_NODEMAP; +} + +static inline bool cld_is_barrier(struct config_llog_data *cld) +{ + return cld->cld_type == MGS_CFG_T_BARRIER; +} + +#endif /* _MGC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c new file mode 100644 index 0000000000000..39df17e03959e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c @@ -0,0 +1,2333 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/mgc/mgc_request.c + * + * Author: Nathan Rutman + */ + +#define DEBUG_SUBSYSTEM S_MGC +#define D_MGC D_CONFIG /*|D_WARNING*/ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mgc_internal.h" + +static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id, + enum mgs_cfg_type type) +{ + __u64 resname = 0; + + if (len > sizeof(resname)) { + CERROR("name too long: %s\n", name); + return -EINVAL; + } + if (len <= 0) { + CERROR("missing name: %s\n", name); + return -EINVAL; + } + memcpy(&resname, name, len); + + /* Always use the same endianness for the resid */ + memset(res_id, 0, sizeof(*res_id)); + res_id->name[0] = cpu_to_le64(resname); + /* XXX: unfortunately, sptlprc and config llog share one lock */ + switch(type) { + case MGS_CFG_T_CONFIG: + case MGS_CFG_T_SPTLRPC: + resname = 0; + break; + case MGS_CFG_T_RECOVER: + case MGS_CFG_T_PARAMS: + case MGS_CFG_T_NODEMAP: + case MGS_CFG_T_BARRIER: + resname = type; + break; + default: + LBUG(); + } + res_id->name[1] = cpu_to_le64(resname); + CDEBUG(D_MGC, "log %s to resid %#llx/%#llx (%.8s)\n", name, + res_id->name[0], res_id->name[1], (char *)&res_id->name[0]); + return 0; +} + +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, + enum mgs_cfg_type type) +{ + /* fsname is at most 8 chars long, maybe contain "-". + * e.g. "lustre", "SUN-000" */ + return mgc_name2resid(fsname, strlen(fsname), res_id, type); +} +EXPORT_SYMBOL(mgc_fsname2resid); + +int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id, + enum mgs_cfg_type type) +{ + char *name_end; + int len; + + /* logname consists of "fsname-nodetype". + * e.g. "lustre-MDT0001", "SUN-000-client" + * there is an exception: llog "params" */ + name_end = strrchr(logname, '-'); + if (!name_end) + len = strlen(logname); + else + len = name_end - logname; + return mgc_name2resid(logname, len, res_id, type); +} +EXPORT_SYMBOL(mgc_logname2resid); + +/********************** config llog list **********************/ +static LIST_HEAD(config_llog_list); +static DEFINE_SPINLOCK(config_list_lock); /* protects config_llog_list */ + +/* Take a reference to a config log */ +static int config_log_get(struct config_llog_data *cld) +{ + ENTRY; + atomic_inc(&cld->cld_refcount); + CDEBUG(D_INFO, "log %s (%p) refs %d\n", cld->cld_logname, cld, + atomic_read(&cld->cld_refcount)); + RETURN(0); +} + +/* Drop a reference to a config log. 
When no longer referenced, + we can free the config log data */ +static void config_log_put(struct config_llog_data *cld) +{ + ENTRY; + + if (unlikely(!cld)) + RETURN_EXIT; + + CDEBUG(D_INFO, "log %s(%p) refs %d\n", cld->cld_logname, cld, + atomic_read(&cld->cld_refcount)); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* spinlock to make sure no item with 0 refcount in the list */ + if (atomic_dec_and_lock(&cld->cld_refcount, &config_list_lock)) { + list_del(&cld->cld_list_chain); + spin_unlock(&config_list_lock); + + CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname); + + config_log_put(cld->cld_barrier); + config_log_put(cld->cld_recover); + config_log_put(cld->cld_params); + config_log_put(cld->cld_nodemap); + config_log_put(cld->cld_sptlrpc); + if (cld_is_sptlrpc(cld)) { + cld->cld_stopping = 1; + sptlrpc_conf_log_stop(cld->cld_logname); + } + + class_export_put(cld->cld_mgcexp); + OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1); + } + + EXIT; +} + +/* Find a config log by name */ +static +struct config_llog_data *config_log_find(char *logname, + struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + struct config_llog_data *found = NULL; + unsigned long cfg_instance; + + ENTRY; + LASSERT(logname != NULL); + + cfg_instance = cfg ? cfg->cfg_instance : 0; + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { + /* check if cfg_instance is the one we want */ + if (cfg_instance != cld->cld_cfg.cfg_instance) + continue; + + /* instance may be NULL, should check name */ + if (strcmp(logname, cld->cld_logname) == 0) { + found = cld; + config_log_get(found); + break; + } + } + spin_unlock(&config_list_lock); + RETURN(found); +} + +static +struct config_llog_data *do_config_log_add(struct obd_device *obd, + char *logname, + enum mgs_cfg_type type, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_data *cld; + int rc; + + ENTRY; + + CDEBUG(D_MGC, "do adding config log %s-%016lx\n", logname, + cfg ? 
cfg->cfg_instance : 0); + + OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1); + if (!cld) + RETURN(ERR_PTR(-ENOMEM)); + + rc = mgc_logname2resid(logname, &cld->cld_resid, type); + if (rc) { + OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1); + RETURN(ERR_PTR(rc)); + } + + strcpy(cld->cld_logname, logname); + if (cfg) + cld->cld_cfg = *cfg; + else + cld->cld_cfg.cfg_callback = class_config_llog_handler; + mutex_init(&cld->cld_lock); + cld->cld_cfg.cfg_last_idx = 0; + cld->cld_cfg.cfg_flags = 0; + cld->cld_cfg.cfg_sb = sb; + cld->cld_type = type; + atomic_set(&cld->cld_refcount, 1); + + /* Keep the mgc around until we are done */ + cld->cld_mgcexp = class_export_get(obd->obd_self_export); + + if (cld_is_sptlrpc(cld)) + sptlrpc_conf_log_start(logname); + + spin_lock(&config_list_lock); + list_add(&cld->cld_list_chain, &config_llog_list); + spin_unlock(&config_list_lock); + + if (cld_is_sptlrpc(cld) || cld_is_nodemap(cld) || cld_is_barrier(cld)) { + rc = mgc_process_log(obd, cld); + if (rc && rc != -ENOENT) + CERROR("%s: failed processing log, type %d: rc = %d\n", + obd->obd_name, type, rc); + } + + RETURN(cld); +} + +static struct config_llog_data *config_recover_log_add(struct obd_device *obd, + char *fsname, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_instance lcfg = *cfg; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld; + char logname[32]; + + if (IS_OST(lsi)) + return NULL; + + /* for osp-on-ost, see lustre_start_osp() */ + if (IS_MDT(lsi) && lcfg.cfg_instance) + return NULL; + + /* We have to use different llog for clients and MDTs for DNE, + * where only clients are notified if one of DNE server restarts. + */ + LASSERT(strlen(fsname) < sizeof(logname) / 2); + strncpy(logname, fsname, sizeof(logname)); + if (IS_SERVER(lsi)) { /* mdt */ + LASSERT(lcfg.cfg_instance == 0); + lcfg.cfg_instance = ll_get_cfg_instance(sb); + strncat(logname, "-mdtir", sizeof(logname)); + } else { + LASSERT(lcfg.cfg_instance != 0); + strncat(logname, "-cliir", sizeof(logname)); + } + + cld = do_config_log_add(obd, logname, MGS_CFG_T_RECOVER, &lcfg, sb); + return cld; +} + +static struct config_llog_data * +config_log_find_or_add(struct obd_device *obd, char *logname, + struct super_block *sb, enum mgs_cfg_type type, + struct config_llog_instance *cfg) +{ + struct config_llog_instance lcfg = *cfg; + struct config_llog_data *cld; + + /* Note class_config_llog_handler() depends on getting "obd" back */ + lcfg.cfg_instance = sb ? ll_get_cfg_instance(sb) : (unsigned long)obd; + + cld = config_log_find(logname, &lcfg); + if (unlikely(cld != NULL)) + return cld; + + return do_config_log_add(obd, logname, type, &lcfg, sb); +} + +/** Add this log to the list of active logs watched by an MGC. + * Active means we're watching for updates. + * We have one active log per "mount" - client instance or servername. + * Each instance may be at a different point in the log. 
+ */ +static struct config_llog_data * +config_log_add(struct obd_device *obd, char *logname, + struct config_llog_instance *cfg, struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld = NULL; + struct config_llog_data *sptlrpc_cld = NULL; + struct config_llog_data *params_cld = NULL; + struct config_llog_data *nodemap_cld = NULL; + struct config_llog_data *barrier_cld = NULL; + char seclogname[32]; + char *ptr; + int rc; + bool locked = false; + ENTRY; + + CDEBUG(D_MGC, "add config log %s-%016lx\n", logname, + cfg->cfg_instance); + + /* + * for each regular log, the depended sptlrpc log name is + * -sptlrpc. multiple regular logs may share one sptlrpc log. + */ + ptr = strrchr(logname, '-'); + if (ptr == NULL || ptr - logname > 8) { + CERROR("logname %s is too long\n", logname); + RETURN(ERR_PTR(-EINVAL)); + } + + memcpy(seclogname, logname, ptr - logname); + strcpy(seclogname + (ptr - logname), "-sptlrpc"); + + if (cfg->cfg_sub_clds & CONFIG_SUB_SPTLRPC) { + sptlrpc_cld = config_log_find_or_add(obd, seclogname, NULL, + MGS_CFG_T_SPTLRPC, cfg); + if (IS_ERR(sptlrpc_cld)) { + CERROR("%s: can't create sptlrpc log %s: rc = %ld\n", + obd->obd_name, seclogname, PTR_ERR(sptlrpc_cld)); + RETURN(sptlrpc_cld); + } + } + + if (!IS_MGS(lsi) && cfg->cfg_sub_clds & CONFIG_SUB_NODEMAP) { + nodemap_cld = config_log_find_or_add(obd, LUSTRE_NODEMAP_NAME, + NULL, MGS_CFG_T_NODEMAP, + cfg); + if (IS_ERR(nodemap_cld)) { + rc = PTR_ERR(nodemap_cld); + CERROR("%s: cannot create nodemap log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_sptlrpc, rc); + } + } + + if (cfg->cfg_sub_clds & CONFIG_SUB_PARAMS) { + params_cld = config_log_find_or_add(obd, PARAMS_FILENAME, sb, + MGS_CFG_T_PARAMS, cfg); + if (IS_ERR(params_cld)) { + rc = PTR_ERR(params_cld); + CERROR("%s: can't create params log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_nodemap, rc); + } + } + + if (IS_MDT(s2lsi(sb)) && cfg->cfg_sub_clds & CONFIG_SUB_BARRIER) { + snprintf(seclogname + (ptr - logname), sizeof(seclogname) - 1, + "-%s", BARRIER_FILENAME); + barrier_cld = config_log_find_or_add(obd, seclogname, sb, + MGS_CFG_T_BARRIER, cfg); + if (IS_ERR(barrier_cld)) { + rc = PTR_ERR(barrier_cld); + CERROR("%s: can't create barrier log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_params, rc); + } + } + + cld = do_config_log_add(obd, logname, MGS_CFG_T_CONFIG, cfg, sb); + if (IS_ERR(cld)) { + rc = PTR_ERR(cld); + CERROR("%s: can't create log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_barrier, rc = PTR_ERR(cld)); + } + + LASSERT(lsi->lsi_lmd); + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) && + cfg->cfg_sub_clds & CONFIG_SUB_RECOVER) { + struct config_llog_data *recover_cld; + + ptr = strrchr(seclogname, '-'); + if (ptr != NULL) { + *ptr = 0; + } else { + CERROR("%s: sptlrpc log name not correct, %s: " + "rc = %d\n", obd->obd_name, seclogname, -EINVAL); + GOTO(out_cld, rc = -EINVAL); + } + + recover_cld = config_recover_log_add(obd, seclogname, cfg, sb); + if (IS_ERR(recover_cld)) { + rc = PTR_ERR(recover_cld); + CERROR("%s: can't create recover log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_cld, rc); + } + + mutex_lock(&cld->cld_lock); + locked = true; + cld->cld_recover = recover_cld; + } + + if (!locked) + mutex_lock(&cld->cld_lock); + cld->cld_params = params_cld; + cld->cld_barrier = barrier_cld; + cld->cld_nodemap = nodemap_cld; + cld->cld_sptlrpc = sptlrpc_cld; + mutex_unlock(&cld->cld_lock); + + RETURN(cld); + +out_cld: + config_log_put(cld); +out_barrier: + config_log_put(barrier_cld); 
+out_params: + config_log_put(params_cld); +out_nodemap: + config_log_put(nodemap_cld); +out_sptlrpc: + config_log_put(sptlrpc_cld); + + return ERR_PTR(rc); +} + +DEFINE_MUTEX(llog_process_lock); + +static inline void config_mark_cld_stop_nolock(struct config_llog_data *cld) +{ + ENTRY; + + spin_lock(&config_list_lock); + cld->cld_stopping = 1; + spin_unlock(&config_list_lock); + + CDEBUG(D_INFO, "lockh %#llx\n", cld->cld_lockh.cookie); + if (!ldlm_lock_addref_try(&cld->cld_lockh, LCK_CR)) + ldlm_lock_decref_and_cancel(&cld->cld_lockh, LCK_CR); +} + +static inline void config_mark_cld_stop(struct config_llog_data *cld) +{ + if (cld) { + mutex_lock(&cld->cld_lock); + config_mark_cld_stop_nolock(cld); + mutex_unlock(&cld->cld_lock); + } +} + +/** Stop watching for updates on this log. + */ +static int config_log_end(char *logname, struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + struct config_llog_data *cld_sptlrpc = NULL; + struct config_llog_data *cld_params = NULL; + struct config_llog_data *cld_recover = NULL; + struct config_llog_data *cld_nodemap = NULL; + struct config_llog_data *cld_barrier = NULL; + int rc = 0; + + ENTRY; + + cld = config_log_find(logname, cfg); + if (cld == NULL) + RETURN(-ENOENT); + + mutex_lock(&cld->cld_lock); + /* + * if cld_stopping is set, it means we didn't start the log thus + * not owning the start ref. this can happen after previous umount: + * the cld still hanging there waiting for lock cancel, and we + * remount again but failed in the middle and call log_end without + * calling start_log. + */ + if (unlikely(cld->cld_stopping)) { + mutex_unlock(&cld->cld_lock); + /* drop the ref from the find */ + config_log_put(cld); + RETURN(rc); + } + + cld_recover = cld->cld_recover; + cld->cld_recover = NULL; + cld_params = cld->cld_params; + cld->cld_params = NULL; + cld_nodemap = cld->cld_nodemap; + cld->cld_nodemap = NULL; + cld_barrier = cld->cld_barrier; + cld->cld_barrier = NULL; + cld_sptlrpc = cld->cld_sptlrpc; + cld->cld_sptlrpc = NULL; + + config_mark_cld_stop_nolock(cld); + mutex_unlock(&cld->cld_lock); + + config_mark_cld_stop(cld_recover); + config_log_put(cld_recover); + config_mark_cld_stop(cld_params); + config_log_put(cld_params); + config_mark_cld_stop(cld_barrier); + config_log_put(cld_barrier); + /* don't explicitly set cld_stopping on sptlrpc lock here, as other + * targets may be active, it will be done in config_log_put if necessary + */ + config_log_put(cld_sptlrpc); + /* don't set cld_stopping on nm lock as other targets may be active */ + config_log_put(cld_nodemap); + + /* drop the ref from the find */ + config_log_put(cld); + /* drop the start ref */ + config_log_put(cld); + + CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client", + rc); + RETURN(rc); +} + +int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp; + struct obd_connect_data *ocd; + struct config_llog_data *cld; + int rc = 0; + + ENTRY; + LASSERT(obd); + with_imp_locked(obd, imp, rc) { + ocd = &imp->imp_connect_data; + + seq_printf(m, "imperative_recovery: %s\n", + OCD_HAS_FLAG(ocd, IMP_RECOV) ? 
+ "ENABLED" : "DISABLED"); + } + if (rc) + RETURN(rc); + + seq_printf(m, "client_state:\n"); + + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { + if (cld->cld_recover == NULL) + continue; + seq_printf(m, " - { client: %s, nidtbl_version: %u }\n", + cld->cld_logname, + cld->cld_recover->cld_cfg.cfg_last_idx); + } + spin_unlock(&config_list_lock); + + RETURN(0); +} + +/* reenqueue any lost locks */ +#define RQ_RUNNING 0x1 +#define RQ_NOW 0x2 +#define RQ_LATER 0x4 +#define RQ_STOP 0x8 +#define RQ_PRECLEANUP 0x10 +static int rq_state = 0; +static wait_queue_head_t rq_waitq; +static DECLARE_COMPLETION(rq_exit); +static DECLARE_COMPLETION(rq_start); + +static void do_requeue(struct config_llog_data *cld) +{ + int rc = 0; + ENTRY; + + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* + * Do not run mgc_process_log on a disconnected export or an + * export which is being disconnected. Take the client + * semaphore to make the check non-racy. + */ + down_read_nested(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem, + OBD_CLI_SEM_MGC); + if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) { + CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname); + rc = mgc_process_log(cld->cld_mgcexp->exp_obd, cld); + if (rc && rc != -ENOENT) + CERROR("failed processing log: %d\n", rc); + } else { + CDEBUG(D_MGC, "disconnecting, won't update log %s\n", + cld->cld_logname); + } + up_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem); + + EXIT; +} + +static int mgc_requeue_thread(void *data) +{ + int rc = 0; + bool first = true; + ENTRY; + + CDEBUG(D_MGC, "Starting requeue thread\n"); + + /* Keep trying failed locks periodically */ + spin_lock(&config_list_lock); + rq_state |= RQ_RUNNING; + while (!(rq_state & RQ_STOP)) { + struct config_llog_data *cld, *cld_prev; + int to; + + /* Any new or requeued lostlocks will change the state */ + rq_state &= ~(RQ_NOW | RQ_LATER); + spin_unlock(&config_list_lock); + + if (first) { + first = false; + complete(&rq_start); + } + + /* Always wait a few seconds to allow the server who + * caused the lock revocation to finish its setup, plus some + * random so everyone doesn't try to reconnect at once. + */ + to = mgc_requeue_timeout_min == 0 ? 1 : mgc_requeue_timeout_min; + to = cfs_time_seconds(mgc_requeue_timeout_min) + + get_random_u32_below(cfs_time_seconds(to)); + wait_event_idle_timeout(rq_waitq, + rq_state & (RQ_STOP | RQ_PRECLEANUP), to); + + /* + * iterate & processing through the list. for each cld, process + * its depending sptlrpc cld firstly (if any) and then itself. + * + * it's guaranteed any item in the list must have + * reference > 0; and if cld_lostlock is set, at + * least one reference is taken by the previous enqueue. + */ + cld_prev = NULL; + + spin_lock(&config_list_lock); + rq_state &= ~RQ_PRECLEANUP; + list_for_each_entry(cld, &config_llog_list, + cld_list_chain) { + if (!cld->cld_lostlock || cld->cld_stopping) + continue; + + /* hold reference to avoid being freed during + * subsequent processing. 
*/ + config_log_get(cld); + cld->cld_lostlock = 0; + spin_unlock(&config_list_lock); + + config_log_put(cld_prev); + cld_prev = cld; + + if (likely(!(rq_state & RQ_STOP))) { + do_requeue(cld); + spin_lock(&config_list_lock); + } else { + spin_lock(&config_list_lock); + break; + } + } + spin_unlock(&config_list_lock); + config_log_put(cld_prev); + + /* Wait a bit to see if anyone else needs a requeue */ + wait_event_idle(rq_waitq, rq_state & (RQ_NOW | RQ_STOP)); + spin_lock(&config_list_lock); + } + + /* spinlock and while guarantee RQ_NOW and RQ_LATER are not set */ + rq_state &= ~RQ_RUNNING; + spin_unlock(&config_list_lock); + + complete(&rq_exit); + + CDEBUG(D_MGC, "Ending requeue thread\n"); + RETURN(rc); +} + +/* Add a cld to the list to requeue. Start the requeue thread if needed. + We are responsible for dropping the config log reference from here on out. */ +static void mgc_requeue_add(struct config_llog_data *cld) +{ + bool wakeup = false; + ENTRY; + + CDEBUG(D_INFO, "log %s: requeue (r=%d sp=%d st=%x)\n", + cld->cld_logname, atomic_read(&cld->cld_refcount), + cld->cld_stopping, rq_state); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* lets cancel an existent lock to mark cld as "lostlock" */ + CDEBUG(D_INFO, "lockh %#llx\n", cld->cld_lockh.cookie); + if (!ldlm_lock_addref_try(&cld->cld_lockh, LCK_CR)) + ldlm_lock_decref_and_cancel(&cld->cld_lockh, LCK_CR); + + mutex_lock(&cld->cld_lock); + spin_lock(&config_list_lock); + if (!(rq_state & RQ_STOP) && !cld->cld_stopping) { + cld->cld_lostlock = 1; + rq_state |= RQ_NOW; + wakeup = true; + } + spin_unlock(&config_list_lock); + mutex_unlock(&cld->cld_lock); + if (wakeup) + wake_up(&rq_waitq); + + EXIT; +} + +/********************** class fns **********************/ +static int mgc_local_llog_init(const struct lu_env *env, + struct obd_device *obd, + struct obd_device *disk) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CONFIG_ORIG_CTXT, disk, + &llog_osd_ops); + if (rc) + RETURN(rc); + + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + LASSERT(ctxt); + ctxt->loc_dir = obd->u.cli.cl_mgc_configs_dir; + llog_ctxt_put(ctxt); + + RETURN(0); +} + +static int mgc_local_llog_fini(const struct lu_env *env, + struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + llog_cleanup(env, ctxt); + + RETURN(0); +} + +static int mgc_fs_setup(const struct lu_env *env, struct obd_device *obd, + struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct client_obd *cli = &obd->u.cli; + struct lu_fid rfid, fid; + struct dt_object *root, *dto; + int rc = 0; + + ENTRY; + + LASSERT(lsi); + LASSERT(lsi->lsi_dt_dev); + + /* The mgc fs exclusion mutex. Only one fs can be setup at a time. 
*/ + mutex_lock(&cli->cl_mgc_mutex); + + /* Setup the configs dir */ + fid.f_seq = FID_SEQ_LOCAL_NAME; + fid.f_oid = 1; + fid.f_ver = 0; + rc = local_oid_storage_init(env, lsi->lsi_dt_dev, &fid, + &cli->cl_mgc_los); + if (rc) + GOTO(out_mutex, rc); + + rc = dt_root_get(env, lsi->lsi_dt_dev, &rfid); + if (rc) + GOTO(out_los, rc); + + root = dt_locate_at(env, lsi->lsi_dt_dev, &rfid, + &cli->cl_mgc_los->los_dev->dd_lu_dev, NULL); + if (unlikely(IS_ERR(root))) + GOTO(out_los, rc = PTR_ERR(root)); + + dto = local_file_find_or_create(env, cli->cl_mgc_los, root, + MOUNT_CONFIGS_DIR, + S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); + dt_object_put_nocache(env, root); + if (IS_ERR(dto)) + GOTO(out_los, rc = PTR_ERR(dto)); + + cli->cl_mgc_configs_dir = dto; + + LASSERT(lsi->lsi_osd_exp->exp_obd->obd_lvfs_ctxt.dt); + rc = mgc_local_llog_init(env, obd, lsi->lsi_osd_exp->exp_obd); + if (rc) + GOTO(out_llog, rc); + + /* We take an obd ref to insure that we can't get to mgc_cleanup + * without calling mgc_fs_cleanup first. */ + class_incref(obd, "mgc_fs", obd); + + /* We keep the cl_mgc_sem until mgc_fs_cleanup */ + EXIT; +out_llog: + if (rc) { + dt_object_put(env, cli->cl_mgc_configs_dir); + cli->cl_mgc_configs_dir = NULL; + } +out_los: + if (rc < 0) { + local_oid_storage_fini(env, cli->cl_mgc_los); +out_mutex: + cli->cl_mgc_los = NULL; + mutex_unlock(&cli->cl_mgc_mutex); + } + return rc; +} + +static int mgc_fs_cleanup(const struct lu_env *env, struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + ENTRY; + + LASSERT(cli->cl_mgc_los != NULL); + + mgc_local_llog_fini(env, obd); + + dt_object_put_nocache(env, cli->cl_mgc_configs_dir); + cli->cl_mgc_configs_dir = NULL; + + local_oid_storage_fini(env, cli->cl_mgc_los); + cli->cl_mgc_los = NULL; + + class_decref(obd, "mgc_fs", obd); + mutex_unlock(&cli->cl_mgc_mutex); + + RETURN(0); +} + +static int mgc_llog_init(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + /* setup only remote ctxt, the local disk context is switched per each + * filesystem during mgc_fs_setup() */ + rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CONFIG_REPL_CTXT, obd, + &llog_client_ops); + if (rc) + RETURN(rc); + + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + LASSERT(ctxt); + + llog_initiator_connect(ctxt); + llog_ctxt_put(ctxt); + + RETURN(0); +} + +static int mgc_llog_fini(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + if (ctxt) + llog_cleanup(env, ctxt); + + RETURN(0); +} + + +static atomic_t mgc_count = ATOMIC_INIT(0); +static int mgc_precleanup(struct obd_device *obd) +{ + int rc = 0; + int temp; + ENTRY; + + if (atomic_dec_and_test(&mgc_count)) { + LASSERT(rq_state & RQ_RUNNING); + /* stop requeue thread */ + temp = RQ_STOP; + } else { + /* wakeup requeue thread to clean our cld */ + temp = RQ_NOW | RQ_PRECLEANUP; + } + + spin_lock(&config_list_lock); + rq_state |= temp; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + + if (temp & RQ_STOP) + wait_for_completion(&rq_exit); + obd_cleanup_client_import(obd); + + rc = mgc_llog_fini(NULL, obd); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + + RETURN(rc); +} + +static int mgc_cleanup(struct obd_device *obd) +{ + int rc; + ENTRY; + + /* COMPAT_146 - old config logs may have added profiles we don't + know about */ + if (atomic_read(&obd->obd_type->typ_refcnt) <= 1) + /* Only for the last mgc */ + class_del_profiles(); + + 
lprocfs_obd_cleanup(obd); + ptlrpcd_decref(); + + rc = client_obd_cleanup(obd); + RETURN(rc); +} + +static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct task_struct *task; + int rc; + ENTRY; + + rc = ptlrpcd_addref(); + if (rc < 0) + RETURN(rc); + + rc = client_obd_setup(obd, lcfg); + if (rc) + GOTO(err_decref, rc); + + rc = mgc_llog_init(NULL, obd); + if (rc) { + CERROR("failed to setup llogging subsystems\n"); + GOTO(err_cleanup, rc); + } + + rc = mgc_tunables_init(obd); + if (rc) + GOTO(err_sysfs, rc); + + if (atomic_inc_return(&mgc_count) == 1) { + rq_state = 0; + init_waitqueue_head(&rq_waitq); + + /* start requeue thread */ + task = kthread_run(mgc_requeue_thread, NULL, "ll_cfg_requeue"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start requeue thread: rc = %d; " + "no more log updates\n", + obd->obd_name, rc); + GOTO(err_sysfs, rc); + } + /* rc is the task_struct pointer of mgc_requeue_thread. */ + rc = 0; + wait_for_completion(&rq_start); + } + + RETURN(rc); + +err_sysfs: + lprocfs_obd_cleanup(obd); +err_cleanup: + client_obd_cleanup(obd); +err_decref: + ptlrpcd_decref(); + RETURN(rc); +} + +/* based on ll_mdc_blocking_ast */ +static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lustre_handle lockh; + struct config_llog_data *cld = (struct config_llog_data *)data; + int rc = 0; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + /* mgs wants the lock, give it up... */ + LDLM_DEBUG(lock, "MGC blocking CB"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + break; + case LDLM_CB_CANCELING: + /* We've given up the lock, prepare ourselves to update. */ + LDLM_DEBUG(lock, "MGC cancel CB"); + + CDEBUG(D_MGC, "Lock res "DLDLMRES" (%.8s)\n", + PLDLMRES(lock->l_resource), + (char *)&lock->l_resource->lr_name.name[0]); + + if (!cld) { + CDEBUG(D_INFO, "missing data, won't requeue\n"); + break; + } + + /* held at mgc_process_log(). */ + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + lock->l_ast_data = NULL; + cld->cld_lockh.cookie = 0; + /* Are we done with this log? */ + if (cld->cld_stopping) { + CDEBUG(D_MGC, "log %s: stopping, won't requeue\n", + cld->cld_logname); + config_log_put(cld); + break; + } + /* Make sure not to re-enqueue when the mgc is stopping + (we get called from client_disconnect_export) */ + if (lock->l_conn_export == NULL || + lock->l_conn_export->exp_obd->u.cli.cl_conn_count == 0) { + CDEBUG(D_MGC, "log %.8s: disconnecting, won't requeue\n", + cld->cld_logname); + config_log_put(cld); + break; + } + + /* Re-enqueue now */ + mgc_requeue_add(cld); + config_log_put(cld); + break; + default: + LBUG(); + } + + RETURN(rc); +} + +/* Not sure where this should go... */ +/* This is the timeout value for MGS_CONNECT request plus a ping interval, such + * that we can have a chance to try the secondary MGS if any. */ +#define MGC_ENQUEUE_LIMIT (INITIAL_CONNECT_TIMEOUT + (AT_OFF ? 
0 : at_min) \ + + PING_INTERVAL) +#define MGC_TARGET_REG_LIMIT 10 +#define MGC_TARGET_REG_LIMIT_MAX RECONNECT_DELAY_MAX +#define MGC_SEND_PARAM_LIMIT 10 + +/* Take a config lock so we can get cancel notifications */ +static int mgc_enqueue(struct obd_export *exp, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, ldlm_glimpse_callback glimpse_callback, + void *data, __u32 lvb_len, void *lvb_swabber, + struct lustre_handle *lockh) +{ + struct config_llog_data *cld = (struct config_llog_data *)data; + struct ldlm_enqueue_info einfo = { + .ei_type = type, + .ei_mode = mode, + .ei_cb_bl = mgc_blocking_ast, + .ei_cb_cp = ldlm_completion_ast, + .ei_cb_gl = glimpse_callback, + }; + struct ptlrpc_request *req; + int short_limit = cld_is_sptlrpc(cld); + int rc; + ENTRY; + + if (!exp) + RETURN(-EBADR); + + CDEBUG(D_MGC, "Enqueue for %s (res %#llx)\n", cld->cld_logname, + cld->cld_resid.name[0]); + + /* We need a callback for every lockholder, so don't try to + ldlm_lock_match (see rev 1.1.2.11.2.47) */ + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION, + LDLM_ENQUEUE); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + /* check if this is server or client */ + if (cld->cld_cfg.cfg_sb) { + struct lustre_sb_info *lsi = s2lsi(cld->cld_cfg.cfg_sb); + if (lsi && IS_SERVER(lsi)) + short_limit = 1; + } + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = short_limit ? 5 : MGC_ENQUEUE_LIMIT; + rc = ldlm_cli_enqueue(exp, &req, &einfo, &cld->cld_resid, NULL, flags, + NULL, 0, LVB_T_NONE, lockh, 0); + /* A failed enqueue should still call the mgc_blocking_ast, + where it will be requeued if needed ("grant failed"). */ + ptlrpc_req_finished(req); + RETURN(rc); +} + +static int mgc_cancel(struct obd_export *exp, enum ldlm_mode mode, + struct lustre_handle *lockh) +{ + ENTRY; + + ldlm_lock_decref(lockh, mode); + + RETURN(0); +} + +static void mgc_notify_active(struct obd_device *unused) +{ + /* wakeup mgc_requeue_thread to requeue mgc lock */ + spin_lock(&config_list_lock); + rq_state |= RQ_NOW; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + + /* TODO: Help the MGS rebuild nidtbl. -jay */ +} + +/* Send target_reg message to MGS */ +static int mgc_target_register(struct obd_export *exp, + struct mgs_target_info *mti) +{ + struct ptlrpc_request *req; + struct mgs_target_info *req_mti, *rep_mti; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MGS_TARGET_REG, LUSTRE_MGS_VERSION, + MGS_TARGET_REG); + if (req == NULL) + RETURN(-ENOMEM); + + req_mti = req_capsule_client_get(&req->rq_pill, &RMF_MGS_TARGET_INFO); + if (!req_mti) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + + memcpy(req_mti, mti, sizeof(*req_mti)); + ptlrpc_request_set_replen(req); + CDEBUG(D_MGC, "register %s\n", mti->mti_svname); + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = MGC_TARGET_REG_LIMIT; + + /* if the target needs to regenerate the config log in MGS, it's better + * to use some longer limit to let MGC have time to change connection to + * another MGS (or try again with the same MGS) for the target (server) + * will fail and exit if the request expired due to delay limit. 
*/ + if (mti->mti_flags & (LDD_F_UPDATE | LDD_F_NEED_INDEX)) + req->rq_delay_limit = MGC_TARGET_REG_LIMIT_MAX; + + rc = ptlrpc_queue_wait(req); + if (ptlrpc_client_replied(req)) { + rep_mti = req_capsule_server_get(&req->rq_pill, + &RMF_MGS_TARGET_INFO); + if (rep_mti) + memcpy(mti, rep_mti, sizeof(*rep_mti)); + } + if (!rc) { + CDEBUG(D_MGC, "register %s got index = %d\n", + mti->mti_svname, mti->mti_stripe_index); + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static int mgc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + int rc = -EINVAL; + ENTRY; + + /* Turn off initial_recov after we try all backup servers once */ + if (KEY_IS(KEY_INIT_RECOV_BACKUP)) { + struct obd_import *imp = class_exp2cliimp(exp); + int value; + if (vallen != sizeof(int)) + RETURN(-EINVAL); + value = *(int *)val; + CDEBUG(D_MGC, "InitRecov %s %d/d%d:i%d:r%d:or%d:%s\n", + imp->imp_obd->obd_name, value, + imp->imp_deactive, imp->imp_invalid, + imp->imp_replayable, imp->imp_obd->obd_replayable, + ptlrpc_import_state_name(imp->imp_state)); + /* Resurrect the import immediately if + * 1. we previously got disconnected, + * 2. value > 1 (at the same node with MGS) + * */ + if (imp->imp_state == LUSTRE_IMP_DISCON || value > 1) + ptlrpc_reconnect_import(imp); + + RETURN(0); + } + + /* FIXME move this to mgc_process_config */ + if (KEY_IS(KEY_REGISTER_TARGET)) { + struct mgs_target_info *mti; + if (vallen != sizeof(struct mgs_target_info)) + RETURN(-EINVAL); + mti = (struct mgs_target_info *)val; + CDEBUG(D_MGC, "register_target %s %#x\n", + mti->mti_svname, mti->mti_flags); + rc = mgc_target_register(exp, mti); + RETURN(rc); + } + if (KEY_IS(KEY_SET_FS)) { + struct super_block *sb = (struct super_block *)val; + + if (vallen != sizeof(struct super_block)) + RETURN(-EINVAL); + + rc = mgc_fs_setup(env, exp->exp_obd, sb); + RETURN(rc); + } + if (KEY_IS(KEY_CLEAR_FS)) { + if (vallen != 0) + RETURN(-EINVAL); + rc = mgc_fs_cleanup(env, exp->exp_obd); + RETURN(rc); + } + if (KEY_IS(KEY_MGSSEC)) { + struct client_obd *cli = &exp->exp_obd->u.cli; + struct sptlrpc_flavor flvr; + + /* + * empty string means using current flavor, if which haven't + * been set yet, set it as null. + * + * if flavor has been set previously, check the asking flavor + * must match the existing one. 
+ */ + if (vallen == 0) { + if (cli->cl_flvr_mgc.sf_rpc != SPTLRPC_FLVR_INVALID) + RETURN(0); + val = "null"; + vallen = 4; + } + + rc = sptlrpc_parse_flavor(val, &flvr); + if (rc) { + CERROR("invalid sptlrpc flavor %s to MGS\n", + (char *) val); + RETURN(rc); + } + + /* + * caller already hold a mutex + */ + if (cli->cl_flvr_mgc.sf_rpc == SPTLRPC_FLVR_INVALID) { + cli->cl_flvr_mgc = flvr; + } else if (memcmp(&cli->cl_flvr_mgc, &flvr, + sizeof(flvr)) != 0) { + char str[20]; + + sptlrpc_flavor2name(&cli->cl_flvr_mgc, + str, sizeof(str)); + LCONSOLE_ERROR("asking sptlrpc flavor %s to MGS but " + "currently %s is in use\n", + (char *) val, str); + rc = -EPERM; + } + RETURN(rc); + } + + RETURN(rc); +} + +static int mgc_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + int rc = -EINVAL; + + if (KEY_IS(KEY_CONN_DATA)) { + struct obd_import *imp = class_exp2cliimp(exp); + struct obd_connect_data *data = val; + + if (*vallen == sizeof(*data)) { + *data = imp->imp_connect_data; + rc = 0; + } + } + + return rc; +} + +static int mgc_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + int rc = 0; + + LASSERT(imp->imp_obd == obd); + CDEBUG(D_MGC, "import event %#x\n", event); + + switch (event) { + case IMP_EVENT_DISCON: + /* MGC imports should not wait for recovery */ + if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) + ptlrpc_pinger_ir_down(); + break; + case IMP_EVENT_INACTIVE: + break; + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + break; + } + case IMP_EVENT_ACTIVE: + CDEBUG(D_INFO, "%s: Reactivating import\n", obd->obd_name); + /* Clearing obd_no_recov allows us to continue pinging */ + obd->obd_no_recov = 0; + mgc_notify_active(obd); + if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) + ptlrpc_pinger_ir_up(); + break; + case IMP_EVENT_OCD: + break; + case IMP_EVENT_DEACTIVATE: + case IMP_EVENT_ACTIVATE: + break; + default: + CERROR("Unknown import event %#x\n", event); + LBUG(); + } + RETURN(rc); +} + +enum { + CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_SHIFT), + CONFIG_READ_NRPAGES = 4 +}; + +static int mgc_apply_recover_logs(struct obd_device *mgc, + struct config_llog_data *cld, + __u64 max_version, + void *data, int datalen, bool mne_swab) +{ + struct config_llog_instance *cfg = &cld->cld_cfg; + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + struct mgs_nidtbl_entry *entry; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + u64 prev_version = 0; + char inst[MTI_NAME_MAXLEN + 1]; + char *buf; + int bufsz; + int pos = 0; + int rc = 0; + int off = 0; + unsigned long dynamic_nids; + + ENTRY; + LASSERT(cfg->cfg_instance != 0); + LASSERT(ll_get_cfg_instance(cfg->cfg_sb) == cfg->cfg_instance); + + /* get dynamic nids setting */ + dynamic_nids = mgc->obd_dynamic_nids; + + if (!IS_SERVER(lsi)) { + pos = snprintf(inst, sizeof(inst), "%016lx", cfg->cfg_instance); + if (pos >= PAGE_SIZE) + return -E2BIG; +#ifdef HAVE_SERVER_SUPPORT + } else { + LASSERT(IS_MDT(lsi)); + rc = server_name2svname(lsi->lsi_svname, inst, NULL, + sizeof(inst)); + if (rc) + RETURN(-EINVAL); +#endif /* HAVE_SERVER_SUPPORT */ + } + + OBD_ALLOC(buf, PAGE_SIZE); + if (!buf) + return -ENOMEM; + bufsz = PAGE_SIZE; + pos = 0; + + while (datalen > 0) { + int entry_len = sizeof(*entry); + int is_ost; + struct obd_device *obd; + struct obd_import *imp; + char *obdname; + char *cname; + char *params; + char *uuid; + + rc = 
-EINVAL; + if (datalen < sizeof(*entry)) + break; + + entry = (typeof(entry))(data + off); + + /* sanity check */ + if (entry->mne_nid_type != 0) /* only support type 0 for ipv4 */ + break; + if (entry->mne_nid_count == 0) /* at least one nid entry */ + break; + if (entry->mne_nid_size != sizeof(lnet_nid_t)) + break; + + entry_len += entry->mne_nid_count * entry->mne_nid_size; + if (datalen < entry_len) /* must have entry_len at least */ + break; + + /* Keep this swab for normal mixed endian handling. LU-1644 */ + if (mne_swab) + lustre_swab_mgs_nidtbl_entry(entry); + if (entry->mne_length > PAGE_SIZE) { + CERROR("MNE too large (%u)\n", entry->mne_length); + break; + } + + if (entry->mne_length < entry_len) + break; + + off += entry->mne_length; + datalen -= entry->mne_length; + if (datalen < 0) + break; + + if (entry->mne_version > max_version) { + CERROR("entry index(%lld) is over max_index(%lld)\n", + entry->mne_version, max_version); + break; + } + + if (prev_version >= entry->mne_version) { + CERROR("index unsorted, prev %lld, now %lld\n", + prev_version, entry->mne_version); + break; + } + prev_version = entry->mne_version; + + /* + * Write a string with format "nid::instance" to + * lustre//--/import. + */ + + is_ost = entry->mne_type == LDD_F_SV_TYPE_OST; + memset(buf, 0, bufsz); + obdname = buf; + pos = 0; + + /* lustre-OST0001-osc- */ + strcpy(obdname, cld->cld_logname); + cname = strrchr(obdname, '-'); + if (cname == NULL) { + CERROR("mgc %s: invalid logname %s\n", + mgc->obd_name, obdname); + break; + } + + pos = cname - obdname; + obdname[pos] = 0; + pos += sprintf(obdname + pos, "-%s%04x", + is_ost ? "OST" : "MDT", entry->mne_index); + + cname = is_ost ? "osc" : "mdc", + pos += snprintf(obdname + pos, bufsz, "-%s-%s", cname, inst); + lustre_cfg_bufs_reset(&bufs, obdname); + + /* find the obd by obdname */ + obd = class_name2obd(obdname); + if (obd == NULL) { + CDEBUG(D_INFO, "mgc %s: cannot find obdname %s\n", + mgc->obd_name, obdname); + rc = 0; + /* this is a safe race, when the ost is starting up...*/ + continue; + } + + /* osc.import = "connection=::" */ + ++pos; + params = buf + pos; + pos += sprintf(params, "%s.import=%s", cname, "connection="); + uuid = buf + pos; + + with_imp_locked(obd, imp, rc) { + /* iterate all nids to find one */ + /* find uuid by nid */ + /* create import entries if they don't exist */ + rc = client_import_add_nids_to_conn( + imp, entry->u.nids, entry->mne_nid_count, + (struct obd_uuid *)uuid); + + if (rc == -ENOENT && dynamic_nids) { + /* create a new connection for this import */ + char *primary_nid = + libcfs_nid2str(entry->u.nids[0]); + int prim_nid_len = strlen(primary_nid) + 1; + struct obd_uuid server_uuid; + + if (prim_nid_len > UUID_MAX) + goto fail; + strncpy(server_uuid.uuid, primary_nid, + prim_nid_len); + + CDEBUG(D_INFO, "Adding a connection for %s\n", + primary_nid); + + rc = client_import_dyn_add_conn( + imp, &server_uuid, entry->u.nids[0], 1); + if (rc < 0) { + CERROR("%s: Failed to add new connection with NID '%s' to import: rc = %d\n", + obd->obd_name, primary_nid, rc); + goto fail; + } + rc = client_import_add_nids_to_conn( + imp, entry->u.nids, + entry->mne_nid_count, + (struct obd_uuid *)uuid); + if (rc < 0) { + CERROR("%s: failed to lookup UUID: rc = %d\n", + obd->obd_name, rc); + goto fail; + } + } +fail:; + } + if (rc == -ENODEV) { + /* client does not connect to the OST yet */ + rc = 0; + continue; + } + + if (rc < 0 && rc != -ENOSPC) { + CERROR("mgc: cannot find UUID by nid '%s': rc = %d\n", + 
libcfs_nid2str(entry->u.nids[0]), rc); + break; + } + + CDEBUG(D_INFO, "Found UUID '%s' by NID '%s'\n", + uuid, libcfs_nid2str(entry->u.nids[0])); + + pos += strlen(uuid); + pos += sprintf(buf + pos, "::%u", entry->mne_instance); + LASSERT(pos < bufsz); + + lustre_cfg_bufs_set_string(&bufs, 1, params); + + OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, + bufs.lcfg_buflen)); + if (!lcfg) { + rc = -ENOMEM; + break; + } + lustre_cfg_init(lcfg, LCFG_PARAM, &bufs); + + CDEBUG(D_INFO, "ir apply logs %lld/%lld for %s -> %s\n", + prev_version, max_version, obdname, params); + + rc = class_process_config(lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens)); + if (rc) + CDEBUG(D_INFO, "process config for %s error %d\n", + obdname, rc); + + /* continue, even one with error */ + } + + OBD_FREE(buf, PAGE_SIZE); + + RETURN(rc); +} + +/** + * This function is called if this client was notified for target restarting + * by the MGS. A CONFIG_READ RPC is going to send to fetch recovery or + * nodemap logs. + */ +static int mgc_process_recover_nodemap_log(struct obd_device *obd, + struct config_llog_data *cld) +{ + struct ptlrpc_connection *mgc_conn; + struct ptlrpc_request *req = NULL; + struct config_llog_instance *cfg = &cld->cld_cfg; + struct mgs_config_body *body; + struct mgs_config_res *res; + struct nodemap_config *new_config = NULL; + struct lu_nodemap *recent_nodemap = NULL; + struct ptlrpc_bulk_desc *desc; + struct page **pages = NULL; + __u64 config_read_offset = 0; + __u8 nodemap_cur_pass = 0; + int nrpages = 0; + bool eof = true; + bool mne_swab = false; + int i; + int ealen; + int rc; + ENTRY; + + mgc_conn = class_exp2cliimp(cld->cld_mgcexp)->imp_connection; + + /* don't need to get local config */ + if (cld_is_nodemap(cld) && + LNetIsPeerLocal(lnet_nid_to_nid4(&mgc_conn->c_peer.nid))) + GOTO(out, rc = 0); + + /* allocate buffer for bulk transfer. + * if this is the first time for this mgs to read logs, + * CONFIG_READ_NRPAGES_INIT will be used since it will read all logs + * once; otherwise, it only reads increment of logs, this should be + * small and CONFIG_READ_NRPAGES will be used. 
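The sizing rule in this comment reduces to a small helper; the following is an editorial restatement only (the helper name is invented), using fields already referenced by this function.

static inline int mgc_config_read_nrpages(struct config_llog_data *cld)
{
	/* first read of a log, or any nodemap log: fetch up to 1 MiB at once */
	if (cld->cld_cfg.cfg_last_idx == 0 || cld_is_nodemap(cld))
		return CONFIG_READ_NRPAGES_INIT;	/* 1 << (20 - PAGE_SHIFT) */

	/* incremental recovery reads are small */
	return CONFIG_READ_NRPAGES;
}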
+ */ + nrpages = CONFIG_READ_NRPAGES; + if (cfg->cfg_last_idx == 0 || cld_is_nodemap(cld)) + nrpages = CONFIG_READ_NRPAGES_INIT; + + OBD_ALLOC_PTR_ARRAY_LARGE(pages, nrpages); + if (pages == NULL) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < nrpages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (pages[i] == NULL) + GOTO(out, rc = -ENOMEM); + } + +again: +#ifdef HAVE_SERVER_SUPPORT + if (cld_is_nodemap(cld) && config_read_offset == 0) { + new_config = nodemap_config_alloc(); + if (IS_ERR(new_config)) { + rc = PTR_ERR(new_config); + new_config = NULL; + GOTO(out, rc); + } + } +#endif + LASSERT(cld_is_recover(cld) || cld_is_nodemap(cld)); + LASSERT(mutex_is_locked(&cld->cld_lock)); + req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp), + &RQF_MGS_CONFIG_READ); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ); + if (rc) + GOTO(out, rc); + + /* pack request */ + body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY); + LASSERT(body != NULL); + LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname)); + if (strlcpy(body->mcb_name, cld->cld_logname, sizeof(body->mcb_name)) + >= sizeof(body->mcb_name)) + GOTO(out, rc = -E2BIG); + if (cld_is_nodemap(cld)) + body->mcb_offset = config_read_offset; + else + body->mcb_offset = cfg->cfg_last_idx + 1; + body->mcb_type = cld->cld_type; + body->mcb_bits = PAGE_SHIFT; + body->mcb_units = nrpages; + body->mcb_nm_cur_pass = nodemap_cur_pass; + + /* allocate bulk transfer descriptor */ + desc = ptlrpc_prep_bulk_imp(req, nrpages, 1, + PTLRPC_BULK_PUT_SINK, + MGS_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + if (desc == NULL) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < nrpages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, pages[i], 0, + PAGE_SIZE); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES); + if (!res) + GOTO(out, rc = -EPROTO); + + if (cld_is_nodemap(cld)) { + config_read_offset = res->mcr_offset; + eof = config_read_offset == II_END_OFF; + nodemap_cur_pass = res->mcr_nm_cur_pass; + } else { + if (res->mcr_size < res->mcr_offset) + GOTO(out, rc = -EINVAL); + + /* always update the index even though it might have errors with + * handling the recover logs + */ + cfg->cfg_last_idx = res->mcr_offset; + eof = res->mcr_offset == res->mcr_size; + + CDEBUG(D_INFO, "Latest version %lld, more %d.\n", + res->mcr_offset, eof == false); + } + + ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0); + if (ealen < 0) + GOTO(out, rc = ealen); + + if (ealen > nrpages << PAGE_SHIFT) + GOTO(out, rc = -EINVAL); + + if (ealen == 0) { /* no logs transferred */ +#ifdef HAVE_SERVER_SUPPORT + /* config changed since first read RPC */ + if (cld_is_nodemap(cld) && config_read_offset == 0) { + CDEBUG(D_INFO, "nodemap config changed in transit, retrying\n"); + GOTO(out, rc = -EAGAIN); + } +#endif + if (!eof) + rc = -EINVAL; + GOTO(out, rc); + } + + mne_swab = req_capsule_rep_need_swab(&req->rq_pill); + + /* When a nodemap config is received, we build a new nodemap config, + * with new nodemap structs. We keep track of the most recently added + * nodemap since the config is read ordered by nodemap_id, and so it + * is likely that the next record will be related. 
Because access to + * the nodemaps is single threaded until the nodemap_config is active, + * we don't need to reference count with recent_nodemap, though + * recent_nodemap should be set to NULL when the nodemap_config + * is either destroyed or set active. + */ + for (i = 0; i < nrpages && ealen > 0; i++) { + int rc2; + union lu_page *ptr; + + ptr = kmap(pages[i]); + if (cld_is_nodemap(cld)) + rc2 = nodemap_process_idx_pages(new_config, ptr, + &recent_nodemap); + else + rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, + ptr, + min_t(int, ealen, + PAGE_SIZE), + mne_swab); + kunmap(pages[i]); + if (rc2 < 0) { + CWARN("%s: error processing %s log %s: rc = %d\n", + obd->obd_name, + cld_is_nodemap(cld) ? "nodemap" : "recovery", + cld->cld_logname, + rc2); + GOTO(out, rc = rc2); + } + + ealen -= PAGE_SIZE; + } + +out: + if (req) { + ptlrpc_req_finished(req); + req = NULL; + } + + if (rc == 0 && !eof) + goto again; + +#ifdef HAVE_SERVER_SUPPORT + if (new_config != NULL) { + /* recent_nodemap cannot be used after set_active/dealloc */ + if (rc == 0) + nodemap_config_set_active_mgc(new_config); + else + nodemap_config_dealloc(new_config); + } +#endif + + if (pages) { + for (i = 0; i < nrpages; i++) { + if (pages[i] == NULL) + break; + __free_page(pages[i]); + } + OBD_FREE_PTR_ARRAY_LARGE(pages, nrpages); + } + return rc; +} + +static int mgc_barrier_glimpse_ast(struct ldlm_lock *lock, void *data) +{ + struct config_llog_data *cld = lock->l_ast_data; + int rc; + ENTRY; + + if (cld->cld_stopping) + RETURN(-ENODEV); + + rc = barrier_handler(s2lsi(cld->cld_cfg.cfg_sb)->lsi_dt_dev, + (struct ptlrpc_request *)data); + + RETURN(rc); +} + +/* Copy a remote log locally */ +static int mgc_llog_local_copy(const struct lu_env *env, + struct obd_device *obd, + struct llog_ctxt *rctxt, + struct llog_ctxt *lctxt, char *logname) +{ + char *temp_log; + int rc; + + ENTRY; + + /* + * - copy it to backup using llog_backup() + * - copy remote llog to logname using llog_backup() + * - if failed then move bakup to logname again + */ + + OBD_ALLOC(temp_log, strlen(logname) + 2); + if (!temp_log) + RETURN(-ENOMEM); + sprintf(temp_log, "%sT", logname); + + /* make a copy of local llog at first */ + rc = llog_backup(env, obd, lctxt, lctxt, logname, temp_log); + if (rc < 0 && rc != -ENOENT) + GOTO(out, rc); + /* copy remote llog to the local copy */ + rc = llog_backup(env, obd, rctxt, lctxt, logname, logname); + if (rc == -ENOENT) { + /* no remote llog, delete local one too */ + llog_erase(env, lctxt, NULL, logname); + } else if (rc < 0) { + /* error during backup, get local one back from the copy */ + llog_backup(env, obd, lctxt, lctxt, temp_log, logname); +out: + CERROR("%s: failed to copy remote log %s: rc = %d\n", + obd->obd_name, logname, rc); + } + llog_erase(env, lctxt, NULL, temp_log); + OBD_FREE(temp_log, strlen(logname) + 2); + return rc; +} + +/* local_only means it cannot get remote llogs */ +static int mgc_process_cfg_log(struct obd_device *mgc, + struct config_llog_data *cld, int local_only) +{ + struct llog_ctxt *ctxt, *lctxt = NULL; + struct client_obd *cli = &mgc->u.cli; + struct lustre_sb_info *lsi = NULL; + int rc = 0; + struct lu_env *env; + + ENTRY; + + LASSERT(cld); + LASSERT(mutex_is_locked(&cld->cld_lock)); + + if (cld->cld_cfg.cfg_sb) + lsi = s2lsi(cld->cld_cfg.cfg_sb); + + OBD_ALLOC_PTR(env); + if (env == NULL) + RETURN(-ENOMEM); + + rc = lu_env_init(env, LCT_MG_THREAD); + if (rc) + GOTO(out_free, rc); + + ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT); + LASSERT(ctxt); + + lctxt = 
llog_get_context(mgc, LLOG_CONFIG_ORIG_CTXT); + + /* Copy the setup log locally if we can. Don't mess around if we're + * running an MGS though (logs are already local). */ + if (lctxt && lsi && IS_SERVER(lsi) && !IS_MGS(lsi) && + cli->cl_mgc_configs_dir != NULL && + lu2dt_dev(cli->cl_mgc_configs_dir->do_lu.lo_dev) == + lsi->lsi_dt_dev) { + if (!local_only && !lsi->lsi_dt_dev->dd_rdonly) { + /* Only try to copy log if we have the lock. */ + CDEBUG(D_INFO, "%s: copy local log %s\n", + mgc->obd_name, cld->cld_logname); + + rc = mgc_llog_local_copy(env, mgc, ctxt, lctxt, + cld->cld_logname); + if (!rc) + lsi->lsi_flags &= ~LDD_F_NO_LOCAL_LOGS; + } + if (local_only || rc) { + if (unlikely(lsi->lsi_flags & LDD_F_NO_LOCAL_LOGS) + || rc) { + CWARN("%s: local log %s are not valid and/or remote logs are not accessbile rc = %d\n", + mgc->obd_name, cld->cld_logname, rc); + GOTO(out_pop, rc = -EIO); + } + + if (strcmp(cld->cld_logname, PARAMS_FILENAME) != 0 && + llog_is_empty(env, lctxt, cld->cld_logname)) { + LCONSOLE_ERROR_MSG(0x13a, "Failed to get MGS log %s and no local copy.\n", + cld->cld_logname); + GOTO(out_pop, rc = -ENOENT); + } + CDEBUG(D_MGC, "%s: Failed to get MGS log %s, using local copy for now, will try to update later.\n", + mgc->obd_name, cld->cld_logname); + rc = 0; + } + /* Now, whether we copied or not, start using the local llog. + * If we failed to copy, we'll start using whatever the old + * log has. */ + llog_ctxt_put(ctxt); + ctxt = lctxt; + lctxt = NULL; + } else { + if (local_only) /* no local log at client side */ + GOTO(out_pop, rc = -EIO); + } + + rc = -EAGAIN; + if (lsi && IS_SERVER(lsi) && !IS_MGS(lsi) && + lsi->lsi_dt_dev->dd_rdonly) { + struct llog_ctxt *rctxt; + + /* Under readonly mode, we may have no local copy or local + * copy is incomplete, so try to use remote llog firstly. */ + rctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT); + LASSERT(rctxt); + + rc = class_config_parse_llog(env, rctxt, cld->cld_logname, + &cld->cld_cfg); + llog_ctxt_put(rctxt); + } + + if (rc && rc != -ENOENT) + rc = class_config_parse_llog(env, ctxt, cld->cld_logname, + &cld->cld_cfg); + + /* + * update settings on existing OBDs. + * the logname must be -sptlrpc + */ + if (rc == 0 && cld_is_sptlrpc(cld)) + class_notify_sptlrpc_conf(cld->cld_logname, + strlen(cld->cld_logname) - + strlen("-sptlrpc")); + EXIT; + +out_pop: + __llog_ctxt_put(env, ctxt); + if (lctxt) + __llog_ctxt_put(env, lctxt); + + lu_env_fini(env); +out_free: + OBD_FREE_PTR(env); + return rc; +} + +static bool mgc_import_in_recovery(struct obd_import *imp) +{ + bool in_recovery = true; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_CLOSED) + in_recovery = false; + spin_unlock(&imp->imp_lock); + + return in_recovery; +} + +/** + * Get a configuration log from the MGS and process it. + * + * This function is called for both clients and servers to process the + * configuration log from the MGS. The MGC enqueues a DLM lock on the + * log from the MGS, and if the lock gets revoked the MGC will be notified + * by the lock cancellation callback that the config log has changed, + * and will enqueue another MGS lock on it, and then continue processing + * the new additions to the end of the log. + * + * Since the MGC import is not replayable, if the import is being evicted + * (rcl == -ESHUTDOWN, \see ptlrpc_import_delay_req()), retry to process + * the log until recovery is finished or the import is closed. 
+ * + * Make a local copy of the log before parsing it if appropriate (non-MGS + * server) so that the server can start even when the MGS is down. + * + * There shouldn't be multiple processes running process_log at once -- + * sounds like badness. It actually might be fine, as long as they're not + * trying to update from the same log simultaneously, in which case we + * should use a per-log semaphore instead of cld_lock. + * + * \param[in] mgc MGC device by which to fetch the configuration log + * \param[in] cld log processing state (stored in lock callback data) + * + * \retval 0 on success + * \retval negative errno on failure + */ +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) +{ + struct lustre_handle lockh = { 0 }; + __u64 flags = LDLM_FL_NO_LRU; + int rc = 0, rcl; + bool retry = false; + ENTRY; + + LASSERT(cld != NULL); + + /* I don't want multiple processes running process_log at once -- + sounds like badness. It actually might be fine, as long as + we're not trying to update from the same log + simultaneously (in which case we should use a per-log sem.) */ +restart: + mutex_lock(&cld->cld_lock); + if (cld->cld_stopping) { + mutex_unlock(&cld->cld_lock); + RETURN(0); + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); + + CDEBUG(D_MGC, "Process log %s-%016lx from %d\n", cld->cld_logname, + cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1); + + /* Get the cfg lock on the llog */ + rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, LDLM_PLAIN, NULL, + LCK_CR, &flags, + cld_is_barrier(cld) ? mgc_barrier_glimpse_ast : NULL, + cld, 0, NULL, &lockh); + if (rcl == 0) { + /* Get the cld, it will be released in mgc_blocking_ast. */ + config_log_get(cld); + rc = ldlm_lock_set_data(&lockh, (void *)cld); + LASSERT(!lustre_handle_is_used(&cld->cld_lockh)); + LASSERT(rc == 0); + cld->cld_lockh = lockh; + } else { + CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl); + cld->cld_lockh.cookie = 0; + + if (rcl == -ESHUTDOWN && + atomic_read(&mgc->u.cli.cl_mgc_refcount) > 0 && !retry) { + struct obd_import *imp; + long timeout = cfs_time_seconds(obd_timeout); + + mutex_unlock(&cld->cld_lock); + imp = class_exp2cliimp(mgc->u.cli.cl_mgc_mgsexp); + + /* Let's force the pinger, and wait the import to be + * connected, note: since mgc import is non-replayable, + * and even the import state is disconnected, it does + * not mean the "recovery" is stopped, so we will keep + * waitting until timeout or the import state is + * FULL or closed */ + ptlrpc_pinger_force(imp); + + wait_event_idle_timeout(imp->imp_recovery_waitq, + !mgc_import_in_recovery(imp), + timeout); + + if (imp->imp_state == LUSTRE_IMP_FULL) { + retry = true; + goto restart; + } else { + mutex_lock(&cld->cld_lock); + /* unlock/lock mutex, so check stopping again */ + if (cld->cld_stopping) { + mutex_unlock(&cld->cld_lock); + RETURN(0); + } + spin_lock(&config_list_lock); + cld->cld_lostlock = 1; + spin_unlock(&config_list_lock); + } + } else { + /* mark cld_lostlock so that it will requeue + * after MGC becomes available. */ + spin_lock(&config_list_lock); + cld->cld_lostlock = 1; + spin_unlock(&config_list_lock); + } + } + + if (cld_is_recover(cld) || cld_is_nodemap(cld)) { + if (!rcl) + rc = mgc_process_recover_nodemap_log(mgc, cld); + else if (cld_is_nodemap(cld)) + rc = rcl; + + } else if (!cld_is_barrier(cld)) { + rc = mgc_process_cfg_log(mgc, cld, rcl != 0); + } + + CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n", + mgc->obd_name, cld->cld_logname, rc ? 
"fail" : "succeed", rc); + + /* Now drop the lock so MGS can revoke it */ + if (!rcl) { + rcl = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, LCK_CR, &lockh); + if (rcl) + CERROR("Can't drop cfg lock: %d\n", rcl); + } + mutex_unlock(&cld->cld_lock); + + /* requeue nodemap lock immediately if transfer was interrupted */ + if ((cld_is_nodemap(cld) && rc == -EAGAIN) || + (cld_is_recover(cld) && rc)) { + if (cld_is_recover(cld)) + CWARN("%s: IR log %s failed, not fatal: rc = %d\n", + mgc->obd_name, cld->cld_logname, rc); + mgc_requeue_add(cld); + rc = 0; + } + + RETURN(rc); +} + + +/** Called from lustre_process_log. + * LCFG_LOG_START gets the config log from the MGS, processes it to start + * any services, and adds it to the list logs to watch (follow). + */ +static int mgc_process_config(struct obd_device *obd, size_t len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct config_llog_instance *cfg = NULL; + char *logname; + int rc = 0; + ENTRY; + + switch(lcfg->lcfg_command) { + case LCFG_LOV_ADD_OBD: { + /* Overloading this cfg command: register a new target */ + struct mgs_target_info *mti; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) != + sizeof(struct mgs_target_info)) + GOTO(out, rc = -EINVAL); + + mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1); + CDEBUG(D_MGC, "add_target %s %#x\n", + mti->mti_svname, mti->mti_flags); + rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti); + break; + } + case LCFG_LOV_DEL_OBD: + /* Unregister has no meaning at the moment. */ + CERROR("lov_del_obd unimplemented\n"); + rc = -ENOSYS; + break; + case LCFG_SPTLRPC_CONF: { + rc = sptlrpc_process_config(lcfg); + break; + } + case LCFG_LOG_START: { + struct config_llog_data *cld; + struct super_block *sb; + + logname = lustre_cfg_string(lcfg, 1); + cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2); + sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3); + + CDEBUG(D_MGC, "parse_log %s from %d\n", logname, + cfg->cfg_last_idx); + + /* We're only called through here on the initial mount */ + cld = config_log_add(obd, logname, cfg, sb); + if (IS_ERR(cld)) { + rc = PTR_ERR(cld); + break; + } + + rc = mgc_process_log(obd, cld); + if (rc == 0 && cld->cld_recover != NULL) { + if (OCD_HAS_FLAG(&obd->u.cli.cl_import-> + imp_connect_data, IMP_RECOV)) { + rc = mgc_process_log(obd, cld->cld_recover); + } else { + struct config_llog_data *cir; + + mutex_lock(&cld->cld_lock); + cir = cld->cld_recover; + cld->cld_recover = NULL; + mutex_unlock(&cld->cld_lock); + config_log_put(cir); + } + + if (rc) + CERROR("Cannot process recover llog %d\n", rc); + } + + if (rc == 0 && cld->cld_params != NULL) { + rc = mgc_process_log(obd, cld->cld_params); + if (rc == -ENOENT) { + CDEBUG(D_MGC, "There is no params " + "config file yet\n"); + rc = 0; + } + /* params log is optional */ + if (rc) + CERROR("%s: can't process params llog: rc = %d\n", + obd->obd_name, rc); + } + + break; + } + case LCFG_LOG_END: { + logname = lustre_cfg_string(lcfg, 1); + + if (lcfg->lcfg_bufcount >= 2) + cfg = (struct config_llog_instance *)lustre_cfg_buf( + lcfg, 2); + rc = config_log_end(logname, cfg); + break; + } + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + + } + } +out: + RETURN(rc); +} + +static const struct obd_ops mgc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mgc_setup, + .o_precleanup = mgc_precleanup, + .o_cleanup = mgc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_disconnect = 
client_disconnect_export, + .o_set_info_async = mgc_set_info_async, + .o_get_info = mgc_get_info, + .o_import_event = mgc_import_event, + .o_process_config = mgc_process_config, +}; + +static int mgc_param_requeue_timeout_min_set(const char *val, + cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned int num; + + rc = kstrtouint(val, 0, &num); + if (rc < 0) + return rc; + if (num > 120) + return -EINVAL; + + mgc_requeue_timeout_min = num; + + return 0; +} + +static const struct kernel_param_ops param_ops_requeue_timeout_min = { + .set = mgc_param_requeue_timeout_min_set, + .get = param_get_uint, +}; + +#define param_check_requeue_timeout_min(name, p) \ + __param_check(name, p, unsigned int) + +unsigned int mgc_requeue_timeout_min = MGC_TIMEOUT_MIN_SECONDS; +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(mgc_requeue_timeout_min, requeue_timeout_min, 0644); +#else +module_param_call(mgc_requeue_timeout_min, mgc_param_requeue_timeout_min_set, + param_get_uint, ¶m_ops_requeue_timeout_min, 0644); +#endif +MODULE_PARM_DESC(mgc_requeue_timeout_min, "Minimal requeue time to refresh logs"); + +static int __init mgc_init(void) +{ + return class_register_type(&mgc_obd_ops, NULL, false, + LUSTRE_MGC_NAME, NULL); +} + +static void __exit mgc_exit(void) +{ + class_unregister_type(LUSTRE_MGC_NAME); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Management Client"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(mgc_init); +module_exit(mgc_exit); diff --git a/drivers/staging/lustrefsx/lustre/nodist b/drivers/staging/lustrefsx/lustre/nodist new file mode 100644 index 0000000000000..24f55bb96b97d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/nodist @@ -0,0 +1,9 @@ +obd-*/obd-* +CVS +*~ +make.rules +config.* +*.o +*.orig +*.backup +.depfiles diff --git a/drivers/staging/lustrefsx/lustre/obdclass/Makefile b/drivers/staging/lustrefsx/lustre/obdclass/Makefile new file mode 100644 index 0000000000000..eaa614e1a33cd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/Makefile @@ -0,0 +1,15 @@ +obj-$(CONFIG_LUSTREFSX_FS) += obdclass.o + +obdclass-y := llog.o llog_cat.o llog_obd.o llog_swab.o llog_osd.o +obdclass-y += class_obd.o genops.o llog_ioctl.o +obdclass-y += lprocfs_status.o lprocfs_counters.o +obdclass-y += lustre_handles.o lustre_peer.o local_storage.o +obdclass-y += statfs_pack.o obdo.o obd_config.o obd_mount.o obd_sysfs.o +obdclass-y += lu_object.o dt_object.o +obdclass-y += cl_object.o cl_page.o cl_lock.o cl_io.o lu_ref.o +obdclass-y += linkea.o kernelcomm.o jobid.o +obdclass-y += integrity.o obd_cksum.o +obdclass-y += lu_tgt_descs.o lu_tgt_pool.o +obdclass-y += range_lock.o interval_tree.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h b/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h new file mode 100644 index 0000000000000..1f98113a1df3c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Internal cl interfaces. + * + * Author: Nikita Danilov + */ +#ifndef _CL_INTERNAL_H +#define _CL_INTERNAL_H + +/** + * Thread local state internal for generic cl-code. + */ +struct cl_thread_info { + /** + * Used for submitting a sync I/O. + */ + struct cl_sync_io clt_anchor; +}; + +extern struct kmem_cache *cl_dio_aio_kmem; +extern struct kmem_cache *cl_sub_dio_kmem; +extern struct kmem_cache *cl_page_kmem_array[16]; +extern unsigned short cl_page_kmem_size_array[16]; + +struct cl_thread_info *cl_env_info(const struct lu_env *env); +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); + +#endif /* _CL_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c new file mode 100644 index 0000000000000..295622f59875a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c @@ -0,0 +1,1439 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Client IO. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include +#include "cl_internal.h" + +/***************************************************************************** + * + * cl_io interface. + * + */ + +static inline int cl_io_type_is_valid(enum cl_io_type type) +{ + return CIT_READ <= type && type < CIT_OP_NR; +} + +static inline int cl_io_is_loopable(const struct cl_io *io) +{ + return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC; +} + +/** + * cl_io invariant that holds at all times when exported cl_io_*() functions + * are entered and left. + */ +static int cl_io_invariant(const struct cl_io *io) +{ + struct cl_io *up; + + up = io->ci_parent; + return + /* + * io can own pages only when it is ongoing. Sub-io might + * still be in CIS_LOCKED state when top-io is in + * CIS_IO_GOING. 
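+ * (editorial note: ergo(a, b) in the expression below is the libcfs
+ * implication helper, equivalent to "!(a) || (b)"; i.e. an io that owns
+ * pages must either be doing io or be a locked sub-io.)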
+ */ + ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING || + (io->ci_state == CIS_LOCKED && up != NULL)); +} + +/** + * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top. + */ +void cl_io_fini(const struct lu_env *env, struct cl_io *io) +{ + struct cl_io_slice *slice; + + LINVRNT(cl_io_type_is_valid(io->ci_type)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + while (!list_empty(&io->ci_layers)) { + slice = container_of(io->ci_layers.prev, struct cl_io_slice, + cis_linkage); + list_del_init(&slice->cis_linkage); + if (slice->cis_iop->op[io->ci_type].cio_fini != NULL) + slice->cis_iop->op[io->ci_type].cio_fini(env, slice); + /* + * Invalidate slice to catch use after free. This assumes that + * slices are allocated within session and can be touched + * after ->cio_fini() returns. + */ + slice->cis_io = NULL; + } + io->ci_state = CIS_FINI; + + /* sanity check for layout change */ + switch(io->ci_type) { + case CIT_READ: + case CIT_WRITE: + case CIT_DATA_VERSION: + case CIT_FAULT: + break; + case CIT_FSYNC: + LASSERT(!io->ci_need_restart); + break; + case CIT_SETATTR: + case CIT_MISC: + /* Check ignore layout change conf */ + LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout, + !io->ci_need_restart)); + case CIT_GLIMPSE: + break; + case CIT_LADVISE: + case CIT_LSEEK: + break; + default: + LBUG(); + } + EXIT; +} +EXPORT_SYMBOL(cl_io_fini); + +static int cl_io_init0(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_object *scan; + int result; + + LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI); + LINVRNT(cl_io_type_is_valid(iot)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_type = iot; + INIT_LIST_HEAD(&io->ci_lockset.cls_todo); + INIT_LIST_HEAD(&io->ci_lockset.cls_done); + INIT_LIST_HEAD(&io->ci_layers); + + result = 0; + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_io_init != NULL) { + result = scan->co_ops->coo_io_init(env, scan, io); + if (result != 0) + break; + } + } + if (result == 0) + io->ci_state = CIS_INIT; + RETURN(result); +} + +/** + * Initialize sub-io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * \pre obj != cl_object_top(obj) + */ +int cl_io_sub_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + LASSERT(obj != cl_object_top(obj)); + + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_sub_init); + +/** + * Initialize \a io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter + * what the latter returned. + * + * \pre obj == cl_object_top(obj) + * \pre cl_io_type_is_valid(iot) + * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot + */ +int cl_io_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + LASSERT(obj == cl_object_top(obj)); + + /* clear I/O restart from previous instance */ + io->ci_need_restart = 0; + + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_init); + +/** + * Initialize read or write io. 
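+ * (editorial note: the caller must already have set io->ci_obj; \a pos and
+ * \a count seed the shared crw_pos/crw_count fields before cl_io_init()
+ * runs each layer's coo_io_init() method.)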
+ * + * \pre iot == CIT_READ || iot == CIT_WRITE + */ +int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count) +{ + LINVRNT(iot == CIT_READ || iot == CIT_WRITE); + LINVRNT(io->ci_obj != NULL); + ENTRY; + + LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, + "io range: %u [%llu, %llu) %u %u\n", + iot, (__u64)pos, (__u64)pos + count, + io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append); + io->u.ci_rw.crw_pos = pos; + io->u.ci_rw.crw_count = count; + RETURN(cl_io_init(env, io, iot, io->ci_obj)); +} +EXPORT_SYMBOL(cl_io_rw_init); + +#ifdef HAVE_LIST_CMP_FUNC_T +static int cl_lock_descr_cmp(void *priv, + const struct list_head *a, + const struct list_head *b) +#else /* !HAVE_LIST_CMP_FUNC_T */ +static int cl_lock_descr_cmp(void *priv, + struct list_head *a, struct list_head *b) +#endif /* HAVE_LIST_CMP_FUNC_T */ +{ + const struct cl_io_lock_link *l0 = list_entry(a, struct cl_io_lock_link, + cill_linkage); + const struct cl_io_lock_link *l1 = list_entry(b, struct cl_io_lock_link, + cill_linkage); + const struct cl_lock_descr *d0 = &l0->cill_descr; + const struct cl_lock_descr *d1 = &l1->cill_descr; + + return lu_fid_cmp(lu_object_fid(&d0->cld_obj->co_lu), + lu_object_fid(&d1->cld_obj->co_lu)); +} + +static void cl_lock_descr_merge(struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + d0->cld_start = min(d0->cld_start, d1->cld_start); + d0->cld_end = max(d0->cld_end, d1->cld_end); + + if (d1->cld_mode == CLM_WRITE && d0->cld_mode != CLM_WRITE) + d0->cld_mode = CLM_WRITE; + + if (d1->cld_mode == CLM_GROUP && d0->cld_mode != CLM_GROUP) + d0->cld_mode = CLM_GROUP; +} + +static int cl_lockset_merge(const struct cl_lockset *set, + const struct cl_lock_descr *need) +{ + struct cl_io_lock_link *scan; + + ENTRY; + list_for_each_entry(scan, &set->cls_todo, cill_linkage) { + if (!cl_object_same(scan->cill_descr.cld_obj, need->cld_obj)) + continue; + + /* Merge locks for the same object because ldlm lock server + * may expand the lock extent, otherwise there is a deadlock + * case if two conflicted locks are queueud for the same object + * and lock server expands one lock to overlap the another. + * The side effect is that it can generate a multi-stripe lock + * that may cause casacading problem */ + cl_lock_descr_merge(&scan->cill_descr, need); + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + scan->cill_descr.cld_mode, scan->cill_descr.cld_start, + scan->cill_descr.cld_end); + RETURN(+1); + } + RETURN(0); +} + +static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io, + struct cl_lockset *set) +{ + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + int result; + + ENTRY; + result = 0; + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) { + result = cl_lock_request(env, io, &link->cill_lock); + if (result < 0) + break; + + list_move(&link->cill_linkage, &set->cls_done); + } + RETURN(result); +} + +/** + * Takes locks necessary for the current iteration of io. + * + * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required + * by layers for the current iteration. Then sort locks (to avoid dead-locks), + * and acquire them. 
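As context for cl_io_lock() and cl_io_lock_alloc_add() (defined further down), here is a brief editorial sketch, not part of the patch, of how a layer's ->cio_lock() method typically queues a lock for the current iteration. The function name, object pointer and page-index range are placeholders.

static int example_io_lock(const struct lu_env *env, struct cl_io *io,
			   struct cl_object *obj, pgoff_t start, pgoff_t end)
{
	struct cl_lock_descr descr = {
		.cld_obj   = obj,
		.cld_start = start,
		.cld_end   = end,
		.cld_mode  = io->ci_type == CIT_WRITE ? CLM_WRITE : CLM_READ,
	};

	/* returns +1 if merged into an already queued request for the same
	 * object, 0 if a new lock was queued, negative errno on failure */
	return cl_io_lock_alloc_add(env, io, &descr);
}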
+ */ +int cl_io_lock(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IT_STARTED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_lock == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan); + if (result != 0) + break; + } + if (result == 0) { + /* + * Sort locks in lexicographical order of their (fid, + * start-offset) pairs to avoid deadlocks. + */ + list_sort(NULL, &io->ci_lockset.cls_todo, cl_lock_descr_cmp); + result = cl_lockset_lock(env, io, &io->ci_lockset); + } + if (result != 0) + cl_io_unlock(env, io); + else + io->ci_state = CIS_LOCKED; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock); + +/** + * Release locks takes by io. + */ +void cl_io_unlock(const struct lu_env *env, struct cl_io *io) +{ + struct cl_lockset *set; + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + const struct cl_io_slice *scan; + + LASSERT(cl_io_is_loopable(io)); + LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + set = &io->ci_lockset; + + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) { + list_del_init(&link->cill_linkage); + if (link->cill_fini != NULL) + link->cill_fini(env, link); + } + + list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) { + list_del_init(&link->cill_linkage); + cl_lock_release(env, &link->cill_lock); + if (link->cill_fini != NULL) + link->cill_fini(env, link); + } + + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL) + scan->cis_iop->op[io->ci_type].cio_unlock(env, scan); + } + io->ci_state = CIS_UNLOCKED; + EXIT; +} +EXPORT_SYMBOL(cl_io_unlock); + +/** + * Prepares next iteration of io. + * + * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give + * layers a chance to modify io parameters, e.g., so that lov can restrict io + * to a single stripe. + */ +int cl_io_iter_init(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + result = 0; + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_iter_init(env, + scan); + if (result != 0) + break; + } + if (result == 0) + io->ci_state = CIS_IT_STARTED; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_iter_init); + +/** + * Finalizes io iteration. + * + * Calls cl_io_operations::cio_iter_fini() bottom-to-top. + */ +void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state <= CIS_IT_STARTED || + io->ci_state > CIS_IO_FINISHED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL) + scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan); + } + io->ci_state = CIS_IT_ENDED; + EXIT; +} +EXPORT_SYMBOL(cl_io_iter_fini); + +/** + * Records that read or write io progressed \a nob bytes forward. 
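To show how the entry points above fit together from a caller's point of view, here is an editorial sketch, not part of the patch, of the usual init/loop/fini sequence around a write; cl_io_loop() is defined further down in this file. The function name is invented, io is assumed to be a zero-initialized cl_io supplied by the caller (in practice it comes from the env), and obj is the file's top cl_object.

static int example_write(const struct lu_env *env, struct cl_io *io,
			 struct cl_object *obj, loff_t pos, size_t count)
{
	int rc;

	io->ci_obj = obj;
	rc = cl_io_rw_init(env, io, CIT_WRITE, pos, count);
	if (rc == 0)
		rc = cl_io_loop(env, io);	/* iterate: lock, start, end, unlock */

	/* cl_io_fini() must be called no matter what the init step returned */
	cl_io_fini(env, io);
	return rc;
}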
+ */ +void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) +{ + const struct cl_io_slice *scan; + + ENTRY; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); + + io->u.ci_rw.crw_pos += nob; + io->u.ci_rw.crw_count -= nob; + + /* layers have to be notified. */ + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) + scan->cis_iop->op[io->ci_type].cio_advance(env, scan, + nob); + } + EXIT; +} + +/** + * Adds a lock to a lockset. + */ +int cl_io_lock_add(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + int result; + + ENTRY; + if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr)) + result = +1; + else { + list_add(&link->cill_linkage, &io->ci_lockset.cls_todo); + result = 0; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_add); + +static void cl_free_io_lock_link(const struct lu_env *env, + struct cl_io_lock_link *link) +{ + OBD_FREE_PTR(link); +} + +/** + * Allocates new lock link, and uses it to add a lock to a lockset. + */ +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr) +{ + struct cl_io_lock_link *link; + int result; + + ENTRY; + OBD_ALLOC_PTR(link); + if (link != NULL) { + link->cill_descr = *descr; + link->cill_fini = cl_free_io_lock_link; + result = cl_io_lock_add(env, io, link); + if (result) /* lock match */ + link->cill_fini(env, link); + } else + result = -ENOMEM; + + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_alloc_add); + +/** + * Starts io by calling cl_io_operations::cio_start() top-to-bottom. + */ +int cl_io_start(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_state = CIS_IO_GOING; + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_start == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_start(env, scan); + if (result != 0) + break; + } + if (result >= 0) + result = 0; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_start); + +/** + * Wait until current io iteration is finished by calling + * cl_io_operations::cio_end() bottom-to-top. + */ +void cl_io_end(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IO_GOING); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_end != NULL) + scan->cis_iop->op[io->ci_type].cio_end(env, scan); + /* TODO: error handling. 
*/ + } + io->ci_state = CIS_IO_FINISHED; + EXIT; +} +EXPORT_SYMBOL(cl_io_end); + +/** + * Called by read io, to decide the readahead extent + * + * \see cl_io_operations::cio_read_ahead() + */ +int cl_io_read_ahead(const struct lu_env *env, struct cl_io *io, + pgoff_t start, struct cl_read_ahead *ra) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_READ || + io->ci_type == CIT_FAULT || + io->ci_type == CIT_WRITE); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->cio_read_ahead == NULL) + continue; + + result = scan->cis_iop->cio_read_ahead(env, scan, start, ra); + if (result != 0) + break; + } + RETURN(result > 0 ? 0 : result); +} +EXPORT_SYMBOL(cl_io_read_ahead); + +/** + * Called before io start, to reserve enough LRU slots to avoid + * deadlock. + * + * \see cl_io_operations::cio_lru_reserve() + */ +int cl_io_lru_reserve(const struct lu_env *env, struct cl_io *io, + loff_t pos, size_t bytes) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->cio_lru_reserve) { + result = scan->cis_iop->cio_lru_reserve(env, scan, + pos, bytes); + if (result) + break; + } + } + + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lru_reserve); + +/** + * Commit a list of contiguous pages into writeback cache. + * + * \returns 0 if all pages committed, or errcode if error occurred. + * \see cl_io_operations::cio_commit_async() + */ +int cl_io_commit_async(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb) +{ + const struct cl_io_slice *scan; + int result = 0; + ENTRY; + + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->cio_commit_async == NULL) + continue; + result = scan->cis_iop->cio_commit_async(env, scan, queue, + from, to, cb); + if (result != 0) + break; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_commit_async); + +void cl_io_extent_release(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + ENTRY; + + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->cio_extent_release == NULL) + continue; + scan->cis_iop->cio_extent_release(env, scan); + } + EXIT; +} +EXPORT_SYMBOL(cl_io_extent_release); + +/** + * Submits a list of pages for immediate io. + * + * After the function gets returned, The submitted pages are moved to + * queue->c2_qout queue, and queue->c2_qin contain both the pages don't need + * to be submitted, and the pages are errant to submit. + * + * \returns 0 if at least one page was submitted, error code otherwise. + * \see cl_io_operations::cio_submit() + */ +int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io, + enum cl_req_type crt, struct cl_2queue *queue) +{ + const struct cl_io_slice *scan; + int result = 0; + ENTRY; + + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->cio_submit == NULL) + continue; + result = scan->cis_iop->cio_submit(env, scan, crt, queue); + if (result != 0) + break; + } + /* + * If ->cio_submit() failed, no pages were sent. 
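+ * (editorial note: hence the assertion immediately below that the
+ * out-queue is empty whenever a non-zero result is returned.)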
+ */ + LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages))); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_submit_rw); + +/** + * Submit a sync_io and wait for the IO to be finished, or error happens. + * If \a timeout is zero, it means to wait for the IO unconditionally. + */ +int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout) +{ + struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor; + struct cl_page *pg; + int rc; + ENTRY; + + cl_page_list_for_each(pg, &queue->c2_qin) { + LASSERT(pg->cp_sync_io == NULL); + pg->cp_sync_io = anchor; + } + + cl_sync_io_init(anchor, queue->c2_qin.pl_nr); + rc = cl_io_submit_rw(env, io, iot, queue); + if (rc == 0) { + /* + * If some pages weren't sent for any reason (e.g., + * read found up-to-date pages in the cache, or write found + * clean pages), count them as completed to avoid infinite + * wait. + */ + cl_page_list_for_each(pg, &queue->c2_qin) { + pg->cp_sync_io = NULL; + cl_sync_io_note(env, anchor, 1); + } + + /* wait for the IO to be finished. */ + rc = cl_sync_io_wait(env, anchor, timeout); + cl_page_list_assume(env, io, &queue->c2_qout); + } else { + LASSERT(list_empty(&queue->c2_qout.pl_pages)); + cl_page_list_for_each(pg, &queue->c2_qin) + pg->cp_sync_io = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(cl_io_submit_sync); + +/** + * Main io loop. + * + * Pumps io through iterations calling + * + * - cl_io_iter_init() + * + * - cl_io_lock() + * + * - cl_io_start() + * + * - cl_io_end() + * + * - cl_io_unlock() + * + * - cl_io_iter_fini() + * + * repeatedly until there is no more io to do. + */ +int cl_io_loop(const struct lu_env *env, struct cl_io *io) +{ + int result = 0; + int rc = 0; + + LINVRNT(cl_io_is_loopable(io)); + ENTRY; + + do { + size_t nob; + + io->ci_continue = 0; + result = cl_io_iter_init(env, io); + if (result == 0) { + nob = io->ci_nob; + result = cl_io_lock(env, io); + if (result == 0) { + /* + * Notify layers that locks has been taken, + * and do actual i/o. + * + * - llite: kms, short read; + * - llite: generic_file_read(); + */ + result = cl_io_start(env, io); + /* + * Send any remaining pending + * io, etc. + * + ** - llite: ll_rw_stats_tally. + */ + cl_io_end(env, io); + cl_io_unlock(env, io); + cl_io_rw_advance(env, io, io->ci_nob - nob); + } + } + cl_io_iter_fini(env, io); + if (result) + rc = result; + } while ((result == 0 || result == -EIOCBQUEUED) && + io->ci_continue); + + if (rc && !result) + result = rc; + + if (result == -EAGAIN && io->ci_ndelay) { + io->ci_need_restart = 1; + result = 0; + } + + if (result == 0) + result = io->ci_result; + RETURN(result < 0 ? result : 0); +} +EXPORT_SYMBOL(cl_io_loop); + +/** + * Adds io slice to the cl_io. + * + * This is called by cl_object_operations::coo_io_init() methods to add a + * per-layer state to the io. New state is added at the end of + * cl_io::ci_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() + */ +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, + const struct cl_io_operations *ops) +{ + struct list_head *linkage = &slice->cis_linkage; + + LASSERT((linkage->prev == NULL && linkage->next == NULL) || + list_empty(linkage)); + ENTRY; + + list_add_tail(linkage, &io->ci_layers); + slice->cis_io = io; + slice->cis_obj = obj; + slice->cis_iop = ops; + EXIT; +} +EXPORT_SYMBOL(cl_io_slice_add); + + +/** + * Initializes page list. 
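Before the page-list helpers, an editorial sketch, not part of the patch, of how they combine with cl_io_submit_sync() above and the cl_2queue helpers defined below to read a single page synchronously. The function name is invented, the page is assumed to already be owned by the given io, and a timeout of 0 means wait without limit.

static int example_read_one_page(const struct lu_env *env, struct cl_io *io,
				 struct cl_page *page)
{
	struct cl_2queue queue;
	int rc;

	cl_2queue_init_page(&queue, page);	/* page goes onto c2_qin */
	rc = cl_io_submit_sync(env, io, CRT_READ, &queue, 0);
	/* submitted pages come back on c2_qout and are owned again */
	cl_2queue_disown(env, io, &queue);
	cl_2queue_fini(env, &queue);
	return rc;
}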
+ */ +void cl_page_list_init(struct cl_page_list *plist) +{ + ENTRY; + plist->pl_nr = 0; + INIT_LIST_HEAD(&plist->pl_pages); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_init); + +/** + * Adds a page to a page list. + */ +void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page, + bool get_ref) +{ + ENTRY; + /* it would be better to check that page is owned by "current" io, but + * it is not passed here. */ + LASSERT(page->cp_owner != NULL); + + LASSERT(list_empty(&page->cp_batch)); + list_add_tail(&page->cp_batch, &plist->pl_pages); + ++plist->pl_nr; + lu_ref_add_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); + if (get_ref) + cl_page_get(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_add); + +/** + * Removes a page from a page list. + */ +void cl_page_list_del(const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page) +{ + LASSERT(plist->pl_nr > 0); + LASSERT(cl_page_is_vmlocked(env, page)); + + ENTRY; + list_del_init(&page->cp_batch); + --plist->pl_nr; + lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); + cl_page_put(env, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_del); + +/** + * Moves a page from one page list to another. + */ +void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page) +{ + LASSERT(src->pl_nr > 0); + + ENTRY; + list_move_tail(&page->cp_batch, &dst->pl_pages); + --src->pl_nr; + ++dst->pl_nr; + lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, "queue", + src, dst); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_move); + +/** + * Moves a page from one page list to the head of another list. + */ +void cl_page_list_move_head(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page) +{ + LASSERT(src->pl_nr > 0); + + ENTRY; + list_move(&page->cp_batch, &dst->pl_pages); + --src->pl_nr; + ++dst->pl_nr; + lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, "queue", + src, dst); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_move_head); + +/** + * splice the cl_page_list, just as list head does + */ +void cl_page_list_splice(struct cl_page_list *src, struct cl_page_list *dst) +{ +#ifdef CONFIG_LUSTRE_DEBUG_LU_REF + struct cl_page *page; + struct cl_page *tmp; + + ENTRY; + cl_page_list_for_each_safe(page, tmp, src) + lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, + "queue", src, dst); +#else + ENTRY; +#endif + dst->pl_nr += src->pl_nr; + src->pl_nr = 0; + list_splice_tail_init(&src->pl_pages, &dst->pl_pages); + + EXIT; +} +EXPORT_SYMBOL(cl_page_list_splice); + +/** + * Disowns pages in a queue. + */ +void cl_page_list_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(plist->pl_nr > 0); + + list_del_init(&page->cp_batch); + --plist->pl_nr; + /* + * cl_page_disown0 rather than usual cl_page_disown() is used, + * because pages are possibly in CPS_FREEING state already due + * to the call to cl_page_list_discard(). + */ + /* + * XXX cl_page_disown0() will fail if page is not locked. + */ + cl_page_disown0(env, io, page); + lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", + plist); + cl_page_put(env, page); + } + EXIT; +} +EXPORT_SYMBOL(cl_page_list_disown); + +/** + * Releases pages from queue. 
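An editorial sketch, not part of the patch, of how a layer's ->coo_io_init() method attaches its per-layer state with cl_io_slice_add() above. The slice type, function name and ops table are placeholders; the slice and ops are passed in here for brevity, whereas real layers fetch them from the environment.

struct example_io_slice {
	struct cl_io_slice eis_cl;
	/* per-layer io state would live here */
};

static int example_coo_io_init(const struct lu_env *env, struct cl_object *obj,
			       struct cl_io *io, struct example_io_slice *eis,
			       const struct cl_io_operations *ops)
{
	/* hooks eis into io->ci_layers so its cio_* methods are called
	 * from cl_io_lock()/cl_io_start()/cl_io_end() and friends */
	cl_io_slice_add(io, &eis->eis_cl, obj, ops);
	return 0;
}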
+ */ +void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) + cl_page_list_del(env, plist, page); + LASSERT(plist->pl_nr == 0); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_fini); + +/** + * Assumes all pages in a queue. + */ +void cl_page_list_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + + + cl_page_list_for_each(page, plist) + cl_page_assume(env, io, page); +} + +/** + * Discards all pages in a queue. + */ +void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + + ENTRY; + cl_page_list_for_each(page, plist) + cl_page_discard(env, io, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_discard); + +/** + * Initialize dual page queue. + */ +void cl_2queue_init(struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_init(&queue->c2_qin); + cl_page_list_init(&queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init); + +/** + * Add a page to the incoming page list of 2-queue. + */ +void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page, bool get_ref) +{ + cl_page_list_add(&queue->c2_qin, page, get_ref); +} +EXPORT_SYMBOL(cl_2queue_add); + +/** + * Disown pages in both lists of a 2-queue. + */ +void cl_2queue_disown(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_disown(env, io, &queue->c2_qin); + cl_page_list_disown(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_disown); + +/** + * Discard (truncate) pages in both lists of a 2-queue. + */ +void cl_2queue_discard(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_discard(env, io, &queue->c2_qin); + cl_page_list_discard(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_discard); + +/** + * Assume to own the pages in cl_2queue + */ +void cl_2queue_assume(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + cl_page_list_assume(env, io, &queue->c2_qin); + cl_page_list_assume(env, io, &queue->c2_qout); +} + +/** + * Finalize both page lists of a 2-queue. + */ +void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_fini(env, &queue->c2_qout); + cl_page_list_fini(env, &queue->c2_qin); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_fini); + +/** + * Initialize a 2-queue to contain \a page in its incoming page list. + */ +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page) +{ + ENTRY; + cl_2queue_init(queue); + cl_2queue_add(queue, page, true); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init_page); + +/** + * Returns top-level io. + * + * \see cl_object_top() + */ +struct cl_io *cl_io_top(struct cl_io *io) +{ + ENTRY; + while (io->ci_parent != NULL) + io = io->ci_parent; + RETURN(io); +} +EXPORT_SYMBOL(cl_io_top); + +/** + * Prints human readable representation of \a io to the \a f. + */ +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io) +{ +} + +/** + * Fills in attributes that are passed to server together with transfer. Only + * attributes from \a flags may be touched. This can be called multiple times + * for the same request. 
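Ahead of the cl_sync_io helpers that follow, an editorial sketch, not part of the patch, of the anchor pattern they implement: initialize with one extra reference so the counter cannot reach zero while work is still being added, bump it once per page in flight, then drop the initial reference and wait. The function name is invented and the page submission itself is elided.

static int example_sync_io(const struct lu_env *env, int npages)
{
	struct cl_sync_io anchor;
	int i;

	cl_sync_io_init_notify(&anchor, 1, NULL, NULL);

	for (i = 0; i < npages; i++) {
		atomic_add(1, &anchor.csi_sync_nr);
		/* ... submit one page; its completion handler must call
		 * cl_sync_io_note(env, &anchor, ioret) ... */
	}

	cl_sync_io_note(env, &anchor, 0);		/* drop the initial ref */
	return cl_sync_io_wait(env, &anchor, 0);	/* 0: wait without limit */
}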
+ */ +void cl_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct cl_object *scan; + ENTRY; + + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_req_attr_set != NULL) + scan->co_ops->coo_req_attr_set(env, scan, attr); + } + EXIT; +} +EXPORT_SYMBOL(cl_req_attr_set); + +/** + * Initialize synchronous io wait \a anchor for \a nr pages with optional + * \a end handler. + * \param anchor owned by caller, initialized here. + * \param nr number of pages initially pending in sync. + * \param end optional callback called on sync_io completion, can be used to + * trigger erasure coding, integrity, dedupe, or similar operation. + * \a end is called with a spinlock on anchor->csi_waitq.lock + */ + +void cl_sync_io_init_notify(struct cl_sync_io *anchor, int nr, + void *dio_aio, cl_sync_io_end_t *end) +{ + ENTRY; + memset(anchor, 0, sizeof(*anchor)); + init_waitqueue_head(&anchor->csi_waitq); + atomic_set(&anchor->csi_sync_nr, nr); + anchor->csi_sync_rc = 0; + anchor->csi_end_io = end; + anchor->csi_dio_aio = dio_aio; + EXIT; +} +EXPORT_SYMBOL(cl_sync_io_init_notify); + +/** + * Wait until all IO completes. Transfer completion routine has to call + * cl_sync_io_note() for every entity. + */ +int cl_sync_io_wait(const struct lu_env *env, struct cl_sync_io *anchor, + long timeout) +{ + int rc = 0; + ENTRY; + + LASSERT(timeout >= 0); + + if (timeout > 0 && + wait_event_idle_timeout(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + cfs_time_seconds(timeout)) == 0) { + rc = -ETIMEDOUT; + CERROR("IO failed: %d, still wait for %d remaining entries\n", + rc, atomic_read(&anchor->csi_sync_nr)); + } + + wait_event_idle(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0); + if (!rc) + rc = anchor->csi_sync_rc; + + /* We take the lock to ensure that cl_sync_io_note() has finished */ + spin_lock(&anchor->csi_waitq.lock); + LASSERT(atomic_read(&anchor->csi_sync_nr) == 0); + spin_unlock(&anchor->csi_waitq.lock); + + RETURN(rc); +} +EXPORT_SYMBOL(cl_sync_io_wait); + +static inline void dio_aio_complete(struct kiocb *iocb, ssize_t res) +{ +#ifdef HAVE_AIO_COMPLETE + aio_complete(iocb, res, 0); +#else + if (iocb->ki_complete) +# ifdef HAVE_KIOCB_COMPLETE_2ARGS + iocb->ki_complete(iocb, res); +# else + iocb->ki_complete(iocb, res, 0); +# endif +#endif +} + +static void cl_dio_aio_end(const struct lu_env *env, struct cl_sync_io *anchor) +{ + struct cl_dio_aio *aio = container_of(anchor, typeof(*aio), cda_sync); + ssize_t ret = anchor->csi_sync_rc; + + ENTRY; + + if (!aio->cda_no_aio_complete) + dio_aio_complete(aio->cda_iocb, ret ?: aio->cda_bytes); + + EXIT; +} + +static void cl_sub_dio_end(const struct lu_env *env, struct cl_sync_io *anchor) +{ + struct cl_sub_dio *sdio = container_of(anchor, typeof(*sdio), csd_sync); + ssize_t ret = anchor->csi_sync_rc; + + ENTRY; + + /* release pages */ + while (sdio->csd_pages.pl_nr > 0) { + struct cl_page *page = cl_page_list_first(&sdio->csd_pages); + + cl_page_delete(env, page); + cl_page_list_del(env, &sdio->csd_pages, page); + } + + ll_release_user_pages(sdio->csd_dio_pages.ldp_pages, + sdio->csd_dio_pages.ldp_count); + cl_sync_io_note(env, &sdio->csd_ll_aio->cda_sync, ret); + + EXIT; +} + +struct cl_dio_aio *cl_dio_aio_alloc(struct kiocb *iocb, struct cl_object *obj, + bool is_aio) +{ + struct cl_dio_aio *aio; + + OBD_SLAB_ALLOC_PTR_GFP(aio, cl_dio_aio_kmem, GFP_NOFS); + if (aio != NULL) { + /* + * Hold one ref so that it won't be released until + * every page is added.
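+ * The anchor therefore starts with csi_sync_nr == 1; each sub-I/O adds a
+ * further reference (see cl_sub_dio_alloc()) and the initial reference is
+ * only dropped by cl_sync_io_note() once everything has been queued
+ * (see cl_sync_io_wait_recycle()).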
+ */ + cl_sync_io_init_notify(&aio->cda_sync, 1, aio, cl_dio_aio_end); + aio->cda_iocb = iocb; + aio->cda_no_aio_complete = !is_aio; + /* if this is true AIO, the memory is freed by the last call + * to cl_sync_io_note (when all the I/O is complete), because + * no one is waiting (in the kernel) for this to complete + * + * in other cases, the last user is cl_sync_io_wait, and in + * that case, the creator frees the struct after that call + */ + aio->cda_creator_free = !is_aio; + + cl_object_get(obj); + aio->cda_obj = obj; + } + return aio; +} +EXPORT_SYMBOL(cl_dio_aio_alloc); + +struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio, bool sync) +{ + struct cl_sub_dio *sdio; + + OBD_SLAB_ALLOC_PTR_GFP(sdio, cl_sub_dio_kmem, GFP_NOFS); + if (sdio != NULL) { + /* + * Hold one ref so that it won't be released until + * every pages is added. + */ + cl_sync_io_init_notify(&sdio->csd_sync, 1, sdio, + cl_sub_dio_end); + cl_page_list_init(&sdio->csd_pages); + + sdio->csd_ll_aio = ll_aio; + atomic_add(1, &ll_aio->cda_sync.csi_sync_nr); + sdio->csd_creator_free = sync; + } + return sdio; +} +EXPORT_SYMBOL(cl_sub_dio_alloc); + +void cl_dio_aio_free(const struct lu_env *env, struct cl_dio_aio *aio) +{ + if (aio) { + cl_object_put(env, aio->cda_obj); + OBD_SLAB_FREE_PTR(aio, cl_dio_aio_kmem); + } +} +EXPORT_SYMBOL(cl_dio_aio_free); + +void cl_sub_dio_free(struct cl_sub_dio *sdio) +{ + if (sdio) + OBD_SLAB_FREE_PTR(sdio, cl_sub_dio_kmem); +} +EXPORT_SYMBOL(cl_sub_dio_free); +/* + * ll_release_user_pages - tear down page struct array + * @pages: array of page struct pointers underlying target buffer + */ +void ll_release_user_pages(struct page **pages, int npages) +{ + int i; + + if (npages == 0) { + LASSERT(!pages); + return; + } + + for (i = 0; i < npages; i++) { + if (!pages[i]) + break; + put_page(pages[i]); + } + +#if defined(HAVE_DIO_ITER) + kvfree(pages); +#else + OBD_FREE_PTR_ARRAY_LARGE(pages, npages); +#endif +} +EXPORT_SYMBOL(ll_release_user_pages); + +/** + * Indicate that transfer of a single page completed. + */ +void cl_sync_io_note(const struct lu_env *env, struct cl_sync_io *anchor, + int ioret) +{ + ENTRY; + + if (anchor->csi_sync_rc == 0 && ioret < 0) + anchor->csi_sync_rc = ioret; + /* + * Synchronous IO done without releasing page lock (e.g., as a part of + * ->{prepare,commit}_write(). Completion is used to signal the end of + * IO. + */ + LASSERT(atomic_read(&anchor->csi_sync_nr) > 0); + if (atomic_dec_and_lock(&anchor->csi_sync_nr, + &anchor->csi_waitq.lock)) { + struct cl_sub_dio *sub_dio_aio = NULL; + struct cl_dio_aio *dio_aio = NULL; + void *csi_dio_aio = NULL; + bool creator_free = true; + + cl_sync_io_end_t *end_io = anchor->csi_end_io; + + /* + * Holding the lock across both the decrement and + * the wakeup ensures cl_sync_io_wait() doesn't complete + * before the wakeup completes and the contents of + * of anchor become unsafe to access as the owner is free + * to immediately reclaim anchor when cl_sync_io_wait() + * completes. 
+ */ + wake_up_locked(&anchor->csi_waitq); + if (end_io) + end_io(env, anchor); + + csi_dio_aio = anchor->csi_dio_aio; + sub_dio_aio = csi_dio_aio; + dio_aio = csi_dio_aio; + + if (csi_dio_aio && end_io == cl_dio_aio_end) + creator_free = dio_aio->cda_creator_free; + else if (csi_dio_aio && end_io == cl_sub_dio_end) + creator_free = sub_dio_aio->csd_creator_free; + + spin_unlock(&anchor->csi_waitq.lock); + + if (csi_dio_aio) { + if (end_io == cl_dio_aio_end) { + if (!creator_free) + cl_dio_aio_free(env, dio_aio); + } else if (end_io == cl_sub_dio_end) { + if (!creator_free) + cl_sub_dio_free(sub_dio_aio); + } + } + } + EXIT; +} +EXPORT_SYMBOL(cl_sync_io_note); + +int cl_sync_io_wait_recycle(const struct lu_env *env, struct cl_sync_io *anchor, + long timeout, int ioret) +{ + int rc = 0; + + /* + * @anchor was inited as 1 to prevent end_io to be + * called before we add all pages for IO, so drop + * one extra reference to make sure we could wait + * count to be zero. + */ + cl_sync_io_note(env, anchor, ioret); + /* Wait for completion of normal dio. + * This replaces the EIOCBQEUED return from the DIO/AIO + * path, and this is where AIO and DIO implementations + * split. + */ + rc = cl_sync_io_wait(env, anchor, timeout); + /** + * One extra reference again, as if @anchor is + * reused we assume it as 1 before using. + */ + atomic_add(1, &anchor->csi_sync_nr); + + return rc; +} +EXPORT_SYMBOL(cl_sync_io_wait_recycle); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c new file mode 100644 index 0000000000000..6dd0663161649 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c @@ -0,0 +1,289 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Client Extent Lock. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include "cl_internal.h" + +static void cl_lock_trace0(int level, const struct lu_env *env, + const char *prefix, const struct cl_lock *lock, + const char *func, const int line) +{ + struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj); + CDEBUG(level, "%s: %p (%p/%d) at %s():%d\n", + prefix, lock, env, h->coh_nesting, func, line); +} +#define cl_lock_trace(level, env, prefix, lock) \ + cl_lock_trace0(level, env, prefix, lock, __FUNCTION__, __LINE__) + +/** + * Adds lock slice to the compound lock. 
+ * + * This is called by cl_object_operations::coo_lock_init() methods to add a + * per-layer state to the lock. New state is added at the end of + * cl_lock::cll_layers list, that is, it is at the bottom of the stack. + * + * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops) +{ + ENTRY; + slice->cls_lock = lock; + list_add_tail(&slice->cls_linkage, &lock->cll_layers); + slice->cls_obj = obj; + slice->cls_ops = ops; + EXIT; +} +EXPORT_SYMBOL(cl_lock_slice_add); + +void cl_lock_fini(const struct lu_env *env, struct cl_lock *lock) +{ + ENTRY; + + cl_lock_trace(D_DLMTRACE, env, "destroy lock", lock); + + while (!list_empty(&lock->cll_layers)) { + struct cl_lock_slice *slice; + + slice = list_entry(lock->cll_layers.next, + struct cl_lock_slice, cls_linkage); + list_del_init(lock->cll_layers.next); + slice->cls_ops->clo_fini(env, slice); + } + POISON(lock, 0x5a, sizeof(*lock)); + EXIT; +} +EXPORT_SYMBOL(cl_lock_fini); + +int cl_lock_init(const struct lu_env *env, struct cl_lock *lock, + const struct cl_io *io) +{ + struct cl_object *obj = lock->cll_descr.cld_obj; + struct cl_object *scan; + int result = 0; + ENTRY; + + /* Make sure cl_lock::cll_descr is initialized. */ + LASSERT(obj != NULL); + + INIT_LIST_HEAD(&lock->cll_layers); + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_lock_init != NULL) + result = scan->co_ops->coo_lock_init(env, scan, lock, + io); + + if (result != 0) { + cl_lock_fini(env, lock); + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_init); + +/** + * Returns a slice with a lock, corresponding to the given layer in the + * device stack. + * + * \see cl_page_at() + */ +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype) +{ + const struct cl_lock_slice *slice; + + ENTRY; + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + RETURN(NULL); +} +EXPORT_SYMBOL(cl_lock_at); + +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + ENTRY; + + cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock); + list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_cancel != NULL) + slice->cls_ops->clo_cancel(env, slice); + } + + EXIT; +} +EXPORT_SYMBOL(cl_lock_cancel); + +/** + * Enqueue a lock. + * \param anchor: if we need to wait for resources before getting the lock, + * use @anchor for the purpose. + * \retval 0 enqueue successfully + * \retval <0 error code + */ +int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock, struct cl_sync_io *anchor) +{ + const struct cl_lock_slice *slice; + int rc = 0; + + ENTRY; + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_enqueue == NULL) + continue; + + rc = slice->cls_ops->clo_enqueue(env, slice, io, anchor); + if (rc != 0) + break; + } + RETURN(rc); +} +EXPORT_SYMBOL(cl_lock_enqueue); + +/** + * Main high-level entry point of cl_lock interface that finds existing or + * enqueues new lock matching given description. 
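+ *
+ * A typical caller fills in lock->cll_descr and then does (sketch):
+ *
+ *	rc = cl_lock_request(env, io, lock);
+ *	if (rc == 0) {
+ *		... use the extent lock ...
+ *		cl_lock_release(env, lock);
+ *	}
+ *
+ * On failure cl_lock_request() releases the lock itself, so the caller
+ * only calls cl_lock_release() after a successful request.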
+ */ +int cl_lock_request(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock) +{ + struct cl_sync_io *anchor = NULL; + __u32 enq_flags = lock->cll_descr.cld_enq_flags; + int rc; + ENTRY; + + rc = cl_lock_init(env, lock, io); + if (rc < 0) + RETURN(rc); + + if ((enq_flags & CEF_GLIMPSE) && !(enq_flags & CEF_SPECULATIVE)) { + anchor = &cl_env_info(env)->clt_anchor; + cl_sync_io_init(anchor, 1); + } + + rc = cl_lock_enqueue(env, io, lock, anchor); + + if (anchor != NULL) { + int rc2; + + /* drop the reference count held at initialization time */ + cl_sync_io_note(env, anchor, 0); + rc2 = cl_sync_io_wait(env, anchor, 0); + if (rc2 < 0 && rc == 0) + rc = rc2; + } + + if (rc < 0) + cl_lock_release(env, lock); + RETURN(rc); +} +EXPORT_SYMBOL(cl_lock_request); + +/** + * Releases a hold and a reference on a lock, obtained by cl_lock_hold(). + */ +void cl_lock_release(const struct lu_env *env, struct cl_lock *lock) +{ + ENTRY; + + cl_lock_trace(D_DLMTRACE, env, "release lock", lock); + cl_lock_cancel(env, lock); + cl_lock_fini(env, lock); + EXIT; +} +EXPORT_SYMBOL(cl_lock_release); + +const char *cl_lock_mode_name(const enum cl_lock_mode mode) +{ + static const char * const names[] = { + [CLM_READ] = "R", + [CLM_WRITE] = "W", + [CLM_GROUP] = "G" + }; + BUILD_BUG_ON(CLM_MAX != ARRAY_SIZE(names)); + return names[mode]; +} +EXPORT_SYMBOL(cl_lock_mode_name); + +/** + * Prints human readable representation of a lock description. + */ +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr) +{ + const struct lu_fid *fid; + + fid = lu_object_fid(&descr->cld_obj->co_lu); + (*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid)); +} +EXPORT_SYMBOL(cl_lock_descr_print); + +/** + * Prints human readable representation of \a lock to the \a f. + */ +void cl_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + + (*printer)(env, cookie, "lock@%p", lock); + cl_lock_descr_print(env, cookie, printer, &lock->cll_descr); + (*printer)(env, cookie, " {\n"); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + (*printer)(env, cookie, " %s@%p: ", + slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name, + slice); + if (slice->cls_ops->clo_print != NULL) + slice->cls_ops->clo_print(env, cookie, printer, slice); + (*printer)(env, cookie, "\n"); + } + (*printer)(env, cookie, "} lock@%p\n", lock); +} +EXPORT_SYMBOL(cl_lock_print); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c new file mode 100644 index 0000000000000..8c29b5a164950 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c @@ -0,0 +1,1118 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Client Lustre Object. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +/* + * Locking. + * + * i_mutex + * PG_locked + * ->coh_attr_guard + * ->ls_guard + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include /* for cfs_hash stuff */ +#include +#include +#include "cl_internal.h" + +static struct kmem_cache *cl_env_kmem; +struct kmem_cache *cl_dio_aio_kmem; +struct kmem_cache *cl_sub_dio_kmem; +struct kmem_cache *cl_page_kmem_array[16]; +unsigned short cl_page_kmem_size_array[16]; + +/** Lock class of cl_object_header::coh_attr_guard */ +static struct lock_class_key cl_attr_guard_class; + +/** + * Initialize cl_object_header. + */ +int cl_object_header_init(struct cl_object_header *h) +{ + int result; + + ENTRY; + result = lu_object_header_init(&h->coh_lu); + if (result == 0) { + spin_lock_init(&h->coh_attr_guard); + lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class); + h->coh_page_bufsize = 0; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_header_init); + +/** + * Finalize cl_object_header. + */ +void cl_object_header_fini(struct cl_object_header *h) +{ + lu_object_header_fini(&h->coh_lu); +} + +/** + * Returns a cl_object with a given \a fid. + * + * Returns either cached or newly created object. Additional reference on the + * returned object is acquired. + * + * \see lu_object_find(), cl_page_find(), cl_lock_find() + */ +struct cl_object *cl_object_find(const struct lu_env *env, + struct cl_device *cd, const struct lu_fid *fid, + const struct cl_object_conf *c) +{ + might_sleep(); + return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu)); +} +EXPORT_SYMBOL(cl_object_find); + +/** + * Releases a reference on \a o. + * + * When last reference is released object is returned to the cache, unless + * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header. + * + * \see cl_page_put(), cl_lock_put(). + */ +void cl_object_put(const struct lu_env *env, struct cl_object *o) +{ + lu_object_put(env, &o->co_lu); +} +EXPORT_SYMBOL(cl_object_put); + +/** + * Acquire an additional reference to the object \a o. + * + * This can only be used to acquire _additional_ reference, i.e., caller + * already has to possess at least one reference to \a o before calling this. + * + * \see cl_page_get(), cl_lock_get(). + */ +void cl_object_get(struct cl_object *o) +{ + lu_object_get(&o->co_lu); +} +EXPORT_SYMBOL(cl_object_get); + +/** + * Returns the top-object for a given \a o. + * + * \see cl_io_top() + */ +struct cl_object *cl_object_top(struct cl_object *o) +{ + struct cl_object_header *hdr = cl_object_header(o); + struct cl_object *top; + + while (hdr->coh_parent != NULL) + hdr = hdr->coh_parent; + + top = lu2cl(lu_object_top(&hdr->coh_lu)); + CDEBUG(D_TRACE, "%p -> %p\n", o, top); + return top; +} +EXPORT_SYMBOL(cl_object_top); + +/** + * Returns pointer to the lock protecting data-attributes for the given object + * \a o. + * + * Data-attributes are protected by the cl_object_header::coh_attr_guard + * spin-lock in the top-object. 
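+ *
+ * The expected calling pattern is (sketch):
+ *
+ *	cl_object_attr_lock(obj);
+ *	rc = cl_object_attr_get(env, obj, attr);
+ *	...
+ *	cl_object_attr_unlock(obj);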
+ * + * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get(). + */ +static spinlock_t *cl_object_attr_guard(struct cl_object *o) +{ + return &cl_object_header(cl_object_top(o))->coh_attr_guard; +} + +/** + * Locks data-attributes. + * + * Prevents data-attributes from changing, until lock is released by + * cl_object_attr_unlock(). This has to be called before calls to + * cl_object_attr_get(), cl_object_attr_update(). + */ +void cl_object_attr_lock(struct cl_object *o) +__acquires(cl_object_attr_guard(o)) +{ + spin_lock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_lock); + +/** + * Releases data-attributes lock, acquired by cl_object_attr_lock(). + */ +void cl_object_attr_unlock(struct cl_object *o) +__releases(cl_object_attr_guard(o)) +{ + spin_unlock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_unlock); + +/** + * Returns data-attributes of an object \a obj. + * + * Every layer is asked (by calling cl_object_operations::coo_attr_get()) + * top-to-bottom to fill in parts of \a attr that this layer is responsible + * for. + */ +int cl_object_attr_get(const struct lu_env *env, struct cl_object *top, + struct cl_attr *attr) +{ + struct cl_object *obj; + int result = 0; + + assert_spin_locked(cl_object_attr_guard(top)); + ENTRY; + + cl_object_for_each(obj, top) { + if (obj->co_ops->coo_attr_get != NULL) { + result = obj->co_ops->coo_attr_get(env, obj, attr); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_attr_get); + +/** + * Updates data-attributes of an object \a obj. + * + * Only attributes, mentioned in a validness bit-mask \a v are + * updated. Calls cl_object_operations::coo_upd_attr() on every layer, bottom + * to top. + */ +int cl_object_attr_update(const struct lu_env *env, struct cl_object *top, + const struct cl_attr *attr, unsigned v) +{ + struct cl_object *obj; + int result = 0; + + assert_spin_locked(cl_object_attr_guard(top)); + ENTRY; + + cl_object_for_each_reverse(obj, top) { + if (obj->co_ops->coo_attr_update != NULL) { + result = obj->co_ops->coo_attr_update(env, obj, attr, + v); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_attr_update); + +/** + * Notifies layers (bottom-to-top) that glimpse AST was received. + * + * Layers have to fill \a lvb fields with information that will be shipped + * back to glimpse issuer. + * + * \see cl_lock_operations::clo_glimpse() + */ +int cl_object_glimpse(const struct lu_env *env, struct cl_object *top, + struct ost_lvb *lvb) +{ + struct cl_object *obj; + int result = 0; + + ENTRY; + cl_object_for_each_reverse(obj, top) { + if (obj->co_ops->coo_glimpse != NULL) { + result = obj->co_ops->coo_glimpse(env, obj, lvb); + if (result != 0) + break; + } + } + LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top->co_lu.lo_header), + "size: %llu mtime: %llu atime: %llu " + "ctime: %llu blocks: %llu\n", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); + RETURN(result); +} +EXPORT_SYMBOL(cl_object_glimpse); + +/** + * Updates a configuration of an object \a obj. 
+ */ +int cl_conf_set(const struct lu_env *env, struct cl_object *top, + const struct cl_object_conf *conf) +{ + struct cl_object *obj; + int result = 0; + + ENTRY; + cl_object_for_each(obj, top) { + if (obj->co_ops->coo_conf_set != NULL) { + result = obj->co_ops->coo_conf_set(env, obj, conf); + if (result) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_conf_set); + +/** + * Prunes caches of pages and locks for this object. + */ +int cl_object_prune(const struct lu_env *env, struct cl_object *top) +{ + struct cl_object *obj; + int result = 0; + ENTRY; + + cl_object_for_each(obj, top) { + if (obj->co_ops->coo_prune != NULL) { + result = obj->co_ops->coo_prune(env, obj); + if (result) + break; + } + } + + RETURN(result); +} +EXPORT_SYMBOL(cl_object_prune); + +/** + * Get stripe information of this object. + */ +int cl_object_getstripe(const struct lu_env *env, struct cl_object *top, + struct lov_user_md __user *uarg, size_t size) +{ + struct cl_object *obj; + int result = 0; + ENTRY; + + cl_object_for_each(obj, top) { + if (obj->co_ops->coo_getstripe) { + result = obj->co_ops->coo_getstripe(env, obj, uarg, + size); + if (result) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_getstripe); + +/** + * Get fiemap extents from file object. + * + * \param env [in] lustre environment + * \param obj [in] file object + * \param key [in] fiemap request argument + * \param fiemap [out] fiemap extents mapping retrived + * \param buflen [in] max buffer length of @fiemap + * + * \retval 0 success + * \retval < 0 error + */ +int cl_object_fiemap(const struct lu_env *env, struct cl_object *top, + struct ll_fiemap_info_key *key, + struct fiemap *fiemap, size_t *buflen) +{ + struct cl_object *obj; + int result = 0; + ENTRY; + + cl_object_for_each(obj, top) { + if (obj->co_ops->coo_fiemap) { + result = obj->co_ops->coo_fiemap(env, obj, key, fiemap, + buflen); + if (result) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_fiemap); + +int cl_object_layout_get(const struct lu_env *env, struct cl_object *top, + struct cl_layout *cl) +{ + struct cl_object *obj; + ENTRY; + + cl_object_for_each(obj, top) { + if (obj->co_ops->coo_layout_get) + return obj->co_ops->coo_layout_get(env, obj, cl); + } + + RETURN(-EOPNOTSUPP); +} +EXPORT_SYMBOL(cl_object_layout_get); + +loff_t cl_object_maxbytes(struct cl_object *top) +{ + struct cl_object *obj; + loff_t maxbytes = LLONG_MAX; + ENTRY; + + cl_object_for_each(obj, top) { + if (obj->co_ops->coo_maxbytes) + maxbytes = min_t(loff_t, obj->co_ops->coo_maxbytes(obj), + maxbytes); + } + + RETURN(maxbytes); +} +EXPORT_SYMBOL(cl_object_maxbytes); + +int cl_object_flush(const struct lu_env *env, struct cl_object *top, + struct ldlm_lock *lock) +{ + struct cl_object *obj; + int rc = 0; + ENTRY; + + cl_object_for_each(obj, top) { + if (obj->co_ops->coo_object_flush) { + rc = obj->co_ops->coo_object_flush(env, obj, lock); + if (rc) + break; + } + } + RETURN(rc); +} +EXPORT_SYMBOL(cl_object_flush); + +/** + * Helper function removing all object locks, and marking object for + * deletion. All object pages must have been deleted at this point. + * + * This is called by cl_inode_fini() and lov_object_delete() to destroy top- + * and sub- objects respectively. 
+ */ +void cl_object_kill(const struct lu_env *env, struct cl_object *obj) +{ + struct cl_object_header *hdr = cl_object_header(obj); + + set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags); +} +EXPORT_SYMBOL(cl_object_kill); + +void cache_stats_init(struct cache_stats *cs, const char *name) +{ + int i; + + cs->cs_name = name; + for (i = 0; i < CS_NR; i++) + atomic_set(&cs->cs_stats[i], 0); +} + +static int cache_stats_print(const struct cache_stats *cs, + struct seq_file *m, int h) +{ + int i; + + /* + * lookup hit total cached create + * env: ...... ...... ...... ...... ...... + */ + if (h) { + const char *names[CS_NR] = CS_NAMES; + + seq_printf(m, "%6s", " "); + for (i = 0; i < CS_NR; i++) + seq_printf(m, "%8s", names[i]); + seq_printf(m, "\n"); + } + + seq_printf(m, "%5.5s:", cs->cs_name); + for (i = 0; i < CS_NR; i++) + seq_printf(m, "%8u", atomic_read(&cs->cs_stats[i])); + return 0; +} + +static void cl_env_percpu_refill(void); + +/** + * Initialize client site. + * + * Perform common initialization (lu_site_init()), and initialize statistical + * counters. Also perform global initializations on the first call. + */ +int cl_site_init(struct cl_site *s, struct cl_device *d) +{ + size_t i; + int result; + + result = lu_site_init(&s->cs_lu, &d->cd_lu_dev); + if (result == 0) { + cache_stats_init(&s->cs_pages, "pages"); + for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i) + atomic_set(&s->cs_pages_state[0], 0); + cl_env_percpu_refill(); + } + return result; +} +EXPORT_SYMBOL(cl_site_init); + +/** + * Finalize client site. Dual to cl_site_init(). + */ +void cl_site_fini(struct cl_site *s) +{ + lu_site_fini(&s->cs_lu); +} +EXPORT_SYMBOL(cl_site_fini); + +static struct cache_stats cl_env_stats = { + .cs_name = "envs", + .cs_stats = { ATOMIC_INIT(0), } +}; + +/** + * Outputs client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *site, struct seq_file *m) +{ + static const char *const pstate[] = { + [CPS_CACHED] = "c", + [CPS_OWNED] = "o", + [CPS_PAGEOUT] = "w", + [CPS_PAGEIN] = "r", + [CPS_FREEING] = "f" + }; + size_t i; + +/* + lookup hit total busy create +pages: ...... ...... ...... ...... ...... [...... ...... ...... ......] +locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......] + env: ...... ...... ...... ...... ...... + */ + lu_site_stats_seq_print(&site->cs_lu, m); + cache_stats_print(&site->cs_pages, m, 1); + seq_printf(m, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i) + seq_printf(m, "%s: %u ", pstate[i], + atomic_read(&site->cs_pages_state[i])); + seq_printf(m, "]\n"); + cache_stats_print(&cl_env_stats, m, 0); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(cl_site_stats_print); + +/***************************************************************************** + * + * lu_env handling on client. + * + */ + +/** + * The most efficient way is to store cl_env pointer in task specific + * structures. On Linux, it isn't easy to use task_struct->journal_info + * because Lustre code may call into other fs during memory reclaim, which + * has certain assumptions about journal_info. There are not currently any + * fields in task_struct that can be used for this purpose. + * \note As long as we use task_struct to store cl_env, we assume that once + * called into Lustre, we'll never call into the other part of the kernel + * which will use those fields in task_struct without explicitly exiting + * Lustre. 
+ * + * Since there's no space in task_struct is available, hash will be used. + * bz20044, bz22683. + */ + +static unsigned cl_envs_cached_max = 32; /* XXX: prototype: arbitrary limit + * for now. */ +static struct cl_env_cache { + rwlock_t cec_guard; + unsigned cec_count; + struct list_head cec_envs; +} *cl_envs = NULL; + +struct cl_env { + void *ce_magic; + struct lu_env ce_lu; + struct lu_context ce_ses; + + /* + * Linkage into global list of all client environments. Used for + * garbage collection. + */ + struct list_head ce_linkage; + /* + * + */ + int ce_ref; + /* + * Debugging field: address of the caller who made original + * allocation. + */ + void *ce_debug; +}; + +static void cl_env_inc(enum cache_stats_item item) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_inc(&cl_env_stats.cs_stats[item]); +#endif +} + +static void cl_env_dec(enum cache_stats_item item) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + LASSERT(atomic_read(&cl_env_stats.cs_stats[item]) > 0); + atomic_dec(&cl_env_stats.cs_stats[item]); +#endif +} + +static void cl_env_init0(struct cl_env *cle, void *debug) +{ + LASSERT(cle->ce_ref == 0); + LASSERT(cle->ce_magic == &cl_env_init0); + LASSERT(cle->ce_debug == NULL); + + cle->ce_ref = 1; + cle->ce_debug = debug; + cl_env_inc(CS_busy); +} + +static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) +{ + struct lu_env *env; + struct cl_env *cle; + + OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, GFP_NOFS); + if (cle != NULL) { + int rc; + + INIT_LIST_HEAD(&cle->ce_linkage); + cle->ce_magic = &cl_env_init0; + env = &cle->ce_lu; + rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags); + if (rc == 0) { + rc = lu_context_init(&cle->ce_ses, + LCT_SESSION | ses_tags); + if (rc == 0) { + lu_context_enter(&cle->ce_ses); + env->le_ses = &cle->ce_ses; + cl_env_init0(cle, debug); + } else + lu_env_fini(env); + } + if (rc != 0) { + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); + env = ERR_PTR(rc); + } else { + cl_env_inc(CS_create); + cl_env_inc(CS_total); + } + } else + env = ERR_PTR(-ENOMEM); + return env; +} + +static void cl_env_fini(struct cl_env *cle) +{ + cl_env_dec(CS_total); + lu_context_fini(&cle->ce_lu.le_ctx); + lu_context_fini(&cle->ce_ses); + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); +} + +static struct lu_env *cl_env_obtain(void *debug) +{ + struct cl_env *cle; + struct lu_env *env; + int cpu = get_cpu(); + + ENTRY; + + read_lock(&cl_envs[cpu].cec_guard); + LASSERT(equi(cl_envs[cpu].cec_count == 0, + list_empty(&cl_envs[cpu].cec_envs))); + if (cl_envs[cpu].cec_count > 0) { + int rc; + + cle = container_of(cl_envs[cpu].cec_envs.next, struct cl_env, + ce_linkage); + list_del_init(&cle->ce_linkage); + cl_envs[cpu].cec_count--; + read_unlock(&cl_envs[cpu].cec_guard); + put_cpu(); + + env = &cle->ce_lu; + rc = lu_env_refill(env); + if (rc == 0) { + cl_env_init0(cle, debug); + lu_context_enter(&env->le_ctx); + lu_context_enter(&cle->ce_ses); + } else { + cl_env_fini(cle); + env = ERR_PTR(rc); + } + } else { + read_unlock(&cl_envs[cpu].cec_guard); + put_cpu(); + env = cl_env_new(lu_context_tags_default, + lu_session_tags_default, debug); + } + RETURN(env); +} + +static inline struct cl_env *cl_env_container(struct lu_env *env) +{ + return container_of(env, struct cl_env, ce_lu); +} + +/** + * Returns lu_env: if there already is an environment associated with the + * current thread, it is returned, otherwise, new environment is allocated. + * + * Allocations are amortized through the global cache of environments. 
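+ *
+ * Typical usage (sketch):
+ *
+ *	__u16 refcheck;
+ *	struct lu_env *env;
+ *
+ *	env = cl_env_get(&refcheck);
+ *	if (IS_ERR(env))
+ *		return PTR_ERR(env);
+ *	...
+ *	cl_env_put(env, &refcheck);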
+ * + * \param refcheck pointer to a counter used to detect environment leaks. In + * the usual case cl_env_get() and cl_env_put() are called in the same lexical + * scope and pointer to the same integer is passed as \a refcheck. This is + * used to detect missed cl_env_put(). + * + * \see cl_env_put() + */ +struct lu_env *cl_env_get(__u16 *refcheck) +{ + struct lu_env *env; + + env = cl_env_obtain(__builtin_return_address(0)); + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + return env; +} +EXPORT_SYMBOL(cl_env_get); + +/** + * Forces an allocation of a fresh environment with given tags. + * + * \see cl_env_get() + */ +struct lu_env *cl_env_alloc(__u16 *refcheck, __u32 tags) +{ + struct lu_env *env; + + env = cl_env_new(tags, tags, __builtin_return_address(0)); + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + return env; +} +EXPORT_SYMBOL(cl_env_alloc); + +static void cl_env_exit(struct cl_env *cle) +{ + lu_context_exit(&cle->ce_lu.le_ctx); + lu_context_exit(&cle->ce_ses); +} + +/** + * Finalizes and frees a given number of cached environments. This is done to + * (1) free some memory (not currently hooked into VM), or (2) release + * references to modules. + */ +unsigned cl_env_cache_purge(unsigned nr) +{ + struct cl_env *cle; + unsigned i; + + ENTRY; + for_each_possible_cpu(i) { + write_lock(&cl_envs[i].cec_guard); + for (; !list_empty(&cl_envs[i].cec_envs) && nr > 0; --nr) { + cle = container_of(cl_envs[i].cec_envs.next, + struct cl_env, ce_linkage); + list_del_init(&cle->ce_linkage); + LASSERT(cl_envs[i].cec_count > 0); + cl_envs[i].cec_count--; + write_unlock(&cl_envs[i].cec_guard); + + cl_env_fini(cle); + write_lock(&cl_envs[i].cec_guard); + } + LASSERT(equi(cl_envs[i].cec_count == 0, + list_empty(&cl_envs[i].cec_envs))); + write_unlock(&cl_envs[i].cec_guard); + } + RETURN(nr); +} +EXPORT_SYMBOL(cl_env_cache_purge); + +/** + * Release an environment. + * + * Decrement \a env reference counter. When counter drops to 0, nothing in + * this thread is using environment and it is returned to the allocation + * cache, or freed straight away, if cache is large enough. + */ +void cl_env_put(struct lu_env *env, __u16 *refcheck) +{ + struct cl_env *cle; + + cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 0); + LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck)); + + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + if (--cle->ce_ref == 0) { + int cpu = get_cpu(); + + cl_env_dec(CS_busy); + cle->ce_debug = NULL; + cl_env_exit(cle); + /* + * Don't bother to take a lock here. + * + * Return environment to the cache only when it was allocated + * with the standard tags. + */ + if (cl_envs[cpu].cec_count < cl_envs_cached_max && + (env->le_ctx.lc_tags & ~LCT_HAS_EXIT) == lu_context_tags_default && + (env->le_ses->lc_tags & ~LCT_HAS_EXIT) == lu_session_tags_default) { + read_lock(&cl_envs[cpu].cec_guard); + list_add(&cle->ce_linkage, &cl_envs[cpu].cec_envs); + cl_envs[cpu].cec_count++; + read_unlock(&cl_envs[cpu].cec_guard); + } else + cl_env_fini(cle); + put_cpu(); + } +} +EXPORT_SYMBOL(cl_env_put); + +/** + * Converts struct cl_attr to struct ost_lvb. 
+ * + * \see cl_lvb2attr + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) +{ + lvb->lvb_size = attr->cat_size; + lvb->lvb_mtime = attr->cat_mtime; + lvb->lvb_atime = attr->cat_atime; + lvb->lvb_ctime = attr->cat_ctime; + lvb->lvb_blocks = attr->cat_blocks; +} + +/** + * Converts struct ost_lvb to struct cl_attr. + * + * \see cl_attr2lvb + */ +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb) +{ + attr->cat_size = lvb->lvb_size; + attr->cat_mtime = lvb->lvb_mtime; + attr->cat_atime = lvb->lvb_atime; + attr->cat_ctime = lvb->lvb_ctime; + attr->cat_blocks = lvb->lvb_blocks; +} +EXPORT_SYMBOL(cl_lvb2attr); + +static struct cl_env cl_env_percpu[NR_CPUS]; +static DEFINE_MUTEX(cl_env_percpu_mutex); + +static int cl_env_percpu_init(void) +{ + struct cl_env *cle; + int tags = LCT_REMEMBER | LCT_NOREF; + int i, j; + int rc = 0; + + for_each_possible_cpu(i) { + struct lu_env *env; + + rwlock_init(&cl_envs[i].cec_guard); + INIT_LIST_HEAD(&cl_envs[i].cec_envs); + cl_envs[i].cec_count = 0; + + cle = &cl_env_percpu[i]; + env = &cle->ce_lu; + + INIT_LIST_HEAD(&cle->ce_linkage); + cle->ce_magic = &cl_env_init0; + rc = lu_env_init(env, LCT_CL_THREAD | tags); + if (rc == 0) { + rc = lu_context_init(&cle->ce_ses, LCT_SESSION | tags); + if (rc == 0) { + lu_context_enter(&cle->ce_ses); + env->le_ses = &cle->ce_ses; + } else { + lu_env_fini(env); + } + } + if (rc != 0) + break; + } + if (rc != 0) { + /* Indices 0 to i (excluding i) were correctly initialized, + * thus we must uninitialize up to i, the rest are undefined. */ + for (j = 0; j < i; j++) { + cle = &cl_env_percpu[j]; + lu_context_exit(&cle->ce_ses); + lu_context_fini(&cle->ce_ses); + lu_env_fini(&cle->ce_lu); + } + } + + return rc; +} + +static void cl_env_percpu_fini(void) +{ + int i; + + for_each_possible_cpu(i) { + struct cl_env *cle = &cl_env_percpu[i]; + + lu_context_exit(&cle->ce_ses); + lu_context_fini(&cle->ce_ses); + lu_env_fini(&cle->ce_lu); + } +} + +static void cl_env_percpu_refill(void) +{ + int i; + + mutex_lock(&cl_env_percpu_mutex); + for_each_possible_cpu(i) + lu_env_refill(&cl_env_percpu[i].ce_lu); + mutex_unlock(&cl_env_percpu_mutex); +} + +void cl_env_percpu_put(struct lu_env *env) +{ + struct cl_env *cle; + int cpu; + + cpu = smp_processor_id(); + cle = cl_env_container(env); + LASSERT(cle == &cl_env_percpu[cpu]); + + cle->ce_ref--; + LASSERT(cle->ce_ref == 0); + + cl_env_dec(CS_busy); + cle->ce_debug = NULL; + + put_cpu(); +} +EXPORT_SYMBOL(cl_env_percpu_put); + +struct lu_env *cl_env_percpu_get(void) +{ + struct cl_env *cle; + + cle = &cl_env_percpu[get_cpu()]; + cl_env_init0(cle, __builtin_return_address(0)); + + return &cle->ce_lu; +} +EXPORT_SYMBOL(cl_env_percpu_get); + +/***************************************************************************** + * + * Temporary prototype thing: mirror obd-devices into cl devices. 
+ * + */ + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next) +{ + const char *typename; + struct lu_device *d; + + LASSERT(ldt != NULL); + + typename = ldt->ldt_name; + d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL); + if (!IS_ERR(d)) { + int rc; + + if (site != NULL) + d->ld_site = site; + rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next); + if (rc == 0) { + lu_device_get(d); + lu_ref_add(&d->ld_reference, + "lu-stack", &lu_site_init); + } else { + ldt->ldt_ops->ldto_device_free(env, d); + CERROR("can't init device '%s', %d\n", typename, rc); + d = ERR_PTR(rc); + } + } else + CERROR("Cannot allocate device: '%s'\n", typename); + return lu2cl_dev(d); +} +EXPORT_SYMBOL(cl_type_setup); + +/** + * Finalize device stack by calling lu_stack_fini(). + */ +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl) +{ + lu_stack_fini(env, cl2lu_dev(cl)); +} +EXPORT_SYMBOL(cl_stack_fini); + +static struct lu_context_key cl_key; + +struct cl_thread_info *cl_env_info(const struct lu_env *env) +{ + return lu_context_key_get(&env->le_ctx, &cl_key); +} + +/* defines cl_key_{init,fini}() */ +LU_KEY_INIT_FINI(cl, struct cl_thread_info); + +static struct lu_context_key cl_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = cl_key_init, + .lct_fini = cl_key_fini, +}; + +static struct lu_kmem_descr cl_object_caches[] = { + { + .ckd_cache = &cl_env_kmem, + .ckd_name = "cl_env_kmem", + .ckd_size = sizeof(struct cl_env) + }, + { + .ckd_cache = &cl_dio_aio_kmem, + .ckd_name = "cl_dio_aio_kmem", + .ckd_size = sizeof(struct cl_dio_aio) + }, + { + .ckd_cache = &cl_sub_dio_kmem, + .ckd_name = "cl_sub_dio_kmem", + .ckd_size = sizeof(struct cl_sub_dio) + }, + { + .ckd_cache = NULL + } +}; + +/** + * Global initialization of cl-data. Create kmem caches, register + * lu_context_key's, etc. + * + * \see cl_global_fini() + */ +int cl_global_init(void) +{ + int result; + + OBD_ALLOC_PTR_ARRAY(cl_envs, num_possible_cpus()); + if (cl_envs == NULL) + GOTO(out, result = -ENOMEM); + + result = lu_kmem_init(cl_object_caches); + if (result) + GOTO(out_envs, result); + + LU_CONTEXT_KEY_INIT(&cl_key); + result = lu_context_key_register(&cl_key); + if (result) + GOTO(out_kmem, result); + + result = cl_env_percpu_init(); + if (result) /* no cl_env_percpu_fini on error */ + GOTO(out_keys, result); + + return 0; + +out_keys: + lu_context_key_degister(&cl_key); +out_kmem: + lu_kmem_fini(cl_object_caches); +out_envs: + OBD_FREE_PTR_ARRAY(cl_envs, num_possible_cpus()); +out: + return result; +} + +/** + * Finalization of global cl-data. Dual to cl_global_init(). + */ +void cl_global_fini(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(cl_page_kmem_array); i++) { + if (cl_page_kmem_array[i]) { + kmem_cache_destroy(cl_page_kmem_array[i]); + cl_page_kmem_array[i] = NULL; + } + } + cl_env_percpu_fini(); + lu_context_key_degister(&cl_key); + lu_kmem_fini(cl_object_caches); + OBD_FREE_PTR_ARRAY(cl_envs, num_possible_cpus()); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c new file mode 100644 index 0000000000000..b573e8da3a1a3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c @@ -0,0 +1,1291 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Client Lustre Page. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +#include +#include "cl_internal.h" + +static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg); +static DEFINE_MUTEX(cl_page_kmem_mutex); + +#ifdef LIBCFS_DEBUG +# define PASSERT(env, page, expr) \ + do { \ + if (unlikely(!(expr))) { \ + CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \ + LASSERT(0); \ + } \ + } while (0) +#else /* !LIBCFS_DEBUG */ +# define PASSERT(env, page, exp) \ + ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) +#endif /* !LIBCFS_DEBUG */ + +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK +# define PINVRNT(env, page, expr) \ + do { \ + if (unlikely(!(expr))) { \ + CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \ + LINVRNT(0); \ + } \ + } while (0) +#else /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ +# define PINVRNT(env, page, exp) \ + ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) +#endif /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ + +/* Disable page statistic by default due to huge performance penalty. */ +static void cs_page_inc(const struct cl_object *obj, + enum cache_stats_item item) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_inc(&cl_object_site(obj)->cs_pages.cs_stats[item]); +#endif +} + +static void cs_page_dec(const struct cl_object *obj, + enum cache_stats_item item) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_dec(&cl_object_site(obj)->cs_pages.cs_stats[item]); +#endif +} + +static void cs_pagestate_inc(const struct cl_object *obj, + enum cl_page_state state) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_inc(&cl_object_site(obj)->cs_pages_state[state]); +#endif +} + +static void cs_pagestate_dec(const struct cl_object *obj, + enum cl_page_state state) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_dec(&cl_object_site(obj)->cs_pages_state[state]); +#endif +} + +/** + * Internal version of cl_page_get(). + * + * This function can be used to obtain initial reference to previously + * unreferenced cached object. It can be called only if concurrent page + * reclamation is somehow prevented, e.g., by keeping a lock on a VM page, + * associated with \a page. + * + * Use with care! Not exported. 
+ */ +static void cl_page_get_trust(struct cl_page *page) +{ + LASSERT(atomic_read(&page->cp_ref) > 0); + atomic_inc(&page->cp_ref); +} + +static struct cl_page_slice * +cl_page_slice_get(const struct cl_page *cl_page, int index) +{ + if (index < 0 || index >= cl_page->cp_layer_count) + return NULL; + + /* To get the cp_layer_offset values fit under 256 bytes, we + * use the offset beyond the end of struct cl_page. + */ + return (struct cl_page_slice *)((char *)cl_page + sizeof(*cl_page) + + cl_page->cp_layer_offset[index]); +} + +#define cl_page_slice_for_each(cl_page, slice, i) \ + for (i = 0, slice = cl_page_slice_get(cl_page, 0); \ + i < (cl_page)->cp_layer_count; \ + slice = cl_page_slice_get(cl_page, ++i)) + +#define cl_page_slice_for_each_reverse(cl_page, slice, i) \ + for (i = (cl_page)->cp_layer_count - 1, \ + slice = cl_page_slice_get(cl_page, i); i >= 0; \ + slice = cl_page_slice_get(cl_page, --i)) + +/** + * Returns a slice within a cl_page, corresponding to the given layer in the + * device stack. + * + * \see cl_lock_at() + */ +static const struct cl_page_slice * +cl_page_at_trusted(const struct cl_page *cl_page, + const struct lu_device_type *dtype) +{ + const struct cl_page_slice *slice; + int i; + + ENTRY; + + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + + RETURN(NULL); +} + +static void __cl_page_free(struct cl_page *cl_page, unsigned short bufsize) +{ + int index = cl_page->cp_kmem_index; + + if (index >= 0) { + LASSERT(index < ARRAY_SIZE(cl_page_kmem_array)); + LASSERT(cl_page_kmem_size_array[index] == bufsize); + OBD_SLAB_FREE(cl_page, cl_page_kmem_array[index], bufsize); + } else { + OBD_FREE(cl_page, bufsize); + } +} + +static void cl_page_free(const struct lu_env *env, struct cl_page *cl_page, + struct pagevec *pvec) +{ + struct cl_object *obj = cl_page->cp_obj; + unsigned short bufsize = cl_object_header(obj)->coh_page_bufsize; + struct cl_page_slice *slice; + int i; + + ENTRY; + PASSERT(env, cl_page, list_empty(&cl_page->cp_batch)); + PASSERT(env, cl_page, cl_page->cp_owner == NULL); + PASSERT(env, cl_page, cl_page->cp_state == CPS_FREEING); + + cl_page_slice_for_each(cl_page, slice, i) { + if (unlikely(slice->cpl_ops->cpo_fini != NULL)) + slice->cpl_ops->cpo_fini(env, slice, pvec); + } + cl_page->cp_layer_count = 0; + cs_page_dec(obj, CS_total); + cs_pagestate_dec(obj, cl_page->cp_state); + lu_object_ref_del_at(&obj->co_lu, &cl_page->cp_obj_ref, + "cl_page", cl_page); + if (cl_page->cp_type != CPT_TRANSIENT) + cl_object_put(env, obj); + lu_ref_fini(&cl_page->cp_reference); + __cl_page_free(cl_page, bufsize); + EXIT; +} + +static struct cl_page *__cl_page_alloc(struct cl_object *o) +{ + int i = 0; + struct cl_page *cl_page = NULL; + unsigned short bufsize = cl_object_header(o)->coh_page_bufsize; + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_PAGE_ALLOC)) + return NULL; + +check: + /* the number of entries in cl_page_kmem_array is expected to + * only be 2-3 entries, so the lookup overhead should be low. 
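+ *
+ * The lookup itself is lock-free: smp_load_acquire() here pairs with the
+ * smp_store_release() done under cl_page_kmem_mutex when a new cache is
+ * published.  If no cache of the right bufsize exists yet, one is created;
+ * if the array is already full, we fall back to a plain allocation with
+ * cp_kmem_index set to -1.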
+ */ + for ( ; i < ARRAY_SIZE(cl_page_kmem_array); i++) { + if (smp_load_acquire(&cl_page_kmem_size_array[i]) + == bufsize) { + OBD_SLAB_ALLOC_GFP(cl_page, cl_page_kmem_array[i], + bufsize, GFP_NOFS); + if (cl_page) + cl_page->cp_kmem_index = i; + return cl_page; + } + if (cl_page_kmem_size_array[i] == 0) + break; + } + + if (i < ARRAY_SIZE(cl_page_kmem_array)) { + char cache_name[32]; + + mutex_lock(&cl_page_kmem_mutex); + if (cl_page_kmem_size_array[i]) { + mutex_unlock(&cl_page_kmem_mutex); + goto check; + } + snprintf(cache_name, sizeof(cache_name), + "cl_page_kmem-%u", bufsize); + cl_page_kmem_array[i] = + kmem_cache_create(cache_name, bufsize, + 0, 0, NULL); + if (cl_page_kmem_array[i] == NULL) { + mutex_unlock(&cl_page_kmem_mutex); + return NULL; + } + smp_store_release(&cl_page_kmem_size_array[i], + bufsize); + mutex_unlock(&cl_page_kmem_mutex); + goto check; + } else { + OBD_ALLOC_GFP(cl_page, bufsize, GFP_NOFS); + if (cl_page) + cl_page->cp_kmem_index = -1; + } + + return cl_page; +} + +struct cl_page *cl_page_alloc(const struct lu_env *env, struct cl_object *o, + pgoff_t ind, struct page *vmpage, + enum cl_page_type type) +{ + struct cl_page *cl_page; + struct cl_object *head; + + ENTRY; + + cl_page = __cl_page_alloc(o); + if (cl_page != NULL) { + int result = 0; + + /* + * Please fix cl_page:cp_state/type declaration if + * these assertions fail in the future. + */ + BUILD_BUG_ON((1 << CP_STATE_BITS) < CPS_NR); /* cp_state */ + BUILD_BUG_ON((1 << CP_TYPE_BITS) < CPT_NR); /* cp_type */ + atomic_set(&cl_page->cp_ref, 1); + cl_page->cp_obj = o; + if (type != CPT_TRANSIENT) + cl_object_get(o); + lu_object_ref_add_at(&o->co_lu, &cl_page->cp_obj_ref, + "cl_page", cl_page); + cl_page->cp_vmpage = vmpage; + cl_page->cp_state = CPS_CACHED; + cl_page->cp_type = type; + if (type == CPT_TRANSIENT) + /* ref to correct inode will be added + * in ll_direct_rw_pages + */ + cl_page->cp_inode = NULL; + else + cl_page->cp_inode = page2inode(vmpage); + INIT_LIST_HEAD(&cl_page->cp_batch); + lu_ref_init(&cl_page->cp_reference); + head = o; + cl_page->cp_page_index = ind; + cl_object_for_each(o, head) { + if (o->co_ops->coo_page_init != NULL) { + result = o->co_ops->coo_page_init(env, o, + cl_page, ind); + if (result != 0) { + cl_page_delete0(env, cl_page); + cl_page_free(env, cl_page, NULL); + cl_page = ERR_PTR(result); + break; + } + } + } + if (result == 0) { + cs_page_inc(o, CS_total); + cs_page_inc(o, CS_create); + cs_pagestate_dec(o, CPS_CACHED); + } + } else { + cl_page = ERR_PTR(-ENOMEM); + } + RETURN(cl_page); +} + +/** + * Returns a cl_page with index \a idx at the object \a o, and associated with + * the VM page \a vmpage. + * + * This is the main entry point into the cl_page caching interface. First, a + * cache (implemented as a per-object radix tree) is consulted. If page is + * found there, it is returned immediately. Otherwise new page is allocated + * and returned. In any case, additional reference to page is acquired. + * + * \see cl_object_find(), cl_lock_find() + */ +struct cl_page *cl_page_find(const struct lu_env *env, + struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type) +{ + struct cl_page *page = NULL; + struct cl_object_header *hdr; + + LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT); + might_sleep(); + + ENTRY; + + hdr = cl_object_header(o); + cs_page_inc(o, CS_lookup); + + CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n", + idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type); + /* fast path. 
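+	 * For CPT_CACHEABLE pages the locked vmpage's ->private is consulted
+	 * via cl_vmpage_page(), so an existing cl_page can be returned
+	 * without allocating a new one.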
*/ + if (type == CPT_CACHEABLE) { + /* vmpage lock is used to protect the child/parent + * relationship */ + LASSERT(PageLocked(vmpage)); + /* + * cl_vmpage_page() can be called here without any locks as + * + * - "vmpage" is locked (which prevents ->private from + * concurrent updates), and + * + * - "o" cannot be destroyed while current thread holds a + * reference on it. + */ + page = cl_vmpage_page(vmpage, o); + if (page != NULL) { + cs_page_inc(o, CS_hit); + RETURN(page); + } + } + + /* allocate and initialize cl_page */ + page = cl_page_alloc(env, o, idx, vmpage, type); + RETURN(page); +} +EXPORT_SYMBOL(cl_page_find); + +static inline int cl_page_invariant(const struct cl_page *pg) +{ + return cl_page_in_use_noref(pg); +} + +static void cl_page_state_set0(const struct lu_env *env, + struct cl_page *cl_page, + enum cl_page_state state) +{ + enum cl_page_state old; + + /* + * Matrix of allowed state transitions [old][new], for sanity + * checking. + */ + static const int allowed_transitions[CPS_NR][CPS_NR] = { + [CPS_CACHED] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 1, /* io finds existing cached page */ + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 1, /* write-out from the cache */ + [CPS_FREEING] = 1, /* eviction on the memory pressure */ + }, + [CPS_OWNED] = { + [CPS_CACHED] = 1, /* release to the cache */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 1, /* start read immediately */ + [CPS_PAGEOUT] = 1, /* start write immediately */ + [CPS_FREEING] = 1, /* lock invalidation or truncate */ + }, + [CPS_PAGEIN] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_PAGEOUT] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_FREEING] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + } + }; + + ENTRY; + old = cl_page->cp_state; + PASSERT(env, cl_page, allowed_transitions[old][state]); + CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d -> %d\n", old, state); + PASSERT(env, cl_page, cl_page->cp_state == old); + PASSERT(env, cl_page, equi(state == CPS_OWNED, + cl_page->cp_owner != NULL)); + + cs_pagestate_dec(cl_page->cp_obj, cl_page->cp_state); + cs_pagestate_inc(cl_page->cp_obj, state); + cl_page->cp_state = state; + EXIT; +} + +static void cl_page_state_set(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + cl_page_state_set0(env, page, state); +} + +/** + * Acquires an additional reference to a page. + * + * This can be called only by caller already possessing a reference to \a + * page. + * + * \see cl_object_get(), cl_lock_get(). + */ +void cl_page_get(struct cl_page *page) +{ + ENTRY; + cl_page_get_trust(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_get); + +/** + * Releases a reference to a page, use the pagevec to release the pages + * in batch if provided. + * + * Users need to do a final pagevec_release() to release any trailing pages. + */ +void cl_pagevec_put(const struct lu_env *env, struct cl_page *page, + struct pagevec *pvec) +{ + ENTRY; + CL_PAGE_HEADER(D_TRACE, env, page, "%d\n", + atomic_read(&page->cp_ref)); + + if (atomic_dec_and_test(&page->cp_ref)) { + LASSERT(page->cp_state == CPS_FREEING); + + LASSERT(atomic_read(&page->cp_ref) == 0); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, list_empty(&page->cp_batch)); + /* + * Page is no longer reachable by other threads. Tear + * it down. 
+ */ + cl_page_free(env, page, pvec); + } + + EXIT; +} +EXPORT_SYMBOL(cl_pagevec_put); + +/** + * Releases a reference to a page, wrapper to cl_pagevec_put + * + * When last reference is released, page is returned to the cache, unless it + * is in cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * \see cl_object_put(), cl_lock_put(). + */ +void cl_page_put(const struct lu_env *env, struct cl_page *page) +{ + cl_pagevec_put(env, page, NULL); +} +EXPORT_SYMBOL(cl_page_put); + +/** + * Returns a cl_page associated with a VM page, and given cl_object. + */ +struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj) +{ + struct cl_page *page; + + ENTRY; + LASSERT(PageLocked(vmpage)); + + /* + * NOTE: absence of races and liveness of data are guaranteed by page + * lock on a "vmpage". That works because object destruction has + * bottom-to-top pass. + */ + + page = (struct cl_page *)vmpage->private; + if (page != NULL) { + cl_page_get_trust(page); + LASSERT(page->cp_type == CPT_CACHEABLE); + } + RETURN(page); +} +EXPORT_SYMBOL(cl_vmpage_page); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + return cl_page_at_trusted(page, dtype); +} +EXPORT_SYMBOL(cl_page_at); + +static void cl_page_owner_clear(struct cl_page *page) +{ + ENTRY; + if (page->cp_owner != NULL) { + LASSERT(page->cp_owner->ci_owned_nr > 0); + page->cp_owner->ci_owned_nr--; + page->cp_owner = NULL; + } + EXIT; +} + +static void cl_page_owner_set(struct cl_page *page) +{ + ENTRY; + LASSERT(page->cp_owner != NULL); + page->cp_owner->ci_owned_nr++; + EXIT; +} + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *cl_page) +{ + const struct cl_page_slice *slice; + enum cl_page_state state; + int i; + + ENTRY; + state = cl_page->cp_state; + PINVRNT(env, cl_page, state == CPS_OWNED || + state == CPS_FREEING); + PINVRNT(env, cl_page, cl_page_invariant(cl_page) || + state == CPS_FREEING); + cl_page_owner_clear(cl_page); + + if (state == CPS_OWNED) + cl_page_state_set(env, cl_page, CPS_CACHED); + /* + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for VFS/VM interaction runs + * last and can release locks safely. + */ + cl_page_slice_for_each_reverse(cl_page, slice, i) { + if (slice->cpl_ops->cpo_disown != NULL) + (*slice->cpl_ops->cpo_disown)(env, slice, io); + } + + EXIT; +} + +/** + * returns true, iff page is owned by the given io. + */ +int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io) +{ + struct cl_io *top = cl_io_top((struct cl_io *)io); + LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj)); + ENTRY; + RETURN(pg->cp_state == CPS_OWNED && pg->cp_owner == top); +} +EXPORT_SYMBOL(cl_page_is_owned); + +/** + * Try to own a page by IO. + * + * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it + * into cl_page_state::CPS_OWNED state. + * + * \pre !cl_page_is_owned(cl_page, io) + * \post result == 0 iff cl_page_is_owned(cl_page, io) + * + * \retval 0 success + * + * \retval -ve failure, e.g., cl_page was destroyed (and landed in + * cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED). + * or, page was owned by another thread, or in IO. 
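+ *
+ * A minimal caller looks like (sketch):
+ *
+ *	if (cl_page_own(env, io, pg) == 0) {
+ *		... operate on the owned page ...
+ *		cl_page_disown(env, io, pg);
+ *	}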
+ * + * \see cl_page_disown() + * \see cl_page_operations::cpo_own() + * \see cl_page_own_try() + * \see cl_page_own + */ +static int cl_page_own0(const struct lu_env *env, struct cl_io *io, + struct cl_page *cl_page, int nonblock) +{ + const struct cl_page_slice *slice; + int result = 0; + int i; + + ENTRY; + PINVRNT(env, cl_page, !cl_page_is_owned(cl_page, io)); + io = cl_io_top(io); + + if (cl_page->cp_state == CPS_FREEING) { + result = -ENOENT; + goto out; + } + + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->cpo_own) + result = (*slice->cpl_ops->cpo_own)(env, slice, + io, nonblock); + if (result != 0) + break; + } + if (result > 0) + result = 0; + + if (result == 0) { + PASSERT(env, cl_page, cl_page->cp_owner == NULL); + cl_page->cp_owner = cl_io_top(io); + cl_page_owner_set(cl_page); + if (cl_page->cp_state != CPS_FREEING) { + cl_page_state_set(env, cl_page, CPS_OWNED); + } else { + cl_page_disown0(env, io, cl_page); + result = -ENOENT; + } + } + +out: + PINVRNT(env, cl_page, ergo(result == 0, + cl_page_invariant(cl_page))); + RETURN(result); +} + +/** + * Own a page, might be blocked. + * + * \see cl_page_own0() + */ +int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 0); +} +EXPORT_SYMBOL(cl_page_own); + +/** + * Nonblock version of cl_page_own(). + * + * \see cl_page_own0() + */ +int cl_page_own_try(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 1); +} +EXPORT_SYMBOL(cl_page_own_try); + + +/** + * Assume page ownership. + * + * Called when page is already locked by the hosting VM. + * + * \pre !cl_page_is_owned(cl_page, io) + * \post cl_page_is_owned(cl_page, io) + * + * \see cl_page_operations::cpo_assume() + */ +void cl_page_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page *cl_page) +{ + const struct cl_page_slice *slice; + int i; + + ENTRY; + + PINVRNT(env, cl_page, + cl_object_same(cl_page->cp_obj, io->ci_obj)); + io = cl_io_top(io); + + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->cpo_assume != NULL) + (*slice->cpl_ops->cpo_assume)(env, slice, io); + } + + PASSERT(env, cl_page, cl_page->cp_owner == NULL); + cl_page->cp_owner = cl_io_top(io); + cl_page_owner_set(cl_page); + cl_page_state_set(env, cl_page, CPS_OWNED); + EXIT; +} +EXPORT_SYMBOL(cl_page_assume); + +/** + * Releases page ownership without unlocking the page. + * + * Moves cl_page into cl_page_state::CPS_CACHED without releasing a lock + * on the underlying VM page (as VM is supposed to do this itself). + * + * \pre cl_page_is_owned(cl_page, io) + * \post !cl_page_is_owned(cl_page, io) + * + * \see cl_page_assume() + */ +void cl_page_unassume(const struct lu_env *env, + struct cl_io *io, struct cl_page *cl_page) +{ + const struct cl_page_slice *slice; + int i; + + ENTRY; + PINVRNT(env, cl_page, cl_page_is_owned(cl_page, io)); + PINVRNT(env, cl_page, cl_page_invariant(cl_page)); + + io = cl_io_top(io); + cl_page_owner_clear(cl_page); + cl_page_state_set(env, cl_page, CPS_CACHED); + + cl_page_slice_for_each_reverse(cl_page, slice, i) { + if (slice->cpl_ops->cpo_unassume != NULL) + (*slice->cpl_ops->cpo_unassume)(env, slice, io); + } + + EXIT; +} +EXPORT_SYMBOL(cl_page_unassume); + +/** + * Releases page ownership. + * + * Moves page into cl_page_state::CPS_CACHED. 
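+ *
+ * This is the counterpart of cl_page_own()/cl_page_own_try(); compare with
+ * cl_page_unassume(), which pairs with cl_page_assume() and, unlike this
+ * function, leaves the underlying VM page locked.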
+ * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_own() + * \see cl_page_operations::cpo_disown() + */ +void cl_page_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io) || + pg->cp_state == CPS_FREEING); + + ENTRY; + io = cl_io_top(io); + cl_page_disown0(env, io, pg); + EXIT; +} +EXPORT_SYMBOL(cl_page_disown); + +/** + * Called when cl_page is to be removed from the object, e.g., + * as a result of truncate. + * + * Calls cl_page_operations::cpo_discard() top-to-bottom. + * + * \pre cl_page_is_owned(cl_page, io) + * + * \see cl_page_operations::cpo_discard() + */ +void cl_page_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page *cl_page) +{ + const struct cl_page_slice *slice; + int i; + + PINVRNT(env, cl_page, cl_page_is_owned(cl_page, io)); + PINVRNT(env, cl_page, cl_page_invariant(cl_page)); + + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->cpo_discard != NULL) + (*slice->cpl_ops->cpo_discard)(env, slice, io); + } +} +EXPORT_SYMBOL(cl_page_discard); + +/** + * Version of cl_page_delete() that can be called for not fully constructed + * cl_pages, e.g. in an error handling cl_page_find()->cl_page_delete0() + * path. Doesn't check cl_page invariant. + */ +static void cl_page_delete0(const struct lu_env *env, + struct cl_page *cl_page) +{ + const struct cl_page_slice *slice; + int i; + + ENTRY; + + PASSERT(env, cl_page, cl_page->cp_state != CPS_FREEING); + + /* + * Severe all ways to obtain new pointers to @pg. + */ + cl_page_owner_clear(cl_page); + cl_page_state_set0(env, cl_page, CPS_FREEING); + + cl_page_slice_for_each_reverse(cl_page, slice, i) { + if (slice->cpl_ops->cpo_delete != NULL) + (*slice->cpl_ops->cpo_delete)(env, slice); + } + + EXIT; +} + +/** + * Called when a decision is made to throw page out of memory. + * + * Notifies all layers about page destruction by calling + * cl_page_operations::cpo_delete() method top-to-bottom. + * + * Moves page into cl_page_state::CPS_FREEING state (this is the only place + * where transition to this state happens). + * + * Eliminates all venues through which new references to the page can be + * obtained: + * + * - removes page from the radix trees, + * + * - breaks linkage from VM page to cl_page. + * + * Once page reaches cl_page_state::CPS_FREEING, all remaining references will + * drain after some time, at which point page will be recycled. + * + * \pre VM page is locked + * \post pg->cp_state == CPS_FREEING + * + * \see cl_page_operations::cpo_delete() + */ +void cl_page_delete(const struct lu_env *env, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + ENTRY; + cl_page_delete0(env, pg); + EXIT; +} +EXPORT_SYMBOL(cl_page_delete); + +/** + * Marks page up-to-date. + * + * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The + * layer responsible for VM interaction has to mark/clear page as up-to-date + * by the \a uptodate argument. + * + * \see cl_page_operations::cpo_export() + */ +void cl_page_export(const struct lu_env *env, struct cl_page *cl_page, + int uptodate) +{ + const struct cl_page_slice *slice; + int i; + + PINVRNT(env, cl_page, cl_page_invariant(cl_page)); + + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->cpo_export != NULL) + (*slice->cpl_ops->cpo_export)(env, slice, uptodate); + } +} +EXPORT_SYMBOL(cl_page_export); + +/** + * Returns true, if \a page is VM locked in a suitable sense by the calling + * thread. 
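+ *
+ * The underlying cpo_is_vmlocked() method reports its result as -EBUSY
+ * (locked) or -ENODATA (not locked) rather than as a boolean; this helper
+ * maps that convention back to a 0/1 return value.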
+ */ +int cl_page_is_vmlocked(const struct lu_env *env, + const struct cl_page *cl_page) +{ + const struct cl_page_slice *slice; + int result; + + ENTRY; + slice = cl_page_slice_get(cl_page, 0); + PASSERT(env, cl_page, slice->cpl_ops->cpo_is_vmlocked != NULL); + /* + * Call ->cpo_is_vmlocked() directly instead of going through + * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by + * cl_page_invariant(). + */ + result = slice->cpl_ops->cpo_is_vmlocked(env, slice); + PASSERT(env, cl_page, result == -EBUSY || result == -ENODATA); + + RETURN(result == -EBUSY); +} +EXPORT_SYMBOL(cl_page_is_vmlocked); + +void cl_page_touch(const struct lu_env *env, + const struct cl_page *cl_page, size_t to) +{ + const struct cl_page_slice *slice; + int i; + + ENTRY; + + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->cpo_page_touch != NULL) + (*slice->cpl_ops->cpo_page_touch)(env, slice, to); + } + + EXIT; +} +EXPORT_SYMBOL(cl_page_touch); + +static enum cl_page_state cl_req_type_state(enum cl_req_type crt) +{ + ENTRY; + RETURN(crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN); +} + +static void cl_page_io_start(const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt) +{ + /* + * Page is queued for IO, change its state. + */ + ENTRY; + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, cl_req_type_state(crt)); + EXIT; +} + +/** + * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is + * called top-to-bottom. Every layer either agrees to submit this page (by + * returning 0), or requests to omit this page (by returning -EALREADY). Layer + * handling interactions with the VM also has to inform VM that page is under + * transfer now. + */ +int cl_page_prep(const struct lu_env *env, struct cl_io *io, + struct cl_page *cl_page, enum cl_req_type crt) +{ + const struct cl_page_slice *slice; + int result = 0; + int i; + + PINVRNT(env, cl_page, cl_page_is_owned(cl_page, io)); + PINVRNT(env, cl_page, cl_page_invariant(cl_page)); + PINVRNT(env, cl_page, crt < CRT_NR); + + /* + * this has to be called bottom-to-top, so that llite can set up + * PG_writeback without risking other layers deciding to skip this + * page. + */ + if (crt >= CRT_NR) + return -EINVAL; + + if (cl_page->cp_type != CPT_TRANSIENT) { + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->cpo_own) + result = + (*slice->cpl_ops->io[crt].cpo_prep)(env, + slice, + io); + if (result != 0) + break; + } + } + + if (result >= 0) { + result = 0; + cl_page_io_start(env, cl_page, crt); + } + + CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d %d\n", crt, result); + return result; +} +EXPORT_SYMBOL(cl_page_prep); + +/** + * Notify layers about transfer completion. + * + * Invoked by transfer sub-system (which is a part of osc) to notify layers + * that a transfer, of which this page is a part of has completed. + * + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for the VFS/VM interaction runs last + * and can release locks safely. 
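+ *
+ * If the page was submitted with a cl_sync_io anchor attached
+ * (cl_page::cp_sync_io), the anchor is notified through cl_sync_io_note()
+ * after the per-layer completion methods have run, waking up any
+ * synchronous waiters of the transfer.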
+ * + * \pre cl_page->cp_state == CPS_PAGEIN || cl_page->cp_state == CPS_PAGEOUT + * \post cl_page->cl_page_state == CPS_CACHED + * + * \see cl_page_operations::cpo_completion() + */ +void cl_page_completion(const struct lu_env *env, + struct cl_page *cl_page, enum cl_req_type crt, + int ioret) +{ + const struct cl_page_slice *slice; + struct cl_sync_io *anchor = cl_page->cp_sync_io; + int i; + + ENTRY; + PASSERT(env, cl_page, crt < CRT_NR); + PASSERT(env, cl_page, cl_page->cp_state == cl_req_type_state(crt)); + + CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d %d\n", crt, ioret); + cl_page_state_set(env, cl_page, CPS_CACHED); + if (crt >= CRT_NR) + return; + + cl_page_slice_for_each_reverse(cl_page, slice, i) { + if (slice->cpl_ops->io[crt].cpo_completion != NULL) + (*slice->cpl_ops->io[crt].cpo_completion)(env, slice, + ioret); + } + + if (anchor != NULL) { + LASSERT(cl_page->cp_sync_io == anchor); + cl_page->cp_sync_io = NULL; + cl_sync_io_note(env, anchor, ioret); + } + EXIT; +} +EXPORT_SYMBOL(cl_page_completion); + +/** + * Notify layers that transfer formation engine decided to yank this page from + * the cache and to make it a part of a transfer. + * + * \pre cl_page->cp_state == CPS_CACHED + * \post cl_page->cp_state == CPS_PAGEIN || cl_page->cp_state == CPS_PAGEOUT + * + * \see cl_page_operations::cpo_make_ready() + */ +int cl_page_make_ready(const struct lu_env *env, struct cl_page *cl_page, + enum cl_req_type crt) +{ + const struct cl_page_slice *slice; + int result = 0; + int i; + + ENTRY; + PINVRNT(env, cl_page, crt < CRT_NR); + if (crt >= CRT_NR) + RETURN(-EINVAL); + + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->io[crt].cpo_make_ready != NULL) + result = (*slice->cpl_ops->io[crt].cpo_make_ready)(env, slice); + if (result != 0) + break; + } + + if (result >= 0) { + result = 0; + PASSERT(env, cl_page, cl_page->cp_state == CPS_CACHED); + cl_page_io_start(env, cl_page, crt); + } + CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d %d\n", crt, result); + + RETURN(result); +} +EXPORT_SYMBOL(cl_page_make_ready); + +/** + * Called if a page is being written back by kernel's intention. + * + * \pre cl_page_is_owned(cl_page, io) + * \post ergo(result == 0, cl_page->cp_state == CPS_PAGEOUT) + * + * \see cl_page_operations::cpo_flush() + */ +int cl_page_flush(const struct lu_env *env, struct cl_io *io, + struct cl_page *cl_page) +{ + const struct cl_page_slice *slice; + int result = 0; + int i; + + ENTRY; + PINVRNT(env, cl_page, cl_page_is_owned(cl_page, io)); + PINVRNT(env, cl_page, cl_page_invariant(cl_page)); + + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->cpo_flush != NULL) + result = (*slice->cpl_ops->cpo_flush)(env, slice, io); + if (result != 0) + break; + } + if (result > 0) + result = 0; + + CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d\n", result); + RETURN(result); +} +EXPORT_SYMBOL(cl_page_flush); + +/** + * Tells transfer engine that only part of a page is to be transmitted. + * + * \see cl_page_operations::cpo_clip() + */ +void cl_page_clip(const struct lu_env *env, struct cl_page *cl_page, + int from, int to) +{ + const struct cl_page_slice *slice; + int i; + + PINVRNT(env, cl_page, cl_page_invariant(cl_page)); + + CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d %d\n", from, to); + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->cpo_clip != NULL) + (*slice->cpl_ops->cpo_clip)(env, slice, from, to); + } +} +EXPORT_SYMBOL(cl_page_clip); + +/** + * Prints human readable representation of \a pg to the \a f. 
+ */ +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_page *pg) +{ + (*printer)(env, cookie, + "page@%p[%d %p %d %d %p]\n", + pg, atomic_read(&pg->cp_ref), pg->cp_obj, + pg->cp_state, pg->cp_type, + pg->cp_owner); +} +EXPORT_SYMBOL(cl_page_header_print); + +/** + * Prints human readable representation of \a cl_page to the \a f. + */ +void cl_page_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_page *cl_page) +{ + const struct cl_page_slice *slice; + int result = 0; + int i; + + cl_page_header_print(env, cookie, printer, cl_page); + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->cpo_print != NULL) + result = (*slice->cpl_ops->cpo_print)(env, slice, + cookie, printer); + if (result != 0) + break; + } + (*printer)(env, cookie, "end page@%p\n", cl_page); +} +EXPORT_SYMBOL(cl_page_print); + +/** + * Converts a byte offset within object \a obj into a page index. + */ +loff_t cl_offset(const struct cl_object *obj, pgoff_t idx) +{ + return (loff_t)idx << PAGE_SHIFT; +} +EXPORT_SYMBOL(cl_offset); + +/** + * Converts a page index into a byte offset within object \a obj. + */ +pgoff_t cl_index(const struct cl_object *obj, loff_t offset) +{ + return offset >> PAGE_SHIFT; +} +EXPORT_SYMBOL(cl_index); + +size_t cl_page_size(const struct cl_object *obj) +{ + return 1UL << PAGE_SHIFT; +} +EXPORT_SYMBOL(cl_page_size); + +/** + * Adds page slice to the compound page. + * + * This is called by cl_object_operations::coo_page_init() methods to add a + * per-layer state to the page. New state is added at the end of + * cl_page::cp_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add() + */ +void cl_page_slice_add(struct cl_page *cl_page, struct cl_page_slice *slice, + struct cl_object *obj, + const struct cl_page_operations *ops) +{ + unsigned int offset = (char *)slice - + ((char *)cl_page + sizeof(*cl_page)); + + ENTRY; + LASSERT(cl_page->cp_layer_count < CP_MAX_LAYER); + LASSERT(offset < (1 << sizeof(cl_page->cp_layer_offset[0]) * 8)); + cl_page->cp_layer_offset[cl_page->cp_layer_count++] = offset; + slice->cpl_obj = obj; + slice->cpl_ops = ops; + slice->cpl_page = cl_page; + + EXIT; +} +EXPORT_SYMBOL(cl_page_slice_add); + +/** + * Allocate and initialize cl_cache, called by ll_init_sbi(). + */ +struct cl_client_cache *cl_cache_init(unsigned long lru_page_max) +{ + struct cl_client_cache *cache = NULL; + + ENTRY; + OBD_ALLOC(cache, sizeof(*cache)); + if (cache == NULL) + RETURN(NULL); + + /* Initialize cache data */ + atomic_set(&cache->ccc_users, 1); + cache->ccc_lru_max = lru_page_max; + atomic_long_set(&cache->ccc_lru_left, lru_page_max); + spin_lock_init(&cache->ccc_lru_lock); + INIT_LIST_HEAD(&cache->ccc_lru); + + /* turn unstable check off by default as it impacts performance */ + cache->ccc_unstable_check = 0; + atomic_long_set(&cache->ccc_unstable_nr, 0); + init_waitqueue_head(&cache->ccc_unstable_waitq); + mutex_init(&cache->ccc_max_cache_mb_lock); + + RETURN(cache); +} +EXPORT_SYMBOL(cl_cache_init); + +/** + * Increase cl_cache refcount + */ +void cl_cache_incref(struct cl_client_cache *cache) +{ + atomic_inc(&cache->ccc_users); +} +EXPORT_SYMBOL(cl_cache_incref); + +/** + * Decrease cl_cache refcount and free the cache if refcount=0. + * Since llite, lov and osc all hold cl_cache refcount, + * the free will not cause race. 
(LU-6173) + */ +void cl_cache_decref(struct cl_client_cache *cache) +{ + if (atomic_dec_and_test(&cache->ccc_users)) + OBD_FREE(cache, sizeof(*cache)); +} +EXPORT_SYMBOL(cl_cache_decref); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c new file mode 100644 index 0000000000000..f0c611827aebb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c @@ -0,0 +1,974 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef HAVE_SERVER_SUPPORT +# include +# include +#endif /* HAVE_SERVER_SUPPORT */ +#include +#include "llog_internal.h" + +#ifdef CONFIG_PROC_FS +static __u64 obd_max_alloc; +#else +__u64 obd_max_alloc; +#endif + +static DEFINE_SPINLOCK(obd_updatemax_lock); + +/* The following are visible and mutable through /proc/sys/lustre/. 
*/ +unsigned int obd_debug_peer_on_timeout; +EXPORT_SYMBOL(obd_debug_peer_on_timeout); +unsigned int obd_dump_on_timeout; +EXPORT_SYMBOL(obd_dump_on_timeout); +unsigned int obd_dump_on_eviction; +EXPORT_SYMBOL(obd_dump_on_eviction); +unsigned int obd_lbug_on_eviction; +EXPORT_SYMBOL(obd_lbug_on_eviction); +unsigned long obd_max_dirty_pages; +EXPORT_SYMBOL(obd_max_dirty_pages); +atomic_long_t obd_dirty_pages; +EXPORT_SYMBOL(obd_dirty_pages); +unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(obd_timeout); +unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(ldlm_timeout); +unsigned int obd_timeout_set; +EXPORT_SYMBOL(obd_timeout_set); +unsigned int ldlm_timeout_set; +EXPORT_SYMBOL(ldlm_timeout_set); +/* bulk transfer timeout, give up after 100s by default */ +unsigned int bulk_timeout = 100; /* seconds */ +EXPORT_SYMBOL(bulk_timeout); +/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */ +unsigned int at_min = 0; +EXPORT_SYMBOL(at_min); +unsigned int at_max = 600; +EXPORT_SYMBOL(at_max); +unsigned int at_history = 600; +EXPORT_SYMBOL(at_history); +int at_early_margin = 5; +EXPORT_SYMBOL(at_early_margin); +int at_extra = 30; +EXPORT_SYMBOL(at_extra); + +#ifdef CONFIG_PROC_FS +struct lprocfs_stats *obd_memory = NULL; +EXPORT_SYMBOL(obd_memory); +#endif + +static int obdclass_oom_handler(struct notifier_block *self, + unsigned long notused, void *nfreed) +{ +#ifdef CONFIG_PROC_FS + /* in bytes */ + pr_info("obd_memory max: %llu, obd_memory current: %llu\n", + obd_memory_max(), obd_memory_sum()); +#endif /* CONFIG_PROC_FS */ + + return NOTIFY_OK; +} + +static struct notifier_block obdclass_oom = { + .notifier_call = obdclass_oom_handler +}; + +static int class_resolve_dev_name(__u32 len, const char *name) +{ + int rc; + int dev; + + ENTRY; + if (!len || !name) { + CERROR("No name passed,!\n"); + GOTO(out, rc = -EINVAL); + } + if (name[len - 1] != 0) { + CERROR("Name not nul terminated!\n"); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s\n", name); + dev = class_name2dev(name); + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for name %s!\n", name); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev); + rc = dev; + +out: + RETURN(rc); +} + +#define OBD_MAX_IOCTL_BUFFER 8192 + +static int obd_ioctl_is_invalid(struct obd_ioctl_data *data) +{ + const int maxlen = 1 << 30; + if (data->ioc_len > maxlen) { + CERROR("OBD ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen1 > maxlen) { + CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen2 > maxlen) { + CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen3 > maxlen) { + CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen4 > maxlen) { + CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inlbuf1 && data->ioc_inllen1 == 0) { + CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf2 && data->ioc_inllen2 == 0) { + CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf3 && data->ioc_inllen3 == 0) { + CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf4 && data->ioc_inllen4 == 0) { + CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf1 && data->ioc_plen1 == 0) { + CERROR("OBD 
ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf2 && data->ioc_plen2 == 0) { + CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + + if (!data->ioc_pbuf1 && data->ioc_plen1 != 0) { + CERROR("OBD ioctl: plen1 set but NULL pointer\n"); + return 1; + } + + if (!data->ioc_pbuf2 && data->ioc_plen2 != 0) { + CERROR("OBD ioctl: plen2 set but NULL pointer\n"); + return 1; + } + + if (obd_ioctl_packlen(data) > data->ioc_len) { + CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", + obd_ioctl_packlen(data), data->ioc_len); + return 1; + } + + return 0; +} + +/* buffer MUST be at least the size of obd_ioctl_hdr */ +int obd_ioctl_getdata(struct obd_ioctl_data **datap, int *len, void __user *arg) +{ + struct obd_ioctl_hdr hdr; + struct obd_ioctl_data *data; + int offset = 0; + + ENTRY; + if (copy_from_user(&hdr, arg, sizeof(hdr))) + RETURN(-EFAULT); + + if (hdr.ioc_version != OBD_IOCTL_VERSION) { + CERROR("Version mismatch kernel (%x) vs application (%x)\n", + OBD_IOCTL_VERSION, hdr.ioc_version); + RETURN(-EINVAL); + } + + if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("User buffer len %d exceeds %d max buffer\n", + hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); + RETURN(-EINVAL); + } + + if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { + CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); + RETURN(-EINVAL); + } + + /* When there are lots of processes calling vmalloc on multi-core + * system, the high lock contention will hurt performance badly, + * obdfilter-survey is an example, which relies on ioctl. So we'd + * better avoid vmalloc on ioctl path. LU-66 + */ + OBD_ALLOC_LARGE(data, hdr.ioc_len); + if (!data) { + CERROR("Cannot allocate control buffer of len %d\n", + hdr.ioc_len); + RETURN(-EINVAL); + } + *len = hdr.ioc_len; + + if (copy_from_user(data, arg, hdr.ioc_len)) { + OBD_FREE_LARGE(data, hdr.ioc_len); + RETURN(-EFAULT); + } + + if (obd_ioctl_is_invalid(data)) { + CERROR("ioctl not correctly formatted\n"); + OBD_FREE_LARGE(data, hdr.ioc_len); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + offset += cfs_size_round(data->ioc_inllen1); + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen2); + } + + if (data->ioc_inllen3) { + data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen3); + } + + if (data->ioc_inllen4) + data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; + + *datap = data; + + RETURN(0); +} +EXPORT_SYMBOL(obd_ioctl_getdata); + +int class_handle_ioctl(unsigned int cmd, unsigned long arg) +{ + struct obd_ioctl_data *data; + struct obd_device *obd = NULL; + int err = 0, len = 0; + + ENTRY; + CDEBUG(D_IOCTL, "cmd = %x\n", cmd); + if (obd_ioctl_getdata(&data, &len, (void __user *)arg)) { + CERROR("OBD ioctl: data error\n"); + RETURN(-EINVAL); + } + + switch (cmd) { + case OBD_IOC_PROCESS_CFG: { + struct lustre_cfg *lcfg; + + if (!data->ioc_plen1 || !data->ioc_pbuf1) { + CERROR("No config buffer passed!\n"); + GOTO(out, err = -EINVAL); + } + OBD_ALLOC(lcfg, data->ioc_plen1); + if (lcfg == NULL) + GOTO(out, err = -ENOMEM); + err = copy_from_user(lcfg, data->ioc_pbuf1, + data->ioc_plen1); + if (!err) + err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1); + if (!err) + err = class_process_config(lcfg); + + OBD_FREE(lcfg, data->ioc_plen1); + GOTO(out, err); + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + case OBD_GET_VERSION: { + static bool warned; + + if 
(!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + GOTO(out, err = -EINVAL); + } + + if (strlen(LUSTRE_VERSION_STRING) + 1 > data->ioc_inllen1) { + CERROR("ioctl buffer too small to hold version\n"); + GOTO(out, err = -EINVAL); + } + + if (!warned) { + warned = true; + CWARN("%s: ioctl(OBD_GET_VERSION) is deprecated, " + "use llapi_get_version_string() and/or relink\n", + current->comm); + } + memcpy(data->ioc_bulk, LUSTRE_VERSION_STRING, + strlen(LUSTRE_VERSION_STRING) + 1); + + if (copy_to_user((void __user *)arg, data, len)) + err = -EFAULT; + GOTO(out, err); + } +#endif + case OBD_IOC_NAME2DEV: { + /* Resolve a device name. This does not change the + * currently selected device. + */ + int dev; + + dev = class_resolve_dev_name(data->ioc_inllen1, + data->ioc_inlbuf1); + data->ioc_dev = dev; + if (dev < 0) + GOTO(out, err = -EINVAL); + + if (copy_to_user((void __user *)arg, data, sizeof(*data))) + err = -EFAULT; + GOTO(out, err); + } + + case OBD_IOC_UUID2DEV: { + /* Resolve a device uuid. This does not change the + * currently selected device. + */ + int dev; + struct obd_uuid uuid; + + if (!data->ioc_inllen1 || !data->ioc_inlbuf1) { + CERROR("No UUID passed!\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) { + CERROR("UUID not NUL terminated!\n"); + GOTO(out, err = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1); + obd_str2uuid(&uuid, data->ioc_inlbuf1); + dev = class_uuid2dev(&uuid); + data->ioc_dev = dev; + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for UUID %s!\n", + data->ioc_inlbuf1); + GOTO(out, err = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1, + dev); + if (copy_to_user((void __user *)arg, data, sizeof(*data))) + err = -EFAULT; + GOTO(out, err); + } + + case OBD_IOC_GETDEVICE: { + int index = data->ioc_count; + char *status, *str; + + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inllen1 < 128) { + CERROR("ioctl buffer too small to hold version\n"); + GOTO(out, err = -EINVAL); + } + + obd = class_num2obd(index); + if (!obd) + GOTO(out, err = -ENOENT); + + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_inactive) + status = "IN"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + + str = (char *)data->ioc_bulk; + snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + + if (copy_to_user((void __user *)arg, data, len)) + err = -EFAULT; + + GOTO(out, err); + } + + } + + if (data->ioc_dev == OBD_DEV_BY_DEVNAME) { + if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL) + GOTO(out, err = -EINVAL); + if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME) + GOTO(out, err = -EINVAL); + obd = class_name2obd(data->ioc_inlbuf4); + } else if (data->ioc_dev < class_devno_max()) { + obd = class_num2obd(data->ioc_dev); + } else { + CERROR("OBD ioctl: No device\n"); + GOTO(out, err = -EINVAL); + } + + if (obd == NULL) { + CERROR("OBD ioctl : No Device %d\n", data->ioc_dev); + GOTO(out, err = -EINVAL); + } + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + + if (!obd->obd_set_up || obd->obd_stopping) { + CERROR("OBD ioctl: device not setup %d \n", data->ioc_dev); + GOTO(out, err = -EINVAL); + } + + err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL); + if (err) + GOTO(out, err); + + if 
(copy_to_user((void __user *)arg, data, len)) + err = -EFAULT; +out: + OBD_FREE_LARGE(data, len); + RETURN(err); +} /* class_handle_ioctl */ + +/* to control /dev/obd */ +static long obd_class_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + + ENTRY; + /* Allow non-root access for some limited ioctls */ + if (!capable(CAP_SYS_ADMIN)) + RETURN(err = -EACCES); + + if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ + RETURN(err = -ENOTTY); + + err = class_handle_ioctl(cmd, (unsigned long)arg); + + RETURN(err); +} + +/* declare character device */ +static const struct file_operations obd_psdev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ +}; + +/* modules setup */ +static struct miscdevice obd_psdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = OBD_DEV_NAME, + .fops = &obd_psdev_fops, +}; + +#define test_string_to_size_err(value, expect, def_unit, __rc) \ +({ \ + u64 __size; \ + int __ret; \ + \ + BUILD_BUG_ON(sizeof(value) >= 23); \ + __ret = sysfs_memparse(value, sizeof(value) - 1, &__size, def_unit); \ + if (__ret != __rc) \ + CERROR("string_helper: parsing '%s' expect rc %d != got %d\n", \ + value, __rc, __ret); \ + else if (!__ret && (u64)expect != __size) \ + CERROR("string_helper: parsing '%s' expect %llu != got %llu\n",\ + value, (u64)expect, __size); \ + __ret; \ +}) +#define test_string_to_size_one(value, expect, def_unit) \ + test_string_to_size_err(value, expect, def_unit, 0) + +static int __init obd_init_checks(void) +{ + __u64 u64val, div64val; + char buf[64]; + int len, ret = 0; + + CDEBUG(D_INFO, "OBD_OBJECT_EOF = %#llx\n", (__u64)OBD_OBJECT_EOF); + + u64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%#llx", u64val); + if (len != 18) { + CERROR("u64 hex wrong length, strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + + div64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EOVERFLOW; + } + if (u64val >> 8 != OBD_OBJECT_EOF >> 8) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EOVERFLOW; + } + if (do_div(div64val, 256) != (u64val & 255)) { + CERROR("do_div(%#llx,256) != %llu\n", u64val, u64val & 255); + ret = -EOVERFLOW; + } + if (u64val >> 8 != div64val) { + CERROR("do_div(%#llx,256) %llu != %llu\n", + u64val, div64val, u64val >> 8); + ret = -EOVERFLOW; + } + len = snprintf(buf, sizeof(buf), "%#llx", u64val); + if (len != 18) { + CERROR("u64 hex wrong length! strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%llu", u64val); + if (len != 20) { + CERROR("u64 wrong length! strlen(%s)=%d != 20\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%lld", u64val); + if (len != 2) { + CERROR("s64 wrong length! 
strlen(%s)=%d != 2\n", buf, len); + ret = -EINVAL; + } + if ((u64val & ~PAGE_MASK) >= PAGE_SIZE) { + CERROR("mask failed: u64val %llu >= %llu\n", u64val, + (__u64)PAGE_SIZE); + ret = -EINVAL; + } + if (ret) + RETURN(ret); + + /* invalid string */ + if (!test_string_to_size_err("256B34", 256, "B", -EINVAL)) { + CERROR("string_helpers: format should be number then units\n"); + ret = -EINVAL; + } + if (!test_string_to_size_err("132OpQ", 132, "B", -EINVAL)) { + CERROR("string_helpers: invalid units should be rejected\n"); + ret = -EINVAL; + } + if (!test_string_to_size_err("1.82B", 1, "B", -EINVAL)) { + CERROR("string_helpers: 'B' with '.' should be invalid\n"); + ret = -EINVAL; + } + if (test_string_to_size_one("343\n", 343, "B")) { + CERROR("string_helpers: should ignore newline\n"); + ret = -EINVAL; + } + if (ret) + RETURN(ret); + + /* memparse unit handling */ + ret = 0; + ret += test_string_to_size_one("0B", 0, "B"); + ret += test_string_to_size_one("512B", 512, "B"); + ret += test_string_to_size_one("1.067kB", 1067, "B"); + ret += test_string_to_size_one("1.042KiB", 1067, "B"); + ret += test_string_to_size_one("8", 8388608, "M"); + ret += test_string_to_size_one("65536", 65536, "B"); + ret += test_string_to_size_one("128", 131072, "K"); + ret += test_string_to_size_one("1M", 1048576, "B"); + ret += test_string_to_size_one("0.5T", 549755813888ULL, "T"); + ret += test_string_to_size_one("256.5G", 275414777856ULL, "G"); + if (ret) + RETURN(ret); + + /* string helper values */ + ret += test_string_to_size_one("16", 16777216, "MiB"); + ret += test_string_to_size_one("8.39MB", 8390000, "MiB"); + ret += test_string_to_size_one("8.00MiB", 8388608, "MiB"); + ret += test_string_to_size_one("256GB", 256000000000ULL, "GiB"); + ret += test_string_to_size_one("238.731GiB", 256335459385ULL, "GiB"); + if (ret) + RETURN(ret); + + /* huge values */ + ret += test_string_to_size_one("0.4TB", 400000000000ULL, "TiB"); + ret += test_string_to_size_one("12.5TiB", 13743895347200ULL, "TiB"); + ret += test_string_to_size_one("2PB", 2000000000000000ULL, "PiB"); + ret += test_string_to_size_one("16PiB", 18014398509481984ULL, "PiB"); + if (ret) + RETURN(ret); + + /* huge values should overflow */ + if (!test_string_to_size_err("1000EiB", 0, "EiB", -EOVERFLOW)) { + CERROR("string_helpers: failed to detect binary overflow\n"); + ret = -EINVAL; + } + if (!test_string_to_size_err("1000EB", 0, "EiB", -EOVERFLOW)) { + CERROR("string_helpers: failed to detect decimal overflow\n"); + ret = -EINVAL; + } + + return ret; +} + +static int __init obdclass_init(void) +{ + int err; + + LCONSOLE_INFO("Lustre: Build Version: "LUSTRE_VERSION_STRING"\n"); + + register_oom_notifier(&obdclass_oom); + + libcfs_kkuc_init(); + + err = obd_init_checks(); + if (err) + return err; + +#ifdef CONFIG_PROC_FS + obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM, + LPROCFS_STATS_FLAG_NONE | + LPROCFS_STATS_FLAG_IRQ_SAFE); + if (obd_memory == NULL) { + CERROR("kmalloc of 'obd_memory' failed\n"); + return -ENOMEM; + } + + lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT, + LPROCFS_CNTR_AVGMINMAX, + "memused", "bytes"); +#endif + err = obd_zombie_impexp_init(); + if (err) + goto cleanup_obd_memory; + + err = class_handle_init(); + if (err) + goto cleanup_zombie_impexp; + + err = misc_register(&obd_psdev); + if (err) { + CERROR("cannot register OBD miscdevice: err = %d\n", err); + goto cleanup_class_handle; + } + + /* Default the dirty page cache cap to 1/2 of system memory. 
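+ * (Roughly 8 GiB of dirty data allowed on a 16 GiB client, for example.)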
+ * For clients with less memory, a larger fraction is needed + * for other purposes (mostly for BGL). */ + if (cfs_totalram_pages() <= 512 << (20 - PAGE_SHIFT)) + obd_max_dirty_pages = cfs_totalram_pages() / 4; + else + obd_max_dirty_pages = cfs_totalram_pages() / 2; + + err = obd_init_caches(); + if (err) + goto cleanup_deregister; + + err = class_procfs_init(); + if (err) + goto cleanup_caches; + + err = lu_global_init(); + if (err) + goto cleanup_class_procfs; + + err = cl_global_init(); + if (err != 0) + goto cleanup_lu_global; + + err = llog_info_init(); + if (err) + goto cleanup_cl_global; + +#ifdef HAVE_SERVER_SUPPORT + err = dt_global_init(); + if (err != 0) + goto cleanup_llog_info; + + err = lu_ucred_global_init(); + if (err != 0) + goto cleanup_dt_global; + + err = lustre_tgt_register_fs(); + if (err && err != -EBUSY) { + /* Don't fail if server code also registers "lustre_tgt" */ + CERROR("obdclass: register fstype 'lustre_tgt' failed: rc = %d\n", + err); + goto cleanup_lu_ucred_global; + } +#endif /* HAVE_SERVER_SUPPORT */ + + /* simulate a late OOM situation now to require all + * alloc'ed/initialized resources to be freed + */ + if (OBD_FAIL_CHECK(OBD_FAIL_OBDCLASS_MODULE_LOAD)) { + /* force error to ensure module will be unloaded/cleaned */ + err = -ENOMEM; + goto cleanup_all; + } + return 0; + +cleanup_all: +#ifdef HAVE_SERVER_SUPPORT + /* fake error but filesystem has been registered */ + lustre_tgt_unregister_fs(); + +cleanup_lu_ucred_global: + lu_ucred_global_fini(); + +cleanup_dt_global: + dt_global_fini(); + +cleanup_llog_info: +#endif /* HAVE_SERVER_SUPPORT */ + llog_info_fini(); + +cleanup_cl_global: + cl_global_fini(); + +cleanup_lu_global: + lu_global_fini(); + +cleanup_class_procfs: + class_procfs_clean(); + +cleanup_caches: + obd_cleanup_caches(); + +cleanup_deregister: + misc_deregister(&obd_psdev); + +cleanup_class_handle: + class_handle_cleanup(); + +cleanup_zombie_impexp: + obd_zombie_impexp_stop(); + +cleanup_obd_memory: +#ifdef CONFIG_PROC_FS + lprocfs_free_stats(&obd_memory); +#endif + + unregister_oom_notifier(&obdclass_oom); + return err; +} + +void obd_update_maxusage(void) +{ + __u64 max; + + max = obd_memory_sum(); + + spin_lock(&obd_updatemax_lock); + if (max > obd_max_alloc) + obd_max_alloc = max; + spin_unlock(&obd_updatemax_lock); +} +EXPORT_SYMBOL(obd_update_maxusage); + +#ifdef CONFIG_PROC_FS +__u64 obd_memory_max(void) +{ + __u64 ret; + + obd_update_maxusage(); + spin_lock(&obd_updatemax_lock); + ret = obd_max_alloc; + spin_unlock(&obd_updatemax_lock); + + return ret; +} +#endif /* CONFIG_PROC_FS */ + +static void __exit obdclass_exit(void) +{ +#ifdef CONFIG_PROC_FS + __u64 memory_leaked; + __u64 memory_max; +#endif /* CONFIG_PROC_FS */ + ENTRY; + + misc_deregister(&obd_psdev); +#ifdef HAVE_SERVER_SUPPORT + lustre_tgt_unregister_fs(); + lu_ucred_global_fini(); + dt_global_fini(); +#endif /* HAVE_SERVER_SUPPORT */ + llog_info_fini(); + cl_global_fini(); + lu_global_fini(); + + obd_cleanup_caches(); + + class_procfs_clean(); + + class_handle_cleanup(); + class_del_uuid(NULL); /* Delete all UUIDs. */ + obd_zombie_impexp_stop(); + +#ifdef CONFIG_PROC_FS + memory_leaked = obd_memory_sum(); + memory_max = obd_memory_max(); + + lprocfs_free_stats(&obd_memory); + /* the below message is checked in test-framework.sh check_mem_leak() */ + CDEBUG((memory_leaked) ? 
D_ERROR : D_INFO, + "obd_memory max: %llu, leaked: %llu\n", + memory_max, memory_leaked); +#endif /* CONFIG_PROC_FS */ + + unregister_oom_notifier(&obdclass_oom); + + EXIT; +} + +void obd_heat_clear(struct obd_heat_instance *instance, int count) +{ + ENTRY; + + memset(instance, 0, sizeof(*instance) * count); + RETURN_EXIT; +} +EXPORT_SYMBOL(obd_heat_clear); + +/* + * The file heat is calculated for every time interval period I. The access + * frequency during each period is counted. The file heat is only recalculated + * at the end of a time period. And a percentage of the former file heat is + * lost when recalculated. The recursion formula to calculate the heat of the + * file f is as follow: + * + * Hi+1(f) = (1-P)*Hi(f)+ P*Ci + * + * Where Hi is the heat value in the period between time points i*I and + * (i+1)*I; Ci is the access count in the period; the symbol P refers to the + * weight of Ci. The larger the value the value of P is, the more influence Ci + * has on the file heat. + */ +void obd_heat_decay(struct obd_heat_instance *instance, __u64 time_second, + unsigned int weight, unsigned int period_second) +{ + u64 second; + + ENTRY; + + if (instance->ohi_time_second > time_second) { + obd_heat_clear(instance, 1); + RETURN_EXIT; + } + + if (instance->ohi_time_second == 0) + RETURN_EXIT; + + for (second = instance->ohi_time_second + period_second; + second < time_second; + second += period_second) { + instance->ohi_heat = instance->ohi_heat * + (256 - weight) / 256 + + instance->ohi_count * weight / 256; + instance->ohi_count = 0; + instance->ohi_time_second = second; + } + RETURN_EXIT; +} +EXPORT_SYMBOL(obd_heat_decay); + +__u64 obd_heat_get(struct obd_heat_instance *instance, unsigned int time_second, + unsigned int weight, unsigned int period_second) +{ + ENTRY; + + obd_heat_decay(instance, time_second, weight, period_second); + + if (instance->ohi_count == 0) + RETURN(instance->ohi_heat); + + RETURN(instance->ohi_heat * (256 - weight) / 256 + + instance->ohi_count * weight / 256); +} +EXPORT_SYMBOL(obd_heat_get); + +void obd_heat_add(struct obd_heat_instance *instance, + unsigned int time_second, __u64 count, + unsigned int weight, unsigned int period_second) +{ + ENTRY; + + obd_heat_decay(instance, time_second, weight, period_second); + if (instance->ohi_time_second == 0) { + instance->ohi_time_second = time_second; + instance->ohi_heat = 0; + instance->ohi_count = count; + } else { + instance->ohi_count += count; + } + RETURN_EXIT; +} +EXPORT_SYMBOL(obd_heat_add); + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Class Driver"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(obdclass_init); +module_exit(obdclass_exit); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c new file mode 100644 index 0000000000000..ee17b36c9b337 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c @@ -0,0 +1,1292 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/dt_object.c + * + * Dt Object. + * Generic functions from dt_object.h + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +/* fid_be_to_cpu() */ +#include +#include +#include +#include + +/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */ +LU_KEY_INIT(dt_global, struct dt_thread_info); +LU_KEY_FINI(dt_global, struct dt_thread_info); + +struct lu_context_key dt_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL, + .lct_init = dt_global_key_init, + .lct_fini = dt_global_key_fini +}; + +/* + * no lock is necessary to protect the list, because call-backs + * are added during system startup. Please refer to "struct dt_device". + */ +void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_add(&cb->dtc_linkage, &dev->dd_txn_callbacks); +} +EXPORT_SYMBOL(dt_txn_callback_add); + +void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_del_init(&cb->dtc_linkage); +} +EXPORT_SYMBOL(dt_txn_callback_del); + +int dt_txn_hook_start(const struct lu_env *env, + struct dt_device *dev, struct thandle *th) +{ + int rc = 0; + struct dt_txn_callback *cb; + + if (th->th_local) + return 0; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + struct thandle *dtc_th = th; + + if (cb->dtc_txn_start == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + + /* + * Usually dt_txn_hook_start is called from bottom device, + * and if the thandle has th_top, then we need use top + * thandle for the callback in the top thandle layer + */ + if (th->th_top != NULL) + dtc_th = th->th_top; + + rc = cb->dtc_txn_start(env, dtc_th, cb->dtc_cookie); + if (rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_start); + +int dt_txn_hook_stop(const struct lu_env *env, struct thandle *th) +{ + struct dt_device *dev = th->th_dev; + struct dt_txn_callback *cb; + int rc = 0; + + if (th->th_local) + return 0; + + if (OBD_FAIL_CHECK(OBD_FAIL_DT_TXN_STOP)) + return -EIO; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + struct thandle *dtc_th = th; + + if (cb->dtc_txn_stop == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + + /* + * Usually dt_txn_hook_stop is called from bottom device, + * and if the thandle has th_top, then we need use top + * thandle for the callback in the top thandle layer + */ + if (th->th_top != NULL) + dtc_th = th->th_top; + + rc = cb->dtc_txn_stop(env, dtc_th, cb->dtc_cookie); + if (rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_stop); + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t) +{ + INIT_LIST_HEAD(&dev->dd_txn_callbacks); + return lu_device_init(&dev->dd_lu_dev, t); +} +EXPORT_SYMBOL(dt_device_init); + +void dt_device_fini(struct dt_device *dev) +{ + lu_device_fini(&dev->dd_lu_dev); +} +EXPORT_SYMBOL(dt_device_fini); + +int dt_object_init(struct 
dt_object *obj, + struct lu_object_header *h, struct lu_device *d) + +{ + return lu_object_init(&obj->do_lu, h, d); +} +EXPORT_SYMBOL(dt_object_init); + +void dt_object_fini(struct dt_object *obj) +{ + lu_object_fini(&obj->do_lu); +} +EXPORT_SYMBOL(dt_object_fini); + +int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj) +{ + if (obj->do_index_ops == NULL) + obj->do_ops->do_index_try(env, obj, &dt_directory_features); + return obj->do_index_ops != NULL; +} +EXPORT_SYMBOL(dt_try_as_dir); + +enum dt_format_type dt_mode_to_dft(__u32 mode) +{ + enum dt_format_type result; + + switch (mode & S_IFMT) { + case S_IFDIR: + result = DFT_DIR; + break; + case S_IFREG: + result = DFT_REGULAR; + break; + case S_IFLNK: + result = DFT_SYM; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + result = DFT_NODE; + break; + default: + LASSERTF(0, "invalid mode %o\n", mode); + result = 0; /* Just for satisfying compiler. */ + break; + } + return result; +} +EXPORT_SYMBOL(dt_mode_to_dft); + +/** + * lookup fid for object named \a name in directory \a dir. + */ + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid) +{ + if (dt_try_as_dir(env, dir)) + return dt_lookup(env, dir, (struct dt_rec *)fid, + (const struct dt_key *)name); + return -ENOTDIR; +} +EXPORT_SYMBOL(dt_lookup_dir); + +/* + * this differs from dt_locate by top_dev as parameter + * but not one from lu_site + */ +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, + const struct lu_fid *fid, + struct lu_device *top_dev, + const struct lu_object_conf *conf) +{ + struct lu_object *lo; + struct lu_object *n; + + lo = lu_object_find_at(env, top_dev, fid, conf); + if (IS_ERR(lo)) + return ERR_CAST(lo); + + LASSERT(lo != NULL); + + list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) { + if (n->lo_dev == &dev->dd_lu_dev) + return container_of(n, struct dt_object, do_lu); + } + + lu_object_put(env, lo); + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL(dt_locate_at); + +/** + * find an object named \a entry in given \a dfh->dfh_o directory. + */ +static int dt_find_entry(const struct lu_env *env, const char *entry, + void *data) +{ + struct dt_find_hint *dfh = data; + struct dt_device *dt = dfh->dfh_dt; + struct lu_fid *fid = dfh->dfh_fid; + struct dt_object *obj = dfh->dfh_o; + int rc; + + rc = dt_lookup_dir(env, obj, entry, fid); + dt_object_put(env, obj); + if (rc == 0) { + obj = dt_locate(env, dt, fid); + if (IS_ERR(obj)) + rc = PTR_ERR(obj); + } + dfh->dfh_o = obj; + + return rc; +} + +/** + * Abstract function which parses path name. This function feeds + * path component to \a entry_func. 
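+ *
+ * For example, parsing "a//b/c" invokes \a entry_func with "a", "b" and
+ * "c" in turn; empty components produced by repeated or trailing slashes
+ * are skipped.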
+ */ +int dt_path_parser(const struct lu_env *env, + char *path, dt_entry_func_t entry_func, + void *data) +{ + char *e; + int rc = 0; + + while (1) { + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + rc = entry_func(env, e, data); + if (rc) + break; + } + + return rc; +} + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid) +{ + struct dt_thread_info *info = dt_info(env); + struct dt_find_hint *dfh = &info->dti_dfh; + struct dt_object *obj; + int result; + + + dfh->dfh_dt = dt; + dfh->dfh_fid = fid; + + strlcpy(info->dti_buf, path, sizeof(info->dti_buf)); + + result = dt->dd_ops->dt_root_get(env, dt, fid); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (!IS_ERR(obj)) { + dfh->dfh_o = obj; + result = dt_path_parser(env, info->dti_buf, + dt_find_entry, dfh); + if (result != 0) + obj = ERR_PTR(result); + else + obj = dfh->dfh_o; + } + } else { + obj = ERR_PTR(result); + } + return obj; +} + +static struct dt_object *dt_reg_open(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + const char *name, + struct lu_fid *fid) +{ + struct dt_object *o; + int result; + + result = dt_lookup_dir(env, p, name, fid); + if (result == 0) + o = dt_locate(env, dt, fid); + else + o = ERR_PTR(result); + + return o; +} + +/** + * Open dt object named \a filename from \a dirname directory. + * \param dt dt device + * \param fid on success, object fid is stored in *fid + */ +struct dt_object *dt_store_open(const struct lu_env *env, struct dt_device *dt, + const char *dirname, const char *filename, + struct lu_fid *fid) +{ + struct dt_object *file; + struct dt_object *dir; + + dir = dt_store_resolve(env, dt, dirname, fid); + if (!IS_ERR(dir)) { + file = dt_reg_open(env, dt, dir, filename, fid); + dt_object_put(env, dir); + } else { + file = dir; + } + + return file; +} + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *at) +{ + struct dt_object *dto; + struct thandle *th; + int rc; + + ENTRY; + + dto = dt_locate(env, dt, fid); + if (IS_ERR(dto)) + RETURN(dto); + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + RETURN(dto); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(trans_stop, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(trans_stop, rc); + + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); + + CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); + + rc = dt_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(unlock, rc); + LASSERT(dt_object_exists(dto)); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, dt, th); +out: + if (rc) { + dt_object_put(env, dto); + dto = ERR_PTR(rc); + } + + RETURN(dto); +} +EXPORT_SYMBOL(dt_find_or_create); + +/* dt class init function. */ +int dt_global_init(void) +{ + int result; + + LU_CONTEXT_KEY_INIT(&dt_key); + result = lu_context_key_register(&dt_key); + return result; +} + +void dt_global_fini(void) +{ + lu_context_key_degister(&dt_key); +} + +/** + * Generic read helper. May return an error for partial reads. 
+ * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval real size of data read + * \retval -ve errno on failure + */ +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + return dt->do_body_ops->dbo_read(env, dt, buf, pos); +} +EXPORT_SYMBOL(dt_read); + +/** + * Read structures of fixed size from storage. Unlike dt_read(), using + * dt_record_read() will return an error for partial reads. + * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval 0 on successfully reading full buffer + * \retval -EFAULT on short read + * \retval -ve errno on failure + */ +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + ssize_t size; + + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + + size = dt->do_body_ops->dbo_read(env, dt, buf, pos); + if (size < 0) + return size; + return (size == (ssize_t)buf->lb_len) ? 0 : -EFAULT; +} +EXPORT_SYMBOL(dt_record_read); + +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th) +{ + ssize_t size; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_write); + + size = dt->do_body_ops->dbo_write(env, dt, buf, pos, th); + if (size < 0) + return size; + return (size == (ssize_t)buf->lb_len) ? 
0 : -EFAULT; +} +EXPORT_SYMBOL(dt_record_write); + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + + LASSERT(o); + vbuf.lb_buf = NULL; + vbuf.lb_len = sizeof(dt_obj_version_t); + return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); + +} +EXPORT_SYMBOL(dt_declare_version_set); + +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + + rc = dt_xattr_set(env, o, &vbuf, xname, 0, th); + if (rc < 0) + CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); +} +EXPORT_SYMBOL(dt_version_set); + +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + dt_obj_version_t version; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + rc = dt_xattr_get(env, o, &vbuf, xname); + if (rc != sizeof(version)) { + CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); + version = 0; + } + return version; +} +EXPORT_SYMBOL(dt_version_get); + +/* list of all supported index types */ + +/* directories */ +const struct dt_index_features dt_directory_features; +EXPORT_SYMBOL(dt_directory_features); + +/* scrub iterator */ +const struct dt_index_features dt_otable_features; +EXPORT_SYMBOL(dt_otable_features); + +/* lfsck layout orphan */ +const struct dt_index_features dt_lfsck_layout_orphan_features = { + .dif_flags = 0, + .dif_keysize_min = sizeof(struct lu_fid), + .dif_keysize_max = sizeof(struct lu_fid), + .dif_recsize_min = sizeof(struct lu_orphan_rec_v3), + .dif_recsize_max = sizeof(struct lu_orphan_rec_v3), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_layout_orphan_features); + +/* lfsck layout dangling */ +const struct dt_index_features dt_lfsck_layout_dangling_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(struct lfsck_layout_dangling_key), + .dif_keysize_max = sizeof(struct lfsck_layout_dangling_key), + .dif_recsize_min = sizeof(struct lu_fid), + .dif_recsize_max = sizeof(struct lu_fid), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_layout_dangling_features); + +/* lfsck namespace */ +const struct dt_index_features dt_lfsck_namespace_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(struct lu_fid), + .dif_keysize_max = sizeof(struct lu_fid), + .dif_recsize_min = sizeof(__u8), + .dif_recsize_max = sizeof(__u8), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_namespace_features); + +/* accounting indexes */ +const struct dt_index_features dt_acct_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_recsize_max = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_acct_features); + +/* global quota files */ +const struct dt_index_features dt_quota_glb_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_recsize_max = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_ptrsize = 4 
+};
+EXPORT_SYMBOL(dt_quota_glb_features);
+
+/* slave quota files */
+const struct dt_index_features dt_quota_slv_features = {
+	.dif_flags = DT_IND_UPDATE,
+	/* a different key would have to be used for per-directory quota */
+	.dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */
+	.dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */
+	.dif_recsize_min = sizeof(struct lquota_slv_rec), /* 8 bytes */
+	.dif_recsize_max = sizeof(struct lquota_slv_rec), /* 8 bytes */
+	.dif_ptrsize = 4
+};
+EXPORT_SYMBOL(dt_quota_slv_features);
+
+/* nodemap files, nodemap_rec size asserted in nodemap_storage.c */
+const struct dt_index_features dt_nodemap_features = {
+	.dif_flags = DT_IND_UPDATE,
+	.dif_keysize_min = sizeof(__u64), /* 64-bit nodemap/record id */
+	.dif_keysize_max = sizeof(__u64), /* 64-bit nodemap/record id */
+	.dif_recsize_min = sizeof(union nodemap_rec), /* 32 bytes */
+	.dif_recsize_max = sizeof(union nodemap_rec), /* 32 bytes */
+	.dif_ptrsize = 4
+};
+EXPORT_SYMBOL(dt_nodemap_features);
+
+/*
+ * helper function returning what dt_index_features structure should be used
+ * based on the FID sequence. This is used by OBD_IDX_READ RPC
+ */
+static inline const struct dt_index_features *dt_index_feat_select(__u64 seq,
+								    __u32 mode)
+{
+	if (seq == FID_SEQ_QUOTA_GLB) {
+		/* global quota index */
+		if (!S_ISREG(mode))
+			/* global quota index should be a regular file */
+			return ERR_PTR(-ENOENT);
+		return &dt_quota_glb_features;
+	} else if (seq == FID_SEQ_QUOTA) {
+		/* quota slave index */
+		if (!S_ISREG(mode))
+			/* slave index should be a regular file */
+			return ERR_PTR(-ENOENT);
+		return &dt_quota_slv_features;
+	} else if (seq == FID_SEQ_LAYOUT_RBTREE) {
+		return &dt_lfsck_layout_orphan_features;
+	} else if (seq >= FID_SEQ_NORMAL) {
+		/* object is part of the namespace, verify that it is a
+		 * directory */
+		if (!S_ISDIR(mode))
+			/* sorry, we can only deal with directory */
+			return ERR_PTR(-ENOTDIR);
+		return &dt_directory_features;
+	}
+
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+/*
+ * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ
+ * RPC
+ *
+ * \param env - is the environment passed by the caller
+ * \param lp - is a pointer to the lu_page to fill
+ * \param nob - is the maximum number of bytes that should be copied
+ * \param iops - is the index operation vector associated with the index object
+ * \param it - is a pointer to the current iterator
+ * \param attr - is the index attribute to pass to iops->rec()
+ * \param arg - is a pointer to the idx_info structure
+ */
+static int dt_index_page_build(const struct lu_env *env, union lu_page *lp,
+			       size_t nob, const struct dt_it_ops *iops,
+			       struct dt_it *it, __u32 attr, void *arg)
+{
+	struct idx_info *ii = (struct idx_info *)arg;
+	struct lu_idxpage *lip = &lp->lp_idx;
+	char *entry;
+	__u64 hash;
+	__u16 hashsize = 0;
+	__u16 keysize = 0;
+	__u16 recsize;
+	int rc;
+
+	ENTRY;
+
+	if (nob < LIP_HDR_SIZE)
+		return -EINVAL;
+
+	/* initialize the header of the new container */
+	memset(lip, 0, LIP_HDR_SIZE);
+	lip->lip_magic = LIP_MAGIC;
+	nob -= LIP_HDR_SIZE;
+
+	/* client wants the 64-bit hash value associated with each record */
+	if (!(ii->ii_flags & II_FL_NOHASH))
+		hashsize = sizeof(hash);
+
+	entry = lip->lip_entries;
+	do {
+		/* fetch 64-bit hash value */
+		hash = iops->store(env, it);
+		ii->ii_hash_end = hash;
+
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) {
+			if (lip->lip_nr != 0)
+				GOTO(out, rc = 0);
+		}
+
+		if (!(ii->ii_flags & II_FL_NOKEY)) {
+			keysize = iops->key_size(env, it);
+			if
(!(ii->ii_flags & II_FL_VARKEY) && + keysize != ii->ii_keysize) { + CERROR("keysize mismatch %hu != %hu.\n", + keysize, ii->ii_keysize); + GOTO(out, rc = -EINVAL); + } + } + + /* and finally the record */ + if (ii->ii_flags & II_FL_VARREC) + recsize = iops->rec_size(env, it, attr); + else + recsize = ii->ii_recsize; + + if (nob < hashsize + keysize + recsize) { + if (lip->lip_nr == 0) + GOTO(out, rc = -E2BIG); + GOTO(out, rc = 0); + } + + rc = iops->rec(env, it, + (struct dt_rec *)(entry + hashsize + keysize), + attr); + if (!rc) { + if (hashsize) + memcpy(entry, &hash, hashsize); + if (keysize) { + struct dt_key *key; + + key = iops->key(env, it); + memcpy(entry + hashsize, key, keysize); + } + /* hash/key/record successfully copied! */ + lip->lip_nr++; + if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0)) + ii->ii_hash_start = hash; + entry += hashsize + keysize + recsize; + nob -= hashsize + keysize + recsize; + } else if (rc != -ESTALE) { + GOTO(out, rc); + } + + /* move on to the next record */ + do { + rc = iops->next(env, it); + } while (rc == -ESTALE); + } while (rc == 0); + + GOTO(out, rc); +out: + if (rc >= 0 && lip->lip_nr > 0) + /* one more container */ + ii->ii_count++; + if (rc > 0) + /* no more entries */ + ii->ii_hash_end = II_END_OFF; + return rc; +} + + +/* + * Walk index and fill lu_page containers with key/record pairs + * + * \param env - is the environment passed by the caller + * \param obj - is the index object to parse + * \param rdpg - is the lu_rdpg descriptor associated with the transfer + * \param filler - is the callback function responsible for filling a lu_page + * with key/record pairs in the format wanted by the caller. + * If NULL, uses dt_index_page_build + * \param arg - is an opaq argument passed to the filler function + * + * \retval sum (in bytes) of all filled lu_pages + * \retval -ve errno on failure + */ +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg) +{ + struct dt_it *it; + const struct dt_it_ops *iops; + size_t pageidx, nob, nlupgs = 0; + int rc; + ENTRY; + + LASSERT(rdpg->rp_pages != NULL); + LASSERT(obj->do_index_ops != NULL); + + if (filler == NULL) + filler = dt_index_page_build; + + nob = rdpg->rp_count; + if (nob == 0) + RETURN(-EFAULT); + + /* Iterate through index and fill containers from @rdpg */ + iops = &obj->do_index_ops->dio_it; + LASSERT(iops != NULL); + it = iops->init(env, obj, rdpg->rp_attrs); + if (IS_ERR(it)) + RETURN(PTR_ERR(it)); + + rc = iops->load(env, it, rdpg->rp_hash); + if (rc == 0) { + /* + * Iterator didn't find record with exactly the key requested. + * + * It is currently either + * + * - positioned above record with key less than + * requested---skip it. + * - or not positioned at all (is in IAM_IT_SKEWED + * state)---position it on the next item. + */ + rc = iops->next(env, it); + } else if (rc > 0) { + rc = 0; + } else { + if (rc == -ENODATA) + rc = 0; + GOTO(out, rc); + } + + /* + * Fill containers one after the other. There might be multiple + * containers per physical page. + * + * At this point and across for-loop: + * rc == 0 -> ok, proceed. + * rc > 0 -> end of index. + * rc < 0 -> error. 
+ */ + for (pageidx = 0; rc == 0 && nob > 0; pageidx++) { + union lu_page *lp; + int i; + + LASSERT(pageidx < rdpg->rp_npages); + lp = kmap(rdpg->rp_pages[pageidx]); + + /* fill lu pages */ + for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) { + rc = filler(env, lp, min_t(size_t, nob, LU_PAGE_SIZE), + iops, it, rdpg->rp_attrs, arg); + if (rc < 0) + break; + /* one more lu_page */ + nlupgs++; + if (rc > 0) + /* end of index */ + break; + } + kunmap(rdpg->rp_pages[i]); + } + +out: + iops->put(env, it); + iops->fini(env, it); + + if (rc >= 0) + rc = min_t(size_t, nlupgs * LU_PAGE_SIZE, rdpg->rp_count); + + RETURN(rc); +} +EXPORT_SYMBOL(dt_index_walk); + +/** + * Walk key/record pairs of an index and copy them into 4KB containers to be + * transferred over the network. This is the common handler for OBD_IDX_READ + * RPC processing. + * + * \param env - is the environment passed by the caller + * \param dev - is the dt_device storing the index + * \param ii - is the idx_info structure packed by the client in the + * OBD_IDX_READ request + * \param rdpg - is the lu_rdpg descriptor + * + * \retval on success, return sum (in bytes) of all filled containers + * \retval appropriate error otherwise. + */ +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg) +{ + const struct dt_index_features *feat; + struct dt_object *obj; + int rc; + ENTRY; + + /* + * rp_count shouldn't be null and should be a multiple of the container + * size + */ + if (rdpg->rp_count == 0 || (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0) + RETURN(-EFAULT); + + if (!fid_is_quota(&ii->ii_fid) && !fid_is_layout_rbtree(&ii->ii_fid) && + !fid_is_norm(&ii->ii_fid)) + RETURN(-EOPNOTSUPP); + + /* lookup index object subject to the transfer */ + obj = dt_locate(env, dev, &ii->ii_fid); + if (IS_ERR(obj)) + RETURN(PTR_ERR(obj)); + if (dt_object_exists(obj) == 0) + GOTO(out, rc = -ENOENT); + + /* fetch index features associated with index object */ + feat = dt_index_feat_select(fid_seq(&ii->ii_fid), + lu_object_attr(&obj->do_lu)); + if (IS_ERR(feat)) + GOTO(out, rc = PTR_ERR(feat)); + + /* load index feature if not done already */ + if (obj->do_index_ops == NULL) { + rc = obj->do_ops->do_index_try(env, obj, feat); + if (rc) + GOTO(out, rc); + } + + /* fill ii_flags with supported index features */ + ii->ii_flags &= (II_FL_NOHASH | II_FL_NOKEY | II_FL_VARKEY | + II_FL_VARREC); + + if (!(feat->dif_flags & DT_IND_VARKEY)) + ii->ii_keysize = feat->dif_keysize_max; + + if (!(feat->dif_flags & DT_IND_VARREC)) + ii->ii_recsize = feat->dif_recsize_max; + + if (feat->dif_flags & DT_IND_NONUNQ) + /* key isn't necessarily unique */ + ii->ii_flags |= II_FL_NONUNQ; + + if (!fid_is_layout_rbtree(&ii->ii_fid)) { + dt_read_lock(env, obj, 0); + /* fetch object version before walking the index */ + ii->ii_version = dt_version_get(env, obj); + } + + /* walk the index and fill lu_idxpages with key/record pairs */ + rc = dt_index_walk(env, obj, rdpg, dt_index_page_build, ii); + if (!fid_is_layout_rbtree(&ii->ii_fid)) + dt_read_unlock(env, obj); + + if (rc == 0) { + /* index is empty */ + LASSERT(ii->ii_count == 0); + ii->ii_hash_end = II_END_OFF; + } + + GOTO(out, rc); +out: + dt_object_put(env, obj); + return rc; +} +EXPORT_SYMBOL(dt_index_read); + +#ifdef CONFIG_PROC_FS +int lprocfs_dt_blksize_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) + seq_printf(m, "%u\n", (unsigned) 
osfs.os_bsize); + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_blksize_seq_show); + +int lprocfs_dt_kbytestotal_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_kbytestotal_seq_show); + +int lprocfs_dt_kbytesfree_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_kbytesfree_seq_show); + +int lprocfs_dt_kbytesavail_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_kbytesavail_seq_show); + +int lprocfs_dt_filestotal_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) + seq_printf(m, "%llu\n", osfs.os_files); + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_filestotal_seq_show); + +int lprocfs_dt_filesfree_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) + seq_printf(m, "%llu\n", osfs.os_ffree); + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_filesfree_seq_show); + +#endif /* CONFIG_PROC_FS */ + +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct lu_device *lu = dt2lu_dev(dt); + + if (!lu->ld_obd) + return -ENODEV; + + return sprintf(buf, "%s\n", lu->ld_obd->obd_uuid.uuid); +} +LUSTRE_RO_ATTR(uuid); + +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%u\n", (unsigned) osfs.os_bsize); +} +LUSTRE_RO_ATTR(blocksize); + +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytestotal); + +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} 
+LUSTRE_RO_ATTR(kbytesfree); + +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytesavail); + +static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_files); +} +LUSTRE_RO_ATTR(filestotal); + +static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_ffree); +} +LUSTRE_RO_ATTR(filesfree); + +static const struct attribute *dt_def_attrs[] = { + &lustre_attr_uuid.attr, + &lustre_attr_blocksize.attr, + &lustre_attr_kbytestotal.attr, + &lustre_attr_kbytesfree.attr, + &lustre_attr_kbytesavail.attr, + &lustre_attr_filestotal.attr, + &lustre_attr_filesfree.attr, + NULL, +}; + +static void dt_sysfs_release(struct kobject *kobj) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + + debugfs_remove_recursive(dt->dd_debugfs_entry); + dt->dd_debugfs_entry = NULL; + + complete(&dt->dd_kobj_unregister); +} + +int dt_tunables_fini(struct dt_device *dt) +{ + if (!dt) + return -EINVAL; + + if (dt->dd_def_attrs) + sysfs_remove_files(&dt->dd_kobj, dt->dd_def_attrs); + + kobject_put(&dt->dd_kobj); + wait_for_completion(&dt->dd_kobj_unregister); + + return 0; +} +EXPORT_SYMBOL(dt_tunables_fini); + +int dt_tunables_init(struct dt_device *dt, struct obd_type *type, + const char *name, struct ldebugfs_vars *list) +{ + int rc; + + dt->dd_ktype.sysfs_ops = &lustre_sysfs_ops; + dt->dd_ktype.release = dt_sysfs_release; + + init_completion(&dt->dd_kobj_unregister); + rc = kobject_init_and_add(&dt->dd_kobj, &dt->dd_ktype, &type->typ_kobj, + "%s", name); + if (rc) + return rc; + + dt->dd_def_attrs = dt_def_attrs; + + rc = sysfs_create_files(&dt->dd_kobj, dt->dd_def_attrs); + if (rc) { + kobject_put(&dt->dd_kobj); + return rc; + } + + /* + * No need to register debugfs if no enteries. This allows us to + * choose between using dt_device or obd_device for debugfs. + */ + if (!list) + return rc; + + dt->dd_debugfs_entry = debugfs_create_dir(name, + type->typ_debugfs_entry); + ldebugfs_add_vars(dt->dd_debugfs_entry, list, dt); + + return rc; +} +EXPORT_SYMBOL(dt_tunables_init); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/genops.c b/drivers/staging/lustrefsx/lustre/obdclass/genops.c new file mode 100644 index 0000000000000..d8a689024659d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/genops.c @@ -0,0 +1,2348 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/genops.c + * + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_RWLOCK(obd_dev_lock); +static struct obd_device *obd_devs[MAX_OBD_DEVICES]; + +static struct kmem_cache *obd_device_cachep; +static struct kobj_type class_ktype; +static struct workqueue_struct *zombie_wq; + +static void obd_zombie_export_add(struct obd_export *exp); +static void obd_zombie_import_add(struct obd_import *imp); +static void print_export_data(struct obd_export *exp, + const char *status, int locks, int debug_level); + +static LIST_HEAD(obd_stale_exports); +static DEFINE_SPINLOCK(obd_stale_export_lock); +static atomic_t obd_stale_export_num = ATOMIC_INIT(0); + +/* + * support functions: we could use inter-module communication, but this + * is more portable to other OS's + */ +static struct obd_device *obd_device_alloc(void) +{ + struct obd_device *obd; + + OBD_SLAB_ALLOC_PTR_GFP(obd, obd_device_cachep, GFP_NOFS); + if (obd != NULL) { + obd->obd_magic = OBD_DEVICE_MAGIC; + } + return obd; +} + +static void obd_device_free(struct obd_device *obd) +{ + LASSERT(obd != NULL); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + if (obd->obd_namespace != NULL) { + CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n", + obd, obd->obd_namespace, obd->obd_force); + LBUG(); + } + lu_ref_fini(&obd->obd_reference); + OBD_SLAB_FREE_PTR(obd, obd_device_cachep); +} + +struct obd_type *class_search_type(const char *name) +{ + struct kobject *kobj = kset_find_obj(lustre_kset, name); + + if (kobj && kobj->ktype == &class_ktype) + return container_of(kobj, struct obd_type, typ_kobj); + + kobject_put(kobj); + return NULL; +} +EXPORT_SYMBOL(class_search_type); + +struct obd_type *class_get_type(const char *name) +{ + struct obd_type *type; + + type = class_search_type(name); +#ifdef HAVE_MODULE_LOADING_SUPPORT + if (!type) { + const char *modname = name; + +#ifdef HAVE_SERVER_SUPPORT + if (strcmp(modname, "obdfilter") == 0) + modname = "ofd"; + + if (strcmp(modname, LUSTRE_LWP_NAME) == 0) + modname = LUSTRE_OSP_NAME; + + if (!strncmp(modname, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME))) + modname = LUSTRE_MDT_NAME; +#endif /* HAVE_SERVER_SUPPORT */ + + if (!request_module("%s", modname)) { + CDEBUG(D_INFO, "Loaded module '%s'\n", modname); + type = class_search_type(name); + } else { + LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n", + modname); + } + } +#endif + if (type) { + if (try_module_get(type->typ_dt_ops->o_owner)) { + atomic_inc(&type->typ_refcnt); + /* 
class_search_type() returned a counted reference, + * but we don't need that count any more as + * we have one through typ_refcnt. + */ + kobject_put(&type->typ_kobj); + } else { + kobject_put(&type->typ_kobj); + type = NULL; + } + } + return type; +} + +void class_put_type(struct obd_type *type) +{ + LASSERT(type); + module_put(type->typ_dt_ops->o_owner); + atomic_dec(&type->typ_refcnt); +} + +static void class_sysfs_release(struct kobject *kobj) +{ + struct obd_type *type = container_of(kobj, struct obd_type, typ_kobj); + + debugfs_remove_recursive(type->typ_debugfs_entry); + type->typ_debugfs_entry = NULL; + + if (type->typ_lu) + lu_device_type_fini(type->typ_lu); + +#ifdef CONFIG_PROC_FS + if (type->typ_name && type->typ_procroot) + remove_proc_subtree(type->typ_name, proc_lustre_root); +#endif + OBD_FREE(type, sizeof(*type)); +} + +static struct kobj_type class_ktype = { + .sysfs_ops = &lustre_sysfs_ops, + .release = class_sysfs_release, +}; + +#ifdef HAVE_SERVER_SUPPORT +struct obd_type *class_add_symlinks(const char *name, bool enable_proc) +{ + struct dentry *symlink; + struct obd_type *type; + int rc; + + type = class_search_type(name); + if (type) { + kobject_put(&type->typ_kobj); + return ERR_PTR(-EEXIST); + } + + OBD_ALLOC(type, sizeof(*type)); + if (!type) + return ERR_PTR(-ENOMEM); + + type->typ_kobj.kset = lustre_kset; + rc = kobject_init_and_add(&type->typ_kobj, &class_ktype, + &lustre_kset->kobj, "%s", name); + if (rc) + return ERR_PTR(rc); + + symlink = debugfs_create_dir(name, debugfs_lustre_root); + type->typ_debugfs_entry = symlink; + type->typ_sym_filter = true; + + if (enable_proc) { + type->typ_procroot = lprocfs_register(name, proc_lustre_root, + NULL, NULL); + if (IS_ERR(type->typ_procroot)) { + CERROR("%s: can't create compat proc entry: %d\n", + name, (int)PTR_ERR(type->typ_procroot)); + type->typ_procroot = NULL; + } + } + + return type; +} +EXPORT_SYMBOL(class_add_symlinks); +#endif /* HAVE_SERVER_SUPPORT */ + +#define CLASS_MAX_NAME 1024 + +int class_register_type(const struct obd_ops *dt_ops, + const struct md_ops *md_ops, + bool enable_proc, + const char *name, struct lu_device_type *ldt) +{ + struct obd_type *type; + int rc; + + ENTRY; + /* sanity check */ + LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); + + type = class_search_type(name); + if (type) { +#ifdef HAVE_SERVER_SUPPORT + if (type->typ_sym_filter) + goto dir_exist; +#endif /* HAVE_SERVER_SUPPORT */ + kobject_put(&type->typ_kobj); + CDEBUG(D_IOCTL, "Type %s already registered\n", name); + RETURN(-EEXIST); + } + + OBD_ALLOC(type, sizeof(*type)); + if (type == NULL) + RETURN(-ENOMEM); + + type->typ_lu = ldt ? 
OBD_LU_TYPE_SETUP : NULL; + type->typ_kobj.kset = lustre_kset; + kobject_init(&type->typ_kobj, &class_ktype); +#ifdef HAVE_SERVER_SUPPORT +dir_exist: +#endif /* HAVE_SERVER_SUPPORT */ + + type->typ_dt_ops = dt_ops; + type->typ_md_ops = md_ops; + +#ifdef HAVE_SERVER_SUPPORT + if (type->typ_sym_filter) { + type->typ_sym_filter = false; + kobject_put(&type->typ_kobj); + goto setup_ldt; + } +#endif +#ifdef CONFIG_PROC_FS + if (enable_proc && !type->typ_procroot) { + type->typ_procroot = lprocfs_register(name, + proc_lustre_root, + NULL, type); + if (IS_ERR(type->typ_procroot)) { + rc = PTR_ERR(type->typ_procroot); + type->typ_procroot = NULL; + GOTO(failed, rc); + } + } +#endif + type->typ_debugfs_entry = debugfs_create_dir(name, debugfs_lustre_root); + + rc = kobject_add(&type->typ_kobj, &lustre_kset->kobj, "%s", name); + if (rc) + GOTO(failed, rc); +#ifdef HAVE_SERVER_SUPPORT +setup_ldt: +#endif + if (ldt) { + rc = lu_device_type_init(ldt); + smp_store_release(&type->typ_lu, rc ? NULL : ldt); + wake_up_var(&type->typ_lu); + if (rc) + GOTO(failed, rc); + } + + RETURN(0); + +failed: + kobject_put(&type->typ_kobj); + + RETURN(rc); +} +EXPORT_SYMBOL(class_register_type); + +int class_unregister_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + int rc = 0; + ENTRY; + + if (!type) { + CERROR("unknown obd type\n"); + RETURN(-EINVAL); + } + + if (atomic_read(&type->typ_refcnt)) { + CERROR("type %s has refcount (%d)\n", name, + atomic_read(&type->typ_refcnt)); + /* This is a bad situation, let's make the best of it */ + /* Remove ops, but leave the name for debugging */ + type->typ_dt_ops = NULL; + type->typ_md_ops = NULL; + GOTO(out_put, rc = -EBUSY); + } + + /* Put the final ref */ + kobject_put(&type->typ_kobj); +out_put: + /* Put the ref returned by class_search_type() */ + kobject_put(&type->typ_kobj); + + RETURN(rc); +} /* class_unregister_type */ +EXPORT_SYMBOL(class_unregister_type); + +/** + * Create a new obd device. + * + * Allocate the new obd_device and initialize it. + * + * \param[in] type_name obd device type string. + * \param[in] name obd device name. 
+ * \param[in] uuid obd device UUID + * + * \retval newdev pointer to created obd_device + * \retval ERR_PTR(errno) on error + */ +struct obd_device *class_newdev(const char *type_name, const char *name, + const char *uuid) +{ + struct obd_device *newdev; + struct obd_type *type = NULL; + ENTRY; + + if (strlen(name) >= MAX_OBD_NAME) { + CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME); + RETURN(ERR_PTR(-EINVAL)); + } + + type = class_get_type(type_name); + if (type == NULL){ + CERROR("OBD: unknown type: %s\n", type_name); + RETURN(ERR_PTR(-ENODEV)); + } + + newdev = obd_device_alloc(); + if (newdev == NULL) { + class_put_type(type); + RETURN(ERR_PTR(-ENOMEM)); + } + LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC); + strncpy(newdev->obd_name, name, sizeof(newdev->obd_name) - 1); + newdev->obd_type = type; + newdev->obd_minor = -1; + + rwlock_init(&newdev->obd_pool_lock); + newdev->obd_pool_limit = 0; + newdev->obd_pool_slv = 0; + + INIT_LIST_HEAD(&newdev->obd_exports); + newdev->obd_num_exports = 0; + newdev->obd_grant_check_threshold = 100; + INIT_LIST_HEAD(&newdev->obd_unlinked_exports); + INIT_LIST_HEAD(&newdev->obd_delayed_exports); + INIT_LIST_HEAD(&newdev->obd_exports_timed); + INIT_LIST_HEAD(&newdev->obd_nid_stats); + spin_lock_init(&newdev->obd_nid_lock); + spin_lock_init(&newdev->obd_dev_lock); + mutex_init(&newdev->obd_dev_mutex); + spin_lock_init(&newdev->obd_osfs_lock); + /* newdev->obd_osfs_age must be set to a value in the distant + * past to guarantee a fresh statfs is fetched on mount. */ + newdev->obd_osfs_age = ktime_get_seconds() - 1000; + + /* XXX belongs in setup not attach */ + init_rwsem(&newdev->obd_observer_link_sem); + /* recovery data */ + spin_lock_init(&newdev->obd_recovery_task_lock); + init_waitqueue_head(&newdev->obd_next_transno_waitq); + init_waitqueue_head(&newdev->obd_evict_inprogress_waitq); + INIT_LIST_HEAD(&newdev->obd_req_replay_queue); + INIT_LIST_HEAD(&newdev->obd_lock_replay_queue); + INIT_LIST_HEAD(&newdev->obd_final_req_queue); + INIT_LIST_HEAD(&newdev->obd_evict_list); + INIT_LIST_HEAD(&newdev->obd_lwp_list); + + llog_group_init(&newdev->obd_olg); + /* Detach drops this */ + atomic_set(&newdev->obd_refcount, 1); + lu_ref_init(&newdev->obd_reference); + lu_ref_add(&newdev->obd_reference, "newdev", newdev); + + newdev->obd_conn_inprogress = 0; + + strncpy(newdev->obd_uuid.uuid, uuid, UUID_MAX); + + CDEBUG(D_IOCTL, "Allocate new device %s (%p)\n", + newdev->obd_name, newdev); + + return newdev; +} + +/** + * Free obd device. 
+ *
+ * \param[in] obd obd_device to be freed
+ *
+ * \retval none
+ */
+void class_free_dev(struct obd_device *obd)
+{
+	struct obd_type *obd_type = obd->obd_type;
+
+	LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x "
+		 "!= %08x\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+	LASSERTF(obd->obd_minor == -1 || obd_devs[obd->obd_minor] == obd,
+		 "obd %p != obd_devs[%d] %p\n",
+		 obd, obd->obd_minor, obd_devs[obd->obd_minor]);
+	LASSERTF(atomic_read(&obd->obd_refcount) == 0,
+		 "obd_refcount should be 0, not %d\n",
+		 atomic_read(&obd->obd_refcount));
+	LASSERT(obd_type != NULL);
+
+	CDEBUG(D_INFO, "Release obd device %s obd_type name = %s\n",
+	       obd->obd_name, obd->obd_type->typ_name);
+
+	CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n",
+	       obd->obd_name, obd->obd_uuid.uuid);
+	if (obd->obd_stopping) {
+		int err;
+
+		/* If we're not stopping, we were never set up */
+		err = obd_cleanup(obd);
+		if (err)
+			CERROR("Cleanup %s returned %d\n",
+			       obd->obd_name, err);
+	}
+
+	obd_device_free(obd);
+
+	class_put_type(obd_type);
+}
+
+/**
+ * Unregister obd device.
+ *
+ * Free the slot in obd_devs[] used by \a obd.
+ *
+ * \param[in] obd obd_device to be unregistered
+ *
+ * \retval none
+ */
+void class_unregister_device(struct obd_device *obd)
+{
+	write_lock(&obd_dev_lock);
+	if (obd->obd_minor >= 0) {
+		LASSERT(obd_devs[obd->obd_minor] == obd);
+		obd_devs[obd->obd_minor] = NULL;
+		obd->obd_minor = -1;
+	}
+	write_unlock(&obd_dev_lock);
+}
+
+/**
+ * Register obd device.
+ *
+ * Find a free slot in obd_devs[] and fill it with \a new_obd.
+ *
+ * \param[in] new_obd obd_device to be registered
+ *
+ * \retval 0 success
+ * \retval -EEXIST a device with this name is already registered
+ * \retval -EOVERFLOW obd_devs[] is full
+ */
+int class_register_device(struct obd_device *new_obd)
+{
+	int ret = 0;
+	int i;
+	int new_obd_minor = 0;
+	bool minor_assign = false;
+	bool retried = false;
+
+again:
+	write_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+
+		if (obd != NULL &&
+		    (strcmp(new_obd->obd_name, obd->obd_name) == 0)) {
+
+			if (!retried) {
+				write_unlock(&obd_dev_lock);
+
+				/* the obd_device could be waiting to be
+				 * destroyed by the "obd_zombie_impexp_thread".
+ */ + obd_zombie_barrier(); + retried = true; + goto again; + } + + CERROR("%s: already exists, won't add\n", + obd->obd_name); + /* in case we found a free slot before duplicate */ + minor_assign = false; + ret = -EEXIST; + break; + } + if (!minor_assign && obd == NULL) { + new_obd_minor = i; + minor_assign = true; + } + } + + if (minor_assign) { + new_obd->obd_minor = new_obd_minor; + LASSERTF(obd_devs[new_obd_minor] == NULL, "obd_devs[%d] " + "%p\n", new_obd_minor, obd_devs[new_obd_minor]); + obd_devs[new_obd_minor] = new_obd; + } else { + if (ret == 0) { + ret = -EOVERFLOW; + CERROR("%s: all %u/%u devices used, increase " + "MAX_OBD_DEVICES: rc = %d\n", new_obd->obd_name, + i, class_devno_max(), ret); + } + } + write_unlock(&obd_dev_lock); + + RETURN(ret); +} + +static int class_name2dev_nolock(const char *name) +{ + int i; + + if (!name) + return -1; + + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && strcmp(name, obd->obd_name) == 0) { + /* Make sure we finished attaching before we give + out any references */ + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_attached) { + return i; + } + break; + } + } + + return -1; +} + +int class_name2dev(const char *name) +{ + int i; + + if (!name) + return -1; + + read_lock(&obd_dev_lock); + i = class_name2dev_nolock(name); + read_unlock(&obd_dev_lock); + + return i; +} +EXPORT_SYMBOL(class_name2dev); + +struct obd_device *class_name2obd(const char *name) +{ + int dev = class_name2dev(name); + + if (dev < 0 || dev > class_devno_max()) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_name2obd); + +int class_uuid2dev_nolock(struct obd_uuid *uuid) +{ + int i; + + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) { + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + return i; + } + } + + return -1; +} + +int class_uuid2dev(struct obd_uuid *uuid) +{ + int i; + + read_lock(&obd_dev_lock); + i = class_uuid2dev_nolock(uuid); + read_unlock(&obd_dev_lock); + + return i; +} +EXPORT_SYMBOL(class_uuid2dev); + +struct obd_device *class_uuid2obd(struct obd_uuid *uuid) +{ + int dev = class_uuid2dev(uuid); + if (dev < 0) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_uuid2obd); + +/** + * Get obd device from ::obd_devs[] + * + * \param num [in] array index + * + * \retval NULL if ::obd_devs[\a num] does not contains an obd device + * otherwise return the obd device there. + */ +struct obd_device *class_num2obd(int num) +{ + struct obd_device *obd = NULL; + + if (num < class_devno_max()) { + obd = obd_devs[num]; + if (obd == NULL) + return NULL; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "%p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd->obd_minor == num, + "%p obd_minor %0d != %0d\n", + obd, obd->obd_minor, num); + } + + return obd; +} +EXPORT_SYMBOL(class_num2obd); + +/** + * Find obd in obd_dev[] by name or uuid. + * + * Increment obd's refcount if found. 
+ * + * \param[in] str obd name or uuid + * + * \retval NULL if not found + * \retval target pointer to found obd_device + */ +struct obd_device *class_dev_by_str(const char *str) +{ + struct obd_device *target = NULL; + struct obd_uuid tgtuuid; + int rc; + + obd_str2uuid(&tgtuuid, str); + + read_lock(&obd_dev_lock); + rc = class_uuid2dev_nolock(&tgtuuid); + if (rc < 0) + rc = class_name2dev_nolock(str); + + if (rc >= 0) + target = class_num2obd(rc); + + if (target != NULL) + class_incref(target, "find", current); + read_unlock(&obd_dev_lock); + + RETURN(target); +} +EXPORT_SYMBOL(class_dev_by_str); + +/** + * Get obd devices count. Device in any + * state are counted + * \retval obd device count + */ +int get_devices_count(void) +{ + int index, max_index = class_devno_max(), dev_count = 0; + + read_lock(&obd_dev_lock); + for (index = 0; index <= max_index; index++) { + struct obd_device *obd = class_num2obd(index); + if (obd != NULL) + dev_count++; + } + read_unlock(&obd_dev_lock); + + return dev_count; +} +EXPORT_SYMBOL(get_devices_count); + +void class_obd_list(void) +{ + char *status; + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n", + i, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + } + read_unlock(&obd_dev_lock); +} + +/* Search for a client OBD connected to tgt_uuid. If grp_uuid is + * specified, then only the client with that uuid is returned, + * otherwise any client connected to the tgt is returned. + */ +struct obd_device *class_find_client_obd(struct obd_uuid *tgt_uuid, + const char *type_name, + struct obd_uuid *grp_uuid) +{ + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if ((strncmp(obd->obd_type->typ_name, type_name, + strlen(type_name)) == 0)) { + if (obd_uuid_equals(tgt_uuid, + &obd->u.cli.cl_target_uuid) && + ((grp_uuid)? obd_uuid_equals(grp_uuid, + &obd->obd_uuid) : 1)) { + read_unlock(&obd_dev_lock); + return obd; + } + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_find_client_obd); + +/* Iterate the obd_device list looking devices have grp_uuid. Start + * searching at *next, and if a device is found, the next index to look + * at is saved in *next. If next is NULL, then the first matching device + * will always be returned. + */ +struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid, int *next) +{ + int i; + + if (next == NULL) + i = 0; + else if (*next >= 0 && *next < class_devno_max()) + i = *next; + else + return NULL; + + read_lock(&obd_dev_lock); + for (; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd_uuid_equals(grp_uuid, &obd->obd_uuid)) { + if (next != NULL) + *next = i+1; + read_unlock(&obd_dev_lock); + return obd; + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_devices_in_group); + +/** + * to notify sptlrpc log for \a fsname has changed, let every relevant OBD + * adjust sptlrpc settings accordingly. 
+ */ +int class_notify_sptlrpc_conf(const char *fsname, int namelen) +{ + struct obd_device *obd; + const char *type; + int i, rc = 0, rc2; + + LASSERT(namelen > 0); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + obd = class_num2obd(i); + + if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping) + continue; + + /* only notify mdc, osc, osp, lwp, mdt, ost + * because only these have a -sptlrpc llog */ + type = obd->obd_type->typ_name; + if (strcmp(type, LUSTRE_MDC_NAME) != 0 && + strcmp(type, LUSTRE_OSC_NAME) != 0 && + strcmp(type, LUSTRE_OSP_NAME) != 0 && + strcmp(type, LUSTRE_LWP_NAME) != 0 && + strcmp(type, LUSTRE_MDT_NAME) != 0 && + strcmp(type, LUSTRE_OST_NAME) != 0) + continue; + + if (strncmp(obd->obd_name, fsname, namelen)) + continue; + + class_incref(obd, __FUNCTION__, obd); + read_unlock(&obd_dev_lock); + rc2 = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_SPTLRPC_CONF), + KEY_SPTLRPC_CONF, 0, NULL, NULL); + rc = rc ? rc : rc2; + class_decref(obd, __FUNCTION__, obd); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + return rc; +} +EXPORT_SYMBOL(class_notify_sptlrpc_conf); + +void obd_cleanup_caches(void) +{ + ENTRY; + if (obd_device_cachep) { + kmem_cache_destroy(obd_device_cachep); + obd_device_cachep = NULL; + } + + EXIT; +} + +int obd_init_caches(void) +{ + int rc; + ENTRY; + + LASSERT(obd_device_cachep == NULL); + obd_device_cachep = kmem_cache_create_usercopy("ll_obd_dev_cache", + sizeof(struct obd_device), + 0, 0, 0, sizeof(struct obd_device), NULL); + if (!obd_device_cachep) + GOTO(out, rc = -ENOMEM); + + RETURN(0); +out: + obd_cleanup_caches(); + RETURN(rc); +} + +static const char export_handle_owner[] = "export"; + +/* map connection to client */ +struct obd_export *class_conn2export(struct lustre_handle *conn) +{ + struct obd_export *export; + ENTRY; + + if (!conn) { + CDEBUG(D_CACHE, "looking for null handle\n"); + RETURN(NULL); + } + + if (conn->cookie == -1) { /* this means assign a new connection */ + CDEBUG(D_CACHE, "want a new connection\n"); + RETURN(NULL); + } + + CDEBUG(D_INFO, "looking for export cookie %#llx\n", conn->cookie); + export = class_handle2object(conn->cookie, export_handle_owner); + RETURN(export); +} +EXPORT_SYMBOL(class_conn2export); + +struct obd_device *class_exp2obd(struct obd_export *exp) +{ + if (exp) + return exp->exp_obd; + return NULL; +} +EXPORT_SYMBOL(class_exp2obd); + +struct obd_import *class_exp2cliimp(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} +EXPORT_SYMBOL(class_exp2cliimp); + +/* Export management functions */ +static void class_export_destroy(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + ENTRY; + + LASSERT(refcount_read(&exp->exp_handle.h_ref) == 0); + LASSERT(obd != NULL); + + CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp, + exp->exp_client_uuid.uuid, obd->obd_name); + + /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. 
*/
+	ptlrpc_connection_put(exp->exp_connection);
+
+	LASSERT(list_empty(&exp->exp_outstanding_replies));
+	LASSERT(list_empty(&exp->exp_uncommitted_replies));
+	LASSERT(list_empty(&exp->exp_req_replay_queue));
+	LASSERT(list_empty(&exp->exp_hp_rpcs));
+	obd_destroy_export(exp);
+	/* self export doesn't hold a reference to an obd, although it
+	 * exists until freeing of the obd */
+	if (exp != obd->obd_self_export)
+		class_decref(obd, "export", exp);
+
+	OBD_FREE_PRE(exp, sizeof(*exp), "rcu");
+	kfree_rcu(exp, exp_handle.h_rcu);
+	EXIT;
+}
+
+struct obd_export *class_export_get(struct obd_export *exp)
+{
+	refcount_inc(&exp->exp_handle.h_ref);
+	CDEBUG(D_INFO, "GET export %p refcount=%d\n", exp,
+	       refcount_read(&exp->exp_handle.h_ref));
+	return exp;
+}
+EXPORT_SYMBOL(class_export_get);
+
+void class_export_put(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	LASSERT(refcount_read(&exp->exp_handle.h_ref) > 0);
+	LASSERT(refcount_read(&exp->exp_handle.h_ref) < LI_POISON);
+	CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp,
+	       refcount_read(&exp->exp_handle.h_ref) - 1);
+
+	if (refcount_dec_and_test(&exp->exp_handle.h_ref)) {
+		struct obd_device *obd = exp->exp_obd;
+
+		CDEBUG(D_IOCTL, "final put %p/%s\n",
+		       exp, exp->exp_client_uuid.uuid);
+
+		/* release nid stat reference */
+		lprocfs_exp_cleanup(exp);
+
+		if (exp == obd->obd_self_export) {
+			/* self export should be destroyed without
+			 * zombie thread as it doesn't hold a
+			 * reference to obd and doesn't hold any
+			 * resources */
+			class_export_destroy(exp);
+			/* self export is destroyed, no class
+			 * references exist and it is safe to free
+			 * obd */
+			class_free_dev(obd);
+		} else {
+			LASSERT(!list_empty(&exp->exp_obd_chain));
+			obd_zombie_export_add(exp);
+		}
+
+	}
+}
+EXPORT_SYMBOL(class_export_put);
+
+static void obd_zombie_exp_cull(struct work_struct *ws)
+{
+	struct obd_export *export;
+
+	export = container_of(ws, struct obd_export, exp_zombie_work);
+	class_export_destroy(export);
+}
+
+/* Creates a new export, adds it to the hash table, and returns a
+ * pointer to it. The refcount is 2: one for the hash reference, and
+ * one for the pointer returned by this function.
*/ +struct obd_export *__class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid, bool is_self) +{ + struct obd_export *export; + int rc = 0; + ENTRY; + + OBD_ALLOC_PTR(export); + if (!export) + return ERR_PTR(-ENOMEM); + + export->exp_conn_cnt = 0; + export->exp_lock_hash = NULL; + export->exp_flock_hash = NULL; + /* 2 = class_handle_hash + last */ + refcount_set(&export->exp_handle.h_ref, 2); + atomic_set(&export->exp_rpc_count, 0); + atomic_set(&export->exp_cb_count, 0); + atomic_set(&export->exp_locks_count, 0); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&export->exp_locks_list); + spin_lock_init(&export->exp_locks_list_guard); +#endif + atomic_set(&export->exp_replay_count, 0); + export->exp_obd = obd; + INIT_LIST_HEAD(&export->exp_outstanding_replies); + spin_lock_init(&export->exp_uncommitted_replies_lock); + INIT_LIST_HEAD(&export->exp_uncommitted_replies); + INIT_LIST_HEAD(&export->exp_req_replay_queue); + INIT_HLIST_NODE(&export->exp_handle.h_link); + INIT_LIST_HEAD(&export->exp_hp_rpcs); + INIT_LIST_HEAD(&export->exp_reg_rpcs); + class_handle_hash(&export->exp_handle, export_handle_owner); + export->exp_last_request_time = ktime_get_real_seconds(); + spin_lock_init(&export->exp_lock); + spin_lock_init(&export->exp_rpc_lock); + INIT_HLIST_NODE(&export->exp_gen_hash); + spin_lock_init(&export->exp_bl_list_lock); + INIT_LIST_HEAD(&export->exp_bl_list); + INIT_LIST_HEAD(&export->exp_stale_list); + INIT_WORK(&export->exp_zombie_work, obd_zombie_exp_cull); + + export->exp_sp_peer = LUSTRE_SP_ANY; + export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; + export->exp_client_uuid = *cluuid; + obd_init_export(export); + + at_init(&export->exp_bl_lock_at, obd_timeout, 0); + + spin_lock(&obd->obd_dev_lock); + if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { + /* shouldn't happen, but might race */ + if (obd->obd_stopping) + GOTO(exit_unlock, rc = -ENODEV); + + rc = obd_uuid_add(obd, export); + if (rc != 0) { + LCONSOLE_WARN("%s: denying duplicate export for %s: rc = %d\n", + obd->obd_name, cluuid->uuid, rc); + GOTO(exit_unlock, rc = -EALREADY); + } + } + + if (!is_self) { + class_incref(obd, "export", export); + list_add_tail(&export->exp_obd_chain_timed, + &obd->obd_exports_timed); + list_add(&export->exp_obd_chain, &obd->obd_exports); + obd->obd_num_exports++; + } else { + INIT_LIST_HEAD(&export->exp_obd_chain_timed); + INIT_LIST_HEAD(&export->exp_obd_chain); + } + spin_unlock(&obd->obd_dev_lock); + RETURN(export); + +exit_unlock: + spin_unlock(&obd->obd_dev_lock); + class_handle_unhash(&export->exp_handle); + obd_destroy_export(export); + OBD_FREE_PTR(export); + return ERR_PTR(rc); +} + +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *uuid) +{ + return __class_new_export(obd, uuid, false); +} +EXPORT_SYMBOL(class_new_export); + +struct obd_export *class_new_export_self(struct obd_device *obd, + struct obd_uuid *uuid) +{ + return __class_new_export(obd, uuid, true); +} + +void class_unlink_export(struct obd_export *exp) +{ + class_handle_unhash(&exp->exp_handle); + + if (exp->exp_obd->obd_self_export == exp) { + class_export_put(exp); + return; + } + + spin_lock(&exp->exp_obd->obd_dev_lock); + /* delete an uuid-export hashitem from hashtables */ + if (exp != exp->exp_obd->obd_self_export) + obd_uuid_del(exp->exp_obd, exp); + +#ifdef HAVE_SERVER_SUPPORT + if (!hlist_unhashed(&exp->exp_gen_hash)) { + struct tg_export_data *ted = &exp->exp_target_data; + struct cfs_hash *hash; + + /* Because obd_gen_hash will not be released until + * 
class_cleanup(), so hash should never be NULL here */ + hash = cfs_hash_getref(exp->exp_obd->obd_gen_hash); + LASSERT(hash != NULL); + cfs_hash_del(hash, &ted->ted_lcd->lcd_generation, + &exp->exp_gen_hash); + cfs_hash_putref(hash); + } +#endif /* HAVE_SERVER_SUPPORT */ + + list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports); + list_del_init(&exp->exp_obd_chain_timed); + exp->exp_obd->obd_num_exports--; + spin_unlock(&exp->exp_obd->obd_dev_lock); + atomic_inc(&obd_stale_export_num); + + /* A reference is kept by obd_stale_exports list */ + obd_stale_export_put(exp); +} +EXPORT_SYMBOL(class_unlink_export); + +/* Import management functions */ +static void obd_zombie_import_free(struct obd_import *imp) +{ + struct obd_import_conn *imp_conn; + + ENTRY; + CDEBUG(D_IOCTL, "destroying import %p for %s\n", imp, + imp->imp_obd->obd_name); + + LASSERT(refcount_read(&imp->imp_refcount) == 0); + + ptlrpc_connection_put(imp->imp_connection); + + while ((imp_conn = list_first_entry_or_null(&imp->imp_conn_list, + struct obd_import_conn, + oic_item)) != NULL) { + list_del_init(&imp_conn->oic_item); + ptlrpc_connection_put(imp_conn->oic_conn); + OBD_FREE(imp_conn, sizeof(*imp_conn)); + } + + LASSERT(imp->imp_sec == NULL); + LASSERTF(atomic_read(&imp->imp_reqs) == 0, "%s: imp_reqs = %d\n", + imp->imp_obd->obd_name, atomic_read(&imp->imp_reqs)); + class_decref(imp->imp_obd, "import", imp); + OBD_FREE_PTR(imp); + EXIT; +} + +struct obd_import *class_import_get(struct obd_import *import) +{ + refcount_inc(&import->imp_refcount); + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import, + refcount_read(&import->imp_refcount), + import->imp_obd->obd_name); + return import; +} +EXPORT_SYMBOL(class_import_get); + +void class_import_put(struct obd_import *imp) +{ + ENTRY; + + LASSERT(refcount_read(&imp->imp_refcount) > 0); + + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp, + refcount_read(&imp->imp_refcount) - 1, + imp->imp_obd->obd_name); + + if (refcount_dec_and_test(&imp->imp_refcount)) { + CDEBUG(D_INFO, "final put import %p\n", imp); + obd_zombie_import_add(imp); + } + + EXIT; +} +EXPORT_SYMBOL(class_import_put); + +static void init_imp_at(struct imp_at *at) { + int i; + at_init(&at->iat_net_latency, 0, 0); + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + /* max service estimates are tracked on the server side, so + don't use the AT history here, just use the last reported + val. 
(But keep hist for proc histogram, worst_ever) */ + at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT, + AT_FLG_NOHIST); + } +} + +static void obd_zombie_imp_cull(struct work_struct *ws) +{ + struct obd_import *import; + + import = container_of(ws, struct obd_import, imp_zombie_work); + obd_zombie_import_free(import); +} + +struct obd_import *class_new_import(struct obd_device *obd) +{ + struct obd_import *imp; + struct pid_namespace *curr_pid_ns = ll_task_pid_ns(current); + + OBD_ALLOC(imp, sizeof(*imp)); + if (imp == NULL) + return NULL; + + INIT_LIST_HEAD(&imp->imp_pinger_chain); + INIT_LIST_HEAD(&imp->imp_replay_list); + INIT_LIST_HEAD(&imp->imp_sending_list); + INIT_LIST_HEAD(&imp->imp_delayed_list); + INIT_LIST_HEAD(&imp->imp_committed_list); + INIT_LIST_HEAD(&imp->imp_unreplied_list); + imp->imp_known_replied_xid = 0; + imp->imp_replay_cursor = &imp->imp_committed_list; + spin_lock_init(&imp->imp_lock); + imp->imp_last_success_conn = 0; + imp->imp_state = LUSTRE_IMP_NEW; + imp->imp_obd = class_incref(obd, "import", imp); + rwlock_init(&imp->imp_sec_lock); + init_waitqueue_head(&imp->imp_recovery_waitq); + INIT_WORK(&imp->imp_zombie_work, obd_zombie_imp_cull); + + if (curr_pid_ns && curr_pid_ns->child_reaper) + imp->imp_sec_refpid = curr_pid_ns->child_reaper->pid; + else + imp->imp_sec_refpid = 1; + + refcount_set(&imp->imp_refcount, 2); + atomic_set(&imp->imp_unregistering, 0); + atomic_set(&imp->imp_reqs, 0); + atomic_set(&imp->imp_inflight, 0); + atomic_set(&imp->imp_replay_inflight, 0); + init_waitqueue_head(&imp->imp_replay_waitq); + atomic_set(&imp->imp_inval_count, 0); + atomic_set(&imp->imp_waiting, 0); + INIT_LIST_HEAD(&imp->imp_conn_list); + init_imp_at(&imp->imp_at); + + /* the default magic is V2, will be used in connect RPC, and + * then adjusted according to the flags in request/reply. 
*/ + imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2; + + return imp; +} +EXPORT_SYMBOL(class_new_import); + +void class_destroy_import(struct obd_import *import) +{ + LASSERT(import != NULL); + LASSERT(import != LP_POISON); + + spin_lock(&import->imp_lock); + import->imp_generation++; + spin_unlock(&import->imp_lock); + class_import_put(import); +} +EXPORT_SYMBOL(class_destroy_import); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) +{ + spin_lock(&exp->exp_locks_list_guard); + + LASSERT(lock->l_exp_refs_nr >= 0); + + if (lock->l_exp_refs_target != NULL && + lock->l_exp_refs_target != exp) { + LCONSOLE_WARN("setting export %p for lock %p which already has export %p\n", + exp, lock, lock->l_exp_refs_target); + } + if ((lock->l_exp_refs_nr ++) == 0) { + list_add(&lock->l_exp_refs_link, &exp->exp_locks_list); + lock->l_exp_refs_target = exp; + } + CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", + lock, exp, lock->l_exp_refs_nr); + spin_unlock(&exp->exp_locks_list_guard); +} +EXPORT_SYMBOL(__class_export_add_lock_ref); + +void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) +{ + spin_lock(&exp->exp_locks_list_guard); + LASSERT(lock->l_exp_refs_nr > 0); + if (lock->l_exp_refs_target != exp) { + LCONSOLE_WARN("lock %p, " + "mismatching export pointers: %p, %p\n", + lock, lock->l_exp_refs_target, exp); + } + if (-- lock->l_exp_refs_nr == 0) { + list_del_init(&lock->l_exp_refs_link); + lock->l_exp_refs_target = NULL; + } + CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", + lock, exp, lock->l_exp_refs_nr); + spin_unlock(&exp->exp_locks_list_guard); +} +EXPORT_SYMBOL(__class_export_del_lock_ref); +#endif + +/* A connection defines an export context in which preallocation can + be managed. This releases the export pointer reference, and returns + the export handle, so the export refcount is 1 when this function + returns. 
*/ +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid) +{ + struct obd_export *export; + LASSERT(conn != NULL); + LASSERT(obd != NULL); + LASSERT(cluuid != NULL); + ENTRY; + + export = class_new_export(obd, cluuid); + if (IS_ERR(export)) + RETURN(PTR_ERR(export)); + + conn->cookie = export->exp_handle.h_cookie; + class_export_put(export); + + CDEBUG(D_IOCTL, "connect: client %s, cookie %#llx\n", + cluuid->uuid, conn->cookie); + RETURN(0); +} +EXPORT_SYMBOL(class_connect); + +/* if export is involved in recovery then clean up related things */ +static void class_export_recovery_cleanup(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + + spin_lock(&obd->obd_recovery_task_lock); + if (obd->obd_recovering) { + if (exp->exp_in_recovery) { + spin_lock(&exp->exp_lock); + exp->exp_in_recovery = 0; + spin_unlock(&exp->exp_lock); + LASSERT_ATOMIC_POS(&obd->obd_connected_clients); + atomic_dec(&obd->obd_connected_clients); + } + + /* if called during recovery then should update + * obd_stale_clients counter, + * lightweight exports are not counted */ + if ((exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0) + exp->exp_obd->obd_stale_clients++; + } + spin_unlock(&obd->obd_recovery_task_lock); + + spin_lock(&exp->exp_lock); + /** Cleanup req replay fields */ + if (exp->exp_req_replay_needed) { + exp->exp_req_replay_needed = 0; + + LASSERT(atomic_read(&obd->obd_req_replay_clients)); + atomic_dec(&obd->obd_req_replay_clients); + } + + /** Cleanup lock replay data */ + if (exp->exp_lock_replay_needed) { + exp->exp_lock_replay_needed = 0; + + LASSERT(atomic_read(&obd->obd_lock_replay_clients)); + atomic_dec(&obd->obd_lock_replay_clients); + } + spin_unlock(&exp->exp_lock); +} + +/* This function removes 1-3 references from the export: + * 1 - for export pointer passed + * and if disconnect really need + * 2 - removing from hash + * 3 - in client_unlink_export + * The export pointer passed to this function can destroyed */ +int class_disconnect(struct obd_export *export) +{ + int already_disconnected; + ENTRY; + + if (export == NULL) { + CWARN("attempting to free NULL export %p\n", export); + RETURN(-EINVAL); + } + + spin_lock(&export->exp_lock); + already_disconnected = export->exp_disconnected; + export->exp_disconnected = 1; +#ifdef HAVE_SERVER_SUPPORT + /* We hold references of export for uuid hash + * and nid_hash and export link at least. So + * it is safe to call rh*table_remove_fast in + * there. + */ + obd_nid_del(export->exp_obd, export); +#endif /* HAVE_SERVER_SUPPORT */ + spin_unlock(&export->exp_lock); + + /* class_cleanup(), abort_recovery(), and class_fail_export() + * all end up in here, and if any of them race we shouldn't + * call extra class_export_puts(). 
*/ + if (already_disconnected) + GOTO(no_disconn, already_disconnected); + + CDEBUG(D_IOCTL, "disconnect: cookie %#llx\n", + export->exp_handle.h_cookie); + + class_export_recovery_cleanup(export); + class_unlink_export(export); +no_disconn: + class_export_put(export); + RETURN(0); +} +EXPORT_SYMBOL(class_disconnect); + +/* Return non-zero for a fully connected export */ +int class_connected_export(struct obd_export *exp) +{ + int connected = 0; + + if (exp) { + spin_lock(&exp->exp_lock); + connected = (exp->exp_conn_cnt > 0) && !exp->exp_failed; + spin_unlock(&exp->exp_lock); + } + return connected; +} +EXPORT_SYMBOL(class_connected_export); + +static void class_disconnect_export_list(struct list_head *list, + enum obd_option flags) +{ + int rc; + struct obd_export *exp; + ENTRY; + + /* It's possible that an export may disconnect itself, but + * nothing else will be added to this list. + */ + while ((exp = list_first_entry_or_null(list, struct obd_export, + exp_obd_chain)) != NULL) { + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + spin_lock(&exp->exp_lock); + exp->exp_flags = flags; + spin_unlock(&exp->exp_lock); + + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) { + CDEBUG(D_HA, + "exp %p export uuid == obd uuid, don't discon\n", + exp); + /* Need to delete this now so we don't end up pointing + * to work_list later when this export is cleaned up. */ + list_del_init(&exp->exp_obd_chain); + class_export_put(exp); + continue; + } + + class_export_get(exp); + CDEBUG(D_HA, "%s: disconnecting export at %s (%p), " + "last request at %lld\n", + exp->exp_obd->obd_name, obd_export_nid2str(exp), + exp, exp->exp_last_request_time); + /* release one export reference anyway */ + rc = obd_disconnect(exp); + + CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n", + obd_export_nid2str(exp), exp, rc); + class_export_put(exp); + } + EXIT; +} + +void class_disconnect_exports(struct obd_device *obd) +{ + LIST_HEAD(work_list); + ENTRY; + + /* Move all of the exports from obd_exports to a work list, en masse. */ + spin_lock(&obd->obd_dev_lock); + list_splice_init(&obd->obd_exports, &work_list); + list_splice_init(&obd->obd_delayed_exports, &work_list); + spin_unlock(&obd->obd_dev_lock); + + if (!list_empty(&work_list)) { + CDEBUG(D_HA, "OBD device %d (%p) has exports, " + "disconnecting them\n", obd->obd_minor, obd); + class_disconnect_export_list(&work_list, + exp_flags_from_obd(obd)); + } else + CDEBUG(D_HA, "OBD device %d (%p) has no exports\n", + obd->obd_minor, obd); + EXIT; +} +EXPORT_SYMBOL(class_disconnect_exports); + +/* Remove exports that have not completed recovery. + */ +void class_disconnect_stale_exports(struct obd_device *obd, + int (*test_export)(struct obd_export *)) +{ + LIST_HEAD(work_list); + struct obd_export *exp, *n; + int evicted = 0; + ENTRY; + + spin_lock(&obd->obd_dev_lock); + list_for_each_entry_safe(exp, n, &obd->obd_exports, + exp_obd_chain) { + /* don't count self-export as client */ + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) + continue; + + /* don't evict clients which have no slot in last_rcvd + * (e.g. 
lightweight connection) */ + if (exp->exp_target_data.ted_lr_idx == -1) + continue; + + spin_lock(&exp->exp_lock); + if (exp->exp_failed || test_export(exp)) { + spin_unlock(&exp->exp_lock); + continue; + } + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + list_move(&exp->exp_obd_chain, &work_list); + evicted++; + CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", + obd->obd_name, exp->exp_client_uuid.uuid, + obd_export_nid2str(exp)); + print_export_data(exp, "EVICTING", 0, D_HA); + } + spin_unlock(&obd->obd_dev_lock); + + if (evicted) + LCONSOLE_WARN("%s: disconnecting %d stale clients\n", + obd->obd_name, evicted); + + class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); + EXIT; +} +EXPORT_SYMBOL(class_disconnect_stale_exports); + +void class_fail_export(struct obd_export *exp) +{ + int rc, already_failed; + + spin_lock(&exp->exp_lock); + already_failed = exp->exp_failed; + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + if (already_failed) { + CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n", + exp, exp->exp_client_uuid.uuid); + return; + } + + CDEBUG(D_HA, "disconnecting export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + /* Most callers into obd_disconnect are removing their own reference + * (request, for example) in addition to the one from the hash table. + * We don't have such a reference here, so make one. */ + class_export_get(exp); + rc = obd_disconnect(exp); + if (rc) + CERROR("disconnecting export %p failed: %d\n", exp, rc); + else + CDEBUG(D_HA, "disconnected export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + class_export_put(exp); +} +EXPORT_SYMBOL(class_fail_export); + +#ifdef HAVE_SERVER_SUPPORT + +static int take_first(struct obd_export *exp, void *data) +{ + struct obd_export **expp = data; + + if (*expp) + /* already have one */ + return 0; + if (exp->exp_failed) + /* Don't want this one */ + return 0; + if (!refcount_inc_not_zero(&exp->exp_handle.h_ref)) + /* Cannot get a ref on this one */ + return 0; + *expp = exp; + return 1; +} + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid) +{ + struct lnet_nid nid_key; + struct obd_export *doomed_exp; + int exports_evicted = 0; + + libcfs_strnid(&nid_key, nid); + + spin_lock(&obd->obd_dev_lock); + /* umount has run already, so evict thread should leave + * its task to umount thread now */ + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + spin_unlock(&obd->obd_dev_lock); + + doomed_exp = NULL; + while (obd_nid_export_for_each(obd, &nid_key, + take_first, &doomed_exp) > 0) { + + LASSERTF(doomed_exp != obd->obd_self_export, + "self-export is hashed by NID?\n"); + + LCONSOLE_WARN("%s: evicting %s (at %s) by administrative request\n", + obd->obd_name, + obd_uuid2str(&doomed_exp->exp_client_uuid), + obd_export_nid2str(doomed_exp)); + + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + exports_evicted++; + doomed_exp = NULL; + } + + if (!exports_evicted) + CDEBUG(D_HA, + "%s: can't disconnect NID '%s': no exports found\n", + obd->obd_name, nid); + return exports_evicted; +} +EXPORT_SYMBOL(obd_export_evict_by_nid); + +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid) +{ + struct obd_export *doomed_exp = NULL; + struct obd_uuid doomed_uuid; + int exports_evicted = 0; + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + 
spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + spin_unlock(&obd->obd_dev_lock); + + obd_str2uuid(&doomed_uuid, uuid); + if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) { + CERROR("%s: can't evict myself\n", obd->obd_name); + return exports_evicted; + } + + doomed_exp = obd_uuid_lookup(obd, &doomed_uuid); + if (doomed_exp == NULL) { + CERROR("%s: can't disconnect %s: no exports found\n", + obd->obd_name, uuid); + } else { + CWARN("%s: evicting %s at adminstrative request\n", + obd->obd_name, doomed_exp->exp_client_uuid.uuid); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + obd_uuid_del(obd, doomed_exp); + exports_evicted++; + } + + return exports_evicted; +} +#endif /* HAVE_SERVER_SUPPORT */ + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void (*class_export_dump_hook)(struct obd_export*) = NULL; +EXPORT_SYMBOL(class_export_dump_hook); +#endif + +static void print_export_data(struct obd_export *exp, const char *status, + int locks, int debug_level) +{ + struct ptlrpc_reply_state *rs; + struct ptlrpc_reply_state *first_reply = NULL; + int nreplies = 0; + + spin_lock(&exp->exp_lock); + list_for_each_entry(rs, &exp->exp_outstanding_replies, + rs_exp_list) { + if (nreplies == 0) + first_reply = rs; + nreplies++; + } + spin_unlock(&exp->exp_lock); + + CDEBUG(debug_level, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: " + "%p %s %llu stale:%d\n", + exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid, + obd_export_nid2str(exp), + refcount_read(&exp->exp_handle.h_ref), + atomic_read(&exp->exp_rpc_count), + atomic_read(&exp->exp_cb_count), + atomic_read(&exp->exp_locks_count), + exp->exp_disconnected, exp->exp_delayed, exp->exp_failed, + nreplies, first_reply, nreplies > 3 ? "..." : "", + exp->exp_last_committed, !list_empty(&exp->exp_stale_list)); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + if (locks && class_export_dump_hook != NULL) + class_export_dump_hook(exp); +#endif +} + +void dump_exports(struct obd_device *obd, int locks, int debug_level) +{ + struct obd_export *exp; + + spin_lock(&obd->obd_dev_lock); + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) + print_export_data(exp, "ACTIVE", locks, debug_level); + list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain) + print_export_data(exp, "UNLINKED", locks, debug_level); + list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain) + print_export_data(exp, "DELAYED", locks, debug_level); + spin_unlock(&obd->obd_dev_lock); +} + +void obd_exports_barrier(struct obd_device *obd) +{ + int waited = 2; + LASSERT(list_empty(&obd->obd_exports)); + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_unlinked_exports)) { + spin_unlock(&obd->obd_dev_lock); + schedule_timeout_uninterruptible(cfs_time_seconds(waited)); + if (waited > 5 && is_power_of_2(waited)) { + LCONSOLE_WARN("%s is waiting for obd_unlinked_exports " + "more than %d seconds. " + "The obd refcount = %d. Is it stuck?\n", + obd->obd_name, waited, + atomic_read(&obd->obd_refcount)); + dump_exports(obd, 1, D_CONSOLE | D_WARNING); + } + waited *= 2; + spin_lock(&obd->obd_dev_lock); + } + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(obd_exports_barrier); + +/** + * Add export to the obd_zombe thread and notify it. 
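/*
 * Editor's illustrative sketch (not part of this patch): obd_exports_barrier()
 * above waits for the unlinked-export list to drain by sleeping with an
 * exponentially growing interval and warning only at power-of-two waits, so
 * the console is not flooded while a stuck reference is tracked down.  A
 * minimal userspace version of that backoff loop follows; condition_met() is
 * a helper invented for this example.
 */
#include <stdio.h>
#include <unistd.h>

static int condition_met(int i)
{
    return i >= 3;              /* pretend the list drains after a few polls */
}

static int is_power_of_2(unsigned int n)
{
    return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
    unsigned int waited = 2;
    int i = 0;

    while (!condition_met(i++)) {
        sleep(1);               /* the real code sleeps "waited" seconds */
        if (waited > 5 && is_power_of_2(waited))
            fprintf(stderr,
                    "still waiting after %u seconds, is it stuck?\n",
                    waited);
        waited *= 2;
    }
    printf("drained\n");
    return 0;
}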
+ */ +static void obd_zombie_export_add(struct obd_export *exp) { + atomic_dec(&obd_stale_export_num); + spin_lock(&exp->exp_obd->obd_dev_lock); + LASSERT(!list_empty(&exp->exp_obd_chain)); + list_del_init(&exp->exp_obd_chain); + spin_unlock(&exp->exp_obd->obd_dev_lock); + + queue_work(zombie_wq, &exp->exp_zombie_work); +} + +/** + * Add import to the obd_zombe thread and notify it. + */ +static void obd_zombie_import_add(struct obd_import *imp) { + LASSERT(imp->imp_sec == NULL); + + queue_work(zombie_wq, &imp->imp_zombie_work); +} + +/** + * wait when obd_zombie import/export queues become empty + */ +void obd_zombie_barrier(void) +{ + flush_workqueue(zombie_wq); +} +EXPORT_SYMBOL(obd_zombie_barrier); + + +struct obd_export *obd_stale_export_get(void) +{ + struct obd_export *exp = NULL; + ENTRY; + + spin_lock(&obd_stale_export_lock); + if (!list_empty(&obd_stale_exports)) { + exp = list_first_entry(&obd_stale_exports, + struct obd_export, exp_stale_list); + list_del_init(&exp->exp_stale_list); + } + spin_unlock(&obd_stale_export_lock); + + if (exp) { + CDEBUG(D_DLMTRACE, "Get export %p: total %d\n", exp, + atomic_read(&obd_stale_export_num)); + } + RETURN(exp); +} +EXPORT_SYMBOL(obd_stale_export_get); + +void obd_stale_export_put(struct obd_export *exp) +{ + ENTRY; + + LASSERT(list_empty(&exp->exp_stale_list)); + if (exp->exp_lock_hash && + atomic_read(&exp->exp_lock_hash->hs_count)) { + CDEBUG(D_DLMTRACE, "Put export %p: total %d\n", exp, + atomic_read(&obd_stale_export_num)); + + spin_lock_bh(&exp->exp_bl_list_lock); + spin_lock(&obd_stale_export_lock); + /* Add to the tail if there is no blocked locks, + * to the head otherwise. */ + if (list_empty(&exp->exp_bl_list)) + list_add_tail(&exp->exp_stale_list, + &obd_stale_exports); + else + list_add(&exp->exp_stale_list, + &obd_stale_exports); + + spin_unlock(&obd_stale_export_lock); + spin_unlock_bh(&exp->exp_bl_list_lock); + } else { + class_export_put(exp); + } + EXIT; +} +EXPORT_SYMBOL(obd_stale_export_put); + +/** + * Adjust the position of the export in the stale list, + * i.e. move to the head of the list if is needed. + **/ +void obd_stale_export_adjust(struct obd_export *exp) +{ + LASSERT(exp != NULL); + spin_lock_bh(&exp->exp_bl_list_lock); + spin_lock(&obd_stale_export_lock); + + if (!list_empty(&exp->exp_stale_list) && + !list_empty(&exp->exp_bl_list)) + list_move(&exp->exp_stale_list, &obd_stale_exports); + + spin_unlock(&obd_stale_export_lock); + spin_unlock_bh(&exp->exp_bl_list_lock); +} +EXPORT_SYMBOL(obd_stale_export_adjust); + +/** + * start destroy zombie import/export thread + */ +int obd_zombie_impexp_init(void) +{ + zombie_wq = cfs_cpt_bind_workqueue("obd_zombid", cfs_cpt_tab, + 0, CFS_CPT_ANY, + cfs_cpt_number(cfs_cpt_tab)); + + return IS_ERR(zombie_wq) ? 
PTR_ERR(zombie_wq) : 0; +} + +/** + * stop destroy zombie import/export thread + */ +void obd_zombie_impexp_stop(void) +{ + destroy_workqueue(zombie_wq); + LASSERT(list_empty(&obd_stale_exports)); +} + +/***** Kernel-userspace comm helpers *******/ + +/* Get length of entire message, including header */ +int kuc_len(int payload_len) +{ + return sizeof(struct kuc_hdr) + payload_len; +} +EXPORT_SYMBOL(kuc_len); + +/* Get a pointer to kuc header, given a ptr to the payload + * @param p Pointer to payload area + * @returns Pointer to kuc header + */ +struct kuc_hdr * kuc_ptr(void *p) +{ + struct kuc_hdr *lh = ((struct kuc_hdr *)p) - 1; + LASSERT(lh->kuc_magic == KUC_MAGIC); + return lh; +} +EXPORT_SYMBOL(kuc_ptr); + +/* Alloc space for a message, and fill in header + * @return Pointer to payload area + */ +void *kuc_alloc(int payload_len, int transport, int type) +{ + struct kuc_hdr *lh; + int len = kuc_len(payload_len); + + OBD_ALLOC(lh, len); + if (lh == NULL) + return ERR_PTR(-ENOMEM); + + lh->kuc_magic = KUC_MAGIC; + lh->kuc_transport = transport; + lh->kuc_msgtype = type; + lh->kuc_msglen = len; + + return (void *)(lh + 1); +} +EXPORT_SYMBOL(kuc_alloc); + +/* Takes pointer to payload area */ +void kuc_free(void *p, int payload_len) +{ + struct kuc_hdr *lh = kuc_ptr(p); + OBD_FREE(lh, kuc_len(payload_len)); +} +EXPORT_SYMBOL(kuc_free); + +struct obd_request_slot_waiter { + struct list_head orsw_entry; + wait_queue_head_t orsw_waitq; + bool orsw_signaled; +}; + +static bool obd_request_slot_avail(struct client_obd *cli, + struct obd_request_slot_waiter *orsw) +{ + bool avail; + + spin_lock(&cli->cl_loi_list_lock); + avail = !!list_empty(&orsw->orsw_entry); + spin_unlock(&cli->cl_loi_list_lock); + + return avail; +}; + +/* + * For network flow control, the RPC sponsor needs to acquire a credit + * before sending the RPC. The credits count for a connection is defined + * by the "cl_max_rpcs_in_flight". If all the credits are occpuied, then + * the subsequent RPC sponsors need to wait until others released their + * credits, or the administrator increased the "cl_max_rpcs_in_flight". + */ +int obd_get_request_slot(struct client_obd *cli) +{ + struct obd_request_slot_waiter orsw; + int rc; + + spin_lock(&cli->cl_loi_list_lock); + if (cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight) { + cli->cl_rpcs_in_flight++; + spin_unlock(&cli->cl_loi_list_lock); + return 0; + } + + init_waitqueue_head(&orsw.orsw_waitq); + list_add_tail(&orsw.orsw_entry, &cli->cl_flight_waiters); + orsw.orsw_signaled = false; + spin_unlock(&cli->cl_loi_list_lock); + + rc = l_wait_event_abortable(orsw.orsw_waitq, + obd_request_slot_avail(cli, &orsw) || + orsw.orsw_signaled); + + /* Here, we must take the lock to avoid the on-stack 'orsw' to be + * freed but other (such as obd_put_request_slot) is using it. */ + spin_lock(&cli->cl_loi_list_lock); + if (rc != 0) { + if (!orsw.orsw_signaled) { + if (list_empty(&orsw.orsw_entry)) + cli->cl_rpcs_in_flight--; + else + list_del(&orsw.orsw_entry); + } + rc = -EINTR; + } + + if (orsw.orsw_signaled) { + LASSERT(list_empty(&orsw.orsw_entry)); + + rc = -EINTR; + } + spin_unlock(&cli->cl_loi_list_lock); + + return rc; +} +EXPORT_SYMBOL(obd_get_request_slot); + +void obd_put_request_slot(struct client_obd *cli) +{ + struct obd_request_slot_waiter *orsw; + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_rpcs_in_flight--; + + /* If there is free slot, wakeup the first waiter. 
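/*
 * Editor's illustrative sketch (not part of this patch): the request-slot
 * code above is a counting credit pool -- a sender takes a credit if fewer
 * than cl_max_rpcs_in_flight RPCs are outstanding, otherwise it queues and
 * sleeps until a credit is released.  The userspace sketch below (all names
 * invented) models the same flow control with a mutex and condition variable
 * in place of the client_obd waiter list.
 */
#include <pthread.h>
#include <stdio.h>

struct credit_pool {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    unsigned int in_flight;
    unsigned int max_in_flight;
};

static void credit_get(struct credit_pool *p)
{
    pthread_mutex_lock(&p->lock);
    while (p->in_flight >= p->max_in_flight)
        pthread_cond_wait(&p->cond, &p->lock);
    p->in_flight++;
    pthread_mutex_unlock(&p->lock);
}

static void credit_put(struct credit_pool *p)
{
    pthread_mutex_lock(&p->lock);
    p->in_flight--;
    pthread_cond_signal(&p->cond);  /* wake one waiter, like the list head */
    pthread_mutex_unlock(&p->lock);
}

int main(void)
{
    struct credit_pool p = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .cond = PTHREAD_COND_INITIALIZER,
        .max_in_flight = 8,
    };

    credit_get(&p);             /* send an RPC */
    credit_put(&p);             /* reply arrived, release the credit */
    printf("in flight: %u\n", p.in_flight);
    return 0;
}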
*/ + if (!list_empty(&cli->cl_flight_waiters) && + likely(cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight)) { + orsw = list_first_entry(&cli->cl_flight_waiters, + struct obd_request_slot_waiter, + orsw_entry); + list_del_init(&orsw->orsw_entry); + cli->cl_rpcs_in_flight++; + wake_up(&orsw->orsw_waitq); + } + spin_unlock(&cli->cl_loi_list_lock); +} +EXPORT_SYMBOL(obd_put_request_slot); + +__u32 obd_get_max_rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_max_rpcs_in_flight; +} +EXPORT_SYMBOL(obd_get_max_rpcs_in_flight); + +int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max) +{ + struct obd_request_slot_waiter *orsw; + __u32 old; + int diff; + int i; + int rc; + + if (max > OBD_MAX_RIF_MAX || max < 1) + return -ERANGE; + + CDEBUG(D_INFO, "%s: max = %hu max_mod = %u rif = %u\n", + cli->cl_import->imp_obd->obd_name, max, + cli->cl_max_mod_rpcs_in_flight, cli->cl_max_rpcs_in_flight); + + if (strcmp(cli->cl_import->imp_obd->obd_type->typ_name, + LUSTRE_MDC_NAME) == 0) { + /* adjust max_mod_rpcs_in_flight to ensure it is always + * strictly lower that max_rpcs_in_flight */ + if (max < 2) { + CERROR("%s: cannot set mdc.*.max_rpcs_in_flight=1\n", + cli->cl_import->imp_obd->obd_name); + return -ERANGE; + } + if (max <= cli->cl_max_mod_rpcs_in_flight) { + rc = obd_set_max_mod_rpcs_in_flight(cli, max - 1); + if (rc != 0) + return rc; + } + } + + spin_lock(&cli->cl_loi_list_lock); + old = cli->cl_max_rpcs_in_flight; + cli->cl_max_rpcs_in_flight = max; + client_adjust_max_dirty(cli); + + diff = max - old; + + /* We increase the max_rpcs_in_flight, then wakeup some waiters. */ + for (i = 0; i < diff; i++) { + orsw = list_first_entry_or_null(&cli->cl_loi_read_list, + struct obd_request_slot_waiter, + orsw_entry); + if (!orsw) + break; + + list_del_init(&orsw->orsw_entry); + cli->cl_rpcs_in_flight++; + wake_up(&orsw->orsw_waitq); + } + spin_unlock(&cli->cl_loi_list_lock); + + return 0; +} +EXPORT_SYMBOL(obd_set_max_rpcs_in_flight); + +__u16 obd_get_max_mod_rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_max_mod_rpcs_in_flight; +} +EXPORT_SYMBOL(obd_get_max_mod_rpcs_in_flight); + +int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max) +{ + struct obd_connect_data *ocd; + __u16 maxmodrpcs; + __u16 prev; + + if (max > OBD_MAX_RIF_MAX || max < 1) + return -ERANGE; + + ocd = &cli->cl_import->imp_connect_data; + CDEBUG(D_INFO, "%s: max = %hu flags = %llx, max_mod = %u rif = %u\n", + cli->cl_import->imp_obd->obd_name, max, ocd->ocd_connect_flags, + ocd->ocd_maxmodrpcs, cli->cl_max_rpcs_in_flight); + + if (max == OBD_MAX_RIF_MAX) + max = OBD_MAX_RIF_MAX - 1; + + /* Cannot exceed or equal max_rpcs_in_flight. If we are asked to + * increase this value, also bump up max_rpcs_in_flight to match. + */ + if (max >= cli->cl_max_rpcs_in_flight) { + CDEBUG(D_INFO, + "%s: increasing max_rpcs_in_flight=%hu to allow larger max_mod_rpcs_in_flight=%u\n", + cli->cl_import->imp_obd->obd_name, max + 1, max); + obd_set_max_rpcs_in_flight(cli, max + 1); + } + + /* cannot exceed max modify RPCs in flight supported by the server, + * but verify ocd_connect_flags is at least initialized first. If + * not, allow it and fix value later in ptlrpc_connect_set_flags(). 
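/*
 * Editor's illustrative sketch (not part of this patch): the two setters
 * above maintain the invariant max_mod_rpcs_in_flight < max_rpcs_in_flight,
 * adjusting the other limit rather than failing where possible.  A condensed
 * userspace version of just that invariant (types and bounds invented):
 */
#include <stdio.h>

struct limits {
    unsigned int max_rpcs;      /* must stay >= 2 */
    unsigned int max_mod_rpcs;  /* must stay < max_rpcs */
};

static int set_max_rpcs(struct limits *l, unsigned int max)
{
    if (max < 2)
        return -1;              /* would force max_mod_rpcs to 0 */
    if (l->max_mod_rpcs >= max)
        l->max_mod_rpcs = max - 1;      /* pull the other limit down */
    l->max_rpcs = max;
    return 0;
}

static int set_max_mod_rpcs(struct limits *l, unsigned int max)
{
    if (max < 1)
        return -1;
    if (max >= l->max_rpcs)
        l->max_rpcs = max + 1;          /* push the other limit up */
    l->max_mod_rpcs = max;
    return 0;
}

int main(void)
{
    struct limits l = { .max_rpcs = 8, .max_mod_rpcs = 7 };

    set_max_mod_rpcs(&l, 9);    /* bumps max_rpcs to 10 */
    set_max_rpcs(&l, 4);        /* pulls max_mod_rpcs back to 3 */
    printf("rpcs=%u mod=%u\n", l.max_rpcs, l.max_mod_rpcs);
    return 0;
}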
+ */ + if (!ocd->ocd_connect_flags) { + maxmodrpcs = cli->cl_max_rpcs_in_flight - 1; + } else if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) { + maxmodrpcs = ocd->ocd_maxmodrpcs; + if (maxmodrpcs == 0) { /* connection not finished yet */ + maxmodrpcs = cli->cl_max_rpcs_in_flight - 1; + CDEBUG(D_INFO, + "%s: partial connect, assume maxmodrpcs=%hu\n", + cli->cl_import->imp_obd->obd_name, maxmodrpcs); + } + } else { + maxmodrpcs = 1; + } + if (max > maxmodrpcs) { + CERROR("%s: can't set max_mod_rpcs_in_flight=%hu higher than ocd_maxmodrpcs=%hu returned by the server at connection\n", + cli->cl_import->imp_obd->obd_name, + max, maxmodrpcs); + return -ERANGE; + } + + spin_lock(&cli->cl_mod_rpcs_lock); + + prev = cli->cl_max_mod_rpcs_in_flight; + cli->cl_max_mod_rpcs_in_flight = max; + + /* wakeup waiters if limit has been increased */ + if (cli->cl_max_mod_rpcs_in_flight > prev) + wake_up(&cli->cl_mod_rpcs_waitq); + + spin_unlock(&cli->cl_mod_rpcs_lock); + + return 0; +} +EXPORT_SYMBOL(obd_set_max_mod_rpcs_in_flight); + +int obd_mod_rpc_stats_seq_show(struct client_obd *cli, + struct seq_file *seq) +{ + unsigned long mod_tot = 0, mod_cum; + int i; + + spin_lock(&cli->cl_mod_rpcs_lock); + lprocfs_stats_header(seq, ktime_get_real(), cli->cl_mod_rpcs_init, 25, + ":", true, ""); + seq_printf(seq, "modify_RPCs_in_flight: %hu\n", + cli->cl_mod_rpcs_in_flight); + + seq_printf(seq, "\n\t\t\tmodify\n"); + seq_printf(seq, "rpcs in flight rpcs %% cum %%\n"); + + mod_tot = lprocfs_oh_sum(&cli->cl_mod_rpcs_hist); + + mod_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long mod = cli->cl_mod_rpcs_hist.oh_buckets[i]; + + mod_cum += mod; + seq_printf(seq, "%d:\t\t%10lu %3u %3u\n", + i, mod, pct(mod, mod_tot), + pct(mod_cum, mod_tot)); + if (mod_cum == mod_tot) + break; + } + + spin_unlock(&cli->cl_mod_rpcs_lock); + + return 0; +} +EXPORT_SYMBOL(obd_mod_rpc_stats_seq_show); + +/* The number of modify RPCs sent in parallel is limited + * because the server has a finite number of slots per client to + * store request result and ensure reply reconstruction when needed. + * On the client, this limit is stored in cl_max_mod_rpcs_in_flight + * that takes into account server limit and cl_max_rpcs_in_flight + * value. + * On the MDC client, to avoid a potential deadlock (see Bugzilla 3462), + * one close request is allowed above the maximum. + */ +static inline bool obd_mod_rpc_slot_avail_locked(struct client_obd *cli, + bool close_req) +{ + bool avail; + + /* A slot is available if + * - number of modify RPCs in flight is less than the max + * - it's a close RPC and no other close request is in flight + */ + avail = cli->cl_mod_rpcs_in_flight < cli->cl_max_mod_rpcs_in_flight || + (close_req && cli->cl_close_rpcs_in_flight == 0); + + return avail; +} + +static inline bool obd_mod_rpc_slot_avail(struct client_obd *cli, + bool close_req) +{ + bool avail; + + spin_lock(&cli->cl_mod_rpcs_lock); + avail = obd_mod_rpc_slot_avail_locked(cli, close_req); + spin_unlock(&cli->cl_mod_rpcs_lock); + return avail; +} + + +/* Get a modify RPC slot from the obd client @cli according + * to the kind of operation @opc that is going to be sent + * and the intent @it of the operation if it applies. + * If the maximum number of modify RPCs in flight is reached + * the thread is put to sleep. + * Returns the tag to be set in the request message. Tag 0 + * is reserved for non-modifying requests. 
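/*
 * Editor's illustrative sketch (not part of this patch): as described above,
 * obd_get_mod_rpc_slot() just below hands out per-RPC tags from a bitmap,
 * with tag 0 reserved for non-modifying requests, so the returned tag is
 * always "bit index + 1".  A self-contained userspace version of that
 * allocation scheme (64 tags, helper names invented):
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t tag_bitmap;     /* bit i set => tag i+1 is in use */

/* returns a tag in [1, 64], or 0 if every tag is busy */
static unsigned int tag_get(void)
{
    unsigned int i;

    for (i = 0; i < 64; i++) {
        if (!(tag_bitmap & (UINT64_C(1) << i))) {
            tag_bitmap |= UINT64_C(1) << i;
            return i + 1;       /* tag 0 stays reserved */
        }
    }
    return 0;
}

static void tag_put(unsigned int tag)
{
    if (tag >= 1 && tag <= 64)
        tag_bitmap &= ~(UINT64_C(1) << (tag - 1));
}

int main(void)
{
    unsigned int a = tag_get(), b = tag_get();

    printf("tags %u %u\n", a, b);       /* 1 2 */
    tag_put(a);
    printf("reused %u\n", tag_get());   /* 1 again */
    tag_put(b);
    return 0;
}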
+ */ +__u16 obd_get_mod_rpc_slot(struct client_obd *cli, __u32 opc) +{ + bool close_req = false; + __u16 i, max; + + if (opc == MDS_CLOSE) + close_req = true; + + do { + spin_lock(&cli->cl_mod_rpcs_lock); + max = cli->cl_max_mod_rpcs_in_flight; + if (obd_mod_rpc_slot_avail_locked(cli, close_req)) { + /* there is a slot available */ + cli->cl_mod_rpcs_in_flight++; + if (close_req) + cli->cl_close_rpcs_in_flight++; + lprocfs_oh_tally(&cli->cl_mod_rpcs_hist, + cli->cl_mod_rpcs_in_flight); + /* find a free tag */ + i = find_first_zero_bit(cli->cl_mod_tag_bitmap, + max + 1); + LASSERT(i < OBD_MAX_RIF_MAX); + LASSERT(!test_and_set_bit(i, cli->cl_mod_tag_bitmap)); + spin_unlock(&cli->cl_mod_rpcs_lock); + /* tag 0 is reserved for non-modify RPCs */ + + CDEBUG(D_RPCTRACE, + "%s: modify RPC slot %u is allocated opc %u, max %hu\n", + cli->cl_import->imp_obd->obd_name, + i + 1, opc, max); + + return i + 1; + } + spin_unlock(&cli->cl_mod_rpcs_lock); + + CDEBUG(D_RPCTRACE, "%s: sleeping for a modify RPC slot " + "opc %u, max %hu\n", + cli->cl_import->imp_obd->obd_name, opc, max); + + wait_event_idle_exclusive(cli->cl_mod_rpcs_waitq, + obd_mod_rpc_slot_avail(cli, + close_req)); + } while (true); +} +EXPORT_SYMBOL(obd_get_mod_rpc_slot); + +/* Put a modify RPC slot from the obd client @cli according + * to the kind of operation @opc that has been sent. + */ +void obd_put_mod_rpc_slot(struct client_obd *cli, __u32 opc, __u16 tag) +{ + bool close_req = false; + + if (tag == 0) + return; + + if (opc == MDS_CLOSE) + close_req = true; + + spin_lock(&cli->cl_mod_rpcs_lock); + cli->cl_mod_rpcs_in_flight--; + if (close_req) + cli->cl_close_rpcs_in_flight--; + /* release the tag in the bitmap */ + LASSERT(tag - 1 < OBD_MAX_RIF_MAX); + LASSERT(test_and_clear_bit(tag - 1, cli->cl_mod_tag_bitmap) != 0); + spin_unlock(&cli->cl_mod_rpcs_lock); + /* LU-14741 - to prevent close RPCs stuck behind normal ones */ + if (close_req) + wake_up_all(&cli->cl_mod_rpcs_waitq); + else + wake_up(&cli->cl_mod_rpcs_waitq); +} +EXPORT_SYMBOL(obd_put_mod_rpc_slot); + diff --git a/drivers/staging/lustrefsx/lustre/obdclass/idmap.c b/drivers/staging/lustrefsx/lustre/obdclass/idmap.c new file mode 100644 index 0000000000000..b89a6d2e86a61 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/idmap.c @@ -0,0 +1,161 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/idmap.c + * + * Lustre user identity mapping. 
+ * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include + +#include +#include +#include +#include + +/* + * groups_search() is copied from linux kernel! + * A simple bsearch. + */ +static int lustre_groups_search(struct group_info *group_info, + gid_t grp) +{ + int left, right; + + if (!group_info) + return 0; + + left = 0; + right = group_info->ngroups; + while (left < right) { + int mid = (left + right) / 2; + int cmp = grp - + from_kgid(&init_user_ns, CFS_GROUP_AT(group_info, mid)); + + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return 1; + } + return 0; +} + +void lustre_groups_from_list(struct group_info *ginfo, gid_t *glist) +{ +#ifdef HAVE_GROUP_INFO_GID + memcpy(ginfo->gid, glist, ginfo->ngroups * sizeof(__u32)); +#else + int i; + int count = ginfo->ngroups; + + /* fill group_info from gid array */ + for (i = 0; i < ginfo->nblocks && count > 0; i++) { + int cp_count = min(CFS_NGROUPS_PER_BLOCK, count); + int off = i * CFS_NGROUPS_PER_BLOCK; + int len = cp_count * sizeof(*glist); + + memcpy(ginfo->blocks[i], glist + off, len); + count -= cp_count; + } +#endif +} +EXPORT_SYMBOL(lustre_groups_from_list); + +/* groups_sort() is copied from linux kernel! */ +/* a simple shell-metzner sort */ +void lustre_groups_sort(struct group_info *group_info) +{ + int base, max, stride; + int gidsetsize = group_info->ngroups; + + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; + + while (stride) { + max = gidsetsize - stride; + for (base = 0; base < max; base++) { + int left = base; + int right = left + stride; + gid_t tmp = from_kgid(&init_user_ns, + CFS_GROUP_AT(group_info, right)); + + while (left >= 0 && + tmp < from_kgid(&init_user_ns, + CFS_GROUP_AT(group_info, left))) { + CFS_GROUP_AT(group_info, right) = + CFS_GROUP_AT(group_info, left); + right = left; + left -= stride; + } + CFS_GROUP_AT(group_info, right) = + make_kgid(&init_user_ns, tmp); + } + stride /= 3; + } +} +EXPORT_SYMBOL(lustre_groups_sort); + +int lustre_in_group_p(struct lu_ucred *mu, gid_t grp) +{ + int rc = 1; + + if (grp != mu->uc_fsgid) { + struct group_info *group_info = NULL; + + if (mu->uc_ginfo || !mu->uc_identity || + mu->uc_valid == UCRED_OLD) + if (grp == mu->uc_suppgids[0] || + grp == mu->uc_suppgids[1]) + return 1; + + if (mu->uc_ginfo) + group_info = mu->uc_ginfo; + else if (mu->uc_identity) + group_info = mu->uc_identity->mi_ginfo; + + if (!group_info) + return 0; + + atomic_inc(&group_info->usage); + rc = lustre_groups_search(group_info, grp); + if (atomic_dec_and_test(&group_info->usage)) + groups_free(group_info); + } + return rc; +} +EXPORT_SYMBOL(lustre_in_group_p); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/integrity.c b/drivers/staging/lustrefsx/lustre/obdclass/integrity.c new file mode 100644 index 0000000000000..1ccec8a93985d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/integrity.c @@ -0,0 +1,277 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
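/*
 * Editor's illustrative sketch (not part of this patch): lustre_groups_sort()
 * and lustre_groups_search() in idmap.c above are the kernel's Shell-Metzner
 * sort plus a plain binary search over the supplementary-group array.  The
 * same pair on a flat array of IDs, runnable in userspace:
 */
#include <stdio.h>

static void groups_sort(unsigned int *g, int n)
{
    int stride, base, right;

    for (stride = 1; stride < n; stride = 3 * stride + 1)
        ;                       /* largest 3h+1 stride below n */
    stride /= 3;

    while (stride) {
        for (base = 0; base < n - stride; base++) {
            unsigned int tmp = g[base + stride];
            int left = base;

            right = left + stride;
            while (left >= 0 && tmp < g[left]) {
                g[right] = g[left];
                right = left;
                left -= stride;
            }
            g[right] = tmp;
        }
        stride /= 3;
    }
}

static int groups_search(const unsigned int *g, int n, unsigned int grp)
{
    int left = 0, right = n;

    while (left < right) {
        int mid = (left + right) / 2;

        if (grp > g[mid])
            left = mid + 1;
        else if (grp < g[mid])
            right = mid;
        else
            return 1;
    }
    return 0;
}

int main(void)
{
    unsigned int gids[] = { 1000, 4, 27, 500, 4 };
    int n = sizeof(gids) / sizeof(gids[0]);

    groups_sort(gids, n);
    printf("has 27: %d, has 28: %d\n",
           groups_search(gids, n, 27), groups_search(gids, n, 28));
    return 0;
}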
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2018, DataDirect Networks Storage. + * Author: Li Xi. + * + * General data integrity functions + */ +#include +#include +#include +#include +#include + +#if IS_ENABLED(CONFIG_CRC_T10DIF) +__be16 obd_dif_crc_fn(void *data, unsigned int len) +{ + return cpu_to_be16(crc_t10dif(data, len)); +} +EXPORT_SYMBOL(obd_dif_crc_fn); + +__be16 obd_dif_ip_fn(void *data, unsigned int len) +{ + return (__force __be16)ip_compute_csum(data, len); +} +EXPORT_SYMBOL(obd_dif_ip_fn); + +int obd_page_dif_generate_buffer(const char *obd_name, struct page *page, + __u32 offset, __u32 length, + __be16 *guard_start, int guard_number, + int *used_number, int sector_size, + obd_dif_csum_fn *fn) +{ + unsigned int i = offset; + unsigned int end = offset + length; + char *data_buf; + __be16 *guard_buf = guard_start; + unsigned int data_size; + int used = 0; + + data_buf = kmap(page) + offset; + while (i < end) { + if (used >= guard_number) { + CERROR("%s: unexpected used guard number of DIF %u/%u, " + "data length %u, sector size %u: rc = %d\n", + obd_name, used, guard_number, length, + sector_size, -E2BIG); + return -E2BIG; + } + data_size = min(round_up(i + 1, sector_size), end) - i; + *guard_buf = fn(data_buf, data_size); + guard_buf++; + data_buf += data_size; + i += data_size; + used++; + } + kunmap(page); + *used_number = used; + + return 0; +} +EXPORT_SYMBOL(obd_page_dif_generate_buffer); + +static int __obd_t10_performance_test(const char *obd_name, + enum cksum_types cksum_type, + struct page *data_page, + int repeat_number) +{ + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + struct ahash_request *req; + obd_dif_csum_fn *fn = NULL; + unsigned int bufsize; + unsigned char *buffer; + struct page *__page; + __be16 *guard_start; + int guard_number; + int used_number = 0; + int sector_size = 0; + __u32 cksum; + int rc = 0; + int rc2; + int used; + int i; + + obd_t10_cksum2dif(cksum_type, &fn, §or_size); + if (!fn) + return -EINVAL; + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("%s: unable to initialize checksum hash %s: rc = %d\n", + obd_name, cfs_crypto_hash_name(cfs_alg), rc); + GOTO(out, rc); + } + + buffer = kmap(__page); + guard_start = (__be16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + for (i = 0; i < repeat_number; i++) { + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + rc = obd_page_dif_generate_buffer(obd_name, data_page, 0, + PAGE_SIZE, + guard_start + used_number, + guard_number - used_number, + &used, sector_size, fn); + if (rc) + break; + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + } + kunmap(__page); + if (rc) + GOTO(out_final, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); +out_final: + rc2 = cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + rc = rc ? 
rc : rc2; +out: + __free_page(__page); + + return rc; +} + +/** + * Array of T10PI checksum algorithm speed in MByte per second + */ +static int obd_t10_cksum_speeds[OBD_T10_CKSUM_MAX]; + +static enum obd_t10_cksum_type +obd_t10_cksum2type(enum cksum_types cksum_type) +{ + switch (cksum_type) { + case OBD_CKSUM_T10IP512: + return OBD_T10_CKSUM_IP512; + case OBD_CKSUM_T10IP4K: + return OBD_T10_CKSUM_IP4K; + case OBD_CKSUM_T10CRC512: + return OBD_T10_CKSUM_CRC512; + case OBD_CKSUM_T10CRC4K: + return OBD_T10_CKSUM_CRC4K; + default: + return OBD_T10_CKSUM_UNKNOWN; + } +} + +static const char *obd_t10_cksum_name(enum obd_t10_cksum_type index) +{ + DECLARE_CKSUM_NAME; + + /* Need to skip "crc32", "adler", "crc32c", "reserved" */ + return cksum_name[3 + index]; +} + +/** + * Compute the speed of specified T10PI checksum type + * + * Run a speed test on the given T10PI checksum on buffer using a 1MB buffer + * size. This is a reasonable buffer size for Lustre RPCs, even if the actual + * RPC size is larger or smaller. + * + * The speed is stored internally in the obd_t10_cksum_speeds[] array, and + * is available through the obd_t10_cksum_speed() function. + * + * This function needs to stay the same as cfs_crypto_performance_test() so + * that the speeds are comparable. And this function should reflect the real + * cost of the checksum calculation. + * + * \param[in] obd_name name of the OBD device + * \param[in] cksum_type checksum type (OBD_CKSUM_T10*) + */ +static void obd_t10_performance_test(const char *obd_name, + enum cksum_types cksum_type) +{ + enum obd_t10_cksum_type index = obd_t10_cksum2type(cksum_type); + const int buf_len = max(PAGE_SIZE, 1048576UL); + unsigned long bcount; + unsigned long start; + unsigned long end; + struct page *page; + int rc = 0; + void *buf; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) { + rc = -ENOMEM; + goto out; + } + + buf = kmap(page); + memset(buf, 0xAD, PAGE_SIZE); + kunmap(page); + + for (start = jiffies, end = start + cfs_time_seconds(1) / 4, + bcount = 0; time_before(jiffies, end) && rc == 0; bcount++) { + rc = __obd_t10_performance_test(obd_name, cksum_type, page, + buf_len >> PAGE_SHIFT); + if (rc) + break; + } + end = jiffies; + __free_page(page); +out: + if (rc) { + obd_t10_cksum_speeds[index] = rc; + CDEBUG(D_INFO, "%s: T10 checksum algorithm %s test error: " + "rc = %d\n", obd_name, obd_t10_cksum_name(index), rc); + } else { + unsigned long tmp; + + tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) * + 1000) / (1024 * 1024); + obd_t10_cksum_speeds[index] = (int)tmp; + CDEBUG(D_CONFIG, "%s: T10 checksum algorithm %s speed = %d " + "MB/s\n", obd_name, obd_t10_cksum_name(index), + obd_t10_cksum_speeds[index]); + } +} +#endif /* CONFIG_CRC_T10DIF */ + +int obd_t10_cksum_speed(const char *obd_name, + enum cksum_types cksum_type) +{ +#if IS_ENABLED(CONFIG_CRC_T10DIF) + enum obd_t10_cksum_type index = obd_t10_cksum2type(cksum_type); + + if (unlikely(obd_t10_cksum_speeds[index] == 0)) { + static DEFINE_MUTEX(obd_t10_cksum_speed_mutex); + + mutex_lock(&obd_t10_cksum_speed_mutex); + if (obd_t10_cksum_speeds[index] == 0) + obd_t10_performance_test(obd_name, cksum_type); + mutex_unlock(&obd_t10_cksum_speed_mutex); + } + + return obd_t10_cksum_speeds[index]; +#else /* !CONFIG_CRC_T10DIF */ + return 0; +#endif /* !CONFIG_CRC_T10DIF */ +} +EXPORT_SYMBOL(obd_t10_cksum_speed); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/interval_tree.c b/drivers/staging/lustrefsx/lustre/obdclass/interval_tree.c new file mode 100644 index 
0000000000000..6007d37f61b5d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/interval_tree.c @@ -0,0 +1,772 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/interval_tree.c + * + * Interval tree library used by ldlm extent lock code + * + * Author: Huang Wei + * Author: Jay Xiong + */ + +#include +#include + +enum { + INTERVAL_RED = 0, + INTERVAL_BLACK = 1 +}; + +static inline int node_is_left_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_left; +} + +static inline int node_is_right_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_right; +} + +static inline int node_is_red(struct interval_node *node) +{ + return node->in_color == INTERVAL_RED; +} + +static inline int node_is_black(struct interval_node *node) +{ + return node->in_color == INTERVAL_BLACK; +} + +static inline int extent_compare(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + int rc; + + if (e1->start == e2->start) { + if (e1->end < e2->end) + rc = -1; + else if (e1->end > e2->end) + rc = 1; + else + rc = 0; + } else { + if (e1->start < e2->start) + rc = -1; + else + rc = 1; + } + return rc; +} + +static inline int extent_equal(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start == e2->start) && (e1->end == e2->end); +} + +static inline int extent_overlapped(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start <= e2->end) && (e2->start <= e1->end); +} + +static inline int node_compare(struct interval_node *n1, + struct interval_node *n2) +{ + return extent_compare(&n1->in_extent, &n2->in_extent); +} + +int node_equal(struct interval_node *n1, struct interval_node *n2) +{ + return extent_equal(&n1->in_extent, &n2->in_extent); +} + +#define interval_for_each(node, root) \ +for (node = interval_first(root); node != NULL; \ + node = interval_next(node)) + +#define interval_for_each_reverse(node, root) \ +for (node = interval_last(root); node != NULL; \ + node = interval_prev(node)) + +static struct interval_node *interval_first(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + while (node->in_left) + node = node->in_left; + RETURN(node); +} + +static struct interval_node *interval_last(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + while (node->in_right) + node = node->in_right; + RETURN(node); +} + +static struct interval_node 
*interval_next(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + if (node->in_right) + RETURN(interval_first(node->in_right)); + while (node->in_parent && node_is_right_child(node)) + node = node->in_parent; + RETURN(node->in_parent); +} + +static struct interval_node *interval_prev(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + + if (node->in_left) + RETURN(interval_last(node->in_left)); + + while (node->in_parent && node_is_left_child(node)) + node = node->in_parent; + + RETURN(node->in_parent); +} + +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + + ENTRY; + + interval_for_each(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_iterate); + +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + + ENTRY; + + interval_for_each_reverse(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_iterate_reverse); + +/* try to find a node with same interval in the tree, + * if found, return the pointer to the node, otherwise return NULL + */ +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex) +{ + struct interval_node *walk = root; + int rc; + + ENTRY; + + while (walk) { + rc = extent_compare(ex, &walk->in_extent); + if (rc == 0) + break; + else if (rc < 0) + walk = walk->in_left; + else + walk = walk->in_right; + } + + RETURN(walk); +} +EXPORT_SYMBOL(interval_find); + +static void __rotate_change_maxhigh(struct interval_node *node, + struct interval_node *rotate) +{ + __u64 left_max, right_max; + + rotate->in_max_high = node->in_max_high; + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max3(interval_high(node), + left_max, right_max); +} + +/* The left rotation "pivots" around the link from node to node->right, and + * - node will be linked to node->right's left child, and + * - node->right's left child will be linked to node's right child. + */ +static void __rotate_left(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *right = node->in_right; + struct interval_node *parent = node->in_parent; + + node->in_right = right->in_left; + if (node->in_right) + right->in_left->in_parent = node; + + right->in_left = node; + right->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = right; + else + parent->in_right = right; + } else { + *root = right; + } + node->in_parent = right; + + /* update max_high for node and right */ + __rotate_change_maxhigh(node, right); +} + +/* The right rotation "pivots" around the link from node to node->left, and + * - node will be linked to node->left's right child, and + * - node->left's right child will be linked to node's left child. 
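/*
 * Editor's illustrative sketch (not part of this patch): the rotations above
 * must also repair the in_max_high augmentation, because a rotation changes
 * which node sits on top of which subtree.  Below is a minimal left rotation
 * on an interval-like node (no colour or parent bookkeeping) showing the
 * order of the max-high updates; all names are invented for this example.
 */
#include <stdio.h>

struct inode_ex {
    unsigned long long start, end;      /* the interval itself */
    unsigned long long max_high;        /* max "end" within this subtree */
    struct inode_ex *left, *right;
};

static unsigned long long subtree_max(const struct inode_ex *n)
{
    return n ? n->max_high : 0;
}

static unsigned long long max3u(unsigned long long a, unsigned long long b,
                                unsigned long long c)
{
    unsigned long long m = a > b ? a : b;

    return m > c ? m : c;
}

/* rotate *linkp (which must have a right child) to the left */
static void rotate_left(struct inode_ex **linkp)
{
    struct inode_ex *node = *linkp;
    struct inode_ex *right = node->right;

    node->right = right->left;
    right->left = node;
    *linkp = right;

    /* the new top inherits the old subtree maximum ... */
    right->max_high = node->max_high;
    /* ... and the demoted node is recomputed from its new children */
    node->max_high = max3u(node->end, subtree_max(node->left),
                           subtree_max(node->right));
}

int main(void)
{
    struct inode_ex a = { 0, 10, 10, NULL, NULL };
    struct inode_ex b = { 20, 90, 90, NULL, NULL };
    struct inode_ex *root = &a;

    a.right = &b;
    a.max_high = 90;
    rotate_left(&root);
    printf("root [%llu,%llu] max %llu, child max %llu\n",
           root->start, root->end, root->max_high, root->left->max_high);
    return 0;
}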
+ */ +static void __rotate_right(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *left = node->in_left; + struct interval_node *parent = node->in_parent; + + node->in_left = left->in_right; + if (node->in_left) + left->in_right->in_parent = node; + left->in_right = node; + + left->in_parent = parent; + if (parent) { + if (node_is_right_child(node)) + parent->in_right = left; + else + parent->in_left = left; + } else { + *root = left; + } + node->in_parent = left; + + /* update max_high for node and left */ + __rotate_change_maxhigh(node, left); +} + +#define interval_swap(a, b) do { \ + struct interval_node *c = a; a = b; b = c; \ +} while (0) + +/* + * Operations INSERT and DELETE, when run on a tree with n keys, + * take O(logN) time.Because they modify the tree, the result + * may violate the red-black properties.To restore these properties, + * we must change the colors of some of the nodes in the tree + * and also change the pointer structure. + */ +static void interval_insert_color(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *parent, *gparent; + + ENTRY; + + while ((parent = node->in_parent) && node_is_red(parent)) { + gparent = parent->in_parent; + /* Parent is RED, so gparent must not be NULL */ + if (node_is_left_child(parent)) { + struct interval_node *uncle; + + uncle = gparent->in_right; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (parent->in_right == node) { + __rotate_left(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_right(gparent, root); + } else { + struct interval_node *uncle; + + uncle = gparent->in_left; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (node_is_left_child(node)) { + __rotate_right(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_left(gparent, root); + } + } + + (*root)->in_color = INTERVAL_BLACK; + EXIT; +} + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node **p, *parent = NULL; + + ENTRY; + + LASSERT(!interval_is_intree(node)); + p = root; + while (*p) { + parent = *p; + if (node_equal(parent, node)) + RETURN(parent); + + /* max_high field must be updated after each iteration */ + if (parent->in_max_high < interval_high(node)) + parent->in_max_high = interval_high(node); + + if (node_compare(node, parent) < 0) + p = &parent->in_left; + else + p = &parent->in_right; + } + + /* link node into the tree */ + node->in_parent = parent; + node->in_color = INTERVAL_RED; + node->in_left = node->in_right = NULL; + *p = node; + + interval_insert_color(node, root); + node->in_intree = 1; + + RETURN(NULL); +} +EXPORT_SYMBOL(interval_insert); + +static inline int node_is_black_or_0(struct interval_node *node) +{ + return !node || node_is_black(node); +} + +static void interval_erase_color(struct interval_node *node, + struct interval_node *parent, + struct interval_node **root) +{ + struct interval_node *tmp; + + ENTRY; + + while (node_is_black_or_0(node) && node != *root) { + if (parent->in_left == node) { + tmp = parent->in_right; + if (node_is_red(tmp)) { + 
tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_left(parent, root); + tmp = parent->in_right; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + tmp->in_color = INTERVAL_RED; + node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_right)) { + struct interval_node *o_left; + + if ((o_left = tmp->in_left)) + o_left->in_color = + INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_right(tmp, root); + tmp = parent->in_right; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_right) + tmp->in_right->in_color = + INTERVAL_BLACK; + __rotate_left(parent, root); + node = *root; + break; + } + } else { + tmp = parent->in_left; + if (node_is_red(tmp)) { + tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_right(parent, root); + tmp = parent->in_left; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + tmp->in_color = INTERVAL_RED; + node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_left)) { + struct interval_node *o_right; + + if ((o_right = tmp->in_right)) + o_right->in_color = + INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_left(tmp, root); + tmp = parent->in_left; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_left) + tmp->in_left->in_color = INTERVAL_BLACK; + __rotate_right(parent, root); + node = *root; + break; + } + } + } + if (node) + node->in_color = INTERVAL_BLACK; + EXIT; +} + +/* + * if the @max_high value of @node is changed, this function traverse a path + * from node up to the root to update max_high for the whole tree. + */ +static void update_maxhigh(struct interval_node *node, + __u64 old_maxhigh) +{ + __u64 left_max, right_max; + + ENTRY; + + while (node) { + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max3(interval_high(node), + left_max, right_max); + + if (node->in_max_high >= old_maxhigh) + break; + node = node->in_parent; + } + EXIT; +} + +void interval_erase(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *child, *parent; + int color; + + ENTRY; + + LASSERT(interval_is_intree(node)); + node->in_intree = 0; + if (!node->in_left) { + child = node->in_right; + } else if (!node->in_right) { + child = node->in_left; + } else { /* Both left and right child are not NULL */ + struct interval_node *old = node; + + node = interval_next(node); + child = node->in_right; + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent == old) + parent->in_right = child; + else + parent->in_left = child; + + node->in_color = old->in_color; + node->in_right = old->in_right; + node->in_left = old->in_left; + node->in_parent = old->in_parent; + + if (old->in_parent) { + if (node_is_left_child(old)) + old->in_parent->in_left = node; + else + old->in_parent->in_right = node; + } else { + *root = node; + } + + old->in_left->in_parent = node; + if (old->in_right) + old->in_right->in_parent = node; + update_maxhigh(child ? 
: parent, node->in_max_high); + update_maxhigh(node, old->in_max_high); + if (parent == old) + parent = node; + goto color; + } + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = child; + else + parent->in_right = child; + } else { + *root = child; + } + + update_maxhigh(child ? : parent, node->in_max_high); + +color: + if (color == INTERVAL_BLACK) + interval_erase_color(child, parent, root); + EXIT; +} +EXPORT_SYMBOL(interval_erase); + +static inline int interval_may_overlap(struct interval_node *node, + struct interval_node_extent *ext) +{ + return (ext->start <= node->in_max_high && + ext->end >= interval_low(node)); +} + +/* + * This function finds all intervals that overlap interval ext, + * and calls func to handle resulted intervals one by one. + * in lustre, this function will find all conflicting locks in + * the granted queue and add these locks to the ast work list. + * + * { + * if (node == NULL) + * return 0; + * if (ext->end < interval_low(node)) { + * interval_search(node->in_left, ext, func, data); + * } else if (interval_may_overlap(node, ext)) { + * if (extent_overlapped(ext, &node->in_extent)) + * func(node, data); + * interval_search(node->in_left, ext, func, data); + * interval_search(node->in_right, ext, func, data); + * } + * return 0; + * } + * + */ +enum interval_iter interval_search(struct interval_node *node, + struct interval_node_extent *ext, + interval_callback_t func, + void *data) +{ + struct interval_node *parent; + enum interval_iter rc = INTERVAL_ITER_CONT; + + ENTRY; + + LASSERT(ext != NULL); + LASSERT(func != NULL); + + while (node) { + if (ext->end < interval_low(node)) { + if (node->in_left) { + node = node->in_left; + continue; + } + } else if (interval_may_overlap(node, ext)) { + if (extent_overlapped(ext, &node->in_extent)) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + if (node->in_left) { + node = node->in_left; + continue; + } + if (node->in_right) { + node = node->in_right; + continue; + } + } + + parent = node->in_parent; + while (parent) { + if (node_is_left_child(node) && + parent->in_right) { + /* If we ever got the left, it means that the + * parent met ext->endin_right; + break; + } + node = parent; + parent = parent->in_parent; + } + if (parent == NULL || !interval_may_overlap(parent, ext)) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_search); + +static enum interval_iter interval_overlap_cb(struct interval_node *n, + void *args) +{ + *(int *)args = 1; + return INTERVAL_ITER_STOP; +} + +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ext) +{ + int has = 0; + (void)interval_search(root, ext, interval_overlap_cb, &has); + return has; +} +EXPORT_SYMBOL(interval_is_overlapped); + +/* Don't expand to low. Expanding downwards is expensive, and meaningless to + * some extents, because programs seldom do IO backward. 
+ * + * The recursive algorithm of expanding low: + * expand_low { + * struct interval_node *tmp; + * static __u64 res = 0; + * + * if (root == NULL) + * return res; + * if (root->in_max_high < low) { + * res = max(root->in_max_high + 1, res); + * return res; + * } else if (low < interval_low(root)) { + * interval_expand_low(root->in_left, low); + * return res; + * } + * + * if (interval_high(root) < low) + * res = max(interval_high(root) + 1, res); + * interval_expand_low(root->in_left, low); + * interval_expand_low(root->in_right, low); + * + * return res; + * } + * + * It's much easy to eliminate the recursion, see interval_search for + * an example. -jay + */ +static inline __u64 interval_expand_low(struct interval_node *root, __u64 low) +{ + /* we only concern the empty tree right now. */ + if (root == NULL) + return 0; + return low; +} + +static inline __u64 interval_expand_high(struct interval_node *node, __u64 high) +{ + __u64 result = ~0; + + while (node != NULL) { + if (node->in_max_high < high) + break; + + if (interval_low(node) > high) { + result = interval_low(node) - 1; + node = node->in_left; + } else { + node = node->in_right; + } + } + + return result; +} + +/* expanding the extent based on @ext. */ +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter) +{ + /* The assertion of interval_is_overlapped is expensive because we may + * travel many nodes to find the overlapped node. + */ + LASSERT(interval_is_overlapped(root, ext) == 0); + if (!limiter || limiter->start < ext->start) + ext->start = interval_expand_low(root, ext->start); + if (!limiter || limiter->end > ext->end) + ext->end = interval_expand_high(root, ext->end); + LASSERT(interval_is_overlapped(root, ext) == 0); +} +EXPORT_SYMBOL(interval_expand); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/jobid.c b/drivers/staging/lustrefsx/lustre/obdclass/jobid.c new file mode 100644 index 0000000000000..207a88bcae3c7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/jobid.c @@ -0,0 +1,932 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2017 Cray Inc, all rights reserved. + * Author: Ben Evans. 
+ * + * Store PID->JobID mappings + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include + +#include +#include +#include +#include + +static struct cfs_hash *jobid_hash; +static struct cfs_hash_ops jobid_hash_ops; +spinlock_t jobid_hash_lock; + +#define RESCAN_INTERVAL 30 +#define DELETE_INTERVAL 300 + +char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; +char obd_jobid_name[LUSTRE_JOBID_SIZE] = "%e.%u"; + +/** + * Structure to store a single PID->JobID mapping + */ +struct jobid_pid_map { + struct hlist_node jp_hash; + time64_t jp_time; + spinlock_t jp_lock; /* protects jp_jobid */ + char jp_jobid[LUSTRE_JOBID_SIZE]; + unsigned int jp_joblen; + atomic_t jp_refcount; + pid_t jp_pid; +}; + +/* + * Jobid can be set for a session (see setsid(2)) by writing to + * a sysfs file from any process in that session. + * The jobids are stored in a hash table indexed by the relevant + * struct pid. We periodically look for entries where the pid has + * no PIDTYPE_SID tasks any more, and prune them. This happens within + * 5 seconds of a jobid being added, and every 5 minutes when jobids exist, + * but none are added. + */ +#define JOBID_EXPEDITED_CLEAN (5) +#define JOBID_BACKGROUND_CLEAN (5 * 60) + +struct session_jobid { + struct pid *sj_session; + struct rhash_head sj_linkage; + struct rcu_head sj_rcu; + char sj_jobid[1]; +}; + +static const struct rhashtable_params jobid_params = { + .key_len = sizeof(struct pid *), + .key_offset = offsetof(struct session_jobid, sj_session), + .head_offset = offsetof(struct session_jobid, sj_linkage), +}; + +static struct rhashtable session_jobids; + +/* + * jobid_current must be called with rcu_read_lock held. + * if it returns non-NULL, the string can only be used + * until rcu_read_unlock is called. + */ +char *jobid_current(void) +{ + struct pid *sid = task_session(current); + struct session_jobid *sj; + + sj = rhashtable_lookup_fast(&session_jobids, &sid, jobid_params); + if (sj) + return sj->sj_jobid; + return NULL; +} + +static void jobid_prune_expedite(void); +/* + * jobid_set_current will try to add a new entry + * to the table. 
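/*
 * Editor's illustrative sketch (not part of this patch): the per-session
 * jobid table above is a hash keyed by the session's struct pid, and setting
 * a jobid either inserts a new entry or replaces the existing one for that
 * key.  The userspace sketch below keeps the same insert-or-replace
 * semantics with a trivial linked list instead of an RCU rhashtable; every
 * name in it is invented for the example.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct session_entry {
    int session_id;             /* stands in for struct pid * */
    char jobid[64];
    struct session_entry *next;
};

static struct session_entry *sessions;

static int jobid_set(int session_id, const char *jobid)
{
    struct session_entry *e;

    for (e = sessions; e; e = e->next) {
        if (e->session_id == session_id) {      /* replace in place */
            snprintf(e->jobid, sizeof(e->jobid), "%s", jobid);
            return 0;
        }
    }
    e = calloc(1, sizeof(*e));
    if (!e)
        return -1;
    e->session_id = session_id;
    snprintf(e->jobid, sizeof(e->jobid), "%s", jobid);
    e->next = sessions;
    sessions = e;
    return 0;
}

static const char *jobid_lookup(int session_id)
{
    struct session_entry *e;

    for (e = sessions; e; e = e->next)
        if (e->session_id == session_id)
            return e->jobid;
    return NULL;
}

int main(void)
{
    jobid_set(42, "job.1");
    jobid_set(42, "job.2");     /* replaces the first mapping */
    printf("session 42 -> %s\n", jobid_lookup(42));
    return 0;
}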
If one exists with the same key, the + * jobid will be replaced + */ +int jobid_set_current(char *jobid) +{ + struct pid *sid; + struct session_jobid *sj, *origsj; + int ret; + int len = strlen(jobid); + + sj = kmalloc(sizeof(*sj) + len, GFP_KERNEL); + if (!sj) + return -ENOMEM; + rcu_read_lock(); + sid = task_session(current); + sj->sj_session = get_pid(sid); + strncpy(sj->sj_jobid, jobid, len+1); + origsj = rhashtable_lookup_get_insert_fast(&session_jobids, + &sj->sj_linkage, + jobid_params); + if (origsj == NULL) { + /* successful insert */ + rcu_read_unlock(); + jobid_prune_expedite(); + return 0; + } + + if (IS_ERR(origsj)) { + put_pid(sj->sj_session); + kfree(sj); + rcu_read_unlock(); + return PTR_ERR(origsj); + } + ret = rhashtable_replace_fast(&session_jobids, + &origsj->sj_linkage, + &sj->sj_linkage, + jobid_params); + if (ret) { + put_pid(sj->sj_session); + kfree(sj); + rcu_read_unlock(); + return ret; + } + put_pid(origsj->sj_session); + rcu_read_unlock(); + kfree_rcu(origsj, sj_rcu); + jobid_prune_expedite(); + + return 0; +} + +static void jobid_free(void *vsj, void *arg) +{ + struct session_jobid *sj = vsj; + + put_pid(sj->sj_session); + kfree(sj); +} + +static void jobid_prune(struct work_struct *work); +static DECLARE_DELAYED_WORK(jobid_prune_work, jobid_prune); +static int jobid_prune_expedited; +static void jobid_prune(struct work_struct *work) +{ + int remaining = 0; + struct rhashtable_iter iter; + struct session_jobid *sj; + + jobid_prune_expedited = 0; + rhashtable_walk_enter(&session_jobids, &iter); + rhashtable_walk_start(&iter); + while ((sj = rhashtable_walk_next(&iter)) != NULL) { + if (IS_ERR(sj)) { + if (PTR_ERR(sj) == -EAGAIN) + continue; + break; + } + if (!hlist_empty(&sj->sj_session->tasks[PIDTYPE_SID])) { + remaining++; + continue; + } + if (rhashtable_remove_fast(&session_jobids, + &sj->sj_linkage, + jobid_params) == 0) { + put_pid(sj->sj_session); + kfree_rcu(sj, sj_rcu); + } + } + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); + if (remaining) + schedule_delayed_work(&jobid_prune_work, + cfs_time_seconds(JOBID_BACKGROUND_CLEAN)); +} + +static void jobid_prune_expedite(void) +{ + if (!jobid_prune_expedited) { + jobid_prune_expedited = 1; + mod_delayed_work(system_wq, &jobid_prune_work, + cfs_time_seconds(JOBID_EXPEDITED_CLEAN)); + } +} + +static int cfs_access_process_vm(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long addr, + void *buf, int len, int write) +{ + /* Just copied from kernel for the kernels which doesn't + * have access_process_vm() exported + */ + struct vm_area_struct *vma; + struct page *page; + void *old_buf = buf; + + /* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(), + * which is already holding mmap_sem for writes. If some other + * thread gets the write lock in the meantime, this thread will + * block, but at least it won't deadlock on itself. LU-1735 + */ + if (!mmap_read_trylock(mm)) + return -EDEADLK; + + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, rc, offset; + void *maddr; + +#if defined(HAVE_GET_USER_PAGES_GUP_FLAGS) + rc = get_user_pages(addr, 1, write ? 
FOLL_WRITE : 0, &page, + &vma); +#elif defined(HAVE_GET_USER_PAGES_6ARG) + rc = get_user_pages(addr, 1, write, 1, &page, &vma); +#else + rc = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); +#endif + if (rc <= 0) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + put_page(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + mmap_read_unlock(mm); + + return buf - old_buf; +} + +/* Read the environment variable of current process specified by @key. */ +static int cfs_get_environ(const char *key, char *value, int *val_len) +{ + struct mm_struct *mm; + char *buffer; + int buf_len = PAGE_SIZE; + int key_len = strlen(key); + unsigned long addr; + int rc; + bool skip = false; + + ENTRY; + buffer = kmalloc(buf_len, GFP_USER); + if (!buffer) + RETURN(-ENOMEM); + + mm = get_task_mm(current); + if (!mm) { + kfree(buffer); + RETURN(-EINVAL); + } + + addr = mm->env_start; + while (addr < mm->env_end) { + int this_len, retval, scan_len; + char *env_start, *env_end; + + memset(buffer, 0, buf_len); + + this_len = min_t(int, mm->env_end - addr, buf_len); + retval = cfs_access_process_vm(current, mm, addr, buffer, + this_len, 0); + if (retval < 0) + GOTO(out, rc = retval); + else if (retval != this_len) + break; + + addr += retval; + + /* Parse the buffer to find out the specified key/value pair. + * The "key=value" entries are separated by '\0'. + */ + env_start = buffer; + scan_len = this_len; + while (scan_len) { + char *entry; + int entry_len; + + env_end = memscan(env_start, '\0', scan_len); + LASSERT(env_end >= env_start && + env_end <= env_start + scan_len); + + /* The last entry of this buffer cross the buffer + * boundary, reread it in next cycle. + */ + if (unlikely(env_end - env_start == scan_len)) { + /* Just skip the entry larger than page size, + * it can't be jobID env variable. + */ + if (unlikely(scan_len == this_len)) + skip = true; + else + addr -= scan_len; + break; + } else if (unlikely(skip)) { + skip = false; + goto skip; + } + entry = env_start; + entry_len = env_end - env_start; + CDEBUG(D_INFO, "key: %s, entry: %s\n", key, entry); + + /* Key length + length of '=' */ + if (entry_len > key_len + 1 && + entry[key_len] == '=' && + !memcmp(entry, key, key_len)) { + entry += key_len + 1; + entry_len -= key_len + 1; + + /* The 'value' buffer passed in is too small. + * Copy what fits, but return -EOVERFLOW. + */ + if (entry_len >= *val_len) { + memcpy(value, entry, *val_len); + value[*val_len - 1] = 0; + GOTO(out, rc = -EOVERFLOW); + } + + memcpy(value, entry, entry_len); + *val_len = entry_len; + GOTO(out, rc = 0); + } +skip: + scan_len -= (env_end - env_start + 1); + env_start = env_end + 1; + } + } + GOTO(out, rc = -ENOENT); + +out: + mmput(mm); + kfree((void *)buffer); + return rc; +} + +/* + * Get jobid of current process by reading the environment variable + * stored in between the "env_start" & "env_end" of task struct. + * + * If some job scheduler doesn't store jobid in the "env_start/end", + * then an upcall could be issued here to get the jobid by utilizing + * the userspace tools/API. Then, the jobid must be cached. 
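+ *
+ * The lookup scans the NUL-separated "key=value" strings between
+ * mm->env_start and mm->env_end of the current task, one page-sized
+ * chunk at a time (see cfs_get_environ() above). For example, if
+ * obd_jobid_var is "SLURM_JOB_ID" and the environment contains
+ * "SLURM_JOB_ID=12345", the returned jobid is "12345".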
+ */ +int jobid_get_from_environ(char *jobid_var, char *jobid, int *jobid_len) +{ + int rc; + + rc = cfs_get_environ(jobid_var, jobid, jobid_len); + if (!rc) + goto out; + + if (rc == -EOVERFLOW) { + /* For the PBS_JOBID and LOADL_STEP_ID keys (which are + * variable length strings instead of just numbers), it + * might make sense to keep the unique parts for JobID, + * instead of just returning an error. That means a + * larger temp buffer for cfs_get_environ(), then + * truncating the string at some separator to fit into + * the specified jobid_len. Fix later if needed. */ + static ktime_t printed; + + if (unlikely(ktime_to_ns(printed) == 0 || + ktime_after(ktime_get(), + ktime_add_ns(printed, + 3600ULL * 24 * NSEC_PER_SEC)))) { + LCONSOLE_WARN("jobid: '%s' value too large (%d)\n", + obd_jobid_var, *jobid_len); + printed = ktime_get(); + } + + rc = 0; + } else { + CDEBUG_LIMIT((rc == -ENOENT || rc == -EINVAL || + rc == -EDEADLK) ? D_INFO : D_ERROR, + "jobid: get '%s' failed: rc = %d\n", + obd_jobid_var, rc); + } + +out: + return rc; +} + +/* + * jobid_should_free_item + * + * Each item is checked to see if it should be released + * Removed from hash table by caller + * Actually freed in jobid_put_locked + * + * Returns 1 if item is to be freed, 0 if it is to be kept + */ + +static int jobid_should_free_item(void *obj, void *data) +{ + char *jobid = data; + struct jobid_pid_map *pidmap = obj; + int rc = 0; + + if (obj == NULL) + return 0; + + if (jobid == NULL) { + WARN_ON_ONCE(atomic_read(&pidmap->jp_refcount) != 1); + return 1; + } + + spin_lock(&pidmap->jp_lock); + /* prevent newly inserted items from deleting */ + if (jobid[0] == '\0' && atomic_read(&pidmap->jp_refcount) == 1) + rc = 1; + else if (ktime_get_real_seconds() - pidmap->jp_time > DELETE_INTERVAL) + rc = 1; + else if (strcmp(pidmap->jp_jobid, jobid) == 0) + rc = 1; + spin_unlock(&pidmap->jp_lock); + + return rc; +} + +/* + * jobid_name_is_valid + * + * Checks if the jobid is a Lustre process + * + * Returns true if jobid is valid + * Returns false if jobid looks like it's a Lustre process + */ +static bool jobid_name_is_valid(char *jobid) +{ + const char *const lustre_reserved[] = { "ll_ping", "ptlrpc", + "ldlm", "ll_sa", NULL }; + int i; + + if (jobid[0] == '\0') + return false; + + for (i = 0; lustre_reserved[i] != NULL; i++) { + if (strncmp(jobid, lustre_reserved[i], + strlen(lustre_reserved[i])) == 0) + return false; + } + return true; +} + +/* + * jobid_get_from_cache() + * + * Returns contents of jobid_var from process environment for current PID, + * or from the per-session jobid table. + * Values fetch from process environment will be cached for some time to avoid + * the overhead of scanning the environment. 
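+ * A cached value is refreshed once it is older than RESCAN_INTERVAL
+ * (30 seconds), and stale PID entries are pruned from the hash after
+ * DELETE_INTERVAL (300 seconds).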
+ * + * Return: -ENOMEM if allocating a new pidmap fails + * -ENOENT if no entry could be found + * +ve string length for success (something was returned in jobid) + */ +static int jobid_get_from_cache(char *jobid, size_t joblen) +{ + static time64_t last_expire; + bool expire_cache = false; + pid_t pid = current->pid; + struct jobid_pid_map *pidmap = NULL; + time64_t now = ktime_get_real_seconds(); + int rc = 0; + ENTRY; + + if (strcmp(obd_jobid_var, JOBSTATS_SESSION) == 0) { + char *jid; + + rcu_read_lock(); + jid = jobid_current(); + if (jid) { + strlcpy(jobid, jid, joblen); + joblen = strlen(jobid); + } else { + rc = -ENOENT; + } + rcu_read_unlock(); + GOTO(out, rc); + } + + LASSERT(jobid_hash != NULL); + + /* scan hash periodically to remove old PID entries from cache */ + spin_lock(&jobid_hash_lock); + if (unlikely(last_expire + DELETE_INTERVAL <= now)) { + expire_cache = true; + last_expire = now; + } + spin_unlock(&jobid_hash_lock); + + if (expire_cache) + cfs_hash_cond_del(jobid_hash, jobid_should_free_item, + "intentionally_bad_jobid"); + + /* first try to find PID in the hash and use that value */ + pidmap = cfs_hash_lookup(jobid_hash, &pid); + if (pidmap == NULL) { + struct jobid_pid_map *pidmap2; + + OBD_ALLOC_PTR(pidmap); + if (pidmap == NULL) + GOTO(out, rc = -ENOMEM); + + pidmap->jp_pid = pid; + pidmap->jp_time = 0; + pidmap->jp_jobid[0] = '\0'; + spin_lock_init(&pidmap->jp_lock); + INIT_HLIST_NODE(&pidmap->jp_hash); + /* + * @pidmap might be reclaimed just after it is added into + * hash list, init @jp_refcount as 1 to make sure memory + * could be not freed during access. + */ + atomic_set(&pidmap->jp_refcount, 1); + + /* + * Add the newly created map to the hash, on key collision we + * lost a racing addition and must destroy our newly allocated + * map. The object which exists in the hash will be returned. + */ + pidmap2 = cfs_hash_findadd_unique(jobid_hash, &pid, + &pidmap->jp_hash); + if (unlikely(pidmap != pidmap2)) { + CDEBUG(D_INFO, "jobid: duplicate found for PID=%u\n", + pid); + OBD_FREE_PTR(pidmap); + pidmap = pidmap2; + } + } + + /* + * If pidmap is old (this is always true for new entries) refresh it. + * If obd_jobid_var is not found, cache empty entry and try again + * later, to avoid repeat lookups for PID if obd_jobid_var missing. + */ + spin_lock(&pidmap->jp_lock); + if (pidmap->jp_time + RESCAN_INTERVAL <= now) { + char env_jobid[LUSTRE_JOBID_SIZE] = ""; + int env_len = sizeof(env_jobid); + + pidmap->jp_time = now; + + spin_unlock(&pidmap->jp_lock); + rc = jobid_get_from_environ(obd_jobid_var, env_jobid, &env_len); + + CDEBUG(D_INFO, "jobid: PID mapping established: %d->%s\n", + pidmap->jp_pid, env_jobid); + spin_lock(&pidmap->jp_lock); + if (!rc) { + pidmap->jp_joblen = env_len; + strlcpy(pidmap->jp_jobid, env_jobid, + sizeof(pidmap->jp_jobid)); + rc = 0; + } else if (rc == -ENOENT) { + /* It might have been deleted, clear out old entry */ + pidmap->jp_joblen = 0; + pidmap->jp_jobid[0] = '\0'; + } + } + + /* + * Regardless of how pidmap was found, if it contains a valid entry + * use that for now. If there was a technical error (e.g. -ENOMEM) + * use the old cached value until it can be looked up again properly. + * If a cached missing entry was found, return -ENOENT. + */ + if (pidmap->jp_joblen) { + strlcpy(jobid, pidmap->jp_jobid, joblen); + joblen = pidmap->jp_joblen; + rc = 0; + } else if (!rc) { + rc = -ENOENT; + } + spin_unlock(&pidmap->jp_lock); + + cfs_hash_put(jobid_hash, &pidmap->jp_hash); + + EXIT; +out: + return rc < 0 ? 
rc : joblen; +} + +/* + * jobid_interpret_string() + * + * Interpret the jobfmt string to expand specified fields, like coredumps do: + * %e = executable + * %g = gid + * %h = hostname + * %H = short hostname + * %j = jobid from environment + * %p = pid + * %u = uid + * + * Unknown escape strings are dropped. Other characters are copied through, + * excluding whitespace (to avoid making jobid parsing difficult). + * + * Return: -EOVERFLOW if the expanded string does not fit within @joblen + * 0 for success + */ +static int jobid_interpret_string(const char *jobfmt, char *jobid, + ssize_t joblen) +{ + char c; + + while ((c = *jobfmt++) && joblen > 1) { + char f, *p; + int l; + + if (isspace(c)) /* Don't allow embedded spaces */ + continue; + + if (c != '%') { + *jobid = c; + joblen--; + jobid++; + *jobid = '\0'; + continue; + } + + switch ((f = *jobfmt++)) { + case 'e': /* executable name */ + l = snprintf(jobid, joblen, "%s", current->comm); + break; + case 'g': /* group ID */ + l = snprintf(jobid, joblen, "%u", + from_kgid(&init_user_ns, current_fsgid())); + break; + case 'h': /* hostname */ + l = snprintf(jobid, joblen, "%s", + init_utsname()->nodename); + break; + case 'H': /* short hostname. Cut at first dot */ + l = snprintf(jobid, joblen, "%s", + init_utsname()->nodename); + p = strnchr(jobid, joblen, '.'); + if (p) { + *p = '\0'; + l = p - jobid; + } + break; + case 'j': /* jobid stored in process environment */ + l = jobid_get_from_cache(jobid, joblen); + if (l < 0) + l = 0; + break; + case 'p': /* process ID */ + l = snprintf(jobid, joblen, "%u", current->pid); + break; + case 'u': /* user ID */ + l = snprintf(jobid, joblen, "%u", + from_kuid(&init_user_ns, current_fsuid())); + break; + case '\0': /* '%' at end of format string */ + l = 0; + goto out; + default: /* drop unknown %x format strings */ + l = 0; + break; + } + jobid += l; + joblen -= l; + } + /* + * This points at the end of the buffer, so long as jobid is always + * incremented the same amount as joblen is decremented. + */ +out: + jobid[joblen - 1] = '\0'; + + return joblen < 0 ? 
-EOVERFLOW : 0; +} + +/* + * Hash initialization, copied from server-side job stats bucket sizes + */ +#define HASH_JOBID_BKT_BITS 5 +#define HASH_JOBID_CUR_BITS 7 +#define HASH_JOBID_MAX_BITS 12 + +int jobid_cache_init(void) +{ + int rc = 0; + ENTRY; + + if (jobid_hash) + return 0; + + spin_lock_init(&jobid_hash_lock); + jobid_hash = cfs_hash_create("JOBID_HASH", HASH_JOBID_CUR_BITS, + HASH_JOBID_MAX_BITS, HASH_JOBID_BKT_BITS, + 0, CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &jobid_hash_ops, CFS_HASH_DEFAULT); + if (!jobid_hash) { + rc = -ENOMEM; + } else { + rc = rhashtable_init(&session_jobids, &jobid_params); + if (rc) { + cfs_hash_putref(jobid_hash); + jobid_hash = NULL; + } + } + + RETURN(rc); +} +EXPORT_SYMBOL(jobid_cache_init); + +void jobid_cache_fini(void) +{ + struct cfs_hash *tmp_hash; + ENTRY; + + spin_lock(&jobid_hash_lock); + tmp_hash = jobid_hash; + jobid_hash = NULL; + spin_unlock(&jobid_hash_lock); + + cancel_delayed_work_sync(&jobid_prune_work); + + if (tmp_hash != NULL) { + cfs_hash_cond_del(tmp_hash, jobid_should_free_item, NULL); + cfs_hash_putref(tmp_hash); + + rhashtable_free_and_destroy(&session_jobids, jobid_free, NULL); + } + + + EXIT; +} +EXPORT_SYMBOL(jobid_cache_fini); + +/* + * Hash operations for pid<->jobid + */ +static unsigned jobid_hashfn(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(pid_t), mask); +} + +static void *jobid_key(struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + return &pidmap->jp_pid; +} + +static int jobid_keycmp(const void *key, struct hlist_node *hnode) +{ + const pid_t *pid_key1; + const pid_t *pid_key2; + + LASSERT(key != NULL); + pid_key1 = (pid_t *)key; + pid_key2 = (pid_t *)jobid_key(hnode); + + return *pid_key1 == *pid_key2; +} + +static void *jobid_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct jobid_pid_map, jp_hash); +} + +static void jobid_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + + atomic_inc(&pidmap->jp_refcount); +} + +static void jobid_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + if (hnode == NULL) + return; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + LASSERT(atomic_read(&pidmap->jp_refcount) > 0); + if (atomic_dec_and_test(&pidmap->jp_refcount)) { + CDEBUG(D_INFO, "Freeing: %d->%s\n", + pidmap->jp_pid, pidmap->jp_jobid); + + OBD_FREE_PTR(pidmap); + } +} + +static struct cfs_hash_ops jobid_hash_ops = { + .hs_hash = jobid_hashfn, + .hs_keycmp = jobid_keycmp, + .hs_key = jobid_key, + .hs_object = jobid_object, + .hs_get = jobid_get, + .hs_put = jobid_put_locked, + .hs_put_locked = jobid_put_locked, +}; + +/** + * Generate the job identifier string for this process for tracking purposes. + * + * Fill in @jobid string based on the value of obd_jobid_var: + * JOBSTATS_DISABLE: none + * JOBSTATS_NODELOCAL: content of obd_jobid_name (jobid_interpret_string()) + * JOBSTATS_PROCNAME_UID: process name/UID + * JOBSTATS_SESSION per-session value set by + * /sys/fs/lustre/jobid_this_session + * anything else: look up obd_jobid_var in the processes environment + * + * Return -ve error number, 0 on success. 
+ */ +int lustre_get_jobid(char *jobid, size_t joblen) +{ + int rc = 0; + ENTRY; + + if (unlikely(joblen < 2)) { + if (joblen == 1) + jobid[0] = '\0'; + RETURN(-EINVAL); + } + + if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) { + /* Jobstats isn't enabled */ + memset(jobid, 0, joblen); + } else if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { + /* Whole node dedicated to single job */ + rc = jobid_interpret_string(obd_jobid_name, jobid, joblen); + } else if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { + rc = jobid_interpret_string("%e.%u", jobid, joblen); + } else if (strcmp(obd_jobid_var, JOBSTATS_SESSION) == 0 || + jobid_name_is_valid(current->comm)) { + /* + * per-process jobid wanted, either from environment or from + * per-session setting. + * If obd_jobid_name contains "%j" or if getting the per-process + * jobid directly fails, fall back to using obd_jobid_name. + */ + rc = -EAGAIN; + if (!strnstr(obd_jobid_name, "%j", joblen)) + rc = jobid_get_from_cache(jobid, joblen); + + /* fall back to jobid_name if jobid_var not available */ + if (rc < 0) { + int rc2 = jobid_interpret_string(obd_jobid_name, + jobid, joblen); + if (!rc2) + rc = 0; + } + } + + RETURN(rc); +} +EXPORT_SYMBOL(lustre_get_jobid); + +/* + * lustre_jobid_clear + * + * Search cache for JobID given by @find_jobid. + * If any entries in the hash table match the value, they are removed + */ +void lustre_jobid_clear(const char *find_jobid) +{ + char jobid[LUSTRE_JOBID_SIZE]; + char *end; + + if (jobid_hash == NULL) + return; + + strlcpy(jobid, find_jobid, sizeof(jobid)); + /* trim \n off the end of the incoming jobid */ + end = strchr(jobid, '\n'); + if (end && *end == '\n') + *end = '\0'; + + CDEBUG(D_INFO, "Clearing Jobid: %s\n", jobid); + cfs_hash_cond_del(jobid_hash, jobid_should_free_item, jobid); + + CDEBUG(D_INFO, "%d items remain in jobID table\n", + atomic_read(&jobid_hash->hs_count)); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c b/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c new file mode 100644 index 0000000000000..7afb9484a8a69 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c @@ -0,0 +1,262 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * Using pipes for all arches. 
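+ *
+ * Receivers register a pipe file descriptor in a broadcast group; the
+ * kernel then delivers messages framed by struct kuc_hdr by writing
+ * them to that pipe (see libcfs_kkuc_group_add() and
+ * libcfs_kkuc_msg_put() below).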
+ */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include + +#include +#include + +/** + * libcfs_kkuc_msg_put - send an message from kernel to userspace + * @param fp to send the message to + * @param payload Payload data. First field of payload is always + * struct kuc_hdr + */ +int libcfs_kkuc_msg_put(struct file *filp, void *payload) +{ + struct kuc_hdr *kuch = (struct kuc_hdr *)payload; + ssize_t count = kuch->kuc_msglen; + loff_t offset = 0; + int rc = 0; + + if (IS_ERR_OR_NULL(filp)) + return -EBADF; + + if (kuch->kuc_magic != KUC_MAGIC) { + CERROR("KernelComm: bad magic %x\n", kuch->kuc_magic); + return -ENOSYS; + } + + while (count > 0) { + rc = cfs_kernel_write(filp, payload, count, &offset); + if (rc < 0) + break; + count -= rc; + payload += rc; + rc = 0; + } + + if (rc < 0) + CWARN("message send failed (%d)\n", rc); + else + CDEBUG(D_HSM, "Sent message rc=%d, fp=%p\n", rc, filp); + + return rc; +} +EXPORT_SYMBOL(libcfs_kkuc_msg_put); + +/* Broadcast groups are global across all mounted filesystems; + * i.e. registering for a group on 1 fs will get messages for that + * group from any fs */ +/** A single group registration has a uid and a file pointer */ +struct kkuc_reg { + struct list_head kr_chain; + struct obd_uuid kr_uuid; + int kr_uid; + struct file *kr_fp; + char kr_data[0]; +}; + +static struct list_head kkuc_groups[KUC_GRP_MAX + 1]; +/* Protect message sending against remove and adds */ +static DECLARE_RWSEM(kg_sem); + +static inline bool libcfs_kkuc_group_is_valid(int group) +{ + return 0 <= group && group < ARRAY_SIZE(kkuc_groups); +} + +void libcfs_kkuc_init(void) +{ + int group; + + for (group = 0; group < ARRAY_SIZE(kkuc_groups); group++) + INIT_LIST_HEAD(&kkuc_groups[group]); +} + +/** Add a receiver to a broadcast group + * @param filp pipe to write into + * @param uid identifier for this receiver + * @param group group number + * @param data user data + */ +int libcfs_kkuc_group_add(struct file *filp, const struct obd_uuid *uuid, + int uid, int group, void *data, size_t data_len) +{ + struct kkuc_reg *reg; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + /* fput in group_rem */ + if (filp == NULL) + return -EBADF; + + /* freed in group_rem */ + reg = kzalloc(sizeof(*reg) + data_len, 0); + if (reg == NULL) + return -ENOMEM; + + reg->kr_uuid = *uuid; + reg->kr_fp = filp; + reg->kr_uid = uid; + memcpy(reg->kr_data, data, data_len); + + down_write(&kg_sem); + list_add(®->kr_chain, &kkuc_groups[group]); + up_write(&kg_sem); + + CDEBUG(D_HSM, "Added uid=%d fp=%p to group %d\n", uid, filp, group); + + return 0; +} +EXPORT_SYMBOL(libcfs_kkuc_group_add); + +int libcfs_kkuc_group_rem(const struct obd_uuid *uuid, int uid, int group) +{ + struct kkuc_reg *reg, *next; + ENTRY; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + if (uid == 0) { + /* Broadcast a shutdown message */ + struct kuc_hdr lh; + + lh.kuc_magic = KUC_MAGIC; + lh.kuc_transport = KUC_TRANSPORT_GENERIC; + lh.kuc_msgtype = KUC_MSG_SHUTDOWN; + lh.kuc_msglen = sizeof(lh); + libcfs_kkuc_group_put(uuid, group, &lh); + } + + down_write(&kg_sem); + list_for_each_entry_safe(reg, next, &kkuc_groups[group], kr_chain) { + if (obd_uuid_equals(uuid, ®->kr_uuid) && + (uid == 0 || uid == reg->kr_uid)) { + list_del(®->kr_chain); + CDEBUG(D_HSM, "Removed uid=%d fp=%p from group %d\n", + reg->kr_uid, reg->kr_fp, group); + if (reg->kr_fp != NULL) + fput(reg->kr_fp); + 
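/* free the registration record allocated in libcfs_kkuc_group_add() */
+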
kfree(reg); + } + } + up_write(&kg_sem); + + RETURN(0); +} +EXPORT_SYMBOL(libcfs_kkuc_group_rem); + +int libcfs_kkuc_group_put(const struct obd_uuid *uuid, int group, void *payload) +{ + struct kkuc_reg *reg; + int rc = 0; + int one_success = 0; + ENTRY; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + down_write(&kg_sem); + + if (unlikely(list_empty(&kkuc_groups[group])) || + unlikely(OBD_FAIL_CHECK(OBD_FAIL_MDS_HSM_CT_REGISTER_NET))) { + /* no agent have fully registered, CDT will retry */ + up_write(&kg_sem); + RETURN(-EAGAIN); + } + + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { + if (obd_uuid_equals(uuid, ®->kr_uuid) && + reg->kr_fp != NULL) { + rc = libcfs_kkuc_msg_put(reg->kr_fp, payload); + if (rc == 0) + one_success = 1; + else if (rc == -EPIPE) { + fput(reg->kr_fp); + reg->kr_fp = NULL; + } + } + } + up_write(&kg_sem); + + /* don't return an error if the message has been delivered + * at least to one agent */ + if (one_success) + rc = 0; + + RETURN(rc); +} +EXPORT_SYMBOL(libcfs_kkuc_group_put); + +/** + * Calls a callback function for each link of the given kuc group. + * @param group the group to call the function on. + * @param cb_func the function to be called. + * @param cb_arg extra argument to be passed to the callback function. + */ +int libcfs_kkuc_group_foreach(const struct obd_uuid *uuid, int group, + libcfs_kkuc_cb_t cb_func, void *cb_arg) +{ + struct kkuc_reg *reg; + int rc = 0; + ENTRY; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + RETURN(-EINVAL); + } + + down_read(&kg_sem); + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { + if (obd_uuid_equals(uuid, ®->kr_uuid) && reg->kr_fp != NULL) + rc = cb_func(reg->kr_data, cb_arg); + } + up_read(&kg_sem); + + RETURN(rc); +} +EXPORT_SYMBOL(libcfs_kkuc_group_foreach); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linkea.c b/drivers/staging/lustrefsx/lustre/obdclass/linkea.c new file mode 100644 index 0000000000000..2ea560fdc125d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/linkea.c @@ -0,0 +1,330 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + * Use is subject to license terms. 
+ * + * Author: Di Wang + */ + +#include +#include + +int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf) +{ + ldata->ld_buf = lu_buf_check_and_alloc(buf, PAGE_SIZE); + if (ldata->ld_buf->lb_buf == NULL) + return -ENOMEM; + ldata->ld_leh = ldata->ld_buf->lb_buf; + ldata->ld_leh->leh_magic = LINK_EA_MAGIC; + ldata->ld_leh->leh_reccount = 0; + ldata->ld_leh->leh_len = sizeof(struct link_ea_header); + ldata->ld_leh->leh_overflow_time = 0; + ldata->ld_leh->leh_padding = 0; + return 0; +} +EXPORT_SYMBOL(linkea_data_new); + +int linkea_init(struct linkea_data *ldata) +{ + struct link_ea_header *leh; + + LASSERT(ldata->ld_buf != NULL); + leh = ldata->ld_buf->lb_buf; + if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) { + leh->leh_magic = LINK_EA_MAGIC; + leh->leh_reccount = __swab32(leh->leh_reccount); + leh->leh_len = __swab64(leh->leh_len); + leh->leh_overflow_time = __swab32(leh->leh_overflow_time); + leh->leh_padding = __swab32(leh->leh_padding); + /* individual entries are swabbed by linkea_entry_unpack() */ + } + + if (leh->leh_magic != LINK_EA_MAGIC) + return -EINVAL; + + if (leh->leh_reccount == 0 && leh->leh_overflow_time == 0) + return -ENODATA; + + ldata->ld_leh = leh; + return 0; +} +EXPORT_SYMBOL(linkea_init); + +int linkea_init_with_rec(struct linkea_data *ldata) +{ + int rc; + + rc = linkea_init(ldata); + if (!rc && ldata->ld_leh->leh_reccount == 0) + rc = -ENODATA; + + return rc; +} +EXPORT_SYMBOL(linkea_init_with_rec); + +/** + * Pack a link_ea_entry. + * All elements are stored as chars to avoid alignment issues. + * Numbers are always big-endian + * \retval record length + */ +int linkea_entry_pack(struct link_ea_entry *lee, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_fid tmpfid; + int reclen; + + tmpfid = *pfid; + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_MUL_REF)) + tmpfid.f_oid--; + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LINKEA_CRASH)) + tmpfid.f_ver = ~0; + fid_cpu_to_be(&tmpfid, &tmpfid); + memcpy(&lee->lee_parent_fid, &tmpfid, sizeof(tmpfid)); + memcpy(lee->lee_name, lname->ln_name, lname->ln_namelen); + reclen = sizeof(struct link_ea_entry) + lname->ln_namelen; + + lee->lee_reclen[0] = (reclen >> 8) & 0xff; + lee->lee_reclen[1] = reclen & 0xff; + return reclen; +} +EXPORT_SYMBOL(linkea_entry_pack); + +void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, + struct lu_name *lname, struct lu_fid *pfid) +{ + LASSERT(lee != NULL); + + *reclen = (lee->lee_reclen[0] << 8) | lee->lee_reclen[1]; + memcpy(pfid, &lee->lee_parent_fid, sizeof(*pfid)); + fid_be_to_cpu(pfid, pfid); + if (lname != NULL) { + lname->ln_name = lee->lee_name; + lname->ln_namelen = *reclen - sizeof(struct link_ea_entry); + } +} +EXPORT_SYMBOL(linkea_entry_unpack); + +bool linkea_will_overflow(struct linkea_data *ldata, + const struct lu_name *lname) +{ + struct link_ea_header *leh = ldata->ld_leh; + int reclen = lname->ln_namelen + sizeof(struct link_ea_entry); + + if (unlikely(leh->leh_len + reclen > MAX_LINKEA_SIZE)) + return true; + return false; +} +EXPORT_SYMBOL(linkea_will_overflow); + +/** + * Add a record to the end of link ea buf + **/ +int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid, bool err_on_overflow) +{ + struct link_ea_header *leh = ldata->ld_leh; + int reclen; + + LASSERT(leh != NULL); + + if (lname == NULL || pfid == NULL) + return -EINVAL; + + reclen = lname->ln_namelen + sizeof(struct link_ea_entry); + if (unlikely(leh->leh_len + reclen > MAX_LINKEA_SIZE)) { + /* Use 32-bits to save the 
overflow time, although it will + * shrink the ktime_get_real_seconds() returned 64-bits value + * to 32-bits value, it is still quite large and can be used + * for about 140 years. That is enough. + */ + leh->leh_overflow_time = ktime_get_real_seconds(); + if (unlikely(leh->leh_overflow_time == 0)) + leh->leh_overflow_time++; + + CDEBUG(D_INODE, "No enough space to hold linkea entry '" + DFID": %.*s' at %u\n", PFID(pfid), lname->ln_namelen, + lname->ln_name, leh->leh_overflow_time); + return err_on_overflow ? -EOVERFLOW : 0; + } + + if (leh->leh_len + reclen > ldata->ld_buf->lb_len) { + if (lu_buf_check_and_grow(ldata->ld_buf, + leh->leh_len + reclen) < 0) + return -ENOMEM; + + leh = ldata->ld_leh = ldata->ld_buf->lb_buf; + } + + ldata->ld_lee = ldata->ld_buf->lb_buf + leh->leh_len; + ldata->ld_reclen = linkea_entry_pack(ldata->ld_lee, lname, pfid); + leh->leh_len += ldata->ld_reclen; + leh->leh_reccount++; + if (err_on_overflow) + CDEBUG(D_INODE, + "New link_ea name '"DFID":' is added\n", + PFID(pfid), lname->ln_namelen); + else + CDEBUG(D_INODE, "New link_ea name '"DFID":%.*s' is added\n", + PFID(pfid), lname->ln_namelen, lname->ln_name); + return 0; +} +EXPORT_SYMBOL(linkea_add_buf); + +/** Del the current record from the link ea buf */ +void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname, + bool is_encrypted) +{ + LASSERT(ldata->ld_leh != NULL && ldata->ld_lee != NULL); + LASSERT(ldata->ld_leh->leh_reccount > 0); + + ldata->ld_leh->leh_reccount--; + ldata->ld_leh->leh_len -= ldata->ld_reclen; + memmove(ldata->ld_lee, (char *)ldata->ld_lee + ldata->ld_reclen, + (char *)ldata->ld_leh + ldata->ld_leh->leh_len - + (char *)ldata->ld_lee); + if (is_encrypted) + CDEBUG(D_INODE, + "Old link_ea name '' is removed\n", + lname->ln_namelen); + else + CDEBUG(D_INODE, "Old link_ea name '%.*s' is removed\n", + lname->ln_namelen, lname->ln_name); + + if ((char *)ldata->ld_lee >= ((char *)ldata->ld_leh + + ldata->ld_leh->leh_len)) + ldata->ld_lee = NULL; +} +EXPORT_SYMBOL(linkea_del_buf); + +int linkea_links_new(struct linkea_data *ldata, struct lu_buf *buf, + const struct lu_name *cname, const struct lu_fid *pfid) +{ + int rc; + + rc = linkea_data_new(ldata, buf); + if (!rc) + rc = linkea_add_buf(ldata, cname, pfid, false); + + return rc; +} +EXPORT_SYMBOL(linkea_links_new); + +/** + * Mark the linkEA as overflow with current timestamp, + * and remove the last linkEA entry. + * + * Return the new linkEA size. 
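+ *
+ * The last entry is dropped (leh_reccount is reduced by one),
+ * leh_overflow_time is refreshed, and leh_len is recomputed by
+ * walking the remaining packed entries.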
+ */ +int linkea_overflow_shrink(struct linkea_data *ldata) +{ + struct link_ea_header *leh; + struct lu_name tname; + struct lu_fid tfid; + int count; + + leh = ldata->ld_leh = ldata->ld_buf->lb_buf; + if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) { + leh->leh_magic = LINK_EA_MAGIC; + leh->leh_reccount = __swab32(leh->leh_reccount); + leh->leh_overflow_time = __swab32(leh->leh_overflow_time); + leh->leh_padding = __swab32(leh->leh_padding); + } + + LASSERT(leh->leh_reccount > 0); + + leh->leh_len = sizeof(struct link_ea_header); + leh->leh_reccount--; + if (unlikely(leh->leh_reccount == 0)) + return 0; + + leh->leh_overflow_time = ktime_get_real_seconds(); + if (unlikely(leh->leh_overflow_time == 0)) + leh->leh_overflow_time++; + ldata->ld_reclen = 0; + ldata->ld_lee = (struct link_ea_entry *)(leh + 1); + for (count = 0; count < leh->leh_reccount; count++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, + &tname, &tfid); + leh->leh_len += ldata->ld_reclen; + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + } + + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, &tname, &tfid); + CDEBUG(D_INODE, "No enough space to hold the last linkea entry '" + DFID": %.*s', shrink it, left %d linkea entries, size %llu\n", + PFID(&tfid), tname.ln_namelen, tname.ln_name, + leh->leh_reccount, leh->leh_len); + + return leh->leh_len; +} +EXPORT_SYMBOL(linkea_overflow_shrink); + +/** + * Check if such a link exists in linkEA. + * + * \param ldata link data the search to be done on + * \param lname name in the parent's directory entry pointing to this object + * \param pfid parent fid the link to be found for + * + * \retval 0 success + * \retval -ENOENT link does not exist + * \retval -ve on error + */ +int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_name tmpname; + struct lu_fid tmpfid; + int count; + + LASSERT(ldata->ld_leh != NULL); + + /* link #0, if leh_reccount == 0 we skip the loop and return -ENOENT */ + if (likely(ldata->ld_leh->leh_reccount > 0)) + ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1); + + for (count = 0; count < ldata->ld_leh->leh_reccount; count++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, + &tmpname, &tmpfid); + if (tmpname.ln_namelen == lname->ln_namelen && + lu_fid_eq(&tmpfid, pfid) && + (strncmp(tmpname.ln_name, lname->ln_name, + tmpname.ln_namelen) == 0)) + break; + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + } + + if (count == ldata->ld_leh->leh_reccount) { + CDEBUG(D_INODE, "Old link_ea name '%.*s' not found\n", + lname->ln_namelen, lname->ln_name); + ldata->ld_lee = NULL; + ldata->ld_reclen = 0; + return -ENOENT; + } + return 0; +} +EXPORT_SYMBOL(linkea_links_find); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog.c b/drivers/staging/lustrefsx/lustre/obdclass/llog.c new file mode 100644 index 0000000000000..2c45c9673ae84 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog.c @@ -0,0 +1,1539 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/llog.c + * + * OST<->MDS recovery logging infrastructure. + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + * Author: Alex Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include +#include +#include +#include +#include +#include "llog_internal.h" + +/* + * Allocate a new log or catalog handle + * Used inside llog_open(). + */ +static struct llog_handle *llog_alloc_handle(void) +{ + struct llog_handle *loghandle; + + OBD_ALLOC_PTR(loghandle); + if (loghandle == NULL) + return NULL; + + init_rwsem(&loghandle->lgh_lock); + mutex_init(&loghandle->lgh_hdr_mutex); + init_rwsem(&loghandle->lgh_last_sem); + INIT_LIST_HEAD(&loghandle->u.phd.phd_entry); + refcount_set(&loghandle->lgh_refcount, 1); + + return loghandle; +} + +/* + * Free llog handle and header data if exists. 
Used in llog_close() only + */ +static void llog_free_handle(struct llog_handle *loghandle) +{ + LASSERT(loghandle != NULL); + + /* failed llog_init_handle */ + if (loghandle->lgh_hdr == NULL) + goto out; + + if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) + LASSERT(list_empty(&loghandle->u.phd.phd_entry)); + else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + LASSERT(list_empty(&loghandle->u.chd.chd_head)); + OBD_FREE_LARGE(loghandle->lgh_hdr, loghandle->lgh_hdr_size); +out: + OBD_FREE_PTR(loghandle); +} + +struct llog_handle *llog_handle_get(struct llog_handle *loghandle) +{ + if (refcount_inc_not_zero(&loghandle->lgh_refcount)) + return loghandle; + return NULL; +} + +int llog_handle_put(const struct lu_env *env, struct llog_handle *loghandle) +{ + int rc = 0; + + if (refcount_dec_and_test(&loghandle->lgh_refcount)) { + const struct llog_operations *lop; + + rc = llog_handle2ops(loghandle, &lop); + if (!rc) { + if (lop->lop_close) + rc = lop->lop_close(env, loghandle); + else + rc = -EOPNOTSUPP; + } + llog_free_handle(loghandle); + } + return rc; +} + +static int llog_declare_destroy(const struct lu_env *env, + struct llog_handle *handle, + struct thandle *th) +{ + const struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_declare_destroy == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_declare_destroy(env, handle, th); + + RETURN(rc); +} + +int llog_trans_destroy(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th) +{ + const struct llog_operations *lop; + int rc; + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc < 0) + RETURN(rc); + if (lop->lop_destroy == NULL) + RETURN(-EOPNOTSUPP); + + LASSERT(handle->lgh_obj != NULL); + if (!llog_exist(handle)) + RETURN(0); + + rc = lop->lop_destroy(env, handle, th); + + RETURN(rc); +} + +int llog_destroy(const struct lu_env *env, struct llog_handle *handle) +{ + const struct llog_operations *lop; + struct dt_device *dt; + struct thandle *th; + int rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc < 0) + RETURN(rc); + if (lop->lop_destroy == NULL) + RETURN(-EOPNOTSUPP); + + if (handle->lgh_obj == NULL) { + /* if lgh_obj == NULL, then it is from client side destroy */ + rc = lop->lop_destroy(env, handle, NULL); + RETURN(rc); + } + + if (!llog_exist(handle)) + RETURN(0); + + dt = lu2dt_dev(handle->lgh_obj->do_lu.lo_dev); + + if (unlikely(unlikely(dt->dd_rdonly))) + RETURN(-EROFS); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_declare_destroy(env, handle, th); + if (rc != 0) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc < 0) + GOTO(out_trans, rc); + + rc = lop->lop_destroy(env, handle, th); + +out_trans: + dt_trans_stop(env, dt, th); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_destroy); + +/* returns negative on error; 0 if success; 1 if success & log destroyed */ +int llog_cancel_arr_rec(const struct lu_env *env, struct llog_handle *loghandle, + int num, int *index) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_device *dt; + struct llog_log_hdr *llh; + struct thandle *th; + __u32 tmp_lgc_index; + int rc, i = 0; + int rc1; + bool subtract_count = false; + + ENTRY; + + LASSERT(loghandle != NULL); + LASSERT(loghandle->lgh_ctxt != NULL); + LASSERT(loghandle->lgh_obj != NULL); + + llh = loghandle->lgh_hdr; + + CDEBUG(D_RPCTRACE, "Canceling %d records, first %d in log "DFID"\n", + num, index[0], PFID(&loghandle->lgh_id.lgl_oi.oi_fid)); + + 
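/*
+ * To cancel the records: open a transaction on the backing dt device,
+ * declare the header rewrite (and a possible destroy), clear the
+ * requested index bits in the bitmap while holding lgh_hdr_mutex,
+ * then rewrite the llog header. If the log becomes empty and is
+ * flagged LLOG_F_ZAP_WHEN_EMPTY it may be destroyed as well, in which
+ * case LLOG_DEL_PLAIN is returned. On failure the cleared bits and
+ * llh_count are restored.
+ */
+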
dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + + if (unlikely(unlikely(dt->dd_rdonly))) + RETURN(0); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_declare_write_rec(env, loghandle, &llh->llh_hdr, 0, th); + if (rc < 0) + GOTO(out_trans, rc); + + if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY)) { + rc = llog_declare_destroy(env, loghandle, th); + if (rc < 0) + GOTO(out_trans, rc); + } + + th->th_wait_submit = 1; + rc = dt_trans_start_local(env, dt, th); + if (rc < 0) + GOTO(out_trans, rc); + + down_write(&loghandle->lgh_lock); + /* clear bitmap */ + mutex_lock(&loghandle->lgh_hdr_mutex); + for (i = 0; i < num; ++i) { + if (index[i] == 0) { + CERROR("Can't cancel index 0 which is header\n"); + GOTO(out_unlock, rc = -EINVAL); + } + if (!__test_and_clear_bit_le(index[i], LLOG_HDR_BITMAP(llh))) { + CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n", + index[i]); + GOTO(out_unlock, rc = -ENOENT); + } + } + loghandle->lgh_hdr->llh_count -= num; + subtract_count = true; + + /* Since llog_process_thread use lgi_cookie, it`s better to save them + * and restore after using + */ + tmp_lgc_index = lgi->lgi_cookie.lgc_index; + /* Pass this index to llog_osd_write_rec(), which will use the index + * to only update the necesary bitmap. */ + lgi->lgi_cookie.lgc_index = index[0]; + /* update header */ + rc = llog_write_rec(env, loghandle, &llh->llh_hdr, (num != 1 ? NULL : + &lgi->lgi_cookie), LLOG_HEADER_IDX, th); + lgi->lgi_cookie.lgc_index = tmp_lgc_index; + + if (rc != 0) + GOTO(out_unlock, rc); + + if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1) && + ((loghandle->lgh_last_idx == LLOG_HDR_BITMAP_SIZE(llh) - 1) || + (loghandle->u.phd.phd_cat_handle != NULL && + loghandle->u.phd.phd_cat_handle->u.chd.chd_current_log != + loghandle))) { + /* never try to destroy it again */ + llh->llh_flags &= ~LLOG_F_ZAP_WHEN_EMPTY; + rc = llog_trans_destroy(env, loghandle, th); + if (rc < 0) { + /* Sigh, can not destroy the final plain llog, but + * the bitmap has been clearly, so the record can not + * be accessed anymore, let's return 0 for now, and + * the orphan will be handled by LFSCK. 
*/ + CERROR("%s: can't destroy empty llog "DFID": rc = %d\n", + loghandle2name(loghandle), + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), rc); + GOTO(out_unlock, rc = 0); + } + rc = LLOG_DEL_PLAIN; + } + +out_unlock: + if (rc < 0) { + /* restore bitmap while holding a mutex */ + if (subtract_count) { + loghandle->lgh_hdr->llh_count += num; + subtract_count = false; + } + for (i = i - 1; i >= 0; i--) + set_bit_le(index[i], LLOG_HDR_BITMAP(llh)); + } + mutex_unlock(&loghandle->lgh_hdr_mutex); + up_write(&loghandle->lgh_lock); +out_trans: + rc1 = dt_trans_stop(env, dt, th); + if (rc == 0) + rc = rc1; + if (rc1 < 0) { + mutex_lock(&loghandle->lgh_hdr_mutex); + if (subtract_count) + loghandle->lgh_hdr->llh_count += num; + for (i = i - 1; i >= 0; i--) + set_bit_le(index[i], LLOG_HDR_BITMAP(llh)); + mutex_unlock(&loghandle->lgh_hdr_mutex); + } + RETURN(rc); +} + +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index) +{ + return llog_cancel_arr_rec(env, loghandle, 1, &index); +} + +int llog_read_header(const struct lu_env *env, struct llog_handle *handle, + const struct obd_uuid *uuid) +{ + const struct llog_operations *lop; + int rc; + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + + if (lop->lop_read_header == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_read_header(env, handle); + if (rc == LLOG_EEMPTY) { + struct llog_log_hdr *llh = handle->lgh_hdr; + + /* lrh_len should be initialized in llog_init_handle */ + handle->lgh_last_idx = 0; /* header is record with index 0 */ + llh->llh_count = 1; /* for the header record */ + llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC; + LASSERT(handle->lgh_ctxt->loc_chunk_size >= + LLOG_MIN_CHUNK_SIZE); + llh->llh_hdr.lrh_len = handle->lgh_ctxt->loc_chunk_size; + llh->llh_hdr.lrh_index = 0; + llh->llh_timestamp = ktime_get_real_seconds(); + if (uuid) + memcpy(&llh->llh_tgtuuid, uuid, + sizeof(llh->llh_tgtuuid)); + llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap); + /* Since update llog header might also call this function, + * let's reset the bitmap to 0 here */ + memset(LLOG_HDR_BITMAP(llh), 0, llh->llh_hdr.lrh_len - + llh->llh_bitmap_offset - + sizeof(llh->llh_tail)); + set_bit_le(0, LLOG_HDR_BITMAP(llh)); + LLOG_HDR_TAIL(llh)->lrt_len = llh->llh_hdr.lrh_len; + LLOG_HDR_TAIL(llh)->lrt_index = llh->llh_hdr.lrh_index; + rc = 0; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_read_header); + +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct obd_uuid *uuid) +{ + struct llog_log_hdr *llh; + enum llog_flag fmt = flags & LLOG_F_EXT_MASK; + int rc; + int chunk_size = handle->lgh_ctxt->loc_chunk_size; + ENTRY; + + LASSERT(handle->lgh_hdr == NULL); + + LASSERT(chunk_size >= LLOG_MIN_CHUNK_SIZE); + OBD_ALLOC_LARGE(llh, chunk_size); + if (llh == NULL) + RETURN(-ENOMEM); + + handle->lgh_hdr = llh; + handle->lgh_hdr_size = chunk_size; + /* first assign flags to use llog_client_ops */ + llh->llh_flags = flags; + rc = llog_read_header(env, handle, uuid); + if (rc == 0) { + if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN && + flags & LLOG_F_IS_CAT) || + (llh->llh_flags & LLOG_F_IS_CAT && + flags & LLOG_F_IS_PLAIN))) { + CERROR("%s: llog type is %s but initializing %s\n", + loghandle2name(handle), + llh->llh_flags & LLOG_F_IS_CAT ? + "catalog" : "plain", + flags & LLOG_F_IS_CAT ? 
"catalog" : "plain"); + GOTO(out, rc = -EINVAL); + } else if (llh->llh_flags & + (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) { + /* + * it is possible to open llog without specifying llog + * type so it is taken from llh_flags + */ + flags = llh->llh_flags; + } else { + /* for some reason the llh_flags has no type set */ + CERROR("llog type is not specified!\n"); + GOTO(out, rc = -EINVAL); + } + if (unlikely(uuid && + !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) { + CERROR("%s: llog uuid mismatch: %s/%s\n", + loghandle2name(handle), + (char *)uuid->uuid, + (char *)llh->llh_tgtuuid.uuid); + GOTO(out, rc = -EEXIST); + } + } + if (flags & LLOG_F_IS_CAT) { + LASSERT(list_empty(&handle->u.chd.chd_head)); + INIT_LIST_HEAD(&handle->u.chd.chd_head); + llh->llh_size = sizeof(struct llog_logid_rec); + llh->llh_flags |= LLOG_F_IS_FIXSIZE; + } else if (!(flags & LLOG_F_IS_PLAIN)) { + CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n", + loghandle2name(handle), flags, LLOG_F_IS_CAT, + LLOG_F_IS_PLAIN); + rc = -EINVAL; + } + llh->llh_flags |= fmt; +out: + if (rc) { + OBD_FREE_LARGE(llh, chunk_size); + handle->lgh_hdr = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_init_handle); + +int llog_verify_record(const struct llog_handle *llh, struct llog_rec_hdr *rec) +{ + int chunk_size = llh->lgh_hdr->llh_hdr.lrh_len; + + if (rec->lrh_len == 0 || rec->lrh_len > chunk_size) { + CERROR("%s: record is too large: %d > %d\n", + loghandle2name(llh), rec->lrh_len, chunk_size); + return -EINVAL; + } + if (rec->lrh_index >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) { + CERROR("%s: index is too high: %d\n", + loghandle2name(llh), rec->lrh_index); + return -EINVAL; + } + if ((rec->lrh_type & LLOG_OP_MASK) != LLOG_OP_MAGIC) { + CERROR("%s: magic %x is bad\n", + loghandle2name(llh), rec->lrh_type); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL(llog_verify_record); + +static inline bool llog_is_index_skipable(int idx, struct llog_log_hdr *llh, + struct llog_process_cat_data *cd) +{ + if (cd && (cd->lpcd_read_mode & LLOG_READ_MODE_RAW)) + return false; + + return !test_bit_le(idx, LLOG_HDR_BITMAP(llh)); +} + +static int llog_process_thread(void *arg) +{ + struct llog_process_info *lpi = arg; + struct llog_handle *loghandle = lpi->lpi_loghandle; + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = lpi->lpi_catdata; + struct llog_thread_info *lti; + char *buf; + size_t chunk_size; + __u64 cur_offset; + int rc = 0, index = 1, last_index; + int saved_index = 0; + int last_called_index = 0; + bool repeated = false; + bool refresh_idx = false; + + ENTRY; + + if (llh == NULL) + RETURN(-EINVAL); + + lti = lpi->lpi_env == NULL ? 
NULL : llog_info(lpi->lpi_env); + + cur_offset = chunk_size = llh->llh_hdr.lrh_len; + /* expect chunk_size to be power of two */ + LASSERT(is_power_of_2(chunk_size)); + + OBD_ALLOC_LARGE(buf, chunk_size); + if (buf == NULL) { + lpi->lpi_rc = -ENOMEM; + RETURN(0); + } + + if (cd != NULL) { + last_called_index = cd->lpcd_first_idx; + index = cd->lpcd_first_idx + 1; + } + if (cd && cd->lpcd_last_idx) + last_index = cd->lpcd_last_idx; + else if (cd && (cd->lpcd_read_mode & LLOG_READ_MODE_RAW)) + last_index = loghandle->lgh_last_idx; + else + last_index = LLOG_HDR_BITMAP_SIZE(llh) - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + off_t chunk_offset = 0; + unsigned int buf_offset = 0; + int lh_last_idx; + int synced_idx = 0; + + /* skip records not set in bitmap */ + while (index <= last_index && + llog_is_index_skipable(index, llh, cd)) + ++index; + + /* There are no indices prior the last_index */ + if (index > last_index) + break; + + CDEBUG(D_OTHER, "index: %d last_index %d\n", index, + last_index); + +repeat: + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, chunk_size); + /* the record index for outdated chunk data */ + /* it is safe to process buffer until saved lgh_last_idx */ + lh_last_idx = LLOG_HDR_TAIL(llh)->lrt_index; + rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index, + index, &cur_offset, buf, chunk_size); + if (repeated && rc) + CDEBUG(D_OTHER, "cur_offset %llu, chunk_offset %llu," + " buf_offset %u, rc = %d\n", cur_offset, + (__u64)chunk_offset, buf_offset, rc); + if (rc == -ESTALE) + GOTO(out, rc = 0); + /* we`ve tried to reread the chunk, but there is no + * new records */ + if (rc == -EIO && repeated && (chunk_offset + buf_offset) == + cur_offset) + GOTO(out, rc = 0); + if (rc != 0) + GOTO(out, rc); + + /* NB: after llog_next_block() call the cur_offset is the + * offset of the next block after read one. + * The absolute offset of the current chunk is calculated + * from cur_offset value and stored in chunk_offset variable. + */ + if ((cur_offset & (chunk_size - 1)) != 0) + chunk_offset = cur_offset & ~(chunk_size - 1); + else + chunk_offset = cur_offset - chunk_size; + + /* NB: when rec->lrh_len is accessed it is already swabbed + * since it is used at the "end" of the loop and the rec + * swabbing is done at the beginning of the loop. */ + for (rec = (struct llog_rec_hdr *)(buf + buf_offset); + (char *)rec < buf + chunk_size; + rec = llog_rec_hdr_next(rec)) { + + CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n", + rec, rec->lrh_type); + + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n", + rec->lrh_type, rec->lrh_index); + + if (index == (synced_idx + 1) && + synced_idx == LLOG_HDR_TAIL(llh)->lrt_index) + GOTO(out, rc = 0); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LLOG_PROCESS_TIMEOUT) && + cfs_fail_val == (unsigned int) + (loghandle->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF)) { + OBD_RACE(OBD_FAIL_LLOG_PROCESS_TIMEOUT); + } + + /* the bitmap could be changed during processing + * records from the chunk. For wrapped catalog + * it means we can read deleted record and try to + * process it. Check this case and reread the chunk. + * It is safe to process to lh_last_idx, including + * lh_last_idx if it was synced. We can not do <= + * comparison, cause for wrapped catalog lgh_last_idx + * could be less than index. So we detect last index + * for processing as index == lh_last_idx+1. 
But when + * catalog is wrapped and full lgh_last_idx=llh_cat_idx, + * the first processing index is llh_cat_idx+1.The + * exception is !(lgh_last_idx == llh_cat_idx && + * index == llh_cat_idx + 1), and after simplification + * it turns to + * lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index + * This exception is working for catalog only. + */ + + if ((index == lh_last_idx && synced_idx != index) || + (index == (lh_last_idx + 1) && + lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index) || + (rec->lrh_index == 0 && !repeated)) { + + /* save offset inside buffer for the re-read */ + buf_offset = (char *)rec - (char *)buf; + cur_offset = chunk_offset; + repeated = true; + /* We need to be sure lgh_last_idx + * record was saved to disk + */ + down_read(&loghandle->lgh_last_sem); + synced_idx = LLOG_HDR_TAIL(llh)->lrt_index; + up_read(&loghandle->lgh_last_sem); + CDEBUG(D_OTHER, "synced_idx: %d\n", synced_idx); + goto repeat; + + } + + repeated = false; + + rc = llog_verify_record(loghandle, rec); + if (rc) { + CERROR("%s: invalid record in llog "DFID + " record for index %d/%d: rc = %d\n", + loghandle2name(loghandle), + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + rec->lrh_index, index, rc); + /* + * the block seem to be corrupted, let's try + * with the next one. reset rc to go to the + * next chunk. + */ + refresh_idx = true; + index = 0; + GOTO(repeat, rc = 0); + } + + if (rec->lrh_index < index) { + CDEBUG(D_OTHER, "skipping lrh_index %d\n", + rec->lrh_index); + continue; + } + + if (rec->lrh_index > index) { + /* the record itself looks good, but we met a + * gap which can be result of old bugs, just + * keep going */ + CERROR("%s: "DFID" index %u, expected %u\n", + loghandle2name(loghandle), + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + rec->lrh_index, index); + index = rec->lrh_index; + } + + CDEBUG(D_OTHER, + "lrh_index: %d lrh_len: %d (%d remains)\n", + rec->lrh_index, rec->lrh_len, + (int)(buf + chunk_size - (char *)rec)); + + /* lgh_cur_offset is used only at llog_test_3 */ + loghandle->lgh_cur_offset = (char *)rec - (char *)buf + + chunk_offset; + + /* if needed, process the callback on this record */ + if (!llog_is_index_skipable(index, llh, cd)) { + struct llog_cookie *lgc; + __u64 tmp_off; + int tmp_idx; + + CDEBUG((llh->llh_flags & LLOG_F_IS_CAT ? + D_HA : D_OTHER), + "index: %d, lh_last_idx: %d " + "synced_idx: %d lgh_last_idx: %d\n", + index, lh_last_idx, synced_idx, + loghandle->lgh_last_idx); + + if (lti != NULL) { + lgc = <i->lgi_cookie; + /* store lu_env for recursive calls */ + tmp_off = lgc->lgc_offset; + tmp_idx = lgc->lgc_index; + + lgc->lgc_offset = (char *)rec - + (char *)buf + chunk_offset; + lgc->lgc_index = rec->lrh_index; + } + /* using lu_env for passing record offset to + * llog_write through various callbacks */ + rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec, + lpi->lpi_cbdata); + last_called_index = index; + + if (lti != NULL) { + lgc->lgc_offset = tmp_off; + lgc->lgc_index = tmp_idx; + } + + if (rc == LLOG_PROC_BREAK) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + rc = llog_cancel_rec(lpi->lpi_env, + loghandle, + rec->lrh_index); + /* Allow parallel cancelling, ENOENT + * means record was canceled at another + * processing thread or callback + */ + if (rc == -ENOENT) + rc = 0; + } + if (rc) + GOTO(out, rc); + /* some stupid callbacks directly cancel records + * and delete llog. Check it and stop + * processing. 
*/ + if (loghandle->lgh_hdr == NULL || + loghandle->lgh_hdr->llh_count == 1) + GOTO(out, rc = 0); + } + /* exit if the last index is reached */ + if (index >= last_index) + GOTO(out, rc = 0); + ++index; + } + } + +out: + CDEBUG(D_HA, "stop processing %s "DOSTID":%x index %d count %d\n", + ((llh->llh_flags & LLOG_F_IS_CAT) ? "catalog" : "plain"), + POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen, + index, llh->llh_count); + + if (cd != NULL) + cd->lpcd_last_idx = last_called_index; + + if (unlikely(rc == -EIO && loghandle->lgh_obj != NULL)) { + if (dt_object_remote(loghandle->lgh_obj)) { + /* If it is remote object, then -EIO might means + * disconnection or eviction, let's return -EAGAIN, + * so for update recovery log processing, it will + * retry until the umount or abort recovery, see + * lod_sub_recovery_thread() */ + CERROR("%s retry remote llog process\n", + loghandle2name(loghandle)); + rc = -EAGAIN; + } else { + /* something bad happened to the processing of a local + * llog file, probably I/O error or the log got + * corrupted to be able to finally release the log we + * discard any remaining bits in the header */ + CERROR("%s: Local llog found corrupted #"DOSTID":%x" + " %s index %d count %d\n", + loghandle2name(loghandle), + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, + ((llh->llh_flags & LLOG_F_IS_CAT) ? "catalog" : + "plain"), index, llh->llh_count); + + while (index <= last_index) { + if (test_bit_le(index, + LLOG_HDR_BITMAP(llh)) != 0) + llog_cancel_rec(lpi->lpi_env, loghandle, + index); + index++; + } + rc = 0; + } + } + + OBD_FREE_LARGE(buf, chunk_size); + lpi->lpi_rc = rc; + return 0; +} + +static int llog_process_thread_daemonize(void *arg) +{ + struct llog_process_info *lpi = arg; + struct lu_env env; + int rc; + struct nsproxy *new_ns, *curr_ns = current->nsproxy; + + task_lock(lpi->lpi_reftask); + new_ns = lpi->lpi_reftask->nsproxy; + if (curr_ns != new_ns) { + get_nsproxy(new_ns); + + current->nsproxy = new_ns; + /* XXX: we should call put_nsproxy() instead of + * atomic_dec(&ns->count) directly. But put_nsproxy() cannot be + * used outside of the kernel itself, because it calls + * free_nsproxy() which is not exported by the kernel + * (defined in kernel/nsproxy.c) */ + if (curr_ns) + atomic_dec(&curr_ns->count); + } + task_unlock(lpi->lpi_reftask); + + unshare_fs_struct(); + /* client env has no keys, tags is just 0 */ + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + goto out; + lpi->lpi_env = &env; + + rc = llog_process_thread(arg); + + lu_env_fini(&env); +out: + complete(&lpi->lpi_completion); + return rc; +} + +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork) +{ + struct llog_process_info *lpi; + struct llog_process_data *d = data; + struct llog_process_cat_data *cd = catdata; + __u32 flags = loghandle->lgh_hdr->llh_flags; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(lpi); + if (lpi == NULL) { + CERROR("cannot alloc pointer\n"); + RETURN(-ENOMEM); + } + lpi->lpi_loghandle = loghandle; + lpi->lpi_cb = cb; + lpi->lpi_cbdata = data; + lpi->lpi_catdata = catdata; + + CDEBUG(D_OTHER, "Processing "DFID" flags 0x%03x startcat %d startidx %d first_idx %d last_idx %d read_mode %d\n", + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), flags, + (flags & LLOG_F_IS_CAT) && d ? d->lpd_startcat : -1, + (flags & LLOG_F_IS_CAT) && d ? d->lpd_startidx : -1, + cd ? cd->lpcd_first_idx : -1, cd ? cd->lpcd_last_idx : -1, + cd ? 
cd->lpcd_read_mode : -1); + if (fork) { + struct task_struct *task; + + /* The new thread can't use parent env, + * init the new one in llog_process_thread_daemonize. */ + lpi->lpi_env = NULL; + init_completion(&lpi->lpi_completion); + /* take reference to current, so that + * llog_process_thread_daemonize() can use it to switch to + * namespace associated with current */ + lpi->lpi_reftask = current; + task = kthread_run(llog_process_thread_daemonize, lpi, + "llog_process_thread"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start thread: rc = %d\n", + loghandle2name(loghandle), rc); + GOTO(out_lpi, rc); + } + wait_for_completion(&lpi->lpi_completion); + } else { + lpi->lpi_env = env; + llog_process_thread(lpi); + } + rc = lpi->lpi_rc; + +out_lpi: + OBD_FREE_PTR(lpi); + RETURN(rc); +} +EXPORT_SYMBOL(llog_process_or_fork); + +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata) +{ + int rc; + rc = llog_process_or_fork(env, loghandle, cb, data, catdata, true); + return rc == LLOG_DEL_PLAIN ? 0 : rc; +} +EXPORT_SYMBOL(llog_process); + +static inline const struct cred *llog_raise_resource(void) +{ + struct cred *cred = NULL; + + if (cap_raised(current_cap(), CAP_SYS_RESOURCE)) + return cred; + + cred = prepare_creds(); + if (!cred) + return cred; + + cap_raise(cred->cap_effective, CAP_SYS_RESOURCE); + return override_creds(cred); +} + +static inline void llog_restore_resource(const struct cred *old_cred) +{ + if (old_cred) + revert_creds(old_cred); +} + +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = catdata; + void *buf; + int rc = 0, first_index = 1, index, idx; + __u32 chunk_size = llh->llh_hdr.lrh_len; + ENTRY; + + OBD_ALLOC_LARGE(buf, chunk_size); + if (buf == NULL) + RETURN(-ENOMEM); + + if (cd != NULL) + first_index = cd->lpcd_first_idx + 1; + if (cd != NULL && cd->lpcd_last_idx) + index = cd->lpcd_last_idx; + else + index = LLOG_HDR_BITMAP_SIZE(llh) - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + + /* skip records not set in bitmap */ + while (index >= first_index && + llog_is_index_skipable(index, llh, cd)) + --index; + + LASSERT(index >= first_index - 1); + if (index == first_index - 1) + break; + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, chunk_size); + rc = llog_prev_block(env, loghandle, index, buf, chunk_size); + if (rc) + GOTO(out, rc); + + rec = buf; + idx = rec->lrh_index; + CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx); + while (idx < index) { + rec = (void *)rec + rec->lrh_len; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + idx ++; + } + LASSERT(idx == index); + tail = (void *)rec + rec->lrh_len - sizeof(*tail); + + /* process records in buffer, starting where we found one */ + while ((void *)tail > buf) { + if (tail->lrt_index == 0) + GOTO(out, rc = 0); /* no more records */ + + /* if needed, process the callback on this record */ + if (!llog_is_index_skipable(index, llh, cd)) { + rec = (void *)tail - tail->lrt_len + + sizeof(*tail); + + rc = cb(env, loghandle, rec, data); + if (rc == LLOG_PROC_BREAK) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + rc = llog_cancel_rec(env, loghandle, + tail->lrt_index); + } + if (rc) + GOTO(out, rc); + } + + /* previous record, still in buffer? 
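+			 * Step back by lrt_len to the tail of the preceding
+			 * record and stop once index drops below first_index.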
*/ + --index; + if (index < first_index) + GOTO(out, rc = 0); + tail = (void *)tail - tail->lrt_len; + } + } + +out: + if (buf != NULL) + OBD_FREE_LARGE(buf, chunk_size); + RETURN(rc); +} +EXPORT_SYMBOL(llog_reverse_process); + +/** + * new llog API + * + * API functions: + * llog_open - open llog, may not exist + * llog_exist - check if llog exists + * llog_close - close opened llog, pair for open, frees llog_handle + * llog_declare_create - declare llog creation + * llog_create - create new llog on disk, need transaction handle + * llog_declare_write_rec - declaration of llog write + * llog_write_rec - write llog record on disk, need transaction handle + * llog_declare_add - declare llog catalog record addition + * llog_add - add llog record in catalog, need transaction handle + */ +int llog_exist(struct llog_handle *loghandle) +{ + const struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_exist == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_exist(loghandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_exist); + +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th) +{ + const struct cred *old_cred; + const struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_declare_create == NULL) + RETURN(-EOPNOTSUPP); + + old_cred = llog_raise_resource(); + rc = lop->lop_declare_create(env, loghandle, th); + llog_restore_resource(old_cred); + RETURN(rc); +} + +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th) +{ + const struct cred *old_cred; + const struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_create == NULL) + RETURN(-EOPNOTSUPP); + + old_cred = llog_raise_resource(); + rc = lop->lop_create(env, handle, th); + llog_restore_resource(old_cred); + RETURN(rc); +} + +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th) +{ + const struct cred *old_cred; + const struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + LASSERT(lop); + if (lop->lop_declare_write_rec == NULL) + RETURN(-EOPNOTSUPP); + + old_cred = llog_raise_resource(); + rc = lop->lop_declare_write_rec(env, handle, rec, idx, th); + llog_restore_resource(old_cred); + RETURN(rc); +} + +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int idx, struct thandle *th) +{ + const struct cred *old_cred; + const struct llog_operations *lop; + int rc, buflen; + + ENTRY; + + /* API sanity checks */ + if (handle == NULL) { + CERROR("loghandle is missed\n"); + RETURN(-EPROTO); + } else if (handle->lgh_obj == NULL) { + CERROR("loghandle %p with NULL object\n", + handle); + RETURN(-EPROTO); + } else if (th == NULL) { + CERROR("%s: missed transaction handle\n", + loghandle2name(handle)); + RETURN(-EPROTO); + } else if (handle->lgh_hdr == NULL) { + CERROR("%s: loghandle %p with no header\n", + loghandle2name(handle), handle); + RETURN(-EPROTO); + } + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + + if (lop->lop_write_rec == NULL) + RETURN(-EOPNOTSUPP); + + buflen = rec->lrh_len; + LASSERT(cfs_size_round(buflen) == buflen); + + old_cred = llog_raise_resource(); + rc = lop->lop_write_rec(env, 
handle, rec, logcookies, idx, th); + llog_restore_resource(old_cred); + RETURN(rc); +} + +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + struct thandle *th) +{ + const struct cred *old_cred; + int rc; + + ENTRY; + + if (lgh->lgh_logops->lop_add == NULL) + RETURN(-EOPNOTSUPP); + + old_cred = llog_raise_resource(); + rc = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, th); + llog_restore_resource(old_cred); + RETURN(rc); +} +EXPORT_SYMBOL(llog_add); + +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th) +{ + const struct cred *old_cred; + int rc; + + ENTRY; + + if (lgh->lgh_logops->lop_declare_add == NULL) + RETURN(-EOPNOTSUPP); + + old_cred = llog_raise_resource(); + rc = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th); + llog_restore_resource(old_cred); + RETURN(rc); +} +EXPORT_SYMBOL(llog_declare_add); + +/** + * Helper function to open llog or create it if doesn't exist. + * It hides all transaction handling from caller. + */ +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name) +{ + struct dt_device *d; + struct thandle *th; + int rc; + + ENTRY; + + rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW); + if (rc) + RETURN(rc); + + if (llog_exist(*res)) + RETURN(0); + + LASSERT((*res)->lgh_obj != NULL); + + d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev); + + if (unlikely(unlikely(d->dd_rdonly))) + RETURN(-EROFS); + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + /* Create update llog object synchronously, which + * happens during inialization process see + * lod_sub_prep_llog(), to make sure the update + * llog object is created before corss-MDT writing + * updates into the llog object */ + if (ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) + th->th_sync = 1; + + th->th_wait_submit = 1; + rc = llog_declare_create(env, *res, th); + if (rc == 0) { + rc = dt_trans_start_local(env, d, th); + if (rc == 0) + rc = llog_create(env, *res, th); + } + dt_trans_stop(env, d, th); +out: + if (rc) + llog_close(env, *res); + RETURN(rc); +} +EXPORT_SYMBOL(llog_open_create); + +/** + * Helper function to delete existent llog. + */ +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name) +{ + struct llog_handle *handle; + int rc = 0, rc2; + + ENTRY; + + /* nothing to erase */ + if (name == NULL && logid == NULL) + RETURN(0); + + rc = llog_open(env, ctxt, &handle, logid, name, LLOG_OPEN_EXISTS); + if (rc < 0) + RETURN(rc); + + rc = llog_init_handle(env, handle, LLOG_F_IS_PLAIN, NULL); + if (rc == 0) + rc = llog_destroy(env, handle); + + rc2 = llog_close(env, handle); + if (rc == 0) + rc = rc2; + RETURN(rc); +} +EXPORT_SYMBOL(llog_erase); + +/* + * Helper function for write record in llog. + * It hides all transaction handling from caller. + * Valid only with local llog. 
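+ * The record is written under a single local transaction:
+ * declare, start, write under lgh_lock, stop.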
+ */ +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, int idx) +{ + struct dt_device *dt; + struct thandle *th; + bool need_cookie; + int rc; + + ENTRY; + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + LASSERT(loghandle->lgh_obj != NULL); + + dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + + if (unlikely(unlikely(dt->dd_rdonly))) + RETURN(-EROFS); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_declare_write_rec(env, loghandle, rec, idx, th); + if (rc) + GOTO(out_trans, rc); + + th->th_wait_submit = 1; + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out_trans, rc); + + need_cookie = !(idx == LLOG_HEADER_IDX || idx == LLOG_NEXT_IDX); + + down_write(&loghandle->lgh_lock); + if (need_cookie) { + struct llog_thread_info *lti = llog_info(env); + + /* cookie comes from llog_process_thread */ + rc = llog_write_rec(env, loghandle, rec, <i->lgi_cookie, + rec->lrh_index, th); + /* upper layer didn`t pass cookie so change rc */ + rc = (rc == 1 ? 0 : rc); + } else { + rc = llog_write_rec(env, loghandle, rec, NULL, idx, th); + } + + up_write(&loghandle->lgh_lock); +out_trans: + dt_trans_stop(env, dt, th); + RETURN(rc); +} +EXPORT_SYMBOL(llog_write); + +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + const struct cred *old_cred; + int rc; + + ENTRY; + + LASSERT(ctxt); + LASSERT(ctxt->loc_logops); + + if (ctxt->loc_logops->lop_open == NULL) { + *lgh = NULL; + RETURN(-EOPNOTSUPP); + } + + *lgh = llog_alloc_handle(); + if (*lgh == NULL) + RETURN(-ENOMEM); + (*lgh)->lgh_ctxt = ctxt; + (*lgh)->lgh_logops = ctxt->loc_logops; + + old_cred = llog_raise_resource(); + rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param); + llog_restore_resource(old_cred); + if (rc) { + llog_free_handle(*lgh); + *lgh = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_open); + +int llog_close(const struct lu_env *env, struct llog_handle *loghandle) +{ + return llog_handle_put(env, loghandle); +} +EXPORT_SYMBOL(llog_close); + +/** + * Helper function to get the llog size in records. It is used by MGS + * mostly to check that config llog exists and contains data. 
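+ * A log that cannot be opened (-ENOENT) is also reported as empty.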
+ * + * \param[in] env execution environment + * \param[in] ctxt llog context + * \param[in] name llog name + * + * \retval true if there are records in llog besides a header + * \retval false on error or llog without records + */ +int llog_is_empty(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name) +{ + struct llog_handle *llh; + int rc = 0; + + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc < 0) { + if (likely(rc == -ENOENT)) + rc = 0; + GOTO(out, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_close, rc); + rc = llog_get_size(llh); + +out_close: + llog_close(env, llh); +out: + /* The header is record 1, the llog is still considered as empty + * if there is only header */ + return (rc <= 1); +} +EXPORT_SYMBOL(llog_is_empty); + +/* this callback run in raw read mode (canceled record are processed) */ +int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_handle *copy_llh = data; + int idx = rec->lrh_index; + int rc; + + ENTRY; + + /* Append all records */ + rc = llog_write(env, copy_llh, rec, LLOG_NEXT_IDX); + + /* Cancel the record if it is canceled on the source */ + if (!rc && !test_bit_le(idx, LLOG_HDR_BITMAP(llh->lgh_hdr))) + rc = llog_cancel_rec(env, copy_llh, copy_llh->lgh_last_idx); + + RETURN(rc); +} + +/* backup plain llog */ +int llog_backup(const struct lu_env *env, struct obd_device *obd, + struct llog_ctxt *ctxt, struct llog_ctxt *bctxt, + char *name, char *backup) +{ + struct llog_handle *llh, *bllh; + struct llog_process_cat_data cd = {0}; + int rc; + + ENTRY; + + /* open original log */ + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc < 0) { + /* the -ENOENT case is also reported to the caller + * but silently so it should handle that if needed. 
+ */ + if (rc != -ENOENT) + CERROR("%s: failed to open log %s: rc = %d\n", + obd->obd_name, name, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_close, rc); + + /* Make sure there's no old backup log */ + rc = llog_erase(env, bctxt, NULL, backup); + if (rc < 0 && rc != -ENOENT) + GOTO(out_close, rc); + + /* open backup log */ + rc = llog_open_create(env, bctxt, &bllh, NULL, backup); + if (rc) { + CERROR("%s: failed to open backup logfile %s: rc = %d\n", + obd->obd_name, backup, rc); + GOTO(out_close, rc); + } + + /* check that backup llog is not the same object as original one */ + if (llh->lgh_obj == bllh->lgh_obj) { + CERROR("%s: backup llog %s to itself (%s), objects %p/%p\n", + obd->obd_name, name, backup, llh->lgh_obj, + bllh->lgh_obj); + GOTO(out_backup, rc = -EEXIST); + } + + rc = llog_init_handle(env, bllh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_backup, rc); + + /* Read canceled records to have an exact copy */ + cd.lpcd_read_mode = LLOG_READ_MODE_RAW; + /* Copy log record by record */ + rc = llog_process_or_fork(env, llh, llog_copy_handler, (void *)bllh, + &cd, false); + if (rc) + CERROR("%s: failed to backup log %s: rc = %d\n", + obd->obd_name, name, rc); +out_backup: + llog_close(env, bllh); +out_close: + llog_close(env, llh); + RETURN(rc); +} +EXPORT_SYMBOL(llog_backup); + +/* Get size of llog */ +__u64 llog_size(const struct lu_env *env, struct llog_handle *llh) +{ + int rc; + struct lu_attr la; + + rc = llh->lgh_obj->do_ops->do_attr_get(env, llh->lgh_obj, &la); + if (rc) { + CERROR("%s: attr_get failed for "DFID": rc = %d\n", + loghandle2name(llh), PFID(&llh->lgh_id.lgl_oi.oi_fid), + rc); + return 0; + } + + return la.la_size; +} +EXPORT_SYMBOL(llog_size); + diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c new file mode 100644 index 0000000000000..ba44ad3003559 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c @@ -0,0 +1,1198 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/llog_cat.c + * + * OST<->MDS recovery logging infrastructure. 
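+ *
+ * A catalog llog stores llog_logid_rec entries, one per plain llog
+ * file; records are appended to the current plain log and a new
+ * plain log is created once the current one fills up.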
+ * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + * Author: Alexey Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include + +#include "llog_internal.h" + + +/** + * lockdep markers for nested struct llog_handle::lgh_lock locking. + */ +enum { + LLOGH_CAT, + LLOGH_LOG, +}; + +/* Create a new log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + */ +static int llog_cat_new_log(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_handle *loghandle, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_logid_rec *rec = &lgi->lgi_logid; + struct thandle *handle = NULL; + struct dt_device *dt = NULL; + struct llog_log_hdr *llh = cathandle->lgh_hdr; + int rc, index; + + ENTRY; + + index = (cathandle->lgh_last_idx + 1) % + (OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS) ? (cfs_fail_val + 1) : + LLOG_HDR_BITMAP_SIZE(llh)); + + /* check that new llog index will not overlap with the first one. + * - llh_cat_idx is the index just before the first/oldest still in-use + * index in catalog + * - lgh_last_idx is the last/newest used index in catalog + * + * When catalog is not wrapped yet then lgh_last_idx is always larger + * than llh_cat_idx. After the wrap around lgh_last_idx re-starts + * from 0 and llh_cat_idx becomes the upper limit for it + * + * Check if catalog has already wrapped around or not by comparing + * last_idx and cat_idx */ + if ((index == llh->llh_cat_idx + 1 && llh->llh_count > 1) || + (index == 0 && llh->llh_cat_idx == 0)) { + if (cathandle->lgh_name == NULL) { + CWARN("%s: there are no more free slots in catalog " + DFID":%x\n", + loghandle2name(loghandle), + PFID(&cathandle->lgh_id.lgl_oi.oi_fid), + cathandle->lgh_id.lgl_ogen); + } else { + CWARN("%s: there are no more free slots in " + "catalog %s\n", loghandle2name(loghandle), + cathandle->lgh_name); + } + RETURN(-ENOSPC); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED)) + RETURN(-ENOSPC); + + if (loghandle->lgh_hdr != NULL) { + /* If llog object is remote and creation is failed, lgh_hdr + * might be left over here, free it first */ + LASSERT(!llog_exist(loghandle)); + OBD_FREE_LARGE(loghandle->lgh_hdr, loghandle->lgh_hdr_size); + loghandle->lgh_hdr = NULL; + } + + if (th == NULL) { + dt = lu2dt_dev(cathandle->lgh_obj->do_lu.lo_dev); + + handle = dt_trans_create(env, dt); + if (IS_ERR(handle)) + RETURN(PTR_ERR(handle)); + + /* Create update llog object synchronously, which + * happens during inialization process see + * lod_sub_prep_llog(), to make sure the update + * llog object is created before corss-MDT writing + * updates into the llog object */ + if (cathandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) + handle->th_sync = 1; + + handle->th_wait_submit = 1; + + rc = llog_declare_create(env, loghandle, handle); + if (rc != 0) + GOTO(out, rc); + + rec->lid_hdr.lrh_len = sizeof(*rec); + rec->lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + rec->lid_id = loghandle->lgh_id; + rc = llog_declare_write_rec(env, cathandle, &rec->lid_hdr, -1, + handle); + if (rc != 0) + GOTO(out, rc); + + rc = dt_trans_start_local(env, dt, handle); + if (rc != 0) + GOTO(out, rc); + + th = handle; + } + + rc = llog_create(env, loghandle, th); + /* 
if llog is already created, no need to initialize it */ + if (rc == -EEXIST) { + GOTO(out, rc = 0); + } else if (rc != 0) { + CERROR("%s: can't create new plain llog in catalog: rc = %d\n", + loghandle2name(loghandle), rc); + GOTO(out, rc); + } + + rc = llog_init_handle(env, loghandle, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &cathandle->lgh_hdr->llh_tgtuuid); + if (rc < 0) + GOTO(out, rc); + + /* build the record for this log in the catalog */ + rec->lid_hdr.lrh_len = sizeof(*rec); + rec->lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + rec->lid_id = loghandle->lgh_id; + + /* append the new record into catalog. The new index will be + * assigned to the record and updated in rec header */ + rc = llog_write_rec(env, cathandle, &rec->lid_hdr, + &loghandle->u.phd.phd_cookie, LLOG_NEXT_IDX, th); + if (rc < 0) + GOTO(out_destroy, rc); + + CDEBUG(D_OTHER, "new plain log "DFID".%u of catalog "DFID"\n", + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), rec->lid_hdr.lrh_index, + PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); + + loghandle->lgh_hdr->llh_cat_idx = rec->lid_hdr.lrh_index; + + /* limit max size of plain llog so that space can be + * released sooner, especially on small filesystems */ + /* 2MB for the cases when free space hasn't been learned yet */ + loghandle->lgh_max_size = 2 << 20; + dt = lu2dt_dev(cathandle->lgh_obj->do_lu.lo_dev); + rc = dt_statfs(env, dt, &lgi->lgi_statfs); + if (rc == 0 && lgi->lgi_statfs.os_bfree > 0) { + __u64 freespace = (lgi->lgi_statfs.os_bfree * + lgi->lgi_statfs.os_bsize) >> 6; + if (freespace < loghandle->lgh_max_size) + loghandle->lgh_max_size = freespace; + /* shouldn't be > 128MB in any case? + * it's 256K records of 512 bytes each */ + if (freespace > (128 << 20)) + loghandle->lgh_max_size = 128 << 20; + } + if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_PLAIN_RECORDS) || + OBD_FAIL_PRECHECK(OBD_FAIL_CATALOG_FULL_CHECK))) { + // limit the numer of plain records for test + loghandle->lgh_max_size = loghandle->lgh_hdr_size + + cfs_fail_val * 64; + } + + rc = 0; + +out: + if (handle != NULL) { + handle->th_result = rc >= 0 ? 0 : rc; + dt_trans_stop(env, dt, handle); + } + RETURN(rc); + +out_destroy: + /* to signal llog_cat_close() it shouldn't try to destroy the llog, + * we want to destroy it in this transaction, otherwise the object + * becomes an orphan */ + loghandle->lgh_hdr->llh_flags &= ~LLOG_F_ZAP_WHEN_EMPTY; + /* this is to mimic full log, so another llog_cat_current_log() + * can skip it and ask for another onet */ + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(loghandle->lgh_hdr) + 1; + llog_trans_destroy(env, loghandle, th); + if (handle != NULL) + dt_trans_stop(env, dt, handle); + RETURN(rc); +} + +static int llog_cat_refresh(const struct lu_env *env, + struct llog_handle *cathandle) +{ + struct llog_handle *loghandle; + int rc; + + down_write(&cathandle->lgh_lock); + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + if (!llog_exist(loghandle)) + continue; + + down_write(&loghandle->lgh_lock); + rc = llog_read_header(env, loghandle, NULL); + up_write(&loghandle->lgh_lock); + if (rc) + goto unlock; + } + + rc = llog_read_header(env, cathandle, NULL); +unlock: + up_write(&cathandle->lgh_lock); + + return rc; +} + +/* + * prepare current/next log for catalog. + * + * if \a *ploghandle is NULL, open it, and declare create, NB, if \a + * *ploghandle is remote, create it synchronously here, see comments + * below. + * + * \a cathandle->lgh_lock is down_read-ed, it gets down_write-ed if \a + * *ploghandle has to be opened. 
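+ * On return \a cathandle->lgh_lock is held for read again in all cases.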
+ */ +static int llog_cat_prep_log(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_handle **ploghandle, + struct thandle *th) +{ + int rc; + int sem_upgraded; + +start: + rc = 0; + sem_upgraded = 0; + if (IS_ERR_OR_NULL(*ploghandle)) { + up_read(&cathandle->lgh_lock); + down_write(&cathandle->lgh_lock); + sem_upgraded = 1; + if (IS_ERR_OR_NULL(*ploghandle)) { + struct llog_handle *loghandle; + + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (!rc) { + *ploghandle = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + if (rc) + GOTO(out, rc); + } + + rc = llog_exist(*ploghandle); + if (rc < 0) + GOTO(out, rc); + if (rc) + GOTO(out, rc = 0); + + if (dt_object_remote(cathandle->lgh_obj)) { + down_write_nested(&(*ploghandle)->lgh_lock, LLOGH_LOG); + if (!llog_exist(*ploghandle)) { + /* For remote operation, if we put the llog object + * creation in the current transaction, then the + * llog object will not be created on the remote + * target until the transaction stop, if other + * operations start before the transaction stop, + * and use the same llog object, will be dependent + * on the success of this transaction. So let's + * create the llog object synchronously here to + * remove the dependency. */ + rc = llog_cat_new_log(env, cathandle, *ploghandle, + NULL); + if (rc == -ESTALE) { + up_write(&(*ploghandle)->lgh_lock); + if (sem_upgraded) + up_write(&cathandle->lgh_lock); + else + up_read(&cathandle->lgh_lock); + + rc = llog_cat_refresh(env, cathandle); + down_read_nested(&cathandle->lgh_lock, + LLOGH_CAT); + if (rc) + return rc; + /* *ploghandle might become NULL, restart */ + goto start; + } + } + up_write(&(*ploghandle)->lgh_lock); + } else { + struct llog_thread_info *lgi = llog_info(env); + struct llog_logid_rec *lirec = &lgi->lgi_logid; + + rc = llog_declare_create(env, *ploghandle, th); + if (rc) + GOTO(out, rc); + + lirec->lid_hdr.lrh_len = sizeof(*lirec); + rc = llog_declare_write_rec(env, cathandle, &lirec->lid_hdr, -1, + th); + } + +out: + if (sem_upgraded) { + up_write(&cathandle->lgh_lock); + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + if (rc == 0) + goto start; + } + return rc; +} + +/* Open an existent log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + * We return a lock on the handle to ensure nobody yanks it from us. 
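+ * The handle is first looked up by logid in the catalog's list of
+ * open plain logs; if it is not found there it is opened from disk
+ * and added to that list.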
+ * + * This takes extra reference on llog_handle via llog_handle_get() and require + * this reference to be put by caller using llog_handle_put() + */ +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid) +{ + struct llog_handle *loghandle; + enum llog_flag fmt; + int rc = 0; + + ENTRY; + + if (cathandle == NULL) + RETURN(-EBADF); + + fmt = cathandle->lgh_hdr->llh_flags & LLOG_F_EXT_MASK; + down_write(&cathandle->lgh_lock); + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_logid *cgl = &loghandle->lgh_id; + + if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) && + ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) { + if (cgl->lgl_ogen != logid->lgl_ogen) { + CWARN("%s: log "DFID" generation %x != %x\n", + loghandle2name(loghandle), + PFID(&logid->lgl_oi.oi_fid), + cgl->lgl_ogen, logid->lgl_ogen); + continue; + } + *res = llog_handle_get(loghandle); + if (!*res) { + CERROR("%s: log "DFID" refcount is zero!\n", + loghandle2name(loghandle), + PFID(&logid->lgl_oi.oi_fid)); + continue; + } + loghandle->u.phd.phd_cat_handle = cathandle; + up_write(&cathandle->lgh_lock); + RETURN(rc); + } + } + up_write(&cathandle->lgh_lock); + + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL, + LLOG_OPEN_EXISTS); + if (rc < 0) { + CERROR("%s: error opening log id "DFID":%x: rc = %d\n", + loghandle2name(cathandle), PFID(&logid->lgl_oi.oi_fid), + logid->lgl_ogen, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN | + LLOG_F_ZAP_WHEN_EMPTY | fmt, NULL); + if (rc < 0) { + llog_close(env, loghandle); + *res = NULL; + RETURN(rc); + } + + *res = llog_handle_get(loghandle); + LASSERT(*res); + down_write(&cathandle->lgh_lock); + list_add_tail(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); + up_write(&cathandle->lgh_lock); + + loghandle->u.phd.phd_cat_handle = cathandle; + loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id; + loghandle->u.phd.phd_cookie.lgc_index = + loghandle->lgh_hdr->llh_cat_idx; + RETURN(0); +} + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) +{ + struct llog_handle *loghandle, *n; + int rc; + + ENTRY; + + list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_log_hdr *llh = loghandle->lgh_hdr; + int index; + + /* unlink open-not-created llogs */ + list_del_init(&loghandle->u.phd.phd_entry); + llh = loghandle->lgh_hdr; + if (loghandle->lgh_obj != NULL && llh != NULL && + (llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1)) { + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("%s: failure destroying log during " + "cleanup: rc = %d\n", + loghandle2name(loghandle), rc); + + index = loghandle->u.phd.phd_cookie.lgc_index; + llog_cat_cleanup(env, cathandle, NULL, index); + } + llog_close(env, loghandle); + } + /* if handle was stored in ctxt, remove it too */ + if (cathandle->lgh_ctxt->loc_handle == cathandle) + cathandle->lgh_ctxt->loc_handle = NULL; + rc = llog_close(env, cathandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_close); + +/** Return the currently active log handle. If the current log handle doesn't + * have enough space left for the current record, start a new one. + * + * If reclen is 0, we only want to know what the currently active log is, + * otherwise we get a lock on this log so nobody can steal our space. + * + * Assumes caller has already pushed us into the kernel context and is locking. 
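+ * If the current log is full, the chd_next_log prepared by
+ * llog_cat_declare_add_rec() is promoted to chd_current_log.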
+ * + * NOTE: loghandle is write-locked upon successful return + */ +static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, + struct thandle *th) +{ + struct llog_handle *loghandle = NULL; + ENTRY; + + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED2)) { + down_write_nested(&cathandle->lgh_lock, LLOGH_CAT); + GOTO(next, loghandle); + } + + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + if (llh == NULL || !llog_is_full(loghandle)) { + up_read(&cathandle->lgh_lock); + RETURN(loghandle); + } else { + up_write(&loghandle->lgh_lock); + } + } + up_read(&cathandle->lgh_lock); + + /* time to use next log */ + + /* first, we have to make sure the state hasn't changed */ + down_write_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + if (llh == NULL || !llog_is_full(loghandle)) + GOTO(out_unlock, loghandle); + else + up_write(&loghandle->lgh_lock); + } + +next: + /* Sigh, the chd_next_log and chd_current_log is initialized + * in declare phase, and we do not serialize the catlog + * accessing, so it might be possible the llog creation + * thread (see llog_cat_declare_add_rec()) did not create + * llog successfully, then the following thread might + * meet this situation. */ + if (IS_ERR_OR_NULL(cathandle->u.chd.chd_next_log)) { + CERROR("%s: next log does not exist!\n", + loghandle2name(cathandle)); + loghandle = ERR_PTR(-EIO); + if (cathandle->u.chd.chd_next_log == NULL) { + /* Store the error in chd_next_log, so + * the following process can get correct + * failure value */ + cathandle->u.chd.chd_next_log = loghandle; + } + GOTO(out_unlock, loghandle); + } + + CDEBUG(D_INODE, "use next log\n"); + + loghandle = cathandle->u.chd.chd_next_log; + cathandle->u.chd.chd_current_log = loghandle; + cathandle->u.chd.chd_next_log = NULL; + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + +out_unlock: + up_write(&cathandle->lgh_lock); + LASSERT(loghandle); + RETURN(loghandle); +} + +/* Add a single record to the recovery log(s) using a catalog + * Returns as llog_write_record + * + * Assumes caller has already pushed us into the kernel context. + */ +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + struct thandle *th) +{ + struct llog_handle *loghandle; + int rc, retried = 0; + ENTRY; + + LASSERT(rec->lrh_len <= cathandle->lgh_ctxt->loc_chunk_size); + +retry: + loghandle = llog_cat_current_log(cathandle, th); + if (IS_ERR(loghandle)) + RETURN(PTR_ERR(loghandle)); + + /* loghandle is already locked by llog_cat_current_log() for us */ + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, th); + if (rc < 0) { + up_write(&loghandle->lgh_lock); + /* nobody should be trying to use this llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == loghandle) + cathandle->u.chd.chd_current_log = NULL; + up_write(&cathandle->lgh_lock); + RETURN(rc); + } + } + /* now let's try to add the record */ + rc = llog_write_rec(env, loghandle, rec, reccookie, LLOG_NEXT_IDX, th); + if (rc < 0) { + CDEBUG_LIMIT(rc == -ENOSPC ? 
D_HA : D_ERROR, + "llog_write_rec %d: lh=%p\n", rc, loghandle); + /* -ENOSPC is returned if no empty records left + * and when it's lack of space on the stogage. + * there is no point to try again if it's the second + * case. many callers (like llog test) expect ENOSPC, + * so we preserve this error code, but look for the + * actual cause here */ + if (rc == -ENOSPC && llog_is_full(loghandle)) + rc = -ENOBUFS; + } + up_write(&loghandle->lgh_lock); + + if (rc == -ENOBUFS) { + if (retried++ == 0) + GOTO(retry, rc); + CERROR("%s: error on 2nd llog: rc = %d\n", + loghandle2name(cathandle), rc); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_add_rec); + +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th) +{ + int rc; + + ENTRY; + +start: + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + rc = llog_cat_prep_log(env, cathandle, + &cathandle->u.chd.chd_current_log, th); + if (rc) + GOTO(unlock, rc); + + rc = llog_cat_prep_log(env, cathandle, &cathandle->u.chd.chd_next_log, + th); + if (rc) + GOTO(unlock, rc); + + rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log, + rec, -1, th); + if (rc == -ESTALE && dt_object_remote(cathandle->lgh_obj)) { + up_read(&cathandle->lgh_lock); + rc = llog_cat_refresh(env, cathandle); + if (rc) + RETURN(rc); + goto start; + } + +#if 0 + /* + * XXX: we hope for declarations made for existing llog this might be + * not correct with some backends where declarations are expected + * against specific object like ZFS with full debugging enabled. + */ + rc = llog_declare_write_rec(env, cathandle->u.chd.chd_next_log, rec, -1, + th); +#endif +unlock: + up_read(&cathandle->lgh_lock); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_declare_add_rec); + +int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie) +{ + struct llog_ctxt *ctxt; + struct dt_device *dt; + struct thandle *th = NULL; + int rc; + + ctxt = cathandle->lgh_ctxt; + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + + LASSERT(cathandle->lgh_obj != NULL); + dt = lu2dt_dev(cathandle->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_cat_declare_add_rec(env, cathandle, rec, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out_trans, rc); + rc = llog_cat_add_rec(env, cathandle, rec, reccookie, th); +out_trans: + dt_trans_stop(env, dt, th); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_add); + +int llog_cat_cancel_arr_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_logid *lgl, int count, int *index) +{ + struct llog_handle *loghandle; + int rc; + + ENTRY; + rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl); + if (rc) { + CDEBUG(D_HA, "%s: cannot find llog for handle "DFID":%x" + ": rc = %d\n", loghandle2name(cathandle), + PFID(&lgl->lgl_oi.oi_fid), lgl->lgl_ogen, rc); + RETURN(rc); + } + + if ((cathandle->lgh_ctxt->loc_flags & + LLOG_CTXT_FLAG_NORMAL_FID) && !llog_exist(loghandle)) { + /* For update log, some of loghandles of cathandle + * might not exist because remote llog creation might + * be failed, so let's skip the record cancellation + * for these non-exist llogs. 
+ */ + rc = -ENOENT; + CDEBUG(D_HA, "%s: llog "DFID":%x does not exist" + ": rc = %d\n", loghandle2name(cathandle), + PFID(&lgl->lgl_oi.oi_fid), lgl->lgl_ogen, rc); + + llog_handle_put(env, loghandle); + RETURN(rc); + } + + rc = llog_cancel_arr_rec(env, loghandle, count, index); + if (rc == LLOG_DEL_PLAIN) { /* log has been destroyed */ + int cat_index; + + cat_index = loghandle->u.phd.phd_cookie.lgc_index; + rc = llog_cat_cleanup(env, cathandle, loghandle, cat_index); + if (rc) + CERROR("%s: fail to cancel catalog record: rc = %d\n", + loghandle2name(cathandle), rc); + rc = 0; + + } + llog_handle_put(env, loghandle); + + if (rc) + CERROR("%s: fail to cancel %d llog-records: rc = %d\n", + loghandle2name(cathandle), count, rc); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_cancel_arr_rec); + +/* For each cookie in the cookie array, we clear the log in-use bit and either: + * - the log is empty, so mark it free in the catalog header and delete it + * - the log is not empty, just write out the log header + * + * The cookies may be in different log files, so we need to get new logs + * each time. + * + * Assumes caller has already pushed us into the kernel context. + */ +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies) +{ + int i, rc = 0, failed = 0; + + ENTRY; + + for (i = 0; i < count; i++, cookies++) { + int lrc; + + lrc = llog_cat_cancel_arr_rec(env, cathandle, &cookies->lgc_lgl, + 1, &cookies->lgc_index); + if (lrc) { + failed++; + if (!rc) + rc = lrc; + } + } + if (failed) + CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n", + loghandle2name(cathandle), failed, count, rc); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_cancel_records); + +static int llog_cat_process_common(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, + struct llog_handle **llhp) +{ + struct llog_logid_rec *lir = container_of(rec, typeof(*lir), lid_hdr); + struct llog_log_hdr *hdr; + int rc; + + ENTRY; + if (rec->lrh_type != le32_to_cpu(LLOG_LOGID_MAGIC)) { + rc = -EINVAL; + CWARN("%s: invalid record in catalog "DFID":%x: rc = %d\n", + loghandle2name(cat_llh), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid), + cat_llh->lgh_id.lgl_ogen, rc); + RETURN(rc); + } + CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog "DFID"\n", + PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, + le32_to_cpu(rec->lrh_index), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + + rc = llog_cat_id2handle(env, cat_llh, llhp, &lir->lid_id); + if (rc) { + /* After a server crash, a stub of index record in catlog could + * be kept, because plain log destroy + catlog index record + * deletion are not atomic. So we end up with an index but no + * actual record. Destroy the index and move on. 
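+		 * Returning LLOG_DEL_RECORD lets the caller clear the stale
+		 * catalog entry via llog_cat_cleanup().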
*/ + if (rc == -ENOENT || rc == -ESTALE) + rc = LLOG_DEL_RECORD; + else if (rc) + CWARN("%s: can't find llog handle "DFID":%x: rc = %d\n", + loghandle2name(cat_llh), + PFID(&lir->lid_id.lgl_oi.oi_fid), + lir->lid_id.lgl_ogen, rc); + + RETURN(rc); + } + + /* clean old empty llogs, do not consider current llog in use */ + /* ignore remote (lgh_obj == NULL) llogs */ + hdr = (*llhp)->lgh_hdr; + if ((hdr->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + hdr->llh_count == 1 && cat_llh->lgh_obj != NULL && + *llhp != cat_llh->u.chd.chd_current_log && + *llhp != cat_llh->u.chd.chd_next_log) { + rc = llog_destroy(env, *llhp); + if (rc) + CWARN("%s: can't destroy empty log "DFID": rc = %d\n", + loghandle2name((*llhp)), + PFID(&lir->lid_id.lgl_oi.oi_fid), rc); + rc = LLOG_DEL_PLAIN; + } + + RETURN(rc); +} + +static int llog_cat_process_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_handle *llh = NULL; + int rc; + + ENTRY; + rc = llog_cat_process_common(env, cat_llh, rec, &llh); + if (rc) + GOTO(out, rc); + + if (rec->lrh_index < d->lpd_startcat) { + /* Skip processing of the logs until startcat */ + rc = 0; + } else if (d->lpd_startidx > 0) { + struct llog_process_cat_data cd; + + cd.lpcd_read_mode = LLOG_READ_MODE_NORMAL; + cd.lpcd_first_idx = d->lpd_startidx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + &cd, false); + /* Continue processing the next log from idx 0 */ + d->lpd_startidx = 0; + } else { + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + NULL, false); + } + if (rc == -ENOENT && (cat_llh->lgh_hdr->llh_flags & LLOG_F_RM_ON_ERR)) { + /* + * plain llog is reported corrupted, so better to just remove + * it if the caller is fine with that. + */ + CERROR("%s: remove corrupted/missing llog "DFID"\n", + loghandle2name(cat_llh), + PFID(&llh->lgh_id.lgl_oi.oi_fid)); + rc = LLOG_DEL_PLAIN; + } + +out: + /* The empty plain log was destroyed while processing */ + if (rc == LLOG_DEL_PLAIN || rc == LLOG_DEL_RECORD) + /* clear wrong catalog entry */ + rc = llog_cat_cleanup(env, cat_llh, llh, rec->lrh_index); + + if (llh) + llog_handle_put(env, llh); + + RETURN(rc); +} + +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cat_cb, + llog_cb_t cb, void *data, int startcat, + int startidx, bool fork) +{ + struct llog_process_data d; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + + ENTRY; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + d.lpd_startcat = (startcat == LLOG_CAT_FIRST ? 0 : startcat); + d.lpd_startidx = startidx; + + if (llh->llh_cat_idx >= cat_llh->lgh_last_idx && + llh->llh_count > 1) { + struct llog_process_cat_data cd = { + .lpcd_read_mode = LLOG_READ_MODE_NORMAL + }; + + CWARN("%s: catlog "DFID" crosses index zero\n", + loghandle2name(cat_llh), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + /*startcat = 0 is default value for general processing */ + if ((startcat != LLOG_CAT_FIRST && + startcat >= llh->llh_cat_idx) || !startcat) { + /* processing the catalog part at the end */ + cd.lpcd_first_idx = (startcat ? startcat : + llh->llh_cat_idx); + if (OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS)) + cd.lpcd_last_idx = cfs_fail_val; + else + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, cat_llh, cat_cb, + &d, &cd, fork); + /* Reset the startcat becasue it has already reached + * catalog bottom. 
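+			 * The wrapped part at the head of the catalog is then
+			 * processed from index 0 up to lgh_last_idx.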
+ */ + startcat = 0; + d.lpd_startcat = 0; + if (rc != 0) + RETURN(rc); + } + /* processing the catalog part at the begining */ + cd.lpcd_first_idx = (startcat == LLOG_CAT_FIRST) ? 0 : startcat; + /* Note, the processing will stop at the lgh_last_idx value, + * and it could be increased during processing. So records + * between current lgh_last_idx and lgh_last_idx in future + * would left unprocessed. + */ + cd.lpcd_last_idx = cat_llh->lgh_last_idx; + rc = llog_process_or_fork(env, cat_llh, cat_cb, + &d, &cd, fork); + } else { + rc = llog_process_or_fork(env, cat_llh, cat_cb, + &d, NULL, fork); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_process_or_fork); + +int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, int startidx) +{ + return llog_cat_process_or_fork(env, cat_llh, llog_cat_process_cb, + cb, data, startcat, startidx, false); +} +EXPORT_SYMBOL(llog_cat_process); + +static int llog_cat_size_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_handle *llh = NULL; + __u64 *cum_size = d->lpd_data; + __u64 size; + int rc; + + ENTRY; + rc = llog_cat_process_common(env, cat_llh, rec, &llh); + + if (rc == LLOG_DEL_PLAIN) { + /* empty log was deleted, don't count it */ + rc = llog_cat_cleanup(env, cat_llh, llh, + llh->u.phd.phd_cookie.lgc_index); + } else if (rc == LLOG_DEL_RECORD) { + /* clear wrong catalog entry */ + rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); + } else { + size = llog_size(env, llh); + *cum_size += size; + + CDEBUG(D_INFO, "Add llog entry "DFID" size=%llu, tot=%llu\n", + PFID(&llh->lgh_id.lgl_oi.oi_fid), size, *cum_size); + } + + if (llh != NULL) + llog_handle_put(env, llh); + + RETURN(0); +} + +__u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh) +{ + __u64 size = llog_size(env, cat_llh); + + llog_cat_process_or_fork(env, cat_llh, llog_cat_size_cb, + NULL, &size, 0, 0, false); + + return size; +} +EXPORT_SYMBOL(llog_cat_size); + +/* currently returns the number of "free" entries in catalog, + * ie the available entries for a new plain LLOG file creation, + * even if catalog has wrapped + */ +__u32 llog_cat_free_space(struct llog_handle *cat_llh) +{ + /* simulate almost full Catalog */ + if (OBD_FAIL_CHECK(OBD_FAIL_CAT_FREE_RECORDS)) + return cfs_fail_val; + + if (cat_llh->lgh_hdr->llh_count == 1) + return LLOG_HDR_BITMAP_SIZE(cat_llh->lgh_hdr) - 1; + + if (cat_llh->lgh_last_idx > cat_llh->lgh_hdr->llh_cat_idx) + return LLOG_HDR_BITMAP_SIZE(cat_llh->lgh_hdr) - 1 + + cat_llh->lgh_hdr->llh_cat_idx - cat_llh->lgh_last_idx; + + /* catalog is presently wrapped */ + return cat_llh->lgh_hdr->llh_cat_idx - cat_llh->lgh_last_idx; +} +EXPORT_SYMBOL(llog_cat_free_space); + +static int llog_cat_reverse_process_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_handle *llh; + int rc; + + ENTRY; + rc = llog_cat_process_common(env, cat_llh, rec, &llh); + + /* The empty plain log was destroyed while processing */ + if (rc == LLOG_DEL_PLAIN) { + rc = llog_cat_cleanup(env, cat_llh, llh, + llh->u.phd.phd_cookie.lgc_index); + } else if (rc == LLOG_DEL_RECORD) { + /* clear wrong catalog entry */ + rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); + } + if (rc) + RETURN(rc); + + rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL); + + /* The empty plain was 
destroyed while processing */ + if (rc == LLOG_DEL_PLAIN) + rc = llog_cat_cleanup(env, cat_llh, llh, + llh->u.phd.phd_cookie.lgc_index); + + llog_handle_put(env, llh); + RETURN(rc); +} + +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, + llog_cb_t cb, void *data) +{ + struct llog_process_data d; + struct llog_process_cat_data cd; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + ENTRY; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + cd.lpcd_read_mode = LLOG_READ_MODE_NORMAL; + d.lpd_data = data; + d.lpd_cb = cb; + + if (llh->llh_cat_idx >= cat_llh->lgh_last_idx && + llh->llh_count > 1) { + CWARN("%s: catalog "DFID" crosses index zero\n", + loghandle2name(cat_llh), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + + cd.lpcd_first_idx = 0; + cd.lpcd_last_idx = cat_llh->lgh_last_idx; + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, &cd); + if (rc != 0) + RETURN(rc); + + cd.lpcd_first_idx = le32_to_cpu(llh->llh_cat_idx); + cd.lpcd_last_idx = 0; + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, &cd); + } else { + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, NULL); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_reverse_process); + +static int llog_cat_set_first_idx(struct llog_handle *cathandle, int idx) +{ + struct llog_log_hdr *llh = cathandle->lgh_hdr; + int bitmap_size; + + ENTRY; + + bitmap_size = LLOG_HDR_BITMAP_SIZE(llh); + /* + * The llh_cat_idx equals to the first used index minus 1 + * so if we canceled the first index then llh_cat_idx + * must be renewed. + */ + if (llh->llh_cat_idx == (idx - 1)) { + llh->llh_cat_idx = idx; + + while (idx != cathandle->lgh_last_idx) { + idx = (idx + 1) % bitmap_size; + if (!test_bit_le(idx, LLOG_HDR_BITMAP(llh))) { + /* update llh_cat_idx for each unset bit, + * expecting the next one is set */ + llh->llh_cat_idx = idx; + } else if (idx == 0) { + /* skip header bit */ + llh->llh_cat_idx = 0; + continue; + } else { + /* the first index is found */ + break; + } + } + + CDEBUG(D_HA, "catlog "DFID" first idx %u, last_idx %u\n", + PFID(&cathandle->lgh_id.lgl_oi.oi_fid), + llh->llh_cat_idx, cathandle->lgh_last_idx); + } + + RETURN(0); +} + +/* Cleanup deleted plain llog traces from catalog */ +int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle *loghandle, int index) +{ + int rc; + struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0}; + + LASSERT(index); + if (loghandle != NULL) { + /* remove destroyed llog from catalog list and + * chd_current_log variable */ + fid = loghandle->lgh_id.lgl_oi.oi_fid; + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == loghandle) + cathandle->u.chd.chd_current_log = NULL; + list_del_init(&loghandle->u.phd.phd_entry); + up_write(&cathandle->lgh_lock); + LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index || + loghandle->u.phd.phd_cookie.lgc_index == 0); + /* llog was opened and keep in a list, close it now */ + llog_close(env, loghandle); + } + + /* do not attempt to cleanup on-disk llog if on client side */ + if (cathandle->lgh_obj == NULL) + return 0; + + /* remove plain llog entry from catalog by index */ + llog_cat_set_first_idx(cathandle, index); + rc = llog_cancel_rec(env, cathandle, index); + if (rc == 0) + CDEBUG(D_HA, + "cancel plain log "DFID" at index %u of catalog "DFID"\n", + PFID(&fid), index, + PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); + return rc; +} diff --git 
a/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h b/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h new file mode 100644 index 0000000000000..096f595e75102 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h @@ -0,0 +1,102 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LLOG_INTERNAL_H__ +#define __LLOG_INTERNAL_H__ + +#include + +struct llog_process_info { + struct llog_handle *lpi_loghandle; + llog_cb_t lpi_cb; + void *lpi_cbdata; + void *lpi_catdata; + int lpi_rc; + struct completion lpi_completion; + const struct lu_env *lpi_env; + struct task_struct *lpi_reftask; +}; + +struct llog_thread_info { + struct lu_attr lgi_attr; + struct lu_fid lgi_fid; + struct dt_object_format lgi_dof; + struct lu_buf lgi_buf; + loff_t lgi_off; + struct llog_logid_rec lgi_logid; + struct dt_insert_rec lgi_dt_rec; + struct lu_seq_range lgi_range; + struct llog_cookie lgi_cookie; + struct obd_statfs lgi_statfs; + char lgi_name[32]; +}; + +extern struct lu_context_key llog_thread_key; + +static inline struct llog_thread_info *llog_info(const struct lu_env *env) +{ + struct llog_thread_info *lgi; + + lgi = lu_context_key_get(&env->le_ctx, &llog_thread_key); + LASSERT(lgi); + return lgi; +} + +int llog_info_init(void); +void llog_info_fini(void); + +struct llog_handle *llog_handle_get(struct llog_handle *loghandle); +int llog_handle_put(const struct lu_env *env, struct llog_handle *loghandle); +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid); +void llog_get_marker_cfg_flags(struct llog_rec_hdr *rec, + unsigned int *cfg_flags); +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data); +int class_config_yaml_output(struct llog_rec_hdr *rec, char *buf, int size, + unsigned int *cfg_flags, bool raw); +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork); +int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle *loghandle, int index); + +static inline struct llog_rec_hdr *llog_rec_hdr_next(struct llog_rec_hdr *rec) +{ + return (struct llog_rec_hdr *)((char *)rec + rec->lrh_len); +} +int llog_verify_record(const struct llog_handle *llh, struct llog_rec_hdr *rec); +static inline char *loghandle2name(const struct llog_handle *lgh) +{ + return 
lgh->lgh_ctxt->loc_obd->obd_name; +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c new file mode 100644 index 0000000000000..62a5b88e2e86b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c @@ -0,0 +1,551 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include +#include "llog_internal.h" + +static int str2logid(struct llog_logid *logid, char *str, int len) +{ + unsigned long long id, seq; + char *start, *end; + u32 ogen; + int rc; + + ENTRY; + start = str; + if (start[0] == '[') { + struct lu_fid *fid = &logid->lgl_oi.oi_fid; + struct lu_fid sfid; + int num; + + fid_zero(fid); + logid->lgl_ogen = 0; + num = sscanf(start + 1, SFID, RFID(fid)); + CDEBUG(D_INFO, DFID":%x\n", PFID(fid), logid->lgl_ogen); + logid_to_fid(logid, &sfid); + RETURN(num == 3 && fid_is_sane(&sfid) ? 0 : -EINVAL); + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 1, 53, 0) + /* + * logids used to be input in the form "#id#seq:ogen" before they + * were changed over to accept the FID [seq:oid:ver] format. + * This is accepted for compatibility reasons, though I doubt + * anyone is actually using this for anything. 
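+	 * Note this legacy path is only compiled in for versions prior to
+	 * 3.1.53, see the LUSTRE_VERSION_CODE check above.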
+ */ + if (start[0] != '#') + RETURN(-EINVAL); + + start++; + if (start - str >= len - 1) + RETURN(-EINVAL); + end = strchr(start, '#'); + if (end == NULL || end == start) + RETURN(-EINVAL); + + *end = '\0'; + rc = kstrtoull(start, 0, &id); + if (rc) + RETURN(rc); + + start = ++end; + if (start - str >= len - 1) + RETURN(-EINVAL); + + end = strchr(start, '#'); + if (!end || end == start) + RETURN(-EINVAL); + + *end = '\0'; + rc = kstrtoull(start, 0, &seq); + if (rc) + RETURN(rc); + + ostid_set_seq(&logid->lgl_oi, seq); + if (ostid_set_id(&logid->lgl_oi, id)) + RETURN(-EINVAL); + + start = ++end; + if (start - str >= len - 1) + RETURN(-EINVAL); + + rc = kstrtouint(start, 16, &ogen); + if (rc) + RETURN(-EINVAL); + logid->lgl_ogen = ogen; + + RETURN(0); +#else + RETURN(-EINVAL); +#endif +} + +static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct obd_ioctl_data *ioc_data = data; + static int l, remains; + static long from, to; + static char *out; + int cur_index; + int rc = 0; + + ENTRY; + if (ioc_data && ioc_data->ioc_inllen1 > 0) { + l = 0; + remains = ioc_data->ioc_inllen4 + + round_up(ioc_data->ioc_inllen1, 8) + + round_up(ioc_data->ioc_inllen2, 8) + + round_up(ioc_data->ioc_inllen3, 8); + + rc = kstrtol(ioc_data->ioc_inlbuf2, 0, &from); + if (rc) + RETURN(rc); + + rc = kstrtol(ioc_data->ioc_inlbuf3, 0, &to); + if (rc) + RETURN(rc); + + ioc_data->ioc_inllen1 = 0; + out = ioc_data->ioc_bulk; + } + + cur_index = rec->lrh_index; + if (cur_index < from) + RETURN(0); + if (to > 0 && cur_index > to) + RETURN(-LLOG_EEMPTY); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *loghandle; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + l = snprintf(out, remains, + "[index]: %05d [type]: %02x [len]: %04d failed\n", + cur_index, rec->lrh_type, + rec->lrh_len); + } + if (handle->lgh_ctxt == NULL) + RETURN(-EOPNOTSUPP); + rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id); + if (rc) { + CDEBUG(D_IOCTL, "cannot find log "DFID":%x\n", + PFID(&lir->lid_id.lgl_oi.oi_fid), + lir->lid_id.lgl_ogen); + RETURN(rc); + } + rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL); + llog_handle_put(env, loghandle); + } else { + bool ok; + + switch (rec->lrh_type) { + case OST_SZ_REC: + case MDS_UNLINK_REC: + case MDS_UNLINK64_REC: + case MDS_SETATTR64_REC: + case OBD_CFG_REC: + case LLOG_GEN_REC: + case LLOG_HDR_MAGIC: + ok = true; + break; + default: + ok = false; + } + + l = snprintf(out, remains, "[index]: %05d [type]: " + "%02x [len]: %04d %s\n", + cur_index, rec->lrh_type, rec->lrh_len, + ok ? 
"ok" : "failed"); + out += l; + remains -= l; + if (remains <= 0) { + CERROR("%s: no space to print log records\n", + handle->lgh_ctxt->loc_obd->obd_name); + RETURN(-LLOG_EEMPTY); + } + } + RETURN(rc); +} + +struct llog_print_data { + struct obd_ioctl_data *lprd_data; + unsigned int lprd_cfg_flags; + bool lprd_raw; +}; + +#define MARKER_DIFF 10 +static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_print_data *lprd = data; + struct obd_ioctl_data *ioc_data = lprd->lprd_data; + static int l, remains; + static long from, to; + static char *out; + int cur_index; + int rc; + + ENTRY; + if (ioc_data && ioc_data->ioc_inllen1 > 0) { + l = 0; + remains = ioc_data->ioc_inllen4 + + round_up(ioc_data->ioc_inllen1, 8) + + round_up(ioc_data->ioc_inllen2, 8) + + round_up(ioc_data->ioc_inllen3, 8); + + rc = kstrtol(ioc_data->ioc_inlbuf2, 0, &from); + if (rc) + RETURN(rc); + + rc = kstrtol(ioc_data->ioc_inlbuf3, 0, &to); + if (rc) + RETURN(rc); + + out = ioc_data->ioc_bulk; + ioc_data->ioc_inllen1 = 0; + } + + cur_index = rec->lrh_index; + if (from > MARKER_DIFF && cur_index >= from - MARKER_DIFF && + cur_index < from) { + /* LU-15706: try to remember the marker cfg_flag that the "from" + * is using, in case that the "from" record doesn't know its + * "SKIP" or not flag. + */ + llog_get_marker_cfg_flags(rec, &lprd->lprd_cfg_flags); + } + if (cur_index < from) + RETURN(0); + if (to > 0 && cur_index > to) + RETURN(-LLOG_EEMPTY); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + l = snprintf(out, remains, + "[index]: %05d [logid]: "DFID":%x\n", + cur_index, PFID(&lir->lid_id.lgl_oi.oi_fid), + lir->lid_id.lgl_ogen); + } else if (rec->lrh_type == OBD_CFG_REC) { + int rc; + + rc = class_config_yaml_output(rec, out, remains, + &lprd->lprd_cfg_flags, + lprd->lprd_raw); + if (rc < 0) + RETURN(rc); + l = rc; + } else { + l = snprintf(out, remains, + "[index]: %05d [type]: %02x [len]: %04d\n", + cur_index, rec->lrh_type, rec->lrh_len); + } + out += l; + remains -= l; + if (remains <= 0) { + CERROR("not enough space for print log records\n"); + RETURN(-LLOG_EEMPTY); + } + + RETURN(0); +} +static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat, + struct llog_logid *logid) +{ + struct llog_handle *log; + int rc; + + ENTRY; + + rc = llog_cat_id2handle(env, cat, &log, logid); + if (rc) { + CDEBUG(D_IOCTL, "cannot find log "DFID":%x\n", + PFID(&logid->lgl_oi.oi_fid), logid->lgl_ogen); + RETURN(-ENOENT); + } + + rc = llog_destroy(env, log); + if (rc) { + CDEBUG(D_IOCTL, "cannot destroy log "DFID":%x\n", + PFID(&logid->lgl_oi.oi_fid), logid->lgl_ogen); + GOTO(out, rc); + } + llog_cat_cleanup(env, cat, log, log->u.phd.phd_cookie.lgc_index); +out: + llog_handle_put(env, log); + RETURN(rc); + +} + +static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + int rc; + + ENTRY; + if (rec->lrh_type != LLOG_LOGID_MAGIC) + RETURN(-EINVAL); + rc = llog_remove_log(env, handle, &lir->lid_id); + + RETURN(rc); +} + + +int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, + struct obd_ioctl_data *data) +{ + struct llog_logid logid; + int rc = 0; + struct llog_handle *handle = NULL; + char *logname, start; + + ENTRY; + 
+ logname = data->ioc_inlbuf1; + start = logname[0]; + if (start == '#' || start == '[') { + rc = str2logid(&logid, logname, data->ioc_inllen1); + if (rc) + RETURN(rc); + rc = llog_open(env, ctxt, &handle, &logid, NULL, + LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + } else if (start == '$' || isalpha(start) || isdigit(start)) { + if (start == '$') + logname++; + + rc = llog_open(env, ctxt, &handle, NULL, logname, + LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + } else { + rc = -EINVAL; + CDEBUG(D_INFO, "%s: invalid log name '%s': rc = %d\n", + ctxt->loc_obd->obd_name, logname, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, handle, 0, NULL); + if (rc) + GOTO(out_close, rc = -ENOENT); + + switch (cmd) { + case OBD_IOC_LLOG_INFO: { + int l; + int remains = data->ioc_inllen2 + + cfs_size_round(data->ioc_inllen1); + char *out = data->ioc_bulk; + + l = snprintf(out, remains, + "logid: "DFID":%x\n" + "flags: %x (%s)\n" + "records_count: %d\n" + "last_index: %d\n", + PFID(&handle->lgh_id.lgl_oi.oi_fid), + handle->lgh_id.lgl_ogen, + handle->lgh_hdr->llh_flags, + handle->lgh_hdr->llh_flags & + LLOG_F_IS_CAT ? "cat" : "plain", + handle->lgh_hdr->llh_count, + handle->lgh_last_idx); + out += l; + remains -= l; + if (remains <= 0) { + CERROR("%s: not enough space for log header info\n", + ctxt->loc_obd->obd_name); + rc = -ENOSPC; + } + break; + } + case OBD_IOC_LLOG_CHECK: + LASSERT(data->ioc_inllen1 > 0); + rc = llog_process(env, handle, llog_check_cb, data, NULL); + if (rc == -LLOG_EEMPTY) + rc = 0; + else if (rc) + GOTO(out_close, rc); + break; + case OBD_IOC_LLOG_PRINT: { + struct llog_print_data lprd = { + .lprd_data = data, + .lprd_raw = data->ioc_u32_1, + }; + + LASSERT(data->ioc_inllen1 > 0); + rc = llog_process(env, handle, llog_print_cb, &lprd, NULL); + if (rc == -LLOG_EEMPTY) + rc = 0; + else if (rc) + GOTO(out_close, rc); + break; + } + case OBD_IOC_LLOG_CANCEL: { + struct llog_cookie cookie; + struct llog_logid plain; + u32 lgc_index; + + rc = kstrtouint(data->ioc_inlbuf3, 0, &lgc_index); + if (rc) + GOTO(out_close, rc); + cookie.lgc_index = lgc_index; + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { + rc = llog_cancel_rec(env, handle, cookie.lgc_index); + GOTO(out_close, rc); + } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) { + GOTO(out_close, rc = -EINVAL); + } + + if (data->ioc_inlbuf2 == NULL) /* catalog but no logid */ + GOTO(out_close, rc = -ENOTTY); + + rc = str2logid(&plain, data->ioc_inlbuf2, data->ioc_inllen2); + if (rc) + GOTO(out_close, rc); + cookie.lgc_lgl = plain; + rc = llog_cat_cancel_records(env, handle, 1, &cookie); + if (rc) + GOTO(out_close, rc); + break; + } + case OBD_IOC_LLOG_REMOVE: { + struct llog_logid plain; + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { + rc = llog_destroy(env, handle); + GOTO(out_close, rc); + } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) { + GOTO(out_close, rc = -EINVAL); + } + + if (data->ioc_inlbuf2) { + /* remove indicate log from the catalog */ + rc = str2logid(&plain, data->ioc_inlbuf2, + data->ioc_inllen2); + if (rc) + GOTO(out_close, rc); + rc = llog_remove_log(env, handle, &plain); + } else { + /* remove all the log of the catalog */ + rc = llog_process(env, handle, llog_delete_cb, NULL, + NULL); + if (rc) + GOTO(out_close, rc); + } + break; + } + default: + CERROR("%s: Unknown ioctl cmd %#x\n", + ctxt->loc_obd->obd_name, cmd); + GOTO(out_close, rc = -ENOTTY); + } + +out_close: + if (handle->lgh_hdr && + handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + llog_cat_close(env, handle); + else + 
llog_close(env, handle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_ioctl); + +int llog_catalog_list(const struct lu_env *env, struct dt_device *d, + int count, struct obd_ioctl_data *data, + const struct lu_fid *fid) +{ + int size, i; + struct llog_catid *idarray; + struct llog_logid *id; + char *out; + int l, remains, rc = 0; + + ENTRY; + + if (count == 0) { /* get total number of logs */ + rc = llog_osd_get_cat_list(env, d, 0, 0, NULL, fid); + if (rc < 0) + RETURN(rc); + count = rc; + } + + size = sizeof(*idarray) * count; + + OBD_ALLOC_LARGE(idarray, size); + if (!idarray) + RETURN(-ENOMEM); + + rc = llog_osd_get_cat_list(env, d, 0, count, idarray, fid); + if (rc) + GOTO(out, rc); + + out = data->ioc_bulk; + remains = data->ioc_inllen1; + /* OBD_FAIL: fetch the catalog records from the specified one */ + if (OBD_FAIL_CHECK(OBD_FAIL_CATLIST)) + data->ioc_count = cfs_fail_val - 1; + for (i = data->ioc_count; i < count; i++) { + id = &idarray[i].lci_logid; + l = snprintf(out, remains, "catalog_log: "DFID":%x\n", + PFID(&id->lgl_oi.oi_fid), id->lgl_ogen); + out += l; + remains -= l; + if (remains <= 0) { + if (remains < 0) { + /* the print is not complete */ + remains += l; + data->ioc_bulk[out - data->ioc_bulk - l] = '\0'; + data->ioc_count = i; + } else { + data->ioc_count = i++; + } + goto out; + } + } + data->ioc_count = 0; +out: + OBD_FREE_LARGE(idarray, size); + RETURN(rc); +} +EXPORT_SYMBOL(llog_catalog_list); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c new file mode 100644 index 0000000000000..0d05e64047835 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c @@ -0,0 +1,248 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include +#include +#include "llog_internal.h" + +/* helper functions for calling the llog obd methods */ +static struct llog_ctxt* llog_new_ctxt(struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + OBD_ALLOC_PTR(ctxt); + if (!ctxt) + return NULL; + + ctxt->loc_obd = obd; + atomic_set(&ctxt->loc_refcount, 1); + + return ctxt; +} + +static void llog_ctxt_destroy(struct llog_ctxt *ctxt) +{ + if (ctxt->loc_exp) { + class_export_put(ctxt->loc_exp); + ctxt->loc_exp = NULL; + } + if (ctxt->loc_imp) { + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = NULL; + } + OBD_FREE_PTR(ctxt); +} + +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct obd_llog_group *olg = ctxt->loc_olg; + struct obd_device *obd; + int rc = 0; + + spin_lock(&olg->olg_lock); + if (!atomic_dec_and_test(&ctxt->loc_refcount)) { + spin_unlock(&olg->olg_lock); + return rc; + } + olg->olg_ctxts[ctxt->loc_idx] = NULL; + spin_unlock(&olg->olg_lock); + + obd = ctxt->loc_obd; + spin_lock(&obd->obd_dev_lock); + /* sync with llog ctxt user thread */ + spin_unlock(&obd->obd_dev_lock); + + /* + * obd->obd_starting is needed for the case of cleanup + * in error case while obd is starting up. + */ + LASSERTF(obd->obd_starting == 1 || + obd->obd_stopping == 1 || obd->obd_set_up == 0, + "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, + !!obd->obd_stopping, !!obd->obd_set_up); + + /* cleanup the llog ctxt here */ + if (ctxt->loc_logops->lop_cleanup) + rc = ctxt->loc_logops->lop_cleanup(env, ctxt); + + llog_ctxt_destroy(ctxt); + wake_up(&olg->olg_waitq); + return rc; +} +EXPORT_SYMBOL(__llog_ctxt_put); + +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct obd_llog_group *olg; + int rc, idx; + + ENTRY; + + LASSERT(ctxt != NULL); + LASSERT(ctxt != LP_POISON); + + olg = ctxt->loc_olg; + LASSERT(olg != NULL); + LASSERT(olg != LP_POISON); + + idx = ctxt->loc_idx; + + /* + * Banlance the ctxt get when calling llog_cleanup() + */ + LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON); + LASSERT(atomic_read(&ctxt->loc_refcount) > 1); + llog_ctxt_put(ctxt); + + /* + * Try to free the ctxt. 
+ */ + rc = __llog_ctxt_put(env, ctxt); + if (rc) + CERROR("Error %d while cleaning up ctxt %p\n", + rc, ctxt); + + l_wait_event_abortable(olg->olg_waitq, + llog_group_ctxt_null(olg, idx)); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cleanup); + +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, const struct llog_operations *op) +{ + struct llog_ctxt *ctxt; + int rc = 0; + + ENTRY; + + if (index < 0 || index >= LLOG_MAX_CTXTS) + RETURN(-EINVAL); + + LASSERT(olg != NULL); + + ctxt = llog_new_ctxt(obd); + if (!ctxt) + RETURN(-ENOMEM); + + ctxt->loc_obd = obd; + ctxt->loc_olg = olg; + ctxt->loc_idx = index; + ctxt->loc_logops = op; + mutex_init(&ctxt->loc_mutex); + if (disk_obd != NULL) + ctxt->loc_exp = class_export_get(disk_obd->obd_self_export); + else + ctxt->loc_exp = class_export_get(obd->obd_self_export); + + ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED; + ctxt->loc_chunk_size = LLOG_MIN_CHUNK_SIZE; + + rc = llog_group_set_ctxt(olg, ctxt, index); + if (rc) { + llog_ctxt_destroy(ctxt); + if (rc == -EEXIST) { + ctxt = llog_group_get_ctxt(olg, index); + if (ctxt) { + CDEBUG(D_CONFIG, "%s: ctxt %d already set up\n", + obd->obd_name, index); + LASSERT(ctxt->loc_olg == olg); + LASSERT(ctxt->loc_obd == obd); + if (disk_obd != NULL) + LASSERT(ctxt->loc_exp == + disk_obd->obd_self_export); + else + LASSERT(ctxt->loc_exp == + obd->obd_self_export); + LASSERT(ctxt->loc_logops == op); + llog_ctxt_put(ctxt); + } + rc = 0; + } + RETURN(rc); + } + + if (op->lop_setup) { + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP)) + rc = -EOPNOTSUPP; + else + rc = op->lop_setup(env, obd, olg, index, disk_obd); + } + + if (rc) { + CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n", + obd->obd_name, index, op->lop_setup, rc); + llog_group_clear_ctxt(olg, index); + llog_ctxt_destroy(ctxt); + } else { + CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", + obd->obd_name, index); + ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_setup); + +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags) +{ + int rc = 0; + + ENTRY; + if (ctxt && ctxt->loc_logops->lop_sync) + rc = ctxt->loc_logops->lop_sync(ctxt, exp, flags); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_sync); + +/* context key constructor/destructor: llog_key_init, llog_key_fini */ +LU_KEY_INIT_FINI(llog, struct llog_thread_info); +/* context key: llog_thread_key */ +LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL); +LU_KEY_INIT_GENERIC(llog); + +int llog_info_init(void) +{ + llog_key_init_generic(&llog_thread_key, NULL); + lu_context_key_register(&llog_thread_key); + return 0; +} + +void llog_info_fini(void) +{ + lu_context_key_degister(&llog_thread_key); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c new file mode 100644 index 0000000000000..f58bb59982783 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c @@ -0,0 +1,2242 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +/* + * lustre/obdclass/llog_osd.c + * + * Low level llog routines on top of OSD API + * + * This file provides set of methods for llog operations on top of + * dt_device. It contains all supported llog_operations interfaces and + * supplimental functions. + * + * Author: Alexey Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include + +#include +#include +#include +#include +#include + +#include "llog_internal.h" +#include "local_storage.h" + +/** + * Implementation of the llog_operations::lop_declare_create + * + * This function is a wrapper over local_storage API function + * local_object_declare_create(). + * + * \param[in] env execution environment + * \param[in] los local_storage for bottom storage device + * \param[in] o dt_object to create + * \param[in] th current transaction handle + * + * \retval 0 on successful declaration of the new object + * \retval negative error if declaration was failed + */ +static int llog_osd_declare_new_object(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + return local_object_declare_create(env, los, o, &lgi->lgi_attr, + &lgi->lgi_dof, th); +} + +/** + * Implementation of the llog_operations::lop_create + * + * This function is a wrapper over local_storage API function + * local_object_create(). + * + * \param[in] env execution environment + * \param[in] los local_storage for bottom storage device + * \param[in] o dt_object to create + * \param[in] th current transaction handle + * + * \retval 0 on successful creation of the new object + * \retval negative error if creation was failed + */ +static int llog_osd_create_new_object(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + return local_object_create(env, los, o, &lgi->lgi_attr, + &lgi->lgi_dof, th); +} + +/** + * Implementation of the llog_operations::lop_exist + * + * This function checks that llog exists on storage. 
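The declare/create pair above follows the usual OSD transaction discipline: every change is first declared against a transaction handle, the transaction is started with the accumulated credits, and only then is the change executed with the same handle. A minimal sketch of how a caller might pair them, assuming the generic dt_trans_create()/dt_trans_start_local()/dt_trans_stop() helpers; the wrapper name and the trimmed error handling are illustrative only.

    /* Sketch only: how the declare/execute halves are normally paired.
     * Assumes the generic dt_trans_*() helpers; real callers add the full
     * error handling and credit adjustments. */
    static int llog_osd_create_object_example(const struct lu_env *env,
                                              struct local_oid_storage *los,
                                              struct dt_device *dev,
                                              struct dt_object *o)
    {
            struct thandle *th;
            int rc;

            th = dt_trans_create(env, dev);
            if (IS_ERR(th))
                    return PTR_ERR(th);

            /* 1) declare every change the transaction will make */
            rc = llog_osd_declare_new_object(env, los, o, th);
            if (rc)
                    goto stop;

            /* 2) start the transaction with the declared credits */
            rc = dt_trans_start_local(env, dev, th);
            if (rc)
                    goto stop;

            /* 3) execute exactly what was declared */
            rc = llog_osd_create_new_object(env, los, o, th);
    stop:
            dt_trans_stop(env, dev, th);
            return rc;
    }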
+ * + * \param[in] handle llog handle of the current llog + * + * \retval true if llog object exists and is not just destroyed + * \retval false if llog doesn't exist or just destroyed + */ +static int llog_osd_exist(struct llog_handle *handle) +{ + LASSERT(handle->lgh_obj); + return dt_object_exists(handle->lgh_obj) && !handle->lgh_destroyed; +} + +static void *rec_tail(struct llog_rec_hdr *rec) +{ + return (void *)((char *)rec + rec->lrh_len - + sizeof(struct llog_rec_tail)); +} + +/** + * Write a padding record to the llog + * + * This function writes a padding record to the end of llog. That may + * be needed if llog contains records of variable size, e.g. config logs + * or changelogs. + * The padding record just aligns llog to the llog chunk_size boundary if + * the current record doesn't fit in the remaining space. + * + * It allocates full length to avoid two separate writes for header and tail. + * Such 2-steps scheme needs extra protection and complex error handling. + * + * \param[in] env execution environment + * \param[in] o dt_object to create + * \param[in,out] off pointer to the padding start offset + * \param[in] len padding length + * \param[in] index index of the padding record in a llog + * \param[in] th current transaction handle + * + * \retval 0 on successful padding write + * \retval negative error if write failed + */ +static int llog_osd_pad(const struct lu_env *env, struct dt_object *o, + loff_t *off, int len, int index, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + int rc; + + ENTRY; + + LASSERT(th); + LASSERT(off); + LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0); + + OBD_ALLOC(rec, len); + if (rec == NULL) + RETURN(-ENOMEM); + + rec->lrh_len = len; + rec->lrh_index = index; + rec->lrh_type = LLOG_PAD_MAGIC; + + tail = rec_tail(rec); + tail->lrt_len = len; + tail->lrt_index = index; + + lgi->lgi_buf.lb_buf = rec; + lgi->lgi_buf.lb_len = len; + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) + CERROR("%s: error writing padding record: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + + OBD_FREE(rec, len); + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_read_header + * + * This function reads the current llog header from the bottom storage + * device. 
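llog_osd_pad() itself only writes the pad record; the decision to pad is made by the append path in llog_osd_write_rec() further below. A stand-alone sketch of that boundary check, not part of the patch; the constant value and the function name are illustrative.

    #include <stdint.h>
    #include <stdio.h>

    #define LLOG_MIN_REC_SIZE 24    /* illustrative stand-in for the kernel constant */

    /*
     * Mirror of the boundary check in llog_osd_write_rec(): if the space left
     * in the current chunk is not an exact fit and is too small for the record
     * plus a minimal follow-on record, a pad record of length "left" is written
     * so that records never cross a chunk boundary.
     */
    static uint32_t pad_len_needed(uint64_t off, uint32_t chunk_size,
                                   uint32_t reclen)
    {
            uint32_t left = chunk_size - (off & (chunk_size - 1));

            if (left != 0 && left != reclen &&
                left < reclen + LLOG_MIN_REC_SIZE)
                    return left;    /* pad out the rest of the chunk */
            return 0;               /* record fits, no pad record needed */
    }

    int main(void)
    {
            /* 32KB chunks, appending a 4KB record near the end of a chunk */
            printf("pad = %u\n", pad_len_needed(32768 - 100, 32768, 4096));
            printf("pad = %u\n", pad_len_needed(8192, 32768, 4096));
            return 0;
    }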
+ * + * \param[in] env execution environment + * \param[in] handle llog handle of the current llog + * + * \retval 0 on successful header read + * \retval negative error if read failed + */ +static int llog_osd_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct llog_rec_hdr *llh_hdr; + struct dt_object *o; + struct llog_thread_info *lgi; + enum llog_flag flags; + int rc; + + ENTRY; + + o = handle->lgh_obj; + LASSERT(o); + + lgi = llog_info(env); + + dt_read_lock(env, o, 0); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(unlock, rc); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + + if (lgi->lgi_attr.la_size == 0) { + CDEBUG(D_HA, "not reading header from 0-byte log\n"); + GOTO(unlock, rc = LLOG_EEMPTY); + } + + flags = handle->lgh_hdr->llh_flags; + + lgi->lgi_off = 0; + lgi->lgi_buf.lb_buf = handle->lgh_hdr; + lgi->lgi_buf.lb_len = handle->lgh_hdr_size; + rc = dt_read(env, o, &lgi->lgi_buf, &lgi->lgi_off); + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (rc < sizeof(*llh_hdr) || rc < llh_hdr->lrh_len) { + CERROR("%s: error reading "DFID" log header size %d: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), rc < 0 ? 0 : rc, + -EFAULT); + + if (rc >= 0) + rc = -EFAULT; + + GOTO(unlock, rc); + } + + if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr)) + lustre_swab_llog_hdr(handle->lgh_hdr); + + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + CERROR("%s: bad log %s "DFID" header magic: %#x " + "(expected %#x)\n", o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + GOTO(unlock, rc = -EIO); + } else if (llh_hdr->lrh_len < LLOG_MIN_CHUNK_SIZE || + llh_hdr->lrh_len > handle->lgh_hdr_size) { + CERROR("%s: incorrectly sized log %s "DFID" header: " + "%#x (expected at least %#x)\n" + "you may need to re-run lconf --write_conf.\n", + o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + llh_hdr->lrh_len, LLOG_MIN_CHUNK_SIZE); + GOTO(unlock, rc = -EIO); + } else if (LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index > + LLOG_HDR_BITMAP_SIZE(handle->lgh_hdr) || + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len != + llh_hdr->lrh_len) { + CERROR("%s: incorrectly sized log %s "DFID" tailer: " + "%#x : rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len, -EIO); + GOTO(unlock, rc = -EIO); + } + + handle->lgh_hdr->llh_flags |= (flags & LLOG_F_EXT_MASK); + handle->lgh_last_idx = LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index; + rc = 0; + +unlock: + dt_read_unlock(env, o); + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_declare_write + * + * This function declares the new record write. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in] rec llog record header. This is a real header of the full + * llog record to write. This is the beginning of buffer + * to write, the length of buffer is stored in + * \a rec::lrh_len + * \param[in] idx index of the llog record. 
If \a idx == -1 then this is + * append case, otherwise \a idx is the index of record + * to modify + * \param[in] th current transaction handle + * + * \retval 0 on successful declaration + * \retval negative error if declaration failed + */ +static int llog_osd_declare_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + int idx, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + __u32 chunk_size; + struct dt_object *o; + int rc; + + ENTRY; + + LASSERT(env); + LASSERT(th); + LASSERT(loghandle); + LASSERT(rec); + LASSERT(rec->lrh_len <= loghandle->lgh_ctxt->loc_chunk_size); + + o = loghandle->lgh_obj; + LASSERT(o); + + chunk_size = loghandle->lgh_ctxt->loc_chunk_size; + lgi->lgi_buf.lb_len = chunk_size; + lgi->lgi_buf.lb_buf = NULL; + /* each time we update header */ + rc = dt_declare_record_write(env, o, &lgi->lgi_buf, 0, + th); + if (rc || idx == 0) /* if error or just header */ + RETURN(rc); + + /** + * the pad record can be inserted so take into account double + * record size + */ + lgi->lgi_buf.lb_len = chunk_size * 2; + lgi->lgi_buf.lb_buf = NULL; + /* XXX: implement declared window or multi-chunks approach */ + rc = dt_declare_record_write(env, o, &lgi->lgi_buf, -1, th); + + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_write + * + * This function writes the new record in the llog or modify the existed one. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in] rec llog record header. This is a real header of + * the full llog record to write. This is + * the beginning of buffer to write, the length + * of buffer is stored in \a rec::lrh_len + * \param[in,out] reccookie pointer to the cookie to return back if needed. + * It is used for further cancel of this llog + * record. + * \param[in] idx index of the llog record. If \a idx == -1 then + * this is append case, otherwise \a idx is + * the index of record to modify + * \param[in] th current transaction handle + * + * \retval 0 on successful write && \a reccookie == NULL + * 1 on successful write && \a reccookie != NULL + * \retval negative error if write failed + */ +static int llog_osd_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *reccookie, + int idx, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_log_hdr *llh; + int reclen = rec->lrh_len; + int index, rc; + struct llog_rec_tail *lrt; + struct dt_object *o; + __u32 chunk_size; + size_t left; + __u32 orig_last_idx; + bool pad = false; + ENTRY; + + llh = loghandle->lgh_hdr; + o = loghandle->lgh_obj; + + chunk_size = llh->llh_hdr.lrh_len; + CDEBUG(D_OTHER, "new record %x to "DFID"\n", + rec->lrh_type, PFID(lu_object_fid(&o->do_lu))); + + if (!llog_osd_exist(loghandle)) + RETURN(-ENOENT); + + /* record length should not bigger than */ + if (reclen > loghandle->lgh_hdr->llh_hdr.lrh_len) + RETURN(-E2BIG); + + /* sanity check for fixed-records llog */ + if (idx != LLOG_HEADER_IDX && (llh->llh_flags & LLOG_F_IS_FIXSIZE)) { + LASSERT(llh->llh_size != 0); + LASSERT(llh->llh_size == reclen); + } + + /* return error if osp object is stale */ + if (idx != LLOG_HEADER_IDX && dt_object_stale(o)) + RETURN(-ESTALE); + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + RETURN(rc); + + /** + * The modification case. + * If idx set then the record with that index must be modified. 
+ * There are three cases possible: + * 1) the common case is the llog header update (idx == 0) + * 2) the llog record modification during llog process. + * This is indicated by the \a loghandle::lgh_cur_idx > 0. + * In that case the \a loghandle::lgh_cur_offset + * 3) otherwise this is assumed that llog consist of records of + * fixed size, i.e. catalog. The llog header must has llh_size + * field equal to record size. The record offset is calculated + * just by /a idx value + * + * During modification we don't need extra header update because + * the bitmap and record count are not changed. The record header + * and tail remains the same too. + */ + if (idx != LLOG_NEXT_IDX) { + /* llog can be empty only when first record is being written */ + LASSERT(ergo(idx > 0, lgi->lgi_attr.la_size > 0)); + + if (!test_bit_le(idx, LLOG_HDR_BITMAP(llh))) { + CERROR("%s: modify unset record %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx); + RETURN(-ENOENT); + } + + if (idx != rec->lrh_index) { + CERROR("%s: modify index mismatch %d %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx, + rec->lrh_index); + RETURN(-EFAULT); + } + + if (idx == LLOG_HEADER_IDX) { + /* llog header update */ + __u32 *bitmap = LLOG_HDR_BITMAP(llh); + + lgi->lgi_off = 0; + + /* If it does not indicate the bitmap index + * (reccookie == NULL), then it means update + * the whole update header. Otherwise only + * update header and bits needs to be updated, + * and in DNE cases, it will signaficantly + * shrink the RPC size. + * see distribute_txn_cancel_records()*/ + if (reccookie == NULL) { + lgi->lgi_buf.lb_len = reclen; + lgi->lgi_buf.lb_buf = rec; + rc = dt_record_write(env, o, &lgi->lgi_buf, + &lgi->lgi_off, th); + RETURN(rc); + } + + /* update the header */ + lgi->lgi_buf.lb_len = llh->llh_bitmap_offset; + lgi->lgi_buf.lb_buf = llh; + rc = dt_record_write(env, o, &lgi->lgi_buf, + &lgi->lgi_off, th); + if (rc != 0) + RETURN(rc); + + /* update the bitmap */ + index = reccookie->lgc_index; + lgi->lgi_off = llh->llh_bitmap_offset + + (index / (sizeof(*bitmap) * 8)) * + sizeof(*bitmap); + lgi->lgi_buf.lb_len = sizeof(*bitmap); + lgi->lgi_buf.lb_buf = + &bitmap[index/(sizeof(*bitmap)*8)]; + rc = dt_record_write(env, o, &lgi->lgi_buf, + &lgi->lgi_off, th); + + RETURN(rc); + } else if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + lgi->lgi_off = llh->llh_hdr.lrh_len + + (idx - 1) * reclen; + } else if (reccookie != NULL && reccookie->lgc_index > 0) { + /** + * The lgc_offset can be used only if index is + * the same. + */ + if (idx != reccookie->lgc_index) { + CERROR("%s: modify index mismatch %d %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx, + reccookie->lgc_index); + RETURN(-EFAULT); + } + + lgi->lgi_off = reccookie->lgc_offset; + CDEBUG(D_OTHER, "modify record "DFID": idx:%u, " + "len:%u offset %llu\n", + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), idx, + rec->lrh_len, (long long)lgi->lgi_off); + } else { + /* This can be result of lgh_cur_idx is not set during + * llog processing or llh_size is not set to proper + * record size for fixed records llog. Therefore it is + * impossible to get record offset. 
*/ + CERROR("%s: can't get record offset, idx:%d, " + "len:%u.\n", o->do_lu.lo_dev->ld_obd->obd_name, + idx, rec->lrh_len); + RETURN(-EFAULT); + } + + /* update only data, header and tail remain the same */ + lgi->lgi_off += sizeof(struct llog_rec_hdr); + lgi->lgi_buf.lb_len = REC_DATA_LEN(rec); + lgi->lgi_buf.lb_buf = REC_DATA(rec); + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = idx; + rc = 1; + } + RETURN(rc); + } + + /** + * The append case. + * The most common case of using llog. The new index is assigned to + * the new record, new bit is set in llog bitmap and llog count is + * incremented. + * + * Make sure that records don't cross a chunk boundary, so we can + * process them page-at-a-time if needed. If it will cross a chunk + * boundary, write in a fake (but referenced) entry to pad the chunk. + */ + + + /* simulate ENOSPC when new plain llog is being added to the + * catalog */ + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED2) && + llh->llh_flags & LLOG_F_IS_CAT) + RETURN(-ENOSPC); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + orig_last_idx = loghandle->lgh_last_idx; + lgi->lgi_off = lgi->lgi_attr.la_size; + + if (loghandle->lgh_max_size > 0 && + lgi->lgi_off >= loghandle->lgh_max_size) { + CDEBUG(D_OTHER, "llog is getting too large (%u > %u) at %u " + DFID"\n", (unsigned)lgi->lgi_off, + loghandle->lgh_max_size, (int)loghandle->lgh_last_idx, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid)); + /* this is to signal that this llog is full */ + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(llh) - 1; + RETURN(-ENOSPC); + } + + left = chunk_size - (lgi->lgi_off & (chunk_size - 1)); + /* NOTE: padding is a record, but no bit is set */ + if (left != 0 && left != reclen && + left < (reclen + LLOG_MIN_REC_SIZE)) { + index = loghandle->lgh_last_idx + 1; + rc = llog_osd_pad(env, o, &lgi->lgi_off, left, index, th); + if (rc) + RETURN(rc); + + loghandle->lgh_last_idx++; /* for pad rec */ + pad = true; + } + /* if it's the last idx in log file, then return -ENOSPC + * or wrap around if a catalog */ + if (llog_is_full(loghandle) || + unlikely(llh->llh_flags & LLOG_F_IS_CAT && + OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS) && + loghandle->lgh_last_idx >= cfs_fail_val)) { + if (llh->llh_flags & LLOG_F_IS_CAT) + loghandle->lgh_last_idx = 0; + else + RETURN(-ENOSPC); + } + + down_write(&loghandle->lgh_last_sem); + /* increment the last_idx along with llh_tail index, they should + * be equal for a llog lifetime */ + if (OBD_FAIL_CHECK(OBD_FAIL_LLOG_ADD_GAP) && --cfs_fail_val == 0) + loghandle->lgh_last_idx++; + loghandle->lgh_last_idx++; + index = loghandle->lgh_last_idx; + LLOG_HDR_TAIL(llh)->lrt_index = index; + /** + * NB: the caller should make sure only 1 process access + * the lgh_last_idx, e.g. append should be exclusive. + * Otherwise it might hit the assert. 
+ */ + LASSERT(index < LLOG_HDR_BITMAP_SIZE(llh)); + rec->lrh_index = index; + lrt = rec_tail(rec); + lrt->lrt_len = rec->lrh_len; + lrt->lrt_index = rec->lrh_index; + + /* the lgh_hdr_mutex protects llog header data from concurrent + * update/cancel, the llh_count and llh_bitmap are protected */ + mutex_lock(&loghandle->lgh_hdr_mutex); + if (__test_and_set_bit_le(index, LLOG_HDR_BITMAP(llh))) { + CERROR("%s: index %u already set in llog bitmap "DFID"\n", + o->do_lu.lo_dev->ld_obd->obd_name, index, + PFID(lu_object_fid(&o->do_lu))); + mutex_unlock(&loghandle->lgh_hdr_mutex); + LBUG(); /* should never happen */ + } + llh->llh_count++; + + if (!(llh->llh_flags & LLOG_F_IS_FIXSIZE)) { + /* Update the minimum size of the llog record */ + if (llh->llh_size == 0) + llh->llh_size = reclen; + else if (reclen < llh->llh_size) + llh->llh_size = reclen; + } + + /* + * readers (e.g. llog_osd_read_header()) must not find + * llog updated partially (bitmap/counter claims record, + * but a record hasn't been added yet) as this results + * in EIO. + */ + dt_write_lock(env, o, 0); + + if (lgi->lgi_attr.la_size == 0) { + lgi->lgi_off = 0; + lgi->lgi_buf.lb_len = llh->llh_hdr.lrh_len; + lgi->lgi_buf.lb_buf = &llh->llh_hdr; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + } else { + __u32 *bitmap = LLOG_HDR_BITMAP(llh); + + /* Note: If this is not initialization (size == 0), then do not + * write the whole header (8k bytes), only update header/tail + * and bits needs to be updated. Because this update might be + * part of cross-MDT operation, which needs to write these + * updates into the update log(32KB limit) and also pack inside + * the RPC (1MB limit), if we write 8K for each operation, which + * will cost a lot space, and keep us adding more updates to one + * update log.*/ + lgi->lgi_off = 0; + lgi->lgi_buf.lb_len = llh->llh_bitmap_offset; + lgi->lgi_buf.lb_buf = &llh->llh_hdr; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + + lgi->lgi_off = llh->llh_bitmap_offset + + (index / (sizeof(*bitmap) * 8)) * sizeof(*bitmap); + lgi->lgi_buf.lb_len = sizeof(*bitmap); + lgi->lgi_buf.lb_buf = &bitmap[index/(sizeof(*bitmap)*8)]; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + + lgi->lgi_off = (unsigned long)LLOG_HDR_TAIL(llh) - + (unsigned long)llh; + lgi->lgi_buf.lb_len = sizeof(llh->llh_tail); + lgi->lgi_buf.lb_buf = LLOG_HDR_TAIL(llh); + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + } + if (OBD_FAIL_PRECHECK(OBD_FAIL_LLOG_PAUSE_AFTER_PAD) && pad) { + /* a window for concurrent llog reader, see LU-12577 */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLOG_PAUSE_AFTER_PAD, + cfs_fail_val ?: 1); + } + +out_unlock: + /* unlock here for remote object */ + mutex_unlock(&loghandle->lgh_hdr_mutex); + if (rc) { + dt_write_unlock(env, o); + GOTO(out, rc); + } + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LLOG_PROCESS_TIMEOUT) && + cfs_fail_val == (unsigned int)(loghandle->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF)) { + OBD_RACE(OBD_FAIL_LLOG_PROCESS_TIMEOUT); + msleep(1 * MSEC_PER_SEC); + } + /* computed index can be used to determine offset for fixed-size + * records. 
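The incremental header update above writes only the header prefix, the single 32-bit bitmap word covering the new index, and the tail, instead of the whole multi-KB header. A stand-alone sketch of the bitmap-word offset arithmetic, not part of the patch; the example bitmap offset is illustrative.

    #include <stdint.h>
    #include <stdio.h>

    #define BITS_PER_WORD (8 * sizeof(uint32_t))

    /*
     * File offset of the 32-bit bitmap word covering "index", as computed in
     * llog_osd_write_rec(); bitmap_offset plays the role of llh_bitmap_offset,
     * i.e. where the bitmap starts inside the on-disk llog header.
     */
    static uint64_t bitmap_word_offset(uint32_t bitmap_offset, uint32_t index)
    {
            return bitmap_offset + (index / BITS_PER_WORD) * sizeof(uint32_t);
    }

    int main(void)
    {
            uint32_t bitmap_offset = 88;    /* illustrative header prefix size */
            uint32_t index = 1000;

            printf("index %u lives in the word at offset %llu, bit %u\n",
                   index,
                   (unsigned long long)bitmap_word_offset(bitmap_offset, index),
                   (unsigned int)(index % BITS_PER_WORD));
            return 0;
    }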
This also allows to handle Catalog wrap around case */ + if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + lgi->lgi_off = llh->llh_hdr.lrh_len + (index - 1) * reclen; + } else { + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) { + dt_write_unlock(env, o); + GOTO(out, rc); + } + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + lgi->lgi_off = max_t(__u64, lgi->lgi_attr.la_size, + lgi->lgi_off); + } + + lgi->lgi_buf.lb_len = reclen; + lgi->lgi_buf.lb_buf = rec; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + + dt_write_unlock(env, o); + if (rc < 0) + GOTO(out, rc); + + up_write(&loghandle->lgh_last_sem); + + CDEBUG(D_HA, "added record "DFID".%u, %u off%llu\n", + PFID(lu_object_fid(&o->do_lu)), index, rec->lrh_len, + lgi->lgi_off); + if (reccookie != NULL) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = index; + if ((rec->lrh_type == MDS_UNLINK_REC) || + (rec->lrh_type == MDS_SETATTR64_REC)) + reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT; + else if (rec->lrh_type == OST_SZ_REC) + reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT; + else + reccookie->lgc_subsys = -1; + rc = 1; + } + RETURN(rc); +out: + /* cleanup llog for error case */ + mutex_lock(&loghandle->lgh_hdr_mutex); + clear_bit_le(index, LLOG_HDR_BITMAP(llh)); + llh->llh_count--; + mutex_unlock(&loghandle->lgh_hdr_mutex); + + /* restore llog last_idx */ + if (dt_object_remote(o)) { + loghandle->lgh_last_idx = orig_last_idx; + } else if (--loghandle->lgh_last_idx == 0 && + (llh->llh_flags & LLOG_F_IS_CAT) && llh->llh_cat_idx != 0) { + /* catalog had just wrap-around case */ + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(llh) - 1; + } + + LLOG_HDR_TAIL(llh)->lrt_index = loghandle->lgh_last_idx; + up_write(&loghandle->lgh_last_sem); + + RETURN(rc); +} + +/** + * We can skip reading at least as many log blocks as the number of + * minimum sized log records we are skipping. If it turns out + * that we are not far enough along the log (because the + * actual records are larger than minimum size) we just skip + * some more records. + * + * Note: in llog_process_thread, it will use bitmap offset as + * the index to locate the record, which also includs some pad + * records, whose record size is very small, and it also does not + * consider pad record when recording minimum record size (otherwise + * min_record size might be too small), so in some rare cases, + * it might skip too much record for @goal, see llog_osd_next_block(). + * + * When force_mini_rec is true, it means we have to use LLOG_MIN_REC_SIZE + * as the min record size to skip over, usually because in the previous + * try, it skip too much record, see loog_osd_next(prev)_block(). + */ +static inline void llog_skip_over(struct llog_handle *lgh, __u64 *off, + int curr, int goal, __u32 chunk_size, + bool force_mini_rec) +{ + struct llog_log_hdr *llh = lgh->lgh_hdr; + + /* Goal should not bigger than the record count */ + if (goal > lgh->lgh_last_idx) + goal = lgh->lgh_last_idx; + + if (goal > curr) { + if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + *off = chunk_size + (goal - 1) * llh->llh_size; + } else { + __u64 min_rec_size = LLOG_MIN_REC_SIZE; + + if (llh->llh_size > 0 && !force_mini_rec) + min_rec_size = llh->llh_size; + + *off = *off + (goal - curr - 1) * min_rec_size; + } + } + /* always align with lower chunk boundary*/ + *off &= ~(chunk_size - 1); +} + +/** + * Remove optional fields that the client doesn't expect. + * This is typically in order to ensure compatibility with older clients. 
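llog_skip_over() above only estimates the seek position: it assumes every skipped record is at least min_rec_size bytes and then rounds the guess down to a chunk boundary so that a whole chunk can be read and scanned (fixed-size llogs compute the exact offset instead). A stand-alone sketch of the variable-size estimate, not part of the patch; the minimum record size is an illustrative stand-in.

    #include <stdint.h>
    #include <stdio.h>

    #define LLOG_MIN_REC_SIZE 24    /* illustrative stand-in */

    /*
     * Estimate the file offset from which to read when looking for record
     * "goal" while positioned at record "curr"/offset "off", as done in
     * llog_skip_over(): skip (goal - curr - 1) minimum-sized records, then
     * round down to the chunk containing the guess.
     */
    static uint64_t skip_over(uint64_t off, int curr, int goal,
                              uint32_t chunk_size, uint32_t min_rec_size)
    {
            if (goal > curr)
                    off += (uint64_t)(goal - curr - 1) * min_rec_size;

            /* always align with the lower chunk boundary */
            return off & ~((uint64_t)chunk_size - 1);
    }

    int main(void)
    {
            /* looking for record 500 from record 10 with 32KB chunks */
            printf("start reading at %llu\n",
                   (unsigned long long)skip_over(70000, 10, 500, 32768,
                                                 LLOG_MIN_REC_SIZE));
            return 0;
    }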
+ * It is assumed that since we exclusively remove fields, the block will be + * big enough to handle the remapped records. It is also assumed that records + * of a block have the same format (i.e.: the same features enabled). + * + * \param[in,out] hdr Header of the block of records to remap. + * \param[in,out] last_hdr Last header, don't read past this point. + * \param[in] flags Flags describing the fields to keep. + * \param[in] extra_flags Flags describing the extra fields to keep. + */ +static void changelog_block_trim_ext(struct llog_rec_hdr *hdr, + struct llog_rec_hdr *last_hdr, + struct llog_handle *loghandle) +{ + enum changelog_rec_flags flags = CLF_SUPPORTED; + enum changelog_rec_extra_flags extra_flags = CLFE_SUPPORTED; + + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_XATTR)) + extra_flags &= ~CLFE_XATTR; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_OMODE)) + extra_flags &= ~CLFE_OPEN; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_NID)) + extra_flags &= ~CLFE_NID; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_UIDGID)) + extra_flags &= ~CLFE_UIDGID; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_EXTRA_FLAGS)) + flags &= ~CLF_EXTRA_FLAGS; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_JOBID)) + flags &= ~CLF_JOBID; + + if (flags == CLF_SUPPORTED && extra_flags == CLFE_SUPPORTED) + return; + + if (hdr->lrh_type != CHANGELOG_REC) + return; + + do { + struct changelog_rec *rec = (struct changelog_rec *)(hdr + 1); + enum changelog_rec_extra_flags xflag = CLFE_INVALID; + + if (flags & CLF_EXTRA_FLAGS && + rec->cr_flags & CLF_EXTRA_FLAGS) { + xflag = changelog_rec_extra_flags(rec)->cr_extra_flags & + extra_flags; + } + + if (unlikely(hdr->lrh_len == 0)) { + /* It is corruption case, we cannot know the next rec, + * jump to the last one directly to avoid dead loop. */ + LCONSOLE(D_WARNING, "Hit invalid llog record: " + "idx %u, type %u, id %u\n", + hdr->lrh_index, hdr->lrh_type, hdr->lrh_id); + hdr = llog_rec_hdr_next(last_hdr); + if (unlikely(hdr == last_hdr)) + LCONSOLE(D_WARNING, "The last record crashed: " + "idx %u, type %u, id %u\n", + hdr->lrh_index, hdr->lrh_type, + hdr->lrh_id); + break; + } + + changelog_remap_rec(rec, hdr->lrh_len - sizeof(struct llog_rec_hdr), + rec->cr_flags & flags, xflag); + hdr = llog_rec_hdr_next(hdr); + /* Yield CPU to avoid soft-lockup if there are too many records + * to be handled. */ + cond_resched(); + } while ((char *)hdr <= (char *)last_hdr); +} + +/** + * Implementation of the llog_operations::lop_next_block + * + * This function finds the the next llog block to return which contains + * record with required index. It is main part of llog processing. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in,out] cur_idx index preceeding cur_offset + * \param[in] next_idx target index to find + * \param[in,out] cur_offset furtherst point read in the file + * \param[in] buf pointer to data buffer to fill + * \param[in] len required len to read, it is + * usually llog chunk_size. 
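changelog_block_trim_ext(), like llog processing in general, walks the records of a chunk purely by lrh_len: each header states its own total length, so the next header sits lrh_len bytes further on, and a zero length must be treated as corruption to avoid looping forever. A stand-alone sketch of that walk over a simplified header, not part of the patch; the struct layout and type value are illustrative.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Simplified record header: only the fields the walk below relies on. */
    struct rec_hdr {
            uint32_t lrh_len;       /* total record length: header + payload + tail */
            uint32_t lrh_index;
            uint32_t lrh_type;
    };

    /* Equivalent of llog_rec_hdr_next(): the next record starts lrh_len bytes on. */
    static struct rec_hdr *rec_next(struct rec_hdr *hdr)
    {
            return (struct rec_hdr *)((char *)hdr + hdr->lrh_len);
    }

    static void walk_chunk(void *buf, size_t used)
    {
            struct rec_hdr *hdr = buf;
            char *end = (char *)buf + used;

            while ((char *)hdr + sizeof(*hdr) <= end) {
                    if (hdr->lrh_len == 0) {        /* corruption guard */
                            printf("zero-length record at idx %u, stopping\n",
                                   hdr->lrh_index);
                            break;
                    }
                    printf("idx %u type %#x len %u\n",
                           hdr->lrh_index, hdr->lrh_type, hdr->lrh_len);
                    hdr = rec_next(hdr);
            }
    }

    int main(void)
    {
            char chunk[256];
            /* two 64-byte records; 0x10 is an arbitrary type for the demo */
            struct rec_hdr a = { 64, 1, 0x10 }, b = { 64, 2, 0x10 };

            memset(chunk, 0, sizeof(chunk));
            memcpy(chunk, &a, sizeof(a));
            memcpy(chunk + 64, &b, sizeof(b));
            walk_chunk(chunk, 128);
            return 0;
    }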
+ * + * \retval 0 on successful buffer read + * \retval negative value on error + */ +static int llog_osd_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + struct dt_device *dt; + int rc; + __u32 chunk_size; + int last_idx = *cur_idx; + __u64 last_offset = *cur_offset; + bool force_mini_rec = false; + + ENTRY; + + LASSERT(env); + LASSERT(lgi); + + chunk_size = loghandle->lgh_hdr->llh_hdr.lrh_len; + if (len == 0 || len & (chunk_size - 1)) + RETURN(-EINVAL); + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_CHANGELOG_DEL) && + cfs_fail_val == ((unsigned long)loghandle & 0xFFFFFFFF)) { + OBD_RACE(OBD_FAIL_MDS_CHANGELOG_DEL); + msleep(MSEC_PER_SEC >> 2); + } + + o = loghandle->lgh_obj; + LASSERT(o); + dt_read_lock(env, o, 0); + if (!llog_osd_exist(loghandle)) + GOTO(out, rc = -ESTALE); //object was destroyed + + dt = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(dt); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + CDEBUG(D_OTHER, + "looking for log index %u (cur idx %u off %llu), size %llu\n", + next_idx, *cur_idx, + *cur_offset, lgi->lgi_attr.la_size); + + while (*cur_offset < lgi->lgi_attr.la_size) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + + llog_skip_over(loghandle, cur_offset, *cur_idx, + next_idx, chunk_size, force_mini_rec); + + /* read up to next llog chunk_size block */ + lgi->lgi_buf.lb_len = chunk_size - + (*cur_offset & (chunk_size - 1)); + lgi->lgi_buf.lb_buf = buf; + + rc = dt_read(env, o, &lgi->lgi_buf, cur_offset); + if (rc < 0) { + if (rc == -EBADR && !force_mini_rec) + goto retry; + + CERROR("%s: can't read llog block from log "DFID + " offset %llu: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), *cur_offset, + rc); + GOTO(out, rc); + } + + if (rc < len) { + /* signal the end of the valid buffer to + * llog_process */ + memset(buf + rc, 0, len - rc); + } + + if (rc == 0) { /* end of file, nothing to do */ + if (!force_mini_rec) + goto retry; + GOTO(out, rc); + } + + if (rc < sizeof(*tail)) { + if (!force_mini_rec) + goto retry; + + CERROR("%s: invalid llog block at log id "DFID":%x " + "offset %llu\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, *cur_offset); + GOTO(out, rc = -EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + tail = (struct llog_rec_tail *)((char *)buf + rc - + sizeof(struct llog_rec_tail)); + + if (llog_verify_record(loghandle, rec)) { + /* + * the block seems corrupted. 
make a pad record so the + * caller can skip the block and try with the next one + */ + rec->lrh_len = rc; + rec->lrh_index = next_idx; + rec->lrh_type = LLOG_PAD_MAGIC; + + tail = rec_tail(rec); + tail->lrt_len = rc; + tail->lrt_index = next_idx; + + GOTO(out, rc = 0); + } + + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)((char *)buf + rc - + tail->lrt_len); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + + if (last_rec->lrh_index != tail->lrt_index) { + CERROR("%s: invalid llog tail at log id "DFID":%x offset %llu last_rec idx %u tail idx %u lrt len %u read_size %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, *cur_offset, + last_rec->lrh_index, tail->lrt_index, + tail->lrt_len, rc); + GOTO(out, rc = -EINVAL); + } + + *cur_idx = tail->lrt_index; + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("%s: invalid llog tail at log id "DFID":%x " + "offset %llu bytes %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, *cur_offset, rc); + GOTO(out, rc = -EINVAL); + } + if (tail->lrt_index < next_idx) { + last_idx = *cur_idx; + last_offset = *cur_offset; + continue; + } + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (next_idx && rec->lrh_index > next_idx) { + if (!force_mini_rec && next_idx > last_idx) + goto retry; + + CERROR("%s: missed desired record? %u > %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + rec->lrh_index, next_idx); + GOTO(out, rc = -ENOENT); + } + + /* Trim unsupported extensions for compat w/ older clients */ + changelog_block_trim_ext(rec, last_rec, loghandle); + + GOTO(out, rc = 0); + +retry: + /* Note: because there are some pad records in the + * llog, so llog_skip_over() might skip too much + * records, let's try skip again with minimum record */ + force_mini_rec = true; + *cur_offset = last_offset; + *cur_idx = last_idx; + } + GOTO(out, rc = -EIO); +out: + dt_read_unlock(env, o); + return rc; +} + +/** + * Implementation of the llog_operations::lop_prev_block + * + * This function finds the llog block to return which contains + * record with required index but in reverse order - from end of llog + * to the beginning. + * It is main part of reverse llog processing. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in] prev_idx target index to find + * \param[in] buf pointer to data buffer to fill + * \param[in] len required len to read, it is llog_chunk_size usually. 
+ * + * \retval 0 on successful buffer read + * \retval negative value on error + */ +static int llog_osd_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + struct dt_device *dt; + loff_t cur_offset; + __u32 chunk_size; + int rc; + + ENTRY; + + chunk_size = loghandle->lgh_hdr->llh_hdr.lrh_len; + if (len == 0 || len & (chunk_size - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx); + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + dt_read_lock(env, o, 0); + if (!llog_osd_exist(loghandle)) + GOTO(out, rc = -ESTALE); + + dt = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(dt); + + /* Let's only use mini record size for previous block read + * for now XXX */ + cur_offset = chunk_size; + llog_skip_over(loghandle, &cur_offset, 0, prev_idx, + chunk_size, true); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + while (cur_offset < lgi->lgi_attr.la_size) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + + lgi->lgi_buf.lb_len = len; + lgi->lgi_buf.lb_buf = buf; + rc = dt_read(env, o, &lgi->lgi_buf, &cur_offset); + if (rc < 0) { + CERROR("%s: can't read llog block from log "DFID + " offset %llu: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), cur_offset, rc); + GOTO(out, rc); + } + + if (rc == 0) /* end of file, nothing to do */ + GOTO(out, rc); + + if (rc < sizeof(*tail)) { + CERROR("%s: invalid llog block at log id "DFID":%x " + "offset %llu\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, cur_offset); + GOTO(out, rc = -EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)((char *)buf + rc - + sizeof(struct llog_rec_tail)); + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)((char *)buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("%s: invalid llog tail at log id "DFID":%x " + "offset %llu\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, cur_offset); + GOTO(out, rc = -EINVAL); + } + if (tail->lrt_index < prev_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > prev_idx) { + CERROR("%s: missed desired record? %u > %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + rec->lrh_index, prev_idx); + GOTO(out, rc = -ENOENT); + } + + /* Trim unsupported extensions for compat w/ older clients */ + changelog_block_trim_ext(rec, last_rec, loghandle); + + GOTO(out, rc = 0); + } + GOTO(out, rc = -EIO); +out: + dt_read_unlock(env, o); + return rc; +} + +/** + * This is helper function to get llog directory object. It is used by named + * llog operations to find/insert/delete llog entry from llog directory. 
+ * + * \param[in] env execution environment + * \param[in] ctxt llog context + * + * \retval dt_object of llog directory + * \retval ERR_PTR of negative value on error + */ +static struct dt_object *llog_osd_dir_get(const struct lu_env *env, + struct llog_ctxt *ctxt) +{ + struct dt_device *dt; + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dir; + int rc; + + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + if (ctxt->loc_dir == NULL) { + rc = dt_root_get(env, dt, &dti->dti_fid); + if (rc) + return ERR_PTR(rc); + dir = dt_locate(env, dt, &dti->dti_fid); + + if (!IS_ERR(dir) && !dt_try_as_dir(env, dir)) { + dt_object_put(env, dir); + return ERR_PTR(-ENOTDIR); + } + } else { + lu_object_get(&ctxt->loc_dir->do_lu); + dir = ctxt->loc_dir; + } + + return dir; +} + +/** + * Implementation of the llog_operations::lop_open + * + * This function opens the llog by its logid or by name, it may open also + * non existent llog and assing then new id to it. + * The llog_open/llog_close pair works similar to lu_object_find/put, + * the object may not exist prior open. The result of open is just dt_object + * in the llog header. + * + * \param[in] env execution environment + * \param[in] handle llog handle of the current llog + * \param[in] logid logid of llog to open (nameless llog) + * \param[in] name name of llog to open (named llog) + * \param[in] open_param + * LLOG_OPEN_NEW - new llog, may not exist + * LLOG_OPEN_EXIST - old llog, must exist + * + * \retval 0 on successful open, llog_handle::lgh_obj + * contains the dt_object of the llog. + * \retval negative value on error + */ +static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle, + struct llog_logid *logid, char *name, + enum llog_open_param open_param) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_ctxt *ctxt = handle->lgh_ctxt; + struct dt_object *o; + struct dt_device *dt; + struct ls_device *ls; + struct local_oid_storage *los = NULL; + int rc = 0; + bool new_id = false; + + ENTRY; + + LASSERT(env); + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + LASSERT(ctxt->loc_exp->exp_obd); + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + LASSERT(dt); + if (ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + struct lu_object_conf conf = { 0 }; + if (logid != NULL) { + logid_to_fid(logid, &lgi->lgi_fid); + } else { + /* If logid == NULL, then it means the caller needs + * to allocate new FID (llog_cat_declare_add_rec()). */ + rc = dt_fid_alloc(env, dt, &lgi->lgi_fid, NULL, NULL); + if (rc < 0) + RETURN(rc); + rc = 0; + conf.loc_flags = LOC_F_NEW; + } + + o = dt_locate_at(env, dt, &lgi->lgi_fid, + dt->dd_lu_dev.ld_site->ls_top_dev, &conf); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + goto after_open; + } + + ls = ls_device_get(dt); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + los = dt_los_find(ls, name != NULL ? 
FID_SEQ_LLOG_NAME : FID_SEQ_LLOG); + mutex_unlock(&ls->ls_los_mutex); + LASSERT(los); + ls_device_put(env, ls); + + LASSERT(handle); + + if (logid != NULL) { + logid_to_fid(logid, &lgi->lgi_fid); + } else if (name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out, rc = PTR_ERR(llog_dir)); + dt_read_lock(env, llog_dir, 0); + rc = dt_lookup_dir(env, llog_dir, name, &lgi->lgi_fid); + dt_read_unlock(env, llog_dir); + dt_object_put(env, llog_dir); + if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) { + /* generate fid for new llog */ + rc = local_object_fid_generate(env, los, + &lgi->lgi_fid); + new_id = true; + } + if (rc < 0) + GOTO(out, rc); + OBD_ALLOC(handle->lgh_name, strlen(name) + 1); + if (handle->lgh_name) + strcpy(handle->lgh_name, name); + else + GOTO(out, rc = -ENOMEM); + } else { + LASSERTF(open_param & LLOG_OPEN_NEW, "%#x\n", open_param); + /* generate fid for new llog */ +generate: + rc = local_object_fid_generate(env, los, &lgi->lgi_fid); + if (rc < 0) + GOTO(out, rc); + new_id = true; + } + if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_LLOG_UMOUNT_RACE) && + cfs_fail_val == 1) { + cfs_fail_val = 2; + OBD_RACE(OBD_FAIL_MDS_LLOG_UMOUNT_RACE); + msleep(MSEC_PER_SEC); + } + o = ls_locate(env, ls, &lgi->lgi_fid, NULL); + if (IS_ERR(o)) + GOTO(out_name, rc = PTR_ERR(o)); + + if (dt_object_exists(o) && new_id) { + /* llog exists with just generated ID, e.g. some old llog file + * still is in use or is orphan, drop a warn and skip it. */ + CDEBUG(D_INFO, "%s: llog exists with the same FID: "DFID + ", skipping\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu))); + dt_object_put(env, o); + /* just skip this llog ID, we shouldn't delete it because we + * don't know exactly what is its purpose and state. */ + goto generate; + } + +after_open: + /* No new llog is expected but doesn't exist */ + if (open_param != LLOG_OPEN_NEW && !dt_object_exists(o)) { + CDEBUG(D_INFO, "%s: llog FID: "DFID" obj %p doesn`t exist\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), o); + GOTO(out_put, rc = -ENOENT); + } + fid_to_logid(&lgi->lgi_fid, &handle->lgh_id); + handle->lgh_obj = o; + handle->private_data = los; + LASSERT(handle->lgh_ctxt); + + RETURN(rc); + +out_put: + dt_object_put(env, o); +out_name: + if (handle->lgh_name != NULL) + OBD_FREE(handle->lgh_name, strlen(name) + 1); +out: + if (los != NULL) + dt_los_put(los); + RETURN(rc); +} + +/** + * Get dir for regular fid log object + * + * Get directory for regular fid log object, and these regular fid log + * object will be inserted under this directory, to satisfy the FS + * consistency check, e2fsck etc. + * + * \param [in] env execution environment + * \param [in] dto llog object + * + * \retval pointer to the directory if it is found. + * \retval ERR_PTR(negative errno) if it fails. 
+ */ +struct dt_object *llog_osd_get_regular_fid_dir(const struct lu_env *env, + struct dt_object *dto) +{ + struct llog_thread_info *lgi = llog_info(env); + struct seq_server_site *ss = dto->do_lu.lo_dev->ld_site->ld_seq_site; + struct lu_seq_range *range = &lgi->lgi_range; + struct lu_fid *dir_fid = &lgi->lgi_fid; + struct dt_object *dir; + int rc; + ENTRY; + + fld_range_set_any(range); + LASSERT(ss != NULL); + rc = ss->ss_server_fld->lsf_seq_lookup(env, ss->ss_server_fld, + fid_seq(lu_object_fid(&dto->do_lu)), range); + if (rc < 0) + RETURN(ERR_PTR(rc)); + + lu_update_log_dir_fid(dir_fid, range->lsr_index); + dir = dt_locate(env, lu2dt_dev(dto->do_lu.lo_dev), dir_fid); + if (IS_ERR(dir)) + RETURN(dir); + + if (!dt_try_as_dir(env, dir)) { + dt_object_put(env, dir); + RETURN(ERR_PTR(-ENOTDIR)); + } + + RETURN(dir); +} + +/** + * Add llog object with regular FID to name entry + * + * Add llog object with regular FID to name space, and each llog + * object on each MDT will be /update_log_dir/[seq:oid:ver], + * so to satisfy the namespace consistency check, e2fsck etc. + * + * \param [in] env execution environment + * \param [in] dto llog object + * \param [in] th thandle + * \param [in] declare if it is declare or execution + * + * \retval 0 if insertion succeeds. + * \retval negative errno if insertion fails. + */ +static int +llog_osd_regular_fid_add_name_entry(const struct lu_env *env, + struct dt_object *dto, + struct thandle *th, bool declare) +{ + struct llog_thread_info *lgi = llog_info(env); + const struct lu_fid *fid = lu_object_fid(&dto->do_lu); + struct dt_insert_rec *rec = &lgi->lgi_dt_rec; + struct dt_object *dir; + char *name = lgi->lgi_name; + int rc; + ENTRY; + + if (!fid_is_norm(fid)) + RETURN(0); + + dir = llog_osd_get_regular_fid_dir(env, dto); + if (IS_ERR(dir)) + RETURN(PTR_ERR(dir)); + + rec->rec_fid = fid; + rec->rec_type = S_IFREG; + snprintf(name, sizeof(lgi->lgi_name), DFID, PFID(fid)); + dt_write_lock(env, dir, 0); + if (declare) { + rc = dt_declare_insert(env, dir, (struct dt_rec *)rec, + (struct dt_key *)name, th); + } else { + rc = dt_insert(env, dir, (struct dt_rec *)rec, + (struct dt_key *)name, th); + } + dt_write_unlock(env, dir); + + dt_object_put(env, dir); + RETURN(rc); +} + + +/** + * Implementation of the llog_operations::lop_declare_create + * + * This function declares the llog create. It declares also name insert + * into llog directory in case of named llog. 
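+ * For llogs with regular FIDs (LLOG_CTXT_FLAG_NORMAL_FID) it also
+ * declares the name entry insertion under the update_log_dir, see
+ * llog_osd_regular_fid_add_name_entry().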
+ * + * \param[in] env execution environment + * \param[in] res llog handle of the current llog + * \param[in] th current transaction handle + * + * \retval 0 on successful create declaration + * \retval negative value on error + */ +static int llog_osd_declare_create(const struct lu_env *env, + struct llog_handle *res, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_insert_rec *rec = &lgi->lgi_dt_rec; + struct local_oid_storage *los; + struct dt_object *o; + int rc; + + ENTRY; + + LASSERT(res->lgh_obj); + LASSERT(th); + + /* object can be created by another thread */ + o = res->lgh_obj; + if (dt_object_exists(o)) + RETURN(0); + + if (res->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE | LA_SIZE; + lgi->lgi_attr.la_size = 0; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + if (rc < 0) + RETURN(rc); + + + rc = llog_osd_regular_fid_add_name_entry(env, o, th, true); + + RETURN(rc); + } + los = res->private_data; + LASSERT(los); + + rc = llog_osd_declare_new_object(env, los, o, th); + if (rc) + RETURN(rc); + + /* do not declare header initialization here as it's declared + * in llog_osd_declare_write_rec() which is always called */ + + if (res->lgh_name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, res->lgh_ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + logid_to_fid(&res->lgh_id, &lgi->lgi_fid); + rec->rec_fid = &lgi->lgi_fid; + rec->rec_type = S_IFREG; + rc = dt_declare_insert(env, llog_dir, + (struct dt_rec *)rec, + (struct dt_key *)res->lgh_name, th); + dt_object_put(env, llog_dir); + if (rc) + CERROR("%s: can't declare named llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + res->lgh_name, rc); + } + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_create + * + * This function creates the llog according with llog_handle::lgh_obj + * and llog_handle::lgh_name. 
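+ * If the object already exists, -EEXIST is returned. For a named llog
+ * the name entry is inserted into the llog directory as well.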
+ * + * \param[in] env execution environment + * \param[in] res llog handle of the current llog + * \param[in] th current transaction handle + * + * \retval 0 on successful create + * \retval negative value on error + */ +static int llog_osd_create(const struct lu_env *env, struct llog_handle *res, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_insert_rec *rec = &lgi->lgi_dt_rec; + struct local_oid_storage *los; + struct dt_object *o; + int rc = 0; + + ENTRY; + + LASSERT(env); + o = res->lgh_obj; + LASSERT(o); + + /* llog can be already created */ + if (dt_object_exists(o)) + RETURN(-EEXIST); + + if (res->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE | LA_SIZE | LA_TYPE; + lgi->lgi_attr.la_size = 0; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + dt_write_lock(env, o, 0); + rc = dt_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + dt_write_unlock(env, o); + if (rc < 0) + RETURN(rc); + + rc = llog_osd_regular_fid_add_name_entry(env, o, th, false); + + RETURN(rc); + } + + los = res->private_data; + LASSERT(los); + + dt_write_lock(env, o, 0); + if (!dt_object_exists(o)) + rc = llog_osd_create_new_object(env, los, o, th); + else + rc = -EEXIST; + + dt_write_unlock(env, o); + if (rc) + RETURN(rc); + + if (res->lgh_name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, res->lgh_ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + + logid_to_fid(&res->lgh_id, &lgi->lgi_fid); + rec->rec_fid = &lgi->lgi_fid; + rec->rec_type = S_IFREG; + dt_read_lock(env, llog_dir, 0); + rc = dt_insert(env, llog_dir, (struct dt_rec *)rec, + (struct dt_key *)res->lgh_name, th); + dt_read_unlock(env, llog_dir); + dt_object_put(env, llog_dir); + if (rc) + CERROR("%s: can't create named llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + res->lgh_name, rc); + } + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_close + * + * This function closes the llog. It just put llog object and referenced + * local storage. + * + * \param[in] env execution environment + * \param[in] handle llog handle of the current llog + * + * \retval 0 on successful llog close + * \retval negative value on error + */ +static int llog_osd_close(const struct lu_env *env, struct llog_handle *handle) +{ + struct local_oid_storage *los; + int rc = 0; + + ENTRY; + + LASSERT(handle->lgh_obj); + + if (handle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + /* Remove the object from the cache, otherwise it may + * hold LOD being released during cleanup process */ + dt_object_put_nocache(env, handle->lgh_obj); + LASSERT(handle->private_data == NULL); + RETURN(rc); + } else { + dt_object_put(env, handle->lgh_obj); + } + los = handle->private_data; + LASSERT(los); + dt_los_put(los); + + if (handle->lgh_name) + OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1); + + RETURN(rc); +} + +/** + * delete llog object name entry + * + * Delete llog object (with regular FID) from name space (under + * update_log_dir). + * + * \param [in] env execution environment + * \param [in] dto llog object + * \param [in] th thandle + * \param [in] declare if it is declare or execution + * + * \retval 0 if deletion succeeds. + * \retval negative errno if deletion fails. 
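+ *
+ * Objects without a regular (normal) FID are skipped and 0 is returned.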
+ */ +static int +llog_osd_regular_fid_del_name_entry(const struct lu_env *env, + struct dt_object *dto, + struct thandle *th, bool declare) +{ + struct llog_thread_info *lgi = llog_info(env); + const struct lu_fid *fid = lu_object_fid(&dto->do_lu); + struct dt_object *dir; + char *name = lgi->lgi_name; + int rc; + ENTRY; + + if (!fid_is_norm(fid)) + RETURN(0); + + dir = llog_osd_get_regular_fid_dir(env, dto); + if (IS_ERR(dir)) + RETURN(PTR_ERR(dir)); + + snprintf(name, sizeof(lgi->lgi_name), DFID, PFID(fid)); + dt_write_lock(env, dir, 0); + if (declare) { + rc = dt_declare_delete(env, dir, (struct dt_key *)name, + th); + } else { + rc = dt_delete(env, dir, (struct dt_key *)name, th); + } + dt_write_unlock(env, dir); + + dt_object_put(env, dir); + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_declare_destroy + * + * This function declare destroys the llog and deletes also entry in the + * llog directory in case of named llog. Llog should be opened prior that. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * + * \retval 0 on successful destroy + * \retval negative value on error + */ +static int llog_osd_declare_destroy(const struct lu_env *env, + struct llog_handle *loghandle, + struct thandle *th) +{ + struct llog_ctxt *ctxt; + struct dt_object *o, *llog_dir = NULL; + int rc; + + ENTRY; + + ctxt = loghandle->lgh_ctxt; + LASSERT(ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + + if (loghandle->lgh_name) { + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + + rc = dt_declare_delete(env, llog_dir, + (struct dt_key *)loghandle->lgh_name, + th); + if (rc < 0) + GOTO(out_put, rc); + } + + rc = dt_declare_ref_del(env, o, th); + if (rc < 0) + GOTO(out_put, rc); + + rc = dt_declare_destroy(env, o, th); + if (rc < 0) + GOTO(out_put, rc); + + if (loghandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + rc = llog_osd_regular_fid_del_name_entry(env, o, th, true); + if (rc < 0) + GOTO(out_put, rc); + } + +out_put: + if (!(IS_ERR_OR_NULL(llog_dir))) + dt_object_put(env, llog_dir); + + RETURN(rc); +} + + +/** + * Implementation of the llog_operations::lop_destroy + * + * This function destroys the llog and deletes also entry in the + * llog directory in case of named llog. Llog should be opened prior that. + * Destroy method is not part of external transaction and does everything + * inside. 
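+ * If the llog object does not exist anymore, 0 is returned; on success
+ * llog_handle::lgh_destroyed is set.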
+ * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * + * \retval 0 on successful destroy + * \retval negative value on error + */ +static int llog_osd_destroy(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th) +{ + struct llog_ctxt *ctxt; + struct dt_object *o, *llog_dir = NULL; + int rc; + + ENTRY; + + ctxt = loghandle->lgh_ctxt; + LASSERT(ctxt != NULL); + + o = loghandle->lgh_obj; + LASSERT(o != NULL); + + dt_write_lock(env, o, 0); + if (!llog_osd_exist(loghandle)) + GOTO(out_unlock, rc = 0); + + if (loghandle->lgh_name) { + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out_unlock, rc = PTR_ERR(llog_dir)); + + dt_read_lock(env, llog_dir, 0); + rc = dt_delete(env, llog_dir, + (struct dt_key *)loghandle->lgh_name, + th); + dt_read_unlock(env, llog_dir); + if (rc) { + CERROR("%s: can't remove llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + loghandle->lgh_name, rc); + GOTO(out_unlock, rc); + } + } + + dt_ref_del(env, o, th); + rc = dt_destroy(env, o, th); + if (rc < 0) + GOTO(out_unlock, rc); + + loghandle->lgh_destroyed = true; + if (loghandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + rc = llog_osd_regular_fid_del_name_entry(env, o, th, false); + if (rc < 0) + GOTO(out_unlock, rc); + } + +out_unlock: + dt_write_unlock(env, o); + if (!(IS_ERR_OR_NULL(llog_dir))) + dt_object_put(env, llog_dir); + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_setup + * + * This function setup the llog on local storage. + * + * \param[in] env execution environment + * \param[in] obd obd device the llog belongs to + * \param[in] olg the llog group, it is always zero group now. + * \param[in] ctxt_idx the llog index, it defines the purpose of this llog. + * Every new llog type have to use own index. + * \param[in] disk_obd the storage obd, where llog is stored. + * + * \retval 0 on successful llog setup + * \retval negative value on error + */ +static int llog_osd_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int ctxt_idx, + struct obd_device *disk_obd) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_ctxt *ctxt; + int rc = 0; + ENTRY; + + LASSERT(obd); + LASSERT(olg->olg_ctxts[ctxt_idx]); + + ctxt = llog_ctxt_get(olg->olg_ctxts[ctxt_idx]); + LASSERT(ctxt); + + if (disk_obd == NULL) + GOTO(out, rc = 0); + + /* initialize data allowing to generate new fids, + * literally we need a sequece */ + lgi->lgi_fid.f_seq = FID_SEQ_LLOG; + lgi->lgi_fid.f_oid = 1; + lgi->lgi_fid.f_ver = 0; + rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt, + &lgi->lgi_fid, + &ctxt->loc_los_nameless); + if (rc != 0) + GOTO(out, rc); + + lgi->lgi_fid.f_seq = FID_SEQ_LLOG_NAME; + lgi->lgi_fid.f_oid = 1; + lgi->lgi_fid.f_ver = 0; + rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt, + &lgi->lgi_fid, + &ctxt->loc_los_named); + if (rc != 0) { + local_oid_storage_fini(env, ctxt->loc_los_nameless); + ctxt->loc_los_nameless = NULL; + } + + GOTO(out, rc); + +out: + llog_ctxt_put(ctxt); + return rc; +} + +/** + * Implementation of the llog_operations::lop_cleanup + * + * This function cleanups the llog on local storage. 
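+ * It releases the nameless and named local OID storages that were
+ * initialized in llog_osd_setup().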
+ * + * \param[in] env execution environment + * \param[in] ctxt the llog context + * + * \retval 0 + */ +static int llog_osd_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + if (ctxt->loc_los_nameless != NULL) { + local_oid_storage_fini(env, ctxt->loc_los_nameless); + ctxt->loc_los_nameless = NULL; + } + + if (ctxt->loc_los_named != NULL) { + local_oid_storage_fini(env, ctxt->loc_los_named); + ctxt->loc_los_named = NULL; + } + + return 0; +} + +const struct llog_operations llog_osd_ops = { + .lop_next_block = llog_osd_next_block, + .lop_prev_block = llog_osd_prev_block, + .lop_read_header = llog_osd_read_header, + .lop_declare_destroy = llog_osd_declare_destroy, + .lop_destroy = llog_osd_destroy, + .lop_setup = llog_osd_setup, + .lop_cleanup = llog_osd_cleanup, + .lop_open = llog_osd_open, + .lop_exist = llog_osd_exist, + .lop_declare_create = llog_osd_declare_create, + .lop_create = llog_osd_create, + .lop_declare_write_rec = llog_osd_declare_write_rec, + .lop_write_rec = llog_osd_write_rec, + .lop_close = llog_osd_close, +}; +EXPORT_SYMBOL(llog_osd_ops); + +const struct llog_operations llog_common_cat_ops = { + .lop_next_block = llog_osd_next_block, + .lop_prev_block = llog_osd_prev_block, + .lop_read_header = llog_osd_read_header, + .lop_declare_destroy = llog_osd_declare_destroy, + .lop_destroy = llog_osd_destroy, + .lop_setup = llog_osd_setup, + .lop_cleanup = llog_osd_cleanup, + .lop_open = llog_osd_open, + .lop_exist = llog_osd_exist, + .lop_declare_create = llog_osd_declare_create, + .lop_create = llog_osd_create, + .lop_declare_write_rec = llog_osd_declare_write_rec, + .lop_write_rec = llog_osd_write_rec, + .lop_close = llog_osd_close, + .lop_add = llog_cat_add_rec, + .lop_declare_add = llog_cat_declare_add_rec, +}; +EXPORT_SYMBOL(llog_common_cat_ops); + +/** + * Read the special file which contains the list of llog catalogs IDs + * + * This function reads the CATALOGS file which contains the array of llog + * catalogs IDs. The main purpose of this file is to store OSP llogs indexed + * by OST/MDT number. + * + * \param[in] env execution environment + * \param[in] d corresponding storage device + * \param[in] idx position to start from, usually OST/MDT index + * \param[in] count how many catalog IDs to read + * \param[out] idarray the buffer for the data. If it is NULL then + * function returns just number of catalog IDs + * in the file. 
+ * \param[in] fid LLOG_CATALOGS_OID for CATALOG object + * + * \retval 0 on successful read of catalog IDs + * \retval negative value on error + * \retval positive value which is number of records in + * the file if \a idarray is NULL + */ +int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o = NULL; + struct thandle *th; + int rc, size; + + ENTRY; + + LASSERT(d); + + size = sizeof(*idarray) * count; + lgi->lgi_off = idx * sizeof(*idarray); + + lgi->lgi_fid = *fid; + o = dt_locate(env, d, &lgi->lgi_fid); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + if (!dt_object_exists(o)) { + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + lgi->lgi_attr.la_valid = LA_MODE | LA_TYPE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + th->th_wait_submit = 1; + /* Make the llog object creation synchronization, so + * it will be reliable to the reference, especially + * for remote reference */ + th->th_sync = 1; + + rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(out_trans, rc); + + dt_write_lock(env, o, 0); + if (!dt_object_exists(o)) + rc = dt_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + dt_write_unlock(env, o); +out_trans: + dt_trans_stop(env, d, th); + if (rc) + GOTO(out, rc); + } + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(lgi->lgi_attr.la_mode)) { + CERROR("%s: CATALOGS is not a regular file!: mode = %o\n", + o->do_lu.lo_dev->ld_obd->obd_name, + lgi->lgi_attr.la_mode); + GOTO(out, rc = -ENOENT); + } + + CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n", + (int)lgi->lgi_attr.la_size, size); + + /* return just number of llogs */ + if (idarray == NULL) { + rc = lgi->lgi_attr.la_size / sizeof(*idarray); + GOTO(out, rc); + } + + /* read for new ost index or for empty file */ + memset(idarray, 0, size); + if (lgi->lgi_attr.la_size <= lgi->lgi_off) + GOTO(out, rc = 0); + if (lgi->lgi_attr.la_size < lgi->lgi_off + size) + size = lgi->lgi_attr.la_size - lgi->lgi_off; + + lgi->lgi_buf.lb_buf = idarray; + lgi->lgi_buf.lb_len = size; + rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off); + /* -EFAULT means the llog is a sparse file. This is not an error + * after arbitrary OST index is supported. */ + if (rc < 0 && rc != -EFAULT) { + CERROR("%s: error reading CATALOGS: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out, rc); + } + + EXIT; +out: + dt_object_put(env, o); + RETURN(rc); +} +EXPORT_SYMBOL(llog_osd_get_cat_list); + +/** + * Write the special file which contains the list of llog catalogs IDs + * + * This function writes the CATALOG file which contains the array of llog + * catalogs IDs. It is used mostly to store OSP llogs indexed by OST/MDT + * number. + * + * \param[in] env execution environment + * \param[in] d corresponding storage device + * \param[in] idx position to start from, usually OST/MDT index + * \param[in] count how many catalog IDs to write + * \param[out] idarray the buffer with the data to write. 
+ * \param[in] fid LLOG_CATALOGS_OID for CATALOG object + * + * \retval 0 on successful write of catalog IDs + * \retval negative value on error + */ +int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o = NULL; + struct thandle *th; + int rc, size; + + if (count == 0) + RETURN(0); + + LASSERT(d); + + size = sizeof(*idarray) * count; + lgi->lgi_off = idx * sizeof(*idarray); + lgi->lgi_fid = *fid; + + o = dt_locate(env, d, &lgi->lgi_fid); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + if (!dt_object_exists(o)) + GOTO(out, rc = -ENOENT); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(lgi->lgi_attr.la_mode)) { + CERROR("%s: CATALOGS is not a regular file!: mode = %o\n", + o->do_lu.lo_dev->ld_obd->obd_name, + lgi->lgi_attr.la_mode); + GOTO(out, rc = -ENOENT); + } + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + lgi->lgi_buf.lb_len = size; + lgi->lgi_buf.lb_buf = idarray; + rc = dt_declare_record_write(env, o, &lgi->lgi_buf, lgi->lgi_off, th); + if (rc) + GOTO(out_trans, rc); + + /* For update log, this happens during initialization, + * see lod_sub_prep_llog(), and we need make sure catlog + * file ID is written to catlist file(committed) before + * cross-MDT operation write update records to catlog FILE, + * otherwise, during failover these update records might + * missing */ + if (fid_is_update_log(fid)) + th->th_sync = 1; + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(out_trans, rc); + + th->th_wait_submit = 1; + + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc) + CDEBUG(D_INODE, "can't write CATALOGS at index %d: rc = %d\n", + idx, rc); +out_trans: + dt_trans_stop(env, d, th); +out: + dt_object_put(env, o); + RETURN(rc); +} +EXPORT_SYMBOL(llog_osd_put_cat_list); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c new file mode 100644 index 0000000000000..67c0ce62713ce --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c @@ -0,0 +1,488 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/llog_swab.c + * + * Swabbing of llog datatypes (from disk or over the wire). 
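+ * Records are swabbed in place; for an unknown record type only the
+ * common llog_rec_hdr is converted and an error is logged.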
+ * + * Author: jacob berkman + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include +#include + +static void print_llogd_body(struct llogd_body *d) +{ + CDEBUG(D_OTHER, "llogd body: %p\n", d); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi.oi_fid: "DFID"\n", + PFID(&d->lgd_logid.lgl_oi.oi_fid)); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen); + CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx); + CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags); + CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index); + CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index); + CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len); + CDEBUG(D_OTHER, "\tlgd_cur_offset: %#llx\n", d->lgd_cur_offset); +} + +void lustre_swab_lu_fid(struct lu_fid *fid) +{ + __swab64s(&fid->f_seq); + __swab32s(&fid->f_oid); + __swab32s(&fid->f_ver); +} +EXPORT_SYMBOL(lustre_swab_lu_fid); + +void lustre_swab_ost_id(struct ost_id *oid) +{ + if (fid_seq_is_mdt0(oid->oi.oi_seq) || + fid_seq_is_default(oid->oi.oi_seq)) { + __swab64s(&oid->oi.oi_id); + __swab64s(&oid->oi.oi_seq); + } else { + lustre_swab_lu_fid(&oid->oi_fid); + } +} +EXPORT_SYMBOL(lustre_swab_ost_id); + +void lustre_swab_llog_id(struct llog_logid *log_id) +{ + __swab64s(&log_id->lgl_oi.oi.oi_id); + __swab64s(&log_id->lgl_oi.oi.oi_seq); + __swab32s(&log_id->lgl_ogen); +} + +void lustre_swab_llogd_body (struct llogd_body *d) +{ + ENTRY; + print_llogd_body(d); + lustre_swab_llog_id(&d->lgd_logid); + __swab32s(&d->lgd_ctxt_idx); + __swab32s(&d->lgd_llh_flags); + __swab32s(&d->lgd_index); + __swab32s(&d->lgd_saved_index); + __swab32s(&d->lgd_len); + __swab64s(&d->lgd_cur_offset); + print_llogd_body(d); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llogd_body); + +void lustre_swab_llogd_conn_body (struct llogd_conn_body *d) +{ + __swab64s(&d->lgdc_gen.mnt_cnt); + __swab64s(&d->lgdc_gen.conn_cnt); + lustre_swab_llog_id(&d->lgdc_logid); + __swab32s(&d->lgdc_ctxt_idx); +} +EXPORT_SYMBOL(lustre_swab_llogd_conn_body); + +void lustre_swab_ll_fid(struct ll_fid *fid) +{ + __swab64s(&fid->id); + __swab32s(&fid->generation); + __swab32s(&fid->f_type); +} + +void lustre_swab_lu_seq_range(struct lu_seq_range *range) +{ + __swab64s(&range->lsr_start); + __swab64s(&range->lsr_end); + __swab32s(&range->lsr_index); + __swab32s(&range->lsr_flags); +} +EXPORT_SYMBOL(lustre_swab_lu_seq_range); + +void lustre_swab_update_ops(struct update_ops *uops, unsigned int op_count) +{ + unsigned int i; + unsigned int j; + + for (i = 0; i < op_count; i++) { + lustre_swab_lu_fid(&uops->uops_op[i].uop_fid); + __swab16s(&uops->uops_op[i].uop_type); + __swab16s(&uops->uops_op[i].uop_param_count); + for (j = 0; j < uops->uops_op[i].uop_param_count; j++) + __swab16s(&uops->uops_op[i].uop_params_off[j]); + } +} +EXPORT_SYMBOL(lustre_swab_update_ops); + +void lustre_swab_llog_rec(struct llog_rec_hdr *rec) +{ + struct llog_rec_tail *tail = NULL; + + __swab32s(&rec->lrh_len); + __swab32s(&rec->lrh_index); + __swab32s(&rec->lrh_type); + __swab32s(&rec->lrh_id); + + switch (rec->lrh_type) { + case OST_SZ_REC: + { + struct llog_size_change_rec *lsc = + (struct llog_size_change_rec *)rec; + + lustre_swab_ll_fid(&lsc->lsc_fid); + __swab32s(&lsc->lsc_ioepoch); + tail = &lsc->lsc_tail; + break; + } + case MDS_UNLINK_REC: + { + struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; + + __swab64s(&lur->lur_oid); + __swab32s(&lur->lur_oseq); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case MDS_UNLINK64_REC: + { + struct llog_unlink64_rec *lur = + (struct 
llog_unlink64_rec *)rec; + + lustre_swab_lu_fid(&lur->lur_fid); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case CHANGELOG_REC: + { + struct llog_changelog_rec *cr = + (struct llog_changelog_rec *)rec; + + __swab16s(&cr->cr.cr_namelen); + __swab16s(&cr->cr.cr_flags); + __swab32s(&cr->cr.cr_type); + __swab64s(&cr->cr.cr_index); + __swab64s(&cr->cr.cr_prev); + __swab64s(&cr->cr.cr_time); + lustre_swab_lu_fid(&cr->cr.cr_tfid); + lustre_swab_lu_fid(&cr->cr.cr_pfid); + if (cr->cr.cr_flags & CLF_RENAME) { + struct changelog_ext_rename *rnm = + changelog_rec_rename(&cr->cr); + + lustre_swab_lu_fid(&rnm->cr_sfid); + lustre_swab_lu_fid(&rnm->cr_spfid); + } + /* + * Because the tail follows a variable-length structure we need + * to compute its location at runtime + */ + tail = (struct llog_rec_tail *)((char *)rec + + rec->lrh_len - sizeof(*tail)); + break; + } + + case CHANGELOG_USER_REC: + case CHANGELOG_USER_REC2: + { + struct llog_changelog_user_rec2 *cur = + (struct llog_changelog_user_rec2 *)rec; + + __swab32s(&cur->cur_id); + __swab64s(&cur->cur_endrec); + __swab32s(&cur->cur_time); + if (cur->cur_hdr.lrh_type == CHANGELOG_USER_REC2) { + __swab32s(&cur->cur_mask); + BUILD_BUG_ON(offsetof(typeof(*cur), cur_padding1) == 0); + BUILD_BUG_ON(offsetof(typeof(*cur), cur_padding2) == 0); + BUILD_BUG_ON(offsetof(typeof(*cur), cur_padding3) == 0); + } + tail = (struct llog_rec_tail *)((char *)rec + + rec->lrh_len - sizeof(*tail)); + break; + } + + case HSM_AGENT_REC: { + struct llog_agent_req_rec *arr = + (struct llog_agent_req_rec *)rec; + + __swab32s(&arr->arr_hai.hai_len); + __swab32s(&arr->arr_hai.hai_action); + lustre_swab_lu_fid(&arr->arr_hai.hai_fid); + lustre_swab_lu_fid(&arr->arr_hai.hai_dfid); + __swab64s(&arr->arr_hai.hai_cookie); + __swab64s(&arr->arr_hai.hai_extent.offset); + __swab64s(&arr->arr_hai.hai_extent.length); + __swab64s(&arr->arr_hai.hai_gid); + /* + * no swabing for opaque data + * hai_data[0]; + */ + break; + } + + case MDS_SETATTR64_REC: + { + struct llog_setattr64_rec *lsr = + (struct llog_setattr64_rec *)rec; + + lustre_swab_ost_id(&lsr->lsr_oi); + __swab32s(&lsr->lsr_uid); + __swab32s(&lsr->lsr_uid_h); + __swab32s(&lsr->lsr_gid); + __swab32s(&lsr->lsr_gid_h); + __swab64s(&lsr->lsr_valid); + + if (rec->lrh_len > sizeof(struct llog_setattr64_rec)) { + struct llog_setattr64_rec_v2 *lsr2 = + (struct llog_setattr64_rec_v2 *)rec; + + __swab32s(&lsr2->lsr_projid); + __swab32s(&lsr2->lsr_layout_version); + tail = &lsr2->lsr_tail; + } else { + tail = &lsr->lsr_tail; + } + break; + } + case OBD_CFG_REC: + /* these are swabbed as they are consumed */ + break; + case LLOG_HDR_MAGIC: + { + struct llog_log_hdr *llh = (struct llog_log_hdr *)rec; + + __swab64s(&llh->llh_timestamp); + __swab32s(&llh->llh_count); + __swab32s(&llh->llh_bitmap_offset); + __swab32s(&llh->llh_flags); + __swab32s(&llh->llh_size); + __swab32s(&llh->llh_cat_idx); + tail = LLOG_HDR_TAIL(llh); + break; + } + case LLOG_LOGID_MAGIC: + { + struct llog_logid_rec *lid = (struct llog_logid_rec *)rec; + + lustre_swab_llog_id(&lid->lid_id); + tail = &lid->lid_tail; + break; + } + case LLOG_GEN_REC: + { + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + + __swab64s(&lgr->lgr_gen.mnt_cnt); + __swab64s(&lgr->lgr_gen.conn_cnt); + tail = &lgr->lgr_tail; + break; + } + case LLOG_PAD_MAGIC: + break; + case UPDATE_REC: + { + struct llog_update_record *lur = + (struct llog_update_record *)rec; + struct update_records *record = &lur->lur_update_rec; + + __swab32s(&record->ur_flags); + 
__swab64s(&record->ur_batchid); + __swab64s(&record->ur_master_transno); + __swab32s(&record->ur_param_count); + __swab32s(&record->ur_update_count); + lustre_swab_update_ops(&record->ur_ops, + record->ur_update_count); + + /* Compute tail location. */ + tail = (struct llog_rec_tail *)((char *)record + + update_records_size(record)); + break; + } + default: + CERROR("Unknown llog rec type %#x swabbing rec %p\n", + rec->lrh_type, rec); + } + + if (tail) { + __swab32s(&tail->lrt_len); + __swab32s(&tail->lrt_index); + } +} + +static void print_llog_hdr(struct llog_log_hdr *h) +{ + CDEBUG(D_OTHER, "llog header: %p\n", h); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type); + CDEBUG(D_OTHER, "\tllh_timestamp: %#llx\n", h->llh_timestamp); + CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count); + CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset); + CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags); + CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size); + CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx); + CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", + LLOG_HDR_TAIL(h)->lrt_index); + CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", + LLOG_HDR_TAIL(h)->lrt_len); +} + +void lustre_swab_llog_hdr (struct llog_log_hdr *h) +{ + ENTRY; + print_llog_hdr(h); + + lustre_swab_llog_rec(&h->llh_hdr); + + print_llog_hdr(h); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llog_hdr); + +void print_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + + ENTRY; + + if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ + return; + + CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); + CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); + CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); + CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); + CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", + libcfs_nid2str(lcfg->lcfg_nid)); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount); + if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d %s\n", + i, lcfg->lcfg_buflens[i], + lustre_cfg_string(lcfg, i)); + } + + EXIT; +} +EXPORT_SYMBOL(print_lustre_cfg); + +void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + + ENTRY; + + __swab32s(&lcfg->lcfg_version); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { + CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", + lcfg->lcfg_version, LUSTRE_CFG_VERSION); + EXIT; + return; + } + + __swab32s(&lcfg->lcfg_command); + __swab32s(&lcfg->lcfg_num); + __swab32s(&lcfg->lcfg_flags); + __swab64s(&lcfg->lcfg_nid); + __swab32s(&lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) + __swab32s(&lcfg->lcfg_buflens[i]); + + print_lustre_cfg(lcfg); + EXIT; +} + +/* used only for compatibility with old on-disk cfg_marker data */ +struct cfg_marker32 { + __u32 cm_step; + __u32 cm_flags; + __u32 cm_vers; + __u32 padding; + __u32 cm_createtime; + __u32 cm_canceltime; + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +#define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \ + (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32))) + +void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size) +{ + struct cfg_marker32 *cm32 = (struct 
cfg_marker32 *)marker; + + ENTRY; + + if (swab) { + __swab32s(&marker->cm_step); + __swab32s(&marker->cm_flags); + __swab32s(&marker->cm_vers); + } + if (size == sizeof(*cm32)) { + __u32 createtime, canceltime; + /* + * There was a problem with the original declaration of + * cfg_marker on 32-bit systems because it used 32 time as + * a wire protocol structure, and didn't verify this in + * wirecheck. We now have to convert the offsets of the + * later fields in order to work on 32- and 64-bit systems. + * + * Fortunately, the cm_comment field has no functional use + * so can be sacrificed when converting the timestamp size. + * + * Overwrite fields from the end first, so they are not + * clobbered, and use memmove() instead of memcpy() because + * the source and target buffers overlap. bug 16771 + */ + createtime = cm32->cm_createtime; + canceltime = cm32->cm_canceltime; + memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); + marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; + memmove(marker->cm_tgtname, cm32->cm_tgtname, + sizeof(marker->cm_tgtname)); + if (swab) { + __swab32s(&createtime); + __swab32s(&canceltime); + } + marker->cm_createtime = createtime; + marker->cm_canceltime = canceltime; + CDEBUG(D_CONFIG, + "Find old cfg_marker(Srv32b,Clt64b) for target %s, converting\n", + marker->cm_tgtname); + } else if (swab) { + __swab64s(&marker->cm_createtime); + __swab64s(&marker->cm_canceltime); + } + + EXIT; +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c new file mode 100644 index 0000000000000..13d8531fa6d9b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c @@ -0,0 +1,2288 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/llog_test.c + * + * Author: Phil Schwan + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +#include +#include +#include + +/* This is slightly more than the number of records that can fit into a + * single llog file, because the llog_log_header takes up some of the + * space in the first block that cannot be used for the bitmap. 
*/ +static int llog_test_recnum = (LLOG_MIN_CHUNK_SIZE * 8); +static int llog_test_rand; +static struct obd_uuid uuid = { .uuid = "test_uuid" }; +static struct llog_logid cat_logid; + +struct llog_mini_rec { + struct llog_rec_hdr lmr_hdr; + struct llog_rec_tail lmr_tail; +} __attribute__((packed)); + +static int verify_handle(char *test, struct llog_handle *llh, int num_recs) +{ + int i; + int last_idx = 0; + int active_recs = 0; + + for (i = 0; i < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr); i++) { + if (test_bit_le(i, LLOG_HDR_BITMAP(llh->lgh_hdr))) { + last_idx = i; + active_recs++; + } + } + + /* check the llog is sane at first, llh_count and lgh_last_idx*/ + if (llh->lgh_hdr->llh_count != active_recs) { + CERROR("%s: handle->count is %d, but there are %d recs found\n", + test, llh->lgh_hdr->llh_count, active_recs); + RETURN(-ERANGE); + } + + if (llh->lgh_last_idx != LLOG_HDR_TAIL(llh->lgh_hdr)->lrt_index || + (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_CAT) && + llh->lgh_last_idx < last_idx)) { + CERROR("%s: lgh_last_idx is %d (%d in the header), last found %d\n", + test, llh->lgh_last_idx, + LLOG_HDR_TAIL(llh->lgh_hdr)->lrt_index, last_idx); + RETURN(-ERANGE); + } + + /* finally checks against expected value from the caller */ + if (active_recs != num_recs) { + CERROR("%s: expected %d active recs after write, found %d\n", + test, num_recs, active_recs); + RETURN(-ERANGE); + } + + RETURN(0); +} + +/* Test named-log create/open, close */ +static int llog_test_1(const struct lu_env *env, + struct obd_device *obd, char *name) +{ + struct llog_handle *llh; + struct llog_ctxt *ctxt; + int rc; + int rc2; + + ENTRY; + + CWARN("1a: create a log with name: %s\n", name); + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + rc = llog_open_create(env, ctxt, &llh, NULL, name); + if (rc) { + CERROR("1a: llog_create with name %s failed: %d\n", name, rc); + GOTO(out, rc); + } + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("1a: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + rc = verify_handle("1", llh, 1); + + CWARN("1b: close newly-created log\n"); +out_close: + rc2 = llog_close(env, llh); + if (rc2) { + CERROR("1b: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static int test_2_cancel_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + return LLOG_DEL_RECORD; +} + +/* Test named-log reopen; returns opened log on success */ +static int llog_test_2(const struct lu_env *env, struct obd_device *obd, + char *name, struct llog_handle **llh) +{ + struct llog_ctxt *ctxt; + struct llog_handle *lgh; + struct llog_logid logid; + int rc; + struct llog_mini_rec lmr; + + ENTRY; + + CWARN("2a: re-open a log with name: %s\n", name); + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + rc = llog_open(env, ctxt, llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("2a: re-open log with name %s failed: %d\n", name, rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, *llh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2a: can't init llog handle: %d\n", rc); + GOTO(out_close_llh, rc); + } + + rc = verify_handle("2", *llh, 1); + if (rc) + GOTO(out_close_llh, rc); + + CWARN("2b: create a log without specified NAME & LOGID\n"); + rc = llog_open_create(env, ctxt, &lgh, NULL, NULL); + if (rc) { + CERROR("2b: create log failed\n"); + GOTO(out_close_llh, rc); + } + rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, 
&uuid); + if (rc) { + CERROR("2b: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + logid = lgh->lgh_id; + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + + /* Check llog header values are correct after record add/cancel */ + CWARN("2b: write 1 llog records, check llh_count\n"); + rc = llog_write(env, lgh, &lmr.lmr_hdr, LLOG_NEXT_IDX); + if (rc < 0) + GOTO(out_close, rc); + + /* in-memory values after record addition */ + rc = verify_handle("2b", lgh, 2); + if (rc < 0) + GOTO(out_close, rc); + + /* re-open llog to read on-disk values */ + llog_close(env, lgh); + + CWARN("2c: re-open the log by LOGID and verify llh_count\n"); + rc = llog_open(env, ctxt, &lgh, &logid, NULL, LLOG_OPEN_EXISTS); + if (rc < 0) { + CERROR("2c: re-open log by LOGID failed\n"); + GOTO(out_close_llh, rc); + } + + rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, &uuid); + if (rc < 0) { + CERROR("2c: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + /* check values just read from disk */ + rc = verify_handle("2c", lgh, 2); + if (rc < 0) + GOTO(out_close, rc); + + rc = llog_process(env, lgh, test_2_cancel_cb, NULL, NULL); + if (rc < 0) + GOTO(out_close, rc); + + /* in-memory values */ + rc = verify_handle("2c", lgh, 1); + if (rc < 0) + GOTO(out_close, rc); + + /* re-open llog to get on-disk values */ + llog_close(env, lgh); + + rc = llog_open(env, ctxt, &lgh, &logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("2c: re-open log by LOGID failed\n"); + GOTO(out_close_llh, rc); + } + + rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2c: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + /* on-disk values after llog re-open */ + rc = verify_handle("2c", lgh, 1); + if (rc < 0) + GOTO(out_close, rc); + + CWARN("2d: destroy this log\n"); + rc = llog_destroy(env, lgh); + if (rc) + CERROR("2d: destroy log failed\n"); +out_close: + llog_close(env, lgh); +out_close_llh: + if (rc) + llog_close(env, *llh); +out_put: + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +static int test_3_rec_num; +static off_t test_3_rec_off; +static int test_3_paddings; +static int test_3_start_idx; + +/* + * Test 3 callback. + * - check lgh_cur_offset correctness + * - check record index consistency + * - modify each record in-place + * - add new record during *last_idx processing + */ +static int test3_check_n_add_cb(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + int *last_rec = data; + unsigned cur_idx = test_3_start_idx + test_3_rec_num; + int rc; + + if (lgh->lgh_hdr->llh_flags & LLOG_F_IS_FIXSIZE) { + LASSERT(lgh->lgh_hdr->llh_size > 0); + if (lgh->lgh_cur_offset != lgh->lgh_hdr->llh_hdr.lrh_len + + (cur_idx - 1) * lgh->lgh_hdr->llh_size) + CERROR("Wrong record offset in cur_off: %llu, should be %u\n", + lgh->lgh_cur_offset, + lgh->lgh_hdr->llh_hdr.lrh_len + + (cur_idx - 1) * lgh->lgh_hdr->llh_size); + } else { + size_t chunk_size = lgh->lgh_hdr->llh_hdr.lrh_len; + + /* + * For variable size records the start offset is unknown, trust + * the first value and check others are consistent with it. 
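+ * A chunk-aligned gap smaller than rec->lrh_len + LLOG_MIN_REC_SIZE is
+ * treated as a padding record and counted in test_3_paddings.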
+ */ + if (test_3_rec_off == 0) + test_3_rec_off = lgh->lgh_cur_offset; + + if (lgh->lgh_cur_offset != test_3_rec_off) { + __u64 tmp = lgh->lgh_cur_offset; + + /* there can be padding record */ + if ((do_div(tmp, chunk_size) == 0) && + (lgh->lgh_cur_offset - test_3_rec_off < + rec->lrh_len + LLOG_MIN_REC_SIZE)) { + test_3_rec_off = lgh->lgh_cur_offset; + test_3_paddings++; + } else { + CERROR("Wrong record offset in cur_off: %llu" + ", should be %lld (rec len %u)\n", + lgh->lgh_cur_offset, + (long long)test_3_rec_off, + rec->lrh_len); + } + } + test_3_rec_off += rec->lrh_len; + } + + cur_idx += test_3_paddings; + if (cur_idx != rec->lrh_index) + CERROR("Record with wrong index was read: %u, expected %u\n", + rec->lrh_index, cur_idx); + + /* modify all records in place */ + lgr->lgr_gen.conn_cnt = rec->lrh_index; + rc = llog_write(env, lgh, rec, rec->lrh_index); + if (rc < 0) + CERROR("cb_test_3: cannot modify record while processing\n"); + + /* + * Add new record to the llog at *last_rec position one by one to + * check that last block is re-read during processing + */ + if (cur_idx == *last_rec || cur_idx == (*last_rec + 1)) { + rc = llog_write(env, lgh, rec, LLOG_NEXT_IDX); + if (rc < 0) + CERROR("cb_test_3: cannot add new record while " + "processing\n"); + } + test_3_rec_num++; + + return rc; +} + +/* Check in-place modifications were done for all records*/ +static int test3_check_cb(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + + if (lgr->lgr_gen.conn_cnt != rec->lrh_index) { + CERROR("cb_test_3: record %u is not modified\n", + rec->lrh_index); + return -EINVAL; + } + test_3_rec_num++; + return 0; +} + +static int llog_test3_process(const struct lu_env *env, + struct llog_handle *lgh, + llog_cb_t cb, int start) +{ + struct llog_process_cat_data cd; + int last_idx; /* new record will be injected here */ + int rc = 0; + + CWARN("test3: processing records from index %d to the end\n", + start); + cd.lpcd_read_mode = LLOG_READ_MODE_NORMAL; + cd.lpcd_first_idx = start - 1; + cd.lpcd_last_idx = 0; + test_3_rec_num = test_3_paddings = 0; + last_idx = lgh->lgh_last_idx; + rc = llog_process(env, lgh, cb, &last_idx, &cd); + if (rc < 0) + return rc; + CWARN("test3: total %u records processed with %u paddings\n", + test_3_rec_num, test_3_paddings); + return test_3_rec_num; +} + +/* Test plain llog functionality */ +static int llog_test_3(const struct lu_env *env, struct obd_device *obd, + struct llog_handle *llh) +{ + char buf[128]; + struct llog_rec_hdr *hdr = (void *)buf; + int rc, i; + int num_recs = 1; /* 1 for the header */ + int expected; + + ENTRY; + + hdr->lrh_len = sizeof(struct llog_gen_rec); + hdr->lrh_type = LLOG_GEN_REC; + llh->lgh_hdr->llh_size = sizeof(struct llog_gen_rec); + llh->lgh_hdr->llh_flags |= LLOG_F_IS_FIXSIZE; + + /* + * Fill the llog with 64-bytes records, use 1023 records, + * so last chunk will be partially full. Don't change this + * value until record size is changed. 
+ */ + CWARN("3a: write 1023 fixed-size llog records\n"); + for (i = 0; i < 1023; i++) { + rc = llog_write(env, llh, hdr, LLOG_NEXT_IDX); + if (rc < 0) { + CERROR("3a: write 1023 records failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + + rc = verify_handle("3a", llh, num_recs); + if (rc) + RETURN(rc); + + /* + * Test fixed-size records processing: + * - search the needed index + * - go through all records from that index + * - check all indices are growing monotonically and exist + * - modify each record + * + * NB: test3_check_n_add adds two new records while processing + * after last record. There were 1023 records created so the last chunk + * misses exactly one record. Therefore one of new records will be + * the last in the current chunk and second causes the new chunk to be + * created. + */ + test_3_rec_off = 0; + test_3_start_idx = 501; + expected = 525; + rc = llog_test3_process(env, llh, test3_check_n_add_cb, + test_3_start_idx); + if (rc < 0) + RETURN(rc); + + /* extra record is created during llog_process() */ + if (rc != expected) { + CERROR("3a: process total %d records but expect %d\n", + rc, expected); + RETURN(-ERANGE); + } + + num_recs += 2; + + /* test modification in place */ + rc = llog_test3_process(env, llh, test3_check_cb, test_3_start_idx); + if (rc < 0) + RETURN(rc); + + if (rc != expected) { + CERROR("3a: process total %d records but expect %d\n", + rc, expected); + RETURN(-ERANGE); + } + + CWARN("3b: write 566 variable size llog records\n"); + + /* + * Drop llh_size to 0 to mark llog as variable-size and write + * header to make this change permanent. + */ + llh->lgh_hdr->llh_flags &= ~LLOG_F_IS_FIXSIZE; + llog_write(env, llh, &llh->lgh_hdr->llh_hdr, LLOG_HEADER_IDX); + + hdr->lrh_type = OBD_CFG_REC; + + /* + * there are 1025 64-bytes records in llog already, + * the last chunk contains single record, i.e. 64 bytes. + * Each pair of variable size records is 200 bytes, so + * we will have the following distribution per chunks: + * block 1: 64 + 80(80/120) + 80 + 48(pad) = 81 iterations + * block 2: 80(120/80) + 120 + 72(pad) = 81 itereations + * block 3: 80(80/120) + 80 + 112(pad) = 81 iterations + * -- the same as block 2 again and so on. + * block 7: 80(80/120) = 80 iterations and 192 bytes remain + * Total 6 * 81 + 80 = 566 itereations. + * Callback will add another 120 bytes in the end of the last chunk + * and another 120 bytes will cause padding (72 bytes) plus 120 + * bytes in the new block. 
+ */ + for (i = 0; i < 566; i++) { + if ((i % 2) == 0) + hdr->lrh_len = 80; + else + hdr->lrh_len = 120; + + rc = llog_write(env, llh, hdr, LLOG_NEXT_IDX); + if (rc < 0) { + CERROR("3b: write 566 records failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + + rc = verify_handle("3b", llh, num_recs); + if (rc) + RETURN(rc); + + test_3_start_idx = 1026; + expected = 568; + rc = llog_test3_process(env, llh, test3_check_n_add_cb, + test_3_start_idx); + if (rc < 0) + RETURN(rc); + + if (rc != expected) { + CERROR("3b: process total %d records but expect %d\n", + rc, expected); + RETURN(-ERANGE); + } + + num_recs += 2; + + /* test modification in place */ + rc = llog_test3_process(env, llh, test3_check_cb, test_3_start_idx); + if (rc < 0) + RETURN(rc); + + if (rc != expected) { + CERROR("3b: process total %d records but expect %d\n", + rc, expected); + RETURN(-ERANGE); + } + + CWARN("3c: write records with variable size until BITMAP_SIZE, " + "return -ENOSPC\n"); + while (num_recs < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) { + if ((num_recs % 2) == 0) + hdr->lrh_len = 80; + else + hdr->lrh_len = 128; + + rc = llog_write(env, llh, hdr, LLOG_NEXT_IDX); + if (rc == -ENOSPC) { + break; + } else if (rc < 0) { + CERROR("3c: write recs failed at #%d: %d\n", + num_recs, rc); + RETURN(rc); + } + num_recs++; + } + + if (rc != -ENOSPC) { + CWARN("3c: write record more than BITMAP size!\n"); + RETURN(-EINVAL); + } + CWARN("3c: wrote %d more records before end of llog is reached\n", + num_recs); + + rc = verify_handle("3c", llh, num_recs); + + RETURN(rc); +} + +/* Test catalogue additions */ +static int llog_test_4(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *cath, *llh; + char name[10]; + int rc, rc2, i, buflen; + struct llog_mini_rec lmr; + struct llog_cookie cookie; + struct llog_ctxt *ctxt; + int num_recs = 0; + char *buf; + struct llog_rec_hdr *rec; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + + sprintf(name, "%x", llog_test_rand + 1); + CWARN("4a: create a catalog log with name: %s\n", name); + rc = llog_open_create(env, ctxt, &cath, NULL, name); + if (rc) { + CERROR("4a: llog_create with name %s failed: %d\n", name, rc); + GOTO(ctxt_release, rc); + } + rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("4a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + num_recs++; + cat_logid = cath->lgh_id; + + CWARN("4b: write 1 record into the catalog\n"); + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, &cookie); + if (rc != 1) { + CERROR("4b: write 1 catalog record failed at: %d\n", rc); + GOTO(out, rc); + } + num_recs++; + rc = verify_handle("4b", cath, 2); + if (rc) + GOTO(out, rc); + + rc = verify_handle("4b", cath->u.chd.chd_current_log, num_recs); + if (rc) + GOTO(out, rc); + + /* estimate the max number of record for the plain llog + * cause it depends on disk size + */ + llh = cath->u.chd.chd_current_log; + if (llh->lgh_max_size != 0) { + llog_test_recnum = (llh->lgh_max_size - + sizeof(struct llog_log_hdr)) / LLOG_MIN_REC_SIZE; + } + + if (llog_test_recnum >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) + llog_test_recnum = LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1; + + CWARN("4c: cancel 1 log record\n"); + rc = llog_cat_cancel_records(env, cath, 1, &cookie); + if (rc) { + CERROR("4c: cancel 1 catalog based record failed: %d\n", rc); + GOTO(out, rc); + } + num_recs--; + + rc = 
verify_handle("4c", cath->u.chd.chd_current_log, num_recs); + if (rc) + GOTO(out, rc); + + CWARN("4d: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("4d: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + num_recs++; + } + + /* make sure new plain llog appears */ + rc = verify_handle("4d", cath, 3); + if (rc) + GOTO(out, rc); + + CWARN("4e: add 5 large records, one record per block\n"); + buflen = LLOG_MIN_CHUNK_SIZE; + OBD_ALLOC(buf, buflen); + if (buf == NULL) + GOTO(out, rc = -ENOMEM); + for (i = 0; i < 5; i++) { + rec = (void *)buf; + rec->lrh_len = buflen; + rec->lrh_type = OBD_CFG_REC; + rc = llog_cat_add(env, cath, rec, NULL); + if (rc) { + CERROR("4e: write 5 records failed at #%d: %d\n", + i + 1, rc); + GOTO(out_free, rc); + } + num_recs++; + } +out_free: + OBD_FREE(buf, buflen); +out: + CWARN("4f: put newly-created catalog\n"); + rc2 = llog_cat_close(env, cath); + if (rc2) { + CERROR("4: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static int cat_counter; + +static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct lu_fid fid = {0}; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&lir->lid_id, &fid); + + CWARN("seeing record at index %d - "DFID" in log "DFID"\n", + rec->lrh_index, PFID(&fid), + PFID(lu_object_fid(&llh->lgh_obj->do_lu))); + + cat_counter++; + + RETURN(0); +} + +static int plain_counter; + +static int plain_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct lu_fid fid = {0}; + + if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) { + CERROR("log is not plain\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&llh->lgh_id, &fid); + + CDEBUG(D_INFO, "seeing record at index %d in log "DFID"\n", + rec->lrh_index, PFID(&fid)); + + plain_counter++; + + RETURN(0); +} + +static int cancel_count; + +static int llog_cancel_rec_cb(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_cookie cookie; + + if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) { + CERROR("log is not plain\n"); + RETURN(-EINVAL); + } + + cookie.lgc_lgl = llh->lgh_id; + cookie.lgc_index = rec->lrh_index; + + llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1, &cookie); + cancel_count++; + if (cancel_count == llog_test_recnum) + RETURN(-LLOG_EEMPTY); + RETURN(0); +} + +/* Test log and catalogue processing */ +static int llog_test_5(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + + CWARN("5a: re-open catalog by id\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("5a: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("5a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + CWARN("5b: 
print the catalog entries.. we expect 2\n"); + cat_counter = 0; + rc = llog_process(env, llh, cat_print_cb, "test 5", NULL); + if (rc) { + CERROR("5b: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 2) { + CERROR("5b: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5c: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("5c: process with llog_cancel_rec_cb failed: %d\n", rc); + GOTO(out, rc); + } + + CWARN("5c: print the catalog entries.. we expect 1\n"); + cat_counter = 0; + rc = llog_process(env, llh, cat_print_cb, "test 5", NULL); + if (rc) { + CERROR("5c: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 1) { + CERROR("5c: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5d: add 1 record to the log with many canceled empty pages\n"); + rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("5d: add record to the log with many canceled empty " + "pages failed\n"); + GOTO(out, rc); + } + + CWARN("5e: print plain log entries.. expect 6\n"); + plain_counter = 0; + rc = llog_cat_process(env, llh, plain_print_cb, "foobar", 0, 0); + if (rc) { + CERROR("5e: process with plain_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (plain_counter != 6) { + CERROR("5e: found %d records\n", plain_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5f: print plain log entries reversely.. expect 6\n"); + plain_counter = 0; + rc = llog_cat_reverse_process(env, llh, plain_print_cb, "foobar"); + if (rc) { + CERROR("5f: reversely process with plain_print_cb failed: " + "%d\n", rc); + GOTO(out, rc); + } + if (plain_counter != 6) { + CERROR("5f: found %d records\n", plain_counter); + GOTO(out, rc = -EINVAL); + } + +out: + CWARN("5g: close re-opened catalog\n"); + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("5g: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out_put: + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +/* Test client api; open log by name and process */ +static int llog_test_6(const struct lu_env *env, struct obd_device *obd, + char *name) +{ + struct obd_device *mgc_obd; + struct llog_ctxt *ctxt; + struct obd_uuid *mgs_uuid; + struct obd_export *exp; + struct obd_uuid uuid = { "LLOG_TEST6_UUID" }; + struct llog_handle *llh = NULL; + struct llog_ctxt *nctxt; + int rc, rc2; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + mgs_uuid = &ctxt->loc_exp->exp_obd->obd_uuid; + + CWARN("6a: re-open log %s using client API\n", name); + mgc_obd = class_find_client_obd(mgs_uuid, LUSTRE_MGC_NAME, NULL); + if (mgc_obd == NULL) { + CERROR("6a: no MGC devices connected to %s found.\n", + mgs_uuid->uuid); + GOTO(ctxt_release, rc = -ENOENT); + } + + rc = obd_connect(NULL, &exp, mgc_obd, &uuid, + NULL /* obd_connect_data */, NULL); + if (rc != -EALREADY) { + CERROR("6a: connect on connected MGC (%s) failed to return" + " -EALREADY\n", mgc_obd->obd_name); + if (rc == 0) + obd_disconnect(exp); + GOTO(ctxt_release, rc = -EINVAL); + } + + nctxt = llog_get_context(mgc_obd, LLOG_CONFIG_REPL_CTXT); + rc = llog_open(env, nctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("6a: llog_open failed %d\n", rc); + GOTO(nctxt_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) { + CERROR("6a: llog_init_handle failed %d\n", rc); + 
GOTO(parse_out, rc); + } + + plain_counter = 1; /* llog header is first record */ + CWARN("6b: process log %s using client API\n", name); + rc = llog_process(env, llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6b: llog_process failed %d\n", rc); + CWARN("6b: processed %d records\n", plain_counter); + + rc = verify_handle("6b", llh, plain_counter); + if (rc) + GOTO(parse_out, rc); + + plain_counter = 1; /* llog header is first record */ + CWARN("6c: process log %s reversely using client API\n", name); + rc = llog_reverse_process(env, llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6c: llog_reverse_process failed %d\n", rc); + CWARN("6c: processed %d records\n", plain_counter); + + rc = verify_handle("6c", llh, plain_counter); + if (rc) + GOTO(parse_out, rc); + +parse_out: + rc2 = llog_close(env, llh); + if (rc2) { + CERROR("6: llog_close failed: rc = %d\n", rc2); + if (rc == 0) + rc = rc2; + } +nctxt_put: + llog_ctxt_put(nctxt); +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static union { + struct llog_rec_hdr lrh; /* common header */ + struct llog_logid_rec llr; /* LLOG_LOGID_MAGIC */ + struct llog_unlink64_rec lur; /* MDS_UNLINK64_REC */ + struct llog_setattr64_rec lsr64; /* MDS_SETATTR64_REC */ + struct llog_setattr64_rec_v2 lsr64_v2; /* MDS_SETATTR64_REC */ + struct llog_size_change_rec lscr; /* OST_SZ_REC */ + struct llog_changelog_rec lcr; /* CHANGELOG_REC */ + struct llog_changelog_user_rec2 lcur; /* CHANGELOG_USER_REC2 */ + struct llog_gen_rec lgr; /* LLOG_GEN_REC */ +} llog_records; + +static int test_7_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct lu_fid fid = {0}; + + logid_to_fid(&llh->lgh_id, &fid); + + CDEBUG(D_OTHER, "record type %#x at index %d in log "DFID"\n", + rec->lrh_type, rec->lrh_index, PFID(&fid)); + + plain_counter++; + return 0; +} + +static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + plain_counter++; + /* test LLOG_DEL_RECORD is working */ + return LLOG_DEL_RECORD; +} + +static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct llog_handle *llh; + int rc = 0, i, process_count; + int num_recs = 0; + + ENTRY; + + rc = llog_open_create(env, ctxt, &llh, NULL, NULL); + if (rc) { + CERROR("7_sub: create log failed\n"); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &uuid); + if (rc) { + CERROR("7_sub: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + for (i = 0; i < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr); i++) { + rc = llog_write(env, llh, &llog_records.lrh, LLOG_NEXT_IDX); + if (rc == -ENOSPC) { + break; + } else if (rc < 0) { + CERROR("7_sub: write recs failed at #%d: %d\n", + i + 1, rc); + GOTO(out_close, rc); + } + num_recs++; + } + if (rc != -ENOSPC) { + CWARN("7_sub: write record more than BITMAP size!\n"); + GOTO(out_close, rc = -EINVAL); + } + + rc = verify_handle("7_sub", llh, num_recs + 1); + if (rc) { + CERROR("7_sub: verify handle failed: %d\n", rc); + GOTO(out_close, rc); + } + if (num_recs < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1) + CWARN("7_sub: records are not aligned, written %d from %u\n", + num_recs, LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1); + + plain_counter = 0; + rc = llog_process(env, llh, test_7_print_cb, "test 7", NULL); + if (rc) { + CERROR("7_sub: llog process failed: %d\n", rc); + GOTO(out_close, rc); + } + process_count = plain_counter; + if (process_count != num_recs) { + CERROR("7_sub: 
processed %d records from %d total\n", + process_count, num_recs); + GOTO(out_close, rc = -EINVAL); + } + + plain_counter = 0; + rc = llog_reverse_process(env, llh, test_7_cancel_cb, "test 7", NULL); + if (rc && rc != LLOG_DEL_PLAIN) { + CERROR("7_sub: reverse llog process failed: %d\n", rc); + GOTO(out_close, rc); + } + if (process_count != plain_counter) { + CERROR("7_sub: Reverse/direct processing found different number of records: %d/%d\n", + plain_counter, process_count); + GOTO(out_close, rc = -EINVAL); + } + if (llog_exist(llh)) { + CERROR("7_sub: llog exists but should be zapped\n"); + GOTO(out_close, rc = -EEXIST); + } + + rc = verify_handle("7_sub", llh, 1); +out_close: + if (rc) + llog_destroy(env, llh); + llog_close(env, llh); + RETURN(rc); +} + +/* Test all llog records writing and processing */ +static int llog_test_7(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + + CWARN("7a: test llog_logid_rec\n"); + llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr); + llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr); + llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7a: llog_logid_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7b: test llog_unlink64_rec\n"); + llog_records.lur.lur_hdr.lrh_len = sizeof(llog_records.lur); + llog_records.lur.lur_tail.lrt_len = sizeof(llog_records.lur); + llog_records.lur.lur_hdr.lrh_type = MDS_UNLINK64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7b: llog_unlink64_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7c: test llog_setattr64_rec\n"); + llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64); + llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64); + llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7c: llog_setattr64_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7d: test llog_size_change_rec\n"); + llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_hdr.lrh_type = OST_SZ_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7d: llog_size_change_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7e: test llog_changelog_rec\n"); + /* Direct access to cr_do_not_use: peculiar case for this test */ + llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_do_not_use.lrt_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7e: llog_changelog_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7f: test llog_changelog_user_rec2\n"); + llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC2; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7f: llog_changelog_user_rec2 test failed\n"); + GOTO(out, rc); + } + + CWARN("7g: test llog_gen_rec\n"); + llog_records.lgr.lgr_hdr.lrh_len = sizeof(llog_records.lgr); + llog_records.lgr.lgr_tail.lrt_len = sizeof(llog_records.lgr); + llog_records.lgr.lgr_hdr.lrh_type = LLOG_GEN_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7g: llog_gen_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7h: test 
llog_setattr64_rec_v2\n"); + llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64_v2); + llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64_v2); + llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7h: llog_setattr64_rec_v2 test failed\n"); + GOTO(out, rc); + } +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static int test_8_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + plain_counter++; + return 0; +} + +static int llog_test_8(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2, i; + int orig_counter; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct dt_object *obj = NULL; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + + CWARN("8a: fill the first plain llog\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("8a: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("8a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + plain_counter = 0; + rc = llog_cat_process(env, llh, test_8_cb, "foobar", 0, 0); + if (rc != 0) { + CERROR("8a: process with test_8_cb failed: %d\n", rc); + GOTO(out, rc); + } + orig_counter = plain_counter; + + for (i = 0; i < 100; i++) { + rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("8a: add record failed\n"); + GOTO(out, rc); + } + } + + /* grab the current plain llog, we'll corrupt it later */ + obj = llh->u.chd.chd_current_log->lgh_obj; + LASSERT(obj); + lu_object_get(&obj->do_lu); + CWARN("8a: pin llog "DFID"\n", PFID(lu_object_fid(&obj->do_lu))); + + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("8a: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + GOTO(out_put, rc); + } + + CWARN("8b: fill the second plain llog\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("8b: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("8b: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + for (i = 0; i < 100; i++) { + rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("8b: add record failed\n"); + GOTO(out, rc); + } + } + CWARN("8b: second llog "DFID"\n", + PFID(lu_object_fid(&llh->u.chd.chd_current_log->lgh_obj->do_lu))); + + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("8b: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + GOTO(out_put, rc); + } + + /* Here was 8c: drop two records from the first plain llog. + * llog_truncate was a bad idea because it creates a wrong state, + * lgh_last_idx is wrong and two records belong to a zeroed buffer + */ + + CWARN("8d: count survived records\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("8d: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("8d: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + plain_counter = 0; + rc = llog_cat_process(env, llh, test_8_cb, "foobar", 0, 0); + if (rc != 0) { + CERROR("8d: process with test_8_cb 
failed: %d\n", rc); + GOTO(out, rc); + } + + if (orig_counter + 200 != plain_counter) { + CERROR("found %d records (expected %d)\n", plain_counter, + orig_counter + 200); + rc = -EIO; + } + +out: + CWARN("8d: close re-opened catalog\n"); + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("8d: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out_put: + llog_ctxt_put(ctxt); + + if (obj != NULL) + dt_object_put(env, obj); + + RETURN(rc); +} + +static int llog_test_9_sub(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct llog_handle *llh; + struct lu_fid fid; + int rc = 0; + + ENTRY; + + rc = llog_open_create(env, ctxt, &llh, NULL, NULL); + if (rc != 0) { + CERROR("9_sub: create log failed\n"); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &uuid); + if (rc != 0) { + CERROR("9_sub: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + logid_to_fid(&llh->lgh_id, &fid); + fid_to_logid(&fid, &llog_records.llr.lid_id); + rc = llog_write(env, llh, &llog_records.lrh, LLOG_NEXT_IDX); + if (rc < 0) { + CERROR("9_sub: write recs failed at #1: %d\n", rc); + GOTO(out_close, rc); + } + /* The below message is checked in sanity.sh test_60a (run-llog.sh) */ + CWARN("9_sub: record type %x in log "DFID_NOBRACE"\n", + llog_records.lrh.lrh_type, PFID(&fid)); +out_close: + llog_close(env, llh); + RETURN(rc); +} + +/* Prepare different types of llog records for llog_reader test*/ +static int llog_test_9(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + + CWARN("9a: test llog_logid_rec\n"); + llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr); + llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr); + llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9a: llog_logid_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("9b: test llog_obd_cfg_rec\n"); + llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_hdr.lrh_type = OBD_CFG_REC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9b: llog_obd_cfg_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("9c: test llog_changelog_rec\n"); + /* Direct access to cr_do_not_use: peculiar case for this test */ + llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_do_not_use.lrt_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9c: llog_changelog_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("9d: test llog_changelog_user_rec2\n"); + llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9d: llog_changelog_user_rec test failed\n"); + GOTO(out, rc); + } + +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +struct llog_process_info { + struct llog_handle *lpi_loghandle; + llog_cb_t lpi_cb; + void *lpi_cbdata; + void *lpi_catdata; + int lpi_rc; + struct completion lpi_completion; + const struct lu_env *lpi_env; + struct task_struct *lpi_reftask; +}; + + +static int llog_test_process_thread(void *arg) +{ + struct llog_process_info *lpi = arg; + int rc; + + rc = 
llog_cat_process_or_fork(NULL, lpi->lpi_loghandle, lpi->lpi_cb, + NULL, lpi->lpi_cbdata, 1, 0, true); + + complete(&lpi->lpi_completion); + + lpi->lpi_rc = rc; + if (rc) + CWARN("10h: Error during catalog processing %d\n", rc); + return rc; +} + +static int cat_check_old_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct lu_fid fid = {0}; + struct lu_fid *prev_fid = data; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&lir->lid_id, &fid); + + CWARN("seeing record at index %d - "DFID" in log "DFID"\n", + rec->lrh_index, PFID(&fid), + PFID(lu_object_fid(&llh->lgh_obj->do_lu))); + + if (prev_fid->f_oid > fid.f_oid) { + CWARN("processing old record, fail\n"); + prev_fid->f_oid = 0xbad; + RETURN(-LLOG_EEMPTY); + } + + if (prev_fid->f_oid == 0) { + cfs_fail_loc = OBD_FAIL_ONCE | OBD_FAIL_LLOG_PROCESS_TIMEOUT; + cfs_fail_val = (unsigned int) (llh->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF); + msleep(1 * MSEC_PER_SEC); + } + *prev_fid = fid; + + RETURN(0); +} + +/* test catalog wrap around */ +static int llog_test_10(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *cath; + char name[10]; + int rc, rc2, i, enospc, eok; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct lu_attr la; + __u64 cat_max_size; + struct dt_device *dt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + + snprintf(name, sizeof(name), "%x", llog_test_rand + 2); + CWARN("10a: create a catalog log with name: %s\n", name); + rc = llog_open_create(env, ctxt, &cath, NULL, name); + if (rc) { + CERROR("10a: llog_create with name %s failed: %d\n", name, rc); + GOTO(ctxt_release, rc); + } + rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("10a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + cat_logid = cath->lgh_id; + dt = lu2dt_dev(cath->lgh_obj->do_lu.lo_dev); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10c: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* force catalog wrap for 5th plain LLOG */ + cfs_fail_loc = CFS_FAIL_SKIP|OBD_FAIL_CAT_RECORDS; + cfs_fail_val = 4; + + CWARN("10b: write %d log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("10b: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + } + + /* make sure 2 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10b", cath, 3); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10b: sync failed: %d\n", rc); + GOTO(out, rc); + } + + CWARN("10c: write %d more log records\n", 2 * llog_test_recnum); + for (i = 0; i < 2 * llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("10c: write %d records failed at #%d: %d\n", + 2*llog_test_recnum, i + 1, rc); + GOTO(out, rc); 
+ } + } + + /* make sure 2 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10c", cath, 5); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10c: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* + * fill last allocated plain LLOG and reach -ENOSPC condition + * because no slot available in Catalog + */ + enospc = 0; + eok = 0; + CWARN("10c: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10c: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + CERROR("10c: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10c: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + /* make sure no new record in Catalog */ + rc = verify_handle("10c", cath, 5); + if (rc) + GOTO(out, rc); + + /* Catalog should have reached its max size for test */ + rc = dt_attr_get(env, cath->lgh_obj, &la); + if (rc) { + CERROR("10c: failed to get catalog attrs: %d\n", rc); + GOTO(out, rc); + } + cat_max_size = la.la_size; + + /* + * cancel all 1st plain llog records to empty it, this will also cause + * its catalog entry to be freed for next forced wrap in 10e + */ + CWARN("10d: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10d: process with llog_cancel_rec_cb failed: %d\n", rc); + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10d: print the catalog entries.. 
we expect 3\n"); + cat_counter = 0; + rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + if (rc) { + CERROR("10d: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 3) { + CERROR("10d: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10d", cath, 4); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10d: sync failed: %d\n", rc); + GOTO(out, rc); + } + + enospc = 0; + eok = 0; + CWARN("10e: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10e: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + CERROR("10e: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10e: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + CWARN("10e: print the catalog entries.. we expect 4\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10e: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 4) { + CERROR("10e: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* make sure 1 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10e", cath, 5); + if (rc) + GOTO(out, rc); + + /* verify catalog has wrap around */ + if (cath->lgh_last_idx > cath->lgh_hdr->llh_cat_idx) { + CERROR("10e: catalog failed to wrap around\n"); + GOTO(out, rc = -EINVAL); + } + + rc = dt_attr_get(env, cath->lgh_obj, &la); + if (rc) { + CERROR("10e: failed to get catalog attrs: %d\n", rc); + GOTO(out, rc); + } + + if (la.la_size != cat_max_size) { + CERROR("10e: catalog size has changed after it has wrap around," + " current size = %llu, expected size = %llu\n", + la.la_size, cat_max_size); + GOTO(out, rc = -EINVAL); + } + CWARN("10e: catalog successfully wrap around, last_idx %d, first %d\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10e: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* + * cancel more records to free one more slot in Catalog + * see if it is re-allocated when adding more records + */ + CWARN("10f: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10f: process with llog_cancel_rec_cb failed: %d\n", rc); + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10f: print the catalog entries.. 
we expect 3\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10f: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 3) { + CERROR("10f: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10f", cath, 4); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10f: sync failed: %d\n", rc); + GOTO(out, rc); + } + + enospc = 0; + eok = 0; + CWARN("10f: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10f: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + CERROR("10f: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10f: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + /* make sure 1 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10f", cath, 5); + if (rc) + GOTO(out, rc); + + /* verify lgh_last_idx = llh_cat_idx = 2 now */ + if (cath->lgh_last_idx != cath->lgh_hdr->llh_cat_idx || + cath->lgh_last_idx != 2) { + CERROR("10f: lgh_last_idx = %d vs 2, llh_cat_idx = %d vs 2\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + GOTO(out, rc = -EINVAL); + } + + rc = dt_attr_get(env, cath->lgh_obj, &la); + if (rc) { + CERROR("10f: failed to get catalog attrs: %d\n", rc); + GOTO(out, rc); + } + + if (la.la_size != cat_max_size) { + CERROR("10f: catalog size has changed after it has wrap around," + " current size = %llu, expected size = %llu\n", + la.la_size, cat_max_size); + GOTO(out, rc = -EINVAL); + } + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10f: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* will llh_cat_idx also successfully wrap ? */ + + /* + * cancel all records in the plain LLOGs referenced by 2 last indexes in + * Catalog + */ + + /* cancel more records to free one more slot in Catalog */ + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); + /* need to indicate error if for any reason llog_test_recnum is + * not reached */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10g: print the catalog entries.. 
we expect 3\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10g: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 3) { + CERROR("10g: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10g", cath, 4); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10g: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* cancel more records to free one more slot in Catalog */ + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10g: print the catalog entries.. we expect 2\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10g: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 2) { + CERROR("10g: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10g", cath, 3); + if (rc) + GOTO(out, rc); + + /* verify lgh_last_idx = 2 and llh_cat_idx = 0 now */ + if (cath->lgh_hdr->llh_cat_idx != 0 || + cath->lgh_last_idx != 2) { + CERROR("10g: lgh_last_idx = %d vs 2, llh_cat_idx = %d vs 0\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + GOTO(out, rc = -EINVAL); + } + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10g: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* cancel more records to free one more slot in Catalog */ + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10g: print the catalog entries.. 
we expect 1\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10g: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 1) { + CERROR("10g: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10g", cath, 2); + if (rc) + GOTO(out, rc); + + /* verify lgh_last_idx = 2 and llh_cat_idx = 1 now */ + if (cath->lgh_hdr->llh_cat_idx != 1 || + cath->lgh_last_idx != 2) { + CERROR("10g: lgh_last_idx = %d vs 2, llh_cat_idx = %d vs 1\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + GOTO(out, rc = -EINVAL); + } + + CWARN("10g: llh_cat_idx has also successfully wrapped!\n"); + + /* + * catalog has only one valid entry other slots has outdated + * records. Trying to race the llog_thread_process with llog_add + * llog_thread_process read buffer and loop record on it. + * llog_add adds a record and mark a record in bitmap. + * llog_thread_process process record with old data. + */ + { + struct llog_process_info lpi; + struct lu_fid test_fid = {0}; + + lpi.lpi_loghandle = cath; + lpi.lpi_cb = cat_check_old_cb; + lpi.lpi_catdata = NULL; + lpi.lpi_cbdata = &test_fid; + init_completion(&lpi.lpi_completion); + + kthread_run(llog_test_process_thread, &lpi, "llog_test_process_thread"); + + msleep(1 * MSEC_PER_SEC / 2); + enospc = 0; + eok = 0; + CWARN("10h: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10h: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + CERROR("10h: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10h: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + wait_for_completion(&lpi.lpi_completion); + + if (lpi.lpi_rc != 0) { + CERROR("10h: race happened, old record was processed\n"); + GOTO(out, rc = -EINVAL); + } + } +out: + cfs_fail_loc = 0; + cfs_fail_val = 0; + + CWARN("10: put newly-created catalog\n"); + rc2 = llog_cat_close(env, cath); + if (rc2) { + CERROR("10: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +/* + * ------------------------------------------------------------------------- + * Tests above, boring obd functions below + * ------------------------------------------------------------------------- + */ +static int llog_run_tests(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + struct llog_ctxt *ctxt; + int rc, err; + char name[10]; + + ENTRY; + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + sprintf(name, "%x", llog_test_rand); + + rc = llog_test_1(env, obd, name); + if (rc) + GOTO(cleanup_ctxt, rc); + + rc = llog_test_2(env, obd, name, &llh); + if (rc) + GOTO(cleanup_ctxt, rc); + + rc = llog_test_3(env, obd, llh); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_4(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_5(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_6(env, obd, name); + if (rc) + GOTO(cleanup, 
rc); + + rc = llog_test_7(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_8(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_9(env, obd); + if (rc != 0) + GOTO(cleanup, rc); + + rc = llog_test_10(env, obd); + if (rc) + GOTO(cleanup, rc); + +cleanup: + err = llog_destroy(env, llh); + if (err) + CERROR("cleanup: llog_destroy failed: %d\n", err); + llog_close(env, llh); + if (rc == 0) + rc = err; +cleanup_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +static int llog_test_cleanup(struct obd_device *obd) +{ + struct obd_device *tgt; + struct lu_env env; + int rc; + + ENTRY; + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + RETURN(rc); + + tgt = obd->obd_lvfs_ctxt.dt->dd_lu_dev.ld_obd; + rc = llog_cleanup(&env, llog_get_context(tgt, LLOG_TEST_ORIG_CTXT)); + if (rc) + CERROR("failed to llog_test_llog_finish: %d\n", rc); + lu_env_fini(&env); + RETURN(rc); +} + +static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_device *tgt; + struct llog_ctxt *ctxt; + struct dt_object *o; + struct lu_env env; + struct lu_context test_session; + int rc; + + ENTRY; + + if (lcfg->lcfg_bufcount < 2) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + if (lcfg->lcfg_buflens[1] < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + /* disk obd */ + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("target device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + RETURN(rc); + + rc = lu_context_init(&test_session, LCT_SERVER_SESSION); + if (rc) + GOTO(cleanup_env, rc); + test_session.lc_thread = (struct ptlrpc_thread *)current; + lu_context_enter(&test_session); + env.le_ses = &test_session; + + CWARN("Setup llog-test device over %s device\n", + lustre_cfg_string(lcfg, 1)); + + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + obd->obd_lvfs_ctxt.dt = lu2dt_dev(tgt->obd_lu_dev); + + rc = llog_setup(&env, tgt, &tgt->obd_olg, LLOG_TEST_ORIG_CTXT, tgt, + &llog_osd_ops); + if (rc) + GOTO(cleanup_session, rc); + + /* use MGS llog dir for tests */ + ctxt = llog_get_context(tgt, LLOG_CONFIG_ORIG_CTXT); + LASSERT(ctxt); + o = ctxt->loc_dir; + llog_ctxt_put(ctxt); + + ctxt = llog_get_context(tgt, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + ctxt->loc_dir = o; + llog_ctxt_put(ctxt); + + llog_test_rand = get_random_u32(); + + rc = llog_run_tests(&env, tgt); + if (rc) + llog_test_cleanup(obd); +cleanup_session: + lu_context_exit(&test_session); + lu_context_fini(&test_session); +cleanup_env: + lu_env_fini(&env); + RETURN(rc); +} + +static const struct obd_ops llog_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = llog_test_setup, + .o_cleanup = llog_test_cleanup, +}; + +static int __init llog_test_init(void) +{ + return class_register_type(&llog_obd_ops, NULL, false, + "llog_test", NULL); +} + +static void __exit llog_test_exit(void) +{ + class_unregister_type("llog_test"); +} + +MODULE_AUTHOR("OpenSFS, Inc. 
"); +MODULE_DESCRIPTION("Lustre Log test module"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(llog_test_init); +module_exit(llog_test_exit); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c new file mode 100644 index 0000000000000..2e2a0c4f5deff --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c @@ -0,0 +1,987 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/obdclass/local_storage.c + * + * Local storage for file/objects with fid generation. Works on top of OSD. + * + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "local_storage.h" + +/* all initialized local storages on this node are linked on this */ +static LIST_HEAD(ls_list_head); +static DEFINE_MUTEX(ls_list_mutex); + +static int ls_object_init(const struct lu_env *env, struct lu_object *o, + const struct lu_object_conf *unused) +{ + struct ls_device *ls; + struct lu_object *below; + struct lu_device *under; + + ENTRY; + + ls = container_of(o->lo_dev, struct ls_device, ls_top_dev.dd_lu_dev); + under = &ls->ls_osd->dd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under); + if (below == NULL) + RETURN(-ENOMEM); + + lu_object_add(o, below); + + RETURN(0); +} + +static void ls_object_free(const struct lu_env *env, struct lu_object *o) +{ + struct ls_object *obj = lu2ls_obj(o); + struct lu_object_header *h = o->lo_header; + + dt_object_fini(&obj->ls_obj); + lu_object_header_fini(h); + OBD_FREE_PRE(obj, sizeof(*obj), "kfreed"); + kfree_rcu(obj, ls_header.loh_rcu); +} + +static const struct lu_object_operations ls_lu_obj_ops = { + .loo_object_init = ls_object_init, + .loo_object_free = ls_object_free, +}; + +static struct lu_object *ls_object_alloc(const struct lu_env *env, + const struct lu_object_header *_h, + struct lu_device *d) +{ + struct lu_object_header *h; + struct ls_object *o; + struct lu_object *l; + + LASSERT(_h == NULL); + + OBD_ALLOC_PTR(o); + if (o != NULL) { + l = &o->ls_obj.do_lu; + h = &o->ls_header; + + lu_object_header_init(h); + dt_object_init(&o->ls_obj, h, d); + lu_object_add_top(h, l); + + l->lo_ops = &ls_lu_obj_ops; + + return l; + } else { + return NULL; + } +} + +static const struct lu_device_operations ls_lu_dev_ops = { + .ldo_object_alloc = ls_object_alloc +}; + +static struct ls_device *__ls_find_dev(struct dt_device *dev) +{ + struct ls_device *ls, *ret = NULL; + + list_for_each_entry(ls, &ls_list_head, ls_linkage) { + if (ls->ls_osd == dev) { + atomic_inc(&ls->ls_refcount); + ret = ls; + break; + } + } + return ret; +} + +struct 
ls_device *ls_find_dev(struct dt_device *dev) +{ + struct ls_device *ls; + + mutex_lock(&ls_list_mutex); + ls = __ls_find_dev(dev); + mutex_unlock(&ls_list_mutex); + + return ls; +} + +static const struct lu_device_type_operations ls_device_type_ops = { + .ldto_start = NULL, + .ldto_stop = NULL, +}; + +static struct lu_device_type ls_lu_type = { + .ldt_name = "local_storage", + .ldt_ops = &ls_device_type_ops, +}; + +struct ls_device *ls_device_get(struct dt_device *dev) +{ + struct ls_device *ls; + + ENTRY; + + mutex_lock(&ls_list_mutex); + ls = __ls_find_dev(dev); + if (ls) + GOTO(out_ls, ls); + + /* not found, then create */ + OBD_ALLOC_PTR(ls); + if (ls == NULL) + GOTO(out_ls, ls = ERR_PTR(-ENOMEM)); + + atomic_set(&ls->ls_refcount, 1); + INIT_LIST_HEAD(&ls->ls_los_list); + mutex_init(&ls->ls_los_mutex); + + ls->ls_osd = dev; + + LASSERT(dev->dd_lu_dev.ld_site); + lu_device_init(&ls->ls_top_dev.dd_lu_dev, &ls_lu_type); + ls->ls_top_dev.dd_lu_dev.ld_ops = &ls_lu_dev_ops; + ls->ls_top_dev.dd_lu_dev.ld_site = dev->dd_lu_dev.ld_site; + + /* finally add ls to the list */ + list_add(&ls->ls_linkage, &ls_list_head); +out_ls: + mutex_unlock(&ls_list_mutex); + RETURN(ls); +} + +void ls_device_put(const struct lu_env *env, struct ls_device *ls) +{ + LASSERT(env); + if (!atomic_dec_and_test(&ls->ls_refcount)) + return; + + mutex_lock(&ls_list_mutex); + if (atomic_read(&ls->ls_refcount) == 0) { + LASSERT(list_empty(&ls->ls_los_list)); + list_del(&ls->ls_linkage); + lu_site_purge(env, ls->ls_top_dev.dd_lu_dev.ld_site, ~0); + lu_device_fini(&ls->ls_top_dev.dd_lu_dev); + OBD_FREE_PTR(ls); + } + mutex_unlock(&ls_list_mutex); +} + +/** + * local file fid generation + */ +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid) +{ + LASSERT(los->los_dev); + LASSERT(los->los_obj); + + /* take next OID */ + + /* to make it unique after reboot we store + * the latest generated fid atomically with + * object creation see local_object_create() */ + + mutex_lock(&los->los_id_lock); + fid->f_seq = los->los_seq; + fid->f_oid = ++los->los_last_oid; + fid->f_ver = 0; + mutex_unlock(&los->los_id_lock); + + return 0; +} + +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th) +{ + struct dt_thread_info *dti = dt_info(env); + int rc; + + ENTRY; + + /* update fid generation file */ + if (los != NULL) { + LASSERT(dt_object_exists(los->los_obj)); + dti->dti_lb.lb_buf = NULL; + dti->dti_lb.lb_len = sizeof(struct los_ondisk); + rc = dt_declare_record_write(env, los->los_obj, + &dti->dti_lb, 0, th); + if (rc) + RETURN(rc); + } + + rc = dt_declare_create(env, o, attr, NULL, dof, th); + if (rc) + RETURN(rc); + + dti->dti_lb.lb_buf = NULL; + dti->dti_lb.lb_len = sizeof(dti->dti_lma); + rc = dt_declare_xattr_set(env, o, &dti->dti_lb, XATTR_NAME_LMA, 0, th); + + RETURN(rc); +} + +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, struct lu_attr *attr, + struct dt_object_format *dof, struct thandle *th) +{ + struct dt_thread_info *dti = dt_info(env); + u64 lastid; + int rc; + + ENTRY; + + rc = dt_create(env, o, attr, NULL, dof, th); + if (rc) + RETURN(rc); + + if (los == NULL) + RETURN(rc); + + LASSERT(los->los_obj); + LASSERT(dt_object_exists(los->los_obj)); + + /* many threads can be updated this, serialize + * them here to avoid the race where one thread + * takes the value first, but 
writes it last */ + mutex_lock(&los->los_id_lock); + + /* update local oid number on disk so that + * we know the last one used after reboot */ + lastid = cpu_to_le64(los->los_last_oid); + + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + rc = dt_record_write(env, los->los_obj, &dti->dti_lb, &dti->dti_off, + th); + mutex_unlock(&los->los_id_lock); + + RETURN(rc); +} + +/* + * Create local named object (file, directory or index) in parent directory. + */ +static struct dt_object *__local_file_create(const struct lu_env *env, + const struct lu_fid *fid, + struct local_oid_storage *los, + struct ls_device *ls, + struct dt_object *parent, + const char *name, + struct lu_attr *attr, + struct dt_object_format *dof) +{ + struct dt_thread_info *dti = dt_info(env); + struct lu_object_conf *conf = &dti->dti_conf; + struct dt_insert_rec *rec = &dti->dti_dt_rec; + struct dt_object *dto; + struct thandle *th; + int rc; + + /* We know that the target object does not exist, to be created, + * then give some hints - LOC_F_NEW to help low layer to handle + * that efficiently and properly. */ + memset(conf, 0, sizeof(*conf)); + conf->loc_flags = LOC_F_NEW; + dto = ls_locate(env, ls, fid, conf); + if (unlikely(IS_ERR(dto))) + RETURN(dto); + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + GOTO(out, rc = -EEXIST); + + th = dt_trans_create(env, ls->ls_osd); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = local_object_declare_create(env, los, dto, attr, dof, th); + if (rc) + GOTO(trans_stop, rc); + + if (dti->dti_dof.dof_type == DFT_DIR) { + rc = dt_declare_ref_add(env, dto, th); + if (rc < 0) + GOTO(trans_stop, rc); + + rc = dt_declare_ref_add(env, parent, th); + if (rc < 0) + GOTO(trans_stop, rc); + } + + rec->rec_fid = fid; + rec->rec_type = attr->la_mode & S_IFMT; + rc = dt_declare_insert(env, parent, (const struct dt_rec *)rec, + (const struct dt_key *)name, th); + if (rc) + GOTO(trans_stop, rc); + + if (dti->dti_dof.dof_type == DFT_DIR) { + if (!dt_try_as_dir(env, dto)) + GOTO(trans_stop, rc = -ENOTDIR); + + rec->rec_type = S_IFDIR; + rec->rec_fid = fid; + rc = dt_declare_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)".", th); + if (rc != 0) + GOTO(trans_stop, rc); + + rec->rec_fid = lu_object_fid(&parent->do_lu); + rc = dt_declare_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)"..", th); + if (rc != 0) + GOTO(trans_stop, rc); + + rc = dt_declare_ref_add(env, dto, th); + if (rc != 0) + GOTO(trans_stop, rc); + } + + rc = dt_trans_start_local(env, ls->ls_osd, th); + if (rc) + GOTO(trans_stop, rc); + + dt_write_lock(env, dto, DT_SRC_CHILD); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); + + CDEBUG(D_OTHER, "create new object "DFID"\n", + PFID(lu_object_fid(&dto->do_lu))); + rc = local_object_create(env, los, dto, attr, dof, th); + if (rc) + GOTO(unlock, rc); + LASSERT(dt_object_exists(dto)); + + if (dti->dti_dof.dof_type == DFT_DIR) { + + rec->rec_type = S_IFDIR; + rec->rec_fid = fid; + /* Add "." and ".." 
for newly created dir */ + rc = dt_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)".", th); + if (rc != 0) + GOTO(destroy, rc); + + dt_ref_add(env, dto, th); + rec->rec_fid = lu_object_fid(&parent->do_lu); + rc = dt_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)"..", th); + if (rc != 0) + GOTO(destroy, rc); + } + + rec->rec_fid = fid; + rec->rec_type = dto->do_lu.lo_header->loh_attr; + dt_write_lock(env, parent, DT_SRC_PARENT); + rc = dt_insert(env, parent, (const struct dt_rec *)rec, + (const struct dt_key *)name, th); + if (dti->dti_dof.dof_type == DFT_DIR) + dt_ref_add(env, parent, th); + dt_write_unlock(env, parent); + if (rc) + GOTO(destroy, rc); +destroy: + if (rc) + dt_destroy(env, dto, th); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, ls->ls_osd, th); +out: + if (rc) { + dt_object_put_nocache(env, dto); + dto = ERR_PTR(rc); + } + RETURN(dto); +} + +struct dt_object *local_file_find(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (!rc) + dto = ls_locate(env, dt2ls_dev(los->los_dev), + &dti->dti_fid, NULL); + else + dto = ERR_PTR(rc); + + return dto; +} +EXPORT_SYMBOL(local_file_find); + +/* + * Look up and create (if it does not exist) a local named file or directory in + * parent directory. + */ +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + dto = local_file_find(env, los, parent, name); + if (!IS_ERR(dto) || PTR_ERR(dto) != -ENOENT) + return dto; + + rc = local_object_fid_generate(env, los, &dti->dti_fid); + if (rc) + return ERR_PTR(rc); + + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT); + dto = __local_file_create(env, &dti->dti_fid, los, + dt2ls_dev(los->los_dev), parent, name, + &dti->dti_attr, &dti->dti_dof); + return dto; +} +EXPORT_SYMBOL(local_file_find_or_create); + +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + dto = dt_locate(env, dt, &dti->dti_fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + struct ls_device *ls; + + ls = ls_device_get(dt); + if (IS_ERR(ls)) { + dto = ERR_CAST(ls); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT); + dto = __local_file_create(env, fid, NULL, ls, parent, + name, &dti->dti_attr, + &dti->dti_dof); + /* ls_device_put() will finalize the ls device, we + * have to open the object in other device stack */ + if (!IS_ERR(dto)) { + dti->dti_fid = dto->do_lu.lo_header->loh_fid; + dt_object_put_nocache(env, dto); + dto = dt_locate(env, dt, &dti->dti_fid); + } + ls_device_put(env, ls); + } + } + return dto; +} +EXPORT_SYMBOL(local_file_find_or_create_with_fid); + +/* + * Look up and create (if it does 
not exist) a local named index file in parent + * directory. + */ +struct dt_object *local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + /* name is found, get the object */ + dto = ls_locate(env, dt2ls_dev(los->los_dev), + &dti->dti_fid, NULL); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + rc = local_object_fid_generate(env, los, &dti->dti_fid); + if (rc < 0) { + dto = ERR_PTR(rc); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = DFT_INDEX; + dti->dti_dof.u.dof_idx.di_feat = ft; + dto = __local_file_create(env, &dti->dti_fid, los, + dt2ls_dev(los->los_dev), + parent, name, &dti->dti_attr, + &dti->dti_dof); + } + } + return dto; + +} +EXPORT_SYMBOL(local_index_find_or_create); + +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + /* name is found, get the object */ + if (!lu_fid_eq(fid, &dti->dti_fid)) + dto = ERR_PTR(-EINVAL); + else + dto = dt_locate(env, dt, fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + struct ls_device *ls; + + ls = ls_device_get(dt); + if (IS_ERR(ls)) { + dto = ERR_CAST(ls); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = DFT_INDEX; + dti->dti_dof.u.dof_idx.di_feat = ft; + dto = __local_file_create(env, fid, NULL, ls, parent, + name, &dti->dti_attr, + &dti->dti_dof); + /* ls_device_put() will finalize the ls device, we + * have to open the object in other device stack */ + if (!IS_ERR(dto)) { + dti->dti_fid = dto->do_lu.lo_header->loh_fid; + dt_object_put_nocache(env, dto); + dto = dt_locate(env, dt, &dti->dti_fid); + } + ls_device_put(env, ls); + } + } + return dto; +} +EXPORT_SYMBOL(local_index_find_or_create_with_fid); + +static int local_object_declare_unlink(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + struct dt_object *c, const char *name, + struct thandle *th) +{ + int rc; + + rc = dt_declare_delete(env, p, (const struct dt_key *)name, th); + if (rc < 0) + return rc; + + if (S_ISDIR(p->do_lu.lo_header->loh_attr)) { + rc = dt_declare_ref_del(env, p, th); + if (rc < 0) + return rc; + } + + rc = dt_declare_ref_del(env, c, th); + if (rc < 0) + return rc; + + return dt_declare_destroy(env, c, th); +} + +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + struct thandle *th; + int rc; + + ENTRY; + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == -ENOENT) + RETURN(0); + else if (rc < 0) + RETURN(rc); + + dto = dt_locate(env, dt, &dti->dti_fid); + if (unlikely(IS_ERR(dto))) + RETURN(PTR_ERR(dto)); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = 
local_object_declare_unlink(env, dt, parent, dto, name, th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc < 0) + GOTO(stop, rc); + + if (S_ISDIR(dto->do_lu.lo_header->loh_attr)) { + dt_write_lock(env, parent, 0); + rc = dt_ref_del(env, parent, th); + dt_write_unlock(env, parent); + if (rc) + GOTO(stop, rc); + } + + dt_write_lock(env, dto, 0); + rc = dt_delete(env, parent, (struct dt_key *)name, th); + if (rc < 0) + GOTO(unlock, rc); + + rc = dt_ref_del(env, dto, th); + if (rc < 0) { + struct dt_insert_rec *rec = &dti->dti_dt_rec; + + rec->rec_fid = &dti->dti_fid; + rec->rec_type = dto->do_lu.lo_header->loh_attr; + rc = dt_insert(env, parent, (const struct dt_rec *)rec, + (const struct dt_key *)name, th); + GOTO(unlock, rc); + } + + rc = dt_destroy(env, dto, th); +unlock: + dt_write_unlock(env, dto); +stop: + dt_trans_stop(env, dt, th); +out: + dt_object_put_nocache(env, dto); + return rc; +} +EXPORT_SYMBOL(local_object_unlink); + +struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq) +{ + struct local_oid_storage *los, *ret = NULL; + + list_for_each_entry(los, &ls->ls_los_list, los_list) { + if (los->los_seq == seq) { + atomic_inc(&los->los_refcount); + ret = los; + break; + } + } + return ret; +} + +void dt_los_put(struct local_oid_storage *los) +{ + if (atomic_dec_and_test(&los->los_refcount)) + /* should never happen, only local_oid_storage_fini should + * drop refcount to zero */ + LBUG(); +} + +/* after Lustre 2.3 release there may be old file to store last generated FID + * If such file exists then we have to read its content + */ +static int lastid_compat_check(const struct lu_env *env, struct dt_device *dev, + __u64 lastid_seq, __u32 *first_oid, + struct ls_device *ls) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *root = NULL; + struct los_ondisk losd; + struct dt_object *o = NULL; + int rc = 0; + + rc = dt_root_get(env, dev, &dti->dti_fid); + if (rc) + return rc; + + root = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(root)) + return PTR_ERR(root); + + /* find old last_id file */ + snprintf(dti->dti_buf, sizeof(dti->dti_buf), "seq-%#llx-lastid", + lastid_seq); + rc = dt_lookup_dir(env, root, dti->dti_buf, &dti->dti_fid); + dt_object_put_nocache(env, root); + if (rc == -ENOENT) { + /* old llog lastid accessed by FID only */ + if (lastid_seq != FID_SEQ_LLOG) + return 0; + dti->dti_fid.f_seq = FID_SEQ_LLOG; + dti->dti_fid.f_oid = 1; + dti->dti_fid.f_ver = 0; + o = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(o)) + return PTR_ERR(o); + + if (!dt_object_exists(o)) { + dt_object_put_nocache(env, o); + return 0; + } + CDEBUG(D_INFO, "Found old llog lastid file\n"); + } else if (rc < 0) { + return rc; + } else { + CDEBUG(D_INFO, "Found old lastid file for sequence %#llx\n", + lastid_seq); + o = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(o)) + return PTR_ERR(o); + } + /* let's read seq-NNNNNN-lastid file value */ + LASSERT(dt_object_exists(o)); + dti->dti_off = 0; + dti->dti_lb.lb_buf = &losd; + dti->dti_lb.lb_len = sizeof(losd); + dt_read_lock(env, o, 0); + rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off); + dt_read_unlock(env, o); + if (rc == 0 && le32_to_cpu(losd.lso_magic) != LOS_MAGIC) { + CERROR("%s: wrong content of seq-%#llx-lastid file, magic %x\n", + o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq, + le32_to_cpu(losd.lso_magic)); + rc = -EINVAL; + } else if (rc < 0) { + CERROR("%s: failed to read seq-%#llx-lastid: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, 
lastid_seq, rc); + } + dt_object_put_nocache(env, o); + if (rc == 0) + *first_oid = le32_to_cpu(losd.lso_next_oid); + return rc; +} + +/** + * Initialize local OID storage for the required sequence. + * This may be needed for services that use local files and require + * dynamic OID allocation for them. + * + * For each sequence we have an object with the 'first_fid' identifier + * containing the counter for OIDs of locally created files with that + * sequence. + * + * It is currently used by the llog subsystem and by the MGS for NID tables. + * + * The function takes first_fid to create the counter object. + * All dynamic FIDs will be generated with the same sequence and incrementing + * OIDs. + * + * The returned local_oid_storage is the in-memory representation of the OID + * storage. + */ +int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *first_fid, + struct local_oid_storage **los) +{ + struct dt_thread_info *dti = dt_info(env); + struct ls_device *ls; + u64 lastid; + struct dt_object *o = NULL; + struct thandle *th; + __u32 first_oid = fid_oid(first_fid); + int rc = 0; + + ENTRY; + + ls = ls_device_get(dev); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + *los = dt_los_find(ls, fid_seq(first_fid)); + if (*los != NULL) + GOTO(out, rc = 0); + + /* not found, then create */ + OBD_ALLOC_PTR(*los); + if (*los == NULL) + GOTO(out, rc = -ENOMEM); + + atomic_set(&(*los)->los_refcount, 1); + mutex_init(&(*los)->los_id_lock); + (*los)->los_dev = &ls->ls_top_dev; + atomic_inc(&ls->ls_refcount); + list_add(&(*los)->los_list, &ls->ls_los_list); + + /* Use {seq, 0, 0} to create the LAST_ID file for every + * sequence. OIDs start at LUSTRE_FID_INIT_OID. + */ + dti->dti_fid.f_seq = fid_seq(first_fid); + dti->dti_fid.f_oid = LUSTRE_FID_LASTID_OID; + dti->dti_fid.f_ver = 0; + o = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(o)) + GOTO(out_los, rc = PTR_ERR(o)); + + if (!dt_object_exists(o)) { + rc = lastid_compat_check(env, dev, fid_seq(first_fid), + &first_oid, ls); + if (rc < 0) + GOTO(out_los, rc); + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out_los, rc = PTR_ERR(th)); + + dti->dti_attr.la_valid = LA_MODE | LA_TYPE; + dti->dti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dti->dti_dof.dof_type = dt_mode_to_dft(S_IFREG); + + rc = dt_declare_create(env, o, &dti->dti_attr, NULL, + &dti->dti_dof, th); + if (rc) + GOTO(out_trans, rc); + + lastid = cpu_to_le64(first_oid); + + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + rc = dt_declare_record_write(env, o, &dti->dti_lb, dti->dti_off, + th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(out_trans, rc); + + dt_write_lock(env, o, 0); + if (dt_object_exists(o)) + GOTO(out_lock, rc = 0); + + rc = dt_create(env, o, &dti->dti_attr, NULL, &dti->dti_dof, + th); + if (rc) + GOTO(out_lock, rc); + + rc = dt_record_write(env, o, &dti->dti_lb, &dti->dti_off, th); + if (rc) + GOTO(out_lock, rc); +out_lock: + dt_write_unlock(env, o); +out_trans: + dt_trans_stop(env, dev, th); + } else { + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + dt_read_lock(env, o, 0); + rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off); + dt_read_unlock(env, o); + if (rc == 0 && le64_to_cpu(lastid) > OBIF_MAX_OID) { + CERROR("%s: bad oid %llu is read from LAST_ID\n", + o->do_lu.lo_dev->ld_obd->obd_name, + le64_to_cpu(lastid)); + rc = -EINVAL; + } + } +out_los: + if (rc != 0) { + 
list_del(&(*los)->los_list); + atomic_dec(&ls->ls_refcount); + OBD_FREE_PTR(*los); + *los = NULL; + if (o != NULL && !IS_ERR(o)) + dt_object_put_nocache(env, o); + } else { + (*los)->los_seq = fid_seq(first_fid); + (*los)->los_last_oid = le64_to_cpu(lastid); + (*los)->los_obj = o; + /* Read value should not be less than initial one + * but possible after upgrade from older fs. + * In this case just switch to the first_oid in memory and + * it will be updated on disk with first object generated */ + if ((*los)->los_last_oid < first_oid) + (*los)->los_last_oid = first_oid; + } +out: + mutex_unlock(&ls->ls_los_mutex); + ls_device_put(env, ls); + return rc; +} +EXPORT_SYMBOL(local_oid_storage_init); + +void local_oid_storage_fini(const struct lu_env *env, + struct local_oid_storage *los) +{ + struct ls_device *ls; + + LASSERT(env); + LASSERT(los->los_dev); + ls = dt2ls_dev(los->los_dev); + + /* Take the mutex before decreasing the reference to avoid race + * conditions as described in LU-4721. */ + mutex_lock(&ls->ls_los_mutex); + if (!atomic_dec_and_test(&los->los_refcount)) { + mutex_unlock(&ls->ls_los_mutex); + return; + } + + if (los->los_obj) + dt_object_put_nocache(env, los->los_obj); + list_del(&los->los_list); + OBD_FREE_PTR(los); + mutex_unlock(&ls->ls_los_mutex); + ls_device_put(env, ls); +} +EXPORT_SYMBOL(local_oid_storage_fini); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/local_storage.h b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.h new file mode 100644 index 0000000000000..63af1946e6095 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.h @@ -0,0 +1,94 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * lustre/obdclass/local_storage.c + * + * Local storage for file/objects with fid generation. Works on top of OSD. 
+ * + * Author: Mikhail Pershin + */ +#ifndef __LOCAL_STORAGE_H +#define __LOCAL_STORAGE_H + +#include +#include +#include +#include + +struct ls_device { + struct dt_device ls_top_dev; + /* all initialized ls_devices on this node linked by this */ + struct list_head ls_linkage; + /* how many handle's reference this local storage */ + atomic_t ls_refcount; + /* underlaying OSD device */ + struct dt_device *ls_osd; + /* list of all local OID storages */ + struct list_head ls_los_list; + struct mutex ls_los_mutex; +}; + +static inline struct ls_device *dt2ls_dev(struct dt_device *d) +{ + return container_of_safe(d, struct ls_device, ls_top_dev); +} + +struct ls_object { + struct lu_object_header ls_header; + struct dt_object ls_obj; +}; + +static inline struct ls_object *lu2ls_obj(struct lu_object *o) +{ + return container_of_safe(o, struct ls_object, ls_obj.do_lu); +} + +static inline struct dt_object *ls_locate(const struct lu_env *env, + struct ls_device *ls, + const struct lu_fid *fid, + const struct lu_object_conf *conf) +{ + return dt_locate_at(env, ls->ls_osd, fid, + &ls->ls_top_dev.dd_lu_dev, conf); +} + +struct ls_device *ls_device_get(struct dt_device *dev); +void ls_device_put(const struct lu_env *env, struct ls_device *ls); +struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq); +void dt_los_put(struct local_oid_storage *los); + +/* Lustre 2.3 on-disk structure describing local object OIDs storage + * the structure to be used with any sequence managed by + * local object library. + * Obsoleted since 2.4 but is kept for compatibility reasons, + * see lastid_compat_check() in obdclass/local_storage.c */ +struct los_ondisk { + __u32 lso_magic; + __u32 lso_next_oid; +}; + +#define LOS_MAGIC 0xdecafbee + +#endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_counters.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_counters.c new file mode 100644 index 0000000000000..521e59c16e88b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_counters.c @@ -0,0 +1,136 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lprocfs_counters.c + * + * Lustre lprocfs counter routines + * + * Author: Andreas Dilger + */ +#include +#include +#include + +#ifdef CONFIG_PROC_FS +void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int smp_id; + unsigned long flags = 0; + + if (stats == NULL) + return; + + LASSERTF(0 <= idx && idx < stats->ls_num, + "idx %d, ls_num %hu\n", idx, stats->ls_num); + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. */ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); + if (smp_id < 0) + return; + + header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); + percpu_cntr->lc_count++; + + if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + /* + * lprocfs_counter_add() can be called in interrupt context, + * as memory allocation could trigger memory shrinker call + * ldlm_pool_shrink(), which calls lprocfs_counter_add(). + * LU-1727. + * + * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE + * flag, because it needs accurate counting lest memory leak + * check reports error. + */ + if (in_interrupt() && + (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq += amount; + else + percpu_cntr->lc_sum += amount; + + if (header->lc_config & LPROCFS_CNTR_STDDEV) + percpu_cntr->lc_sumsquare += (__s64)amount * amount; + if (amount < percpu_cntr->lc_min) + percpu_cntr->lc_min = amount; + if (amount > percpu_cntr->lc_max) + percpu_cntr->lc_max = amount; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_add); + +void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, long amount) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int smp_id; + unsigned long flags = 0; + + if (stats == NULL) + return; + + LASSERTF(0 <= idx && idx < stats->ls_num, + "idx %d, ls_num %hu\n", idx, stats->ls_num); + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. */ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); + if (smp_id < 0) + return; + + header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); + if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + /* + * Sometimes we use RCU callbacks to free memory which calls + * lprocfs_counter_sub(), and RCU callbacks may execute in + * softirq context - right now that's the only case we're in + * softirq context here, use separate counter for that. + * bz20650. + * + * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE + * flag, because it needs accurate counting lest memory leak + * check reports error. 
+ */ + if (in_interrupt() && + (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq -= amount; + else + percpu_cntr->lc_sum -= amount; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_sub); +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c new file mode 100644 index 0000000000000..4202f44459316 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c @@ -0,0 +1,691 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2016, Intel Corporation. + * Use is subject to license terms. + * + * Author: Niu Yawei + */ +/* + * lustre/obdclass/lprocfs_jobstats.c + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#ifdef CONFIG_PROC_FS + +/* + * JobID formats & JobID environment variable names for supported + * job schedulers: + * + * SLURM: + * JobID format: 32 bit integer. + * JobID env var: SLURM_JOB_ID. + * SGE: + * JobID format: Decimal integer range to 99999. + * JobID env var: JOB_ID. + * LSF: + * JobID format: 6 digit integer by default (up to 999999), can be + * increased to 10 digit (up to 2147483646). + * JobID env var: LSB_JOBID. + * Loadleveler: + * JobID format: String of machine_name.cluster_id.process_id, for + * example: fr2n02.32.0 + * JobID env var: LOADL_STEP_ID. + * PBS: + * JobID format: String of sequence_number[.server_name][@server]. + * JobID env var: PBS_JOBID. + * Maui/MOAB: + * JobID format: Same as PBS. + * JobID env var: Same as PBS. 
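+ *
+ * Note: whichever scheduler supplies the jobid, the string handled here
+ * is limited to LUSTRE_JOBID_SIZE bytes including the terminating NUL;
+ * longer strings are rejected by lprocfs_job_stats_log() with -EINVAL.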
+ */ + +struct job_stat { + struct hlist_node js_hash; /* hash struct for this jobid */ + struct list_head js_list; /* on ojs_list, with ojs_lock */ + atomic_t js_refcount; /* num users of this struct */ + char js_jobid[LUSTRE_JOBID_SIZE]; /* job name + NUL*/ + ktime_t js_time_init; /* time of initial stat*/ + ktime_t js_time_latest; /* time of most recent stat*/ + struct lprocfs_stats *js_stats; /* per-job statistics */ + struct obd_job_stats *js_jobstats; /* for accessing ojs_lock */ +}; + +static unsigned +job_stat_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, strlen(key), mask); +} + +static void *job_stat_key(struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + return job->js_jobid; +} + +static int job_stat_keycmp(const void *key, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + return (strlen(job->js_jobid) == strlen(key)) && + !strncmp(job->js_jobid, key, strlen(key)); +} + +static void *job_stat_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct job_stat, js_hash); +} + +static void job_stat_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + atomic_inc(&job->js_refcount); +} + +static void job_free(struct job_stat *job) +{ + LASSERT(atomic_read(&job->js_refcount) == 0); + LASSERT(job->js_jobstats != NULL); + + write_lock(&job->js_jobstats->ojs_lock); + list_del_init(&job->js_list); + write_unlock(&job->js_jobstats->ojs_lock); + + lprocfs_free_stats(&job->js_stats); + OBD_FREE_PTR(job); +} + +static void job_putref(struct job_stat *job) +{ + LASSERT(atomic_read(&job->js_refcount) > 0); + if (atomic_dec_and_test(&job->js_refcount)) + job_free(job); +} + +static void job_stat_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + job_putref(job); +} + +static void job_stat_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + CERROR("should not have any items\n"); +} + +static struct cfs_hash_ops job_stats_hash_ops = { + .hs_hash = job_stat_hash, + .hs_key = job_stat_key, + .hs_keycmp = job_stat_keycmp, + .hs_object = job_stat_object, + .hs_get = job_stat_get, + .hs_put_locked = job_stat_put_locked, + .hs_exit = job_stat_exit, +}; + +/** + * Jobstats expiry iterator to clean up old jobids + * + * Called for each job_stat structure on this device, it should delete stats + * older than the specified \a oldest_time in seconds. If \a oldest_time is + * in the future then this will delete all statistics (e.g. during shutdown). + * + * \param[in] hs hash of all jobids on this device + * \param[in] bd hash bucket containing this jobid + * \param[in] hnode hash structure for this jobid + * \param[in] data pointer to stats expiry time in seconds + */ +static int job_cleanup_iter_callback(struct cfs_hash *hs, + struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + ktime_t oldest_time = *((ktime_t *)data); + struct job_stat *job; + + job = hlist_entry(hnode, struct job_stat, js_hash); + if (ktime_before(job->js_time_latest, oldest_time)) + cfs_hash_bd_del_locked(hs, bd, hnode); + + return 0; +} + +/** + * Clean up jobstats that were updated more than \a before seconds ago. + * + * Since this function may be called frequently, do not scan all of the + * jobstats on each call, only twice per cleanup interval. 
That means stats + * may be on average around cleanup_interval / 4 older than the cleanup + * interval, but that is not considered harmful. + * + * The value stored in ojs_cleanup_interval is how often to perform a cleanup + * scan, and 1/2 of the maximum age of the individual statistics. This is + * done rather than dividing the interval by two each time, because it is + * much easier to do the division when the value is initially set (in seconds) + * rather than after it has been converted to ktime_t, and maybe a bit faster. + * + * If \a clear is true then this will force clean up all jobstats + * (e.g. at shutdown). + * + * If there is already another thread doing jobstats cleanup, don't try to + * do this again in the current thread unless this is a force cleanup. + * + * \param[in] stats stucture tracking all job stats for this device + * \param[in] clear clear all job stats if true + */ +static void lprocfs_job_cleanup(struct obd_job_stats *stats, bool clear) +{ + ktime_t cleanup_interval = stats->ojs_cleanup_interval; + ktime_t now = ktime_get_real(); + ktime_t oldest; + + if (likely(!clear)) { + /* ojs_cleanup_interval of zero means never clean up stats */ + if (ktime_to_ns(cleanup_interval) == 0) + return; + + if (ktime_before(now, ktime_add(stats->ojs_cleanup_last, + cleanup_interval))) + return; + + if (stats->ojs_cleaning) + return; + } + + write_lock(&stats->ojs_lock); + if (!clear && stats->ojs_cleaning) { + write_unlock(&stats->ojs_lock); + return; + } + + stats->ojs_cleaning = true; + write_unlock(&stats->ojs_lock); + + /* Can't hold ojs_lock over hash iteration, since it is grabbed by + * job_cleanup_iter_callback() + * ->cfs_hash_bd_del_locked() + * ->job_putref() + * ->job_free() + * + * Holding ojs_lock isn't necessary for safety of the hash iteration, + * since locking of the hash is handled internally, but there isn't + * any benefit to having multiple threads doing cleanup at one time. + * + * Subtract twice the cleanup_interval, since it is 1/2 the maximum age. 
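+ *
+ * For example, with the default set in lprocfs_job_stats_init() (600 / 2
+ * seconds stored, i.e. a 10 minute user-visible interval), a cleanup pass
+ * runs no more than once every 300 seconds and removes entries whose
+ * js_time_latest is more than 600 seconds in the past.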
+ */ + oldest = ktime_sub(now, ktime_add(cleanup_interval, cleanup_interval)); + cfs_hash_for_each_safe(stats->ojs_hash, job_cleanup_iter_callback, + &oldest); + + write_lock(&stats->ojs_lock); + stats->ojs_cleaning = false; + stats->ojs_cleanup_last = ktime_get_real(); + write_unlock(&stats->ojs_lock); +} + +static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs) +{ + struct job_stat *job; + + OBD_ALLOC_PTR(job); + if (job == NULL) + return NULL; + + job->js_stats = lprocfs_alloc_stats(jobs->ojs_cntr_num, 0); + if (job->js_stats == NULL) { + OBD_FREE_PTR(job); + return NULL; + } + + jobs->ojs_cntr_init_fn(job->js_stats, 0); + + memcpy(job->js_jobid, jobid, sizeof(job->js_jobid)); + job->js_time_latest = job->js_stats->ls_init; + job->js_jobstats = jobs; + INIT_HLIST_NODE(&job->js_hash); + INIT_LIST_HEAD(&job->js_list); + atomic_set(&job->js_refcount, 1); + + return job; +} + +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, + int event, long amount) +{ + struct obd_job_stats *stats = &obd->u.obt.obt_jobstats; + struct job_stat *job, *job2; + ENTRY; + + LASSERT(stats != NULL); + LASSERT(stats->ojs_hash != NULL); + + if (event >= stats->ojs_cntr_num) + RETURN(-EINVAL); + + if (jobid == NULL || strlen(jobid) == 0) + RETURN(0); + + /* unterminated jobid should be handled in lustre_msg_get_jobid() */ + if (strlen(jobid) >= LUSTRE_JOBID_SIZE) { + CERROR("%s: invalid jobid size %lu, expect %d\n", obd->obd_name, + (unsigned long)strlen(jobid) + 1, LUSTRE_JOBID_SIZE); + RETURN(-EINVAL); + } + + job = cfs_hash_lookup(stats->ojs_hash, jobid); + if (job) + goto found; + + lprocfs_job_cleanup(stats, false); + + job = job_alloc(jobid, stats); + if (job == NULL) + RETURN(-ENOMEM); + + job2 = cfs_hash_findadd_unique(stats->ojs_hash, job->js_jobid, + &job->js_hash); + if (job2 != job) { + job_putref(job); + job = job2; + /* We cannot LASSERT(!list_empty(&job->js_list)) here, + * since we just lost the race for inserting "job" into the + * ojs_list, and some other thread is doing it _right_now_. + * Instead, be content the other thread is doing this, since + * "job2" was initialized in job_alloc() already. 
LU-2163 */ + } else { + LASSERT(list_empty(&job->js_list)); + write_lock(&stats->ojs_lock); + list_add_tail(&job->js_list, &stats->ojs_list); + write_unlock(&stats->ojs_lock); + } + +found: + LASSERT(stats == job->js_jobstats); + job->js_time_latest = ktime_get_real(); + lprocfs_counter_add(job->js_stats, event, amount); + + job_putref(job); + + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_job_stats_log); + +void lprocfs_job_stats_fini(struct obd_device *obd) +{ + struct obd_job_stats *stats = &obd->u.obt.obt_jobstats; + + if (stats->ojs_hash == NULL) + return; + + lprocfs_job_cleanup(stats, true); + cfs_hash_putref(stats->ojs_hash); + stats->ojs_hash = NULL; + LASSERT(list_empty(&stats->ojs_list)); +} +EXPORT_SYMBOL(lprocfs_job_stats_fini); + +static void *lprocfs_jobstats_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_job_stats *stats = p->private; + loff_t off = *pos; + struct job_stat *job; + + read_lock(&stats->ojs_lock); + if (off == 0) + return SEQ_START_TOKEN; + off--; + list_for_each_entry(job, &stats->ojs_list, js_list) { + if (!off--) + return job; + } + return NULL; +} + +static void lprocfs_jobstats_seq_stop(struct seq_file *p, void *v) +{ + struct obd_job_stats *stats = p->private; + + read_unlock(&stats->ojs_lock); +} + +static void *lprocfs_jobstats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_job_stats *stats = p->private; + struct job_stat *job; + struct list_head *next; + + ++*pos; + if (v == SEQ_START_TOKEN) { + next = stats->ojs_list.next; + } else { + job = (struct job_stat *)v; + next = job->js_list.next; + } + + return next == &stats->ojs_list ? NULL : + list_entry(next, struct job_stat, js_list); +} + +/* + * Example of output on MDT: + * + * job_stats: + * - job_id: dd.4854 + * snapshot_time: 1322494486.123456789 + * start_time: 1322494476.012345678 + * elapsed_time: 10.111111111 + * open: { samples: 1, unit: reqs } + * close: { samples: 1, unit: reqs } + * mknod: { samples: 0, unit: reqs } + * link: { samples: 0, unit: reqs } + * unlink: { samples: 0, unit: reqs } + * mkdir: { samples: 0, unit: reqs } + * rmdir: { samples: 0, unit: reqs } + * rename: { samples: 0, unit: reqs } + * getattr: { samples: 1, unit: reqs } + * setattr: { samples: 0, unit: reqs } + * getxattr: { samples: 0, unit: reqs } + * setxattr: { samples: 0, unit: reqs } + * statfs: { samples: 0, unit: reqs } + * sync: { samples: 0, unit: reqs } + * + * Example of output on OST: + * + * job_stats: + * - job_id dd.4854 + * snapshot_time: 1322494602.123456789 + * start_time: 1322494592.987654321 + * elapsed_time: 9.135802468 + * read: { samples: 0, unit: bytes, min: 0, max: 0, sum: 0 } + * write: { samples: 1, unit: bytes, min: 4096, max: 4096, sum: 4096 } + * setattr: { samples: 0, unit: reqs } + * punch: { samples: 0, unit: reqs } + * sync: { samples: 0, unit: reqs } + */ + +static const char spaces[] = " "; + +static int inline width(const char *str, int len) +{ + return len - min((int)strlen(str), 15); +} + +static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v) +{ + struct job_stat *job = v; + struct lprocfs_stats *s; + struct lprocfs_counter ret; + struct lprocfs_counter_header *cntr_header; + char escaped[LUSTRE_JOBID_SIZE * 4] = ""; + char *quote = "", *c, *end; + int i, joblen = 0; + + if (v == SEQ_START_TOKEN) { + seq_printf(p, "job_stats:\n"); + return 0; + } + + /* Quote and escape jobid characters to escape hex codes "\xHH" if + * it contains any non-standard characters (space, newline, etc), + * so it will be confined to single line and not break 
parsing. + */ + for (c = job->js_jobid, end = job->js_jobid + sizeof(job->js_jobid); + c < end && *c != '\0'; + c++, joblen++) { + if (!isalnum(*c) && + *c != '.' && *c != '@' && *c != '-' && *c != '_') { + quote = "\""; + snprintf(escaped + joblen, sizeof(escaped), "\\x%02X", + (unsigned char)*c); + joblen += 3; + } else { + escaped[joblen] = *c; + } + } + + seq_printf(p, "- %-16s %s%*s%s\n", + "job_id:", quote, joblen, escaped, quote); + lprocfs_stats_header(p, job->js_time_latest, job->js_stats->ls_init, + 16, ":", true, " "); + + s = job->js_stats; + for (i = 0; i < s->ls_num; i++) { + cntr_header = &s->ls_cnt_header[i]; + lprocfs_stats_collect(s, i, &ret); + + seq_printf(p, " %s:%.*s { samples: %11llu", + cntr_header->lc_name, + width(cntr_header->lc_name, 15), spaces, + ret.lc_count); + if (cntr_header->lc_units[0] != '\0') + seq_printf(p, ", unit: %5s", cntr_header->lc_units); + + if (cntr_header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + seq_printf(p, ", min: %8llu, max: %8llu, sum: %16llu", + ret.lc_count ? ret.lc_min : 0, + ret.lc_count ? ret.lc_max : 0, + ret.lc_count ? ret.lc_sum : 0); + } + if (cntr_header->lc_config & LPROCFS_CNTR_STDDEV) { + seq_printf(p, ", sumsq: %18llu", + ret.lc_count ? ret.lc_sumsquare : 0); + } + + seq_printf(p, " }\n"); + + } + + return 0; +} + +static const struct seq_operations lprocfs_jobstats_seq_sops = { + .start = lprocfs_jobstats_seq_start, + .stop = lprocfs_jobstats_seq_stop, + .next = lprocfs_jobstats_seq_next, + .show = lprocfs_jobstats_seq_show, +}; + +static int lprocfs_jobstats_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lprocfs_jobstats_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = pde_data(inode); + return 0; +} + +static ssize_t lprocfs_jobstats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_job_stats *stats = seq->private; + char jobid[LUSTRE_JOBID_SIZE]; + struct job_stat *job; + + if (len == 0 || len >= LUSTRE_JOBID_SIZE) + return -EINVAL; + + if (stats->ojs_hash == NULL) + return -ENODEV; + + if (copy_from_user(jobid, buf, len)) + return -EFAULT; + jobid[len] = 0; + + /* Trim '\n' if any */ + if (jobid[len - 1] == '\n') + jobid[len - 1] = 0; + + if (strcmp(jobid, "clear") == 0) { + lprocfs_job_cleanup(stats, true); + + return len; + } + + if (strlen(jobid) == 0) + return -EINVAL; + + job = cfs_hash_lookup(stats->ojs_hash, jobid); + if (!job) + return -EINVAL; + + cfs_hash_del_key(stats->ojs_hash, jobid); + + job_putref(job); + return len; +} + +/** + * Clean up the seq file state when the /proc file is closed. + * + * This also expires old job stats from the cache after they have been + * printed in case the system is idle and not generating new jobstats. 
+ * + * \param[in] inode struct inode for seq file being closed + * \param[in] file struct file for seq file being closed + * + * \retval 0 on success + * \retval negative errno on failure + */ +static int lprocfs_jobstats_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct obd_job_stats *stats = seq->private; + + lprocfs_job_cleanup(stats, false); + + return lprocfs_seq_release(inode, file); +} + +static const struct proc_ops lprocfs_jobstats_seq_fops = { + PROC_OWNER(THIS_MODULE) + .proc_open = lprocfs_jobstats_seq_open, + .proc_read = seq_read, + .proc_write = lprocfs_jobstats_seq_write, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_jobstats_seq_release, +}; + +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback init_fn) +{ + struct proc_dir_entry *entry; + struct obd_job_stats *stats; + ENTRY; + + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_type->typ_name); + + if (cntr_num <= 0) + RETURN(-EINVAL); + + if (init_fn == NULL) + RETURN(-EINVAL); + + /* Currently needs to be a target due to the use of obt_jobstats. */ + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) != 0 && + strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) != 0) { + CERROR("%s: invalid device type %s for job stats: rc = %d\n", + obd->obd_name, obd->obd_type->typ_name, -EINVAL); + RETURN(-EINVAL); + } + stats = &obd->u.obt.obt_jobstats; + + LASSERT(stats->ojs_hash == NULL); + stats->ojs_hash = cfs_hash_create("JOB_STATS", + HASH_JOB_STATS_CUR_BITS, + HASH_JOB_STATS_MAX_BITS, + HASH_JOB_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &job_stats_hash_ops, + CFS_HASH_DEFAULT); + if (stats->ojs_hash == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&stats->ojs_list); + rwlock_init(&stats->ojs_lock); + stats->ojs_cntr_num = cntr_num; + stats->ojs_cntr_init_fn = init_fn; + /* Store 1/2 the actual interval, since we use that the most, and + * it is easier to work with. 
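+	 * The job_cleanup_interval sysfs file compensates for the halving:
+	 * job_cleanup_interval_show() doubles the stored value and
+	 * job_cleanup_interval_store() halves the written one, so userspace
+	 * always reads and writes the full interval in seconds.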
+ */ + stats->ojs_cleanup_interval = ktime_set(600 / 2, 0); /* default 10 min*/ + stats->ojs_cleanup_last = ktime_get_real(); + + entry = lprocfs_add_simple(obd->obd_proc_entry, "job_stats", stats, + &lprocfs_jobstats_seq_fops); + if (IS_ERR(entry)) { + lprocfs_job_stats_fini(obd); + RETURN(-ENOMEM); + } + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_job_stats_init); +#endif /* CONFIG_PROC_FS*/ + +ssize_t job_cleanup_interval_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_job_stats *stats; + struct timespec64 ts; + + stats = &obd->u.obt.obt_jobstats; + ts = ktime_to_timespec64(stats->ojs_cleanup_interval); + + return scnprintf(buf, PAGE_SIZE, "%lld\n", (long long)ts.tv_sec * 2); +} +EXPORT_SYMBOL(job_cleanup_interval_show); + +ssize_t job_cleanup_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_job_stats *stats; + unsigned int val; + int rc; + + stats = &obd->u.obt.obt_jobstats; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + stats->ojs_cleanup_interval = ktime_set(val / 2, 0); + lprocfs_job_cleanup(stats, false); + + return count; +} +EXPORT_SYMBOL(job_cleanup_interval_store); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c new file mode 100644 index 0000000000000..bbb8e1c569215 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c @@ -0,0 +1,2331 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lprocfs_status.c + * + * Author: Hariharan Thantry + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#ifdef CONFIG_PROC_FS + +static int lprocfs_no_percpu_stats = 0; +module_param(lprocfs_no_percpu_stats, int, 0644); +MODULE_PARM_DESC(lprocfs_no_percpu_stats, "Do not alloc percpu data for lprocfs stats"); + +#define MAX_STRING_SIZE 128 + +int lprocfs_single_release(struct inode *inode, struct file *file) +{ + return single_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_single_release); + +int lprocfs_seq_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_seq_release); + +static umode_t default_mode(const struct proc_ops *ops) +{ + umode_t mode = 0; + + if (ops->proc_read) + mode = 0444; + if (ops->proc_write) + mode |= 0200; + + return mode; +} + +struct proc_dir_entry * +lprocfs_add_simple(struct proc_dir_entry *root, char *name, + void *data, const struct proc_ops *fops) +{ + struct proc_dir_entry *proc; + umode_t mode; + + if (!root || !name || !fops) + return ERR_PTR(-EINVAL); + + mode = default_mode(fops); + proc = proc_create_data(name, mode, root, fops, data); + if (!proc) { + CERROR("LprocFS: No memory to create /proc entry %s\n", + name); + return ERR_PTR(-ENOMEM); + } + return proc; +} +EXPORT_SYMBOL(lprocfs_add_simple); + +struct proc_dir_entry *lprocfs_add_symlink(const char *name, + struct proc_dir_entry *parent, + const char *format, ...) +{ + struct proc_dir_entry *entry; + char *dest; + va_list ap; + + if (!parent || !format) + return NULL; + + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (!dest) + return NULL; + + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); + + entry = proc_symlink(name, parent, dest); + if (!entry) + CERROR("LprocFS: Could not create symbolic link from " + "%s to %s\n", name, dest); + + OBD_FREE(dest, MAX_STRING_SIZE + 1); + return entry; +} +EXPORT_SYMBOL(lprocfs_add_symlink); + +static const struct file_operations ldebugfs_empty_ops = { }; + +void ldebugfs_add_vars(struct dentry *parent, struct ldebugfs_vars *list, + void *data) +{ + if (IS_ERR_OR_NULL(parent) || IS_ERR_OR_NULL(list)) + return; + + while (list->name) { + umode_t mode = 0; + + if (list->proc_mode != 0000) { + mode = list->proc_mode; + } else if (list->fops) { + if (list->fops->read) + mode = 0444; + if (list->fops->write) + mode |= 0200; + } + debugfs_create_file(list->name, mode, parent, + list->data ? : data, + list->fops ? : &ldebugfs_empty_ops); + list++; + } +} +EXPORT_SYMBOL_GPL(ldebugfs_add_vars); + +static const struct proc_ops lprocfs_empty_ops = { }; + +/** + * Add /proc entries. + * + * \param root [in] The parent proc entry on which new entry will be added. + * \param list [in] Array of proc entries to be added. + * \param data [in] The argument to be passed when entries read/write routines + * are called through /proc file. 
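+ *
+ * Minimal illustrative sketch (the table and fops names below are made
+ * up for the example, they are not part of this file); the NULL name
+ * ends the table since lprocfs_add_vars() stops on the first entry with
+ * no name:
+ *
+ *   static struct lprocfs_vars my_obd_vars[] = {
+ *           { .name = "uuid", .fops = &my_uuid_fops },
+ *           { .name = NULL }
+ *   };
+ *   rc = lprocfs_add_vars(obd->obd_proc_entry, my_obd_vars, obd);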
+ * + * \retval 0 on success + * < 0 on error + */ +int +lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, + void *data) +{ + if (!root || !list) + return -EINVAL; + + while (list->name) { + struct proc_dir_entry *proc; + umode_t mode = 0; + + if (list->proc_mode) + mode = list->proc_mode; + else if (list->fops) + mode = default_mode(list->fops); + proc = proc_create_data(list->name, mode, root, + list->fops ?: &lprocfs_empty_ops, + list->data ?: data); + if (!proc) + return -ENOMEM; + list++; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_add_vars); + +void lprocfs_remove(struct proc_dir_entry **rooth) +{ + proc_remove(*rooth); + *rooth = NULL; +} +EXPORT_SYMBOL(lprocfs_remove); + +void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + LASSERT(parent != NULL); + remove_proc_entry(name, parent); +} +EXPORT_SYMBOL(lprocfs_remove_proc_entry); + +struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) +{ + struct proc_dir_entry *newchild; + + newchild = proc_mkdir(name, parent); + if (!newchild) + return ERR_PTR(-ENOMEM); + + if (list) { + int rc = lprocfs_add_vars(newchild, list, data); + if (rc) { + lprocfs_remove(&newchild); + return ERR_PTR(rc); + } + } + return newchild; +} +EXPORT_SYMBOL(lprocfs_register); + +/* Generic callbacks */ +int lprocfs_uuid_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + seq_printf(m, "%s\n", obd->obd_uuid.uuid); + return 0; +} +EXPORT_SYMBOL(lprocfs_uuid_seq_show); + +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%s\n", obd->obd_uuid.uuid); +} +LUSTRE_RO_ATTR(uuid); + +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) + return sprintf(buf, "%u\n", osfs.os_bsize); + + return rc; +} +LUSTRE_RO_ATTR(blocksize); + +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) { + u32 blk_size = osfs.os_bsize >> 10; + u64 result = osfs.os_blocks; + + result *= rounddown_pow_of_two(blk_size ?: 1); + return sprintf(buf, "%llu\n", result); + } + + return rc; +} +LUSTRE_RO_ATTR(kbytestotal); + +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) { + u32 blk_size = osfs.os_bsize >> 10; + u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); + } + + return rc; +} +LUSTRE_RO_ATTR(kbytesfree); + +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = 
container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) { + u32 blk_size = osfs.os_bsize >> 10; + u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); + } + + return rc; +} +LUSTRE_RO_ATTR(kbytesavail); + +static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) + return sprintf(buf, "%llu\n", osfs.os_files); + + return rc; +} +LUSTRE_RO_ATTR(filestotal); + +static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) + return sprintf(buf, "%llu\n", osfs.os_ffree); + + return rc; +} +LUSTRE_RO_ATTR(filesfree); + +ssize_t conn_uuid_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + struct ptlrpc_connection *conn; + ssize_t count; + + with_imp_locked(obd, imp, count) { + conn = imp->imp_connection; + if (conn) + count = sprintf(buf, "%s\n", conn->c_remote_uuid.uuid); + else + count = sprintf(buf, "%s\n", ""); + } + + return count; +} +EXPORT_SYMBOL(conn_uuid_show); + +int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp; + const char *imp_state_name = NULL; + int rc = 0; + + LASSERT(obd != NULL); + with_imp_locked(obd, imp, rc) { + imp_state_name = ptlrpc_import_state_name(imp->imp_state); + seq_printf(m, "%s\t%s%s\n", obd2cli_tgt(obd), imp_state_name, + imp->imp_deactive ? "\tDEACTIVATED" : ""); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_server_uuid_seq_show); + +/** add up per-cpu counters */ + +/** + * Lock statistics structure for access, possibly only on this CPU. + * + * The statistics struct may be allocated with per-CPU structures for + * efficient concurrent update (usually only on server-wide stats), or + * as a single global struct (e.g. for per-client or per-job statistics), + * so the required locking depends on the type of structure allocated. + * + * For per-CPU statistics, pin the thread to the current cpuid so that + * will only access the statistics for that CPU. If the stats structure + * for the current CPU has not been allocated (or previously freed), + * allocate it now. The per-CPU statistics do not need locking since + * the thread is pinned to the CPU during update. + * + * For global statistics, lock the stats structure to prevent concurrent update. 
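+ *
+ * Typical update-side usage, as in lprocfs_counter_add():
+ *
+ *   smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags);
+ *   if (smp_id < 0)
+ *           return;
+ *   percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx);
+ *   ... update the counter ...
+ *   lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags);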
+ * + * \param[in] stats statistics structure to lock + * \param[in] opc type of operation: + * LPROCFS_GET_SMP_ID: "lock" and return current CPU index + * for incrementing statistics for that CPU + * LPROCFS_GET_NUM_CPU: "lock" and return number of used + * CPU indices to iterate over all indices + * \param[out] flags CPU interrupt saved state for IRQ-safe locking + * + * \retval cpuid of current thread or number of allocated structs + * \retval negative on error (only for opc LPROCFS_GET_SMP_ID + per-CPU stats) + */ +int lprocfs_stats_lock(struct lprocfs_stats *stats, + enum lprocfs_stats_lock_ops opc, + unsigned long *flags) +{ + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, *flags); + else + spin_lock(&stats->ls_lock); + return opc == LPROCFS_GET_NUM_CPU ? 1 : 0; + } + + switch (opc) { + case LPROCFS_GET_SMP_ID: { + unsigned int cpuid = get_cpu(); + + if (unlikely(!stats->ls_percpu[cpuid])) { + int rc = lprocfs_stats_alloc_one(stats, cpuid); + + if (rc < 0) { + put_cpu(); + return rc; + } + } + return cpuid; + } + case LPROCFS_GET_NUM_CPU: + return stats->ls_biggest_alloc_num; + default: + LBUG(); + } +} + +/** + * Unlock statistics structure after access. + * + * Unlock the lock acquired via lprocfs_stats_lock() for global statistics, + * or unpin this thread from the current cpuid for per-CPU statistics. + * + * This function must be called using the same arguments as used when calling + * lprocfs_stats_lock() so that the correct operation can be performed. + * + * \param[in] stats statistics structure to unlock + * \param[in] opc type of operation (current cpuid or number of structs) + * \param[in] flags CPU interrupt saved state for IRQ-safe locking + */ +void lprocfs_stats_unlock(struct lprocfs_stats *stats, + enum lprocfs_stats_lock_ops opc, + unsigned long *flags) +{ + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_unlock_irqrestore(&stats->ls_lock, *flags); + else + spin_unlock(&stats->ls_lock); + } else if (opc == LPROCFS_GET_SMP_ID) { + put_cpu(); + } +} + +/** add up per-cpu counters */ +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ + unsigned int num_entry; + struct lprocfs_counter *percpu_cntr; + int i; + unsigned long flags = 0; + + memset(cnt, 0, sizeof(*cnt)); + + if (!stats) { + /* set count to 1 to avoid divide-by-zero errs in callers */ + cnt->lc_count = 1; + return; + } + + cnt->lc_min = LC_MIN_INIT; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (!stats->ls_percpu[i]) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, idx); + + cnt->lc_count += percpu_cntr->lc_count; + cnt->lc_sum += percpu_cntr->lc_sum; + if (percpu_cntr->lc_min < cnt->lc_min) + cnt->lc_min = percpu_cntr->lc_min; + if (percpu_cntr->lc_max > cnt->lc_max) + cnt->lc_max = percpu_cntr->lc_max; + cnt->lc_sumsquare += percpu_cntr->lc_sumsquare; + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} + +static void obd_import_flags2str(struct obd_import *imp, struct seq_file *m) +{ + bool first = true; + + if (imp->imp_obd->obd_no_recov) { + seq_printf(m, "no_recov"); + first = false; + } + + flag2str(imp, invalid); + flag2str(imp, deactive); + flag2str(imp, replayable); + flag2str(imp, delayed_recovery); + flag2str(imp, vbr_failed); + flag2str(imp, pingable); + flag2str(imp, resend_replay); + 
flag2str(imp, no_pinger_recover); + flag2str(imp, connect_tried); +} + +static const char *const obd_connect_names[] = { + /* flags names */ + "read_only", + "lov_index", + "connect_from_mds", + "write_grant", + "server_lock", + "version", + "request_portal", + "acl", + "xattr", + "create_on_write", + "truncate_lock", + "initial_transno", + "inode_bit_locks", + "barrier", + "getattr_by_fid", + "no_oh_for_devices", + "remote_client", + "remote_client_by_force", + "max_byte_per_rpc", + "64bit_qdata", + "mds_capability", + "oss_capability", + "early_lock_cancel", + "som", + "adaptive_timeouts", + "lru_resize", + "mds_mds_connection", + "real_conn", + "change_qunit_size", + "alt_checksum_algorithm", + "fid_is_enabled", + "version_recovery", + "pools", + "grant_shrink", + "skip_orphan", + "large_ea", + "full20", + "layout_lock", + "64bithash", + "object_max_bytes", + "imp_recov", + "jobstats", + "umask", + "einprogress", + "grant_param", + "flock_owner", + "lvb_type", + "nanoseconds_times", + "lightweight_conn", + "short_io", + "pingless", + "flock_deadlock", + "disp_stripe", + "open_by_fid", + "lfsck", + "unknown", + "unlink_close", + "multi_mod_rpcs", + "dir_stripe", + "subtree", + "lockahead", + "bulk_mbits", + "compact_obdo", + "second_flags", + /* flags2 names */ + "file_secctx", /* 0x01 */ + "lockaheadv2", /* 0x02 */ + "dir_migrate", /* 0x04 */ + "sum_statfs", /* 0x08 */ + "overstriping", /* 0x10 */ + "flr", /* 0x20 */ + "wbc", /* 0x40 */ + "lock_convert", /* 0x80 */ + "archive_id_array", /* 0x100 */ + "increasing_xid", /* 0x200 */ + "selinux_policy", /* 0x400 */ + "lsom", /* 0x800 */ + "pcc", /* 0x1000 */ + "crush", /* 0x2000 */ + "async_discard", /* 0x4000 */ + "client_encryption", /* 0x8000 */ + "fidmap", /* 0x10000 */ + "getattr_pfid", /* 0x20000 */ + "lseek", /* 0x40000 */ + "dom_lvb", /* 0x80000 */ + "reply_mbits", /* 0x100000 */ + "mode_convert", /* 0x200000 */ + "batch_rpc", /* 0x400000 */ + "pcc_ro", /* 0x800000 */ + "mne_nid_type", /* 0x1000000 */ + "lock_contend", /* 0x2000000 */ + "atomic_open_lock", /* 0x4000000 */ + "name_encryption", /* 0x8000000 */ + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", + "mdll_bypass", /* 0x800000000000000 */ + "mdll", /* 0x1000000000000000 */ + "mdll_auto_refresh", /* 0x2000000000000000 */ + "", "", + NULL +}; + +void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, __u64 flags2, + const char *sep) +{ + bool first = true; + __u64 mask; + int i; + + for (i = 0, mask = 1; i < 64; i++, mask <<= 1) { + if (flags & mask) { + seq_printf(m, "%s%s", + first ? "" : sep, obd_connect_names[i]); + first = false; + } + } + + if (flags & ~(mask - 1)) { + seq_printf(m, "%sunknown_%#llx", + first ? "" : sep, flags & ~(mask - 1)); + first = false; + } + + if (!(flags & OBD_CONNECT_FLAGS2) || flags2 == 0) + return; + + for (i = 64, mask = 1; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags2 & mask) { + seq_printf(m, "%s%s", + first ? "" : sep, obd_connect_names[i]); + first = false; + } + } + + if (flags2 & ~(mask - 1)) { + seq_printf(m, "%sunknown2_%#llx", + first ? "" : sep, flags2 & ~(mask - 1)); + first = false; + } +} +EXPORT_SYMBOL(obd_connect_seq_flags2str); + +int obd_connect_flags2str(char *page, int count, __u64 flags, __u64 flags2, + const char *sep) +{ + __u64 mask; + int i, ret = 0; + + for (i = 0, mask = 1; i < 64; i++, mask <<= 1) { + if (flags & mask) + ret += snprintf(page + ret, count - ret, "%s%s", + ret ? 
sep : "", obd_connect_names[i]); + } + + if (flags & ~(mask - 1)) + ret += snprintf(page + ret, count - ret, + "%sunknown_%#llx", + ret ? sep : "", flags & ~(mask - 1)); + + if (!(flags & OBD_CONNECT_FLAGS2) || flags2 == 0) + return ret; + + for (i = 64, mask = 1; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags2 & mask) + ret += snprintf(page + ret, count - ret, "%s%s", + ret ? sep : "", obd_connect_names[i]); + } + + if (flags2 & ~(mask - 1)) + ret += snprintf(page + ret, count - ret, + "%sunknown2_%#llx", + ret ? sep : "", flags2 & ~(mask - 1)); + + return ret; +} +EXPORT_SYMBOL(obd_connect_flags2str); + +void +obd_connect_data_seqprint(struct seq_file *m, struct obd_connect_data *ocd) +{ + __u64 flags; + + LASSERT(ocd != NULL); + flags = ocd->ocd_connect_flags; + + seq_printf(m, " connect_data:\n" + " flags: %#llx\n" + " instance: %u\n", + ocd->ocd_connect_flags, + ocd->ocd_instance); + if (flags & OBD_CONNECT_VERSION) + seq_printf(m, " target_version: %u.%u.%u.%u\n", + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version)); + if (flags & OBD_CONNECT_MDS) + seq_printf(m, " mdt_index: %d\n", ocd->ocd_group); + if (flags & OBD_CONNECT_GRANT) + seq_printf(m, " initial_grant: %d\n", ocd->ocd_grant); + if (flags & OBD_CONNECT_INDEX) + seq_printf(m, " target_index: %u\n", ocd->ocd_index); + if (flags & OBD_CONNECT_BRW_SIZE) + seq_printf(m, " max_brw_size: %d\n", ocd->ocd_brw_size); + if (flags & OBD_CONNECT_IBITS) + seq_printf(m, " ibits_known: %#llx\n", + ocd->ocd_ibits_known); + if (flags & OBD_CONNECT_GRANT_PARAM) + seq_printf(m, " grant_block_size: %d\n" + " grant_inode_size: %d\n" + " grant_max_extent_size: %d\n" + " grant_extent_tax: %d\n", + 1 << ocd->ocd_grant_blkbits, + 1 << ocd->ocd_grant_inobits, + ocd->ocd_grant_max_blks << ocd->ocd_grant_blkbits, + ocd->ocd_grant_tax_kb << 10); + if (flags & OBD_CONNECT_TRANSNO) + seq_printf(m, " first_transno: %#llx\n", + ocd->ocd_transno); + if (flags & OBD_CONNECT_CKSUM) + seq_printf(m, " cksum_types: %#x\n", + ocd->ocd_cksum_types); + if (flags & OBD_CONNECT_MAX_EASIZE) + seq_printf(m, " max_easize: %d\n", ocd->ocd_max_easize); + if (flags & OBD_CONNECT_MAXBYTES) + seq_printf(m, " max_object_bytes: %llu\n", + ocd->ocd_maxbytes); + if (flags & OBD_CONNECT_MULTIMODRPCS) + seq_printf(m, " max_mod_rpcs: %hu\n", + ocd->ocd_maxmodrpcs); +} + +static void lprocfs_import_seq_show_locked(struct seq_file *m, + struct obd_device *obd, + struct obd_import *imp) +{ + char nidstr[LNET_NIDSTR_SIZE]; + struct lprocfs_counter ret; + struct lprocfs_counter_header *header; + struct obd_import_conn *conn; + struct obd_connect_data *ocd; + int j; + int k; + int rw = 0; + + ocd = &imp->imp_connect_data; + + seq_printf(m, "import:\n" + " name: %s\n" + " target: %s\n" + " state: %s\n" + " connect_flags: [ ", + obd->obd_name, + obd2cli_tgt(obd), + ptlrpc_import_state_name(imp->imp_state)); + obd_connect_seq_flags2str(m, imp->imp_connect_data.ocd_connect_flags, + imp->imp_connect_data.ocd_connect_flags2, + ", "); + seq_printf(m, " ]\n"); + obd_connect_data_seqprint(m, ocd); + seq_printf(m, " import_flags: [ "); + obd_import_flags2str(imp, m); + + seq_printf(m, " ]\n" + " connection:\n" + " failover_nids: [ "); + spin_lock(&imp->imp_lock); + j = 0; + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + libcfs_nidstr_r(&conn->oic_conn->c_peer.nid, + nidstr, sizeof(nidstr)); + seq_printf(m, "%s%s", j ? 
", " : "", nidstr); + j++; + } + if (imp->imp_connection) + libcfs_nidstr_r(&imp->imp_connection->c_peer.nid, + nidstr, sizeof(nidstr)); + else + strncpy(nidstr, "", sizeof(nidstr)); + seq_printf(m, " ]\n" + " current_connection: %s\n" + " connection_attempts: %u\n" + " generation: %u\n" + " in-progress_invalidations: %u\n" + " idle: %lld sec\n", + nidstr, + imp->imp_conn_cnt, + imp->imp_generation, + atomic_read(&imp->imp_inval_count), + ktime_get_real_seconds() - imp->imp_last_reply_time); + spin_unlock(&imp->imp_lock); + + if (!obd->obd_svc_stats) + return; + + header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR]; + lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret); + if (ret.lc_count != 0) + ret.lc_sum = div64_s64(ret.lc_sum, ret.lc_count); + else + ret.lc_sum = 0; + seq_printf(m, " rpcs:\n" + " inflight: %u\n" + " unregistering: %u\n" + " timeouts: %u\n" + " avg_waittime: %llu %s\n", + atomic_read(&imp->imp_inflight), + atomic_read(&imp->imp_unregistering), + atomic_read(&imp->imp_timeouts), + ret.lc_sum, header->lc_units); + + k = 0; + for(j = 0; j < IMP_AT_MAX_PORTALS; j++) { + if (imp->imp_at.iat_portal[j] == 0) + break; + k = max_t(unsigned int, k, + at_get(&imp->imp_at.iat_service_estimate[j])); + } + seq_printf(m, " service_estimates:\n" + " services: %u sec\n" + " network: %d sec\n", + k, + at_get(&imp->imp_at.iat_net_latency)); + + seq_printf(m, " transactions:\n" + " last_replay: %llu\n" + " peer_committed: %llu\n" + " last_checked: %llu\n", + imp->imp_last_replay_transno, + imp->imp_peer_committed_transno, + imp->imp_last_transno_checked); + + /* avg data rates */ + for (rw = 0; rw <= 1; rw++) { + lprocfs_stats_collect(obd->obd_svc_stats, + PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw, + &ret); + if (ret.lc_sum > 0 && ret.lc_count > 0) { + ret.lc_sum = div64_s64(ret.lc_sum, ret.lc_count); + seq_printf(m, " %s_data_averages:\n" + " bytes_per_rpc: %llu\n", + rw ? 
"write" : "read", + ret.lc_sum); + } + k = (int)ret.lc_sum; + j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES; + header = &obd->obd_svc_stats->ls_cnt_header[j]; + lprocfs_stats_collect(obd->obd_svc_stats, j, &ret); + if (ret.lc_sum > 0 && ret.lc_count != 0) { + ret.lc_sum = div64_s64(ret.lc_sum, ret.lc_count); + seq_printf(m, " %s_per_rpc: %llu\n", + header->lc_units, ret.lc_sum); + j = (int)ret.lc_sum; + if (j > 0) + seq_printf(m, " MB_per_sec: %u.%.02u\n", + k / j, (100 * k / j) % 100); + } + } +} + +int lprocfs_import_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + int rv; + + LASSERT(obd != NULL); + with_imp_locked(obd, imp, rv) + lprocfs_import_seq_show_locked(m, obd, imp); + return rv; +} +EXPORT_SYMBOL(lprocfs_import_seq_show); + +int lprocfs_state_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + int j, k; + int rc; + + LASSERT(obd != NULL); + with_imp_locked(obd, imp, rc) { + seq_printf(m, "current_state: %s\n", + ptlrpc_import_state_name(imp->imp_state)); + seq_printf(m, "state_history:\n"); + k = imp->imp_state_hist_idx; + for (j = 0; j < IMP_STATE_HIST_LEN; j++) { + struct import_state_hist *ish = + &imp->imp_state_hist[(k + j) % IMP_STATE_HIST_LEN]; + if (ish->ish_state == 0) + continue; + seq_printf(m, " - [ %lld, %s ]\n", (s64)ish->ish_time, + ptlrpc_import_state_name(ish->ish_state)); + } + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_state_seq_show); + +int lprocfs_at_hist_helper(struct seq_file *m, struct adaptive_timeout *at) +{ + int i; + for (i = 0; i < AT_BINS; i++) + seq_printf(m, "%3u ", at->at_hist[i]); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_at_hist_helper); + +/* See also ptlrpc_lprocfs_timeouts_show_seq */ +static void lprocfs_timeouts_seq_show_locked(struct seq_file *m, + struct obd_device *obd, + struct obd_import *imp) +{ + timeout_t cur_timeout, worst_timeout; + time64_t now, worst_timestamp; + int i; + + LASSERT(obd != NULL); + + now = ktime_get_real_seconds(); + + /* Some network health info for kicks */ + seq_printf(m, "%-10s : %lld, %llds ago\n", + "last reply", (s64)imp->imp_last_reply_time, + (s64)(now - imp->imp_last_reply_time)); + + cur_timeout = at_get(&imp->imp_at.iat_net_latency); + worst_timeout = imp->imp_at.iat_net_latency.at_worst_timeout_ever; + worst_timestamp = imp->imp_at.iat_net_latency.at_worst_timestamp; + seq_printf(m, "%-10s : cur %3u worst %3u (at %lld, %llds ago) ", + "network", cur_timeout, worst_timeout, worst_timestamp, + now - worst_timestamp); + lprocfs_at_hist_helper(m, &imp->imp_at.iat_net_latency); + + for(i = 0; i < IMP_AT_MAX_PORTALS; i++) { + struct adaptive_timeout *service_est; + + if (imp->imp_at.iat_portal[i] == 0) + break; + + service_est = &imp->imp_at.iat_service_estimate[i]; + cur_timeout = at_get(service_est); + worst_timeout = service_est->at_worst_timeout_ever; + worst_timestamp = service_est->at_worst_timestamp; + seq_printf(m, "portal %-2d : cur %3u worst %3u (at %lld, %llds ago) ", + imp->imp_at.iat_portal[i], cur_timeout, + worst_timeout, worst_timestamp, + now - worst_timestamp); + lprocfs_at_hist_helper(m, service_est); + } +} + +int lprocfs_timeouts_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + int rc; + + with_imp_locked(obd, imp, rc) + lprocfs_timeouts_seq_show_locked(m, obd, imp); + return rc; +} +EXPORT_SYMBOL(lprocfs_timeouts_seq_show); + +int 
lprocfs_connect_flags_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + __u64 flags; + __u64 flags2; + struct obd_import *imp; + int rc; + + with_imp_locked(obd, imp, rc) { + flags = imp->imp_connect_data.ocd_connect_flags; + flags2 = imp->imp_connect_data.ocd_connect_flags2; + seq_printf(m, "flags=%#llx\n", flags); + seq_printf(m, "flags2=%#llx\n", flags2); + obd_connect_seq_flags2str(m, flags, flags2, "\n"); + seq_printf(m, "\n"); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_connect_flags_seq_show); + +static const struct attribute *obd_def_uuid_attrs[] = { + &lustre_attr_uuid.attr, + NULL, +}; + +static const struct attribute *obd_def_attrs[] = { + &lustre_attr_blocksize.attr, + &lustre_attr_kbytestotal.attr, + &lustre_attr_kbytesfree.attr, + &lustre_attr_kbytesavail.attr, + &lustre_attr_filestotal.attr, + &lustre_attr_filesfree.attr, + &lustre_attr_uuid.attr, + NULL, +}; + +static void obd_sysfs_release(struct kobject *kobj) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + complete(&obd->obd_kobj_unregister); +} + +int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only) +{ + struct ldebugfs_vars *debugfs_vars = NULL; + int rc; + + if (!obd || obd->obd_magic != OBD_DEVICE_MAGIC) + return -ENODEV; + + rc = kobject_set_name(&obd->obd_kset.kobj, "%s", obd->obd_name); + if (rc) + return rc; + + obd->obd_ktype.sysfs_ops = &lustre_sysfs_ops; + obd->obd_ktype.release = obd_sysfs_release; + + obd->obd_kset.kobj.parent = &obd->obd_type->typ_kobj; + obd->obd_kset.kobj.ktype = &obd->obd_ktype; + init_completion(&obd->obd_kobj_unregister); + rc = kset_register(&obd->obd_kset); + if (rc) + return rc; + + if (uuid_only) + obd->obd_attrs = obd_def_uuid_attrs; + else + obd->obd_attrs = obd_def_attrs; + + rc = sysfs_create_files(&obd->obd_kset.kobj, obd->obd_attrs); + if (rc) { + kset_unregister(&obd->obd_kset); + return rc; + } + + if (!obd->obd_type->typ_procroot) + debugfs_vars = obd->obd_debugfs_vars; + obd->obd_debugfs_entry = debugfs_create_dir( + obd->obd_name, obd->obd_type->typ_debugfs_entry); + ldebugfs_add_vars(obd->obd_debugfs_entry, debugfs_vars, obd); + + if (obd->obd_proc_entry || !obd->obd_type->typ_procroot) + GOTO(already_registered, rc); + + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + obd->obd_type->typ_procroot, + obd->obd_vars, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n",rc,obd->obd_name); + obd->obd_proc_entry = NULL; + + debugfs_remove_recursive(obd->obd_debugfs_entry); + obd->obd_debugfs_entry = NULL; + + sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); + obd->obd_attrs = NULL; + kset_unregister(&obd->obd_kset); + return rc; + } +already_registered: + return rc; +} +EXPORT_SYMBOL(lprocfs_obd_setup); + +int lprocfs_obd_cleanup(struct obd_device *obd) +{ + if (!obd) + return -EINVAL; + + if (obd->obd_proc_exports_entry) { + /* Should be no exports left */ + lprocfs_remove(&obd->obd_proc_exports_entry); + obd->obd_proc_exports_entry = NULL; + } + + if (obd->obd_proc_entry) { + lprocfs_remove(&obd->obd_proc_entry); + obd->obd_proc_entry = NULL; + } + + debugfs_remove_recursive(obd->obd_debugfs_entry); + obd->obd_debugfs_entry = NULL; + + /* obd device never allocated a kset */ + if (!obd->obd_kset.kobj.state_initialized) + return 0; + + if (obd->obd_attrs) { + sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); + obd->obd_attrs = NULL; + } + + kset_unregister(&obd->obd_kset); + 
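+	/* obd_sysfs_release() completes obd_kobj_unregister once the last
+	 * kobject reference is dropped; wait for it so the obd_device can
+	 * be torn down safely after this function returns. */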
wait_for_completion(&obd->obd_kobj_unregister); + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_cleanup); + +int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) +{ + struct lprocfs_counter *cntr; + unsigned int percpusize; + int rc = -ENOMEM; + unsigned long flags = 0; + int i; + + LASSERT(stats->ls_percpu[cpuid] == NULL); + LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0); + + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], percpusize); + if (stats->ls_percpu[cpuid]) { + rc = 0; + if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, flags); + else + spin_lock(&stats->ls_lock); + if (stats->ls_biggest_alloc_num <= cpuid) + stats->ls_biggest_alloc_num = cpuid + 1; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) { + spin_unlock_irqrestore(&stats->ls_lock, flags); + } else { + spin_unlock(&stats->ls_lock); + } + } + /* initialize the ls_percpu[cpuid] non-zero counter */ + for (i = 0; i < stats->ls_num; ++i) { + cntr = lprocfs_stats_counter_get(stats, cpuid, i); + cntr->lc_min = LC_MIN_INIT; + } + } + return rc; +} + +struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, + enum lprocfs_stats_flags flags) +{ + struct lprocfs_stats *stats; + unsigned int num_entry; + unsigned int percpusize = 0; + int i; + + if (num == 0) + return NULL; + + if (lprocfs_no_percpu_stats != 0) + flags |= LPROCFS_STATS_FLAG_NOPERCPU; + + if (flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + /* alloc percpu pointers for all possible cpu slots */ + LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); + if (!stats) + return NULL; + + stats->ls_num = num; + stats->ls_flags = flags; + stats->ls_init = ktime_get_real(); + spin_lock_init(&stats->ls_lock); + + /* alloc num of counter headers */ + CFS_ALLOC_PTR_ARRAY(stats->ls_cnt_header, stats->ls_num); + if (!stats->ls_cnt_header) + goto fail; + + if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) { + /* contains only one set counters */ + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize); + if (!stats->ls_percpu[0]) + goto fail; + stats->ls_biggest_alloc_num = 1; + } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) { + /* alloc all percpu data, currently only obd_memory use this */ + for (i = 0; i < num_entry; ++i) + if (lprocfs_stats_alloc_one(stats, i) < 0) + goto fail; + } + + return stats; + +fail: + lprocfs_free_stats(&stats); + return NULL; +} +EXPORT_SYMBOL(lprocfs_alloc_stats); + +void lprocfs_free_stats(struct lprocfs_stats **statsh) +{ + struct lprocfs_stats *stats = *statsh; + unsigned int num_entry; + unsigned int percpusize; + unsigned int i; + + if (!stats || stats->ls_num == 0) + return; + *statsh = NULL; + + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + percpusize = lprocfs_stats_counter_size(stats); + for (i = 0; i < num_entry; i++) + if (stats->ls_percpu[i]) + LIBCFS_FREE(stats->ls_percpu[i], percpusize); + if (stats->ls_cnt_header) + CFS_FREE_PTR_ARRAY(stats->ls_cnt_header, stats->ls_num); + LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); +} +EXPORT_SYMBOL(lprocfs_free_stats); + +u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field) +{ + unsigned long flags = 0; + unsigned int num_cpu; + unsigned int i; + u64 ret = 0; + + 
LASSERT(stats); + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; i++) { + struct lprocfs_counter *cntr; + + if (!stats->ls_percpu[i]) + continue; + + cntr = lprocfs_stats_counter_get(stats, i, idx); + ret += lprocfs_read_helper(cntr, &stats->ls_cnt_header[idx], + stats->ls_flags, field); + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); + return ret; +} +EXPORT_SYMBOL(lprocfs_stats_collector); + +void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ + struct lprocfs_counter *percpu_cntr; + int i; + int j; + unsigned int num_entry; + unsigned long flags = 0; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (!stats->ls_percpu[i]) + continue; + for (j = 0; j < stats->ls_num; j++) { + percpu_cntr = lprocfs_stats_counter_get(stats, i, j); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + percpu_cntr->lc_sum_irq = 0; + } + } + stats->ls_init = ktime_get_real(); + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_clear_stats); + +static ssize_t lprocfs_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct lprocfs_stats *stats = seq->private; + + lprocfs_clear_stats(stats); + + return len; +} + +static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos) +{ + struct lprocfs_stats *stats = p->private; + + return (*pos < stats->ls_num) ? pos : NULL; +} + +static void lprocfs_stats_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + (*pos)++; + + return lprocfs_stats_seq_start(p, pos); +} + +/** + * print header of stats including snapshot_time, start_time and elapsed_time. + * + * \param seq the file to print content to + * \param now end time to calculate elapsed_time + * \param ts_init start time to calculate elapsed_time + * \param width the width of key to align them well + * \param colon "" or ":" + * \param show_units show units or not + * \param prefix prefix (indent) before printing each line of header + * to align them with other content + */ +void lprocfs_stats_header(struct seq_file *seq, ktime_t now, ktime_t ts_init, + int width, const char *colon, bool show_units, + const char *prefix) +{ + const char *units = show_units ? " secs.nsecs" : ""; + struct timespec64 ts; + const char *field; + + field = (colon && colon[0]) ? "snapshot_time:" : "snapshot_time"; + ts = ktime_to_timespec64(now); + seq_printf(seq, "%s%-*s %llu.%09lu%s\n", prefix, width, field, + (s64)ts.tv_sec, ts.tv_nsec, units); + + field = (colon && colon[0]) ? "start_time:" : "start_time"; + ts = ktime_to_timespec64(ts_init); + seq_printf(seq, "%s%-*s %llu.%09lu%s\n", prefix, width, field, + (s64)ts.tv_sec, ts.tv_nsec, units); + + field = (colon && colon[0]) ? 
"elapsed_time:" : "elapsed_time"; + ts = ktime_to_timespec64(ktime_sub(now, ts_init)); + seq_printf(seq, "%s%-*s %llu.%09lu%s\n", prefix, width, field, + (s64)ts.tv_sec, ts.tv_nsec, units); +} +EXPORT_SYMBOL(lprocfs_stats_header); + +/* seq file export of one lprocfs counter */ +static int lprocfs_stats_seq_show(struct seq_file *p, void *v) +{ + struct lprocfs_stats *stats = p->private; + struct lprocfs_counter_header *hdr; + struct lprocfs_counter ctr; + int idx = *(loff_t *)v; + + if (idx == 0) + lprocfs_stats_header(p, ktime_get_real(), stats->ls_init, 25, + "", true, ""); + + hdr = &stats->ls_cnt_header[idx]; + lprocfs_stats_collect(stats, idx, &ctr); + + if (ctr.lc_count == 0) + return 0; + + seq_printf(p, "%-25s %lld samples [%s]", hdr->lc_name, + ctr.lc_count, hdr->lc_units); + + if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) && ctr.lc_count > 0) { + seq_printf(p, " %lld %lld %lld", + ctr.lc_min, ctr.lc_max, ctr.lc_sum); + if (hdr->lc_config & LPROCFS_CNTR_STDDEV) + seq_printf(p, " %llu", ctr.lc_sumsquare); + } + seq_putc(p, '\n'); + return 0; +} + +static const struct seq_operations lprocfs_stats_seq_sops = { + .start = lprocfs_stats_seq_start, + .stop = lprocfs_stats_seq_stop, + .next = lprocfs_stats_seq_next, + .show = lprocfs_stats_seq_show, +}; + +static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lprocfs_stats_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = inode->i_private ? inode->i_private : pde_data(inode); + return 0; +} + +const struct file_operations ldebugfs_stats_seq_fops = { + .owner = THIS_MODULE, + .open = lprocfs_stats_seq_open, + .read = seq_read, + .write = lprocfs_stats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; +EXPORT_SYMBOL(ldebugfs_stats_seq_fops); + +static const struct proc_ops lprocfs_stats_seq_fops = { + PROC_OWNER(THIS_MODULE) + .proc_open = lprocfs_stats_seq_open, + .proc_read = seq_read, + .proc_write = lprocfs_stats_seq_write, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_seq_release, +}; + +int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats) +{ + struct proc_dir_entry *entry; + LASSERT(root != NULL); + + entry = proc_create_data(name, 0644, root, + &lprocfs_stats_seq_fops, stats); + if (!entry) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL(lprocfs_register_stats); + +void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, const char *units) +{ + struct lprocfs_counter_header *header; + struct lprocfs_counter *percpu_cntr; + unsigned long flags = 0; + unsigned int i; + unsigned int num_cpu; + + LASSERT(stats != NULL); + + header = &stats->ls_cnt_header[index]; + LASSERTF(header != NULL, "Failed to allocate stats header:[%d]%s/%s\n", + index, name, units); + + header->lc_config = conf; + header->lc_name = name; + header->lc_units = units; + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; ++i) { + if (!stats->ls_percpu[i]) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, index); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq = 0; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_init); + +static const char * 
const mps_stats[] = { + [LPROC_MD_CLOSE] = "close", + [LPROC_MD_CREATE] = "create", + [LPROC_MD_ENQUEUE] = "enqueue", + [LPROC_MD_GETATTR] = "getattr", + [LPROC_MD_INTENT_LOCK] = "intent_lock", + [LPROC_MD_LINK] = "link", + [LPROC_MD_RENAME] = "rename", + [LPROC_MD_SETATTR] = "setattr", + [LPROC_MD_FSYNC] = "fsync", + [LPROC_MD_READ_PAGE] = "read_page", + [LPROC_MD_UNLINK] = "unlink", + [LPROC_MD_SETXATTR] = "setxattr", + [LPROC_MD_GETXATTR] = "getxattr", + [LPROC_MD_INTENT_GETATTR_ASYNC] = "intent_getattr_async", + [LPROC_MD_REVALIDATE_LOCK] = "revalidate_lock", +}; + +int lprocfs_alloc_md_stats(struct obd_device *obd, + unsigned int num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + /* + * TODO Ensure that this function is only used where + * appropriate by adding an assertion to the effect that + * obd->obd_type->typ_md_ops is not NULL. We can't do this now + * because mdt_procfs_init() uses this function to allocate + * the stats backing /proc/fs/lustre/mdt/.../md_stats but the + * mdt layer does not use the md_ops interface. This is + * confusing and a waste of memory. See LU-2484. + */ + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_md_stats == NULL); + + num_stats = ARRAY_SIZE(mps_stats) + num_private_stats; + stats = lprocfs_alloc_stats(num_stats, 0); + if (!stats) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(mps_stats); i++) { + lprocfs_counter_init(stats, i, 0, mps_stats[i], "reqs"); + if (!stats->ls_cnt_header[i].lc_name) { + CERROR("Missing md_stat initializer md_op operation at offset %d. Aborting.\n", + i); + LBUG(); + } + } + + rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->obd_md_stats = stats; + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_md_stats); + +void lprocfs_free_md_stats(struct obd_device *obd) +{ + struct lprocfs_stats *stats = obd->obd_md_stats; + + if (stats) { + obd->obd_md_stats = NULL; + lprocfs_free_stats(&stats); + } +} +EXPORT_SYMBOL(lprocfs_free_md_stats); + +void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ + lprocfs_counter_init(ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC, + 0, "ldlm_enqueue", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC, + 0, "ldlm_convert", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC, + 0, "ldlm_cancel", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_bl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_cp_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_gl_callback", "reqs"); +} +EXPORT_SYMBOL(lprocfs_init_ldlm_stats); + +__s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field) +{ + __s64 ret = 0; + + if (!lc || !header) + RETURN(0); + + switch (field) { + case LPROCFS_FIELDS_FLAGS_CONFIG: + ret = header->lc_config; + break; + case LPROCFS_FIELDS_FLAGS_SUM: + ret = lc->lc_sum; + if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + ret += lc->lc_sum_irq; + break; + case LPROCFS_FIELDS_FLAGS_MIN: + ret = lc->lc_min; + break; + case LPROCFS_FIELDS_FLAGS_MAX: + ret = lc->lc_max; + break; + case LPROCFS_FIELDS_FLAGS_AVG: + ret = (lc->lc_max - lc->lc_min) / 2; + break; + case LPROCFS_FIELDS_FLAGS_SUMSQUARE: + ret = lc->lc_sumsquare; + break; + case 
LPROCFS_FIELDS_FLAGS_COUNT: + ret = lc->lc_count; + break; + default: + break; + }; + RETURN(ret); +} +EXPORT_SYMBOL(lprocfs_read_helper); + +/** + * string_to_size - convert ASCII string representing a numerical + * value with optional units to 64-bit binary value + * + * @size: The numerical value extract out of @buffer + * @buffer: passed in string to parse + * @count: length of the @buffer + * + * This function returns a 64-bit binary value if @buffer contains a valid + * numerical string. The string is parsed to 3 significant figures after + * the decimal point. Support the string containing an optional units at + * the end which can be base 2 or base 10 in value. If no units are given + * the string is assumed to just a numerical value. + * + * Returns: @count if the string is successfully parsed, + * -errno on invalid input strings. Error values: + * + * - ``-EINVAL``: @buffer is not a proper numerical string + * - ``-EOVERFLOW``: results does not fit into 64 bits. + * - ``-E2BIG ``: @buffer is too large (not a valid number) + */ +int string_to_size(u64 *size, const char *buffer, size_t count) +{ + /* For string_get_size() it can support values above exabytes, + * (ZiB, YiB) due to breaking the return value into a size and + * bulk size to avoid 64 bit overflow. We don't break the size + * up into block size units so we don't support ZiB or YiB. + */ + static const char *const units_10[] = { + "kB", "MB", "GB", "TB", "PB", "EB", + }; + static const char *const units_2[] = { + "K", "M", "G", "T", "P", "E", + }; + static const char *const *const units_str[] = { + [STRING_UNITS_2] = units_2, + [STRING_UNITS_10] = units_10, + }; + static const unsigned int coeff[] = { + [STRING_UNITS_10] = 1000, + [STRING_UNITS_2] = 1024, + }; + enum string_size_units unit = STRING_UNITS_2; + u64 whole, blk_size = 1; + char kernbuf[22], *end; + size_t len = count; + int rc; + int i; + + if (count >= sizeof(kernbuf)) { + CERROR("count %zd > buffer %zd\n", count, sizeof(kernbuf)); + return -E2BIG; + } + + *size = 0; + /* The "iB" suffix is optionally allowed for indicating base-2 numbers. + * If suffix is only "B" and not "iB" then we treat it as base-10. + */ + end = strstr(buffer, "B"); + if (end && *(end - 1) != 'i') + unit = STRING_UNITS_10; + + i = unit == STRING_UNITS_2 ? ARRAY_SIZE(units_2) - 1 : + ARRAY_SIZE(units_10) - 1; + do { + end = strnstr(buffer, units_str[unit][i], count); + if (end) { + for (; i >= 0; i--) + blk_size *= coeff[unit]; + len = end - buffer; + break; + } + } while (i--); + + /* as 'B' is a substring of all units, we need to handle it + * separately. 
+ */ + if (!end) { + /* 'B' is only acceptable letter at this point */ + end = strnchr(buffer, count, 'B'); + if (end) { + len = end - buffer; + + if (count - len > 2 || + (count - len == 2 && strcmp(end, "B\n") != 0)) { + CDEBUG(D_INFO, "unknown suffix '%s'\n", buffer); + return -EINVAL; + } + } + /* kstrtoull will error out if it has non digits */ + goto numbers_only; + } + + end = strnchr(buffer, count, '.'); + if (end) { + /* need to limit 3 decimal places */ + char rem[4] = "000"; + u64 frac = 0; + size_t off; + + len = end - buffer; + end++; + + /* limit to 3 decimal points */ + off = min_t(size_t, 3, strspn(end, "0123456789")); + /* need to limit frac_d to a u32 */ + memcpy(rem, end, off); + rc = kstrtoull(rem, 10, &frac); + if (rc) + return rc; + + if (fls64(frac) + fls64(blk_size) - 1 > 64) + return -EOVERFLOW; + + frac *= blk_size; + do_div(frac, 1000); + *size += frac; + } +numbers_only: + snprintf(kernbuf, sizeof(kernbuf), "%.*s", (int)len, buffer); + rc = kstrtoull(kernbuf, 10, &whole); + if (rc) + return rc; + + if (whole != 0 && fls64(whole) + fls64(blk_size) - 1 > 64) + return -EOVERFLOW; + + *size += whole * blk_size; + + return count; +} +EXPORT_SYMBOL(string_to_size); + +/** + * sysfs_memparse - parse a ASCII string to 64-bit binary value, + * with optional units + * + * @buffer: kernel pointer to input string + * @count: number of bytes in the input @buffer + * @val: (output) binary value returned to caller + * @defunit: default unit suffix to use if none is provided + * + * Parses a string into a number. The number stored at @buffer is + * potentially suffixed with K, M, G, T, P, E. Besides these other + * valid suffix units are shown in the string_to_size() function. + * If the string lacks a suffix then the defunit is used. The defunit + * should be given as a binary unit (e.g. MiB) as that is the standard + * for tunables in Lustre. If no unit suffix is given (e.g. 'G'), then + * it is assumed to be in binary units. + * + * Returns: 0 on success or -errno on failure. + */ +int sysfs_memparse(const char *buffer, size_t count, u64 *val, + const char *defunit) +{ + const char *param = buffer; + char tmp_buf[23]; + int rc; + + count = strlen(buffer); + while (count > 0 && isspace(buffer[count - 1])) + count--; + + if (!count) + RETURN(-EINVAL); + + /* If there isn't already a unit on this value, append @defunit. + * Units of 'B' don't affect the value, so don't bother adding. + */ + if (!isalpha(buffer[count - 1]) && defunit[0] != 'B') { + if (count + 3 >= sizeof(tmp_buf)) { + CERROR("count %zd > size %zd\n", count, sizeof(param)); + RETURN(-E2BIG); + } + + scnprintf(tmp_buf, sizeof(tmp_buf), "%.*s%s", (int)count, + buffer, defunit); + param = tmp_buf; + count = strlen(param); + } + + rc = string_to_size(val, param, count); + + return rc < 0 ? rc : 0; +} +EXPORT_SYMBOL(sysfs_memparse); + +char *lprocfs_strnstr(const char *s1, const char *s2, size_t len) +{ + size_t l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + while (len >= l2) { + len--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} +EXPORT_SYMBOL(lprocfs_strnstr); + +/** + * Find the string \a name in the input \a buffer, and return a pointer to the + * value immediately following \a name, reducing \a count appropriately. + * If \a name is not found the original \a buffer is returned. 
+ */ +char *lprocfs_find_named_value(const char *buffer, const char *name, + size_t *count) +{ + char *val; + size_t buflen = *count; + + /* there is no strnstr() in rhel5 and ubuntu kernels */ + val = lprocfs_strnstr(buffer, name, buflen); + if (!val) + return (char *)buffer; + + val += strlen(name); /* skip prefix */ + while (val < buffer + buflen && isspace(*val)) /* skip separator */ + val++; + + *count = 0; + while (val < buffer + buflen && isalnum(*val)) { + ++*count; + ++val; + } + + return val - *count; +} +EXPORT_SYMBOL(lprocfs_find_named_value); + +int lprocfs_seq_create(struct proc_dir_entry *parent, + const char *name, + mode_t mode, + const struct proc_ops *seq_fops, + void *data) +{ + struct proc_dir_entry *entry; + ENTRY; + + /* Disallow secretly (un)writable entries. */ + LASSERT(!seq_fops->proc_write == !(mode & 0222)); + + entry = proc_create_data(name, mode, parent, seq_fops, data); + + if (!entry) + RETURN(-ENOMEM); + + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_seq_create); + +int lprocfs_obd_seq_create(struct obd_device *obd, + const char *name, + mode_t mode, + const struct proc_ops *seq_fops, + void *data) +{ + return lprocfs_seq_create(obd->obd_proc_entry, name, + mode, seq_fops, data); +} +EXPORT_SYMBOL(lprocfs_obd_seq_create); + +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ + if (value >= OBD_HIST_MAX) + value = OBD_HIST_MAX - 1; + + spin_lock(&oh->oh_lock); + oh->oh_buckets[value]++; + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_tally); + +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ + unsigned int val = 0; + + if (likely(value != 0)) + val = min(fls(value - 1), OBD_HIST_MAX); + + lprocfs_oh_tally(oh, val); +} +EXPORT_SYMBOL(lprocfs_oh_tally_log2); + +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ + unsigned long ret = 0; + int i; + + for (i = 0; i < OBD_HIST_MAX; i++) + ret += oh->oh_buckets[i]; + return ret; +} +EXPORT_SYMBOL(lprocfs_oh_sum); + +void lprocfs_oh_clear(struct obd_histogram *oh) +{ + spin_lock(&oh->oh_lock); + memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets)); + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_clear); + +void lprocfs_oh_tally_pcpu(struct obd_hist_pcpu *oh, + unsigned int value) +{ + if (value >= OBD_HIST_MAX) + value = OBD_HIST_MAX - 1; + + percpu_counter_inc(&oh->oh_pc_buckets[value]); +} +EXPORT_SYMBOL(lprocfs_oh_tally_pcpu); + +void lprocfs_oh_tally_log2_pcpu(struct obd_hist_pcpu *oh, + unsigned int value) +{ + unsigned int val = 0; + + if (likely(value != 0)) + val = min(fls(value - 1), OBD_HIST_MAX); + + lprocfs_oh_tally_pcpu(oh, val); +} +EXPORT_SYMBOL(lprocfs_oh_tally_log2_pcpu); + +unsigned long lprocfs_oh_counter_pcpu(struct obd_hist_pcpu *oh, + unsigned int value) +{ + return percpu_counter_sum(&oh->oh_pc_buckets[value]); +} +EXPORT_SYMBOL(lprocfs_oh_counter_pcpu); + +unsigned long lprocfs_oh_sum_pcpu(struct obd_hist_pcpu *oh) +{ + unsigned long ret = 0; + int i; + + for (i = 0; i < OBD_HIST_MAX; i++) + ret += percpu_counter_sum(&oh->oh_pc_buckets[i]); + + return ret; +} +EXPORT_SYMBOL(lprocfs_oh_sum_pcpu); + +int lprocfs_oh_alloc_pcpu(struct obd_hist_pcpu *oh) +{ + int i, rc; + + if (oh->oh_initialized) + return 0; + + for (i = 0; i < OBD_HIST_MAX; i++) { + rc = percpu_counter_init(&oh->oh_pc_buckets[i], 0, GFP_KERNEL); + if (rc) + goto out; + } + + oh->oh_initialized = true; + + return 0; + +out: + for (i--; i >= 0; i--) + percpu_counter_destroy(&oh->oh_pc_buckets[i]); + + return rc; +} +EXPORT_SYMBOL(lprocfs_oh_alloc_pcpu); + +void 
lprocfs_oh_clear_pcpu(struct obd_hist_pcpu *oh) +{ + int i; + + for (i = 0; i < OBD_HIST_MAX; i++) + percpu_counter_set(&oh->oh_pc_buckets[i], 0); +} +EXPORT_SYMBOL(lprocfs_oh_clear_pcpu); + +void lprocfs_oh_release_pcpu(struct obd_hist_pcpu *oh) +{ + int i; + + if (!oh->oh_initialized) + return; + + for (i = 0; i < OBD_HIST_MAX; i++) + percpu_counter_destroy(&oh->oh_pc_buckets[i]); + + oh->oh_initialized = false; +} +EXPORT_SYMBOL(lprocfs_oh_release_pcpu); + +ssize_t lustre_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct lustre_attr *a = container_of(attr, struct lustre_attr, attr); + + return a->show ? a->show(kobj, attr, buf) : 0; +} +EXPORT_SYMBOL_GPL(lustre_attr_show); + +ssize_t lustre_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct lustre_attr *a = container_of(attr, struct lustre_attr, attr); + + return a->store ? a->store(kobj, attr, buf, len) : len; +} +EXPORT_SYMBOL_GPL(lustre_attr_store); + +const struct sysfs_ops lustre_sysfs_ops = { + .show = lustre_attr_show, + .store = lustre_attr_store, +}; +EXPORT_SYMBOL_GPL(lustre_sysfs_ops); + +int lprocfs_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct client_obd *cli = &obd->u.cli; + + spin_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%d\n", cli->cl_max_pages_per_rpc); + spin_unlock(&cli->cl_loi_list_lock); + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_max_pages_per_rpc_seq_show); + +ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp; + struct obd_connect_data *ocd; + int chunk_mask, rc; + char kernbuf[22]; + u64 val; + + if (count > sizeof(kernbuf) - 1) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + rc = sysfs_memparse(kernbuf, count, &val, "B"); + if (rc) + return rc; + + /* if the max_pages is specified in bytes, convert to pages */ + if (val >= ONE_MB_BRW_SIZE) + val >>= PAGE_SHIFT; + + with_imp_locked(obd, imp, rc) { + ocd = &imp->imp_connect_data; + chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1); + /* max_pages_per_rpc must be chunk aligned */ + val = (val + ~chunk_mask) & chunk_mask; + if (val == 0 || (ocd->ocd_brw_size != 0 && + val > ocd->ocd_brw_size >> PAGE_SHIFT)) { + rc = -ERANGE; + } else { + spin_lock(&cli->cl_loi_list_lock); + cli->cl_max_pages_per_rpc = val; + client_adjust_max_dirty(cli); + spin_unlock(&cli->cl_loi_list_lock); + } + } + + return rc ?: count; +} +EXPORT_SYMBOL(lprocfs_obd_max_pages_per_rpc_seq_write); + +ssize_t short_io_bytes_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + int rc; + + spin_lock(&cli->cl_loi_list_lock); + rc = sprintf(buf, "%d\n", cli->cl_max_short_io_bytes); + spin_unlock(&cli->cl_loi_list_lock); + return rc; +} +EXPORT_SYMBOL(short_io_bytes_show); + +/* Used to catch people who think they're specifying pages. 
*/ +#define MIN_SHORT_IO_BYTES 64U + +ssize_t short_io_bytes_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + u64 val; + int rc; + + if (strcmp(buffer, "-1") == 0) { + val = OBD_DEF_SHORT_IO_BYTES; + } else { + rc = sysfs_memparse(buffer, count, &val, "B"); + if (rc) + GOTO(out, rc); + } + + if (val && (val < MIN_SHORT_IO_BYTES || val > LNET_MTU)) + GOTO(out, rc = -ERANGE); + + rc = count; + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_max_short_io_bytes = min_t(u64, val, OST_MAX_SHORT_IO_BYTES); + spin_unlock(&cli->cl_loi_list_lock); + +out: + return rc; +} +EXPORT_SYMBOL(short_io_bytes_store); + +int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name) +{ + int rc; + char kernbuf[64], *tmp, *errmsg; + unsigned long uid, gid; + ENTRY; + + if (count >= sizeof(kernbuf)) { + errmsg = "string too long"; + GOTO(failed_noprint, rc = -EINVAL); + } + if (copy_from_user(kernbuf, buffer, count)) { + errmsg = "bad address"; + GOTO(failed_noprint, rc = -EFAULT); + } + kernbuf[count] = '\0'; + + /* look for uid gid separator */ + tmp = strchr(kernbuf, ':'); + if (!tmp) { + errmsg = "needs uid:gid format"; + GOTO(failed, rc = -EINVAL); + } + *tmp = '\0'; + tmp++; + + /* parse uid */ + if (kstrtoul(kernbuf, 0, &uid) != 0) { + errmsg = "bad uid"; + GOTO(failed, rc = -EINVAL); + } + + /* parse gid */ + if (kstrtoul(tmp, 0, &gid) != 0) { + errmsg = "bad gid"; + GOTO(failed, rc = -EINVAL); + } + + squash->rsi_uid = uid; + squash->rsi_gid = gid; + + LCONSOLE_INFO("%s: root_squash is set to %u:%u\n", + name, squash->rsi_uid, squash->rsi_gid); + RETURN(count); + +failed: + if (tmp) { + tmp--; + *tmp = ':'; + } + CWARN("%s: failed to set root_squash to \"%s\", %s, rc = %d\n", + name, kernbuf, errmsg, rc); + RETURN(rc); +failed_noprint: + CWARN("%s: failed to set root_squash due to %s, rc = %d\n", + name, errmsg, rc); + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_wr_root_squash); + + +int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name) +{ + int rc; + char *kernbuf = NULL; + char *errmsg; + LIST_HEAD(tmp); + int len = count; + ENTRY; + + if (count > 4096) { + errmsg = "string too long"; + GOTO(failed, rc = -EINVAL); + } + + OBD_ALLOC(kernbuf, count + 1); + if (!kernbuf) { + errmsg = "no memory"; + GOTO(failed, rc = -ENOMEM); + } + if (copy_from_user(kernbuf, buffer, count)) { + errmsg = "bad address"; + GOTO(failed, rc = -EFAULT); + } + kernbuf[count] = '\0'; + + if (count > 0 && kernbuf[count - 1] == '\n') + len = count - 1; + + if ((len == 4 && strncmp(kernbuf, "NONE", len) == 0) || + (len == 5 && strncmp(kernbuf, "clear", len) == 0)) { + /* empty string is special case */ + spin_lock(&squash->rsi_lock); + if (!list_empty(&squash->rsi_nosquash_nids)) + cfs_free_nidlist(&squash->rsi_nosquash_nids); + spin_unlock(&squash->rsi_lock); + LCONSOLE_INFO("%s: nosquash_nids is cleared\n", name); + OBD_FREE(kernbuf, count + 1); + RETURN(count); + } + + if (cfs_parse_nidlist(kernbuf, count, &tmp) <= 0) { + errmsg = "can't parse"; + GOTO(failed, rc = -EINVAL); + } + LCONSOLE_INFO("%s: nosquash_nids set to %s\n", + name, kernbuf); + OBD_FREE(kernbuf, count + 1); + kernbuf = NULL; + + spin_lock(&squash->rsi_lock); + if (!list_empty(&squash->rsi_nosquash_nids)) + cfs_free_nidlist(&squash->rsi_nosquash_nids); + list_splice(&tmp, 
&squash->rsi_nosquash_nids); + spin_unlock(&squash->rsi_lock); + + RETURN(count); + +failed: + if (kernbuf) { + CWARN("%s: failed to set nosquash_nids to \"%s\", %s rc = %d\n", + name, kernbuf, errmsg, rc); + OBD_FREE(kernbuf, count + 1); + } else { + CWARN("%s: failed to set nosquash_nids due to %s rc = %d\n", + name, errmsg, rc); + } + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_wr_nosquash_nids); + +#endif /* CONFIG_PROC_FS*/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c new file mode 100644 index 0000000000000..a09ae67d89e33 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c @@ -0,0 +1,1121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lprocfs_status_server.c + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include +#include +#include + +#define MAX_STRING_SIZE 128 + +struct dentry *ldebugfs_add_symlink(const char *name, const char *target, + const char *format, ...) 
+{ + struct dentry *entry = NULL; + struct dentry *parent; + struct qstr dname; + va_list ap; + char *dest; + + if (!target || !format) + return NULL; + + dname.name = target; + dname.len = strlen(dname.name); + dname.hash = ll_full_name_hash(debugfs_lustre_root, + dname.name, dname.len); + parent = d_lookup(debugfs_lustre_root, &dname); + if (!parent) + return NULL; + + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (!dest) + goto no_entry; + + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); + + entry = debugfs_create_symlink(name, parent, dest); + + OBD_FREE(dest, MAX_STRING_SIZE + 1); +no_entry: + dput(parent); + return entry; +} +EXPORT_SYMBOL(ldebugfs_add_symlink); + +#ifdef CONFIG_PROC_FS + +int lprocfs_evict_client_open(struct inode *inode, struct file *f) +{ + struct obd_device *obd = pde_data(file_inode(f)); + + atomic_inc(&obd->obd_evict_inprogress); + return 0; +} + +int lprocfs_evict_client_release(struct inode *inode, struct file *f) +{ + struct obd_device *obd = pde_data(file_inode(f)); + + atomic_dec(&obd->obd_evict_inprogress); + wake_up(&obd->obd_evict_inprogress_waitq); + + return 0; +} + +#define BUFLEN (UUID_MAX + 5) + +ssize_t +lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + char *tmpbuf, *kbuf; + + OBD_ALLOC(kbuf, BUFLEN); + if (kbuf == NULL) + return -ENOMEM; + + /* + * OBD_ALLOC() will zero kbuf, but we only copy BUFLEN - 1 + * bytes into kbuf, to ensure that the string is NUL-terminated. + * UUID_MAX should include a trailing NUL already. + */ + if (copy_from_user(kbuf, buffer, + min_t(unsigned long, BUFLEN - 1, count))) { + count = -EFAULT; + goto out; + } + tmpbuf = skip_spaces(kbuf); + tmpbuf = strsep(&tmpbuf, " \t\n\f\v\r"); + class_incref(obd, __func__, current); + + if (strncmp(tmpbuf, "nid:", 4) == 0) + obd_export_evict_by_nid(obd, tmpbuf + 4); + else if (strncmp(tmpbuf, "uuid:", 5) == 0) + obd_export_evict_by_uuid(obd, tmpbuf + 5); + else + obd_export_evict_by_uuid(obd, tmpbuf); + + class_decref(obd, __func__, current); + +out: + OBD_FREE(kbuf, BUFLEN); + return count; +} +EXPORT_SYMBOL(lprocfs_evict_client_seq_write); + +#undef BUFLEN + +ssize_t num_exports_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", obd->obd_num_exports); +} +EXPORT_SYMBOL(num_exports_show); + +ssize_t grant_check_threshold_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", + obd->obd_grant_check_threshold); +} +EXPORT_SYMBOL(grant_check_threshold_show); + +ssize_t grant_check_threshold_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + int val; + int rc; + + rc = kstrtoint(buffer, 10, &val); + if (rc) + return rc; + + if (val < 0) + return -EINVAL; + obd->obd_grant_check_threshold = val; + return count; +} +EXPORT_SYMBOL(grant_check_threshold_store); + +static int obd_export_flags2str(struct obd_export *exp, struct seq_file *m) +{ + bool first = true; + + flag2str(exp, failed); + flag2str(exp, in_recovery); + flag2str(exp, disconnected); + flag2str(exp, connecting); + flag2str(exp, no_recovery); 
+ + return 0; +} + +static int +lprocfs_exp_print_export_seq(struct obd_export *exp, void *cb_data) +{ + struct seq_file *m = cb_data; + struct obd_device *obd; + struct obd_connect_data *ocd; + + LASSERT(exp != NULL); + if (exp->exp_nid_stats == NULL) + goto out; + obd = exp->exp_obd; + ocd = &exp->exp_connect_data; + + seq_printf(m, "%s:\n" + " name: %s\n" + " client: %s\n" + " connect_flags: [ ", + obd_uuid2str(&exp->exp_client_uuid), + obd->obd_name, + obd_export_nid2str(exp)); + obd_connect_seq_flags2str(m, ocd->ocd_connect_flags, + ocd->ocd_connect_flags2, ", "); + seq_printf(m, " ]\n"); + obd_connect_data_seqprint(m, ocd); + seq_printf(m, " export_flags: [ "); + obd_export_flags2str(exp, m); + seq_printf(m, " ]\n"); + + if (obd->obd_type && + strcmp(obd->obd_type->typ_name, "obdfilter") == 0) { + struct filter_export_data *fed = &exp->exp_filter_data; + + seq_printf(m, " grant:\n"); + seq_printf(m, " granted: %ld\n", + fed->fed_ted.ted_grant); + seq_printf(m, " dirty: %ld\n", + fed->fed_ted.ted_dirty); + seq_printf(m, " pending: %ld\n", + fed->fed_ted.ted_pending); + } + +out: + return 0; +} + +/** + * RPC connections are composed of an import and an export. Using the + * lctl utility we can extract important information about the state. + * The lprocfs_exp_export_seq_show routine displays the state information + * for the export. + * + * \param[in] m seq file + * \param[in] data unused + * + * \retval 0 on success + * + * The format of the export state information is like: + * a793e354-49c0-aa11-8c4f-a4f2b1a1a92b: + * name: MGS + * client: 10.211.55.10@tcp + * connect_flags: [ version, barrier, adaptive_timeouts, ... ] + * connect_data: + * flags: 0x2000011005002020 + * instance: 0 + * target_version: 2.10.51.0 + * export_flags: [ ... ] + * + */ +static int lprocfs_exp_export_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + + return obd_nid_export_for_each(stats->nid_obd, &stats->nid, + lprocfs_exp_print_export_seq, m); +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_export); + +static void lprocfs_free_client_stats(struct nid_stat *client_stat) +{ + CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat, + client_stat->nid_proc, client_stat->nid_stats); + + LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0, + "nid %s:count %d\n", libcfs_nidstr(&client_stat->nid), + atomic_read(&client_stat->nid_exp_ref_count)); + + if (client_stat->nid_proc) + lprocfs_remove(&client_stat->nid_proc); + + if (client_stat->nid_stats) + lprocfs_free_stats(&client_stat->nid_stats); + + if (client_stat->nid_ldlm_stats) + lprocfs_free_stats(&client_stat->nid_ldlm_stats); + + OBD_FREE_PTR(client_stat); +} + +void lprocfs_free_per_client_stats(struct obd_device *obd) +{ + struct cfs_hash *hash = obd->obd_nid_stats_hash; + struct nid_stat *stat; + ENTRY; + + /* we need extra list - because hash_exit called to early */ + /* not need locking because all clients is died */ + while (!list_empty(&obd->obd_nid_stats)) { + stat = list_entry(obd->obd_nid_stats.next, + struct nid_stat, nid_list); + list_del_init(&stat->nid_list); + cfs_hash_del(hash, &stat->nid, &stat->nid_hash); + lprocfs_free_client_stats(stat); + } + EXIT; +} +EXPORT_SYMBOL(lprocfs_free_per_client_stats); + +static int +lprocfs_exp_print_nodemap_seq(struct obd_export *exp, void *cb_data) +{ + struct lu_nodemap *nodemap = exp->exp_target_data.ted_nodemap; + struct seq_file *m = cb_data; + + if (nodemap) + seq_printf(m, "%s\n", nodemap->nm_name); + return 0; +} + +static int +lprocfs_exp_nodemap_seq_show(struct 
seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + + return obd_nid_export_for_each(stats->nid_obd, &stats->nid, + lprocfs_exp_print_nodemap_seq, m); +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_nodemap); + +static int +lprocfs_exp_print_uuid_seq(struct obd_export *exp, void *cb_data) +{ + struct seq_file *m = cb_data; + + if (exp->exp_nid_stats) + seq_printf(m, "%s\n", obd_uuid2str(&exp->exp_client_uuid)); + return 0; +} + +static int lprocfs_exp_uuid_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + + return obd_nid_export_for_each(stats->nid_obd, &stats->nid, + lprocfs_exp_print_uuid_seq, m); +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_uuid); + +#define HASH_NAME_LEN 16 + +static void ldebugfs_rhash_seq_show(const char *name, struct rhashtable *ht, + struct seq_file *m) +{ + unsigned int max_size = ht->p.max_size ? ht->p.max_size : UINT_MAX; + struct bucket_table *tbl; + int dist[8] = { 0, }; + int maxdep = 0; + int i; + + rcu_read_lock(); + tbl = rht_dereference(ht->tbl, ht); + for (i = 0; i < tbl->size; i++) { + struct rhash_head *pos; + int count = 0; + + rht_for_each(pos, tbl, i) + count++; + + if (count) + maxdep = max(maxdep, count); + + dist[min(fls(count), 7)]++; + } + + seq_printf(m, "%-*s %5d %5d %10u %d.%03d 0.300 0.750 0x%03x %7d %7d %7d ", + HASH_NAME_LEN, name, tbl->size, ht->p.min_size, max_size, + atomic_read(&ht->nelems) / tbl->size, + atomic_read(&ht->nelems) * 1000 / tbl->size, + ht->p.automatic_shrinking, 0, + atomic_read(&ht->nelems), maxdep); + rcu_read_unlock(); + + for (i = 0; i < 8; i++) + seq_printf(m, "%d%c", dist[i], (i == 7) ? '\n' : '/'); +} + +static int +lprocfs_exp_print_hash_seq(struct obd_export *exp, void *cb_data) + +{ + struct obd_device *obd = exp->exp_obd; + struct seq_file *m = cb_data; + + if (exp->exp_lock_hash != NULL) { + seq_printf(m, "%-*s cur min max theta t-min t-max flags rehash count distribution\n", + HASH_NAME_LEN, "name"); + ldebugfs_rhash_seq_show("NID_HASH", &obd->obd_nid_hash.ht, m); + } + return 0; +} + +static int lprocfs_exp_hash_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + + return obd_nid_export_for_each(stats->nid_obd, &stats->nid, + lprocfs_exp_print_hash_seq, m); +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_hash); + +int lprocfs_exp_print_replydata_seq(struct obd_export *exp, void *cb_data) + +{ + struct seq_file *m = cb_data; + struct tg_export_data *ted = &exp->exp_target_data; + + seq_printf(m, "reply_cnt: %d\n" + "reply_max: %d\n" + "reply_released_by_xid: %d\n" + "reply_released_by_tag: %d\n\n", + ted->ted_reply_cnt, + ted->ted_reply_max, + ted->ted_release_xid, + ted->ted_release_tag); + return 0; +} + +int lprocfs_exp_replydata_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + + return obd_nid_export_for_each(stats->nid_obd, &stats->nid, + lprocfs_exp_print_replydata_seq, m); +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_replydata); + +int lprocfs_exp_print_fmd_count_seq(struct obd_export *exp, void *cb_data) +{ + struct seq_file *m = cb_data; + struct tg_export_data *ted = &exp->exp_target_data; + + seq_printf(m, "%d\n", ted->ted_fmd_count); + + return 0; +} + +int lprocfs_exp_fmd_count_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + + return obd_nid_export_for_each(stats->nid_obd, &stats->nid, + lprocfs_exp_print_fmd_count_seq, m); +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_fmd_count); + +int lprocfs_nid_stats_clear_seq_show(struct seq_file *m, void *data) +{ + seq_puts(m, "Write into this file to clear 
all nid stats and stale nid entries\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_seq_show); + +static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data) +{ + struct nid_stat *stat = obj; + ENTRY; + + CDEBUG(D_INFO, "refcnt %d\n", atomic_read(&stat->nid_exp_ref_count)); + if (atomic_read(&stat->nid_exp_ref_count) == 1) { + /* object has only hash references. */ + spin_lock(&stat->nid_obd->obd_nid_lock); + list_move(&stat->nid_list, data); + spin_unlock(&stat->nid_obd->obd_nid_lock); + RETURN(1); + } + /* we has reference to object - only clear data*/ + if (stat->nid_stats) + lprocfs_clear_stats(stat->nid_stats); + + RETURN(0); +} + +ssize_t +lprocfs_nid_stats_clear_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct nid_stat *client_stat; + LIST_HEAD(free_list); + + cfs_hash_cond_del(obd->obd_nid_stats_hash, + lprocfs_nid_stats_clear_write_cb, &free_list); + + while (!list_empty(&free_list)) { + client_stat = list_entry(free_list.next, struct nid_stat, + nid_list); + list_del_init(&client_stat->nid_list); + lprocfs_free_client_stats(client_stat); + } + return count; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_seq_write); + +int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) +{ + struct nid_stat *new_stat, *old_stat; + struct obd_device *obd = NULL; + struct proc_dir_entry *entry; + char nidstr[LNET_NIDSTR_SIZE]; + int rc = 0; + ENTRY; + + if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry || + !exp->exp_obd->obd_nid_stats_hash) + RETURN(-EINVAL); + + /* not test against zero because eric say: + * You may only test nid against another nid, or LNET_NID_ANY. + * Anything else is nonsense.*/ + if (nid == NULL || *nid == LNET_NID_ANY) + RETURN(-EALREADY); + + libcfs_nid2str_r(*nid, nidstr, sizeof(nidstr)); + + spin_lock(&exp->exp_lock); + if (exp->exp_nid_stats != NULL) { + spin_unlock(&exp->exp_lock); + RETURN(-EALREADY); + } + spin_unlock(&exp->exp_lock); + + obd = exp->exp_obd; + + CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash); + + OBD_ALLOC_PTR(new_stat); + if (new_stat == NULL) + RETURN(-ENOMEM); + + lnet_nid4_to_nid(*nid, &new_stat->nid); + new_stat->nid_obd = exp->exp_obd; + /* we need set default refcount to 1 to balance obd_disconnect */ + atomic_set(&new_stat->nid_exp_ref_count, 1); + + old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash, + &new_stat->nid, + &new_stat->nid_hash); + CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n", + old_stat, nidstr, atomic_read(&old_stat->nid_exp_ref_count)); + + /* Return -EALREADY here so that we know that the /proc + * entry already has been created */ + if (old_stat != new_stat) { + spin_lock(&exp->exp_lock); + if (exp->exp_nid_stats) { + LASSERT(exp->exp_nid_stats == old_stat); + nidstat_putref(exp->exp_nid_stats); + } + exp->exp_nid_stats = old_stat; + spin_unlock(&exp->exp_lock); + GOTO(destroy_new, rc = -EALREADY); + } + /* not found - create */ + new_stat->nid_proc = lprocfs_register(nidstr, + obd->obd_proc_exports_entry, + NULL, NULL); + + if (IS_ERR(new_stat->nid_proc)) { + rc = PTR_ERR(new_stat->nid_proc); + new_stat->nid_proc = NULL; + CERROR("%s: cannot create proc entry for export %s: rc = %d\n", + obd->obd_name, nidstr, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "nodemap", new_stat, + &lprocfs_exp_nodemap_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the 
nodemap file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "uuid", new_stat, + &lprocfs_exp_uuid_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the NID stats file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "hash", new_stat, + &lprocfs_exp_hash_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the hash file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "export", + new_stat, &lprocfs_exp_export_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the export file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "reply_data", new_stat, + &lprocfs_exp_replydata_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the reply_data file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "fmd_count", new_stat, + &lprocfs_exp_fmd_count_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the fmd_count file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + spin_lock(&exp->exp_lock); + exp->exp_nid_stats = new_stat; + spin_unlock(&exp->exp_lock); + + /* protect competitive add to list, not need locking on destroy */ + spin_lock(&obd->obd_nid_lock); + list_add(&new_stat->nid_list, &obd->obd_nid_stats); + spin_unlock(&obd->obd_nid_lock); + + RETURN(0); + +destroy_new_ns: + if (new_stat->nid_proc != NULL) + lprocfs_remove(&new_stat->nid_proc); + cfs_hash_del(obd->obd_nid_stats_hash, &new_stat->nid, + &new_stat->nid_hash); + +destroy_new: + nidstat_putref(new_stat); + OBD_FREE_PTR(new_stat); + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_exp_setup); + +int lprocfs_exp_cleanup(struct obd_export *exp) +{ + struct nid_stat *stat = exp->exp_nid_stats; + + if (!stat || !exp->exp_obd) + RETURN(0); + + nidstat_putref(exp->exp_nid_stats); + exp->exp_nid_stats = NULL; + + return 0; +} + +int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned int num_stats) +{ + struct lprocfs_stats *stats; + int rc; + + LASSERT(obd->obd_stats == NULL); + LASSERT(obd->obd_proc_entry != NULL); + + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats); + if (rc < 0) + lprocfs_free_stats(&stats); + else + obd->obd_stats = stats; + + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_obd_stats); + +void lprocfs_free_obd_stats(struct obd_device *obd) +{ + if (obd->obd_stats) + lprocfs_free_stats(&obd->obd_stats); +} +EXPORT_SYMBOL(lprocfs_free_obd_stats); + +static void display_brw_stats(struct seq_file *seq, const char *name, + const char *units, struct obd_hist_pcpu *read, + struct obd_hist_pcpu *write, bool scale) +{ + unsigned long read_tot, write_tot, r, w, read_cum = 0, write_cum = 0; + unsigned int i; + + seq_printf(seq, "\n%26s read | write\n", " "); + seq_printf(seq, "%-22s %-5s %% cum %% | %-11s %% cum %%\n", + name, units, units); + + read_tot = lprocfs_oh_sum_pcpu(read); + write_tot = lprocfs_oh_sum_pcpu(write); + + if (!read_tot && !write_tot) + return; + + for (i = 0; i < OBD_HIST_MAX; i++) { + r = lprocfs_oh_counter_pcpu(read, i); + w = lprocfs_oh_counter_pcpu(write, i); + read_cum += r; + write_cum += w; + if (read_cum == 0 && 
write_cum == 0) + continue; + + if (!scale) + seq_printf(seq, "%u", i); + else if (i < 10) + seq_printf(seq, "%lu", BIT(i)); + else if (i < 20) + seq_printf(seq, "%luK", BIT(i - 10)); + else + seq_printf(seq, "%luM", BIT(i - 20)); + + seq_printf(seq, ":\t\t%10lu %3u %3u | %4lu %3u %3u\n", + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + + if (read_cum == read_tot && write_cum == write_tot) + break; + } +} + +static const struct brw_stats_props brw_props[] = { + { .bsp_name = "pages per bulk r/w", + .bsp_units = "rpcs", + .bsp_scale = true }, + { .bsp_name = "discontiguous pages", + .bsp_units = "rpcs", + .bsp_scale = false }, + { .bsp_name = "discontiguous blocks", + .bsp_units = "rpcs", + .bsp_scale = false }, + { .bsp_name = "disk fragmented I/Os", + .bsp_units = "ios", + .bsp_scale = false }, + { .bsp_name = "disk I/Os in flight", + .bsp_units = "ios", + .bsp_scale = false }, + { .bsp_name = "I/O time (1/1000s)", + .bsp_units = "ios", + .bsp_scale = true }, + { .bsp_name = "disk I/O size", + .bsp_units = "ios", + .bsp_scale = true }, +}; + +static int brw_stats_seq_show(struct seq_file *seq, void *v) +{ + struct brw_stats *brw_stats = seq->private; + int i; + + /* this sampling races with updates */ + lprocfs_stats_header(seq, ktime_get_real(), brw_stats->bs_init, 25, + ":", true, ""); + + for (i = 0; i < ARRAY_SIZE(brw_stats->bs_props); i++) { + if (!brw_stats->bs_props[i].bsp_name) + continue; + + display_brw_stats(seq, brw_stats->bs_props[i].bsp_name, + brw_stats->bs_props[i].bsp_units, + &brw_stats->bs_hist[i * 2], + &brw_stats->bs_hist[i * 2 + 1], + brw_stats->bs_props[i].bsp_scale); + } + + return 0; +} + +static ssize_t brw_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct brw_stats *brw_stats = seq->private; + int i; + + for (i = 0; i < BRW_RW_STATS_NUM; i++) + lprocfs_oh_clear_pcpu(&brw_stats->bs_hist[i]); + brw_stats->bs_init = ktime_get_real(); + + return len; +} + +LDEBUGFS_SEQ_FOPS(brw_stats); + +int lprocfs_init_brw_stats(struct brw_stats *brw_stats) +{ + int i, result; + + for (i = 0; i < BRW_RW_STATS_NUM; i++) { + result = lprocfs_oh_alloc_pcpu(&brw_stats->bs_hist[i]); + if (result) + break; + } + + return result; +} +EXPORT_SYMBOL(lprocfs_init_brw_stats); + +void lprocfs_fini_brw_stats(struct brw_stats *brw_stats) +{ + int i; + + for (i = 0; i < BRW_RW_STATS_NUM; i++) + lprocfs_oh_release_pcpu(&brw_stats->bs_hist[i]); +} +EXPORT_SYMBOL(lprocfs_fini_brw_stats); + +void ldebugfs_register_osd_stats(struct dentry *parent, + struct brw_stats *brw_stats, + struct lprocfs_stats *stats) +{ + int i; + + LASSERT(brw_stats); + brw_stats->bs_init = ktime_get_real(); + for (i = 0; i < BRW_RW_STATS_NUM; i++) { + struct brw_stats_props *props = brw_stats->bs_props; + + if (i % 2) { + props[i / 2].bsp_name = brw_props[i / 2].bsp_name; + props[i / 2].bsp_units = brw_props[i / 2].bsp_units; + props[i / 2].bsp_scale = brw_props[i / 2].bsp_scale; + } + } + + if (!parent) + return; + + debugfs_create_file("brw_stats", 0644, parent, brw_stats, + &brw_stats_fops); + + if (stats) + debugfs_create_file("stats", 0644, parent, stats, + &ldebugfs_stats_seq_fops); +} +EXPORT_SYMBOL(ldebugfs_register_osd_stats); + +int lprocfs_hash_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + + if (obd == NULL) + return 0; + + /* header for rhashtable state */ + seq_printf(m, "%-*s cur min max theta t-min t-max flags rehash count 
maxdep distribution\n", + HASH_NAME_LEN, "name"); + ldebugfs_rhash_seq_show("UUID_HASH", &obd->obd_uuid_hash, m); + ldebugfs_rhash_seq_show("NID_HASH", &obd->obd_nid_hash.ht, m); + + cfs_hash_debug_header(m); + cfs_hash_debug_str(obd->obd_nid_stats_hash, m); + return 0; +} +EXPORT_SYMBOL(lprocfs_hash_seq_show); + +int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + struct target_distribute_txn_data *tdtd; + + LASSERT(obd != NULL); + + seq_printf(m, "status: "); + if (atomic_read(&obd->obd_max_recoverable_clients) == 0) { + seq_printf(m, "INACTIVE\n"); + goto out; + } + + /* There is gap between client data read from storage and setting + * obd_recovering so check obd_recovery_end as well to make sure + * recovery is really finished + */ + if (obd->obd_recovery_end > 0 && !obd->obd_recovering) { + seq_printf(m, "COMPLETE\n"); + seq_printf(m, "recovery_start: %lld\n", + (s64)ktime_get_real_seconds() - + (ktime_get_seconds() - obd->obd_recovery_start)); + seq_printf(m, "recovery_duration: %lld\n", + obd->obd_recovery_end ? + obd->obd_recovery_end - obd->obd_recovery_start : + ktime_get_seconds() - obd->obd_recovery_start); + /* Number of clients that have completed recovery */ + seq_printf(m, "completed_clients: %d/%d\n", + atomic_read(&obd->obd_max_recoverable_clients) - + obd->obd_stale_clients, + atomic_read(&obd->obd_max_recoverable_clients)); + seq_printf(m, "replayed_requests: %d\n", + obd->obd_replayed_requests); + seq_printf(m, "last_transno: %lld\n", + obd->obd_next_recovery_transno - 1); + seq_printf(m, "VBR: %s\n", obd->obd_version_recov ? + "ENABLED" : "DISABLED"); + seq_printf(m, "IR: %s\n", obd->obd_no_ir ? + "DISABLED" : "ENABLED"); + goto out; + } + + tdtd = obd->u.obt.obt_lut->lut_tdtd; + if (tdtd && tdtd->tdtd_show_update_logs_retrievers) { + char *buf; + int size = 0; + int count = 0; + + buf = tdtd->tdtd_show_update_logs_retrievers( + tdtd->tdtd_show_retrievers_cbdata, + &size, &count); + if (count > 0) { + seq_printf(m, "WAITING\n"); + seq_printf(m, "non-ready MDTs: %s\n", + buf ? buf : "unknown (not enough RAM)"); + seq_printf(m, "recovery_start: %lld\n", + (s64)ktime_get_real_seconds() - + (ktime_get_seconds() - + obd->obd_recovery_start)); + seq_printf(m, "time_waited: %lld\n", + (s64)(ktime_get_seconds() - + obd->obd_recovery_start)); + } + + if (buf != NULL) + OBD_FREE(buf, size); + + if (likely(count > 0)) + goto out; + } + + /* recovery won't start until the clients connect */ + if (obd->obd_recovery_start == 0) { + seq_printf(m, "WAITING_FOR_CLIENTS\n"); + goto out; + } + + seq_printf(m, "RECOVERING\n"); + seq_printf(m, "recovery_start: %lld\n", (s64)ktime_get_real_seconds() - + (ktime_get_seconds() - obd->obd_recovery_start)); + seq_printf(m, "time_remaining: %lld\n", + ktime_get_seconds() >= + obd->obd_recovery_start + + obd->obd_recovery_timeout ? 
0 : + (s64)(obd->obd_recovery_start + + obd->obd_recovery_timeout - + ktime_get_seconds())); + seq_printf(m, "connected_clients: %d/%d\n", + atomic_read(&obd->obd_connected_clients), + atomic_read(&obd->obd_max_recoverable_clients)); + /* Number of clients that have completed recovery */ + seq_printf(m, "req_replay_clients: %d\n", + atomic_read(&obd->obd_req_replay_clients)); + seq_printf(m, "lock_repay_clients: %d\n", + atomic_read(&obd->obd_lock_replay_clients)); + seq_printf(m, "completed_clients: %d\n", + atomic_read(&obd->obd_connected_clients) - + atomic_read(&obd->obd_lock_replay_clients)); + seq_printf(m, "evicted_clients: %d\n", obd->obd_stale_clients); + seq_printf(m, "replayed_requests: %d\n", obd->obd_replayed_requests); + seq_printf(m, "queued_requests: %d\n", + obd->obd_requests_queued_for_recovery); + seq_printf(m, "next_transno: %lld\n", + obd->obd_next_recovery_transno); +out: + return 0; +} +EXPORT_SYMBOL(lprocfs_recovery_status_seq_show); + +ssize_t ir_factor_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", obd->obd_recovery_ir_factor); +} +EXPORT_SYMBOL(ir_factor_show); + +ssize_t ir_factor_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + int val; + int rc; + + rc = kstrtoint(buffer, 10, &val); + if (rc) + return rc; + + if (val < OBD_IR_FACTOR_MIN || val > OBD_IR_FACTOR_MAX) + return -EINVAL; + + obd->obd_recovery_ir_factor = val; + return count; +} +EXPORT_SYMBOL(ir_factor_store); + +int lprocfs_checksum_dump_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + + LASSERT(obd != NULL); + seq_printf(m, "%d\n", obd->obd_checksum_dump); + return 0; +} +EXPORT_SYMBOL(lprocfs_checksum_dump_seq_show); + +ssize_t +lprocfs_checksum_dump_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + bool val; + int rc; + + LASSERT(obd != NULL); + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + obd->obd_checksum_dump = val; + return count; +} +EXPORT_SYMBOL(lprocfs_checksum_dump_seq_write); + +ssize_t recovery_time_soft_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", obd->obd_recovery_timeout); +} +EXPORT_SYMBOL(recovery_time_soft_show); + +ssize_t recovery_time_soft_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + obd->obd_recovery_timeout = val; + return count; +} +EXPORT_SYMBOL(recovery_time_soft_store); + +ssize_t recovery_time_hard_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", obd->obd_recovery_time_hard); +} +EXPORT_SYMBOL(recovery_time_hard_show); + +ssize_t recovery_time_hard_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, 
+ obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + obd->obd_recovery_time_hard = val; + return count; +} +EXPORT_SYMBOL(recovery_time_hard_store); + +ssize_t instance_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_device_target *target = &obd->u.obt; + + LASSERT(target->obt_magic == OBT_MAGIC); + return scnprintf(buf, PAGE_SIZE, "%u\n", obd->u.obt.obt_instance); +} +EXPORT_SYMBOL(instance_show); + +#endif /* CONFIG_PROC_FS*/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c new file mode 100644 index 0000000000000..c581211098acf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c @@ -0,0 +1,2597 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lu_object.c + * + * Lustre Object. + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct lu_site_bkt_data { + /** + * LRU list, updated on each access to object. Protected by + * lsb_waitq.lock. + * + * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are + * moved to the lu_site::ls_lru.prev + */ + struct list_head lsb_lru; + /** + * Wait-queue signaled when an object in this site is ultimately + * destroyed (lu_object_free()) or initialized (lu_object_start()). + * It is used by lu_object_find() to wait before re-trying when + * object in the process of destruction is found in the hash table; + * or wait object to be initialized by the allocator. + * + * \see htable_lookup(). 
+ */ + wait_queue_head_t lsb_waitq; +}; + +enum { + LU_CACHE_PERCENT_MAX = 50, + LU_CACHE_PERCENT_DEFAULT = 20 +}; + +#define LU_CACHE_NR_MAX_ADJUST 512 +#define LU_CACHE_NR_UNLIMITED -1 +#define LU_CACHE_NR_DEFAULT LU_CACHE_NR_UNLIMITED +/** This is set to roughly (20 * OSS_NTHRS_MAX) to prevent thrashing */ +#define LU_CACHE_NR_ZFS_LIMIT 10240 + +#define LU_CACHE_NR_MIN 4096 +#define LU_CACHE_NR_MAX 0x80000000UL + +/** + * Max 256 buckets, we don't want too many buckets because: + * - consume too much memory (currently max 16K) + * - avoid unbalanced LRU list + * With few cpus there is little gain from extra buckets, so + * we treat this as a maximum in lu_site_init(). + */ +#define LU_SITE_BKT_BITS 8 + +static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; +module_param(lu_cache_percent, int, 0644); +MODULE_PARM_DESC(lu_cache_percent, "Percentage of memory to be used as lu_object cache"); + +static long lu_cache_nr = LU_CACHE_NR_DEFAULT; +module_param(lu_cache_nr, long, 0644); +MODULE_PARM_DESC(lu_cache_nr, "Maximum number of objects in lu_object cache"); + +static void lu_object_free(const struct lu_env *env, struct lu_object *o); +static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx); + +static u32 lu_fid_hash(const void *data, u32 len, u32 seed) +{ + const struct lu_fid *fid = data; + + seed = cfs_hash_32(seed ^ fid->f_oid, 32); + seed ^= cfs_hash_64(fid->f_seq, 32); + return seed; +} + +static const struct rhashtable_params obj_hash_params = { + .key_len = sizeof(struct lu_fid), + .key_offset = offsetof(struct lu_object_header, loh_fid), + .head_offset = offsetof(struct lu_object_header, loh_hash), + .hashfn = lu_fid_hash, + .automatic_shrinking = true, +}; + +static inline int lu_bkt_hash(struct lu_site *s, const struct lu_fid *fid) +{ + return lu_fid_hash(fid, sizeof(*fid), s->ls_bkt_seed) & + (s->ls_bkt_cnt - 1); +} + +wait_queue_head_t * +lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid) +{ + struct lu_site_bkt_data *bkt; + + bkt = &site->ls_bkts[lu_bkt_hash(site, fid)]; + return &bkt->lsb_waitq; +} +EXPORT_SYMBOL(lu_site_wq_from_fid); + +/** + * Decrease reference counter on object. If last reference is freed, return + * object to the cache, unless lu_object_is_dying(o) holds. In the latter + * case, free object immediately. + */ +void lu_object_put(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *top = o->lo_header; + struct lu_site *site = o->lo_dev->ld_site; + struct lu_object *orig = o; + const struct lu_fid *fid = lu_object_fid(o); + + /* + * till we have full fids-on-OST implemented anonymous objects + * are possible in OSP. such an object isn't listed in the site + * so we should not remove it from the site. + */ + if (fid_is_zero(fid)) { + LASSERT(list_empty(&top->loh_lru)); + if (!atomic_dec_and_test(&top->loh_ref)) + return; + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + lu_object_free(env, orig); + return; + } + + bkt = &site->ls_bkts[lu_bkt_hash(site, &top->loh_fid)]; + if (atomic_add_unless(&top->loh_ref, -1, 1)) { +still_active: + /* + * At this point the object reference is dropped and lock is + * not taken, so lu_object should not be touched because it + * can be freed by concurrent thread. + * + * Somebody may be waiting for this, currently only used for + * cl_object, see cl_object_put_last(). 
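+	 *
+	 * We only get here when the reference being dropped was not the
+	 * last one: either atomic_add_unless() above saw a count greater
+	 * than one, or atomic_dec_and_test() below failed under the
+	 * bucket lock and jumped back here.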
+ */ + wake_up(&bkt->lsb_waitq); + + return; + } + + spin_lock(&bkt->lsb_waitq.lock); + if (!atomic_dec_and_test(&top->loh_ref)) { + spin_unlock(&bkt->lsb_waitq.lock); + goto still_active; + } + + /* + * Refcount is zero, and cannot be incremented without taking the bkt + * lock, so object is stable. + */ + + /* + * When last reference is released, iterate over object layers, and + * notify them that object is no longer busy. + */ + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + + /* + * Don't use local 'is_dying' here because if was taken without lock but + * here we need the latest actual value of it so check lu_object + * directly here. + */ + if (!lu_object_is_dying(top) && + (lu_object_exists(orig) || lu_object_is_cl(orig))) { + LASSERT(list_empty(&top->loh_lru)); + list_add_tail(&top->loh_lru, &bkt->lsb_lru); + spin_unlock(&bkt->lsb_waitq.lock); + percpu_counter_inc(&site->ls_lru_len_counter); + CDEBUG(D_INODE, "Add %p/%p to site lru. bkt: %p\n", + orig, top, bkt); + return; + } + + /* + * If object is dying (will not be cached) then remove it from hash + * table (it is already not on the LRU). + * + * This is done with bucket lock held. As the only way to acquire first + * reference to previously unreferenced object is through hash-table + * lookup (lu_object_find()) which takes the lock for first reference, + * no race with concurrent object lookup is possible and we can safely + * destroy object below. + */ + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) + rhashtable_remove_fast(&site->ls_obj_hash, &top->loh_hash, + obj_hash_params); + + spin_unlock(&bkt->lsb_waitq.lock); + /* Object was already removed from hash above, can kill it. */ + lu_object_free(env, orig); +} +EXPORT_SYMBOL(lu_object_put); + +/** + * Put object and don't keep in cache. This is temporary solution for + * multi-site objects when its layering is not constant. + */ +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o) +{ + set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags); + return lu_object_put(env, o); +} +EXPORT_SYMBOL(lu_object_put_nocache); + +/** + * Kill the object and take it out of LRU cache. + * Currently used by client code for layout change. + */ +void lu_object_unhash(const struct lu_env *env, struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags); + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) { + struct lu_site *site = o->lo_dev->ld_site; + struct rhashtable *obj_hash = &site->ls_obj_hash; + struct lu_site_bkt_data *bkt; + + bkt = &site->ls_bkts[lu_bkt_hash(site, &top->loh_fid)]; + spin_lock(&bkt->lsb_waitq.lock); + if (!list_empty(&top->loh_lru)) { + list_del_init(&top->loh_lru); + percpu_counter_dec(&site->ls_lru_len_counter); + } + spin_unlock(&bkt->lsb_waitq.lock); + + rhashtable_remove_fast(obj_hash, &top->loh_hash, + obj_hash_params); + } +} +EXPORT_SYMBOL(lu_object_unhash); + +/** + * Allocate new object. + * + * This follows object creation protocol, described in the comment within + * struct lu_device_operations definition. + */ +static struct lu_object *lu_object_alloc(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f) +{ + struct lu_object *top; + + /* + * Create top-level object slice. This will also create + * lu_object_header. 
+ */ + top = dev->ld_ops->ldo_object_alloc(env, NULL, dev); + if (top == NULL) + return ERR_PTR(-ENOMEM); + if (IS_ERR(top)) + return top; + /* + * This is the only place where object fid is assigned. It's constant + * after this point. + */ + top->lo_header->loh_fid = *f; + + return top; +} + +/** + * Initialize object. + * + * This is called after object hash insertion to avoid returning an object with + * stale attributes. + */ +static int lu_object_start(const struct lu_env *env, struct lu_device *dev, + struct lu_object *top, + const struct lu_object_conf *conf) +{ + struct lu_object *scan; + struct list_head *layers; + unsigned int init_mask = 0; + unsigned int init_flag; + int clean; + int result; + + layers = &top->lo_header->loh_layers; + + do { + /* + * Call ->loo_object_init() repeatedly, until no more new + * object slices are created. + */ + clean = 1; + init_flag = 1; + list_for_each_entry(scan, layers, lo_linkage) { + if (init_mask & init_flag) + goto next; + clean = 0; + scan->lo_header = top->lo_header; + result = scan->lo_ops->loo_object_init(env, scan, conf); + if (result) + return result; + + init_mask |= init_flag; +next: + init_flag <<= 1; + } + } while (!clean); + + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_start != NULL) { + result = scan->lo_ops->loo_object_start(env, scan); + if (result) + return result; + } + } + + lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); + + set_bit(LU_OBJECT_INITED, &top->lo_header->loh_flags); + + return 0; +} + +/** + * Free an object. + */ +static void lu_object_free(const struct lu_env *env, struct lu_object *o) +{ + wait_queue_head_t *wq; + struct lu_site *site; + struct lu_object *scan; + struct list_head *layers; + LIST_HEAD(splice); + + site = o->lo_dev->ld_site; + layers = &o->lo_header->loh_layers; + wq = lu_site_wq_from_fid(site, &o->lo_header->loh_fid); + /* + * First call ->loo_object_delete() method to release all resources. + */ + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_delete != NULL) + scan->lo_ops->loo_object_delete(env, scan); + } + + /* + * Then, splice object layers into stand-alone list, and call + * ->loo_object_free() on all layers to free memory. Splice is + * necessary, because lu_object_header is freed together with the + * top-level slice. + */ + list_splice_init(layers, &splice); + while (!list_empty(&splice)) { + /* + * Free layers in bottom-to-top order, so that object header + * lives as long as possible and ->loo_object_free() methods + * can look at its contents. + */ + o = container_of(splice.prev, struct lu_object, lo_linkage); + list_del_init(&o->lo_linkage); + LASSERT(o->lo_ops->loo_object_free != NULL); + o->lo_ops->loo_object_free(env, o); + } + + if (waitqueue_active(wq)) + wake_up(wq); +} + +/** + * Free \a nr objects from the cold end of the site LRU list. + * if canblock is 0, then don't block awaiting for another + * instance of lu_site_purge() to complete + */ +int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, + int nr, int canblock) +{ + struct lu_object_header *h; + struct lu_object_header *temp; + struct lu_site_bkt_data *bkt; + LIST_HEAD(dispose); + int did_sth; + unsigned int start = 0; + int count; + int bnr; + unsigned int i; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU)) + RETURN(0); + + /* + * Under LRU list lock, scan LRU list and move unreferenced objects to + * the dispose list, removing them from LRU and hash table. 
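+	 *
+	 * A \a nr of ~0 means "purge everything": scanning starts from
+	 * bucket zero and no per-bucket quota (bnr) is applied; otherwise
+	 * scanning resumes from ls_purge_start and at most \a nr objects
+	 * are freed. The return value is the number of objects still left
+	 * to purge.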
+ */ + if (nr != ~0) + start = s->ls_purge_start; + bnr = (nr == ~0) ? -1 : nr / s->ls_bkt_cnt + 1; +again: + /* + * It doesn't make any sense to make purge threads parallel, that can + * only bring troubles to us. See LU-5331. + */ + if (canblock != 0) + mutex_lock(&s->ls_purge_mutex); + else if (mutex_trylock(&s->ls_purge_mutex) == 0) + goto out; + + did_sth = 0; + for (i = start; i < s->ls_bkt_cnt ; i++) { + count = bnr; + bkt = &s->ls_bkts[i]; + spin_lock(&bkt->lsb_waitq.lock); + + list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) { + LASSERT(atomic_read(&h->loh_ref) == 0); + + LINVRNT(lu_bkt_hash(s, &h->loh_fid) == i); + + set_bit(LU_OBJECT_UNHASHED, &h->loh_flags); + rhashtable_remove_fast(&s->ls_obj_hash, &h->loh_hash, + obj_hash_params); + list_move(&h->loh_lru, &dispose); + percpu_counter_dec(&s->ls_lru_len_counter); + if (did_sth == 0) + did_sth = 1; + + if (nr != ~0 && --nr == 0) + break; + + if (count > 0 && --count == 0) + break; + + } + spin_unlock(&bkt->lsb_waitq.lock); + cond_resched(); + /* + * Free everything on the dispose list. This is safe against + * races due to the reasons described in lu_object_put(). + */ + while ((h = list_first_entry_or_null(&dispose, + struct lu_object_header, + loh_lru)) != NULL) { + list_del_init(&h->loh_lru); + lu_object_free(env, lu_object_top(h)); + lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED); + } + + if (nr == 0) + break; + } + mutex_unlock(&s->ls_purge_mutex); + + if (nr != 0 && did_sth && start != 0) { + start = 0; /* restart from the first bucket */ + goto again; + } + /* race on s->ls_purge_start, but nobody cares */ + s->ls_purge_start = i & (s->ls_bkt_cnt - 1); +out: + return nr; +} +EXPORT_SYMBOL(lu_site_purge_objects); + +/* + * Object printing. + * + * Code below has to jump through certain loops to output object description + * into libcfs_debug_msg-based log. The problem is that lu_object_print() + * composes object description from strings that are parts of _lines_ of + * output (i.e., strings that are not terminated by newline). This doesn't fit + * very well into libcfs_debug_msg() interface that assumes that each message + * supplied to it is a self-contained output line. + * + * To work around this, strings are collected in a temporary buffer + * (implemented as a value of lu_cdebug_key key), until terminating newline + * character is detected. + * + */ + +enum { + /** + * Maximal line size. + * + * XXX overflow is not handled correctly. + */ + LU_CDEBUG_LINE = 512 +}; + +struct lu_cdebug_data { + /** + * Temporary buffer. + */ + char lck_area[LU_CDEBUG_LINE]; +}; + +/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */ +LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data); + +/** + * Key, holding temporary buffer. This key is registered very early by + * lu_global_init(). + */ +static struct lu_context_key lu_global_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | + LCT_MG_THREAD | LCT_CL_THREAD | LCT_LOCAL, + .lct_init = lu_global_key_init, + .lct_fini = lu_global_key_fini +}; + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...) 
+{ + struct libcfs_debug_msg_data *msgdata = cookie; + struct lu_cdebug_data *key; + int used; + int complete; + va_list args; + + va_start(args, format); + + key = lu_context_key_get(&env->le_ctx, &lu_global_key); + LASSERT(key != NULL); + + used = strlen(key->lck_area); + complete = format[strlen(format) - 1] == '\n'; + /* + * Append new chunk to the buffer. + */ + vsnprintf(key->lck_area + used, + ARRAY_SIZE(key->lck_area) - used, format, args); + if (complete) { + if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys)) + libcfs_debug_msg(msgdata, "%s\n", key->lck_area); + key->lck_area[0] = 0; + } + va_end(args); + return 0; +} +EXPORT_SYMBOL(lu_cdebug_printer); + +/** + * Print object header. + */ +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr) +{ + (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]", + hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref), + PFID(&hdr->loh_fid), + test_bit(LU_OBJECT_UNHASHED, + &hdr->loh_flags) ? "" : " hash", + list_empty(&hdr->loh_lru) ? "" : " lru", + hdr->loh_attr & LOHA_EXISTS ? " exist" : ""); +} +EXPORT_SYMBOL(lu_object_header_print); + +/** + * Print human readable representation of the \a o to the \a printer. + */ +void lu_object_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o) +{ + static const char ruler[] = "........................................"; + struct lu_object_header *top; + int depth = 4; + + top = o->lo_header; + lu_object_header_print(env, cookie, printer, top); + (*printer)(env, cookie, "{\n"); + + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + /* + * print `.' \a depth times followed by type name and address + */ + (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler, + o->lo_dev->ld_type->ldt_name, o); + + if (o->lo_ops->loo_object_print != NULL) + (*o->lo_ops->loo_object_print)(env, cookie, printer, o); + + (*printer)(env, cookie, "\n"); + } + + (*printer)(env, cookie, "} header@%p\n", top); +} +EXPORT_SYMBOL(lu_object_print); + +/** + * Check object consistency. + */ +int lu_object_invariant(const struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_invariant != NULL && + !o->lo_ops->loo_object_invariant(o)) + return 0; + } + return 1; +} + +/* + * Limit the lu_object cache to a maximum of lu_cache_nr objects. Because the + * calculation for the number of objects to reclaim is not covered by a lock the + * maximum number of objects is capped by LU_CACHE_MAX_ADJUST. This ensures + * that many concurrent threads will not accidentally purge the entire cache. 
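+ * (The per-call cap referred to here is LU_CACHE_NR_MAX_ADJUST, defined
+ * above, which bounds how many objects a single call to
+ * lu_site_purge_objects() will be asked to free.)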
+ */ +static void lu_object_limit(const struct lu_env *env, + struct lu_device *dev) +{ + u64 size, nr; + + if (lu_cache_nr == LU_CACHE_NR_UNLIMITED) + return; + + size = atomic_read(&dev->ld_site->ls_obj_hash.nelems); + nr = (u64)lu_cache_nr; + if (size <= nr) + return; + + lu_site_purge_objects(env, dev->ld_site, + min_t(u64, size - nr, LU_CACHE_NR_MAX_ADJUST), + 0); +} + +static struct lu_object *htable_lookup(const struct lu_env *env, + struct lu_device *dev, + struct lu_site_bkt_data *bkt, + const struct lu_fid *f, + struct lu_object_header *new) +{ + struct lu_site *s = dev->ld_site; + struct lu_object_header *h; + +try_again: + rcu_read_lock(); + if (new) + h = rhashtable_lookup_get_insert_fast(&s->ls_obj_hash, + &new->loh_hash, + obj_hash_params); + else + h = rhashtable_lookup(&s->ls_obj_hash, f, obj_hash_params); + + if (IS_ERR_OR_NULL(h)) { + /* Not found */ + if (!new) + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS); + rcu_read_unlock(); + if (PTR_ERR(h) == -ENOMEM) { + msleep(20); + goto try_again; + } + lu_object_limit(env, dev); + if (PTR_ERR(h) == -E2BIG) + goto try_again; + + return ERR_PTR(-ENOENT); + } + + if (atomic_inc_not_zero(&h->loh_ref)) { + rcu_read_unlock(); + return lu_object_top(h); + } + + spin_lock(&bkt->lsb_waitq.lock); + if (lu_object_is_dying(h) || + test_bit(LU_OBJECT_UNHASHED, &h->loh_flags)) { + spin_unlock(&bkt->lsb_waitq.lock); + rcu_read_unlock(); + if (new) { + /* + * Old object might have already been removed, or will + * be soon. We need to insert our new object, so + * remove the old one just in case it is still there. + */ + rhashtable_remove_fast(&s->ls_obj_hash, &h->loh_hash, + obj_hash_params); + goto try_again; + } + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS); + return ERR_PTR(-ENOENT); + } + /* Now protected by spinlock */ + rcu_read_unlock(); + + if (!list_empty(&h->loh_lru)) { + list_del_init(&h->loh_lru); + percpu_counter_dec(&s->ls_lru_len_counter); + } + atomic_inc(&h->loh_ref); + spin_unlock(&bkt->lsb_waitq.lock); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT); + return lu_object_top(h); +} + +/** + * Search cache for an object with the fid \a f. If such object is found, + * return it. Otherwise, create new object, insert it into cache and return + * it. In any case, additional reference is acquired on the returned object. + */ +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf); +} +EXPORT_SYMBOL(lu_object_find); + +/* + * Get a 'first' reference to an object that was found while looking through the + * hash table. + */ +struct lu_object *lu_object_get_first(struct lu_object_header *h, + struct lu_device *dev) +{ + struct lu_site *s = dev->ld_site; + struct lu_object *ret; + + if (IS_ERR_OR_NULL(h) || lu_object_is_dying(h)) + return NULL; + + ret = lu_object_locate(h, dev->ld_type); + if (!ret) + return ret; + + if (!atomic_inc_not_zero(&h->loh_ref)) { + struct lu_site_bkt_data *bkt; + + bkt = &s->ls_bkts[lu_bkt_hash(s, &h->loh_fid)]; + spin_lock(&bkt->lsb_waitq.lock); + if (!lu_object_is_dying(h) && + !test_bit(LU_OBJECT_UNHASHED, &h->loh_flags)) + atomic_inc(&h->loh_ref); + else + ret = NULL; + spin_unlock(&bkt->lsb_waitq.lock); + } + return ret; +} +EXPORT_SYMBOL(lu_object_get_first); + +/** + * Core logic of lu_object_find*() functions. 
+ * + * Much like lu_object_find(), but top level device of object is specifically + * \a dev rather than top level device of the site. This interface allows + * objects of different "stacking" to be created within the same site. + */ +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *o; + struct lu_object *shadow; + struct lu_site *s; + struct lu_site_bkt_data *bkt; + struct rhashtable *hs; + int rc; + + ENTRY; + + /* FID is from disk or network, zero FID is meaningless, return error + * early to avoid assertion in lu_object_put. If a zero FID is wanted, + * it should be allocated via lu_object_anon(). + */ + if (fid_is_zero(f)) + RETURN(ERR_PTR(-EINVAL)); + + /* + * This uses standard index maintenance protocol: + * + * - search index under lock, and return object if found; + * - otherwise, unlock index, allocate new object; + * - lock index and search again; + * - if nothing is found (usual case), insert newly created + * object into index; + * - otherwise (race: other thread inserted object), free + * object just allocated. + * - unlock index; + * - return object. + * + * For "LOC_F_NEW" case, we are sure the object is new established. + * It is unnecessary to perform lookup-alloc-lookup-insert, instead, + * just alloc and insert directly. + * + */ + s = dev->ld_site; + hs = &s->ls_obj_hash; + + if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_OBD_ZERO_NLINK_RACE))) + lu_site_purge(env, s, -1); + + bkt = &s->ls_bkts[lu_bkt_hash(s, f)]; + if (!(conf && conf->loc_flags & LOC_F_NEW)) { + o = htable_lookup(env, dev, bkt, f, NULL); + + if (!IS_ERR(o)) { + if (likely(lu_object_is_inited(o->lo_header))) + RETURN(o); + + wait_event_idle(bkt->lsb_waitq, + lu_object_is_inited(o->lo_header) || + lu_object_is_dying(o->lo_header)); + + if (lu_object_is_dying(o->lo_header)) { + lu_object_put(env, o); + + RETURN(ERR_PTR(-ENOENT)); + } + + RETURN(o); + } + + if (PTR_ERR(o) != -ENOENT) + RETURN(o); + } + + /* + * Allocate new object, NB, object is unitialized in case object + * is changed between allocation and hash insertion, thus the object + * with stale attributes is returned. + */ + o = lu_object_alloc(env, dev, f); + if (IS_ERR(o)) + RETURN(o); + + LASSERT(lu_fid_eq(lu_object_fid(o), f)); + + CFS_RACE_WAIT(OBD_FAIL_OBD_ZERO_NLINK_RACE); + + if (conf && conf->loc_flags & LOC_F_NEW) { + int status = rhashtable_insert_fast(hs, &o->lo_header->loh_hash, + obj_hash_params); + if (status) + /* Strange error - go the slow way */ + shadow = htable_lookup(env, dev, bkt, f, o->lo_header); + else + shadow = ERR_PTR(-ENOENT); + } else { + shadow = htable_lookup(env, dev, bkt, f, o->lo_header); + } + if (likely(PTR_ERR(shadow) == -ENOENT)) { + /* + * The new object has been successfully inserted. + * + * This may result in rather complicated operations, including + * fld queries, inode loading, etc. 
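+		 *
+		 * Concurrent lookups for the same fid will find the freshly
+		 * inserted header and wait on bkt->lsb_waitq until
+		 * lu_object_start() marks it LU_OBJECT_INITED (or it is
+		 * found dying), hence the wake_up() below.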
+ */ + rc = lu_object_start(env, dev, o, conf); + if (rc) { + lu_object_put_nocache(env, o); + RETURN(ERR_PTR(rc)); + } + + wake_up(&bkt->lsb_waitq); + + lu_object_limit(env, dev); + + RETURN(o); + } + + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE); + lu_object_free(env, o); + + if (!(conf && conf->loc_flags & LOC_F_NEW) && + !IS_ERR(shadow) && + !lu_object_is_inited(shadow->lo_header)) { + wait_event_idle(bkt->lsb_waitq, + lu_object_is_inited(shadow->lo_header) || + lu_object_is_dying(shadow->lo_header)); + + if (lu_object_is_dying(shadow->lo_header)) { + lu_object_put(env, shadow); + + RETURN(ERR_PTR(-ENOENT)); + } + } + + RETURN(shadow); +} +EXPORT_SYMBOL(lu_object_find_at); + +/** + * Find object with given fid, and return its slice belonging to given device. + */ +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *top; + struct lu_object *obj; + + top = lu_object_find(env, dev, f, conf); + if (IS_ERR(top)) + return top; + + obj = lu_object_locate(top->lo_header, dev->ld_type); + if (unlikely(obj == NULL)) { + lu_object_put(env, top); + obj = ERR_PTR(-ENOENT); + } + + return obj; +} +EXPORT_SYMBOL(lu_object_find_slice); + +int lu_device_type_init(struct lu_device_type *ldt) +{ + int result = 0; + + atomic_set(&ldt->ldt_device_nr, 0); + if (ldt->ldt_ops->ldto_init) + result = ldt->ldt_ops->ldto_init(ldt); + + return result; +} +EXPORT_SYMBOL(lu_device_type_init); + +void lu_device_type_fini(struct lu_device_type *ldt) +{ + if (ldt->ldt_ops->ldto_fini) + ldt->ldt_ops->ldto_fini(ldt); +} +EXPORT_SYMBOL(lu_device_type_fini); + +/** + * Global list of all sites on this node + */ +static LIST_HEAD(lu_sites); +static DECLARE_RWSEM(lu_sites_guard); + +/** + * Global environment used by site shrinker. + */ +static struct lu_env lu_shrink_env; + +struct lu_site_print_arg { + struct lu_env *lsp_env; + void *lsp_cookie; + lu_printer_t lsp_printer; +}; + +static void +lu_site_obj_print(struct lu_object_header *h, struct lu_site_print_arg *arg) +{ + if (!list_empty(&h->loh_layers)) { + const struct lu_object *o; + + o = lu_object_top(h); + lu_object_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, o); + } else { + lu_object_header_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, h); + } +} + +/** + * Print all objects in \a s. + */ +void lu_site_print(const struct lu_env *env, struct lu_site *s, atomic_t *ref, + int msg_flag, lu_printer_t printer) +{ + struct lu_site_print_arg arg = { + .lsp_env = (struct lu_env *)env, + .lsp_printer = printer, + }; + struct rhashtable_iter iter; + struct lu_object_header *h; + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, msg_flag, NULL); + + if (!s || !atomic_read(ref)) + return; + + arg.lsp_cookie = (void *)&msgdata; + + rhashtable_walk_enter(&s->ls_obj_hash, &iter); + rhashtable_walk_start(&iter); + while ((h = rhashtable_walk_next(&iter)) != NULL) { + if (IS_ERR(h)) + continue; + lu_site_obj_print(h, &arg); + } + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); +} +EXPORT_SYMBOL(lu_site_print); + +/** + * Return desired hash table order. + */ +static void lu_htable_limits(struct lu_device *top) +{ + unsigned long cache_size; + + /* + * For ZFS based OSDs the cache should be disabled by default. This + * allows the ZFS ARC maximum flexibility in determining what buffers + * to cache. If Lustre has objects or buffer which it wants to ensure + * always stay cached it must maintain a hold on them. 
+ */ + if (strcmp(top->ld_type->ldt_name, LUSTRE_OSD_ZFS_NAME) == 0) { + lu_cache_nr = LU_CACHE_NR_ZFS_LIMIT; + return; + } + + /* + * Calculate hash table size, assuming that we want reasonable + * performance when 20% of total memory is occupied by cache of + * lu_objects. + * + * Size of lu_object is (arbitrary) taken as 1K (together with inode). + */ + cache_size = cfs_totalram_pages(); + +#if BITS_PER_LONG == 32 + /* limit hashtable size for lowmem systems to low RAM */ + if (cache_size > 1 << (30 - PAGE_SHIFT)) + cache_size = 1 << (30 - PAGE_SHIFT) * 3 / 4; +#endif + + /* clear off unreasonable cache setting. */ + if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) { + CWARN("obdclass: invalid lu_cache_percent: %u, it must be in the range of (0, %u]. Will use default value: %u.\n", + lu_cache_percent, LU_CACHE_PERCENT_MAX, + LU_CACHE_PERCENT_DEFAULT); + + lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; + } + cache_size = cache_size / 100 * lu_cache_percent * + (PAGE_SIZE / 1024); + + lu_cache_nr = clamp_t(typeof(cache_size), cache_size, + LU_CACHE_NR_MIN, LU_CACHE_NR_MAX); +} + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + if (list_empty(&d->ld_linkage)) + list_add(&d->ld_linkage, &s->ls_ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_add_linkage); + +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + list_del_init(&d->ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_del_linkage); + +/** + * Initialize site \a s, with \a d as the top level device. + */ +int lu_site_init(struct lu_site *s, struct lu_device *top) +{ + struct lu_site_bkt_data *bkt; + unsigned int i; + int rc; + ENTRY; + + memset(s, 0, sizeof *s); + mutex_init(&s->ls_purge_mutex); + lu_htable_limits(top); + +#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + rc = percpu_counter_init(&s->ls_lru_len_counter, 0, GFP_NOFS); +#else + rc = percpu_counter_init(&s->ls_lru_len_counter, 0); +#endif + if (rc) + return -ENOMEM; + + if (rhashtable_init(&s->ls_obj_hash, &obj_hash_params) != 0) { + CERROR("failed to create lu_site hash\n"); + return -ENOMEM; + } + + s->ls_bkt_seed = get_random_u32(); + s->ls_bkt_cnt = max_t(long, 1 << LU_SITE_BKT_BITS, + 2 * num_possible_cpus()); + s->ls_bkt_cnt = roundup_pow_of_two(s->ls_bkt_cnt); + OBD_ALLOC_PTR_ARRAY_LARGE(s->ls_bkts, s->ls_bkt_cnt); + if (!s->ls_bkts) { + rhashtable_destroy(&s->ls_obj_hash); + s->ls_bkts = NULL; + return -ENOMEM; + } + + for (i = 0; i < s->ls_bkt_cnt; i++) { + bkt = &s->ls_bkts[i]; + INIT_LIST_HEAD(&bkt->lsb_lru); + init_waitqueue_head(&bkt->lsb_waitq); + } + + s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0); + if (s->ls_stats == NULL) { + OBD_FREE_PTR_ARRAY_LARGE(s->ls_bkts, s->ls_bkt_cnt); + s->ls_bkts = NULL; + rhashtable_destroy(&s->ls_obj_hash); + return -ENOMEM; + } + + lprocfs_counter_init(s->ls_stats, LU_SS_CREATED, + 0, "created", "created"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT, + 0, "cache_hit", "cache_hit"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS, + 0, "cache_miss", "cache_miss"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE, + 0, "cache_race", "cache_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE, + 0, "cache_death_race", "cache_death_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED, + 0, "lru_purged", "lru_purged"); + + INIT_LIST_HEAD(&s->ls_linkage); + s->ls_top_dev = top; + top->ld_site = s; + lu_device_get(top); + 
lu_ref_add(&top->ld_reference, "site-top", s); + + INIT_LIST_HEAD(&s->ls_ld_linkage); + spin_lock_init(&s->ls_ld_lock); + + lu_dev_add_linkage(s, top); + + RETURN(0); +} +EXPORT_SYMBOL(lu_site_init); + +/** + * Finalize \a s and release its resources. + */ +void lu_site_fini(struct lu_site *s) +{ + down_write(&lu_sites_guard); + list_del_init(&s->ls_linkage); + up_write(&lu_sites_guard); + + percpu_counter_destroy(&s->ls_lru_len_counter); + + if (s->ls_bkts) { + rhashtable_destroy(&s->ls_obj_hash); + OBD_FREE_PTR_ARRAY_LARGE(s->ls_bkts, s->ls_bkt_cnt); + s->ls_bkts = NULL; + } + + if (s->ls_top_dev != NULL) { + s->ls_top_dev->ld_site = NULL; + lu_ref_del(&s->ls_top_dev->ld_reference, "site-top", s); + lu_device_put(s->ls_top_dev); + s->ls_top_dev = NULL; + } + + if (s->ls_stats != NULL) + lprocfs_free_stats(&s->ls_stats); +} +EXPORT_SYMBOL(lu_site_fini); + +/** + * Called when initialization of stack for this site is completed. + */ +int lu_site_init_finish(struct lu_site *s) +{ + int result; + down_write(&lu_sites_guard); + result = lu_context_refill(&lu_shrink_env.le_ctx); + if (result == 0) + list_add(&s->ls_linkage, &lu_sites); + up_write(&lu_sites_guard); + return result; +} +EXPORT_SYMBOL(lu_site_init_finish); + +/** + * Acquire additional reference on device \a d + */ +void lu_device_get(struct lu_device *d) +{ + atomic_inc(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_get); + +/** + * Release reference on device \a d. + */ +void lu_device_put(struct lu_device *d) +{ + LASSERT(atomic_read(&d->ld_ref) > 0); + atomic_dec(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_put); + +enum { /* Maximal number of tld slots. */ + LU_CONTEXT_KEY_NR = 40 +}; +static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, }; +static DECLARE_RWSEM(lu_key_initing); + +/** + * Initialize device \a d of type \a t. + */ +int lu_device_init(struct lu_device *d, struct lu_device_type *t) +{ + if (atomic_add_unless(&t->ldt_device_nr, 1, 0) == 0) { + down_write(&lu_key_initing); + if (t->ldt_ops->ldto_start && + atomic_read(&t->ldt_device_nr) == 0) + t->ldt_ops->ldto_start(t); + atomic_inc(&t->ldt_device_nr); + up_write(&lu_key_initing); + } + + memset(d, 0, sizeof *d); + d->ld_type = t; + lu_ref_init(&d->ld_reference); + INIT_LIST_HEAD(&d->ld_linkage); + + return 0; +} +EXPORT_SYMBOL(lu_device_init); + +/** + * Finalize device \a d. + */ +void lu_device_fini(struct lu_device *d) +{ + struct lu_device_type *t = d->ld_type; + + if (d->ld_obd != NULL) { + d->ld_obd->obd_lu_dev = NULL; + d->ld_obd = NULL; + } + + lu_ref_fini(&d->ld_reference); + LASSERTF(atomic_read(&d->ld_ref) == 0, + "Refcount is %u\n", atomic_read(&d->ld_ref)); + LASSERT(atomic_read(&t->ldt_device_nr) > 0); + + if (atomic_dec_and_test(&t->ldt_device_nr) && + t->ldt_ops->ldto_stop != NULL) + t->ldt_ops->ldto_stop(t); +} +EXPORT_SYMBOL(lu_device_fini); + +/** + * Initialize object \a o that is part of compound object \a h and was created + * by device \a d. + */ +int lu_object_init(struct lu_object *o, struct lu_object_header *h, + struct lu_device *d) +{ + memset(o, 0, sizeof(*o)); + o->lo_header = h; + o->lo_dev = d; + lu_device_get(d); + lu_ref_add_at(&d->ld_reference, &o->lo_dev_ref, "lu_object", o); + INIT_LIST_HEAD(&o->lo_linkage); + + return 0; +} +EXPORT_SYMBOL(lu_object_init); + +/** + * Finalize object and release its resources. 
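+ *
+ * Dual to lu_object_init(): drops the lu_ref and the device reference
+ * taken there.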
+ */ +void lu_object_fini(struct lu_object *o) +{ + struct lu_device *dev = o->lo_dev; + + LASSERT(list_empty(&o->lo_linkage)); + + if (dev != NULL) { + lu_ref_del_at(&dev->ld_reference, &o->lo_dev_ref, + "lu_object", o); + lu_device_put(dev); + o->lo_dev = NULL; + } +} +EXPORT_SYMBOL(lu_object_fini); + +/** + * Add object \a o as first layer of compound object \a h + * + * This is typically called by the ->ldo_object_alloc() method of top-level + * device. + */ +void lu_object_add_top(struct lu_object_header *h, struct lu_object *o) +{ + list_move(&o->lo_linkage, &h->loh_layers); +} +EXPORT_SYMBOL(lu_object_add_top); + +/** + * Add object \a o as a layer of compound object, going after \a before. + * + * This is typically called by the ->ldo_object_alloc() method of \a + * before->lo_dev. + */ +void lu_object_add(struct lu_object *before, struct lu_object *o) +{ + list_move(&o->lo_linkage, &before->lo_linkage); +} +EXPORT_SYMBOL(lu_object_add); + +/** + * Initialize compound object. + */ +int lu_object_header_init(struct lu_object_header *h) +{ + memset(h, 0, sizeof *h); + atomic_set(&h->loh_ref, 1); + INIT_LIST_HEAD(&h->loh_lru); + INIT_LIST_HEAD(&h->loh_layers); + lu_ref_init(&h->loh_reference); + return 0; +} +EXPORT_SYMBOL(lu_object_header_init); + +/** + * Finalize compound object. + */ +void lu_object_header_fini(struct lu_object_header *h) +{ + LASSERT(list_empty(&h->loh_layers)); + LASSERT(list_empty(&h->loh_lru)); + lu_ref_fini(&h->loh_reference); +} +EXPORT_SYMBOL(lu_object_header_fini); + +/** + * Free lu_object_header with proper RCU handling + */ +void lu_object_header_free(struct lu_object_header *h) +{ + lu_object_header_fini(h); + OBD_FREE_PRE(h, sizeof(*h), "kfreed"); + kfree_rcu(h, loh_rcu); +} +EXPORT_SYMBOL(lu_object_header_free); + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype) +{ + struct lu_object *o; + + list_for_each_entry(o, &h->loh_layers, lo_linkage) { + if (o->lo_dev->ld_type == dtype) + return o; + } + return NULL; +} +EXPORT_SYMBOL(lu_object_locate); + +/** + * Finalize and free devices in the device stack. + * + * Finalize device stack by purging object cache, and calling + * lu_device_type_operations::ldto_device_fini() and + * lu_device_type_operations::ldto_device_free() on all devices in the stack. + */ +void lu_stack_fini(const struct lu_env *env, struct lu_device *top) +{ + struct lu_site *site = top->ld_site; + struct lu_device *scan; + struct lu_device *next; + + lu_site_purge(env, site, ~0); + for (scan = top; scan != NULL; scan = next) { + next = scan->ld_type->ldt_ops->ldto_device_fini(env, scan); + lu_ref_del(&scan->ld_reference, "lu-stack", &lu_site_init); + lu_device_put(scan); + } + + /* purge again. */ + lu_site_purge(env, site, ~0); + + for (scan = top; scan != NULL; scan = next) { + const struct lu_device_type *ldt = scan->ld_type; + + next = ldt->ldt_ops->ldto_device_free(env, scan); + } +} + +/** + * Global counter incremented whenever key is registered, unregistered, + * revived or quiesced. This is used to void unnecessary calls to + * lu_context_refill(). No locking is provided, as initialization and shutdown + * are supposed to be externally serialized. + */ +static atomic_t key_set_version = ATOMIC_INIT(0); + +/** + * Register new key. 
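+ *
+ * Minimal, purely illustrative sketch (the name "foo" and its payload
+ * type are hypothetical), following the pattern used for lu_global_key
+ * earlier in this file:
+ *
+ *   LU_KEY_INIT_FINI(foo, struct foo_thread_info);
+ *
+ *   static struct lu_context_key foo_thread_key = {
+ *           .lct_tags = LCT_MD_THREAD,
+ *           .lct_init = foo_key_init,
+ *           .lct_fini = foo_key_fini,
+ *   };
+ *
+ *   LU_CONTEXT_KEY_INIT(&foo_thread_key);
+ *   rc = lu_context_key_register(&foo_thread_key);
+ *   ...
+ *   info = lu_context_key_get(&env->le_ctx, &foo_thread_key);
+ */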
+ */ +int lu_context_key_register(struct lu_context_key *key) +{ + int result; + unsigned int i; + + LASSERT(key->lct_init != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(key->lct_tags != 0); + LASSERT(key->lct_owner != NULL); + + result = -ENFILE; + atomic_set(&key->lct_used, 1); + lu_ref_init(&key->lct_reference); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (lu_keys[i]) + continue; + key->lct_index = i; + + if (strncmp("osd_", module_name(key->lct_owner), 4) == 0) + CFS_RACE_WAIT(OBD_FAIL_OBD_SETUP); + + if (cmpxchg(&lu_keys[i], NULL, key) != NULL) + continue; + + result = 0; + atomic_inc(&key_set_version); + break; + } + if (result) { + lu_ref_fini(&key->lct_reference); + atomic_set(&key->lct_used, 0); + } + return result; +} +EXPORT_SYMBOL(lu_context_key_register); + +static void key_fini(struct lu_context *ctx, int index) +{ + if (ctx->lc_value != NULL && ctx->lc_value[index] != NULL) { + struct lu_context_key *key; + + key = lu_keys[index]; + LASSERT(key != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(atomic_read(&key->lct_used) > 0); + + key->lct_fini(ctx, key, ctx->lc_value[index]); + lu_ref_del(&key->lct_reference, "ctx", ctx); + if (atomic_dec_and_test(&key->lct_used)) + wake_up_var(&key->lct_used); + + LASSERT(key->lct_owner != NULL); + if ((ctx->lc_tags & LCT_NOREF) == 0) { + LINVRNT(module_refcount(key->lct_owner) > 0); + module_put(key->lct_owner); + } + ctx->lc_value[index] = NULL; + } +} + +/** + * Deregister key. + */ +void lu_context_key_degister(struct lu_context_key *key) +{ + LASSERT(atomic_read(&key->lct_used) >= 1); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + + lu_context_key_quiesce(NULL, key); + + key_fini(&lu_shrink_env.le_ctx, key->lct_index); + + /** + * Wait until all transient contexts referencing this key have + * run lu_context_key::lct_fini() method. + */ + atomic_dec(&key->lct_used); + wait_var_event(&key->lct_used, atomic_read(&key->lct_used) == 0); + + if (!WARN_ON(lu_keys[key->lct_index] == NULL)) + lu_ref_fini(&key->lct_reference); + + smp_store_release(&lu_keys[key->lct_index], NULL); +} +EXPORT_SYMBOL(lu_context_key_degister); + +/** + * Register a number of keys. This has to be called after all keys have been + * initialized by a call to LU_CONTEXT_KEY_INIT(). + */ +int lu_context_key_register_many(struct lu_context_key *k, ...) +{ + struct lu_context_key *key = k; + va_list args; + int result; + + va_start(args, k); + do { + result = lu_context_key_register(key); + if (result) + break; + key = va_arg(args, struct lu_context_key *); + } while (key != NULL); + va_end(args); + + if (result != 0) { + va_start(args, k); + while (k != key) { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key *); + } + va_end(args); + } + + return result; +} +EXPORT_SYMBOL(lu_context_key_register_many); + +/** + * De-register a number of keys. This is a dual to + * lu_context_key_register_many(). + */ +void lu_context_key_degister_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_degister_many); + +/** + * Revive a number of keys. + */ +void lu_context_key_revive_many(struct lu_context_key *k, ...) 
+{ + va_list args; + + va_start(args, k); + do { + lu_context_key_revive(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_revive_many); + +/** + * Quiescent a number of keys. + */ +void lu_context_key_quiesce_many(struct lu_device_type *t, + struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_quiesce(t, k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_quiesce_many); + +/** + * Return value associated with key \a key in context \a ctx. + */ +void *lu_context_key_get(const struct lu_context *ctx, + const struct lu_context_key *key) +{ + LINVRNT(ctx->lc_state == LCS_ENTERED); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + LASSERT(lu_keys[key->lct_index] == key); + return ctx->lc_value[key->lct_index]; +} +EXPORT_SYMBOL(lu_context_key_get); + +/** + * List of remembered contexts. XXX document me. + */ +static LIST_HEAD(lu_context_remembered); +static DEFINE_SPINLOCK(lu_context_remembered_guard); + +/** + * Destroy \a key in all remembered contexts. This is used to destroy key + * values in "shared" contexts (like service threads), when a module owning + * the key is about to be unloaded. + */ +void lu_context_key_quiesce(struct lu_device_type *t, + struct lu_context_key *key) +{ + struct lu_context *ctx; + + if (key->lct_tags & LCT_QUIESCENT) + return; + /* + * The write-lock on lu_key_initing will ensure that any + * keys_fill() which didn't see LCT_QUIESCENT will have + * finished before we call key_fini(). + */ + down_write(&lu_key_initing); + if (!(key->lct_tags & LCT_QUIESCENT)) { + if (t == NULL || atomic_read(&t->ldt_device_nr) == 0) + key->lct_tags |= LCT_QUIESCENT; + up_write(&lu_key_initing); + + spin_lock(&lu_context_remembered_guard); + list_for_each_entry(ctx, &lu_context_remembered, lc_remember) { + spin_until_cond(READ_ONCE(ctx->lc_state) != LCS_LEAVING); + key_fini(ctx, key->lct_index); + } + spin_unlock(&lu_context_remembered_guard); + + return; + } + up_write(&lu_key_initing); +} + +void lu_context_key_revive(struct lu_context_key *key) +{ + key->lct_tags &= ~LCT_QUIESCENT; + atomic_inc(&key_set_version); +} + +static void keys_fini(struct lu_context *ctx) +{ + unsigned int i; + + if (ctx->lc_value == NULL) + return; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) + key_fini(ctx, i); + + OBD_FREE_PTR_ARRAY(ctx->lc_value, ARRAY_SIZE(lu_keys)); + ctx->lc_value = NULL; +} + +static int keys_fill(struct lu_context *ctx) +{ + unsigned int i; + int rc = 0; + + /* + * A serialisation with lu_context_key_quiesce() is needed, to + * ensure we see LCT_QUIESCENT and don't allocate a new value + * after it freed one. The rwsem provides this. As down_read() + * does optimistic spinning while the writer is active, this is + * unlikely to ever sleep. + */ + down_read(&lu_key_initing); + ctx->lc_version = atomic_read(&key_set_version); + + LINVRNT(ctx->lc_value); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (!ctx->lc_value[i] && key && + (key->lct_tags & ctx->lc_tags) && + /* + * Don't create values for a LCT_QUIESCENT key, as this + * will pin module owning a key. 
+ */ + !(key->lct_tags & LCT_QUIESCENT)) { + void *value; + + LINVRNT(key->lct_init != NULL); + LINVRNT(key->lct_index == i); + + LASSERT(key->lct_owner != NULL); + if (!(ctx->lc_tags & LCT_NOREF) && + try_module_get(key->lct_owner) == 0) { + /* module is unloading, skip this key */ + continue; + } + + value = key->lct_init(ctx, key); + if (unlikely(IS_ERR(value))) { + rc = PTR_ERR(value); + break; + } + + lu_ref_add_atomic(&key->lct_reference, "ctx", ctx); + atomic_inc(&key->lct_used); + /* + * This is the only place in the code, where an + * element of ctx->lc_value[] array is set to non-NULL + * value. + */ + ctx->lc_value[i] = value; + if (key->lct_exit != NULL) + ctx->lc_tags |= LCT_HAS_EXIT; + } + } + + up_read(&lu_key_initing); + return rc; +} + +static int keys_init(struct lu_context *ctx) +{ + OBD_ALLOC_PTR_ARRAY(ctx->lc_value, ARRAY_SIZE(lu_keys)); + if (likely(ctx->lc_value != NULL)) + return keys_fill(ctx); + + return -ENOMEM; +} + +/** + * Initialize context data-structure. Create values for all keys. + */ +int lu_context_init(struct lu_context *ctx, __u32 tags) +{ + int rc; + + memset(ctx, 0, sizeof *ctx); + ctx->lc_state = LCS_INITIALIZED; + ctx->lc_tags = tags; + if (tags & LCT_REMEMBER) { + spin_lock(&lu_context_remembered_guard); + list_add(&ctx->lc_remember, &lu_context_remembered); + spin_unlock(&lu_context_remembered_guard); + } else { + INIT_LIST_HEAD(&ctx->lc_remember); + } + + rc = keys_init(ctx); + if (rc != 0) + lu_context_fini(ctx); + + return rc; +} +EXPORT_SYMBOL(lu_context_init); + +/** + * Finalize context data-structure. Destroy key values. + */ +void lu_context_fini(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_FINALIZED; + + if ((ctx->lc_tags & LCT_REMEMBER) == 0) { + LASSERT(list_empty(&ctx->lc_remember)); + } else { + /* could race with key degister */ + spin_lock(&lu_context_remembered_guard); + list_del_init(&ctx->lc_remember); + spin_unlock(&lu_context_remembered_guard); + } + keys_fini(ctx); +} +EXPORT_SYMBOL(lu_context_fini); + +/** + * Called before entering context. + */ +void lu_context_enter(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_ENTERED; +} +EXPORT_SYMBOL(lu_context_enter); + +/** + * Called after exiting from \a ctx + */ +void lu_context_exit(struct lu_context *ctx) +{ + unsigned int i; + + LINVRNT(ctx->lc_state == LCS_ENTERED); + /* + * Disable preempt to ensure we get a warning if + * any lct_exit ever tries to sleep. That would hurt + * lu_context_key_quiesce() which spins waiting for us. + * This also ensure we aren't preempted while the state + * is LCS_LEAVING, as that too would cause problems for + * lu_context_key_quiesce(). + */ + preempt_disable(); + /* + * Ensure lu_context_key_quiesce() sees LCS_LEAVING + * or we see LCT_QUIESCENT + */ + smp_store_mb(ctx->lc_state, LCS_LEAVING); + if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value) { + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (ctx->lc_value[i] && + !(key->lct_tags & LCT_QUIESCENT) && + key->lct_exit) + key->lct_exit(ctx, key, ctx->lc_value[i]); + } + } + + smp_store_release(&ctx->lc_state, LCS_LEFT); + preempt_enable(); +} +EXPORT_SYMBOL(lu_context_exit); + +/** + * Allocate for context all missing keys that were registered after context + * creation. key_set_version is only changed in rare cases when modules + * are loaded and removed. 
+ */ +int lu_context_refill(struct lu_context *ctx) +{ + if (likely(ctx->lc_version == atomic_read(&key_set_version))) + return 0; + + return keys_fill(ctx); +} + +/** + * lu_ctx_tags/lu_ses_tags will be updated if there are new types of + * obd being added. Currently, this is only used on client side, specifically + * for echo device client, for other stack (like ptlrpc threads), context are + * predefined when the lu_device type are registered, during the module probe + * phase. + */ +u32 lu_context_tags_default = LCT_CL_THREAD; +u32 lu_session_tags_default = LCT_SESSION; + +void lu_context_tags_update(__u32 tags) +{ + spin_lock(&lu_context_remembered_guard); + lu_context_tags_default |= tags; + atomic_inc(&key_set_version); + spin_unlock(&lu_context_remembered_guard); +} +EXPORT_SYMBOL(lu_context_tags_update); + +void lu_context_tags_clear(__u32 tags) +{ + spin_lock(&lu_context_remembered_guard); + lu_context_tags_default &= ~tags; + atomic_inc(&key_set_version); + spin_unlock(&lu_context_remembered_guard); +} +EXPORT_SYMBOL(lu_context_tags_clear); + +void lu_session_tags_update(__u32 tags) +{ + spin_lock(&lu_context_remembered_guard); + lu_session_tags_default |= tags; + atomic_inc(&key_set_version); + spin_unlock(&lu_context_remembered_guard); +} +EXPORT_SYMBOL(lu_session_tags_update); + +void lu_session_tags_clear(__u32 tags) +{ + spin_lock(&lu_context_remembered_guard); + lu_session_tags_default &= ~tags; + atomic_inc(&key_set_version); + spin_unlock(&lu_context_remembered_guard); +} +EXPORT_SYMBOL(lu_session_tags_clear); + +int lu_env_init(struct lu_env *env, __u32 tags) +{ + int result; + + env->le_ses = NULL; + result = lu_context_init(&env->le_ctx, tags); + if (likely(result == 0)) + lu_context_enter(&env->le_ctx); + return result; +} +EXPORT_SYMBOL(lu_env_init); + +void lu_env_fini(struct lu_env *env) +{ + lu_context_exit(&env->le_ctx); + lu_context_fini(&env->le_ctx); + env->le_ses = NULL; +} +EXPORT_SYMBOL(lu_env_fini); + +int lu_env_refill(struct lu_env *env) +{ + int result; + + result = lu_context_refill(&env->le_ctx); + if (result == 0 && env->le_ses != NULL) + result = lu_context_refill(env->le_ses); + return result; +} +EXPORT_SYMBOL(lu_env_refill); + +/** + * Currently, this API will only be used by echo client. + * Because echo client and normal lustre client will share + * same cl_env cache. So echo client needs to refresh + * the env context after it get one from the cache, especially + * when normal client and echo client co-exist in the same client. 
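+ *
+ * Illustrative call only; the tag values here are just the defaults
+ * defined earlier in this file, a real caller may pass others:
+ *
+ *   rc = lu_env_refill_by_tags(env, lu_context_tags_default,
+ *                              lu_session_tags_default);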
+ */ +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, + __u32 stags) +{ + int result; + + if ((env->le_ctx.lc_tags & ctags) != ctags) { + env->le_ctx.lc_version = 0; + env->le_ctx.lc_tags |= ctags; + } + + if (env->le_ses && (env->le_ses->lc_tags & stags) != stags) { + env->le_ses->lc_version = 0; + env->le_ses->lc_tags |= stags; + } + + result = lu_env_refill(env); + + return result; +} +EXPORT_SYMBOL(lu_env_refill_by_tags); + + +struct lu_env_item { + struct task_struct *lei_task; /* rhashtable key */ + struct rhash_head lei_linkage; + struct lu_env *lei_env; + struct rcu_head lei_rcu_head; +}; + +static const struct rhashtable_params lu_env_rhash_params = { + .key_len = sizeof(struct task_struct *), + .key_offset = offsetof(struct lu_env_item, lei_task), + .head_offset = offsetof(struct lu_env_item, lei_linkage), + }; + +struct rhashtable lu_env_rhash; + +struct lu_env_percpu { + struct task_struct *lep_task; + struct lu_env *lep_env ____cacheline_aligned_in_smp; +}; + +static struct lu_env_percpu lu_env_percpu[NR_CPUS]; + +int lu_env_add_task(struct lu_env *env, struct task_struct *task) +{ + struct lu_env_item *lei, *old; + + LASSERT(env); + + OBD_ALLOC_PTR(lei); + if (!lei) + return -ENOMEM; + + lei->lei_task = task; + lei->lei_env = env; + + old = rhashtable_lookup_get_insert_fast(&lu_env_rhash, + &lei->lei_linkage, + lu_env_rhash_params); + LASSERT(!old); + + return 0; +} +EXPORT_SYMBOL(lu_env_add_task); + +int lu_env_add(struct lu_env *env) +{ + return lu_env_add_task(env, current); +} +EXPORT_SYMBOL(lu_env_add); + +static void lu_env_item_free(struct rcu_head *head) +{ + struct lu_env_item *lei; + + lei = container_of(head, struct lu_env_item, lei_rcu_head); + OBD_FREE_PTR(lei); +} + +void lu_env_remove(struct lu_env *env) +{ + struct lu_env_item *lei; + const void *task = current; + int i; + + for_each_possible_cpu(i) { + if (lu_env_percpu[i].lep_env == env) { + LASSERT(lu_env_percpu[i].lep_task == task); + lu_env_percpu[i].lep_task = NULL; + lu_env_percpu[i].lep_env = NULL; + } + } + + /* The rcu_lock is not taking in this case since the key + * used is the actual task_struct. This implies that each + * object is only removed by the owning thread, so there + * can never be a race on a particular object. 
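+	 *
+	 * The per-cpu cache filled in by lu_env_find() was cleared in the
+	 * loop above, so no stale lep_env pointer can outlive the removal.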
+ */ + lei = rhashtable_lookup_fast(&lu_env_rhash, &task, + lu_env_rhash_params); + if (lei && rhashtable_remove_fast(&lu_env_rhash, &lei->lei_linkage, + lu_env_rhash_params) == 0) + call_rcu(&lei->lei_rcu_head, lu_env_item_free); +} +EXPORT_SYMBOL(lu_env_remove); + +struct lu_env *lu_env_find(void) +{ + struct lu_env *env = NULL; + struct lu_env_item *lei; + const void *task = current; + int i = get_cpu(); + + if (lu_env_percpu[i].lep_task == current) { + env = lu_env_percpu[i].lep_env; + put_cpu(); + LASSERT(env); + return env; + } + + lei = rhashtable_lookup_fast(&lu_env_rhash, &task, + lu_env_rhash_params); + if (lei) { + env = lei->lei_env; + lu_env_percpu[i].lep_task = current; + lu_env_percpu[i].lep_env = env; + } + put_cpu(); + + return env; +} +EXPORT_SYMBOL(lu_env_find); + +typedef struct lu_site_stats{ + unsigned lss_populated; + unsigned lss_max_search; + unsigned lss_total; + unsigned lss_busy; +} lu_site_stats_t; + +static void lu_site_stats_get(const struct lu_site *s, + lu_site_stats_t *stats) +{ + int cnt = atomic_read(&s->ls_obj_hash.nelems); + /* + * percpu_counter_sum_positive() won't accept a const pointer + * as it does modify the struct by taking a spinlock + */ + struct lu_site *s2 = (struct lu_site *)s; + + stats->lss_busy += cnt - + percpu_counter_sum_positive(&s2->ls_lru_len_counter); + + stats->lss_total += cnt; + stats->lss_max_search = 0; + stats->lss_populated = 0; +} + + +/* + * lu_cache_shrink_count() returns an approximate number of cached objects + * that can be freed by shrink_slab(). A counter, which tracks the + * number of items in the site's lru, is maintained in a percpu_counter + * for each site. The percpu values are incremented and decremented as + * objects are added or removed from the lru. The percpu values are summed + * and saved whenever a percpu value exceeds a threshold. Thus the saved, + * summed value at any given time may not accurately reflect the current + * lru length. But this value is sufficiently accurate for the needs of + * a shrinker. + * + * Using a per cpu counter is a compromise solution to concurrent access: + * lu_object_put() can update the counter without locking the site and + * lu_cache_shrink_count can sum the counters without locking each + * ls_obj_hash bucket. + */ +static unsigned long lu_cache_shrink_count(struct shrinker *sk, + struct shrink_control *sc) +{ + struct lu_site *s; + struct lu_site *tmp; + unsigned long cached = 0; + + if (!(sc->gfp_mask & __GFP_FS)) + return 0; + + down_read(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) + cached += percpu_counter_read_positive(&s->ls_lru_len_counter); + up_read(&lu_sites_guard); + + cached = (cached / 100) * sysctl_vfs_cache_pressure; + CDEBUG(D_INODE, "%ld objects cached, cache pressure %d\n", + cached, sysctl_vfs_cache_pressure); + + return cached; +} + +static unsigned long lu_cache_shrink_scan(struct shrinker *sk, + struct shrink_control *sc) +{ + struct lu_site *s; + struct lu_site *tmp; + unsigned long remain = sc->nr_to_scan; + LIST_HEAD(splice); + + if (!(sc->gfp_mask & __GFP_FS)) + /* We must not take the lu_sites_guard lock when + * __GFP_FS is *not* set because of the deadlock + * possibility detailed above. Additionally, + * since we cannot determine the number of + * objects in the cache without taking this + * lock, we're in a particularly tough spot. As + * a result, we'll just lie and say our cache is + * empty. This _should_ be ok, as we can't + * reclaim objects when __GFP_FS is *not* set + * anyways. 
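+		 *
+		 * (The lock-inversion scenario referred to here is spelled
+		 * out in full in the comment before lu_cache_shrink()
+		 * further below.)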
+ */ + return SHRINK_STOP; + + down_write(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { + remain = lu_site_purge(&lu_shrink_env, s, remain); + /* + * Move just shrunk site to the tail of site list to + * assure shrinking fairness. + */ + list_move_tail(&s->ls_linkage, &splice); + } + list_splice(&splice, lu_sites.prev); + up_write(&lu_sites_guard); + + return sc->nr_to_scan - remain; +} + +#ifdef HAVE_SHRINKER_COUNT +static struct shrinker lu_site_shrinker = { + .count_objects = lu_cache_shrink_count, + .scan_objects = lu_cache_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; + +#else +/* + * There exists a potential lock inversion deadlock scenario when using + * Lustre on top of ZFS. This occurs between one of ZFS's + * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially, + * thread A will take the lu_sites_guard lock and sleep on the ht_lock, + * while thread B will take the ht_lock and sleep on the lu_sites_guard + * lock. Obviously neither thread will wake and drop their respective hold + * on their lock. + * + * To prevent this from happening we must ensure the lu_sites_guard lock is + * not taken while down this code path. ZFS reliably does not set the + * __GFP_FS bit in its code paths, so this can be used to determine if it + * is safe to take the lu_sites_guard lock. + * + * Ideally we should accurately return the remaining number of cached + * objects without taking the lu_sites_guard lock, but this is not + * possible in the current implementation. + */ +static int lu_cache_shrink(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int cached = 0; + + CDEBUG(D_INODE, "Shrink %lu objects\n", sc->nr_to_scan); + + if (sc->nr_to_scan != 0) + lu_cache_shrink_scan(shrinker, sc); + + cached = lu_cache_shrink_count(shrinker, sc); + return cached; +} + +static struct shrinker lu_site_shrinker = { + .shrink = lu_cache_shrink, + .seeks = DEFAULT_SEEKS, +}; + +#endif /* HAVE_SHRINKER_COUNT */ + + +/* + * Debugging stuff. + */ + +/** + * Environment to be used in debugger, contains all tags. + */ +static struct lu_env lu_debugging_env; + +/** + * Debugging printer function using printk(). + */ +int lu_printk_printer(const struct lu_env *env, + void *unused, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vprintk(format, args); + va_end(args); + return 0; +} + +int lu_debugging_setup(void) +{ + return lu_env_init(&lu_debugging_env, ~0); +} + +void lu_context_keys_dump(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (key != NULL) { + CERROR("LU context keys [%d]: %p %x (%p,%p,%p) %d %d \"%s\"@%p\n", + i, key, key->lct_tags, + key->lct_init, key->lct_fini, key->lct_exit, + key->lct_index, atomic_read(&key->lct_used), + key->lct_owner ? key->lct_owner->name : "", + key->lct_owner); + lu_ref_print(&key->lct_reference); + } + } +} + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void) +{ + int result; + + CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys); + + result = lu_ref_global_init(); + if (result != 0) + return result; + + LU_CONTEXT_KEY_INIT(&lu_global_key); + result = lu_context_key_register(&lu_global_key); + if (result) + goto out_lu_ref; + + /* + * At this level, we don't know what tags are needed, so allocate them + * conservatively. This should not be too bad, because this + * environment is global. 
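+ *
+ * lu_sites_guard is held for write here because the same lock also
+ * serializes the shrinker's use of lu_shrink_env while it walks
+ * lu_sites, so init/fini of this environment cannot race with it.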
+ */ + down_write(&lu_sites_guard); + result = lu_env_init(&lu_shrink_env, LCT_SHRINKER); + up_write(&lu_sites_guard); + if (result) { + lu_context_key_degister(&lu_global_key); + goto out_lu_ref; + } + + /* + * seeks estimation: 3 seeks to read a record from oi, one to read + * inode, one for ea. Unfortunately setting this high value results in + * lu_object/inode cache consuming all the memory. + */ + result = register_shrinker(&lu_site_shrinker); + if (result) + goto out_env; + + result = rhashtable_init(&lu_env_rhash, &lu_env_rhash_params); + + if (result) + goto out_shrinker; + + return result; + +out_shrinker: + unregister_shrinker(&lu_site_shrinker); +out_env: + /* ordering here is explained in lu_global_fini() */ + lu_context_key_degister(&lu_global_key); + down_write(&lu_sites_guard); + lu_env_fini(&lu_shrink_env); + up_write(&lu_sites_guard); +out_lu_ref: + lu_ref_global_fini(); + return result; +} + +/** + * Dual to lu_global_init(). + */ +void lu_global_fini(void) +{ + unregister_shrinker(&lu_site_shrinker); + + lu_context_key_degister(&lu_global_key); + + /* + * Tear shrinker environment down _after_ de-registering + * lu_global_key, because the latter has a value in the former. + */ + down_write(&lu_sites_guard); + lu_env_fini(&lu_shrink_env); + up_write(&lu_sites_guard); + + rhashtable_destroy(&lu_env_rhash); + + lu_ref_global_fini(); +} + +static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx) +{ +#ifdef CONFIG_PROC_FS + struct lprocfs_counter ret; + + lprocfs_stats_collect(stats, idx, &ret); + return (__u32)ret.lc_count; +#else + return 0; +#endif +} + +/** + * Output site statistical counters into a buffer. Suitable for + * lprocfs_rd_*()-style functions. + */ +int lu_site_stats_seq_print(const struct lu_site *s, struct seq_file *m) +{ + const struct bucket_table *tbl; + lu_site_stats_t stats; + unsigned int chains; + + memset(&stats, 0, sizeof(stats)); + lu_site_stats_get(s, &stats); + + rcu_read_lock(); + tbl = rht_dereference_rcu(s->ls_obj_hash.tbl, + &((struct lu_site *)s)->ls_obj_hash); + chains = tbl->size; + rcu_read_unlock(); + seq_printf(m, "%d/%d %d/%u %d %d %d %d %d %d %d\n", + stats.lss_busy, + stats.lss_total, + stats.lss_populated, + chains, + stats.lss_max_search, + ls_stats_read(s->ls_stats, LU_SS_CREATED), + ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT), + ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS), + ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE), + ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE), + ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED)); + return 0; +} +EXPORT_SYMBOL(lu_site_stats_seq_print); + +/** + * Helper function to initialize a number of kmem slab caches at once. + */ +int lu_kmem_init(struct lu_kmem_descr *caches) +{ + int result; + struct lu_kmem_descr *iter = caches; + + for (result = 0; iter->ckd_cache != NULL; ++iter) { + *iter->ckd_cache = kmem_cache_create(iter->ckd_name, + iter->ckd_size, + 0, 0, NULL); + if (*iter->ckd_cache == NULL) { + result = -ENOMEM; + /* free all previously allocated caches */ + lu_kmem_fini(caches); + break; + } + } + return result; +} +EXPORT_SYMBOL(lu_kmem_init); + +/** + * Helper function to finalize a number of kmem slab cached at once. Dual to + * lu_kmem_init(). 
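+ *
+ * Both helpers walk a NULL-terminated descriptor table, e.g. (hypothetical
+ * names, shown only as a usage sketch):
+ *
+ *	static struct lu_kmem_descr foo_caches[] = {
+ *		{ .ckd_cache = &foo_kmem, .ckd_name = "foo_kmem",
+ *		  .ckd_size = sizeof(struct foo) },
+ *		{ .ckd_cache = NULL }
+ *	};
+ *
+ *	rc = lu_kmem_init(foo_caches);
+ *	...
+ *	lu_kmem_fini(foo_caches);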
+ */ +void lu_kmem_fini(struct lu_kmem_descr *caches) +{ + for (; caches->ckd_cache != NULL; ++caches) { + if (*caches->ckd_cache != NULL) { + kmem_cache_destroy(*caches->ckd_cache); + *caches->ckd_cache = NULL; + } + } +} +EXPORT_SYMBOL(lu_kmem_fini); + +/** + * Temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid) +{ + struct lu_site *s = o->lo_dev->ld_site; + struct lu_fid *old = &o->lo_header->loh_fid; + int rc; + + LASSERT(fid_is_zero(old)); + *old = *fid; +try_again: + rc = rhashtable_lookup_insert_fast(&s->ls_obj_hash, + &o->lo_header->loh_hash, + obj_hash_params); + /* supposed to be unique */ + LASSERT(rc != -EEXIST); + /* handle hash table resizing */ + if (rc == -ENOMEM || rc == -EBUSY) { + msleep(20); + goto try_again; + } + /* trim the hash if its growing to big */ + lu_object_limit(env, o->lo_dev); + if (rc == -E2BIG) + goto try_again; + + LASSERTF(rc == 0, "failed hashtable insertion: rc = %d\n", rc); +} +EXPORT_SYMBOL(lu_object_assign_fid); + +/** + * allocates object with 0 (non-assiged) fid + * XXX: temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf) +{ + struct lu_fid fid; + struct lu_object *o; + int rc; + + fid_zero(&fid); + o = lu_object_alloc(env, dev, &fid); + if (!IS_ERR(o)) { + rc = lu_object_start(env, dev, o, conf); + if (rc) { + lu_object_free(env, o); + return ERR_PTR(rc); + } + } + + return o; +} +EXPORT_SYMBOL(lu_object_anon); + +struct lu_buf LU_BUF_NULL = { + .lb_buf = NULL, + .lb_len = 0 +}; +EXPORT_SYMBOL(LU_BUF_NULL); + +void lu_buf_free(struct lu_buf *buf) +{ + LASSERT(buf); + if (buf->lb_buf) { + LASSERT(buf->lb_len > 0); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + buf->lb_buf = NULL; + buf->lb_len = 0; + } +} +EXPORT_SYMBOL(lu_buf_free); + +void lu_buf_alloc(struct lu_buf *buf, size_t size) +{ + LASSERT(buf); + LASSERT(buf->lb_buf == NULL); + LASSERT(buf->lb_len == 0); + OBD_ALLOC_LARGE(buf->lb_buf, size); + if (likely(buf->lb_buf)) + buf->lb_len = size; +} +EXPORT_SYMBOL(lu_buf_alloc); + +void lu_buf_realloc(struct lu_buf *buf, size_t size) +{ + lu_buf_free(buf); + lu_buf_alloc(buf, size); +} +EXPORT_SYMBOL(lu_buf_realloc); + +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, size_t len) +{ + if (buf->lb_buf == NULL && buf->lb_len == 0) + lu_buf_alloc(buf, len); + + if ((len > buf->lb_len) && (buf->lb_buf != NULL)) + lu_buf_realloc(buf, len); + + return buf; +} +EXPORT_SYMBOL(lu_buf_check_and_alloc); + +/** + * Increase the size of the \a buf. 
+ * preserves old data in buffer + * old buffer remains unchanged on error + * \retval 0 or -ENOMEM + */ +int lu_buf_check_and_grow(struct lu_buf *buf, size_t len) +{ + char *ptr; + + if (len <= buf->lb_len) + return 0; + + OBD_ALLOC_LARGE(ptr, len); + if (ptr == NULL) + return -ENOMEM; + + /* Free the old buf */ + if (buf->lb_buf != NULL) { + memcpy(ptr, buf->lb_buf, buf->lb_len); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + } + + buf->lb_buf = ptr; + buf->lb_len = len; + return 0; +} +EXPORT_SYMBOL(lu_buf_check_and_grow); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c new file mode 100644 index 0000000000000..bcc59fb3fc6c7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c @@ -0,0 +1,437 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lu_ref.c + * + * Lustre reference. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +#ifdef CONFIG_LUSTRE_DEBUG_LU_REF +/** + * Asserts a condition for a given lu_ref. Must be called with + * lu_ref::lf_guard held. + */ +#define REFASSERT(ref, expr) do { \ + struct lu_ref *__tmp = (ref); \ + \ + if (unlikely(!(expr))) { \ + lu_ref_print(__tmp); \ + spin_unlock(&__tmp->lf_guard); \ + lu_ref_print_all(); \ + LASSERT(0); \ + spin_lock(&__tmp->lf_guard); \ + } \ +} while (0) + +static struct kmem_cache *lu_ref_link_kmem; + +static struct lu_kmem_descr lu_ref_caches[] = { + { + .ckd_cache = &lu_ref_link_kmem, + .ckd_name = "lu_ref_link_kmem", + .ckd_size = sizeof(struct lu_ref_link) + }, + { + .ckd_cache = NULL + } +}; + +/** + * Global list of active (initialized, but not finalized) lu_ref's. + * + * Protected by lu_ref_refs_guard. 
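+ *
+ * lu_ref_marker below is not a real reference set; it is a sentinel that
+ * the debugfs seq_file iterator adds to this list to remember its position
+ * between reads (see lu_ref_seq_open() and lu_ref_seq_next()).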
+ */ +static LIST_HEAD(lu_ref_refs); +static DEFINE_SPINLOCK(lu_ref_refs_guard); +static struct lu_ref lu_ref_marker = { + .lf_guard = __SPIN_LOCK_UNLOCKED(lu_ref_marker.lf_guard), + .lf_list = LIST_HEAD_INIT(lu_ref_marker.lf_list), + .lf_linkage = LIST_HEAD_INIT(lu_ref_marker.lf_linkage) +}; + +void lu_ref_print(const struct lu_ref *ref) +{ + struct lu_ref_link *link; + + CERROR("lu_ref: %p %d %d %s:%d\n", + ref, ref->lf_refs, ref->lf_failed, ref->lf_func, ref->lf_line); + list_for_each_entry(link, &ref->lf_list, ll_linkage) { + CERROR(" link: %s %p\n", link->ll_scope, link->ll_source); + } +} + +static int lu_ref_is_marker(const struct lu_ref *ref) +{ + return ref == &lu_ref_marker; +} + +void lu_ref_print_all(void) +{ + struct lu_ref *ref; + + spin_lock(&lu_ref_refs_guard); + list_for_each_entry(ref, &lu_ref_refs, lf_linkage) { + if (lu_ref_is_marker(ref)) + continue; + + spin_lock(&ref->lf_guard); + lu_ref_print(ref); + spin_unlock(&ref->lf_guard); + } + spin_unlock(&lu_ref_refs_guard); +} + +void lu_ref_init_loc(struct lu_ref *ref, const char *func, const int line) +{ + ref->lf_refs = 0; + ref->lf_func = func; + ref->lf_line = line; + spin_lock_init(&ref->lf_guard); + INIT_LIST_HEAD(&ref->lf_list); + spin_lock(&lu_ref_refs_guard); + list_add(&ref->lf_linkage, &lu_ref_refs); + spin_unlock(&lu_ref_refs_guard); +} +EXPORT_SYMBOL(lu_ref_init_loc); + +void lu_ref_fini(struct lu_ref *ref) +{ + spin_lock(&ref->lf_guard); + REFASSERT(ref, list_empty(&ref->lf_list)); + REFASSERT(ref, ref->lf_refs == 0); + spin_unlock(&ref->lf_guard); + spin_lock(&lu_ref_refs_guard); + list_del_init(&ref->lf_linkage); + spin_unlock(&lu_ref_refs_guard); +} +EXPORT_SYMBOL(lu_ref_fini); + +static struct lu_ref_link *lu_ref_add_context(struct lu_ref *ref, + int flags, + const char *scope, + const void *source) +{ + struct lu_ref_link *link; + + link = NULL; + if (lu_ref_link_kmem) { + OBD_SLAB_ALLOC_PTR_GFP(link, lu_ref_link_kmem, flags); + if (link) { + link->ll_ref = ref; + link->ll_scope = scope; + link->ll_source = source; + spin_lock(&ref->lf_guard); + list_add_tail(&link->ll_linkage, &ref->lf_list); + ref->lf_refs++; + spin_unlock(&ref->lf_guard); + } + } + + if (!link) { + spin_lock(&ref->lf_guard); + ref->lf_failed++; + spin_unlock(&ref->lf_guard); + link = ERR_PTR(-ENOMEM); + } + + return link; +} + +void lu_ref_add(struct lu_ref *ref, const char *scope, const void *source) +{ + might_sleep(); + lu_ref_add_context(ref, GFP_NOFS, scope, source); +} +EXPORT_SYMBOL(lu_ref_add); + +void lu_ref_add_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ + link->ll_ref = ref; + link->ll_scope = scope; + link->ll_source = source; + spin_lock(&ref->lf_guard); + list_add_tail(&link->ll_linkage, &ref->lf_list); + ref->lf_refs++; + spin_unlock(&ref->lf_guard); +} +EXPORT_SYMBOL(lu_ref_add_at); + +/** + * Version of lu_ref_add() to be used in non-blockable contexts. + */ +void lu_ref_add_atomic(struct lu_ref *ref, const char *scope, + const void *source) +{ + lu_ref_add_context(ref, GFP_ATOMIC, scope, source); +} +EXPORT_SYMBOL(lu_ref_add_atomic); + +static inline int lu_ref_link_eq(const struct lu_ref_link *link, + const char *scope, + const void *source) +{ + return link->ll_source == source && !strcmp(link->ll_scope, scope); +} + +/** + * Maximal chain length seen so far. + */ +static unsigned int lu_ref_chain_max_length = 127; + +/** + * Searches for a lu_ref_link with given [scope, source] within given lu_ref. 
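+ *
+ * The search is a linear scan of lu_ref::lf_list; when a scan exceeds
+ * lu_ref_chain_max_length a warning is emitted and the threshold is raised
+ * to 1.5x the observed length so the same long chain does not warn on
+ * every lookup.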
+ */ +static struct lu_ref_link *lu_ref_find(struct lu_ref *ref, const char *scope, + const void *source) +{ + struct lu_ref_link *link; + unsigned int iterations; + + iterations = 0; + list_for_each_entry(link, &ref->lf_list, ll_linkage) { + ++iterations; + if (lu_ref_link_eq(link, scope, source)) { + if (iterations > lu_ref_chain_max_length) { + CWARN("Long lu_ref chain %d \"%s\":%p\n", + iterations, scope, source); + lu_ref_chain_max_length = iterations * 3 / 2; + } + return link; + } + } + return NULL; +} + +void lu_ref_del(struct lu_ref *ref, const char *scope, const void *source) +{ + struct lu_ref_link *link; + + spin_lock(&ref->lf_guard); + link = lu_ref_find(ref, scope, source); + if (link) { + list_del(&link->ll_linkage); + ref->lf_refs--; + spin_unlock(&ref->lf_guard); + OBD_SLAB_FREE(link, lu_ref_link_kmem, sizeof(*link)); + } else { + REFASSERT(ref, ref->lf_failed > 0); + ref->lf_failed--; + spin_unlock(&ref->lf_guard); + } +} +EXPORT_SYMBOL(lu_ref_del); + +void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, + const void *source0, const void *source1) +{ + spin_lock(&ref->lf_guard); + REFASSERT(ref, !IS_ERR_OR_NULL(link)); + REFASSERT(ref, link->ll_ref == ref); + REFASSERT(ref, lu_ref_link_eq(link, scope, source0)); + link->ll_source = source1; + spin_unlock(&ref->lf_guard); +} +EXPORT_SYMBOL(lu_ref_set_at); + +void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ + spin_lock(&ref->lf_guard); + REFASSERT(ref, !IS_ERR_OR_NULL(link)); + REFASSERT(ref, link->ll_ref == ref); + REFASSERT(ref, lu_ref_link_eq(link, scope, source)); + list_del(&link->ll_linkage); + ref->lf_refs--; + spin_unlock(&ref->lf_guard); +} +EXPORT_SYMBOL(lu_ref_del_at); + +static void *lu_ref_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct lu_ref *ref = seq->private; + + spin_lock(&lu_ref_refs_guard); + if (list_empty(&ref->lf_linkage)) + ref = NULL; + spin_unlock(&lu_ref_refs_guard); + + return ref; +} + +static void *lu_ref_seq_next(struct seq_file *seq, void *p, loff_t *pos) +{ + struct lu_ref *ref = p; + struct lu_ref *next; + + LASSERT(seq->private == p); + LASSERT(!list_empty(&ref->lf_linkage)); + + (*pos)++; + spin_lock(&lu_ref_refs_guard); + next = list_entry(ref->lf_linkage.next, struct lu_ref, lf_linkage); + if (&next->lf_linkage == &lu_ref_refs) + p = NULL; + else + list_move(&ref->lf_linkage, &next->lf_linkage); + spin_unlock(&lu_ref_refs_guard); + + return p; +} + +static void lu_ref_seq_stop(struct seq_file *seq, void *p) +{ + /* Nothing to do */ +} + + +static int lu_ref_seq_show(struct seq_file *seq, void *p) +{ + struct lu_ref *ref = p; + struct lu_ref *next; + + spin_lock(&lu_ref_refs_guard); + next = list_entry(ref->lf_linkage.next, struct lu_ref, lf_linkage); + if ((&next->lf_linkage == &lu_ref_refs) || lu_ref_is_marker(next)) { + spin_unlock(&lu_ref_refs_guard); + return 0; + } + + /* print the entry */ + spin_lock(&next->lf_guard); + seq_printf(seq, "lu_ref: %p %d %d %s:%d\n", + next, next->lf_refs, next->lf_failed, + next->lf_func, next->lf_line); + if (next->lf_refs > 64) { + seq_puts(seq, " too many references, skip\n"); + } else { + struct lu_ref_link *link; + int i = 0; + + list_for_each_entry(link, &next->lf_list, ll_linkage) + seq_printf(seq, " #%d link: %s %p\n", + i++, link->ll_scope, link->ll_source); + } + spin_unlock(&next->lf_guard); + spin_unlock(&lu_ref_refs_guard); + + return 0; +} + +static const struct seq_operations lu_ref_seq_ops = { + .start = lu_ref_seq_start, + .stop = 
lu_ref_seq_stop, + .next = lu_ref_seq_next, + .show = lu_ref_seq_show +}; + +static int lu_ref_seq_open(struct inode *inode, struct file *file) +{ + struct lu_ref *marker = &lu_ref_marker; + int result = 0; + + result = seq_open(file, &lu_ref_seq_ops); + if (result == 0) { + spin_lock(&lu_ref_refs_guard); + if (!list_empty(&marker->lf_linkage)) + result = -EAGAIN; + else + list_add(&marker->lf_linkage, &lu_ref_refs); + spin_unlock(&lu_ref_refs_guard); + + if (result == 0) { + struct seq_file *f = file->private_data; + + f->private = marker; + } else { + seq_release(inode, file); + } + } + + return result; +} + +static int lu_ref_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct lu_ref *ref = m->private; + + spin_lock(&lu_ref_refs_guard); + list_del_init(&ref->lf_linkage); + spin_unlock(&lu_ref_refs_guard); + + return seq_release(inode, file); +} + +static const struct file_operations lu_ref_dump_fops = { + .owner = THIS_MODULE, + .open = lu_ref_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lu_ref_seq_release +}; + +int lu_ref_global_init(void) +{ + int result; + + CDEBUG(D_CONSOLE, + "lu_ref tracking is enabled. Performance isn't.\n"); + + result = lu_kmem_init(lu_ref_caches); + if (result) + return result; + + debugfs_create_file("lu_refs", 0444, debugfs_lustre_root, + NULL, &lu_ref_dump_fops); + + return result; +} + +void lu_ref_global_fini(void) +{ + /* debugfs file gets cleaned up by debugfs_remove_recursive on + * debugfs_lustre_root + */ + lu_kmem_fini(lu_ref_caches); +} + +#endif /* CONFIG_LUSTRE_DEBUG_LU_REF */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_descs.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_descs.c new file mode 100644 index 0000000000000..f070169218b62 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_descs.c @@ -0,0 +1,687 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lu_tgt_descs.c + * + * Lustre target descriptions + * These are the only exported functions, they provide some generic + * infrastructure for target description management used by LOD/LMV + * + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include /* hash_long() */ +#include +#include +#include +#include +#include +#include + +/** + * lu_prandom_u64_max - returns a pseudo-random u64 number in interval + * [0, ep_ro) + * + * \param[in] ep_ro right open interval endpoint + * + * \retval a pseudo-random 64-bit number that is in interval [0, ep_ro). 
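+ * \retval 0 when \a ep_ro is 0 (the interval is empty)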
+ */ +u64 lu_prandom_u64_max(u64 ep_ro) +{ + u64 rand = 0; + + if (ep_ro) { +#ifdef HAVE_GET_RANDOM_U32_AND_U64 + rand = get_random_u64() % ep_ro; +#elif BITS_PER_LONG == 32 + /* + * If ep_ro > 32-bit, first generate the high + * 32 bits of the random number, then add in the low + * 32 bits (truncated to the upper limit, if needed) + */ + if (ep_ro > 0xffffffffULL) + rand = (u64)get_random_u32_below((u32)(ep_ro >> 32)) << 32; + + if (rand == (ep_ro & 0xffffffff00000000ULL)) + rand |= get_random_u32_below((u32)ep_ro); + else + rand |= get_random_u32(); +#else + rand = ((u64)get_random_u32() << 32 | get_random_u32()) % ep_ro; +#endif + } + + return rand; +} +EXPORT_SYMBOL(lu_prandom_u64_max); + +/** + * Add a new target to Quality of Service (QoS) target table. + * + * Add a new MDT/OST target to the structure representing an OSS. Resort the + * list of known MDSs/OSSs by the number of MDTs/OSTs attached to each MDS/OSS. + * The MDS/OSS list is protected internally and no external locking is required. + * + * \param[in] qos lu_qos data + * \param[in] tgt target description + * + * \retval 0 on success + * \retval -ENOMEM on error + */ +int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *tgt) +{ + struct lu_svr_qos *svr = NULL; + struct lu_svr_qos *tempsvr; + struct obd_export *exp = tgt->ltd_exp; + int found = 0; + __u32 id = 0; + int rc = 0; + + ENTRY; + + down_write(&qos->lq_rw_sem); + /* + * a bit hacky approach to learn NID of corresponding connection + * but there is no official API to access information like this + * with OSD API. + */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + if (obd_uuid_equals(&svr->lsq_uuid, + &exp->exp_connection->c_remote_uuid)) { + found++; + break; + } + if (svr->lsq_id > id) + id = svr->lsq_id; + } + + if (!found) { + OBD_ALLOC_PTR(svr); + if (!svr) + GOTO(out, rc = -ENOMEM); + memcpy(&svr->lsq_uuid, &exp->exp_connection->c_remote_uuid, + sizeof(svr->lsq_uuid)); + ++id; + svr->lsq_id = id; + } else { + /* Assume we have to move this one */ + list_del(&svr->lsq_svr_list); + } + + svr->lsq_tgt_count++; + tgt->ltd_qos.ltq_svr = svr; + + CDEBUG(D_OTHER, "add tgt %s to server %s (%d targets)\n", + obd_uuid2str(&tgt->ltd_uuid), obd_uuid2str(&svr->lsq_uuid), + svr->lsq_tgt_count); + + /* + * Add sorted by # of tgts. Find the first entry that we're + * bigger than... + */ + list_for_each_entry(tempsvr, &qos->lq_svr_list, lsq_svr_list) { + if (svr->lsq_tgt_count > tempsvr->lsq_tgt_count) + break; + } + /* + * ...and add before it. If we're the first or smallest, tempsvr + * points to the list head, and we add to the end. + */ + list_add_tail(&svr->lsq_svr_list, &tempsvr->lsq_svr_list); + + set_bit(LQ_DIRTY, &qos->lq_flags); +#ifdef HAVE_SERVER_SUPPORT + set_bit(LQ_DIRTY, &qos->lq_rr.lqr_flags); +#endif +out: + up_write(&qos->lq_rw_sem); + RETURN(rc); +} +EXPORT_SYMBOL(lu_qos_add_tgt); + +/** + * Remove MDT/OST target from QoS table. + * + * Removes given MDT/OST target from QoS table and releases related + * MDS/OSS structure if no target remain on the MDS/OSS. 
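+ * No external locking is required; lq_rw_sem is taken internally.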
+ * + * \param[in] qos lu_qos data + * \param[in] ltd target description + * + * \retval 0 on success + * \retval -ENOENT if no server was found + */ +static int lu_qos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd) +{ + struct lu_svr_qos *svr; + int rc = 0; + + ENTRY; + + down_write(&qos->lq_rw_sem); + svr = ltd->ltd_qos.ltq_svr; + if (!svr) + GOTO(out, rc = -ENOENT); + + svr->lsq_tgt_count--; + if (svr->lsq_tgt_count == 0) { + CDEBUG(D_OTHER, "removing server %s\n", + obd_uuid2str(&svr->lsq_uuid)); + list_del(&svr->lsq_svr_list); + ltd->ltd_qos.ltq_svr = NULL; + OBD_FREE_PTR(svr); + } + + set_bit(LQ_DIRTY, &qos->lq_flags); +#ifdef HAVE_SERVER_SUPPORT + set_bit(LQ_DIRTY, &qos->lq_rr.lqr_flags); +#endif +out: + up_write(&qos->lq_rw_sem); + RETURN(rc); +} + +static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt) +{ + struct obd_statfs *statfs = &tgt->ltd_statfs; + + return statfs->os_bavail * statfs->os_bsize; +} + +static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt) +{ + return tgt->ltd_statfs.os_ffree; +} + +/** + * Calculate weight for a given tgt. + * + * The final tgt weight is bavail >> 16 * iavail >> 8 minus the tgt and server + * penalties. See ltd_qos_penalties_calc() for how penalties are calculated. + * + * \param[in] tgt target descriptor + */ +void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt) +{ + struct lu_tgt_qos *ltq = &tgt->ltd_qos; + __u64 penalty; + + ltq->ltq_avail = (tgt_statfs_bavail(tgt) >> 16) * + (tgt_statfs_iavail(tgt) >> 8); + penalty = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty; + if (ltq->ltq_avail < penalty) + ltq->ltq_weight = 0; + else + ltq->ltq_weight = ltq->ltq_avail - penalty; +} +EXPORT_SYMBOL(lu_tgt_qos_weight_calc); + +/** + * Allocate and initialize target table. + * + * A helper function to initialize the target table and allocate + * a bitmap of the available targets. + * + * \param[in] ltd target's table to initialize + * \param[in] is_mdt target table for MDTs + * + * \retval 0 on success + * \retval negative negated errno on error + **/ +int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt) +{ + mutex_init(<d->ltd_mutex); + init_rwsem(<d->ltd_rw_sem); + + /* + * the tgt array and bitmap are allocated/grown dynamically as tgts are + * added to the LOD/LMV, see lu_tgt_descs_add() + */ + ltd->ltd_tgt_bitmap = bitmap_zalloc(BITS_PER_LONG, GFP_NOFS); + if (!ltd->ltd_tgt_bitmap) + return -ENOMEM; + + ltd->ltd_tgts_size = BITS_PER_LONG; + ltd->ltd_death_row = 0; + ltd->ltd_refcount = 0; + + /* Set up allocation policy (QoS and RR) */ + INIT_LIST_HEAD(<d->ltd_qos.lq_svr_list); + init_rwsem(<d->ltd_qos.lq_rw_sem); + set_bit(LQ_DIRTY, <d->ltd_qos.lq_flags); + set_bit(LQ_RESET, <d->ltd_qos.lq_flags); + ltd->ltd_is_mdt = is_mdt; + /* MDT imbalance threshold is low to balance across MDTs + * relatively quickly, because each directory may result + * in a large number of files/subdirs created therein. + */ + if (is_mdt) { + ltd->ltd_lmv_desc.ld_pattern = LMV_HASH_TYPE_DEFAULT; + ltd->ltd_qos.lq_prio_free = LMV_QOS_DEF_PRIO_FREE * 256 / 100; + ltd->ltd_qos.lq_threshold_rr = + LMV_QOS_DEF_THRESHOLD_RR_PCT * + QOS_THRESHOLD_MAX / 100; + } else { + ltd->ltd_qos.lq_prio_free = LOV_QOS_DEF_PRIO_FREE * 256 / 100; + ltd->ltd_qos.lq_threshold_rr = + LOV_QOS_DEF_THRESHOLD_RR_PCT * + QOS_THRESHOLD_MAX / 100; + } + + return 0; +} +EXPORT_SYMBOL(lu_tgt_descs_init); + +/** + * Free bitmap and target table pages. 
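+ * Any ltd_tgt_idx[] index blocks allocated by ltd_add_tgt() are released
+ * as well.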
+ * + * \param[in] ltd target table + */ +void lu_tgt_descs_fini(struct lu_tgt_descs *ltd) +{ + int i; + + bitmap_free(ltd->ltd_tgt_bitmap); + for (i = 0; i < ARRAY_SIZE(ltd->ltd_tgt_idx); i++) { + if (ltd->ltd_tgt_idx[i]) + OBD_FREE_PTR(ltd->ltd_tgt_idx[i]); + } + ltd->ltd_tgts_size = 0; +} +EXPORT_SYMBOL(lu_tgt_descs_fini); + +/** + * Expand size of target table. + * + * When the target table is full, we have to extend the table. To do so, + * we allocate new memory with some reserve, move data from the old table + * to the new one and release memory consumed by the old table. + * + * \param[in] ltd target table + * \param[in] newsize new size of the table + * + * \retval 0 on success + * \retval -ENOMEM if reallocation failed + */ +static int lu_tgt_descs_resize(struct lu_tgt_descs *ltd, __u32 newsize) +{ + unsigned long *new_bitmap, *old_bitmap = NULL; + + /* someone else has already resize the array */ + if (newsize <= ltd->ltd_tgts_size) + return 0; + + new_bitmap = bitmap_zalloc(newsize, GFP_NOFS); + if (!new_bitmap) + return -ENOMEM; + + if (ltd->ltd_tgts_size > 0) { + /* the bitmap already exists, copy data from old one */ + bitmap_copy(new_bitmap, ltd->ltd_tgt_bitmap, + ltd->ltd_tgts_size); + old_bitmap = ltd->ltd_tgt_bitmap; + } + + ltd->ltd_tgts_size = newsize; + ltd->ltd_tgt_bitmap = new_bitmap; + + bitmap_free(old_bitmap); + + CDEBUG(D_CONFIG, "tgt size: %d\n", ltd->ltd_tgts_size); + + return 0; +} + +/** + * Add new target to target table. + * + * Extend target table if it's full, update target table and bitmap. + * Notice we need to take ltd_rw_sem exclusively before entry to ensure + * atomic switch. + * + * \param[in] ltd target table + * \param[in] tgt new target desc + * + * \retval 0 on success + * \retval -ENOMEM if reallocation failed + * -EEXIST if target existed + */ +int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt) +{ + __u32 index = tgt->ltd_index; + int rc; + + ENTRY; + + if (index >= ltd->ltd_tgts_size) { + __u32 newsize = 1; + + if (index > TGT_PTRS * TGT_PTRS_PER_BLOCK) + RETURN(-ENFILE); + + while (newsize < index + 1) + newsize = newsize << 1; + + rc = lu_tgt_descs_resize(ltd, newsize); + if (rc) + RETURN(rc); + } else if (test_bit(index, ltd->ltd_tgt_bitmap)) { + RETURN(-EEXIST); + } + + if (ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK] == NULL) { + OBD_ALLOC_PTR(ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK]); + if (ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK] == NULL) + RETURN(-ENOMEM); + } + + LTD_TGT(ltd, tgt->ltd_index) = tgt; + set_bit(tgt->ltd_index, ltd->ltd_tgt_bitmap); + + ltd->ltd_lov_desc.ld_tgt_count++; + if (tgt->ltd_active) + ltd->ltd_lov_desc.ld_active_tgt_count++; + + RETURN(0); +} +EXPORT_SYMBOL(ltd_add_tgt); + +/** + * Delete target from target table + */ +void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt) +{ + lu_qos_del_tgt(<d->ltd_qos, tgt); + LTD_TGT(ltd, tgt->ltd_index) = NULL; + clear_bit(tgt->ltd_index, ltd->ltd_tgt_bitmap); + ltd->ltd_lov_desc.ld_tgt_count--; + if (tgt->ltd_active) + ltd->ltd_lov_desc.ld_active_tgt_count--; +} +EXPORT_SYMBOL(ltd_del_tgt); + +/** + * Calculate penalties per-tgt and per-server + * + * Re-calculate penalties when the configuration changes, active targets + * change and after statfs refresh (all these are reflected by lq_dirty flag). + * On every tgt and server: decay the penalty by half for every 8x the update + * interval that the device has been idle. 
That gives lots of time for the + * statfs information to be updated (which the penalty is only a proxy for), + * and avoids penalizing server/tgt under light load. + * See lu_qos_tgt_weight_calc() for how penalties are factored into the weight. + * + * \param[in] ltd lu_tgt_descs + * + * \retval 0 on success + * \retval -EAGAIN the number of tgt isn't enough or all tgt spaces are + * almost the same + */ +int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd) +{ + struct lu_qos *qos = <d->ltd_qos; + struct lov_desc *desc = <d->ltd_lov_desc; + struct lu_tgt_desc *tgt; + struct lu_svr_qos *svr; + __u64 ba_max, ba_min, ba; + __u64 ia_max, ia_min, ia = 1; + __u32 num_active; + int prio_wide; + time64_t now, age; + int rc; + + ENTRY; + + if (!test_bit(LQ_DIRTY, &qos->lq_flags)) + GOTO(out, rc = 0); + + num_active = desc->ld_active_tgt_count - 1; + if (num_active < 1) + GOTO(out, rc = -EAGAIN); + + /* find bavail on each server */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + svr->lsq_bavail = 0; + /* if inode is not counted, set to 1 to ignore */ + svr->lsq_iavail = ltd->ltd_is_mdt ? 0 : 1; + } + qos->lq_active_svr_count = 0; + + /* + * How badly user wants to select targets "widely" (not recently chosen + * and not on recent MDS's). As opposed to "freely" (free space avail.) + * 0-256 + */ + prio_wide = 256 - qos->lq_prio_free; + + ba_min = (__u64)(-1); + ba_max = 0; + ia_min = (__u64)(-1); + ia_max = 0; + now = ktime_get_real_seconds(); + + /* Calculate server penalty per object */ + ltd_foreach_tgt(ltd, tgt) { + if (!tgt->ltd_active) + continue; + + /* when inode is counted, bavail >> 16 to avoid overflow */ + ba = tgt_statfs_bavail(tgt); + if (ltd->ltd_is_mdt) + ba >>= 16; + else + ba >>= 8; + if (!ba) + continue; + + ba_min = min(ba, ba_min); + ba_max = max(ba, ba_max); + + /* Count the number of usable servers */ + if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0) + qos->lq_active_svr_count++; + tgt->ltd_qos.ltq_svr->lsq_bavail += ba; + + if (ltd->ltd_is_mdt) { + /* iavail >> 8 to avoid overflow */ + ia = tgt_statfs_iavail(tgt) >> 8; + if (!ia) + continue; + + ia_min = min(ia, ia_min); + ia_max = max(ia, ia_max); + + tgt->ltd_qos.ltq_svr->lsq_iavail += ia; + } + + /* + * per-tgt penalty is + * prio * bavail * iavail / (num_tgt - 1) / 2 + */ + tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia >> 8; + do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active); + tgt->ltd_qos.ltq_penalty_per_obj >>= 1; + + age = (now - tgt->ltd_qos.ltq_used) >> 3; + if (test_bit(LQ_RESET, &qos->lq_flags) || + age > 32 * desc->ld_qos_maxage) + tgt->ltd_qos.ltq_penalty = 0; + else if (age > desc->ld_qos_maxage) + /* Decay tgt penalty. 
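+			 * (halved once for each ld_qos_maxage interval
+			 * of the idle age computed above)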
*/ + tgt->ltd_qos.ltq_penalty >>= age / desc->ld_qos_maxage; + } + + num_active = qos->lq_active_svr_count - 1; + if (num_active < 1) { + /* + * If there's only 1 server, we can't penalize it, so instead + * we have to double the tgt penalty + */ + num_active = 1; + ltd_foreach_tgt(ltd, tgt) { + if (!tgt->ltd_active) + continue; + + tgt->ltd_qos.ltq_penalty_per_obj <<= 1; + } + } + + /* + * Per-server penalty is + * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2 + */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + ba = svr->lsq_bavail; + ia = svr->lsq_iavail; + svr->lsq_penalty_per_obj = prio_wide * ba * ia >> 8; + do_div(svr->lsq_penalty_per_obj, + svr->lsq_tgt_count * num_active); + svr->lsq_penalty_per_obj >>= 1; + + age = (now - svr->lsq_used) >> 3; + if (test_bit(LQ_RESET, &qos->lq_flags) || + age > 32 * desc->ld_qos_maxage) + svr->lsq_penalty = 0; + else if (age > desc->ld_qos_maxage) + /* Decay server penalty. */ + svr->lsq_penalty >>= age / desc->ld_qos_maxage; + } + + clear_bit(LQ_DIRTY, &qos->lq_flags); + clear_bit(LQ_RESET, &qos->lq_flags); + + /* + * If each tgt has almost same free space, do rr allocation for better + * creation performance + */ + clear_bit(LQ_SAME_SPACE, &qos->lq_flags); + if (((ba_max * (QOS_THRESHOLD_MAX - qos->lq_threshold_rr)) / + QOS_THRESHOLD_MAX) < ba_min && + ((ia_max * (QOS_THRESHOLD_MAX - qos->lq_threshold_rr)) / + QOS_THRESHOLD_MAX) < ia_min) { + set_bit(LQ_SAME_SPACE, &qos->lq_flags); + /* Reset weights for the next time we enter qos mode */ + set_bit(LQ_RESET, &qos->lq_flags); + } + rc = 0; + +out: + if (!rc && test_bit(LQ_SAME_SPACE, &qos->lq_flags)) + RETURN(-EAGAIN); + + RETURN(rc); +} +EXPORT_SYMBOL(ltd_qos_penalties_calc); + +/** + * Re-calculate penalties and weights of all tgts. + * + * The function is called when some target was used for a new object. In + * this case we should re-calculate all the weights to keep new allocations + * balanced well. + * + * \param[in] ltd lu_tgt_descs + * \param[in] tgt recently used tgt + * \param[out] total_wt new total weight for the pool + * + * \retval 0 + */ +int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt, + __u64 *total_wt) +{ + struct lu_qos *qos = <d->ltd_qos; + struct lu_tgt_qos *ltq; + struct lu_svr_qos *svr; + + ENTRY; + + ltq = &tgt->ltd_qos; + LASSERT(ltq); + + /* Don't allocate on this device anymore, until the next alloc_qos */ + ltq->ltq_usable = 0; + + svr = ltq->ltq_svr; + + /* + * Decay old penalty by half (we're adding max penalty, and don't + * want it to run away.) 
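+	 * The halved value then has the maximum per-object penalty (scaled
+	 * by the active tgt/server count) added back just below.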
+ */ + ltq->ltq_penalty >>= 1; + svr->lsq_penalty >>= 1; + + /* mark the server and tgt as recently used */ + ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds(); + + /* Set max penalties for this tgt and server */ + ltq->ltq_penalty += ltq->ltq_penalty_per_obj * + ltd->ltd_lov_desc.ld_active_tgt_count; + svr->lsq_penalty += svr->lsq_penalty_per_obj * + qos->lq_active_svr_count; + + /* Decrease all MDS penalties */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + if (svr->lsq_penalty < svr->lsq_penalty_per_obj) + svr->lsq_penalty = 0; + else + svr->lsq_penalty -= svr->lsq_penalty_per_obj; + } + + *total_wt = 0; + /* Decrease all tgt penalties */ + ltd_foreach_tgt(ltd, tgt) { + if (!tgt->ltd_active) + continue; + + ltq = &tgt->ltd_qos; + if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj) + ltq->ltq_penalty = 0; + else + ltq->ltq_penalty -= ltq->ltq_penalty_per_obj; + + lu_tgt_qos_weight_calc(tgt); + + /* Recalc the total weight of usable osts */ + if (ltq->ltq_usable) + *total_wt += ltq->ltq_weight; + + CDEBUG(D_OTHER, "recalc tgt %d usable=%d bavail=%llu ffree=%llu tgtppo=%llu tgtp=%llu svrppo=%llu svrp=%llu wt=%llu\n", + tgt->ltd_index, ltq->ltq_usable, + tgt_statfs_bavail(tgt) >> 16, + tgt_statfs_iavail(tgt) >> 8, + ltq->ltq_penalty_per_obj >> 10, + ltq->ltq_penalty >> 10, + ltq->ltq_svr->lsq_penalty_per_obj >> 10, + ltq->ltq_svr->lsq_penalty >> 10, + ltq->ltq_weight >> 10); + } + + RETURN(0); +} +EXPORT_SYMBOL(ltd_qos_update); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_pool.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_pool.c new file mode 100644 index 0000000000000..4bf0d168b7380 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_pool.c @@ -0,0 +1,244 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +/* + * lustre/target/tgt_pool.c + * + * This file handles creation, lookup, and removal of pools themselves, as + * well as adding and removing targets to pools. + * + * Author: Jacques-Charles LAFOUCRIERE + * Author: Alex Lyashkov + * Author: Nathaniel Rutman + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include + +/** + * Initialize the pool data structures at startup. + * + * Allocate and initialize the pool data structures with the specified + * array size. If pool count is not specified (\a count == 0), then + * POOL_INIT_COUNT will be used. Allocating a non-zero initial array + * size avoids the need to reallocate as new pools are added. 
+ * + * \param[in] op pool structure + * \param[in] count initial size of the target op_array[] array + * + * \retval 0 indicates successful pool initialization + * \retval negative error number on failure + */ +#define POOL_INIT_COUNT 2 +int lu_tgt_pool_init(struct lu_tgt_pool *op, unsigned int count) +{ + ENTRY; + + if (count == 0) + count = POOL_INIT_COUNT; + op->op_array = NULL; + op->op_count = 0; + init_rwsem(&op->op_rw_sem); + op->op_size = count * sizeof(op->op_array[0]); + OBD_ALLOC(op->op_array, op->op_size); + if (op->op_array == NULL) { + op->op_size = 0; + RETURN(-ENOMEM); + } + EXIT; + return 0; +} +EXPORT_SYMBOL(lu_tgt_pool_init); + +/** + * Increase the op_array size to hold more targets in this pool. + * + * The size is increased to at least \a min_count, but may be larger + * for an existing pool since ->op_array[] is growing exponentially. + * Caller must hold write op_rwlock. + * + * \param[in] op pool structure + * \param[in] min_count minimum number of entries to handle + * + * \retval 0 on success + * \retval negative error number on failure. + */ +int lu_tgt_pool_extend(struct lu_tgt_pool *op, unsigned int min_count) +{ + __u32 *new; + __u32 new_size; + + LASSERT(min_count != 0); + + if (op->op_count * sizeof(op->op_array[0]) < op->op_size) + return 0; + + new_size = max_t(__u32, min_count * sizeof(op->op_array[0]), + 2 * op->op_size); + OBD_ALLOC(new, new_size); + if (new == NULL) + return -ENOMEM; + + /* copy old array to new one */ + memcpy(new, op->op_array, op->op_size); + OBD_FREE(op->op_array, op->op_size); + op->op_array = new; + op->op_size = new_size; + + return 0; +} +EXPORT_SYMBOL(lu_tgt_pool_extend); + +/** + * Add a new target to an existing pool. + * + * Add a new target device to the pool previously created and returned by + * lod_pool_new(). Each target can only be in each pool at most one time. + * + * \param[in] op target pool to add new entry + * \param[in] idx pool index number to add to the \a op array + * \param[in] min_count minimum number of entries to expect in the pool + * + * \retval 0 if target could be added to the pool + * \retval negative error if target \a idx was not added + */ +int lu_tgt_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count) +{ + unsigned int i; + int rc = 0; + ENTRY; + + down_write(&op->op_rw_sem); + + /* search ost in pool array */ + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) + GOTO(out, rc = -EEXIST); + } + + rc = lu_tgt_pool_extend(op, min_count); + if (rc) + GOTO(out, rc); + + /* ost not found we add it */ + op->op_array[op->op_count] = idx; + op->op_count++; + EXIT; +out: + up_write(&op->op_rw_sem); + return rc; +} +EXPORT_SYMBOL(lu_tgt_pool_add); + +/** + * Remove an existing pool from the system. + * + * The specified pool must have previously been allocated by + * lod_pool_new() and not have any target members in the pool. + * If the removed target is not the last, compact the array + * to remove empty spaces. 
+ * + * \param[in] op pointer to the original data structure + * \param[in] idx target index to be removed + * + * \retval 0 on success + * \retval negative error number on failure + */ +int lu_tgt_pool_remove(struct lu_tgt_pool *op, __u32 idx) +{ + unsigned int i; + ENTRY; + + down_write(&op->op_rw_sem); + + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) { + memmove(&op->op_array[i], &op->op_array[i + 1], + (op->op_count - i - 1) * + sizeof(op->op_array[0])); + op->op_count--; + up_write(&op->op_rw_sem); + EXIT; + return 0; + } + } + + up_write(&op->op_rw_sem); + RETURN(-EINVAL); +} +EXPORT_SYMBOL(lu_tgt_pool_remove); + +int lu_tgt_check_index(int idx, struct lu_tgt_pool *osts) +{ + int i, rc = -ENOENT; + ENTRY; + + down_read(&osts->op_rw_sem); + for (i = 0; i < osts->op_count; i++) { + if (osts->op_array[i] == idx) + GOTO(out, rc = 0); + } + EXIT; +out: + up_read(&osts->op_rw_sem); + return rc; +} +EXPORT_SYMBOL(lu_tgt_check_index); + +/** + * Free the pool after it was emptied and removed from /proc. + * + * Note that all of the child/target entries referenced by this pool + * must have been removed by lod_ost_pool_remove() before it can be + * deleted from memory. + * + * \param[in] op pool to be freed. + */ +void lu_tgt_pool_free(struct lu_tgt_pool *op) +{ + ENTRY; + + if (op->op_size == 0) + RETURN_EXIT; + + down_write(&op->op_rw_sem); + + OBD_FREE(op->op_array, op->op_size); + op->op_array = NULL; + op->op_count = 0; + op->op_size = 0; + + up_write(&op->op_rw_sem); + EXIT; +} +EXPORT_SYMBOL(lu_tgt_pool_free); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_ucred.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_ucred.c new file mode 100644 index 0000000000000..216181e32f701 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_ucred.c @@ -0,0 +1,102 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lu_ucred.c + * + * Lustre user credentials context infrastructure. + * + * Author: Nikita Danilov + * Author: Fan Yong + * Author: Vitaly Fertman + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +/* context key constructor/destructor: lu_ucred_key_init, lu_ucred_key_fini */ +LU_KEY_INIT_FINI(lu_ucred, struct lu_ucred); + +static struct lu_context_key lu_ucred_key = { + .lct_tags = LCT_SERVER_SESSION, + .lct_init = lu_ucred_key_init, + .lct_fini = lu_ucred_key_fini +}; + +/** + * Get ucred key if session exists and ucred key is allocated on it. + * Return NULL otherwise. 
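+ *
+ * See also lu_ucred_check(), which additionally requires uc_valid to be
+ * UCRED_OLD or UCRED_NEW, and lu_ucred_assert(), which LASSERTs that a
+ * valid ucred is present.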
+ */ +struct lu_ucred *lu_ucred(const struct lu_env *env) +{ + if (!env->le_ses) + return NULL; + return lu_context_key_get(env->le_ses, &lu_ucred_key); +} +EXPORT_SYMBOL(lu_ucred); + +/** + * Get ucred key and check if it is properly initialized. + * Return NULL otherwise. + */ +struct lu_ucred *lu_ucred_check(const struct lu_env *env) +{ + struct lu_ucred *uc = lu_ucred(env); + if (uc && uc->uc_valid != UCRED_OLD && uc->uc_valid != UCRED_NEW) + return NULL; + return uc; +} +EXPORT_SYMBOL(lu_ucred_check); + +/** + * Get ucred key, which must exist and must be properly initialized. + * Assert otherwise. + */ +struct lu_ucred *lu_ucred_assert(const struct lu_env *env) +{ + struct lu_ucred *uc = lu_ucred_check(env); + LASSERT(uc != NULL); + return uc; +} +EXPORT_SYMBOL(lu_ucred_assert); + +int lu_ucred_global_init(void) +{ + LU_CONTEXT_KEY_INIT(&lu_ucred_key); + return lu_context_key_register(&lu_ucred_key); +} + +void lu_ucred_global_fini(void) +{ + lu_context_key_degister(&lu_ucred_key); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c b/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c new file mode 100644 index 0000000000000..9ac9cf13c0200 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c @@ -0,0 +1,219 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lustre_handles.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include + +#include +#include +#include + + +static __u64 handle_base; +#define HANDLE_INCR 7 +static DEFINE_SPINLOCK(handle_base_lock); + +static struct handle_bucket { + spinlock_t lock; + struct hlist_head head; +} *handle_hash; + +#define HANDLE_HASH_SIZE (1 << 16) +#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1) + +/* + * Generate a unique 64bit cookie (hash) for a handle and insert it into + * global (per-node) hash-table. + */ +void class_handle_hash(struct portals_handle *h, const char *owner) +{ + struct handle_bucket *bucket; + + ENTRY; + + LASSERT(h != NULL); + LASSERT(hlist_unhashed(&h->h_link)); + + /* + * This is fast, but simplistic cookie generation algorithm, it will + * need a re-do at some point in the future for security. + */ + spin_lock(&handle_base_lock); + handle_base += HANDLE_INCR; + + if (unlikely(handle_base == 0)) { + /* + * Cookie of zero is "dangerous", because in many places it's + * assumed that 0 means "unassigned" handle, not bound to any + * object. 
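+ * Skip over zero by advancing the base one more HANDLE_INCR step.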
+ */ + CWARN("The universe has been exhausted: cookie wrap-around.\n"); + handle_base += HANDLE_INCR; + } + h->h_cookie = handle_base; + spin_unlock(&handle_base_lock); + + h->h_owner = owner; + + bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK]; + spin_lock(&bucket->lock); + hlist_add_head_rcu(&h->h_link, &bucket->head); + spin_unlock(&bucket->lock); + + CDEBUG(D_INFO, "added object %p with handle %#llx to hash\n", + h, h->h_cookie); + EXIT; +} +EXPORT_SYMBOL(class_handle_hash); + +static void class_handle_unhash_nolock(struct portals_handle *h) +{ + if (hlist_unhashed(&h->h_link)) { + CERROR("removing an already-removed handle (%#llx)\n", + h->h_cookie); + return; + } + + CDEBUG(D_INFO, "removing object %p with handle %#llx from hash\n", + h, h->h_cookie); + + hlist_del_init_rcu(&h->h_link); +} + +void class_handle_unhash(struct portals_handle *h) +{ + struct handle_bucket *bucket; + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + class_handle_unhash_nolock(h); + spin_unlock(&bucket->lock); +} +EXPORT_SYMBOL(class_handle_unhash); + +void *class_handle2object(u64 cookie, const char *owner) +{ + struct handle_bucket *bucket; + struct portals_handle *h; + void *retval = NULL; + + ENTRY; + + LASSERT(handle_hash != NULL); + + /* + * Be careful when you want to change this code. See the + * rcu_read_lock() definition on top this file. - jxiong + */ + bucket = handle_hash + (cookie & HANDLE_HASH_MASK); + + rcu_read_lock(); + hlist_for_each_entry_rcu(h, &bucket->head, h_link) { + if (h->h_cookie != cookie || h->h_owner != owner) + continue; + + if (refcount_inc_not_zero(&h->h_ref)) { + CDEBUG(D_INFO, "GET %s %p refcount=%d\n", + h->h_owner, h, + refcount_read(&h->h_ref)); + retval = h; + } + break; + } + rcu_read_unlock(); + + RETURN(retval); +} +EXPORT_SYMBOL(class_handle2object); + +int class_handle_init(void) +{ + struct handle_bucket *bucket; + + LASSERT(handle_hash == NULL); + + OBD_ALLOC_PTR_ARRAY_LARGE(handle_hash, HANDLE_HASH_SIZE); + if (handle_hash == NULL) + return -ENOMEM; + + for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash; + bucket--) { + INIT_HLIST_HEAD(&bucket->head); + spin_lock_init(&bucket->lock); + } + + get_random_bytes(&handle_base, sizeof(handle_base)); + LASSERT(handle_base != 0ULL); + + return 0; +} + +static int cleanup_all_handles(void) +{ + int rc; + int i; + + for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) { + struct portals_handle *h; + + spin_lock(&handle_hash[i].lock); + hlist_for_each_entry_rcu(h, &handle_hash[i].head, h_link) { + CERROR("force clean handle %#llx addr %p owner %p\n", + h->h_cookie, h, h->h_owner); + + class_handle_unhash_nolock(h); + rc++; + } + spin_unlock(&handle_hash[i].lock); + } + + return rc; +} + +void class_handle_cleanup(void) +{ + int count; + + LASSERT(handle_hash != NULL); + + count = cleanup_all_handles(); + + OBD_FREE_PTR_ARRAY_LARGE(handle_hash, HANDLE_HASH_SIZE); + handle_hash = NULL; + + if (count != 0) + CERROR("handle_count at cleanup: %d\n", count); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c b/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c new file mode 100644 index 0000000000000..16b50f9377a20 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c @@ -0,0 +1,247 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include + +struct uuid_nid_data { + struct list_head un_list; + struct obd_uuid un_uuid; + int un_nid_count; + lnet_nid_t un_nids[MTI_NIDS_MAX]; +}; + +/* FIXME: This should probably become more elegant than a global linked list */ +static LIST_HEAD(g_uuid_list); +static DEFINE_SPINLOCK(g_uuid_lock); + +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index) +{ + struct uuid_nid_data *data; + struct obd_uuid tmp; + int rc = -ENOENT; + + obd_str2uuid(&tmp, uuid); + spin_lock(&g_uuid_lock); + list_for_each_entry(data, &g_uuid_list, un_list) { + if (obd_uuid_equals(&data->un_uuid, &tmp)) { + if (index >= data->un_nid_count) + break; + + rc = 0; + *peer_nid = data->un_nids[index]; + break; + } + } + spin_unlock(&g_uuid_lock); + return rc; +} +EXPORT_SYMBOL(lustre_uuid_to_peer); + +/* Add a nid to a niduuid. Multiple nids can be added to a single uuid; + LNET will choose the best one. 
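+   A NID already known for the uuid is not added twice, and LNetAddPeer()
+   failures are only logged, so the call returns 0 unless the uuid string
+   is too long or memory allocation fails.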
*/ +int class_add_uuid(const char *uuid, __u64 nid) +{ + struct uuid_nid_data *data, *entry; + int found = 0; + int rc; + + LASSERT(nid != 0); /* valid newconfig NID is never zero */ + + if (strlen(uuid) > UUID_MAX - 1) + return -EOVERFLOW; + + OBD_ALLOC_PTR(data); + if (data == NULL) + return -ENOMEM; + + obd_str2uuid(&data->un_uuid, uuid); + data->un_nids[0] = nid; + data->un_nid_count = 1; + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) { + int i; + + found = 1; + for (i = 0; i < entry->un_nid_count; i++) + if (nid == entry->un_nids[i]) + break; + + if (i == entry->un_nid_count) { + LASSERT(entry->un_nid_count < MTI_NIDS_MAX); + entry->un_nids[entry->un_nid_count++] = nid; + } + break; + } + } + if (!found) + list_add(&data->un_list, &g_uuid_list); + spin_unlock(&g_uuid_lock); + + if (found) { + CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, + libcfs_nid2str(nid), entry->un_nid_count); + rc = LNetAddPeer(entry->un_nids, entry->un_nid_count); + CDEBUG(D_INFO, "Add peer %s rc = %d\n", + libcfs_nid2str(data->un_nids[0]), rc); + OBD_FREE(data, sizeof(*data)); + } else { + CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid)); + rc = LNetAddPeer(data->un_nids, data->un_nid_count); + CDEBUG(D_INFO, "Add peer %s rc = %d\n", + libcfs_nid2str(data->un_nids[0]), rc); + } + + return 0; +} +EXPORT_SYMBOL(class_add_uuid); + +/* Delete the nids for one uuid if specified, otherwise delete all */ +int class_del_uuid(const char *uuid) +{ + struct uuid_nid_data *data; + LIST_HEAD(deathrow); + + spin_lock(&g_uuid_lock); + if (uuid != NULL) { + struct obd_uuid tmp; + + obd_str2uuid(&tmp, uuid); + list_for_each_entry(data, &g_uuid_list, un_list) { + if (obd_uuid_equals(&data->un_uuid, &tmp)) { + list_move(&data->un_list, &deathrow); + break; + } + } + } else + list_splice_init(&g_uuid_list, &deathrow); + spin_unlock(&g_uuid_lock); + + if (uuid != NULL && list_empty(&deathrow)) { + CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid); + return -EINVAL; + } + + while ((data = list_first_entry_or_null(&deathrow, struct uuid_nid_data, + un_list)) != NULL) { + list_del(&data->un_list); + + CDEBUG(D_INFO, "del uuid %s %s/%d\n", + obd_uuid2str(&data->un_uuid), + libcfs_nid2str(data->un_nids[0]), + data->un_nid_count); + + OBD_FREE(data, sizeof(*data)); + } + return 0; +} + +int class_add_nids_to_uuid(struct obd_uuid *uuid, lnet_nid_t *nids, + int nid_count) +{ + struct uuid_nid_data *entry; + int i, rc; + bool matched = false; + + ENTRY; + + if (nid_count >= MTI_NIDS_MAX) { + CDEBUG(D_NET, "too many NIDs (%d) for UUID '%s'\n", + nid_count, obd_uuid2str(uuid)); + return -ENOSPC; + } + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + CDEBUG(D_NET, "Comparing %s with %s\n", + obd_uuid2str(uuid), obd_uuid2str(&entry->un_uuid)); + + if (!obd_uuid_equals(&entry->un_uuid, uuid)) + continue; + + matched = true; + CDEBUG(D_NET, "Updating UUID '%s'\n", obd_uuid2str(uuid)); + for (i = 0; i < nid_count; i++) + entry->un_nids[i] = nids[i]; + entry->un_nid_count = nid_count; + break; + } + spin_unlock(&g_uuid_lock); + if (matched) { + rc = LNetAddPeer(entry->un_nids, entry->un_nid_count); + CDEBUG(D_INFO, "Add peer %s rc = %d\n", + libcfs_nid2str(entry->un_nids[0]), rc); + } + + RETURN(0); +} +EXPORT_SYMBOL(class_add_nids_to_uuid); + +/* check if @nid exists in nid list of @uuid */ +int class_check_uuid(struct obd_uuid *uuid, __u64 nid) +{ + struct uuid_nid_data *entry; + int found = 
0; + + ENTRY; + + CDEBUG(D_INFO, "check if uuid %s has %s.\n", + obd_uuid2str(uuid), libcfs_nid2str(nid)); + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + int i; + + if (!obd_uuid_equals(&entry->un_uuid, uuid)) + continue; + + /* found the uuid, check if it has @nid */ + for (i = 0; i < entry->un_nid_count; i++) { + if (entry->un_nids[i] == nid) { + found = 1; + break; + } + } + break; + } + spin_unlock(&g_uuid_lock); + RETURN(found); +} +EXPORT_SYMBOL(class_check_uuid); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c b/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c new file mode 100644 index 0000000000000..b0a68a0fb9981 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c @@ -0,0 +1,198 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + * Use is subject to license terms. + * + * Author: Johann Lombardi + */ +#include +#include +#include +#include + +/** + * Initialize new \a lma. Only fid is stored. + * + * \param lma - is the new LMA structure to be initialized + * \param fid - is the FID of the object this LMA belongs to + * \param incompat - features that MDS must understand to access object + */ +void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid, + __u32 compat, __u32 incompat) +{ + lma->lma_compat = compat; + lma->lma_incompat = incompat; + lma->lma_self_fid = *fid; + + /* If a field is added in struct lustre_mdt_attrs, zero it explicitly + * and change the test below. */ + BUILD_BUG_ON(sizeof(*lma) != + (offsetof(struct lustre_mdt_attrs, lma_self_fid) + + sizeof(lma->lma_self_fid))); +} +EXPORT_SYMBOL(lustre_lma_init); + +/** + * Swab, if needed, LMA structure which is stored on-disk in little-endian order. + * + * \param lma - is a pointer to the LMA structure to be swabbed. + */ +void lustre_lma_swab(struct lustre_mdt_attrs *lma) +{ +#ifdef __BIG_ENDIAN + __swab32s(&lma->lma_compat); + __swab32s(&lma->lma_incompat); + lustre_swab_lu_fid(&lma->lma_self_fid); +#endif +} +EXPORT_SYMBOL(lustre_lma_swab); + +void lustre_loa_init(struct lustre_ost_attrs *loa, const struct lu_fid *fid, + __u32 compat, __u32 incompat) +{ + BUILD_BUG_ON(sizeof(*loa) != LMA_OLD_SIZE); + + memset_startat(loa, 0, loa_parent_fid); + lustre_lma_init(&loa->loa_lma, fid, compat, incompat); +} +EXPORT_SYMBOL(lustre_loa_init); + +/** + * Swab, if needed, LOA (for OST-object only) structure with LMA EA and PFID EA + * combined together are stored on-disk in little-endian order. + * + * \param[in] loa - the pointer to the LOA structure to be swabbed. + * \param[in] to_cpu - to indicate swab for CPU order or not. 
+ */ +void lustre_loa_swab(struct lustre_ost_attrs *loa, bool to_cpu) +{ + struct lustre_mdt_attrs *lma = &loa->loa_lma; +#ifdef __BIG_ENDIAN + __u32 compat = lma->lma_compat; +#endif + + lustre_lma_swab(lma); +#ifdef __BIG_ENDIAN + if (to_cpu) + compat = lma->lma_compat; + + if (compat & LMAC_STRIPE_INFO) { + lustre_swab_lu_fid(&loa->loa_parent_fid); + __swab32s(&loa->loa_stripe_size); + } + if (compat & LMAC_COMP_INFO) { + __swab32s(&loa->loa_comp_id); + __swab64s(&loa->loa_comp_start); + __swab64s(&loa->loa_comp_end); + } +#endif +} +EXPORT_SYMBOL(lustre_loa_swab); + +/** + * Swab, if needed, SOM structure which is stored on-disk in little-endian + * order. + * + * \param attrs - is a pointer to the SOM structure to be swabbed. + */ +void lustre_som_swab(struct lustre_som_attrs *attrs) +{ +#ifdef __BIG_ENDIAN + __swab16s(&attrs->lsa_valid); + __swab64s(&attrs->lsa_size); + __swab64s(&attrs->lsa_blocks); +#endif +} +EXPORT_SYMBOL(lustre_som_swab); + +/** + * Swab, if needed, HSM structure which is stored on-disk in little-endian + * order. + * + * \param attrs - is a pointer to the HSM structure to be swabbed. + */ +void lustre_hsm_swab(struct hsm_attrs *attrs) +{ +#ifdef __BIG_ENDIAN + __swab32s(&attrs->hsm_compat); + __swab32s(&attrs->hsm_flags); + __swab64s(&attrs->hsm_arch_id); + __swab64s(&attrs->hsm_arch_ver); +#endif +} + +/* + * Swab and extract HSM attributes from on-disk xattr. + * + * \param buf - is a buffer containing the on-disk HSM extended attribute. + * \param rc - is the HSM xattr stored in \a buf + * \param mh - is the md_hsm structure where to extract HSM attributes. + */ +int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh) +{ + struct hsm_attrs *attrs = (struct hsm_attrs *)buf; + ENTRY; + + if (rc == 0 || rc == -ENODATA) + /* no HSM attributes */ + RETURN(-ENODATA); + + if (rc < 0) + /* error hit while fetching xattr */ + RETURN(rc); + + /* unpack HSM attributes */ + lustre_hsm_swab(attrs); + + /* fill md_hsm structure */ + mh->mh_compat = attrs->hsm_compat; + mh->mh_flags = attrs->hsm_flags; + mh->mh_arch_id = attrs->hsm_arch_id; + mh->mh_arch_ver = attrs->hsm_arch_ver; + + RETURN(0); +} +EXPORT_SYMBOL(lustre_buf2hsm); + +/* + * Pack HSM attributes. + * + * \param buf - is the output buffer where to pack the on-disk HSM xattr. + * \param mh - is the md_hsm structure to pack. + */ +void lustre_hsm2buf(void *buf, const struct md_hsm *mh) +{ + struct hsm_attrs *attrs = (struct hsm_attrs *)buf; + ENTRY; + + /* copy HSM attributes */ + attrs->hsm_compat = mh->mh_compat; + attrs->hsm_flags = mh->mh_flags; + attrs->hsm_arch_id = mh->mh_arch_id; + attrs->hsm_arch_ver = mh->mh_arch_ver; + + /* pack xattr */ + lustre_hsm_swab(attrs); +} +EXPORT_SYMBOL(lustre_hsm2buf); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c new file mode 100644 index 0000000000000..16e6f12f8a05c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c @@ -0,0 +1,149 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2018, DataDirect Networks Storage. + * Author: Li Xi. + * + * Checksum functions + */ +#include +#include + +/* Server uses algos that perform at 50% or better of the Adler */ +enum cksum_types obd_cksum_types_supported_server(const char *obd_name) +{ + enum cksum_types ret = OBD_CKSUM_ADLER; + int base_speed; + + CDEBUG(D_INFO, "%s: checksum speed: crc %d, crc32c %d, adler %d, " + "t10ip512 %d, t10ip4k %d, t10crc512 %d, t10crc4k %d\n", + obd_name, + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K)); + + base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= + base_speed) + ret |= OBD_CKSUM_CRC32C; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= + base_speed) + ret |= OBD_CKSUM_CRC32; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512) >= base_speed) + ret |= OBD_CKSUM_T10IP512; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K) >= base_speed) + ret |= OBD_CKSUM_T10IP4K; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512) >= base_speed) + ret |= OBD_CKSUM_T10CRC512; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K) >= base_speed) + ret |= OBD_CKSUM_T10CRC4K; + + return ret; +} +EXPORT_SYMBOL(obd_cksum_types_supported_server); + +/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can + * only be a single checksum type per RPC. + * + * The OBD_CKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask + * since they need to represent the full range of checksum algorithms that + * both the client and server can understand. + * + * In case of an unsupported types/flags we fall back to ADLER + * because that is supported by all clients since 1.8 + * + * In case multiple algorithms are supported the best one is used. 
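The selection in obd_cksum_types_supported_server() above advertises an algorithm only when it benchmarks at half the Adler32 speed or better. A minimal standalone sketch of that selection logic follows; the enum values, the speed table and the numbers in main() are illustrative placeholders, not the Lustre definitions.

#include <stdio.h>

enum cksum_bits {
	CK_ADLER  = 1 << 0,
	CK_CRC32  = 1 << 1,
	CK_CRC32C = 1 << 2,
};

struct cksum_speed {
	enum cksum_bits bit;
	unsigned int mb_per_sec;	/* hypothetical benchmark result */
};

static unsigned int supported_mask(const struct cksum_speed *tbl, int n,
				   unsigned int adler_speed)
{
	unsigned int mask = CK_ADLER;		/* Adler is always offered */
	unsigned int base = adler_speed / 2;	/* 50% of Adler threshold */
	int i;

	for (i = 0; i < n; i++)
		if (tbl[i].mb_per_sec >= base)
			mask |= tbl[i].bit;
	return mask;
}

int main(void)
{
	struct cksum_speed tbl[] = {
		{ CK_CRC32,  800 },	/* below half of Adler, dropped */
		{ CK_CRC32C, 2600 },	/* well above, kept */
	};
	unsigned int mask = supported_mask(tbl, 2, 1800);

	printf("supported mask: %#x\n", mask);	/* Adler + CRC32C = 0x5 */
	return 0;
}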
*/ +u32 obd_cksum_type_pack(const char *obd_name, enum cksum_types cksum_type) +{ + unsigned int performance = 0, tmp; + u32 flag = OBD_FL_CKSUM_ADLER; + + if (cksum_type & OBD_CKSUM_CRC32) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32; + } + } + if (cksum_type & OBD_CKSUM_CRC32C) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32C; + } + } + if (cksum_type & OBD_CKSUM_ADLER) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_ADLER; + } + } + + if (cksum_type & OBD_CKSUM_T10IP512) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10IP512; + } + } + + if (cksum_type & OBD_CKSUM_T10IP4K) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10IP4K; + } + } + + if (cksum_type & OBD_CKSUM_T10CRC512) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10CRC512; + } + } + + if (cksum_type & OBD_CKSUM_T10CRC4K) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10CRC4K; + } + } + + if (unlikely(cksum_type && !(cksum_type & OBD_CKSUM_ALL))) + CWARN("%s: unknown cksum type %x\n", obd_name, cksum_type); + + return flag; +} +EXPORT_SYMBOL(obd_cksum_type_pack); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c new file mode 100644 index 0000000000000..09a524323ea1a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c @@ -0,0 +1,2479 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/obd_config.c + * + * Config API + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "llog_internal.h" + +#ifdef HAVE_SERVER_SUPPORT +static struct cfs_hash_ops nid_stat_hash_ops; +static struct cfs_hash_ops gen_hash_ops; +#endif /* HAVE_SERVER_SUPPORT */ + +/* + * uuid<->export lustre hash operations + */ +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +uuid_keycmp(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct obd_uuid *uuid = arg->key; + const struct obd_export *exp = obj; + + if (obd_uuid_equals(uuid, &exp->exp_client_uuid) && + !exp->exp_failed) + return 0; + return -ESRCH; +} + +static void +obd_export_exit(void *vexport, void *data) +{ + struct obd_export *exp = vexport; + + class_export_put(exp); +} + +static const struct rhashtable_params uuid_hash_params = { + .key_len = sizeof(struct obd_uuid), + .key_offset = offsetof(struct obd_export, exp_client_uuid), + .head_offset = offsetof(struct obd_export, exp_uuid_hash), + .obj_cmpfn = uuid_keycmp, + .max_size = MAX_OBD_DEVICES, + .automatic_shrinking = true, +}; + +int obd_uuid_add(struct obd_device *obd, struct obd_export *export) +{ + int rc; + + class_export_get(export); + rcu_read_lock(); + rc = rhashtable_lookup_insert_fast(&obd->obd_uuid_hash, + &export->exp_uuid_hash, + uuid_hash_params); + if (rc) { + class_export_put(export); + if (rc != -EEXIST) { + /* map obscure error codes to -ENOMEM */ + rc = -ENOMEM; + } else { + rc = -EALREADY; + } + } + rcu_read_unlock(); + + return rc; +} +EXPORT_SYMBOL(obd_uuid_add); + +void obd_uuid_del(struct obd_device *obd, struct obd_export *export) +{ + int rc; + + rcu_read_lock(); + rc = rhashtable_remove_fast(&obd->obd_uuid_hash, + &export->exp_uuid_hash, + uuid_hash_params); + if (!rc) + class_export_put(export); + rcu_read_unlock(); +} +EXPORT_SYMBOL(obd_uuid_del); + +#ifdef HAVE_SERVER_SUPPORT +/* obd_uuid_lookup() is used only server side by target_handle_connect(), + * mdt_hsm_agent_send(), and obd_export_evict_by_uuid(). 
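obd_uuid_add() and obd_uuid_del() above follow the usual rhashtable pattern: the key is embedded in the object, a single struct rhashtable_params describes the key and linkage offsets, and the table holds its own reference on the object. A minimal sketch of that pattern, assuming a kernel-module build environment; struct item and its fields are hypothetical, not Lustre's.

/* Sketch only, not part of this patch. */
#include <linux/rhashtable.h>
#include <linux/refcount.h>

struct item {
	char			key[40];	/* fixed-size key, like a uuid */
	struct rhash_head	linkage;
	refcount_t		ref;
};

static const struct rhashtable_params item_params = {
	.key_len		= sizeof(((struct item *)0)->key),
	.key_offset		= offsetof(struct item, key),
	.head_offset		= offsetof(struct item, linkage),
	.automatic_shrinking	= true,
};

static int item_add(struct rhashtable *ht, struct item *it)
{
	int rc;

	refcount_inc(&it->ref);		/* table holds a reference */
	rc = rhashtable_lookup_insert_fast(ht, &it->linkage, item_params);
	if (rc)				/* e.g. -EEXIST on duplicate key */
		refcount_dec(&it->ref);
	return rc;
}

static void item_del(struct rhashtable *ht, struct item *it)
{
	/* drop the table's reference only if the entry was really removed */
	if (!rhashtable_remove_fast(ht, &it->linkage, item_params))
		refcount_dec(&it->ref);
}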
+ */ +struct obd_export *obd_uuid_lookup(struct obd_device *obd, + struct obd_uuid *uuid) +{ + struct obd_export *export = NULL; + + rcu_read_lock(); + export = rhashtable_lookup_fast(&obd->obd_uuid_hash, uuid, + uuid_hash_params); + if (export && !refcount_inc_not_zero(&export->exp_handle.h_ref)) + export = NULL; + rcu_read_unlock(); + + return export; +} +EXPORT_SYMBOL(obd_uuid_lookup); + +/* + * nid<->export hash operations + */ +static u32 nid_keyhash(const void *data, u32 key_len, u32 seed) +{ + const struct obd_export *exp = data; + void *key; + + if (!exp->exp_connection) + return 0; + + key = &exp->exp_connection->c_peer.nid; + return jhash2(key, key_len / sizeof(u32), seed); +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +nid_keycmp(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct lnet_nid *nid = arg->key; + const struct obd_export *exp = obj; + + if (nid_same(&exp->exp_connection->c_peer.nid, nid)) + return 0; + + return -ESRCH; +} + +static void +nid_export_exit(void *vexport, void *data) +{ + struct obd_export *exp = vexport; + + class_export_put(exp); +} + +static const struct rhashtable_params nid_hash_params = { + .key_len = sizeof(struct lnet_nid), + .head_offset = offsetof(struct obd_export, exp_nid_hash), + .obj_hashfn = nid_keyhash, + .obj_cmpfn = nid_keycmp, + .automatic_shrinking = true, +}; + +int obd_nid_add(struct obd_device *obd, struct obd_export *exp) +{ + int rc; + + if (exp == exp->exp_obd->obd_self_export || exp->exp_hashed) + return 0; + + class_export_get(exp); + rc = rhltable_insert_key(&obd->obd_nid_hash, + &exp->exp_connection->c_peer.nid, + &exp->exp_nid_hash, + nid_hash_params); + if (rc) { + class_export_put(exp); + /* map obscure error codes to -ENOMEM */ + rc = -ENOMEM; + } else { + exp->exp_hashed = 1; + } + return rc; +} +EXPORT_SYMBOL(obd_nid_add); + +void obd_nid_del(struct obd_device *obd, struct obd_export *exp) +{ + int rc; + + if (exp == exp->exp_obd->obd_self_export || !exp->exp_hashed) + return; + + rc = rhltable_remove(&obd->obd_nid_hash, &exp->exp_nid_hash, + nid_hash_params); + if (rc == 0) { + class_export_put(exp); + exp->exp_hashed = 0; + } +} +EXPORT_SYMBOL(obd_nid_del); + +int obd_nid_export_for_each(struct obd_device *obd, struct lnet_nid *nid, + int cb(struct obd_export *exp, void *data), + void *data) +{ + struct rhlist_head *exports, *tmp; + struct obd_export *exp; + int ret = 0; + + rcu_read_lock(); + exports = rhltable_lookup(&obd->obd_nid_hash, nid, nid_hash_params); + if (!exports) { + ret = -ENODEV; + goto out_unlock; + } + + rhl_for_each_entry_rcu(exp, tmp, exports, exp_nid_hash) { + if (!exp->exp_failed && cb(exp, data)) + ret++; + } + +out_unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(obd_nid_export_for_each); +#endif /* HAVE_SERVER_SUPPORT */ + +/*********** string parsing utils *********/ + +/* returns 0 if we find this key in the buffer, else 1 */ +int class_find_param(char *buf, char *key, char **valp) +{ + char *ptr; + + if (!buf) + return 1; + + ptr = strstr(buf, key); + if (!ptr) + return 1; + + if (valp) + *valp = ptr + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_find_param); + +/** + * Check whether the proc parameter \a param is an old parameter or not from + * the array \a ptr which contains the mapping from old parameters to new ones. + * If it's an old one, then return the pointer to the cfg_interop_param struc- + * ture which contains both the old and new parameters. 
+ * + * \param param proc parameter + * \param ptr an array which contains the mapping from + * old parameters to new ones + * + * \retval valid-pointer pointer to the cfg_interop_param structure + * which contains the old and new parameters + * \retval NULL \a param or \a ptr is NULL, + * or \a param is not an old parameter + */ +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr) +{ + char *value = NULL; + int name_len = 0; + + if (!param || !ptr) + RETURN(NULL); + + value = strchr(param, '='); + if (value) + name_len = value - param; + else + name_len = strlen(param); + + while (ptr->old_param) { + if (strncmp(param, ptr->old_param, name_len) == 0 && + name_len == strlen(ptr->old_param)) + RETURN(ptr); + ptr++; + } + + RETURN(NULL); +} +EXPORT_SYMBOL(class_find_old_param); + +/** + * Finds a parameter in \a params and copies it to \a copy. + * + * Leading spaces are skipped. Next space or end of string is the + * parameter terminator with the exception that spaces inside single or double + * quotes get included into a parameter. The parameter is copied into \a copy + * which has to be allocated big enough by a caller, quotes are stripped in + * the copy and the copy is terminated by 0. + * + * On return \a params is set to next parameter or to NULL if last + * parameter is returned. + * + * \retval 0 if parameter is returned in \a copy + * \retval 1 otherwise + * \retval -EINVAL if unbalanced quota is found + */ +int class_get_next_param(char **params, char *copy) +{ + char *q1, *q2, *str; + int len; + + str = *params; + while (*str == ' ') + str++; + + if (*str == '\0') { + *params = NULL; + return 1; + } + + while (1) { + q1 = strpbrk(str, " '\""); + if (!q1) { + len = strlen(str); + memcpy(copy, str, len); + copy[len] = '\0'; + *params = NULL; + return 0; + } + len = q1 - str; + if (*q1 == ' ') { + memcpy(copy, str, len); + copy[len] = '\0'; + *params = str + len; + return 0; + } + + memcpy(copy, str, len); + copy += len; + + /* search for the matching closing quote */ + str = q1 + 1; + q2 = strchr(str, *q1); + if (!q2) { + CERROR("Unbalanced quota in parameters: \"%s\"\n", + *params); + return -EINVAL; + } + len = q2 - str; + memcpy(copy, str, len); + copy += len; + str = q2 + 1; + } + return 1; +} +EXPORT_SYMBOL(class_get_next_param); + +/* + * returns 0 if this is the first key in the buffer, else 1. + * valp points to first char after key. 
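class_get_next_param() above tokenizes a parameter string in which spaces separate parameters unless they appear inside single or double quotes, and the quotes themselves are stripped from the copy. A small standalone sketch of the same rules (simplified, not the Lustre implementation):

#include <stdio.h>
#include <string.h>

/* 0: token copied to "out"; 1: end of input; -1: unbalanced quote */
static int next_param(char **params, char *out)
{
	char *s = *params;
	char *o = out;

	if (!s)
		return 1;
	while (*s == ' ')
		s++;
	if (*s == '\0') {
		*params = NULL;
		return 1;
	}

	while (*s != '\0' && *s != ' ') {
		if (*s == '\'' || *s == '"') {
			char quote = *s++;
			char *endq = strchr(s, quote);

			if (!endq)
				return -1;	/* unbalanced quote */
			memcpy(o, s, endq - s);	/* keep spaces, drop quotes */
			o += endq - s;
			s = endq + 1;
		} else {
			*o++ = *s++;
		}
	}
	*o = '\0';
	*params = (*s == '\0') ? NULL : s;
	return 0;
}

int main(void)
{
	char buf[] = "failover.node=1.2.3.4@tcp  jobid_var='procname uid'";
	char *p = buf;
	char tok[128];

	while (next_param(&p, tok) == 0)
		printf("param: [%s]\n", tok);
	return 0;
}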
+ */ +int class_match_param(char *buf, const char *key, char **valp) +{ + if (!buf) + return 1; + + if (memcmp(buf, key, strlen(key)) != 0) + return 1; + + if (valp) + *valp = buf + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_match_param); + +static int parse_nid(char *buf, void *value, int quiet) +{ + lnet_nid_t *nid = (lnet_nid_t *)value; + + *nid = libcfs_str2nid(buf); + if (*nid != LNET_NID_ANY) + return 0; + + if (!quiet) + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf); + return -EINVAL; +} + +static int parse_net(char *buf, void *value) +{ + __u32 *net = (__u32 *)value; + + *net = libcfs_str2net(buf); + CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*net)); + return 0; +} + +enum { + CLASS_PARSE_NID = 1, + CLASS_PARSE_NET, +}; + +/* + * 0 is good NID, + * 1 not found + * < 0 error + * endh is set to next separator + */ +static int class_parse_value(char *buf, int opc, void *value, char **endh, + int quiet) +{ + char *endp; + char tmp; + int rc = 0; + + if (!buf) + return 1; + while (*buf == ',' || *buf == ':') + buf++; + if (*buf == ' ' || *buf == '/' || *buf == '\0') + return 1; + + /* NID separators or end of NIDs */ + endp = strpbrk(buf, ",: /"); + if (!endp) + endp = buf + strlen(buf); + + tmp = *endp; + *endp = '\0'; + switch (opc) { + default: + LBUG(); + case CLASS_PARSE_NID: + rc = parse_nid(buf, value, quiet); + break; + case CLASS_PARSE_NET: + rc = parse_net(buf, value); + break; + } + *endp = tmp; + if (rc != 0) + return rc; + if (endh) + *endh = endp; + return 0; +} + +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 0); +} +EXPORT_SYMBOL(class_parse_nid); + +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 1); +} +EXPORT_SYMBOL(class_parse_nid_quiet); + +int class_parse_net(char *buf, __u32 *net, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NET, (void *)net, endh, 0); +} + +/* + * 1 param contains key and match + * 0 param contains key and not match + * -1 param does not contain key + */ +int class_match_nid(char *buf, char *key, lnet_nid_t nid) +{ + lnet_nid_t tmp; + int rc = -1; + + while (class_find_param(buf, key, &buf) == 0) { + /* + * please restrict to the NIDs pertaining to + * the specified NIDs + */ + while (class_parse_nid(buf, &tmp, &buf) == 0) { + if (tmp == nid) + return 1; + } + rc = 0; + } + return rc; +} + +int class_match_net(char *buf, char *key, __u32 net) +{ + __u32 tmp; + int rc = -1; + + while (class_find_param(buf, key, &buf) == 0) { + /* + * please restrict to the NIDs pertaining to + * the specified networks + */ + while (class_parse_net(buf, &tmp, &buf) == 0) { + if (tmp == net) + return 1; + } + rc = 0; + } + return rc; +} + +char *lustre_cfg_string(struct lustre_cfg *lcfg, u32 index) +{ + char *s; + + if (!lcfg->lcfg_buflens[index]) + return NULL; + + s = lustre_cfg_buf(lcfg, index); + if (!s) + return NULL; + + /* + * make sure it's NULL terminated, even if this kills a char + * of data. Try to use the padding first though. 
+ */ + if (s[lcfg->lcfg_buflens[index] - 1] != '\0') { + size_t last = ALIGN(lcfg->lcfg_buflens[index], 8) - 1; + char lost; + + /* Use the smaller value */ + if (last > lcfg->lcfg_buflens[index]) + last = lcfg->lcfg_buflens[index]; + + lost = s[last]; + s[last] = '\0'; + if (lost != '\0') { + CWARN("Truncated buf %d to '%s' (lost '%c'...)\n", + index, s, lost); + } + } + return s; +} +EXPORT_SYMBOL(lustre_cfg_string); + +/********************** class fns **********************/ + +/** + * Create a new OBD device and set the type, name and uuid. If successful, + * the new device can be accessed by either name or uuid. + */ +int class_attach(struct lustre_cfg *lcfg) +{ + struct obd_export *exp; + struct obd_device *obd = NULL; + char *typename, *name, *uuid; + int rc, len; + + ENTRY; + + if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("No type passed!\n"); + RETURN(-EINVAL); + } + typename = lustre_cfg_string(lcfg, 1); + + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) { + CERROR("No name passed!\n"); + RETURN(-EINVAL); + } + name = lustre_cfg_string(lcfg, 0); + if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { + CERROR("No UUID passed!\n"); + RETURN(-EINVAL); + } + + uuid = lustre_cfg_string(lcfg, 2); + len = strlen(uuid); + if (len >= sizeof(obd->obd_uuid)) { + CERROR("%s: uuid must be < %d bytes long\n", + name, (int)sizeof(obd->obd_uuid)); + RETURN(-EINVAL); + } + + obd = class_newdev(typename, name, uuid); + if (IS_ERR(obd)) { /* Already exists or out of obds */ + rc = PTR_ERR(obd); + CERROR("Cannot create device %s of type %s : %d\n", + name, typename, rc); + RETURN(rc); + } + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08X != %08X\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, + "%p obd_name %s != %s\n", obd, obd->obd_name, name); + + exp = class_new_export_self(obd, &obd->obd_uuid); + if (IS_ERR(exp)) { + rc = PTR_ERR(exp); + class_free_dev(obd); + RETURN(rc); + } + + obd->obd_self_export = exp; + list_del_init(&exp->exp_obd_chain_timed); + class_export_put(exp); + + rc = class_register_device(obd); + if (rc != 0) { + class_decref(obd, "newdev", obd); + RETURN(rc); + } + + obd->obd_attached = 1; + CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", + obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); + + RETURN(0); +} +EXPORT_SYMBOL(class_attach); + +/** + * Create hashes, self-export, and call type-specific setup. + * Setup is effectively the "start this obd" call. + */ +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + + ENTRY; + + LASSERT(obd != NULL); + LASSERTF(obd == class_num2obd(obd->obd_minor), + "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, class_num2obd(obd->obd_minor)); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + + /* have we attached a type to this device? */ + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + RETURN(-ENODEV); + } + + if (obd->obd_set_up) { + CERROR("Device %d already setup (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + RETURN(-EEXIST); + } + + /* is someone else setting us up right now? (attach inits spinlock) */ + spin_lock(&obd->obd_dev_lock); + if (obd->obd_starting) { + spin_unlock(&obd->obd_dev_lock); + CERROR("Device %d setup in progress (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + RETURN(-EEXIST); + } + /* + * just leave this on forever. 
I can't use obd_set_up here because + * other fns check that status, and we're not actually set up yet. + */ + obd->obd_starting = 1; + obd->obd_nid_stats_hash = NULL; + obd->obd_gen_hash = NULL; + spin_unlock(&obd->obd_dev_lock); + + /* create an uuid-export lustre hash */ + err = rhashtable_init(&obd->obd_uuid_hash, &uuid_hash_params); + if (err) + GOTO(err_starting, err); + +#ifdef HAVE_SERVER_SUPPORT + /* create a nid-export lustre hash */ + err = rhltable_init(&obd->obd_nid_hash, &nid_hash_params); + if (err) + GOTO(err_uuid_hash, err = -ENOMEM); + + /* create a nid-stats lustre hash */ + obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS", + HASH_NID_STATS_CUR_BITS, + HASH_NID_STATS_MAX_BITS, + HASH_NID_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_stat_hash_ops, + CFS_HASH_DEFAULT); + if (!obd->obd_nid_stats_hash) + GOTO(err_nid_hash, err = -ENOMEM); + + /* create a client_generation-export lustre hash */ + obd->obd_gen_hash = cfs_hash_create("UUID_HASH", + HASH_GEN_CUR_BITS, + HASH_GEN_MAX_BITS, + HASH_GEN_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &gen_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_gen_hash) + GOTO(err_nid_stats_hash, err = -ENOMEM); +#endif /* HAVE_SERVER_SUPPORT */ + + err = obd_setup(obd, lcfg); + if (err) +#ifdef HAVE_SERVER_SUPPORT + GOTO(err_gen_hash, err); +#else + GOTO(err_uuid_hash, err); +#endif /* ! HAVE_SERVER_SUPPORT */ + + obd->obd_set_up = 1; + + spin_lock(&obd->obd_dev_lock); + /* cleanup drops this */ + class_incref(obd, "setup", obd); + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + RETURN(0); + +#ifdef HAVE_SERVER_SUPPORT +err_gen_hash: + if (obd->obd_gen_hash) { + cfs_hash_putref(obd->obd_gen_hash); + obd->obd_gen_hash = NULL; + } +err_nid_stats_hash: + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } +err_nid_hash: + rhltable_destroy(&obd->obd_nid_hash); +#endif /* HAVE_SERVER_SUPPORT */ +err_uuid_hash: + rhashtable_destroy(&obd->obd_uuid_hash); +err_starting: + obd->obd_starting = 0; + CERROR("setup %s failed (%d)\n", obd->obd_name, err); + return err; +} +EXPORT_SYMBOL(class_setup); + +/** + * We have finished using this OBD and are ready to destroy it. + * There can be no more references to this obd. + */ +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + ENTRY; + + if (obd->obd_set_up) { + CERROR("OBD device %d still set up\n", obd->obd_minor); + RETURN(-EBUSY); + } + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_attached) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD device %d not attached\n", obd->obd_minor); + RETURN(-ENODEV); + } + obd->obd_attached = 0; + spin_unlock(&obd->obd_dev_lock); + + /* cleanup in progress. we don't like to find this device after now */ + class_unregister_device(obd); + + CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + class_decref(obd, "newdev", obd); + + RETURN(0); +} +EXPORT_SYMBOL(class_detach); + +/** + * Start shutting down the OBD. There may be in-progess ops when + * this is called. We tell them to start shutting down with a call + * to class_disconnect_exports(). 
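class_setup() above unwinds partial initialization through a ladder of error labels (err_gen_hash, err_nid_stats_hash, err_nid_hash, err_uuid_hash, err_starting), releasing resources in reverse order of acquisition. A standalone sketch of that idiom with placeholder resources:

#include <stdio.h>
#include <stdlib.h>

struct device {
	void *uuid_table;
	void *nid_table;
	FILE *log;
};

static int device_setup(struct device *dev)
{
	dev->uuid_table = malloc(64);
	if (!dev->uuid_table)
		goto err_out;

	dev->nid_table = malloc(64);
	if (!dev->nid_table)
		goto err_uuid;

	dev->log = fopen("/tmp/device.log", "w");
	if (!dev->log)
		goto err_nid;

	return 0;			/* fully set up */

err_nid:				/* undo only what was acquired */
	free(dev->nid_table);
	dev->nid_table = NULL;
err_uuid:
	free(dev->uuid_table);
	dev->uuid_table = NULL;
err_out:
	fprintf(stderr, "setup failed\n");
	return -1;
}

int main(void)
{
	struct device dev = { 0 };

	if (device_setup(&dev))
		return 1;
	/* normal teardown, mirroring the error path in reverse */
	fclose(dev.log);
	free(dev.nid_table);
	free(dev.uuid_table);
	return 0;
}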
+ */ +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + char *flag; + ENTRY; + + OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS); + + if (!obd->obd_set_up) { + CERROR("Device %d not setup\n", obd->obd_minor); + RETURN(-ENODEV); + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD %d already stopping\n", obd->obd_minor); + RETURN(-ENODEV); + } + /* Leave this on forever */ + obd->obd_stopping = 1; + spin_unlock(&obd->obd_dev_lock); + + /* wait for already-arrived-connections to finish. */ + while (obd->obd_conn_inprogress > 0) + yield(); + smp_rmb(); + + if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) { + for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++) + switch (*flag) { + case 'F': + obd->obd_force = 1; + break; + case 'A': + LCONSOLE_WARN("Failing over %s\n", + obd->obd_name); + spin_lock(&obd->obd_dev_lock); + obd->obd_fail = 1; +#ifdef HAVE_SERVER_SUPPORT + obd->obd_no_transno = 1; +#endif + obd->obd_no_recov = 1; + spin_unlock(&obd->obd_dev_lock); + if (OBP(obd, iocontrol)) { + obd_iocontrol(OBD_IOC_SYNC, + obd->obd_self_export, + 0, NULL, NULL); + } + break; + default: + CERROR("Unrecognised flag '%c'\n", *flag); + } + } + + LASSERT(obd->obd_self_export); + + CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d/%d\n", + obd->obd_name, obd->obd_num_exports, + atomic_read(&obd->obd_refcount) - 2); + dump_exports(obd, 0, D_HA); + class_disconnect_exports(obd); + + /* Precleanup, we must make sure all exports get destroyed. */ + err = obd_precleanup(obd); + if (err) + CERROR("Precleanup %s returned %d\n", + obd->obd_name, err); + + /* destroy an uuid-export hash body */ + rhashtable_free_and_destroy(&obd->obd_uuid_hash, obd_export_exit, + NULL); +#ifdef HAVE_SERVER_SUPPORT + /* destroy a nid-export hash body */ + rhltable_free_and_destroy(&obd->obd_nid_hash, nid_export_exit, NULL); + + /* destroy a nid-stats hash body */ + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + + /* destroy a client_generation-export hash body */ + if (obd->obd_gen_hash) { + cfs_hash_putref(obd->obd_gen_hash); + obd->obd_gen_hash = NULL; + } +#endif /* HAVE_SERVER_SUPPORT */ + class_decref(obd, "setup", obd); + obd->obd_set_up = 0; + + RETURN(0); +} + +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, + const void *source) +{ + lu_ref_add_atomic(&obd->obd_reference, scope, source); + atomic_inc(&obd->obd_refcount); + CDEBUG(D_INFO, "incref %s (%p) now %d - %s\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount), scope); + + return obd; +} +EXPORT_SYMBOL(class_incref); + +void class_decref(struct obd_device *obd, const char *scope, const void *source) +{ + int last; + + CDEBUG(D_INFO, "Decref %s (%p) now %d - %s\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount), scope); + + LASSERT(obd->obd_num_exports >= 0); + last = atomic_dec_and_test(&obd->obd_refcount); + lu_ref_del(&obd->obd_reference, scope, source); + + if (last) { + struct obd_export *exp; + + LASSERT(!obd->obd_attached); + /* + * All exports have been destroyed; there should + * be no more in-progress ops by this point. + */ + exp = obd->obd_self_export; + + if (exp) { + exp->exp_flags |= exp_flags_from_obd(obd); + class_unlink_export(exp); + } + } +} +EXPORT_SYMBOL(class_decref); + +/** + * Add a failover NID location. + * Client OBD types contact server OBD types using this NID list. 
+ */ +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) { + CERROR("can't add connection on non-client dev\n"); + RETURN(-EINVAL); + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to add conn on immature client dev\n"); + RETURN(-EINVAL); + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num); + + RETURN(rc); +} + +/** Remove a failover NID location. */ +static int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("can't del connection on non-client dev\n"); + RETURN(-EINVAL); + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to del conn on immature client dev\n"); + RETURN(-EINVAL); + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_del_conn(imp, &uuid); + + RETURN(rc); +} + +static LIST_HEAD(lustre_profile_list); +static DEFINE_SPINLOCK(lustre_profile_list_lock); + +static struct lustre_profile *class_get_profile_nolock(const char *prof) +{ + struct lustre_profile *lprof; + + ENTRY; + list_for_each_entry(lprof, &lustre_profile_list, lp_list) { + if (strcmp(lprof->lp_profile, prof) == 0) { + lprof->lp_refs++; + RETURN(lprof); + } + } + RETURN(NULL); +} + +struct lustre_profile *class_get_profile(const char *prof) +{ + struct lustre_profile *lprof; + + ENTRY; + spin_lock(&lustre_profile_list_lock); + lprof = class_get_profile_nolock(prof); + spin_unlock(&lustre_profile_list_lock); + RETURN(lprof); +} +EXPORT_SYMBOL(class_get_profile); + +/** + * Create a named "profile". + * This defines the MDC and OSC names to use for a client. + * This also is used to define the LOV to be used by a MDT. 
+ */ +static int class_add_profile(int proflen, char *prof, int osclen, char *osc, + int mdclen, char *mdc) +{ + struct lustre_profile *lprof; + int err = 0; + + ENTRY; + + CDEBUG(D_CONFIG, "Add profile %s\n", prof); + + OBD_ALLOC(lprof, sizeof(*lprof)); + if (!lprof) + RETURN(-ENOMEM); + INIT_LIST_HEAD(&lprof->lp_list); + + LASSERT(proflen == (strlen(prof) + 1)); + OBD_ALLOC(lprof->lp_profile, proflen); + if (!lprof->lp_profile) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_profile, prof, proflen); + + LASSERT(osclen == (strlen(osc) + 1)); + OBD_ALLOC(lprof->lp_dt, osclen); + if (!lprof->lp_dt) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_dt, osc, osclen); + + if (mdclen > 0) { + LASSERT(mdclen == (strlen(mdc) + 1)); + OBD_ALLOC(lprof->lp_md, mdclen); + if (!lprof->lp_md) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_md, mdc, mdclen); + } + + spin_lock(&lustre_profile_list_lock); + lprof->lp_refs = 1; + lprof->lp_list_deleted = false; + + list_add(&lprof->lp_list, &lustre_profile_list); + spin_unlock(&lustre_profile_list_lock); + RETURN(err); + +out: + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, mdclen); + if (lprof->lp_dt) + OBD_FREE(lprof->lp_dt, osclen); + if (lprof->lp_profile) + OBD_FREE(lprof->lp_profile, proflen); + OBD_FREE(lprof, sizeof(*lprof)); + RETURN(err); +} + +void class_del_profile(const char *prof) +{ + struct lustre_profile *lprof; + + ENTRY; + + CDEBUG(D_CONFIG, "Del profile %s\n", prof); + + spin_lock(&lustre_profile_list_lock); + lprof = class_get_profile_nolock(prof); + if (lprof) { + /* because get profile increments the ref counter */ + lprof->lp_refs--; + list_del(&lprof->lp_list); + lprof->lp_list_deleted = true; + spin_unlock(&lustre_profile_list_lock); + + class_put_profile(lprof); + } else { + spin_unlock(&lustre_profile_list_lock); + } + EXIT; +} +EXPORT_SYMBOL(class_del_profile); + +void class_put_profile(struct lustre_profile *lprof) +{ + spin_lock(&lustre_profile_list_lock); + if ((--lprof->lp_refs) > 0) { + LASSERT(lprof->lp_refs > 0); + spin_unlock(&lustre_profile_list_lock); + return; + } + spin_unlock(&lustre_profile_list_lock); + + /* confirm not a negative number */ + LASSERT(lprof->lp_refs == 0); + + /* + * At least one class_del_profile/profiles must be called + * on the target profile or lustre_profile_list will corrupt + */ + LASSERT(lprof->lp_list_deleted); + OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1); + OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1); + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1); + OBD_FREE(lprof, sizeof(*lprof)); +} +EXPORT_SYMBOL(class_put_profile); + +/* COMPAT_146 */ +void class_del_profiles(void) +{ + struct lustre_profile *lprof, *n; + ENTRY; + + spin_lock(&lustre_profile_list_lock); + list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) { + list_del(&lprof->lp_list); + lprof->lp_list_deleted = true; + spin_unlock(&lustre_profile_list_lock); + + class_put_profile(lprof); + + spin_lock(&lustre_profile_list_lock); + } + spin_unlock(&lustre_profile_list_lock); + EXIT; +} +EXPORT_SYMBOL(class_del_profiles); + +/* + * We can't call lquota_process_config directly because + * it lives in a module that must be loaded after this one. + */ +#ifdef HAVE_SERVER_SUPPORT +static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL; +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Rename the proc parameter in \a cfg with a new name \a new_name. 
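The profile helpers above combine a spinlock-protected list with per-entry reference counts: the list owns one reference, class_get_profile() takes another, and class_del_profile()/class_put_profile() free the entry only on the final put after it has been unlinked. A userspace sketch of that pattern, using a pthread mutex in place of the spinlock; the structure and names are illustrative, not Lustre's.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct profile {
	struct profile *next;
	char name[32];
	int refs;
	int unlinked;
};

static struct profile *profiles;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static struct profile *profile_get(const char *name)
{
	struct profile *p;

	pthread_mutex_lock(&lock);
	for (p = profiles; p; p = p->next)
		if (!strcmp(p->name, name)) {
			p->refs++;
			break;
		}
	pthread_mutex_unlock(&lock);
	return p;
}

static void profile_put(struct profile *p)
{
	int free_it;

	pthread_mutex_lock(&lock);
	free_it = (--p->refs == 0);
	pthread_mutex_unlock(&lock);

	/* last put frees; the entry must already be off the list */
	if (free_it)
		free(p);
}

static void profile_del(const char *name)
{
	struct profile **pp, *p = NULL;

	pthread_mutex_lock(&lock);
	for (pp = &profiles; *pp; pp = &(*pp)->next)
		if (!strcmp((*pp)->name, name)) {
			p = *pp;
			*pp = p->next;		/* unlink */
			p->unlinked = 1;
			break;
		}
	pthread_mutex_unlock(&lock);

	if (p)
		profile_put(p);			/* drop the list's reference */
}

int main(void)
{
	struct profile *p = calloc(1, sizeof(*p));
	struct profile *got;

	snprintf(p->name, sizeof(p->name), "lustre-client");
	p->refs = 1;			/* the list's reference */
	p->next = profiles;
	profiles = p;

	got = profile_get("lustre-client");
	if (got)
		printf("found %s (refs=%d)\n", got->name, got->refs);
	profile_del("lustre-client");
	if (got)
		profile_put(got);	/* final put frees the entry */
	return 0;
}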
+ * + * \param cfg config structure which contains the proc parameter + * \param new_name new name of the proc parameter + * + * \retval valid-pointer pointer to the newly-allocated config structure + * which contains the renamed proc parameter + * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does + * not contain a proc parameter + * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs + */ +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name) +{ + struct lustre_cfg_bufs *bufs = NULL; + struct lustre_cfg *new_cfg = NULL; + char *param = NULL; + char *new_param = NULL; + char *value = NULL; + int name_len = 0; + int new_len = 0; + + ENTRY; + + if (!cfg || !new_name) + GOTO(out_nocfg, new_cfg = ERR_PTR(-EINVAL)); + + param = lustre_cfg_string(cfg, 1); + if (!param) + GOTO(out_nocfg, new_cfg = ERR_PTR(-EINVAL)); + + value = strchr(param, '='); + if (value) + name_len = value - param; + else + name_len = strlen(param); + + new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len; + + OBD_ALLOC(new_param, new_len); + if (!new_param) + GOTO(out_nocfg, new_cfg = ERR_PTR(-ENOMEM)); + + strlcpy(new_param, new_name, new_len); + if (value) + strcat(new_param, value); + + OBD_ALLOC_PTR(bufs); + if (!bufs) + GOTO(out_free_param, new_cfg = ERR_PTR(-ENOMEM)); + + lustre_cfg_bufs_reset(bufs, NULL); + lustre_cfg_bufs_init(bufs, cfg); + lustre_cfg_bufs_set_string(bufs, 1, new_param); + + OBD_ALLOC(new_cfg, lustre_cfg_len(bufs->lcfg_bufcount, + bufs->lcfg_buflen)); + if (!new_cfg) + GOTO(out_free_buf, new_cfg = ERR_PTR(-ENOMEM)); + + lustre_cfg_init(new_cfg, cfg->lcfg_command, bufs); + + new_cfg->lcfg_num = cfg->lcfg_num; + new_cfg->lcfg_flags = cfg->lcfg_flags; + new_cfg->lcfg_nid = cfg->lcfg_nid; + new_cfg->lcfg_nal = cfg->lcfg_nal; +out_free_buf: + OBD_FREE_PTR(bufs); +out_free_param: + OBD_FREE(new_param, new_len); +out_nocfg: + RETURN(new_cfg); +} +EXPORT_SYMBOL(lustre_cfg_rename); + +static ssize_t process_param2_config(struct lustre_cfg *lcfg) +{ + char *param = lustre_cfg_string(lcfg, 1); + char *upcall = lustre_cfg_string(lcfg, 2); + struct kobject *kobj = NULL; + const char *subsys = param; + char *newparam = NULL; + char *argv[] = { + [0] = "/usr/sbin/lctl", + [1] = "set_param", + [2] = param, + [3] = NULL + }; + ktime_t start; + ktime_t end; + size_t len; + int rc; + + ENTRY; + print_lustre_cfg(lcfg); + + len = strcspn(param, ".="); + if (!len) + RETURN(-EINVAL); + + /* If we find '=' then its the top level sysfs directory */ + if (param[len] == '=') + RETURN(class_set_global(param)); + + subsys = kstrndup(param, len, GFP_KERNEL); + if (!subsys) + RETURN(-ENOMEM); + + kobj = kset_find_obj(lustre_kset, subsys); + kfree(subsys); + if (kobj) { + char *value = param; + char *envp[4]; + int i; + + param = strsep(&value, "="); + envp[0] = kasprintf(GFP_KERNEL, "PARAM=%s", param); + envp[1] = kasprintf(GFP_KERNEL, "SETTING=%s", value); + envp[2] = kasprintf(GFP_KERNEL, "TIME=%lld", + ktime_get_real_seconds()); + envp[3] = NULL; + + rc = kobject_uevent_env(kobj, KOBJ_CHANGE, envp); + for (i = 0; i < ARRAY_SIZE(envp); i++) + kfree(envp[i]); + + kobject_put(kobj); + + RETURN(rc); + } + + /* Add upcall processing here. Now only lctl is supported */ + if (strcmp(upcall, LCTL_UPCALL) != 0) { + CERROR("Unsupported upcall %s\n", upcall); + RETURN(-EINVAL); + } + + /* root_squash and nosquash_nids settings must be applied to + * global subsystem (*.) so that it is taken into account by + * both client and server sides. 
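process_param2_config() above falls back to a userspace upcall when no matching kobject is found, building an argv for lctl and running it with call_usermodehelper(). A kernel-context sketch of that call pattern, not part of this patch; the helper path and the parameter string are placeholders.

/* Sketch only; assumes a kernel-module build environment. */
#include <linux/kmod.h>
#include <linux/printk.h>

static int run_set_param_upcall(char *param)
{
	char *argv[] = {
		"/usr/sbin/lctl",	/* assumed helper path */
		"set_param",
		param,
		NULL
	};
	static char *envp[] = {
		"HOME=/",
		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
		NULL
	};
	int rc;

	/* UMH_WAIT_PROC: block until the helper process exits */
	rc = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	if (rc < 0)
		pr_err("upcall %s %s %s failed: rc = %d\n",
		       argv[0], argv[1], argv[2], rc);
	return rc;
}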
So do the equivalent of a + * 's / mdt. / *. /'. + */ + if ((strstr(param, PARAM_NOSQUASHNIDS) || + strstr(param, PARAM_ROOTSQUASH)) && + (param[0] != '*' || param[1] != '.')) { + newparam = kmalloc(strlen(param) + 1, GFP_NOFS); + if (!newparam) + RETURN(-ENOMEM); + + snprintf(newparam, strlen(param) + 1, "*%s", param + len); + argv[2] = (char *)newparam; + } + + start = ktime_get(); + rc = call_usermodehelper(argv[0], argv, NULL, UMH_WAIT_PROC); + end = ktime_get(); + + if (rc < 0) { + CERROR("lctl: error invoking upcall %s %s %s: rc = %d; " + "time %ldus\n", argv[0], argv[1], argv[2], rc, + (long)ktime_us_delta(end, start)); + } else { + CDEBUG(D_HA, "lctl: invoked upcall %s %s %s, time %ldus\n", + argv[0], argv[1], argv[2], + (long)ktime_us_delta(end, start)); + rc = 0; + } + + kfree(newparam); + RETURN(rc); +} + +#ifdef HAVE_SERVER_SUPPORT +void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg)) +{ + quota_process_config = qpc; +} +EXPORT_SYMBOL(lustre_register_quota_process_config); +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Process configuration commands given in lustre_cfg form. + * These may come from direct calls (e.g. class_manual_cleanup) + * or processing the config llog, or ioctl from lctl. + */ +int class_process_config(struct lustre_cfg *lcfg) +{ + struct obd_device *obd; + int err; + + LASSERT(lcfg && !IS_ERR(lcfg)); + CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command); + + /* Commands that don't need a device */ + switch (lcfg->lcfg_command) { + case LCFG_ATTACH: { + err = class_attach(lcfg); + GOTO(out, err); + } + case LCFG_ADD_UUID: { + CDEBUG(D_IOCTL, + "adding mapping from uuid %s to nid %#llx (%s)\n", + lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid, + libcfs_nid2str(lcfg->lcfg_nid)); + + err = class_add_uuid(lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid); + GOTO(out, err); + } + case LCFG_DEL_UUID: { + CDEBUG(D_IOCTL, "removing mappings for uuid %s\n", + (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == + 0) ? "" : lustre_cfg_string(lcfg, 1)); + + err = class_del_uuid(lustre_cfg_string(lcfg, 1)); + GOTO(out, err); + } + case LCFG_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n", + lustre_cfg_string(lcfg, 1), + lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + /* + * set these mount options somewhere, so ll_fill_super + * can find them. 
+ */ + err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1), + lustre_cfg_string(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 2), + lustre_cfg_string(lcfg, 2), + LUSTRE_CFG_BUFLEN(lcfg, 3), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err); + } + case LCFG_DEL_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s\n", + lustre_cfg_string(lcfg, 1)); + class_del_profile(lustre_cfg_string(lcfg, 1)); + GOTO(out, err = 0); + } + case LCFG_SET_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n", + obd_timeout, lcfg->lcfg_num); + obd_timeout = max(lcfg->lcfg_num, 1U); + obd_timeout_set = 1; + GOTO(out, err = 0); + } + case LCFG_SET_LDLM_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n", + ldlm_timeout, lcfg->lcfg_num); + ldlm_timeout = max(lcfg->lcfg_num, 1U); + if (ldlm_timeout >= obd_timeout) + ldlm_timeout = max(obd_timeout / 3, 1U); + ldlm_timeout_set = 1; + GOTO(out, err = 0); + } + case LCFG_SET_UPCALL: { + LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n"); + /* COMPAT_146 Don't fail on old configs */ + GOTO(out, err = 0); + } + case LCFG_MARKER: { + struct cfg_marker *marker; + + marker = lustre_cfg_buf(lcfg, 1); + CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step, + marker->cm_flags, marker->cm_tgtname, + marker->cm_comment); + GOTO(out, err = 0); + } + case LCFG_PARAM: { + char *tmp; + + /* llite has no OBD */ + if (class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_LLITE, NULL) == 0) { + struct lustre_sb_info *lsi; + unsigned long addr; + ssize_t count; + + /* + * The instance name contains the sb: + * lustre-client-aacfe000 + */ + tmp = strrchr(lustre_cfg_string(lcfg, 0), '-'); + if (!tmp || !*(++tmp)) + GOTO(out, err = -EINVAL); + + if (sscanf(tmp, "%lx", &addr) != 1) + GOTO(out, err = -EINVAL); + + lsi = s2lsi((struct super_block *)addr); + /* This better be a real Lustre superblock! */ + LASSERT(lsi->lsi_lmd->lmd_magic == LMD_MAGIC); + + count = class_modify_config(lcfg, PARAM_LLITE, + lsi->lsi_kobj); + err = count < 0 ? count : 0; + GOTO(out, err); + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_SYS, &tmp) == 0)) { + /* Global param settings */ + err = class_set_global(tmp); + /* + * Client or server should not fail to mount if + * it hits an unknown configuration parameter. 
+ */ + if (err < 0) + CWARN("Ignoring unknown param %s\n", tmp); + + GOTO(out, err = 0); +#ifdef HAVE_SERVER_SUPPORT + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_QUOTA, &tmp) == 0) && + quota_process_config) { + err = (*quota_process_config)(lcfg); + GOTO(out, err); +#endif /* HAVE_SERVER_SUPPORT */ + } + + break; + } + case LCFG_SET_PARAM: { + err = process_param2_config(lcfg); + GOTO(out, err = 0); + } + } + /* Commands that require a device */ + obd = class_name2obd(lustre_cfg_string(lcfg, 0)); + if (!obd) { + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) + CERROR("this lcfg command requires a device name\n"); + else + CERROR("no device for: %s\n", + lustre_cfg_string(lcfg, 0)); + + GOTO(out, err = -EINVAL); + } + switch(lcfg->lcfg_command) { + case LCFG_SETUP: { + err = class_setup(obd, lcfg); + GOTO(out, err); + } + case LCFG_DETACH: { + err = class_detach(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_CLEANUP: { + err = class_cleanup(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_ADD_CONN: { + err = class_add_conn(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_DEL_CONN: { + err = class_del_conn(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_POOL_NEW: { + err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + } + case LCFG_POOL_ADD: { + err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + } + case LCFG_POOL_REM: { + err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + } + case LCFG_POOL_DEL: { + err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + } + /* + * Process config log ADD_MDC record twice to add MDC also to LOV + * for Data-on-MDT: + * + * add 0:lustre-clilmv 1:lustre-MDT0000_UUID 2:0 3:1 + * 4:lustre-MDT0000-mdc_UUID + */ + case LCFG_ADD_MDC: { + struct obd_device *lov_obd; + char *clilmv; + + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + if (err) + GOTO(out, err); + + /* make sure this is client LMV log entry */ + clilmv = strstr(lustre_cfg_string(lcfg, 0), "clilmv"); + if (!clilmv) + GOTO(out, err); + + /* + * replace 'lmv' with 'lov' name to address LOV device and + * process llog record to add MDC there. + */ + clilmv[4] = 'o'; + lov_obd = class_name2obd(lustre_cfg_string(lcfg, 0)); + if (lov_obd) { + err = obd_process_config(lov_obd, sizeof(*lcfg), lcfg); + } else { + err = -ENOENT; + CERROR("%s: Cannot find LOV by %s name, rc = %d\n", + obd->obd_name, lustre_cfg_string(lcfg, 0), err); + } + /* restore 'lmv' name */ + clilmv[4] = 'm'; + GOTO(out, err); + } + default: { + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + GOTO(out, err); + } + } + EXIT; +out: + if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) { + CWARN("Ignoring error %d on optional command %#x\n", err, + lcfg->lcfg_command); + err = 0; + } + return err; +} +EXPORT_SYMBOL(class_process_config); + +ssize_t class_modify_config(struct lustre_cfg *lcfg, const char *prefix, + struct kobject *kobj) +{ + const struct kobj_type *typ; + ssize_t count = 0; + int i; + + if (lcfg->lcfg_command != LCFG_PARAM) { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + return -EINVAL; + } + + typ = get_ktype(kobj); + if (!typ || !typ->default_groups) + return -ENODEV; + + print_lustre_cfg(lcfg); + + /* + * e.g. 
tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt + * or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar + * or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 + */ + for (i = 1; i < lcfg->lcfg_bufcount; i++) { + struct attribute *attr = NULL; + size_t keylen; + char *value; + char *key; + + key = lustre_cfg_buf(lcfg, i); + /* Strip off prefix */ + if (class_match_param(key, prefix, &key)) + /* + * If the prefix doesn't match, return error so we + * can pass it down the stack + */ + return -EINVAL; + + value = strchr(key, '='); + if (!value || *(value + 1) == 0) { + CERROR("%s: can't parse param '%s' (missing '=')\n", + lustre_cfg_string(lcfg, 0), + lustre_cfg_string(lcfg, i)); + /* continue parsing other params */ + continue; + } + keylen = value - key; + value++; + + attr = get_attr_starts_with(typ, key, keylen); + if (!attr) { + char *envp[4], *param, *path; + + path = kobject_get_path(kobj, GFP_KERNEL); + if (!path) + return -EINVAL; + + /* convert sysfs path to uevent format */ + param = path; + while ((param = strchr(param, '/')) != NULL) + *param = '.'; + + param = strstr(path, "fs.lustre.") + 10; + + envp[0] = kasprintf(GFP_KERNEL, "PARAM=%s.%.*s", + param, (int) keylen, key); + envp[1] = kasprintf(GFP_KERNEL, "SETTING=%s", value); + envp[2] = kasprintf(GFP_KERNEL, "TIME=%lld", + ktime_get_real_seconds()); + envp[3] = NULL; + + if (kobject_uevent_env(kobj, KOBJ_CHANGE, envp)) { + CERROR("%s: failed to send uevent %s\n", + kobject_name(kobj), key); + } + + for (i = 0; i < ARRAY_SIZE(envp); i++) + kfree(envp[i]); + kfree(path); + } else { + count += lustre_attr_store(kobj, attr, value, + strlen(value)); + } + } + return count; +} +EXPORT_SYMBOL(class_modify_config); + +/* + * Supplemental functions for config logs, it allocates lustre_cfg + * buffers plus initialized llog record header at the beginning. + */ +struct llog_cfg_rec *lustre_cfg_rec_new(int cmd, struct lustre_cfg_bufs *bufs) +{ + struct llog_cfg_rec *lcr; + int reclen; + + ENTRY; + + reclen = lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen); + reclen = llog_data_len(reclen) + sizeof(struct llog_rec_hdr) + + sizeof(struct llog_rec_tail); + + OBD_ALLOC(lcr, reclen); + if (!lcr) + RETURN(NULL); + + lustre_cfg_init(&lcr->lcr_cfg, cmd, bufs); + + lcr->lcr_hdr.lrh_len = reclen; + lcr->lcr_hdr.lrh_type = OBD_CFG_REC; + + RETURN(lcr); +} +EXPORT_SYMBOL(lustre_cfg_rec_new); + +void lustre_cfg_rec_free(struct llog_cfg_rec *lcr) +{ + ENTRY; + OBD_FREE(lcr, lcr->lcr_hdr.lrh_len); + EXIT; +} +EXPORT_SYMBOL(lustre_cfg_rec_free); + +/** + * Parse a configuration llog, doing various manipulations on them + * for various reasons, (modifications for compatibility, skip obsolete + * records, change uuids, etc), then class_process_config() resulting + * net records. 
+ */ +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct config_llog_instance *cfg = data; + int cfg_len = rec->lrh_len; + char *cfg_buf = (char *) (rec + 1); + int rc = 0; + ENTRY; + + /* class_config_dump_handler(handle, rec, data); */ + + switch (rec->lrh_type) { + case OBD_CFG_REC: { + struct lustre_cfg *lcfg, *lcfg_new; + struct lustre_cfg_bufs bufs; + char *inst_name = NULL; + int inst_len = 0; + int swab = 0; + + lcfg = (struct lustre_cfg *)cfg_buf; + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); + if (rc) + GOTO(out, rc); + + /* Figure out config state info */ + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + lustre_swab_cfg_marker(marker, swab, + LUSTRE_CFG_BUFLEN(lcfg, 1)); + CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n", + cfg->cfg_flags, marker->cm_flags); + if (marker->cm_flags & CM_START) { + /* all previous flags off */ + cfg->cfg_flags = CFG_F_MARKER; + server_name2index(marker->cm_tgtname, + &cfg->cfg_lwp_idx, NULL); + if (marker->cm_flags & CM_SKIP) { + cfg->cfg_flags |= CFG_F_SKIP; + CDEBUG(D_CONFIG, "SKIP #%d\n", + marker->cm_step); + } else if ((marker->cm_flags & CM_EXCLUDE) || + (cfg->cfg_sb && + lustre_check_exclusion(cfg->cfg_sb, + marker->cm_tgtname))) { + cfg->cfg_flags |= CFG_F_EXCLUDE; + CDEBUG(D_CONFIG, "EXCLUDE %d\n", + marker->cm_step); + } + } else if (marker->cm_flags & CM_END) { + cfg->cfg_flags = 0; + } + } + /* + * A config command without a start marker before it is + * illegal + */ + if (!(cfg->cfg_flags & CFG_F_MARKER) && + (lcfg->lcfg_command != LCFG_MARKER)) { + CWARN("Skip config outside markers, (inst: %016lx, uuid: %s, flags: %#x)\n", + cfg->cfg_instance, + cfg->cfg_uuid.uuid, cfg->cfg_flags); + cfg->cfg_flags |= CFG_F_SKIP; + } + if (cfg->cfg_flags & CFG_F_SKIP) { + CDEBUG(D_CONFIG, "skipping %#x\n", + cfg->cfg_flags); + rc = 0; + /* No processing! */ + break; + } + + /* + * For interoperability between 1.8 and 2.0, + * rename "mds" OBD device type to "mdt". 
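class_config_llog_handler() above gates every record on marker state: CM_START opens a section (optionally flagged skip or exclude), CM_END closes it, and records outside a marker or inside a skipped section are not processed. A standalone sketch of that filtering, with simplified flags and record layout:

#include <stdio.h>

enum { CM_START = 1, CM_END = 2, CM_SKIP = 4 };
enum { F_MARKER = 1, F_SKIP = 2 };

struct rec {
	int is_marker;
	int marker_flags;	/* CM_* when is_marker is set */
	const char *payload;
};

static void replay(const struct rec *recs, int n)
{
	unsigned int flags = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (recs[i].is_marker) {
			if (recs[i].marker_flags & CM_START) {
				flags = F_MARKER;	/* previous flags off */
				if (recs[i].marker_flags & CM_SKIP)
					flags |= F_SKIP;
			} else if (recs[i].marker_flags & CM_END) {
				flags = 0;
			}
			continue;
		}
		if (!(flags & F_MARKER) || (flags & F_SKIP)) {
			printf("skip: %s\n", recs[i].payload);
			continue;
		}
		printf("process: %s\n", recs[i].payload);
	}
}

int main(void)
{
	const struct rec cfg_log[] = {
		{ 1, CM_START, NULL },
		{ 0, 0, "attach lustre-OST0000-osc" },
		{ 1, CM_END, NULL },
		{ 1, CM_START | CM_SKIP, NULL },
		{ 0, 0, "setup for a skipped target" },
		{ 1, CM_END, NULL },
		{ 0, 0, "record outside any marker" },
	};

	replay(cfg_log, sizeof(cfg_log) / sizeof(cfg_log[0]));
	return 0;
}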
+ */ + { + char *typename = lustre_cfg_string(lcfg, 1); + char *index = lustre_cfg_string(lcfg, 2); + + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, "mds") == 0)) { + CWARN("For 1.8 interoperability, rename obd " + "type from mds to mdt\n"); + typename[2] = 't'; + } + if ((lcfg->lcfg_command == LCFG_SETUP && index && + strcmp(index, "type") == 0)) { + CDEBUG(D_INFO, "For 1.8 interoperability, " + "set this index to '0'\n"); + index[0] = '0'; + index[1] = 0; + } + } + +#ifdef HAVE_SERVER_SUPPORT + /* newer MDS replaces LOV/OSC with LOD/OSP */ + if ((lcfg->lcfg_command == LCFG_ATTACH || + lcfg->lcfg_command == LCFG_SET_PARAM || + lcfg->lcfg_command == LCFG_PARAM) && + cfg->cfg_sb && IS_MDT(s2lsi(cfg->cfg_sb))) { + char *typename = lustre_cfg_string(lcfg, 1); + + if (typename && + strcmp(typename, LUSTRE_LOV_NAME) == 0) { + CDEBUG(D_CONFIG, + "For 2.x interoperability, rename obd " + "type from lov to lod (%s)\n", + s2lsi(cfg->cfg_sb)->lsi_svname); + strcpy(typename, LUSTRE_LOD_NAME); + } + if (typename && + strcmp(typename, LUSTRE_OSC_NAME) == 0) { + CDEBUG(D_CONFIG, + "For 2.x interoperability, rename obd " + "type from osc to osp (%s)\n", + s2lsi(cfg->cfg_sb)->lsi_svname); + strcpy(typename, LUSTRE_OSP_NAME); + } + } +#endif /* HAVE_SERVER_SUPPORT */ + + if (cfg->cfg_flags & CFG_F_EXCLUDE) { + CDEBUG(D_CONFIG, "cmd: %x marked EXCLUDED\n", + lcfg->lcfg_command); + if (lcfg->lcfg_command == LCFG_LOV_ADD_OBD) + /* Add inactive instead */ + lcfg->lcfg_command = LCFG_LOV_ADD_INA; + } + + lustre_cfg_bufs_reset(&bufs, NULL); + lustre_cfg_bufs_init(&bufs, lcfg); + + if (cfg->cfg_instance && + lcfg->lcfg_command != LCFG_SPTLRPC_CONF && + LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { + inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + + LUSTRE_MAXINSTANCE + 4; + OBD_ALLOC(inst_name, inst_len); + if (!inst_name) + GOTO(out, rc = -ENOMEM); + snprintf(inst_name, inst_len, "%s-%016lx", + lustre_cfg_string(lcfg, 0), + cfg->cfg_instance); + lustre_cfg_bufs_set_string(&bufs, 0, inst_name); + CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n", + lcfg->lcfg_command, inst_name); + } + + /* override llog UUID for clients, to insure they are unique */ + if (cfg->cfg_instance && lcfg->lcfg_command == LCFG_ATTACH) + lustre_cfg_bufs_set_string(&bufs, 2, + cfg->cfg_uuid.uuid); + /* + * sptlrpc config record, we expect 2 data segments: + * [0]: fs_name/target_name, + * [1]: rule string + * moving them to index [1] and [2], and insert MGC's + * obdname at index [0]. + */ + if (cfg->cfg_instance && + lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { + /* After ASLR changes cfg_instance this needs fixing */ + /* "obd" is set in config_log_find_or_add() */ + struct obd_device *obd = (void *)cfg->cfg_instance; + + lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], + bufs.lcfg_buflen[1]); + lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0], + bufs.lcfg_buflen[0]); + lustre_cfg_bufs_set_string(&bufs, 0, + obd->obd_name); + } + + /* + * Add net info to setup command + * if given on command line. 
+ * So config log will be: + * [0]: client name + * [1]: client UUID + * [2]: server UUID + * [3]: inactive-on-startup + * [4]: restrictive net + */ + if (cfg && cfg->cfg_sb && s2lsi(cfg->cfg_sb) && + !IS_SERVER(s2lsi(cfg->cfg_sb))) { + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + char *nidnet = lsi->lsi_lmd->lmd_nidnet; + + if (lcfg->lcfg_command == LCFG_SETUP && + lcfg->lcfg_bufcount != 2 && nidnet) { + CDEBUG(D_CONFIG, "Adding net %s info to setup " + "command for client %s\n", nidnet, + lustre_cfg_string(lcfg, 0)); + lustre_cfg_bufs_set_string(&bufs, 4, nidnet); + } + } + + OBD_ALLOC(lcfg_new, lustre_cfg_len(bufs.lcfg_bufcount, + bufs.lcfg_buflen)); + if (!lcfg_new) + GOTO(out, rc = -ENOMEM); + + lustre_cfg_init(lcfg_new, lcfg->lcfg_command, &bufs); + lcfg_new->lcfg_num = lcfg->lcfg_num; + lcfg_new->lcfg_flags = lcfg->lcfg_flags; + + /* + * XXX Hack to try to remain binary compatible with + * pre-newconfig logs + */ + if (lcfg->lcfg_nal != 0 && /* pre-newconfig log? */ + (lcfg->lcfg_nid >> 32) == 0) { + __u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff); + + lcfg_new->lcfg_nid = + LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr); + CWARN("Converted pre-newconfig NAL %d NID %x to %s\n", + lcfg->lcfg_nal, addr, + libcfs_nid2str(lcfg_new->lcfg_nid)); + } else { + lcfg_new->lcfg_nid = lcfg->lcfg_nid; + } + + lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */ + + rc = class_process_config(lcfg_new); + OBD_FREE(lcfg_new, lustre_cfg_len(lcfg_new->lcfg_bufcount, + lcfg_new->lcfg_buflens)); + if (inst_name) + OBD_FREE(inst_name, inst_len); + break; + } + default: + CERROR("Unknown llog record type %#x encountered\n", + rec->lrh_type); + break; + } +out: + if (rc) { + CERROR("%s: cfg command failed: rc = %d\n", + handle->lgh_ctxt->loc_obd->obd_name, rc); + class_config_dump_handler(NULL, handle, rec, data); + } + RETURN(rc); +} +EXPORT_SYMBOL(class_config_llog_handler); + +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg) +{ + struct llog_process_cat_data cd = { + .lpcd_first_idx = 0, + .lpcd_read_mode = LLOG_READ_MODE_NORMAL, + }; + struct llog_handle *llh; + llog_cb_t callback; + int rc; + ENTRY; + + CDEBUG(D_INFO, "looking up llog %s\n", name); + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(parse_out, rc); + + /* continue processing from where we last stopped to end-of-log */ + if (cfg) { + cd.lpcd_first_idx = cfg->cfg_last_idx; + callback = cfg->cfg_callback; + LASSERT(callback != NULL); + } else { + callback = class_config_llog_handler; + } + + cd.lpcd_last_idx = 0; + + rc = llog_process(env, llh, callback, cfg, &cd); + + CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name, + cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc); + if (cfg) + cfg->cfg_last_idx = cd.lpcd_last_idx; + +parse_out: + llog_close(env, llh); + RETURN(rc); +} +EXPORT_SYMBOL(class_config_parse_llog); + +/** + * Get marker cfg_flag + */ +void llog_get_marker_cfg_flags(struct llog_rec_hdr *rec, + unsigned int *cfg_flags) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + struct cfg_marker *marker; + + if (lcfg->lcfg_command == LCFG_MARKER) { + marker = lustre_cfg_buf(lcfg, 1); + if (marker->cm_flags & CM_START) { + *cfg_flags = CFG_F_MARKER; + if (marker->cm_flags & CM_SKIP) + *cfg_flags = CFG_F_SKIP; + } else if (marker->cm_flags & CM_END) { + *cfg_flags = 0; + } + CDEBUG(D_INFO, "index=%d, 
cm_flags=%#08x cfg_flags=%#08x\n", + rec->lrh_index, marker->cm_flags, *cfg_flags); + } +} + +/** + * Parse config record and output dump in supplied buffer. + * + * This is separated from class_config_dump_handler() to use + * for ioctl needs as well + * + * Sample Output: + * - { index: 4, event: attach, device: lustrewt-clilov, type: lov, + * UUID: lustrewt-clilov_UUID } + */ +int class_config_yaml_output(struct llog_rec_hdr *rec, char *buf, int size, + unsigned int *cfg_flags, bool raw) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + char *ptr = buf; + char *end = buf + size; + int rc = 0, i; + struct lcfg_type_data *ldata; + int swab = 0; + + LASSERT(rec->lrh_type == OBD_CFG_REC); + + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); + if (rc < 0) + return rc; + + ldata = lcfg_cmd2data(lcfg->lcfg_command); + if (!ldata) + return -ENOTTY; + + llog_get_marker_cfg_flags(rec, cfg_flags); + if ((lcfg->lcfg_command == LCFG_MARKER) && likely(!raw)) + return 0; + /* entries outside marker are skipped */ + if (!(*cfg_flags & CFG_F_MARKER) && !raw) + return 0; + /* inside skipped marker */ + if ((*cfg_flags & CFG_F_SKIP) && !raw) + return 0; + + /* form YAML entity */ + ptr += snprintf(ptr, end - ptr, "- { index: %u, event: %s", + rec->lrh_index, ldata->ltd_name); + if (end - ptr <= 0) + goto out_overflow; + + if (lcfg->lcfg_flags) { + ptr += snprintf(ptr, end - ptr, ", flags: %#08x", + lcfg->lcfg_flags); + if (end - ptr <= 0) + goto out_overflow; + } + if (lcfg->lcfg_num) { + ptr += snprintf(ptr, end - ptr, ", num: %#08x", + lcfg->lcfg_num); + if (end - ptr <= 0) + goto out_overflow; + } + if (lcfg->lcfg_nid) { + char nidstr[LNET_NIDSTR_SIZE]; + + libcfs_nid2str_r(lcfg->lcfg_nid, nidstr, sizeof(nidstr)); + ptr += snprintf(ptr, end - ptr, ", nid: %s(%#llx)", + nidstr, lcfg->lcfg_nid); + if (end - ptr <= 0) + goto out_overflow; + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { + ptr += snprintf(ptr, end - ptr, ", device: %s", + lustre_cfg_string(lcfg, 0)); + if (end - ptr <= 0) + goto out_overflow; + } + + if (lcfg->lcfg_command == LCFG_SET_PARAM) { + /* + * set_param -P parameters have param=val here, separate + * them through pointer magic and print them out in + * native yamlese + */ + char *cfg_str = lustre_cfg_string(lcfg, 1); + char *tmp = strchr(cfg_str, '='); + size_t len; + + if (!tmp) + goto out_done; + + ptr += snprintf(ptr, end - ptr, ", %s: ", ldata->ltd_bufs[0]); + len = tmp - cfg_str + 1; + snprintf(ptr, len, "%s", cfg_str); + ptr += len - 1; + + ptr += snprintf(ptr, end - ptr, ", %s: ", ldata->ltd_bufs[1]); + ptr += snprintf(ptr, end - ptr, "%s", tmp + 1); + + goto out_done; + } + + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker; + + marker = lustre_cfg_buf(lcfg, 1); + ptr += snprintf(ptr, end - ptr, ", flags: %#04x", + marker->cm_flags); + ptr += snprintf(ptr, end - ptr, ", version: %d.%d.%d.%d", + OBD_OCD_VERSION_MAJOR(marker->cm_vers), + OBD_OCD_VERSION_MINOR(marker->cm_vers), + OBD_OCD_VERSION_PATCH(marker->cm_vers), + OBD_OCD_VERSION_FIX(marker->cm_vers)); + ptr += snprintf(ptr, end - ptr, ", createtime: %lld", + marker->cm_createtime); + ptr += snprintf(ptr, end - ptr, ", canceltime: %lld", + marker->cm_canceltime); + + goto out_done; + } + + for (i = 1; i < lcfg->lcfg_bufcount; i++) { + if (LUSTRE_CFG_BUFLEN(lcfg, i) > 0) { + ptr += snprintf(ptr, end - ptr, ", %s: %s", + ldata->ltd_bufs[i - 1], + lustre_cfg_string(lcfg, i)); + if 
(end - ptr <= 0) + goto out_overflow; + } + } + +out_done: + ptr += snprintf(ptr, end - ptr, " }\n"); +out_overflow: + /* Return consumed bytes. If the buffer overflowed, zero last byte */ + rc = ptr - buf; + if (rc > size) { + rc = -EOVERFLOW; + *(end - 1) = '\0'; + } + + return rc; +} + +/** + * parse config record and output dump in supplied buffer. + * This is separated from class_config_dump_handler() to use + * for ioctl needs as well + */ +static int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + char *ptr = buf; + char *end = buf + size; + int rc = 0; + + ENTRY; + + LASSERT(rec->lrh_type == OBD_CFG_REC); + rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); + if (rc < 0) + RETURN(rc); + + ptr += snprintf(ptr, end-ptr, "cmd=%05x ", lcfg->lcfg_command); + if (lcfg->lcfg_flags) + ptr += snprintf(ptr, end-ptr, "flags=%#08x ", + lcfg->lcfg_flags); + + if (lcfg->lcfg_num) + ptr += snprintf(ptr, end-ptr, "num=%#08x ", lcfg->lcfg_num); + + if (lcfg->lcfg_nid) { + char nidstr[LNET_NIDSTR_SIZE]; + + libcfs_nid2str_r(lcfg->lcfg_nid, nidstr, sizeof(nidstr)); + ptr += snprintf(ptr, end-ptr, "nid=%s(%#llx) ", + nidstr, lcfg->lcfg_nid); + } + + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + + ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'", + marker->cm_step, marker->cm_flags, + marker->cm_tgtname, marker->cm_comment); + } else { + int i; + + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + ptr += snprintf(ptr, end-ptr, "%d:%s ", i, + lustre_cfg_string(lcfg, i)); + } + } + ptr += snprintf(ptr, end - ptr, "\n"); + /* return consumed bytes */ + rc = ptr - buf; + RETURN(rc); +} + +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + char *outstr; + int rc = 0; + + ENTRY; + + OBD_ALLOC(outstr, 256); + if (!outstr) + RETURN(-ENOMEM); + + if (rec->lrh_type == OBD_CFG_REC) { + class_config_parse_rec(rec, outstr, 256); + LCONSOLE(D_WARNING, " %s\n", outstr); + } else { + LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type); + rc = -EINVAL; + } + + OBD_FREE(outstr, 256); + RETURN(rc); +} + +/** + * Call class_cleanup and class_detach. + * "Manual" only in the sense that we're faking lcfg commands. 
+ */ +int class_manual_cleanup(struct obd_device *obd) +{ + char flags[3] = ""; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + int rc; + + ENTRY; + + if (!obd) { + CERROR("empty cleanup\n"); + RETURN(-EALREADY); + } + + if (obd->obd_force) + strlcat(flags, "F", sizeof(flags)); + if (obd->obd_fail) + strlcat(flags, "A", sizeof(flags)); + + CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n", + obd->obd_name, flags); + + lustre_cfg_bufs_reset(&bufs, obd->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, flags); + OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen)); + if (!lcfg) + RETURN(-ENOMEM); + lustre_cfg_init(lcfg, LCFG_CLEANUP, &bufs); + + rc = class_process_config(lcfg); + if (rc) { + CERROR("cleanup failed %d: %s\n", rc, obd->obd_name); + GOTO(out, rc); + } + + /* the lcfg is almost the same for both ops */ + lcfg->lcfg_command = LCFG_DETACH; + rc = class_process_config(lcfg); + if (rc) + CERROR("detach failed %d: %s\n", rc, obd->obd_name); +out: + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); + RETURN(rc); +} +EXPORT_SYMBOL(class_manual_cleanup); + +#ifdef HAVE_SERVER_SUPPORT +/* + * nid<->nidstats hash operations + */ +static unsigned +nidstats_hash(struct cfs_hash *hs, const void *key, unsigned int mask) +{ + return cfs_hash_djb2_hash(key, sizeof(struct lnet_nid), mask); +} + +static void * +nidstats_key(struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + + return &ns->nid; +} + +static int +nidstats_keycmp(const void *key, struct hlist_node *hnode) +{ + return nid_same((struct lnet_nid *)nidstats_key(hnode), + (struct lnet_nid *)key); +} + +static void * +nidstats_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nid_stat, nid_hash); +} + +static void +nidstats_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_getref(ns); +} + +static void +nidstats_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_putref(ns); +} + +static struct cfs_hash_ops nid_stat_hash_ops = { + .hs_hash = nidstats_hash, + .hs_key = nidstats_key, + .hs_keycmp = nidstats_keycmp, + .hs_object = nidstats_object, + .hs_get = nidstats_get, + .hs_put_locked = nidstats_put_locked, +}; + +/* + * client_generation<->export hash operations + */ + +static unsigned +gen_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(__u32), mask); +} + +static void * +gen_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + + RETURN(&exp->exp_target_data.ted_lcd->lcd_generation); +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +gen_kepcmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + + RETURN(exp->exp_target_data.ted_lcd->lcd_generation == *(__u32 *)key && + !exp->exp_failed); +} + +static void * +gen_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_gen_hash); +} + +static void +gen_export_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + class_export_get(exp); +} + +static 
void +gen_export_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + class_export_put(exp); +} + +static struct cfs_hash_ops gen_hash_ops = { + .hs_hash = gen_hash, + .hs_key = gen_key, + .hs_keycmp = gen_kepcmp, + .hs_object = gen_export_object, + .hs_get = gen_export_get, + .hs_put_locked = gen_export_put_locked, +}; + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c new file mode 100644 index 0000000000000..f9e46c67c3fe4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c @@ -0,0 +1,1689 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/obd_mount.c + * + * Client mount routines + * + * Author: Nathan Rutman + */ + + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */) +#define PRINT_CMD CDEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/**************** config llog ********************/ + +/** + * Get a config log from the MGS and process it. + * This func is called for both clients and servers. + * Continue to process new statements appended to the logs + * (whenever the config lock is revoked) until lustre_end_log + * is called. + * + * @param sb The superblock is used by the MGC to write to the local copy of + * the config log + * @param logname The name of the llog to replicate from the MGS + * @param cfg Since the same MGC may be used to follow multiple config logs + * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for + * this log, and is added to the mgc's list of logs to follow. 
+ */ +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs *bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + + ENTRY; + + LASSERT(mgc); + LASSERT(cfg); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + RETURN(-ENOMEM); + + /* mgc_process_config */ + lustre_cfg_bufs_reset(bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(bufs, 1, logname); + lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg)); + lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb)); + OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen)); + if (!lcfg) + GOTO(out, rc = -ENOMEM); + lustre_cfg_init(lcfg, LCFG_LOG_START, bufs); + + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); +out: + OBD_FREE_PTR(bufs); + + if (rc == -EINVAL) + LCONSOLE_ERROR_MSG(0x15b, + "%s: Configuration from log %s failed from MGS %d. Check that the client and MGS are running compatible versions.\n", + mgc->obd_name, logname, rc); + else if (rc != 0) + LCONSOLE_ERROR_MSG(0x15c, + "%s: Configuration from log %s failed from MGS %d. Communication error between node & MGS, a bad configuration, or other errors. See syslog for more info\n", + mgc->obd_name, logname, rc); + + /* class_obd_list(); */ + RETURN(rc); +} +EXPORT_SYMBOL(lustre_process_log); + +/* Stop watching this config log for updates */ +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + + ENTRY; + + if (!mgc) + RETURN(-ENOENT); + + /* mgc_process_config */ + lustre_cfg_bufs_reset(&bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, logname); + if (cfg) + lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg)); + OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen)); + if (!lcfg) + RETURN(-ENOMEM); + lustre_cfg_init(lcfg, LCFG_LOG_END, &bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); + RETURN(rc); +} +EXPORT_SYMBOL(lustre_end_log); + +/**************** OBD start *******************/ + +/** + * lustre_cfg_bufs are a holdover from 1.4; we can still set these up from + * lctl (and do for echo cli/srv). + */ +static int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2, char *s3, char *s4) +{ + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg = NULL; + int rc; + + CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname, + cmd, s1, s2, s3, s4); + + lustre_cfg_bufs_reset(&bufs, cfgname); + if (s1) + lustre_cfg_bufs_set_string(&bufs, 1, s1); + if (s2) + lustre_cfg_bufs_set_string(&bufs, 2, s2); + if (s3) + lustre_cfg_bufs_set_string(&bufs, 3, s3); + if (s4) + lustre_cfg_bufs_set_string(&bufs, 4, s4); + + OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen)); + if (!lcfg) + return -ENOMEM; + lustre_cfg_init(lcfg, cmd, &bufs); + lcfg->lcfg_nid = nid; + rc = class_process_config(lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); + return rc; +} + +/** + * Call class_attach and class_setup. These methods in turn call + * OBD type-specific methods.
+ */ +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4) +{ + int rc; + + CDEBUG(D_MOUNT, "Starting OBD %s (typ=%s)\n", obdname, type); + + rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, NULL, NULL); + if (rc) { + CERROR("%s attach error %d\n", obdname, rc); + return rc; + } + rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4); + if (rc) { + CERROR("%s setup error %d\n", obdname, rc); + do_lcfg(obdname, 0, LCFG_DETACH, NULL, NULL, NULL, NULL); + } + return rc; +} + +static DEFINE_MUTEX(mgc_start_lock); + +/** + * Set up a MGC OBD to process startup logs + * + * \param sb [in] super block of the MGC OBD + * + * \retval 0 success, otherwise error code + */ +int lustre_start_mgc(struct super_block *sb) +{ + struct obd_connect_data *data = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + struct obd_export *exp; + struct obd_uuid *uuid = NULL; + uuid_t uuidc; + lnet_nid_t nid; + char nidstr[LNET_NIDSTR_SIZE]; + char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL; + char *ptr; + int rc = 0, i = 0, j; + size_t len; + + ENTRY; + + LASSERT(lsi->lsi_lmd); + + /* Find the first non-lo MGS NID for our MGC name */ + if (IS_SERVER(lsi)) { + /* mount -o mgsnode=nid */ + ptr = lsi->lsi_lmd->lmd_mgs; + if (lsi->lsi_lmd->lmd_mgs && + (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) { + i++; + } else if (IS_MGS(lsi)) { + struct lnet_processid id; + + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + if (nid_is_lo0(&id.nid)) + continue; + nid = lnet_nid_to_nid4(&id.nid); + i++; + break; + } + } + } else { /* client */ + /* Use NIDs from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + if (class_parse_nid(ptr, &nid, &ptr) == 0) + i++; + } + if (i == 0) { + CERROR("No valid MGS NIDs found.\n"); + RETURN(-EINVAL); + } + + mutex_lock(&mgc_start_lock); + + libcfs_nid2str_r(nid, nidstr, sizeof(nidstr)); + len = strlen(LUSTRE_MGC_OBDNAME) + strlen(nidstr) + 1; + OBD_ALLOC(mgcname, len); + OBD_ALLOC(niduuid, len + 2); + if (mgcname == NULL || niduuid == NULL) + GOTO(out_free, rc = -ENOMEM); + snprintf(mgcname, len, "%s%s", LUSTRE_MGC_OBDNAME, nidstr); + + mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : ""; + + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out_free, rc = -ENOMEM); + + obd = class_name2obd(mgcname); + if (obd && !obd->obd_stopping) { + int recov_bk; + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + + /* Re-using an existing MGC */ + atomic_inc(&obd->u.cli.cl_mgc_refcount); + + /* IR compatibility check, only for clients */ + if (lmd_is_client(lsi->lsi_lmd)) { + int has_ir; + int vallen = sizeof(*data); + __u32 *flags = &lsi->lsi_lmd->lmd_flags; + + rc = obd_get_info(NULL, obd->obd_self_export, + strlen(KEY_CONN_DATA), KEY_CONN_DATA, + &vallen, data); + LASSERT(rc == 0); + has_ir = OCD_HAS_FLAG(data, IMP_RECOV); + if (has_ir ^ !(*flags & LMD_FLG_NOIR)) { + /* LMD_FLG_NOIR is for test purpose only */ + LCONSOLE_WARN( + "Mounting client with IR setting not compatible with current MGC. Using MGC setting that is IR %s", + has_ir ? "enabled" : "disabled"); + if (has_ir) + *flags &= ~LMD_FLG_NOIR; + else + *flags |= LMD_FLG_NOIR; + } + } + + recov_bk = 0; + /* + * If we are restarting the MGS, don't try to keep the MGC's + * old connection, or registration will fail. 
+ */ + if (IS_MGS(lsi)) { + CDEBUG(D_MOUNT, "New MGS with live MGC\n"); + recov_bk = 1; + } + + /* + * Try all connections, but only once (again). + * We don't want to block another target from starting + * (using its local copy of the log), but we do want to connect + * if at all possible. + */ + recov_bk++; + CDEBUG(D_MOUNT, "%s:Set MGC reconnect %d\n", mgcname, recov_bk); + rc = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_INIT_RECOV_BACKUP), + KEY_INIT_RECOV_BACKUP, + sizeof(recov_bk), &recov_bk, NULL); + GOTO(out, rc = 0); + } + + CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname); + + /* Add the primary NIDs for the MGS */ + i = 0; + snprintf(niduuid, len + 2, "%s_%x", mgcname, i); + if (IS_SERVER(lsi)) { + ptr = lsi->lsi_lmd->lmd_mgs; + CDEBUG(D_MOUNT, "mgs NIDs %s.\n", ptr); + if (IS_MGS(lsi)) { + /* Use local NIDs (including LO) */ + struct lnet_processid id; + + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + rc = do_lcfg(mgcname, lnet_nid_to_nid4(&id.nid), + LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + } + } else { + /* Use mgsnode= nids */ + /* mount -o mgsnode=nid */ + if (lsi->lsi_lmd->lmd_mgs) { + ptr = lsi->lsi_lmd->lmd_mgs; + } else if (class_find_param(ptr, PARAM_MGSNODE, + &ptr) != 0) { + CERROR("No MGS NIDs given.\n"); + GOTO(out_free, rc = -EINVAL); + } + /* + * Add primary MGS NID(s). + * Multiple NIDs on one MGS node are separated + * by commas. + */ + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++i; + /* Stop at the first failover NID */ + if (*ptr == ':') + break; + } + } + } else { /* client */ + /* Use NIDs from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++i; + /* Stop at the first failover NID */ + if (*ptr == ':') + break; + } + } + if (i == 0) { + CERROR("No valid MGS NIDs found.\n"); + GOTO(out_free, rc = -EINVAL); + } + lsi->lsi_lmd->lmd_mgs_failnodes = 1; + + /* Random uuid for MGC allows easier reconnects */ + OBD_ALLOC_PTR(uuid); + if (uuid == NULL) + GOTO(out_free, rc = -ENOMEM); + + generate_random_uuid(uuidc.b); + snprintf(uuid->uuid, sizeof(*uuid), "%pU", uuidc.b); + + /* Start the MGC */ + rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, + (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, + niduuid, NULL, NULL); + if (rc) + GOTO(out_free, rc); + + /* Add any failover MGS NIDs */ + i = 1; + while (ptr && ((*ptr == ':' || + class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) { + /* New failover node */ + sprintf(niduuid, "%s_%x", mgcname, i); + j = 0; + while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++j; + if (*ptr == ':') + break; + } + if (j > 0) { + rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++i; + } else { + /* at ":/fsname" */ + break; + } + } + lsi->lsi_lmd->lmd_mgs_failnodes = i; + + obd = class_name2obd(mgcname); + if (!obd) { + CERROR("Can't find mgcobd %s\n", mgcname); + GOTO(out_free, rc = -ENOTCONN); + } + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + + /* + * Keep a refcount of servers/clients who started with "mount", + * so we know when we can get rid of the mgc. 
+ */ + atomic_set(&obd->u.cli.cl_mgc_refcount, 1); + + /* We connect to the MGS at setup, and don't disconnect until cleanup */ + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT | + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | + OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_BARRIER | + OBD_CONNECT_FLAGS2; + data->ocd_connect_flags2 = OBD_CONNECT2_REP_MBITS; + + if (lmd_is_client(lsi->lsi_lmd) && + lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) + data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; + data->ocd_version = LUSTRE_VERSION_CODE; + rc = obd_connect(NULL, &exp, obd, uuid, data, NULL); + if (rc) { + CERROR("connect failed %d\n", rc); + GOTO(out, rc); + } + + obd->u.cli.cl_mgc_mgsexp = exp; + +out: + /* + * Keep the MGC info in the sb. Note that many lsi's can point + * to the same mgc. + */ + lsi->lsi_mgc = obd; +out_free: + mutex_unlock(&mgc_start_lock); + + if (uuid) + OBD_FREE_PTR(uuid); + if (data) + OBD_FREE_PTR(data); + if (mgcname) + OBD_FREE(mgcname, len); + if (niduuid) + OBD_FREE(niduuid, len + 2); + RETURN(rc); +} +EXPORT_SYMBOL(lustre_start_mgc); + +static int lustre_stop_mgc(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char niduuid[MAX_OBD_NAME + 6], *ptr = NULL; + int i, rc = 0; + + ENTRY; + + if (!lsi) + RETURN(-ENOENT); + obd = lsi->lsi_mgc; + if (!obd) + RETURN(-ENOENT); + lsi->lsi_mgc = NULL; + + mutex_lock(&mgc_start_lock); + LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0); + if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) { + /* + * This is not fatal, every client that stops + * will call in here. + */ + CDEBUG(D_MOUNT, "MGC still has %d references.\n", + atomic_read(&obd->u.cli.cl_mgc_refcount)); + GOTO(out, rc = -EBUSY); + } + + /* + * The MGC has no recoverable data in any case. 
+ * force shutdown set in umount_begin + */ + obd->obd_no_recov = 1; + + if (obd->u.cli.cl_mgc_mgsexp) { + /* + * An error is not fatal; if we are unable to send the + * disconnect, the MGS ping evictor cleans up the export + */ + rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp); + if (rc) + CDEBUG(D_MOUNT, "disconnect failed %d\n", rc); + } + + /* + * Cache the obdname for cleaning the nid uuids, which are + * obdname_XX, before calling class_manual_cleanup + */ + strcpy(niduuid, obd->obd_name); + ptr = niduuid + strlen(niduuid); + + rc = class_manual_cleanup(obd); + if (rc) + GOTO(out, rc); + + for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) { + sprintf(ptr, "_%x", i); + rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID, + niduuid, NULL, NULL, NULL); + if (rc) + CERROR("del MDC UUID %s failed: rc = %d\n", + niduuid, rc); + } +out: + /* class_import_put will get rid of the additional connections */ + mutex_unlock(&mgc_start_lock); + RETURN(rc); +} + +/***************** lustre superblock **************/ + +struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi; + + ENTRY; + + OBD_ALLOC_PTR(lsi); + if (!lsi) + RETURN(NULL); + OBD_ALLOC_PTR(lsi->lsi_lmd); + if (!lsi->lsi_lmd) { + OBD_FREE_PTR(lsi); + RETURN(NULL); + } + + s2lsi_nocast(sb) = lsi; + /* we take 1 extra ref for our setup */ + atomic_set(&lsi->lsi_mounts, 1); + + /* Default umount style */ + lsi->lsi_flags = LSI_UMOUNT_FAILOVER; + INIT_LIST_HEAD(&lsi->lsi_lwp_list); + mutex_init(&lsi->lsi_lwp_mutex); + + RETURN(lsi); +} +EXPORT_SYMBOL(lustre_init_lsi); + +static int lustre_free_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + ENTRY; + + LASSERT(lsi != NULL); + CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi); + + /* someone didn't call server_put_mount. */ + LASSERT(atomic_read(&lsi->lsi_mounts) == 0); + + llcrypt_sb_free(sb); + if (lsi->lsi_lmd != NULL) { + if (lsi->lsi_lmd->lmd_dev != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_dev, + strlen(lsi->lsi_lmd->lmd_dev) + 1); + if (lsi->lsi_lmd->lmd_profile != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_profile, + strlen(lsi->lsi_lmd->lmd_profile) + 1); + if (lsi->lsi_lmd->lmd_fileset != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_fileset, + strlen(lsi->lsi_lmd->lmd_fileset) + 1); + if (lsi->lsi_lmd->lmd_mgssec != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgssec, + strlen(lsi->lsi_lmd->lmd_mgssec) + 1); + if (lsi->lsi_lmd->lmd_opts != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_opts, + strlen(lsi->lsi_lmd->lmd_opts) + 1); + if (lsi->lsi_lmd->lmd_exclude_count) + OBD_FREE(lsi->lsi_lmd->lmd_exclude, + sizeof(lsi->lsi_lmd->lmd_exclude[0]) * + lsi->lsi_lmd->lmd_exclude_count); + if (lsi->lsi_lmd->lmd_mgs != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgs, + strlen(lsi->lsi_lmd->lmd_mgs) + 1); + if (lsi->lsi_lmd->lmd_osd_type != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_osd_type, + strlen(lsi->lsi_lmd->lmd_osd_type) + 1); + if (lsi->lsi_lmd->lmd_params != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_params, 4096); + if (lsi->lsi_lmd->lmd_nidnet != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_nidnet, + strlen(lsi->lsi_lmd->lmd_nidnet) + 1); + + OBD_FREE_PTR(lsi->lsi_lmd); + } + + LASSERT(lsi->lsi_llsbi == NULL); + OBD_FREE_PTR(lsi); + s2lsi_nocast(sb) = NULL; + + RETURN(0); +} + +/* + * The lsi has one reference for every server that is using the disk - + * e.g.
MDT, MGS, and potentially MGC + */ +int lustre_put_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + ENTRY; + + LASSERT(lsi != NULL); + + CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts)); + if (atomic_dec_and_test(&lsi->lsi_mounts)) { + if (IS_SERVER(lsi) && lsi->lsi_osd_exp) { + lu_device_put(&lsi->lsi_dt_dev->dd_lu_dev); + lsi->lsi_osd_exp->exp_obd->obd_lvfs_ctxt.dt = NULL; + lsi->lsi_dt_dev = NULL; + obd_disconnect(lsi->lsi_osd_exp); + /* wait till OSD is gone */ + obd_zombie_barrier(); + } + lustre_free_lsi(sb); + RETURN(1); + } + RETURN(0); +} +EXPORT_SYMBOL(lustre_put_lsi); + +/* + * The goal of this function is to extract the file system name + * from the OBD name. This can come in two flavors. One is + * fsname-MDTXXXX or fsname-XXXXXXX where X is a hexadecimal + * number. In both cases we should return fsname. If it is + * not a valid OBD name it is assumed to be the file system + * name itself. + */ +void obdname2fsname(const char *tgt, char *fsname, size_t buflen) +{ + const char *ptr; + const char *tmp; + size_t len = 0; + + /* + * First we have to see if the @tgt has '-' at all. It is + * valid for the user to request something like + * lctl set_param -P llite.lustre*.xattr_cache=0 + */ + ptr = strrchr(tgt, '-'); + if (!ptr) { + /* No '-' means it could end in '*' */ + ptr = strchr(tgt, '*'); + if (!ptr) { + /* No '*' either. Assume tgt = fsname */ + len = strlen(tgt); + goto valid_obd_name; + } + len = ptr - tgt; + goto valid_obd_name; + } + + /* tgt format fsname-MDT0000-* */ + if ((!strncmp(ptr, "-MDT", 4) || + !strncmp(ptr, "-OST", 4)) && + (isxdigit(ptr[4]) && isxdigit(ptr[5]) && + isxdigit(ptr[6]) && isxdigit(ptr[7]))) { + len = ptr - tgt; + goto valid_obd_name; + } + + /* + * tgt_format fsname-cli'dev'-'uuid' except for the llite case, + * which is named fsname-'uuid'. Examples: + * + * lustre-clilov-ffff88104db5b800 + * lustre-ffff88104db5b800 (for llite device) + * + * The length of the OBD uuid can vary on different platforms. + * This tests whether any invalid characters are in the string. Allow + * wildcards with '*' character. + */ + ptr++; + if (!strspn(ptr, "0123456789abcdefABCDEF*")) { + len = 0; + goto no_fsname; + } + + /* + * Now that we validated the device name let's extract the + * file system name. Most of the names in this class will + * have '-cli' in their name, which needs to be dropped. If + * it doesn't have '-cli' then it's a llite device, for which + * ptr already points to the start of the uuid string. + */ + tmp = strstr(tgt, "-cli"); + if (tmp) + ptr = tmp; + else + ptr--; + len = ptr - tgt; +valid_obd_name: + len = min_t(size_t, len, LUSTRE_MAXFSNAME); + snprintf(fsname, buflen, "%.*s", (int)len, tgt); +no_fsname: + fsname[len] = '\0'; +} +EXPORT_SYMBOL(obdname2fsname); + +/** + * SERVER NAME *** + * + * FSNAME is between 1 and 8 characters (inclusive). + * Excluded characters are '/' and ':' + * SEPARATOR is either ':' or '-' + * TYPE: "OST", "MDT", etc. + * INDEX: Hex representation of the index + */ + +/** + * Get the fsname ("lustre") from the server name ("lustre-OST003F"). + * @param [in] svname server name including type and index + * @param [out] fsname Buffer to copy filesystem name prefix into. + * Must have at least 'strlen(fsname) + 1' chars.
+ * @param [out] endptr if endptr isn't NULL it is set to end of fsname + * rc < 0 on error + */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr) +{ + const char *dash; + + dash = svname + strnlen(svname, LUSTRE_MAXFSNAME); + for (; dash > svname && *dash != '-' && *dash != ':'; dash--) + ; + if (dash == svname) + return -EINVAL; + + if (fsname != NULL) { + strncpy(fsname, svname, dash - svname); + fsname[dash - svname] = '\0'; + } + + if (endptr != NULL) + *endptr = dash; + + return 0; +} +EXPORT_SYMBOL(server_name2fsname); + +/** + * Get service name (svname) from string + * rc < 0 on error + * if endptr isn't NULL it is set to end of fsname + */ +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize) +{ + int rc; + const char *dash; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(label, NULL, &dash); + if (rc != 0) + return rc; + + if (endptr != NULL) + *endptr = dash; + + if (strlcpy(svname, dash + 1, svsize) >= svsize) + return -E2BIG; + + return 0; +} +EXPORT_SYMBOL(server_name2svname); + +/** + * Check whether the server name is an OST. + **/ +int server_name_is_ost(const char *svname) +{ + const char *dash; + int rc; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(svname, NULL, &dash); + if (rc != 0) + return rc; + + dash++; + + if (strncmp(dash, "OST", 3) == 0) + return 1; + return 0; +} +EXPORT_SYMBOL(server_name_is_ost); + +/** + * Get the index from the target name MDTXXXX/OSTXXXX + * rc = server type, or rc < 0 on error + **/ +int target_name2index(const char *tgtname, __u32 *idx, const char **endptr) +{ + const char *dash = tgtname; + unsigned long index; + int rc; + + if (strncmp(dash, "MDT", 3) == 0) + rc = LDD_F_SV_TYPE_MDT; + else if (strncmp(dash, "OST", 3) == 0) + rc = LDD_F_SV_TYPE_OST; + else + return -EINVAL; + + dash += 3; + + if (strncmp(dash, "all", 3) == 0) { + if (endptr != NULL) + *endptr = dash + 3; + return rc | LDD_F_SV_ALL; + } + + index = simple_strtoul(dash, (char **)endptr, 16); + if (idx != NULL) + *idx = index; + + if (index > 0xffff) + return -ERANGE; + + return rc; +} +EXPORT_SYMBOL(target_name2index); + +/* + * Get the index from the OBD name. + * rc = server type, or + * rc < 0 on error + * if endptr isn't NULL it is set to end of name + */ +int server_name2index(const char *svname, __u32 *idx, const char **endptr) +{ + const char *dash; + int rc; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(svname, NULL, &dash); + if (rc != 0) + return rc; + + dash++; + rc = target_name2index(dash, idx, endptr); + if (rc < 0) + return rc; + + /* Account for -mdc after index that is possible when specifying mdt */ + if (endptr != NULL && strncmp(LUSTRE_MDC_NAME, *endptr + 1, + sizeof(LUSTRE_MDC_NAME)-1) == 0) + *endptr += sizeof(LUSTRE_MDC_NAME); + + return rc; +} +EXPORT_SYMBOL(server_name2index); + +/*************** mount common between server and client ***************/ + +/* Common umount */ +int lustre_common_put_super(struct super_block *sb) +{ + int rc; + + ENTRY; + + CDEBUG(D_MOUNT, "dropping sb %p\n", sb); + + /* Drop a ref to the MGC */ + rc = lustre_stop_mgc(sb); + if (rc && (rc != -ENOENT)) { + if (rc != -EBUSY) { + CERROR("Can't stop MGC: %d\n", rc); + RETURN(rc); + } + /* + * BUSY just means that there's some other OBD that + * needs the mgc. Let him clean it up.
+ */ + CDEBUG(D_MOUNT, "MGC still in use\n"); + } + /* Drop a ref to the mounted disk */ + lustre_put_lsi(sb); + + RETURN(rc); +} +EXPORT_SYMBOL(lustre_common_put_super); + +static void lmd_print(struct lustre_mount_data *lmd) +{ + int i; + + PRINT_CMD(D_MOUNT, " mount data:\n"); + if (lmd_is_client(lmd)) + PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile); + PRINT_CMD(D_MOUNT, "device: %s\n", lmd->lmd_dev); + PRINT_CMD(D_MOUNT, "flags: %x\n", lmd->lmd_flags); + + if (lmd->lmd_opts) + PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts); + + if (lmd->lmd_recovery_time_soft) + PRINT_CMD(D_MOUNT, "recovery time soft: %d\n", + lmd->lmd_recovery_time_soft); + + if (lmd->lmd_recovery_time_hard) + PRINT_CMD(D_MOUNT, "recovery time hard: %d\n", + lmd->lmd_recovery_time_hard); + + for (i = 0; i < lmd->lmd_exclude_count; i++) { + PRINT_CMD(D_MOUNT, "exclude %d: OST%04x\n", i, + lmd->lmd_exclude[i]); + } +} + +/* Is this server on the exclusion list */ +int lustre_check_exclusion(struct super_block *sb, char *svname) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct lustre_mount_data *lmd = lsi->lsi_lmd; + __u32 index; + int i, rc; + + ENTRY; + + rc = server_name2index(svname, &index, NULL); + if (rc != LDD_F_SV_TYPE_OST) + /* Only exclude OSTs */ + RETURN(0); + + CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname, + index, lmd->lmd_exclude_count, lmd->lmd_dev); + + for (i = 0; i < lmd->lmd_exclude_count; i++) { + if (index == lmd->lmd_exclude[i]) { + CWARN("Excluding %s (on exclusion list)\n", svname); + RETURN(1); + } + } + RETURN(0); +} + +/* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */ +static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr) +{ + const char *s1 = ptr, *s2; + __u32 *exclude_list; + __u32 index = 0; + int rc = 0, devmax; + + ENTRY; + + /* + * The shortest an ost name can be is 8 chars: -OST0000. + * We don't actually know the fsname at this time, so in fact + * a user could specify any fsname. 
+ */ + devmax = strlen(ptr) / 8 + 1; + + /* temp storage until we figure out how many we have */ + OBD_ALLOC_PTR_ARRAY(exclude_list, devmax); + if (!exclude_list) + RETURN(-ENOMEM); + + /* we enter this fn pointing at the '=' */ + while (*s1 && *s1 != ' ' && *s1 != ',') { + s1++; + rc = server_name2index(s1, &index, &s2); + if (rc < 0) { + CERROR("Can't parse server name '%s': rc = %d\n", + s1, rc); + break; + } + if (rc == LDD_F_SV_TYPE_OST) + exclude_list[lmd->lmd_exclude_count++] = index; + else + CDEBUG(D_MOUNT, "ignoring exclude %.*s: type = %#x\n", + (uint)(s2-s1), s1, rc); + s1 = s2; + /* + * now we are pointing at ':' (next exclude) + * or ',' (end of excludes) + */ + if (lmd->lmd_exclude_count >= devmax) + break; + } + if (rc >= 0) /* non-err */ + rc = 0; + + if (lmd->lmd_exclude_count) { + /* permanent, freed in lustre_free_lsi */ + OBD_ALLOC_PTR_ARRAY(lmd->lmd_exclude, + lmd->lmd_exclude_count); + if (lmd->lmd_exclude) { + memcpy(lmd->lmd_exclude, exclude_list, + sizeof(index) * lmd->lmd_exclude_count); + } else { + rc = -ENOMEM; + lmd->lmd_exclude_count = 0; + } + } + OBD_FREE_PTR_ARRAY(exclude_list, devmax); + RETURN(rc); +} + +static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr) +{ + char *tail; + int length; + + if (lmd->lmd_mgssec != NULL) { + OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1); + lmd->lmd_mgssec = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(lmd->lmd_mgssec, length + 1); + if (lmd->lmd_mgssec == NULL) + return -ENOMEM; + + memcpy(lmd->lmd_mgssec, ptr, length); + lmd->lmd_mgssec[length] = '\0'; + return 0; +} + +static int lmd_parse_network(struct lustre_mount_data *lmd, char *ptr) +{ + char *tail; + int length; + + if (lmd->lmd_nidnet != NULL) { + OBD_FREE(lmd->lmd_nidnet, strlen(lmd->lmd_nidnet) + 1); + lmd->lmd_nidnet = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(lmd->lmd_nidnet, length + 1); + if (lmd->lmd_nidnet == NULL) + return -ENOMEM; + + memcpy(lmd->lmd_nidnet, ptr, length); + lmd->lmd_nidnet[length] = '\0'; + return 0; +} + +static int lmd_parse_string(char **handle, char *ptr) +{ + char *tail; + int length; + + if ((handle == NULL) || (ptr == NULL)) + return -EINVAL; + + if (*handle != NULL) { + OBD_FREE(*handle, strlen(*handle) + 1); + *handle = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(*handle, length + 1); + if (*handle == NULL) + return -ENOMEM; + + memcpy(*handle, ptr, length); + (*handle)[length] = '\0'; + + return 0; +} + +/* Collect multiple values for mgsnid specifiers */ +static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr) +{ + lnet_nid_t nid; + char *tail = *ptr; + char *mgsnid; + int length; + int oldlen = 0; + + /* Find end of NID-list */ + while (class_parse_nid_quiet(tail, &nid, &tail) == 0) + ; /* do nothing */ + + length = tail - *ptr; + if (length == 0) { + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr); + return -EINVAL; + } + + if (lmd->lmd_mgs != NULL) + oldlen = strlen(lmd->lmd_mgs) + 1; + + OBD_ALLOC(mgsnid, oldlen + length + 1); + if (mgsnid == NULL) + return -ENOMEM; + + if (lmd->lmd_mgs != NULL) { + /* Multiple mgsnid= are taken to mean failover locations */ + memcpy(mgsnid, lmd->lmd_mgs, oldlen); + mgsnid[oldlen - 1] = ':'; + OBD_FREE(lmd->lmd_mgs, oldlen); + } + memcpy(mgsnid + oldlen, *ptr, length); + mgsnid[oldlen + length] 
= '\0'; + lmd->lmd_mgs = mgsnid; + *ptr = tail; + + return 0; +} + +/** + * Find the first delimiter (comma or colon) from the specified \a buf and + * make \a *endh point to the string starting with the delimiter. The commas + * in expression list [...] will be skipped. + * + * @buf a delimiter-separated string + * @endh a pointer to a pointer that will point to the string + * starting with the delimiter + * + * RETURNS true if delimiter is found, false if delimiter is not found + */ +static bool lmd_find_delimiter(char *buf, char **endh) +{ + char *c = buf; + size_t pos; + bool found; + + if (!buf) + return false; +try_again: + if (*c == ',' || *c == ':') + return true; + + pos = strcspn(c, "[:,]"); + if (!pos) + return false; + + /* Not a valid mount string */ + if (*c == ']') { + CWARN("invalid mount string format\n"); + return false; + } + + c += pos; + if (*c == '[') { + c = strchr(c, ']'); + + /* invalid mount string */ + if (!c) { + CWARN("invalid mount string format\n"); + return false; + } + c++; + goto try_again; + } + + found = *c != '\0'; + if (found && endh) + *endh = c; + + return found; +} + +/** + * Find the first valid string delimited by comma or colon from the specified + * \a buf and parse it to see whether it's a valid NID list. If yes, \a *endh + * will point to the next string starting with the delimiter. + * + * \param[in] buf a delimiter-separated string + * \param[in] endh a pointer to a pointer that will point to the string + * starting with the delimiter + * + * \retval 0 if the string is a valid NID list + * \retval 1 if the string is not a valid NID list + */ +static int lmd_parse_nidlist(char *buf, char **endh) +{ + LIST_HEAD(nidlist); + char *endp = buf; + char tmp; + int rc = 0; + + if (buf == NULL) + return 1; + while (*buf == ',' || *buf == ':') + buf++; + if (*buf == ' ' || *buf == '/' || *buf == '\0') + return 1; + + if (!lmd_find_delimiter(buf, &endp)) + endp = buf + strlen(buf); + + tmp = *endp; + *endp = '\0'; + + if (cfs_parse_nidlist(buf, strlen(buf), &nidlist) <= 0) + rc = 1; + cfs_free_nidlist(&nidlist); + + *endp = tmp; + if (rc != 0) + return rc; + if (endh != NULL) + *endh = endp; + return 0; +} + +/** + * Parse mount line options + * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre + * dev is passed as device=uml1:/lustre by mount.lustre_tgt + */ +int lmd_parse(char *options, struct lustre_mount_data *lmd) +{ + char *s1, *s2, *devname = NULL; + struct lustre_mount_data *raw = (struct lustre_mount_data *)options; + int rc = 0; + + ENTRY; + + LASSERT(lmd); + if (!options) { + LCONSOLE_ERROR_MSG(0x162, + "Missing mount data: check /sbin/mount.lustre_tgt is installed.\n"); + RETURN(-EINVAL); + } + + /* Options should be a string - try to detect old lmd data */ + if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) { + LCONSOLE_ERROR_MSG(0x163, + "Using an old version of /sbin/mount.lustre. Please install version %s\n", + LUSTRE_VERSION_STRING); + RETURN(-EINVAL); + } + lmd->lmd_magic = LMD_MAGIC; + + OBD_ALLOC(lmd->lmd_params, LMD_PARAMS_MAXLEN); + if (lmd->lmd_params == NULL) + RETURN(-ENOMEM); + lmd->lmd_params[0] = '\0'; + + /* Set default flags here */ + + s1 = options; + while (*s1) { + int clear = 0; + int time_min = OBD_RECOVERY_TIME_MIN; + char *s3; + + /* Skip whitespace and extra commas */ + while (*s1 == ' ' || *s1 == ',') + s1++; + s3 = s1; + + /* + * Client options are parsed in ll_options: eg. flock, + * user_xattr, acl + */ + + /* + * Parse non-ldiskfs options here. 
Rather than modifying + * ldiskfs, we just zero these out here + */ + if (strncmp(s1, "abort_recov", 11) == 0) { + lmd->lmd_flags |= LMD_FLG_ABORT_RECOV; + clear++; + } else if (strncmp(s1, "abort_recov_mdt", 15) == 0) { + lmd->lmd_flags |= LMD_FLG_ABORT_RECOV_MDT; + clear++; + } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) { + lmd->lmd_recovery_time_soft = + max_t(int, simple_strtoul(s1 + 19, NULL, 10), + time_min); + clear++; + } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) { + lmd->lmd_recovery_time_hard = + max_t(int, simple_strtoul(s1 + 19, NULL, 10), + time_min); + clear++; + } else if (strncmp(s1, "no_precreate", 12) == 0) { + lmd->lmd_flags |= LMD_FLG_NO_PRECREATE; + clear++; + } else if (strncmp(s1, "noir", 4) == 0) { + lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */ + clear++; + } else if (strncmp(s1, "nosvc", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSVC; + clear++; + } else if (strncmp(s1, "nomgs", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOMGS; + clear++; + } else if (strncmp(s1, "noscrub", 7) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSCRUB; + clear++; + } else if (strncmp(s1, "skip_lfsck", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_SKIP_LFSCK; + clear++; + } else if (strncmp(s1, "rdonly_dev", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_DEV_RDONLY; + clear++; + } else if (strncmp(s1, PARAM_MGSNODE, + sizeof(PARAM_MGSNODE) - 1) == 0) { + s2 = s1 + sizeof(PARAM_MGSNODE) - 1; + /* + * Assume the next mount opt is the first + * invalid NID we get to. + */ + rc = lmd_parse_mgs(lmd, &s2); + if (rc) + goto invalid; + s3 = s2; + clear++; + } else if (strncmp(s1, "writeconf", 9) == 0) { + lmd->lmd_flags |= LMD_FLG_WRITECONF; + clear++; + } else if (strncmp(s1, "nolocallogs", 11) == 0) { + lmd->lmd_flags |= LMD_FLG_NO_LOCAL_LOGS; + clear++; + } else if (strncmp(s1, "update", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_UPDATE; + clear++; + } else if (strncmp(s1, "virgin", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_VIRGIN; + clear++; + } else if (strncmp(s1, "noprimnode", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE; + clear++; + } else if (strncmp(s1, "mgssec=", 7) == 0) { + rc = lmd_parse_mgssec(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + /* ost exclusion list */ + } else if (strncmp(s1, "exclude=", 8) == 0) { + rc = lmd_make_exclusion(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "mgs", 3) == 0) { + /* We are an MGS */ + lmd->lmd_flags |= LMD_FLG_MGS; + clear++; + } else if (strncmp(s1, "svname=", 7) == 0) { + rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "param=", 6) == 0) { + size_t length, params_length; + char *tail = s1; + + if (lmd_find_delimiter(s1 + 6, &tail)) { + char *param_str = tail + 1; + int supplementary = 1; + + while (lmd_parse_nidlist(param_str, + ¶m_str) == 0) { + supplementary = 0; + } + length = param_str - s1 - supplementary; + } else { + length = strlen(s1); + } + length -= 6; + params_length = strlen(lmd->lmd_params); + if (params_length + length + 1 >= LMD_PARAMS_MAXLEN) + RETURN(-E2BIG); + strncat(lmd->lmd_params, s1 + 6, length); + lmd->lmd_params[params_length + length] = '\0'; + strlcat(lmd->lmd_params, " ", LMD_PARAMS_MAXLEN); + s3 = s1 + 6 + length; + clear++; + } else if (strncmp(s1, "localrecov", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_LOCAL_RECOV; + clear++; + } else if (strncmp(s1, "osd=", 4) == 0) { + rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4); + if (rc) + goto invalid; + clear++; + } + /* + * Linux 2.4 doesn't pass the device, 
so we stuck it at + * the end of the options. + */ + else if (strncmp(s1, "device=", 7) == 0) { + devname = s1 + 7; + /* + * terminate options right before device. device + * must be the last one. + */ + *s1 = '\0'; + break; + } else if (strncmp(s1, "network=", 8) == 0) { + rc = lmd_parse_network(lmd, s1 + 8); + if (rc) + goto invalid; + + /* check if LNet dynamic peer discovery is activated */ + if (LNetGetPeerDiscoveryStatus()) { + CERROR("LNet Dynamic Peer Discovery is enabled " + "on this node. 'network' mount option " + "cannot be taken into account.\n"); + goto invalid; + } + + clear++; + } + + /* Find next opt */ + s2 = strchr(s3, ','); + if (s2 == NULL) { + if (clear) + *s1 = '\0'; + break; + } + s2++; + if (clear) + memmove(s1, s2, strlen(s2) + 1); + else + s1 = s2; + } + + if (!devname) { + LCONSOLE_ERROR_MSG(0x164, + "Can't find device name (need mount option 'device=...')\n"); + goto invalid; + } + + s1 = strstr(devname, ":/"); + if (s1) { + ++s1; + lmd->lmd_flags |= LMD_FLG_CLIENT; + /* Remove leading /s from fsname */ + while (*++s1 == '/') + ; + s2 = s1; + while (*s2 != '/' && *s2 != '\0') + s2++; + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_profile, s2 - s1 + 8); + if (!lmd->lmd_profile) + RETURN(-ENOMEM); + + strncat(lmd->lmd_profile, s1, s2 - s1); + strncat(lmd->lmd_profile, "-client", 7); + + s1 = s2; + s2 = s1 + strlen(s1) - 1; + /* Remove padding /s from fileset */ + while (*s2 == '/') + s2--; + if (s2 > s1) { + OBD_ALLOC(lmd->lmd_fileset, s2 - s1 + 2); + if (lmd->lmd_fileset == NULL) { + OBD_FREE(lmd->lmd_profile, + strlen(lmd->lmd_profile) + 1); + RETURN(-ENOMEM); + } + strncat(lmd->lmd_fileset, s1, s2 - s1 + 1); + } + } else { + /* server mount */ + if (lmd->lmd_nidnet != NULL) { + /* 'network=' mount option forbidden for server */ + OBD_FREE(lmd->lmd_nidnet, strlen(lmd->lmd_nidnet) + 1); + lmd->lmd_nidnet = NULL; + rc = -EINVAL; + CERROR( + "%s: option 'network=' not allowed for Lustre servers: rc = %d\n", + devname, rc); + RETURN(rc); + } + } + + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1); + if (!lmd->lmd_dev) + RETURN(-ENOMEM); + strncpy(lmd->lmd_dev, devname, strlen(devname)+1); + + /* Save mount options */ + s1 = options + strlen(options) - 1; + while (s1 >= options && (*s1 == ',' || *s1 == ' ')) + *s1-- = 0; + while (*options && (*options == ',' || *options == ' ')) + options++; + if (*options != 0) { + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1); + if (!lmd->lmd_opts) + RETURN(-ENOMEM); + strncpy(lmd->lmd_opts, options, strlen(options)+1); + } + + lmd_print(lmd); + lmd->lmd_magic = LMD_MAGIC; + + RETURN(rc); + +invalid: + CERROR("Bad mount options %s\n", options); + RETURN(-EINVAL); +} +EXPORT_SYMBOL(lmd_parse); + +#ifdef HAVE_SERVER_SUPPORT +/** + * This is the entry point for the mount call into Lustre. + * This is called when a server target is mounted, + * and this is where we start setting things up. + * @param data Mount options (e.g. -o flock,abort_recov) + */ +static int lustre_tgt_fill_super(struct super_block *sb, void *lmd2_data, + int silent) +{ + struct lustre_mount_data *lmd; + struct lustre_sb_info *lsi; + int rc; + + ENTRY; + + CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb); + + lsi = lustre_init_lsi(sb); + if (!lsi) + RETURN(-ENOMEM); + lmd = lsi->lsi_lmd; + + /* + * Disable lockdep during mount, because mount locking patterns are + * 'special'. + */ + lockdep_off(); + + /* + * LU-639: the OBD cleanup of last mount may not finish yet, wait here. 
+ */ + obd_zombie_barrier(); + + /* Figure out the lmd from the mount options */ + if (lmd_parse(lmd2_data, lmd)) { + lustre_put_lsi(sb); + GOTO(out, rc = -EINVAL); + } + + if (lmd_is_client(lmd)) { + rc = -ENODEV; + CERROR("%s: attempting to mount a client with -t lustre_tgt' which is only for server-side mounts: rc = %d\n", + lmd->lmd_dev, rc); + lustre_put_lsi(sb); + GOTO(out, rc); + } + + CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev); + rc = server_fill_super(sb); + /* + * server_fill_super calls lustre_start_mgc after the mount + * because we need the MGS NIDs which are stored on disk. + * Plus, we may need to start the MGS first. + * + * server_fill_super will call server_put_super on failure + * + * If error happens in fill_super() call, @lsi will be killed there. + * This is why we do not put it here. + */ +out: + if (rc) { + CERROR("Unable to mount %s (%d)\n", + s2lsi(sb) ? lmd->lmd_dev : "", rc); + } else { + CDEBUG(D_SUPER, "Mount %s complete\n", + lmd->lmd_dev); + } + lockdep_on(); + return rc; +} + +/***************** FS registration ******************/ +static struct dentry *lustre_tgt_mount(struct file_system_type *fs_type, + int flags, const char *devname, + void *data) +{ + return mount_nodev(fs_type, flags, data, lustre_tgt_fill_super); +} + +/* Register the "lustre_tgt" fs type. + * + * Right now this isn't any different than the normal "lustre" filesystem + * type, but it is added so that there is some compatibility to allow + * changing documentation and scripts to start using the "lustre_tgt" type + * at mount time. That will simplify test interop, and in case of upgrades + * that change to the new type and then need to roll back for some reason. + * + * The long-term goal is to disentangle the client and server mount code. + */ +static struct file_system_type lustre_tgt_fstype = { + .owner = THIS_MODULE, + .name = "lustre_tgt", + .mount = lustre_tgt_mount, + .kill_sb = kill_anon_super, + .fs_flags = FS_REQUIRES_DEV | FS_RENAME_DOES_D_MOVE, +}; +MODULE_ALIAS_FS("lustre_tgt"); + +int lustre_tgt_register_fs(void) +{ + return register_filesystem(&lustre_tgt_fstype); +} + +void lustre_tgt_unregister_fs(void) +{ + unregister_filesystem(&lustre_tgt_fstype); +} + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c new file mode 100644 index 0000000000000..a175ebe7f1af1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c @@ -0,0 +1,2112 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/obd_mount_server.c + * + * Server mount routines + * + * Author: Nathan Rutman + */ + + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_MOUNT (D_SUPER | D_CONFIG /* | D_WARNING */) +#define PRINT_CMD CDEBUG +#define PRINT_MASK (D_SUPER | D_CONFIG) + +#include +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED +#include +#endif +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/*********** mount lookup *********/ + +static DEFINE_MUTEX(lustre_mount_info_lock); +static LIST_HEAD(server_mount_info_list); + +static struct lustre_mount_info *server_find_mount(const char *name) +{ + struct list_head *tmp; + struct lustre_mount_info *lmi; + ENTRY; + + list_for_each(tmp, &server_mount_info_list) { + lmi = list_entry(tmp, struct lustre_mount_info, + lmi_list_chain); + if (strcmp(name, lmi->lmi_name) == 0) + RETURN(lmi); + } + RETURN(NULL); +} + +/* we must register an obd for a mount before we call the setup routine. + *_setup will call lustre_get_mount to get the mnt struct + by obd_name, since we can't pass the pointer to setup. */ +static int server_register_mount(const char *name, struct super_block *sb) +{ + struct lustre_mount_info *lmi; + char *name_cp; + ENTRY; + + LASSERT(sb); + + OBD_ALLOC(lmi, sizeof(*lmi)); + if (!lmi) + RETURN(-ENOMEM); + OBD_ALLOC(name_cp, strlen(name) + 1); + if (!name_cp) { + OBD_FREE(lmi, sizeof(*lmi)); + RETURN(-ENOMEM); + } + strcpy(name_cp, name); + + mutex_lock(&lustre_mount_info_lock); + + if (server_find_mount(name)) { + mutex_unlock(&lustre_mount_info_lock); + OBD_FREE(lmi, sizeof(*lmi)); + OBD_FREE(name_cp, strlen(name) + 1); + CERROR("Already registered %s\n", name); + RETURN(-EEXIST); + } + lmi->lmi_name = name_cp; + lmi->lmi_sb = sb; + list_add(&lmi->lmi_list_chain, &server_mount_info_list); + + mutex_unlock(&lustre_mount_info_lock); + + CDEBUG(D_MOUNT, "register mount %p from %s\n", sb, name); + + RETURN(0); +} + +/* when an obd no longer needs a mount */ +static int server_deregister_mount(const char *name) +{ + struct lustre_mount_info *lmi; + ENTRY; + + mutex_lock(&lustre_mount_info_lock); + lmi = server_find_mount(name); + if (!lmi) { + mutex_unlock(&lustre_mount_info_lock); + CERROR("%s not registered\n", name); + RETURN(-ENOENT); + } + + CDEBUG(D_MOUNT, "deregister mount %p from %s\n", lmi->lmi_sb, name); + + OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1); + list_del(&lmi->lmi_list_chain); + OBD_FREE(lmi, sizeof(*lmi)); + mutex_unlock(&lustre_mount_info_lock); + + OBD_RACE(OBD_FAIL_MDS_LLOG_UMOUNT_RACE); + RETURN(0); +} + +/* obd's look up a registered mount using their obdname. This is just + for initial obd setup to find the mount struct. It should not be + called every time you want to mntget. 
*/ +struct lustre_mount_info *server_get_mount(const char *name) +{ + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + ENTRY; + + mutex_lock(&lustre_mount_info_lock); + lmi = server_find_mount(name); + mutex_unlock(&lustre_mount_info_lock); + if (!lmi) { + CERROR("Can't find mount for %s\n", name); + RETURN(NULL); + } + lsi = s2lsi(lmi->lmi_sb); + + atomic_inc(&lsi->lsi_mounts); + + CDEBUG(D_MOUNT, "get mount %p from %s, refs=%d\n", lmi->lmi_sb, + name, atomic_read(&lsi->lsi_mounts)); + + RETURN(lmi); +} +EXPORT_SYMBOL(server_get_mount); + +/** + * server_put_mount: to be called from obd_cleanup methods + * @name: obd name + * @dereg_mnt: 0 or 1 depending on whether the mount is to be deregistered or + * not + * + * The caller decides whether server_deregister_mount() needs to be called or + * not. Calling of server_deregister_mount() does not depend on refcounting on + * lsi because we could have say the mgs and mds on the same node and we + * unmount the mds, then the ref on the lsi would still be non-zero but we + * would still want to deregister the mds mount. + */ +int server_put_mount(const char *name, bool dereg_mnt) +{ + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + ENTRY; + + mutex_lock(&lustre_mount_info_lock); + lmi = server_find_mount(name); + mutex_unlock(&lustre_mount_info_lock); + if (!lmi) { + CERROR("Can't find mount for %s\n", name); + RETURN(-ENOENT); + } + lsi = s2lsi(lmi->lmi_sb); + + CDEBUG(D_MOUNT, "put mount %p from %s, refs=%d\n", + lmi->lmi_sb, name, atomic_read(&lsi->lsi_mounts)); + + if (lustre_put_lsi(lmi->lmi_sb)) + CDEBUG(D_MOUNT, "Last put of mount %p from %s\n", + lmi->lmi_sb, name); + + if (dereg_mnt) + /* this obd should never need the mount again */ + server_deregister_mount(name); + + RETURN(0); +} +EXPORT_SYMBOL(server_put_mount); + +/* Set up a MGS to serve startup logs */ +static int server_start_mgs(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct lustre_mount_info *lmi; + int rc = 0; + ENTRY; + + /* It is impossible to have more than 1 MGS per node, since + MGC wouldn't know which to connect to */ + lmi = server_find_mount(LUSTRE_MGS_OBDNAME); + if (lmi) { + lsi = s2lsi(lmi->lmi_sb); + LCONSOLE_ERROR_MSG(0x15d, "The MGS service was already started" + " from server\n"); + RETURN(-EALREADY); + } + + CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME); + + rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb); + + if (!rc) { + rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME, + LUSTRE_MGS_OBDNAME, NULL, NULL, + lsi->lsi_osd_obdname, NULL); + /* server_deregister_mount() is not called previously, for lsi + * and other stuff can't be freed cleanly when mgs calls + * server_put_mount() in error handling case (see b=17758), + * this problem is caused by a bug in mgs_init0, which forgot + * calling server_put_mount in error case. */ + + if (rc) + server_deregister_mount(LUSTRE_MGS_OBDNAME); + } + + if (rc) + LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). 
" + "Is the 'mgs' module loaded?\n", + LUSTRE_MGS_OBDNAME, rc); + RETURN(rc); +} + +static int server_stop_mgs(struct super_block *sb) +{ + struct obd_device *obd; + int rc; + struct lustre_mount_info *lmi; + ENTRY; + + /* Do not stop MGS if this device is not the running MGT */ + lmi = server_find_mount(LUSTRE_MGS_OBDNAME); + if (lmi != NULL && lmi->lmi_sb != sb) + RETURN(0); + + CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME); + + /* There better be only one MGS */ + obd = class_name2obd(LUSTRE_MGS_OBDNAME); + if (!obd) { + CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME); + RETURN(-EALREADY); + } + + /* The MGS should always stop when we say so */ + obd->obd_force = 1; + rc = class_manual_cleanup(obd); + RETURN(rc); +} + +/* Since there's only one mgc per node, we have to change it's fs to get + access to the right disk. */ +static int server_mgc_set_fs(const struct lu_env *env, + struct obd_device *mgc, struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev); + + /* cl_mgc_sem in mgc insures we sleep if the mgc_fs is busy */ + rc = obd_set_info_async(env, mgc->obd_self_export, + sizeof(KEY_SET_FS), KEY_SET_FS, + sizeof(*sb), sb, NULL); + if (rc != 0) + CERROR("can't set_fs %d\n", rc); + + RETURN(rc); +} + +static int server_mgc_clear_fs(const struct lu_env *env, + struct obd_device *mgc) +{ + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "Unassign mgc disk\n"); + + rc = obd_set_info_async(env, mgc->obd_self_export, + sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS, + 0, NULL, NULL); + RETURN(rc); +} + +static inline bool is_mdc_device(const char *devname) +{ + char *ptr; + + ptr = strrchr(devname, '-'); + return ptr != NULL && strcmp(ptr, "-mdc") == 0; +} + +static inline bool tgt_is_mdt(const char *tgtname, __u32 *idx) +{ + int type; + + type = server_name2index(tgtname, idx, NULL); + + return type == LDD_F_SV_TYPE_MDT; +} + +/** + * Convert OST/MDT name(fsname-{MDT,OST}xxxx) to a lwp name with the @idx:yyyy + * (fsname-MDTyyyy-lwp-{MDT,OST}xxxx) + **/ +int tgt_name2lwp_name(const char *tgt_name, char *lwp_name, int len, __u32 idx) +{ + char *fsname; + const char *tgt; + int rc; + ENTRY; + + OBD_ALLOC(fsname, MTI_NAME_MAXLEN); + if (fsname == NULL) + RETURN(-ENOMEM); + + rc = server_name2fsname(tgt_name, fsname, &tgt); + if (rc != 0) { + CERROR("%s: failed to get fsname from tgt_name: rc = %d\n", + tgt_name, rc); + GOTO(cleanup, rc); + } + + if (*tgt != '-' && *tgt != ':') { + CERROR("%s: invalid tgt_name name!\n", tgt_name); + GOTO(cleanup, rc = -EINVAL); + } + + tgt++; + if (strncmp(tgt, "OST", 3) != 0 && strncmp(tgt, "MDT", 3) != 0) { + CERROR("%s is not an OST or MDT target!\n", tgt_name); + GOTO(cleanup, rc = -EINVAL); + } + snprintf(lwp_name, len, "%s-MDT%04x-%s-%s", + fsname, idx, LUSTRE_LWP_NAME, tgt); + + GOTO(cleanup, rc = 0); + +cleanup: + if (fsname != NULL) + OBD_FREE(fsname, MTI_NAME_MAXLEN); + + return rc; +} +EXPORT_SYMBOL(tgt_name2lwp_name); + +static LIST_HEAD(lwp_register_list); +static DEFINE_SPINLOCK(lwp_register_list_lock); + +static void lustre_put_lwp_item(struct lwp_register_item *lri) +{ + if (atomic_dec_and_test(&lri->lri_ref)) { + LASSERT(list_empty(&lri->lri_list)); + + if (*lri->lri_exp != NULL) + class_export_put(*lri->lri_exp); + OBD_FREE_PTR(lri); + } +} + +int lustre_register_lwp_item(const char *lwpname, struct obd_export **exp, + register_lwp_cb cb_func, void *cb_data) +{ + struct obd_device *lwp; + struct lwp_register_item *lri; + bool cb = false; 
+ ENTRY; + + LASSERTF(strlen(lwpname) < MTI_NAME_MAXLEN, "lwpname is too long %s\n", + lwpname); + LASSERT(exp != NULL && *exp == NULL); + + OBD_ALLOC_PTR(lri); + if (lri == NULL) + RETURN(-ENOMEM); + + lwp = class_name2obd(lwpname); + if (lwp != NULL && lwp->obd_set_up == 1) { + struct obd_uuid *uuid; + + OBD_ALLOC_PTR(uuid); + if (uuid == NULL) { + OBD_FREE_PTR(lri); + RETURN(-ENOMEM); + } + memcpy(uuid->uuid, lwpname, strlen(lwpname)); + *exp = obd_uuid_lookup(lwp, uuid); + OBD_FREE_PTR(uuid); + } + + memcpy(lri->lri_name, lwpname, strlen(lwpname)); + lri->lri_exp = exp; + lri->lri_cb_func = cb_func; + lri->lri_cb_data = cb_data; + INIT_LIST_HEAD(&lri->lri_list); + /* + * Initialize the lri_ref at 2, one will be released before + * current function returned via lustre_put_lwp_item(), the + * other will be released in lustre_deregister_lwp_item(). + */ + atomic_set(&lri->lri_ref, 2); + + spin_lock(&lwp_register_list_lock); + list_add(&lri->lri_list, &lwp_register_list); + if (*exp != NULL) + cb = true; + spin_unlock(&lwp_register_list_lock); + + if (cb && cb_func != NULL) + cb_func(cb_data); + lustre_put_lwp_item(lri); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_register_lwp_item); + +void lustre_deregister_lwp_item(struct obd_export **exp) +{ + struct lwp_register_item *lri; + bool removed = false; + int repeat = 0; + + spin_lock(&lwp_register_list_lock); + list_for_each_entry(lri, &lwp_register_list, lri_list) { + if (exp == lri->lri_exp) { + list_del_init(&lri->lri_list); + removed = true; + break; + } + } + spin_unlock(&lwp_register_list_lock); + + if (!removed) + return; + + /* See lustre_notify_lwp_list(), in some extreme race conditions, + * the notify callback could be still on the fly, we need to wait + * for the callback done before moving on to free the data used + * by callback. 
*/ + while (atomic_read(&lri->lri_ref) > 1) { + CDEBUG(D_MOUNT, "lri reference count %u, repeat: %d\n", + atomic_read(&lri->lri_ref), repeat); + repeat++; + schedule_timeout_interruptible(cfs_time_seconds(1)); + } + lustre_put_lwp_item(lri); +} +EXPORT_SYMBOL(lustre_deregister_lwp_item); + +struct obd_export *lustre_find_lwp_by_index(const char *dev, __u32 idx) +{ + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + struct obd_device *lwp; + struct obd_export *exp = NULL; + char fsname[16]; + char lwp_name[24]; + int rc; + + lmi = server_get_mount(dev); + if (lmi == NULL) + return NULL; + + lsi = s2lsi(lmi->lmi_sb); + rc = server_name2fsname(lsi->lsi_svname, fsname, NULL); + if (rc != 0) { + CERROR("%s: failed to get fsname: rc = %d\n", + lsi->lsi_svname, rc); + goto err_lmi; + } + + snprintf(lwp_name, sizeof(lwp_name), "%s-MDT%04x", fsname, idx); + mutex_lock(&lsi->lsi_lwp_mutex); + list_for_each_entry(lwp, &lsi->lsi_lwp_list, obd_lwp_list) { + char *ptr = strstr(lwp->obd_name, lwp_name); + + if (ptr != NULL && lwp->obd_lwp_export != NULL) { + exp = class_export_get(lwp->obd_lwp_export); + break; + } + } + mutex_unlock(&lsi->lsi_lwp_mutex); + +err_lmi: + server_put_mount(dev, false); + + return exp; +} +EXPORT_SYMBOL(lustre_find_lwp_by_index); + +void lustre_notify_lwp_list(struct obd_export *exp) +{ + struct lwp_register_item *lri; + LASSERT(exp != NULL); + +again: + spin_lock(&lwp_register_list_lock); + list_for_each_entry(lri, &lwp_register_list, lri_list) { + if (strcmp(exp->exp_obd->obd_name, lri->lri_name)) + continue; + if (*lri->lri_exp != NULL) + continue; + *lri->lri_exp = class_export_get(exp); + if (lri->lri_cb_func == NULL) + continue; + atomic_inc(&lri->lri_ref); + spin_unlock(&lwp_register_list_lock); + + lri->lri_cb_func(lri->lri_cb_data); + lustre_put_lwp_item(lri); + + /* Others may have changed the list after we unlock, we have + * to rescan the list from the beginning. Usually, the list + * 'lwp_register_list' is very short, and there is 'guard' + * lri::lri_exp that will prevent the callback to be done + * repeatedly. So rescanning the list has no problem. 
*/ + goto again; + } + spin_unlock(&lwp_register_list_lock); +} +EXPORT_SYMBOL(lustre_notify_lwp_list); + +static int lustre_lwp_connect(struct obd_device *lwp, bool is_mdt) +{ + struct lu_env env; + struct lu_context session_ctx; + struct obd_export *exp; + struct obd_uuid *uuid = NULL; + struct obd_connect_data *data = NULL; + int rc; + ENTRY; + + /* log has been fully processed, let clients connect */ + rc = lu_env_init(&env, lwp->obd_lu_dev->ld_type->ldt_ctx_tags); + if (rc != 0) + RETURN(rc); + + lu_context_init(&session_ctx, LCT_SERVER_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + env.le_ses = &session_ctx; + + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out, rc = -ENOMEM); + + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX; + data->ocd_version = LUSTRE_VERSION_CODE; + data->ocd_connect_flags |= OBD_CONNECT_FID | OBD_CONNECT_AT | + OBD_CONNECT_LRU_RESIZE | OBD_CONNECT_FULL20 | + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LIGHTWEIGHT | + OBD_CONNECT_LFSCK | OBD_CONNECT_BULK_MBITS; + + if (is_mdt) + data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS; + + OBD_ALLOC_PTR(uuid); + if (uuid == NULL) + GOTO(out, rc = -ENOMEM); + + if (strlen(lwp->obd_name) > sizeof(uuid->uuid)) { + CERROR("%s: Too long lwp name %s, max_size is %d\n", + lwp->obd_name, lwp->obd_name, (int)sizeof(uuid->uuid)); + GOTO(out, rc = -EINVAL); + } + + /* Use lwp name as the uuid, so we find the export by lwp name later */ + memcpy(uuid->uuid, lwp->obd_name, strlen(lwp->obd_name)); + rc = obd_connect(&env, &exp, lwp, uuid, data, NULL); + if (rc != 0) { + CERROR("%s: connect failed: rc = %d\n", lwp->obd_name, rc); + } else { + if (unlikely(lwp->obd_lwp_export != NULL)) + class_export_put(lwp->obd_lwp_export); + lwp->obd_lwp_export = class_export_get(exp); + } + + GOTO(out, rc); + +out: + if (data != NULL) + OBD_FREE_PTR(data); + if (uuid != NULL) + OBD_FREE_PTR(uuid); + + lu_env_fini(&env); + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + + return rc; +} + +/** + * lwp is used by slaves (Non-MDT0 targets) to manage the connection to MDT0, + * or from the OSTx to MDTy. 
+ **/ +static int lustre_lwp_setup(struct lustre_cfg *lcfg, struct lustre_sb_info *lsi, + __u32 idx) +{ + struct obd_device *obd; + char *lwpname = NULL; + char *lwpuuid = NULL; + int rc; + ENTRY; + + rc = class_add_uuid(lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid); + if (rc != 0) { + CERROR("%s: Can't add uuid: rc =%d\n", lsi->lsi_svname, rc); + RETURN(rc); + } + + OBD_ALLOC(lwpname, MTI_NAME_MAXLEN); + if (lwpname == NULL) + GOTO(out, rc = -ENOMEM); + + rc = tgt_name2lwp_name(lsi->lsi_svname, lwpname, MTI_NAME_MAXLEN, idx); + if (rc != 0) { + CERROR("%s: failed to generate lwp name: rc = %d\n", + lsi->lsi_svname, rc); + GOTO(out, rc); + } + + OBD_ALLOC(lwpuuid, MTI_NAME_MAXLEN); + if (lwpuuid == NULL) + GOTO(out, rc = -ENOMEM); + + sprintf(lwpuuid, "%s_UUID", lwpname); + rc = lustre_start_simple(lwpname, LUSTRE_LWP_NAME, + lwpuuid, lustre_cfg_string(lcfg, 1), + NULL, NULL, NULL); + if (rc) { + CERROR("%s: setup up failed: rc %d\n", lwpname, rc); + GOTO(out, rc); + } + + obd = class_name2obd(lwpname); + LASSERT(obd != NULL); + + rc = lustre_lwp_connect(obd, strstr(lsi->lsi_svname, "-MDT") != NULL); + if (rc == 0) { + obd->u.cli.cl_max_mds_easize = MAX_MD_SIZE; + mutex_lock(&lsi->lsi_lwp_mutex); + list_add_tail(&obd->obd_lwp_list, &lsi->lsi_lwp_list); + mutex_unlock(&lsi->lsi_lwp_mutex); + } else { + CERROR("%s: connect failed: rc = %d\n", lwpname, rc); + } + + GOTO(out, rc); + +out: + if (lwpname != NULL) + OBD_FREE(lwpname, MTI_NAME_MAXLEN); + if (lwpuuid != NULL) + OBD_FREE(lwpuuid, MTI_NAME_MAXLEN); + + return rc; +} + +/* the caller is responsible for memory free */ +static struct obd_device *lustre_find_lwp(struct lustre_sb_info *lsi, + char **lwpname, __u32 idx) +{ + struct obd_device *lwp; + int rc = 0; + ENTRY; + + LASSERT(lwpname != NULL); + LASSERT(IS_OST(lsi) || IS_MDT(lsi)); + + OBD_ALLOC(*lwpname, MTI_NAME_MAXLEN); + if (*lwpname == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = tgt_name2lwp_name(lsi->lsi_svname, *lwpname, MTI_NAME_MAXLEN, idx); + if (rc != 0) { + CERROR("%s: failed to generate lwp name: rc = %d\n", + lsi->lsi_svname, rc); + GOTO(out, rc = -EINVAL); + } + + lwp = class_name2obd(*lwpname); + +out: + if (rc != 0) { + if (*lwpname != NULL) { + OBD_FREE(*lwpname, MTI_NAME_MAXLEN); + *lwpname = NULL; + } + lwp = ERR_PTR(rc); + } + + RETURN(lwp != NULL ? lwp : ERR_PTR(-ENOENT)); +} + +static int lustre_lwp_add_conn(struct lustre_cfg *cfg, + struct lustre_sb_info *lsi, __u32 idx) +{ + struct lustre_cfg_bufs *bufs = NULL; + struct lustre_cfg *lcfg = NULL; + char *lwpname = NULL; + struct obd_device *lwp; + int rc; + ENTRY; + + lwp = lustre_find_lwp(lsi, &lwpname, idx); + if (IS_ERR(lwp)) { + CERROR("%s: can't find lwp device.\n", lsi->lsi_svname); + GOTO(out, rc = PTR_ERR(lwp)); + } + LASSERT(lwpname != NULL); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + GOTO(out, rc = -ENOMEM); + + lustre_cfg_bufs_reset(bufs, lwpname); + lustre_cfg_bufs_set_string(bufs, 1, + lustre_cfg_string(cfg, 1)); + + OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen)); + if (!lcfg) + GOTO(out_cfg, rc = -ENOMEM); + lustre_cfg_init(lcfg, LCFG_ADD_CONN, bufs); + + rc = class_add_conn(lwp, lcfg); + if (rc) + CERROR("%s: can't add conn: rc = %d\n", lwpname, rc); + + if (lcfg) + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens)); +out_cfg: + if (bufs != NULL) + OBD_FREE_PTR(bufs); +out: + if (lwpname != NULL) + OBD_FREE(lwpname, MTI_NAME_MAXLEN); + RETURN(rc); +} + +/** + * Retrieve MDT nids from the client log, then start the lwp device. 
+ * there are only two scenarios which would include mdt nid. + * 1. + * marker 5 (flags=0x01, v2.1.54.0) lustre-MDTyyyy 'add mdc' xxx- + * add_uuid nid=192.168.122.162@tcp(0x20000c0a87aa2) 0: 1:192.168.122.162@tcp + * attach 0:lustre-MDTyyyy-mdc 1:mdc 2:lustre-clilmv_UUID + * setup 0:lustre-MDTyyyy-mdc 1:lustre-MDTyyyy_UUID 2:192.168.122.162@tcp + * add_uuid nid=192.168.172.1@tcp(0x20000c0a8ac01) 0: 1:192.168.172.1@tcp + * add_conn 0:lustre-MDTyyyy-mdc 1:192.168.172.1@tcp + * modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDTyyyy_UUID xxxx + * marker 5 (flags=0x02, v2.1.54.0) lustre-MDTyyyy 'add mdc' xxxx- + * 2. + * marker 7 (flags=0x01, v2.1.54.0) lustre-MDTyyyy 'add failnid' xxxx- + * add_uuid nid=192.168.122.2@tcp(0x20000c0a87a02) 0: 1:192.168.122.2@tcp + * add_conn 0:lustre-MDTyyyy-mdc 1:192.168.122.2@tcp + * marker 7 (flags=0x02, v2.1.54.0) lustre-MDTyyyy 'add failnid' xxxx- + **/ +static int client_lwp_config_process(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct config_llog_instance *cfg = data; + int cfg_len = rec->lrh_len; + char *cfg_buf = (char *) (rec + 1); + struct lustre_cfg *lcfg = NULL; + struct lustre_sb_info *lsi; + int rc = 0, swab = 0; + ENTRY; + + if (rec->lrh_type != OBD_CFG_REC) { + CERROR("Unknown llog record type %#x encountered\n", + rec->lrh_type); + RETURN(-EINVAL); + } + + if (cfg->cfg_sb == NULL) + GOTO(out, rc = -EINVAL); + lsi = s2lsi(cfg->cfg_sb); + + lcfg = (struct lustre_cfg *)cfg_buf; + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); + if (rc) + GOTO(out, rc); + + switch (lcfg->lcfg_command) { + case LCFG_MARKER: { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + + lustre_swab_cfg_marker(marker, swab, + LUSTRE_CFG_BUFLEN(lcfg, 1)); + if (marker->cm_flags & CM_SKIP || + marker->cm_flags & CM_EXCLUDE) + GOTO(out, rc = 0); + + if (!tgt_is_mdt(marker->cm_tgtname, &cfg->cfg_lwp_idx)) + GOTO(out, rc = 0); + + if (IS_MDT(lsi) && cfg->cfg_lwp_idx != 0) + GOTO(out, rc = 0); + + if (!strncmp(marker->cm_comment, "add mdc", 7) || + !strncmp(marker->cm_comment, "add failnid", 11)) { + if (marker->cm_flags & CM_START) { + cfg->cfg_flags = CFG_F_MARKER; + /* This hack is to differentiate the + * ADD_UUID is come from "add mdc" record + * or from "add failnid" record. */ + if (!strncmp(marker->cm_comment, + "add failnid", 11)) + cfg->cfg_flags |= CFG_F_SKIP; + } else if (marker->cm_flags & CM_END) { + cfg->cfg_flags = 0; + } + } + break; + } + case LCFG_ADD_UUID: { + if (cfg->cfg_flags == CFG_F_MARKER) { + rc = lustre_lwp_setup(lcfg, lsi, cfg->cfg_lwp_idx); + /* XXX: process only the first nid as + * we don't need another instance of lwp */ + cfg->cfg_flags |= CFG_F_SKIP; + } else if (cfg->cfg_flags == (CFG_F_MARKER | CFG_F_SKIP)) { + rc = class_add_uuid(lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid); + if (rc) + CERROR("%s: Fail to add uuid, rc:%d\n", + lsi->lsi_svname, rc); + } + break; + } + case LCFG_ADD_CONN: { + char *devname = lustre_cfg_string(lcfg, 0); + char *ptr; + __u32 idx = 0; + + if (!is_mdc_device(devname)) + break; + + if (!(cfg->cfg_flags & CFG_F_MARKER)) { + CDEBUG(D_CONFIG, "Skipping add_conn for %s, rec %d\n", + devname, rec->lrh_index); + break; + } + + /* add_conn should follow by add_uuid. 
This + * guarantees that the lwp device was created. + */ + if (!(cfg->cfg_flags & CFG_F_SKIP)) { + CWARN("Error at config for %s rec %d, add_conn should follow add_uuid\n", + devname, rec->lrh_index); + break; + } + ptr = strrchr(devname, '-'); + if (ptr == NULL) + break; + + *ptr = 0; + if (!tgt_is_mdt(devname, &idx)) { + *ptr = '-'; + break; + } + *ptr = '-'; + + if (IS_MDT(lsi) && idx != 0) + break; + + rc = lustre_lwp_add_conn(lcfg, lsi, idx); + break; + } + default: + break; + } +out: + RETURN(rc); +} + +static int lustre_disconnect_lwp(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *lwp; + char *logname = NULL; + struct lustre_cfg_bufs *bufs = NULL; + struct config_llog_instance *cfg = NULL; + int rc = 0; + int rc1 = 0; + ENTRY; + + if (likely(lsi->lsi_lwp_started)) { + OBD_ALLOC(logname, MTI_NAME_MAXLEN); + if (logname == NULL) + RETURN(-ENOMEM); + + rc = server_name2fsname(lsi->lsi_svname, logname, NULL); + if (rc != 0) { + CERROR("%s: failed to get fsname from svname: " + "rc = %d\n", lsi->lsi_svname, rc); + GOTO(out, rc = -EINVAL); + } + + strcat(logname, "-client"); + OBD_ALLOC_PTR(cfg); + if (cfg == NULL) + GOTO(out, rc = -ENOMEM); + + /* end log first */ + cfg->cfg_instance = ll_get_cfg_instance(sb); + rc = lustre_end_log(sb, logname, cfg); + if (rc != 0 && rc != -ENOENT) + GOTO(out, rc); + + lsi->lsi_lwp_started = 0; + } + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + GOTO(out, rc = -ENOMEM); + + mutex_lock(&lsi->lsi_lwp_mutex); + list_for_each_entry(lwp, &lsi->lsi_lwp_list, obd_lwp_list) { + struct lustre_cfg *lcfg; + + if (likely(lwp->obd_lwp_export != NULL)) { + class_export_put(lwp->obd_lwp_export); + lwp->obd_lwp_export = NULL; + } + + lustre_cfg_bufs_reset(bufs, lwp->obd_name); + lustre_cfg_bufs_set_string(bufs, 1, NULL); + OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, + bufs->lcfg_buflen)); + if (!lcfg) { + rc = -ENOMEM; + break; + } + lustre_cfg_init(lcfg, LCFG_CLEANUP, bufs); + + /* Disconnect import first. NULL is passed for the '@env', + * since it will not be used. */ + rc = lwp->obd_lu_dev->ld_ops->ldo_process_config(NULL, + lwp->obd_lu_dev, lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens)); + if (rc != 0 && rc != -ETIMEDOUT) { + CERROR("%s: failed to disconnect LWP: rc = %d\n", + lwp->obd_name, rc); + rc1 = rc; + } + } + mutex_unlock(&lsi->lsi_lwp_mutex); + + GOTO(out, rc); + +out: + if (bufs != NULL) + OBD_FREE_PTR(bufs); + if (cfg != NULL) + OBD_FREE_PTR(cfg); + if (logname != NULL) + OBD_FREE(logname, MTI_NAME_MAXLEN); + + return rc1 != 0 ? rc1 : rc; +} + +/** + * Stop the lwp for an OST/MDT target. + **/ +static int lustre_stop_lwp(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *lwp; + int rc = 0; + int rc1 = 0; + ENTRY; + + mutex_lock(&lsi->lsi_lwp_mutex); + while (!list_empty(&lsi->lsi_lwp_list)) { + lwp = list_entry(lsi->lsi_lwp_list.next, struct obd_device, + obd_lwp_list); + list_del_init(&lwp->obd_lwp_list); + lwp->obd_force = 1; + mutex_unlock(&lsi->lsi_lwp_mutex); + + rc = class_manual_cleanup(lwp); + if (rc != 0) { + CERROR("%s: failed to stop LWP: rc = %d\n", + lwp->obd_name, rc); + rc1 = rc; + } + mutex_lock(&lsi->lsi_lwp_mutex); + } + mutex_unlock(&lsi->lsi_lwp_mutex); + + RETURN(rc1 != 0 ? rc1 : rc); +} + +/** + * Start the lwp (fsname-MDTyyyy-lwp-{MDT,OST}xxxx) for an MDT or OST target.
+ **/ +static int lustre_start_lwp(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_instance *cfg = NULL; + char *logname; + int rc; + ENTRY; + + if (unlikely(lsi->lsi_lwp_started)) + RETURN(0); + + OBD_ALLOC(logname, MTI_NAME_MAXLEN); + if (logname == NULL) + RETURN(-ENOMEM); + + rc = server_name2fsname(lsi->lsi_svname, logname, NULL); + if (rc != 0) { + CERROR("%s: failed to get fsname from svname: rc = %d\n", + lsi->lsi_svname, rc); + GOTO(out, rc = -EINVAL); + } + + strcat(logname, "-client"); + OBD_ALLOC_PTR(cfg); + if (cfg == NULL) + GOTO(out, rc = -ENOMEM); + + cfg->cfg_callback = client_lwp_config_process; + cfg->cfg_instance = ll_get_cfg_instance(sb); + rc = lustre_process_log(sb, logname, cfg); + /* need to remove config llog from mgc */ + lsi->lsi_lwp_started = 1; + + GOTO(out, rc); + +out: + OBD_FREE(logname, MTI_NAME_MAXLEN); + if (cfg != NULL) + OBD_FREE_PTR(cfg); + + return rc; +} + +static DEFINE_MUTEX(server_start_lock); + +/* Stop MDS/OSS if nobody is using them */ +static int server_stop_servers(int lsiflags) +{ + struct obd_device *obd = NULL; + struct obd_type *type = NULL; + int rc = 0; + bool type_last; + ENTRY; + + mutex_lock(&server_start_lock); + + /* Either an MDT or an OST or neither */ + /* if this was an MDT, and there are no more MDT's, clean up the MDS */ + if (lsiflags & LDD_F_SV_TYPE_MDT) { + obd = class_name2obd(LUSTRE_MDS_OBDNAME); + type = class_search_type(LUSTRE_MDT_NAME); + } else if (lsiflags & LDD_F_SV_TYPE_OST) { + /* if this was an OST, and there are no more OST's, clean up the OSS */ + obd = class_name2obd(LUSTRE_OSS_OBDNAME); + type = class_search_type(LUSTRE_OST_NAME); + } + + /* server_stop_servers is a pair of server_start_targets + * Here we put type which was taken at server_start_targets. + * If type is NULL then there is a wrong logic around type or + * type reference. */ + LASSERTF(type, "Server flags %d, obd %s\n", lsiflags, + obd ? 
obd->obd_name : "NULL"); + + type_last = (atomic_read(&type->typ_refcnt) == 1); + + class_put_type(type); + if (obd != NULL && type_last) { + obd->obd_force = 1; + /* obd_fail doesn't mean much on a server obd */ + rc = class_manual_cleanup(obd); + } + + /* put reference taken by class_search_type */ + kobject_put(&type->typ_kobj); + + mutex_unlock(&server_start_lock); + + RETURN(rc); +} + +int server_mti_print(const char *title, struct mgs_target_info *mti) +{ + PRINT_CMD(PRINT_MASK, "mti %s\n", title); + PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname); + PRINT_CMD(PRINT_MASK, "fs: %s\n", mti->mti_fsname); + PRINT_CMD(PRINT_MASK, "uuid: %s\n", mti->mti_uuid); + PRINT_CMD(PRINT_MASK, "ver: %d flags: %#x\n", + mti->mti_config_ver, mti->mti_flags); + return 0; +} + +/* Generate data for registration */ +static int server_lsi2mti(struct lustre_sb_info *lsi, + struct mgs_target_info *mti) +{ + struct lnet_processid id; + int rc, i = 0; + int cplen = 0; + ENTRY; + + if (!IS_SERVER(lsi)) + RETURN(-EINVAL); + + if (strlcpy(mti->mti_svname, lsi->lsi_svname, sizeof(mti->mti_svname)) + >= sizeof(mti->mti_svname)) + RETURN(-E2BIG); + + mti->mti_nid_count = 0; + while (LNetGetId(i++, &id) != -ENOENT) { + if (nid_is_lo0(&id.nid)) + continue; + + /* the server uses the --servicenode param; only allow the specified + * nids to be registered */ + if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) != 0 && + class_match_nid(lsi->lsi_lmd->lmd_params, + PARAM_FAILNODE, + lnet_nid_to_nid4(&id.nid)) < 1) + continue; + + /* match specified network */ + if (!class_match_net(lsi->lsi_lmd->lmd_params, + PARAM_NETWORK, LNET_NID_NET(&id.nid))) + continue; + + mti->mti_nids[mti->mti_nid_count] = lnet_nid_to_nid4(&id.nid); + mti->mti_nid_count++; + if (mti->mti_nid_count >= MTI_NIDS_MAX) { + CWARN("Only using first %d nids for %s\n", + mti->mti_nid_count, mti->mti_svname); + break; + } + } + + if (mti->mti_nid_count == 0) { + CERROR("Failed to get NID for server %s, please check whether " + "the target is specified with improper --servicenode or " + "--network options.\n", mti->mti_svname); + RETURN(-EINVAL); + } + + mti->mti_lustre_ver = LUSTRE_VERSION_CODE; + mti->mti_config_ver = 0; + + rc = server_name2fsname(lsi->lsi_svname, mti->mti_fsname, NULL); + if (rc != 0) + return rc; + + rc = server_name2index(lsi->lsi_svname, &mti->mti_stripe_index, NULL); + if (rc < 0) + return rc; + /* Orion requires index to be set */ + LASSERT(!(rc & LDD_F_NEED_INDEX)); + /* keep only LDD flags */ + mti->mti_flags = lsi->lsi_flags & LDD_F_MASK; + if (mti->mti_flags & (LDD_F_WRITECONF | LDD_F_VIRGIN)) + mti->mti_flags |= LDD_F_UPDATE; + cplen = strlcpy(mti->mti_params, lsi->lsi_lmd->lmd_params, + sizeof(mti->mti_params)); + if (cplen >= sizeof(mti->mti_params)) + return -E2BIG; + return 0; +} + +/* Register an old or new target with the MGS. If needed, the MGS will construct + startup logs and assign an index */ +static int server_register_target(struct lustre_sb_info *lsi) +{ + struct obd_device *mgc = lsi->lsi_mgc; + struct mgs_target_info *mti = NULL; + bool must_succeed; + int rc; + int tried = 0; + ENTRY; + + LASSERT(mgc); + + if (!IS_SERVER(lsi)) + RETURN(-EINVAL); + + OBD_ALLOC_PTR(mti); + if (!mti) + RETURN(-ENOMEM); + + rc = server_lsi2mti(lsi, mti); + if (rc) + GOTO(out, rc); + + CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n", + mti->mti_svname, mti->mti_fsname, + libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index, + mti->mti_flags); + + /* we cannot ignore registration failure if MGS logs must be updated.
*/ + must_succeed = !!(lsi->lsi_flags & + (LDD_F_NEED_INDEX | LDD_F_UPDATE | LDD_F_WRITECONF | + LDD_F_VIRGIN)); + mti->mti_flags |= LDD_F_OPC_REG; + +again: + /* Register the target */ + /* FIXME use mgc_process_config instead */ + rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp, + sizeof(KEY_REGISTER_TARGET), + KEY_REGISTER_TARGET, + sizeof(*mti), mti, NULL); + if (rc) { + if (mti->mti_flags & LDD_F_ERROR) { + LCONSOLE_ERROR_MSG(0x160, + "%s: the MGS refuses to allow this server " + "to start: rc = %d. Please see messages on " + "the MGS.\n", lsi->lsi_svname, rc); + } else if (must_succeed) { + if ((rc == -ESHUTDOWN || rc == -EIO) && ++tried < 5) { + /* The connection with MGS is not established. + * Try again after 2 seconds. Interruptable. */ + schedule_timeout_interruptible( + cfs_time_seconds(2)); + if (!signal_pending(current)) + goto again; + } + + LCONSOLE_ERROR_MSG(0x15f, + "%s: cannot register this server with the MGS: " + "rc = %d. Is the MGS running?\n", + lsi->lsi_svname, rc); + } else { + CDEBUG(D_HA, "%s: error registering with the MGS: " + "rc = %d (not fatal)\n", lsi->lsi_svname, rc); + /* reset the error code for non-fatal error. */ + rc = 0; + } + GOTO(out, rc); + } + +out: + if (mti) + OBD_FREE_PTR(mti); + RETURN(rc); +} + +/** + * Notify the MGS that this target is ready. + * Used by IR - if the MGS receives this message, it will notify clients. + */ +static int server_notify_target(struct super_block *sb, struct obd_device *obd) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + struct mgs_target_info *mti = NULL; + int rc; + ENTRY; + + LASSERT(mgc); + + if (!(IS_SERVER(lsi))) + RETURN(-EINVAL); + + OBD_ALLOC_PTR(mti); + if (!mti) + RETURN(-ENOMEM); + rc = server_lsi2mti(lsi, mti); + if (rc) + GOTO(out, rc); + + mti->mti_instance = obd->u.obt.obt_instance; + mti->mti_flags |= LDD_F_OPC_READY; + + /* FIXME use mgc_process_config instead */ + rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp, + sizeof(KEY_REGISTER_TARGET), + KEY_REGISTER_TARGET, + sizeof(*mti), mti, NULL); + + /* Imperative recovery: if the mgs informs us to use IR? */ + if (!rc && !(mti->mti_flags & LDD_F_ERROR) && + (mti->mti_flags & LDD_F_IR_CAPABLE)) + lsi->lsi_flags |= LDD_F_IR_CAPABLE; + +out: + if (mti) + OBD_FREE_PTR(mti); + RETURN(rc); + +} + +/** Start server targets: MDTs and OSTs + */ +static int server_start_targets(struct super_block *sb) +{ + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_instance cfg; + struct lu_env mgc_env; + struct lu_device *dev; + char *name_service, *obd_name_service = NULL; + struct obd_type *type = NULL; + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_svname); + + LASSERTF(IS_MDT(lsi) || IS_OST(lsi), "designed for MDT or OST only\n"); + + if (IS_MDT(lsi)) { + obd_name_service = LUSTRE_MDS_OBDNAME; + name_service = LUSTRE_MDS_NAME; + } else { + obd_name_service = LUSTRE_OSS_OBDNAME; + name_service = LUSTRE_OSS_NAME; + } + + /* make sure MDS/OSS is started */ + mutex_lock(&server_start_lock); + obd = class_name2obd(obd_name_service); + if (!obd) { + rc = lustre_start_simple(obd_name_service, name_service, + (IS_MDT(lsi) ? + LUSTRE_MDS_OBDNAME"_uuid" : + LUSTRE_OSS_OBDNAME"_uuid"), + NULL, NULL, NULL, NULL); + if (rc) { + mutex_unlock(&server_start_lock); + CERROR("failed to start %s: %d\n", + obd_name_service, rc); + RETURN(rc); + } + } + /* hold a type reference and put it at server_stop_servers */ + type = class_get_type(IS_MDT(lsi) ? 
LUSTRE_MDT_NAME : LUSTRE_OST_NAME); + if (!type) { + mutex_unlock(&server_start_lock); + GOTO(out_stop_service, rc = -ENODEV); + } + lsi->lsi_server_started = 1; + mutex_unlock(&server_start_lock); + if (OBD_FAIL_PRECHECK(OBD_FAIL_OBD_STOP_MDS_RACE) && + IS_MDT(lsi)) { + OBD_RACE(OBD_FAIL_OBD_STOP_MDS_RACE); + msleep(2 * MSEC_PER_SEC); + } + + rc = lu_env_init(&mgc_env, LCT_MG_THREAD); + if (rc != 0) + GOTO(out_stop_service, rc); + + /* Set the mgc fs to our server disk. This allows the MGC to + * read and write configs locally, in case it can't talk to the MGS. */ + rc = server_mgc_set_fs(&mgc_env, lsi->lsi_mgc, sb); + if (rc) + GOTO(out_env, rc); + + /* Register with MGS */ + rc = server_register_target(lsi); + if (rc) + GOTO(out_mgc, rc); + + /* Let the target look up the mount using the target's name + (we can't pass the sb or mnt through class_process_config.) */ + rc = server_register_mount(lsi->lsi_svname, sb); + if (rc) + GOTO(out_mgc, rc); + + /* Start targets using the llog named for the target */ + memset(&cfg, 0, sizeof(cfg)); + cfg.cfg_callback = class_config_llog_handler; + cfg.cfg_sub_clds = CONFIG_SUB_SERVER; + rc = lustre_process_log(sb, lsi->lsi_svname, &cfg); + if (rc) { + CERROR("failed to start server %s: %d\n", + lsi->lsi_svname, rc); + /* Do NOT call server_deregister_mount() here. Doing so would make + * it impossible to find the mount later at cleanup time and would + * leave @lsi and other stuff leaked. -umka */ + GOTO(out_mgc, rc); + } + + obd = class_name2obd(lsi->lsi_svname); + if (!obd) { + CERROR("no server named %s was started\n", lsi->lsi_svname); + GOTO(out_mgc, rc = -ENXIO); + } + + if (IS_OST(lsi) || IS_MDT(lsi)) { + rc = lustre_start_lwp(sb); + if (rc) { + CERROR("%s: failed to start LWP: %d\n", + lsi->lsi_svname, rc); + GOTO(out_mgc, rc); + } + } + + server_notify_target(sb, obd); + + /* calculate recovery timeout, do it after lustre_process_log */ + server_calc_timeout(lsi, obd); + + /* log has been fully processed, let clients connect */ + dev = obd->obd_lu_dev; + if (dev && dev->ld_ops->ldo_prepare) { + struct lu_env env; + + rc = lu_env_init(&env, dev->ld_type->ldt_ctx_tags); + if (rc == 0) { + struct lu_context session_ctx; + + lu_context_init(&session_ctx, LCT_SERVER_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + env.le_ses = &session_ctx; + + rc = dev->ld_ops->ldo_prepare(&env, NULL, dev); + + lu_env_fini(&env); + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + } + } + + /* abort recovery only on the complete stack: + * many devices can be involved */ + if ((lsi->lsi_lmd->lmd_flags & + (LMD_FLG_ABORT_RECOV | LMD_FLG_ABORT_RECOV_MDT)) && + (OBP(obd, iocontrol))) { + struct obd_ioctl_data karg = { + .ioc_type = lsi->lsi_lmd->lmd_flags, + }; + + obd_iocontrol(OBD_IOC_ABORT_RECOVERY, obd->obd_self_export, 0, + &karg, NULL); + } + +out_mgc: + /* Release the mgc fs for others to use */ + server_mgc_clear_fs(&mgc_env, lsi->lsi_mgc); +out_env: + lu_env_fini(&mgc_env); +out_stop_service: + /* in case of error the upper-level caller does the cleanup via + * server_put_super()->server_stop_servers() + */ + + RETURN(rc); +} + +static int lsi_prepare(struct lustre_sb_info *lsi) +{ + const char *osd_type; + const char *fstype; + __u32 index; + int rc; + ENTRY; + + LASSERT(lsi); + LASSERT(lsi->lsi_lmd); + + /* The server name is given as a mount line option */ + if (lsi->lsi_lmd->lmd_profile == NULL) { + LCONSOLE_ERROR("Can't determine server name\n"); + RETURN(-EINVAL); + } + + /* Determine osd type */ + if (lsi->lsi_lmd->lmd_osd_type == NULL) { +
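/* No osd= mount option was given: fall back to the ldiskfs backend */ +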
osd_type = LUSTRE_OSD_LDISKFS_NAME; + fstype = "ldiskfs"; + } else { + osd_type = lsi->lsi_lmd->lmd_osd_type; + fstype = lsi->lsi_lmd->lmd_osd_type; + } + + if (strlen(lsi->lsi_lmd->lmd_profile) >= sizeof(lsi->lsi_svname) || + strlen(osd_type) >= sizeof(lsi->lsi_osd_type) || + strlen(fstype) >= sizeof(lsi->lsi_fstype)) + RETURN(-ENAMETOOLONG); + + strlcpy(lsi->lsi_svname, lsi->lsi_lmd->lmd_profile, + sizeof(lsi->lsi_svname)); + strlcpy(lsi->lsi_osd_type, osd_type, sizeof(lsi->lsi_osd_type)); + /* XXX: a temp. solution for components using ldiskfs + * to be removed in one of the subsequent patches */ + strlcpy(lsi->lsi_fstype, fstype, sizeof(lsi->lsi_fstype)); + + /* Determine server type */ + rc = server_name2index(lsi->lsi_svname, &index, NULL); + if (rc < 0) { + if (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) { + /* Assume we're a bare MGS */ + rc = 0; + lsi->lsi_lmd->lmd_flags |= LMD_FLG_NOSVC; + } else { + LCONSOLE_ERROR("Can't determine server type of '%s'\n", + lsi->lsi_svname); + RETURN(rc); + } + } + lsi->lsi_flags |= rc; + + /* Add mount line flags that used to be in ldd: + * writeconf, mgs, anything else? + */ + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF) ? + LDD_F_WRITECONF : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_LOCAL_LOGS) ? + LDD_F_NO_LOCAL_LOGS : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_VIRGIN) ? + LDD_F_VIRGIN : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_UPDATE) ? + LDD_F_UPDATE : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) ? + LDD_F_SV_TYPE_MGS : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) ? + LDD_F_NO_PRIMNODE : 0; + + RETURN(0); +} + +/*************** server mount ******************/ + +/** Start the shutdown of servers at umount. + */ +static void server_put_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char *tmpname, *extraname = NULL; + int tmpname_sz; + int lsiflags = lsi->lsi_flags; + bool stop_servers = lsi->lsi_server_started; + ENTRY; + + LASSERT(IS_SERVER(lsi)); + + tmpname_sz = strlen(lsi->lsi_svname) + 1; + OBD_ALLOC(tmpname, tmpname_sz); + memcpy(tmpname, lsi->lsi_svname, tmpname_sz); + CDEBUG(D_MOUNT, "server put_super %s\n", tmpname); + if (IS_MDT(lsi) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC)) + snprintf(tmpname, tmpname_sz, "MGS"); + + /* disconnect the lwp first to drain off the inflight request */ + if (IS_OST(lsi) || IS_MDT(lsi)) { + int rc; + + rc = lustre_disconnect_lwp(sb); + if (rc != 0 && rc != -ETIMEDOUT && + rc != -ENOTCONN && rc != -ESHUTDOWN) + CWARN("%s: failed to disconnect lwp: rc= %d\n", + tmpname, rc); + } + + /* Stop the target */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && + (IS_MDT(lsi) || IS_OST(lsi))) { + struct lustre_profile *lprof = NULL; + + /* tell the mgc to drop the config log */ + lustre_end_log(sb, lsi->lsi_svname, NULL); + + /* COMPAT_146 - profile may get deleted in mgc_cleanup. + If there are any setup/cleanup errors, save the lov + name for safety cleanup later. 
*/ + lprof = class_get_profile(lsi->lsi_svname); + if (lprof != NULL) { + if (lprof->lp_dt != NULL) { + OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1); + strncpy(extraname, lprof->lp_dt, + strlen(lprof->lp_dt) + 1); + } + class_put_profile(lprof); + } + + obd = class_name2obd(lsi->lsi_svname); + if (obd) { + CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name); + if (lsiflags & LSI_UMOUNT_FAILOVER) + obd->obd_fail = 1; + /* We can't seem to give an error return code + * to .put_super, so we better make sure we clean up! */ + obd->obd_force = 1; + class_manual_cleanup(obd); + if (OBD_FAIL_PRECHECK(OBD_FAIL_OBD_STOP_MDS_RACE)) { + int idx; + server_name2index(lsi->lsi_svname, &idx, NULL); + /* sleeping for MDT0001 */ + if (idx == 1) + OBD_RACE(OBD_FAIL_OBD_STOP_MDS_RACE); + } + } else { + CERROR("no obd %s\n", lsi->lsi_svname); + server_deregister_mount(lsi->lsi_svname); + } + } + + /* If they wanted the mgs to stop separately from the mdt, they + should have put it on a different device. */ + if (IS_MGS(lsi)) { + /* if MDS start with --nomgs, don't stop MGS then */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) + server_stop_mgs(sb); + } + + if (IS_OST(lsi) || IS_MDT(lsi)) { + if (lustre_stop_lwp(sb) < 0) + CERROR("%s: failed to stop lwp!\n", tmpname); + } + + /* Clean the mgc and sb */ + lustre_common_put_super(sb); + + /* wait till all in-progress cleanups are done + * specifically we're interested in ofd cleanup + * as it pins OSS */ + obd_zombie_barrier(); + + /* Stop the servers (MDS, OSS) if no longer needed. We must wait + until the target is really gone so that our type refcount check + is right. */ + if (stop_servers) + server_stop_servers(lsiflags); + + /* In case of startup or cleanup err, stop related obds */ + if (extraname) { + obd = class_name2obd(extraname); + if (obd) { + CWARN("Cleaning orphaned obd %s\n", extraname); + obd->obd_force = 1; + class_manual_cleanup(obd); + } + OBD_FREE(extraname, strlen(extraname) + 1); + } + + LCONSOLE_WARN("server umount %s complete\n", tmpname); + OBD_FREE(tmpname, tmpname_sz); + EXIT; +} + +/** Called only for 'umount -f' + */ +static void server_umount_begin(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + CDEBUG(D_MOUNT, "umount -f\n"); + /* umount = failover + umount -f = force + no third way to do non-force, non-failover */ + lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER; + EXIT; +} + +static int server_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_statfs statfs; + int rc; + ENTRY; + + if (lsi->lsi_dt_dev) { + rc = dt_statfs(NULL, lsi->lsi_dt_dev, &statfs); + if (rc == 0) { + statfs_unpack(buf, &statfs); + buf->f_type = sb->s_magic; + RETURN(0); + } + } + + /* just return 0 */ + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = 1; + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_files = 1; + buf->f_ffree = 0; + buf->f_namelen = NAME_MAX; + RETURN(0); +} + +int server_show_options(struct seq_file *seq, struct dentry *dentry) +{ + struct lustre_sb_info *lsi; + struct lustre_mount_data *lmd; + + LASSERT(seq != NULL && dentry != NULL); + lsi = s2lsi(dentry->d_sb); + lmd = lsi->lsi_lmd; + seq_printf(seq, ",svname=%s", lmd->lmd_profile); + + if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV) + seq_puts(seq, ",abort_recov"); + + if (lmd->lmd_flags & LMD_FLG_NOIR) + seq_puts(seq, ",noir"); + + if (lmd->lmd_flags & LMD_FLG_NOSVC) + seq_puts(seq, ",nosvc"); + + if (lmd->lmd_flags & LMD_FLG_NOMGS) + 
seq_puts(seq, ",nomgs"); + + if (lmd->lmd_flags & LMD_FLG_NOSCRUB) + seq_puts(seq, ",noscrub"); + if (lmd->lmd_flags & LMD_FLG_SKIP_LFSCK) + seq_puts(seq, ",skip_lfsck"); + + if (lmd->lmd_flags & LMD_FLG_DEV_RDONLY) + seq_puts(seq, ",rdonly_dev"); + + if (lmd->lmd_flags & LMD_FLG_MGS) + seq_puts(seq, ",mgs"); + + if (lmd->lmd_mgs != NULL) + seq_printf(seq, ",mgsnode=%s", lmd->lmd_mgs); + + if (lmd->lmd_osd_type != NULL) + seq_printf(seq, ",osd=%s", lmd->lmd_osd_type); + + if (lmd->lmd_opts != NULL) { + seq_putc(seq, ','); + seq_puts(seq, lmd->lmd_opts); + } + + RETURN(0); +} + +/** The operations we support directly on the superblock: + * mount, umount, and df. + */ +static const struct super_operations server_ops = { + .put_super = server_put_super, + .umount_begin = server_umount_begin, /* umount -f */ + .statfs = server_statfs, + .show_options = server_show_options, +}; + +/* + * Xattr support for Lustre servers + */ +#ifdef HAVE_IOP_XATTR +static ssize_t lustre_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + if (!selinux_is_enabled()) + return -EOPNOTSUPP; + return -ENODATA; +} + +static int lustre_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} +#endif + +static ssize_t lustre_listxattr(struct dentry *d_entry, char *name, + size_t size) +{ + return -EOPNOTSUPP; +} + +static bool is_cmd_supported(unsigned int command) +{ + switch (command) { + case FITRIM: + return true; + default: + return false; + } + + return false; +} + +static long server_ioctl(struct file *filp, unsigned int command, + unsigned long arg) +{ + struct file active_filp; + struct inode *inode = file_inode(filp); + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + struct super_block *dd_sb = dt_mnt_sb_get(lsi->lsi_dt_dev); + struct inode *active_inode; + int err = -EOPNOTSUPP; + + if (IS_ERR(dd_sb) || !is_cmd_supported(command)) + return err; + + active_inode = igrab(dd_sb->s_root->d_inode); + if (!active_inode) + return -EACCES; + + active_filp.f_inode = active_inode; + if (active_inode->i_fop && active_inode->i_fop->unlocked_ioctl) + err = active_inode->i_fop->unlocked_ioctl(&active_filp, + command, arg); + iput(active_inode); + return err; +} + +static const struct inode_operations server_inode_operations = { +#ifdef HAVE_IOP_XATTR + .setxattr = lustre_setxattr, + .getxattr = lustre_getxattr, +#endif + .listxattr = lustre_listxattr, +}; + +static const struct file_operations server_file_operations = { + .unlocked_ioctl = server_ioctl, +}; + +#define log2(n) ffz(~(n)) +#define LUSTRE_SUPER_MAGIC 0x0BD00BD1 + +static int server_fill_super_common(struct super_block *sb) +{ + struct inode *root = NULL; + ENTRY; + + CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = log2(sb->s_blocksize); + sb->s_magic = LUSTRE_SUPER_MAGIC; + sb->s_maxbytes = 0; /* we don't allow file IO on server mountpoints */ + sb->s_flags |= SB_RDONLY; + sb->s_op = &server_ops; + + root = new_inode(sb); + if (!root) { + CERROR("Can't make root inode\n"); + RETURN(-EIO); + } + + /* returns -EIO for every operation */ + /* make_bad_inode(root); -- badness - can't umount */ + /* apparently we need to be a directory for the mount to finish */ + root->i_mode = S_IFDIR; + root->i_op = &server_inode_operations; + root->i_fop = &server_file_operations; + sb->s_root = d_make_root(root); + if (!sb->s_root) { + CERROR("%s: can't make root dentry\n", sb->s_id); + RETURN(-EIO); + } + + 
RETURN(0); +} + +static int osd_start(struct lustre_sb_info *lsi, unsigned long mflags) +{ + struct lustre_mount_data *lmd = lsi->lsi_lmd; + struct obd_device *obd; + struct dt_device_param p; + char flagstr[20 + 1 + 10 + 1]; + int rc; + ENTRY; + + CDEBUG(D_MOUNT, + "Attempting to start %s, type=%s, lsifl=%x, mountfl=%lx\n", + lsi->lsi_svname, lsi->lsi_osd_type, lsi->lsi_flags, mflags); + + sprintf(lsi->lsi_osd_obdname, "%s-osd", lsi->lsi_svname); + strcpy(lsi->lsi_osd_uuid, lsi->lsi_osd_obdname); + strcat(lsi->lsi_osd_uuid, "_UUID"); + snprintf(flagstr, sizeof(flagstr), "%lu:%u", mflags, lmd->lmd_flags); + + obd = class_name2obd(lsi->lsi_osd_obdname); + if (obd == NULL) { + rc = lustre_start_simple(lsi->lsi_osd_obdname, + lsi->lsi_osd_type, + lsi->lsi_osd_uuid, lmd->lmd_dev, + flagstr, lsi->lsi_lmd->lmd_opts, + lsi->lsi_svname); + if (rc) + GOTO(out, rc); + obd = class_name2obd(lsi->lsi_osd_obdname); + LASSERT(obd); + } else { + CDEBUG(D_MOUNT, "%s already started\n", lsi->lsi_osd_obdname); + /* but continue setup to allow special case of MDT and internal + * MGT being started separately. */ + if (!((IS_MGS(lsi) && (lsi->lsi_lmd->lmd_flags & + LMD_FLG_NOMGS)) || + (IS_MDT(lsi) && (lsi->lsi_lmd->lmd_flags & + LMD_FLG_NOSVC)))) + RETURN(-EALREADY); + } + + rc = obd_connect(NULL, &lsi->lsi_osd_exp, + obd, &obd->obd_uuid, NULL, NULL); + + if (rc) { + obd->obd_force = 1; + class_manual_cleanup(obd); + lsi->lsi_dt_dev = NULL; + RETURN(rc); + } + + LASSERT(obd->obd_lu_dev); + lu_device_get(obd->obd_lu_dev); + lsi->lsi_dt_dev = lu2dt_dev(obd->obd_lu_dev); + LASSERT(lsi->lsi_dt_dev); + + /* set disk context for llog usage */ + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + obd->obd_lvfs_ctxt.dt = lsi->lsi_dt_dev; + + dt_conf_get(NULL, lsi->lsi_dt_dev, &p); +out: + RETURN(rc); +} + +/** Fill in the superblock info for a Lustre server. + * Mount the device with the correct options. + * Read the on-disk config file. + * Start the services. + */ +int server_fill_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + int rc; + ENTRY; + + /* to simulate target mount race */ + OBD_RACE(OBD_FAIL_TGT_MOUNT_RACE); + + rc = lsi_prepare(lsi); + if (rc) { + lustre_put_lsi(sb); + RETURN(rc); + } + + /* Start low level OSD */ + rc = osd_start(lsi, sb->s_flags); + if (rc) { + CERROR("Unable to start osd on %s: %d\n", + lsi->lsi_lmd->lmd_dev, rc); + lustre_put_lsi(sb); + RETURN(rc); + } + + CDEBUG(D_MOUNT, "Found service %s on device %s\n", + lsi->lsi_svname, lsi->lsi_lmd->lmd_dev); + + if (class_name2obd(lsi->lsi_svname)) { + LCONSOLE_ERROR_MSG(0x161, "The target named %s is already " + "running. Double-mount may have compromised" + " the disk journal.\n", + lsi->lsi_svname); + lustre_put_lsi(sb); + RETURN(-EALREADY); + } + + /* Start MGS before MGC */ + if (IS_MGS(lsi) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) { + rc = server_start_mgs(sb); + if (rc) + GOTO(out_mnt, rc); + } + + /* Start MGC before servers */ + rc = lustre_start_mgc(sb); + if (rc) + GOTO(out_mnt, rc); + + /* Set up all obd devices for service */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && + (IS_OST(lsi) || IS_MDT(lsi))) { + rc = server_start_targets(sb); + if (rc < 0) { + CERROR("Unable to start targets: %d\n", rc); + GOTO(out_mnt, rc); + } + /* FIXME overmount client here, or can we just start a + * client log and client_fill_super on this sb? We + * need to make sure server_put_super gets called too + * - ll_put_super calls lustre_common_put_super; check + * there for LSI_SERVER flag, call s_p_s if so. 
+ * + * Probably should start client from new thread so we + * can return. Client will not finish until all + * servers are connected. Note - MGS-only server does + * NOT get a client, since there is no lustre fs + * associated - the MGS is for all lustre fs's */ + } + + rc = server_fill_super_common(sb); + if (rc) + GOTO(out_mnt, rc); + + RETURN(0); +out_mnt: + /* We jump here in case of failure while starting targets or MGS. + * In this case we can't just put @mnt and have to do real cleanup + * with stoping targets, etc. */ + server_put_super(sb); + return rc; +} +EXPORT_SYMBOL(server_fill_super); + +/* + * Calculate timeout value for a target. + */ +void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd) +{ + struct lustre_mount_data *lmd; + int soft = 0; + int hard = 0; + int factor = 0; + bool has_ir = !!(lsi->lsi_flags & LDD_F_IR_CAPABLE); + int min = OBD_RECOVERY_TIME_MIN; + + LASSERT(IS_SERVER(lsi)); + + lmd = lsi->lsi_lmd; + if (lmd) { + soft = lmd->lmd_recovery_time_soft; + hard = lmd->lmd_recovery_time_hard; + has_ir = has_ir && !(lmd->lmd_flags & LMD_FLG_NOIR); + obd->obd_no_ir = !has_ir; + } + + if (soft == 0) + soft = OBD_RECOVERY_TIME_SOFT; + if (hard == 0) + hard = OBD_RECOVERY_TIME_HARD; + + /* target may have ir_factor configured. */ + factor = OBD_IR_FACTOR_DEFAULT; + if (obd->obd_recovery_ir_factor) + factor = obd->obd_recovery_ir_factor; + + if (has_ir) { + int new_soft = soft; + + /* adjust timeout value by imperative recovery */ + new_soft = (soft * factor) / OBD_IR_FACTOR_MAX; + /* make sure the timeout is not too short */ + new_soft = max(min, new_soft); + + LCONSOLE_INFO("%s: Imperative Recovery enabled, recovery " + "window shrunk from %d-%d down to %d-%d\n", + obd->obd_name, soft, hard, new_soft, hard); + + soft = new_soft; + } else { + LCONSOLE_INFO("%s: Imperative Recovery not enabled, recovery " + "window %d-%d\n", obd->obd_name, soft, hard); + } + + /* we're done */ + obd->obd_recovery_timeout = max_t(time64_t, obd->obd_recovery_timeout, + soft); + obd->obd_recovery_time_hard = hard; + obd->obd_recovery_ir_factor = factor; +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c new file mode 100644 index 0000000000000..5d9430a0930e9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c @@ -0,0 +1,687 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/obd_sysfs.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +struct static_lustre_uintvalue_attr { + struct { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); + } u; + int *value; +}; + +static ssize_t static_uintvalue_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct static_lustre_uintvalue_attr *lattr = (void *)attr; + + return sprintf(buf, "%d\n", *lattr->value); +} + +static ssize_t static_uintvalue_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct static_lustre_uintvalue_attr *lattr = (void *)attr; + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + *lattr->value = val; + + return count; +} + +#define LUSTRE_STATIC_UINT_ATTR(name, value) \ +static struct static_lustre_uintvalue_attr lustre_sattr_##name = \ + { __ATTR(name, 0644, static_uintvalue_show, \ + static_uintvalue_store), value } + +LUSTRE_STATIC_UINT_ATTR(timeout, &obd_timeout); +LUSTRE_STATIC_UINT_ATTR(debug_peer_on_timeout, &obd_debug_peer_on_timeout); +LUSTRE_STATIC_UINT_ATTR(dump_on_timeout, &obd_dump_on_timeout); +LUSTRE_STATIC_UINT_ATTR(dump_on_eviction, &obd_dump_on_eviction); +LUSTRE_STATIC_UINT_ATTR(at_min, &at_min); +LUSTRE_STATIC_UINT_ATTR(at_max, &at_max); +LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra); +LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin); +LUSTRE_STATIC_UINT_ATTR(at_history, &at_history); +LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction); + +#ifdef HAVE_SERVER_SUPPORT +LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout); +LUSTRE_STATIC_UINT_ATTR(bulk_timeout, &bulk_timeout); +#endif + +static ssize_t memused_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", obd_memory_sum()); +} +LUSTRE_RO_ATTR(memused); + +static ssize_t memused_max_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", obd_memory_max()); +} +LUSTRE_RO_ATTR(memused_max); + +static ssize_t max_dirty_mb_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", + obd_max_dirty_pages / (1 << (20 - PAGE_SHIFT))); +} + +static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc) + return rc; + + val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ + + if (val > ((cfs_totalram_pages() / 10) * 9)) { + /* Somebody wants to assign too much memory to dirty pages */ + return -EINVAL; + } + + if (val < 4 << (20 - PAGE_SHIFT)) { + /* Less than 4 Mb for dirty cache is also bad */ + return -EINVAL; + } + + obd_max_dirty_pages = val; + + return count; +} +LUSTRE_RW_ATTR(max_dirty_mb); + +#ifdef HAVE_SERVER_SUPPORT +static ssize_t 
no_transno_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd; + unsigned int idx; + int rc; + + rc = kstrtouint(buffer, 10, &idx); + if (rc) + return rc; + + obd = class_num2obd(idx); + if (!obd || !obd->obd_attached) { + if (obd) + CERROR("%s: not attached\n", obd->obd_name); + return -ENODEV; + } + + spin_lock(&obd->obd_dev_lock); + obd->obd_no_transno = 1; + spin_unlock(&obd->obd_dev_lock); + return count; +} +LUSTRE_WO_ATTR(no_transno); +#endif /* HAVE_SERVER_SUPPORT */ + +static ssize_t version_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", LUSTRE_VERSION_STRING); +} + +static ssize_t pinger_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ +#ifdef CONFIG_LUSTRE_FS_PINGER + const char *state = "on"; +#else + const char *state = "off"; +#endif + return sprintf(buf, "%s\n", state); +} + +/** + * Check all obd devices health + * + * \param kobj + * \param buf [in] + * + * \retval number of characters printed if healthy + */ +static ssize_t +health_check_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + bool healthy = true; + size_t len = 0; + int i; + + if (libcfs_catastrophe) + return sprintf(buf, "LBUG\n"); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd; + + obd = class_num2obd(i); + if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) + continue; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + continue; + + class_incref(obd, __func__, current); + read_unlock(&obd_dev_lock); + + if (obd_health_check(NULL, obd)) + healthy = false; + + class_decref(obd, __func__, current); + read_lock(&obd_dev_lock); + + if (!healthy) + break; + } + read_unlock(&obd_dev_lock); + + if (healthy) + len = sprintf(buf, "healthy\n"); + else + len = sprintf(buf, "NOT HEALTHY\n"); + + return len; +} + +static ssize_t jobid_var_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int rc = 0; + + if (strlen(obd_jobid_var)) + rc = scnprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_var); + return rc; +} + +static ssize_t jobid_var_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) + return -EINVAL; + + memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); + + memcpy(obd_jobid_var, buffer, count); + + /* Trim the trailing '\n' if any */ + if (obd_jobid_var[count - 1] == '\n') + obd_jobid_var[count - 1] = 0; + + return count; +} + +static ssize_t jobid_name_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int rc = 0; + + if (strlen(obd_jobid_name)) + rc = scnprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_name); + return rc; +} + +static ssize_t jobid_name_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + if (!count || count > LUSTRE_JOBID_SIZE) + return -EINVAL; + + if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) != 0 && + !strchr(buffer, '%')) { + lustre_jobid_clear(buffer); + return count; + } + + /* clear previous value */ + memset(obd_jobid_name, 0, LUSTRE_JOBID_SIZE); + + memcpy(obd_jobid_name, buffer, count); + + /* Trim the trailing '\n' if any */ + if (obd_jobid_name[count - 1] == '\n') { + /* Don't echo just a newline */ + if (count == 1) + return -EINVAL; + obd_jobid_name[count - 1] = 0; + } + + return count; +} + +static ssize_t jobid_this_session_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + 
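+ /* Print the job ID of the current session, i.e. the value most
+ * recently set through jobid_this_session_store() below; reading
+ * returns -ENOENT when no session job ID has been set. */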
char *jid; + int ret = -ENOENT; + + rcu_read_lock(); + jid = jobid_current(); + if (jid) + ret = scnprintf(buf, PAGE_SIZE, "%s\n", jid); + rcu_read_unlock(); + return ret; +} + +static ssize_t jobid_this_session_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + char *jobid; + int len; + int ret; + + if (!count || count > LUSTRE_JOBID_SIZE) + return -EINVAL; + + jobid = kstrndup(buffer, count, GFP_KERNEL); + if (!jobid) + return -ENOMEM; + len = strcspn(jobid, "\n "); + jobid[len] = '\0'; + ret = jobid_set_current(jobid); + kfree(jobid); + + return ret ?: count; +} + +/* Root for /sys/kernel/debug/lustre */ +struct dentry *debugfs_lustre_root; +EXPORT_SYMBOL_GPL(debugfs_lustre_root); + +#ifdef CONFIG_PROC_FS +/* Root for /proc/fs/lustre */ +struct proc_dir_entry *proc_lustre_root; +EXPORT_SYMBOL(proc_lustre_root); +#else +#define lprocfs_base NULL +#endif /* CONFIG_PROC_FS */ + +LUSTRE_RO_ATTR(version); +LUSTRE_RO_ATTR(pinger); +LUSTRE_RO_ATTR(health_check); +LUSTRE_RW_ATTR(jobid_var); +LUSTRE_RW_ATTR(jobid_name); +LUSTRE_RW_ATTR(jobid_this_session); + +static struct attribute *lustre_attrs[] = { + &lustre_attr_version.attr, + &lustre_attr_pinger.attr, + &lustre_attr_health_check.attr, + &lustre_attr_jobid_name.attr, + &lustre_attr_jobid_var.attr, + &lustre_attr_jobid_this_session.attr, + &lustre_sattr_timeout.u.attr, + &lustre_attr_max_dirty_mb.attr, + &lustre_sattr_debug_peer_on_timeout.u.attr, + &lustre_sattr_dump_on_timeout.u.attr, + &lustre_sattr_dump_on_eviction.u.attr, + &lustre_sattr_at_min.u.attr, + &lustre_sattr_at_max.u.attr, + &lustre_sattr_at_extra.u.attr, + &lustre_sattr_at_early_margin.u.attr, + &lustre_sattr_at_history.u.attr, + &lustre_attr_memused_max.attr, + &lustre_attr_memused.attr, +#ifdef HAVE_SERVER_SUPPORT + &lustre_sattr_ldlm_timeout.u.attr, + &lustre_sattr_bulk_timeout.u.attr, + &lustre_attr_no_transno.attr, +#endif + &lustre_sattr_lbug_on_eviction.u.attr, + NULL, +}; + +static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) +{ + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static void obd_device_list_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static int obd_device_list_seq_show(struct seq_file *p, void *v) +{ + loff_t index = *(loff_t *)v; + struct obd_device *obd = class_num2obd((int)index); + char *status; + + if (obd == NULL) + return 0; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_inactive) + status = "IN"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + + seq_printf(p, "%3d %s %s %s %s %d\n", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + return 0; +} + +static const struct seq_operations obd_device_list_sops = { + .start = obd_device_list_seq_start, + .stop = obd_device_list_seq_stop, + .next = obd_device_list_seq_next, + .show = obd_device_list_seq_show, +}; + +static int obd_device_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = seq_open(file, &obd_device_list_sops); + + if (rc) + return rc; + + seq = file->private_data; + seq->private = inode->i_private; + return 0; +} + +static const struct file_operations obd_device_list_fops = { + .owner = THIS_MODULE, + 
.open = obd_device_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* checksum_speed */ +static void *checksum_speed_start(struct seq_file *p, loff_t *pos) +{ + return pos; +} + +static void checksum_speed_stop(struct seq_file *p, void *v) +{ +} + +static void *checksum_speed_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++(*pos); + if (*pos >= CFS_HASH_ALG_SPEED_MAX - 1) + return NULL; + + return pos; +} + +static int checksum_speed_show(struct seq_file *p, void *v) +{ + loff_t index = *(loff_t *)v; + + if (!index || index > CFS_HASH_ALG_SPEED_MAX - 1) + return 0; + + seq_printf(p, "%s: %d\n", cfs_crypto_hash_name(index), + cfs_crypto_hash_speeds[index]); + + return 0; +} + +static const struct seq_operations checksum_speed_sops = { + .start = checksum_speed_start, + .stop = checksum_speed_stop, + .next = checksum_speed_next, + .show = checksum_speed_show, +}; + +static int checksum_speed_open(struct inode *inode, struct file *file) +{ + int rc = seq_open(file, &checksum_speed_sops); + + if (rc) + return rc; + + return 0; +} + +static const struct file_operations checksum_speed_fops = { + .owner = THIS_MODULE, + .open = checksum_speed_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int +health_check_seq_show(struct seq_file *m, void *unused) +{ + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd; + + obd = class_num2obd(i); + if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) + continue; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + continue; + + class_incref(obd, __func__, current); + read_unlock(&obd_dev_lock); + + if (obd_health_check(NULL, obd)) { + seq_printf(m, "device %s reported unhealthy\n", + obd->obd_name); + } + class_decref(obd, __func__, current); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(health_check); + +struct kset *lustre_kset; +EXPORT_SYMBOL_GPL(lustre_kset); + +static struct attribute_group lustre_attr_group = { + .attrs = lustre_attrs, +}; + +ssize_t class_set_global(const char *param) +{ + const char *value = strchr(param, '=') + 1; + size_t off = value - param - 1; + ssize_t count = -ENOENT; + int i; + + for (i = 0; lustre_attrs[i]; i++) { + if (!strncmp(lustre_attrs[i]->name, param, off)) { + count = lustre_attr_store(&lustre_kset->kobj, + lustre_attrs[i], value, + strlen(value)); + break; + } + } + return count; +} + +int class_procfs_init(void) +{ + struct proc_dir_entry *entry; + struct dentry *file; + int rc = -ENOMEM; + + ENTRY; + + lustre_kset = kset_create_and_add("lustre", NULL, fs_kobj); + if (!lustre_kset) + goto out; + + /* Create the files associated with this kobject */ + rc = sysfs_create_group(&lustre_kset->kobj, &lustre_attr_group); + if (rc) { + kset_unregister(lustre_kset); + goto out; + } + + rc = jobid_cache_init(); + if (rc) { + kset_unregister(lustre_kset); + goto out; + } + + debugfs_lustre_root = debugfs_create_dir("lustre", NULL); + + file = debugfs_create_file("devices", 0444, debugfs_lustre_root, NULL, + &obd_device_list_fops); + + file = debugfs_create_file("health_check", 0444, debugfs_lustre_root, + NULL, &health_check_fops); + + file = debugfs_create_file("checksum_speed", 0444, debugfs_lustre_root, + NULL, &checksum_speed_fops); + + entry = lprocfs_register("fs/lustre", NULL, NULL, NULL); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CERROR("cannot create '/proc/fs/lustre': rc = %d\n", rc); + 
debugfs_remove_recursive(debugfs_lustre_root); + kset_unregister(lustre_kset); + goto out; + } + + proc_lustre_root = entry; +out: + RETURN(rc); +} + +int class_procfs_clean(void) +{ + ENTRY; + + debugfs_remove_recursive(debugfs_lustre_root); + + debugfs_lustre_root = NULL; + jobid_cache_fini(); + + if (proc_lustre_root) + lprocfs_remove(&proc_lustre_root); + + sysfs_remove_group(&lustre_kset->kobj, &lustre_attr_group); + + kset_unregister(lustre_kset); + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c new file mode 100644 index 0000000000000..d17d19741b6ba --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c @@ -0,0 +1,225 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include +#include + +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent) +{ + dst->o_parent_oid = fid_oid(parent); + dst->o_parent_seq = fid_seq(parent); + dst->o_parent_ver = fid_ver(parent); + dst->o_valid |= OBD_MD_FLPARENT | OBD_MD_FLFID; +} +EXPORT_SYMBOL(obdo_set_parent_fid); + +void obdo_set_o_projid(struct obdo *dst, u32 projid) +{ + dst->o_projid = projid; + dst->o_valid |= OBD_MD_FLPROJID; +} +EXPORT_SYMBOL(obdo_set_o_projid); + +/* + * WARNING: the file systems must take care not to tinker with + * attributes they don't manage (such as blocks). 
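+ *
+ * Illustrative use (a sketch only, not code from this patch; the "oa"
+ * variable and the chosen mask are assumptions): a caller packs just
+ * the attributes it wants to send, e.g.
+ *
+ *     obdo_from_inode(oa, inode, OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+ *                     OBD_MD_FLSIZE);
+ *
+ * Only the bits that were both requested in the valid mask and copied
+ * from the inode end up set in oa->o_valid.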
+ */ +void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid) +{ + u64 newvalid = 0; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid %#llx, new time %lld/%lld\n", + valid, (s64) src->i_mtime.tv_sec, + (s64) src->i_ctime.tv_sec); + + if (valid & OBD_MD_FLATIME) { + dst->o_atime = src->i_atime.tv_sec; + newvalid |= OBD_MD_FLATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->o_mtime = src->i_mtime.tv_sec; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->o_ctime = src->i_ctime.tv_sec; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->o_size = i_size_read(src); + newvalid |= OBD_MD_FLSIZE; + } + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = src->i_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ + dst->o_blksize = 1U << src->i_blkbits; + newvalid |= OBD_MD_FLBLKSZ; + } + if (valid & OBD_MD_FLTYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (src->i_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (src->i_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & OBD_MD_FLUID) { + dst->o_uid = from_kuid(&init_user_ns, src->i_uid); + newvalid |= OBD_MD_FLUID; + } + if (valid & OBD_MD_FLGID) { + dst->o_gid = from_kgid(&init_user_ns, src->i_gid); + newvalid |= OBD_MD_FLGID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->o_flags = src->i_flags; + newvalid |= OBD_MD_FLFLAGS; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_inode); + +void obdo_cpy_md(struct obdo *dst, const struct obdo *src, u64 valid) +{ + CDEBUG(D_INODE, "src obdo "DOSTID" valid %#llx, dst obdo "DOSTID"\n", + POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); + if (valid & OBD_MD_FLATIME) + dst->o_atime = src->o_atime; + if (valid & OBD_MD_FLMTIME) + dst->o_mtime = src->o_mtime; + if (valid & OBD_MD_FLCTIME) + dst->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + dst->o_size = src->o_size; + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + dst->o_blocks = src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + dst->o_blksize = src->o_blksize; + if (valid & OBD_MD_FLTYPE) + dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); + if (valid & OBD_MD_FLMODE) + dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->o_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->o_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->o_flags = src->o_flags; + if (valid & OBD_MD_FLFID) { + dst->o_parent_seq = src->o_parent_seq; + dst->o_parent_ver = src->o_parent_ver; + } + if (valid & OBD_MD_FLPARENT) + dst->o_parent_oid = src->o_parent_oid; + if (valid & OBD_MD_FLHANDLE) + dst->o_handle = src->o_handle; + + dst->o_valid |= valid; +} +EXPORT_SYMBOL(obdo_cpy_md); + +void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj) +{ + ioobj->ioo_oid = oa->o_oi; + if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) + ostid_set_seq_mdt0(&ioobj->ioo_oid); + + /* + * Since 2.4 this does not contain o_mode in the low 16 bits. 
+ * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs + */ + ioobj->ioo_max_brw = 0; +} +EXPORT_SYMBOL(obdo_to_ioobj); + +/* + * Create an obdo to send over the wire + */ +void lustre_set_wire_obdo(const struct obd_connect_data *ocd, + struct obdo *wobdo, + const struct obdo *lobdo) +{ + *wobdo = *lobdo; + if (ocd == NULL) + return; + + if (!(wobdo->o_valid & OBD_MD_FLUID)) + wobdo->o_uid = from_kuid(&init_user_ns, current_uid()); + if (!(wobdo->o_valid & OBD_MD_FLGID)) + wobdo->o_gid = from_kgid(&init_user_ns, current_gid()); + + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && + fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) { + /* + * Currently OBD_FL_OSTID will only be used when 2.4 echo + * client communicates with pre-2.4 server + */ + wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid); + wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid); + } +} +EXPORT_SYMBOL(lustre_set_wire_obdo); + +/* + * Create a local obdo from a wire based obdo + */ +void lustre_get_wire_obdo(const struct obd_connect_data *ocd, + struct obdo *lobdo, + const struct obdo *wobdo) +{ + *lobdo = *wobdo; + if (ocd == NULL) + return; + + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && + fid_seq_is_echo(wobdo->o_oi.oi.oi_seq)) { + /* see above */ + lobdo->o_oi.oi_fid.f_seq = wobdo->o_oi.oi.oi_seq; + lobdo->o_oi.oi_fid.f_oid = wobdo->o_oi.oi.oi_id; + lobdo->o_oi.oi_fid.f_ver = 0; + } +} +EXPORT_SYMBOL(lustre_get_wire_obdo); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c new file mode 100644 index 0000000000000..0e546c4815467 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c @@ -0,0 +1,163 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation.
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/linux/linux-obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include /* for PAGE_SIZE */ +#include + +/*FIXME: Just copy from obdo_from_inode*/ +void obdo_from_la(struct obdo *dst, const struct lu_attr *la, u64 valid) +{ + u64 newvalid = 0; + + if (valid & LA_ATIME) { + dst->o_atime = la->la_atime; + newvalid |= OBD_MD_FLATIME; + } + if (valid & LA_MTIME) { + dst->o_mtime = la->la_mtime; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & LA_CTIME) { + dst->o_ctime = la->la_ctime; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & LA_SIZE) { + dst->o_size = la->la_size; + newvalid |= OBD_MD_FLSIZE; + } + if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = la->la_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & LA_TYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (la->la_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & LA_MODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (la->la_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & LA_UID) { + dst->o_uid = la->la_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & LA_GID) { + dst->o_gid = la->la_gid; + newvalid |= OBD_MD_FLGID; + } + if (valid & LA_PROJID) { + dst->o_projid = la->la_projid; + newvalid |= OBD_MD_FLPROJID; + } + if (valid & LA_FLAGS) { + dst->o_flags = la->la_flags; + newvalid |= OBD_MD_FLFLAGS; + } + if (valid & LA_NLINK) { + dst->o_nlink = la->la_nlink; + newvalid |= OBD_MD_FLNLINK; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_la); + +/*FIXME: Just copy from obdo_from_inode*/ +void la_from_obdo(struct lu_attr *dst, const struct obdo *obdo, u64 valid) +{ + u64 newvalid = 0; + + valid &= obdo->o_valid; + + if (valid & OBD_MD_FLATIME) { + dst->la_atime = obdo->o_atime; + newvalid |= LA_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->la_mtime = obdo->o_mtime; + newvalid |= LA_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->la_ctime = obdo->o_ctime; + newvalid |= LA_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->la_size = obdo->o_size; + newvalid |= LA_SIZE; + } + if (valid & OBD_MD_FLBLOCKS) { + dst->la_blocks = obdo->o_blocks; + newvalid |= LA_BLOCKS; + } + if (valid & OBD_MD_FLTYPE) { + dst->la_mode = (dst->la_mode & S_IALLUGO) | + (obdo->o_mode & S_IFMT); + newvalid |= LA_TYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->la_mode = (dst->la_mode & S_IFMT) | + (obdo->o_mode & S_IALLUGO); + newvalid |= LA_MODE; + } + if (valid & OBD_MD_FLUID) { + dst->la_uid = obdo->o_uid; + newvalid |= LA_UID; + } + if (valid & OBD_MD_FLGID) { + dst->la_gid = obdo->o_gid; + newvalid |= LA_GID; + } + if (valid & OBD_MD_FLPROJID) { + dst->la_projid = obdo->o_projid; + newvalid |= LA_PROJID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->la_flags = obdo->o_flags; + newvalid |= LA_FLAGS; + } + if (valid & OBD_MD_FLNLINK) { + dst->la_nlink = obdo->o_nlink; + newvalid |= LA_NLINK; + } + dst->la_valid = newvalid; +} +EXPORT_SYMBOL(la_from_obdo); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/range_lock.c b/drivers/staging/lustrefsx/lustre/obdclass/range_lock.c new file mode 100644 index 0000000000000..ae3ed9ab0c975 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/range_lock.c @@ -0,0 +1,179 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Range lock is used to allow multiple threads writing a single shared + * file given each thread is writing to a non-overlapping portion of the + * file. + * + * Refer to the possible upstream kernel version of range lock by + * Jan Kara : https://lkml.org/lkml/2013/1/31/480 + * + * This file could later be replaced by the upstream kernel version. + */ +/* + * Author: Prakash Surya + * Author: Bobi Jam + */ +#ifdef HAVE_SCHED_HEADERS +#include +#endif +#include +#include +#include + +#define START(node) ((node)->rl_start) +#define LAST(node) ((node)->rl_end) + +INTERVAL_TREE_DEFINE(struct range_lock, rl_rb, __u64, rl_subtree_last, + START, LAST, static, range_lock) + +/** + * Initialize a range lock tree + * + * \param tree [in] an empty range lock tree + * + * Pre: Caller should have allocated the range lock tree. + * Post: The range lock tree is ready to function. + */ +void range_lock_tree_init(struct range_lock_tree *tree) +{ + tree->rlt_root = INTERVAL_TREE_ROOT; + tree->rlt_sequence = 0; + spin_lock_init(&tree->rlt_lock); +} +EXPORT_SYMBOL(range_lock_tree_init); + +/** + * Initialize a range lock node + * + * \param lock [in] an empty range lock node + * \param start [in] start of the covering region + * \param end [in] end of the covering region + * + * Pre: Caller should have allocated the range lock node. + * Post: The range lock node is meant to cover [start, end] region + */ +void range_lock_init(struct range_lock *lock, __u64 start, __u64 end) +{ + start >>= PAGE_SHIFT; + if (end != LUSTRE_EOF) + end >>= PAGE_SHIFT; + lock->rl_start = start; + lock->rl_end = end; + + lock->rl_task = NULL; + lock->rl_blocking_ranges = 0; + lock->rl_sequence = 0; +} +EXPORT_SYMBOL(range_lock_init); + +/** + * Unlock a range lock, wake up locks blocked by this lock. + * + * \param tree [in] range lock tree + * \param lock [in] range lock to be deleted + * + * If this lock has been granted, release it; if not, just delete it from + * the tree or the same region lock list. Wake up only those locks blocked + * by this lock.
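+ *
+ * A range_unlock() call pairs with an earlier range_lock(). A typical
+ * caller-side pattern looks roughly like the sketch below (the tree,
+ * pos and count variables are assumptions for illustration, not code
+ * taken from this file):
+ *
+ *     struct range_lock range;
+ *
+ *     range_lock_init(&range, pos, pos + count - 1);
+ *     rc = range_lock(&tree, &range);
+ *     if (rc < 0)
+ *             return rc;
+ *     ... write the non-overlapping [pos, pos + count - 1] region ...
+ *     range_unlock(&tree, &range);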
+ */ +void range_unlock(struct range_lock_tree *tree, struct range_lock *lock) +{ + struct range_lock *overlap; + ENTRY; + + spin_lock(&tree->rlt_lock); + + range_lock_remove(lock, &tree->rlt_root); + + for (overlap = range_lock_iter_first(&tree->rlt_root, + lock->rl_start, + lock->rl_end); + overlap; + overlap = range_lock_iter_next(overlap, + lock->rl_start, + lock->rl_end)) + if (overlap->rl_sequence > lock->rl_sequence) { + --overlap->rl_blocking_ranges; + if (overlap->rl_blocking_ranges == 0) + wake_up_process(overlap->rl_task); + } + + spin_unlock(&tree->rlt_lock); + + EXIT; +} +EXPORT_SYMBOL(range_unlock); + +/** + * Lock a region + * + * \param tree [in] range lock tree + * \param lock [in] range lock node containing the region span + * + * \retval 0 the range lock was acquired + * \retval <0 error code if the range lock was not acquired + * + * If there exists an overlapping range lock, the new lock will wait and + * retry; if it later finds that it is not the one chosen to wake up, + * it waits again. + */ +int range_lock(struct range_lock_tree *tree, struct range_lock *lock) +{ + struct range_lock *overlap; + int rc = 0; + ENTRY; + + spin_lock(&tree->rlt_lock); + /* + * We need to check for all conflicting intervals + * already in the tree. + */ + for (overlap = range_lock_iter_first(&tree->rlt_root, + lock->rl_start, + lock->rl_end); + overlap; + overlap = range_lock_iter_next(overlap, + lock->rl_start, + lock->rl_end)) + lock->rl_blocking_ranges += 1; + + range_lock_insert(lock, &tree->rlt_root); + lock->rl_sequence = ++tree->rlt_sequence; + + while (lock->rl_blocking_ranges > 0) { + lock->rl_task = current; + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock(&tree->rlt_lock); + schedule(); + + if (signal_pending(current)) { + range_unlock(tree, lock); + GOTO(out, rc = -ERESTARTSYS); + } + spin_lock(&tree->rlt_lock); + } + spin_unlock(&tree->rlt_lock); +out: + RETURN(rc); +} +EXPORT_SYMBOL(range_lock); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/scrub.c b/drivers/staging/lustrefsx/lustre/obdclass/scrub.c new file mode 100644 index 0000000000000..89c3f752dad22 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/scrub.c @@ -0,0 +1,1356 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + */ +/* + * lustre/obdclass/scrub.c + * + * The OI scrub is used for checking and (re)building Object Index files + * that are usually backend-specific. Here are some general scrub-related + * functions that can be shared by different backends for OI scrub.
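+ *
+ * Rough calling sequence for a backend, as an illustrative summary of
+ * the helpers exported below (not a verbatim description of any one
+ * caller): scrub_file_init() or scrub_file_load() prepares the
+ * in-memory copy of the on-disk state, scrub_start() spawns the
+ * iteration thread, that thread calls scrub_thread_prep(), then
+ * scrub_checkpoint() periodically while scanning, and finally
+ * scrub_thread_post(); scrub_stop() terminates it.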
+ * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_LFSCK + +#include +#include +#include +#include + +static inline struct dt_device *scrub_obj2dev(struct dt_object *obj) +{ + return container_of_safe(obj->do_lu.lo_dev, struct dt_device, + dd_lu_dev); +} + +static void scrub_file_to_cpu(struct scrub_file *des, struct scrub_file *src) +{ + uuid_copy(&des->sf_uuid, &src->sf_uuid); + des->sf_flags = le64_to_cpu(src->sf_flags); + des->sf_magic = le32_to_cpu(src->sf_magic); + des->sf_status = le16_to_cpu(src->sf_status); + des->sf_param = le16_to_cpu(src->sf_param); + des->sf_time_last_complete = + le64_to_cpu(src->sf_time_last_complete); + des->sf_time_latest_start = + le64_to_cpu(src->sf_time_latest_start); + des->sf_time_last_checkpoint = + le64_to_cpu(src->sf_time_last_checkpoint); + des->sf_pos_latest_start = + le64_to_cpu(src->sf_pos_latest_start); + des->sf_pos_last_checkpoint = + le64_to_cpu(src->sf_pos_last_checkpoint); + des->sf_pos_first_inconsistent = + le64_to_cpu(src->sf_pos_first_inconsistent); + des->sf_items_checked = + le64_to_cpu(src->sf_items_checked); + des->sf_items_updated = + le64_to_cpu(src->sf_items_updated); + des->sf_items_failed = + le64_to_cpu(src->sf_items_failed); + des->sf_items_updated_prior = + le64_to_cpu(src->sf_items_updated_prior); + des->sf_run_time = le32_to_cpu(src->sf_run_time); + des->sf_success_count = le32_to_cpu(src->sf_success_count); + des->sf_oi_count = le16_to_cpu(src->sf_oi_count); + des->sf_internal_flags = le16_to_cpu(src->sf_internal_flags); + memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE); +} + +static void scrub_file_to_le(struct scrub_file *des, struct scrub_file *src) +{ + uuid_copy(&des->sf_uuid, &src->sf_uuid); + des->sf_flags = cpu_to_le64(src->sf_flags); + des->sf_magic = cpu_to_le32(src->sf_magic); + des->sf_status = cpu_to_le16(src->sf_status); + des->sf_param = cpu_to_le16(src->sf_param); + des->sf_time_last_complete = + cpu_to_le64(src->sf_time_last_complete); + des->sf_time_latest_start = + cpu_to_le64(src->sf_time_latest_start); + des->sf_time_last_checkpoint = + cpu_to_le64(src->sf_time_last_checkpoint); + des->sf_pos_latest_start = + cpu_to_le64(src->sf_pos_latest_start); + des->sf_pos_last_checkpoint = + cpu_to_le64(src->sf_pos_last_checkpoint); + des->sf_pos_first_inconsistent = + cpu_to_le64(src->sf_pos_first_inconsistent); + des->sf_items_checked = + cpu_to_le64(src->sf_items_checked); + des->sf_items_updated = + cpu_to_le64(src->sf_items_updated); + des->sf_items_failed = + cpu_to_le64(src->sf_items_failed); + des->sf_items_updated_prior = + cpu_to_le64(src->sf_items_updated_prior); + des->sf_run_time = cpu_to_le32(src->sf_run_time); + des->sf_success_count = cpu_to_le32(src->sf_success_count); + des->sf_oi_count = cpu_to_le16(src->sf_oi_count); + des->sf_internal_flags = cpu_to_le16(src->sf_internal_flags); + memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE); +} + +void scrub_file_init(struct lustre_scrub *scrub, uuid_t uuid) +{ + struct scrub_file *sf = &scrub->os_file; + + memset(sf, 0, sizeof(*sf)); + uuid_copy(&sf->sf_uuid, &uuid); + sf->sf_magic = SCRUB_MAGIC_V2; + sf->sf_status = SS_INIT; +} +EXPORT_SYMBOL(scrub_file_init); + +void scrub_file_reset(struct lustre_scrub *scrub, uuid_t uuid, u64 flags) +{ + struct scrub_file *sf = &scrub->os_file; + + CDEBUG(D_LFSCK, "%s: reset OI scrub file, old flags = " + "%#llx, add flags = %#llx\n", + scrub->os_name, sf->sf_flags, flags); + + uuid_copy(&sf->sf_uuid, &uuid); + sf->sf_magic = SCRUB_MAGIC_V2; + sf->sf_status = SS_INIT; 
+ sf->sf_flags |= flags; + sf->sf_flags &= ~SF_AUTO; + sf->sf_run_time = 0; + sf->sf_time_latest_start = 0; + sf->sf_time_last_checkpoint = 0; + sf->sf_pos_latest_start = 0; + sf->sf_pos_last_checkpoint = 0; + sf->sf_pos_first_inconsistent = 0; + sf->sf_items_checked = 0; + sf->sf_items_updated = 0; + sf->sf_items_failed = 0; + sf->sf_items_noscrub = 0; + sf->sf_items_igif = 0; + if (!scrub->os_in_join) + sf->sf_items_updated_prior = 0; +} +EXPORT_SYMBOL(scrub_file_reset); + +int scrub_file_load(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + struct lu_buf buf = { + .lb_buf = &scrub->os_file_disk, + .lb_len = sizeof(scrub->os_file_disk) + }; + loff_t pos = 0; + int rc; + + rc = dt_read(env, scrub->os_obj, &buf, &pos); + /* failure */ + if (rc < 0) { + CERROR("%s: fail to load scrub file: rc = %d\n", + scrub->os_name, rc); + return rc; + } + + /* empty */ + if (!rc) + return -ENOENT; + + /* corrupted */ + if (rc < buf.lb_len) { + CDEBUG(D_LFSCK, "%s: fail to load scrub file, " + "expected = %d: rc = %d\n", + scrub->os_name, (int)buf.lb_len, rc); + return -EFAULT; + } + + scrub_file_to_cpu(sf, &scrub->os_file_disk); + if (sf->sf_magic == SCRUB_MAGIC_V1) { + CWARN("%s: reset scrub OI count for format change (LU-16655)\n", + scrub->os_name); + sf->sf_oi_count = 0; + } else if (sf->sf_magic != SCRUB_MAGIC_V2) { + CDEBUG(D_LFSCK, "%s: invalid scrub magic %#x, should be %#x\n", + scrub->os_name, sf->sf_magic, SCRUB_MAGIC_V2); + return -EFAULT; + } + + return 0; +} +EXPORT_SYMBOL(scrub_file_load); + +int scrub_file_store(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file_disk; + struct dt_object *obj = scrub->os_obj; + struct dt_device *dev = scrub_obj2dev(obj); + struct lu_buf buf = { + .lb_buf = sf, + .lb_len = sizeof(*sf) + }; + struct thandle *th; + loff_t pos = 0; + int rc; + ENTRY; + + /* Skip store under rdonly mode. 
*/ + if (dev->dd_rdonly) + RETURN(0); + + scrub_file_to_le(sf, &scrub->os_file); + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(log, rc = PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &buf, pos, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &buf, &pos, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + +log: + if (rc) + CERROR("%s: store scrub file: rc = %d\n", + scrub->os_name, rc); + else + CDEBUG(D_LFSCK, "%s: store scrub file: rc = %d\n", + scrub->os_name, rc); + + scrub->os_time_last_checkpoint = ktime_get_seconds(); + scrub->os_time_next_checkpoint = scrub->os_time_last_checkpoint + + SCRUB_CHECKPOINT_INTERVAL; + return rc; +} +EXPORT_SYMBOL(scrub_file_store); + +bool scrub_needs_check(struct lustre_scrub *scrub, const struct lu_fid *fid, + u64 index) +{ + bool check = true; + + if (!fid_is_norm(fid) && !fid_is_igif(fid)) + check = false; + else if (scrub->os_running && scrub->os_pos_current > index) + check = false; + else if (scrub->os_auto_scrub_interval == AS_NEVER) + check = false; + else if (ktime_get_real_seconds() < + scrub->os_file.sf_time_last_complete + + scrub->os_auto_scrub_interval) + check = false; + + return check; +} +EXPORT_SYMBOL(scrub_needs_check); + +int scrub_checkpoint(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + time64_t now = ktime_get_seconds(); + int rc; + + if (likely(now < scrub->os_time_next_checkpoint || + scrub->os_new_checked == 0)) + return 0; + + CDEBUG(D_LFSCK, "%s: OI scrub checkpoint at pos %llu\n", + scrub->os_name, scrub->os_pos_current); + + down_write(&scrub->os_rwsem); + sf->sf_items_checked += scrub->os_new_checked; + scrub->os_new_checked = 0; + sf->sf_pos_last_checkpoint = scrub->os_pos_current; + sf->sf_time_last_checkpoint = ktime_get_real_seconds(); + sf->sf_run_time += now - scrub->os_time_last_checkpoint; + rc = scrub_file_store(env, scrub); + up_write(&scrub->os_rwsem); + + return rc; +} +EXPORT_SYMBOL(scrub_checkpoint); + +int scrub_thread_prep(const struct lu_env *env, struct lustre_scrub *scrub, + uuid_t uuid, u64 start) +{ + struct scrub_file *sf = &scrub->os_file; + u32 flags = scrub->os_start_flags; + bool drop_dryrun = false; + int rc; + + ENTRY; + CDEBUG(D_LFSCK, "%s: OI scrub prep, flags = 0x%x\n", + scrub->os_name, flags); + + down_write(&scrub->os_rwsem); + if (flags & SS_SET_FAILOUT) + sf->sf_param |= SP_FAILOUT; + else if (flags & SS_CLEAR_FAILOUT) + sf->sf_param &= ~SP_FAILOUT; + + if (flags & SS_SET_DRYRUN) { + sf->sf_param |= SP_DRYRUN; + } else if (flags & SS_CLEAR_DRYRUN && sf->sf_param & SP_DRYRUN) { + sf->sf_param &= ~SP_DRYRUN; + drop_dryrun = true; + } + + if (flags & SS_RESET) + scrub_file_reset(scrub, uuid, 0); + + spin_lock(&scrub->os_lock); + scrub->os_partial_scan = 0; + if (flags & SS_AUTO_FULL) { + scrub->os_full_speed = 1; + sf->sf_flags |= SF_AUTO; + } else if (flags & SS_AUTO_PARTIAL) { + scrub->os_full_speed = 0; + scrub->os_partial_scan = 1; + sf->sf_flags |= SF_AUTO; + } else if (sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT | + SF_UPGRADE)) { + scrub->os_full_speed = 1; + } else { + scrub->os_full_speed = 0; + } + + scrub->os_in_prior = 0; + scrub->os_waiting = 0; + scrub->os_paused = 0; + scrub->os_in_join = 0; + scrub->os_full_scrub = 0; + spin_unlock(&scrub->os_lock); + scrub->os_new_checked = 0; + if (drop_dryrun && sf->sf_pos_first_inconsistent != 0) + sf->sf_pos_latest_start = 
sf->sf_pos_first_inconsistent; + else if (sf->sf_pos_last_checkpoint != 0) + sf->sf_pos_latest_start = sf->sf_pos_last_checkpoint + 1; + else + sf->sf_pos_latest_start = start; + + scrub->os_pos_current = sf->sf_pos_latest_start; + sf->sf_status = SS_SCANNING; + sf->sf_time_latest_start = ktime_get_real_seconds(); + sf->sf_time_last_checkpoint = sf->sf_time_latest_start; + sf->sf_pos_last_checkpoint = sf->sf_pos_latest_start - 1; + rc = scrub_file_store(env, scrub); + if (rc == 0) { + spin_lock(&scrub->os_lock); + scrub->os_running = 1; + spin_unlock(&scrub->os_lock); + wake_up_var(scrub); + } + up_write(&scrub->os_rwsem); + + RETURN(rc); +} +EXPORT_SYMBOL(scrub_thread_prep); + +int scrub_thread_post(const struct lu_env *env, struct lustre_scrub *scrub, + int result) +{ + struct scrub_file *sf = &scrub->os_file; + int rc; + ENTRY; + + CDEBUG(D_LFSCK, "%s: OI scrub post with result = %d\n", + scrub->os_name, result); + + down_write(&scrub->os_rwsem); + spin_lock(&scrub->os_lock); + scrub->os_running = 0; + spin_unlock(&scrub->os_lock); + if (scrub->os_new_checked > 0) { + sf->sf_items_checked += scrub->os_new_checked; + scrub->os_new_checked = 0; + sf->sf_pos_last_checkpoint = scrub->os_pos_current; + } + sf->sf_time_last_checkpoint = ktime_get_real_seconds(); + if (result > 0) { + sf->sf_status = SS_COMPLETED; + if (!(sf->sf_param & SP_DRYRUN)) { + memset(sf->sf_oi_bitmap, 0, SCRUB_OI_BITMAP_SIZE); + sf->sf_flags &= ~(SF_RECREATED | SF_INCONSISTENT | + SF_UPGRADE | SF_AUTO); + } + sf->sf_time_last_complete = sf->sf_time_last_checkpoint; + sf->sf_success_count++; + } else if (result == 0) { + if (scrub->os_paused) + sf->sf_status = SS_PAUSED; + else + sf->sf_status = SS_STOPPED; + } else { + sf->sf_status = SS_FAILED; + } + sf->sf_run_time += ktime_get_seconds() - + scrub->os_time_last_checkpoint; + + rc = scrub_file_store(env, scrub); + up_write(&scrub->os_rwsem); + + RETURN(rc < 0 ? 
rc : result); +} +EXPORT_SYMBOL(scrub_thread_post); + +int scrub_start(int (*threadfn)(void *data), struct lustre_scrub *scrub, + void *data, __u32 flags) +{ + struct task_struct *task; + int rc; + ENTRY; + + if (scrub->os_task) + RETURN(-EALREADY); + + if (scrub->os_file.sf_status == SS_COMPLETED) { + if (!(flags & SS_SET_FAILOUT)) + flags |= SS_CLEAR_FAILOUT; + + if (!(flags & SS_SET_DRYRUN)) + flags |= SS_CLEAR_DRYRUN; + + flags |= SS_RESET; + } + + task = kthread_create(threadfn, data, "OI_scrub"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start iteration thread: rc = %d\n", + scrub->os_name, rc); + RETURN(rc); + } + spin_lock(&scrub->os_lock); + if (scrub->os_task) { + /* Lost a race */ + spin_unlock(&scrub->os_lock); + kthread_stop(task); + RETURN(-EALREADY); + } + scrub->os_start_flags = flags; + scrub->os_task = task; + wake_up_process(task); + spin_unlock(&scrub->os_lock); + wait_var_event(scrub, scrub->os_running || !scrub->os_task); + + RETURN(0); +} +EXPORT_SYMBOL(scrub_start); + +void scrub_stop(struct lustre_scrub *scrub) +{ + struct task_struct *task; + + spin_lock(&scrub->os_lock); + scrub->os_running = 0; + spin_unlock(&scrub->os_lock); + task = xchg(&scrub->os_task, NULL); + if (task) + kthread_stop(task); +} +EXPORT_SYMBOL(scrub_stop); + +const char *const scrub_status_names[] = { + "init", + "scanning", + "completed", + "failed", + "stopped", + "paused", + "crashed", + NULL +}; + +const char *const scrub_flags_names[] = { + "recreated", + "inconsistent", + "auto", + "upgrade", + NULL +}; + +const char *const scrub_param_names[] = { + "failout", + "dryrun", + NULL +}; + +static void scrub_bits_dump(struct seq_file *m, int bits, + const char *const names[], + const char *prefix) +{ + int flag; + int i; + + seq_printf(m, "%s:%c", prefix, bits != 0 ? ' ' : '\n'); + + for (i = 0, flag = 1; bits != 0; i++, flag = BIT(i)) { + if (flag & bits) { + bits &= ~flag; + seq_printf(m, "%s%c", names[i], + bits != 0 ? 
',' : '\n'); + } + } +} + +static void scrub_time_dump(struct seq_file *m, time64_t time, + const char *prefix) +{ + if (time != 0) + seq_printf(m, "%s: %llu seconds\n", prefix, + ktime_get_real_seconds() - time); + else + seq_printf(m, "%s: N/A\n", prefix); +} + +static void scrub_pos_dump(struct seq_file *m, __u64 pos, const char *prefix) +{ + if (pos != 0) + seq_printf(m, "%s: %llu\n", prefix, pos); + else + seq_printf(m, "%s: N/A\n", prefix); +} + +void scrub_dump(struct seq_file *m, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + u64 checked; + s64 speed; + + down_read(&scrub->os_rwsem); + seq_printf(m, "name: OI_scrub\n" + "magic: 0x%x\n" + "oi_files: %d\n" + "status: %s\n", + sf->sf_magic, (int)sf->sf_oi_count, + scrub_status_names[sf->sf_status]); + + scrub_bits_dump(m, sf->sf_flags, scrub_flags_names, "flags"); + + scrub_bits_dump(m, sf->sf_param, scrub_param_names, "param"); + + scrub_time_dump(m, sf->sf_time_last_complete, + "time_since_last_completed"); + + scrub_time_dump(m, sf->sf_time_latest_start, + "time_since_latest_start"); + + scrub_time_dump(m, sf->sf_time_last_checkpoint, + "time_since_last_checkpoint"); + + scrub_pos_dump(m, sf->sf_pos_latest_start, + "latest_start_position"); + + scrub_pos_dump(m, sf->sf_pos_last_checkpoint, + "last_checkpoint_position"); + + scrub_pos_dump(m, sf->sf_pos_first_inconsistent, + "first_failure_position"); + + checked = sf->sf_items_checked + scrub->os_new_checked; + seq_printf(m, "checked: %llu\n" + "%s: %llu\n" + "failed: %llu\n" + "prior_%s: %llu\n" + "noscrub: %llu\n" + "igif: %llu\n" + "success_count: %u\n", + checked, + sf->sf_param & SP_DRYRUN ? "inconsistent" : "updated", + sf->sf_items_updated, sf->sf_items_failed, + sf->sf_param & SP_DRYRUN ? "inconsistent" : "updated", + sf->sf_items_updated_prior, sf->sf_items_noscrub, + sf->sf_items_igif, sf->sf_success_count); + + speed = checked; + if (scrub->os_running) { + s64 new_checked = scrub->os_new_checked; + time64_t duration; + time64_t rtime; + + /* Since the time resolution is in seconds, for a new system + * or small devices it is more likely that duration will be + * zero, which will lead to inaccurate results. + */ + duration = ktime_get_seconds() - + scrub->os_time_last_checkpoint; + if (duration != 0) + new_checked = div_s64(new_checked, duration); + + rtime = sf->sf_run_time + duration; + if (rtime != 0) + speed = div_s64(speed, rtime); + + seq_printf(m, "run_time: %lld seconds\n" + "average_speed: %lld objects/sec\n" + "real_time_speed: %lld objects/sec\n" + "current_position: %llu\n" + "scrub_in_prior: %s\n" + "scrub_full_speed: %s\n" + "partial_scan: %s\n", + rtime, speed, new_checked, + scrub->os_pos_current, + scrub->os_in_prior ? "yes" : "no", + scrub->os_full_speed ? "yes" : "no", + scrub->os_partial_scan ?
"yes" : "no"); + } else { + if (sf->sf_run_time != 0) + speed = div_s64(speed, sf->sf_run_time); + seq_printf(m, "run_time: %d seconds\n" + "average_speed: %lld objects/sec\n" + "real_time_speed: N/A\n" + "current_position: N/A\n", + sf->sf_run_time, speed); + } + + up_read(&scrub->os_rwsem); +} +EXPORT_SYMBOL(scrub_dump); + +int lustre_liru_new(struct list_head *head, const struct lu_fid *pfid, + const struct lu_fid *cfid, __u64 child, + const char *name, int namelen) +{ + struct lustre_index_restore_unit *liru; + int len = sizeof(*liru) + namelen + 1; + + OBD_ALLOC(liru, len); + if (!liru) + return -ENOMEM; + + INIT_LIST_HEAD(&liru->liru_link); + liru->liru_pfid = *pfid; + liru->liru_cfid = *cfid; + liru->liru_clid = child; + liru->liru_len = len; + memcpy(liru->liru_name, name, namelen); + liru->liru_name[namelen] = 0; + list_add_tail(&liru->liru_link, head); + + return 0; +} +EXPORT_SYMBOL(lustre_liru_new); + +int lustre_index_register(struct dt_device *dev, const char *devname, + struct list_head *head, spinlock_t *lock, int *guard, + const struct lu_fid *fid, + __u32 keysize, __u32 recsize) +{ + struct lustre_index_backup_unit *libu, *pos; + int rc = 0; + ENTRY; + + if (dev->dd_rdonly || *guard) + RETURN(1); + + OBD_ALLOC_PTR(libu); + if (!libu) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&libu->libu_link); + libu->libu_keysize = keysize; + libu->libu_recsize = recsize; + libu->libu_fid = *fid; + + spin_lock(lock); + if (unlikely(*guard)) { + spin_unlock(lock); + OBD_FREE_PTR(libu); + + RETURN(1); + } + + list_for_each_entry_reverse(pos, head, libu_link) { + rc = lu_fid_cmp(&pos->libu_fid, fid); + if (rc < 0) { + list_add(&libu->libu_link, &pos->libu_link); + spin_unlock(lock); + + RETURN(0); + } + + if (!rc) { + /* Already registered. But if the formerly registered one + * has a different keysize/recsize, that may be because the + * former values came from disk and are corrupted, so + * replace them with the new values. */ + if (unlikely(keysize != pos->libu_keysize || + recsize != pos->libu_recsize)) { + CWARN("%s: the index "DFID" has registered " + "with %u/%u, may be invalid, replace " + "with %u/%u\n", + devname, PFID(fid), pos->libu_keysize, + pos->libu_recsize, keysize, recsize); + + pos->libu_keysize = keysize; + pos->libu_recsize = recsize; + } else { + rc = 1; + } + + spin_unlock(lock); + OBD_FREE_PTR(libu); + + RETURN(rc); + } + } + + list_add(&libu->libu_link, head); + spin_unlock(lock); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_index_register); + +static void lustre_index_degister(struct list_head *head, spinlock_t *lock, + const struct lu_fid *fid) +{ + struct lustre_index_backup_unit *libu; + int rc = -ENOENT; + + spin_lock(lock); + list_for_each_entry_reverse(libu, head, libu_link) { + rc = lu_fid_cmp(&libu->libu_fid, fid); + /* NOT registered.
*/ + if (rc < 0) + break; + + if (!rc) { + list_del(&libu->libu_link); + break; + } + } + spin_unlock(lock); + + if (!rc) + OBD_FREE_PTR(libu); +} + +static void +lustre_index_backup_make_header(struct lustre_index_backup_header *header, + __u32 keysize, __u32 recsize, + const struct lu_fid *fid, __u32 count) +{ + memset(header, 0, sizeof(*header)); + header->libh_magic = cpu_to_le32(INDEX_BACKUP_MAGIC_V1); + header->libh_count = cpu_to_le32(count); + header->libh_keysize = cpu_to_le32(keysize); + header->libh_recsize = cpu_to_le32(recsize); + fid_cpu_to_le(&header->libh_owner, fid); +} + +static int lustre_index_backup_body(const struct lu_env *env, + struct dt_object *obj, loff_t *pos, + void *buf, int bufsize) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct thandle *th; + struct lu_buf lbuf = { + .lb_buf = buf, + .lb_len = bufsize + }; + int rc; + ENTRY; + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &lbuf, *pos, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &lbuf, pos, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_backup_header(const struct lu_env *env, + struct dt_object *obj, + const struct lu_fid *tgt_fid, + __u32 keysize, __u32 recsize, + void *buf, int bufsize, int count) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct lustre_index_backup_header *header = buf; + struct lu_attr *la = buf; + struct thandle *th; + struct lu_buf lbuf = { + .lb_buf = header, + .lb_len = sizeof(*header) + }; + loff_t size = sizeof(*header) + (keysize + recsize) * count; + loff_t pos = 0; + int rc; + bool punch = false; + ENTRY; + + LASSERT(sizeof(*la) <= bufsize); + LASSERT(sizeof(*header) <= bufsize); + + rc = dt_attr_get(env, obj, la); + if (rc) + RETURN(rc); + + if (la->la_size > size) + punch = true; + + lustre_index_backup_make_header(header, keysize, recsize, + tgt_fid, count); + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &lbuf, pos, th); + if (rc) + GOTO(stop, rc); + + if (punch) { + rc = dt_declare_punch(env, obj, size, OBD_OBJECT_EOF, th); + if (rc) + GOTO(stop, rc); + } + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &lbuf, &pos, th); + if (!rc && punch) + rc = dt_punch(env, obj, size, OBD_OBJECT_EOF, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_update_lma(const struct lu_env *env, + struct dt_object *obj, + void *buf, int bufsize) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct lustre_mdt_attrs *lma = buf; + struct lu_buf lbuf = { + .lb_buf = lma, + .lb_len = sizeof(struct lustre_ost_attrs) + }; + struct thandle *th; + int fl = LU_XATTR_REPLACE; + int rc; + ENTRY; + + LASSERT(bufsize >= lbuf.lb_len); + + rc = dt_xattr_get(env, obj, &lbuf, XATTR_NAME_LMA); + if (unlikely(rc == -ENODATA)) { + fl = LU_XATTR_CREATE; + lustre_lma_init(lma, lu_object_fid(&obj->do_lu), + LMAC_IDX_BACKUP, 0); + rc = sizeof(*lma); + } else if (rc < sizeof(*lma)) { + RETURN(rc < 0 ? 
rc : -EFAULT); + } else { + lustre_lma_swab(lma); + if (lma->lma_compat & LMAC_IDX_BACKUP) + RETURN(0); + + lma->lma_compat |= LMAC_IDX_BACKUP; + } + + lustre_lma_swab(lma); + lbuf.lb_len = rc; + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(rc); + + rc = dt_declare_xattr_set(env, obj, &lbuf, XATTR_NAME_LMA, fl, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_xattr_set(env, obj, &lbuf, XATTR_NAME_LMA, fl, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_backup_one(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + struct lustre_index_backup_unit *libu, + char *buf, int bufsize) +{ + struct dt_device *dev = scrub_obj2dev(parent); + struct dt_object *tgt_obj = NULL; + struct dt_object *bak_obj = NULL; + const struct dt_it_ops *iops; + struct dt_it *di; + loff_t pos = sizeof(struct lustre_index_backup_header); + int count = 0; + int size = 0; + int rc; + ENTRY; + + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + &libu->libu_fid, NULL)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + if (!dt_object_exists(tgt_obj)) + GOTO(out, rc = 0); + + if (!tgt_obj->do_index_ops) { + struct dt_index_features feat; + + feat.dif_flags = DT_IND_UPDATE; + feat.dif_keysize_min = libu->libu_keysize; + feat.dif_keysize_max = libu->libu_keysize; + feat.dif_recsize_min = libu->libu_recsize; + feat.dif_recsize_max = libu->libu_recsize; + feat.dif_ptrsize = 4; + rc = tgt_obj->do_ops->do_index_try(env, tgt_obj, &feat); + if (rc) + GOTO(out, rc); + } + + lustre_fid2lbx(buf, &libu->libu_fid, bufsize); + bak_obj = local_file_find_or_create(env, los, parent, buf, + S_IFREG | S_IRUGO | S_IWUSR); + if (IS_ERR_OR_NULL(bak_obj)) + GOTO(out, rc = bak_obj ? 
PTR_ERR(bak_obj) : -ENOENT); + + iops = &tgt_obj->do_index_ops->dio_it; + di = iops->init(env, tgt_obj, 0); + if (IS_ERR(di)) + GOTO(out, rc = PTR_ERR(di)); + + rc = iops->load(env, di, 0); + if (!rc) + rc = iops->next(env, di); + else if (rc > 0) + rc = 0; + + while (!rc) { + void *key; + void *rec; + + key = iops->key(env, di); + memcpy(&buf[size], key, libu->libu_keysize); + size += libu->libu_keysize; + rec = &buf[size]; + rc = iops->rec(env, di, rec, 0); + if (rc) + GOTO(fini, rc); + + size += libu->libu_recsize; + count++; + if (size + libu->libu_keysize + libu->libu_recsize > bufsize) { + rc = lustre_index_backup_body(env, bak_obj, &pos, + buf, size); + if (rc) + GOTO(fini, rc); + + size = 0; + } + + rc = iops->next(env, di); + } + + if (rc >= 0 && size > 0) + rc = lustre_index_backup_body(env, bak_obj, &pos, buf, size); + + if (rc < 0) + GOTO(fini, rc); + + rc = lustre_index_backup_header(env, bak_obj, &libu->libu_fid, + libu->libu_keysize, libu->libu_recsize, + buf, bufsize, count); + if (!rc) + rc = lustre_index_update_lma(env, tgt_obj, buf, bufsize); + + if (!rc && OBD_FAIL_CHECK(OBD_FAIL_OSD_INDEX_CRASH)) { + LASSERT(bufsize >= 512); + + pos = 0; + memset(buf, 0, 512); + lustre_index_backup_body(env, tgt_obj, &pos, buf, 512); + } + + GOTO(fini, rc); + +fini: + iops->fini(env, di); +out: + if (!IS_ERR_OR_NULL(tgt_obj)) + dt_object_put_nocache(env, tgt_obj); + if (!IS_ERR_OR_NULL(bak_obj)) + dt_object_put_nocache(env, bak_obj); + return rc; +} + +void lustre_index_backup(const struct lu_env *env, struct dt_device *dev, + const char *devname, struct list_head *head, + spinlock_t *lock, int *guard, bool backup) +{ + struct lustre_index_backup_unit *libu; + struct local_oid_storage *los = NULL; + struct dt_object *parent = NULL; + char *buf = NULL; + struct lu_fid fid; + int rc; + ENTRY; + + if (dev->dd_rdonly || *guard) + RETURN_EXIT; + + spin_lock(lock); + *guard = 1; + spin_unlock(lock); + + if (list_empty(head)) + RETURN_EXIT; + + /* Handle kinds of failures during mount process. */ + if (!dev->dd_lu_dev.ld_site || !dev->dd_lu_dev.ld_site->ls_top_dev) + backup = false; + + if (backup) { + OBD_ALLOC_LARGE(buf, INDEX_BACKUP_BUFSIZE); + if (!buf) { + backup = false; + goto scan; + } + + lu_local_obj_fid(&fid, INDEX_BACKUP_OID); + parent = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + &fid, NULL)); + if (IS_ERR_OR_NULL(parent)) { + CERROR("%s: failed to locate backup dir: rc = %ld\n", + devname, parent ? 
PTR_ERR(parent) : -ENOENT); + backup = false; + goto scan; + } + + lu_local_name_obj_fid(&fid, 1); + rc = local_oid_storage_init(env, dev, &fid, &los); + if (rc) { + CERROR("%s: failed to init local storage: rc = %d\n", + devname, rc); + backup = false; + } + } + +scan: + spin_lock(lock); + while (!list_empty(head)) { + libu = list_entry(head->next, + struct lustre_index_backup_unit, libu_link); + list_del_init(&libu->libu_link); + spin_unlock(lock); + + if (backup) { + rc = lustre_index_backup_one(env, los, parent, libu, + buf, INDEX_BACKUP_BUFSIZE); + CDEBUG(D_WARNING, "%s: backup index "DFID": rc = %d\n", + devname, PFID(&libu->libu_fid), rc); + } + + OBD_FREE_PTR(libu); + spin_lock(lock); + } + spin_unlock(lock); + + if (los) + local_oid_storage_fini(env, los); + if (parent) + dt_object_put_nocache(env, parent); + if (buf) + OBD_FREE_LARGE(buf, INDEX_BACKUP_BUFSIZE); + + EXIT; +} +EXPORT_SYMBOL(lustre_index_backup); + +int lustre_index_restore(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *parent_fid, + const struct lu_fid *tgt_fid, + const struct lu_fid *bak_fid, const char *name, + struct list_head *head, spinlock_t *lock, + char *buf, int bufsize) +{ + struct dt_object *parent_obj = NULL; + struct dt_object *tgt_obj = NULL; + struct dt_object *bak_obj = NULL; + struct lustre_index_backup_header *header; + struct dt_index_features *feat; + struct dt_object_format *dof; + struct lu_attr *la; + struct thandle *th; + struct lu_object_conf conf; + struct dt_insert_rec ent; + struct lu_buf lbuf; + struct lu_fid tfid; + loff_t pos = 0; + __u32 keysize; + __u32 recsize; + __u32 pairsize; + int count; + int rc; + bool registered = false; + ENTRY; + + LASSERT(bufsize >= sizeof(*la) + sizeof(*dof) + + sizeof(*feat) + sizeof(*header)); + + memset(buf, 0, bufsize); + la = (struct lu_attr *)buf; + dof = (void *)la + sizeof(*la); + feat = (void *)dof + sizeof(*dof); + header = (void *)feat + sizeof(*feat); + lbuf.lb_buf = header; + lbuf.lb_len = sizeof(*header); + + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + tgt_fid, NULL)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + bak_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + bak_fid, NULL)); + if (IS_ERR_OR_NULL(bak_obj)) + GOTO(out, rc = bak_obj ? PTR_ERR(bak_obj) : -ENOENT); + + if (!dt_object_exists(bak_obj)) + GOTO(out, rc = -ENOENT); + + parent_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + parent_fid, NULL)); + if (IS_ERR_OR_NULL(parent_obj)) + GOTO(out, rc = parent_obj ? PTR_ERR(parent_obj) : -ENOENT); + + LASSERT(dt_object_exists(parent_obj)); + + if (unlikely(!dt_try_as_dir(env, parent_obj))) + GOTO(out, rc = -ENOTDIR); + + rc = dt_attr_get(env, tgt_obj, la); + if (rc) + GOTO(out, rc); + + rc = dt_record_read(env, bak_obj, &lbuf, &pos); + if (rc) + GOTO(out, rc); + + if (le32_to_cpu(header->libh_magic) != INDEX_BACKUP_MAGIC_V1) + GOTO(out, rc = -EINVAL); + + fid_le_to_cpu(&tfid, &header->libh_owner); + if (unlikely(!lu_fid_eq(tgt_fid, &tfid))) + GOTO(out, rc = -EINVAL); + + keysize = le32_to_cpu(header->libh_keysize); + recsize = le32_to_cpu(header->libh_recsize); + pairsize = keysize + recsize; + + memset(feat, 0, sizeof(*feat)); + feat->dif_flags = DT_IND_UPDATE; + feat->dif_keysize_min = feat->dif_keysize_max = keysize; + feat->dif_recsize_min = feat->dif_recsize_max = recsize; + feat->dif_ptrsize = 4; + + /* T1: remove old name entry and destroy old index. 
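+ * The full restore below runs as a sequence of local transactions
+ * (a descriptive summary of the code that follows): this T1 step
+ * deletes the old name entry and destroys the stale index object,
+ * T2 re-creates the index with the key/record sizes taken from the
+ * backup header and re-inserts the name entry, and each later Tn
+ * replays one batch of key/record pairs read back from the backup
+ * file.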
*/ + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_delete(env, parent_obj, + (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + rc = dt_declare_ref_del(env, tgt_obj, th); + if (rc) + GOTO(stop, rc); + + rc = dt_declare_destroy(env, tgt_obj, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_delete(env, parent_obj, (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + dt_write_lock(env, tgt_obj, 0); + rc = dt_ref_del(env, tgt_obj, th); + if (rc == 0) { + if (S_ISDIR(tgt_obj->do_lu.lo_header->loh_attr)) + dt_ref_del(env, tgt_obj, th); + rc = dt_destroy(env, tgt_obj, th); + } + dt_write_unlock(env, tgt_obj); + dt_trans_stop(env, dev, th); + if (rc) + GOTO(out, rc); + + la->la_valid = LA_MODE | LA_UID | LA_GID; + conf.loc_flags = LOC_F_NEW; + dof->u.dof_idx.di_feat = feat; + dof->dof_type = DFT_INDEX; + ent.rec_type = S_IFREG; + ent.rec_fid = tgt_fid; + + /* Drop cache before re-create it. */ + dt_object_put_nocache(env, tgt_obj); + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + tgt_fid, &conf)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + LASSERT(!dt_object_exists(tgt_obj)); + + /* T2: create new index and insert new name entry. */ + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_create(env, tgt_obj, la, NULL, dof, th); + if (rc) + GOTO(stop, rc); + + rc = dt_declare_insert(env, parent_obj, (const struct dt_rec *)&ent, + (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + dt_write_lock(env, tgt_obj, 0); + rc = dt_create(env, tgt_obj, la, NULL, dof, th); + dt_write_unlock(env, tgt_obj); + if (rc) + GOTO(stop, rc); + + rc = dt_insert(env, parent_obj, (const struct dt_rec *)&ent, + (const struct dt_key *)name, th); + dt_trans_stop(env, dev, th); + /* Some index name may has been inserted by OSD + * automatically when create the index object. */ + if (unlikely(rc == -EEXIST)) + rc = 0; + if (rc) + GOTO(out, rc); + + /* The new index will register via index_try. */ + rc = tgt_obj->do_ops->do_index_try(env, tgt_obj, feat); + if (rc) + GOTO(out, rc); + + registered = true; + count = le32_to_cpu(header->libh_count); + while (!rc && count > 0) { + int size = pairsize * count; + int items = count; + int i; + + if (size > bufsize) { + items = bufsize / pairsize; + size = pairsize * items; + } + + lbuf.lb_buf = buf; + lbuf.lb_len = size; + rc = dt_record_read(env, bak_obj, &lbuf, &pos); + for (i = 0; i < items && !rc; i++) { + void *key = &buf[i * pairsize]; + void *rec = &buf[i * pairsize + keysize]; + + /* Tn: restore the records. */ + th = dt_trans_create(env, dev); + if (!th) + GOTO(out, rc = -ENOMEM); + + rc = dt_declare_insert(env, tgt_obj, rec, key, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_insert(env, tgt_obj, rec, key, th); + if (unlikely(rc == -EEXIST)) + rc = 0; + + dt_trans_stop(env, dev, th); + } + + count -= items; + } + + GOTO(out, rc); + +stop: + dt_trans_stop(env, dev, th); + if (rc && registered) + /* Degister the index to avoid overwriting the backup. 
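+ * That is, drop the index from the per-device backup list so that a
+ * later lustre_index_backup() pass cannot replace the good backup file
+ * with the contents of this partially restored index.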
*/ + lustre_index_degister(head, lock, tgt_fid); + +out: + if (!IS_ERR_OR_NULL(tgt_obj)) + dt_object_put_nocache(env, tgt_obj); + if (!IS_ERR_OR_NULL(bak_obj)) + dt_object_put_nocache(env, bak_obj); + if (!IS_ERR_OR_NULL(parent_obj)) + dt_object_put_nocache(env, parent_obj); + return rc; +} +EXPORT_SYMBOL(lustre_index_restore); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c b/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c new file mode 100644 index 0000000000000..3f0c50e2c32cb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/statfs_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs) +{ + memset(osfs, 0, sizeof(*osfs)); + osfs->os_type = sfs->f_type; + osfs->os_blocks = sfs->f_blocks; + osfs->os_bfree = sfs->f_bfree; + osfs->os_bavail = sfs->f_bavail; + osfs->os_files = sfs->f_files; + osfs->os_ffree = sfs->f_ffree; + osfs->os_bsize = sfs->f_bsize; + osfs->os_namelen = sfs->f_namelen; +} +EXPORT_SYMBOL(statfs_pack); + +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs) +{ + memset(sfs, 0, sizeof(*sfs)); + sfs->f_type = osfs->os_type; + sfs->f_blocks = osfs->os_blocks; + sfs->f_bfree = osfs->os_bfree; + sfs->f_bavail = osfs->os_bavail; + sfs->f_files = osfs->os_files; + sfs->f_ffree = osfs->os_ffree; + sfs->f_bsize = osfs->os_bsize; + sfs->f_namelen = osfs->os_namelen; +} +EXPORT_SYMBOL(statfs_unpack); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c b/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c new file mode 100644 index 0000000000000..94a150b266a17 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c @@ -0,0 +1,454 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/upcall_cache.c + * + * Supplementary groups cache. + */ +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include + +static struct upcall_cache_entry *alloc_entry(struct upcall_cache *cache, + __u64 key, void *args) +{ + struct upcall_cache_entry *entry; + + LIBCFS_ALLOC(entry, sizeof(*entry)); + if (!entry) + return NULL; + + UC_CACHE_SET_NEW(entry); + INIT_LIST_HEAD(&entry->ue_hash); + entry->ue_key = key; + atomic_set(&entry->ue_refcount, 0); + init_waitqueue_head(&entry->ue_waitq); + if (cache->uc_ops->init_entry) + cache->uc_ops->init_entry(entry, args); + return entry; +} + +/* protected by cache lock */ +static void free_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + if (cache->uc_ops->free_entry) + cache->uc_ops->free_entry(cache, entry); + + list_del(&entry->ue_hash); + CDEBUG(D_OTHER, "destroy cache entry %p for key %llu\n", + entry, entry->ue_key); + LIBCFS_FREE(entry, sizeof(*entry)); +} + +static inline int upcall_compare(struct upcall_cache *cache, + struct upcall_cache_entry *entry, + __u64 key, void *args) +{ + if (entry->ue_key != key) + return -1; + + if (cache->uc_ops->upcall_compare) + return cache->uc_ops->upcall_compare(cache, entry, key, args); + + return 0; +} + +static inline int downcall_compare(struct upcall_cache *cache, + struct upcall_cache_entry *entry, + __u64 key, void *args) +{ + if (entry->ue_key != key) + return -1; + + if (cache->uc_ops->downcall_compare) + return cache->uc_ops->downcall_compare(cache, entry, key, args); + + return 0; +} + +static inline void get_entry(struct upcall_cache_entry *entry) +{ + atomic_inc(&entry->ue_refcount); +} + +static inline void put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + if (atomic_dec_and_test(&entry->ue_refcount) && + (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry))) { + free_entry(cache, entry); + } +} + +static int check_unlink_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + time64_t now = ktime_get_seconds(); + + if (UC_CACHE_IS_VALID(entry) && now < entry->ue_expire) + return 0; + + if (UC_CACHE_IS_ACQUIRING(entry)) { + if (entry->ue_acquire_expire == 0 || + now < entry->ue_acquire_expire) + return 0; + + UC_CACHE_SET_EXPIRED(entry); + wake_up(&entry->ue_waitq); + } else if (!UC_CACHE_IS_INVALID(entry)) { + UC_CACHE_SET_EXPIRED(entry); + } + + list_del_init(&entry->ue_hash); + if (!atomic_read(&entry->ue_refcount)) + free_entry(cache, entry); + return 1; +} + +static inline int refresh_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + LASSERT(cache->uc_ops->do_upcall); + return cache->uc_ops->do_upcall(cache, entry); +} + +struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache, + __u64 key, void *args) +{ + struct upcall_cache_entry *entry = NULL, *new = NULL, *next; + bool failedacquiring = false; + struct list_head *head; + wait_queue_entry_t wait; + int rc, found; + 
ENTRY; + + LASSERT(cache); + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; +find_again: + found = 0; + spin_lock(&cache->uc_lock); + list_for_each_entry_safe(entry, next, head, ue_hash) { + /* check invalid & expired items */ + if (check_unlink_entry(cache, entry)) + continue; + if (upcall_compare(cache, entry, key, args) == 0) { + found = 1; + break; + } + } + + if (!found) { + if (!new) { + spin_unlock(&cache->uc_lock); + new = alloc_entry(cache, key, args); + if (!new) { + CERROR("fail to alloc entry\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + goto find_again; + } else { + list_add(&new->ue_hash, head); + entry = new; + } + } else { + if (new) { + free_entry(cache, new); + new = NULL; + } + list_move(&entry->ue_hash, head); + } + get_entry(entry); + + /* acquire for new one */ + if (UC_CACHE_IS_NEW(entry)) { + UC_CACHE_SET_ACQUIRING(entry); + UC_CACHE_CLEAR_NEW(entry); + spin_unlock(&cache->uc_lock); + rc = refresh_entry(cache, entry); + spin_lock(&cache->uc_lock); + entry->ue_acquire_expire = ktime_get_seconds() + + cache->uc_acquire_expire; + if (rc < 0) { + UC_CACHE_CLEAR_ACQUIRING(entry); + UC_CACHE_SET_INVALID(entry); + wake_up(&entry->ue_waitq); + if (unlikely(rc == -EREMCHG)) { + put_entry(cache, entry); + GOTO(out, entry = ERR_PTR(rc)); + } + } + } + /* someone (and only one) is doing upcall upon this item, + * wait it to complete */ + if (UC_CACHE_IS_ACQUIRING(entry)) { + long expiry = (entry == new) ? + cfs_time_seconds(cache->uc_acquire_expire) : + MAX_SCHEDULE_TIMEOUT; + long left; + + init_wait(&wait); + add_wait_queue(&entry->ue_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock(&cache->uc_lock); + + left = schedule_timeout(expiry); + + spin_lock(&cache->uc_lock); + remove_wait_queue(&entry->ue_waitq, &wait); + if (UC_CACHE_IS_ACQUIRING(entry)) { + /* we're interrupted or upcall failed in the middle */ + rc = left > 0 ? -EINTR : -ETIMEDOUT; + CERROR("acquire for key %llu: error %d\n", + entry->ue_key, rc); + put_entry(cache, entry); + if (!failedacquiring) { + spin_unlock(&cache->uc_lock); + failedacquiring = true; + new = NULL; + goto find_again; + } + GOTO(out, entry = ERR_PTR(rc)); + } + } + + /* invalid means error, don't need to try again */ + if (UC_CACHE_IS_INVALID(entry)) { + put_entry(cache, entry); + GOTO(out, entry = ERR_PTR(-EIDRM)); + } + + /* check expired + * We can't refresh the existing one because some + * memory might be shared by multiple processes. + */ + if (check_unlink_entry(cache, entry)) { + /* if expired, try again. but if this entry is + * created by me but too quickly turn to expired + * without any error, should at least give a + * chance to use it once. 
+ */ + if (entry != new) { + put_entry(cache, entry); + spin_unlock(&cache->uc_lock); + new = NULL; + goto find_again; + } + } + + /* Now we know it's good */ +out: + spin_unlock(&cache->uc_lock); + RETURN(entry); +} +EXPORT_SYMBOL(upcall_cache_get_entry); + +void upcall_cache_put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + ENTRY; + + if (!entry) { + EXIT; + return; + } + + LASSERT(atomic_read(&entry->ue_refcount) > 0); + spin_lock(&cache->uc_lock); + put_entry(cache, entry); + spin_unlock(&cache->uc_lock); + EXIT; +} +EXPORT_SYMBOL(upcall_cache_put_entry); + +int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key, + void *args) +{ + struct upcall_cache_entry *entry = NULL; + struct list_head *head; + int found = 0, rc = 0; + ENTRY; + + LASSERT(cache); + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; + + spin_lock(&cache->uc_lock); + list_for_each_entry(entry, head, ue_hash) { + if (downcall_compare(cache, entry, key, args) == 0) { + found = 1; + get_entry(entry); + break; + } + } + + if (!found) { + CDEBUG(D_OTHER, "%s: upcall for key %llu not expected\n", + cache->uc_name, key); + /* haven't found, it's possible */ + spin_unlock(&cache->uc_lock); + RETURN(-EINVAL); + } + + if (err) { + CDEBUG(D_OTHER, "%s: upcall for key %llu returned %d\n", + cache->uc_name, entry->ue_key, err); + GOTO(out, rc = -EINVAL); + } + + if (!UC_CACHE_IS_ACQUIRING(entry)) { + CDEBUG(D_RPCTRACE, "%s: found uptodate entry %p (key %llu)" + "\n", cache->uc_name, entry, entry->ue_key); + GOTO(out, rc = 0); + } + + if (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry)) { + CERROR("%s: found a stale entry %p (key %llu) in ioctl\n", + cache->uc_name, entry, entry->ue_key); + GOTO(out, rc = -EINVAL); + } + + spin_unlock(&cache->uc_lock); + if (cache->uc_ops->parse_downcall) + rc = cache->uc_ops->parse_downcall(cache, entry, args); + spin_lock(&cache->uc_lock); + if (rc) + GOTO(out, rc); + + entry->ue_expire = ktime_get_seconds() + cache->uc_entry_expire; + UC_CACHE_SET_VALID(entry); + CDEBUG(D_OTHER, "%s: created upcall cache entry %p for key %llu\n", + cache->uc_name, entry, entry->ue_key); +out: + if (rc) { + UC_CACHE_SET_INVALID(entry); + list_del_init(&entry->ue_hash); + } + UC_CACHE_CLEAR_ACQUIRING(entry); + spin_unlock(&cache->uc_lock); + wake_up(&entry->ue_waitq); + put_entry(cache, entry); + + RETURN(rc); +} +EXPORT_SYMBOL(upcall_cache_downcall); + +void upcall_cache_flush(struct upcall_cache *cache, int force) +{ + struct upcall_cache_entry *entry, *next; + int i; + ENTRY; + + spin_lock(&cache->uc_lock); + for (i = 0; i < UC_CACHE_HASH_SIZE; i++) { + list_for_each_entry_safe(entry, next, + &cache->uc_hashtable[i], ue_hash) { + if (!force && atomic_read(&entry->ue_refcount)) { + UC_CACHE_SET_EXPIRED(entry); + continue; + } + LASSERT(!atomic_read(&entry->ue_refcount)); + free_entry(cache, entry); + } + } + spin_unlock(&cache->uc_lock); + EXIT; +} +EXPORT_SYMBOL(upcall_cache_flush); + +void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args) +{ + struct list_head *head; + struct upcall_cache_entry *entry; + int found = 0; + ENTRY; + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; + + spin_lock(&cache->uc_lock); + list_for_each_entry(entry, head, ue_hash) { + if (upcall_compare(cache, entry, key, args) == 0) { + found = 1; + break; + } + } + + if (found) { + CWARN("%s: flush entry %p: key %llu, ref %d, fl %x, " + "cur %lld, ex %lld/%lld\n", + cache->uc_name, entry, entry->ue_key, + atomic_read(&entry->ue_refcount), 
entry->ue_flags, + ktime_get_real_seconds(), entry->ue_acquire_expire, + entry->ue_expire); + UC_CACHE_SET_EXPIRED(entry); + if (!atomic_read(&entry->ue_refcount)) + free_entry(cache, entry); + } + spin_unlock(&cache->uc_lock); +} +EXPORT_SYMBOL(upcall_cache_flush_one); + +struct upcall_cache *upcall_cache_init(const char *name, const char *upcall, + struct upcall_cache_ops *ops) +{ + struct upcall_cache *cache; + int i; + ENTRY; + + LIBCFS_ALLOC(cache, sizeof(*cache)); + if (!cache) + RETURN(ERR_PTR(-ENOMEM)); + + spin_lock_init(&cache->uc_lock); + init_rwsem(&cache->uc_upcall_rwsem); + for (i = 0; i < UC_CACHE_HASH_SIZE; i++) + INIT_LIST_HEAD(&cache->uc_hashtable[i]); + strlcpy(cache->uc_name, name, sizeof(cache->uc_name)); + /* upcall pathname proc tunable */ + strlcpy(cache->uc_upcall, upcall, sizeof(cache->uc_upcall)); + cache->uc_entry_expire = 20 * 60; + cache->uc_acquire_expire = 30; + cache->uc_ops = ops; + + RETURN(cache); +} +EXPORT_SYMBOL(upcall_cache_init); + +void upcall_cache_cleanup(struct upcall_cache *cache) +{ + if (!cache) + return; + upcall_cache_flush_all(cache); + LIBCFS_FREE(cache, sizeof(*cache)); +} +EXPORT_SYMBOL(upcall_cache_cleanup); diff --git a/drivers/staging/lustrefsx/lustre/obdecho/Makefile b/drivers/staging/lustrefsx/lustre/obdecho/Makefile new file mode 100644 index 0000000000000..3a2ba7082c3f4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_FS) += obdecho.o + +obdecho-y := echo_client.o debug.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/obdecho/debug.c b/drivers/staging/lustrefsx/lustre/obdecho/debug.c new file mode 100644 index 0000000000000..3b9465e63636d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/debug.c @@ -0,0 +1,99 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/debug.c + * + * Helper routines for dumping data structs for debugging. 
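+ * block_debug_setup() stamps each block with its 64-bit offset and
+ * object id at both the start and the end of the block, and
+ * block_debug_check() verifies those markers to detect corruption.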
+ */ + +#define DEBUG_SUBSYSTEM D_OTHER + + +#include +#include "echo_internal.h" +#include + +#define LPDS sizeof(__u64) +int block_debug_setup(void *addr, int len, __u64 off, __u64 id) +{ + LASSERT(addr); + + off = cpu_to_le64 (off); + id = cpu_to_le64 (id); + memcpy(addr, (char *)&off, LPDS); + memcpy(addr + LPDS, (char *)&id, LPDS); + + addr += len - LPDS - LPDS; + memcpy(addr, (char *)&off, LPDS); + memcpy(addr + LPDS, (char *)&id, LPDS); + + return 0; +} +EXPORT_SYMBOL(block_debug_setup); + +int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id) +{ + __u64 ne_off; + int err = 0; + + LASSERT(addr); + + ne_off = le64_to_cpu(off); + id = le64_to_cpu(id); + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, + "%s: id %#llx offset %llu off: %#llx != %#llx\n", + who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu id: %#llx != %#llx\n", + who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + addr += end - LPDS - LPDS; + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, + "%s: id %#llx offset %llu end off: %#llx != %#llx\n", + who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, + "%s: id %#llx offset %llu end id: %#llx != %#llx\n", + who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + return err; +} +EXPORT_SYMBOL(block_debug_check); +#undef LPDS diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo.c b/drivers/staging/lustrefsx/lustre/obdecho/echo.c new file mode 100644 index 0000000000000..b65d01a7fae2e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo.c @@ -0,0 +1,980 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdecho/echo.c + * + * Author: Peter Braam + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_ECHO + +#include +#include +#include +#include + +#include "echo_internal.h" + +/* + * The echo objid needs to be below 2^32, because regular FID numbers are + * limited to 2^32 objects in f_oid for the FID_SEQ_ECHO range. 
b=23335 + */ +#define ECHO_INIT_OID 0x10000000ULL +#define ECHO_HANDLE_MAGIC 0xabcd0123fedc9876ULL + +#define ECHO_PERSISTENT_PAGES (ECHO_PERSISTENT_SIZE >> PAGE_SHIFT) +static struct page *echo_persistent_pages[ECHO_PERSISTENT_PAGES]; + +enum { + LPROC_ECHO_READ_BYTES = 1, + LPROC_ECHO_WRITE_BYTES = 2, + LPROC_ECHO_LAST = LPROC_ECHO_WRITE_BYTES + 1 +}; + +struct echo_srv_device { + struct lu_device esd_dev; + struct lu_target esd_lut; +}; + +static inline struct echo_srv_device *echo_srv_dev(struct lu_device *d) +{ + return container_of_safe(d, struct echo_srv_device, esd_dev); +} + +static inline struct obd_device *echo_srv_obd(struct echo_srv_device *esd) +{ + return esd->esd_dev.ld_obd; +} + +static int echo_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lustre_handle conn = { 0 }; + int rc; + + data->ocd_connect_flags &= ECHO_CONNECT_SUPPORTED; + + if (data->ocd_connect_flags & OBD_CONNECT_FLAGS2) + data->ocd_connect_flags2 &= ECHO_CONNECT_SUPPORTED2; + + rc = class_connect(&conn, obd, cluuid); + if (rc) { + CERROR("can't connect %d\n", rc); + return rc; + } + *exp = class_conn2export(&conn); + + return 0; +} + +static int echo_disconnect(struct obd_export *exp) +{ + LASSERT(exp != NULL); + + return server_disconnect_export(exp); +} + +static int echo_init_export(struct obd_export *exp) +{ + return ldlm_init_export(exp); +} + +static int echo_destroy_export(struct obd_export *exp) +{ + ENTRY; + + target_destroy_export(exp); + ldlm_destroy_export(exp); + + RETURN(0); +} + +static u64 echo_next_id(struct obd_device *obd) +{ + u64 id; + + spin_lock(&obd->u.echo.eo_lock); + id = ++obd->u.echo.eo_lastino; + spin_unlock(&obd->u.echo.eo_lock); + + return id; +} + +static void +echo_page_debug_setup(struct page *page, int rw, u64 id, + __u64 offset, int len) +{ + int page_offset = offset & ~PAGE_MASK; + char *addr = ((char *)kmap(page)) + page_offset; + + if (len % OBD_ECHO_BLOCK_SIZE != 0) + CERROR("Unexpected block size %d\n", len); + + while (len > 0) { + if (rw & OBD_BRW_READ) + block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE, + offset, id); + else + block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE, + 0xecc0ecc0ecc0ecc0ULL, + 0xecc0ecc0ecc0ecc0ULL); + + addr += OBD_ECHO_BLOCK_SIZE; + offset += OBD_ECHO_BLOCK_SIZE; + len -= OBD_ECHO_BLOCK_SIZE; + } + + kunmap(page); +} + +static int +echo_page_debug_check(struct page *page, u64 id, + __u64 offset, int len) +{ + int page_offset = offset & ~PAGE_MASK; + char *addr = ((char *)kmap(page)) + page_offset; + int rc = 0; + int rc2; + + if (len % OBD_ECHO_BLOCK_SIZE != 0) + CERROR("Unexpected block size %d\n", len); + + while (len > 0) { + rc2 = block_debug_check("echo", addr, OBD_ECHO_BLOCK_SIZE, + offset, id); + + if (rc2 != 0 && rc == 0) + rc = rc2; + + addr += OBD_ECHO_BLOCK_SIZE; + offset += OBD_ECHO_BLOCK_SIZE; + len -= OBD_ECHO_BLOCK_SIZE; + } + + kunmap(page); + + return rc; +} + +static int echo_map_nb_to_lb(struct obdo *oa, struct obd_ioobj *obj, + struct niobuf_remote *nb, int *pages, + struct niobuf_local *lb, int cmd, int *left) +{ + gfp_t gfp_mask = (ostid_id(&obj->ioo_oid) & 1) ? 
+ GFP_HIGHUSER : GFP_KERNEL; + int ispersistent = ostid_id(&obj->ioo_oid) == ECHO_PERSISTENT_OBJID; + int debug_setup = (!ispersistent && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + struct niobuf_local *res = lb; + u64 offset = nb->rnb_offset; + int len = nb->rnb_len; + + while (len > 0) { + int plen = PAGE_SIZE - (offset & (PAGE_SIZE - 1)); + + if (len < plen) + plen = len; + + /* check for local buf overflow */ + if (*left == 0) + return -EINVAL; + + res->lnb_file_offset = offset; + res->lnb_len = plen; + LASSERT((res->lnb_file_offset & ~PAGE_MASK) + + res->lnb_len <= PAGE_SIZE); + + if (ispersistent && + ((res->lnb_file_offset >> PAGE_SHIFT) < + ECHO_PERSISTENT_PAGES)) { + res->lnb_page = + echo_persistent_pages[res->lnb_file_offset >> + PAGE_SHIFT]; + /* Take extra ref so __free_pages() can be called OK */ + get_page(res->lnb_page); + } else { + res->lnb_page = alloc_page(gfp_mask); + if (!res->lnb_page) { + CERROR("can't get page for id " DOSTID"\n", + POSTID(&obj->ioo_oid)); + return -ENOMEM; + } + /* set mapping so page is not considered encrypted */ + res->lnb_page->mapping = ECHO_MAPPING_UNENCRYPTED; + } + + CDEBUG(D_PAGE, "$$$$ get page %p @ %llu for %d\n", + res->lnb_page, res->lnb_file_offset, res->lnb_len); + + if (cmd & OBD_BRW_READ) + res->lnb_rc = res->lnb_len; + + if (debug_setup) + echo_page_debug_setup(res->lnb_page, cmd, + ostid_id(&obj->ioo_oid), + res->lnb_file_offset, + res->lnb_len); + + offset += plen; + len -= plen; + res++; + + (*left)--; + (*pages)++; + } + + return 0; +} + +static int echo_finalize_lb(struct obdo *oa, struct obd_ioobj *obj, + struct niobuf_remote *rb, int *pgs, + struct niobuf_local *lb, int verify) +{ + struct niobuf_local *res = lb; + u64 start = rb->rnb_offset >> PAGE_SHIFT; + u64 end = (rb->rnb_offset + rb->rnb_len + PAGE_SIZE - 1) >> + PAGE_SHIFT; + int count = (int)(end - start); + int rc = 0; + int i; + + for (i = 0; i < count; i++, (*pgs) ++, res++) { + struct page *page = res->lnb_page; + void *addr; + + if (!page) { + CERROR("null page objid %llu:%p, buf %d/%d\n", + ostid_id(&obj->ioo_oid), page, i, + obj->ioo_bufcnt); + return -EFAULT; + } + + addr = kmap(page); + + CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@%llu\n", + res->lnb_page, addr, res->lnb_file_offset); + + if (verify) { + int vrc = echo_page_debug_check(page, + ostid_id(&obj->ioo_oid), + res->lnb_file_offset, + res->lnb_len); + /* check all the pages always */ + if (vrc != 0 && rc == 0) + rc = vrc; + } + + kunmap(page); + /* NB see comment above regarding persistent pages */ + __free_page(page); + } + + return rc; +} + +static int echo_preprw(const struct lu_env *env, int cmd, + struct obd_export *export, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *nb, int *pages, + struct niobuf_local *res) +{ + struct obd_device *obd; + int tot_bytes = 0; + int rc = 0; + int i, left; + + ENTRY; + + obd = export->exp_obd; + if (!obd) + RETURN(-EINVAL); + + /* Temp fix to stop falling foul of osc_announce_cached() */ + oa->o_valid &= ~(OBD_MD_FLBLOCKS | OBD_MD_FLGRANT); + + memset(res, 0, sizeof(*res) * *pages); + + CDEBUG(D_PAGE, "%s %d obdos with %d IOs\n", + cmd == OBD_BRW_READ ? 
"reading" : "writing", objcount, *pages); + + left = *pages; + *pages = 0; + + for (i = 0; i < objcount; i++, obj++) { + int j; + + for (j = 0 ; j < obj->ioo_bufcnt ; j++, nb++) { + rc = echo_map_nb_to_lb(oa, obj, nb, pages, + res + *pages, cmd, &left); + if (rc) + GOTO(preprw_cleanup, rc); + + tot_bytes += nb->rnb_len; + } + } + + atomic_add(*pages, &obd->u.echo.eo_prep); + + if (cmd & OBD_BRW_READ) + lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_READ_BYTES, + tot_bytes); + else + lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_WRITE_BYTES, + tot_bytes); + + CDEBUG(D_PAGE, "%d pages allocated after prep\n", + atomic_read(&obd->u.echo.eo_prep)); + + RETURN(0); + +preprw_cleanup: + /* + * It is possible that we would rather handle errors by allow + * any already-set-up pages to complete, rather than tearing them + * all down again. I believe that this is what the in-kernel + * prep/commit operations do. + */ + CERROR("cleaning up %u pages (%d obdos)\n", *pages, objcount); + for (i = 0; i < *pages; i++) { + kunmap(res[i].lnb_page); + /* + * NB if this is a persistent page, __free_page() will just + * lose the extra ref gained above + */ + __free_page(res[i].lnb_page); + res[i].lnb_page = NULL; + atomic_dec(&obd->u.echo.eo_prep); + } + + return rc; +} + +static int echo_commitrw(const struct lu_env *env, int cmd, + struct obd_export *export, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *rb, int niocount, + struct niobuf_local *res, int rc, int nob, + ktime_t kstart) +{ + struct obd_device *obd; + int pgs = 0; + int i; + + ENTRY; + + obd = export->exp_obd; + if (!obd) + RETURN(-EINVAL); + + if (rc) + GOTO(commitrw_cleanup, rc); + + if ((cmd & OBD_BRW_RWMASK) == OBD_BRW_READ) { + CDEBUG(D_PAGE, "reading %d obdos with %d IOs\n", + objcount, niocount); + } else { + CDEBUG(D_PAGE, "writing %d obdos with %d IOs\n", + objcount, niocount); + } + + if (niocount && !res) { + CERROR("NULL res niobuf with niocount %d\n", niocount); + RETURN(-EINVAL); + } + + for (i = 0; i < objcount; i++, obj++) { + int verify = (rc == 0 && + ostid_id(&obj->ioo_oid) != ECHO_PERSISTENT_OBJID && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + int j; + + for (j = 0 ; j < obj->ioo_bufcnt ; j++, rb++) { + int vrc = echo_finalize_lb(oa, obj, rb, &pgs, &res[pgs], + verify); + if (vrc == 0) + continue; + + if (vrc == -EFAULT) + GOTO(commitrw_cleanup, rc = vrc); + + if (rc == 0) + rc = vrc; + } + } + + atomic_sub(pgs, &obd->u.echo.eo_prep); + + CDEBUG(D_PAGE, "%d pages remain after commit\n", + atomic_read(&obd->u.echo.eo_prep)); + RETURN(rc); + +commitrw_cleanup: + atomic_sub(pgs, &obd->u.echo.eo_prep); + + CERROR("cleaning up %d pages (%d obdos)\n", + niocount - pgs - 1, objcount); + + while (pgs < niocount) { + struct page *page = res[pgs++].lnb_page; + + if (!page) + continue; + + /* NB see comment above regarding persistent pages */ + __free_page(page); + atomic_dec(&obd->u.echo.eo_prep); + } + return rc; +} + +LPROC_SEQ_FOPS_RO_TYPE(echo, uuid); +static struct lprocfs_vars lprocfs_echo_obd_vars[] = { + { .name = "uuid", + .fops = &echo_uuid_fops }, + { NULL } +}; + +const struct obd_ops echo_obd_ops = { + .o_owner = THIS_MODULE, + .o_connect = echo_connect, + .o_disconnect = echo_disconnect, + .o_init_export = echo_init_export, + .o_destroy_export = echo_destroy_export, + .o_preprw = echo_preprw, + .o_commitrw = echo_commitrw, +}; + +/** + * Echo Server request handler for OST_CREATE RPC. + * + * This is part of request processing. 
Its simulates the object + * creation on OST. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_create_hdl(struct tgt_session_info *tsi) +{ + const struct obdo *oa = &tsi->tsi_ost_body->oa; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + struct obdo *rep_oa; + + ENTRY; + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (!repbody) + RETURN(-ENOMEM); + + if (!(oa->o_mode & S_IFMT)) { + CERROR("%s: no type is set in obdo!\n", + tsi->tsi_exp->exp_obd->obd_name); + RETURN(-ENOENT); + } + + if (!(oa->o_valid & OBD_MD_FLTYPE)) { + CERROR("%s: invalid o_valid in obdo: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); + RETURN(-EINVAL); + } + + rep_oa = &repbody->oa; + + if (!fid_seq_is_echo(ostid_seq(&oa->o_oi))) { + CERROR("%s: invalid seq %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, ostid_seq(&oa->o_oi)); + return -EINVAL; + } + + ostid_set_seq_echo(&rep_oa->o_oi); + ostid_set_id(&rep_oa->o_oi, echo_next_id(obd)); + + CDEBUG(D_INFO, "%s: Create object "DOSTID"\n", + tsi->tsi_exp->exp_obd->obd_name, POSTID(&rep_oa->o_oi)); + + rep_oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP; + + RETURN(0); +} + +/** + * Echo Server request handler for OST_DESTROY RPC. + * + * This is Echo Server part of request handling. It simulates the objects + * destroy on OST. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_destroy_hdl(struct tgt_session_info *tsi) +{ + const struct obdo *oa = &tsi->tsi_ost_body->oa; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + u64 oid; + + ENTRY; + + oid = ostid_id(&oa->o_oi); + LASSERT(oid != 0); + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("%s: obdo missing FLID valid flag: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); + RETURN(-EINVAL); + } + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + + if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino || + ostid_id(&oa->o_oi) < ECHO_INIT_OID) { + CERROR("%s: bad objid to destroy: "DOSTID"\n", + tsi->tsi_exp->exp_obd->obd_name, POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + CDEBUG(D_INFO, "%s: Destroy object "DOSTID"\n", + tsi->tsi_exp->exp_obd->obd_name, POSTID(&oa->o_oi)); + + repbody->oa.o_oi = oa->o_oi; + RETURN(0); +} + +/** + * Echo Server request handler for OST_GETATTR RPC. + * + * This is Echo Server part of request handling. It returns an object + * attributes to the client. All objects have the same attributes in + * Echo Server. 
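+ * (namely the attributes cached in obd->u.echo.eo_oa, which is updated
+ * by OST_SETATTR).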
+ * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_getattr_hdl(struct tgt_session_info *tsi) +{ + const struct obdo *oa = &tsi->tsi_ost_body->oa; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + + ENTRY; + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("%s: obdo missing FLID valid flag: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); + RETURN(-EINVAL); + } + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (!repbody) + RETURN(-ENOMEM); + + repbody->oa.o_oi = oa->o_oi; + repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + obdo_cpy_md(&repbody->oa, &obd->u.echo.eo_oa, oa->o_valid); + + repbody->oa.o_valid |= OBD_MD_FLFLAGS; + repbody->oa.o_flags = OBD_FL_FLUSH; + + RETURN(0); +} + +/** + * Echo Server request handler for OST_SETATTR RPC. + * + * This is Echo Server part of request handling. It sets common + * attributes from request to the Echo Server objects. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_setattr_hdl(struct tgt_session_info *tsi) +{ + struct ost_body *body = tsi->tsi_ost_body; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + + ENTRY; + + if (!(body->oa.o_valid & OBD_MD_FLID)) { + CERROR("%s: obdo missing FLID valid flag: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, + body->oa.o_valid); + RETURN(-EINVAL); + } + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (!repbody) + RETURN(-ENOMEM); + + repbody->oa.o_oi = body->oa.o_oi; + repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + obd->u.echo.eo_oa = body->oa; + + RETURN(0); +} + +#define OBD_FAIL_OST_READ_NET OBD_FAIL_OST_BRW_NET +#define OBD_FAIL_OST_WRITE_NET OBD_FAIL_OST_BRW_NET +#define OST_BRW_READ OST_READ +#define OST_BRW_WRITE OST_WRITE + +/** + * Table of Echo Server specific request handlers + * + * This table contains all opcodes accepted by Echo Server and + * specifies handlers for them. The tgt_request_handler() + * uses such table from each target to process incoming + * requests. 
+ */ +static struct tgt_handler esd_tgt_handlers[] = { +TGT_RPC_HANDLER(OST_FIRST_OPC, 0, OST_CONNECT, tgt_connect, + &RQF_CONNECT, LUSTRE_OBD_VERSION), +TGT_RPC_HANDLER(OST_FIRST_OPC, 0, OST_DISCONNECT, tgt_disconnect, + &RQF_OST_DISCONNECT, LUSTRE_OBD_VERSION), +TGT_OST_HDL(HAS_BODY | HAS_REPLY, OST_GETATTR, esd_getattr_hdl), +TGT_OST_HDL(HAS_BODY | HAS_REPLY | IS_MUTABLE, OST_SETATTR, + esd_setattr_hdl), +TGT_OST_HDL(HAS_REPLY | IS_MUTABLE, OST_CREATE, esd_create_hdl), +TGT_OST_HDL(HAS_REPLY | IS_MUTABLE, OST_DESTROY, esd_destroy_hdl), +TGT_OST_HDL(HAS_BODY | HAS_REPLY, OST_BRW_READ, tgt_brw_read), +TGT_OST_HDL(HAS_BODY | IS_MUTABLE, OST_BRW_WRITE, tgt_brw_write), +}; + +static struct tgt_opc_slice esd_common_slice[] = { + { + .tos_opc_start = OST_FIRST_OPC, + .tos_opc_end = OST_LAST_OPC, + .tos_hs = esd_tgt_handlers + }, + { + .tos_opc_start = OBD_FIRST_OPC, + .tos_opc_end = OBD_LAST_OPC, + .tos_hs = tgt_obd_handlers + }, + { + .tos_opc_start = LDLM_FIRST_OPC, + .tos_opc_end = LDLM_LAST_OPC, + .tos_hs = tgt_dlm_handlers + }, + { + .tos_opc_start = SEC_FIRST_OPC, + .tos_opc_end = SEC_LAST_OPC, + .tos_hs = tgt_sec_ctx_handlers + }, + { + .tos_hs = NULL + } +}; + +/** + * lu_device_operations matrix for ECHO SRV device is NULL, + * this device is just serving incoming requests immediately + * without building a stack of lu_devices. + */ +static const struct lu_device_operations echo_srv_lu_ops = { 0 }; + +/** + * Initialize Echo Server device with parameters in the config log \a cfg. + * + * This is the main starting point of Echo Server initialization. It fills all + * parameters with their initial values and starts Echo Server. + * + * \param[in] env execution environment + * \param[in] m Echo Server device + * \param[in] ldt LU device type of Echo Server + * \param[in] cfg configuration log + * + * \retval 0 if successful + * \retval negative value on error + */ +static int echo_srv_init0(const struct lu_env *env, + struct echo_srv_device *esd, + struct lu_device_type *ldt, struct lustre_cfg *cfg) +{ + const char *dev = lustre_cfg_string(cfg, 0); + struct obd_device *obd; + char ns_name[48]; + int rc; + + ENTRY; + + obd = class_name2obd(dev); + if (!obd) { + CERROR("Cannot find obd with name %s\n", dev); + RETURN(-ENODEV); + } + + spin_lock_init(&obd->u.echo.eo_lock); + obd->u.echo.eo_lastino = ECHO_INIT_OID; + + esd->esd_dev.ld_ops = &echo_srv_lu_ops; + esd->esd_dev.ld_obd = obd; + /* set this lu_device to obd, because error handling need it */ + obd->obd_lu_dev = &esd->esd_dev; + + /* No connection accepted until configurations will finish */ + spin_lock(&obd->obd_dev_lock); + obd->obd_no_conn = 1; + spin_unlock(&obd->obd_dev_lock); + + /* non-replayable target */ + obd->obd_replayable = 0; + + snprintf(ns_name, sizeof(ns_name), "echotgt-%s", obd->obd_uuid.uuid); + obd->obd_namespace = ldlm_namespace_new(obd, ns_name, + LDLM_NAMESPACE_SERVER, + LDLM_NAMESPACE_MODEST, + LDLM_NS_TYPE_OST); + if (IS_ERR(obd->obd_namespace)) { + rc = PTR_ERR(obd->obd_namespace); + CERROR("%s: unable to create server namespace: rc = %d\n", + obd->obd_name, rc); + obd->obd_namespace = NULL; + RETURN(rc); + } + + obd->obd_vars = lprocfs_echo_obd_vars; + if (!lprocfs_obd_setup(obd, true) && + lprocfs_alloc_obd_stats(obd, LPROC_ECHO_LAST) == 0) { + lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_READ_BYTES, + LPROCFS_CNTR_AVGMINMAX, + "read_bytes", "bytes"); + lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_WRITE_BYTES, + LPROCFS_CNTR_AVGMINMAX, + "write_bytes", "bytes"); + } + + 
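+ /* Reverse client used by the target to send LDLM callback RPCs
+ * (blocking/completion ASTs) back to the echo clients holding locks. */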
ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, + "echo_ldlm_cb_client", &obd->obd_ldlm_client); + + rc = tgt_init(env, &esd->esd_lut, obd, NULL, esd_common_slice, + OBD_FAIL_OST_ALL_REQUEST_NET, + OBD_FAIL_OST_ALL_REPLY_NET); + if (rc) + GOTO(err_out, rc); + + spin_lock(&obd->obd_dev_lock); + obd->obd_no_conn = 0; + spin_unlock(&obd->obd_dev_lock); + + RETURN(0); + +err_out: + ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force); + obd->obd_namespace = NULL; + + lprocfs_obd_cleanup(obd); + lprocfs_free_obd_stats(obd); + RETURN(rc); +} + +/** + * Stop the Echo Server device. + * + * This function stops the Echo Server device and all its subsystems. + * This is the end of Echo Server lifecycle. + * + * \param[in] env execution environment + * \param[in] esd ESD device + */ +static void echo_srv_fini(const struct lu_env *env, + struct echo_srv_device *esd) +{ + struct obd_device *obd = echo_srv_obd(esd); + struct lu_device *d = &esd->esd_dev; + int leaked; + + ENTRY; + + class_disconnect_exports(obd); + if (obd->obd_namespace) + ldlm_namespace_free_prior(obd->obd_namespace, NULL, + obd->obd_force); + + obd_exports_barrier(obd); + obd_zombie_barrier(); + + tgt_fini(env, &esd->esd_lut); + + if (obd->obd_namespace) { + ldlm_namespace_free_post(obd->obd_namespace); + obd->obd_namespace = NULL; + } + + lprocfs_obd_cleanup(obd); + lprocfs_free_obd_stats(obd); + + leaked = atomic_read(&obd->u.echo.eo_prep); + if (leaked != 0) + CERROR("%d prep/commitrw pages leaked\n", leaked); + + LASSERT(atomic_read(&d->ld_ref) == 0); + EXIT; +} + +/** + * Implementation of lu_device_type_operations::ldto_device_fini. + * + * Finalize device. Dual to echo_srv_device_init(). It is called from + * obd_precleanup() and stops the current device. + * + * \param[in] env execution environment + * \param[in] d LU device of ESD + * + * \retval NULL + */ +static struct lu_device *echo_srv_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + ENTRY; + echo_srv_fini(env, echo_srv_dev(d)); + RETURN(NULL); +} + +/** + * Implementation of lu_device_type_operations::ldto_device_free. + * + * Free Echo Server device. Dual to echo_srv_device_alloc(). + * + * \param[in] env execution environment + * \param[in] d LU device of ESD + * + * \retval NULL + */ +static struct lu_device *echo_srv_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_srv_device *esd = echo_srv_dev(d); + + lu_device_fini(&esd->esd_dev); + OBD_FREE_PTR(esd); + RETURN(NULL); +} + +/** + * Implementation of lu_device_type_operations::ldto_device_alloc. + * + * This function allocates the new Echo Server device. It is called from + * obd_setup() if OBD device had lu_device_type defined. 
+ * + * \param[in] env execution environment + * \param[in] t lu_device_type of ESD device + * \param[in] cfg configuration log + * + * \retval pointer to the lu_device of just allocated OFD + * \retval ERR_PTR of return value on error + */ +static struct lu_device *echo_srv_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct echo_srv_device *esd; + struct lu_device *l; + int rc; + + OBD_ALLOC_PTR(esd); + if (!esd) + return ERR_PTR(-ENOMEM); + + l = &esd->esd_dev; + lu_device_init(l, t); + rc = echo_srv_init0(env, esd, t, cfg); + if (rc != 0) { + echo_srv_device_free(env, l); + l = ERR_PTR(rc); + } + + return l; +} + +static const struct lu_device_type_operations echo_srv_type_ops = { + .ldto_device_alloc = echo_srv_device_alloc, + .ldto_device_free = echo_srv_device_free, + .ldto_device_fini = echo_srv_device_fini +}; + +struct lu_device_type echo_srv_type = { + .ldt_tags = LU_DEVICE_DT, + .ldt_name = LUSTRE_ECHO_NAME, + .ldt_ops = &echo_srv_type_ops, + .ldt_ctx_tags = LCT_DT_THREAD, +}; + +void echo_persistent_pages_fini(void) +{ + int i; + + for (i = 0; i < ECHO_PERSISTENT_PAGES; i++) + if (echo_persistent_pages[i]) { + __free_page(echo_persistent_pages[i]); + echo_persistent_pages[i] = NULL; + } +} + +int echo_persistent_pages_init(void) +{ + struct page *pg; + int i; + + for (i = 0; i < ECHO_PERSISTENT_PAGES; i++) { + gfp_t gfp_mask = (i < ECHO_PERSISTENT_PAGES / 2) ? + GFP_KERNEL : GFP_HIGHUSER; + + pg = alloc_page(gfp_mask); + if (!pg) { + echo_persistent_pages_fini(); + return -ENOMEM; + } + + memset(kmap(pg), 0, PAGE_SIZE); + kunmap(pg); + /* set mapping so page is not considered encrypted */ + pg->mapping = ECHO_MAPPING_UNENCRYPTED; + + echo_persistent_pages[i] = pg; + } + + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c new file mode 100644 index 0000000000000..3c6bc10a8046d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c @@ -0,0 +1,3171 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_ECHO + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_SERVER_SUPPORT +# include + +#define ETI_NAME_LEN 20 + +#endif /* HAVE_SERVER_SUPPORT */ + +#include "echo_internal.h" + +/** \defgroup echo_client Echo Client + * @{ + */ + +/* echo thread key have a CL_THREAD flag, which set cl_env function directly */ +#define ECHO_MD_CTX_TAG (LCT_REMEMBER | LCT_MD_THREAD) +#define ECHO_DT_CTX_TAG (LCT_REMEMBER | LCT_DT_THREAD) +#define ECHO_SES_TAG (LCT_REMEMBER | LCT_SESSION | LCT_SERVER_SESSION) + +struct echo_device { + struct cl_device ed_cl; + struct echo_client_obd *ed_ec; + + struct cl_site ed_site_myself; + struct lu_site *ed_site; + struct lu_device *ed_next; + int ed_next_ismd; + struct lu_client_seq *ed_cl_seq; +#ifdef HAVE_SERVER_SUPPORT + struct local_oid_storage *ed_los; + struct lu_fid ed_root_fid; +#endif /* HAVE_SERVER_SUPPORT */ +}; + +struct echo_object { + struct cl_object eo_cl; + struct cl_object_header eo_hdr; + struct echo_device *eo_dev; + struct list_head eo_obj_chain; + struct lov_oinfo *eo_oinfo; + atomic_t eo_npages; + int eo_deleted; +}; + +struct echo_object_conf { + struct cl_object_conf eoc_cl; + struct lov_oinfo **eoc_oinfo; +}; + +struct echo_page { + struct cl_page_slice ep_cl; + unsigned long ep_lock; +}; + +struct echo_lock { + struct cl_lock_slice el_cl; + struct list_head el_chain; + struct echo_object *el_object; + __u64 el_cookie; + atomic_t el_refcount; +}; + +#ifdef HAVE_SERVER_SUPPORT +static const char echo_md_root_dir_name[] = "ROOT_ECHO"; + +/** + * In order to use the values of members in struct mdd_device, + * we define an alias structure here. 
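+ * Its leading members must therefore stay in sync with the layout of
+ * struct mdd_device.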
+ */ +struct echo_md_device { + struct md_device emd_md_dev; + struct obd_export *emd_child_exp; + struct dt_device *emd_child; + struct dt_device *emd_bottom; + struct lu_fid emd_root_fid; + struct lu_fid emd_local_root_fid; +}; +#endif /* HAVE_SERVER_SUPPORT */ + +static int echo_client_setup(const struct lu_env *env, + struct obd_device *obd, + struct lustre_cfg *lcfg); +static int echo_client_cleanup(struct obd_device *obd); + +/** \defgroup echo_helpers Helper functions + * @{ + */ +static inline struct echo_device *cl2echo_dev(const struct cl_device *dev) +{ + return container_of_safe(dev, struct echo_device, ed_cl); +} + +static inline struct cl_device *echo_dev2cl(struct echo_device *d) +{ + return &d->ed_cl; +} + +static inline struct echo_device *obd2echo_dev(const struct obd_device *obd) +{ + return cl2echo_dev(lu2cl_dev(obd->obd_lu_dev)); +} + +static inline struct cl_object *echo_obj2cl(struct echo_object *eco) +{ + return &eco->eo_cl; +} + +static inline struct echo_object *cl2echo_obj(const struct cl_object *o) +{ + return container_of(o, struct echo_object, eo_cl); +} + +static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s) +{ + return container_of(s, struct echo_page, ep_cl); +} + +static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s) +{ + return container_of(s, struct echo_lock, el_cl); +} + +static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl) +{ + return ecl->el_cl.cls_lock; +} + +static struct lu_context_key echo_thread_key; + +static inline struct echo_thread_info *echo_env_info(const struct lu_env *env) +{ + struct echo_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &echo_thread_key); + LASSERT(info != NULL); + return info; +} + +static inline +struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c) +{ + return container_of(c, struct echo_object_conf, eoc_cl); +} + +#ifdef HAVE_SERVER_SUPPORT +static inline struct echo_md_device *lu2emd_dev(struct lu_device *d) +{ + return container_of_safe(d, struct echo_md_device, + emd_md_dev.md_lu_dev); +} + +static inline struct lu_device *emd2lu_dev(struct echo_md_device *d) +{ + return &d->emd_md_dev.md_lu_dev; +} + +static inline struct seq_server_site *echo_md_seq_site(struct echo_md_device *d) +{ + return emd2lu_dev(d)->ld_site->ld_seq_site; +} + +static inline struct obd_device *emd2obd_dev(struct echo_md_device *d) +{ + return d->emd_md_dev.md_lu_dev.ld_obd; +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** @} echo_helpers */ + +static int cl_echo_object_put(struct echo_object *eco); +static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, + struct page **pages, int npages, int async); + +struct echo_thread_info { + struct echo_object_conf eti_conf; + struct lustre_md eti_md; + struct cl_2queue eti_queue; + struct cl_io eti_io; + struct cl_lock eti_lock; + struct lu_fid eti_fid; + struct lu_fid eti_fid2; +#ifdef HAVE_SERVER_SUPPORT + struct md_op_spec eti_spec; + struct lov_mds_md_v3 eti_lmm; + struct lov_user_md_v3 eti_lum; + struct md_attr eti_ma; + struct lu_name eti_lname; + /* per-thread values, can be re-used */ + void *eti_big_lmm; /* may be vmalloc'd */ + int eti_big_lmmsize; + char eti_name[ETI_NAME_LEN]; + struct lu_buf eti_buf; + /* If we want to test large ACL, then need to enlarge the buffer. 
*/ + char eti_xattr_buf[LUSTRE_POSIX_ACL_MAX_SIZE_OLD]; +#endif +}; + +/* No session used right now */ +struct echo_session_info { + unsigned long dummy; +}; + +static struct kmem_cache *echo_lock_kmem; +static struct kmem_cache *echo_object_kmem; +static struct kmem_cache *echo_thread_kmem; +static struct kmem_cache *echo_session_kmem; +/* static struct kmem_cache *echo_req_kmem; */ + +static struct lu_kmem_descr echo_caches[] = { + { + .ckd_cache = &echo_lock_kmem, + .ckd_name = "echo_lock_kmem", + .ckd_size = sizeof(struct echo_lock) + }, + { + .ckd_cache = &echo_object_kmem, + .ckd_name = "echo_object_kmem", + .ckd_size = sizeof(struct echo_object) + }, + { + .ckd_cache = &echo_thread_kmem, + .ckd_name = "echo_thread_kmem", + .ckd_size = sizeof(struct echo_thread_info) + }, + { + .ckd_cache = &echo_session_kmem, + .ckd_name = "echo_session_kmem", + .ckd_size = sizeof(struct echo_session_info) + }, + { + .ckd_cache = NULL + } +}; + +/** \defgroup echo_page Page operations + * + * Echo page operations. + * + * @{ + */ +static int echo_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock) +{ + struct echo_page *ep = cl2echo_page(slice); + + if (!nonblock) { + if (test_and_set_bit(0, &ep->ep_lock)) + return -EAGAIN; + } else { + while (test_and_set_bit(0, &ep->ep_lock)) + wait_on_bit(&ep->ep_lock, 0, TASK_UNINTERRUPTIBLE); + } + return 0; +} + +static void echo_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct echo_page *ep = cl2echo_page(slice); + + LASSERT(test_bit(0, &ep->ep_lock)); + clear_and_wake_up_bit(0, &ep->ep_lock); +} + +static void echo_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + cl_page_delete(env, slice->cpl_page); +} + +static int echo_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + if (test_bit(0, &cl2echo_page(slice)->ep_lock)) + return -EBUSY; + return -ENODATA; +} + +static void echo_page_completion(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + LASSERT(slice->cpl_page->cp_sync_io != NULL); +} + +static void echo_page_fini(const struct lu_env *env, + struct cl_page_slice *slice, + struct pagevec *pvec) +{ + struct echo_object *eco = cl2echo_obj(slice->cpl_obj); + + ENTRY; + atomic_dec(&eco->eo_npages); + put_page(slice->cpl_page->cp_vmpage); + EXIT; +} + +static int echo_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + return 0; +} + +static int echo_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct echo_page *ep = cl2echo_page(slice); + + (*printer)(env, cookie, LUSTRE_ECHO_CLIENT_NAME"-page@%p %d vm@%p\n", + ep, test_bit(0, &ep->ep_lock), + slice->cpl_page->cp_vmpage); + return 0; +} + +static const struct cl_page_operations echo_page_ops = { + .cpo_own = echo_page_own, + .cpo_disown = echo_page_disown, + .cpo_discard = echo_page_discard, + .cpo_fini = echo_page_fini, + .cpo_print = echo_page_print, + .cpo_is_vmlocked = echo_page_is_vmlocked, + .io = { + [CRT_READ] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + }, + [CRT_WRITE] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + } + } +}; + +/** @} echo_page */ + +/** \defgroup echo_lock Locking + * + * echo lock operations + * + * @{ + */ +static void echo_lock_fini(const struct lu_env *env, + struct 
cl_lock_slice *slice) +{ + struct echo_lock *ecl = cl2echo_lock(slice); + + LASSERT(list_empty(&ecl->el_chain)); + OBD_SLAB_FREE_PTR(ecl, echo_lock_kmem); +} + +static const struct cl_lock_operations echo_lock_ops = { + .clo_fini = echo_lock_fini, +}; + +/** @} echo_lock */ + +/** \defgroup echo_cl_ops cl_object operations + * + * operations for cl_object + * + * @{ + */ +static int echo_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct echo_page *ep = cl_object_page_slice(obj, page); + struct echo_object *eco = cl2echo_obj(obj); + + ENTRY; + get_page(page->cp_vmpage); + /* + * ep_lock is similar to the lock_page() lock, and + * cannot usefully be monitored by lockdep. + * So just use a bit in an "unsigned long" and use the + * wait_on_bit() interface to wait for the bit to be clear. + */ + ep->ep_lock = 0; + cl_page_slice_add(page, &ep->ep_cl, obj, &echo_page_ops); + atomic_inc(&eco->eo_npages); + RETURN(0); +} + +static int echo_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + return 0; +} + +static int echo_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *unused) +{ + struct echo_lock *el; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(el, echo_lock_kmem, GFP_NOFS); + if (el) { + cl_lock_slice_add(lock, &el->el_cl, obj, &echo_lock_ops); + el->el_object = cl2echo_obj(obj); + INIT_LIST_HEAD(&el->el_chain); + atomic_set(&el->el_refcount, 0); + } + RETURN(el ? 0 : -ENOMEM); +} + +static int echo_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + return 0; +} + +static const struct cl_object_operations echo_cl_obj_ops = { + .coo_page_init = echo_page_init, + .coo_lock_init = echo_lock_init, + .coo_io_init = echo_io_init, + .coo_conf_set = echo_conf_set +}; +/** @} echo_cl_ops */ + +/** \defgroup echo_lu_ops lu_object operations + * + * operations for echo lu object. + * + * @{ + */ +static int echo_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(obj->lo_dev)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + + ENTRY; + if (ed->ed_next) { + struct lu_object *below; + struct lu_device *under; + + under = ed->ed_next; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, + under); + if (!below) + RETURN(-ENOMEM); + lu_object_add(obj, below); + } + + if (!ed->ed_next_ismd) { + const struct cl_object_conf *cconf = lu2cl_conf(conf); + struct echo_object_conf *econf = cl2echo_conf(cconf); + + LASSERT(econf->eoc_oinfo != NULL); + + /* + * Transfer the oinfo pointer to eco that it won't be + * freed. 
+ */ + eco->eo_oinfo = *econf->eoc_oinfo; + *econf->eoc_oinfo = NULL; + } else { + eco->eo_oinfo = NULL; + } + + eco->eo_dev = ed; + atomic_set(&eco->eo_npages, 0); + cl_object_page_init(lu2cl(obj), sizeof(struct echo_page)); + + spin_lock(&ec->ec_lock); + list_add_tail(&eco->eo_obj_chain, &ec->ec_objects); + spin_unlock(&ec->ec_lock); + + RETURN(0); +} + +static void echo_object_delete(const struct lu_env *env, struct lu_object *obj) +{ + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + struct echo_client_obd *ec; + + ENTRY; + + /* object delete called unconditionally - layer init or not */ + if (eco->eo_dev == NULL) + return; + + ec = eco->eo_dev->ed_ec; + + LASSERT(atomic_read(&eco->eo_npages) == 0); + + spin_lock(&ec->ec_lock); + list_del_init(&eco->eo_obj_chain); + spin_unlock(&ec->ec_lock); + + if (eco->eo_oinfo) + OBD_FREE_PTR(eco->eo_oinfo); +} + +static void echo_object_free_rcu(struct rcu_head *head) +{ + struct echo_object *eco = container_of(head, struct echo_object, + eo_hdr.coh_lu.loh_rcu); + + kmem_cache_free(echo_object_kmem, eco); +} + +static void echo_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + + ENTRY; + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + + OBD_FREE_PRE(eco, sizeof(*eco), "slab-freed"); + call_rcu(&eco->eo_hdr.coh_lu.loh_rcu, echo_object_free_rcu); + EXIT; +} + +static int echo_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct echo_object *obj = cl2echo_obj(lu2cl(o)); + + return (*p)(env, cookie, "echoclient-object@%p", obj); +} + +static const struct lu_object_operations echo_lu_obj_ops = { + .loo_object_init = echo_object_init, + .loo_object_delete = echo_object_delete, + .loo_object_release = NULL, + .loo_object_free = echo_object_free, + .loo_object_print = echo_object_print, + .loo_object_invariant = NULL +}; +/** @} echo_lu_ops */ + +/** \defgroup echo_lu_dev_ops lu_device operations + * + * Operations for echo lu device. + * + * @{ + */ +static struct lu_object *echo_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev) +{ + struct echo_object *eco; + struct lu_object *obj = NULL; + + ENTRY; + /* we're the top dev. */ + LASSERT(hdr == NULL); + OBD_SLAB_ALLOC_PTR_GFP(eco, echo_object_kmem, GFP_NOFS); + if (eco) { + struct cl_object_header *hdr = &eco->eo_hdr; + + obj = &echo_obj2cl(eco)->co_lu; + cl_object_header_init(hdr); + hdr->coh_page_bufsize = cfs_size_round(sizeof(struct cl_page)); + + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + + eco->eo_cl.co_ops = &echo_cl_obj_ops; + obj->lo_ops = &echo_lu_obj_ops; + } + RETURN(obj); +} + +static const struct lu_device_operations echo_device_lu_ops = { + .ldo_object_alloc = echo_object_alloc, +}; + +/** @} echo_lu_dev_ops */ + +/** \defgroup echo_init Setup and teardown + * + * Init and fini functions for echo client.
+ * + * @{ + */ +static int echo_site_init(const struct lu_env *env, struct echo_device *ed) +{ + struct cl_site *site = &ed->ed_site_myself; + int rc; + + /* initialize site */ + rc = cl_site_init(site, &ed->ed_cl); + if (rc) { + CERROR("Cannot initialize site for echo client(%d)\n", rc); + return rc; + } + + rc = lu_site_init_finish(&site->cs_lu); + if (rc) { + cl_site_fini(site); + return rc; + } + + ed->ed_site = &site->cs_lu; + return 0; +} + +static void echo_site_fini(const struct lu_env *env, struct echo_device *ed) +{ + if (ed->ed_site) { + if (!ed->ed_next_ismd) + lu_site_fini(ed->ed_site); + ed->ed_site = NULL; + } +} + +static void *echo_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct echo_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, echo_thread_kmem, GFP_NOFS); + if (!info) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void echo_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct echo_thread_info *info = data; + + OBD_SLAB_FREE_PTR(info, echo_thread_kmem); +} + +static struct lu_context_key echo_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = echo_thread_key_init, + .lct_fini = echo_thread_key_fini, +}; + +static void *echo_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct echo_session_info *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, echo_session_kmem, GFP_NOFS); + if (!session) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void echo_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct echo_session_info *session = data; + + OBD_SLAB_FREE_PTR(session, echo_session_kmem); +} + +static struct lu_context_key echo_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = echo_session_key_init, + .lct_fini = echo_session_key_fini, +}; + +LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key); + +#ifdef HAVE_SERVER_SUPPORT +# define ECHO_SEQ_WIDTH 0xffffffff +static int echo_fid_init(struct echo_device *ed, char *obd_name, + struct seq_server_site *ss) +{ + char *prefix; + int rc; + + ENTRY; + OBD_ALLOC_PTR(ed->ed_cl_seq); + if (!ed->ed_cl_seq) + RETURN(-ENOMEM); + + OBD_ALLOC(prefix, MAX_OBD_NAME + 5); + if (!prefix) + GOTO(out_free_seq, rc = -ENOMEM); + + snprintf(prefix, MAX_OBD_NAME + 5, "srv-%s", obd_name); + + /* Init client side sequence-manager */ + seq_client_init(ed->ed_cl_seq, NULL, + LUSTRE_SEQ_METADATA, + prefix, ss->ss_server_seq); + ed->ed_cl_seq->lcs_width = ECHO_SEQ_WIDTH; + OBD_FREE(prefix, MAX_OBD_NAME + 5); + + RETURN(0); + +out_free_seq: + OBD_FREE_PTR(ed->ed_cl_seq); + ed->ed_cl_seq = NULL; + RETURN(rc); +} + +static int echo_fid_fini(struct obd_device *obd) +{ + struct echo_device *ed = obd2echo_dev(obd); + + ENTRY; + if (ed->ed_cl_seq) { + seq_client_fini(ed->ed_cl_seq); + OBD_FREE_PTR(ed->ed_cl_seq); + ed->ed_cl_seq = NULL; + } + + RETURN(0); +} + +static void echo_ed_los_fini(const struct lu_env *env, struct echo_device *ed) +{ + ENTRY; + if (ed != NULL && ed->ed_next_ismd && ed->ed_los != NULL) { + local_oid_storage_fini(env, ed->ed_los); + ed->ed_los = NULL; + } +} + +static int +echo_md_local_file_create(const struct lu_env *env, struct echo_md_device *emd, + struct local_oid_storage *los, + const struct lu_fid *pfid, const char *name, + __u32 mode, struct lu_fid *fid) +{ + struct dt_object *parent = NULL; + struct dt_object *dto = NULL; + int rc = 0; + + ENTRY; + LASSERT(!fid_is_zero(pfid)); + parent = dt_locate(env, emd->emd_bottom, 
pfid); + if (unlikely(IS_ERR(parent))) + RETURN(PTR_ERR(parent)); + + /* create local file with @fid */ + dto = local_file_find_or_create_with_fid(env, emd->emd_bottom, fid, + parent, name, mode); + if (IS_ERR(dto)) + GOTO(out_put, rc = PTR_ERR(dto)); + + *fid = *lu_object_fid(&dto->do_lu); + /* + * since stack is not fully set up the local_storage uses own stack + * and we should drop its object from cache + */ + dt_object_put_nocache(env, dto); + + EXIT; +out_put: + dt_object_put(env, parent); + RETURN(rc); +} + +static int +echo_md_root_get(const struct lu_env *env, struct echo_md_device *emd, + struct echo_device *ed) +{ + struct lu_fid fid; + int rc = 0; + + ENTRY; + /* Setup local dirs */ + fid.f_seq = FID_SEQ_LOCAL_NAME; + fid.f_oid = 1; + fid.f_ver = 0; + rc = local_oid_storage_init(env, emd->emd_bottom, &fid, &ed->ed_los); + if (rc != 0) + RETURN(rc); + + lu_echo_root_fid(&fid); + if (echo_md_seq_site(emd)->ss_node_id == 0) { + rc = echo_md_local_file_create(env, emd, ed->ed_los, + &emd->emd_local_root_fid, + echo_md_root_dir_name, S_IFDIR | + S_IRUGO | S_IWUSR | S_IXUGO, + &fid); + if (rc != 0) { + CERROR("%s: create md echo root fid failed: rc = %d\n", + emd2obd_dev(emd)->obd_name, rc); + GOTO(out_los, rc); + } + } + ed->ed_root_fid = fid; + + RETURN(0); +out_los: + echo_ed_los_fini(env, ed); + + RETURN(rc); +} +#endif /* HAVE_SERVER_SUPPORT */ + +static struct lu_device *echo_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *next; + struct echo_device *ed; + struct cl_device *cd; + struct obd_device *obd = NULL; /* to keep compiler happy */ + struct obd_device *tgt; + const char *tgt_type_name; + int rc; + int cleanup = 0; + + ENTRY; + OBD_ALLOC_PTR(ed); + if (!ed) + GOTO(out, rc = -ENOMEM); + + cleanup = 1; + cd = &ed->ed_cl; + rc = cl_device_init(cd, t); + if (rc) + GOTO(out, rc); + + cd->cd_lu_dev.ld_ops = &echo_device_lu_ops; + + cleanup = 2; + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + LASSERT(env != NULL); + + tgt = class_name2obd(lustre_cfg_string(cfg, 1)); + if (!tgt) { + CERROR("Can not find tgt device %s\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -ENODEV); + } + + next = tgt->obd_lu_dev; + + if (strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) { + ed->ed_next_ismd = 1; + } else if (strcmp(tgt->obd_type->typ_name, LUSTRE_OST_NAME) == 0 || + strcmp(tgt->obd_type->typ_name, LUSTRE_OSC_NAME) == 0) { + ed->ed_next_ismd = 0; + rc = echo_site_init(env, ed); + if (rc) + GOTO(out, rc); + } else { + GOTO(out, rc = -EINVAL); + } + + cleanup = 3; + + rc = echo_client_setup(env, obd, cfg); + if (rc) + GOTO(out, rc); + + ed->ed_ec = &obd->u.echo_client; + cleanup = 4; + + if (ed->ed_next_ismd) { +#ifdef HAVE_SERVER_SUPPORT + /* Suppose to connect to some Metadata layer */ + struct lu_site *ls = NULL; + struct lu_device *ld = NULL; + struct md_device *md = NULL; + struct echo_md_device *emd = NULL; + int found = 0; + + if (!next) { + CERROR("%s is not lu device type!\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -EINVAL); + } + + tgt_type_name = lustre_cfg_string(cfg, 2); + if (!tgt_type_name) { + CERROR("%s no type name for echo %s setup\n", + lustre_cfg_string(cfg, 1), + tgt->obd_type->typ_name); + GOTO(out, rc = -EINVAL); + } + + ls = next->ld_site; + + spin_lock(&ls->ls_ld_lock); + list_for_each_entry(ld, &ls->ls_ld_linkage, ld_linkage) { + if (strcmp(ld->ld_type->ldt_name, tgt_type_name) == 0) { + found = 1; + break; + } + } + spin_unlock(&ls->ls_ld_lock); + + if 
(found == 0) { + CERROR("%s is not lu device type!\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -EINVAL); + } + + next = ld; + /* For MD echo client, it will use the site in MDS stack */ + ed->ed_site = ls; + ed->ed_cl.cd_lu_dev.ld_site = ls; + rc = echo_fid_init(ed, obd->obd_name, lu_site2seq(ls)); + if (rc) { + CERROR("echo fid init error %d\n", rc); + GOTO(out, rc); + } + + md = lu2md_dev(next); + emd = lu2emd_dev(&md->md_lu_dev); + rc = echo_md_root_get(env, emd, ed); + if (rc != 0) { + CERROR("%s: get root error: rc = %d\n", + emd2obd_dev(emd)->obd_name, rc); + GOTO(out, rc); + } +#else /* !HAVE_SERVER_SUPPORT */ + CERROR( + "Local operations are NOT supported on client side. Only remote operations are supported. Metadata client must be run on server side.\n"); + GOTO(out, rc = -EOPNOTSUPP); +#endif /* HAVE_SERVER_SUPPORT */ + } else { + /* + * if echo client is to be stacked upon ost device, the next is + * NULL since ost is not a clio device so far + */ + if (next != NULL && !lu_device_is_cl(next)) + next = NULL; + + tgt_type_name = tgt->obd_type->typ_name; + if (next) { + LASSERT(next != NULL); + if (next->ld_site) + GOTO(out, rc = -EBUSY); + + next->ld_site = ed->ed_site; + rc = next->ld_type->ldt_ops->ldto_device_init(env, next, + next->ld_type->ldt_name, + NULL); + if (rc) + GOTO(out, rc); + } else { + LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0); + } + } + + ed->ed_next = next; + RETURN(&cd->cd_lu_dev); +out: + switch (cleanup) { + case 4: { + int rc2; + + rc2 = echo_client_cleanup(obd); + if (rc2) + CERROR("Cleanup obd device %s error(%d)\n", + obd->obd_name, rc2); + } + fallthrough; + + case 3: + echo_site_fini(env, ed); + fallthrough; + case 2: + cl_device_fini(&ed->ed_cl); + fallthrough; + case 1: + OBD_FREE_PTR(ed); + fallthrough; + case 0: + default: + break; + } + return ERR_PTR(rc); +} + +static int echo_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + LBUG(); + return 0; +} + +static struct lu_device *echo_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(d)); + struct lu_device *next = ed->ed_next; + + while (next && !ed->ed_next_ismd) + next = next->ld_type->ldt_ops->ldto_device_fini(env, next); + return NULL; +} + +static void echo_lock_release(const struct lu_env *env, + struct echo_lock *ecl, + int still_used) +{ + struct cl_lock *clk = echo_lock2cl(ecl); + + cl_lock_release(env, clk); +} + +static struct lu_device *echo_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(d)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; + struct lu_device *next = ed->ed_next; + + CDEBUG(D_INFO, "echo device:%p is going to be freed, next = %p\n", + ed, next); + + lu_site_purge(env, ed->ed_site, -1); + + /* + * Check if there are objects still alive. + * There should not be any, because lu_site_purge() should have cleaned + * up all cached objects. If some remain, the echo device is probably + * still being accessed in parallel. + */ + spin_lock(&ec->ec_lock); + list_for_each_entry(eco, &ec->ec_objects, eo_obj_chain) + eco->eo_deleted = 1; + spin_unlock(&ec->ec_lock); + + /* purge again */ + lu_site_purge(env, ed->ed_site, -1); + + CDEBUG(D_INFO, + "Waiting for the reference of echo object to be dropped\n"); + + /* Wait for the last reference to be dropped.
*/ + spin_lock(&ec->ec_lock); + while (!list_empty(&ec->ec_objects)) { + spin_unlock(&ec->ec_lock); + CERROR( + "echo_client still has objects at cleanup time, wait for 1 second\n"); + schedule_timeout_uninterruptible(cfs_time_seconds(1)); + lu_site_purge(env, ed->ed_site, -1); + spin_lock(&ec->ec_lock); + } + spin_unlock(&ec->ec_lock); + + LASSERT(list_empty(&ec->ec_locks)); + + CDEBUG(D_INFO, "No object exists, exiting...\n"); + + echo_client_cleanup(d->ld_obd); +#ifdef HAVE_SERVER_SUPPORT + echo_fid_fini(d->ld_obd); + echo_ed_los_fini(env, ed); +#endif + while (next && !ed->ed_next_ismd) + next = next->ld_type->ldt_ops->ldto_device_free(env, next); + + LASSERT(ed->ed_site == d->ld_site); + echo_site_fini(env, ed); + cl_device_fini(&ed->ed_cl); + OBD_FREE_PTR(ed); + + cl_env_cache_purge(~0); + + return NULL; +} + +static const struct lu_device_type_operations echo_device_type_ops = { + .ldto_init = echo_type_init, + .ldto_fini = echo_type_fini, + + .ldto_start = echo_type_start, + .ldto_stop = echo_type_stop, + + .ldto_device_alloc = echo_device_alloc, + .ldto_device_free = echo_device_free, + .ldto_device_init = echo_device_init, + .ldto_device_fini = echo_device_fini +}; + +static struct lu_device_type echo_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_ECHO_CLIENT_NAME, + .ldt_ops = &echo_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD | LCT_MD_THREAD | LCT_DT_THREAD, +}; +/** @} echo_init */ + +/** \defgroup echo_exports Exported operations + * + * exporting functions to echo client + * + * @{ + */ + +/* Interfaces to echo client obd device */ +static struct echo_object * +cl_echo_object_find(struct echo_device *d, const struct ost_id *oi) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct echo_object_conf *conf; + struct echo_object *eco; + struct cl_object *obj; + struct lov_oinfo *oinfo = NULL; + struct lu_fid *fid; + __u16 refcheck; + int rc; + + ENTRY; + LASSERTF(ostid_id(oi) != 0, DOSTID"\n", POSTID(oi)); + LASSERTF(ostid_seq(oi) == FID_SEQ_ECHO, DOSTID"\n", POSTID(oi)); + + /* Never return an object if the obd is to be freed. */ + if (echo_dev2cl(d)->cd_lu_dev.ld_obd->obd_stopping) + RETURN(ERR_PTR(-ENODEV)); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN((void *)env); + + info = echo_env_info(env); + conf = &info->eti_conf; + if (d->ed_next) { + OBD_ALLOC_PTR(oinfo); + if (!oinfo) + GOTO(out, eco = ERR_PTR(-ENOMEM)); + + oinfo->loi_oi = *oi; + conf->eoc_cl.u.coc_oinfo = oinfo; + } + + /* + * If echo_object_init() is successful then ownership of oinfo + * is transferred to the object. + */ + conf->eoc_oinfo = &oinfo; + + fid = &info->eti_fid; + rc = ostid_to_fid(fid, oi, 0); + if (rc != 0) + GOTO(out, eco = ERR_PTR(rc)); + + /* + * In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() + */ + /* coverity[overrun-buffer-val] */ + obj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl); + if (IS_ERR(obj)) + GOTO(out, eco = (void *)obj); + + eco = cl2echo_obj(obj); + if (eco->eo_deleted) { + cl_object_put(env, obj); + eco = ERR_PTR(-EAGAIN); + } + +out: + if (oinfo) + OBD_FREE_PTR(oinfo); + + cl_env_put(env, &refcheck); + RETURN(eco); +} + +static int cl_echo_object_put(struct echo_object *eco) +{ + struct lu_env *env; + struct cl_object *obj = echo_obj2cl(eco); + __u16 refcheck; + + ENTRY; + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + /* an external function to kill an object? 
*/ + if (eco->eo_deleted) { + struct lu_object_header *loh = obj->co_lu.lo_header; + + LASSERT(&eco->eo_hdr == luh2coh(loh)); + set_bit(LU_OBJECT_HEARD_BANSHEE, &loh->loh_flags); + } + + cl_object_put(env, obj); + cl_env_put(env, &refcheck); + RETURN(0); +} + +static int cl_echo_enqueue0(struct lu_env *env, struct echo_object *eco, + u64 start, u64 end, int mode, + __u64 *cookie, __u32 enqflags) +{ + struct cl_io *io; + struct cl_lock *lck; + struct cl_object *obj; + struct cl_lock_descr *descr; + struct echo_thread_info *info; + int rc = -ENOMEM; + + ENTRY; + info = echo_env_info(env); + io = &info->eti_io; + lck = &info->eti_lock; + obj = echo_obj2cl(eco); + + memset(lck, 0, sizeof(*lck)); + descr = &lck->cll_descr; + descr->cld_obj = obj; + descr->cld_start = cl_index(obj, start); + descr->cld_end = cl_index(obj, end); + descr->cld_mode = mode == LCK_PW ? CLM_WRITE : CLM_READ; + descr->cld_enq_flags = enqflags; + io->ci_obj = obj; + + rc = cl_lock_request(env, io, lck); + if (rc == 0) { + struct echo_client_obd *ec = eco->eo_dev->ed_ec; + struct echo_lock *el; + + el = cl2echo_lock(cl_lock_at(lck, &echo_device_type)); + spin_lock(&ec->ec_lock); + if (list_empty(&el->el_chain)) { + list_add(&el->el_chain, &ec->ec_locks); + el->el_cookie = ++ec->ec_unique; + } + atomic_inc(&el->el_refcount); + *cookie = el->el_cookie; + spin_unlock(&ec->ec_lock); + } + RETURN(rc); +} + +static int cl_echo_cancel0(struct lu_env *env, struct echo_device *ed, + __u64 cookie) +{ + struct echo_client_obd *ec = ed->ed_ec; + struct echo_lock *ecl = NULL; + struct list_head *el; + int found = 0, still_used = 0; + + ENTRY; + LASSERT(ec != NULL); + spin_lock(&ec->ec_lock); + list_for_each(el, &ec->ec_locks) { + ecl = list_entry(el, struct echo_lock, el_chain); + CDEBUG(D_INFO, "ecl: %p, cookie: %#llx\n", ecl, ecl->el_cookie); + found = (ecl->el_cookie == cookie); + if (found) { + if (atomic_dec_and_test(&ecl->el_refcount)) + list_del_init(&ecl->el_chain); + else + still_used = 1; + break; + } + } + spin_unlock(&ec->ec_lock); + + if (!found) + RETURN(-ENOENT); + + echo_lock_release(env, ecl, still_used); + RETURN(0); +} + +static void echo_commit_callback(const struct lu_env *env, struct cl_io *io, + struct pagevec *pvec) +{ + struct echo_thread_info *info; + struct cl_2queue *queue; + int i = 0; + + info = echo_env_info(env); + LASSERT(io == &info->eti_io); + + queue = &info->eti_queue; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *vmpage = pvec->pages[i]; + struct cl_page *page = (struct cl_page *)vmpage->private; + + cl_page_list_add(&queue->c2_qout, page, true); + } +} + +static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, + struct page **pages, int npages, int async) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct cl_object *obj = echo_obj2cl(eco); + struct echo_device *ed = eco->eo_dev; + struct cl_2queue *queue; + struct cl_io *io; + struct cl_page *clp; + struct lustre_handle lh = { 0 }; + int page_size = cl_page_size(obj); + int rc; + int i; + __u16 refcheck; + + ENTRY; + LASSERT((offset & ~PAGE_MASK) == 0); + LASSERT(ed->ed_next != NULL); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + info = echo_env_info(env); + io = &info->eti_io; + queue = &info->eti_queue; + + cl_2queue_init(queue); + + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, obj); + if (rc < 0) + GOTO(out, rc); + LASSERT(rc == 0); + + rc = cl_echo_enqueue0(env, eco, offset, + offset + npages * PAGE_SIZE - 1, + rw == READ ? 
LCK_PR : LCK_PW, &lh.cookie, + CEF_NEVER); + if (rc < 0) + GOTO(error_lock, rc); + + for (i = 0; i < npages; i++) { + LASSERT(pages[i]); + clp = cl_page_find(env, obj, cl_index(obj, offset), + pages[i], CPT_TRANSIENT); + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + LASSERT(clp->cp_type == CPT_TRANSIENT); + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + cl_2queue_add(queue, clp, true); + + /* + * drop the reference count for cl_page_find, so that the page + * will be freed in cl_2queue_fini. + */ + cl_page_put(env, clp); + cl_page_clip(env, clp, 0, page_size); + + offset += page_size; + } + + if (rc == 0) { + enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE; + + async = async && (typ == CRT_WRITE); + if (async) + rc = cl_io_commit_async(env, io, &queue->c2_qin, + 0, PAGE_SIZE, + echo_commit_callback); + else + rc = cl_io_submit_sync(env, io, typ, queue, 0); + CDEBUG(D_INFO, "echo_client %s write returns %d\n", + async ? "async" : "sync", rc); + } + + cl_echo_cancel0(env, ed, lh.cookie); + EXIT; +error_lock: + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + cl_io_fini(env, io); +out: + cl_env_put(env, &refcheck); + return rc; +} +/** @} echo_exports */ + +static u64 last_object_id; + +#ifdef HAVE_SERVER_SUPPORT +static inline void echo_md_build_name(struct lu_name *lname, char *name, + __u64 id) +{ + snprintf(name, ETI_NAME_LEN, "%llu", id); + lname->ln_name = name; + lname->ln_namelen = strlen(name); +} + +/* similar to mdt_attr_get_complex */ +static int echo_big_lmm_get(const struct lu_env *env, struct md_object *o, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + int rc; + + ENTRY; + + LASSERT(ma->ma_lmm_size > 0); + + LASSERT(ma->ma_need & (MA_LOV | MA_LMV)); + if (ma->ma_need & MA_LOV) + rc = mo_xattr_get(env, o, &LU_BUF_NULL, XATTR_NAME_LOV); + else + rc = mo_xattr_get(env, o, &LU_BUF_NULL, XATTR_NAME_LMV); + + if (rc < 0) + RETURN(rc); + + /* big_lmm may need to be grown */ + if (info->eti_big_lmmsize < rc) { + int size = size_roundup_power2(rc); + + if (info->eti_big_lmmsize > 0) { + /* free old buffer */ + LASSERT(info->eti_big_lmm); + OBD_FREE_LARGE(info->eti_big_lmm, + info->eti_big_lmmsize); + info->eti_big_lmm = NULL; + info->eti_big_lmmsize = 0; + } + + OBD_ALLOC_LARGE(info->eti_big_lmm, size); + if (!info->eti_big_lmm) + RETURN(-ENOMEM); + info->eti_big_lmmsize = size; + } + LASSERT(info->eti_big_lmmsize >= rc); + + info->eti_buf.lb_buf = info->eti_big_lmm; + info->eti_buf.lb_len = info->eti_big_lmmsize; + if (ma->ma_need & MA_LOV) + rc = mo_xattr_get(env, o, &info->eti_buf, XATTR_NAME_LOV); + else + rc = mo_xattr_get(env, o, &info->eti_buf, XATTR_NAME_LMV); + if (rc < 0) + RETURN(rc); + + if (ma->ma_need & MA_LOV) + ma->ma_valid |= MA_LOV; + else + ma->ma_valid |= MA_LMV; + + ma->ma_lmm = info->eti_big_lmm; + ma->ma_lmm_size = rc; + + RETURN(0); +} + +static int echo_attr_get_complex(const struct lu_env *env, + struct md_object *next, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_buf *buf = &info->eti_buf; + umode_t mode = lu_object_attr(&next->mo_lu); + int rc = 0, rc2; + + ENTRY; + + ma->ma_valid = 0; + + if (ma->ma_need & MA_INODE) { + rc = mo_attr_get(env, next, ma); + if (rc) + GOTO(out, rc); + ma->ma_valid |= MA_INODE; + } + + if ((ma->ma_need & MA_LOV) && (S_ISREG(mode) || S_ISDIR(mode))) { + LASSERT(ma->ma_lmm_size > 0); + buf->lb_buf = 
ma->ma_lmm; + buf->lb_len = ma->ma_lmm_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_LOV); + if (rc2 > 0) { + ma->ma_lmm_size = rc2; + ma->ma_valid |= MA_LOV; + } else if (rc2 == -ENODATA) { + /* no LOV EA */ + ma->ma_lmm_size = 0; + } else if (rc2 == -ERANGE) { + rc2 = echo_big_lmm_get(env, next, ma); + if (rc2 < 0) + GOTO(out, rc = rc2); + } else { + GOTO(out, rc = rc2); + } + } + + if ((ma->ma_need & MA_LMV) && S_ISDIR(mode)) { + LASSERT(ma->ma_lmm_size > 0); + buf->lb_buf = ma->ma_lmm; + buf->lb_len = ma->ma_lmm_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_LMV); + if (rc2 > 0) { + ma->ma_lmm_size = rc2; + ma->ma_valid |= MA_LMV; + } else if (rc2 == -ENODATA) { + /* no LMV EA */ + ma->ma_lmm_size = 0; + } else if (rc2 == -ERANGE) { + rc2 = echo_big_lmm_get(env, next, ma); + if (rc2 < 0) + GOTO(out, rc = rc2); + } else { + GOTO(out, rc = rc2); + } + } + +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL + if ((ma->ma_need & MA_ACL_DEF) && S_ISDIR(mode)) { + buf->lb_buf = ma->ma_acl; + buf->lb_len = ma->ma_acl_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_ACL_DEFAULT); + if (rc2 > 0) { + ma->ma_acl_size = rc2; + ma->ma_valid |= MA_ACL_DEF; + } else if (rc2 == -ENODATA) { + /* no ACLs */ + ma->ma_acl_size = 0; + } else { + GOTO(out, rc = rc2); + } + } +#endif +out: + CDEBUG(D_INODE, "after getattr rc = %d, ma_valid = %#llx ma_lmm=%p\n", + rc, ma->ma_valid, ma->ma_lmm); + RETURN(rc); +} + +static int +echo_md_create_internal(const struct lu_env *env, struct echo_device *ed, + struct md_object *parent, struct lu_fid *fid, + struct lu_name *lname, struct md_op_spec *spec, + struct md_attr *ma) +{ + struct lu_object *ec_child, *child; + struct lu_device *ld = ed->ed_next; + struct echo_thread_info *info = echo_env_info(env); + struct lu_fid *fid2 = &info->eti_fid2; + struct lu_object_conf conf = { .loc_flags = LOC_F_NEW }; + int rc; + + ENTRY; + + rc = mdo_lookup(env, parent, lname, fid2, spec); + if (rc == 0) + return -EEXIST; + else if (rc != -ENOENT) + return rc; + + ec_child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, + fid, &conf); + if (IS_ERR(ec_child)) { + CERROR("Can not find the child "DFID": rc = %ld\n", PFID(fid), + PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (!child) { + CERROR("Can not locate the child "DFID"\n", PFID(fid)); + GOTO(out_put, rc = -EINVAL); + } + + CDEBUG(D_RPCTRACE, "Start creating object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); + + /* + * Do not perform lookup sanity check. We know that name does not exist. 
+ */ + spec->sp_cr_lookup = 0; + rc = mdo_create(env, parent, lname, lu2md(child), spec, ma); + if (rc) { + CERROR("Can not create child "DFID": rc = %d\n", PFID(fid), rc); + GOTO(out_put, rc); + } + CDEBUG(D_RPCTRACE, "End creating object "DFID" %s %p rc = %d\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent, rc); + EXIT; +out_put: + lu_object_put(env, ec_child); + return rc; +} + +static int echo_set_lmm_size(const struct lu_env *env, struct lu_device *ld, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + + if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) { + ma->ma_lmm = (void *)&info->eti_lmm; + ma->ma_lmm_size = sizeof(info->eti_lmm); + } else { + LASSERT(info->eti_big_lmmsize); + ma->ma_lmm = info->eti_big_lmm; + ma->ma_lmm_size = info->eti_big_lmmsize; + } + + return 0; +} + +static int +echo_md_dir_stripe_choose(const struct lu_env *env, struct echo_device *ed, + struct lu_object *obj, const char *name, + unsigned int namelen, __u64 id, + struct lu_object **new_parent) +{ + struct echo_thread_info *info = echo_env_info(env); + struct md_attr *ma = &info->eti_ma; + struct lmv_mds_md_v1 *lmv; + struct lu_device *ld = ed->ed_next; + unsigned int idx; + struct lu_name tmp_ln_name; + struct lu_fid stripe_fid; + struct lu_object *stripe_obj; + int rc; + + LASSERT(obj != NULL); + LASSERT(S_ISDIR(obj->lo_header->loh_attr)); + + memset(ma, 0, sizeof(*ma)); + echo_set_lmm_size(env, ld, ma); + ma->ma_need = MA_LMV; + rc = echo_attr_get_complex(env, lu2md(obj), ma); + if (rc) { + CERROR("Can not getattr child "DFID": rc = %d\n", + PFID(lu_object_fid(obj)), rc); + return rc; + } + + if (!(ma->ma_valid & MA_LMV)) { + *new_parent = obj; + return 0; + } + + lmv = (struct lmv_mds_md_v1 *)ma->ma_lmm; + if (!lmv_is_sane(lmv)) { + rc = -EINVAL; + CERROR("Invalid mds md magic %x "DFID": rc = %d\n", + le32_to_cpu(lmv->lmv_magic), PFID(lu_object_fid(obj)), + rc); + return rc; + } + + if (name) { + tmp_ln_name.ln_name = name; + tmp_ln_name.ln_namelen = namelen; + } else { + LASSERT(id != -1); + echo_md_build_name(&tmp_ln_name, info->eti_name, id); + } + + idx = lmv_name_to_stripe_index(lmv, tmp_ln_name.ln_name, + tmp_ln_name.ln_namelen); + + LASSERT(idx < le32_to_cpu(lmv->lmv_stripe_count)); + fid_le_to_cpu(&stripe_fid, &lmv->lmv_stripe_fids[idx]); + + stripe_obj = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, &stripe_fid, + NULL); + if (IS_ERR(stripe_obj)) { + rc = PTR_ERR(stripe_obj); + CERROR("Can not find the parent "DFID": rc = %d\n", + PFID(&stripe_fid), rc); + return rc; + } + + *new_parent = lu_object_locate(stripe_obj->lo_header, ld->ld_type); + if (!*new_parent) { + lu_object_put(env, stripe_obj); + RETURN(-ENXIO); + } + + return rc; +} + +static int echo_create_md_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + struct lu_fid *fid, + char *name, int namelen, + __u64 id, __u32 mode, int count, + int stripe_count, int stripe_offset) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + struct md_op_spec *spec = &info->eti_spec; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + ENTRY; + + if (!ec_parent) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (!parent) + RETURN(-ENXIO); + + rc = echo_md_dir_stripe_choose(env, ed, parent, name, namelen, + id, &new_parent); + if (rc != 0) + RETURN(rc); + + LASSERT(new_parent 
!= NULL); + memset(ma, 0, sizeof(*ma)); + memset(spec, 0, sizeof(*spec)); + echo_set_lmm_size(env, ld, ma); + if (stripe_count != 0) { + spec->sp_cr_flags |= MDS_FMODE_WRITE; + if (stripe_count != -1) { + if (S_ISDIR(mode)) { + struct lmv_user_md *lmu; + + lmu = (struct lmv_user_md *)&info->eti_lum; + lmu->lum_magic = LMV_USER_MAGIC; + lmu->lum_stripe_offset = stripe_offset; + lmu->lum_stripe_count = stripe_count; + lmu->lum_hash_type = LMV_HASH_TYPE_FNV_1A_64; + spec->u.sp_ea.eadata = lmu; + spec->u.sp_ea.eadatalen = sizeof(*lmu); + } else { + struct lov_user_md_v3 *lum = &info->eti_lum; + + lum->lmm_magic = LOV_USER_MAGIC_V3; + lum->lmm_stripe_count = stripe_count; + lum->lmm_stripe_offset = stripe_offset; + lum->lmm_pattern = LOV_PATTERN_NONE; + spec->u.sp_ea.eadata = lum; + spec->u.sp_ea.eadatalen = sizeof(*lum); + } + spec->sp_cr_flags |= MDS_OPEN_HAS_EA; + } + } + + ma->ma_attr.la_mode = mode; + ma->ma_attr.la_valid = LA_CTIME | LA_MODE; + ma->ma_attr.la_ctime = ktime_get_real_seconds(); + + if (name) { + lname->ln_name = name; + lname->ln_namelen = namelen; + /* If name is specified, only create one object by name */ + rc = echo_md_create_internal(env, ed, lu2md(new_parent), fid, + lname, spec, ma); + GOTO(out_put, rc); + } + + /* Create multiple object sequenced by id */ + for (i = 0; i < count; i++) { + char *tmp_name = info->eti_name; + + echo_md_build_name(lname, tmp_name, id); + + rc = echo_md_create_internal(env, ed, lu2md(new_parent), + fid, lname, spec, ma); + if (rc) { + CERROR("Can not create child %s: rc = %d\n", tmp_name, + rc); + break; + } + id++; + fid->f_oid++; + } + +out_put: + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static struct lu_object *echo_md_lookup(const struct lu_env *env, + struct echo_device *ed, + struct md_object *parent, + struct lu_name *lname) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_fid *fid = &info->eti_fid; + struct lu_object *child; + int rc; + + ENTRY; + CDEBUG(D_INFO, "lookup %s in parent "DFID" %p\n", lname->ln_name, + PFID(fid), parent); + + rc = mdo_lookup(env, parent, lname, fid, NULL); + if (rc) { + CERROR("lookup %s: rc = %d\n", lname->ln_name, rc); + RETURN(ERR_PTR(rc)); + } + + /* + * In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() + */ + /* coverity[overrun-buffer-val] */ + child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL); + + RETURN(child); +} + +static int echo_setattr_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct lu_device *ld = ed->ed_next; + struct lu_buf *buf = &info->eti_buf; + int rc = 0; + int i; + + ENTRY; + + if (!ec_parent) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (!parent) + RETURN(-ENXIO); + + rc = echo_md_dir_stripe_choose(env, ed, parent, NULL, 0, id, + &new_parent); + if (rc != 0) + RETURN(rc); + + for (i = 0; i < count; i++) { + struct lu_object *ec_child, *child; + + echo_md_build_name(lname, name, id); + + ec_child = echo_md_lookup(env, ed, lu2md(new_parent), lname); + if (IS_ERR(ec_child)) { + rc = PTR_ERR(ec_child); + CERROR("Can't find child %s: rc = %d\n", + lname->ln_name, rc); + break; + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (!child) { + CERROR("Can not locate the child 
%s\n", lname->ln_name); + lu_object_put(env, ec_child); + rc = -EINVAL; + break; + } + + CDEBUG(D_RPCTRACE, "Start setattr object "DFID"\n", + PFID(lu_object_fid(child))); + + buf->lb_buf = info->eti_xattr_buf; + buf->lb_len = sizeof(info->eti_xattr_buf); + + sprintf(name, "%s.test1", XATTR_USER_PREFIX); + rc = mo_xattr_set(env, lu2md(child), buf, name, + LU_XATTR_CREATE); + if (rc < 0) { + CERROR("Can not setattr child "DFID": rc = %d\n", + PFID(lu_object_fid(child)), rc); + lu_object_put(env, ec_child); + break; + } + CDEBUG(D_RPCTRACE, "End setattr object "DFID"\n", + PFID(lu_object_fid(child))); + id++; + lu_object_put(env, ec_child); + } + + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static int echo_getattr_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + ENTRY; + + if (!ec_parent) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (!parent) + RETURN(-ENXIO); + + rc = echo_md_dir_stripe_choose(env, ed, parent, NULL, 0, id, + &new_parent); + if (rc != 0) + RETURN(rc); + + memset(ma, 0, sizeof(*ma)); + ma->ma_need |= MA_INODE | MA_LOV | MA_PFID | MA_HSM | MA_ACL_DEF; + ma->ma_acl = info->eti_xattr_buf; + ma->ma_acl_size = sizeof(info->eti_xattr_buf); + + for (i = 0; i < count; i++) { + struct lu_object *ec_child, *child; + + ma->ma_valid = 0; + echo_md_build_name(lname, name, id); + echo_set_lmm_size(env, ld, ma); + + ec_child = echo_md_lookup(env, ed, lu2md(new_parent), lname); + if (IS_ERR(ec_child)) { + CERROR("Can't find child %s: rc = %ld\n", + lname->ln_name, PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (!child) { + CERROR("Can not locate the child %s\n", lname->ln_name); + lu_object_put(env, ec_child); + RETURN(-EINVAL); + } + + CDEBUG(D_RPCTRACE, "Start getattr object "DFID"\n", + PFID(lu_object_fid(child))); + rc = echo_attr_get_complex(env, lu2md(child), ma); + if (rc) { + CERROR("Can not getattr child "DFID": rc = %d\n", + PFID(lu_object_fid(child)), rc); + lu_object_put(env, ec_child); + break; + } + CDEBUG(D_RPCTRACE, "End getattr object "DFID"\n", + PFID(lu_object_fid(child))); + id++; + lu_object_put(env, ec_child); + } + + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static int echo_lookup_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct lu_fid *fid = &info->eti_fid; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + if (!ec_parent) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (!parent) + return -ENXIO; + + rc = echo_md_dir_stripe_choose(env, ed, parent, NULL, 0, id, + &new_parent); + if (rc != 0) + RETURN(rc); + + /*prepare the requests*/ + for (i = 0; i < count; i++) { + echo_md_build_name(lname, name, id); + + CDEBUG(D_RPCTRACE, "Start lookup object "DFID" %s %p\n", + PFID(lu_object_fid(new_parent)), lname->ln_name, + 
new_parent); + + rc = mdo_lookup(env, lu2md(new_parent), lname, fid, NULL); + if (rc) { + CERROR("Can not lookup child %s: rc = %d\n", name, rc); + break; + } + + CDEBUG(D_RPCTRACE, "End lookup object "DFID" %s %p\n", + PFID(lu_object_fid(new_parent)), lname->ln_name, + new_parent); + + id++; + } + + if (new_parent != parent) + lu_object_put(env, new_parent); + + return rc; +} + +static int echo_md_destroy_internal(const struct lu_env *env, + struct echo_device *ed, + struct md_object *parent, + struct lu_name *lname, + struct md_attr *ma) +{ + struct lu_device *ld = ed->ed_next; + struct lu_object *ec_child; + struct lu_object *child; + int rc; + + ENTRY; + + ec_child = echo_md_lookup(env, ed, parent, lname); + if (IS_ERR(ec_child)) { + CERROR("Can't find child %s: rc = %ld\n", lname->ln_name, + PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (!child) { + CERROR("Can not locate the child %s\n", lname->ln_name); + GOTO(out_put, rc = -EINVAL); + } + + if (lu_object_remote(child)) { + CERROR("Can not destroy remote object %s: rc = %d\n", + lname->ln_name, -EPERM); + GOTO(out_put, rc = -EPERM); + } + CDEBUG(D_RPCTRACE, "Start destroy object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); + + rc = mdo_unlink(env, parent, lu2md(child), lname, ma, 0); + if (rc) { + CERROR("Can not unlink child %s: rc = %d\n", + lname->ln_name, rc); + GOTO(out_put, rc); + } + CDEBUG(D_RPCTRACE, "End destroy object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); +out_put: + lu_object_put(env, ec_child); + return rc; +} + +static int echo_destroy_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + char *name, int namelen, + __u64 id, __u32 mode, + int count) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + struct lu_object *parent; + struct lu_object *new_parent; + int rc = 0; + int i; + + ENTRY; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (!parent) + RETURN(-EINVAL); + + rc = echo_md_dir_stripe_choose(env, ed, parent, name, namelen, + id, &new_parent); + if (rc != 0) + RETURN(rc); + + memset(ma, 0, sizeof(*ma)); + ma->ma_attr.la_mode = mode; + ma->ma_attr.la_valid = LA_CTIME; + ma->ma_attr.la_ctime = ktime_get_real_seconds(); + ma->ma_need = MA_INODE; + ma->ma_valid = 0; + + if (name) { + lname->ln_name = name; + lname->ln_namelen = namelen; + rc = echo_md_destroy_internal(env, ed, lu2md(new_parent), lname, + ma); + GOTO(out_put, rc); + } + + /*prepare the requests*/ + for (i = 0; i < count; i++) { + char *tmp_name = info->eti_name; + + ma->ma_valid = 0; + echo_md_build_name(lname, tmp_name, id); + + rc = echo_md_destroy_internal(env, ed, lu2md(new_parent), lname, + ma); + if (rc) { + CERROR("Can not unlink child %s: rc = %d\n", name, rc); + break; + } + id++; + } + +out_put: + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static struct lu_object *echo_resolve_path(const struct lu_env *env, + struct echo_device *ed, char *path, + int path_len) +{ + struct lu_device *ld = ed->ed_next; + struct echo_thread_info *info = echo_env_info(env); + struct lu_fid *fid = &info->eti_fid; + struct lu_name *lname = &info->eti_lname; + struct lu_object *parent = NULL; + struct lu_object *child = NULL; + int rc = 0; + + ENTRY; + *fid = ed->ed_root_fid; + + /* + * In 
the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() + */ + /* coverity[overrun-buffer-val] */ + parent = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL); + if (IS_ERR(parent)) { + CERROR("Can not find the parent "DFID": rc = %ld\n", + PFID(fid), PTR_ERR(parent)); + RETURN(parent); + } + + while (1) { + struct lu_object *ld_parent; + char *e; + + e = strsep(&path, "/"); + if (!e) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + + lname->ln_name = e; + lname->ln_namelen = strlen(e); + + ld_parent = lu_object_locate(parent->lo_header, ld->ld_type); + if (!ld_parent) { + lu_object_put(env, parent); + rc = -EINVAL; + break; + } + + child = echo_md_lookup(env, ed, lu2md(ld_parent), lname); + lu_object_put(env, parent); + if (IS_ERR(child)) { + rc = (int)PTR_ERR(child); + CERROR("lookup %s under parent "DFID": rc = %d\n", + lname->ln_name, PFID(lu_object_fid(ld_parent)), + rc); + break; + } + parent = child; + } + if (rc) + RETURN(ERR_PTR(rc)); + + RETURN(parent); +} + +static void echo_ucred_init(struct lu_env *env) +{ + struct lu_ucred *ucred = lu_ucred(env); + kernel_cap_t kcap = current_cap(); + + ucred->uc_valid = UCRED_INVALID; + + ucred->uc_suppgids[0] = -1; + ucred->uc_suppgids[1] = -1; + + ucred->uc_uid = ucred->uc_o_uid = + from_kuid(&init_user_ns, current_uid()); + ucred->uc_gid = ucred->uc_o_gid = + from_kgid(&init_user_ns, current_gid()); + ucred->uc_fsuid = ucred->uc_o_fsuid = + from_kuid(&init_user_ns, current_fsuid()); + ucred->uc_fsgid = ucred->uc_o_fsgid = + from_kgid(&init_user_ns, current_fsgid()); + ucred->uc_cap = current_cap(); + + /* remove fs privilege for non-root user. */ + if (ucred->uc_fsuid) { + kcap = cap_drop_nfsd_set(kcap); + kcap = cap_drop_fs_set(kcap); + } + ucred->uc_cap = kcap; + ucred->uc_valid = UCRED_NEW; +} + +static void echo_ucred_fini(struct lu_env *env) +{ + struct lu_ucred *ucred = lu_ucred(env); + + ucred->uc_valid = UCRED_INIT; +} + +static int echo_md_handler(struct echo_device *ed, int command, + char *path, int path_len, __u64 id, int count, + struct obd_ioctl_data *data) +{ + struct echo_thread_info *info; + struct lu_device *ld = ed->ed_next; + struct lu_env *env; + __u16 refcheck; + struct lu_object *parent; + char *name = NULL; + int namelen = data->ioc_plen2; + int rc = 0; + + ENTRY; + if (!ld) { + CERROR("MD echo client is not being initialized properly\n"); + RETURN(-EINVAL); + } + + if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) { + CERROR("Only support MDD layer right now!\n"); + RETURN(-EINVAL); + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = lu_env_refill_by_tags(env, ECHO_MD_CTX_TAG, ECHO_SES_TAG); + if (rc != 0) + GOTO(out_env, rc); + + /* init big_lmm buffer */ + info = echo_env_info(env); + LASSERT(info->eti_big_lmm == NULL); + OBD_ALLOC_LARGE(info->eti_big_lmm, MIN_MD_SIZE); + if (!info->eti_big_lmm) + GOTO(out_env, rc = -ENOMEM); + info->eti_big_lmmsize = MIN_MD_SIZE; + + parent = echo_resolve_path(env, ed, path, path_len); + if (IS_ERR(parent)) { + CERROR("Can not resolve the path %s: rc = %ld\n", path, + PTR_ERR(parent)); + GOTO(out_free, rc = PTR_ERR(parent)); + } + + if (namelen > 0) { + OBD_ALLOC(name, namelen + 1); + if (!name) + GOTO(out_put, rc = -ENOMEM); + if (copy_from_user(name, data->ioc_pbuf2, namelen)) + GOTO(out_name, rc = -EFAULT); + } + + echo_ucred_init(env); + + switch (command) { + case ECHO_MD_CREATE: + case ECHO_MD_MKDIR: { + struct echo_thread_info *info = echo_env_info(env); + __u32 mode = 
data->ioc_obdo2.o_mode; + struct lu_fid *fid = &info->eti_fid; + int stripe_count = (int)data->ioc_obdo2.o_misc; + int stripe_index = (int)data->ioc_obdo2.o_stripe_idx; + + rc = ostid_to_fid(fid, &data->ioc_obdo1.o_oi, 0); + if (rc != 0) + break; + + /* + * In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() + */ + /* coverity[overrun-buffer-val] */ + rc = echo_create_md_object(env, ed, parent, fid, name, namelen, + id, mode, count, stripe_count, + stripe_index); + break; + } + case ECHO_MD_DESTROY: + case ECHO_MD_RMDIR: { + __u32 mode = data->ioc_obdo2.o_mode; + + rc = echo_destroy_object(env, ed, parent, name, namelen, + id, mode, count); + break; + } + case ECHO_MD_LOOKUP: + rc = echo_lookup_object(env, ed, parent, id, count); + break; + case ECHO_MD_GETATTR: + rc = echo_getattr_object(env, ed, parent, id, count); + break; + case ECHO_MD_SETATTR: + rc = echo_setattr_object(env, ed, parent, id, count); + break; + default: + CERROR("unknown command %d\n", command); + rc = -EINVAL; + break; + } + echo_ucred_fini(env); + +out_name: + if (name) + OBD_FREE(name, namelen + 1); +out_put: + lu_object_put(env, parent); +out_free: + LASSERT(info->eti_big_lmm); + OBD_FREE_LARGE(info->eti_big_lmm, info->eti_big_lmmsize); + info->eti_big_lmm = NULL; + info->eti_big_lmmsize = 0; +out_env: + cl_env_put(env, &refcheck); + return rc; +} +#endif /* HAVE_SERVER_SUPPORT */ + +static int echo_create_object(const struct lu_env *env, struct echo_device *ed, + struct obdo *oa) +{ + struct echo_object *eco; + struct echo_client_obd *ec = ed->ed_ec; + int created = 0; + int rc; + + ENTRY; + if (!(oa->o_valid & OBD_MD_FLID) || + !(oa->o_valid & OBD_MD_FLGROUP) || + !fid_seq_is_echo(ostid_seq(&oa->o_oi))) { + CERROR("invalid oid "DOSTID"\n", POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + if (ostid_id(&oa->o_oi) == 0) { + rc = ostid_set_id(&oa->o_oi, ++last_object_id); + if (rc) + GOTO(failed, rc); + } + + rc = obd_create(env, ec->ec_exp, oa); + if (rc != 0) { + CERROR("Cannot create objects: rc = %d\n", rc); + GOTO(failed, rc); + } + + created = 1; + + oa->o_valid |= OBD_MD_FLID; + + eco = cl_echo_object_find(ed, &oa->o_oi); + if (IS_ERR(eco)) + GOTO(failed, rc = PTR_ERR(eco)); + cl_echo_object_put(eco); + + CDEBUG(D_INFO, "oa oid "DOSTID"\n", POSTID(&oa->o_oi)); + EXIT; + +failed: + if (created && rc != 0) + obd_destroy(env, ec->ec_exp, oa); + + if (rc != 0) + CERROR("create object failed with: rc = %d\n", rc); + + return rc; +} + +static int echo_get_object(struct echo_object **ecop, struct echo_device *ed, + struct obdo *oa) +{ + struct echo_object *eco; + int rc; + + ENTRY; + if (!(oa->o_valid & OBD_MD_FLID) || + !(oa->o_valid & OBD_MD_FLGROUP) || + ostid_id(&oa->o_oi) == 0) { + CERROR("invalid oid "DOSTID"\n", POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + rc = 0; + eco = cl_echo_object_find(ed, &oa->o_oi); + if (!IS_ERR(eco)) + *ecop = eco; + else + rc = PTR_ERR(eco); + + RETURN(rc); +} + +static void echo_put_object(struct echo_object *eco) +{ + int rc; + + rc = cl_echo_object_put(eco); + if (rc) + CERROR("%s: echo client drop an object failed: rc = %d\n", + eco->eo_dev->ed_ec->ec_exp->exp_obd->obd_name, rc); +} + +static void echo_client_page_debug_setup(struct page *page, int rw, u64 id, + u64 offset, u64 count) +{ + char *addr; + u64 stripe_off; + u64 stripe_id; + int delta; + + /* no partial pages on the client */ + LASSERT(count == PAGE_SIZE); + + addr = kmap(page); + + for (delta = 0; delta < PAGE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { + if (rw == OBD_BRW_WRITE) { + stripe_off = offset 
+ delta; + stripe_id = id; + } else { + stripe_off = 0xdeadbeef00c0ffeeULL; + stripe_id = 0xdeadbeef00c0ffeeULL; + } + block_debug_setup(addr + delta, OBD_ECHO_BLOCK_SIZE, + stripe_off, stripe_id); + } + + kunmap(page); +} + +static int +echo_client_page_debug_check(struct page *page, u64 id, u64 offset, u64 count) +{ + u64 stripe_off; + u64 stripe_id; + char *addr; + int delta; + int rc; + int rc2; + + /* no partial pages on the client */ + LASSERT(count == PAGE_SIZE); + + addr = kmap(page); + + for (rc = delta = 0; delta < PAGE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { + stripe_off = offset + delta; + stripe_id = id; + + rc2 = block_debug_check("test_brw", + addr + delta, OBD_ECHO_BLOCK_SIZE, + stripe_off, stripe_id); + if (rc2 != 0) { + CERROR("Error in echo object %#llx\n", id); + rc = rc2; + } + } + + kunmap(page); + return rc; +} + +static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa, + struct echo_object *eco, u64 offset, + u64 count, int async) +{ + size_t npages; + struct brw_page *pga; + struct brw_page *pgp; + struct page **pages; + u64 off; + size_t i; + int rc; + int verify; + gfp_t gfp_mask; + u32 brw_flags = 0; + + ENTRY; + verify = (ostid_id(&oa->o_oi) != ECHO_PERSISTENT_OBJID && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + + gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? GFP_KERNEL : GFP_HIGHUSER; + + LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); + + if ((count & (~PAGE_MASK)) != 0) + RETURN(-EINVAL); + + /* XXX think again with misaligned I/O */ + npages = count >> PAGE_SHIFT; + + if (rw == OBD_BRW_WRITE) + brw_flags = OBD_BRW_ASYNC; + + OBD_ALLOC_PTR_ARRAY_LARGE(pga, npages); + if (!pga) + RETURN(-ENOMEM); + + OBD_ALLOC_PTR_ARRAY_LARGE(pages, npages); + if (!pages) { + OBD_FREE_PTR_ARRAY_LARGE(pga, npages); + RETURN(-ENOMEM); + } + + for (i = 0, pgp = pga, off = offset; + i < npages; + i++, pgp++, off += PAGE_SIZE) { + + LASSERT(pgp->pg == NULL); /* for cleanup */ + + rc = -ENOMEM; + pgp->pg = alloc_page(gfp_mask); + if (!pgp->pg) + goto out; + + /* set mapping so page is not considered encrypted */ + pgp->pg->mapping = ECHO_MAPPING_UNENCRYPTED; + pages[i] = pgp->pg; + pgp->count = PAGE_SIZE; + pgp->off = off; + pgp->flag = brw_flags; + + if (verify) + echo_client_page_debug_setup(pgp->pg, rw, + ostid_id(&oa->o_oi), off, + pgp->count); + } + + /* brw mode can only be used at client */ + LASSERT(ed->ed_next != NULL); + rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async); + + out: + if (rc != 0 || rw != OBD_BRW_READ) + verify = 0; + + for (i = 0, pgp = pga; i < npages; i++, pgp++) { + if (!pgp->pg) + continue; + + if (verify) { + int vrc; + + vrc = echo_client_page_debug_check(pgp->pg, + ostid_id(&oa->o_oi), + pgp->off, + pgp->count); + if (vrc != 0 && rc == 0) + rc = vrc; + } + __free_page(pgp->pg); + } + OBD_FREE_PTR_ARRAY_LARGE(pga, npages); + OBD_FREE_PTR_ARRAY_LARGE(pages, npages); + RETURN(rc); +} + +static int echo_client_prep_commit(const struct lu_env *env, + struct obd_export *exp, int rw, + struct obdo *oa, struct echo_object *eco, + u64 offset, u64 count, + u64 batch, int async) +{ + struct obd_ioobj ioo; + struct niobuf_local *lnb; + struct niobuf_remote rnb; + u64 off; + u64 npages, tot_pages, apc; + int i, ret = 0, brw_flags = 0; + + ENTRY; + if (count <= 0 || (count & ~PAGE_MASK) != 0) + RETURN(-EINVAL); + + apc = npages = batch >> PAGE_SHIFT; + tot_pages = count >> PAGE_SHIFT; + + OBD_ALLOC_PTR_ARRAY_LARGE(lnb, apc); + if (!lnb) + RETURN(-ENOMEM); + + if (rw == OBD_BRW_WRITE && 
async) + brw_flags |= OBD_BRW_ASYNC; + + obdo_to_ioobj(oa, &ioo); + + off = offset; + + for (; tot_pages > 0; tot_pages -= npages) { + int lpages; + + if (tot_pages < npages) + npages = tot_pages; + + rnb.rnb_offset = off; + rnb.rnb_len = npages * PAGE_SIZE; + rnb.rnb_flags = brw_flags; + ioo.ioo_bufcnt = 1; + off += npages * PAGE_SIZE; + + lpages = npages; + ret = obd_preprw(env, rw, exp, oa, 1, &ioo, &rnb, &lpages, lnb); + if (ret != 0) + GOTO(out, ret); + + for (i = 0; i < lpages; i++) { + struct page *page = lnb[i].lnb_page; + + /* read past eof? */ + if (!page && lnb[i].lnb_rc == 0) + continue; + + if (async) + lnb[i].lnb_flags |= OBD_BRW_ASYNC; + + if (ostid_id(&oa->o_oi) == ECHO_PERSISTENT_OBJID || + (oa->o_valid & OBD_MD_FLFLAGS) == 0 || + (oa->o_flags & OBD_FL_DEBUG_CHECK) == 0) + continue; + + if (rw == OBD_BRW_WRITE) + echo_client_page_debug_setup(page, rw, + ostid_id(&oa->o_oi), + lnb[i].lnb_file_offset, + lnb[i].lnb_len); + else + echo_client_page_debug_check(page, + ostid_id(&oa->o_oi), + lnb[i].lnb_file_offset, + lnb[i].lnb_len); + } + + ret = obd_commitrw(env, rw, exp, oa, 1, &ioo, &rnb, npages, lnb, + ret, rnb.rnb_len, ktime_set(0, 0)); + if (ret != 0) + break; + + /* Reuse env context. */ + lu_context_exit((struct lu_context *)&env->le_ctx); + lu_context_enter((struct lu_context *)&env->le_ctx); + } + +out: + OBD_FREE_PTR_ARRAY_LARGE(lnb, apc); + + RETURN(ret); +} + +static int echo_client_brw_ioctl(const struct lu_env *env, int rw, + struct obd_export *exp, + struct obd_ioctl_data *data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct obdo *oa = &data->ioc_obdo1; + struct echo_object *eco; + int rc; + int async = 0; + long test_mode; + + ENTRY; + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + rc = echo_get_object(&eco, ed, oa); + if (rc) + RETURN(rc); + + oa->o_valid &= ~OBD_MD_FLHANDLE; + + /* OFD/obdfilter works only via prep/commit */ + test_mode = (long)data->ioc_pbuf1; + if (!ed->ed_next && test_mode != 3) { + test_mode = 3; + data->ioc_plen1 = data->ioc_count; + } + + if (test_mode == 3) + async = 1; + + /* Truncate batch size to maximum */ + if (data->ioc_plen1 > PTLRPC_MAX_BRW_SIZE) + data->ioc_plen1 = PTLRPC_MAX_BRW_SIZE; + + switch (test_mode) { + case 1: + fallthrough; + case 2: + rc = echo_client_kbrw(ed, rw, oa, eco, data->ioc_offset, + data->ioc_count, async); + break; + case 3: + rc = echo_client_prep_commit(env, ec->ec_exp, rw, oa, eco, + data->ioc_offset, data->ioc_count, + data->ioc_plen1, async); + break; + default: + rc = -EINVAL; + } + + echo_put_object(eco); + + RETURN(rc); +} + +static int +echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ +#ifdef HAVE_SERVER_SUPPORT + struct tgt_session_info *tsi; +#endif + struct obd_device *obd = exp->exp_obd; + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; + struct obd_ioctl_data *data = karg; + struct lu_env *env; + unsigned long env_tags = 0; + __u16 refcheck; + struct obdo *oa; + struct lu_fid fid; + int rw = OBD_BRW_READ; + int rc = 0; + + ENTRY; + oa = &data->ioc_obdo1; + if (!(oa->o_valid & OBD_MD_FLGROUP)) { + oa->o_valid |= OBD_MD_FLGROUP; + ostid_set_seq_echo(&oa->o_oi); + } + + /* This FID is unpacked just for validation at this point */ + rc = ostid_to_fid(&fid, &oa->o_oi, 0); + if (rc < 0) + RETURN(rc); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + 
lu_env_add(env); + +#ifdef HAVE_SERVER_SUPPORT + if (cmd == OBD_IOC_ECHO_MD || cmd == OBD_IOC_ECHO_ALLOC_SEQ) + env_tags = ECHO_MD_CTX_TAG; + else +#endif + env_tags = ECHO_DT_CTX_TAG; + + rc = lu_env_refill_by_tags(env, env_tags, ECHO_SES_TAG); + if (rc != 0) + GOTO(out, rc); + +#ifdef HAVE_SERVER_SUPPORT + tsi = tgt_ses_info(env); + /* treat as local operation */ + tsi->tsi_exp = NULL; + tsi->tsi_jobid = NULL; +#endif + + switch (cmd) { + case OBD_IOC_CREATE: /* may create echo object */ + if (!capable(CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + rc = echo_create_object(env, ed, oa); + GOTO(out, rc); + +#ifdef HAVE_SERVER_SUPPORT + case OBD_IOC_ECHO_MD: { + int count; + int cmd; + char *dir = NULL; + int dirlen; + __u64 id; + + if (!capable(CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + count = data->ioc_count; + cmd = data->ioc_command; + + id = data->ioc_obdo2.o_oi.oi.oi_id; + dirlen = data->ioc_plen1; + OBD_ALLOC(dir, dirlen + 1); + if (!dir) + GOTO(out, rc = -ENOMEM); + + if (copy_from_user(dir, data->ioc_pbuf1, dirlen)) { + OBD_FREE(dir, data->ioc_plen1 + 1); + GOTO(out, rc = -EFAULT); + } + + rc = echo_md_handler(ed, cmd, dir, dirlen, id, count, data); + OBD_FREE(dir, dirlen + 1); + GOTO(out, rc); + } + case OBD_IOC_ECHO_ALLOC_SEQ: { + __u64 seq; + int max_count; + + if (!capable(CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + rc = seq_client_get_seq(env, ed->ed_cl_seq, &seq); + if (rc < 0) { + CERROR("%s: Can not alloc seq: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + + if (copy_to_user(data->ioc_pbuf1, &seq, data->ioc_plen1)) + return -EFAULT; + + max_count = LUSTRE_METADATA_SEQ_MAX_WIDTH; + if (copy_to_user(data->ioc_pbuf2, &max_count, + data->ioc_plen2)) + return -EFAULT; + GOTO(out, rc); + } +#endif /* HAVE_SERVER_SUPPORT */ + case OBD_IOC_DESTROY: + if (!capable(CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_destroy(env, ec->ec_exp, oa); + if (rc == 0) + eco->eo_deleted = 1; + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_GETATTR: + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_getattr(env, ec->ec_exp, oa); + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_SETATTR: + if (!capable(CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_setattr(env, ec->ec_exp, oa); + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_BRW_WRITE: + if (!capable(CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + rw = OBD_BRW_WRITE; + fallthrough; + case OBD_IOC_BRW_READ: + rc = echo_client_brw_ioctl(env, rw, exp, data); + GOTO(out, rc); + + default: + CERROR("echo_ioctl(): unrecognised ioctl %#x\n", cmd); + GOTO(out, rc = -ENOTTY); + } + + EXIT; +out: + lu_env_remove(env); + cl_env_put(env, &refcheck); + + return rc; +} + +static int echo_client_setup(const struct lu_env *env, + struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct echo_client_obd *ec = &obd->u.echo_client; + struct obd_device *tgt; + struct obd_uuid echo_uuid = { "ECHO_UUID" }; + struct obd_connect_data *ocd = NULL; + int rc; + + ENTRY; + if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } + + spin_lock_init(&ec->ec_lock); + INIT_LIST_HEAD(&ec->ec_objects); + 
INIT_LIST_HEAD(&ec->ec_locks); + ec->ec_unique = 0; + + lu_context_tags_update(ECHO_DT_CTX_TAG); + lu_session_tags_update(ECHO_SES_TAG); + + if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) { +#ifdef HAVE_SERVER_SUPPORT + lu_context_tags_update(ECHO_MD_CTX_TAG); +#else + CERROR( + "Local operations are NOT supported on the client side. Only remote operations are supported. The metadata echo client must be run on the server side.\n"); +#endif + RETURN(0); + } + + OBD_ALLOC(ocd, sizeof(*ocd)); + if (!ocd) { + CERROR("Can't alloc ocd connecting to %s\n", + lustre_cfg_string(lcfg, 1)); + return -ENOMEM; + } + + ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL | + OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_GRANT | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_FID | OBD_CONNECT_FLAGS2; + ocd->ocd_connect_flags2 = OBD_CONNECT2_REP_MBITS; + + ocd->ocd_brw_size = DT_MAX_BRW_SIZE; + ocd->ocd_version = LUSTRE_VERSION_CODE; + ocd->ocd_group = FID_SEQ_ECHO; + + rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL); + if (rc == 0) { + /* Turn off pinger because it connects to tgt obd directly. */ + spin_lock(&tgt->obd_dev_lock); + list_del_init(&ec->ec_exp->exp_obd_chain_timed); + spin_unlock(&tgt->obd_dev_lock); + } + + OBD_FREE(ocd, sizeof(*ocd)); + + if (rc != 0) { + CERROR("failed to connect to device %s\n", + lustre_cfg_string(lcfg, 1)); + return rc; + } + + RETURN(rc); +} + +static int echo_client_cleanup(struct obd_device *obd) +{ + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = &obd->u.echo_client; + int rc; + + ENTRY; + /* Do nothing for Metadata echo client */ + if (!ed) + RETURN(0); + + lu_session_tags_clear(ECHO_SES_TAG & ~LCT_SESSION); + lu_context_tags_clear(ECHO_DT_CTX_TAG); + if (ed->ed_next_ismd) { +#ifdef HAVE_SERVER_SUPPORT + lu_context_tags_clear(ECHO_MD_CTX_TAG); +#else + CERROR( + "This is a client-side-only module; it does not support the metadata echo client.\n"); +#endif + RETURN(0); + } + + if (!list_empty(&obd->obd_exports)) { + CERROR("still has clients!\n"); + RETURN(-EBUSY); + } + + LASSERT(refcount_read(&ec->ec_exp->exp_handle.h_ref) > 0); + rc = obd_disconnect(ec->ec_exp); + if (rc != 0) + CERROR("failed to disconnect device: %d\n", rc); + + RETURN(rc); +} + +static int echo_client_connect(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *src, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + int rc; + struct lustre_handle conn = { 0 }; + + ENTRY; + rc = class_connect(&conn, src, cluuid); + if (rc == 0) + *exp = class_conn2export(&conn); + + RETURN(rc); +} + +static int echo_client_disconnect(struct obd_export *exp) +{ + int rc; + + ENTRY; + if (!exp) + GOTO(out, rc = -EINVAL); + + rc = class_disconnect(exp); + GOTO(out, rc); +out: + return rc; +} + +static const struct obd_ops echo_client_obd_ops = { + .o_owner = THIS_MODULE, + .o_iocontrol = echo_client_iocontrol, + .o_connect = echo_client_connect, + .o_disconnect = echo_client_disconnect +}; + +static int __init obdecho_init(void) +{ + int rc; + + ENTRY; + LCONSOLE_INFO("Echo OBD driver; http://www.lustre.org/\n"); + + LASSERT(PAGE_SIZE % OBD_ECHO_BLOCK_SIZE == 0); + +# ifdef HAVE_SERVER_SUPPORT + rc = echo_persistent_pages_init(); + if (rc != 0) + goto failed_0; + + rc = class_register_type(&echo_obd_ops, NULL, true, + LUSTRE_ECHO_NAME, &echo_srv_type); + if (rc != 0) + goto failed_1; +# endif + + rc = lu_kmem_init(echo_caches); + if (rc == 0) { + rc = class_register_type(&echo_client_obd_ops, NULL,
false, + LUSTRE_ECHO_CLIENT_NAME, + &echo_device_type); + if (rc) + lu_kmem_fini(echo_caches); + } + +# ifdef HAVE_SERVER_SUPPORT + if (rc == 0) + RETURN(0); + + class_unregister_type(LUSTRE_ECHO_NAME); +failed_1: + echo_persistent_pages_fini(); +failed_0: +# endif + RETURN(rc); +} + +static void __exit obdecho_exit(void) +{ + class_unregister_type(LUSTRE_ECHO_CLIENT_NAME); + lu_kmem_fini(echo_caches); + +#ifdef HAVE_SERVER_SUPPORT + class_unregister_type(LUSTRE_ECHO_NAME); + echo_persistent_pages_fini(); +#endif +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Echo Client test driver"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(obdecho_init); +module_exit(obdecho_exit); + +/** @} echo_client */ diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h b/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h new file mode 100644 index 0000000000000..158fc9745707c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h @@ -0,0 +1,59 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014 Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdecho/echo_internal.h + */ + +#ifndef _ECHO_INTERNAL_H +#define _ECHO_INTERNAL_H + +/* The persistent object (i.e. actually stores stuff!) 
*/ +#define ECHO_PERSISTENT_OBJID 1ULL +#define ECHO_PERSISTENT_SIZE ((__u64)(1<<20)) + +/* block size to use for data verification */ +#define OBD_ECHO_BLOCK_SIZE (4<<10) + +#ifdef HAVE_SERVER_SUPPORT +extern const struct obd_ops echo_obd_ops; +extern struct lu_device_type echo_srv_type; +int echo_persistent_pages_init(void); +void echo_persistent_pages_fini(void); +#endif /* HAVE_SERVER_SUPPORT */ + +/* mapping value to tell page is not encrypted */ +#define ECHO_MAPPING_UNENCRYPTED ((void *)1) + +/* debug.c */ +int block_debug_setup(void *addr, int len, u64 off, u64 id); +int block_debug_check(char *who, void *addr, int len, u64 off, u64 id); + +#endif diff --git a/drivers/staging/lustrefsx/lustre/osc/Makefile b/drivers/staging/lustrefsx/lustre/osc/Makefile new file mode 100644 index 0000000000000..223e42283bf92 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTREFSX_FS) += osc.o + +osc-y := osc_request.o lproc_osc.o osc_dev.o osc_object.o osc_page.o +osc-y += osc_lock.o osc_io.o osc_quota.o osc_cache.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c new file mode 100644 index 0000000000000..78a8c14e17298 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c @@ -0,0 +1,916 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include + +#include "osc_internal.h" + +static ssize_t active_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + int rc; + + with_imp_locked(obd, imp, rc) + rc = sprintf(buf, "%d\n", !imp->imp_deactive); + + return rc; +} + +static ssize_t active_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp, *imp0; + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + with_imp_locked(obd, imp0, rc) + imp = class_import_get(imp0); + if (rc) + return rc; + /* opposite senses */ + if (imp->imp_deactive == val) + rc = ptlrpc_set_import_active(imp, val); + else + CDEBUG(D_CONFIG, "activate %u: ignoring repeat request\n", + (unsigned int)val); + class_import_put(imp); + + return rc ?: count; +} +LUSTRE_RW_ATTR(active); + +static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + + return scnprintf(buf, PAGE_SIZE, "%u\n", cli->cl_max_rpcs_in_flight); +} + +static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + int adding, added, req_count; + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + if (val == 0 || val > OSC_MAX_RIF_MAX) + return -ERANGE; + + adding = (int)val - cli->cl_max_rpcs_in_flight; + req_count = atomic_read(&osc_pool_req_count); + if (adding > 0 && req_count < osc_reqpool_maxreqcount) { + /* + * There might be some race which will cause over-limit + * allocation, but it is fine. 
+ */ + if (req_count + adding > osc_reqpool_maxreqcount) + adding = osc_reqpool_maxreqcount - req_count; + + added = osc_rq_pool->prp_populate(osc_rq_pool, adding); + atomic_add(added, &osc_pool_req_count); + } + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_max_rpcs_in_flight = val; + client_adjust_max_dirty(cli); + spin_unlock(&cli->cl_loi_list_lock); + + return count; +} +LUSTRE_RW_ATTR(max_rpcs_in_flight); + +static ssize_t max_dirty_mb_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + + return scnprintf(buf, PAGE_SIZE, "%lu\n", + PAGES_TO_MiB(cli->cl_dirty_max_pages)); +} + +static ssize_t max_dirty_mb_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + unsigned long pages_number, max_dirty_mb; + int rc; + + rc = kstrtoul(buffer, 10, &max_dirty_mb); + if (rc) + return rc; + + pages_number = MiB_TO_PAGES(max_dirty_mb); + + if (pages_number >= MiB_TO_PAGES(OSC_MAX_DIRTY_MB_MAX) || + pages_number > cfs_totalram_pages() / 4) /* 1/4 of RAM */ + return -ERANGE; + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_dirty_max_pages = pages_number; + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + + return count; +} +LUSTRE_RW_ATTR(max_dirty_mb); + +LUSTRE_ATTR(ost_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); + +static int osc_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + int shift = 20 - PAGE_SHIFT; + + seq_printf(m, "used_mb: %ld\n" + "busy_cnt: %ld\n" + "reclaim: %llu\n", + (atomic_long_read(&cli->cl_lru_in_list) + + atomic_long_read(&cli->cl_lru_busy)) >> shift, + atomic_long_read(&cli->cl_lru_busy), + cli->cl_lru_reclaim); + + return 0; +} + +/* shrink the number of caching pages to a specific number */ +static ssize_t osc_cached_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + u64 pages_number; + const char *tmp; + long rc; + char kernbuf[128]; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + tmp = lprocfs_find_named_value(kernbuf, "used_mb:", &count); + rc = sysfs_memparse(tmp, count, &pages_number, "MiB"); + if (rc < 0) + return rc; + + pages_number >>= PAGE_SHIFT; + + rc = atomic_long_read(&cli->cl_lru_in_list) - pages_number; + if (rc > 0) { + struct lu_env *env; + __u16 refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + (void)osc_lru_shrink(env, cli, rc, true); + cl_env_put(env, &refcheck); + } + } + + return count; +} + +LPROC_SEQ_FOPS(osc_cached_mb); + +static ssize_t cur_dirty_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + + return scnprintf(buf, PAGE_SIZE, "%lu\n", + cli->cl_dirty_pages << PAGE_SHIFT); +} +LUSTRE_RO_ATTR(cur_dirty_bytes); + +static int osc_cur_grant_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + + 
seq_printf(m, "%lu\n", cli->cl_avail_grant); + return 0; +} + +static ssize_t osc_cur_grant_bytes_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp; + char kernbuf[22] = ""; + u64 val; + int rc; + + if (obd == NULL) + return 0; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + rc = sysfs_memparse(kernbuf, count, &val, "MiB"); + if (rc < 0) + return rc; + + /* this is only for shrinking grant */ + if (val >= cli->cl_avail_grant) + return 0; + + with_imp_locked(obd, imp, rc) + if (imp->imp_state == LUSTRE_IMP_FULL) + rc = osc_shrink_grant_to_target(cli, val); + + return rc ? rc : count; +} +LPROC_SEQ_FOPS(osc_cur_grant_bytes); + +static ssize_t cur_lost_grant_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + + return scnprintf(buf, PAGE_SIZE, "%lu\n", cli->cl_lost_grant); +} +LUSTRE_RO_ATTR(cur_lost_grant_bytes); + +static ssize_t cur_dirty_grant_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + + return scnprintf(buf, PAGE_SIZE, "%lu\n", cli->cl_dirty_grant); +} +LUSTRE_RO_ATTR(cur_dirty_grant_bytes); + +static ssize_t grant_shrink_interval_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%lld\n", obd->u.cli.cl_grant_shrink_interval); +} + +static ssize_t grant_shrink_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + if (val == 0) + return -ERANGE; + + obd->u.cli.cl_grant_shrink_interval = val; + osc_update_next_shrink(&obd->u.cli); + osc_schedule_grant_work(); + + return count; +} +LUSTRE_RW_ATTR(grant_shrink_interval); + +static ssize_t checksums_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", !!obd->u.cli.cl_checksum); +} + +static ssize_t checksums_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum = val; + + return count; +} +LUSTRE_RW_ATTR(checksums); + +DECLARE_CKSUM_NAME; + +static int osc_checksum_type_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + int i; + + if (obd == NULL) + return 0; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if ((BIT(i) & obd->u.cli.cl_supp_cksum_types) == 0) + continue; + if (obd->u.cli.cl_cksum_type == BIT(i)) + seq_printf(m, "[%s] ", cksum_name[i]); + else + seq_printf(m, "%s ", cksum_name[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +static ssize_t osc_checksum_type_seq_write(struct file *file, + const char 
__user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + char kernbuf[10]; + int rc = -EINVAL; + int i; + + if (obd == NULL) + return 0; + + if (count > sizeof(kernbuf) - 1) + return -EINVAL; + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + if (count > 0 && kernbuf[count - 1] == '\n') + kernbuf[count - 1] = '\0'; + else + kernbuf[count] = '\0'; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if (strcasecmp(kernbuf, cksum_name[i]) == 0) { + obd->u.cli.cl_preferred_cksum_type = BIT(i); + if (obd->u.cli.cl_supp_cksum_types & BIT(i)) { + obd->u.cli.cl_cksum_type = BIT(i); + rc = count; + } else { + rc = -ENOTSUPP; + } + break; + } + } + return rc; +} +LPROC_SEQ_FOPS(osc_checksum_type); + +static ssize_t resend_count_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u\n", atomic_read(&obd->u.cli.cl_resends)); +} + +static ssize_t resend_count_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + atomic_set(&obd->u.cli.cl_resends, val); + + return count; +} +LUSTRE_RW_ATTR(resend_count); + +static ssize_t checksum_dump_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", !!obd->u.cli.cl_checksum_dump); +} + +static ssize_t checksum_dump_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum_dump = val; + + return count; +} +LUSTRE_RW_ATTR(checksum_dump); + +static ssize_t destroys_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u\n", + atomic_read(&obd->u.cli.cl_destroy_in_flight)); +} +LUSTRE_RO_ATTR(destroys_in_flight); + +LPROC_SEQ_FOPS_RW_TYPE(osc, obd_max_pages_per_rpc); + +LUSTRE_RW_ATTR(short_io_bytes); + +#ifdef CONFIG_PROC_FS +static int osc_unstable_stats_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + long pages; + int mb; + + pages = atomic_long_read(&cli->cl_unstable_count); + mb = (pages * PAGE_SIZE) >> 20; + + seq_printf(m, "unstable_pages: %20ld\n" + "unstable_mb: %10d\n", + pages, mb); + return 0; +} +LPROC_SEQ_FOPS_RO(osc_unstable_stats); + +static ssize_t idle_timeout_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + int ret; + + with_imp_locked(obd, imp, ret) + ret = sprintf(buf, "%u\n", imp->imp_idle_timeout); + + return ret; +} + +static ssize_t idle_timeout_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + struct ptlrpc_request *req; + unsigned int idle_debug = 0; + unsigned int val; 
+ int rc; + + if (strncmp(buffer, "debug", 5) == 0) { + idle_debug = D_CONSOLE; + } else if (strncmp(buffer, "nodebug", 6) == 0) { + idle_debug = D_HA; + } else { + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + if (val > CONNECTION_SWITCH_MAX) + return -ERANGE; + } + + with_imp_locked(obd, imp, rc) { + if (idle_debug) { + imp->imp_idle_debug = idle_debug; + } else { + if (!val) { + /* initiate the connection if it's in IDLE state */ + req = ptlrpc_request_alloc(imp, + &RQF_OST_STATFS); + if (req != NULL) + ptlrpc_req_finished(req); + } + imp->imp_idle_timeout = val; + } + } + + return count; +} +LUSTRE_RW_ATTR(idle_timeout); + +static ssize_t idle_connect_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + struct ptlrpc_request *req; + int rc; + + with_imp_locked(obd, imp, rc) { + /* to initiate the connection if it's in IDLE state */ + req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); + if (req) + ptlrpc_req_finished(req); + ptlrpc_pinger_force(imp); + } + + return rc ?: count; +} +LUSTRE_WO_ATTR(idle_connect); + +static ssize_t grant_shrink_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + ssize_t len; + + with_imp_locked(obd, imp, len) + len = scnprintf(buf, PAGE_SIZE, "%d\n", + !imp->imp_grant_shrink_disabled && + OCD_HAS_FLAG(&imp->imp_connect_data, + GRANT_SHRINK)); + + return len; +} + +static ssize_t grant_shrink_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + bool val; + int rc; + + if (obd == NULL) + return 0; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + with_imp_locked(obd, imp, rc) { + spin_lock(&imp->imp_lock); + imp->imp_grant_shrink_disabled = !val; + spin_unlock(&imp->imp_lock); + } + + return rc ?: count; +} +LUSTRE_RW_ATTR(grant_shrink); + +LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts); +LPROC_SEQ_FOPS_RO_TYPE(osc, state); + +LPROC_SEQ_FOPS_RW_TYPE(osc, import); +LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov); + +struct lprocfs_vars lprocfs_osc_obd_vars[] = { + { .name = "connect_flags", + .fops = &osc_connect_flags_fops }, + { .name = "ost_server_uuid", + .fops = &osc_server_uuid_fops }, + { .name = "max_pages_per_rpc", + .fops = &osc_obd_max_pages_per_rpc_fops }, + { .name = "osc_cached_mb", + .fops = &osc_cached_mb_fops }, + { .name = "cur_grant_bytes", + .fops = &osc_cur_grant_bytes_fops }, + { .name = "checksum_type", + .fops = &osc_checksum_type_fops }, + { .name = "timeouts", + .fops = &osc_timeouts_fops }, + { .name = "import", + .fops = &osc_import_fops }, + { .name = "state", + .fops = &osc_state_fops }, + { .name = "pinger_recov", + .fops = &osc_pinger_recov_fops }, + { .name = "unstable_stats", + .fops = &osc_unstable_stats_fops }, + { NULL } +}; + +static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *obd = seq->private; + struct client_obd *cli = &obd->u.cli; + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + int i; + + spin_lock(&cli->cl_loi_list_lock); + + lprocfs_stats_header(seq, ktime_get_real(), cli->cl_stats_init, 25, + ":", true, ""); + seq_printf(seq, "read RPCs in 
flight: %d\n", + cli->cl_r_in_flight); + seq_printf(seq, "write RPCs in flight: %d\n", + cli->cl_w_in_flight); + seq_printf(seq, "pending write pages: %d\n", + atomic_read(&cli->cl_pending_w_pages)); + seq_printf(seq, "pending read pages: %d\n", + atomic_read(&cli->cl_pending_r_pages)); + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "pages per rpc rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + 1 << i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "rpcs in flight rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); + + read_cum = 0; + write_cum = 0; + for (i = 1; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "offset rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + (i == 0) ? 
0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + spin_unlock(&cli->cl_loi_list_lock); + + return 0; +} + +static ssize_t osc_rpc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *obd = seq->private; + struct client_obd *cli = &obd->u.cli; + + lprocfs_oh_clear(&cli->cl_read_rpc_hist); + lprocfs_oh_clear(&cli->cl_write_rpc_hist); + lprocfs_oh_clear(&cli->cl_read_page_hist); + lprocfs_oh_clear(&cli->cl_write_page_hist); + lprocfs_oh_clear(&cli->cl_read_offset_hist); + lprocfs_oh_clear(&cli->cl_write_offset_hist); + cli->cl_stats_init = ktime_get_real(); + + return len; +} +LPROC_SEQ_FOPS(osc_rpc_stats); + +static int osc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *obd = seq->private; + struct osc_stats *stats = &obd2osc_dev(obd)->osc_stats; + + lprocfs_stats_header(seq, ktime_get_real(), stats->os_init, 25, ":", + true, ""); + seq_printf(seq, "lockless_write_bytes\t\t%llu\n", + stats->os_lockless_writes); + seq_printf(seq, "lockless_read_bytes\t\t%llu\n", + stats->os_lockless_reads); + return 0; +} + +static ssize_t osc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *obd = seq->private; + struct osc_stats *stats = &obd2osc_dev(obd)->osc_stats; + + memset(stats, 0, sizeof(*stats)); + stats->os_init = ktime_get_real(); + + return len; +} + +LPROC_SEQ_FOPS(osc_stats); + +int lprocfs_osc_attach_seqstat(struct obd_device *obd) +{ + int rc; + + rc = lprocfs_seq_create(obd->obd_proc_entry, "osc_stats", 0644, + &osc_stats_fops, obd); + if (rc == 0) + rc = lprocfs_obd_seq_create(obd, "rpc_stats", 0644, + &osc_rpc_stats_fops, obd); + + return rc; +} +#endif /* CONFIG_PROC_FS */ + +static struct attribute *osc_attrs[] = { + &lustre_attr_active.attr, + &lustre_attr_checksums.attr, + &lustre_attr_checksum_dump.attr, + &lustre_attr_cur_dirty_bytes.attr, + &lustre_attr_cur_lost_grant_bytes.attr, + &lustre_attr_cur_dirty_grant_bytes.attr, + &lustre_attr_destroys_in_flight.attr, + &lustre_attr_grant_shrink_interval.attr, + &lustre_attr_max_dirty_mb.attr, + &lustre_attr_max_rpcs_in_flight.attr, + &lustre_attr_short_io_bytes.attr, + &lustre_attr_resend_count.attr, + &lustre_attr_ost_conn_uuid.attr, + &lustre_attr_conn_uuid.attr, + &lustre_attr_ping.attr, + &lustre_attr_idle_timeout.attr, + &lustre_attr_idle_connect.attr, + &lustre_attr_grant_shrink.attr, + NULL, +}; + +KOBJ_ATTRIBUTE_GROUPS(osc); /* creates osc_groups */ + +int osc_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_vars = lprocfs_osc_obd_vars; + obd->obd_ktype.default_groups = KOBJ_ATTR_GROUPS(osc); + rc = lprocfs_obd_setup(obd, false); + if (rc) + return rc; +#ifdef CONFIG_PROC_FS + /* If the basic OSC proc tree construction succeeded then + * lets do the rest. 
+ */ + rc = lprocfs_osc_attach_seqstat(obd); + if (rc) + goto obd_cleanup; + +#endif /* CONFIG_PROC_FS */ + rc = sptlrpc_lprocfs_cliobd_attach(obd); + if (rc) + goto obd_cleanup; + + ptlrpc_lprocfs_register_obd(obd); +obd_cleanup: + if (rc) + lprocfs_obd_cleanup(obd); + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_cache.c b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c new file mode 100644 index 0000000000000..8b7737ede01ef --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c @@ -0,0 +1,3303 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + * + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * osc cache management. + * + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include +#include + +#include "osc_internal.h" + +static int extent_debug; /* set it to be true for more debug */ + +static void osc_update_pending(struct osc_object *obj, int cmd, int delta); +static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, + enum osc_extent_state state); +static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int sent, int rc); +static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, + int cmd); +static int osc_refresh_count(const struct lu_env *env, + struct osc_async_page *oap, int cmd); +static int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, struct osc_object *osc); +static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, + unsigned int lost_grant, unsigned int dirty_grant); + +static void osc_extent_tree_dump0(int mask, struct osc_object *obj, + const char *func, int line); +#define osc_extent_tree_dump(mask, obj) \ + osc_extent_tree_dump0(mask, obj, __func__, __LINE__) + +static void osc_unreserve_grant(struct client_obd *cli, unsigned int reserved, + unsigned int unused); + +/** \addtogroup osc + * @{ + */ + +/* ------------------ osc extent ------------------ */ +static inline char *ext_flags(struct osc_extent *ext, char *flags) +{ + char *buf = flags; + *buf++ = ext->oe_rw ? 
'r' : 'w'; + if (!RB_EMPTY_NODE(&ext->oe_node)) + *buf++ = 'i'; + if (ext->oe_sync) + *buf++ = 'S'; + if (ext->oe_srvlock) + *buf++ = 's'; + if (ext->oe_hp) + *buf++ = 'h'; + if (ext->oe_urgent) + *buf++ = 'u'; + if (ext->oe_memalloc) + *buf++ = 'm'; + if (ext->oe_trunc_pending) + *buf++ = 't'; + if (ext->oe_fsync_wait) + *buf++ = 'Y'; + *buf = 0; + return flags; +} + +#define EXTSTR "[%lu -> %lu/%lu]" +#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end +static const char *const oes_strings[] = { + "inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL }; + +#define OSC_EXTENT_DUMP_WITH_LOC(file, func, line, mask, extent, fmt, ...) do {\ + static struct cfs_debug_limit_state cdls; \ + struct osc_extent *__ext = (extent); \ + char __buf[16]; \ + \ + __CDEBUG_WITH_LOC(file, func, line, mask, &cdls, \ + "extent %p@{" EXTSTR ", " \ + "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt, \ + /* ----- extent part 0 ----- */ \ + __ext, EXTPARA(__ext), \ + /* ----- part 1 ----- */ \ + kref_read(&__ext->oe_refc), \ + atomic_read(&__ext->oe_users), \ + list_empty_marker(&__ext->oe_link), \ + oes_strings[__ext->oe_state], ext_flags(__ext, __buf), \ + __ext->oe_obj, \ + /* ----- part 2 ----- */ \ + __ext->oe_grants, __ext->oe_nr_pages, \ + list_empty_marker(&__ext->oe_pages), \ + waitqueue_active(&__ext->oe_waitq) ? '+' : '-', \ + __ext->oe_dlmlock, __ext->oe_mppr, __ext->oe_owner, \ + /* ----- part 4 ----- */ \ + ## __VA_ARGS__); \ + if (mask == D_ERROR && __ext->oe_dlmlock != NULL) \ + LDLM_ERROR(__ext->oe_dlmlock, "extent: %p", __ext); \ + else \ + LDLM_DEBUG(__ext->oe_dlmlock, "extent: %p", __ext); \ +} while (0) + +#define OSC_EXTENT_DUMP(mask, ext, fmt, ...) \ + OSC_EXTENT_DUMP_WITH_LOC(__FILE__, __func__, __LINE__, \ + mask, ext, fmt, ## __VA_ARGS__) + +#undef EASSERTF +#define EASSERTF(expr, ext, fmt, args...) do { \ + if (!(expr)) { \ + OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args); \ + osc_extent_tree_dump(D_ERROR, (ext)->oe_obj); \ + LASSERT(expr); \ + } \ +} while (0) + +#undef EASSERT +#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n") + +static inline struct osc_extent *rb_extent(struct rb_node *n) +{ + return rb_entry_safe(n, struct osc_extent, oe_node); +} + +static inline struct osc_extent *next_extent(struct osc_extent *ext) +{ + if (ext == NULL) + return NULL; + + LASSERT(!RB_EMPTY_NODE(&ext->oe_node)); + return rb_extent(rb_next(&ext->oe_node)); +} + +static inline struct osc_extent *prev_extent(struct osc_extent *ext) +{ + if (ext == NULL) + return NULL; + + LASSERT(!RB_EMPTY_NODE(&ext->oe_node)); + return rb_extent(rb_prev(&ext->oe_node)); +} + +static inline struct osc_extent *first_extent(struct osc_object *obj) +{ + return rb_extent(rb_first(&obj->oo_root)); +} + +/* object must be locked by caller. 
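+ * Returns 0 if the extent passes all checks; otherwise a non-zero code + * identifying the failed check is returned and the extent is dumped.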
*/ +static int osc_extent_sanity_check0(struct osc_extent *ext, + const char *func, const int line) +{ + struct osc_object *obj = ext->oe_obj; + struct osc_async_page *oap; + size_t page_count; + int rc = 0; + + assert_osc_object_is_locked(obj); + + if (ext->oe_state >= OES_STATE_MAX) + GOTO(out, rc = 10); + + if (kref_read(&ext->oe_refc) <= 0) + GOTO(out, rc = 20); + + if (kref_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) + GOTO(out, rc = 30); + + switch (ext->oe_state) { + case OES_INV: + if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages)) + GOTO(out, rc = 35); + GOTO(out, rc = 0); + break; + case OES_ACTIVE: + if (atomic_read(&ext->oe_users) == 0) + GOTO(out, rc = 40); + if (ext->oe_hp) + GOTO(out, rc = 50); + if (ext->oe_fsync_wait && !ext->oe_urgent) + GOTO(out, rc = 55); + break; + case OES_CACHE: + if (ext->oe_grants == 0) + GOTO(out, rc = 60); + if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) + GOTO(out, rc = 65); + fallthrough; + default: + if (atomic_read(&ext->oe_users) > 0) + GOTO(out, rc = 70); + } + + if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start) + GOTO(out, rc = 80); + + if (ext->oe_sync && ext->oe_grants > 0) + GOTO(out, rc = 90); + + if (ext->oe_dlmlock != NULL && + ext->oe_dlmlock->l_resource->lr_type == LDLM_EXTENT && + !ldlm_is_failed(ext->oe_dlmlock)) { + struct ldlm_extent *extent; + + extent = &ext->oe_dlmlock->l_policy_data.l_extent; + if (!(extent->start <= cl_offset(osc2cl(obj), ext->oe_start) && + extent->end >= cl_offset(osc2cl(obj), ext->oe_max_end))) + GOTO(out, rc = 100); + + if (!(ext->oe_dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP))) + GOTO(out, rc = 102); + } + + if (ext->oe_nr_pages > ext->oe_mppr) + GOTO(out, rc = 105); + + /* Do not verify page list if extent is in RPC. This is because an + * in-RPC extent is supposed to be exclusively accessible w/o lock. */ + if (ext->oe_state > OES_CACHE) + GOTO(out, rc = 0); + + if (!extent_debug) + GOTO(out, rc = 0); + + page_count = 0; + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + pgoff_t index = osc_index(oap2osc(oap)); + ++page_count; + if (index > ext->oe_end || index < ext->oe_start) + GOTO(out, rc = 110); + } + if (page_count != ext->oe_nr_pages) + GOTO(out, rc = 120); + +out: + if (rc != 0) + OSC_EXTENT_DUMP_WITH_LOC(__FILE__, func, line, D_ERROR, ext, + "sanity check %p failed: rc = %d\n", + ext, rc); + return rc; +} + +#define sanity_check_nolock(ext) \ + osc_extent_sanity_check0(ext, __func__, __LINE__) + +#define sanity_check(ext) ({ \ + int __res; \ + osc_object_lock((ext)->oe_obj); \ + __res = sanity_check_nolock(ext); \ + osc_object_unlock((ext)->oe_obj); \ + __res; \ +}) + +static inline bool +overlapped(const struct osc_extent *ex1, const struct osc_extent *ex2) +{ + return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start); +} + +/** + * sanity check - to make sure there is no overlapped extent in the tree. 
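+ * Only effective when extent_debug is enabled; otherwise the walk is + * skipped and 0 is returned.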
+ */ +static int osc_extent_is_overlapped(struct osc_object *obj, + struct osc_extent *ext) +{ + struct osc_extent *tmp; + + assert_osc_object_is_locked(obj); + + if (!extent_debug) + return 0; + + for (tmp = first_extent(obj); tmp != NULL; tmp = next_extent(tmp)) { + if (tmp == ext) + continue; + if (overlapped(tmp, ext)) + return 1; + } + return 0; +} + +static void osc_extent_state_set(struct osc_extent *ext, int state) +{ + assert_osc_object_is_locked(ext->oe_obj); + LASSERT(state >= OES_INV && state < OES_STATE_MAX); + + /* Never try to sanity check a state changing extent :-) */ + /* LASSERT(sanity_check_nolock(ext) == 0); */ + + /* TODO: validate the state machine */ + smp_store_release(&ext->oe_state, state); + wake_up(&ext->oe_waitq); +} + +static struct osc_extent *osc_extent_alloc(struct osc_object *obj) +{ + struct osc_extent *ext; + + OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_NOFS); + if (ext == NULL) + return NULL; + + RB_CLEAR_NODE(&ext->oe_node); + ext->oe_obj = obj; + cl_object_get(osc2cl(obj)); + kref_init(&ext->oe_refc); + atomic_set(&ext->oe_users, 0); + INIT_LIST_HEAD(&ext->oe_link); + ext->oe_state = OES_INV; + INIT_LIST_HEAD(&ext->oe_pages); + init_waitqueue_head(&ext->oe_waitq); + ext->oe_dlmlock = NULL; + + return ext; +} + +static void osc_extent_free(struct kref *kref) +{ + struct osc_extent *ext = container_of(kref, struct osc_extent, + oe_refc); + + LASSERT(list_empty(&ext->oe_link)); + LASSERT(atomic_read(&ext->oe_users) == 0); + LASSERT(ext->oe_state == OES_INV); + LASSERT(RB_EMPTY_NODE(&ext->oe_node)); + + if (ext->oe_dlmlock) { + lu_ref_del(&ext->oe_dlmlock->l_reference, + "osc_extent", ext); + LDLM_LOCK_PUT(ext->oe_dlmlock); + ext->oe_dlmlock = NULL; + } +#if 0 + /* If/When cl_object_put drops the need for 'env', + * this code can be enabled, and matching code in + * osc_extent_put removed. + */ + cl_object_put(osc2cl(ext->oe_obj)); + + OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); +#endif +} + +static struct osc_extent *osc_extent_get(struct osc_extent *ext) +{ + LASSERT(kref_read(&ext->oe_refc) >= 0); + kref_get(&ext->oe_refc); + return ext; +} + +static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) +{ + LASSERT(kref_read(&ext->oe_refc) > 0); + if (kref_put(&ext->oe_refc, osc_extent_free)) { + /* This should be in osc_extent_free(), but + * while we need to pass 'env' it cannot be. + */ + cl_object_put(env, osc2cl(ext->oe_obj)); + + OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); + } +} + +/** + * osc_extent_put_trust() is a special version of osc_extent_put() when + * it's known that the caller is not the last user. This is to address the + * problem of lacking of lu_env ;-). + */ +static void osc_extent_put_trust(struct osc_extent *ext) +{ + LASSERT(kref_read(&ext->oe_refc) > 1); + assert_osc_object_is_locked(ext->oe_obj); + osc_extent_put(NULL, ext); +} + +/** + * Return the extent which includes pgoff @index, or return the greatest + * previous extent in the tree. + */ +static struct osc_extent *osc_extent_search(struct osc_object *obj, + pgoff_t index) +{ + struct rb_node *n = obj->oo_root.rb_node; + struct osc_extent *tmp, *p = NULL; + + assert_osc_object_is_locked(obj); + while (n != NULL) { + tmp = rb_extent(n); + if (index < tmp->oe_start) { + n = n->rb_left; + } else if (index > tmp->oe_end) { + p = rb_extent(n); + n = n->rb_right; + } else { + return tmp; + } + } + return p; +} + +/* + * Return the extent covering @index, otherwise return NULL. + * caller must have held object lock. 
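+ * The extent returned carries an extra reference taken via + * osc_extent_get(); the caller drops it with osc_extent_put().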
+ */ +static struct osc_extent *osc_extent_lookup(struct osc_object *obj, + pgoff_t index) +{ + struct osc_extent *ext; + + ext = osc_extent_search(obj, index); + if (ext != NULL && ext->oe_start <= index && index <= ext->oe_end) + return osc_extent_get(ext); + return NULL; +} + +/* caller must have held object lock. */ +static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) +{ + struct rb_node **n = &obj->oo_root.rb_node; + struct rb_node *parent = NULL; + struct osc_extent *tmp; + + LASSERT(RB_EMPTY_NODE(&ext->oe_node)); + LASSERT(ext->oe_obj == obj); + assert_osc_object_is_locked(obj); + while (*n != NULL) { + tmp = rb_extent(*n); + parent = *n; + + if (ext->oe_end < tmp->oe_start) + n = &(*n)->rb_left; + else if (ext->oe_start > tmp->oe_end) + n = &(*n)->rb_right; + else + EASSERTF(0, tmp, EXTSTR"\n", EXTPARA(ext)); + } + rb_link_node(&ext->oe_node, parent, n); + rb_insert_color(&ext->oe_node, &obj->oo_root); + osc_extent_get(ext); +} + +/* caller must have held object lock. */ +static void osc_extent_erase(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + assert_osc_object_is_locked(obj); + if (!RB_EMPTY_NODE(&ext->oe_node)) { + rb_erase(&ext->oe_node, &obj->oo_root); + RB_CLEAR_NODE(&ext->oe_node); + /* rbtree held a refcount */ + osc_extent_put_trust(ext); + } +} + +static struct osc_extent *osc_extent_hold(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + assert_osc_object_is_locked(obj); + LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE); + if (ext->oe_state == OES_CACHE) { + osc_extent_state_set(ext, OES_ACTIVE); + osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages); + } + atomic_inc(&ext->oe_users); + list_del_init(&ext->oe_link); + return osc_extent_get(ext); +} + +static void __osc_extent_remove(struct osc_extent *ext) +{ + assert_osc_object_is_locked(ext->oe_obj); + LASSERT(list_empty(&ext->oe_pages)); + osc_extent_erase(ext); + list_del_init(&ext->oe_link); + osc_extent_state_set(ext, OES_INV); + OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n"); +} + +static void osc_extent_remove(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + osc_object_lock(obj); + __osc_extent_remove(ext); + osc_object_unlock(obj); +} + +/** + * This function is used to merge extents to get better performance. It checks + * if @cur and @victim are contiguous at block level. + */ +static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, + struct osc_extent *victim) +{ + struct osc_object *obj = cur->oe_obj; + struct client_obd *cli = osc_cli(obj); + pgoff_t chunk_start; + pgoff_t chunk_end; + int ppc_bits; + + LASSERT(cur->oe_state == OES_CACHE); + assert_osc_object_is_locked(obj); + if (victim == NULL) + return -EINVAL; + + if (victim->oe_state != OES_INV && + (victim->oe_state != OES_CACHE || victim->oe_fsync_wait)) + return -EBUSY; + + if (cur->oe_max_end != victim->oe_max_end) + return -ERANGE; + + /* + * In the rare case max_pages_per_rpc (mppr) is changed, don't + * merge extents until after old ones have been sent, or the + * "extents are aligned to RPCs" checks are unhappy. 
+ */ + if (cur->oe_mppr != victim->oe_mppr) + return -ERANGE; + + LASSERT(cur->oe_dlmlock == victim->oe_dlmlock); + ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_SHIFT; + chunk_start = cur->oe_start >> ppc_bits; + chunk_end = cur->oe_end >> ppc_bits; + if (chunk_start != (victim->oe_end >> ppc_bits) + 1 && + chunk_end + 1 != victim->oe_start >> ppc_bits) + return -ERANGE; + + /* overall extent size should not exceed the max supported limit + * reported by the server */ + if (cur->oe_end - cur->oe_start + 1 + + victim->oe_end - victim->oe_start + 1 > cli->cl_max_extent_pages) + return -ERANGE; + + OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur); + + cur->oe_start = min(cur->oe_start, victim->oe_start); + cur->oe_end = max(cur->oe_end, victim->oe_end); + /* per-extent tax should be accounted only once for the whole extent */ + cur->oe_grants += victim->oe_grants - cli->cl_grant_extent_tax; + cur->oe_nr_pages += victim->oe_nr_pages; + /* only the following bits are needed to merge */ + cur->oe_urgent |= victim->oe_urgent; + cur->oe_memalloc |= victim->oe_memalloc; + list_splice_init(&victim->oe_pages, &cur->oe_pages); + victim->oe_nr_pages = 0; + + osc_extent_get(victim); + __osc_extent_remove(victim); + osc_extent_put(env, victim); + + OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim); + return 0; +} + +/** + * Drop user count of osc_extent, and unplug IO asynchronously. + */ +void osc_extent_release(const struct lu_env *env, struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + ENTRY; + + LASSERT(atomic_read(&ext->oe_users) > 0); + LASSERT(sanity_check(ext) == 0); + LASSERT(ext->oe_grants > 0); + + if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) { + LASSERT(ext->oe_state == OES_ACTIVE); + if (ext->oe_trunc_pending) { + /* a truncate process is waiting for this extent. + * This may happen due to a race, check + * osc_cache_truncate_start(). */ + osc_extent_state_set(ext, OES_TRUNC); + ext->oe_trunc_pending = 0; + osc_object_unlock(obj); + } else { + int grant = 0; + + osc_extent_state_set(ext, OES_CACHE); + osc_update_pending(obj, OBD_BRW_WRITE, + ext->oe_nr_pages); + + /* try to merge the previous and next extent. */ + if (osc_extent_merge(env, ext, prev_extent(ext)) == 0) + grant += cli->cl_grant_extent_tax; + if (osc_extent_merge(env, ext, next_extent(ext)) == 0) + grant += cli->cl_grant_extent_tax; + + if (ext->oe_hp) + list_move_tail(&ext->oe_link, + &obj->oo_hp_exts); + else if (ext->oe_urgent) + list_move_tail(&ext->oe_link, + &obj->oo_urgent_exts); + else if (ext->oe_nr_pages == ext->oe_mppr) { + list_move_tail(&ext->oe_link, + &obj->oo_full_exts); + } + osc_object_unlock(obj); + if (grant > 0) + osc_unreserve_grant(cli, 0, grant); + } + + osc_io_unplug_async(env, cli, obj); + } + osc_extent_put(env, ext); + + RETURN_EXIT; +} + +/** + * Find or create an extent which includes @index, core function to manage + * extent tree. 
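+ * + * Returns a held extent covering @index, or an ERR_PTR on failure; any + * grant space consumed for a newly created or expanded extent is + * subtracted from *grants.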
+ */ +static struct osc_extent *osc_extent_find(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + unsigned int *grants) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_lock *olck; + struct cl_lock_descr *descr; + struct osc_extent *cur; + struct osc_extent *ext; + struct osc_extent *conflict = NULL; + struct osc_extent *found = NULL; + pgoff_t chunk; + pgoff_t max_end; + unsigned int max_pages; /* max_pages_per_rpc */ + unsigned int chunksize; + int ppc_bits; /* pages per chunk bits */ + pgoff_t chunk_mask; + int rc; + ENTRY; + + cur = osc_extent_alloc(obj); + if (cur == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + olck = osc_env_io(env)->oi_write_osclock; + LASSERTF(olck != NULL, "page %lu is not covered by lock\n", index); + LASSERT(olck->ols_state == OLS_GRANTED); + + descr = &olck->ols_cl.cls_lock->cll_descr; + LASSERT(descr->cld_mode >= CLM_WRITE); + + LASSERTF(cli->cl_chunkbits >= PAGE_SHIFT, + "chunkbits: %u\n", cli->cl_chunkbits); + ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; + chunk_mask = ~((1 << ppc_bits) - 1); + chunksize = 1 << cli->cl_chunkbits; + chunk = index >> ppc_bits; + + /* align end to RPC edge. */ + max_pages = cli->cl_max_pages_per_rpc; + if ((max_pages & ~chunk_mask) != 0) { + CERROR("max_pages: %#x chunkbits: %u chunk_mask: %#lx\n", + max_pages, cli->cl_chunkbits, chunk_mask); + RETURN(ERR_PTR(-EINVAL)); + } + max_end = index - (index % max_pages) + max_pages - 1; + max_end = min_t(pgoff_t, max_end, descr->cld_end); + + /* initialize new extent by parameters so far */ + cur->oe_max_end = max_end; + cur->oe_start = index & chunk_mask; + cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1; + if (cur->oe_start < descr->cld_start) + cur->oe_start = descr->cld_start; + if (cur->oe_end > max_end) + cur->oe_end = max_end; + cur->oe_grants = chunksize + cli->cl_grant_extent_tax; + cur->oe_mppr = max_pages; + if (olck->ols_dlmlock != NULL) { + LASSERT(olck->ols_hold); + cur->oe_dlmlock = LDLM_LOCK_GET(olck->ols_dlmlock); + lu_ref_add(&olck->ols_dlmlock->l_reference, "osc_extent", cur); + } + + /* grants has been allocated by caller */ + LASSERTF(*grants >= chunksize + cli->cl_grant_extent_tax, + "%u/%u/%u.\n", *grants, chunksize, cli->cl_grant_extent_tax); + LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR"\n", + EXTPARA(cur)); + +restart: + osc_object_lock(obj); + ext = osc_extent_search(obj, cur->oe_start); + if (!ext) + ext = first_extent(obj); + for (; ext; ext = next_extent(ext)) { + pgoff_t ext_chk_start = ext->oe_start >> ppc_bits; + pgoff_t ext_chk_end = ext->oe_end >> ppc_bits; + + LASSERT(sanity_check_nolock(ext) == 0); + if (chunk > ext_chk_end + 1 || chunk < ext_chk_start) + break; + + /* if covering by different locks, no chance to match */ + if (olck->ols_dlmlock != ext->oe_dlmlock) { + EASSERTF(!overlapped(ext, cur), ext, + EXTSTR"\n", EXTPARA(cur)); + + continue; + } + + /* discontiguous chunks? */ + if (chunk + 1 < ext_chk_start) + continue; + + /* ok, from now on, ext and cur have these attrs: + * 1. covered by the same lock + * 2. contiguous at chunk level or overlapping. */ + + if (overlapped(ext, cur)) { + /* cur is the minimum unit, so overlapping means + * full contain. */ + EASSERTF((ext->oe_start <= cur->oe_start && + ext->oe_end >= cur->oe_end), + ext, EXTSTR"\n", EXTPARA(cur)); + + if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) { + /* for simplicity, we wait for this extent to + * finish before going forward. 
*/ + conflict = osc_extent_get(ext); + break; + } + + found = osc_extent_hold(ext); + break; + } + + /* non-overlapped extent */ + if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) + /* we can't do anything for a non OES_CACHE extent, or + * if there is someone waiting for this extent to be + * flushed, try next one. */ + continue; + + if (osc_extent_merge(env, ext, cur) == 0) { + LASSERT(*grants >= chunksize); + *grants -= chunksize; + + /* + * Try to merge with the next one too because we + * might have just filled in a gap. + */ + if (osc_extent_merge(env, ext, next_extent(ext)) == 0) + /* we can save extent tax from next extent */ + *grants += cli->cl_grant_extent_tax; + + found = osc_extent_hold(ext); + break; + } + } + + osc_extent_tree_dump(D_CACHE, obj); + if (found != NULL) { + LASSERT(conflict == NULL); + if (!IS_ERR(found)) { + LASSERT(found->oe_dlmlock == cur->oe_dlmlock); + OSC_EXTENT_DUMP(D_CACHE, found, + "found caching ext for %lu.\n", index); + } + } else if (conflict == NULL) { + /* create a new extent */ + EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur); + LASSERT(*grants >= cur->oe_grants); + *grants -= cur->oe_grants; + + cur->oe_state = OES_CACHE; + found = osc_extent_hold(cur); + osc_extent_insert(obj, cur); + OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n", + index, descr->cld_end); + } + osc_object_unlock(obj); + + if (conflict != NULL) { + LASSERT(found == NULL); + + /* waiting for IO to finish. Please notice that it's impossible + * to be an OES_TRUNC extent. */ + rc = osc_extent_wait(env, conflict, OES_INV); + osc_extent_put(env, conflict); + conflict = NULL; + if (rc < 0) + GOTO(out, found = ERR_PTR(rc)); + + goto restart; + } + EXIT; + +out: + osc_extent_put(env, cur); + return found; +} + +/** + * Called when IO is finished to an extent. + */ +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc) +{ + struct client_obd *cli = osc_cli(ext->oe_obj); + struct osc_async_page *oap; + struct osc_async_page *tmp; + int nr_pages = ext->oe_nr_pages; + int lost_grant = 0; + int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096; + loff_t last_off = 0; + int last_count = -1; + ENTRY; + + OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n"); + + ext->oe_rc = rc ?: ext->oe_nr_pages; + EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext); + + osc_lru_add_batch(cli, &ext->oe_pages); + list_for_each_entry_safe(oap, tmp, &ext->oe_pages, + oap_pending_item) { + list_del_init(&oap->oap_rpc_item); + list_del_init(&oap->oap_pending_item); + if (last_off <= oap->oap_obj_off) { + last_off = oap->oap_obj_off; + last_count = oap->oap_count; + } + + --ext->oe_nr_pages; + osc_ap_completion(env, cli, oap, sent, rc); + } + EASSERT(ext->oe_nr_pages == 0, ext); + + if (!sent) { + lost_grant = ext->oe_grants; + } else if (cli->cl_ocd_grant_param == 0 && + blocksize < PAGE_SIZE && + last_count != PAGE_SIZE) { + /* For short writes without OBD_CONNECT_GRANT support, we + * shouldn't count parts of pages that span a whole chunk on + * the OST side, or our accounting goes wrong. Should match + * the code in tgt_grant_check. 
+ */ + int offset = last_off & ~PAGE_MASK; + int count = last_count + (offset & (blocksize - 1)); + int end = (offset + last_count) & (blocksize - 1); + if (end) + count += blocksize - end; + + lost_grant = PAGE_SIZE - count; + } + if (ext->oe_grants > 0) + osc_free_grant(cli, nr_pages, lost_grant, ext->oe_grants); + + osc_extent_remove(ext); + /* put the refcount for RPC */ + osc_extent_put(env, ext); + RETURN(0); +} + +/** + * Wait for the extent's state to become @state. + */ +static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, + enum osc_extent_state state) +{ + struct osc_object *obj = ext->oe_obj; + int rc = 0; + ENTRY; + + osc_object_lock(obj); + LASSERT(sanity_check_nolock(ext) == 0); + /* `Kick' this extent only if the caller is waiting for it to be + * written out. */ + if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp) { + if (ext->oe_state == OES_ACTIVE) { + ext->oe_urgent = 1; + } else if (ext->oe_state == OES_CACHE) { + ext->oe_urgent = 1; + osc_extent_hold(ext); + rc = 1; + } + } + osc_object_unlock(obj); + if (rc == 1) + osc_extent_release(env, ext); + + /* wait for the extent until its state becomes @state */ + rc = wait_event_idle_timeout(ext->oe_waitq, + smp_load_acquire(&ext->oe_state) == state, + cfs_time_seconds(600)); + if (rc == 0) { + OSC_EXTENT_DUMP(D_ERROR, ext, + "%s: wait ext to %u timedout, recovery in progress?\n", + cli_name(osc_cli(obj)), state); + + wait_event_idle(ext->oe_waitq, + smp_load_acquire(&ext->oe_state) == state); + } + if (ext->oe_rc < 0) + rc = ext->oe_rc; + else + rc = 0; + RETURN(rc); +} + +/** + * Discard pages with index greater than @size. If @ext is overlapped with + * @size, then partial truncate happens. + */ +static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, + bool partial) +{ + struct lu_env *env; + struct cl_io *io; + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_async_page *oap; + struct osc_async_page *tmp; + struct pagevec *pvec; + int pages_in_chunk = 0; + int ppc_bits = cli->cl_chunkbits - + PAGE_SHIFT; + __u64 trunc_chunk = trunc_index >> ppc_bits; + int grants = 0; + int nr_pages = 0; + int rc = 0; + __u16 refcheck; + ENTRY; + + LASSERT(sanity_check(ext) == 0); + LASSERT(ext->oe_state == OES_TRUNC); + LASSERT(!ext->oe_urgent); + + /* Request new lu_env. + * We can't use that env from osc_cache_truncate_start() because + * it's from lov_io_sub and not fully initialized. */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = osc_env_thread_io(env); + io->ci_obj = cl_object_top(osc2cl(obj)); + io->ci_ignore_layout = 1; + pvec = &osc_env_info(env)->oti_pagevec; + ll_pagevec_init(pvec, 0); + rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (rc < 0) + GOTO(out, rc); + + /* discard all pages with index greater than trunc_index */ + list_for_each_entry_safe(oap, tmp, &ext->oe_pages, + oap_pending_item) { + pgoff_t index = osc_index(oap2osc(oap)); + struct cl_page *page = oap2cl_page(oap); + + LASSERT(list_empty(&oap->oap_rpc_item)); + + /* only discard the pages with their index greater than + * trunc_index, and ... */ + if (index < trunc_index || + (index == trunc_index && partial)) { + /* accounting how many pages remaining in the chunk + * so that we can calculate grants correctly. 
*/ + if (index >> ppc_bits == trunc_chunk) + ++pages_in_chunk; + continue; + } + + list_del_init(&oap->oap_pending_item); + + cl_page_get(page); + lu_ref_add(&page->cp_reference, "truncate", current); + + if (cl_page_own(env, io, page) == 0) { + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + LASSERT(0); + } + + lu_ref_del(&page->cp_reference, "truncate", current); + cl_pagevec_put(env, page, pvec); + + --ext->oe_nr_pages; + ++nr_pages; + } + pagevec_release(pvec); + + EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial, + ext->oe_nr_pages == 0), + ext, "trunc_index %lu, partial %d\n", trunc_index, partial); + + osc_object_lock(obj); + if (ext->oe_nr_pages == 0) { + LASSERT(pages_in_chunk == 0); + grants = ext->oe_grants; + ext->oe_grants = 0; + } else { /* calculate how many grants we can free */ + int chunks = (ext->oe_end >> ppc_bits) - trunc_chunk; + pgoff_t last_index; + + + /* if there is no pages in this chunk, we can also free grants + * for the last chunk */ + if (pages_in_chunk == 0) { + /* if this is the 1st chunk and no pages in this chunk, + * ext->oe_nr_pages must be zero, so we should be in + * the other if-clause. */ + LASSERT(trunc_chunk > 0); + --trunc_chunk; + ++chunks; + } + + /* this is what we can free from this extent */ + grants = chunks << cli->cl_chunkbits; + ext->oe_grants -= grants; + last_index = ((trunc_chunk + 1) << ppc_bits) - 1; + ext->oe_end = min(last_index, ext->oe_max_end); + LASSERT(ext->oe_end >= ext->oe_start); + LASSERT(ext->oe_grants > 0); + } + osc_object_unlock(obj); + + if (grants > 0 || nr_pages > 0) + osc_free_grant(cli, nr_pages, grants, grants); + +out: + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + RETURN(rc); +} + +/** + * This function is used to make the extent prepared for transfer. + * A race with flusing page - ll_writepage() has to be handled cautiously. + */ +static int osc_extent_make_ready(const struct lu_env *env, + struct osc_extent *ext) +{ + struct osc_async_page *oap; + struct osc_async_page *last = NULL; + struct osc_object *obj = ext->oe_obj; + unsigned int page_count = 0; + int rc; + ENTRY; + + /* we're going to grab page lock, so object lock must not be taken. */ + LASSERT(sanity_check(ext) == 0); + /* in locking state, any process should not touch this extent. */ + EASSERT(ext->oe_state == OES_LOCKING, ext); + EASSERT(ext->oe_owner != NULL, ext); + + OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n"); + + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + ++page_count; + if (last == NULL || last->oap_obj_off < oap->oap_obj_off) + last = oap; + + /* checking ASYNC_READY is race safe */ + if ((oap->oap_async_flags & ASYNC_READY) != 0) + continue; + + rc = osc_make_ready(env, oap, OBD_BRW_WRITE); + switch (rc) { + case 0: + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_READY; + spin_unlock(&oap->oap_lock); + break; + case -EALREADY: + LASSERT((oap->oap_async_flags & ASYNC_READY) != 0); + break; + default: + LASSERTF(0, "unknown return code: %d\n", rc); + } + } + + LASSERT(page_count == ext->oe_nr_pages); + LASSERT(last != NULL); + /* the last page is the only one we need to refresh its count by + * the size of file. 
*/ + if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) { + int last_oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE); + LASSERTF(last_oap_count > 0, + "last_oap_count %d\n", last_oap_count); + LASSERT(last->oap_page_off + last_oap_count <= PAGE_SIZE); + last->oap_count = last_oap_count; + spin_lock(&last->oap_lock); + last->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&last->oap_lock); + } + + /* for the rest of pages, we don't need to call osf_refresh_count() + * because it's known they are not the last page */ + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) { + oap->oap_count = PAGE_SIZE - oap->oap_page_off; + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&oap->oap_lock); + } + } + + osc_object_lock(obj); + osc_extent_state_set(ext, OES_RPC); + osc_object_unlock(obj); + /* get a refcount for RPC. */ + osc_extent_get(ext); + + RETURN(0); +} + +/** + * Quick and simple version of osc_extent_find(). This function is frequently + * called to expand the extent for the same IO. To expand the extent, the + * page index must be in the same or next chunk of ext->oe_end. + */ +static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, + unsigned int *grants) +{ + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_extent *next; + int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; + pgoff_t chunk = index >> ppc_bits; + pgoff_t end_chunk; + pgoff_t end_index; + unsigned int chunksize = 1 << cli->cl_chunkbits; + int rc = 0; + ENTRY; + + LASSERT(ext->oe_max_end >= index && ext->oe_start <= index); + osc_object_lock(obj); + LASSERT(sanity_check_nolock(ext) == 0); + end_chunk = ext->oe_end >> ppc_bits; + if (chunk > end_chunk + 1) + GOTO(out, rc = -ERANGE); + + if (end_chunk >= chunk) + GOTO(out, rc = 0); + + LASSERT(end_chunk + 1 == chunk); + + /* try to expand this extent to cover @index */ + end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1); + + /* don't go over the maximum extent size reported by server */ + if (end_index - ext->oe_start + 1 > cli->cl_max_extent_pages) + GOTO(out, rc = -ERANGE); + + next = next_extent(ext); + if (next != NULL && next->oe_start <= end_index) + /* complex mode - overlapped with the next extent, + * this case will be handled by osc_extent_find() */ + GOTO(out, rc = -EAGAIN); + + ext->oe_end = end_index; + ext->oe_grants += chunksize; + LASSERT(*grants >= chunksize); + *grants -= chunksize; + EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext, + "overlapped after expanding for %lu.\n", index); + EXIT; + +out: + osc_object_unlock(obj); + RETURN(rc); +} + +static void osc_extent_tree_dump0(int mask, struct osc_object *obj, + const char *func, int line) +{ + struct osc_extent *ext; + int cnt; + + if (!cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) + return; + + CDEBUG(mask, "Dump object %p extents at %s:%d, mppr: %u.\n", + obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc); + + /* osc_object_lock(obj); */ + cnt = 1; + for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext)) + OSC_EXTENT_DUMP(mask, ext, "in tree %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_hp_exts, oe_link) + OSC_EXTENT_DUMP(mask, ext, "hp %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link) + OSC_EXTENT_DUMP(mask, ext, "urgent %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_reading_exts, oe_link) + OSC_EXTENT_DUMP(mask, ext, "reading %d.\n", cnt++); 
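For orientation, the chunk arithmetic used by osc_extent_expand() and the truncate path above is plain shifting between page indexes and chunk numbers. A standalone sketch with a hypothetical geometry (4KiB pages, 64KiB chunks), not the kernel types:

#include <stdio.h>

/* Hypothetical client geometry: 4KiB pages, 64KiB chunks. */
#define SK_PAGE_SHIFT   12
#define SK_CHUNK_BITS   16
#define SK_PPC_BITS     (SK_CHUNK_BITS - SK_PAGE_SHIFT) /* pages-per-chunk bits */

int main(void)
{
        unsigned long index = 37;                       /* some page index */
        unsigned long chunk = index >> SK_PPC_BITS;     /* chunk containing it */
        /* last page index covered by that chunk, the boundary osc_extent_expand()
         * grows oe_end to */
        unsigned long end_index = ((chunk + 1) << SK_PPC_BITS) - 1;
        /* grant consumed by covering one more chunk */
        unsigned long grants = 1UL << SK_CHUNK_BITS;

        printf("page %lu -> chunk %lu, chunk covers pages [%lu, %lu], %lu bytes of grant\n",
               index, chunk, chunk << SK_PPC_BITS, end_index, grants);
        return 0;
}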
+ /* osc_object_unlock(obj); */ +} + +/* ------------------ osc extent end ------------------ */ + +static inline int osc_is_ready(struct osc_object *osc) +{ + return !list_empty(&osc->oo_ready_item) || + !list_empty(&osc->oo_hp_ready_item); +} + +#define OSC_IO_DEBUG(OSC, STR, args...) \ + CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR, \ + (OSC), osc_is_ready(OSC), \ + list_empty_marker(&(OSC)->oo_hp_ready_item), \ + list_empty_marker(&(OSC)->oo_ready_item), \ + atomic_read(&(OSC)->oo_nr_writes), \ + list_empty_marker(&(OSC)->oo_hp_exts), \ + list_empty_marker(&(OSC)->oo_urgent_exts), \ + atomic_read(&(OSC)->oo_nr_reads), \ + list_empty_marker(&(OSC)->oo_reading_exts), \ + ##args) + +static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, + int cmd) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = oap2cl_page(oap); + int result; + + LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */ + + ENTRY; + result = cl_page_make_ready(env, page, CRT_WRITE); + if (result == 0) + opg->ops_submit_time = ktime_get(); + RETURN(result); +} + +static int osc_refresh_count(const struct lu_env *env, + struct osc_async_page *oap, int cmd) +{ + struct osc_page *opg = oap2osc_page(oap); + pgoff_t index = osc_index(oap2osc(oap)); + struct cl_object *obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int result; + loff_t kms; + + /* readpage queues with _COUNT_STABLE, shouldn't get here. */ + LASSERT(!(cmd & OBD_BRW_READ)); + LASSERT(opg != NULL); + obj = opg->ops_cl.cpl_obj; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result < 0) + return result; + kms = attr->cat_kms; + if (cl_offset(obj, index) >= kms) + /* catch race with truncate */ + return 0; + else if (cl_offset(obj, index + 1) > kms) + /* catch sub-page write at end of file */ + return kms & ~PAGE_MASK; + else + return PAGE_SIZE; +} + +static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, + int cmd, int rc) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = oap2cl_page(oap); + enum cl_req_type crt; + int srvlock; + + ENTRY; + + cmd &= ~OBD_BRW_NOQUOTA; + LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ), + "cp_state:%u, cmd:%d\n", page->cp_state, cmd); + LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE), + "cp_state:%u, cmd:%d\n", page->cp_state, cmd); + LASSERT(opg->ops_transfer_pinned); + + crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE; + /* Clear opg->ops_transfer_pinned before VM lock is released. */ + opg->ops_transfer_pinned = 0; + + opg->ops_submit_time = ktime_set(0, 0); + srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; + + /* statistic */ + if (rc == 0 && srvlock) { + struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev; + struct osc_stats *stats = &lu2osc_dev(ld)->osc_stats; + size_t bytes = oap->oap_count; + + if (crt == CRT_READ) + stats->os_lockless_reads += bytes; + else + stats->os_lockless_writes += bytes; + } + + /* + * This has to be the last operation with the page, as locks are + * released in cl_page_completion() and nothing except for the + * reference counter protects page from concurrent reclaim. + */ + lu_ref_del(&page->cp_reference, "transfer", page); + + cl_page_completion(env, page, crt, rc); + cl_page_put(env, page); + + RETURN(0); +} + +#define OSC_DUMP_GRANT(mask, cli, fmt, args...) 
do { \ + struct client_obd *__tmp = (cli); \ + CDEBUG(mask, "%s: grant { dirty: %ld/%ld dirty_pages: %ld/%lu " \ + "dropped: %ld avail: %ld, dirty_grant: %ld, " \ + "reserved: %ld, flight: %d } lru {in list: %ld, " \ + "left: %ld, waiters: %d }" fmt "\n", \ + cli_name(__tmp), \ + __tmp->cl_dirty_pages, __tmp->cl_dirty_max_pages, \ + atomic_long_read(&obd_dirty_pages), obd_max_dirty_pages, \ + __tmp->cl_lost_grant, __tmp->cl_avail_grant, \ + __tmp->cl_dirty_grant, \ + __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, \ + atomic_long_read(&__tmp->cl_lru_in_list), \ + atomic_long_read(&__tmp->cl_lru_busy), \ + atomic_read(&__tmp->cl_lru_shrinkers), ##args); \ +} while (0) + +/* caller must hold loi_list_lock */ +static void osc_consume_write_grant(struct client_obd *cli, + struct brw_page *pga) +{ + assert_spin_locked(&cli->cl_loi_list_lock); + LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); + cli->cl_dirty_pages++; + pga->flag |= OBD_BRW_FROM_GRANT; + CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", + PAGE_SIZE, pga, pga->pg); +} + +/* the companion to osc_consume_write_grant, called when a brw has completed. + * must be called with the loi lock held. */ +static void osc_release_write_grant(struct client_obd *cli, + struct brw_page *pga) +{ + ENTRY; + + assert_spin_locked(&cli->cl_loi_list_lock); + if (!(pga->flag & OBD_BRW_FROM_GRANT)) { + EXIT; + return; + } + + pga->flag &= ~OBD_BRW_FROM_GRANT; + atomic_long_dec(&obd_dirty_pages); + cli->cl_dirty_pages--; + EXIT; +} + +/** + * To avoid sleeping with object lock held, it's good for us allocate enough + * grants before entering into critical section. + * + * client_obd_list_lock held by caller + */ +static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes) +{ + int rc = -EDQUOT; + + if (cli->cl_avail_grant >= bytes) { + cli->cl_avail_grant -= bytes; + cli->cl_reserved_grant += bytes; + rc = 0; + } + return rc; +} + +static void __osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + /* it's quite normal for us to get more grant than reserved. + * Thinking about a case that two extents merged by adding a new + * chunk, we can save one extent tax. If extent tax is greater than + * one chunk, we can save more grant by adding a new chunk */ + cli->cl_reserved_grant -= reserved; + if (unused > reserved) { + cli->cl_avail_grant += reserved; + cli->cl_lost_grant += unused - reserved; + cli->cl_dirty_grant -= unused - reserved; + } else { + cli->cl_avail_grant += unused; + cli->cl_dirty_grant += reserved - unused; + } +} + +static void osc_unreserve_grant_nolock(struct client_obd *cli, + unsigned int reserved, + unsigned int unused) +{ + __osc_unreserve_grant(cli, reserved, unused); + if (unused > 0) + osc_wake_cache_waiters(cli); +} + +static void osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + spin_lock(&cli->cl_loi_list_lock); + osc_unreserve_grant_nolock(cli, reserved, unused); + spin_unlock(&cli->cl_loi_list_lock); +} + +/** + * Free grant after IO is finished or canceled. + * + * @lost_grant is used to remember how many grants we have allocated but not + * used, we should return these grants to OST. There're two cases where grants + * can be lost: + * 1. truncate; + * 2. Without OBD_CONNECT_GRANT support and blocksize at OST is less than + * PAGE_SIZE and a partial page was written. In this case OST may use less + * chunks to serve this partial write. OSTs don't actually know the page + * size on the client side. 
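The reserve/unreserve pair above only ever moves bytes between the avail and reserved counters, with the unused remainder handed back at the end. A toy user-space sketch of that lifecycle (hypothetical struct and names; it deliberately leaves out the lost/dirty-grant adjustment __osc_unreserve_grant() makes when more grant comes back than was reserved):

#include <stdio.h>

/* Toy stand-in for the client_obd grant counters used above. */
struct sk_grant {
        long avail;     /* cl_avail_grant    */
        long reserved;  /* cl_reserved_grant */
};

static int sk_reserve(struct sk_grant *g, unsigned int bytes)
{
        if (g->avail < bytes)
                return -1;              /* -EDQUOT in the real code */
        g->avail -= bytes;
        g->reserved += bytes;
        return 0;
}

static void sk_unreserve(struct sk_grant *g, unsigned int reserved,
                         unsigned int unused)
{
        /* give back the part of the reservation that was not consumed; the
         * consumed part is now accounted in the extent (oe_grants), not here */
        g->reserved -= reserved;
        g->avail += unused < reserved ? unused : reserved;
}

int main(void)
{
        struct sk_grant g = { .avail = 1 << 20, .reserved = 0 };

        /* reserve one 64KiB chunk plus a 4KiB extent tax, consume the chunk */
        if (sk_reserve(&g, 65536 + 4096) == 0)
                sk_unreserve(&g, 65536 + 4096, 4096);   /* tax unused */
        printf("avail %ld reserved %ld\n", g.avail, g.reserved);
        return 0;
}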
so clients have to calculate lost grant by the + * blocksize on the OST. See tgt_grant_check() for details. + */ +static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, + unsigned int lost_grant, unsigned int dirty_grant) +{ + unsigned long grant; + + grant = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax; + + spin_lock(&cli->cl_loi_list_lock); + atomic_long_sub(nr_pages, &obd_dirty_pages); + cli->cl_dirty_pages -= nr_pages; + cli->cl_lost_grant += lost_grant; + cli->cl_dirty_grant -= dirty_grant; + if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) { + /* borrow some grant from truncate to avoid the case that + * truncate uses up all avail grant */ + cli->cl_lost_grant -= grant; + cli->cl_avail_grant += grant; + } + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu/%lu\n", + lost_grant, cli->cl_lost_grant, + cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_SHIFT, + cli->cl_dirty_grant); +} + +/** + * The companion to osc_enter_cache(), called when @oap is no longer part of + * the dirty accounting due to error. + */ +static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) +{ + spin_lock(&cli->cl_loi_list_lock); + osc_release_write_grant(cli, &oap->oap_brw_page); + spin_unlock(&cli->cl_loi_list_lock); +} + +/** + * Non-blocking version of osc_enter_cache() that consumes grant only when it + * is available. + */ +static int osc_enter_cache_try(struct client_obd *cli, + struct osc_async_page *oap, + int bytes) +{ + int rc; + + OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); + + rc = osc_reserve_grant(cli, bytes); + if (rc < 0) + return 0; + + if (cli->cl_dirty_pages < cli->cl_dirty_max_pages) { + if (atomic_long_add_return(1, &obd_dirty_pages) <= + obd_max_dirty_pages) { + osc_consume_write_grant(cli, &oap->oap_brw_page); + rc = 1; + goto out; + } else + atomic_long_dec(&obd_dirty_pages); + } + __osc_unreserve_grant(cli, bytes, bytes); + +out: + return rc; +} + +/* Following two inlines exist to pass code fragments + * to wait_event_idle_exclusive_timeout_cmd(). Passing + * code fragments as macro args can look confusing, so + * we provide inlines to encapsulate them. + */ +static inline void cli_unlock_and_unplug(const struct lu_env *env, + struct client_obd *cli, + struct osc_async_page *oap) +{ + spin_unlock(&cli->cl_loi_list_lock); + osc_io_unplug_async(env, cli, NULL); + CDEBUG(D_CACHE, + "%s: sleeping for cache space for %p\n", + cli_name(cli), oap); +} + +static inline void cli_lock_after_unplug(struct client_obd *cli) +{ + spin_lock(&cli->cl_loi_list_lock); +} +/** + * The main entry to reserve dirty page accounting. Usually the grant reserved + * in this function will be freed in bulk in osc_free_grant() unless it fails + * to add osc cache, in that case, it will be freed in osc_exit_cache(). + * + * The process will be put into sleep if it's already run out of grant. + */ +static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int bytes) +{ + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + int rc = -EDQUOT; + int remain; + bool entered = false; + /* We cannot wait for a long time here since we are holding ldlm lock + * across the actual IO. If no requests complete fast (e.g. due to + * overloaded OST that takes a long time to process everything, we'd + * get evicted if we wait for a normal obd_timeout or some such. 
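osc_enter_cache_try() above is a check-then-rollback admission test: reserve grant, try to take a dirty-page slot under both the per-client and global limits, and undo the reservation if that fails. A simplified single-threaded sketch with hypothetical limits (no locking, no atomics):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical per-client and global limits. */
static long avail_grant = 1 << 20;
static long dirty_pages, dirty_max_pages = 2000;
static long global_dirty, global_dirty_max = 100000;

static bool enter_cache_try(unsigned int bytes)
{
        if (avail_grant < bytes)
                return false;                   /* out of grant */
        avail_grant -= bytes;                   /* reserve */

        if (dirty_pages < dirty_max_pages) {
                if (++global_dirty <= global_dirty_max) {
                        dirty_pages++;          /* page admitted */
                        return true;
                }
                global_dirty--;                 /* undo the global count */
        }

        avail_grant += bytes;                   /* undo the reservation */
        return false;
}

int main(void)
{
        printf("admitted: %d\n", enter_cache_try(65536 + 4096));
        return 0;
}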
+ * So we try to wait half the time it would take the client to be + * evicted by server which is half obd_timeout when AT is off + * or at least ldlm_enqueue_min with AT on. + * See LU-13131 */ + unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout / 2 : + ldlm_enqueue_min / 2); + + ENTRY; + + OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); + + spin_lock(&cli->cl_loi_list_lock); + + /* force the caller to try sync io. this can jump the list + * of queued writes and create a discontiguous rpc stream */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) || + cli->cl_dirty_max_pages == 0 || + cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) { + OSC_DUMP_GRANT(D_CACHE, cli, "forced sync i/o\n"); + GOTO(out, rc = -EDQUOT); + } + + /* + * We can wait here for two reasons: too many dirty pages in cache, or + * run out of grants. In both cases we should write dirty pages out. + * Adding a cache waiter will trigger urgent write-out no matter what + * RPC size will be. + * The exiting condition (other than success) is no avail grants + * and no dirty pages caching, that really means there is no space + * on the OST. + */ + remain = wait_event_idle_exclusive_timeout_cmd( + cli->cl_cache_waiters, + (entered = osc_enter_cache_try(cli, oap, bytes)) || + (cli->cl_dirty_pages == 0 && cli->cl_w_in_flight == 0), + timeout, + cli_unlock_and_unplug(env, cli, oap), + cli_lock_after_unplug(cli)); + + if (entered) { + if (remain == timeout) + OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); + else + OSC_DUMP_GRANT(D_CACHE, cli, + "finally got grant space\n"); + wake_up(&cli->cl_cache_waiters); + rc = 0; + } else if (remain == 0) { + OSC_DUMP_GRANT(D_CACHE, cli, + "timeout, fall back to sync i/o\n"); + osc_extent_tree_dump(D_CACHE, osc); + /* fall back to synchronous I/O */ + } else { + OSC_DUMP_GRANT(D_CACHE, cli, + "no grant space, fall back to sync i/o\n"); + wake_up_all(&cli->cl_cache_waiters); + } + EXIT; +out: + spin_unlock(&cli->cl_loi_list_lock); + RETURN(rc); +} + +static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) +{ + int hprpc = !!list_empty(&osc->oo_hp_exts); + return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc; +} + +/* This maintains the lists of pending pages to read/write for a given object + * (lop). This is used by osc_check_rpcs->osc_next_obj() and osc_list_maint() + * to quickly find objects that are ready to send an RPC. */ +static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, + int cmd) +{ + int invalid_import = 0; + ENTRY; + + /* if we have an invalid import we want to drain the queued pages + * by forcing them through rpcs that immediately fail and complete + * the pages. recovery relies on this to empty the queued pages + * before canceling the locks and evicting down the llite pages */ + if ((cli->cl_import == NULL || cli->cl_import->imp_invalid)) + invalid_import = 1; + + if (cmd & OBD_BRW_WRITE) { + if (atomic_read(&osc->oo_nr_writes) == 0) + RETURN(0); + if (invalid_import) { + CDEBUG(D_CACHE, "invalid import forcing RPC\n"); + RETURN(1); + } + if (!list_empty(&osc->oo_hp_exts)) { + CDEBUG(D_CACHE, "high prio request forcing RPC\n"); + RETURN(1); + } + if (!list_empty(&osc->oo_urgent_exts)) { + CDEBUG(D_CACHE, "urgent request forcing RPC\n"); + RETURN(1); + } + /* trigger a write rpc stream as long as there are dirtiers + * waiting for space. as they're waiting, they're not going to + * create more pages to coalesce with what's waiting.. 
+ */ + if (waitqueue_active(&cli->cl_cache_waiters)) { + CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); + RETURN(1); + } + if (!list_empty(&osc->oo_full_exts)) { + CDEBUG(D_CACHE, "full extent ready, make an RPC\n"); + RETURN(1); + } + } else { + if (atomic_read(&osc->oo_nr_reads) == 0) + RETURN(0); + if (invalid_import) { + CDEBUG(D_CACHE, "invalid import forcing RPC\n"); + RETURN(1); + } + /* all read are urgent. */ + if (!list_empty(&osc->oo_reading_exts)) + RETURN(1); + } + + RETURN(0); +} + +static void osc_update_pending(struct osc_object *obj, int cmd, int delta) +{ + struct client_obd *cli = osc_cli(obj); + if (cmd & OBD_BRW_WRITE) { + atomic_add(delta, &obj->oo_nr_writes); + atomic_add(delta, &cli->cl_pending_w_pages); + LASSERT(atomic_read(&obj->oo_nr_writes) >= 0); + } else { + atomic_add(delta, &obj->oo_nr_reads); + atomic_add(delta, &cli->cl_pending_r_pages); + LASSERT(atomic_read(&obj->oo_nr_reads) >= 0); + } + OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta); +} + +static int osc_makes_hprpc(struct osc_object *obj) +{ + return !list_empty(&obj->oo_hp_exts); +} + +static void on_list(struct list_head *item, struct list_head *list, + int should_be_on) +{ + if (list_empty(item) && should_be_on) + list_add_tail(item, list); + else if (!list_empty(item) && !should_be_on) + list_del_init(item); +} + +/* maintain the osc's cli list membership invariants so that osc_send_oap_rpc + * can find pages to build into rpcs quickly */ +static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc) +{ + if (osc_makes_hprpc(osc)) { + /* HP rpc */ + on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0); + on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1); + } else { + on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0); + on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, + osc_makes_rpc(cli, osc, OBD_BRW_WRITE) || + osc_makes_rpc(cli, osc, OBD_BRW_READ)); + } + + on_list(&osc->oo_write_item, &cli->cl_loi_write_list, + atomic_read(&osc->oo_nr_writes) > 0); + + on_list(&osc->oo_read_item, &cli->cl_loi_read_list, + atomic_read(&osc->oo_nr_reads) > 0); + + return osc_is_ready(osc); +} + +static int osc_list_maint(struct client_obd *cli, struct osc_object *osc) +{ + int is_ready; + + spin_lock(&cli->cl_loi_list_lock); + is_ready = __osc_list_maint(cli, osc); + spin_unlock(&cli->cl_loi_list_lock); + + return is_ready; +} + +/* this is trying to propogate async writeback errors back up to the + * application. As an async write fails we record the error code for later if + * the app does an fsync. As long as errors persist we force future rpcs to be + * sync so that the app can get a sync error and break the cycle of queueing + * pages for which writeback will fail. 
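The error propagation described above is a small latch keyed on RPC xids: the first failure records an error and forces synchronous writes, and only a success with an xid newer than the failure point releases it. A standalone sketch of those transitions (hypothetical types; mirrors osc_process_ar() below):

#include <stdint.h>
#include <stdio.h>

struct sk_async_rc {
        int      rc;            /* first error seen, reported at fsync */
        int      force_sync;    /* force synchronous writes while set */
        uint64_t min_xid;       /* xid that must complete OK to clear it */
};

static uint64_t next_xid = 100; /* stand-in for ptlrpc_sample_next_xid() */

static void sk_process_ar(struct sk_async_rc *ar, uint64_t xid, int rc)
{
        if (rc) {
                if (!ar->rc)
                        ar->rc = rc;            /* remember the first error */
                ar->force_sync = 1;
                ar->min_xid = next_xid;         /* wait for a newer success */
                return;
        }
        if (ar->force_sync && xid >= ar->min_xid)
                ar->force_sync = 0;             /* healthy again */
}

int main(void)
{
        struct sk_async_rc ar = { 0 };

        sk_process_ar(&ar, 42, -5);     /* -EIO on xid 42 */
        sk_process_ar(&ar, 99, 0);      /* old success: still forced sync */
        sk_process_ar(&ar, 120, 0);     /* newer success clears force_sync */
        printf("rc %d force_sync %d\n", ar.rc, ar.force_sync);
        return 0;
}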
*/ +static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, + int rc) +{ + if (rc) { + if (!ar->ar_rc) + ar->ar_rc = rc; + + ar->ar_force_sync = 1; + ar->ar_min_xid = ptlrpc_sample_next_xid(); + return; + + } + + if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) + ar->ar_force_sync = 0; +} + +/* this must be called holding the loi list lock to give coverage to exit_cache, + * async_flag maintenance, and oap_request */ +static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int sent, int rc) +{ + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + __u64 xid = 0; + + ENTRY; + if (oap->oap_request != NULL) { + xid = ptlrpc_req_xid(oap->oap_request); + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = NULL; + } + + /* As the transfer for this page is being done, clear the flags */ + spin_lock(&oap->oap_lock); + oap->oap_async_flags = 0; + spin_unlock(&oap->oap_lock); + + if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) { + spin_lock(&cli->cl_loi_list_lock); + osc_process_ar(&cli->cl_ar, xid, rc); + osc_process_ar(&loi->loi_ar, xid, rc); + spin_unlock(&cli->cl_loi_list_lock); + } + + rc = osc_completion(env, oap, oap->oap_cmd, rc); + if (rc) + CERROR("completion on oap %p obj %p returns %d.\n", + oap, osc, rc); + + EXIT; +} + +struct extent_rpc_data { + struct list_head *erd_rpc_list; + unsigned int erd_page_count; + unsigned int erd_max_pages; + unsigned int erd_max_chunks; + unsigned int erd_max_extents; +}; + +static inline unsigned osc_extent_chunks(const struct osc_extent *ext) +{ + struct client_obd *cli = osc_cli(ext->oe_obj); + unsigned ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; + + return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1; +} + +static inline bool +can_merge(const struct osc_extent *ext, const struct osc_extent *in_rpc) +{ + if (ext->oe_no_merge || in_rpc->oe_no_merge) + return false; + + if (ext->oe_srvlock != in_rpc->oe_srvlock) + return false; + + if (ext->oe_ndelay != in_rpc->oe_ndelay) + return false; + + if (!ext->oe_grants != !in_rpc->oe_grants) + return false; + + if (ext->oe_dio != in_rpc->oe_dio) + return false; + + /* It's possible to have overlap on DIO */ + if (in_rpc->oe_dio && overlapped(ext, in_rpc)) + return false; + + if (ext->oe_is_rdma_only != in_rpc->oe_is_rdma_only) + return false; + + return true; +} + +/** + * Try to add extent to one RPC. 
We need to think about the following things: + * - # of pages must not be over max_pages_per_rpc + * - extent must be compatible with previous ones + */ +static int try_to_add_extent_for_io(struct client_obd *cli, + struct osc_extent *ext, + struct extent_rpc_data *data) +{ + struct osc_extent *tmp; + unsigned int chunk_count; + ENTRY; + + EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE), + ext); + OSC_EXTENT_DUMP(D_CACHE, ext, "trying to add this extent\n"); + + if (data->erd_max_extents == 0) + RETURN(0); + + chunk_count = osc_extent_chunks(ext); + EASSERTF(data->erd_page_count != 0 || + chunk_count <= data->erd_max_chunks, ext, + "The first extent to be fit in a RPC contains %u chunks, " + "which is over the limit %u.\n", chunk_count, + data->erd_max_chunks); + if (chunk_count > data->erd_max_chunks) + RETURN(0); + + data->erd_max_pages = max(ext->oe_mppr, data->erd_max_pages); + EASSERTF(data->erd_page_count != 0 || + ext->oe_nr_pages <= data->erd_max_pages, ext, + "The first extent to be fit in a RPC contains %u pages, " + "which is over the limit %u.\n", ext->oe_nr_pages, + data->erd_max_pages); + if (data->erd_page_count + ext->oe_nr_pages > data->erd_max_pages) + RETURN(0); + + list_for_each_entry(tmp, data->erd_rpc_list, oe_link) { + EASSERT(tmp->oe_owner == current, tmp); + + if (!can_merge(ext, tmp)) + RETURN(0); + } + + data->erd_max_extents--; + data->erd_max_chunks -= chunk_count; + data->erd_page_count += ext->oe_nr_pages; + list_move_tail(&ext->oe_link, data->erd_rpc_list); + ext->oe_owner = current; + RETURN(1); +} + +/** + * In order to prevent multiple ptlrpcd from breaking contiguous extents, + * get_write_extent() takes all appropriate extents in atomic. + * + * The following policy is used to collect extents for IO: + * 1. Add as many HP extents as possible; + * 2. Add the first urgent extent in urgent extent list and take it out of + * urgent list; + * 3. Add subsequent extents of this urgent extent; + * 4. If urgent list is not empty, goto 2; + * 5. Traverse the extent tree from the 1st extent; + * 6. Above steps exit if there is no space in this RPC. + */ +static unsigned int get_write_extents(struct osc_object *obj, + struct list_head *rpclist) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct extent_rpc_data data = { + .erd_rpc_list = rpclist, + .erd_page_count = 0, + .erd_max_pages = cli->cl_max_pages_per_rpc, + .erd_max_chunks = osc_max_write_chunks(cli), + .erd_max_extents = 256, + }; + + assert_osc_object_is_locked(obj); + while ((ext = list_first_entry_or_null(&obj->oo_hp_exts, + struct osc_extent, + oe_link)) != NULL) { + if (!try_to_add_extent_for_io(cli, ext, &data)) + return data.erd_page_count; + EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); + } + if (data.erd_page_count == data.erd_max_pages) + return data.erd_page_count; + + while ((ext = list_first_entry_or_null(&obj->oo_urgent_exts, + struct osc_extent, + oe_link)) != NULL) { + if (!try_to_add_extent_for_io(cli, ext, &data)) + return data.erd_page_count; + } + if (data.erd_page_count == data.erd_max_pages) + return data.erd_page_count; + + /* One key difference between full extents and other extents: full + * extents can usually only be added if the rpclist was empty, so if we + * can't add one, we continue on to trying to add normal extents. This + * is so we don't miss adding extra extents to an RPC containing high + * priority or urgent extents. 
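try_to_add_extent_for_io() above boils down to a three-way budget check (extents, chunks, pages) plus a compatibility test against extents already queued for the RPC. A reduced sketch of the budget part, with hypothetical types and the can_merge() test left out:

#include <stdbool.h>
#include <stdio.h>

struct sk_extent { unsigned int nr_pages, nr_chunks; };

struct sk_rpc_budget {
        unsigned int pages, max_pages;
        unsigned int max_chunks;
        unsigned int max_extents;
};

/* Returns true and charges the budget if the extent fits into the RPC. */
static bool sk_try_add(struct sk_rpc_budget *b, const struct sk_extent *ext)
{
        if (b->max_extents == 0)
                return false;
        /* a first over-sized extent would be a bug; later ones just stop
         * the RPC from growing */
        if (ext->nr_chunks > b->max_chunks)
                return false;
        if (b->pages + ext->nr_pages > b->max_pages)
                return false;

        b->max_extents--;
        b->max_chunks -= ext->nr_chunks;
        b->pages += ext->nr_pages;
        return true;
}

int main(void)
{
        struct sk_rpc_budget b = {
                .max_pages = 256, .max_chunks = 16, .max_extents = 256,
        };
        struct sk_extent e1 = { .nr_pages = 160, .nr_chunks = 10 };
        struct sk_extent e2 = { .nr_pages = 120, .nr_chunks = 8 };

        printf("e1 %d, e2 %d, pages now %u\n",
               sk_try_add(&b, &e1), sk_try_add(&b, &e2), b.pages);
        return 0;
}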
+ */ + while ((ext = list_first_entry_or_null(&obj->oo_full_exts, + struct osc_extent, + oe_link)) != NULL) { + if (!try_to_add_extent_for_io(cli, ext, &data)) + break; + } + if (data.erd_page_count == data.erd_max_pages) + return data.erd_page_count; + + for (ext = first_extent(obj); + ext; + ext = next_extent(ext)) { + if ((ext->oe_state != OES_CACHE) || + /* this extent may be already in current rpclist */ + (!list_empty(&ext->oe_link) && ext->oe_owner)) + continue; + + if (!try_to_add_extent_for_io(cli, ext, &data)) + return data.erd_page_count; + } + return data.erd_page_count; +} + +static int +osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc) +__must_hold(osc) +{ + LIST_HEAD(rpclist); + struct osc_extent *ext; + struct osc_extent *tmp; + struct osc_extent *first = NULL; + unsigned int page_count = 0; + int srvlock = 0; + int rc = 0; + ENTRY; + + assert_osc_object_is_locked(osc); + + page_count = get_write_extents(osc, &rpclist); + LASSERT(equi(page_count == 0, list_empty(&rpclist))); + + if (list_empty(&rpclist)) + RETURN(0); + + osc_update_pending(osc, OBD_BRW_WRITE, -page_count); + + list_for_each_entry(ext, &rpclist, oe_link) { + LASSERT(ext->oe_state == OES_CACHE || + ext->oe_state == OES_LOCK_DONE); + if (ext->oe_state == OES_CACHE) + osc_extent_state_set(ext, OES_LOCKING); + else + osc_extent_state_set(ext, OES_RPC); + } + + /* we're going to grab page lock, so release object lock because + * lock order is page lock -> object lock. */ + osc_object_unlock(osc); + + list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) { + if (ext->oe_state == OES_LOCKING) { + rc = osc_extent_make_ready(env, ext); + if (unlikely(rc < 0)) { + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 0, rc); + continue; + } + } + if (first == NULL) { + first = ext; + srvlock = ext->oe_srvlock; + } else { + LASSERT(srvlock == ext->oe_srvlock); + } + } + + if (!list_empty(&rpclist)) { + LASSERT(page_count > 0); + rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE); + LASSERT(list_empty(&rpclist)); + } + + osc_object_lock(osc); + RETURN(rc); +} + +/** + * prepare pages for ASYNC io and put pages in send queue. + * + * \param cmd OBD_BRW_* macroses + * \param lop pending pages + * + * \return zero if no page added to send queue. + * \return 1 if pages successfully added to send queue. + * \return negative on errors. 
+ */ +static int +osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc) +__must_hold(osc) +{ + struct osc_extent *ext; + struct osc_extent *next; + LIST_HEAD(rpclist); + struct extent_rpc_data data = { + .erd_rpc_list = &rpclist, + .erd_page_count = 0, + .erd_max_pages = cli->cl_max_pages_per_rpc, + .erd_max_chunks = UINT_MAX, + .erd_max_extents = UINT_MAX, + }; + int rc = 0; + ENTRY; + + assert_osc_object_is_locked(osc); + list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) { + EASSERT(ext->oe_state == OES_LOCK_DONE, ext); + if (!try_to_add_extent_for_io(cli, ext, &data)) + break; + osc_extent_state_set(ext, OES_RPC); + EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); + } + LASSERT(data.erd_page_count <= data.erd_max_pages); + + osc_update_pending(osc, OBD_BRW_READ, -data.erd_page_count); + + if (!list_empty(&rpclist)) { + osc_object_unlock(osc); + + rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ); + LASSERT(list_empty(&rpclist)); + + osc_object_lock(osc); + } + RETURN(rc); +} + +#define list_to_obj(list, item) ({ \ + struct list_head *__tmp = (list)->next; \ + list_del_init(__tmp); \ + list_entry(__tmp, struct osc_object, oo_##item); \ +}) + +/* This is called by osc_check_rpcs() to find which objects have pages that + * we could be sending. These lists are maintained by osc_makes_rpc(). */ +static struct osc_object *osc_next_obj(struct client_obd *cli) +{ + ENTRY; + + /* First return objects that have blocked locks so that they + * will be flushed quickly and other clients can get the lock, + * then objects which have pages ready to be stuffed into RPCs */ + if (!list_empty(&cli->cl_loi_hp_ready_list)) + RETURN(list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item)); + if (!list_empty(&cli->cl_loi_ready_list)) + RETURN(list_to_obj(&cli->cl_loi_ready_list, ready_item)); + + /* then if we have cache waiters, return all objects with queued + * writes. 
This is especially important when many small files + * have filled up the cache and not been fired into rpcs because + * they don't pass the nr_pending/object threshhold + */ + if (waitqueue_active(&cli->cl_cache_waiters) && + !list_empty(&cli->cl_loi_write_list)) + RETURN(list_to_obj(&cli->cl_loi_write_list, write_item)); + + /* then return all queued objects when we have an invalid import + * so that they get flushed */ + if (cli->cl_import == NULL || cli->cl_import->imp_invalid) { + if (!list_empty(&cli->cl_loi_write_list)) + RETURN(list_to_obj(&cli->cl_loi_write_list, + write_item)); + if (!list_empty(&cli->cl_loi_read_list)) + RETURN(list_to_obj(&cli->cl_loi_read_list, + read_item)); + } + RETURN(NULL); +} + +/* called with the loi list lock held */ +static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli) +__must_hold(&cli->cl_loi_list_lock) +{ + struct osc_object *osc; + int rc = 0; + ENTRY; + + while ((osc = osc_next_obj(cli)) != NULL) { + struct cl_object *obj = osc2cl(osc); + struct lu_ref_link link; + + OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); + + /* even if we have reached our max in flight RPCs, we still + * allow all high-priority RPCs through to prevent their + * starvation and leading to server evicting us for not + * writing out pages in a timely manner LU-13131 */ + if (osc_max_rpc_in_flight(cli, osc) && + list_empty(&osc->oo_hp_exts)) { + __osc_list_maint(cli, osc); + break; + } + + cl_object_get(obj); + spin_unlock(&cli->cl_loi_list_lock); + lu_object_ref_add_at(&obj->co_lu, &link, "check", current); + + /* attempt some read/write balancing by alternating between + * reads and writes in an object. The makes_rpc checks here + * would be redundant if we were getting read/write work items + * instead of objects. we don't want send_oap_rpc to drain a + * partial read pending queue when we're given this object to + * do io on writes while there are cache waiters */ + osc_object_lock(osc); + if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) { + rc = osc_send_write_rpc(env, cli, osc); + if (rc < 0) { + CERROR("Write request failed with %d\n", rc); + + /* osc_send_write_rpc failed, mostly because of + * memory pressure. + * + * It can't break here, because if: + * - a page was submitted by osc_io_submit, so + * page locked; + * - no request in flight + * - no subsequent request + * The system will be in live-lock state, + * because there is no chance to call + * osc_io_unplug() and osc_check_rpcs() any + * more. pdflush can't help in this case, + * because it might be blocked at grabbing + * the page lock as we mentioned. + * + * Anyway, continue to drain pages. 
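osc_next_obj() above encodes a strict service order: objects with blocked (high-priority) locks first, then RPC-ready objects, then anything with queued writes while writers wait on cache space, and finally everything once the import is invalid. A compact sketch of that ordering over hypothetical readiness flags:

#include <stdbool.h>
#include <stdio.h>

enum sk_pick { SK_NONE, SK_HP_READY, SK_READY, SK_WRITE, SK_READ };

struct sk_cli {
        bool hp_ready, ready, write_list, read_list;
        bool cache_waiters, import_invalid;
};

static enum sk_pick sk_next_obj(const struct sk_cli *c)
{
        if (c->hp_ready)
                return SK_HP_READY;             /* blocked locks first */
        if (c->ready)
                return SK_READY;                /* RPC-ready objects */
        if (c->cache_waiters && c->write_list)
                return SK_WRITE;                /* flush writes for waiters */
        if (c->import_invalid) {                /* drain everything */
                if (c->write_list)
                        return SK_WRITE;
                if (c->read_list)
                        return SK_READ;
        }
        return SK_NONE;
}

int main(void)
{
        struct sk_cli c = { .write_list = true, .cache_waiters = true };

        printf("pick = %d\n", sk_next_obj(&c)); /* SK_WRITE */
        return 0;
}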
*/ + /* break; */ + } + } + if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) { + rc = osc_send_read_rpc(env, cli, osc); + if (rc < 0) + CERROR("Read request failed with %d\n", rc); + } + osc_object_unlock(osc); + + osc_list_maint(cli, osc); + lu_object_ref_del_at(&obj->co_lu, &link, "check", current); + cl_object_put(env, obj); + + spin_lock(&cli->cl_loi_list_lock); + } +} + +int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, int async) +{ + int rc = 0; + + if (osc != NULL && osc_list_maint(cli, osc) == 0) + return 0; + + if (!async) { + spin_lock(&cli->cl_loi_list_lock); + osc_check_rpcs(env, cli); + spin_unlock(&cli->cl_loi_list_lock); + } else { + CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli); + LASSERT(cli->cl_writeback_work != NULL); + rc = ptlrpcd_queue_work(cli->cl_writeback_work); + } + return rc; +} +EXPORT_SYMBOL(osc_io_unplug0); + +int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, + struct cl_page *page, loff_t offset) +{ + struct obd_export *exp = osc_export(osc); + struct osc_async_page *oap = &ops->ops_oap; + struct page *vmpage = page->cp_vmpage; + ENTRY; + + if (!page) + return cfs_size_round(sizeof(*oap)); + + oap->oap_magic = OAP_MAGIC; + oap->oap_cli = &exp->exp_obd->u.cli; + oap->oap_obj = osc; + + oap->oap_page = vmpage; + oap->oap_obj_off = offset; + LASSERT(!(offset & ~PAGE_MASK)); + + /* Count of transient (direct i/o) pages is always stable by the time + * they're submitted. Setting this here lets us avoid calling + * cl_page_clip later to set this. + */ + if (page->cp_type == CPT_TRANSIENT) + oap->oap_async_flags |= ASYNC_COUNT_STABLE|ASYNC_URGENT| + ASYNC_READY; + + INIT_LIST_HEAD(&oap->oap_pending_item); + INIT_LIST_HEAD(&oap->oap_rpc_item); + + spin_lock_init(&oap->oap_lock); + CDEBUG(D_INFO, "oap %p vmpage %p obj off %llu\n", + oap, vmpage, oap->oap_obj_off); + RETURN(0); +} +EXPORT_SYMBOL(osc_prep_async_page); + +int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, cl_commit_cbt cb) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_extent *ext = NULL; + struct osc_async_page *oap = &ops->ops_oap; + struct client_obd *cli = oap->oap_cli; + struct osc_object *osc = oap->oap_obj; + struct pagevec *pvec = &osc_env_info(env)->oti_pagevec; + pgoff_t index; + unsigned int tmp; + unsigned int grants = 0; + u32 brw_flags = OBD_BRW_ASYNC; + int cmd = OBD_BRW_WRITE; + int need_release = 0; + int rc = 0; + ENTRY; + + if (oap->oap_magic != OAP_MAGIC) + RETURN(-EINVAL); + + if (cli->cl_import == NULL || cli->cl_import->imp_invalid) + RETURN(-EIO); + + if (!list_empty(&oap->oap_pending_item) || + !list_empty(&oap->oap_rpc_item)) + RETURN(-EBUSY); + + /* Set the OBD_BRW_SRVLOCK before the page is queued. */ + brw_flags |= ops->ops_srvlock ? 
OBD_BRW_SRVLOCK : 0; + if (io->ci_noquota) { + brw_flags |= OBD_BRW_NOQUOTA; + cmd |= OBD_BRW_NOQUOTA; + } + + if (oio->oi_cap_sys_resource) { + brw_flags |= OBD_BRW_SYS_RESOURCE; + cmd |= OBD_BRW_SYS_RESOURCE; + } + + /* check if the file's owner/group is over quota */ + /* do not check for root without root squash, because in this case + * we should bypass quota + */ + if ((!oio->oi_cap_sys_resource || + cli->cl_root_squash) && + !io->ci_noquota) { + struct cl_object *obj; + struct cl_attr *attr; + unsigned int qid[LL_MAXQUOTAS]; + + obj = cl_object_top(&osc->oo_cl); + attr = &osc_env_info(env)->oti_attr; + + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + + qid[USRQUOTA] = attr->cat_uid; + qid[GRPQUOTA] = attr->cat_gid; + qid[PRJQUOTA] = attr->cat_projid; + if (rc == 0 && osc_quota_chkdq(cli, qid) == -EDQUOT) + rc = -EDQUOT; + if (rc) + RETURN(rc); + } + + oap->oap_cmd = cmd; + oap->oap_page_off = ops->ops_from; + oap->oap_count = ops->ops_to - ops->ops_from + 1; + /* No need to hold a lock here, + * since this page is not in any list yet. */ + oap->oap_async_flags = 0; + oap->oap_brw_flags = brw_flags; + + OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n", + oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK); + + index = osc_index(oap2osc(oap)); + + /* Add this page into extent by the following steps: + * 1. if there exists an active extent for this IO, mostly this page + * can be added to the active extent and sometimes we need to + * expand extent to accomodate this page; + * 2. otherwise, a new extent will be allocated. */ + + ext = oio->oi_active; + if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) { + /* one chunk plus extent overhead must be enough to write this + * page */ + grants = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax; + if (ext->oe_end >= index) + grants = 0; + + /* it doesn't need any grant to dirty this page */ + spin_lock(&cli->cl_loi_list_lock); + rc = osc_enter_cache_try(cli, oap, grants); + if (rc == 0) { /* try failed */ + grants = 0; + need_release = 1; + } else if (ext->oe_end < index) { + tmp = grants; + /* try to expand this extent */ + rc = osc_extent_expand(ext, index, &tmp); + if (rc < 0) { + need_release = 1; + /* don't free reserved grant */ + } else { + OSC_EXTENT_DUMP(D_CACHE, ext, + "expanded for %lu.\n", index); + osc_unreserve_grant_nolock(cli, grants, tmp); + grants = 0; + } + } + spin_unlock(&cli->cl_loi_list_lock); + rc = 0; + } else if (ext != NULL) { + /* index is located outside of active extent */ + need_release = 1; + } + if (need_release) { + osc_extent_release(env, ext); + oio->oi_active = NULL; + ext = NULL; + } + + if (ext == NULL) { + tmp = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax; + + /* try to find new extent to cover this page */ + LASSERT(oio->oi_active == NULL); + /* we may have allocated grant for this page if we failed + * to expand the previous active extent. */ + LASSERT(ergo(grants > 0, grants >= tmp)); + + rc = 0; + + /* We must not hold a page lock while we do osc_enter_cache() + * or osc_extent_find(), so we must mark dirty & unlock + * any pages in the write commit pagevec. 
*/ + if (pagevec_count(pvec)) { + cb(env, io, pvec); + pagevec_reinit(pvec); + } + + if (grants == 0) { + rc = osc_enter_cache(env, cli, oap, tmp); + if (rc == 0) + grants = tmp; + } + + tmp = grants; + if (rc == 0) { + ext = osc_extent_find(env, osc, index, &tmp); + if (IS_ERR(ext)) { + LASSERT(tmp == grants); + osc_exit_cache(cli, oap); + rc = PTR_ERR(ext); + ext = NULL; + } else { + oio->oi_active = ext; + } + } + if (grants > 0) + osc_unreserve_grant(cli, grants, tmp); + } + + LASSERT(ergo(rc == 0, ext != NULL)); + if (ext != NULL) { + EASSERTF(ext->oe_end >= index && ext->oe_start <= index, + ext, "index = %lu.\n", index); + LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0); + + osc_object_lock(osc); + if (ext->oe_nr_pages == 0) + ext->oe_srvlock = ops->ops_srvlock; + else + LASSERT(ext->oe_srvlock == ops->ops_srvlock); + ++ext->oe_nr_pages; + list_add_tail(&oap->oap_pending_item, &ext->oe_pages); + osc_object_unlock(osc); + + if (!ext->oe_layout_version) + ext->oe_layout_version = io->ci_layout_version; + } + + RETURN(rc); +} + +int osc_teardown_async_page(const struct lu_env *env, + struct osc_object *obj, struct osc_page *ops) +{ + struct osc_async_page *oap = &ops->ops_oap; + int rc = 0; + ENTRY; + + LASSERT(oap->oap_magic == OAP_MAGIC); + + CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n", + oap, ops, osc_index(oap2osc(oap))); + + if (!list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap); + rc = -EBUSY; + } else if (!list_empty(&oap->oap_pending_item)) { + struct osc_extent *ext = NULL; + + osc_object_lock(obj); + ext = osc_extent_lookup(obj, osc_index(oap2osc(oap))); + osc_object_unlock(obj); + /* only truncated pages are allowed to be taken out. + * See osc_extent_truncate() and osc_cache_truncate_start() + * for details. */ + if (ext != NULL && ext->oe_state != OES_TRUNC) { + OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n", + osc_index(oap2osc(oap))); + rc = -EBUSY; + } + if (ext != NULL) + osc_extent_put(env, ext); + } + RETURN(rc); +} + +/** + * This is called when a page is picked up by kernel to write out. + * + * We should find out the corresponding extent and add the whole extent + * into urgent list. The extent may be being truncated or used, handle it + * carefully. + */ +int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops) +{ + struct osc_extent *ext = NULL; + struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj); + struct cl_page *cp = ops->ops_cl.cpl_page; + pgoff_t index = osc_index(ops); + struct osc_async_page *oap = &ops->ops_oap; + bool unplug = false; + int rc = 0; + ENTRY; + + osc_object_lock(obj); + ext = osc_extent_lookup(obj, index); + if (ext == NULL) { + osc_extent_tree_dump(D_ERROR, obj); + LASSERTF(0, "page index %lu is NOT covered.\n", index); + } + + switch (ext->oe_state) { + case OES_RPC: + case OES_LOCK_DONE: + CL_PAGE_DEBUG(D_ERROR, env, cp, "flush an in-rpc page?\n"); + LASSERT(0); + break; + case OES_LOCKING: + /* If we know this extent is being written out, we should abort + * so that the writer can make this page ready. Otherwise, there + * exists a deadlock problem because other process can wait for + * page writeback bit holding page lock; and meanwhile in + * vvp_page_make_ready(), we need to grab page lock before + * really sending the RPC. */ + case OES_TRUNC: + /* race with truncate, page will be redirtied */ + case OES_ACTIVE: + /* The extent is active so we need to abort and let the caller + * re-dirty the page. 
If we continued on here, and we were the + * one making the extent active, we could deadlock waiting for + * the page writeback to clear but it won't because the extent + * is active and won't be written out. */ + GOTO(out, rc = -EAGAIN); + default: + break; + } + + rc = cl_page_prep(env, io, cp, CRT_WRITE); + if (rc) + GOTO(out, rc); + + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT; + spin_unlock(&oap->oap_lock); + + if (current->flags & PF_MEMALLOC) + ext->oe_memalloc = 1; + + ext->oe_urgent = 1; + if (ext->oe_state == OES_CACHE) { + OSC_EXTENT_DUMP(D_CACHE, ext, + "flush page %p make it urgent.\n", oap); + if (list_empty(&ext->oe_link)) + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + unplug = true; + } + rc = 0; + EXIT; + +out: + osc_object_unlock(obj); + osc_extent_put(env, ext); + if (unplug) + osc_io_unplug_async(env, osc_cli(obj), obj); + return rc; +} + +int osc_queue_sync_pages(const struct lu_env *env, struct cl_io *io, + struct osc_object *obj, struct list_head *list, + int brw_flags) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_async_page *oap; + int page_count = 0; + int mppr = cli->cl_max_pages_per_rpc; + bool can_merge = true; + pgoff_t start = CL_PAGE_EOF; + pgoff_t end = 0; + ENTRY; + + list_for_each_entry(oap, list, oap_pending_item) { + struct osc_page *opg = oap2osc_page(oap); + pgoff_t index = osc_index(opg); + + if (index > end) + end = index; + if (index < start) + start = index; + ++page_count; + mppr <<= (page_count > mppr); + + if (unlikely(opg->ops_from > 0 || + opg->ops_to < PAGE_SIZE - 1)) + can_merge = false; + } + + ext = osc_extent_alloc(obj); + if (ext == NULL) { + struct osc_async_page *tmp; + + list_for_each_entry_safe(oap, tmp, list, oap_pending_item) { + list_del_init(&oap->oap_pending_item); + osc_ap_completion(env, cli, oap, 0, -ENOMEM); + } + RETURN(-ENOMEM); + } + + ext->oe_rw = !!(brw_flags & OBD_BRW_READ); + ext->oe_sync = 1; + ext->oe_no_merge = !can_merge; + ext->oe_urgent = 1; + ext->oe_start = start; + ext->oe_end = ext->oe_max_end = end; + ext->oe_obj = obj; + ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); + ext->oe_ndelay = !!(brw_flags & OBD_BRW_NDELAY); + ext->oe_dio = !!(brw_flags & OBD_BRW_NOCACHE); + if (ext->oe_dio && !ext->oe_rw) { /* direct io write */ + int grants; + int ppc; + + ppc = 1 << (cli->cl_chunkbits - PAGE_SHIFT); + grants = cli->cl_grant_extent_tax; + grants += (1 << cli->cl_chunkbits) * + ((page_count + ppc - 1) / ppc); + + CDEBUG(D_CACHE, "requesting %d bytes grant\n", grants); + spin_lock(&cli->cl_loi_list_lock); + if (osc_reserve_grant(cli, grants) == 0) { + list_for_each_entry(oap, list, oap_pending_item) { + osc_consume_write_grant(cli, + &oap->oap_brw_page); + } + atomic_long_add(page_count, &obd_dirty_pages); + osc_unreserve_grant_nolock(cli, grants, 0); + ext->oe_grants = grants; + } else { + /* We cannot report ENOSPC correctly if we do parallel + * DIO (async RPC submission), so turn off parallel dio + * if there is not sufficient grant available. This + * makes individual RPCs synchronous. 
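The grant request for a direct-I/O write above is one extent tax plus one chunk of grant for every chunk's worth of pages, rounded up. A worked example with hypothetical numbers (64KiB chunks, 4KiB pages, 4KiB extent tax, 100 pages):

#include <stdio.h>

int main(void)
{
        /* hypothetical geometry */
        unsigned int chunkbits = 16, page_shift = 12;   /* 64KiB / 4KiB */
        unsigned int extent_tax = 4096;
        unsigned int page_count = 100;

        unsigned int ppc = 1U << (chunkbits - page_shift);      /* 16 pages */
        unsigned int grants = extent_tax +
                (1U << chunkbits) * ((page_count + ppc - 1) / ppc);

        /* 100 pages span ceil(100/16) = 7 chunks -> 7 * 64KiB + 4KiB */
        printf("grants = %u bytes\n", grants);  /* 462848 */
        return 0;
}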
+ */ + io->ci_parallel_dio = false; + CDEBUG(D_CACHE, + "not enough grant available, switching to sync for this i/o\n"); + } + spin_unlock(&cli->cl_loi_list_lock); + osc_update_next_shrink(cli); + } + + ext->oe_is_rdma_only = !!(brw_flags & OBD_BRW_RDMA_ONLY); + ext->oe_nr_pages = page_count; + ext->oe_mppr = mppr; + list_splice_init(list, &ext->oe_pages); + ext->oe_layout_version = io->ci_layout_version; + + osc_object_lock(obj); + /* Reuse the initial refcount for RPC, don't drop it */ + osc_extent_state_set(ext, OES_LOCK_DONE); + if (!ext->oe_rw) { /* write */ + if (!ext->oe_srvlock && !ext->oe_dio) { + /* The most likely case here is from lack of grants + * so we are either out of quota or out of space. + * Since this means we are holding locks across + * potentially multi-striped IO, we must send out + * everything out instantly to avoid prolonged + * waits resulting in lock eviction (likely since + * the extended wait in osc_cache_enter() did not + * yield any additional grant due to a timeout. + * LU-13131 */ + ext->oe_hp = 1; + list_add_tail(&ext->oe_link, &obj->oo_hp_exts); + } else { + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + } + osc_update_pending(obj, OBD_BRW_WRITE, page_count); + } else { + list_add_tail(&ext->oe_link, &obj->oo_reading_exts); + osc_update_pending(obj, OBD_BRW_READ, page_count); + } + osc_object_unlock(obj); + + osc_io_unplug_async(env, cli, obj); + RETURN(0); +} + +/** + * Called by osc_io_setattr_start() to freeze and destroy covering extents. + */ +int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, + __u64 size, struct osc_extent **extp) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_extent *waiting = NULL; + pgoff_t index; + LIST_HEAD(list); + int result = 0; + bool partial; + ENTRY; + + /* pages with index greater or equal to index will be truncated. */ + index = cl_index(osc2cl(obj), size); + partial = size > cl_offset(osc2cl(obj), index); + +again: + osc_object_lock(obj); + ext = osc_extent_search(obj, index); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < index) + ext = next_extent(ext); + while (ext != NULL) { + EASSERT(ext->oe_state != OES_TRUNC, ext); + + if (ext->oe_state > OES_CACHE || ext->oe_urgent) { + /* if ext is in urgent state, it means there must exist + * a page already having been flushed by write_page(). + * We have to wait for this extent because we can't + * truncate that page. */ + OSC_EXTENT_DUMP(D_CACHE, ext, + "waiting for busy extent\n"); + waiting = osc_extent_get(ext); + break; + } + + OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size); + + osc_extent_get(ext); + if (ext->oe_state == OES_ACTIVE) { + /* though we grab inode mutex for write path, but we + * release it before releasing extent(in osc_io_end()), + * so there is a race window that an extent is still + * in OES_ACTIVE when truncate starts. 
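The size-to-index conversion at the top of osc_cache_truncate_start() above reduces to a shift plus an alignment test; cl_index()/cl_offset() in the real code route this through the cl_object layer. A standalone sketch assuming 4KiB pages and a hypothetical size:

#include <stdbool.h>
#include <stdio.h>

#define SK_PAGE_SHIFT 12
#define SK_PAGE_SIZE  (1UL << SK_PAGE_SHIFT)

int main(void)
{
        unsigned long long size = 10000;        /* hypothetical new file size */

        /* first page index affected by the truncate ... */
        unsigned long index = size >> SK_PAGE_SHIFT;
        /* ... and whether that page is only partially truncated */
        bool partial = size > ((unsigned long long)index << SK_PAGE_SHIFT);

        /* size 10000 -> page 2 is truncated from byte 1808 on (partial),
         * pages 3 and up are discarded entirely */
        printf("index %lu partial %d\n", index, partial);
        return 0;
}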
*/ + LASSERT(!ext->oe_trunc_pending); + ext->oe_trunc_pending = 1; + } else { + EASSERT(ext->oe_state == OES_CACHE, ext); + osc_extent_state_set(ext, OES_TRUNC); + osc_update_pending(obj, OBD_BRW_WRITE, + -ext->oe_nr_pages); + } + /* This extent could be on the full extents list, that's OK */ + EASSERT(!ext->oe_hp && !ext->oe_urgent, ext); + if (!list_empty(&ext->oe_link)) + list_move_tail(&ext->oe_link, &list); + else + list_add_tail(&ext->oe_link, &list); + + ext = next_extent(ext); + } + osc_object_unlock(obj); + + osc_list_maint(cli, obj); + + while ((ext = list_first_entry_or_null(&list, + struct osc_extent, + oe_link)) != NULL) { + int rc; + + list_del_init(&ext->oe_link); + + /* extent may be in OES_ACTIVE state because inode mutex + * is released before osc_io_end() in file write case */ + if (ext->oe_state != OES_TRUNC) + osc_extent_wait(env, ext, OES_TRUNC); + + rc = osc_extent_truncate(ext, index, partial); + if (rc < 0) { + if (result == 0) + result = rc; + + OSC_EXTENT_DUMP(D_ERROR, ext, + "truncate error %d\n", rc); + } else if (ext->oe_nr_pages == 0) { + osc_extent_remove(ext); + } else { + /* this must be an overlapped extent which means only + * part of pages in this extent have been truncated. + */ + EASSERTF(ext->oe_start <= index, ext, + "trunc index = %lu/%d.\n", index, partial); + /* fix index to skip this partially truncated extent */ + index = ext->oe_end + 1; + partial = false; + + /* we need to hold this extent in OES_TRUNC state so + * that no writeback will happen. This is to avoid + * BUG 17397. + * Only partial truncate can reach here, if @size is + * not zero, the caller should provide a valid @extp. */ + LASSERT(*extp == NULL); + *extp = osc_extent_get(ext); + OSC_EXTENT_DUMP(D_CACHE, ext, + "trunc at %llu\n", size); + } + osc_extent_put(env, ext); + } + if (waiting != NULL) { + int rc; + + /* ignore the result of osc_extent_wait the write initiator + * should take care of it. */ + rc = osc_extent_wait(env, waiting, OES_INV); + if (rc < 0) + OSC_EXTENT_DUMP(D_CACHE, waiting, "error: %d.\n", rc); + + osc_extent_put(env, waiting); + waiting = NULL; + goto again; + } + RETURN(result); +} +EXPORT_SYMBOL(osc_cache_truncate_start); + +/** + * Called after osc_io_setattr_end to add oio->oi_trunc back to cache. + */ +void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext) +{ + if (ext != NULL) { + struct osc_object *obj = ext->oe_obj; + bool unplug = false; + + EASSERT(ext->oe_nr_pages > 0, ext); + EASSERT(ext->oe_state == OES_TRUNC, ext); + EASSERT(!ext->oe_urgent, ext); + + OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n"); + osc_object_lock(obj); + osc_extent_state_set(ext, OES_CACHE); + if (ext->oe_fsync_wait && !ext->oe_urgent) { + ext->oe_urgent = 1; + list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); + unplug = true; + } + osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages); + osc_object_unlock(obj); + osc_extent_put(env, ext); + + if (unplug) + osc_io_unplug_async(env, osc_cli(obj), obj); + } +} + +/** + * Wait for extents in a specific range to be written out. + * The caller must have called osc_cache_writeback_range() to issue IO + * otherwise it will take a long time for this function to finish. + * + * Caller must hold inode_mutex , or cancel exclusive dlm lock so that + * nobody else can dirty this range of file while we're waiting for + * extents to be written. 
+ */ +int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end) +{ + struct osc_extent *ext; + pgoff_t index = start; + int result = 0; + ENTRY; + +again: + osc_object_lock(obj); + ext = osc_extent_search(obj, index); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < index) + ext = next_extent(ext); + while (ext != NULL) { + int rc; + + if (ext->oe_start > end) + break; + + if (!ext->oe_fsync_wait) { + ext = next_extent(ext); + continue; + } + + EASSERT(ergo(ext->oe_state == OES_CACHE, + ext->oe_hp || ext->oe_urgent), ext); + EASSERT(ergo(ext->oe_state == OES_ACTIVE, + !ext->oe_hp && ext->oe_urgent), ext); + + index = ext->oe_end + 1; + osc_extent_get(ext); + osc_object_unlock(obj); + + rc = osc_extent_wait(env, ext, OES_INV); + if (result == 0) + result = rc; + osc_extent_put(env, ext); + goto again; + } + osc_object_unlock(obj); + + OSC_IO_DEBUG(obj, "sync file range.\n"); + RETURN(result); +} +EXPORT_SYMBOL(osc_cache_wait_range); + +/** + * Called to write out a range of osc object. + * + * @hp : should be set this is caused by lock cancel; + * @discard: is set if dirty pages should be dropped - file will be deleted or + * truncated, this implies there is no partially discarding extents. + * + * Return how many pages will be issued, or error code if error occurred. + */ +int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, int hp, int discard) +{ + struct osc_extent *ext; + LIST_HEAD(discard_list); + bool unplug = false; + int result = 0; + ENTRY; + + osc_object_lock(obj); + ext = osc_extent_search(obj, start); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < start) + ext = next_extent(ext); + while (ext != NULL) { + if (ext->oe_start > end) + break; + + ext->oe_fsync_wait = 1; + switch (ext->oe_state) { + case OES_CACHE: + result += ext->oe_nr_pages; + if (!discard) { + struct list_head *list = NULL; + if (hp) { + EASSERT(!ext->oe_hp, ext); + ext->oe_hp = 1; + list = &obj->oo_hp_exts; + } else if (!ext->oe_urgent && !ext->oe_hp) { + ext->oe_urgent = 1; + list = &obj->oo_urgent_exts; + } + if (list != NULL) + list_move_tail(&ext->oe_link, list); + unplug = true; + } else { + struct client_obd *cli = osc_cli(obj); + int pcc_bits = cli->cl_chunkbits - PAGE_SHIFT; + pgoff_t align_by = (1 << pcc_bits); + pgoff_t a_start = round_down(start, align_by); + pgoff_t a_end = round_up(end, align_by); + + /* overflow case */ + if (end && !a_end) + a_end = CL_PAGE_EOF; + /* the only discarder is lock cancelling, so + * [start, end], aligned by chunk size, must + * contain this extent */ + LASSERTF(ext->oe_start >= a_start && + ext->oe_end <= a_end, + "ext [%lu, %lu] reg [%lu, %lu] " + "orig [%lu %lu] align %lu bits " + "%d\n", ext->oe_start, ext->oe_end, + a_start, a_end, start, end, + align_by, pcc_bits); + osc_extent_state_set(ext, OES_LOCKING); + ext->oe_owner = current; + list_move_tail(&ext->oe_link, + &discard_list); + osc_update_pending(obj, OBD_BRW_WRITE, + -ext->oe_nr_pages); + } + break; + case OES_ACTIVE: + /* It's pretty bad to wait for ACTIVE extents, because + * we don't know how long we will wait for it to be + * flushed since it may be blocked at awaiting more + * grants. We do this for the correctness of fsync. */ + LASSERT(hp == 0 && discard == 0); + ext->oe_urgent = 1; + break; + case OES_TRUNC: + /* this extent is being truncated, can't do anything + * for it now. 
it will be set to urgent after truncate + * is finished in osc_cache_truncate_end(). */ + default: + break; + } + ext = next_extent(ext); + } + osc_object_unlock(obj); + + LASSERT(ergo(!discard, list_empty(&discard_list))); + if (!list_empty(&discard_list)) { + struct osc_extent *tmp; + int rc; + + osc_list_maint(osc_cli(obj), obj); + list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) { + list_del_init(&ext->oe_link); + EASSERT(ext->oe_state == OES_LOCKING, ext); + + /* Discard caching pages. We don't actually write this + * extent out but we complete it as if we did. */ + rc = osc_extent_make_ready(env, ext); + if (unlikely(rc < 0)) { + OSC_EXTENT_DUMP(D_ERROR, ext, + "make_ready returned %d\n", rc); + if (result >= 0) + result = rc; + } + + /* finish the extent as if the pages were sent */ + osc_extent_finish(env, ext, 0, 0); + } + } + + if (unplug) + osc_io_unplug(env, osc_cli(obj), obj); + + if (hp || discard) { + int rc; + rc = osc_cache_wait_range(env, obj, start, end); + if (result >= 0 && rc < 0) + result = rc; + } + + OSC_IO_DEBUG(obj, "pageout [%lu, %lu], %d.\n", start, end, result); + RETURN(result); +} +EXPORT_SYMBOL(osc_cache_writeback_range); + +/** + * Returns a list of pages by a given [start, end] of \a obj. + * + * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely + * crucial in the face of [offset, EOF] locks. + * + * Return at least one page in @queue unless there is no covered page. + */ +bool osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, + struct osc_object *osc, pgoff_t start, pgoff_t end, + osc_page_gang_cbt cb, void *cbdata) +{ + struct osc_page *ops; + struct pagevec *pagevec; + void **pvec; + pgoff_t idx; + unsigned int nr; + unsigned int i; + unsigned int j; + bool res = true; + bool tree_lock = true; + ENTRY; + + idx = start; + pvec = osc_env_info(env)->oti_pvec; + pagevec = &osc_env_info(env)->oti_pagevec; + ll_pagevec_init(pagevec, 0); + spin_lock(&osc->oo_tree_lock); + while ((nr = radix_tree_gang_lookup(&osc->oo_tree, pvec, + idx, OTI_PVEC_SIZE)) > 0) { + struct cl_page *page; + bool end_of_region = false; + + for (i = 0, j = 0; i < nr; ++i) { + ops = pvec[i]; + pvec[i] = NULL; + + idx = osc_index(ops); + if (idx > end) { + end_of_region = true; + break; + } + + page = ops->ops_cl.cpl_page; + LASSERT(page->cp_type == CPT_CACHEABLE); + if (page->cp_state == CPS_FREEING) + continue; + + cl_page_get(page); + lu_ref_add_atomic(&page->cp_reference, + "gang_lookup", current); + pvec[j++] = ops; + } + ++idx; + + /* + * Here a delicate locking dance is performed. Current thread + * holds a reference to a page, but has to own it before it + * can be placed into queue. Owning implies waiting, so + * radix-tree lock is to be released. After a wait one has to + * check that pages weren't truncated (cl_page_own() returns + * error in the latter case). 
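In the discard branch of osc_cache_writeback_range() above, the cancelled lock's page range is widened to whole chunks before asserting that it fully covers each extent, and the rounded-up end index is allowed to wrap to zero, which then stands for end-of-file. The sketch below redoes that index arithmetic in plain userspace C; the round_down/round_up helpers and the CL_PAGE_EOF value are simplified stand-ins for the kernel definitions, not the real ones.

#include <stdio.h>

typedef unsigned long pgoff_t;

#define CL_PAGE_EOF ((pgoff_t)~0UL)             /* stand-in for the real sentinel */
/* simplified copies of the kernel helpers; the alignment must be a power of two */
#define round_down(x, a)  ((x) & ~((a) - 1))
#define round_up(x, a)    (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned int ppc_bits = 4;              /* e.g. 64 KiB chunks with 4 KiB pages */
        pgoff_t align_by = 1UL << ppc_bits;
        pgoff_t start = 37, end = 1023;         /* page indices of the cancelled lock */

        pgoff_t a_start = round_down(start, align_by);
        pgoff_t a_end = round_up(end, align_by);

        /* round_up() wraps to 0 when end is already at the very top of the index space */
        if (end && !a_end)
                a_end = CL_PAGE_EOF;

        printf("discard range [%lu, %lu] -> chunk aligned [%lu, %lu]\n",
               start, end, a_start, a_end);
        return 0;
}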
+ */ + spin_unlock(&osc->oo_tree_lock); + tree_lock = false; + + res = (*cb)(env, io, pvec, j, cbdata); + + for (i = 0; i < j; ++i) { + ops = pvec[i]; + page = ops->ops_cl.cpl_page; + lu_ref_del(&page->cp_reference, "gang_lookup", current); + cl_pagevec_put(env, page, pagevec); + } + pagevec_release(pagevec); + + if (nr < OTI_PVEC_SIZE || end_of_region) + break; + + if (!res) + break; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SLOW_PAGE_EVICT, + cfs_fail_val ?: 20); + + if (io->ci_type == CIT_MISC && + io->u.ci_misc.lm_next_rpc_time && + ktime_get_seconds() > io->u.ci_misc.lm_next_rpc_time) { + osc_send_empty_rpc(osc, idx << PAGE_SHIFT); + io->u.ci_misc.lm_next_rpc_time = ktime_get_seconds() + + 5 * obd_timeout / 16; + } + + if (need_resched()) + cond_resched(); + + spin_lock(&osc->oo_tree_lock); + tree_lock = true; + } + if (tree_lock) + spin_unlock(&osc->oo_tree_lock); + RETURN(res); +} +EXPORT_SYMBOL(osc_page_gang_lookup); + +/** + * Check if page @page is covered by an extra lock or discard it. + */ +static bool check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + void **pvec, int count, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_object *osc = cbdata; + int i; + + for (i = 0; i < count; i++) { + struct osc_page *ops = pvec[i]; + struct cl_page *page = ops->ops_cl.cpl_page; + pgoff_t index = osc_index(ops); + bool discard = false; + + /* negative lock caching */ + if (index < info->oti_ng_index) { + discard = true; + } else if (index >= info->oti_fn_index) { + struct ldlm_lock *tmp; + /* refresh non-overlapped index */ + tmp = osc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK | + OSC_DAP_FL_AST | + OSC_DAP_FL_RIGHT); + if (tmp != NULL) { + __u64 end = + tmp->l_policy_data.l_extent.end; + __u64 start = + tmp->l_policy_data.l_extent.start; + + /* no lock covering this page */ + if (index < cl_index(osc2cl(osc), start)) { + /* no lock at @index, + * first lock at @start + */ + info->oti_ng_index = + cl_index(osc2cl(osc), start); + discard = true; + } else { + /* Cache the first-non-overlapped + * index so as to skip all pages + * within [index, oti_fn_index). + * This is safe because if tmp lock + * is canceled, it will discard these + * pages. + */ + info->oti_fn_index = + cl_index(osc2cl(osc), end + 1); + if (end == OBD_OBJECT_EOF) + info->oti_fn_index = + CL_PAGE_EOF; + } + LDLM_LOCK_PUT(tmp); + } else { + info->oti_ng_index = CL_PAGE_EOF; + discard = true; + } + } + + if (discard) { + if (cl_page_own(env, io, page) == 0) { + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->oti_next_index = index + 1; + } + return true; +} + +bool osc_discard_cb(const struct lu_env *env, struct cl_io *io, + void **pvec, int count, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + int i; + + for (i = 0; i < count; i++) { + struct osc_page *ops = pvec[i]; + struct cl_page *page = ops->ops_cl.cpl_page; + + /* page is top page. */ + info->oti_next_index = osc_index(ops) + 1; + if (cl_page_own(env, io, page) == 0) { + if (!ergo(page->cp_type == CPT_CACHEABLE, + !PageDirty(cl_page_vmpage(page)))) + CL_PAGE_DEBUG(D_ERROR, env, page, + "discard dirty page?\n"); + + /* discard the page */ + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + return true; +} +EXPORT_SYMBOL(osc_discard_cb); + +/** + * Discard pages protected by the given lock. 
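check_and_discard_cb() above avoids one DLM lookup per page by caching two thresholds in osc_thread_info: indices below oti_ng_index are known to be uncovered (negative cache) and indices below oti_fn_index are known to be covered by the lock found last time. The following self-contained sketch shows the same caching idea over a plain, sorted array of extents; the names and the lock array are purely illustrative, and it assumes pages are visited in increasing index order, as in the radix-tree walk above.

#include <stdbool.h>
#include <stdio.h>

struct ext { unsigned long start, end; };       /* granted lock extents, sorted */

struct cache { unsigned long ng_index, fn_index; }; /* negative / first-non-overlapped */

/* return true if @index is uncovered and must be discarded; updates the cache */
static bool page_uncovered(struct cache *c, unsigned long index,
                           const struct ext *locks, int nr)
{
        int i;

        if (index < c->ng_index)                /* cached negative result */
                return true;
        if (index < c->fn_index)                /* cached positive result */
                return false;

        for (i = 0; i < nr; i++) {
                if (index < locks[i].start) {   /* gap before the next lock */
                        c->ng_index = locks[i].start;
                        return true;
                }
                if (index <= locks[i].end) {    /* covered; pages up to end stay cached */
                        c->fn_index = locks[i].end + 1;
                        return false;
                }
        }
        c->ng_index = ~0UL;                     /* nothing covers this or any later page */
        return true;
}

int main(void)
{
        const struct ext locks[] = { { 100, 199 }, { 400, 499 } };
        struct cache c = { 0, 0 };
        unsigned long idx;

        for (idx = 90; idx < 510; idx += 60)    /* monotonically increasing indices */
                printf("page %lu: %s\n", idx,
                       page_uncovered(&c, idx, locks, 2) ? "discard" : "keep");
        return 0;
}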
This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). + */ +int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, + pgoff_t start, pgoff_t end, bool discard) +{ + struct osc_thread_info *info = osc_env_info(env); + struct cl_io *io = osc_env_thread_io(env); + osc_page_gang_cbt cb; + int result; + + ENTRY; + + io->ci_obj = cl_object_top(osc2cl(osc)); + io->ci_ignore_layout = 1; + io->ci_invalidate_page_cache = 1; + io->u.ci_misc.lm_next_rpc_time = ktime_get_seconds() + + 5 * obd_timeout / 16; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + GOTO(out, result); + + cb = discard ? osc_discard_cb : check_and_discard_cb; + info->oti_fn_index = info->oti_next_index = start; + info->oti_ng_index = 0; + + osc_page_gang_lookup(env, io, osc, + info->oti_next_index, end, cb, osc); +out: + cl_io_fini(env, io); + RETURN(result); +} + + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_dev.c b/drivers/staging/lustrefsx/lustre/osc/osc_dev.c new file mode 100644 index 0000000000000..a2d3bcaab069a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_dev.c @@ -0,0 +1,252 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_device, for OSC layer. 
+ * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +/* class_name2obd() */ +#include +#include +#include + +#include "osc_internal.h" + +/** \addtogroup osc + * @{ + */ + +struct kmem_cache *osc_lock_kmem; +EXPORT_SYMBOL(osc_lock_kmem); +struct kmem_cache *osc_object_kmem; +EXPORT_SYMBOL(osc_object_kmem); + +struct kmem_cache *osc_thread_kmem; +struct kmem_cache *osc_session_kmem; +struct kmem_cache *osc_extent_kmem; +struct kmem_cache *osc_quota_kmem; +struct kmem_cache *osc_obdo_kmem; + +struct lu_kmem_descr osc_caches[] = { + { + .ckd_cache = &osc_lock_kmem, + .ckd_name = "osc_lock_kmem", + .ckd_size = sizeof (struct osc_lock) + }, + { + .ckd_cache = &osc_object_kmem, + .ckd_name = "osc_object_kmem", + .ckd_size = sizeof (struct osc_object) + }, + { + .ckd_cache = &osc_thread_kmem, + .ckd_name = "osc_thread_kmem", + .ckd_size = sizeof (struct osc_thread_info) + }, + { + .ckd_cache = &osc_session_kmem, + .ckd_name = "osc_session_kmem", + .ckd_size = sizeof (struct osc_session) + }, + { + .ckd_cache = &osc_extent_kmem, + .ckd_name = "osc_extent_kmem", + .ckd_size = sizeof (struct osc_extent) + }, + { + .ckd_cache = &osc_quota_kmem, + .ckd_name = "osc_quota_kmem", + .ckd_size = sizeof(struct osc_quota_info) + }, + { + .ckd_cache = &osc_obdo_kmem, + .ckd_name = "osc_obdo_kmem", + .ckd_size = sizeof(struct obdo) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Osc device and device type functions. + * + */ + +static void *osc_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, osc_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_thread_info *info = data; + + lu_buf_free(&info->oti_ladvise_buf); + OBD_SLAB_FREE_PTR(info, osc_thread_kmem); +} + +struct lu_context_key osc_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = osc_key_init, + .lct_fini = osc_key_fini +}; +EXPORT_SYMBOL(osc_key); + +static void *osc_session_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_session *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, osc_session_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_session_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_session *info = data; + OBD_SLAB_FREE_PTR(info, osc_session_kmem); +} + +struct lu_context_key osc_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = osc_session_init, + .lct_fini = osc_session_fini +}; +EXPORT_SYMBOL(osc_session_key); + +/* type constructor/destructor: osc_type_{init,fini,start,stop}(). */ +LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key); + +static int osc_process_config(const struct lu_env *env, struct lu_device *d, + struct lustre_cfg *cfg) +{ + ssize_t count = class_modify_config(cfg, PARAM_OSC, + &d->ld_obd->obd_kset.kobj); + return count > 0 ? 
0 : count; +} + +static const struct lu_device_operations osc_lu_ops = { + .ldo_object_alloc = osc_object_alloc, + .ldo_process_config = osc_process_config, + .ldo_recovery_complete = NULL +}; + +int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + RETURN(0); +} +EXPORT_SYMBOL(osc_device_init); + +struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return NULL; +} +EXPORT_SYMBOL(osc_device_fini); + +struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct osc_device *oc = lu2osc_dev(d); + + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(oc); + return NULL; +} +EXPORT_SYMBOL(osc_device_free); + +static struct lu_device *osc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct osc_device *osc; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(osc); + if (osc == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&osc->osc_cl, t); + d = osc2lu_dev(osc); + d->ld_ops = &osc_lu_ops; + + /* Setup OSC OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = osc_setup(obd, cfg); + if (rc) { + osc_device_free(env, d); + RETURN(ERR_PTR(rc)); + } + osc->osc_exp = obd->obd_self_export; + osc->osc_stats.os_init = ktime_get_real(); + RETURN(d); +} + +static const struct lu_device_type_operations osc_device_type_ops = { + .ldto_init = osc_type_init, + .ldto_fini = osc_type_fini, + + .ldto_start = osc_type_start, + .ldto_stop = osc_type_stop, + + .ldto_device_alloc = osc_device_alloc, + .ldto_device_free = osc_device_free, + + .ldto_device_init = osc_device_init, + .ldto_device_fini = osc_device_fini +}; + +struct lu_device_type osc_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_OSC_NAME, + .ldt_ops = &osc_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_internal.h b/drivers/staging/lustrefsx/lustre/osc/osc_internal.h new file mode 100644 index 0000000000000..52a7ab503d419 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_internal.h @@ -0,0 +1,222 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef OSC_INTERNAL_H +#define OSC_INTERNAL_H + +#define OAP_MAGIC 8675309 + +#include +#include + +extern atomic_t osc_pool_req_count; +extern unsigned int osc_reqpool_maxreqcount; +extern struct ptlrpc_request_pool *osc_rq_pool; + +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes); +void osc_schedule_grant_work(void); +void osc_update_next_shrink(struct client_obd *cli); +int lru_queue_work(const struct lu_env *env, void *data); +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc); +void osc_extent_release(const struct lu_env *env, struct osc_extent *ext); +int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, + pgoff_t start, pgoff_t end, bool discard); + +void osc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb); + +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u64 *flags, union ldlm_policy_data *policy, + struct ost_lvb *lvb, osc_enqueue_upcall_f upcall, + void *cookie, struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset, int async, + bool speculative); + +int osc_match_base(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, + struct lustre_handle *lockh, enum ldlm_match_flags match_flags); + +int osc_setattr_async(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_fallocate_base(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + int mode); +int osc_sync_base(struct osc_object *obj, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_ladvise_base(struct obd_export *exp, struct obdo *oa, + struct ladvise_hdr *ladvise_hdr, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg); +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + struct list_head *ext_list, int cmd); +void osc_send_empty_rpc(struct osc_object *osc, pgoff_t start); +unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages); +void osc_lru_unreserve(struct client_obd *cli, unsigned long npages); + +extern struct lu_kmem_descr osc_caches[]; + +unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); + +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg); + +int osc_tunables_init(struct obd_device *obd); + +extern struct lu_device_type osc_device_type; + +static inline struct cl_io *osc_env_thread_io(const struct lu_env *env) +{ + struct cl_io *io = &osc_env_info(env)->oti_io; + + memset(io, 0, sizeof(*io)); + return io; +} + +static inline int osc_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &osc_device_type; +} + +static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) +{ + return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); +} + +int osc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int osc_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + 
struct lu_device *dev); + +static inline int osc_recoverable_error(int rc) +{ + return (rc == -EIO || rc == -EROFS || rc == -ENOMEM || + rc == -EAGAIN || rc == -EINPROGRESS); +} + +static inline unsigned long rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_r_in_flight + cli->cl_w_in_flight; +} + +static inline char *cli_name(struct client_obd *cli) +{ + return cli->cl_import->imp_obd->obd_name; +} + +static inline char list_empty_marker(struct list_head *list) +{ + return list_empty(list) ? '-' : '+'; +} + +struct osc_async_args { + struct obd_info *aa_oi; +}; + +int osc_quota_setup(struct obd_device *obd); +int osc_quota_cleanup(struct obd_device *obd); +int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[], + u64 valid, u32 flags); +int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]); +int osc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +void osc_inc_unstable_pages(struct ptlrpc_request *req); +void osc_dec_unstable_pages(struct ptlrpc_request *req); +bool osc_over_unstable_soft_limit(struct client_obd *cli); +void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj, + pgoff_t idx, size_t to); + +struct ldlm_lock *osc_obj_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags flags); + +int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc); + +/** osc shrink list to link all osc client obd */ +extern struct list_head osc_shrink_list; +/** spin lock to protect osc_shrink_list */ +extern spinlock_t osc_shrink_lock; +extern unsigned long osc_cache_shrink_count(struct shrinker *sk, + struct shrink_control *sc); +extern unsigned long osc_cache_shrink_scan(struct shrinker *sk, + struct shrink_control *sc); +static inline unsigned int osc_max_write_chunks(const struct client_obd *cli) +{ + /* + * LU-8135: + * + * The maximum size of a single transaction is about 64MB in ZFS. + * #define DMU_MAX_ACCESS (64 * 1024 * 1024) + * + * Since ZFS is a copy-on-write file system, a single dirty page in + * a chunk will result in the rewrite of the whole chunk, therefore + * an RPC shouldn't be allowed to contain too many chunks otherwise + * it will make transaction size much bigger than 64MB, especially + * with big block size for ZFS. + * + * This piece of code is to make sure that OSC won't send write RPCs + * with too many chunks. The maximum chunk size that an RPC can cover + * is set to PTLRPC_MAX_BRW_SIZE, which is defined to 16MB. Ideally + * OST should tell the client what the biggest transaction size is, + * but it's good enough for now. + * + * This limitation doesn't apply to ldiskfs, which allows as many + * chunks in one RPC as we want. However, it won't have any benefits + * to have too many discontiguous pages in one RPC. + * + * An osc_extent won't cover over a RPC size, so the chunks in an + * osc_extent won't bigger than PTLRPC_MAX_BRW_SIZE >> chunkbits. 
+ */ + return PTLRPC_MAX_BRW_SIZE >> cli->cl_chunkbits; +} + +static inline void osc_set_io_portal(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + + /* Distinguish OSC from MDC here to use OST or MDS portal */ + if (OCD_HAS_FLAG(&imp->imp_connect_data, IBITS)) + req->rq_request_portal = MDS_IO_PORTAL; + else + req->rq_request_portal = OST_IO_PORTAL; +} + +#endif /* OSC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_io.c b/drivers/staging/lustrefsx/lustre/osc/osc_io.c new file mode 100644 index 0000000000000..e4bd2738a6cb3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_io.c @@ -0,0 +1,1321 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_io for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include +#include +#include +#include + +#include "osc_internal.h" +#include + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * io operations. 
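osc_max_write_chunks() above caps the number of chunks a single write RPC may span at PTLRPC_MAX_BRW_SIZE >> cl_chunkbits. A small sketch of that arithmetic, assuming the 16 MiB BRW size mentioned in the comment (the real constant comes from the Lustre headers):

#include <stdio.h>

#define PTLRPC_MAX_BRW_SIZE     (16U << 20)     /* assumed 16 MiB, per the comment above */

static unsigned int max_write_chunks(unsigned int chunkbits)
{
        /* one write RPC may span at most PTLRPC_MAX_BRW_SIZE bytes worth of chunks */
        return PTLRPC_MAX_BRW_SIZE >> chunkbits;
}

int main(void)
{
        unsigned int bits;

        /* e.g. a 64 KiB chunk (chunkbits 16) allows 256 chunks per RPC */
        for (bits = 12; bits <= 20; bits += 4)
                printf("chunk size %7u bytes -> %4u chunks per RPC\n",
                       1U << bits, max_write_chunks(bits));
        return 0;
}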
+ * + */ + +static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io) +{ +} + +void osc_read_ahead_release(const struct lu_env *env, struct cl_read_ahead *ra) +{ + struct ldlm_lock *dlmlock = ra->cra_dlmlock; + struct osc_io *oio = ra->cra_oio; + struct lustre_handle lockh; + + oio->oi_is_readahead = 0; + ldlm_lock2handle(dlmlock, &lockh); + ldlm_lock_decref(&lockh, LCK_PR); + LDLM_LOCK_PUT(dlmlock); +} +EXPORT_SYMBOL(osc_read_ahead_release); + +static int osc_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + struct osc_object *osc = cl2osc(ios->cis_obj); + struct osc_io *oio = cl2osc_io(env, ios); + struct ldlm_lock *dlmlock; + int result = -ENODATA; + + ENTRY; + + oio->oi_is_readahead = true; + dlmlock = osc_dlmlock_at_pgoff(env, osc, start, 0); + if (dlmlock != NULL) { + LASSERT(dlmlock->l_ast_data == osc); + if (dlmlock->l_req_mode != LCK_PR) { + struct lustre_handle lockh; + ldlm_lock2handle(dlmlock, &lockh); + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, dlmlock->l_req_mode); + } + + ra->cra_rpc_pages = osc_cli(osc)->cl_max_pages_per_rpc; + ra->cra_end_idx = cl_index(osc2cl(osc), + dlmlock->l_policy_data.l_extent.end); + ra->cra_release = osc_read_ahead_release; + ra->cra_dlmlock = dlmlock; + ra->cra_oio = oio; + if (ra->cra_end_idx != CL_PAGE_EOF) + ra->cra_contention = true; + result = 0; + } + + RETURN(result); +} + +/** + * An implementation of cl_io_operations::cio_io_submit() method for osc + * layer. Iterates over pages in the in-queue, prepares each for io by calling + * cl_page_prep() and then either submits them through osc_io_submit_page() + * or, if page is already submitted, changes osc flags through + * osc_set_async_flags(). + */ +int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct cl_page *page; + struct cl_page *tmp; + struct client_obd *cli = NULL; + struct osc_object *osc = NULL; /* to keep gcc happy */ + struct osc_page *opg; + struct cl_io *io; + LIST_HEAD(list); + + struct cl_page_list *qin = &queue->c2_qin; + struct cl_page_list *qout = &queue->c2_qout; + unsigned int queued = 0; + int result = 0; + int brw_flags; + unsigned int max_pages; + unsigned int ppc_bits; /* pages per chunk bits */ + unsigned int ppc; + ktime_t submit_time = ktime_get(); + bool sync_queue = false; + + LASSERT(qin->pl_nr > 0); + + CDEBUG(D_CACHE|D_READA, "%d %d\n", qin->pl_nr, crt); + + osc = cl2osc(ios->cis_obj); + cli = osc_cli(osc); + max_pages = cli->cl_max_pages_per_rpc; + ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; + ppc = 1 << ppc_bits; + + brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0; + brw_flags |= crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + if (crt == CRT_READ && ios->cis_io->ci_ndelay) + brw_flags |= OBD_BRW_NDELAY; + + page = cl_page_list_first(qin); + if (page->cp_type == CPT_TRANSIENT) + brw_flags |= OBD_BRW_NOCACHE; + if (lnet_is_rdma_only_page(page->cp_vmpage)) + brw_flags |= OBD_BRW_RDMA_ONLY; + + /* + * NOTE: here @page is a top-level page. This is done to avoid + * creation of sub-page-list. + */ + cl_page_list_for_each_safe(page, tmp, qin) { + struct osc_async_page *oap; + + /* Top level IO. 
*/ + io = page->cp_owner; + LASSERT(io != NULL); + + opg = osc_cl_page_osc(page, osc); + oap = &opg->ops_oap; + LASSERT(osc == oap->oap_obj); + + if (!list_empty(&oap->oap_pending_item) || + !list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", + oap, opg); + result = -EBUSY; + break; + } + + result = cl_page_prep(env, io, page, crt); + if (result != 0) { + LASSERT(result < 0); + if (result != -EALREADY) + break; + /* + * Handle -EALREADY error: for read case, the page is + * already in UPTODATE state; for write, the page + * is not dirty. + */ + result = 0; + continue; + } + + if (page->cp_type != CPT_TRANSIENT) { + spin_lock(&oap->oap_lock); + oap->oap_async_flags = ASYNC_URGENT|ASYNC_READY; + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&oap->oap_lock); + } + + osc_page_submit(env, opg, crt, brw_flags, submit_time); + list_add_tail(&oap->oap_pending_item, &list); + + if (page->cp_sync_io != NULL) + cl_page_list_move(qout, qin, page); + else /* async IO */ + cl_page_list_del(env, qin, page); + + queued++; + if (queued == max_pages) { + sync_queue = true; + } else if (crt == CRT_WRITE) { + unsigned int chunks; + unsigned int next_chunks; + + chunks = (queued + ppc - 1) >> ppc_bits; + /* chunk number if we add another page */ + next_chunks = (queued + ppc) >> ppc_bits; + + /* next page will exceed write chunk limit */ + if (chunks == osc_max_write_chunks(cli) && + next_chunks > chunks) + sync_queue = true; + } + + if (sync_queue) { + result = osc_queue_sync_pages(env, io, osc, &list, + brw_flags); + if (result < 0) + break; + queued = 0; + sync_queue = false; + } + } + + if (queued > 0) + result = osc_queue_sync_pages(env, io, osc, &list, brw_flags); + + /* Update c/mtime for sync write. LU-7310 */ + if (crt == CRT_WRITE && qout->pl_nr > 0 && result == 0) { + struct cl_object *obj = ios->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + + cl_object_attr_lock(obj); + attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds(); + cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME); + cl_object_attr_unlock(obj); + } + + CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result); + return qout->pl_nr > 0 ? 0 : result; +} +EXPORT_SYMBOL(osc_io_submit); + +/** + * This is called to update the attributes when modifying a specific page, + * both when making new pages and when doing updates to existing cached pages. + * + * Expand stripe KMS if necessary. + */ +void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj, + pgoff_t idx, size_t to) +{ + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int valid; + __u64 kms; + + ENTRY; + + /* offset within stripe */ + kms = cl_offset(obj, idx) + to; + + cl_object_attr_lock(obj); + CDEBUG(D_INODE, "stripe KMS %sincreasing %llu->%llu %llu\n", + kms > loi->loi_kms ?
"" : "not ", loi->loi_kms, kms, + loi->loi_lvb.lvb_size); + + attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds(); + valid = CAT_MTIME | CAT_CTIME; + if (kms > loi->loi_kms) { + attr->cat_kms = kms; + valid |= CAT_KMS; + } + if (kms > loi->loi_lvb.lvb_size) { + attr->cat_size = kms; + valid |= CAT_SIZE; + } + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + + EXIT; +} + +int osc_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *qin, int from, int to, + cl_commit_cbt cb) +{ + struct cl_io *io = ios->cis_io; + struct osc_io *oio = cl2osc_io(env, ios); + struct osc_object *osc = cl2osc(ios->cis_obj); + struct cl_page *page; + struct cl_page *last_page; + struct osc_page *opg; + struct pagevec *pvec = &osc_env_info(env)->oti_pagevec; + int result = 0; + ENTRY; + + LASSERT(qin->pl_nr > 0); + + /* Handle partial page cases */ + last_page = cl_page_list_last(qin); + if (oio->oi_lockless) { + page = cl_page_list_first(qin); + if (page == last_page) { + cl_page_clip(env, page, from, to); + } else { + if (from != 0) + cl_page_clip(env, page, from, PAGE_SIZE); + if (to != PAGE_SIZE) + cl_page_clip(env, last_page, 0, to); + } + } + + ll_pagevec_init(pvec, 0); + + while (qin->pl_nr > 0) { + struct osc_async_page *oap; + + page = cl_page_list_first(qin); + opg = osc_cl_page_osc(page, osc); + oap = &opg->ops_oap; + + LASSERTF(osc == oap->oap_obj, + "obj mismatch: %p / %p\n", osc, oap->oap_obj); + + if (!list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", + oap, opg); + result = -EBUSY; + break; + } + + /* The page may be already in dirty cache. */ + if (list_empty(&oap->oap_pending_item)) { + result = osc_page_cache_add(env, opg, io, cb); + if (result != 0) + break; + } + + osc_page_touch_at(env, osc2cl(osc), osc_index(opg), + page == last_page ? to : PAGE_SIZE); + + cl_page_list_del(env, qin, page); + + /* if there are no more slots, do the callback & reinit */ + if (pagevec_add(pvec, page->cp_vmpage) == 0) { + (*cb)(env, io, pvec); + pagevec_reinit(pvec); + } + } + /* The shrink interval is in seconds, so we can update it once per + * write, rather than once per page. + */ + osc_update_next_shrink(osc_cli(osc)); + + + /* Clean up any partially full pagevecs */ + if (pagevec_count(pvec) != 0) + (*cb)(env, io, pvec); + + /* Can't access these pages any more. Page can be in transfer and + * complete at any time. */ + + /* for sync write, kernel will wait for this page to be flushed before + * osc_io_end() is called, so release it earlier. + * for mkwrite(), it's known there is no further pages. 
*/ + if (cl_io_is_sync_write(io) && oio->oi_active != NULL) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } + + CDEBUG(D_INFO, "%d %d\n", qin->pl_nr, result); + RETURN(result); +} +EXPORT_SYMBOL(osc_io_commit_async); + +void osc_io_extent_release(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct osc_io *oio = cl2osc_io(env, ios); + + if (oio->oi_active != NULL) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } +} +EXPORT_SYMBOL(osc_io_extent_release); + +static bool osc_import_not_healthy(struct obd_import *imp) +{ + return imp->imp_invalid || imp->imp_deactive || + !(imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_IDLE); +} + +int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct osc_object *osc = cl2osc(ios->cis_obj); + struct obd_import *imp = osc_cli(osc)->cl_import; + struct osc_io *oio = osc_env_io(env); + int rc = -EIO; + + ENTRY; + + spin_lock(&imp->imp_lock); + /** + * check whether this OSC device is available for non-delay read, + * fast switching mirror if we haven't tried all mirrors. + */ + if (ios->cis_io->ci_type == CIT_READ && ios->cis_io->ci_ndelay && + !ios->cis_io->ci_tried_all_mirrors && osc_import_not_healthy(imp)) { + rc = -EAGAIN; + } else if (likely(!imp->imp_invalid)) { + atomic_inc(&osc->oo_nr_ios); + oio->oi_is_active = 1; + rc = 0; + } + spin_unlock(&imp->imp_lock); + + if (capable(CAP_SYS_RESOURCE)) + oio->oi_cap_sys_resource = 1; + + RETURN(rc); +} +EXPORT_SYMBOL(osc_io_iter_init); + +void osc_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct osc_io *oio = osc_env_io(env); + + if (oio->oi_is_active) { + struct osc_object *osc = cl2osc(ios->cis_obj); + + oio->oi_is_active = 0; + LASSERT(atomic_read(&osc->oo_nr_ios) > 0); + if (atomic_dec_and_test(&osc->oo_nr_ios)) + wake_up(&osc->oo_io_waitq); + } +} +EXPORT_SYMBOL(osc_io_iter_fini); + +void osc_io_rw_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(ios->cis_obj); + + if (oio->oi_lru_reserved > 0) { + osc_lru_unreserve(osc_cli(osc), oio->oi_lru_reserved); + oio->oi_lru_reserved = 0; + } + oio->oi_write_osclock = NULL; + + osc_io_iter_fini(env, ios); +} +EXPORT_SYMBOL(osc_io_rw_iter_fini); + +int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io; + struct cl_fault_io *fio; + ENTRY; + + io = ios->cis_io; + fio = &io->u.ci_fault; + CDEBUG(D_INFO, "%lu %d %zu\n", + fio->ft_index, fio->ft_writable, fio->ft_nob); + /* + * If mapping is writeable, adjust kms to cover this page, + * but do not extend kms beyond actual file size. + * See bug 10919. + */ + if (fio->ft_writable) + osc_page_touch_at(env, ios->cis_obj, + fio->ft_index, fio->ft_nob); + RETURN(0); +} +EXPORT_SYMBOL(osc_io_fault_start); + + +static int osc_async_upcall(void *a, int rc) +{ + struct osc_async_cbargs *args = a; + + args->opc_rc = rc; + complete(&args->opc_sync); + return 0; +} + +/** + * Checks that there are no pages being written in the extent being truncated. 
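osc_trunc_check() below derives the first affected page index from the new size together with a "partial" flag saying whether that page is only partially truncated; cl_index() and cl_offset() do the byte/page conversions in the real code. The equivalent arithmetic with plain 4 KiB pages, as an illustration only:

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

int main(void)
{
        unsigned long long size = 10000;                /* new file size in bytes */
        unsigned long start = size >> PAGE_SHIFT;       /* cl_index(obj, size) */
        int partial = (start << PAGE_SHIFT) < size;     /* cl_offset(obj, start) < size */

        /* pages >= start + partial are dropped entirely; when partial is set,
         * page "start" keeps its first (size % PAGE_SIZE) bytes */
        printf("truncate to %llu: first whole page dropped %lu, partial head page: %s\n",
               size, start + partial, partial ? "yes" : "no");
        return 0;
}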
+ */ +static bool trunc_check_cb(const struct lu_env *env, struct cl_io *io, + void **pvec, int count, void *cbdata) +{ + int i; + + for (i = 0; i < count; i++) { + struct osc_page *ops = pvec[i]; + struct cl_page *page = ops->ops_cl.cpl_page; + struct osc_async_page *oap; + __u64 start = *(__u64 *)cbdata; + + oap = &ops->ops_oap; + if (oap->oap_cmd & OBD_BRW_WRITE && + !list_empty(&oap->oap_pending_item)) + CL_PAGE_DEBUG(D_ERROR, env, page, "exists %llu/%s.\n", + start, current->comm); + + if (PageLocked(page->cp_vmpage)) + CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n", + ops, osc_index(ops), + oap->oap_cmd & OBD_BRW_RWMASK); + } + return true; +} + +static void osc_trunc_check(const struct lu_env *env, struct cl_io *io, + struct osc_io *oio, __u64 size) +{ + struct cl_object *clob; + int partial; + pgoff_t start; + + clob = oio->oi_cl.cis_obj; + start = cl_index(clob, size); + partial = cl_offset(clob, start) < size; + + /* + * Complain if there are pages in the truncated region. + */ + osc_page_gang_lookup(env, io, cl2osc(clob), + start + partial, CL_PAGE_EOF, + trunc_check_cb, (void *)&size); +} + +/** + * Flush affected pages prior punch. + * We shouldn't discard them locally first because that could be data loss + * if server doesn't support fallocate punch, we also need these data to be + * flushed first to prevent re-ordering with the punch + */ +int osc_punch_start(const struct lu_env *env, struct cl_io *io, + struct cl_object *obj) +{ + struct osc_object *osc = cl2osc(obj); + pgoff_t pg_start = cl_index(obj, io->u.ci_setattr.sa_falloc_offset); + pgoff_t pg_end = cl_index(obj, io->u.ci_setattr.sa_falloc_end - 1); + int rc; + + ENTRY; + rc = osc_cache_writeback_range(env, osc, pg_start, pg_end, 1, 0); + if (rc < 0) + RETURN(rc); + + osc_page_gang_lookup(env, io, osc, pg_start, pg_end, osc_discard_cb, + osc); + RETURN(0); +} +EXPORT_SYMBOL(osc_punch_start); + +static int osc_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + unsigned int ia_avalid = io->u.ci_setattr.sa_avalid; + enum op_xvalid ia_xvalid = io->u.ci_setattr.sa_xvalid; + int result = 0; + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + bool io_is_falloc = cl_io_is_fallocate(io); + + ENTRY; + /* truncate cache dirty pages first */ + if (cl_io_is_trunc(io)) + result = osc_cache_truncate_start(env, cl2osc(obj), size, + &oio->oi_trunc); + /* flush local pages prior punching them on server */ + if (io_is_falloc && + io->u.ci_setattr.sa_falloc_mode & FALLOC_FL_PUNCH_HOLE) + result = osc_punch_start(env, io, obj); + + if (result == 0 && oio->oi_lockless == 0) { + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; + unsigned int cl_valid = 0; + + if (ia_avalid & ATTR_SIZE) { + attr->cat_size = size; + attr->cat_kms = size; + cl_valid = (CAT_SIZE | CAT_KMS); + } + if (ia_avalid & ATTR_MTIME_SET) { + attr->cat_mtime = lvb->lvb_mtime; + cl_valid |= CAT_MTIME; + } + if (ia_avalid & ATTR_ATIME_SET) { + attr->cat_atime = lvb->lvb_atime; + cl_valid |= CAT_ATIME; + } + if (ia_xvalid & OP_XVALID_CTIME_SET) { + attr->cat_ctime = lvb->lvb_ctime; + cl_valid |= CAT_CTIME; + } + result = 
cl_object_attr_update(env, obj, attr, + cl_valid); + } + cl_object_attr_unlock(obj); + } + memset(oa, 0, sizeof(*oa)); + if (result == 0) { + oa->o_oi = loi->loi_oi; + obdo_set_parent_fid(oa, io->u.ci_setattr.sa_parent_fid); + oa->o_stripe_idx = io->u.ci_setattr.sa_stripe_index; + oa->o_layout = io->u.ci_setattr.sa_layout; + oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP | + OBD_MD_FLOSTLAYOUT; + if (ia_avalid & ATTR_CTIME) { + oa->o_valid |= OBD_MD_FLCTIME; + oa->o_ctime = attr->cat_ctime; + } + if (ia_avalid & ATTR_ATIME) { + oa->o_valid |= OBD_MD_FLATIME; + oa->o_atime = attr->cat_atime; + } + if (ia_avalid & ATTR_MTIME) { + oa->o_valid |= OBD_MD_FLMTIME; + oa->o_mtime = attr->cat_mtime; + } + + if (ia_avalid & ATTR_SIZE || io_is_falloc) { + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } + + if (io->ci_layout_version > 0) { + /* verify layout version */ + oa->o_valid |= OBD_MD_LAYOUT_VERSION; + oa->o_layout_version = io->ci_layout_version; + } + } else { + LASSERT(oio->oi_lockless == 0); + } + + if (ia_xvalid & OP_XVALID_FLAGS) { + oa->o_flags = io->u.ci_setattr.sa_attr_flags; + oa->o_valid |= OBD_MD_FLFLAGS; + } + + init_completion(&cbargs->opc_sync); + + if (io_is_falloc) { + int falloc_mode = io->u.ci_setattr.sa_falloc_mode; + + oa->o_size = io->u.ci_setattr.sa_falloc_offset; + oa->o_blocks = io->u.ci_setattr.sa_falloc_end; + oa->o_uid = io->u.ci_setattr.sa_falloc_uid; + oa->o_gid = io->u.ci_setattr.sa_falloc_gid; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | + OBD_MD_FLUID | OBD_MD_FLGID; + + CDEBUG(D_INODE, "size %llu blocks %llu uid %u gid %u\n", + oa->o_size, oa->o_blocks, oa->o_uid, oa->o_gid); + result = osc_fallocate_base(osc_export(cl2osc(obj)), + oa, osc_async_upcall, + cbargs, falloc_mode); + } else if (ia_avalid & ATTR_SIZE) { + oa->o_size = size; + oa->o_blocks = OBD_OBJECT_EOF; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + result = osc_punch_send(osc_export(cl2osc(obj)), + oa, osc_async_upcall, cbargs); + } else { + result = osc_setattr_async(osc_export(cl2osc(obj)), + oa, osc_async_upcall, + cbargs, PTLRPCD_SET); + } + cbargs->opc_rpc_sent = result == 0; + } + + RETURN(result); +} + +void osc_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + unsigned int cl_valid = 0; + int result = 0; + + if (cbargs->opc_rpc_sent) { + wait_for_completion(&cbargs->opc_sync); + result = io->ci_result = cbargs->opc_rc; + } + + if (cl_io_is_trunc(io)) { + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + + if (result == 0) { + cl_object_attr_lock(obj); + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + cl_valid |= CAT_BLOCKS; + } + + cl_object_attr_update(env, obj, attr, cl_valid); + cl_object_attr_unlock(obj); + } + osc_trunc_check(env, io, oio, size); + osc_cache_truncate_end(env, oio->oi_trunc); + oio->oi_trunc = NULL; + } + + if (cl_io_is_fallocate(io)) { + if (result == 0) { + cl_object_attr_lock(obj); + /* update blocks */ + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + cl_valid |= CAT_BLOCKS; + } + + cl_object_attr_update(env, obj, attr, cl_valid); + cl_object_attr_unlock(obj); + } + } +} +EXPORT_SYMBOL(osc_io_setattr_end); + +struct osc_data_version_args { + struct osc_io *dva_oio; +}; + 
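The setattr, fallocate and punch paths above all follow the same start/end split: the *_start() method fires an asynchronous request with osc_async_upcall() as its completion callback, and the matching *_end() method waits on cbargs->opc_sync before picking up opc_rc. A minimal pthread-based sketch of that pattern, with a condition variable standing in for the kernel's struct completion (none of these names are the Lustre API):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct cbargs {                                 /* plays the role of osc_async_cbargs */
        int             rc;
        int             done;
        pthread_mutex_t lock;
        pthread_cond_t  cond;
};

static void upcall(struct cbargs *args, int rc) /* osc_async_upcall() analogue */
{
        pthread_mutex_lock(&args->lock);
        args->rc = rc;
        args->done = 1;
        pthread_cond_signal(&args->cond);
        pthread_mutex_unlock(&args->lock);
}

static void *fake_rpc(void *arg)                /* pretend server round trip */
{
        usleep(10000);
        upcall(arg, 0);
        return NULL;
}

int main(void)
{
        struct cbargs args = { .lock = PTHREAD_MUTEX_INITIALIZER,
                               .cond = PTHREAD_COND_INITIALIZER };
        pthread_t t;

        /* "start" phase: kick off the request and return immediately */
        pthread_create(&t, NULL, fake_rpc, &args);

        /* "end" phase: the wait_for_completion() equivalent, then pick up the rc */
        pthread_mutex_lock(&args.lock);
        while (!args.done)
                pthread_cond_wait(&args.cond, &args.lock);
        pthread_mutex_unlock(&args.lock);

        printf("setattr-style RPC completed, rc = %d\n", args.rc);
        pthread_join(t, NULL);
        return 0;
}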
+static int +osc_data_version_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *args, int rc) +{ + struct osc_data_version_args *dva = args; + struct osc_io *oio = dva->dva_oio; + const struct ost_body *body; + + ENTRY; + if (rc < 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, &oio->oi_oa, + &body->oa); + EXIT; +out: + oio->oi_cbarg.opc_rc = rc; + complete(&oio->oi_cbarg.opc_sync); + + return 0; +} + +static int osc_io_data_version_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct osc_object *obj = cl2osc(slice->cis_obj); + struct lov_oinfo *loi = obj->oo_oinfo; + struct obd_export *exp = osc_export(obj); + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_data_version_args *dva; + int rc; + + ENTRY; + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + if (dv->dv_flags & (LL_DV_RD_FLUSH | LL_DV_WR_FLUSH)) { + oa->o_valid |= OBD_MD_FLFLAGS; + oa->o_flags |= OBD_FL_SRVLOCK; + if (dv->dv_flags & LL_DV_WR_FLUSH) + oa->o_flags |= OBD_FL_FLUSH; + } + + init_completion(&cbargs->opc_sync); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = osc_data_version_interpret; + dva = ptlrpc_req_async_args(dva, req); + dva->dva_oio = oio; + + ptlrpcd_add_req(req); + + RETURN(0); +} + +static void osc_io_data_version_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + unsigned int cl_valid = 0; + + ENTRY; + wait_for_completion(&cbargs->opc_sync); + + if (cbargs->opc_rc != 0) { + slice->cis_io->ci_result = cbargs->opc_rc; + } else { + slice->cis_io->ci_result = 0; + if (!(oa->o_valid & + (OBD_MD_LAYOUT_VERSION | OBD_MD_FLDATAVERSION))) + slice->cis_io->ci_result = -ENOTSUPP; + + if (oa->o_valid & OBD_MD_LAYOUT_VERSION) + dv->dv_layout_version = oa->o_layout_version; + if (oa->o_valid & OBD_MD_FLDATAVERSION) + dv->dv_data_version = oa->o_data_version; + + if (dv->dv_flags & LL_DV_SZ_UPDATE) { + if (oa->o_valid & OBD_MD_FLSIZE) { + attr->cat_size = oa->o_size; + cl_valid |= CAT_SIZE; + } + + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + cl_valid |= CAT_BLOCKS; + } + + cl_object_attr_lock(obj); + cl_object_attr_update(env, obj, attr, cl_valid); + cl_object_attr_unlock(obj); + } + } + + EXIT; +} + +int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_object *obj = slice->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int rc = 0; + ENTRY; + + if (!slice->cis_io->ci_noatime) 
{ + cl_object_attr_lock(obj); + attr->cat_atime = ktime_get_real_seconds(); + rc = cl_object_attr_update(env, obj, attr, CAT_ATIME); + cl_object_attr_unlock(obj); + } + + RETURN(rc); +} +EXPORT_SYMBOL(osc_io_read_start); + +int osc_io_write_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_object *obj = slice->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int rc = 0; + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1); + cl_object_attr_lock(obj); + attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds(); + rc = cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME); + cl_object_attr_unlock(obj); + + RETURN(rc); +} +EXPORT_SYMBOL(osc_io_write_start); + +int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, + struct cl_fsync_io *fio) +{ + struct osc_io *oio = osc_env_io(env); + struct obdo *oa = &oio->oi_oa; + struct lov_oinfo *loi = obj->oo_oinfo; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int rc = 0; + ENTRY; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + /* reload size and blocks for start and end of sync range */ + oa->o_size = fio->fi_start; + oa->o_blocks = fio->fi_end; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + + obdo_set_parent_fid(oa, fio->fi_fid); + + init_completion(&cbargs->opc_sync); + + rc = osc_sync_base(obj, oa, osc_async_upcall, cbargs, PTLRPCD_SET); + RETURN(rc); +} +EXPORT_SYMBOL(osc_fsync_ost); + +int osc_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct cl_fsync_io *fio = &io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + struct osc_object *osc = cl2osc(obj); + pgoff_t start = cl_index(obj, fio->fi_start); + pgoff_t end = cl_index(obj, fio->fi_end); + int result = 0; + ENTRY; + + if (fio->fi_end == OBD_OBJECT_EOF) + end = CL_PAGE_EOF; + + result = osc_cache_writeback_range(env, osc, start, end, 0, + fio->fi_mode == CL_FSYNC_DISCARD); + if (result > 0) { + fio->fi_nr_written += result; + result = 0; + } + if (fio->fi_mode == CL_FSYNC_ALL) { + int rc; + + /* we have to wait for writeback to finish before we can + * send OST_SYNC RPC. This is bad because it causes extents + * to be written osc by osc. However, we usually start + * writeback before CL_FSYNC_ALL so this won't have any real + * problem.
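osc_io_fsync_start() above converts the byte range of the fsync request into page indices and maps the OBD_OBJECT_EOF sentinel to CL_PAGE_EOF before handing the range to osc_cache_writeback_range(). The conversion, sketched with assumed sentinel values and 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT      12
#define OBD_OBJECT_EOF  (~0ULL)                 /* assumed "whole object" marker */
#define CL_PAGE_EOF     (~0UL)                  /* assumed "last page" marker */

static void fsync_range_to_pages(unsigned long long fi_start,
                                 unsigned long long fi_end)
{
        unsigned long start = fi_start >> PAGE_SHIFT;   /* cl_index(obj, fi_start) */
        unsigned long end = fi_end >> PAGE_SHIFT;       /* cl_index(obj, fi_end) */

        if (fi_end == OBD_OBJECT_EOF)                   /* sync to end of object */
                end = CL_PAGE_EOF;

        printf("fsync bytes [%llu, %llu] -> pages [%lu, %lu]\n",
               fi_start, fi_end, start, end);
}

int main(void)
{
        fsync_range_to_pages(8192, 65535);              /* bounded range */
        fsync_range_to_pages(0, OBD_OBJECT_EOF);        /* whole file */
        return 0;
}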
*/ + rc = osc_cache_wait_range(env, osc, start, end); + if (result == 0) + result = rc; + rc = osc_fsync_ost(env, osc, fio); + if (result == 0) + result = rc; + } + + RETURN(result); +} + +void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + pgoff_t start = cl_index(obj, fio->fi_start); + pgoff_t end = cl_index(obj, fio->fi_end); + int result = 0; + + if (fio->fi_mode == CL_FSYNC_LOCAL) { + result = osc_cache_wait_range(env, cl2osc(obj), start, end); + } else if (fio->fi_mode == CL_FSYNC_ALL) { + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + + wait_for_completion(&cbargs->opc_sync); + if (result == 0) + result = cbargs->opc_rc; + } + slice->cis_io->ci_result = result; +} +EXPORT_SYMBOL(osc_io_fsync_end); + +static int osc_io_ladvise_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + int result = 0; + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_ladvise_io *lio = &io->u.ci_ladvise; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct lu_ladvise *ladvise; + struct ladvise_hdr *ladvise_hdr; + int buf_size; + int num_advise = 1; + ENTRY; + + /* TODO: add multiple ladvise support in CLIO */ + buf_size = offsetof(typeof(*ladvise_hdr), lah_advise[num_advise]); + if (osc_env_info(env)->oti_ladvise_buf.lb_len < buf_size) + lu_buf_realloc(&osc_env_info(env)->oti_ladvise_buf, buf_size); + + ladvise_hdr = osc_env_info(env)->oti_ladvise_buf.lb_buf; + if (ladvise_hdr == NULL) + RETURN(-ENOMEM); + + memset(ladvise_hdr, 0, buf_size); + ladvise_hdr->lah_magic = LADVISE_MAGIC; + ladvise_hdr->lah_count = num_advise; + ladvise_hdr->lah_flags = lio->li_flags; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID; + obdo_set_parent_fid(oa, lio->li_fid); + + ladvise = ladvise_hdr->lah_advise; + ladvise->lla_start = lio->li_start; + ladvise->lla_end = lio->li_end; + ladvise->lla_advice = lio->li_advice; + + if (lio->li_flags & LF_ASYNC) { + result = osc_ladvise_base(osc_export(cl2osc(obj)), oa, + ladvise_hdr, NULL, NULL, NULL); + } else { + init_completion(&cbargs->opc_sync); + result = osc_ladvise_base(osc_export(cl2osc(obj)), oa, + ladvise_hdr, osc_async_upcall, + cbargs, PTLRPCD_SET); + cbargs->opc_rpc_sent = result == 0; + } + RETURN(result); +} + +static void osc_io_ladvise_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int result = 0; + struct cl_ladvise_io *lio = &io->u.ci_ladvise; + + if ((!(lio->li_flags & LF_ASYNC)) && cbargs->opc_rpc_sent) { + wait_for_completion(&cbargs->opc_sync); + result = cbargs->opc_rc; + } + slice->cis_io->ci_result = result; +} + +void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice) +{ + struct osc_io *oio = cl2osc_io(env, slice); + + if (oio->oi_active) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } +} +EXPORT_SYMBOL(osc_io_end); + +struct osc_lseek_args { + struct osc_io *lsa_oio; +}; + +static int osc_lseek_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc) +{ + struct ost_body *reply; + struct osc_lseek_args *lsa = arg; + 
struct osc_io *oio = lsa->lsa_oio; + struct cl_io *io = oio->oi_cl.cis_io; + struct cl_lseek_io *lsio = &io->u.ci_lseek; + + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + reply = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (reply == NULL) + GOTO(out, rc = -EPROTO); + + lsio->ls_result = reply->oa.o_size; +out: + osc_async_upcall(&oio->oi_cbarg, rc); + RETURN(rc); +} + +int osc_io_lseek_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_lseek_io *lsio = &io->u.ci_lseek; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct obd_export *exp = osc_export(cl2osc(obj)); + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_lseek_args *lsa; + int rc = 0; + + ENTRY; + + /* No negative values at this point */ + LASSERT(lsio->ls_start >= 0); + LASSERT(lsio->ls_whence == SEEK_HOLE || lsio->ls_whence == SEEK_DATA); + + /* with IO lock taken we have object size in LVB and can check + * boundaries prior sending LSEEK RPC + */ + if (lsio->ls_start >= loi->loi_lvb.lvb_size) { + /* consider area beyond end of object as hole */ + if (lsio->ls_whence == SEEK_HOLE) + lsio->ls_result = lsio->ls_start; + else + lsio->ls_result = -ENXIO; + RETURN(0); + } + + /* if LSEEK RPC is not supported by server, consider whole stripe + * object is data with hole after end of object + */ + if (!exp_connect_lseek(exp)) { + if (lsio->ls_whence == SEEK_HOLE) + lsio->ls_result = loi->loi_lvb.lvb_size; + else + lsio->ls_result = lsio->ls_start; + RETURN(0); + } + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + oa->o_size = lsio->ls_start; + oa->o_mode = lsio->ls_whence; + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } + + init_completion(&cbargs->opc_sync); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SEEK); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SEEK); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = osc_lseek_interpret; + lsa = ptlrpc_req_async_args(lsa, req); + lsa->lsa_oio = oio; + + ptlrpcd_add_req(req); + cbargs->opc_rpc_sent = 1; + + RETURN(0); +} +EXPORT_SYMBOL(osc_io_lseek_start); + +void osc_io_lseek_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int rc = 0; + + if (cbargs->opc_rpc_sent) { + wait_for_completion(&cbargs->opc_sync); + rc = cbargs->opc_rc; + } + slice->cis_io->ci_result = rc; +} +EXPORT_SYMBOL(osc_io_lseek_end); + +int osc_io_lru_reserve(const struct lu_env *env, + const struct cl_io_slice *ios, + loff_t pos, size_t bytes) +{ + struct osc_object *osc = cl2osc(ios->cis_obj); + struct osc_io *oio = osc_env_io(env); + unsigned long npages = 0; + size_t page_offset; + + ENTRY; + + page_offset = pos & ~PAGE_MASK; + if (page_offset) { + ++npages; + if (bytes > PAGE_SIZE - page_offset) + bytes -= (PAGE_SIZE - page_offset); + else + bytes = 0; + } + npages += (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + oio->oi_lru_reserved = 
osc_lru_reserve(osc_cli(osc), npages); + + RETURN(0); +} +EXPORT_SYMBOL(osc_io_lru_reserve); + +static const struct cl_io_operations osc_io_ops = { + .op = { + [CIT_READ] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_rw_iter_fini, + .cio_start = osc_io_read_start, + .cio_fini = osc_io_fini + }, + [CIT_WRITE] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_rw_iter_fini, + .cio_start = osc_io_write_start, + .cio_end = osc_io_end, + .cio_fini = osc_io_fini + }, + [CIT_SETATTR] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_setattr_start, + .cio_end = osc_io_setattr_end + }, + [CIT_DATA_VERSION] = { + .cio_start = osc_io_data_version_start, + .cio_end = osc_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_fault_start, + .cio_end = osc_io_end, + .cio_fini = osc_io_fini + }, + [CIT_FSYNC] = { + .cio_start = osc_io_fsync_start, + .cio_end = osc_io_fsync_end, + .cio_fini = osc_io_fini + }, + [CIT_LADVISE] = { + .cio_start = osc_io_ladvise_start, + .cio_end = osc_io_ladvise_end, + .cio_fini = osc_io_fini + }, + [CIT_LSEEK] = { + .cio_start = osc_io_lseek_start, + .cio_end = osc_io_lseek_end, + .cio_fini = osc_io_fini + }, + [CIT_MISC] = { + .cio_fini = osc_io_fini + } + }, + .cio_read_ahead = osc_io_read_ahead, + .cio_lru_reserve = osc_io_lru_reserve, + .cio_submit = osc_io_submit, + .cio_commit_async = osc_io_commit_async, + .cio_extent_release = osc_io_extent_release +}; + +/***************************************************************************** + * + * Transfer operations. + * + */ + +int osc_io_init(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + + CL_IO_SLICE_CLEAN(oio, oi_cl); + cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops); + return 0; +} + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_lock.c b/drivers/staging/lustrefsx/lustre/osc/osc_lock.c new file mode 100644 index 0000000000000..dbf8cde90317f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_lock.c @@ -0,0 +1,1300 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_lock for OSC layer. 
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +/* fid_build_reg_res_name() */ +#include +#include + +#include "osc_internal.h" + +/** \addtogroup osc + * @{ + */ + +/** + * Returns a weak pointer to the ldlm lock identified by a handle. Returned + * pointer cannot be dereferenced, as lock is not protected from concurrent + * reclaim. This function is a helper for osc_lock_invariant(). + */ +static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(handle); + if (lock != NULL) + LDLM_LOCK_PUT(lock); + return lock; +} + +/** + * Invariant that has to be true all of the time. + */ +static int osc_lock_invariant(struct osc_lock *ols) +{ + struct ldlm_lock *lock = osc_handle_ptr(&ols->ols_handle); + struct ldlm_lock *olock = ols->ols_dlmlock; + int handle_used = lustre_handle_is_used(&ols->ols_handle); + + if (ergo(osc_lock_is_lockless(ols), + ols->ols_locklessable && ols->ols_dlmlock == NULL)) + return 1; + + /* + * If all the following "ergo"s are true, return 1, otherwise 0 + */ + if (! ergo(olock != NULL, handle_used)) + return 0; + + if (! ergo(olock != NULL, + olock->l_handle.h_cookie == ols->ols_handle.cookie)) + return 0; + + if (! ergo(handle_used, + ergo(lock != NULL && olock != NULL, lock == olock) && + ergo(lock == NULL, olock == NULL))) + return 0; + /* + * Check that ->ols_handle and ->ols_dlmlock are consistent, but + * take into account that they are set at the different time. + */ + if (! ergo(ols->ols_state == OLS_CANCELLED, + olock == NULL && !handle_used)) + return 0; + /* + * DLM lock is destroyed only after we have seen cancellation + * ast. + */ + if (! ergo(olock != NULL && ols->ols_state < OLS_CANCELLED, + !ldlm_is_destroyed(olock))) + return 0; + + if (! ergo(ols->ols_state == OLS_GRANTED, + olock != NULL && + ldlm_is_granted(olock) && + ols->ols_hold)) + return 0; + return 1; +} + +/***************************************************************************** + * + * Lock operations. + * + */ + +void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + + LINVRNT(osc_lock_invariant(ols)); + LASSERT(ols->ols_dlmlock == NULL); + + OBD_SLAB_FREE_PTR(ols, osc_lock_kmem); +} +EXPORT_SYMBOL(osc_lock_fini); + +static void osc_lock_build_policy(const struct lu_env *env, + const struct cl_lock *lock, + union ldlm_policy_data *policy) +{ + const struct cl_lock_descr *d = &lock->cll_descr; + + osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end); + policy->l_extent.gid = d->cld_gid; +} + +/** + * Updates object attributes from a lock value block (lvb) received together + * with the DLM lock reply from the server. Copy of osc_update_enqueue() + * logic. + * + * Called under lock and resource spin-locks. 
+ */ +void osc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb) +{ + struct cl_object *obj = osc2cl(osc); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned valid, setkms = 0; + + ENTRY; + + valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE; + if (lvb == NULL) { + LASSERT(dlmlock != NULL); + lvb = dlmlock->l_lvb_data; + } + cl_lvb2attr(attr, lvb); + + cl_object_attr_lock(obj); + if (dlmlock != NULL) { + __u64 size; + + check_res_locked(dlmlock->l_resource); + + LASSERT(lvb == dlmlock->l_lvb_data); + size = lvb->lvb_size; + + /* Extend KMS up to the end of this lock and no further + * A lock on [x,y] means a KMS of up to y + 1 bytes! */ + if (size > dlmlock->l_policy_data.l_extent.end) + size = dlmlock->l_policy_data.l_extent.end + 1; + if (size >= oinfo->loi_kms) { + valid |= CAT_KMS; + attr->cat_kms = size; + setkms = 1; + } + ldlm_lock_allow_match_locked(dlmlock); + } + + /* The size should not be less than the kms */ + if (attr->cat_size < oinfo->loi_kms) + attr->cat_size = oinfo->loi_kms; + + LDLM_DEBUG(dlmlock, "acquired size %llu, setting rss=%llu;%s " + "kms=%llu, end=%llu", lvb->lvb_size, attr->cat_size, + setkms ? "" : " leaving", + setkms ? attr->cat_kms : oinfo->loi_kms, + dlmlock ? dlmlock->l_policy_data.l_extent.end : -1ull); + + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + + EXIT; +} + +static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, + struct lustre_handle *lockh) +{ + struct osc_object *osc = cl2osc(oscl->ols_cl.cls_obj); + struct ldlm_lock *dlmlock; + + dlmlock = ldlm_handle2lock_long(lockh, 0); + LASSERT(dlmlock != NULL); + + /* lock reference taken by ldlm_handle2lock_long() is + * owned by osc_lock and released in osc_lock_detach() + */ + lu_ref_add_atomic(&dlmlock->l_reference, "osc_lock", oscl); + oscl->ols_has_ref = 1; + + LASSERT(oscl->ols_dlmlock == NULL); + oscl->ols_dlmlock = dlmlock; + + /* This may be a matched lock for glimpse request, do not hold + * lock reference in that case. */ + if (!oscl->ols_glimpse) { + /* hold a refc for non glimpse lock which will + * be released in osc_lock_cancel() */ + lustre_handle_copy(&oscl->ols_handle, lockh); + ldlm_lock_addref(lockh, oscl->ols_einfo.ei_mode); + oscl->ols_hold = 1; + } + + /* Lock must have been granted. */ + lock_res_and_lock(dlmlock); + if (ldlm_is_granted(dlmlock)) { + struct ldlm_extent *ext = &dlmlock->l_policy_data.l_extent; + struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; + + /* extend the lock extent, otherwise it will have problem when + * we decide whether to grant a lockless lock. */ + descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); + descr->cld_start = cl_index(descr->cld_obj, ext->start); + descr->cld_end = cl_index(descr->cld_obj, ext->end); + descr->cld_gid = ext->gid; + + /* no lvb update for matched lock */ + if (!ldlm_is_lvb_cached(dlmlock)) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + LASSERT(osc == dlmlock->l_ast_data); + osc_lock_lvb_update(env, osc, dlmlock, NULL); + ldlm_set_lvb_cached(dlmlock); + } + LINVRNT(osc_lock_invariant(oscl)); + } + unlock_res_and_lock(dlmlock); + + LASSERT(oscl->ols_state != OLS_GRANTED); + oscl->ols_state = OLS_GRANTED; +} + +/** + * Lock upcall function that is executed either when a reply to ENQUEUE rpc is + * received from a server, or after osc_enqueue_base() matched a local DLM + * lock. 
+ */ +static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, + int errcode) +{ + struct osc_lock *oscl = cookie; + struct cl_lock_slice *slice = &oscl->ols_cl; + struct lu_env *env; + int rc; + + ENTRY; + + env = cl_env_percpu_get(); + /* should never happen, similar to osc_ldlm_blocking_ast(). */ + LASSERT(!IS_ERR(env)); + + rc = ldlm_error2errno(errcode); + if (oscl->ols_state == OLS_ENQUEUED) { + oscl->ols_state = OLS_UPCALL_RECEIVED; + } else if (oscl->ols_state == OLS_CANCELLED) { + rc = -EIO; + } else { + CERROR("Impossible state: %d\n", oscl->ols_state); + LBUG(); + } + + if (rc == 0) + osc_lock_granted(env, oscl, lockh); + + /* Error handling, some errors are tolerable. */ + if (oscl->ols_glimpse && rc == -ENAVAIL) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + osc_lock_lvb_update(env, cl2osc(slice->cls_obj), + NULL, &oscl->ols_lvb); + /* Hide the error. */ + rc = 0; + } else if (rc < 0 && oscl->ols_flags & LDLM_FL_NDELAY) { + rc = -EAGAIN; + } + + if (oscl->ols_owner != NULL) + cl_sync_io_note(env, oscl->ols_owner, rc); + cl_env_percpu_put(env); + + RETURN(rc); +} + +static int osc_lock_upcall_speculative(void *cookie, + struct lustre_handle *lockh, + int errcode) +{ + struct osc_object *osc = cookie; + struct ldlm_lock *dlmlock; + struct lu_env *env; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + LASSERT(!IS_ERR(env)); + + if (errcode == ELDLM_LOCK_MATCHED) + GOTO(out, errcode = ELDLM_OK); + + if (errcode != ELDLM_OK) + GOTO(out, errcode); + + dlmlock = ldlm_handle2lock(lockh); + LASSERT(dlmlock != NULL); + + lock_res_and_lock(dlmlock); + LASSERT(ldlm_is_granted(dlmlock)); + + /* there is no osc_lock associated with speculative locks + * thus no need to set LDLM_FL_LVB_CACHED */ + osc_lock_lvb_update(env, osc, dlmlock, NULL); + + unlock_res_and_lock(dlmlock); + LDLM_LOCK_PUT(dlmlock); + +out: + cl_object_put(env, osc2cl(osc)); + cl_env_put(env, &refcheck); + RETURN(ldlm_error2errno(errcode)); +} + +static int osc_lock_flush(struct osc_object *obj, pgoff_t start, pgoff_t end, + enum cl_lock_mode mode, bool discard) +{ + struct lu_env *env; + __u16 refcheck; + int rc = 0; + int rc2 = 0; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + if (mode == CLM_WRITE) { + rc = osc_cache_writeback_range(env, obj, start, end, 1, + discard); + CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n", + obj, start, end, rc, + discard ? "discarded" : "written back"); + if (rc > 0) + rc = 0; + } + + /* + * Do not try to match other locks with CLM_WRITE since we already + * know there're none + */ + rc2 = osc_lock_discard_pages(env, obj, start, end, + mode == CLM_WRITE || discard); + if (rc == 0 && rc2 < 0) + rc = rc2; + + cl_env_put(env, &refcheck); + RETURN(rc); +} + +/** + * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock + * and ldlm_lock caches. 
+ */ +static int osc_dlm_blocking_ast0(const struct lu_env *env, + struct ldlm_lock *dlmlock, + void *data, int flag) +{ + struct cl_object *obj = NULL; + int result = 0; + bool discard; + enum cl_lock_mode mode = CLM_READ; + ENTRY; + + LASSERT(flag == LDLM_CB_CANCELING); + + lock_res_and_lock(dlmlock); + if (!ldlm_is_granted(dlmlock)) { + dlmlock->l_ast_data = NULL; + unlock_res_and_lock(dlmlock); + RETURN(0); + } + + discard = ldlm_is_discard_data(dlmlock); + if (dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP)) + mode = CLM_WRITE; + + if (dlmlock->l_ast_data != NULL) { + obj = osc2cl(dlmlock->l_ast_data); + cl_object_get(obj); + } + + unlock_res_and_lock(dlmlock); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_CANCEL, 5); + + /* if l_ast_data is NULL, the dlmlock was enqueued by AGL or + * the object has been destroyed. */ + if (obj != NULL) { + struct ldlm_extent *extent = &dlmlock->l_policy_data.l_extent; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + __u64 old_kms; + + /* Destroy pages covered by the extent of the DLM lock */ + result = osc_lock_flush(cl2osc(obj), + cl_index(obj, extent->start), + cl_index(obj, extent->end), + mode, discard); + + /* losing a lock, update kms */ + lock_res_and_lock(dlmlock); + /* clearing l_ast_data after flushing data, + * to let glimpse ast find the lock and the object */ + dlmlock->l_ast_data = NULL; + cl_object_attr_lock(obj); + /* Must get the value under the lock to avoid race. */ + old_kms = cl2osc(obj)->oo_oinfo->loi_kms; + /* Update the kms. Need to loop all granted locks. + * Not a problem for the client */ + attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms); + + cl_object_attr_update(env, obj, attr, CAT_KMS); + cl_object_attr_unlock(obj); + unlock_res_and_lock(dlmlock); + + cl_object_put(env, obj); + } + RETURN(result); +} + +/** + * Blocking ast invoked by ldlm when dlm lock is either blocking progress of + * some other lock, or is canceled. This function is installed as a + * ldlm_lock::l_blocking_ast() for client extent locks. + * + * Control flow is tricky, because ldlm uses the same call-back + * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's. + * + * \param dlmlock lock for which ast occurred. + * + * \param new description of a conflicting lock in case of blocking ast. + * + * \param data value of dlmlock->l_ast_data + * + * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish + * cancellation and blocking ast's. + * + * Possible use cases: + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel + * lock due to lock lru pressure, or explicit user request to purge + * locks. + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify + * us that dlmlock conflicts with another lock that some client is + * enqueuing. Lock is canceled. + * + * - cl_lock_cancel() is called. osc_lock_cancel() calls + * ldlm_cli_cancel() that calls + * + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + * recursively entering osc_ldlm_blocking_ast(). 
+ * + * - client cancels lock voluntary (e.g., as a part of early cancellation): + * + * cl_lock_cancel()-> + * osc_lock_cancel()-> + * ldlm_cli_cancel()-> + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + */ +static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, + int flag) +{ + int result = 0; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: { + struct lustre_handle lockh; + + ldlm_lock2handle(dlmlock, &lockh); + result = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (result == -ENODATA) + result = 0; + break; + } + case LDLM_CB_CANCELING: { + struct lu_env *env; + __u16 refcheck; + + /* + * This can be called in the context of outer IO, e.g., + * + * osc_enqueue_base()->... + * ->ldlm_prep_elc_req()->... + * ->ldlm_cancel_callback()->... + * ->osc_ldlm_blocking_ast() + * + * new environment has to be created to not corrupt outer + * context. + */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + result = PTR_ERR(env); + break; + } + + result = osc_dlm_blocking_ast0(env, dlmlock, data, flag); + cl_env_put(env, &refcheck); + break; + } + default: + LBUG(); + } + RETURN(result); +} + +int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +{ + struct ptlrpc_request *req = data; + struct lu_env *env; + struct ost_lvb *lvb; + struct req_capsule *cap; + struct cl_object *obj = NULL; + struct ldlm_resource *res = dlmlock->l_resource; + struct ldlm_match_data matchdata = { 0 }; + union ldlm_policy_data policy; + enum ldlm_mode mode = LCK_PW | LCK_GROUP | LCK_PR; + int result; + __u16 refcheck; + + ENTRY; + + LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, result = PTR_ERR(env)); + + policy.l_extent.start = 0; + policy.l_extent.end = LUSTRE_EOF; + + matchdata.lmd_mode = &mode; + matchdata.lmd_policy = &policy; + matchdata.lmd_flags = LDLM_FL_TEST_LOCK | LDLM_FL_CBPENDING; + matchdata.lmd_match = LDLM_MATCH_UNREF | LDLM_MATCH_AST_ANY; + + LDLM_LOCK_GET(dlmlock); + + /* If any dlmlock has l_ast_data set, we must find it or we risk + * missing a size update done under a different lock. + */ + while (dlmlock) { + lock_res_and_lock(dlmlock); + if (dlmlock->l_ast_data) { + obj = osc2cl(dlmlock->l_ast_data); + cl_object_get(obj); + } + unlock_res_and_lock(dlmlock); + LDLM_LOCK_RELEASE(dlmlock); + + dlmlock = NULL; + + if (obj == NULL && res->lr_type == LDLM_EXTENT) { + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_SIZE_DATA)) + break; + + lock_res(res); + dlmlock = search_itree(res, &matchdata); + unlock_res(res); + } + } + + if (obj != NULL) { + /* Do not grab the mutex of cl_lock for glimpse. + * See LU-1274 for details. + * BTW, it's okay for cl_lock to be cancelled during + * this period because server can handle this race. + * See ldlm_server_glimpse_ast() for details. 
+ * cl_lock_mutex_get(env, lock); */ + cap = &req->rq_pill; + req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK); + req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER, + sizeof *lvb); + result = req_capsule_server_pack(cap); + if (result == 0) { + lvb = req_capsule_server_get(cap, &RMF_DLM_LVB); + result = cl_object_glimpse(env, obj, lvb); + } + if (!exp_connect_lvb_type(req->rq_export)) + req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB, + sizeof(struct ost_lvb_v1), RCL_SERVER); + cl_object_put(env, obj); + } else { + /* + * These errors are normal races, so we don't want to + * fill the console with messages by calling + * ptlrpc_error() + */ + lustre_pack_reply(req, 1, NULL, NULL); + result = -ELDLM_NO_LOCK_DATA; + } + cl_env_put(env, &refcheck); + EXIT; + +out: + req->rq_status = result; + RETURN(result); +} +EXPORT_SYMBOL(osc_ldlm_glimpse_ast); + +static bool weigh_cb(const struct lu_env *env, struct cl_io *io, + void **pvec, int count, void *cbdata) +{ + int i; + + for (i = 0; i < count; i++) { + struct osc_page *ops = pvec[i]; + struct cl_page *page = ops->ops_cl.cpl_page; + + if (cl_page_is_vmlocked(env, page) || + PageDirty(page->cp_vmpage) || + PageWriteback(page->cp_vmpage)) + return false; + + *(pgoff_t *)cbdata = osc_index(ops) + 1; + } + return true; +} + +static unsigned long osc_lock_weight(const struct lu_env *env, + struct osc_object *oscobj, + loff_t start, loff_t end) +{ + struct cl_io *io = osc_env_thread_io(env); + struct cl_object *obj = cl_object_top(&oscobj->oo_cl); + pgoff_t page_index; + int result; + + ENTRY; + + io->ci_obj = obj; + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + RETURN(1); + + page_index = cl_index(obj, start); + + if (!osc_page_gang_lookup(env, io, oscobj, + page_index, cl_index(obj, end), + weigh_cb, (void *)&page_index)) + result = 1; + cl_io_fini(env, io); + + return result; +} + +/** + * Get the weight of dlm lock for early cancellation. + */ +unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) +{ + struct lu_env *env; + struct osc_object *obj; + struct osc_lock *oscl; + unsigned long weight; + bool found = false; + __u16 refcheck; + + ENTRY; + + might_sleep(); + /* + * osc_ldlm_weigh_ast has a complex context since it might be called + * because of lock canceling, or from user's input. We have to make + * a new environment for it. Probably it is implementation safe to use + * the upper context because cl_lock_put don't modify environment + * variables. But just in case .. + */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + /* Mostly because lack of memory, do not eliminate this lock */ + RETURN(1); + + LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT || + dlmlock->l_resource->lr_type == LDLM_IBITS); + + lock_res_and_lock(dlmlock); + obj = dlmlock->l_ast_data; + if (obj) + cl_object_get(osc2cl(obj)); + unlock_res_and_lock(dlmlock); + + if (obj == NULL) + GOTO(out, weight = 0); + + spin_lock(&obj->oo_ol_spin); + list_for_each_entry(oscl, &obj->oo_ol_list, ols_nextlock_oscobj) { + if (oscl->ols_dlmlock == dlmlock) { + found = true; + break; + } + } + spin_unlock(&obj->oo_ol_spin); + if (found) { + /* + * If the lock is being used by an IO, definitely not cancel it. 
+ */ + GOTO(out, weight = 1); + } + + if (dlmlock->l_resource->lr_type == LDLM_EXTENT) + weight = osc_lock_weight(env, obj, + dlmlock->l_policy_data.l_extent.start, + dlmlock->l_policy_data.l_extent.end); + else if (ldlm_has_dom(dlmlock)) + weight = osc_lock_weight(env, obj, 0, OBD_OBJECT_EOF); + /* The DOM bit can be cancelled at any time; in that case, we know + * there are no pages, so just return weight of 0 + */ + else + weight = 0; + + EXIT; + +out: + if (obj) + cl_object_put(env, osc2cl(obj)); + + cl_env_put(env, &refcheck); + return weight; +} +EXPORT_SYMBOL(osc_ldlm_weigh_ast); + +static void osc_lock_build_einfo(const struct lu_env *env, + const struct cl_lock *lock, + struct osc_object *osc, + struct ldlm_enqueue_info *einfo) +{ + einfo->ei_type = LDLM_EXTENT; + einfo->ei_mode = osc_cl_lock2ldlm(lock->cll_descr.cld_mode); + einfo->ei_cb_bl = osc_ldlm_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = osc_ldlm_glimpse_ast; + einfo->ei_cbdata = osc; /* value to be put into ->l_ast_data */ +} + +/** + * Determine if the lock should be converted into a lockless lock. + * + * Steps to check: + * - if the lock has an explicite requirment for a non-lockless lock; + * - if the io lock request type ci_lockreq; + * - send the enqueue rpc to ost to make the further decision; + * - special treat to truncate lockless lock + * + * Additional policy can be implemented here, e.g., never do lockless-io + * for large extents. + */ +void osc_lock_to_lockless(const struct lu_env *env, + struct osc_lock *ols, int force) +{ + struct cl_lock_slice *slice = &ols->ols_cl; + struct osc_io *oio = osc_env_io(env); + struct cl_io *io = oio->oi_cl.cis_io; + struct cl_object *obj = slice->cls_obj; + struct osc_object *oob = cl2osc(obj); + struct obd_connect_data *ocd; + + LASSERT(ols->ols_state == OLS_NEW || + ols->ols_state == OLS_UPCALL_RECEIVED); + + if (force) { + ols->ols_locklessable = 1; + slice->cls_ops = ols->ols_lockless_ops; + } else { + LASSERT(io->ci_lockreq == CILR_MANDATORY || + io->ci_lockreq == CILR_MAYBE || + io->ci_lockreq == CILR_NEVER); + + ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; + ols->ols_locklessable = (io->ci_type != CIT_SETATTR) && + (io->ci_lockreq == CILR_MAYBE) && + (ocd->ocd_connect_flags & + OBD_CONNECT_SRVLOCK); + if (io->ci_lockreq == CILR_NEVER) { + ols->ols_locklessable = 1; + slice->cls_ops = ols->ols_lockless_ops; + } + } + LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); +} +EXPORT_SYMBOL(osc_lock_to_lockless); + +static bool osc_lock_compatible(const struct osc_lock *qing, + const struct osc_lock *qed) +{ + struct cl_lock_descr *qed_descr = &qed->ols_cl.cls_lock->cll_descr; + struct cl_lock_descr *qing_descr = &qing->ols_cl.cls_lock->cll_descr; + + if (qed->ols_glimpse || qed->ols_speculative) + return true; + + if (qing_descr->cld_mode == CLM_READ && qed_descr->cld_mode == CLM_READ) + return true; + + if (qed->ols_state < OLS_GRANTED) + return true; + + if (qed_descr->cld_mode >= qing_descr->cld_mode && + qed_descr->cld_start <= qing_descr->cld_start && + qed_descr->cld_end >= qing_descr->cld_end) + return true; + + return false; +} + +void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc, + struct osc_lock *oscl) +{ + struct osc_lock *scan; + + spin_lock(&osc->oo_ol_spin); + list_del_init(&oscl->ols_nextlock_oscobj); + spin_unlock(&osc->oo_ol_spin); + + spin_lock(&oscl->ols_lock); + while ((scan = list_first_entry_or_null(&oscl->ols_waiting_list, + struct osc_lock, + ols_wait_entry)) != NULL) 
{ + list_del_init(&scan->ols_wait_entry); + + cl_sync_io_note(env, scan->ols_owner, 0); + } + spin_unlock(&oscl->ols_lock); +} +EXPORT_SYMBOL(osc_lock_wake_waiters); + +int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj, + struct osc_lock *oscl) +{ + struct osc_lock *tmp_oscl; + struct cl_lock_descr *need = &oscl->ols_cl.cls_lock->cll_descr; + struct cl_sync_io *waiter = &osc_env_info(env)->oti_anchor; + int rc = 0; + + ENTRY; + + spin_lock(&obj->oo_ol_spin); + list_add_tail(&oscl->ols_nextlock_oscobj, &obj->oo_ol_list); + +restart: + list_for_each_entry(tmp_oscl, &obj->oo_ol_list, + ols_nextlock_oscobj) { + struct cl_lock_descr *descr; + + if (tmp_oscl == oscl) + break; + + descr = &tmp_oscl->ols_cl.cls_lock->cll_descr; + if (descr->cld_start > need->cld_end || + descr->cld_end < need->cld_start) + continue; + + /* We're not supposed to give up group lock */ + if (descr->cld_mode == CLM_GROUP) + break; + + if (!osc_lock_is_lockless(oscl) && + osc_lock_compatible(oscl, tmp_oscl)) + continue; + + /* wait for conflicting lock to be canceled */ + cl_sync_io_init(waiter, 1); + oscl->ols_owner = waiter; + + spin_lock(&tmp_oscl->ols_lock); + /* add oscl into tmp's ols_waiting list */ + list_add_tail(&oscl->ols_wait_entry, + &tmp_oscl->ols_waiting_list); + spin_unlock(&tmp_oscl->ols_lock); + + spin_unlock(&obj->oo_ol_spin); + rc = cl_sync_io_wait(env, waiter, 0); + spin_lock(&obj->oo_ol_spin); + + if (rc < 0) + break; + + oscl->ols_owner = NULL; + goto restart; + } + spin_unlock(&obj->oo_ol_spin); + + RETURN(rc); +} +EXPORT_SYMBOL(osc_lock_enqueue_wait); + +/** + * Implementation of cl_lock_operations::clo_enqueue() method for osc + * layer. This initiates ldlm enqueue: + * + * - cancels conflicting locks early (osc_lock_enqueue_wait()); + * + * - calls osc_enqueue_base() to do actual enqueue. + * + * osc_enqueue_base() is supplied with an upcall function that is executed + * when lock is received either after a local cached ldlm lock is matched, or + * when a reply from the server is received. + * + * This function does not wait for the network communication to complete. 
+ */ +static int osc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, struct cl_sync_io *anchor) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct osc_lock *oscl = cl2osc_lock(slice); + struct obd_export *exp = osc_export(osc); + struct cl_lock *lock = slice->cls_lock; + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + osc_enqueue_upcall_f upcall = osc_lock_upcall; + void *cookie = oscl; + bool async = false; + int result; + + ENTRY; + + LASSERTF(ergo(oscl->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), + "lock = %p, ols = %p\n", lock, oscl); + + if (oscl->ols_state == OLS_GRANTED) + RETURN(0); + + if ((oscl->ols_flags & LDLM_FL_NO_EXPANSION) && + !exp_connect_lockahead(exp)) { + result = -EOPNOTSUPP; + CERROR("%s: server does not support lockahead/locknoexpand: rc = %d\n", + exp->exp_obd->obd_name, result); + RETURN(result); + } + + if (oscl->ols_flags & LDLM_FL_TEST_LOCK) + GOTO(enqueue_base, 0); + + /* For glimpse and/or speculative locks, do not wait for reply from + * server on LDLM request */ + if (oscl->ols_glimpse || oscl->ols_speculative) { + /* Speculative and glimpse locks do not have an anchor */ + LASSERT(equi(oscl->ols_speculative, anchor == NULL)); + async = true; + GOTO(enqueue_base, 0); + } + + result = osc_lock_enqueue_wait(env, osc, oscl); + if (result < 0) + GOTO(out, result); + + /* we can grant lockless lock right after all conflicting locks + * are canceled. */ + if (osc_lock_is_lockless(oscl)) { + oscl->ols_state = OLS_GRANTED; + oio->oi_lockless = 1; + RETURN(0); + } + +enqueue_base: + oscl->ols_state = OLS_ENQUEUED; + if (anchor != NULL) { + atomic_inc(&anchor->csi_sync_nr); + oscl->ols_owner = anchor; + } + + /** + * DLM lock's ast data must be osc_object; + * if glimpse or speculative lock, async of osc_enqueue_base() + * must be true + * + * For non-speculative locks: + * DLM's enqueue callback set to osc_lock_upcall() with cookie as + * osc_lock. + * For speculative locks: + * osc_lock_upcall_speculative & cookie is the osc object, since + * there is no osc_lock + */ + ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); + osc_lock_build_policy(env, lock, policy); + if (oscl->ols_speculative) { + oscl->ols_einfo.ei_cbdata = NULL; + /* hold a reference for callback */ + cl_object_get(osc2cl(osc)); + upcall = osc_lock_upcall_speculative; + cookie = osc; + } + result = osc_enqueue_base(exp, resname, &oscl->ols_flags, + policy, &oscl->ols_lvb, + upcall, cookie, + &oscl->ols_einfo, PTLRPCD_SET, async, + oscl->ols_speculative); + if (result == 0) { + if (osc_lock_is_lockless(oscl)) { + oio->oi_lockless = 1; + } else if (!async) { + if (OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_IDLE_RACE)) { + OBD_RACE(OBD_FAIL_PTLRPC_IDLE_RACE); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 2); + } + LASSERT(oscl->ols_state == OLS_GRANTED); + LASSERT(oscl->ols_hold); + LASSERT(oscl->ols_dlmlock != NULL); + } + } else if (oscl->ols_speculative) { + cl_object_put(env, osc2cl(osc)); + if (oscl->ols_glimpse) { + /* hide error for AGL request */ + result = 0; + } + } + +out: + if (result < 0) { + oscl->ols_state = OLS_CANCELLED; + osc_lock_wake_waiters(env, osc, oscl); + + if (anchor != NULL) + cl_sync_io_note(env, anchor, result); + } + RETURN(result); +} + +/** + * Breaks a link between osc_lock and dlm_lock. 
+ */ +static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck) +{ + struct ldlm_lock *dlmlock; + + ENTRY; + + dlmlock = olck->ols_dlmlock; + if (dlmlock == NULL) + RETURN_EXIT; + + if (olck->ols_hold) { + olck->ols_hold = 0; + ldlm_lock_decref(&olck->ols_handle, olck->ols_einfo.ei_mode); + olck->ols_handle.cookie = 0ULL; + } + + olck->ols_dlmlock = NULL; + + /* release a reference taken in osc_lock_upcall(). */ + LASSERT(olck->ols_has_ref); + lu_ref_del(&dlmlock->l_reference, "osc_lock", olck); + LDLM_LOCK_RELEASE(dlmlock); + olck->ols_has_ref = 0; + + EXIT; +} + +/** + * Implements cl_lock_operations::clo_cancel() method for osc layer. This is + * called (as part of cl_lock_cancel()) when lock is canceled either voluntary + * (LRU pressure, early cancellation, umount, etc.) or due to the conflict + * with some other lock some where in the cluster. This function does the + * following: + * + * - invalidates all pages protected by this lock (after sending dirty + * ones to the server, as necessary); + * + * - decref's underlying ldlm lock; + * + * - cancels ldlm lock (ldlm_cli_cancel()). + */ +void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_object *obj = cl2osc(slice->cls_obj); + struct osc_lock *oscl = cl2osc_lock(slice); + + ENTRY; + + LINVRNT(osc_lock_invariant(oscl)); + + osc_lock_detach(env, oscl); + oscl->ols_state = OLS_CANCELLED; + oscl->ols_flags &= ~LDLM_FL_LVB_READY; + + osc_lock_wake_waiters(env, obj, oscl); + EXIT; +} +EXPORT_SYMBOL(osc_lock_cancel); + +int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + (*p)(env, cookie, "%p %#llx %#llx %d %p ", + lock->ols_dlmlock, lock->ols_flags, lock->ols_handle.cookie, + lock->ols_state, lock->ols_owner); + osc_lvb_print(env, cookie, p, &lock->ols_lvb); + return 0; +} +EXPORT_SYMBOL(osc_lock_print); + +static const struct cl_lock_operations osc_lock_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = osc_lock_enqueue, + .clo_cancel = osc_lock_cancel, + .clo_print = osc_lock_print, +}; + +static void osc_lock_lockless_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct osc_object *osc = cl2osc(slice->cls_obj); + + LASSERT(ols->ols_dlmlock == NULL); + osc_lock_wake_waiters(env, osc, ols); +} + +static const struct cl_lock_operations osc_lock_lockless_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = osc_lock_enqueue, + .clo_cancel = osc_lock_lockless_cancel, + .clo_print = osc_lock_print +}; + +void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io, + struct cl_object *obj, struct osc_lock *oscl) +{ + struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; + pgoff_t io_start; + pgoff_t io_end; + + if (!cl_object_same(io->ci_obj, obj)) + return; + + if (likely(io->ci_type == CIT_WRITE)) { + io_start = cl_index(obj, io->u.ci_rw.crw_pos); + io_end = cl_index(obj, io->u.ci_rw.crw_pos + + io->u.ci_rw.crw_count - 1); + } else { + LASSERT(cl_io_is_mkwrite(io)); + io_start = io_end = io->u.ci_fault.ft_index; + } + + if (descr->cld_mode >= CLM_WRITE && + (cl_io_is_append(io) || + (descr->cld_start <= io_start && descr->cld_end >= io_end))) { + struct osc_io *oio = osc_env_io(env); + + /* There must be only one lock to match the write region */ + LASSERT(oio->oi_write_osclock == NULL); + oio->oi_write_osclock = oscl; + } +} +EXPORT_SYMBOL(osc_lock_set_writer); + 
+int osc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io) +{ + struct osc_lock *oscl; + __u32 enqflags = lock->cll_descr.cld_enq_flags; + + OBD_SLAB_ALLOC_PTR_GFP(oscl, osc_lock_kmem, GFP_NOFS); + if (oscl == NULL) + return -ENOMEM; + + oscl->ols_state = OLS_NEW; + spin_lock_init(&oscl->ols_lock); + INIT_LIST_HEAD(&oscl->ols_waiting_list); + INIT_LIST_HEAD(&oscl->ols_wait_entry); + INIT_LIST_HEAD(&oscl->ols_nextlock_oscobj); + oscl->ols_lockless_ops = &osc_lock_lockless_ops; + + /* Speculative lock requests must be either no_expand or glimpse + * request (CEF_GLIMPSE). non-glimpse no_expand speculative extent + * locks will break ofd_intent_cb. (see comment there)*/ + LASSERT(ergo((enqflags & CEF_SPECULATIVE) != 0, + (enqflags & (CEF_LOCK_NO_EXPAND | CEF_GLIMPSE)) != 0)); + + oscl->ols_flags = osc_enq2ldlm_flags(enqflags); + oscl->ols_speculative = !!(enqflags & CEF_SPECULATIVE); + if (lock->cll_descr.cld_mode == CLM_GROUP) + oscl->ols_flags |= LDLM_FL_ATOMIC_CB; + + if (oscl->ols_flags & LDLM_FL_HAS_INTENT) { + oscl->ols_flags |= LDLM_FL_BLOCK_GRANTED; + oscl->ols_glimpse = 1; + } + if (io->ci_ndelay && cl_object_same(io->ci_obj, obj)) + oscl->ols_flags |= LDLM_FL_NDELAY; + osc_lock_build_einfo(env, lock, cl2osc(obj), &oscl->ols_einfo); + + cl_lock_slice_add(lock, &oscl->ols_cl, obj, &osc_lock_ops); + + if (!(enqflags & CEF_MUST)) + /* try to convert this lock to a lockless lock */ + osc_lock_to_lockless(env, oscl, (enqflags & CEF_NEVER)); + + if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) + osc_lock_set_writer(env, io, obj, oscl); + + LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %#llx", + lock, oscl, oscl->ols_flags); + + return 0; +} + +/** + * Finds an existing lock covering given index and optionally different from a + * given \a except lock. 
+ */ +struct ldlm_lock *osc_obj_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags dap_flags) +{ + struct osc_thread_info *info = osc_env_info(env); + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + struct lustre_handle lockh; + struct ldlm_lock *lock = NULL; + enum ldlm_mode mode; + __u64 flags; + enum ldlm_match_flags match_flags = 0; + + ENTRY; + + ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname); + osc_index2policy(policy, osc2cl(obj), index, index); + policy->l_extent.gid = LDLM_GID_ANY; + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING; + if (dap_flags & OSC_DAP_FL_TEST_LOCK) + flags |= LDLM_FL_TEST_LOCK; + + if (dap_flags & OSC_DAP_FL_AST) + match_flags |= LDLM_MATCH_AST; + + if (dap_flags & OSC_DAP_FL_CANCELING) + match_flags |= LDLM_MATCH_UNREF; + + if (dap_flags & OSC_DAP_FL_RIGHT) + match_flags |= LDLM_MATCH_RIGHT; + + /* + * It is fine to match any group lock since there could be only one + * with a uniq gid and it conflicts with all other lock modes too + */ +again: + mode = osc_match_base(env, osc_export(obj), resname, LDLM_EXTENT, + policy, LCK_PR | LCK_PW | LCK_GROUP, &flags, + obj, &lockh, match_flags); + if (mode != 0) { + lock = ldlm_handle2lock(&lockh); + /* RACE: the lock is cancelled so let's try again */ + if (unlikely(lock == NULL)) + goto again; + } + + RETURN(lock); +} +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_object.c b/drivers/staging/lustrefsx/lustre/osc/osc_object.c new file mode 100644 index 0000000000000..aa116260c3475 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_object.c @@ -0,0 +1,497 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_object for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC +#include +#include + +#include "osc_internal.h" + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * Object operations. 
+ * + */ +static void osc_obj_build_res_name(struct osc_object *osc, + struct ldlm_res_id *resname) +{ + ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); +} + +static const struct osc_object_operations osc_object_ops = { + .oto_build_res_name = osc_obj_build_res_name, + .oto_dlmlock_at_pgoff = osc_obj_dlmlock_at_pgoff, +}; + +int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct osc_object *osc = lu2osc(obj); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + + osc->oo_oinfo = cconf->u.coc_oinfo; +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK + mutex_init(&osc->oo_debug_mutex); +#endif + INIT_LIST_HEAD(&osc->oo_ready_item); + INIT_LIST_HEAD(&osc->oo_hp_ready_item); + INIT_LIST_HEAD(&osc->oo_write_item); + INIT_LIST_HEAD(&osc->oo_read_item); + + osc->oo_root.rb_node = NULL; + INIT_LIST_HEAD(&osc->oo_hp_exts); + INIT_LIST_HEAD(&osc->oo_urgent_exts); + INIT_LIST_HEAD(&osc->oo_full_exts); + INIT_LIST_HEAD(&osc->oo_reading_exts); + atomic_set(&osc->oo_nr_reads, 0); + atomic_set(&osc->oo_nr_writes, 0); + spin_lock_init(&osc->oo_lock); + spin_lock_init(&osc->oo_tree_lock); + spin_lock_init(&osc->oo_ol_spin); + INIT_LIST_HEAD(&osc->oo_ol_list); + + atomic_set(&osc->oo_nr_ios, 0); + init_waitqueue_head(&osc->oo_io_waitq); + + LASSERT(osc->oo_obj_ops != NULL); + + cl_object_page_init(lu2cl(obj), sizeof(struct osc_page)); + + return 0; +} +EXPORT_SYMBOL(osc_object_init); + +void osc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + + LASSERT(list_empty(&osc->oo_ready_item)); + LASSERT(list_empty(&osc->oo_hp_ready_item)); + LASSERT(list_empty(&osc->oo_write_item)); + LASSERT(list_empty(&osc->oo_read_item)); + + LASSERT(osc->oo_root.rb_node == NULL); + LASSERT(list_empty(&osc->oo_hp_exts)); + LASSERT(list_empty(&osc->oo_urgent_exts)); + LASSERT(list_empty(&osc->oo_full_exts)); + LASSERT(list_empty(&osc->oo_reading_exts)); + LASSERT(atomic_read(&osc->oo_nr_reads) == 0); + LASSERT(atomic_read(&osc->oo_nr_writes) == 0); + LASSERT(list_empty(&osc->oo_ol_list)); + LASSERT(atomic_read(&osc->oo_nr_ios) == 0); + + lu_object_fini(obj); + /* osc doen't contain an lu_object_header, so we don't need call_rcu */ + OBD_SLAB_FREE_PTR(osc, osc_object_kmem); +} +EXPORT_SYMBOL(osc_object_free); + +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb) +{ + return (*p)(env, cookie, "size: %llu mtime: %llu atime: %llu " + "ctime: %llu blocks: %llu", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); +} +EXPORT_SYMBOL(osc_lvb_print); + +int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct osc_async_rc *ar = &oinfo->loi_ar; + + (*p)(env, cookie, "id: "DOSTID" " + "idx: %d gen: %d kms_valid: %u kms %llu " + "rc: %d force_sync: %d min_xid: %llu ", + POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx, + oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms, + ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid); + osc_lvb_print(env, cookie, p, &oinfo->loi_lvb); + return 0; +} +EXPORT_SYMBOL(osc_object_print); + + +int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + cl_lvb2attr(attr, &oinfo->loi_lvb); + attr->cat_kms = oinfo->loi_kms_valid ? 
oinfo->loi_kms : 0; + return 0; +} +EXPORT_SYMBOL(osc_attr_get); + +int osc_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + struct ost_lvb *lvb = &oinfo->loi_lvb; + + if (valid & CAT_SIZE) + lvb->lvb_size = attr->cat_size; + if (valid & CAT_MTIME) + lvb->lvb_mtime = attr->cat_mtime; + if (valid & CAT_ATIME) + lvb->lvb_atime = attr->cat_atime; + if (valid & CAT_CTIME) + lvb->lvb_ctime = attr->cat_ctime; + if (valid & CAT_BLOCKS) + lvb->lvb_blocks = attr->cat_blocks; + if (valid & CAT_KMS) { + CDEBUG(D_CACHE, "set kms from %lluto %llu\n", + oinfo->loi_kms, (__u64)attr->cat_kms); + loi_kms_set(oinfo, attr->cat_kms); + } + return 0; +} +EXPORT_SYMBOL(osc_attr_update); + +int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + lvb->lvb_size = oinfo->loi_kms; + lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks; + return 0; +} +EXPORT_SYMBOL(osc_object_glimpse); + +static int osc_object_ast_clear(struct ldlm_lock *lock, void *data) +{ + struct osc_object *osc = (struct osc_object *)data; + struct ost_lvb *lvb = lock->l_lvb_data; + struct lov_oinfo *oinfo; + ENTRY; + + if (lock->l_ast_data == data) { + lock->l_ast_data = NULL; + + LASSERT(osc != NULL); + LASSERT(osc->oo_oinfo != NULL); + LASSERT(lvb != NULL); + + /* Updates lvb in lock by the cached oinfo */ + oinfo = osc->oo_oinfo; + + LDLM_DEBUG(lock, "update lock size %llu blocks %llu [cma]time: " + "%llu %llu %llu by oinfo size %llu blocks %llu " + "[cma]time %llu %llu %llu", lvb->lvb_size, + lvb->lvb_blocks, lvb->lvb_ctime, lvb->lvb_mtime, + lvb->lvb_atime, oinfo->loi_lvb.lvb_size, + oinfo->loi_lvb.lvb_blocks, oinfo->loi_lvb.lvb_ctime, + oinfo->loi_lvb.lvb_mtime, oinfo->loi_lvb.lvb_atime); + LASSERTF(oinfo->loi_lvb.lvb_size >= oinfo->loi_kms, + "lvb_size %#llx, loi_kms %#llx\n", + oinfo->loi_lvb.lvb_size, oinfo->loi_kms); + + cl_object_attr_lock(&osc->oo_cl); + memcpy(lvb, &oinfo->loi_lvb, sizeof(oinfo->loi_lvb)); + cl_object_attr_unlock(&osc->oo_cl); + ldlm_clear_lvb_cached(lock); + } + RETURN(LDLM_ITER_CONTINUE); +} + +int osc_object_prune(const struct lu_env *env, struct cl_object *obj) +{ + struct osc_object *osc = cl2osc(obj); + struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; + + /* DLM locks don't hold a reference of osc_object so we have to + * clear it before the object is being destroyed. 
*/ + osc_build_res_name(osc, resname); + ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, + osc_object_ast_clear, osc); + return 0; +} +EXPORT_SYMBOL(osc_object_prune); + +static int osc_object_fiemap(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, + struct fiemap *fiemap, size_t *buflen) +{ + struct obd_export *exp = osc_export(cl2osc(obj)); + struct ldlm_res_id resid; + union ldlm_policy_data policy; + struct lustre_handle lockh; + enum ldlm_mode mode = LCK_MINMODE; + struct ptlrpc_request *req; + struct fiemap *reply; + char *tmp; + int rc; + ENTRY; + + fmkey->lfik_oa.o_oi = cl2osc(obj)->oo_oinfo->loi_oi; + if (!(fmkey->lfik_fiemap.fm_flags & FIEMAP_FLAG_SYNC)) + goto skip_locking; + + policy.l_extent.start = fmkey->lfik_fiemap.fm_start & PAGE_MASK; + + if (OBD_OBJECT_EOF - fmkey->lfik_fiemap.fm_length <= + fmkey->lfik_fiemap.fm_start + PAGE_SIZE - 1) + policy.l_extent.end = OBD_OBJECT_EOF; + else + policy.l_extent.end = (fmkey->lfik_fiemap.fm_start + + fmkey->lfik_fiemap.fm_length + + PAGE_SIZE - 1) & PAGE_MASK; + + ostid_build_res_name(&fmkey->lfik_oa.o_oi, &resid); + mode = ldlm_lock_match(exp->exp_obd->obd_namespace, + LDLM_FL_BLOCK_GRANTED | LDLM_FL_LVB_READY, + &resid, LDLM_EXTENT, &policy, + LCK_PR | LCK_PW, &lockh); + if (mode) { /* lock is cached on client */ + if (mode != LCK_PR) { + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, LCK_PW); + } + } else { /* no cached lock, needs acquire lock on server side */ + fmkey->lfik_oa.o_valid |= OBD_MD_FLFLAGS; + fmkey->lfik_oa.o_flags |= OBD_FL_SRVLOCK; + } + +skip_locking: + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_OST_GET_INFO_FIEMAP); + if (req == NULL) + GOTO(drop_lock, rc = -ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY, RCL_CLIENT, + sizeof(*fmkey)); + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_CLIENT, + *buflen); + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_SERVER, + *buflen); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO); + if (rc != 0) { + ptlrpc_request_free(req); + GOTO(drop_lock, rc); + } + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY); + memcpy(tmp, fmkey, sizeof(*fmkey)); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL); + memcpy(tmp, fiemap, *buflen); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc != 0) + GOTO(fini_req, rc); + + reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL); + if (reply == NULL) + GOTO(fini_req, rc = -EPROTO); + + memcpy(fiemap, reply, *buflen); +fini_req: + ptlrpc_req_finished(req); +drop_lock: + if (mode) + ldlm_lock_decref(&lockh, LCK_PR); + RETURN(rc); +} + +#define MAX_OSC_DLMLOCK_LOOKUP 3 +/** + * Implementation of struct cl_object_operations::coo_req_attr_set() for osc + * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq + * fields. + */ +static void osc_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct lov_oinfo *oinfo; + struct obdo *oa; + struct ost_lvb *lvb; + u64 flags = attr->cra_flags; + + oinfo = cl2osc(obj)->oo_oinfo; + lvb = &oinfo->loi_lvb; + oa = attr->cra_oa; + + if ((flags & OBD_MD_FLMTIME) != 0) { + oa->o_mtime = lvb->lvb_mtime; + oa->o_valid |= OBD_MD_FLMTIME; + } + /* XXX: + * I don't understand this part, what for OSC resets atime just + * set by VVP layer to 0 so that OST gets 0 instead of actual + * atime, bzzz. please inspect this place with extra care. 
+ */ + if ((flags & OBD_MD_FLATIME) && lvb->lvb_atime > oa->o_atime) { + oa->o_atime = lvb->lvb_atime; + oa->o_valid |= OBD_MD_FLATIME; + } + if ((flags & OBD_MD_FLCTIME) != 0) { + oa->o_ctime = lvb->lvb_ctime; + oa->o_valid |= OBD_MD_FLCTIME; + } + if (flags & OBD_MD_FLGROUP) { + ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi)); + oa->o_valid |= OBD_MD_FLGROUP; + } + if (flags & OBD_MD_FLID) { + int rc; + + rc = ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi)); + if (rc) { + CERROR("Bad %llu to set " DOSTID " : rc %d\n", + (unsigned long long)ostid_id(&oinfo->loi_oi), + POSTID(&oa->o_oi), rc); + } + oa->o_valid |= OBD_MD_FLID; + } + if (flags & OBD_MD_FLHANDLE) { + struct ldlm_lock *lock; + struct osc_page *opg; + int retry_cnt = 0; + + opg = osc_cl_page_osc(attr->cra_page, cl2osc(obj)); +lookup: + lock = osc_dlmlock_at_pgoff(env, cl2osc(obj), osc_index(opg), + OSC_DAP_FL_TEST_LOCK | OSC_DAP_FL_CANCELING); + if (lock == NULL && !opg->ops_srvlock) { + struct ldlm_resource *res; + struct ldlm_res_id *resname; + + if (retry_cnt < MAX_OSC_DLMLOCK_LOOKUP) { + /* the code is racing, delay to be sure to be + * out of it and try again, value based on + * debugging timing. */ + CERROR("Uncovered page by a LDLM lock, " + "retrying %d\n", ++retry_cnt); + smp_mb(); + mdelay(50); + goto lookup; + } + + CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page, + "uncovered page!\n"); + + resname = &osc_env_info(env)->oti_resname; + ostid_build_res_name(&oinfo->loi_oi, resname); + res = ldlm_resource_get( + osc_export(cl2osc(obj))->exp_obd->obd_namespace, + NULL, resname, LDLM_EXTENT, 0); + if (IS_ERR(res)) + CERROR("No lock resource\n"); + else + ldlm_resource_dump(D_ERROR, res); + + libcfs_debug_dumpstack(NULL); + LBUG(); + } + + /* check for lockless io. */ + if (lock != NULL) { + oa->o_handle = lock->l_remote_handle; + oa->o_valid |= OBD_MD_FLHANDLE; + LDLM_LOCK_PUT(lock); + } + } +} + +static const struct cl_object_operations osc_ops = { + .coo_page_init = osc_page_init, + .coo_lock_init = osc_lock_init, + .coo_io_init = osc_io_init, + .coo_attr_get = osc_attr_get, + .coo_attr_update = osc_attr_update, + .coo_glimpse = osc_object_glimpse, + .coo_prune = osc_object_prune, + .coo_fiemap = osc_object_fiemap, + .coo_req_attr_set = osc_req_attr_set +}; + +static const struct lu_object_operations osc_lu_obj_ops = { + .loo_object_init = osc_object_init, + .loo_object_release = NULL, + .loo_object_free = osc_object_free, + .loo_object_print = osc_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct osc_object *osc; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, GFP_NOFS); + if (osc != NULL) { + obj = osc2lu(osc); + lu_object_init(obj, NULL, dev); + osc->oo_cl.co_ops = &osc_ops; + obj->lo_ops = &osc_lu_obj_ops; + osc->oo_obj_ops = &osc_object_ops; + } else + obj = NULL; + return obj; +} + +int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc) +{ + ENTRY; + + CDEBUG(D_INODE, "Invalidate osc object: %p, # of active IOs: %d\n", + osc, atomic_read(&osc->oo_nr_ios)); + + wait_event_idle(osc->oo_io_waitq, atomic_read(&osc->oo_nr_ios) == 0); + + /* Discard all dirty pages of this object. */ + osc_cache_truncate_start(env, osc, 0, NULL); + + /* Discard all caching pages */ + osc_lock_discard_pages(env, osc, 0, CL_PAGE_EOF, true); + + /* Clear ast data of dlm lock. 
Do this after discarding all pages */ + cl_object_prune(env, osc2cl(osc)); + + RETURN(0); +} +EXPORT_SYMBOL(osc_object_invalidate); +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_page.c b/drivers/staging/lustrefsx/lustre/osc/osc_page.c new file mode 100644 index 0000000000000..fa5d86e0f2ea4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_page.c @@ -0,0 +1,1158 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_page for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC +#include + +#include "osc_internal.h" + +static void osc_lru_del(struct client_obd *cli, struct osc_page *opg); +static void osc_lru_use(struct client_obd *cli, struct osc_page *opg); +static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, + struct osc_page *opg); + +/** \addtogroup osc + * @{ + */ + +/* + * Page operations. + */ +static void osc_page_transfer_get(struct osc_page *opg, const char *label) +{ + struct cl_page *page = opg->ops_cl.cpl_page; + + LASSERT(!opg->ops_transfer_pinned); + cl_page_get(page); + lu_ref_add_atomic(&page->cp_reference, label, page); + opg->ops_transfer_pinned = 1; +} + +static void osc_page_transfer_put(const struct lu_env *env, + struct osc_page *opg) +{ + struct cl_page *page = opg->ops_cl.cpl_page; + + if (opg->ops_transfer_pinned) { + opg->ops_transfer_pinned = 0; + lu_ref_del(&page->cp_reference, "transfer", page); + cl_page_put(env, page); + } +} + +/** + * This is called once for every page when it is submitted for a transfer + * either opportunistic (osc_page_cache_add()), or immediate + * (osc_page_submit()). 
+ */ +static void osc_page_transfer_add(const struct lu_env *env, + struct osc_page *opg, enum cl_req_type crt) +{ + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + + osc_lru_use(osc_cli(obj), opg); +} + +int osc_page_cache_add(const struct lu_env *env, struct osc_page *opg, + struct cl_io *io, cl_commit_cbt cb) +{ + int result; + ENTRY; + + osc_page_transfer_get(opg, "transfer\0cache"); + result = osc_queue_async_io(env, io, opg, cb); + if (result != 0) + osc_page_transfer_put(env, opg); + else + osc_page_transfer_add(env, opg, CRT_WRITE); + + RETURN(result); +} + +void osc_index2policy(union ldlm_policy_data *policy, + const struct cl_object *obj, pgoff_t start, pgoff_t end) +{ + memset(policy, 0, sizeof *policy); + policy->l_extent.start = cl_offset(obj, start); + policy->l_extent.end = cl_offset(obj, end + 1) - 1; +} + +static inline s64 osc_submit_duration(struct osc_page *opg) +{ + if (ktime_to_ns(opg->ops_submit_time) == 0) + return 0; + + return ktime_ms_delta(ktime_get(), opg->ops_submit_time); +} + +static int osc_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + struct osc_object *obj = cl2osc(slice->cpl_obj); + struct client_obd *cli = &osc_export(obj)->exp_obd->u.cli; + + return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p %lu: " + "1< %#x %d %c %c > " + "2< %lld %u %u %#x %#x | %p %p %p > " + "3< %d %lld %d > " + "4< %d %d %d %lu %c | %c %c %c %c > " + "5< %c %c %c %c | %d %c | %d %c %c>\n", + opg, osc_index(opg), + /* 1 */ + oap->oap_magic, oap->oap_cmd, + list_empty_marker(&oap->oap_pending_item), + list_empty_marker(&oap->oap_rpc_item), + /* 2 */ + oap->oap_obj_off, oap->oap_page_off, oap->oap_count, + oap->oap_async_flags, oap->oap_brw_flags, + oap->oap_request, oap->oap_cli, obj, + /* 3 */ + opg->ops_transfer_pinned, + osc_submit_duration(opg), opg->ops_srvlock, + /* 4 */ + cli->cl_r_in_flight, cli->cl_w_in_flight, + cli->cl_max_rpcs_in_flight, + cli->cl_avail_grant, + waitqueue_active(&cli->cl_cache_waiters) ? 
'+' : '-', + list_empty_marker(&cli->cl_loi_ready_list), + list_empty_marker(&cli->cl_loi_hp_ready_list), + list_empty_marker(&cli->cl_loi_write_list), + list_empty_marker(&cli->cl_loi_read_list), + /* 5 */ + list_empty_marker(&obj->oo_ready_item), + list_empty_marker(&obj->oo_hp_ready_item), + list_empty_marker(&obj->oo_write_item), + list_empty_marker(&obj->oo_read_item), + atomic_read(&obj->oo_nr_reads), + list_empty_marker(&obj->oo_reading_exts), + atomic_read(&obj->oo_nr_writes), + list_empty_marker(&obj->oo_hp_exts), + list_empty_marker(&obj->oo_urgent_exts)); +} + +static void osc_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + int rc; + + ENTRY; + CDEBUG(D_TRACE, "%p\n", opg); + osc_page_transfer_put(env, opg); + rc = osc_teardown_async_page(env, obj, opg); + if (rc) { + CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page, + "Trying to teardown failed: %d\n", rc); + LASSERT(0); + } + + osc_lru_del(osc_cli(obj), opg); + + if (slice->cpl_page->cp_type == CPT_CACHEABLE) { + void *value = NULL; + + spin_lock(&obj->oo_tree_lock); + if (opg->ops_intree) { + value = radix_tree_delete(&obj->oo_tree, + osc_index(opg)); + if (value != NULL) { + --obj->oo_npages; + opg->ops_intree = 0; + } + } + spin_unlock(&obj->oo_tree_lock); + + LASSERT(ergo(value != NULL, value == opg)); + } + + EXIT; +} + +static void osc_page_clip(const struct lu_env *env, + const struct cl_page_slice *slice, + int from, int to) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + + opg->ops_from = from; + /* argument @to is exclusive, but @ops_to is inclusive */ + opg->ops_to = to - 1; + /* This isn't really necessary for transient pages, but we also don't + * call clip on transient pages often, so it's OK. 
+ */ + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&oap->oap_lock); +} + +static int osc_page_flush(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct osc_page *opg = cl2osc_page(slice); + int rc = 0; + ENTRY; + rc = osc_flush_async_page(env, io, opg); + RETURN(rc); +} + +static void osc_page_touch(const struct lu_env *env, + const struct cl_page_slice *slice, size_t to) +{ + struct osc_page *opg = cl2osc_page(slice); + struct cl_object *obj = opg->ops_cl.cpl_obj; + + osc_page_touch_at(env, obj, osc_index(opg), to); +} + +static const struct cl_page_operations osc_page_ops = { + .cpo_print = osc_page_print, + .cpo_delete = osc_page_delete, + .cpo_clip = osc_page_clip, + .cpo_flush = osc_page_flush, + .cpo_page_touch = osc_page_touch, +}; + +int osc_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *cl_page, pgoff_t index) +{ + struct osc_object *osc = cl2osc(obj); + struct osc_page *opg = cl_object_page_slice(obj, cl_page); + struct osc_io *oio = osc_env_io(env); + int result; + + opg->ops_from = 0; + opg->ops_to = PAGE_SIZE - 1; + + INIT_LIST_HEAD(&opg->ops_lru); + + result = osc_prep_async_page(osc, opg, cl_page, cl_offset(obj, index)); + if (result != 0) + return result; + + opg->ops_srvlock = osc_io_srvlock(oio); + cl_page_slice_add(cl_page, &opg->ops_cl, obj, &osc_page_ops); + + /* reserve an LRU space for this page */ + if (cl_page->cp_type == CPT_CACHEABLE) { + result = osc_lru_alloc(env, osc_cli(osc), opg); + if (result == 0) { + result = radix_tree_preload(GFP_NOFS); + if (result == 0) { + spin_lock(&osc->oo_tree_lock); + result = radix_tree_insert(&osc->oo_tree, + index, opg); + if (result == 0) { + ++osc->oo_npages; + opg->ops_intree = 1; + } + spin_unlock(&osc->oo_tree_lock); + + radix_tree_preload_end(); + } + } + } + + return result; +} +EXPORT_SYMBOL(osc_page_init); + +/** + * Helper function called by osc_io_submit() for every page in an immediate + * transfer (i.e., transferred synchronously). + */ +void osc_page_submit(const struct lu_env *env, struct osc_page *opg, + enum cl_req_type crt, int brw_flags, ktime_t submit_time) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_async_page *oap = &opg->ops_oap; + + LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, " + "magic 0x%x\n", oap, oap->oap_magic); + LASSERT(oap->oap_async_flags & ASYNC_READY); + LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE); + + oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + oap->oap_page_off = opg->ops_from; + oap->oap_count = opg->ops_to - opg->ops_from + 1; + oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags; + + if (oio->oi_cap_sys_resource) { + oap->oap_brw_flags |= OBD_BRW_SYS_RESOURCE; + oap->oap_cmd |= OBD_BRW_SYS_RESOURCE; + } + + opg->ops_submit_time = submit_time; + osc_page_transfer_get(opg, "transfer\0imm"); + osc_page_transfer_add(env, opg, crt); +} + +/* --------------- LRU page management ------------------ */ + +/* OSC is a natural place to manage LRU pages as applications are specialized + * to write OSC by OSC. Ideally, if one OSC is used more frequently it should + * occupy more LRU slots. On the other hand, we should avoid using up all LRU + * slots (client_obd::cl_lru_left) otherwise process has to be put into sleep + * for free LRU slots - this will be very bad so the algorithm requires each + * OSC to free slots voluntarily to maintain a reasonable number of free slots + * at any time. 
+ */
+
+static DECLARE_WAIT_QUEUE_HEAD(osc_lru_waitq);
+
+/**
+ * LRU pages are freed in batch mode. OSC should at least free this
+ * number of pages to avoid running out of LRU slots.
+ */
+static inline int lru_shrink_min(struct client_obd *cli)
+{
+	return cli->cl_max_pages_per_rpc * 2;
+}
+
+/**
+ * Free at most this number of pages, otherwise the shrink takes too
+ * long to finish.
+ */
+static inline int lru_shrink_max(struct client_obd *cli)
+{
+	return cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight;
+}
+
+/**
+ * Check if we can free LRU slots from this OSC. If there are LRU waiters,
+ * we should free slots aggressively. In this way, slots are freed at a
+ * steady pace to maintain fairness among OSCs.
+ *
+ * Return how many LRU pages should be freed.
+ */
+static int osc_cache_too_much(struct client_obd *cli)
+{
+	struct cl_client_cache *cache = cli->cl_cache;
+	long pages = atomic_long_read(&cli->cl_lru_in_list);
+	unsigned long budget;
+
+	LASSERT(cache != NULL);
+	budget = cache->ccc_lru_max / (atomic_read(&cache->ccc_users) - 2);
+
+	/* if it's going to run out of LRU slots, we should free some, but not
+	 * too much, to maintain fairness among OSCs. */
+	if (atomic_long_read(cli->cl_lru_left) < cache->ccc_lru_max >> 2) {
+		if (pages >= budget)
+			return lru_shrink_max(cli);
+		else if (pages >= budget / 2)
+			return lru_shrink_min(cli);
+	} else {
+		time64_t duration = ktime_get_real_seconds();
+		long timediff;
+
+		/* knock out pages by duration of no IO activity */
+		duration -= cli->cl_lru_last_used;
+		/*
+		 * The difference shouldn't be more than 70 years
+		 * so we can safely cast to a long. Round to
+		 * approximately 1 minute.
+		 */
+		timediff = (long)(duration >> 6);
+		if (timediff > 0 && pages >= budget / timediff)
+			return lru_shrink_min(cli);
+	}
+	return 0;
+}
+
+int lru_queue_work(const struct lu_env *env, void *data)
+{
+	struct client_obd *cli = data;
+	int count;
+
+	CDEBUG(D_CACHE, "%s: run LRU work for client obd\n", cli_name(cli));
+	count = osc_cache_too_much(cli);
+	if (count > 0) {
+		int rc = osc_lru_shrink(env, cli, count, false);
+
+		CDEBUG(D_CACHE, "%s: shrank %d/%d pages from client obd\n",
+		       cli_name(cli), rc, count);
+		if (rc >= count) {
+			CDEBUG(D_CACHE, "%s: queue again\n", cli_name(cli));
+			ptlrpcd_queue_work(cli->cl_lru_work);
+		}
+	}
+
+	RETURN(0);
+}
+
+void osc_lru_add_batch(struct client_obd *cli, struct list_head *plist)
+{
+	LIST_HEAD(lru);
+	struct osc_async_page *oap;
+	long npages = 0;
+
+	list_for_each_entry(oap, plist, oap_pending_item) {
+		struct osc_page *opg = oap2osc_page(oap);
+
+		if (!opg->ops_in_lru)
+			continue;
+
+		++npages;
+		LASSERT(list_empty(&opg->ops_lru));
+		list_add(&opg->ops_lru, &lru);
+	}
+
+	if (npages > 0) {
+		spin_lock(&cli->cl_lru_list_lock);
+		list_splice_tail(&lru, &cli->cl_lru_list);
+		atomic_long_sub(npages, &cli->cl_lru_busy);
+		atomic_long_add(npages, &cli->cl_lru_in_list);
+		cli->cl_lru_last_used = ktime_get_real_seconds();
+		spin_unlock(&cli->cl_lru_list_lock);
+
+		if (waitqueue_active(&osc_lru_waitq))
+			(void)ptlrpcd_queue_work(cli->cl_lru_work);
+	}
+}
+
+static void __osc_lru_del(struct client_obd *cli, struct osc_page *opg)
+{
+	LASSERT(atomic_long_read(&cli->cl_lru_in_list) > 0);
+	list_del_init(&opg->ops_lru);
+	atomic_long_dec(&cli->cl_lru_in_list);
+}
+
+/**
+ * Page is being destroyed. The page may not be on the LRU list if the
+ * transfer never finished (an error occurred).
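+ *
+ * Note that a page is accounted either on cl_lru_list (cl_lru_in_list)
+ * or as busy (cl_lru_busy); both cases are handled below, and freeing a
+ * slot may also kick the LRU work item if this OSC is over its budget.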
+ */ +static void osc_lru_del(struct client_obd *cli, struct osc_page *opg) +{ + if (opg->ops_in_lru) { + spin_lock(&cli->cl_lru_list_lock); + if (!list_empty(&opg->ops_lru)) { + __osc_lru_del(cli, opg); + } else { + LASSERT(atomic_long_read(&cli->cl_lru_busy) > 0); + atomic_long_dec(&cli->cl_lru_busy); + } + spin_unlock(&cli->cl_lru_list_lock); + + atomic_long_inc(cli->cl_lru_left); + /* this is a great place to release more LRU pages if + * this osc occupies too many LRU pages and kernel is + * stealing one of them. */ + if (osc_cache_too_much(cli)) { + CDEBUG(D_CACHE, "%s: queue LRU work\n", cli_name(cli)); + (void)ptlrpcd_queue_work(cli->cl_lru_work); + } + wake_up(&osc_lru_waitq); + } else { + LASSERT(list_empty(&opg->ops_lru)); + } +} + +/** + * Delete page from LRU list for redirty. + */ +static void osc_lru_use(struct client_obd *cli, struct osc_page *opg) +{ + /* If page is being transferred for the first time, + * ops_lru should be empty */ + if (opg->ops_in_lru) { + if (list_empty(&opg->ops_lru)) + return; + spin_lock(&cli->cl_lru_list_lock); + if (!list_empty(&opg->ops_lru)) { + __osc_lru_del(cli, opg); + atomic_long_inc(&cli->cl_lru_busy); + } + spin_unlock(&cli->cl_lru_list_lock); + } +} + +static void discard_pagevec(const struct lu_env *env, struct cl_io *io, + struct cl_page **pvec, int max_index) +{ + struct pagevec *pagevec = &osc_env_info(env)->oti_pagevec; + int i; + + ll_pagevec_init(pagevec, 0); + for (i = 0; i < max_index; i++) { + struct cl_page *page = pvec[i]; + + LASSERT(cl_page_is_owned(page, io)); + cl_page_delete(env, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + cl_pagevec_put(env, page, pagevec); + + pvec[i] = NULL; + } + pagevec_release(pagevec); +} + +/** + * Check if a cl_page can be released, i.e, it's not being used. + * + * If unstable account is turned on, bulk transfer may hold one refcount + * for recovery so we need to check vmpage refcount as well; otherwise, + * even we can destroy cl_page but the corresponding vmpage can't be reused. + */ +static inline bool lru_page_busy(struct client_obd *cli, struct cl_page *page) +{ + if (cl_page_in_use_noref(page)) + return true; + + if (cli->cl_cache->ccc_unstable_check) { + struct page *vmpage = cl_page_vmpage(page); + + /* vmpage have two known users: cl_page and VM page cache */ + if (page_count(vmpage) - page_mapcount(vmpage) > 2) + return true; + } + return false; +} + +/** + * Drop @target of pages from LRU at most. 
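+ *
+ * The scan below is bounded: at most min(target * 2, cl_lru_in_list)
+ * pages are examined per pass, busy pages are rotated to the tail of
+ * the list, and victim pages are collected into a pagevec
+ * (OTI_PVEC_SIZE at a time) so they can be discarded without holding
+ * cl_lru_list_lock.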
+ */ +long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, + long target, bool force) +{ + struct cl_io *io; + struct cl_object *clobj = NULL; + struct cl_page **pvec; + struct osc_page *opg; + long count = 0; + int maxscan = 0; + int index = 0; + int rc = 0; + ENTRY; + + LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0); + if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0) + RETURN(0); + + CDEBUG(D_CACHE, "%s: shrinkers: %d, force: %d\n", + cli_name(cli), atomic_read(&cli->cl_lru_shrinkers), force); + if (!force) { + if (atomic_read(&cli->cl_lru_shrinkers) > 0) + RETURN(-EBUSY); + + if (atomic_inc_return(&cli->cl_lru_shrinkers) > 1) { + atomic_dec(&cli->cl_lru_shrinkers); + RETURN(-EBUSY); + } + } else { + atomic_inc(&cli->cl_lru_shrinkers); + } + + pvec = (struct cl_page **)osc_env_info(env)->oti_pvec; + io = osc_env_thread_io(env); + + spin_lock(&cli->cl_lru_list_lock); + if (force) + cli->cl_lru_reclaim++; + maxscan = min(target << 1, atomic_long_read(&cli->cl_lru_in_list)); + while (!list_empty(&cli->cl_lru_list)) { + struct cl_page *page; + bool will_free = false; + + if (!force && atomic_read(&cli->cl_lru_shrinkers) > 1) + break; + + if (--maxscan < 0) + break; + + opg = list_first_entry(&cli->cl_lru_list, struct osc_page, + ops_lru); + page = opg->ops_cl.cpl_page; + if (lru_page_busy(cli, page)) { + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + continue; + } + + LASSERT(page->cp_obj != NULL); + if (clobj != page->cp_obj) { + struct cl_object *tmp = page->cp_obj; + + cl_object_get(tmp); + spin_unlock(&cli->cl_lru_list_lock); + + if (clobj != NULL) { + discard_pagevec(env, io, pvec, index); + index = 0; + + cl_io_fini(env, io); + cl_object_put(env, clobj); + clobj = NULL; + } + + clobj = tmp; + io->ci_obj = clobj; + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, clobj); + + spin_lock(&cli->cl_lru_list_lock); + + if (rc != 0) + break; + + ++maxscan; + continue; + } + + if (cl_page_own_try(env, io, page) == 0) { + if (!lru_page_busy(cli, page)) { + /* remove it from lru list earlier to avoid + * lock contention */ + __osc_lru_del(cli, opg); + opg->ops_in_lru = 0; /* will be discarded */ + + cl_page_get(page); + will_free = true; + } else { + cl_page_disown(env, io, page); + } + } + + if (!will_free) { + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + continue; + } + + /* Don't discard and free the page with cl_lru_list held */ + pvec[index++] = page; + if (unlikely(index == OTI_PVEC_SIZE)) { + spin_unlock(&cli->cl_lru_list_lock); + discard_pagevec(env, io, pvec, index); + index = 0; + + spin_lock(&cli->cl_lru_list_lock); + } + + if (++count >= target) + break; + } + spin_unlock(&cli->cl_lru_list_lock); + + if (clobj != NULL) { + discard_pagevec(env, io, pvec, index); + + cl_io_fini(env, io); + cl_object_put(env, clobj); + } + + atomic_dec(&cli->cl_lru_shrinkers); + if (count > 0) { + atomic_long_add(count, cli->cl_lru_left); + wake_up(&osc_lru_waitq); + } + RETURN(count > 0 ? count : rc); +} +EXPORT_SYMBOL(osc_lru_shrink); + +/** + * Reclaim LRU pages by an IO thread. The caller wants to reclaim at least + * \@npages of LRU slots. For performance consideration, it's better to drop + * LRU pages in batch. Therefore, the actual number is adjusted at least + * max_pages_per_rpc. 
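+ *
+ * For example (hypothetical values): with max_pages_per_rpc = 256, a
+ * request to reclaim 10 slots is rounded up to 256 so reclaim happens
+ * in RPC-sized batches; whatever this OSC cannot free itself is then
+ * requested from the other OSCs on the shared ccc_lru list.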
+ */ +static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages) +{ + struct lu_env *env; + struct cl_client_cache *cache = cli->cl_cache; + struct client_obd *scan; + int max_scans; + __u16 refcheck; + long rc = 0; + ENTRY; + + LASSERT(cache != NULL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(rc); + + npages = max_t(int, npages, cli->cl_max_pages_per_rpc); + CDEBUG(D_CACHE, "%s: start to reclaim %ld pages from LRU\n", + cli_name(cli), npages); + rc = osc_lru_shrink(env, cli, npages, true); + if (rc >= npages) { + CDEBUG(D_CACHE, "%s: reclaimed %ld/%ld pages from LRU\n", + cli_name(cli), rc, npages); + if (osc_cache_too_much(cli) > 0) + ptlrpcd_queue_work(cli->cl_lru_work); + GOTO(out, rc); + } else if (rc > 0) { + npages -= rc; + } + + CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %ld/%ld, want: %ld\n", + cli_name(cli), cli, atomic_long_read(&cli->cl_lru_in_list), + atomic_long_read(&cli->cl_lru_busy), npages); + + /* Reclaim LRU slots from other client_obd as it can't free enough + * from its own. This should rarely happen. */ + spin_lock(&cache->ccc_lru_lock); + LASSERT(!list_empty(&cache->ccc_lru)); + + cache->ccc_lru_shrinkers++; + list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); + + max_scans = atomic_read(&cache->ccc_users) - 2; + while (--max_scans > 0 && + (scan = list_first_entry_or_null(&cache->ccc_lru, + struct client_obd, + cl_lru_osc)) != NULL) { + CDEBUG(D_CACHE, "%s: cli %p LRU pages: %ld, busy: %ld.\n", + cli_name(scan), scan, + atomic_long_read(&scan->cl_lru_in_list), + atomic_long_read(&scan->cl_lru_busy)); + + list_move_tail(&scan->cl_lru_osc, &cache->ccc_lru); + if (osc_cache_too_much(scan) > 0) { + spin_unlock(&cache->ccc_lru_lock); + + rc = osc_lru_shrink(env, scan, npages, true); + spin_lock(&cache->ccc_lru_lock); + if (rc >= npages) + break; + if (rc > 0) + npages -= rc; + } + } + spin_unlock(&cache->ccc_lru_lock); + +out: + cl_env_put(env, &refcheck); + CDEBUG(D_CACHE, "%s: cli %p freed %ld pages.\n", + cli_name(cli), cli, rc); + return rc; +} + +/** + * osc_lru_alloc() is called to allocate an LRU slot for a cl_page. + * + * Usually the LRU slots are reserved in osc_io_iter_rw_init(). + * Only in the case that the LRU slots are in extreme shortage, it should + * have reserved enough slots for an IO. + */ +static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, + struct osc_page *opg) +{ + struct osc_io *oio = osc_env_io(env); + int rc = 0; + + ENTRY; + + if (cli->cl_cache == NULL) /* shall not be in LRU */ + RETURN(0); + + if (oio->oi_lru_reserved > 0) { + --oio->oi_lru_reserved; + goto out; + } + + LASSERT(atomic_long_read(cli->cl_lru_left) >= 0); + while (!atomic_long_add_unless(cli->cl_lru_left, -1, 0)) { + /* run out of LRU spaces, try to drop some by itself */ + rc = osc_lru_reclaim(cli, 1); + if (rc < 0) + break; + if (rc > 0) + continue; + /* IO issued by readahead, don't try hard */ + if (oio->oi_is_readahead) { + if (atomic_long_read(cli->cl_lru_left) > 0) + continue; + rc = -EBUSY; + break; + } + + cond_resched(); + rc = l_wait_event_abortable( + osc_lru_waitq, + atomic_long_read(cli->cl_lru_left) > 0); + if (rc < 0) { + rc = -EINTR; + break; + } + } + +out: + if (rc >= 0) { + atomic_long_inc(&cli->cl_lru_busy); + opg->ops_in_lru = 1; + rc = 0; + } + + RETURN(rc); +} + +/** + * osc_lru_reserve() is called to reserve enough LRU slots for I/O. + * + * The benefit of doing this is to reduce contention against atomic counter + * cl_lru_left by changing it from per-page access to per-IO access. 
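+ *
+ * The claim below is lock-free: cl_lru_left is read and then updated
+ * with atomic_long_cmpxchg(), retrying if another thread changed the
+ * counter in between, so the common path takes no spinlock at all.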
+ */ +unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages) +{ + unsigned long reserved = 0; + unsigned long max_pages; + unsigned long c; + int rc; + +again: + c = atomic_long_read(cli->cl_lru_left); + if (c < npages && osc_lru_reclaim(cli, npages) > 0) + c = atomic_long_read(cli->cl_lru_left); + + if (c < npages) { + /* + * Trigger writeback in the hope some LRU slot could + * be freed. + */ + rc = ptlrpcd_queue_work(cli->cl_writeback_work); + if (rc) + return 0; + } + + while (c >= npages) { + if (c == atomic_long_cmpxchg(cli->cl_lru_left, c, c - npages)) { + reserved = npages; + break; + } + c = atomic_long_read(cli->cl_lru_left); + } + + if (reserved != npages) { + cond_resched(); + rc = l_wait_event_abortable( + osc_lru_waitq, + atomic_long_read(cli->cl_lru_left) > 0); + goto again; + } + + max_pages = cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight; + if (atomic_long_read(cli->cl_lru_left) < max_pages) { + /* If there aren't enough pages in the per-OSC LRU then + * wake up the LRU thread to try and clear out space, so + * we don't block if pages are being dirtied quickly. */ + CDEBUG(D_CACHE, "%s: queue LRU, left: %lu/%ld.\n", + cli_name(cli), atomic_long_read(cli->cl_lru_left), + max_pages); + (void)ptlrpcd_queue_work(cli->cl_lru_work); + } + + return reserved; +} + +/** + * osc_lru_unreserve() is called to unreserve LRU slots. + * + * LRU slots reserved by osc_lru_reserve() may have entries left due to several + * reasons such as page already existing or I/O error. Those reserved slots + * should be freed by calling this function. + */ +void osc_lru_unreserve(struct client_obd *cli, unsigned long npages) +{ + atomic_long_add(npages, cli->cl_lru_left); + wake_up(&osc_lru_waitq); +} + +/** + * Atomic operations are expensive. We accumulate the accounting for the + * same page zone to get better performance. + * In practice this can work pretty good because the pages in the same RPC + * are likely from the same page zone. + */ +#ifdef HAVE_NR_UNSTABLE_NFS +/* Old kernels use a separate counter for unstable pages, + * newer kernels treat them like any other writeback. + */ +#define NR_WRITEBACK NR_UNSTABLE_NFS +#endif + +static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa, + int factor) +{ + int page_count; + void *zone = NULL; + int count = 0; + int i; + + if (desc != NULL) { + page_count = desc->bd_iov_count; + } else { + page_count = aa->aa_page_count; + } + + for (i = 0; i < page_count; i++) { + void *pz; + if (desc) + pz = page_zone(desc->bd_vec[i].bv_page); + else + pz = page_zone(aa->aa_ppga[i]->pg); + + if (likely(pz == zone)) { + ++count; + continue; + } + + if (count > 0) { + mod_zone_page_state(zone, NR_WRITEBACK, + factor * count); + count = 0; + } + zone = pz; + ++count; + } + if (count > 0) + mod_zone_page_state(zone, NR_WRITEBACK, factor * count); +} + +static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa) +{ + unstable_page_accounting(desc, aa, 1); +} + +static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa) +{ + unstable_page_accounting(desc, aa, -1); +} + +/** + * Performs "unstable" page accounting. This function balances the + * increment operations performed in osc_inc_unstable_pages. It is + * registered as the RPC request callback, and is executed when the + * bulk RPC is committed on the server. 
Thus at this point, the pages + * involved in the bulk transfer are no longer considered unstable. + * + * If this function is called, the request should have been committed + * or req:rq_unstable must have been set; it implies that the unstable + * statistic have been added. + */ +void osc_dec_unstable_pages(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + int page_count; + long unstable_count; + + if (desc) + page_count = desc->bd_iov_count; + else + page_count = aa->aa_page_count; + + LASSERT(page_count >= 0); + + dec_unstable_page_accounting(desc, aa); + + unstable_count = atomic_long_sub_return(page_count, + &cli->cl_unstable_count); + LASSERT(unstable_count >= 0); + + unstable_count = atomic_long_sub_return(page_count, + &cli->cl_cache->ccc_unstable_nr); + LASSERT(unstable_count >= 0); + if (unstable_count == 0) + wake_up(&cli->cl_cache->ccc_unstable_waitq); + + if (waitqueue_active(&osc_lru_waitq)) + (void)ptlrpcd_queue_work(cli->cl_lru_work); +} + +/** + * "unstable" page accounting. See: osc_dec_unstable_pages. + */ +void osc_inc_unstable_pages(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + long page_count; + + /* No unstable page tracking */ + if (cli->cl_cache == NULL || !cli->cl_cache->ccc_unstable_check) + return; + + if (desc) + page_count = desc->bd_iov_count; + else + page_count = aa->aa_page_count; + + add_unstable_page_accounting(desc, aa); + atomic_long_add(page_count, &cli->cl_unstable_count); + atomic_long_add(page_count, &cli->cl_cache->ccc_unstable_nr); + + /* If the request has already been committed (i.e. brw_commit + * called via rq_commit_cb), we need to undo the unstable page + * increments we just performed because rq_commit_cb wont be + * called again. */ + spin_lock(&req->rq_lock); + if (unlikely(req->rq_committed)) { + spin_unlock(&req->rq_lock); + + osc_dec_unstable_pages(req); + } else { + req->rq_unstable = 1; + spin_unlock(&req->rq_lock); + } +} + +/** + * Check if it piggybacks SOFT_SYNC flag to OST from this OSC. + * This function will be called by every BRW RPC so it's critical + * to make this function fast. + */ +bool osc_over_unstable_soft_limit(struct client_obd *cli) +{ + long unstable_nr, osc_unstable_count; + + /* Can't check cli->cl_unstable_count, therefore, no soft limit */ + if (cli->cl_cache == NULL || !cli->cl_cache->ccc_unstable_check) + return false; + + osc_unstable_count = atomic_long_read(&cli->cl_unstable_count); + unstable_nr = atomic_long_read(&cli->cl_cache->ccc_unstable_nr); + + CDEBUG(D_CACHE, + "%s: cli: %p unstable pages: %lu, osc unstable pages: %lu\n", + cli_name(cli), cli, unstable_nr, osc_unstable_count); + + /* If the LRU slots are in shortage - 25% remaining AND this OSC + * has one full RPC window of unstable pages, it's a good chance + * to piggyback a SOFT_SYNC flag. + * Please notice that the OST won't take immediate response for the + * SOFT_SYNC request so active OSCs will have more chance to carry + * the flag, this is reasonable. 
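+ *
+ * Concretely (illustrative defaults): with cl_max_pages_per_rpc = 256
+ * and cl_max_rpcs_in_flight = 8, SOFT_SYNC is piggybacked once this
+ * OSC holds more than 256 * 8 = 2048 unstable pages while the global
+ * unstable count exceeds a quarter of ccc_lru_max.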
*/ + return unstable_nr > cli->cl_cache->ccc_lru_max >> 2 && + osc_unstable_count > cli->cl_max_pages_per_rpc * + cli->cl_max_rpcs_in_flight; +} + +/** + * Return how many LRU pages in the cache of all OSC devices + * + * \retval return # of cached LRU pages times reclaimation tendency + * \retval SHRINK_STOP if it cannot do any scanning in this time + */ +unsigned long osc_cache_shrink_count(struct shrinker *sk, + struct shrink_control *sc) +{ + struct client_obd *cli; + unsigned long cached = 0; + + spin_lock(&osc_shrink_lock); + list_for_each_entry(cli, &osc_shrink_list, cl_shrink_list) + cached += atomic_long_read(&cli->cl_lru_in_list); + spin_unlock(&osc_shrink_lock); + + return (cached * sysctl_vfs_cache_pressure) / 100; +} + +/** + * Scan and try to reclaim sc->nr_to_scan cached LRU pages + * + * \retval number of cached LRU pages reclaimed + * \retval SHRINK_STOP if it cannot do any scanning in this time + * + * Linux kernel will loop calling this shrinker scan routine with + * sc->nr_to_scan = SHRINK_BATCH(128 for now) until kernel got enough memory. + * + * If sc->nr_to_scan is 0, the VM is querying the cache size, we don't need + * to scan and try to reclaim LRU pages, just return 0 and + * osc_cache_shrink_count() will report the LRU page number. + */ +unsigned long osc_cache_shrink_scan(struct shrinker *sk, + struct shrink_control *sc) +{ + struct client_obd *cli; + struct client_obd *stop_anchor = NULL; + struct lu_env *env; + long shrank = 0; + int rc; + __u16 refcheck; + + if (sc->nr_to_scan == 0) + return 0; + + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return SHRINK_STOP; + + spin_lock(&osc_shrink_lock); + while ((cli = list_first_entry_or_null(&osc_shrink_list, + struct client_obd, + cl_shrink_list)) != NULL) { + if (stop_anchor == NULL) + stop_anchor = cli; + else if (cli == stop_anchor) + break; + + list_move_tail(&cli->cl_shrink_list, &osc_shrink_list); + spin_unlock(&osc_shrink_lock); + + /* shrink no more than max_pages_per_rpc for an OSC */ + rc = osc_lru_shrink(env, cli, (sc->nr_to_scan - shrank) > + cli->cl_max_pages_per_rpc ? + cli->cl_max_pages_per_rpc : + sc->nr_to_scan - shrank, true); + if (rc > 0) + shrank += rc; + + if (shrank >= sc->nr_to_scan) + goto out; + + spin_lock(&osc_shrink_lock); + } + spin_unlock(&osc_shrink_lock); + +out: + cl_env_put(env, &refcheck); + + return shrank; +} + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_quota.c b/drivers/staging/lustrefsx/lustre/osc/osc_quota.c new file mode 100644 index 0000000000000..0f0795274593c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_quota.c @@ -0,0 +1,321 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2017, Intel Corporation. + * + * Code originally extracted from quota directory + */ + +#include +#include + +#include "osc_internal.h" + +static inline struct osc_quota_info *osc_oqi_alloc(u32 id) +{ + struct osc_quota_info *oqi; + + OBD_SLAB_ALLOC_PTR(oqi, osc_quota_kmem); + if (oqi != NULL) + oqi->oqi_id = id; + + return oqi; +} + +int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]) +{ + int type; + ENTRY; + + for (type = 0; type < LL_MAXQUOTAS; type++) { + struct osc_quota_info *oqi; + + oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]); + if (oqi) { + /* do not try to access oqi here, it could have been + * freed by osc_quota_setdq() */ + + /* the slot is busy, the user is about to run out of + * quota space on this OST */ + CDEBUG(D_QUOTA, "chkdq found noquota for %s %d\n", + type == USRQUOTA ? "user" : "grout", qid[type]); + RETURN(-EDQUOT); + } + } + + RETURN(0); +} + +static inline u32 md_quota_flag(int qtype) +{ + switch (qtype) { + case USRQUOTA: + return OBD_MD_FLUSRQUOTA; + case GRPQUOTA: + return OBD_MD_FLGRPQUOTA; + case PRJQUOTA: + return OBD_MD_FLPRJQUOTA; + default: + return 0; + } +} + +static inline u32 fl_quota_flag(int qtype) +{ + switch (qtype) { + case USRQUOTA: + return OBD_FL_NO_USRQUOTA; + case GRPQUOTA: + return OBD_FL_NO_GRPQUOTA; + case PRJQUOTA: + return OBD_FL_NO_PRJQUOTA; + default: + return 0; + } +} + +int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[], + u64 valid, u32 flags) +{ + int type; + int rc = 0; + + ENTRY; + + if ((valid & (OBD_MD_FLALLQUOTA)) == 0) + RETURN(0); + + mutex_lock(&cli->cl_quota_mutex); + cli->cl_root_squash = !!(flags & OBD_FL_ROOT_SQUASH); + /* still mark the quots is running out for the old request, because it + * could be processed after the new request at OST, the side effect is + * the following request will be processed synchronously, but it will + * not break the quota enforcement. */ + if (cli->cl_quota_last_xid > xid && !(flags & OBD_FL_NO_QUOTA_ALL)) + GOTO(out_unlock, rc); + + if (cli->cl_quota_last_xid < xid) + cli->cl_quota_last_xid = xid; + + for (type = 0; type < LL_MAXQUOTAS; type++) { + struct osc_quota_info *oqi; + + if ((valid & md_quota_flag(type)) == 0) + continue; + + /* lookup the ID in the per-type hash table */ + oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]); + if ((flags & fl_quota_flag(type)) != 0) { + /* This ID is getting close to its quota limit, let's + * switch to sync I/O */ + if (oqi != NULL) + continue; + + oqi = osc_oqi_alloc(qid[type]); + if (oqi == NULL) { + rc = -ENOMEM; + break; + } + + rc = cfs_hash_add_unique(cli->cl_quota_hash[type], + &qid[type], &oqi->oqi_hash); + /* race with others? 
 */
+			if (rc == -EALREADY) {
+				rc = 0;
+				OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+			}
+
+			CDEBUG(D_QUOTA, "%s: setdq to insert for %s %d (%d)\n",
+			       cli_name(cli), qtype_name(type), qid[type], rc);
+		} else {
+			/* This ID is now off the hook, let's remove it from
+			 * the hash table */
+			if (oqi == NULL)
+				continue;
+
+			oqi = cfs_hash_del_key(cli->cl_quota_hash[type],
+					       &qid[type]);
+			if (oqi)
+				OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+
+			CDEBUG(D_QUOTA, "%s: setdq to remove for %s %d (%p)\n",
+			       cli_name(cli), qtype_name(type), qid[type], oqi);
+		}
+	}
+
+out_unlock:
+	mutex_unlock(&cli->cl_quota_mutex);
+	RETURN(rc);
+}
+
+/*
+ * Hash operations for uid/gid <-> osc_quota_info
+ */
+static unsigned
+oqi_hashfn(struct cfs_hash *hs, const void *key, unsigned mask)
+{
+	return cfs_hash_u32_hash(*((__u32*)key), mask);
+}
+
+static int
+oqi_keycmp(const void *key, struct hlist_node *hnode)
+{
+	struct osc_quota_info *oqi;
+	u32 uid;
+
+	LASSERT(key != NULL);
+	uid = *((u32 *)key);
+	oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+	return uid == oqi->oqi_id;
+}
+
+static void *
+oqi_key(struct hlist_node *hnode)
+{
+	struct osc_quota_info *oqi;
+	oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+	return &oqi->oqi_id;
+}
+
+static void *
+oqi_object(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+}
+
+static void
+oqi_get(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+}
+
+static void
+oqi_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+}
+
+static void
+oqi_exit(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+	struct osc_quota_info *oqi;
+
+	oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+	OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+}
+
+#define HASH_QUOTA_BKT_BITS 5
+#define HASH_QUOTA_CUR_BITS 5
+#define HASH_QUOTA_MAX_BITS 15
+
+static struct cfs_hash_ops quota_hash_ops = {
+	.hs_hash	= oqi_hashfn,
+	.hs_keycmp	= oqi_keycmp,
+	.hs_key		= oqi_key,
+	.hs_object	= oqi_object,
+	.hs_get		= oqi_get,
+	.hs_put_locked	= oqi_put_locked,
+	.hs_exit	= oqi_exit,
+};
+
+int osc_quota_setup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int i, type;
+	ENTRY;
+
+	mutex_init(&cli->cl_quota_mutex);
+
+	for (type = 0; type < LL_MAXQUOTAS; type++) {
+		cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH",
+							   HASH_QUOTA_CUR_BITS,
+							   HASH_QUOTA_MAX_BITS,
+							   HASH_QUOTA_BKT_BITS,
+							   0,
+							   CFS_HASH_MIN_THETA,
+							   CFS_HASH_MAX_THETA,
+							   &quota_hash_ops,
+							   CFS_HASH_DEFAULT);
+		if (cli->cl_quota_hash[type] == NULL)
+			break;
+	}
+
+	if (type == LL_MAXQUOTAS)
+		RETURN(0);
+
+	for (i = 0; i < type; i++)
+		cfs_hash_putref(cli->cl_quota_hash[i]);
+
+	RETURN(-ENOMEM);
+}
+
+int osc_quota_cleanup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int type;
+	ENTRY;
+
+	for (type = 0; type < LL_MAXQUOTAS; type++)
+		cfs_hash_putref(cli->cl_quota_hash[type]);
+
+	RETURN(0);
+}
+
+int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
+		 struct obd_quotactl *oqctl)
+{
+	struct ptlrpc_request *req;
+	struct obd_quotactl *oqc;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_OST_QUOTACTL, LUSTRE_OST_VERSION,
+					OST_QUOTACTL);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+	*oqc = *oqctl;
+
+	ptlrpc_request_set_replen(req);
+	ptlrpc_at_set_req_timeout(req);
+	req->rq_no_resend = 1;
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
+
+	if (req->rq_repmsg &&
(oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) { + *oqctl = *oqc; + } else if (!rc) { + CERROR ("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + ptlrpc_req_finished(req); + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_request.c b/drivers/staging/lustrefsx/lustre/osc/osc_request.c new file mode 100644 index 0000000000000..59607e7b19bce --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_request.c @@ -0,0 +1,3942 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "osc_internal.h" +#include + +atomic_t osc_pool_req_count; +unsigned int osc_reqpool_maxreqcount; +struct ptlrpc_request_pool *osc_rq_pool; + +/* max memory used for request pool, unit is MB */ +static unsigned int osc_reqpool_mem_max = 5; +module_param(osc_reqpool_mem_max, uint, 0444); + +static int osc_idle_timeout = 20; +module_param(osc_idle_timeout, uint, 0644); + +#define osc_grant_args osc_brw_async_args + +struct osc_setattr_args { + struct obdo *sa_oa; + obd_enqueue_update_f sa_upcall; + void *sa_cookie; +}; + +struct osc_fsync_args { + struct osc_object *fa_obj; + struct obdo *fa_oa; + obd_enqueue_update_f fa_upcall; + void *fa_cookie; +}; + +struct osc_ladvise_args { + struct obdo *la_oa; + obd_enqueue_update_f la_upcall; + void *la_cookie; +}; + +static void osc_release_ppga(struct brw_page **ppga, size_t count); +static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *data, int rc); + +void osc_pack_req_body(struct ptlrpc_request *req, struct obdo *oa) +{ + struct ost_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); +} + +static int osc_getattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + + ENTRY; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oa); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if 
(body == NULL) + GOTO(out, rc = -EPROTO); + + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); + + oa->o_blksize = cli_brw_size(exp->exp_obd); + oa->o_valid |= OBD_MD_FLBLKSZ; + + EXIT; +out: + ptlrpc_req_finished(req); + + return rc; +} + +static int osc_setattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + + ENTRY; + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oa); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); + + EXIT; +out: + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static int osc_setattr_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct osc_setattr_args *sa = args; + struct ost_body *body; + + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa, + &body->oa); +out: + rc = sa->sa_upcall(sa->sa_cookie, rc); + RETURN(rc); +} + +int osc_setattr_async(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + int rc; + + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oa); + + ptlrpc_request_set_replen(req); + + /* do mds to ost setattr asynchronously */ + if (!rqset) { + /* Do not wait for response. */ + ptlrpcd_add_req(req); + } else { + req->rq_interpret_reply = osc_setattr_interpret; + + sa = ptlrpc_req_async_args(sa, req); + sa->sa_oa = oa; + sa->sa_upcall = upcall; + sa->sa_cookie = cookie; + + ptlrpc_set_add_req(rqset, req); + } + + RETURN(0); +} + +static int osc_ladvise_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc) +{ + struct osc_ladvise_args *la = arg; + struct ost_body *body; + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + *la->la_oa = body->oa; +out: + rc = la->la_upcall(la->la_cookie, rc); + RETURN(rc); +} + +/** + * If rqset is NULL, do not wait for response. 
Upcall and cookie could also + * be NULL in this case + */ +int osc_ladvise_base(struct obd_export *exp, struct obdo *oa, + struct ladvise_hdr *ladvise_hdr, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_ladvise_args *la; + int rc; + struct lu_ladvise *req_ladvise; + struct lu_ladvise *ladvise = ladvise_hdr->lah_advise; + int num_advise = ladvise_hdr->lah_count; + struct ladvise_hdr *req_ladvise_hdr; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_LADVISE); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_OST_LADVISE, RCL_CLIENT, + num_advise * sizeof(*ladvise)); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_LADVISE); + if (rc != 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req->rq_request_portal = OST_IO_PORTAL; + ptlrpc_at_set_req_timeout(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oa); + + req_ladvise_hdr = req_capsule_client_get(&req->rq_pill, + &RMF_OST_LADVISE_HDR); + memcpy(req_ladvise_hdr, ladvise_hdr, sizeof(*ladvise_hdr)); + + req_ladvise = req_capsule_client_get(&req->rq_pill, &RMF_OST_LADVISE); + memcpy(req_ladvise, ladvise, sizeof(*ladvise) * num_advise); + ptlrpc_request_set_replen(req); + + if (rqset == NULL) { + /* Do not wait for response. */ + ptlrpcd_add_req(req); + RETURN(0); + } + + req->rq_interpret_reply = osc_ladvise_interpret; + la = ptlrpc_req_async_args(la, req); + la->la_oa = oa; + la->la_upcall = upcall; + la->la_cookie = cookie; + + ptlrpc_set_add_req(rqset, req); + + RETURN(0); +} + +static int osc_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + ENTRY; + + LASSERT(oa != NULL); + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + LASSERT(fid_seq_is_echo(ostid_seq(&oa->o_oi))); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE); + if (rc) { + ptlrpc_request_free(req); + GOTO(out, rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out_req, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out_req, rc = -EPROTO); + + CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); + + oa->o_blksize = cli_brw_size(exp->exp_obd); + oa->o_valid |= OBD_MD_FLBLKSZ; + + CDEBUG(D_HA, "transno: %lld\n", + lustre_msg_get_transno(req->rq_repmsg)); +out_req: + ptlrpc_req_finished(req); +out: + RETURN(rc); +} + +int osc_punch_send(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + struct obd_import *imp = class_exp2cliimp(exp); + struct ost_body *body; + int rc; + + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_OST_PUNCH); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_set_io_portal(req); + + ptlrpc_at_set_req_timeout(req); + + body = 
req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + + lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = osc_setattr_interpret; + sa = ptlrpc_req_async_args(sa, req); + sa->sa_oa = oa; + sa->sa_upcall = upcall; + sa->sa_cookie = cookie; + + ptlrpcd_add_req(req); + + RETURN(0); +} +EXPORT_SYMBOL(osc_punch_send); + +/** + * osc_fallocate_base() - Handles fallocate request. + * + * @exp: Export structure + * @oa: Attributes passed to OSS from client (obdo structure) + * @upcall: Primary & supplementary group information + * @cookie: Exclusive identifier + * @rqset: Request list. + * @mode: Operation done on given range. + * + * osc_fallocate_base() - Handles fallocate requests only. Only block + * allocation or standard preallocate operation is supported currently. + * Other mode flags is not supported yet. ftruncate(2) or truncate(2) + * is supported via SETATTR request. + * + * Return: Non-zero on failure and O on success. + */ +int osc_fallocate_base(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, int mode) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + struct ost_body *body; + struct obd_import *imp = class_exp2cliimp(exp); + int rc; + ENTRY; + + oa->o_falloc_mode = mode; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_OST_FALLOCATE); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_FALLOCATE); + if (rc != 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + + lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = osc_setattr_interpret; + BUILD_BUG_ON(sizeof(*sa) > sizeof(req->rq_async_args)); + sa = ptlrpc_req_async_args(sa, req); + sa->sa_oa = oa; + sa->sa_upcall = upcall; + sa->sa_cookie = cookie; + + ptlrpcd_add_req(req); + + RETURN(0); +} +EXPORT_SYMBOL(osc_fallocate_base); + +static int osc_sync_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct osc_fsync_args *fa = args; + struct ost_body *body; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned long valid = 0; + struct cl_object *obj; + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + CERROR("can't unpack ost_body\n"); + GOTO(out, rc = -EPROTO); + } + + *fa->fa_oa = body->oa; + obj = osc2cl(fa->fa_obj); + + /* Update osc object's blocks attribute */ + cl_object_attr_lock(obj); + if (body->oa.o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = body->oa.o_blocks; + valid |= CAT_BLOCKS; + } + + if (valid != 0) + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + +out: + rc = fa->fa_upcall(fa->fa_cookie, rc); + RETURN(rc); +} + +int osc_sync_base(struct osc_object *obj, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct obd_export *exp = osc_export(obj); + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_fsync_args *fa; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* overload the size and blocks fields in the oa with start/end */ + body = 
req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = osc_sync_interpret; + + fa = ptlrpc_req_async_args(fa, req); + fa->fa_obj = obj; + fa->fa_oa = oa; + fa->fa_upcall = upcall; + fa->fa_cookie = cookie; + + ptlrpc_set_add_req(rqset, req); + + RETURN (0); +} + +/* Find and cancel locally locks matched by @mode in the resource found by + * @objid. Found locks are added into @cancel list. Returns the amount of + * locks added to @cancels list. */ +static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, + struct list_head *cancels, + enum ldlm_mode mode, __u64 lock_flags) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + struct ldlm_res_id res_id; + struct ldlm_resource *res; + int count; + ENTRY; + + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. */ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + RETURN(0); + + ostid_build_res_name(&oa->o_oi, &res_id); + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if (IS_ERR(res)) + RETURN(0); + + LDLM_RESOURCE_ADDREF(res); + count = ldlm_cancel_resource_local(res, cancels, NULL, mode, + lock_flags, 0, NULL); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(count); +} + +static int osc_destroy_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + + atomic_dec(&cli->cl_destroy_in_flight); + wake_up(&cli->cl_destroy_waitq); + + return 0; +} + +static int osc_can_send_destroy(struct client_obd *cli) +{ + if (atomic_inc_return(&cli->cl_destroy_in_flight) <= + cli->cl_max_rpcs_in_flight) { + /* The destroy request can be sent */ + return 1; + } + if (atomic_dec_return(&cli->cl_destroy_in_flight) < + cli->cl_max_rpcs_in_flight) { + /* + * The counter has been modified between the two atomic + * operations. 
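+		 * Another sender finished and dropped the count back below
+		 * the limit between our increment and decrement, so a waiter
+		 * in osc_destroy() may now be able to proceed; wake it up.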
+ */ + wake_up(&cli->cl_destroy_waitq); + } + return 0; +} + +static int osc_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct ptlrpc_request *req; + struct ost_body *body; + LIST_HEAD(cancels); + int rc, count; + ENTRY; + + if (!oa) { + CDEBUG(D_INFO, "oa NULL\n"); + RETURN(-EINVAL); + } + + count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW, + LDLM_FL_DISCARD_DATA); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY, + 0, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = osc_destroy_interpret; + if (!osc_can_send_destroy(cli)) { + /* + * Wait until the number of on-going destroy RPCs drops + * under max_rpc_in_flight + */ + rc = l_wait_event_abortable_exclusive( + cli->cl_destroy_waitq, + osc_can_send_destroy(cli)); + if (rc) { + ptlrpc_req_finished(req); + RETURN(-EINTR); + } + } + + /* Do not wait for response */ + ptlrpcd_add_req(req); + RETURN(0); +} + +static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, + long writing_bytes) +{ + u64 bits = OBD_MD_FLBLOCKS | OBD_MD_FLGRANT; + + LASSERT(!(oa->o_valid & bits)); + + oa->o_valid |= bits; + spin_lock(&cli->cl_loi_list_lock); + if (cli->cl_ocd_grant_param) + oa->o_dirty = cli->cl_dirty_grant; + else + oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT; + if (unlikely(cli->cl_dirty_pages > cli->cl_dirty_max_pages)) { + CERROR("dirty %lu > dirty_max %lu\n", + cli->cl_dirty_pages, + cli->cl_dirty_max_pages); + oa->o_undirty = 0; + } else if (unlikely(atomic_long_read(&obd_dirty_pages) > + (long)(obd_max_dirty_pages + 1))) { + /* The atomic_read() allowing the atomic_inc() are + * not covered by a lock thus they may safely race and trip + * this CERROR() unless we add in a small fudge factor (+1). */ + CERROR("%s: dirty %ld > system dirty_max %ld\n", + cli_name(cli), atomic_long_read(&obd_dirty_pages), + obd_max_dirty_pages); + oa->o_undirty = 0; + } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages > + 0x7fffffff)) { + CERROR("dirty %lu - dirty_max %lu too big???\n", + cli->cl_dirty_pages, cli->cl_dirty_max_pages); + oa->o_undirty = 0; + } else { + unsigned long nrpages; + unsigned long undirty; + + nrpages = cli->cl_max_pages_per_rpc; + nrpages *= cli->cl_max_rpcs_in_flight + 1; + nrpages = max(nrpages, cli->cl_dirty_max_pages); + undirty = nrpages << PAGE_SHIFT; + if (cli->cl_ocd_grant_param) { + int nrextents; + + /* take extent tax into account when asking for more + * grant space */ + nrextents = (nrpages + cli->cl_max_extent_pages - 1) / + cli->cl_max_extent_pages; + undirty += nrextents * cli->cl_grant_extent_tax; + } + /* Do not ask for more than OBD_MAX_GRANT - a margin for server + * to add extent tax, etc. 
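+		 *
+		 * Worked example (hypothetical values): with 256 pages per
+		 * RPC, 8 RPCs in flight and 4 KiB pages, nrpages is at least
+		 * 256 * 9 = 2304, i.e. a base ask of 9 MiB; with GRANT_PARAM
+		 * one cl_grant_extent_tax is added per cl_max_extent_pages
+		 * worth of pages, and the sum is still capped below
+		 * OBD_MAX_GRANT.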
+ */ + oa->o_undirty = min(undirty, OBD_MAX_GRANT & + ~(PTLRPC_MAX_BRW_SIZE * 4UL)); + } + oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; + /* o_dropped AKA o_misc is 32 bits, but cl_lost_grant is 64 bits */ + if (cli->cl_lost_grant > INT_MAX) { + CDEBUG(D_CACHE, + "%s: avoided o_dropped overflow: cl_lost_grant %lu\n", + cli_name(cli), cli->cl_lost_grant); + oa->o_dropped = INT_MAX; + } else { + oa->o_dropped = cli->cl_lost_grant; + } + cli->cl_lost_grant -= oa->o_dropped; + spin_unlock(&cli->cl_loi_list_lock); + CDEBUG(D_CACHE, "%s: dirty: %llu undirty: %u dropped %u grant: %llu" + " cl_lost_grant %lu\n", cli_name(cli), oa->o_dirty, + oa->o_undirty, oa->o_dropped, oa->o_grant, cli->cl_lost_grant); +} + +void osc_update_next_shrink(struct client_obd *cli) +{ + cli->cl_next_shrink_grant = ktime_get_seconds() + + cli->cl_grant_shrink_interval; + + CDEBUG(D_CACHE, "next time %lld to shrink grant\n", + cli->cl_next_shrink_grant); +} +EXPORT_SYMBOL(osc_update_next_shrink); + +static void __osc_update_grant(struct client_obd *cli, u64 grant) +{ + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant += grant; + spin_unlock(&cli->cl_loi_list_lock); +} + +static void osc_update_grant(struct client_obd *cli, struct ost_body *body) +{ + if (body->oa.o_valid & OBD_MD_FLGRANT) { + CDEBUG(D_CACHE, "got %llu extra grant\n", body->oa.o_grant); + __osc_update_grant(cli, body->oa.o_grant); + } +} + +/** + * grant thread data for shrinking space. + */ +struct grant_thread_data { + struct list_head gtd_clients; + struct mutex gtd_mutex; + unsigned long gtd_stopped:1; +}; +static struct grant_thread_data client_gtd; + +static int osc_shrink_grant_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + struct osc_grant_args *aa = args; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct ost_body *body; + + if (rc != 0) { + __osc_update_grant(cli, aa->aa_oa->o_grant); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + osc_update_grant(cli, body); +out: + OBD_SLAB_FREE_PTR(aa->aa_oa, osc_obdo_kmem); + aa->aa_oa = NULL; + + return rc; +} + +static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa) +{ + spin_lock(&cli->cl_loi_list_lock); + oa->o_grant = cli->cl_avail_grant / 4; + cli->cl_avail_grant -= oa->o_grant; + spin_unlock(&cli->cl_loi_list_lock); + if (!(oa->o_valid & OBD_MD_FLFLAGS)) { + oa->o_valid |= OBD_MD_FLFLAGS; + oa->o_flags = 0; + } + oa->o_flags |= OBD_FL_SHRINK_GRANT; + osc_update_next_shrink(cli); +} + +/* Shrink the current grant, either from some large amount to enough for a + * full set of in-flight RPCs, or if we have already shrunk to that limit + * then to enough for a single RPC. This avoids keeping more grant than + * needed, and avoids shrinking the grant piecemeal. 
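+ *
+ * E.g. (made-up numbers): with 8 RPCs in flight and 1 MiB RPCs, the
+ * first shrink targets (8 + 1) * 1 MiB = 9 MiB of grant; once available
+ * grant is already at or below that, the next shrink targets a single
+ * RPC worth (1 MiB).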
*/ +static int osc_shrink_grant(struct client_obd *cli) +{ + __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) * + (cli->cl_max_pages_per_rpc << PAGE_SHIFT); + + spin_lock(&cli->cl_loi_list_lock); + if (cli->cl_avail_grant <= target_bytes) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT; + spin_unlock(&cli->cl_loi_list_lock); + + return osc_shrink_grant_to_target(cli, target_bytes); +} + +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) +{ + int rc = 0; + struct ost_body *body; + ENTRY; + + spin_lock(&cli->cl_loi_list_lock); + /* Don't shrink if we are already above or below the desired limit + * We don't want to shrink below a single RPC, as that will negatively + * impact block allocation and long-term performance. */ + if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_SHIFT) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT; + + if (target_bytes >= cli->cl_avail_grant) { + spin_unlock(&cli->cl_loi_list_lock); + RETURN(0); + } + spin_unlock(&cli->cl_loi_list_lock); + + OBD_ALLOC_PTR(body); + if (!body) + RETURN(-ENOMEM); + + osc_announce_cached(cli, &body->oa, 0); + + spin_lock(&cli->cl_loi_list_lock); + if (target_bytes >= cli->cl_avail_grant) { + /* available grant has changed since target calculation */ + spin_unlock(&cli->cl_loi_list_lock); + GOTO(out_free, rc = 0); + } + body->oa.o_grant = cli->cl_avail_grant - target_bytes; + cli->cl_avail_grant = target_bytes; + spin_unlock(&cli->cl_loi_list_lock); + if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_SHRINK_GRANT; + osc_update_next_shrink(cli); + + rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export, + sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK, + sizeof(*body), body, NULL); + if (rc != 0) + __osc_update_grant(cli, body->oa.o_grant); +out_free: + OBD_FREE_PTR(body); + RETURN(rc); +} + +static int osc_should_shrink_grant(struct client_obd *client) +{ + time64_t next_shrink = client->cl_next_shrink_grant; + + if (client->cl_import == NULL) + return 0; + + if (!OCD_HAS_FLAG(&client->cl_import->imp_connect_data, GRANT_SHRINK) || + client->cl_import->imp_grant_shrink_disabled) { + osc_update_next_shrink(client); + return 0; + } + + if (ktime_get_seconds() >= next_shrink - 5) { + /* Get the current RPC size directly, instead of going via: + * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export) + * Keep comment here so that it can be found by searching. 
		 */
+		int brw_size = client->cl_max_pages_per_rpc << PAGE_SHIFT;
+
+		if (client->cl_import->imp_state == LUSTRE_IMP_FULL &&
+		    client->cl_avail_grant > brw_size)
+			return 1;
+		else
+			osc_update_next_shrink(client);
+	}
+	return 0;
+}
+
+#define GRANT_SHRINK_RPC_BATCH 100
+
+static struct delayed_work work;
+
+static void osc_grant_work_handler(struct work_struct *data)
+{
+	struct client_obd *cli;
+	int rpc_sent;
+	bool init_next_shrink = true;
+	time64_t next_shrink = ktime_get_seconds() + GRANT_SHRINK_INTERVAL;
+
+	rpc_sent = 0;
+	mutex_lock(&client_gtd.gtd_mutex);
+	list_for_each_entry(cli, &client_gtd.gtd_clients,
+			    cl_grant_chain) {
+		if (rpc_sent < GRANT_SHRINK_RPC_BATCH &&
+		    osc_should_shrink_grant(cli)) {
+			osc_shrink_grant(cli);
+			rpc_sent++;
+		}
+
+		if (!init_next_shrink) {
+			if (cli->cl_next_shrink_grant < next_shrink &&
+			    cli->cl_next_shrink_grant > ktime_get_seconds())
+				next_shrink = cli->cl_next_shrink_grant;
+		} else {
+			init_next_shrink = false;
+			next_shrink = cli->cl_next_shrink_grant;
+		}
+	}
+	mutex_unlock(&client_gtd.gtd_mutex);
+
+	if (client_gtd.gtd_stopped == 1)
+		return;
+
+	if (next_shrink > ktime_get_seconds()) {
+		time64_t delay = next_shrink - ktime_get_seconds();
+
+		schedule_delayed_work(&work, cfs_time_seconds(delay));
+	} else {
+		schedule_work(&work.work);
+	}
+}
+
+void osc_schedule_grant_work(void)
+{
+	cancel_delayed_work_sync(&work);
+	schedule_work(&work.work);
+}
+EXPORT_SYMBOL(osc_schedule_grant_work);
+
+/**
+ * Start grant work for returning grant to the server for idle clients.
+ */
+static int osc_start_grant_work(void)
+{
+	client_gtd.gtd_stopped = 0;
+	mutex_init(&client_gtd.gtd_mutex);
+	INIT_LIST_HEAD(&client_gtd.gtd_clients);
+
+	INIT_DELAYED_WORK(&work, osc_grant_work_handler);
+	schedule_work(&work.work);
+
+	return 0;
+}
+
+static void osc_stop_grant_work(void)
+{
+	client_gtd.gtd_stopped = 1;
+	cancel_delayed_work_sync(&work);
+}
+
+static void osc_add_grant_list(struct client_obd *client)
+{
+	mutex_lock(&client_gtd.gtd_mutex);
+	list_add(&client->cl_grant_chain, &client_gtd.gtd_clients);
+	mutex_unlock(&client_gtd.gtd_mutex);
+}
+
+static void osc_del_grant_list(struct client_obd *client)
+{
+	if (list_empty(&client->cl_grant_chain))
+		return;
+
+	mutex_lock(&client_gtd.gtd_mutex);
+	list_del_init(&client->cl_grant_chain);
+	mutex_unlock(&client_gtd.gtd_mutex);
+}
+
+void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
+{
+	/*
+	 * ocd_grant is the total grant amount we expect to hold: if we've
+	 * been evicted, it's the new avail_grant amount, cl_dirty_pages will
+	 * drop to 0 as inflight RPCs fail out; otherwise, it's avail_grant +
+	 * dirty.
+	 *
+	 * A race is tolerable here: if we're evicted, but imp_state already
+	 * left EVICTED state, then cl_dirty_pages must be 0 already.
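+	 *
+	 * Sketch with made-up numbers: if the server granted
+	 * ocd_grant = 64 MiB and this client already has 16 MiB of
+	 * dirty/reserved data covered by grant, cl_avail_grant starts
+	 * at 64 - 16 = 48 MiB; a negative result is clamped to zero
+	 * with a console error.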
+ */ + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = ocd->ocd_grant; + if (cli->cl_import->imp_state != LUSTRE_IMP_EVICTED) { + unsigned long consumed = cli->cl_reserved_grant; + + if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) + consumed += cli->cl_dirty_grant; + else + consumed += cli->cl_dirty_pages << PAGE_SHIFT; + if (cli->cl_avail_grant < consumed) { + CERROR("%s: granted %ld but already consumed %ld\n", + cli_name(cli), cli->cl_avail_grant, consumed); + cli->cl_avail_grant = 0; + } else { + cli->cl_avail_grant -= consumed; + } + } + + if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) { + u64 size; + int chunk_mask; + + /* overhead for each extent insertion */ + cli->cl_grant_extent_tax = ocd->ocd_grant_tax_kb << 10; + /* determine the appropriate chunk size used by osc_extent. */ + cli->cl_chunkbits = max_t(int, PAGE_SHIFT, + ocd->ocd_grant_blkbits); + /* max_pages_per_rpc must be chunk aligned */ + chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1); + cli->cl_max_pages_per_rpc = (cli->cl_max_pages_per_rpc + + ~chunk_mask) & chunk_mask; + /* determine maximum extent size, in #pages */ + size = (u64)ocd->ocd_grant_max_blks << ocd->ocd_grant_blkbits; + cli->cl_max_extent_pages = (size >> PAGE_SHIFT) ?: 1; + cli->cl_ocd_grant_param = 1; + } else { + cli->cl_ocd_grant_param = 0; + cli->cl_grant_extent_tax = 0; + cli->cl_chunkbits = PAGE_SHIFT; + cli->cl_max_extent_pages = DT_MAX_BRW_PAGES; + } + spin_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_CACHE, + "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld. chunk bits: %d cl_max_extent_pages: %d\n", + cli_name(cli), + cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits, + cli->cl_max_extent_pages); + + if (OCD_HAS_FLAG(ocd, GRANT_SHRINK) && list_empty(&cli->cl_grant_chain)) + osc_add_grant_list(cli); +} +EXPORT_SYMBOL(osc_init_grant); + +/* We assume that the reason this OSC got a short read is because it read + * beyond the end of a stripe file; i.e. lustre is reading a sparse file + * via the LOV, and it _knows_ it's reading inside the file, it's just that + * this stripe never got written at or beyond this stripe offset yet. 
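+ *
+ * The helper below therefore zero-fills the remainder of the page that
+ * contains the EOF boundary and every following page, so no stale data
+ * past the end of the on-disk stripe object is exposed to the reader.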
*/ +static void handle_short_read(int nob_read, size_t page_count, + struct brw_page **pga) +{ + char *ptr; + int i = 0; + + /* skip bytes read OK */ + while (nob_read > 0) { + LASSERT (page_count > 0); + + if (pga[i]->count > nob_read) { + /* EOF inside this page */ + ptr = kmap(pga[i]->pg) + + (pga[i]->off & ~PAGE_MASK); + memset(ptr + nob_read, 0, pga[i]->count - nob_read); + kunmap(pga[i]->pg); + page_count--; + i++; + break; + } + + nob_read -= pga[i]->count; + page_count--; + i++; + } + + /* zero remaining pages */ + while (page_count-- > 0) { + ptr = kmap(pga[i]->pg) + (pga[i]->off & ~PAGE_MASK); + memset(ptr, 0, pga[i]->count); + kunmap(pga[i]->pg); + i++; + } +} + +static int check_write_rcs(struct ptlrpc_request *req, + int requested_nob, int niocount, + size_t page_count, struct brw_page **pga) +{ + int i; + __u32 *remote_rcs; + + remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS, + sizeof(*remote_rcs) * + niocount); + if (remote_rcs == NULL) { + CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n"); + return(-EPROTO); + } + + /* return error if any niobuf was in error */ + for (i = 0; i < niocount; i++) { + if ((int)remote_rcs[i] < 0) { + CDEBUG(D_INFO, "rc[%d]: %d req %p\n", + i, remote_rcs[i], req); + return remote_rcs[i]; + } + + if (remote_rcs[i] != 0) { + CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n", + i, remote_rcs[i], req); + return(-EPROTO); + } + } + if (req->rq_bulk != NULL && + req->rq_bulk->bd_nob_transferred != requested_nob) { + CERROR("Unexpected # bytes transferred: %d (requested %d)\n", + req->rq_bulk->bd_nob_transferred, requested_nob); + return(-EPROTO); + } + + return (0); +} + +static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) +{ + if (p1->flag != p2->flag) { + unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE | + OBD_BRW_SYNC | OBD_BRW_ASYNC | + OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC | + OBD_BRW_SYS_RESOURCE); + + /* warn if we try to combine flags that we don't know to be + * safe to combine */ + if (unlikely((p1->flag & mask) != (p2->flag & mask))) { + CWARN("Saw flags 0x%x and 0x%x in the same brw, please " + "report this at https://jira.whamcloud.com/\n", + p1->flag, p2->flag); + } + return 0; + } + + return (p1->off + p1->count == p2->off); +} + +#if IS_ENABLED(CONFIG_CRC_T10DIF) +static int osc_checksum_bulk_t10pi(const char *obd_name, int nob, + size_t pg_count, struct brw_page **pga, + int opc, obd_dif_csum_fn *fn, + int sector_size, + u32 *check_sum, bool resend) +{ + struct ahash_request *req; + /* Used Adler as the default checksum type on top of DIF tags */ + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + struct page *__page; + unsigned char *buffer; + __be16 *guard_start; + unsigned int bufsize; + int guard_number; + int used_number = 0; + int used; + u32 cksum; + int rc = 0; + int i = 0; + + LASSERT(pg_count > 0); + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("%s: unable to initialize checksum hash %s: rc = %d\n", + obd_name, cfs_crypto_hash_name(cfs_alg), rc); + GOTO(out, rc); + } + + buffer = kmap(__page); + guard_start = (__be16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + CDEBUG(D_PAGE | (resend ? D_HA : 0), + "GRD tags per page=%u, resend=%u, bytes=%u, pages=%zu\n", + guard_number, resend, nob, pg_count); + + while (nob > 0 && pg_count > 0) { + unsigned int count = pga[i]->count > nob ? 
nob : pga[i]->count; + + /* corrupt the data before we compute the checksum, to + * simulate an OST->client data error */ + if (unlikely(i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))) { + unsigned char *ptr = kmap(pga[i]->pg); + int off = pga[i]->off & ~PAGE_MASK; + + memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); + kunmap(pga[i]->pg); + } + + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + rc = obd_page_dif_generate_buffer(obd_name, pga[i]->pg, + pga[i]->off & ~PAGE_MASK, + count, + guard_start + used_number, + guard_number - used_number, + &used, sector_size, + fn); + if (unlikely(resend)) + CDEBUG(D_PAGE | D_HA, + "pga[%u]: used %u off %llu+%u gen checksum: %*phN\n", + i, used, pga[i]->off & ~PAGE_MASK, count, + (int)(used * sizeof(*guard_start)), + guard_start + used_number); + if (rc) + break; + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + + nob -= pga[i]->count; + pg_count--; + i++; + } + kunmap(__page); + if (rc) + GOTO(out, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); + cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + + /* For sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo */ + if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) + cksum++; + + *check_sum = cksum; +out: + __free_page(__page); + return rc; +} +#else /* !CONFIG_CRC_T10DIF */ +#define obd_dif_ip_fn NULL +#define obd_dif_crc_fn NULL +#define osc_checksum_bulk_t10pi(name, nob, pgc, pga, opc, fn, ssize, csum, re) \ + -EOPNOTSUPP +#endif /* CONFIG_CRC_T10DIF */ + +static int osc_checksum_bulk(int nob, size_t pg_count, + struct brw_page **pga, int opc, + enum cksum_types cksum_type, + u32 *cksum) +{ + int i = 0; + struct ahash_request *req; + unsigned int bufsize; + unsigned char cfs_alg = cksum_obd2cfs(cksum_type); + + LASSERT(pg_count > 0); + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(req); + } + + while (nob > 0 && pg_count > 0) { + unsigned int count = pga[i]->count > nob ? 
nob : pga[i]->count; + + /* corrupt the data before we compute the checksum, to + * simulate an OST->client data error */ + if (i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) { + unsigned char *ptr = kmap(pga[i]->pg); + int off = pga[i]->off & ~PAGE_MASK; + + memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); + kunmap(pga[i]->pg); + } + cfs_crypto_hash_update_page(req, pga[i]->pg, + pga[i]->off & ~PAGE_MASK, + count); + LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n", + (int)(pga[i]->off & ~PAGE_MASK)); + + nob -= pga[i]->count; + pg_count--; + i++; + } + + bufsize = sizeof(*cksum); + cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize); + + /* For sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo */ + if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) + (*cksum)++; + + return 0; +} + +static int osc_checksum_bulk_rw(const char *obd_name, + enum cksum_types cksum_type, + int nob, size_t pg_count, + struct brw_page **pga, int opc, + u32 *check_sum, bool resend) +{ + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + int rc; + + ENTRY; + obd_t10_cksum2dif(cksum_type, &fn, &sector_size); + + if (fn) + rc = osc_checksum_bulk_t10pi(obd_name, nob, pg_count, pga, + opc, fn, sector_size, check_sum, + resend); + else + rc = osc_checksum_bulk(nob, pg_count, pga, opc, cksum_type, + check_sum); + + RETURN(rc); +} + +#ifdef CONFIG_LL_ENCRYPTION +/** + * osc_encrypt_pagecache_blocks() - overlay to llcrypt_encrypt_pagecache_blocks + * @srcpage: The locked pagecache page containing the block(s) to encrypt + * @dstpage: The page to put encryption result + * @len: Total size of the block(s) to encrypt. Must be a nonzero + * multiple of the filesystem's block size. + * @offs: Byte offset within @page of the first block to encrypt. Must be + * a multiple of the filesystem's block size. + * @gfp_flags: Memory allocation flags + * + * This overlay function is necessary to be able to provide our own bounce page. + */ +static struct page *osc_encrypt_pagecache_blocks(struct page *srcpage, + struct page *dstpage, + unsigned int len, + unsigned int offs, + gfp_t gfp_flags) + +{ + const struct inode *inode = srcpage->mapping->host; + const unsigned int blockbits = inode->i_blkbits; + const unsigned int blocksize = 1 << blockbits; + u64 lblk_num = ((u64)srcpage->index << (PAGE_SHIFT - blockbits)) + + (offs >> blockbits); + unsigned int i; + int err; + + if (unlikely(!dstpage)) + return llcrypt_encrypt_pagecache_blocks(srcpage, len, offs, + gfp_flags); + + if (WARN_ON_ONCE(!PageLocked(srcpage))) + return ERR_PTR(-EINVAL); + + if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize))) + return ERR_PTR(-EINVAL); + + /* Set PagePrivate2 for disambiguation in + * osc_finalize_bounce_page(). + * It means cipher page was not allocated by llcrypt. + */ + SetPagePrivate2(dstpage); + + for (i = offs; i < offs + len; i += blocksize, lblk_num++) { + err = llcrypt_encrypt_block(inode, srcpage, dstpage, blocksize, + i, lblk_num, gfp_flags); + if (err) + return ERR_PTR(err); + } + SetPagePrivate(dstpage); + set_page_private(dstpage, (unsigned long)srcpage); + return dstpage; +} + +/** + * osc_finalize_bounce_page() - overlay to llcrypt_finalize_bounce_page + * + * This overlay function is necessary to handle bounce pages + * allocated by ourselves.
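[Editor aside, not part of the patch] The two overlay helpers above and below tell their own bounce pages apart from llcrypt-allocated ones by tagging them (PagePrivate2) and linking them back to the clear-text page (page_private). A rough sketch of that ownership-tagging idea on plain structs, hypothetical names throughout, with free() standing in for returning the page to the sptlrpc encryption pool:

#include <stdlib.h>

struct toy_bounce {
	void *src;        /* clear-text buffer this bounce shadows */
	void *cipher;     /* encrypted copy handed to the transport */
	int   self_alloc; /* analogue of PagePrivate2: we own @cipher */
};

void toy_finalize_bounce(struct toy_bounce *b)
{
	if (b->self_alloc) {
		free(b->cipher); /* our page: release it ourselves */
		b->self_alloc = 0;
	}
	/* otherwise the crypto layer owns @cipher and frees it */
	b->cipher = NULL;
}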
+ */ +static inline void osc_finalize_bounce_page(struct page **pagep) +{ + struct page *page = *pagep; + + /* PagePrivate2 was set in osc_encrypt_pagecache_blocks + * to indicate the cipher page was allocated by ourselves. + * So we must not free it via llcrypt. + */ + if (unlikely(!page || !PagePrivate2(page))) + return llcrypt_finalize_bounce_page(pagep); + + if (llcrypt_is_bounce_page(page)) { + *pagep = llcrypt_pagecache_page(page); + ClearPagePrivate2(page); + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + } +} +#else /* !CONFIG_LL_ENCRYPTION */ +#define osc_encrypt_pagecache_blocks(srcpage, dstpage, len, offs, gfp_flags) \ + llcrypt_encrypt_pagecache_blocks(srcpage, len, offs, gfp_flags) +#define osc_finalize_bounce_page(page) llcrypt_finalize_bounce_page(page) +#endif + +static inline void osc_release_bounce_pages(struct brw_page **pga, + u32 page_count) +{ +#ifdef HAVE_LUSTRE_CRYPTO + struct page **pa = NULL; + int i, j = 0; + +#ifdef CONFIG_LL_ENCRYPTION + if (PageChecked(pga[0]->pg)) { + OBD_ALLOC_PTR_ARRAY_LARGE(pa, page_count); + if (!pa) + return; + } +#endif + + for (i = 0; i < page_count; i++) { + /* Bounce pages used by osc_encrypt_pagecache_blocks() + * called from osc_brw_prep_request() + * are identified thanks to the PageChecked flag. + */ + if (PageChecked(pga[i]->pg)) { + if (pa) + pa[j++] = pga[i]->pg; + osc_finalize_bounce_page(&pga[i]->pg); + } + pga[i]->count -= pga[i]->bp_count_diff; + pga[i]->off += pga[i]->bp_off_diff; + } + + if (pa) { + sptlrpc_enc_pool_put_pages_array(pa, j); + OBD_FREE_PTR_ARRAY_LARGE(pa, page_count); + } +#endif +} + +static int +osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, + u32 page_count, struct brw_page **pga, + struct ptlrpc_request **reqp, int resend) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + struct ost_body *body; + struct obd_ioobj *ioobj; + struct niobuf_remote *niobuf; + int niocount, i, requested_nob, opc, rc, short_io_size = 0; + struct osc_brw_async_args *aa; + struct req_capsule *pill; + struct brw_page *pg_prev; + void *short_io_buf; + const char *obd_name = cli->cl_import->imp_obd->obd_name; + struct inode *inode = NULL; + bool directio = false; + bool enable_checksum = true; + struct cl_page *clpage; + + ENTRY; + if (pga[0]->pg) { + clpage = oap2cl_page(brw_page2oap(pga[0])); + inode = clpage->cp_inode; + if (clpage->cp_type == CPT_TRANSIENT) + directio = true; + } + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ)) + RETURN(-ENOMEM); /* Recoverable */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2)) + RETURN(-EINVAL); /* Fatal */ + + if ((cmd & OBD_BRW_WRITE) != 0) { + opc = OST_WRITE; + req = ptlrpc_request_alloc_pool(cli->cl_import, + osc_rq_pool, + &RQF_OST_BRW_WRITE); + } else { + opc = OST_READ; + req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ); + } + if (req == NULL) + RETURN(-ENOMEM); + + if (opc == OST_WRITE && inode && IS_ENCRYPTED(inode) && + llcrypt_has_encryption_key(inode)) { + struct page **pa = NULL; + +#ifdef CONFIG_LL_ENCRYPTION + OBD_ALLOC_PTR_ARRAY_LARGE(pa, page_count); + if (pa == NULL) { + ptlrpc_request_free(req); + RETURN(-ENOMEM); + } + + rc = sptlrpc_enc_pool_get_pages_array(pa, page_count); + if (rc) { + CDEBUG(D_SEC, "failed to allocate from enc pool: %d\n", + rc); + ptlrpc_request_free(req); + RETURN(rc); + } +#endif + + for (i = 0; i < page_count; i++) { + struct brw_page *brwpg = pga[i]; + struct page *data_page = NULL; + bool retried = false; + bool lockedbymyself; + u32 nunits = (brwpg->off & 
~PAGE_MASK) + brwpg->count; + struct address_space *map_orig = NULL; + pgoff_t index_orig; + +retry_encrypt: + nunits = round_up(nunits, LUSTRE_ENCRYPTION_UNIT_SIZE); + /* The page can already be locked when we arrive here. + * This is possible when cl_page_assume/vvp_page_assume + * is stuck on wait_on_page_writeback with page lock + * held. In this case there is no risk for the lock to + * be released while we are doing our encryption + * processing, because writeback against that page will + * end in vvp_page_completion_write/cl_page_completion, + * which means only once the page is fully processed. + */ + lockedbymyself = trylock_page(brwpg->pg); + if (directio) { + map_orig = brwpg->pg->mapping; + brwpg->pg->mapping = inode->i_mapping; + index_orig = brwpg->pg->index; + clpage = oap2cl_page(brw_page2oap(brwpg)); + brwpg->pg->index = clpage->cp_page_index; + } + data_page = + osc_encrypt_pagecache_blocks(brwpg->pg, + pa ? pa[i] : NULL, + nunits, 0, + GFP_NOFS); + if (directio) { + brwpg->pg->mapping = map_orig; + brwpg->pg->index = index_orig; + } + if (lockedbymyself) + unlock_page(brwpg->pg); + if (IS_ERR(data_page)) { + rc = PTR_ERR(data_page); + if (rc == -ENOMEM && !retried) { + retried = true; + rc = 0; + goto retry_encrypt; + } + if (pa) { + sptlrpc_enc_pool_put_pages_array(pa + i, + page_count - i); + OBD_FREE_PTR_ARRAY_LARGE(pa, + page_count); + } + ptlrpc_request_free(req); + RETURN(rc); + } + /* Set PageChecked flag on bounce page for + * disambiguation in osc_release_bounce_pages(). + */ + SetPageChecked(data_page); + brwpg->pg = data_page; + /* there should be no gap in the middle of page array */ + if (i == page_count - 1) { + struct osc_async_page *oap = + brw_page2oap(brwpg); + + oa->o_size = oap->oap_count + + oap->oap_obj_off + oap->oap_page_off; + } + /* len is forced to nunits, and relative offset to 0 + * so store the old, clear text info + */ + brwpg->bp_count_diff = nunits - brwpg->count; + brwpg->count = nunits; + brwpg->bp_off_diff = brwpg->off & ~PAGE_MASK; + brwpg->off = brwpg->off & PAGE_MASK; + } + + if (pa) + OBD_FREE_PTR_ARRAY_LARGE(pa, page_count); + } else if (opc == OST_WRITE && inode && IS_ENCRYPTED(inode)) { + struct osc_async_page *oap = brw_page2oap(pga[0]); + struct cl_page *clpage = oap2cl_page(oap); + struct cl_object *clobj = clpage->cp_obj; + struct cl_attr attr = { 0 }; + struct lu_env *env; + __u16 refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + rc = PTR_ERR(env); + ptlrpc_request_free(req); + RETURN(rc); + } + + cl_object_attr_lock(clobj); + rc = cl_object_attr_get(env, clobj, &attr); + cl_object_attr_unlock(clobj); + cl_env_put(env, &refcheck); + if (rc != 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + if (attr.cat_size) + oa->o_size = attr.cat_size; + } else if (opc == OST_READ && inode && IS_ENCRYPTED(inode) && + llcrypt_has_encryption_key(inode)) { + for (i = 0; i < page_count; i++) { + struct brw_page *pg = pga[i]; + u32 nunits = (pg->off & ~PAGE_MASK) + pg->count; + + nunits = round_up(nunits, LUSTRE_ENCRYPTION_UNIT_SIZE); + /* count/off are forced to cover the whole encryption + * unit size so that all encrypted data is stored on the + * OST, so adjust bp_{count,off}_diff for the size of + * the clear text. 
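[Editor aside, not part of the patch] The rounding described in the comment above (widen the on-wire extent to whole encryption units and remember the clear-text deltas so they can be undone after the RPC) reduces to a few lines of integer arithmetic. A sketch with hypothetical constants (TOY_UNIT stands in for LUSTRE_ENCRYPTION_UNIT_SIZE):

#include <stdio.h>

#define TOY_PAGE_SIZE 4096u
#define TOY_UNIT      1024u  /* illustrative encryption unit size */

struct toy_bpg {
	unsigned long long off;  /* byte offset in the object */
	unsigned int count;      /* clear-text byte count */
	unsigned int off_diff;   /* saved so the offset can be restored */
	unsigned int count_diff; /* saved so the count can be restored */
};

static void toy_round_to_unit(struct toy_bpg *pg)
{
	unsigned int in_page = pg->off % TOY_PAGE_SIZE;          /* off & ~PAGE_MASK */
	unsigned int nunits = in_page + pg->count;

	nunits = (nunits + TOY_UNIT - 1) / TOY_UNIT * TOY_UNIT;  /* round_up() */
	pg->count_diff = nunits - pg->count;
	pg->count = nunits;
	pg->off_diff = in_page;
	pg->off -= in_page;                                      /* off & PAGE_MASK */
}

int main(void)
{
	struct toy_bpg pg = { .off = 5000, .count = 300 };

	toy_round_to_unit(&pg);
	/* prints: off=4096 count=2048 (saved diffs 904/1748) */
	printf("off=%llu count=%u (saved diffs %u/%u)\n",
	       pg.off, pg.count, pg.off_diff, pg.count_diff);
	return 0;
}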
+ */ + pg->bp_count_diff = nunits - pg->count; + pg->count = nunits; + pg->bp_off_diff = pg->off & ~PAGE_MASK; + pg->off = pg->off & PAGE_MASK; + } + } + + for (niocount = i = 1; i < page_count; i++) { + if (!can_merge_pages(pga[i - 1], pga[i])) + niocount++; + } + + pill = &req->rq_pill; + req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT, + sizeof(*ioobj)); + req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT, + niocount * sizeof(*niobuf)); + + for (i = 0; i < page_count; i++) { + short_io_size += pga[i]->count; + if (!inode || !IS_ENCRYPTED(inode) || + !llcrypt_has_encryption_key(inode)) { + pga[i]->bp_count_diff = 0; + pga[i]->bp_off_diff = 0; + } + } + + if (brw_page2oap(pga[0])->oap_brw_flags & OBD_BRW_RDMA_ONLY) { + enable_checksum = false; + short_io_size = 0; + } + + /* Check if read/write is small enough to be a short io. */ + if (short_io_size > cli->cl_max_short_io_bytes || niocount > 1 || + !imp_connect_shortio(cli->cl_import)) + short_io_size = 0; + + /* If this is an empty RPC to old server, just ignore it */ + if (!short_io_size && !pga[0]->pg) { + ptlrpc_request_free(req); + RETURN(-ENODATA); + } + + req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_CLIENT, + opc == OST_READ ? 0 : short_io_size); + if (opc == OST_READ) + req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_SERVER, + short_io_size); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + osc_set_io_portal(req); + + ptlrpc_at_set_req_timeout(req); + /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own + * retry logic */ + req->rq_no_retry_einprogress = 1; + + if (short_io_size != 0) { + desc = NULL; + short_io_buf = NULL; + goto no_bulk; + } + + desc = ptlrpc_prep_bulk_imp(req, page_count, + cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS, + (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE : + PTLRPC_BULK_PUT_SINK), + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + + if (desc == NULL) + GOTO(out, rc = -ENOMEM); + /* NB request now owns desc and will free it when it gets freed */ +no_bulk: + body = req_capsule_client_get(pill, &RMF_OST_BODY); + ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ); + niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); + LASSERT(body != NULL && ioobj != NULL && niobuf != NULL); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + /* For READ and WRITE, we can't fill o_uid and o_gid using from_kuid() + * and from_kgid(), because they are asynchronous. Fortunately, variable + * oa contains valid o_uid and o_gid in these two operations. + * Besides, filling o_uid and o_gid is enough for nrs-tbf, see LU-9658. + * OBD_MD_FLUID and OBD_MD_FLUID is not set in order to avoid breaking + * other process logic */ + body->oa.o_uid = oa->o_uid; + body->oa.o_gid = oa->o_gid; + + obdo_to_ioobj(oa, ioobj); + ioobj->ioo_bufcnt = niocount; + /* The high bits of ioo_max_brw tells server _maximum_ number of bulks + * that might be send for this request. The actual number is decided + * when the RPC is finally sent in ptlrpc_register_bulk(). It sends + * "max - 1" for old client compatibility sending "0", and also so the + * the actual maximum is a power-of-two number, not one less. 
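[Editor aside, not part of the patch] The niocount computed earlier, and the niobuf coalescing performed in the loop that follows, both come down to one test: two pages belong to the same niobuf when they carry the same flags and are byte-contiguous. A compact sketch with hypothetical types:

#include <stdio.h>

struct toy_pg {
	unsigned long long off;
	unsigned int count;
	unsigned int flag;
};

static int toy_can_merge(const struct toy_pg *a, const struct toy_pg *b)
{
	return a->flag == b->flag && a->off + a->count == b->off;
}

static int toy_niocount(const struct toy_pg *pga, int page_count)
{
	int nio = 1, i;

	for (i = 1; i < page_count; i++)
		if (!toy_can_merge(&pga[i - 1], &pga[i]))
			nio++;
	return nio;
}

int main(void)
{
	struct toy_pg pga[] = {
		{ 0,     4096, 0 },  /* contiguous with the next page */
		{ 4096,  4096, 0 },
		{ 16384, 4096, 0 },  /* gap: starts a new niobuf */
	};

	printf("niocount=%d\n", toy_niocount(pga, 3)); /* prints 2 */
	return 0;
}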
LU-1431 */ + if (desc != NULL) + ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); + else /* short io */ + ioobj_max_brw_set(ioobj, 0); + + if (inode && IS_ENCRYPTED(inode) && + llcrypt_has_encryption_key(inode) && + !OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NO_ENCFLAG)) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= LUSTRE_ENCRYPT_FL; + } + + if (short_io_size != 0) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_SHORT_IO; + CDEBUG(D_CACHE, "Using short io for data transfer, size = %d\n", + short_io_size); + if (opc == OST_WRITE) { + short_io_buf = req_capsule_client_get(pill, + &RMF_SHORT_IO); + LASSERT(short_io_buf != NULL); + } + } + + LASSERT(page_count > 0); + pg_prev = pga[0]; + for (requested_nob = i = 0; i < page_count; i++, niobuf++) { + struct brw_page *pg = pga[i]; + int poff = pg->off & ~PAGE_MASK; + + LASSERT(pg->count > 0); + /* make sure there is no gap in the middle of page array */ + LASSERTF(page_count == 1 || + (ergo(i == 0, poff + pg->count == PAGE_SIZE) && + ergo(i > 0 && i < page_count - 1, + poff == 0 && pg->count == PAGE_SIZE) && + ergo(i == page_count - 1, poff == 0)), + "i: %d/%d pg: %p off: %llu, count: %u\n", + i, page_count, pg, pg->off, pg->count); + LASSERTF(i == 0 || pg->off > pg_prev->off, + "i %d p_c %u pg %p [pri %lu ind %lu] off %llu" + " prev_pg %p [pri %lu ind %lu] off %llu\n", + i, page_count, + pg->pg, page_private(pg->pg), pg->pg->index, pg->off, + pg_prev->pg, page_private(pg_prev->pg), + pg_prev->pg->index, pg_prev->off); + LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) == + (pg->flag & OBD_BRW_SRVLOCK)); + if (short_io_size != 0 && opc == OST_WRITE) { + unsigned char *ptr = kmap_atomic(pg->pg); + + LASSERT(short_io_size >= requested_nob + pg->count); + memcpy(short_io_buf + requested_nob, + ptr + poff, + pg->count); + kunmap_atomic(ptr); + } else if (short_io_size == 0) { + desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, + pg->count); + } + requested_nob += pg->count; + + if (i > 0 && can_merge_pages(pg_prev, pg)) { + niobuf--; + niobuf->rnb_len += pg->count; + } else { + niobuf->rnb_offset = pg->off; + niobuf->rnb_len = pg->count; + niobuf->rnb_flags = pg->flag; + } + pg_prev = pg; + } + + LASSERTF((void *)(niobuf - niocount) == + req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE), + "want %p - real %p\n", req_capsule_client_get(&req->rq_pill, + &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount)); + + osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? 
requested_nob:0); + if (resend) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_RECOV_RESEND; + } + + if (osc_should_shrink_grant(cli)) + osc_shrink_grant_local(cli, &body->oa); + + if (!cli->cl_checksum || sptlrpc_flavor_has_bulk(&req->rq_flvr)) + enable_checksum = false; + + /* size[REQ_REC_OFF] still sizeof (*body) */ + if (opc == OST_WRITE) { + if (enable_checksum) { + /* store cl_cksum_type in a local variable since + * it can be changed via lprocfs */ + enum cksum_types cksum_type = cli->cl_cksum_type; + + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) + body->oa.o_flags = 0; + + body->oa.o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); + body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + + rc = osc_checksum_bulk_rw(obd_name, cksum_type, + requested_nob, page_count, + pga, OST_WRITE, + &body->oa.o_cksum, resend); + if (rc < 0) { + CDEBUG(D_PAGE, "failed to checksum: rc = %d\n", + rc); + GOTO(out, rc); + } + CDEBUG(D_PAGE | (resend ? D_HA : 0), + "checksum at write origin: %x (%x)\n", + body->oa.o_cksum, cksum_type); + + /* save this in 'oa', too, for later checking */ + oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + oa->o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); + } else { + /* clear out the checksum flag, in case this is a + * resend but cl_checksum is no longer set. b=11238 */ + oa->o_valid &= ~OBD_MD_FLCKSUM; + } + oa->o_cksum = body->oa.o_cksum; + /* 1 RC per niobuf */ + req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER, + sizeof(__u32) * niocount); + } else { + if (enable_checksum) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) + body->oa.o_flags = 0; + body->oa.o_flags |= obd_cksum_type_pack(obd_name, + cli->cl_cksum_type); + body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + } + + /* Client cksum has been already copied to wire obdo in previous + * lustre_set_wire_obdo(), and in the case a bulk-read is being + * resent due to cksum error, this will allow Server to + * check+dump pages on its side */ + } + ptlrpc_request_set_replen(req); + + aa = ptlrpc_req_async_args(aa, req); + aa->aa_oa = oa; + aa->aa_requested_nob = requested_nob; + aa->aa_nio_count = niocount; + aa->aa_page_count = page_count; + aa->aa_resends = 0; + aa->aa_ppga = pga; + aa->aa_cli = cli; + INIT_LIST_HEAD(&aa->aa_oaps); + + *reqp = req; + niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); + CDEBUG(D_RPCTRACE, "brw rpc %p - object "DOSTID" offset %lld<>%lld\n", + req, POSTID(&oa->o_oi), niobuf[0].rnb_offset, + niobuf[niocount - 1].rnb_offset + niobuf[niocount - 1].rnb_len); + RETURN(0); + + out: + ptlrpc_req_finished(req); + RETURN(rc); +} + +char dbgcksum_file_name[PATH_MAX]; + +static void dump_all_bulk_pages(struct obdo *oa, __u32 page_count, + struct brw_page **pga, __u32 server_cksum, + __u32 client_cksum) +{ + struct file *filp; + int rc, i; + unsigned int len; + char *buf; + + /* will only keep dump of pages on first error for the same range in + * file/fid, not during the resends/retries. */ + snprintf(dbgcksum_file_name, sizeof(dbgcksum_file_name), + "%s-checksum_dump-osc-"DFID":[%llu-%llu]-%x-%x", + (strncmp(libcfs_debug_file_path, "NONE", 4) != 0 ? + libcfs_debug_file_path : LIBCFS_DEBUG_FILE_PATH_DEFAULT), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, + pga[0]->off, + pga[page_count-1]->off + pga[page_count-1]->count - 1, + client_cksum, server_cksum); + CWARN("dumping checksum data to %s\n", dbgcksum_file_name); + filp = filp_open(dbgcksum_file_name, + O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + if (rc == -EEXIST) + CDEBUG(D_INFO, "%s: can't open to dump pages with " + "checksum error: rc = %d\n", dbgcksum_file_name, + rc); + else + CERROR("%s: can't open to dump pages with checksum " + "error: rc = %d\n", dbgcksum_file_name, rc); + return; + } + + for (i = 0; i < page_count; i++) { + len = pga[i]->count; + buf = kmap(pga[i]->pg); + while (len != 0) { + rc = cfs_kernel_write(filp, buf, len, &filp->f_pos); + if (rc < 0) { + CERROR("%s: wanted to write %u but got %d " + "error\n", dbgcksum_file_name, len, rc); + break; + } + len -= rc; + buf += rc; + } + kunmap(pga[i]->pg); + } + + rc = vfs_fsync_range(filp, 0, LLONG_MAX, 1); + if (rc) + CERROR("%s: sync returns %d\n", dbgcksum_file_name, rc); + filp_close(filp, NULL); + + libcfs_debug_dumplog(); +} + +static int +check_write_checksum(struct obdo *oa, const struct lnet_processid *peer, + __u32 client_cksum, __u32 server_cksum, + struct osc_brw_async_args *aa) +{ + const char *obd_name = aa->aa_cli->cl_import->imp_obd->obd_name; + enum cksum_types cksum_type; + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + __u32 new_cksum; + char *msg; + int rc; + + if (server_cksum == client_cksum) { + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + return 0; + } + + if (aa->aa_cli->cl_checksum_dump) + dump_all_bulk_pages(oa, aa->aa_page_count, aa->aa_ppga, + server_cksum, client_cksum); + + cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? + oa->o_flags : 0); + + switch (cksum_type) { + case OBD_CKSUM_T10IP512: + fn = obd_dif_ip_fn; + sector_size = 512; + break; + case OBD_CKSUM_T10IP4K: + fn = obd_dif_ip_fn; + sector_size = 4096; + break; + case OBD_CKSUM_T10CRC512: + fn = obd_dif_crc_fn; + sector_size = 512; + break; + case OBD_CKSUM_T10CRC4K: + fn = obd_dif_crc_fn; + sector_size = 4096; + break; + default: + break; + } + + if (fn) + rc = osc_checksum_bulk_t10pi(obd_name, aa->aa_requested_nob, + aa->aa_page_count, aa->aa_ppga, + OST_WRITE, fn, sector_size, + &new_cksum, true); + else + rc = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count, + aa->aa_ppga, OST_WRITE, cksum_type, + &new_cksum); + + if (rc < 0) + msg = "failed to calculate the client write checksum"; + else if (cksum_type != obd_cksum_type_unpack(aa->aa_oa->o_flags)) + msg = "the server did not use the checksum type specified in " + "the original request - likely a protocol problem"; + else if (new_cksum == server_cksum) + msg = "changed on the client after we checksummed it - " + "likely false positive due to mmap IO (bug 11742)"; + else if (new_cksum == client_cksum) + msg = "changed in transit before arrival at OST"; + else + msg = "changed in transit AND doesn't match the original - " + "likely false positive due to mmap IO (bug 11742)"; + + LCONSOLE_ERROR_MSG(0x132, "%s: BAD WRITE CHECKSUM: %s: from %s inode " + DFID " object "DOSTID" extent [%llu-%llu], original " + "client csum %x (type %x), server csum %x (type %x)," + " client csum now %x\n", + obd_name, msg, libcfs_nidstr(&peer->nid), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, + POSTID(&oa->o_oi), aa->aa_ppga[0]->off, + aa->aa_ppga[aa->aa_page_count - 1]->off + + aa->aa_ppga[aa->aa_page_count-1]->count - 1, + client_cksum, + obd_cksum_type_unpack(aa->aa_oa->o_flags), + server_cksum, cksum_type, new_cksum); + return 1; +} + +/* Note rc enters this function as number of bytes transferred */ +static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) +{ + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + struct client_obd *cli = aa->aa_cli; + const char *obd_name = cli->cl_import->imp_obd->obd_name; + const struct lnet_processid *peer = + &req->rq_import->imp_connection->c_peer; + struct ost_body *body; + u32 client_cksum = 0; + struct inode *inode = NULL; + unsigned int blockbits = 0, blocksize = 0; + struct cl_page *clpage; + + ENTRY; + + if (rc < 0 && rc != -EDQUOT) { + DEBUG_REQ(D_INFO, req, "Failed request: rc = %d", rc); + RETURN(rc); + } + + LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc); + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + DEBUG_REQ(D_INFO, req, "cannot unpack body"); + RETURN(-EPROTO); + } + + /* set/clear over quota flag for a uid/gid/projid */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && + body->oa.o_valid & (OBD_MD_FLALLQUOTA)) { + unsigned qid[LL_MAXQUOTAS] = { + body->oa.o_uid, body->oa.o_gid, + body->oa.o_projid }; + CDEBUG(D_QUOTA, + "setdq for [%u %u %u] with valid %#llx, flags %x\n", + body->oa.o_uid, body->oa.o_gid, body->oa.o_projid, + body->oa.o_valid, body->oa.o_flags); + osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid, + body->oa.o_flags); + } + + osc_update_grant(cli, body); + + if (rc < 0) + RETURN(rc); + + if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM) + client_cksum = aa->aa_oa->o_cksum; /* save for later */ + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { + if (rc > 0) { + CERROR("%s: unexpected positive size %d\n", + obd_name, rc); + RETURN(-EPROTO); + } + + if (req->rq_bulk != NULL && + sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) + RETURN(-EAGAIN); + + if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum && + check_write_checksum(&body->oa, peer, client_cksum, + body->oa.o_cksum, aa)) + RETURN(-EAGAIN); + + rc = check_write_rcs(req, aa->aa_requested_nob, + aa->aa_nio_count, aa->aa_page_count, + aa->aa_ppga); + GOTO(out, rc); + } + + /* The rest of this function executes only for OST_READs */ + + if (req->rq_bulk == NULL) { + rc = req_capsule_get_size(&req->rq_pill, &RMF_SHORT_IO, + RCL_SERVER); + LASSERT(rc == req->rq_status); + } else { + /* if unwrap_bulk failed, return -EAGAIN to retry */ + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc); + } + if (rc < 0) + GOTO(out, rc = -EAGAIN); + + if (rc > aa->aa_requested_nob) { + CERROR("%s: unexpected size %d, requested %d\n", obd_name, + rc, aa->aa_requested_nob); + RETURN(-EPROTO); + } + + if (req->rq_bulk != NULL && rc != req->rq_bulk->bd_nob_transferred) { + CERROR("%s: unexpected size %d, transferred %d\n", obd_name, + rc, req->rq_bulk->bd_nob_transferred); + RETURN(-EPROTO); + } + + if (req->rq_bulk == NULL) { + /* short io */ + int nob, pg_count, i = 0; + unsigned char *buf; + + CDEBUG(D_CACHE, "Using short io read, size %d\n", rc); + pg_count = aa->aa_page_count; + buf = req_capsule_server_sized_get(&req->rq_pill, &RMF_SHORT_IO, + rc); + nob = rc; + while (nob > 0 && pg_count > 0) { + unsigned char *ptr; + int count = aa->aa_ppga[i]->count > nob ? 
+ nob : aa->aa_ppga[i]->count; + + CDEBUG(D_CACHE, "page %p count %d\n", + aa->aa_ppga[i]->pg, count); + ptr = kmap_atomic(aa->aa_ppga[i]->pg); + memcpy(ptr + (aa->aa_ppga[i]->off & ~PAGE_MASK), buf, + count); + kunmap_atomic((void *) ptr); + + buf += count; + nob -= count; + i++; + pg_count--; + } + } + + if (rc < aa->aa_requested_nob) + handle_short_read(rc, aa->aa_page_count, aa->aa_ppga); + + if (body->oa.o_valid & OBD_MD_FLCKSUM) { + static int cksum_counter; + u32 server_cksum = body->oa.o_cksum; + int nob = rc; + char *via = ""; + char *router = ""; + enum cksum_types cksum_type; + u32 o_flags = body->oa.o_valid & OBD_MD_FLFLAGS ? + body->oa.o_flags : 0; + + cksum_type = obd_cksum_type_unpack(o_flags); + rc = osc_checksum_bulk_rw(obd_name, cksum_type, nob, + aa->aa_page_count, aa->aa_ppga, + OST_READ, &client_cksum, false); + if (rc < 0) + GOTO(out, rc); + + if (req->rq_bulk != NULL && + lnet_nid_to_nid4(&peer->nid) != req->rq_bulk->bd_sender) { + via = " via "; + router = libcfs_nid2str(req->rq_bulk->bd_sender); + } + + if (server_cksum != client_cksum) { + struct ost_body *clbody; + __u32 client_cksum2; + u32 page_count = aa->aa_page_count; + + osc_checksum_bulk_rw(obd_name, cksum_type, nob, + page_count, aa->aa_ppga, + OST_READ, &client_cksum2, true); + clbody = req_capsule_client_get(&req->rq_pill, + &RMF_OST_BODY); + if (cli->cl_checksum_dump) + dump_all_bulk_pages(&clbody->oa, page_count, + aa->aa_ppga, server_cksum, + client_cksum); + + LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from " + "%s%s%s inode "DFID" object "DOSTID + " extent [%llu-%llu], client %x/%x, " + "server %x, cksum_type %x\n", + obd_name, + libcfs_nidstr(&peer->nid), + via, router, + clbody->oa.o_valid & OBD_MD_FLFID ? + clbody->oa.o_parent_seq : 0ULL, + clbody->oa.o_valid & OBD_MD_FLFID ? + clbody->oa.o_parent_oid : 0, + clbody->oa.o_valid & OBD_MD_FLFID ? + clbody->oa.o_parent_ver : 0, + POSTID(&body->oa.o_oi), + aa->aa_ppga[0]->off, + aa->aa_ppga[page_count-1]->off + + aa->aa_ppga[page_count-1]->count - 1, + client_cksum, client_cksum2, + server_cksum, cksum_type); + cksum_counter = 0; + aa->aa_oa->o_cksum = client_cksum; + rc = -EAGAIN; + } else { + cksum_counter++; + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + rc = 0; + } + } else if (unlikely(client_cksum)) { + static int cksum_missed; + + cksum_missed++; + if ((cksum_missed & (-cksum_missed)) == cksum_missed) + CERROR("%s: checksum %u requested from %s but not sent\n", + obd_name, cksum_missed, + libcfs_nidstr(&peer->nid)); + } else { + rc = 0; + } + + /* get the inode from the first cl_page */ + clpage = oap2cl_page(brw_page2oap(aa->aa_ppga[0])); + inode = clpage->cp_inode; + if (clpage->cp_type == CPT_TRANSIENT && inode) { + blockbits = inode->i_blkbits; + blocksize = 1 << blockbits; + } + if (inode && IS_ENCRYPTED(inode)) { + int idx; + + if (!llcrypt_has_encryption_key(inode)) { + CDEBUG(D_SEC, "no enc key for ino %lu\n", inode->i_ino); + GOTO(out, rc); + } + for (idx = 0; idx < aa->aa_page_count; idx++) { + struct brw_page *brwpg = aa->aa_ppga[idx]; + unsigned int offs = 0; + + while (offs < PAGE_SIZE) { + /* do not decrypt if page is all 0s */ + if (memchr_inv(page_address(brwpg->pg) + offs, + 0, LUSTRE_ENCRYPTION_UNIT_SIZE) == NULL) { + /* if page is empty forward info to + * upper layers (ll_io_zero_page) by + * clearing PagePrivate2 + */ + if (!offs) + ClearPagePrivate2(brwpg->pg); + break; + } + + if (blockbits) { + /* This is direct IO case. Directly call + * decrypt function that takes inode as + * input parameter. 
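[Editor aside, not part of the patch] The direct IO decrypt branch below maps a (page index, byte offset) pair to the logical block number fed to the block-level decrypt call. The arithmetic in isolation, with an assumed 4 KiB page size:

#include <stdio.h>
#include <stdint.h>

#define TOY_PAGE_SHIFT 12u  /* assume 4 KiB pages */

static uint64_t toy_lblk_num(uint64_t page_index, unsigned int offs,
			     unsigned int blockbits)
{
	/* blocks per page = 1 << (PAGE_SHIFT - blockbits) */
	return (page_index << (TOY_PAGE_SHIFT - blockbits)) +
	       (offs >> blockbits);
}

int main(void)
{
	/* page 3, byte offset 2048, 1 KiB blocks (blockbits = 10):
	 * 3 * 4 blocks per page + 2 = logical block 14 */
	printf("lblk=%llu\n", (unsigned long long)toy_lblk_num(3, 2048, 10));
	return 0;
}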
Page does not need + * to be locked. + */ + u64 lblk_num; + unsigned int i; + + clpage = + oap2cl_page(brw_page2oap(brwpg)); + lblk_num = + ((u64)(clpage->cp_page_index) << + (PAGE_SHIFT - blockbits)) + + (offs >> blockbits); + for (i = offs; + i < offs + + LUSTRE_ENCRYPTION_UNIT_SIZE; + i += blocksize, lblk_num++) { + rc = + llcrypt_decrypt_block_inplace( + inode, brwpg->pg, + blocksize, i, + lblk_num); + if (rc) + break; + } + } else { + rc = llcrypt_decrypt_pagecache_blocks( + brwpg->pg, + LUSTRE_ENCRYPTION_UNIT_SIZE, + offs); + } + if (rc) + GOTO(out, rc); + + offs += LUSTRE_ENCRYPTION_UNIT_SIZE; + } + } + } + +out: + if (rc >= 0) + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, + aa->aa_oa, &body->oa); + + RETURN(rc); +} + +static int osc_brw_redo_request(struct ptlrpc_request *request, + struct osc_brw_async_args *aa, int rc) +{ + struct ptlrpc_request *new_req; + struct osc_brw_async_args *new_aa; + struct osc_async_page *oap; + ENTRY; + + /* The below message is checked in replay-ost-single.sh test_8ae*/ + DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request, + "redo for recoverable error %d", rc); + + rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) == + OST_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ, + aa->aa_cli, aa->aa_oa, aa->aa_page_count, + aa->aa_ppga, &new_req, 1); + if (rc) + RETURN(rc); + + list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { + if (oap->oap_request != NULL) { + LASSERTF(request == oap->oap_request, + "request %p != oap_request %p\n", + request, oap->oap_request); + } + } + /* + * New request takes over pga and oaps from old request. + * Note that copying a list_head doesn't work, need to move it... + */ + aa->aa_resends++; + new_req->rq_interpret_reply = request->rq_interpret_reply; + new_req->rq_async_args = request->rq_async_args; + new_req->rq_commit_cb = request->rq_commit_cb; + /* cap resend delay to the current request timeout, this is similar to + * what ptlrpc does (see after_reply()) */ + if (aa->aa_resends > new_req->rq_timeout) + new_req->rq_sent = ktime_get_real_seconds() + new_req->rq_timeout; + else + new_req->rq_sent = ktime_get_real_seconds() + aa->aa_resends; + new_req->rq_generation_set = 1; + new_req->rq_import_generation = request->rq_import_generation; + + new_aa = ptlrpc_req_async_args(new_aa, new_req); + + INIT_LIST_HEAD(&new_aa->aa_oaps); + list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps); + INIT_LIST_HEAD(&new_aa->aa_exts); + list_splice_init(&aa->aa_exts, &new_aa->aa_exts); + new_aa->aa_resends = aa->aa_resends; + + list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) { + if (oap->oap_request) { + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = ptlrpc_request_addref(new_req); + } + } + + /* XXX: This code will run into problem if we're going to support + * to add a series of BRW RPCs into a self-defined ptlrpc_request_set + * and wait for all of them to be finished. We should inherit request + * set from old request. */ + ptlrpcd_add_req(new_req); + + DEBUG_REQ(D_INFO, new_req, "new request"); + RETURN(0); +} + +/* + * ugh, we want disk allocation on the target to happen in offset order. we'll + * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do + * fine for our small page arrays and doesn't require allocation. its an + * insertion sort that swaps elements that are strides apart, shrinking the + * stride down until its '1' and the array is sorted. 
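[Editor aside, not part of the patch] sort_brw_pages() that follows is the classic Shellsort with the 3h+1 stride sequence the comment describes. The same algorithm on a plain int array, for reference:

#include <stdio.h>

static void toy_shellsort(int *a, int num)
{
	int stride, i, j, tmp;

	if (num <= 1)
		return;
	/* grow the stride past num, then shrink it by thirds */
	for (stride = 1; stride < num; stride = stride * 3 + 1)
		;
	do {
		stride /= 3;
		for (i = stride; i < num; i++) {
			tmp = a[i];
			for (j = i; j >= stride && a[j - stride] > tmp; j -= stride)
				a[j] = a[j - stride];
			a[j] = tmp;
		}
	} while (stride > 1);
}

int main(void)
{
	int a[] = { 5, 1, 4, 2, 3 };
	int i;

	toy_shellsort(a, 5);
	for (i = 0; i < 5; i++)
		printf("%d ", a[i]);  /* 1 2 3 4 5 */
	printf("\n");
	return 0;
}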
+ */ +static void sort_brw_pages(struct brw_page **array, int num) +{ + int stride, i, j; + struct brw_page *tmp; + + if (num == 1) + return; + for (stride = 1; stride < num ; stride = (stride * 3) + 1) + ; + + do { + stride /= 3; + for (i = stride ; i < num ; i++) { + tmp = array[i]; + j = i; + while (j >= stride && array[j - stride]->off > tmp->off) { + array[j] = array[j - stride]; + j -= stride; + } + array[j] = tmp; + } + } while (stride > 1); +} + +static void osc_release_ppga(struct brw_page **ppga, size_t count) +{ + LASSERT(ppga != NULL); + OBD_FREE_PTR_ARRAY_LARGE(ppga, count); +} + +static int brw_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct osc_brw_async_args *aa = args; + struct osc_extent *ext; + struct osc_extent *tmp; + struct client_obd *cli = aa->aa_cli; + unsigned long transferred = 0; + + ENTRY; + + rc = osc_brw_fini_request(req, rc); + CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); + + /* restore clear text pages */ + osc_release_bounce_pages(aa->aa_ppga, aa->aa_page_count); + + /* + * When server returns -EINPROGRESS, client should always retry + * regardless of the number of times the bulk was resent already. + */ + if (osc_recoverable_error(rc) && !req->rq_no_delay) { + if (req->rq_import_generation != + req->rq_import->imp_generation) { + CDEBUG(D_HA, "%s: resend cross eviction for object: " + ""DOSTID", rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } else if (rc == -EINPROGRESS || + client_should_resend(aa->aa_resends, aa->aa_cli)) { + rc = osc_brw_redo_request(req, aa, rc); + } else { + CERROR("%s: too many resent retries for object: " + "%llu:%llu, rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } + + if (rc == 0) + RETURN(0); + else if (rc == -EAGAIN || rc == -EINPROGRESS) + rc = -EIO; + } + + if (rc == 0) { + struct obdo *oa = aa->aa_oa; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned long valid = 0; + struct cl_object *obj; + struct osc_async_page *last; + + last = brw_page2oap(aa->aa_ppga[aa->aa_page_count - 1]); + obj = osc2cl(last->oap_obj); + + cl_object_attr_lock(obj); + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + valid |= CAT_BLOCKS; + } + if (oa->o_valid & OBD_MD_FLMTIME) { + attr->cat_mtime = oa->o_mtime; + valid |= CAT_MTIME; + } + if (oa->o_valid & OBD_MD_FLATIME) { + attr->cat_atime = oa->o_atime; + valid |= CAT_ATIME; + } + if (oa->o_valid & OBD_MD_FLCTIME) { + attr->cat_ctime = oa->o_ctime; + valid |= CAT_CTIME; + } + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + loff_t last_off = last->oap_count + last->oap_obj_off + + last->oap_page_off; + + /* Change file size if this is an out of quota or + * direct IO write and it extends the file size */ + if (loi->loi_lvb.lvb_size < last_off) { + attr->cat_size = last_off; + valid |= CAT_SIZE; + } + /* Extend KMS if it's not a lockless write */ + if (loi->loi_kms < last_off && + oap2osc_page(last)->ops_srvlock == 0) { + attr->cat_kms = last_off; + valid |= CAT_KMS; + } + } + + if (valid != 0) + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + } + OBD_SLAB_FREE_PTR(aa->aa_oa, osc_obdo_kmem); + aa->aa_oa = NULL; + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0) + osc_inc_unstable_pages(req); + + list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 
1, + rc && req->rq_no_delay ? -EAGAIN : rc); + } + LASSERT(list_empty(&aa->aa_exts)); + LASSERT(list_empty(&aa->aa_oaps)); + + transferred = (req->rq_bulk == NULL ? /* short io */ + aa->aa_requested_nob : + req->rq_bulk->bd_nob_transferred); + + osc_release_ppga(aa->aa_ppga, aa->aa_page_count); + ptlrpc_lprocfs_brw(req, transferred); + + spin_lock(&cli->cl_loi_list_lock); + /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters + * is called so we know whether to go to sync BRWs or wait for more + * RPCs to complete */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) + cli->cl_w_in_flight--; + else + cli->cl_r_in_flight--; + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + + osc_io_unplug(env, cli, NULL); + RETURN(rc); +} + +static void brw_commit(struct ptlrpc_request *req) +{ + /* If osc_inc_unstable_pages (via osc_extent_finish) races with + * this called via the rq_commit_cb, I need to ensure + * osc_dec_unstable_pages is still called. Otherwise unstable + * pages may be leaked. */ + spin_lock(&req->rq_lock); + if (likely(req->rq_unstable)) { + req->rq_unstable = 0; + spin_unlock(&req->rq_lock); + + osc_dec_unstable_pages(req); + } else { + req->rq_committed = 1; + spin_unlock(&req->rq_lock); + } +} + +/** + * Build an RPC by the list of extent @ext_list. The caller must ensure + * that the total pages in this list are NOT over max pages per RPC. + * Extents in the list must be in OES_RPC state. + */ +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + struct list_head *ext_list, int cmd) +{ + struct ptlrpc_request *req = NULL; + struct osc_extent *ext; + struct brw_page **pga = NULL; + struct osc_brw_async_args *aa = NULL; + struct obdo *oa = NULL; + struct osc_async_page *oap; + struct osc_object *obj = NULL; + struct cl_req_attr *crattr = NULL; + loff_t starting_offset = OBD_OBJECT_EOF; + loff_t ending_offset = 0; + /* '1' for consistency with code that checks !mpflag to restore */ + int mpflag = 1; + int mem_tight = 0; + int page_count = 0; + bool soft_sync = false; + bool ndelay = false; + int i; + int grant = 0; + int rc; + __u32 layout_version = 0; + LIST_HEAD(rpc_list); + struct ost_body *body; + ENTRY; + LASSERT(!list_empty(ext_list)); + + /* add pages into rpc_list to build BRW rpc */ + list_for_each_entry(ext, ext_list, oe_link) { + LASSERT(ext->oe_state == OES_RPC); + mem_tight |= ext->oe_memalloc; + grant += ext->oe_grants; + page_count += ext->oe_nr_pages; + layout_version = max(layout_version, ext->oe_layout_version); + if (obj == NULL) + obj = ext->oe_obj; + } + + soft_sync = osc_over_unstable_soft_limit(cli); + if (mem_tight) + mpflag = memalloc_noreclaim_save(); + + OBD_ALLOC_PTR_ARRAY_LARGE(pga, page_count); + if (pga == NULL) + GOTO(out, rc = -ENOMEM); + + OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS); + if (oa == NULL) + GOTO(out, rc = -ENOMEM); + + i = 0; + list_for_each_entry(ext, ext_list, oe_link) { + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + if (mem_tight) + oap->oap_brw_flags |= OBD_BRW_MEMALLOC; + if (soft_sync) + oap->oap_brw_flags |= OBD_BRW_SOFT_SYNC; + pga[i] = &oap->oap_brw_page; + pga[i]->off = oap->oap_obj_off + oap->oap_page_off; + i++; + + list_add_tail(&oap->oap_rpc_item, &rpc_list); + if (starting_offset == OBD_OBJECT_EOF || + starting_offset > oap->oap_obj_off) + starting_offset = oap->oap_obj_off; + else + LASSERT(oap->oap_page_off == 0); + if (ending_offset < oap->oap_obj_off + oap->oap_count) + ending_offset = oap->oap_obj_off + + oap->oap_count; + else + 
LASSERT(oap->oap_page_off + oap->oap_count == + PAGE_SIZE); + } + if (ext->oe_ndelay) + ndelay = true; + } + + /* first page in the list */ + oap = list_first_entry(&rpc_list, typeof(*oap), oap_rpc_item); + + crattr = &osc_env_info(env)->oti_req_attr; + memset(crattr, 0, sizeof(*crattr)); + crattr->cra_type = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ; + crattr->cra_flags = ~0ULL; + crattr->cra_page = oap2cl_page(oap); + crattr->cra_oa = oa; + cl_req_attr_set(env, osc2cl(obj), crattr); + + if (cmd == OBD_BRW_WRITE) { + oa->o_grant_used = grant; + if (layout_version > 0) { + CDEBUG(D_LAYOUT, DFID": write with layout version %u\n", + PFID(&oa->o_oi.oi_fid), layout_version); + + oa->o_layout_version = layout_version; + oa->o_valid |= OBD_MD_LAYOUT_VERSION; + } + } + + sort_brw_pages(pga, page_count); + rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 0); + if (rc != 0) { + CERROR("prep_req failed: %d\n", rc); + GOTO(out, rc); + } + + req->rq_commit_cb = brw_commit; + req->rq_interpret_reply = brw_interpret; + req->rq_memalloc = mem_tight != 0; + oap->oap_request = ptlrpc_request_addref(req); + if (ndelay) { + req->rq_no_resend = req->rq_no_delay = 1; + /* probably set a shorter timeout value. + * to handle ETIMEDOUT in brw_interpret() correctly. */ + /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */ + } + + /* Need to update the timestamps after the request is built in case + * we race with setattr (locally or in queue at OST). If OST gets + * later setattr before earlier BRW (as determined by the request xid), + * the OST will not use BRW timestamps. Sadly, there is no obvious + * way to do this in a single call. bug 10150 */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + crattr->cra_oa = &body->oa; + crattr->cra_flags = OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME; + cl_req_attr_set(env, osc2cl(obj), crattr); + lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid); + + aa = ptlrpc_req_async_args(aa, req); + INIT_LIST_HEAD(&aa->aa_oaps); + list_splice_init(&rpc_list, &aa->aa_oaps); + INIT_LIST_HEAD(&aa->aa_exts); + list_splice_init(ext_list, &aa->aa_exts); + + spin_lock(&cli->cl_loi_list_lock); + starting_offset >>= PAGE_SHIFT; + if (cmd == OBD_BRW_READ) { + cli->cl_r_in_flight++; + lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count); + lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight); + lprocfs_oh_tally_log2(&cli->cl_read_offset_hist, + starting_offset + 1); + } else { + cli->cl_w_in_flight++; + lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count); + lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight); + lprocfs_oh_tally_log2(&cli->cl_write_offset_hist, + starting_offset + 1); + } + spin_unlock(&cli->cl_loi_list_lock); + + DEBUG_REQ(D_INODE, req, "%d pages, aa %p, now %ur/%uw in flight", + page_count, aa, cli->cl_r_in_flight, + cli->cl_w_in_flight); + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, cfs_fail_val); + + ptlrpcd_add_req(req); + rc = 0; + EXIT; + +out: + if (mem_tight) + memalloc_noreclaim_restore(mpflag); + + if (rc != 0) { + LASSERT(req == NULL); + + if (oa) + OBD_SLAB_FREE_PTR(oa, osc_obdo_kmem); + if (pga) { + osc_release_bounce_pages(pga, page_count); + osc_release_ppga(pga, page_count); + } + /* this should happen rarely and is pretty bad, it makes the + * pending list not follow the dirty order + */ + while ((ext = list_first_entry_or_null(ext_list, + struct osc_extent, + oe_link)) != NULL) { + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 0, rc); + } + } + RETURN(rc); +} + +/* This 
is to refresh our lock in face of no RPCs. */ +void osc_send_empty_rpc(struct osc_object *osc, pgoff_t start) +{ + struct ptlrpc_request *req; + struct obdo oa; + struct brw_page bpg = { .off = start, .count = 1}; + struct brw_page *pga = &bpg; + int rc; + + memset(&oa, 0, sizeof(oa)); + oa.o_oi = osc->oo_oinfo->loi_oi; + oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLFLAGS; + /* For updated servers - don't do a read */ + oa.o_flags = OBD_FL_NORPC; + + rc = osc_brw_prep_request(OBD_BRW_READ, osc_cli(osc), &oa, 1, &pga, + &req, 0); + + /* If we succeeded we ship it off, if not there's no point in doing + * anything. Also no resends. + * No interpret callback, no commit callback. + */ + if (!rc) { + req->rq_no_resend = 1; + ptlrpcd_add_req(req); + } +} + +static int osc_set_lock_data(struct ldlm_lock *lock, void *data) +{ + int set = 0; + + LASSERT(lock != NULL); + + lock_res_and_lock(lock); + + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + set = 1; + + unlock_res_and_lock(lock); + + return set; +} + +int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall, + void *cookie, struct lustre_handle *lockh, + enum ldlm_mode mode, __u64 *flags, bool speculative, + int errcode) +{ + bool intent = *flags & LDLM_FL_HAS_INTENT; + int rc; + ENTRY; + + /* The request was created before ldlm_cli_enqueue call. */ + if (intent && errcode == ELDLM_LOCK_ABORTED) { + struct ldlm_reply *rep; + + rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(rep != NULL); + + rep->lock_policy_res1 = + ptlrpc_status_ntoh(rep->lock_policy_res1); + if (rep->lock_policy_res1) + errcode = rep->lock_policy_res1; + if (!speculative) + *flags |= LDLM_FL_LVB_READY; + } else if (errcode == ELDLM_OK) { + *flags |= LDLM_FL_LVB_READY; + } + + /* Call the update callback. */ + rc = (*upcall)(cookie, lockh, errcode); + + /* release the reference taken in ldlm_cli_enqueue() */ + if (errcode == ELDLM_LOCK_MATCHED) + errcode = ELDLM_OK; + if (errcode == ELDLM_OK && lustre_handle_is_used(lockh)) + ldlm_lock_decref(lockh, mode); + + RETURN(rc); +} + +int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *args, int rc) +{ + struct osc_enqueue_args *aa = args; + struct ldlm_lock *lock; + struct lustre_handle *lockh = &aa->oa_lockh; + enum ldlm_mode mode = aa->oa_mode; + struct ost_lvb *lvb = aa->oa_lvb; + __u32 lvb_len = sizeof(*lvb); + __u64 flags = 0; + struct ldlm_enqueue_info einfo = { + .ei_type = aa->oa_type, + .ei_mode = mode, + }; + + ENTRY; + + /* ldlm_cli_enqueue is holding a reference on the lock, so it must + * be valid. */ + lock = ldlm_handle2lock(lockh); + LASSERTF(lock != NULL, + "lockh %#llx, req %p, aa %p - client evicted?\n", + lockh->cookie, req, aa); + + /* Take an additional reference so that a blocking AST that + * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed + * to arrive after an upcall has been executed by + * osc_enqueue_fini(). */ + ldlm_lock_addref(lockh, mode); + + /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2); + + /* Let CP AST to grant the lock first. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); + + if (aa->oa_speculative) { + LASSERT(aa->oa_lvb == NULL); + LASSERT(aa->oa_flags == NULL); + aa->oa_flags = &flags; + } + + /* Complete obtaining the lock procedure. 
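[Editor aside, not part of the patch] The additional reference taken above is the usual "pin the object across a callback" pattern: the interpret routine keeps its own reference so the lock stays valid until both the enqueue completion and the upcall have run, even if the upcall drops the caller's reference. A toy refcount version, hypothetical names only:

#include <stdio.h>

struct toy_lock {
	int refs;
	int freed;
};

static void toy_get(struct toy_lock *l) { l->refs++; }

static void toy_put(struct toy_lock *l)
{
	if (--l->refs == 0)
		l->freed = 1;  /* stand-in for the final LDLM_LOCK_PUT() */
}

static void toy_interpret(struct toy_lock *l,
			  void (*upcall)(struct toy_lock *))
{
	toy_get(l);   /* pin: the upcall may drop the enqueue reference */
	upcall(l);
	toy_put(l);   /* the lock is guaranteed valid up to here */
}

static void sample_upcall(struct toy_lock *l)
{
	toy_put(l);   /* upcall releases the enqueue reference */
}

int main(void)
{
	struct toy_lock l = { .refs = 1, .freed = 0 };

	toy_interpret(&l, sample_upcall);
	printf("refs=%d freed=%d\n", l.refs, l.freed); /* refs=0 freed=1 */
	return 0;
}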
*/ + rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, &einfo, 1, aa->oa_flags, + lvb, lvb_len, lockh, rc, false); + /* Complete osc stuff. */ + rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode, + aa->oa_flags, aa->oa_speculative, rc); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); + + ldlm_lock_decref(lockh, mode); + LDLM_LOCK_PUT(lock); + RETURN(rc); +} + +/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock + * from the 2nd OSC before a lock from the 1st one. This does not deadlock with + * other synchronous requests, however keeping some locks and trying to obtain + * others may take a considerable amount of time in a case of ost failure; and + * when other sync requests do not get released lock from a client, the client + * is evicted from the cluster -- such scenarious make the life difficult, so + * release locks just after they are obtained. */ +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u64 *flags, union ldlm_policy_data *policy, + struct ost_lvb *lvb, osc_enqueue_upcall_f upcall, + void *cookie, struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset, int async, + bool speculative) +{ + struct obd_device *obd = exp->exp_obd; + struct lustre_handle lockh = { 0 }; + struct ptlrpc_request *req = NULL; + int intent = *flags & LDLM_FL_HAS_INTENT; + __u64 match_flags = *flags; + enum ldlm_mode mode; + int rc; + ENTRY; + + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother. */ + policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK; + policy->l_extent.end |= ~PAGE_MASK; + + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. + * + * There are problems with conversion deadlocks, so instead of + * converting a read lock to a write lock, we'll just enqueue a new + * one. + * + * At some point we should cancel the read lock instead of making them + * send us a blocking callback, but there are problems with canceling + * locks out from other users right now, too. */ + mode = einfo->ei_mode; + if (einfo->ei_mode == LCK_PR) + mode |= LCK_PW; + /* Normal lock requests must wait for the LVB to be ready before + * matching a lock; speculative lock requests do not need to, + * because they will not actually use the lock. */ + if (!speculative) + match_flags |= LDLM_FL_LVB_READY; + if (intent != 0) + match_flags |= LDLM_FL_BLOCK_GRANTED; + mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id, + einfo->ei_type, policy, mode, &lockh); + if (mode) { + struct ldlm_lock *matched; + + if (*flags & LDLM_FL_TEST_LOCK) + RETURN(ELDLM_OK); + + matched = ldlm_handle2lock(&lockh); + if (speculative) { + /* This DLM lock request is speculative, and does not + * have an associated IO request. Therefore if there + * is already a DLM lock, it wll just inform the + * caller to cancel the request for this stripe.*/ + lock_res_and_lock(matched); + if (ldlm_extent_equal(&policy->l_extent, + &matched->l_policy_data.l_extent)) + rc = -EEXIST; + else + rc = -ECANCELED; + unlock_res_and_lock(matched); + + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + RETURN(rc); + } else if (osc_set_lock_data(matched, einfo->ei_cbdata)) { + *flags |= LDLM_FL_LVB_READY; + + /* We already have a lock, and it's referenced. 
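[Editor aside, not part of the patch] The extent rounding near the top of osc_enqueue_base() above (and repeated in osc_match_base() below) pulls the start back to its page boundary and pushes the end to the last byte of its page, so DLM extents always cover whole pages. The arithmetic with an assumed 4 KiB page mask:

#include <stdio.h>

#define TOY_PAGE_MASK (~0xfffULL)  /* assume 4 KiB pages */

struct toy_extent {
	unsigned long long start;
	unsigned long long end;
};

static void toy_align_extent(struct toy_extent *e)
{
	e->start -= e->start & ~TOY_PAGE_MASK; /* round start down */
	e->end |= ~TOY_PAGE_MASK;              /* last byte of end's page */
}

int main(void)
{
	struct toy_extent e = { .start = 5000, .end = 9000 };

	toy_align_extent(&e);
	printf("[%llu, %llu]\n", e.start, e.end); /* [4096, 12287] */
	return 0;
}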
*/ + (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED); + + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + RETURN(ELDLM_OK); + } else { + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + } + } + + if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK)) + RETURN(-ENOLCK); + + /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */ + *flags &= ~LDLM_FL_BLOCK_GRANTED; + + rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb, + sizeof(*lvb), LVB_T_OST, &lockh, async); + if (async) { + if (!rc) { + struct osc_enqueue_args *aa; + aa = ptlrpc_req_async_args(aa, req); + aa->oa_exp = exp; + aa->oa_mode = einfo->ei_mode; + aa->oa_type = einfo->ei_type; + lustre_handle_copy(&aa->oa_lockh, &lockh); + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_speculative = speculative; + if (!speculative) { + aa->oa_flags = flags; + aa->oa_lvb = lvb; + } else { + /* speculative locks are essentially to enqueue + * a DLM lock in advance, so we don't care + * about the result of the enqueue. */ + aa->oa_lvb = NULL; + aa->oa_flags = NULL; + } + + req->rq_interpret_reply = osc_enqueue_interpret; + ptlrpc_set_add_req(rqset, req); + } + RETURN(rc); + } + + rc = osc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode, + flags, speculative, rc); + + RETURN(rc); +} + +int osc_match_base(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, + struct lustre_handle *lockh, enum ldlm_match_flags match_flags) +{ + struct obd_device *obd = exp->exp_obd; + __u64 lflags = *flags; + enum ldlm_mode rc; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH)) + RETURN(-EIO); + + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother */ + policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK; + policy->l_extent.end |= ~PAGE_MASK; + + /* Next, search for already existing extent locks that will cover us */ + rc = ldlm_lock_match_with_skip(obd->obd_namespace, lflags, 0, + res_id, type, policy, mode, lockh, + match_flags); + if (rc == 0 || lflags & LDLM_FL_TEST_LOCK) + RETURN(rc); + + if (obj != NULL) { + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + + LASSERT(lock != NULL); + if (osc_set_lock_data(lock, obj)) { + lock_res_and_lock(lock); + if (!ldlm_is_lvb_cached(lock)) { + LASSERT(lock->l_ast_data == obj); + osc_lock_lvb_update(env, obj, lock, NULL); + ldlm_set_lvb_cached(lock); + } + unlock_res_and_lock(lock); + } else { + ldlm_lock_decref(lockh, rc); + rc = 0; + } + LDLM_LOCK_PUT(lock); + } + RETURN(rc); +} + +static int osc_statfs_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct osc_async_args *aa = args; + struct obd_statfs *msfs; + + ENTRY; + if (rc == -EBADR) + /* + * The request has in fact never been sent due to issues at + * a higher level (LOV). Exit immediately since the caller + * is aware of the problem and takes care of the clean up. 
+ */ + RETURN(rc); + + if ((rc == -ENOTCONN || rc == -EAGAIN) && + (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY)) + GOTO(out, rc = 0); + + if (rc != 0) + GOTO(out, rc); + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) + GOTO(out, rc = -EPROTO); + + *aa->aa_oi->oi_osfs = *msfs; +out: + rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); + + RETURN(rc); +} + +static int osc_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, time64_t max_age, + struct ptlrpc_request_set *rqset) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct osc_async_args *aa; + int rc; + ENTRY; + + if (obd->obd_osfs_age >= max_age) { + CDEBUG(D_SUPER, + "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); + spin_unlock(&obd->obd_osfs_lock); + oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; + if (oinfo->oi_cb_up) + oinfo->oi_cb_up(oinfo, 0); + + RETURN(0); + } + + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. */ + req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (oinfo->oi_flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + req->rq_interpret_reply = osc_statfs_interpret; + aa = ptlrpc_req_async_args(aa, req); + aa->aa_oi = oinfo; + + ptlrpc_set_add_req(rqset, req); + RETURN(0); +} + +static int osc_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct obd_statfs *msfs; + struct ptlrpc_request *req; + struct obd_import *imp, *imp0; + int rc; + ENTRY; + + /*Since the request might also come from lprocfs, so we need + *sync this with client_disconnect_export Bug15684 + */ + with_imp_locked(obd, imp0, rc) + imp = class_import_get(imp0); + if (rc) + RETURN(rc); + + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. 
*/ + req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); + + class_import_put(imp); + + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) + GOTO(out, rc = -EPROTO); + + *osfs = *msfs; + + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct obd_ioctl_data *data = karg; + int rc = 0; + + ENTRY; + if (!try_module_get(THIS_MODULE)) { + CERROR("%s: cannot get module '%s'\n", obd->obd_name, + module_name(THIS_MODULE)); + return -EINVAL; + } + switch (cmd) { + case OBD_IOC_CLIENT_RECOVER: + rc = ptlrpc_recover_import(obd->u.cli.cl_import, + data->ioc_inlbuf1, 0); + if (rc > 0) + rc = 0; + break; + case OBD_IOC_GETATTR: + rc = obd_getattr(NULL, exp, &data->ioc_obdo1); + break; + case IOC_OSC_SET_ACTIVE: + rc = ptlrpc_set_import_active(obd->u.cli.cl_import, + data->ioc_offset); + break; + default: + rc = -ENOTTY; + CDEBUG(D_INODE, "%s: unrecognised ioctl %#x by %s: rc = %d\n", + obd->obd_name, cmd, current->comm, rc); + break; + } + + module_put(THIS_MODULE); + return rc; +} + +int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + struct obd_device *obd = exp->exp_obd; + struct obd_import *imp = class_exp2cliimp(exp); + char *tmp; + int rc; + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10); + + if (KEY_IS(KEY_CHECKSUM)) { + if (vallen != sizeof(int)) + RETURN(-EINVAL); + exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0; + RETURN(0); + } + + if (KEY_IS(KEY_SPTLRPC_CONF)) { + sptlrpc_conf_client_adapt(obd); + RETURN(0); + } + + if (KEY_IS(KEY_FLUSH_CTX)) { + sptlrpc_import_flush_my_ctx(imp); + RETURN(0); + } + + if (KEY_IS(KEY_CACHE_LRU_SHRINK)) { + struct client_obd *cli = &obd->u.cli; + long nr = atomic_long_read(&cli->cl_lru_in_list) >> 1; + long target = *(long *)val; + + nr = osc_lru_shrink(env, cli, min(nr, target), true); + *(long *)val -= nr; + RETURN(0); + } + + if (!set && !KEY_IS(KEY_GRANT_SHRINK)) + RETURN(-EINVAL); + + /* We pass all other commands directly to OST. Since nobody calls osc + methods directly and everybody is supposed to go through LOV, we + assume lov checked invalid values for us. + The only recognised values so far are evict_by_nid and mds_conn. + Even if something bad goes through, we'd get a -EINVAL from OST + anyway. */ + + req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ? 
+ &RQF_OST_SET_GRANT_INFO : + &RQF_OBD_SET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + if (!KEY_IS(KEY_GRANT_SHRINK)) + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT, vallen); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ? + &RMF_OST_BODY : + &RMF_SETINFO_VAL); + memcpy(tmp, val, vallen); + + if (KEY_IS(KEY_GRANT_SHRINK)) { + struct osc_grant_args *aa; + struct obdo *oa; + + aa = ptlrpc_req_async_args(aa, req); + OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS); + if (!oa) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + *oa = ((struct ost_body *)val)->oa; + aa->aa_oa = oa; + req->rq_interpret_reply = osc_shrink_grant_interpret; + } + + ptlrpc_request_set_replen(req); + if (!KEY_IS(KEY_GRANT_SHRINK)) { + LASSERT(set != NULL); + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + } else { + ptlrpcd_add_req(req); + } + + RETURN(0); +} +EXPORT_SYMBOL(osc_set_info_async); + +int osc_reconnect(const struct lu_env *env, struct obd_export *exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + struct client_obd *cli = &obd->u.cli; + + if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { + long lost_grant; + long grant; + + spin_lock(&cli->cl_loi_list_lock); + grant = cli->cl_avail_grant + cli->cl_reserved_grant; + if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM) { + /* restore ocd_grant_blkbits as client page bits */ + data->ocd_grant_blkbits = PAGE_SHIFT; + grant += cli->cl_dirty_grant; + } else { + grant += cli->cl_dirty_pages << PAGE_SHIFT; + } + data->ocd_grant = grant ? : 2 * cli_brw_size(obd); + lost_grant = cli->cl_lost_grant; + cli->cl_lost_grant = 0; + spin_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d" + " ocd_grant: %d, lost: %ld.\n", data->ocd_connect_flags, + data->ocd_version, data->ocd_grant, lost_grant); + } + + RETURN(0); +} +EXPORT_SYMBOL(osc_reconnect); + +int osc_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + int rc; + + rc = client_disconnect_export(exp); + /** + * Initially we put del_shrink_grant before disconnect_export, but it + * causes the following problem if setup (connect) and cleanup + * (disconnect) are tangled together. + * connect p1 disconnect p2 + * ptlrpc_connect_import + * ............... class_manual_cleanup + * osc_disconnect + * del_shrink_grant + * ptlrpc_connect_interrupt + * osc_init_grant + * add this client to shrink list + * cleanup_osc + * Bang! grant shrink thread trigger the shrink. 
BUG18662 + */ + osc_del_grant_list(&obd->u.cli); + return rc; +} +EXPORT_SYMBOL(osc_disconnect); + +int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct lu_env *env = arg; + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + struct ldlm_lock *lock; + struct osc_object *osc = NULL; + ENTRY; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (lock->l_ast_data != NULL && osc == NULL) { + osc = lock->l_ast_data; + cl_object_get(osc2cl(osc)); + } + + /* clear LDLM_FL_CLEANED flag to make sure it will be canceled + * by the 2nd round of ldlm_namespace_clean() call in + * osc_import_event(). */ + ldlm_clear_cleaned(lock); + } + unlock_res(res); + + if (osc != NULL) { + osc_object_invalidate(env, osc); + cl_object_put(env, osc2cl(osc)); + } + + RETURN(0); +} +EXPORT_SYMBOL(osc_ldlm_resource_invalidate); + +static int osc_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + struct client_obd *cli; + int rc = 0; + + ENTRY; + LASSERT(imp->imp_obd == obd); + + switch (event) { + case IMP_EVENT_DISCON: { + cli = &obd->u.cli; + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = 0; + cli->cl_lost_grant = 0; + spin_unlock(&cli->cl_loi_list_lock); + break; + } + case IMP_EVENT_INACTIVE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE); + break; + } + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + struct lu_env *env; + __u16 refcheck; + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + osc_io_unplug(env, &obd->u.cli, NULL); + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + osc_ldlm_resource_invalidate, + env, 0); + cl_env_put(env, &refcheck); + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + } else + rc = PTR_ERR(env); + break; + } + case IMP_EVENT_ACTIVE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE); + break; + } + case IMP_EVENT_OCD: { + struct obd_connect_data *ocd = &imp->imp_connect_data; + + if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT) + osc_init_grant(&obd->u.cli, ocd); + + /* See bug 7198 */ + if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL) + imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL; + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD); + break; + } + case IMP_EVENT_DEACTIVATE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE); + break; + } + case IMP_EVENT_ACTIVATE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE); + break; + } + default: + CERROR("Unknown import event %d\n", event); + LBUG(); + } + RETURN(rc); +} + +/** + * Determine whether the lock can be canceled before replaying the lock + * during recovery, see bug16774 for detailed information. + * + * \retval zero the lock can't be canceled + * \retval other ok to cancel + */ +static int osc_cancel_weight(struct ldlm_lock *lock) +{ + /* + * Cancel all unused and granted extent lock. 
+ */ + if (lock->l_resource->lr_type == LDLM_EXTENT && + ldlm_is_granted(lock) && + osc_ldlm_weigh_ast(lock) == 0) + RETURN(1); + + RETURN(0); +} + +static int brw_queue_work(const struct lu_env *env, void *data) +{ + struct client_obd *cli = data; + + CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli); + + osc_io_unplug(env, cli, NULL); + RETURN(0); +} + +int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct client_obd *cli = &obd->u.cli; + void *handler; + int rc; + + ENTRY; + + rc = ptlrpcd_addref(); + if (rc) + RETURN(rc); + + rc = client_obd_setup(obd, lcfg); + if (rc) + GOTO(out_ptlrpcd, rc); + + + handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli); + if (IS_ERR(handler)) + GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler)); + cli->cl_writeback_work = handler; + + handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli); + if (IS_ERR(handler)) + GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler)); + cli->cl_lru_work = handler; + + rc = osc_quota_setup(obd); + if (rc) + GOTO(out_ptlrpcd_work, rc); + + cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL; + cli->cl_root_squash = 0; + osc_update_next_shrink(cli); + + RETURN(rc); + +out_ptlrpcd_work: + if (cli->cl_writeback_work != NULL) { + ptlrpcd_destroy_work(cli->cl_writeback_work); + cli->cl_writeback_work = NULL; + } + if (cli->cl_lru_work != NULL) { + ptlrpcd_destroy_work(cli->cl_lru_work); + cli->cl_lru_work = NULL; + } + client_obd_cleanup(obd); +out_ptlrpcd: + ptlrpcd_decref(); + RETURN(rc); +} +EXPORT_SYMBOL(osc_setup_common); + +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct client_obd *cli = &obd->u.cli; + int adding; + int added; + int req_count; + int rc; + + ENTRY; + + rc = osc_setup_common(obd, lcfg); + if (rc < 0) + RETURN(rc); + + rc = osc_tunables_init(obd); + if (rc) + RETURN(rc); + + /* + * We try to control the total number of requests with a upper limit + * osc_reqpool_maxreqcount. There might be some race which will cause + * over-limit allocation, but it is fine. 
+ */ + req_count = atomic_read(&osc_pool_req_count); + if (req_count < osc_reqpool_maxreqcount) { + adding = cli->cl_max_rpcs_in_flight + 2; + if (req_count + adding > osc_reqpool_maxreqcount) + adding = osc_reqpool_maxreqcount - req_count; + + added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding); + atomic_add(added, &osc_pool_req_count); + } + + ns_register_cancel(obd->obd_namespace, osc_cancel_weight); + + spin_lock(&osc_shrink_lock); + list_add_tail(&cli->cl_shrink_list, &osc_shrink_list); + spin_unlock(&osc_shrink_lock); + cli->cl_import->imp_idle_timeout = osc_idle_timeout; + cli->cl_import->imp_idle_debug = D_HA; + + RETURN(0); +} + +int osc_precleanup_common(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + ENTRY; + + /* LU-464 + * for echo client, export may be on zombie list, wait for + * zombie thread to cull it, because cli.cl_import will be + * cleared in client_disconnect_export(): + * class_export_destroy() -> obd_cleanup() -> + * echo_device_free() -> echo_client_cleanup() -> + * obd_disconnect() -> osc_disconnect() -> + * client_disconnect_export() + */ + obd_zombie_barrier(); + if (cli->cl_writeback_work) { + ptlrpcd_destroy_work(cli->cl_writeback_work); + cli->cl_writeback_work = NULL; + } + + if (cli->cl_lru_work) { + ptlrpcd_destroy_work(cli->cl_lru_work); + cli->cl_lru_work = NULL; + } + + obd_cleanup_client_import(obd); + RETURN(0); +} +EXPORT_SYMBOL(osc_precleanup_common); + +static int osc_precleanup(struct obd_device *obd) +{ + ENTRY; + + osc_precleanup_common(obd); + + ptlrpc_lprocfs_unregister_obd(obd); + RETURN(0); +} + +int osc_cleanup_common(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int rc; + + ENTRY; + + spin_lock(&osc_shrink_lock); + list_del(&cli->cl_shrink_list); + spin_unlock(&osc_shrink_lock); + + /* lru cleanup */ + if (cli->cl_cache != NULL) { + LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0); + spin_lock(&cli->cl_cache->ccc_lru_lock); + list_del_init(&cli->cl_lru_osc); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + cli->cl_lru_left = NULL; + cl_cache_decref(cli->cl_cache); + cli->cl_cache = NULL; + } + + /* free memory of osc quota cache */ + osc_quota_cleanup(obd); + + rc = client_obd_cleanup(obd); + + ptlrpcd_decref(); + RETURN(rc); +} +EXPORT_SYMBOL(osc_cleanup_common); + +static const struct obd_ops osc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = osc_setup, + .o_precleanup = osc_precleanup, + .o_cleanup = osc_cleanup_common, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_reconnect = osc_reconnect, + .o_disconnect = osc_disconnect, + .o_statfs = osc_statfs, + .o_statfs_async = osc_statfs_async, + .o_create = osc_create, + .o_destroy = osc_destroy, + .o_getattr = osc_getattr, + .o_setattr = osc_setattr, + .o_iocontrol = osc_iocontrol, + .o_set_info_async = osc_set_info_async, + .o_import_event = osc_import_event, + .o_quotactl = osc_quotactl, +}; + +LIST_HEAD(osc_shrink_list); +DEFINE_SPINLOCK(osc_shrink_lock); + +#ifdef HAVE_SHRINKER_COUNT +static struct shrinker osc_cache_shrinker = { + .count_objects = osc_cache_shrink_count, + .scan_objects = osc_cache_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; +#else +static int osc_cache_shrink(struct shrinker *shrinker, + struct shrink_control *sc) +{ + (void)osc_cache_shrink_scan(shrinker, sc); + + return osc_cache_shrink_count(shrinker, sc); +} + +static struct shrinker osc_cache_shrinker = { + .shrink = osc_cache_shrink, + .seeks = DEFAULT_SEEKS, +}; +#endif + +static int 
__init osc_init(void) +{ + unsigned int reqpool_size; + unsigned int reqsize; + int rc; + ENTRY; + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches); + + rc = lu_kmem_init(osc_caches); + if (rc) + RETURN(rc); + + rc = class_register_type(&osc_obd_ops, NULL, true, + LUSTRE_OSC_NAME, &osc_device_type); + if (rc) + GOTO(out_kmem, rc); + + rc = register_shrinker(&osc_cache_shrinker); + if (rc) + GOTO(out_type, rc); + + /* This is obviously too much memory, only prevent overflow here */ + if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0) + GOTO(out_shrinker, rc = -EINVAL); + + reqpool_size = osc_reqpool_mem_max << 20; + + reqsize = 1; + while (reqsize < OST_IO_MAXREQSIZE) + reqsize = reqsize << 1; + + /* + * We don't enlarge the request count in OSC pool according to + * cl_max_rpcs_in_flight. The allocation from the pool will only be + * tried after normal allocation failed. So a small OSC pool won't + * cause much performance degradation in most cases. + */ + osc_reqpool_maxreqcount = reqpool_size / reqsize; + + atomic_set(&osc_pool_req_count, 0); + osc_rq_pool = ptlrpc_init_rq_pool(0, OST_IO_MAXREQSIZE, + ptlrpc_add_rqs_to_pool); + + if (osc_rq_pool == NULL) + GOTO(out_shrinker, rc = -ENOMEM); + + rc = osc_start_grant_work(); + if (rc != 0) + GOTO(out_req_pool, rc); + + RETURN(rc); + +out_req_pool: + ptlrpc_free_rq_pool(osc_rq_pool); +out_shrinker: + unregister_shrinker(&osc_cache_shrinker); +out_type: + class_unregister_type(LUSTRE_OSC_NAME); +out_kmem: + lu_kmem_fini(osc_caches); + + RETURN(rc); +} + +static void __exit osc_exit(void) +{ + osc_stop_grant_work(); + unregister_shrinker(&osc_cache_shrinker); + class_unregister_type(LUSTRE_OSC_NAME); + lu_kmem_fini(osc_caches); + ptlrpc_free_rq_pool(osc_rq_pool); +} + +MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>
"); +MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(osc_init); +module_exit(osc_exit); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile b/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile new file mode 100644 index 0000000000000..2765abe6ee44c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile @@ -0,0 +1,25 @@ +obj-$(CONFIG_LUSTREFSX_FS) += ptlrpc.o + +LDLM := ../../lustre/ldlm/ +TARGET := ../../lustre/target/ + +ldlm_objs := $(LDLM)l_lock.o $(LDLM)ldlm_lock.o +ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o +ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o +ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o +ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o +ldlm_objs += $(LDLM)ldlm_pool.o $(LDLM)ldlm_reclaim.o + +ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o +ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o +ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o +ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o +ptlrpc_objs += sec.o sec_ctx.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o +ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o nrs_delay.o heap.o +ptlrpc_objs += errno.o + +ptlrpc-y := $(ldlm_objs) $(ptlrpc_objs) $(TARGET)barrier.o + +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/ldlm + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c new file mode 100644 index 0000000000000..efde01993f0ed --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c @@ -0,0 +1,3712 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +/** Implementation of client-side PortalRPC interfaces */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1); +} + +static void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0); +} + +static void ptlrpc_release_bulk_page_pin(struct ptlrpc_bulk_desc *desc) +{ + int i; + + for (i = 0; i < desc->bd_iov_count ; i++) + put_page(desc->bd_vec[i].bv_page); +} + +static int ptlrpc_prep_bulk_frag_pages(struct ptlrpc_bulk_desc *desc, + void *frag, int len) +{ + unsigned int offset = (unsigned long)frag & ~PAGE_MASK; + + ENTRY; + while (len > 0) { + int page_len = min_t(unsigned int, PAGE_SIZE - offset, + len); + unsigned long vaddr = (unsigned long)frag; + + ptlrpc_prep_bulk_page_nopin(desc, + lnet_kvaddr_to_page(vaddr), + offset, page_len); + offset = 0; + len -= page_len; + frag += page_len; + } + + RETURN(desc->bd_nob); +} + +const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_pin_ops = { + .add_kiov_frag = ptlrpc_prep_bulk_page_pin, + .release_frags = ptlrpc_release_bulk_page_pin, +}; +EXPORT_SYMBOL(ptlrpc_bulk_kiov_pin_ops); + +const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_nopin_ops = { + .add_kiov_frag = ptlrpc_prep_bulk_page_nopin, + .release_frags = ptlrpc_release_bulk_noop, + .add_iov_frag = ptlrpc_prep_bulk_frag_pages, +}; +EXPORT_SYMBOL(ptlrpc_bulk_kiov_nopin_ops); + +static int ptlrpc_send_new_req(struct ptlrpc_request *req); +static int ptlrpcd_check_work(struct ptlrpc_request *req); +static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async); + +/** + * Initialize passed in client structure \a cl. + */ +void ptlrpc_init_client(int req_portal, int rep_portal, const char *name, + struct ptlrpc_client *cl) +{ + cl->cli_request_portal = req_portal; + cl->cli_reply_portal = rep_portal; + cl->cli_name = name; +} +EXPORT_SYMBOL(ptlrpc_init_client); + +/** + * Return PortalRPC connection for remore uud \a uuid + */ +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid, + lnet_nid_t nid4refnet) +{ + struct ptlrpc_connection *c; + lnet_nid_t self; + struct lnet_process_id peer; + int err; + + /* + * ptlrpc_uuid_to_peer() initializes its 2nd parameter + * before accessing its values. + */ + /* coverity[uninit_use_in_call] */ + peer.nid = nid4refnet; + err = ptlrpc_uuid_to_peer(uuid, &peer, &self); + if (err != 0) { + CNETERR("cannot find peer %s!\n", uuid->uuid); + return NULL; + } + + c = ptlrpc_connection_get(peer, self, uuid); + if (c) { + memcpy(c->c_remote_uuid.uuid, + uuid->uuid, sizeof(c->c_remote_uuid.uuid)); + } + + CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c); + + return c; +} + +/** + * Allocate and initialize new bulk descriptor on the sender. + * Returns pointer to the descriptor or NULL on error. 
+ */ +struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned int nfrags, + unsigned int max_brw, + enum ptlrpc_bulk_op_type type, + unsigned int portal, + const struct ptlrpc_bulk_frag_ops *ops) +{ + struct ptlrpc_bulk_desc *desc; + int i; + + LASSERT(ops->add_kiov_frag != NULL); + + if (max_brw > PTLRPC_BULK_OPS_COUNT) + RETURN(NULL); + + if (nfrags > LNET_MAX_IOV * max_brw) + RETURN(NULL); + + OBD_ALLOC_PTR(desc); + if (!desc) + return NULL; + + OBD_ALLOC_LARGE(desc->bd_vec, + nfrags * sizeof(*desc->bd_vec)); + if (!desc->bd_vec) + goto out; + + spin_lock_init(&desc->bd_lock); + init_waitqueue_head(&desc->bd_waitq); + desc->bd_max_iov = nfrags; + desc->bd_iov_count = 0; + desc->bd_portal = portal; + desc->bd_type = type; + desc->bd_md_count = 0; + desc->bd_nob_last = LNET_MTU; + desc->bd_frag_ops = ops; + LASSERT(max_brw > 0); + desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT); + /* + * PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this + * node. Negotiated ocd_brw_size will always be <= this number. + */ + for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++) + LNetInvalidateMDHandle(&desc->bd_mds[i]); + + return desc; +out: + OBD_FREE_PTR(desc); + return NULL; +} + +/** + * Prepare bulk descriptor for specified outgoing request \a req that + * can fit \a nfrags * pages. \a type is bulk type. \a portal is where + * the bulk to be sent. Used on client-side. + * Returns pointer to newly allocatrd initialized bulk descriptor or NULL on + * error. + */ +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + unsigned int nfrags, + unsigned int max_brw, + unsigned int type, + unsigned int portal, + const struct ptlrpc_bulk_frag_ops + *ops) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_bulk_desc *desc; + + ENTRY; + LASSERT(ptlrpc_is_bulk_op_passive(type)); + + desc = ptlrpc_new_bulk(nfrags, max_brw, type, portal, ops); + if (!desc) + RETURN(NULL); + + desc->bd_import = class_import_get(imp); + desc->bd_req = req; + + desc->bd_cbid.cbid_fn = client_bulk_callback; + desc->bd_cbid.cbid_arg = desc; + + /* This makes req own desc, and free it when she frees herself */ + req->rq_bulk = desc; + + return desc; +} +EXPORT_SYMBOL(ptlrpc_prep_bulk_imp); + +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len, + int pin) +{ + struct bio_vec *kiov; + + LASSERT(desc->bd_iov_count < desc->bd_max_iov); + LASSERT(page != NULL); + LASSERT(pageoffset >= 0); + LASSERT(len > 0); + LASSERT(pageoffset + len <= PAGE_SIZE); + + kiov = &desc->bd_vec[desc->bd_iov_count]; + + if (((desc->bd_iov_count % LNET_MAX_IOV) == 0) || + ((desc->bd_nob_last + len) > LNET_MTU)) { + desc->bd_mds_off[desc->bd_md_count] = desc->bd_iov_count; + desc->bd_md_count++; + desc->bd_nob_last = 0; + LASSERT(desc->bd_md_count <= PTLRPC_BULK_OPS_COUNT); + } + + desc->bd_nob_last += len; + desc->bd_nob += len; + + if (pin) + get_page(page); + + kiov->bv_page = page; + kiov->bv_offset = pageoffset; + kiov->bv_len = len; + + desc->bd_iov_count++; +} +EXPORT_SYMBOL(__ptlrpc_prep_bulk_page); + +void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) +{ + ENTRY; + + LASSERT(desc != NULL); + LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */ + LASSERT(desc->bd_refs == 0); /* network hands off */ + LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); + LASSERT(desc->bd_frag_ops != NULL); + + sptlrpc_enc_pool_put_pages(desc); + + if (desc->bd_export) + class_export_put(desc->bd_export); + else + class_import_put(desc->bd_import); + + if 
(desc->bd_frag_ops->release_frags != NULL) + desc->bd_frag_ops->release_frags(desc); + + OBD_FREE_LARGE(desc->bd_vec, + desc->bd_max_iov * sizeof(*desc->bd_vec)); + OBD_FREE_PTR(desc); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_free_bulk); + +/** + * Set server timelimit for this req, i.e. how long are we willing to wait + * for reply before timing out this request. + */ +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) +{ + LASSERT(req->rq_import); + + if (AT_OFF) { + /* non-AT settings */ + /** + * \a imp_server_timeout means this is reverse import and + * we send (currently only) ASTs to the client and cannot afford + * to wait too long for the reply, otherwise the other client + * (because of which we are sending this request) would + * timeout waiting for us + */ + req->rq_timeout = req->rq_import->imp_server_timeout ? + obd_timeout / 2 : obd_timeout; + } else { + struct imp_at *at = &req->rq_import->imp_at; + timeout_t serv_est; + int idx; + + idx = import_at_get_index(req->rq_import, + req->rq_request_portal); + serv_est = at_get(&at->iat_service_estimate[idx]); + /* + * Currently a 32 bit value is sent over the + * wire for rq_timeout so please don't change this + * to time64_t. The work for LU-1158 will in time + * replace rq_timeout with a 64 bit nanosecond value + */ + req->rq_timeout = at_est2timeout(serv_est); + } + /* + * We could get even fancier here, using history to predict increased + * loading... + * + * Let the server know what this RPC timeout is by putting it in the + * reqmsg + */ + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); +} +EXPORT_SYMBOL(ptlrpc_at_set_req_timeout); + +/* Adjust max service estimate based on server value */ +static void ptlrpc_at_adj_service(struct ptlrpc_request *req, + timeout_t serv_est) +{ + int idx; + timeout_t oldse; + struct imp_at *at; + + LASSERT(req->rq_import); + at = &req->rq_import->imp_at; + + idx = import_at_get_index(req->rq_import, req->rq_request_portal); + /* + * max service estimates are tracked on the server side, + * so just keep minimal history here + */ + oldse = at_measured(&at->iat_service_estimate[idx], serv_est); + if (oldse != 0) + CDEBUG(D_ADAPTTO, + "The RPC service estimate for %s ptl %d has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name, + req->rq_request_portal, + oldse, at_get(&at->iat_service_estimate[idx])); +} + +/* Expected network latency per remote node (secs) */ +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) +{ + return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency); +} + +/* Adjust expected network latency */ +void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, + timeout_t service_timeout) +{ + time64_t now = ktime_get_real_seconds(); + struct imp_at *at; + timeout_t oldnl; + timeout_t nl; + + LASSERT(req->rq_import); + + if (service_timeout > now - req->rq_sent + 3) { + /* + * b=16408, however, this can also happen if early reply + * is lost and client RPC is expired and resent, early reply + * or reply of original RPC can still be fit in reply buffer + * of resent RPC, now client is measuring time from the + * resent time, but server sent back service time of original + * RPC. + */ + CDEBUG_LIMIT((lustre_msg_get_flags(req->rq_reqmsg) & + MSG_RESENT) ? 
D_ADAPTTO : D_WARNING, + "Reported service time %u > total measured time %lld\n", + service_timeout, now - req->rq_sent); + return; + } + + /* Network latency is total time less server processing time, + * st rounding + */ + nl = max_t(timeout_t, now - req->rq_sent - service_timeout, 0) + 1; + at = &req->rq_import->imp_at; + + oldnl = at_measured(&at->iat_net_latency, nl); + if (oldnl != 0) + CDEBUG(D_ADAPTTO, + "The network latency for %s (nid %s) has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name, + obd_uuid2str(&req->rq_import->imp_connection->c_remote_uuid), + oldnl, at_get(&at->iat_net_latency)); +} + +static int unpack_reply(struct ptlrpc_request *req) +{ + int rc; + + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { + rc = ptlrpc_unpack_rep_msg(req, req->rq_replen); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack_rep failed: rc = %d", + rc); + return -EPROTO; + } + } + + rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: rc = %d", + rc); + return -EPROTO; + } + return 0; +} + +/** + * Handle an early reply message, called with the rq_lock held. + * If anything goes wrong just ignore it - same as if it never happened + */ +static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) +__must_hold(&req->rq_lock) +{ + struct ptlrpc_request *early_req; + timeout_t service_timeout; + time64_t olddl; + int rc; + + ENTRY; + req->rq_early = 0; + spin_unlock(&req->rq_lock); + + rc = sptlrpc_cli_unwrap_early_reply(req, &early_req); + if (rc) { + spin_lock(&req->rq_lock); + RETURN(rc); + } + + rc = unpack_reply(early_req); + if (rc != 0) { + sptlrpc_cli_finish_early_reply(early_req); + spin_lock(&req->rq_lock); + RETURN(rc); + } + + /* + * Use new timeout value just to adjust the local value for this + * request, don't include it into at_history. It is unclear yet why + * service time increased and should it be counted or skipped, e.g. + * that can be recovery case or some error or server, the real reply + * will add all new data if it is worth to add. + */ + req->rq_timeout = lustre_msg_get_timeout(early_req->rq_repmsg); + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); + + /* Network latency can be adjusted, it is pure network delays */ + service_timeout = lustre_msg_get_service_timeout(early_req->rq_repmsg); + ptlrpc_at_adj_net_latency(req, service_timeout); + + sptlrpc_cli_finish_early_reply(early_req); + + spin_lock(&req->rq_lock); + olddl = req->rq_deadline; + /* + * server assumes it now has rq_timeout from when the request + * arrived, so the client should give it at least that long. + * since we don't know the arrival time we'll use the original + * sent time + */ + req->rq_deadline = req->rq_sent + req->rq_timeout + + ptlrpc_at_get_net_latency(req); + + /* The below message is checked in replay-single.sh test_65{a,b} */ + /* The below message is checked in sanity-{gss,krb5} test_8 */ + DEBUG_REQ(D_ADAPTTO, req, + "Early reply #%d, new deadline in %llds (%llds)", + req->rq_early_count, + req->rq_deadline - ktime_get_real_seconds(), + req->rq_deadline - olddl); + + RETURN(rc); +} + +static struct kmem_cache *request_cache; + +int ptlrpc_request_cache_init(void) +{ + request_cache = kmem_cache_create("ptlrpc_cache", + sizeof(struct ptlrpc_request), + 0, SLAB_HWCACHE_ALIGN, NULL); + return request_cache ? 
0 : -ENOMEM; +} + +void ptlrpc_request_cache_fini(void) +{ + kmem_cache_destroy(request_cache); +} + +struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags) +{ + struct ptlrpc_request *req; + + OBD_SLAB_ALLOC_PTR_GFP(req, request_cache, flags); + return req; +} + +void ptlrpc_request_cache_free(struct ptlrpc_request *req) +{ + OBD_SLAB_FREE_PTR(req, request_cache); +} + +/** + * Wind down request pool \a pool. + * Frees all requests from the pool too + */ +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *req; + + LASSERT(pool != NULL); + + spin_lock(&pool->prp_lock); + while ((req = list_first_entry_or_null(&pool->prp_req_list, + struct ptlrpc_request, + rq_list))) { + list_del(&req->rq_list); + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len == pool->prp_rq_size); + OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size); + ptlrpc_request_cache_free(req); + } + spin_unlock(&pool->prp_lock); + OBD_FREE(pool, sizeof(*pool)); +} +EXPORT_SYMBOL(ptlrpc_free_rq_pool); + +/** + * Allocates, initializes and adds \a num_rq requests to the pool \a pool + */ +int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) +{ + int i; + int size = 1; + + while (size < pool->prp_rq_size) + size <<= 1; + + LASSERTF(list_empty(&pool->prp_req_list) || + size == pool->prp_rq_size, + "Trying to change pool size with nonempty pool from %d to %d bytes\n", + pool->prp_rq_size, size); + + pool->prp_rq_size = size; + for (i = 0; i < num_rq; i++) { + struct ptlrpc_request *req; + struct lustre_msg *msg; + + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!req) + return i; + OBD_ALLOC_LARGE(msg, size); + if (!msg) { + ptlrpc_request_cache_free(req); + return i; + } + req->rq_reqbuf = msg; + req->rq_reqbuf_len = size; + req->rq_pool = pool; + spin_lock(&pool->prp_lock); + list_add_tail(&req->rq_list, &pool->prp_req_list); + spin_unlock(&pool->prp_lock); + } + return num_rq; +} +EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); + +/** + * Create and initialize new request pool with given attributes: + * \a num_rq - initial number of requests to create for the pool + * \a msgsize - maximum message size possible for requests in thid pool + * \a populate_pool - function to be called when more requests need to be added + * to the pool + * Returns pointer to newly created pool or NULL on error. + */ +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int num_rq, int msgsize, + int (*populate_pool)(struct ptlrpc_request_pool *, int)) +{ + struct ptlrpc_request_pool *pool; + + OBD_ALLOC_PTR(pool); + if (!pool) + return NULL; + + /* + * Request next power of two for the allocation, because internally + * kernel would do exactly this + */ + spin_lock_init(&pool->prp_lock); + INIT_LIST_HEAD(&pool->prp_req_list); + pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD; + pool->prp_populate = populate_pool; + + populate_pool(pool, num_rq); + + return pool; +} +EXPORT_SYMBOL(ptlrpc_init_rq_pool); + +/** + * Fetches one request from pool \a pool + */ +static struct ptlrpc_request * +ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request; + struct lustre_msg *reqbuf; + + if (!pool) + return NULL; + + spin_lock(&pool->prp_lock); + + /* + * See if we have anything in a pool, and bail out if nothing, + * in writeout path, where this matters, this is safe to do, because + * nothing is lost in this case, and when some in-flight requests + * complete, this code will be called again. 
+ */ + if (unlikely(list_empty(&pool->prp_req_list))) { + spin_unlock(&pool->prp_lock); + return NULL; + } + + request = list_first_entry(&pool->prp_req_list, struct ptlrpc_request, + rq_list); + list_del_init(&request->rq_list); + spin_unlock(&pool->prp_lock); + + LASSERT(request->rq_reqbuf); + LASSERT(request->rq_pool); + + reqbuf = request->rq_reqbuf; + memset(request, 0, sizeof(*request)); + request->rq_reqbuf = reqbuf; + request->rq_reqbuf_len = pool->prp_rq_size; + request->rq_pool = pool; + + return request; +} + +/** + * Returns freed \a request to pool. + */ +static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request) +{ + struct ptlrpc_request_pool *pool = request->rq_pool; + + spin_lock(&pool->prp_lock); + LASSERT(list_empty(&request->rq_list)); + LASSERT(!request->rq_receiving_reply); + list_add_tail(&request->rq_list, &pool->prp_req_list); + spin_unlock(&pool->prp_lock); +} + +void ptlrpc_add_unreplied(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_request *iter; + + assert_spin_locked(&imp->imp_lock); + LASSERT(list_empty(&req->rq_unreplied_list)); + + /* unreplied list is sorted by xid in ascending order */ + list_for_each_entry_reverse(iter, &imp->imp_unreplied_list, + rq_unreplied_list) { + LASSERT(req->rq_xid != iter->rq_xid); + if (req->rq_xid < iter->rq_xid) + continue; + list_add(&req->rq_unreplied_list, &iter->rq_unreplied_list); + return; + } + list_add(&req->rq_unreplied_list, &imp->imp_unreplied_list); +} + +void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req) +{ + req->rq_xid = ptlrpc_next_xid(); + ptlrpc_add_unreplied(req); +} + +static inline void ptlrpc_assign_next_xid(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_import->imp_lock); + ptlrpc_assign_next_xid_nolock(req); + spin_unlock(&req->rq_import->imp_lock); +} + +static atomic64_t ptlrpc_last_xid; + +static void ptlrpc_reassign_next_xid(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_import->imp_lock); + list_del_init(&req->rq_unreplied_list); + ptlrpc_assign_next_xid_nolock(req); + spin_unlock(&req->rq_import->imp_lock); + DEBUG_REQ(D_RPCTRACE, req, "reassign xid"); +} + +void ptlrpc_get_mod_rpc_slot(struct ptlrpc_request *req) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + __u32 opc; + __u16 tag; + + opc = lustre_msg_get_opc(req->rq_reqmsg); + tag = obd_get_mod_rpc_slot(cli, opc); + lustre_msg_set_tag(req->rq_reqmsg, tag); + ptlrpc_reassign_next_xid(req); +} +EXPORT_SYMBOL(ptlrpc_get_mod_rpc_slot); + +void ptlrpc_put_mod_rpc_slot(struct ptlrpc_request *req) +{ + __u16 tag = lustre_msg_get_tag(req->rq_reqmsg); + + if (tag != 0) { + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + obd_put_mod_rpc_slot(cli, opc, tag); + } +} +EXPORT_SYMBOL(ptlrpc_put_mod_rpc_slot); + +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx) +{ + int count; + struct obd_import *imp; + __u32 *lengths; + int rc; + + ENTRY; + + count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT); + imp = request->rq_import; + lengths = request->rq_pill.rc_area[RCL_CLIENT]; + + if (ctx) { + request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx); + } else { + rc = sptlrpc_req_get_ctx(request); + if (rc) + GOTO(out_free, rc); + } + sptlrpc_req_set_flavor(request, opcode); + + rc = lustre_pack_request(request, imp->imp_msg_magic, count, + lengths, bufs); + if (rc) + GOTO(out_ctx, rc); + + 
lustre_msg_add_version(request->rq_reqmsg, version); + request->rq_send_state = LUSTRE_IMP_FULL; + request->rq_type = PTL_RPC_MSG_REQUEST; + + request->rq_req_cbid.cbid_fn = request_out_callback; + request->rq_req_cbid.cbid_arg = request; + + request->rq_reply_cbid.cbid_fn = reply_in_callback; + request->rq_reply_cbid.cbid_arg = request; + + request->rq_reply_deadline = 0; + request->rq_bulk_deadline = 0; + request->rq_req_deadline = 0; + request->rq_phase = RQ_PHASE_NEW; + request->rq_next_phase = RQ_PHASE_UNDEFINED; + + request->rq_request_portal = imp->imp_client->cli_request_portal; + request->rq_reply_portal = imp->imp_client->cli_reply_portal; + + ptlrpc_at_set_req_timeout(request); + + lustre_msg_set_opc(request->rq_reqmsg, opcode); + + /* Let's setup deadline for req/reply/bulk unlink for opcode. */ + if (cfs_fail_val == opcode) { + time64_t *fail_t = NULL, *fail2_t = NULL; + + if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { + fail_t = &request->rq_bulk_deadline; + } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + fail_t = &request->rq_reply_deadline; + } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK)) { + fail_t = &request->rq_req_deadline; + } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK)) { + fail_t = &request->rq_reply_deadline; + fail2_t = &request->rq_bulk_deadline; + } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_ROUND_XID)) { + time64_t now = ktime_get_real_seconds(); + u64 xid = ((u64)now >> 4) << 24; + + atomic64_set(&ptlrpc_last_xid, xid); + } + + if (fail_t) { + *fail_t = ktime_get_real_seconds() + + PTLRPC_REQ_LONG_UNLINK; + + if (fail2_t) + *fail2_t = ktime_get_real_seconds() + + PTLRPC_REQ_LONG_UNLINK; + + /* + * The RPC is infected, let the test to change the + * fail_loc + */ + msleep(4 * MSEC_PER_SEC); + } + } + ptlrpc_assign_next_xid(request); + + RETURN(0); + +out_ctx: + LASSERT(!request->rq_pool); + sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1); +out_free: + atomic_dec(&imp->imp_reqs); + class_import_put(imp); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_request_bufs_pack); + +/** + * Pack request buffers for network transfer, performing necessary encryption + * steps if necessary. + */ +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode) +{ + return ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL); +} +EXPORT_SYMBOL(ptlrpc_request_pack); + +/** + * Helper function to allocate new request on import \a imp + * and possibly using existing request from pool \a pool if provided. + * Returns allocated request structure with import field filled or + * NULL on error. + */ +static inline +struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, + struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request = NULL; + + request = ptlrpc_request_cache_alloc(GFP_NOFS); + + if (!request && pool) + request = ptlrpc_prep_req_from_pool(pool); + + if (request) { + ptlrpc_cli_req_init(request); + + LASSERTF((unsigned long)imp > 0x1000, "%p\n", imp); + LASSERT(imp != LP_POISON); + LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p\n", + imp->imp_client); + LASSERT(imp->imp_client != LP_POISON); + + request->rq_import = class_import_get(imp); + atomic_inc(&imp->imp_reqs); + } else { + CERROR("request allocation out of memory\n"); + } + + return request; +} + +static int ptlrpc_reconnect_if_idle(struct obd_import *imp) +{ + int rc; + + /* + * initiate connection if needed when the import has been + * referenced by the new request to avoid races with disconnect. 
+ * serialize this check against conditional state=IDLE + * in ptlrpc_disconnect_idle_interpret() + */ + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_IDLE) { + imp->imp_generation++; + imp->imp_initiated_at = imp->imp_generation; + imp->imp_state = LUSTRE_IMP_NEW; + + /* connect_import_locked releases imp_lock */ + rc = ptlrpc_connect_import_locked(imp); + if (rc) + return rc; + ptlrpc_pinger_add_import(imp); + } else { + spin_unlock(&imp->imp_lock); + } + return 0; +} + +/** + * Helper function for creating a request. + * Calls __ptlrpc_request_alloc to allocate new request structure and inits + * buffer structures according to capsule template \a format. + * Returns allocated request structure pointer or NULL on error. + */ +static struct ptlrpc_request * +ptlrpc_request_alloc_internal(struct obd_import *imp, + struct ptlrpc_request_pool *pool, + const struct req_format *format) +{ + struct ptlrpc_request *request; + + request = __ptlrpc_request_alloc(imp, pool); + if (!request) + return NULL; + + /* don't make expensive check for idling connection + * if it's already connected */ + if (unlikely(imp->imp_state != LUSTRE_IMP_FULL)) { + if (ptlrpc_reconnect_if_idle(imp) < 0) { + atomic_dec(&imp->imp_reqs); + ptlrpc_request_free(request); + return NULL; + } + } + + req_capsule_init(&request->rq_pill, request, RCL_CLIENT); + req_capsule_set(&request->rq_pill, format); + return request; +} + +/** + * Allocate new request structure for import \a imp and initialize its + * buffer structure according to capsule template \a format. + */ +struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, + const struct req_format *format) +{ + return ptlrpc_request_alloc_internal(imp, NULL, format); +} +EXPORT_SYMBOL(ptlrpc_request_alloc); + +/** + * Allocate new request structure for import \a imp from pool \a pool and + * initialize its buffer structure according to capsule template \a format. + */ +struct ptlrpc_request * +ptlrpc_request_alloc_pool(struct obd_import *imp, + struct ptlrpc_request_pool *pool, + const struct req_format *format) +{ + return ptlrpc_request_alloc_internal(imp, pool, format); +} +EXPORT_SYMBOL(ptlrpc_request_alloc_pool); + +/** + * For requests not from pool, free memory of the request structure. + * For requests obtained from a pool earlier, return request back to pool. + */ +void ptlrpc_request_free(struct ptlrpc_request *request) +{ + if (request->rq_pool) + __ptlrpc_free_req_to_pool(request); + else + ptlrpc_request_cache_free(request); +} +EXPORT_SYMBOL(ptlrpc_request_free); + +/** + * Allocate new request for operation \a opcode and immediately pack it for + * network transfer. + * Only used for simple requests like OBD_PING where the only important + * part of the request is operation itself. + * Returns allocated request or NULL on error. + */ +struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, + const struct req_format *format, + __u32 version, int opcode) +{ + struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format); + int rc; + + if (req) { + rc = ptlrpc_request_pack(req, version, opcode); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + } + } + return req; +} +EXPORT_SYMBOL(ptlrpc_request_alloc_pack); + +/** + * Allocate and initialize new request set structure on the current CPT. + * Returns a pointer to the newly allocated set structure or NULL on error. 
+ */ +struct ptlrpc_request_set *ptlrpc_prep_set(void) +{ + struct ptlrpc_request_set *set; + int cpt; + + ENTRY; + cpt = cfs_cpt_current(cfs_cpt_tab, 0); + OBD_CPT_ALLOC(set, cfs_cpt_tab, cpt, sizeof(*set)); + if (!set) + RETURN(NULL); + atomic_set(&set->set_refcount, 1); + INIT_LIST_HEAD(&set->set_requests); + init_waitqueue_head(&set->set_waitq); + atomic_set(&set->set_new_count, 0); + atomic_set(&set->set_remaining, 0); + spin_lock_init(&set->set_new_req_lock); + INIT_LIST_HEAD(&set->set_new_requests); + set->set_max_inflight = UINT_MAX; + set->set_producer = NULL; + set->set_producer_arg = NULL; + set->set_rc = 0; + + RETURN(set); +} +EXPORT_SYMBOL(ptlrpc_prep_set); + +/** + * Allocate and initialize new request set structure with flow control + * extension. This extension allows to control the number of requests in-flight + * for the whole set. A callback function to generate requests must be provided + * and the request set will keep the number of requests sent over the wire to + * @max_inflight. + * Returns a pointer to the newly allocated set structure or NULL on error. + */ +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg) + +{ + struct ptlrpc_request_set *set; + + set = ptlrpc_prep_set(); + if (!set) + RETURN(NULL); + + set->set_max_inflight = max; + set->set_producer = func; + set->set_producer_arg = arg; + + RETURN(set); +} + +/** + * Wind down and free request set structure previously allocated with + * ptlrpc_prep_set. + * Ensures that all requests on the set have completed and removes + * all requests from the request list in a set. + * If any unsent request happen to be on the list, pretends that they got + * an error in flight and calls their completion handler. + */ +void ptlrpc_set_destroy(struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + int expected_phase; + int n = 0; + + ENTRY; + + /* Requests on the set should either all be completed, or all be new */ + expected_phase = (atomic_read(&set->set_remaining) == 0) ? + RQ_PHASE_COMPLETE : RQ_PHASE_NEW; + list_for_each_entry(req, &set->set_requests, rq_set_chain) { + LASSERT(req->rq_phase == expected_phase); + n++; + } + + LASSERTF(atomic_read(&set->set_remaining) == 0 || + atomic_read(&set->set_remaining) == n, "%d / %d\n", + atomic_read(&set->set_remaining), n); + + while ((req = list_first_entry_or_null(&set->set_requests, + struct ptlrpc_request, + rq_set_chain))) { + list_del_init(&req->rq_set_chain); + + LASSERT(req->rq_phase == expected_phase); + + if (req->rq_phase == RQ_PHASE_NEW) { + ptlrpc_req_interpret(NULL, req, -EBADR); + atomic_dec(&set->set_remaining); + } + + spin_lock(&req->rq_lock); + req->rq_set = NULL; + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + + ptlrpc_req_finished(req); + } + + LASSERT(atomic_read(&set->set_remaining) == 0); + + ptlrpc_reqset_put(set); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_set_destroy); + +/** + * Add a new request to the general purpose request set. + * Assumes request reference from the caller. 
+ */ +void ptlrpc_set_add_req(struct ptlrpc_request_set *set, + struct ptlrpc_request *req) +{ + if (set == PTLRPCD_SET) { + ptlrpcd_add_req(req); + return; + } + + LASSERT(req->rq_import->imp_state != LUSTRE_IMP_IDLE); + LASSERT(list_empty(&req->rq_set_chain)); + + if (req->rq_allow_intr) + set->set_allow_intr = 1; + + /* The set takes over the caller's request reference */ + list_add_tail(&req->rq_set_chain, &set->set_requests); + req->rq_set = set; + atomic_inc(&set->set_remaining); + req->rq_queued_time = ktime_get_seconds(); + + if (req->rq_reqmsg) + lustre_msg_set_jobid(req->rq_reqmsg, NULL); + + if (set->set_producer) + /* + * If the request set has a producer callback, the RPC must be + * sent straight away + */ + ptlrpc_send_new_req(req); +} +EXPORT_SYMBOL(ptlrpc_set_add_req); + +/** + * Add a request to a request set with a dedicated server thread + * and wake the thread to do any necessary processing. + * Currently only used for ptlrpcd. + */ +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set = pc->pc_set; + int count, i; + + LASSERT(req->rq_set == NULL); + LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0); + + spin_lock(&set->set_new_req_lock); + /* + * The set takes over the caller's request reference. + */ + req->rq_set = set; + req->rq_queued_time = ktime_get_seconds(); + list_add_tail(&req->rq_set_chain, &set->set_new_requests); + count = atomic_inc_return(&set->set_new_count); + spin_unlock(&set->set_new_req_lock); + + /* Only need to call wakeup once for the first entry. */ + if (count == 1) { + wake_up(&set->set_waitq); + + /* + * XXX: It may be unnecessary to wake up all the partners. But to + * guarantee the async RPC can be processed ASAP, we have + * no better choice. It may be fixed in the future. + */ + for (i = 0; i < pc->pc_npartners; i++) + wake_up(&pc->pc_partners[i]->pc_set->set_waitq); + } +} + +/** + * Based on the current state of the import, determine if the request + * can be sent, is an error, or should be delayed. + * + * Returns true if this request should be delayed. If false, and + * *status is set, then the request cannot be sent and *status is the + * error code. If false and status is 0, then request can be sent. + * + * The imp->imp_lock must be held. + */ +static int ptlrpc_import_delay_req(struct obd_import *imp, + struct ptlrpc_request *req, int *status) +{ + int delay = 0; + + ENTRY; + LASSERT(status); + *status = 0; + + if (req->rq_ctx_init || req->rq_ctx_fini) { + /* always allow ctx init/fini RPCs to go through */ + } else if (imp->imp_state == LUSTRE_IMP_NEW) { + DEBUG_REQ(D_ERROR, req, "Uninitialized import"); + *status = -EIO; + } else if (imp->imp_state == LUSTRE_IMP_CLOSED) { + unsigned int opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* + * pings or MDS-equivalent STATFS may safely + * race with umount + */ + DEBUG_REQ((opc == OBD_PING || opc == OST_STATFS) ? 
+ D_HA : D_ERROR, req, "IMP_CLOSED"); + *status = -EIO; + } else if (ptlrpc_send_limit_expired(req)) { + /* probably doesn't need to be a D_ERROR afterinitial testing */ + DEBUG_REQ(D_HA, req, "send limit expired"); + *status = -ETIMEDOUT; + } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING && + imp->imp_state == LUSTRE_IMP_CONNECTING) { + ;/* allow CONNECT even if import is invalid */ + if (atomic_read(&imp->imp_inval_count) != 0) { + DEBUG_REQ(D_ERROR, req, "invalidate in flight"); + *status = -EIO; + } + } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) { + if (!imp->imp_deactive) + DEBUG_REQ(D_NET, req, "IMP_INVALID"); + *status = -ESHUTDOWN; /* b=12940 */ + } else if (req->rq_import_generation != imp->imp_generation) { + DEBUG_REQ(D_ERROR, req, "req wrong generation:"); + *status = -EIO; + } else if (req->rq_send_state != imp->imp_state) { + /* invalidate in progress - any requests should be drop */ + if (atomic_read(&imp->imp_inval_count) != 0) { + DEBUG_REQ(D_ERROR, req, "invalidate in flight"); + *status = -EIO; + } else if (req->rq_no_delay && + imp->imp_generation != imp->imp_initiated_at) { + /* ignore nodelay for requests initiating connections */ + *status = -EAGAIN; + } else if (req->rq_allow_replay && + (imp->imp_state == LUSTRE_IMP_REPLAY || + imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS || + imp->imp_state == LUSTRE_IMP_REPLAY_WAIT || + imp->imp_state == LUSTRE_IMP_RECOVER)) { + DEBUG_REQ(D_HA, req, "allow during recovery"); + } else { + delay = 1; + } + } + + RETURN(delay); +} + +/** + * Decide if the error message should be printed to the console or not. + * Makes its decision based on request type, status, and failure frequency. + * + * \param[in] req request that failed and may need a console message + * + * \retval false if no message should be printed + * \retval true if console message should be printed + */ +static bool ptlrpc_console_allow(struct ptlrpc_request *req, __u32 opc, int err) +{ + LASSERT(req->rq_reqmsg != NULL); + + /* Suppress particular reconnect errors which are to be expected. */ + if (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT) { + /* Suppress timed out reconnect requests */ + if (lustre_handle_is_used(&req->rq_import->imp_remote_handle) || + req->rq_timedout) + return false; + + /* + * Suppress most unavailable/again reconnect requests, but + * print occasionally so it is clear client is trying to + * connect to a server where no target is running. + */ + if ((err == -ENODEV || err == -EAGAIN) && + req->rq_import->imp_conn_cnt % 30 != 20) + return false; + } + + if (opc == LDLM_ENQUEUE && err == -EAGAIN) + /* -EAGAIN is normal when using POSIX flocks */ + return false; + + if (opc == OBD_PING && (err == -ENODEV || err == -ENOTCONN) && + (req->rq_xid & 0xf) != 10) + /* Suppress most ping requests, they may fail occasionally */ + return false; + + return true; +} + +/** + * Check request processing status. + * Returns the status. + */ +static int ptlrpc_check_status(struct ptlrpc_request *req) +{ + int rc; + + ENTRY; + rc = lustre_msg_get_status(req->rq_repmsg); + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + struct obd_import *imp = req->rq_import; + struct lnet_nid *nid = &imp->imp_connection->c_peer.nid; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + if (ptlrpc_console_allow(req, opc, rc)) + LCONSOLE_ERROR_MSG(0x11, + "%s: operation %s to node %s failed: rc = %d\n", + imp->imp_obd->obd_name, + ll_opcode2str(opc), + libcfs_nidstr(nid), rc); + RETURN(rc < 0 ? 
rc : -EINVAL); + } + + if (rc) + DEBUG_REQ(D_INFO, req, "check status: rc = %d", rc); + + RETURN(rc); +} + +/** + * save pre-versions of objects into request for replay. + * Versions are obtained from server reply. + * used for VBR. + */ +static void ptlrpc_save_versions(struct ptlrpc_request *req) +{ + struct lustre_msg *repmsg = req->rq_repmsg; + struct lustre_msg *reqmsg = req->rq_reqmsg; + __u64 *versions = lustre_msg_get_versions(repmsg); + + ENTRY; + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + return; + + LASSERT(versions); + lustre_msg_set_versions(reqmsg, versions); + CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n", + versions[0], versions[1]); + + EXIT; +} + +__u64 ptlrpc_known_replied_xid(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + assert_spin_locked(&imp->imp_lock); + if (list_empty(&imp->imp_unreplied_list)) + return 0; + + req = list_first_entry(&imp->imp_unreplied_list, struct ptlrpc_request, + rq_unreplied_list); + LASSERTF(req->rq_xid >= 1, "XID:%llu\n", req->rq_xid); + + if (imp->imp_known_replied_xid < req->rq_xid - 1) + imp->imp_known_replied_xid = req->rq_xid - 1; + + return req->rq_xid - 1; +} + +/** + * Callback function called when client receives RPC reply for \a req. + * Returns 0 on success or error code. + * The return alue would be assigned to req->rq_status by the caller + * as request processing status. + * This function also decides if the request needs to be saved for later replay. + */ +static int after_reply(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct obd_device *obd = req->rq_import->imp_obd; + ktime_t work_start; + u64 committed; + s64 timediff; + int rc; + + ENTRY; + LASSERT(obd != NULL); + /* repbuf must be unlinked */ + LASSERT(!req->rq_receiving_reply && req->rq_reply_unlinked); + + if (req->rq_reply_truncated) { + if (ptlrpc_no_resend(req)) { + DEBUG_REQ(D_ERROR, req, + "reply buffer overflow, expected=%d, actual size=%d", + req->rq_nob_received, req->rq_repbuf_len); + RETURN(-EOVERFLOW); + } + + sptlrpc_cli_free_repbuf(req); + /* + * Pass the required reply buffer size (include + * space for early reply). + * NB: no need to roundup because alloc_repbuf + * will roundup it + */ + req->rq_replen = req->rq_nob_received; + req->rq_nob_received = 0; + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + RETURN(0); + } + + work_start = ktime_get_real(); + timediff = ktime_us_delta(work_start, req->rq_sent_ns); + + /* + * NB Until this point, the whole of the incoming message, + * including buflens, status etc is in the sender's byte order. + */ + rc = sptlrpc_cli_unwrap_reply(req); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unwrap reply failed: rc = %d", rc); + RETURN(rc); + } + + /* + * Security layer unwrap might ask resend this request. + */ + if (req->rq_resend) + RETURN(0); + + rc = unpack_reply(req); + if (rc) + RETURN(rc); + + /* retry indefinitely on EINPROGRESS */ + if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS && + ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) { + time64_t now = ktime_get_real_seconds(); + + DEBUG_REQ((req->rq_nr_resend % 8 == 1 ? D_WARNING : 0) | + D_RPCTRACE, req, "resending request on EINPROGRESS"); + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + req->rq_nr_resend++; + + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + /* + * delay resend to give a chance to the server to get ready. 
+ * The delay is increased by 1s on every resend and is capped to + * the current request timeout (i.e. obd_timeout if AT is off, + * or AT service time x 125% + 5s, see at_est2timeout) + */ + if (req->rq_nr_resend > req->rq_timeout) + req->rq_sent = now + req->rq_timeout; + else + req->rq_sent = now + req->rq_nr_resend; + + /* Resend for EINPROGRESS will use a new XID */ + spin_lock(&imp->imp_lock); + list_del_init(&req->rq_unreplied_list); + spin_unlock(&imp->imp_lock); + + RETURN(0); + } + + if (obd->obd_svc_stats) { + lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, + timediff); + ptlrpc_lprocfs_rpc_sent(req, timediff); + } + + if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY && + lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)", + lustre_msg_get_type(req->rq_repmsg)); + RETURN(-EPROTO); + } + + if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) + CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val); + ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_timeout(req->rq_repmsg)); + + rc = ptlrpc_check_status(req); + + if (rc) { + /* + * Either we've been evicted, or the server has failed for + * some reason. Try to reconnect, and if that fails, punt to + * the upcall. + */ + if (ptlrpc_recoverable_error(rc)) { + if (req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { + RETURN(rc); + } + ptlrpc_request_handle_notconn(req); + RETURN(rc); + } + } else { + /* + * Let's look if server sent slv. Do it only for RPC with + * rc == 0. + */ + ldlm_cli_update_pool(req); + } + + /* + * Store transno in reqmsg for replay. + */ + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { + req->rq_transno = lustre_msg_get_transno(req->rq_repmsg); + lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno); + } + + if (imp->imp_replayable) { + /* if other threads are waiting for ptlrpc_free_committed() + * they could continue the work of freeing RPCs. That reduces + * lock hold times, and distributes work more fairly across + * waiting threads. We can't use spin_is_contended() since + * there are many other places where imp_lock is held. + */ + atomic_inc(&imp->imp_waiting); + spin_lock(&imp->imp_lock); + atomic_dec(&imp->imp_waiting); + /* + * No point in adding already-committed requests to the replay + * list, we will just remove them immediately. b=9829 + */ + if (req->rq_transno != 0 && + (req->rq_transno > + lustre_msg_get_last_committed(req->rq_repmsg) || + req->rq_replay)) { + /** version recovery */ + ptlrpc_save_versions(req); + ptlrpc_retain_replayable_request(req, imp); + } else if (req->rq_commit_cb && + list_empty(&req->rq_replay_list)) { + /* + * NB: don't call rq_commit_cb if it's already on + * rq_replay_list, ptlrpc_free_committed() will call + * it later, see LU-3618 for details + */ + spin_unlock(&imp->imp_lock); + req->rq_commit_cb(req); + atomic_inc(&imp->imp_waiting); + spin_lock(&imp->imp_lock); + atomic_dec(&imp->imp_waiting); + } + + /* + * Replay-enabled imports return commit-status information. 
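Editor's illustration (not part of the patch): the -EINPROGRESS resend backoff computed in after_reply() above, restated as a standalone helper with simplified types. The delay grows by one second per resend and is capped at the current request timeout.

#include <assert.h>

typedef long long time64;

static time64 einprogress_next_send(time64 now, unsigned int nr_resend,
				    time64 timeout)
{
	time64 delay = nr_resend;

	if (delay > timeout)
		delay = timeout;
	return now + delay;
}

int main(void)
{
	/* 3rd resend with a 30s timeout: wait 3 seconds. */
	assert(einprogress_next_send(1000, 3, 30) == 1003);
	/* 100th resend with a 30s timeout: capped at 30 seconds. */
	assert(einprogress_next_send(1000, 100, 30) == 1030);
	return 0;
}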
+ */ + committed = lustre_msg_get_last_committed(req->rq_repmsg); + if (likely(committed > imp->imp_peer_committed_transno)) + imp->imp_peer_committed_transno = committed; + + ptlrpc_free_committed(imp); + + if (!list_empty(&imp->imp_replay_list)) { + struct ptlrpc_request *last; + + last = list_entry(imp->imp_replay_list.prev, + struct ptlrpc_request, + rq_replay_list); + /* + * Requests with rq_replay stay on the list even if no + * commit is expected. + */ + if (last->rq_transno > imp->imp_peer_committed_transno) + ptlrpc_pinger_commit_expected(imp); + } + + spin_unlock(&imp->imp_lock); + } + + RETURN(rc); +} + +/** + * Helper function to send request \a req over the network for the first time + * Also adjusts request phase. + * Returns 0 on success or error code. + */ +static int ptlrpc_send_new_req(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + __u64 min_xid = 0; + int rc; + + ENTRY; + LASSERT(req->rq_phase == RQ_PHASE_NEW); + + /* do not try to go further if there is not enough memory in enc_pool */ + if (req->rq_sent && req->rq_bulk) + if (req->rq_bulk->bd_iov_count > get_free_pages_in_pool() && + pool_is_at_full_capacity()) + RETURN(-ENOMEM); + + if (req->rq_sent && (req->rq_sent > ktime_get_real_seconds()) && + (!req->rq_generation_set || + req->rq_import_generation == imp->imp_generation)) + RETURN(0); + + ptlrpc_rqphase_move(req, RQ_PHASE_RPC); + + spin_lock(&imp->imp_lock); + + LASSERT(req->rq_xid != 0); + LASSERT(!list_empty(&req->rq_unreplied_list)); + + if (!req->rq_generation_set) + req->rq_import_generation = imp->imp_generation; + + if (ptlrpc_import_delay_req(imp, req, &rc)) { + spin_lock(&req->rq_lock); + req->rq_waiting = 1; + spin_unlock(&req->rq_lock); + + DEBUG_REQ(D_HA, req, "req waiting for recovery: (%s != %s)", + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_delayed_list); + atomic_inc(&req->rq_import->imp_inflight); + spin_unlock(&imp->imp_lock); + RETURN(0); + } + + if (rc != 0) { + spin_unlock(&imp->imp_lock); + req->rq_status = rc; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + RETURN(rc); + } + + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_sending_list); + atomic_inc(&req->rq_import->imp_inflight); + + /* + * find the known replied XID from the unreplied list, CONNECT + * and DISCONNECT requests are skipped to make the sanity check + * on server side happy. see process_req_last_xid(). + * + * For CONNECT: Because replay requests have lower XID, it'll + * break the sanity check if CONNECT bump the exp_last_xid on + * server. + * + * For DISCONNECT: Since client will abort inflight RPC before + * sending DISCONNECT, DISCONNECT may carry an XID which higher + * than the inflight RPC. + */ + if (!ptlrpc_req_is_connect(req) && !ptlrpc_req_is_disconnect(req)) + min_xid = ptlrpc_known_replied_xid(imp); + spin_unlock(&imp->imp_lock); + + lustre_msg_set_last_xid(req->rq_reqmsg, min_xid); + + lustre_msg_set_status(req->rq_reqmsg, current->pid); + + /* If the request to be sent is an LDLM callback, do not try to + * refresh context. + * An LDLM callback is sent by a server to a client in order to make + * it release a lock, on a communication channel that uses a reverse + * context. It cannot be refreshed on its own, as it is the 'reverse' + * (server-side) representation of a client context. 
+ * We do not care if the reverse context is expired, and want to send + * the LDLM callback anyway. Once the client receives the AST, it is + * its job to refresh its own context if it has expired, hence + * refreshing the associated reverse context on server side, before + * being able to send the LDLM_CANCEL requested by the server. + */ + if (lustre_msg_get_opc(req->rq_reqmsg) != LDLM_BL_CALLBACK && + lustre_msg_get_opc(req->rq_reqmsg) != LDLM_CP_CALLBACK && + lustre_msg_get_opc(req->rq_reqmsg) != LDLM_GL_CALLBACK) + rc = sptlrpc_req_refresh_ctx(req, 0); + if (rc) { + if (req->rq_err) { + req->rq_status = rc; + RETURN(1); + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 1; + spin_unlock(&req->rq_lock); + RETURN(0); + } + } + + CDEBUG(D_RPCTRACE, + "Sending RPC req@%p pname:cluuid:pid:xid:nid:opc:job %s:%s:%d:%llu:%s:%d:%s\n", + req, current->comm, + imp->imp_obd->obd_uuid.uuid, + lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, + obd_import_nid2str(imp), lustre_msg_get_opc(req->rq_reqmsg), + lustre_msg_get_jobid(req->rq_reqmsg) ?: ""); + + rc = ptl_send_rpc(req, 0); + if (rc == -ENOMEM) { + spin_lock(&imp->imp_lock); + if (!list_empty(&req->rq_list)) { + list_del_init(&req->rq_list); + if (atomic_dec_and_test(&req->rq_import->imp_inflight)) + wake_up(&req->rq_import->imp_recovery_waitq); + } + spin_unlock(&imp->imp_lock); + ptlrpc_rqphase_move(req, RQ_PHASE_NEW); + RETURN(rc); + } + if (rc) { + DEBUG_REQ(D_HA, req, "send failed, expect timeout: rc = %d", + rc); + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + RETURN(rc); + } + RETURN(0); +} + +static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set) +{ + int remaining, rc; + + ENTRY; + LASSERT(set->set_producer != NULL); + + remaining = atomic_read(&set->set_remaining); + + /* + * populate the ->set_requests list with requests until we + * reach the maximum number of RPCs in flight for this set + */ + while (atomic_read(&set->set_remaining) < set->set_max_inflight) { + rc = set->set_producer(set, set->set_producer_arg); + if (rc == -ENOENT) { + /* no more RPC to produce */ + set->set_producer = NULL; + set->set_producer_arg = NULL; + RETURN(0); + } + } + + RETURN((atomic_read(&set->set_remaining) - remaining)); +} + +/** + * this sends any unsent RPCs in \a set and returns 1 if all are sent + * and no more replies are expected. + * (it is possible to get less replies than requests sent e.g. due to timed out + * requests or requests that we had trouble to send out) + * + * NOTE: This function contains a potential schedule point (cond_resched()). + */ +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req, *next; + LIST_HEAD(comp_reqs); + int force_timer_recalc = 0; + + ENTRY; + if (atomic_read(&set->set_remaining) == 0) + RETURN(1); + + list_for_each_entry_safe(req, next, &set->set_requests, + rq_set_chain) { + struct obd_import *imp = req->rq_import; + int unregistered = 0; + int async = 1; + int rc = 0; + + if (req->rq_phase == RQ_PHASE_COMPLETE) { + list_move_tail(&req->rq_set_chain, &comp_reqs); + continue; + } + + /* + * This schedule point is mainly for the ptlrpcd caller of this + * function. Most ptlrpc sets are not long-lived and unbounded + * in length, but at the least the set used by the ptlrpcd is. + * Since the processing time is unbounded, we need to insert an + * explicit schedule point to make the thread well-behaved. 
+ */ + cond_resched(); + + /* + * If the caller requires to allow to be interpreted by force + * and it has really been interpreted, then move the request + * to RQ_PHASE_INTERPRET phase in spite of what the current + * phase is. + */ + if (unlikely(req->rq_allow_intr && req->rq_intr)) { + req->rq_status = -EINTR; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + + /* + * Since it is interpreted and we have to wait for + * the reply to be unlinked, then use sync mode. + */ + async = 0; + + GOTO(interpret, req->rq_status); + } + + if (req->rq_phase == RQ_PHASE_NEW && ptlrpc_send_new_req(req)) + force_timer_recalc = 1; + + /* delayed send - skip */ + if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) + continue; + + /* delayed resend - skip */ + if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend && + req->rq_sent > ktime_get_real_seconds()) + continue; + + if (!(req->rq_phase == RQ_PHASE_RPC || + req->rq_phase == RQ_PHASE_BULK || + req->rq_phase == RQ_PHASE_INTERPRET || + req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK)) { + DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase); + LBUG(); + } + + if (req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK) { + LASSERT(req->rq_next_phase != req->rq_phase); + LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED); + + if (req->rq_req_deadline && + !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK)) + req->rq_req_deadline = 0; + if (req->rq_reply_deadline && + !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) + req->rq_reply_deadline = 0; + if (req->rq_bulk_deadline && + !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) + req->rq_bulk_deadline = 0; + + /* + * Skip processing until reply is unlinked. We + * can't return to pool before that and we can't + * call interpret before that. We need to make + * sure that all rdma transfers finished and will + * not corrupt any data. + */ + if (req->rq_phase == RQ_PHASE_UNREG_RPC && + ptlrpc_cli_wait_unlink(req)) + continue; + if (req->rq_phase == RQ_PHASE_UNREG_BULK && + ptlrpc_client_bulk_active(req)) + continue; + + /* + * Turn fail_loc off to prevent it from looping + * forever. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK, + OBD_FAIL_ONCE); + } + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK, + OBD_FAIL_ONCE); + } + + /* + * Move to next phase if reply was successfully + * unlinked. + */ + ptlrpc_rqphase_move(req, req->rq_next_phase); + } + + if (req->rq_phase == RQ_PHASE_INTERPRET) + GOTO(interpret, req->rq_status); + + /* + * Note that this also will start async reply unlink. + */ + if (req->rq_net_err && !req->rq_timedout) { + ptlrpc_expire_one_request(req, 1); + + /* + * Check if we still need to wait for unlink. + */ + if (ptlrpc_cli_wait_unlink(req) || + ptlrpc_client_bulk_active(req)) + continue; + /* If there is no need to resend, fail it now. 
*/ + if (req->rq_no_resend) { + if (req->rq_status == 0) + req->rq_status = -EIO; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } else { + continue; + } + } + + if (req->rq_err) { + if (!ptlrpc_unregister_reply(req, 1)) { + ptlrpc_unregister_bulk(req, 1); + continue; + } + + spin_lock(&req->rq_lock); + req->rq_replied = 0; + spin_unlock(&req->rq_lock); + if (req->rq_status == 0) + req->rq_status = -EIO; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + /* + * ptlrpc_set_wait uses l_wait_event_abortable_timeout() + * so it sets rq_intr regardless of individual rpc + * timeouts. The synchronous IO waiting path sets + * rq_intr irrespective of whether ptlrpcd + * has seen a timeout. Our policy is to only interpret + * interrupted rpcs after they have timed out, so we + * need to enforce that here. + */ + + if (req->rq_intr && (req->rq_timedout || req->rq_waiting || + req->rq_wait_ctx)) { + req->rq_status = -EINTR; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + if (req->rq_phase == RQ_PHASE_RPC) { + if (req->rq_timedout || req->rq_resend || + req->rq_waiting || req->rq_wait_ctx) { + int status; + + if (!ptlrpc_unregister_reply(req, 1)) { + ptlrpc_unregister_bulk(req, 1); + continue; + } + + spin_lock(&imp->imp_lock); + if (ptlrpc_import_delay_req(imp, req, + &status)) { + /* + * put on delay list - only if we wait + * recovery finished - before send + */ + list_move_tail(&req->rq_list, + &imp->imp_delayed_list); + spin_unlock(&imp->imp_lock); + continue; + } + + if (status != 0) { + req->rq_status = status; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); + spin_unlock(&imp->imp_lock); + GOTO(interpret, req->rq_status); + } + /* ignore on just initiated connections */ + if (ptlrpc_no_resend(req) && + !req->rq_wait_ctx && + imp->imp_generation != + imp->imp_initiated_at) { + req->rq_status = -ENOTCONN; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); + spin_unlock(&imp->imp_lock); + GOTO(interpret, req->rq_status); + } + + /* don't resend too fast in case of network + * errors. + */ + if (ktime_get_real_seconds() < (req->rq_sent + 1) + && req->rq_net_err && req->rq_timedout) { + + DEBUG_REQ(D_INFO, req, + "throttle request"); + /* Don't try to resend RPC right away + * as it is likely it will fail again + * and ptlrpc_check_set() will be + * called again, keeping this thread + * busy. Instead, wait for the next + * timeout. Flag it as resend to + * ensure we don't wait to long. + */ + req->rq_resend = 1; + spin_unlock(&imp->imp_lock); + continue; + } + + list_move_tail(&req->rq_list, + &imp->imp_sending_list); + + spin_unlock(&imp->imp_lock); + + spin_lock(&req->rq_lock); + req->rq_waiting = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_timedout || req->rq_resend) { + /* + * This is re-sending anyways, + * let's mark req as resend. + */ + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + } + /* + * rq_wait_ctx is only touched by ptlrpcd, + * so no lock is needed here. 
+ */ + status = sptlrpc_req_refresh_ctx(req, 0); + if (status) { + if (req->rq_err) { + req->rq_status = status; + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 0; + spin_unlock(&req->rq_lock); + force_timer_recalc = 1; + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 1; + spin_unlock(&req->rq_lock); + } + + continue; + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 0; + spin_unlock(&req->rq_lock); + } + + /* + * In any case, the previous bulk should be + * cleaned up to prepare for the new sending + */ + if (req->rq_bulk && + !ptlrpc_unregister_bulk(req, 1)) + continue; + + rc = ptl_send_rpc(req, 0); + if (rc == -ENOMEM) { + spin_lock(&imp->imp_lock); + if (!list_empty(&req->rq_list)) + list_del_init(&req->rq_list); + spin_unlock(&imp->imp_lock); + ptlrpc_rqphase_move(req, RQ_PHASE_NEW); + continue; + } + if (rc) { + DEBUG_REQ(D_HA, req, + "send failed: rc = %d", rc); + force_timer_recalc = 1; + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + continue; + } + /* need to reset the timeout */ + force_timer_recalc = 1; + } + + spin_lock(&req->rq_lock); + + if (ptlrpc_client_early(req)) { + ptlrpc_at_recv_early_reply(req); + spin_unlock(&req->rq_lock); + continue; + } + + /* Still waiting for a reply? */ + if (ptlrpc_client_recv(req)) { + spin_unlock(&req->rq_lock); + continue; + } + + /* Did we actually receive a reply? */ + if (!ptlrpc_client_replied(req)) { + spin_unlock(&req->rq_lock); + continue; + } + + spin_unlock(&req->rq_lock); + + /* + * unlink from net because we are going to + * swab in-place of reply buffer + */ + unregistered = ptlrpc_unregister_reply(req, 1); + if (!unregistered) + continue; + + req->rq_status = after_reply(req); + if (req->rq_resend) { + force_timer_recalc = 1; + continue; + } + + /* + * If there is no bulk associated with this request, + * then we're done and should let the interpreter + * process the reply. Similarly if the RPC returned + * an error, and therefore the bulk will never arrive. + */ + if (!req->rq_bulk || req->rq_status < 0) { + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + ptlrpc_rqphase_move(req, RQ_PHASE_BULK); + } + + LASSERT(req->rq_phase == RQ_PHASE_BULK); + if (ptlrpc_client_bulk_active(req)) + continue; + + if (req->rq_bulk->bd_failure) { + /* + * The RPC reply arrived OK, but the bulk screwed + * up! Dead weird since the server told us the RPC + * was good after getting the REPLY for her GET or + * the ACK for her PUT. + */ + DEBUG_REQ(D_ERROR, req, "bulk transfer failed %d/%d/%d", + req->rq_status, + req->rq_bulk->bd_nob, + req->rq_bulk->bd_nob_transferred); + req->rq_status = -EIO; + } + + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + +interpret: + LASSERT(req->rq_phase == RQ_PHASE_INTERPRET); + + /* + * This moves to "unregistering" phase we need to wait for + * reply unlink. + */ + if (!unregistered && !ptlrpc_unregister_reply(req, async)) { + /* start async bulk unlink too */ + ptlrpc_unregister_bulk(req, 1); + continue; + } + + if (!ptlrpc_unregister_bulk(req, async)) + continue; + + /* + * When calling interpret receiving already should be + * finished. 
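Editor's illustration (not part of the patch): a simplified model of the phase decision made above once a reply has been processed. Requests without a bulk transfer, or whose RPC already returned an error, go straight to the interpret phase; the TOY_* enum is an invented stand-in for the RQ_PHASE_* values used in this file.

#include <assert.h>

enum toy_phase { TOY_NEW, TOY_RPC, TOY_BULK, TOY_INTERPRET, TOY_COMPLETE };

/*
 * After the reply has been processed: no bulk, or a failed RPC, means the
 * bulk will never arrive, so the request can be interpreted immediately;
 * otherwise the bulk transfer still has to complete first.
 */
static enum toy_phase toy_phase_after_reply(int has_bulk, int status)
{
	if (!has_bulk || status < 0)
		return TOY_INTERPRET;
	return TOY_BULK;
}

int main(void)
{
	assert(toy_phase_after_reply(0, 0) == TOY_INTERPRET);
	assert(toy_phase_after_reply(1, -5) == TOY_INTERPRET);
	assert(toy_phase_after_reply(1, 0) == TOY_BULK);
	return 0;
}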
+ */ + LASSERT(!req->rq_receiving_reply); + + ptlrpc_req_interpret(env, req, req->rq_status); + + if (ptlrpcd_check_work(req)) { + atomic_dec(&set->set_remaining); + continue; + } + ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE); + + if (req->rq_reqmsg) + CDEBUG(D_RPCTRACE, + "Completed RPC req@%p pname:cluuid:pid:xid:nid:opc:job %s:%s:%d:%llu:%s:%d:%s\n", + req, current->comm, + imp->imp_obd->obd_uuid.uuid, + lustre_msg_get_status(req->rq_reqmsg), + req->rq_xid, + obd_import_nid2str(imp), + lustre_msg_get_opc(req->rq_reqmsg), + lustre_msg_get_jobid(req->rq_reqmsg) ?: ""); + + spin_lock(&imp->imp_lock); + /* + * Request already may be not on sending or delaying list. This + * may happen in the case of marking it erroneous for the case + * ptlrpc_import_delay_req(req, status) find it impossible to + * allow sending this rpc and returns *status != 0. + */ + if (!list_empty(&req->rq_list)) { + list_del_init(&req->rq_list); + if (atomic_dec_and_test(&imp->imp_inflight)) + wake_up(&imp->imp_recovery_waitq); + } + list_del_init(&req->rq_unreplied_list); + spin_unlock(&imp->imp_lock); + + atomic_dec(&set->set_remaining); + wake_up(&imp->imp_recovery_waitq); + + if (set->set_producer) { + /* produce a new request if possible */ + if (ptlrpc_set_producer(set) > 0) + force_timer_recalc = 1; + + /* + * free the request that has just been completed + * in order not to pollute set->set_requests + */ + list_del_init(&req->rq_set_chain); + spin_lock(&req->rq_lock); + req->rq_set = NULL; + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + + /* record rq_status to compute the final status later */ + if (req->rq_status != 0) + set->set_rc = req->rq_status; + ptlrpc_req_finished(req); + } else { + list_move_tail(&req->rq_set_chain, &comp_reqs); + } + } + + /* + * move completed request at the head of list so it's easier for + * caller to find them + */ + list_splice(&comp_reqs, &set->set_requests); + + /* If we hit an error, we want to recover promptly. */ + RETURN(atomic_read(&set->set_remaining) == 0 || force_timer_recalc); +} +EXPORT_SYMBOL(ptlrpc_check_set); + +/** + * Time out request \a req. is \a async_unlink is set, that means do not wait + * until LNet actually confirms network buffer unlinking. + * Return 1 if we should give up further retrying attempts or 0 otherwise. + */ +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) +{ + struct obd_import *imp = req->rq_import; + unsigned int debug_mask = D_RPCTRACE; + int rc = 0; + + ENTRY; + spin_lock(&req->rq_lock); + req->rq_timedout = 1; + spin_unlock(&req->rq_lock); + + if (ptlrpc_console_allow(req, lustre_msg_get_opc(req->rq_reqmsg), + lustre_msg_get_status(req->rq_reqmsg))) + debug_mask = D_WARNING; + DEBUG_REQ(debug_mask, req, "Request sent has %s: [sent %lld/real %lld]", + req->rq_net_err ? "failed due to network error" : + ((req->rq_real_sent == 0 || + req->rq_real_sent < req->rq_sent || + req->rq_real_sent >= req->rq_deadline) ? + "timed out for sent delay" : "timed out for slow reply"), + req->rq_sent, req->rq_real_sent); + + if (imp && obd_debug_peer_on_timeout) + LNetDebugPeer(&imp->imp_connection->c_peer); + + ptlrpc_unregister_reply(req, async_unlink); + ptlrpc_unregister_bulk(req, async_unlink); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + if (!imp) { + DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); + RETURN(1); + } + + atomic_inc(&imp->imp_timeouts); + + /* The DLM server doesn't want recovery run on its imports. 
*/ + if (imp->imp_dlm_fake) + RETURN(1); + + /* + * If this request is for recovery or other primordial tasks, + * then error it out here. + */ + if (req->rq_ctx_init || req->rq_ctx_fini || + req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov) { + DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)", + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); + spin_lock(&req->rq_lock); + req->rq_status = -ETIMEDOUT; + req->rq_err = 1; + spin_unlock(&req->rq_lock); + RETURN(1); + } + + /* + * if a request can't be resent we can't wait for an answer after + * the timeout + */ + if (ptlrpc_no_resend(req)) { + DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:"); + rc = 1; + } + + ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg)); + + RETURN(rc); +} + +/** + * Time out all uncompleted requests in request set pointed by \a data + * This is called when a wait times out. + */ +void ptlrpc_expired_set(struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + time64_t now = ktime_get_real_seconds(); + + ENTRY; + LASSERT(set != NULL); + + /* + * A timeout expired. See which reqs it applies to... + */ + list_for_each_entry(req, &set->set_requests, rq_set_chain) { + /* don't expire request waiting for context */ + if (req->rq_wait_ctx) + continue; + + /* Request in-flight? */ + if (!((req->rq_phase == RQ_PHASE_RPC && + !req->rq_waiting && !req->rq_resend) || + (req->rq_phase == RQ_PHASE_BULK))) + continue; + + if (req->rq_timedout || /* already dealt with */ + req->rq_deadline > now) /* not expired */ + continue; + + /* + * Deal with this guy. Do it asynchronously to not block + * ptlrpcd thread. + */ + ptlrpc_expire_one_request(req, 1); + /* + * Loops require that we resched once in a while to avoid + * RCU stalls and a few other problems. + */ + cond_resched(); + + } +} + +/** + * Interrupts (sets interrupted flag) all uncompleted requests in + * a set \a data. This is called when a wait_event is interrupted + * by a signal. + */ +static void ptlrpc_interrupted_set(struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + + LASSERT(set != NULL); + CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set); + + list_for_each_entry(req, &set->set_requests, rq_set_chain) { + if (req->rq_intr) + continue; + + if (req->rq_phase != RQ_PHASE_RPC && + req->rq_phase != RQ_PHASE_UNREG_RPC && + !req->rq_allow_intr) + continue; + + spin_lock(&req->rq_lock); + req->rq_intr = 1; + spin_unlock(&req->rq_lock); + } +} + +/** + * Get the smallest timeout in the set; this does NOT set a timeout. + */ +time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) +{ + time64_t now = ktime_get_real_seconds(); + int timeout = 0; + struct ptlrpc_request *req; + time64_t deadline; + + ENTRY; + list_for_each_entry(req, &set->set_requests, rq_set_chain) { + /* Request in-flight? */ + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + continue; + + /* Already timed out. */ + if (req->rq_timedout) + continue; + + /* Waiting for ctx. 
*/ + if (req->rq_wait_ctx) + continue; + + if (req->rq_phase == RQ_PHASE_NEW) + deadline = req->rq_sent; + else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend) + deadline = req->rq_sent; + else + deadline = req->rq_sent + req->rq_timeout; + + if (deadline <= now) /* actually expired already */ + timeout = 1; /* ASAP */ + else if (timeout == 0 || timeout > deadline - now) + timeout = deadline - now; + } + RETURN(timeout); +} + +/** + * Send all unset request from the set and then wait untill all + * requests in the set complete (either get a reply, timeout, get an + * error or otherwise be interrupted). + * Returns 0 on success or error code otherwise. + */ +int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + time64_t timeout; + int rc; + + ENTRY; + if (set->set_producer) + (void)ptlrpc_set_producer(set); + else + list_for_each_entry(req, &set->set_requests, rq_set_chain) { + if (req->rq_phase == RQ_PHASE_NEW) + (void)ptlrpc_send_new_req(req); + } + + if (list_empty(&set->set_requests)) + RETURN(0); + + do { + timeout = ptlrpc_set_next_timeout(set); + + /* + * wait until all complete, interrupted, or an in-flight + * req times out + */ + CDEBUG(D_RPCTRACE, "set %p going to sleep for %lld seconds\n", + set, timeout); + + if ((timeout == 0 && !signal_pending(current)) || + set->set_allow_intr) { + /* + * No requests are in-flight (ether timed out + * or delayed), so we can allow interrupts. + * We still want to block for a limited time, + * so we allow interrupts during the timeout. + */ + rc = l_wait_event_abortable_timeout( + set->set_waitq, + ptlrpc_check_set(NULL, set), + cfs_time_seconds(timeout ? timeout : 1)); + if (rc == 0) { + rc = -ETIMEDOUT; + ptlrpc_expired_set(set); + } else if (rc < 0) { + rc = -EINTR; + ptlrpc_interrupted_set(set); + } else { + rc = 0; + } + } else { + /* + * At least one request is in flight, so no + * interrupts are allowed. Wait until all + * complete, or an in-flight req times out. + */ + rc = wait_event_idle_timeout( + set->set_waitq, + ptlrpc_check_set(NULL, set), + cfs_time_seconds(timeout ? timeout : 1)); + if (rc == 0) { + ptlrpc_expired_set(set); + rc = -ETIMEDOUT; + } else { + rc = 0; + } + + /* + * LU-769 - if we ignored the signal because + * it was already pending when we started, we + * need to handle it now or we risk it being + * ignored forever + */ + if (rc == -ETIMEDOUT && + signal_pending(current)) { + sigset_t old, new; + + siginitset(&new, LUSTRE_FATAL_SIGS); + sigprocmask(SIG_BLOCK, &new, &old); + /* + * In fact we only interrupt for the + * "fatal" signals like SIGINT or + * SIGKILL. We still ignore less + * important signals since ptlrpc set + * is not easily reentrant from + * userspace again + */ + if (signal_pending(current)) + ptlrpc_interrupted_set(set); + sigprocmask(SIG_SETMASK, &old, NULL); + } + } + + LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT); + + /* + * -EINTR => all requests have been flagged rq_intr so next + * check completes. + * -ETIMEDOUT => someone timed out. When all reqs have + * timed out, signals are enabled allowing completion with + * EINTR. + * I don't really care if we go once more round the loop in + * the error cases -eeb. 
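Editor's illustration (not part of the patch): the deadline scan performed by ptlrpc_set_next_timeout() above, reduced to a standalone helper over an array of absolute deadlines. A return of 0 means nothing is in flight, 1 means some request has already expired, otherwise the smallest remaining wait in seconds is returned. The toy_* names are invented for the sketch.

#include <assert.h>

typedef long long time64;

static time64 toy_next_timeout(const time64 *deadlines, int n, time64 now)
{
	time64 timeout = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (deadlines[i] <= now)
			return 1;		/* already expired: recheck ASAP */
		if (timeout == 0 || timeout > deadlines[i] - now)
			timeout = deadlines[i] - now;
	}
	return timeout;
}

int main(void)
{
	time64 d[] = { 130, 105, 160 };

	assert(toy_next_timeout(d, 3, 100) == 5);	/* soonest is 5s away */
	assert(toy_next_timeout(d, 3, 150) == 1);	/* one already expired */
	assert(toy_next_timeout(d, 0, 100) == 0);	/* nothing in flight */
	return 0;
}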
+ */ + if (rc == 0 && atomic_read(&set->set_remaining) == 0) { + list_for_each_entry(req, &set->set_requests, + rq_set_chain) { + spin_lock(&req->rq_lock); + req->rq_invalid_rqset = 1; + spin_unlock(&req->rq_lock); + } + } + } while (rc != 0 || atomic_read(&set->set_remaining) != 0); + + LASSERT(atomic_read(&set->set_remaining) == 0); + + rc = set->set_rc; /* rq_status of already freed requests if any */ + list_for_each_entry(req, &set->set_requests, rq_set_chain) { + LASSERT(req->rq_phase == RQ_PHASE_COMPLETE); + if (req->rq_status != 0) + rc = req->rq_status; + } + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_set_wait); + +/** + * Helper fuction for request freeing. + * Called when request count reached zero and request needs to be freed. + * Removes request from all sorts of sending/replay lists it might be on, + * frees network buffers if any are present. + * If \a locked is set, that means caller is already holding import imp_lock + * and so we no longer need to reobtain it (for certain lists manipulations) + */ +static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) +{ + ENTRY; + + if (!request) + RETURN_EXIT; + + LASSERT(!request->rq_srv_req); + LASSERT(request->rq_export == NULL); + LASSERTF(!request->rq_receiving_reply, "req %p\n", request); + LASSERTF(list_empty(&request->rq_list), "req %p\n", request); + LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request); + LASSERTF(!request->rq_replay, "req %p\n", request); + + req_capsule_fini(&request->rq_pill); + + /* + * We must take it off the imp_replay_list first. Otherwise, we'll set + * request->rq_reqmsg to NULL while osc_close is dereferencing it. + */ + if (request->rq_import) { + if (!locked) + spin_lock(&request->rq_import->imp_lock); + list_del_init(&request->rq_replay_list); + list_del_init(&request->rq_unreplied_list); + if (!locked) + spin_unlock(&request->rq_import->imp_lock); + } + LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request); + + if (atomic_read(&request->rq_refcount) != 0) { + DEBUG_REQ(D_ERROR, request, + "freeing request with nonzero refcount"); + LBUG(); + } + + if (request->rq_repbuf) + sptlrpc_cli_free_repbuf(request); + + if (request->rq_import) { + if (!ptlrpcd_check_work(request)) { + LASSERT(atomic_read(&request->rq_import->imp_reqs) > 0); + atomic_dec(&request->rq_import->imp_reqs); + } + class_import_put(request->rq_import); + request->rq_import = NULL; + } + if (request->rq_bulk) + ptlrpc_free_bulk(request->rq_bulk); + + if (request->rq_reqbuf || request->rq_clrbuf) + sptlrpc_cli_free_reqbuf(request); + + if (request->rq_cli_ctx) + sptlrpc_req_put_ctx(request, !locked); + + if (request->rq_pool) + __ptlrpc_free_req_to_pool(request); + else + ptlrpc_request_cache_free(request); + EXIT; +} + +static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked); +/** + * Drop one request reference. Must be called with import imp_lock held. + * When reference count drops to zero, request is freed. + */ +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request) +{ + assert_spin_locked(&request->rq_import->imp_lock); + (void)__ptlrpc_req_finished(request, 1); +} + +/** + * Helper function + * Drops one reference count for request \a request. + * \a locked set indicates that caller holds import imp_lock. + * Frees the request whe reference count reaches zero. 
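Editor's illustration (not part of the patch): the last-put-frees reference counting pattern that ptlrpc_request_addref() and __ptlrpc_req_finished() implement for ptlrpc requests, shown as a self-contained userspace sketch with invented toy_* names and no locking.

#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>

struct toy_req {
	int refcount;
};

static struct toy_req *toy_req_get(struct toy_req *req)
{
	req->refcount++;
	return req;
}

/* Returns true when the last reference was dropped and the object freed. */
static bool toy_req_put(struct toy_req *req)
{
	if (--req->refcount > 0)
		return false;
	free(req);
	return true;
}

int main(void)
{
	struct toy_req *req = calloc(1, sizeof(*req));

	if (!req)
		return 1;
	req->refcount = 1;		/* caller's initial reference */
	toy_req_get(req);		/* e.g. a set taking its own reference */
	assert(!toy_req_put(req));	/* still referenced by the caller */
	assert(toy_req_put(req));	/* last put frees */
	return 0;
}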
+ * + * \retval 1 the request is freed + * \retval 0 some others still hold references on the request + */ +static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked) +{ + int count; + + ENTRY; + if (!request) + RETURN(1); + + LASSERT(request != LP_POISON); + LASSERT(request->rq_reqmsg != LP_POISON); + + DEBUG_REQ(D_INFO, request, "refcount now %u", + atomic_read(&request->rq_refcount) - 1); + + spin_lock(&request->rq_lock); + count = atomic_dec_return(&request->rq_refcount); + LASSERTF(count >= 0, "Invalid ref count %d\n", count); + + /* + * For open RPC, the client does not know the EA size (LOV, ACL, and + * so on) before replied, then the client has to reserve very large + * reply buffer. Such buffer will not be released until the RPC freed. + * Since The open RPC is replayable, we need to keep it in the replay + * list until close. If there are a lot of files opened concurrently, + * then the client may be OOM. + * + * If fact, it is unnecessary to keep reply buffer for open replay, + * related EAs have already been saved via mdc_save_lovea() before + * coming here. So it is safe to free the reply buffer some earlier + * before releasing the RPC to avoid client OOM. LU-9514 + */ + if (count == 1 && request->rq_early_free_repbuf && request->rq_repbuf) { + spin_lock(&request->rq_early_free_lock); + sptlrpc_cli_free_repbuf(request); + request->rq_repbuf = NULL; + request->rq_repbuf_len = 0; + request->rq_repdata = NULL; + request->rq_reqdata_len = 0; + spin_unlock(&request->rq_early_free_lock); + } + spin_unlock(&request->rq_lock); + + if (!count) + __ptlrpc_free_req(request, locked); + + RETURN(!count); +} + +/** + * Drops one reference count for a request. + */ +void ptlrpc_req_finished(struct ptlrpc_request *request) +{ + __ptlrpc_req_finished(request, 0); +} +EXPORT_SYMBOL(ptlrpc_req_finished); + +/** + * Returns xid of a \a request + */ +__u64 ptlrpc_req_xid(struct ptlrpc_request *request) +{ + return request->rq_xid; +} +EXPORT_SYMBOL(ptlrpc_req_xid); + +/** + * Disengage the client's reply buffer from the network + * NB does _NOT_ unregister any client-side bulk. + * IDEMPOTENT, but _not_ safe against concurrent callers. + * The request owner (i.e. the thread doing the I/O) must call... + * Returns 0 on success or 1 if unregistering cannot be made. + */ +static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) +{ + bool discard = false; + /* + * Might sleep. + */ + LASSERT(!in_interrupt()); + + /* Let's setup deadline for reply unlink. */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + async && request->rq_reply_deadline == 0 && cfs_fail_val == 0) + request->rq_reply_deadline = ktime_get_real_seconds() + + PTLRPC_REQ_LONG_UNLINK; + + /* + * Nothing left to do. + */ + if (!__ptlrpc_cli_wait_unlink(request, &discard)) + RETURN(1); + + LNetMDUnlink(request->rq_reply_md_h); + + if (discard) /* Discard the request-out callback */ + __LNetMDUnlink(request->rq_req_md_h, discard); + + /* + * Let's check it once again. + */ + if (!ptlrpc_cli_wait_unlink(request)) + RETURN(1); + + /* Move to "Unregistering" phase as reply was not unlinked yet. */ + ptlrpc_rqphase_move(request, RQ_PHASE_UNREG_RPC); + + /* + * Do not wait for unlink to finish. + */ + if (async) + RETURN(0); + + /* + * We have to wait_event_idle_timeout() whatever the result, to get + * a chance to run reply_in_callback(), and to make sure we've + * unlinked before returning a req to the pool. + */ + for (;;) { + wait_queue_head_t *wq = (request->rq_set) ? 
+ &request->rq_set->set_waitq : + &request->rq_reply_waitq; + int seconds = PTLRPC_REQ_LONG_UNLINK; + /* + * Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs + */ + while (seconds > 0 && + wait_event_idle_timeout( + *wq, + !ptlrpc_cli_wait_unlink(request), + cfs_time_seconds(1)) == 0) + seconds -= 1; + if (seconds > 0) { + ptlrpc_rqphase_move(request, request->rq_next_phase); + RETURN(1); + } + + DEBUG_REQ(D_WARNING, request, + "Unexpectedly long timeout receiving_reply=%d req_ulinked=%d reply_unlinked=%d", + request->rq_receiving_reply, + request->rq_req_unlinked, + request->rq_reply_unlinked); + } + RETURN(0); +} + +static void ptlrpc_free_request(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_commit_cb) + req->rq_commit_cb(req); + list_del_init(&req->rq_replay_list); + + __ptlrpc_req_finished(req, 1); +} + +/** + * the request is committed and dropped from the replay list of its import + */ +void ptlrpc_request_committed(struct ptlrpc_request *req, int force) +{ + struct obd_import *imp = req->rq_import; + + spin_lock(&imp->imp_lock); + if (list_empty(&req->rq_replay_list)) { + spin_unlock(&imp->imp_lock); + return; + } + + if (force || req->rq_transno <= imp->imp_peer_committed_transno) { + if (imp->imp_replay_cursor == &req->rq_replay_list) + imp->imp_replay_cursor = req->rq_replay_list.next; + ptlrpc_free_request(req); + } + + spin_unlock(&imp->imp_lock); +} +EXPORT_SYMBOL(ptlrpc_request_committed); + +/** + * Iterates through replay_list on import and prunes + * all requests have transno smaller than last_committed for the + * import and don't have rq_replay set. + * Since requests are sorted in transno order, stops when meeting first + * transno bigger than last_committed. 
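Editor's illustration (not part of the patch): the pruning rule of ptlrpc_free_committed(), modelled over a transno-sorted array. Entries the server has committed are freed unless they are explicitly kept for replay, and the scan stops at the first uncommitted transno since later entries cannot be committed either. The toy_* names are invented for the sketch.

#include <assert.h>
#include <stdbool.h>

typedef unsigned long long u64;

struct toy_req {
	u64 transno;
	bool keep_for_replay;	/* analogous to rq_replay */
	bool freed;
};

static void toy_free_committed(struct toy_req *reqs, int n, u64 committed)
{
	int i;

	for (i = 0; i < n; i++) {
		if (reqs[i].transno > committed)
			break;			/* not committed yet: stop */
		if (!reqs[i].keep_for_replay)
			reqs[i].freed = true;
	}
}

int main(void)
{
	struct toy_req r[] = {
		{ .transno = 10 },
		{ .transno = 11, .keep_for_replay = true },
		{ .transno = 12 },
		{ .transno = 20 },
	};

	toy_free_committed(r, 4, 15);
	assert(r[0].freed && !r[1].freed && r[2].freed && !r[3].freed);
	return 0;
}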
+ * caller must hold imp->imp_lock + */ +void ptlrpc_free_committed(struct obd_import *imp) +{ + struct ptlrpc_request *req, *saved; + struct ptlrpc_request *last_req = NULL; /* temporary fire escape */ + bool skip_committed_list = true; + unsigned int replay_scanned = 0, replay_freed = 0; + unsigned int commit_scanned = 0, commit_freed = 0; + unsigned int debug_level = D_INFO; + __u64 peer_committed_transno; + int imp_generation; + time64_t start, now; + + ENTRY; + LASSERT(imp != NULL); + assert_spin_locked(&imp->imp_lock); + + start = ktime_get_seconds(); + /* save these here, we can potentially drop imp_lock after checking */ + peer_committed_transno = imp->imp_peer_committed_transno; + imp_generation = imp->imp_generation; + + if (peer_committed_transno == imp->imp_last_transno_checked && + imp_generation == imp->imp_last_generation_checked) { + CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n", + imp->imp_obd->obd_name, peer_committed_transno); + RETURN_EXIT; + } + CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n", + imp->imp_obd->obd_name, peer_committed_transno, imp_generation); + + if (imp_generation != imp->imp_last_generation_checked || + imp->imp_last_transno_checked == 0) + skip_committed_list = false; + /* maybe drop imp_lock here, if another lock protected the lists */ + + list_for_each_entry_safe(req, saved, &imp->imp_replay_list, + rq_replay_list) { + /* XXX ok to remove when 1357 resolved - rread 05/29/03 */ + LASSERT(req != last_req); + last_req = req; + + if (req->rq_transno == 0) { + DEBUG_REQ(D_EMERG, req, "zero transno during replay"); + LBUG(); + } + + /* If other threads are waiting on imp_lock, stop processing + * in this thread. Another thread can finish remaining work. + * This may happen if there are huge numbers of open files + * that are closed suddenly or evicted, or if the server + * commit interval is very high vs. RPC rate. + */ + if (++replay_scanned % 2048 == 0) { + now = ktime_get_seconds(); + if (now > start + 5) + debug_level = D_WARNING; + + if ((replay_freed > 128 && now > start + 3) && + atomic_read(&imp->imp_waiting)) { + if (debug_level == D_INFO) + debug_level = D_RPCTRACE; + break; + } + } + + if (req->rq_import_generation < imp_generation) { + DEBUG_REQ(D_RPCTRACE, req, "free request with old gen"); + GOTO(free_req, 0); + } + + /* not yet committed */ + if (req->rq_transno > peer_committed_transno) { + DEBUG_REQ(D_RPCTRACE, req, "stopping search"); + break; + } + + if (req->rq_replay) { + DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)"); + list_move_tail(&req->rq_replay_list, + &imp->imp_committed_list); + continue; + } + + DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)", + peer_committed_transno); +free_req: + replay_freed++; + ptlrpc_free_request(req); + } + + if (skip_committed_list) + GOTO(out, 0); + + list_for_each_entry_safe(req, saved, &imp->imp_committed_list, + rq_replay_list) { + LASSERT(req->rq_transno != 0); + + /* If other threads are waiting on imp_lock, stop processing + * in this thread. Another thread can finish remaining work. */ + if (++commit_scanned % 2048 == 0) { + now = ktime_get_seconds(); + if (now > start + 6) + debug_level = D_WARNING; + + if ((commit_freed > 128 && now > start + 4) && + atomic_read(&imp->imp_waiting)) { + if (debug_level == D_INFO) + debug_level = D_RPCTRACE; + break; + } + } + + if (req->rq_import_generation < imp_generation || + !req->rq_replay) { + DEBUG_REQ(D_RPCTRACE, req, "free %s open request", + req->rq_import_generation < + imp_generation ? 
"stale" : "closed"); + + if (imp->imp_replay_cursor == &req->rq_replay_list) + imp->imp_replay_cursor = + req->rq_replay_list.next; + + commit_freed++; + ptlrpc_free_request(req); + } + } +out: + /* if full lists processed without interruption, avoid next scan */ + if (debug_level == D_INFO) { + imp->imp_last_transno_checked = peer_committed_transno; + imp->imp_last_generation_checked = imp_generation; + } + + CDEBUG_LIMIT(debug_level, + "%s: %s: skip=%u replay=%u/%u committed=%u/%u\n", + imp->imp_obd->obd_name, + debug_level == D_INFO ? "normal" : "overloaded", + skip_committed_list, replay_freed, replay_scanned, + commit_freed, commit_scanned); + EXIT; +} + +void ptlrpc_cleanup_client(struct obd_import *imp) +{ + ENTRY; + EXIT; +} + +/** + * Schedule previously sent request for resend. + * For bulk requests we assign new xid (to avoid problems with + * lost replies and therefore several transfers landing into same buffer + * from different sending attempts). + */ +void ptlrpc_resend_req(struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "going to resend"); + spin_lock(&req->rq_lock); + + /* + * Request got reply but linked to the import list still. + * Let ptlrpc_check_set() process it. + */ + if (ptlrpc_client_replied(req)) { + spin_unlock(&req->rq_lock); + DEBUG_REQ(D_HA, req, "it has reply, so skip it"); + return; + } + + req->rq_status = -EAGAIN; + + req->rq_resend = 1; + req->rq_net_err = 0; + req->rq_timedout = 0; + + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} + +/* XXX: this function and rq_status are currently unused */ +void ptlrpc_restart_req(struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request"); + req->rq_status = -ERESTARTSYS; + + spin_lock(&req->rq_lock); + req->rq_restart = 1; + req->rq_timedout = 0; + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} + +/** + * Grab additional reference on a request \a req + */ +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req) +{ + ENTRY; + atomic_inc(&req->rq_refcount); + RETURN(req); +} +EXPORT_SYMBOL(ptlrpc_request_addref); + +/** + * Add a request to import replay_list. + * Must be called under imp_lock + */ +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp) +{ + struct ptlrpc_request *iter; + + assert_spin_locked(&imp->imp_lock); + + if (req->rq_transno == 0) { + DEBUG_REQ(D_EMERG, req, "saving request with zero transno"); + LBUG(); + } + + /* + * clear this for new requests that were resent as well + * as resent replayed requests. + */ + lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT); + + /* don't re-add requests that have been replayed */ + if (!list_empty(&req->rq_replay_list)) + return; + + lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY); + + spin_lock(&req->rq_lock); + req->rq_resend = 0; + spin_unlock(&req->rq_lock); + + LASSERT(imp->imp_replayable); + /* Balanced in ptlrpc_free_committed, usually. */ + ptlrpc_request_addref(req); + list_for_each_entry_reverse(iter, &imp->imp_replay_list, + rq_replay_list) { + /* + * We may have duplicate transnos if we create and then + * open a file, or for closes retained if to match creating + * opens, so use req->rq_xid as a secondary key. + * (See bugs 684, 685, and 428.) + * XXX no longer needed, but all opens need transnos! 
+ */ + if (iter->rq_transno > req->rq_transno) + continue; + + if (iter->rq_transno == req->rq_transno) { + LASSERT(iter->rq_xid != req->rq_xid); + if (iter->rq_xid > req->rq_xid) + continue; + } + + list_add(&req->rq_replay_list, &iter->rq_replay_list); + return; + } + + list_add(&req->rq_replay_list, &imp->imp_replay_list); +} + +/** + * Send request and wait until it completes. + * Returns request processing status. + */ +int ptlrpc_queue_wait(struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set; + int rc; + + ENTRY; + LASSERT(req->rq_set == NULL); + LASSERT(!req->rq_receiving_reply); + + set = ptlrpc_prep_set(); + if (!set) { + CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM); + RETURN(-ENOMEM); + } + + /* for distributed debugging */ + lustre_msg_set_status(req->rq_reqmsg, current->pid); + + /* add a ref for the set (see comment in ptlrpc_set_add_req) */ + ptlrpc_request_addref(req); + ptlrpc_set_add_req(set, req); + rc = ptlrpc_set_wait(NULL, set); + ptlrpc_set_destroy(set); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_queue_wait); + +/** + * Callback used for replayed requests reply processing. + * In case of successful reply calls registered request replay callback. + * In case of error restart replay process. + */ +static int ptlrpc_replay_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + struct ptlrpc_replay_async_args *aa = args; + struct obd_import *imp = req->rq_import; + + ENTRY; + atomic_dec(&imp->imp_replay_inflight); + + /* + * Note: if it is bulk replay (MDS-MDS replay), then even if + * server got the request, but bulk transfer timeout, let's + * replay the bulk req again + */ + if (!ptlrpc_client_replied(req) || + (req->rq_bulk && + lustre_msg_get_status(req->rq_repmsg) == -ETIMEDOUT)) { + DEBUG_REQ(D_ERROR, req, "request replay timed out"); + GOTO(out, rc = -ETIMEDOUT); + } + + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR && + (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN || + lustre_msg_get_status(req->rq_repmsg) == -ENODEV)) + GOTO(out, rc = lustre_msg_get_status(req->rq_repmsg)); + + /** VBR: check version failure */ + if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) { + /** replay was failed due to version mismatch */ + DEBUG_REQ(D_WARNING, req, "Version mismatch during replay"); + spin_lock(&imp->imp_lock); + imp->imp_vbr_failed = 1; + spin_unlock(&imp->imp_lock); + lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); + } else { + /** The transno had better not change over replay. 
*/ + LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) == + lustre_msg_get_transno(req->rq_repmsg) || + lustre_msg_get_transno(req->rq_repmsg) == 0, + "%#llx/%#llx\n", + lustre_msg_get_transno(req->rq_reqmsg), + lustre_msg_get_transno(req->rq_repmsg)); + } + + spin_lock(&imp->imp_lock); + imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); + spin_unlock(&imp->imp_lock); + LASSERT(imp->imp_last_replay_transno); + + /* transaction number shouldn't be bigger than the latest replayed */ + if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) { + DEBUG_REQ(D_ERROR, req, + "Reported transno=%llu is bigger than replayed=%llu", + req->rq_transno, + lustre_msg_get_transno(req->rq_reqmsg)); + GOTO(out, rc = -EINVAL); + } + + DEBUG_REQ(D_HA, req, "got reply"); + + /* let the callback do fixups, possibly including in the request */ + if (req->rq_replay_cb) + req->rq_replay_cb(req); + + if (ptlrpc_client_replied(req) && + lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) { + DEBUG_REQ(D_ERROR, req, "status %d, old was %d", + lustre_msg_get_status(req->rq_repmsg), + aa->praa_old_status); + + /* + * Note: If the replay fails for MDT-MDT recovery, let's + * abort all of the following requests in the replay + * and sending list, because MDT-MDT update requests + * are dependent on each other, see LU-7039 + */ + if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) { + struct ptlrpc_request *free_req; + struct ptlrpc_request *tmp; + + spin_lock(&imp->imp_lock); + list_for_each_entry_safe(free_req, tmp, + &imp->imp_replay_list, + rq_replay_list) { + ptlrpc_free_request(free_req); + } + + list_for_each_entry_safe(free_req, tmp, + &imp->imp_committed_list, + rq_replay_list) { + ptlrpc_free_request(free_req); + } + + list_for_each_entry_safe(free_req, tmp, + &imp->imp_delayed_list, + rq_list) { + spin_lock(&free_req->rq_lock); + free_req->rq_err = 1; + free_req->rq_status = -EIO; + ptlrpc_client_wake_req(free_req); + spin_unlock(&free_req->rq_lock); + } + + list_for_each_entry_safe(free_req, tmp, + &imp->imp_sending_list, + rq_list) { + spin_lock(&free_req->rq_lock); + free_req->rq_err = 1; + free_req->rq_status = -EIO; + ptlrpc_client_wake_req(free_req); + spin_unlock(&free_req->rq_lock); + } + spin_unlock(&imp->imp_lock); + } + } else { + /* Put it back for re-replay. */ + lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); + } + + /* + * Errors while replay can set transno to 0, but + * imp_last_replay_transno shouldn't be set to 0 anyway + */ + if (req->rq_transno == 0) + CERROR("Transno is 0 during replay!\n"); + + /* continue with recovery */ + rc = ptlrpc_import_recovery_state_machine(imp); + out: + req->rq_send_state = aa->praa_old_state; + + if (rc != 0) + /* this replay failed, so restart recovery */ + ptlrpc_connect_import(imp); + + RETURN(rc); +} + +/** + * Prepares and queues request for replay. + * Adds it to ptlrpcd queue for actual sending. + * Returns 0 on success. 
+ */ +int ptlrpc_replay_req(struct ptlrpc_request *req) +{ + struct ptlrpc_replay_async_args *aa; + + ENTRY; + + LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); + + aa = ptlrpc_req_async_args(aa, req); + memset(aa, 0, sizeof(*aa)); + + /* Prepare request to be resent with ptlrpcd */ + aa->praa_old_state = req->rq_send_state; + req->rq_send_state = LUSTRE_IMP_REPLAY; + req->rq_phase = RQ_PHASE_NEW; + req->rq_next_phase = RQ_PHASE_UNDEFINED; + if (req->rq_repmsg) + aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); + req->rq_status = 0; + req->rq_interpret_reply = ptlrpc_replay_interpret; + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + + /* Tell server net_latency to calculate how long to wait for reply. */ + lustre_msg_set_service_timeout(req->rq_reqmsg, + ptlrpc_at_get_net_latency(req)); + DEBUG_REQ(D_HA, req, "REPLAY"); + + atomic_inc(&req->rq_import->imp_replay_inflight); + spin_lock(&req->rq_lock); + req->rq_early_free_repbuf = 0; + spin_unlock(&req->rq_lock); + ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ + + ptlrpcd_add_req(req); + RETURN(0); +} + +/** + * Aborts all in-flight request on import \a imp sending and delayed lists + */ +void ptlrpc_abort_inflight(struct obd_import *imp) +{ + struct ptlrpc_request *req; + ENTRY; + + /* + * Make sure that no new requests get processed for this import. + * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing + * this flag and then putting requests on sending_list or delayed_list. + */ + assert_spin_locked(&imp->imp_lock); + + /* + * XXX locking? Maybe we should remove each request with the list + * locked? Also, how do we know if the requests on the list are + * being freed at this time? + */ + list_for_each_entry(req, &imp->imp_sending_list, rq_list) { + DEBUG_REQ(D_RPCTRACE, req, "inflight"); + + spin_lock(&req->rq_lock); + if (req->rq_import_generation < imp->imp_generation) { + req->rq_err = 1; + req->rq_status = -EIO; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + } + + list_for_each_entry(req, &imp->imp_delayed_list, rq_list) { + DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req"); + + spin_lock(&req->rq_lock); + if (req->rq_import_generation < imp->imp_generation) { + req->rq_err = 1; + req->rq_status = -EIO; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + } + + /* + * Last chance to free reqs left on the replay list, but we + * will still leak reqs that haven't committed. + */ + if (imp->imp_replayable) + ptlrpc_free_committed(imp); + + EXIT; +} + +/** + * Abort all uncompleted requests in request set \a set + */ +void ptlrpc_abort_set(struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + + LASSERT(set != NULL); + + list_for_each_entry(req, &set->set_requests, rq_set_chain) { + spin_lock(&req->rq_lock); + if (req->rq_phase != RQ_PHASE_RPC) { + spin_unlock(&req->rq_lock); + continue; + } + + req->rq_err = 1; + req->rq_status = -EINTR; + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); + } +} + +/** + * Initialize the XID for the node. This is common among all requests on + * this node, and only requires the property that it is monotonically + * increasing. It does not need to be sequential. Since this is also used + * as the RDMA match bits, it is important that a single client NOT have + * the same match bits for two different in-flight requests, hence we do + * NOT want to have an XID per target or similar. 
+ * + * To avoid an unlikely collision between match bits after a client reboot + * (which would deliver old data into the wrong RDMA buffer) initialize + * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s. + * If the time is clearly incorrect, we instead use a 62-bit random number. + * In the worst case the random number will overflow 1M RPCs per second in + * 9133 years, or permutations thereof. + */ +#define YEAR_2004 (1ULL << 30) +void ptlrpc_init_xid(void) +{ + time64_t now = ktime_get_real_seconds(); + u64 xid; + + if (now < YEAR_2004) { + get_random_bytes(&xid, sizeof(xid)); + xid >>= 2; + xid |= (1ULL << 61); + } else { + xid = (u64)now << 20; + } + + /* Need to always be aligned to a power-of-two for mutli-bulk BRW */ + BUILD_BUG_ON((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) != + 0); + xid &= PTLRPC_BULK_OPS_MASK; + atomic64_set(&ptlrpc_last_xid, xid); +} + +/** + * Increase xid and returns resulting new value to the caller. + * + * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting + * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC + * itself uses the last bulk xid needed, so the server can determine the + * the number of bulk transfers from the RPC XID and a bitmask. The starting + * xid must align to a power-of-two value. + * + * This is assumed to be true due to the initial ptlrpc_last_xid + * value also being initialized to a power-of-two value. LU-1431 + */ +__u64 ptlrpc_next_xid(void) +{ + return atomic64_add_return(PTLRPC_BULK_OPS_COUNT, &ptlrpc_last_xid); +} + +/** + * If request has a new allocated XID (new request or EINPROGRESS resend), + * use this XID as matchbits of bulk, otherwise allocate a new matchbits for + * request to ensure previous bulk fails and avoid problems with lost replies + * and therefore several transfers landing into the same buffer from different + * sending attempts. + * Also, to avoid previous reply landing to a different sending attempt. + */ +void ptlrpc_set_mbits(struct ptlrpc_request *req) +{ + int md_count = req->rq_bulk ? req->rq_bulk->bd_md_count : 1; + + /* + * Generate new matchbits for all resend requests, including + * resend replay. + */ + if (req->rq_resend) { + __u64 old_mbits = req->rq_mbits; + + /* + * First time resend on -EINPROGRESS will generate new xid, + * so we can actually use the rq_xid as rq_mbits in such case, + * however, it's bit hard to distinguish such resend with a + * 'resend for the -EINPROGRESS resend'. To make it simple, + * we opt to generate mbits for all resend cases. + */ + if (OCD_HAS_FLAG(&req->rq_import->imp_connect_data, + BULK_MBITS)) { + req->rq_mbits = ptlrpc_next_xid(); + } else { + /* + * Old version transfers rq_xid to peer as + * matchbits. + */ + spin_lock(&req->rq_import->imp_lock); + list_del_init(&req->rq_unreplied_list); + ptlrpc_assign_next_xid_nolock(req); + spin_unlock(&req->rq_import->imp_lock); + req->rq_mbits = req->rq_xid; + } + CDEBUG(D_HA, "resend with new mbits old x%llu new x%llu\n", + old_mbits, req->rq_mbits); + } else if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { + /* Request being sent first time, use xid as matchbits. */ + if (OCD_HAS_FLAG(&req->rq_import->imp_connect_data, + BULK_MBITS) || req->rq_mbits == 0) + { + req->rq_mbits = req->rq_xid; + } else { + req->rq_mbits -= md_count - 1; + } + } else { + /* + * Replay request, xid and matchbits have already been + * correctly assigned. 
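[Editorial sketch] The XID/match-bits scheme described in the comments above can be checked with a short standalone program. Everything below is illustrative only: the bulk-ops window of 16 and the sample XID are assumptions for the demo, not the real PTLRPC_BULK_OPS_COUNT definition, but the arithmetic mirrors ptlrpc_next_xid() and ptlrpc_set_mbits(): XIDs advance in aligned power-of-two steps, the last bulk's match bits end up in rq_mbits, and the server recovers the starting XID and the bulk count by masking.

/* Illustrative only: how a power-of-two XID step lets the server recover
 * the number of bulk transfers from the final match bits. The window of
 * 16 and the sample XID are made-up values for the demo. */
#include <stdio.h>
#include <stdint.h>

#define DEMO_BULK_OPS_COUNT 16ULL                       /* power of two */
#define DEMO_BULK_OPS_MASK  (~(DEMO_BULK_OPS_COUNT - 1))

int main(void)
{
	uint64_t last_xid = (1ULL << 40) & DEMO_BULK_OPS_MASK; /* aligned */
	uint64_t xid = last_xid + DEMO_BULK_OPS_COUNT;  /* "next xid" step */
	int md_count = 3;                               /* bulks in one BRW */
	uint64_t mbits = xid + md_count - 1;            /* last bulk's mbits */

	/* Server side: mask off the low bits to find the starting xid;
	 * the offset within the window gives the bulk count. */
	uint64_t first = mbits & DEMO_BULK_OPS_MASK;

	printf("xid=%#llx mbits=%#llx first=%#llx count=%llu\n",
	       (unsigned long long)xid, (unsigned long long)mbits,
	       (unsigned long long)first,
	       (unsigned long long)(mbits - first + 1));
	return 0;
}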
+ */ + return; + } + + /* + * For multi-bulk RPCs, rq_mbits is the last mbits needed for bulks so + * that server can infer the number of bulks that were prepared, + * see LU-1431 + */ + req->rq_mbits += md_count - 1; + + /* + * Set rq_xid as rq_mbits to indicate the final bulk for the old + * server which does not support OBD_CONNECT_BULK_MBITS. LU-6808. + * + * It's ok to directly set the rq_xid here, since this xid bump + * won't affect the request position in unreplied list. + */ + if (!OCD_HAS_FLAG(&req->rq_import->imp_connect_data, BULK_MBITS)) + req->rq_xid = req->rq_mbits; +} + +/** + * Get a glimpse at what next xid value might have been. + * Returns possible next xid. + */ +__u64 ptlrpc_sample_next_xid(void) +{ + return atomic64_read(&ptlrpc_last_xid) + PTLRPC_BULK_OPS_COUNT; +} +EXPORT_SYMBOL(ptlrpc_sample_next_xid); + +/** + * Functions for operating ptlrpc workers. + * + * A ptlrpc work is a function which will be running inside ptlrpc context. + * The callback shouldn't sleep otherwise it will block that ptlrpcd thread. + * + * 1. after a work is created, it can be used many times, that is: + * handler = ptlrpcd_alloc_work(); + * ptlrpcd_queue_work(); + * + * queue it again when necessary: + * ptlrpcd_queue_work(); + * ptlrpcd_destroy_work(); + * 2. ptlrpcd_queue_work() can be called by multiple processes meanwhile, but + * it will only be queued once in any time. Also as its name implies, it may + * have delay before it really runs by ptlrpcd thread. + */ +struct ptlrpc_work_async_args { + int (*cb)(const struct lu_env *, void *); + void *cbdata; +}; + +static void ptlrpcd_add_work_req(struct ptlrpc_request *req) +{ + /* re-initialize the req */ + req->rq_timeout = obd_timeout; + req->rq_sent = ktime_get_real_seconds(); + req->rq_deadline = req->rq_sent + req->rq_timeout; + req->rq_phase = RQ_PHASE_INTERPRET; + req->rq_next_phase = RQ_PHASE_COMPLETE; + req->rq_xid = ptlrpc_next_xid(); + req->rq_import_generation = req->rq_import->imp_generation; + + ptlrpcd_add_req(req); +} + +static int work_interpreter(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct ptlrpc_work_async_args *arg = args; + + LASSERT(ptlrpcd_check_work(req)); + LASSERT(arg->cb != NULL); + + rc = arg->cb(env, arg->cbdata); + + list_del_init(&req->rq_set_chain); + req->rq_set = NULL; + + if (atomic_dec_return(&req->rq_refcount) > 1) { + atomic_set(&req->rq_refcount, 2); + ptlrpcd_add_work_req(req); + } + return rc; +} + +static int worker_format; + +static int ptlrpcd_check_work(struct ptlrpc_request *req) +{ + return req->rq_pill.rc_fmt == (void *)&worker_format; +} + +/** + * Create a work for ptlrpc. + */ +void *ptlrpcd_alloc_work(struct obd_import *imp, + int (*cb)(const struct lu_env *, void *), void *cbdata) +{ + struct ptlrpc_request *req = NULL; + struct ptlrpc_work_async_args *args; + + ENTRY; + might_sleep(); + + if (!cb) + RETURN(ERR_PTR(-EINVAL)); + + /* copy some code from deprecated fakereq. 
*/ + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!req) { + CERROR("ptlrpc: run out of memory!\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + + ptlrpc_cli_req_init(req); + + req->rq_send_state = LUSTRE_IMP_FULL; + req->rq_type = PTL_RPC_MSG_REQUEST; + req->rq_import = class_import_get(imp); + req->rq_interpret_reply = work_interpreter; + /* don't want reply */ + req->rq_no_delay = req->rq_no_resend = 1; + req->rq_pill.rc_fmt = (void *)&worker_format; + + args = ptlrpc_req_async_args(args, req); + args->cb = cb; + args->cbdata = cbdata; + + RETURN(req); +} +EXPORT_SYMBOL(ptlrpcd_alloc_work); + +void ptlrpcd_destroy_work(void *handler) +{ + struct ptlrpc_request *req = handler; + + if (req) + ptlrpc_req_finished(req); +} +EXPORT_SYMBOL(ptlrpcd_destroy_work); + +int ptlrpcd_queue_work(void *handler) +{ + struct ptlrpc_request *req = handler; + + /* + * Check if the req is already being queued. + * + * Here comes a trick: it lacks a way of checking if a req is being + * processed reliably in ptlrpc. Here I have to use refcount of req + * for this purpose. This is okay because the caller should use this + * req as opaque data. - Jinshan + */ + LASSERT(atomic_read(&req->rq_refcount) > 0); + if (atomic_inc_return(&req->rq_refcount) == 2) + ptlrpcd_add_work_req(req); + return 0; +} +EXPORT_SYMBOL(ptlrpcd_queue_work); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/connection.c b/drivers/staging/lustrefsx/lustre/ptlrpc/connection.c new file mode 100644 index 0000000000000..3f690ad652c0a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/connection.c @@ -0,0 +1,174 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static struct rhashtable conn_hash; + +/* + * struct lnet_process_id may contain unassigned bytes which might not + * be zero, so we cannot just hash and compare bytes. 
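[Editorial sketch] A minimal usage sketch of the ptlrpcd work API defined above (ptlrpcd_alloc_work(), ptlrpcd_queue_work(), ptlrpcd_destroy_work()). struct my_data, its fields and my_flush_cb() are hypothetical placeholders; only the ptlrpcd_*_work() calls come from this file. As the comment above notes, the callback runs in a ptlrpcd thread and must not sleep, and repeated queue calls collapse into a single pending run.

/* Hypothetical caller of the ptlrpcd work API; my_data and my_flush_cb
 * are illustrative names, not part of the patch. */
struct my_data {
	void		*md_work;	/* opaque handle from ptlrpcd_alloc_work() */
	atomic_t	 md_runs;	/* how many times the callback ran */
};

static int my_flush_cb(const struct lu_env *env, void *data)
{
	struct my_data *md = data;

	/* Runs in ptlrpcd context: keep it short and never sleep. */
	atomic_inc(&md->md_runs);
	return 0;
}

static int my_setup(struct obd_import *imp, struct my_data *md)
{
	void *work;

	work = ptlrpcd_alloc_work(imp, my_flush_cb, md);
	if (IS_ERR(work))
		return PTR_ERR(work);	/* -EINVAL or -ENOMEM */

	md->md_work = work;
	return 0;
}

/* Queue as often as needed; concurrent callers are collapsed so the
 * callback is only queued once at a time. */
static void my_kick(struct my_data *md)
{
	ptlrpcd_queue_work(md->md_work);
}

static void my_teardown(struct my_data *md)
{
	ptlrpcd_destroy_work(md->md_work);
	md->md_work = NULL;
}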
+ */ + +static u32 lnet_process_id_hash(const void *data, u32 len, u32 seed) +{ + const struct lnet_processid *lpi = data; + + seed = cfs_hash_32(seed ^ lpi->pid, 32); + seed = cfs_hash_32(nidhash(&lpi->nid) ^ seed, 32); + return seed; +} + +static int lnet_process_id_cmp(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct lnet_processid *lpi = arg->key; + const struct ptlrpc_connection *con = obj; + + if (nid_same(&lpi->nid, &con->c_peer.nid) && + lpi->pid == con->c_peer.pid) + return 0; + return -ESRCH; +} + +static const struct rhashtable_params conn_hash_params = { + .key_len = 1, /* actually variable-length */ + .key_offset = offsetof(struct ptlrpc_connection, c_peer), + .head_offset = offsetof(struct ptlrpc_connection, c_hash), + .hashfn = lnet_process_id_hash, + .obj_cmpfn = lnet_process_id_cmp, +}; + +struct ptlrpc_connection * +ptlrpc_connection_get(struct lnet_process_id peer4, lnet_nid_t self, + struct obd_uuid *uuid) +{ + struct ptlrpc_connection *conn, *conn2; + struct lnet_processid peer; + ENTRY; + + peer4.nid = LNetPrimaryNID(peer4.nid); + lnet_pid4_to_pid(peer4, &peer); + conn = rhashtable_lookup_fast(&conn_hash, &peer, conn_hash_params); + if (conn) { + ptlrpc_connection_addref(conn); + GOTO(out, conn); + } + + OBD_ALLOC_PTR(conn); + if (!conn) + RETURN(NULL); + + conn->c_peer = peer; + lnet_nid4_to_nid(self, &conn->c_self); + atomic_set(&conn->c_refcount, 1); + if (uuid) + obd_str2uuid(&conn->c_remote_uuid, uuid->uuid); + + /* + * Add the newly created conn to the hash, on key collision we + * lost a racing addition and must destroy our newly allocated + * connection. The object which exists in the hash will be + * returned,otherwise NULL is returned on success. + */ +try_again: + conn2 = rhashtable_lookup_get_insert_fast(&conn_hash, &conn->c_hash, + conn_hash_params); + if (conn2) { + /* insertion failed */ + if (IS_ERR(conn2)) { + /* hash table could be resizing. */ + if (PTR_ERR(conn2) == -ENOMEM || + PTR_ERR(conn2) == -EBUSY) { + msleep(5); + goto try_again; + } + conn2 = NULL; + } + OBD_FREE_PTR(conn); + conn = conn2; + if (conn) + ptlrpc_connection_addref(conn); + } + EXIT; +out: + CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nidstr(&conn->c_peer.nid)); + return conn; +} + +struct ptlrpc_connection * +ptlrpc_connection_addref(struct ptlrpc_connection *conn) +{ + ENTRY; + + atomic_inc(&conn->c_refcount); + CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nidstr(&conn->c_peer.nid)); + + RETURN(conn); +} + +static void +conn_exit(void *vconn, void *data) +{ + struct ptlrpc_connection *conn = vconn; + + /* + * Nothing should be left. Connection user put it and + * connection also was deleted from table by this time + * so we should have 0 refs. + */ + LASSERTF(atomic_read(&conn->c_refcount) == 0, + "Busy connection with %d refs\n", + atomic_read(&conn->c_refcount)); + OBD_FREE_PTR(conn); +} + +int ptlrpc_connection_init(void) +{ + return rhashtable_init(&conn_hash, &conn_hash_params); +} + +void ptlrpc_connection_fini(void) +{ + rhashtable_free_and_destroy(&conn_hash, conn_exit, NULL); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c b/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c new file mode 100644 index 0000000000000..987803be5b86f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c @@ -0,0 +1,411 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
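[Editorial sketch] The lookup-or-insert race handling in ptlrpc_connection_get() above follows a general rhashtable idiom. Below is a generic toy version of that idiom, assuming the usual <linux/rhashtable.h>, <linux/slab.h>, <linux/refcount.h> and <linux/delay.h> headers; struct toy_obj and its fields are made up. The key point is the return convention relied on above: NULL means our object was inserted, an ERR_PTR (e.g. -EBUSY or -ENOMEM while the table resizes) is worth retrying, and any other pointer is the object a racing thread inserted first, so we take a reference on it and free our own copy.

/* Generic toy version of the lookup-or-insert pattern; not Lustre code. */
struct toy_obj {
	u32			key;	/* made-up key */
	struct rhash_head	node;
	refcount_t		refs;
};

static const struct rhashtable_params toy_params = {
	.key_len	= sizeof(u32),
	.key_offset	= offsetof(struct toy_obj, key),
	.head_offset	= offsetof(struct toy_obj, node),
};

/* 'cand' must be kmalloc()ed by the caller; it is consumed unless it is
 * the object actually inserted and returned. */
static struct toy_obj *toy_get_or_insert(struct rhashtable *ht,
					 struct toy_obj *cand)
{
	struct toy_obj *old;

again:
	old = rhashtable_lookup_get_insert_fast(ht, &cand->node, toy_params);
	if (!old)
		return cand;			/* inserted, we won the race */

	if (IS_ERR(old)) {
		if (PTR_ERR(old) == -EBUSY || PTR_ERR(old) == -ENOMEM) {
			msleep(5);		/* table may be resizing */
			goto again;
		}
		kfree(cand);
		return NULL;			/* hard failure */
	}

	refcount_inc(&old->refs);		/* racing winner: take a ref */
	kfree(cand);				/* drop our duplicate */
	return old;
}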
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * GPL HEADER END + */ +/* + * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved. + * + * Copyright (c) 2013, Intel Corporation. + */ + +#include +#include + +#ifdef LUSTRE_TRANSLATE_ERRNOS +#include + +/* + * The two translation tables below must define a one-to-one mapping between + * host and network errnos. + * + * EAGAIN is equal to EAGAIN on all architectures except for parisc, which + * appears irrelevant. Thus, existing references to EAGAIN are fine. + * + * EDEADLOCK is equal to EDEADLK on x86 but not on sparc, at least. A sparc + * host has no context-free way to determine if a LUSTRE_EDEADLK represents an + * EDEADLK or an EDEADLOCK. Therefore, all existing references to EDEADLOCK + * that need to be transferred on wire have been replaced with EDEADLK. + */ +static int lustre_errno_hton_mapping[] = { + [EPERM] = LUSTRE_EPERM, + [ENOENT] = LUSTRE_ENOENT, + [ESRCH] = LUSTRE_ESRCH, + [EINTR] = LUSTRE_EINTR, + [EIO] = LUSTRE_EIO, + [ENXIO] = LUSTRE_ENXIO, + [E2BIG] = LUSTRE_E2BIG, + [ENOEXEC] = LUSTRE_ENOEXEC, + [EBADF] = LUSTRE_EBADF, + [ECHILD] = LUSTRE_ECHILD, + [EAGAIN] = LUSTRE_EAGAIN, + [ENOMEM] = LUSTRE_ENOMEM, + [EACCES] = LUSTRE_EACCES, + [EFAULT] = LUSTRE_EFAULT, + [ENOTBLK] = LUSTRE_ENOTBLK, + [EBUSY] = LUSTRE_EBUSY, + [EEXIST] = LUSTRE_EEXIST, + [EXDEV] = LUSTRE_EXDEV, + [ENODEV] = LUSTRE_ENODEV, + [ENOTDIR] = LUSTRE_ENOTDIR, + [EISDIR] = LUSTRE_EISDIR, + [EINVAL] = LUSTRE_EINVAL, + [ENFILE] = LUSTRE_ENFILE, + [EMFILE] = LUSTRE_EMFILE, + [ENOTTY] = LUSTRE_ENOTTY, + [ETXTBSY] = LUSTRE_ETXTBSY, + [EFBIG] = LUSTRE_EFBIG, + [ENOSPC] = LUSTRE_ENOSPC, + [ESPIPE] = LUSTRE_ESPIPE, + [EROFS] = LUSTRE_EROFS, + [EMLINK] = LUSTRE_EMLINK, + [EPIPE] = LUSTRE_EPIPE, + [EDOM] = LUSTRE_EDOM, + [ERANGE] = LUSTRE_ERANGE, + [EDEADLK] = LUSTRE_EDEADLK, + [ENAMETOOLONG] = LUSTRE_ENAMETOOLONG, + [ENOLCK] = LUSTRE_ENOLCK, + [ENOSYS] = LUSTRE_ENOSYS, + [ENOTEMPTY] = LUSTRE_ENOTEMPTY, + [ELOOP] = LUSTRE_ELOOP, + [ENOMSG] = LUSTRE_ENOMSG, + [EIDRM] = LUSTRE_EIDRM, + [ECHRNG] = LUSTRE_ECHRNG, + [EL2NSYNC] = LUSTRE_EL2NSYNC, + [EL3HLT] = LUSTRE_EL3HLT, + [EL3RST] = LUSTRE_EL3RST, + [ELNRNG] = LUSTRE_ELNRNG, + [EUNATCH] = LUSTRE_EUNATCH, + [ENOCSI] = LUSTRE_ENOCSI, + [EL2HLT] = LUSTRE_EL2HLT, + [EBADE] = LUSTRE_EBADE, + [EBADR] = LUSTRE_EBADR, + [EXFULL] = LUSTRE_EXFULL, + [ENOANO] = LUSTRE_ENOANO, + [EBADRQC] = LUSTRE_EBADRQC, + [EBADSLT] = LUSTRE_EBADSLT, + [EBFONT] = LUSTRE_EBFONT, + [ENOSTR] = LUSTRE_ENOSTR, + [ENODATA] = LUSTRE_ENODATA, + [ETIME] = LUSTRE_ETIME, + [ENOSR] = LUSTRE_ENOSR, + [ENONET] = LUSTRE_ENONET, + [ENOPKG] = LUSTRE_ENOPKG, + [EREMOTE] = LUSTRE_EREMOTE, + [ENOLINK] = LUSTRE_ENOLINK, + [EADV] = LUSTRE_EADV, + [ESRMNT] = LUSTRE_ESRMNT, + [ECOMM] = LUSTRE_ECOMM, + [EPROTO] = LUSTRE_EPROTO, + [EMULTIHOP] = LUSTRE_EMULTIHOP, + [EDOTDOT] = LUSTRE_EDOTDOT, + [EBADMSG] = 
LUSTRE_EBADMSG, + [EOVERFLOW] = LUSTRE_EOVERFLOW, + [ENOTUNIQ] = LUSTRE_ENOTUNIQ, + [EBADFD] = LUSTRE_EBADFD, + [EREMCHG] = LUSTRE_EREMCHG, + [ELIBACC] = LUSTRE_ELIBACC, + [ELIBBAD] = LUSTRE_ELIBBAD, + [ELIBSCN] = LUSTRE_ELIBSCN, + [ELIBMAX] = LUSTRE_ELIBMAX, + [ELIBEXEC] = LUSTRE_ELIBEXEC, + [EILSEQ] = LUSTRE_EILSEQ, + [ERESTART] = LUSTRE_ERESTART, + [ESTRPIPE] = LUSTRE_ESTRPIPE, + [EUSERS] = LUSTRE_EUSERS, + [ENOTSOCK] = LUSTRE_ENOTSOCK, + [EDESTADDRREQ] = LUSTRE_EDESTADDRREQ, + [EMSGSIZE] = LUSTRE_EMSGSIZE, + [EPROTOTYPE] = LUSTRE_EPROTOTYPE, + [ENOPROTOOPT] = LUSTRE_ENOPROTOOPT, + [EPROTONOSUPPORT] = LUSTRE_EPROTONOSUPPORT, + [ESOCKTNOSUPPORT] = LUSTRE_ESOCKTNOSUPPORT, + [EOPNOTSUPP] = LUSTRE_EOPNOTSUPP, + [EPFNOSUPPORT] = LUSTRE_EPFNOSUPPORT, + [EAFNOSUPPORT] = LUSTRE_EAFNOSUPPORT, + [EADDRINUSE] = LUSTRE_EADDRINUSE, + [EADDRNOTAVAIL] = LUSTRE_EADDRNOTAVAIL, + [ENETDOWN] = LUSTRE_ENETDOWN, + [ENETUNREACH] = LUSTRE_ENETUNREACH, + [ENETRESET] = LUSTRE_ENETRESET, + [ECONNABORTED] = LUSTRE_ECONNABORTED, + [ECONNRESET] = LUSTRE_ECONNRESET, + [ENOBUFS] = LUSTRE_ENOBUFS, + [EISCONN] = LUSTRE_EISCONN, + [ENOTCONN] = LUSTRE_ENOTCONN, + [ESHUTDOWN] = LUSTRE_ESHUTDOWN, + [ETOOMANYREFS] = LUSTRE_ETOOMANYREFS, + [ETIMEDOUT] = LUSTRE_ETIMEDOUT, + [ECONNREFUSED] = LUSTRE_ECONNREFUSED, + [EHOSTDOWN] = LUSTRE_EHOSTDOWN, + [EHOSTUNREACH] = LUSTRE_EHOSTUNREACH, + [EALREADY] = LUSTRE_EALREADY, + [EINPROGRESS] = LUSTRE_EINPROGRESS, + [ESTALE] = LUSTRE_ESTALE, + [EUCLEAN] = LUSTRE_EUCLEAN, + [ENOTNAM] = LUSTRE_ENOTNAM, + [ENAVAIL] = LUSTRE_ENAVAIL, + [EISNAM] = LUSTRE_EISNAM, + [EREMOTEIO] = LUSTRE_EREMOTEIO, + [EDQUOT] = LUSTRE_EDQUOT, + [ENOMEDIUM] = LUSTRE_ENOMEDIUM, + [EMEDIUMTYPE] = LUSTRE_EMEDIUMTYPE, + [ECANCELED] = LUSTRE_ECANCELED, + [ENOKEY] = LUSTRE_ENOKEY, + [EKEYEXPIRED] = LUSTRE_EKEYEXPIRED, + [EKEYREVOKED] = LUSTRE_EKEYREVOKED, + [EKEYREJECTED] = LUSTRE_EKEYREJECTED, + [EOWNERDEAD] = LUSTRE_EOWNERDEAD, + [ENOTRECOVERABLE] = LUSTRE_ENOTRECOVERABLE, + [ERESTARTSYS] = LUSTRE_ERESTARTSYS, + [ERESTARTNOINTR] = LUSTRE_ERESTARTNOINTR, + [ERESTARTNOHAND] = LUSTRE_ERESTARTNOHAND, + [ENOIOCTLCMD] = LUSTRE_ENOIOCTLCMD, + [ERESTART_RESTARTBLOCK] = LUSTRE_ERESTART_RESTARTBLOCK, + [EBADHANDLE] = LUSTRE_EBADHANDLE, + [ENOTSYNC] = LUSTRE_ENOTSYNC, + [EBADCOOKIE] = LUSTRE_EBADCOOKIE, + [ENOTSUPP] = LUSTRE_ENOTSUPP, + [ETOOSMALL] = LUSTRE_ETOOSMALL, + [ESERVERFAULT] = LUSTRE_ESERVERFAULT, + [EBADTYPE] = LUSTRE_EBADTYPE, + [EJUKEBOX] = LUSTRE_EJUKEBOX, + [EIOCBQUEUED] = LUSTRE_EIOCBQUEUED, + + /* + * The ELDLM errors are Lustre specific errors whose ranges + * lie in the middle of the above system errors. The ELDLM + * numbers must be preserved to avoid LU-9793. 
+ */ + [ELDLM_LOCK_CHANGED] = ELDLM_LOCK_CHANGED, + [ELDLM_LOCK_ABORTED] = ELDLM_LOCK_ABORTED, + [ELDLM_LOCK_REPLACED] = ELDLM_LOCK_REPLACED, + [ELDLM_NO_LOCK_DATA] = ELDLM_NO_LOCK_DATA, + [ELDLM_LOCK_WOULDBLOCK] = ELDLM_LOCK_WOULDBLOCK, + [ELDLM_NAMESPACE_EXISTS]= ELDLM_NAMESPACE_EXISTS, + [ELDLM_BAD_NAMESPACE] = ELDLM_BAD_NAMESPACE +}; + +static int lustre_errno_ntoh_mapping[] = { + [LUSTRE_EPERM] = EPERM, + [LUSTRE_ENOENT] = ENOENT, + [LUSTRE_ESRCH] = ESRCH, + [LUSTRE_EINTR] = EINTR, + [LUSTRE_EIO] = EIO, + [LUSTRE_ENXIO] = ENXIO, + [LUSTRE_E2BIG] = E2BIG, + [LUSTRE_ENOEXEC] = ENOEXEC, + [LUSTRE_EBADF] = EBADF, + [LUSTRE_ECHILD] = ECHILD, + [LUSTRE_EAGAIN] = EAGAIN, + [LUSTRE_ENOMEM] = ENOMEM, + [LUSTRE_EACCES] = EACCES, + [LUSTRE_EFAULT] = EFAULT, + [LUSTRE_ENOTBLK] = ENOTBLK, + [LUSTRE_EBUSY] = EBUSY, + [LUSTRE_EEXIST] = EEXIST, + [LUSTRE_EXDEV] = EXDEV, + [LUSTRE_ENODEV] = ENODEV, + [LUSTRE_ENOTDIR] = ENOTDIR, + [LUSTRE_EISDIR] = EISDIR, + [LUSTRE_EINVAL] = EINVAL, + [LUSTRE_ENFILE] = ENFILE, + [LUSTRE_EMFILE] = EMFILE, + [LUSTRE_ENOTTY] = ENOTTY, + [LUSTRE_ETXTBSY] = ETXTBSY, + [LUSTRE_EFBIG] = EFBIG, + [LUSTRE_ENOSPC] = ENOSPC, + [LUSTRE_ESPIPE] = ESPIPE, + [LUSTRE_EROFS] = EROFS, + [LUSTRE_EMLINK] = EMLINK, + [LUSTRE_EPIPE] = EPIPE, + [LUSTRE_EDOM] = EDOM, + [LUSTRE_ERANGE] = ERANGE, + [LUSTRE_EDEADLK] = EDEADLK, + [LUSTRE_ENAMETOOLONG] = ENAMETOOLONG, + [LUSTRE_ENOLCK] = ENOLCK, + [LUSTRE_ENOSYS] = ENOSYS, + [LUSTRE_ENOTEMPTY] = ENOTEMPTY, + [LUSTRE_ELOOP] = ELOOP, + [LUSTRE_ENOMSG] = ENOMSG, + [LUSTRE_EIDRM] = EIDRM, + [LUSTRE_ECHRNG] = ECHRNG, + [LUSTRE_EL2NSYNC] = EL2NSYNC, + [LUSTRE_EL3HLT] = EL3HLT, + [LUSTRE_EL3RST] = EL3RST, + [LUSTRE_ELNRNG] = ELNRNG, + [LUSTRE_EUNATCH] = EUNATCH, + [LUSTRE_ENOCSI] = ENOCSI, + [LUSTRE_EL2HLT] = EL2HLT, + [LUSTRE_EBADE] = EBADE, + [LUSTRE_EBADR] = EBADR, + [LUSTRE_EXFULL] = EXFULL, + [LUSTRE_ENOANO] = ENOANO, + [LUSTRE_EBADRQC] = EBADRQC, + [LUSTRE_EBADSLT] = EBADSLT, + [LUSTRE_EBFONT] = EBFONT, + [LUSTRE_ENOSTR] = ENOSTR, + [LUSTRE_ENODATA] = ENODATA, + [LUSTRE_ETIME] = ETIME, + [LUSTRE_ENOSR] = ENOSR, + [LUSTRE_ENONET] = ENONET, + [LUSTRE_ENOPKG] = ENOPKG, + [LUSTRE_EREMOTE] = EREMOTE, + [LUSTRE_ENOLINK] = ENOLINK, + [LUSTRE_EADV] = EADV, + [LUSTRE_ESRMNT] = ESRMNT, + [LUSTRE_ECOMM] = ECOMM, + [LUSTRE_EPROTO] = EPROTO, + [LUSTRE_EMULTIHOP] = EMULTIHOP, + [LUSTRE_EDOTDOT] = EDOTDOT, + [LUSTRE_EBADMSG] = EBADMSG, + [LUSTRE_EOVERFLOW] = EOVERFLOW, + [LUSTRE_ENOTUNIQ] = ENOTUNIQ, + [LUSTRE_EBADFD] = EBADFD, + [LUSTRE_EREMCHG] = EREMCHG, + [LUSTRE_ELIBACC] = ELIBACC, + [LUSTRE_ELIBBAD] = ELIBBAD, + [LUSTRE_ELIBSCN] = ELIBSCN, + [LUSTRE_ELIBMAX] = ELIBMAX, + [LUSTRE_ELIBEXEC] = ELIBEXEC, + [LUSTRE_EILSEQ] = EILSEQ, + [LUSTRE_ERESTART] = ERESTART, + [LUSTRE_ESTRPIPE] = ESTRPIPE, + [LUSTRE_EUSERS] = EUSERS, + [LUSTRE_ENOTSOCK] = ENOTSOCK, + [LUSTRE_EDESTADDRREQ] = EDESTADDRREQ, + [LUSTRE_EMSGSIZE] = EMSGSIZE, + [LUSTRE_EPROTOTYPE] = EPROTOTYPE, + [LUSTRE_ENOPROTOOPT] = ENOPROTOOPT, + [LUSTRE_EPROTONOSUPPORT] = EPROTONOSUPPORT, + [LUSTRE_ESOCKTNOSUPPORT] = ESOCKTNOSUPPORT, + [LUSTRE_EOPNOTSUPP] = EOPNOTSUPP, + [LUSTRE_EPFNOSUPPORT] = EPFNOSUPPORT, + [LUSTRE_EAFNOSUPPORT] = EAFNOSUPPORT, + [LUSTRE_EADDRINUSE] = EADDRINUSE, + [LUSTRE_EADDRNOTAVAIL] = EADDRNOTAVAIL, + [LUSTRE_ENETDOWN] = ENETDOWN, + [LUSTRE_ENETUNREACH] = ENETUNREACH, + [LUSTRE_ENETRESET] = ENETRESET, + [LUSTRE_ECONNABORTED] = ECONNABORTED, + [LUSTRE_ECONNRESET] = ECONNRESET, + [LUSTRE_ENOBUFS] = ENOBUFS, + [LUSTRE_EISCONN] = EISCONN, + [LUSTRE_ENOTCONN] = ENOTCONN, + 
[LUSTRE_ESHUTDOWN] = ESHUTDOWN, + [LUSTRE_ETOOMANYREFS] = ETOOMANYREFS, + [LUSTRE_ETIMEDOUT] = ETIMEDOUT, + [LUSTRE_ECONNREFUSED] = ECONNREFUSED, + [LUSTRE_EHOSTDOWN] = EHOSTDOWN, + [LUSTRE_EHOSTUNREACH] = EHOSTUNREACH, + [LUSTRE_EALREADY] = EALREADY, + [LUSTRE_EINPROGRESS] = EINPROGRESS, + [LUSTRE_ESTALE] = ESTALE, + [LUSTRE_EUCLEAN] = EUCLEAN, + [LUSTRE_ENOTNAM] = ENOTNAM, + [LUSTRE_ENAVAIL] = ENAVAIL, + [LUSTRE_EISNAM] = EISNAM, + [LUSTRE_EREMOTEIO] = EREMOTEIO, + [LUSTRE_EDQUOT] = EDQUOT, + [LUSTRE_ENOMEDIUM] = ENOMEDIUM, + [LUSTRE_EMEDIUMTYPE] = EMEDIUMTYPE, + [LUSTRE_ECANCELED] = ECANCELED, + [LUSTRE_ENOKEY] = ENOKEY, + [LUSTRE_EKEYEXPIRED] = EKEYEXPIRED, + [LUSTRE_EKEYREVOKED] = EKEYREVOKED, + [LUSTRE_EKEYREJECTED] = EKEYREJECTED, + [LUSTRE_EOWNERDEAD] = EOWNERDEAD, + [LUSTRE_ENOTRECOVERABLE] = ENOTRECOVERABLE, + [LUSTRE_ERESTARTSYS] = ERESTARTSYS, + [LUSTRE_ERESTARTNOINTR] = ERESTARTNOINTR, + [LUSTRE_ERESTARTNOHAND] = ERESTARTNOHAND, + [LUSTRE_ENOIOCTLCMD] = ENOIOCTLCMD, + [LUSTRE_ERESTART_RESTARTBLOCK] = ERESTART_RESTARTBLOCK, + [LUSTRE_EBADHANDLE] = EBADHANDLE, + [LUSTRE_ENOTSYNC] = ENOTSYNC, + [LUSTRE_EBADCOOKIE] = EBADCOOKIE, + [LUSTRE_ENOTSUPP] = ENOTSUPP, + [LUSTRE_ETOOSMALL] = ETOOSMALL, + [LUSTRE_ESERVERFAULT] = ESERVERFAULT, + [LUSTRE_EBADTYPE] = EBADTYPE, + [LUSTRE_EJUKEBOX] = EJUKEBOX, + [LUSTRE_EIOCBQUEUED] = EIOCBQUEUED, + + /* + * The ELDLM errors are Lustre specific errors whose ranges + * lie in the middle of the above system errors. The ELDLM + * numbers must be preserved to avoid LU-9793. + */ + [ELDLM_LOCK_CHANGED] = ELDLM_LOCK_CHANGED, + [ELDLM_LOCK_ABORTED] = ELDLM_LOCK_ABORTED, + [ELDLM_LOCK_REPLACED] = ELDLM_LOCK_REPLACED, + [ELDLM_NO_LOCK_DATA] = ELDLM_NO_LOCK_DATA, + [ELDLM_LOCK_WOULDBLOCK] = ELDLM_LOCK_WOULDBLOCK, + [ELDLM_NAMESPACE_EXISTS] = ELDLM_NAMESPACE_EXISTS, + [ELDLM_BAD_NAMESPACE] = ELDLM_BAD_NAMESPACE +}; + +unsigned int lustre_errno_hton(unsigned int h) +{ + unsigned int n; + + if (h == 0) { + n = 0; + } else if (h < ARRAY_SIZE(lustre_errno_hton_mapping)) { + n = lustre_errno_hton_mapping[h]; + if (n == 0) + goto generic; + } else { +generic: + /* + * A generic errno is better than the unknown one that could + * mean anything to a different host. + */ + n = LUSTRE_EIO; + } + + return n; +} +EXPORT_SYMBOL(lustre_errno_hton); + +unsigned int lustre_errno_ntoh(unsigned int n) +{ + unsigned int h; + + if (n == 0) { + h = 0; + } else if (n < ARRAY_SIZE(lustre_errno_ntoh_mapping)) { + h = lustre_errno_ntoh_mapping[n]; + if (h == 0) + goto generic; + } else { +generic: + /* + * Similar to the situation in lustre_errno_hton(), an unknown + * network errno could coincide with anything. Hence, it is + * better to return a generic errno. + */ + h = EIO; + } + + return h; +} +EXPORT_SYMBOL(lustre_errno_ntoh); + +#endif /* LUSTRE_TRANSLATE_ERRNOS */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/events.c b/drivers/staging/lustrefsx/lustre/ptlrpc/events.c new file mode 100644 index 0000000000000..06e4aad174c4b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/events.c @@ -0,0 +1,655 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
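[Editorial sketch] A small self-contained illustration of the bounded-lookup-with-generic-fallback idiom that lustre_errno_hton() and lustre_errno_ntoh() above implement. The three-entry table and its numbers are invented for the demo; the real tables above must stay one-to-one, and unknown or out-of-range values fall back to a generic I/O error rather than sending a number that could mean anything on the other host.

/* Demo of the translation idiom only; the table and values are made up. */
#include <stdio.h>

#define DEMO_GENERIC_ERR 5		/* stand-in for the generic fallback */

static const unsigned int demo_hton[] = {
	[1] = 101,			/* invented host -> wire pairs */
	[2] = 102,
	[7] = 107,
};

static unsigned int demo_translate(unsigned int h)
{
	if (h == 0)
		return 0;		/* success needs no translation */

	if (h < sizeof(demo_hton) / sizeof(demo_hton[0])) {
		unsigned int n = demo_hton[h];

		if (n != 0)
			return n;	/* direct table hit */
	}

	/* Hole in the table or out of range: use the generic fallback. */
	return DEMO_GENERIC_ERR;
}

int main(void)
{
	printf("%u %u %u\n",
	       demo_translate(2),	/* mapped        -> 102 */
	       demo_translate(3),	/* hole in table -> 5 */
	       demo_translate(4096));	/* out of range  -> 5 */
	return 0;
}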
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +lnet_handler_t ptlrpc_handler; +struct percpu_ref ptlrpc_pending; + +/* + * Client's outgoing request callback + */ +void request_out_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md_user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + bool wakeup = false; + ENTRY; + + LASSERT(ev->type == LNET_EVENT_SEND || ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->unlinked); + + if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val && + CFS_FAIL_CHECK_RESET(OBD_FAIL_NET_ERROR_RPC, + OBD_FAIL_OSP_PRECREATE_PAUSE | + CFS_FAIL_ONCE))) + ev->status = -ECONNABORTED; + + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + + /* Do not update imp_next_ping for connection request */ + if (lustre_msg_get_opc(req->rq_reqmsg) != + req->rq_import->imp_connect_op) + ptlrpc_pinger_sending_on_import(req->rq_import); + + sptlrpc_request_out_callback(req); + + spin_lock(&req->rq_lock); + req->rq_real_sent = ktime_get_real_seconds(); + req->rq_req_unlinked = 1; + /* reply_in_callback happened before request_out_callback? */ + if (req->rq_reply_unlinked) + wakeup = true; + + if (ev->type == LNET_EVENT_UNLINK || ev->status != 0) { + /* Failed send: make it seem like the reply timed out, just + * like failing sends in client.c does currently... */ + req->rq_net_err = 1; + wakeup = true; + } + + if (wakeup) + ptlrpc_client_wake_req(req); + + spin_unlock(&req->rq_lock); + + ptlrpc_req_finished(req); + EXIT; +} + +/* + * Client's incoming reply callback + */ +void reply_in_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md_user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + ENTRY; + + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + + LASSERT(ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->md_start == req->rq_repbuf); + LASSERT(ev->offset + ev->mlength <= req->rq_repbuf_len); + /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests + * for adaptive timeouts' early reply. 
+ */ + LASSERT((ev->md_options & LNET_MD_MANAGE_REMOTE) != 0); + + spin_lock(&req->rq_lock); + + req->rq_receiving_reply = 0; + req->rq_early = 0; + if (ev->unlinked) + req->rq_reply_unlinked = 1; + + if (ev->status) + goto out_wake; + + if (ev->type == LNET_EVENT_UNLINK) { + LASSERT(ev->unlinked); + DEBUG_REQ(D_NET, req, "unlink"); + goto out_wake; + } + + if (ev->mlength < ev->rlength ) { + CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req, + req->rq_replen, ev->rlength, ev->offset); + req->rq_reply_truncated = 1; + req->rq_replied = 1; + req->rq_status = -EOVERFLOW; + req->rq_nob_received = ev->rlength + ev->offset; + goto out_wake; + } + + if ((ev->offset == 0) && + ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) { + /* Early reply */ + DEBUG_REQ(D_ADAPTTO, req, + "Early reply received, mlen=%u offset=%d replen=%d replied=%d unlinked=%d", + ev->mlength, ev->offset, + req->rq_replen, req->rq_replied, ev->unlinked); + + req->rq_early_count++; /* number received, client side */ + + /* already got the real reply or buffers are already unlinked */ + if (req->rq_replied || + req->rq_reply_unlinked == 1) + goto out_wake; + + req->rq_early = 1; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* And we're still receiving */ + req->rq_receiving_reply = 1; + } else { + /* Real reply */ + req->rq_rep_swab_mask = 0; + req->rq_replied = 1; + /* Got reply, no resend required */ + req->rq_resend = 0; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* LNetMDUnlink can't be called under the LNET_LOCK, + so we must unlink in ptlrpc_unregister_reply */ + DEBUG_REQ(D_INFO, req, + "reply in flags=%x mlen=%u offset=%d replen=%d", + lustre_msg_get_flags(req->rq_reqmsg), + ev->mlength, ev->offset, req->rq_replen); + } + + if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) + req->rq_import->imp_last_reply_time = ktime_get_real_seconds(); + +out_wake: + /* NB don't unlock till after wakeup; req can disappear under us + * since we don't have our own ref */ + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); + EXIT; +} + +/* + * Client's bulk has been written/read + */ +void client_bulk_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md_user_ptr; + struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; + struct ptlrpc_request *req; + ENTRY; + + LASSERT((ptlrpc_is_bulk_put_sink(desc->bd_type) && + ev->type == LNET_EVENT_PUT) || + (ptlrpc_is_bulk_get_source(desc->bd_type) && + ev->type == LNET_EVENT_GET) || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->unlinked); + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE)) + ev->status = -EIO; + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,CFS_FAIL_ONCE)) + ev->status = -EIO; + + CDEBUG_LIMIT((ev->status == 0) ? 
D_NET : D_ERROR, + "event type %d, status %d, desc %p\n", + ev->type, ev->status, desc); + + spin_lock(&desc->bd_lock); + req = desc->bd_req; + LASSERT(desc->bd_refs > 0); + desc->bd_refs--; + + if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) { + desc->bd_nob_transferred += ev->mlength; + desc->bd_sender = lnet_nid_to_nid4(&ev->sender); + } else { + /* start reconnect and resend if network error hit */ + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + desc->bd_failure = 1; + } + + + /* NB don't unlock till after wakeup; desc can disappear under us + * otherwise */ + if (desc->bd_refs == 0) + ptlrpc_client_wake_req(desc->bd_req); + + spin_unlock(&desc->bd_lock); + EXIT; +} + +/* + * We will have percpt request history list for ptlrpc service in upcoming + * patches because we don't want to be serialized by current per-service + * history operations. So we require history ID can (somehow) show arriving + * order w/o grabbing global lock, and user can sort them in userspace. + * + * This is how we generate history ID for ptlrpc_request: + * ---------------------------------------------------- + * | 32 bits | 16 bits | (16 - X)bits | X bits | + * ---------------------------------------------------- + * | seconds | usec / 16 | sequence | CPT id | + * ---------------------------------------------------- + * + * it might not be precise but should be good enough. + */ + +#define REQS_CPT_BITS(svcpt) ((svcpt)->scp_service->srv_cpt_bits) + +#define REQS_SEC_SHIFT 32 +#define REQS_USEC_SHIFT 16 +#define REQS_SEQ_SHIFT(svcpt) REQS_CPT_BITS(svcpt) + +static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + u64 sec = req->rq_arrival_time.tv_sec; + u32 usec = req->rq_arrival_time.tv_nsec / NSEC_PER_USEC / 16; /* usec / 16 */ + u64 new_seq; + + /* set sequence ID for request and add it to history list, + * it must be called with hold svcpt::scp_lock */ + + new_seq = (sec << REQS_SEC_SHIFT) | + (usec << REQS_USEC_SHIFT) | + (svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt); + + if (new_seq > svcpt->scp_hist_seq) { + /* This handles the initial case of scp_hist_seq == 0 or + * we just jumped into a new time window */ + svcpt->scp_hist_seq = new_seq; + } else { + LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT); + /* NB: increase sequence number in current usec bucket, + * however, it's possible that we used up all bits for + * sequence and jumped into the next usec bucket (future time), + * then we hope there will be less RPCs per bucket at some + * point, and sequence will catch up again */ + svcpt->scp_hist_seq += (1U << REQS_SEQ_SHIFT(svcpt)); + new_seq = svcpt->scp_hist_seq; + } + + req->rq_history_seq = new_seq; + + list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs); +} + +/* + * Server's incoming request callback + */ +void request_in_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md_user_ptr; + struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg; + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + struct ptlrpc_service *service = svcpt->scp_service; + struct ptlrpc_request *req; + ENTRY; + + LASSERT(ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT((char *)ev->md_start >= rqbd->rqbd_buffer); + LASSERT((char *)ev->md_start + ev->offset + ev->mlength <= + rqbd->rqbd_buffer + service->srv_buf_size); + + CDEBUG_LIMIT((ev->status == 0) ? 
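[Editorial sketch] The 64-bit history ID layout documented above ([ seconds | usec/16 | sequence | CPT id ]) can be exercised with a short standalone program. The CPT width of 2 bits and the sample arrival time are arbitrary example values; in ptlrpc_req_add_history() the width comes from srv_cpt_bits and the time from rq_arrival_time.

/* Standalone demo of the history-ID bit layout; cpt_bits = 2 and the
 * timestamps are example values only. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned int cpt_bits = 2;	/* X in the diagram above */
	uint64_t sec  = 1700000000ULL;		/* arrival seconds */
	uint64_t usec = 123456 / 16;		/* arrival usec / 16 */
	uint64_t cpt  = 1;			/* CPT id */

	uint64_t id = (sec << 32) | (usec << 16) | cpt;

	/* Two more requests land in the same usec bucket: each bumps the
	 * sequence field by adding 1 << cpt_bits, leaving the CPT id alone. */
	uint64_t id3 = id + 2 * (1ULL << cpt_bits);

	printf("seq=%llu cpt=%llu\n",
	       (unsigned long long)((id3 >> cpt_bits) &
				    ((1ULL << (16 - cpt_bits)) - 1)),
	       (unsigned long long)(id3 & ((1ULL << cpt_bits) - 1)));
	return 0;
}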
D_NET : D_ERROR, + "event type %d, status %d, service %s\n", + ev->type, ev->status, service->srv_name); + + if (ev->unlinked) { + /* If this is the last request message to fit in the + * request buffer we can use the request object embedded in + * rqbd. Note that if we failed to allocate a request, + * we'd have to re-post the rqbd, which we can't do in this + * context. + */ + req = &rqbd->rqbd_req; + memset(req, 0, sizeof(*req)); + } else { + LASSERT(ev->type == LNET_EVENT_PUT); + if (ev->status != 0) /* We moaned above already... */ + return; + req = ptlrpc_request_cache_alloc(GFP_ATOMIC); + if (req == NULL) { + CERROR("Can't allocate incoming request descriptor: Dropping %s RPC from %s\n", + service->srv_name, + libcfs_idstr(&ev->initiator)); + return; + } + } + + ptlrpc_srv_req_init(req); + /* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL, + * flags are reset and scalars are zero. We only set the message + * size to non-zero if this was a successful receive. */ + req->rq_xid = ev->match_bits; + req->rq_reqbuf = ev->md_start + ev->offset; + if (ev->type == LNET_EVENT_PUT && ev->status == 0) + req->rq_reqdata_len = ev->mlength; + ktime_get_real_ts64(&req->rq_arrival_time); + /* Multi-Rail: keep track of both initiator and source NID. */ + req->rq_peer = lnet_pid_to_pid4(&ev->initiator); + req->rq_source = lnet_pid_to_pid4(&ev->source); + req->rq_self = lnet_nid_to_nid4(&ev->target.nid); + req->rq_rqbd = rqbd; + req->rq_phase = RQ_PHASE_NEW; + if (ev->type == LNET_EVENT_PUT) + CDEBUG(D_INFO, "incoming req@%p x%llu msgsize %u\n", + req, req->rq_xid, ev->mlength); + + CDEBUG(D_RPCTRACE, "peer: %s (source: %s)\n", + libcfs_id2str(req->rq_peer), libcfs_id2str(req->rq_source)); + + spin_lock(&svcpt->scp_lock); + + ptlrpc_req_add_history(svcpt, req); + + if (ev->unlinked) { + svcpt->scp_nrqbds_posted--; + CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n", + svcpt->scp_nrqbds_posted); + + /* Normally, don't complain about 0 buffers posted; LNET won't + * drop incoming reqs since we set the portal lazy */ + if (test_req_buffer_pressure && + ev->type != LNET_EVENT_UNLINK && + svcpt->scp_nrqbds_posted == 0) + CWARN("All %s request buffers busy\n", + service->srv_name); + + /* req takes over the network's ref on rqbd */ + } else { + /* req takes a ref on rqbd */ + rqbd->rqbd_refcount++; + } + + list_add_tail(&req->rq_list, &svcpt->scp_req_incoming); + svcpt->scp_nreqs_incoming++; + + /* NB everything can disappear under us once the request + * has been queued and we unlock, so do the wake now... */ + wake_up(&svcpt->scp_waitq); + + spin_unlock(&svcpt->scp_lock); + EXIT; +} + +/* + * Server's outgoing reply callback + */ +void reply_out_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md_user_ptr; + struct ptlrpc_reply_state *rs = cbid->cbid_arg; + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + bool need_schedule = false; + + ENTRY; + + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_ACK || + ev->type == LNET_EVENT_UNLINK); + + if (!rs->rs_difficult) { + /* 'Easy' replies have no further processing so I drop the + * net's ref on 'rs' + */ + LASSERT(ev->unlinked); + ptlrpc_rs_decref(rs); + EXIT; + return; + } + + if (ev->type == LNET_EVENT_SEND) { + spin_lock(&rs->rs_lock); + rs->rs_sent = 1; + /* If transaction was committed before the SEND, and the ACK + * is lost, then we need to schedule so ptlrpc_hr can unlink + * the MD. 
+ */ + if (rs->rs_handled) + need_schedule = true; + spin_unlock(&rs->rs_lock); + } + + if (ev->unlinked || need_schedule) { + LASSERT(rs->rs_sent); + + /* Last network callback. The net's ref on 'rs' stays put + * until ptlrpc_handle_rs() is done with it + */ + spin_lock(&svcpt->scp_rep_lock); + spin_lock(&rs->rs_lock); + + rs->rs_unlinked = ev->unlinked; + if (!rs->rs_no_ack || + rs->rs_transno <= + rs->rs_export->exp_obd->obd_last_committed || + list_empty(&rs->rs_obd_list)) + ptlrpc_schedule_difficult_reply(rs); + + spin_unlock(&rs->rs_lock); + spin_unlock(&svcpt->scp_rep_lock); + } + EXIT; +} + +#ifdef HAVE_SERVER_SUPPORT +/* + * Server's bulk completion callback + */ +void server_bulk_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md_user_ptr; + struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; + ENTRY; + + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_UNLINK || + (ptlrpc_is_bulk_put_source(desc->bd_type) && + ev->type == LNET_EVENT_ACK) || + (ptlrpc_is_bulk_get_sink(desc->bd_type) && + ev->type == LNET_EVENT_REPLY)); + + CDEBUG_LIMIT((ev->status == 0) ? D_NET : D_ERROR, + "event type %d, status %d, desc %p\n", + ev->type, ev->status, desc); + + spin_lock(&desc->bd_lock); + + LASSERT(desc->bd_refs > 0); + + if ((ev->type == LNET_EVENT_ACK || + ev->type == LNET_EVENT_REPLY) && + ev->status == 0) { + /* We heard back from the peer, so even if we get this + * before the SENT event (oh yes we can), we know we + * read/wrote the peer buffer and how much... */ + desc->bd_nob_transferred += ev->mlength; + desc->bd_sender = lnet_nid_to_nid4(&ev->sender); + } + + if (ev->status != 0) + desc->bd_failure = 1; + + if (ev->unlinked) { + desc->bd_refs--; + /* This is the last callback no matter what... */ + if (desc->bd_refs == 0) + wake_up(&desc->bd_waitq); + } + + spin_unlock(&desc->bd_lock); + EXIT; +} +#endif + +static void ptlrpc_master_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md_user_ptr; + void (*callback)(struct lnet_event *ev) = cbid->cbid_fn; + + /* Honestly, it's best to find out early. */ + LASSERT(cbid->cbid_arg != LP_POISON); + LASSERT(callback == request_out_callback || + callback == reply_in_callback || + callback == client_bulk_callback || + callback == request_in_callback || + callback == reply_out_callback +#ifdef HAVE_SERVER_SUPPORT + || callback == server_bulk_callback +#endif + ); + + callback(ev); + if (ev->unlinked) + percpu_ref_put(&ptlrpc_pending); +} + +int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, + struct lnet_process_id *peer, lnet_nid_t *self) +{ + int best_dist = 0; + __u32 best_order = 0; + int count = 0; + int rc = -ENOENT; + int dist; + __u32 order; + lnet_nid_t dst_nid; + lnet_nid_t src_nid; + + peer->pid = LNET_PID_LUSTRE; + + /* Choose the matching UUID that's closest */ + while (lustre_uuid_to_peer(uuid->uuid, &dst_nid, count++) == 0) { + if (peer->nid != LNET_NID_ANY && LNET_NIDADDR(peer->nid) == 0 && + LNET_NIDNET(dst_nid) != LNET_NIDNET(peer->nid)) + continue; + + dist = LNetDist(dst_nid, &src_nid, &order); + if (dist < 0) + continue; + + if (dist == 0) { /* local! 
use loopback LND */ + peer->nid = *self = LNET_NID_LO_0; + rc = 0; + break; + } + + if (rc < 0 || + dist < best_dist || + (dist == best_dist && order < best_order)) { + best_dist = dist; + best_order = order; + + peer->nid = dst_nid; + *self = src_nid; + rc = 0; + } + } + + CDEBUG(D_NET, "%s->%s\n", uuid->uuid, libcfs_id2str(*peer)); + return rc; +} + +static struct completion ptlrpc_done; + +static void ptlrpc_release(struct percpu_ref *ref) +{ + complete(&ptlrpc_done); +} + +static void ptlrpc_ni_fini(void) +{ + /* Wait for the event queue to become idle since there may still be + * messages in flight with pending events (i.e. the fire-and-forget + * messages == client requests and "non-difficult" server + * replies */ + + init_completion(&ptlrpc_done); + percpu_ref_kill(&ptlrpc_pending); + wait_for_completion(&ptlrpc_done); + + lnet_assert_handler_unused(ptlrpc_handler); + LNetNIFini(); +} + +lnet_pid_t ptl_get_pid(void) +{ + return LNET_PID_LUSTRE; +} + +int ptlrpc_ni_init(void) +{ + int rc; + lnet_pid_t pid; + + pid = ptl_get_pid(); + CDEBUG(D_NET, "My pid is: %x\n", pid); + + /* We're not passing any limits yet... */ + rc = LNetNIInit(pid); + if (rc < 0) { + CDEBUG(D_NET, "ptlrpc: Can't init network interface: rc = %d\n", + rc); + return rc; + } + + rc = percpu_ref_init(&ptlrpc_pending, ptlrpc_release, 0, GFP_KERNEL); + if (rc) { + CERROR("ptlrpc: Can't init percpu refcount: rc = %d\n", rc); + return rc; + } + /* CAVEAT EMPTOR: how we process portals events is _radically_ + * different depending on... + */ + /* kernel LNet calls our master callback when there are new event, + * because we are guaranteed to get every event via callback, + * so we just set EQ size to 0 to avoid overhread of serializing + * enqueue/dequeue operations in LNet. */ + ptlrpc_handler = ptlrpc_master_callback; + return 0; +} + +int ptlrpc_init_portals(void) +{ + int rc = ptlrpc_ni_init(); + + if (rc != 0) { + CERROR("network initialisation failed\n"); + return rc; + } + rc = ptlrpcd_addref(); + if (rc == 0) + return 0; + + CERROR("rpcd initialisation failed\n"); + ptlrpc_ni_fini(); + return rc; +} + +void ptlrpc_exit_portals(void) +{ + ptlrpcd_decref(); + ptlrpc_ni_fini(); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h new file mode 100644 index 0000000000000..aa481015dd2d7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h @@ -0,0 +1,185 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * Somewhat simplified version of the gss api. + * + * Dug Song + * Andy Adamson + * Bruce Fields + * Copyright (c) 2000 The Regents of the University of Michigan + * + */ + +#ifndef __PTLRPC_GSS_GSS_API_H_ +#define __PTLRPC_GSS_GSS_API_H_ + +struct gss_api_mech; + +typedef int (*digest_hash)( + struct ahash_request *req, rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, struct bio_vec *iovs); + +/* The mechanism-independent gss-api context: */ +struct gss_ctx { + struct gss_api_mech *mech_type; + void *internal_ctx_id; + digest_hash hash_func; +}; + +#define GSS_C_NO_BUFFER ((rawobj_t) 0) +#define GSS_C_NO_CONTEXT ((struct gss_ctx *) 0) +#define GSS_C_NULL_OID ((rawobj_t) 0) + +/* + * gss-api prototypes; note that these are somewhat simplified versions of + * the prototypes specified in RFC 2744. 
+ */ +__u32 lgss_import_sec_context( + rawobj_t *input_token, + struct gss_api_mech *mech, + struct gss_ctx **ctx); +__u32 lgss_copy_reverse_context( + struct gss_ctx *ctx, + struct gss_ctx **ctx_new); +__u32 lgss_inquire_context( + struct gss_ctx *ctx, + time64_t *endtime); +__u32 lgss_get_mic( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *mic_token); +__u32 lgss_verify_mic( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *mic_token); +__u32 lgss_wrap( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token); +__u32 lgss_unwrap( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg); +__u32 lgss_prep_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc); +__u32 lgss_wrap_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); +__u32 lgss_unwrap_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); +__u32 lgss_delete_sec_context( + struct gss_ctx **ctx); +int lgss_display( + struct gss_ctx *ctx, + char *buf, + int bufsize); + +struct subflavor_desc { + __u32 sf_subflavor; + __u32 sf_qop; + __u32 sf_service; + char *sf_name; +}; + +/* Each mechanism is described by the following struct: */ +struct gss_api_mech { + struct list_head gm_list; + struct module *gm_owner; + char *gm_name; + rawobj_t gm_oid; + atomic_t gm_count; + struct gss_api_ops *gm_ops; + int gm_sf_num; + struct subflavor_desc *gm_sfs; +}; + +/* and must provide the following operations: */ +struct gss_api_ops { + __u32 (*gss_import_sec_context)( + rawobj_t *input_token, + struct gss_ctx *ctx); + __u32 (*gss_copy_reverse_context)( + struct gss_ctx *ctx, + struct gss_ctx *ctx_new); + __u32 (*gss_inquire_context)( + struct gss_ctx *ctx, + time64_t *endtime); + __u32 (*gss_get_mic)( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *mic_token); + __u32 (*gss_verify_mic)( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *mic_token); + __u32 (*gss_wrap)( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token); + __u32 (*gss_unwrap)( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg); + __u32 (*gss_prep_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc); + __u32 (*gss_wrap_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); + __u32 (*gss_unwrap_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); + void (*gss_delete_sec_context)( + void *ctx); + int (*gss_display)( + struct gss_ctx *ctx, + char *buf, + int bufsize); +}; + +int lgss_mech_register(struct gss_api_mech *mech); +void lgss_mech_unregister(struct gss_api_mech *mech); + +struct gss_api_mech * lgss_OID_to_mech(rawobj_t *oid); +struct gss_api_mech * lgss_name_to_mech(char *name); +struct gss_api_mech * lgss_subflavor_to_mech(__u32 subflavor); + +struct gss_api_mech * lgss_mech_get(struct gss_api_mech *mech); +void lgss_mech_put(struct gss_api_mech *mech); + +#endif /* __PTLRPC_GSS_GSS_API_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_asn1.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_asn1.h new file mode 100644 index 0000000000000..1f535485bd0f3 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_asn1.h @@ -0,0 +1,84 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * minimal asn1 for generic encoding/decoding of gss tokens + * + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1995 by the Massachusetts Institute of Technology. + * All Rights Reserved. + * + * Export of this software from the United States of America may + * require a specific license from the United States Government. + * It is the responsibility of any person or organization contemplating + * export to obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of M.I.T. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. Furthermore if you modify this software you must label + * your software as modified software and not distribute it in such a + * fashion that it might be confused with the original M.I.T. software. + * M.I.T. makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + */ + +#define SIZEOF_INT 4 + +/* from gssapi_err_generic.h */ +#define G_BAD_SERVICE_NAME (-2045022976L) +#define G_BAD_STRING_UID (-2045022975L) +#define G_NOUSER (-2045022974L) +#define G_VALIDATE_FAILED (-2045022973L) +#define G_BUFFER_ALLOC (-2045022972L) +#define G_BAD_MSG_CTX (-2045022971L) +#define G_WRONG_SIZE (-2045022970L) +#define G_BAD_USAGE (-2045022969L) +#define G_UNKNOWN_QOP (-2045022968L) +#define G_NO_HOSTNAME (-2045022967L) +#define G_BAD_HOSTNAME (-2045022966L) +#define G_WRONG_MECH (-2045022965L) +#define G_BAD_TOK_HEADER (-2045022964L) +#define G_BAD_DIRECTION (-2045022963L) +#define G_TOK_TRUNC (-2045022962L) +#define G_REFLECT (-2045022961L) +#define G_WRONG_TOKID (-2045022960L) + +#define g_OID_equal(o1,o2) \ + (((o1)->len == (o2)->len) && \ + (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0)) + +__u32 g_verify_token_header(rawobj_t *mech, + int *body_size, + unsigned char **buf_in, + int toksize); + +__u32 g_get_mech_oid(rawobj_t *mech, + rawobj_t *in_buf); + +int g_token_size(rawobj_t *mech, + unsigned int body_size); + +void g_make_token_header(rawobj_t *mech, + int body_size, + unsigned char **buf); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c new file mode 100644 index 0000000000000..466b868c44068 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c @@ -0,0 +1,516 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/gss/gss_bulk.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_cli_ctx *gctx; + struct lustre_msg *msg; + struct ptlrpc_bulk_sec_desc *bsd; + rawobj_t token; + __u32 maj; + int offset; + int rc; + ENTRY; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + LASSERT(gctx->gc_mechctx); + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + LASSERT(req->rq_reqbuf->lm_bufcount >= 3); + msg = req->rq_reqbuf; + offset = msg->lm_bufcount - 1; + break; + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(req->rq_reqbuf->lm_bufcount >= 4); + msg = req->rq_reqbuf; + offset = msg->lm_bufcount - 2; + break; + case SPTLRPC_SVC_PRIV: + LASSERT(req->rq_clrbuf->lm_bufcount >= 2); + msg = req->rq_clrbuf; + offset = msg->lm_bufcount - 1; + break; + default: + LBUG(); + } + + bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); + bsd->bsd_version = 0; + bsd->bsd_flags = 0; + bsd->bsd_type = SPTLRPC_BULK_DEFAULT; + bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) + RETURN(0); + + LASSERT(bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG || + bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV); + + if (req->rq_bulk_read) { + /* + * bulk read: prepare receiving pages only for privacy mode. + */ + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV) + return gss_cli_prep_bulk(req, desc); + } else { + /* + * bulk write: sign or encrypt bulk pages. 
+ */ + bsd->bsd_nob = desc->bd_nob; + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG) { + /* integrity mode */ + token.data = bsd->bsd_data; + token.len = lustre_msg_buflen(msg, offset) - + sizeof(*bsd); + + maj = lgss_get_mic(gctx->gc_mechctx, 0, NULL, + desc->bd_iov_count, + desc->bd_vec, + &token); + if (maj != GSS_S_COMPLETE) { + CWARN("failed to sign bulk data: %x\n", maj); + RETURN(-EACCES); + } + } else { + /* privacy mode */ + if (desc->bd_iov_count == 0) + RETURN(0); + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) { + CERROR("bulk write: failed to allocate " + "encryption pages: %d\n", rc); + RETURN(rc); + } + + token.data = bsd->bsd_data; + token.len = lustre_msg_buflen(msg, offset) - + sizeof(*bsd); + + maj = lgss_wrap_bulk(gctx->gc_mechctx, desc, &token, 0); + if (maj != GSS_S_COMPLETE) { + CWARN("fail to encrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + } + } + + RETURN(0); +} + +int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_cli_ctx *gctx; + struct lustre_msg *rmsg, *vmsg; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + int roff, voff; + ENTRY; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + vmsg = req->rq_repdata; + LASSERT(vmsg != NULL && vmsg->lm_bufcount >= 3); + voff = vmsg->lm_bufcount - 1; + + rmsg = req->rq_reqbuf; + LASSERT(rmsg != NULL && rmsg->lm_bufcount >= 3); + roff = rmsg->lm_bufcount - 1; /* last segment */ + break; + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + vmsg = req->rq_repdata; + LASSERT(vmsg != NULL && vmsg->lm_bufcount >= 4); + voff = vmsg->lm_bufcount - 2; + + rmsg = req->rq_reqbuf; + LASSERT(rmsg != NULL && rmsg->lm_bufcount >= 4); + roff = rmsg->lm_bufcount - 2; /* second last segment */ + break; + case SPTLRPC_SVC_PRIV: + vmsg = req->rq_repdata; + LASSERT(vmsg != NULL && vmsg->lm_bufcount >= 2); + voff = vmsg->lm_bufcount - 1; + + rmsg = req->rq_clrbuf; + LASSERT(rmsg != NULL && rmsg->lm_bufcount >= 2); + roff = rmsg->lm_bufcount - 1; /* last segment */ + break; + default: + LBUG(); + } + + bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr)); + bsdv = lustre_msg_buf(vmsg, voff, sizeof(*bsdv)); + LASSERT(bsdr && bsdv); + + if (bsdr->bsd_version != bsdv->bsd_version || + bsdr->bsd_type != bsdv->bsd_type || + bsdr->bsd_svc != bsdv->bsd_svc) { + CERROR("bulk security descriptor mismatch: " + "(%u,%u,%u) != (%u,%u,%u)\n", + bsdr->bsd_version, bsdr->bsd_type, bsdr->bsd_svc, + bsdv->bsd_version, bsdv->bsd_type, bsdv->bsd_svc); + RETURN(-EPROTO); + } + + LASSERT(bsdv->bsd_svc == SPTLRPC_BULK_SVC_NULL || + bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG || + bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV); + + /* + * in privacy mode if return success, make sure bd_nob_transferred + * is the actual size of the clear text, otherwise upper layer + * may be surprised. + */ + if (req->rq_bulk_write) { + if (bsdv->bsd_flags & BSD_FL_ERR) { + CERROR("server reported bulk i/o failure\n"); + RETURN(-EIO); + } + + if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) + desc->bd_nob_transferred = desc->bd_nob; + } else { + /* + * bulk read, upon return success, bd_nob_transferred is + * the size of plain text actually received. 
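+ * In integrity mode the bvec lengths are first clipped to
+ * bd_nob_transferred and the MIC is verified; in privacy mode the
+ * data is unwrapped and bd_nob_transferred is set to the recovered
+ * clear text size (bsd_nob from the reply).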
+ */ + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + LASSERT(gctx->gc_mechctx); + + if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG) { + int i, nob; + + /* fix the actual data size */ + for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_vec[i].bv_len + nob > + desc->bd_nob_transferred) { + desc->bd_vec[i].bv_len = + desc->bd_nob_transferred - nob; + } + nob += desc->bd_vec[i].bv_len; + } + + token.data = bsdv->bsd_data; + token.len = lustre_msg_buflen(vmsg, voff) - + sizeof(*bsdv); + + maj = lgss_verify_mic(gctx->gc_mechctx, 0, NULL, + desc->bd_iov_count, + desc->bd_vec, + &token); + if (maj != GSS_S_COMPLETE) { + CERROR("failed to verify bulk read: %x\n", maj); + RETURN(-EACCES); + } + } else if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) { + desc->bd_nob = bsdv->bsd_nob; + if (desc->bd_nob == 0) + RETURN(0); + + token.data = bsdv->bsd_data; + token.len = lustre_msg_buflen(vmsg, voff) - + sizeof(*bsdr); + + maj = lgss_unwrap_bulk(gctx->gc_mechctx, desc, + &token, 1); + if (maj != GSS_S_COMPLETE) { + CERROR("failed to decrypt bulk read: %x\n", + maj); + RETURN(-EACCES); + } + + desc->bd_nob_transferred = desc->bd_nob; + } + } + + RETURN(0); +} + +static int gss_prep_bulk(struct ptlrpc_bulk_desc *desc, + struct gss_ctx *mechctx) +{ + int rc; + + if (desc->bd_iov_count == 0) + return 0; + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) + return rc; + + if (lgss_prep_bulk(mechctx, desc) != GSS_S_COMPLETE) + return -EACCES; + + return 0; +} + +int gss_cli_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + int rc; + ENTRY; + + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read); + + if (SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_BULK_SVC_PRIV) + RETURN(0); + + rc = gss_prep_bulk(desc, ctx2gctx(req->rq_cli_ctx)->gc_mechctx); + if (rc) + CERROR("bulk read: failed to prepare encryption " + "pages: %d\n", rc); + + RETURN(rc); +} + +int gss_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsd; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_write); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsd = grctx->src_reqbsd; + if (bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV) + RETURN(0); + + rc = gss_prep_bulk(desc, grctx->src_ctx->gsc_mechctx); + if (rc) + CERROR("bulk write: failed to prepare encryption " + "pages: %d\n", rc); + + RETURN(rc); +} + +int gss_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_write); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsdr = grctx->src_reqbsd; + bsdv = grctx->src_repbsd; + + /* bsdr has been sanity checked during unpacking */ + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + switch (bsdv->bsd_svc) { + case SPTLRPC_BULK_SVC_INTG: + token.data = bsdr->bsd_data; + token.len = grctx->src_reqbsd_size - sizeof(*bsdr); + + maj = lgss_verify_mic(grctx->src_ctx->gsc_mechctx, 0, 
NULL, + desc->bd_iov_count, + desc->bd_vec, &token); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to verify bulk signature: %x\n", maj); + RETURN(-EACCES); + } + break; + case SPTLRPC_BULK_SVC_PRIV: + if (bsdr->bsd_nob != desc->bd_nob) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("prepared nob %d doesn't match the actual " + "nob %d\n", desc->bd_nob, bsdr->bsd_nob); + RETURN(-EPROTO); + } + + if (desc->bd_iov_count == 0) { + LASSERT(desc->bd_nob == 0); + break; + } + + token.data = bsdr->bsd_data; + token.len = grctx->src_reqbsd_size - sizeof(*bsdr); + + maj = lgss_unwrap_bulk(grctx->src_ctx->gsc_mechctx, + desc, &token, 0); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed decrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + + /* mimic gss_cli_ctx_unwrap_bulk */ + desc->bd_nob_transferred = desc->bd_nob; + + break; + } + + RETURN(0); +} + +int gss_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsdr = grctx->src_reqbsd; + bsdv = grctx->src_repbsd; + + /* bsdr has been sanity checked during unpacking */ + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + switch (bsdv->bsd_svc) { + case SPTLRPC_BULK_SVC_INTG: + token.data = bsdv->bsd_data; + token.len = grctx->src_repbsd_size - sizeof(*bsdv); + + maj = lgss_get_mic(grctx->src_ctx->gsc_mechctx, 0, NULL, + desc->bd_iov_count, + desc->bd_vec, &token); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to sign bulk data: %x\n", maj); + RETURN(-EACCES); + } + break; + case SPTLRPC_BULK_SVC_PRIV: + bsdv->bsd_nob = desc->bd_nob; + + if (desc->bd_iov_count == 0) { + LASSERT(desc->bd_nob == 0); + break; + } + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("bulk read: failed to allocate encryption " + "pages: %d\n", rc); + RETURN(rc); + } + + token.data = bsdv->bsd_data; + token.len = grctx->src_repbsd_size - sizeof(*bsdv); + + maj = lgss_wrap_bulk(grctx->src_ctx->gsc_mechctx, + desc, &token, 1); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to encrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + break; + } + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c new file mode 100644 index 0000000000000..0b4bfba0a0ac6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c @@ -0,0 +1,429 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/gss/gss_cli_upcall.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +/********************************************** + * gss context init/fini helper * + **********************************************/ + +static +int ctx_init_pack_request(struct obd_import *imp, + struct ptlrpc_request *req, + int lustre_srv, + uid_t uid, gid_t gid, + long token_size, + char __user *token) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct gss_sec *gsec; + struct gss_header *ghdr; + struct ptlrpc_user_desc *pud; + __u32 *p, size, offset = 2; + rawobj_t obj; + + LASSERT(msg->lm_bufcount <= 4); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + + /* gss hdr */ + ghdr = lustre_msg_buf(msg, 0, sizeof(*ghdr)); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) imp->imp_sec->ps_part; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_INIT; + ghdr->gh_seq = 0; + ghdr->gh_svc = SPTLRPC_SVC_NULL; + ghdr->gh_handle.len = 0; + + /* fix the user desc */ + if (req->rq_pack_udesc) { + ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + LASSERT(pud); + pud->pud_uid = pud->pud_fsuid = uid; + pud->pud_gid = pud->pud_fsgid = gid; + pud->pud_cap = 0; + pud->pud_ngroups = 0; + offset++; + } + + /* new clients are expected to set KCSUM flag */ + ghdr->gh_flags |= LUSTRE_GSS_PACK_KCSUM; + + /* security payload */ + p = lustre_msg_buf(msg, offset, 0); + size = msg->lm_buflens[offset]; + LASSERT(p); + + /* 1. lustre svc type */ + LASSERT(size > 4); + *p++ = cpu_to_le32(lustre_srv); + size -= 4; + + /* 2. target uuid */ + obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1; + obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 3. reverse context handle. actually only needed by root user, + * but we send it anyway. */ + gsec = sec2gsec(req->rq_cli_ctx->cc_sec); + obj.len = sizeof(gsec->gs_rvs_hdl); + obj.data = (__u8 *) &gsec->gs_rvs_hdl; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 4. 
now the token */ + LASSERT(size >= (sizeof(__u32) + token_size)); + *p++ = cpu_to_le32(((__u32) token_size)); + if (copy_from_user(p, token, token_size)) { + CERROR("can't copy token\n"); + return -EFAULT; + } + size -= sizeof(__u32) + round_up(token_size, 4); + + req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset, + msg->lm_buflens[offset] - size, 0); + return 0; +} + +static +int ctx_init_parse_reply(struct lustre_msg *msg, int swabbed, + char __user *outbuf, long outlen) +{ + struct gss_rep_header *ghdr; + __u32 obj_len, round_len; + __u32 status, effective = 0; + + if (msg->lm_bufcount != 3) { + CERROR("unexpected bufcount %u\n", msg->lm_bufcount); + return -EPROTO; + } + + ghdr = (struct gss_rep_header *) gss_swab_header(msg, 0, swabbed); + if (ghdr == NULL) { + CERROR("unable to extract gss reply header\n"); + return -EPROTO; + } + + if (ghdr->gh_version != PTLRPC_GSS_VERSION) { + CERROR("invalid gss version %u\n", ghdr->gh_version); + return -EPROTO; + } + + if (outlen < (4 + 2) * 4 + round_up(ghdr->gh_handle.len, 4) + + round_up(msg->lm_buflens[2], 4)) { + CERROR("output buffer size %ld too small\n", outlen); + return -EFAULT; + } + + status = 0; + effective = 0; + + if (copy_to_user(outbuf, &status, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, &ghdr->gh_major, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, &ghdr->gh_minor, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, &ghdr->gh_seqwin, 4)) + return -EFAULT; + outbuf += 4; + effective += 4 * 4; + + /* handle */ + obj_len = ghdr->gh_handle.len; + round_len = (obj_len + 3) & ~3; + if (copy_to_user(outbuf, &obj_len, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, (char *) ghdr->gh_handle.data, round_len)) + return -EFAULT; + outbuf += round_len; + effective += 4 + round_len; + + /* out token */ + obj_len = msg->lm_buflens[2]; + round_len = (obj_len + 3) & ~3; + if (copy_to_user(outbuf, &obj_len, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, lustre_msg_buf(msg, 2, 0), round_len)) + return -EFAULT; + outbuf += round_len; + effective += 4 + round_len; + + return effective; +} + +int gss_do_ctx_init_rpc(char __user *buffer, unsigned long count) +{ + struct obd_import *imp, *imp0; + struct ptlrpc_request *req; + struct lgssd_ioctl_param param; + struct obd_device *obd; + char obdname[64]; + long lsize; + int rc; + + if (count != sizeof(param)) { + CERROR("ioctl size %lu, expect %lu, please check lgss_keyring version\n", + count, (unsigned long) sizeof(param)); + RETURN(-EINVAL); + } + if (copy_from_user(¶m, buffer, sizeof(param))) { + CERROR("failed copy data from lgssd\n"); + RETURN(-EFAULT); + } + + if (param.version != GSSD_INTERFACE_VERSION) { + CERROR("gssd interface version %d (expect %d)\n", + param.version, GSSD_INTERFACE_VERSION); + RETURN(-EINVAL); + } + + /* take name */ + if (strncpy_from_user(obdname, (const char __user *)param.uuid, + sizeof(obdname)) <= 0) { + CERROR("Invalid obdname pointer\n"); + RETURN(-EFAULT); + } + + obd = class_name2obd(obdname); + if (!obd) { + CERROR("no such obd %s\n", obdname); + RETURN(-EINVAL); + } + + if (unlikely(!obd->obd_set_up)) { + CERROR("obd %s not setup\n", obdname); + RETURN(-EINVAL); + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + CERROR("obd %s has stopped\n", obdname); + spin_unlock(&obd->obd_dev_lock); + RETURN(-EINVAL); + } + + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + 
strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME)) { + CERROR("obd %s is not a client device\n", obdname); + spin_unlock(&obd->obd_dev_lock); + RETURN(-EINVAL); + } + spin_unlock(&obd->obd_dev_lock); + + with_imp_locked(obd, imp0, rc) + imp = class_import_get(imp0); + if (rc) { + CERROR("obd %s: import has gone\n", obd->obd_name); + RETURN(-EINVAL); + } + + if (imp->imp_deactive) { + CERROR("import has been deactivated\n"); + class_import_put(imp); + RETURN(-EINVAL); + } + + req = ptlrpc_request_alloc_pack(imp, &RQF_SEC_CTX, LUSTRE_OBD_VERSION, + SEC_CTX_INIT); + if (req == NULL) { + param.status = -ENOMEM; + goto out_copy; + } + + if (req->rq_cli_ctx->cc_sec->ps_id != param.secid) { + CWARN("original secid %d, now has changed to %d, cancel this negotiation\n", + param.secid, req->rq_cli_ctx->cc_sec->ps_id); + param.status = -EINVAL; + goto out_copy; + } + + /* get token */ + rc = ctx_init_pack_request(imp, req, + param.lustre_svc, + param.uid, param.gid, + param.send_token_size, + (char __user *)param.send_token); + if (rc) { + param.status = rc; + goto out_copy; + } + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) { + /* If any _real_ denial be made, we expect server return + * -EACCES reply or return success but indicate gss error + * inside reply messsage. All other errors are treated as + * timeout, caller might try the negotiation repeatedly, + * leave recovery decisions to general ptlrpc layer. + * + * FIXME maybe some other error code shouldn't be treated + * as timeout. + */ + param.status = rc; + if (rc != -EACCES) + param.status = -ETIMEDOUT; + goto out_copy; + } + + LASSERT(req->rq_repdata); + lsize = ctx_init_parse_reply(req->rq_repdata, + req_capsule_rep_need_swab(&req->rq_pill), + (char __user *)param.reply_buf, + param.reply_buf_size); + if (lsize < 0) { + param.status = (int) lsize; + goto out_copy; + } + + param.status = 0; + param.reply_length = lsize; + +out_copy: + if (copy_to_user(buffer, ¶m, sizeof(param))) + rc = -EFAULT; + else + rc = 0; + + class_import_put(imp); + ptlrpc_req_finished(req); + RETURN(rc); +} + +int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx) +{ + struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; + struct obd_import *imp = ctx->cc_sec->ps_import; + struct ptlrpc_request *req; + struct ptlrpc_user_desc *pud; + int rc; + ENTRY; + + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (cli_ctx_is_error(ctx) || !cli_ctx_is_uptodate(ctx)) { + CDEBUG(D_SEC, "ctx %p(%u->%s) not uptodate, " + "don't send destroy rpc\n", ctx, + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + RETURN(0); + } + + might_sleep(); + + CWARN("%s ctx %p idx %#llx (%u->%s)\n", + sec_is_reverse(ctx->cc_sec) ? + "server finishing reverse" : "client finishing forward", + ctx, gss_handle_to_u64(&gctx->gc_handle), + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + + gctx->gc_proc = PTLRPC_GSS_PROC_DESTROY; + + req = ptlrpc_request_alloc(imp, &RQF_SEC_CTX); + if (req == NULL) { + CWARN("ctx %p(%u): fail to prepare rpc, destroy locally\n", + ctx, ctx->cc_vcred.vc_uid); + GOTO(out, rc = -ENOMEM); + } + + rc = ptlrpc_request_bufs_pack(req, LUSTRE_OBD_VERSION, SEC_CTX_FINI, + NULL, ctx); + if (rc) + GOTO(out_ref, rc); + + /* fix the user desc */ + if (req->rq_pack_udesc) { + /* we rely the fact that this request is in AUTH mode, + * and user_desc at offset 2. 
*/ + pud = lustre_msg_buf(req->rq_reqbuf, 2, sizeof(*pud)); + LASSERT(pud); + pud->pud_uid = pud->pud_fsuid = ctx->cc_vcred.vc_uid; + pud->pud_gid = pud->pud_fsgid = ctx->cc_vcred.vc_gid; + pud->pud_cap = 0; + pud->pud_ngroups = 0; + } + + req->rq_phase = RQ_PHASE_RPC; + rc = ptl_send_rpc(req, 1); + if (rc) + CWARN("ctx %p(%u->%s): rpc error %d, destroy locally\n", ctx, + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), rc); + +out_ref: + ptlrpc_req_finished(req); +out: + RETURN(rc); +} + +int __init gss_init_cli_upcall(void) +{ + return 0; +} + +void gss_exit_cli_upcall(void) +{ +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c new file mode 100644 index 0000000000000..a07fac77ef8ef --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c @@ -0,0 +1,463 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_krb5_mech.c + * linux/net/sunrpc/gss_krb5_crypto.c + * linux/net/sunrpc/gss_krb5_seal.c + * linux/net/sunrpc/gss_krb5_seqnum.c + * linux/net/sunrpc/gss_krb5_unseal.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include + +#include "gss_internal.h" +#include "gss_crypto.h" + +int gss_keyblock_init(struct gss_keyblock *kb, const char *alg_name, + const int alg_mode) +{ + int rc; + + kb->kb_tfm = crypto_alloc_sync_skcipher(alg_name, alg_mode, 0); + if (IS_ERR(kb->kb_tfm)) { + rc = PTR_ERR(kb->kb_tfm); + kb->kb_tfm = NULL; + CERROR("failed to alloc tfm: %s, mode %d: rc = %d\n", alg_name, + alg_mode, rc); + return rc; + } + + rc = crypto_sync_skcipher_setkey(kb->kb_tfm, kb->kb_key.data, + kb->kb_key.len); + if (rc) { + CERROR("failed to set %s key, len %d, rc = %d\n", alg_name, + kb->kb_key.len, rc); + return rc; + } + + return 0; +} + +void gss_keyblock_free(struct gss_keyblock *kb) +{ + rawobj_free(&kb->kb_key); + if (kb->kb_tfm) + crypto_free_sync_skcipher(kb->kb_tfm); +} + +int gss_keyblock_dup(struct gss_keyblock *new, struct gss_keyblock *kb) +{ + return rawobj_dup(&new->kb_key, &kb->kb_key); +} + +int gss_get_bytes(char **ptr, const char *end, void *res, size_t len) +{ + char *p, *q; + p = *ptr; + q = p + len; + if (q > end || q < p) + return -EINVAL; + memcpy(res, p, len); + *ptr = q; + return 0; +} + +int gss_get_rawobj(char **ptr, const char *end, rawobj_t *res) +{ + char *p, *q; + __u32 len; + + p = *ptr; + if (gss_get_bytes(&p, end, &len, sizeof(len))) + return -EINVAL; + + q = p + len; + if (q > end || q < p) + return -EINVAL; + + /* Support empty objects */ + if (len != 0) { + OBD_ALLOC_LARGE(res->data, len); + if (!res->data) + return -ENOMEM; + } else { + res->len = len; + res->data = NULL; + return 0; + } + + res->len = len; + memcpy(res->data, p, len); + *ptr = q; + return 0; +} + +int gss_get_keyblock(char **ptr, const char *end, + struct gss_keyblock *kb, __u32 keysize) +{ + char *buf; + int rc; + + OBD_ALLOC_LARGE(buf, keysize); + if (buf == NULL) + return -ENOMEM; + + rc = gss_get_bytes(ptr, end, buf, keysize); + if (rc) { + OBD_FREE_LARGE(buf, keysize); + return rc; + } + + kb->kb_key.len = keysize; + kb->kb_key.data = buf; + return 0; +} + +/* + * Should be used for buffers allocated with k/vmalloc(). + * + * Dispose of @sgt with gss_teardown_sgtable(). + * + * @prealloc_sg is to avoid memory allocation inside sg_alloc_table() + * in cases where a single sg is sufficient. No attempt to reduce the + * number of sgs by squeezing physically contiguous pages together is + * made though, for simplicity. + * + * This function is copied from the ceph filesystem code. 
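+ *
+ * A minimal usage sketch (cf. gss_crypt_generic() below):
+ *
+ *	struct scatterlist prealloc_sg;
+ *	struct sg_table sgt;
+ *	int rc;
+ *
+ *	rc = gss_setup_sgtable(&sgt, &prealloc_sg, buf, buf_len);
+ *	if (rc)
+ *		return rc;
+ *	... use sgt.sgl with the crypto API ...
+ *	gss_teardown_sgtable(&sgt);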
+ */ +int gss_setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, + const void *buf, unsigned int buf_len) +{ + struct scatterlist *sg; + const bool is_vmalloc = is_vmalloc_addr(buf); + unsigned int off = offset_in_page(buf); + unsigned int chunk_cnt = 1; + unsigned int chunk_len = PAGE_ALIGN(off + buf_len); + int i; + int rc; + + if (buf_len == 0) { + memset(sgt, 0, sizeof(*sgt)); + return -EINVAL; + } + + if (is_vmalloc) { + chunk_cnt = chunk_len >> PAGE_SHIFT; + chunk_len = PAGE_SIZE; + } + + if (chunk_cnt > 1) { + rc = sg_alloc_table(sgt, chunk_cnt, GFP_NOFS); + if (rc) + return rc; + } else { + WARN_ON_ONCE(chunk_cnt != 1); + sg_init_table(prealloc_sg, 1); + sgt->sgl = prealloc_sg; + sgt->nents = sgt->orig_nents = 1; + } + + for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) { + struct page *page; + unsigned int len = min(chunk_len - off, buf_len); + + if (is_vmalloc) + page = vmalloc_to_page(buf); + else + page = virt_to_page(buf); + + sg_set_page(sg, page, len, off); + + off = 0; + buf += len; + buf_len -= len; + } + + WARN_ON_ONCE(buf_len != 0); + + return 0; +} + +void gss_teardown_sgtable(struct sg_table *sgt) +{ + if (sgt->orig_nents > 1) + sg_free_table(sgt); +} + +int gss_crypt_generic(struct crypto_sync_skcipher *tfm, int decrypt, + const void *iv, const void *in, void *out, size_t length) +{ + struct scatterlist sg; + struct sg_table sg_out; + __u8 local_iv[16] = {0}; + __u32 ret = -EINVAL; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + LASSERT(tfm); + + if (length % crypto_sync_skcipher_blocksize(tfm) != 0) { + CERROR("output length %zu mismatch blocksize %d\n", + length, crypto_sync_skcipher_blocksize(tfm)); + goto out; + } + + if (crypto_sync_skcipher_ivsize(tfm) > ARRAY_SIZE(local_iv)) { + CERROR("iv size too large %d\n", + crypto_sync_skcipher_ivsize(tfm)); + goto out; + } + + if (iv) + memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm)); + + if (in != out) + memmove(out, in, length); + + ret = gss_setup_sgtable(&sg_out, &sg, out, length); + if (ret != 0) + goto out; + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg, &sg, length, local_iv); + + if (decrypt) + ret = crypto_skcipher_decrypt_iv(req, &sg, &sg, length); + else + ret = crypto_skcipher_encrypt_iv(req, &sg, &sg, length); + + skcipher_request_zero(req); + gss_teardown_sgtable(&sg_out); +out: + return ret; +} + +int gss_digest_hash(struct ahash_request *req, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, struct bio_vec *iovs) +{ + struct scatterlist sg[1]; + struct sg_table sgt; + int rc = 0; + int i; + + for (i = 0; i < msgcnt; i++) { + if (msgs[i].len == 0) + continue; + + rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); + if (rc) + return rc; + + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); + gss_teardown_sgtable(&sgt); + if (rc) + return rc; + } + + for (i = 0; i < iovcnt; i++) { + if (iovs[i].bv_len == 0) + continue; + + sg_init_table(sg, 1); + sg_set_page(&sg[0], iovs[i].bv_page, iovs[i].bv_len, + iovs[i].bv_offset); + + ahash_request_set_crypt(req, sg, NULL, iovs[i].bv_len); + rc = crypto_ahash_update(req); + if (rc) + return rc; + } + + if (hdr) { + rc = gss_setup_sgtable(&sgt, sg, hdr->data, hdr->len); + if (rc) + return rc; + + ahash_request_set_crypt(req, sg, NULL, hdr->len); + rc = crypto_ahash_update(req); + gss_teardown_sgtable(&sgt); + if (rc) + return rc; + } + + return rc; +} + +int gss_digest_hash_compat(struct ahash_request *req, 
+ rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, struct bio_vec *iovs) +{ + struct scatterlist sg[1]; + struct sg_table sgt; + int rc = 0; + int i; + + for (i = 0; i < msgcnt; i++) { + if (msgs[i].len == 0) + continue; + + rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); + if (rc) + return rc; + + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); + gss_teardown_sgtable(&sgt); + if (rc) + return rc; + } + + for (i = 0; i < iovcnt; i++) { + if (iovs[i].bv_len == 0) + continue; + + sg_init_table(sg, 1); + sg_set_page(&sg[0], iovs[i].bv_page, iovs[i].bv_len, + iovs[i].bv_offset); + + ahash_request_set_crypt(req, sg, NULL, iovs[i].bv_len); + rc = crypto_ahash_update(req); + if (rc) + return rc; + } + + if (hdr) { + rc = gss_setup_sgtable(&sgt, sg, &(hdr->len), sizeof(hdr->len)); + if (rc) + return rc; + + ahash_request_set_crypt(req, sg, NULL, sizeof(hdr->len)); + rc = crypto_ahash_update(req); + gss_teardown_sgtable(&sgt); + if (rc) + return rc; + } + + return rc; +} + +int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize) +{ + int padding; + + padding = (blocksize - (msg->len & (blocksize - 1))) & + (blocksize - 1); + if (!padding) + return 0; + + if (msg->len + padding > msg_buflen) { + CERROR("bufsize %u too small: datalen %u, padding %u\n", + msg_buflen, msg->len, padding); + return -EINVAL; + } + + memset(msg->data + msg->len, padding, padding); + msg->len += padding; + return 0; +} + +int gss_crypt_rawobjs(struct crypto_sync_skcipher *tfm, __u8 *iv, + int inobj_cnt, rawobj_t *inobjs, rawobj_t *outobj, + int enc) +{ + struct scatterlist src; + struct scatterlist dst; + struct sg_table sg_dst; + struct sg_table sg_src; + __u8 *buf; + __u32 datalen = 0; + int i, rc; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + ENTRY; + + buf = outobj->data; + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + + for (i = 0; i < inobj_cnt; i++) { + LASSERT(buf + inobjs[i].len <= outobj->data + outobj->len); + + rc = gss_setup_sgtable(&sg_src, &src, inobjs[i].data, + inobjs[i].len); + if (rc != 0) + RETURN(rc); + + rc = gss_setup_sgtable(&sg_dst, &dst, buf, + outobj->len - datalen); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + RETURN(rc); + } + + skcipher_request_set_crypt(req, &src, &dst, src.length, iv); + if (!iv) + skcipher_request_set_crypt_iv(req); + + if (enc) + rc = crypto_skcipher_encrypt_iv(req, &dst, &src, + src.length); + else + rc = crypto_skcipher_decrypt_iv(req, &dst, &src, + src.length); + + gss_teardown_sgtable(&sg_src); + gss_teardown_sgtable(&sg_dst); + + if (rc) { + CERROR("encrypt error %d\n", rc); + skcipher_request_zero(req); + RETURN(rc); + } + + datalen += inobjs[i].len; + buf += inobjs[i].len; + } + skcipher_request_zero(req); + + outobj->len = datalen; + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h new file mode 100644 index 0000000000000..7653e2139dbef --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h @@ -0,0 +1,131 @@ +#ifndef PTLRPC_GSS_CRYPTO_H +#define PTLRPC_GSS_CRYPTO_H + +#include + +#include "gss_internal.h" + +#include + +/* + * linux v4.19-rc2-66-gb350bee5ea0f + * crypto: skcipher - Introduce crypto_sync_skcipher + * + * crypto_sync_skcipher will replace crypto_blkcipher so start using + * crypto_sync_skcipher and provide wrappers for older kernels + */ +#ifdef SYNC_SKCIPHER_REQUEST_ON_STACK + +#define 
crypto_skcipher_encrypt_iv(desc, dst, src, blocksize) \ + crypto_skcipher_encrypt((desc)) + +#define crypto_skcipher_decrypt_iv(desc, dst, src, blocksize) \ + crypto_skcipher_decrypt((desc)) + +#define skcipher_request_set_crypt_iv(d) + +#else /* ! SYNC_SKCIPHER_REQUEST_ON_STACK */ + +#ifdef HAVE_CRYPTO_ALLOC_SKCIPHER + +#define crypto_sync_skcipher crypto_skcipher + +#define SYNC_SKCIPHER_REQUEST_ON_STACK SKCIPHER_REQUEST_ON_STACK + +#define skcipher_request_set_sync_tfm skcipher_request_set_tfm + +#define skcipher_request_set_crypt_iv(d) + +#define crypto_sync_skcipher_blocksize crypto_skcipher_blocksize + +#define crypto_sync_skcipher_setkey crypto_skcipher_setkey + +#define crypto_alloc_sync_skcipher crypto_alloc_skcipher + +#define crypto_free_sync_skcipher crypto_free_skcipher + +#define crypto_sync_skcipher_ivsize crypto_skcipher_ivsize + +#define crypto_skcipher_encrypt_iv(desc, dst, src, blocksize) \ + crypto_skcipher_encrypt((desc)) + +#define crypto_skcipher_decrypt_iv(desc, dst, src, blocksize) \ + crypto_skcipher_decrypt((desc)) + +#define skcipher_request_zero(req) /* nop */ + +#else /* ! HAVE_CRYPTO_ALLOC_SKCIPHER */ + +#define crypto_sync_skcipher crypto_blkcipher + +#define SYNC_SKCIPHER_REQUEST_ON_STACK(name, tfm) \ + struct blkcipher_desc __##name##_obj, *name = (void *)&__##name##_obj + +#define skcipher_request_set_sync_tfm(d, _tfm) \ + do { (d)->tfm = _tfm; } while (0) + +#define skcipher_request_set_callback(d, f, c, data) \ + do { (d)->flags = f; } while (0) + +#define skcipher_request_set_crypt(d, src, dst, cryptlen, iv) \ + do { (d)->info = iv; } while (0) + +#define skcipher_request_set_crypt_iv(d) \ + do { (d)->info = crypto_blkcipher_crt((d)->tfm)->iv; } while (0) + +#define crypto_sync_skcipher_blocksize(tfm) \ + crypto_blkcipher_blocksize((tfm)) + +#define crypto_sync_skcipher_setkey(tfm, key, keylen) \ + crypto_blkcipher_setkey((tfm), (key), (keylen)) + +#define crypto_alloc_sync_skcipher(name, type, mask) \ + crypto_alloc_blkcipher((name), (type), (mask)) + +#define crypto_free_sync_skcipher(tfm) \ + crypto_free_blkcipher((tfm)) + +#define crypto_sync_skcipher_ivsize(tfm) \ + crypto_blkcipher_ivsize((tfm)) + +#define crypto_skcipher_encrypt_iv(desc, dst, src, len) \ + crypto_blkcipher_encrypt_iv((desc), (dst), (src), (len)) + +#define crypto_skcipher_decrypt_iv(desc, dst, src, len) \ + crypto_blkcipher_decrypt_iv((desc), (dst), (src), (len)) + +#define skcipher_request_zero(req) /* nop */ + +#endif /* HAVE_CRYPTO_ALLOC_SKCIPHER */ +#endif /* SYNC_SKCIPHER_REQUEST_ON_STACK */ + +struct gss_keyblock { + rawobj_t kb_key; + struct crypto_sync_skcipher *kb_tfm; +}; + +int gss_keyblock_init(struct gss_keyblock *kb, const char *alg_name, + const int alg_mode); +void gss_keyblock_free(struct gss_keyblock *kb); +int gss_keyblock_dup(struct gss_keyblock *new, struct gss_keyblock *kb); +int gss_get_bytes(char **ptr, const char *end, void *res, size_t len); +int gss_get_rawobj(char **ptr, const char *end, rawobj_t *res); +int gss_get_keyblock(char **ptr, const char *end, struct gss_keyblock *kb, + __u32 keysize); +int gss_setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, + const void *buf, unsigned int buf_len); +void gss_teardown_sgtable(struct sg_table *sgt); +int gss_crypt_generic(struct crypto_sync_skcipher *tfm, int decrypt, + const void *iv, const void *in, void *out, size_t length); +int gss_digest_hash(struct ahash_request *req, rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, int iovcnt, + struct bio_vec *iovs); +int 
gss_digest_hash_compat(struct ahash_request *req, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, struct bio_vec *iovs); +int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize); +int gss_crypt_rawobjs(struct crypto_sync_skcipher *tfm, __u8 *iv, + int inobj_cnt, rawobj_t *inobjs, rawobj_t *outobj, + int enc); + +#endif /* PTLRPC_GSS_CRYPTO_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_err.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_err.h new file mode 100644 index 0000000000000..34cd9a422e06b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_err.h @@ -0,0 +1,193 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * Adapted from MIT Kerberos 5-1.2.1 include/gssapi/gssapi.h + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __PTLRPC_GSS_GSS_ERR_H_ +#define __PTLRPC_GSS_GSS_ERR_H_ + +typedef unsigned int OM_uint32; + +/* + * Flag bits for context-level services. + */ +#define GSS_C_DELEG_FLAG (1) +#define GSS_C_MUTUAL_FLAG (2) +#define GSS_C_REPLAY_FLAG (4) +#define GSS_C_SEQUENCE_FLAG (8) +#define GSS_C_CONF_FLAG (16) +#define GSS_C_INTEG_FLAG (32) +#define GSS_C_ANON_FLAG (64) +#define GSS_C_PROT_READY_FLAG (128) +#define GSS_C_TRANS_FLAG (256) + +/* + * Credential usage options + */ +#define GSS_C_BOTH (0) +#define GSS_C_INITIATE (1) +#define GSS_C_ACCEPT (2) + +/* + * Status code types for gss_display_status + */ +#define GSS_C_GSS_CODE (1) +#define GSS_C_MECH_CODE (2) + + +/* + * Define the default Quality of Protection for per-message services. Note + * that an implementation that offers multiple levels of QOP may either reserve + * a value (for example zero, as assumed here) to mean "default protection", or + * alternatively may simply equate GSS_C_QOP_DEFAULT to a specific explicit + * QOP value. However a value of 0 should always be interpreted by a GSSAPI + * implementation as a request for the default protection level. 
+ */ +#define GSS_C_QOP_DEFAULT (0) + +/* + * Expiration time of 2^32-1 seconds means infinite lifetime for a + * credential or security context + */ +#define GSS_C_INDEFINITE ((OM_uint32) 0xfffffffful) + + +/* Major status codes */ + +#define GSS_S_COMPLETE (0) + +/* + * Some "helper" definitions to make the status code macros obvious. + */ +#define GSS_C_CALLING_ERROR_OFFSET (24) +#define GSS_C_ROUTINE_ERROR_OFFSET (16) +#define GSS_C_SUPPLEMENTARY_OFFSET (0) +#define GSS_C_CALLING_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_ROUTINE_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_SUPPLEMENTARY_MASK ((OM_uint32) 0177777ul) + +/* + * The macros that test status codes for error conditions. Note that the + * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now + * evaluates its argument only once. + */ +#define GSS_CALLING_ERROR(x) \ + ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET)) +#define GSS_ROUTINE_ERROR(x) \ + ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)) +#define GSS_SUPPLEMENTARY_INFO(x) \ + ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET)) +#define GSS_ERROR(x) \ + ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \ + (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))) + +/* + * Now the actual status code definitions + */ + +/* + * Calling errors: + */ +#define GSS_S_CALL_INACCESSIBLE_READ \ + (((OM_uint32) 1ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_INACCESSIBLE_WRITE \ + (((OM_uint32) 2ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_BAD_STRUCTURE \ + (((OM_uint32) 3ul) << GSS_C_CALLING_ERROR_OFFSET) + +/* + * Routine errors: + */ +#define GSS_S_BAD_MECH \ + (((OM_uint32) 1ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAME \ + (((OM_uint32) 2ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAMETYPE \ + (((OM_uint32) 3ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_BINDINGS \ + (((OM_uint32) 4ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_STATUS \ + (((OM_uint32) 5ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_SIG \ + (((OM_uint32) 6ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CRED \ + (((OM_uint32) 7ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CONTEXT \ + (((OM_uint32) 8ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_TOKEN \ + (((OM_uint32) 9ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_CREDENTIAL \ + (((OM_uint32) 10ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CREDENTIALS_EXPIRED \ + (((OM_uint32) 11ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CONTEXT_EXPIRED \ + (((OM_uint32) 12ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_FAILURE \ + (((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_QOP \ + (((OM_uint32) 14ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAUTHORIZED \ + (((OM_uint32) 15ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAVAILABLE \ + (((OM_uint32) 16ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DUPLICATE_ELEMENT \ + (((OM_uint32) 17ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NAME_NOT_MN \ + (((OM_uint32) 18ul) << GSS_C_ROUTINE_ERROR_OFFSET) + +/* + * Supplementary info bits: + */ +#define GSS_S_CONTINUE_NEEDED BIT(GSS_C_SUPPLEMENTARY_OFFSET + 0) +#define GSS_S_DUPLICATE_TOKEN BIT(GSS_C_SUPPLEMENTARY_OFFSET + 1) +#define GSS_S_OLD_TOKEN BIT(GSS_C_SUPPLEMENTARY_OFFSET + 2) +#define GSS_S_UNSEQ_TOKEN BIT(GSS_C_SUPPLEMENTARY_OFFSET + 3) +#define GSS_S_GAP_TOKEN BIT(GSS_C_SUPPLEMENTARY_OFFSET + 4) + +/* XXXX these are not part of the 
GSSAPI C bindings! (but should be) */ + +#define GSS_CALLING_ERROR_FIELD(x) \ + (((x) >> GSS_C_CALLING_ERROR_OFFSET) & GSS_C_CALLING_ERROR_MASK) +#define GSS_ROUTINE_ERROR_FIELD(x) \ + (((x) >> GSS_C_ROUTINE_ERROR_OFFSET) & GSS_C_ROUTINE_ERROR_MASK) +#define GSS_SUPPLEMENTARY_INFO_FIELD(x) \ + (((x) >> GSS_C_SUPPLEMENTARY_OFFSET) & GSS_C_SUPPLEMENTARY_MASK) + +/* XXXX This is a necessary evil until the spec is fixed */ +#define GSS_S_CRED_UNAVAIL GSS_S_FAILURE + +#endif /* __PTLRPC_GSS_GSS_ERR_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c new file mode 100644 index 0000000000000..23506f89d67c2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c @@ -0,0 +1,284 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_generic_token.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_krb5.h" +#include "gss_asn1.h" + + +/* TWRITE_STR from gssapiP_generic.h */ +#define TWRITE_STR(ptr, str, len) \ + memcpy((ptr), (char *) (str), (len)); \ + (ptr) += (len); + +/* XXXX this code currently makes the assumption that a mech oid will + never be longer than 127 bytes. This assumption is not inherent in + the interfaces, so the code can be fixed if the OSI namespace + balloons unexpectedly. 
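+   (The mech OID length is emitted as a single short-form DER octet,
+   which can only express lengths of 0-127.)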
*/ + +/* Each token looks like this: + +0x60 tag for APPLICATION 0, SEQUENCE + (constructed, definite-length) + possible multiple bytes, need to parse/generate + 0x06 tag for OBJECT IDENTIFIER + compile-time constant string (assume 1 byte) + compile-time constant string + the ANY containing the application token + bytes 0,1 are the token type + bytes 2,n are the token data + +For the purposes of this abstraction, the token "header" consists of +the sequence tag and length octets, the mech OID DER encoding, and the +first two inner bytes, which indicate the token type. The token +"body" consists of everything else. + +*/ + +static +int der_length_size(int length) +{ + if (length < (1 << 7)) + return 1; + else if (length < (1 << 8)) + return 2; +#if (SIZEOF_INT == 2) + else + return 3; +#else + else if (length < (1 << 16)) + return 3; + else if (length < (1 << 24)) + return 4; + else + return 5; +#endif +} + +static +void der_write_length(unsigned char **buf, int length) +{ + if (length < (1 << 7)) { + *(*buf)++ = (unsigned char) length; + } else { + *(*buf)++ = (unsigned char) (der_length_size(length) + 127); +#if (SIZEOF_INT > 2) + if (length >= (1 << 24)) + *(*buf)++ = (unsigned char) (length >> 24); + if (length >= (1 << 16)) + *(*buf)++ = (unsigned char) ((length >> 16) & 0xff); +#endif + if (length >= (1 << 8)) + *(*buf)++ = (unsigned char) ((length >> 8) & 0xff); + *(*buf)++ = (unsigned char) (length & 0xff); + } +} + +/* + * returns decoded length, or < 0 on failure. Advances buf and + * decrements bufsize + */ +static +int der_read_length(unsigned char **buf, int *bufsize) +{ + unsigned char sf; + int ret; + + if (*bufsize < 1) + return -1; + sf = *(*buf)++; + (*bufsize)--; + if (sf & 0x80) { + if ((sf &= 0x7f) > ((*bufsize) - 1)) + return -1; + if (sf > SIZEOF_INT) + return -1; + ret = 0; + for (; sf; sf--) { + ret = (ret << 8) + (*(*buf)++); + (*bufsize)--; + } + } else { + ret = sf; + } + + return ret; +} + +/* + * returns the length of a token, given the mech oid and the body size + */ +int g_token_size(rawobj_t *mech, unsigned int body_size) +{ + /* set body_size to sequence contents size */ + body_size += 4 + (int) mech->len; /* NEED overflow check */ + return (1 + der_length_size(body_size) + body_size); +} + +/* + * fills in a buffer with the token header. The buffer is assumed to + * be the right size. buf is advanced past the token header + */ +void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf) +{ + *(*buf)++ = 0x60; + der_write_length(buf, 4 + mech->len + body_size); + *(*buf)++ = 0x06; + *(*buf)++ = (unsigned char) mech->len; + TWRITE_STR(*buf, mech->data, ((int) mech->len)); +} + +/* + * Given a buffer containing a token, reads and verifies the token, + * leaving buf advanced past the token header, and setting body_size + * to the number of remaining bytes. Returns 0 on success, + * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the + * mechanism in the token does not match the mech argument. buf and + * *body_size are left unmodified on error. 
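+ *
+ * For illustration, with a 9-byte mech OID (e.g. krb5) and a
+ * 300-byte token body the on-wire header parsed here is
+ *
+ *	0x60 0x82 0x01 0x39	sequence tag, long-form length 0x0139 = 313
+ *	0x06 0x09 <OID octets>	mech OID
+ *	<2 token-type octets>
+ *
+ * where 313 = 4 + 9 + 300, matching what g_make_token_header() emits
+ * and what g_token_size() (1 + 3 + 313 = 317) accounts for.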
+ */ +__u32 g_verify_token_header(rawobj_t *mech, int *body_size, + unsigned char **buf_in, int toksize) +{ + unsigned char *buf = *buf_in; + int seqsize; + rawobj_t toid; + int ret = 0; + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x60) + return (G_BAD_TOK_HEADER); + + if ((seqsize = der_read_length(&buf, &toksize)) < 0) + return(G_BAD_TOK_HEADER); + + if (seqsize != toksize) + return (G_BAD_TOK_HEADER); + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x06) + return (G_BAD_TOK_HEADER); + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + toid.len = *buf++; + + if ((toksize -= toid.len) < 0) + return (G_BAD_TOK_HEADER); + toid.data = buf; + buf += toid.len; + + if (!g_OID_equal(&toid, mech)) + ret = G_WRONG_MECH; + + /* G_WRONG_MECH is not returned immediately because it's more + * important to return G_BAD_TOK_HEADER if the token header is + * in fact bad + */ + if ((toksize -= 2) < 0) + return (G_BAD_TOK_HEADER); + + if (ret) + return (ret); + + if (!ret) { + *buf_in = buf; + *body_size = toksize; + } + + return (ret); +} + +/* + * Given a buffer containing a token, returns a copy of the mech oid in + * the parameter mech. + */ +__u32 g_get_mech_oid(rawobj_t *mech, rawobj_t *in_buf) +{ + unsigned char *buf = in_buf->data; + int len = in_buf->len; + int ret = 0; + int seqsize; + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x60) + return (G_BAD_TOK_HEADER); + + if ((seqsize = der_read_length(&buf, &len)) < 0) + return (G_BAD_TOK_HEADER); + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x06) + return (G_BAD_TOK_HEADER); + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + mech->len = *buf++; + + if ((len -= mech->len) < 0) + return (G_BAD_TOK_HEADER); + OBD_ALLOC_LARGE(mech->data, mech->len); + if (!mech->data) + return (G_BUFFER_ALLOC); + memcpy(mech->data, buf, mech->len); + + return ret; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h new file mode 100644 index 0000000000000..d8302bacfc9d8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h @@ -0,0 +1,509 @@ +/* + * Modified from NFSv4 project for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2016, Intel Corporation. 
+ * + * Author: Eric Mei + */ + +#ifndef __PTLRPC_GSS_GSS_INTERNAL_H_ +#define __PTLRPC_GSS_GSS_INTERNAL_H_ + +#include +#include +#include + +/* + * rawobj stuff + */ +#define NETOBJ_EMPTY ((netobj_t) { 0 }) +#define RAWOBJ_EMPTY ((rawobj_t) { 0, NULL }) + +typedef struct rawobj_buf_s { + __u32 dataoff; + __u32 datalen; + __u32 buflen; + __u8 *buf; +} rawobj_buf_t; + +int rawobj_empty(rawobj_t *obj); +int rawobj_alloc(rawobj_t *obj, char *buf, int len); +void rawobj_free(rawobj_t *obj); +int rawobj_equal(rawobj_t *a, rawobj_t *b); +int rawobj_dup(rawobj_t *dest, rawobj_t *src); +int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj); +int rawobj_from_netobj_alloc(rawobj_t *obj, netobj_t *netobj); + +int buffer_extract_bytes(const void **buf, __u32 *buflen, + void *res, __u32 reslen); + +/* + * several timeout values. client refresh upcall timeout we using + * default in pipefs implemnetation. + */ +#define __TIMEOUT_DELTA (10) + +#define GSS_SECINIT_RPC_TIMEOUT \ + (obd_timeout < __TIMEOUT_DELTA ? \ + __TIMEOUT_DELTA : obd_timeout - __TIMEOUT_DELTA) + +#define GSS_SECFINI_RPC_TIMEOUT (__TIMEOUT_DELTA) +#define GSS_SECSVC_UPCALL_TIMEOUT (GSS_SECINIT_RPC_TIMEOUT) + +/* + * default gc interval + */ +#define GSS_GC_INTERVAL (60 * 60) /* 60 minutes */ + +static inline time64_t gss_round_ctx_expiry(time64_t expiry, + unsigned long sec_flags) +{ + if (sec_flags & PTLRPC_SEC_FL_REVERSE) + return expiry; + + if (ktime_get_real_seconds() + __TIMEOUT_DELTA <= expiry) + return expiry - __TIMEOUT_DELTA; + + return expiry; +} + +/* + * Max encryption element in block cipher algorithms. 
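+ * (16 bytes covers AES; DES and DES3 use 8-byte blocks.)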
+ */ +#define GSS_MAX_CIPHER_BLOCK (16) + +/* + * XXX make it visible of kernel and lgssd/lsvcgssd + */ +enum { + GSSD_INTERFACE_VERSION_V1 = 1, + GSSD_INTERFACE_VERSION_V2 = 2, + GSSD_INTERFACE_VERSION = GSSD_INTERFACE_VERSION_V2, +}; + +#define PTLRPC_GSS_VERSION (1) + + +enum ptlrpc_gss_proc { + PTLRPC_GSS_PROC_DATA = 0, + PTLRPC_GSS_PROC_INIT = 1, + PTLRPC_GSS_PROC_CONTINUE_INIT = 2, + PTLRPC_GSS_PROC_DESTROY = 3, + PTLRPC_GSS_PROC_ERR = 4, +}; + +enum ptlrpc_gss_tgt { + LUSTRE_GSS_TGT_MGS = 0, + LUSTRE_GSS_TGT_MDS = 1, + LUSTRE_GSS_TGT_OSS = 2, +}; + +enum ptlrpc_gss_header_flags { + LUSTRE_GSS_PACK_BULK = 1, + LUSTRE_GSS_PACK_USER = 2, + LUSTRE_GSS_PACK_KCSUM = 4, +}; + +static inline +__u32 import_to_gss_svc(struct obd_import *imp) +{ + int cl_sp_to = LUSTRE_SP_ANY; + + if (imp->imp_obd) + cl_sp_to = imp->imp_obd->u.cli.cl_sp_to; + + switch (cl_sp_to) { + case LUSTRE_SP_MDT: + return LUSTRE_GSS_TGT_MDS; + case LUSTRE_SP_OST: + return LUSTRE_GSS_TGT_OSS; + case LUSTRE_SP_MGC: + case LUSTRE_SP_MGS: + return LUSTRE_GSS_TGT_MGS; + case LUSTRE_SP_CLI: + case LUSTRE_SP_ANY: + default: + return 0; + } +} + +#define PTLRPC_GSS_MAX_HANDLE_SIZE (8) +#define PTLRPC_GSS_HEADER_SIZE (sizeof(struct gss_header) + \ + PTLRPC_GSS_MAX_HANDLE_SIZE) + + +static inline __u64 gss_handle_to_u64(rawobj_t *handle) +{ + if (handle->len != PTLRPC_GSS_MAX_HANDLE_SIZE) + return -1; + return *((__u64 *) handle->data); +} + +#define GSS_SEQ_WIN (2048) +#define GSS_SEQ_WIN_MAIN GSS_SEQ_WIN +#define GSS_SEQ_WIN_BACK (128) +#define GSS_SEQ_REPACK_THRESHOLD (GSS_SEQ_WIN_MAIN / 2 + \ + GSS_SEQ_WIN_MAIN / 4) + +struct gss_svc_seq_data { + spinlock_t ssd_lock; + /* + * highest sequence number seen so far, for main and back window + */ + __u32 ssd_max_main; + __u32 ssd_max_back; + /* + * main and back window + * for i such that ssd_max - GSS_SEQ_WIN < i <= ssd_max, the i-th bit + * of ssd_win is nonzero iff sequence number i has been seen already. 
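+ * The main window tracks GSS_SEQ_WIN_MAIN (2048) sequence numbers,
+ * the back window GSS_SEQ_WIN_BACK (128); see gss_check_seq_num().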
+ */ + unsigned long ssd_win_main[GSS_SEQ_WIN_MAIN/BITS_PER_LONG]; + unsigned long ssd_win_back[GSS_SEQ_WIN_BACK/BITS_PER_LONG]; +}; + +struct gss_svc_ctx { + struct gss_ctx *gsc_mechctx; + struct gss_svc_seq_data gsc_seqdata; + rawobj_t gsc_rvs_hdl; + __u32 gsc_rvs_seq; + uid_t gsc_uid; + gid_t gsc_gid; + uid_t gsc_mapped_uid; + unsigned int gsc_usr_root:1, + gsc_usr_mds:1, + gsc_usr_oss:1, + gsc_remote:1, + gsc_reverse:1; +}; + +struct gss_svc_reqctx { + struct ptlrpc_svc_ctx src_base; + /* + * context + */ + struct gss_wire_ctx src_wirectx; + struct gss_svc_ctx *src_ctx; + /* + * record place of bulk_sec_desc in request/reply buffer + */ + struct ptlrpc_bulk_sec_desc *src_reqbsd; + int src_reqbsd_size; + struct ptlrpc_bulk_sec_desc *src_repbsd; + int src_repbsd_size; + /* + * flags + */ + unsigned int src_init:1, + src_init_continue:1, + src_err_notify:1; + int src_reserve_len; +}; + +struct gss_cli_ctx { + struct ptlrpc_cli_ctx gc_base; + __u32 gc_flavor; + __u32 gc_proc; + __u32 gc_win; + atomic_t gc_seq; + rawobj_t gc_handle; + struct gss_ctx *gc_mechctx; + /* handle for the buddy svc ctx */ + rawobj_t gc_svc_handle; +}; + +struct gss_cli_ctx_keyring { + struct gss_cli_ctx gck_base; + struct key *gck_key; + struct timer_list gck_timer; +}; + +struct gss_sec { + struct ptlrpc_sec gs_base; + struct gss_api_mech *gs_mech; + spinlock_t gs_lock; + __u64 gs_rvs_hdl; +}; + +struct gss_sec_pipefs { + struct gss_sec gsp_base; + int gsp_chash_size; /* must be 2^n */ + struct hlist_head gsp_chash[0]; +}; + +/* + * FIXME cleanup the keyring upcall mutexes + */ +#define HAVE_KEYRING_UPCALL_SERIALIZED 1 + +struct gss_sec_keyring { + struct gss_sec gsk_base; + /* + * all contexts listed here. access is protected by sec spinlock. + */ + struct hlist_head gsk_clist; + /* + * specially point to root ctx (only one at a time). access is + * protected by sec spinlock. + */ + struct ptlrpc_cli_ctx *gsk_root_ctx; + /* + * specially serialize upcalls for root context. + */ + struct mutex gsk_root_uc_lock; + +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + struct mutex gsk_uc_lock; /* serialize upcalls */ +#endif +}; + +static inline struct gss_cli_ctx *ctx2gctx(struct ptlrpc_cli_ctx *ctx) +{ + return container_of(ctx, struct gss_cli_ctx, gc_base); +} + +static inline +struct gss_cli_ctx_keyring *ctx2gctx_keyring(struct ptlrpc_cli_ctx *ctx) +{ + return container_of(ctx2gctx(ctx), + struct gss_cli_ctx_keyring, gck_base); +} + +static inline struct gss_sec *sec2gsec(struct ptlrpc_sec *sec) +{ + return container_of(sec, struct gss_sec, gs_base); +} + +static inline struct gss_sec_pipefs *sec2gsec_pipefs(struct ptlrpc_sec *sec) +{ + return container_of(sec2gsec(sec), struct gss_sec_pipefs, gsp_base); +} + +static inline struct gss_sec_keyring *sec2gsec_keyring(struct ptlrpc_sec *sec) +{ + return container_of(sec2gsec(sec), struct gss_sec_keyring, gsk_base); +} + +#ifdef HAVE_CACHE_HASH_SPINLOCK +# define sunrpc_cache_lookup(c, i, h) sunrpc_cache_lookup_rcu((c), (i), (h)) +# define cache_read_lock(cdetail) spin_lock(&((cdetail)->hash_lock)) +# define cache_read_unlock(cdetail) spin_unlock(&((cdetail)->hash_lock)) +#else /* ! HAVE_CACHE_HASH_SPINLOCK */ +# define cache_read_lock(cdetail) read_lock(&((cdetail)->hash_lock)) +# define cache_read_unlock(cdetail) read_unlock(&((cdetail)->hash_lock)) +#endif + +#define GSS_CTX_INIT_MAX_LEN (1024) + +/* + * This only guaranteed be enough for current krb5 des-cbc-crc . We might + * adjust this when new enc type or mech added in. 
+ */ +#define GSS_PRIVBUF_PREFIX_LEN (32) +#define GSS_PRIVBUF_SUFFIX_LEN (32) + +static inline +struct gss_svc_reqctx *gss_svc_ctx2reqctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(ctx); + return container_of(ctx, struct gss_svc_reqctx, src_base); +} + +static inline +struct gss_svc_ctx *gss_svc_ctx2gssctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(ctx); + return gss_svc_ctx2reqctx(ctx)->src_ctx; +} + +/* sec_gss.c */ +int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred); +int gss_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize); +int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); + +int gss_sec_install_rctx(struct obd_import *imp, struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); +int gss_alloc_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int msgsize); +void gss_free_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req); +int gss_alloc_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int msgsize); +void gss_free_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req); +int gss_enlarge_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int segment, int newsize); + +int gss_svc_accept(struct ptlrpc_sec_policy *policy, + struct ptlrpc_request *req); +void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx); +int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen); +int gss_svc_authorize(struct ptlrpc_request *req); +void gss_svc_free_rs(struct ptlrpc_reply_state *rs); +void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx); + +int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx); +int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx); + +int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx, + struct ptlrpc_svc_ctx *svc_ctx); + +struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment, + int swabbed); +netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment); + +void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx); +int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor); +int gss_check_seq_num(struct gss_svc_seq_data *sd, __u32 seq_num, int set); + +int gss_sec_create_common(struct gss_sec *gsec, + struct ptlrpc_sec_policy *policy, + struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *sf); +void gss_sec_destroy_common(struct gss_sec *gsec); +void gss_sec_kill(struct ptlrpc_sec *sec); + +int gss_cli_ctx_init_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_ctx_ops *ctxops, + struct vfs_cred *vcred); +int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); + +void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize); + +/* gss_keyring.c */ +#ifndef HAVE_GSS_KEYRING +static inline int __init gss_init_keyring(void) { return 0; } +static inline void __exit gss_exit_keyring(void) { return; } +#else +int __init gss_init_keyring(void); +void __exit gss_exit_keyring(void); +#endif +extern unsigned int gss_check_upcall_ns; + +/* gss_pipefs.c */ +#ifndef HAVE_GSS_PIPEFS +static inline int __init gss_init_pipefs(void) { return 0; } +static inline void __exit gss_exit_pipefs(void) { return; } +#else +int __init gss_init_pipefs(void); +void __exit gss_exit_pipefs(void); +#endif + +/* gss_bulk.c */ +int 
gss_cli_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + +/* gss_generic_token.c */ +int g_token_size(rawobj_t *mech, unsigned int body_size); +void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf); +__u32 g_verify_token_header(rawobj_t *mech, int *body_size, + unsigned char **buf_in, int toksize); + + +/* gss_cli_upcall.c */ +int gss_do_ctx_init_rpc(char __user *buffer, unsigned long count); +int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx); + +int __init gss_init_cli_upcall(void); +void gss_exit_cli_upcall(void); + +/* gss_svc_upcall.c */ +__u64 gss_get_next_ctx_index(void); +int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx); +int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle); +int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx); +int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq); +int gss_svc_upcall_handle_init(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + struct obd_device *target, + __u32 lustre_svc, + rawobj_t *rvs_hdl, + rawobj_t *in_token); +struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req, + struct gss_wire_ctx *gw); +void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx); +void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx); + +int __init gss_init_svc_upcall(void); +void gss_exit_svc_upcall(void); +extern unsigned int krb5_allow_old_client_csum; + +/* lproc_gss.c */ +void gss_stat_oos_record_cli(int behind); +void gss_stat_oos_record_svc(int phase, int replay); + +int __init gss_init_tunables(void); +void gss_exit_tunables(void); + +/* gss_null_mech.c */ +int __init init_null_module(void); +void cleanup_null_module(void); + +/* gss_krb5_mech.c */ +int __init init_kerberos_module(void); +void cleanup_kerberos_module(void); + +/* gss_sk_mech.c */ +#ifdef HAVE_OPENSSL_SSK +int __init init_sk_module(void); +void cleanup_sk_module(void); +#else +static inline int init_sk_module(void) { return 0; } +static inline void cleanup_sk_module(void) { return; } +#endif /* HAVE_OPENSSL_SSK */ + +/* debug */ +static inline +void __dbg_memdump(char *name, void *ptr, int size) +{ + char *buf, *p = (char *) ptr; + int bufsize = size * 2 + 1, i; + + OBD_ALLOC(buf, bufsize); + if (!buf) { + CDEBUG(D_ERROR, "DUMP ERROR: can't alloc %d bytes\n", bufsize); + return; + } + + for (i = 0; i < size; i++) + sprintf(&buf[i+i], "%02x", (__u8) p[i]); + buf[size + size] = '\0'; + LCONSOLE_INFO("DUMP %s@%p(%d): %s\n", name, ptr, size, buf); + OBD_FREE(buf, bufsize); +} + +static inline unsigned int ll_read_key_usage(struct key *key) +{ +#ifdef HAVE_KEY_USAGE_REFCOUNT + return refcount_read(&key->usage); +#else + return atomic_read(&key->usage); +#endif +} + +#endif /* __PTLRPC_GSS_GSS_INTERNAL_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c new file mode 100644 index 0000000000000..124ebe1dc15f7 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c @@ -0,0 +1,1652 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/gss/gss_keyring.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +#ifdef HAVE_GET_REQUEST_KEY_AUTH +#include +#endif + +static struct ptlrpc_sec_policy gss_policy_keyring; +static struct ptlrpc_ctx_ops gss_keyring_ctxops; +static struct key_type gss_key_type; + +static int sec_install_rctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_svc_ctx *svc_ctx); +static void request_key_unlink(struct key *key); + +/* + * the timeout is only for the case that upcall child process die abnormally. + * in any other cases it should finally update kernel key. + * + * FIXME we'd better to incorporate the client & server side upcall timeouts + * into the framework of Adaptive Timeouts, but we need to figure out how to + * make sure that kernel knows the upcall processes is in-progress or died + * unexpectedly. 
+ */ +#define KEYRING_UPCALL_TIMEOUT (obd_timeout + obd_timeout) + +/* Check caller's namespace in gss_keyring upcall */ +unsigned int gss_check_upcall_ns = 1; + +/**************************************** + * internal helpers * + ****************************************/ + +static inline void keyring_upcall_lock(struct gss_sec_keyring *gsec_kr) +{ +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_lock(&gsec_kr->gsk_uc_lock); +#endif +} + +static inline void keyring_upcall_unlock(struct gss_sec_keyring *gsec_kr) +{ +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_unlock(&gsec_kr->gsk_uc_lock); +#endif +} + +static inline void key_revoke_locked(struct key *key) +{ + set_bit(KEY_FLAG_REVOKED, &key->flags); +} + +static void ctx_upcall_timeout_kr(cfs_timer_cb_arg_t data) +{ + struct gss_cli_ctx_keyring *gctx_kr = cfs_from_timer(gctx_kr, + data, gck_timer); + struct ptlrpc_cli_ctx *ctx = &(gctx_kr->gck_base.gc_base); + struct key *key = gctx_kr->gck_key; + + CWARN("ctx %p, key %p\n", ctx, key); + + LASSERT(key); + + cli_ctx_expire(ctx); + key_revoke_locked(key); +} + +static void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, time64_t timeout) +{ + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + struct timer_list *timer = &gctx_kr->gck_timer; + + LASSERT(timer); + + CDEBUG(D_SEC, "ctx %p: start timer %llds\n", ctx, timeout); + + cfs_timer_setup(timer, ctx_upcall_timeout_kr, + (unsigned long)gctx_kr, 0); + timer->expires = cfs_time_seconds(timeout) + jiffies; + add_timer(timer); +} + +/* + * caller should make sure no race with other threads + */ +static +void ctx_clear_timer_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + struct timer_list *timer = &gctx_kr->gck_timer; + + CDEBUG(D_SEC, "ctx %p, key %p\n", ctx, gctx_kr->gck_key); + + del_singleshot_timer_sync(timer); +} + +static +struct ptlrpc_cli_ctx *ctx_create_kr(struct ptlrpc_sec *sec, + struct vfs_cred *vcred) +{ + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx_keyring *gctx_kr; + + OBD_ALLOC_PTR(gctx_kr); + if (gctx_kr == NULL) + return NULL; + + cfs_timer_setup(&gctx_kr->gck_timer, NULL, 0, 0); + + ctx = &gctx_kr->gck_base.gc_base; + + if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred)) { + OBD_FREE_PTR(gctx_kr); + return NULL; + } + + ctx->cc_expire = ktime_get_real_seconds() + KEYRING_UPCALL_TIMEOUT; + clear_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags); + atomic_inc(&ctx->cc_refcount); /* for the caller */ + + return ctx; +} + +static void ctx_destroy_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + + CDEBUG(D_SEC, "destroying ctx %p\n", ctx); + + /* at this time the association with key has been broken. 
*/ + LASSERT(sec); + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + LASSERT(gctx_kr->gck_key == NULL); + + ctx_clear_timer_kr(ctx); + + if (gss_cli_ctx_fini_common(sec, ctx)) + return; + + OBD_FREE_PTR(gctx_kr); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static void ctx_release_kr(struct ptlrpc_cli_ctx *ctx, int sync) +{ + if (sync) { + ctx_destroy_kr(ctx); + } else { + atomic_inc(&ctx->cc_refcount); + sptlrpc_gc_add_ctx(ctx); + } +} + +static void ctx_put_kr(struct ptlrpc_cli_ctx *ctx, int sync) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (atomic_dec_and_test(&ctx->cc_refcount)) + ctx_release_kr(ctx, sync); +} + +/* + * key <-> ctx association and rules: + * - ctx might not bind with any key + * - key/ctx binding is protected by key semaphore (if the key present) + * - key and ctx each take a reference of the other + * - ctx enlist/unlist is protected by ctx spinlock + * - never enlist a ctx after it's been unlisted + * - whoever do enlist should also do bind, lock key before enlist: + * - lock key -> lock ctx -> enlist -> unlock ctx -> bind -> unlock key + * - whoever do unlist should also do unbind: + * - lock key -> lock ctx -> unlist -> unlock ctx -> unbind -> unlock key + * - lock ctx -> unlist -> unlock ctx -> lock key -> unbind -> unlock key + */ + +static inline void spin_lock_if(spinlock_t *lock, int condition) +{ + if (condition) + spin_lock(lock); +} + +static inline void spin_unlock_if(spinlock_t *lock, int condition) +{ + if (condition) + spin_unlock(lock); +} + +static void ctx_enlist_kr(struct ptlrpc_cli_ctx *ctx, int is_root, int locked) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + LASSERT(!test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + spin_lock_if(&sec->ps_lock, !locked); + + atomic_inc(&ctx->cc_refcount); + set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + hlist_add_head(&ctx->cc_cache, &gsec_kr->gsk_clist); + if (is_root) + gsec_kr->gsk_root_ctx = ctx; + + spin_unlock_if(&sec->ps_lock, !locked); +} + +/* + * Note after this get called, caller should not access ctx again because + * it might have been freed, unless caller hold at least one refcount of + * the ctx. + * + * return non-zero if we indeed unlist this ctx. + */ +static int ctx_unlist_kr(struct ptlrpc_cli_ctx *ctx, int locked) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + /* if hashed bit has gone, leave the job to somebody who is doing it */ + if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0) + return 0; + + /* drop ref inside spin lock to prevent race with other operations */ + spin_lock_if(&sec->ps_lock, !locked); + + if (gsec_kr->gsk_root_ctx == ctx) + gsec_kr->gsk_root_ctx = NULL; + hlist_del_init(&ctx->cc_cache); + atomic_dec(&ctx->cc_refcount); + + spin_unlock_if(&sec->ps_lock, !locked); + + return 1; +} + +/* + * Get specific payload. Newer kernels support 4 slots. + */ +static void * +key_get_payload(struct key *key, unsigned int index) +{ + void *key_ptr = NULL; + +#ifdef HAVE_KEY_PAYLOAD_DATA_ARRAY + key_ptr = key->payload.data[index]; +#else + if (!index) + key_ptr = key->payload.data; +#endif + return key_ptr; +} + +/* + * Set specific payload. Newer kernels support 4 slots. 
+ */ +static int key_set_payload(struct key *key, unsigned int index, + struct ptlrpc_cli_ctx *ctx) +{ + int rc = -EINVAL; + +#ifdef HAVE_KEY_PAYLOAD_DATA_ARRAY + if (index < 4) { + key->payload.data[index] = ctx; +#else + if (!index) { + key->payload.data = ctx; +#endif + rc = 0; + } + return rc; +} + +/* + * bind a key with a ctx together. + * caller must hold write lock of the key, as well as ref on key & ctx. + */ +static void bind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ll_read_key_usage(key) > 0); + LASSERT(ctx2gctx_keyring(ctx)->gck_key == NULL); + LASSERT(!key_get_payload(key, 0)); + + /* at this time context may or may not in list. */ + key_get(key); + atomic_inc(&ctx->cc_refcount); + ctx2gctx_keyring(ctx)->gck_key = key; + LASSERT(!key_set_payload(key, 0, ctx)); +} + +/* + * unbind a key and a ctx. + * caller must hold write lock, as well as a ref of the key. + */ +static void unbind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(key_get_payload(key, 0) == ctx); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + + /* must revoke the key, or others may treat it as newly created */ + key_revoke_locked(key); + + key_set_payload(key, 0, NULL); + ctx2gctx_keyring(ctx)->gck_key = NULL; + + /* once ctx get split from key, the timer is meaningless */ + ctx_clear_timer_kr(ctx); + + ctx_put_kr(ctx, 1); + key_put(key); +} + +/* + * given a ctx, unbind with its coupled key, if any. + * unbind could only be called once, so we don't worry the key be released + * by someone else. + */ +static void unbind_ctx_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct key *key = ctx2gctx_keyring(ctx)->gck_key; + + if (key) { + LASSERT(key_get_payload(key, 0) == ctx); + + key_get(key); + down_write(&key->sem); + unbind_key_ctx(key, ctx); + up_write(&key->sem); + key_put(key); + request_key_unlink(key); + } +} + +/* + * given a key, unbind with its coupled ctx, if any. + * caller must hold write lock, as well as a ref of the key. + */ +static void unbind_key_locked(struct key *key) +{ + struct ptlrpc_cli_ctx *ctx = key_get_payload(key, 0); + + if (ctx) + unbind_key_ctx(key, ctx); +} + +/* + * unlist a ctx, and unbind from coupled key + */ +static void kill_ctx_kr(struct ptlrpc_cli_ctx *ctx) +{ + if (ctx_unlist_kr(ctx, 0)) + unbind_ctx_kr(ctx); +} + +/* + * given a key, unlist and unbind with the coupled ctx (if any). + * caller must hold write lock, as well as a ref of the key. + */ +static void kill_key_locked(struct key *key) +{ + struct ptlrpc_cli_ctx *ctx = key_get_payload(key, 0); + + if (ctx && ctx_unlist_kr(ctx, 0)) + unbind_key_locked(key); +} + +/* + * caller should hold one ref on contexts in freelist. + */ +static void dispose_ctx_list_kr(struct hlist_head *freelist) +{ + struct hlist_node *next; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx; + + hlist_for_each_entry_safe(ctx, next, freelist, cc_cache) { + hlist_del_init(&ctx->cc_cache); + + /* reverse ctx: update current seq to buddy svcctx if exist. + * ideally this should be done at gss_cli_ctx_finalize(), but + * the ctx destroy could be delayed by: + * 1) ctx still has reference; + * 2) ctx destroy is asynchronous; + * and reverse import call inval_all_ctx() require this be done + * _immediately_ otherwise newly created reverse ctx might copy + * the very old sequence number from svcctx. 
*/ + gctx = ctx2gctx(ctx); + if (!rawobj_empty(&gctx->gc_svc_handle) && + sec_is_reverse(gctx->gc_base.cc_sec)) { + gss_svc_upcall_update_sequence(&gctx->gc_svc_handle, + (__u32) atomic_read(&gctx->gc_seq)); + } + + /* we need to wakeup waiting reqs here. the context might + * be forced released before upcall finished, then the + * late-arrived downcall can't find the ctx even. */ + sptlrpc_cli_ctx_wakeup(ctx); + + unbind_ctx_kr(ctx); + ctx_put_kr(ctx, 0); + } +} + +/* + * lookup a root context directly in a sec, return root ctx with a + * reference taken or NULL. + */ +static +struct ptlrpc_cli_ctx * sec_lookup_root_ctx_kr(struct ptlrpc_sec *sec) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct ptlrpc_cli_ctx *ctx = NULL; + + spin_lock(&sec->ps_lock); + + ctx = gsec_kr->gsk_root_ctx; + + if (ctx == NULL && unlikely(sec_is_reverse(sec))) { + struct ptlrpc_cli_ctx *tmp; + + /* reverse ctx, search root ctx in list, choose the one + * with shortest expire time, which is most possibly have + * an established peer ctx at client side. */ + hlist_for_each_entry(tmp, &gsec_kr->gsk_clist, cc_cache) { + if (ctx == NULL || ctx->cc_expire == 0 || + ctx->cc_expire > tmp->cc_expire) { + ctx = tmp; + /* promote to be root_ctx */ + gsec_kr->gsk_root_ctx = ctx; + } + } + } + + if (ctx) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(!hlist_empty(&gsec_kr->gsk_clist)); + atomic_inc(&ctx->cc_refcount); + } + + spin_unlock(&sec->ps_lock); + + return ctx; +} + +#define RVS_CTX_EXPIRE_NICE (10) + +static +void rvs_sec_install_root_ctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *new_ctx, + struct key *key) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct ptlrpc_cli_ctx *ctx; + time64_t now; + + ENTRY; + LASSERT(sec_is_reverse(sec)); + + spin_lock(&sec->ps_lock); + + now = ktime_get_real_seconds(); + + /* set all existing ctxs short expiry */ + hlist_for_each_entry(ctx, &gsec_kr->gsk_clist, cc_cache) { + if (ctx->cc_expire > now + RVS_CTX_EXPIRE_NICE) { + ctx->cc_early_expire = 1; + ctx->cc_expire = now + RVS_CTX_EXPIRE_NICE; + } + } + + /* if there's root_ctx there, instead obsolete the current + * immediately, we leave it continue operating for a little while. + * hopefully when the first backward rpc with newest ctx send out, + * the client side already have the peer ctx well established. */ + ctx_enlist_kr(new_ctx, gsec_kr->gsk_root_ctx ? 
0 : 1, 1); + + if (key) + bind_key_ctx(key, new_ctx); + + spin_unlock(&sec->ps_lock); +} + +static void construct_key_desc(void *buf, int bufsize, + struct ptlrpc_sec *sec, uid_t uid) +{ + snprintf(buf, bufsize, "%d@%x", uid, sec->ps_id); + ((char *)buf)[bufsize - 1] = '\0'; +} + +/**************************************** + * sec apis * + ****************************************/ + +static +struct ptlrpc_sec * gss_sec_create_kr(struct obd_import *imp, + struct ptlrpc_svc_ctx *svcctx, + struct sptlrpc_flavor *sf) +{ + struct gss_sec_keyring *gsec_kr; + ENTRY; + + OBD_ALLOC(gsec_kr, sizeof(*gsec_kr)); + if (gsec_kr == NULL) + RETURN(NULL); + + INIT_HLIST_HEAD(&gsec_kr->gsk_clist); + gsec_kr->gsk_root_ctx = NULL; + mutex_init(&gsec_kr->gsk_root_uc_lock); +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_init(&gsec_kr->gsk_uc_lock); +#endif + + if (gss_sec_create_common(&gsec_kr->gsk_base, &gss_policy_keyring, + imp, svcctx, sf)) + goto err_free; + + if (svcctx != NULL && + sec_install_rctx_kr(&gsec_kr->gsk_base.gs_base, svcctx)) { + gss_sec_destroy_common(&gsec_kr->gsk_base); + goto err_free; + } + + RETURN(&gsec_kr->gsk_base.gs_base); + +err_free: + OBD_FREE(gsec_kr, sizeof(*gsec_kr)); + RETURN(NULL); +} + +static +void gss_sec_destroy_kr(struct ptlrpc_sec *sec) +{ + struct gss_sec *gsec = sec2gsec(sec); + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + CDEBUG(D_SEC, "destroy %s@%p\n", sec->ps_policy->sp_name, sec); + + LASSERT(hlist_empty(&gsec_kr->gsk_clist)); + LASSERT(gsec_kr->gsk_root_ctx == NULL); + + gss_sec_destroy_common(gsec); + + OBD_FREE(gsec_kr, sizeof(*gsec_kr)); +} + +static inline int user_is_root(struct ptlrpc_sec *sec, struct vfs_cred *vcred) +{ + /* except the ROOTONLY flag, treat it as root user only if real uid + * is 0, euid/fsuid being 0 are handled as setuid scenarios */ + if (sec_is_rootonly(sec) || (vcred->vc_uid == 0)) + return 1; + else + return 0; +} + +/* + * kernel 5.3: commit 0f44e4d976f96c6439da0d6717238efa4b91196e + * keys: Move the user and user-session keyrings to the user_namespace + * + * When lookup_user_key is available use the kernel API rather than directly + * accessing the uid_keyring and session_keyring via the current process + * credentials. + */ +#ifdef HAVE_LOOKUP_USER_KEY + +/* from Linux security/keys/internal.h: */ +#ifndef KEY_LOOKUP_FOR_UNLINK +#define KEY_LOOKUP_FOR_UNLINK 0x04 +#endif + +static struct key *_user_key(key_serial_t id) +{ + key_ref_t ref; + + might_sleep(); + ref = lookup_user_key(id, KEY_LOOKUP_FOR_UNLINK, 0); + if (IS_ERR(ref)) + return NULL; + return key_ref_to_ptr(ref); +} + +static inline struct key *get_user_session_keyring(const struct cred *cred) +{ + return _user_key(KEY_SPEC_USER_SESSION_KEYRING); +} + +static inline struct key *get_user_keyring(const struct cred *cred) +{ + return _user_key(KEY_SPEC_USER_KEYRING); +} +#else +static inline struct key *get_user_session_keyring(const struct cred *cred) +{ + return key_get(cred->user->session_keyring); +} + +static inline struct key *get_user_keyring(const struct cred *cred) +{ + return key_get(cred->user->uid_keyring); +} +#endif + +/* + * unlink request key from it's ring, which is linked during request_key(). + * sadly, we have to 'guess' which keyring it's linked to. + * + * FIXME this code is fragile, it depends on how request_key() is implemented. 
+ */ +static void request_key_unlink(struct key *key) +{ + const struct cred *cred = current_cred(); + struct key *ring = NULL; + + switch (cred->jit_keyring) { + case KEY_REQKEY_DEFL_DEFAULT: + case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: +#ifdef HAVE_GET_REQUEST_KEY_AUTH + if (cred->request_key_auth) { + struct request_key_auth *rka; + struct key *authkey = cred->request_key_auth; + + down_read(&authkey->sem); + rka = get_request_key_auth(authkey); + if (!test_bit(KEY_FLAG_REVOKED, &authkey->flags)) + ring = key_get(rka->dest_keyring); + up_read(&authkey->sem); + if (ring) + break; + } +#endif + fallthrough; + case KEY_REQKEY_DEFL_THREAD_KEYRING: + ring = key_get(cred->thread_keyring); + if (ring) + break; + fallthrough; + case KEY_REQKEY_DEFL_PROCESS_KEYRING: + ring = key_get(cred->process_keyring); + if (ring) + break; + fallthrough; + case KEY_REQKEY_DEFL_SESSION_KEYRING: + rcu_read_lock(); + ring = key_get(rcu_dereference(cred->session_keyring)); + rcu_read_unlock(); + if (ring) + break; + fallthrough; + case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: + ring = get_user_session_keyring(cred); + break; + case KEY_REQKEY_DEFL_USER_KEYRING: + ring = get_user_keyring(cred); + break; + case KEY_REQKEY_DEFL_GROUP_KEYRING: + default: + LBUG(); + } + + LASSERT(ring); + key_unlink(ring, key); + key_put(ring); +} + +static +struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_kr(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct obd_import *imp = sec->ps_import; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct ptlrpc_cli_ctx *ctx = NULL; + unsigned int is_root = 0, create_new = 0; + struct key *key; + char desc[24]; + char *coinfo; + int coinfo_size; + const char *sec_part_flags = ""; + char svc_flag = '-'; + pid_t caller_pid; + ENTRY; + + LASSERT(imp != NULL); + + is_root = user_is_root(sec, vcred); + + /* a little bit optimization for root context */ + if (is_root) { + ctx = sec_lookup_root_ctx_kr(sec); + /* + * Only lookup directly for REVERSE sec, which should + * always succeed. + */ + if (ctx || sec_is_reverse(sec)) + RETURN(ctx); + } + + LASSERT(create != 0); + + /* for root context, obtain lock and check again, this time hold + * the root upcall lock, make sure nobody else populated new root + * context after last check. + */ + if (is_root) { + mutex_lock(&gsec_kr->gsk_root_uc_lock); + + ctx = sec_lookup_root_ctx_kr(sec); + if (ctx) + goto out; + + /* update reverse handle for root user */ + sec2gsec(sec)->gs_rvs_hdl = gss_get_next_ctx_index(); + + switch (sec->ps_part) { + case LUSTRE_SP_MDT: + sec_part_flags = "m"; + break; + case LUSTRE_SP_OST: + sec_part_flags = "o"; + break; + case LUSTRE_SP_MGC: + sec_part_flags = "rmo"; + break; + case LUSTRE_SP_CLI: + sec_part_flags = "r"; + break; + case LUSTRE_SP_MGS: + default: + LBUG(); + } + + switch (SPTLRPC_FLVR_SVC(sec->ps_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + svc_flag = 'n'; + break; + case SPTLRPC_SVC_AUTH: + svc_flag = 'a'; + break; + case SPTLRPC_SVC_INTG: + svc_flag = 'i'; + break; + case SPTLRPC_SVC_PRIV: + svc_flag = 'p'; + break; + default: + LBUG(); + } + } + + /* in case of setuid, key will be constructed as owner of fsuid/fsgid, + * but we do authentication based on real uid/gid. the key permission + * bits will be exactly as POS_ALL, so only processes who subscribed + * this key could have the access, although the quota might be counted + * on others (fsuid/fsgid). + * + * keyring will use fsuid/fsgid as upcall parameters, so we have to + * encode real uid/gid into callout info. 
+ */ + + /* But first we need to make sure the obd type is supported */ + if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_MGC_NAME) && + strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_OSP_NAME)) { + CERROR("obd %s is not a supported device\n", + imp->imp_obd->obd_name); + GOTO(out, ctx = NULL); + } + + construct_key_desc(desc, sizeof(desc), sec, vcred->vc_uid); + + /* callout info format: + * secid:mech:uid:gid:sec_flags:svc_flag:svc_type:peer_nid:target_uuid: + * self_nid:pid + */ + coinfo_size = sizeof(struct obd_uuid) + MAX_OBD_NAME + 64; + OBD_ALLOC(coinfo, coinfo_size); + if (coinfo == NULL) + goto out; + + /* Last callout parameter is pid of process whose namespace will be used + * for credentials' retrieval. + */ + if (gss_check_upcall_ns) { + /* For user's credentials (in which case sec_part_flags is + * empty), use current PID instead of import's reference + * PID to get reference namespace. + */ + if (sec_part_flags[0] == '\0') + caller_pid = current->pid; + else + caller_pid = imp->imp_sec_refpid; + } else { + /* Do not switch namespace in gss keyring upcall. */ + caller_pid = 0; + } + snprintf(coinfo, coinfo_size, "%d:%s:%u:%u:%s:%c:%d:%#llx:%s:%#llx:%d", + sec->ps_id, sec2gsec(sec)->gs_mech->gm_name, + vcred->vc_uid, vcred->vc_gid, + sec_part_flags, svc_flag, import_to_gss_svc(imp), + lnet_nid_to_nid4(&imp->imp_connection->c_peer.nid), + imp->imp_obd->obd_name, + LNetPrimaryNID(lnet_nid_to_nid4(&imp->imp_connection->c_self)), + caller_pid); + + CDEBUG(D_SEC, "requesting key for %s\n", desc); + + keyring_upcall_lock(gsec_kr); + key = request_key(&gss_key_type, desc, coinfo); + keyring_upcall_unlock(gsec_kr); + + OBD_FREE(coinfo, coinfo_size); + + if (IS_ERR(key)) { + CERROR("failed request key: %ld\n", PTR_ERR(key)); + goto out; + } + CDEBUG(D_SEC, "obtained key %08x for %s\n", key->serial, desc); + + /* once payload.data was pointed to a ctx, it never changes until + * we de-associate them; but parallel request_key() may return + * a key with payload.data == NULL at the same time. so we still + * need wirtelock of key->sem to serialize them. + */ + down_write(&key->sem); + + ctx = key_get_payload(key, 0); + if (likely(ctx)) { + LASSERT(atomic_read(&ctx->cc_refcount) >= 1); + LASSERT(ctx2gctx_keyring(ctx)->gck_key == key); + LASSERT(ll_read_key_usage(key) >= 2); + + /* simply take a ref and return. it's upper layer's + * responsibility to detect & replace dead ctx. + */ + atomic_inc(&ctx->cc_refcount); + } else { + /* pre initialization with a cli_ctx. this can't be done in + * key_instantiate() because we'v no enough information + * there. + */ + ctx = ctx_create_kr(sec, vcred); + if (ctx != NULL) { + ctx_enlist_kr(ctx, is_root, 0); + bind_key_ctx(key, ctx); + + ctx_start_timer_kr(ctx, KEYRING_UPCALL_TIMEOUT); + + CDEBUG(D_SEC, "installed key %p <-> ctx %p (sec %p)\n", + key, ctx, sec); + } else { + /* we'd prefer to call key_revoke(), but we more like + * to revoke it within this key->sem locked period. 
+ */ + key_revoke_locked(key); + } + + create_new = 1; + } + + up_write(&key->sem); + + if (is_root && create_new) + request_key_unlink(key); + + key_put(key); +out: + if (is_root) + mutex_unlock(&gsec_kr->gsk_root_uc_lock); + RETURN(ctx); +} + +static +void gss_sec_release_ctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync) +{ + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + ctx_release_kr(ctx, sync); +} + +/* + * flush context of normal user, we must resort to keyring itself to find out + * contexts which belong to me. + * + * Note here we suppose only to flush _my_ context, the "uid" will + * be ignored in the search. + */ +static +void flush_user_ctx_cache_kr(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + struct key *key; + char desc[24]; + + /* nothing to do for reverse or rootonly sec */ + if (sec_is_reverse(sec) || sec_is_rootonly(sec)) + return; + + construct_key_desc(desc, sizeof(desc), sec, uid); + + /* there should be only one valid key, but we put it in the + * loop in case of any weird cases */ + for (;;) { + key = request_key(&gss_key_type, desc, NULL); + if (IS_ERR(key)) { + CDEBUG(D_SEC, "No more key found for current user\n"); + break; + } + + down_write(&key->sem); + + kill_key_locked(key); + + /* kill_key_locked() should usually revoke the key, but we + * revoke it again to make sure, e.g. some case the key may + * not well coupled with a context. */ + key_revoke_locked(key); + + up_write(&key->sem); + + request_key_unlink(key); + + key_put(key); + } +} + +/* + * flush context of root or all, we iterate through the list. + */ +static +void flush_spec_ctx_cache_kr(struct ptlrpc_sec *sec, uid_t uid, int grace, + int force) +{ + struct gss_sec_keyring *gsec_kr; + struct hlist_head freelist = HLIST_HEAD_INIT; + struct hlist_node *next; + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + gsec_kr = sec2gsec_keyring(sec); + + spin_lock(&sec->ps_lock); + hlist_for_each_entry_safe(ctx, next, &gsec_kr->gsk_clist, + cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (uid != -1 && uid != ctx->cc_vcred.vc_uid) + continue; + + /* at this moment there's at least 2 base reference: + * key association and in-list. 
*/ + if (atomic_read(&ctx->cc_refcount) > 2) { + if (!force) + continue; + CWARN("flush busy ctx %p(%u->%s, extra ref %d)\n", + ctx, ctx->cc_vcred.vc_uid, + sec2target_str(ctx->cc_sec), + atomic_read(&ctx->cc_refcount) - 2); + } + + set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags); + if (!grace) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + atomic_inc(&ctx->cc_refcount); + + if (ctx_unlist_kr(ctx, 1)) { + hlist_add_head(&ctx->cc_cache, &freelist); + } else { + LASSERT(atomic_read(&ctx->cc_refcount) >= 2); + atomic_dec(&ctx->cc_refcount); + } + } + spin_unlock(&sec->ps_lock); + + dispose_ctx_list_kr(&freelist); + EXIT; +} + +static +int gss_sec_flush_ctx_cache_kr(struct ptlrpc_sec *sec, + uid_t uid, int grace, int force) +{ + ENTRY; + + CDEBUG(D_SEC, "sec %p(%d, nctx %d), uid %d, grace %d, force %d\n", + sec, atomic_read(&sec->ps_refcount), + atomic_read(&sec->ps_nctx), + uid, grace, force); + + if (uid != -1 && uid != 0) + flush_user_ctx_cache_kr(sec, uid, grace, force); + else + flush_spec_ctx_cache_kr(sec, uid, grace, force); + + RETURN(0); +} + +static +void gss_sec_gc_ctx_kr(struct ptlrpc_sec *sec) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_head freelist = HLIST_HEAD_INIT; + struct hlist_node *next; + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + CWARN("running gc\n"); + + spin_lock(&sec->ps_lock); + hlist_for_each_entry_safe(ctx, next, &gsec_kr->gsk_clist, + cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + atomic_inc(&ctx->cc_refcount); + + if (cli_ctx_check_death(ctx) && ctx_unlist_kr(ctx, 1)) { + hlist_add_head(&ctx->cc_cache, &freelist); + CWARN("unhashed ctx %p\n", ctx); + } else { + LASSERT(atomic_read(&ctx->cc_refcount) >= 2); + atomic_dec(&ctx->cc_refcount); + } + } + spin_unlock(&sec->ps_lock); + + dispose_ctx_list_kr(&freelist); + EXIT; +} + +static +int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_node *next; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx; + time64_t now = ktime_get_real_seconds(); + + ENTRY; + spin_lock(&sec->ps_lock); + hlist_for_each_entry_safe(ctx, next, &gsec_kr->gsk_clist, + cc_cache) { + struct key *key; + char flags_str[40]; + char mech[40]; + + gctx = ctx2gctx(ctx); + key = ctx2gctx_keyring(ctx)->gck_key; + + gss_cli_ctx_flags2str(ctx->cc_flags, + flags_str, sizeof(flags_str)); + + if (gctx->gc_mechctx) + lgss_display(gctx->gc_mechctx, mech, sizeof(mech)); + else + snprintf(mech, sizeof(mech), "N/A"); + mech[sizeof(mech) - 1] = '\0'; + + seq_printf(seq, + "%p: uid %u, ref %d, expire %lld(%+lld), fl %s, seq %d, win %u, key %08x(ref %d), hdl %#llx:%#llx, mech: %s\n", + ctx, ctx->cc_vcred.vc_uid, + atomic_read(&ctx->cc_refcount), + ctx->cc_expire, + ctx->cc_expire ? ctx->cc_expire - now : 0, + flags_str, + atomic_read(&gctx->gc_seq), + gctx->gc_win, + key ? key->serial : 0, + key ? ll_read_key_usage(key) : 0, + gss_handle_to_u64(&gctx->gc_handle), + gss_handle_to_u64(&gctx->gc_svc_handle), + mech); + } + spin_unlock(&sec->ps_lock); + + RETURN(0); +} + +/**************************************** + * cli_ctx apis * + ****************************************/ + +static +int gss_cli_ctx_refresh_kr(struct ptlrpc_cli_ctx *ctx) +{ + /* upcall is already on the way */ + struct gss_cli_ctx *gctx = ctx ? 
ctx2gctx(ctx) : NULL; + + /* record latest sequence number in buddy svcctx */ + if (gctx && !rawobj_empty(&gctx->gc_svc_handle) && + sec_is_reverse(gctx->gc_base.cc_sec)) { + return gss_svc_upcall_update_sequence(&gctx->gc_svc_handle, + (__u32)atomic_read(&gctx->gc_seq)); + } + return 0; +} + +static +int gss_cli_ctx_validate_kr(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + if (cli_ctx_check_death(ctx)) { + kill_ctx_kr(ctx); + return 1; + } + + if (cli_ctx_is_ready(ctx)) + return 0; + return 1; +} + +static +void gss_cli_ctx_die_kr(struct ptlrpc_cli_ctx *ctx, int grace) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + cli_ctx_expire(ctx); + kill_ctx_kr(ctx); +} + +/**************************************** + * (reverse) service * + ****************************************/ + +/* + * reverse context could have nothing to do with keyrings. here we still keep + * the version which bind to a key, for future reference. + */ +#define HAVE_REVERSE_CTX_NOKEY + +#ifdef HAVE_REVERSE_CTX_NOKEY + +static +int sec_install_rctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct ptlrpc_cli_ctx *cli_ctx; + struct vfs_cred vcred = { .vc_uid = 0 }; + int rc; + + LASSERT(sec); + LASSERT(svc_ctx); + + cli_ctx = ctx_create_kr(sec, &vcred); + if (cli_ctx == NULL) + return -ENOMEM; + + rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx); + if (rc) { + CERROR("failed copy reverse cli ctx: %d\n", rc); + + ctx_put_kr(cli_ctx, 1); + return rc; + } + + rvs_sec_install_root_ctx_kr(sec, cli_ctx, NULL); + + ctx_put_kr(cli_ctx, 1); + + return 0; +} + +#else /* ! HAVE_REVERSE_CTX_NOKEY */ + +static +int sec_install_rctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct ptlrpc_cli_ctx *cli_ctx = NULL; + struct key *key; + struct vfs_cred vcred = { .vc_uid = 0 }; + char desc[64]; + int rc; + + LASSERT(sec); + LASSERT(svc_ctx); + CWARN("called\n"); + + construct_key_desc(desc, sizeof(desc), sec, 0); + + key = key_alloc(&gss_key_type, desc, 0, 0, + KEY_POS_ALL | KEY_USR_ALL, 1); + if (IS_ERR(key)) { + CERROR("failed to alloc key: %ld\n", PTR_ERR(key)); + return PTR_ERR(key); + } + + rc = key_instantiate_and_link(key, NULL, 0, NULL, NULL); + if (rc) { + CERROR("failed to instantiate key: %d\n", rc); + goto err_revoke; + } + + down_write(&key->sem); + + LASSERT(!key_get_payload(key, 0)); + + cli_ctx = ctx_create_kr(sec, &vcred); + if (cli_ctx == NULL) { + rc = -ENOMEM; + goto err_up; + } + + rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx); + if (rc) { + CERROR("failed copy reverse cli ctx: %d\n", rc); + goto err_put; + } + + rvs_sec_install_root_ctx_kr(sec, cli_ctx, key); + + ctx_put_kr(cli_ctx, 1); + up_write(&key->sem); + + rc = 0; + CWARN("ok!\n"); +out: + key_put(key); + return rc; + +err_put: + ctx_put_kr(cli_ctx, 1); +err_up: + up_write(&key->sem); +err_revoke: + key_revoke(key); + goto out; +} + +#endif /* HAVE_REVERSE_CTX_NOKEY */ + +/**************************************** + * service apis * + ****************************************/ + +static +int gss_svc_accept_kr(struct ptlrpc_request *req) +{ + return gss_svc_accept(&gss_policy_keyring, req); +} + +static +int gss_svc_install_rctx_kr(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct ptlrpc_sec *sec; + int rc; + + sec = sptlrpc_import_sec_ref(imp); + LASSERT(sec); + + rc = sec_install_rctx_kr(sec, svc_ctx); + sptlrpc_sec_put(sec); + + return rc; +} + +/**************************************** + * key apis * + 
****************************************/ + +static +#ifdef HAVE_KEY_TYPE_INSTANTIATE_2ARGS +int gss_kt_instantiate(struct key *key, struct key_preparsed_payload *prep) +{ + const void *data = prep->data; + size_t datalen = prep->datalen; +#else +int gss_kt_instantiate(struct key *key, const void *data, size_t datalen) +{ +#endif + int rc; + ENTRY; + + if (data != NULL || datalen != 0) { + CERROR("invalid: data %p, len %lu\n", data, (long)datalen); + RETURN(-EINVAL); + } + + if (key_get_payload(key, 0)) { + CERROR("key already have payload\n"); + RETURN(-EINVAL); + } + + /* link the key to session keyring, so following context negotiation + * rpc fired from user space could find this key. This will be unlinked + * automatically when upcall processes die. + * + * we can't do this through keyctl from userspace, because the upcall + * might be neither possessor nor owner of the key (setuid). + * + * the session keyring is created upon upcall, and don't change all + * the way until upcall finished, so rcu lock is not needed here. + */ + LASSERT(current_cred()->session_keyring); + + lockdep_off(); + rc = key_link(current_cred()->session_keyring, key); + lockdep_on(); + if (unlikely(rc)) { + CERROR("failed to link key %08x to keyring %08x: %d\n", + key->serial, + current_cred()->session_keyring->serial, rc); + RETURN(rc); + } + + CDEBUG(D_SEC, "key %p instantiated, ctx %p\n", key, + key_get_payload(key, 0)); + RETURN(0); +} + +/* + * called with key semaphore write locked. it means we can operate + * on the context without fear of loosing refcount. + */ +static +#ifdef HAVE_KEY_TYPE_INSTANTIATE_2ARGS +int gss_kt_update(struct key *key, struct key_preparsed_payload *prep) +{ + const void *data = prep->data; + __u32 datalen32 = (__u32) prep->datalen; +#else +int gss_kt_update(struct key *key, const void *data, size_t datalen) +{ + __u32 datalen32 = (__u32) datalen; +#endif + struct ptlrpc_cli_ctx *ctx = key_get_payload(key, 0); + struct gss_cli_ctx *gctx; + rawobj_t tmpobj = RAWOBJ_EMPTY; + int rc; + ENTRY; + + if (data == NULL || datalen32 == 0) { + CWARN("invalid: data %p, len %lu\n", data, (long)datalen32); + RETURN(-EINVAL); + } + + /* if upcall finished negotiation too fast (mostly likely because + * of local error happened) and call kt_update(), the ctx + * might be still NULL. but the key will finally be associate + * with a context, or be revoked. if key status is fine, return + * -EAGAIN to allow userspace sleep a while and call again. 
*/ + if (ctx == NULL) { + CDEBUG(D_SEC, "update too soon: key %p(%x) flags %lx\n", + key, key->serial, key->flags); + + rc = key_validate(key); + if (rc == 0) + RETURN(-EAGAIN); + else + RETURN(rc); + } + + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + ctx_clear_timer_kr(ctx); + + /* don't proceed if already refreshed */ + if (cli_ctx_is_refreshed(ctx)) { + CWARN("ctx already done refresh\n"); + RETURN(0); + } + + sptlrpc_cli_ctx_get(ctx); + gctx = ctx2gctx(ctx); + + rc = buffer_extract_bytes(&data, &datalen32, &gctx->gc_win, + sizeof(gctx->gc_win)); + if (rc) { + CERROR("failed extract seq_win\n"); + goto out; + } + + if (gctx->gc_win == 0) { + __u32 nego_rpc_err, nego_gss_err; + + rc = buffer_extract_bytes(&data, &datalen32, &nego_rpc_err, + sizeof(nego_rpc_err)); + if (rc) { + CERROR("cannot extract RPC: rc = %d\n", rc); + goto out; + } + + rc = buffer_extract_bytes(&data, &datalen32, &nego_gss_err, + sizeof(nego_gss_err)); + if (rc) { + CERROR("failed to extract gss rc = %d\n", rc); + goto out; + } + + CERROR("negotiation: rpc err %d, gss err %x\n", + nego_rpc_err, nego_gss_err); + + rc = nego_rpc_err ? nego_rpc_err : -EACCES; + } else { + rc = rawobj_extract_local_alloc(&gctx->gc_handle, + (__u32 **) &data, &datalen32); + if (rc) { + CERROR("failed extract handle\n"); + goto out; + } + + rc = rawobj_extract_local(&tmpobj, + (__u32 **) &data, &datalen32); + if (rc) { + CERROR("failed extract mech\n"); + goto out; + } + + rc = lgss_import_sec_context(&tmpobj, + sec2gsec(ctx->cc_sec)->gs_mech, + &gctx->gc_mechctx); + if (rc != GSS_S_COMPLETE) + CERROR("failed import context\n"); + else + rc = 0; + } +out: + /* we don't care what current status of this ctx, even someone else + * is operating on the ctx at the same time. we just add up our own + * opinions here. */ + if (rc == 0) { + gss_cli_ctx_uptodate(gctx); + } else { + /* this will also revoke the key. has to be done before + * wakeup waiters otherwise they can find the stale key */ + kill_key_locked(key); + + cli_ctx_expire(ctx); + + if (rc != -ERESTART) + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + } + + /* let user space think it's a success */ + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); +} + +#ifndef HAVE_KEY_MATCH_DATA +static int +gss_kt_match(const struct key *key, const void *desc) +{ + return strcmp(key->description, (const char *) desc) == 0 && + !test_bit(KEY_FLAG_REVOKED, &key->flags); +} +#else /* ! HAVE_KEY_MATCH_DATA */ +static bool +gss_kt_match(const struct key *key, const struct key_match_data *match_data) +{ + const char *desc = match_data->raw_data; + + return strcmp(key->description, desc) == 0 && + !test_bit(KEY_FLAG_REVOKED, &key->flags); +} + +/* + * Preparse the match criterion. 
+ */ +static int gss_kt_match_preparse(struct key_match_data *match_data) +{ + match_data->lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT; + match_data->cmp = gss_kt_match; + return 0; +} +#endif /* HAVE_KEY_MATCH_DATA */ + +static +void gss_kt_destroy(struct key *key) +{ + ENTRY; + LASSERT(!key_get_payload(key, 0)); + CDEBUG(D_SEC, "destroy key %p\n", key); + EXIT; +} + +static +void gss_kt_describe(const struct key *key, struct seq_file *s) +{ + if (key->description == NULL) + seq_puts(s, "[null]"); + else + seq_puts(s, key->description); +} + +static struct key_type gss_key_type = +{ + .name = "lgssc", + .def_datalen = 0, + .instantiate = gss_kt_instantiate, + .update = gss_kt_update, +#ifdef HAVE_KEY_MATCH_DATA + .match_preparse = gss_kt_match_preparse, +#else + .match = gss_kt_match, +#endif + .destroy = gss_kt_destroy, + .describe = gss_kt_describe, +}; + +/**************************************** + * lustre gss keyring policy * + ****************************************/ + +static struct ptlrpc_ctx_ops gss_keyring_ctxops = { + .match = gss_cli_ctx_match, + .refresh = gss_cli_ctx_refresh_kr, + .validate = gss_cli_ctx_validate_kr, + .die = gss_cli_ctx_die_kr, + .sign = gss_cli_ctx_sign, + .verify = gss_cli_ctx_verify, + .seal = gss_cli_ctx_seal, + .unseal = gss_cli_ctx_unseal, + .wrap_bulk = gss_cli_ctx_wrap_bulk, + .unwrap_bulk = gss_cli_ctx_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops gss_sec_keyring_cops = { + .create_sec = gss_sec_create_kr, + .destroy_sec = gss_sec_destroy_kr, + .kill_sec = gss_sec_kill, + .lookup_ctx = gss_sec_lookup_ctx_kr, + .release_ctx = gss_sec_release_ctx_kr, + .flush_ctx_cache = gss_sec_flush_ctx_cache_kr, + .gc_ctx = gss_sec_gc_ctx_kr, + .install_rctx = gss_sec_install_rctx, + .alloc_reqbuf = gss_alloc_reqbuf, + .free_reqbuf = gss_free_reqbuf, + .alloc_repbuf = gss_alloc_repbuf, + .free_repbuf = gss_free_repbuf, + .enlarge_reqbuf = gss_enlarge_reqbuf, + .display = gss_sec_display_kr, +}; + +static struct ptlrpc_sec_sops gss_sec_keyring_sops = { + .accept = gss_svc_accept_kr, + .invalidate_ctx = gss_svc_invalidate_ctx, + .alloc_rs = gss_svc_alloc_rs, + .authorize = gss_svc_authorize, + .free_rs = gss_svc_free_rs, + .free_ctx = gss_svc_free_ctx, + .prep_bulk = gss_svc_prep_bulk, + .unwrap_bulk = gss_svc_unwrap_bulk, + .wrap_bulk = gss_svc_wrap_bulk, + .install_rctx = gss_svc_install_rctx_kr, +}; + +static struct ptlrpc_sec_policy gss_policy_keyring = { + .sp_owner = THIS_MODULE, + .sp_name = "gss.keyring", + .sp_policy = SPTLRPC_POLICY_GSS, + .sp_cops = &gss_sec_keyring_cops, + .sp_sops = &gss_sec_keyring_sops, +}; + + +int __init gss_init_keyring(void) +{ + int rc; + + rc = register_key_type(&gss_key_type); + if (rc) { + CERROR("failed to register keyring type: %d\n", rc); + return rc; + } + + rc = sptlrpc_register_policy(&gss_policy_keyring); + if (rc) { + unregister_key_type(&gss_key_type); + return rc; + } + + return 0; +} + +void __exit gss_exit_keyring(void) +{ + unregister_key_type(&gss_key_type); + sptlrpc_unregister_policy(&gss_policy_keyring); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h new file mode 100644 index 0000000000000..611160458d9b1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h @@ -0,0 +1,160 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * + * Author: Eric Mei + */ + +/* + * linux/include/linux/sunrpc/gss_krb5_types.h + * + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * Bruce Fields + */ + +/* + * Copyright 1995 by the Massachusetts Institute of Technology. + * All Rights Reserved. + * + * Export of this software from the United States of America may + * require a specific license from the United States Government. + * It is the responsibility of any person or organization contemplating + * export to obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of M.I.T. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. Furthermore if you modify this software you must label + * your software as modified software and not distribute it in such a + * fashion that it might be confused with the original M.I.T. software. + * M.I.T. makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + */ + +#ifndef PTLRPC_GSS_KRB5_H +#define PTLRPC_GSS_KRB5_H + +#include "gss_crypto.h" + +/* + * RFC 4142 + */ + +#define KG_USAGE_ACCEPTOR_SEAL 22 +#define KG_USAGE_ACCEPTOR_SIGN 23 +#define KG_USAGE_INITIATOR_SEAL 24 +#define KG_USAGE_INITIATOR_SIGN 25 + +#define KG_TOK_MIC_MSG 0x0404 +#define KG_TOK_WRAP_MSG 0x0504 + +#define FLAG_SENDER_IS_ACCEPTOR 0x01 +#define FLAG_WRAP_CONFIDENTIAL 0x02 +#define FLAG_ACCEPTOR_SUBKEY 0x04 + +struct krb5_header { + __u16 kh_tok_id; /* token id */ + __u8 kh_flags; /* acceptor flags */ + __u8 kh_filler; /* 0xff */ + __u16 kh_ec; /* extra count */ + __u16 kh_rrc; /* right rotation count */ + __u64 kh_seq; /* sequence number */ + __u8 kh_cksum[0]; /* checksum */ +}; + +struct krb5_ctx { + unsigned int kc_initiate:1, + kc_cfx:1, + kc_seed_init:1, + kc_have_acceptor_subkey:1; + time64_t kc_endtime; + __u8 kc_seed[16]; + __u64 kc_seq_send; + __u64 kc_seq_recv; + __u32 kc_enctype; + struct gss_keyblock kc_keye; /* encryption */ + struct gss_keyblock kc_keyi; /* integrity */ + struct gss_keyblock kc_keyc; /* checksum */ + rawobj_t kc_mech_used; +}; + +enum sgn_alg { + SGN_ALG_DES_MAC_MD5 = 0x0000, + SGN_ALG_MD2_5 = 0x0001, + SGN_ALG_DES_MAC = 0x0002, + SGN_ALG_3 = 0x0003, /* not published */ + SGN_ALG_HMAC_MD5 = 0x0011, /* microsoft w2k; no support */ + SGN_ALG_HMAC_SHA1_DES3_KD = 0x0004 +}; + +enum seal_alg { + SEAL_ALG_NONE = 0xffff, + SEAL_ALG_DES = 0x0000, + SEAL_ALG_1 = 0x0001, /* not published */ + SEAL_ALG_MICROSOFT_RC4 = 0x0010, /* microsoft w2k; no support */ + SEAL_ALG_DES3KD = 0x0002 +}; + +#define CKSUMTYPE_CRC32 0x0001 +#define CKSUMTYPE_RSA_MD4 0x0002 +#define CKSUMTYPE_RSA_MD4_DES 0x0003 +#define CKSUMTYPE_DESCBC 0x0004 +/* des-mac-k */ +/* rsa-md4-des-k */ +#define CKSUMTYPE_RSA_MD5 0x0007 +#define CKSUMTYPE_RSA_MD5_DES 0x0008 +#define CKSUMTYPE_NIST_SHA 0x0009 +#define CKSUMTYPE_HMAC_SHA1_DES3 0x000c +#define CKSUMTYPE_HMAC_SHA1_96_AES128 0x000f +#define CKSUMTYPE_HMAC_SHA1_96_AES256 0x0010 +#define 
CKSUMTYPE_HMAC_MD5_ARCFOUR -138 + +/* from gssapi_err_krb5.h */ +#define KG_CCACHE_NOMATCH (39756032L) +#define KG_KEYTAB_NOMATCH (39756033L) +#define KG_TGT_MISSING (39756034L) +#define KG_NO_SUBKEY (39756035L) +#define KG_CONTEXT_ESTABLISHED (39756036L) +#define KG_BAD_SIGN_TYPE (39756037L) +#define KG_BAD_LENGTH (39756038L) +#define KG_CTX_INCOMPLETE (39756039L) +#define KG_CONTEXT (39756040L) +#define KG_CRED (39756041L) +#define KG_ENC_DESC (39756042L) +#define KG_BAD_SEQ (39756043L) +#define KG_EMPTY_CCACHE (39756044L) +#define KG_NO_CTYPES (39756045L) + +/* per Kerberos v5 protocol spec crypto types from the wire. + * these get mapped to linux kernel crypto routines. + */ +#define ENCTYPE_NULL 0x0000 +#define ENCTYPE_DES_CBC_CRC 0x0001 /* DES cbc mode with CRC-32 */ +#define ENCTYPE_DES_CBC_MD4 0x0002 /* DES cbc mode with RSA-MD4 */ +#define ENCTYPE_DES_CBC_MD5 0x0003 /* DES cbc mode with RSA-MD5 */ +#define ENCTYPE_DES_CBC_RAW 0x0004 /* DES cbc mode raw */ +/* XXX deprecated? */ +#define ENCTYPE_DES3_CBC_SHA 0x0005 /* DES-3 cbc mode with NIST-SHA */ +#define ENCTYPE_DES3_CBC_RAW 0x0006 /* DES-3 cbc mode raw */ +#define ENCTYPE_DES_HMAC_SHA1 0x0008 +#define ENCTYPE_DES3_CBC_SHA1 0x0010 +#define ENCTYPE_AES128_CTS_HMAC_SHA1_96 0x0011 +#define ENCTYPE_AES256_CTS_HMAC_SHA1_96 0x0012 +#define ENCTYPE_ARCFOUR_HMAC 0x0017 +#define ENCTYPE_ARCFOUR_HMAC_EXP 0x0018 +#define ENCTYPE_UNKNOWN 0x01ff + +#endif /* PTLRPC_GSS_KRB5_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c new file mode 100644 index 0000000000000..d95924993285f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c @@ -0,0 +1,1604 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2015, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_krb5_mech.c + * linux/net/sunrpc/gss_krb5_crypto.c + * linux/net/sunrpc/gss_krb5_seal.c + * linux/net/sunrpc/gss_krb5_seqnum.c + * linux/net/sunrpc/gss_krb5_unseal.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_asn1.h" +#include "gss_krb5.h" +#include "gss_crypto.h" + +static DEFINE_SPINLOCK(krb5_seq_lock); + +struct krb5_enctype { + char *ke_dispname; + char *ke_enc_name; /* linux tfm name */ + char *ke_hash_name; /* linux tfm name */ + int ke_enc_mode; /* linux tfm mode */ + int ke_hash_size; /* checksum size */ + int ke_conf_size; /* confounder size */ + unsigned int ke_hash_hmac:1; /* is hmac? */ +}; + +/* + * NOTE: for aes128-cts and aes256-cts, MIT implementation use CTS encryption. + * but currently we simply CBC with padding, because linux doesn't support CTS + * yet. this need to be fixed in the future. + */ +static struct krb5_enctype enctypes[] = { + [ENCTYPE_DES_CBC_RAW] = { /* des-cbc-md5 */ + .ke_dispname = "des-cbc-md5", + .ke_enc_name = "cbc(des)", + .ke_hash_name = "md5", + .ke_hash_size = 16, + .ke_conf_size = 8, + }, +#ifdef HAVE_DES3_SUPPORT + [ENCTYPE_DES3_CBC_RAW] = { /* des3-hmac-sha1 */ + .ke_dispname = "des3-hmac-sha1", + .ke_enc_name = "cbc(des3_ede)", + .ke_hash_name = "sha1", + .ke_hash_size = 20, + .ke_conf_size = 8, + .ke_hash_hmac = 1, + }, +#endif + [ENCTYPE_AES128_CTS_HMAC_SHA1_96] = { /* aes128-cts */ + .ke_dispname = "aes128-cts-hmac-sha1-96", + .ke_enc_name = "cbc(aes)", + .ke_hash_name = "sha1", + .ke_hash_size = 12, + .ke_conf_size = 16, + .ke_hash_hmac = 1, + }, + [ENCTYPE_AES256_CTS_HMAC_SHA1_96] = { /* aes256-cts */ + .ke_dispname = "aes256-cts-hmac-sha1-96", + .ke_enc_name = "cbc(aes)", + .ke_hash_name = "sha1", + .ke_hash_size = 12, + .ke_conf_size = 16, + .ke_hash_hmac = 1, + }, + [ENCTYPE_ARCFOUR_HMAC] = { /* arcfour-hmac-md5 */ + .ke_dispname = "arcfour-hmac-md5", + .ke_enc_name = "ecb(arc4)", + .ke_hash_name = "md5", + .ke_hash_size = 16, + .ke_conf_size = 8, + .ke_hash_hmac = 1, + } +}; + +static const char * enctype2str(__u32 enctype) +{ + if (enctype < ARRAY_SIZE(enctypes) && enctypes[enctype].ke_dispname) + return enctypes[enctype].ke_dispname; + + return "unknown"; +} + +static +int krb5_init_keys(struct krb5_ctx *kctx) +{ + struct krb5_enctype *ke; + + if (kctx->kc_enctype >= ARRAY_SIZE(enctypes) || + enctypes[kctx->kc_enctype].ke_hash_size == 0) { + CERROR("unsupported enctype %x\n", kctx->kc_enctype); + return -1; + } + + ke = &enctypes[kctx->kc_enctype]; + + /* tfm arc4 is stateful, user should alloc-use-free by his own */ + if (kctx->kc_enctype != ENCTYPE_ARCFOUR_HMAC && + gss_keyblock_init(&kctx->kc_keye, ke->ke_enc_name, ke->ke_enc_mode)) + return -1; + + /* tfm hmac is stateful, user should alloc-use-free by his own */ + if (ke->ke_hash_hmac == 0 && + gss_keyblock_init(&kctx->kc_keyi, ke->ke_enc_name, ke->ke_enc_mode)) + return -1; + if (ke->ke_hash_hmac == 0 && + gss_keyblock_init(&kctx->kc_keyc, ke->ke_enc_name, ke->ke_enc_mode)) + return -1; + + return 0; +} + 
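Editor's note: the enctypes[] table above is indexed directly by the on-the-wire ENCTYPE_* value, so it is sparse, and both enctype2str() and krb5_init_keys() first bounds-check the index and then verify the slot was actually initialized (non-NULL ke_dispname, or non-zero ke_hash_size) before using it. The short user-space sketch below mirrors that designated-initializer lookup pattern outside the kernel; every name in it (demo_enctype, demo_lookup, the DEMO_* constants) is a hypothetical illustration, not part of this patch.

/*
 * Standalone sketch of the sparse enctype lookup table pattern.
 * Build with: cc -o enctype_demo enctype_demo.c
 */
#include <stdio.h>
#include <stddef.h>

#define DEMO_ENCTYPE_DES_CBC_RAW		0x0004
#define DEMO_ENCTYPE_AES128_CTS_HMAC_SHA1_96	0x0011
#define DEMO_ENCTYPE_AES256_CTS_HMAC_SHA1_96	0x0012

struct demo_enctype {
	const char *de_dispname;	/* human readable name */
	const char *de_enc_name;	/* crypto transform name */
	int         de_hash_size;	/* checksum size, 0 = unsupported */
};

/* sparse table: only the explicitly initialized slots are valid */
static const struct demo_enctype demo_enctypes[] = {
	[DEMO_ENCTYPE_DES_CBC_RAW] = {
		.de_dispname  = "des-cbc-md5",
		.de_enc_name  = "cbc(des)",
		.de_hash_size = 16,
	},
	[DEMO_ENCTYPE_AES128_CTS_HMAC_SHA1_96] = {
		.de_dispname  = "aes128-cts-hmac-sha1-96",
		.de_enc_name  = "cbc(aes)",
		.de_hash_size = 12,
	},
	[DEMO_ENCTYPE_AES256_CTS_HMAC_SHA1_96] = {
		.de_dispname  = "aes256-cts-hmac-sha1-96",
		.de_enc_name  = "cbc(aes)",
		.de_hash_size = 12,
	},
};

#define DEMO_NR_ENCTYPES (sizeof(demo_enctypes) / sizeof(demo_enctypes[0]))

/* bounds check first, then reject empty (never-initialized) slots */
static const struct demo_enctype *demo_lookup(unsigned int enctype)
{
	if (enctype >= DEMO_NR_ENCTYPES ||
	    demo_enctypes[enctype].de_hash_size == 0)
		return NULL;
	return &demo_enctypes[enctype];
}

int main(void)
{
	unsigned int wire[] = { 0x0004, 0x0011, 0x0017, 0x0012 };
	size_t i;

	for (i = 0; i < sizeof(wire) / sizeof(wire[0]); i++) {
		const struct demo_enctype *de = demo_lookup(wire[i]);

		if (de)
			printf("enctype 0x%04x: %s (tfm %s)\n",
			       wire[i], de->de_dispname, de->de_enc_name);
		else
			printf("enctype 0x%04x: unsupported\n", wire[i]);
	}
	return 0;
}

As in krb5_init_keys(), an out-of-range or uninitialized enctype (0x0017 in the sample input) is rejected up front rather than dereferenced, which is why the kernel table can stay sparse without a separate "supported" flag.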
+static +void delete_context_kerberos(struct krb5_ctx *kctx) +{ + rawobj_free(&kctx->kc_mech_used); + + gss_keyblock_free(&kctx->kc_keye); + gss_keyblock_free(&kctx->kc_keyi); + gss_keyblock_free(&kctx->kc_keyc); +} + +static +__u32 import_context_rfc1964(struct krb5_ctx *kctx, char *p, char *end) +{ + unsigned int tmp_uint, keysize; + + /* seed_init flag */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + kctx->kc_seed_init = (tmp_uint != 0); + + /* seed */ + if (gss_get_bytes(&p, end, kctx->kc_seed, sizeof(kctx->kc_seed))) + goto out_err; + + /* sign/seal algorithm, not really used now */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) || + gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + + /* end time. While kc_endtime might be 64 bit the krb5 API + * still uses 32 bits. To delay the 2038 bug see the incoming + * value as a u32 which give us until 2106. See the link for details: + * + * http://web.mit.edu/kerberos/www/krb5-current/doc/appdev/y2038.html + */ + if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(u32))) + goto out_err; + + /* seq send */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + kctx->kc_seq_send = tmp_uint; + + /* mech oid */ + if (gss_get_rawobj(&p, end, &kctx->kc_mech_used)) + goto out_err; + + /* old style enc/seq keys in format: + * - enctype (u32) + * - keysize (u32) + * - keydata + * we decompose them to fit into the new context + */ + + /* enc key */ + if (gss_get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype))) + goto out_err; + + if (gss_get_bytes(&p, end, &keysize, sizeof(keysize))) + goto out_err; + + if (gss_get_keyblock(&p, end, &kctx->kc_keye, keysize)) + goto out_err; + + /* seq key */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) || + tmp_uint != kctx->kc_enctype) + goto out_err; + + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) || + tmp_uint != keysize) + goto out_err; + + if (gss_get_keyblock(&p, end, &kctx->kc_keyc, keysize)) + goto out_err; + + /* old style fallback */ + if (gss_keyblock_dup(&kctx->kc_keyi, &kctx->kc_keyc)) + goto out_err; + + if (p != end) + goto out_err; + + CDEBUG(D_SEC, "successfully imported rfc1964 context\n"); + return 0; +out_err: + return GSS_S_FAILURE; +} + +/* Flags for version 2 context flags */ +#define KRB5_CTX_FLAG_INITIATOR 0x00000001 +#define KRB5_CTX_FLAG_CFX 0x00000002 +#define KRB5_CTX_FLAG_ACCEPTOR_SUBKEY 0x00000004 + +static +__u32 import_context_rfc4121(struct krb5_ctx *kctx, char *p, char *end) +{ + unsigned int tmp_uint, keysize; + + /* end time. While kc_endtime might be 64 bit the krb5 API + * still uses 32 bits. To delay the 2038 bug see the incoming + * value as a u32 which give us until 2106. 
See the link for details: + * + * http://web.mit.edu/kerberos/www/krb5-current/doc/appdev/y2038.html + */ + if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(u32))) + goto out_err; + + /* flags */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + + if (tmp_uint & KRB5_CTX_FLAG_INITIATOR) + kctx->kc_initiate = 1; + if (tmp_uint & KRB5_CTX_FLAG_CFX) + kctx->kc_cfx = 1; + if (tmp_uint & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY) + kctx->kc_have_acceptor_subkey = 1; + + /* seq send */ + if (gss_get_bytes(&p, end, &kctx->kc_seq_send, + sizeof(kctx->kc_seq_send))) + goto out_err; + + /* enctype */ + if (gss_get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype))) + goto out_err; + + /* size of each key */ + if (gss_get_bytes(&p, end, &keysize, sizeof(keysize))) + goto out_err; + + /* number of keys - should always be 3 */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + + if (tmp_uint != 3) { + CERROR("Invalid number of keys: %u\n", tmp_uint); + goto out_err; + } + + /* ke */ + if (gss_get_keyblock(&p, end, &kctx->kc_keye, keysize)) + goto out_err; + /* ki */ + if (gss_get_keyblock(&p, end, &kctx->kc_keyi, keysize)) + goto out_err; + /* ki */ + if (gss_get_keyblock(&p, end, &kctx->kc_keyc, keysize)) + goto out_err; + + CDEBUG(D_SEC, "successfully imported v2 context\n"); + return 0; +out_err: + return GSS_S_FAILURE; +} + +/* + * The whole purpose here is trying to keep user level gss context parsing + * from nfs-utils unchanged as possible as we can, they are not quite mature + * yet, and many stuff still not clear, like heimdal etc. + */ +static +__u32 gss_import_sec_context_kerberos(rawobj_t *inbuf, + struct gss_ctx *gctx) +{ + struct krb5_ctx *kctx; + char *p = (char *)inbuf->data; + char *end = (char *)(inbuf->data + inbuf->len); + unsigned int tmp_uint, rc; + + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) { + CERROR("Fail to read version\n"); + return GSS_S_FAILURE; + } + + /* only support 0, 1 for the moment */ + if (tmp_uint > 2) { + CERROR("Invalid version %u\n", tmp_uint); + return GSS_S_FAILURE; + } + + OBD_ALLOC_PTR(kctx); + if (!kctx) + return GSS_S_FAILURE; + + if (tmp_uint == 0 || tmp_uint == 1) { + kctx->kc_initiate = tmp_uint; + rc = import_context_rfc1964(kctx, p, end); + } else { + rc = import_context_rfc4121(kctx, p, end); + } + + if (rc == 0) + rc = krb5_init_keys(kctx); + + if (rc) { + delete_context_kerberos(kctx); + OBD_FREE_PTR(kctx); + + return GSS_S_FAILURE; + } + + gctx->internal_ctx_id = kctx; + return GSS_S_COMPLETE; +} + +static +__u32 gss_copy_reverse_context_kerberos(struct gss_ctx *gctx, + struct gss_ctx *gctx_new) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_ctx *knew; + + OBD_ALLOC_PTR(knew); + if (!knew) + return GSS_S_FAILURE; + + knew->kc_initiate = kctx->kc_initiate ? 
0 : 1; + knew->kc_cfx = kctx->kc_cfx; + knew->kc_seed_init = kctx->kc_seed_init; + knew->kc_have_acceptor_subkey = kctx->kc_have_acceptor_subkey; + knew->kc_endtime = kctx->kc_endtime; + + memcpy(knew->kc_seed, kctx->kc_seed, sizeof(kctx->kc_seed)); + knew->kc_seq_send = kctx->kc_seq_recv; + knew->kc_seq_recv = kctx->kc_seq_send; + knew->kc_enctype = kctx->kc_enctype; + + if (rawobj_dup(&knew->kc_mech_used, &kctx->kc_mech_used)) + goto out_err; + + if (gss_keyblock_dup(&knew->kc_keye, &kctx->kc_keye)) + goto out_err; + if (gss_keyblock_dup(&knew->kc_keyi, &kctx->kc_keyi)) + goto out_err; + if (gss_keyblock_dup(&knew->kc_keyc, &kctx->kc_keyc)) + goto out_err; + if (krb5_init_keys(knew)) + goto out_err; + + gctx_new->internal_ctx_id = knew; + CDEBUG(D_SEC, "successfully copied reverse context\n"); + return GSS_S_COMPLETE; + +out_err: + delete_context_kerberos(knew); + OBD_FREE_PTR(knew); + return GSS_S_FAILURE; +} + +static +__u32 gss_inquire_context_kerberos(struct gss_ctx *gctx, + time64_t *endtime) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + + *endtime = kctx->kc_endtime; + return GSS_S_COMPLETE; +} + +static +void gss_delete_sec_context_kerberos(void *internal_ctx) +{ + struct krb5_ctx *kctx = internal_ctx; + + delete_context_kerberos(kctx); + OBD_FREE_PTR(kctx); +} + +/* + * compute (keyed/keyless) checksum against the plain text which appended + * with krb5 wire token header. + */ +static +__s32 krb5_make_checksum(__u32 enctype, + struct gss_keyblock *kb, + struct krb5_header *khdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, struct bio_vec *iovs, + rawobj_t *cksum, + digest_hash hash_func) +{ + struct krb5_enctype *ke = &enctypes[enctype]; + struct ahash_request *req = NULL; + enum cfs_crypto_hash_alg hash_algo; + rawobj_t hdr; + int rc; + + hash_algo = cfs_crypto_hash_alg(ke->ke_hash_name); + + /* For the cbc(des) case we want md5 instead of hmac(md5) */ + if (strcmp(ke->ke_enc_name, "cbc(des)")) + req = cfs_crypto_hash_init(hash_algo, kb->kb_key.data, + kb->kb_key.len); + else + req = cfs_crypto_hash_init(hash_algo, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("failed to alloc hash %s : rc = %d\n", + ke->ke_hash_name, rc); + goto out_no_hash; + } + + cksum->len = cfs_crypto_hash_digestsize(hash_algo); + OBD_ALLOC_LARGE(cksum->data, cksum->len); + if (!cksum->data) { + cksum->len = 0; + rc = -ENOMEM; + goto out_free_hash; + } + + hdr.data = (__u8 *)khdr; + hdr.len = sizeof(*khdr); + + if (!hash_func) { + rc = -EPROTO; + CERROR("hash function for %s undefined\n", + ke->ke_hash_name); + goto out_free_hash; + } + rc = hash_func(req, &hdr, msgcnt, msgs, iovcnt, iovs); + if (rc) + goto out_free_hash; + + if (!ke->ke_hash_hmac) { + LASSERT(kb->kb_tfm); + + cfs_crypto_hash_final(req, cksum->data, &cksum->len); + rc = gss_crypt_generic(kb->kb_tfm, 0, NULL, + cksum->data, cksum->data, + cksum->len); + goto out_no_hash; + } + +out_free_hash: + if (req) + cfs_crypto_hash_final(req, cksum->data, &cksum->len); +out_no_hash: + return rc ? GSS_S_FAILURE : GSS_S_COMPLETE; +} + +static void fill_krb5_header(struct krb5_ctx *kctx, + struct krb5_header *khdr, + int privacy) +{ + unsigned char acceptor_flag; + + acceptor_flag = kctx->kc_initiate ? 
0 : FLAG_SENDER_IS_ACCEPTOR; + + if (privacy) { + khdr->kh_tok_id = cpu_to_be16(KG_TOK_WRAP_MSG); + khdr->kh_flags = acceptor_flag | FLAG_WRAP_CONFIDENTIAL; + khdr->kh_ec = cpu_to_be16(0); + khdr->kh_rrc = cpu_to_be16(0); + } else { + khdr->kh_tok_id = cpu_to_be16(KG_TOK_MIC_MSG); + khdr->kh_flags = acceptor_flag; + khdr->kh_ec = cpu_to_be16(0xffff); + khdr->kh_rrc = cpu_to_be16(0xffff); + } + + khdr->kh_filler = 0xff; + spin_lock(&krb5_seq_lock); + khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++); + spin_unlock(&krb5_seq_lock); +} + +static __u32 verify_krb5_header(struct krb5_ctx *kctx, + struct krb5_header *khdr, + int privacy) +{ + unsigned char acceptor_flag; + __u16 tok_id, ec_rrc; + + acceptor_flag = kctx->kc_initiate ? FLAG_SENDER_IS_ACCEPTOR : 0; + + if (privacy) { + tok_id = KG_TOK_WRAP_MSG; + ec_rrc = 0x0; + } else { + tok_id = KG_TOK_MIC_MSG; + ec_rrc = 0xffff; + } + + /* sanity checks */ + if (be16_to_cpu(khdr->kh_tok_id) != tok_id) { + CERROR("bad token id\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != acceptor_flag) { + CERROR("bad direction flag\n"); + return GSS_S_BAD_SIG; + } + if (privacy && (khdr->kh_flags & FLAG_WRAP_CONFIDENTIAL) == 0) { + CERROR("missing confidential flag\n"); + return GSS_S_BAD_SIG; + } + if (khdr->kh_filler != 0xff) { + CERROR("bad filler\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + if (be16_to_cpu(khdr->kh_ec) != ec_rrc || + be16_to_cpu(khdr->kh_rrc) != ec_rrc) { + CERROR("bad EC or RRC\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + return GSS_S_COMPLETE; +} + +static +__u32 gss_get_mic_kerberos(struct gss_ctx *gctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + u32 major; + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *)token->data; + fill_krb5_header(kctx, khdr, 0); + + /* checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, khdr, + msgcnt, msgs, iovcnt, iovs, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + + LASSERT(cksum.len >= ke->ke_hash_size); + LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size); + memcpy(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + token->len = sizeof(*khdr) + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; +} + +static +__u32 gss_verify_mic_kerberos(struct gss_ctx *gctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + u32 major; + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *)token->data; + + major = verify_krb5_header(kctx, khdr, 0); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + goto out; + } + + if (token->len < sizeof(*khdr) + ke->ke_hash_size) { + CERROR("short signature: %u, require %d\n", + token->len, (int) sizeof(*khdr) + ke->ke_hash_size); + GOTO(out, major = GSS_S_FAILURE); + } + + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, + khdr, msgcnt, msgs, iovcnt, iovs, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + 
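[Editor's note, not part of the patch] A note on the token layout shared by gss_get_mic_kerberos() above and the verification path here: the MIC token is nothing more than the 16-byte krb5 wire header followed by the trailing ke_hash_size bytes of the keyed checksum. A minimal sketch of that size arithmetic (the helper name is made up; it assumes the enctypes[] table from this file):

/* Sketch: expected MIC token length for an enctype.  Matches the
 * "token->len = sizeof(*khdr) + ke->ke_hash_size" assignment in get_mic
 * and the short-signature check just above in verify_mic. */
static inline unsigned int krb5_mic_token_len_sketch(__u32 enctype)
{
	return sizeof(struct krb5_header) + enctypes[enctype].ke_hash_size;
}

For aes256-cts-hmac-sha1-96, for example, that is 16 + 12 = 28 bytes, since only ke_hash_size = 12 bytes of the HMAC-SHA1 output are copied into the token, as the memcpy in get_mic shows.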
+ LASSERT(cksum.len >= ke->ke_hash_size); + if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + GOTO(out_free_cksum, major = GSS_S_BAD_SIG); + } + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); +out: + return major; +} + +/* + * if adj_nob != 0, we adjust desc->bd_nob to the actual cipher text size. + */ +static +int krb5_encrypt_bulk(struct crypto_sync_skcipher *tfm, + struct krb5_header *khdr, + char *confounder, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + int adj_nob) +{ + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + struct sg_table sg_src, sg_dst; + int blocksize, i, rc, nob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_enc_vec); + + blocksize = crypto_sync_skcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); + + /* encrypt confounder */ + rc = gss_setup_sgtable(&sg_src, &src, confounder, blocksize); + if (rc != 0) + return rc; + + rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data, blocksize); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + return rc; + } + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + blocksize, local_iv); + + rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, blocksize); + + gss_teardown_sgtable(&sg_dst); + gss_teardown_sgtable(&sg_src); + + if (rc) { + CERROR("error to encrypt confounder: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + + /* encrypt clear pages */ + for (i = 0; i < desc->bd_iov_count; i++) { + sg_init_table(&src, 1); + sg_set_page(&src, desc->bd_vec[i].bv_page, + (desc->bd_vec[i].bv_len + + blocksize - 1) & + (~(blocksize - 1)), + desc->bd_vec[i].bv_offset); + if (adj_nob) + nob += src.length; + sg_init_table(&dst, 1); + sg_set_page(&dst, desc->bd_enc_vec[i].bv_page, + src.length, src.offset); + + desc->bd_enc_vec[i].bv_offset = dst.offset; + desc->bd_enc_vec[i].bv_len = dst.length; + + skcipher_request_set_crypt(req, &src, &dst, + src.length, local_iv); + rc = crypto_skcipher_encrypt_iv(req, &dst, &src, src.length); + if (rc) { + CERROR("error to encrypt page: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + } + + /* encrypt krb5 header */ + rc = gss_setup_sgtable(&sg_src, &src, khdr, sizeof(*khdr)); + if (rc != 0) { + skcipher_request_zero(req); + return rc; + } + + rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data + blocksize, + sizeof(*khdr)); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + skcipher_request_zero(req); + return rc; + } + + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + sizeof(*khdr), local_iv); + rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, + sizeof(*khdr)); + skcipher_request_zero(req); + + gss_teardown_sgtable(&sg_dst); + gss_teardown_sgtable(&sg_src); + + if (rc) { + CERROR("error to encrypt krb5 header: %d\n", rc); + return rc; + } + + if (adj_nob) + desc->bd_nob = nob; + + return 0; +} + +/* + * desc->bd_nob_transferred is the size of cipher text received. + * desc->bd_nob is the target size of plain text supposed to be. + * + * if adj_nob != 0, we adjust each page's bv_len to the actual + * plain text size. + * - for client read: we don't know data size for each page, so + * bd_iov[]->bv_len is set to PAGE_SIZE, but actual data received might + * be smaller, so we need to adjust it according to + * bd_u.bd_kiov.bd_enc_vec[]->bv_len. 
+ * this means we DO NOT support the situation that server send an odd size + * data in a page which is not the last one. + * - for server write: we knows exactly data size for each page being expected, + * thus bv_len is accurate already, so we should not adjust it at all. + * and bd_u.bd_kiov.bd_enc_vec[]->bv_len should be + * round_up(bd_iov[]->bv_len) which + * should have been done by prep_bulk(). + */ +static +int krb5_decrypt_bulk(struct crypto_sync_skcipher *tfm, + struct krb5_header *khdr, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + rawobj_t *plain, + int adj_nob) +{ + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + struct sg_table sg_src, sg_dst; + int ct_nob = 0, pt_nob = 0; + int blocksize, i, rc; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_enc_vec); + LASSERT(desc->bd_nob_transferred); + + blocksize = crypto_sync_skcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); + + if (desc->bd_nob_transferred % blocksize) { + CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred); + return -EPROTO; + } + + /* decrypt head (confounder) */ + rc = gss_setup_sgtable(&sg_src, &src, cipher->data, blocksize); + if (rc != 0) + return rc; + + rc = gss_setup_sgtable(&sg_dst, &dst, plain->data, blocksize); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + return rc; + } + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + blocksize, local_iv); + + rc = crypto_skcipher_decrypt_iv(req, sg_dst.sgl, sg_src.sgl, blocksize); + + gss_teardown_sgtable(&sg_dst); + gss_teardown_sgtable(&sg_src); + + if (rc) { + CERROR("error to decrypt confounder: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + + for (i = 0; i < desc->bd_iov_count && ct_nob < desc->bd_nob_transferred; + i++) { + if (desc->bd_enc_vec[i].bv_offset % blocksize != 0 || + desc->bd_enc_vec[i].bv_len % blocksize != 0) { + CERROR("page %d: odd offset %u len %u, blocksize %d\n", + i, desc->bd_enc_vec[i].bv_offset, + desc->bd_enc_vec[i].bv_len, + blocksize); + skcipher_request_zero(req); + return -EFAULT; + } + + if (adj_nob) { + if (ct_nob + desc->bd_enc_vec[i].bv_len > + desc->bd_nob_transferred) + desc->bd_enc_vec[i].bv_len = + desc->bd_nob_transferred - ct_nob; + + desc->bd_vec[i].bv_len = + desc->bd_enc_vec[i].bv_len; + if (pt_nob + desc->bd_enc_vec[i].bv_len > + desc->bd_nob) + desc->bd_vec[i].bv_len = + desc->bd_nob - pt_nob; + } else { + /* this should be guaranteed by LNET */ + LASSERT(ct_nob + desc->bd_enc_vec[i]. + bv_len <= + desc->bd_nob_transferred); + LASSERT(desc->bd_vec[i].bv_len <= + desc->bd_enc_vec[i].bv_len); + } + + if (desc->bd_enc_vec[i].bv_len == 0) + continue; + + sg_init_table(&src, 1); + sg_set_page(&src, desc->bd_enc_vec[i].bv_page, + desc->bd_enc_vec[i].bv_len, + desc->bd_enc_vec[i].bv_offset); + dst = src; + if (desc->bd_vec[i].bv_len % blocksize == 0) + sg_assign_page(&dst, + desc->bd_vec[i].bv_page); + + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + src.length, local_iv); + rc = crypto_skcipher_decrypt_iv(req, &dst, &src, src.length); + if (rc) { + CERROR("error to decrypt page: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + + if (desc->bd_vec[i].bv_len % blocksize != 0) { + memcpy(page_address(desc->bd_vec[i].bv_page) + + desc->bd_vec[i].bv_offset, + page_address(desc->bd_enc_vec[i]. 
+ bv_page) + + desc->bd_vec[i].bv_offset, + desc->bd_vec[i].bv_len); + } + + ct_nob += desc->bd_enc_vec[i].bv_len; + pt_nob += desc->bd_vec[i].bv_len; + } + + if (unlikely(ct_nob != desc->bd_nob_transferred)) { + CERROR("%d cipher text transferred but only %d decrypted\n", + desc->bd_nob_transferred, ct_nob); + skcipher_request_zero(req); + return -EFAULT; + } + + if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) { + CERROR("%d plain text expected but only %d received\n", + desc->bd_nob, pt_nob); + skcipher_request_zero(req); + return -EFAULT; + } + + /* if needed, clear up the rest unused iovs */ + if (adj_nob) + while (i < desc->bd_iov_count) + desc->bd_vec[i++].bv_len = 0; + + /* decrypt tail (krb5 header) */ + rc = gss_setup_sgtable(&sg_src, &src, cipher->data + blocksize, + sizeof(*khdr)); + if (rc != 0) + return rc; + + rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data + blocksize, + sizeof(*khdr)); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + return rc; + } + + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + src.length, local_iv); + rc = crypto_skcipher_decrypt_iv(req, sg_dst.sgl, sg_src.sgl, + sizeof(*khdr)); + gss_teardown_sgtable(&sg_src); + gss_teardown_sgtable(&sg_dst); + + skcipher_request_zero(req); + if (rc) { + CERROR("error to decrypt tail: %d\n", rc); + return rc; + } + + if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) { + CERROR("krb5 header doesn't match\n"); + return -EACCES; + } + + return 0; +} + +static +__u32 gss_wrap_kerberos(struct gss_ctx *gctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[3], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; + __u8 local_iv[16] = {0}; + u32 major; + int rc = 0; + + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + LASSERT(kctx->kc_keye.kb_tfm == NULL || + ke->ke_conf_size >= + crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm)); + + /* + * final token format: + * --------------------------------------------------- + * | krb5 header | cipher text | checksum (16 bytes) | + * --------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *)token->data; + fill_krb5_header(kctx, khdr, 1); + + /* generate confounder */ + get_random_bytes(conf, ke->ke_conf_size); + + /* get encryption blocksize. 
note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize( + kctx->kc_keye.kb_tfm); + } + LASSERT(blocksize <= ke->ke_conf_size); + + /* padding the message */ + if (gss_add_padding(msg, msg_buflen, blocksize)) + return GSS_S_FAILURE; + + /* + * clear text layout for checksum: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = gsshdr->data; + data_desc[1].len = gsshdr->len; + data_desc[2].data = msg->data; + data_desc[2].len = msg->len; + + /* compute checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, data_desc, 0, NULL, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + LASSERT(cksum.len >= ke->ke_hash_size); + + /* + * clear text layout for encryption: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = msg->data; + data_desc[1].len = msg->len; + data_desc[2].data = (__u8 *) khdr; + data_desc[2].len = sizeof(*khdr); + + /* cipher text will be directly inplace */ + cipher.data = (__u8 *)(khdr + 1); + cipher.len = token->len - sizeof(*khdr); + LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr)); + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + rawobj_t arc4_keye = RAWOBJ_EMPTY; + struct crypto_sync_skcipher *arc4_tfm; + + if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, + NULL, 1, &cksum, 0, NULL, &arc4_keye, + gctx->hash_func)) { + CERROR("failed to obtain arc4 enc key\n"); + GOTO(arc4_out_key, rc = -EACCES); + } + + arc4_tfm = crypto_alloc_sync_skcipher("ecb(arc4)", 0, 0); + if (IS_ERR(arc4_tfm)) { + CERROR("failed to alloc tfm arc4 in ECB mode\n"); + GOTO(arc4_out_key, rc = -EACCES); + } + + if (crypto_sync_skcipher_setkey(arc4_tfm, arc4_keye.data, + arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } + + rc = gss_crypt_rawobjs(arc4_tfm, NULL, 3, data_desc, + &cipher, 1); +arc4_out_tfm: + crypto_free_sync_skcipher(arc4_tfm); +arc4_out_key: + rawobj_free(&arc4_keye); + } else { + rc = gss_crypt_rawobjs(kctx->kc_keye.kb_tfm, local_iv, 3, + data_desc, &cipher, 1); + } + + if (rc) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; +} + +static +__u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + int blocksize, i; + + LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_enc_vec); + LASSERT(kctx->kc_keye.kb_tfm); + + blocksize = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(desc->bd_enc_vec[i].bv_page); + /* + * 
offset should always start at page boundary of either + * client or server side. + */ + if (desc->bd_vec[i].bv_offset & blocksize) { + CERROR("odd offset %d in page %d\n", + desc->bd_vec[i].bv_offset, i); + return GSS_S_FAILURE; + } + + desc->bd_enc_vec[i].bv_offset = + desc->bd_vec[i].bv_offset; + desc->bd_enc_vec[i].bv_len = + (desc->bd_vec[i].bv_len + + blocksize - 1) & (~(blocksize - 1)); + } + + return GSS_S_COMPLETE; +} + +static +__u32 gss_wrap_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksz; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[1], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; + int rc = 0; + u32 major; + + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + + /* + * final token format: + * -------------------------------------------------- + * | krb5 header | head/tail cipher text | checksum | + * -------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *)token->data; + fill_krb5_header(kctx, khdr, 1); + + /* generate confounder */ + get_random_bytes(conf, ke->ke_conf_size); + + /* get encryption blocksize. note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } + + /* + * we assume the size of krb5_header (16 bytes) must be n * blocksize. + * the bulk token size would be exactly (sizeof(krb5_header) + + * blocksize + sizeof(krb5_header) + hashsize) + */ + LASSERT(blocksz <= ke->ke_conf_size); + LASSERT(sizeof(*khdr) >= blocksz && sizeof(*khdr) % blocksz == 0); + LASSERT(token->len >= sizeof(*khdr) + blocksz + sizeof(*khdr) + 16); + + /* + * clear text layout for checksum: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + + /* compute checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 1, data_desc, + desc->bd_iov_count, desc->bd_vec, + &cksum, gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + LASSERT(cksum.len >= ke->ke_hash_size); + + /* + * clear text layout for encryption: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + * | | | + * ---------- (cipher pages) | + * result token: | | + * ------------------------------------------- + * | krb5 header | cipher text | cipher text | + * ------------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + + cipher.data = (__u8 *)(khdr + 1); + cipher.len = blocksz + sizeof(*khdr); + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LBUG(); + rc = 0; + } else { + rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + conf, desc, &cipher, adj_nob); + } + if (rc) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + /* final 
token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; +} + +static +__u32 gss_unwrap_kerberos(struct gss_ctx *gctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *msg) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + unsigned char *tmpbuf; + int blocksz, bodysize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher_in, plain_out; + rawobj_t hash_objs[3]; + int rc = 0; + __u32 major; + __u8 local_iv[16] = {0}; + + LASSERT(ke); + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *)token->data; + + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } + + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } + + /* expected token layout: + * ---------------------------------------- + * | krb5 header | cipher text | checksum | + * ---------------------------------------- + */ + bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size; + + if (bodysize % blocksz) { + CERROR("odd bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } + + if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) { + CERROR("incomplete token: bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } + + if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) { + CERROR("buffer too small: %u, require %d\n", + msg->len, bodysize - ke->ke_conf_size); + return GSS_S_FAILURE; + } + + /* decrypting */ + OBD_ALLOC_LARGE(tmpbuf, bodysize); + if (!tmpbuf) + return GSS_S_FAILURE; + + major = GSS_S_FAILURE; + + cipher_in.data = (__u8 *)(khdr + 1); + cipher_in.len = bodysize; + plain_out.data = tmpbuf; + plain_out.len = bodysize; + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + rawobj_t arc4_keye; + struct crypto_sync_skcipher *arc4_tfm; + + cksum.data = token->data + token->len - ke->ke_hash_size; + cksum.len = ke->ke_hash_size; + + if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, + NULL, 1, &cksum, 0, NULL, &arc4_keye, + gctx->hash_func)) { + CERROR("failed to obtain arc4 enc key\n"); + GOTO(arc4_out, rc = -EACCES); + } + + arc4_tfm = crypto_alloc_sync_skcipher("ecb(arc4)", 0, 0); + if (IS_ERR(arc4_tfm)) { + CERROR("failed to alloc tfm arc4 in ECB mode\n"); + GOTO(arc4_out_key, rc = -EACCES); + } + + if (crypto_sync_skcipher_setkey(arc4_tfm, arc4_keye.data, + arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } + + rc = gss_crypt_rawobjs(arc4_tfm, NULL, 1, &cipher_in, + &plain_out, 0); +arc4_out_tfm: + crypto_free_sync_skcipher(arc4_tfm); +arc4_out_key: + rawobj_free(&arc4_keye); +arc4_out: + cksum = RAWOBJ_EMPTY; + } else { + rc = gss_crypt_rawobjs(kctx->kc_keye.kb_tfm, local_iv, 1, + &cipher_in, &plain_out, 0); + } + + if (rc != 0) { + CERROR("error decrypt\n"); + goto out_free; + } + LASSERT(plain_out.len == bodysize); + + /* expected clear text layout: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + + /* verify krb5 header in token is not modified */ + if (memcmp(khdr, plain_out.data + 
plain_out.len - sizeof(*khdr), + sizeof(*khdr))) { + CERROR("decrypted krb5 header mismatch\n"); + goto out_free; + } + + /* verify checksum, compose clear text as layout: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + hash_objs[0].len = ke->ke_conf_size; + hash_objs[0].data = plain_out.data; + hash_objs[1].len = gsshdr->len; + hash_objs[1].data = gsshdr->data; + hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr); + hash_objs[2].data = plain_out.data + ke->ke_conf_size; + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, hash_objs, 0, NULL, &cksum, + gctx->hash_func)) + goto out_free; + + LASSERT(cksum.len >= ke->ke_hash_size); + if (memcmp((char *)(khdr + 1) + bodysize, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + goto out_free; + } + + msg->len = bodysize - ke->ke_conf_size - sizeof(*khdr); + memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len); + + major = GSS_S_COMPLETE; +out_free: + OBD_FREE_LARGE(tmpbuf, bodysize); + rawobj_free(&cksum); + return major; +} + +static +__u32 gss_unwrap_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksz; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher, plain; + rawobj_t data_desc[1]; + int rc; + __u32 major; + + LASSERT(ke); + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *)token->data; + + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } + + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + LBUG(); + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } + LASSERT(sizeof(*khdr) >= blocksz && sizeof(*khdr) % blocksz == 0); + + /* + * token format is expected as: + * ----------------------------------------------- + * | krb5 header | head/tail cipher text | cksum | + * ----------------------------------------------- + */ + if (token->len < sizeof(*khdr) + blocksz + sizeof(*khdr) + + ke->ke_hash_size) { + CERROR("short token size: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + cipher.data = (__u8 *) (khdr + 1); + cipher.len = blocksz + sizeof(*khdr); + plain.data = cipher.data; + plain.len = cipher.len; + + rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + desc, &cipher, &plain, adj_nob); + if (rc) + return GSS_S_DEFECTIVE_TOKEN; + + /* + * verify checksum, compose clear text as layout: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = plain.data; + data_desc[0].len = blocksz; + + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 1, data_desc, + desc->bd_iov_count, + desc->bd_vec, + &cksum, gctx->hash_func)) + return GSS_S_FAILURE; + LASSERT(cksum.len >= ke->ke_hash_size); + + if (memcmp(plain.data + blocksz + sizeof(*khdr), + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + rawobj_free(&cksum); + return GSS_S_BAD_SIG; + } 
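[Editor's note, not part of the patch] To make the size checks in gss_wrap_bulk_kerberos() and the unwrap path above easier to follow: the bulk token itself only carries the header, one encrypted confounder block plus an encrypted copy of the header, and the checksum, while the page payload travels separately in desc->bd_enc_vec. A small sketch of the minimum token length both sides insist on (illustrative only):

/* Sketch: | krb5 header | confounder block + embedded header (cipher) | cksum |
 * mirrors the token->len LASSERT in gss_wrap_bulk_kerberos() and the
 * short-token check in gss_unwrap_bulk_kerberos(). */
static inline unsigned int krb5_bulk_token_min_len_sketch(unsigned int blocksz,
							  unsigned int hash_size)
{
	return sizeof(struct krb5_header) + blocksz +
	       sizeof(struct krb5_header) + hash_size;
}

With cbc(aes) (16-byte blocks) and the truncated SHA-1 checksum (12 bytes) that comes to 16 + 16 + 16 + 12 = 60 bytes.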
+ + rawobj_free(&cksum); + return GSS_S_COMPLETE; +} + +int gss_display_kerberos(struct gss_ctx *ctx, + char *buf, + int bufsize) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + int written; + + written = scnprintf(buf, bufsize, "krb5 (%s)", + enctype2str(kctx->kc_enctype)); + return written; +} + +static struct gss_api_ops gss_kerberos_ops = { + .gss_import_sec_context = gss_import_sec_context_kerberos, + .gss_copy_reverse_context = gss_copy_reverse_context_kerberos, + .gss_inquire_context = gss_inquire_context_kerberos, + .gss_get_mic = gss_get_mic_kerberos, + .gss_verify_mic = gss_verify_mic_kerberos, + .gss_wrap = gss_wrap_kerberos, + .gss_unwrap = gss_unwrap_kerberos, + .gss_prep_bulk = gss_prep_bulk_kerberos, + .gss_wrap_bulk = gss_wrap_bulk_kerberos, + .gss_unwrap_bulk = gss_unwrap_bulk_kerberos, + .gss_delete_sec_context = gss_delete_sec_context_kerberos, + .gss_display = gss_display_kerberos, +}; + +static struct subflavor_desc gss_kerberos_sfs[] = { + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5N, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_NULL, + .sf_name = "krb5n" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5A, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_AUTH, + .sf_name = "krb5a" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5I, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_INTG, + .sf_name = "krb5i" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5P, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_PRIV, + .sf_name = "krb5p" + }, +}; + +static struct gss_api_mech gss_kerberos_mech = { + /* .gm_owner uses default NULL value for THIS_MODULE */ + .gm_name = "krb5", + .gm_oid = (rawobj_t) + {9, "\052\206\110\206\367\022\001\002\002"}, + .gm_ops = &gss_kerberos_ops, + .gm_sf_num = 4, + .gm_sfs = gss_kerberos_sfs, +}; + +int __init init_kerberos_module(void) +{ + int status; + + status = lgss_mech_register(&gss_kerberos_mech); + if (status) + CERROR("Failed to register kerberos gss mechanism!\n"); + return status; +} + +void cleanup_kerberos_module(void) +{ + lgss_mech_unregister(&gss_kerberos_mech); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c new file mode 100644 index 0000000000000..ee2b851e90c82 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c @@ -0,0 +1,361 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_mech_switch.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_crypto.h" + +static LIST_HEAD(registered_mechs); +static DEFINE_SPINLOCK(registered_mechs_lock); + +int lgss_mech_register(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_add(&gm->gm_list, ®istered_mechs); + spin_unlock(®istered_mechs_lock); + CDEBUG(D_SEC, "register %s mechanism\n", gm->gm_name); + return 0; +} + +void lgss_mech_unregister(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_del(&gm->gm_list); + spin_unlock(®istered_mechs_lock); + CDEBUG(D_SEC, "Unregister %s mechanism\n", gm->gm_name); +} + + +struct gss_api_mech *lgss_mech_get(struct gss_api_mech *gm) +{ + __module_get(gm->gm_owner); + return gm; +} + +struct gss_api_mech *lgss_name_to_mech(char *name) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (0 == strcmp(name, pos->gm_name)) { + if (!try_module_get(pos->gm_owner)) + continue; + gm = pos; + break; + } + } + spin_unlock(®istered_mechs_lock); + return gm; + +} + +static inline +int mech_supports_subflavor(struct gss_api_mech *gm, __u32 subflavor) +{ + int i; + + for (i = 0; i < gm->gm_sf_num; i++) { + if (gm->gm_sfs[i].sf_subflavor == subflavor) + return 1; + } + return 0; +} + +struct gss_api_mech *lgss_subflavor_to_mech(__u32 subflavor) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (!try_module_get(pos->gm_owner)) + continue; + if (!mech_supports_subflavor(pos, subflavor)) { + module_put(pos->gm_owner); + continue; + } + gm = pos; + break; + } + spin_unlock(®istered_mechs_lock); + return gm; +} + +void lgss_mech_put(struct gss_api_mech *gm) +{ + module_put(gm->gm_owner); +} + +/* The mech could probably be determined from the token instead, but it's just + * as easy for now to pass it in. 
*/ +__u32 lgss_import_sec_context(rawobj_t *input_token, + struct gss_api_mech *mech, + struct gss_ctx **ctx_id) +{ + OBD_ALLOC_PTR(*ctx_id); + if (*ctx_id == NULL) + return GSS_S_FAILURE; + + (*ctx_id)->mech_type = lgss_mech_get(mech); + (*ctx_id)->hash_func = gss_digest_hash; + + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_import_sec_context); + return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id); +} + +__u32 lgss_copy_reverse_context(struct gss_ctx *ctx_id, + struct gss_ctx **ctx_id_new) +{ + struct gss_api_mech *mech = ctx_id->mech_type; + __u32 major; + + LASSERT(mech); + + OBD_ALLOC_PTR(*ctx_id_new); + if (*ctx_id_new == NULL) + return GSS_S_FAILURE; + + (*ctx_id_new)->mech_type = lgss_mech_get(mech); + (*ctx_id_new)->hash_func = ctx_id->hash_func; + + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_copy_reverse_context); + + major = mech->gm_ops->gss_copy_reverse_context(ctx_id, *ctx_id_new); + if (major != GSS_S_COMPLETE) { + lgss_mech_put(mech); + OBD_FREE_PTR(*ctx_id_new); + *ctx_id_new = NULL; + } + return major; +} + +/* + * this interface is much simplified, currently we only need endtime. + */ +__u32 lgss_inquire_context(struct gss_ctx *context_handle, + time64_t *endtime) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_inquire_context); + + return context_handle->mech_type->gm_ops + ->gss_inquire_context(context_handle, + endtime); +} + +/* gss_get_mic: compute a mic over message and return mic_token. */ +__u32 lgss_get_mic(struct gss_ctx *context_handle, + int msgcnt, + rawobj_t *msg, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *mic_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_get_mic); + + return context_handle->mech_type->gm_ops + ->gss_get_mic(context_handle, + msgcnt, + msg, + iovcnt, + iovs, + mic_token); +} + +/* gss_verify_mic: check whether the provided mic_token verifies message. 
*/ +__u32 lgss_verify_mic(struct gss_ctx *context_handle, + int msgcnt, + rawobj_t *msg, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *mic_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_verify_mic); + + return context_handle->mech_type->gm_ops + ->gss_verify_mic(context_handle, + msgcnt, + msg, + iovcnt, + iovs, + mic_token); +} + +__u32 lgss_wrap(struct gss_ctx *context_handle, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_wrap); + + return context_handle->mech_type->gm_ops + ->gss_wrap(context_handle, gsshdr, msg, msg_buflen, out_token); +} + +__u32 lgss_unwrap(struct gss_ctx *context_handle, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_unwrap); + + return context_handle->mech_type->gm_ops + ->gss_unwrap(context_handle, gsshdr, token, out_msg); +} + + +__u32 lgss_prep_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_prep_bulk); + + return context_handle->mech_type->gm_ops + ->gss_prep_bulk(context_handle, desc); +} + +__u32 lgss_wrap_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_wrap_bulk); + + return context_handle->mech_type->gm_ops + ->gss_wrap_bulk(context_handle, desc, token, adj_nob); +} + +__u32 lgss_unwrap_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_unwrap_bulk); + + return context_handle->mech_type->gm_ops + ->gss_unwrap_bulk(context_handle, desc, token, adj_nob); +} + +/* gss_delete_sec_context: free all resources associated with context_handle. + * Note this differs from the RFC 2744-specified prototype in that we don't + * bother returning an output token, since it would never be used anyway. 
*/ + +__u32 lgss_delete_sec_context(struct gss_ctx **context_handle) +{ + struct gss_api_mech *mech; + + if (!*context_handle) + return GSS_S_NO_CONTEXT; + + CDEBUG(D_SEC, "deleting %p\n", *context_handle); + + mech = (*context_handle)->mech_type; + if ((*context_handle)->internal_ctx_id != NULL) { + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_delete_sec_context); + mech->gm_ops->gss_delete_sec_context( + (*context_handle)->internal_ctx_id); + } + if (mech) + lgss_mech_put(mech); + + OBD_FREE_PTR(*context_handle); + *context_handle = NULL; + return GSS_S_COMPLETE; +} + +int lgss_display(struct gss_ctx *ctx, + char *buf, + int bufsize) +{ + LASSERT(ctx); + LASSERT(ctx->mech_type); + LASSERT(ctx->mech_type->gm_ops); + LASSERT(ctx->mech_type->gm_ops->gss_display); + + return ctx->mech_type->gm_ops->gss_display(ctx, buf, bufsize); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c new file mode 100644 index 0000000000000..6362673743bcf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c @@ -0,0 +1,220 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, 2015, Trustees of Indiana University + * + * Copyright (c) 2014, Intel Corporation. 
+ * + * Author: Jeremy Filizetti + * Author: Andrew Korty + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_asn1.h" + +struct null_ctx { + __u64 nc_token; +}; + +static +__u32 gss_import_sec_context_null(rawobj_t *inbuf, struct gss_ctx *gss_context) +{ + struct null_ctx *null_context; + + if (inbuf == NULL || inbuf->data == NULL || + inbuf->len != sizeof(*null_context)) { + CDEBUG(D_SEC, "Invalid input buffer for null context\n"); + return GSS_S_FAILURE; + } + + OBD_ALLOC_PTR(null_context); + if (null_context == NULL) + return GSS_S_FAILURE; + + memcpy(&null_context->nc_token, inbuf->data, inbuf->len); + + gss_context->internal_ctx_id = null_context; + CDEBUG(D_SEC, "successfully imported null context\n"); + + return GSS_S_COMPLETE; +} + +static +__u32 gss_copy_reverse_context_null(struct gss_ctx *gss_context_old, + struct gss_ctx *gss_context_new) +{ + struct null_ctx *null_context_old; + struct null_ctx *null_context_new; + + OBD_ALLOC_PTR(null_context_new); + if (null_context_new == NULL) + return GSS_S_FAILURE; + + null_context_old = gss_context_old->internal_ctx_id; + memcpy(null_context_new, null_context_old, sizeof(*null_context_new)); + gss_context_new->internal_ctx_id = null_context_new; + CDEBUG(D_SEC, "successfully copied reverse null context\n"); + + return GSS_S_COMPLETE; +} + +static +__u32 gss_inquire_context_null(struct gss_ctx *gss_context, + time64_t *endtime) +{ + /* quick timeout for testing purposes */ + *endtime = ktime_get_real_seconds() + 60; + return GSS_S_COMPLETE; +} + +static +__u32 gss_wrap_null(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *message, int message_buffer_length, + rawobj_t *token) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_null(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *token, rawobj_t *message) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_prep_bulk_null(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_wrap_bulk_null(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, rawobj_t *token, + int adj_nob) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_bulk_null(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + return GSS_S_COMPLETE; +} + +static +void gss_delete_sec_context_null(void *internal_context) +{ + struct null_ctx *null_context = internal_context; + + OBD_FREE_PTR(null_context); +} + +int gss_display_null(struct gss_ctx *gss_context, char *buf, int bufsize) +{ + return scnprintf(buf, bufsize, "null"); +} + +static +__u32 gss_get_mic_null(struct gss_ctx *gss_context, int message_count, + rawobj_t *messages, int iov_count, struct bio_vec *iovs, + rawobj_t *token) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_verify_mic_null(struct gss_ctx *gss_context, int message_count, + rawobj_t *messages, int iov_count, + struct bio_vec *iovs, + rawobj_t *token) +{ + return GSS_S_COMPLETE; +} + +static struct gss_api_ops gss_null_ops = { + .gss_import_sec_context = gss_import_sec_context_null, + .gss_copy_reverse_context = gss_copy_reverse_context_null, + .gss_inquire_context = gss_inquire_context_null, + .gss_get_mic = gss_get_mic_null, + .gss_verify_mic = gss_verify_mic_null, + .gss_wrap = gss_wrap_null, + .gss_unwrap = gss_unwrap_null, + .gss_prep_bulk = gss_prep_bulk_null, + .gss_wrap_bulk = 
gss_wrap_bulk_null, + .gss_unwrap_bulk = gss_unwrap_bulk_null, + .gss_delete_sec_context = gss_delete_sec_context_null, + .gss_display = gss_display_null, +}; + +static struct subflavor_desc gss_null_sfs[] = { + { + .sf_subflavor = SPTLRPC_SUBFLVR_GSSNULL, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_NULL, + .sf_name = "gssnull" + }, +}; + +static struct gss_api_mech gss_null_mech = { + /* .gm_owner uses default NULL value for THIS_MODULE */ + .gm_name = "gssnull", + .gm_oid = (rawobj_t) { + 12, + "\053\006\001\004\001\311\146\215\126\001\000\000" + }, + .gm_ops = &gss_null_ops, + .gm_sf_num = 1, + .gm_sfs = gss_null_sfs, +}; + +int __init init_null_module(void) +{ + int status; + + status = lgss_mech_register(&gss_null_mech); + if (status) + CERROR("Failed to register null gss mechanism!\n"); + + return status; +} + +void cleanup_null_module(void) +{ + lgss_mech_unregister(&gss_null_mech); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c new file mode 100644 index 0000000000000..4a21cc77a6eea --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c @@ -0,0 +1,1255 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2016, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/auth_gss.c + * + * RPCSEC_GSS client authentication. + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Dug Song + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
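[Editor's note, not part of the patch] Stepping back before the pipefs policy code that this file introduces: gss_krb5_mech.c and gss_null_mech.c above both plug into the registry in gss_mech_switch.c by filling in a struct gss_api_mech (ops table, subflavor list, OID) and calling lgss_mech_register() from module init. Consumers then resolve a mechanism by name or subflavor. A small illustrative sketch of the lookup/put pairing that the registry spinlock and try_module_get() are protecting (the function is hypothetical):

/* Sketch: find a registered mechanism by name and drop the module
 * reference again; lgss_name_to_mech()/lgss_mech_put() are the helpers
 * defined in gss_mech_switch.c above. */
static int demo_lookup_mech_sketch(void)
{
	struct gss_api_mech *gm;

	gm = lgss_name_to_mech("krb5");	/* takes a module reference on hit */
	if (!gm)
		return -ENOENT;

	CDEBUG(D_SEC, "found mech %s with %d subflavors\n",
	       gm->gm_name, gm->gm_sf_num);

	lgss_mech_put(gm);		/* releases the module reference */
	return 0;
}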
+ * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include +#include +struct rpc_clnt; /* for rpc_pipefs */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static struct ptlrpc_sec_policy gss_policy_pipefs; +static struct ptlrpc_ctx_ops gss_pipefs_ctxops; + +static int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx); + +static int gss_sec_pipe_upcall_init(struct gss_sec *gsec) +{ + return 0; +} + +static void gss_sec_pipe_upcall_fini(struct gss_sec *gsec) +{ +} + +/**************************************** + * internal context helpers * + ****************************************/ + +static +struct ptlrpc_cli_ctx *ctx_create_pf(struct ptlrpc_sec *sec, + struct vfs_cred *vcred) +{ + struct gss_cli_ctx *gctx; + int rc; + + OBD_ALLOC_PTR(gctx); + if (gctx == NULL) + return NULL; + + rc = gss_cli_ctx_init_common(sec, &gctx->gc_base, + &gss_pipefs_ctxops, vcred); + if (rc) { + OBD_FREE_PTR(gctx); + return NULL; + } + + return &gctx->gc_base; +} + +static +void ctx_destroy_pf(struct ptlrpc_sec *sec, struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + if (gss_cli_ctx_fini_common(sec, ctx)) + return; + + OBD_FREE_PTR(gctx); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static +void ctx_enhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *hash) +{ + set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + atomic_inc(&ctx->cc_refcount); + hlist_add_head(&ctx->cc_cache, hash); +} + +/* + * caller must hold spinlock + */ +static +void ctx_unhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *freelist) +{ + assert_spin_locked(&ctx->cc_sec->ps_lock); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + LASSERT(!hlist_unhashed(&ctx->cc_cache)); + + clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + + if (atomic_dec_and_test(&ctx->cc_refcount)) { + __hlist_del(&ctx->cc_cache); + hlist_add_head(&ctx->cc_cache, freelist); + } else { + hlist_del_init(&ctx->cc_cache); + } +} + +/* + * return 1 if the context is dead. 
+ */ +static +int ctx_check_death_pf(struct ptlrpc_cli_ctx *ctx, + struct hlist_head *freelist) +{ + if (cli_ctx_check_death(ctx)) { + if (freelist) + ctx_unhash_pf(ctx, freelist); + return 1; + } + + return 0; +} + +static inline +int ctx_check_death_locked_pf(struct ptlrpc_cli_ctx *ctx, + struct hlist_head *freelist) +{ + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + + return ctx_check_death_pf(ctx, freelist); +} + +static inline +int ctx_match_pf(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred) +{ + /* a little bit optimization for null policy */ + if (!ctx->cc_ops->match) + return 1; + + return ctx->cc_ops->match(ctx, vcred); +} + +static +void ctx_list_destroy_pf(struct hlist_head *head) +{ + struct ptlrpc_cli_ctx *ctx; + + while (!hlist_empty(head)) { + ctx = cfs_hlist_entry(head->first, struct ptlrpc_cli_ctx, + cc_cache); + + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, + &ctx->cc_flags) == 0); + + hlist_del_init(&ctx->cc_cache); + ctx_destroy_pf(ctx->cc_sec, ctx); + } +} + +/**************************************** + * context apis * + ****************************************/ + +static +int gss_cli_ctx_validate_pf(struct ptlrpc_cli_ctx *ctx) +{ + if (ctx_check_death_pf(ctx, NULL)) + return 1; + if (cli_ctx_is_ready(ctx)) + return 0; + return 1; +} + +static +void gss_cli_ctx_die_pf(struct ptlrpc_cli_ctx *ctx, int grace) +{ + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + cli_ctx_expire(ctx); + + spin_lock(&ctx->cc_sec->ps_lock); + + if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)) { + LASSERT(!hlist_unhashed(&ctx->cc_cache)); + LASSERT(atomic_read(&ctx->cc_refcount) > 1); + + hlist_del_init(&ctx->cc_cache); + if (atomic_dec_and_test(&ctx->cc_refcount)) + LBUG(); + } + + spin_unlock(&ctx->cc_sec->ps_lock); +} + +/**************************************** + * reverse context installation * + ****************************************/ + +static inline +unsigned int ctx_hash_index(int hashsize, __u64 key) +{ + return (unsigned int) (key & ((__u64) hashsize - 1)); +} + +static +void gss_sec_ctx_replace_pf(struct gss_sec *gsec, + struct ptlrpc_cli_ctx *new) +{ + struct hlist_node __maybe_unused *pos, *next; + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx; + HLIST_HEAD(freelist); + unsigned int hash; + ENTRY; + + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + hash = ctx_hash_index(gsec_pf->gsp_chash_size, + (__u64) new->cc_vcred.vc_uid); + LASSERT(hash < gsec_pf->gsp_chash_size); + + spin_lock(&gsec->gs_base.ps_lock); + + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_pf->gsp_chash[hash], cc_cache) { + if (!ctx_match_pf(ctx, &new->cc_vcred)) + continue; + + cli_ctx_expire(ctx); + ctx_unhash_pf(ctx, &freelist); + break; + } + + ctx_enhash_pf(new, &gsec_pf->gsp_chash[hash]); + + spin_unlock(&gsec->gs_base.ps_lock); + + ctx_list_destroy_pf(&freelist); + EXIT; +} + +static +int gss_install_rvs_cli_ctx_pf(struct gss_sec *gsec, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct vfs_cred vcred; + struct ptlrpc_cli_ctx *cli_ctx; + int rc; + ENTRY; + + vcred.vc_uid = 0; + vcred.vc_gid = 0; + + cli_ctx = ctx_create_pf(&gsec->gs_base, &vcred); + if (!cli_ctx) + RETURN(-ENOMEM); + + rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx); + if (rc) { + ctx_destroy_pf(cli_ctx->cc_sec, cli_ctx); + RETURN(rc); + } + + gss_sec_ctx_replace_pf(gsec, cli_ctx); + RETURN(0); +} + +static +void 
gss_ctx_cache_gc_pf(struct gss_sec_pipefs *gsec_pf, + struct hlist_head *freelist) +{ + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + struct hlist_node __maybe_unused *pos; + struct hlist_node *next; + int i; + ENTRY; + + sec = &gsec_pf->gsp_base.gs_base; + + CDEBUG(D_SEC, "do gc on sec %s@%p\n", sec->ps_policy->sp_name, sec); + + for (i = 0; i < gsec_pf->gsp_chash_size; i++) { + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_pf->gsp_chash[i], cc_cache) + ctx_check_death_locked_pf(ctx, freelist); + } + + sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; + EXIT; +} + +static +struct ptlrpc_sec* gss_sec_create_pf(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *sf) +{ + struct gss_sec_pipefs *gsec_pf; + int alloc_size, hash_size, i; + ENTRY; + +#define GSS_SEC_PIPEFS_CTX_HASH_SIZE (32) + + if (ctx || + sf->sf_flags & (PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_REVERSE)) + hash_size = 1; + else + hash_size = GSS_SEC_PIPEFS_CTX_HASH_SIZE; + + alloc_size = sizeof(*gsec_pf) + + sizeof(struct hlist_head) * hash_size; + + OBD_ALLOC(gsec_pf, alloc_size); + if (!gsec_pf) + RETURN(NULL); + + gsec_pf->gsp_chash_size = hash_size; + for (i = 0; i < hash_size; i++) + INIT_HLIST_HEAD(&gsec_pf->gsp_chash[i]); + + if (gss_sec_create_common(&gsec_pf->gsp_base, &gss_policy_pipefs, + imp, ctx, sf)) + goto err_free; + + if (ctx == NULL) { + if (gss_sec_pipe_upcall_init(&gsec_pf->gsp_base)) + goto err_destroy; + } else { + if (gss_install_rvs_cli_ctx_pf(&gsec_pf->gsp_base, ctx)) + goto err_destroy; + } + + RETURN(&gsec_pf->gsp_base.gs_base); + +err_destroy: + gss_sec_destroy_common(&gsec_pf->gsp_base); +err_free: + OBD_FREE(gsec_pf, alloc_size); + RETURN(NULL); +} + +static +void gss_sec_destroy_pf(struct ptlrpc_sec *sec) +{ + struct gss_sec_pipefs *gsec_pf; + struct gss_sec *gsec; + + CWARN("destroy %s@%p\n", sec->ps_policy->sp_name, sec); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + LASSERT(gsec_pf->gsp_chash); + LASSERT(gsec_pf->gsp_chash_size); + + gss_sec_pipe_upcall_fini(gsec); + + gss_sec_destroy_common(gsec); + + OBD_FREE(gsec, sizeof(*gsec_pf) + + sizeof(struct hlist_head) * gsec_pf->gsp_chash_size); +} + +static +struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_pf(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct gss_sec *gsec; + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx = NULL, *new = NULL; + struct hlist_head *hash_head; + struct hlist_node __maybe_unused *pos, *next; + unsigned int hash, gc = 0, found = 0; + HLIST_HEAD(freelist); + ENTRY; + + might_sleep(); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + hash = ctx_hash_index(gsec_pf->gsp_chash_size, + (__u64) vcred->vc_uid); + hash_head = &gsec_pf->gsp_chash[hash]; + LASSERT(hash < gsec_pf->gsp_chash_size); + +retry: + spin_lock(&sec->ps_lock); + + /* gc_next == 0 means never do gc */ + if (remove_dead && sec->ps_gc_next && + (ktime_get_real_seconds() > sec->ps_gc_next)) { + gss_ctx_cache_gc_pf(gsec_pf, &freelist); + gc = 1; + } + + cfs_hlist_for_each_entry_safe(ctx, pos, next, hash_head, cc_cache) { + if (gc == 0 && + ctx_check_death_locked_pf(ctx, + remove_dead ? 
&freelist : NULL)) + continue; + + if (ctx_match_pf(ctx, vcred)) { + found = 1; + break; + } + } + + if (found) { + if (new && new != ctx) { + /* lost the race, just free it */ + hlist_add_head(&new->cc_cache, &freelist); + new = NULL; + } + + /* hot node, move to head */ + if (hash_head->first != &ctx->cc_cache) { + __hlist_del(&ctx->cc_cache); + hlist_add_head(&ctx->cc_cache, hash_head); + } + } else { + /* don't allocate for reverse sec */ + if (sec_is_reverse(sec)) { + spin_unlock(&sec->ps_lock); + RETURN(NULL); + } + + if (new) { + ctx_enhash_pf(new, hash_head); + ctx = new; + } else if (create) { + spin_unlock(&sec->ps_lock); + new = ctx_create_pf(sec, vcred); + if (new) { + clear_bit(PTLRPC_CTX_NEW_BIT, &new->cc_flags); + goto retry; + } + } else { + ctx = NULL; + } + } + + /* hold a ref */ + if (ctx) + atomic_inc(&ctx->cc_refcount); + + spin_unlock(&sec->ps_lock); + + /* the allocator of the context must give the first push to refresh */ + if (new) { + LASSERT(new == ctx); + gss_cli_ctx_refresh_pf(new); + } + + ctx_list_destroy_pf(&freelist); + RETURN(ctx); +} + +static +void gss_sec_release_ctx_pf(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync) +{ + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + LASSERT(hlist_unhashed(&ctx->cc_cache)); + + /* if required async, we must clear the UPTODATE bit to prevent extra + * rpcs during destroy procedure. */ + if (!sync) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + /* destroy this context */ + ctx_destroy_pf(sec, ctx); +} + +/* + * @uid: which user. "-1" means flush all. + * @grace: mark context DEAD, allow graceful destroy like notify + * server side, etc. + * @force: also flush busy entries. + * + * return the number of busy context encountered. + * + * In any cases, never touch "eternal" contexts. 
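Illustrative sketch (not part of the patch): the flush semantics documented above, as a hypothetical caller might exercise them; the function itself follows immediately below.

static void flush_all_users_sketch(struct ptlrpc_sec *sec)
{
	int busy;

	/* uid == -1: every user; grace == 1: allow graceful destroy;
	 * force == 0: leave contexts that still have other users alone. */
	busy = gss_sec_flush_ctx_cache_pf(sec, -1, 1, 0);
	if (busy)
		CDEBUG(D_SEC, "%d busy context(s) left in place\n", busy);
}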
+ */ +static +int gss_sec_flush_ctx_cache_pf(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + struct gss_sec *gsec; + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx; + struct hlist_node __maybe_unused *pos, *next; + HLIST_HEAD(freelist); + int i, busy = 0; + ENTRY; + + might_sleep_if(grace); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + spin_lock(&sec->ps_lock); + for (i = 0; i < gsec_pf->gsp_chash_size; i++) { + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_pf->gsp_chash[i], + cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (uid != -1 && uid != ctx->cc_vcred.vc_uid) + continue; + + if (atomic_read(&ctx->cc_refcount) > 1) { + busy++; + if (!force) + continue; + + CWARN("flush busy(%d) ctx %p(%u->%s) by force, " + "grace %d\n", + atomic_read(&ctx->cc_refcount), + ctx, ctx->cc_vcred.vc_uid, + sec2target_str(ctx->cc_sec), grace); + } + ctx_unhash_pf(ctx, &freelist); + + set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags); + if (!grace) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, + &ctx->cc_flags); + } + } + spin_unlock(&sec->ps_lock); + + ctx_list_destroy_pf(&freelist); + RETURN(busy); +} + +/**************************************** + * service apis * + ****************************************/ + +static +int gss_svc_accept_pf(struct ptlrpc_request *req) +{ + return gss_svc_accept(&gss_policy_pipefs, req); +} + +static +int gss_svc_install_rctx_pf(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx) +{ + struct ptlrpc_sec *sec; + int rc; + + sec = sptlrpc_import_sec_ref(imp); + LASSERT(sec); + rc = gss_install_rvs_cli_ctx_pf(sec2gsec(sec), ctx); + + sptlrpc_sec_put(sec); + return rc; +} + +/**************************************** + * rpc_pipefs definitions * + ****************************************/ + +#define LUSTRE_PIPE_ROOT "/lustre" +#define LUSTRE_PIPE_KRB5 LUSTRE_PIPE_ROOT"/krb5" + +struct gss_upcall_msg_data { + __u32 gum_seq; + __u32 gum_uid; + __u32 gum_gid; + __u32 gum_svc; /* MDS/OSS... 
*/ + __u64 gum_nid; /* peer NID */ + __u8 gum_obd[64]; /* client obd name */ +}; + +struct gss_upcall_msg { + struct rpc_pipe_msg gum_base; + atomic_t gum_refcount; + struct list_head gum_list; + __u32 gum_mechidx; + struct gss_sec *gum_gsec; + struct gss_cli_ctx *gum_gctx; + struct gss_upcall_msg_data gum_data; +}; + +static atomic_t upcall_seq = ATOMIC_INIT(0); + +static inline +__u32 upcall_get_sequence(void) +{ + return (__u32) atomic_inc_return(&upcall_seq); +} + +enum mech_idx_t { + MECH_KRB5 = 0, + MECH_MAX +}; + +static inline +__u32 mech_name2idx(const char *name) +{ + LASSERT(!strcmp(name, "krb5")); + return MECH_KRB5; +} + +/* pipefs dentries for each mechanisms */ +static struct dentry *de_pipes[MECH_MAX] = { NULL, }; +/* all upcall messgaes linked here */ +static struct list_head upcall_lists[MECH_MAX]; +/* and protected by this */ +static spinlock_t upcall_locks[MECH_MAX]; + +static inline +void upcall_list_lock(int idx) +{ + spin_lock(&upcall_locks[idx]); +} + +static inline +void upcall_list_unlock(int idx) +{ + spin_unlock(&upcall_locks[idx]); +} + +static +void upcall_msg_enlist(struct gss_upcall_msg *msg) +{ + __u32 idx = msg->gum_mechidx; + + upcall_list_lock(idx); + list_add(&msg->gum_list, &upcall_lists[idx]); + upcall_list_unlock(idx); +} + +static +void upcall_msg_delist(struct gss_upcall_msg *msg) +{ + __u32 idx = msg->gum_mechidx; + + upcall_list_lock(idx); + list_del_init(&msg->gum_list); + upcall_list_unlock(idx); +} + +/**************************************** + * rpc_pipefs upcall helpers * + ****************************************/ + +static +void gss_release_msg(struct gss_upcall_msg *gmsg) +{ + ENTRY; + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + + if (!atomic_dec_and_test(&gmsg->gum_refcount)) { + EXIT; + return; + } + + if (gmsg->gum_gctx) { + sptlrpc_cli_ctx_wakeup(&gmsg->gum_gctx->gc_base); + sptlrpc_cli_ctx_put(&gmsg->gum_gctx->gc_base, 1); + gmsg->gum_gctx = NULL; + } + + LASSERT(list_empty(&gmsg->gum_list)); + LASSERT(list_empty(&gmsg->gum_base.list)); + OBD_FREE_PTR(gmsg); + EXIT; +} + +static +void gss_unhash_msg_nolock(struct gss_upcall_msg *gmsg) +{ + __u32 idx = gmsg->gum_mechidx; + + LASSERT(idx < MECH_MAX); + assert_spin_locked(&upcall_locks[idx]); + + if (list_empty(&gmsg->gum_list)) + return; + + list_del_init(&gmsg->gum_list); + LASSERT(atomic_read(&gmsg->gum_refcount) > 1); + atomic_dec(&gmsg->gum_refcount); +} + +static +void gss_unhash_msg(struct gss_upcall_msg *gmsg) +{ + __u32 idx = gmsg->gum_mechidx; + + LASSERT(idx < MECH_MAX); + upcall_list_lock(idx); + gss_unhash_msg_nolock(gmsg); + upcall_list_unlock(idx); +} + +static +void gss_msg_fail_ctx(struct gss_upcall_msg *gmsg) +{ + if (gmsg->gum_gctx) { + struct ptlrpc_cli_ctx *ctx = &gmsg->gum_gctx->gc_base; + + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + sptlrpc_cli_ctx_expire(ctx); + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + } +} + +static +struct gss_upcall_msg * gss_find_upcall(__u32 mechidx, __u32 seq) +{ + struct gss_upcall_msg *gmsg; + + upcall_list_lock(mechidx); + list_for_each_entry(gmsg, &upcall_lists[mechidx], gum_list) { + if (gmsg->gum_data.gum_seq != seq) + continue; + + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + LASSERT(gmsg->gum_mechidx == mechidx); + + atomic_inc(&gmsg->gum_refcount); + upcall_list_unlock(mechidx); + return gmsg; + } + upcall_list_unlock(mechidx); + return NULL; +} + +static +int simple_get_bytes(char **buf, __u32 *buflen, void *res, __u32 reslen) +{ + if (*buflen < reslen) { + CERROR("shorter buflen than needed: %u < %u\n", + 
*buflen, reslen); + return -EINVAL; + } + + memcpy(res, *buf, reslen); + *buf += reslen; + *buflen -= reslen; + return 0; +} + +/**************************************** + * rpc_pipefs apis * + ****************************************/ + +static +ssize_t gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, + char *dst, size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + ssize_t mlen = msg->len; + ssize_t left; + ENTRY; + + if (mlen > buflen) + mlen = buflen; + left = copy_to_user(dst, data, mlen); + if (left < 0) { + msg->errno = left; + RETURN(left); + } + mlen -= left; + msg->copied += mlen; + msg->errno = 0; + RETURN(mlen); +} + +static +ssize_t gss_pipe_downcall(struct file *filp, const char *src, size_t mlen) +{ + struct rpc_inode *rpci = RPC_I(file_inode(filp)); + struct gss_upcall_msg *gss_msg; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx = NULL; + char *buf, *data; + int datalen; + int timeout, rc; + __u32 mechidx, seq, gss_err; + ENTRY; + + mechidx = (__u32) (long) rpci->private; + LASSERT(mechidx < MECH_MAX); + + OBD_ALLOC(buf, mlen); + if (!buf) + RETURN(-ENOMEM); + + if (copy_from_user(buf, src, mlen)) { + CERROR("failed copy user space data\n"); + GOTO(out_free, rc = -EFAULT); + } + data = buf; + datalen = mlen; + + /* data passed down format: + * - seq + * - timeout + * - gc_win / error + * - wire_ctx (rawobj) + * - mech_ctx (rawobj) + */ + if (simple_get_bytes(&data, &datalen, &seq, sizeof(seq))) { + CERROR("fail to get seq\n"); + GOTO(out_free, rc = -EFAULT); + } + + gss_msg = gss_find_upcall(mechidx, seq); + if (!gss_msg) { + CERROR("upcall %u has aborted earlier\n", seq); + GOTO(out_free, rc = -EINVAL); + } + + gss_unhash_msg(gss_msg); + gctx = gss_msg->gum_gctx; + LASSERT(gctx); + LASSERT(atomic_read(&gctx->gc_base.cc_refcount) > 0); + + /* timeout is not in use for now */ + if (simple_get_bytes(&data, &datalen, &timeout, sizeof(timeout))) + GOTO(out_msg, rc = -EFAULT); + + /* lgssd signal an error by gc_win == 0 */ + if (simple_get_bytes(&data, &datalen, &gctx->gc_win, + sizeof(gctx->gc_win))) + GOTO(out_msg, rc = -EFAULT); + + if (gctx->gc_win == 0) { + /* followed by: + * - rpc error + * - gss error + */ + if (simple_get_bytes(&data, &datalen, &rc, sizeof(rc))) + GOTO(out_msg, rc = -EFAULT); + if (simple_get_bytes(&data, &datalen, &gss_err,sizeof(gss_err))) + GOTO(out_msg, rc = -EFAULT); + + if (rc == 0 && gss_err == GSS_S_COMPLETE) { + CWARN("both rpc & gss error code not set\n"); + rc = -EPERM; + } + } else { + rawobj_t tmpobj; + + /* handle */ + if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen)) + GOTO(out_msg, rc = -EFAULT); + if (rawobj_dup(&gctx->gc_handle, &tmpobj)) + GOTO(out_msg, rc = -ENOMEM); + + /* mechctx */ + if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen)) + GOTO(out_msg, rc = -EFAULT); + gss_err = lgss_import_sec_context(&tmpobj, + gss_msg->gum_gsec->gs_mech, + &gctx->gc_mechctx); + rc = 0; + } + + if (likely(rc == 0 && gss_err == GSS_S_COMPLETE)) { + gss_cli_ctx_uptodate(gctx); + } else { + ctx = &gctx->gc_base; + sptlrpc_cli_ctx_expire(ctx); + if (rc != -ERESTART || gss_err != GSS_S_COMPLETE) + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + + CERROR("refresh ctx %p(uid %d) failed: %d/0x%08x: %s\n", + ctx, ctx->cc_vcred.vc_uid, rc, gss_err, + test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags) ? 
+ "fatal error" : "non-fatal"); + } + + rc = mlen; + +out_msg: + gss_release_msg(gss_msg); + +out_free: + OBD_FREE(buf, mlen); + /* FIXME + * hack pipefs: always return asked length unless all following + * downcalls might be messed up. */ + rc = mlen; + RETURN(rc); +} + +static +void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct gss_upcall_msg *gmsg; + struct gss_upcall_msg_data *gumd; + static time64_t ratelimit; + ENTRY; + + LASSERT(list_empty(&msg->list)); + + /* normally errno is >= 0 */ + if (msg->errno >= 0) { + EXIT; + return; + } + + gmsg = container_of(msg, struct gss_upcall_msg, gum_base); + gumd = &gmsg->gum_data; + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + + CERROR("failed msg %p (seq %u, uid %u, svc %u, nid %#llx, obd %.*s): " + "errno %d\n", msg, gumd->gum_seq, gumd->gum_uid, gumd->gum_svc, + gumd->gum_nid, (int) sizeof(gumd->gum_obd), + gumd->gum_obd, msg->errno); + + atomic_inc(&gmsg->gum_refcount); + gss_unhash_msg(gmsg); + if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) { + time64_t now = ktime_get_real_seconds(); + + if (now > ratelimit) { + CWARN("upcall timed out, is lgssd running?\n"); + ratelimit = now + 15; + } + } + gss_msg_fail_ctx(gmsg); + gss_release_msg(gmsg); + EXIT; +} + +static +void gss_pipe_release(struct inode *inode) +{ + struct rpc_inode *rpci = RPC_I(inode); + __u32 idx; + ENTRY; + + idx = (__u32) (long) rpci->private; + LASSERT(idx < MECH_MAX); + + upcall_list_lock(idx); + while (!list_empty(&upcall_lists[idx])) { + struct gss_upcall_msg *gmsg; + struct gss_upcall_msg_data *gumd; + + gmsg = list_entry(upcall_lists[idx].next, + struct gss_upcall_msg, gum_list); + gumd = &gmsg->gum_data; + LASSERT(list_empty(&gmsg->gum_base.list)); + + CERROR("failing remaining msg %p:seq %u, uid %u, svc %u, " + "nid %#llx, obd %.*s\n", gmsg, + gumd->gum_seq, gumd->gum_uid, gumd->gum_svc, + gumd->gum_nid, (int) sizeof(gumd->gum_obd), + gumd->gum_obd); + + gmsg->gum_base.errno = -EPIPE; + atomic_inc(&gmsg->gum_refcount); + gss_unhash_msg_nolock(gmsg); + + gss_msg_fail_ctx(gmsg); + + upcall_list_unlock(idx); + gss_release_msg(gmsg); + upcall_list_lock(idx); + } + upcall_list_unlock(idx); + EXIT; +} + +static struct rpc_pipe_ops gss_upcall_ops = { + .upcall = gss_pipe_upcall, + .downcall = gss_pipe_downcall, + .destroy_msg = gss_pipe_destroy_msg, + .release_pipe = gss_pipe_release, +}; + +/**************************************** + * upcall helper functions * + ****************************************/ + +static +int gss_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx) +{ + struct obd_import *imp; + struct gss_sec *gsec; + struct gss_upcall_msg *gmsg; + int rc = 0; + ENTRY; + + might_sleep(); + + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_import); + LASSERT(ctx->cc_sec->ps_import->imp_obd); + + imp = ctx->cc_sec->ps_import; + if (!imp->imp_connection) { + CERROR("import has no connection set\n"); + RETURN(-EINVAL); + } + + gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base); + + OBD_ALLOC_PTR(gmsg); + if (!gmsg) + RETURN(-ENOMEM); + + /* initialize pipefs base msg */ + INIT_LIST_HEAD(&gmsg->gum_base.list); + gmsg->gum_base.data = &gmsg->gum_data; + gmsg->gum_base.len = sizeof(gmsg->gum_data); + gmsg->gum_base.copied = 0; + gmsg->gum_base.errno = 0; + + /* init upcall msg */ + atomic_set(&gmsg->gum_refcount, 1); + gmsg->gum_mechidx = mech_name2idx(gsec->gs_mech->gm_name); + gmsg->gum_gsec = gsec; + gmsg->gum_gctx = container_of(sptlrpc_cli_ctx_get(ctx), + struct gss_cli_ctx, gc_base); + gmsg->gum_data.gum_seq = upcall_get_sequence(); + 
gmsg->gum_data.gum_uid = ctx->cc_vcred.vc_uid; + gmsg->gum_data.gum_gid = 0; /* not used for now */ + gmsg->gum_data.gum_svc = import_to_gss_svc(imp); + gmsg->gum_data.gum_nid = imp->imp_connection->c_peer.nid; + strlcpy(gmsg->gum_data.gum_obd, imp->imp_obd->obd_name, + sizeof(gmsg->gum_data.gum_obd)); + + /* This only could happen when sysadmin set it dead/expired + * using lctl by force. */ + if (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK) { + CWARN("ctx %p(%u->%s) was set flags %lx unexpectedly\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + ctx->cc_flags); + + LASSERT(!(ctx->cc_flags & PTLRPC_CTX_UPTODATE)); + ctx->cc_flags |= PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR; + + rc = -EIO; + goto err_free; + } + + upcall_msg_enlist(gmsg); + + rc = rpc_queue_upcall(de_pipes[gmsg->gum_mechidx]->d_inode, + &gmsg->gum_base); + if (rc) { + CERROR("rpc_queue_upcall failed: %d\n", rc); + + upcall_msg_delist(gmsg); + goto err_free; + } + + RETURN(0); +err_free: + OBD_FREE_PTR(gmsg); + RETURN(rc); +} + +static +int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx) +{ + /* if we are refreshing for root, also update the reverse + * handle index, do not confuse reverse contexts. */ + if (ctx->cc_vcred.vc_uid == 0) { + struct gss_sec *gsec; + + gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base); + gsec->gs_rvs_hdl = gss_get_next_ctx_index(); + } + + return gss_ctx_refresh_pf(ctx); +} + +/**************************************** + * lustre gss pipefs policy * + ****************************************/ + +static struct ptlrpc_ctx_ops gss_pipefs_ctxops = { + .match = gss_cli_ctx_match, + .refresh = gss_cli_ctx_refresh_pf, + .validate = gss_cli_ctx_validate_pf, + .die = gss_cli_ctx_die_pf, + .sign = gss_cli_ctx_sign, + .verify = gss_cli_ctx_verify, + .seal = gss_cli_ctx_seal, + .unseal = gss_cli_ctx_unseal, + .wrap_bulk = gss_cli_ctx_wrap_bulk, + .unwrap_bulk = gss_cli_ctx_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops gss_sec_pipefs_cops = { + .create_sec = gss_sec_create_pf, + .destroy_sec = gss_sec_destroy_pf, + .kill_sec = gss_sec_kill, + .lookup_ctx = gss_sec_lookup_ctx_pf, + .release_ctx = gss_sec_release_ctx_pf, + .flush_ctx_cache = gss_sec_flush_ctx_cache_pf, + .install_rctx = gss_sec_install_rctx, + .alloc_reqbuf = gss_alloc_reqbuf, + .free_reqbuf = gss_free_reqbuf, + .alloc_repbuf = gss_alloc_repbuf, + .free_repbuf = gss_free_repbuf, + .enlarge_reqbuf = gss_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops gss_sec_pipefs_sops = { + .accept = gss_svc_accept_pf, + .invalidate_ctx = gss_svc_invalidate_ctx, + .alloc_rs = gss_svc_alloc_rs, + .authorize = gss_svc_authorize, + .free_rs = gss_svc_free_rs, + .free_ctx = gss_svc_free_ctx, + .unwrap_bulk = gss_svc_unwrap_bulk, + .wrap_bulk = gss_svc_wrap_bulk, + .install_rctx = gss_svc_install_rctx_pf, +}; + +static struct ptlrpc_sec_policy gss_policy_pipefs = { + .sp_owner = THIS_MODULE, + .sp_name = "gss.pipefs", + .sp_policy = SPTLRPC_POLICY_GSS_PIPEFS, + .sp_cops = &gss_sec_pipefs_cops, + .sp_sops = &gss_sec_pipefs_sops, +}; + +static +int __init gss_init_pipefs_upcall(void) +{ + struct dentry *de; + + /* pipe dir */ + de = rpc_mkdir(LUSTRE_PIPE_ROOT, NULL); + if (IS_ERR(de) && PTR_ERR(de) != -EEXIST) { + CERROR("Failed to create gss pipe dir: %ld\n", PTR_ERR(de)); + return PTR_ERR(de); + } + + /* FIXME hack pipefs: dput will sometimes cause oops during module + * unload and lgssd close the pipe fds. 
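Illustrative sketch (not part of the patch): gss_pipe_downcall() earlier in this file parses the reply that the user-space daemon writes down the pipe as a flat byte stream: sequence number, a (currently ignored) timeout, gc_win, then either an rpc/gss error pair (when gc_win == 0) or two length-prefixed rawobjs holding the context handle and the exported mech context. A user-space-side packing sketch for the success case, with hypothetical function and buffer names:

#include <stdint.h>
#include <string.h>
#include <stddef.h>

static size_t pack_downcall_ok(char *buf, uint32_t seq, uint32_t gc_win,
			       const void *handle, uint32_t handle_len,
			       const void *mech, uint32_t mech_len)
{
	char *p = buf;
	uint32_t timeout = 0;			/* read but unused by the kernel */

	memcpy(p, &seq, sizeof(seq));		p += sizeof(seq);
	memcpy(p, &timeout, sizeof(timeout));	p += sizeof(timeout);
	memcpy(p, &gc_win, sizeof(gc_win));	p += sizeof(gc_win);
	/* rawobjs in "local" form: native-endian length, then the bytes */
	memcpy(p, &handle_len, sizeof(handle_len));	p += sizeof(handle_len);
	memcpy(p, handle, handle_len);			p += handle_len;
	memcpy(p, &mech_len, sizeof(mech_len));		p += sizeof(mech_len);
	memcpy(p, mech, mech_len);			p += mech_len;

	return (size_t)(p - buf);
}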
*/ + + /* krb5 mechanism */ + de = rpc_mkpipe(LUSTRE_PIPE_KRB5, (void *) MECH_KRB5, &gss_upcall_ops, + RPC_PIPE_WAIT_FOR_OPEN); + if (!de || IS_ERR(de)) { + CERROR("failed to make rpc_pipe %s: %ld\n", + LUSTRE_PIPE_KRB5, PTR_ERR(de)); + rpc_rmdir(LUSTRE_PIPE_ROOT); + return PTR_ERR(de); + } + + de_pipes[MECH_KRB5] = de; + INIT_LIST_HEAD(&upcall_lists[MECH_KRB5]); + spin_lock_init(&upcall_locks[MECH_KRB5]); + + return 0; +} + +static +void __exit gss_exit_pipefs_upcall(void) +{ + __u32 i; + + for (i = 0; i < MECH_MAX; i++) { + LASSERT(list_empty(&upcall_lists[i])); + + /* dput pipe dentry here might cause lgssd oops. */ + de_pipes[i] = NULL; + } + + rpc_unlink(LUSTRE_PIPE_KRB5); + rpc_rmdir(LUSTRE_PIPE_ROOT); +} + +int __init gss_init_pipefs(void) +{ + int rc; + + rc = gss_init_pipefs_upcall(); + if (rc) + return rc; + + rc = sptlrpc_register_policy(&gss_policy_pipefs); + if (rc) { + gss_exit_pipefs_upcall(); + return rc; + } + + return 0; +} + +void __exit gss_exit_pipefs(void) +{ + gss_exit_pipefs_upcall(); + sptlrpc_unregister_policy(&gss_policy_pipefs); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c new file mode 100644 index 0000000000000..a6237909b7c5d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c @@ -0,0 +1,240 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. 
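Editorial recap (not part of the patch) of the pipefs upcall life cycle implemented in gss_pipefs.c, which ends above:

/*
 * 1. gss_sec_lookup_ctx_pf() creates and caches a client context; the
 *    allocator of the context gives it the first refresh push.
 * 2. gss_cli_ctx_refresh_pf() -> gss_ctx_refresh_pf() builds a
 *    gss_upcall_msg (sequence, uid, service, peer NID, obd name),
 *    enlists it and queues it on the krb5 pipe via rpc_queue_upcall().
 * 3. lgssd reads the request through gss_pipe_upcall(), negotiates the
 *    context, and writes the result back; gss_pipe_downcall() either
 *    marks the context up to date or expires it.
 * 4. Timeouts and pipe teardown surface through gss_pipe_destroy_msg()
 *    and gss_pipe_release(), which fail any contexts still waiting.
 */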
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/gss/gss_rawobj.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include + +#include +#include +#include +#include + +#include "gss_internal.h" + +int rawobj_empty(rawobj_t *obj) +{ + LASSERT(equi(obj->len, obj->data)); + return (obj->len == 0); +} + +int rawobj_alloc(rawobj_t *obj, char *buf, int len) +{ + LASSERT(obj); + LASSERT(len >= 0); + + obj->len = len; + if (len) { + OBD_ALLOC_LARGE(obj->data, len); + if (!obj->data) { + obj->len = 0; + RETURN(-ENOMEM); + } + memcpy(obj->data, buf, len); + } else + obj->data = NULL; + return 0; +} + +void rawobj_free(rawobj_t *obj) +{ + LASSERT(obj); + + if (obj->len) { + LASSERT(obj->data); + OBD_FREE_LARGE(obj->data, obj->len); + obj->len = 0; + obj->data = NULL; + } else + LASSERT(!obj->data); +} + +int rawobj_equal(rawobj_t *a, rawobj_t *b) +{ + LASSERT(a && b); + + return (a->len == b->len && + (!a->len || !memcmp(a->data, b->data, a->len))); +} + +int rawobj_dup(rawobj_t *dest, rawobj_t *src) +{ + LASSERT(src && dest); + + dest->len = src->len; + if (dest->len) { + OBD_ALLOC_LARGE(dest->data, dest->len); + if (!dest->data) { + dest->len = 0; + return -ENOMEM; + } + memcpy(dest->data, src->data, dest->len); + } else + dest->data = NULL; + return 0; +} + +int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + __u32 len; + + LASSERT(obj); + LASSERT(buf); + LASSERT(buflen); + + len = round_up(obj->len, 4); + + if (*buflen < 4 + len) { + CERROR("shorter buflen than needed: %u < %u\n", + *buflen, 4 + len); + return -EINVAL; + } + + *(*buf)++ = cpu_to_le32(obj->len); + memcpy(*buf, obj->data, obj->len); + *buf += (len >> 2); + *buflen -= (4 + len); + + return 0; +} + +static int __rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen, + int alloc, int local) +{ + __u32 len; + + if (*buflen < sizeof(__u32)) { + CERROR("too short buflen: %u\n", *buflen); + return -EINVAL; + } + + obj->len = *(*buf)++; + if (!local) + obj->len = le32_to_cpu(obj->len); + *buflen -= sizeof(__u32); + + if (!obj->len) { + obj->data = NULL; + return 0; + } + + len = local ? 
obj->len : round_up(obj->len, 4); + if (*buflen < len) { + CERROR("shorter buflen than object size: %u < %u\n", + *buflen, len); + obj->len = 0; + return -EINVAL; + } + + if (!alloc) + obj->data = (__u8 *) *buf; + else { + OBD_ALLOC_LARGE(obj->data, obj->len); + if (!obj->data) { + CERROR("fail to alloc %u bytes\n", obj->len); + obj->len = 0; + return -ENOMEM; + } + memcpy(obj->data, *buf, obj->len); + } + + *((char **)buf) += len; + *buflen -= len; + + return 0; +} + +int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 0, 0); +} + +int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 1, 0); +} + +int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 0, 1); +} + +int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 1, 1); +} + +int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj) +{ + rawobj->len = netobj->len; + rawobj->data = netobj->data; + return 0; +} + +int rawobj_from_netobj_alloc(rawobj_t *rawobj, netobj_t *netobj) +{ + rawobj->len = 0; + rawobj->data = NULL; + + if (netobj->len == 0) + return 0; + + OBD_ALLOC_LARGE(rawobj->data, netobj->len); + if (rawobj->data == NULL) + return -ENOMEM; + + rawobj->len = netobj->len; + memcpy(rawobj->data, netobj->data, netobj->len); + return 0; +} + +/**************************************** + * misc more * + ****************************************/ + +int buffer_extract_bytes(const void **buf, __u32 *buflen, + void *res, __u32 reslen) +{ + if (*buflen < reslen) { + CERROR("shorter buflen than expected: %u < %u\n", + *buflen, reslen); + return -EINVAL; + } + + memcpy(res, *buf, reslen); + *buf += reslen; + *buflen -= reslen; + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c new file mode 100644 index 0000000000000..1059df722fe37 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c @@ -0,0 +1,960 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, 2015, Trustees of Indiana University + * + * Copyright (c) 2014, 2016, Intel Corporation. + * + * Author: Jeremy Filizetti + * Author: Andrew Korty + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "gss_err.h" +#include "gss_crypto.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_asn1.h" + +#define SK_INTERFACE_VERSION 1 +#define SK_MSG_VERSION 1 +#define SK_MIN_SIZE 8 +#define SK_IV_SIZE 16 + +/* Starting number for reverse contexts. 
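Illustrative note (not part of the patch) on the rawobj helpers above: rawobj_serialize() emits a little-endian 32-bit length followed by the payload and advances to the next 4-byte boundary, and __rawobj_extract() consumes the same layout; the "_local" variants skip both the endian conversion and the padding. A helper showing the serialized footprint, with a hypothetical name:

static inline __u32 rawobj_wire_size(__u32 len)
{
	/* length word plus payload rounded up to a 4-byte boundary */
	return 4 + round_up(len, 4);
}

/* e.g. a 5-byte object consumes 4 + 8 = 12 bytes in the request buffer */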
It is critical to security + * that reverse contexts use a different range of numbers than regular + * contexts because they are using the same key. Therefore the IV/nonce + * combination must be unique for them. To accomplish this reverse contexts + * use the the negative range of a 64-bit number and regular contexts use the + * postive range. If the same IV/nonce combination were reused it would leak + * information about the plaintext. */ +#define SK_IV_REV_START (1ULL << 63) + +struct sk_ctx { + enum cfs_crypto_crypt_alg sc_crypt; + enum cfs_crypto_hash_alg sc_hmac; + __u32 sc_expire; + __u32 sc_host_random; + __u32 sc_peer_random; + atomic64_t sc_iv; + rawobj_t sc_hmac_key; + struct gss_keyblock sc_session_kb; +}; + +struct sk_hdr { + __u64 skh_version; + __u64 skh_iv; +} __attribute__((packed)); + +/* The format of SK wire data is similar to that of RFC3686 ESP Payload + * (section 3) except instead of just an IV there is a struct sk_hdr. + * --------------------------------------------------------------------- + * | struct sk_hdr | ciphertext (variable size) | HMAC (variable size) | + * --------------------------------------------------------------------- */ +struct sk_wire { + rawobj_t skw_header; + rawobj_t skw_cipher; + rawobj_t skw_hmac; +}; + +static inline unsigned long sk_block_mask(unsigned long len, int blocksize) +{ + return (len + blocksize - 1) & (~(blocksize - 1)); +} + +static int sk_fill_header(struct sk_ctx *skc, struct sk_hdr *skh) +{ + __u64 tmp_iv; + skh->skh_version = be64_to_cpu(SK_MSG_VERSION); + + /* Always using inc_return so we don't use our initial numbers which + * could be the reuse detecting numbers */ + tmp_iv = atomic64_inc_return(&skc->sc_iv); + skh->skh_iv = be64_to_cpu(tmp_iv); + if (tmp_iv == 0 || tmp_iv == SK_IV_REV_START) { + CERROR("Counter looped, connection must be reset to avoid " + "plaintext information\n"); + return GSS_S_FAILURE; + } + + return GSS_S_COMPLETE; +} + +static int sk_verify_header(struct sk_hdr *skh) +{ + if (cpu_to_be64(skh->skh_version) != SK_MSG_VERSION) + return GSS_S_DEFECTIVE_TOKEN; + + return GSS_S_COMPLETE; +} + +void sk_construct_rfc3686_iv(__u8 *iv, __u32 nonce, __u64 partial_iv) +{ + __u32 ctr = cpu_to_be32(1); + + memcpy(iv, &nonce, CTR_RFC3686_NONCE_SIZE); + iv += CTR_RFC3686_NONCE_SIZE; + memcpy(iv, &partial_iv, CTR_RFC3686_IV_SIZE); + iv += CTR_RFC3686_IV_SIZE; + memcpy(iv, &ctr, sizeof(ctr)); +} + +static int sk_fill_context(rawobj_t *inbuf, struct sk_ctx *skc) +{ + char *ptr = inbuf->data; + char *end = inbuf->data + inbuf->len; + char sk_hmac[CRYPTO_MAX_ALG_NAME]; + char sk_crypt[CRYPTO_MAX_ALG_NAME]; + u32 tmp; + + /* see sk_serialize_kctx() for format from userspace side */ + /* 1. Version */ + if (gss_get_bytes(&ptr, end, &tmp, sizeof(tmp))) { + CERROR("Failed to read shared key interface version\n"); + return -1; + } + if (tmp != SK_INTERFACE_VERSION) { + CERROR("Invalid shared key interface version: %d\n", tmp); + return -1; + } + + /* 2. HMAC type */ + if (gss_get_bytes(&ptr, end, &sk_hmac, sizeof(sk_hmac))) { + CERROR("Failed to read HMAC algorithm type\n"); + return -1; + } + + skc->sc_hmac = cfs_crypto_hash_alg(sk_hmac); + if (skc->sc_hmac != CFS_HASH_ALG_NULL && + skc->sc_hmac != CFS_HASH_ALG_SHA256 && + skc->sc_hmac != CFS_HASH_ALG_SHA512) { + CERROR("Invalid hmac type: %s\n", sk_hmac); + return -1; + } + + /* 3. 
crypt type */ + if (gss_get_bytes(&ptr, end, &sk_crypt, sizeof(sk_crypt))) { + CERROR("Failed to read crypt algorithm type\n"); + return -1; + } + + skc->sc_crypt = cfs_crypto_crypt_alg(sk_crypt); + if (skc->sc_crypt == CFS_CRYPT_ALG_UNKNOWN) { + CERROR("Invalid crypt type: %s\n", sk_crypt); + return -1; + } + + /* 4. expiration time */ + if (gss_get_bytes(&ptr, end, &tmp, sizeof(tmp))) { + CERROR("Failed to read context expiration time\n"); + return -1; + } + skc->sc_expire = tmp + ktime_get_real_seconds(); + + /* 5. host random is used as nonce for encryption */ + if (gss_get_bytes(&ptr, end, &skc->sc_host_random, + sizeof(skc->sc_host_random))) { + CERROR("Failed to read host random\n"); + return -1; + } + + /* 6. peer random is used as nonce for decryption */ + if (gss_get_bytes(&ptr, end, &skc->sc_peer_random, + sizeof(skc->sc_peer_random))) { + CERROR("Failed to read peer random\n"); + return -1; + } + + /* 7. HMAC key */ + if (gss_get_rawobj(&ptr, end, &skc->sc_hmac_key)) { + CERROR("Failed to read HMAC key\n"); + return -1; + } + if (skc->sc_hmac_key.len <= SK_MIN_SIZE) { + CERROR("HMAC key must key must be larger than %d bytes\n", + SK_MIN_SIZE); + return -1; + } + + /* 8. Session key, can be empty if not using privacy mode */ + if (gss_get_rawobj(&ptr, end, &skc->sc_session_kb.kb_key)) { + CERROR("Failed to read session key\n"); + return -1; + } + + return 0; +} + +static void sk_delete_context(struct sk_ctx *skc) +{ + if (!skc) + return; + + rawobj_free(&skc->sc_hmac_key); + gss_keyblock_free(&skc->sc_session_kb); + OBD_FREE_PTR(skc); +} + +static +__u32 gss_import_sec_context_sk(rawobj_t *inbuf, struct gss_ctx *gss_context) +{ + struct sk_ctx *skc; + bool privacy = false; + + if (inbuf == NULL || inbuf->data == NULL) + return GSS_S_FAILURE; + + OBD_ALLOC_PTR(skc); + if (!skc) + return GSS_S_FAILURE; + + atomic64_set(&skc->sc_iv, 0); + + if (sk_fill_context(inbuf, skc)) + goto out_err; + + /* Only privacy mode needs to initialize keys */ + if (skc->sc_session_kb.kb_key.len > 0) { + privacy = true; + if (gss_keyblock_init(&skc->sc_session_kb, + cfs_crypto_crypt_name(skc->sc_crypt), 0)) + goto out_err; + } + + gss_context->internal_ctx_id = skc; + CDEBUG(D_SEC, "successfully imported sk%s context\n", + privacy ? 
" (with privacy)" : ""); + + return GSS_S_COMPLETE; + +out_err: + sk_delete_context(skc); + return GSS_S_FAILURE; +} + +static +__u32 gss_copy_reverse_context_sk(struct gss_ctx *gss_context_old, + struct gss_ctx *gss_context_new) +{ + struct sk_ctx *skc_old = gss_context_old->internal_ctx_id; + struct sk_ctx *skc_new; + + OBD_ALLOC_PTR(skc_new); + if (!skc_new) + return GSS_S_FAILURE; + + skc_new->sc_hmac = skc_old->sc_hmac; + skc_new->sc_crypt = skc_old->sc_crypt; + skc_new->sc_expire = skc_old->sc_expire; + skc_new->sc_host_random = skc_old->sc_host_random; + skc_new->sc_peer_random = skc_old->sc_peer_random; + + atomic64_set(&skc_new->sc_iv, SK_IV_REV_START); + + if (rawobj_dup(&skc_new->sc_hmac_key, &skc_old->sc_hmac_key)) + goto out_err; + if (gss_keyblock_dup(&skc_new->sc_session_kb, &skc_old->sc_session_kb)) + goto out_err; + + /* Only privacy mode needs to initialize keys */ + if (skc_new->sc_session_kb.kb_key.len > 0) + if (gss_keyblock_init(&skc_new->sc_session_kb, + cfs_crypto_crypt_name(skc_new->sc_crypt), + 0)) + goto out_err; + + gss_context_new->internal_ctx_id = skc_new; + CDEBUG(D_SEC, "successfully copied reverse sk context\n"); + + return GSS_S_COMPLETE; + +out_err: + sk_delete_context(skc_new); + return GSS_S_FAILURE; +} + +static +__u32 gss_inquire_context_sk(struct gss_ctx *gss_context, + time64_t *endtime) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + + *endtime = skc->sc_expire; + return GSS_S_COMPLETE; +} + +static +u32 sk_make_hmac(enum cfs_crypto_hash_alg algo, rawobj_t *key, int msg_count, + rawobj_t *msgs, int iov_count, struct bio_vec *iovs, + rawobj_t *token, digest_hash hash_func) +{ + struct ahash_request *req; + int rc2, rc; + + req = cfs_crypto_hash_init(algo, key->data, key->len); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + goto out_init_failed; + } + + + if (hash_func) + rc2 = hash_func(req, NULL, msg_count, msgs, iov_count, + iovs); + else + rc2 = gss_digest_hash(req, NULL, msg_count, msgs, iov_count, + iovs); + + rc = cfs_crypto_hash_final(req, token->data, &token->len); + if (!rc && rc2) + rc = rc2; +out_init_failed: + return rc ? 
GSS_S_FAILURE : GSS_S_COMPLETE; +} + +static +__u32 gss_get_mic_sk(struct gss_ctx *gss_context, + int message_count, + rawobj_t *messages, + int iov_count, + struct bio_vec *iovs, + rawobj_t *token) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + + return sk_make_hmac(skc->sc_hmac, + &skc->sc_hmac_key, message_count, messages, + iov_count, iovs, token, gss_context->hash_func); +} + +static +u32 sk_verify_hmac(enum cfs_crypto_hash_alg algo, rawobj_t *key, + int message_count, rawobj_t *messages, + int iov_count, struct bio_vec *iovs, + rawobj_t *token, digest_hash hash_func) +{ + rawobj_t checksum = RAWOBJ_EMPTY; + __u32 rc = GSS_S_FAILURE; + + checksum.len = cfs_crypto_hash_digestsize(algo); + if (token->len < checksum.len) { + CDEBUG(D_SEC, "Token received too short, expected %d " + "received %d\n", token->len, checksum.len); + return GSS_S_DEFECTIVE_TOKEN; + } + + OBD_ALLOC_LARGE(checksum.data, checksum.len); + if (!checksum.data) + return rc; + + if (sk_make_hmac(algo, key, message_count, + messages, iov_count, iovs, &checksum, + hash_func)) { + CDEBUG(D_SEC, "Failed to create checksum to validate\n"); + goto cleanup; + } + + if (memcmp(token->data, checksum.data, checksum.len)) { + CERROR("checksum mismatch\n"); + rc = GSS_S_BAD_SIG; + goto cleanup; + } + + rc = GSS_S_COMPLETE; + +cleanup: + OBD_FREE(checksum.data, checksum.len); + return rc; +} + +/* sk_verify_bulk_hmac() differs slightly from sk_verify_hmac() because all + * encrypted pages in the bulk descriptor are populated although we only need + * to decrypt up to the number of bytes actually specified from the sender + * (bd_nob) otherwise the calulated HMAC will be incorrect. */ +static +u32 sk_verify_bulk_hmac(enum cfs_crypto_hash_alg sc_hmac, rawobj_t *key, + int msgcnt, rawobj_t *msgs, int iovcnt, + struct bio_vec *iovs, int iov_bytes, rawobj_t *token) +{ + rawobj_t checksum = RAWOBJ_EMPTY; + struct ahash_request *req; + struct scatterlist sg[1]; + int rc = 0; + struct sg_table sgt; + int bytes; + int i; + + checksum.len = cfs_crypto_hash_digestsize(sc_hmac); + if (token->len < checksum.len) { + CDEBUG(D_SEC, "Token received too short, expected %d " + "received %d\n", token->len, checksum.len); + return GSS_S_DEFECTIVE_TOKEN; + } + + OBD_ALLOC_LARGE(checksum.data, checksum.len); + if (!checksum.data) + return GSS_S_FAILURE; + + req = cfs_crypto_hash_init(sc_hmac, key->data, key->len); + if (IS_ERR(req)) { + rc = GSS_S_FAILURE; + goto cleanup; + } + + for (i = 0; i < msgcnt; i++) { + if (!msgs[i].len) + continue; + + rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); + if (rc != 0) + goto hash_cleanup; + + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); + if (rc) { + gss_teardown_sgtable(&sgt); + goto hash_cleanup; + } + + gss_teardown_sgtable(&sgt); + } + + for (i = 0; i < iovcnt && iov_bytes > 0; i++) { + if (iovs[i].bv_len == 0) + continue; + + bytes = min_t(int, iov_bytes, iovs[i].bv_len); + iov_bytes -= bytes; + + sg_init_table(sg, 1); + sg_set_page(&sg[0], iovs[i].bv_page, bytes, + iovs[i].bv_offset); + ahash_request_set_crypt(req, sg, NULL, bytes); + rc = crypto_ahash_update(req); + if (rc) + goto hash_cleanup; + } + +hash_cleanup: + cfs_crypto_hash_final(req, checksum.data, &checksum.len); + if (rc) + goto cleanup; + + if (memcmp(token->data, checksum.data, checksum.len)) + rc = GSS_S_BAD_SIG; + else + rc = GSS_S_COMPLETE; + +cleanup: + OBD_FREE_LARGE(checksum.data, checksum.len); + + return rc; +} + +static +__u32 gss_verify_mic_sk(struct gss_ctx 
*gss_context, + int message_count, + rawobj_t *messages, + int iov_count, + struct bio_vec *iovs, + rawobj_t *token) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + + return sk_verify_hmac(skc->sc_hmac, &skc->sc_hmac_key, + message_count, messages, iov_count, iovs, token, + gss_context->hash_func); +} + +static +__u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *message, int message_buffer_length, + rawobj_t *token) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_wire skw; + struct sk_hdr skh; + rawobj_t msgbufs[3]; + __u8 local_iv[SK_IV_SIZE]; + unsigned int blocksize; + + LASSERT(skc->sc_session_kb.kb_tfm); + + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); + if (gss_add_padding(message, message_buffer_length, blocksize)) + return GSS_S_FAILURE; + + memset(token->data, 0, token->len); + + if (sk_fill_header(skc, &skh) != GSS_S_COMPLETE) + return GSS_S_FAILURE; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(skh); + memcpy(skw.skw_header.data, &skh, sizeof(skh)); + + sk_construct_rfc3686_iv(local_iv, skc->sc_host_random, skh.skh_iv); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + if (gss_crypt_rawobjs(skc->sc_session_kb.kb_tfm, local_iv, 1, message, + &skw.skw_cipher, 1)) + return GSS_S_FAILURE; + + /* HMAC covers the SK header, GSS header, and ciphertext */ + msgbufs[0] = skw.skw_header; + msgbufs[1] = *gss_header; + msgbufs[2] = skw.skw_cipher; + + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht_bytes; + if (sk_make_hmac(skc->sc_hmac, &skc->sc_hmac_key, + 3, msgbufs, 0, NULL, &skw.skw_hmac, + gss_context->hash_func)) + return GSS_S_FAILURE; + + token->len = skw.skw_header.len + skw.skw_cipher.len + skw.skw_hmac.len; + + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *token, rawobj_t *message) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_wire skw; + struct sk_hdr *skh; + rawobj_t msgbufs[3]; + __u8 local_iv[SK_IV_SIZE]; + unsigned int blocksize; + int rc; + + LASSERT(skc->sc_session_kb.kb_tfm); + + if (token->len < sizeof(skh) + sht_bytes) + return GSS_S_DEFECTIVE_TOKEN; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(struct sk_hdr); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht_bytes; + + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); + if (skw.skw_cipher.len % blocksize != 0) + return GSS_S_DEFECTIVE_TOKEN; + + skh = (struct sk_hdr *)skw.skw_header.data; + rc = sk_verify_header(skh); + if (rc != GSS_S_COMPLETE) + return rc; + + /* HMAC covers the SK header, GSS header, and ciphertext */ + msgbufs[0] = skw.skw_header; + msgbufs[1] = *gss_header; + msgbufs[2] = skw.skw_cipher; + rc = sk_verify_hmac(skc->sc_hmac, &skc->sc_hmac_key, 3, msgbufs, + 0, NULL, &skw.skw_hmac, gss_context->hash_func); + if (rc) + return rc; + + sk_construct_rfc3686_iv(local_iv, skc->sc_peer_random, skh->skh_iv); + message->len = skw.skw_cipher.len; + if (gss_crypt_rawobjs(skc->sc_session_kb.kb_tfm, local_iv, + 1, &skw.skw_cipher, message, 0)) + 
return GSS_S_FAILURE; + + return GSS_S_COMPLETE; +} + +static +__u32 gss_prep_bulk_sk(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + int blocksize; + int i; + + LASSERT(skc->sc_session_kb.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); + + for (i = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_vec[i].bv_offset & blocksize) { + CERROR("offset %d not blocksize aligned\n", + desc->bd_vec[i].bv_offset); + return GSS_S_FAILURE; + } + + desc->bd_enc_vec[i].bv_offset = + desc->bd_vec[i].bv_offset; + desc->bd_enc_vec[i].bv_len = + sk_block_mask(desc->bd_vec[i].bv_len, blocksize); + } + + return GSS_S_COMPLETE; +} + +static __u32 sk_encrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, + struct ptlrpc_bulk_desc *desc, rawobj_t *cipher, + int adj_nob) +{ + struct scatterlist ptxt; + struct scatterlist ctxt; + int blocksize; + int i; + int rc; + int nob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + blocksize = crypto_sync_skcipher_blocksize(tfm); + + sg_init_table(&ptxt, 1); + sg_init_table(&ctxt, 1); + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + + for (i = 0; i < desc->bd_iov_count; i++) { + sg_set_page(&ptxt, desc->bd_vec[i].bv_page, + sk_block_mask(desc->bd_vec[i].bv_len, + blocksize), + desc->bd_vec[i].bv_offset); + nob += ptxt.length; + + sg_set_page(&ctxt, desc->bd_enc_vec[i].bv_page, + ptxt.length, ptxt.offset); + + desc->bd_enc_vec[i].bv_offset = ctxt.offset; + desc->bd_enc_vec[i].bv_len = ctxt.length; + + skcipher_request_set_crypt(req, &ptxt, &ctxt, ptxt.length, iv); + rc = crypto_skcipher_encrypt_iv(req, &ctxt, &ptxt, ptxt.length); + if (rc) { + CERROR("failed to encrypt page: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + } + skcipher_request_zero(req); + + if (adj_nob) + desc->bd_nob = nob; + + return 0; +} + +static __u32 sk_decrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, + struct ptlrpc_bulk_desc *desc, rawobj_t *cipher, + int adj_nob) +{ + struct scatterlist ptxt; + struct scatterlist ctxt; + int blocksize; + int i; + int rc; + int pnob = 0; + int cnob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + sg_init_table(&ptxt, 1); + sg_init_table(&ctxt, 1); + + blocksize = crypto_sync_skcipher_blocksize(tfm); + if (desc->bd_nob_transferred % blocksize != 0) { + CERROR("Transfer not a multiple of block size: %d\n", + desc->bd_nob_transferred); + return GSS_S_DEFECTIVE_TOKEN; + } + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + + for (i = 0; i < desc->bd_iov_count && cnob < desc->bd_nob_transferred; + i++) { + struct bio_vec *piov = &desc->bd_vec[i]; + struct bio_vec *ciov = &desc->bd_enc_vec[i]; + + if (ciov->bv_offset % blocksize != 0 || + ciov->bv_len % blocksize != 0) { + CERROR("Invalid bulk descriptor vector\n"); + skcipher_request_zero(req); + return GSS_S_DEFECTIVE_TOKEN; + } + + /* Must adjust bytes here because we know the actual sizes after + * decryption. 
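Illustrative sketch (not part of the patch): gss_wrap_sk()/gss_unwrap_sk() above lay the token out as | struct sk_hdr | ciphertext | HMAC |, where the plaintext is first padded to the cipher block size and the HMAC length is the digest size of sc_hmac. A sketch of the minimum token buffer that layout implies; the helper name is hypothetical:

static inline unsigned int sk_min_token_size(unsigned int msg_len,
					     unsigned int blocksize,
					     unsigned int digest_len)
{
	/* header + message rounded up to the block size + HMAC */
	return sizeof(struct sk_hdr) + sk_block_mask(msg_len, blocksize)
	       + digest_len;
}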
Similar to what gss_cli_ctx_unwrap_bulk does for + * integrity only mode */ + if (adj_nob) { + /* cipher text must not exceed transferred size */ + if (ciov->bv_len + cnob > desc->bd_nob_transferred) + ciov->bv_len = + desc->bd_nob_transferred - cnob; + + piov->bv_len = ciov->bv_len; + + /* plain text must not exceed bulk's size */ + if (ciov->bv_len + pnob > desc->bd_nob) + piov->bv_len = desc->bd_nob - pnob; + } else { + /* Taken from krb5_decrypt since it was not verified + * whether or not LNET guarantees these */ + if (ciov->bv_len + cnob > desc->bd_nob_transferred || + piov->bv_len > ciov->bv_len) { + CERROR("Invalid decrypted length\n"); + skcipher_request_zero(req); + return GSS_S_FAILURE; + } + } + + if (ciov->bv_len == 0) + continue; + + sg_init_table(&ctxt, 1); + sg_set_page(&ctxt, ciov->bv_page, ciov->bv_len, + ciov->bv_offset); + ptxt = ctxt; + + /* In the event the plain text size is not a multiple + * of blocksize we decrypt in place and copy the result + * after the decryption */ + if (piov->bv_len % blocksize == 0) + sg_assign_page(&ptxt, piov->bv_page); + + skcipher_request_set_crypt(req, &ctxt, &ptxt, ptxt.length, iv); + rc = crypto_skcipher_decrypt_iv(req, &ptxt, &ctxt, ptxt.length); + if (rc) { + CERROR("Decryption failed for page: %d\n", rc); + skcipher_request_zero(req); + return GSS_S_FAILURE; + } + + if (piov->bv_len % blocksize != 0) { + memcpy(page_address(piov->bv_page) + + piov->bv_offset, + page_address(ciov->bv_page) + + ciov->bv_offset, + piov->bv_len); + } + + cnob += ciov->bv_len; + pnob += piov->bv_len; + } + skcipher_request_zero(req); + + /* if needed, clear up the rest unused iovs */ + if (adj_nob) + while (i < desc->bd_iov_count) + desc->bd_vec[i++].bv_len = 0; + + if (unlikely(cnob != desc->bd_nob_transferred)) { + CERROR("%d cipher text transferred but only %d decrypted\n", + desc->bd_nob_transferred, cnob); + return GSS_S_FAILURE; + } + + if (unlikely(!adj_nob && pnob != desc->bd_nob)) { + CERROR("%d plain text expected but only %d received\n", + desc->bd_nob, pnob); + return GSS_S_FAILURE; + } + + return 0; +} + +static +__u32 gss_wrap_bulk_sk(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, rawobj_t *token, + int adj_nob) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_wire skw; + struct sk_hdr skh; + __u8 local_iv[SK_IV_SIZE]; + + LASSERT(skc->sc_session_kb.kb_tfm); + + memset(token->data, 0, token->len); + if (sk_fill_header(skc, &skh) != GSS_S_COMPLETE) + return GSS_S_FAILURE; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(skh); + memcpy(skw.skw_header.data, &skh, sizeof(skh)); + + sk_construct_rfc3686_iv(local_iv, skc->sc_host_random, skh.skh_iv); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + if (sk_encrypt_bulk(skc->sc_session_kb.kb_tfm, local_iv, + desc, &skw.skw_cipher, adj_nob)) + return GSS_S_FAILURE; + + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht_bytes; + if (sk_make_hmac(skc->sc_hmac, &skc->sc_hmac_key, 1, &skw.skw_cipher, + desc->bd_iov_count, desc->bd_enc_vec, &skw.skw_hmac, + gss_context->hash_func)) + return GSS_S_FAILURE; + + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_bulk_sk(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + size_t sht_bytes = 
cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_wire skw; + struct sk_hdr *skh; + __u8 local_iv[SK_IV_SIZE]; + int rc; + + LASSERT(skc->sc_session_kb.kb_tfm); + + if (token->len < sizeof(skh) + sht_bytes) + return GSS_S_DEFECTIVE_TOKEN; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(struct sk_hdr); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht_bytes; + + skh = (struct sk_hdr *)skw.skw_header.data; + rc = sk_verify_header(skh); + if (rc != GSS_S_COMPLETE) + return rc; + + rc = sk_verify_bulk_hmac(skc->sc_hmac, &skc->sc_hmac_key, 1, + &skw.skw_cipher, desc->bd_iov_count, + desc->bd_enc_vec, desc->bd_nob, + &skw.skw_hmac); + if (rc) + return rc; + + sk_construct_rfc3686_iv(local_iv, skc->sc_peer_random, skh->skh_iv); + rc = sk_decrypt_bulk(skc->sc_session_kb.kb_tfm, local_iv, + desc, &skw.skw_cipher, adj_nob); + if (rc) + return rc; + + return GSS_S_COMPLETE; +} + +static +void gss_delete_sec_context_sk(void *internal_context) +{ + struct sk_ctx *sk_context = internal_context; + sk_delete_context(sk_context); +} + +int gss_display_sk(struct gss_ctx *gss_context, char *buf, int bufsize) +{ + return scnprintf(buf, bufsize, "sk"); +} + +static struct gss_api_ops gss_sk_ops = { + .gss_import_sec_context = gss_import_sec_context_sk, + .gss_copy_reverse_context = gss_copy_reverse_context_sk, + .gss_inquire_context = gss_inquire_context_sk, + .gss_get_mic = gss_get_mic_sk, + .gss_verify_mic = gss_verify_mic_sk, + .gss_wrap = gss_wrap_sk, + .gss_unwrap = gss_unwrap_sk, + .gss_prep_bulk = gss_prep_bulk_sk, + .gss_wrap_bulk = gss_wrap_bulk_sk, + .gss_unwrap_bulk = gss_unwrap_bulk_sk, + .gss_delete_sec_context = gss_delete_sec_context_sk, + .gss_display = gss_display_sk, +}; + +static struct subflavor_desc gss_sk_sfs[] = { + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKN, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_NULL, + .sf_name = "skn" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKA, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_AUTH, + .sf_name = "ska" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKI, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_INTG, + .sf_name = "ski" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKPI, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_PRIV, + .sf_name = "skpi" + }, +}; + +static struct gss_api_mech gss_sk_mech = { + /* .gm_owner uses default NULL value for THIS_MODULE */ + .gm_name = "sk", + .gm_oid = (rawobj_t) { + .len = 12, + .data = "\053\006\001\004\001\311\146\215\126\001\000\001", + }, + .gm_ops = &gss_sk_ops, + .gm_sf_num = 4, + .gm_sfs = gss_sk_sfs, +}; + +int __init init_sk_module(void) +{ + int status; + + status = lgss_mech_register(&gss_sk_mech); + if (status) + CERROR("Failed to register sk gss mechanism!\n"); + + return status; +} + +void cleanup_sk_module(void) +{ + lgss_mech_unregister(&gss_sk_mech); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c new file mode 100644 index 0000000000000..f5c83a25a13e1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c @@ -0,0 +1,1190 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2014, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * Neil Brown + * J. 
Bruce Fields + * Andy Adamson + * Dug Song + * + * RPCSEC_GSS server authentication. + * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078 + * (gssapi) + * + * The RPCSEC_GSS involves three stages: + * 1/ context creation + * 2/ data exchange + * 3/ context destruction + * + * Context creation is handled largely by upcalls to user-space. + * In particular, GSS_Accept_sec_context is handled by an upcall + * Data exchange is handled entirely within the kernel + * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel. + * Context destruction is handled in-kernel + * GSS_Delete_sec_context is in-kernel + * + * Context creation is initiated by a RPCSEC_GSS_INIT request arriving. + * The context handle and gss_token are used as a key into the rpcsec_init cache. + * The content of this cache includes some of the outputs of GSS_Accept_sec_context, + * being major_status, minor_status, context_handle, reply_token. + * These are sent back to the client. + * Sequence window management is handled by the kernel. The window size if currently + * a compile time constant. + * + * When user-space is happy that a context is established, it places an entry + * in the rpcsec_context cache. The key for this cache is the context_handle. + * The content includes: + * uid/gidlist - for determining access rights + * mechanism type + * mechanism specific information, such as a key + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_crypto.h" + +#define GSS_SVC_UPCALL_TIMEOUT (20) + +static DEFINE_SPINLOCK(__ctx_index_lock); +static __u64 __ctx_index; + +unsigned int krb5_allow_old_client_csum; + +__u64 gss_get_next_ctx_index(void) +{ + __u64 idx; + + spin_lock(&__ctx_index_lock); + idx = __ctx_index++; + spin_unlock(&__ctx_index_lock); + + return idx; +} + +static inline unsigned long hash_mem(char *buf, int length, int bits) +{ + unsigned long hash = 0; + unsigned long l = 0; + int len = 0; + unsigned char c; + + do { + if (len == length) { + c = (char) len; + len = -1; + } else + c = *buf++; + + l = (l << 8) | c; + len++; + + if ((len & (BITS_PER_LONG/8-1)) == 0) + hash = hash_long(hash^l, BITS_PER_LONG); + } while (len); + + return hash >> (BITS_PER_LONG - bits); +} + +/**************************************** + * rpc sec init (rsi) cache * + ****************************************/ + +#define RSI_HASHBITS (6) +#define RSI_HASHMAX (1 << RSI_HASHBITS) +#define RSI_HASHMASK (RSI_HASHMAX - 1) + +struct rsi { + struct cache_head h; + __u32 lustre_svc; + __u64 nid; + char nm_name[LUSTRE_NODEMAP_NAME_LENGTH + 1]; + wait_queue_head_t waitq; + rawobj_t in_handle, in_token; + rawobj_t out_handle, out_token; + int major_status, minor_status; +#ifdef HAVE_CACHE_HASH_SPINLOCK + struct rcu_head rcu_head; +#endif +}; + +#ifdef HAVE_CACHE_HEAD_HLIST +static struct hlist_head rsi_table[RSI_HASHMAX]; +#else +static struct cache_head *rsi_table[RSI_HASHMAX]; +#endif +static struct cache_detail rsi_cache; +static struct rsi *rsi_update(struct rsi *new, struct rsi *old); +static struct rsi *rsi_lookup(struct rsi *item); + +#ifdef HAVE_CACHE_DETAIL_WRITERS +static inline int channel_users(struct cache_detail *cd) +{ + return atomic_read(&cd->writers); +} +#else +static inline int channel_users(struct cache_detail *cd) +{ + return 
atomic_read(&cd->readers); +} +#endif + +static inline int rsi_hash(struct rsi *item) +{ + return hash_mem((char *)item->in_handle.data, item->in_handle.len, + RSI_HASHBITS) ^ + hash_mem((char *)item->in_token.data, item->in_token.len, + RSI_HASHBITS); +} + +static inline int __rsi_match(struct rsi *item, struct rsi *tmp) +{ + return (rawobj_equal(&item->in_handle, &tmp->in_handle) && + rawobj_equal(&item->in_token, &tmp->in_token)); +} + +static void rsi_free(struct rsi *rsi) +{ + rawobj_free(&rsi->in_handle); + rawobj_free(&rsi->in_token); + rawobj_free(&rsi->out_handle); + rawobj_free(&rsi->out_token); +} + +/* See handle_channel_req() userspace for where the upcall data is read */ +static void rsi_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + struct rsi *rsi = container_of(h, struct rsi, h); + __u64 index = 0; + + /* if in_handle is null, provide kernel suggestion */ + if (rsi->in_handle.len == 0) + index = gss_get_next_ctx_index(); + + qword_addhex(bpp, blen, (char *) &rsi->lustre_svc, + sizeof(rsi->lustre_svc)); + qword_addhex(bpp, blen, (char *) &rsi->nid, sizeof(rsi->nid)); + qword_addhex(bpp, blen, (char *) &index, sizeof(index)); + qword_addhex(bpp, blen, (char *) rsi->nm_name, + strlen(rsi->nm_name) + 1); + qword_addhex(bpp, blen, rsi->in_handle.data, rsi->in_handle.len); + qword_addhex(bpp, blen, rsi->in_token.data, rsi->in_token.len); + (*bpp)[-1] = '\n'; +} + +static inline void __rsi_init(struct rsi *new, struct rsi *item) +{ + new->out_handle = RAWOBJ_EMPTY; + new->out_token = RAWOBJ_EMPTY; + + new->in_handle = item->in_handle; + item->in_handle = RAWOBJ_EMPTY; + new->in_token = item->in_token; + item->in_token = RAWOBJ_EMPTY; + + new->lustre_svc = item->lustre_svc; + new->nid = item->nid; + memcpy(new->nm_name, item->nm_name, sizeof(item->nm_name)); + init_waitqueue_head(&new->waitq); +} + +static inline void __rsi_update(struct rsi *new, struct rsi *item) +{ + LASSERT(new->out_handle.len == 0); + LASSERT(new->out_token.len == 0); + + new->out_handle = item->out_handle; + item->out_handle = RAWOBJ_EMPTY; + new->out_token = item->out_token; + item->out_token = RAWOBJ_EMPTY; + + new->major_status = item->major_status; + new->minor_status = item->minor_status; +} + +#ifdef HAVE_CACHE_HASH_SPINLOCK +static void rsi_free_rcu(struct rcu_head *head) +{ + struct rsi *rsi = container_of(head, struct rsi, rcu_head); + +#ifdef HAVE_CACHE_HEAD_HLIST + LASSERT(hlist_unhashed(&rsi->h.cache_list)); +#else + LASSERT(rsi->h.next == NULL); +#endif + rsi_free(rsi); + OBD_FREE_PTR(rsi); +} + +static void rsi_put(struct kref *ref) +{ + struct rsi *rsi = container_of(ref, struct rsi, h.ref); + + call_rcu(&rsi->rcu_head, rsi_free_rcu); +} +#else /* !HAVE_CACHE_HASH_SPINLOCK */ +static void rsi_put(struct kref *ref) +{ + struct rsi *rsi = container_of(ref, struct rsi, h.ref); + +#ifdef HAVE_CACHE_HEAD_HLIST + LASSERT(hlist_unhashed(&rsi->h.cache_list)); +#else + LASSERT(rsi->h.next == NULL); +#endif + rsi_free(rsi); + OBD_FREE_PTR(rsi); +} +#endif /* HAVE_CACHE_HASH_SPINLOCK */ + +static int rsi_match(struct cache_head *a, struct cache_head *b) +{ + struct rsi *item = container_of(a, struct rsi, h); + struct rsi *tmp = container_of(b, struct rsi, h); + + return __rsi_match(item, tmp); +} + +static void rsi_init(struct cache_head *cnew, struct cache_head *citem) +{ + struct rsi *new = container_of(cnew, struct rsi, h); + struct rsi *item = container_of(citem, struct rsi, h); + + __rsi_init(new, item); +} + +static void update_rsi(struct cache_head *cnew, 
struct cache_head *citem) +{ + struct rsi *new = container_of(cnew, struct rsi, h); + struct rsi *item = container_of(citem, struct rsi, h); + + __rsi_update(new, item); +} + +static struct cache_head *rsi_alloc(void) +{ + struct rsi *rsi; + + OBD_ALLOC_PTR(rsi); + if (rsi) + return &rsi->h; + else + return NULL; +} + +static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + char *buf = mesg; + int len; + struct rsi rsii, *rsip = NULL; + time64_t expiry; + int status = -EINVAL; + ENTRY; + + memset(&rsii, 0, sizeof(rsii)); + + /* handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.in_handle, buf, len)) { + status = -ENOMEM; + goto out; + } + + /* token */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.in_token, buf, len)) { + status = -ENOMEM; + goto out; + } + + rsip = rsi_lookup(&rsii); + if (!rsip) + goto out; + if (!test_bit(CACHE_PENDING, &rsip->h.flags)) { + /* If this is not a pending request, it probably means + * someone wrote arbitrary data to the init channel. + * Directly return -EINVAL in this case. + */ + status = -EINVAL; + goto out; + } + + rsii.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + if (expiry == 0) + goto out; + + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + + /* major */ + status = kstrtoint(buf, 10, &rsii.major_status); + if (status) + goto out; + + /* minor */ + len = qword_get(&mesg, buf, mlen); + if (len <= 0) { + status = -EINVAL; + goto out; + } + + status = kstrtoint(buf, 10, &rsii.minor_status); + if (status) + goto out; + + /* out_handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.out_handle, buf, len)) { + status = -ENOMEM; + goto out; + } + + /* out_token */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.out_token, buf, len)) { + status = -ENOMEM; + goto out; + } + + rsii.h.expiry_time = expiry; + rsip = rsi_update(&rsii, rsip); + status = 0; +out: + rsi_free(&rsii); + if (rsip) { + wake_up(&rsip->waitq); + cache_put(&rsip->h, &rsi_cache); + } else { + status = -ENOMEM; + } + + if (status) + CERROR("rsi parse error %d\n", status); + RETURN(status); +} + +static struct cache_detail rsi_cache = { + .hash_size = RSI_HASHMAX, + .hash_table = rsi_table, + .name = "auth.sptlrpc.init", + .cache_put = rsi_put, + .cache_request = rsi_request, + .cache_upcall = sunrpc_cache_pipe_upcall, + .cache_parse = rsi_parse, + .match = rsi_match, + .init = rsi_init, + .update = update_rsi, + .alloc = rsi_alloc, +}; + +static struct rsi *rsi_lookup(struct rsi *item) +{ + struct cache_head *ch; + int hash = rsi_hash(item); + + ch = sunrpc_cache_lookup(&rsi_cache, &item->h, hash); + if (ch) + return container_of(ch, struct rsi, h); + else + return NULL; +} + +static struct rsi *rsi_update(struct rsi *new, struct rsi *old) +{ + struct cache_head *ch; + int hash = rsi_hash(new); + + ch = sunrpc_cache_update(&rsi_cache, &new->h, &old->h, hash); + if (ch) + return container_of(ch, struct rsi, h); + else + return NULL; +} + +/**************************************** + * rpc sec context (rsc) cache * + ****************************************/ + +#define RSC_HASHBITS (10) +#define RSC_HASHMAX (1 << RSC_HASHBITS) +#define RSC_HASHMASK (RSC_HASHMAX - 1) + +struct rsc { + struct cache_head h; + struct obd_device *target; + rawobj_t handle; + struct gss_svc_ctx ctx; +#ifdef HAVE_CACHE_HASH_SPINLOCK + struct rcu_head rcu_head; +#endif +}; + +#ifdef 
HAVE_CACHE_HEAD_HLIST +static struct hlist_head rsc_table[RSC_HASHMAX]; +#else +static struct cache_head *rsc_table[RSC_HASHMAX]; +#endif +static struct cache_detail rsc_cache; +static struct rsc *rsc_update(struct rsc *new, struct rsc *old); +static struct rsc *rsc_lookup(struct rsc *item); + +static void rsc_free(struct rsc *rsci) +{ + rawobj_free(&rsci->handle); + rawobj_free(&rsci->ctx.gsc_rvs_hdl); + lgss_delete_sec_context(&rsci->ctx.gsc_mechctx); +} + +static inline int rsc_hash(struct rsc *rsci) +{ + return hash_mem((char *)rsci->handle.data, + rsci->handle.len, RSC_HASHBITS); +} + +static inline int __rsc_match(struct rsc *new, struct rsc *tmp) +{ + return rawobj_equal(&new->handle, &tmp->handle); +} + +static inline void __rsc_init(struct rsc *new, struct rsc *tmp) +{ + new->handle = tmp->handle; + tmp->handle = RAWOBJ_EMPTY; + + new->target = NULL; + memset(&new->ctx, 0, sizeof(new->ctx)); + new->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY; +} + +static inline void __rsc_update(struct rsc *new, struct rsc *tmp) +{ + new->ctx = tmp->ctx; + memset(&tmp->ctx, 0, sizeof(tmp->ctx)); + tmp->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY; + tmp->ctx.gsc_mechctx = NULL; + tmp->target = NULL; + + memset(&new->ctx.gsc_seqdata, 0, sizeof(new->ctx.gsc_seqdata)); + spin_lock_init(&new->ctx.gsc_seqdata.ssd_lock); +} + +#ifdef HAVE_CACHE_HASH_SPINLOCK +static void rsc_free_rcu(struct rcu_head *head) +{ + struct rsc *rsci = container_of(head, struct rsc, rcu_head); + +#ifdef HAVE_CACHE_HEAD_HLIST + LASSERT(hlist_unhashed(&rsci->h.cache_list)); +#else + LASSERT(rsci->h.next == NULL); +#endif + rawobj_free(&rsci->handle); + OBD_FREE_PTR(rsci); +} + +static void rsc_put(struct kref *ref) +{ + struct rsc *rsci = container_of(ref, struct rsc, h.ref); + + rawobj_free(&rsci->ctx.gsc_rvs_hdl); + lgss_delete_sec_context(&rsci->ctx.gsc_mechctx); + call_rcu(&rsci->rcu_head, rsc_free_rcu); +} +#else /* !HAVE_CACHE_HASH_SPINLOCK */ +static void rsc_put(struct kref *ref) +{ + struct rsc *rsci = container_of(ref, struct rsc, h.ref); + +#ifdef HAVE_CACHE_HEAD_HLIST + LASSERT(hlist_unhashed(&rsci->h.cache_list)); +#else + LASSERT(rsci->h.next == NULL); +#endif + rsc_free(rsci); + OBD_FREE_PTR(rsci); +} +#endif /* HAVE_CACHE_HASH_SPINLOCK */ + +static int rsc_match(struct cache_head *a, struct cache_head *b) +{ + struct rsc *new = container_of(a, struct rsc, h); + struct rsc *tmp = container_of(b, struct rsc, h); + + return __rsc_match(new, tmp); +} + +static void rsc_init(struct cache_head *cnew, struct cache_head *ctmp) +{ + struct rsc *new = container_of(cnew, struct rsc, h); + struct rsc *tmp = container_of(ctmp, struct rsc, h); + + __rsc_init(new, tmp); +} + +static void update_rsc(struct cache_head *cnew, struct cache_head *ctmp) +{ + struct rsc *new = container_of(cnew, struct rsc, h); + struct rsc *tmp = container_of(ctmp, struct rsc, h); + + __rsc_update(new, tmp); +} + +static struct cache_head * rsc_alloc(void) +{ + struct rsc *rsc; + + OBD_ALLOC_PTR(rsc); + if (rsc) + return &rsc->h; + else + return NULL; +} + +static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + char *buf = mesg; + int len, rv, tmp_int; + struct rsc rsci, *rscp = NULL; + time64_t expiry; + int status = -EINVAL; + struct gss_api_mech *gm = NULL; + + memset(&rsci, 0, sizeof(rsci)); + + /* context handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) goto out; + status = -ENOMEM; + if (rawobj_alloc(&rsci.handle, buf, len)) + goto out; + + rsci.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; + if (expiry == 
0) + goto out; + + /* remote flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get remote flag\n"); + goto out; + } + rsci.ctx.gsc_remote = (tmp_int != 0); + + /* root user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get root user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_root = (tmp_int != 0); + + /* mds user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get mds user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_mds = (tmp_int != 0); + + /* oss user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get oss user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_oss = (tmp_int != 0); + + /* mapped uid */ + rv = get_int(&mesg, (int *) &rsci.ctx.gsc_mapped_uid); + if (rv) { + CERROR("fail to get mapped uid\n"); + goto out; + } + + rscp = rsc_lookup(&rsci); + if (!rscp) + goto out; + + /* uid, or NEGATIVE */ + rv = get_int(&mesg, (int *) &rsci.ctx.gsc_uid); + if (rv == -EINVAL) + goto out; + if (rv == -ENOENT) { + CERROR("NOENT? set rsc entry negative\n"); + set_bit(CACHE_NEGATIVE, &rsci.h.flags); + } else { + rawobj_t tmp_buf; + time64_t ctx_expiry; + + /* gid */ + if (get_int(&mesg, (int *) &rsci.ctx.gsc_gid)) + goto out; + + /* mech name */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + gm = lgss_name_to_mech(buf); + status = -EOPNOTSUPP; + if (!gm) + goto out; + + status = -EINVAL; + /* mech-specific data: */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + + tmp_buf.len = len; + tmp_buf.data = (unsigned char *)buf; + if (lgss_import_sec_context(&tmp_buf, gm, + &rsci.ctx.gsc_mechctx)) + goto out; + + /* set to seconds since machine booted */ + expiry = ktime_get_seconds(); + + /* currently the expiry time passed down from user-space + * is invalid, here we retrive it from mech. + */ + if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) { + CERROR("unable to get expire time, drop it\n"); + goto out; + } + + /* ctx_expiry is the number of seconds since Jan 1 1970. + * We want just the number of seconds into the future. 
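+ * Illustrative arithmetic (arbitrary numbers, not from this patch): if the
+ * mech reports ctx_expiry = 1700003600 while ktime_get_real_seconds() is
+ * 1700000000, the context has 3600s of life left, so expiry ends up as
+ * ktime_get_seconds() + 3600, i.e. a seconds-since-boot deadline.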
+ */ + expiry += ctx_expiry - ktime_get_real_seconds(); + } + + rsci.h.expiry_time = expiry; + rscp = rsc_update(&rsci, rscp); + status = 0; +out: + if (gm) + lgss_mech_put(gm); + rsc_free(&rsci); + if (rscp) + cache_put(&rscp->h, &rsc_cache); + else + status = -ENOMEM; + + if (status) + CERROR("parse rsc error %d\n", status); + return status; +} + +static struct cache_detail rsc_cache = { + .hash_size = RSC_HASHMAX, + .hash_table = rsc_table, + .name = "auth.sptlrpc.context", + .cache_put = rsc_put, + .cache_parse = rsc_parse, + .match = rsc_match, + .init = rsc_init, + .update = update_rsc, + .alloc = rsc_alloc, +}; + +static struct rsc *rsc_lookup(struct rsc *item) +{ + struct cache_head *ch; + int hash = rsc_hash(item); + + ch = sunrpc_cache_lookup(&rsc_cache, &item->h, hash); + if (ch) + return container_of(ch, struct rsc, h); + else + return NULL; +} + +static struct rsc *rsc_update(struct rsc *new, struct rsc *old) +{ + struct cache_head *ch; + int hash = rsc_hash(new); + + ch = sunrpc_cache_update(&rsc_cache, &new->h, &old->h, hash); + if (ch) + return container_of(ch, struct rsc, h); + else + return NULL; +} + +#define COMPAT_RSC_PUT(item, cd) cache_put((item), (cd)) + +/**************************************** + * rsc cache flush * + ****************************************/ + +static struct rsc *gss_svc_searchbyctx(rawobj_t *handle) +{ + struct rsc rsci; + struct rsc *found; + + memset(&rsci, 0, sizeof(rsci)); + if (rawobj_dup(&rsci.handle, handle)) + return NULL; + + found = rsc_lookup(&rsci); + rsc_free(&rsci); + if (!found) + return NULL; + if (cache_check(&rsc_cache, &found->h, NULL)) + return NULL; + return found; +} + +int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx) +{ + struct rsc rsci, *rscp = NULL; + time64_t ctx_expiry; + __u32 major; + int rc; + ENTRY; + + memset(&rsci, 0, sizeof(rsci)); + + if (rawobj_alloc(&rsci.handle, (char *) &gsec->gs_rvs_hdl, + sizeof(gsec->gs_rvs_hdl))) + GOTO(out, rc = -ENOMEM); + + rscp = rsc_lookup(&rsci); + if (rscp == NULL) + GOTO(out, rc = -ENOMEM); + + major = lgss_copy_reverse_context(gctx->gc_mechctx, + &rsci.ctx.gsc_mechctx); + if (major != GSS_S_COMPLETE) + GOTO(out, rc = -ENOMEM); + + if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) { + CERROR("unable to get expire time, drop it\n"); + GOTO(out, rc = -EINVAL); + } + rsci.h.expiry_time = ctx_expiry; + + switch (imp->imp_obd->u.cli.cl_sp_to) { + case LUSTRE_SP_MDT: + rsci.ctx.gsc_usr_mds = 1; + break; + case LUSTRE_SP_OST: + rsci.ctx.gsc_usr_oss = 1; + break; + case LUSTRE_SP_CLI: + rsci.ctx.gsc_usr_root = 1; + break; + case LUSTRE_SP_MGS: + /* by convention, all 3 set to 1 means MGS */ + rsci.ctx.gsc_usr_mds = 1; + rsci.ctx.gsc_usr_oss = 1; + rsci.ctx.gsc_usr_root = 1; + break; + default: + break; + } + + rscp = rsc_update(&rsci, rscp); + if (rscp == NULL) + GOTO(out, rc = -ENOMEM); + + rscp->target = imp->imp_obd; + rawobj_dup(&gctx->gc_svc_handle, &rscp->handle); + + CWARN("create reverse svc ctx %p to %s: idx %#llx\n", + &rscp->ctx, obd2cli_tgt(imp->imp_obd), gsec->gs_rvs_hdl); + rc = 0; +out: + if (rscp) + cache_put(&rscp->h, &rsc_cache); + rsc_free(&rsci); + + if (rc) + CERROR("create reverse svc ctx: idx %#llx, rc %d\n", + gsec->gs_rvs_hdl, rc); + RETURN(rc); +} + +int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle) +{ + const time64_t expire = 20; + struct rsc *rscp; + + rscp = gss_svc_searchbyctx(handle); + if (rscp) { + CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) expire soon\n", + &rscp->ctx, rscp); + 
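+ /* Rather than dropping the reverse context right away, push its cache
+ * entry out to now + 'expire' (20s) so it is only treated as expired
+ * after a short grace period, presumably letting any in-flight
+ * callback RPCs that still use it complete. */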
+ rscp->h.expiry_time = ktime_get_real_seconds() + expire; + COMPAT_RSC_PUT(&rscp->h, &rsc_cache); + } + return 0; +} + +int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx) +{ + struct rsc *rscp = container_of(ctx, struct rsc, ctx); + + return rawobj_dup(handle, &rscp->handle); +} + +int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq) +{ + struct rsc *rscp; + + rscp = gss_svc_searchbyctx(handle); + if (rscp) { + CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) update seq to %u\n", + &rscp->ctx, rscp, seq + 1); + + rscp->ctx.gsc_rvs_seq = seq + 1; + COMPAT_RSC_PUT(&rscp->h, &rsc_cache); + } + return 0; +} + +static struct cache_deferred_req* cache_upcall_defer(struct cache_req *req) +{ + return NULL; +} +static struct cache_req cache_upcall_chandle = { cache_upcall_defer }; + +int gss_svc_upcall_handle_init(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + struct obd_device *target, + __u32 lustre_svc, + rawobj_t *rvs_hdl, + rawobj_t *in_token) +{ + struct ptlrpc_reply_state *rs; + struct rsc *rsci = NULL; + struct rsi *rsip = NULL, rsikey; + wait_queue_entry_t wait; + int replen = sizeof(struct ptlrpc_body); + struct gss_rep_header *rephdr; + int first_check = 1; + int rc = SECSVC_DROP; + ENTRY; + + memset(&rsikey, 0, sizeof(rsikey)); + rsikey.lustre_svc = lustre_svc; + /* In case of MR, rq_peer is not the NID from which request is received, + * but primary NID of peer. + * So we need LNetPrimaryNID(rq_source) to match what the clients uses. + */ + rsikey.nid = (__u64)LNetPrimaryNID(req->rq_source.nid); + nodemap_test_nid(req->rq_peer.nid, rsikey.nm_name, + sizeof(rsikey.nm_name)); + + /* duplicate context handle. for INIT it always 0 */ + if (rawobj_dup(&rsikey.in_handle, &gw->gw_handle)) { + CERROR("fail to dup context handle\n"); + GOTO(out, rc); + } + + if (rawobj_dup(&rsikey.in_token, in_token)) { + CERROR("can't duplicate token\n"); + rawobj_free(&rsikey.in_handle); + GOTO(out, rc); + } + + rsip = rsi_lookup(&rsikey); + rsi_free(&rsikey); + if (!rsip) { + CERROR("error in rsi_lookup.\n"); + + if (!gss_pack_err_notify(req, GSS_S_FAILURE, 0)) + rc = SECSVC_COMPLETE; + + GOTO(out, rc); + } + + cache_get(&rsip->h); /* take an extra ref */ + init_wait(&wait); + add_wait_queue(&rsip->waitq, &wait); + +cache_check: + /* Note each time cache_check() will drop a reference if return + * non-zero. We hold an extra reference on initial rsip, but must + * take care of following calls. */ + rc = cache_check(&rsi_cache, &rsip->h, &cache_upcall_chandle); + switch (rc) { + case -ETIMEDOUT: + case -EAGAIN: { + int valid; + + if (first_check) { + first_check = 0; + + cache_read_lock(&rsi_cache); + valid = test_bit(CACHE_VALID, &rsip->h.flags); + if (valid == 0) + set_current_state(TASK_INTERRUPTIBLE); + cache_read_unlock(&rsi_cache); + + if (valid == 0) { + unsigned long timeout; + + timeout = cfs_time_seconds(GSS_SVC_UPCALL_TIMEOUT); + schedule_timeout(timeout); + } + cache_get(&rsip->h); + goto cache_check; + } + CWARN("waited %ds timeout, drop\n", GSS_SVC_UPCALL_TIMEOUT); + break; + } + case -ENOENT: + CDEBUG(D_SEC, "cache_check return ENOENT, drop\n"); + break; + case 0: + /* if not the first check, we have to release the extra + * reference we just added on it. 
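+ * (Each retry above took a fresh cache_get() before jumping back to
+ * cache_check, and cache_check() only drops a reference itself when it
+ * returns non-zero, so the put below keeps the refcount balanced for
+ * the zero-return case.)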
*/ + if (!first_check) + cache_put(&rsip->h, &rsi_cache); + CDEBUG(D_SEC, "cache_check is good\n"); + break; + } + + remove_wait_queue(&rsip->waitq, &wait); + cache_put(&rsip->h, &rsi_cache); + + if (rc) + GOTO(out, rc = SECSVC_DROP); + + rc = SECSVC_DROP; + rsci = gss_svc_searchbyctx(&rsip->out_handle); + if (!rsci) { + CERROR("authentication failed\n"); + + /* gss mechanism returned major and minor code so we return + * those in error message */ + if (!gss_pack_err_notify(req, rsip->major_status, + rsip->minor_status)) + rc = SECSVC_COMPLETE; + + GOTO(out, rc); + } else { + cache_get(&rsci->h); + grctx->src_ctx = &rsci->ctx; + } + + if (gw->gw_flags & LUSTRE_GSS_PACK_KCSUM) { + grctx->src_ctx->gsc_mechctx->hash_func = gss_digest_hash; + } else if (!strcmp(grctx->src_ctx->gsc_mechctx->mech_type->gm_name, + "krb5") && + !krb5_allow_old_client_csum) { + CWARN("%s: deny connection from '%s' due to missing 'krb_csum' feature, set 'sptlrpc.gss.krb5_allow_old_client_csum=1' to allow, but recommend client upgrade: rc = %d\n", + target->obd_name, libcfs_nid2str(req->rq_peer.nid), + -EPROTO); + GOTO(out, rc = SECSVC_DROP); + } else { + grctx->src_ctx->gsc_mechctx->hash_func = + gss_digest_hash_compat; + } + + if (rawobj_dup(&rsci->ctx.gsc_rvs_hdl, rvs_hdl)) { + CERROR("failed duplicate reverse handle\n"); + GOTO(out, rc); + } + + rsci->target = target; + + CDEBUG(D_SEC, "server create rsc %p(%u->%s)\n", + rsci, rsci->ctx.gsc_uid, libcfs_nid2str(req->rq_peer.nid)); + + if (rsip->out_handle.len > PTLRPC_GSS_MAX_HANDLE_SIZE) { + CERROR("handle size %u too large\n", rsip->out_handle.len); + GOTO(out, rc = SECSVC_DROP); + } + + grctx->src_init = 1; + grctx->src_reserve_len = round_up(rsip->out_token.len, 4); + + rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0); + if (rc) { + CERROR("failed to pack reply: %d\n", rc); + GOTO(out, rc = SECSVC_DROP); + } + + rs = req->rq_reply_state; + LASSERT(rs->rs_repbuf->lm_bufcount == 3); + LASSERT(rs->rs_repbuf->lm_buflens[0] >= + sizeof(*rephdr) + rsip->out_handle.len); + LASSERT(rs->rs_repbuf->lm_buflens[2] >= rsip->out_token.len); + + rephdr = lustre_msg_buf(rs->rs_repbuf, 0, 0); + rephdr->gh_version = PTLRPC_GSS_VERSION; + rephdr->gh_flags = 0; + rephdr->gh_proc = PTLRPC_GSS_PROC_ERR; + rephdr->gh_major = rsip->major_status; + rephdr->gh_minor = rsip->minor_status; + rephdr->gh_seqwin = GSS_SEQ_WIN; + rephdr->gh_handle.len = rsip->out_handle.len; + memcpy(rephdr->gh_handle.data, rsip->out_handle.data, + rsip->out_handle.len); + + memcpy(lustre_msg_buf(rs->rs_repbuf, 2, 0), rsip->out_token.data, + rsip->out_token.len); + + rs->rs_repdata_len = lustre_shrink_msg(rs->rs_repbuf, 2, + rsip->out_token.len, 0); + + rc = SECSVC_OK; + +out: + /* it looks like here we should put rsip also, but this mess up + * with NFS cache mgmt code... 
FIXME + * something like: + * if (rsip) + * rsi_put(&rsip->h, &rsi_cache); */ + + if (rsci) { + /* if anything went wrong, we don't keep the context too */ + if (rc != SECSVC_OK) + set_bit(CACHE_NEGATIVE, &rsci->h.flags); + else + CDEBUG(D_SEC, "create rsc with idx %#llx\n", + gss_handle_to_u64(&rsci->handle)); + + COMPAT_RSC_PUT(&rsci->h, &rsc_cache); + } + RETURN(rc); +} + +struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct rsc *rsc; + + rsc = gss_svc_searchbyctx(&gw->gw_handle); + if (!rsc) { + CWARN("Invalid gss ctx idx %#llx from %s\n", + gss_handle_to_u64(&gw->gw_handle), + libcfs_nid2str(req->rq_peer.nid)); + return NULL; + } + + return &rsc->ctx; +} + +void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx) +{ + struct rsc *rsc = container_of(ctx, struct rsc, ctx); + + COMPAT_RSC_PUT(&rsc->h, &rsc_cache); +} + +void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx) +{ + struct rsc *rsc = container_of(ctx, struct rsc, ctx); + + /* can't be found */ + set_bit(CACHE_NEGATIVE, &rsc->h.flags); + /* to be removed at next scan */ + rsc->h.expiry_time = 1; +} + +int __init gss_init_svc_upcall(void) +{ + int i, rc; + + /* + * this helps reducing context index confliction. after server reboot, + * conflicting request from clients might be filtered out by initial + * sequence number checking, thus no chance to sent error notification + * back to clients. + */ + get_random_bytes(&__ctx_index, sizeof(__ctx_index)); + +#ifdef HAVE_CACHE_HEAD_HLIST + for (i = 0; i < rsi_cache.hash_size; i++) + INIT_HLIST_HEAD(&rsi_cache.hash_table[i]); +#endif + rc = cache_register_net(&rsi_cache, &init_net); + if (rc != 0) + return rc; + +#ifdef HAVE_CACHE_HEAD_HLIST + for (i = 0; i < rsc_cache.hash_size; i++) + INIT_HLIST_HEAD(&rsc_cache.hash_table[i]); +#endif + rc = cache_register_net(&rsc_cache, &init_net); + if (rc != 0) { + cache_unregister_net(&rsi_cache, &init_net); + return rc; + } + + /* FIXME this looks stupid. we intend to give lsvcgssd a chance to open + * the init upcall channel, otherwise there's big chance that the first + * upcall issued before the channel be opened thus nfsv4 cache code will + * drop the request directly, thus lead to unnecessary recovery time. + * Here we wait at minimum 1.5 seconds. + */ + for (i = 0; i < 6; i++) { + if (channel_users(&rsi_cache) > 0) + break; + schedule_timeout_uninterruptible(cfs_time_seconds(1) / 4); + } + + if (channel_users(&rsi_cache) == 0) + CDEBUG(D_SEC, + "Init channel is not opened by lsvcgssd, following request might be dropped until lsvcgssd is active\n"); + + return 0; +} + +void gss_exit_svc_upcall(void) +{ + cache_purge(&rsi_cache); + cache_unregister_net(&rsi_cache, &init_net); + + cache_purge(&rsc_cache); + cache_unregister_net(&rsc_cache, &init_net); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c new file mode 100644 index 0000000000000..e401985e69f50 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c @@ -0,0 +1,278 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static struct dentry *gss_debugfs_dir_lk; +static struct dentry *gss_debugfs_dir; +static struct proc_dir_entry *gss_lprocfs_dir; + +/* + * statistic of "out-of-sequence-window" + */ +static struct { + spinlock_t oos_lock; + atomic_t oos_cli_count; /* client occurrence */ + int oos_cli_behind; /* client max seqs behind */ + atomic_t oos_svc_replay[3]; /* server replay detected */ + atomic_t oos_svc_pass[3]; /* server verified ok */ +} gss_stat_oos = { + .oos_cli_count = ATOMIC_INIT(0), + .oos_cli_behind = 0, + .oos_svc_replay = { ATOMIC_INIT(0), }, + .oos_svc_pass = { ATOMIC_INIT(0), }, +}; + +void gss_stat_oos_record_cli(int behind) +{ + atomic_inc(&gss_stat_oos.oos_cli_count); + + spin_lock(&gss_stat_oos.oos_lock); + if (behind > gss_stat_oos.oos_cli_behind) + gss_stat_oos.oos_cli_behind = behind; + spin_unlock(&gss_stat_oos.oos_lock); +} + +void gss_stat_oos_record_svc(int phase, int replay) +{ + LASSERT(phase >= 0 && phase <= 2); + + if (replay) + atomic_inc(&gss_stat_oos.oos_svc_replay[phase]); + else + atomic_inc(&gss_stat_oos.oos_svc_pass[phase]); +} + +static int gss_proc_oos_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "seqwin: %u\n" + "backwin: %u\n" + "client fall behind seqwin\n" + " occurrence: %d\n" + " max seq behind: %d\n" + "server replay detected:\n" + " phase 0: %d\n" + " phase 1: %d\n" + " phase 2: %d\n" + "server verify ok:\n" + " phase 2: %d\n", + GSS_SEQ_WIN_MAIN, + GSS_SEQ_WIN_BACK, + atomic_read(&gss_stat_oos.oos_cli_count), + gss_stat_oos.oos_cli_behind, + atomic_read(&gss_stat_oos.oos_svc_replay[0]), + atomic_read(&gss_stat_oos.oos_svc_replay[1]), + atomic_read(&gss_stat_oos.oos_svc_replay[2]), + atomic_read(&gss_stat_oos.oos_svc_pass[2])); + return 0; +} +LDEBUGFS_SEQ_FOPS_RO(gss_proc_oos); + +static ssize_t +gss_proc_write_secinit(struct file *file, const char *buffer, + size_t count, loff_t *off) +{ + int rc; + + rc = gss_do_ctx_init_rpc((char *) buffer, count); + if (rc) { + LASSERT(rc < 0); + return rc; + } + return count; +} + +static const struct file_operations gss_proc_secinit = { + .write = gss_proc_write_secinit, +}; + +int sptlrpc_krb5_allow_old_client_csum_seq_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%u\n", krb5_allow_old_client_csum); + return 0; +} + +ssize_t sptlrpc_krb5_allow_old_client_csum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + krb5_allow_old_client_csum = val; + 
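+ /* takes effect for subsequently accepted krb5 contexts, see the
+ * check in gss_svc_upcall_handle_init() */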
return count; +} +LPROC_SEQ_FOPS(sptlrpc_krb5_allow_old_client_csum); + +#ifdef HAVE_GSS_KEYRING +int sptlrpc_gss_check_upcall_ns_seq_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%u\n", gss_check_upcall_ns); + return 0; +} + +ssize_t sptlrpc_gss_check_upcall_ns_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + gss_check_upcall_ns = val; + return count; +} +LPROC_SEQ_FOPS(sptlrpc_gss_check_upcall_ns); +#endif /* HAVE_GSS_KEYRING */ + +static struct ldebugfs_vars gss_debugfs_vars[] = { + { .name = "replays", + .fops = &gss_proc_oos_fops }, + { .name = "init_channel", + .fops = &gss_proc_secinit, + .proc_mode = 0222 }, + { NULL } +}; + +static struct lprocfs_vars gss_lprocfs_vars[] = { + { .name = "krb5_allow_old_client_csum", + .fops = &sptlrpc_krb5_allow_old_client_csum_fops }, +#ifdef HAVE_GSS_KEYRING + { .name = "gss_check_upcall_ns", + .fops = &sptlrpc_gss_check_upcall_ns_fops }, +#endif + { NULL } +}; + +/* + * for userspace helper lgss_keyring. + * + * debug_level: [0, 4], defined in utils/gss/lgss_utils.h + */ +static int gss_lk_debug_level = 1; + +static int gss_lk_proc_dl_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%u\n", gss_lk_debug_level); + return 0; +} + +static ssize_t +gss_lk_proc_dl_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + unsigned int val; + int rc; + + rc = kstrtouint_from_user(buffer, count, 0, &val); + if (rc < 0) + return rc; + + if (val > 4) + return -ERANGE; + + gss_lk_debug_level = val; + + return count; +} +LDEBUGFS_SEQ_FOPS(gss_lk_proc_dl); + +static struct ldebugfs_vars gss_lk_debugfs_vars[] = { + { .name = "debug_level", + .fops = &gss_lk_proc_dl_fops }, + { NULL } +}; + +void gss_exit_tunables(void) +{ + debugfs_remove_recursive(gss_debugfs_dir_lk); + gss_debugfs_dir_lk = NULL; + + debugfs_remove_recursive(gss_debugfs_dir); + gss_debugfs_dir = NULL; + + if (!IS_ERR_OR_NULL(gss_lprocfs_dir)) + lprocfs_remove(&gss_lprocfs_dir); +} + +int gss_init_tunables(void) +{ + int rc; + + spin_lock_init(&gss_stat_oos.oos_lock); + + gss_debugfs_dir = debugfs_create_dir("gss", sptlrpc_debugfs_dir); + ldebugfs_add_vars(gss_debugfs_dir, gss_debugfs_vars, NULL); + + gss_debugfs_dir_lk = debugfs_create_dir("lgss_keyring", + gss_debugfs_dir); + ldebugfs_add_vars(gss_debugfs_dir_lk, gss_lk_debugfs_vars, NULL); + + gss_lprocfs_dir = lprocfs_register("gss", sptlrpc_lprocfs_dir, + gss_lprocfs_vars, NULL); + if (IS_ERR_OR_NULL(gss_lprocfs_dir)) { + rc = gss_lprocfs_dir ? PTR_ERR(gss_lprocfs_dir) : -ENOMEM; + gss_lprocfs_dir = NULL; + GOTO(out, rc); + } + + return 0; + +out: + CERROR("failed to initialize gss lproc entries: %d\n", rc); + gss_exit_tunables(); + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c new file mode 100644 index 0000000000000..7c8001152a454 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c @@ -0,0 +1,2929 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2015, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/auth_gss.c + * + * RPCSEC_GSS client authentication. + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Dug Song + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +#include +#include + +/* + * early reply have fixed size, respectively in privacy and integrity mode. + * so we calculate them only once. + */ +static int gss_at_reply_off_integ; +static int gss_at_reply_off_priv; + + +static inline int msg_last_segidx(struct lustre_msg *msg) +{ + LASSERT(msg->lm_bufcount > 0); + return msg->lm_bufcount - 1; +} +static inline int msg_last_seglen(struct lustre_msg *msg) +{ + return msg->lm_buflens[msg_last_segidx(msg)]; +} + +/******************************************** + * wire data swabber * + ********************************************/ + +static +void gss_header_swabber(struct gss_header *ghdr) +{ + __swab32s(&ghdr->gh_flags); + __swab32s(&ghdr->gh_proc); + __swab32s(&ghdr->gh_seq); + __swab32s(&ghdr->gh_svc); + __swab32s(&ghdr->gh_pad1); + __swab32s(&ghdr->gh_handle.len); +} + +struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment, + int swabbed) +{ + struct gss_header *ghdr; + + ghdr = lustre_msg_buf(msg, segment, sizeof(*ghdr)); + if (ghdr == NULL) + return NULL; + + if (swabbed) + gss_header_swabber(ghdr); + + if (sizeof(*ghdr) + ghdr->gh_handle.len > msg->lm_buflens[segment]) { + CERROR("gss header has length %d, now %u received\n", + (int) sizeof(*ghdr) + ghdr->gh_handle.len, + msg->lm_buflens[segment]); + return NULL; + } + + return ghdr; +} + +/* + * payload should be obtained from mechanism. but currently since we + * only support kerberos, we could simply use fixed value. + * krb5 "meta" data: + * - krb5 header: 16 + * - krb5 checksum: 20 + * + * for privacy mode, payload also include the cipher text which has the same + * size as plain text, plus possible confounder, padding both at maximum cipher + * block size. 
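+ * The 16 + 20 bytes of fixed overhead above are budgeted as 40 bytes
+ * below; in privacy mode gss_mech_payload() further adds three 16-byte
+ * (cipher-block sized) terms on top of the message size itself.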
+ */ +#define GSS_KRB5_INTEG_MAX_PAYLOAD (40) + +static inline +int gss_mech_payload(struct gss_ctx *mechctx, int msgsize, int privacy) +{ + if (privacy) + return GSS_KRB5_INTEG_MAX_PAYLOAD + 16 + 16 + 16 + msgsize; + else + return GSS_KRB5_INTEG_MAX_PAYLOAD; +} + +/* + * return signature size, otherwise < 0 to indicate error + */ +static int gss_sign_msg(struct lustre_msg *msg, + struct gss_ctx *mechctx, + enum lustre_sec_part sp, + __u32 flags, __u32 proc, __u32 seq, __u32 svc, + rawobj_t *handle) +{ + struct gss_header *ghdr; + rawobj_t text[4], mic; + int textcnt, max_textcnt, mic_idx; + __u32 major; + + LASSERT(msg->lm_bufcount >= 2); + + /* gss hdr */ + LASSERT(msg->lm_buflens[0] >= + sizeof(*ghdr) + (handle ? handle->len : 0)); + ghdr = lustre_msg_buf(msg, 0, 0); + + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) sp; + ghdr->gh_flags = flags; + ghdr->gh_proc = proc; + ghdr->gh_seq = seq; + ghdr->gh_svc = svc; + if (!handle) { + /* fill in a fake one */ + ghdr->gh_handle.len = 0; + } else { + ghdr->gh_handle.len = handle->len; + memcpy(ghdr->gh_handle.data, handle->data, handle->len); + } + + /* no actual signature for null mode */ + if (svc == SPTLRPC_SVC_NULL) + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + + /* MIC */ + mic_idx = msg_last_segidx(msg); + max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx; + + for (textcnt = 0; textcnt < max_textcnt; textcnt++) { + text[textcnt].len = msg->lm_buflens[textcnt]; + text[textcnt].data = lustre_msg_buf(msg, textcnt, 0); + } + + mic.len = msg->lm_buflens[mic_idx]; + mic.data = lustre_msg_buf(msg, mic_idx, 0); + + major = lgss_get_mic(mechctx, textcnt, text, 0, NULL, &mic); + if (major != GSS_S_COMPLETE) { + CERROR("fail to generate MIC: %08x\n", major); + return -EPERM; + } + LASSERT(mic.len <= msg->lm_buflens[mic_idx]); + + return lustre_shrink_msg(msg, mic_idx, mic.len, 0); +} + +/* + * return gss error + */ +static +__u32 gss_verify_msg(struct lustre_msg *msg, + struct gss_ctx *mechctx, + __u32 svc) +{ + rawobj_t text[4], mic; + int textcnt, max_textcnt; + int mic_idx; + __u32 major; + + LASSERT(msg->lm_bufcount >= 2); + + if (svc == SPTLRPC_SVC_NULL) + return GSS_S_COMPLETE; + + mic_idx = msg_last_segidx(msg); + max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 
1 : mic_idx; + + for (textcnt = 0; textcnt < max_textcnt; textcnt++) { + text[textcnt].len = msg->lm_buflens[textcnt]; + text[textcnt].data = lustre_msg_buf(msg, textcnt, 0); + } + + mic.len = msg->lm_buflens[mic_idx]; + mic.data = lustre_msg_buf(msg, mic_idx, 0); + + major = lgss_verify_mic(mechctx, textcnt, text, 0, NULL, &mic); + if (major != GSS_S_COMPLETE) + CERROR("mic verify error: %08x\n", major); + + return major; +} + +/* + * return gss error code + */ +static +__u32 gss_unseal_msg(struct gss_ctx *mechctx, + struct lustre_msg *msgbuf, + int *msg_len, int msgbuf_len) +{ + rawobj_t clear_obj, hdrobj, token; + __u8 *clear_buf; + int clear_buflen; + __u32 major; + ENTRY; + + if (msgbuf->lm_bufcount != 2) { + CERROR("invalid bufcount %d\n", msgbuf->lm_bufcount); + RETURN(GSS_S_FAILURE); + } + + /* allocate a temporary clear text buffer, same sized as token, + * we assume the final clear text size <= token size */ + clear_buflen = lustre_msg_buflen(msgbuf, 1); + OBD_ALLOC_LARGE(clear_buf, clear_buflen); + if (!clear_buf) + RETURN(GSS_S_FAILURE); + + /* buffer objects */ + hdrobj.len = lustre_msg_buflen(msgbuf, 0); + hdrobj.data = lustre_msg_buf(msgbuf, 0, 0); + token.len = lustre_msg_buflen(msgbuf, 1); + token.data = lustre_msg_buf(msgbuf, 1, 0); + clear_obj.len = clear_buflen; + clear_obj.data = clear_buf; + + major = lgss_unwrap(mechctx, &hdrobj, &token, &clear_obj); + if (major != GSS_S_COMPLETE) { + CERROR("unwrap message error: %08x\n", major); + GOTO(out_free, major = GSS_S_FAILURE); + } + LASSERT(clear_obj.len <= clear_buflen); + LASSERT(clear_obj.len <= msgbuf_len); + + /* now the decrypted message */ + memcpy(msgbuf, clear_obj.data, clear_obj.len); + *msg_len = clear_obj.len; + + major = GSS_S_COMPLETE; +out_free: + OBD_FREE_LARGE(clear_buf, clear_buflen); + RETURN(major); +} + +/******************************************** + * gss client context manipulation helpers * + ********************************************/ + +int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount)); + + if (!test_and_set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags)) { + if (!ctx->cc_early_expire) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + CWARN("ctx %p(%u->%s) get expired: %lld(%+llds)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + ctx->cc_expire, + ctx->cc_expire == 0 ? 0 : + ctx->cc_expire - ktime_get_real_seconds()); + + sptlrpc_cli_ctx_wakeup(ctx); + return 1; + } + + return 0; +} + +/* + * return 1 if the context is dead. + */ +int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx) +{ + if (unlikely(cli_ctx_is_dead(ctx))) + return 1; + + /* expire is 0 means never expire. a newly created gss context + * which during upcall may has 0 expiration */ + if (ctx->cc_expire == 0) + return 0; + + /* check real expiration */ + if (ctx->cc_expire > ktime_get_real_seconds()) + return 0; + + cli_ctx_expire(ctx); + return 1; +} + +void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx) +{ + struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; + time64_t ctx_expiry; + + if (lgss_inquire_context(gctx->gc_mechctx, &ctx_expiry)) { + CERROR("ctx %p(%u): unable to inquire, expire it now\n", + gctx, ctx->cc_vcred.vc_uid); + ctx_expiry = 1; /* make it expired now */ + } + + ctx->cc_expire = gss_round_ctx_expiry(ctx_expiry, + ctx->cc_sec->ps_flvr.sf_flags); + + /* At this point this ctx might have been marked as dead by + * someone else, in which case nobody will make further use + * of it. 
we don't care, and mark it UPTODATE will help + * destroying server side context when it be destroyed. */ + set_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + if (sec_is_reverse(ctx->cc_sec)) { + CWARN("server installed reverse ctx %p idx %#llx, " + "expiry %lld(%+llds)\n", ctx, + gss_handle_to_u64(&gctx->gc_handle), + ctx->cc_expire, + ctx->cc_expire - ktime_get_real_seconds()); + } else { + CWARN("client refreshed ctx %p idx %#llx (%u->%s), " + "expiry %lld(%+llds)\n", ctx, + gss_handle_to_u64(&gctx->gc_handle), + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + ctx->cc_expire, + ctx->cc_expire - ktime_get_real_seconds()); + + /* install reverse svc ctx for root context */ + if (ctx->cc_vcred.vc_uid == 0) + gss_sec_install_rctx(ctx->cc_sec->ps_import, + ctx->cc_sec, ctx); + } + + sptlrpc_cli_ctx_wakeup(ctx); +} + +static void gss_cli_ctx_finalize(struct gss_cli_ctx *gctx) +{ + LASSERT(gctx->gc_base.cc_sec); + + if (gctx->gc_mechctx) { + lgss_delete_sec_context(&gctx->gc_mechctx); + gctx->gc_mechctx = NULL; + } + + if (!rawobj_empty(&gctx->gc_svc_handle)) { + /* forward ctx: mark buddy reverse svcctx soon-expire. */ + if (!sec_is_reverse(gctx->gc_base.cc_sec) && + !rawobj_empty(&gctx->gc_svc_handle)) + gss_svc_upcall_expire_rvs_ctx(&gctx->gc_svc_handle); + + rawobj_free(&gctx->gc_svc_handle); + } + + rawobj_free(&gctx->gc_handle); +} + +/** + * Based on sequence number algorithm as specified in RFC 2203. + * + * Modified for our own problem: arriving request has valid sequence number, + * but unwrapping request might cost a long time, after that its sequence + * are not valid anymore (fall behind the window). It rarely happen, mostly + * under extreme load. + * + * Note we should not check sequence before verifying the integrity of incoming + * request, because just one attacking request with high sequence number might + * cause all following requests be dropped. + * + * So here we use a multi-phase approach: prepare 2 sequence windows, + * "main window" for normal sequence and "back window" for fall behind sequence. + * and 3-phase checking mechanism: + * 0 - before integrity verification, perform an initial sequence checking in + * main window, which only tries and doesn't actually set any bits. if the + * sequence is high above the window or fits in the window and the bit + * is 0, then accept and proceed to integrity verification. otherwise + * reject this sequence. + * 1 - after integrity verification, check in main window again. if this + * sequence is high above the window or fits in the window and the bit + * is 0, then set the bit and accept; if it fits in the window but bit + * already set, then reject; if it falls behind the window, then proceed + * to phase 2. + * 2 - check in back window. if it is high above the window or fits in the + * window and the bit is 0, then set the bit and accept. otherwise reject. + * + * \return 1: looks like a replay + * \return 0: is ok + * \return -1: is a replay + * + * Note phase 0 is necessary, because otherwise replay attacking request of + * sequence which between the 2 windows can't be detected. + * + * This mechanism can't totally solve the problem, but could help reduce the + * number of valid requests be dropped. + */ +static +int gss_do_check_seq(unsigned long *window, __u32 win_size, __u32 *max_seq, + __u32 seq_num, int phase) +{ + LASSERT(phase >= 0 && phase <= 2); + + if (seq_num > *max_seq) { + /* + * 1. 
high above the window + */ + if (phase == 0) + return 0; + + if (seq_num >= *max_seq + win_size) { + memset(window, 0, win_size / 8); + *max_seq = seq_num; + } else { + while(*max_seq < seq_num) { + (*max_seq)++; + __clear_bit((*max_seq) % win_size, window); + } + } + __set_bit(seq_num % win_size, window); + } else if (seq_num + win_size <= *max_seq) { + /* + * 2. low behind the window + */ + if (phase == 0 || phase == 2) + goto replay; + + CWARN("seq %u is %u behind (size %d), check backup window\n", + seq_num, *max_seq - win_size - seq_num, win_size); + return 1; + } else { + /* + * 3. fit into the window + */ + switch (phase) { + case 0: + if (test_bit(seq_num % win_size, window)) + goto replay; + break; + case 1: + case 2: + if (__test_and_set_bit(seq_num % win_size, window)) + goto replay; + break; + } + } + + return 0; + +replay: + CERROR("seq %u (%s %s window) is a replay: max %u, winsize %d\n", + seq_num, + seq_num + win_size > *max_seq ? "in" : "behind", + phase == 2 ? "backup " : "main", + *max_seq, win_size); + return -1; +} + +/* + * Based on sequence number algorithm as specified in RFC 2203. + * + * if @set == 0: initial check, don't set any bit in window + * if @sec == 1: final check, set bit in window + */ +int gss_check_seq_num(struct gss_svc_seq_data *ssd, __u32 seq_num, int set) +{ + int rc = 0; + + spin_lock(&ssd->ssd_lock); + + if (set == 0) { + /* + * phase 0 testing + */ + rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN, + &ssd->ssd_max_main, seq_num, 0); + if (unlikely(rc)) + gss_stat_oos_record_svc(0, 1); + } else { + /* + * phase 1 checking main window + */ + rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN, + &ssd->ssd_max_main, seq_num, 1); + switch (rc) { + case -1: + gss_stat_oos_record_svc(1, 1); + fallthrough; + case 0: + goto exit; + } + /* + * phase 2 checking back window + */ + rc = gss_do_check_seq(ssd->ssd_win_back, GSS_SEQ_WIN_BACK, + &ssd->ssd_max_back, seq_num, 2); + if (rc) + gss_stat_oos_record_svc(2, 1); + else + gss_stat_oos_record_svc(2, 0); + } +exit: + spin_unlock(&ssd->ssd_lock); + return rc; +} + +/*************************************** + * cred APIs * + ***************************************/ + +static inline int gss_cli_payload(struct ptlrpc_cli_ctx *ctx, + int msgsize, int privacy) +{ + return gss_mech_payload(NULL, msgsize, privacy); +} + +static int gss_cli_bulk_payload(struct ptlrpc_cli_ctx *ctx, + struct sptlrpc_flavor *flvr, + int reply, int read) +{ + int payload = sizeof(struct ptlrpc_bulk_sec_desc); + + LASSERT(SPTLRPC_FLVR_BULK_TYPE(flvr->sf_rpc) == SPTLRPC_BULK_DEFAULT); + + if ((!reply && !read) || (reply && read)) { + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_NULL: + break; + case SPTLRPC_BULK_SVC_INTG: + payload += gss_cli_payload(ctx, 0, 0); + break; + case SPTLRPC_BULK_SVC_PRIV: + payload += gss_cli_payload(ctx, 0, 1); + break; + case SPTLRPC_BULK_SVC_AUTH: + default: + LBUG(); + } + } + + return payload; +} + +int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred) +{ + return (ctx->cc_vcred.vc_uid == vcred->vc_uid); +} + +void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_CTX_NEW) + strlcat(buf, "new,", bufsize); + if (flags & PTLRPC_CTX_UPTODATE) + strlcat(buf, "uptodate,", bufsize); + if (flags & PTLRPC_CTX_DEAD) + strlcat(buf, "dead,", bufsize); + if (flags & PTLRPC_CTX_ERROR) + strlcat(buf, "error,", bufsize); + if (flags & PTLRPC_CTX_CACHED) + strlcat(buf, "cached,", bufsize); + if 
(flags & PTLRPC_CTX_ETERNAL) + strlcat(buf, "eternal,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); +} + +int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + __u32 flags = 0, seq, svc; + int rc; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf->lm_bufcount >= 2); + LASSERT(req->rq_cli_ctx == ctx); + + /* nothing to do for context negotiation RPCs */ + if (req->rq_ctx_init) + RETURN(0); + + svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + if (req->rq_pack_bulk) + flags |= LUSTRE_GSS_PACK_BULK; + if (req->rq_pack_udesc) + flags |= LUSTRE_GSS_PACK_USER; + +redo: + seq = atomic_inc_return(&gctx->gc_seq); + + rc = gss_sign_msg(req->rq_reqbuf, gctx->gc_mechctx, + ctx->cc_sec->ps_part, + flags, gctx->gc_proc, seq, svc, + &gctx->gc_handle); + if (rc < 0) + RETURN(rc); + + /* gss_sign_msg() msg might take long time to finish, in which period + * more rpcs could be wrapped up and sent out. if we found too many + * of them we should repack this rpc, because sent it too late might + * lead to the sequence number fall behind the window on server and + * be dropped. also applies to gss_cli_ctx_seal(). + * + * Note: null mode doesn't check sequence number. */ + if (svc != SPTLRPC_SVC_NULL && + atomic_read(&gctx->gc_seq) - seq > GSS_SEQ_REPACK_THRESHOLD) { + int behind = atomic_read(&gctx->gc_seq) - seq; + + gss_stat_oos_record_cli(behind); + CWARN("req %p: %u behind, retry signing\n", req, behind); + goto redo; + } + + req->rq_reqdata_len = rc; + RETURN(0); +} + +static +int gss_cli_ctx_handle_err_notify(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct gss_header *ghdr) +{ + struct gss_err_header *errhdr; + int rc; + + LASSERT(ghdr->gh_proc == PTLRPC_GSS_PROC_ERR); + + errhdr = (struct gss_err_header *) ghdr; + + CWARN("req x%llu/t%llu, ctx %p idx %#llx(%u->%s): " + "%sserver respond (%08x/%08x)\n", + req->rq_xid, req->rq_transno, ctx, + gss_handle_to_u64(&ctx2gctx(ctx)->gc_handle), + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + sec_is_reverse(ctx->cc_sec) ? "reverse" : "", + errhdr->gh_major, errhdr->gh_minor); + + /* context fini rpc, let it failed */ + if (req->rq_ctx_fini) { + CWARN("context fini rpc failed\n"); + return -EINVAL; + } + + /* reverse sec, just return error, don't expire this ctx because it's + * crucial to callback rpcs. note if the callback rpc failed because + * of bit flip during network transfer, the client will be evicted + * directly. so more gracefully we probably want let it retry for + * number of times. */ + if (sec_is_reverse(ctx->cc_sec)) + return -EINVAL; + + if (errhdr->gh_major != GSS_S_NO_CONTEXT && + errhdr->gh_major != GSS_S_BAD_SIG) + return -EACCES; + + /* server return NO_CONTEXT might be caused by context expire + * or server reboot/failover. we try to refresh a new ctx which + * be transparent to upper layer. + * + * In some cases, our gss handle is possible to be incidentally + * identical to another handle since the handle itself is not + * fully random. In krb5 case, the GSS_S_BAD_SIG will be + * returned, maybe other gss error for other mechanism. + * + * if we add new mechanism, make sure the correct error are + * returned in this case. */ + CWARN("%s: server might lost the context, retrying\n", + errhdr->gh_major == GSS_S_NO_CONTEXT ? 
"NO_CONTEXT" : "BAD_SIG"); + + sptlrpc_cli_ctx_expire(ctx); + + /* we need replace the ctx right here, otherwise during + * resent we'll hit the logic in sptlrpc_req_refresh_ctx() + * which keep the ctx with RESEND flag, thus we'll never + * get rid of this ctx. */ + rc = sptlrpc_req_replace_dead_ctx(req); + if (rc == 0) + req->rq_resend = 1; + + return rc; +} + +int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + struct gss_header *ghdr, *reqhdr; + struct lustre_msg *msg = req->rq_repdata; + __u32 major; + int pack_bulk, swabbed, rc = 0; + ENTRY; + + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(msg); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + /* special case for context negotiation, rq_repmsg/rq_replen actually + * are not used currently. but early reply always be treated normally */ + if (req->rq_ctx_init && !req->rq_early) { + req->rq_repmsg = lustre_msg_buf(msg, 1, 0); + req->rq_replen = msg->lm_buflens[1]; + RETURN(0); + } + + if (msg->lm_bufcount < 2 || msg->lm_bufcount > 4) { + CERROR("unexpected bufcount %u\n", msg->lm_bufcount); + RETURN(-EPROTO); + } + + swabbed = req_capsule_rep_need_swab(&req->rq_pill); + + ghdr = gss_swab_header(msg, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + RETURN(-EPROTO); + } + + /* sanity checks */ + reqhdr = lustre_msg_buf(msg, 0, sizeof(*reqhdr)); + LASSERT(reqhdr); + + if (ghdr->gh_version != reqhdr->gh_version) { + CERROR("gss version %u mismatch, expect %u\n", + ghdr->gh_version, reqhdr->gh_version); + RETURN(-EPROTO); + } + + switch (ghdr->gh_proc) { + case PTLRPC_GSS_PROC_DATA: + pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK; + + if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){ + CERROR("%s bulk flag in reply\n", + req->rq_pack_bulk ? 
"missing" : "unexpected"); + RETURN(-EPROTO); + } + + if (ghdr->gh_seq != reqhdr->gh_seq) { + CERROR("seqnum %u mismatch, expect %u\n", + ghdr->gh_seq, reqhdr->gh_seq); + RETURN(-EPROTO); + } + + if (ghdr->gh_svc != reqhdr->gh_svc) { + CERROR("svc %u mismatch, expect %u\n", + ghdr->gh_svc, reqhdr->gh_svc); + RETURN(-EPROTO); + } + + if (swabbed) + gss_header_swabber(ghdr); + + major = gss_verify_msg(msg, gctx->gc_mechctx, reqhdr->gh_svc); + if (major != GSS_S_COMPLETE) { + CERROR("failed to verify reply: %x\n", major); + RETURN(-EPERM); + } + + if (req->rq_early && reqhdr->gh_svc == SPTLRPC_SVC_NULL) { + __u32 cksum; + + cksum = crc32_le(!(__u32) 0, + lustre_msg_buf(msg, 1, 0), + lustre_msg_buflen(msg, 1)); + if (cksum != msg->lm_cksum) { + CWARN("early reply checksum mismatch: " + "%08x != %08x\n", cksum, msg->lm_cksum); + RETURN(-EPROTO); + } + } + + if (pack_bulk) { + /* bulk checksum is right after the lustre msg */ + if (msg->lm_bufcount < 3) { + CERROR("Invalid reply bufcount %u\n", + msg->lm_bufcount); + RETURN(-EPROTO); + } + + rc = bulk_sec_desc_unpack(msg, 2, swabbed); + if (rc) { + CERROR("unpack bulk desc: %d\n", rc); + RETURN(rc); + } + } + + req->rq_repmsg = lustre_msg_buf(msg, 1, 0); + req->rq_replen = msg->lm_buflens[1]; + break; + case PTLRPC_GSS_PROC_ERR: + if (req->rq_early) { + CERROR("server return error with early reply\n"); + rc = -EPROTO; + } else { + rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr); + } + break; + default: + CERROR("unknown gss proc %d\n", ghdr->gh_proc); + rc = -EPROTO; + } + + RETURN(rc); +} + +int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + rawobj_t hdrobj, msgobj, token; + struct gss_header *ghdr; + __u32 buflens[2], major; + int wiresize, rc; + ENTRY; + + LASSERT(req->rq_clrbuf); + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(req->rq_reqlen); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + /* final clear data length */ + req->rq_clrdata_len = lustre_msg_size_v2(req->rq_clrbuf->lm_bufcount, + req->rq_clrbuf->lm_buflens); + + /* calculate wire data length */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(&gctx->gc_base, req->rq_clrdata_len, 1); + wiresize = lustre_msg_size_v2(2, buflens); + + /* allocate wire buffer */ + if (req->rq_pool) { + /* pre-allocated */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf != req->rq_clrbuf); + LASSERT(req->rq_reqbuf_len >= wiresize); + } else { + OBD_ALLOC_LARGE(req->rq_reqbuf, wiresize); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); + req->rq_reqbuf_len = wiresize; + } + + lustre_init_msg_v2(req->rq_reqbuf, 2, buflens, NULL); + req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + /* gss header */ + ghdr = lustre_msg_buf(req->rq_reqbuf, 0, 0); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) ctx->cc_sec->ps_part; + ghdr->gh_flags = 0; + ghdr->gh_proc = gctx->gc_proc; + ghdr->gh_svc = SPTLRPC_SVC_PRIV; + ghdr->gh_handle.len = gctx->gc_handle.len; + memcpy(ghdr->gh_handle.data, gctx->gc_handle.data, gctx->gc_handle.len); + if (req->rq_pack_bulk) + ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK; + if (req->rq_pack_udesc) + ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; + +redo: + ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq); + + /* buffer objects */ + hdrobj.len = PTLRPC_GSS_HEADER_SIZE; + hdrobj.data = (__u8 *) ghdr; + msgobj.len = req->rq_clrdata_len; + msgobj.data = (__u8 *) req->rq_clrbuf; + token.len = lustre_msg_buflen(req->rq_reqbuf, 1); + token.data = lustre_msg_buf(req->rq_reqbuf, 1, 0); + + major 
= lgss_wrap(gctx->gc_mechctx, &hdrobj, &msgobj, + req->rq_clrbuf_len, &token); + if (major != GSS_S_COMPLETE) { + CERROR("priv: wrap message error: %08x\n", major); + GOTO(err_free, rc = -EPERM); + } + LASSERT(token.len <= buflens[1]); + + /* see explain in gss_cli_ctx_sign() */ + if (unlikely(atomic_read(&gctx->gc_seq) - ghdr->gh_seq > + GSS_SEQ_REPACK_THRESHOLD)) { + int behind = atomic_read(&gctx->gc_seq) - ghdr->gh_seq; + + gss_stat_oos_record_cli(behind); + CWARN("req %p: %u behind, retry sealing\n", req, behind); + + ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq); + goto redo; + } + + /* now set the final wire data length */ + req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, 1, token.len,0); + RETURN(0); + +err_free: + if (!req->rq_pool) { + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + RETURN(rc); +} + +int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + struct gss_header *ghdr; + struct lustre_msg *msg = req->rq_repdata; + int msglen, pack_bulk, swabbed, rc; + __u32 major; + ENTRY; + + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(req->rq_ctx_init == 0); + LASSERT(msg); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + swabbed = req_capsule_rep_need_swab(&req->rq_pill); + + ghdr = gss_swab_header(msg, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + RETURN(-EPROTO); + } + + /* sanity checks */ + if (ghdr->gh_version != PTLRPC_GSS_VERSION) { + CERROR("gss version %u mismatch, expect %u\n", + ghdr->gh_version, PTLRPC_GSS_VERSION); + RETURN(-EPROTO); + } + + switch (ghdr->gh_proc) { + case PTLRPC_GSS_PROC_DATA: + pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK; + + if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){ + CERROR("%s bulk flag in reply\n", + req->rq_pack_bulk ? "missing" : "unexpected"); + RETURN(-EPROTO); + } + + if (swabbed) + gss_header_swabber(ghdr); + + /* use rq_repdata_len as buffer size, which assume unseal + * doesn't need extra memory space. 
for precise control, we'd + * better calculate out actual buffer size as + * (repbuf_len - offset - repdata_len) */ + major = gss_unseal_msg(gctx->gc_mechctx, msg, + &msglen, req->rq_repdata_len); + if (major != GSS_S_COMPLETE) { + CERROR("failed to unwrap reply: %x\n", major); + rc = -EPERM; + break; + } + + swabbed = __lustre_unpack_msg(msg, msglen); + if (swabbed < 0) { + CERROR("Failed to unpack after decryption\n"); + RETURN(-EPROTO); + } + + if (msg->lm_bufcount < 1) { + CERROR("Invalid reply buffer: empty\n"); + RETURN(-EPROTO); + } + + if (pack_bulk) { + if (msg->lm_bufcount < 2) { + CERROR("bufcount %u: missing bulk sec desc\n", + msg->lm_bufcount); + RETURN(-EPROTO); + } + + /* bulk checksum is the last segment */ + if (bulk_sec_desc_unpack(msg, msg->lm_bufcount - 1, + swabbed)) + RETURN(-EPROTO); + } + + req->rq_repmsg = lustre_msg_buf(msg, 0, 0); + req->rq_replen = msg->lm_buflens[0]; + + rc = 0; + break; + case PTLRPC_GSS_PROC_ERR: + if (req->rq_early) { + CERROR("server return error with early reply\n"); + rc = -EPROTO; + } else { + rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr); + } + break; + default: + CERROR("unexpected proc %d\n", ghdr->gh_proc); + rc = -EPERM; + } + + RETURN(rc); +} + +/********************************************* + * reverse context installation * + *********************************************/ + +static inline +int gss_install_rvs_svc_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx) +{ + return gss_svc_upcall_install_rvs_ctx(imp, gsec, gctx); +} + +/********************************************* + * GSS security APIs * + *********************************************/ +int gss_sec_create_common(struct gss_sec *gsec, + struct ptlrpc_sec_policy *policy, + struct obd_import *imp, + struct ptlrpc_svc_ctx *svcctx, + struct sptlrpc_flavor *sf) +{ + struct ptlrpc_sec *sec; + + LASSERT(imp); + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_GSS); + + gsec->gs_mech = lgss_subflavor_to_mech( + SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc)); + if (!gsec->gs_mech) { + CERROR("gss backend 0x%x not found\n", + SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc)); + return -EOPNOTSUPP; + } + + spin_lock_init(&gsec->gs_lock); + gsec->gs_rvs_hdl = 0ULL; + + /* initialize upper ptlrpc_sec */ + sec = &gsec->gs_base; + sec->ps_policy = policy; + atomic_set(&sec->ps_refcount, 0); + atomic_set(&sec->ps_nctx, 0); + sec->ps_id = sptlrpc_get_next_secid(); + sec->ps_flvr = *sf; + sec->ps_import = class_import_get(imp); + spin_lock_init(&sec->ps_lock); + INIT_LIST_HEAD(&sec->ps_gc_list); + sec->ps_sepol_mtime = ktime_set(0, 0); + sec->ps_sepol_checknext = ktime_set(0, 0); + sec->ps_sepol[0] = '\0'; + + if (!svcctx) { + sec->ps_gc_interval = GSS_GC_INTERVAL; + } else { + LASSERT(sec_is_reverse(sec)); + + /* never do gc on reverse sec */ + sec->ps_gc_interval = 0; + } + + if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV) + sptlrpc_enc_pool_add_user(); + + CDEBUG(D_SEC, "create %s%s@%p\n", (svcctx ? 
"reverse " : ""), + policy->sp_name, gsec); + return 0; +} + +void gss_sec_destroy_common(struct gss_sec *gsec) +{ + struct ptlrpc_sec *sec = &gsec->gs_base; + ENTRY; + + LASSERT(sec->ps_import); + LASSERT(atomic_read(&sec->ps_refcount) == 0); + LASSERT(atomic_read(&sec->ps_nctx) == 0); + + if (gsec->gs_mech) { + lgss_mech_put(gsec->gs_mech); + gsec->gs_mech = NULL; + } + + class_import_put(sec->ps_import); + + if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV) + sptlrpc_enc_pool_del_user(); + + EXIT; +} + +void gss_sec_kill(struct ptlrpc_sec *sec) +{ + sec->ps_dying = 1; +} + +int gss_cli_ctx_init_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_ctx_ops *ctxops, + struct vfs_cred *vcred) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + gctx->gc_win = 0; + atomic_set(&gctx->gc_seq, 0); + + INIT_HLIST_NODE(&ctx->cc_cache); + atomic_set(&ctx->cc_refcount, 0); + ctx->cc_sec = sec; + ctx->cc_ops = ctxops; + ctx->cc_expire = 0; + ctx->cc_flags = PTLRPC_CTX_NEW; + ctx->cc_vcred = *vcred; + spin_lock_init(&ctx->cc_lock); + INIT_LIST_HEAD(&ctx->cc_req_list); + INIT_LIST_HEAD(&ctx->cc_gc_chain); + + /* take a ref on belonging sec, balanced in ctx destroying */ + atomic_inc(&sec->ps_refcount); + /* statistic only */ + atomic_inc(&sec->ps_nctx); + + CDEBUG(D_SEC, "%s@%p: create ctx %p(%u->%s)\n", + sec->ps_policy->sp_name, ctx->cc_sec, + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + return 0; +} + +/* + * return value: + * 1: the context has been taken care of by someone else + * 0: proceed to really destroy the context locally + */ +int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(ctx->cc_sec == sec); + + /* + * remove UPTODATE flag of reverse ctx thus we won't send fini rpc, + * this is to avoid potential problems of client side reverse svc ctx + * be mis-destroyed in various recovery senarios. anyway client can + * manage its reverse ctx well by associating it with its buddy ctx. + */ + if (sec_is_reverse(sec)) + ctx->cc_flags &= ~PTLRPC_CTX_UPTODATE; + + if (gctx->gc_mechctx) { + /* the final context fini rpc will use this ctx too, and it's + * asynchronous which finished by request_out_callback(). so + * we add refcount, whoever drop finally drop the refcount to + * 0 should responsible for the rest of destroy. */ + atomic_inc(&ctx->cc_refcount); + + gss_do_ctx_fini_rpc(gctx); + gss_cli_ctx_finalize(gctx); + + if (!atomic_dec_and_test(&ctx->cc_refcount)) + return 1; + } + + if (sec_is_reverse(sec)) + CWARN("reverse sec %p: destroy ctx %p\n", + ctx->cc_sec, ctx); + else + CWARN("%s@%p: destroy ctx %p(%u->%s)\n", + sec->ps_policy->sp_name, ctx->cc_sec, + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + + return 0; +} + +static +int gss_alloc_reqbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, int msgsize) +{ + int bufsize, txtsize; + int bufcnt = 2; + __u32 buflens[5]; + ENTRY; + + /* + * on-wire data layout: + * - gss header + * - lustre message + * - user descriptor (optional) + * - bulk sec descriptor (optional) + * - signature (optional) + * - svc == NULL: NULL + * - svc == AUTH: signature of gss header + * - svc == INTG: signature of all above + * + * if this is context negotiation, reserver fixed space + * at the last (signature) segment regardless of svc mode. 
+ */ + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + txtsize = buflens[0]; + + buflens[1] = msgsize; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_udesc) { + buflens[bufcnt] = sptlrpc_current_user_desc_size(); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_pack_bulk) { + buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 0, req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_ctx_init) + buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN; + else if (svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0); + + bufsize = lustre_msg_size_v2(bufcnt, buflens); + + if (!req->rq_reqbuf) { + bufsize = size_roundup_power2(bufsize); + + OBD_ALLOC_LARGE(req->rq_reqbuf, bufsize); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); + + req->rq_reqbuf_len = bufsize; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= bufsize); + memset(req->rq_reqbuf, 0, bufsize); + } + + lustre_init_msg_v2(req->rq_reqbuf, bufcnt, buflens, NULL); + req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, msgsize); + LASSERT(req->rq_reqmsg); + + /* pack user desc here, later we might leave current user's process */ + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_reqbuf, 2); + + RETURN(0); +} + +static +int gss_alloc_reqbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 ibuflens[3], wbuflens[2]; + int ibufcnt; + int clearsize, wiresize; + ENTRY; + + LASSERT(req->rq_clrbuf == NULL); + LASSERT(req->rq_clrbuf_len == 0); + + /* Inner (clear) buffers + * - lustre message + * - user descriptor (optional) + * - bulk checksum (optional) + */ + ibufcnt = 1; + ibuflens[0] = msgsize; + + if (req->rq_pack_udesc) + ibuflens[ibufcnt++] = sptlrpc_current_user_desc_size(); + if (req->rq_pack_bulk) + ibuflens[ibufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, 0, + req->rq_bulk_read); + + clearsize = lustre_msg_size_v2(ibufcnt, ibuflens); + /* to allow append padding during encryption */ + clearsize += GSS_MAX_CIPHER_BLOCK; + + /* Wrapper (wire) buffers + * - gss header + * - cipher text + */ + wbuflens[0] = PTLRPC_GSS_HEADER_SIZE; + wbuflens[1] = gss_cli_payload(req->rq_cli_ctx, clearsize, 1); + wiresize = lustre_msg_size_v2(2, wbuflens); + + if (req->rq_pool) { + /* rq_reqbuf is preallocated */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len >= wiresize); + + memset(req->rq_reqbuf, 0, req->rq_reqbuf_len); + + /* if the pre-allocated buffer is big enough, we just pack + * both clear buf & request buf in it, to avoid more alloc. 
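+		 * (added note: the clear buffer is then placed at offset
+		 * 'wiresize' inside the pre-allocated rq_reqbuf, immediately
+		 * after the space reserved for the wire data, as the
+		 * assignment below shows)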
*/ + if (clearsize + wiresize <= req->rq_reqbuf_len) { + req->rq_clrbuf = + (void *) (((char *) req->rq_reqbuf) + wiresize); + } else { + CWARN("pre-allocated buf size %d is not enough for " + "both clear (%d) and cipher (%d) text, proceed " + "with extra allocation\n", req->rq_reqbuf_len, + clearsize, wiresize); + } + } + + if (!req->rq_clrbuf) { + clearsize = size_roundup_power2(clearsize); + + OBD_ALLOC_LARGE(req->rq_clrbuf, clearsize); + if (!req->rq_clrbuf) + RETURN(-ENOMEM); + } + req->rq_clrbuf_len = clearsize; + + lustre_init_msg_v2(req->rq_clrbuf, ibufcnt, ibuflens, NULL); + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, msgsize); + + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_clrbuf, 1); + + RETURN(0); +} + +/* + * NOTE: any change of request buffer allocation should also consider + * changing enlarge_reqbuf() series functions. + */ +int gss_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + + LASSERT(!req->rq_pack_bulk || + (req->rq_bulk_read || req->rq_bulk_write)); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_alloc_reqbuf_intg(sec, req, svc, msgsize); + case SPTLRPC_SVC_PRIV: + return gss_alloc_reqbuf_priv(sec, req, msgsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +void gss_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + int privacy; + ENTRY; + + LASSERT(!req->rq_pool || req->rq_reqbuf); + privacy = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV; + + if (!req->rq_clrbuf) + goto release_reqbuf; + + /* release clear buffer */ + LASSERT(privacy); + LASSERT(req->rq_clrbuf_len); + + if (req->rq_pool == NULL || + req->rq_clrbuf < req->rq_reqbuf || + (char *) req->rq_clrbuf >= + (char *) req->rq_reqbuf + req->rq_reqbuf_len) + OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len); + + req->rq_clrbuf = NULL; + req->rq_clrbuf_len = 0; + +release_reqbuf: + if (!req->rq_pool && req->rq_reqbuf) { + LASSERT(req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + + EXIT; +} + +static int do_alloc_repbuf(struct ptlrpc_request *req, int bufsize) +{ + bufsize = size_roundup_power2(bufsize); + + OBD_ALLOC_LARGE(req->rq_repbuf, bufsize); + if (!req->rq_repbuf) + return -ENOMEM; + + req->rq_repbuf_len = bufsize; + return 0; +} + +static +int gss_alloc_repbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, int msgsize) +{ + int txtsize; + __u32 buflens[4]; + int bufcnt = 2; + int alloc_size; + + /* + * on-wire data layout: + * - gss header + * - lustre message + * - bulk sec descriptor (optional) + * - signature (optional) + * - svc == NULL: NULL + * - svc == AUTH: signature of gss header + * - svc == INTG: signature of all above + * + * if this is context negotiation, reserver fixed space + * at the last (signature) segment regardless of svc mode. 
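+	 *
+	 * Added note: this mirrors the request layout used by
+	 * gss_alloc_reqbuf_intg() except that a reply carries no user
+	 * descriptor; the allocation below also reserves
+	 * gss_at_reply_off_integ extra bytes of headroom for an early reply.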
+ */ + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + txtsize = buflens[0]; + + buflens[1] = msgsize; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_bulk) { + buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 1, req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_ctx_init) + buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN; + else if (svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0); + + alloc_size = lustre_msg_size_v2(bufcnt, buflens); + + /* add space for early reply */ + alloc_size += gss_at_reply_off_integ; + + return do_alloc_repbuf(req, alloc_size); +} + +static +int gss_alloc_repbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int txtsize; + __u32 buflens[2]; + int bufcnt; + int alloc_size; + + /* inner buffers */ + bufcnt = 1; + buflens[0] = msgsize; + + if (req->rq_pack_bulk) + buflens[bufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 1, req->rq_bulk_read); + txtsize = lustre_msg_size_v2(bufcnt, buflens); + txtsize += GSS_MAX_CIPHER_BLOCK; + + /* wrapper buffers */ + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(req->rq_cli_ctx, txtsize, 1); + + alloc_size = lustre_msg_size_v2(bufcnt, buflens); + /* add space for early reply */ + alloc_size += gss_at_reply_off_priv; + + return do_alloc_repbuf(req, alloc_size); +} + +int gss_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + ENTRY; + + LASSERT(!req->rq_pack_bulk || + (req->rq_bulk_read || req->rq_bulk_write)); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_alloc_repbuf_intg(sec, req, svc, msgsize); + case SPTLRPC_SVC_PRIV: + return gss_alloc_repbuf_priv(sec, req, msgsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +void gss_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + req->rq_repdata = NULL; + req->rq_repdata_len = 0; +} + +static int get_enlarged_msgsize(struct lustre_msg *msg, + int segment, int newsize) +{ + int save, newmsg_size; + + LASSERT(newsize >= msg->lm_buflens[segment]); + + save = msg->lm_buflens[segment]; + msg->lm_buflens[segment] = newsize; + newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + msg->lm_buflens[segment] = save; + + return newmsg_size; +} + +static int get_enlarged_msgsize2(struct lustre_msg *msg, + int segment1, int newsize1, + int segment2, int newsize2) +{ + int save1, save2, newmsg_size; + + LASSERT(newsize1 >= msg->lm_buflens[segment1]); + LASSERT(newsize2 >= msg->lm_buflens[segment2]); + + save1 = msg->lm_buflens[segment1]; + save2 = msg->lm_buflens[segment2]; + msg->lm_buflens[segment1] = newsize1; + msg->lm_buflens[segment2] = newsize2; + newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + msg->lm_buflens[segment1] = save1; + msg->lm_buflens[segment2] = save2; + + return newmsg_size; +} + +static +int gss_enlarge_reqbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + int txtsize, sigsize = 0, i; + int newmsg_size, newbuf_size; + + /* + * gss header is at seg 0; + * embedded msg is at seg 1; + * signature (if any) is at the 
last seg + */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len > req->rq_reqlen); + LASSERT(req->rq_reqbuf->lm_bufcount >= 2); + LASSERT(lustre_msg_buf(req->rq_reqbuf, 1, 0) == req->rq_reqmsg); + + /* 1. compute new embedded msg size */ + newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize); + LASSERT(newmsg_size >= req->rq_reqbuf->lm_buflens[1]); + + /* 2. compute new wrapper msg size */ + if (svc == SPTLRPC_SVC_NULL) { + /* no signature, get size directly */ + newbuf_size = get_enlarged_msgsize(req->rq_reqbuf, + 1, newmsg_size); + } else { + txtsize = req->rq_reqbuf->lm_buflens[0]; + + if (svc == SPTLRPC_SVC_INTG) { + for (i = 1; i < req->rq_reqbuf->lm_bufcount; i++) + txtsize += req->rq_reqbuf->lm_buflens[i]; + txtsize += newmsg_size - req->rq_reqbuf->lm_buflens[1]; + } + + sigsize = gss_cli_payload(req->rq_cli_ctx, txtsize, 0); + LASSERT(sigsize >= msg_last_seglen(req->rq_reqbuf)); + + newbuf_size = get_enlarged_msgsize2( + req->rq_reqbuf, + 1, newmsg_size, + msg_last_segidx(req->rq_reqbuf), + sigsize); + } + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size); + + if (req->rq_reqbuf_len < newbuf_size) { + newbuf_size = size_roundup_power2(newbuf_size); + + OBD_ALLOC_LARGE(newbuf, newbuf_size); + if (newbuf == NULL) + RETURN(-ENOMEM); + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + + memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = newbuf; + req->rq_reqbuf_len = newbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + /* do enlargement, from wrapper to embedded, from end to begin */ + if (svc != SPTLRPC_SVC_NULL) + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, + msg_last_segidx(req->rq_reqbuf), + sigsize); + + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, 1, newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + + req->rq_reqlen = newmsg_size; + RETURN(0); +} + +static +int gss_enlarge_reqbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newclrbuf; + int newmsg_size, newclrbuf_size, newcipbuf_size; + __u32 buflens[3]; + + /* + * embedded msg is at seg 0 of clear buffer; + * cipher text is at seg 2 of cipher buffer; + */ + LASSERT(req->rq_pool || + (req->rq_reqbuf == NULL && req->rq_reqbuf_len == 0)); + LASSERT(req->rq_reqbuf == NULL || + (req->rq_pool && req->rq_reqbuf->lm_bufcount == 3)); + LASSERT(req->rq_clrbuf); + LASSERT(req->rq_clrbuf_len > req->rq_reqlen); + LASSERT(lustre_msg_buf(req->rq_clrbuf, 0, 0) == req->rq_reqmsg); + + /* compute new embedded msg size */ + newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize); + + /* compute new clear buffer size */ + newclrbuf_size = get_enlarged_msgsize(req->rq_clrbuf, 0, newmsg_size); + newclrbuf_size += GSS_MAX_CIPHER_BLOCK; + + /* compute new cipher buffer size */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(req->rq_cli_ctx, buflens[0], 0); + buflens[2] = gss_cli_payload(req->rq_cli_ctx, newclrbuf_size, 1); + newcipbuf_size = 
lustre_msg_size_v2(3, buflens); + + /* handle the case that we put both clear buf and cipher buf into + * pre-allocated single buffer. */ + if (unlikely(req->rq_pool) && + req->rq_clrbuf >= req->rq_reqbuf && + (char *) req->rq_clrbuf < + (char *) req->rq_reqbuf + req->rq_reqbuf_len) { + /* it couldn't be better we still fit into the + * pre-allocated buffer. */ + if (newclrbuf_size + newcipbuf_size <= req->rq_reqbuf_len) { + void *src, *dst; + + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + /* move clear text backward. */ + src = req->rq_clrbuf; + dst = (char *) req->rq_reqbuf + newcipbuf_size; + + memmove(dst, src, req->rq_clrbuf_len); + + req->rq_clrbuf = (struct lustre_msg *) dst; + req->rq_clrbuf_len = newclrbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } else { + /* sadly we have to split out the clear buffer */ + LASSERT(req->rq_reqbuf_len >= newcipbuf_size); + LASSERT(req->rq_clrbuf_len < newclrbuf_size); + } + } + + if (req->rq_clrbuf_len < newclrbuf_size) { + newclrbuf_size = size_roundup_power2(newclrbuf_size); + + OBD_ALLOC_LARGE(newclrbuf, newclrbuf_size); + if (newclrbuf == NULL) + RETURN(-ENOMEM); + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + + memcpy(newclrbuf, req->rq_clrbuf, req->rq_clrbuf_len); + + if (req->rq_reqbuf == NULL || + req->rq_clrbuf < req->rq_reqbuf || + (char *) req->rq_clrbuf >= + (char *) req->rq_reqbuf + req->rq_reqbuf_len) { + OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len); + } + + req->rq_clrbuf = newclrbuf; + req->rq_clrbuf_len = newclrbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_clrbuf, 0, newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + req->rq_reqlen = newmsg_size; + + RETURN(0); +} + +int gss_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + + LASSERT(!req->rq_ctx_init && !req->rq_ctx_fini); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_enlarge_reqbuf_intg(sec, req, svc, segment, newsize); + case SPTLRPC_SVC_PRIV: + return gss_enlarge_reqbuf_priv(sec, req, segment, newsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +int gss_sec_install_rctx(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx) +{ + struct gss_sec *gsec; + struct gss_cli_ctx *gctx; + int rc; + + gsec = container_of(sec, struct gss_sec, gs_base); + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + rc = gss_install_rvs_svc_ctx(imp, gsec, gctx); + return rc; +} + +/******************************************** + * server side API * + ********************************************/ + +static inline +int gss_svc_reqctx_is_special(struct gss_svc_reqctx *grctx) +{ + LASSERT(grctx); + return (grctx->src_init || grctx->src_init_continue || + grctx->src_err_notify); +} + +static +void gss_svc_reqctx_free(struct gss_svc_reqctx *grctx) +{ + if (grctx->src_ctx) + 
gss_svc_upcall_put_ctx(grctx->src_ctx); + + sptlrpc_policy_put(grctx->src_base.sc_policy); + OBD_FREE_PTR(grctx); +} + +static inline +void gss_svc_reqctx_addref(struct gss_svc_reqctx *grctx) +{ + LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0); + atomic_inc(&grctx->src_base.sc_refcount); +} + +static inline +void gss_svc_reqctx_decref(struct gss_svc_reqctx *grctx) +{ + LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0); + + if (atomic_dec_and_test(&grctx->src_base.sc_refcount)) + gss_svc_reqctx_free(grctx); +} + +static +int gss_svc_sign(struct ptlrpc_request *req, + struct ptlrpc_reply_state *rs, + struct gss_svc_reqctx *grctx, + __u32 svc) +{ + __u32 flags = 0; + int rc; + ENTRY; + + LASSERT(rs->rs_msg == lustre_msg_buf(rs->rs_repbuf, 1, 0)); + + /* embedded lustre_msg might have been shrunk */ + if (req->rq_replen != rs->rs_repbuf->lm_buflens[1]) + lustre_shrink_msg(rs->rs_repbuf, 1, req->rq_replen, 1); + + if (req->rq_pack_bulk) + flags |= LUSTRE_GSS_PACK_BULK; + + rc = gss_sign_msg(rs->rs_repbuf, grctx->src_ctx->gsc_mechctx, + LUSTRE_SP_ANY, flags, PTLRPC_GSS_PROC_DATA, + grctx->src_wirectx.gw_seq, svc, NULL); + if (rc < 0) + RETURN(rc); + + rs->rs_repdata_len = rc; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = gss_at_reply_off_integ; + else + req->rq_reply_off = 0; + } else { + if (svc == SPTLRPC_SVC_NULL) + rs->rs_repbuf->lm_cksum = crc32_le(!(__u32) 0, + lustre_msg_buf(rs->rs_repbuf, 1, 0), + lustre_msg_buflen(rs->rs_repbuf, 1)); + req->rq_reply_off = 0; + } + + RETURN(0); +} + +int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct ptlrpc_reply_state *rs; + struct gss_err_header *ghdr; + int replen = sizeof(struct ptlrpc_body); + int rc; + ENTRY; + + //if (OBD_FAIL_CHECK_ORSET(OBD_FAIL_SVCGSS_ERR_NOTIFY, OBD_FAIL_ONCE)) + // RETURN(-EINVAL); + + grctx->src_err_notify = 1; + grctx->src_reserve_len = 0; + + rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0); + if (rc) { + CERROR("could not pack reply, err %d\n", rc); + RETURN(rc); + } + + /* gss hdr */ + rs = req->rq_reply_state; + LASSERT(rs->rs_repbuf->lm_buflens[1] >= sizeof(*ghdr)); + ghdr = lustre_msg_buf(rs->rs_repbuf, 0, 0); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_ERR; + ghdr->gh_major = major; + ghdr->gh_minor = minor; + ghdr->gh_handle.len = 0; /* fake context handle */ + + rs->rs_repdata_len = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount, + rs->rs_repbuf->lm_buflens); + + CDEBUG(D_SEC, "prepare gss error notify(0x%x/0x%x) to %s\n", + major, minor, libcfs_nid2str(req->rq_peer.nid)); + RETURN(0); +} + +static +int gss_svc_handle_init(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct lustre_msg *reqbuf = req->rq_reqbuf; + struct obd_uuid *uuid; + struct obd_device *target; + rawobj_t uuid_obj, rvs_hdl, in_token; + __u32 lustre_svc; + __u32 *secdata, seclen; + int swabbed, rc; + ENTRY; + + CDEBUG(D_SEC, "processing gss init(%d) request from %s\n", gw->gw_proc, + libcfs_nid2str(req->rq_peer.nid)); + + req->rq_ctx_init = 1; + + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { + CERROR("unexpected bulk flag\n"); + RETURN(SECSVC_DROP); + } + + if (gw->gw_proc == PTLRPC_GSS_PROC_INIT && gw->gw_handle.len != 0) { + CERROR("proc %u: invalid handle length %u\n", + gw->gw_proc, gw->gw_handle.len); + 
RETURN(SECSVC_DROP); + } + + if (reqbuf->lm_bufcount < 3 || reqbuf->lm_bufcount > 4){ + CERROR("Invalid bufcount %d\n", reqbuf->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = req_capsule_req_need_swab(&req->rq_pill); + + /* ctx initiate payload is in last segment */ + secdata = lustre_msg_buf(reqbuf, reqbuf->lm_bufcount - 1, 0); + seclen = reqbuf->lm_buflens[reqbuf->lm_bufcount - 1]; + + if (seclen < 4 + 4) { + CERROR("sec size %d too small\n", seclen); + RETURN(SECSVC_DROP); + } + + /* lustre svc type */ + lustre_svc = le32_to_cpu(*secdata++); + seclen -= 4; + + /* extract target uuid, note this code is somewhat fragile + * because touched internal structure of obd_uuid */ + if (rawobj_extract(&uuid_obj, &secdata, &seclen)) { + CERROR("failed to extract target uuid\n"); + RETURN(SECSVC_DROP); + } + uuid_obj.data[uuid_obj.len - 1] = '\0'; + + uuid = (struct obd_uuid *) uuid_obj.data; + target = class_uuid2obd(uuid); + if (!target || target->obd_stopping || !target->obd_set_up) { + CERROR("target '%s' is not available for context init (%s)\n", + uuid->uuid, target == NULL ? "no target" : + (target->obd_stopping ? "stopping" : "not set up")); + RETURN(SECSVC_DROP); + } + + /* extract reverse handle */ + if (rawobj_extract(&rvs_hdl, &secdata, &seclen)) { + CERROR("failed extract reverse handle\n"); + RETURN(SECSVC_DROP); + } + + /* extract token */ + if (rawobj_extract(&in_token, &secdata, &seclen)) { + CERROR("can't extract token\n"); + RETURN(SECSVC_DROP); + } + + rc = gss_svc_upcall_handle_init(req, grctx, gw, target, lustre_svc, + &rvs_hdl, &in_token); + if (rc != SECSVC_OK) + RETURN(rc); + + if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss || + grctx->src_ctx->gsc_usr_root) + CWARN("create svc ctx %p: user from %s authenticated as %s\n", + grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid), + grctx->src_ctx->gsc_usr_root ? "root" : + (grctx->src_ctx->gsc_usr_mds ? "mds" : + (grctx->src_ctx->gsc_usr_oss ? "oss" : "null"))); + else + CWARN("create svc ctx %p: accept user %u from %s\n", + grctx->src_ctx, grctx->src_ctx->gsc_uid, + libcfs_nid2str(req->rq_peer.nid)); + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (reqbuf->lm_bufcount < 4) { + CERROR("missing user descriptor\n"); + RETURN(SECSVC_DROP); + } + if (sptlrpc_unpack_user_desc(reqbuf, 2, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(SECSVC_DROP); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(reqbuf, 2, 0); + } + + req->rq_reqmsg = lustre_msg_buf(reqbuf, 1, 0); + req->rq_reqlen = lustre_msg_buflen(reqbuf, 1); + + RETURN(rc); +} + +/* + * last segment must be the gss signature. 
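+ *
+ * Added summary: the request sequence number is checked for replay both
+ * before and after signature verification, after which the optional
+ * user descriptor and bulk security descriptor segments are unpacked.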
+ */ +static +int gss_svc_verify_request(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + __u32 *major) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + struct lustre_msg *msg = req->rq_reqbuf; + int offset = 2; + int swabbed; + ENTRY; + + *major = GSS_S_COMPLETE; + + if (msg->lm_bufcount < 2) { + CERROR("Too few segments (%u) in request\n", msg->lm_bufcount); + RETURN(-EINVAL); + } + + if (gw->gw_svc == SPTLRPC_SVC_NULL) + goto verified; + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) { + CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + *major = gss_verify_msg(msg, gctx->gsc_mechctx, gw->gw_svc); + if (*major != GSS_S_COMPLETE) { + CERROR("failed to verify request: %x\n", *major); + RETURN(-EACCES); + } + + if (gctx->gsc_reverse == 0 && + gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) { + CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + +verified: + swabbed = req_capsule_req_need_swab(&req->rq_pill); + + /* user descriptor */ + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (msg->lm_bufcount < (offset + 1)) { + CERROR("no user desc included\n"); + RETURN(-EINVAL); + } + + if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(-EINVAL); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, offset, 0); + offset++; + } + + /* check bulk_sec_desc data */ + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { + if (msg->lm_bufcount < (offset + 1)) { + CERROR("missing bulk sec descriptor\n"); + RETURN(-EINVAL); + } + + if (bulk_sec_desc_unpack(msg, offset, swabbed)) + RETURN(-EINVAL); + + req->rq_pack_bulk = 1; + grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0); + grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset); + } + + req->rq_reqmsg = lustre_msg_buf(msg, 1, 0); + req->rq_reqlen = msg->lm_buflens[1]; + RETURN(0); +} + +static +int gss_svc_unseal_request(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + __u32 *major) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + struct lustre_msg *msg = req->rq_reqbuf; + int swabbed, msglen, offset = 1; + ENTRY; + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) { + CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + *major = gss_unseal_msg(gctx->gsc_mechctx, msg, + &msglen, req->rq_reqdata_len); + if (*major != GSS_S_COMPLETE) { + CERROR("failed to unwrap request: %x\n", *major); + RETURN(-EACCES); + } + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) { + CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + swabbed = __lustre_unpack_msg(msg, msglen); + if (swabbed < 0) { + CERROR("Failed to unpack after decryption\n"); + RETURN(-EINVAL); + } + req->rq_reqdata_len = msglen; + + if (msg->lm_bufcount < 1) { + CERROR("Invalid buffer: is empty\n"); + RETURN(-EINVAL); + } + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (msg->lm_bufcount < offset + 1) { + CERROR("no user descriptor included\n"); + RETURN(-EINVAL); + } + + if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(-EINVAL); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, offset, 0); + offset++; + } + + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { 
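+		/* the bulk sec desc sits at the current 'offset' segment,
+		 * right after the user descriptor if one was packed */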
+ if (msg->lm_bufcount < offset + 1) { + CERROR("no bulk checksum included\n"); + RETURN(-EINVAL); + } + + if (bulk_sec_desc_unpack(msg, offset, swabbed)) + RETURN(-EINVAL); + + req->rq_pack_bulk = 1; + grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0); + grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset); + } + + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 0, 0); + req->rq_reqlen = req->rq_reqbuf->lm_buflens[0]; + RETURN(0); +} + +static +int gss_svc_handle_data(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + __u32 major = 0; + int rc = 0; + ENTRY; + + grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw); + if (!grctx->src_ctx) { + major = GSS_S_NO_CONTEXT; + goto error; + } + + switch (gw->gw_svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + rc = gss_svc_verify_request(req, grctx, gw, &major); + break; + case SPTLRPC_SVC_PRIV: + rc = gss_svc_unseal_request(req, grctx, gw, &major); + break; + default: + CERROR("unsupported gss service %d\n", gw->gw_svc); + rc = -EINVAL; + } + + if (rc == 0) + RETURN(SECSVC_OK); + + CERROR("svc %u failed: major 0x%08x: req xid %llu ctx %p idx " + "%#llx(%u->%s)\n", gw->gw_svc, major, req->rq_xid, + grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle), + grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid)); +error: + /* we only notify client in case of NO_CONTEXT/BAD_SIG, which + * might happen after server reboot, to allow recovery. */ + if ((major == GSS_S_NO_CONTEXT || major == GSS_S_BAD_SIG) && + gss_pack_err_notify(req, major, 0) == 0) + RETURN(SECSVC_COMPLETE); + + RETURN(SECSVC_DROP); +} + +static +int gss_svc_handle_destroy(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + __u32 major; + ENTRY; + + req->rq_ctx_fini = 1; + req->rq_no_reply = 1; + + grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw); + if (!grctx->src_ctx) { + CDEBUG(D_SEC, "invalid gss context handle for destroy.\n"); + RETURN(SECSVC_DROP); + } + + if (gw->gw_svc != SPTLRPC_SVC_INTG) { + CERROR("svc %u is not supported in destroy.\n", gw->gw_svc); + RETURN(SECSVC_DROP); + } + + if (gss_svc_verify_request(req, grctx, gw, &major)) + RETURN(SECSVC_DROP); + + CWARN("destroy svc ctx %p idx %#llx (%u->%s)\n", + grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle), + grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid)); + + gss_svc_upcall_destroy_ctx(grctx->src_ctx); + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (req->rq_reqbuf->lm_bufcount < 4) { + CERROR("missing user descriptor, ignore it\n"); + RETURN(SECSVC_OK); + } + if (sptlrpc_unpack_user_desc(req->rq_reqbuf, 2, + req_capsule_req_need_swab(&req->rq_pill))) { + CERROR("Mal-formed user descriptor, ignore it\n"); + RETURN(SECSVC_OK); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(req->rq_reqbuf, 2, 0); + } + + RETURN(SECSVC_OK); +} + +int gss_svc_accept(struct ptlrpc_sec_policy *policy, struct ptlrpc_request *req) +{ + struct gss_header *ghdr; + struct gss_svc_reqctx *grctx; + struct gss_wire_ctx *gw; + int swabbed, rc; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_svc_ctx == NULL); + + if (req->rq_reqbuf->lm_bufcount < 2) { + CERROR("buf count only %d\n", req->rq_reqbuf->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = req_capsule_req_need_swab(&req->rq_pill); + + ghdr = gss_swab_header(req->rq_reqbuf, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + 
RETURN(SECSVC_DROP); + } + + /* sanity checks */ + if (ghdr->gh_version != PTLRPC_GSS_VERSION) { + CERROR("gss version %u, expect %u\n", ghdr->gh_version, + PTLRPC_GSS_VERSION); + RETURN(SECSVC_DROP); + } + + req->rq_sp_from = ghdr->gh_sp; + + /* alloc grctx data */ + OBD_ALLOC_PTR(grctx); + if (!grctx) + RETURN(SECSVC_DROP); + + grctx->src_base.sc_policy = sptlrpc_policy_get(policy); + atomic_set(&grctx->src_base.sc_refcount, 1); + req->rq_svc_ctx = &grctx->src_base; + gw = &grctx->src_wirectx; + + /* save wire context */ + gw->gw_flags = ghdr->gh_flags; + gw->gw_proc = ghdr->gh_proc; + gw->gw_seq = ghdr->gh_seq; + gw->gw_svc = ghdr->gh_svc; + rawobj_from_netobj(&gw->gw_handle, &ghdr->gh_handle); + + /* keep original wire header which subject to checksum verification */ + if (swabbed) + gss_header_swabber(ghdr); + + switch(ghdr->gh_proc) { + case PTLRPC_GSS_PROC_INIT: + case PTLRPC_GSS_PROC_CONTINUE_INIT: + rc = gss_svc_handle_init(req, gw); + break; + case PTLRPC_GSS_PROC_DATA: + rc = gss_svc_handle_data(req, gw); + break; + case PTLRPC_GSS_PROC_DESTROY: + rc = gss_svc_handle_destroy(req, gw); + break; + default: + CERROR("unknown proc %u\n", gw->gw_proc); + rc = SECSVC_DROP; + break; + } + + switch (rc) { + case SECSVC_OK: + LASSERT (grctx->src_ctx); + + req->rq_auth_gss = 1; + req->rq_auth_usr_mdt = grctx->src_ctx->gsc_usr_mds; + req->rq_auth_usr_ost = grctx->src_ctx->gsc_usr_oss; + req->rq_auth_usr_root = grctx->src_ctx->gsc_usr_root; + req->rq_auth_uid = grctx->src_ctx->gsc_uid; + req->rq_auth_mapped_uid = grctx->src_ctx->gsc_mapped_uid; + break; + case SECSVC_COMPLETE: + break; + case SECSVC_DROP: + gss_svc_reqctx_free(grctx); + req->rq_svc_ctx = NULL; + break; + } + + RETURN(rc); +} + +void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx) +{ + struct gss_svc_reqctx *grctx; + ENTRY; + + if (svc_ctx == NULL) { + EXIT; + return; + } + + grctx = gss_svc_ctx2reqctx(svc_ctx); + + CWARN("gss svc invalidate ctx %p(%u)\n", + grctx->src_ctx, grctx->src_ctx->gsc_uid); + gss_svc_upcall_destroy_ctx(grctx->src_ctx); + + EXIT; +} + +static inline +int gss_svc_payload(struct gss_svc_reqctx *grctx, int early, + int msgsize, int privacy) +{ + /* we should treat early reply normally, but which is actually sharing + * the same ctx with original request, so in this case we should + * ignore the special ctx's special flags */ + if (early == 0 && gss_svc_reqctx_is_special(grctx)) + return grctx->src_reserve_len; + + return gss_mech_payload(NULL, msgsize, privacy); +} + +static int gss_svc_bulk_payload(struct gss_svc_ctx *gctx, + struct sptlrpc_flavor *flvr, + int read) +{ + int payload = sizeof(struct ptlrpc_bulk_sec_desc); + + if (read) { + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_NULL: + break; + case SPTLRPC_BULK_SVC_INTG: + payload += gss_mech_payload(NULL, 0, 0); + break; + case SPTLRPC_BULK_SVC_PRIV: + payload += gss_mech_payload(NULL, 0, 1); + break; + case SPTLRPC_BULK_SVC_AUTH: + default: + LBUG(); + } + } + + return payload; +} + +int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_reply_state *rs; + int early, privacy, svc, bsd_off = 0; + __u32 ibuflens[2], buflens[4]; + int ibufcnt = 0, bufcnt; + int txtsize, wmsg_size, rs_size; + ENTRY; + + LASSERT(msglen % 8 == 0); + + if (req->rq_pack_bulk && !req->rq_bulk_read && !req->rq_bulk_write) { + CERROR("client request bulk sec on non-bulk rpc\n"); + RETURN(-EPROTO); + } + + svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + early = (req->rq_packed_final == 
0); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + if (!early && gss_svc_reqctx_is_special(grctx)) + privacy = 0; + else + privacy = (svc == SPTLRPC_SVC_PRIV); + + if (privacy) { + /* inner clear buffers */ + ibufcnt = 1; + ibuflens[0] = msglen; + + if (req->rq_pack_bulk) { + LASSERT(grctx->src_reqbsd); + + bsd_off = ibufcnt; + ibuflens[ibufcnt++] = gss_svc_bulk_payload( + grctx->src_ctx, + &req->rq_flvr, + req->rq_bulk_read); + } + + txtsize = lustre_msg_size_v2(ibufcnt, ibuflens); + txtsize += GSS_MAX_CIPHER_BLOCK; + + /* wrapper buffer */ + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_svc_payload(grctx, early, txtsize, 1); + } else { + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = msglen; + + txtsize = buflens[0]; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_bulk) { + LASSERT(grctx->src_reqbsd); + + bsd_off = bufcnt; + buflens[bufcnt] = gss_svc_bulk_payload( + grctx->src_ctx, + &req->rq_flvr, + req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if ((!early && gss_svc_reqctx_is_special(grctx)) || + svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_svc_payload(grctx, early, + txtsize, 0); + } + + wmsg_size = lustre_msg_size_v2(bufcnt, buflens); + + rs_size = sizeof(*rs) + wmsg_size; + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + RETURN(-ENOMEM); + + rs->rs_size = rs_size; + } + + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = wmsg_size; + + /* initialize the buffer */ + if (privacy) { + lustre_init_msg_v2(rs->rs_repbuf, ibufcnt, ibuflens, NULL); + rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 0, msglen); + } else { + lustre_init_msg_v2(rs->rs_repbuf, bufcnt, buflens, NULL); + rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 1, 0); + } + + if (bsd_off) { + grctx->src_repbsd = lustre_msg_buf(rs->rs_repbuf, bsd_off, 0); + grctx->src_repbsd_size = lustre_msg_buflen(rs->rs_repbuf, + bsd_off); + } + + gss_svc_reqctx_addref(grctx); + rs->rs_svc_ctx = req->rq_svc_ctx; + + LASSERT(rs->rs_msg); + req->rq_reply_state = rs; + RETURN(0); +} + +static int gss_svc_seal(struct ptlrpc_request *req, + struct ptlrpc_reply_state *rs, + struct gss_svc_reqctx *grctx) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + rawobj_t hdrobj, msgobj, token; + struct gss_header *ghdr; + __u8 *token_buf; + int token_buflen; + __u32 buflens[2], major; + int msglen, rc; + ENTRY; + + /* get clear data length. 
note embedded lustre_msg might + * have been shrunk */ + if (req->rq_replen != lustre_msg_buflen(rs->rs_repbuf, 0)) + msglen = lustre_shrink_msg(rs->rs_repbuf, 0, req->rq_replen, 1); + else + msglen = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount, + rs->rs_repbuf->lm_buflens); + + /* temporarily use tail of buffer to hold gss header data */ + LASSERT(msglen + PTLRPC_GSS_HEADER_SIZE <= rs->rs_repbuf_len); + ghdr = (struct gss_header *) ((char *) rs->rs_repbuf + + rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = LUSTRE_SP_ANY; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_DATA; + ghdr->gh_seq = grctx->src_wirectx.gw_seq; + ghdr->gh_svc = SPTLRPC_SVC_PRIV; + ghdr->gh_handle.len = 0; + if (req->rq_pack_bulk) + ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK; + + /* allocate temporary cipher buffer */ + token_buflen = gss_mech_payload(gctx->gsc_mechctx, msglen, 1); + OBD_ALLOC_LARGE(token_buf, token_buflen); + if (token_buf == NULL) + RETURN(-ENOMEM); + + hdrobj.len = PTLRPC_GSS_HEADER_SIZE; + hdrobj.data = (__u8 *) ghdr; + msgobj.len = msglen; + msgobj.data = (__u8 *) rs->rs_repbuf; + token.len = token_buflen; + token.data = token_buf; + + major = lgss_wrap(gctx->gsc_mechctx, &hdrobj, &msgobj, + rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE, &token); + if (major != GSS_S_COMPLETE) { + CERROR("wrap message error: %08x\n", major); + GOTO(out_free, rc = -EPERM); + } + LASSERT(token.len <= token_buflen); + + /* we are about to override data at rs->rs_repbuf, nullify pointers + * to which to catch further illegal usage. */ + if (req->rq_pack_bulk) { + grctx->src_repbsd = NULL; + grctx->src_repbsd_size = 0; + } + + /* now fill the actual wire data + * - gss header + * - gss token + */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = token.len; + + rs->rs_repdata_len = lustre_msg_size_v2(2, buflens); + LASSERT(rs->rs_repdata_len <= rs->rs_repbuf_len); + + lustre_init_msg_v2(rs->rs_repbuf, 2, buflens, NULL); + rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + memcpy(lustre_msg_buf(rs->rs_repbuf, 0, 0), ghdr, + PTLRPC_GSS_HEADER_SIZE); + memcpy(lustre_msg_buf(rs->rs_repbuf, 1, 0), token.data, token.len); + + /* reply offset */ + if (req->rq_packed_final && + (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) + req->rq_reply_off = gss_at_reply_off_priv; + else + req->rq_reply_off = 0; + + /* to catch upper layer's further access */ + rs->rs_msg = NULL; + req->rq_repmsg = NULL; + req->rq_replen = 0; + + rc = 0; +out_free: + OBD_FREE_LARGE(token_buf, token_buflen); + RETURN(rc); +} + +int gss_svc_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct gss_wire_ctx *gw = &grctx->src_wirectx; + int early, rc; + ENTRY; + + early = (req->rq_packed_final == 0); + + if (!early && gss_svc_reqctx_is_special(grctx)) { + LASSERT(rs->rs_repdata_len != 0); + + req->rq_reply_off = gss_at_reply_off_integ; + RETURN(0); + } + + /* early reply could happen in many cases */ + if (!early && + gw->gw_proc != PTLRPC_GSS_PROC_DATA && + gw->gw_proc != PTLRPC_GSS_PROC_DESTROY) { + CERROR("proc %d not support\n", gw->gw_proc); + RETURN(-EINVAL); + } + + LASSERT(grctx->src_ctx); + + switch (gw->gw_svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + rc = gss_svc_sign(req, rs, grctx, gw->gw_svc); + break; + case SPTLRPC_SVC_PRIV: + rc = gss_svc_seal(req, rs, grctx); + break; + default: + CERROR("Unknown service 
%d\n", gw->gw_svc); + GOTO(out, rc = -EINVAL); + } + rc = 0; + +out: + RETURN(rc); +} + +void gss_svc_free_rs(struct ptlrpc_reply_state *rs) +{ + struct gss_svc_reqctx *grctx; + + LASSERT(rs->rs_svc_ctx); + grctx = container_of(rs->rs_svc_ctx, struct gss_svc_reqctx, src_base); + + gss_svc_reqctx_decref(grctx); + rs->rs_svc_ctx = NULL; + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); +} + +void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->sc_refcount) == 0); + gss_svc_reqctx_free(gss_svc_ctx2reqctx(ctx)); +} + +int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct gss_cli_ctx *cli_gctx = ctx2gctx(cli_ctx); + struct gss_svc_ctx *svc_gctx = gss_svc_ctx2gssctx(svc_ctx); + struct gss_ctx *mechctx = NULL; + + LASSERT(cli_gctx); + LASSERT(svc_gctx && svc_gctx->gsc_mechctx); + + cli_gctx->gc_proc = PTLRPC_GSS_PROC_DATA; + cli_gctx->gc_win = GSS_SEQ_WIN; + + /* The problem is the reverse ctx might get lost in some recovery + * situations, and the same svc_ctx will be used to re-create it. + * if there's callback be sentout before that, new reverse ctx start + * with sequence 0 will lead to future callback rpc be treated as + * replay. + * + * each reverse root ctx will record its latest sequence number on its + * buddy svcctx before be destroyed, so here we continue use it. + */ + atomic_set(&cli_gctx->gc_seq, svc_gctx->gsc_rvs_seq); + + if (gss_svc_upcall_dup_handle(&cli_gctx->gc_svc_handle, svc_gctx)) { + CERROR("failed to dup svc handle\n"); + goto err_out; + } + + if (lgss_copy_reverse_context(svc_gctx->gsc_mechctx, &mechctx) != + GSS_S_COMPLETE) { + CERROR("failed to copy mech context\n"); + goto err_svc_handle; + } + + if (rawobj_dup(&cli_gctx->gc_handle, &svc_gctx->gsc_rvs_hdl)) { + CERROR("failed to dup reverse handle\n"); + goto err_ctx; + } + + cli_gctx->gc_mechctx = mechctx; + gss_cli_ctx_uptodate(cli_gctx); + + return 0; + +err_ctx: + lgss_delete_sec_context(&mechctx); +err_svc_handle: + rawobj_free(&cli_gctx->gc_svc_handle); +err_out: + return -ENOMEM; +} + +static void gss_init_at_reply_offset(void) +{ + __u32 buflens[3]; + int clearsize; + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = lustre_msg_early_size; + buflens[2] = gss_cli_payload(NULL, buflens[1], 0); + gss_at_reply_off_integ = lustre_msg_size_v2(3, buflens); + + buflens[0] = lustre_msg_early_size; + clearsize = lustre_msg_size_v2(1, buflens); + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(NULL, clearsize, 0); + buflens[2] = gss_cli_payload(NULL, clearsize, 1); + gss_at_reply_off_priv = lustre_msg_size_v2(3, buflens); +} + +static int __init sptlrpc_gss_init(void) +{ + int rc; + + rc = gss_init_tunables(); + if (rc) + return rc; + + rc = gss_init_cli_upcall(); + if (rc) + goto out_tunables; + + rc = gss_init_svc_upcall(); + if (rc) + goto out_cli_upcall; + + rc = init_null_module(); + if (rc) + goto out_svc_upcall; + + rc = init_kerberos_module(); + if (rc) + goto out_null; + + rc = init_sk_module(); + if (rc) + goto out_kerberos; + + /* register policy after all other stuff be initialized, because it + * might be in used immediately after the registration. 
*/ + + rc = gss_init_keyring(); + if (rc) + goto out_sk; + + rc = gss_init_pipefs(); + if (rc) + goto out_keyring; + + gss_init_at_reply_offset(); + + return 0; + +out_keyring: + gss_exit_keyring(); +out_sk: + cleanup_sk_module(); +out_kerberos: + cleanup_kerberos_module(); +out_null: + cleanup_null_module(); +out_svc_upcall: + gss_exit_svc_upcall(); +out_cli_upcall: + gss_exit_cli_upcall(); +out_tunables: + gss_exit_tunables(); + return rc; +} + +static void __exit sptlrpc_gss_exit(void) +{ + gss_exit_keyring(); + gss_exit_pipefs(); + cleanup_kerberos_module(); + gss_exit_svc_upcall(); + gss_exit_cli_upcall(); + gss_exit_tunables(); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre GSS security policy"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(sptlrpc_gss_init); +module_exit(sptlrpc_gss_exit); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/heap.c b/drivers/staging/lustrefsx/lustre/ptlrpc/heap.c new file mode 100644 index 0000000000000..b96ea1864c6a9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/heap.c @@ -0,0 +1,497 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + */ +/* + * libcfs/libcfs/heap.c + * + * Author: Eric Barton + * Liang Zhen + */ +/** \addtogroup heap + * + * @{ + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include "heap.h" + +#define CBH_ALLOC(ptr, h) \ +do { \ + if (h->cbh_cptab) { \ + if ((h)->cbh_flags & CBH_FLAG_ATOMIC_GROW) \ + LIBCFS_CPT_ALLOC_GFP((ptr), h->cbh_cptab, \ + h->cbh_cptid, CBH_NOB, \ + GFP_ATOMIC); \ + else \ + LIBCFS_CPT_ALLOC((ptr), h->cbh_cptab, \ + h->cbh_cptid, CBH_NOB); \ + } else { \ + if ((h)->cbh_flags & CBH_FLAG_ATOMIC_GROW) \ + LIBCFS_ALLOC_ATOMIC((ptr), CBH_NOB); \ + else \ + LIBCFS_ALLOC((ptr), CBH_NOB); \ + } \ +} while (0) + +#define CBH_FREE(ptr) LIBCFS_FREE(ptr, CBH_NOB) + +/** + * Grows the capacity of a binary heap so that it can handle a larger number of + * \e struct binheap_node objects. 
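+ *
+ * Capacity is added one chunk of CBH_SIZE element pointers at a time:
+ * the first chunk is cbh_elements1 itself (single indirect), the next
+ * CBH_SIZE chunks hang off cbh_elements2 (double indirect), and chunks
+ * beyond that hang off cbh_elements3 (triple indirect), so cbh_hwm
+ * always grows in multiples of CBH_SIZE.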
+ * + * \param[in] h The binary heap + * + * \retval 0 Successfully grew the heap + * \retval -ENOMEM OOM error + */ +static int +binheap_grow(struct binheap *h) +{ + struct binheap_node ***frag1 = NULL; + struct binheap_node **frag2; + int hwm = h->cbh_hwm; + + /* need a whole new chunk of pointers */ + LASSERT((h->cbh_hwm & CBH_MASK) == 0); + + if (hwm == 0) { + /* first use of single indirect */ + CBH_ALLOC(h->cbh_elements1, h); + if (h->cbh_elements1 == NULL) + return -ENOMEM; + + goto out; + } + + hwm -= CBH_SIZE; + if (hwm < CBH_SIZE * CBH_SIZE) { + /* not filled double indirect */ + CBH_ALLOC(frag2, h); + if (frag2 == NULL) + return -ENOMEM; + + if (hwm == 0) { + /* first use of double indirect */ + CBH_ALLOC(h->cbh_elements2, h); + if (h->cbh_elements2 == NULL) { + CBH_FREE(frag2); + return -ENOMEM; + } + } + + h->cbh_elements2[hwm >> CBH_SHIFT] = frag2; + goto out; + } + + hwm -= CBH_SIZE * CBH_SIZE; +#if (CBH_SHIFT * 3 < 32) + if (hwm >= CBH_SIZE * CBH_SIZE * CBH_SIZE) { + /* filled triple indirect */ + return -ENOMEM; + } +#endif + CBH_ALLOC(frag2, h); + if (frag2 == NULL) + return -ENOMEM; + + if (((hwm >> CBH_SHIFT) & CBH_MASK) == 0) { + /* first use of this 2nd level index */ + CBH_ALLOC(frag1, h); + if (frag1 == NULL) { + CBH_FREE(frag2); + return -ENOMEM; + } + } + + if (hwm == 0) { + /* first use of triple indirect */ + CBH_ALLOC(h->cbh_elements3, h); + if (h->cbh_elements3 == NULL) { + CBH_FREE(frag2); + CBH_FREE(frag1); + return -ENOMEM; + } + } + + if (frag1 != NULL) { + LASSERT(h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] == NULL); + h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] = frag1; + } else { + frag1 = h->cbh_elements3[hwm >> (2 * CBH_SHIFT)]; + LASSERT(frag1 != NULL); + } + + frag1[(hwm >> CBH_SHIFT) & CBH_MASK] = frag2; + + out: + h->cbh_hwm += CBH_SIZE; + return 0; +} + +/** + * Creates and initializes a binary heap instance. + * + * \param[in] ops The operations to be used + * \param[in] flags The heap flags + * \parm[in] count The initial heap capacity in # of elements + * \param[in] arg An optional private argument + * \param[in] cptab The CPT table this heap instance will operate over + * \param[in] cptid The CPT id of \a cptab this heap instance will operate over + * + * \retval valid-pointer A newly-created and initialized binary heap object + * \retval NULL error + */ +struct binheap * +binheap_create(struct binheap_ops *ops, unsigned int flags, + unsigned int count, void *arg, struct cfs_cpt_table *cptab, + int cptid) +{ + struct binheap *h; + + LASSERT(ops != NULL); + LASSERT(ops->hop_compare != NULL); + if (cptab) { + LASSERT(cptid == CFS_CPT_ANY || + (cptid >= 0 && cptid < cfs_cpt_number(cptab))); + LIBCFS_CPT_ALLOC(h, cptab, cptid, sizeof(*h)); + } else { + LIBCFS_ALLOC(h, sizeof(*h)); + } + if (!h) + return NULL; + + h->cbh_ops = ops; + h->cbh_nelements = 0; + h->cbh_hwm = 0; + h->cbh_private = arg; + h->cbh_flags = flags & (~CBH_FLAG_ATOMIC_GROW); + h->cbh_cptab = cptab; + h->cbh_cptid = cptid; + + while (h->cbh_hwm < count) { /* preallocate */ + if (binheap_grow(h) != 0) { + binheap_destroy(h); + return NULL; + } + } + + h->cbh_flags |= flags & CBH_FLAG_ATOMIC_GROW; + + return h; +} +EXPORT_SYMBOL(binheap_create); + +/** + * Releases all resources associated with a binary heap instance. + * + * Deallocates memory for all indirection levels and the binary heap object + * itself. 
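For scale, here is a standalone sketch of the capacity arithmetic behind binheap_grow(): with CBH_SHIFT = 9 each pointer block holds 512 entries, so the single, double and triple indirect levels contribute 512, 512^2 and 512^3 element slots respectively, and cbh_hwm always advances in CBH_SIZE steps. Only the constants mirrored from heap.h are assumed.

/* Standalone sketch (userspace) of the capacity ceilings implied by the
 * three indirection levels grown by binheap_grow(). */
#include <stdio.h>

#define CBH_SHIFT 9
#define CBH_SIZE  (1U << CBH_SHIFT)

int main(void)
{
        unsigned long long single = CBH_SIZE;
        unsigned long long dbl    = (unsigned long long)CBH_SIZE * CBH_SIZE;
        unsigned long long triple = dbl * CBH_SIZE;

        /* cbh_hwm grows CBH_SIZE slots per grow call; these are the ceilings */
        printf("single indirect : %llu slots\n", single);
        printf("double indirect : %llu slots (cumulative %llu)\n",
               dbl, single + dbl);
        printf("triple indirect : %llu slots (cumulative %llu)\n",
               triple, single + dbl + triple);
        return 0;
}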
+ * + * \param[in] h The binary heap object + */ +void +binheap_destroy(struct binheap *h) +{ + int idx0; + int idx1; + int n; + + LASSERT(h != NULL); + + n = h->cbh_hwm; + + if (n > 0) { + CBH_FREE(h->cbh_elements1); + n -= CBH_SIZE; + } + + if (n > 0) { + for (idx0 = 0; idx0 < CBH_SIZE && n > 0; idx0++) { + CBH_FREE(h->cbh_elements2[idx0]); + n -= CBH_SIZE; + } + + CBH_FREE(h->cbh_elements2); + } + + if (n > 0) { + for (idx0 = 0; idx0 < CBH_SIZE && n > 0; idx0++) { + + for (idx1 = 0; idx1 < CBH_SIZE && n > 0; idx1++) { + CBH_FREE(h->cbh_elements3[idx0][idx1]); + n -= CBH_SIZE; + } + + CBH_FREE(h->cbh_elements3[idx0]); + } + + CBH_FREE(h->cbh_elements3); + } + + LIBCFS_FREE(h, sizeof(*h)); +} +EXPORT_SYMBOL(binheap_destroy); + +/** + * Obtains a double pointer to a heap element, given its index into the binary + * tree. + * + * \param[in] h The binary heap instance + * \param[in] idx The requested node's index + * + * \retval valid-pointer A double pointer to a heap pointer entry + */ +static struct binheap_node ** +binheap_pointer(struct binheap *h, unsigned int idx) +{ + if (idx < CBH_SIZE) + return &(h->cbh_elements1[idx]); + + idx -= CBH_SIZE; + if (idx < CBH_SIZE * CBH_SIZE) + return &(h->cbh_elements2[idx >> CBH_SHIFT][idx & CBH_MASK]); + + idx -= CBH_SIZE * CBH_SIZE; + return &(h->cbh_elements3[idx >> (2 * CBH_SHIFT)] + [(idx >> CBH_SHIFT) & CBH_MASK] + [idx & CBH_MASK]); +} + +/** + * Obtains a pointer to a heap element, given its index into the binary tree. + * + * \param[in] h The binary heap + * \param[in] idx The requested node's index + * + * \retval valid-pointer The requested heap node + * \retval NULL Supplied index is out of bounds + */ +struct binheap_node * +binheap_find(struct binheap *h, unsigned int idx) +{ + if (idx >= h->cbh_nelements) + return NULL; + + return *binheap_pointer(h, idx); +} +EXPORT_SYMBOL(binheap_find); + +/** + * Moves a node upwards, towards the root of the binary tree. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 1 The position of \a e in the tree was changed at least once + * \retval 0 The position of \a e in the tree was not changed + */ +static int +binheap_bubble(struct binheap *h, struct binheap_node *e) +{ + unsigned int cur_idx = e->chn_index; + struct binheap_node **cur_ptr; + unsigned int parent_idx; + struct binheap_node **parent_ptr; + int did_sth = 0; + + cur_ptr = binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + while (cur_idx > 0) { + parent_idx = (cur_idx - 1) >> 1; + + parent_ptr = binheap_pointer(h, parent_idx); + LASSERT((*parent_ptr)->chn_index == parent_idx); + + if (h->cbh_ops->hop_compare(*parent_ptr, e)) + break; + + (*parent_ptr)->chn_index = cur_idx; + *cur_ptr = *parent_ptr; + cur_ptr = parent_ptr; + cur_idx = parent_idx; + did_sth = 1; + } + + e->chn_index = cur_idx; + *cur_ptr = e; + + return did_sth; +} + +/** + * Moves a node downwards, towards the last level of the binary tree. 
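binheap_bubble() above and binheap_sink() below rely on the standard implicit-binary-tree layout: a node at index i has its parent at (i - 1) >> 1 and its children at 2i + 1 and 2i + 2. A standalone sketch of that arithmetic (the helper names are made up):

/* Standalone sketch (userspace) of the index arithmetic used by the
 * bubble/sink routines; parent_of/left_of/right_of are illustrative names. */
#include <stdio.h>

static unsigned int parent_of(unsigned int idx) { return (idx - 1) >> 1; }
static unsigned int left_of(unsigned int idx)   { return (idx << 1) + 1; }
static unsigned int right_of(unsigned int idx)  { return (idx << 1) + 2; }

int main(void)
{
        unsigned int idx;

        for (idx = 1; idx <= 6; idx++)
                printf("node %u: parent %u, children %u/%u\n",
                       idx, parent_of(idx), left_of(idx), right_of(idx));
        return 0;
}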
+ * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 1 The position of \a e in the tree was changed at least once + * \retval 0 The position of \a e in the tree was not changed + */ +static int +binheap_sink(struct binheap *h, struct binheap_node *e) +{ + unsigned int n = h->cbh_nelements; + unsigned int child_idx; + struct binheap_node **child_ptr; + struct binheap_node *child; + unsigned int child2_idx; + struct binheap_node **child2_ptr; + struct binheap_node *child2; + unsigned int cur_idx; + struct binheap_node **cur_ptr; + int did_sth = 0; + + cur_idx = e->chn_index; + cur_ptr = binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + while (cur_idx < n) { + child_idx = (cur_idx << 1) + 1; + if (child_idx >= n) + break; + + child_ptr = binheap_pointer(h, child_idx); + child = *child_ptr; + + child2_idx = child_idx + 1; + if (child2_idx < n) { + child2_ptr = binheap_pointer(h, child2_idx); + child2 = *child2_ptr; + + if (h->cbh_ops->hop_compare(child2, child)) { + child_idx = child2_idx; + child_ptr = child2_ptr; + child = child2; + } + } + + LASSERT(child->chn_index == child_idx); + + if (h->cbh_ops->hop_compare(e, child)) + break; + + child->chn_index = cur_idx; + *cur_ptr = child; + cur_ptr = child_ptr; + cur_idx = child_idx; + did_sth = 1; + } + + e->chn_index = cur_idx; + *cur_ptr = e; + + return did_sth; +} + +/** + * Sort-inserts a node into the binary heap. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 0 Element inserted successfully + * \retval != 0 error + */ +int +binheap_insert(struct binheap *h, struct binheap_node *e) +{ + struct binheap_node **new_ptr; + unsigned int new_idx = h->cbh_nelements; + int rc; + + if (new_idx == h->cbh_hwm) { + rc = binheap_grow(h); + if (rc != 0) + return rc; + } + + if (h->cbh_ops->hop_enter) { + rc = h->cbh_ops->hop_enter(h, e); + if (rc != 0) + return rc; + } + + e->chn_index = new_idx; + new_ptr = binheap_pointer(h, new_idx); + h->cbh_nelements++; + *new_ptr = e; + + binheap_bubble(h, e); + + return 0; +} +EXPORT_SYMBOL(binheap_insert); + +/** + * Removes a node from the binary heap. + * + * \param[in] h The heap + * \param[in] e The node + */ +void +binheap_remove(struct binheap *h, struct binheap_node *e) +{ + unsigned int n = h->cbh_nelements; + unsigned int cur_idx = e->chn_index; + struct binheap_node **cur_ptr; + struct binheap_node *last; + + LASSERT(cur_idx != CBH_POISON); + LASSERT(cur_idx < n); + + cur_ptr = binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + n--; + last = *binheap_pointer(h, n); + h->cbh_nelements = n; + if (last == e) + return; + + last->chn_index = cur_idx; + *cur_ptr = last; + binheap_relocate(h, *cur_ptr); + + e->chn_index = CBH_POISON; + if (h->cbh_ops->hop_exit) + h->cbh_ops->hop_exit(h, e); +} +EXPORT_SYMBOL(binheap_remove); + +/** + * Relocate a node in the binary heap. + * Should be called whenever a node's values + * which affects its ranking are changed. + * + * \param[in] h The heap + * \param[in] e The node + */ +void +binheap_relocate(struct binheap *h, struct binheap_node *e) +{ + if (!binheap_bubble(h, e)) + binheap_sink(h, e); +} +EXPORT_SYMBOL(binheap_relocate); +/** @} heap */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/heap.h b/drivers/staging/lustrefsx/lustre/ptlrpc/heap.h new file mode 100644 index 0000000000000..7cd5c1c00645a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/heap.h @@ -0,0 +1,188 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
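A minimal usage sketch of the binheap API implemented above, roughly as an NRS-policy-style consumer would use it: embed a struct binheap_node in each element, supply hop_compare(), then create, insert and pop the root. This is a kernel-context illustration only (not compiled here); demo_item, di_prio and demo_use() are hypothetical names.

/* Kernel-context sketch (not compiled here) of basic binheap usage. */
struct demo_item {
        struct binheap_node     di_node;        /* embedded heap linkage */
        unsigned int            di_prio;        /* lower value = higher priority */
};

/* Mandatory binary predicate: return 1 when 'a' must sit above 'b'. */
static int demo_compare(struct binheap_node *a, struct binheap_node *b)
{
        struct demo_item *ia = container_of(a, struct demo_item, di_node);
        struct demo_item *ib = container_of(b, struct demo_item, di_node);

        return ia->di_prio <= ib->di_prio;
}

static struct binheap_ops demo_ops = {
        .hop_compare    = demo_compare, /* hop_enter/hop_exit are optional */
};

static int demo_use(struct demo_item *item)
{
        struct binheap *h;
        int rc;

        /* no CPT affinity: NULL table, the id is ignored in that case */
        h = binheap_create(&demo_ops, 0, 16, NULL, NULL, -1);
        if (h == NULL)
                return -ENOMEM;

        rc = binheap_insert(h, &item->di_node);
        if (rc == 0 && binheap_remove_root(h) != &item->di_node)
                rc = -EINVAL;   /* the single element must pop back out */

        binheap_destroy(h);
        return rc;
}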
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + */ +/* + * libcfs/include/libcfs/heap.h + * + * Author: Eric Barton + * Liang Zhen + */ + +#ifndef __LIBCFS_HEAP_H__ +#define __LIBCFS_HEAP_H__ + +/** \defgroup heap Binary heap + * + * The binary heap is a scalable data structure created using a binary tree. It + * is capable of maintaining large sets of elements sorted usually by one or + * more element properties, but really based on anything that can be used as a + * binary predicate in order to determine the relevant ordering of any two nodes + * that belong to the set. There is no search operation, rather the intention is + * for the element of the lowest priority which will always be at the root of + * the tree (as this is an implementation of a min-heap) to be removed by users + * for consumption. + * + * Users of the heap should embed a \e struct binheap_node object instance + * on every object of the set that they wish the binary heap instance to handle, + * and (at a minimum) provide a struct binheap_ops::hop_compare() + * implementation which is used by the heap as the binary predicate during its + * internal sorting operations. + * + * The current implementation enforces no locking scheme, and so assumes the + * user caters for locking between calls to insert, delete and lookup + * operations. Since the only consumer for the data structure at this point + * are NRS policies, and these operate on a per-CPT basis, binary heap instances + * are tied to a specific CPT. + * @{ + */ + +#define CBH_SHIFT 9 +#define CBH_SIZE (1 << CBH_SHIFT) /* # ptrs per level */ +#define CBH_MASK (CBH_SIZE - 1) +#define CBH_NOB (CBH_SIZE * sizeof(struct binheap_node *)) + +#define CBH_POISON 0xdeadbeef + +/** + * Binary heap flags. + */ +enum { + CBH_FLAG_ATOMIC_GROW = 1, +}; + +struct binheap; + +/** + * Binary heap operations. + */ +struct binheap_ops { + /** + * Called right before inserting a node into the binary heap. + * + * Implementing this operation is optional. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 0 success + * \retval != 0 error + */ + int (*hop_enter)(struct binheap *h, + struct binheap_node *e); + /** + * Called right after removing a node from the binary heap. + * + * Implementing this operation is optional. + * + * \param[in] h The heap + * \param[in] e The node + */ + void (*hop_exit)(struct binheap *h, + struct binheap_node *e); + /** + * A binary predicate which is called during internal heap sorting + * operations, and used in order to determine the relevant ordering of + * two heap nodes. + * + * Implementing this operation is mandatory. + * + * \param[in] a The first heap node + * \param[in] b The second heap node + * + * \retval 0 Node a > node b + * \retval 1 Node a < node b + * + * \see binheap_bubble() + * \see cfs_biheap_sink() + */ + int (*hop_compare)(struct binheap_node *a, + struct binheap_node *b); +}; + +/** + * Binary heap object. 
+ * + * Sorts elements of type \e struct binheap_node + */ +struct binheap { + /** Triple indirect */ + struct binheap_node ****cbh_elements3; + /** double indirect */ + struct binheap_node ***cbh_elements2; + /** single indirect */ + struct binheap_node **cbh_elements1; + /** # elements referenced */ + unsigned int cbh_nelements; + /** high water mark */ + unsigned int cbh_hwm; + /** user flags */ + unsigned int cbh_flags; + /** operations table */ + struct binheap_ops *cbh_ops; + /** private data */ + void *cbh_private; + /** associated CPT table */ + struct cfs_cpt_table *cbh_cptab; + /** associated CPT id of this struct binheap::cbh_cptab */ + int cbh_cptid; +}; + +void binheap_destroy(struct binheap *h); +struct binheap * +binheap_create(struct binheap_ops *ops, unsigned int flags, + unsigned int count, void *arg, struct cfs_cpt_table *cptab, + int cptid); +struct binheap_node * +binheap_find(struct binheap *h, unsigned int idx); +int binheap_insert(struct binheap *h, struct binheap_node *e); +void binheap_remove(struct binheap *h, struct binheap_node *e); +void binheap_relocate(struct binheap *h, struct binheap_node *e); + +static inline int +binheap_size(struct binheap *h) +{ + return h->cbh_nelements; +} + +static inline int +binheap_is_empty(struct binheap *h) +{ + return h->cbh_nelements == 0; +} + +static inline struct binheap_node * +binheap_root(struct binheap *h) +{ + return binheap_find(h, 0); +} + +static inline struct binheap_node * +binheap_remove_root(struct binheap *h) +{ + struct binheap_node *e = binheap_find(h, 0); + + if (e != NULL) + binheap_remove(h, e); + return e; +} + +/** @} heap */ + +#endif /* __LIBCFS_HEAP_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/import.c b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c new file mode 100644 index 0000000000000..ae3998082661a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c @@ -0,0 +1,2069 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/import.c + * + * Author: Mike Shaver + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +struct ptlrpc_connect_async_args { + __u64 pcaa_peer_committed; + int pcaa_initial_connect; +}; + +int allow_version_mismatch; +EXPORT_SYMBOL(allow_version_mismatch); + +/** + * Updates import \a imp current state to provided \a state value + * Helper function. 
+ */ +static void import_set_state_nolock(struct obd_import *imp, + enum lustre_imp_state state) +{ + switch (state) { + case LUSTRE_IMP_CLOSED: + case LUSTRE_IMP_NEW: + case LUSTRE_IMP_DISCON: + case LUSTRE_IMP_CONNECTING: + break; + case LUSTRE_IMP_REPLAY_WAIT: + imp->imp_replay_state = LUSTRE_IMP_REPLAY_LOCKS; + break; + default: + imp->imp_replay_state = LUSTRE_IMP_REPLAY; + break; + } + + /* A CLOSED import should remain so. */ + if (imp->imp_state == LUSTRE_IMP_CLOSED) + return; + + if (imp->imp_state != LUSTRE_IMP_NEW) { + CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", + imp, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state), + ptlrpc_import_state_name(state)); + } + + imp->imp_state = state; + imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state; + imp->imp_state_hist[imp->imp_state_hist_idx].ish_time = + ktime_get_real_seconds(); + imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) % + IMP_STATE_HIST_LEN; +} + +static void import_set_state(struct obd_import *imp, + enum lustre_imp_state new_state) +{ + spin_lock(&imp->imp_lock); + import_set_state_nolock(imp, new_state); + spin_unlock(&imp->imp_lock); +} + +void ptlrpc_import_enter_resend(struct obd_import *imp) +{ + import_set_state(imp, LUSTRE_IMP_RECOVER); +} +EXPORT_SYMBOL(ptlrpc_import_enter_resend); + + +static int ptlrpc_connect_interpret(const struct lu_env *env, + struct ptlrpc_request *request, + void *args, int rc); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); + +/* Only this function is allowed to change the import state when it is + * CLOSED. I would rather refcount the import and free it after + * disconnection like we do with exports. To do that, the client_obd + * will need to save the peer info somewhere other than in the import, + * though. */ +int ptlrpc_init_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + + imp->imp_generation++; + imp->imp_state = LUSTRE_IMP_NEW; + + spin_unlock(&imp->imp_lock); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_init_import); + +#define UUID_STR "_UUID" +void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) +{ + *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix)) + ? uuid : uuid + strlen(prefix); + + *uuid_len = strlen(*uuid_start); + + if (*uuid_len < strlen(UUID_STR)) + return; + + if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR), + UUID_STR, strlen(UUID_STR))) + *uuid_len -= strlen(UUID_STR); +} + +/* Must be called with imp_lock held! */ +static void ptlrpc_deactivate_import_nolock(struct obd_import *imp) +{ + ENTRY; + + assert_spin_locked(&imp->imp_lock); + CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); + imp->imp_invalid = 1; + imp->imp_generation++; + + ptlrpc_abort_inflight(imp); + + EXIT; +} + +/** + * Returns true if import was FULL, false if import was already not + * connected. + * @imp - import to be disconnected + * @conn_cnt - connection count (epoch) of the request that timed out + * and caused the disconnection. In some cases, multiple + * inflight requests can fail to a single target (e.g. OST + * bulk requests) and if one has already caused a reconnection + * (increasing the import->conn_cnt) the older failure should + * not also cause a reconnection. If zero it forces a reconnect. 
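To make deuuidify()'s contract concrete: it never modifies the string, it only reports a start pointer and a length with the optional prefix and a trailing "_UUID" stripped. A standalone copy of the same logic, with a made-up target name:

/* Standalone sketch (userspace) showing what deuuidify() reports. */
#include <stdio.h>
#include <string.h>

#define UUID_STR "_UUID"

static void deuuidify_demo(char *uuid, const char *prefix,
                           char **uuid_start, int *uuid_len)
{
        *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix))
                        ? uuid : uuid + strlen(prefix);
        *uuid_len = strlen(*uuid_start);
        if (*uuid_len < (int)strlen(UUID_STR))
                return;
        if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR),
                     UUID_STR, strlen(UUID_STR)))
                *uuid_len -= strlen(UUID_STR);
}

int main(void)
{
        char name[] = "testfs-OST0001_UUID";    /* illustrative target name */
        char *start;
        int len;

        deuuidify_demo(name, NULL, &start, &len);
        printf("%.*s\n", len, start);           /* prints "testfs-OST0001" */
        return 0;
}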
+ * @invalid - set import invalid flag + */ +int ptlrpc_set_import_discon(struct obd_import *imp, + __u32 conn_cnt, bool invalid) +{ + int rc = 0; + + spin_lock(&imp->imp_lock); + + if (imp->imp_state == LUSTRE_IMP_FULL && + (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) { + char *target_start; + int target_len; + bool inact = false; + + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + if (imp->imp_replayable) { + LCONSOLE_WARN("%s: Connection to %.*s (at %s) was " + "lost; in progress operations using this " + "service will wait for recovery to complete\n", + imp->imp_obd->obd_name, target_len, target_start, + obd_import_nid2str(imp)); + } else { + LCONSOLE_ERROR_MSG(0x166, "%s: Connection to " + "%.*s (at %s) was lost; in progress " + "operations using this service will fail\n", + imp->imp_obd->obd_name, target_len, target_start, + obd_import_nid2str(imp)); + if (invalid) { + CDEBUG(D_HA, "import %s@%s for %s not " + "replayable, auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import_nolock(imp); + inact = true; + } + } + spin_unlock(&imp->imp_lock); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); + + if (inact) + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + rc = 1; + } else { + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n", + imp->imp_client->cli_name, imp, + (imp->imp_state == LUSTRE_IMP_FULL && + imp->imp_conn_cnt > conn_cnt) ? + "reconnected" : "not connected", imp->imp_conn_cnt, + conn_cnt, ptlrpc_import_state_name(imp->imp_state)); + } + + return rc; +} + +/* + * This acts as a barrier; all existing requests are rejected, and + * no new requests will be accepted until the import is valid again. + */ +void ptlrpc_deactivate_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + ptlrpc_deactivate_import_nolock(imp); + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); +} +EXPORT_SYMBOL(ptlrpc_deactivate_import); + +static time64_t ptlrpc_inflight_deadline(struct ptlrpc_request *req, + time64_t now) +{ + time64_t dl; + + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + return 0; + + if (req->rq_timedout) + return 0; + + if (req->rq_phase == RQ_PHASE_NEW) + dl = req->rq_sent; + else + dl = req->rq_deadline; + + if (dl <= now) + return 0; + + return dl - now; +} + +static time64_t ptlrpc_inflight_timeout(struct obd_import *imp) +{ + time64_t now = ktime_get_real_seconds(); + struct ptlrpc_request *req; + time64_t timeout = 0; + + spin_lock(&imp->imp_lock); + list_for_each_entry(req, &imp->imp_sending_list, rq_list) + timeout = max(ptlrpc_inflight_deadline(req, now), timeout); + spin_unlock(&imp->imp_lock); + return timeout; +} + +/** + * This function will invalidate the import, if necessary, then block + * for all the RPC completions, and finally notify the obd to + * invalidate its state (ie cancel locks, clear pending requests, + * etc). 
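ptlrpc_inflight_deadline() and ptlrpc_inflight_timeout() above boil down to "take the largest remaining deadline across in-flight requests, or zero if nothing is pending". A standalone sketch with made-up timestamps:

/* Standalone sketch (userspace) of the inflight-timeout calculation. */
#include <stdio.h>

static long long remaining(long long deadline, long long now)
{
        return deadline > now ? deadline - now : 0;
}

int main(void)
{
        long long now = 1000;
        long long deadlines[] = { 990, 1030, 1012 };    /* made-up values */
        long long timeout = 0;
        int i;

        for (i = 0; i < 3; i++)
                if (remaining(deadlines[i], now) > timeout)
                        timeout = remaining(deadlines[i], now);

        printf("wait up to %llds\n", timeout);          /* prints 30s */
        return 0;
}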
+ */ +void ptlrpc_invalidate_import(struct obd_import *imp) +{ + struct ptlrpc_request *req; + time64_t timeout; + int rc; + + atomic_inc(&imp->imp_inval_count); + + if (!imp->imp_invalid || imp->imp_obd->obd_no_recov) + ptlrpc_deactivate_import(imp); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CONNECT_RACE)) { + OBD_RACE(OBD_FAIL_PTLRPC_CONNECT_RACE); + msleep(10 * MSEC_PER_SEC); + } + CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2); + LASSERT(imp->imp_invalid); + + /* Wait forever until inflight == 0. We really can't do it another + * way because in some cases we need to wait for very long reply + * unlink. We can't do anything before that because there is really + * no guarantee that some rdma transfer is not in progress right now. + */ + do { + long timeout_jiffies; + + /* Calculate max timeout for waiting on rpcs to error + * out. Use obd_timeout if calculated value is smaller + * than it. + */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + timeout = ptlrpc_inflight_timeout(imp); + timeout += div_u64(timeout, 3); + + if (timeout == 0) + timeout = obd_timeout; + } else { + /* decrease the interval to increase race condition */ + timeout = 1; + } + + CDEBUG(D_RPCTRACE, "Sleeping %llds for inflight to error out\n", + timeout); + + /* Wait for all requests to error out and call completion + * callbacks. Cap it at obd_timeout -- these should all + * have been locally cancelled by ptlrpc_abort_inflight. + */ + timeout_jiffies = max_t(long, cfs_time_seconds(timeout), 1); + rc = wait_event_idle_timeout( + imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inflight) == 0), + timeout_jiffies); + + if (rc == 0) { + const char *cli_tgt = obd2cli_tgt(imp->imp_obd); + + CERROR("%s: timeout waiting for callback (%d != 0)\n", + cli_tgt, atomic_read(&imp->imp_inflight)); + + spin_lock(&imp->imp_lock); + if (atomic_read(&imp->imp_inflight) == 0) { + int count = atomic_read(&imp->imp_unregistering); + + /* We know that "unregistering" rpcs only can + * survive in sending or delaying lists (they + * maybe waiting for long reply unlink in + * sluggish nets). Let's check this. If there + * is no inflight and unregistering != 0, this + * is bug. */ + LASSERTF(count == 0, "Some RPCs are still " + "unregistering: %d\n", count); + + /* Let's save one loop as soon as inflight have + * dropped to zero. No new inflights possible at + * this point. */ + rc = 1; + } else { + list_for_each_entry(req, &imp->imp_sending_list, + rq_list) { + DEBUG_REQ(D_ERROR, req, + "still on sending list"); + } + list_for_each_entry(req, &imp->imp_delayed_list, + rq_list) { + DEBUG_REQ(D_ERROR, req, + "still on delayed list"); + } + + CERROR("%s: Unregistering RPCs found (%d). " + "Network is sluggish? Waiting for them " + "to error out.\n", cli_tgt, + atomic_read(&imp->imp_unregistering)); + } + spin_unlock(&imp->imp_lock); + } + } while (rc == 0); + + /* + * Let's additionally check that no new rpcs added to import in + * "invalidate" state. 
+ */ + LASSERT(atomic_read(&imp->imp_inflight) == 0); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); + sptlrpc_import_flush_all_ctx(imp); + + atomic_dec(&imp->imp_inval_count); + wake_up(&imp->imp_recovery_waitq); +} +EXPORT_SYMBOL(ptlrpc_invalidate_import); + +/* unset imp_invalid */ +void ptlrpc_activate_import(struct obd_import *imp, bool set_state_full) +{ + struct obd_device *obd = imp->imp_obd; + + spin_lock(&imp->imp_lock); + if (imp->imp_deactive != 0) { + LASSERT(imp->imp_state != LUSTRE_IMP_FULL); + if (imp->imp_state != LUSTRE_IMP_DISCON) + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + spin_unlock(&imp->imp_lock); + return; + } + if (set_state_full) + import_set_state_nolock(imp, LUSTRE_IMP_FULL); + + imp->imp_invalid = 0; + + spin_unlock(&imp->imp_lock); + obd_import_event(obd, imp, IMP_EVENT_ACTIVE); +} +EXPORT_SYMBOL(ptlrpc_activate_import); + +void ptlrpc_pinger_force(struct obd_import *imp) +{ + CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state)); + + spin_lock(&imp->imp_lock); + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + + if (imp->imp_state != LUSTRE_IMP_CONNECTING) + ptlrpc_pinger_wake_up(); +} +EXPORT_SYMBOL(ptlrpc_pinger_force); + +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) +{ + ENTRY; + + LASSERT(!imp->imp_dlm_fake); + + if (ptlrpc_set_import_discon(imp, conn_cnt, true)) + ptlrpc_pinger_force(imp); + + EXIT; +} + +int ptlrpc_reconnect_import(struct obd_import *imp) +{ +#ifdef CONFIG_LUSTRE_FS_PINGER + long timeout_jiffies = cfs_time_seconds(obd_timeout); + int rc; + + ptlrpc_pinger_force(imp); + + CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", + obd2cli_tgt(imp->imp_obd), obd_timeout); + + rc = wait_event_idle_timeout(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), + timeout_jiffies); + if (rc == 0) + rc = -ETIMEDOUT; + else + rc = 0; + CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state)); + return rc; +#else + ptlrpc_set_import_discon(imp, 0, false); + /* Force a new connect attempt */ + ptlrpc_invalidate_import(imp); + /* Do a fresh connect next time by zeroing the handle */ + ptlrpc_disconnect_import(imp, 1); + /* Wait for all invalidate calls to finish */ + if (atomic_read(&imp->imp_inval_count) > 0) { + int rc; + + rc = l_wait_event_abortable( + imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inval_count) == 0)); + if (rc) + CERROR("Interrupted, inval=%d\n", + atomic_read(&imp->imp_inval_count)); + } + + /* Allow reconnect attempts */ + imp->imp_obd->obd_no_recov = 0; + /* Remove 'invalid' flag */ + ptlrpc_activate_import(imp, false); + /* Attempt a new connect */ + ptlrpc_recover_import(imp, NULL, 0); + return 0; +#endif +} +EXPORT_SYMBOL(ptlrpc_reconnect_import); + +/** + * Connection on import \a imp is changed to another one (if more than one is + * present). 
We typically chose connection that we have not tried to connect to + * the longest + */ +static int import_select_connection(struct obd_import *imp) +{ + struct obd_import_conn *imp_conn = NULL, *conn; + struct obd_export *dlmexp; + char *target_start; + int target_len, tried_all = 1; + int rc = 0; + ENTRY; + + spin_lock(&imp->imp_lock); + + if (list_empty(&imp->imp_conn_list)) { + rc = -EINVAL; + CERROR("%s: no connections available: rc = %d\n", + imp->imp_obd->obd_name, rc); + GOTO(out_unlock, rc); + } + + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + CDEBUG(D_HA, "%s: connect to NID %s last attempt %lld\n", + imp->imp_obd->obd_name, + libcfs_nidstr(&conn->oic_conn->c_peer.nid), + conn->oic_last_attempt); + + /* If we have not tried this connection since + * the last successful attempt, go with this one + */ + if ((conn->oic_last_attempt == 0) || + conn->oic_last_attempt <= imp->imp_last_success_conn) { + imp_conn = conn; + tried_all = 0; + break; + } + + /* If all of the connections have already been tried + * since the last successful connection; just choose the + * least recently used + */ + if (!imp_conn) + imp_conn = conn; + else if (imp_conn->oic_last_attempt > conn->oic_last_attempt) + imp_conn = conn; + } + + /* if not found, simply choose the current one */ + if (!imp_conn || imp->imp_force_reconnect) { + LASSERT(imp->imp_conn_current); + imp_conn = imp->imp_conn_current; + tried_all = 0; + } + LASSERT(imp_conn->oic_conn); + + /* If we've tried everything, and we're back to the beginning of the + * list, increase our timeout and try again. It will be reset when + * we do finally connect. (FIXME: really we should wait for all network + * state associated with the last connection attempt to drain before + * trying to reconnect on it.) 
+ */ + if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) { + struct adaptive_timeout *at = &imp->imp_at.iat_net_latency; + + if (at_get(at) < CONNECTION_SWITCH_MAX) { + at_measured(at, at_get(at) + CONNECTION_SWITCH_INC); + if (at_get(at) > CONNECTION_SWITCH_MAX) + at_reset(at, CONNECTION_SWITCH_MAX); + } + LASSERT(imp_conn->oic_last_attempt); + CDEBUG(D_HA, + "%s: tried all connections, increasing latency to %ds\n", + imp->imp_obd->obd_name, at_get(at)); + } + + imp_conn->oic_last_attempt = ktime_get_seconds(); + + /* switch connection, don't mind if it's same as the current one */ + ptlrpc_connection_put(imp->imp_connection); + imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); + + dlmexp = class_conn2export(&imp->imp_dlm_handle); + if (!dlmexp) + GOTO(out_unlock, rc = -EINVAL); + ptlrpc_connection_put(dlmexp->exp_connection); + dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); + class_export_put(dlmexp); + + if (imp->imp_conn_current != imp_conn) { + if (imp->imp_conn_current) { + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + + CDEBUG(D_HA, "%s: Connection changing to" + " %.*s (at %s)\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nidstr(&imp_conn->oic_conn->c_peer.nid)); + } + + imp->imp_conn_current = imp_conn; + } + + /* The below message is checked in conf-sanity.sh test_35[ab] */ + CDEBUG(D_HA, "%s: import %p using connection %s/%s\n", + imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid, + libcfs_nidstr(&imp_conn->oic_conn->c_peer.nid)); + +out_unlock: + spin_unlock(&imp->imp_lock); + RETURN(rc); +} + +/* + * must be called under imp_lock + */ +static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno) +{ + struct ptlrpc_request *req; + + /* The requests in committed_list always have smaller transnos than + * the requests in replay_list */ + if (!list_empty(&imp->imp_committed_list)) { + req = list_first_entry(&imp->imp_committed_list, + struct ptlrpc_request, rq_replay_list); + *transno = req->rq_transno; + if (req->rq_transno == 0) { + DEBUG_REQ(D_ERROR, req, + "zero transno in committed_list"); + LBUG(); + } + return 1; + } + if (!list_empty(&imp->imp_replay_list)) { + req = list_first_entry(&imp->imp_committed_list, + struct ptlrpc_request, rq_replay_list); + *transno = req->rq_transno; + if (req->rq_transno == 0) { + DEBUG_REQ(D_ERROR, req, "zero transno in replay_list"); + LBUG(); + } + return 1; + } + return 0; +} + +int ptlrpc_connect_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + return ptlrpc_connect_import_locked(imp); +} + +/** + * Attempt to (re)connect import \a imp. This includes all preparations, + * initializing CONNECT RPC request and passing it to ptlrpcd for + * actual sending. + * + * Assumes imp->imp_lock is held, and releases it. + * + * Returns 0 on success or error code. 
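Stripped of locking and the force-reconnect fallback, import_select_connection() implements a simple policy: prefer a connection that has not been attempted since the last successful connect, otherwise fall back to the least recently attempted one. A standalone sketch of just that policy (struct demo_conn and the NID strings are illustrative):

/* Standalone sketch (userspace) of the connection selection policy. */
#include <stdio.h>
#include <time.h>

struct demo_conn {
        const char *name;
        time_t      last_attempt;       /* 0 == never tried */
};

static const struct demo_conn *
select_conn(const struct demo_conn *conns, int n, time_t last_success)
{
        const struct demo_conn *pick = NULL;
        int i;

        for (i = 0; i < n; i++) {
                if (conns[i].last_attempt == 0 ||
                    conns[i].last_attempt <= last_success)
                        return &conns[i];       /* untried since last success */
                if (!pick || conns[i].last_attempt < pick->last_attempt)
                        pick = &conns[i];       /* least recently attempted */
        }
        return pick;
}

int main(void)
{
        struct demo_conn conns[] = {
                { "nid-a", 100 }, { "nid-b", 90 }, { "nid-c", 120 },
        };

        /* all tried after the last success at t=80, so pick the LRU: nid-b */
        printf("%s\n", select_conn(conns, 3, 80)->name);
        return 0;
}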
+ */ +int ptlrpc_connect_import_locked(struct obd_import *imp) +{ + struct obd_device *obd = imp->imp_obd; + int initial_connect = 0; + int set_transno = 0; + __u64 committed_before_reconnect = 0; + struct ptlrpc_request *request; + struct obd_connect_data ocd; + char *bufs[] = { NULL, + obd2cli_tgt(imp->imp_obd), + obd->obd_uuid.uuid, + (char *)&imp->imp_dlm_handle, + (char *)&ocd, + NULL }; + struct ptlrpc_connect_async_args *aa; + int rc; + ENTRY; + + assert_spin_locked(&imp->imp_lock); + + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + spin_unlock(&imp->imp_lock); + CERROR("can't connect to a closed import\n"); + RETURN(-EINVAL); + } else if (imp->imp_state == LUSTRE_IMP_FULL) { + spin_unlock(&imp->imp_lock); + CERROR("already connected\n"); + RETURN(0); + } else if (imp->imp_state == LUSTRE_IMP_CONNECTING || + imp->imp_state == LUSTRE_IMP_EVICTED || + imp->imp_connected) { + spin_unlock(&imp->imp_lock); + CERROR("already connecting\n"); + RETURN(-EALREADY); + } + + import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING); + + imp->imp_conn_cnt++; + imp->imp_resend_replay = 0; + + if (!lustre_handle_is_used(&imp->imp_remote_handle)) + initial_connect = 1; + else + committed_before_reconnect = imp->imp_peer_committed_transno; + + set_transno = ptlrpc_first_transno(imp, + &imp->imp_connect_data.ocd_transno); + spin_unlock(&imp->imp_lock); + + rc = import_select_connection(imp); + if (rc) + GOTO(out, rc); + + rc = sptlrpc_import_sec_adapt(imp, NULL, NULL); + if (rc) + GOTO(out, rc); + + /* Reset connect flags to the originally requested flags, in case + * the server is updated on-the-fly we will get the new features. */ + ocd = imp->imp_connect_data; + ocd.ocd_connect_flags = imp->imp_connect_flags_orig; + ocd.ocd_connect_flags2 = imp->imp_connect_flags2_orig; + /* Reset ocd_version each time so the server knows the exact versions */ + ocd.ocd_version = LUSTRE_VERSION_CODE; + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; + + rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd, + &obd->obd_uuid, &ocd, NULL); + if (rc) + GOTO(out, rc); + + request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT); + if (request == NULL) + GOTO(out, rc = -ENOMEM); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(request); + if (rc < 0) { + ptlrpc_request_free(request); + GOTO(out, rc); + } + + bufs[5] = request->rq_sepol; + + req_capsule_set_size(&request->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(request->rq_sepol) ? + strlen(request->rq_sepol) + 1 : 0); + + rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION, + imp->imp_connect_op, bufs, NULL); + if (rc) { + ptlrpc_request_free(request); + GOTO(out, rc); + } + + /* Report the rpc service time to the server so that it knows how long + * to wait for clients to join recovery */ + lustre_msg_set_service_timeout(request->rq_reqmsg, + at_timeout2est(request->rq_timeout)); + + /* The amount of time we give the server to process the connect req. + * import_select_connection will increase the net latency on + * repeated reconnect attempts to cover slow networks. 
+ * We override/ignore the server rpc completion estimate here, + * which may be large if this is a reconnect attempt */ + request->rq_timeout = INITIAL_CONNECT_TIMEOUT; + lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout); + + request->rq_no_resend = request->rq_no_delay = 1; + request->rq_send_state = LUSTRE_IMP_CONNECTING; + /* Allow a slightly larger reply for future growth compatibility */ + req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER, + sizeof(struct obd_connect_data)+16*sizeof(__u64)); + ptlrpc_request_set_replen(request); + request->rq_interpret_reply = ptlrpc_connect_interpret; + + aa = ptlrpc_req_async_args(aa, request); + memset(aa, 0, sizeof *aa); + + aa->pcaa_peer_committed = committed_before_reconnect; + aa->pcaa_initial_connect = initial_connect; + + if (aa->pcaa_initial_connect) { + spin_lock(&imp->imp_lock); + imp->imp_replayable = 1; + spin_unlock(&imp->imp_lock); + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_INITIAL); + } + + if (set_transno) + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_TRANSNO); + + DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)", + request->rq_timeout); + ptlrpcd_add_req(request); + rc = 0; +out: + if (rc != 0) + import_set_state(imp, LUSTRE_IMP_DISCON); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_connect_import); + +static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) +{ + int force_verify; + + spin_lock(&imp->imp_lock); + force_verify = imp->imp_force_verify != 0; + spin_unlock(&imp->imp_lock); + + if (force_verify) + ptlrpc_pinger_wake_up(); +} + +static int ptlrpc_busy_reconnect(int rc) +{ + return (rc == -EBUSY) || (rc == -EAGAIN); +} + +static int ptlrpc_connect_set_flags(struct obd_import *imp, + struct obd_connect_data *ocd, + __u64 old_connect_flags, + struct obd_export *exp, int init_connect) +{ + static bool warned; + struct client_obd *cli = &imp->imp_obd->u.cli; + + spin_lock(&imp->imp_lock); + list_move(&imp->imp_conn_current->oic_item, + &imp->imp_conn_list); + imp->imp_last_success_conn = + imp->imp_conn_current->oic_last_attempt; + + spin_unlock(&imp->imp_lock); + + /* + * We should warn on very new servers, but don't block. This + * ensures forward compatibility and preserves the kernel + * warning found in other Lustre versions. + */ + if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version > LUSTRE_VERSION_CODE + LUSTRE_VERSION_OFFSET_WARN)) { + const char *newer = "newer than client. Consider upgrading client"; + + if (!warned) { + LCONSOLE_WARN("Client version (%s). Server %s version (%d.%d.%d.%d) is much %s\n", + LUSTRE_VERSION_STRING, + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + newer); + warned = true; + } + } + + /* + * Block old servers by default. This prevents the LTS Client and + * LTS - 2 Server mismatch, unless users pass a mount flag. + */ + if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version < LUSTRE_VERSION_CODE -LUSTRE_VERSION_OFFSET_WARN)) { + const char *older = "older than client. Consider upgrading server"; + + if (!warned || !allow_version_mismatch) { + LCONSOLE_WARN("Client version (%s). 
Server %s version (%d.%d.%d.%d) is much %s\n", + LUSTRE_VERSION_STRING, + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + older); + warned = true; + } + + if (!allow_version_mismatch) + return -EPROTO; + } + + if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) { + /* We sent to the server ocd_cksum_types with bits set + * for algorithms we understand. The server masked off + * the checksum types it doesn't support */ + if ((ocd->ocd_cksum_types & + obd_cksum_types_supported_client()) == 0) { + LCONSOLE_ERROR("The negotiation of the checksum " + "alogrithm to use with server %s " + "failed (%x/%x)\n", + obd2cli_tgt(imp->imp_obd), + ocd->ocd_cksum_types, + obd_cksum_types_supported_client()); + return -EPROTO; + } else { + cli->cl_supp_cksum_types = ocd->ocd_cksum_types; + } + } else { + /* The server does not support OBD_CONNECT_CKSUM. + * Enforce ADLER for backward compatibility*/ + cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; + } + cli->cl_cksum_type = obd_cksum_type_select(imp->imp_obd->obd_name, + cli->cl_supp_cksum_types, + cli->cl_preferred_cksum_type); + + if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) + cli->cl_max_pages_per_rpc = + min(ocd->ocd_brw_size >> PAGE_SHIFT, + cli->cl_max_pages_per_rpc); + else if (imp->imp_connect_op == MDS_CONNECT || + imp->imp_connect_op == MGS_CONNECT) + cli->cl_max_pages_per_rpc = 1; + + LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) && + (cli->cl_max_pages_per_rpc > 0)); + + client_adjust_max_dirty(cli); + + /* Update client max modify RPCs in flight with value returned + * by the server */ + if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) + cli->cl_max_mod_rpcs_in_flight = min( + cli->cl_max_mod_rpcs_in_flight, + ocd->ocd_maxmodrpcs); + else + cli->cl_max_mod_rpcs_in_flight = 1; + + /* Reset ns_connect_flags only for initial connect. It might be + * changed in while using FS and if we reset it in reconnect + * this leads to losing user settings done before such as + * disable lru_resize, etc. */ + if (old_connect_flags != exp_connect_flags(exp) || init_connect) { + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + __u64 changed_flags; + + changed_flags = + ns->ns_connect_flags ^ ns->ns_orig_connect_flags; + CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server " + "flags: %#llx\n", imp->imp_obd->obd_name, + ocd->ocd_connect_flags); + ns->ns_connect_flags = (ns->ns_connect_flags & changed_flags) | + (ocd->ocd_connect_flags & ~changed_flags); + ns->ns_orig_connect_flags = ocd->ocd_connect_flags; + } + + if (ocd->ocd_connect_flags & OBD_CONNECT_AT) + /* We need a per-message support flag, because + * a. we don't know if the incoming connect reply + * supports AT or not (in reply_in_callback) + * until we unpack it. + * b. failovered server means export and flags are gone + * (in ptlrpc_send_reply). + * Can only be set when we know AT is supported at + * both ends */ + imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT; + else + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + + imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18; + + return 0; +} + +/** + * Add all replay requests back to unreplied list before start replay, + * so that we can make sure the known replied XID is always increased + * only even if when replaying requests. 
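The checksum handling above is effectively a bitmask intersection: the server returns the subset of the client's advertised types that it supports, and an empty intersection fails the connect with -EPROTO. A standalone sketch with illustrative bit values (not the real OBD_CKSUM_* constants):

/* Standalone sketch (userspace) of the checksum-type negotiation. */
#include <stdio.h>

#define DEMO_CKSUM_CRC32   0x1
#define DEMO_CKSUM_ADLER   0x2
#define DEMO_CKSUM_CRC32C  0x4

static int negotiate(unsigned int client_types, unsigned int server_types,
                     unsigned int *agreed)
{
        unsigned int common = client_types & server_types;

        if (common == 0)
                return -1;              /* -EPROTO in the real code */
        *agreed = common;
        return 0;
}

int main(void)
{
        unsigned int agreed = 0;

        if (negotiate(DEMO_CKSUM_ADLER | DEMO_CKSUM_CRC32C,
                      DEMO_CKSUM_CRC32 | DEMO_CKSUM_CRC32C, &agreed) == 0)
                printf("agreed mask: 0x%x\n", agreed);  /* prints 0x4 */
        return 0;
}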
+ */ +static void ptlrpc_prepare_replay(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + if (imp->imp_state != LUSTRE_IMP_REPLAY || + imp->imp_resend_replay) + return; + + /* If the server was restart during repaly, the requests may + * have been added to the unreplied list in former replay. */ + spin_lock(&imp->imp_lock); + + list_for_each_entry(req, &imp->imp_committed_list, rq_replay_list) { + if (list_empty(&req->rq_unreplied_list)) + ptlrpc_add_unreplied(req); + } + + list_for_each_entry(req, &imp->imp_replay_list, rq_replay_list) { + if (list_empty(&req->rq_unreplied_list)) + ptlrpc_add_unreplied(req); + } + + imp->imp_known_replied_xid = ptlrpc_known_replied_xid(imp); + spin_unlock(&imp->imp_lock); +} + +/** + * interpret_reply callback for connect RPCs. + * Looks into returned status of connect operation and decides + * what to do with the import - i.e enter recovery, promote it to + * full state for normal operations of disconnect it due to an error. + */ +static int ptlrpc_connect_interpret(const struct lu_env *env, + struct ptlrpc_request *request, + void *data, int rc) +{ + struct ptlrpc_connect_async_args *aa = data; + struct obd_import *imp = request->rq_import; + struct lustre_handle old_hdl; + __u64 old_connect_flags; + timeout_t service_timeout; + int msg_flags; + struct obd_connect_data *ocd; + struct obd_export *exp = NULL; + int ret; + ENTRY; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + imp->imp_connect_tried = 1; + spin_unlock(&imp->imp_lock); + RETURN(0); + } + + imp->imp_connect_error = rc; + if (rc) { + struct ptlrpc_request *free_req; + struct ptlrpc_request *tmp; + + /* abort all delayed requests initiated connection */ + list_for_each_entry_safe(free_req, tmp, &imp->imp_delayed_list, + rq_list) { + spin_lock(&free_req->rq_lock); + if (free_req->rq_no_resend) { + free_req->rq_err = 1; + free_req->rq_status = -EIO; + ptlrpc_client_wake_req(free_req); + } + spin_unlock(&free_req->rq_lock); + } + + /* if this reconnect to busy export - not need select new target + * for connecting*/ + imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc); + spin_unlock(&imp->imp_lock); + GOTO(out, rc); + } + + /* LU-7558: indicate that we are interpretting connect reply, + * pltrpc_connect_import() will not try to reconnect until + * interpret will finish. */ + imp->imp_connected = 1; + spin_unlock(&imp->imp_lock); + + LASSERT(imp->imp_conn_current); + + msg_flags = lustre_msg_get_op_flags(request->rq_repmsg); + + ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA, + RCL_SERVER); + /* server replied obd_connect_data is always bigger */ + ocd = req_capsule_server_sized_get(&request->rq_pill, + &RMF_CONNECT_DATA, ret); + + if (ocd == NULL) { + CERROR("%s: no connect data from server\n", + imp->imp_obd->obd_name); + rc = -EPROTO; + GOTO(out, rc); + } + + spin_lock(&imp->imp_lock); + + /* All imports are pingable */ + imp->imp_pingable = 1; + imp->imp_force_reconnect = 0; + imp->imp_force_verify = 0; + + imp->imp_connect_data = *ocd; + + CDEBUG(D_HA, "%s: connect to target with instance %u\n", + imp->imp_obd->obd_name, ocd->ocd_instance); + exp = class_conn2export(&imp->imp_dlm_handle); + + spin_unlock(&imp->imp_lock); + + if (!exp) { + /* This could happen if export is cleaned during the + connect attempt */ + CERROR("%s: missing export after connect\n", + imp->imp_obd->obd_name); + GOTO(out, rc = -ENODEV); + } + + /* check that server granted subset of flags we asked for. 
*/ + if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) != + ocd->ocd_connect_flags) { + CERROR("%s: Server didn't grant requested subset of flags: " + "asked=%#llx granted=%#llx\n", + imp->imp_obd->obd_name, imp->imp_connect_flags_orig, + ocd->ocd_connect_flags); + GOTO(out, rc = -EPROTO); + } + + if ((ocd->ocd_connect_flags2 & imp->imp_connect_flags2_orig) != + ocd->ocd_connect_flags2) { + CERROR("%s: Server didn't grant requested subset of flags2: " + "asked=%#llx granted=%#llx\n", + imp->imp_obd->obd_name, imp->imp_connect_flags2_orig, + ocd->ocd_connect_flags2); + GOTO(out, rc = -EPROTO); + } + + if (!(imp->imp_connect_flags_orig & OBD_CONNECT_LIGHTWEIGHT) && + (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) && + (imp->imp_connect_flags_orig & OBD_CONNECT_FID) && + (ocd->ocd_connect_flags & OBD_CONNECT_VERSION)) { + __u32 major = OBD_OCD_VERSION_MAJOR(ocd->ocd_version); + __u32 minor = OBD_OCD_VERSION_MINOR(ocd->ocd_version); + __u32 patch = OBD_OCD_VERSION_PATCH(ocd->ocd_version); + + /* We do not support the MDT-MDT interoperations with + * different version MDT because of protocol changes. */ + if (unlikely(major != LUSTRE_MAJOR || + minor != LUSTRE_MINOR || + abs(patch - LUSTRE_PATCH) > 3)) { + LCONSOLE_WARN("%s: import %p (%u.%u.%u.%u) tried the " + "connection to different version MDT " + "(%d.%d.%d.%d) %s\n", + imp->imp_obd->obd_name, imp, LUSTRE_MAJOR, + LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX, + major, minor, patch, + OBD_OCD_VERSION_FIX(ocd->ocd_version), + imp->imp_connection->c_remote_uuid.uuid); + + GOTO(out, rc = -EPROTO); + } + } + + old_connect_flags = exp_connect_flags(exp); + exp->exp_connect_data = *ocd; + imp->imp_obd->obd_self_export->exp_connect_data = *ocd; + + /* The net statistics after (re-)connect is not valid anymore, + * because may reflect other routing, etc. + */ + service_timeout = lustre_msg_get_service_timeout(request->rq_repmsg); + at_reinit(&imp->imp_at.iat_net_latency, 0, 0); + ptlrpc_at_adj_net_latency(request, service_timeout); + + /* Import flags should be updated before waking import at FULL state */ + rc = ptlrpc_connect_set_flags(imp, ocd, old_connect_flags, exp, + aa->pcaa_initial_connect); + class_export_put(exp); + exp = NULL; + + if (rc != 0) + GOTO(out, rc); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD); + + if (aa->pcaa_initial_connect) { + spin_lock(&imp->imp_lock); + if (msg_flags & MSG_CONNECT_REPLAYABLE) { + imp->imp_replayable = 1; + CDEBUG(D_HA, "connected to replayable target: %s\n", + obd2cli_tgt(imp->imp_obd)); + } else { + imp->imp_replayable = 0; + } + + /* if applies, adjust the imp->imp_msg_magic here + * according to reply flags + */ + + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + + /* Initial connects are allowed for clients with non-random + * uuids when servers are in recovery. Simply signal the + * servers replay is complete and wait in REPLAY_WAIT. + */ + if (msg_flags & MSG_CONNECT_RECOVERING) { + CDEBUG(D_HA, "connect to %s during recovery\n", + obd2cli_tgt(imp->imp_obd)); + import_set_state_nolock(imp, LUSTRE_IMP_REPLAY_LOCKS); + spin_unlock(&imp->imp_lock); + } else { + spin_unlock(&imp->imp_lock); + ptlrpc_activate_import(imp, true); + } + + GOTO(finish, rc = 0); + } + + /* Determine what recovery state to move the import to. 
*/ + if (MSG_CONNECT_RECONNECT & msg_flags) { + memset(&old_hdl, 0, sizeof(old_hdl)); + if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg), + sizeof(old_hdl))) { + LCONSOLE_WARN("Reconnect to %s (at @%s) failed due " + "bad handle %#llx\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_dlm_handle.cookie); + GOTO(out, rc = -ENOTCONN); + } + + if (memcmp(&imp->imp_remote_handle, + lustre_msg_get_handle(request->rq_repmsg), + sizeof(imp->imp_remote_handle))) { + int level = msg_flags & MSG_CONNECT_RECOVERING ? + D_HA : D_WARNING; + + /* Bug 16611/14775: if server handle have changed, + * that means some sort of disconnection happened. + * If the server is not in recovery, that also means it + * already erased all of our state because of previous + * eviction. If it is in recovery - we are safe to + * participate since we can reestablish all of our state + * with server again + */ + if ((MSG_CONNECT_RECOVERING & msg_flags)) { + CDEBUG_LIMIT(level, + "%s@%s changed server handle from " + "%#llx to %#llx" + " but is still in recovery\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, + lustre_msg_get_handle( + request->rq_repmsg)->cookie); + } else { + LCONSOLE_WARN("Evicted from %s (at %s) " + "after server handle changed from " + "%#llx to %#llx\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection-> + c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, + lustre_msg_get_handle( + request->rq_repmsg)->cookie); + } + + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + + if (!(MSG_CONNECT_RECOVERING & msg_flags)) { + import_set_state(imp, LUSTRE_IMP_EVICTED); + GOTO(finish, rc = 0); + } + } else { + CDEBUG(D_HA, "reconnected to %s@%s after partition\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + } + + if (imp->imp_invalid) { + CDEBUG(D_HA, "%s: reconnected but import is invalid; " + "marking evicted\n", imp->imp_obd->obd_name); + import_set_state(imp, LUSTRE_IMP_EVICTED); + } else if (MSG_CONNECT_RECOVERING & msg_flags) { + CDEBUG(D_HA, "%s: reconnected to %s during replay\n", + imp->imp_obd->obd_name, + obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); + imp->imp_resend_replay = 1; + spin_unlock(&imp->imp_lock); + + import_set_state(imp, imp->imp_replay_state); + } else { + import_set_state(imp, LUSTRE_IMP_RECOVER); + } + } else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) { + LASSERT(imp->imp_replayable); + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + imp->imp_last_replay_transno = 0; + imp->imp_replay_cursor = &imp->imp_committed_list; + import_set_state(imp, LUSTRE_IMP_REPLAY); + } else if ((ocd->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0 && + !imp->imp_invalid) { + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); + /* The below message is checked in recovery-small.sh test_106 */ + DEBUG_REQ(D_HA, request, "%s: lwp recover", + imp->imp_obd->obd_name); + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + import_set_state(imp, LUSTRE_IMP_RECOVER); + } else { + DEBUG_REQ(D_HA, request, + "%s: evicting (reconnect/recover flags not set: %x)", + imp->imp_obd->obd_name, msg_flags); + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + import_set_state(imp, LUSTRE_IMP_EVICTED); + } + + /* Sanity checks for a reconnected import. 
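Condensing the reconnect branch above: an unchanged remote handle means a healed network partition, so the import moves to RECOVER; a changed handle means the old server state is gone, so the client replays if the server is still in recovery and treats itself as evicted otherwise. A standalone sketch of that decision (the names are illustrative, and the invalid-import and zero-handle cases are omitted):

/* Standalone sketch (userspace) of the reconnect decision on a
 * MSG_CONNECT_RECONNECT reply. */
#include <stdio.h>
#include <stdbool.h>

enum demo_next { DEMO_RECOVER, DEMO_REPLAY, DEMO_EVICTED };

static enum demo_next reconnect_decision(bool handle_changed,
                                         bool server_recovering)
{
        if (!handle_changed)
                return DEMO_RECOVER;    /* simple partition healed */
        return server_recovering ? DEMO_REPLAY : DEMO_EVICTED;
}

int main(void)
{
        printf("%d %d %d\n",
               reconnect_decision(false, false),        /* 0: RECOVER */
               reconnect_decision(true, true),          /* 1: REPLAY  */
               reconnect_decision(true, false));        /* 2: EVICTED */
        return 0;
}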
*/ + if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) + CERROR("imp_replayable flag does not match server after reconnect. We should LBUG right here.\n"); + + if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 && + lustre_msg_get_last_committed(request->rq_repmsg) < + aa->pcaa_peer_committed) { + static bool printed; + + /* The below message is checked in recovery-small.sh test_54 */ + CERROR("%s: went back in time (transno %lld was previously committed, server now claims %lld)!\n", + obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed, + lustre_msg_get_last_committed(request->rq_repmsg)); + if (!printed) { + CERROR("For further information, see http://doc.lustre.org/lustre_manual.xhtml#went_back_in_time\n"); + printed = true; + } + } + +finish: + ptlrpc_prepare_replay(imp); + rc = ptlrpc_import_recovery_state_machine(imp); + if (rc == -ENOTCONN) { + CDEBUG(D_HA, + "evicted/aborted by %s@%s during recovery; invalidating and reconnecting\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + ptlrpc_connect_import(imp); + spin_lock(&imp->imp_lock); + imp->imp_connected = 0; + imp->imp_connect_tried = 1; + spin_unlock(&imp->imp_lock); + RETURN(0); + } + +out: + if (exp != NULL) + class_export_put(exp); + + spin_lock(&imp->imp_lock); + imp->imp_connected = 0; + imp->imp_connect_tried = 1; + + if (rc != 0) { + bool inact = false; + time64_t now = ktime_get_seconds(); + time64_t next_connect; + + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + if (rc == -EACCES) { + /* + * Give up trying to reconnect + * EACCES means client has no permission for connection + */ + imp->imp_obd->obd_no_recov = 1; + ptlrpc_deactivate_import_nolock(imp); + inact = true; + } else if (rc == -EPROTO) { + struct obd_connect_data *ocd; + + /* reply message might not be ready */ + if (request->rq_repmsg == NULL) { + spin_unlock(&imp->imp_lock); + RETURN(-EPROTO); + } + + ocd = req_capsule_server_get(&request->rq_pill, + &RMF_CONNECT_DATA); + /* Servers are not supposed to refuse connections from + * clients based on version, only connection feature + * flags. We should never see this from llite, but it + * may be useful for debugging in the future. */ + if (ocd && + (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version != LUSTRE_VERSION_CODE)) { + LCONSOLE_ERROR_MSG(0x16a, "Server %s version " + "(%d.%d.%d.%d)" + " refused connection from this client " + "with an incompatible version (%s). " + "Client must be recompiled\n", + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + LUSTRE_VERSION_STRING); + ptlrpc_deactivate_import_nolock(imp); + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); + inact = true; + } + } else if (rc == -ENODEV || rc == -ETIMEDOUT) { + /* ENODEV means there is no service, force reconnection + * to a pair if attempt happen ptlrpc_next_reconnect + * before now. ETIMEDOUT could be set during network + * error and do not guarantee request deadline happened. 
+ */ + struct obd_import_conn *conn; + time64_t reconnect_time; + + /* Same as ptlrpc_next_reconnect, but in past */ + reconnect_time = now - INITIAL_CONNECT_TIMEOUT; + list_for_each_entry(conn, &imp->imp_conn_list, + oic_item) { + if (conn->oic_last_attempt <= reconnect_time) { + imp->imp_force_verify = 1; + break; + } + } + } + + next_connect = imp->imp_conn_current->oic_last_attempt + + (request->rq_deadline - request->rq_sent); + spin_unlock(&imp->imp_lock); + + if (inact) + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + + if (rc == -EPROTO) + RETURN(rc); + + /* adjust imp_next_ping to request deadline + 1 and reschedule + * a pinger if import lost processing during CONNECTING or far + * away from request deadline. It could happen when connection + * was initiated outside of pinger, like + * ptlrpc_set_import_discon(). + */ + if (!imp->imp_force_verify && (imp->imp_next_ping <= now || + imp->imp_next_ping > next_connect)) { + imp->imp_next_ping = max(now, next_connect) + 1; + ptlrpc_pinger_wake_up(); + } + + ptlrpc_maybe_ping_import_soon(imp); + + CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", + obd2cli_tgt(imp->imp_obd), + (char *)imp->imp_connection->c_remote_uuid.uuid, rc); + } else { + spin_unlock(&imp->imp_lock); + } + + wake_up(&imp->imp_recovery_waitq); + RETURN(rc); +} + +/** + * interpret callback for "completed replay" RPCs. + * \see signal_completed_replay + */ +static int completed_replay_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + ENTRY; + atomic_dec(&req->rq_import->imp_replay_inflight); + if (req->rq_status == 0 && !req->rq_import->imp_vbr_failed) { + ptlrpc_import_recovery_state_machine(req->rq_import); + } else { + if (req->rq_import->imp_vbr_failed) { + CDEBUG(D_WARNING, + "%s: version recovery fails, reconnecting\n", + req->rq_import->imp_obd->obd_name); + } else { + CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, " + "reconnecting\n", + req->rq_import->imp_obd->obd_name, + req->rq_status); + } + ptlrpc_connect_import(req->rq_import); + } + + RETURN(0); +} + +/** + * Let server know that we have no requests to replay anymore. 
+ * Achieved by just sending a PING request + */ +static int signal_completed_replay(struct obd_import *imp) +{ + struct ptlrpc_request *req; + ENTRY; + + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY))) + RETURN(0); + + if (!atomic_add_unless(&imp->imp_replay_inflight, 1, 1)) + RETURN(0); + + req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION, + OBD_PING); + if (req == NULL) { + atomic_dec(&imp->imp_replay_inflight); + RETURN(-ENOMEM); + } + + ptlrpc_request_set_replen(req); + req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT; + lustre_msg_add_flags(req->rq_reqmsg, + MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE); + if (AT_OFF) + req->rq_timeout *= 3; + req->rq_interpret_reply = completed_replay_interpret; + + ptlrpcd_add_req(req); + RETURN(0); +} + +/** + * In kernel code all import invalidation happens in its own + * separate thread, so that whatever application happened to encounter + * a problem could still be killed or otherwise continue + */ +static int ptlrpc_invalidate_import_thread(void *data) +{ + struct obd_import *imp = data; + + ENTRY; + unshare_fs_struct(); + CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + if (do_dump_on_eviction(imp->imp_obd)) { + CERROR("dump the log upon eviction\n"); + libcfs_debug_dumplog(); + } + + ptlrpc_invalidate_import(imp); + import_set_state(imp, LUSTRE_IMP_RECOVER); + ptlrpc_import_recovery_state_machine(imp); + + class_import_put(imp); + RETURN(0); +} + +/** + * This is the state machine for client-side recovery on import. + * + * Typicaly we have two possibly paths. If we came to server and it is not + * in recovery, we just enter IMP_EVICTED state, invalidate our import + * state and reconnect from scratch. + * If we came to server that is in recovery, we enter IMP_REPLAY import state. + * We go through our list of requests to replay and send them to server one by + * one. + * After sending all request from the list we change import state to + * IMP_REPLAY_LOCKS and re-request all the locks we believe we have from server + * and also all the locks we don't yet have and wait for server to grant us. + * After that we send a special "replay completed" request and change import + * state to IMP_REPLAY_WAIT. + * Upon receiving reply to that "replay completed" RPC we enter IMP_RECOVER + * state and resend all requests from sending list. + * After that we promote import to FULL state and send all delayed requests + * and import is fully operational after that. 
+ * + */ +int ptlrpc_import_recovery_state_machine(struct obd_import *imp) +{ + int rc = 0; + int inflight; + char *target_start; + int target_len; + + ENTRY; + if (imp->imp_state == LUSTRE_IMP_EVICTED) { + struct task_struct *task; + + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + /* Don't care about MGC eviction */ + if (strcmp(imp->imp_obd->obd_type->typ_name, + LUSTRE_MGC_NAME) != 0) { + LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted " + "by %.*s; in progress operations " + "using this service will fail.\n", + imp->imp_obd->obd_name, target_len, + target_start); + LASSERTF(!obd_lbug_on_eviction, "LBUG upon eviction\n"); + } + CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + /* reset vbr_failed flag upon eviction */ + spin_lock(&imp->imp_lock); + imp->imp_vbr_failed = 0; + spin_unlock(&imp->imp_lock); + + /* bug 17802: XXX client_disconnect_export vs connect request + * race. if client is evicted at this time then we start + * invalidate thread without reference to import and import can + * be freed at same time. */ + class_import_get(imp); + task = kthread_run(ptlrpc_invalidate_import_thread, imp, + "ll_imp_inval"); + if (IS_ERR(task)) { + class_import_put(imp); + rc = PTR_ERR(task); + CERROR("%s: can't start invalidate thread: rc = %d\n", + imp->imp_obd->obd_name, rc); + } else { + rc = 0; + } + RETURN(rc); + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY) { + CDEBUG(D_HA, "replay requested by %s\n", + obd2cli_tgt(imp->imp_obd)); + rc = ptlrpc_replay_next(imp, &inflight); + if (inflight == 0 && + atomic_read(&imp->imp_replay_inflight) == 0) { + import_set_state(imp, LUSTRE_IMP_REPLAY_LOCKS); + rc = ldlm_replay_locks(imp); + if (rc) + GOTO(out, rc); + } + rc = 0; + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) { + if (atomic_read(&imp->imp_replay_inflight) == 0) { + import_set_state(imp, LUSTRE_IMP_REPLAY_WAIT); + rc = signal_completed_replay(imp); + if (rc) + GOTO(out, rc); + } + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) { + if (atomic_read(&imp->imp_replay_inflight) == 0) { + import_set_state(imp, LUSTRE_IMP_RECOVER); + } + } + + if (imp->imp_state == LUSTRE_IMP_RECOVER) { + struct ptlrpc_connection *conn = imp->imp_connection; + + rc = ptlrpc_resend(imp); + if (rc) + GOTO(out, rc); + ptlrpc_activate_import(imp, true); + + /* Reverse import are flagged with dlm_fake == 1. + * They do not do recovery and connection are not "restored". + */ + if (!imp->imp_dlm_fake) + CDEBUG_LIMIT(imp->imp_was_idle ? 
+ imp->imp_idle_debug : D_CONSOLE, + "%s: Connection restored to %s (at %s)\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + obd_import_nid2str(imp)); + spin_lock(&imp->imp_lock); + imp->imp_was_idle = 0; + spin_unlock(&imp->imp_lock); + } + + if (imp->imp_state == LUSTRE_IMP_FULL) { + wake_up(&imp->imp_recovery_waitq); + ptlrpc_wake_delayed(imp); + } + +out: + RETURN(rc); +} + +static struct ptlrpc_request *ptlrpc_disconnect_prep_req(struct obd_import *imp) +{ + struct ptlrpc_request *req; + int rq_opc, rc = 0; + ENTRY; + + switch (imp->imp_connect_op) { + case OST_CONNECT: + rq_opc = OST_DISCONNECT; + break; + case MDS_CONNECT: + rq_opc = MDS_DISCONNECT; + break; + case MGS_CONNECT: + rq_opc = MGS_DISCONNECT; + break; + default: + rc = -EINVAL; + CERROR("%s: don't know how to disconnect from %s " + "(connect_op %d): rc = %d\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connect_op, rc); + RETURN(ERR_PTR(rc)); + } + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT, + LUSTRE_OBD_VERSION, rq_opc); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + /* We are disconnecting, do not retry a failed DISCONNECT rpc if + * it fails. We can get through the above with a down server + * if the client doesn't know the server is gone yet. */ + req->rq_no_resend = 1; + + /* We want client umounts to happen quickly, no matter the + server state... */ + req->rq_timeout = min_t(timeout_t, req->rq_timeout, + INITIAL_CONNECT_TIMEOUT); + + req->rq_send_state = LUSTRE_IMP_CONNECTING; + ptlrpc_request_set_replen(req); + + RETURN(req); +} + +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) +{ + struct ptlrpc_request *req; + int rc = 0; + ENTRY; + + if (imp->imp_obd->obd_force) + GOTO(set_state, rc); + + /* probably the import has been disconnected already being idle */ + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_IDLE) + GOTO(out, rc); + spin_unlock(&imp->imp_lock); + + if (ptlrpc_import_in_recovery(imp)) { + long timeout_jiffies; + time64_t timeout; + + if (AT_OFF) { + if (imp->imp_server_timeout) + timeout = obd_timeout >> 1; + else + timeout = obd_timeout; + } else { + u32 req_portal; + int idx; + + req_portal = imp->imp_client->cli_request_portal; + idx = import_at_get_index(imp, req_portal); + timeout = at_get(&imp->imp_at.iat_service_estimate[idx]); + } + + timeout_jiffies = cfs_time_seconds(timeout); + if (wait_event_idle_timeout(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), + timeout_jiffies) == 0 && + l_wait_event_abortable(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp)) < 0) + rc = -EINTR; + } + + req = ptlrpc_disconnect_prep_req(imp); + if (IS_ERR(req)) + GOTO(set_state, rc = PTR_ERR(req)); + + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_FULL) { + ptlrpc_req_finished_with_imp_lock(req); + GOTO(out, rc); + } + import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING); + spin_unlock(&imp->imp_lock); + + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + +set_state: + spin_lock(&imp->imp_lock); +out: + if (noclose) + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + else + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); + memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); + if (!noclose) + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + + if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ESHUTDOWN) + rc = 0; + RETURN(rc); +} 
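/*
 * Illustrative sketch only (editorial aside, not part of this patch):
 * the recovery path documented above walks an import through a fixed
 * sequence of LUSTRE_IMP_* states.  Assuming a local enum that merely
 * mirrors those names (the real definitions live in the Lustre
 * headers), the progression once each state's work has drained could
 * be summarised as:
 */
enum imp_state_sketch {
	SK_IMP_EVICTED,		/* server erased our state: invalidate, then recover */
	SK_IMP_REPLAY,		/* resend committed-but-unreplied requests */
	SK_IMP_REPLAY_LOCKS,	/* re-enqueue the LDLM locks we believe we hold */
	SK_IMP_REPLAY_WAIT,	/* "replay completed" ping sent, waiting for reply */
	SK_IMP_RECOVER,		/* resend requests from the sending list */
	SK_IMP_FULL,		/* fully operational, delayed requests released */
};

static enum imp_state_sketch imp_next_state_sketch(enum imp_state_sketch s)
{
	switch (s) {
	case SK_IMP_EVICTED:		return SK_IMP_RECOVER;
	case SK_IMP_REPLAY:		return SK_IMP_REPLAY_LOCKS;
	case SK_IMP_REPLAY_LOCKS:	return SK_IMP_REPLAY_WAIT;
	case SK_IMP_REPLAY_WAIT:	return SK_IMP_RECOVER;
	case SK_IMP_RECOVER:		return SK_IMP_FULL;
	default:			return SK_IMP_FULL;
	}
}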
+EXPORT_SYMBOL(ptlrpc_disconnect_import); + +static void ptlrpc_reset_reqs_generation(struct obd_import *imp) +{ + struct ptlrpc_request *old, *tmp; + + /* tag all resendable requests generated before disconnection + * notice this code is part of disconnect-at-idle path only */ + list_for_each_entry_safe(old, tmp, &imp->imp_delayed_list, + rq_list) { + spin_lock(&old->rq_lock); + if (old->rq_import_generation == imp->imp_generation - 1 && + ((imp->imp_initiated_at == imp->imp_generation) || + !old->rq_no_resend)) + old->rq_import_generation = imp->imp_generation; + spin_unlock(&old->rq_lock); + } +} + +static int ptlrpc_disconnect_idle_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + struct obd_import *imp = req->rq_import; + int connect = 0; + + DEBUG_REQ(D_HA, req, "inflight=%d, refcount=%d: rc = %d", + atomic_read(&imp->imp_inflight), + refcount_read(&imp->imp_refcount), rc); + + spin_lock(&imp->imp_lock); + /* DISCONNECT reply can be late and another connection can just + * be initiated. so we have to abort disconnection. */ + if (req->rq_import_generation == imp->imp_generation && + imp->imp_state != LUSTRE_IMP_CLOSED) { + LASSERTF(imp->imp_state == LUSTRE_IMP_CONNECTING, + "%s\n", ptlrpc_import_state_name(imp->imp_state)); + memset(&imp->imp_remote_handle, 0, + sizeof(imp->imp_remote_handle)); + /* take our DISCONNECT into account */ + if (atomic_read(&imp->imp_reqs) > 1) { + imp->imp_generation++; + imp->imp_initiated_at = imp->imp_generation; + import_set_state_nolock(imp, LUSTRE_IMP_NEW); + ptlrpc_reset_reqs_generation(imp); + connect = 1; + } else { + /* do not expose transient IDLE state */ + import_set_state_nolock(imp, LUSTRE_IMP_IDLE); + } + } + + if (connect) { + rc = ptlrpc_connect_import_locked(imp); + if (rc >= 0) + ptlrpc_pinger_add_import(imp); + } else { + spin_unlock(&imp->imp_lock); + } + + return 0; +} + +static bool ptlrpc_can_idle(struct obd_import *imp) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + + /* one request for disconnect rpc */ + if (atomic_read(&imp->imp_reqs) > 1) + return false; + + /* any lock increases ns_bref being a resource holder */ + if (ns && atomic_read(&ns->ns_bref) > 0) + return false; + + return true; +} + +int ptlrpc_disconnect_and_idle_import(struct obd_import *imp) +{ + struct ptlrpc_request *req; + ENTRY; + + if (imp->imp_obd->obd_force) + RETURN(0); + + if (ptlrpc_import_in_recovery(imp)) + RETURN(0); + + req = ptlrpc_disconnect_prep_req(imp); + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + req->rq_interpret_reply = ptlrpc_disconnect_idle_interpret; + + if (OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_IDLE_RACE)) { + __u32 idx; + + server_name2index(imp->imp_obd->obd_name, &idx, NULL); + if (idx == 0) + OBD_RACE(OBD_FAIL_PTLRPC_IDLE_RACE); + } + + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_FULL || !ptlrpc_can_idle(imp)) { + ptlrpc_req_finished_with_imp_lock(req); + spin_unlock(&imp->imp_lock); + RETURN(0); + } + import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING); + /* don't make noise at reconnection */ + imp->imp_was_idle = 1; + spin_unlock(&imp->imp_lock); + + CDEBUG_LIMIT(imp->imp_idle_debug, "%s: disconnect after %llus idle\n", + imp->imp_obd->obd_name, + ktime_get_real_seconds() - imp->imp_last_reply_time); + + ptlrpcd_add_req(req); + + RETURN(1); +} +EXPORT_SYMBOL(ptlrpc_disconnect_and_idle_import); + +void ptlrpc_cleanup_imp(struct obd_import *imp) +{ + ENTRY; + + spin_lock(&imp->imp_lock); + + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); + 
imp->imp_generation++; + ptlrpc_abort_inflight(imp); + + spin_unlock(&imp->imp_lock); + + EXIT; +} + +/* Adaptive Timeout utils */ + +/* Update at_current_timeout with the specified value (bounded by at_min and + * at_max), as well as the AT history "bins". + * - Bin into timeslices using AT_BINS bins. + * - This gives us a max of the last at_history seconds without the storage, + * but still smoothing out a return to normalcy from a slow response. + * - (E.g. remember the maximum latency in each minute of the last 4 minutes.) + */ +timeout_t at_measured(struct adaptive_timeout *at, timeout_t timeout) +{ + timeout_t old_timeout = at->at_current_timeout; + time64_t now = ktime_get_real_seconds(); + long binlimit = max_t(long, at_history / AT_BINS, 1); + + LASSERT(at); + CDEBUG(D_OTHER, "add %u to %p time=%lld v=%u (%u %u %u %u)\n", + timeout, at, now - at->at_binstart, at->at_current_timeout, + at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]); + + if (timeout <= 0) + /* Negative timeouts and 0's don't count, because we never + * want our timeout to drop to 0 or below, and because 0 could + * mean an error + */ + return 0; + + spin_lock(&at->at_lock); + + if (unlikely(at->at_binstart == 0)) { + /* Special case to remove default from history */ + at->at_current_timeout = timeout; + at->at_worst_timeout_ever = timeout; + at->at_worst_timestamp = now; + at->at_hist[0] = timeout; + at->at_binstart = now; + } else if (now - at->at_binstart < binlimit ) { + /* in bin 0 */ + at->at_hist[0] = max_t(timeout_t, timeout, at->at_hist[0]); + at->at_current_timeout = max_t(timeout_t, timeout, + at->at_current_timeout); + } else { + int i, shift; + timeout_t maxv = timeout; + + /* move bins over */ + shift = (u32)(now - at->at_binstart) / binlimit; + LASSERT(shift > 0); + for(i = AT_BINS - 1; i >= 0; i--) { + if (i >= shift) { + at->at_hist[i] = at->at_hist[i - shift]; + maxv = max_t(timeout_t, maxv, at->at_hist[i]); + } else { + at->at_hist[i] = 0; + } + } + at->at_hist[0] = timeout; + at->at_current_timeout = maxv; + at->at_binstart += shift * binlimit; + } + + if (at->at_current_timeout > at->at_worst_timeout_ever) { + at->at_worst_timeout_ever = at->at_current_timeout; + at->at_worst_timestamp = now; + } + + if (at->at_flags & AT_FLG_NOHIST) + /* Only keep last reported val; keeping the rest of the history + * for debugfs only + */ + at->at_current_timeout = timeout; + + if (at_max > 0) + at->at_current_timeout = min_t(timeout_t, + at->at_current_timeout, at_max); + at->at_current_timeout = max_t(timeout_t, at->at_current_timeout, + at_min); + if (at->at_current_timeout != old_timeout) + CDEBUG(D_OTHER, + "AT %p change: old=%u new=%u delta=%d (val=%d) hist %u %u %u %u\n", + at, old_timeout, at->at_current_timeout, + at->at_current_timeout - old_timeout, timeout, + at->at_hist[0], at->at_hist[1], at->at_hist[2], + at->at_hist[3]); + + /* if we changed, report the old timeout value */ + old_timeout = (at->at_current_timeout != old_timeout) ? 
old_timeout : 0; + + spin_unlock(&at->at_lock); + return old_timeout; +} + +/* Find the imp_at index for a given portal; assign if space available */ +int import_at_get_index(struct obd_import *imp, int portal) +{ + struct imp_at *at = &imp->imp_at; + int i; + + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + return i; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not found in list, add it under a lock */ + spin_lock(&imp->imp_lock); + + /* Check unused under lock */ + for (; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + goto out; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not enough portals? */ + LASSERT(i < IMP_AT_MAX_PORTALS); + + at->iat_portal[i] = portal; +out: + spin_unlock(&imp->imp_lock); + return i; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c new file mode 100644 index 0000000000000..9d20c186a6475 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c @@ -0,0 +1,2719 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/layout.c + * + * Lustre Metadata Target (mdt) request handler + * + * Author: Nikita Danilov + */ +/* + * This file contains the "capsule/pill" abstraction layered above PTLRPC. + * + * Every struct ptlrpc_request contains a "pill", which points to a description + * of the format that the request conforms to. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include + +#include +#include +#include +#include + +/* struct ptlrpc_request, lustre_msg* */ +#include +#include +#include + +/* + * RQFs (see below) refer to two struct req_msg_field arrays describing the + * client request and server reply, respectively. + */ +/* empty set of fields... for suitable definition of emptiness. 
*/ +static const struct req_msg_field *empty[] = { + &RMF_PTLRPC_BODY +}; + +static const struct req_msg_field *mgs_target_info_only[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_TARGET_INFO +}; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 18, 53, 0) +static const struct req_msg_field *mgs_set_info[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_SEND_PARAM +}; +#endif + +static const struct req_msg_field *mgs_config_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_CONFIG_BODY +}; + +static const struct req_msg_field *mgs_config_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_CONFIG_RES +}; + +static const struct req_msg_field *mdt_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY +}; + +static const struct req_msg_field *mdt_body_capa[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1 +}; + +static const struct req_msg_field *quotactl_only[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_QUOTACTL +}; + +static const struct req_msg_field *quota_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *ldlm_intent_quota_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *ldlm_intent_quota_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_DLM_LVB, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *mdt_close_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_EPOCH, + &RMF_REC_REINT, + &RMF_CAPA1 +}; + +static const struct req_msg_field *mdt_close_intent_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_EPOCH, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CLOSE_DATA, + &RMF_U32 +}; + +static const struct req_msg_field *obd_statfs_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_STATFS +}; + +static const struct req_msg_field *seq_query_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SEQ_OPC, + &RMF_SEQ_RANGE +}; + +static const struct req_msg_field *seq_query_server[] = { + &RMF_PTLRPC_BODY, + &RMF_SEQ_RANGE +}; + +static const struct req_msg_field *fld_query_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_OPC, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_query_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GENERIC_DATA +}; + +static const struct req_msg_field *mds_getattr_name_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_reint_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT +}; + +static const struct req_msg_field *mds_reint_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_reint_create_slave_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_create_acl_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL, + &RMF_FILE_ENCCTX, +}; + +static const struct req_msg_field *mds_reint_create_sym_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL, + &RMF_FILE_ENCCTX, +}; + +static const struct req_msg_field *mds_reint_open_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + 
&RMF_EADATA, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL, + &RMF_FILE_ENCCTX, +}; + +static const struct req_msg_field *mds_reint_open_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_reint_unlink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_DLM_REQ, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_link_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_DLM_REQ, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_rename_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_migrate_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ, + &RMF_SELINUX_POL, + &RMF_MDT_EPOCH, + &RMF_CLOSE_DATA, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_last_unlink_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_LOGCOOKIES, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_reint_setattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_MDT_EPOCH, + &RMF_EADATA, + &RMF_LOGCOOKIES, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_setxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_resync[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mdt_swap_layouts[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_SWAP_LAYOUTS, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_rmfid_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_FID_ARRAY, + &RMF_CAPA1, + &RMF_CAPA2, +}; + +static const struct req_msg_field *mds_rmfid_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_FID_ARRAY, + &RMF_RCS, +}; + +static const struct req_msg_field *obd_connect_client[] = { + &RMF_PTLRPC_BODY, + &RMF_TGTUUID, + &RMF_CLUUID, + &RMF_CONN, + &RMF_CONNECT_DATA, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *obd_connect_server[] = { + &RMF_PTLRPC_BODY, + &RMF_CONNECT_DATA +}; + +static const struct req_msg_field *obd_set_info_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_SETINFO_VAL +}; + +static const struct req_msg_field *mdt_set_info_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_SETINFO_VAL, + &RMF_MDT_BODY +}; + +static const struct req_msg_field *ost_grant_shrink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *mds_getinfo_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY, + &RMF_GETINFO_VALLEN +}; + +static const struct req_msg_field *mds_getinfo_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_VAL, +}; + +static const struct req_msg_field *ldlm_enqueue_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *ldlm_enqueue_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP +}; + +static const struct req_msg_field *ldlm_enqueue_lvb_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_cp_callback_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_DLM_LVB +}; + 
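/*
 * Illustrative sketch only (editorial aside, not part of this patch):
 * each field array above lists, in wire order, the message buffers of
 * one RPC, with &RMF_PTLRPC_BODY always first.  Using a stripped-down
 * stand-in for struct req_msg_field (the toy_* names below are
 * invented for the example), the index lookup shown here is roughly
 * what the real code caches in rmf_offset[] so that callers such as
 * req_capsule_server_get() can locate a field's buffer:
 */
struct toy_msg_field {
	const char *tmf_name;
};

static const struct toy_msg_field TOY_PTLRPC_BODY = { "ptlrpc_body" };
static const struct toy_msg_field TOY_DLM_REQ     = { "dlm_req" };
static const struct toy_msg_field TOY_DLM_LVB     = { "dlm_lvb" };

/* wire order mirroring ldlm_cp_callback_client[] defined just above */
static const struct toy_msg_field *toy_cp_callback[] = {
	&TOY_PTLRPC_BODY,
	&TOY_DLM_REQ,
	&TOY_DLM_LVB,
};

/* Buffer index of @fld within a format of @nr fields, or -1 if absent;
 * e.g. TOY_DLM_LVB resolves to index 2 in the toy format above. */
static int toy_field_index(const struct toy_msg_field **fmt, int nr,
			   const struct toy_msg_field *fld)
{
	int i;

	for (i = 0; i < nr; i++)
		if (fmt[i] == fld)
			return i;
	return -1;
}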
+static const struct req_msg_field *ldlm_gl_callback_desc_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_DLM_GL_DESC +}; + +static const struct req_msg_field *ldlm_gl_callback_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_intent_basic_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, +}; + +static const struct req_msg_field *ldlm_intent_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT +}; + +static const struct req_msg_field *ldlm_intent_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL +}; + +static const struct req_msg_field *ldlm_intent_layout_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_LAYOUT_INTENT, + &RMF_EADATA /* for new layout to be set up */ +}; + +static const struct req_msg_field *ldlm_intent_open_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NIOBUF_INLINE, + &RMF_FILE_SECCTX, + &RMF_FILE_ENCCTX, + &RMF_DEFAULT_MDT_MD, +}; + +static const struct req_msg_field *ldlm_intent_getattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */ + &RMF_CAPA1, + &RMF_NAME, + &RMF_FILE_SECCTX_NAME +}; + +static const struct req_msg_field *ldlm_intent_getattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_FILE_SECCTX, + &RMF_DEFAULT_MDT_MD, + &RMF_FILE_ENCCTX, +}; + +static const struct req_msg_field *ldlm_intent_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_create_client[] */ + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL, + &RMF_FILE_ENCCTX, +}; + +static const struct req_msg_field *ldlm_intent_open_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_open_client[] */ + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_EADATA, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL, + &RMF_FILE_ENCCTX, +}; + +static const struct req_msg_field *ldlm_intent_getxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *ldlm_intent_getxattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, /* for req_capsule_extend/mdt_intent_policy */ + &RMF_EADATA, + &RMF_EAVALS, + &RMF_EAVALS_LENS +}; + +static const struct req_msg_field *mds_get_root_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_NAME +}; + +static const struct req_msg_field *mds_getxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_getxattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_getattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_FILE_ENCCTX, +}; + +static const struct req_msg_field *mds_setattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *llog_origin_handle_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY, + &RMF_NAME, + 
&RMF_MDT_BODY +}; + +static const struct req_msg_field *llogd_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY +}; + +static const struct req_msg_field *llog_log_hdr_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOG_LOG_HDR +}; + +static const struct req_msg_field *llog_origin_handle_next_block_server[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY, + &RMF_EADATA +}; + +static const struct req_msg_field *obd_idx_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_IDX_INFO +}; + +static const struct req_msg_field *obd_idx_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_IDX_INFO +}; + +static const struct req_msg_field *ost_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *ost_body_capa[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ost_destroy_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_DLM_REQ, + &RMF_CAPA1 +}; + + +static const struct req_msg_field *ost_brw_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_OBD_IOOBJ, + &RMF_NIOBUF_REMOTE, + &RMF_CAPA1, + &RMF_SHORT_IO +}; + +static const struct req_msg_field *ost_brw_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_SHORT_IO +}; + +static const struct req_msg_field *ost_brw_write_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_RCS +}; + +static const struct req_msg_field *ost_get_info_generic_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GENERIC_DATA, +}; + +static const struct req_msg_field *ost_get_info_generic_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY +}; + +static const struct req_msg_field *ost_get_last_id_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_ID +}; + +static const struct req_msg_field *ost_get_last_fid_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY, + &RMF_FID, +}; + +static const struct req_msg_field *ost_get_last_fid_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FID, +}; + +static const struct req_msg_field *ost_get_fiemap_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FIEMAP_KEY, + &RMF_FIEMAP_VAL +}; + +static const struct req_msg_field *ost_ladvise[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_OST_LADVISE_HDR, + &RMF_OST_LADVISE, +}; + +static const struct req_msg_field *ost_get_fiemap_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FIEMAP_VAL +}; + +static const struct req_msg_field *mdt_hsm_progress[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_PROGRESS, +}; + +static const struct req_msg_field *mdt_hsm_ct_register[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_ARCHIVE, +}; + +static const struct req_msg_field *mdt_hsm_ct_unregister[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, +}; + +static const struct req_msg_field *mdt_hsm_action_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_CURRENT_ACTION, +}; + +static const struct req_msg_field *mdt_hsm_state_get_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_HSM_USER_STATE, +}; + +static const struct req_msg_field *mdt_hsm_state_set[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_HSM_STATE_SET, +}; + +static const struct req_msg_field *mdt_hsm_request[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_REQUEST, + &RMF_MDS_HSM_USER_ITEM, + &RMF_GENERIC_DATA, +}; + +static const struct req_msg_field *obd_lfsck_request[] = { + &RMF_PTLRPC_BODY, + &RMF_LFSCK_REQUEST, +}; + +static const struct req_msg_field *obd_lfsck_reply[] = { + &RMF_PTLRPC_BODY, + &RMF_LFSCK_REPLY, +}; + +static struct req_format *req_formats[] = { + &RQF_OBD_PING, + &RQF_OBD_SET_INFO, + &RQF_MDT_SET_INFO, + &RQF_OBD_IDX_READ, + 
&RQF_SEC_CTX, + &RQF_MGS_TARGET_REG, +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 18, 53, 0) + &RQF_MGS_SET_INFO, +#endif + &RQF_MGS_CONFIG_READ, + &RQF_SEQ_QUERY, + &RQF_FLD_QUERY, + &RQF_FLD_READ, + &RQF_MDS_CONNECT, + &RQF_MDS_DISCONNECT, + &RQF_MDS_GET_INFO, + &RQF_MDS_GET_ROOT, + &RQF_MDS_STATFS, + &RQF_MDS_STATFS_NEW, + &RQF_MDS_GETATTR, + &RQF_MDS_GETATTR_NAME, + &RQF_MDS_GETXATTR, + &RQF_MDS_SYNC, + &RQF_MDS_CLOSE, + &RQF_MDS_CLOSE_INTENT, + &RQF_MDS_READPAGE, + &RQF_MDS_REINT, + &RQF_MDS_REINT_CREATE, + &RQF_MDS_REINT_CREATE_ACL, + &RQF_MDS_REINT_CREATE_SLAVE, + &RQF_MDS_REINT_CREATE_SYM, + &RQF_MDS_REINT_OPEN, + &RQF_MDS_REINT_UNLINK, + &RQF_MDS_REINT_LINK, + &RQF_MDS_REINT_RENAME, + &RQF_MDS_REINT_MIGRATE, + &RQF_MDS_REINT_SETATTR, + &RQF_MDS_REINT_SETXATTR, + &RQF_MDS_REINT_RESYNC, + &RQF_MDS_QUOTACTL, + &RQF_MDS_HSM_PROGRESS, + &RQF_MDS_HSM_CT_REGISTER, + &RQF_MDS_HSM_CT_UNREGISTER, + &RQF_MDS_HSM_STATE_GET, + &RQF_MDS_HSM_STATE_SET, + &RQF_MDS_HSM_ACTION, + &RQF_MDS_HSM_REQUEST, + &RQF_MDS_SWAP_LAYOUTS, + &RQF_MDS_RMFID, +#ifdef HAVE_SERVER_SUPPORT + &RQF_OUT_UPDATE, +#endif + &RQF_OST_CONNECT, + &RQF_OST_DISCONNECT, + &RQF_OST_QUOTACTL, + &RQF_OST_GETATTR, + &RQF_OST_SETATTR, + &RQF_OST_CREATE, + &RQF_OST_PUNCH, + &RQF_OST_FALLOCATE, + &RQF_OST_SYNC, + &RQF_OST_DESTROY, + &RQF_OST_BRW_READ, + &RQF_OST_BRW_WRITE, + &RQF_OST_STATFS, + &RQF_OST_SET_GRANT_INFO, + &RQF_OST_GET_INFO, + &RQF_OST_GET_INFO_LAST_ID, + &RQF_OST_GET_INFO_LAST_FID, + &RQF_OST_SET_INFO_LAST_FID, + &RQF_OST_GET_INFO_FIEMAP, + &RQF_OST_LADVISE, + &RQF_OST_SEEK, + &RQF_LDLM_ENQUEUE, + &RQF_LDLM_ENQUEUE_LVB, + &RQF_LDLM_CONVERT, + &RQF_LDLM_CANCEL, + &RQF_LDLM_CALLBACK, + &RQF_LDLM_CP_CALLBACK, + &RQF_LDLM_BL_CALLBACK, + &RQF_LDLM_GL_CALLBACK, + &RQF_LDLM_GL_CALLBACK_DESC, + &RQF_LDLM_INTENT, + &RQF_LDLM_INTENT_BASIC, + &RQF_LDLM_INTENT_LAYOUT, + &RQF_LDLM_INTENT_GETATTR, + &RQF_LDLM_INTENT_OPEN, + &RQF_LDLM_INTENT_CREATE, + &RQF_LDLM_INTENT_GETXATTR, + &RQF_LDLM_INTENT_QUOTA, + &RQF_QUOTA_DQACQ, + &RQF_LLOG_ORIGIN_HANDLE_CREATE, + &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + &RQF_CONNECT, + &RQF_LFSCK_NOTIFY, + &RQF_LFSCK_QUERY, +}; + +struct req_msg_field { + const __u32 rmf_flags; + const char *rmf_name; + /** + * Field length. (-1) means "variable length". If the + * \a RMF_F_STRUCT_ARRAY flag is set the field is also variable-length, + * but the actual size must be a whole multiple of \a rmf_size. + */ + const int rmf_size; + void (*rmf_swabber)(void *); + /** + * Pass buffer size to swabbing function + * \retval > 0 the number of bytes swabbed + * -EOVERFLOW on error + */ + int (*rmf_swab_len)(void *, __u32); + void (*rmf_dumper)(void *); + int rmf_offset[ARRAY_SIZE(req_formats)][RCL_NR]; +}; + +enum rmf_flags { + /** + * The field is a string, must be NUL-terminated. + */ + RMF_F_STRING = BIT(0), + /** + * The field's buffer size need not match the declared \a rmf_size. + */ + RMF_F_NO_SIZE_CHECK = BIT(1), + /** + * The field's buffer size must be a whole multiple of the declared \a + * rmf_size and the \a rmf_swabber function must work on the declared \a + * rmf_size worth of bytes. + */ + RMF_F_STRUCT_ARRAY = BIT(2), +}; + +struct req_capsule; + +/* + * Request fields. 
+ */ +#define DEFINE_MSGF(name, flags, size, swabber, dumper) { \ + .rmf_name = (name), \ + .rmf_flags = (flags), \ + .rmf_size = (size), \ + .rmf_swabber = (void (*)(void*))(swabber), \ + .rmf_dumper = (void (*)(void*))(dumper) \ +} + +#define DEFINE_MSGFL(name, flags, size, swab_len, dumper) { \ + .rmf_name = (name), \ + .rmf_flags = (flags), \ + .rmf_size = (size), \ + .rmf_swab_len = (int (*)(void *, __u32))(swab_len), \ + .rmf_dumper = (void (*)(void *))(dumper) \ +} + +struct req_msg_field RMF_GENERIC_DATA = + DEFINE_MSGF("generic_data", 0, + -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GENERIC_DATA); + +struct req_msg_field RMF_MGS_TARGET_INFO = + DEFINE_MSGF("mgs_target_info", 0, + sizeof(struct mgs_target_info), + lustre_swab_mgs_target_info, NULL); +EXPORT_SYMBOL(RMF_MGS_TARGET_INFO); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 18, 53, 0) +struct req_msg_field RMF_MGS_SEND_PARAM = + DEFINE_MSGF("mgs_send_param", 0, + sizeof(struct mgs_send_param), + NULL, NULL); +EXPORT_SYMBOL(RMF_MGS_SEND_PARAM); +#endif + +struct req_msg_field RMF_MGS_CONFIG_BODY = + DEFINE_MSGF("mgs_config_read request", 0, + sizeof(struct mgs_config_body), + lustre_swab_mgs_config_body, NULL); +EXPORT_SYMBOL(RMF_MGS_CONFIG_BODY); + +struct req_msg_field RMF_MGS_CONFIG_RES = + DEFINE_MSGF("mgs_config_read reply ", 0, + sizeof(struct mgs_config_res), + lustre_swab_mgs_config_res, NULL); +EXPORT_SYMBOL(RMF_MGS_CONFIG_RES); + +struct req_msg_field RMF_U32 = + DEFINE_MSGF("generic u32", RMF_F_STRUCT_ARRAY, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_U32); + +struct req_msg_field RMF_SETINFO_VAL = + DEFINE_MSGF("setinfo_val", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SETINFO_VAL); + +struct req_msg_field RMF_GETINFO_KEY = + DEFINE_MSGF("getinfo_key", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GETINFO_KEY); + +struct req_msg_field RMF_GETINFO_VALLEN = + DEFINE_MSGF("getinfo_vallen", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_GETINFO_VALLEN); + +struct req_msg_field RMF_GETINFO_VAL = + DEFINE_MSGF("getinfo_val", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GETINFO_VAL); + +struct req_msg_field RMF_SEQ_OPC = + DEFINE_MSGF("seq_query_opc", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_SEQ_OPC); + +struct req_msg_field RMF_SEQ_RANGE = + DEFINE_MSGF("seq_query_range", 0, + sizeof(struct lu_seq_range), + lustre_swab_lu_seq_range, NULL); +EXPORT_SYMBOL(RMF_SEQ_RANGE); + +struct req_msg_field RMF_FLD_OPC = + DEFINE_MSGF("fld_query_opc", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_FLD_OPC); + +struct req_msg_field RMF_FLD_MDFLD = + DEFINE_MSGF("fld_query_mdfld", 0, + sizeof(struct lu_seq_range), + lustre_swab_lu_seq_range, NULL); +EXPORT_SYMBOL(RMF_FLD_MDFLD); + +struct req_msg_field RMF_MDT_BODY = + DEFINE_MSGF("mdt_body", 0, + sizeof(struct mdt_body), lustre_swab_mdt_body, NULL); +EXPORT_SYMBOL(RMF_MDT_BODY); + +struct req_msg_field RMF_OBD_QUOTACTL = + DEFINE_MSGFL("obd_quotactl", + 0, + sizeof(struct obd_quotactl), + lustre_swab_obd_quotactl, NULL); +EXPORT_SYMBOL(RMF_OBD_QUOTACTL); + +struct req_msg_field RMF_QUOTA_BODY = + DEFINE_MSGF("quota_body", 0, + sizeof(struct quota_body), lustre_swab_quota_body, NULL); +EXPORT_SYMBOL(RMF_QUOTA_BODY); + +struct req_msg_field RMF_MDT_EPOCH = + DEFINE_MSGF("mdt_ioepoch", 0, + sizeof(struct mdt_ioepoch), lustre_swab_mdt_ioepoch, NULL); +EXPORT_SYMBOL(RMF_MDT_EPOCH); + +struct req_msg_field RMF_PTLRPC_BODY = + DEFINE_MSGF("ptlrpc_body", 0, + sizeof(struct ptlrpc_body), lustre_swab_ptlrpc_body, 
NULL); +EXPORT_SYMBOL(RMF_PTLRPC_BODY); + +struct req_msg_field RMF_CLOSE_DATA = + DEFINE_MSGF("data_version", 0, + sizeof(struct close_data), lustre_swab_close_data, NULL); +EXPORT_SYMBOL(RMF_CLOSE_DATA); + +struct req_msg_field RMF_OBD_STATFS = + DEFINE_MSGF("obd_statfs", 0, + sizeof(struct obd_statfs), lustre_swab_obd_statfs, NULL); +EXPORT_SYMBOL(RMF_OBD_STATFS); + +struct req_msg_field RMF_SETINFO_KEY = + DEFINE_MSGF("setinfo_key", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SETINFO_KEY); + +struct req_msg_field RMF_NAME = + DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_NAME); + +struct req_msg_field RMF_FID_ARRAY = + DEFINE_MSGF("fid_array", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FID_ARRAY); + +struct req_msg_field RMF_SYMTGT = + DEFINE_MSGF("symtgt", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SYMTGT); + +struct req_msg_field RMF_TGTUUID = + DEFINE_MSGF("tgtuuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, + NULL); +EXPORT_SYMBOL(RMF_TGTUUID); + +struct req_msg_field RMF_CLUUID = + DEFINE_MSGF("cluuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, + NULL); +EXPORT_SYMBOL(RMF_CLUUID); + +struct req_msg_field RMF_STRING = + DEFINE_MSGF("string", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_STRING); + +struct req_msg_field RMF_FILE_SECCTX_NAME = + DEFINE_MSGF("file_secctx_name", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FILE_SECCTX_NAME); + +struct req_msg_field RMF_FILE_SECCTX = + DEFINE_MSGF("file_secctx", RMF_F_NO_SIZE_CHECK, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FILE_SECCTX); + +struct req_msg_field RMF_FILE_ENCCTX = + DEFINE_MSGF("file_encctx", RMF_F_NO_SIZE_CHECK, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FILE_ENCCTX); + +struct req_msg_field RMF_LLOGD_BODY = + DEFINE_MSGF("llogd_body", 0, + sizeof(struct llogd_body), lustre_swab_llogd_body, NULL); +EXPORT_SYMBOL(RMF_LLOGD_BODY); + +struct req_msg_field RMF_LLOG_LOG_HDR = + DEFINE_MSGF("llog_log_hdr", 0, + sizeof(struct llog_log_hdr), lustre_swab_llog_hdr, NULL); +EXPORT_SYMBOL(RMF_LLOG_LOG_HDR); + +struct req_msg_field RMF_LLOGD_CONN_BODY = + DEFINE_MSGF("llogd_conn_body", 0, + sizeof(struct llogd_conn_body), + lustre_swab_llogd_conn_body, NULL); +EXPORT_SYMBOL(RMF_LLOGD_CONN_BODY); + +/* + * connection handle received in MDS_CONNECT request. + * + * No swabbing needed because struct lustre_handle contains only a 64-bit cookie + * that the client does not interpret at all. 
+ */ +struct req_msg_field RMF_CONN = + DEFINE_MSGF("conn", 0, sizeof(struct lustre_handle), NULL, NULL); +EXPORT_SYMBOL(RMF_CONN); + +struct req_msg_field RMF_CONNECT_DATA = + DEFINE_MSGF("cdata", + RMF_F_NO_SIZE_CHECK /* we allow extra space for interop */, + sizeof(struct obd_connect_data), + lustre_swab_connect, NULL); +EXPORT_SYMBOL(RMF_CONNECT_DATA); + +struct req_msg_field RMF_DLM_REQ = + DEFINE_MSGF("dlm_req", RMF_F_NO_SIZE_CHECK /* ldlm_request_bufsize */, + sizeof(struct ldlm_request), + lustre_swab_ldlm_request, NULL); +EXPORT_SYMBOL(RMF_DLM_REQ); + +struct req_msg_field RMF_DLM_REP = + DEFINE_MSGF("dlm_rep", 0, + sizeof(struct ldlm_reply), lustre_swab_ldlm_reply, NULL); +EXPORT_SYMBOL(RMF_DLM_REP); + +struct req_msg_field RMF_LDLM_INTENT = + DEFINE_MSGF("ldlm_intent", 0, + sizeof(struct ldlm_intent), lustre_swab_ldlm_intent, NULL); +EXPORT_SYMBOL(RMF_LDLM_INTENT); + +struct req_msg_field RMF_DLM_LVB = + DEFINE_MSGF("dlm_lvb", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_DLM_LVB); + +struct req_msg_field RMF_DLM_GL_DESC = + DEFINE_MSGF("dlm_gl_desc", 0, sizeof(union ldlm_gl_desc), NULL, NULL); +EXPORT_SYMBOL(RMF_DLM_GL_DESC); + +struct req_msg_field RMF_MDT_MD = + DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, NULL); +EXPORT_SYMBOL(RMF_MDT_MD); + +struct req_msg_field RMF_DEFAULT_MDT_MD = + DEFINE_MSGF("default_mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, + NULL); +EXPORT_SYMBOL(RMF_DEFAULT_MDT_MD); + +struct req_msg_field RMF_REC_REINT = + DEFINE_MSGF("rec_reint", 0, sizeof(struct mdt_rec_reint), + lustre_swab_mdt_rec_reint, NULL); +EXPORT_SYMBOL(RMF_REC_REINT); + +/* FIXME: this length should be defined as a macro */ +struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1, + NULL, NULL); +EXPORT_SYMBOL(RMF_EADATA); + +struct req_msg_field RMF_EAVALS = DEFINE_MSGF("eavals", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_EAVALS); + +struct req_msg_field RMF_ACL = DEFINE_MSGF("acl", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_ACL); + +/* FIXME: this should be made to use RMF_F_STRUCT_ARRAY */ +struct req_msg_field RMF_LOGCOOKIES = + DEFINE_MSGF("logcookies", RMF_F_NO_SIZE_CHECK /* multiple cookies */, + sizeof(struct llog_cookie), NULL, NULL); +EXPORT_SYMBOL(RMF_LOGCOOKIES); + +struct req_msg_field RMF_CAPA1 = + DEFINE_MSGF("capa", 0, 0, NULL, NULL); +EXPORT_SYMBOL(RMF_CAPA1); + +struct req_msg_field RMF_CAPA2 = + DEFINE_MSGF("capa", 0, 0, NULL, NULL); +EXPORT_SYMBOL(RMF_CAPA2); + +struct req_msg_field RMF_LAYOUT_INTENT = + DEFINE_MSGF("layout_intent", 0, + sizeof(struct layout_intent), lustre_swab_layout_intent, + NULL); +EXPORT_SYMBOL(RMF_LAYOUT_INTENT); + +struct req_msg_field RMF_SELINUX_POL = + DEFINE_MSGF("selinux_pol", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SELINUX_POL); + +/* + * OST request field. 
+ */ +struct req_msg_field RMF_OST_BODY = + DEFINE_MSGF("ost_body", 0, + sizeof(struct ost_body), lustre_swab_ost_body, + dump_ost_body); +EXPORT_SYMBOL(RMF_OST_BODY); + +struct req_msg_field RMF_OBD_IOOBJ = + DEFINE_MSGF("obd_ioobj", RMF_F_STRUCT_ARRAY, + sizeof(struct obd_ioobj), lustre_swab_obd_ioobj, dump_ioo); +EXPORT_SYMBOL(RMF_OBD_IOOBJ); + +struct req_msg_field RMF_NIOBUF_REMOTE = + DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, + sizeof(struct niobuf_remote), lustre_swab_niobuf_remote, + dump_rniobuf); +EXPORT_SYMBOL(RMF_NIOBUF_REMOTE); + +struct req_msg_field RMF_NIOBUF_INLINE = + DEFINE_MSGF("niobuf_inline", RMF_F_NO_SIZE_CHECK, + sizeof(struct niobuf_remote), lustre_swab_niobuf_remote, + dump_rniobuf); +EXPORT_SYMBOL(RMF_NIOBUF_INLINE); + +struct req_msg_field RMF_RCS = + DEFINE_MSGF("niobuf_rcs", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, dump_rcs); +EXPORT_SYMBOL(RMF_RCS); + +struct req_msg_field RMF_EAVALS_LENS = + DEFINE_MSGF("eavals_lens", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_EAVALS_LENS); + +struct req_msg_field RMF_OBD_ID = + DEFINE_MSGF("obd_id", 0, + sizeof(__u64), lustre_swab_ost_last_id, NULL); +EXPORT_SYMBOL(RMF_OBD_ID); + +struct req_msg_field RMF_FID = + DEFINE_MSGF("fid", 0, + sizeof(struct lu_fid), lustre_swab_lu_fid, NULL); +EXPORT_SYMBOL(RMF_FID); + +struct req_msg_field RMF_OST_ID = + DEFINE_MSGF("ost_id", 0, + sizeof(struct ost_id), lustre_swab_ost_id, NULL); +EXPORT_SYMBOL(RMF_OST_ID); + +struct req_msg_field RMF_FIEMAP_KEY = + DEFINE_MSGF("fiemap_key", 0, sizeof(struct ll_fiemap_info_key), + lustre_swab_fiemap_info_key, NULL); +EXPORT_SYMBOL(RMF_FIEMAP_KEY); + +struct req_msg_field RMF_FIEMAP_VAL = + DEFINE_MSGFL("fiemap", 0, -1, lustre_swab_fiemap, NULL); +EXPORT_SYMBOL(RMF_FIEMAP_VAL); + +struct req_msg_field RMF_IDX_INFO = + DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info), + lustre_swab_idx_info, NULL); +EXPORT_SYMBOL(RMF_IDX_INFO); +struct req_msg_field RMF_SHORT_IO = + DEFINE_MSGF("short_io", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SHORT_IO); +struct req_msg_field RMF_HSM_USER_STATE = + DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state), + lustre_swab_hsm_user_state, NULL); +EXPORT_SYMBOL(RMF_HSM_USER_STATE); + +struct req_msg_field RMF_HSM_STATE_SET = + DEFINE_MSGF("hsm_state_set", 0, sizeof(struct hsm_state_set), + lustre_swab_hsm_state_set, NULL); +EXPORT_SYMBOL(RMF_HSM_STATE_SET); + +struct req_msg_field RMF_MDS_HSM_PROGRESS = + DEFINE_MSGF("hsm_progress", 0, sizeof(struct hsm_progress_kernel), + lustre_swab_hsm_progress_kernel, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_PROGRESS); + +struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION = + DEFINE_MSGF("hsm_current_action", 0, sizeof(struct hsm_current_action), + lustre_swab_hsm_current_action, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_CURRENT_ACTION); + +struct req_msg_field RMF_MDS_HSM_USER_ITEM = + DEFINE_MSGF("hsm_user_item", RMF_F_STRUCT_ARRAY, + sizeof(struct hsm_user_item), lustre_swab_hsm_user_item, + NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM); + +struct req_msg_field RMF_MDS_HSM_ARCHIVE = + DEFINE_MSGF("hsm_archive", RMF_F_STRUCT_ARRAY, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE); + +struct req_msg_field RMF_MDS_HSM_REQUEST = + DEFINE_MSGF("hsm_request", 0, sizeof(struct hsm_request), + lustre_swab_hsm_request, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_REQUEST); + +struct req_msg_field RMF_SWAP_LAYOUTS = + DEFINE_MSGF("swap_layouts", 0, sizeof(struct mdc_swap_layouts), + 
lustre_swab_swap_layouts, NULL); +EXPORT_SYMBOL(RMF_SWAP_LAYOUTS); + +struct req_msg_field RMF_LFSCK_REQUEST = + DEFINE_MSGF("lfsck_request", 0, sizeof(struct lfsck_request), + lustre_swab_lfsck_request, NULL); +EXPORT_SYMBOL(RMF_LFSCK_REQUEST); + +struct req_msg_field RMF_LFSCK_REPLY = + DEFINE_MSGF("lfsck_reply", 0, sizeof(struct lfsck_reply), + lustre_swab_lfsck_reply, NULL); +EXPORT_SYMBOL(RMF_LFSCK_REPLY); + +struct req_msg_field RMF_OST_LADVISE_HDR = + DEFINE_MSGF("ladvise_request", 0, + sizeof(struct ladvise_hdr), + lustre_swab_ladvise_hdr, NULL); +EXPORT_SYMBOL(RMF_OST_LADVISE_HDR); + +struct req_msg_field RMF_OST_LADVISE = + DEFINE_MSGF("ladvise_request", RMF_F_STRUCT_ARRAY, + sizeof(struct lu_ladvise), + lustre_swab_ladvise, NULL); +EXPORT_SYMBOL(RMF_OST_LADVISE); + +/* + * Request formats. + */ + +struct req_format { + const char *rf_name; + size_t rf_idx; + struct { + size_t nr; + const struct req_msg_field **d; + } rf_fields[RCL_NR]; +}; + +#define DEFINE_REQ_FMT(name, client, client_nr, server, server_nr) { \ + .rf_name = name, \ + .rf_fields = { \ + [RCL_CLIENT] = { \ + .nr = client_nr, \ + .d = client \ + }, \ + [RCL_SERVER] = { \ + .nr = server_nr, \ + .d = server \ + } \ + } \ +} + +#define DEFINE_REQ_FMT0(name, client, server) \ +DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server)) + +struct req_format RQF_OBD_PING = + DEFINE_REQ_FMT0("OBD_PING", empty, empty); +EXPORT_SYMBOL(RQF_OBD_PING); + +struct req_format RQF_OBD_SET_INFO = + DEFINE_REQ_FMT0("OBD_SET_INFO", obd_set_info_client, empty); +EXPORT_SYMBOL(RQF_OBD_SET_INFO); + +struct req_format RQF_MDT_SET_INFO = + DEFINE_REQ_FMT0("MDT_SET_INFO", mdt_set_info_client, empty); +EXPORT_SYMBOL(RQF_MDT_SET_INFO); + +/* Read index file through the network */ +struct req_format RQF_OBD_IDX_READ = + DEFINE_REQ_FMT0("OBD_IDX_READ", + obd_idx_read_client, obd_idx_read_server); +EXPORT_SYMBOL(RQF_OBD_IDX_READ); + +struct req_format RQF_SEC_CTX = + DEFINE_REQ_FMT0("SEC_CTX", empty, empty); +EXPORT_SYMBOL(RQF_SEC_CTX); + +struct req_format RQF_MGS_TARGET_REG = + DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only, + mgs_target_info_only); +EXPORT_SYMBOL(RQF_MGS_TARGET_REG); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 18, 53, 0) +struct req_format RQF_MGS_SET_INFO = + DEFINE_REQ_FMT0("MGS_SET_INFO", mgs_set_info, + mgs_set_info); +EXPORT_SYMBOL(RQF_MGS_SET_INFO); +#endif + +struct req_format RQF_MGS_CONFIG_READ = + DEFINE_REQ_FMT0("MGS_CONFIG_READ", mgs_config_read_client, + mgs_config_read_server); +EXPORT_SYMBOL(RQF_MGS_CONFIG_READ); + +struct req_format RQF_SEQ_QUERY = + DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server); +EXPORT_SYMBOL(RQF_SEQ_QUERY); + +struct req_format RQF_FLD_QUERY = + DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server); +EXPORT_SYMBOL(RQF_FLD_QUERY); + +/* The 'fld_read_server' uses 'RMF_GENERIC_DATA' to hold the 'FLD_QUERY' + * RPC reply that is composed of 'struct lu_seq_range_array'. But there + * is not registered swabber function for 'RMF_GENERIC_DATA'. So the RPC + * peers need to handle the RPC reply with fixed little-endian format. + * + * In theory, we can define new structure with some swabber registered to + * handle the 'FLD_QUERY' RPC reply result automatically. But from the + * implementation view, it is not easy to be done within current "struct + * req_msg_field" framework. 
Because the sequence range array in the RPC + * reply is not fixed length, instead, its length depends on 'lu_seq_range' + * count, that is unknown when prepare the RPC buffer. Generally, for such + * flexible length RPC usage, there will be a field in the RPC layout to + * indicate the data length. But for the 'FLD_READ' RPC, we have no way to + * do that unless we add new length filed that will broken the on-wire RPC + * protocol and cause interoperability trouble with old peer. */ +struct req_format RQF_FLD_READ = + DEFINE_REQ_FMT0("FLD_READ", fld_read_client, fld_read_server); +EXPORT_SYMBOL(RQF_FLD_READ); + +struct req_format RQF_MDS_QUOTACTL = + DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only); +EXPORT_SYMBOL(RQF_MDS_QUOTACTL); + +struct req_format RQF_OST_QUOTACTL = + DEFINE_REQ_FMT0("OST_QUOTACTL", quotactl_only, quotactl_only); +EXPORT_SYMBOL(RQF_OST_QUOTACTL); + +struct req_format RQF_QUOTA_DQACQ = + DEFINE_REQ_FMT0("QUOTA_DQACQ", quota_body_only, quota_body_only); +EXPORT_SYMBOL(RQF_QUOTA_DQACQ); + +struct req_format RQF_LDLM_INTENT_QUOTA = + DEFINE_REQ_FMT0("LDLM_INTENT_QUOTA", + ldlm_intent_quota_client, + ldlm_intent_quota_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_QUOTA); + +struct req_format RQF_MDS_GET_ROOT = + DEFINE_REQ_FMT0("MDS_GET_ROOT", mds_get_root_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_GET_ROOT); + +struct req_format RQF_MDS_STATFS = + DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server); +EXPORT_SYMBOL(RQF_MDS_STATFS); + +struct req_format RQF_MDS_STATFS_NEW = + DEFINE_REQ_FMT0("MDS_STATFS_NEW", mdt_body_only, obd_statfs_server); +EXPORT_SYMBOL(RQF_MDS_STATFS_NEW); + +struct req_format RQF_MDS_SYNC = + DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_SYNC); + +struct req_format RQF_MDS_GETATTR = + DEFINE_REQ_FMT0("MDS_GETATTR", mdt_body_capa, mds_getattr_server); +EXPORT_SYMBOL(RQF_MDS_GETATTR); + +struct req_format RQF_MDS_GETXATTR = + DEFINE_REQ_FMT0("MDS_GETXATTR", + mds_getxattr_client, mds_getxattr_server); +EXPORT_SYMBOL(RQF_MDS_GETXATTR); + +struct req_format RQF_MDS_GETATTR_NAME = + DEFINE_REQ_FMT0("MDS_GETATTR_NAME", + mds_getattr_name_client, mds_getattr_server); +EXPORT_SYMBOL(RQF_MDS_GETATTR_NAME); + +struct req_format RQF_MDS_REINT = + DEFINE_REQ_FMT0("MDS_REINT", mds_reint_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT); + +struct req_format RQF_MDS_REINT_CREATE = + DEFINE_REQ_FMT0("MDS_REINT_CREATE", + mds_reint_create_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE); + +struct req_format RQF_MDS_REINT_CREATE_ACL = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_ACL", + mds_reint_create_acl_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_ACL); + +struct req_format RQF_MDS_REINT_CREATE_SLAVE = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_EA", + mds_reint_create_slave_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SLAVE); + +struct req_format RQF_MDS_REINT_CREATE_SYM = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_SYM", + mds_reint_create_sym_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SYM); + +struct req_format RQF_MDS_REINT_OPEN = + DEFINE_REQ_FMT0("MDS_REINT_OPEN", + mds_reint_open_client, mds_reint_open_server); +EXPORT_SYMBOL(RQF_MDS_REINT_OPEN); + +struct req_format RQF_MDS_REINT_UNLINK = + DEFINE_REQ_FMT0("MDS_REINT_UNLINK", mds_reint_unlink_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_UNLINK); + +struct req_format RQF_MDS_REINT_LINK = + DEFINE_REQ_FMT0("MDS_REINT_LINK", + mds_reint_link_client, mdt_body_only); 
+EXPORT_SYMBOL(RQF_MDS_REINT_LINK); + +struct req_format RQF_MDS_REINT_RENAME = + DEFINE_REQ_FMT0("MDS_REINT_RENAME", mds_reint_rename_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_RENAME); + +struct req_format RQF_MDS_REINT_MIGRATE = + DEFINE_REQ_FMT0("MDS_REINT_MIGRATE", mds_reint_migrate_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_MIGRATE); + +struct req_format RQF_MDS_REINT_SETATTR = + DEFINE_REQ_FMT0("MDS_REINT_SETATTR", + mds_reint_setattr_client, mds_setattr_server); +EXPORT_SYMBOL(RQF_MDS_REINT_SETATTR); + +struct req_format RQF_MDS_REINT_SETXATTR = + DEFINE_REQ_FMT0("MDS_REINT_SETXATTR", + mds_reint_setxattr_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR); + +struct req_format RQF_MDS_REINT_RESYNC = + DEFINE_REQ_FMT0("MDS_REINT_RESYNC", mds_reint_resync, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_RESYNC); + +struct req_format RQF_MDS_CONNECT = + DEFINE_REQ_FMT0("MDS_CONNECT", + obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_MDS_CONNECT); + +struct req_format RQF_MDS_DISCONNECT = + DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty); +EXPORT_SYMBOL(RQF_MDS_DISCONNECT); + +struct req_format RQF_MDS_GET_INFO = + DEFINE_REQ_FMT0("MDS_GET_INFO", mds_getinfo_client, + mds_getinfo_server); +EXPORT_SYMBOL(RQF_MDS_GET_INFO); + +struct req_format RQF_LDLM_ENQUEUE = + DEFINE_REQ_FMT0("LDLM_ENQUEUE", + ldlm_enqueue_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_ENQUEUE); + +struct req_format RQF_LDLM_ENQUEUE_LVB = + DEFINE_REQ_FMT0("LDLM_ENQUEUE_LVB", + ldlm_enqueue_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_ENQUEUE_LVB); + +struct req_format RQF_LDLM_CONVERT = + DEFINE_REQ_FMT0("LDLM_CONVERT", + ldlm_enqueue_client, ldlm_enqueue_server); +EXPORT_SYMBOL(RQF_LDLM_CONVERT); + +struct req_format RQF_LDLM_CANCEL = + DEFINE_REQ_FMT0("LDLM_CANCEL", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CANCEL); + +struct req_format RQF_LDLM_CALLBACK = + DEFINE_REQ_FMT0("LDLM_CALLBACK", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CALLBACK); + +struct req_format RQF_LDLM_CP_CALLBACK = + DEFINE_REQ_FMT0("LDLM_CP_CALLBACK", ldlm_cp_callback_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CP_CALLBACK); + +struct req_format RQF_LDLM_BL_CALLBACK = + DEFINE_REQ_FMT0("LDLM_BL_CALLBACK", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_BL_CALLBACK); + +struct req_format RQF_LDLM_GL_CALLBACK = + DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_enqueue_client, + ldlm_gl_callback_server); +EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK); + +struct req_format RQF_LDLM_GL_CALLBACK_DESC = + DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client, + ldlm_gl_callback_server); +EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK_DESC); + +struct req_format RQF_LDLM_INTENT_BASIC = + DEFINE_REQ_FMT0("LDLM_INTENT_BASIC", + ldlm_intent_basic_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_BASIC); + +struct req_format RQF_LDLM_INTENT = + DEFINE_REQ_FMT0("LDLM_INTENT", + ldlm_intent_client, ldlm_intent_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT); + +struct req_format RQF_LDLM_INTENT_LAYOUT = + DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT", + ldlm_intent_layout_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT); + +struct req_format RQF_LDLM_INTENT_GETATTR = + DEFINE_REQ_FMT0("LDLM_INTENT_GETATTR", + ldlm_intent_getattr_client, ldlm_intent_getattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_GETATTR); + +struct req_format RQF_LDLM_INTENT_OPEN = + DEFINE_REQ_FMT0("LDLM_INTENT_OPEN", + ldlm_intent_open_client, ldlm_intent_open_server); 
+EXPORT_SYMBOL(RQF_LDLM_INTENT_OPEN); + +struct req_format RQF_LDLM_INTENT_CREATE = + DEFINE_REQ_FMT0("LDLM_INTENT_CREATE", + ldlm_intent_create_client, ldlm_intent_getattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE); + +struct req_format RQF_LDLM_INTENT_GETXATTR = + DEFINE_REQ_FMT0("LDLM_INTENT_GETXATTR", + ldlm_intent_getxattr_client, + ldlm_intent_getxattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_GETXATTR); + +struct req_format RQF_MDS_CLOSE = + DEFINE_REQ_FMT0("MDS_CLOSE", + mdt_close_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_CLOSE); + +struct req_format RQF_MDS_CLOSE_INTENT = + DEFINE_REQ_FMT0("MDS_CLOSE_INTENT", + mdt_close_intent_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_CLOSE_INTENT); + +struct req_format RQF_MDS_READPAGE = + DEFINE_REQ_FMT0("MDS_READPAGE", + mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_READPAGE); + +struct req_format RQF_MDS_HSM_ACTION = + DEFINE_REQ_FMT0("MDS_HSM_ACTION", mdt_body_capa, mdt_hsm_action_server); +EXPORT_SYMBOL(RQF_MDS_HSM_ACTION); + +struct req_format RQF_MDS_HSM_PROGRESS = + DEFINE_REQ_FMT0("MDS_HSM_PROGRESS", mdt_hsm_progress, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_PROGRESS); + +struct req_format RQF_MDS_HSM_CT_REGISTER = + DEFINE_REQ_FMT0("MDS_HSM_CT_REGISTER", mdt_hsm_ct_register, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_CT_REGISTER); + +struct req_format RQF_MDS_HSM_CT_UNREGISTER = + DEFINE_REQ_FMT0("MDS_HSM_CT_UNREGISTER", mdt_hsm_ct_unregister, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_CT_UNREGISTER); + +struct req_format RQF_MDS_HSM_STATE_GET = + DEFINE_REQ_FMT0("MDS_HSM_STATE_GET", + mdt_body_capa, mdt_hsm_state_get_server); +EXPORT_SYMBOL(RQF_MDS_HSM_STATE_GET); + +struct req_format RQF_MDS_HSM_STATE_SET = + DEFINE_REQ_FMT0("MDS_HSM_STATE_SET", mdt_hsm_state_set, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_STATE_SET); + +struct req_format RQF_MDS_HSM_REQUEST = + DEFINE_REQ_FMT0("MDS_HSM_REQUEST", mdt_hsm_request, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_REQUEST); + +struct req_format RQF_MDS_SWAP_LAYOUTS = + DEFINE_REQ_FMT0("MDS_SWAP_LAYOUTS", + mdt_swap_layouts, empty); +EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS); + +struct req_format RQF_MDS_RMFID = + DEFINE_REQ_FMT0("MDS_RMFID", mds_rmfid_client, + mds_rmfid_server); +EXPORT_SYMBOL(RQF_MDS_RMFID); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE", + llog_origin_handle_create_client, llogd_body_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK", + llogd_body_only, llog_origin_handle_next_block_server); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_PREV_BLOCK", + llogd_body_only, llog_origin_handle_next_block_server); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_READ_HEADER", + llogd_body_only, llog_log_hdr_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); + +struct req_format RQF_CONNECT = + DEFINE_REQ_FMT0("CONNECT", obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_CONNECT); + +struct req_format RQF_OST_CONNECT = + DEFINE_REQ_FMT0("OST_CONNECT", + obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_OST_CONNECT); + +struct req_format RQF_OST_DISCONNECT = + DEFINE_REQ_FMT0("OST_DISCONNECT", empty, empty); +EXPORT_SYMBOL(RQF_OST_DISCONNECT); + +struct req_format RQF_OST_GETATTR = + 
DEFINE_REQ_FMT0("OST_GETATTR", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_GETATTR); + +struct req_format RQF_OST_SETATTR = + DEFINE_REQ_FMT0("OST_SETATTR", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SETATTR); + +struct req_format RQF_OST_CREATE = + DEFINE_REQ_FMT0("OST_CREATE", ost_body_only, ost_body_only); +EXPORT_SYMBOL(RQF_OST_CREATE); + +struct req_format RQF_OST_PUNCH = + DEFINE_REQ_FMT0("OST_PUNCH", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_PUNCH); + +struct req_format RQF_OST_FALLOCATE = + DEFINE_REQ_FMT0("OST_FALLOCATE", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_FALLOCATE); + +struct req_format RQF_OST_SEEK = + DEFINE_REQ_FMT0("OST_SEEK", ost_body_only, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SEEK); + +struct req_format RQF_OST_SYNC = + DEFINE_REQ_FMT0("OST_SYNC", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SYNC); + +struct req_format RQF_OST_DESTROY = + DEFINE_REQ_FMT0("OST_DESTROY", ost_destroy_client, ost_body_only); +EXPORT_SYMBOL(RQF_OST_DESTROY); + +struct req_format RQF_OST_BRW_READ = + DEFINE_REQ_FMT0("OST_BRW_READ", ost_brw_client, ost_brw_read_server); +EXPORT_SYMBOL(RQF_OST_BRW_READ); + +struct req_format RQF_OST_BRW_WRITE = + DEFINE_REQ_FMT0("OST_BRW_WRITE", ost_brw_client, ost_brw_write_server); +EXPORT_SYMBOL(RQF_OST_BRW_WRITE); + +struct req_format RQF_OST_STATFS = + DEFINE_REQ_FMT0("OST_STATFS", empty, obd_statfs_server); +EXPORT_SYMBOL(RQF_OST_STATFS); + +struct req_format RQF_OST_SET_GRANT_INFO = + DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_grant_shrink_client, + ost_body_only); +EXPORT_SYMBOL(RQF_OST_SET_GRANT_INFO); + +struct req_format RQF_OST_GET_INFO = + DEFINE_REQ_FMT0("OST_GET_INFO", ost_get_info_generic_client, + ost_get_info_generic_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO); + +struct req_format RQF_OST_GET_INFO_LAST_ID = + DEFINE_REQ_FMT0("OST_GET_INFO_LAST_ID", ost_get_info_generic_client, + ost_get_last_id_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_ID); + +struct req_format RQF_OST_GET_INFO_LAST_FID = + DEFINE_REQ_FMT0("OST_GET_INFO_LAST_FID", ost_get_last_fid_client, + ost_get_last_fid_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_FID); + +struct req_format RQF_OST_SET_INFO_LAST_FID = + DEFINE_REQ_FMT0("OST_SET_INFO_LAST_FID", obd_set_info_client, + empty); +EXPORT_SYMBOL(RQF_OST_SET_INFO_LAST_FID); + +struct req_format RQF_OST_GET_INFO_FIEMAP = + DEFINE_REQ_FMT0("OST_GET_INFO_FIEMAP", ost_get_fiemap_client, + ost_get_fiemap_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_FIEMAP); + +struct req_format RQF_LFSCK_NOTIFY = + DEFINE_REQ_FMT0("LFSCK_NOTIFY", obd_lfsck_request, empty); +EXPORT_SYMBOL(RQF_LFSCK_NOTIFY); + +struct req_format RQF_LFSCK_QUERY = + DEFINE_REQ_FMT0("LFSCK_QUERY", obd_lfsck_request, obd_lfsck_reply); +EXPORT_SYMBOL(RQF_LFSCK_QUERY); + +struct req_format RQF_OST_LADVISE = + DEFINE_REQ_FMT0("OST_LADVISE", ost_ladvise, ost_body_only); +EXPORT_SYMBOL(RQF_OST_LADVISE); + +/* Convenience macro */ +#define FMT_FIELD(fmt, i, j) (fmt)->rf_fields[(i)].d[(j)] + +/** + * Initializes the capsule abstraction by computing and setting the \a rf_idx + * field of RQFs and the \a rmf_offset field of RMFs. 
+ */ +int req_layout_init(void) +{ + size_t i; + size_t j; + size_t k; + struct req_format *rf = NULL; + + for (i = 0; i < ARRAY_SIZE(req_formats); ++i) { + rf = req_formats[i]; + rf->rf_idx = i; + for (j = 0; j < RCL_NR; ++j) { + LASSERT(rf->rf_fields[j].nr <= REQ_MAX_FIELD_NR); + for (k = 0; k < rf->rf_fields[j].nr; ++k) { + struct req_msg_field *field; + + field = (typeof(field))rf->rf_fields[j].d[k]; + LASSERT(!(field->rmf_flags & RMF_F_STRUCT_ARRAY) + || field->rmf_size > 0); + LASSERT(field->rmf_offset[i][j] == 0); + /* + * k + 1 to detect unused format/field + * combinations. + */ + field->rmf_offset[i][j] = k + 1; + } + } + } + return 0; +} +EXPORT_SYMBOL(req_layout_init); + +void req_layout_fini(void) +{ +} +EXPORT_SYMBOL(req_layout_fini); + +/** + * Initializes the expected sizes of each RMF in a \a pill (\a rc_area) to -1. + * + * Actual/expected field sizes are set elsewhere in functions in this file: + * req_capsule_init(), req_capsule_server_pack(), req_capsule_set_size() and + * req_capsule_msg_size(). The \a rc_area information is used by. + * ptlrpc_request_set_replen(). + */ +void req_capsule_init_area(struct req_capsule *pill) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(pill->rc_area[RCL_CLIENT]); i++) { + pill->rc_area[RCL_CLIENT][i] = -1; + pill->rc_area[RCL_SERVER][i] = -1; + } +} +EXPORT_SYMBOL(req_capsule_init_area); + +/** + * Initialize a pill. + * + * The \a location indicates whether the caller is executing on the client side + * (RCL_CLIENT) or server side (RCL_SERVER).. + */ +void req_capsule_init(struct req_capsule *pill, + struct ptlrpc_request *req, + enum req_location location) +{ + LASSERT(location == RCL_SERVER || location == RCL_CLIENT); + + /* + * Today all capsules are embedded in ptlrpc_request structs, + * but just in case that ever isn't the case, we don't reach + * into req unless req != NULL and pill is the one embedded in + * the req. + * + * The req->rq_pill_init flag makes it safe to initialize a pill + * twice, which might happen in the OST paths as a result of the + * high-priority RPC queue getting peeked at before ost_handle() + * handles an OST RPC. + */ + if (req != NULL && pill == &req->rq_pill && req->rq_pill_init) + return; + + pill->rc_fmt = NULL; + pill->rc_req = req; + pill->rc_loc = location; + req_capsule_init_area(pill); + + if (req != NULL && pill == &req->rq_pill) + req->rq_pill_init = 1; +} +EXPORT_SYMBOL(req_capsule_init); + +void req_capsule_fini(struct req_capsule *pill) +{ +} +EXPORT_SYMBOL(req_capsule_fini); + +static int __req_format_is_sane(const struct req_format *fmt) +{ + return fmt->rf_idx < ARRAY_SIZE(req_formats) && + req_formats[fmt->rf_idx] == fmt; +} + +static struct lustre_msg *__req_msg(const struct req_capsule *pill, + enum req_location loc) +{ + return loc == RCL_CLIENT ? pill->rc_reqmsg : pill->rc_repmsg; +} + +/** + * Set the format (\a fmt) of a \a pill; format changes are not allowed here + * (see req_capsule_extend()). + */ +void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt) +{ + LASSERT(pill->rc_fmt == NULL || pill->rc_fmt == fmt); + LASSERT(__req_format_is_sane(fmt)); + + pill->rc_fmt = fmt; +} +EXPORT_SYMBOL(req_capsule_set); + +/** + * Fills in any parts of the \a rc_area of a \a pill that haven't been filled in + * yet. + + * \a rc_area is an array of REQ_MAX_FIELD_NR elements, used to store sizes of + * variable-sized fields. The field sizes come from the declared \a rmf_size + * field of a \a pill's \a rc_fmt's RMF's. 
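+ *
+ * A typical server-side sequence looks like (compare, for instance,
+ * llog_origin_handle_next_block() in llog_server.c):
+ *
+ *	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len);
+ *	rc = req_capsule_server_pack(&req->rq_pill);
+ *
+ * i.e. variable-sized reply fields are sized explicitly, and any slots
+ * still left at -1 are filled in here from the field's declared rmf_size.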
+ */ +size_t req_capsule_filled_sizes(struct req_capsule *pill, + enum req_location loc) +{ + const struct req_format *fmt = pill->rc_fmt; + size_t i; + + LASSERT(fmt != NULL); + + for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { + if (pill->rc_area[loc][i] == -1) { + pill->rc_area[loc][i] = + fmt->rf_fields[loc].d[i]->rmf_size; + if (pill->rc_area[loc][i] == -1) { + /* + * Skip the following fields. + * + * If this LASSERT() trips then you're missing a + * call to req_capsule_set_size(). + */ + LASSERT(loc != RCL_SERVER); + break; + } + } + } + return i; +} +EXPORT_SYMBOL(req_capsule_filled_sizes); + +/** + * Capsule equivalent of lustre_pack_request() and lustre_pack_reply(). + * + * This function uses the \a pill's \a rc_area as filled in by + * req_capsule_set_size() or req_capsule_filled_sizes() (the latter is called by + * this function). + */ +int req_capsule_server_pack(struct req_capsule *pill) +{ + const struct req_format *fmt; + int count; + int rc; + + LASSERT(pill->rc_loc == RCL_SERVER); + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + + count = req_capsule_filled_sizes(pill, RCL_SERVER); + rc = lustre_pack_reply(pill->rc_req, count, + pill->rc_area[RCL_SERVER], NULL); + if (rc != 0) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "Cannot pack %d fields in format '%s'", + count, fmt->rf_name); + } + return rc; +} +EXPORT_SYMBOL(req_capsule_server_pack); + +/** + * Returns the PTLRPC request or reply (\a loc) buffer offset of a \a pill + * corresponding to the given RMF (\a field). + */ +__u32 __req_capsule_offset(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + unsigned int offset; + + offset = field->rmf_offset[pill->rc_fmt->rf_idx][loc]; + LASSERTF(offset > 0, "%s:%s, off=%d, loc=%d\n", + pill->rc_fmt->rf_name, + field->rmf_name, offset, loc); + offset--; + + LASSERT(offset < REQ_MAX_FIELD_NR); + return offset; +} + +void req_capsule_set_swabbed(struct req_capsule *pill, enum req_location loc, + __u32 index) +{ + if (loc == RCL_CLIENT) + req_capsule_set_req_swabbed(pill, index); + else + req_capsule_set_rep_swabbed(pill, index); +} + +bool req_capsule_need_swab(struct req_capsule *pill, enum req_location loc, + __u32 index) +{ + if (loc == RCL_CLIENT) + return (req_capsule_req_need_swab(pill) && + !req_capsule_req_swabbed(pill, index)); + + return (req_capsule_rep_need_swab(pill) && + !req_capsule_rep_swabbed(pill, index)); +} + +/** + * Helper for __req_capsule_get(); swabs value / array of values and/or dumps + * them if desired. + */ +static int +swabber_dumper_helper(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, + int offset, + void *value, int len, bool dump, void (*swabber)(void *)) +{ + void *p; + int i; + int n; + int size; + int rc = 0; + bool do_swab; + bool array = field->rmf_flags & RMF_F_STRUCT_ARRAY; + + swabber = swabber ?: field->rmf_swabber; + + if (req_capsule_need_swab(pill, loc, offset) && + (swabber != NULL || field->rmf_swab_len != NULL) && value != NULL) + do_swab = true; + else + do_swab = false; + + if (!field->rmf_dumper) + dump = false; + + /* + * We're swabbing an array; swabber() swabs a single array element, so + * swab every element. 
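+ *
+ * If the buffer length is not an exact multiple of the element size we
+ * warn below (the static last_field pointer keeps the same field from
+ * warning on every call) and then swab only as many whole elements as
+ * actually fit in the buffer.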
+ */ + if (array && (len % field->rmf_size)) { + static const struct req_msg_field *last_field; + + if (field != last_field) { + CERROR("%s: array buffer size %u is not a multiple of element size %u\n", + field->rmf_name, len, field->rmf_size); + last_field = field; + } + } + /* For the non-array cases, the process of swab/dump/swab only + * needs to be done once. (n = 1) + */ + if (!array) + len = field->rmf_size; + for (p = value, i = 0, n = len / field->rmf_size; + i < n; + i++, p += field->rmf_size) { + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of %s%sfield %s element %d follows\n", + do_swab ? "unswabbed " : "", + array ? "array " : "", + field->rmf_name, i); + field->rmf_dumper(p); + } + if (!do_swab) { + if (array) + continue; + else + break; + } + if (!field->rmf_swab_len) { + swabber(p); + } else { + size = field->rmf_swab_len(p, len); + if (size > 0) { + len -= size; + } else { + rc = size; + break; + } + } + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of swabbed %sfield %s, element %d follows\n", + array ? "array " : "", field->rmf_name, i); + field->rmf_dumper(value); + } + } + if (do_swab) + req_capsule_set_swabbed(pill, loc, offset); + + return rc; +} + +/** + * Returns the pointer to a PTLRPC request or reply (\a loc) buffer of a \a pill + * corresponding to the given RMF (\a field). + * + * The buffer will be swabbed using the given \a swabber. If \a swabber == NULL + * then the \a rmf_swabber from the RMF will be used. Soon there will be no + * calls to __req_capsule_get() with a non-NULL \a swabber; \a swabber will then + * be removed. Fields with the \a RMF_F_STRUCT_ARRAY flag set will have each + * element of the array swabbed. + */ +static void *__req_capsule_get(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, + void (*swabber)(void *), + bool dump) +{ + const struct req_format *fmt; + struct lustre_msg *msg; + void *value; + __u32 len; + __u32 offset; + + void *(*getter)(struct lustre_msg *m, __u32 n, __u32 minlen); + + static const char *rcl_names[RCL_NR] = { + [RCL_CLIENT] = "client", + [RCL_SERVER] = "server" + }; + + LASSERT(pill != NULL); + LASSERT(pill != LP_POISON); + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + LASSERT(fmt != LP_POISON); + LASSERT(__req_format_is_sane(fmt)); + + offset = __req_capsule_offset(pill, field, loc); + + msg = __req_msg(pill, loc); + LASSERT(msg != NULL); + + getter = (field->rmf_flags & RMF_F_STRING) ? + (typeof(getter))lustre_msg_string : lustre_msg_buf; + + if (field->rmf_flags & (RMF_F_STRUCT_ARRAY|RMF_F_NO_SIZE_CHECK)) { + /* + * We've already asserted that field->rmf_size > 0 in + * req_layout_init(). + */ + len = lustre_msg_buflen(msg, offset); + if (!(field->rmf_flags & RMF_F_NO_SIZE_CHECK) && + (len % field->rmf_size) != 0) { + CERROR("%s: array field size mismatch " + "%d modulo %u != 0 (%d)\n", + field->rmf_name, len, field->rmf_size, loc); + return NULL; + } + } else if (pill->rc_area[loc][offset] != -1) { + len = pill->rc_area[loc][offset]; + } else { + len = max_t(typeof(field->rmf_size), field->rmf_size, 0); + } + value = getter(msg, offset, len); + + if (value == NULL) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "Wrong buffer for field '%s' (%u of %u) in format '%s', %u vs. 
%u (%s)", + field->rmf_name, offset, lustre_msg_bufcount(msg), + fmt->rf_name, lustre_msg_buflen(msg, offset), len, + rcl_names[loc]); + } else { + swabber_dumper_helper(pill, field, loc, offset, value, len, + dump, swabber); + } + + return value; +} + +/** + * Dump a request and/or reply + */ +void __req_capsule_dump(struct req_capsule *pill, enum req_location loc) +{ + const struct req_format *fmt; + const struct req_msg_field *field; + __u32 len; + size_t i; + + fmt = pill->rc_fmt; + + DEBUG_REQ(D_RPCTRACE, pill->rc_req, "BEGIN REQ CAPSULE DUMP"); + for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { + field = FMT_FIELD(fmt, loc, i); + if (field->rmf_dumper == NULL) { + /* + * FIXME Add a default hex dumper for fields that don't + * have a specific dumper + */ + len = req_capsule_get_size(pill, field, loc); + CDEBUG(D_RPCTRACE, + "Field %s has no dumper function; field size is %u\n", + field->rmf_name, len); + } else { + /* It's dumping side-effect that we're interested in */ + (void) __req_capsule_get(pill, field, loc, NULL, true); + } + } + CDEBUG(D_RPCTRACE, "END REQ CAPSULE DUMP\n"); +} + +/** + * Dump a request. + */ +void req_capsule_client_dump(struct req_capsule *pill) +{ + __req_capsule_dump(pill, RCL_CLIENT); +} +EXPORT_SYMBOL(req_capsule_client_dump); + +/** + * Dump a reply + */ +void req_capsule_server_dump(struct req_capsule *pill) +{ + __req_capsule_dump(pill, RCL_SERVER); +} +EXPORT_SYMBOL(req_capsule_server_dump); + +/** + * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC request + * buffer corresponding to the given RMF (\a field) of a \a pill. + */ +void *req_capsule_client_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, RCL_CLIENT, NULL, false); +} +EXPORT_SYMBOL(req_capsule_client_get); + +/** + * Same as req_capsule_client_get(), but with a \a swabber argument. + * + * Currently unused; will be removed when req_capsule_server_swab_get() is + * unused too. + */ +void *req_capsule_client_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber) +{ + return __req_capsule_get(pill, field, RCL_CLIENT, swabber, false); +} +EXPORT_SYMBOL(req_capsule_client_swab_get); + +/** + * Utility that combines req_capsule_set_size() and req_capsule_client_get(). + * + * First the \a pill's request \a field's size is set (\a rc_area) using + * req_capsule_set_size() with the given \a len. Then the actual buffer is + * returned. + */ +void *req_capsule_client_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len) +{ + req_capsule_set_size(pill, field, RCL_CLIENT, len); + return __req_capsule_get(pill, field, RCL_CLIENT, NULL, false); +} +EXPORT_SYMBOL(req_capsule_client_sized_get); + +/** + * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC reply + * buffer corresponding to the given RMF (\a field) of a \a pill. + */ +void *req_capsule_server_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, RCL_SERVER, NULL, false); +} +EXPORT_SYMBOL(req_capsule_server_get); + +/** + * Same as req_capsule_server_get(), but with a \a swabber argument. + * + * Ideally all swabbing should be done pursuant to RMF definitions, with no + * swabbing done outside this capsule abstraction. 
+ */ +void *req_capsule_server_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber) +{ + return __req_capsule_get(pill, field, RCL_SERVER, swabber, false); +} +EXPORT_SYMBOL(req_capsule_server_swab_get); + +/** + * Utility that combines req_capsule_set_size() and req_capsule_server_get(). + * + * First the \a pill's request \a field's size is set (\a rc_area) using + * req_capsule_set_size() with the given \a len. Then the actual buffer is + * returned. + */ +void *req_capsule_server_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len) +{ + req_capsule_set_size(pill, field, RCL_SERVER, len); + return __req_capsule_get(pill, field, RCL_SERVER, NULL, false); +} +EXPORT_SYMBOL(req_capsule_server_sized_get); + +void *req_capsule_server_sized_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len, void *swabber) +{ + req_capsule_set_size(pill, field, RCL_SERVER, len); + return __req_capsule_get(pill, field, RCL_SERVER, swabber, false); +} +EXPORT_SYMBOL(req_capsule_server_sized_swab_get); + +/** + * Returns the buffer of a \a pill corresponding to the given \a field from the + * request (if the caller is executing on the server-side) or reply (if the + * caller is executing on the client-side). + * + * This function convienient for use is code that could be executed on the + * client and server alike. + */ +const void *req_capsule_other_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, pill->rc_loc ^ 1, NULL, false); +} +EXPORT_SYMBOL(req_capsule_other_get); + +/** + * Set the size of the PTLRPC request/reply (\a loc) buffer for the given \a + * field of the given \a pill. + * + * This function must be used when constructing variable sized fields of a + * request or reply. + */ +void req_capsule_set_size(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, __u32 size) +{ + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + + if ((size != (__u32)field->rmf_size) && + (field->rmf_size != -1) && + !(field->rmf_flags & RMF_F_NO_SIZE_CHECK) && + (size > 0)) { + __u32 rmf_size = (__u32)field->rmf_size; + if ((field->rmf_flags & RMF_F_STRUCT_ARRAY) && + (size % rmf_size != 0)) { + CERROR("%s: array field size mismatch " + "%u %% %u != 0 (%d)\n", + field->rmf_name, size, rmf_size, loc); + LBUG(); + } else if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY) && + size < rmf_size) { + CERROR("%s: field size mismatch %u != %u (%d)\n", + field->rmf_name, size, rmf_size, loc); + LBUG(); + } + } + + pill->rc_area[loc][__req_capsule_offset(pill, field, loc)] = size; +} +EXPORT_SYMBOL(req_capsule_set_size); + +/** + * Return the actual PTLRPC buffer length of a request or reply (\a loc) + * for the given \a pill's given \a field. + * + * NB: this function doesn't correspond with req_capsule_set_size(), which + * actually sets the size in pill.rc_area[loc][offset], but this function + * returns the message buflen[offset], maybe we should use another name. 
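+ *
+ * In other words, req_capsule_set_size() records the intended size in
+ * \a rc_area before the message is packed, while this function reports the
+ * length of the buffer actually present in the packed request or reply;
+ * the two need not agree until the message has been packed (or has been
+ * adjusted by req_capsule_shrink()/req_capsule_server_grow()).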
+ */ +__u32 req_capsule_get_size(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + + return lustre_msg_buflen(__req_msg(pill, loc), + __req_capsule_offset(pill, field, loc)); +} +EXPORT_SYMBOL(req_capsule_get_size); + +/** + * Wrapper around lustre_msg_size() that returns the PTLRPC size needed for the + * given \a pill's request or reply (\a loc) given the field size recorded in + * the \a pill's rc_area. + * + * See also req_capsule_set_size(). + */ +__u32 req_capsule_msg_size(struct req_capsule *pill, enum req_location loc) +{ + return lustre_msg_size(pill->rc_req->rq_import->imp_msg_magic, + pill->rc_fmt->rf_fields[loc].nr, + pill->rc_area[loc]); +} + +/** + * While req_capsule_msg_size() computes the size of a PTLRPC request or reply + * (\a loc) given a \a pill's \a rc_area, this function computes the size of a + * PTLRPC request or reply given only an RQF (\a fmt). + * + * This function should not be used for formats which contain variable size + * fields. + */ +__u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, + enum req_location loc) +{ + __u32 size; + size_t i = 0; + + /* + * This function should probably LASSERT() that fmt has no fields with + * RMF_F_STRUCT_ARRAY in rmf_flags, since we can't know here how many + * elements in the array there will ultimately be, but then, we could + * assume that there will be at least one element, and that's just what + * we do. + */ + size = lustre_msg_hdr_size(magic, fmt->rf_fields[loc].nr); + if (size == 0) + return size; + + for (; i < fmt->rf_fields[loc].nr; ++i) + if (fmt->rf_fields[loc].d[i]->rmf_size != -1) + size += cfs_size_round(fmt->rf_fields[loc].d[i]-> + rmf_size); + return size; +} +EXPORT_SYMBOL(req_capsule_fmt_size); + +/** + * Changes the format of an RPC. + * + * The pill must already have been initialized, which means that it already has + * a request format. The new format \a fmt must be an extension of the pill's + * old format. Specifically: the new format must have as many request and reply + * fields as the old one, and all fields shared by the old and new format must + * be at least as large in the new format. + * + * The new format's fields may be of different "type" than the old format, but + * only for fields that are "opaque" blobs: fields which have a) have no + * \a rmf_swabber, b) \a rmf_flags == 0 or RMF_F_NO_SIZE_CHECK, and c) \a + * rmf_size == -1 or \a rmf_flags == RMF_F_NO_SIZE_CHECK. For example, + * OBD_SET_INFO has a key field and an opaque value field that gets interpreted + * according to the key field. When the value, according to the key, contains a + * structure (or array thereof) to be swabbed, the format should be changed to + * one where the value field has \a rmf_size/rmf_flags/rmf_swabber set + * accordingly. + */ +void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt) +{ + int i; + size_t j; + + const struct req_format *old; + + LASSERT(pill->rc_fmt != NULL); + LASSERT(__req_format_is_sane(fmt)); + + old = pill->rc_fmt; + /* + * Sanity checking... 
+ */ + for (i = 0; i < RCL_NR; ++i) { + LASSERT(fmt->rf_fields[i].nr >= old->rf_fields[i].nr); + for (j = 0; j < old->rf_fields[i].nr - 1; ++j) { + const struct req_msg_field *ofield = FMT_FIELD(old, i, j); + + /* "opaque" fields can be transmogrified */ + if (ofield->rmf_swabber == NULL && + (ofield->rmf_flags & ~RMF_F_NO_SIZE_CHECK) == 0 && + (ofield->rmf_size == -1 || + ofield->rmf_flags == RMF_F_NO_SIZE_CHECK)) + continue; + LASSERT(FMT_FIELD(fmt, i, j) == FMT_FIELD(old, i, j)); + } + /* + * Last field in old format can be shorter than in new. + */ + LASSERT(FMT_FIELD(fmt, i, j)->rmf_size >= + FMT_FIELD(old, i, j)->rmf_size); + } + + pill->rc_fmt = fmt; +} +EXPORT_SYMBOL(req_capsule_extend); + +/** + * This function returns a non-zero value if the given \a field is present in + * the format (\a rc_fmt) of \a pill's PTLRPC request or reply (\a loc), else it + * returns 0. + */ +int req_capsule_has_field(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + + return field->rmf_offset[pill->rc_fmt->rf_idx][loc]; +} +EXPORT_SYMBOL(req_capsule_has_field); + +/** + * Returns a non-zero value if the given \a field is present in the given \a + * pill's PTLRPC request or reply (\a loc), else it returns 0. + */ +int req_capsule_field_present(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + __u32 offset; + + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + LASSERT(req_capsule_has_field(pill, field, loc)); + + offset = __req_capsule_offset(pill, field, loc); + return lustre_msg_bufcount(__req_msg(pill, loc)) > offset; +} +EXPORT_SYMBOL(req_capsule_field_present); + +/** + * This function shrinks the size of the _buffer_ of the \a pill's PTLRPC + * request or reply (\a loc). + * + * This is not the opposite of req_capsule_extend(). + */ +void req_capsule_shrink(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 newlen, + enum req_location loc) +{ + const struct req_format *fmt; + struct lustre_msg *msg; + __u32 len; + int offset; + + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + LASSERT(__req_format_is_sane(fmt)); + LASSERT(req_capsule_has_field(pill, field, loc)); + LASSERT(req_capsule_field_present(pill, field, loc)); + + offset = __req_capsule_offset(pill, field, loc); + + msg = __req_msg(pill, loc); + len = lustre_msg_buflen(msg, offset); + LASSERTF(newlen <= len, "%s:%s, oldlen=%u, newlen=%u\n", + fmt->rf_name, field->rmf_name, len, newlen); + + if (loc == RCL_CLIENT) { + pill->rc_req->rq_reqlen = lustre_shrink_msg(msg, offset, newlen, + 1); + } else { + pill->rc_req->rq_replen = lustre_shrink_msg(msg, offset, newlen, + 1); + /* update also field size in reply lenghts arrays for possible + * reply re-pack due to req_capsule_server_grow() call. 
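+ * Otherwise a later req_capsule_server_grow() that has to re-pack the
+ * reply would size this field from the stale, pre-shrink value left in
+ * rc_area.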
+ */ + req_capsule_set_size(pill, field, loc, newlen); + } +} +EXPORT_SYMBOL(req_capsule_shrink); + +int req_capsule_server_grow(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 newlen) +{ + struct ptlrpc_reply_state *rs = pill->rc_req->rq_reply_state, *nrs; + char *from, *to; + int rc; + __u32 offset, len; + + LASSERT(pill->rc_fmt != NULL); + LASSERT(__req_format_is_sane(pill->rc_fmt)); + LASSERT(req_capsule_has_field(pill, field, RCL_SERVER)); + LASSERT(req_capsule_field_present(pill, field, RCL_SERVER)); + + len = req_capsule_get_size(pill, field, RCL_SERVER); + offset = __req_capsule_offset(pill, field, RCL_SERVER); + + CDEBUG(D_INFO, "Reply packed: %d, allocated: %d, field len %d -> %d\n", + lustre_packed_msg_size(rs->rs_msg), rs->rs_repbuf_len, + len, newlen); + + req_capsule_set_size(pill, field, RCL_SERVER, newlen); + /** + * There can be enough space in current reply buffer, make sure + * that rs_repbuf is not a wrapper but real reply msg, otherwise + * re-packing is still needed. + */ + if (rs->rs_msg == rs->rs_repbuf && + rs->rs_repbuf_len >= + lustre_packed_msg_size(rs->rs_msg) - len + newlen) { + pill->rc_req->rq_replen = lustre_grow_msg(rs->rs_msg, offset, + newlen); + return 0; + } + + /* Re-allocate replay state */ + pill->rc_req->rq_reply_state = NULL; + rc = req_capsule_server_pack(pill); + if (rc) { + /* put old values back, the caller should decide what to do */ + req_capsule_set_size(pill, field, RCL_SERVER, len); + pill->rc_req->rq_reply_state = rs; + return rc; + } + nrs = pill->rc_req->rq_reply_state; + LASSERT(lustre_packed_msg_size(nrs->rs_msg) > + lustre_packed_msg_size(rs->rs_msg)); + + /* Now we need only buffers, copy them and grow the needed one */ + to = lustre_msg_buf(nrs->rs_msg, 0, 0); + from = lustre_msg_buf(rs->rs_msg, 0, 0); + memcpy(to, from, + (char *)rs->rs_msg + lustre_packed_msg_size(rs->rs_msg) - from); + lustre_msg_set_buflen(nrs->rs_msg, offset, len); + pill->rc_req->rq_replen = lustre_grow_msg(nrs->rs_msg, offset, newlen); + + if (rs->rs_difficult) { + /* copy rs data */ + int i; + + nrs->rs_difficult = 1; + nrs->rs_no_ack = rs->rs_no_ack; + nrs->rs_convert_lock = rs->rs_convert_lock; + for (i = 0; i < rs->rs_nlocks; i++) { + nrs->rs_locks[i] = rs->rs_locks[i]; + nrs->rs_modes[i] = rs->rs_modes[i]; + nrs->rs_nlocks++; + } + rs->rs_nlocks = 0; + rs->rs_difficult = 0; + rs->rs_no_ack = 0; + } + ptlrpc_rs_decref(rs); + return 0; +} +EXPORT_SYMBOL(req_capsule_server_grow); + +#ifdef HAVE_SERVER_SUPPORT +static const struct req_msg_field *mds_update_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OUT_UPDATE_HEADER, + &RMF_OUT_UPDATE_BUF, +}; + +static const struct req_msg_field *mds_update_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OUT_UPDATE_REPLY, +}; + +struct req_msg_field RMF_OUT_UPDATE = DEFINE_MSGFL("object_update", 0, -1, + lustre_swab_object_update_request, NULL); +EXPORT_SYMBOL(RMF_OUT_UPDATE); + +struct req_msg_field RMF_OUT_UPDATE_REPLY = + DEFINE_MSGFL("object_update_reply", 0, -1, + lustre_swab_object_update_reply, NULL); +EXPORT_SYMBOL(RMF_OUT_UPDATE_REPLY); + +struct req_msg_field RMF_OUT_UPDATE_HEADER = DEFINE_MSGF("out_update_header", 0, + -1, lustre_swab_out_update_header, NULL); +EXPORT_SYMBOL(RMF_OUT_UPDATE_HEADER); + +struct req_msg_field RMF_OUT_UPDATE_BUF = DEFINE_MSGF("update_buf", + RMF_F_STRUCT_ARRAY, sizeof(struct out_update_buffer), + lustre_swab_out_update_buffer, NULL); +EXPORT_SYMBOL(RMF_OUT_UPDATE_BUF); + +struct req_format RQF_OUT_UPDATE = + DEFINE_REQ_FMT0("OUT_UPDATE", mds_update_client, + 
mds_update_server); +EXPORT_SYMBOL(RQF_OUT_UPDATE); + +int req_check_sepol(struct req_capsule *pill) +{ + int rc = 0; + struct obd_export *export; + struct lu_nodemap *nm = NULL; + const char *sepol = NULL; + const char *nm_sepol = NULL; + + if (!pill->rc_req) + return -EPROTO; + + export = pill->rc_req->rq_export; + if (!export || !exp_connect_sepol(export) || + !req_capsule_has_field(pill, &RMF_SELINUX_POL, RCL_CLIENT)) + goto nm; + + if (req_capsule_get_size(pill, &RMF_SELINUX_POL, RCL_CLIENT) == 0) + goto nm; + + sepol = req_capsule_client_get(pill, &RMF_SELINUX_POL); + CDEBUG(D_SEC, "retrieved sepol %s\n", sepol); + +nm: + if (export) { + nm = nodemap_get_from_exp(export); + if (!IS_ERR_OR_NULL(nm)) { + nm_sepol = nodemap_get_sepol(nm); + if (nm_sepol && nm_sepol[0]) + if (sepol == NULL || + strcmp(sepol, nm_sepol) != 0) + rc = -EACCES; + } + } + + if (!IS_ERR_OR_NULL(nm)) + nodemap_putref(nm); + + return rc; +} +EXPORT_SYMBOL(req_check_sepol); +#endif diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c new file mode 100644 index 0000000000000..d0c3e61ad7b87 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c @@ -0,0 +1,352 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/llog_client.c + * + * remote api for llog - client side + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include + +#include +#include +#include + +#include "ptlrpc_internal.h" + +#define LLOG_CLIENT_ENTRY(ctxt, imp) do { \ + mutex_lock(&ctxt->loc_mutex); \ + if (ctxt->loc_imp) { \ + imp = class_import_get(ctxt->loc_imp); \ + } else { \ + CERROR("ctxt->loc_imp == NULL for context idx %d." \ + "Unable to complete MDS/OSS recovery," \ + "but I'll try again next time. Not fatal.\n", \ + ctxt->loc_idx); \ + imp = NULL; \ + mutex_unlock(&ctxt->loc_mutex); \ + return -EINVAL; \ + } \ + mutex_unlock(&ctxt->loc_mutex); \ +} while (0) + +#define LLOG_CLIENT_EXIT(ctxt, imp) do { \ + mutex_lock(&ctxt->loc_mutex); \ + if (ctxt->loc_imp != imp) \ + CWARN("loc_imp has changed from %p to %p\n", \ + ctxt->loc_imp, imp); \ + class_import_put(imp); \ + mutex_unlock(&ctxt->loc_mutex); \ +} while (0) + +/* + * This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context. 
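+ *
+ * Each handler below brackets its RPC with LLOG_CLIENT_ENTRY() and
+ * LLOG_CLIENT_EXIT(): the former takes a reference on ctxt->loc_imp under
+ * loc_mutex (bailing out with -EINVAL if the import is already gone), the
+ * latter drops that reference, so the import cannot disappear while the
+ * request is in flight.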
+ */ +static int llog_client_open(const struct lu_env *env, + struct llog_handle *lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + struct obd_import *imp; + struct llogd_body *body; + struct llog_ctxt *ctxt = lgh->lgh_ctxt; + struct ptlrpc_request *req = NULL; + int rc; + + ENTRY; + + LLOG_CLIENT_ENTRY(ctxt, imp); + + /* client cannot create llog */ + LASSERTF(open_param != LLOG_OPEN_NEW, "%#x\n", open_param); + LASSERT(lgh); + + req = ptlrpc_request_alloc(imp, &RQF_LLOG_ORIGIN_HANDLE_CREATE); + if (!req) + GOTO(out, rc = -ENOMEM); + + if (name) + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + strlen(name) + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_CREATE); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + GOTO(out, rc); + } + ptlrpc_request_set_replen(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (logid) + body->lgd_logid = *logid; + body->lgd_ctxt_idx = ctxt->loc_idx - 1; + + if (name) { + char *tmp; + + tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME, + strlen(name) + 1); + LASSERT(tmp); + strcpy(tmp, name); + + do_pack_body(req); + } + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (!body) + GOTO(out, rc = -EFAULT); + + lgh->lgh_id = body->lgd_logid; + lgh->lgh_ctxt = ctxt; + EXIT; +out: + LLOG_CLIENT_EXIT(ctxt, imp); + ptlrpc_req_finished(req); + return rc; +} + +static int llog_client_next_block(const struct lu_env *env, + struct llog_handle *loghandle, + int *cur_idx, int next_idx, + __u64 *cur_offset, void *buf, int len) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void *ptr; + int rc; + + ENTRY; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + if (!req) + GOTO(err_exit, rc = -ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = next_idx; + body->lgd_saved_index = *cur_idx; + body->lgd_len = len; + body->lgd_cur_offset = *cur_offset; + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + /* + * -EIO has a special meaning here. If llog_osd_next_block() + * reaches the end of the log without finding the desired + * record then it updates *cur_offset and *cur_idx and returns + * -EIO. In llog_process_thread() we use this to detect + * EOF. But we must be careful to distinguish between -EIO + * coming from llog_osd_next_block() and -EIO coming from + * ptlrpc or below. 
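+ *
+ * Concretely: only an -EIO whose reply message also carries status -EIO
+ * is treated as end-of-log, in which case the updated cur_idx/cur_offset
+ * from the reply body are still copied out before -EIO is returned; an
+ * -EIO with no reply message (or with a different status) is returned
+ * immediately as a real error.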
+ */ + if (rc == -EIO) { + if (!req->rq_repmsg || + lustre_msg_get_status(req->rq_repmsg) != -EIO) + GOTO(out, rc); + } else if (rc < 0) { + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (!body) + GOTO(out, rc = -EFAULT); + + *cur_idx = body->lgd_saved_index; + *cur_offset = body->lgd_cur_offset; + + if (rc < 0) + GOTO(out, rc); + + /* The log records are swabbed as they are processed */ + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + if (!ptr) + GOTO(out, rc = -EFAULT); + + memcpy(buf, ptr, len); + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void *ptr; + int rc; + + ENTRY; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_PREV_BLOCK); + if (!req) + GOTO(err_exit, rc = -ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = prev_idx; + body->lgd_len = len; + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (!body) + GOTO(out, rc = -EFAULT); + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + if (!ptr) + GOTO(out, rc = -EFAULT); + + memcpy(buf, ptr, len); + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + struct llog_log_hdr *hdr; + struct llog_rec_hdr *llh_hdr; + int rc; + + ENTRY; + + LLOG_CLIENT_ENTRY(handle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_READ_HEADER); + if (!req) + GOTO(err_exit, rc = -ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = handle->lgh_id; + body->lgd_ctxt_idx = handle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = handle->lgh_hdr->llh_flags; + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR); + if (!hdr) + GOTO(out, rc = -EFAULT); + + if (handle->lgh_hdr_size < hdr->llh_hdr.lrh_len) + GOTO(out, rc = -EFAULT); + + memcpy(handle->lgh_hdr, hdr, hdr->llh_hdr.lrh_len); + handle->lgh_last_idx = LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index; + + /* sanity checks */ + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + CERROR("bad log header magic: %#x (expecting %#x)\n", + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + rc = -EIO; + } else if (llh_hdr->lrh_len != + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len || + (llh_hdr->lrh_len & (llh_hdr->lrh_len - 1)) != 0 || + llh_hdr->lrh_len < LLOG_MIN_CHUNK_SIZE || + llh_hdr->lrh_len > handle->lgh_hdr_size) { + CERROR("incorrectly sized log header: %#x, expecting %#x (power 
of two > 8192)\n", + llh_hdr->lrh_len, + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len); + CERROR("you may need to re-run lconf --write_conf.\n"); + rc = -EIO; + } + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(handle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_close(const struct lu_env *env, + struct llog_handle *handle) +{ + /* + * this doesn't call LLOG_ORIGIN_HANDLE_CLOSE because + * the servers all close the file at the end of every + * other LLOG_ RPC. + */ + return 0; +} + +const struct llog_operations llog_client_ops = { + .lop_next_block = llog_client_next_block, + .lop_prev_block = llog_client_prev_block, + .lop_read_header = llog_client_read_header, + .lop_open = llog_client_open, + .lop_close = llog_client_close, +}; +EXPORT_SYMBOL(llog_client_ops); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_net.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_net.c new file mode 100644 index 0000000000000..5be7dfc38bcbd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_net.c @@ -0,0 +1,67 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/llog_net.c + * + * OST<->MDS recovery logging infrastructure. + * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include + +int llog_initiator_connect(struct llog_ctxt *ctxt) +{ + struct obd_import *new_imp; + + ENTRY; + + LASSERT(ctxt); + new_imp = ctxt->loc_obd->u.cli.cl_import; + LASSERTF(!ctxt->loc_imp || ctxt->loc_imp == new_imp, + "%p - %p\n", ctxt->loc_imp, new_imp); + mutex_lock(&ctxt->loc_mutex); + if (ctxt->loc_imp != new_imp) { + if (ctxt->loc_imp) + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = class_import_get(new_imp); + } + mutex_unlock(&ctxt->loc_mutex); + RETURN(0); +} +EXPORT_SYMBOL(llog_initiator_connect); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c new file mode 100644 index 0000000000000..d19ea86d82f54 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c @@ -0,0 +1,288 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/llog_server.c + * + * remote api for llog - server side + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include +#include + +static int llog_origin_close(const struct lu_env *env, struct llog_handle *lgh) +{ + if (lgh->lgh_hdr != NULL && lgh->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + return llog_cat_close(env, lgh); + else + return llog_close(env, lgh); +} + +/* Only open is supported, no new llog can be created remotely */ +int llog_origin_handle_open(struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + struct obd_device *obd = exp->exp_obd; + struct llog_handle *loghandle; + struct llogd_body *body; + struct llog_logid *logid = NULL; + struct llog_ctxt *ctxt; + char *name = NULL; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (ostid_id(&body->lgd_logid.lgl_oi) > 0) + logid = &body->lgd_logid; + + if (req_capsule_field_present(&req->rq_pill, &RMF_NAME, RCL_CLIENT)) { + name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + if (name == NULL) + RETURN(-EFAULT); + CDEBUG(D_INFO, "%s: opening log %s\n", obd->obd_name, name); + } + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d name=%s\n", + obd->obd_name, body->lgd_ctxt_idx, name); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(obd, body->lgd_ctxt_idx); + if (ctxt == NULL) { + CDEBUG(D_WARNING, "%s: no ctxt. 
group=%p idx=%d name=%s\n", + obd->obd_name, &obd->obd_olg, body->lgd_ctxt_idx, name); + RETURN(-ENODEV); + } + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, logid, + name, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + + llog_origin_close(req->rq_svc_thread->t_env, loghandle); + EXIT; +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +int llog_origin_handle_next_block(struct ptlrpc_request *req) +{ + struct llog_handle *loghandle; + struct llogd_body *body; + struct llogd_body *repbody; + struct llog_ctxt *ctxt; + __u32 flags; + void *ptr; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, + LLOG_MIN_CHUNK_SIZE); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", + req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_LLOG_UMOUNT_RACE)) + cfs_fail_val = 1; + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + *repbody = *body; + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + rc = llog_next_block(req->rq_svc_thread->t_env, loghandle, + &repbody->lgd_saved_index, repbody->lgd_index, + &repbody->lgd_cur_offset, ptr, + LLOG_MIN_CHUNK_SIZE); + if (rc) + GOTO(out_close, rc); + EXIT; +out_close: + llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +int llog_origin_handle_prev_block(struct ptlrpc_request *req) +{ + struct llog_handle *loghandle; + struct llogd_body *body; + struct llogd_body *repbody; + struct llog_ctxt *ctxt; + __u32 flags; + void *ptr; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, + LLOG_MIN_CHUNK_SIZE); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", + req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + *repbody = *body; + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + rc = llog_prev_block(req->rq_svc_thread->t_env, loghandle, + body->lgd_index, ptr, LLOG_MIN_CHUNK_SIZE); + if (rc) + GOTO(out_close, rc); + + EXIT; +out_close: + 
llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +int llog_origin_handle_read_header(struct ptlrpc_request *req) +{ + struct llog_handle *loghandle; + struct llogd_body *body; + struct llog_log_hdr *hdr; + struct llog_ctxt *ctxt; + __u32 flags; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", + req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + /* + * llog_init_handle() reads the llog header + */ + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + flags = loghandle->lgh_hdr->llh_flags; + + hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR); + *hdr = *loghandle->lgh_hdr; + EXIT; +out_close: + llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c new file mode 100644 index 0000000000000..0e0b1706655af --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c @@ -0,0 +1,1480 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#define DEBUG_SUBSYSTEM S_CLASS + + +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + + +static struct ll_rpc_opcode { + __u32 opcode; + const char *opname; +} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = { + { OST_REPLY, "ost_reply" }, + { OST_GETATTR, "ost_getattr" }, + { OST_SETATTR, "ost_setattr" }, + { OST_READ, "ost_read" }, + { OST_WRITE, "ost_write" }, + { OST_CREATE , "ost_create" }, + { OST_DESTROY, "ost_destroy" }, + { OST_GET_INFO, "ost_get_info" }, + { OST_CONNECT, "ost_connect" }, + { OST_DISCONNECT, "ost_disconnect" }, + { OST_PUNCH, "ost_punch" }, + { OST_OPEN, "ost_open" }, + { OST_CLOSE, "ost_close" }, + { OST_STATFS, "ost_statfs" }, + { 14, NULL }, /* formerly OST_SAN_READ */ + { 15, NULL }, /* formerly OST_SAN_WRITE */ + { OST_SYNC, "ost_sync" }, + { OST_SET_INFO, "ost_set_info" }, + { OST_QUOTACHECK, "ost_quotacheck" }, + { OST_QUOTACTL, "ost_quotactl" }, + { OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" }, + { OST_LADVISE, "ost_ladvise" }, + { OST_FALLOCATE, "ost_fallocate" }, + { OST_SEEK, "ost_seek" }, + { MDS_GETATTR, "mds_getattr" }, + { MDS_GETATTR_NAME, "mds_getattr_lock" }, + { MDS_CLOSE, "mds_close" }, + { MDS_REINT, "mds_reint" }, + { MDS_READPAGE, "mds_readpage" }, + { MDS_CONNECT, "mds_connect" }, + { MDS_DISCONNECT, "mds_disconnect" }, + { MDS_GET_ROOT, "mds_get_root" }, + { MDS_STATFS, "mds_statfs" }, + { MDS_PIN, "mds_pin" }, + { MDS_UNPIN, "mds_unpin" }, + { MDS_SYNC, "mds_sync" }, + { MDS_DONE_WRITING, "mds_done_writing" }, + { MDS_SET_INFO, "mds_set_info" }, + { MDS_QUOTACHECK, "mds_quotacheck" }, + { MDS_QUOTACTL, "mds_quotactl" }, + { MDS_GETXATTR, "mds_getxattr" }, + { MDS_SETXATTR, "mds_setxattr" }, + { MDS_WRITEPAGE, "mds_writepage" }, + { MDS_IS_SUBDIR, "mds_is_subdir" }, + { MDS_GET_INFO, "mds_get_info" }, + { MDS_HSM_STATE_GET, "mds_hsm_state_get" }, + { MDS_HSM_STATE_SET, "mds_hsm_state_set" }, + { MDS_HSM_ACTION, "mds_hsm_action" }, + { MDS_HSM_PROGRESS, "mds_hsm_progress" }, + { MDS_HSM_REQUEST, "mds_hsm_request" }, + { MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" }, + { MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" }, + { MDS_SWAP_LAYOUTS, "mds_swap_layouts" }, + { MDS_RMFID, "mds_rmfid" }, + { LDLM_ENQUEUE, "ldlm_enqueue" }, + { LDLM_CONVERT, "ldlm_convert" }, + { LDLM_CANCEL, "ldlm_cancel" }, + { LDLM_BL_CALLBACK, "ldlm_bl_callback" }, + { LDLM_CP_CALLBACK, "ldlm_cp_callback" }, + { LDLM_GL_CALLBACK, "ldlm_gl_callback" }, + { LDLM_SET_INFO, "ldlm_set_info" }, + { MGS_CONNECT, "mgs_connect" }, + { MGS_DISCONNECT, "mgs_disconnect" }, + { MGS_EXCEPTION, "mgs_exception" }, + { MGS_TARGET_REG, "mgs_target_reg" }, + { MGS_TARGET_DEL, "mgs_target_del" }, + { MGS_SET_INFO, "mgs_set_info" }, + { MGS_CONFIG_READ, "mgs_config_read" }, + { OBD_PING, "obd_ping" }, + { 401, /* was OBD_LOG_CANCEL */ "llog_cancel" }, + { 402, /* was OBD_QC_CALLBACK */ "obd_quota_callback" }, + { OBD_IDX_READ, "dt_index_read" }, + { LLOG_ORIGIN_HANDLE_CREATE, "llog_origin_handle_open" }, + { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" }, + { LLOG_ORIGIN_HANDLE_READ_HEADER, "llog_origin_handle_read_header" }, + { 504, /*LLOG_ORIGIN_HANDLE_WRITE_REC*/"llog_origin_handle_write_rec" }, + { 505, /* was LLOG_ORIGIN_HANDLE_CLOSE */ "llog_origin_handle_close" }, + { 506, /* was LLOG_ORIGIN_CONNECT */ "llog_origin_connect" }, + { 507, /* was LLOG_CATINFO */ "llog_catinfo" }, + { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" }, + { 
LLOG_ORIGIN_HANDLE_DESTROY, "llog_origin_handle_destroy" }, + { QUOTA_DQACQ, "quota_acquire" }, + { QUOTA_DQREL, "quota_release" }, + { SEQ_QUERY, "seq_query" }, + { SEC_CTX_INIT, "sec_ctx_init" }, + { SEC_CTX_INIT_CONT, "sec_ctx_init_cont" }, + { SEC_CTX_FINI, "sec_ctx_fini" }, + { FLD_QUERY, "fld_query" }, + { FLD_READ, "fld_read" }, +#ifdef HAVE_SERVER_SUPPORT + { OUT_UPDATE, "out_update" }, + { LFSCK_NOTIFY, "lfsck_notify" }, + { LFSCK_QUERY, "lfsck_query" }, +#endif +}; + +static struct ll_eopcode { + __u32 opcode; + const char *opname; +} ll_eopcode_table[EXTRA_LAST_OPC] = { + { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" }, + { LDLM_PLAIN_ENQUEUE, "ldlm_plain_enqueue" }, + { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" }, + { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" }, + { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" }, + { MDS_REINT_SETATTR, "mds_reint_setattr" }, + { MDS_REINT_CREATE, "mds_reint_create" }, + { MDS_REINT_LINK, "mds_reint_link" }, + { MDS_REINT_UNLINK, "mds_reint_unlink" }, + { MDS_REINT_RENAME, "mds_reint_rename" }, + { MDS_REINT_OPEN, "mds_reint_open" }, + { MDS_REINT_SETXATTR, "mds_reint_setxattr" }, + { MDS_REINT_RESYNC, "mds_reint_resync" }, + { BRW_READ_BYTES, "read_bytes" }, + { BRW_WRITE_BYTES, "write_bytes" }, +}; + +const char *ll_opcode2str(__u32 opcode) +{ + __u32 offset = opcode_offset(opcode); + + /* When one of the assertions below fail, chances are that: + * 1) A new opcode was added in include/lustre/lustre_idl.h, + * but is missing from the table above. + * or 2) The opcode space was renumbered or rearranged, + * and the opcode_offset() function in + * ptlrpc_internal.h needs to be modified. + */ + LASSERTF(offset < LUSTRE_MAX_OPCODES, + "offset %u >= LUSTRE_MAX_OPCODES %u\n", + offset, LUSTRE_MAX_OPCODES); + LASSERTF(ll_rpc_opcode_table[offset].opcode == opcode, + "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n", + offset, ll_rpc_opcode_table[offset].opcode, opcode); + + return ll_rpc_opcode_table[offset].opname; +} + +const int ll_str2opcode(const char *ops) +{ + int i; + + for (i = 0; i < LUSTRE_MAX_OPCODES; i++) { + if (ll_rpc_opcode_table[i].opname != NULL && + strcmp(ll_rpc_opcode_table[i].opname, ops) == 0) + return ll_rpc_opcode_table[i].opcode; + } + + return -EINVAL; +} + +static const char *ll_eopcode2str(__u32 opcode) +{ + LASSERT(ll_eopcode_table[opcode].opcode == opcode); + return ll_eopcode_table[opcode].opname; +} + +static void +ptlrpc_ldebugfs_register(struct dentry *root, char *dir, char *name, + struct dentry **debugfs_root_ret, + struct lprocfs_stats **stats_ret) +{ + struct dentry *svc_debugfs_entry; + struct lprocfs_stats *svc_stats; + int i; + unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV; + + LASSERT(!*debugfs_root_ret); + LASSERT(!*stats_ret); + + svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES + LUSTRE_MAX_OPCODES, + 0); + if (!svc_stats) + return; + + if (dir) + svc_debugfs_entry = debugfs_create_dir(dir, root); + else + svc_debugfs_entry = root; + + lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR, + svc_counter_config, "req_waittime", "usec"); + lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR, + svc_counter_config, "req_qdepth", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR, + svc_counter_config, "req_active", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT, + svc_counter_config, "req_timeout", "sec"); + lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR, + svc_counter_config, "reqbuf_avail", "bufs"); + for (i = 0; i < EXTRA_LAST_OPC; i++) { 
+ char *units; + + switch (i) { + case BRW_WRITE_BYTES: + case BRW_READ_BYTES: + units = "bytes"; + break; + default: + units = "reqs"; + break; + } + lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i, + svc_counter_config, + ll_eopcode2str(i), units); + } + for (i = 0; i < LUSTRE_MAX_OPCODES; i++) { + __u32 opcode = ll_rpc_opcode_table[i].opcode; + + lprocfs_counter_init(svc_stats, + EXTRA_MAX_OPCODES + i, svc_counter_config, + ll_opcode2str(opcode), "usec"); + } + + debugfs_create_file(name, 0644, svc_debugfs_entry, svc_stats, + &ldebugfs_stats_seq_fops); + + if (dir) + *debugfs_root_ret = svc_debugfs_entry; + *stats_ret = svc_stats; +} + +static int +ptlrpc_lprocfs_req_buffer_history_len_seq_show(struct seq_file *m, void *v) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svcpt->scp_hist_nrqbds; + + seq_printf(m, "%d\n", total); + + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(ptlrpc_lprocfs_req_buffer_history_len); + +static int +ptlrpc_lprocfs_req_buffer_history_max_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svc->srv_hist_nrqbds_cpt_max; + + seq_printf(m, "%d\n", total); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_req_buffer_history_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + unsigned long long val; + unsigned long long limit; + int bufpages; + int rc; + + rc = kstrtoull_from_user(buffer, count, 0, &val); + if (rc < 0) + return rc; + + if (val < 0 || val > INT_MAX) + return -ERANGE; + + /* This sanity check is more of an insanity check; we can still + * hose a kernel by allowing the request history to grow too + * far. 
The roundup to the next power of two is an empirical way + * to take care that request buffer is allocated in Slab and thus + * will be upgraded */ + bufpages = (roundup_pow_of_two(svc->srv_buf_size) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + limit = cfs_totalram_pages() / (2 * bufpages); + /* do not allow history to consume more than half max number of rqbds */ + if ((svc->srv_nrqbds_max == 0 && val > limit) || + (svc->srv_nrqbds_max != 0 && val > svc->srv_nrqbds_max / 2)) + return -ERANGE; + + spin_lock(&svc->srv_lock); + + if (val == 0) + svc->srv_hist_nrqbds_cpt_max = 0; + else + svc->srv_hist_nrqbds_cpt_max = + max(1, ((int)val / svc->srv_ncpts)); + + spin_unlock(&svc->srv_lock); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_req_buffer_history_max); + +static int +ptlrpc_lprocfs_req_buffers_max_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + + seq_printf(m, "%d\n", svc->srv_nrqbds_max); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_req_buffers_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + int val; + int rc; + + rc = kstrtoint_from_user(buffer, count, 0, &val); + if (rc < 0) + return rc; + + if (val < svc->srv_nbuf_per_group && val != 0) + return -ERANGE; + + spin_lock(&svc->srv_lock); + + svc->srv_nrqbds_max = (uint)val; + + spin_unlock(&svc->srv_lock); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_req_buffers_max); + +static ssize_t threads_min_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + return sprintf(buf, "%d\n", svc->srv_nthrs_cpt_init * svc->srv_ncpts); +} + +static ssize_t threads_min_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc < 0) + return rc; + + if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) + return -ERANGE; + + spin_lock(&svc->srv_lock); + if (val > svc->srv_nthrs_cpt_limit * svc->srv_ncpts) { + spin_unlock(&svc->srv_lock); + return -ERANGE; + } + + svc->srv_nthrs_cpt_init = (int)val / svc->srv_ncpts; + + spin_unlock(&svc->srv_lock); + + return count; +} +LUSTRE_RW_ATTR(threads_min); + +static ssize_t threads_started_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svcpt->scp_nthrs_running; + + return sprintf(buf, "%d\n", total); +} +LUSTRE_RO_ATTR(threads_started); + +static ssize_t threads_max_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + return sprintf(buf, "%d\n", svc->srv_nthrs_cpt_limit * svc->srv_ncpts); +} + +static ssize_t threads_max_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc < 0) + return rc; + + if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) + return -ERANGE; + + spin_lock(&svc->srv_lock); + if (val < 
svc->srv_nthrs_cpt_init * svc->srv_ncpts) { + spin_unlock(&svc->srv_lock); + return -ERANGE; + } + + svc->srv_nthrs_cpt_limit = (int)val / svc->srv_ncpts; + + spin_unlock(&svc->srv_lock); + + return count; +} +LUSTRE_RW_ATTR(threads_max); + +/** + * Translates \e ptlrpc_nrs_pol_state values to human-readable strings. + * + * \param[in] state The policy state + */ +static const char *nrs_state2str(enum ptlrpc_nrs_pol_state state) +{ + switch (state) { + default: + LBUG(); + case NRS_POL_STATE_INVALID: + return "invalid"; + case NRS_POL_STATE_STOPPED: + return "stopped"; + case NRS_POL_STATE_STOPPING: + return "stopping"; + case NRS_POL_STATE_STARTING: + return "starting"; + case NRS_POL_STATE_STARTED: + return "started"; + } +} + +/** + * Obtains status information for \a policy. + * + * Information is copied in \a info. + * + * \param[in] policy The policy + * \param[out] info Holds returned status information + */ +static void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_pol_info *info) +{ + LASSERT(policy != NULL); + LASSERT(info != NULL); + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + BUILD_BUG_ON(sizeof(info->pi_arg) != sizeof(policy->pol_arg)); + memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX); + memcpy(info->pi_arg, policy->pol_arg, sizeof(policy->pol_arg)); + + info->pi_fallback = !!(policy->pol_flags & PTLRPC_NRS_FL_FALLBACK); + info->pi_state = policy->pol_state; + /** + * XXX: These are accessed without holding + * ptlrpc_service_part::scp_req_lock. + */ + info->pi_req_queued = policy->pol_req_queued; + info->pi_req_started = policy->pol_req_started; +} + +/** + * Reads and prints policy status information for all policies of a PTLRPC + * service. + */ +static int ptlrpc_lprocfs_nrs_policies_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_nrs *nrs; + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_pol_info *infos; + struct ptlrpc_nrs_pol_info tmp; + unsigned int num_pols; + unsigned int pol_idx = 0; + bool hp = false; + int i; + int rc = 0; + ENTRY; + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Use the first service partition's regular NRS head in order to obtain + * the number of policies registered with NRS heads of this service. All + * service partitions will have the same number of policies. + */ + nrs = nrs_svcpt2nrs(svc->srv_parts[0], false); + + spin_lock(&nrs->nrs_lock); + num_pols = svc->srv_parts[0]->scp_nrs_reg.nrs_num_pols; + spin_unlock(&nrs->nrs_lock); + + OBD_ALLOC_PTR_ARRAY(infos, num_pols); + if (infos == NULL) + GOTO(out, rc = -ENOMEM); +again: + + ptlrpc_service_for_each_part(svcpt, i, svc) { + nrs = nrs_svcpt2nrs(svcpt, hp); + spin_lock(&nrs->nrs_lock); + + pol_idx = 0; + + list_for_each_entry(policy, &nrs->nrs_policy_list, + pol_list) { + LASSERT(pol_idx < num_pols); + + nrs_policy_get_info_locked(policy, &tmp); + /** + * Copy values when handling the first service + * partition. + */ + if (i == 0) { + memcpy(infos[pol_idx].pi_name, tmp.pi_name, + NRS_POL_NAME_MAX); + memcpy(infos[pol_idx].pi_arg, tmp.pi_arg, + sizeof(tmp.pi_arg)); + memcpy(&infos[pol_idx].pi_state, &tmp.pi_state, + sizeof(tmp.pi_state)); + infos[pol_idx].pi_fallback = tmp.pi_fallback; + /** + * For the rest of the service partitions + * sanity-check the values we get. 
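+ * Only the policy name, argument string and fallback flag are + * expected to match across partitions; the per-partition pi_state is + * deliberately not compared, as noted below.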
+ */ + } else { + if (strncmp(infos[pol_idx].pi_name, + tmp.pi_name, + NRS_POL_NAME_MAX) != 0) { + spin_unlock(&nrs->nrs_lock); + rc = -EINVAL; + CERROR("%s: failed to check pi_name: rc = %d\n", + svc->srv_thread_name, rc); + GOTO(out, rc); + } + if (strncmp(infos[pol_idx].pi_arg, + tmp.pi_arg, + sizeof(tmp.pi_arg)) != 0) { + spin_unlock(&nrs->nrs_lock); + rc = -EINVAL; + CERROR("%s: failed to check pi_arg: rc = %d\n", + svc->srv_thread_name, rc); + GOTO(out, rc); + } + /** + * Not checking ptlrpc_nrs_pol_info::pi_state, + * because it may be different between + * instances of the same policy in different + * service partitions. + */ + + if (infos[pol_idx].pi_fallback != + tmp.pi_fallback) { + spin_unlock(&nrs->nrs_lock); + rc = -EINVAL; + CERROR("%s: failed to check pi_fallback: rc = %d\n", + svc->srv_thread_name, rc); + GOTO(out, rc); + } + } + + infos[pol_idx].pi_req_queued += tmp.pi_req_queued; + infos[pol_idx].pi_req_started += tmp.pi_req_started; + + pol_idx++; + } + spin_unlock(&nrs->nrs_lock); + } + + /** + * Policy status information output is in YAML format. + * For example: + * + * regular_requests: + * - name: fifo + * state: started + * fallback: yes + * queued: 0 + * active: 0 + * + * - name: crrn + * state: started + * fallback: no + * queued: 2015 + * active: 384 + * + * high_priority_requests: + * - name: fifo + * state: started + * fallback: yes + * queued: 0 + * active: 2 + * + * - name: crrn + * state: stopped + * fallback: no + * queued: 0 + * active: 0 + */ + seq_printf(m, "%s\n", !hp ? "\nregular_requests:" : + "high_priority_requests:"); + + for (pol_idx = 0; pol_idx < num_pols; pol_idx++) { + if (strlen(infos[pol_idx].pi_arg) > 0) + seq_printf(m, " - name: %s %s\n", + infos[pol_idx].pi_name, + infos[pol_idx].pi_arg); + else + seq_printf(m, " - name: %s\n", + infos[pol_idx].pi_name); + + + seq_printf(m, " state: %s\n" + " fallback: %s\n" + " queued: %-20d\n" + " active: %-20d\n\n", + nrs_state2str(infos[pol_idx].pi_state), + infos[pol_idx].pi_fallback ? "yes" : "no", + (int)infos[pol_idx].pi_req_queued, + (int)infos[pol_idx].pi_req_started); + } + + if (!hp && nrs_svc_has_hp(svc)) { + memset(infos, 0, num_pols * sizeof(*infos)); + + /** + * Redo the processing for the service's HP NRS heads' policies. + */ + hp = true; + goto again; + } + +out: + if (infos) + OBD_FREE_PTR_ARRAY(infos, num_pols); + + mutex_unlock(&nrs_core.nrs_mutex); + + RETURN(rc); +} + +#define LPROCFS_NRS_WR_MAX_ARG (1024) +/** + * The longest valid command string is the maximum policy name size, plus the + * length of the " reg" substring, plus the length of the argument + */ +#define LPROCFS_NRS_WR_MAX_CMD (NRS_POL_NAME_MAX + sizeof(" reg") - 1 + \ + LPROCFS_NRS_WR_MAX_ARG) + +/** + * Starts and stops a given policy on a PTLRPC service. + * + * Commands consist of the policy name, followed by an optional [reg|hp] token; + * if the optional token is omitted, the operation is performed on both the + * regular and high-priority (if the service has one) NRS head.
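+ * + * For example, writing "fifo reg" to this file starts the fifo policy + * on the regular NRS head only, while writing just "fifo" starts it on + * both heads; any remaining text after the optional token is passed to + * the policy as its argument string.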
+ */ +static ssize_t +ptlrpc_lprocfs_nrs_policies_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + enum ptlrpc_nrs_queue_type queue = PTLRPC_NRS_QUEUE_BOTH; + char *cmd; + char *cmd_copy = NULL; + char *policy_name; + char *queue_name; + int rc = 0; + ENTRY; + + if (count >= LPROCFS_NRS_WR_MAX_CMD) + GOTO(out, rc = -EINVAL); + + OBD_ALLOC(cmd, LPROCFS_NRS_WR_MAX_CMD); + if (cmd == NULL) + GOTO(out, rc = -ENOMEM); + /** + * strsep() modifies its argument, so keep a copy + */ + cmd_copy = cmd; + + if (copy_from_user(cmd, buffer, count)) + GOTO(out, rc = -EFAULT); + + cmd[count] = '\0'; + + policy_name = strsep(&cmd, " "); + + if (strlen(policy_name) > NRS_POL_NAME_MAX - 1) + GOTO(out, rc = -EINVAL); + + /** + * No [reg|hp] token has been specified + */ + if (cmd == NULL) + goto default_queue; + + queue_name = strsep(&cmd, " "); + /** + * The second token is either an optional [reg|hp] string, + * or arguments + */ + if (strcmp(queue_name, "reg") == 0) + queue = PTLRPC_NRS_QUEUE_REG; + else if (strcmp(queue_name, "hp") == 0) + queue = PTLRPC_NRS_QUEUE_HP; + else { + if (cmd != NULL) + *(cmd - 1) = ' '; + cmd = queue_name; + } + +default_queue: + + if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc)) + GOTO(out, rc = -ENODEV); + else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc)) + queue = PTLRPC_NRS_QUEUE_REG; + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + + rc = ptlrpc_nrs_policy_control(svc, queue, policy_name, + PTLRPC_NRS_CTL_START, + false, cmd); + + mutex_unlock(&nrs_core.nrs_mutex); +out: + if (cmd_copy) + OBD_FREE(cmd_copy, LPROCFS_NRS_WR_MAX_CMD); + + RETURN(rc < 0 ? rc : count); +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_policies); + +/** @} nrs */ + +struct ptlrpc_srh_iterator { + int srhi_idx; + __u64 srhi_seq; + struct ptlrpc_request *srhi_req; +}; + +static int +ptlrpc_lprocfs_svc_req_history_seek(struct ptlrpc_service_part *svcpt, + struct ptlrpc_srh_iterator *srhi, + __u64 seq) +{ + struct list_head *e; + struct ptlrpc_request *req; + + if (srhi->srhi_req != NULL && + srhi->srhi_seq > svcpt->scp_hist_seq_culled && + srhi->srhi_seq <= seq) { + /* If srhi_req was set previously, hasn't been culled and + * we're searching for a seq on or after it (i.e. more + * recent), search from it onwards. + * Since the service history is LRU (i.e. culled reqs will + * be near the head), we shouldn't have to do long re-scans. 
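+ * Otherwise (first call, a culled entry, or a seek to an earlier + * sequence) the scan restarts from the head of scp_hist_reqs.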
+ */ + LASSERTF(srhi->srhi_seq == srhi->srhi_req->rq_history_seq, + "%s:%d: seek seq %llu, request seq %llu\n", + svcpt->scp_service->srv_name, svcpt->scp_cpt, + srhi->srhi_seq, srhi->srhi_req->rq_history_seq); + LASSERTF(!list_empty(&svcpt->scp_hist_reqs), + "%s:%d: seek offset %llu, request seq %llu, " + "last culled %llu\n", + svcpt->scp_service->srv_name, svcpt->scp_cpt, + seq, srhi->srhi_seq, svcpt->scp_hist_seq_culled); + e = &srhi->srhi_req->rq_history_list; + } else { + /* search from start */ + e = svcpt->scp_hist_reqs.next; + } + + while (e != &svcpt->scp_hist_reqs) { + req = list_entry(e, struct ptlrpc_request, rq_history_list); + + if (req->rq_history_seq >= seq) { + srhi->srhi_seq = req->rq_history_seq; + srhi->srhi_req = req; + return 0; + } + e = e->next; + } + + return -ENOENT; +} + +/* + * ptlrpc history sequence is used as "position" of seq_file, in some case, + * seq_read() will increase "position" to indicate reading the next + * element, however, low bits of history sequence are reserved for CPT id + * (check the details from comments before ptlrpc_req_add_history), which + * means seq_read() might change CPT id of history sequence and never + * finish reading of requests on a CPT. To make it work, we have to shift + * CPT id to high bits and timestamp to low bits, so seq_read() will only + * increase timestamp which can correctly indicate the next position. + */ + +/* convert seq_file pos to cpt */ +#define PTLRPC_REQ_POS2CPT(svc, pos) \ + ((svc)->srv_cpt_bits == 0 ? 0 : \ + (__u64)(pos) >> (64 - (svc)->srv_cpt_bits)) + +/* make up seq_file pos from cpt */ +#define PTLRPC_REQ_CPT2POS(svc, cpt) \ + ((svc)->srv_cpt_bits == 0 ? 0 : \ + (cpt) << (64 - (svc)->srv_cpt_bits)) + +/* convert sequence to position */ +#define PTLRPC_REQ_SEQ2POS(svc, seq) \ + ((svc)->srv_cpt_bits == 0 ? (seq) : \ + ((seq) >> (svc)->srv_cpt_bits) | \ + ((seq) << (64 - (svc)->srv_cpt_bits))) + +/* convert position to sequence */ +#define PTLRPC_REQ_POS2SEQ(svc, pos) \ + ((svc)->srv_cpt_bits == 0 ? 
(pos) : \ + ((__u64)(pos) << (svc)->srv_cpt_bits) | \ + ((__u64)(pos) >> (64 - (svc)->srv_cpt_bits))) + +static void * +ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_srh_iterator *srhi; + unsigned int cpt; + int rc; + int i; + + if (sizeof(loff_t) != sizeof(__u64)) { /* can't support */ + CWARN("Failed to read request history because size of loff_t " + "%d can't match size of u64\n", (int)sizeof(loff_t)); + return NULL; + } + + OBD_ALLOC(srhi, sizeof(*srhi)); + if (srhi == NULL) + return NULL; + + srhi->srhi_seq = 0; + srhi->srhi_req = NULL; + + cpt = PTLRPC_REQ_POS2CPT(svc, *pos); + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (i < cpt) /* skip */ + continue; + if (i > cpt) /* make up the lowest position for this CPT */ + *pos = PTLRPC_REQ_CPT2POS(svc, i); + + mutex_lock(&svcpt->scp_mutex); + spin_lock(&svcpt->scp_lock); + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, + PTLRPC_REQ_POS2SEQ(svc, *pos)); + spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); + if (rc == 0) { + *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); + srhi->srhi_idx = i; + return srhi; + } + } + + OBD_FREE(srhi, sizeof(*srhi)); + return NULL; +} + +static void +ptlrpc_lprocfs_svc_req_history_stop(struct seq_file *s, void *iter) +{ + struct ptlrpc_srh_iterator *srhi = iter; + + if (srhi != NULL) + OBD_FREE(srhi, sizeof(*srhi)); +} + +static void * +ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s, + void *iter, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_service_part *svcpt; + __u64 seq; + int rc; + int i; + + for (i = srhi->srhi_idx; i < svc->srv_ncpts; i++) { + svcpt = svc->srv_parts[i]; + + if (i > srhi->srhi_idx) { /* reset iterator for a new CPT */ + srhi->srhi_req = NULL; + seq = srhi->srhi_seq = 0; + } else { /* the next sequence */ + seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits); + } + + mutex_lock(&svcpt->scp_mutex); + spin_lock(&svcpt->scp_lock); + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq); + spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); + if (rc == 0) { + *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); + srhi->srhi_idx = i; + return srhi; + } + } + + OBD_FREE(srhi, sizeof(*srhi)); + ++*pos; + return NULL; +} + +/* common ost/mdt so_req_printer */ +void target_print_req(void *seq_file, struct ptlrpc_request *req) +{ + /* Called holding srv_lock with irqs disabled. + * Print specific req contents and a newline. + * CAVEAT EMPTOR: check request message length before printing!!! + * You might have received any old crap so you must be just as + * careful here as the service's request parser!!! + */ + struct seq_file *sf = seq_file; + + switch (req->rq_phase) { + case RQ_PHASE_NEW: + /* still awaiting a service thread's attention, or rejected + * because the generic request message didn't unpack + */ + seq_printf(sf, "\n"); + break; + case RQ_PHASE_INTERPRET: + /* being handled, so basic msg swabbed, and opc is valid + * but racing with mds_handle(). fallthrough. 
+ */ + fallthrough; + case RQ_PHASE_COMPLETE: + /* been handled by mds_handle(), reply state may be volatile */ + seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg)); + break; + default: + DEBUG_REQ(D_ERROR, req, "bad phase %d", req->rq_phase); + } +} +EXPORT_SYMBOL(target_print_req); + +static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request *req; + int rc; + + LASSERT(srhi->srhi_idx < svc->srv_ncpts); + + svcpt = svc->srv_parts[srhi->srhi_idx]; + + mutex_lock(&svcpt->scp_mutex); + spin_lock(&svcpt->scp_lock); + + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq); + + if (rc == 0) { + struct timespec64 arrival, sent, arrivaldiff; + char nidstr[LNET_NIDSTR_SIZE]; + + req = srhi->srhi_req; + + arrival.tv_sec = req->rq_arrival_time.tv_sec; + arrival.tv_nsec = req->rq_arrival_time.tv_nsec; + sent.tv_sec = req->rq_sent; + sent.tv_nsec = 0; + arrivaldiff = timespec64_sub(sent, arrival); + + /* Print common req fields. + * CAVEAT EMPTOR: we're racing with the service handler + * here. The request could contain any old crap, so you + * must be just as careful as the service's request + * parser. Currently I only print stuff here I know is OK + * to look at coz it was set up in request_in_callback()!!! + */ + seq_printf(s, + "%lld:%s:%s:x%llu:%d:%s:%lld.%06lld:%lld.%06llds(%+lld.0s) ", + req->rq_history_seq, + req->rq_export && req->rq_export->exp_obd ? + req->rq_export->exp_obd->obd_name : + libcfs_nid2str_r(req->rq_self, nidstr, + sizeof(nidstr)), + libcfs_id2str(req->rq_peer), req->rq_xid, + req->rq_reqlen, ptlrpc_rqphase2str(req), + (s64)req->rq_arrival_time.tv_sec, + (s64)(req->rq_arrival_time.tv_nsec / NSEC_PER_USEC), + (s64)arrivaldiff.tv_sec, + (s64)(arrivaldiff.tv_nsec / NSEC_PER_USEC), + (s64)(req->rq_sent - req->rq_deadline)); + if (svc->srv_ops.so_req_printer == NULL) + seq_printf(s, "\n"); + else + svc->srv_ops.so_req_printer(s, srhi->srhi_req); + } + + spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); + + return rc; +} + +static int +ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) +{ + static const struct seq_operations sops = { + .start = ptlrpc_lprocfs_svc_req_history_start, + .stop = ptlrpc_lprocfs_svc_req_history_stop, + .next = ptlrpc_lprocfs_svc_req_history_next, + .show = ptlrpc_lprocfs_svc_req_history_show, + }; + struct seq_file *seqf; + int rc; + + rc = seq_open(file, &sops); + if (rc) + return rc; + + seqf = file->private_data; + seqf->private = inode->i_private; + return 0; +} + +/* See also lprocfs_rd_timeouts */ +static int ptlrpc_lprocfs_timeouts_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + time64_t worst_timestamp; + timeout_t cur_timeout; + timeout_t worst_timeout; + int i; + + if (AT_OFF) { + seq_printf(m, "adaptive timeouts off, using obd_timeout %u\n", + obd_timeout); + return 0; + } + + ptlrpc_service_for_each_part(svcpt, i, svc) { + cur_timeout = at_get(&svcpt->scp_at_estimate); + worst_timeout = svcpt->scp_at_estimate.at_worst_timeout_ever; + worst_timestamp = svcpt->scp_at_estimate.at_worst_timestamp; + + seq_printf(m, "%10s : cur %3u worst %3u (at %lld, %llds ago) ", + "service", cur_timeout, worst_timeout, + worst_timestamp, + ktime_get_real_seconds() - worst_timestamp); + + lprocfs_at_hist_helper(m, &svcpt->scp_at_estimate); + } + + 
return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts); + +static ssize_t high_priority_ratio_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + return sprintf(buf, "%d\n", svc->srv_hpreq_ratio); +} + +static ssize_t high_priority_ratio_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + int rc; + unsigned long val; + + rc = kstrtoul(buffer, 10, &val); + if (rc < 0) + return rc; + + spin_lock(&svc->srv_lock); + svc->srv_hpreq_ratio = val; + spin_unlock(&svc->srv_lock); + + return count; +} +LUSTRE_RW_ATTR(high_priority_ratio); + +static struct attribute *ptlrpc_svc_attrs[] = { + &lustre_attr_threads_min.attr, + &lustre_attr_threads_started.attr, + &lustre_attr_threads_max.attr, + &lustre_attr_high_priority_ratio.attr, + NULL, +}; + +KOBJ_ATTRIBUTE_GROUPS(ptlrpc_svc); /* creates ptlrpc_svc_groups */ + +static void ptlrpc_sysfs_svc_release(struct kobject *kobj) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + complete(&svc->srv_kobj_unregister); +} + +static struct kobj_type ptlrpc_svc_ktype = { + .default_groups = KOBJ_ATTR_GROUPS(ptlrpc_svc), + .sysfs_ops = &lustre_sysfs_ops, + .release = ptlrpc_sysfs_svc_release, +}; + +void ptlrpc_sysfs_unregister_service(struct ptlrpc_service *svc) +{ + /* Let's see if we had a chance at initialization first */ + if (svc->srv_kobj.kset) { + kobject_put(&svc->srv_kobj); + wait_for_completion(&svc->srv_kobj_unregister); + } +} + +int ptlrpc_sysfs_register_service(struct kset *parent, + struct ptlrpc_service *svc) +{ + svc->srv_kobj.kset = parent; + init_completion(&svc->srv_kobj_unregister); + return kobject_init_and_add(&svc->srv_kobj, &ptlrpc_svc_ktype, + &parent->kobj, "%s", svc->srv_name); +} + +void ptlrpc_ldebugfs_register_service(struct dentry *entry, + struct ptlrpc_service *svc) +{ + struct ldebugfs_vars ldebugfs_vars[] = { + { .name = "req_buffer_history_len", + .fops = &ptlrpc_lprocfs_req_buffer_history_len_fops, + .data = svc }, + { .name = "req_buffer_history_max", + .fops = &ptlrpc_lprocfs_req_buffer_history_max_fops, + .data = svc }, + { .name = "timeouts", + .fops = &ptlrpc_lprocfs_timeouts_fops, + .data = svc }, + { .name = "nrs_policies", + .fops = &ptlrpc_lprocfs_nrs_policies_fops, + .data = svc }, + { .name = "req_buffers_max", + .fops = &ptlrpc_lprocfs_req_buffers_max_fops, + .data = svc }, + { NULL } + }; + static const struct file_operations req_history_fops = { + .owner = THIS_MODULE, + .open = ptlrpc_lprocfs_svc_req_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lprocfs_seq_release, + }; + + ptlrpc_ldebugfs_register(entry, svc->srv_name, "stats", + &svc->srv_debugfs_entry, &svc->srv_stats); + if (!svc->srv_debugfs_entry) + return; + + ldebugfs_add_vars(svc->srv_debugfs_entry, ldebugfs_vars, NULL); + + debugfs_create_file("req_history", 0400, svc->srv_debugfs_entry, svc, + &req_history_fops); +} + +void ptlrpc_lprocfs_register_obd(struct obd_device *obd) +{ + ptlrpc_ldebugfs_register(obd->obd_debugfs_entry, NULL, "stats", + &obd->obd_svc_debugfs_entry, + &obd->obd_svc_stats); +} +EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd); + +void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount) +{ + struct lprocfs_stats *svc_stats; + __u32 op = lustre_msg_get_opc(req->rq_reqmsg); + int opc = opcode_offset(op); + + svc_stats 
= req->rq_import->imp_obd->obd_svc_stats; + if (svc_stats == NULL || opc <= 0) + return; + + LASSERT(opc < LUSTRE_MAX_OPCODES); + if (!(op == LDLM_ENQUEUE || op == MDS_REINT)) + lprocfs_counter_add(svc_stats, opc + EXTRA_MAX_OPCODES, amount); +} + +void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) +{ + struct lprocfs_stats *svc_stats; + int idx; + + if (!req->rq_import) + return; + + svc_stats = req->rq_import->imp_obd->obd_svc_stats; + if (!svc_stats) + return; + + idx = lustre_msg_get_opc(req->rq_reqmsg); + switch (idx) { + case OST_READ: + idx = BRW_READ_BYTES + PTLRPC_LAST_CNTR; + break; + case OST_WRITE: + idx = BRW_WRITE_BYTES + PTLRPC_LAST_CNTR; + break; + default: + LASSERTF(0, "unsupported opcode %u\n", idx); + break; + } + + lprocfs_counter_add(svc_stats, idx, bytes); +} + +EXPORT_SYMBOL(ptlrpc_lprocfs_brw); + +void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc) +{ + debugfs_remove_recursive(svc->srv_debugfs_entry); + + if (svc->srv_stats) + lprocfs_free_stats(&svc->srv_stats); +} + +void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) +{ + /* cleanup first to allow concurrent access to device's + * stats via debugfs to complete safely + */ + lprocfs_obd_cleanup(obd); + + debugfs_remove_recursive(obd->obd_svc_debugfs_entry); + + if (obd->obd_svc_stats) + lprocfs_free_stats(&obd->obd_svc_stats); +} +EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd); + +ssize_t ping_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + struct ptlrpc_request *req; + int rc; + + ENTRY; + with_imp_locked(obd, imp, rc) + req = ptlrpc_prep_ping(imp); + + if (rc) + RETURN(rc); + if (!req) + RETURN(-ENOMEM); + + req->rq_send_state = LUSTRE_IMP_FULL; + + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + + RETURN(rc); +} +EXPORT_SYMBOL(ping_show); + +/* kept for older versions of the tools. */ +ssize_t ping_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + int rc = ping_show(kobj, attr, (char *)buffer); + + return (rc < 0) ? rc : count; +} +EXPORT_SYMBOL(ping_store); + +/* Write the connection UUID to this file to attempt to connect to that node. + * The connection UUID is a node's primary NID. For example, + * "echo connection=192.168.0.1@tcp0::instance > .../import".
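+ * When the optional "::<instance>" suffix is given, it is compared with + * the import's current ocd_instance and a reconnect is only attempted if + * the two differ (i.e. the import still points at an obsoleted target); + * without the suffix the reconnect is unconditional.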
+ */ +ssize_t +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct obd_import *imp; + char *kbuf = NULL; + char *uuid; + char *ptr; + int do_reconn = 1; + const char prefix[] = "connection="; + const int prefix_len = sizeof(prefix) - 1; + int rc = 0; + + if (count > PAGE_SIZE - 1 || count <= prefix_len) + return -EINVAL; + + OBD_ALLOC(kbuf, count + 1); + if (kbuf == NULL) + return -ENOMEM; + + if (copy_from_user(kbuf, buffer, count)) + GOTO(out, rc = -EFAULT); + + kbuf[count] = 0; + + /* only support connection=uuid::instance now */ + if (strncmp(prefix, kbuf, prefix_len) != 0) + GOTO(out, rc = -EINVAL); + + with_imp_locked(obd, imp, rc) { + uuid = kbuf + prefix_len; + ptr = strstr(uuid, "::"); + if (ptr) { + u32 inst; + int rc; + + *ptr = 0; + do_reconn = 0; + ptr += 2; /* Skip :: */ + rc = kstrtouint(ptr, 10, &inst); + if (rc) { + CERROR("config: wrong instance # %s\n", ptr); + } else if (inst != imp->imp_connect_data.ocd_instance) { + CDEBUG(D_INFO, + "IR: %s is connecting to an obsoleted target(%u/%u), reconnecting...\n", + imp->imp_obd->obd_name, + imp->imp_connect_data.ocd_instance, + inst); + do_reconn = 1; + } else { + CDEBUG(D_INFO, + "IR: %s has already been connecting to " + "new target(%u)\n", + imp->imp_obd->obd_name, inst); + } + } + + if (do_reconn) + ptlrpc_recover_import(imp, uuid, 1); + } + +out: + OBD_FREE(kbuf, count + 1); + return rc ?: count; +} +EXPORT_SYMBOL(ldebugfs_import_seq_write); + +int lprocfs_pinger_recov_seq_show(struct seq_file *m, void *n) +{ + struct obd_device *obd = m->private; + struct obd_import *imp; + int rc; + + with_imp_locked(obd, imp, rc) + seq_printf(m, "%d\n", !imp->imp_no_pinger_recover); + + return rc; +} +EXPORT_SYMBOL(lprocfs_pinger_recov_seq_show); + +ssize_t +lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct obd_import *imp; + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc < 0) + return rc; + + with_imp_locked(obd, imp, rc) { + spin_lock(&imp->imp_lock); + imp->imp_no_pinger_recover = !val; + spin_unlock(&imp->imp_lock); + } + + return rc ?: count; +} +EXPORT_SYMBOL(lprocfs_pinger_recov_seq_write); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c new file mode 100644 index 0000000000000..cfd16ff3ab877 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c @@ -0,0 +1,1028 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" +#include /* for CFS_FAIL_PTLRPC_OST_BULK_CB2 */ + +/** + * Helper function. Sends \a len bytes from \a base at offset \a offset + * to portal \a portal of peer \a peer_id. + * Returns 0 on success or error code. + */ +static int ptl_send_buf(struct lnet_handle_md *mdh, void *base, int len, + enum lnet_ack_req ack, struct ptlrpc_cb_id *cbid, + lnet_nid_t self, struct lnet_process_id peer_id, + int portal, __u64 xid, unsigned int offset, + struct lnet_handle_md *bulk_cookie) +{ + int rc; + struct lnet_md md; + ENTRY; + + LASSERT (portal != 0); + CDEBUG (D_INFO, "peer_id %s\n", libcfs_id2str(peer_id)); + md.start = base; + md.length = len; + md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1; + md.options = PTLRPC_MD_OPTIONS; + md.user_ptr = cbid; + md.handler = ptlrpc_handler; + LNetInvalidateMDHandle(&md.bulk_handle); + + if (bulk_cookie) { + md.bulk_handle = *bulk_cookie; + md.options |= LNET_MD_BULK_HANDLE; + } + + if (unlikely(ack == LNET_ACK_REQ && + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE))){ + /* don't ask for the ack to simulate failing client */ + ack = LNET_NOACK_REQ; + } + + rc = LNetMDBind(&md, LNET_UNLINK, mdh); + if (unlikely(rc != 0)) { + CERROR ("LNetMDBind failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + RETURN (-ENOMEM); + } + + CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n", + len, portal, xid, offset); + + percpu_ref_get(&ptlrpc_pending); + + rc = LNetPut(self, *mdh, ack, + peer_id, portal, xid, offset, 0); + if (unlikely(rc != 0)) { + int rc2; + /* We're going to get an UNLINK event when I unlink below, + * which will complete just like any other failed send, so + * I fall through and return success here! */ + CERROR("LNetPut(%s, %d, %lld) failed: %d\n", + libcfs_id2str(peer_id), portal, xid, rc); + rc2 = LNetMDUnlink(*mdh); + LASSERTF(rc2 == 0, "rc2 = %d\n", rc2); + } + + RETURN (0); +} + +#define mdunlink_iterate_helper(mds, count) \ + __mdunlink_iterate_helper(mds, count, false) +static void __mdunlink_iterate_helper(struct lnet_handle_md *bd_mds, + int count, bool discard) +{ + int i; + + for (i = 0; i < count; i++) + __LNetMDUnlink(bd_mds[i], discard); +} + +#ifdef HAVE_SERVER_SUPPORT +/** + * Prepare bulk descriptor for specified incoming request \a req that + * can fit \a nfrags * pages. \a type is bulk type. \a portal is where + * the bulk is to be sent. Used on the server side after the request has + * been received. + * Returns a pointer to the newly allocated and initialized bulk descriptor, + * or NULL on error.
+ */ +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req, + unsigned nfrags, unsigned max_brw, + unsigned int type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops) +{ + struct obd_export *exp = req->rq_export; + struct ptlrpc_bulk_desc *desc; + + ENTRY; + LASSERT(ptlrpc_is_bulk_op_active(type)); + + desc = ptlrpc_new_bulk(nfrags, max_brw, type, portal, ops); + if (desc == NULL) + RETURN(NULL); + + desc->bd_export = class_export_get(exp); + desc->bd_req = req; + + desc->bd_cbid.cbid_fn = server_bulk_callback; + desc->bd_cbid.cbid_arg = desc; + + /* NB we don't assign rq_bulk here; server-side requests are + * re-used, and the handler frees the bulk desc explicitly. */ + + return desc; +} +EXPORT_SYMBOL(ptlrpc_prep_bulk_exp); + +/** + * Starts bulk transfer for descriptor \a desc on the server. + * Returns 0 on success or error code. + */ +int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) +{ + struct obd_export *exp = desc->bd_export; + lnet_nid_t self_nid; + struct lnet_process_id peer_id; + int rc = 0; + __u64 mbits; + int posted_md; + int total_md; + struct lnet_md md; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_PUT_NET)) + RETURN(0); + + /* NB no locking required until desc is on the network */ + LASSERT(ptlrpc_is_bulk_op_active(desc->bd_type)); + + LASSERT(desc->bd_cbid.cbid_fn == server_bulk_callback); + LASSERT(desc->bd_cbid.cbid_arg == desc); + + /* + * Multi-Rail: get the preferred self and peer NIDs from the + * request, so they are based on the route taken by the + * message. + */ + self_nid = desc->bd_req->rq_self; + peer_id = desc->bd_req->rq_source; + + /* NB total length may be 0 for a read past EOF, so we send 0 + * length bulks, since the client expects bulk events. + * + * The client may not need all of the bulk mbits for the RPC. The RPC + * used the mbits of the highest bulk mbits needed, and the server masks + * off high bits to get bulk count for this RPC. LU-1431 */ + mbits = desc->bd_req->rq_mbits & ~((__u64)desc->bd_md_max_brw - 1); + total_md = desc->bd_req->rq_mbits - mbits + 1; + desc->bd_refs = total_md; + desc->bd_failure = 0; + + md.user_ptr = &desc->bd_cbid; + md.handler = ptlrpc_handler; + md.threshold = 2; /* SENT and ACK/REPLY */ + + for (posted_md = 0; posted_md < total_md; mbits++) { + md.options = PTLRPC_MD_OPTIONS; + + /* NB it's assumed that source and sink buffer frags are + * page-aligned. 
Otherwise we'd have to send client bulk + * sizes over and split server buffer accordingly */ + ptlrpc_fill_bulk_md(&md, desc, posted_md); + rc = LNetMDBind(&md, LNET_UNLINK, &desc->bd_mds[posted_md]); + if (rc != 0) { + CERROR("%s: LNetMDBind failed for MD %u: rc = %d\n", + exp->exp_obd->obd_name, posted_md, rc); + LASSERT(rc == -ENOMEM); + if (posted_md == 0) { + desc->bd_md_count = 0; + RETURN(-ENOMEM); + } + break; + } + percpu_ref_get(&ptlrpc_pending); + + /* sanity.sh 224c: lets skip last md */ + if (posted_md == desc->bd_md_max_brw - 1) + OBD_FAIL_CHECK_RESET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB3, + CFS_FAIL_PTLRPC_OST_BULK_CB2); + + /* Network is about to get at the memory */ + if (ptlrpc_is_bulk_put_source(desc->bd_type)) + rc = LNetPut(self_nid, desc->bd_mds[posted_md], + LNET_ACK_REQ, peer_id, + desc->bd_portal, mbits, 0, 0); + else + rc = LNetGet(self_nid, desc->bd_mds[posted_md], + peer_id, desc->bd_portal, mbits, 0, false); + + posted_md++; + if (rc != 0) { + CERROR("%s: failed bulk transfer with %s:%u x%llu: " + "rc = %d\n", exp->exp_obd->obd_name, + libcfs_id2str(peer_id), desc->bd_portal, + mbits, rc); + break; + } + } + + if (rc != 0) { + /* Can't send, so we unlink the MD bound above. The UNLINK + * event this creates will signal completion with failure, + * so we return SUCCESS here! */ + spin_lock(&desc->bd_lock); + desc->bd_refs -= total_md - posted_md; + spin_unlock(&desc->bd_lock); + LASSERT(desc->bd_refs >= 0); + + mdunlink_iterate_helper(desc->bd_mds, posted_md); + RETURN(0); + } + + CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d " + "id %s mbits %#llx-%#llx\n", desc->bd_iov_count, + desc->bd_nob, desc->bd_portal, libcfs_id2str(peer_id), + mbits - posted_md, mbits - 1); + + RETURN(0); +} + +/** + * Server side bulk abort. Idempotent. Not thread-safe (i.e. only + * serialises with completion callback) + */ +void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc) +{ + LASSERT(!in_interrupt()); /* might sleep */ + + if (!ptlrpc_server_bulk_active(desc)) /* completed or */ + return; /* never started */ + + /* We used to poison the pages with 0xab here because we did not want to + * send any meaningful data over the wire for evicted clients (bug 9297) + * However, this is no longer safe now that we use the page cache on the + * OSS (bug 20560) */ + + /* The unlink ensures the callback happens ASAP and is the last + * one. If it fails, it must be because completion just happened, + * but we must still wait_event_idle_timeout() in this case, to give + * us a chance to run server_bulk_callback() + */ + __mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw, true); + + for (;;) { + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + int seconds = PTLRPC_REQ_LONG_UNLINK; + + while (seconds > 0 && + wait_event_idle_timeout(desc->bd_waitq, + !ptlrpc_server_bulk_active(desc), + cfs_time_seconds(1)) == 0) + seconds -= 1; + if (seconds > 0) + return; + + CWARN("Unexpectedly long timeout: desc %p\n", desc); + } +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Register bulk at the sender for later transfer. + * Returns 0 on success or error code. 
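+ * The request's rq_mbits holds the match bits of the final bulk MD, so + * the first MD uses rq_mbits - bd_md_count + 1 and one ME/MD pair is + * attached below for each of the descriptor's bd_md_count chunks.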
+ */ +int ptlrpc_register_bulk(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct lnet_processid peer; + int rc = 0; + int posted_md; + int total_md; + __u64 mbits; + struct lnet_me *me; + struct lnet_md md; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET)) + RETURN(0); + + /* NB no locking required until desc is on the network */ + LASSERT(desc->bd_nob > 0); + LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT); + LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); + LASSERT(desc->bd_req != NULL); + LASSERT(ptlrpc_is_bulk_op_passive(desc->bd_type)); + + /* cleanup the state of the bulk for it will be reused */ + if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY) + desc->bd_nob_transferred = 0; + else if (desc->bd_nob_transferred != 0) + /* If the network failed after an RPC was sent, this condition + * could happen. Rather than assert (was here before), return + * an EIO error. */ + RETURN(-EIO); + + desc->bd_failure = 0; + + peer = desc->bd_import->imp_connection->c_peer; + + LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback); + LASSERT(desc->bd_cbid.cbid_arg == desc); + + total_md = desc->bd_md_count; + /* rq_mbits is matchbits of the final bulk */ + mbits = req->rq_mbits - desc->bd_md_count + 1; + + LASSERTF(mbits == (req->rq_mbits & PTLRPC_BULK_OPS_MASK), + "first mbits = x%llu, last mbits = x%llu\n", + mbits, req->rq_mbits); + LASSERTF(!(desc->bd_registered && + req->rq_send_state != LUSTRE_IMP_REPLAY) || + mbits != desc->bd_last_mbits, + "registered: %d rq_mbits: %llu bd_last_mbits: %llu\n", + desc->bd_registered, mbits, desc->bd_last_mbits); + + desc->bd_registered = 1; + desc->bd_last_mbits = mbits; + desc->bd_refs = total_md; + md.user_ptr = &desc->bd_cbid; + md.handler = ptlrpc_handler; + md.threshold = 1; /* PUT or GET */ + + for (posted_md = 0; posted_md < desc->bd_md_count; + posted_md++, mbits++) { + md.options = PTLRPC_MD_OPTIONS | + (ptlrpc_is_bulk_op_get(desc->bd_type) ? + LNET_MD_OP_GET : LNET_MD_OP_PUT); + ptlrpc_fill_bulk_md(&md, desc, posted_md); + + if (posted_md > 0 && posted_md + 1 == desc->bd_md_count && + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_ATTACH)) { + rc = -ENOMEM; + } else { + me = LNetMEAttach(desc->bd_portal, &peer, mbits, 0, + LNET_UNLINK, LNET_INS_AFTER); + rc = PTR_ERR_OR_ZERO(me); + } + if (rc != 0) { + CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n", + desc->bd_import->imp_obd->obd_name, mbits, + posted_md, rc); + break; + } + percpu_ref_get(&ptlrpc_pending); + + /* About to let the network at it... 
*/ + rc = LNetMDAttach(me, &md, LNET_UNLINK, + &desc->bd_mds[posted_md]); + if (rc != 0) { + CERROR("%s: LNetMDAttach failed x%llu/%d: rc = %d\n", + desc->bd_import->imp_obd->obd_name, mbits, + posted_md, rc); + break; + } + } + + if (rc != 0) { + LASSERT(rc == -ENOMEM); + spin_lock(&desc->bd_lock); + desc->bd_refs -= total_md - posted_md; + spin_unlock(&desc->bd_lock); + LASSERT(desc->bd_refs >= 0); + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + req->rq_status = -ENOMEM; + desc->bd_registered = 0; + RETURN(-ENOMEM); + } + + spin_lock(&desc->bd_lock); + /* Holler if peer manages to touch buffers before he knows the mbits */ + if (desc->bd_refs != total_md) + CWARN("%s: Peer %s touched %d buffers while I registered\n", + desc->bd_import->imp_obd->obd_name, libcfs_idstr(&peer), + total_md - desc->bd_refs); + spin_unlock(&desc->bd_lock); + + CDEBUG(D_NET, + "Setup %u bulk %s buffers: %u pages %u bytes, mbits x%#llx-%#llx, portal %u\n", + desc->bd_refs, + ptlrpc_is_bulk_op_get(desc->bd_type) ? "get-source" : "put-sink", + desc->bd_iov_count, desc->bd_nob, + desc->bd_last_mbits, req->rq_mbits, desc->bd_portal); + + RETURN(0); +} + +/** + * Disconnect a bulk desc from the network. Idempotent. Not + * thread-safe (i.e. only interlocks with completion callback). + * Returns 1 on success or 0 if network unregistration failed for whatever + * reason. + */ +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + ENTRY; + + LASSERT(!in_interrupt()); /* might sleep */ + + if (desc) + desc->bd_registered = 0; + + /* Let's setup deadline for reply unlink. */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && + async && req->rq_bulk_deadline == 0 && cfs_fail_val == 0) + req->rq_bulk_deadline = ktime_get_real_seconds() + + PTLRPC_REQ_LONG_UNLINK; + + if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ + RETURN(1); /* never registered */ + + LASSERT(desc->bd_req == req); /* bd_req NULL until registered */ + + /* the unlink ensures the callback happens ASAP and is the last + * one. If it fails, it must be because completion just happened, + * but we must still wait_event_idle_timeout() in this case to give + * us a chance to run client_bulk_callback() + */ + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + + if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ + RETURN(1); /* never registered */ + + /* Move to "Unregistering" phase as bulk was not unlinked yet. */ + ptlrpc_rqphase_move(req, RQ_PHASE_UNREG_BULK); + + /* Do not wait for unlink to finish. */ + if (async) + RETURN(0); + + for (;;) { + /* The wq argument is ignored by user-space wait_event macros */ + wait_queue_head_t *wq = (req->rq_set != NULL) ? + &req->rq_set->set_waitq : + &req->rq_reply_waitq; + /* + * Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs. 
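+ * The wait below polls in one-second steps for up to + * PTLRPC_REQ_LONG_UNLINK seconds before warning and retrying.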
+ */ + int seconds = PTLRPC_REQ_LONG_UNLINK; + + while (seconds > 0 && + wait_event_idle_timeout(*wq, + !ptlrpc_client_bulk_active(req), + cfs_time_seconds(1)) == 0) + seconds -= 1; + if (seconds > 0) { + ptlrpc_rqphase_move(req, req->rq_next_phase); + RETURN(1); + } + + DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p", + desc); + } + RETURN(0); +} + +static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + timeout_t service_timeout; + + service_timeout = clamp_t(timeout_t, ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec, 1, + (AT_OFF ? obd_timeout * 3 / 2 : at_max)); + if (!(flags & PTLRPC_REPLY_EARLY) && + (req->rq_type != PTL_RPC_MSG_ERR) && + (req->rq_reqmsg != NULL) && + !(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY | + MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { + /* early replies, errors and recovery requests don't count + * toward our service time estimate + */ + timeout_t oldse = at_measured(&svcpt->scp_at_estimate, + service_timeout); + + if (oldse != 0) { + DEBUG_REQ(D_ADAPTTO, req, + "svc %s changed estimate from %d to %d", + svc->srv_name, oldse, + at_get(&svcpt->scp_at_estimate)); + } + } + /* Report actual service time for client latency calc */ + lustre_msg_set_service_timeout(req->rq_repmsg, service_timeout); + /* Report service time estimate for future client reqs, but report 0 + * (to be ignored by client) if it's an error reply during recovery. + * b=15815 + */ + if (req->rq_type == PTL_RPC_MSG_ERR && + (req->rq_export == NULL || + req->rq_export->exp_obd->obd_recovering)) { + lustre_msg_set_timeout(req->rq_repmsg, 0); + } else { + timeout_t timeout; + + if (req->rq_export && req->rq_reqmsg != NULL && + (flags & PTLRPC_REPLY_EARLY) && + lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) { + struct obd_device *exp_obd = req->rq_export->exp_obd; + + timeout = ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec + + min_t(timeout_t, at_extra, + exp_obd->obd_recovery_timeout / 4); + } else { + timeout = at_get(&svcpt->scp_at_estimate); + } + lustre_msg_set_timeout(req->rq_repmsg, timeout); + } + + if (req->rq_reqmsg && + !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x " + "req_flags=%#x magic=%x/%x len=%d\n", + flags, lustre_msg_get_flags(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_repmsg), req->rq_replen); + } +} + +/** + * Send request reply from request \a req reply buffer. + * \a flags defines reply types + * Returns 0 on success or error code + */ +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_connection *conn; + int rc; + + /* We must already have a reply buffer (only ptlrpc_error() may be + * called without one). The reply generated by sptlrpc layer (e.g. + * error notify, etc.) might have NULL rq->reqmsg; Otherwise we must + * have a request buffer which is either the actual (swabbed) incoming + * request, or a saved copy if this is a req saved in + * target_queue_final_reply(). 
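+ * An LNet ACK is requested only for difficult replies that have not + * disabled it (rs_difficult && !rs_no_ack); all other replies are sent + * with LNET_NOACK_REQ.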
+ */ + LASSERT (req->rq_no_reply == 0); + LASSERT (req->rq_reqbuf != NULL); + LASSERT (rs != NULL); + LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult); + LASSERT (req->rq_repmsg != NULL); + LASSERT (req->rq_repmsg == rs->rs_msg); + LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback); + LASSERT (rs->rs_cb_id.cbid_arg == rs); + + /* There may be no rq_export during failover */ + + if (unlikely(req->rq_export && req->rq_export->exp_obd && + req->rq_export->exp_obd->obd_fail)) { + /* Failed obd's only send ENODEV */ + req->rq_type = PTL_RPC_MSG_ERR; + req->rq_status = -ENODEV; + CDEBUG(D_HA, "sending ENODEV from failed obd %d\n", + req->rq_export->exp_obd->obd_minor); + } + + if (req->rq_type != PTL_RPC_MSG_ERR) + req->rq_type = PTL_RPC_MSG_REPLY; + + lustre_msg_set_type(req->rq_repmsg, req->rq_type); + lustre_msg_set_status(req->rq_repmsg, + ptlrpc_status_hton(req->rq_status)); + lustre_msg_set_opc(req->rq_repmsg, + req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0); + + target_pack_pool_reply(req); + + ptlrpc_at_set_reply(req, flags); + + if (req->rq_export == NULL || req->rq_export->exp_connection == NULL) + conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL); + else + conn = ptlrpc_connection_addref(req->rq_export->exp_connection); + + if (unlikely(conn == NULL)) { + CERROR("not replying on NULL connection\n"); /* bug 9635 */ + return -ENOTCONN; + } + ptlrpc_rs_addref(rs); /* +1 ref for the network */ + + rc = sptlrpc_svc_wrap_reply(req); + if (unlikely(rc)) + goto out; + + req->rq_sent = ktime_get_real_seconds(); + + rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, + (rs->rs_difficult && !rs->rs_no_ack) ? + LNET_ACK_REQ : LNET_NOACK_REQ, + &rs->rs_cb_id, req->rq_self, req->rq_source, + ptlrpc_req2svc(req)->srv_rep_portal, + req->rq_rep_mbits ? req->rq_rep_mbits : req->rq_xid, + req->rq_reply_off, NULL); +out: + if (unlikely(rc != 0)) + ptlrpc_req_drop_rs(req); + ptlrpc_connection_put(conn); + return rc; +} + +int ptlrpc_reply (struct ptlrpc_request *req) +{ + if (req->rq_no_reply) + return 0; + else + return (ptlrpc_send_reply(req, 0)); +} + +/** + * For request \a req send an error reply back. Create empty + * reply buffers if necessary. + */ +int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult) +{ + int rc; + ENTRY; + + if (req->rq_no_reply) + RETURN(0); + + if (!req->rq_repmsg) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + RETURN(rc); + } + + if (req->rq_status != -ENOSPC && req->rq_status != -EACCES && + req->rq_status != -EPERM && req->rq_status != -ENOENT && + req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT) + req->rq_type = PTL_RPC_MSG_ERR; + + rc = ptlrpc_send_reply(req, may_be_difficult); + RETURN(rc); +} + +int ptlrpc_error(struct ptlrpc_request *req) +{ + return ptlrpc_send_error(req, 0); +} + +/** + * Send request \a request. + * if \a noreply is set, don't expect any reply back and don't set up + * reply buffers. + * Returns 0 on success or error code. 
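ptlrpc_send_error() above only escalates a reply to PTL_RPC_MSG_ERR when the status is not one of the errnos a client is expected to treat as an ordinary reply. A stand-alone restatement of that check; the helper name status_is_ordinary_reply() is illustrative, not part of the Lustre code:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Mirrors the whitelist in ptlrpc_send_error(): these statuses go back
 * as ordinary replies instead of PTL_RPC_MSG_ERR messages. */
static bool status_is_ordinary_reply(int status)
{
	switch (status) {
	case -ENOSPC:
	case -EACCES:
	case -EPERM:
	case -ENOENT:
	case -EINPROGRESS:
	case -EDQUOT:
		return true;
	default:
		return false;
	}
}

int main(void)
{
	printf("-ENOENT ordinary? %d\n", status_is_ordinary_reply(-ENOENT));
	printf("-EIO    ordinary? %d\n", status_is_ordinary_reply(-EIO));
	return 0;
}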
+ */ +int ptl_send_rpc(struct ptlrpc_request *request, int noreply) +{ + int rc; + __u32 opc; + int mpflag = 0; + bool rep_mbits = false; + struct lnet_handle_md bulk_cookie; + struct lnet_processid peer; + struct ptlrpc_connection *connection; + struct lnet_me *reply_me = NULL; + struct lnet_md reply_md; + struct obd_import *imp = request->rq_import; + struct obd_device *obd = imp->imp_obd; + ENTRY; + + LNetInvalidateMDHandle(&bulk_cookie); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC)) + RETURN(0); + + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DELAY_RECOV) && + lustre_msg_get_opc(request->rq_reqmsg) == MDS_CONNECT && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) == 0)) { + RETURN(0); + } + + LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST); + LASSERT(request->rq_wait_ctx == 0); + + /* If this is a re-transmit, we're required to have disengaged + * cleanly from the previous attempt */ + LASSERT(!request->rq_receiving_reply); + LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) && + (imp->imp_state == LUSTRE_IMP_FULL))); + + if (unlikely(obd != NULL && obd->obd_fail)) { + CDEBUG(D_HA, "muting rpc for failed imp obd %s\n", + obd->obd_name); + /* this prevents us from waiting in ptlrpc_queue_wait */ + spin_lock(&request->rq_lock); + request->rq_err = 1; + spin_unlock(&request->rq_lock); + request->rq_status = -ENODEV; + RETURN(-ENODEV); + } + + connection = imp->imp_connection; + + lustre_msg_set_handle(request->rq_reqmsg, + &imp->imp_remote_handle); + lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST); + lustre_msg_set_conn_cnt(request->rq_reqmsg, + imp->imp_conn_cnt); + lustre_msghdr_set_flags(request->rq_reqmsg, + imp->imp_msghdr_flags); + + /* If it's the first time to resend the request for EINPROGRESS, + * we need to allocate a new XID (see after_reply()), it's different + * from the resend for reply timeout. 
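The comment above draws the distinction between the two resend flavours: an -EINPROGRESS resend is given a brand-new XID (plus the lowest known-replied XID for the server's reply-reconstruction bookkeeping), while a reply-timeout resend keeps its XID and is merely flagged MSG_RESENT. A rough sketch of the two paths; fake_import/fake_req and their fields are invented stand-ins, not the Lustre structures:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fake_import {
	uint64_t next_xid;            /* next XID to hand out */
	uint64_t known_replied_xid;   /* highest XID known to be replied */
};

struct fake_req {
	uint64_t xid;
	uint64_t last_xid;            /* stands in for lustre_msg_set_last_xid() */
	bool     resent_flag;         /* stands in for MSG_RESENT */
};

/* -EINPROGRESS resend: a fresh XID so the old reply cannot be matched,
 * plus the lowest known-replied XID for the server. */
static void resend_einprogress(struct fake_import *imp, struct fake_req *req)
{
	req->xid = imp->next_xid++;
	req->last_xid = imp->known_replied_xid;
}

/* Reply-timeout resend: keep the XID, just mark the message as resent. */
static void resend_timeout(struct fake_req *req)
{
	req->resent_flag = true;
}

int main(void)
{
	struct fake_import imp = { .next_xid = 101, .known_replied_xid = 99 };
	struct fake_req req = { .xid = 100 };

	resend_timeout(&req);
	printf("timeout resend:     xid=%llu resent=%d\n",
	       (unsigned long long)req.xid, req.resent_flag);

	resend_einprogress(&imp, &req);
	printf("einprogress resend: xid=%llu last_xid=%llu\n",
	       (unsigned long long)req.xid, (unsigned long long)req.last_xid);
	return 0;
}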
*/ + if (request->rq_nr_resend != 0 && + list_empty(&request->rq_unreplied_list)) { + __u64 min_xid = 0; + /* resend for EINPROGRESS, allocate new xid to avoid reply + * reconstruction */ + spin_lock(&imp->imp_lock); + ptlrpc_assign_next_xid_nolock(request); + min_xid = ptlrpc_known_replied_xid(imp); + spin_unlock(&imp->imp_lock); + + lustre_msg_set_last_xid(request->rq_reqmsg, min_xid); + DEBUG_REQ(D_RPCTRACE, request, + "Allocating new XID for resend on EINPROGRESS"); + } + + opc = lustre_msg_get_opc(request->rq_reqmsg); + if (opc != OST_CONNECT && opc != MDS_CONNECT && + opc != MGS_CONNECT && OCD_HAS_FLAG(&imp->imp_connect_data, FLAGS2)) + rep_mbits = imp->imp_connect_data.ocd_connect_flags2 & + OBD_CONNECT2_REP_MBITS; + + if ((request->rq_bulk != NULL) || rep_mbits) { + ptlrpc_set_mbits(request); + lustre_msg_set_mbits(request->rq_reqmsg, request->rq_mbits); + } + + if (list_empty(&request->rq_unreplied_list) || + request->rq_xid <= imp->imp_known_replied_xid) { + DEBUG_REQ(D_ERROR, request, + "xid=%llu, replied=%llu, list_empty=%d", + request->rq_xid, imp->imp_known_replied_xid, + list_empty(&request->rq_unreplied_list)); + LBUG(); + } + + /** For enabled AT all request should have AT_SUPPORT in the + * FULL import state when OBD_CONNECT_AT is set */ + LASSERT(AT_OFF || imp->imp_state != LUSTRE_IMP_FULL || + (imp->imp_msghdr_flags & MSGHDR_AT_SUPPORT) || + !(imp->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_AT)); + + if (request->rq_resend) { + lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); + if (request->rq_resend_cb != NULL) + request->rq_resend_cb(request, &request->rq_async_args); + } + if (request->rq_memalloc) + mpflag = memalloc_noreclaim_save(); + + rc = sptlrpc_cli_wrap_request(request); + if (rc) + GOTO(out, rc); + + /* bulk register should be done after wrap_request() */ + if (request->rq_bulk != NULL) { + rc = ptlrpc_register_bulk (request); + if (rc != 0) + GOTO(cleanup_bulk, rc); + /* + * All the mds in the request will have the same cpt + * encoded in the cookie. So we can just get the first + * one. + */ + bulk_cookie = request->rq_bulk->bd_mds[0]; + } + + if (!noreply) { + LASSERT (request->rq_replen != 0); + if (request->rq_repbuf == NULL) { + LASSERT(request->rq_repdata == NULL); + LASSERT(request->rq_repmsg == NULL); + rc = sptlrpc_cli_alloc_repbuf(request, + request->rq_replen); + if (rc) { + /* this prevents us from looping in + * ptlrpc_queue_wait */ + spin_lock(&request->rq_lock); + request->rq_err = 1; + spin_unlock(&request->rq_lock); + request->rq_status = rc; + GOTO(cleanup_bulk, rc); + } + } else { + request->rq_repdata = NULL; + request->rq_repmsg = NULL; + } + + peer = connection->c_peer; + if (request->rq_bulk && + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_REPLY_ATTACH)) { + reply_me = ERR_PTR(-ENOMEM); + } else { + reply_me = LNetMEAttach(request->rq_reply_portal, + &peer, + rep_mbits ? request->rq_mbits : + request->rq_xid, + 0, LNET_UNLINK, LNET_INS_AFTER); + } + + if (IS_ERR(reply_me)) { + rc = PTR_ERR(reply_me); + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + GOTO(cleanup_bulk, rc = -ENOMEM); + } + } + + spin_lock(&request->rq_lock); + /* We are responsible for unlinking the reply buffer */ + request->rq_reply_unlinked = noreply; + request->rq_receiving_reply = !noreply; + /* Clear any flags that may be present from previous sends. 
*/ + request->rq_req_unlinked = 0; + request->rq_replied = 0; + request->rq_err = 0; + request->rq_timedout = 0; + request->rq_net_err = 0; + request->rq_resend = 0; + request->rq_restart = 0; + request->rq_reply_truncated = 0; + spin_unlock(&request->rq_lock); + + if (!noreply) { + reply_md.start = request->rq_repbuf; + reply_md.length = request->rq_repbuf_len; + /* Allow multiple early replies */ + reply_md.threshold = LNET_MD_THRESH_INF; + /* Manage remote for early replies */ + reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | + LNET_MD_MANAGE_REMOTE | + LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */; + reply_md.user_ptr = &request->rq_reply_cbid; + reply_md.handler = ptlrpc_handler; + + /* We must see the unlink callback to set rq_reply_unlinked, + * so we can't auto-unlink */ + rc = LNetMDAttach(reply_me, &reply_md, LNET_RETAIN, + &request->rq_reply_md_h); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + spin_lock(&request->rq_lock); + /* ...but the MD attach didn't succeed... */ + request->rq_receiving_reply = 0; + spin_unlock(&request->rq_lock); + GOTO(cleanup_bulk, rc = -ENOMEM); + } + percpu_ref_get(&ptlrpc_pending); + + CDEBUG(D_NET, + "Setup reply buffer: %u bytes, xid %llu, portal %u\n", + request->rq_repbuf_len, request->rq_xid, + request->rq_reply_portal); + } + + /* add references on request for request_out_callback */ + ptlrpc_request_addref(request); + if (obd != NULL && obd->obd_svc_stats != NULL) + lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR, + atomic_read(&imp->imp_inflight)); + + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); + + request->rq_sent_ns = ktime_get_real(); + request->rq_sent = ktime_get_real_seconds(); + /* We give the server rq_timeout secs to process the req, and + * add the network latency for our local timeout. + */ + request->rq_deadline = request->rq_sent + request->rq_timeout + + ptlrpc_at_get_net_latency(request); + + DEBUG_REQ(D_INFO, request, "send flags=%x", + lustre_msg_get_flags(request->rq_reqmsg)); + rc = ptl_send_buf(&request->rq_req_md_h, + request->rq_reqbuf, request->rq_reqdata_len, + LNET_NOACK_REQ, &request->rq_req_cbid, + LNET_NID_ANY, + lnet_pid_to_pid4(&connection->c_peer), + request->rq_request_portal, + request->rq_xid, 0, &bulk_cookie); + if (likely(rc == 0)) + GOTO(out, rc); + + request->rq_req_unlinked = 1; + ptlrpc_req_finished(request); + if (noreply) + GOTO(out, rc); + + LNetMDUnlink(request->rq_reply_md_h); + + /* UNLINKED callback called synchronously */ + LASSERT(!request->rq_receiving_reply); + + cleanup_bulk: + /* We do sync unlink here as there was no real transfer here so + * the chance to have long unlink to sluggish net is smaller here. */ + ptlrpc_unregister_bulk(request, 0); + out: + if (rc == -ENOMEM) { + /* set rq_sent so that this request is treated + * as a delayed send in the upper layers */ + request->rq_sent = ktime_get_real_seconds(); + } + + if (request->rq_memalloc) + memalloc_noreclaim_restore(mpflag); + + return rc; +} +EXPORT_SYMBOL(ptl_send_rpc); + +/** + * Register request buffer descriptor for request receiving. 
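ptl_send_rpc() above derives the client-side deadline from the send time plus the server's processing budget plus the measured network latency (rq_deadline = rq_sent + rq_timeout + net latency). A trivial seconds-based restatement of that arithmetic, with illustrative names and values:

#include <stdio.h>
#include <time.h>

/* deadline = send time + server processing budget + network allowance,
 * mirroring: rq_deadline = rq_sent + rq_timeout + at_get_net_latency(). */
static time_t rpc_deadline(time_t sent, int server_timeout, int net_latency)
{
	return sent + server_timeout + net_latency;
}

int main(void)
{
	time_t sent = time(NULL);
	int timeout = 30;	/* seconds the server may take (rq_timeout) */
	int latency = 5;	/* adaptive network latency estimate */

	printf("sent=%ld deadline=%ld\n",
	       (long)sent, (long)rpc_deadline(sent, timeout, latency));
	return 0;
}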
+ */ +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service; + static struct lnet_processid match_id = { + .nid = LNET_ANY_NID, + .pid = LNET_PID_ANY + }; + int rc; + struct lnet_md md; + struct lnet_me *me; + + CDEBUG(D_NET, "%s: registering portal %d\n", service->srv_name, + service->srv_req_portal); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD)) + return -ENOMEM; + + /* NB: CPT affinity service should use new LNet flag LNET_INS_LOCAL, + * which means buffer can only be attached on local CPT, and LND + * threads can find it by grabbing a local lock */ + me = LNetMEAttach(service->srv_req_portal, + &match_id, 0, ~0, LNET_UNLINK, + rqbd->rqbd_svcpt->scp_cpt >= 0 ? + LNET_INS_LOCAL : LNET_INS_AFTER); + if (IS_ERR(me)) { + CERROR("%s: LNetMEAttach failed: rc = %ld\n", + service->srv_name, PTR_ERR(me)); + return PTR_ERR(me); + } + + LASSERT(rqbd->rqbd_refcount == 0); + rqbd->rqbd_refcount = 1; + + md.start = rqbd->rqbd_buffer; + md.length = service->srv_buf_size; + md.max_size = service->srv_max_req_size; + md.threshold = LNET_MD_THRESH_INF; + md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE; + md.user_ptr = &rqbd->rqbd_cbid; + md.handler = ptlrpc_handler; + + rc = LNetMDAttach(me, &md, LNET_UNLINK, &rqbd->rqbd_md_h); + if (rc == 0) { + percpu_ref_get(&ptlrpc_pending); + return 0; + } + + CERROR("%s: LNetMDAttach failed: rc = %d\n", service->srv_name, rc); + LASSERT(rc == -ENOMEM); + rqbd->rqbd_refcount = 0; + + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h new file mode 100644 index 0000000000000..41c0bb05c5f84 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h @@ -0,0 +1,204 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, Trustees of Indiana University + * + * Copyright (c) 2013, 2017, Intel Corporation. 
+ * + * Author: Joshua Walgenbach + */ + +#ifndef _NODEMAP_INTERNAL_H +#define _NODEMAP_INTERNAL_H + +#include +#include + +#define DEFAULT_NODEMAP "default" + +/* Default nobody uid, gid and projid values */ +#define NODEMAP_NOBODY_UID 99 +#define NODEMAP_NOBODY_GID 99 +#define NODEMAP_NOBODY_PROJID 99 + +struct lprocfs_static_vars; + +/* nodemap root proc directory under fs/lustre */ +extern struct proc_dir_entry *proc_lustre_nodemap_root; +/* flag if nodemap is active */ +extern bool nodemap_active; + +extern struct mutex active_config_lock; +extern struct nodemap_config *active_config; + +struct lu_nid_range { + /* unique id set by mgs */ + unsigned int rn_id; + /* lu_nodemap containing this range */ + struct lu_nodemap *rn_nodemap; + /* list for nodemap */ + struct list_head rn_list; + /* nid interval tree */ + lnet_nid_t rn_start, + rn_end, + rn_subtree_last; + struct rb_node rn_rb; +}; + +struct lu_idmap { + /* uid/gid of client */ + __u32 id_client; + /* uid/gid on filesystem */ + __u32 id_fs; + /* tree mapping client ids to filesystem ids */ + struct rb_node id_client_to_fs; + /* tree mappung filesystem to client */ + struct rb_node id_fs_to_client; +}; + +/* first 4 bits of the nodemap_id is the index type */ +struct nodemap_key { + __u32 nk_nodemap_id; + union { + __u32 nk_range_id; + __u32 nk_id_client; + __u32 nk_unused; + }; +}; + +enum nodemap_idx_type { + NODEMAP_EMPTY_IDX = 0, /* index created with blank record */ + NODEMAP_CLUSTER_IDX = 1, /* a nodemap cluster of nodes */ + NODEMAP_RANGE_IDX = 2, /* nid range assigned to a nm cluster */ + NODEMAP_UIDMAP_IDX = 3, /* uid map assigned to a nm cluster */ + NODEMAP_GIDMAP_IDX = 4, /* gid map assigned to a nm cluster */ + NODEMAP_PROJIDMAP_IDX = 5, /* projid map assigned to nm cluster */ + NODEMAP_GLOBAL_IDX = 15, /* stores nodemap activation status */ +}; + +#define NM_TYPE_MASK 0x0FFFFFFF +#define NM_TYPE_SHIFT 28 + +static inline enum nodemap_idx_type nm_idx_get_type(unsigned int id) +{ + return id >> NM_TYPE_SHIFT; +} + +static inline __u32 nm_idx_set_type(unsigned int id, enum nodemap_idx_type t) +{ + return (id & NM_TYPE_MASK) | (t << NM_TYPE_SHIFT); +} + +void nodemap_config_set_active(struct nodemap_config *config); +struct lu_nodemap *nodemap_create(const char *name, + struct nodemap_config *config, + bool is_default); +void nodemap_putref(struct lu_nodemap *nodemap); +struct lu_nodemap *nodemap_lookup(const char *name); + +int nodemap_procfs_init(void); +void nodemap_procfs_exit(void); +int lprocfs_nodemap_register(struct lu_nodemap *nodemap, + bool is_default_nodemap); +void lprocfs_nodemap_remove(struct nodemap_pde *nodemap_pde); +struct lu_nid_range *nodemap_range_find(lnet_nid_t start_nid, + lnet_nid_t end_nid); +struct lu_nid_range *range_create(struct nodemap_range_tree *nm_range_tree, + lnet_nid_t start_nid, lnet_nid_t end_nid, + struct lu_nodemap *nodemap, + unsigned int range_id); +void range_destroy(struct lu_nid_range *range); +int range_insert(struct nodemap_range_tree *nm_range_tree, + struct lu_nid_range *data); +void range_delete(struct nodemap_range_tree *nm_range_tree, + struct lu_nid_range *data); +struct lu_nid_range *range_search(struct nodemap_range_tree *nm_range_tree, + lnet_nid_t nid); +struct lu_nid_range *range_find(struct nodemap_range_tree *nm_range_tree, + lnet_nid_t start_nid, lnet_nid_t end_nid); +int range_parse_nidstring(char *range_string, lnet_nid_t *start_nid, + lnet_nid_t *end_nid); +void range_init_tree(void); +struct lu_idmap *idmap_create(__u32 client_id, __u32 fs_id); +struct 
lu_idmap *idmap_insert(enum nodemap_id_type id_type, + struct lu_idmap *idmap, + struct lu_nodemap *nodemap); +void idmap_delete(enum nodemap_id_type id_type, struct lu_idmap *idmap, + struct lu_nodemap *nodemap); +void idmap_delete_tree(struct lu_nodemap *nodemap); +struct lu_idmap *idmap_search(struct lu_nodemap *nodemap, + enum nodemap_tree_type, + enum nodemap_id_type id_type, + __u32 id); +int nm_member_add(struct lu_nodemap *nodemap, struct obd_export *exp); +void nm_member_del(struct lu_nodemap *nodemap, struct obd_export *exp); +void nm_member_delete_list(struct lu_nodemap *nodemap); +struct lu_nodemap *nodemap_classify_nid(lnet_nid_t nid); +void nm_member_reclassify_nodemap(struct lu_nodemap *nodemap); +void nm_member_revoke_locks(struct lu_nodemap *nodemap); +void nm_member_revoke_locks_always(struct lu_nodemap *nodemap); +void nm_member_revoke_all(void); + +int nodemap_add_idmap_helper(struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_add_range_helper(struct nodemap_config *config, + struct lu_nodemap *nodemap, + const lnet_nid_t nid[2], + unsigned int range_id); + +struct rb_node *nm_rb_next_postorder(const struct rb_node *node); +struct rb_node *nm_rb_first_postorder(const struct rb_root *root); +void nodemap_getref(struct lu_nodemap *nodemap); +void nodemap_putref(struct lu_nodemap *nodemap); +int nm_hash_list_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, + void *nodemap_list_head); + +#define nm_rbtree_postorder_for_each_entry_safe(pos, n, \ + root, field) \ + for (pos = nm_rb_first_postorder(root) ? \ + rb_entry(nm_rb_first_postorder(root), typeof(*pos), \ + field) : NULL, \ + n = (pos && nm_rb_next_postorder(&pos->field)) ? \ + rb_entry(nm_rb_next_postorder(&pos->field), \ + typeof(*pos), field) : NULL; \ + pos != NULL; \ + pos = n, \ + n = (pos && nm_rb_next_postorder(&pos->field)) ? \ + rb_entry(nm_rb_next_postorder(&pos->field), \ + typeof(*pos), field) : NULL) + +int nodemap_idx_nodemap_add(const struct lu_nodemap *nodemap); +int nodemap_idx_nodemap_update(const struct lu_nodemap *nodemap); +int nodemap_idx_nodemap_del(const struct lu_nodemap *nodemap); +int nodemap_idx_idmap_add(const struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_idx_idmap_del(const struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_idx_range_add(const struct lu_nid_range *range, + const lnet_nid_t nid[2]); +int nodemap_idx_range_del(const struct lu_nid_range *range); +int nodemap_idx_nodemap_activate(bool value); +#endif /* _NODEMAP_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c new file mode 100644 index 0000000000000..ccfc4e96c813c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c @@ -0,0 +1,1786 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. 
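The nodemap_internal.h helpers above pack the index type into the top four bits of the 32-bit nodemap id and keep the remaining 28 bits for the id proper. A self-contained demonstration of that encoding, with the constants and enum values copied from the header and the rest reduced to a user-space main():

#include <stdint.h>
#include <stdio.h>

#define NM_TYPE_MASK	0x0FFFFFFF
#define NM_TYPE_SHIFT	28

enum nodemap_idx_type {
	NODEMAP_EMPTY_IDX	= 0,
	NODEMAP_CLUSTER_IDX	= 1,
	NODEMAP_RANGE_IDX	= 2,
	NODEMAP_UIDMAP_IDX	= 3,
	NODEMAP_GIDMAP_IDX	= 4,
	NODEMAP_PROJIDMAP_IDX	= 5,
	NODEMAP_GLOBAL_IDX	= 15,
};

/* Same arithmetic as nm_idx_get_type()/nm_idx_set_type() in the header. */
static enum nodemap_idx_type idx_get_type(uint32_t id)
{
	return id >> NM_TYPE_SHIFT;
}

static uint32_t idx_set_type(uint32_t id, enum nodemap_idx_type t)
{
	return (id & NM_TYPE_MASK) | ((uint32_t)t << NM_TYPE_SHIFT);
}

int main(void)
{
	uint32_t key = idx_set_type(42, NODEMAP_RANGE_IDX);

	printf("key=0x%08x type=%u id=%u\n", (unsigned)key,
	       (unsigned)idx_get_type(key), (unsigned)(key & NM_TYPE_MASK));
	return 0;
}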
+ + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2016, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs.c + * + * Network Request Scheduler (NRS) + * + * Allows to reorder the handling of RPCs at servers. + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * NRS core object. + */ +struct nrs_core nrs_core; + +static int nrs_policy_init(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_desc->pd_ops->op_policy_init != NULL ? + policy->pol_desc->pd_ops->op_policy_init(policy) : 0; +} + +static void nrs_policy_fini(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_ref == 0); + LASSERT(policy->pol_req_queued == 0); + + if (policy->pol_desc->pd_ops->op_policy_fini != NULL) + policy->pol_desc->pd_ops->op_policy_fini(policy); +} + +static int nrs_policy_ctl_locked(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + /** + * The policy may be stopped, but the lprocfs files and + * ptlrpc_nrs_policy instances remain present until unregistration time. + * Do not perform the ctl operation if the policy is stopped, as + * policy->pol_private will be NULL in such a case. + */ + if (policy->pol_state == NRS_POL_STATE_STOPPED) + RETURN(-ENODEV); + + RETURN(policy->pol_desc->pd_ops->op_policy_ctl != NULL ? + policy->pol_desc->pd_ops->op_policy_ctl(policy, opc, arg) : + -ENOSYS); +} + +static void nrs_policy_stop0(struct ptlrpc_nrs_policy *policy) +{ + ENTRY; + + if (policy->pol_desc->pd_ops->op_policy_stop != NULL) + policy->pol_desc->pd_ops->op_policy_stop(policy); + + LASSERT(list_empty(&policy->pol_list_queued)); + LASSERT(policy->pol_req_queued == 0 && + policy->pol_req_started == 0); + + policy->pol_private = NULL; + + policy->pol_state = NRS_POL_STATE_STOPPED; + + if (atomic_dec_and_test(&policy->pol_desc->pd_refs)) + module_put(policy->pol_desc->pd_owner); + + EXIT; +} + +static int nrs_policy_stop_locked(struct ptlrpc_nrs_policy *policy) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + ENTRY; + + if (nrs->nrs_policy_fallback == policy && !nrs->nrs_stopping) + RETURN(-EPERM); + + if (policy->pol_state == NRS_POL_STATE_STARTING) + RETURN(-EAGAIN); + + /* In progress or already stopped */ + if (policy->pol_state != NRS_POL_STATE_STARTED) + RETURN(0); + + policy->pol_state = NRS_POL_STATE_STOPPING; + + /* Immediately make it invisible */ + if (nrs->nrs_policy_primary == policy) { + nrs->nrs_policy_primary = NULL; + + } else { + LASSERT(nrs->nrs_policy_fallback == policy); + nrs->nrs_policy_fallback = NULL; + } + + /* I have the only refcount */ + if (policy->pol_ref == 1) + nrs_policy_stop0(policy); + + RETURN(0); +} + +/** + * Transitions the \a nrs NRS head's primary policy to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING and if the policy has no + * pending usage references, to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. 
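nrs_policy_stop_locked() and nrs_policy_put_locked() above form a small state machine: a policy marked STOPPING is only torn down once its last usage reference goes away. A generic sketch of that pattern with the reference counting simplified (the source compares against 1 because the caller itself holds a reference); struct pol and its helpers are illustrative, not the ptlrpc_nrs types:

#include <stdio.h>

enum pol_state { POL_STARTED, POL_STOPPING, POL_STOPPED };

struct pol {
	enum pol_state state;
	int refs;
};

static void pol_teardown(struct pol *p)		/* cf. nrs_policy_stop0() */
{
	p->state = POL_STOPPED;
	printf("policy torn down\n");
}

static void pol_stop(struct pol *p)		/* cf. nrs_policy_stop_locked() */
{
	if (p->state != POL_STARTED)
		return;
	p->state = POL_STOPPING;
	if (p->refs == 0)	/* nobody is using it: stop right away */
		pol_teardown(p);
}

static void pol_put(struct pol *p)		/* cf. nrs_policy_put_locked() */
{
	if (--p->refs == 0 && p->state == POL_STOPPING)
		pol_teardown(p);	/* last user completes the stop */
}

int main(void)
{
	struct pol p = { .state = POL_STARTED, .refs = 1 };

	pol_stop(&p);	/* a queued request still holds a reference */
	pol_put(&p);	/* last reference gone: teardown happens here */
	return 0;
}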
+ * + * \param[in] nrs the NRS head to carry out this operation on + */ +static void nrs_policy_stop_primary(struct ptlrpc_nrs *nrs) +{ + struct ptlrpc_nrs_policy *tmp = nrs->nrs_policy_primary; + ENTRY; + + if (tmp == NULL) { + /** + * XXX: This should really be RETURN_EXIT, but the latter does + * not currently print anything out, and possibly should be + * fixed to do so. + */ + EXIT; + return; + } + + nrs->nrs_policy_primary = NULL; + + LASSERT(tmp->pol_state == NRS_POL_STATE_STARTED); + tmp->pol_state = NRS_POL_STATE_STOPPING; + + if (tmp->pol_ref == 0) + nrs_policy_stop0(tmp); + EXIT; +} + +/** + * Transitions a policy across the ptlrpc_nrs_pol_state range of values, in + * response to an lprocfs command to start a policy. + * + * If a primary policy different to the current one is specified, this function + * will transition the new policy to the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTING and then to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED, and will then transition + * the old primary policy (if there is one) to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding + * references on the policy to ptlrpc_nrs_pol_stae::NRS_POL_STATE_STOPPED. + * + * If the fallback policy is specified, this is taken to indicate an instruction + * to stop the current primary policy, without substituting it with another + * primary policy, so the primary policy (if any) is transitioned to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding + * references on the policy to ptlrpc_nrs_pol_stae::NRS_POL_STATE_STOPPED. In + * this case, the fallback policy is only left active in the NRS head. + */ +static int nrs_policy_start_locked(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + int rc = 0; + ENTRY; + + /** + * Don't allow multiple starting which is too complex, and has no real + * benefit. + */ + if (nrs->nrs_policy_starting) + RETURN(-EAGAIN); + + LASSERT(policy->pol_state != NRS_POL_STATE_STARTING); + + if (policy->pol_state == NRS_POL_STATE_STOPPING) + RETURN(-EAGAIN); + + if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) { + /** + * This is for cases in which the user sets the policy to the + * fallback policy (currently fifo for all services); i.e. the + * user is resetting the policy to the default; so we stop the + * primary policy, if any. + */ + if (policy == nrs->nrs_policy_fallback) { + nrs_policy_stop_primary(nrs); + RETURN(0); + } + + /** + * If we reach here, we must be setting up the fallback policy + * at service startup time, and only a single policy with the + * nrs_policy_flags::PTLRPC_NRS_FL_FALLBACK flag set can + * register with NRS core. + */ + LASSERT(nrs->nrs_policy_fallback == NULL); + } else { + /** + * Shouldn't start primary policy if w/o fallback policy. + */ + if (nrs->nrs_policy_fallback == NULL) + RETURN(-EPERM); + + if (policy->pol_state == NRS_POL_STATE_STARTED) { + /** + * If the policy argument now is different from the last time, + * stop the policy first and start it again with the new + * argument. + */ + if ((arg != NULL) && (strlen(arg) >= NRS_POL_ARG_MAX)) + return -EINVAL; + + if ((arg == NULL && strlen(policy->pol_arg) == 0) || + (arg != NULL && strcmp(policy->pol_arg, arg) == 0)) + RETURN(0); + + rc = nrs_policy_stop_locked(policy); + if (rc) + RETURN(-EAGAIN); + } + } + + /** + * Increase the module usage count for policies registering from other + * modules. 
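As the comment above says, the start path pins the policy's owning module only for the first user, and nrs_policy_stop0() unpins it when the last user goes away (try_module_get()/module_put() guarded by pd_refs). A user-space sketch of that first-user/last-user pattern; plain ints and the *_stub() helpers stand in for the atomics and the module API:

#include <stdbool.h>
#include <stdio.h>

static int pd_refs;		/* stand-in for the atomic pd_refs counter */
static int module_refcount;	/* stand-in for the owner module's refcount */

static bool try_module_get_stub(void) { module_refcount++; return true; }
static void module_put_stub(void)     { module_refcount--; }

static bool policy_start_ref(void)
{
	/* only the first user pins the owning module */
	if (++pd_refs == 1 && !try_module_get_stub()) {
		pd_refs--;
		return false;
	}
	return true;
}

static void policy_stop_ref(void)
{
	/* only the last user unpins it */
	if (--pd_refs == 0)
		module_put_stub();
}

int main(void)
{
	policy_start_ref();
	policy_start_ref();
	policy_stop_ref();
	policy_stop_ref();
	printf("pd_refs=%d module_refcount=%d\n", pd_refs, module_refcount);
	return 0;
}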
+ */ + if (atomic_inc_return(&policy->pol_desc->pd_refs) == 1 && + !try_module_get(policy->pol_desc->pd_owner)) { + atomic_dec(&policy->pol_desc->pd_refs); + CERROR("NRS: cannot get module for policy %s; is it alive?\n", + policy->pol_desc->pd_name); + RETURN(-ENODEV); + } + + /** + * Serialize policy starting across the NRS head + */ + nrs->nrs_policy_starting = 1; + + policy->pol_state = NRS_POL_STATE_STARTING; + + if (policy->pol_desc->pd_ops->op_policy_start) { + spin_unlock(&nrs->nrs_lock); + + rc = policy->pol_desc->pd_ops->op_policy_start(policy, arg); + + spin_lock(&nrs->nrs_lock); + if (rc != 0) { + if (atomic_dec_and_test(&policy->pol_desc->pd_refs)) + module_put(policy->pol_desc->pd_owner); + + policy->pol_state = NRS_POL_STATE_STOPPED; + GOTO(out, rc); + } + } + + if (arg != NULL) { + if (strlcpy(policy->pol_arg, arg, sizeof(policy->pol_arg)) >= + sizeof(policy->pol_arg)) { + CERROR("NRS: arg '%s' is too long\n", arg); + GOTO(out, rc = -E2BIG); + } + } else { + policy->pol_arg[0] = '\0'; + } + + policy->pol_state = NRS_POL_STATE_STARTED; + + if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) { + /** + * This path is only used at PTLRPC service setup time. + */ + nrs->nrs_policy_fallback = policy; + } else { + /* + * Try to stop the current primary policy if there is one. + */ + nrs_policy_stop_primary(nrs); + + /** + * And set the newly-started policy as the primary one. + */ + nrs->nrs_policy_primary = policy; + } + +out: + nrs->nrs_policy_starting = 0; + + RETURN(rc); +} + +/** + * Increases the policy's usage reference count. + */ +static inline void nrs_policy_get_locked(struct ptlrpc_nrs_policy *policy) +{ + policy->pol_ref++; +} + +/** + * Decreases the policy's usage reference count, and stops the policy in case it + * was already stopping and have no more outstanding usage references (which + * indicates it has no more queued or started requests, and can be safely + * stopped). + */ +static void nrs_policy_put_locked(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_ref > 0); + + policy->pol_ref--; + if (unlikely(policy->pol_ref == 0 && + policy->pol_state == NRS_POL_STATE_STOPPING)) + nrs_policy_stop0(policy); +} + +static void nrs_policy_put(struct ptlrpc_nrs_policy *policy) +{ + spin_lock(&policy->pol_nrs->nrs_lock); + nrs_policy_put_locked(policy); + spin_unlock(&policy->pol_nrs->nrs_lock); +} + +/** + * Find and return a policy by name. + */ +static struct ptlrpc_nrs_policy * nrs_policy_find_locked(struct ptlrpc_nrs *nrs, + char *name) +{ + struct ptlrpc_nrs_policy *tmp; + + list_for_each_entry(tmp, &nrs->nrs_policy_list, pol_list) { + if (strncmp(tmp->pol_desc->pd_name, name, + NRS_POL_NAME_MAX) == 0) { + nrs_policy_get_locked(tmp); + return tmp; + } + } + return NULL; +} + +/** + * Release references for the resource hierarchy moving upwards towards the + * policy instance resource. + */ +static void nrs_resource_put(struct ptlrpc_nrs_resource *res) +{ + struct ptlrpc_nrs_policy *policy = res->res_policy; + + if (policy->pol_desc->pd_ops->op_res_put != NULL) { + struct ptlrpc_nrs_resource *parent; + + for (; res != NULL; res = parent) { + parent = res->res_parent; + policy->pol_desc->pd_ops->op_res_put(policy, res); + } + } +} + +/** + * Obtains references for each resource in the resource hierarchy for request + * \a nrq if it is to be handled by \a policy. 
+ * + * \param[in] policy the policy + * \param[in] nrq the request + * \param[in] moving_req denotes whether this is a call to the function by + * ldlm_lock_reorder_req(), in order to move \a nrq to + * the high-priority NRS head; we should not sleep when + * set. + * + * \retval NULL resource hierarchy references not obtained + * \retval valid-pointer the bottom level of the resource hierarchy + * + * \see ptlrpc_nrs_pol_ops::op_res_get() + */ +static +struct ptlrpc_nrs_resource * nrs_resource_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + bool moving_req) +{ + /** + * Set to NULL to traverse the resource hierarchy from the top. + */ + struct ptlrpc_nrs_resource *res = NULL; + struct ptlrpc_nrs_resource *tmp = NULL; + int rc; + + while (1) { + rc = policy->pol_desc->pd_ops->op_res_get(policy, nrq, res, + &tmp, moving_req); + if (rc < 0) { + if (res != NULL) + nrs_resource_put(res); + return NULL; + } + + LASSERT(tmp != NULL); + tmp->res_parent = res; + tmp->res_policy = policy; + res = tmp; + tmp = NULL; + /** + * Return once we have obtained a reference to the bottom level + * of the resource hierarchy. + */ + if (rc > 0) + return res; + } +} + +/** + * Obtains resources for the resource hierarchies and policy references for + * the fallback and current primary policy (if any), that will later be used + * to handle request \a nrq. + * + * \param[in] nrs the NRS head instance that will be handling request \a nrq. + * \param[in] nrq the request that is being handled. + * \param[out] resp the array where references to the resource hierarchy are + * stored. + * \param[in] moving_req is set when obtaining resources while moving a + * request from a policy on the regular NRS head to a + * policy on the HP NRS head (via + * ldlm_lock_reorder_req()). It signifies that + * allocations to get resources should be atomic; for + * a full explanation, see comment in + * ptlrpc_nrs_pol_ops::op_res_get(). + */ +static void nrs_resource_get_safe(struct ptlrpc_nrs *nrs, + struct ptlrpc_nrs_request *nrq, + struct ptlrpc_nrs_resource **resp, + bool moving_req) +{ + struct ptlrpc_nrs_policy *primary = NULL; + struct ptlrpc_nrs_policy *fallback = NULL; + + memset(resp, 0, sizeof(resp[0]) * NRS_RES_MAX); + + /** + * Obtain policy references. + */ + spin_lock(&nrs->nrs_lock); + + fallback = nrs->nrs_policy_fallback; + nrs_policy_get_locked(fallback); + + primary = nrs->nrs_policy_primary; + if (primary != NULL) + nrs_policy_get_locked(primary); + + spin_unlock(&nrs->nrs_lock); + + /** + * Obtain resource hierarchy references. + */ + resp[NRS_RES_FALLBACK] = nrs_resource_get(fallback, nrq, moving_req); + LASSERT(resp[NRS_RES_FALLBACK] != NULL); + + if (primary != NULL) { + resp[NRS_RES_PRIMARY] = nrs_resource_get(primary, nrq, + moving_req); + /** + * A primary policy may exist which may not wish to serve a + * particular request for different reasons; release the + * reference on the policy as it will not be used for this + * request. + */ + if (resp[NRS_RES_PRIMARY] == NULL) + nrs_policy_put(primary); + } +} + +/** + * Releases references to resource hierarchies and policies, because they are no + * longer required; used when request handling has been completed, or the + * request is moving to the high priority NRS head. 
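nrs_resource_get() and nrs_resource_put() above build and release a parent-linked chain of per-policy resources: get walks down from the top until the policy's op_res_get() reports the bottom level, put follows the parent links back up. A compact sketch of that traversal over an invented two-level hierarchy (struct res and op_res_get() here are placeholders, not the Lustre types):

#include <stdio.h>
#include <stdlib.h>

struct res {
	struct res *parent;
	const char *name;
};

/* Pretend policy callback: hands back the next level down and reports
 * (via the return value) whether that level is the bottom one. */
static int op_res_get(struct res *parent, struct res **out)
{
	struct res *r = malloc(sizeof(*r));

	r->name = parent ? "queue level" : "policy level";
	*out = r;
	return parent != NULL;	/* the second level is the bottom here */
}

static struct res *res_get(void)	/* cf. nrs_resource_get() */
{
	struct res *res = NULL;
	struct res *tmp = NULL;

	for (;;) {
		int rc = op_res_get(res, &tmp);

		tmp->parent = res;	/* link each level to its parent */
		res = tmp;
		if (rc > 0)
			return res;	/* bottom level reached */
	}
}

static void res_put(struct res *res)	/* cf. nrs_resource_put() */
{
	while (res) {
		struct res *parent = res->parent;

		printf("releasing %s\n", res->name);
		free(res);
		res = parent;
	}
}

int main(void)
{
	res_put(res_get());
	return 0;
}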
+ * + * \param resp the resource hierarchy that is being released + * + * \see ptlrpcnrs_req_hp_move() + * \see ptlrpc_nrs_req_finalize() + */ +static void nrs_resource_put_safe(struct ptlrpc_nrs_resource **resp) +{ + struct ptlrpc_nrs_policy *pols[NRS_RES_MAX]; + struct ptlrpc_nrs *nrs = NULL; + int i; + + for (i = 0; i < NRS_RES_MAX; i++) { + if (resp[i] != NULL) { + pols[i] = resp[i]->res_policy; + nrs_resource_put(resp[i]); + resp[i] = NULL; + } else { + pols[i] = NULL; + } + } + + for (i = 0; i < NRS_RES_MAX; i++) { + if (pols[i] == NULL) + continue; + + if (nrs == NULL) { + nrs = pols[i]->pol_nrs; + spin_lock(&nrs->nrs_lock); + } + nrs_policy_put_locked(pols[i]); + } + + if (nrs != NULL) + spin_unlock(&nrs->nrs_lock); +} + +/** + * Obtains an NRS request from \a policy for handling or examination; the + * request should be removed in the 'handling' case. + * + * Calling into this function implies we already know the policy has a request + * waiting to be handled. + * + * \param[in] policy the policy from which a request + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force when set, it will force a policy to return a request if it + * has one pending + * + * \retval the NRS request to be handled + */ +static inline +struct ptlrpc_nrs_request * nrs_request_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct ptlrpc_nrs_request *nrq; + + LASSERT(policy->pol_req_queued > 0); + + nrq = policy->pol_desc->pd_ops->op_req_get(policy, peek, force); + + LASSERT(ergo(nrq != NULL, nrs_request_policy(nrq) == policy)); + + return nrq; +} + +/** + * Enqueues request \a nrq for later handling, via one one the policies for + * which resources where earlier obtained via nrs_resource_get_safe(). The + * function attempts to enqueue the request first on the primary policy + * (if any), since this is the preferred choice. + * + * \param nrq the request being enqueued + * + * \see nrs_resource_get_safe() + */ +static inline void nrs_request_enqueue(struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_nrs_policy *policy; + int rc; + int i; + + /** + * Try in descending order, because the primary policy (if any) is + * the preferred choice. + */ + for (i = NRS_RES_MAX - 1; i >= 0; i--) { + if (nrq->nr_res_ptrs[i] == NULL) + continue; + + nrq->nr_res_idx = i; + policy = nrq->nr_res_ptrs[i]->res_policy; + + rc = policy->pol_desc->pd_ops->op_req_enqueue(policy, nrq); + if (rc == 0) { + policy->pol_nrs->nrs_req_queued++; + policy->pol_req_queued++; + return; + } + } + /** + * Should never get here, as at least the primary policy's + * ptlrpc_nrs_pol_ops::op_req_enqueue() implementation should always + * succeed. + */ + LBUG(); +} + +/** + * Called when a request has been handled + * + * \param[in] nrs the request that has been handled; can be used for + * job/resource control. + * + * \see ptlrpc_nrs_req_stop_nolock() + */ +static inline void nrs_request_stop(struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_nrs_policy *policy = nrs_request_policy(nrq); + + if (policy->pol_desc->pd_ops->op_req_stop) + policy->pol_desc->pd_ops->op_req_stop(policy, nrq); + + LASSERT(policy->pol_nrs->nrs_req_started > 0); + LASSERT(policy->pol_req_started > 0); + + policy->pol_nrs->nrs_req_started--; + policy->pol_req_started--; +} + +/** + * Handler for operations that can be carried out on policies. 
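nrs_request_enqueue() above walks the per-request resource slots from the highest index down, so the primary policy is offered the request first and the fallback only catches what the primary declines. A trimmed-down sketch of that preference loop with two invented enqueue callbacks (0 meaning success, as with op_req_enqueue()):

#include <stdio.h>

#define RES_FALLBACK	0
#define RES_PRIMARY	1
#define RES_MAX		2

/* Pretend the primary policy declines this request. */
static int primary_enqueue(void)  { return -1; }
static int fallback_enqueue(void) { return 0; }

int main(void)
{
	int (*enqueue[RES_MAX])(void) = {
		[RES_FALLBACK] = fallback_enqueue,
		[RES_PRIMARY]  = primary_enqueue,
	};
	int i;

	/* descending order: primary first, fallback as the safety net */
	for (i = RES_MAX - 1; i >= 0; i--) {
		if (enqueue[i] != NULL && enqueue[i]() == 0) {
			printf("enqueued on slot %d\n", i);
			return 0;
		}
	}
	fprintf(stderr, "no policy accepted the request\n");
	return 1;
}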
+ * + * Handles opcodes that are common to all policy types within NRS core, and + * passes any unknown opcodes to the policy-specific control function. + * + * \param[in] nrs the NRS head this policy belongs to. + * \param[in] name the human-readable policy name; should be the same as + * ptlrpc_nrs_pol_desc::pd_name. + * \param[in] opc the opcode of the operation being carried out. + * \param[in,out] arg can be used to pass information in and out between when + * carrying an operation; usually data that is private to + * the policy at some level, or generic policy status + * information. + * + * \retval -ve error condition + * \retval 0 operation was carried out successfully + */ +static int nrs_policy_ctl(struct ptlrpc_nrs *nrs, char *name, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + struct ptlrpc_nrs_policy *policy; + int rc = 0; + ENTRY; + + spin_lock(&nrs->nrs_lock); + + policy = nrs_policy_find_locked(nrs, name); + if (policy == NULL) + GOTO(out, rc = -ENOENT); + + if (policy->pol_state != NRS_POL_STATE_STARTED && + policy->pol_state != NRS_POL_STATE_STOPPED) + GOTO(out, rc = -EAGAIN); + + switch (opc) { + /** + * Unknown opcode, pass it down to the policy-specific control + * function for handling. + */ + default: + rc = nrs_policy_ctl_locked(policy, opc, arg); + break; + + /** + * Start \e policy + */ + case PTLRPC_NRS_CTL_START: + rc = nrs_policy_start_locked(policy, arg); + break; + } +out: + if (policy != NULL) + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + RETURN(rc); +} + +/** + * Unregisters a policy by name. + * + * \param[in] nrs the NRS head this policy belongs to. + * \param[in] name the human-readable policy name; should be the same as + * ptlrpc_nrs_pol_desc::pd_name + * + * \retval -ve error + * \retval 0 success + */ +static int nrs_policy_unregister(struct ptlrpc_nrs *nrs, char *name) +{ + struct ptlrpc_nrs_policy *policy = NULL; + ENTRY; + + spin_lock(&nrs->nrs_lock); + + policy = nrs_policy_find_locked(nrs, name); + if (policy == NULL) { + spin_unlock(&nrs->nrs_lock); + + CERROR("Can't find NRS policy %s\n", name); + RETURN(-ENOENT); + } + + if (policy->pol_ref > 1) { + CERROR("Policy %s is busy with %d references\n", name, + (int)policy->pol_ref); + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + RETURN(-EBUSY); + } + + LASSERT(policy->pol_req_queued == 0); + LASSERT(policy->pol_req_started == 0); + + if (policy->pol_state != NRS_POL_STATE_STOPPED) { + nrs_policy_stop_locked(policy); + LASSERT(policy->pol_state == NRS_POL_STATE_STOPPED); + } + + list_del(&policy->pol_list); + nrs->nrs_num_pols--; + + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + nrs_policy_fini(policy); + + LASSERT(policy->pol_private == NULL); + OBD_FREE_PTR(policy); + + RETURN(0); +} + +/** + * Register a policy from \policy descriptor \a desc with NRS head \a nrs. + * + * \param[in] nrs the NRS head on which the policy will be registered. + * \param[in] desc the policy descriptor from which the information will be + * obtained to register the policy. 
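nrs_policy_ctl() above handles the opcodes NRS core knows about and forwards everything else to the policy's own control hook, returning -ENOSYS when the policy provides none. A minimal dispatcher of the same shape; the opcode names and hook variable are invented for illustration:

#include <errno.h>
#include <stdio.h>

enum ctl_opc { CTL_START = 1, CTL_POLICY_SPECIFIC = 100 };

/* optional policy-specific hook; left NULL in this example */
static int (*op_policy_ctl)(enum ctl_opc opc, void *arg);

static int policy_ctl(enum ctl_opc opc, void *arg)
{
	switch (opc) {
	case CTL_START:
		printf("core handles the start opcode itself\n");
		return 0;
	default:
		/* unknown opcode: defer to the policy, if it has a hook */
		return op_policy_ctl ? op_policy_ctl(opc, arg) : -ENOSYS;
	}
}

int main(void)
{
	printf("start  -> %d\n", policy_ctl(CTL_START, NULL));
	printf("custom -> %d\n", policy_ctl(CTL_POLICY_SPECIFIC, NULL));
	return 0;
}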
+ * + * \retval -ve error + * \retval 0 success + */ +static int nrs_policy_register(struct ptlrpc_nrs *nrs, + struct ptlrpc_nrs_pol_desc *desc) +{ + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_policy *tmp; + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + int rc; + ENTRY; + + LASSERT(svcpt != NULL); + LASSERT(desc->pd_ops != NULL); + LASSERT(desc->pd_ops->op_res_get != NULL); + LASSERT(desc->pd_ops->op_req_get != NULL); + LASSERT(desc->pd_ops->op_req_enqueue != NULL); + LASSERT(desc->pd_ops->op_req_dequeue != NULL); + LASSERT(desc->pd_compat != NULL); + + OBD_CPT_ALLOC_GFP(policy, svcpt->scp_service->srv_cptable, + svcpt->scp_cpt, sizeof(*policy), GFP_NOFS); + if (policy == NULL) + RETURN(-ENOMEM); + + policy->pol_nrs = nrs; + policy->pol_desc = desc; + policy->pol_state = NRS_POL_STATE_STOPPED; + policy->pol_flags = desc->pd_flags; + + INIT_LIST_HEAD(&policy->pol_list); + INIT_LIST_HEAD(&policy->pol_list_queued); + + rc = nrs_policy_init(policy); + if (rc != 0) { + OBD_FREE_PTR(policy); + RETURN(rc); + } + + spin_lock(&nrs->nrs_lock); + + tmp = nrs_policy_find_locked(nrs, policy->pol_desc->pd_name); + if (tmp != NULL) { + CERROR("NRS policy %s has been registered, can't register it " + "for %s\n", policy->pol_desc->pd_name, + svcpt->scp_service->srv_name); + nrs_policy_put_locked(tmp); + + spin_unlock(&nrs->nrs_lock); + nrs_policy_fini(policy); + OBD_FREE_PTR(policy); + + RETURN(-EEXIST); + } + + list_add_tail(&policy->pol_list, &nrs->nrs_policy_list); + nrs->nrs_num_pols++; + + if (policy->pol_flags & PTLRPC_NRS_FL_REG_START) + rc = nrs_policy_start_locked(policy, NULL); + + spin_unlock(&nrs->nrs_lock); + + if (rc != 0) + (void) nrs_policy_unregister(nrs, policy->pol_desc->pd_name); + + RETURN(rc); +} + +/** + * Enqueue request \a req using one of the policies its resources are referring + * to. + * + * \param[in] req the request to enqueue. + */ +static void ptlrpc_nrs_req_add_nolock(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_policy *policy; + + LASSERT(req->rq_nrq.nr_initialized); + LASSERT(!req->rq_nrq.nr_enqueued); + + nrs_request_enqueue(&req->rq_nrq); + req->rq_nrq.nr_enqueued = 1; + + policy = nrs_request_policy(&req->rq_nrq); + /** + * Add the policy to the NRS head's list of policies with enqueued + * requests, if it has not been added there. + */ + if (unlikely(list_empty(&policy->pol_list_queued))) + list_add_tail(&policy->pol_list_queued, + &policy->pol_nrs->nrs_policy_queued); +} + +/** + * Enqueue a request on the high priority NRS head. + * + * \param req the request to enqueue. + */ +static void ptlrpc_nrs_hpreq_add_nolock(struct ptlrpc_request *req) +{ + int opc = lustre_msg_get_opc(req->rq_reqmsg); + ENTRY; + + spin_lock(&req->rq_lock); + req->rq_hp = 1; + ptlrpc_nrs_req_add_nolock(req); + if (opc != OBD_PING) + DEBUG_REQ(D_NET, req, "high priority req"); + spin_unlock(&req->rq_lock); + EXIT; +} + +/** + * Returns a boolean predicate indicating whether the policy described by + * \a desc is adequate for use with service \a svc. + * + * \param[in] svc the service + * \param[in] desc the policy descriptor + * + * \retval false the policy is not compatible with the service + * \retval true the policy is compatible with the service + */ +static inline bool nrs_policy_compatible(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return desc->pd_compat(svc, desc); +} + +/** + * Registers all compatible policies in nrs_core.nrs_policies, for NRS head + * \a nrs. 
+ * + * \param[in] nrs the NRS head + * + * \retval -ve error + * \retval 0 success + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + * + * \see ptlrpc_service_nrs_setup() + */ +static int nrs_register_policies_locked(struct ptlrpc_nrs *nrs) +{ + struct ptlrpc_nrs_pol_desc *desc; + /* for convenience */ + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int rc = -EINVAL; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (nrs_policy_compatible(svc, desc)) { + rc = nrs_policy_register(nrs, desc); + if (rc != 0) { + CERROR("Failed to register NRS policy %s for " + "partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svc->srv_name, rc); + /** + * Fail registration if any of the policies' + * registration fails. + */ + break; + } + } + } + + RETURN(rc); +} + +/** + * Initializes NRS head \a nrs of service partition \a svcpt, and registers all + * compatible policies in NRS core, with the NRS head. + * + * \param[in] nrs the NRS head + * \param[in] svcpt the PTLRPC service partition to setup + * + * \retval -ve error + * \retval 0 success + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static int nrs_svcpt_setup_locked0(struct ptlrpc_nrs *nrs, + struct ptlrpc_service_part *svcpt) +{ + int rc; + enum ptlrpc_nrs_queue_type queue; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + if (nrs == &svcpt->scp_nrs_reg) + queue = PTLRPC_NRS_QUEUE_REG; + else if (nrs == svcpt->scp_nrs_hp) + queue = PTLRPC_NRS_QUEUE_HP; + else + LBUG(); + + nrs->nrs_svcpt = svcpt; + nrs->nrs_queue_type = queue; + spin_lock_init(&nrs->nrs_lock); + INIT_LIST_HEAD(&nrs->nrs_policy_list); + INIT_LIST_HEAD(&nrs->nrs_policy_queued); + nrs->nrs_throttling = 0; + + rc = nrs_register_policies_locked(nrs); + + RETURN(rc); +} + +/** + * Allocates a regular and optionally a high-priority NRS head (if the service + * handles high-priority RPCs), and then registers all available compatible + * policies on those NRS heads. + * + * \param[in,out] svcpt the PTLRPC service partition to setup + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static int nrs_svcpt_setup_locked(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_nrs *nrs; + int rc; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + /** + * Initialize the regular NRS head. + */ + nrs = nrs_svcpt2nrs(svcpt, false); + rc = nrs_svcpt_setup_locked0(nrs, svcpt); + if (rc < 0) + GOTO(out, rc); + + /** + * Optionally allocate a high-priority NRS head. + */ + if (svcpt->scp_service->srv_ops.so_hpreq_handler == NULL) + GOTO(out, rc); + + OBD_CPT_ALLOC_PTR(svcpt->scp_nrs_hp, + svcpt->scp_service->srv_cptable, + svcpt->scp_cpt); + if (svcpt->scp_nrs_hp == NULL) + GOTO(out, rc = -ENOMEM); + + nrs = nrs_svcpt2nrs(svcpt, true); + rc = nrs_svcpt_setup_locked0(nrs, svcpt); + +out: + RETURN(rc); +} + +/** + * Unregisters all policies on all available NRS heads in a service partition; + * called at PTLRPC service unregistration time. + * + * \param[in] svcpt the PTLRPC service partition + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static void nrs_svcpt_cleanup_locked(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_nrs *nrs; + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_policy *tmp; + int rc; + bool hp = false; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + +again: + /* scp_nrs_hp could be NULL due to short of memory. */ + nrs = hp ? 
svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg; + /* check the nrs_svcpt to see if nrs is initialized. */ + if (!nrs || !nrs->nrs_svcpt) { + EXIT; + return; + } + nrs->nrs_stopping = 1; + + list_for_each_entry_safe(policy, tmp, &nrs->nrs_policy_list, + pol_list) { + rc = nrs_policy_unregister(nrs, policy->pol_desc->pd_name); + LASSERT(rc == 0); + } + + /** + * If the service partition has an HP NRS head, clean that up as well. + */ + if (!hp && nrs_svcpt_has_hp(svcpt)) { + hp = true; + goto again; + } + + if (hp) + OBD_FREE_PTR(nrs); + + EXIT; +} + +/** + * Returns the descriptor for a policy as identified by by \a name. + * + * \param[in] name the policy name + * + * \retval the policy descriptor + * \retval NULL + */ +static struct ptlrpc_nrs_pol_desc *nrs_policy_find_desc_locked(const char *name) +{ + struct ptlrpc_nrs_pol_desc *tmp; + ENTRY; + + list_for_each_entry(tmp, &nrs_core.nrs_policies, pd_list) { + if (strncmp(tmp->pd_name, name, NRS_POL_NAME_MAX) == 0) + RETURN(tmp); + } + RETURN(NULL); +} + +/** + * Removes the policy from all supported NRS heads of all partitions of all + * PTLRPC services. + * + * \param[in] desc the policy descriptor to unregister + * + * \retval -ve error + * \retval 0 successfully unregistered policy on all supported NRS heads + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + * \pre mutex_is_locked(&ptlrpc_all_services_mutex) + */ +static int nrs_policy_unregister_locked(struct ptlrpc_nrs_pol_desc *desc) +{ + struct ptlrpc_nrs *nrs; + struct ptlrpc_service *svc; + struct ptlrpc_service_part *svcpt; + int i; + int rc = 0; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + LASSERT(mutex_is_locked(&ptlrpc_all_services_mutex)); + + list_for_each_entry(svc, &ptlrpc_all_services, srv_list) { + + if (!nrs_policy_compatible(svc, desc) || + unlikely(svc->srv_is_stopping)) + continue; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + bool hp = false; + +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + rc = nrs_policy_unregister(nrs, desc->pd_name); + /** + * Ignore -ENOENT as the policy may not have registered + * successfully on all service partitions. + */ + if (rc == -ENOENT) { + rc = 0; + } else if (rc != 0) { + CERROR("Failed to unregister NRS policy %s for " + "partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svcpt->scp_service->srv_name, rc); + RETURN(rc); + } + + if (!hp && nrs_svc_has_hp(svc)) { + hp = true; + goto again; + } + } + + if (desc->pd_ops->op_lprocfs_fini != NULL) + desc->pd_ops->op_lprocfs_fini(svc); + } + + RETURN(rc); +} + +/** + * Registers a new policy with NRS core. + * + * The function will only succeed if policy registration with all compatible + * service partitions (if any) is successful. + * + * N.B. This function should be called either at ptlrpc module initialization + * time when registering a policy that ships with NRS core, or in a + * module's init() function for policies registering from other modules. 
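Several paths above and below (nrs_svcpt_cleanup_locked(), nrs_policy_unregister_locked(), and later the registration loop) visit the regular head and then the optional high-priority head with the same "hp = false ... goto again" idiom. The idiom in isolation, with operate() and has_hp as placeholders:

#include <stdbool.h>
#include <stdio.h>

static bool has_hp = true;	/* does this partition have an HP head? */

static void operate(bool hp)
{
	printf("operating on the %s head\n", hp ? "high-priority" : "regular");
}

int main(void)
{
	bool hp = false;
again:
	operate(hp);
	if (!hp && has_hp) {
		hp = true;
		goto again;	/* repeat the same work on the HP head */
	}
	return 0;
}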
+ * + * \param[in] conf configuration information for the new policy to register + * + * \retval -ve error + * \retval 0 success + */ +static int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf) +{ + struct ptlrpc_service *svc; + struct ptlrpc_nrs_pol_desc *desc; + int rc = 0; + ENTRY; + + LASSERT(conf != NULL); + LASSERT(conf->nc_ops != NULL); + LASSERT(conf->nc_compat != NULL); + LASSERT(ergo(conf->nc_compat == nrs_policy_compat_one, + conf->nc_compat_svc_name != NULL)); + LASSERT(ergo((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0, + conf->nc_owner != NULL)); + + conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0'; + + /** + * External policies are not allowed to start immediately upon + * registration, as there is a relatively higher chance that their + * registration might fail. In such a case, some policy instances may + * already have requests queued wen unregistration needs to happen as + * part o cleanup; since there is currently no way to drain requests + * from a policy unless the service is unregistering, we just disallow + * this. + */ + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) && + (conf->nc_flags & (PTLRPC_NRS_FL_FALLBACK | + PTLRPC_NRS_FL_REG_START))) { + CERROR("NRS: failing to register policy %s. Please check " + "policy flags; external policies cannot act as fallback " + "policies, or be started immediately upon registration " + "without interaction with lprocfs\n", conf->nc_name); + RETURN(-EINVAL); + } + + mutex_lock(&nrs_core.nrs_mutex); + + if (nrs_policy_find_desc_locked(conf->nc_name) != NULL) { + CERROR("NRS: failing to register policy %s which has already " + "been registered with NRS core!\n", + conf->nc_name); + GOTO(fail, rc = -EEXIST); + } + + OBD_ALLOC_PTR(desc); + if (desc == NULL) + GOTO(fail, rc = -ENOMEM); + + if (strlcpy(desc->pd_name, conf->nc_name, sizeof(desc->pd_name)) >= + sizeof(desc->pd_name)) { + OBD_FREE_PTR(desc); + GOTO(fail, rc = -E2BIG); + } + desc->pd_ops = conf->nc_ops; + desc->pd_compat = conf->nc_compat; + desc->pd_compat_svc_name = conf->nc_compat_svc_name; + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0) + desc->pd_owner = conf->nc_owner; + desc->pd_flags = conf->nc_flags; + atomic_set(&desc->pd_refs, 0); + + /** + * For policies that are held in the same module as NRS (currently + * ptlrpc), do not register the policy with all compatible services, + * as the services will not have started at this point, since we are + * calling from ptlrpc module initialization code. In such cases each + * service will register all compatible policies later, via + * ptlrpc_service_nrs_setup(). 
+ */ + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) == 0) + goto internal; + + /** + * Register the new policy on all compatible services + */ + mutex_lock(&ptlrpc_all_services_mutex); + + list_for_each_entry(svc, &ptlrpc_all_services, srv_list) { + struct ptlrpc_service_part *svcpt; + int i; + int rc2; + + if (!nrs_policy_compatible(svc, desc) || + unlikely(svc->srv_is_stopping)) + continue; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + struct ptlrpc_nrs *nrs; + bool hp = false; +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + rc = nrs_policy_register(nrs, desc); + if (rc != 0) { + CERROR("Failed to register NRS policy %s for " + "partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svcpt->scp_service->srv_name, rc); + + rc2 = nrs_policy_unregister_locked(desc); + /** + * Should not fail at this point + */ + LASSERT(rc2 == 0); + mutex_unlock(&ptlrpc_all_services_mutex); + OBD_FREE_PTR(desc); + GOTO(fail, rc); + } + + if (!hp && nrs_svc_has_hp(svc)) { + hp = true; + goto again; + } + } + + /** + * No need to take a reference to other modules here, as we + * will be calling from the module's init() function. + */ + if (desc->pd_ops->op_lprocfs_init != NULL) { + rc = desc->pd_ops->op_lprocfs_init(svc); + if (rc != 0) { + rc2 = nrs_policy_unregister_locked(desc); + /** + * Should not fail at this point + */ + LASSERT(rc2 == 0); + mutex_unlock(&ptlrpc_all_services_mutex); + OBD_FREE_PTR(desc); + GOTO(fail, rc); + } + } + } + + mutex_unlock(&ptlrpc_all_services_mutex); +internal: + list_add_tail(&desc->pd_list, &nrs_core.nrs_policies); +fail: + mutex_unlock(&nrs_core.nrs_mutex); + + RETURN(rc); +} + +/** + * Setup NRS heads on all service partitions of service \a svc, and register + * all compatible policies on those NRS heads. + * + * To be called from withing ptl + * \param[in] svc the service to setup + * + * \retval -ve error, the calling logic should eventually call + * ptlrpc_service_nrs_cleanup() to undo any work performed + * by this function. + * + * \see ptlrpc_register_service() + * \see ptlrpc_service_nrs_cleanup() + */ +int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + const struct ptlrpc_nrs_pol_desc *desc; + int i; + int rc = 0; + + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Initialize NRS heads on all service CPTs. + */ + ptlrpc_service_for_each_part(svcpt, i, svc) { + rc = nrs_svcpt_setup_locked(svcpt); + if (rc != 0) + GOTO(failed, rc); + } + + /** + * Set up lprocfs interfaces for all supported policies for the + * service. + */ + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (!nrs_policy_compatible(svc, desc)) + continue; + + if (desc->pd_ops->op_lprocfs_init != NULL) { + rc = desc->pd_ops->op_lprocfs_init(svc); + if (rc != 0) + GOTO(failed, rc); + } + } + +failed: + + mutex_unlock(&nrs_core.nrs_mutex); + + RETURN(rc); +} + +/** + * Unregisters all policies on all service partitions of service \a svc. + * + * \param[in] svc the PTLRPC service to unregister + */ +void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + const struct ptlrpc_nrs_pol_desc *desc; + int i; + + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Clean up NRS heads on all service partitions + */ + ptlrpc_service_for_each_part(svcpt, i, svc) + nrs_svcpt_cleanup_locked(svcpt); + + /** + * Clean up lprocfs interfaces for all supported policies for the + * service. 
+ */ + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (!nrs_policy_compatible(svc, desc)) + continue; + + if (desc->pd_ops->op_lprocfs_fini != NULL) + desc->pd_ops->op_lprocfs_fini(svc); + } + + mutex_unlock(&nrs_core.nrs_mutex); +} + +/** + * Obtains NRS head resources for request \a req. + * + * These could be either on the regular or HP NRS head of \a svcpt; resources + * taken on the regular head can later be swapped for HP head resources by + * ldlm_lock_reorder_req(). + * + * \param[in] svcpt the service partition + * \param[in] req the request + * \param[in] hp which NRS head of \a svcpt to use + */ +void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + + memset(&req->rq_nrq, 0, sizeof(req->rq_nrq)); + nrs_resource_get_safe(nrs, &req->rq_nrq, req->rq_nrq.nr_res_ptrs, + false); + + /** + * It is fine to access \e nr_initialized without locking as there is + * no contention at this early stage. + */ + req->rq_nrq.nr_initialized = 1; +} + +/** + * Releases resources for a request; is called after the request has been + * handled. + * + * \param[in] req the request + * + * \see ptlrpc_server_finish_request() + */ +void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req) +{ + if (req->rq_nrq.nr_initialized) { + nrs_resource_put_safe(req->rq_nrq.nr_res_ptrs); + /* no protection on bit nr_initialized because no + * contention at this late stage */ + req->rq_nrq.nr_finalized = 1; + } +} + +void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req) +{ + if (req->rq_nrq.nr_started) + nrs_request_stop(&req->rq_nrq); +} + +/** + * Enqueues request \a req on either the regular or high-priority NRS head + * of service partition \a svcpt. + * + * \param[in] svcpt the service partition + * \param[in] req the request to be enqueued + * \param[in] hp whether to enqueue the request on the regular or + * high-priority NRS head. + */ +void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp) +{ + spin_lock(&svcpt->scp_req_lock); + + if (hp) + ptlrpc_nrs_hpreq_add_nolock(req); + else + ptlrpc_nrs_req_add_nolock(req); + + spin_unlock(&svcpt->scp_req_lock); +} + +static void nrs_request_removed(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_nrs->nrs_req_queued > 0); + LASSERT(policy->pol_req_queued > 0); + + policy->pol_nrs->nrs_req_queued--; + policy->pol_req_queued--; + + /** + * If the policy has no more requests queued, remove it from + * ptlrpc_nrs::nrs_policy_queued. + */ + if (unlikely(policy->pol_req_queued == 0)) { + list_del_init(&policy->pol_list_queued); + + /** + * If there are other policies with queued requests, move the + * current policy to the end so that we can round robin over + * all policies and drain the requests. + */ + } else if (policy->pol_req_queued != policy->pol_nrs->nrs_req_queued) { + LASSERT(policy->pol_req_queued < + policy->pol_nrs->nrs_req_queued); + + list_move_tail(&policy->pol_list_queued, + &policy->pol_nrs->nrs_policy_queued); + } +} + +/** + * Obtains a request for handling from an NRS head of service partition + * \a svcpt. + * + * \param[in] svcpt the service partition + * \param[in] hp whether to obtain a request from the regular or + * high-priority NRS head. + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. 
+ * \param[in] force when set, it will force a policy to return a request if it + * has one pending + * + * \retval the request to be handled + * \retval NULL the head has no requests to serve + */ +struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp, + bool peek, bool force) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_request *nrq; + + /** + * Always try to drain requests from all NRS polices even if they are + * inactive, because the user can change policy status at runtime. + */ + list_for_each_entry(policy, &nrs->nrs_policy_queued, + pol_list_queued) { + nrq = nrs_request_get(policy, peek, force); + if (nrq != NULL) { + if (likely(!peek)) { + nrq->nr_started = 1; + + policy->pol_req_started++; + policy->pol_nrs->nrs_req_started++; + + nrs_request_removed(policy); + } + + return container_of(nrq, struct ptlrpc_request, rq_nrq); + } + } + + return NULL; +} + +/** + * Dequeues request \a req from the policy it has been enqueued on. + * + * \param[in] req the request + */ +void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_policy *policy = nrs_request_policy(&req->rq_nrq); + + policy->pol_desc->pd_ops->op_req_dequeue(policy, &req->rq_nrq); + + req->rq_nrq.nr_enqueued = 0; + + nrs_request_removed(policy); +} + +/** + * Returns whether there are any requests currently enqueued on any of the + * policies of service partition's \a svcpt NRS head specified by \a hp. Should + * be called while holding ptlrpc_service_part::scp_req_lock to get a reliable + * result. + * + * \param[in] svcpt the service partition to enquire. + * \param[in] hp whether the regular or high-priority NRS head is to be + * enquired. + * + * \retval false the indicated NRS head has no enqueued requests. + * \retval true the indicated NRS head has some enqueued requests. + */ +bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + + return nrs->nrs_req_queued > 0; +}; + +/** + * Returns whether NRS policy is throttling reqeust + * + * \param[in] svcpt the service partition to enquire. + * \param[in] hp whether the regular or high-priority NRS head is to be + * enquired. + * + * \retval false the indicated NRS head has no enqueued requests. + * \retval true the indicated NRS head has some enqueued requests. + */ +bool ptlrpc_nrs_req_throttling_nolock(struct ptlrpc_service_part *svcpt, + bool hp) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + + return !!nrs->nrs_throttling; +}; + +/** + * Moves request \a req from the regular to the high-priority NRS head. + * + * \param[in] req the request to move + */ +void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_nrs_request *nrq = &req->rq_nrq; + struct ptlrpc_nrs_resource *res1[NRS_RES_MAX]; + struct ptlrpc_nrs_resource *res2[NRS_RES_MAX]; + ENTRY; + + /** + * Obtain the high-priority NRS head resources. 
+ */ + nrs_resource_get_safe(nrs_svcpt2nrs(svcpt, true), nrq, res1, true); + + spin_lock(&svcpt->scp_req_lock); + + if (!ptlrpc_nrs_req_can_move(req)) + goto out; + + ptlrpc_nrs_req_del_nolock(req); + + memcpy(res2, nrq->nr_res_ptrs, NRS_RES_MAX * sizeof(res2[0])); + memcpy(nrq->nr_res_ptrs, res1, NRS_RES_MAX * sizeof(res1[0])); + + ptlrpc_nrs_hpreq_add_nolock(req); + + memcpy(res1, res2, NRS_RES_MAX * sizeof(res1[0])); +out: + spin_unlock(&svcpt->scp_req_lock); + + /** + * Release either the regular NRS head resources if we moved the + * request, or the high-priority NRS head resources if we took a + * reference earlier in this function and ptlrpc_nrs_req_can_move() + * returned false. + */ + nrs_resource_put_safe(res1); + EXIT; +} + +/** + * Carries out a control operation \a opc on the policy identified by the + * human-readable \a name, on either all partitions, or only on the first + * partition of service \a svc. + * + * \param[in] svc the service the policy belongs to. + * \param[in] queue whether to carry out the command on the policy which + * belongs to the regular, high-priority, or both NRS + * heads of service partitions of \a svc. + * \param[in] name the policy to act upon, by human-readable name + * \param[in] opc the opcode of the operation to carry out + * \param[in] single when set, the operation will only be carried out on the + * NRS heads of the first service partition of \a svc. + * This is useful for some policies which e.g. share + * identical values on the same parameters of different + * service partitions; when reading these parameters via + * lprocfs, these policies may just want to obtain and + * print out the values from the first service partition. + * Storing these values centrally elsewhere then could be + * another solution for this. + * \param[in,out] arg can be used as a generic in/out buffer between control + * operations and the user environment. + * + *\retval -ve error condition + *\retval 0 operation was carried out successfully + */ +int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, + enum ptlrpc_nrs_queue_type queue, char *name, + enum ptlrpc_nrs_ctl opc, bool single, void *arg) +{ + struct ptlrpc_service_part *svcpt; + int i; + int rc = 0; + ENTRY; + + LASSERT(opc != PTLRPC_NRS_CTL_INVALID); + + if ((queue & PTLRPC_NRS_QUEUE_BOTH) == 0) + return -EINVAL; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, false), name, + opc, arg); + if (rc != 0 || (queue == PTLRPC_NRS_QUEUE_REG && + single)) + GOTO(out, rc); + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + /** + * XXX: We could optionally check for + * nrs_svc_has_hp(svc) here, and return an error if it + * is false. Right now we rely on the policies' lprocfs + * handlers that call the present function to make this + * check; if they fail to do so, they might hit the + * assertion inside nrs_svcpt2nrs() below. + */ + rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, true), name, + opc, arg); + if (rc != 0 || single) + GOTO(out, rc); + } + } +out: + RETURN(rc); +} + +/** + * Adds all policies that ship with the ptlrpc module, to NRS core's list of + * policies \e nrs_core.nrs_policies. 
+ * + * \retval 0 all policies have been registered successfully + * \retval -ve error + */ +int ptlrpc_nrs_init(void) +{ + int rc; + ENTRY; + + mutex_init(&nrs_core.nrs_mutex); + INIT_LIST_HEAD(&nrs_core.nrs_policies); + + rc = ptlrpc_nrs_policy_register(&nrs_conf_fifo); + if (rc != 0) + GOTO(fail, rc); + +#ifdef HAVE_SERVER_SUPPORT + rc = ptlrpc_nrs_policy_register(&nrs_conf_crrn); + if (rc != 0) + GOTO(fail, rc); + + rc = ptlrpc_nrs_policy_register(&nrs_conf_orr); + if (rc != 0) + GOTO(fail, rc); + + rc = ptlrpc_nrs_policy_register(&nrs_conf_trr); + if (rc != 0) + GOTO(fail, rc); + rc = ptlrpc_nrs_policy_register(&nrs_conf_tbf); + if (rc != 0) + GOTO(fail, rc); +#endif /* HAVE_SERVER_SUPPORT */ + + rc = ptlrpc_nrs_policy_register(&nrs_conf_delay); + if (rc != 0) + GOTO(fail, rc); + + RETURN(rc); +fail: + /** + * Since no PTLRPC services have been started at this point, all we need + * to do for cleanup is to free the descriptors. + */ + ptlrpc_nrs_fini(); + + RETURN(rc); +} + +/** + * Removes all policy descriptors from nrs_core::nrs_policies, and frees the + * policy descriptors. + * + * Since all PTLRPC services are stopped at this point, there are no more + * instances of any policies, because each service will have stopped its policy + * instances in ptlrpc_service_nrs_cleanup(), so we just need to free the + * descriptors here. + */ +void ptlrpc_nrs_fini(void) +{ + struct ptlrpc_nrs_pol_desc *desc; + struct ptlrpc_nrs_pol_desc *tmp; + + list_for_each_entry_safe(desc, tmp, &nrs_core.nrs_policies, + pd_list) { + list_del_init(&desc->pd_list); + OBD_FREE_PTR(desc); + } +} + +/** @} nrs */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c new file mode 100644 index 0000000000000..dc41970d531b5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c @@ -0,0 +1,830 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs_crr.c + * + * Network Request Scheduler (NRS) CRR-N policy + * + * Request ordering in a batched Round-Robin manner over client NIDs + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name CRR-N policy + * + * Client Round-Robin scheduling over client NIDs + * + * @{ + * + */ + +#define NRS_POL_NAME_CRRN "crrn" + +/** + * Binary heap predicate. 
+ * + * Uses ptlrpc_nrs_request::nr_u::crr::cr_round and + * ptlrpc_nrs_request::nr_u::crr::cr_sequence to compare two binheap nodes and + * produce a binary predicate that shows their relative priority, so that the + * binary heap can perform the necessary sorting operations. + * + * \param[in] e1 the first binheap node to compare + * \param[in] e2 the second binheap node to compare + * + * \retval 0 e1 > e2 + * \retval 1 e1 <= e2 + */ +static int +crrn_req_compare(struct binheap_node *e1, struct binheap_node *e2) +{ + struct ptlrpc_nrs_request *nrq1; + struct ptlrpc_nrs_request *nrq2; + + nrq1 = container_of(e1, struct ptlrpc_nrs_request, nr_node); + nrq2 = container_of(e2, struct ptlrpc_nrs_request, nr_node); + + if (nrq1->nr_u.crr.cr_round < nrq2->nr_u.crr.cr_round) + return 1; + else if (nrq1->nr_u.crr.cr_round > nrq2->nr_u.crr.cr_round) + return 0; + + return nrq1->nr_u.crr.cr_sequence < nrq2->nr_u.crr.cr_sequence; +} + +static struct binheap_ops nrs_crrn_heap_ops = { + .hop_enter = NULL, + .hop_exit = NULL, + .hop_compare = crrn_req_compare, +}; + +/** + * rhashtable operations for nrs_crrn_net::cn_cli_hash + * + * This uses ptlrpc_request::rq_peer.nid as its key, in order to hash + * nrs_crrn_client objects. + */ +static u32 nrs_crrn_hashfn(const void *data, u32 len, u32 seed) +{ + const lnet_nid_t *nid = data; + + seed ^= cfs_hash_64((u64)nid, 32); + return seed; +} + +static int nrs_crrn_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct nrs_crrn_client *cli = obj; + const lnet_nid_t *nid = arg->key; + + return *nid != cli->cc_nid; +} + +static const struct rhashtable_params nrs_crrn_hash_params = { + .key_len = sizeof(lnet_nid_t), + .key_offset = offsetof(struct nrs_crrn_client, cc_nid), + .head_offset = offsetof(struct nrs_crrn_client, cc_rhead), + .hashfn = nrs_crrn_hashfn, + .obj_cmpfn = nrs_crrn_cmpfn, +}; + +static void nrs_crrn_exit(void *vcli, void *data) +{ + struct nrs_crrn_client *cli = vcli; + + LASSERTF(atomic_read(&cli->cc_ref) == 0, + "Busy CRR-N object from client with NID %s, with %d refs\n", + libcfs_nid2str(cli->cc_nid), atomic_read(&cli->cc_ref)); + + OBD_FREE_PTR(cli); +} + +/** + * Called when a CRR-N policy instance is started. + * + * \param[in] policy the policy + * + * \retval -ENOMEM OOM error + * \retval 0 success + */ +static int nrs_crrn_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_crrn_net *net; + int rc = 0; + ENTRY; + + OBD_CPT_ALLOC_PTR(net, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (net == NULL) + RETURN(-ENOMEM); + + net->cn_binheap = binheap_create(&nrs_crrn_heap_ops, + CBH_FLAG_ATOMIC_GROW, 4096, NULL, + nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + if (net->cn_binheap == NULL) + GOTO(out_net, rc = -ENOMEM); + + rc = rhashtable_init(&net->cn_cli_hash, &nrs_crrn_hash_params); + if (rc) + GOTO(out_binheap, rc); + + /** + * Set default quantum value to max_rpcs_in_flight for non-MDS OSCs; + * there may be more RPCs pending from each struct nrs_crrn_client even + * with the default max_rpcs_in_flight value, as we are scheduling over + * NIDs, and there may be more than one mount point per client. + */ + net->cn_quantum = OBD_MAX_RIF_DEFAULT; + /** + * Set to 1 so that the test inside nrs_crrn_req_add() can evaluate to + * true. + */ + net->cn_sequence = 1; + + policy->pol_private = net; + + RETURN(rc); + +out_binheap: + binheap_destroy(net->cn_binheap); +out_net: + OBD_FREE_PTR(net); + + RETURN(rc); +} + +/** + * Called when a CRR-N policy instance is stopped. 
+ * + * Called when the policy has been instructed to transition to the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state and has no more pending + * requests to serve. + * + * \param[in] policy the policy + */ +static void nrs_crrn_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_crrn_net *net = policy->pol_private; + ENTRY; + + LASSERT(net != NULL); + LASSERT(net->cn_binheap != NULL); + LASSERT(binheap_is_empty(net->cn_binheap)); + + rhashtable_free_and_destroy(&net->cn_cli_hash, nrs_crrn_exit, NULL); + binheap_destroy(net->cn_binheap); + + OBD_FREE_PTR(net); +} + +/** + * Performs a policy-specific ctl function on CRR-N policy instances; similar + * to ioctl. + * + * \param[in] policy the policy instance + * \param[in] opc the opcode + * \param[in,out] arg used for passing parameters and information + * + * \pre assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * \post assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * + * \retval 0 operation carried out successfully + * \retval -ve error + */ +static int nrs_crrn_ctl(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, + void *arg) +{ + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch((enum nrs_ctl_crr)opc) { + default: + RETURN(-EINVAL); + + /** + * Read Round Robin quantum size of a policy instance. + */ + case NRS_CTL_CRRN_RD_QUANTUM: { + struct nrs_crrn_net *net = policy->pol_private; + + *(__u16 *)arg = net->cn_quantum; + } + break; + + /** + * Write Round Robin quantum size of a policy instance. + */ + case NRS_CTL_CRRN_WR_QUANTUM: { + struct nrs_crrn_net *net = policy->pol_private; + + net->cn_quantum = *(__u16 *)arg; + LASSERT(net->cn_quantum != 0); + } + break; + } + + RETURN(0); +} + +/** + * Obtains resources from CRR-N policy instances. The top-level resource lives + * inside \e nrs_crrn_net and the second-level resource inside + * \e nrs_crrn_client object instances. + * + * \param[in] policy the policy for which resources are being taken for + * request \a nrq + * \param[in] nrq the request for which resources are being taken + * \param[in] parent parent resource, embedded in nrs_crrn_net for the + * CRR-N policy + * \param[out] resp resources references are placed in this array + * \param[in] moving_req signifies limited caller context; used to perform + * memory allocations in an atomic context in this + * policy + * + * \retval 0 we are returning a top-level, parent resource, one that is + * embedded in an nrs_crrn_net object + * \retval 1 we are returning a bottom-level resource, one that is embedded + * in an nrs_crrn_client object + * + * \see nrs_resource_get_safe() + */ +static int nrs_crrn_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, bool moving_req) +{ + struct nrs_crrn_net *net; + struct nrs_crrn_client *cli; + struct nrs_crrn_client *tmp; + struct ptlrpc_request *req; + + if (parent == NULL) { + *resp = &((struct nrs_crrn_net *)policy->pol_private)->cn_res; + return 0; + } + + net = container_of(parent, struct nrs_crrn_net, cn_res); + req = container_of(nrq, struct ptlrpc_request, rq_nrq); + + cli = rhashtable_lookup_fast(&net->cn_cli_hash, &req->rq_peer.nid, + nrs_crrn_hash_params); + if (cli) + goto out; + + OBD_CPT_ALLOC_GFP(cli, nrs_pol2cptab(policy), nrs_pol2cptid(policy), + sizeof(*cli), moving_req ? 
GFP_ATOMIC : GFP_NOFS); + if (cli == NULL) + return -ENOMEM; + + cli->cc_nid = req->rq_peer.nid; + + atomic_set(&cli->cc_ref, 0); + + tmp = rhashtable_lookup_get_insert_fast(&net->cn_cli_hash, + &cli->cc_rhead, + nrs_crrn_hash_params); + if (tmp) { + /* insertion failed */ + OBD_FREE_PTR(cli); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + cli = tmp; + } +out: + atomic_inc(&cli->cc_ref); + *resp = &cli->cc_res; + + return 1; +} + +/** + * Called when releasing references to the resource hierachy obtained for a + * request for scheduling using the CRR-N policy. + * + * \param[in] policy the policy the resource belongs to + * \param[in] res the resource to be released + */ +static void nrs_crrn_res_put(struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res) +{ + struct nrs_crrn_client *cli; + + /** + * Do nothing for freeing parent, nrs_crrn_net resources + */ + if (res->res_parent == NULL) + return; + + cli = container_of(res, struct nrs_crrn_client, cc_res); + + atomic_dec(&cli->cc_ref); +} + +/** + * Called when getting a request from the CRR-N policy for handlingso that it can be served + * + * \param[in] policy the policy being polled + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force force the policy to return a request; unused in this policy + * + * \retval the request to be handled + * \retval NULL no request available + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request *nrs_crrn_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_crrn_net *net = policy->pol_private; + struct binheap_node *node = binheap_root(net->cn_binheap); + struct ptlrpc_nrs_request *nrq; + + nrq = unlikely(node == NULL) ? NULL : + container_of(node, struct ptlrpc_nrs_request, nr_node); + + if (likely(!peek && nrq != NULL)) { + struct nrs_crrn_client *cli; + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + + cli = container_of(nrs_request_resource(nrq), + struct nrs_crrn_client, cc_res); + + LASSERT(nrq->nr_u.crr.cr_round <= cli->cc_round); + + binheap_remove(net->cn_binheap, &nrq->nr_node); + cli->cc_active--; + + CDEBUG(D_RPCTRACE, + "NRS: starting to handle %s request from %s, with round " + "%llu\n", NRS_POL_NAME_CRRN, + libcfs_id2str(req->rq_peer), nrq->nr_u.crr.cr_round); + + /** Peek at the next request to be served */ + node = binheap_root(net->cn_binheap); + + /** No more requests */ + if (unlikely(node == NULL)) { + net->cn_round++; + } else { + struct ptlrpc_nrs_request *next; + + next = container_of(node, struct ptlrpc_nrs_request, + nr_node); + + if (net->cn_round < next->nr_u.crr.cr_round) + net->cn_round = next->nr_u.crr.cr_round; + } + } + + return nrq; +} + +/** + * Adds request \a nrq to a CRR-N \a policy instance's set of queued requests + * + * A scheduling round is a stream of requests that have been sorted in batches + * according to the client that they originate from (as identified by its NID); + * there can be only one batch for each client in each round. The batches are of + * maximum size nrs_crrn_net:cn_quantum. When a new request arrives for + * scheduling from a client that has exhausted its quantum in its current round, + * it will start scheduling requests on the next scheduling round. 
Clients are + * allowed to schedule requests against a round until all requests for the round + * are serviced, so a client might miss a round if it is not generating requests + * for a long enough period of time. Clients that miss a round will continue + * with scheduling the next request that they generate, starting at the round + * that requests are being dispatched for, at the time of arrival of this new + * request. + * + * Requests are tagged with the round number and a sequence number; the sequence + * number indicates the relative ordering amongst the batches of requests in a + * round, and is identical for all requests in a batch, as is the round number. + * The round and sequence numbers are used by crrn_req_compare() in order to + * maintain an ordered set of rounds, with each round consisting of an ordered + * set of batches of requests. + * + * \param[in] policy the policy + * \param[in] nrq the request to add + * + * \retval 0 request successfully added + * \retval != 0 error + */ +static int nrs_crrn_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_crrn_net *net; + struct nrs_crrn_client *cli; + int rc; + + cli = container_of(nrs_request_resource(nrq), + struct nrs_crrn_client, cc_res); + net = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_crrn_net, cn_res); + + if (cli->cc_quantum == 0 || cli->cc_round < net->cn_round || + (cli->cc_active == 0 && cli->cc_quantum > 0)) { + + /** + * If the client has no pending requests, and still some of its + * quantum remaining unused, which implies it has not had a + * chance to schedule up to its maximum allowed batch size of + * requests in the previous round it participated, schedule this + * next request on a new round; this avoids fragmentation of + * request batches caused by client inactivity, at the expense + * of potentially slightly increased service time for the + * request batch this request will be a part of. + */ + if (cli->cc_active == 0 && cli->cc_quantum > 0) + cli->cc_round++; + + /** A new scheduling round has commenced */ + if (cli->cc_round < net->cn_round) + cli->cc_round = net->cn_round; + + /** I was not the last client through here */ + if (cli->cc_sequence < net->cn_sequence) + cli->cc_sequence = ++net->cn_sequence; + /** + * Reset the quantum if we have reached the maximum quantum + * size for this batch, or even if we have not managed to + * complete a batch size up to its maximum allowed size. + * XXX: Accessed unlocked + */ + cli->cc_quantum = net->cn_quantum; + } + + nrq->nr_u.crr.cr_round = cli->cc_round; + nrq->nr_u.crr.cr_sequence = cli->cc_sequence; + + rc = binheap_insert(net->cn_binheap, &nrq->nr_node); + if (rc == 0) { + cli->cc_active++; + if (--cli->cc_quantum == 0) + cli->cc_round++; + } + return rc; +} + +/** + * Removes request \a nrq from a CRR-N \a policy instance's set of queued + * requests. 
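To make the round/sequence bookkeeping in nrs_crrn_req_add() and crrn_req_compare() above concrete, here is a short worked trace. The quantum of 2, the client labels A and B, and the request names are made up for illustration; the sequence values follow from cn_sequence starting at 1 in nrs_crrn_start() and cn_round starting at 0.

/* Worked example (illustrative only): cn_quantum = 2, clients A and B.
 * A sends r1, r2, r3; B sends r4, r5.  nrs_crrn_req_add() tags each
 * request with (cr_round, cr_sequence):
 *
 *   r1 -> (round 0, seq 2)  A opens a batch; cn_sequence bumps 1 -> 2
 *   r2 -> (round 0, seq 2)  inserting r2 exhausts A's quantum, so
 *                           cc_round for A becomes 1
 *   r3 -> (round 1, seq 2)  A's third request lands in the next round
 *   r4 -> (round 0, seq 3)  B opens its own batch; cn_sequence -> 3
 *   r5 -> (round 0, seq 3)
 *
 * crrn_req_compare() orders by round first and sequence second, so the
 * binheap drains A's two round-0 requests, then B's two round-0
 * requests, and only then r3 in round 1: batched round-robin over
 * client NIDs. */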
+ * + * \param[in] policy the policy + * \param[in] nrq the request to remove + */ +static void nrs_crrn_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_crrn_net *net; + struct nrs_crrn_client *cli; + bool is_root; + + cli = container_of(nrs_request_resource(nrq), + struct nrs_crrn_client, cc_res); + net = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_crrn_net, cn_res); + + LASSERT(nrq->nr_u.crr.cr_round <= cli->cc_round); + + is_root = &nrq->nr_node == binheap_root(net->cn_binheap); + + binheap_remove(net->cn_binheap, &nrq->nr_node); + cli->cc_active--; + + /** + * If we just deleted the node at the root of the binheap, we may have + * to adjust round numbers. + */ + if (unlikely(is_root)) { + /** Peek at the next request to be served */ + struct binheap_node *node = binheap_root(net->cn_binheap); + + /** No more requests */ + if (unlikely(node == NULL)) { + net->cn_round++; + } else { + nrq = container_of(node, struct ptlrpc_nrs_request, + nr_node); + + if (net->cn_round < nrq->nr_u.crr.cr_round) + net->cn_round = nrq->nr_u.crr.cr_round; + } + } +} + +/** + * Called right after the request \a nrq finishes being handled by CRR-N policy + * instance \a policy. + * + * \param[in] policy the policy that handled the request + * \param[in] nrq the request that was handled + */ +static void nrs_crrn_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + CDEBUG(D_RPCTRACE, + "NRS: finished handling %s request from %s, with round %llu" + "\n", NRS_POL_NAME_CRRN, + libcfs_id2str(req->rq_peer), nrq->nr_u.crr.cr_round); +} + +/** + * debugfs interface + */ + +/** + * Retrieves the value of the Round Robin quantum (i.e. the maximum batch size) + * for CRR-N policy instances on both the regular and high-priority NRS head + * of a service, as long as a policy instance is not in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; policy instances in this + * state are skipped later by nrs_crrn_ctl(). + * + * Quantum values are in # of RPCs, and output is in YAML format. + * + * For example: + * + * reg_quantum:8 + * hp_quantum:4 + */ +static int +ptlrpc_lprocfs_nrs_crrn_quantum_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + __u16 quantum; + int rc; + + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_RD_QUANTUM, + true, &quantum); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_REG + "%-5d\n", quantum); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_RD_QUANTUM, + true, &quantum); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_HP"%-5d\n", quantum); + /** + * Ignore -ENODEV as the high priority NRS head's policy may be + * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + +no_hp: + return rc; +} + +/** + * Sets the value of the Round Robin quantum (i.e. 
the maximum batch size) + * for CRR-N policy instances of a service. The user can set the quantum size + * for the regular or high priority NRS head individually by specifying each + * value, or both together in a single invocation. + * + * For example: + * + * lctl set_param *.*.*.nrs_crrn_quantum=reg_quantum:32, to set the regular + * request quantum size on all PTLRPC services to 32 + * + * lctl set_param *.*.*.nrs_crrn_quantum=hp_quantum:16, to set the high + * priority request quantum size on all PTLRPC services to 16, and + * + * lctl set_param *.*.ost_io.nrs_crrn_quantum=16, to set both the regular and + * high priority request quantum sizes of the ost_io service to 16. + * + * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state + * are skipped later by nrs_crrn_ctl(). + */ +static ssize_t +ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, + const char __user *buffer, + size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + enum ptlrpc_nrs_queue_type queue = 0; + char kernbuf[LPROCFS_NRS_WR_QUANTUM_MAX_CMD]; + char *val; + long quantum_reg; + long quantum_hp; + /** lprocfs_find_named_value() modifies its argument, so keep a copy */ + size_t count_copy; + int rc = 0; + int rc2 = 0; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + count_copy = count; + + /** + * Check if the regular quantum value has been specified + */ + val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_REG, + &count_copy); + if (val != kernbuf) { + rc = kstrtol(val, 10, &quantum_reg); + if (rc) + return rc; + + queue |= PTLRPC_NRS_QUEUE_REG; + } + + count_copy = count; + + /** + * Check if the high priority quantum value has been specified + */ + val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_HP, + &count_copy); + if (val != kernbuf) { + if (!nrs_svc_has_hp(svc)) + return -ENODEV; + + rc = kstrtol(val, 10, &quantum_hp); + if (rc) + return rc; + + queue |= PTLRPC_NRS_QUEUE_HP; + } + + /** + * If none of the queues has been specified, look for a valid numerical + * value + */ + if (queue == 0) { + rc = kstrtol(kernbuf, 10, &quantum_reg); + if (rc) + return rc; + + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) { + queue |= PTLRPC_NRS_QUEUE_HP; + quantum_hp = quantum_reg; + } + } + + if ((((queue & PTLRPC_NRS_QUEUE_REG) != 0) && + ((quantum_reg > LPROCFS_NRS_QUANTUM_MAX || quantum_reg <= 0))) || + (((queue & PTLRPC_NRS_QUEUE_HP) != 0) && + ((quantum_hp > LPROCFS_NRS_QUANTUM_MAX || quantum_hp <= 0)))) + return -EINVAL; + + /** + * We change the values on regular and HP NRS heads separately, so that + * we do not exit early from ptlrpc_nrs_policy_control() with an error + * returned by nrs_policy_ctl_locked(), in cases where the user has not + * started the policy on either the regular or HP NRS head; i.e. we are + * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned + * only if the operation fails with -ENODEV on all heads that have been + * specified by the command; if at least one operation succeeds, + * success is returned. 
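As a concrete illustration of the rule just described, worked out from the return logic in the handler below:

/* Example outcome (illustrative): the user writes a bare "16" on a
 * service whose HP head has CRR-N started but whose regular head has it
 * stopped.  The regular-head control returns rc = -ENODEV (ignored,
 * since both heads were addressed), the HP-head control returns
 * rc2 = 0, and because at least one head succeeded the write returns
 * `count`.  Only if both heads had returned -ENODEV would -ENODEV be
 * propagated to the caller. */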
+ */ + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_WR_QUANTUM, false, + &quantum_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + return rc; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_WR_QUANTUM, false, + &quantum_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + return rc2; + } + + return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_crrn_quantum); + +/** + * Initializes a CRR-N policy's lprocfs interface for service \a svc + * + * \param[in] svc the service + * + * \retval 0 success + * \retval != 0 error + */ +static int nrs_crrn_lprocfs_init(struct ptlrpc_service *svc) +{ + struct ldebugfs_vars nrs_crrn_lprocfs_vars[] = { + { .name = "nrs_crrn_quantum", + .fops = &ptlrpc_lprocfs_nrs_crrn_quantum_fops, + .data = svc }, + { NULL } + }; + + if (!svc->srv_debugfs_entry) + return 0; + + ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_crrn_lprocfs_vars, NULL); + + return 0; +} + +/** + * CRR-N policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_crrn_ops = { + .op_policy_start = nrs_crrn_start, + .op_policy_stop = nrs_crrn_stop, + .op_policy_ctl = nrs_crrn_ctl, + .op_res_get = nrs_crrn_res_get, + .op_res_put = nrs_crrn_res_put, + .op_req_get = nrs_crrn_req_get, + .op_req_enqueue = nrs_crrn_req_add, + .op_req_dequeue = nrs_crrn_req_del, + .op_req_stop = nrs_crrn_req_stop, + .op_lprocfs_init = nrs_crrn_lprocfs_init, +}; + +/** + * CRR-N policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_crrn = { + .nc_name = NRS_POL_NAME_CRRN, + .nc_ops = &nrs_crrn_ops, + .nc_compat = nrs_policy_compat_all, +}; + +/** @} CRR-N policy */ + +/** @} nrs */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c new file mode 100644 index 0000000000000..c09dd7eaff28e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c @@ -0,0 +1,829 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Cray Inc. All Rights Reserved. + * + * Copyright (c) 2017, Intel Corporation. + */ +/* + * lustre/ptlrpc/nrs_delay.c + * + * Network Request Scheduler (NRS) Delay policy + * + * This policy will delay request handling for some configurable amount of + * time. 
+ * + * Author: Chris Horn + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include + +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name delay + * + * The delay policy schedules RPCs so that they are only processed after some + * configurable amount of time (in seconds) has passed. + * + * The defaults were chosen arbitrarily. + * + * @{ + */ + +#define NRS_POL_NAME_DELAY "delay" + +/* Default minimum delay in seconds. */ +#define NRS_DELAY_MIN_DEFAULT 5 +/* Default maximum delay, in seconds. */ +#define NRS_DELAY_MAX_DEFAULT 300 +/* Default percentage of delayed RPCs. */ +#define NRS_DELAY_PCT_DEFAULT 100 + +/** + * Binary heap predicate. + * + * Elements are sorted according to the start time assigned to the requests + * upon enqueue. An element with an earlier start time is "less than" an + * element with a later start time. + * + * \retval 0 start_time(e1) > start_time(e2) + * \retval 1 start_time(e1) <= start_time(e2) + */ +static int delay_req_compare(struct binheap_node *e1, + struct binheap_node *e2) +{ + struct ptlrpc_nrs_request *nrq1; + struct ptlrpc_nrs_request *nrq2; + + nrq1 = container_of(e1, struct ptlrpc_nrs_request, nr_node); + nrq2 = container_of(e2, struct ptlrpc_nrs_request, nr_node); + + return nrq1->nr_u.delay.req_start_time <= + nrq2->nr_u.delay.req_start_time; +} + +static struct binheap_ops nrs_delay_heap_ops = { + .hop_enter = NULL, + .hop_exit = NULL, + .hop_compare = delay_req_compare, +}; + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes + * the delay-specific private data structure. + * + * \param[in] policy The policy to start + * \param[in] Generic char buffer; unused in this policy + * + * \retval -ENOMEM OOM error + * \retval 0 success + * + * \see nrs_policy_register() + * \see nrs_policy_ctl() + */ +static int nrs_delay_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_delay_data *delay_data; + + ENTRY; + + OBD_CPT_ALLOC_PTR(delay_data, nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + if (delay_data == NULL) + RETURN(-ENOMEM); + + delay_data->delay_binheap = binheap_create(&nrs_delay_heap_ops, + CBH_FLAG_ATOMIC_GROW, + 4096, NULL, + nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + + if (delay_data->delay_binheap == NULL) { + OBD_FREE_PTR(delay_data); + RETURN(-ENOMEM); + } + + delay_data->min_delay = NRS_DELAY_MIN_DEFAULT; + delay_data->max_delay = NRS_DELAY_MAX_DEFAULT; + delay_data->delay_pct = NRS_DELAY_PCT_DEFAULT; + + policy->pol_private = delay_data; + + RETURN(0); +} + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the delay-specific + * private data structure. + * + * \param[in] policy The policy to stop + * + * \see nrs_policy_stop0() + */ +static void nrs_delay_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + + LASSERT(delay_data != NULL); + LASSERT(delay_data->delay_binheap != NULL); + LASSERT(binheap_is_empty(delay_data->delay_binheap)); + + binheap_destroy(delay_data->delay_binheap); + + OBD_FREE_PTR(delay_data); +} + +/** + * Is called for obtaining a delay policy resource. 
+ * + * \param[in] policy The policy on which the request is being asked for + * \param[in] nrq The request for which resources are being taken + * \param[in] parent Parent resource, unused in this policy + * \param[out] resp Resources references are placed in this array + * \param[in] moving_req Signifies limited caller context; unused in this + * policy + * + * \retval 1 The delay policy only has a one-level resource hierarchy + * + * \see nrs_resource_get_safe() + */ +static int nrs_delay_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, bool moving_req) +{ + /** + * Just return the resource embedded inside nrs_delay_data, and end this + * resource hierarchy reference request. + */ + *resp = &((struct nrs_delay_data *)policy->pol_private)->delay_res; + return 1; +} + +/** + * Called when getting a request from the delay policy for handling, or just + * peeking; removes the request from the policy when it is to be handled. + * Requests are only removed from this policy when their start time has + * passed. + * + * \param[in] policy The policy + * \param[in] peek When set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force Force the policy to return a request + * + * \retval The request to be handled + * \retval NULL no request available + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request *nrs_delay_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + struct binheap_node *node; + struct ptlrpc_nrs_request *nrq; + + node = binheap_root(delay_data->delay_binheap); + nrq = unlikely(node == NULL) ? NULL : + container_of(node, struct ptlrpc_nrs_request, nr_node); + + if (likely(nrq != NULL)) { + if (!force && + ktime_get_real_seconds() < nrq->nr_u.delay.req_start_time) + nrq = NULL; + else if (likely(!peek)) + binheap_remove(delay_data->delay_binheap, + &nrq->nr_node); + } + + return nrq; +} + +/** + * Adds request \a nrq to a delay \a policy instance's set of queued requests + * + * A percentage (delay_pct) of incoming requests are delayed by this policy. + * If selected for delay a request start time is calculated. A start time + * is the current time plus a random offset in the range [min_delay, max_delay] + * The start time is recorded in the request, and is then used by + * delay_req_compare() to maintain a set of requests ordered by their start + * times. + * + * \param[in] policy The policy + * \param[in] nrq The request to add + * + * \retval 0 request added + * \retval 1 request not added + * + */ +static int nrs_delay_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + + if (delay_data->delay_pct == 0 || /* Not delaying anything */ + (delay_data->delay_pct != 100 && + delay_data->delay_pct < get_random_u32_below(100))) + return 1; + + nrq->nr_u.delay.req_start_time = ktime_get_real_seconds() + + get_random_u32_below(delay_data->max_delay - delay_data->min_delay + 1) + + delay_data->min_delay; + + return binheap_insert(delay_data->delay_binheap, &nrq->nr_node); +} + +/** + * Removes request \a nrq from \a policy's list of queued requests. 
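The start-time assignment in nrs_delay_req_add() above reduces to simple arithmetic; the comment below restates it with the policy defaults and is only a sketch of what the existing code computes, not new code.

/* With the defaults (min_delay = 5, max_delay = 300, delay_pct = 100)
 * every request is assigned
 *
 *   req_start_time = ktime_get_real_seconds()
 *                    + get_random_u32_below(300 - 5 + 1)   (0..295)
 *                    + 5
 *
 * i.e. a start time uniformly distributed in [now + 5, now + 300]
 * seconds, and nrs_delay_req_get() will not hand the request out
 * (unless forced) until that time has passed. */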
+ * + * \param[in] policy The policy + * \param[in] nrq The request to remove + */ +static void nrs_delay_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + + binheap_remove(delay_data->delay_binheap, &nrq->nr_node); +} + +/** + * Prints a debug statement right before the request \a nrq stops being + * handled. + * + * \param[in] policy The policy handling the request + * \param[in] nrq The request being handled + * + * \see ptlrpc_server_finish_request() + * \see ptlrpc_nrs_req_stop_nolock() + */ +static void nrs_delay_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + DEBUG_REQ(D_RPCTRACE, req, + "NRS: finished delayed request from %s after %llds", + libcfs_id2str(req->rq_peer), + (s64)(nrq->nr_u.delay.req_start_time - + req->rq_srv.sr_arrival_time.tv_sec)); +} + +/** + * Performs ctl functions specific to delay policy instances; similar to ioctl + * + * \param[in] policy the policy instance + * \param[in] opc the opcode + * \param[in,out] arg used for passing parameters and information + * + * \pre assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * \post assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * + * \retval 0 operation carried out successfully + * \retval -ve error + */ +static int nrs_delay_ctl(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + __u32 *val = (__u32 *)arg; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch ((enum nrs_ctl_delay)opc) { + default: + RETURN(-EINVAL); + + case NRS_CTL_DELAY_RD_MIN: + *val = delay_data->min_delay; + break; + + case NRS_CTL_DELAY_WR_MIN: + if (*val > delay_data->max_delay) + RETURN(-EINVAL); + + delay_data->min_delay = *val; + break; + + case NRS_CTL_DELAY_RD_MAX: + *val = delay_data->max_delay; + break; + + case NRS_CTL_DELAY_WR_MAX: + if (*val < delay_data->min_delay) + RETURN(-EINVAL); + + delay_data->max_delay = *val; + break; + + case NRS_CTL_DELAY_RD_PCT: + *val = delay_data->delay_pct; + break; + + case NRS_CTL_DELAY_WR_PCT: + if (*val < 0 || *val > 100) + RETURN(-EINVAL); + + delay_data->delay_pct = *val; + break; + } + RETURN(0); +} + +/** + * debugfs interface + */ + +/* nrs_delay_min and nrs_delay_max are bounded by these values */ +#define LPROCFS_NRS_DELAY_LOWER_BOUND 0 +#define LPROCFS_NRS_DELAY_UPPER_BOUND 65535 + +#define LPROCFS_NRS_DELAY_MIN_NAME "delay_min:" +#define LPROCFS_NRS_DELAY_MIN_NAME_REG "reg_delay_min:" +#define LPROCFS_NRS_DELAY_MIN_NAME_HP "hp_delay_min:" + +/** + * Max size of the nrs_delay_min seq_write buffer. Needs to be large enough + * to hold the string: "reg_min_delay:65535 hp_min_delay:65535" + */ +#define LPROCFS_NRS_DELAY_MIN_SIZE \ + sizeof(LPROCFS_NRS_DELAY_MIN_NAME_REG \ + __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND) \ + " " LPROCFS_NRS_DELAY_MIN_NAME_HP \ + __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND)) + +#define LPROCFS_NRS_DELAY_MAX_NAME "delay_max:" +#define LPROCFS_NRS_DELAY_MAX_NAME_REG "reg_delay_max:" +#define LPROCFS_NRS_DELAY_MAX_NAME_HP "hp_delay_max:" + +/** + * Similar to LPROCFS_NRS_DELAY_MIN_SIZE above, but for the nrs_delay_max + * variable. 
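For reference, the buffer-size macro above expands as follows, worked out from the name macros and __stringify() of the upper bound; the byte count includes the terminating NUL.

/* LPROCFS_NRS_DELAY_MIN_SIZE
 *   == sizeof("reg_delay_min:" "65535" " " "hp_delay_min:" "65535")
 *   == sizeof("reg_delay_min:65535 hp_delay_min:65535")
 *   == 39
 *
 * which is the largest input the nrs_delay_min write handler needs to
 * accept; LPROCFS_NRS_DELAY_MAX_SIZE and LPROCFS_NRS_DELAY_PCT_SIZE
 * below are built the same way from their own name and bound macros. */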
+ */ +#define LPROCFS_NRS_DELAY_MAX_SIZE \ + sizeof(LPROCFS_NRS_DELAY_MAX_NAME_REG \ + __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND) \ + " " LPROCFS_NRS_DELAY_MAX_NAME_HP \ + __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND)) + +#define LPROCFS_NRS_DELAY_PCT_MIN_VAL 0 +#define LPROCFS_NRS_DELAY_PCT_MAX_VAL 100 +#define LPROCFS_NRS_DELAY_PCT_NAME "delay_pct:" +#define LPROCFS_NRS_DELAY_PCT_NAME_REG "reg_delay_pct:" +#define LPROCFS_NRS_DELAY_PCT_NAME_HP "hp_delay_pct:" + +/** + * Similar to LPROCFS_NRS_DELAY_MIN_SIZE above, but for the nrs_delay_pct + * variable. + */ +#define LPROCFS_NRS_DELAY_PCT_SIZE \ + sizeof(LPROCFS_NRS_DELAY_PCT_NAME_REG \ + __stringify(LPROCFS_NRS_DELAY_PCT_MAX_VAL) \ + " " LPROCFS_NRS_DELAY_PCT_NAME_HP \ + __stringify(LPROCFS_NRS_DELAY_PCT_MAX_VAL)) + +/** + * Helper for delay's seq_write functions. + */ +static ssize_t +lprocfs_nrs_delay_seq_write_common(const char __user *buffer, + unsigned int bufsize, size_t count, + const char *var_name, unsigned int min_val, + unsigned int max_val, + struct ptlrpc_service *svc, char *pol_name, + enum ptlrpc_nrs_ctl opc, bool single) +{ + enum ptlrpc_nrs_queue_type queue = 0; + char *kernbuf; + char *val_str; + long unsigned int val_reg; + long unsigned int val_hp; + size_t count_copy; + int rc = 0; + char *tmp = NULL; + int tmpsize = 0; + + if (count > bufsize - 1) + return -EINVAL; + + OBD_ALLOC(kernbuf, bufsize); + if (kernbuf == NULL) + return -ENOMEM; + + if (copy_from_user(kernbuf, buffer, count)) + GOTO(free_kernbuf, rc = -EFAULT); + + tmpsize = strlen("reg_") + strlen(var_name) + 1; + OBD_ALLOC(tmp, tmpsize); + if (tmp == NULL) + GOTO(free_tmp, rc = -ENOMEM); + + /* look for "reg_" in kernbuf */ + snprintf(tmp, tmpsize, "reg_%s", var_name); + count_copy = count; + val_str = lprocfs_find_named_value(kernbuf, tmp, &count_copy); + if (val_str != kernbuf) { + rc = kstrtoul(val_str, 10, &val_reg); + if (rc != 0) + GOTO(free_tmp, rc = -EINVAL); + queue |= PTLRPC_NRS_QUEUE_REG; + } + + /* look for "hp_" in kernbuf */ + snprintf(tmp, tmpsize, "hp_%s", var_name); + count_copy = count; + val_str = lprocfs_find_named_value(kernbuf, tmp, &count_copy); + if (val_str != kernbuf) { + if (!nrs_svc_has_hp(svc)) + GOTO(free_tmp, rc = -ENODEV); + + rc = kstrtoul(val_str, 10, &val_hp); + if (rc != 0) + GOTO(free_tmp, rc = -EINVAL); + queue |= PTLRPC_NRS_QUEUE_HP; + } + + if (queue == 0) { + if (!isdigit(kernbuf[0])) + GOTO(free_tmp, rc = -EINVAL); + + rc = kstrtoul(kernbuf, 10, &val_reg); + if (rc != 0) + GOTO(free_tmp, rc = -EINVAL); + + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) { + queue |= PTLRPC_NRS_QUEUE_HP; + val_hp = val_reg; + } + } + + if (queue & PTLRPC_NRS_QUEUE_REG) { + if (val_reg > max_val || val_reg < min_val) + GOTO(free_tmp, rc = -EINVAL); + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + pol_name, opc, single, &val_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + GOTO(free_tmp, rc); + } + + if (queue & PTLRPC_NRS_QUEUE_HP) { + int rc2 = 0; + if (val_hp > max_val || val_hp < min_val) + GOTO(free_tmp, rc = -EINVAL); + + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + pol_name, opc, single, &val_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + GOTO(free_tmp, rc = rc2); + } + + /* If we've reached here then we want to return count */ + rc = count; + +free_tmp: + OBD_FREE(tmp, tmpsize); +free_kernbuf: + OBD_FREE(kernbuf, bufsize); + + return rc; +} + +/** + * Retrieves the value of the minimum delay 
for delay policy instances on both + * the regular and high-priority NRS head of a service, as long as a policy + * instance is not in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; + */ +static int +ptlrpc_lprocfs_nrs_delay_min_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + unsigned int min_delay; + int rc; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_MIN, + true, &min_delay); + + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_MIN_NAME_REG"%-5d\n", + min_delay); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc != -ENODEV) + return rc; + + if (!nrs_svc_has_hp(svc)) + return 0; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_MIN, + true, &min_delay); + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_MIN_NAME_HP"%-5d\n", + min_delay); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc == -ENODEV) + rc = 0; + + return rc; +} + +/** + * Sets the value of the minimum request delay for delay policy instances of a + * service. The user can set the minimum request delay for the regular or high + * priority NRS head individually by specifying each value, or both together in + * a single invocation. + * + * For example: + * + * lctl set_param *.*.*.nrs_delay_min=reg_delay_min:5, to set the regular + * request minimum delay on all PtlRPC services to 5 seconds + * + * lctl set_param *.*.*.nrs_delay_min=hp_delay_min:2, to set the high-priority + * request minimum delay on all PtlRPC services to 2 seconds, and + * + * lctl set_param *.*.ost_io.nrs_delay_min=8, to set both the regular and + * high priority request minimum delay of the ost_io service to 8 seconds. + */ +static ssize_t +ptlrpc_lprocfs_nrs_delay_min_seq_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + + return lprocfs_nrs_delay_seq_write_common(buffer, + LPROCFS_NRS_DELAY_MIN_SIZE, + count, + LPROCFS_NRS_DELAY_MIN_NAME, + LPROCFS_NRS_DELAY_LOWER_BOUND, + LPROCFS_NRS_DELAY_UPPER_BOUND, + svc, NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_WR_MIN, false); +} +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_min); + +/** + * Retrieves the value of the maximum delay for delay policy instances on both + * the regular and high-priority NRS head of a service, as long as a policy + * instance is not in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; + */ +static int +ptlrpc_lprocfs_nrs_delay_max_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + unsigned int max_delay; + int rc; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_MAX, + true, &max_delay); + + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_MAX_NAME_REG"%-5d\n", + max_delay); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. 
+ */ + else if (rc != -ENODEV) + return rc; + + if (!nrs_svc_has_hp(svc)) + return 0; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_MAX, + true, &max_delay); + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_MAX_NAME_HP"%-5d\n", + max_delay); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc == -ENODEV) + rc = 0; + + return rc; +} + +/** + * Sets the value of the maximum request delay for delay policy instances of a + * service. The user can set the maximum request delay for the regular or high + * priority NRS head individually by specifying each value, or both together in + * a single invocation. + * + * For example: + * + * lctl set_param *.*.*.nrs_delay_max=reg_delay_max:20, to set the regular + * request maximum delay on all PtlRPC services to 20 seconds + * + * lctl set_param *.*.*.nrs_delay_max=hp_delay_max:10, to set the high-priority + * request maximum delay on all PtlRPC services to 10 seconds, and + * + * lctl set_param *.*.ost_io.nrs_delay_max=35, to set both the regular and + * high priority request maximum delay of the ost_io service to 35 seconds. + */ +static ssize_t +ptlrpc_lprocfs_nrs_delay_max_seq_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + + return lprocfs_nrs_delay_seq_write_common(buffer, + LPROCFS_NRS_DELAY_MAX_SIZE, + count, + LPROCFS_NRS_DELAY_MAX_NAME, + LPROCFS_NRS_DELAY_LOWER_BOUND, + LPROCFS_NRS_DELAY_UPPER_BOUND, + svc, NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_WR_MAX, false); +} +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_max); + +/** + * Retrieves the value of the percentage of requests which should be delayed + * for delay policy instances on both the regular and high-priority NRS head + * of a service, as long as a policy instance is not in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; + */ +static int +ptlrpc_lprocfs_nrs_delay_pct_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + unsigned int delay_pct; + int rc; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_PCT, + true, &delay_pct); + + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_PCT_NAME_REG"%-3d\n", + delay_pct); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc != -ENODEV) + return rc; + + if (!nrs_svc_has_hp(svc)) + return 0; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_PCT, + true, &delay_pct); + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_PCT_NAME_HP"%-3d\n", + delay_pct); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc == -ENODEV) + rc = 0; + + return rc; +} + +/** + * Sets the value of the percentage of requests to be delayed for delay policy + * instances of a service. The user can set the percentage for the regular or + * high-priority NRS head individually by specifying each value, or both + * together in a single invocation. 
+ * + * For example: + * + * lctl set_param *.*.*.nrs_delay_pct=reg_delay_pct:5, to delay 5 percent of + * regular requests on all PtlRPC services + * + * lctl set_param *.*.*.nrs_delay_pct=hp_delay_pct:2, to delay 2 percent of + * high-priority requests on all PtlRPC services, and + * + * lctl set_param *.*.ost_io.nrs_delay_pct=8, to delay 8 percent of both + * regular and high-priority requests of the ost_io service. + */ +static ssize_t +ptlrpc_lprocfs_nrs_delay_pct_seq_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + + return lprocfs_nrs_delay_seq_write_common(buffer, + LPROCFS_NRS_DELAY_PCT_SIZE, + count, + LPROCFS_NRS_DELAY_PCT_NAME, + LPROCFS_NRS_DELAY_PCT_MIN_VAL, + LPROCFS_NRS_DELAY_PCT_MAX_VAL, + svc, NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_WR_PCT, false); +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_pct); + +static int nrs_delay_lprocfs_init(struct ptlrpc_service *svc) +{ + struct ldebugfs_vars nrs_delay_lprocfs_vars[] = { + { .name = "nrs_delay_min", + .fops = &ptlrpc_lprocfs_nrs_delay_min_fops, + .data = svc }, + { .name = "nrs_delay_max", + .fops = &ptlrpc_lprocfs_nrs_delay_max_fops, + .data = svc }, + { .name = "nrs_delay_pct", + .fops = &ptlrpc_lprocfs_nrs_delay_pct_fops, + .data = svc }, + { NULL } + }; + + if (!svc->srv_debugfs_entry) + return 0; + + ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_delay_lprocfs_vars, NULL); + + return 0; +} + +/** + * Delay policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_delay_ops = { + .op_policy_start = nrs_delay_start, + .op_policy_stop = nrs_delay_stop, + .op_policy_ctl = nrs_delay_ctl, + .op_res_get = nrs_delay_res_get, + .op_req_get = nrs_delay_req_get, + .op_req_enqueue = nrs_delay_req_add, + .op_req_dequeue = nrs_delay_req_del, + .op_req_stop = nrs_delay_req_stop, + .op_lprocfs_init = nrs_delay_lprocfs_init, +}; + +/** + * Delay policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_delay = { + .nc_name = NRS_POL_NAME_DELAY, + .nc_ops = &nrs_delay_ops, + .nc_compat = nrs_policy_compat_all, +}; + +/** @} delay */ + +/** @} nrs */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_fifo.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_fifo.c new file mode 100644 index 0000000000000..2142ff4f665aa --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_fifo.c @@ -0,0 +1,271 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011, 2014, Intel Corporation. 
+ * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs_fifo.c + * + * Network Request Scheduler (NRS) FIFO policy + * + * Handles RPCs in a FIFO manner, as received from the network. This policy is + * a logical wrapper around previous, non-NRS functionality. It is used as the + * default and fallback policy for all types of RPCs on all PTLRPC service + * partitions, for both regular and high-priority NRS heads. Default here means + * the policy is the one enabled at PTLRPC service partition startup time, and + * fallback means the policy is used to handle RPCs that are not handled + * successfully or are not handled at all by any primary policy that may be + * enabled on a given NRS head. + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name fifo + * + * The FIFO policy is a logical wrapper around previous, non-NRS functionality. + * It schedules RPCs in the same order as they are queued from LNet. + * + * @{ + */ + +#define NRS_POL_NAME_FIFO "fifo" + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a + * policy-specific private data structure. + * + * \param[in] policy The policy to start + * + * \retval -ENOMEM OOM error + * \retval 0 success + * + * \see nrs_policy_register() + * \see nrs_policy_ctl() + */ +static int nrs_fifo_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_fifo_head *head; + + OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (head == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&head->fh_list); + policy->pol_private = head; + return 0; +} + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific + * private data structure. + * + * \param[in] policy The policy to stop + * + * \see nrs_policy_stop0() + */ +static void nrs_fifo_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_fifo_head *head = policy->pol_private; + + LASSERT(head != NULL); + LASSERT(list_empty(&head->fh_list)); + + OBD_FREE_PTR(head); +} + +/** + * Is called for obtaining a FIFO policy resource. + * + * \param[in] policy The policy on which the request is being asked for + * \param[in] nrq The request for which resources are being taken + * \param[in] parent Parent resource, unused in this policy + * \param[out] resp Resources references are placed in this array + * \param[in] moving_req Signifies limited caller context; unused in this + * policy + * + * \retval 1 The FIFO policy only has a one-level resource hierarchy, as since + * it implements a simple scheduling algorithm in which request + * priority is determined on the request arrival order, it does not + * need to maintain a set of resources that would otherwise be used + * to calculate a request's priority. + * + * \see nrs_resource_get_safe() + */ +static int nrs_fifo_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, bool moving_req) +{ + /** + * Just return the resource embedded inside nrs_fifo_head, and end this + * resource hierarchy reference request. 
+ */ + *resp = &((struct nrs_fifo_head *)policy->pol_private)->fh_res; + return 1; +} + +/** + * Called when getting a request from the FIFO policy for handling, or just + * peeking; removes the request from the policy when it is to be handled. + * + * \param[in] policy The policy + * \param[in] peek When set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force Force the policy to return a request; unused in this + * policy + * + * \retval The request to be handled; this is the next request in the FIFO + * queue + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request * nrs_fifo_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_fifo_head *head = policy->pol_private; + struct ptlrpc_nrs_request *nrq; + + nrq = list_first_entry_or_null(&head->fh_list, + struct ptlrpc_nrs_request, + nr_u.fifo.fr_list); + + if (likely(!peek && nrq != NULL)) { + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + + list_del_init(&nrq->nr_u.fifo.fr_list); + + CDEBUG(D_RPCTRACE, "NRS start %s request from %s, seq: %llu" + "\n", policy->pol_desc->pd_name, + libcfs_id2str(req->rq_peer), nrq->nr_u.fifo.fr_sequence); + } + + return nrq; +} + +/** + * Adds request \a nrq to \a policy's list of queued requests + * + * \param[in] policy The policy + * \param[in] nrq The request to add + * + * \retval 0 success; nrs_request_enqueue() assumes this function will always + * succeed + */ +static int nrs_fifo_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_fifo_head *head; + + head = container_of(nrs_request_resource(nrq), struct nrs_fifo_head, + fh_res); + /** + * Only used for debugging + */ + nrq->nr_u.fifo.fr_sequence = head->fh_sequence++; + list_add_tail(&nrq->nr_u.fifo.fr_list, &head->fh_list); + + return 0; +} + +/** + * Removes request \a nrq from \a policy's list of queued requests. + * + * \param[in] policy The policy + * \param[in] nrq The request to remove + */ +static void nrs_fifo_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + LASSERT(!list_empty(&nrq->nr_u.fifo.fr_list)); + list_del_init(&nrq->nr_u.fifo.fr_list); +} + +/** + * Prints a debug statement right before the request \a nrq stops being + * handled. 
+ * + * \param[in] policy The policy handling the request + * \param[in] nrq The request being handled + * + * \see ptlrpc_server_finish_request() + * \see ptlrpc_nrs_req_stop_nolock() + */ +static void nrs_fifo_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: %llu\n", + policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer), + nrq->nr_u.fifo.fr_sequence); +} + +/** + * FIFO policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_fifo_ops = { + .op_policy_start = nrs_fifo_start, + .op_policy_stop = nrs_fifo_stop, + .op_res_get = nrs_fifo_res_get, + .op_req_get = nrs_fifo_req_get, + .op_req_enqueue = nrs_fifo_req_add, + .op_req_dequeue = nrs_fifo_req_del, + .op_req_stop = nrs_fifo_req_stop, +}; + +/** + * FIFO policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_fifo = { + .nc_name = NRS_POL_NAME_FIFO, + .nc_ops = &nrs_fifo_ops, + .nc_compat = nrs_policy_compat_all, + .nc_flags = PTLRPC_NRS_FL_FALLBACK | + PTLRPC_NRS_FL_REG_START +}; + +/** @} fifo */ + +/** @} nrs */ + diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c new file mode 100644 index 0000000000000..a81609a3f084d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c @@ -0,0 +1,1970 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs_orr.c + * + * Network Request Scheduler (NRS) ORR and TRR policies + * + * Request scheduling in a Round-Robin manner over backend-fs objects and OSTs + * respectively + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ + +/** + * \addtogroup nrs + * @{ + */ +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name ORR/TRR policy + * + * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies + * + * ORR performs batched Round Robin scheduling of brw RPCs, based on the FID of + * the backend-fs object that the brw RPC pertains to; the TRR policy performs + * batched Round Robin scheduling of brw RPCs, based on the OST index that the + * RPC pertains to. Both policies also order RPCs in each batch in ascending + * offset order, which is lprocfs-tunable between logical file offsets and + * physical disk offsets, as reported by fiemap. + * + * The TRR policy reuses much of the functionality of ORR.
These two scheduling + * algorithms could alternatively be implemented under a single NRS policy, that + * uses an lprocfs tunable in order to switch between the two types of + * scheduling behaviour. The two algorithms have been implemented as separate + * policies for reasons of clarity to the user, and to avoid issues that would + * otherwise arise at the point of switching between behaviours in the case of + * having a single policy, such as resource cleanup for nrs_orr_object + * instances. It is possible that this may need to be re-examined in the future, + * along with potentially coalescing other policies that perform batched request + * scheduling in a Round-Robin manner, all into one policy. + * + * @{ + */ + +#define NRS_POL_NAME_ORR "orr" +#define NRS_POL_NAME_TRR "trr" + +/** + * Checks if the RPC type of \a nrq is currently handled by an ORR/TRR policy + * + * \param[in] orrd the ORR/TRR policy scheduler instance + * \param[in] nrq the request + * \param[out] opcode the opcode is saved here, just in order to avoid calling + * lustre_msg_get_opc() again later + * + * \retval true request type is supported by the policy instance + * \retval false request type is not supported by the policy instance + */ +static bool nrs_orr_req_supported(struct nrs_orr_data *orrd, + struct ptlrpc_nrs_request *nrq, __u32 *opcode) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + bool rc = false; + + /** + * XXX: nrs_orr_data::od_supp accessed unlocked. + */ + switch (opc) { + case OST_READ: + rc = orrd->od_supp & NOS_OST_READ; + break; + case OST_WRITE: + rc = orrd->od_supp & NOS_OST_WRITE; + break; + } + + if (rc) + *opcode = opc; + + return rc; +} + +/** + * Returns the ORR/TRR key fields for the request \a nrq in \a key. + * + * \param[in] orrd the ORR/TRR policy scheduler instance + * \param[in] nrq the request + * \param[in] opc the request's opcode + * \param[in] name the policy name + * \param[out] key fields of the key are returned here. + * + * \retval 0 key filled successfully + * \retval < 0 error + */ +static int nrs_orr_key_fill(struct nrs_orr_data *orrd, + struct ptlrpc_nrs_request *nrq, __u32 opc, + char *name, struct nrs_orr_key *key) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + struct ost_body *body; + __u32 ost_idx; + bool is_orr = strncmp(name, NRS_POL_NAME_ORR, + NRS_POL_NAME_MAX) == 0; + + LASSERT(req != NULL); + + /** + * This is an attempt to fill in the request key fields while + * moving a request from the regular to the high-priority NRS + * head (via ldlm_lock_reorder_req()), but the request key has + * been adequately filled when nrs_orr_res_get() was called through + * ptlrpc_nrs_req_initialize() for the regular NRS head's ORR/TRR + * policy, so there is nothing to do. + */ + if ((is_orr && nrq->nr_u.orr.or_orr_set) || + (!is_orr && nrq->nr_u.orr.or_trr_set)) { + *key = nrq->nr_u.orr.or_key; + return 0; + } + + /* Bounce unconnected requests to the default policy. 
*/ + if (req->rq_export == NULL) + return -ENOTCONN; + + if (nrq->nr_u.orr.or_orr_set || nrq->nr_u.orr.or_trr_set) + memset(&nrq->nr_u.orr.or_key, 0, sizeof(nrq->nr_u.orr.or_key)); + + ost_idx = class_server_data(req->rq_export->exp_obd)->lsd_osd_index; + + if (is_orr) { + int rc; + /** + * The request pill for OST_READ and OST_WRITE requests is + * initialized in the ost_io service's + * ptlrpc_service_ops::so_hpreq_handler, ost_io_hpreq_handler(), + * so no need to redo it here. + */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + RETURN(-EFAULT); + + rc = ostid_to_fid(&key->ok_fid, &body->oa.o_oi, ost_idx); + if (rc < 0) + return rc; + + nrq->nr_u.orr.or_orr_set = 1; + } else { + key->ok_idx = ost_idx; + nrq->nr_u.orr.or_trr_set = 1; + } + + return 0; +} + +/** + * Populates the range values in \a range with logical offsets obtained via + * \a nb. + * + * \param[in] nb niobuf_remote struct array for this request + * \param[in] niocount count of niobuf_remote structs for this request + * \param[out] range the offset range is returned here + */ +static void nrs_orr_range_fill_logical(struct niobuf_remote *nb, int niocount, + struct nrs_orr_req_range *range) +{ + /* Should we do this at page boundaries ? */ + range->or_start = nb[0].rnb_offset & PAGE_MASK; + range->or_end = (nb[niocount - 1].rnb_offset + + nb[niocount - 1].rnb_len - 1) | ~PAGE_MASK; +} + +/** + * We obtain information just for a single extent, as the request can only be in + * a single place in the binary heap anyway. + */ +#define ORR_NUM_EXTENTS 1 + +/** + * Converts the logical file offset range in \a range, to a physical disk offset + * range in \a range, for a request. Uses obd_get_info() in order to carry out a + * fiemap call and obtain backend-fs extent information. The returned range is + * in physical block numbers. + * + * \param[in] nrq the request + * \param[in] oa obdo struct for this request + * \param[in,out] range the offset range in bytes; logical range in, physical + * range out + * + * \retval 0 physical offsets obtained successfully + * \retvall < 0 error + */ +static int nrs_orr_range_fill_physical(struct ptlrpc_nrs_request *nrq, + struct obdo *oa, + struct nrs_orr_req_range *range) +{ + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + char fiemap_buf[offsetof(struct fiemap, + fm_extents[ORR_NUM_EXTENTS])]; + struct fiemap *fiemap = (struct fiemap *)fiemap_buf; + struct ll_fiemap_info_key key; + loff_t start; + loff_t end; + int rc; + + key = (typeof(key)) { + .lfik_name = KEY_FIEMAP, + .lfik_oa = *oa, + .lfik_fiemap = { + .fm_start = range->or_start, + .fm_length = range->or_end - range->or_start, + .fm_extent_count = ORR_NUM_EXTENTS + } + }; + + rc = obd_get_info(req->rq_svc_thread->t_env, req->rq_export, + sizeof(key), &key, NULL, fiemap); + if (rc < 0) + GOTO(out, rc); + + if (fiemap->fm_mapped_extents == 0 || + fiemap->fm_mapped_extents > ORR_NUM_EXTENTS) + GOTO(out, rc = -EFAULT); + + /** + * Calculate the physical offset ranges for the request from the extent + * information and the logical request offsets. + */ + start = fiemap->fm_extents[0].fe_physical + range->or_start - + fiemap->fm_extents[0].fe_logical; + end = start + range->or_end - range->or_start; + + range->or_start = start; + range->or_end = end; + + nrq->nr_u.orr.or_physical_set = 1; +out: + return rc; +} + +/** + * Sets the offset range the request covers; either in logical file + * offsets or in physical disk offsets. 
+ * + * \param[in] nrq the request + * \param[in] orrd the ORR/TRR policy scheduler instance + * \param[in] opc the request's opcode + * \param[in] moving_req is the request in the process of moving onto the + * high-priority NRS head? + * + * \retval 0 range filled successfully + * \retval != 0 error + */ +static int nrs_orr_range_fill(struct ptlrpc_nrs_request *nrq, + struct nrs_orr_data *orrd, __u32 opc, + bool moving_req) +{ + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + struct obd_ioobj *ioo; + struct niobuf_remote *nb; + struct ost_body *body; + struct nrs_orr_req_range range; + int niocount; + int rc = 0; + + /** + * If we are scheduling using physical disk offsets, but we have filled + * the offset information in the request previously + * (i.e. ldlm_lock_reorder_req() is moving the request to the + * high-priority NRS head), there is no need to do anything, and we can + * exit. Beyond the lack of need, we would also be unable to perform + * the obd_get_info() call required in nrs_orr_range_fill_physical(), + * because ldlm_lock_reorder_req() calls into here while holding a + * spinlock, and retrieving fiemap information via obd_get_info() is a + * potentially sleeping operation. + */ + if (orrd->od_physical && nrq->nr_u.orr.or_physical_set) + return 0; + + ioo = req_capsule_client_get(&req->rq_pill, &RMF_OBD_IOOBJ); + if (ioo == NULL) + GOTO(out, rc = -EFAULT); + + niocount = ioo->ioo_bufcnt; + + nb = req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE); + if (nb == NULL) + GOTO(out, rc = -EFAULT); + + /** + * Use logical information from niobuf_remote structures. + */ + nrs_orr_range_fill_logical(nb, niocount, &range); + + /** + * Obtain physical offsets if selected, and this is an OST_READ RPC. + * We do not enter this block if moving_req is set, which indicates + * that the request is being moved to the high-priority NRS head by + * ldlm_lock_reorder_req(), as that function calls into here while holding + * a spinlock, and nrs_orr_range_fill_physical() can sleep, so we just use + * logical file offsets for the range values for such requests. + */ + if (orrd->od_physical && opc == OST_READ && !moving_req) { + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EFAULT); + + /** + * Translate to physical block offsets from backend filesystem + * extents. + * Ignore return values; if obtaining the physical offsets + * fails, use the logical offsets. + */ + nrs_orr_range_fill_physical(nrq, &body->oa, &range); + } + + nrq->nr_u.orr.or_range = range; +out: + return rc; +} + +/** + * Generates a character string that can be used in order to register uniquely + * named libcfs_hash and slab objects for ORR/TRR policy instances. The + * character string is unique per policy instance, as it includes the policy's + * name, the CPT number, and a {reg|hp} token, and there is one policy instance + * per NRS head on each CPT, and the policy is only compatible with the ost_io + * service. + * + * \param[in] policy the policy instance + * \param[out] name the character array that will hold the generated name + */ +static void nrs_orr_genobjname(struct ptlrpc_nrs_policy *policy, char *name) +{ + snprintf(name, NRS_ORR_OBJ_NAME_MAX, "%s%s%s%d", + "nrs_", policy->pol_desc->pd_name, + policy->pol_nrs->nrs_queue_type == PTLRPC_NRS_QUEUE_REG ?
+ "_reg_" : "_hp_", nrs_pol2cptid(policy)); +} + +/** + * ORR/TRR hash operations + */ +#define NRS_ORR_BITS 24 +#define NRS_ORR_BKT_BITS 12 +#define NRS_ORR_HASH_FLAGS (CFS_HASH_SPIN_BKTLOCK | CFS_HASH_ASSERT_EMPTY) + +#define NRS_TRR_BITS 4 +#define NRS_TRR_BKT_BITS 2 +#define NRS_TRR_HASH_FLAGS CFS_HASH_SPIN_BKTLOCK + +static unsigned +nrs_orr_hop_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(struct nrs_orr_key), mask); +} + +static void *nrs_orr_hop_key(struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + return &orro->oo_key; +} + +static int nrs_orr_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + + return lu_fid_eq(&orro->oo_key.ok_fid, + &((struct nrs_orr_key *)key)->ok_fid); +} + +static void *nrs_orr_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_orr_object, oo_hnode); +} + +static void nrs_orr_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + orro->oo_ref++; +} + +/** + * Removes an nrs_orr_object the hash and frees its memory, if the object has + * no active users. + */ +static void nrs_orr_hop_put_free(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + struct nrs_orr_data *orrd = container_of(orro->oo_res.res_parent, + struct nrs_orr_data, od_res); + struct cfs_hash_bd bd; + + cfs_hash_bd_get_and_lock(hs, &orro->oo_key, &bd, 1); + + if (--orro->oo_ref > 1) { + cfs_hash_bd_unlock(hs, &bd, 1); + + return; + } + LASSERT(orro->oo_ref == 1); + + cfs_hash_bd_del_locked(hs, &bd, hnode); + cfs_hash_bd_unlock(hs, &bd, 1); + + OBD_SLAB_FREE_PTR(orro, orrd->od_cache); +} + +static void nrs_orr_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + orro->oo_ref--; +} + +static int nrs_trr_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + + return orro->oo_key.ok_idx == ((struct nrs_orr_key *)key)->ok_idx; +} + +static void nrs_trr_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + struct nrs_orr_data *orrd = container_of(orro->oo_res.res_parent, + struct nrs_orr_data, od_res); + + LASSERTF(orro->oo_ref == 0, + "Busy NRS TRR policy object for OST with index %u, with %ld " + "refs\n", orro->oo_key.ok_idx, orro->oo_ref); + + OBD_SLAB_FREE_PTR(orro, orrd->od_cache); +} + +static struct cfs_hash_ops nrs_orr_hash_ops = { + .hs_hash = nrs_orr_hop_hash, + .hs_key = nrs_orr_hop_key, + .hs_keycmp = nrs_orr_hop_keycmp, + .hs_object = nrs_orr_hop_object, + .hs_get = nrs_orr_hop_get, + .hs_put = nrs_orr_hop_put_free, + .hs_put_locked = nrs_orr_hop_put, +}; + +static struct cfs_hash_ops nrs_trr_hash_ops = { + .hs_hash = nrs_orr_hop_hash, + .hs_key = nrs_orr_hop_key, + .hs_keycmp = nrs_trr_hop_keycmp, + .hs_object = nrs_orr_hop_object, + .hs_get = nrs_orr_hop_get, + .hs_put = nrs_orr_hop_put, + .hs_put_locked = nrs_orr_hop_put, + .hs_exit = nrs_trr_hop_exit, +}; + +#define NRS_ORR_QUANTUM_DFLT 256 + +/** + * Binary heap predicate. 
+ * + * Uses + * ptlrpc_nrs_request::nr_u::orr::or_round, + * ptlrpc_nrs_request::nr_u::orr::or_sequence, and + * ptlrpc_nrs_request::nr_u::orr::or_range to compare two binheap nodes and + * produce a binary predicate that indicates their relative priority, so that + * the binary heap can perform the necessary sorting operations. + * + * \param[in] e1 the first binheap node to compare + * \param[in] e2 the second binheap node to compare + * + * \retval 0 e1 > e2 + * \retval 1 e1 < e2 + */ +static int +orr_req_compare(struct binheap_node *e1, struct binheap_node *e2) +{ + struct ptlrpc_nrs_request *nrq1; + struct ptlrpc_nrs_request *nrq2; + + nrq1 = container_of(e1, struct ptlrpc_nrs_request, nr_node); + nrq2 = container_of(e2, struct ptlrpc_nrs_request, nr_node); + + /** + * Requests have been scheduled against a different scheduling round. + */ + if (nrq1->nr_u.orr.or_round < nrq2->nr_u.orr.or_round) + return 1; + else if (nrq1->nr_u.orr.or_round > nrq2->nr_u.orr.or_round) + return 0; + + /** + * Requests have been scheduled against the same scheduling round, but + * belong to a different batch, i.e. they pertain to a different + * backend-fs object (for ORR policy instances) or OST (for TRR policy + * instances). + */ + if (nrq1->nr_u.orr.or_sequence < nrq2->nr_u.orr.or_sequence) + return 1; + else if (nrq1->nr_u.orr.or_sequence > nrq2->nr_u.orr.or_sequence) + return 0; + + /** + * If round numbers and sequence numbers are equal, the two requests + * have been scheduled on the same round, and belong to the same batch, + * which means they pertain to the same backend-fs object (if this is an + * ORR policy instance), or to the same OST (if this is a TRR policy + * instance), so these requests should be sorted by ascending offset + * order. + */ + if (nrq1->nr_u.orr.or_range.or_start < + nrq2->nr_u.orr.or_range.or_start) { + return 1; + } else if (nrq1->nr_u.orr.or_range.or_start > + nrq2->nr_u.orr.or_range.or_start) { + return 0; + } else { + /** + * Requests start from the same offset; Dispatch the shorter one + * first; perhaps slightly more chances of hitting caches like + * this. + */ + return nrq1->nr_u.orr.or_range.or_end < + nrq2->nr_u.orr.or_range.or_end; + } +} + +/** + * ORR binary heap operations + */ +static struct binheap_ops nrs_orr_heap_ops = { + .hop_enter = NULL, + .hop_exit = NULL, + .hop_compare = orr_req_compare, +}; + +/** + * Prints a warning message if an ORR/TRR policy is started on a service with + * more than one CPT. Not printed on the console for now, since we don't + * have any performance metrics in the first place, and it is annoying. + * + * \param[in] policy the policy instance + * + * \retval 0 success + */ +static int nrs_orr_init(struct ptlrpc_nrs_policy *policy) +{ + if (policy->pol_nrs->nrs_svcpt->scp_service->srv_ncpts > 1) + CDEBUG(D_CONFIG, "%s: The %s NRS policy was registered on a " + "service with multiple service partitions. This policy " + "may perform better with a single partition.\n", + policy->pol_nrs->nrs_svcpt->scp_service->srv_name, + policy->pol_desc->pd_name); + + return 0; +} + +/** + * Called when an ORR policy instance is started. 
+ * + * \param[in] policy the policy + * + * \retval -ENOMEM OOM error + * \retval 0 success + */ +static int nrs_orr_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_orr_data *orrd; + struct cfs_hash_ops *ops; + unsigned cur_bits; + unsigned max_bits; + unsigned bkt_bits; + unsigned flags; + int rc = 0; + ENTRY; + + OBD_CPT_ALLOC_PTR(orrd, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (orrd == NULL) + RETURN(-ENOMEM); + + /* + * Binary heap instance for sorted incoming requests. + */ + orrd->od_binheap = binheap_create(&nrs_orr_heap_ops, + CBH_FLAG_ATOMIC_GROW, 4096, NULL, + nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + if (orrd->od_binheap == NULL) + GOTO(out_orrd, rc = -ENOMEM); + + nrs_orr_genobjname(policy, orrd->od_objname); + + /** + * Slab cache for NRS ORR/TRR objects. + */ + orrd->od_cache = kmem_cache_create(orrd->od_objname, + sizeof(struct nrs_orr_object), + 0, 0, NULL); + if (orrd->od_cache == NULL) + GOTO(out_binheap, rc = -ENOMEM); + + if (strncmp(policy->pol_desc->pd_name, NRS_POL_NAME_ORR, + NRS_POL_NAME_MAX) == 0) { + ops = &nrs_orr_hash_ops; + cur_bits = NRS_ORR_BITS; + max_bits = NRS_ORR_BITS; + bkt_bits = NRS_ORR_BKT_BITS; + flags = NRS_ORR_HASH_FLAGS; + } else { + ops = &nrs_trr_hash_ops; + cur_bits = NRS_TRR_BITS; + max_bits = NRS_TRR_BITS; + bkt_bits = NRS_TRR_BKT_BITS; + flags = NRS_TRR_HASH_FLAGS; + } + + /** + * Hash for finding objects by struct nrs_orr_key. + * XXX: For TRR, it might be better to avoid using libcfs_hash? + * All that needs to be resolved are OST indices, and they + * will stay relatively stable during an OSS node's lifetime. + */ + orrd->od_obj_hash = cfs_hash_create(orrd->od_objname, cur_bits, + max_bits, bkt_bits, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, ops, flags); + if (orrd->od_obj_hash == NULL) + GOTO(out_cache, rc = -ENOMEM); + + /* XXX: Fields accessed unlocked */ + orrd->od_quantum = NRS_ORR_QUANTUM_DFLT; + orrd->od_supp = NOS_DFLT; + orrd->od_physical = true; + /** + * Set to 1 so that the test inside nrs_orr_req_add() can evaluate to + * true. + */ + orrd->od_sequence = 1; + + policy->pol_private = orrd; + + RETURN(rc); + +out_cache: + kmem_cache_destroy(orrd->od_cache); +out_binheap: + binheap_destroy(orrd->od_binheap); +out_orrd: + OBD_FREE_PTR(orrd); + + RETURN(rc); +} + +/** + * Called when an ORR/TRR policy instance is stopped. + * + * Called when the policy has been instructed to transition to the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state and has no more + * pending requests to serve. + * + * \param[in] policy the policy + */ +static void nrs_orr_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_orr_data *orrd = policy->pol_private; + ENTRY; + + LASSERT(orrd != NULL); + LASSERT(orrd->od_binheap != NULL); + LASSERT(orrd->od_obj_hash != NULL); + LASSERT(orrd->od_cache != NULL); + LASSERT(binheap_is_empty(orrd->od_binheap)); + + binheap_destroy(orrd->od_binheap); + cfs_hash_putref(orrd->od_obj_hash); + kmem_cache_destroy(orrd->od_cache); + + OBD_FREE_PTR(orrd); +} + +/** + * Performs a policy-specific ctl function on ORR/TRR policy instances; similar + * to ioctl. 
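+ * + * For example, a write to the nrs_orr_quantum or nrs_trr_quantum debugfs + * file below reaches this function as an NRS_CTL_ORR_WR_QUANTUM operation, + * with \a arg pointing to the new quantum value that is then stored in + * nrs_orr_data::od_quantum.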
+ * + * \param[in] policy the policy instance + * \param[in] opc the opcode + * \param[in,out] arg used for passing parameters and information + * + * \pre assert_spin_locked(&policy->pol_nrs->nrs_lock) + * \post assert_spin_locked(&policy->pol_nrs->nrs_lock) + * + * \retval 0 operation carried out successfully + * \retval -ve error + */ +static int nrs_orr_ctl(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch((enum nrs_ctl_orr)opc) { + default: + RETURN(-EINVAL); + + case NRS_CTL_ORR_RD_QUANTUM: { + struct nrs_orr_data *orrd = policy->pol_private; + + *(__u16 *)arg = orrd->od_quantum; + } + break; + + case NRS_CTL_ORR_WR_QUANTUM: { + struct nrs_orr_data *orrd = policy->pol_private; + + orrd->od_quantum = *(__u16 *)arg; + LASSERT(orrd->od_quantum != 0); + } + break; + + case NRS_CTL_ORR_RD_OFF_TYPE: { + struct nrs_orr_data *orrd = policy->pol_private; + + *(bool *)arg = orrd->od_physical; + } + break; + + case NRS_CTL_ORR_WR_OFF_TYPE: { + struct nrs_orr_data *orrd = policy->pol_private; + + orrd->od_physical = *(bool *)arg; + } + break; + + case NRS_CTL_ORR_RD_SUPP_REQ: { + struct nrs_orr_data *orrd = policy->pol_private; + + *(enum nrs_orr_supp *)arg = orrd->od_supp; + } + break; + + case NRS_CTL_ORR_WR_SUPP_REQ: { + struct nrs_orr_data *orrd = policy->pol_private; + + orrd->od_supp = *(enum nrs_orr_supp *)arg; + LASSERT((orrd->od_supp & NOS_OST_RW) != 0); + } + break; + } + RETURN(0); +} + +/** + * Obtains resources for ORR/TRR policy instances. The top-level resource lives + * inside \e nrs_orr_data and the second-level resource inside + * \e nrs_orr_object instances. + * + * \param[in] policy the policy for which resources are being taken for + * request \a nrq + * \param[in] nrq the request for which resources are being taken + * \param[in] parent parent resource, embedded in nrs_orr_data for the + * ORR/TRR policies + * \param[out] resp used to return resource references + * \param[in] moving_req signifies limited caller context; used to perform + * memory allocations in an atomic context in this + * policy + * + * \retval 0 we are returning a top-level, parent resource, one that is + * embedded in an nrs_orr_data object + * \retval 1 we are returning a bottom-level resource, one that is embedded + * in an nrs_orr_object object + * + * \see nrs_resource_get_safe() + */ +static int nrs_orr_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, bool moving_req) +{ + struct nrs_orr_data *orrd; + struct nrs_orr_object *orro; + struct nrs_orr_object *tmp; + struct nrs_orr_key key = { { { 0 } } }; + __u32 opc; + int rc = 0; + + /** + * struct nrs_orr_data is requested. + */ + if (parent == NULL) { + *resp = &((struct nrs_orr_data *)policy->pol_private)->od_res; + return 0; + } + + orrd = container_of(parent, struct nrs_orr_data, od_res); + + /** + * If the request type is not supported, fail the enqueuing; the RPC + * will be handled by the fallback NRS policy. + */ + if (!nrs_orr_req_supported(orrd, nrq, &opc)) + return -1; + + /** + * Fill in the key for the request; OST FID for ORR policy instances, + * and OST index for TRR policy instances.
+ */ + rc = nrs_orr_key_fill(orrd, nrq, opc, policy->pol_desc->pd_name, &key); + if (rc < 0) + RETURN(rc); + + /** + * Set the offset range the request covers + */ + rc = nrs_orr_range_fill(nrq, orrd, opc, moving_req); + if (rc < 0) + RETURN(rc); + + orro = cfs_hash_lookup(orrd->od_obj_hash, &key); + if (orro != NULL) + goto out; + + OBD_SLAB_CPT_ALLOC_PTR_GFP(orro, orrd->od_cache, + nrs_pol2cptab(policy), nrs_pol2cptid(policy), + moving_req ? GFP_ATOMIC : GFP_NOFS); + if (orro == NULL) + RETURN(-ENOMEM); + + orro->oo_key = key; + orro->oo_ref = 1; + + tmp = cfs_hash_findadd_unique(orrd->od_obj_hash, &orro->oo_key, + &orro->oo_hnode); + if (tmp != orro) { + OBD_SLAB_FREE_PTR(orro, orrd->od_cache); + orro = tmp; + } +out: + /** + * For debugging purposes + */ + nrq->nr_u.orr.or_key = orro->oo_key; + + *resp = &orro->oo_res; + + return 1; +} + +/** + * Called when releasing references to the resource hierachy obtained for a + * request for scheduling using ORR/TRR policy instances + * + * \param[in] policy the policy the resource belongs to + * \param[in] res the resource to be released + */ +static void nrs_orr_res_put(struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res) +{ + struct nrs_orr_data *orrd; + struct nrs_orr_object *orro; + + /** + * Do nothing for freeing parent, nrs_orr_data resources. + */ + if (res->res_parent == NULL) + return; + + orro = container_of(res, struct nrs_orr_object, oo_res); + orrd = container_of(res->res_parent, struct nrs_orr_data, od_res); + + cfs_hash_put(orrd->od_obj_hash, &orro->oo_hnode); +} + +/** + * Called when polling an ORR/TRR policy instance for a request so that it can + * be served. Returns the request that is at the root of the binary heap, as + * that is the lowest priority one (i.e. libcfs_heap is an implementation of a + * min-heap) + * + * \param[in] policy the policy instance being polled + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force force the policy to return a request; unused in this policy + * + * \retval the request to be handled + * \retval NULL no request available + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request *nrs_orr_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_orr_data *orrd = policy->pol_private; + struct binheap_node *node = binheap_root(orrd->od_binheap); + struct ptlrpc_nrs_request *nrq; + + nrq = unlikely(node == NULL) ? 
NULL : + container_of(node, struct ptlrpc_nrs_request, nr_node); + + if (likely(!peek && nrq != NULL)) { + struct nrs_orr_object *orro; + + orro = container_of(nrs_request_resource(nrq), + struct nrs_orr_object, oo_res); + + LASSERT(nrq->nr_u.orr.or_round <= orro->oo_round); + + binheap_remove(orrd->od_binheap, &nrq->nr_node); + orro->oo_active--; + + if (strncmp(policy->pol_desc->pd_name, NRS_POL_NAME_ORR, + NRS_POL_NAME_MAX) == 0) + CDEBUG(D_RPCTRACE, + "NRS: starting to handle %s request for object " + "with FID "DFID", from OST with index %u, with " + "round %llu\n", NRS_POL_NAME_ORR, + PFID(&orro->oo_key.ok_fid), + nrq->nr_u.orr.or_key.ok_idx, + nrq->nr_u.orr.or_round); + else + CDEBUG(D_RPCTRACE, + "NRS: starting to handle %s request from OST " + "with index %u, with round %llu\n", + NRS_POL_NAME_TRR, nrq->nr_u.orr.or_key.ok_idx, + nrq->nr_u.orr.or_round); + + /** Peek at the next request to be served */ + node = binheap_root(orrd->od_binheap); + + /** No more requests */ + if (unlikely(node == NULL)) { + orrd->od_round++; + } else { + struct ptlrpc_nrs_request *next; + + next = container_of(node, struct ptlrpc_nrs_request, + nr_node); + + if (orrd->od_round < next->nr_u.orr.or_round) + orrd->od_round = next->nr_u.orr.or_round; + } + } + + return nrq; +} + +/** + * Sort-adds request \a nrq to an ORR/TRR \a policy instance's set of queued + * requests in the policy's binary heap. + * + * A scheduling round is a stream of requests that have been sorted in batches + * according to the backend-fs object (for ORR policy instances) or OST (for TRR + * policy instances) that they pertain to (as identified by its IDIF FID or OST + * index respectively); there can be only one batch for each object or OST in + * each round. The batches are of maximum size nrs_orr_data:od_quantum. When a + * new request arrives for scheduling for an object or OST that has exhausted + * its quantum in its current round, the request will be scheduled on the next + * scheduling round. Requests are allowed to be scheduled against a round until + * all requests for the round are serviced, so an object or OST might miss a + * round if requests are not scheduled for it for a long enough period of time. + * Objects or OSTs that miss a round will continue with having their next + * request scheduled, starting at the round that requests are being dispatched + * for, at the time of arrival of this request. + * + * Requests are tagged with the round number and a sequence number; the sequence + * number indicates the relative ordering amongst the batches of requests in a + * round, and is identical for all requests in a batch, as is the round number. + * The round and sequence numbers are used by orr_req_compare() in order to use + * nrs_orr_data::od_binheap in order to maintain an ordered set of rounds, with + * each round consisting of an ordered set of batches of requests, and each + * batch consisting of an ordered set of requests according to their logical + * file or physical disk offsets. 
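+ * + * For example (an illustrative trace with a quantum of 2): if requests A1, + * A2, B1 and A3 arrive in that order for two objects A and B, A1 and A2 form + * object A's batch for the current round, B1 starts object B's batch for the + * same round, and A3, arriving after A's quantum has been exhausted, is + * tagged with the next round; the dispatch order is therefore A1, A2, B1, + * A3, with the requests inside each batch sorted by offset.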
+ * + * \param[in] policy the policy + * \param[in] nrq the request to add + * + * \retval 0 request successfully added + * \retval != 0 error + */ +static int nrs_orr_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_orr_data *orrd; + struct nrs_orr_object *orro; + int rc; + + orro = container_of(nrs_request_resource(nrq), + struct nrs_orr_object, oo_res); + orrd = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_orr_data, od_res); + + if (orro->oo_quantum == 0 || orro->oo_round < orrd->od_round || + (orro->oo_active == 0 && orro->oo_quantum > 0)) { + + /** + * If there are no pending requests for the object/OST, but some + * of its quantum still remains unused, which implies we did not + * get a chance to schedule up to its maximum allowed batch size + * of requests in the previous round this object/OST + * participated in, schedule this next request on a new round; + * this avoids fragmentation of request batches caused by + * intermittent inactivity on the object/OST, at the expense of + * potentially slightly increased service time for the request + * batch this request will be a part of. + */ + if (orro->oo_active == 0 && orro->oo_quantum > 0) + orro->oo_round++; + + /** A new scheduling round has commenced */ + if (orro->oo_round < orrd->od_round) + orro->oo_round = orrd->od_round; + + /** I was not the last object/OST that scheduled a request */ + if (orro->oo_sequence < orrd->od_sequence) + orro->oo_sequence = ++orrd->od_sequence; + /** + * Reset the quantum if we have reached the maximum quantum + * size for this batch, or even if we have not managed to + * complete a batch size up to its maximum allowed size. + * XXX: Accessed unlocked + */ + orro->oo_quantum = orrd->od_quantum; + } + + nrq->nr_u.orr.or_round = orro->oo_round; + nrq->nr_u.orr.or_sequence = orro->oo_sequence; + + rc = binheap_insert(orrd->od_binheap, &nrq->nr_node); + if (rc == 0) { + orro->oo_active++; + if (--orro->oo_quantum == 0) + orro->oo_round++; + } + return rc; +} + +/** + * Removes request \a nrq from an ORR/TRR \a policy instance's set of queued + * requests. + * + * \param[in] policy the policy + * \param[in] nrq the request to remove + */ +static void nrs_orr_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_orr_data *orrd; + struct nrs_orr_object *orro; + bool is_root; + + orro = container_of(nrs_request_resource(nrq), + struct nrs_orr_object, oo_res); + orrd = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_orr_data, od_res); + + LASSERT(nrq->nr_u.orr.or_round <= orro->oo_round); + + is_root = &nrq->nr_node == binheap_root(orrd->od_binheap); + + binheap_remove(orrd->od_binheap, &nrq->nr_node); + orro->oo_active--; + + /** + * If we just deleted the node at the root of the binheap, we may have + * to adjust round numbers. + */ + if (unlikely(is_root)) { + /** Peek at the next request to be served */ + struct binheap_node *node = binheap_root(orrd->od_binheap); + + /** No more requests */ + if (unlikely(node == NULL)) { + orrd->od_round++; + } else { + nrq = container_of(node, struct ptlrpc_nrs_request, + nr_node); + + if (orrd->od_round < nrq->nr_u.orr.or_round) + orrd->od_round = nrq->nr_u.orr.or_round; + } + } +} + +/** + * Called right after the request \a nrq finishes being handled by ORR policy + * instance \a policy. 
+ * + * \param[in] policy the policy that handled the request + * \param[in] nrq the request that was handled + */ +static void nrs_orr_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + /** NB: resource control, credits etc can be added here */ + if (strncmp(policy->pol_desc->pd_name, NRS_POL_NAME_ORR, + NRS_POL_NAME_MAX) == 0) + CDEBUG(D_RPCTRACE, + "NRS: finished handling %s request for object with FID " + DFID", from OST with index %u, with round %llu\n", + NRS_POL_NAME_ORR, PFID(&nrq->nr_u.orr.or_key.ok_fid), + nrq->nr_u.orr.or_key.ok_idx, nrq->nr_u.orr.or_round); + else + CDEBUG(D_RPCTRACE, + "NRS: finished handling %s request from OST with index %u," + " with round %llu\n", + NRS_POL_NAME_TRR, nrq->nr_u.orr.or_key.ok_idx, + nrq->nr_u.orr.or_round); +} + +/** + * debugfs interface + */ + +/** + * This allows to bundle the policy name into the lprocfs_vars::data pointer + * so that lprocfs read/write functions can be used by both the ORR and TRR + * policies. + */ +static struct nrs_lprocfs_orr_data { + struct ptlrpc_service *svc; + char *name; +} lprocfs_orr_data = { + .name = NRS_POL_NAME_ORR +}, lprocfs_trr_data = { + .name = NRS_POL_NAME_TRR +}; + +/** + * Retrieves the value of the Round Robin quantum (i.e. the maximum batch size) + * for ORR/TRR policy instances on both the regular and high-priority NRS head + * of a service, as long as a policy instance is not in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; policy instances in this + * state are skipped later by nrs_orr_ctl(). + * + * Quantum values are in # of RPCs, and the output is in YAML format. + * + * For example: + * + * reg_quantum:256 + * hp_quantum:8 + * + * XXX: the CRR-N version of this, ptlrpc_lprocfs_rd_nrs_crrn_quantum() is + * almost identical; it can be reworked and then reused for ORR/TRR. + */ +static int +ptlrpc_lprocfs_nrs_orr_quantum_seq_show(struct seq_file *m, void *data) +{ + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + __u16 quantum; + int rc; + + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_RD_QUANTUM, + true, &quantum); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_REG "%-5d\n", quantum); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + /** + * We know the ost_io service which is the only one ORR/TRR policies are + * compatible with, do have an HP NRS head, but it may be best to guard + * against a possible change of this in the future. + */ + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, NRS_CTL_ORR_RD_QUANTUM, + true, &quantum); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_HP"%-5d\n", quantum); + /** + * Ignore -ENODEV as the high priority NRS head's policy may be + * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + +no_hp: + + return rc; +} + +/** + * Sets the value of the Round Robin quantum (i.e. the maximum batch size) + * for ORR/TRR policy instances of a service. 
The user can set the quantum size + * for the regular and high priority NRS head separately by specifying each + * value, or both together in a single invocation. + * + * For example: + * + * lctl set_param ost.OSS.ost_io.nrs_orr_quantum=req_quantum:64, to set the + * request quantum size of the ORR policy instance on the regular NRS head of + * the ost_io service to 64 + * + * lctl set_param ost.OSS.ost_io.nrs_trr_quantum=hp_quantum:8 to set the request + * quantum size of the TRR policy instance on the high priority NRS head of the + * ost_io service to 8 + * + * lctl set_param ost.OSS.ost_io.nrs_orr_quantum=32, to set both the request + * quantum size of the ORR policy instance on both the regular and the high + * priority NRS head of the ost_io service to 32 + * + * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state + * are skipped later by nrs_orr_ctl(). + * + * XXX: the CRR-N version of this, ptlrpc_lprocfs_wr_nrs_crrn_quantum() is + * almost identical; it can be reworked and then reused for ORR/TRR. + */ +static ssize_t +ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + enum ptlrpc_nrs_queue_type queue = 0; + char kernbuf[LPROCFS_NRS_WR_QUANTUM_MAX_CMD]; + char *val; + long quantum_reg; + long quantum_hp; + /** lprocfs_find_named_value() modifies its argument, so keep a copy */ + size_t count_copy; + int rc = 0; + int rc2 = 0; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + count_copy = count; + + /** + * Check if the regular quantum value has been specified + */ + val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_REG, + &count_copy); + if (val != kernbuf) { + rc = kstrtol(val, 10, &quantum_reg); + if (rc) + return rc; + queue |= PTLRPC_NRS_QUEUE_REG; + } + + count_copy = count; + + /** + * Check if the high priority quantum value has been specified + */ + val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_HP, + &count_copy); + if (val != kernbuf) { + if (!nrs_svc_has_hp(svc)) + return -ENODEV; + + rc = kstrtol(val, 10, &quantum_hp); + if (rc) + return rc; + + queue |= PTLRPC_NRS_QUEUE_HP; + } + + /** + * If none of the queues has been specified, look for a valid numerical + * value + */ + if (queue == 0) { + rc = kstrtol(kernbuf, 10, &quantum_reg); + if (rc) + return rc; + + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) { + queue |= PTLRPC_NRS_QUEUE_HP; + quantum_hp = quantum_reg; + } + } + + if ((((queue & PTLRPC_NRS_QUEUE_REG) != 0) && + ((quantum_reg > LPROCFS_NRS_QUANTUM_MAX || quantum_reg <= 0))) || + (((queue & PTLRPC_NRS_QUEUE_HP) != 0) && + ((quantum_hp > LPROCFS_NRS_QUANTUM_MAX || quantum_hp <= 0)))) + return -EINVAL; + + /** + * We change the values on regular and HP NRS heads separately, so that + * we do not exit early from ptlrpc_nrs_policy_control() with an error + * returned by nrs_policy_ctl_locked(), in cases where the user has not + * started the policy on either the regular or HP NRS head; i.e. we are + * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned + * only if the operation fails with -ENODEV on all heads that have been + * specified by the command; if at least one operation succeeds, + * success is returned. 
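+ * + * For example, if both heads are specified but the policy has only been + * started on the HP head, the control call for the regular head returns + * -ENODEV and is ignored, the HP head is updated, and the write returns + * \a count; -ENODEV is propagated to the caller only when both calls + * return -ENODEV.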
+ */ + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_WR_QUANTUM, false, + &quantum_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + return rc; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, + NRS_CTL_ORR_WR_QUANTUM, false, + &quantum_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + return rc2; + } + + return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_quantum); + +#define LPROCFS_NRS_OFF_NAME_REG "reg_offset_type:" +#define LPROCFS_NRS_OFF_NAME_HP "hp_offset_type:" + +#define LPROCFS_NRS_OFF_NAME_PHYSICAL "physical" +#define LPROCFS_NRS_OFF_NAME_LOGICAL "logical" + +/** + * Retrieves the offset type used by ORR/TRR policy instances on both the + * regular and high-priority NRS head of a service, as long as a policy + * instance is not in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; + * policy instances in this state are skipped later by nrs_orr_ctl(). + * + * Offset type information is a (physical|logical) string, and output is + * in YAML format. + * + * For example: + * + * reg_offset_type:physical + * hp_offset_type:logical + */ +static int +ptlrpc_lprocfs_nrs_orr_offset_type_seq_show(struct seq_file *m, void *data) +{ + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + bool physical; + int rc; + + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED + * or ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, NRS_CTL_ORR_RD_OFF_TYPE, + true, &physical); + if (rc == 0) { + seq_printf(m, LPROCFS_NRS_OFF_NAME_REG"%s\n", + physical ? LPROCFS_NRS_OFF_NAME_PHYSICAL : + LPROCFS_NRS_OFF_NAME_LOGICAL); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + /** + * We know the ost_io service which is the only one ORR/TRR policies are + * compatible with, do have an HP NRS head, but it may be best to guard + * against a possible change of this in the future. + */ + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, NRS_CTL_ORR_RD_OFF_TYPE, + true, &physical); + if (rc == 0) { + seq_printf(m, LPROCFS_NRS_OFF_NAME_HP"%s\n", + physical ? LPROCFS_NRS_OFF_NAME_PHYSICAL : + LPROCFS_NRS_OFF_NAME_LOGICAL); + /** + * Ignore -ENODEV as the high priority NRS head's policy may be + * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + +no_hp: + return rc; +} + +/** + * Max valid command string is the size of the labels, plus "physical" twice. + * plus a separating ' ' + */ +#define LPROCFS_NRS_WR_OFF_TYPE_MAX_CMD \ + sizeof(LPROCFS_NRS_OFF_NAME_REG LPROCFS_NRS_OFF_NAME_PHYSICAL " " \ + LPROCFS_NRS_OFF_NAME_HP LPROCFS_NRS_OFF_NAME_PHYSICAL) + +/** + * Sets the type of offsets used to order RPCs in ORR/TRR policy instances. The + * user can set offset type for the regular or high priority NRS head + * separately by specifying each value, or both together in a single invocation. 
+ * + * For example: + * + * lctl set_param ost.OSS.ost_io.nrs_orr_offset_type= + * reg_offset_type:physical, to enable the ORR policy instance on the regular + * NRS head of the ost_io service to use physical disk offset ordering. + * + * lctl set_param ost.OSS.ost_io.nrs_trr_offset_type=logical, to enable the TRR + * policy instances on both the regular and high priority NRS heads of the + * ost_io service to use logical file offset ordering. + * + * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state + * are skipped later by nrs_orr_ctl(). + */ +static ssize_t +ptlrpc_lprocfs_nrs_orr_offset_type_seq_write(struct file *file, + const char __user *buffer, + size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + enum ptlrpc_nrs_queue_type queue = 0; + char kernbuf[LPROCFS_NRS_WR_OFF_TYPE_MAX_CMD]; + char *val_reg; + char *val_hp; + bool physical_reg; + bool physical_hp; + size_t count_copy; + int rc = 0; + int rc2 = 0; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + count_copy = count; + + /** + * Check if the regular offset type has been specified + */ + val_reg = lprocfs_find_named_value(kernbuf, + LPROCFS_NRS_OFF_NAME_REG, + &count_copy); + if (val_reg != kernbuf) + queue |= PTLRPC_NRS_QUEUE_REG; + + count_copy = count; + + /** + * Check if the high priority offset type has been specified + */ + val_hp = lprocfs_find_named_value(kernbuf, LPROCFS_NRS_OFF_NAME_HP, + &count_copy); + if (val_hp != kernbuf) { + if (!nrs_svc_has_hp(svc)) + return -ENODEV; + + queue |= PTLRPC_NRS_QUEUE_HP; + } + + /** + * If none of the queues has been specified, there may be a valid + * command string at the start of the buffer. + */ + if (queue == 0) { + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) + queue |= PTLRPC_NRS_QUEUE_HP; + } + + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + if (strncmp(val_reg, LPROCFS_NRS_OFF_NAME_PHYSICAL, + sizeof(LPROCFS_NRS_OFF_NAME_PHYSICAL) - 1) == 0) + physical_reg = true; + else if (strncmp(val_reg, LPROCFS_NRS_OFF_NAME_LOGICAL, + sizeof(LPROCFS_NRS_OFF_NAME_LOGICAL) - 1) == 0) + physical_reg = false; + else + return -EINVAL; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + if (strncmp(val_hp, LPROCFS_NRS_OFF_NAME_PHYSICAL, + sizeof(LPROCFS_NRS_OFF_NAME_PHYSICAL) - 1) == 0) + physical_hp = true; + else if (strncmp(val_hp, LPROCFS_NRS_OFF_NAME_LOGICAL, + sizeof(LPROCFS_NRS_OFF_NAME_LOGICAL) - 1) == 0) + physical_hp = false; + else + return -EINVAL; + } + + /** + * We change the values on regular and HP NRS heads separately, so that + * we do not exit early from ptlrpc_nrs_policy_control() with an error + * returned by nrs_policy_ctl_locked(), in cases where the user has not + * started the policy on either the regular or HP NRS head; i.e. we are + * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned + * only if the operation fails with -ENODEV on all heads that have been + * specified by the command; if at least one operation succeeds, + * success is returned.
+ */ + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_WR_OFF_TYPE, false, + &physical_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + return rc; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, + NRS_CTL_ORR_WR_OFF_TYPE, false, + &physical_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + return rc2; + } + + return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_offset_type); + +#define NRS_LPROCFS_REQ_SUPP_NAME_REG "reg_supported:" +#define NRS_LPROCFS_REQ_SUPP_NAME_HP "hp_supported:" + +#define LPROCFS_NRS_SUPP_NAME_READS "reads" +#define LPROCFS_NRS_SUPP_NAME_WRITES "writes" +#define LPROCFS_NRS_SUPP_NAME_READWRITES "reads_and_writes" + +/** + * Translates enum nrs_orr_supp values to a corresponding string. + */ +static const char *nrs_orr_supp2str(enum nrs_orr_supp supp) +{ + switch(supp) { + default: + LBUG(); + case NOS_OST_READ: + return LPROCFS_NRS_SUPP_NAME_READS; + case NOS_OST_WRITE: + return LPROCFS_NRS_SUPP_NAME_WRITES; + case NOS_OST_RW: + return LPROCFS_NRS_SUPP_NAME_READWRITES; + } +} + +/** + * Translates strings to the corresponding enum nrs_orr_supp value + */ +static enum nrs_orr_supp nrs_orr_str2supp(const char *val) +{ + if (strncmp(val, LPROCFS_NRS_SUPP_NAME_READWRITES, + sizeof(LPROCFS_NRS_SUPP_NAME_READWRITES) - 1) == 0) + return NOS_OST_RW; + else if (strncmp(val, LPROCFS_NRS_SUPP_NAME_READS, + sizeof(LPROCFS_NRS_SUPP_NAME_READS) - 1) == 0) + return NOS_OST_READ; + else if (strncmp(val, LPROCFS_NRS_SUPP_NAME_WRITES, + sizeof(LPROCFS_NRS_SUPP_NAME_WRITES) - 1) == 0) + return NOS_OST_WRITE; + else + return -EINVAL; +} + +/** + * Retrieves the type of RPCs handled at the point of invocation by ORR/TRR + * policy instances on both the regular and high-priority NRS head of a service, + * as long as a policy instance is not in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; policy instances in this + * state are skipped later by nrs_orr_ctl(). + * + * Supported RPC type information is a (reads|writes|reads_and_writes) string, + * and output is in YAML format. + * + * For example: + * + * reg_supported:reads + * hp_supported:reads_and_writes + */ +static int +ptlrpc_lprocfs_nrs_orr_supported_seq_show(struct seq_file *m, void *data) +{ + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + enum nrs_orr_supp supported; + int rc; + + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED + * or ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_RD_SUPP_REQ, true, + &supported); + + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_REQ_SUPP_NAME_REG"%s\n", + nrs_orr_supp2str(supported)); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + /** + * We know the ost_io service which is the only one ORR/TRR policies are + * compatible with, do have an HP NRS head, but it may be best to guard + * against a possible change of this in the future. 
+ */ + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, + NRS_CTL_ORR_RD_SUPP_REQ, true, + &supported); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_REQ_SUPP_NAME_HP"%s\n", + nrs_orr_supp2str(supported)); + /** + * Ignore -ENODEV as the high priority NRS head's policy may be + * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + +no_hp: + + return rc; +} + +/** + * Max valid command string is the size of the labels, plus "reads_and_writes" + * twice, plus a separating ' ' + */ +#define LPROCFS_NRS_WR_REQ_SUPP_MAX_CMD \ + sizeof(NRS_LPROCFS_REQ_SUPP_NAME_REG LPROCFS_NRS_SUPP_NAME_READWRITES \ + NRS_LPROCFS_REQ_SUPP_NAME_HP LPROCFS_NRS_SUPP_NAME_READWRITES \ + " ") + +/** + * Sets the type of RPCs handled by ORR/TRR policy instances. The user can + * modify this setting for the regular or high priority NRS heads separately, or + * both together in a single invocation. + * + * For example: + * + * lctl set_param ost.OSS.ost_io.nrs_orr_supported= + * "reg_supported:reads", to enable the ORR policy instance on the regular NRS + * head of the ost_io service to handle OST_READ RPCs. + * + * lctl set_param ost.OSS.ost_io.nrs_trr_supported=reads_and_writes, to enable + * the TRR policy instances on both the regular ang high priority NRS heads of + * the ost_io service to use handle OST_READ and OST_WRITE RPCs. + * + * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state are + * are skipped later by nrs_orr_ctl(). + */ +static ssize_t +ptlrpc_lprocfs_nrs_orr_supported_seq_write(struct file *file, + const char __user *buffer, + size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + enum ptlrpc_nrs_queue_type queue = 0; + char kernbuf[LPROCFS_NRS_WR_REQ_SUPP_MAX_CMD]; + char *val_reg; + char *val_hp; + enum nrs_orr_supp supp_reg; + enum nrs_orr_supp supp_hp; + size_t count_copy; + int rc = 0; + int rc2 = 0; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + count_copy = count; + + /** + * Check if the regular supported requests setting has been specified + */ + val_reg = lprocfs_find_named_value(kernbuf, + NRS_LPROCFS_REQ_SUPP_NAME_REG, + &count_copy); + if (val_reg != kernbuf) + queue |= PTLRPC_NRS_QUEUE_REG; + + count_copy = count; + + /** + * Check if the high priority supported requests setting has been + * specified + */ + val_hp = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_REQ_SUPP_NAME_HP, + &count_copy); + if (val_hp != kernbuf) { + if (!nrs_svc_has_hp(svc)) + return -ENODEV; + + queue |= PTLRPC_NRS_QUEUE_HP; + } + + /** + * If none of the queues has been specified, there may be a valid + * command string at the start of the buffer. 
+ */ + if (queue == 0) { + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) + queue |= PTLRPC_NRS_QUEUE_HP; + } + + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + supp_reg = nrs_orr_str2supp(val_reg); + if (supp_reg == -EINVAL) + return -EINVAL; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + supp_hp = nrs_orr_str2supp(val_hp); + if (supp_hp == -EINVAL) + return -EINVAL; + } + + /** + * We change the values on regular and HP NRS heads separately, so that + * we do not exit early from ptlrpc_nrs_policy_control() with an error + * returned by nrs_policy_ctl_locked(), in cases where the user has not + * started the policy on either the regular or HP NRS head; i.e. we are + * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned + * only if the operation fails with -ENODEV on all heads that have been + * specified by the command; if at least one operation succeeds, + * success is returned. + */ + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_WR_SUPP_REQ, false, + &supp_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + return rc; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, + NRS_CTL_ORR_WR_SUPP_REQ, false, + &supp_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + return rc2; + } + + return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_supported); + +static int nrs_orr_lprocfs_init(struct ptlrpc_service *svc) +{ + int i; + + struct ldebugfs_vars nrs_orr_lprocfs_vars[] = { + { .name = "nrs_orr_quantum", + .fops = &ptlrpc_lprocfs_nrs_orr_quantum_fops }, + { .name = "nrs_orr_offset_type", + .fops = &ptlrpc_lprocfs_nrs_orr_offset_type_fops }, + { .name = "nrs_orr_supported", + .fops = &ptlrpc_lprocfs_nrs_orr_supported_fops }, + { NULL } + }; + + if (!svc->srv_debugfs_entry) + return 0; + + lprocfs_orr_data.svc = svc; + + for (i = 0; i < ARRAY_SIZE(nrs_orr_lprocfs_vars); i++) + nrs_orr_lprocfs_vars[i].data = &lprocfs_orr_data; + + ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_orr_lprocfs_vars, NULL); + + return 0; +} + +static const struct ptlrpc_nrs_pol_ops nrs_orr_ops = { + .op_policy_init = nrs_orr_init, + .op_policy_start = nrs_orr_start, + .op_policy_stop = nrs_orr_stop, + .op_policy_ctl = nrs_orr_ctl, + .op_res_get = nrs_orr_res_get, + .op_res_put = nrs_orr_res_put, + .op_req_get = nrs_orr_req_get, + .op_req_enqueue = nrs_orr_req_add, + .op_req_dequeue = nrs_orr_req_del, + .op_req_stop = nrs_orr_req_stop, + .op_lprocfs_init = nrs_orr_lprocfs_init, +}; + +struct ptlrpc_nrs_pol_conf nrs_conf_orr = { + .nc_name = NRS_POL_NAME_ORR, + .nc_ops = &nrs_orr_ops, + .nc_compat = nrs_policy_compat_one, + .nc_compat_svc_name = "ost_io", +}; + +/** + * TRR, Target-based Round Robin policy + * + * TRR reuses much of the functions and data structures of ORR + */ +static int nrs_trr_lprocfs_init(struct ptlrpc_service *svc) +{ + int i; + + struct ldebugfs_vars nrs_trr_lprocfs_vars[] = { + { .name = "nrs_trr_quantum", + .fops = &ptlrpc_lprocfs_nrs_orr_quantum_fops }, + { .name = "nrs_trr_offset_type", + .fops = &ptlrpc_lprocfs_nrs_orr_offset_type_fops }, + { .name = "nrs_trr_supported", + .fops = &ptlrpc_lprocfs_nrs_orr_supported_fops }, + { NULL } + }; + + if (!svc->srv_debugfs_entry) + return 0; + + lprocfs_trr_data.svc = svc; + + for (i = 0; i < 
ARRAY_SIZE(nrs_trr_lprocfs_vars); i++) + nrs_trr_lprocfs_vars[i].data = &lprocfs_trr_data; + + ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_trr_lprocfs_vars, NULL); + + return 0; +} + +/** + * Reuse much of the ORR functionality for TRR. + */ +static const struct ptlrpc_nrs_pol_ops nrs_trr_ops = { + .op_policy_init = nrs_orr_init, + .op_policy_start = nrs_orr_start, + .op_policy_stop = nrs_orr_stop, + .op_policy_ctl = nrs_orr_ctl, + .op_res_get = nrs_orr_res_get, + .op_res_put = nrs_orr_res_put, + .op_req_get = nrs_orr_req_get, + .op_req_enqueue = nrs_orr_req_add, + .op_req_dequeue = nrs_orr_req_del, + .op_req_stop = nrs_orr_req_stop, + .op_lprocfs_init = nrs_trr_lprocfs_init, +}; + +struct ptlrpc_nrs_pol_conf nrs_conf_trr = { + .nc_name = NRS_POL_NAME_TRR, + .nc_ops = &nrs_trr_ops, + .nc_compat = nrs_policy_compat_one, + .nc_compat_svc_name = "ost_io", +}; + +/** @} ORR/TRR policy */ + +/** @} nrs */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c new file mode 100644 index 0000000000000..50b983e739d8a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c @@ -0,0 +1,3712 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013 DataDirect Networks, Inc. + * + * Copyright (c) 2014, 2016, Intel Corporation. 
+ */ +/* + * lustre/ptlrpc/nrs_tbf.c + * + * Network Request Scheduler (NRS) Token Bucket Filter(TBF) policy + * + */ + +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name tbf + * + * Token Bucket Filter over client NIDs + * + * @{ + */ + +#define NRS_POL_NAME_TBF "tbf" + +static int tbf_jobid_cache_size = 8192; +module_param(tbf_jobid_cache_size, int, 0644); +MODULE_PARM_DESC(tbf_jobid_cache_size, "The size of jobid cache"); + +static int tbf_rate = 10000; +module_param(tbf_rate, int, 0644); +MODULE_PARM_DESC(tbf_rate, "Default rate limit in RPCs/s"); + +static int tbf_depth = 3; +module_param(tbf_depth, int, 0644); +MODULE_PARM_DESC(tbf_depth, "How many tokens that a client can save up"); + +static enum hrtimer_restart nrs_tbf_timer_cb(struct hrtimer *timer) +{ + struct nrs_tbf_head *head = container_of(timer, struct nrs_tbf_head, + th_timer); + struct ptlrpc_nrs *nrs = head->th_res.res_policy->pol_nrs; + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + + nrs->nrs_throttling = 0; + wake_up(&svcpt->scp_waitq); + + return HRTIMER_NORESTART; +} + +#define NRS_TBF_DEFAULT_RULE "default" + +static void nrs_tbf_rule_fini(struct nrs_tbf_rule *rule) +{ + LASSERT(atomic_read(&rule->tr_ref) == 0); + LASSERT(list_empty(&rule->tr_cli_list)); + LASSERT(list_empty(&rule->tr_linkage)); + + rule->tr_head->th_ops->o_rule_fini(rule); + OBD_FREE_PTR(rule); +} + +/** + * Decreases the rule's usage reference count, and stops the rule in case it + * was already stopping and have no more outstanding usage references (which + * indicates it has no more queued or started requests, and can be safely + * stopped). + */ +static void nrs_tbf_rule_put(struct nrs_tbf_rule *rule) +{ + if (atomic_dec_and_test(&rule->tr_ref)) + nrs_tbf_rule_fini(rule); +} + +/** + * Increases the rule's usage reference count. 
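Editor's note: the module defaults above fix the token-bucket parameters a new client starts with. The standalone sketch below (not part of the patch) only prints the derived numbers; the policy itself keeps the equivalent per-client state in nrs_tbf_client and per-rule state in nrs_tbf_rule.

        #include <stdio.h>

        int main(void)
        {
                const unsigned long long NSEC_PER_SEC = 1000000000ULL;
                unsigned long long rate  = 10000;  /* default tbf_rate, RPCs/s */
                unsigned long long depth = 3;      /* default tbf_depth */

                /* one token is earned every NSEC_PER_SEC / rate nanoseconds,
                 * and a client may bank at most 'depth' unused tokens */
                printf("%llu ns per RPC, burst of up to %llu RPCs\n",
                       NSEC_PER_SEC / rate, depth);
                return 0;
        }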
+ */ +static inline void nrs_tbf_rule_get(struct nrs_tbf_rule *rule) +{ + atomic_inc(&rule->tr_ref); +} + +static void +nrs_tbf_cli_rule_put(struct nrs_tbf_client *cli) +{ + LASSERT(!list_empty(&cli->tc_linkage)); + LASSERT(cli->tc_rule); + spin_lock(&cli->tc_rule->tr_rule_lock); + list_del_init(&cli->tc_linkage); + spin_unlock(&cli->tc_rule->tr_rule_lock); + nrs_tbf_rule_put(cli->tc_rule); + cli->tc_rule = NULL; +} + +static void +nrs_tbf_cli_reset_value(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) + +{ + struct nrs_tbf_rule *rule = cli->tc_rule; + + cli->tc_rpc_rate = rule->tr_rpc_rate; + cli->tc_nsecs = rule->tr_nsecs_per_rpc; + cli->tc_depth = rule->tr_depth; + cli->tc_ntoken = rule->tr_depth; + cli->tc_check_time = ktime_to_ns(ktime_get()); + cli->tc_rule_sequence = atomic_read(&head->th_rule_sequence); + cli->tc_rule_generation = rule->tr_generation; + + if (cli->tc_in_heap) + binheap_relocate(head->th_binheap, + &cli->tc_node); +} + +static void +nrs_tbf_cli_reset(struct nrs_tbf_head *head, + struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + spin_lock(&cli->tc_rule_lock); + if (cli->tc_rule != NULL && !list_empty(&cli->tc_linkage)) { + LASSERT(rule != cli->tc_rule); + nrs_tbf_cli_rule_put(cli); + } + LASSERT(cli->tc_rule == NULL); + LASSERT(list_empty(&cli->tc_linkage)); + /* Rule's ref is added before called */ + cli->tc_rule = rule; + spin_lock(&rule->tr_rule_lock); + list_add_tail(&cli->tc_linkage, &rule->tr_cli_list); + spin_unlock(&rule->tr_rule_lock); + spin_unlock(&cli->tc_rule_lock); + nrs_tbf_cli_reset_value(head, cli); +} + +static int +nrs_tbf_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + return rule->tr_head->th_ops->o_rule_dump(rule, m); +} + +static int +nrs_tbf_rule_dump_all(struct nrs_tbf_head *head, struct seq_file *m) +{ + struct nrs_tbf_rule *rule; + int rc = 0; + + LASSERT(head != NULL); + spin_lock(&head->th_rule_lock); + /* List the rules from newest to oldest */ + list_for_each_entry(rule, &head->th_list, tr_linkage) { + LASSERT((rule->tr_flags & NTRS_STOPPING) == 0); + rc = nrs_tbf_rule_dump(rule, m); + if (rc) { + rc = -ENOSPC; + break; + } + } + spin_unlock(&head->th_rule_lock); + + return rc; +} + +static struct nrs_tbf_rule * +nrs_tbf_rule_find_nolock(struct nrs_tbf_head *head, + const char *name) +{ + struct nrs_tbf_rule *rule; + + LASSERT(head != NULL); + list_for_each_entry(rule, &head->th_list, tr_linkage) { + LASSERT((rule->tr_flags & NTRS_STOPPING) == 0); + if (strcmp(rule->tr_name, name) == 0) { + nrs_tbf_rule_get(rule); + return rule; + } + } + return NULL; +} + +static struct nrs_tbf_rule * +nrs_tbf_rule_find(struct nrs_tbf_head *head, + const char *name) +{ + struct nrs_tbf_rule *rule; + + LASSERT(head != NULL); + spin_lock(&head->th_rule_lock); + rule = nrs_tbf_rule_find_nolock(head, name); + spin_unlock(&head->th_rule_lock); + return rule; +} + +static struct nrs_tbf_rule * +nrs_tbf_rule_match(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + struct nrs_tbf_rule *rule = NULL; + struct nrs_tbf_rule *tmp_rule; + + spin_lock(&head->th_rule_lock); + /* Match the newest rule in the list */ + list_for_each_entry(tmp_rule, &head->th_list, tr_linkage) { + LASSERT((tmp_rule->tr_flags & NTRS_STOPPING) == 0); + if (head->th_ops->o_rule_match(tmp_rule, cli)) { + rule = tmp_rule; + break; + } + } + + if (rule == NULL) + rule = head->th_rule; + + nrs_tbf_rule_get(rule); + spin_unlock(&head->th_rule_lock); + return rule; +} + +static void +nrs_tbf_cli_init(struct nrs_tbf_head *head, + struct nrs_tbf_client 
*cli, + struct ptlrpc_request *req) +{ + struct nrs_tbf_rule *rule; + + memset(cli, 0, sizeof(*cli)); + cli->tc_in_heap = false; + head->th_ops->o_cli_init(cli, req); + INIT_LIST_HEAD(&cli->tc_list); + INIT_LIST_HEAD(&cli->tc_linkage); + spin_lock_init(&cli->tc_rule_lock); + atomic_set(&cli->tc_ref, 1); + rule = nrs_tbf_rule_match(head, cli); + nrs_tbf_cli_reset(head, rule, cli); +} + +static void +nrs_tbf_cli_fini(struct nrs_tbf_client *cli) +{ + LASSERT(list_empty(&cli->tc_list)); + LASSERT(!cli->tc_in_heap); + LASSERT(atomic_read(&cli->tc_ref) == 0); + spin_lock(&cli->tc_rule_lock); + nrs_tbf_cli_rule_put(cli); + spin_unlock(&cli->tc_rule_lock); + OBD_FREE_PTR(cli); +} + +static int +nrs_tbf_rule_start(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *start) +{ + struct nrs_tbf_rule *rule; + struct nrs_tbf_rule *tmp_rule; + struct nrs_tbf_rule *next_rule; + char *next_name = start->u.tc_start.ts_next_name; + int rc; + + rule = nrs_tbf_rule_find(head, start->tc_name); + if (rule) { + nrs_tbf_rule_put(rule); + return -EEXIST; + } + + OBD_CPT_ALLOC_PTR(rule, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (rule == NULL) + return -ENOMEM; + + strlcpy(rule->tr_name, start->tc_name, sizeof(rule->tr_name)); + rule->tr_rpc_rate = start->u.tc_start.ts_rpc_rate; + rule->tr_flags = start->u.tc_start.ts_rule_flags; + rule->tr_nsecs_per_rpc = NSEC_PER_SEC / rule->tr_rpc_rate; + rule->tr_depth = tbf_depth; + atomic_set(&rule->tr_ref, 1); + INIT_LIST_HEAD(&rule->tr_cli_list); + INIT_LIST_HEAD(&rule->tr_nids); + INIT_LIST_HEAD(&rule->tr_linkage); + spin_lock_init(&rule->tr_rule_lock); + rule->tr_head = head; + + rc = head->th_ops->o_rule_init(policy, rule, start); + if (rc) { + OBD_FREE_PTR(rule); + return rc; + } + + /* Add as the newest rule */ + spin_lock(&head->th_rule_lock); + tmp_rule = nrs_tbf_rule_find_nolock(head, start->tc_name); + if (tmp_rule) { + spin_unlock(&head->th_rule_lock); + nrs_tbf_rule_put(tmp_rule); + nrs_tbf_rule_put(rule); + return -EEXIST; + } + + if (next_name) { + next_rule = nrs_tbf_rule_find_nolock(head, next_name); + if (!next_rule) { + spin_unlock(&head->th_rule_lock); + nrs_tbf_rule_put(rule); + return -ENOENT; + } + + list_add(&rule->tr_linkage, next_rule->tr_linkage.prev); + nrs_tbf_rule_put(next_rule); + } else { + /* Add on the top of the rule list */ + list_add(&rule->tr_linkage, &head->th_list); + } + spin_unlock(&head->th_rule_lock); + atomic_inc(&head->th_rule_sequence); + if (start->u.tc_start.ts_rule_flags & NTRS_DEFAULT) { + rule->tr_flags |= NTRS_DEFAULT; + LASSERT(head->th_rule == NULL); + head->th_rule = rule; + } + + CDEBUG(D_RPCTRACE, "TBF starts rule@%p rate %llu gen %llu\n", + rule, rule->tr_rpc_rate, rule->tr_generation); + + return 0; +} + +/** + * Change the rank of a rule in the rule list + * + * The matched rule will be moved to the position right before another + * given rule. 
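Editor's note: nrs_tbf_rule_start() is normally reached from user space through the policy's nrs_tbf_rule parameter file. The commands below are purely illustrative; the parameter names follow the ost_io examples used elsewhere in this series, but the exact rule syntax (keyword rate= versus a positional rate, and how the TBF type is selected) differs between Lustre releases, so the matching manual should be consulted.

        lctl set_param ost.OSS.ost_io.nrs_policies="tbf"
        lctl set_param ost.OSS.ost_io.nrs_tbf_rule="start computes nid={192.168.1.[2-128]@tcp} rate=500"
        lctl set_param ost.OSS.ost_io.nrs_tbf_rule="start iozone_jobs jobid={iozone.500 dd.500} rate=100"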
+ * + * \param[in] policy the policy instance + * \param[in] head the TBF policy instance + * \param[in] name the rule name to be moved + * \param[in] next_name the rule name before which the matched rule will be + * moved + * + */ +static int +nrs_tbf_rule_change_rank(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + char *name, + char *next_name) +{ + struct nrs_tbf_rule *rule = NULL; + struct nrs_tbf_rule *next_rule = NULL; + int rc = 0; + + LASSERT(head != NULL); + + spin_lock(&head->th_rule_lock); + rule = nrs_tbf_rule_find_nolock(head, name); + if (!rule) + GOTO(out, rc = -ENOENT); + + if (strcmp(name, next_name) == 0) + GOTO(out_put, rc); + + next_rule = nrs_tbf_rule_find_nolock(head, next_name); + if (!next_rule) + GOTO(out_put, rc = -ENOENT); + + /* rules may be adjacent in same list, so list_move() isn't safe here */ + list_move_tail(&rule->tr_linkage, &next_rule->tr_linkage); + nrs_tbf_rule_put(next_rule); +out_put: + nrs_tbf_rule_put(rule); +out: + spin_unlock(&head->th_rule_lock); + return rc; +} + +static int +nrs_tbf_rule_change_rate(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + char *name, + __u64 rate) +{ + struct nrs_tbf_rule *rule; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + rule = nrs_tbf_rule_find(head, name); + if (rule == NULL) + return -ENOENT; + + rule->tr_rpc_rate = rate; + rule->tr_nsecs_per_rpc = NSEC_PER_SEC / rule->tr_rpc_rate; + rule->tr_generation++; + nrs_tbf_rule_put(rule); + + return 0; +} + +static int +nrs_tbf_rule_change(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *change) +{ + __u64 rate = change->u.tc_change.tc_rpc_rate; + char *next_name = change->u.tc_change.tc_next_name; + int rc; + + if (rate != 0) { + rc = nrs_tbf_rule_change_rate(policy, head, change->tc_name, + rate); + if (rc) + return rc; + } + + if (next_name) { + rc = nrs_tbf_rule_change_rank(policy, head, change->tc_name, + next_name); + if (rc) + return rc; + } + + return 0; +} + +static int +nrs_tbf_rule_stop(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *stop) +{ + struct nrs_tbf_rule *rule; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + if (strcmp(stop->tc_name, NRS_TBF_DEFAULT_RULE) == 0) + return -EPERM; + + rule = nrs_tbf_rule_find(head, stop->tc_name); + if (rule == NULL) + return -ENOENT; + + list_del_init(&rule->tr_linkage); + rule->tr_flags |= NTRS_STOPPING; + nrs_tbf_rule_put(rule); + nrs_tbf_rule_put(rule); + + return 0; +} + +static int +nrs_tbf_command(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *cmd) +{ + int rc; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch (cmd->tc_cmd) { + case NRS_CTL_TBF_START_RULE: + if (cmd->u.tc_start.ts_valid_type != head->th_type_flag) + return -EINVAL; + + spin_unlock(&policy->pol_nrs->nrs_lock); + rc = nrs_tbf_rule_start(policy, head, cmd); + spin_lock(&policy->pol_nrs->nrs_lock); + return rc; + case NRS_CTL_TBF_CHANGE_RULE: + rc = nrs_tbf_rule_change(policy, head, cmd); + return rc; + case NRS_CTL_TBF_STOP_RULE: + rc = nrs_tbf_rule_stop(policy, head, cmd); + /* Take it as a success, if not exists at all */ + return rc == -ENOENT ? 0 : rc; + default: + return -EFAULT; + } +} + +/** + * Binary heap predicate. 
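Editor's note: nrs_tbf_rule_change() and nrs_tbf_rule_stop() back the "change" and "stop" keywords of the same interface. Two behaviours worth noting from the code above: the built-in "default" rule cannot be stopped (-EPERM), and stopping a rule that does not exist is reported as success. Illustrative commands, with the same caveat about per-release syntax:

        lctl set_param ost.OSS.ost_io.nrs_tbf_rule="change computes rate=1000"
        lctl set_param ost.OSS.ost_io.nrs_tbf_rule="stop computes"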
+ * + * \param[in] e1 the first binheap node to compare + * \param[in] e2 the second binheap node to compare + * + * \retval 0 e1 > e2 + * \retval 1 e1 < e2 + */ +static int +tbf_cli_compare(struct binheap_node *e1, struct binheap_node *e2) +{ + struct nrs_tbf_client *cli1; + struct nrs_tbf_client *cli2; + + cli1 = container_of(e1, struct nrs_tbf_client, tc_node); + cli2 = container_of(e2, struct nrs_tbf_client, tc_node); + + if (cli1->tc_deadline < cli2->tc_deadline) + return 1; + else if (cli1->tc_deadline > cli2->tc_deadline) + return 0; + + if (cli1->tc_check_time < cli2->tc_check_time) + return 1; + else if (cli1->tc_check_time > cli2->tc_check_time) + return 0; + + /* Maybe need more comparasion, e.g. request number in the rules */ + return 1; +} + +/** + * TBF binary heap operations + */ +static struct binheap_ops nrs_tbf_heap_ops = { + .hop_enter = NULL, + .hop_exit = NULL, + .hop_compare = tbf_cli_compare, +}; + +static unsigned nrs_tbf_jobid_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, strlen(key), mask); +} + +static int nrs_tbf_jobid_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return (strcmp(cli->tc_jobid, key) == 0); +} + +static void *nrs_tbf_jobid_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return cli->tc_jobid; +} + +static void *nrs_tbf_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); +} + +static void nrs_tbf_jobid_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_jobid_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void +nrs_tbf_jobid_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) + +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERT(atomic_read(&cli->tc_ref) == 0); + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_jobid_hash_ops = { + .hs_hash = nrs_tbf_jobid_hop_hash, + .hs_keycmp = nrs_tbf_jobid_hop_keycmp, + .hs_key = nrs_tbf_jobid_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_jobid_hop_get, + .hs_put = nrs_tbf_jobid_hop_put, + .hs_put_locked = nrs_tbf_jobid_hop_put, + .hs_exit = nrs_tbf_jobid_hop_exit, +}; + +#define NRS_TBF_JOBID_HASH_FLAGS (CFS_HASH_SPIN_BKTLOCK | \ + CFS_HASH_NO_ITEMREF | \ + CFS_HASH_DEPTH) + +static struct nrs_tbf_client * +nrs_tbf_jobid_hash_lookup(struct cfs_hash *hs, + struct cfs_hash_bd *bd, + const char *jobid) +{ + struct hlist_node *hnode; + struct nrs_tbf_client *cli; + + hnode = cfs_hash_bd_lookup_locked(hs, bd, (void *)jobid); + if (hnode == NULL) + return NULL; + + cli = container_of(hnode, struct nrs_tbf_client, tc_hnode); + if (!list_empty(&cli->tc_lru)) + list_del_init(&cli->tc_lru); + return cli; +} + +#define NRS_TBF_JOBID_NULL "" + +static struct nrs_tbf_client * +nrs_tbf_jobid_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + const char *jobid; + struct nrs_tbf_client *cli; + struct cfs_hash *hs = head->th_cli_hash; + struct cfs_hash_bd bd; + + jobid = lustre_msg_get_jobid(req->rq_reqmsg); + if (jobid == NULL) + jobid = 
NRS_TBF_JOBID_NULL; + cfs_hash_bd_get_and_lock(hs, (void *)jobid, &bd, 1); + cli = nrs_tbf_jobid_hash_lookup(hs, &bd, jobid); + cfs_hash_bd_unlock(hs, &bd, 1); + + return cli; +} + +static struct nrs_tbf_client * +nrs_tbf_jobid_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + const char *jobid; + struct nrs_tbf_client *ret; + struct cfs_hash *hs = head->th_cli_hash; + struct cfs_hash_bd bd; + + jobid = cli->tc_jobid; + cfs_hash_bd_get_and_lock(hs, (void *)jobid, &bd, 1); + ret = nrs_tbf_jobid_hash_lookup(hs, &bd, jobid); + if (ret == NULL) { + cfs_hash_bd_add_locked(hs, &bd, &cli->tc_hnode); + ret = cli; + } + cfs_hash_bd_unlock(hs, &bd, 1); + + return ret; +} + +static void +nrs_tbf_jobid_cli_put(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + struct cfs_hash_bd bd; + struct cfs_hash *hs = head->th_cli_hash; + struct nrs_tbf_bucket *bkt; + int hw; + LIST_HEAD(zombies); + + cfs_hash_bd_get(hs, &cli->tc_jobid, &bd); + bkt = cfs_hash_bd_extra_get(hs, &bd); + if (!cfs_hash_bd_dec_and_lock(hs, &bd, &cli->tc_ref)) + return; + LASSERT(list_empty(&cli->tc_lru)); + list_add_tail(&cli->tc_lru, &bkt->ntb_lru); + + /* + * Check and purge the LRU, there is at least one client in the LRU. + */ + hw = tbf_jobid_cache_size >> + (hs->hs_cur_bits - hs->hs_bkt_bits); + while (cfs_hash_bd_count_get(&bd) > hw) { + if (unlikely(list_empty(&bkt->ntb_lru))) + break; + cli = list_entry(bkt->ntb_lru.next, + struct nrs_tbf_client, + tc_lru); + LASSERT(atomic_read(&cli->tc_ref) == 0); + cfs_hash_bd_del_locked(hs, &bd, &cli->tc_hnode); + list_move(&cli->tc_lru, &zombies); + } + cfs_hash_bd_unlock(head->th_cli_hash, &bd, 1); + + while (!list_empty(&zombies)) { + cli = container_of(zombies.next, + struct nrs_tbf_client, tc_lru); + list_del_init(&cli->tc_lru); + nrs_tbf_cli_fini(cli); + } +} + +static void +nrs_tbf_jobid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + char *jobid = lustre_msg_get_jobid(req->rq_reqmsg); + + if (jobid == NULL) + jobid = NRS_TBF_JOBID_NULL; + LASSERT(strlen(jobid) < LUSTRE_JOBID_SIZE); + INIT_LIST_HEAD(&cli->tc_lru); + memcpy(cli->tc_jobid, jobid, strlen(jobid)); +} + +static int nrs_tbf_jobid_hash_order(void) +{ + int bits; + + for (bits = 1; (1 << bits) < tbf_jobid_cache_size; ++bits) + ; + + return bits; +} + +#define NRS_TBF_JOBID_BKT_BITS 10 + +static int +nrs_tbf_jobid_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + struct nrs_tbf_bucket *bkt; + int bits; + int i; + int rc; + struct cfs_hash_bd bd; + + bits = nrs_tbf_jobid_hash_order(); + if (bits < NRS_TBF_JOBID_BKT_BITS) + bits = NRS_TBF_JOBID_BKT_BITS; + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + bits, + bits, + NRS_TBF_JOBID_BKT_BITS, + sizeof(*bkt), + 0, + 0, + &nrs_tbf_jobid_hash_ops, + NRS_TBF_JOBID_HASH_FLAGS); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + cfs_hash_for_each_bucket(head->th_cli_hash, &bd, i) { + bkt = cfs_hash_bd_extra_get(head->th_cli_hash, &bd); + INIT_LIST_HEAD(&bkt->ntb_lru); + } + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_jobids_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_jobids); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) { + cfs_hash_putref(head->th_cli_hash); + head->th_cli_hash = NULL; + } + + return rc; +} + +/** + * Frees jobid of \a list. 
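Editor's note: a standalone sanity check (not part of the patch) of the sizing logic used by nrs_tbf_jobid_hash_order() and of the per-bucket LRU high-water mark computed in nrs_tbf_jobid_cli_put(), assuming the default tbf_jobid_cache_size of 8192 and NRS_TBF_JOBID_BKT_BITS of 10.

        #include <stdio.h>

        int main(void)
        {
                int cache_size = 8192;  /* default tbf_jobid_cache_size */
                int bkt_bits   = 10;    /* NRS_TBF_JOBID_BKT_BITS */
                int bits, hw;

                /* smallest 'bits' with (1 << bits) >= cache_size: 8192 -> 13 */
                for (bits = 1; (1 << bits) < cache_size; ++bits)
                        ;

                /* idle clients are purged once a bucket holds more than this:
                 * 8192 >> (13 - 10) = 1024 */
                hw = cache_size >> (bits - bkt_bits);

                printf("hash order %d, %d buckets, high water %d per bucket\n",
                       bits, 1 << (bits - bkt_bits), hw);
                return 0;
        }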
+ * + */ +static void +nrs_tbf_jobid_list_free(struct list_head *jobid_list) +{ + struct nrs_tbf_jobid *jobid, *n; + + list_for_each_entry_safe(jobid, n, jobid_list, tj_linkage) { + OBD_FREE(jobid->tj_id, strlen(jobid->tj_id) + 1); + list_del(&jobid->tj_linkage); + OBD_FREE_PTR(jobid); + } +} + +static int +nrs_tbf_jobid_list_add(struct cfs_lstr *id, struct list_head *jobid_list) +{ + struct nrs_tbf_jobid *jobid; + char *ptr; + + OBD_ALLOC_PTR(jobid); + if (jobid == NULL) + return -ENOMEM; + + OBD_ALLOC(jobid->tj_id, id->ls_len + 1); + if (jobid->tj_id == NULL) { + OBD_FREE_PTR(jobid); + return -ENOMEM; + } + + memcpy(jobid->tj_id, id->ls_str, id->ls_len); + ptr = lprocfs_strnstr(id->ls_str, "*", id->ls_len); + if (ptr == NULL) + jobid->tj_match_flag = NRS_TBF_MATCH_FULL; + else + jobid->tj_match_flag = NRS_TBF_MATCH_WILDCARD; + + list_add_tail(&jobid->tj_linkage, jobid_list); + return 0; +} + +static bool +cfs_match_wildcard(const char *pattern, const char *content) +{ + if (*pattern == '\0' && *content == '\0') + return true; + + if (*pattern == '*' && *(pattern + 1) != '\0' && *content == '\0') + return false; + + while (*pattern == *content) { + pattern++; + content++; + if (*pattern == '\0' && *content == '\0') + return true; + + if (*pattern == '*' && *(pattern + 1) != '\0' && + *content == '\0') + return false; + } + + if (*pattern == '*') + return (cfs_match_wildcard(pattern + 1, content) || + cfs_match_wildcard(pattern, content + 1)); + + return false; +} + +static inline bool +nrs_tbf_jobid_match(const struct nrs_tbf_jobid *jobid, const char *id) +{ + if (jobid->tj_match_flag == NRS_TBF_MATCH_FULL) + return strcmp(jobid->tj_id, id) == 0; + + if (jobid->tj_match_flag == NRS_TBF_MATCH_WILDCARD) + return cfs_match_wildcard(jobid->tj_id, id); + + return false; +} + +static int +nrs_tbf_jobid_list_match(struct list_head *jobid_list, char *id) +{ + struct nrs_tbf_jobid *jobid; + + list_for_each_entry(jobid, jobid_list, tj_linkage) { + if (nrs_tbf_jobid_match(jobid, id)) + return 1; + } + return 0; +} + +static int +nrs_tbf_jobid_list_parse(char *str, int len, struct list_head *jobid_list) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + ENTRY; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(jobid_list); + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_jobid_list_add(&res, jobid_list); + if (rc) + break; + } + if (rc) + nrs_tbf_jobid_list_free(jobid_list); + RETURN(rc); +} + +static void nrs_tbf_jobid_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (!list_empty(&cmd->u.tc_start.ts_jobids)) + nrs_tbf_jobid_list_free(&cmd->u.tc_start.ts_jobids); + if (cmd->u.tc_start.ts_jobids_str) + OBD_FREE(cmd->u.tc_start.ts_jobids_str, + strlen(cmd->u.tc_start.ts_jobids_str) + 1); +} + +static int nrs_tbf_check_id_value(struct cfs_lstr *src, char *key) +{ + struct cfs_lstr res; + int keylen = strlen(key); + int rc; + + rc = cfs_gettok(src, '=', &res); + if (rc == 0 || res.ls_len != keylen || + strncmp(res.ls_str, key, keylen) != 0 || + !src->ls_str || src->ls_len <= 2 || + src->ls_str[0] != '{' || src->ls_str[src->ls_len - 1] != '}') + return -EINVAL; + + /* Skip '{' and '}' */ + src->ls_str++; + src->ls_len -= 2; + return 0; +} + +static int nrs_tbf_jobid_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + + src.ls_str = id; + src.ls_len = strlen(id); + rc = nrs_tbf_check_id_value(&src, "jobid"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_jobids_str, src.ls_len + 1); + 
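Editor's note: cfs_match_wildcard() above implements plain '*' globbing by recursion. The standalone copy below (a sketch for experimenting outside the kernel, not part of the patch) shows the jobid matching behaviour a wildcard rule relies on.

        #include <stdbool.h>
        #include <stdio.h>

        static bool match_wildcard(const char *pattern, const char *content)
        {
                if (*pattern == '\0' && *content == '\0')
                        return true;
                if (*pattern == '*' && *(pattern + 1) != '\0' && *content == '\0')
                        return false;
                while (*pattern == *content) {
                        pattern++;
                        content++;
                        if (*pattern == '\0' && *content == '\0')
                                return true;
                        if (*pattern == '*' && *(pattern + 1) != '\0' &&
                            *content == '\0')
                                return false;
                }
                if (*pattern == '*')
                        return match_wildcard(pattern + 1, content) ||
                               match_wildcard(pattern, content + 1);
                return false;
        }

        int main(void)
        {
                printf("%d %d %d\n",
                       match_wildcard("dd.*", "dd.500"),       /* 1 */
                       match_wildcard("*", "iozone.1000"),     /* 1 */
                       match_wildcard("dd.*", "iozone.500"));  /* 0 */
                return 0;
        }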
if (cmd->u.tc_start.ts_jobids_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_jobids_str, src.ls_str, src.ls_len); + + /* parse jobid list */ + rc = nrs_tbf_jobid_list_parse(cmd->u.tc_start.ts_jobids_str, + strlen(cmd->u.tc_start.ts_jobids_str), + &cmd->u.tc_start.ts_jobids); + if (rc) + nrs_tbf_jobid_cmd_fini(cmd); + + return rc; +} + +static int nrs_tbf_jobid_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + int rc = 0; + + LASSERT(start->u.tc_start.ts_jobids_str); + OBD_ALLOC(rule->tr_jobids_str, + strlen(start->u.tc_start.ts_jobids_str) + 1); + if (rule->tr_jobids_str == NULL) + return -ENOMEM; + + memcpy(rule->tr_jobids_str, + start->u.tc_start.ts_jobids_str, + strlen(start->u.tc_start.ts_jobids_str)); + + INIT_LIST_HEAD(&rule->tr_jobids); + if (!list_empty(&start->u.tc_start.ts_jobids)) { + rc = nrs_tbf_jobid_list_parse(rule->tr_jobids_str, + strlen(rule->tr_jobids_str), + &rule->tr_jobids); + if (rc) + CERROR("jobids {%s} illegal\n", rule->tr_jobids_str); + } + if (rc) + OBD_FREE(rule->tr_jobids_str, + strlen(start->u.tc_start.ts_jobids_str) + 1); + return rc; +} + +static int +nrs_tbf_jobid_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_jobids_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static int +nrs_tbf_jobid_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return nrs_tbf_jobid_list_match(&rule->tr_jobids, cli->tc_jobid); +} + +static void nrs_tbf_jobid_rule_fini(struct nrs_tbf_rule *rule) +{ + if (!list_empty(&rule->tr_jobids)) + nrs_tbf_jobid_list_free(&rule->tr_jobids); + LASSERT(rule->tr_jobids_str != NULL); + OBD_FREE(rule->tr_jobids_str, strlen(rule->tr_jobids_str) + 1); +} + +static struct nrs_tbf_ops nrs_tbf_jobid_ops = { + .o_name = NRS_TBF_TYPE_JOBID, + .o_startup = nrs_tbf_jobid_startup, + .o_cli_find = nrs_tbf_jobid_cli_find, + .o_cli_findadd = nrs_tbf_jobid_cli_findadd, + .o_cli_put = nrs_tbf_jobid_cli_put, + .o_cli_init = nrs_tbf_jobid_cli_init, + .o_rule_init = nrs_tbf_jobid_rule_init, + .o_rule_dump = nrs_tbf_jobid_rule_dump, + .o_rule_match = nrs_tbf_jobid_rule_match, + .o_rule_fini = nrs_tbf_jobid_rule_fini, +}; + +/** + * libcfs_hash operations for nrs_tbf_net::cn_cli_hash + * + * This uses ptlrpc_request::rq_peer.nid as its key, in order to hash + * nrs_tbf_client objects. 
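Editor's note: given the format string in nrs_tbf_jobid_rule_dump(), reading the rules back (for example via lctl get_param on the service's nrs_tbf_rule entry; the exact path is release-dependent) produces one line per active rule, along the lines of the made-up output below.

        default {*} 10000, ref 0
        iozone_jobs {iozone.500 dd.500} 100, ref 2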
+ */ +#define NRS_TBF_NID_BKT_BITS 8 +#define NRS_TBF_NID_BITS 16 + +static unsigned nrs_tbf_nid_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask); +} + +static int nrs_tbf_nid_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + lnet_nid_t *nid = (lnet_nid_t *)key; + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return *nid == cli->tc_nid; +} + +static void *nrs_tbf_nid_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return &cli->tc_nid; +} + +static void nrs_tbf_nid_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_nid_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void nrs_tbf_nid_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERTF(atomic_read(&cli->tc_ref) == 0, + "Busy TBF object from client with NID %s, with %d refs\n", + libcfs_nid2str(cli->tc_nid), atomic_read(&cli->tc_ref)); + + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_nid_hash_ops = { + .hs_hash = nrs_tbf_nid_hop_hash, + .hs_keycmp = nrs_tbf_nid_hop_keycmp, + .hs_key = nrs_tbf_nid_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_nid_hop_get, + .hs_put = nrs_tbf_nid_hop_put, + .hs_put_locked = nrs_tbf_nid_hop_put, + .hs_exit = nrs_tbf_nid_hop_exit, +}; + +static struct nrs_tbf_client * +nrs_tbf_nid_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + return cfs_hash_lookup(head->th_cli_hash, &req->rq_peer.nid); +} + +static struct nrs_tbf_client * +nrs_tbf_nid_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + return cfs_hash_findadd_unique(head->th_cli_hash, &cli->tc_nid, + &cli->tc_hnode); +} + +static void +nrs_tbf_nid_cli_put(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + cfs_hash_put(head->th_cli_hash, &cli->tc_hnode); +} + +static int +nrs_tbf_nid_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + int rc; + + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + NRS_TBF_NID_BITS, + NRS_TBF_NID_BITS, + NRS_TBF_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_tbf_nid_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_nids_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_nids); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) { + cfs_hash_putref(head->th_cli_hash); + head->th_cli_hash = NULL; + } + + return rc; +} + +static void +nrs_tbf_nid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + cli->tc_nid = req->rq_peer.nid; +} + +static int nrs_tbf_nid_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + LASSERT(start->u.tc_start.ts_nids_str); + OBD_ALLOC(rule->tr_nids_str, + strlen(start->u.tc_start.ts_nids_str) + 1); + if (rule->tr_nids_str == NULL) + return 
-ENOMEM; + + memcpy(rule->tr_nids_str, + start->u.tc_start.ts_nids_str, + strlen(start->u.tc_start.ts_nids_str)); + + INIT_LIST_HEAD(&rule->tr_nids); + if (!list_empty(&start->u.tc_start.ts_nids)) { + if (cfs_parse_nidlist(rule->tr_nids_str, + strlen(rule->tr_nids_str), + &rule->tr_nids) <= 0) { + CERROR("nids {%s} illegal\n", + rule->tr_nids_str); + OBD_FREE(rule->tr_nids_str, + strlen(start->u.tc_start.ts_nids_str) + 1); + return -EINVAL; + } + } + return 0; +} + +static int +nrs_tbf_nid_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_nids_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static int +nrs_tbf_nid_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return cfs_match_nid(cli->tc_nid, &rule->tr_nids); +} + +static void nrs_tbf_nid_rule_fini(struct nrs_tbf_rule *rule) +{ + if (!list_empty(&rule->tr_nids)) + cfs_free_nidlist(&rule->tr_nids); + LASSERT(rule->tr_nids_str != NULL); + OBD_FREE(rule->tr_nids_str, strlen(rule->tr_nids_str) + 1); +} + +static void nrs_tbf_nid_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (!list_empty(&cmd->u.tc_start.ts_nids)) + cfs_free_nidlist(&cmd->u.tc_start.ts_nids); + if (cmd->u.tc_start.ts_nids_str) + OBD_FREE(cmd->u.tc_start.ts_nids_str, + strlen(cmd->u.tc_start.ts_nids_str) + 1); +} + +static int nrs_tbf_nid_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + + src.ls_str = id; + src.ls_len = strlen(id); + rc = nrs_tbf_check_id_value(&src, "nid"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_nids_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_nids_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_nids_str, src.ls_str, src.ls_len); + + /* parse NID list */ + if (cfs_parse_nidlist(cmd->u.tc_start.ts_nids_str, + strlen(cmd->u.tc_start.ts_nids_str), + &cmd->u.tc_start.ts_nids) <= 0) { + nrs_tbf_nid_cmd_fini(cmd); + return -EINVAL; + } + + return 0; +} + +static struct nrs_tbf_ops nrs_tbf_nid_ops = { + .o_name = NRS_TBF_TYPE_NID, + .o_startup = nrs_tbf_nid_startup, + .o_cli_find = nrs_tbf_nid_cli_find, + .o_cli_findadd = nrs_tbf_nid_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_nid_cli_init, + .o_rule_init = nrs_tbf_nid_rule_init, + .o_rule_dump = nrs_tbf_nid_rule_dump, + .o_rule_match = nrs_tbf_nid_rule_match, + .o_rule_fini = nrs_tbf_nid_rule_fini, +}; + +static unsigned nrs_tbf_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, strlen(key), mask); +} + +static int nrs_tbf_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return (strcmp(cli->tc_key, key) == 0); +} + +static void *nrs_tbf_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + return cli->tc_key; +} + +static void nrs_tbf_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void nrs_tbf_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) + +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + 
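Editor's note: NID-based rules hand their value list to cfs_parse_nidlist(), so they accept the usual LNet NID-list notation, including ranges and wildcards on the address part. The values below are made up and only illustrate the shape of the syntax.

        nid={192.168.0.[1-100]@tcp}
        nid={10.0.0.2@o2ib 10.0.0.3@o2ib}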
LASSERT(atomic_read(&cli->tc_ref) == 0); + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_hash_ops = { + .hs_hash = nrs_tbf_hop_hash, + .hs_keycmp = nrs_tbf_hop_keycmp, + .hs_key = nrs_tbf_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_hop_get, + .hs_put = nrs_tbf_hop_put, + .hs_put_locked = nrs_tbf_hop_put, + .hs_exit = nrs_tbf_hop_exit, +}; + +#define NRS_TBF_GENERIC_BKT_BITS 10 +#define NRS_TBF_GENERIC_HASH_FLAGS (CFS_HASH_SPIN_BKTLOCK | \ + CFS_HASH_NO_ITEMREF | \ + CFS_HASH_DEPTH) + +static int +nrs_tbf_startup(struct ptlrpc_nrs_policy *policy, struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + struct nrs_tbf_bucket *bkt; + int bits; + int i; + int rc; + struct cfs_hash_bd bd; + + bits = nrs_tbf_jobid_hash_order(); + if (bits < NRS_TBF_GENERIC_BKT_BITS) + bits = NRS_TBF_GENERIC_BKT_BITS; + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + bits, bits, + NRS_TBF_GENERIC_BKT_BITS, + sizeof(*bkt), 0, 0, + &nrs_tbf_hash_ops, + NRS_TBF_GENERIC_HASH_FLAGS); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + cfs_hash_for_each_bucket(head->th_cli_hash, &bd, i) { + bkt = cfs_hash_bd_extra_get(head->th_cli_hash, &bd); + INIT_LIST_HEAD(&bkt->ntb_lru); + } + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_conds_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_conds); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) + cfs_hash_putref(head->th_cli_hash); + + return rc; +} + +static struct nrs_tbf_client * +nrs_tbf_cli_hash_lookup(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const char *key) +{ + struct hlist_node *hnode; + struct nrs_tbf_client *cli; + + hnode = cfs_hash_bd_lookup_locked(hs, bd, (void *)key); + if (hnode == NULL) + return NULL; + + cli = container_of(hnode, struct nrs_tbf_client, tc_hnode); + if (!list_empty(&cli->tc_lru)) + list_del_init(&cli->tc_lru); + return cli; +} + +/** + * ONLY opcode presented in this function will be checked in + * nrs_tbf_id_cli_set(). That means, we can add or remove an + * opcode to enable or disable requests handled in nrs_tbf + */ +static struct req_format *req_fmt(__u32 opcode) +{ + switch (opcode) { + case OST_GETATTR: + return &RQF_OST_GETATTR; + case OST_SETATTR: + return &RQF_OST_SETATTR; + case OST_READ: + return &RQF_OST_BRW_READ; + case OST_WRITE: + return &RQF_OST_BRW_WRITE; + /* FIXME: OST_CREATE and OST_DESTROY comes from MDS + * in most case. Should they be removed? 
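Editor's note: the "generic" TBF type started here matches each request against condition strings handled by nrs_tbf_conds_parse() further below: ',' separates alternative conjunctions, '&' joins expressions within one conjunction, and every expression has the form field={values} with field one of nid, jobid, opcode, uid or gid. The made-up condition below therefore matches requests that are either (jobid dd.500 and an OST write) or issued by uid 500.

        jobid={dd.500}&opcode={ost_write},uid={500}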
*/ + case OST_CREATE: + return &RQF_OST_CREATE; + case OST_DESTROY: + return &RQF_OST_DESTROY; + case OST_PUNCH: + return &RQF_OST_PUNCH; + case OST_SYNC: + return &RQF_OST_SYNC; + case OST_LADVISE: + return &RQF_OST_LADVISE; + case MDS_GETATTR: + return &RQF_MDS_GETATTR; + case MDS_GETATTR_NAME: + return &RQF_MDS_GETATTR_NAME; + /* close is skipped to avoid LDLM cancel slowness */ +#if 0 + case MDS_CLOSE: + return &RQF_MDS_CLOSE; +#endif + case MDS_REINT: + return &RQF_MDS_REINT; + case MDS_READPAGE: + return &RQF_MDS_READPAGE; + case MDS_GET_ROOT: + return &RQF_MDS_GET_ROOT; + case MDS_STATFS: + return &RQF_MDS_STATFS; + case MDS_SYNC: + return &RQF_MDS_SYNC; + case MDS_QUOTACTL: + return &RQF_MDS_QUOTACTL; + case MDS_GETXATTR: + return &RQF_MDS_GETXATTR; + case MDS_GET_INFO: + return &RQF_MDS_GET_INFO; + /* HSM op is skipped */ +#if 0 + case MDS_HSM_STATE_GET: + return &RQF_MDS_HSM_STATE_GET; + case MDS_HSM_STATE_SET: + return &RQF_MDS_HSM_STATE_SET; + case MDS_HSM_ACTION: + return &RQF_MDS_HSM_ACTION; + case MDS_HSM_CT_REGISTER: + return &RQF_MDS_HSM_CT_REGISTER; + case MDS_HSM_CT_UNREGISTER: + return &RQF_MDS_HSM_CT_UNREGISTER; +#endif + case MDS_SWAP_LAYOUTS: + return &RQF_MDS_SWAP_LAYOUTS; + case LDLM_ENQUEUE: + return &RQF_LDLM_ENQUEUE; + default: + return NULL; + } +} + +static struct req_format *intent_req_fmt(__u32 it_opc) +{ + if (it_opc & (IT_OPEN | IT_CREAT)) + return &RQF_LDLM_INTENT_OPEN; + else if (it_opc & (IT_GETATTR | IT_LOOKUP)) + return &RQF_LDLM_INTENT_GETATTR; + else if (it_opc & IT_GETXATTR) + return &RQF_LDLM_INTENT_GETXATTR; + else if (it_opc & (IT_GLIMPSE | IT_BRW)) + return &RQF_LDLM_INTENT; + else + return NULL; +} + +static int ost_tbf_id_cli_set(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct ost_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + if (body != NULL) { + id->ti_uid = body->oa.o_uid; + id->ti_gid = body->oa.o_gid; + return 0; + } + + return -EINVAL; +} + +static void unpack_ugid_from_mdt_body(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + LASSERT(b != NULL); + + /* TODO: nodemaping feature converts {ug}id from individual + * clients to the actual ones of the file system. Some work + * may be needed to fix this. */ + id->ti_uid = b->mbo_uid; + id->ti_gid = b->mbo_gid; +} + +static void unpack_ugid_from_mdt_rec_reint(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct mdt_rec_reint *rec; + + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT(rec != NULL); + + /* use the fs{ug}id as {ug}id of the process */ + id->ti_uid = rec->rr_fsuid; + id->ti_gid = rec->rr_fsgid; +} + +static int mdt_tbf_id_cli_set(struct ptlrpc_request *req, + struct tbf_id *id) +{ + u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + int rc = 0; + + switch (opc) { + case MDS_GETATTR: + case MDS_GETATTR_NAME: + case MDS_GET_ROOT: + case MDS_READPAGE: + case MDS_SYNC: + case MDS_GETXATTR: + case MDS_HSM_STATE_GET ... 
MDS_SWAP_LAYOUTS: + unpack_ugid_from_mdt_body(req, id); + break; + case MDS_CLOSE: + case MDS_REINT: + unpack_ugid_from_mdt_rec_reint(req, id); + break; + default: + rc = -EINVAL; + break; + } + return rc; +} + +static int ldlm_tbf_id_cli_set(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct ldlm_intent *lit; + struct req_format *fmt; + + if (req->rq_reqmsg->lm_bufcount <= DLM_INTENT_IT_OFF) + return -EINVAL; + + req_capsule_extend(&req->rq_pill, &RQF_LDLM_INTENT_BASIC); + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + if (lit == NULL) + return -EINVAL; + + fmt = intent_req_fmt(lit->opc); + if (fmt == NULL) + return -EINVAL; + + req_capsule_extend(&req->rq_pill, fmt); + + if (lit->opc & (IT_GETXATTR | IT_GETATTR | IT_LOOKUP)) + unpack_ugid_from_mdt_body(req, id); + else if (lit->opc & (IT_OPEN | IT_OPEN | IT_GLIMPSE | IT_BRW)) + unpack_ugid_from_mdt_rec_reint(req, id); + else + return -EINVAL; + return 0; +} + +static int nrs_tbf_id_cli_set(struct ptlrpc_request *req, struct tbf_id *id, + enum nrs_tbf_flag ti_type) +{ + u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + struct req_format *fmt = req_fmt(opc); + bool fmt_unset = false; + int rc; + + memset(id, 0, sizeof(struct tbf_id)); + id->ti_type = ti_type; + + if (fmt == NULL) + return -EINVAL; + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + if (req->rq_pill.rc_fmt == NULL) { + req_capsule_set(&req->rq_pill, fmt); + fmt_unset = true; + } + + if (opc < OST_LAST_OPC) + rc = ost_tbf_id_cli_set(req, id); + else if (opc >= MDS_FIRST_OPC && opc < MDS_LAST_OPC) + rc = mdt_tbf_id_cli_set(req, id); + else if (opc == LDLM_ENQUEUE) + rc = ldlm_tbf_id_cli_set(req, id); + else + rc = -EINVAL; + + /* restore it to the initialized state */ + if (fmt_unset) + req->rq_pill.rc_fmt = NULL; + return rc; +} + +static inline void nrs_tbf_cli_gen_key(struct nrs_tbf_client *cli, + struct ptlrpc_request *req, + char *keystr, size_t keystr_sz) +{ + const char *jobid; + u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + struct tbf_id id; + + nrs_tbf_id_cli_set(req, &id, NRS_TBF_FLAG_UID | NRS_TBF_FLAG_GID); + jobid = lustre_msg_get_jobid(req->rq_reqmsg); + if (jobid == NULL) + jobid = NRS_TBF_JOBID_NULL; + + snprintf(keystr, keystr_sz, "%s_%s_%d_%u_%u", jobid, + libcfs_nid2str(req->rq_peer.nid), opc, id.ti_uid, + id.ti_gid); + + if (cli) { + INIT_LIST_HEAD(&cli->tc_lru); + strlcpy(cli->tc_key, keystr, sizeof(cli->tc_key)); + strlcpy(cli->tc_jobid, jobid, sizeof(cli->tc_jobid)); + cli->tc_nid = req->rq_peer.nid; + cli->tc_opcode = opc; + cli->tc_id = id; + } +} + +static struct nrs_tbf_client * +nrs_tbf_cli_find(struct nrs_tbf_head *head, struct ptlrpc_request *req) +{ + struct nrs_tbf_client *cli; + struct cfs_hash *hs = head->th_cli_hash; + struct cfs_hash_bd bd; + char keystr[NRS_TBF_KEY_LEN]; + + nrs_tbf_cli_gen_key(NULL, req, keystr, sizeof(keystr)); + cfs_hash_bd_get_and_lock(hs, (void *)keystr, &bd, 1); + cli = nrs_tbf_cli_hash_lookup(hs, &bd, keystr); + cfs_hash_bd_unlock(hs, &bd, 1); + + return cli; +} + +static struct nrs_tbf_client * +nrs_tbf_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + const char *key; + struct nrs_tbf_client *ret; + struct cfs_hash *hs = head->th_cli_hash; + struct cfs_hash_bd bd; + + key = cli->tc_key; + cfs_hash_bd_get_and_lock(hs, (void *)key, &bd, 1); + ret = nrs_tbf_cli_hash_lookup(hs, &bd, key); + if (ret == NULL) { + cfs_hash_bd_add_locked(hs, &bd, &cli->tc_hnode); + ret = cli; + } + cfs_hash_bd_unlock(hs, &bd, 1); + + return ret; +} + +static void 
+nrs_tbf_cli_put(struct nrs_tbf_head *head, struct nrs_tbf_client *cli) +{ + struct cfs_hash_bd bd; + struct cfs_hash *hs = head->th_cli_hash; + struct nrs_tbf_bucket *bkt; + int hw; + LIST_HEAD(zombies); + + cfs_hash_bd_get(hs, &cli->tc_key, &bd); + bkt = cfs_hash_bd_extra_get(hs, &bd); + if (!cfs_hash_bd_dec_and_lock(hs, &bd, &cli->tc_ref)) + return; + LASSERT(list_empty(&cli->tc_lru)); + list_add_tail(&cli->tc_lru, &bkt->ntb_lru); + + /** + * Check and purge the LRU, there is at least one client in the LRU. + */ + hw = tbf_jobid_cache_size >> (hs->hs_cur_bits - hs->hs_bkt_bits); + while (cfs_hash_bd_count_get(&bd) > hw) { + if (unlikely(list_empty(&bkt->ntb_lru))) + break; + cli = list_entry(bkt->ntb_lru.next, + struct nrs_tbf_client, + tc_lru); + LASSERT(atomic_read(&cli->tc_ref) == 0); + cfs_hash_bd_del_locked(hs, &bd, &cli->tc_hnode); + list_move(&cli->tc_lru, &zombies); + } + cfs_hash_bd_unlock(head->th_cli_hash, &bd, 1); + + while (!list_empty(&zombies)) { + cli = container_of(zombies.next, + struct nrs_tbf_client, tc_lru); + list_del_init(&cli->tc_lru); + nrs_tbf_cli_fini(cli); + } +} + +static void +nrs_tbf_generic_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + char keystr[NRS_TBF_KEY_LEN]; + + nrs_tbf_cli_gen_key(cli, req, keystr, sizeof(keystr)); +} + +static void +nrs_tbf_id_list_free(struct list_head *uid_list) +{ + struct nrs_tbf_id *nti_id, *n; + + list_for_each_entry_safe(nti_id, n, uid_list, nti_linkage) { + list_del_init(&nti_id->nti_linkage); + OBD_FREE_PTR(nti_id); + } +} + +static void +nrs_tbf_expression_free(struct nrs_tbf_expression *expr) +{ + LASSERT(expr->te_field >= NRS_TBF_FIELD_NID && + expr->te_field < NRS_TBF_FIELD_MAX); + switch (expr->te_field) { + case NRS_TBF_FIELD_NID: + cfs_free_nidlist(&expr->te_cond); + break; + case NRS_TBF_FIELD_JOBID: + nrs_tbf_jobid_list_free(&expr->te_cond); + break; + case NRS_TBF_FIELD_OPCODE: + CFS_FREE_BITMAP(expr->te_opcodes); + break; + case NRS_TBF_FIELD_UID: + case NRS_TBF_FIELD_GID: + nrs_tbf_id_list_free(&expr->te_cond); + break; + default: + LBUG(); + } + OBD_FREE_PTR(expr); +} + +static void +nrs_tbf_conjunction_free(struct nrs_tbf_conjunction *conjunction) +{ + struct nrs_tbf_expression *expression; + struct nrs_tbf_expression *n; + + LASSERT(list_empty(&conjunction->tc_linkage)); + list_for_each_entry_safe(expression, n, + &conjunction->tc_expressions, + te_linkage) { + list_del_init(&expression->te_linkage); + nrs_tbf_expression_free(expression); + } + OBD_FREE_PTR(conjunction); +} + +static void +nrs_tbf_conds_free(struct list_head *cond_list) +{ + struct nrs_tbf_conjunction *conjunction; + struct nrs_tbf_conjunction *n; + + list_for_each_entry_safe(conjunction, n, cond_list, tc_linkage) { + list_del_init(&conjunction->tc_linkage); + nrs_tbf_conjunction_free(conjunction); + } +} + +static void +nrs_tbf_generic_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (!list_empty(&cmd->u.tc_start.ts_conds)) + nrs_tbf_conds_free(&cmd->u.tc_start.ts_conds); + if (cmd->u.tc_start.ts_conds_str) + OBD_FREE(cmd->u.tc_start.ts_conds_str, + strlen(cmd->u.tc_start.ts_conds_str) + 1); +} + +#define NRS_TBF_DISJUNCTION_DELIM (',') +#define NRS_TBF_CONJUNCTION_DELIM ('&') +#define NRS_TBF_EXPRESSION_DELIM ('=') + +static inline bool +nrs_tbf_check_field(struct cfs_lstr *field, char *str) +{ + int len = strlen(str); + + return (field->ls_len == len && + strncmp(field->ls_str, str, len) == 0); +} + +static int +nrs_tbf_opcode_list_parse(char *str, int len, struct cfs_bitmap **bitmaptr); +static int 
+nrs_tbf_id_list_parse(char *str, int len, struct list_head *id_list, + enum nrs_tbf_flag tif); + +static int +nrs_tbf_expression_parse(struct cfs_lstr *src, struct list_head *cond_list) +{ + struct nrs_tbf_expression *expr; + struct cfs_lstr field; + int rc = 0; + + OBD_ALLOC_PTR(expr); + if (expr == NULL) + return -ENOMEM; + + rc = cfs_gettok(src, NRS_TBF_EXPRESSION_DELIM, &field); + if (rc == 0 || !src->ls_str || src->ls_len <= 2 || + src->ls_str[0] != '{' || src->ls_str[src->ls_len - 1] != '}') + GOTO(out, rc = -EINVAL); + + /* Skip '{' and '}' */ + src->ls_str++; + src->ls_len -= 2; + + if (nrs_tbf_check_field(&field, "nid")) { + if (cfs_parse_nidlist(src->ls_str, + src->ls_len, + &expr->te_cond) <= 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_NID; + } else if (nrs_tbf_check_field(&field, "jobid")) { + if (nrs_tbf_jobid_list_parse(src->ls_str, + src->ls_len, + &expr->te_cond) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_JOBID; + } else if (nrs_tbf_check_field(&field, "opcode")) { + if (nrs_tbf_opcode_list_parse(src->ls_str, + src->ls_len, + &expr->te_opcodes) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_OPCODE; + } else if (nrs_tbf_check_field(&field, "uid")) { + if (nrs_tbf_id_list_parse(src->ls_str, + src->ls_len, + &expr->te_cond, + NRS_TBF_FLAG_UID) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_UID; + } else if (nrs_tbf_check_field(&field, "gid")) { + if (nrs_tbf_id_list_parse(src->ls_str, + src->ls_len, + &expr->te_cond, + NRS_TBF_FLAG_GID) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_GID; + } else { + GOTO(out, rc = -EINVAL); + } + + list_add_tail(&expr->te_linkage, cond_list); + return 0; +out: + OBD_FREE_PTR(expr); + return rc; +} + +static int +nrs_tbf_conjunction_parse(struct cfs_lstr *src, struct list_head *cond_list) +{ + struct nrs_tbf_conjunction *conjunction; + struct cfs_lstr expr; + int rc = 0; + + OBD_ALLOC_PTR(conjunction); + if (conjunction == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&conjunction->tc_expressions); + list_add_tail(&conjunction->tc_linkage, cond_list); + + while (src->ls_str) { + rc = cfs_gettok(src, NRS_TBF_CONJUNCTION_DELIM, &expr); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_expression_parse(&expr, + &conjunction->tc_expressions); + if (rc) + break; + } + return rc; +} + +static int +nrs_tbf_conds_parse(char *str, int len, struct list_head *cond_list) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(cond_list); + while (src.ls_str) { + rc = cfs_gettok(&src, NRS_TBF_DISJUNCTION_DELIM, &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_conjunction_parse(&res, cond_list); + if (rc) + break; + } + return rc; +} + +static int +nrs_tbf_generic_parse(struct nrs_tbf_cmd *cmd, const char *id) +{ + int rc; + + OBD_ALLOC(cmd->u.tc_start.ts_conds_str, strlen(id) + 1); + if (cmd->u.tc_start.ts_conds_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_conds_str, id, strlen(id)); + + /* Parse hybird NID and JOBID conditions */ + rc = nrs_tbf_conds_parse(cmd->u.tc_start.ts_conds_str, + strlen(cmd->u.tc_start.ts_conds_str), + &cmd->u.tc_start.ts_conds); + if (rc) + nrs_tbf_generic_cmd_fini(cmd); + + return rc; +} + +static int +nrs_tbf_id_list_match(struct list_head *id_list, struct tbf_id id); + +static int +nrs_tbf_expression_match(struct nrs_tbf_expression *expr, + struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + switch (expr->te_field) 
{ + case NRS_TBF_FIELD_NID: + return cfs_match_nid(cli->tc_nid, &expr->te_cond); + case NRS_TBF_FIELD_JOBID: + return nrs_tbf_jobid_list_match(&expr->te_cond, cli->tc_jobid); + case NRS_TBF_FIELD_OPCODE: + return cfs_bitmap_check(expr->te_opcodes, cli->tc_opcode); + case NRS_TBF_FIELD_UID: + case NRS_TBF_FIELD_GID: + return nrs_tbf_id_list_match(&expr->te_cond, cli->tc_id); + default: + return 0; + } +} + +static int +nrs_tbf_conjunction_match(struct nrs_tbf_conjunction *conjunction, + struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + struct nrs_tbf_expression *expr; + int matched; + + list_for_each_entry(expr, &conjunction->tc_expressions, te_linkage) { + matched = nrs_tbf_expression_match(expr, rule, cli); + if (!matched) + return 0; + } + + return 1; +} + +static int +nrs_tbf_cond_match(struct nrs_tbf_rule *rule, struct nrs_tbf_client *cli) +{ + struct nrs_tbf_conjunction *conjunction; + int matched; + + list_for_each_entry(conjunction, &rule->tr_conds, tc_linkage) { + matched = nrs_tbf_conjunction_match(conjunction, rule, cli); + if (matched) + return 1; + } + + return 0; +} + +static void +nrs_tbf_generic_rule_fini(struct nrs_tbf_rule *rule) +{ + if (!list_empty(&rule->tr_conds)) + nrs_tbf_conds_free(&rule->tr_conds); + LASSERT(rule->tr_conds_str != NULL); + OBD_FREE(rule->tr_conds_str, strlen(rule->tr_conds_str) + 1); +} + +static int +nrs_tbf_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, struct nrs_tbf_cmd *start) +{ + int rc = 0; + + LASSERT(start->u.tc_start.ts_conds_str); + OBD_ALLOC(rule->tr_conds_str, + strlen(start->u.tc_start.ts_conds_str) + 1); + if (rule->tr_conds_str == NULL) + return -ENOMEM; + + memcpy(rule->tr_conds_str, + start->u.tc_start.ts_conds_str, + strlen(start->u.tc_start.ts_conds_str)); + + INIT_LIST_HEAD(&rule->tr_conds); + if (!list_empty(&start->u.tc_start.ts_conds)) { + rc = nrs_tbf_conds_parse(rule->tr_conds_str, + strlen(rule->tr_conds_str), + &rule->tr_conds); + } + if (rc) + nrs_tbf_generic_rule_fini(rule); + + return rc; +} + +static int +nrs_tbf_generic_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s %s %llu, ref %d\n", rule->tr_name, + rule->tr_conds_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static int +nrs_tbf_generic_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return nrs_tbf_cond_match(rule, cli); +} + +static struct nrs_tbf_ops nrs_tbf_generic_ops = { + .o_name = NRS_TBF_TYPE_GENERIC, + .o_startup = nrs_tbf_startup, + .o_cli_find = nrs_tbf_cli_find, + .o_cli_findadd = nrs_tbf_cli_findadd, + .o_cli_put = nrs_tbf_cli_put, + .o_cli_init = nrs_tbf_generic_cli_init, + .o_rule_init = nrs_tbf_rule_init, + .o_rule_dump = nrs_tbf_generic_rule_dump, + .o_rule_match = nrs_tbf_generic_rule_match, + .o_rule_fini = nrs_tbf_generic_rule_fini, +}; + +static void nrs_tbf_opcode_rule_fini(struct nrs_tbf_rule *rule) +{ + if (rule->tr_opcodes != NULL) + CFS_FREE_BITMAP(rule->tr_opcodes); + + LASSERT(rule->tr_opcodes_str != NULL); + OBD_FREE(rule->tr_opcodes_str, strlen(rule->tr_opcodes_str) + 1); +} + +static unsigned nrs_tbf_opcode_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(__u32), mask); +} + +static int nrs_tbf_opcode_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + const __u32 *opc = key; + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return *opc == cli->tc_opcode; +} + +static void *nrs_tbf_opcode_hop_key(struct 
hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return &cli->tc_opcode; +} + +static void nrs_tbf_opcode_hop_get(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_opcode_hop_put(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void nrs_tbf_opcode_hop_exit(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERTF(atomic_read(&cli->tc_ref) == 0, + "Busy TBF object from client with opcode %s, with %d refs\n", + ll_opcode2str(cli->tc_opcode), + atomic_read(&cli->tc_ref)); + + nrs_tbf_cli_fini(cli); +} +static struct cfs_hash_ops nrs_tbf_opcode_hash_ops = { + .hs_hash = nrs_tbf_opcode_hop_hash, + .hs_keycmp = nrs_tbf_opcode_hop_keycmp, + .hs_key = nrs_tbf_opcode_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_opcode_hop_get, + .hs_put = nrs_tbf_opcode_hop_put, + .hs_put_locked = nrs_tbf_opcode_hop_put, + .hs_exit = nrs_tbf_opcode_hop_exit, +}; + +static int +nrs_tbf_opcode_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start = { 0 }; + int rc; + + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + NRS_TBF_NID_BITS, + NRS_TBF_NID_BITS, + NRS_TBF_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_tbf_opcode_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + start.u.tc_start.ts_opcodes_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + rc = nrs_tbf_rule_start(policy, head, &start); + + return rc; +} + +static struct nrs_tbf_client * +nrs_tbf_opcode_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + __u32 opc; + + opc = lustre_msg_get_opc(req->rq_reqmsg); + return cfs_hash_lookup(head->th_cli_hash, &opc); +} + +static struct nrs_tbf_client * +nrs_tbf_opcode_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + return cfs_hash_findadd_unique(head->th_cli_hash, &cli->tc_opcode, + &cli->tc_hnode); +} + +static void +nrs_tbf_opcode_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + cli->tc_opcode = lustre_msg_get_opc(req->rq_reqmsg); +} + +#define MAX_OPCODE_LEN 32 +static int +nrs_tbf_opcode_set_bit(const struct cfs_lstr *id, struct cfs_bitmap *opcodes) +{ + int op = 0; + char opcode_str[MAX_OPCODE_LEN]; + + if (id->ls_len + 1 > MAX_OPCODE_LEN) + return -EINVAL; + + memcpy(opcode_str, id->ls_str, id->ls_len); + opcode_str[id->ls_len] = '\0'; + + op = ll_str2opcode(opcode_str); + if (op < 0) + return -EINVAL; + + cfs_bitmap_set(opcodes, op); + return 0; +} + +static int +nrs_tbf_opcode_list_parse(char *str, int len, struct cfs_bitmap **bitmaptr) +{ + struct cfs_bitmap *opcodes; + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + ENTRY; + + opcodes = CFS_ALLOCATE_BITMAP(LUSTRE_MAX_OPCODES); + if (opcodes == NULL) + return -ENOMEM; + + src.ls_str = str; + src.ls_len = len; + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_opcode_set_bit(&res, opcodes); + if (rc) + break; + } + + if (rc == 0 && bitmaptr) + *bitmaptr = 
opcodes; + else + CFS_FREE_BITMAP(opcodes); + + RETURN(rc); +} + +static void nrs_tbf_opcode_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (cmd->u.tc_start.ts_opcodes_str) + OBD_FREE(cmd->u.tc_start.ts_opcodes_str, + strlen(cmd->u.tc_start.ts_opcodes_str) + 1); + +} + +static int nrs_tbf_opcode_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + + src.ls_str = id; + src.ls_len = strlen(id); + rc = nrs_tbf_check_id_value(&src, "opcode"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_opcodes_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_opcodes_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_opcodes_str, src.ls_str, src.ls_len); + + /* parse opcode list */ + rc = nrs_tbf_opcode_list_parse(cmd->u.tc_start.ts_opcodes_str, + strlen(cmd->u.tc_start.ts_opcodes_str), + NULL); + if (rc) + nrs_tbf_opcode_cmd_fini(cmd); + + return rc; +} + +static int +nrs_tbf_opcode_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + if (rule->tr_opcodes == NULL) + return 0; + + return cfs_bitmap_check(rule->tr_opcodes, cli->tc_opcode); +} + +static int nrs_tbf_opcode_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + int rc = 0; + + LASSERT(start->u.tc_start.ts_opcodes_str != NULL); + OBD_ALLOC(rule->tr_opcodes_str, + strlen(start->u.tc_start.ts_opcodes_str) + 1); + if (rule->tr_opcodes_str == NULL) + return -ENOMEM; + + strncpy(rule->tr_opcodes_str, start->u.tc_start.ts_opcodes_str, + strlen(start->u.tc_start.ts_opcodes_str) + 1); + + /* Default rule '*' */ + if (strcmp(start->u.tc_start.ts_opcodes_str, "*") == 0) + return 0; + + rc = nrs_tbf_opcode_list_parse(rule->tr_opcodes_str, + strlen(rule->tr_opcodes_str), + &rule->tr_opcodes); + if (rc) + OBD_FREE(rule->tr_opcodes_str, + strlen(start->u.tc_start.ts_opcodes_str) + 1); + + return rc; +} + +static int +nrs_tbf_opcode_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_opcodes_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + + +struct nrs_tbf_ops nrs_tbf_opcode_ops = { + .o_name = NRS_TBF_TYPE_OPCODE, + .o_startup = nrs_tbf_opcode_startup, + .o_cli_find = nrs_tbf_opcode_cli_find, + .o_cli_findadd = nrs_tbf_opcode_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_opcode_cli_init, + .o_rule_init = nrs_tbf_opcode_rule_init, + .o_rule_dump = nrs_tbf_opcode_rule_dump, + .o_rule_match = nrs_tbf_opcode_rule_match, + .o_rule_fini = nrs_tbf_opcode_rule_fini, +}; + +static unsigned nrs_tbf_id_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(struct tbf_id), mask); +} + +static int nrs_tbf_id_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + const struct tbf_id *opc = key; + enum nrs_tbf_flag ntf; + struct nrs_tbf_client *cli = hlist_entry(hnode, struct nrs_tbf_client, + tc_hnode); + ntf = opc->ti_type & cli->tc_id.ti_type; + if ((ntf & NRS_TBF_FLAG_UID) && opc->ti_uid != cli->tc_id.ti_uid) + return 0; + + if ((ntf & NRS_TBF_FLAG_GID) && opc->ti_gid != cli->tc_id.ti_gid) + return 0; + + return 1; +} + +static void *nrs_tbf_id_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + return &cli->tc_id; +} + +static void nrs_tbf_id_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + 
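+	/*
+	 * The hash get/put callbacks only adjust tc_ref; the client object
+	 * itself is released from the ->hs_exit callback, which asserts that
+	 * every reference has already been dropped before calling
+	 * nrs_tbf_cli_fini().
+	 */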
atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_id_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void +nrs_tbf_id_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) + +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERT(atomic_read(&cli->tc_ref) == 0); + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_id_hash_ops = { + .hs_hash = nrs_tbf_id_hop_hash, + .hs_keycmp = nrs_tbf_id_hop_keycmp, + .hs_key = nrs_tbf_id_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_id_hop_get, + .hs_put = nrs_tbf_id_hop_put, + .hs_put_locked = nrs_tbf_id_hop_put, + .hs_exit = nrs_tbf_id_hop_exit, +}; + +static int +nrs_tbf_id_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + int rc; + + head->th_cli_hash = cfs_hash_create("nrs_tbf_id_hash", + NRS_TBF_NID_BITS, + NRS_TBF_NID_BITS, + NRS_TBF_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_tbf_id_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_ids_str = "*"; + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_ids); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) { + cfs_hash_putref(head->th_cli_hash); + head->th_cli_hash = NULL; + } + + return rc; +} + +static struct nrs_tbf_client * +nrs_tbf_id_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + struct tbf_id id; + + LASSERT(head->th_type_flag == NRS_TBF_FLAG_UID || + head->th_type_flag == NRS_TBF_FLAG_GID); + + nrs_tbf_id_cli_set(req, &id, head->th_type_flag); + return cfs_hash_lookup(head->th_cli_hash, &id); +} + +static struct nrs_tbf_client * +nrs_tbf_id_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + return cfs_hash_findadd_unique(head->th_cli_hash, &cli->tc_id, + &cli->tc_hnode); +} + +static void +nrs_tbf_uid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + nrs_tbf_id_cli_set(req, &cli->tc_id, NRS_TBF_FLAG_UID); +} + +static void +nrs_tbf_gid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + nrs_tbf_id_cli_set(req, &cli->tc_id, NRS_TBF_FLAG_GID); +} + +static int +nrs_tbf_id_list_match(struct list_head *id_list, struct tbf_id id) +{ + struct nrs_tbf_id *nti_id; + enum nrs_tbf_flag flag; + + list_for_each_entry(nti_id, id_list, nti_linkage) { + flag = id.ti_type & nti_id->nti_id.ti_type; + if (!flag) + continue; + + if ((flag & NRS_TBF_FLAG_UID) && + (id.ti_uid != nti_id->nti_id.ti_uid)) + continue; + + if ((flag & NRS_TBF_FLAG_GID) && + (id.ti_gid != nti_id->nti_id.ti_gid)) + continue; + + return 1; + } + return 0; +} + +static int +nrs_tbf_id_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return nrs_tbf_id_list_match(&rule->tr_ids, cli->tc_id); +} + +static void nrs_tbf_id_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + nrs_tbf_id_list_free(&cmd->u.tc_start.ts_ids); + + if (cmd->u.tc_start.ts_ids_str) + OBD_FREE(cmd->u.tc_start.ts_ids_str, + strlen(cmd->u.tc_start.ts_ids_str) + 1); +} + +static int +nrs_tbf_id_list_parse(char *str, int len, struct list_head *id_list, + enum nrs_tbf_flag tif) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + struct tbf_id id = { 0 
}; + ENTRY; + + if (tif != NRS_TBF_FLAG_UID && tif != NRS_TBF_FLAG_GID) + RETURN(-EINVAL); + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(id_list); + while (src.ls_str) { + struct nrs_tbf_id *nti_id; + + if (cfs_gettok(&src, ' ', &res) == 0) + GOTO(out, rc = -EINVAL); + + id.ti_type = tif; + if (tif == NRS_TBF_FLAG_UID) { + if (!cfs_str2num_check(res.ls_str, res.ls_len, + &id.ti_uid, 0, (u32)~0U)) + GOTO(out, rc = -EINVAL); + } else { + if (!cfs_str2num_check(res.ls_str, res.ls_len, + &id.ti_gid, 0, (u32)~0U)) + GOTO(out, rc = -EINVAL); + } + + OBD_ALLOC_PTR(nti_id); + if (nti_id == NULL) + GOTO(out, rc = -ENOMEM); + + nti_id->nti_id = id; + list_add_tail(&nti_id->nti_linkage, id_list); + } +out: + if (rc) + nrs_tbf_id_list_free(id_list); + RETURN(rc); +} + +static int nrs_tbf_ug_id_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + enum nrs_tbf_flag tif; + + tif = cmd->u.tc_start.ts_valid_type; + + src.ls_str = id; + src.ls_len = strlen(id); + + rc = nrs_tbf_check_id_value(&src, + tif == NRS_TBF_FLAG_UID ? "uid" : "gid"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_ids_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_ids_str == NULL) + return -ENOMEM; + + strlcpy(cmd->u.tc_start.ts_ids_str, src.ls_str, src.ls_len + 1); + + rc = nrs_tbf_id_list_parse(cmd->u.tc_start.ts_ids_str, + strlen(cmd->u.tc_start.ts_ids_str), + &cmd->u.tc_start.ts_ids, tif); + if (rc) + nrs_tbf_id_cmd_fini(cmd); + + return rc; +} + +static int +nrs_tbf_id_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + struct nrs_tbf_head *head = rule->tr_head; + int rc = 0; + enum nrs_tbf_flag tif = head->th_type_flag; + int ids_len = strlen(start->u.tc_start.ts_ids_str) + 1; + + LASSERT(start->u.tc_start.ts_ids_str); + INIT_LIST_HEAD(&rule->tr_ids); + + OBD_ALLOC(rule->tr_ids_str, ids_len); + if (rule->tr_ids_str == NULL) + return -ENOMEM; + + strlcpy(rule->tr_ids_str, start->u.tc_start.ts_ids_str, + ids_len); + + if (!list_empty(&start->u.tc_start.ts_ids)) { + rc = nrs_tbf_id_list_parse(rule->tr_ids_str, + strlen(rule->tr_ids_str), + &rule->tr_ids, tif); + if (rc) + CERROR("%ss {%s} illegal\n", + tif == NRS_TBF_FLAG_UID ? 
"uid" : "gid", + rule->tr_ids_str); + } + if (rc) { + OBD_FREE(rule->tr_ids_str, ids_len); + rule->tr_ids_str = NULL; + } + return rc; +} + +static int +nrs_tbf_id_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_ids_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static void nrs_tbf_id_rule_fini(struct nrs_tbf_rule *rule) +{ + nrs_tbf_id_list_free(&rule->tr_ids); + if (rule->tr_ids_str != NULL) + OBD_FREE(rule->tr_ids_str, strlen(rule->tr_ids_str) + 1); +} + +struct nrs_tbf_ops nrs_tbf_uid_ops = { + .o_name = NRS_TBF_TYPE_UID, + .o_startup = nrs_tbf_id_startup, + .o_cli_find = nrs_tbf_id_cli_find, + .o_cli_findadd = nrs_tbf_id_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_uid_cli_init, + .o_rule_init = nrs_tbf_id_rule_init, + .o_rule_dump = nrs_tbf_id_rule_dump, + .o_rule_match = nrs_tbf_id_rule_match, + .o_rule_fini = nrs_tbf_id_rule_fini, +}; + +struct nrs_tbf_ops nrs_tbf_gid_ops = { + .o_name = NRS_TBF_TYPE_GID, + .o_startup = nrs_tbf_id_startup, + .o_cli_find = nrs_tbf_id_cli_find, + .o_cli_findadd = nrs_tbf_id_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_gid_cli_init, + .o_rule_init = nrs_tbf_id_rule_init, + .o_rule_dump = nrs_tbf_id_rule_dump, + .o_rule_match = nrs_tbf_id_rule_match, + .o_rule_fini = nrs_tbf_id_rule_fini, +}; + +static struct nrs_tbf_type nrs_tbf_types[] = { + { + .ntt_name = NRS_TBF_TYPE_JOBID, + .ntt_flag = NRS_TBF_FLAG_JOBID, + .ntt_ops = &nrs_tbf_jobid_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_NID, + .ntt_flag = NRS_TBF_FLAG_NID, + .ntt_ops = &nrs_tbf_nid_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_OPCODE, + .ntt_flag = NRS_TBF_FLAG_OPCODE, + .ntt_ops = &nrs_tbf_opcode_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_GENERIC, + .ntt_flag = NRS_TBF_FLAG_GENERIC, + .ntt_ops = &nrs_tbf_generic_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_UID, + .ntt_flag = NRS_TBF_FLAG_UID, + .ntt_ops = &nrs_tbf_uid_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_GID, + .ntt_flag = NRS_TBF_FLAG_GID, + .ntt_ops = &nrs_tbf_gid_ops, + }, +}; + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a + * policy-specific private data structure. 
+ * + * \param[in] policy The policy to start + * + * \retval -ENOMEM OOM error + * \retval 0 success + * + * \see nrs_policy_register() + * \see nrs_policy_ctl() + */ +static int nrs_tbf_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_ops *ops; + __u32 type; + char *name; + int found = 0; + int i; + int rc = 0; + + if (arg == NULL) + name = NRS_TBF_TYPE_GENERIC; + else if (strlen(arg) < NRS_TBF_TYPE_MAX_LEN) + name = arg; + else + GOTO(out, rc = -EINVAL); + + for (i = 0; i < ARRAY_SIZE(nrs_tbf_types); i++) { + if (strcmp(name, nrs_tbf_types[i].ntt_name) == 0) { + ops = nrs_tbf_types[i].ntt_ops; + type = nrs_tbf_types[i].ntt_flag; + found = 1; + break; + } + } + if (found == 0) + GOTO(out, rc = -ENOTSUPP); + + OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (head == NULL) + GOTO(out, rc = -ENOMEM); + + memcpy(head->th_type, name, strlen(name)); + head->th_type[strlen(name)] = '\0'; + head->th_ops = ops; + head->th_type_flag = type; + + head->th_binheap = binheap_create(&nrs_tbf_heap_ops, + CBH_FLAG_ATOMIC_GROW, 4096, NULL, + nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + if (head->th_binheap == NULL) + GOTO(out_free_head, rc = -ENOMEM); + + atomic_set(&head->th_rule_sequence, 0); + spin_lock_init(&head->th_rule_lock); + INIT_LIST_HEAD(&head->th_list); + hrtimer_init(&head->th_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + head->th_timer.function = nrs_tbf_timer_cb; + rc = head->th_ops->o_startup(policy, head); + if (rc) + GOTO(out_free_heap, rc); + + policy->pol_private = head; + return 0; +out_free_heap: + binheap_destroy(head->th_binheap); +out_free_head: + OBD_FREE_PTR(head); +out: + return rc; +} + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific + * private data structure. + * + * \param[in] policy The policy to stop + * + * \see nrs_policy_stop0() + */ +static void nrs_tbf_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_tbf_head *head = policy->pol_private; + struct ptlrpc_nrs *nrs = policy->pol_nrs; + struct nrs_tbf_rule *rule, *n; + + LASSERT(head != NULL); + LASSERT(head->th_cli_hash != NULL); + hrtimer_cancel(&head->th_timer); + /* Should cleanup hash first before free rules */ + cfs_hash_putref(head->th_cli_hash); + list_for_each_entry_safe(rule, n, &head->th_list, tr_linkage) { + list_del_init(&rule->tr_linkage); + nrs_tbf_rule_put(rule); + } + LASSERT(list_empty(&head->th_list)); + LASSERT(head->th_binheap != NULL); + LASSERT(binheap_is_empty(head->th_binheap)); + binheap_destroy(head->th_binheap); + OBD_FREE_PTR(head); + nrs->nrs_throttling = 0; + wake_up(&policy->pol_nrs->nrs_svcpt->scp_waitq); +} + +/** + * Performs a policy-specific ctl function on TBF policy instances; similar + * to ioctl. + * + * \param[in] policy the policy instance + * \param[in] opc the opcode + * \param[in,out] arg used for passing parameters and information + * + * \pre assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * \post assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * + * \retval 0 operation carried out successfully + * \retval -ve error + */ +static int nrs_tbf_ctl(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, + void *arg) +{ + int rc = 0; + ENTRY; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch ((enum nrs_ctl_tbf)opc) { + default: + RETURN(-EINVAL); + + /** + * Read RPC rate size of a policy instance. 
+ */ + case NRS_CTL_TBF_RD_RULE: { + struct nrs_tbf_head *head = policy->pol_private; + struct seq_file *m = arg; + struct ptlrpc_service_part *svcpt; + + svcpt = policy->pol_nrs->nrs_svcpt; + seq_printf(m, "CPT %d:\n", svcpt->scp_cpt); + + rc = nrs_tbf_rule_dump_all(head, m); + } + break; + + /** + * Write RPC rate of a policy instance. + */ + case NRS_CTL_TBF_WR_RULE: { + struct nrs_tbf_head *head = policy->pol_private; + struct nrs_tbf_cmd *cmd; + + cmd = (struct nrs_tbf_cmd *)arg; + rc = nrs_tbf_command(policy, + head, + cmd); + } + break; + /** + * Read the TBF policy type of a policy instance. + */ + case NRS_CTL_TBF_RD_TYPE_FLAG: { + struct nrs_tbf_head *head = policy->pol_private; + + *(__u32 *)arg = head->th_type_flag; + } + break; + } + + RETURN(rc); +} + +/** + * Is called for obtaining a TBF policy resource. + * + * \param[in] policy The policy on which the request is being asked for + * \param[in] nrq The request for which resources are being taken + * \param[in] parent Parent resource, unused in this policy + * \param[out] resp Resources references are placed in this array + * \param[in] moving_req Signifies limited caller context; unused in this + * policy + * + * + * \see nrs_resource_get_safe() + */ +static int nrs_tbf_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, + bool moving_req) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + struct nrs_tbf_client *tmp; + struct ptlrpc_request *req; + + if (parent == NULL) { + *resp = &((struct nrs_tbf_head *)policy->pol_private)->th_res; + return 0; + } + + head = container_of(parent, struct nrs_tbf_head, th_res); + req = container_of(nrq, struct ptlrpc_request, rq_nrq); + cli = head->th_ops->o_cli_find(head, req); + if (cli != NULL) { + spin_lock(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + LASSERT(cli->tc_rule); + if (cli->tc_rule_sequence != + atomic_read(&head->th_rule_sequence) || + cli->tc_rule->tr_flags & NTRS_STOPPING) { + struct nrs_tbf_rule *rule; + + CDEBUG(D_RPCTRACE, + "TBF class@%p rate %llu sequence %d, " + "rule flags %d, head sequence %d\n", + cli, cli->tc_rpc_rate, + cli->tc_rule_sequence, + cli->tc_rule->tr_flags, + atomic_read(&head->th_rule_sequence)); + rule = nrs_tbf_rule_match(head, cli); + if (rule != cli->tc_rule) { + nrs_tbf_cli_reset(head, rule, cli); + } else { + if (cli->tc_rule_generation != rule->tr_generation) + nrs_tbf_cli_reset_value(head, cli); + nrs_tbf_rule_put(rule); + } + } else if (cli->tc_rule_generation != + cli->tc_rule->tr_generation) { + nrs_tbf_cli_reset_value(head, cli); + } + spin_unlock(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + goto out; + } + + OBD_CPT_ALLOC_GFP(cli, nrs_pol2cptab(policy), nrs_pol2cptid(policy), + sizeof(*cli), moving_req ? GFP_ATOMIC : __GFP_IO); + if (cli == NULL) + return -ENOMEM; + + nrs_tbf_cli_init(head, cli, req); + tmp = head->th_ops->o_cli_findadd(head, cli); + if (tmp != cli) { + atomic_dec(&cli->tc_ref); + nrs_tbf_cli_fini(cli); + cli = tmp; + } +out: + *resp = &cli->tc_res; + + return 1; +} + +/** + * Called when releasing references to the resource hierachy obtained for a + * request for scheduling using the TBF policy. 
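+ *
+ * Only the client (child) resource holds a reference that must be
+ * released here; the parent resource is the nrs_tbf_head embedded in the
+ * policy private data and needs no action.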
+ * + * \param[in] policy the policy the resource belongs to + * \param[in] res the resource to be released + */ +static void nrs_tbf_res_put(struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + + /** + * Do nothing for freeing parent, nrs_tbf_net resources + */ + if (res->res_parent == NULL) + return; + + cli = container_of(res, struct nrs_tbf_client, tc_res); + head = container_of(res->res_parent, struct nrs_tbf_head, th_res); + + head->th_ops->o_cli_put(head, cli); +} + +/** + * Called when getting a request from the TBF policy for handling, or just + * peeking; removes the request from the policy when it is to be handled. + * + * \param[in] policy The policy + * \param[in] peek When set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force Force the policy to return a request; unused in this + * policy + * + * \retval The request to be handled; this is the next request in the TBF + * rule + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_tbf_head *head = policy->pol_private; + struct ptlrpc_nrs_request *nrq = NULL; + struct nrs_tbf_client *cli; + struct binheap_node *node; + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + if (!peek && policy->pol_nrs->nrs_throttling) + return NULL; + + node = binheap_root(head->th_binheap); + if (unlikely(node == NULL)) + return NULL; + + cli = container_of(node, struct nrs_tbf_client, tc_node); + LASSERT(cli->tc_in_heap); + if (peek) { + nrq = list_entry(cli->tc_list.next, + struct ptlrpc_nrs_request, + nr_u.tbf.tr_list); + } else { + struct nrs_tbf_rule *rule = cli->tc_rule; + __u64 now = ktime_to_ns(ktime_get()); + __u64 passed; + __u64 ntoken; + __u64 deadline; + __u64 old_resid = 0; + + deadline = cli->tc_check_time + + cli->tc_nsecs; + LASSERT(now >= cli->tc_check_time); + passed = now - cli->tc_check_time; + ntoken = passed * cli->tc_rpc_rate; + do_div(ntoken, NSEC_PER_SEC); + + ntoken += cli->tc_ntoken; + if (rule->tr_flags & NTRS_REALTIME) { + LASSERT(cli->tc_nsecs_resid < cli->tc_nsecs); + old_resid = cli->tc_nsecs_resid; + cli->tc_nsecs_resid += passed % cli->tc_nsecs; + if (cli->tc_nsecs_resid > cli->tc_nsecs) { + ntoken++; + cli->tc_nsecs_resid -= cli->tc_nsecs; + } + } else if (ntoken > cli->tc_depth) + ntoken = cli->tc_depth; + + if (ntoken > 0) { + struct ptlrpc_request *req; + nrq = list_entry(cli->tc_list.next, + struct ptlrpc_nrs_request, + nr_u.tbf.tr_list); + req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + ntoken--; + cli->tc_ntoken = ntoken; + cli->tc_check_time = now; + list_del_init(&nrq->nr_u.tbf.tr_list); + if (list_empty(&cli->tc_list)) { + binheap_remove(head->th_binheap, + &cli->tc_node); + cli->tc_in_heap = false; + } else { + if (!(rule->tr_flags & NTRS_REALTIME)) + cli->tc_deadline = now + cli->tc_nsecs; + binheap_relocate(head->th_binheap, + &cli->tc_node); + } + CDEBUG(D_RPCTRACE, + "TBF dequeues: class@%p rate %llu gen %llu token %llu, rule@%p rate %llu gen %llu\n", + cli, cli->tc_rpc_rate, + cli->tc_rule_generation, cli->tc_ntoken, + cli->tc_rule, cli->tc_rule->tr_rpc_rate, + cli->tc_rule->tr_generation); + } else { + ktime_t time; + + if (rule->tr_flags & NTRS_REALTIME) { + cli->tc_deadline = deadline; + cli->tc_nsecs_resid = old_resid; + 
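+				/*
+				 * The class has no token: its deadline and
+				 * pre-refill residue were restored above.
+				 * Re-sort it in the binheap and, if another
+				 * class has moved to the root, retry with
+				 * that one; otherwise throttle the head and
+				 * arm the hrtimer for this deadline.
+				 */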
binheap_relocate(head->th_binheap, + &cli->tc_node); + if (node != binheap_root(head->th_binheap)) + return nrs_tbf_req_get(policy, + peek, force); + } + policy->pol_nrs->nrs_throttling = 1; + head->th_deadline = deadline; + time = ktime_set(0, 0); + time = ktime_add_ns(time, deadline); + hrtimer_start(&head->th_timer, time, HRTIMER_MODE_ABS); + } + } + + return nrq; +} + +/** + * Adds request \a nrq to \a policy's list of queued requests + * + * \param[in] policy The policy + * \param[in] nrq The request to add + * + * \retval 0 success; nrs_request_enqueue() assumes this function will always + * succeed + */ +static int nrs_tbf_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + int rc = 0; + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + cli = container_of(nrs_request_resource(nrq), + struct nrs_tbf_client, tc_res); + head = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_tbf_head, th_res); + if (list_empty(&cli->tc_list)) { + LASSERT(!cli->tc_in_heap); + cli->tc_deadline = cli->tc_check_time + cli->tc_nsecs; + rc = binheap_insert(head->th_binheap, &cli->tc_node); + if (rc == 0) { + cli->tc_in_heap = true; + nrq->nr_u.tbf.tr_sequence = head->th_sequence++; + list_add_tail(&nrq->nr_u.tbf.tr_list, + &cli->tc_list); + if (policy->pol_nrs->nrs_throttling) { + __u64 deadline = cli->tc_deadline; + if ((head->th_deadline > deadline) && + (hrtimer_try_to_cancel(&head->th_timer) + >= 0)) { + ktime_t time; + head->th_deadline = deadline; + time = ktime_set(0, 0); + time = ktime_add_ns(time, deadline); + hrtimer_start(&head->th_timer, time, + HRTIMER_MODE_ABS); + } + } + } + } else { + LASSERT(cli->tc_in_heap); + nrq->nr_u.tbf.tr_sequence = head->th_sequence++; + list_add_tail(&nrq->nr_u.tbf.tr_list, + &cli->tc_list); + } + + if (rc == 0) + CDEBUG(D_RPCTRACE, + "TBF enqueues: class@%p rate %llu gen %llu token %llu, rule@%p rate %llu gen %llu\n", + cli, cli->tc_rpc_rate, + cli->tc_rule_generation, cli->tc_ntoken, + cli->tc_rule, cli->tc_rule->tr_rpc_rate, + cli->tc_rule->tr_generation); + + return rc; +} + +/** + * Removes request \a nrq from \a policy's list of queued requests. + * + * \param[in] policy The policy + * \param[in] nrq The request to remove + */ +static void nrs_tbf_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + cli = container_of(nrs_request_resource(nrq), + struct nrs_tbf_client, tc_res); + head = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_tbf_head, th_res); + + LASSERT(!list_empty(&nrq->nr_u.tbf.tr_list)); + list_del_init(&nrq->nr_u.tbf.tr_list); + if (list_empty(&cli->tc_list)) { + binheap_remove(head->th_binheap, + &cli->tc_node); + cli->tc_in_heap = false; + } else { + binheap_relocate(head->th_binheap, + &cli->tc_node); + } +} + +/** + * Prints a debug statement right before the request \a nrq stops being + * handled. 
+ * + * \param[in] policy The policy handling the request + * \param[in] nrq The request being handled + * + * \see ptlrpc_server_finish_request() + * \see ptlrpc_nrs_req_stop_nolock() + */ +static void nrs_tbf_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: %llu\n", + policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer), + nrq->nr_u.tbf.tr_sequence); +} + +/** + * debugfs interface + */ + +/** + * The maximum RPC rate. + */ +#define LPROCFS_NRS_RATE_MAX 1000000ULL /* 1rpc/us */ + +static int +ptlrpc_lprocfs_nrs_tbf_rule_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + int rc; + + seq_printf(m, "regular_requests:\n"); + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_TBF, + NRS_CTL_TBF_RD_RULE, + false, m); + if (rc == 0) { + /** + * -ENOSPC means buf in the parameter m is overflow, return 0 + * here to let upper layer function seq_read alloc a larger + * memory area and do this process again. + */ + } else if (rc == -ENOSPC) { + return 0; + + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + seq_printf(m, "high_priority_requests:\n"); + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_TBF, + NRS_CTL_TBF_RD_RULE, + false, m); + if (rc == 0) { + /** + * -ENOSPC means buf in the parameter m is overflow, return 0 + * here to let upper layer function seq_read alloc a larger + * memory area and do this process again. 
+ */ + } else if (rc == -ENOSPC) { + return 0; + } + +no_hp: + + return rc; +} + +static int nrs_tbf_id_parse(struct nrs_tbf_cmd *cmd, char *token) +{ + int rc; + ENTRY; + + switch (cmd->u.tc_start.ts_valid_type) { + case NRS_TBF_FLAG_JOBID: + rc = nrs_tbf_jobid_parse(cmd, token); + break; + case NRS_TBF_FLAG_NID: + rc = nrs_tbf_nid_parse(cmd, token); + break; + case NRS_TBF_FLAG_OPCODE: + rc = nrs_tbf_opcode_parse(cmd, token); + break; + case NRS_TBF_FLAG_GENERIC: + rc = nrs_tbf_generic_parse(cmd, token); + break; + case NRS_TBF_FLAG_UID: + case NRS_TBF_FLAG_GID: + rc = nrs_tbf_ug_id_parse(cmd, token); + break; + default: + RETURN(-EINVAL); + } + + RETURN(rc); +} + +static void nrs_tbf_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) { + switch (cmd->u.tc_start.ts_valid_type) { + case NRS_TBF_FLAG_JOBID: + nrs_tbf_jobid_cmd_fini(cmd); + break; + case NRS_TBF_FLAG_NID: + nrs_tbf_nid_cmd_fini(cmd); + break; + case NRS_TBF_FLAG_OPCODE: + nrs_tbf_opcode_cmd_fini(cmd); + break; + case NRS_TBF_FLAG_GENERIC: + nrs_tbf_generic_cmd_fini(cmd); + break; + case NRS_TBF_FLAG_UID: + case NRS_TBF_FLAG_GID: + nrs_tbf_id_cmd_fini(cmd); + break; + default: + CWARN("unknown NRS_TBF_FLAGS:0x%x\n", + cmd->u.tc_start.ts_valid_type); + } + } +} + +static int check_rule_name(const char *name) +{ + int i; + + if (name[0] == '\0') + return -EINVAL; + + for (i = 0; name[i] != '\0' && i < MAX_TBF_NAME; i++) { + if (!isalnum(name[i]) && name[i] != '_') + return -EINVAL; + } + + if (i == MAX_TBF_NAME) + return -ENAMETOOLONG; + + return 0; +} + +static int +nrs_tbf_parse_value_pair(struct nrs_tbf_cmd *cmd, char *buffer) +{ + char *key; + char *val; + int rc; + __u64 rate; + + val = buffer; + key = strsep(&val, "="); + if (val == NULL || strlen(val) == 0) + return -EINVAL; + + /* Key of the value pair */ + if (strcmp(key, "rate") == 0) { + rc = kstrtoull(val, 10, &rate); + if (rc) + return rc; + + if (rate <= 0 || rate >= LPROCFS_NRS_RATE_MAX) + return -EINVAL; + + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) + cmd->u.tc_start.ts_rpc_rate = rate; + else if (cmd->tc_cmd == NRS_CTL_TBF_CHANGE_RULE) + cmd->u.tc_change.tc_rpc_rate = rate; + else + return -EINVAL; + } else if (strcmp(key, "rank") == 0) { + rc = check_rule_name(val); + if (rc) + return rc; + + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) + cmd->u.tc_start.ts_next_name = val; + else if (cmd->tc_cmd == NRS_CTL_TBF_CHANGE_RULE) + cmd->u.tc_change.tc_next_name = val; + else + return -EINVAL; + } else if (strcmp(key, "realtime") == 0) { + unsigned long realtime; + + rc = kstrtoul(val, 10, &realtime); + if (rc) + return rc; + + if (realtime > 0) + cmd->u.tc_start.ts_rule_flags |= NTRS_REALTIME; + } else { + return -EINVAL; + } + return 0; +} + +static int +nrs_tbf_parse_value_pairs(struct nrs_tbf_cmd *cmd, char *buffer) +{ + char *val; + char *token; + int rc; + + val = buffer; + while (val != NULL && strlen(val) != 0) { + token = strsep(&val, " "); + rc = nrs_tbf_parse_value_pair(cmd, token); + if (rc) + return rc; + } + + switch (cmd->tc_cmd) { + case NRS_CTL_TBF_START_RULE: + if (cmd->u.tc_start.ts_rpc_rate == 0) + cmd->u.tc_start.ts_rpc_rate = tbf_rate; + break; + case NRS_CTL_TBF_CHANGE_RULE: + if (cmd->u.tc_change.tc_rpc_rate == 0 && + cmd->u.tc_change.tc_next_name == NULL) + return -EINVAL; + break; + case NRS_CTL_TBF_STOP_RULE: + break; + default: + return -EINVAL; + } + return 0; +} + +static struct nrs_tbf_cmd * +nrs_tbf_parse_cmd(char *buffer, unsigned long count, __u32 type_flag) +{ + struct nrs_tbf_cmd *cmd; + char *token; 
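+	/*
+	 * The buffer is expected to look like (the "reg"/"hp" queue prefix
+	 * has already been stripped by the caller):
+	 *   start <rule> {<id list>} [rate=<r>] [rank=<rule>] [realtime=1]
+	 *   change <rule> [rate=<r>] [rank=<rule>]
+	 *   stop <rule>
+	 */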
+ char *val; + int rc = 0; + + OBD_ALLOC_PTR(cmd); + if (cmd == NULL) + GOTO(out, rc = -ENOMEM); + memset(cmd, 0, sizeof(*cmd)); + + val = buffer; + token = strsep(&val, " "); + if (val == NULL || strlen(val) == 0) + GOTO(out_free_cmd, rc = -EINVAL); + + /* Type of the command */ + if (strcmp(token, "start") == 0) { + cmd->tc_cmd = NRS_CTL_TBF_START_RULE; + cmd->u.tc_start.ts_valid_type = type_flag; + } else if (strcmp(token, "stop") == 0) + cmd->tc_cmd = NRS_CTL_TBF_STOP_RULE; + else if (strcmp(token, "change") == 0) + cmd->tc_cmd = NRS_CTL_TBF_CHANGE_RULE; + else + GOTO(out_free_cmd, rc = -EINVAL); + + /* Name of the rule */ + token = strsep(&val, " "); + if ((val == NULL && cmd->tc_cmd != NRS_CTL_TBF_STOP_RULE)) + GOTO(out_free_cmd, rc = -EINVAL); + + rc = check_rule_name(token); + if (rc) + GOTO(out_free_cmd, rc); + + cmd->tc_name = token; + + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) { + /* List of ID */ + LASSERT(val); + token = val; + val = strrchr(token, '}'); + if (!val) + GOTO(out_free_cmd, rc = -EINVAL); + + /* Skip '}' */ + val++; + if (*val == '\0') { + val = NULL; + } else if (*val == ' ') { + *val = '\0'; + val++; + } else + GOTO(out_free_cmd, rc = -EINVAL); + + rc = nrs_tbf_id_parse(cmd, token); + if (rc) + GOTO(out_free_cmd, rc); + } + + rc = nrs_tbf_parse_value_pairs(cmd, val); + if (rc) + GOTO(out_cmd_fini, rc = -EINVAL); + goto out; +out_cmd_fini: + nrs_tbf_cmd_fini(cmd); +out_free_cmd: + OBD_FREE_PTR(cmd); +out: + if (rc) + cmd = ERR_PTR(rc); + return cmd; +} + +/** + * Get the TBF policy type (nid, jobid, etc) preset by + * proc entry 'nrs_policies' for command buffer parsing. + * + * \param[in] svc the PTLRPC service + * \param[in] queue the NRS queue type + * + * \retval the preset TBF policy type flag + */ +static __u32 +nrs_tbf_type_flag(struct ptlrpc_service *svc, enum ptlrpc_nrs_queue_type queue) +{ + __u32 type; + int rc; + + rc = ptlrpc_nrs_policy_control(svc, queue, + NRS_POL_NAME_TBF, + NRS_CTL_TBF_RD_TYPE_FLAG, + true, &type); + if (rc != 0) + type = NRS_TBF_FLAG_INVALID; + + return type; +} + +#define LPROCFS_WR_NRS_TBF_MAX_CMD (4096) +static ssize_t +ptlrpc_lprocfs_nrs_tbf_rule_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + char *kernbuf; + char *val; + int rc; + struct nrs_tbf_cmd *cmd; + enum ptlrpc_nrs_queue_type queue = PTLRPC_NRS_QUEUE_BOTH; + unsigned long length; + char *token; + + OBD_ALLOC(kernbuf, LPROCFS_WR_NRS_TBF_MAX_CMD); + if (kernbuf == NULL) + GOTO(out, rc = -ENOMEM); + + if (count > LPROCFS_WR_NRS_TBF_MAX_CMD - 1) + GOTO(out_free_kernbuff, rc = -EINVAL); + + if (copy_from_user(kernbuf, buffer, count)) + GOTO(out_free_kernbuff, rc = -EFAULT); + + val = kernbuf; + token = strsep(&val, " "); + if (val == NULL) + GOTO(out_free_kernbuff, rc = -EINVAL); + + if (strcmp(token, "reg") == 0) { + queue = PTLRPC_NRS_QUEUE_REG; + } else if (strcmp(token, "hp") == 0) { + queue = PTLRPC_NRS_QUEUE_HP; + } else { + kernbuf[strlen(token)] = ' '; + val = kernbuf; + } + length = strlen(val); + + if (length == 0) + GOTO(out_free_kernbuff, rc = -EINVAL); + + if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc)) + GOTO(out_free_kernbuff, rc = -ENODEV); + else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc)) + queue = PTLRPC_NRS_QUEUE_REG; + + cmd = nrs_tbf_parse_cmd(val, length, nrs_tbf_type_flag(svc, queue)); + if (IS_ERR(cmd)) + GOTO(out_free_kernbuff, rc = PTR_ERR(cmd)); + + /** + * Serialize NRS core lprocfs 
operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + rc = ptlrpc_nrs_policy_control(svc, queue, + NRS_POL_NAME_TBF, + NRS_CTL_TBF_WR_RULE, + false, cmd); + mutex_unlock(&nrs_core.nrs_mutex); + + nrs_tbf_cmd_fini(cmd); + OBD_FREE_PTR(cmd); +out_free_kernbuff: + OBD_FREE(kernbuf, LPROCFS_WR_NRS_TBF_MAX_CMD); +out: + return rc ? rc : count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_tbf_rule); + +/** + * Initializes a TBF policy's lprocfs interface for service \a svc + * + * \param[in] svc the service + * + * \retval 0 success + * \retval != 0 error + */ +static int nrs_tbf_lprocfs_init(struct ptlrpc_service *svc) +{ + struct ldebugfs_vars nrs_tbf_lprocfs_vars[] = { + { .name = "nrs_tbf_rule", + .fops = &ptlrpc_lprocfs_nrs_tbf_rule_fops, + .data = svc }, + { NULL } + }; + + if (!svc->srv_debugfs_entry) + return 0; + + ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_tbf_lprocfs_vars, NULL); + + return 0; +} + +/** + * TBF policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_tbf_ops = { + .op_policy_start = nrs_tbf_start, + .op_policy_stop = nrs_tbf_stop, + .op_policy_ctl = nrs_tbf_ctl, + .op_res_get = nrs_tbf_res_get, + .op_res_put = nrs_tbf_res_put, + .op_req_get = nrs_tbf_req_get, + .op_req_enqueue = nrs_tbf_req_add, + .op_req_dequeue = nrs_tbf_req_del, + .op_req_stop = nrs_tbf_req_stop, + .op_lprocfs_init = nrs_tbf_lprocfs_init, +}; + +/** + * TBF policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_tbf = { + .nc_name = NRS_POL_NAME_TBF, + .nc_ops = &nrs_tbf_ops, + .nc_compat = nrs_policy_compat_all, +}; + +/** @} tbf */ + +/** @} nrs */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c new file mode 100644 index 0000000000000..3263e944e76b7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c @@ -0,0 +1,3001 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/pack_generic.c + * + * (Un)packing of OST requests + * + * Author: Peter J. 
Braam + * Author: Phil Schwan + * Author: Eric Barton + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +static inline __u32 lustre_msg_hdr_size_v2(__u32 count) +{ + return cfs_size_round(offsetof(struct lustre_msg_v2, + lm_buflens[count])); +} + +__u32 lustre_msg_hdr_size(__u32 magic, __u32 count) +{ + LASSERT(count > 0); + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_hdr_size_v2(count); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return 0; + } +} + +static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg, + enum lustre_msg_version version) +{ + enum lustre_msg_version ver = lustre_msg_get_version(msg); + + return (ver & LUSTRE_VERSION_MASK) != version; +} + +int lustre_msg_check_version(struct lustre_msg *msg, + enum lustre_msg_version version) +{ +#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0 + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + CERROR("msg v1 not supported - please upgrade you system\n"); + return -EINVAL; + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_check_version_v2(msg, version); + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return -EPROTO; + } +#undef LUSTRE_MSG_MAGIC_V1 +} + +__u32 lustre_msg_early_size; +EXPORT_SYMBOL(lustre_msg_early_size); + +/* early reply size */ +void lustre_msg_early_size_init(void) +{ + __u32 pblen = sizeof(struct ptlrpc_body); + + lustre_msg_early_size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, &pblen); +} + +__u32 lustre_msg_size_v2(int count, __u32 *lengths) +{ + __u32 size; + int i; + + LASSERT(count > 0); + size = lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) + size += cfs_size_round(lengths[i]); + + return size; +} +EXPORT_SYMBOL(lustre_msg_size_v2); + +/* + * This returns the size of the buffer that is required to hold a lustre_msg + * with the given sub-buffer lengths. + * NOTE: this should only be used for NEW requests, and should always be + * in the form of a v2 request. If this is a connection to a v1 + * target then the first buffer will be stripped because the ptlrpc + * data is part of the lustre_msg_v1 header. b=14043 + */ +__u32 lustre_msg_size(__u32 magic, int count, __u32 *lens) +{ + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] >= sizeof(struct ptlrpc_body_v2)); + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_size_v2(count, lens); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return 0; + } +} + +/* + * This is used to determine the size of a buffer that was already packed + * and will correctly handle the different message formats. 
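+ * For a V2 message this is simply lustre_msg_size_v2(msg->lm_bufcount,
+ * msg->lm_buflens): the header for lm_bufcount buffers plus every buffer
+ * length rounded up by cfs_size_round().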
+ */ +__u32 lustre_packed_msg_size(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_packed_msg_size); + +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs) +{ + char *ptr; + int i; + + LASSERT(count > 0); + + msg->lm_bufcount = count; + /* XXX: lm_secflvr uninitialized here */ + msg->lm_magic = LUSTRE_MSG_MAGIC_V2; + + for (i = 0; i < count; i++) + msg->lm_buflens[i] = lens[i]; + + if (bufs == NULL) + return; + + ptr = (char *)msg + lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) { + char *tmp = bufs[i]; + + if (tmp) + memcpy(ptr, tmp, lens[i]); + ptr += cfs_size_round(lens[i]); + } +} +EXPORT_SYMBOL(lustre_init_msg_v2); + +static int lustre_pack_request_v2(struct ptlrpc_request *req, + int count, __u32 *lens, char **bufs) +{ + int reqlen, rc; + + reqlen = lustre_msg_size_v2(count, lens); + + rc = sptlrpc_cli_alloc_reqbuf(req, reqlen); + if (rc) + return rc; + + req->rq_reqlen = reqlen; + + lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs); + lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION); + return 0; +} + +int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count, + __u32 *lens, char **bufs) +{ + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); + + /* only use new format, we don't need to be compatible with 1.4 */ + magic = LUSTRE_MSG_MAGIC_V2; + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_pack_request_v2(req, count, lens, bufs); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return -EINVAL; + } +} + +#if RS_DEBUG +struct list_head ptlrpc_rs_debug_lru = + LIST_HEAD_INIT(ptlrpc_rs_debug_lru); +spinlock_t ptlrpc_rs_debug_lock; + +#define PTLRPC_RS_DEBUG_LRU_ADD(rs) \ +do { \ + spin_lock(&ptlrpc_rs_debug_lock); \ + list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru); \ + spin_unlock(&ptlrpc_rs_debug_lock); \ +} while (0) + +#define PTLRPC_RS_DEBUG_LRU_DEL(rs) \ +do { \ + spin_lock(&ptlrpc_rs_debug_lock); \ + list_del(&(rs)->rs_debug_list); \ + spin_unlock(&ptlrpc_rs_debug_lock); \ +} while (0) +#else +# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while(0) +# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while(0) +#endif + +struct ptlrpc_reply_state * +lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_reply_state *rs = NULL; + + spin_lock(&svcpt->scp_rep_lock); + + /* See if we have anything in a pool, and wait if nothing */ + while (list_empty(&svcpt->scp_rep_idle)) { + int rc; + + spin_unlock(&svcpt->scp_rep_lock); + /* If we cannot get anything for some long time, we better + * bail out instead of waiting infinitely */ + rc = wait_event_idle_timeout(svcpt->scp_rep_waitq, + !list_empty(&svcpt->scp_rep_idle), + cfs_time_seconds(10)); + if (rc <= 0) + goto out; + spin_lock(&svcpt->scp_rep_lock); + } + + rs = list_first_entry(&svcpt->scp_rep_idle, + struct ptlrpc_reply_state, rs_list); + list_del(&rs->rs_list); + + spin_unlock(&svcpt->scp_rep_lock); + + memset(rs, 0, svcpt->scp_service->srv_max_reply_size); + rs->rs_size = svcpt->scp_service->srv_max_reply_size; + rs->rs_svcpt = svcpt; + rs->rs_prealloc = 1; +out: + return rs; +} + +void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs) +{ + 
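+	/*
+	 * Return a preallocated reply state to the service partition's idle
+	 * list and wake up any thread blocked in lustre_get_emerg_rs()
+	 * waiting for one to become available.
+	 */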
struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + spin_lock(&svcpt->scp_rep_lock); + list_add(&rs->rs_list, &svcpt->scp_rep_idle); + spin_unlock(&svcpt->scp_rep_lock); + wake_up(&svcpt->scp_rep_waitq); +} + +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags) +{ + struct ptlrpc_reply_state *rs; + int msg_len, rc; + ENTRY; + + LASSERT(req->rq_reply_state == NULL); + LASSERT(count > 0); + + if ((flags & LPRFL_EARLY_REPLY) == 0) { + spin_lock(&req->rq_lock); + req->rq_packed_final = 1; + spin_unlock(&req->rq_lock); + } + + msg_len = lustre_msg_size_v2(count, lens); + rc = sptlrpc_svc_alloc_rs(req, msg_len); + if (rc) + RETURN(rc); + + rs = req->rq_reply_state; + atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ + rs->rs_cb_id.cbid_fn = reply_out_callback; + rs->rs_cb_id.cbid_arg = rs; + rs->rs_svcpt = req->rq_rqbd->rqbd_svcpt; + INIT_LIST_HEAD(&rs->rs_exp_list); + INIT_LIST_HEAD(&rs->rs_obd_list); + INIT_LIST_HEAD(&rs->rs_list); + spin_lock_init(&rs->rs_lock); + + req->rq_replen = msg_len; + req->rq_reply_state = rs; + req->rq_repmsg = rs->rs_msg; + + lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); + lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); + + PTLRPC_RS_DEBUG_LRU_ADD(rs); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_pack_reply_v2); + +int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens, + char **bufs, int flags) +{ + int rc = 0; + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); + + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + rc = lustre_pack_reply_v2(req, count, lens, bufs, flags); + break; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + req->rq_reqmsg->lm_magic); + rc = -EINVAL; + } + if (rc != 0) + CERROR("lustre_pack_reply failed: rc=%d size=%d\n", rc, + lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens)); + return rc; +} + +int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens, + char **bufs) +{ + return lustre_pack_reply_flags(req, count, lens, bufs, 0); +} +EXPORT_SYMBOL(lustre_pack_reply); + +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, __u32 n, __u32 min_size) +{ + __u32 i, offset, buflen, bufcount; + + LASSERT(m != NULL); + LASSERT(m->lm_bufcount > 0); + + bufcount = m->lm_bufcount; + if (unlikely(n >= bufcount)) { + CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", + m, n, bufcount); + return NULL; + } + + buflen = m->lm_buflens[n]; + if (unlikely(buflen < min_size)) { + CERROR("msg %p buffer[%d] size %d too small " + "(required %d, opc=%d)\n", m, n, buflen, min_size, + n == MSG_PTLRPC_BODY_OFF ? 
-1 : lustre_msg_get_opc(m)); + return NULL; + } + + offset = lustre_msg_hdr_size_v2(bufcount); + for (i = 0; i < n; i++) + offset += cfs_size_round(m->lm_buflens[i]); + + return (char *)m + offset; +} + +void *lustre_msg_buf(struct lustre_msg *m, __u32 n, __u32 min_size) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_buf_v2(m, n, min_size); + default: + LASSERTF(0, "incorrect message magic: %08x (msg:%p)\n", + m->lm_magic, m); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_buf); + +static int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, __u32 segment, + unsigned int newlen, int move_data) +{ + char *tail = NULL, *newpos; + int tail_len = 0, n; + + LASSERT(msg); + LASSERT(msg->lm_bufcount > segment); + LASSERT(msg->lm_buflens[segment] >= newlen); + + if (msg->lm_buflens[segment] == newlen) + goto out; + + if (move_data && msg->lm_bufcount > segment + 1) { + tail = lustre_msg_buf_v2(msg, segment + 1, 0); + for (n = segment + 1; n < msg->lm_bufcount; n++) + tail_len += cfs_size_round(msg->lm_buflens[n]); + } + + msg->lm_buflens[segment] = newlen; + + if (tail && tail_len) { + newpos = lustre_msg_buf_v2(msg, segment + 1, 0); + LASSERT(newpos <= tail); + if (newpos != tail) + memmove(newpos, tail, tail_len); + } +out: + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); +} + +/* + * for @msg, shrink @segment to size @newlen. if @move_data is non-zero, + * we also move data forward from @segment + 1. + * + * if @newlen == 0, we remove the segment completely, but we still keep the + * totally bufcount the same to save possible data moving. this will leave a + * unused segment with size 0 at the tail, but that's ok. + * + * return new msg size after shrinking. + * + * CAUTION: + * + if any buffers higher than @segment has been filled in, must call shrink + * with non-zero @move_data. + * + caller should NOT keep pointers to msg buffers which higher than @segment + * after call shrink. + */ +int lustre_shrink_msg(struct lustre_msg *msg, int segment, + unsigned int newlen, int move_data) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_shrink_msg_v2(msg, segment, newlen, move_data); + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_shrink_msg); + +static int lustre_grow_msg_v2(struct lustre_msg_v2 *msg, __u32 segment, + unsigned int newlen) +{ + char *tail = NULL, *newpos; + int tail_len = 0, n; + + LASSERT(msg); + LASSERT(msg->lm_bufcount > segment); + LASSERT(msg->lm_buflens[segment] <= newlen); + + if (msg->lm_buflens[segment] == newlen) + goto out; + + if (msg->lm_bufcount > segment + 1) { + tail = lustre_msg_buf_v2(msg, segment + 1, 0); + for (n = segment + 1; n < msg->lm_bufcount; n++) + tail_len += cfs_size_round(msg->lm_buflens[n]); + } + + msg->lm_buflens[segment] = newlen; + + if (tail && tail_len) { + newpos = lustre_msg_buf_v2(msg, segment + 1, 0); + memmove(newpos, tail, tail_len); + } +out: + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); +} + +/* + * for @msg, grow @segment to size @newlen. + * Always move higher buffer forward. + * + * return new msg size after growing. + * + * CAUTION: + * - caller must make sure there is enough space in allocated message buffer + * - caller should NOT keep pointers to msg buffers which higher than @segment + * after call shrink. 
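+ *
+ * Unlike lustre_shrink_msg(), which only moves data when @move_data is
+ * set, higher buffers are always moved here: each shifts forward by the
+ * difference between the rounded new and old lengths of @segment.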
+ */ +int lustre_grow_msg(struct lustre_msg *msg, int segment, unsigned int newlen) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_grow_msg_v2(msg, segment, newlen); + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_grow_msg); + +void lustre_free_reply_state(struct ptlrpc_reply_state *rs) +{ + PTLRPC_RS_DEBUG_LRU_DEL(rs); + + LASSERT(atomic_read(&rs->rs_refcount) == 0); + LASSERT(!rs->rs_difficult || rs->rs_handled); + LASSERT(!rs->rs_difficult || rs->rs_unlinked); + LASSERT(!rs->rs_scheduled); + LASSERT(rs->rs_export == NULL); + LASSERT(rs->rs_nlocks == 0); + LASSERT(list_empty(&rs->rs_exp_list)); + LASSERT(list_empty(&rs->rs_obd_list)); + + sptlrpc_svc_free_rs(rs); +} + +static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) +{ + int swabbed, required_len, i, buflen; + + /* Now we know the sender speaks my language. */ + required_len = lustre_msg_hdr_size_v2(0); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for lustre_msg\n", len); + return -EINVAL; + } + + swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); + + if (swabbed) { + __swab32s(&m->lm_magic); + __swab32s(&m->lm_bufcount); + __swab32s(&m->lm_secflvr); + __swab32s(&m->lm_repsize); + __swab32s(&m->lm_cksum); + __swab32s(&m->lm_flags); + BUILD_BUG_ON(offsetof(typeof(*m), lm_padding_2) == 0); + BUILD_BUG_ON(offsetof(typeof(*m), lm_padding_3) == 0); + } + + if (m->lm_bufcount == 0 || m->lm_bufcount > PTLRPC_MAX_BUFCOUNT) { + CERROR("message bufcount %d is not valid\n", m->lm_bufcount); + return -EINVAL; + } + required_len = lustre_msg_hdr_size_v2(m->lm_bufcount); + if (len < required_len) { + /* didn't receive all the buffer lengths */ + CERROR("message length %d too small for %d buflens\n", + len, m->lm_bufcount); + return -EINVAL; + } + + for (i = 0; i < m->lm_bufcount; i++) { + if (swabbed) + __swab32s(&m->lm_buflens[i]); + buflen = cfs_size_round(m->lm_buflens[i]); + if (buflen < 0 || buflen > PTLRPC_MAX_BUFLEN) { + CERROR("buffer %d length %d is not valid\n", i, buflen); + return -EINVAL; + } + required_len += buflen; + } + if (len < required_len || required_len > PTLRPC_MAX_BUFLEN) { + CERROR("len: %d, required_len %d, bufcount: %d\n", + len, required_len, m->lm_bufcount); + for (i = 0; i < m->lm_bufcount; i++) + CERROR("buffer %d length %d\n", i, m->lm_buflens[i]); + return -EINVAL; + } + + return swabbed; +} + +int __lustre_unpack_msg(struct lustre_msg *m, int len) +{ + int required_len, rc; + + ENTRY; + /* + * We can provide a slightly better error log, if we check the + * message magic and version first. In the future, struct + * lustre_msg may grow, and we'd like to log a version mismatch, + * rather than a short message. 
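+ *
+ * The initial check only needs the bytes up to and including lm_magic;
+ * lustre_unpack_msg_v2() then byte-swaps the header if the magic arrived
+ * swabbed and validates lm_bufcount and each (rounded) buffer length
+ * against PTLRPC_MAX_BUFCOUNT and PTLRPC_MAX_BUFLEN.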
+ */ + required_len = offsetof(struct lustre_msg, lm_magic) + + sizeof(m->lm_magic); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for magic/version check\n", + len); + RETURN(-EINVAL); + } + + rc = lustre_unpack_msg_v2(m, len); + + RETURN(rc); +} +EXPORT_SYMBOL(__lustre_unpack_msg); + +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len) +{ + int rc; + + rc = __lustre_unpack_msg(req->rq_reqmsg, len); + if (rc == 1) { + req_capsule_set_req_swabbed(&req->rq_pill, + MSG_PTLRPC_HEADER_OFF); + rc = 0; + } + return rc; +} + +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len) +{ + int rc; + + rc = __lustre_unpack_msg(req->rq_repmsg, len); + if (rc == 1) { + req_capsule_set_rep_swabbed(&req->rq_pill, + MSG_PTLRPC_HEADER_OFF); + rc = 0; + } + return rc; +} + +static inline int +lustre_unpack_ptlrpc_body_v2(struct ptlrpc_request *req, + enum req_location loc, int offset) +{ + struct ptlrpc_body *pb; + struct lustre_msg_v2 *m; + + m = loc == RCL_CLIENT ? req->rq_reqmsg : req->rq_repmsg; + + pb = lustre_msg_buf_v2(m, offset, sizeof(struct ptlrpc_body_v2)); + if (!pb) { + CERROR("error unpacking ptlrpc body\n"); + return -EFAULT; + } + if (req_capsule_need_swab(&req->rq_pill, loc, offset)) { + lustre_swab_ptlrpc_body(pb); + req_capsule_set_swabbed(&req->rq_pill, loc, offset); + } + + if ((pb->pb_version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) { + CERROR("wrong lustre_msg version %08x\n", pb->pb_version); + return -EINVAL; + } + + if (loc == RCL_SERVER) + pb->pb_status = ptlrpc_status_ntoh(pb->pb_status); + + return 0; +} + +int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(req, RCL_CLIENT, offset); + default: + CERROR("bad lustre msg magic: %08x\n", + req->rq_reqmsg->lm_magic); + return -EINVAL; + } +} + +int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset) +{ + switch (req->rq_repmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(req, RCL_SERVER, offset); + default: + CERROR("bad lustre msg magic: %08x\n", + req->rq_repmsg->lm_magic); + return -EINVAL; + } +} + +static inline __u32 lustre_msg_buflen_v2(struct lustre_msg_v2 *m, __u32 n) +{ + if (n >= m->lm_bufcount) + return 0; + + return m->lm_buflens[n]; +} + +/** + * lustre_msg_buflen - return the length of buffer \a n in message \a m + * \param m lustre_msg (request or reply) to look at + * \param n message index (base 0) + * + * returns zero for non-existent message indices + */ +__u32 lustre_msg_buflen(struct lustre_msg *m, __u32 n) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_buflen_v2(m, n); + default: + CERROR("incorrect message magic: %08x\n", m->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_buflen); + +static inline void +lustre_msg_set_buflen_v2(struct lustre_msg_v2 *m, __u32 n, __u32 len) +{ + if (n >= m->lm_bufcount) + LBUG(); + + m->lm_buflens[n] = len; +} + +void lustre_msg_set_buflen(struct lustre_msg *m, __u32 n, __u32 len) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + lustre_msg_set_buflen_v2(m, n, len); + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + } +} + +/* + * NB return the bufcount for lustre_msg_v2 format, so if message is packed + * in V1 format, the result is one bigger. (add struct ptlrpc_body). 
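+ *
+ * In practice only the V2 count is seen here: V1 magic is rejected
+ * earlier by lustre_msg_check_version(), and any other magic hits the
+ * error case below.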
+ */ +__u32 lustre_msg_bufcount(struct lustre_msg *m) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return m->lm_bufcount; + default: + CERROR("incorrect message magic: %08x\n", m->lm_magic); + return 0; + } +} + +char *lustre_msg_string(struct lustre_msg *m, __u32 index, __u32 max_len) +{ + /* max_len == 0 means the string should fill the buffer */ + char *str; + __u32 slen, blen; + + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + str = lustre_msg_buf_v2(m, index, 0); + blen = lustre_msg_buflen_v2(m, index); + break; + default: + LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + } + + if (str == NULL) { + CERROR("can't unpack string in msg %p buffer[%d]\n", m, index); + return NULL; + } + + slen = strnlen(str, blen); + + if (slen == blen) { /* not NULL terminated */ + CERROR("can't unpack non-NULL terminated string in msg %p buffer[%d] len %d\n", + m, index, blen); + return NULL; + } + if (blen > PTLRPC_MAX_BUFLEN) { + CERROR("buffer length of msg %p buffer[%d] is invalid(%d)\n", + m, index, blen); + return NULL; + } + + if (max_len == 0) { + if (slen != blen - 1) { + CERROR("can't unpack short string in msg %p buffer[%d] len %d: strlen %d\n", + m, index, blen, slen); + return NULL; + } + } else if (slen > max_len) { + CERROR("can't unpack oversized string in msg %p buffer[%d] len %d strlen %d: max %d expected\n", + m, index, blen, slen, max_len); + return NULL; + } + + return str; +} + +/* Wrap up the normal fixed length cases */ +static inline void *__lustre_swab_buf(struct lustre_msg *msg, __u32 index, + __u32 min_size, void *swabber) +{ + void *ptr = NULL; + + LASSERT(msg != NULL); + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + ptr = lustre_msg_buf_v2(msg, index, min_size); + break; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + } + + if (ptr != NULL && swabber != NULL) + ((void (*)(void *))swabber)(ptr); + + return ptr; +} + +static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg) +{ + return lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body_v2)); +} + +enum lustre_msghdr lustre_msghdr_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + /* already in host endian */ + return msg->lm_flags; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msghdr_get_flags); + +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + msg->lm_flags = flags; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +__u32 lustre_msg_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb != NULL) + return pb->pb_flags; + + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + } + fallthrough; + default: + /* + * flags might be printed in debug code while message + * uninitialized + */ + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_flags); + +void lustre_msg_add_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_flags); + +void lustre_msg_set_flags(struct lustre_msg 
*msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags = flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_clear_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags &= ~flags; + + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_clear_flags); + +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb != NULL) + return pb->pb_op_flags; + + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + } + fallthrough; + default: + return 0; + } +} + +void lustre_msg_add_op_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_op_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_op_flags); + +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return NULL; + } + return &pb->pb_handle; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} + +__u32 lustre_msg_get_type(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return PTL_RPC_MSG_ERR; + } + return pb->pb_type; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return PTL_RPC_MSG_ERR; + } +} +EXPORT_SYMBOL(lustre_msg_get_type); + +enum lustre_msg_version lustre_msg_get_version(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_version; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +void lustre_msg_add_version(struct lustre_msg *msg, __u32 version) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_version |= version; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +__u32 lustre_msg_get_opc(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_opc; + } + default: + CERROR("incorrect message magic: %08x (msg:%p)\n", + msg->lm_magic, msg); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_opc); + +__u64 lustre_msg_get_last_xid(struct 
lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_last_xid; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_last_xid); + +__u16 lustre_msg_get_tag(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_tag; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_tag); + +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_last_committed; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_last_committed); + +__u64 *lustre_msg_get_versions(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return NULL; + } + return pb->pb_pre_versions; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_versions); + +__u64 lustre_msg_get_transno(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_transno; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_transno); + +int lustre_msg_get_status(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb != NULL) + return pb->pb_status; + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + } + fallthrough; + default: + /* + * status might be printed in debug code while message + * uninitialized + */ + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_get_status); + +__u64 lustre_msg_get_slv(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return pb->pb_slv; + } + default: + CERROR("invalid msg magic %08x\n", msg->lm_magic); + return -EINVAL; + } +} + + +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return; + } + pb->pb_slv = slv; + return; + } + default: + CERROR("invalid msg magic %x\n", msg->lm_magic); + return; + } +} + +__u32 lustre_msg_get_limit(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return 
pb->pb_limit; + } + default: + CERROR("invalid msg magic %x\n", msg->lm_magic); + return -EINVAL; + } +} + + +void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return; + } + pb->pb_limit = limit; + return; + } + default: + CERROR("invalid msg magic %08x\n", msg->lm_magic); + return; + } +} + +__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_conn_cnt; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_conn_cnt); + +__u32 lustre_msg_get_magic(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return msg->lm_magic; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +timeout_t lustre_msg_get_timeout(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_timeout; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +timeout_t lustre_msg_get_service_timeout(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_service_time; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +char *lustre_msg_get_jobid(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb; + + /* the old pltrpc_body_v2 is smaller; doesn't include jobid */ + if (msg->lm_buflens[MSG_PTLRPC_BODY_OFF] < + sizeof(struct ptlrpc_body)) + return NULL; + + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body)); + if (!pb) + return NULL; + + /* If clients send unterminated jobids, terminate them here + * so that there is no chance of string overflow later. 
+ */ + if (unlikely(pb->pb_jobid[LUSTRE_JOBID_SIZE - 1] != '\0')) + pb->pb_jobid[LUSTRE_JOBID_SIZE - 1] = '\0'; + + return pb->pb_jobid; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_jobid); + +__u32 lustre_msg_get_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return msg->lm_cksum; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u64 lustre_msg_get_mbits(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_mbits; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, __u32 buf) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_buf_v2(msg, buf, 0); + __u32 len = lustre_msg_buflen(msg, buf); + __u32 crc; + +#if IS_ENABLED(CONFIG_CRC32) + /* about 10x faster than crypto_hash for small buffers */ + crc = crc32_le(~(__u32)0, (unsigned char *)pb, len); +#elif IS_ENABLED(CONFIG_CRYPTO_CRC32) + unsigned int hsize = 4; + + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb, + len, NULL, 0, (unsigned char *)&crc, + &hsize); +#else +#error "need either CONFIG_CRC32 or CONFIG_CRYPTO_CRC32 enabled in the kernel" +#endif + return crc; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_handle = *handle; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_type = type; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_opc = opc; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_last_xid = last_xid; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_last_xid); + +void lustre_msg_set_tag(struct lustre_msg *msg, __u16 tag) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_tag = tag; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_tag); + 
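The CONFIG_CRC32 branch of lustre_msg_calc_cksum() above seeds crc32_le() with ~0 and stores the result directly in lm_cksum. As a rough standalone sketch only (not part of this patch; crc32_le_sketch and the sample buffer are invented for illustration, and it assumes crc32_le() is the usual reflected CRC-32 with no final inversion), a peer verifying the checksum would have to reproduce the same arithmetic over the same ptlrpc body bytes:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* bitwise reflected CRC-32 (polynomial 0xEDB88320), seed supplied by the
 * caller, no final inversion -- mirroring crc32_le(~(__u32)0, buf, len) */
static uint32_t crc32_le_sketch(uint32_t crc, const unsigned char *p, size_t len)
{
	size_t i;
	int bit;

	for (i = 0; i < len; i++) {
		crc ^= p[i];
		for (bit = 0; bit < 8; bit++)
			crc = (crc >> 1) ^ ((crc & 1u) ? 0xEDB88320u : 0u);
	}
	return crc;
}

int main(void)
{
	/* stand-in for the ptlrpc body buffer passed to lustre_msg_calc_cksum() */
	const char body[] = "ptlrpc body bytes";
	uint32_t crc = crc32_le_sketch(~0u, (const unsigned char *)body,
				       strlen(body));

	printf("checksum: %08x\n", (unsigned int)crc);
	return 0;
}

Whether the kernel path uses a table-driven CRC internally is an implementation detail; only the resulting 32-bit value has to match on both ends.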
+void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_last_committed = last_committed; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_pre_versions[0] = versions[0]; + pb->pb_pre_versions[1] = versions[1]; + pb->pb_pre_versions[2] = versions[2]; + pb->pb_pre_versions[3] = versions[3]; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_versions); + +void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_transno = transno; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_transno); + +void lustre_msg_set_status(struct lustre_msg *msg, __u32 status) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_status = status; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_status); + +void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_conn_cnt = conn_cnt; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_timeout(struct lustre_msg *msg, timeout_t timeout) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + LASSERT(timeout >= 0); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_timeout = timeout; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_service_timeout(struct lustre_msg *msg, + timeout_t service_timeout) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + LASSERT(service_timeout >= 0); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_service_time = service_timeout; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + __u32 opc = lustre_msg_get_opc(msg); + struct ptlrpc_body *pb; + + /* Don't set jobid for ldlm ast RPCs, they've been shrinked. + * See the comment in ptlrpc_request_pack(). 
*/ + if (!opc || opc == LDLM_BL_CALLBACK || + opc == LDLM_CP_CALLBACK || opc == LDLM_GL_CALLBACK) + return; + + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body)); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + + if (jobid != NULL) + memcpy(pb->pb_jobid, jobid, sizeof(pb->pb_jobid)); + else if (pb->pb_jobid[0] == '\0') + lustre_get_jobid(pb->pb_jobid, sizeof(pb->pb_jobid)); + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_jobid); + +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + msg->lm_cksum = cksum; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_mbits(struct lustre_msg *msg, __u64 mbits) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_mbits = mbits; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void ptlrpc_request_set_replen(struct ptlrpc_request *req) +{ + int count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER); + + req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, + req->rq_pill.rc_area[RCL_SERVER]); + if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) + req->rq_reqmsg->lm_repsize = req->rq_replen; +} +EXPORT_SYMBOL(ptlrpc_request_set_replen); + +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *lens) +{ + req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens); + if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) + req->rq_reqmsg->lm_repsize = req->rq_replen; +} + +/** + * Send a remote set_info_async. + * + * This may go from client to server or server to client. + */ +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + size_t keylen, void *key, + size_t vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + char *tmp; + int rc; + + ENTRY; + + req = ptlrpc_request_alloc(imp, KEY_IS(KEY_CHANGELOG_CLEAR) ? + &RQF_MDT_SET_INFO : + &RQF_OBD_SET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT, vallen); + rc = ptlrpc_request_pack(req, version, opcode); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (KEY_IS(KEY_CHANGELOG_CLEAR)) + do_pack_body(req); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + memcpy(tmp, val, vallen); + + ptlrpc_request_set_replen(req); + + if (set) { + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + } else { + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + } + + RETURN(rc); +} +EXPORT_SYMBOL(do_set_info_async); + +/* byte flipping routines for all wire types declared in + * lustre_idl.h implemented here. 
+ */ +void lustre_swab_ptlrpc_body(struct ptlrpc_body *body) +{ + __swab32s(&body->pb_type); + __swab32s(&body->pb_version); + __swab32s(&body->pb_opc); + __swab32s(&body->pb_status); + __swab64s(&body->pb_last_xid); + __swab16s(&body->pb_tag); + BUILD_BUG_ON(offsetof(typeof(*body), pb_padding0) == 0); + BUILD_BUG_ON(offsetof(typeof(*body), pb_padding1) == 0); + __swab64s(&body->pb_last_committed); + __swab64s(&body->pb_transno); + __swab32s(&body->pb_flags); + __swab32s(&body->pb_op_flags); + __swab32s(&body->pb_conn_cnt); + __swab32s(&body->pb_timeout); + __swab32s(&body->pb_service_time); + __swab32s(&body->pb_limit); + __swab64s(&body->pb_slv); + __swab64s(&body->pb_pre_versions[0]); + __swab64s(&body->pb_pre_versions[1]); + __swab64s(&body->pb_pre_versions[2]); + __swab64s(&body->pb_pre_versions[3]); + __swab64s(&body->pb_mbits); + BUILD_BUG_ON(offsetof(typeof(*body), pb_padding64_0) == 0); + BUILD_BUG_ON(offsetof(typeof(*body), pb_padding64_1) == 0); + BUILD_BUG_ON(offsetof(typeof(*body), pb_padding64_2) == 0); + /* + * While we need to maintain compatibility between + * clients and servers without ptlrpc_body_v2 (< 2.3) + * do not swab any fields beyond pb_jobid, as we are + * using this swab function for both ptlrpc_body + * and ptlrpc_body_v2. + */ + /* pb_jobid is an ASCII string and should not be swabbed */ + BUILD_BUG_ON(offsetof(typeof(*body), pb_jobid) == 0); +} + +void lustre_swab_connect(struct obd_connect_data *ocd) +{ + __swab64s(&ocd->ocd_connect_flags); + __swab32s(&ocd->ocd_version); + __swab32s(&ocd->ocd_grant); + __swab64s(&ocd->ocd_ibits_known); + __swab32s(&ocd->ocd_index); + __swab32s(&ocd->ocd_brw_size); + /* + * ocd_blocksize and ocd_inodespace don't need to be swabbed because + * they are 8-byte values + */ + __swab16s(&ocd->ocd_grant_tax_kb); + __swab32s(&ocd->ocd_grant_max_blks); + __swab64s(&ocd->ocd_transno); + __swab32s(&ocd->ocd_group); + __swab32s(&ocd->ocd_cksum_types); + __swab32s(&ocd->ocd_instance); + /* + * Fields after ocd_cksum_types are only accessible by the receiver + * if the corresponding flag in ocd_connect_flags is set. Accessing + * any field after ocd_maxbytes on the receiver without a valid flag + * may result in out-of-bound memory access and kernel oops. 
+ */ + if (ocd->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE) + __swab32s(&ocd->ocd_max_easize); + if (ocd->ocd_connect_flags & OBD_CONNECT_MAXBYTES) + __swab64s(&ocd->ocd_maxbytes); + if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) + __swab16s(&ocd->ocd_maxmodrpcs); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding0) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding1) == 0); + if (ocd->ocd_connect_flags & OBD_CONNECT_FLAGS2) + __swab64s(&ocd->ocd_connect_flags2); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding3) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding4) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding5) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding6) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding7) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding8) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding9) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), paddingA) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), paddingB) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), paddingC) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), paddingD) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), paddingE) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), paddingF) == 0); +} + +static void lustre_swab_ost_layout(struct ost_layout *ol) +{ + __swab32s(&ol->ol_stripe_size); + __swab32s(&ol->ol_stripe_count); + __swab64s(&ol->ol_comp_start); + __swab64s(&ol->ol_comp_end); + __swab32s(&ol->ol_comp_id); +} + +void lustre_swab_obdo(struct obdo *o) +{ + __swab64s(&o->o_valid); + lustre_swab_ost_id(&o->o_oi); + __swab64s(&o->o_parent_seq); + __swab64s(&o->o_size); + __swab64s(&o->o_mtime); + __swab64s(&o->o_atime); + __swab64s(&o->o_ctime); + __swab64s(&o->o_blocks); + __swab64s(&o->o_grant); + __swab32s(&o->o_blksize); + __swab32s(&o->o_mode); + __swab32s(&o->o_uid); + __swab32s(&o->o_gid); + __swab32s(&o->o_flags); + __swab32s(&o->o_nlink); + __swab32s(&o->o_parent_oid); + __swab32s(&o->o_misc); + __swab64s(&o->o_ioepoch); + __swab32s(&o->o_stripe_idx); + __swab32s(&o->o_parent_ver); + lustre_swab_ost_layout(&o->o_layout); + __swab32s(&o->o_layout_version); + __swab32s(&o->o_uid_h); + __swab32s(&o->o_gid_h); + __swab64s(&o->o_data_version); + __swab32s(&o->o_projid); + BUILD_BUG_ON(offsetof(typeof(*o), o_padding_4) == 0); + BUILD_BUG_ON(offsetof(typeof(*o), o_padding_5) == 0); + BUILD_BUG_ON(offsetof(typeof(*o), o_padding_6) == 0); + +} +EXPORT_SYMBOL(lustre_swab_obdo); + +void lustre_swab_obd_statfs(struct obd_statfs *os) +{ + __swab64s(&os->os_type); + __swab64s(&os->os_blocks); + __swab64s(&os->os_bfree); + __swab64s(&os->os_bavail); + __swab64s(&os->os_files); + __swab64s(&os->os_ffree); + /* no need to swab os_fsid */ + __swab32s(&os->os_bsize); + __swab32s(&os->os_namelen); + __swab64s(&os->os_maxbytes); + __swab32s(&os->os_state); + __swab32s(&os->os_fprecreated); + __swab32s(&os->os_granted); + BUILD_BUG_ON(offsetof(typeof(*os), os_spare3) == 0); + BUILD_BUG_ON(offsetof(typeof(*os), os_spare4) == 0); + BUILD_BUG_ON(offsetof(typeof(*os), os_spare5) == 0); + BUILD_BUG_ON(offsetof(typeof(*os), os_spare6) == 0); + BUILD_BUG_ON(offsetof(typeof(*os), os_spare7) == 0); + BUILD_BUG_ON(offsetof(typeof(*os), os_spare8) == 0); + BUILD_BUG_ON(offsetof(typeof(*os), os_spare9) == 0); +} + +void lustre_swab_obd_ioobj(struct obd_ioobj *ioo) +{ + lustre_swab_ost_id(&ioo->ioo_oid); + __swab32s(&ioo->ioo_max_brw); + __swab32s(&ioo->ioo_bufcnt); +} + +void lustre_swab_niobuf_remote(struct niobuf_remote *nbr) +{ + __swab64s(&nbr->rnb_offset); + __swab32s(&nbr->rnb_len); + __swab32s(&nbr->rnb_flags); +} + +void 
lustre_swab_ost_body(struct ost_body *b) +{ + lustre_swab_obdo(&b->oa); +} + +void lustre_swab_ost_last_id(u64 *id) +{ + __swab64s(id); +} + +void lustre_swab_generic_32s(__u32 *val) +{ + __swab32s(val); +} + +void lustre_swab_gl_lquota_desc(struct ldlm_gl_lquota_desc *desc) +{ + lustre_swab_lu_fid(&desc->gl_id.qid_fid); + __swab64s(&desc->gl_flags); + __swab64s(&desc->gl_ver); + __swab64s(&desc->gl_hardlimit); + __swab64s(&desc->gl_softlimit); + __swab64s(&desc->gl_time); + BUILD_BUG_ON(offsetof(typeof(*desc), gl_pad2) == 0); +} +EXPORT_SYMBOL(lustre_swab_gl_lquota_desc); + +void lustre_swab_gl_barrier_desc(struct ldlm_gl_barrier_desc *desc) +{ + __swab32s(&desc->lgbd_status); + __swab32s(&desc->lgbd_timeout); + BUILD_BUG_ON(offsetof(typeof(*desc), lgbd_padding) == 0); +} +EXPORT_SYMBOL(lustre_swab_gl_barrier_desc); + +void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb) +{ + __swab64s(&lvb->lvb_size); + __swab64s(&lvb->lvb_mtime); + __swab64s(&lvb->lvb_atime); + __swab64s(&lvb->lvb_ctime); + __swab64s(&lvb->lvb_blocks); +} +EXPORT_SYMBOL(lustre_swab_ost_lvb_v1); + +void lustre_swab_ost_lvb(struct ost_lvb *lvb) +{ + __swab64s(&lvb->lvb_size); + __swab64s(&lvb->lvb_mtime); + __swab64s(&lvb->lvb_atime); + __swab64s(&lvb->lvb_ctime); + __swab64s(&lvb->lvb_blocks); + __swab32s(&lvb->lvb_mtime_ns); + __swab32s(&lvb->lvb_atime_ns); + __swab32s(&lvb->lvb_ctime_ns); + __swab32s(&lvb->lvb_padding); +} +EXPORT_SYMBOL(lustre_swab_ost_lvb); + +void lustre_swab_lquota_lvb(struct lquota_lvb *lvb) +{ + __swab64s(&lvb->lvb_flags); + __swab64s(&lvb->lvb_id_may_rel); + __swab64s(&lvb->lvb_id_rel); + __swab64s(&lvb->lvb_id_qunit); + __swab64s(&lvb->lvb_pad1); +} +EXPORT_SYMBOL(lustre_swab_lquota_lvb); + +void lustre_swab_barrier_lvb(struct barrier_lvb *lvb) +{ + __swab32s(&lvb->lvb_status); + __swab32s(&lvb->lvb_index); + BUILD_BUG_ON(offsetof(typeof(*lvb), lvb_padding) == 0); +} +EXPORT_SYMBOL(lustre_swab_barrier_lvb); + +void lustre_swab_mdt_body(struct mdt_body *b) +{ + lustre_swab_lu_fid(&b->mbo_fid1); + lustre_swab_lu_fid(&b->mbo_fid2); + /* handle is opaque */ + __swab64s(&b->mbo_valid); + __swab64s(&b->mbo_size); + __swab64s(&b->mbo_mtime); + __swab64s(&b->mbo_atime); + __swab64s(&b->mbo_ctime); + __swab64s(&b->mbo_blocks); + __swab64s(&b->mbo_version); + __swab64s(&b->mbo_t_state); + __swab32s(&b->mbo_fsuid); + __swab32s(&b->mbo_fsgid); + __swab32s(&b->mbo_capability); + __swab32s(&b->mbo_mode); + __swab32s(&b->mbo_uid); + __swab32s(&b->mbo_gid); + __swab32s(&b->mbo_flags); + __swab32s(&b->mbo_rdev); + __swab32s(&b->mbo_nlink); + __swab32s(&b->mbo_layout_gen); + __swab32s(&b->mbo_suppgid); + __swab32s(&b->mbo_eadatasize); + __swab32s(&b->mbo_aclsize); + __swab32s(&b->mbo_max_mdsize); + BUILD_BUG_ON(offsetof(typeof(*b), mbo_unused3) == 0); + __swab32s(&b->mbo_uid_h); + __swab32s(&b->mbo_gid_h); + __swab32s(&b->mbo_projid); + __swab64s(&b->mbo_dom_size); + __swab64s(&b->mbo_dom_blocks); + __swab64s(&b->mbo_btime); + BUILD_BUG_ON(offsetof(typeof(*b), mbo_padding_9) == 0); + BUILD_BUG_ON(offsetof(typeof(*b), mbo_padding_10) == 0); +} + +void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) +{ + /* mio_open_handle is opaque */ + BUILD_BUG_ON(offsetof(typeof(*b), mio_unused1) == 0); + BUILD_BUG_ON(offsetof(typeof(*b), mio_unused2) == 0); + BUILD_BUG_ON(offsetof(typeof(*b), mio_padding) == 0); +} + +void lustre_swab_mgs_target_info(struct mgs_target_info *mti) +{ + int i; + + __swab32s(&mti->mti_lustre_ver); + __swab32s(&mti->mti_stripe_index); + __swab32s(&mti->mti_config_ver); + __swab32s(&mti->mti_flags); 
+ __swab32s(&mti->mti_instance); + __swab32s(&mti->mti_nid_count); + BUILD_BUG_ON(sizeof(lnet_nid_t) != sizeof(__u64)); + for (i = 0; i < MTI_NIDS_MAX; i++) + __swab64s(&mti->mti_nids[i]); +} + +void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry) +{ + __u8 i; + + __swab64s(&entry->mne_version); + __swab32s(&entry->mne_instance); + __swab32s(&entry->mne_index); + __swab32s(&entry->mne_length); + + /* mne_nid_(count|type) must be one byte size because we're gonna + * access it w/o swapping. */ + BUILD_BUG_ON(sizeof(entry->mne_nid_count) != sizeof(__u8)); + BUILD_BUG_ON(sizeof(entry->mne_nid_type) != sizeof(__u8)); + + /* remove this assertion if ipv6 is supported. */ + LASSERT(entry->mne_nid_type == 0); + for (i = 0; i < entry->mne_nid_count; i++) { + BUILD_BUG_ON(sizeof(lnet_nid_t) != sizeof(__u64)); + __swab64s(&entry->u.nids[i]); + } +} +EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry); + +void lustre_swab_mgs_config_body(struct mgs_config_body *body) +{ + __swab64s(&body->mcb_offset); + __swab32s(&body->mcb_units); + __swab16s(&body->mcb_type); +} + +void lustre_swab_mgs_config_res(struct mgs_config_res *body) +{ + __swab64s(&body->mcr_offset); + __swab64s(&body->mcr_size); +} + +static void lustre_swab_obd_dqinfo(struct obd_dqinfo *i) +{ + __swab64s(&i->dqi_bgrace); + __swab64s(&i->dqi_igrace); + __swab32s(&i->dqi_flags); + __swab32s(&i->dqi_valid); +} + +static void lustre_swab_obd_dqblk(struct obd_dqblk *b) +{ + __swab64s(&b->dqb_ihardlimit); + __swab64s(&b->dqb_isoftlimit); + __swab64s(&b->dqb_curinodes); + __swab64s(&b->dqb_bhardlimit); + __swab64s(&b->dqb_bsoftlimit); + __swab64s(&b->dqb_curspace); + __swab64s(&b->dqb_btime); + __swab64s(&b->dqb_itime); + __swab32s(&b->dqb_valid); + BUILD_BUG_ON(offsetof(typeof(*b), dqb_padding) == 0); +} + +int lustre_swab_obd_quotactl(struct obd_quotactl *q, __u32 len) +{ + if (unlikely(len <= sizeof(struct obd_quotactl))) + return -EOVERFLOW; + + __swab32s(&q->qc_cmd); + __swab32s(&q->qc_type); + __swab32s(&q->qc_id); + __swab32s(&q->qc_stat); + lustre_swab_obd_dqinfo(&q->qc_dqinfo); + lustre_swab_obd_dqblk(&q->qc_dqblk); + + return len; +} + +void lustre_swab_fid2path(struct getinfo_fid2path *gf) +{ + lustre_swab_lu_fid(&gf->gf_fid); + __swab64s(&gf->gf_recno); + __swab32s(&gf->gf_linkno); + __swab32s(&gf->gf_pathlen); +} +EXPORT_SYMBOL(lustre_swab_fid2path); + +static void lustre_swab_fiemap_extent(struct fiemap_extent *fm_extent) +{ + __swab64s(&fm_extent->fe_logical); + __swab64s(&fm_extent->fe_physical); + __swab64s(&fm_extent->fe_length); + __swab32s(&fm_extent->fe_flags); + __swab32s(&fm_extent->fe_device); +} + +static void lustre_swab_fiemap_hdr(struct fiemap *fiemap) +{ + __swab64s(&fiemap->fm_start); + __swab64s(&fiemap->fm_length); + __swab32s(&fiemap->fm_flags); + __swab32s(&fiemap->fm_mapped_extents); + __swab32s(&fiemap->fm_extent_count); + __swab32s(&fiemap->fm_reserved); +} + +int lustre_swab_fiemap(struct fiemap *fiemap, __u32 len) +{ + __u32 i, size, count; + + lustre_swab_fiemap_hdr(fiemap); + + size = fiemap_count_to_size(fiemap->fm_mapped_extents); + count = fiemap->fm_mapped_extents; + if (unlikely(size > len)) { + count = (len - sizeof(struct fiemap)) / + sizeof(struct fiemap_extent); + fiemap->fm_mapped_extents = count; + size = -EOVERFLOW; + } + /* still swab extents as we cannot yet pass rc to callers */ + for (i = 0; i < count; i++) + lustre_swab_fiemap_extent(&fiemap->fm_extents[i]); + + return size; +} + +void lustre_swab_fiemap_info_key(struct ll_fiemap_info_key *fiemap_info) +{ + 
lustre_swab_obdo(&fiemap_info->lfik_oa); + lustre_swab_fiemap_hdr(&fiemap_info->lfik_fiemap); +} + +void lustre_swab_idx_info(struct idx_info *ii) +{ + __swab32s(&ii->ii_magic); + __swab32s(&ii->ii_flags); + __swab16s(&ii->ii_count); + __swab32s(&ii->ii_attrs); + lustre_swab_lu_fid(&ii->ii_fid); + __swab64s(&ii->ii_version); + __swab64s(&ii->ii_hash_start); + __swab64s(&ii->ii_hash_end); + __swab16s(&ii->ii_keysize); + __swab16s(&ii->ii_recsize); +} + +void lustre_swab_lip_header(struct lu_idxpage *lip) +{ + /* swab header */ + __swab32s(&lip->lip_magic); + __swab16s(&lip->lip_flags); + __swab16s(&lip->lip_nr); +} +EXPORT_SYMBOL(lustre_swab_lip_header); + +void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr) +{ + __swab32s(&rr->rr_opcode); + __swab32s(&rr->rr_cap); + __swab32s(&rr->rr_fsuid); + /* rr_fsuid_h is unused */ + __swab32s(&rr->rr_fsgid); + /* rr_fsgid_h is unused */ + __swab32s(&rr->rr_suppgid1); + /* rr_suppgid1_h is unused */ + __swab32s(&rr->rr_suppgid2); + /* rr_suppgid2_h is unused */ + lustre_swab_lu_fid(&rr->rr_fid1); + lustre_swab_lu_fid(&rr->rr_fid2); + __swab64s(&rr->rr_mtime); + __swab64s(&rr->rr_atime); + __swab64s(&rr->rr_ctime); + __swab64s(&rr->rr_size); + __swab64s(&rr->rr_blocks); + __swab32s(&rr->rr_bias); + __swab32s(&rr->rr_mode); + __swab32s(&rr->rr_flags); + __swab32s(&rr->rr_flags_h); + __swab32s(&rr->rr_umask); + __swab16s(&rr->rr_mirror_id); + + BUILD_BUG_ON(offsetof(typeof(*rr), rr_padding_4) == 0); +}; + +void lustre_swab_lov_desc(struct lov_desc *ld) +{ + __swab32s(&ld->ld_tgt_count); + __swab32s(&ld->ld_active_tgt_count); + __swab32s(&ld->ld_default_stripe_count); + __swab32s(&ld->ld_pattern); + __swab64s(&ld->ld_default_stripe_size); + __swab64s(&ld->ld_default_stripe_offset); + __swab32s(&ld->ld_qos_maxage); + /* uuid endian insensitive */ +} +EXPORT_SYMBOL(lustre_swab_lov_desc); + +void lustre_swab_lmv_desc(struct lmv_desc *ld) +{ + __swab32s(&ld->ld_tgt_count); + __swab32s(&ld->ld_active_tgt_count); + __swab32s(&ld->ld_default_stripe_count); + __swab32s(&ld->ld_pattern); + __swab64s(&ld->ld_default_hash_size); + __swab32s(&ld->ld_qos_maxage); + /* uuid endian insensitive */ +} + +/* This structure is always in little-endian */ +static void lustre_swab_lmv_mds_md_v1(struct lmv_mds_md_v1 *lmm1) +{ + int i; + + __swab32s(&lmm1->lmv_magic); + __swab32s(&lmm1->lmv_stripe_count); + __swab32s(&lmm1->lmv_master_mdt_index); + __swab32s(&lmm1->lmv_hash_type); + __swab32s(&lmm1->lmv_layout_version); + for (i = 0; i < lmm1->lmv_stripe_count; i++) + lustre_swab_lu_fid(&lmm1->lmv_stripe_fids[i]); +} + +void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm) +{ + switch (lmm->lmv_magic) { + case LMV_MAGIC_V1: + lustre_swab_lmv_mds_md_v1(&lmm->lmv_md_v1); + break; + default: + break; + } +} +EXPORT_SYMBOL(lustre_swab_lmv_mds_md); + +void lustre_swab_lmv_user_md_objects(struct lmv_user_mds_data *lmd, + int stripe_count) +{ + int i; + + for (i = 0; i < stripe_count; i++) + __swab32s(&(lmd[i].lum_mds)); +} +EXPORT_SYMBOL(lustre_swab_lmv_user_md_objects); + + +void lustre_swab_lmv_user_md(struct lmv_user_md *lum) +{ + __u32 count; + + if (lum->lum_magic == LMV_MAGIC_FOREIGN) { + __swab32s(&lum->lum_magic); + __swab32s(&((struct lmv_foreign_md *)lum)->lfm_length); + __swab32s(&((struct lmv_foreign_md *)lum)->lfm_type); + __swab32s(&((struct lmv_foreign_md *)lum)->lfm_flags); + return; + } + + count = lum->lum_stripe_count; + __swab32s(&lum->lum_magic); + __swab32s(&lum->lum_stripe_count); + __swab32s(&lum->lum_stripe_offset); + __swab32s(&lum->lum_hash_type); + 
__swab32s(&lum->lum_type); + /* lum_max_inherit and lum_max_inherit_rr do not need to be swabbed */ + BUILD_BUG_ON(offsetof(typeof(*lum), lum_padding1) == 0); + BUILD_BUG_ON(offsetof(typeof(*lum), lum_padding2) == 0); + BUILD_BUG_ON(offsetof(typeof(*lum), lum_padding3) == 0); + switch (lum->lum_magic) { + case LMV_USER_MAGIC_SPECIFIC: + count = lum->lum_stripe_count; + fallthrough; + case __swab32(LMV_USER_MAGIC_SPECIFIC): + lustre_swab_lmv_user_md_objects(lum->lum_objects, count); + break; + default: + break; + } +} +EXPORT_SYMBOL(lustre_swab_lmv_user_md); + +static void lustre_print_v1v3(unsigned int lvl, struct lov_user_md *lum, + const char *msg) +{ + CDEBUG(lvl, "%s lov_user_md %p:\n", msg, lum); + CDEBUG(lvl, "\tlmm_magic: %#x\n", lum->lmm_magic); + CDEBUG(lvl, "\tlmm_pattern: %#x\n", lum->lmm_pattern); + CDEBUG(lvl, "\tlmm_object_id: %llu\n", lmm_oi_id(&lum->lmm_oi)); + CDEBUG(lvl, "\tlmm_object_gr: %llu\n", lmm_oi_seq(&lum->lmm_oi)); + CDEBUG(lvl, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size); + CDEBUG(lvl, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count); + CDEBUG(lvl, "\tlmm_stripe_offset/lmm_layout_gen: %#x\n", + lum->lmm_stripe_offset); + if (lum->lmm_magic == LOV_USER_MAGIC_V3) { + struct lov_user_md_v3 *v3 = (void *)lum; + CDEBUG(lvl, "\tlmm_pool_name: %s\n", v3->lmm_pool_name); + } + if (lum->lmm_magic == LOV_USER_MAGIC_SPECIFIC) { + struct lov_user_md_v3 *v3 = (void *)lum; + int i; + + if (v3->lmm_pool_name[0] != '\0') + CDEBUG(lvl, "\tlmm_pool_name: %s\n", v3->lmm_pool_name); + + CDEBUG(lvl, "\ttarget list:\n"); + for (i = 0; i < v3->lmm_stripe_count; i++) + CDEBUG(lvl, "\t\t%u\n", v3->lmm_objects[i].l_ost_idx); + } +} + +void lustre_print_user_md(unsigned int lvl, struct lov_user_md *lum, + const char *msg) +{ + struct lov_comp_md_v1 *comp_v1; + int i; + + if (likely(!cfs_cdebug_show(lvl, DEBUG_SUBSYSTEM))) + return; + + if (lum->lmm_magic == LOV_USER_MAGIC_V1 || + lum->lmm_magic == LOV_USER_MAGIC_V3) { + lustre_print_v1v3(lvl, lum, msg); + return; + } + + if (lum->lmm_magic != LOV_USER_MAGIC_COMP_V1) { + CDEBUG(lvl, "%s: bad magic: %x\n", msg, lum->lmm_magic); + return; + } + + comp_v1 = (struct lov_comp_md_v1 *)lum; + CDEBUG(lvl, "%s: lov_comp_md_v1 %p:\n", msg, lum); + CDEBUG(lvl, "\tlcm_magic: %#x\n", comp_v1->lcm_magic); + CDEBUG(lvl, "\tlcm_size: %#x\n", comp_v1->lcm_size); + CDEBUG(lvl, "\tlcm_layout_gen: %#x\n", comp_v1->lcm_layout_gen); + CDEBUG(lvl, "\tlcm_flags: %#x\n", comp_v1->lcm_flags); + CDEBUG(lvl, "\tlcm_entry_count: %#x\n\n", comp_v1->lcm_entry_count); + CDEBUG(lvl, "\tlcm_mirror_count: %#x\n\n", comp_v1->lcm_mirror_count); + + for (i = 0; i < comp_v1->lcm_entry_count; i++) { + struct lov_comp_md_entry_v1 *ent = &comp_v1->lcm_entries[i]; + struct lov_user_md *v1; + + CDEBUG(lvl, "\tentry %d:\n", i); + CDEBUG(lvl, "\tlcme_id: %#x\n", ent->lcme_id); + CDEBUG(lvl, "\tlcme_flags: %#x\n", ent->lcme_flags); + if (ent->lcme_flags & LCME_FL_NOSYNC) + CDEBUG(lvl, "\tlcme_timestamp: %llu\n", + ent->lcme_timestamp); + CDEBUG(lvl, "\tlcme_extent.e_start: %llu\n", + ent->lcme_extent.e_start); + CDEBUG(lvl, "\tlcme_extent.e_end: %llu\n", + ent->lcme_extent.e_end); + CDEBUG(lvl, "\tlcme_offset: %#x\n", ent->lcme_offset); + CDEBUG(lvl, "\tlcme_size: %#x\n\n", ent->lcme_size); + + v1 = (struct lov_user_md *)((char *)comp_v1 + + comp_v1->lcm_entries[i].lcme_offset); + lustre_print_v1v3(lvl, v1, msg); + } +} +EXPORT_SYMBOL(lustre_print_user_md); + +static void lustre_swab_lmm_oi(struct ost_id *oi) +{ + __swab64s(&oi->oi.oi_id); + __swab64s(&oi->oi.oi_seq); +} + 
+static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum) +{ + ENTRY; + __swab32s(&lum->lmm_magic); + __swab32s(&lum->lmm_pattern); + lustre_swab_lmm_oi(&lum->lmm_oi); + __swab32s(&lum->lmm_stripe_size); + __swab16s(&lum->lmm_stripe_count); + __swab16s(&lum->lmm_stripe_offset); + EXIT; +} + +void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n"); + lustre_swab_lov_user_md_common(lum); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_v1); + +void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n"); + lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum); + /* lmm_pool_name nothing to do with char */ + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_v3); + +void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum) +{ + struct lov_comp_md_entry_v1 *ent; + struct lov_user_md_v1 *v1; + struct lov_user_md_v3 *v3; + int i; + bool cpu_endian; + __u32 off, size; + __u16 ent_count, stripe_count; + ENTRY; + + cpu_endian = lum->lcm_magic == LOV_USER_MAGIC_COMP_V1; + ent_count = lum->lcm_entry_count; + if (!cpu_endian) + __swab16s(&ent_count); + + CDEBUG(D_IOCTL, "swabbing lov_user_comp_md v1\n"); + __swab32s(&lum->lcm_magic); + __swab32s(&lum->lcm_size); + __swab32s(&lum->lcm_layout_gen); + __swab16s(&lum->lcm_flags); + __swab16s(&lum->lcm_entry_count); + __swab16s(&lum->lcm_mirror_count); + BUILD_BUG_ON(offsetof(typeof(*lum), lcm_padding1) == 0); + BUILD_BUG_ON(offsetof(typeof(*lum), lcm_padding2) == 0); + + for (i = 0; i < ent_count; i++) { + ent = &lum->lcm_entries[i]; + off = ent->lcme_offset; + size = ent->lcme_size; + + if (!cpu_endian) { + __swab32s(&off); + __swab32s(&size); + } + __swab32s(&ent->lcme_id); + __swab32s(&ent->lcme_flags); + __swab64s(&ent->lcme_timestamp); + __swab64s(&ent->lcme_extent.e_start); + __swab64s(&ent->lcme_extent.e_end); + __swab32s(&ent->lcme_offset); + __swab32s(&ent->lcme_size); + __swab32s(&ent->lcme_layout_gen); + BUILD_BUG_ON(offsetof(typeof(*ent), lcme_padding_1) == 0); + + v1 = (struct lov_user_md_v1 *)((char *)lum + off); + stripe_count = v1->lmm_stripe_count; + if (!cpu_endian) + __swab16s(&stripe_count); + + if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_V1) || + v1->lmm_magic == LOV_USER_MAGIC_V1) { + lustre_swab_lov_user_md_v1(v1); + if (size > sizeof(*v1)) + lustre_swab_lov_user_md_objects(v1->lmm_objects, + stripe_count); + } else if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_V3) || + v1->lmm_magic == LOV_USER_MAGIC_V3 || + v1->lmm_magic == __swab32(LOV_USER_MAGIC_SPECIFIC) || + v1->lmm_magic == LOV_USER_MAGIC_SPECIFIC) { + v3 = (struct lov_user_md_v3 *)v1; + lustre_swab_lov_user_md_v3(v3); + if (size > sizeof(*v3)) + lustre_swab_lov_user_md_objects(v3->lmm_objects, + stripe_count); + } else { + CERROR("Invalid magic %#x\n", v1->lmm_magic); + } + } +} +EXPORT_SYMBOL(lustre_swab_lov_comp_md_v1); + +void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count) +{ + int i; + + ENTRY; + for (i = 0; i < stripe_count; i++) { + lustre_swab_ost_id(&(lod[i].l_ost_oi)); + __swab32s(&(lod[i].l_ost_gen)); + __swab32s(&(lod[i].l_ost_idx)); + } + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); + +void lustre_swab_lov_user_md(struct lov_user_md *lum, size_t size) +{ + struct lov_user_md_v1 *v1; + struct lov_user_md_v3 *v3; + struct lov_foreign_md *lfm; + __u16 stripe_count; + ENTRY; + + CDEBUG(D_IOCTL, "swabbing lov_user_md\n"); + switch (lum->lmm_magic) { + case __swab32(LOV_MAGIC_V1): 
+ case LOV_USER_MAGIC_V1: + { + v1 = (struct lov_user_md_v1 *)lum; + stripe_count = v1->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_V1) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v1(v1); + if (size > sizeof(*v1)) + lustre_swab_lov_user_md_objects(v1->lmm_objects, + stripe_count); + + break; + } + case __swab32(LOV_MAGIC_V3): + case LOV_USER_MAGIC_V3: + { + v3 = (struct lov_user_md_v3 *)lum; + stripe_count = v3->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_V3) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v3(v3); + if (size > sizeof(*v3)) + lustre_swab_lov_user_md_objects(v3->lmm_objects, + stripe_count); + break; + } + case __swab32(LOV_USER_MAGIC_SPECIFIC): + case LOV_USER_MAGIC_SPECIFIC: + { + v3 = (struct lov_user_md_v3 *)lum; + stripe_count = v3->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_SPECIFIC) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v3(v3); + lustre_swab_lov_user_md_objects(v3->lmm_objects, stripe_count); + break; + } + case __swab32(LOV_MAGIC_COMP_V1): + case LOV_USER_MAGIC_COMP_V1: + lustre_swab_lov_comp_md_v1((struct lov_comp_md_v1 *)lum); + break; + case __swab32(LOV_MAGIC_FOREIGN): + case LOV_USER_MAGIC_FOREIGN: + { + lfm = (struct lov_foreign_md *)lum; + __swab32s(&lfm->lfm_magic); + __swab32s(&lfm->lfm_length); + __swab32s(&lfm->lfm_type); + __swab32s(&lfm->lfm_flags); + break; + } + default: + CDEBUG(D_IOCTL, "Invalid LOV magic %08x\n", lum->lmm_magic); + } +} +EXPORT_SYMBOL(lustre_swab_lov_user_md); + +void lustre_swab_lov_mds_md(struct lov_mds_md *lmm) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_mds_md\n"); + __swab32s(&lmm->lmm_magic); + __swab32s(&lmm->lmm_pattern); + lustre_swab_lmm_oi(&lmm->lmm_oi); + __swab32s(&lmm->lmm_stripe_size); + __swab16s(&lmm->lmm_stripe_count); + __swab16s(&lmm->lmm_layout_gen); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_mds_md); + +void lustre_swab_ldlm_res_id(struct ldlm_res_id *id) +{ + int i; + + for (i = 0; i < RES_NAME_SIZE; i++) + __swab64s(&id->name[i]); +} + +void lustre_swab_ldlm_policy_data(union ldlm_wire_policy_data *d) +{ + /* the lock data is a union and the first two fields are always an + * extent so it's ok to process an LDLM_EXTENT and LDLM_FLOCK lock + * data the same way. 
+ */ + __swab64s(&d->l_extent.start); + __swab64s(&d->l_extent.end); + __swab64s(&d->l_extent.gid); + __swab64s(&d->l_flock.lfw_owner); + __swab32s(&d->l_flock.lfw_pid); +} + +void lustre_swab_ldlm_intent(struct ldlm_intent *i) +{ + __swab64s(&i->opc); +} + +void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r) +{ + __swab32s(&r->lr_type); + BUILD_BUG_ON(offsetof(typeof(*r), lr_pad) == 0); + lustre_swab_ldlm_res_id(&r->lr_name); +} + +void lustre_swab_ldlm_lock_desc(struct ldlm_lock_desc *l) +{ + lustre_swab_ldlm_resource_desc(&l->l_resource); + __swab32s(&l->l_req_mode); + __swab32s(&l->l_granted_mode); + lustre_swab_ldlm_policy_data(&l->l_policy_data); +} + +void lustre_swab_ldlm_request(struct ldlm_request *rq) +{ + __swab32s(&rq->lock_flags); + lustre_swab_ldlm_lock_desc(&rq->lock_desc); + __swab32s(&rq->lock_count); + /* lock_handle[] opaque */ +} + +void lustre_swab_ldlm_reply(struct ldlm_reply *r) +{ + __swab32s(&r->lock_flags); + BUILD_BUG_ON(offsetof(typeof(*r), lock_padding) == 0); + lustre_swab_ldlm_lock_desc(&r->lock_desc); + /* lock_handle opaque */ + __swab64s(&r->lock_policy_res1); + __swab64s(&r->lock_policy_res2); +} + +void lustre_swab_quota_body(struct quota_body *b) +{ + lustre_swab_lu_fid(&b->qb_fid); + lustre_swab_lu_fid((struct lu_fid *)&b->qb_id); + __swab32s(&b->qb_flags); + __swab64s(&b->qb_count); + __swab64s(&b->qb_usage); + __swab64s(&b->qb_slv_ver); +} + +/* Dump functions */ +void dump_ioo(struct obd_ioobj *ioo) +{ + CDEBUG(D_RPCTRACE, + "obd_ioobj: ioo_oid="DOSTID", ioo_max_brw=%#x, " + "ioo_bufct=%d\n", POSTID(&ioo->ioo_oid), ioo->ioo_max_brw, + ioo->ioo_bufcnt); +} + +void dump_rniobuf(struct niobuf_remote *nb) +{ + CDEBUG(D_RPCTRACE, "niobuf_remote: offset=%llu, len=%d, flags=%x\n", + nb->rnb_offset, nb->rnb_len, nb->rnb_flags); +} + +void dump_obdo(struct obdo *oa) +{ + u64 valid = oa->o_valid; + + CDEBUG(D_RPCTRACE, "obdo: o_valid = %#llx\n", valid); + if (valid & OBD_MD_FLID) + CDEBUG(D_RPCTRACE, "obdo: id = "DOSTID"\n", POSTID(&oa->o_oi)); + if (valid & OBD_MD_FLFID) + CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = %#llx\n", + oa->o_parent_seq); + if (valid & OBD_MD_FLSIZE) + CDEBUG(D_RPCTRACE, "obdo: o_size = %lld\n", oa->o_size); + if (valid & OBD_MD_FLMTIME) + CDEBUG(D_RPCTRACE, "obdo: o_mtime = %lld\n", oa->o_mtime); + if (valid & OBD_MD_FLATIME) + CDEBUG(D_RPCTRACE, "obdo: o_atime = %lld\n", oa->o_atime); + if (valid & OBD_MD_FLCTIME) + CDEBUG(D_RPCTRACE, "obdo: o_ctime = %lld\n", oa->o_ctime); + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + CDEBUG(D_RPCTRACE, "obdo: o_blocks = %lld\n", oa->o_blocks); + if (valid & OBD_MD_FLGRANT) + CDEBUG(D_RPCTRACE, "obdo: o_grant = %lld\n", oa->o_grant); + if (valid & OBD_MD_FLBLKSZ) + CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize); + if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) + CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n", + oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) | + (valid & OBD_MD_FLMODE ? 
~S_IFMT : 0))); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h); + if (valid & OBD_MD_FLFLAGS) + CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags); + if (valid & OBD_MD_FLNLINK) + CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink); + else if (valid & OBD_MD_FLCKSUM) + CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n", + oa->o_nlink); + if (valid & OBD_MD_FLPARENT) + CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n", + oa->o_parent_oid); + if (valid & OBD_MD_FLFID) { + CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n", + oa->o_stripe_idx); + CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n", + oa->o_parent_ver); + } + if (valid & OBD_MD_FLHANDLE) + CDEBUG(D_RPCTRACE, "obdo: o_handle = %lld\n", + oa->o_handle.cookie); +} + +void dump_ost_body(struct ost_body *ob) +{ + dump_obdo(&ob->oa); +} + +void dump_rcs(__u32 *rc) +{ + CDEBUG(D_RPCTRACE, "rmf_rcs: %d\n", *rc); +} + +static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req) +{ + LASSERT(req->rq_reqmsg); + + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return req_capsule_req_swabbed(&req->rq_pill, + MSG_PTLRPC_BODY_OFF); + default: + CERROR("bad lustre msg magic: %#08X\n", + req->rq_reqmsg->lm_magic); + } + return 0; +} + +static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req) +{ + if (unlikely(!req->rq_repmsg)) + return 0; + + switch (req->rq_repmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return req_capsule_rep_swabbed(&req->rq_pill, + MSG_PTLRPC_BODY_OFF); + default: + /* uninitialized yet */ + return 0; + } +} + +void _debug_req(struct ptlrpc_request *req, + struct libcfs_debug_msg_data *msgdata, const char *fmt, ...) +{ + bool req_ok = req->rq_reqmsg != NULL; + bool rep_ok = false; + struct lnet_nid *nid = NULL; + struct va_format vaf; + va_list args; + int rep_flags = -1; + int rep_status = -1; + + spin_lock(&req->rq_early_free_lock); + if (req->rq_repmsg) + rep_ok = true; + + if (req_capsule_req_need_swab(&req->rq_pill)) { + req_ok = req_ok && req_ptlrpc_body_swabbed(req); + rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req); + } + + if (rep_ok) { + rep_flags = lustre_msg_get_flags(req->rq_repmsg); + rep_status = lustre_msg_get_status(req->rq_repmsg); + } + spin_unlock(&req->rq_early_free_lock); + + if (req->rq_import && req->rq_import->imp_connection) + nid = &req->rq_import->imp_connection->c_peer.nid; + else if (req->rq_export && req->rq_export->exp_connection) + nid = &req->rq_export->exp_connection->c_peer.nid; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + libcfs_debug_msg(msgdata, + "%pV req@%p x%llu/t%lld(%lld) o%d->%s@%s:%d/%d lens %d/%d e %d to %lld dl %lld ref %d fl " REQ_FLAGS_FMT "/%x/%x rc %d/%d job:'%s'\n", + &vaf, + req, req->rq_xid, req->rq_transno, + req_ok ? lustre_msg_get_transno(req->rq_reqmsg) : 0, + req_ok ? lustre_msg_get_opc(req->rq_reqmsg) : -1, + req->rq_import ? + req->rq_import->imp_obd->obd_name : + req->rq_export ? + req->rq_export->exp_client_uuid.uuid : + "", + nid ? libcfs_nidstr(nid) : "", + req->rq_request_portal, req->rq_reply_portal, + req->rq_reqlen, req->rq_replen, + req->rq_early_count, (s64)req->rq_timedout, + (s64)req->rq_deadline, + atomic_read(&req->rq_refcount), + DEBUG_REQ_FLAGS(req), + req_ok ? 
lustre_msg_get_flags(req->rq_reqmsg) : -1, + rep_flags, req->rq_status, rep_status, + req_ok ? lustre_msg_get_jobid(req->rq_reqmsg) ?: "" + : ""); + va_end(args); +} +EXPORT_SYMBOL(_debug_req); + +void lustre_swab_hsm_user_state(struct hsm_user_state *state) +{ + __swab32s(&state->hus_states); + __swab32s(&state->hus_archive_id); +} + +void lustre_swab_hsm_state_set(struct hsm_state_set *hss) +{ + __swab32s(&hss->hss_valid); + __swab64s(&hss->hss_setmask); + __swab64s(&hss->hss_clearmask); + __swab32s(&hss->hss_archive_id); +} + +static void lustre_swab_hsm_extent(struct hsm_extent *extent) +{ + __swab64s(&extent->offset); + __swab64s(&extent->length); +} + +void lustre_swab_hsm_current_action(struct hsm_current_action *action) +{ + __swab32s(&action->hca_state); + __swab32s(&action->hca_action); + lustre_swab_hsm_extent(&action->hca_location); +} + +void lustre_swab_hsm_user_item(struct hsm_user_item *hui) +{ + lustre_swab_lu_fid(&hui->hui_fid); + lustre_swab_hsm_extent(&hui->hui_extent); +} + +void lustre_swab_lu_extent(struct lu_extent *le) +{ + __swab64s(&le->e_start); + __swab64s(&le->e_end); +} + +void lustre_swab_layout_intent(struct layout_intent *li) +{ + __swab32s(&li->li_opc); + __swab32s(&li->li_flags); + lustre_swab_lu_extent(&li->li_extent); +} + +void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk) +{ + lustre_swab_lu_fid(&hpk->hpk_fid); + __swab64s(&hpk->hpk_cookie); + __swab64s(&hpk->hpk_extent.offset); + __swab64s(&hpk->hpk_extent.length); + __swab16s(&hpk->hpk_flags); + __swab16s(&hpk->hpk_errval); +} + +void lustre_swab_hsm_request(struct hsm_request *hr) +{ + __swab32s(&hr->hr_action); + __swab32s(&hr->hr_archive_id); + __swab64s(&hr->hr_flags); + __swab32s(&hr->hr_itemcount); + __swab32s(&hr->hr_data_len); +} + +void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl) +{ + __swab64s(&msl->msl_flags); +} + +void lustre_swab_close_data(struct close_data *cd) +{ + lustre_swab_lu_fid(&cd->cd_fid); + __swab64s(&cd->cd_data_version); +} + +void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync) +{ + int i; + + __swab32s(&resync->resync_count); + /* after swab, resync_count must in CPU endian */ + if (resync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) { + for (i = 0; i < resync->resync_count; i++) + __swab32s(&resync->resync_ids_inline[i]); + } +} +EXPORT_SYMBOL(lustre_swab_close_data_resync_done); + +void lustre_swab_lfsck_request(struct lfsck_request *lr) +{ + __swab32s(&lr->lr_event); + __swab32s(&lr->lr_index); + __swab32s(&lr->lr_flags); + __swab32s(&lr->lr_valid); + __swab32s(&lr->lr_speed); + __swab16s(&lr->lr_version); + __swab16s(&lr->lr_active); + __swab16s(&lr->lr_param); + __swab16s(&lr->lr_async_windows); + __swab32s(&lr->lr_flags); + lustre_swab_lu_fid(&lr->lr_fid); + lustre_swab_lu_fid(&lr->lr_fid2); + __swab32s(&lr->lr_comp_id); + BUILD_BUG_ON(offsetof(typeof(*lr), lr_padding_0) == 0); + BUILD_BUG_ON(offsetof(typeof(*lr), lr_padding_1) == 0); + BUILD_BUG_ON(offsetof(typeof(*lr), lr_padding_2) == 0); + BUILD_BUG_ON(offsetof(typeof(*lr), lr_padding_3) == 0); +} + +void lustre_swab_lfsck_reply(struct lfsck_reply *lr) +{ + __swab32s(&lr->lr_status); + BUILD_BUG_ON(offsetof(typeof(*lr), lr_padding_1) == 0); + __swab64s(&lr->lr_repaired); +} + +static void lustre_swab_orphan_rec(struct lu_orphan_rec *rec) +{ + lustre_swab_lu_fid(&rec->lor_fid); + __swab32s(&rec->lor_uid); + __swab32s(&rec->lor_gid); +} + +void lustre_swab_orphan_ent(struct lu_orphan_ent *ent) +{ + lustre_swab_lu_fid(&ent->loe_key); + 
lustre_swab_orphan_rec(&ent->loe_rec); +} +EXPORT_SYMBOL(lustre_swab_orphan_ent); + +void lustre_swab_orphan_ent_v2(struct lu_orphan_ent_v2 *ent) +{ + lustre_swab_lu_fid(&ent->loe_key); + lustre_swab_orphan_rec(&ent->loe_rec.lor_rec); + lustre_swab_ost_layout(&ent->loe_rec.lor_layout); + BUILD_BUG_ON(offsetof(typeof(ent->loe_rec), lor_padding) == 0); +} +EXPORT_SYMBOL(lustre_swab_orphan_ent_v2); + +void lustre_swab_orphan_ent_v3(struct lu_orphan_ent_v3 *ent) +{ + lustre_swab_lu_fid(&ent->loe_key); + lustre_swab_orphan_rec(&ent->loe_rec.lor_rec); + lustre_swab_ost_layout(&ent->loe_rec.lor_layout); + __swab32s(&ent->loe_rec.lor_layout_version); + __swab32s(&ent->loe_rec.lor_range); + BUILD_BUG_ON(offsetof(typeof(ent->loe_rec), lor_padding_1) == 0); + BUILD_BUG_ON(offsetof(typeof(ent->loe_rec), lor_padding_2) == 0); +} +EXPORT_SYMBOL(lustre_swab_orphan_ent_v3); + +void lustre_swab_ladvise(struct lu_ladvise *ladvise) +{ + __swab16s(&ladvise->lla_advice); + __swab16s(&ladvise->lla_value1); + __swab32s(&ladvise->lla_value2); + __swab64s(&ladvise->lla_start); + __swab64s(&ladvise->lla_end); + __swab32s(&ladvise->lla_value3); + __swab32s(&ladvise->lla_value4); +} +EXPORT_SYMBOL(lustre_swab_ladvise); + +void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr) +{ + __swab32s(&ladvise_hdr->lah_magic); + __swab32s(&ladvise_hdr->lah_count); + __swab64s(&ladvise_hdr->lah_flags); + __swab32s(&ladvise_hdr->lah_value1); + __swab32s(&ladvise_hdr->lah_value2); + __swab64s(&ladvise_hdr->lah_value3); +} +EXPORT_SYMBOL(lustre_swab_ladvise_hdr); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_server.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_server.c new file mode 100644 index 0000000000000..ec1c20dbef3b7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_server.c @@ -0,0 +1,137 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/pack_server.c + * + * (Un)packing of OST requests + * + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include + +void lustre_swab_object_update(struct object_update *ou) +{ + struct object_update_param *param; + size_t i; + + __swab16s(&ou->ou_type); + __swab16s(&ou->ou_params_count); + __swab32s(&ou->ou_result_size); + __swab32s(&ou->ou_flags); + __swab32s(&ou->ou_padding1); + __swab64s(&ou->ou_batchid); + lustre_swab_lu_fid(&ou->ou_fid); + param = &ou->ou_params[0]; + for (i = 0; i < ou->ou_params_count; i++) { + __swab16s(¶m->oup_len); + __swab16s(¶m->oup_padding); + __swab32s(¶m->oup_padding2); + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + } +} + +int lustre_swab_object_update_request(struct object_update_request *our, + __u32 len) +{ + __u32 i, size = 0; + struct object_update *ou; + + __swab32s(&our->ourq_magic); + __swab16s(&our->ourq_count); + __swab16s(&our->ourq_padding); + + /* Don't need to calculate request size if len is 0. */ + if (len > 0) { + size = sizeof(struct object_update_request); + for (i = 0; i < our->ourq_count; i++) { + ou = object_update_request_get(our, i, NULL); + if (ou == NULL) + return -EPROTO; + size += sizeof(struct object_update) + + ou->ou_params_count * + sizeof(struct object_update_param); + } + if (unlikely(size > len)) + return -EOVERFLOW; + } + + for (i = 0; i < our->ourq_count; i++) { + ou = object_update_request_get(our, i, NULL); + lustre_swab_object_update(ou); + } + + return size; +} + +void lustre_swab_object_update_result(struct object_update_result *our) +{ + __swab32s(&our->our_rc); + __swab16s(&our->our_datalen); + __swab16s(&our->our_padding); +} + +int lustre_swab_object_update_reply(struct object_update_reply *our, __u32 len) +{ + __u32 i, size; + + __swab32s(&our->ourp_magic); + __swab16s(&our->ourp_count); + __swab16s(&our->ourp_padding); + + size = sizeof(struct object_update_reply) + our->ourp_count * + (sizeof(__u16) + sizeof(struct object_update_result)); + if (unlikely(size > len)) + return -EOVERFLOW; + + for (i = 0; i < our->ourp_count; i++) { + struct object_update_result *ourp; + + __swab16s(&our->ourp_lens[i]); + ourp = object_update_result_get(our, i, NULL); + if (ourp == NULL) + return -EPROTO; + lustre_swab_object_update_result(ourp); + } + + return size; +} + +void lustre_swab_out_update_header(struct out_update_header *ouh) +{ + __swab32s(&ouh->ouh_magic); + __swab32s(&ouh->ouh_count); + __swab32s(&ouh->ouh_inline_length); + __swab32s(&ouh->ouh_reply_size); +} +EXPORT_SYMBOL(lustre_swab_out_update_header); + +void lustre_swab_out_update_buffer(struct out_update_buffer *oub) +{ + __swab32s(&oub->oub_size); + __swab32s(&oub->oub_padding); +} +EXPORT_SYMBOL(lustre_swab_out_update_buffer); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c new file mode 100644 index 0000000000000..973f2b5ad0d74 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c @@ -0,0 +1,73 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + + +void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, + int mdidx) +{ + unsigned int start = desc->bd_mds_off[mdidx]; + + BUILD_BUG_ON(PTLRPC_MAX_BRW_PAGES >= LI_POISON); + + LASSERT(mdidx < desc->bd_md_max_brw); + LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); + + /* just send a lnet header */ + if (mdidx >= desc->bd_md_count) { + md->options |= LNET_MD_KIOV; + md->length = 0; + md->start = NULL; + return; + } + + if (mdidx == (desc->bd_md_count - 1)) + md->length = desc->bd_iov_count - start; + else + md->length = desc->bd_mds_off[mdidx + 1] - start; + + md->options |= LNET_MD_KIOV; + if (desc->bd_enc_vec) + md->start = &desc->bd_enc_vec[start]; + else + md->start = &desc->bd_vec[start]; +} + + diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c new file mode 100644 index 0000000000000..72825b2ad24e0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c @@ -0,0 +1,571 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/pinger.c + * + * Portal-RPC reconnection and replay operations, for use in recovery. 
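+ *
+ * In this file, ptlrpc_pinger_add_import() registers an import with the
+ * pinger, the pinger work item (ptlrpc_pinger_main) then pings it every
+ * PING_INTERVAL seconds (or lets it go idle), and
+ * ptlrpc_pinger_del_import() removes it again.  A minimal client-side
+ * usage sketch (illustrative only, error handling omitted, "imp" is an
+ * already set-up obd_import):
+ *
+ *	rc = ptlrpc_pinger_add_import(imp);
+ *	...
+ *	rc = ptlrpc_pinger_del_import(imp);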
+ */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +static int suppress_pings; +module_param(suppress_pings, int, 0644); +MODULE_PARM_DESC(suppress_pings, "Suppress pings"); + +struct mutex pinger_mutex; +static struct list_head pinger_imports = + LIST_HEAD_INIT(pinger_imports); + +int ptlrpc_pinger_suppress_pings(void) +{ + return suppress_pings; +} +EXPORT_SYMBOL(ptlrpc_pinger_suppress_pings); + +struct ptlrpc_request * +ptlrpc_prep_ping(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, + LUSTRE_OBD_VERSION, OBD_PING); + if (req) { + ptlrpc_request_set_replen(req); + req->rq_no_resend = req->rq_no_delay = 1; + } + return req; +} + +int ptlrpc_obd_ping(struct obd_device *obd) +{ + int rc; + struct ptlrpc_request *req; + struct obd_import *imp; + + ENTRY; + + with_imp_locked(obd, imp, rc) { + req = ptlrpc_prep_ping(imp); + if (!req) { + rc = -ENOMEM; + continue; + } + req->rq_send_state = LUSTRE_IMP_FULL; + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + } + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_obd_ping); + +static bool ptlrpc_check_import_is_idle(struct obd_import *imp) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + time64_t now; + + if (!imp->imp_idle_timeout) + return false; + + if (atomic_read(&imp->imp_reqs) > 0) + return false; + + /* any lock increases ns_bref being a resource holder */ + if (ns && atomic_read(&ns->ns_bref) > 0) + return false; + + now = ktime_get_real_seconds(); + if (now - imp->imp_last_reply_time < imp->imp_idle_timeout) + return false; + + return true; +} + +static void ptlrpc_update_next_ping(struct obd_import *imp, int soon) +{ +#ifdef CONFIG_LUSTRE_FS_PINGER + time64_t time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL; + + if (imp->imp_state == LUSTRE_IMP_DISCON) { + time64_t dtime = max_t(time64_t, CONNECTION_SWITCH_MIN, + AT_OFF ? 0 : + at_get(&imp->imp_at.iat_net_latency)); + time = min(time, dtime); + } + imp->imp_next_ping = ktime_get_seconds() + time; +#endif /* CONFIG_LUSTRE_FS_PINGER */ +} + +static int ptlrpc_ping(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + ENTRY; + + if (ptlrpc_check_import_is_idle(imp) && + ptlrpc_disconnect_and_idle_import(imp) == 1) + RETURN(0); + + req = ptlrpc_prep_ping(imp); + if (!req) { + CERROR("OOM trying to ping %s->%s\n", + imp->imp_obd->obd_uuid.uuid, + obd2cli_tgt(imp->imp_obd)); + RETURN(-ENOMEM); + } + + DEBUG_REQ(D_INFO, req, "pinging %s->%s", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* Updating imp_next_ping early, it allows pinger_check_timeout to + * see an actual time for next awake. request_out_callback update + * happens at another thread, and ptlrpc_pinger_main may sleep + * already. 
+ */ + ptlrpc_update_next_ping(imp, 0); + ptlrpcd_add_req(req); + + RETURN(0); +} + +void ptlrpc_ping_import_soon(struct obd_import *imp) +{ + imp->imp_next_ping = ktime_get_seconds(); +} + +static inline int imp_is_deactive(struct obd_import *imp) +{ + return imp->imp_deactive || + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE); +} + +static inline time64_t ptlrpc_next_reconnect(struct obd_import *imp) +{ + return ktime_get_seconds() + INITIAL_CONNECT_TIMEOUT; +} + +static timeout_t pinger_check_timeout(time64_t time) +{ + timeout_t timeout = PING_INTERVAL; + timeout_t next_timeout; + time64_t now; + struct list_head *iter; + struct obd_import *imp; + + mutex_lock(&pinger_mutex); + now = ktime_get_seconds(); + /* Process imports to find a nearest next ping */ + list_for_each(iter, &pinger_imports) { + imp = list_entry(iter, struct obd_import, imp_pinger_chain); + if (!imp->imp_pingable || imp->imp_next_ping < now) + continue; + next_timeout = imp->imp_next_ping - now; + /* make sure imp_next_ping in the future from time */ + if (next_timeout > (now - time) && timeout > next_timeout) + timeout = next_timeout; + } + mutex_unlock(&pinger_mutex); + + return timeout - (now - time); +} + +static bool ir_up; + +void ptlrpc_pinger_ir_up(void) +{ + CDEBUG(D_HA, "IR up\n"); + ir_up = true; +} +EXPORT_SYMBOL(ptlrpc_pinger_ir_up); + +void ptlrpc_pinger_ir_down(void) +{ + CDEBUG(D_HA, "IR down\n"); + ir_up = false; +} +EXPORT_SYMBOL(ptlrpc_pinger_ir_down); + +static void ptlrpc_pinger_process_import(struct obd_import *imp, + time64_t this_ping) +{ + int level; + int force; + int force_next; + int suppress; + + spin_lock(&imp->imp_lock); + + level = imp->imp_state; + force = imp->imp_force_verify; + force_next = imp->imp_force_next_verify; + /* + * This will be used below only if the import is "FULL". + */ + suppress = ir_up && OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS); + + imp->imp_force_verify = 0; + + if (imp->imp_next_ping - 5 >= this_ping && !force) { + spin_unlock(&imp->imp_lock); + return; + } + + imp->imp_force_next_verify = 0; + + CDEBUG(level == LUSTRE_IMP_FULL ? 
D_INFO : D_HA, + "%s->%s: level %s/%u force %u force_next %u deactive %u pingable %u suppress %u\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level), level, force, force_next, + imp->imp_deactive, imp->imp_pingable, suppress); + + if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) { + /* wait for a while before trying recovery again */ + imp->imp_next_ping = ptlrpc_next_reconnect(imp); + spin_unlock(&imp->imp_lock); + if (!imp->imp_no_pinger_recover || + imp->imp_connect_error == -EAGAIN) + ptlrpc_initiate_recovery(imp); + } else if (level != LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov || + imp_is_deactive(imp)) { + CDEBUG(D_HA, + "%s->%s: not pinging (in recovery or recovery disabled: %s)\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level)); + if (force) + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + } else if ((imp->imp_pingable && !suppress) || force_next || force) { + spin_unlock(&imp->imp_lock); + ptlrpc_ping(imp); + } else { + spin_unlock(&imp->imp_lock); + } +} + +static struct workqueue_struct *pinger_wq; +static void ptlrpc_pinger_main(struct work_struct *ws); +static DECLARE_DELAYED_WORK(ping_work, ptlrpc_pinger_main); + +static void ptlrpc_pinger_main(struct work_struct *ws) +{ + time64_t this_ping, time_after_ping; + timeout_t time_to_next_wake; + struct obd_import *imp; + + do { + this_ping = ktime_get_seconds(); + + mutex_lock(&pinger_mutex); + + list_for_each_entry(imp, &pinger_imports, imp_pinger_chain) { + ptlrpc_pinger_process_import(imp, this_ping); + /* obd_timeout might have changed */ + if (imp->imp_pingable && imp->imp_next_ping && + imp->imp_next_ping > this_ping + PING_INTERVAL) + ptlrpc_update_next_ping(imp, 0); + } + mutex_unlock(&pinger_mutex); + + time_after_ping = ktime_get_seconds(); + /* update memory usage info */ + obd_update_maxusage(); + + if ((ktime_get_seconds() - this_ping - 3) > PING_INTERVAL) + CDEBUG(D_HA, "long time to ping: %lld, %lld, %lld\n", + this_ping, time_after_ping, ktime_get_seconds()); + + /* Wait until the next ping time, or until we're stopped. */ + time_to_next_wake = pinger_check_timeout(this_ping); + /* + * The ping sent by ptlrpc_send_rpc may get sent out + * say .01 second after this. + * ptlrpc_pinger_sending_on_import will then set the + * next ping time to next_ping + .01 sec, which means + * we will SKIP the next ping at next_ping, and the + * ping will get sent 2 timeouts from now! Beware. + */ + CDEBUG(D_INFO, "next wakeup in %d (%lld)\n", + time_to_next_wake, this_ping + PING_INTERVAL); + } while (time_to_next_wake <= 0); + + queue_delayed_work(pinger_wq, &ping_work, + cfs_time_seconds(max(time_to_next_wake, 1))); +} + +int ptlrpc_start_pinger(void) +{ +#ifdef CONFIG_LUSTRE_FS_PINGER + if (pinger_wq) + return -EALREADY; + + pinger_wq = cfs_cpt_bind_workqueue("ptlrpc_pinger", cfs_cpt_tab, + 0, CFS_CPT_ANY, 1); + if (IS_ERR(pinger_wq)) { + CERROR("cannot start pinger workqueue\n"); + return PTR_ERR(pinger_wq); + } + + queue_delayed_work(pinger_wq, &ping_work, 0); + + if (suppress_pings) + CWARN("Pings will be suppressed at the request of the administrator. The configuration shall meet the additional requirements described in the manual. 
(Search for the \"suppress_pings\" kernel module parameter.)\n"); +#endif + return 0; +} + +int ptlrpc_stop_pinger(void) +{ +#ifdef CONFIG_LUSTRE_FS_PINGER + if (!pinger_wq) + return -EALREADY; + + cancel_delayed_work_sync(&ping_work); + destroy_workqueue(pinger_wq); + pinger_wq = NULL; +#endif + return 0; +} + +void ptlrpc_pinger_sending_on_import(struct obd_import *imp) +{ + ptlrpc_update_next_ping(imp, 0); +} + +void ptlrpc_pinger_commit_expected(struct obd_import *imp) +{ + ptlrpc_update_next_ping(imp, 1); + assert_spin_locked(&imp->imp_lock); + /* + * Avoid reading stale imp_connect_data. When not sure if pings are + * expected or not on next connection, we assume they are not and force + * one anyway to guarantee the chance of updating + * imp_peer_committed_transno. + */ + if (imp->imp_state != LUSTRE_IMP_FULL || + OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS)) + imp->imp_force_next_verify = 1; +} + +int ptlrpc_pinger_add_import(struct obd_import *imp) +{ + ENTRY; + if (!list_empty(&imp->imp_pinger_chain)) + RETURN(-EALREADY); + + mutex_lock(&pinger_mutex); + CDEBUG(D_HA, "adding pingable import %s->%s\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* if we add to pinger we want recovery on this import */ + imp->imp_obd->obd_no_recov = 0; + ptlrpc_update_next_ping(imp, 0); + /* XXX sort, blah blah */ + list_add_tail(&imp->imp_pinger_chain, &pinger_imports); + class_import_get(imp); + + ptlrpc_pinger_wake_up(); + mutex_unlock(&pinger_mutex); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_pinger_add_import); + +int ptlrpc_pinger_del_import(struct obd_import *imp) +{ + ENTRY; + + if (list_empty(&imp->imp_pinger_chain)) + RETURN(-ENOENT); + + mutex_lock(&pinger_mutex); + list_del_init(&imp->imp_pinger_chain); + CDEBUG(D_HA, "removing pingable import %s->%s\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* if we remove from pinger we don't want recovery on this import */ + imp->imp_obd->obd_no_recov = 1; + class_import_put(imp); + mutex_unlock(&pinger_mutex); + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_pinger_del_import); + +void ptlrpc_pinger_wake_up(void) +{ +#ifdef CONFIG_LUSTRE_FS_PINGER + mod_delayed_work(pinger_wq, &ping_work, 0); +#endif +} + +/* Ping evictor thread */ +#define PET_READY 1 +#define PET_TERMINATE 2 + +static int pet_refcount; +static int pet_state; +static wait_queue_head_t pet_waitq; +static LIST_HEAD(pet_list); +static DEFINE_SPINLOCK(pet_lock); + +int ping_evictor_wake(struct obd_export *exp) +{ + struct obd_device *obd; + + spin_lock(&pet_lock); + if (pet_state != PET_READY) { + /* eventually the new obd will call here again. */ + spin_unlock(&pet_lock); + return 1; + } + + obd = class_exp2obd(exp); + if (list_empty(&obd->obd_evict_list)) { + class_incref(obd, "evictor", obd); + list_add(&obd->obd_evict_list, &pet_list); + } + spin_unlock(&pet_lock); + + wake_up(&pet_waitq); + return 0; +} + +static int ping_evictor_main(void *arg) +{ + struct obd_device *obd; + struct obd_export *exp; + time64_t expire_time; + + ENTRY; + unshare_fs_struct(); + CDEBUG(D_HA, "Starting Ping Evictor\n"); + pet_state = PET_READY; + while (1) { + wait_event_idle(pet_waitq, + (!list_empty(&pet_list)) || + (pet_state == PET_TERMINATE)); + + /* loop until all obd's will be removed */ + if ((pet_state == PET_TERMINATE) && list_empty(&pet_list)) + break; + + /* + * we only get here if pet_exp != NULL, and the end of this + * loop is the only place which sets it NULL again, so lock + * is not strictly necessary. 
+ */ + spin_lock(&pet_lock); + obd = list_entry(pet_list.next, struct obd_device, + obd_evict_list); + spin_unlock(&pet_lock); + + expire_time = ktime_get_real_seconds() - PING_EVICT_TIMEOUT; + + CDEBUG(D_HA, "evicting all exports of obd %s older than %lld\n", + obd->obd_name, expire_time); + + /* + * Exports can't be deleted out of the list while we hold + * the obd lock (class_unlink_export), which means we can't + * lose the last ref on the export. If they've already been + * removed from the list, we won't find them here. + */ + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_exports_timed)) { + exp = list_entry(obd->obd_exports_timed.next, + struct obd_export, + exp_obd_chain_timed); + if (expire_time > exp->exp_last_request_time) { + struct obd_uuid *client_uuid; + + class_export_get(exp); + client_uuid = &exp->exp_client_uuid; + spin_unlock(&obd->obd_dev_lock); + LCONSOLE_WARN("%s: haven't heard from client %s (at %s) in %lld seconds. I think it's dead, and I am evicting it. exp %p, cur %lld expire %lld last %lld\n", + obd->obd_name, + obd_uuid2str(client_uuid), + obd_export_nid2str(exp), + ktime_get_real_seconds() - + exp->exp_last_request_time, + exp, ktime_get_real_seconds(), + expire_time, + exp->exp_last_request_time); + CDEBUG(D_HA, "Last request was at %lld\n", + exp->exp_last_request_time); + class_fail_export(exp); + class_export_put(exp); + spin_lock(&obd->obd_dev_lock); + } else { + /* List is sorted, so everyone below is ok */ + break; + } + } + spin_unlock(&obd->obd_dev_lock); + + spin_lock(&pet_lock); + list_del_init(&obd->obd_evict_list); + spin_unlock(&pet_lock); + + class_decref(obd, "evictor", obd); + } + CDEBUG(D_HA, "Exiting Ping Evictor\n"); + + RETURN(0); +} + +void ping_evictor_start(void) +{ + struct task_struct *task; + + if (++pet_refcount > 1) + return; + + init_waitqueue_head(&pet_waitq); + + task = kthread_run(ping_evictor_main, NULL, "ll_evictor"); + if (IS_ERR(task)) { + pet_refcount--; + CERROR("Cannot start ping evictor thread: %ld\n", + PTR_ERR(task)); + } +} +EXPORT_SYMBOL(ping_evictor_start); + +void ping_evictor_stop(void) +{ + if (--pet_refcount > 0) + return; + + pet_state = PET_TERMINATE; + wake_up(&pet_waitq); +} +EXPORT_SYMBOL(ping_evictor_stop); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h new file mode 100644 index 0000000000000..399ff28fa5ddb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h @@ -0,0 +1,441 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +/* Intramodule declarations for ptlrpc. */ + +#ifndef PTLRPC_INTERNAL_H +#define PTLRPC_INTERNAL_H + +#include "../ldlm/ldlm_internal.h" +#include "heap.h" + +struct ldlm_namespace; +struct obd_import; +struct ldlm_res_id; +struct ptlrpc_request_set; +extern int test_req_buffer_pressure; +extern struct list_head ptlrpc_all_services; +extern struct mutex ptlrpc_all_services_mutex; +extern struct ptlrpc_nrs_pol_conf nrs_conf_fifo; +extern struct ptlrpc_nrs_pol_conf nrs_conf_delay; + +#ifdef HAVE_SERVER_SUPPORT +extern struct ptlrpc_nrs_pol_conf nrs_conf_crrn; +extern struct ptlrpc_nrs_pol_conf nrs_conf_orr; +extern struct ptlrpc_nrs_pol_conf nrs_conf_trr; +extern struct ptlrpc_nrs_pol_conf nrs_conf_tbf; +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * \addtogoup nrs + * @{ + */ +extern struct nrs_core nrs_core; + +extern struct mutex ptlrpcd_mutex; +extern struct mutex pinger_mutex; + +extern lnet_handler_t ptlrpc_handler; +extern struct percpu_ref ptlrpc_pending; + +/* ptlrpcd.c */ +int ptlrpcd_start(struct ptlrpcd_ctl *pc); + +/* client.c */ +void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, + timeout_t service_timeout); +struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, + enum ptlrpc_bulk_op_type type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops); +int ptlrpc_request_cache_init(void); +void ptlrpc_request_cache_fini(void); +struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags); +void ptlrpc_request_cache_free(struct ptlrpc_request *req); +void ptlrpc_init_xid(void); +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req); +void ptlrpc_expired_set(struct ptlrpc_request_set *set); +time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *); +void ptlrpc_resend_req(struct ptlrpc_request *request); +void ptlrpc_set_mbits(struct ptlrpc_request *req); +void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req); +__u64 ptlrpc_known_replied_xid(struct obd_import *imp); +void ptlrpc_add_unreplied(struct ptlrpc_request *req); + +/* events.c */ +int ptlrpc_init_portals(void); +void ptlrpc_exit_portals(void); + +void ptlrpc_request_handle_notconn(struct ptlrpc_request *); +void lustre_assert_wire_constants(void); +int ptlrpc_import_in_recovery(struct obd_import *imp); +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt, + bool invalid); +void ptlrpc_handle_failed_import(struct obd_import *imp); +int ptlrpc_replay_next(struct obd_import *imp, int *inflight); +void ptlrpc_initiate_recovery(struct obd_import *imp); + +int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset); +int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset); + +int ptlrpc_sysfs_register_service(struct kset *parent, + struct ptlrpc_service *svc); +void ptlrpc_sysfs_unregister_service(struct ptlrpc_service *svc); + +void ptlrpc_ldebugfs_register_service(struct dentry *debugfs_entry, + struct ptlrpc_service *svc); +#ifdef CONFIG_PROC_FS +void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc); +void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount); +void ptlrpc_lprocfs_do_request_stat (struct ptlrpc_request *req, + long q_usec, long work_usec); +#else +#define ptlrpc_lprocfs_unregister_service(params...) do{}while(0) +#define ptlrpc_lprocfs_rpc_sent(params...) do{}while(0) +#define ptlrpc_lprocfs_do_request_stat(params...) 
do{}while(0) +#endif /* CONFIG_PROC_FS */ + +/* NRS */ + +/** + * NRS core object. + * + * Holds NRS core fields. + */ +struct nrs_core { + /** + * Protects nrs_core::nrs_policies, serializes external policy + * registration/unregistration, and NRS core lprocfs operations. + */ + struct mutex nrs_mutex; + /** + * List of all policy descriptors registered with NRS core; protected + * by nrs_core::nrs_mutex. + */ + struct list_head nrs_policies; +}; + +int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc); +void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc); + +void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp); +void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req); +void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req); +void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp); + +struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp, + bool peek, bool force); + +static inline struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp, + bool force) +{ + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, false, force); +} + +static inline struct ptlrpc_request * +ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp) +{ + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false); +} + +void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req); +bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp); +bool ptlrpc_nrs_req_throttling_nolock(struct ptlrpc_service_part *svcpt, + bool hp); + +int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, + enum ptlrpc_nrs_queue_type queue, char *name, + enum ptlrpc_nrs_ctl opc, bool single, void *arg); + +int ptlrpc_nrs_init(void); +void ptlrpc_nrs_fini(void); + +static inline bool nrs_svcpt_has_hp(const struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nrs_hp != NULL; +} + +static inline bool nrs_svc_has_hp(const struct ptlrpc_service *svc) +{ + /** + * If the first service partition has an HP NRS head, all service + * partitions will. + */ + return nrs_svcpt_has_hp(svc->srv_parts[0]); +} + +static inline +struct ptlrpc_nrs *nrs_svcpt2nrs(struct ptlrpc_service_part *svcpt, bool hp) +{ + LASSERT(ergo(hp, nrs_svcpt_has_hp(svcpt))); + return hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg; +} + +static inline int nrs_pol2cptid(const struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt->scp_cpt; +} + +static inline +struct ptlrpc_service *nrs_pol2svc(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt->scp_service; +} + +static inline +struct ptlrpc_service_part *nrs_pol2svcpt(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt; +} + +static inline +struct cfs_cpt_table *nrs_pol2cptab(struct ptlrpc_nrs_policy *policy) +{ + return nrs_pol2svc(policy)->srv_cptable; +} + +static inline struct ptlrpc_nrs_resource * +nrs_request_resource(struct ptlrpc_nrs_request *nrq) +{ + LASSERT(nrq->nr_initialized); + LASSERT(!nrq->nr_finalized); + + return nrq->nr_res_ptrs[nrq->nr_res_idx]; +} + +static inline +struct ptlrpc_nrs_policy *nrs_request_policy(struct ptlrpc_nrs_request *nrq) +{ + return nrs_request_resource(nrq)->res_policy; +} + +#define NRS_LPROCFS_QUANTUM_NAME_REG "reg_quantum:" +#define NRS_LPROCFS_QUANTUM_NAME_HP "hp_quantum:" + +/** + * the maximum size of nrs_crrn_client::cc_quantum and nrs_orr_data::od_quantum. 
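+ *
+ * A command written to a policy's lprocfs quantum tunable is built from
+ * the two labels above, e.g. (a sketch; the exact file name depends on
+ * the policy):
+ *
+ *	reg_quantum:256 hp_quantum:16
+ *
+ * with each value no larger than LPROCFS_NRS_QUANTUM_MAX.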
+ */ +#define LPROCFS_NRS_QUANTUM_MAX 65535 + +/** + * Max valid command string is the size of the labels, plus "65535" twice, plus + * a separating space character. + */ +#define LPROCFS_NRS_WR_QUANTUM_MAX_CMD \ + sizeof(NRS_LPROCFS_QUANTUM_NAME_REG __stringify(LPROCFS_NRS_QUANTUM_MAX) " " \ + NRS_LPROCFS_QUANTUM_NAME_HP __stringify(LPROCFS_NRS_QUANTUM_MAX)) + +/* recovd_thread.c */ + +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink); + +/* pers.c */ +void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, + int mdcnt); + +/* pack_generic.c */ +struct ptlrpc_reply_state * +lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt); +void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs); +void lustre_msg_early_size_init(void); /* just for init */ + +/* pinger.c */ +int ptlrpc_start_pinger(void); +int ptlrpc_stop_pinger(void); +void ptlrpc_pinger_sending_on_import(struct obd_import *imp); +void ptlrpc_pinger_commit_expected(struct obd_import *imp); +void ptlrpc_pinger_wake_up(void); +void ptlrpc_ping_import_soon(struct obd_import *imp); +int ping_evictor_wake(struct obd_export *exp); + +/* sec_null.c */ +int sptlrpc_null_init(void); +void sptlrpc_null_fini(void); + +/* sec_plain.c */ +int sptlrpc_plain_init(void); +void sptlrpc_plain_fini(void); + +/* sec_bulk.c */ +int sptlrpc_enc_pool_init(void); +void sptlrpc_enc_pool_fini(void); +int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v); + +/* sec_lproc.c */ +int sptlrpc_lproc_init(void); +void sptlrpc_lproc_fini(void); + +/* sec_gc.c */ +int sptlrpc_gc_init(void); +void sptlrpc_gc_fini(void); + +/* sec_config.c */ +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + struct lnet_nid *nid, + struct sptlrpc_flavor *sf); +int sptlrpc_conf_init(void); +void sptlrpc_conf_fini(void); + +/* sec.c */ +int sptlrpc_init(void); +void sptlrpc_fini(void); + +/* layout.c */ +__u32 __req_capsule_offset(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); + +static inline bool ptlrpc_recoverable_error(int rc) +{ + return (rc == -ENOTCONN || rc == -ENODEV); +} + +#ifdef HAVE_SERVER_SUPPORT +int tgt_mod_init(void); +void tgt_mod_exit(void); +int nodemap_mod_init(void); +void nodemap_mod_exit(void); +#else /* HAVE_SERVER_SUPPORT */ +static inline int tgt_mod_init(void) +{ + return 0; +} + +static inline void tgt_mod_exit(void) +{ + return; +} + +static inline int nodemap_mod_init(void) +{ + return 0; +} + +static inline void nodemap_mod_exit(void) +{ + return; +} +#endif /* !HAVE_SERVER_SUPPORT */ + +static inline void ptlrpc_reqset_put(struct ptlrpc_request_set *set) +{ + if (atomic_dec_and_test(&set->set_refcount)) + OBD_FREE_PTR(set); +} + +/** initialise ptlrpc common fields */ +static inline void ptlrpc_req_comm_init(struct ptlrpc_request *req) +{ + spin_lock_init(&req->rq_lock); + spin_lock_init(&req->rq_early_free_lock); + atomic_set(&req->rq_refcount, 1); + INIT_LIST_HEAD(&req->rq_list); + INIT_LIST_HEAD(&req->rq_replay_list); +} + +/** initialise client side ptlrpc request */ +static inline void ptlrpc_cli_req_init(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_req *cr = &req->rq_cli; + + ptlrpc_req_comm_init(req); + + req->rq_receiving_reply = 0; + req->rq_req_unlinked = req->rq_reply_unlinked = 1; + req->rq_replied = 0; + + INIT_LIST_HEAD(&cr->cr_set_chain); + INIT_LIST_HEAD(&cr->cr_ctx_chain); + INIT_LIST_HEAD(&cr->cr_unreplied_list); + init_waitqueue_head(&cr->cr_reply_waitq); + 
init_waitqueue_head(&cr->cr_set_waitq); +} + +/** initialise server side ptlrpc request */ +static inline void ptlrpc_srv_req_init(struct ptlrpc_request *req) +{ + struct ptlrpc_srv_req *sr = &req->rq_srv; + + ptlrpc_req_comm_init(req); + req->rq_srv_req = 1; + INIT_LIST_HEAD(&sr->sr_exp_list); + INIT_LIST_HEAD(&sr->sr_timed_list); + INIT_LIST_HEAD(&sr->sr_hist_list); +} + +static inline bool ptlrpc_req_is_connect(struct ptlrpc_request *req) +{ + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == OST_CONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == MGS_CONNECT) + return true; + else + return false; +} + +static inline bool ptlrpc_req_is_disconnect(struct ptlrpc_request *req) +{ + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_DISCONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == OST_DISCONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == MGS_DISCONNECT) + return true; + else + return false; +} + +static inline void do_pack_body(struct ptlrpc_request *req) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + if (b == NULL) + return; + + b->mbo_valid = 0; + b->mbo_eadatasize = 0; + b->mbo_flags = 0; + b->mbo_suppgid = -1; + b->mbo_uid = from_kuid(&init_user_ns, current_uid()); + b->mbo_gid = from_kgid(&init_user_ns, current_gid()); + b->mbo_fsuid = from_kuid(&init_user_ns, current_fsuid()); + b->mbo_fsgid = from_kgid(&init_user_ns, current_fsgid()); + b->mbo_capability = current_cap().cap[0]; +} + +#endif /* PTLRPC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_module.c b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_module.c new file mode 100644 index 0000000000000..abde1f23080d8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_module.c @@ -0,0 +1,147 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + + +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static __init int ptlrpc_init(void) +{ + int rc; + + ENTRY; + + lustre_assert_wire_constants(); +#if RS_DEBUG + spin_lock_init(&ptlrpc_rs_debug_lock); +#endif + mutex_init(&ptlrpc_all_services_mutex); + mutex_init(&pinger_mutex); + mutex_init(&ptlrpcd_mutex); + ptlrpc_init_xid(); + lustre_msg_early_size_init(); + + rc = req_layout_init(); + if (rc) + RETURN(rc); + + rc = tgt_mod_init(); + if (rc) + GOTO(err_layout, rc); + + rc = ptlrpc_hr_init(); + if (rc) + GOTO(err_tgt, rc); + + rc = ptlrpc_request_cache_init(); + if (rc) + GOTO(err_hr, rc); + + rc = ptlrpc_init_portals(); + if (rc) + GOTO(err_cache, rc); + + rc = ptlrpc_connection_init(); + if (rc) + GOTO(err_portals, rc); + + rc = ptlrpc_start_pinger(); + if (rc) + GOTO(err_conn, rc); + + rc = ldlm_init(); + if (rc) + GOTO(err_pinger, rc); + + rc = sptlrpc_init(); + if (rc) + GOTO(err_ldlm, rc); + + rc = ptlrpc_nrs_init(); + if (rc) + GOTO(err_sptlrpc, rc); + + rc = nodemap_mod_init(); + if (rc) + GOTO(err_nrs, rc); + + RETURN(0); +err_nrs: + ptlrpc_nrs_fini(); +err_sptlrpc: + sptlrpc_fini(); +err_ldlm: + ldlm_exit(); +err_pinger: + ptlrpc_stop_pinger(); +err_conn: + ptlrpc_connection_fini(); +err_portals: + ptlrpc_exit_portals(); +err_cache: + ptlrpc_request_cache_fini(); +err_hr: + ptlrpc_hr_fini(); +err_tgt: + tgt_mod_exit(); +err_layout: + req_layout_fini(); + return rc; +} + +static void __exit ptlrpc_exit(void) +{ + nodemap_mod_exit(); + ptlrpc_nrs_fini(); + sptlrpc_fini(); + ldlm_exit(); + ptlrpc_stop_pinger(); + ptlrpc_exit_portals(); + ptlrpc_request_cache_fini(); + ptlrpc_hr_fini(); + ptlrpc_connection_fini(); + tgt_mod_exit(); + req_layout_fini(); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Request Processor and Lock Management"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(ptlrpc_init); +module_exit(ptlrpc_exit); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c new file mode 100644 index 0000000000000..9d29cc7a6e953 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c @@ -0,0 +1,993 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/ptlrpcd.c + */ + +/** \defgroup ptlrpcd PortalRPC daemon + * + * ptlrpcd is a special thread with its own set where other user might add + * requests when they don't want to wait for their completion. + * PtlRPCD will take care of sending such requests and then processing their + * replies and calling completion callbacks as necessary. + * The callbacks are called directly from ptlrpcd context. + * It is important to never significantly block (esp. on RPCs!) within such + * completion handler or a deadlock might occur where ptlrpcd enters some + * callback that attempts to send another RPC and wait for it to return, + * during which time ptlrpcd is completely blocked, so e.g. if import + * fails, recovery cannot progress because connection requests are also + * sent by ptlrpcd. + * + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include /* for obd_zombie */ +#include /* for OBD_FAIL_CHECK */ +#include /* cl_env_{get,put}() */ +#include + +#include "ptlrpc_internal.h" + +/* One of these per CPT. */ +struct ptlrpcd { + int pd_size; + int pd_index; + int pd_cpt; + int pd_cursor; + int pd_nthreads; + int pd_groupsize; + struct ptlrpcd_ctl pd_threads[0]; +}; + +/* + * max_ptlrpcds is obsolete, but retained to ensure that the kernel + * module will load on a system where it has been tuned. + * A value other than 0 implies it was tuned, in which case the value + * is used to derive a setting for ptlrpcd_per_cpt_max. + */ +static int max_ptlrpcds; +module_param(max_ptlrpcds, int, 0644); +MODULE_PARM_DESC(max_ptlrpcds, + "Max ptlrpcd thread count to be started (obsolete)."); + +/* + * ptlrpcd_bind_policy is obsolete, but retained to ensure that + * the kernel module will load on a system where it has been tuned. + * A value other than 0 implies it was tuned, in which case the value + * is used to derive a setting for ptlrpcd_partner_group_size. + */ +static int ptlrpcd_bind_policy; +module_param(ptlrpcd_bind_policy, int, 0644); +MODULE_PARM_DESC(ptlrpcd_bind_policy, + "Ptlrpcd threads binding mode (obsolete)."); + +/* + * ptlrpcd_per_cpt_max: The maximum number of ptlrpcd threads to run + * in a CPT. + */ +static int ptlrpcd_per_cpt_max; +module_param(ptlrpcd_per_cpt_max, int, 0644); +MODULE_PARM_DESC(ptlrpcd_per_cpt_max, + "Max ptlrpcd thread count to be started per CPT."); + +/* + * ptlrpcd_partner_group_size: The desired number of threads in each + * ptlrpcd partner thread group. Default is 2, corresponding to the + * old PDB_POLICY_PAIR. A negative value makes all ptlrpcd threads in + * a CPT partners of each other. + */ +static int ptlrpcd_partner_group_size; +module_param(ptlrpcd_partner_group_size, int, 0644); +MODULE_PARM_DESC(ptlrpcd_partner_group_size, + "Number of ptlrpcd threads in a partner group."); + +/* + * ptlrpcd_cpts: A CPT string describing the CPU partitions that + * ptlrpcd threads should run on. Used to make ptlrpcd threads run on + * a subset of all CPTs. + * + * ptlrpcd_cpts=2 + * ptlrpcd_cpts=[2] + * run ptlrpcd threads only on CPT 2. + * + * ptlrpcd_cpts=0-3 + * ptlrpcd_cpts=[0-3] + * run ptlrpcd threads on CPTs 0, 1, 2, and 3. + * + * ptlrpcd_cpts=[0-3,5,7] + * run ptlrpcd threads on CPTS 0, 1, 2, 3, 5, and 7. 
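+ *
+ * A hypothetical modprobe.d line combining the tunables above (the
+ * module name "ptlrpc" is assumed here; adjust to the actual build):
+ *
+ *	options ptlrpc ptlrpcd_cpts=[0-3] ptlrpcd_per_cpt_max=4 ptlrpcd_partner_group_size=2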
+ */ +static char *ptlrpcd_cpts; +module_param(ptlrpcd_cpts, charp, 0644); +MODULE_PARM_DESC(ptlrpcd_cpts, + "CPU partitions ptlrpcd threads should run in"); + +/* ptlrpcds_cpt_idx maps cpt numbers to an index in the ptlrpcds array. */ +static int *ptlrpcds_cpt_idx; + +/* ptlrpcds_num is the number of entries in the ptlrpcds array. */ +static int ptlrpcds_num; +static struct ptlrpcd **ptlrpcds; + +/* + * In addition to the regular thread pool above, there is a single + * global recovery thread. Recovery isn't critical for performance, + * and doesn't block, but must always be able to proceed, and it is + * possible that all normal ptlrpcd threads are blocked. Hence the + * need for a dedicated thread. + */ +static struct ptlrpcd_ctl ptlrpcd_rcv; + +struct mutex ptlrpcd_mutex; +static int ptlrpcd_users = 0; + +void ptlrpcd_wake(struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set = req->rq_set; + + LASSERT(set != NULL); + wake_up(&set->set_waitq); +} +EXPORT_SYMBOL(ptlrpcd_wake); + +static struct ptlrpcd_ctl * +ptlrpcd_select_pc(struct ptlrpc_request *req) +{ + struct ptlrpcd *pd; + int cpt; + int idx; + + if (req != NULL && req->rq_send_state != LUSTRE_IMP_FULL) + return &ptlrpcd_rcv; + + cpt = cfs_cpt_current(cfs_cpt_tab, 1); + if (ptlrpcds_cpt_idx == NULL) + idx = cpt; + else + idx = ptlrpcds_cpt_idx[cpt]; + pd = ptlrpcds[idx]; + + /* We do not care whether it is strict load balance. */ + idx = pd->pd_cursor; + if (++idx == pd->pd_nthreads) + idx = 0; + pd->pd_cursor = idx; + + return &pd->pd_threads[idx]; +} + +/** + * Move all request from an existing request set to the ptlrpcd queue. + * All requests from the set must be in phase RQ_PHASE_NEW. + */ +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *pos; + struct ptlrpcd_ctl *pc; + struct ptlrpc_request_set *new; + int count, i; + + pc = ptlrpcd_select_pc(NULL); + new = pc->pc_set; + + list_for_each_safe(pos, tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(pos, struct ptlrpc_request, + rq_set_chain); + + LASSERT(req->rq_phase == RQ_PHASE_NEW); + req->rq_set = new; + req->rq_queued_time = ktime_get_seconds(); + } + + spin_lock(&new->set_new_req_lock); + list_splice_init(&set->set_requests, &new->set_new_requests); + i = atomic_read(&set->set_remaining); + count = atomic_add_return(i, &new->set_new_count); + atomic_set(&set->set_remaining, 0); + spin_unlock(&new->set_new_req_lock); + if (count == i) { + wake_up(&new->set_waitq); + + /* + * XXX: It maybe unnecessary to wakeup all the partners. But to + * guarantee the async RPC can be processed ASAP, we have + * no other better choice. It maybe fixed in future. + */ + for (i = 0; i < pc->pc_npartners; i++) + wake_up(&pc->pc_partners[i]->pc_set->set_waitq); + } +} + +/** + * Return transferred RPCs count. + */ +static int ptlrpcd_steal_rqset(struct ptlrpc_request_set *des, + struct ptlrpc_request_set *src) +{ + struct ptlrpc_request *req; + int rc = 0; + + spin_lock(&src->set_new_req_lock); + if (likely(!list_empty(&src->set_new_requests))) { + list_for_each_entry(req, &src->set_new_requests, rq_set_chain) + req->rq_set = des; + + list_splice_init(&src->set_new_requests, + &des->set_requests); + rc = atomic_read(&src->set_new_count); + atomic_add(rc, &des->set_remaining); + atomic_set(&src->set_new_count, 0); + } + spin_unlock(&src->set_new_req_lock); + return rc; +} + +/** + * Requests that are added to the ptlrpcd queue are sent via + * ptlrpcd_check->ptlrpc_check_set(). 
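+ *
+ * A minimal caller sketch, modeled on ptlrpc_ping() in pinger.c (error
+ * handling trimmed; any reply processing happens in the request's
+ * completion callback, which runs in ptlrpcd context and therefore
+ * must not block on further RPCs):
+ *
+ *	req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
+ *					LUSTRE_OBD_VERSION, OBD_PING);
+ *	if (req) {
+ *		ptlrpc_request_set_replen(req);
+ *		ptlrpcd_add_req(req);
+ *	}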
+ */ +void ptlrpcd_add_req(struct ptlrpc_request *req) +{ + struct ptlrpcd_ctl *pc; + + if (req->rq_reqmsg) + lustre_msg_set_jobid(req->rq_reqmsg, NULL); + + spin_lock(&req->rq_lock); + if (req->rq_invalid_rqset) { + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + if (wait_event_idle_timeout(req->rq_set_waitq, + req->rq_set == NULL, + cfs_time_seconds(5)) == 0) + l_wait_event_abortable(req->rq_set_waitq, + req->rq_set == NULL); + } else if (req->rq_set) { + /* + * If we have a valid "rq_set", just reuse it to avoid double + * linking. + */ + LASSERT(req->rq_phase == RQ_PHASE_NEW); + LASSERT(req->rq_send_state == LUSTRE_IMP_REPLAY); + + /* ptlrpc_check_set will decrease the count */ + atomic_inc(&req->rq_set->set_remaining); + spin_unlock(&req->rq_lock); + wake_up(&req->rq_set->set_waitq); + return; + } else { + spin_unlock(&req->rq_lock); + } + + pc = ptlrpcd_select_pc(req); + + DEBUG_REQ(D_INFO, req, "add req [%p] to pc [%s+%d]", + req, pc->pc_name, pc->pc_index); + + ptlrpc_set_add_new_req(pc, req); +} +EXPORT_SYMBOL(ptlrpcd_add_req); + +static inline void ptlrpc_reqset_get(struct ptlrpc_request_set *set) +{ + atomic_inc(&set->set_refcount); +} + +/** + * Check if there is more work to do on ptlrpcd set. + * Returns 1 if yes. + */ +static int ptlrpcd_check(struct lu_env *env, struct ptlrpcd_ctl *pc) +{ + struct ptlrpc_request *req, *tmp; + struct ptlrpc_request_set *set = pc->pc_set; + int rc = 0; + int rc2; + + ENTRY; + + if (atomic_read(&set->set_new_count)) { + spin_lock(&set->set_new_req_lock); + if (likely(!list_empty(&set->set_new_requests))) { + list_splice_init(&set->set_new_requests, + &set->set_requests); + atomic_add(atomic_read(&set->set_new_count), + &set->set_remaining); + atomic_set(&set->set_new_count, 0); + /* + * Need to calculate its timeout. + */ + rc = 1; + } + spin_unlock(&set->set_new_req_lock); + } + + /* + * We should call lu_env_refill() before handling new requests to make + * sure that the env keys the requests depend on really exist. + */ + rc2 = lu_env_refill(env); + if (rc2 != 0) { + /* + * XXX This is a very awkward situation, because + * execution can neither continue (request + * interpreters assume that env is set up), nor repeat + * the loop (as this potentially results in a tight + * loop of -ENOMEM's). + * + * Fortunately, refill only ever does something when + * new modules are loaded, i.e., early during boot up. + */ + CERROR("Failure to refill session: %d\n", rc2); + RETURN(rc); + } + + if (atomic_read(&set->set_remaining)) + rc |= ptlrpc_check_set(env, set); + + /* + * NB: ptlrpc_check_set has already moved completed requests to the + * head of set::set_requests + */ + list_for_each_entry_safe(req, tmp, &set->set_requests, rq_set_chain) { + if (req->rq_phase != RQ_PHASE_COMPLETE) + break; + + list_del_init(&req->rq_set_chain); + req->rq_set = NULL; + ptlrpc_req_finished(req); + } + + if (rc == 0) { + /* + * If new requests have been added, make sure to wake up. + */ + rc = atomic_read(&set->set_new_count); + + /* + * If we have nothing to do, check whether we can take some + * work from our partner threads.
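+ * For example, if this thread's set is empty while a partner has three
+ * requests sitting on its set_new_requests list, ptlrpcd_steal_rqset()
+ * below splices all three onto our set, adds 3 to our set_remaining and
+ * returns 3, so this pass reports that there is work to do.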
+ */ + if (rc == 0 && pc->pc_npartners > 0) { + struct ptlrpcd_ctl *partner; + struct ptlrpc_request_set *ps; + int first = pc->pc_cursor; + + do { + partner = pc->pc_partners[pc->pc_cursor++]; + if (pc->pc_cursor >= pc->pc_npartners) + pc->pc_cursor = 0; + if (partner == NULL) + continue; + + spin_lock(&partner->pc_lock); + ps = partner->pc_set; + if (ps == NULL) { + spin_unlock(&partner->pc_lock); + continue; + } + + ptlrpc_reqset_get(ps); + spin_unlock(&partner->pc_lock); + + if (atomic_read(&ps->set_new_count)) { + rc = ptlrpcd_steal_rqset(set, ps); + if (rc > 0) + CDEBUG(D_RPCTRACE, + "transfer %d async RPCs [%d->%d]\n", + rc, partner->pc_index, + pc->pc_index); + } + ptlrpc_reqset_put(ps); + } while (rc == 0 && pc->pc_cursor != first); + } + } + + RETURN(rc || test_bit(LIOD_STOP, &pc->pc_flags)); +} + +/** + * Main ptlrpcd thread. + * ptlrpc's code paths like to execute in process context, so we have this + * thread which spins on a set which contains the rpcs and sends them. + */ +static int ptlrpcd(void *arg) +{ + struct ptlrpcd_ctl *pc = arg; + struct ptlrpc_request_set *set; + struct lu_context ses = { 0 }; + struct lu_env env = { .le_ses = &ses }; + int rc = 0; + int exit = 0; + + ENTRY; + unshare_fs_struct(); + if (cfs_cpt_bind(cfs_cpt_tab, pc->pc_cpt) != 0) + CWARN("Failed to bind %s on CPT %d\n", pc->pc_name, pc->pc_cpt); + + /* + * Allocate the request set after the thread has been bound + * above. This is safe because no requests will be queued + * until all ptlrpcd threads have confirmed that they have + * successfully started. + */ + set = ptlrpc_prep_set(); + if (set == NULL) + GOTO(failed, rc = -ENOMEM); + spin_lock(&pc->pc_lock); + pc->pc_set = set; + spin_unlock(&pc->pc_lock); + + /* Both client and server (MDT/OST) may use the environment. */ + rc = lu_context_init(&env.le_ctx, LCT_MD_THREAD | + LCT_DT_THREAD | + LCT_CL_THREAD | + LCT_REMEMBER | + LCT_NOREF); + if (rc != 0) + GOTO(failed, rc); + rc = lu_context_init(env.le_ses, LCT_SESSION | + LCT_REMEMBER | + LCT_NOREF); + if (rc != 0) { + lu_context_fini(&env.le_ctx); + GOTO(failed, rc); + } + + complete(&pc->pc_starting); + + /* + * This mainloop strongly resembles ptlrpc_set_wait() except that our + * set never completes. ptlrpcd_check() calls ptlrpc_check_set() when + * there are requests in the set. New requests come in on the set's + * new_req_list and ptlrpcd_check() moves them into the set. + */ + do { + DEFINE_WAIT_FUNC(wait, woken_wake_function); + time64_t timeout; + + timeout = cfs_time_seconds(ptlrpc_set_next_timeout(set)); + + lu_context_enter(&env.le_ctx); + lu_context_enter(env.le_ses); + + add_wait_queue(&set->set_waitq, &wait); + while (!ptlrpcd_check(&env, pc)) { + int ret; + + if (timeout == 0) + ret = wait_woken(&wait, TASK_IDLE, + MAX_SCHEDULE_TIMEOUT); + else { + ret = wait_woken(&wait, TASK_IDLE, timeout); + if (ret > 0) + timeout = ret; + } + if (ret != 0) + continue; + /* Timed out */ + ptlrpc_expired_set(set); + break; + } + remove_wait_queue(&set->set_waitq, &wait); + + lu_context_exit(&env.le_ctx); + lu_context_exit(env.le_ses); + + /* + * Abort inflight rpcs for forced stop case. + */ + if (test_bit(LIOD_STOP, &pc->pc_flags)) { + if (test_bit(LIOD_FORCE, &pc->pc_flags)) + ptlrpc_abort_set(set); + exit++; + } + + /* + * Let's make one more loop to make sure that ptlrpcd_check() + * copied all raced new rpcs into the set so we can kill them. + */ + } while (exit < 2); + + /* + * Wait for inflight requests to drain. 
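+ * Shutdown thus proceeds in three steps: LIOD_STOP (with LIOD_FORCE
+ * aborting the set) ends the main loop after one extra pass to collect
+ * raced requests, ptlrpc_set_wait() below drains anything still in
+ * flight, and pc_finishing signals ptlrpcd_free() that the thread is
+ * done.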
+ */ + if (!list_empty(&set->set_requests)) + ptlrpc_set_wait(&env, set); + lu_context_fini(&env.le_ctx); + lu_context_fini(env.le_ses); + + complete(&pc->pc_finishing); + + return 0; + +failed: + pc->pc_error = rc; + complete(&pc->pc_starting); + RETURN(rc); +} + +static void ptlrpcd_ctl_init(struct ptlrpcd_ctl *pc, int index, int cpt) +{ + ENTRY; + + pc->pc_index = index; + pc->pc_cpt = cpt; + init_completion(&pc->pc_starting); + init_completion(&pc->pc_finishing); + spin_lock_init(&pc->pc_lock); + + if (index < 0) { + /* Recovery thread. */ + snprintf(pc->pc_name, sizeof(pc->pc_name), "ptlrpcd_rcv"); + } else { + /* Regular thread. */ + snprintf(pc->pc_name, sizeof(pc->pc_name), + "ptlrpcd_%02d_%02d", cpt, index); + } + + EXIT; +} + +/* XXX: We want multiple CPU cores to share the async RPC load. So we + * start many ptlrpcd threads. We also want to reduce the ptlrpcd + * overhead caused by data transfer cross-CPU cores. So we bind + * all ptlrpcd threads to a CPT, in the expectation that CPTs + * will be defined in a way that matches these boundaries. Within + * a CPT a ptlrpcd thread can be scheduled on any available core. + * + * Each ptlrpcd thread has its own request queue. This can cause + * response delay if the thread is already busy. To help with + * this we define partner threads: these are other threads bound + * to the same CPT which will check for work in each other's + * request queues if they have no work to do. + * + * The desired number of partner threads can be tuned by setting + * ptlrpcd_partner_group_size. The default is to create pairs of + * partner threads. + */ +static int ptlrpcd_partners(struct ptlrpcd *pd, int index) +{ + struct ptlrpcd_ctl *pc; + struct ptlrpcd_ctl **ppc; + int first; + int i; + int rc = 0; + + ENTRY; + + LASSERT(index >= 0 && index < pd->pd_nthreads); + pc = &pd->pd_threads[index]; + pc->pc_npartners = pd->pd_groupsize - 1; + + if (pc->pc_npartners <= 0) + GOTO(out, rc); + + OBD_CPT_ALLOC(pc->pc_partners, cfs_cpt_tab, pc->pc_cpt, + sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners); + if (pc->pc_partners == NULL) { + pc->pc_npartners = 0; + GOTO(out, rc = -ENOMEM); + } + + first = index - index % pd->pd_groupsize; + ppc = pc->pc_partners; + for (i = first; i < first + pd->pd_groupsize; i++) { + if (i != index) + *ppc++ = &pd->pd_threads[i]; + } +out: + RETURN(rc); +} + +int ptlrpcd_start(struct ptlrpcd_ctl *pc) +{ + struct task_struct *task; + int rc = 0; + + ENTRY; + + /* + * Do not allow starting a second thread for one pc. 
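+ * The LIOD_START bit is tested and set atomically below, so a repeated
+ * call for the same pc only emits a warning and returns 0; only the
+ * first caller spawns the kthread and waits on pc_starting for its
+ * startup result.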
+ */ + if (test_and_set_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Starting second thread (%s) for same pc %p\n", + pc->pc_name, pc); + RETURN(0); + } + + task = kthread_run(ptlrpcd, pc, "%s", pc->pc_name); + if (IS_ERR(task)) + GOTO(out_set, rc = PTR_ERR(task)); + + wait_for_completion(&pc->pc_starting); + rc = pc->pc_error; + if (rc != 0) + GOTO(out_set, rc); + + RETURN(0); + +out_set: + if (pc->pc_set != NULL) { + struct ptlrpc_request_set *set = pc->pc_set; + + spin_lock(&pc->pc_lock); + pc->pc_set = NULL; + spin_unlock(&pc->pc_lock); + ptlrpc_set_destroy(set); + } + clear_bit(LIOD_START, &pc->pc_flags); + RETURN(rc); +} + +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force) +{ + ENTRY; + + if (!test_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Thread for pc %p was not started\n", pc); + goto out; + } + + set_bit(LIOD_STOP, &pc->pc_flags); + if (force) + set_bit(LIOD_FORCE, &pc->pc_flags); + wake_up(&pc->pc_set->set_waitq); + +out: + EXIT; +} + +void ptlrpcd_free(struct ptlrpcd_ctl *pc) +{ + struct ptlrpc_request_set *set = pc->pc_set; + + ENTRY; + + if (!test_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Thread for pc %p was not started\n", pc); + goto out; + } + + wait_for_completion(&pc->pc_finishing); + + spin_lock(&pc->pc_lock); + pc->pc_set = NULL; + spin_unlock(&pc->pc_lock); + ptlrpc_set_destroy(set); + + clear_bit(LIOD_START, &pc->pc_flags); + clear_bit(LIOD_STOP, &pc->pc_flags); + clear_bit(LIOD_FORCE, &pc->pc_flags); + +out: + if (pc->pc_npartners > 0) { + LASSERT(pc->pc_partners != NULL); + + OBD_FREE_PTR_ARRAY(pc->pc_partners, pc->pc_npartners); + pc->pc_partners = NULL; + } + pc->pc_npartners = 0; + pc->pc_error = 0; + EXIT; +} + +static void ptlrpcd_fini(void) +{ + int i; + int j; + int ncpts; + + ENTRY; + + if (ptlrpcds != NULL) { + for (i = 0; i < ptlrpcds_num; i++) { + if (ptlrpcds[i] == NULL) + break; + for (j = 0; j < ptlrpcds[i]->pd_nthreads; j++) + ptlrpcd_stop(&ptlrpcds[i]->pd_threads[j], 0); + for (j = 0; j < ptlrpcds[i]->pd_nthreads; j++) + ptlrpcd_free(&ptlrpcds[i]->pd_threads[j]); + OBD_FREE(ptlrpcds[i], ptlrpcds[i]->pd_size); + ptlrpcds[i] = NULL; + } + OBD_FREE_PTR_ARRAY(ptlrpcds, ptlrpcds_num); + } + ptlrpcds_num = 0; + + ptlrpcd_stop(&ptlrpcd_rcv, 0); + ptlrpcd_free(&ptlrpcd_rcv); + + if (ptlrpcds_cpt_idx != NULL) { + ncpts = cfs_cpt_number(cfs_cpt_tab); + OBD_FREE_PTR_ARRAY(ptlrpcds_cpt_idx, ncpts); + ptlrpcds_cpt_idx = NULL; + } + + EXIT; +} + +static int ptlrpcd_init(void) +{ + int nthreads; + int groupsize; + int size; + int i; + int j; + int rc = 0; + struct cfs_cpt_table *cptable; + __u32 *cpts = NULL; + int ncpts; + int cpt; + struct ptlrpcd *pd; + + ENTRY; + + /* + * Determine the CPTs that ptlrpcd threads will run on. + */ + cptable = cfs_cpt_tab; + ncpts = cfs_cpt_number(cptable); + if (ptlrpcd_cpts != NULL) { + struct cfs_expr_list *el; + + size = ncpts * sizeof(ptlrpcds_cpt_idx[0]); + OBD_ALLOC(ptlrpcds_cpt_idx, size); + if (ptlrpcds_cpt_idx == NULL) + GOTO(out, rc = -ENOMEM); + + rc = cfs_expr_list_parse(ptlrpcd_cpts, + strlen(ptlrpcd_cpts), + 0, ncpts - 1, &el); + if (rc != 0) { + CERROR("%s: invalid CPT pattern string: %s", + "ptlrpcd_cpts", ptlrpcd_cpts); + GOTO(out, rc = -EINVAL); + } + + rc = cfs_expr_list_values(el, ncpts, &cpts); + cfs_expr_list_free(el); + if (rc <= 0) { + CERROR("%s: failed to parse CPT array %s: %d\n", + "ptlrpcd_cpts", ptlrpcd_cpts, rc); + if (rc == 0) + rc = -EINVAL; + GOTO(out, rc); + } + + /* + * Create the cpt-to-index map. When there is no match + * in the cpt table, pick a cpt at random. 
This could + * be changed to take the topology of the system into + * account. + */ + for (cpt = 0; cpt < ncpts; cpt++) { + for (i = 0; i < rc; i++) + if (cpts[i] == cpt) + break; + if (i >= rc) + i = cpt % rc; + ptlrpcds_cpt_idx[cpt] = i; + } + + cfs_expr_list_values_free(cpts, rc); + ncpts = rc; + } + ptlrpcds_num = ncpts; + + size = ncpts * sizeof(ptlrpcds[0]); + OBD_ALLOC(ptlrpcds, size); + if (ptlrpcds == NULL) + GOTO(out, rc = -ENOMEM); + + /* + * The max_ptlrpcds parameter is obsolete, but do something + * sane if it has been tuned, and complain if + * ptlrpcd_per_cpt_max has also been tuned. + */ + if (max_ptlrpcds != 0) { + CWARN("max_ptlrpcds is obsolete.\n"); + if (ptlrpcd_per_cpt_max == 0) { + ptlrpcd_per_cpt_max = max_ptlrpcds / ncpts; + /* Round up if there is a remainder. */ + if (max_ptlrpcds % ncpts != 0) + ptlrpcd_per_cpt_max++; + CWARN("Setting ptlrpcd_per_cpt_max = %d\n", + ptlrpcd_per_cpt_max); + } else { + CWARN("ptlrpd_per_cpt_max is also set!\n"); + } + } + + /* + * The ptlrpcd_bind_policy parameter is obsolete, but do + * something sane if it has been tuned, and complain if + * ptlrpcd_partner_group_size is also tuned. + */ + if (ptlrpcd_bind_policy != 0) { + CWARN("ptlrpcd_bind_policy is obsolete.\n"); + if (ptlrpcd_partner_group_size == 0) { + switch (ptlrpcd_bind_policy) { + case 1: /* PDB_POLICY_NONE */ + case 2: /* PDB_POLICY_FULL */ + ptlrpcd_partner_group_size = 1; + break; + case 3: /* PDB_POLICY_PAIR */ + ptlrpcd_partner_group_size = 2; + break; + case 4: /* PDB_POLICY_NEIGHBOR */ +#ifdef CONFIG_NUMA + ptlrpcd_partner_group_size = -1; /* CPT */ +#else + ptlrpcd_partner_group_size = 3; /* Triplets */ +#endif + break; + default: /* Illegal value, use the default. */ + ptlrpcd_partner_group_size = 2; + break; + } + CWARN("Setting ptlrpcd_partner_group_size = %d\n", + ptlrpcd_partner_group_size); + } else { + CWARN("ptlrpcd_partner_group_size is also set!\n"); + } + } + + if (ptlrpcd_partner_group_size == 0) + ptlrpcd_partner_group_size = 2; + else if (ptlrpcd_partner_group_size < 0) + ptlrpcd_partner_group_size = -1; + else if (ptlrpcd_per_cpt_max > 0 && + ptlrpcd_partner_group_size > ptlrpcd_per_cpt_max) + ptlrpcd_partner_group_size = ptlrpcd_per_cpt_max; + + /* + * Start the recovery thread first. + */ + set_bit(LIOD_RECOVERY, &ptlrpcd_rcv.pc_flags); + ptlrpcd_ctl_init(&ptlrpcd_rcv, -1, CFS_CPT_ANY); + rc = ptlrpcd_start(&ptlrpcd_rcv); + if (rc < 0) + GOTO(out, rc); + + for (i = 0; i < ncpts; i++) { + if (cpts == NULL) + cpt = i; + else + cpt = cpts[i]; + + nthreads = cfs_cpt_weight(cptable, cpt); + if (ptlrpcd_per_cpt_max > 0 && ptlrpcd_per_cpt_max < nthreads) + nthreads = ptlrpcd_per_cpt_max; + if (nthreads < 2) + nthreads = 2; + + if (ptlrpcd_partner_group_size <= 0) { + groupsize = nthreads; + } else if (nthreads <= ptlrpcd_partner_group_size) { + groupsize = nthreads; + } else { + groupsize = ptlrpcd_partner_group_size; + if (nthreads % groupsize != 0) + nthreads += groupsize - (nthreads % groupsize); + } + + size = offsetof(struct ptlrpcd, pd_threads[nthreads]); + OBD_CPT_ALLOC(pd, cptable, cpt, size); + + if (!pd) + GOTO(out, rc = -ENOMEM); + pd->pd_size = size; + pd->pd_index = i; + pd->pd_cpt = cpt; + pd->pd_cursor = 0; + pd->pd_nthreads = nthreads; + pd->pd_groupsize = groupsize; + ptlrpcds[i] = pd; + + /* + * The ptlrpcd threads in a partner group can access + * each other's struct ptlrpcd_ctl, so these must be + * initialized before any thead is started. 
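/*
 * Illustrative aside, not part of the patch: the partner-group arithmetic
 * used above, reduced to standalone userspace C.  Thread counts are rounded
 * up to a multiple of the group size, and a thread's partners are simply the
 * other members of its group.  All names here are hypothetical.
 */
#include <stdio.h>

static int round_up_to_group(int nthreads, int groupsize)
{
        if (nthreads % groupsize != 0)
                nthreads += groupsize - (nthreads % groupsize);
        return nthreads;
}

static void print_partners(int index, int groupsize)
{
        int first = index - index % groupsize;  /* first member of the group */
        int i;

        printf("thread %d partners:", index);
        for (i = first; i < first + groupsize; i++)
                if (i != index)
                        printf(" %d", i);
        printf("\n");
}

int main(void)
{
        int nthreads = round_up_to_group(5, 2); /* 5 -> 6, i.e. three pairs */
        int i;

        for (i = 0; i < nthreads; i++)
                print_partners(i, 2);
        return 0;
}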
+ */ + for (j = 0; j < nthreads; j++) { + ptlrpcd_ctl_init(&pd->pd_threads[j], j, cpt); + rc = ptlrpcd_partners(pd, j); + if (rc < 0) + GOTO(out, rc); + } + + /* XXX: We start nthreads ptlrpc daemons on this cpt. + * Each of them can process any non-recovery + * async RPC to improve overall async RPC + * efficiency. + * + * But there are some issues with async I/O RPCs + * and async non-I/O RPCs processed in the same + * set under some cases. The ptlrpcd may be + * blocked by some async I/O RPC(s), then will + * cause other async non-I/O RPC(s) can not be + * processed in time. + * + * Maybe we should distinguish blocked async RPCs + * from non-blocked async RPCs, and process them + * in different ptlrpcd sets to avoid unnecessary + * dependency. But how to distribute async RPCs + * load among all the ptlrpc daemons becomes + * another trouble. + */ + for (j = 0; j < nthreads; j++) { + rc = ptlrpcd_start(&pd->pd_threads[j]); + if (rc < 0) + GOTO(out, rc); + } + } +out: + if (rc != 0) + ptlrpcd_fini(); + + RETURN(rc); +} + +int ptlrpcd_addref(void) +{ + int rc = 0; + + ENTRY; + + mutex_lock(&ptlrpcd_mutex); + if (++ptlrpcd_users == 1) { + rc = ptlrpcd_init(); + if (rc < 0) + ptlrpcd_users--; + } + mutex_unlock(&ptlrpcd_mutex); + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpcd_addref); + +void ptlrpcd_decref(void) +{ + mutex_lock(&ptlrpcd_mutex); + if (--ptlrpcd_users == 0) + ptlrpcd_fini(); + mutex_unlock(&ptlrpcd_mutex); +} +EXPORT_SYMBOL(ptlrpcd_decref); +/** @} ptlrpcd */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c new file mode 100644 index 0000000000000..c44b56c767885 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c @@ -0,0 +1,377 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/recover.c + * + * Author: Mike Shaver + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +/** + * Start recovery on disconnected import. + * This is done by just attempting a connect + */ +void ptlrpc_initiate_recovery(struct obd_import *imp) +{ + ENTRY; + + CDEBUG(D_HA, "%s: starting recovery\n", obd2cli_tgt(imp->imp_obd)); + ptlrpc_connect_import(imp); + + EXIT; +} + +/** + * Identify what request from replay list needs to be replayed next + * (based on what we have already replayed) and send it to server. 
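/*
 * Illustrative aside, not part of the patch: the ptlrpcd_addref() /
 * ptlrpcd_decref() pattern earlier in this hunk -- the first user performs
 * the one-time setup, the last user tears it down, all serialized by a
 * single mutex.  Standalone userspace sketch with hypothetical names, using
 * pthreads in place of the kernel mutex.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t users_lock = PTHREAD_MUTEX_INITIALIZER;
static int users;

static int subsystem_init(void)  { printf("init\n"); return 0; }
static void subsystem_fini(void) { printf("fini\n"); }

static int subsystem_addref(void)
{
        int rc = 0;

        pthread_mutex_lock(&users_lock);
        if (++users == 1) {
                rc = subsystem_init();
                if (rc < 0)
                        users--;        /* failed init: drop the reference */
        }
        pthread_mutex_unlock(&users_lock);
        return rc;
}

static void subsystem_decref(void)
{
        pthread_mutex_lock(&users_lock);
        if (--users == 0)
                subsystem_fini();
        pthread_mutex_unlock(&users_lock);
}

int main(void)
{
        subsystem_addref();     /* "init" runs once */
        subsystem_addref();     /* only the counter moves */
        subsystem_decref();
        subsystem_decref();     /* "fini" runs on the last drop */
        return 0;
}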
+ */ +int ptlrpc_replay_next(struct obd_import *imp, int *inflight) +{ + int rc = 0; + struct ptlrpc_request *req = NULL; + __u64 last_transno; + ENTRY; + + *inflight = 0; + + /* It might have committed some after we last spoke, so make sure we + * get rid of them now. + */ + spin_lock(&imp->imp_lock); + imp->imp_last_transno_checked = 0; + ptlrpc_free_committed(imp); + last_transno = imp->imp_last_replay_transno; + + CDEBUG(D_HA, "import %p from %s committed %llu last %llu\n", + imp, obd2cli_tgt(imp->imp_obd), + imp->imp_peer_committed_transno, last_transno); + + /* Replay all the committed open requests on committed_list first */ + if (!list_empty(&imp->imp_committed_list)) { + req = list_last_entry(&imp->imp_committed_list, + struct ptlrpc_request, rq_replay_list); + + /* The last request on committed_list hasn't been replayed */ + if (req->rq_transno > last_transno) { + if (!imp->imp_resend_replay || + imp->imp_replay_cursor == &imp->imp_committed_list) + imp->imp_replay_cursor = + imp->imp_replay_cursor->next; + + while (imp->imp_replay_cursor != + &imp->imp_committed_list) { + req = list_entry(imp->imp_replay_cursor, + struct ptlrpc_request, + rq_replay_list); + if (req->rq_transno > last_transno) + break; + + req = NULL; + LASSERT(!list_empty(imp->imp_replay_cursor)); + imp->imp_replay_cursor = + imp->imp_replay_cursor->next; + } + } else { + /* All requests on committed_list have been replayed */ + imp->imp_replay_cursor = &imp->imp_committed_list; + req = NULL; + } + } + + /* All the requests in committed list have been replayed, let's replay + * the imp_replay_list */ + if (req == NULL) { + struct ptlrpc_request *tmp; + + list_for_each_entry(tmp, &imp->imp_replay_list, + rq_replay_list) { + if (tmp->rq_transno > last_transno) { + req = tmp; + break; + } + } + } + + /* If need to resend the last sent transno (because a reconnect + * has occurred), then stop on the matching req and send it again. + * If, however, the last sent transno has been committed then we + * continue replay from the next request. */ + if (req != NULL && imp->imp_resend_replay) + lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); + + /* ptlrpc_prepare_replay() may fail to add the reqeust into unreplied + * list if the request hasn't been added to replay list then. Another + * exception is that resend replay could have been removed from the + * unreplied list. */ + if (req != NULL && list_empty(&req->rq_unreplied_list)) { + DEBUG_REQ(D_HA, req, "resend_replay=%d, last_transno=%llu", + imp->imp_resend_replay, last_transno); + ptlrpc_add_unreplied(req); + imp->imp_known_replied_xid = ptlrpc_known_replied_xid(imp); + } + + imp->imp_resend_replay = 0; + spin_unlock(&imp->imp_lock); + + if (req != NULL) { + LASSERT(!list_empty(&req->rq_unreplied_list)); + + rc = ptlrpc_replay_req(req); + if (rc) { + CERROR("recovery replay error %d for req %llu\n", + rc, req->rq_xid); + RETURN(rc); + } + *inflight = 1; + } + RETURN(rc); +} + +/** + * Schedule resending of request on sending_list. This is done after + * we completed replaying of requests and locks. + */ +int ptlrpc_resend(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + ENTRY; + + /* As long as we're in recovery, nothing should be added to the sending + * list, so we don't need to hold the lock during this iteration and + * resend process. + */ + /* Well... what if lctl recover is called twice at the same time? 
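/*
 * Illustrative aside, not part of the patch: the selection rule at the heart
 * of ptlrpc_replay_next() above -- walk the committed list first, then the
 * replay list, and pick the first request whose transno is greater than the
 * last transno already replayed.  Hypothetical types, standalone C.
 */
#include <stdio.h>

struct fake_req {
        unsigned long long      transno;
        struct fake_req         *next;
};

static struct fake_req *pick_next(struct fake_req *committed,
                                  struct fake_req *replay,
                                  unsigned long long last_transno)
{
        struct fake_req *req;

        for (req = committed; req != NULL; req = req->next)
                if (req->transno > last_transno)
                        return req;
        for (req = replay; req != NULL; req = req->next)
                if (req->transno > last_transno)
                        return req;
        return NULL;                    /* nothing left to replay */
}

int main(void)
{
        struct fake_req c2 = { 7, NULL }, c1 = { 5, &c2 };
        struct fake_req r1 = { 9, NULL };
        struct fake_req *next = pick_next(&c1, &r1, 6);

        printf("next transno to replay: %llu\n", next ? next->transno : 0);
        return 0;
}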
+ */ + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_RECOVER) { + spin_unlock(&imp->imp_lock); + RETURN(-1); + } + + list_for_each_entry(req, &imp->imp_sending_list, rq_list) { + LASSERTF((long)req > PAGE_SIZE && req != LP_POISON, + "req %p bad\n", req); + LASSERTF(req->rq_type != LI_POISON, "req %p freed\n", req); + + /* If the request is allowed to be sent during replay and it + * is not timeout yet, then it does not need to be resent. */ + if (!ptlrpc_no_resend(req) && + (req->rq_timedout || !req->rq_allow_replay)) + ptlrpc_resend_req(req); + } + spin_unlock(&imp->imp_lock); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT, 2); + RETURN(0); +} + +/** + * Go through all requests in delayed list and wake their threads + * for resending + */ +void ptlrpc_wake_delayed(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + spin_lock(&imp->imp_lock); + list_for_each_entry(req, &imp->imp_delayed_list, rq_list) { + DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set); + ptlrpc_client_wake_req(req); + } + spin_unlock(&imp->imp_lock); +} + +void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) +{ + struct obd_import *imp = failed_req->rq_import; + int conn = lustre_msg_get_conn_cnt(failed_req->rq_reqmsg); + ENTRY; + + CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + if (ptlrpc_set_import_discon(imp, conn, true)) { + /* to control recovery via lctl {disable|enable}_recovery */ + if (imp->imp_deactive == 0) + ptlrpc_connect_import(imp); + } + + /* Wait for recovery to complete and resend. If evicted, then + this request will be errored out later.*/ + spin_lock(&failed_req->rq_lock); + if (!failed_req->rq_no_resend) + failed_req->rq_resend = 1; + spin_unlock(&failed_req->rq_lock); + + EXIT; +} + +/** + * Administratively active/deactive a client. + * This should only be called by the ioctl interface, currently + * - the lctl deactivate and activate commands + * - echo 0/1 >> /proc/osc/XXX/active + * - client umount -f (ll_umount_begin) + */ +int ptlrpc_set_import_active(struct obd_import *imp, int active) +{ + struct obd_device *obd = imp->imp_obd; + int rc = 0; + + ENTRY; + LASSERT(obd); + + /* When deactivating, mark import invalid, and abort in-flight + * requests. 
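/*
 * Illustrative aside, not part of the patch: the ordering used on the
 * deactivate path described above -- the "deactive" flag is set under the
 * import lock before the long-running invalidation starts, so any observer
 * that sees requests being aborted also sees the import marked inactive.
 * Standalone userspace sketch with hypothetical names.
 */
#include <pthread.h>
#include <stdio.h>

struct fake_import {
        pthread_mutex_t lock;
        int             deactive;
        int             invalidated;
};

static void fake_invalidate(struct fake_import *imp)
{
        /* heavy teardown of in-flight requests would happen here */
        imp->invalidated = 1;
}

static void set_import_active(struct fake_import *imp, int active)
{
        pthread_mutex_lock(&imp->lock);
        imp->deactive = !active;        /* flag first, under the lock */
        pthread_mutex_unlock(&imp->lock);

        if (!active)
                fake_invalidate(imp);   /* heavy work outside the lock */
}

int main(void)
{
        struct fake_import imp = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

        set_import_active(&imp, 0);
        printf("deactive=%d invalidated=%d\n", imp.deactive, imp.invalidated);
        return 0;
}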
*/ + if (!active) { + LCONSOLE_WARN("setting import %s INACTIVE by administrator " + "request\n", obd2cli_tgt(imp->imp_obd)); + + /* set before invalidate to avoid messages about imp_inval + * set without imp_deactive in ptlrpc_import_delay_req */ + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DEACTIVATE); + + ptlrpc_invalidate_import(imp); + } + + /* When activating, mark import valid, and attempt recovery */ + if (active) { + CDEBUG(D_HA, "setting import %s VALID\n", + obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); + imp->imp_deactive = 0; + spin_unlock(&imp->imp_lock); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_ACTIVATE); + + rc = ptlrpc_recover_import(imp, NULL, 0); + } + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_set_import_active); + +/* Attempt to reconnect an import */ +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) +{ + int rc = 0; + ENTRY; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_NEW || imp->imp_deactive || + atomic_read(&imp->imp_inval_count)) + rc = -EINVAL; + spin_unlock(&imp->imp_lock); + if (rc) + GOTO(out, rc); + + /* force import to be disconnected. */ + ptlrpc_set_import_discon(imp, 0, false); + + if (new_uuid) { + struct obd_uuid uuid; + + /* intruct import to use new uuid */ + obd_str2uuid(&uuid, new_uuid); + rc = import_set_conn_priority(imp, &uuid); + if (rc) + GOTO(out, rc); + } + + /* Check if reconnect is already in progress */ + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_DISCON) { + imp->imp_force_verify = 1; + rc = -EALREADY; + } + spin_unlock(&imp->imp_lock); + if (rc) + GOTO(out, rc); + + OBD_RACE(OBD_FAIL_PTLRPC_CONNECT_RACE); + + rc = ptlrpc_connect_import(imp); + if (rc) + GOTO(out, rc); + + if (!async) { + long timeout = cfs_time_seconds(obd_timeout); + + CDEBUG(D_HA, "%s: recovery started, waiting %u jiffies\n", + obd2cli_tgt(imp->imp_obd), obd_timeout); + + rc = wait_event_idle_timeout(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), + timeout); + if (rc == 0) + rc = -ETIMEDOUT; + else + rc = 0; + CDEBUG(D_HA, "%s: recovery finished\n", + obd2cli_tgt(imp->imp_obd)); + } + EXIT; + +out: + return rc; +} +EXPORT_SYMBOL(ptlrpc_recover_import); + +int ptlrpc_import_in_recovery(struct obd_import *imp) +{ + int in_recovery = 1; + + spin_lock(&imp->imp_lock); + if (imp->imp_state <= LUSTRE_IMP_DISCON || + imp->imp_state >= LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov) + in_recovery = 0; + spin_unlock(&imp->imp_lock); + + return in_recovery; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c new file mode 100644 index 0000000000000..d126df52518c8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c @@ -0,0 +1,2762 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/sec.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static int send_sepol; +module_param(send_sepol, int, 0644); +MODULE_PARM_DESC(send_sepol, "Client sends SELinux policy status"); + +/* + * policy registers + */ + +static rwlock_t policy_lock; +static struct ptlrpc_sec_policy *policies[SPTLRPC_POLICY_MAX] = { + NULL, +}; + +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy) +{ + __u16 number = policy->sp_policy; + + LASSERT(policy->sp_name); + LASSERT(policy->sp_cops); + LASSERT(policy->sp_sops); + + if (number >= SPTLRPC_POLICY_MAX) + return -EINVAL; + + write_lock(&policy_lock); + if (unlikely(policies[number])) { + write_unlock(&policy_lock); + return -EALREADY; + } + policies[number] = policy; + write_unlock(&policy_lock); + + CDEBUG(D_SEC, "%s: registered\n", policy->sp_name); + return 0; +} +EXPORT_SYMBOL(sptlrpc_register_policy); + +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy) +{ + __u16 number = policy->sp_policy; + + LASSERT(number < SPTLRPC_POLICY_MAX); + + write_lock(&policy_lock); + if (unlikely(policies[number] == NULL)) { + write_unlock(&policy_lock); + CERROR("%s: already unregistered\n", policy->sp_name); + return -EINVAL; + } + + LASSERT(policies[number] == policy); + policies[number] = NULL; + write_unlock(&policy_lock); + + CDEBUG(D_SEC, "%s: unregistered\n", policy->sp_name); + return 0; +} +EXPORT_SYMBOL(sptlrpc_unregister_policy); + +static +struct ptlrpc_sec_policy *sptlrpc_wireflavor2policy(__u32 flavor) +{ + static DEFINE_MUTEX(load_mutex); + static atomic_t loaded = ATOMIC_INIT(0); + struct ptlrpc_sec_policy *policy; + __u16 number = SPTLRPC_FLVR_POLICY(flavor); + __u16 flag = 0; + + if (number >= SPTLRPC_POLICY_MAX) + return NULL; + + while (1) { + read_lock(&policy_lock); + policy = policies[number]; + if (policy && !try_module_get(policy->sp_owner)) + policy = NULL; + if (policy == NULL) + flag = atomic_read(&loaded); + read_unlock(&policy_lock); + + if (policy != NULL || flag != 0 || + number != SPTLRPC_POLICY_GSS) + break; + + /* try to load gss module, once */ + mutex_lock(&load_mutex); + if (atomic_read(&loaded) == 0) { + if (request_module("ptlrpc_gss") == 0) + CDEBUG(D_SEC, + "module ptlrpc_gss loaded on demand\n"); + else + CERROR("Unable to load module ptlrpc_gss\n"); + + atomic_set(&loaded, 1); + } + mutex_unlock(&load_mutex); + } + + return policy; +} + +__u32 sptlrpc_name2flavor_base(const char *name) +{ + if (!strcmp(name, "null")) + return SPTLRPC_FLVR_NULL; + if (!strcmp(name, "plain")) + return SPTLRPC_FLVR_PLAIN; + if (!strcmp(name, "gssnull")) + return SPTLRPC_FLVR_GSSNULL; + if (!strcmp(name, "krb5n")) + return SPTLRPC_FLVR_KRB5N; + if (!strcmp(name, "krb5a")) + return SPTLRPC_FLVR_KRB5A; + if (!strcmp(name, "krb5i")) + return SPTLRPC_FLVR_KRB5I; + if (!strcmp(name, "krb5p")) + return SPTLRPC_FLVR_KRB5P; + if (!strcmp(name, "skn")) + return SPTLRPC_FLVR_SKN; + if (!strcmp(name, "ska")) + 
return SPTLRPC_FLVR_SKA; + if (!strcmp(name, "ski")) + return SPTLRPC_FLVR_SKI; + if (!strcmp(name, "skpi")) + return SPTLRPC_FLVR_SKPI; + + return SPTLRPC_FLVR_INVALID; +} +EXPORT_SYMBOL(sptlrpc_name2flavor_base); + +const char *sptlrpc_flavor2name_base(__u32 flvr) +{ + __u32 base = SPTLRPC_FLVR_BASE(flvr); + + if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) + return "null"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN)) + return "plain"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_GSSNULL)) + return "gssnull"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N)) + return "krb5n"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A)) + return "krb5a"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I)) + return "krb5i"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P)) + return "krb5p"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKN)) + return "skn"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKA)) + return "ska"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKI)) + return "ski"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKPI)) + return "skpi"; + + CERROR("invalid wire flavor 0x%x\n", flvr); + return "invalid"; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_base); + +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize) +{ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) + snprintf(buf, bufsize, "hash:%s", + sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg)); + else + snprintf(buf, bufsize, "%s", + sptlrpc_flavor2name_base(sf->sf_rpc)); + + buf[bufsize - 1] = '\0'; + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_bulk); + +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize) +{ + snprintf(buf, bufsize, "%s", sptlrpc_flavor2name_base(sf->sf_rpc)); + + /* + * currently we don't support customized bulk specification for + * flavors other than plain + */ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) { + char bspec[16]; + + bspec[0] = '-'; + sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1); + strncat(buf, bspec, bufsize); + } + + buf[bufsize - 1] = '\0'; + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name); + +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_SEC_FL_REVERSE) + strlcat(buf, "reverse,", bufsize); + if (flags & PTLRPC_SEC_FL_ROOTONLY) + strlcat(buf, "rootonly,", bufsize); + if (flags & PTLRPC_SEC_FL_UDESC) + strlcat(buf, "udesc,", bufsize); + if (flags & PTLRPC_SEC_FL_BULK) + strlcat(buf, "bulk,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); + + return buf; +} +EXPORT_SYMBOL(sptlrpc_secflags2str); + +/* + * client context APIs + */ + +static +struct ptlrpc_cli_ctx *get_my_ctx(struct ptlrpc_sec *sec) +{ + struct vfs_cred vcred; + int create = 1, remove_dead = 1; + + LASSERT(sec); + LASSERT(sec->ps_policy->sp_cops->lookup_ctx); + + if (sec->ps_flvr.sf_flags & (PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY)) { + vcred.vc_uid = 0; + vcred.vc_gid = 0; + if (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE) { + create = 0; + remove_dead = 0; + } + } else { + vcred.vc_uid = from_kuid(&init_user_ns, current_uid()); + vcred.vc_gid = from_kgid(&init_user_ns, current_gid()); + } + + return sec->ps_policy->sp_cops->lookup_ctx(sec, &vcred, create, + remove_dead); +} + +struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx) +{ + atomic_inc(&ctx->cc_refcount); + return ctx; +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_get); + +void sptlrpc_cli_ctx_put(struct 
ptlrpc_cli_ctx *ctx, int sync) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + + LASSERT(sec); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (!atomic_dec_and_test(&ctx->cc_refcount)) + return; + + sec->ps_policy->sp_cops->release_ctx(sec, ctx, sync); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_put); + +/** + * Expire the client context immediately. + * + * \pre Caller must hold at least 1 reference on the \a ctx. + */ +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(ctx->cc_ops->die); + ctx->cc_ops->die(ctx, 0); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_expire); + +/** + * To wake up the threads who are waiting for this client context. Called + * after some status change happened on \a ctx. + */ +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_request *req, *next; + + spin_lock(&ctx->cc_lock); + list_for_each_entry_safe(req, next, &ctx->cc_req_list, + rq_ctx_chain) { + list_del_init(&req->rq_ctx_chain); + ptlrpc_client_wake_req(req); + } + spin_unlock(&ctx->cc_lock); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_wakeup); + +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize) +{ + LASSERT(ctx->cc_ops); + + if (ctx->cc_ops->display == NULL) + return 0; + + return ctx->cc_ops->display(ctx, buf, bufsize); +} + +static int import_sec_check_expire(struct obd_import *imp) +{ + int adapt = 0; + + write_lock(&imp->imp_sec_lock); + if (imp->imp_sec_expire && + imp->imp_sec_expire < ktime_get_real_seconds()) { + adapt = 1; + imp->imp_sec_expire = 0; + } + write_unlock(&imp->imp_sec_lock); + + if (!adapt) + return 0; + + CDEBUG(D_SEC, "found delayed sec adapt expired, do it now\n"); + return sptlrpc_import_sec_adapt(imp, NULL, NULL); +} + +/** + * Get and validate the client side ptlrpc security facilities from + * \a imp. There is a race condition on client reconnect when the import is + * being destroyed while there are outstanding client bound requests. In + * this case do not output any error messages if import secuity is not + * found. + * + * \param[in] imp obd import associated with client + * \param[out] sec client side ptlrpc security + * + * \retval 0 if security retrieved successfully + * \retval -ve errno if there was a problem + */ +static int import_sec_validate_get(struct obd_import *imp, + struct ptlrpc_sec **sec) +{ + int rc; + + if (unlikely(imp->imp_sec_expire)) { + rc = import_sec_check_expire(imp); + if (rc) + return rc; + } + + *sec = sptlrpc_import_sec_ref(imp); + if (*sec == NULL) { + /* Only output an error when the import is still active */ + if (!test_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(&imp->imp_zombie_work))) + CERROR("import %p (%s) with no sec\n", + imp, ptlrpc_import_state_name(imp->imp_state)); + return -EACCES; + } + + if (unlikely((*sec)->ps_dying)) { + CERROR("attempt to use dying sec %p\n", sec); + sptlrpc_sec_put(*sec); + return -EACCES; + } + + return 0; +} + +/** + * Given a \a req, find or allocate an appropriate context for it. + * \pre req->rq_cli_ctx == NULL. + * + * \retval 0 succeed, and req->rq_cli_ctx is set. + * \retval -ev error number, and req->rq_cli_ctx == NULL. 
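/*
 * Illustrative aside, not part of the patch: the get/put discipline used for
 * client contexts above -- whoever drops the last reference is the one that
 * releases the object.  Userspace rendering with C11 atomics instead of the
 * kernel atomics; all names are hypothetical.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_ctx {
        atomic_int refcount;
};

static struct fake_ctx *ctx_get(struct fake_ctx *ctx)
{
        atomic_fetch_add(&ctx->refcount, 1);
        return ctx;
}

static void ctx_put(struct fake_ctx *ctx)
{
        /* fetch_sub returns the old value; 1 means this was the last ref */
        if (atomic_fetch_sub(&ctx->refcount, 1) == 1) {
                printf("releasing ctx %p\n", (void *)ctx);
                free(ctx);
        }
}

int main(void)
{
        struct fake_ctx *ctx = malloc(sizeof(*ctx));

        atomic_init(&ctx->refcount, 1); /* the creator holds the first ref */
        ctx_get(ctx);                   /* a second user appears */
        ctx_put(ctx);
        ctx_put(ctx);                   /* last put frees the context */
        return 0;
}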
+ */ +int sptlrpc_req_get_ctx(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_sec *sec; + int rc; + + ENTRY; + + LASSERT(!req->rq_cli_ctx); + LASSERT(imp); + + rc = import_sec_validate_get(imp, &sec); + if (rc) + RETURN(rc); + + req->rq_cli_ctx = get_my_ctx(sec); + + sptlrpc_sec_put(sec); + + if (!req->rq_cli_ctx) { + CERROR("req %p: fail to get context\n", req); + RETURN(-ECONNREFUSED); + } + + RETURN(0); +} + +/** + * Drop the context for \a req. + * \pre req->rq_cli_ctx != NULL. + * \post req->rq_cli_ctx == NULL. + * + * If \a sync == 0, this function should return quickly without sleep; + * otherwise it might trigger and wait for the whole process of sending + * an context-destroying rpc to server. + */ +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync) +{ + ENTRY; + + LASSERT(req); + LASSERT(req->rq_cli_ctx); + + /* + * request might be asked to release earlier while still + * in the context waiting list. + */ + if (!list_empty(&req->rq_ctx_chain)) { + spin_lock(&req->rq_cli_ctx->cc_lock); + list_del_init(&req->rq_ctx_chain); + spin_unlock(&req->rq_cli_ctx->cc_lock); + } + + sptlrpc_cli_ctx_put(req->rq_cli_ctx, sync); + req->rq_cli_ctx = NULL; + EXIT; +} + +static +int sptlrpc_req_ctx_switch(struct ptlrpc_request *req, + struct ptlrpc_cli_ctx *oldctx, + struct ptlrpc_cli_ctx *newctx) +{ + struct sptlrpc_flavor old_flvr; + char *reqmsg = NULL; /* to workaround old gcc */ + int reqmsg_size; + int rc = 0; + + CDEBUG(D_SEC, + "req %p: switch ctx %p(%u->%s) -> %p(%u->%s), switch sec %p(%s) -> %p(%s)\n", + req, oldctx, oldctx->cc_vcred.vc_uid, + sec2target_str(oldctx->cc_sec), newctx, newctx->cc_vcred.vc_uid, + sec2target_str(newctx->cc_sec), oldctx->cc_sec, + oldctx->cc_sec->ps_policy->sp_name, newctx->cc_sec, + newctx->cc_sec->ps_policy->sp_name); + + /* save flavor */ + old_flvr = req->rq_flvr; + + /* save request message */ + reqmsg_size = req->rq_reqlen; + if (reqmsg_size != 0) { + LASSERT(req->rq_reqmsg); + OBD_ALLOC_LARGE(reqmsg, reqmsg_size); + if (reqmsg == NULL) + return -ENOMEM; + memcpy(reqmsg, req->rq_reqmsg, reqmsg_size); + } + + /* release old req/rep buf */ + req->rq_cli_ctx = oldctx; + sptlrpc_cli_free_reqbuf(req); + sptlrpc_cli_free_repbuf(req); + req->rq_cli_ctx = newctx; + + /* recalculate the flavor */ + sptlrpc_req_set_flavor(req, 0); + + /* + * alloc new request buffer + * we don't need to alloc reply buffer here, leave it to the + * rest procedure of ptlrpc + */ + if (reqmsg_size != 0) { + rc = sptlrpc_cli_alloc_reqbuf(req, reqmsg_size); + if (!rc) { + LASSERT(req->rq_reqmsg); + memcpy(req->rq_reqmsg, reqmsg, reqmsg_size); + } else { + CWARN("failed to alloc reqbuf: %d\n", rc); + req->rq_flvr = old_flvr; + } + + OBD_FREE_LARGE(reqmsg, reqmsg_size); + } + return rc; +} + +/** + * If current context of \a req is dead somehow, e.g. we just switched flavor + * thus marked original contexts dead, we'll find a new context for it. if + * no switch is needed, \a req will end up with the same context. + * + * \note a request must have a context, to keep other parts of code happy. + * In any case of failure during the switching, we must restore the old one. 
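/*
 * Illustrative aside, not part of the patch: the save / switch / restore
 * shape of the context-switch code above -- stash the message, rebuild the
 * buffers for the new flavor, and roll back to the saved flavor if the
 * rebuild fails.  Standalone C with hypothetical names; only the control
 * flow is mirrored, not the lustre_msg layout.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fake_req {
        char    *buf;           /* buffer holding the request message */
        size_t  len;
        int     flavor;
};

static int switch_flavor(struct fake_req *req, int new_flavor)
{
        int     old_flavor = req->flavor;
        size_t  len = req->len;
        char    *saved;

        saved = malloc(len);
        if (!saved)
                return -1;
        memcpy(saved, req->buf, len);   /* save the message body */

        free(req->buf);                 /* old buffer layout goes away */
        req->flavor = new_flavor;
        req->buf = malloc(len);         /* new buffer for the new flavor */
        if (!req->buf) {
                req->flavor = old_flavor;       /* roll back on failure */
                free(saved);
                return -1;
        }
        memcpy(req->buf, saved, len);   /* put the message back */
        free(saved);
        return 0;
}

int main(void)
{
        struct fake_req req = { malloc(6), 6, 1 };
        int rc;

        memcpy(req.buf, "hello", 6);
        rc = switch_flavor(&req, 2);
        printf("rc=%d flavor=%d msg=%s\n", rc, req.flavor, req.buf);
        free(req.buf);
        return 0;
}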
+ */ +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *oldctx = req->rq_cli_ctx; + struct ptlrpc_cli_ctx *newctx; + int rc; + + ENTRY; + + LASSERT(oldctx); + + sptlrpc_cli_ctx_get(oldctx); + sptlrpc_req_put_ctx(req, 0); + + rc = sptlrpc_req_get_ctx(req); + if (unlikely(rc)) { + LASSERT(!req->rq_cli_ctx); + + /* restore old ctx */ + req->rq_cli_ctx = oldctx; + RETURN(rc); + } + + newctx = req->rq_cli_ctx; + LASSERT(newctx); + + if (unlikely(newctx == oldctx && + test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags))) { + /* + * still get the old dead ctx, usually means system too busy + */ + CDEBUG(D_SEC, + "ctx (%p, fl %lx) doesn't switch, relax a little bit\n", + newctx, newctx->cc_flags); + + schedule_timeout_interruptible(cfs_time_seconds(1)); + } else if (unlikely(test_bit(PTLRPC_CTX_UPTODATE_BIT, &newctx->cc_flags) + == 0)) { + /* + * new ctx not up to date yet + */ + CDEBUG(D_SEC, + "ctx (%p, fl %lx) doesn't switch, not up to date yet\n", + newctx, newctx->cc_flags); + } else { + /* + * it's possible newctx == oldctx if we're switching + * subflavor with the same sec. + */ + rc = sptlrpc_req_ctx_switch(req, oldctx, newctx); + if (rc) { + /* restore old ctx */ + sptlrpc_req_put_ctx(req, 0); + req->rq_cli_ctx = oldctx; + RETURN(rc); + } + + LASSERT(req->rq_cli_ctx == newctx); + } + + sptlrpc_cli_ctx_put(oldctx, 1); + RETURN(0); +} +EXPORT_SYMBOL(sptlrpc_req_replace_dead_ctx); + +static +int ctx_check_refresh(struct ptlrpc_cli_ctx *ctx) +{ + if (cli_ctx_is_refreshed(ctx)) + return 1; + return 0; +} + +static +void ctx_refresh_interrupt(struct ptlrpc_request *req) +{ + + spin_lock(&req->rq_lock); + req->rq_intr = 1; + spin_unlock(&req->rq_lock); +} + +static +void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx) +{ + spin_lock(&ctx->cc_lock); + if (!list_empty(&req->rq_ctx_chain)) + list_del_init(&req->rq_ctx_chain); + spin_unlock(&ctx->cc_lock); +} + +/** + * To refresh the context of \req, if it's not up-to-date. + * \param timeout + * - == 0: do not wait + * - == MAX_SCHEDULE_TIMEOUT: wait indefinitely + * - > 0: not supported + * + * The status of the context could be subject to be changed by other threads + * at any time. We allow this race, but once we return with 0, the caller will + * suppose it's uptodated and keep using it until the owning rpc is done. + * + * \retval 0 only if the context is uptodated. + * \retval -ev error number. + */ +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec *sec; + int rc; + + ENTRY; + + LASSERT(ctx); + + if (req->rq_ctx_init || req->rq_ctx_fini) + RETURN(0); + + if (timeout != 0 && timeout != MAX_SCHEDULE_TIMEOUT) { + CERROR("req %p: invalid timeout %lu\n", req, timeout); + RETURN(-EINVAL); + } + + /* + * during the process a request's context might change type even + * (e.g. 
from gss ctx to null ctx), so each loop we need to re-check + * everything + */ +again: + rc = import_sec_validate_get(req->rq_import, &sec); + if (rc) + RETURN(rc); + + if (sec->ps_flvr.sf_rpc != req->rq_flvr.sf_rpc) { + CDEBUG(D_SEC, "req %p: flavor has changed %x -> %x\n", + req, req->rq_flvr.sf_rpc, sec->ps_flvr.sf_rpc); + req_off_ctx_list(req, ctx); + sptlrpc_req_replace_dead_ctx(req); + ctx = req->rq_cli_ctx; + } + sptlrpc_sec_put(sec); + + if (cli_ctx_is_eternal(ctx)) + RETURN(0); + + if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) { + if (ctx->cc_ops->refresh) + ctx->cc_ops->refresh(ctx); + } + LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0); + + LASSERT(ctx->cc_ops->validate); + if (ctx->cc_ops->validate(ctx) == 0) { + req_off_ctx_list(req, ctx); + RETURN(0); + } + + if (unlikely(test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags))) { + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + req_off_ctx_list(req, ctx); + RETURN(-EPERM); + } + + /* + * There's a subtle issue for resending RPCs, suppose following + * situation: + * 1. the request was sent to server. + * 2. recovery was kicked start, after finished the request was + * marked as resent. + * 3. resend the request. + * 4. old reply from server received, we accept and verify the reply. + * this has to be success, otherwise the error will be aware + * by application. + * 5. new reply from server received, dropped by LNet. + * + * Note the xid of old & new request is the same. We can't simply + * change xid for the resent request because the server replies on + * it for reply reconstruction. + * + * Commonly the original context should be uptodate because we + * have an expiry nice time; server will keep its context because + * we at least hold a ref of old context which prevent context + * from destroying RPC being sent. So server still can accept the + * request and finish the RPC. But if that's not the case: + * 1. If server side context has been trimmed, a NO_CONTEXT will + * be returned, gss_cli_ctx_verify/unseal will switch to new + * context by force. + * 2. Current context never be refreshed, then we are fine: we + * never really send request with old context before. 
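/*
 * Illustrative aside, not part of the patch: the "validate, wait,
 * re-validate" loop shape used by sptlrpc_req_refresh_ctx() above.  After
 * every wait the context is inspected again from scratch, because other
 * threads may have changed its state (or even replaced it) in the meantime.
 * The refresh step below is simulated; all names are hypothetical.
 */
#include <stdio.h>

enum ctx_state { CTX_NEW, CTX_REFRESHING, CTX_UPTODATE, CTX_ERROR };

/* Pretend one refresh step completes while we "wait". */
static void wait_for_refresh(enum ctx_state *state)
{
        if (*state == CTX_NEW)
                *state = CTX_REFRESHING;
        else if (*state == CTX_REFRESHING)
                *state = CTX_UPTODATE;
}

static int refresh_ctx(enum ctx_state *state)
{
again:
        if (*state == CTX_UPTODATE)
                return 0;               /* usable, the caller may send */
        if (*state == CTX_ERROR)
                return -1;              /* permanent failure */

        wait_for_refresh(state);        /* block until something changes */
        goto again;                     /* then re-check everything */
}

int main(void)
{
        enum ctx_state state = CTX_NEW;

        printf("refresh rc=%d\n", refresh_ctx(&state));
        return 0;
}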
+ */ + if (test_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags) && + unlikely(req->rq_reqmsg) && + lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) { + req_off_ctx_list(req, ctx); + RETURN(0); + } + + if (unlikely(test_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))) { + req_off_ctx_list(req, ctx); + /* + * don't switch ctx if import was deactivated + */ + if (req->rq_import->imp_deactive) { + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + RETURN(-EINTR); + } + + rc = sptlrpc_req_replace_dead_ctx(req); + if (rc) { + LASSERT(ctx == req->rq_cli_ctx); + CERROR("req %p: failed to replace dead ctx %p: %d\n", + req, ctx, rc); + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + RETURN(rc); + } + + ctx = req->rq_cli_ctx; + goto again; + } + + /* + * Now we're sure this context is during upcall, add myself into + * waiting list + */ + spin_lock(&ctx->cc_lock); + if (list_empty(&req->rq_ctx_chain)) + list_add(&req->rq_ctx_chain, &ctx->cc_req_list); + spin_unlock(&ctx->cc_lock); + + if (timeout == 0) + RETURN(-EAGAIN); + + /* Clear any flags that may be present from previous sends */ + LASSERT(req->rq_receiving_reply == 0); + spin_lock(&req->rq_lock); + req->rq_err = 0; + req->rq_timedout = 0; + req->rq_resend = 0; + req->rq_restart = 0; + spin_unlock(&req->rq_lock); + + /* by now we know that timeout value is MAX_SCHEDULE_TIMEOUT, + * so wait indefinitely with non-fatal signals blocked + */ + if (l_wait_event_abortable(req->rq_reply_waitq, + ctx_check_refresh(ctx)) == -ERESTARTSYS) { + rc = -EINTR; + ctx_refresh_interrupt(req); + } + + /* + * following cases could lead us here: + * - successfully refreshed; + * - interrupted; + * - timedout, and we don't want recover from the failure; + * - timedout, and waked up upon recovery finished; + * - someone else mark this ctx dead by force; + * - someone invalidate the req and call ptlrpc_client_wake_req(), + * e.g. ptlrpc_abort_inflight(); + */ + if (!cli_ctx_is_refreshed(ctx)) { + /* timed out or interruptted */ + req_off_ctx_list(req, ctx); + + LASSERT(rc != 0); + RETURN(rc); + } + + goto again; +} + +/* Bring ptlrpc_sec context up-to-date */ +int sptlrpc_export_update_ctx(struct obd_export *exp) +{ + struct obd_import *imp = exp ? exp->exp_imp_reverse : NULL; + struct ptlrpc_sec *sec = NULL; + struct ptlrpc_cli_ctx *ctx = NULL; + int rc = 0; + + if (imp) + sec = sptlrpc_import_sec_ref(imp); + if (sec) { + ctx = get_my_ctx(sec); + sptlrpc_sec_put(sec); + } + + if (ctx) { + if (ctx->cc_ops->refresh) + rc = ctx->cc_ops->refresh(ctx); + sptlrpc_cli_ctx_put(ctx, 1); + } + return rc; +} + +/** + * Initialize flavor settings for \a req, according to \a opcode. 
+ * + * \note this could be called in two situations: + * - new request from ptlrpc_pre_req(), with proper @opcode + * - old request which changed ctx in the middle, with @opcode == 0 + */ +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode) +{ + struct ptlrpc_sec *sec; + + LASSERT(req->rq_import); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + LASSERT(req->rq_bulk_read == 0 || req->rq_bulk_write == 0); + + /* special security flags according to opcode */ + switch (opcode) { + case OST_READ: + case MDS_READPAGE: + case MGS_CONFIG_READ: + case OBD_IDX_READ: + req->rq_bulk_read = 1; + break; + case OST_WRITE: + case MDS_WRITEPAGE: + req->rq_bulk_write = 1; + break; + case SEC_CTX_INIT: + req->rq_ctx_init = 1; + break; + case SEC_CTX_FINI: + req->rq_ctx_fini = 1; + break; + case 0: + /* init/fini rpc won't be resend, so can't be here */ + LASSERT(req->rq_ctx_init == 0); + LASSERT(req->rq_ctx_fini == 0); + + /* cleanup flags, which should be recalculated */ + req->rq_pack_udesc = 0; + req->rq_pack_bulk = 0; + break; + } + + sec = req->rq_cli_ctx->cc_sec; + + spin_lock(&sec->ps_lock); + req->rq_flvr = sec->ps_flvr; + spin_unlock(&sec->ps_lock); + + /* + * force SVC_NULL for context initiation rpc, SVC_INTG for context + * destruction rpc + */ + if (unlikely(req->rq_ctx_init)) + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL); + else if (unlikely(req->rq_ctx_fini)) + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG); + + /* user descriptor flag, null security can't do it anyway */ + if ((sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) && + (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL)) + req->rq_pack_udesc = 1; + + /* bulk security flag */ + if ((req->rq_bulk_read || req->rq_bulk_write) && + sptlrpc_flavor_has_bulk(&req->rq_flvr)) + req->rq_pack_bulk = 1; +} + +void sptlrpc_request_out_callback(struct ptlrpc_request *req) +{ + if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV) + return; + + LASSERT(req->rq_clrbuf); + if (req->rq_pool || !req->rq_reqbuf) + return; + + OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; +} + +/** + * Given an import \a imp, check whether current user has a valid context + * or not. We may create a new context and try to refresh it, and try + * repeatedly try in case of non-fatal errors. Return 0 means success. + */ +int sptlrpc_import_check_ctx(struct obd_import *imp) +{ + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + struct ptlrpc_request *req = NULL; + int rc; + + ENTRY; + + might_sleep(); + + sec = sptlrpc_import_sec_ref(imp); + ctx = get_my_ctx(sec); + sptlrpc_sec_put(sec); + + if (!ctx) + RETURN(-ENOMEM); + + if (cli_ctx_is_eternal(ctx) || + ctx->cc_ops->validate(ctx) == 0) { + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); + } + + if (cli_ctx_is_error(ctx)) { + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(-EACCES); + } + + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!req) + RETURN(-ENOMEM); + + ptlrpc_cli_req_init(req); + atomic_set(&req->rq_refcount, 10000); + + req->rq_import = imp; + req->rq_flvr = sec->ps_flvr; + req->rq_cli_ctx = ctx; + + rc = sptlrpc_req_refresh_ctx(req, MAX_SCHEDULE_TIMEOUT); + LASSERT(list_empty(&req->rq_ctx_chain)); + sptlrpc_cli_ctx_put(req->rq_cli_ctx, 1); + ptlrpc_request_cache_free(req); + + RETURN(rc); +} + +/** + * Used by ptlrpc client, to perform the pre-defined security transformation + * upon the request message of \a req. After this function called, + * req->rq_reqmsg is still accessible as clear text. 
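/*
 * Illustrative aside, not part of the patch: the opcode-driven flag
 * selection performed by sptlrpc_req_set_flavor() above -- the opcode
 * decides whether the request carries bulk data or is a context init/fini
 * RPC, and init/fini RPCs get their service level forced.  Opcode and
 * service values below are hypothetical stand-ins.
 */
#include <stdio.h>

enum { OP_READ = 1, OP_WRITE, OP_CTX_INIT, OP_CTX_FINI };
enum { SVC_NULL = 0, SVC_INTG = 2, SVC_PRIV = 3 };

struct fake_req {
        unsigned int bulk_read:1, bulk_write:1, ctx_init:1, ctx_fini:1;
        int svc;
};

static void set_flavor(struct fake_req *req, int opcode, int default_svc)
{
        switch (opcode) {
        case OP_READ:
                req->bulk_read = 1;
                break;
        case OP_WRITE:
                req->bulk_write = 1;
                break;
        case OP_CTX_INIT:
                req->ctx_init = 1;
                break;
        case OP_CTX_FINI:
                req->ctx_fini = 1;
                break;
        }

        req->svc = default_svc;
        if (req->ctx_init)
                req->svc = SVC_NULL;    /* cannot use the context being set up */
        else if (req->ctx_fini)
                req->svc = SVC_INTG;    /* teardown keeps integrity only */
}

int main(void)
{
        struct fake_req req = { 0 };

        set_flavor(&req, OP_CTX_INIT, SVC_PRIV);
        printf("ctx_init=%u svc=%d\n", req.ctx_init, req.svc);
        return 0;
}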
+ */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + int rc = 0; + + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(req->rq_reqbuf || req->rq_clrbuf); + + /* + * we wrap bulk request here because now we can be sure + * the context is uptodate. + */ + if (req->rq_bulk) { + rc = sptlrpc_cli_wrap_bulk(req, req->rq_bulk); + if (rc) + RETURN(rc); + } + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(ctx->cc_ops->sign); + rc = ctx->cc_ops->sign(ctx, req); + break; + case SPTLRPC_SVC_PRIV: + LASSERT(ctx->cc_ops->seal); + rc = ctx->cc_ops->seal(ctx, req); + break; + default: + LBUG(); + } + + if (rc == 0) { + LASSERT(req->rq_reqdata_len); + LASSERT(req->rq_reqdata_len % 8 == 0); + LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len); + } + + RETURN(rc); +} + +static int do_cli_unwrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + int rc; + + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata); + LASSERT(req->rq_repmsg == NULL); + + req->rq_rep_swab_mask = 0; + + rc = __lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len); + switch (rc) { + case 1: + req_capsule_set_rep_swabbed(&req->rq_pill, + MSG_PTLRPC_HEADER_OFF); + case 0: + break; + default: + CERROR("failed unpack reply: x%llu\n", req->rq_xid); + RETURN(-EPROTO); + } + + if (req->rq_repdata_len < sizeof(struct lustre_msg)) { + CERROR("replied data length %d too small\n", + req->rq_repdata_len); + RETURN(-EPROTO); + } + + if (SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr) != + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) { + CERROR("reply policy %u doesn't match request policy %u\n", + SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr), + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)); + RETURN(-EPROTO); + } + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(ctx->cc_ops->verify); + rc = ctx->cc_ops->verify(ctx, req); + break; + case SPTLRPC_SVC_PRIV: + LASSERT(ctx->cc_ops->unseal); + rc = ctx->cc_ops->unseal(ctx, req); + break; + default: + LBUG(); + } + LASSERT(rc || req->rq_repmsg || req->rq_resend); + + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL && + !req->rq_ctx_init) + req->rq_rep_swab_mask = 0; + RETURN(rc); +} + +/** + * Used by ptlrpc client, to perform security transformation upon the reply + * message of \a req. After return successfully, req->rq_repmsg points to + * the reply message in clear text. + * + * \pre the reply buffer should have been un-posted from LNet, so nothing is + * going to change. 
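/*
 * Illustrative aside, not part of the patch: the service-level dispatch used
 * when wrapping requests above -- the integrity-only levels sign the message
 * (it stays readable), the privacy level seals it.  Hypothetical ops table
 * and names, standalone C.
 */
#include <stdio.h>

enum svc_level { SVC_NULL, SVC_AUTH, SVC_INTG, SVC_PRIV };

struct ctx_ops {
        int (*sign)(const char *msg);
        int (*seal)(const char *msg);
};

static int do_sign(const char *msg) { printf("sign: %s\n", msg); return 0; }
static int do_seal(const char *msg) { printf("seal: %s\n", msg); return 0; }

static int wrap_request(const struct ctx_ops *ops, enum svc_level svc,
                        const char *msg)
{
        switch (svc) {
        case SVC_NULL:
        case SVC_AUTH:
        case SVC_INTG:
                return ops->sign(msg);  /* message remains clear text */
        case SVC_PRIV:
                return ops->seal(msg);  /* message is encrypted */
        }
        return -1;                      /* unknown service level */
}

int main(void)
{
        const struct ctx_ops ops = { do_sign, do_seal };

        wrap_request(&ops, SVC_INTG, "hello");
        wrap_request(&ops, SVC_PRIV, "secret");
        return 0;
}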
+ */ +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req) +{ + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len); + + if (req->rq_reply_off == 0 && + (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CERROR("real reply with offset 0\n"); + return -EPROTO; + } + + if (req->rq_reply_off % 8 != 0) { + CERROR("reply at odd offset %u\n", req->rq_reply_off); + return -EPROTO; + } + + req->rq_repdata = (struct lustre_msg *) + (req->rq_repbuf + req->rq_reply_off); + req->rq_repdata_len = req->rq_nob_received; + + return do_cli_unwrap_reply(req); +} + +/** + * Used by ptlrpc client, to perform security transformation upon the early + * reply message of \a req. We expect the rq_reply_off is 0, and + * rq_nob_received is the early reply size. + * + * Because the receive buffer might be still posted, the reply data might be + * changed at any time, no matter we're holding rq_lock or not. For this reason + * we allocate a separate ptlrpc_request and reply buffer for early reply + * processing. + * + * \retval 0 success, \a req_ret is filled with a duplicated ptlrpc_request. + * Later the caller must call sptlrpc_cli_finish_early_reply() on the returned + * \a *req_ret to release it. + * \retval -ev error number, and \a req_ret will not be set. + */ +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret) +{ + struct ptlrpc_request *early_req; + char *early_buf; + int early_bufsz, early_size; + int rc; + + ENTRY; + + early_req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (early_req == NULL) + RETURN(-ENOMEM); + + ptlrpc_cli_req_init(early_req); + + early_size = req->rq_nob_received; + early_bufsz = size_roundup_power2(early_size); + OBD_ALLOC_LARGE(early_buf, early_bufsz); + if (early_buf == NULL) + GOTO(err_req, rc = -ENOMEM); + + /* sanity checkings and copy data out, do it inside spinlock */ + spin_lock(&req->rq_lock); + + if (req->rq_replied) { + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EALREADY); + } + + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + + if (req->rq_reply_off != 0) { + CERROR("early reply with offset %u\n", req->rq_reply_off); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EPROTO); + } + + if (req->rq_nob_received != early_size) { + /* even another early arrived the size should be the same */ + CERROR("data size has changed from %u to %u\n", + early_size, req->rq_nob_received); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EINVAL); + } + + if (req->rq_nob_received < sizeof(struct lustre_msg)) { + CERROR("early reply length %d too small\n", + req->rq_nob_received); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EALREADY); + } + + memcpy(early_buf, req->rq_repbuf, early_size); + spin_unlock(&req->rq_lock); + + early_req->rq_cli_ctx = sptlrpc_cli_ctx_get(req->rq_cli_ctx); + early_req->rq_flvr = req->rq_flvr; + early_req->rq_repbuf = early_buf; + early_req->rq_repbuf_len = early_bufsz; + early_req->rq_repdata = (struct lustre_msg *) early_buf; + early_req->rq_repdata_len = early_size; + early_req->rq_early = 1; + early_req->rq_reqmsg = req->rq_reqmsg; + + rc = do_cli_unwrap_reply(early_req); + if (rc) { + DEBUG_REQ(D_ADAPTTO, early_req, + "unwrap early reply: rc = %d", rc); + GOTO(err_ctx, rc); + } + + LASSERT(early_req->rq_repmsg); + *req_ret = early_req; + RETURN(0); + +err_ctx: + sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); 
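/*
 * Illustrative aside, not part of the patch: early replies above are copied
 * out of the still-posted receive buffer into a private buffer whose size is
 * rounded up to a power of two.  A plain round-up helper is sketched here;
 * the helper used by the patch (size_roundup_power2) is Lustre's own, this
 * one is a hypothetical stand-in for sizes below 2^31.
 */
#include <stdio.h>

static unsigned int roundup_pow2(unsigned int size)
{
        unsigned int p = 1;

        while (p < size)
                p <<= 1;        /* double until the request fits */
        return p;
}

int main(void)
{
        printf("%u %u %u\n", roundup_pow2(1), roundup_pow2(200),
               roundup_pow2(4096));     /* prints: 1 256 4096 */
        return 0;
}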
+err_buf: + OBD_FREE_LARGE(early_buf, early_bufsz); +err_req: + ptlrpc_request_cache_free(early_req); + RETURN(rc); +} + +/** + * Used by ptlrpc client, to release a processed early reply \a early_req. + * + * \pre \a early_req was obtained from calling sptlrpc_cli_unwrap_early_reply(). + */ +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req) +{ + LASSERT(early_req->rq_repbuf); + LASSERT(early_req->rq_repdata); + LASSERT(early_req->rq_repmsg); + + sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); + OBD_FREE_LARGE(early_req->rq_repbuf, early_req->rq_repbuf_len); + ptlrpc_request_cache_free(early_req); +} + +/************************************************** + * sec ID * + **************************************************/ + +/* + * "fixed" sec (e.g. null) use sec_id < 0 + */ +static atomic_t sptlrpc_sec_id = ATOMIC_INIT(1); + +int sptlrpc_get_next_secid(void) +{ + return atomic_inc_return(&sptlrpc_sec_id); +} +EXPORT_SYMBOL(sptlrpc_get_next_secid); + +/* + * client side high-level security APIs + */ + +static int sec_cop_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid, + int grace, int force) +{ + struct ptlrpc_sec_policy *policy = sec->ps_policy; + + LASSERT(policy->sp_cops); + LASSERT(policy->sp_cops->flush_ctx_cache); + + return policy->sp_cops->flush_ctx_cache(sec, uid, grace, force); +} + +static void sec_cop_destroy_sec(struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec_policy *policy = sec->ps_policy; + + LASSERT_ATOMIC_ZERO(&sec->ps_refcount); + LASSERT_ATOMIC_ZERO(&sec->ps_nctx); + LASSERT(policy->sp_cops->destroy_sec); + + CDEBUG(D_SEC, "%s@%p: being destroied\n", sec->ps_policy->sp_name, sec); + + policy->sp_cops->destroy_sec(sec); + sptlrpc_policy_put(policy); +} + +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec) +{ + sec_cop_destroy_sec(sec); +} +EXPORT_SYMBOL(sptlrpc_sec_destroy); + +static void sptlrpc_sec_kill(struct ptlrpc_sec *sec) +{ + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + if (sec->ps_policy->sp_cops->kill_sec) { + sec->ps_policy->sp_cops->kill_sec(sec); + + sec_cop_flush_ctx_cache(sec, -1, 1, 1); + } +} + +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec) +{ + if (sec) + atomic_inc(&sec->ps_refcount); + + return sec; +} +EXPORT_SYMBOL(sptlrpc_sec_get); + +void sptlrpc_sec_put(struct ptlrpc_sec *sec) +{ + if (sec) { + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + if (atomic_dec_and_test(&sec->ps_refcount)) { + sptlrpc_gc_del_sec(sec); + sec_cop_destroy_sec(sec); + } + } +} +EXPORT_SYMBOL(sptlrpc_sec_put); + +/* + * policy module is responsible for taking refrence of import + */ +static +struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf, + enum lustre_sec_part sp) +{ + struct ptlrpc_sec_policy *policy; + struct ptlrpc_sec *sec; + char str[32]; + + ENTRY; + + if (svc_ctx) { + LASSERT(imp->imp_dlm_fake == 1); + + CDEBUG(D_SEC, "%s %s: reverse sec using flavor %s\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name, + sptlrpc_flavor2name(sf, str, sizeof(str))); + + policy = sptlrpc_policy_get(svc_ctx->sc_policy); + sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY; + } else { + LASSERT(imp->imp_dlm_fake == 0); + + CDEBUG(D_SEC, "%s %s: select security flavor %s\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name, + sptlrpc_flavor2name(sf, str, sizeof(str))); + + policy = sptlrpc_wireflavor2policy(sf->sf_rpc); + if (!policy) { + CERROR("invalid flavor 0x%x\n", sf->sf_rpc); + RETURN(NULL); + } + } + + sec = policy->sp_cops->create_sec(imp, 
svc_ctx, sf); + if (sec) { + atomic_inc(&sec->ps_refcount); + + sec->ps_part = sp; + + if (sec->ps_gc_interval && policy->sp_cops->gc_ctx) + sptlrpc_gc_add_sec(sec); + } else { + sptlrpc_policy_put(policy); + } + + RETURN(sec); +} + +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp) +{ + struct ptlrpc_sec *sec; + + read_lock(&imp->imp_sec_lock); + sec = sptlrpc_sec_get(imp->imp_sec); + read_unlock(&imp->imp_sec_lock); + + return sec; +} +EXPORT_SYMBOL(sptlrpc_import_sec_ref); + +static void sptlrpc_import_sec_install(struct obd_import *imp, + struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec *old_sec; + + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + write_lock(&imp->imp_sec_lock); + old_sec = imp->imp_sec; + imp->imp_sec = sec; + write_unlock(&imp->imp_sec_lock); + + if (old_sec) { + sptlrpc_sec_kill(old_sec); + + /* balance the ref taken by this import */ + sptlrpc_sec_put(old_sec); + } +} + +static inline +int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2) +{ + return (memcmp(sf1, sf2, sizeof(*sf1)) == 0); +} + +static inline +void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src) +{ + *dst = *src; +} + +/** + * To get an appropriate ptlrpc_sec for the \a imp, according to the current + * configuration. Upon called, imp->imp_sec may or may not be NULL. + * + * - regular import: \a svc_ctx should be NULL and \a flvr is ignored; + * - reverse import: \a svc_ctx and \a flvr are obtained from incoming request. + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *flvr) +{ + struct ptlrpc_connection *conn; + struct sptlrpc_flavor sf; + struct ptlrpc_sec *sec, *newsec; + enum lustre_sec_part sp; + char str[24]; + int rc = 0; + + ENTRY; + + might_sleep(); + + if (imp == NULL) + RETURN(0); + + conn = imp->imp_connection; + + if (svc_ctx == NULL) { + struct client_obd *cliobd = &imp->imp_obd->u.cli; + /* + * normal import, determine flavor from rule set, except + * for mgc the flavor is predetermined. 
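/*
 * Illustrative aside, not part of the patch: the install pattern used by
 * sptlrpc_import_sec_install() above -- publish the replacement pointer
 * under a write lock, then drop the import's reference on the old object;
 * readers take the read lock and grab their own reference, so an old sec
 * stays alive until its last user lets go.  Userspace sketch with pthreads
 * and C11 atomics, hypothetical names.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_sec {
        atomic_int      refcount;
        int             id;
};

struct fake_import {
        pthread_rwlock_t        lock;
        struct fake_sec         *sec;
};

static void sec_put(struct fake_sec *sec)
{
        if (sec && atomic_fetch_sub(&sec->refcount, 1) == 1) {
                printf("destroying sec %d\n", sec->id);
                free(sec);
        }
}

static struct fake_sec *sec_ref(struct fake_import *imp)
{
        struct fake_sec *sec;

        pthread_rwlock_rdlock(&imp->lock);
        sec = imp->sec;
        if (sec)
                atomic_fetch_add(&sec->refcount, 1);
        pthread_rwlock_unlock(&imp->lock);
        return sec;
}

static void sec_install(struct fake_import *imp, struct fake_sec *new_sec)
{
        struct fake_sec *old;

        pthread_rwlock_wrlock(&imp->lock);
        old = imp->sec;
        imp->sec = new_sec;             /* publish the replacement */
        pthread_rwlock_unlock(&imp->lock);

        sec_put(old);                   /* drop the import's old reference */
}

int main(void)
{
        struct fake_import imp = { PTHREAD_RWLOCK_INITIALIZER, NULL };
        struct fake_sec *a = calloc(1, sizeof(*a));
        struct fake_sec *b = calloc(1, sizeof(*b));
        struct fake_sec *held;

        atomic_init(&a->refcount, 1); a->id = 1;
        atomic_init(&b->refcount, 1); b->id = 2;

        sec_install(&imp, a);
        held = sec_ref(&imp);   /* someone still works with the old sec */
        sec_install(&imp, b);   /* replacement published, old one survives */
        sec_put(held);          /* last user gone: "destroying sec 1" */
        sec_put(b);             /* cleanup: "destroying sec 2" */
        return 0;
}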
+ */ + if (cliobd->cl_sp_me == LUSTRE_SP_MGC) + sf = cliobd->cl_flvr_mgc; + else + sptlrpc_conf_choose_flavor(cliobd->cl_sp_me, + cliobd->cl_sp_to, + &cliobd->cl_target_uuid, + &conn->c_self, &sf); + + sp = imp->imp_obd->u.cli.cl_sp_me; + } else { + /* reverse import, determine flavor from incoming reqeust */ + sf = *flvr; + + if (sf.sf_rpc != SPTLRPC_FLVR_NULL) + sf.sf_flags = PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY; + + sp = sptlrpc_target_sec_part(imp->imp_obd); + } + + sec = sptlrpc_import_sec_ref(imp); + if (sec) { + char str2[24]; + + if (flavor_equal(&sf, &sec->ps_flvr)) + GOTO(out, rc); + + CDEBUG(D_SEC, "import %s->%s: changing flavor %s -> %s\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)), + sptlrpc_flavor2name(&sf, str2, sizeof(str2))); + } else if (SPTLRPC_FLVR_BASE(sf.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) { + CDEBUG(D_SEC, "import %s->%s netid %x: select flavor %s\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + LNET_NID_NET(&conn->c_self), + sptlrpc_flavor2name(&sf, str, sizeof(str))); + } + + newsec = sptlrpc_sec_create(imp, svc_ctx, &sf, sp); + if (newsec) { + sptlrpc_import_sec_install(imp, newsec); + } else { + CERROR("import %s->%s: failed to create new sec\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid)); + rc = -EPERM; + } + +out: + sptlrpc_sec_put(sec); + RETURN(rc); +} + +void sptlrpc_import_sec_put(struct obd_import *imp) +{ + if (imp->imp_sec) { + sptlrpc_sec_kill(imp->imp_sec); + + sptlrpc_sec_put(imp->imp_sec); + imp->imp_sec = NULL; + } +} + +static void import_flush_ctx_common(struct obd_import *imp, + uid_t uid, int grace, int force) +{ + struct ptlrpc_sec *sec; + + if (imp == NULL) + return; + + sec = sptlrpc_import_sec_ref(imp); + if (sec == NULL) + return; + + sec_cop_flush_ctx_cache(sec, uid, grace, force); + sptlrpc_sec_put(sec); +} + +void sptlrpc_import_flush_root_ctx(struct obd_import *imp) +{ + /* + * it's important to use grace mode, see explain in + * sptlrpc_req_refresh_ctx() + */ + import_flush_ctx_common(imp, 0, 1, 1); +} + +void sptlrpc_import_flush_my_ctx(struct obd_import *imp) +{ + import_flush_ctx_common(imp, from_kuid(&init_user_ns, current_uid()), + 1, 1); +} +EXPORT_SYMBOL(sptlrpc_import_flush_my_ctx); + +void sptlrpc_import_flush_all_ctx(struct obd_import *imp) +{ + import_flush_ctx_common(imp, -1, 1, 1); +} +EXPORT_SYMBOL(sptlrpc_import_flush_all_ctx); + +/** + * Used by ptlrpc client to allocate request buffer of \a req. Upon return + * successfully, req->rq_reqmsg points to a buffer with size \a msgsize. + */ +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + int rc; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT(req->rq_reqmsg == NULL); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + policy = ctx->cc_sec->ps_policy; + rc = policy->sp_cops->alloc_reqbuf(ctx->cc_sec, req, msgsize); + if (!rc) { + LASSERT(req->rq_reqmsg); + LASSERT(req->rq_reqbuf || req->rq_clrbuf); + + /* zeroing preallocated buffer */ + if (req->rq_pool) + memset(req->rq_reqmsg, 0, msgsize); + } + + return rc; +} + +/** + * Used by ptlrpc client to free request buffer of \a req. After this + * req->rq_reqmsg is set to NULL and should not be accessed anymore. 
+ */ +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (req->rq_reqbuf == NULL && req->rq_clrbuf == NULL) + return; + + policy = ctx->cc_sec->ps_policy; + policy->sp_cops->free_reqbuf(ctx->cc_sec, req); + req->rq_reqmsg = NULL; +} + +/* + * NOTE caller must guarantee the buffer size is enough for the enlargement + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize) +{ + void *src, *dst; + int oldsize, oldmsg_size, movesize; + + LASSERT(segment < msg->lm_bufcount); + LASSERT(msg->lm_buflens[segment] <= newsize); + + if (msg->lm_buflens[segment] == newsize) + return; + + /* nothing to do if we are enlarging the last segment */ + if (segment == msg->lm_bufcount - 1) { + msg->lm_buflens[segment] = newsize; + return; + } + + oldsize = msg->lm_buflens[segment]; + + src = lustre_msg_buf(msg, segment + 1, 0); + msg->lm_buflens[segment] = newsize; + dst = lustre_msg_buf(msg, segment + 1, 0); + msg->lm_buflens[segment] = oldsize; + + /* move from segment + 1 to end segment */ + LASSERT(msg->lm_magic == LUSTRE_MSG_MAGIC_V2); + oldmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + movesize = oldmsg_size - ((unsigned long) src - (unsigned long) msg); + LASSERT(movesize >= 0); + + if (movesize) + memmove(dst, src, movesize); + + /* note we don't clear the ares where old data live, not secret */ + + /* finally set new segment size */ + msg->lm_buflens[segment] = newsize; +} +EXPORT_SYMBOL(_sptlrpc_enlarge_msg_inplace); + +/** + * Used by ptlrpc client to enlarge the \a segment of request message pointed + * by req->rq_reqmsg to size \a newsize, all previously filled-in data will be + * preserved after the enlargement. this must be called after original request + * buffer being allocated. + * + * \note after this be called, rq_reqmsg and rq_reqlen might have been changed, + * so caller should refresh its local pointers if needed. + */ +int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + const struct req_msg_field *field, + int newsize) +{ + struct req_capsule *pill = &req->rq_pill; + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_cops *cops; + struct lustre_msg *msg = req->rq_reqmsg; + int segment = __req_capsule_offset(pill, field, RCL_CLIENT); + + LASSERT(ctx); + LASSERT(msg); + LASSERT(msg->lm_bufcount > segment); + LASSERT(msg->lm_buflens[segment] <= newsize); + + if (msg->lm_buflens[segment] == newsize) + return 0; + + cops = ctx->cc_sec->ps_policy->sp_cops; + LASSERT(cops->enlarge_reqbuf); + return cops->enlarge_reqbuf(ctx->cc_sec, req, segment, newsize); +} +EXPORT_SYMBOL(sptlrpc_cli_enlarge_reqbuf); + +/** + * Used by ptlrpc client to allocate reply buffer of \a req. + * + * \note After this, req->rq_repmsg is still not accessible. + */ +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + + if (req->rq_repbuf) + RETURN(0); + + policy = ctx->cc_sec->ps_policy; + RETURN(policy->sp_cops->alloc_repbuf(ctx->cc_sec, req, msgsize)); +} + +/** + * Used by ptlrpc client to free reply buffer of \a req. After this + * req->rq_repmsg is set to NULL and should not be accessed anymore. 
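/*
 * Illustrative aside, not part of the patch: the in-place enlargement done
 * by _sptlrpc_enlarge_msg_inplace() above, reduced to a flat byte buffer
 * with two back-to-back segments.  Growing the first segment means
 * memmove()-ing the trailing segment out of the way; only the layout trick
 * is mirrored, not the lustre_msg bookkeeping.  Names are hypothetical.
 */
#include <stdio.h>
#include <string.h>

/* buffer layout: [segment 0][segment 1], packed back to back */
static void enlarge_first_segment(char *buf, size_t seg0_len, size_t seg1_len,
                                  size_t new_seg0_len)
{
        /* the caller guarantees the buffer can hold the new layout */
        memmove(buf + new_seg0_len, buf + seg0_len, seg1_len);
        memset(buf + seg0_len, 0, new_seg0_len - seg0_len);     /* new space */
}

int main(void)
{
        char buf[32] = "AAAABBBB";      /* seg0 = 4 bytes, seg1 = 4 bytes */

        enlarge_first_segment(buf, 4, 4, 8);
        printf("segment 1 now starts at offset 8: %.4s\n", buf + 8);
        return 0;
}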
+ */ +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (req->rq_repbuf == NULL) + return; + LASSERT(req->rq_repbuf_len); + + policy = ctx->cc_sec->ps_policy; + policy->sp_cops->free_repbuf(ctx->cc_sec, req); + req->rq_repmsg = NULL; + EXIT; +} +EXPORT_SYMBOL(sptlrpc_cli_free_repbuf); + +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_sec_policy *policy = ctx->cc_sec->ps_policy; + + if (!policy->sp_cops->install_rctx) + return 0; + return policy->sp_cops->install_rctx(imp, ctx->cc_sec, ctx); +} + +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx) +{ + struct ptlrpc_sec_policy *policy = ctx->sc_policy; + + if (!policy->sp_sops->install_rctx) + return 0; + return policy->sp_sops->install_rctx(imp, ctx); +} + +/* Get SELinux policy info from userspace */ +static int sepol_helper(struct obd_import *imp) +{ + char mtime_str[21] = { 0 }, mode_str[2] = { 0 }; + char *argv[] = { + [0] = "/usr/sbin/l_getsepol", + [1] = "-o", + [2] = NULL, /* obd type */ + [3] = "-n", + [4] = NULL, /* obd name */ + [5] = "-t", + [6] = mtime_str, /* policy mtime */ + [7] = "-m", + [8] = mode_str, /* enforcing mode */ + [9] = NULL + }; + char *envp[] = { + [0] = "HOME=/", + [1] = "PATH=/sbin:/usr/sbin", + [2] = NULL + }; + signed short ret; + int rc = 0; + + if (imp == NULL || imp->imp_obd == NULL || + imp->imp_obd->obd_type == NULL) { + rc = -EINVAL; + } else { + argv[2] = (char *)imp->imp_obd->obd_type->typ_name; + argv[4] = imp->imp_obd->obd_name; + spin_lock(&imp->imp_sec->ps_lock); + if (ktime_to_ns(imp->imp_sec->ps_sepol_mtime) == 0 && + imp->imp_sec->ps_sepol[0] == '\0') { + /* ps_sepol has not been initialized */ + argv[5] = NULL; + argv[7] = NULL; + } else { + time64_t mtime_ms; + + mtime_ms = ktime_to_ms(imp->imp_sec->ps_sepol_mtime); + snprintf(mtime_str, sizeof(mtime_str), "%lld", + mtime_ms / MSEC_PER_SEC); + mode_str[0] = imp->imp_sec->ps_sepol[0]; + } + spin_unlock(&imp->imp_sec->ps_lock); + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + rc = ret>>8; + } + + return rc; +} + +static inline int sptlrpc_sepol_needs_check(struct ptlrpc_sec *imp_sec) +{ + ktime_t checknext; + + if (send_sepol == 0) + return 0; + + if (send_sepol == -1) + /* send_sepol == -1 means fetch sepol status every time */ + return 1; + + spin_lock(&imp_sec->ps_lock); + checknext = imp_sec->ps_sepol_checknext; + spin_unlock(&imp_sec->ps_lock); + + /* next check is too far in time, please update */ + if (ktime_after(checknext, + ktime_add(ktime_get(), ktime_set(send_sepol, 0)))) + goto setnext; + + if (ktime_before(ktime_get(), checknext)) + /* too early to fetch sepol status */ + return 0; + +setnext: + /* define new sepol_checknext time */ + spin_lock(&imp_sec->ps_lock); + imp_sec->ps_sepol_checknext = ktime_add(ktime_get(), + ktime_set(send_sepol, 0)); + spin_unlock(&imp_sec->ps_lock); + + return 1; +} + +int sptlrpc_get_sepol(struct ptlrpc_request *req) +{ + struct ptlrpc_sec *imp_sec = req->rq_import->imp_sec; + int rc = 0; + + ENTRY; + + (req->rq_sepol)[0] = '\0'; + +#ifndef HAVE_SELINUX + if (unlikely(send_sepol != 0)) + CDEBUG(D_SEC, + "Client cannot report SELinux status, it was not built against libselinux.\n"); + RETURN(0); +#endif + + if (send_sepol == 0) + RETURN(0); + + if (imp_sec == 
NULL) + RETURN(-EINVAL); + + /* Retrieve SELinux status info */ + if (sptlrpc_sepol_needs_check(imp_sec)) + rc = sepol_helper(req->rq_import); + if (likely(rc == 0)) { + spin_lock(&imp_sec->ps_lock); + memcpy(req->rq_sepol, imp_sec->ps_sepol, + sizeof(req->rq_sepol)); + spin_unlock(&imp_sec->ps_lock); + } else if (rc == -ENODEV) { + CDEBUG(D_SEC, + "Client cannot report SELinux status, SELinux is disabled.\n"); + rc = 0; + } + + RETURN(rc); +} +EXPORT_SYMBOL(sptlrpc_get_sepol); + +/* + * server side security + */ + +static int flavor_allowed(struct sptlrpc_flavor *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor *flvr = &req->rq_flvr; + + if (exp->sf_rpc == SPTLRPC_FLVR_ANY || exp->sf_rpc == flvr->sf_rpc) + return 1; + + if ((req->rq_ctx_init || req->rq_ctx_fini) && + SPTLRPC_FLVR_POLICY(exp->sf_rpc) == + SPTLRPC_FLVR_POLICY(flvr->sf_rpc) && + SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc)) + return 1; + + return 0; +} + +#define EXP_FLVR_UPDATE_EXPIRE (OBD_TIMEOUT_DEFAULT + 10) + +/** + * Given an export \a exp, check whether the flavor of incoming \a req + * is allowed by the export \a exp. Main logic is about taking care of + * changing configurations. Return 0 means success. + */ +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor flavor; + + if (exp == NULL) + return 0; + + /* + * client side export has no imp_reverse, skip + * FIXME maybe we should check flavor this as well??? + */ + if (exp->exp_imp_reverse == NULL) + return 0; + + /* don't care about ctx fini rpc */ + if (req->rq_ctx_fini) + return 0; + + spin_lock(&exp->exp_lock); + + /* + * if flavor just changed (exp->exp_flvr_changed != 0), we wait for + * the first req with the new flavor, then treat it as current flavor, + * adapt reverse sec according to it. + * note the first rpc with new flavor might not be with root ctx, in + * which case delay the sec_adapt by leaving exp_flvr_adapt == 1. 
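+ * exp_flvr_changed is set by sptlrpc_target_update_exp_flavor() when a
+ * new rule is configured, and cleared below once the first request
+ * carrying the new flavor arrives; exp_flvr_adapt stays set until the
+ * reverse import has actually been adapted.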
+ */ + if (unlikely(exp->exp_flvr_changed) && + flavor_allowed(&exp->exp_flvr_old[1], req)) { + /* + * make the new flavor as "current", and old ones as + * about-to-expire + */ + CDEBUG(D_SEC, "exp %p: just changed: %x->%x\n", exp, + exp->exp_flvr.sf_rpc, exp->exp_flvr_old[1].sf_rpc); + flavor = exp->exp_flvr_old[1]; + exp->exp_flvr_old[1] = exp->exp_flvr_old[0]; + exp->exp_flvr_expire[1] = exp->exp_flvr_expire[0]; + exp->exp_flvr_old[0] = exp->exp_flvr; + exp->exp_flvr_expire[0] = ktime_get_real_seconds() + + EXP_FLVR_UPDATE_EXPIRE; + exp->exp_flvr = flavor; + + /* flavor change finished */ + exp->exp_flvr_changed = 0; + LASSERT(exp->exp_flvr_adapt == 1); + + /* if it's gss, we only interested in root ctx init */ + if (req->rq_auth_gss && + !(req->rq_ctx_init && + (req->rq_auth_usr_root || req->rq_auth_usr_mdt || + req->rq_auth_usr_ost))) { + spin_unlock(&exp->exp_lock); + CDEBUG(D_SEC, "is good but not root(%d:%d:%d:%d:%d)\n", + req->rq_auth_gss, req->rq_ctx_init, + req->rq_auth_usr_root, req->rq_auth_usr_mdt, + req->rq_auth_usr_ost); + return 0; + } + + exp->exp_flvr_adapt = 0; + spin_unlock(&exp->exp_lock); + + return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, + req->rq_svc_ctx, &flavor); + } + + /* + * if it equals to the current flavor, we accept it, but need to + * dealing with reverse sec/ctx + */ + if (likely(flavor_allowed(&exp->exp_flvr, req))) { + /* + * most cases should return here, we only interested in + * gss root ctx init + */ + if (!req->rq_auth_gss || !req->rq_ctx_init || + (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && + !req->rq_auth_usr_ost)) { + spin_unlock(&exp->exp_lock); + return 0; + } + + /* + * if flavor just changed, we should not proceed, just leave + * it and current flavor will be discovered and replaced + * shortly, and let _this_ rpc pass through + */ + if (exp->exp_flvr_changed) { + LASSERT(exp->exp_flvr_adapt); + spin_unlock(&exp->exp_lock); + return 0; + } + + if (exp->exp_flvr_adapt) { + exp->exp_flvr_adapt = 0; + CDEBUG(D_SEC, "exp %p (%x|%x|%x): do delayed adapt\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + flavor = exp->exp_flvr; + spin_unlock(&exp->exp_lock); + + return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, + req->rq_svc_ctx, + &flavor); + } else { + CDEBUG(D_SEC, + "exp %p (%x|%x|%x): is current flavor, install rvs ctx\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + spin_unlock(&exp->exp_lock); + + return sptlrpc_svc_install_rvs_ctx(exp->exp_imp_reverse, + req->rq_svc_ctx); + } + } + + if (exp->exp_flvr_expire[0]) { + if (exp->exp_flvr_expire[0] >= ktime_get_real_seconds()) { + if (flavor_allowed(&exp->exp_flvr_old[0], req)) { + CDEBUG(D_SEC, + "exp %p (%x|%x|%x): match the middle one (%lld)\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc, + (s64)(exp->exp_flvr_expire[0] - + ktime_get_real_seconds())); + spin_unlock(&exp->exp_lock); + return 0; + } + } else { + CDEBUG(D_SEC, "mark middle expired\n"); + exp->exp_flvr_expire[0] = 0; + } + CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match middle\n", exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, + req->rq_flvr.sf_rpc); + } + + /* + * now it doesn't match the current flavor, the only chance we can + * accept it is match the old flavors which is not expired. 
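+ * exp_flvr_old[1] is the last resort: it is accepted only until its
+ * exp_flvr_expire[1] timestamp, i.e. EXP_FLVR_UPDATE_EXPIRE seconds
+ * after that flavor was first superseded.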
+ */ + if (exp->exp_flvr_changed == 0 && exp->exp_flvr_expire[1]) { + if (exp->exp_flvr_expire[1] >= ktime_get_real_seconds()) { + if (flavor_allowed(&exp->exp_flvr_old[1], req)) { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the oldest one (%lld)\n", + exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc, + (s64)(exp->exp_flvr_expire[1] - + ktime_get_real_seconds())); + spin_unlock(&exp->exp_lock); + return 0; + } + } else { + CDEBUG(D_SEC, "mark oldest expired\n"); + exp->exp_flvr_expire[1] = 0; + } + CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match found\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, + req->rq_flvr.sf_rpc); + } else { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): skip the last one\n", + exp, exp->exp_flvr.sf_rpc, exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + } + + spin_unlock(&exp->exp_lock); + + CWARN("exp %p(%s): req %p (%u|%u|%u|%u|%u|%u) with unauthorized flavor %x, expect %x|%x(%+lld)|%x(%+lld)\n", + exp, exp->exp_obd->obd_name, + req, req->rq_auth_gss, req->rq_ctx_init, req->rq_ctx_fini, + req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_auth_usr_ost, + req->rq_flvr.sf_rpc, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_expire[0] ? + (s64)(exp->exp_flvr_expire[0] - ktime_get_real_seconds()) : 0, + exp->exp_flvr_old[1].sf_rpc, + exp->exp_flvr_expire[1] ? + (s64)(exp->exp_flvr_expire[1] - ktime_get_real_seconds()) : 0); + return -EACCES; +} +EXPORT_SYMBOL(sptlrpc_target_export_check); + +void sptlrpc_target_update_exp_flavor(struct obd_device *obd, + struct sptlrpc_rule_set *rset) +{ + struct obd_export *exp; + struct sptlrpc_flavor new_flvr; + + LASSERT(obd); + + spin_lock(&obd->obd_dev_lock); + + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) { + if (exp->exp_connection == NULL) + continue; + + /* + * note if this export had just been updated flavor + * (exp_flvr_changed == 1), this will override the + * previous one. 
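+ * the new flavor is only staged in exp_flvr_old[1] here; it is promoted
+ * to the current flavor by sptlrpc_target_export_check() when the first
+ * request carrying it is seen.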
+ */ + spin_lock(&exp->exp_lock); + sptlrpc_target_choose_flavor(rset, exp->exp_sp_peer, + lnet_nid_to_nid4(&exp->exp_connection->c_peer.nid), + &new_flvr); + if (exp->exp_flvr_changed || + !flavor_equal(&new_flvr, &exp->exp_flvr)) { + exp->exp_flvr_old[1] = new_flvr; + exp->exp_flvr_expire[1] = 0; + exp->exp_flvr_changed = 1; + exp->exp_flvr_adapt = 1; + + CDEBUG(D_SEC, "exp %p (%s): updated flavor %x->%x\n", + exp, sptlrpc_part2name(exp->exp_sp_peer), + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + } + spin_unlock(&exp->exp_lock); + } + + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(sptlrpc_target_update_exp_flavor); + +static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc) +{ + /* peer's claim is unreliable unless gss is being used */ + if (!req->rq_auth_gss || svc_rc == SECSVC_DROP) + return svc_rc; + + switch (req->rq_sp_from) { + case LUSTRE_SP_CLI: + if (req->rq_auth_usr_mdt || req->rq_auth_usr_ost) { + /* The below message is checked in sanity-sec test_33 */ + DEBUG_REQ(D_ERROR, req, "faked source CLI"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_MDT: + if (!req->rq_auth_usr_mdt) { + /* The below message is checked in sanity-sec test_33 */ + DEBUG_REQ(D_ERROR, req, "faked source MDT"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_OST: + if (!req->rq_auth_usr_ost) { + /* The below message is checked in sanity-sec test_33 */ + DEBUG_REQ(D_ERROR, req, "faked source OST"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_MGS: + case LUSTRE_SP_MGC: + if (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && + !req->rq_auth_usr_ost) { + /* The below message is checked in sanity-sec test_33 */ + DEBUG_REQ(D_ERROR, req, "faked source MGC/MGS"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_ANY: + default: + DEBUG_REQ(D_ERROR, req, "invalid source %u", req->rq_sp_from); + svc_rc = SECSVC_DROP; + } + + return svc_rc; +} + +/** + * Used by ptlrpc server, to perform transformation upon request message of + * incoming \a req. This must be the first thing to do with an incoming + * request in ptlrpc layer. + * + * \retval SECSVC_OK success, and req->rq_reqmsg point to request message in + * clear text, size is req->rq_reqlen; also req->rq_svc_ctx is set. + * \retval SECSVC_COMPLETE success, the request has been fully processed, and + * reply message has been prepared. + * \retval SECSVC_DROP failed, this request should be dropped. 
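+ *
+ * \note the security flavor is taken from lm_secflvr of the incoming
+ * buffer; a flavor without a registered policy results in SECSVC_DROP.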
+ */ +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_sec_policy *policy; + struct lustre_msg *msg = req->rq_reqbuf; + int rc; + + ENTRY; + + LASSERT(msg); + LASSERT(req->rq_reqmsg == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_svc_ctx == NULL); + + req->rq_req_swab_mask = 0; + + rc = __lustre_unpack_msg(msg, req->rq_reqdata_len); + switch (rc) { + case 1: + req_capsule_set_req_swabbed(&req->rq_pill, + MSG_PTLRPC_HEADER_OFF); + case 0: + break; + default: + CERROR("error unpacking request from %s x%llu\n", + libcfs_id2str(req->rq_peer), req->rq_xid); + RETURN(SECSVC_DROP); + } + + req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr); + req->rq_sp_from = LUSTRE_SP_ANY; + req->rq_auth_uid = -1; /* set to INVALID_UID */ + req->rq_auth_mapped_uid = -1; + + policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc); + if (!policy) { + CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc); + RETURN(SECSVC_DROP); + } + + LASSERT(policy->sp_sops->accept); + rc = policy->sp_sops->accept(req); + sptlrpc_policy_put(policy); + LASSERT(req->rq_reqmsg || rc != SECSVC_OK); + LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP); + + /* + * if it's not null flavor (which means embedded packing msg), + * reset the swab mask for the comming inner msg unpacking. + */ + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) + req->rq_req_swab_mask = 0; + + /* sanity check for the request source */ + rc = sptlrpc_svc_check_from(req, rc); + RETURN(rc); +} + +/** + * Used by ptlrpc server, to allocate reply buffer for \a req. If succeed, + * req->rq_reply_state is set, and req->rq_reply_state->rs_msg point to + * a buffer of \a msglen size. + */ +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen) +{ + struct ptlrpc_sec_policy *policy; + struct ptlrpc_reply_state *rs; + int rc; + + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_svc_ctx->sc_policy); + + policy = req->rq_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->alloc_rs); + + rc = policy->sp_sops->alloc_rs(req, msglen); + if (unlikely(rc == -ENOMEM)) { + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + + if (svcpt->scp_service->srv_max_reply_size < + msglen + sizeof(struct ptlrpc_reply_state)) { + /* Just return failure if the size is too big */ + CERROR("size of message is too big (%zd), %d allowed\n", + msglen + sizeof(struct ptlrpc_reply_state), + svcpt->scp_service->srv_max_reply_size); + RETURN(-ENOMEM); + } + + /* failed alloc, try emergency pool */ + rs = lustre_get_emerg_rs(svcpt); + if (rs == NULL) + RETURN(-ENOMEM); + + req->rq_reply_state = rs; + rc = policy->sp_sops->alloc_rs(req, msglen); + if (rc) { + lustre_put_emerg_rs(rs); + req->rq_reply_state = NULL; + } + } + + LASSERT(rc != 0 || + (req->rq_reply_state && req->rq_reply_state->rs_msg)); + + RETURN(rc); +} + +/** + * Used by ptlrpc server, to perform transformation upon reply message. + * + * \post req->rq_reply_off is set to approriate server-controlled reply offset. + * \post req->rq_repmsg and req->rq_reply_state->rs_msg becomes inaccessible. + */ +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_sec_policy *policy; + int rc; + + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_svc_ctx->sc_policy); + + policy = req->rq_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->authorize); + + rc = policy->sp_sops->authorize(req); + LASSERT(rc || req->rq_reply_state->rs_repdata_len); + + RETURN(rc); +} + +/** + * Used by ptlrpc server, to free reply_state. 
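+ * Reply states borrowed from the emergency pool (rs_prealloc) are
+ * returned to it after the policy's free_rs().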
+ */ +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_sec_policy *policy; + unsigned int prealloc; + + ENTRY; + + LASSERT(rs->rs_svc_ctx); + LASSERT(rs->rs_svc_ctx->sc_policy); + + policy = rs->rs_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->free_rs); + + prealloc = rs->rs_prealloc; + policy->sp_sops->free_rs(rs); + + if (prealloc) + lustre_put_emerg_rs(rs); + EXIT; +} + +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx != NULL) + atomic_inc(&ctx->sc_refcount); +} + +void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx == NULL) + return; + + LASSERT_ATOMIC_POS(&ctx->sc_refcount); + if (atomic_dec_and_test(&ctx->sc_refcount)) { + if (ctx->sc_policy->sp_sops->free_ctx) + ctx->sc_policy->sp_sops->free_ctx(ctx); + } + req->rq_svc_ctx = NULL; +} + +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx == NULL) + return; + + LASSERT_ATOMIC_POS(&ctx->sc_refcount); + if (ctx->sc_policy->sp_sops->invalidate_ctx) + ctx->sc_policy->sp_sops->invalidate_ctx(ctx); +} +EXPORT_SYMBOL(sptlrpc_svc_ctx_invalidate); + +/* + * bulk security + */ + +/** + * Perform transformation upon bulk data pointed by \a desc. This is called + * before transforming the request message. + */ +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_cli_ctx *ctx; + + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->wrap_bulk) + return ctx->cc_ops->wrap_bulk(ctx, req, desc); + return 0; +} +EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk); + +/** + * This is called after unwrap the reply message. + * return nob of actual plain text size received, or error code. + */ +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob) +{ + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(req->rq_bulk_read && !req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return desc->bd_nob_transferred; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + return desc->bd_nob_transferred; +} +EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read); + +/** + * This is called after unwrap the reply message. + * return 0 for success or error code. + */ +int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(!req->rq_bulk_read && req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + + /* + * if everything is going right, nob should equals to nob_transferred. + * in case of privacy mode, nob_transferred needs to be adjusted. + */ + if (desc->bd_nob != desc->bd_nob_transferred) { + CERROR("nob %d doesn't match transferred nob %d\n", + desc->bd_nob, desc->bd_nob_transferred); + return -EPROTO; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_write); + +#ifdef HAVE_SERVER_SUPPORT +/** + * Performe transformation upon outgoing bulk read. 
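+ * This is a no-op unless the request carries a bulk security descriptor
+ * (rq_pack_bulk) and the policy implements wrap_bulk.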
+ */ +int sptlrpc_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_svc_ctx *ctx; + + LASSERT(req->rq_bulk_read); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_svc_ctx; + if (ctx->sc_policy->sp_sops->wrap_bulk) + return ctx->sc_policy->sp_sops->wrap_bulk(req, desc); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_svc_wrap_bulk); + +/** + * Performe transformation upon incoming bulk write. + */ +int sptlrpc_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_svc_ctx *ctx; + int rc; + + LASSERT(req->rq_bulk_write); + + /* + * if it's in privacy mode, transferred should >= expected; otherwise + * transferred should == expected. + */ + if (desc->bd_nob_transferred < desc->bd_nob || + (desc->bd_nob_transferred > desc->bd_nob && + SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != + SPTLRPC_BULK_SVC_PRIV)) { + DEBUG_REQ(D_ERROR, req, "truncated bulk GET %d(%d)", + desc->bd_nob_transferred, desc->bd_nob); + return -ETIMEDOUT; + } + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_svc_ctx; + if (ctx->sc_policy->sp_sops->unwrap_bulk) { + rc = ctx->sc_policy->sp_sops->unwrap_bulk(req, desc); + if (rc) + CERROR("error unwrap bulk: %d\n", rc); + } + + /* return 0 to allow reply be sent */ + return 0; +} +EXPORT_SYMBOL(sptlrpc_svc_unwrap_bulk); + +/** + * Prepare buffers for incoming bulk write. + */ +int sptlrpc_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_svc_ctx *ctx; + + LASSERT(req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_svc_ctx; + if (ctx->sc_policy->sp_sops->prep_bulk) + return ctx->sc_policy->sp_sops->prep_bulk(req, desc); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_svc_prep_bulk); + +#endif /* HAVE_SERVER_SUPPORT */ + +/* + * user descriptor helpers + */ + +int sptlrpc_current_user_desc_size(void) +{ + int ngroups; + + ngroups = current_cred()->group_info->ngroups; + + if (ngroups > LUSTRE_MAX_GROUPS) + ngroups = LUSTRE_MAX_GROUPS; + return sptlrpc_user_desc_size(ngroups); +} +EXPORT_SYMBOL(sptlrpc_current_user_desc_size); + +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset) +{ + struct ptlrpc_user_desc *pud; + int ngroups; + + pud = lustre_msg_buf(msg, offset, 0); + + pud->pud_uid = from_kuid(&init_user_ns, current_uid()); + pud->pud_gid = from_kgid(&init_user_ns, current_gid()); + pud->pud_fsuid = from_kuid(&init_user_ns, current_fsuid()); + pud->pud_fsgid = from_kgid(&init_user_ns, current_fsgid()); + pud->pud_cap = current_cap().cap[0]; + pud->pud_ngroups = (msg->lm_buflens[offset] - sizeof(*pud)) / 4; + + task_lock(current); + ngroups = current_cred()->group_info->ngroups; + if (pud->pud_ngroups > ngroups) + pud->pud_ngroups = ngroups; +#ifdef HAVE_GROUP_INFO_GID + memcpy(pud->pud_groups, current_cred()->group_info->gid, + pud->pud_ngroups * sizeof(__u32)); +#else /* !HAVE_GROUP_INFO_GID */ + memcpy(pud->pud_groups, current_cred()->group_info->blocks[0], + pud->pud_ngroups * sizeof(__u32)); +#endif /* HAVE_GROUP_INFO_GID */ + task_unlock(current); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_pack_user_desc); + +int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset, int swabbed) +{ + struct ptlrpc_user_desc *pud; + int i; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + if (!pud) + return -EINVAL; + + if (swabbed) { + __swab32s(&pud->pud_uid); + __swab32s(&pud->pud_gid); + __swab32s(&pud->pud_fsuid); + __swab32s(&pud->pud_fsgid); + __swab32s(&pud->pud_cap); + __swab32s(&pud->pud_ngroups); + } + 
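+ /* bound-check the client-supplied group count before using it below */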
+ if (pud->pud_ngroups > LUSTRE_MAX_GROUPS) { + CERROR("%u groups is too large\n", pud->pud_ngroups); + return -EINVAL; + } + + if (sizeof(*pud) + pud->pud_ngroups * sizeof(__u32) > + msg->lm_buflens[offset]) { + CERROR("%u groups are claimed but bufsize only %u\n", + pud->pud_ngroups, msg->lm_buflens[offset]); + return -EINVAL; + } + + if (swabbed) { + for (i = 0; i < pud->pud_ngroups; i++) + __swab32s(&pud->pud_groups[i]); + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_unpack_user_desc); + +/* + * misc helpers + */ + +const char *sec2target_str(struct ptlrpc_sec *sec) +{ + if (!sec || !sec->ps_import || !sec->ps_import->imp_obd) + return "*"; + if (sec_is_reverse(sec)) + return "c"; + return obd_uuid2str(&sec->ps_import->imp_obd->u.cli.cl_target_uuid); +} +EXPORT_SYMBOL(sec2target_str); + +/* + * return true if the bulk data is protected + */ +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr) +{ + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_INTG: + case SPTLRPC_BULK_SVC_PRIV: + return 1; + default: + return 0; + } +} +EXPORT_SYMBOL(sptlrpc_flavor_has_bulk); + +/* + * crypto API helper/alloc blkciper + */ + +/* + * initialize/finalize + */ + +int sptlrpc_init(void) +{ + int rc; + + rwlock_init(&policy_lock); + + rc = sptlrpc_gc_init(); + if (rc) + goto out; + + rc = sptlrpc_conf_init(); + if (rc) + goto out_gc; + + rc = sptlrpc_enc_pool_init(); + if (rc) + goto out_conf; + + rc = sptlrpc_null_init(); + if (rc) + goto out_pool; + + rc = sptlrpc_plain_init(); + if (rc) + goto out_null; + + rc = sptlrpc_lproc_init(); + if (rc) + goto out_plain; + + return 0; + +out_plain: + sptlrpc_plain_fini(); +out_null: + sptlrpc_null_fini(); +out_pool: + sptlrpc_enc_pool_fini(); +out_conf: + sptlrpc_conf_fini(); +out_gc: + sptlrpc_gc_fini(); +out: + return rc; +} + +void sptlrpc_fini(void) +{ + sptlrpc_lproc_fini(); + sptlrpc_plain_fini(); + sptlrpc_null_fini(); + sptlrpc_enc_pool_fini(); + sptlrpc_conf_fini(); + sptlrpc_gc_fini(); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c new file mode 100644 index 0000000000000..bdb65dd637e97 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c @@ -0,0 +1,1005 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/sec_bulk.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static int mult = 20 - PAGE_SHIFT; +static int enc_pool_max_memory_mb; +module_param(enc_pool_max_memory_mb, int, 0644); +MODULE_PARM_DESC(enc_pool_max_memory_mb, + "Encoding pool max memory (MB), 1/8 of total physical memory by default"); + +/* + * bulk encryption page pools + */ + +#define PTRS_PER_PAGE (PAGE_SIZE / sizeof(void *)) +#define PAGES_PER_POOL (PTRS_PER_PAGE) + +#define IDLE_IDX_MAX (100) +#define IDLE_IDX_WEIGHT (3) + +#define CACHE_QUIESCENT_PERIOD (20) + +static struct ptlrpc_enc_page_pool { + unsigned long epp_max_pages; /* maximum pages can hold, const */ + unsigned int epp_max_pools; /* number of pools, const */ + + /* + * wait queue in case of not enough free pages. + */ + wait_queue_head_t epp_waitq; /* waiting threads */ + unsigned int epp_waitqlen; /* wait queue length */ + unsigned long epp_pages_short; /* # of pages wanted of in-q users */ + unsigned int epp_growing:1; /* during adding pages */ + + /* + * indicating how idle the pools are, from 0 to MAX_IDLE_IDX + * this is counted based on each time when getting pages from + * the pools, not based on time. which means in case that system + * is idled for a while but the idle_idx might still be low if no + * activities happened in the pools. + */ + unsigned long epp_idle_idx; + + /* last shrink time due to mem tight */ + time64_t epp_last_shrink; + time64_t epp_last_access; + + /* in-pool pages bookkeeping */ + spinlock_t epp_lock; /* protect following fields */ + unsigned long epp_total_pages; /* total pages in pools */ + unsigned long epp_free_pages; /* current pages available */ + + /* statistics */ + unsigned long epp_st_max_pages; /* # of pages ever reached */ + unsigned int epp_st_grows; /* # of grows */ + unsigned int epp_st_grow_fails; /* # of add pages failures */ + unsigned int epp_st_shrinks; /* # of shrinks */ + unsigned long epp_st_access; /* # of access */ + unsigned long epp_st_missings; /* # of cache missing */ + unsigned long epp_st_lowfree; /* lowest free pages reached */ + unsigned int epp_st_max_wqlen; /* highest waitqueue length */ + ktime_t epp_st_max_wait; /* in nanoseconds */ + unsigned long epp_st_outofmem; /* # of out of mem requests */ + /* + * pointers to pools, may be vmalloc'd + */ + struct page ***epp_pools; +} page_pools; + +/* + * /proc/fs/lustre/sptlrpc/encrypt_page_pools + */ +int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) +{ + spin_lock(&page_pools.epp_lock); + + seq_printf(m, "physical pages: %lu\n" + "pages per pool: %lu\n" + "max pages: %lu\n" + "max pools: %u\n" + "total pages: %lu\n" + "total free: %lu\n" + "idle index: %lu/100\n" + "last shrink: %llds\n" + "last access: %llds\n" + "max pages reached: %lu\n" + "grows: %u\n" + "grows failure: %u\n" + "shrinks: %u\n" + "cache access: %lu\n" + "cache missing: %lu\n" + "low free mark: %lu\n" + "max waitqueue depth: %u\n" + "max wait time ms: %lld\n" + "out of mem: %lu\n", + cfs_totalram_pages(), PAGES_PER_POOL, + page_pools.epp_max_pages, + page_pools.epp_max_pools, + page_pools.epp_total_pages, + page_pools.epp_free_pages, + page_pools.epp_idle_idx, + ktime_get_seconds() - page_pools.epp_last_shrink, + ktime_get_seconds() - page_pools.epp_last_access, + page_pools.epp_st_max_pages, + page_pools.epp_st_grows, + 
page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, + page_pools.epp_st_access, + page_pools.epp_st_missings, + page_pools.epp_st_lowfree, + page_pools.epp_st_max_wqlen, + ktime_to_ms(page_pools.epp_st_max_wait), + page_pools.epp_st_outofmem); + + spin_unlock(&page_pools.epp_lock); + return 0; +} + +static void enc_pools_release_free_pages(long npages) +{ + int p_idx, g_idx; + int p_idx_max1, p_idx_max2; + + LASSERT(npages > 0); + LASSERT(npages <= page_pools.epp_free_pages); + LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages); + + /* max pool index before the release */ + p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL; + + page_pools.epp_free_pages -= npages; + page_pools.epp_total_pages -= npages; + + /* max pool index after the release */ + p_idx_max1 = page_pools.epp_total_pages == 0 ? -1 : + ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + LASSERT(page_pools.epp_pools[p_idx]); + + while (npages--) { + LASSERT(page_pools.epp_pools[p_idx]); + LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); + + __free_page(page_pools.epp_pools[p_idx][g_idx]); + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + /* free unused pools */ + while (p_idx_max1 < p_idx_max2) { + LASSERT(page_pools.epp_pools[p_idx_max2]); + OBD_FREE(page_pools.epp_pools[p_idx_max2], PAGE_SIZE); + page_pools.epp_pools[p_idx_max2] = NULL; + p_idx_max2--; + } +} + +/* + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static unsigned long enc_pools_shrink_count(struct shrinker *s, + struct shrink_control *sc) +{ + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. + */ + if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return (page_pools.epp_free_pages <= PTLRPC_MAX_BRW_PAGES) ? 0 : + (page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES) * + (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX; +} + +/* + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static unsigned long enc_pools_shrink_scan(struct shrinker *s, + struct shrink_control *sc) +{ + spin_lock(&page_pools.epp_lock); + if (page_pools.epp_free_pages <= PTLRPC_MAX_BRW_PAGES) + sc->nr_to_scan = 0; + else + sc->nr_to_scan = min_t(unsigned long, sc->nr_to_scan, + page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES); + if (sc->nr_to_scan > 0) { + enc_pools_release_free_pages(sc->nr_to_scan); + CDEBUG(D_SEC, "released %ld pages, %ld left\n", + (long)sc->nr_to_scan, page_pools.epp_free_pages); + + page_pools.epp_st_shrinks++; + page_pools.epp_last_shrink = ktime_get_seconds(); + } + spin_unlock(&page_pools.epp_lock); + + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. 
+ */ + if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return sc->nr_to_scan; +} + +#ifdef HAVE_SHRINKER_COUNT +static struct shrinker pools_shrinker = { + .count_objects = enc_pools_shrink_count, + .scan_objects = enc_pools_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; +#else +/* + * could be called frequently for query (@nr_to_scan == 0). + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static int enc_pools_shrink(struct shrinker *shrinker, + struct shrink_control *sc) +{ + enc_pools_shrink_scan(shrinker, sc); + + return enc_pools_shrink_count(shrinker, sc); +} + +static struct shrinker pools_shrinker = { + .shrink = enc_pools_shrink, + .seeks = DEFAULT_SEEKS, +}; +#endif /* HAVE_SHRINKER_COUNT */ + +static inline +int npages_to_npools(unsigned long npages) +{ + return (int) ((npages + PAGES_PER_POOL - 1) / PAGES_PER_POOL); +} + +/* + * return how many pages cleaned up. + */ +static unsigned long enc_pools_cleanup(struct page ***pools, int npools) +{ + unsigned long cleaned = 0; + int i, j; + + for (i = 0; i < npools; i++) { + if (pools[i]) { + for (j = 0; j < PAGES_PER_POOL; j++) { + if (pools[i][j]) { + __free_page(pools[i][j]); + cleaned++; + } + } + OBD_FREE(pools[i], PAGE_SIZE); + pools[i] = NULL; + } + } + + return cleaned; +} + +/* + * merge @npools pointed by @pools which contains @npages new pages + * into current pools. + * + * we have options to avoid most memory copy with some tricks. but we choose + * the simplest way to avoid complexity. It's not frequently called. + */ +static void enc_pools_insert(struct page ***pools, int npools, int npages) +{ + int freeslot; + int op_idx, np_idx, og_idx, ng_idx; + int cur_npools, end_npools; + + LASSERT(npages > 0); + LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages); + LASSERT(npages_to_npools(npages) == npools); + LASSERT(page_pools.epp_growing); + + spin_lock(&page_pools.epp_lock); + + /* + * (1) fill all the free slots of current pools. + */ + /* + * free slots are those left by rent pages, and the extra ones with + * index >= total_pages, locate at the tail of last pool. + */ + freeslot = page_pools.epp_total_pages % PAGES_PER_POOL; + if (freeslot != 0) + freeslot = PAGES_PER_POOL - freeslot; + freeslot += page_pools.epp_total_pages - page_pools.epp_free_pages; + + op_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + og_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + np_idx = npools - 1; + ng_idx = (npages - 1) % PAGES_PER_POOL; + + while (freeslot) { + LASSERT(page_pools.epp_pools[op_idx][og_idx] == NULL); + LASSERT(pools[np_idx][ng_idx] != NULL); + + page_pools.epp_pools[op_idx][og_idx] = pools[np_idx][ng_idx]; + pools[np_idx][ng_idx] = NULL; + + freeslot--; + + if (++og_idx == PAGES_PER_POOL) { + op_idx++; + og_idx = 0; + } + if (--ng_idx < 0) { + if (np_idx == 0) + break; + np_idx--; + ng_idx = PAGES_PER_POOL - 1; + } + } + + /* + * (2) add pools if needed. 
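+ * any pools[] arrays still needed after step (1) are linked in as whole
+ * new pools behind the current last one; epp_max_pools is sized for
+ * epp_max_pages, so there is always room.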
+ */ + cur_npools = (page_pools.epp_total_pages + PAGES_PER_POOL - 1) / + PAGES_PER_POOL; + end_npools = (page_pools.epp_total_pages + npages + + PAGES_PER_POOL - 1) / PAGES_PER_POOL; + LASSERT(end_npools <= page_pools.epp_max_pools); + + np_idx = 0; + while (cur_npools < end_npools) { + LASSERT(page_pools.epp_pools[cur_npools] == NULL); + LASSERT(np_idx < npools); + LASSERT(pools[np_idx] != NULL); + + page_pools.epp_pools[cur_npools++] = pools[np_idx]; + pools[np_idx++] = NULL; + } + + page_pools.epp_total_pages += npages; + page_pools.epp_free_pages += npages; + page_pools.epp_st_lowfree = page_pools.epp_free_pages; + + if (page_pools.epp_total_pages > page_pools.epp_st_max_pages) + page_pools.epp_st_max_pages = page_pools.epp_total_pages; + + CDEBUG(D_SEC, "add %d pages to total %lu\n", npages, + page_pools.epp_total_pages); + + spin_unlock(&page_pools.epp_lock); +} + +static int enc_pools_add_pages(int npages) +{ + static DEFINE_MUTEX(add_pages_mutex); + struct page ***pools; + int npools, alloced = 0; + int i, j, rc = -ENOMEM; + + if (npages < PTLRPC_MAX_BRW_PAGES) + npages = PTLRPC_MAX_BRW_PAGES; + + mutex_lock(&add_pages_mutex); + + if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages) + npages = page_pools.epp_max_pages - page_pools.epp_total_pages; + LASSERT(npages > 0); + + page_pools.epp_st_grows++; + + npools = npages_to_npools(npages); + OBD_ALLOC_PTR_ARRAY(pools, npools); + if (pools == NULL) + goto out; + + for (i = 0; i < npools; i++) { + OBD_ALLOC(pools[i], PAGE_SIZE); + if (pools[i] == NULL) + goto out_pools; + + for (j = 0; j < PAGES_PER_POOL && alloced < npages; j++) { + pools[i][j] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + if (pools[i][j] == NULL) + goto out_pools; + + alloced++; + } + } + LASSERT(alloced == npages); + + enc_pools_insert(pools, npools, npages); + CDEBUG(D_SEC, "added %d pages into pools\n", npages); + rc = 0; + +out_pools: + enc_pools_cleanup(pools, npools); + OBD_FREE_PTR_ARRAY(pools, npools); +out: + if (rc) { + page_pools.epp_st_grow_fails++; + CERROR("Failed to allocate %d enc pages\n", npages); + } + + mutex_unlock(&add_pages_mutex); + return rc; +} + +static inline void enc_pools_wakeup(void) +{ + assert_spin_locked(&page_pools.epp_lock); + + /* waitqueue_active */ + if (unlikely(waitqueue_active(&page_pools.epp_waitq))) + wake_up(&page_pools.epp_waitq); +} + +static int enc_pools_should_grow(int page_needed, time64_t now) +{ + /* + * don't grow if someone else is growing the pools right now, + * or the pools has reached its full capacity + */ + if (page_pools.epp_growing || + page_pools.epp_total_pages == page_pools.epp_max_pages) + return 0; + + /* if total pages is not enough, we need to grow */ + if (page_pools.epp_total_pages < page_needed) + return 1; + + /* + * we wanted to return 0 here if there was a shrink just + * happened a moment ago, but this may cause deadlock if both + * client and ost live on single node. + */ + + /* + * here we perhaps need consider other factors like wait queue + * length, idle index, etc. ? 
+ */ + + /* grow the pools in any other cases */ + return 1; +} + +/* + * Export the number of free pages in the pool + */ +int get_free_pages_in_pool(void) +{ + return page_pools.epp_free_pages; +} +EXPORT_SYMBOL(get_free_pages_in_pool); + +/* + * Let outside world know if enc_pool full capacity is reached + */ +int pool_is_at_full_capacity(void) +{ + return (page_pools.epp_total_pages == page_pools.epp_max_pages); +} +EXPORT_SYMBOL(pool_is_at_full_capacity); + +static inline struct page **page_from_bulkdesc(void *array, int index) +{ + struct ptlrpc_bulk_desc *desc = (struct ptlrpc_bulk_desc *)array; + + return &desc->bd_enc_vec[index].bv_page; +} + +static inline struct page **page_from_pagearray(void *array, int index) +{ + struct page **pa = (struct page **)array; + + return &pa[index]; +} + +/* + * we allocate the requested pages atomically. + */ +static inline int __sptlrpc_enc_pool_get_pages(void *array, unsigned int count, + struct page **(*page_from)(void *, int)) +{ + wait_queue_entry_t waitlink; + unsigned long this_idle = -1; + u64 tick_ns = 0; + time64_t now; + int p_idx, g_idx; + int i, rc = 0; + + if (!array || count <= 0 || count > page_pools.epp_max_pages) + return -EINVAL; + + spin_lock(&page_pools.epp_lock); + + page_pools.epp_st_access++; +again: + if (unlikely(page_pools.epp_free_pages < count)) { + if (tick_ns == 0) + tick_ns = ktime_get_ns(); + + now = ktime_get_real_seconds(); + + page_pools.epp_st_missings++; + page_pools.epp_pages_short += count; + + if (enc_pools_should_grow(count, now)) { + page_pools.epp_growing = 1; + + spin_unlock(&page_pools.epp_lock); + enc_pools_add_pages(page_pools.epp_pages_short / 2); + spin_lock(&page_pools.epp_lock); + + page_pools.epp_growing = 0; + + enc_pools_wakeup(); + } else { + if (page_pools.epp_growing) { + if (++page_pools.epp_waitqlen > + page_pools.epp_st_max_wqlen) + page_pools.epp_st_max_wqlen = + page_pools.epp_waitqlen; + + set_current_state(TASK_UNINTERRUPTIBLE); + init_wait(&waitlink); + add_wait_queue(&page_pools.epp_waitq, + &waitlink); + + spin_unlock(&page_pools.epp_lock); + schedule(); + remove_wait_queue(&page_pools.epp_waitq, + &waitlink); + spin_lock(&page_pools.epp_lock); + page_pools.epp_waitqlen--; + } else { + /* + * ptlrpcd thread should not sleep in that case, + * or deadlock may occur! + * Instead, return -ENOMEM so that upper layers + * will put request back in queue. 
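+ * this branch is only taken when the pools are already at full
+ * capacity and nobody else is growing them, so sleeping here could
+ * block indefinitely.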
+ */ + page_pools.epp_st_outofmem++; + GOTO(out_unlock, rc = -ENOMEM); + } + } + + if (page_pools.epp_pages_short < count) + GOTO(out_unlock, rc = -EPROTO); + page_pools.epp_pages_short -= count; + + this_idle = 0; + goto again; + } + + /* record max wait time */ + if (unlikely(tick_ns)) { + ktime_t tick = ktime_sub_ns(ktime_get(), tick_ns); + + if (ktime_after(tick, page_pools.epp_st_max_wait)) + page_pools.epp_st_max_wait = tick; + } + + /* proceed with rest of allocation */ + page_pools.epp_free_pages -= count; + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + + for (i = 0; i < count; i++) { + struct page **pagep = page_from(array, i); + + if (page_pools.epp_pools[p_idx][g_idx] == NULL) + GOTO(out_unlock, rc = -EPROTO); + *pagep = page_pools.epp_pools[p_idx][g_idx]; + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + if (page_pools.epp_free_pages < page_pools.epp_st_lowfree) + page_pools.epp_st_lowfree = page_pools.epp_free_pages; + + /* + * new idle index = (old * weight + new) / (weight + 1) + */ + if (this_idle == -1) { + this_idle = page_pools.epp_free_pages * IDLE_IDX_MAX / + page_pools.epp_total_pages; + } + page_pools.epp_idle_idx = (page_pools.epp_idle_idx * IDLE_IDX_WEIGHT + + this_idle) / + (IDLE_IDX_WEIGHT + 1); + + page_pools.epp_last_access = ktime_get_seconds(); + +out_unlock: + spin_unlock(&page_pools.epp_lock); + return rc; +} + +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) +{ + int rc; + + LASSERT(desc->bd_iov_count > 0); + LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages); + + /* resent bulk, enc iov might have been allocated previously */ + if (desc->bd_enc_vec != NULL) + return 0; + + OBD_ALLOC_LARGE(desc->bd_enc_vec, + desc->bd_iov_count * sizeof(*desc->bd_enc_vec)); + if (desc->bd_enc_vec == NULL) + return -ENOMEM; + + rc = __sptlrpc_enc_pool_get_pages((void *)desc, desc->bd_iov_count, + page_from_bulkdesc); + if (rc) { + OBD_FREE_LARGE(desc->bd_enc_vec, + desc->bd_iov_count * + sizeof(*desc->bd_enc_vec)); + desc->bd_enc_vec = NULL; + } + return rc; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages); + +int sptlrpc_enc_pool_get_pages_array(struct page **pa, unsigned int count) +{ + return __sptlrpc_enc_pool_get_pages((void *)pa, count, + page_from_pagearray); +} +EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages_array); + +static int __sptlrpc_enc_pool_put_pages(void *array, unsigned int count, + struct page **(*page_from)(void *, int)) +{ + int p_idx, g_idx; + int i, rc = 0; + + if (!array || count <= 0) + return -EINVAL; + + spin_lock(&page_pools.epp_lock); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + + if (page_pools.epp_free_pages + count > page_pools.epp_total_pages) + GOTO(out_unlock, rc = -EPROTO); + if (!page_pools.epp_pools[p_idx]) + GOTO(out_unlock, rc = -EPROTO); + + for (i = 0; i < count; i++) { + struct page **pagep = page_from(array, i); + + if (!*pagep || + page_pools.epp_pools[p_idx][g_idx] != NULL) + GOTO(out_unlock, rc = -EPROTO); + + page_pools.epp_pools[p_idx][g_idx] = *pagep; + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + page_pools.epp_free_pages += count; + enc_pools_wakeup(); + +out_unlock: + spin_unlock(&page_pools.epp_lock); + return rc; +} + +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) +{ + int rc; + + if (desc->bd_enc_vec == NULL) + return; + + rc = __sptlrpc_enc_pool_put_pages((void *)desc, 
desc->bd_iov_count, + page_from_bulkdesc); + if (rc) + CDEBUG(D_SEC, "error putting pages in enc pool: %d\n", rc); + + OBD_FREE_LARGE(desc->bd_enc_vec, + desc->bd_iov_count * sizeof(*desc->bd_enc_vec)); + desc->bd_enc_vec = NULL; +} + +void sptlrpc_enc_pool_put_pages_array(struct page **pa, unsigned int count) +{ + int rc; + + rc = __sptlrpc_enc_pool_put_pages((void *)pa, count, + page_from_pagearray); + + if (rc) + CDEBUG(D_SEC, "error putting pages in enc pool: %d\n", rc); +} +EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages_array); + +/* + * we don't do much stuff for add_user/del_user anymore, except adding some + * initial pages in add_user() if current pools are empty, rest would be + * handled by the pools's self-adaption. + */ +int sptlrpc_enc_pool_add_user(void) +{ + int need_grow = 0; + + spin_lock(&page_pools.epp_lock); + if (page_pools.epp_growing == 0 && page_pools.epp_total_pages == 0) { + page_pools.epp_growing = 1; + need_grow = 1; + } + spin_unlock(&page_pools.epp_lock); + + if (need_grow) { + enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES + + PTLRPC_MAX_BRW_PAGES); + + spin_lock(&page_pools.epp_lock); + page_pools.epp_growing = 0; + enc_pools_wakeup(); + spin_unlock(&page_pools.epp_lock); + } + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_add_user); + +int sptlrpc_enc_pool_del_user(void) +{ + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_del_user); + +static inline void enc_pools_alloc(void) +{ + LASSERT(page_pools.epp_max_pools); + OBD_ALLOC_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); +} + +static inline void enc_pools_free(void) +{ + LASSERT(page_pools.epp_max_pools); + LASSERT(page_pools.epp_pools); + + OBD_FREE_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); +} + +int sptlrpc_enc_pool_init(void) +{ + int rc; + + page_pools.epp_max_pages = cfs_totalram_pages() / 8; + if (enc_pool_max_memory_mb > 0 && + enc_pool_max_memory_mb <= (cfs_totalram_pages() >> mult)) + page_pools.epp_max_pages = enc_pool_max_memory_mb << mult; + + page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); + + init_waitqueue_head(&page_pools.epp_waitq); + page_pools.epp_waitqlen = 0; + page_pools.epp_pages_short = 0; + + page_pools.epp_growing = 0; + + page_pools.epp_idle_idx = 0; + page_pools.epp_last_shrink = ktime_get_seconds(); + page_pools.epp_last_access = ktime_get_seconds(); + + spin_lock_init(&page_pools.epp_lock); + page_pools.epp_total_pages = 0; + page_pools.epp_free_pages = 0; + + page_pools.epp_st_max_pages = 0; + page_pools.epp_st_grows = 0; + page_pools.epp_st_grow_fails = 0; + page_pools.epp_st_shrinks = 0; + page_pools.epp_st_access = 0; + page_pools.epp_st_missings = 0; + page_pools.epp_st_lowfree = 0; + page_pools.epp_st_max_wqlen = 0; + page_pools.epp_st_max_wait = ktime_set(0, 0); + page_pools.epp_st_outofmem = 0; + + enc_pools_alloc(); + if (page_pools.epp_pools == NULL) + return -ENOMEM; + + rc = register_shrinker(&pools_shrinker); + if (rc) + enc_pools_free(); + + return rc; +} + +void sptlrpc_enc_pool_fini(void) +{ + unsigned long cleaned, npools; + + LASSERT(page_pools.epp_pools); + LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages); + + unregister_shrinker(&pools_shrinker); + + npools = npages_to_npools(page_pools.epp_total_pages); + cleaned = enc_pools_cleanup(page_pools.epp_pools, npools); + LASSERT(cleaned == page_pools.epp_total_pages); + + enc_pools_free(); + + if (page_pools.epp_st_access > 0) { + CDEBUG(D_SEC, + "max pages %lu, grows %u, grow fails %u, shrinks 
%u, access %lu, missing %lu, max qlen %u, max wait ms %lld, out of mem %lu\n", + page_pools.epp_st_max_pages, page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, page_pools.epp_st_access, + page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, + ktime_to_ms(page_pools.epp_st_max_wait), + page_pools.epp_st_outofmem); + } +} + + +static int cfs_hash_alg_id[] = { + [BULK_HASH_ALG_NULL] = CFS_HASH_ALG_NULL, + [BULK_HASH_ALG_ADLER32] = CFS_HASH_ALG_ADLER32, + [BULK_HASH_ALG_CRC32] = CFS_HASH_ALG_CRC32, + [BULK_HASH_ALG_MD5] = CFS_HASH_ALG_MD5, + [BULK_HASH_ALG_SHA1] = CFS_HASH_ALG_SHA1, + [BULK_HASH_ALG_SHA256] = CFS_HASH_ALG_SHA256, + [BULK_HASH_ALG_SHA384] = CFS_HASH_ALG_SHA384, + [BULK_HASH_ALG_SHA512] = CFS_HASH_ALG_SHA512, +}; +const char *sptlrpc_get_hash_name(__u8 hash_alg) +{ + return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]); +} + +__u8 sptlrpc_get_hash_alg(const char *algname) +{ + return cfs_crypto_hash_alg(algname); +} + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed) +{ + struct ptlrpc_bulk_sec_desc *bsd; + int size = msg->lm_buflens[offset]; + + bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); + if (bsd == NULL) { + CERROR("Invalid bulk sec desc: size %d\n", size); + return -EINVAL; + } + + if (swabbed) + __swab32s(&bsd->bsd_nob); + + if (unlikely(bsd->bsd_version != 0)) { + CERROR("Unexpected version %u\n", bsd->bsd_version); + return -EPROTO; + } + + if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) { + CERROR("Invalid type %u\n", bsd->bsd_type); + return -EPROTO; + } + + /* FIXME more sanity check here */ + + if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG && + bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) { + CERROR("Invalid svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; +} +EXPORT_SYMBOL(bulk_sec_desc_unpack); + +/* + * Compute the checksum of an RPC buffer payload. If the return \a buflen + * is not large enough, truncate the result to fit so that it is possible + * to use a hash function with a large hash space, but only use a part of + * the resulting hash. 
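+ * For example with \a alg == BULK_HASH_ALG_SHA256 (32-byte digest) and
+ * \a buflen == 16, only the leading 16 bytes of the digest are copied
+ * into \a buf.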
+ */ +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen) +{ + struct ahash_request *req; + int hashsize; + unsigned int bufsize; + int i, err; + + LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); + LASSERT(buflen >= 4); + + req = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); + if (IS_ERR(req)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_hash_alg_id[alg])); + return PTR_ERR(req); + } + + hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]); + + for (i = 0; i < desc->bd_iov_count; i++) { + cfs_crypto_hash_update_page(req, + desc->bd_vec[i].bv_page, + desc->bd_vec[i].bv_offset & + ~PAGE_MASK, + desc->bd_vec[i].bv_len); + } + + if (hashsize > buflen) { + unsigned char hashbuf[CFS_CRYPTO_HASH_DIGESTSIZE_MAX]; + + bufsize = sizeof(hashbuf); + LASSERTF(bufsize >= hashsize, "bufsize = %u < hashsize %u\n", + bufsize, hashsize); + err = cfs_crypto_hash_final(req, hashbuf, &bufsize); + memcpy(buf, hashbuf, buflen); + } else { + bufsize = buflen; + err = cfs_crypto_hash_final(req, buf, &bufsize); + } + + return err; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c new file mode 100644 index 0000000000000..a36452e86eb9b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c @@ -0,0 +1,979 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +const char *sptlrpc_part2name(enum lustre_sec_part part) +{ + switch (part) { + case LUSTRE_SP_CLI: + return "cli"; + case LUSTRE_SP_MDT: + return "mdt"; + case LUSTRE_SP_OST: + return "ost"; + case LUSTRE_SP_MGC: + return "mgc"; + case LUSTRE_SP_MGS: + return "mgs"; + case LUSTRE_SP_ANY: + return "any"; + default: + return "err"; + } +} +EXPORT_SYMBOL(sptlrpc_part2name); + +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd) +{ + const char *type = obd->obd_type->typ_name; + + if (!strcmp(type, LUSTRE_MDT_NAME)) + return LUSTRE_SP_MDT; + if (!strcmp(type, LUSTRE_OST_NAME)) + return LUSTRE_SP_OST; + if (!strcmp(type, LUSTRE_MGS_NAME)) + return LUSTRE_SP_MGS; + + CERROR("unknown target %p(%s)\n", obd, type); + return LUSTRE_SP_ANY; +} + +/**************************************** + * user supplied flavor string parsing * + ****************************************/ + +/* + * format: [-] + */ +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr) +{ + char buf[32]; + char *bulk, *alg; + + memset(flvr, 0, sizeof(*flvr)); + + if (str == NULL || str[0] == '\0') { + flvr->sf_rpc = SPTLRPC_FLVR_INVALID; + return 0; + } + + strlcpy(buf, str, sizeof(buf)); + + bulk = strchr(buf, '-'); + if (bulk) + *bulk++ = '\0'; + + flvr->sf_rpc = sptlrpc_name2flavor_base(buf); + if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID) + goto err_out; + + /* + * currently only base flavor "plain" can have bulk specification. + */ + if (flvr->sf_rpc == SPTLRPC_FLVR_PLAIN) { + flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32; + if (bulk) { + /* + * format: plain-hash: + */ + alg = strchr(bulk, ':'); + if (alg == NULL) + goto err_out; + *alg++ = '\0'; + + if (strcmp(bulk, "hash")) + goto err_out; + + flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(alg); + if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX) + goto err_out; + } + + if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL) + flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL); + else + flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG); + } else { + if (bulk) + goto err_out; + } + + flvr->sf_flags = 0; + return 0; + +err_out: + CERROR("invalid flavor string: %s\n", str); + return -EINVAL; +} +EXPORT_SYMBOL(sptlrpc_parse_flavor); + +/**************************************** + * configure rules * + ****************************************/ + +static void get_default_flavor(struct sptlrpc_flavor *sf) +{ + memset(sf, 0, sizeof(*sf)); + + sf->sf_rpc = SPTLRPC_FLVR_NULL; + sf->sf_flags = 0; +} + +static void sptlrpc_rule_init(struct sptlrpc_rule *rule) +{ + rule->sr_netid = LNET_NET_ANY; + rule->sr_from = LUSTRE_SP_ANY; + rule->sr_to = LUSTRE_SP_ANY; + rule->sr_padding = 0; + + get_default_flavor(&rule->sr_flvr); +} + +/* + * format: network[.direction]=flavor + */ +int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule) +{ + char *flavor, *dir; + int rc; + + sptlrpc_rule_init(rule); + + flavor = strchr(param, '='); + if (flavor == NULL) { + CERROR("invalid param, no '='\n"); + RETURN(-EINVAL); + } + *flavor++ = '\0'; + + dir = strchr(param, '.'); + if (dir) + *dir++ = '\0'; + + /* 1.1 network */ + if (strcmp(param, "default")) { + rule->sr_netid = libcfs_str2net(param); + if (rule->sr_netid == LNET_NET_ANY) { + CERROR("invalid network name: %s\n", param); + 
RETURN(-EINVAL); + } + } + + /* 1.2 direction */ + if (dir) { + if (!strcmp(dir, "mdt2ost")) { + rule->sr_from = LUSTRE_SP_MDT; + rule->sr_to = LUSTRE_SP_OST; + } else if (!strcmp(dir, "mdt2mdt")) { + rule->sr_from = LUSTRE_SP_MDT; + rule->sr_to = LUSTRE_SP_MDT; + } else if (!strcmp(dir, "cli2ost")) { + rule->sr_from = LUSTRE_SP_CLI; + rule->sr_to = LUSTRE_SP_OST; + } else if (!strcmp(dir, "cli2mdt")) { + rule->sr_from = LUSTRE_SP_CLI; + rule->sr_to = LUSTRE_SP_MDT; + } else { + CERROR("invalid rule dir segment: %s\n", dir); + RETURN(-EINVAL); + } + } + + /* 2.1 flavor */ + rc = sptlrpc_parse_flavor(flavor, &rule->sr_flvr); + if (rc) + RETURN(-EINVAL); + + RETURN(0); +} +EXPORT_SYMBOL(sptlrpc_parse_rule); + +void sptlrpc_rule_set_free(struct sptlrpc_rule_set *rset) +{ + LASSERT(rset->srs_nslot || + (rset->srs_nrule == 0 && rset->srs_rules == NULL)); + + if (rset->srs_nslot) { + OBD_FREE_PTR_ARRAY(rset->srs_rules, rset->srs_nslot); + sptlrpc_rule_set_init(rset); + } +} +EXPORT_SYMBOL(sptlrpc_rule_set_free); + +/* + * return 0 if the rule set could accomodate one more rule. + */ +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule *rules; + int nslot; + + might_sleep(); + + if (rset->srs_nrule < rset->srs_nslot) + return 0; + + nslot = rset->srs_nslot + 8; + + /* better use realloc() if available */ + OBD_ALLOC_PTR_ARRAY(rules, nslot); + if (rules == NULL) + return -ENOMEM; + + if (rset->srs_nrule) { + LASSERT(rset->srs_nslot && rset->srs_rules); + memcpy(rules, rset->srs_rules, + rset->srs_nrule * sizeof(*rset->srs_rules)); + + OBD_FREE_PTR_ARRAY(rset->srs_rules, rset->srs_nslot); + } + + rset->srs_rules = rules; + rset->srs_nslot = nslot; + return 0; +} + +static inline int rule_spec_dir(struct sptlrpc_rule *rule) +{ + return (rule->sr_from != LUSTRE_SP_ANY || + rule->sr_to != LUSTRE_SP_ANY); +} +static inline int rule_spec_net(struct sptlrpc_rule *rule) +{ + return (rule->sr_netid != LNET_NET_ANY); +} +static inline int rule_match_dir(struct sptlrpc_rule *r1, + struct sptlrpc_rule *r2) +{ + return (r1->sr_from == r2->sr_from && r1->sr_to == r2->sr_to); +} +static inline int rule_match_net(struct sptlrpc_rule *r1, + struct sptlrpc_rule *r2) +{ + return (r1->sr_netid == r2->sr_netid); +} + +/* + * merge @rule into @rset. + * the @rset slots might be expanded. 
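+ * A rule whose flavor is SPTLRPC_FLVR_INVALID acts as a deletion request:
+ * the matching rule, if any, is removed from the set. Otherwise a matching
+ * rule is overridden in place, or the new rule is inserted ahead of the
+ * less specific (wildcard) entries.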
+ */ +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset, + struct sptlrpc_rule *rule) +{ + struct sptlrpc_rule *p = rset->srs_rules; + int spec_dir, spec_net; + int rc, n, match = 0; + + might_sleep(); + + spec_net = rule_spec_net(rule); + spec_dir = rule_spec_dir(rule); + + for (n = 0; n < rset->srs_nrule; n++) { + p = &rset->srs_rules[n]; + + /* test network match, if failed: + * - spec rule: skip rules which is also spec rule match, until + * we hit a wild rule, which means no more chance + * - wild rule: skip until reach the one which is also wild + * and matches + */ + if (!rule_match_net(p, rule)) { + if (spec_net) { + if (rule_spec_net(p)) + continue; + else + break; + } else { + continue; + } + } + + /* test dir match, same logic as net matching */ + if (!rule_match_dir(p, rule)) { + if (spec_dir) { + if (rule_spec_dir(p)) + continue; + else + break; + } else { + continue; + } + } + + /* find a match */ + match = 1; + break; + } + + if (match) { + LASSERT(n >= 0 && n < rset->srs_nrule); + + if (rule->sr_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { + /* remove this rule */ + if (n < rset->srs_nrule - 1) + memmove(&rset->srs_rules[n], + &rset->srs_rules[n + 1], + (rset->srs_nrule - n - 1) * + sizeof(*rule)); + rset->srs_nrule--; + } else { + /* override the rule */ + memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); + } + } else { + LASSERT(n >= 0 && n <= rset->srs_nrule); + + if (rule->sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) { + rc = sptlrpc_rule_set_expand(rset); + if (rc) + return rc; + + if (n < rset->srs_nrule) + memmove(&rset->srs_rules[n + 1], + &rset->srs_rules[n], + (rset->srs_nrule - n) * sizeof(*rule)); + memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); + rset->srs_nrule++; + } else { + CDEBUG(D_CONFIG, "ignore the unmatched deletion\n"); + } + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_rule_set_merge); + +/** + * given from/to/nid, determine a matching flavor in ruleset. + * return 1 if a match found, otherwise return 0. 
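+ * Rules are kept with network/direction specific entries ahead of wildcard
+ * ones (see sptlrpc_rule_set_merge()), so the first match found below is
+ * also the most specific one.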
+ */ +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_rule *r; + int n; + + for (n = 0; n < rset->srs_nrule; n++) { + r = &rset->srs_rules[n]; + + if (LNET_NIDNET(nid) != LNET_NET_ANY && + r->sr_netid != LNET_NET_ANY && + LNET_NIDNET(nid) != r->sr_netid) + continue; + + if (from != LUSTRE_SP_ANY && r->sr_from != LUSTRE_SP_ANY && + from != r->sr_from) + continue; + + if (to != LUSTRE_SP_ANY && r->sr_to != LUSTRE_SP_ANY && + to != r->sr_to) + continue; + + *sf = r->sr_flvr; + return 1; + } + + return 0; +} + +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule *r; + int n; + + for (n = 0; n < rset->srs_nrule; n++) { + r = &rset->srs_rules[n]; + CDEBUG(D_SEC, "<%02d> from %x to %x, net %x, rpc %x\n", n, + r->sr_from, r->sr_to, r->sr_netid, r->sr_flvr.sf_rpc); + } +} + +static int sptlrpc_rule_set_extract(struct sptlrpc_rule_set *gen, + struct sptlrpc_rule_set *tgt, + enum lustre_sec_part from, + enum lustre_sec_part to, + struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule_set *src[2] = { gen, tgt }; + struct sptlrpc_rule *rule; + int i, n, rc; + + might_sleep(); + + /* merge general rules firstly, then target-specific rules */ + for (i = 0; i < 2; i++) { + if (src[i] == NULL) + continue; + + for (n = 0; n < src[i]->srs_nrule; n++) { + rule = &src[i]->srs_rules[n]; + + if (from != LUSTRE_SP_ANY && + rule->sr_from != LUSTRE_SP_ANY && + rule->sr_from != from) + continue; + if (to != LUSTRE_SP_ANY && + rule->sr_to != LUSTRE_SP_ANY && + rule->sr_to != to) + continue; + + rc = sptlrpc_rule_set_merge(rset, rule); + if (rc) { + CERROR("can't merge: %d\n", rc); + return rc; + } + } + } + + return 0; +} + +/********************************** + * sptlrpc configuration support * + **********************************/ + +struct sptlrpc_conf_tgt { + struct list_head sct_list; + char sct_name[MAX_OBD_NAME]; + struct sptlrpc_rule_set sct_rset; +}; + +struct sptlrpc_conf { + struct list_head sc_list; + char sc_fsname[MTI_NAME_MAXLEN]; + unsigned int sc_modified; /* modified during updating */ + unsigned int sc_updated:1, /* updated copy from MGS */ + sc_local:1; /* local copy from target */ + struct sptlrpc_rule_set sc_rset; /* fs general rules */ + struct list_head sc_tgts; /* target-specific rules */ +}; + +static struct mutex sptlrpc_conf_lock; +static LIST_HEAD(sptlrpc_confs); + +static void sptlrpc_conf_free_rsets(struct sptlrpc_conf *conf) +{ + struct sptlrpc_conf_tgt *conf_tgt, *conf_tgt_next; + + sptlrpc_rule_set_free(&conf->sc_rset); + + list_for_each_entry_safe(conf_tgt, conf_tgt_next, + &conf->sc_tgts, sct_list) { + sptlrpc_rule_set_free(&conf_tgt->sct_rset); + list_del(&conf_tgt->sct_list); + OBD_FREE_PTR(conf_tgt); + } + LASSERT(list_empty(&conf->sc_tgts)); + + conf->sc_updated = 0; + conf->sc_local = 0; +} + +static void sptlrpc_conf_free(struct sptlrpc_conf *conf) +{ + CDEBUG(D_SEC, "free sptlrpc conf %s\n", conf->sc_fsname); + + sptlrpc_conf_free_rsets(conf); + list_del(&conf->sc_list); + OBD_FREE_PTR(conf); +} + +static +struct sptlrpc_conf_tgt *sptlrpc_conf_get_tgt(struct sptlrpc_conf *conf, + const char *name, + int create) +{ + struct sptlrpc_conf_tgt *conf_tgt; + + list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) { + if (strcmp(conf_tgt->sct_name, name) == 0) + return conf_tgt; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf_tgt); + if (conf_tgt) { + strlcpy(conf_tgt->sct_name, name, 
sizeof(conf_tgt->sct_name)); + sptlrpc_rule_set_init(&conf_tgt->sct_rset); + list_add(&conf_tgt->sct_list, &conf->sc_tgts); + } + + return conf_tgt; +} + +static +struct sptlrpc_conf *sptlrpc_conf_get(const char *fsname, + int create) +{ + struct sptlrpc_conf *conf; + + list_for_each_entry(conf, &sptlrpc_confs, sc_list) { + if (strcmp(conf->sc_fsname, fsname) == 0) + return conf; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf); + if (conf == NULL) + return NULL; + + if (strlcpy(conf->sc_fsname, fsname, sizeof(conf->sc_fsname)) >= + sizeof(conf->sc_fsname)) { + OBD_FREE_PTR(conf); + return NULL; + } + sptlrpc_rule_set_init(&conf->sc_rset); + INIT_LIST_HEAD(&conf->sc_tgts); + list_add(&conf->sc_list, &sptlrpc_confs); + + CDEBUG(D_SEC, "create sptlrpc conf %s\n", conf->sc_fsname); + return conf; +} + +/** + * caller must hold conf_lock already. + */ +static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf, + const char *target, + struct sptlrpc_rule *rule) +{ + struct sptlrpc_conf_tgt *conf_tgt; + struct sptlrpc_rule_set *rule_set; + + /* fsname == target means general rules for the whole fs */ + if (strcmp(conf->sc_fsname, target) == 0) { + rule_set = &conf->sc_rset; + } else { + conf_tgt = sptlrpc_conf_get_tgt(conf, target, 1); + if (conf_tgt) { + rule_set = &conf_tgt->sct_rset; + } else { + CERROR("out of memory, can't merge rule!\n"); + return -ENOMEM; + } + } + + return sptlrpc_rule_set_merge(rule_set, rule); +} + +/** + * process one LCFG_SPTLRPC_CONF record. if \a conf is NULL, we + * find one through the target name in the record inside conf_lock; + * otherwise means caller already hold conf_lock. + */ +static int __sptlrpc_process_config(char *target, const char *fsname, + struct sptlrpc_rule *rule, + struct sptlrpc_conf *conf) +{ + int rc; + + ENTRY; + if (!conf) { + if (!fsname) + return -ENODEV; + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (!conf) { + CERROR("can't find conf\n"); + rc = -ENOMEM; + } else { + rc = sptlrpc_conf_merge_rule(conf, target, rule); + } + mutex_unlock(&sptlrpc_conf_lock); + } else { + LASSERT(mutex_is_locked(&sptlrpc_conf_lock)); + rc = sptlrpc_conf_merge_rule(conf, target, rule); + } + + if (!rc) + conf->sc_modified++; + + RETURN(rc); +} + +int sptlrpc_process_config(struct lustre_cfg *lcfg) +{ + char fsname[MTI_NAME_MAXLEN]; + struct sptlrpc_rule rule; + char *target, *param; + int rc; + + print_lustre_cfg(lcfg); + + target = lustre_cfg_string(lcfg, 1); + if (!target) { + CERROR("missing target name\n"); + return -EINVAL; + } + + param = lustre_cfg_string(lcfg, 2); + if (!param) { + CERROR("missing parameter\n"); + return -EINVAL; + } + + /* parse rule to make sure the format is correct */ + if (strncmp(param, PARAM_SRPC_FLVR, + sizeof(PARAM_SRPC_FLVR) - 1) != 0) { + CERROR("Invalid sptlrpc parameter: %s\n", param); + return -EINVAL; + } + param += sizeof(PARAM_SRPC_FLVR) - 1; + + CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param); + + /* + * Three types of targets exist for sptlrpc using conf_param + * 1. '_mgs' which targets mgc srpc settings. Treat it as + * as a special file system name. + * 2. target is a device which can be fsname-MDTXXXX or + * fsname-OSTXXXX. This can be verified by the function + * server_name2fsname. + * 3. If both above conditions are not meet then the target + * is a actual filesystem. 
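+ * For example (illustrative only, exact parameter spelling may differ):
+ * a conf_param such as "testfs-MDT0000.srpc.flavor.tcp.cli2mdt=plain"
+ * reaches this function with target "testfs-MDT0000" and, once the
+ * PARAM_SRPC_FLVR prefix is stripped, param "tcp.cli2mdt=plain".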
+ */ + if (server_name2fsname(target, fsname, NULL)) + strlcpy(fsname, target, sizeof(fsname)); + + rc = sptlrpc_parse_rule(param, &rule); + if (rc) + return rc; + + return __sptlrpc_process_config(target, fsname, &rule, NULL); +} +EXPORT_SYMBOL(sptlrpc_process_config); + +static int logname2fsname(const char *logname, char *buf, int buflen) +{ + char *ptr; + int len; + + ptr = strrchr(logname, '-'); + if (ptr == NULL || strcmp(ptr, "-sptlrpc")) { + CERROR("%s is not a sptlrpc config log\n", logname); + return -EINVAL; + } + + len = min((int) (ptr - logname), buflen - 1); + + memcpy(buf, logname, len); + buf[len] = '\0'; + return 0; +} + +void sptlrpc_conf_log_update_begin(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf) { + if (conf->sc_local) { + LASSERT(conf->sc_updated == 0); + sptlrpc_conf_free_rsets(conf); + } + conf->sc_modified = 0; + } + + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_update_begin); + +/** + * mark a config log as updated + */ +void sptlrpc_conf_log_update_end(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf) { + /* + * if original state is not updated, make sure the + * modified counter > 0 to enforce updating local copy. + */ + if (conf->sc_updated == 0) + conf->sc_modified++; + + conf->sc_updated = 1; + } + + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_update_end); + +void sptlrpc_conf_log_start(const char *logname) +{ + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + sptlrpc_conf_get(fsname, 1); + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_start); + +void sptlrpc_conf_log_stop(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf) + sptlrpc_conf_free(conf); + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_stop); + +static inline void flavor_set_flags(struct sptlrpc_flavor *sf, + enum lustre_sec_part from, + enum lustre_sec_part to, + unsigned int fl_udesc) +{ + /* + * null flavor doesn't need to set any flavor, and in fact + * we'd better not do that because everybody shares a single sec. 
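+ * (for the null flavor every import ends up using the single statically
+ * allocated null_sec, see sec_null.c.)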
+ */ + if (sf->sf_rpc == SPTLRPC_FLVR_NULL) + return; + + if (from == LUSTRE_SP_MDT) { + /* MDT->MDT; MDT->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_OST) { + /* CLI->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_MDT) { + /* CLI->MDT */ + if (fl_udesc && sf->sf_rpc != SPTLRPC_FLVR_NULL) + sf->sf_flags |= PTLRPC_SEC_FL_UDESC; + } + + /* Some flavors use a single uid (0) context */ + if (flvr_is_rootonly(sf->sf_rpc)) + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY; + + /* User descriptor might need to be cleared */ + if (flvr_allows_user_desc(sf->sf_rpc) == 0) + sf->sf_flags &= ~PTLRPC_SEC_FL_UDESC; +} + +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + struct lnet_nid *nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_conf *conf; + struct sptlrpc_conf_tgt *conf_tgt; + char name[MTI_NAME_MAXLEN]; + int len, rc = 0; + + obd_uuid2fsname(name, target->uuid, sizeof(name)); + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(name, 0); + if (conf == NULL) + goto out; + + /* convert uuid name (supposed end with _UUID) to target name */ + len = strlen(target->uuid); + LASSERT(len > 5); + memcpy(name, target->uuid, len - 5); + name[len - 5] = '\0'; + + conf_tgt = sptlrpc_conf_get_tgt(conf, name, 0); + if (conf_tgt) { + rc = sptlrpc_rule_set_choose(&conf_tgt->sct_rset, + from, to, lnet_nid_to_nid4(nid), sf); + if (rc) + goto out; + } + + rc = sptlrpc_rule_set_choose(&conf->sc_rset, from, to, + lnet_nid_to_nid4(nid), sf); +out: + mutex_unlock(&sptlrpc_conf_lock); + + if (rc == 0) + get_default_flavor(sf); + + flavor_set_flags(sf, from, to, 1); +} + +/** + * called by target devices, determine the expected flavor from + * certain peer (from, nid). + */ +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + if (sptlrpc_rule_set_choose(rset, from, LUSTRE_SP_ANY, nid, sf) == 0) + get_default_flavor(sf); +} + +#define SEC_ADAPT_DELAY (10) + +/** + * called by client devices, notify the sptlrpc config has changed and + * do import_sec_adapt later. + */ +void sptlrpc_conf_client_adapt(struct obd_device *obd) +{ + struct obd_import *imp; + int rc; + ENTRY; + + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) == 0); + CDEBUG(D_SEC, "obd %s\n", obd->u.cli.cl_target_uuid.uuid); + + /* serialize with connect/disconnect import */ + with_imp_locked_nested(obd, imp, rc, OBD_CLI_SEM_MDCOSC) { + write_lock(&imp->imp_sec_lock); + if (imp->imp_sec) + imp->imp_sec_expire = ktime_get_real_seconds() + + SEC_ADAPT_DELAY; + write_unlock(&imp->imp_sec_lock); + } + + EXIT; +} +EXPORT_SYMBOL(sptlrpc_conf_client_adapt); + +/** + * called by target devices, extract sptlrpc rules which applies to + * this target, to be used for future rpc flavor checking. 
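+ * The returned set is the filesystem-general rules merged with the rules
+ * specific to this target; general rules are merged first, so target
+ * specific entries override them.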
+ */ +int sptlrpc_conf_target_get_rules(struct obd_device *obd, + struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_conf *conf; + struct sptlrpc_conf_tgt *conf_tgt; + enum lustre_sec_part sp_dst; + char fsname[MTI_NAME_MAXLEN]; + int rc = 0; + ENTRY; + + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) { + sp_dst = LUSTRE_SP_MDT; + } else if (strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) == 0) { + sp_dst = LUSTRE_SP_OST; + } else { + CERROR("unexpected obd type %s\n", obd->obd_type->typ_name); + RETURN(-EINVAL); + } + + obd_uuid2fsname(fsname, obd->obd_uuid.uuid, sizeof(fsname)); + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf == NULL) { + CERROR("missing sptlrpc config log\n"); + rc = -EFAULT; + } else { + /* extract rule set for this target */ + conf_tgt = sptlrpc_conf_get_tgt(conf, obd->obd_name, 0); + + rc = sptlrpc_rule_set_extract(&conf->sc_rset, + conf_tgt ? &conf_tgt->sct_rset : NULL, + LUSTRE_SP_ANY, sp_dst, rset); + } + mutex_unlock(&sptlrpc_conf_lock); + + RETURN(rc); +} + +int sptlrpc_conf_init(void) +{ + mutex_init(&sptlrpc_conf_lock); + return 0; +} + +void sptlrpc_conf_fini(void) +{ + struct sptlrpc_conf *conf, *conf_next; + + mutex_lock(&sptlrpc_conf_lock); + list_for_each_entry_safe(conf, conf_next, &sptlrpc_confs, sc_list) + sptlrpc_conf_free(conf); + LASSERT(list_empty(&sptlrpc_confs)); + mutex_unlock(&sptlrpc_conf_lock); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c new file mode 100644 index 0000000000000..a5de86426a86c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c @@ -0,0 +1,110 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2017, Intel Corporation. 
+ */ + +#define DEBUG_SUBSYSTEM S_FILTER + +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +/* refine later and change to seqlock or simlar from libcfs */ +/* Debugging check only needed during development */ +#ifdef OBD_CTXT_DEBUG +# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC) +#else +# define ASSERT_CTXT_MAGIC(magic) do {} while(0) +#endif + +static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + struct path path; + struct path old_pwd; + + path.mnt = mnt; + path.dentry = dentry; + path_get(&path); + spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); + old_pwd = fs->pwd; + fs->pwd = path; + write_seqcount_end(&fs->seq); + spin_unlock(&fs->lock); + + if (old_pwd.dentry) + path_put(&old_pwd); +} + +/* push / pop to root of obd store */ +void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx) +{ + /* if there is underlaying dt_device then push_ctxt is not needed */ + if (new_ctx->dt != NULL) + return; + + ASSERT_CTXT_MAGIC(new_ctx->magic); + OBD_SET_CTXT_MAGIC(save); + + LASSERT(ll_d_count(current->fs->pwd.dentry)); + LASSERT(ll_d_count(new_ctx->pwd)); + save->pwd = dget(current->fs->pwd.dentry); + save->pwdmnt = mntget(current->fs->pwd.mnt); + save->umask = current_umask(); + + LASSERT(save->pwd); + LASSERT(save->pwdmnt); + LASSERT(new_ctx->pwd); + LASSERT(new_ctx->pwdmnt); + + current->fs->umask = 0; /* umask already applied on client */ + ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd); +} +EXPORT_SYMBOL(push_ctxt); + +void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx) +{ + /* if there is underlaying dt_device then pop_ctxt is not needed */ + if (new_ctx->dt != NULL) + return; + + ASSERT_CTXT_MAGIC(saved->magic); + + LASSERTF(current->fs->pwd.dentry == new_ctx->pwd, "%p != %p\n", + current->fs->pwd.dentry, new_ctx->pwd); + LASSERTF(current->fs->pwd.mnt == new_ctx->pwdmnt, "%p != %p\n", + current->fs->pwd.mnt, new_ctx->pwdmnt); + + ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd); + + dput(saved->pwd); + mntput(saved->pwdmnt); + current->fs->umask = saved->umask; +} +EXPORT_SYMBOL(pop_ctxt); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c new file mode 100644 index 0000000000000..c056aa4b97480 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c @@ -0,0 +1,198 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/sec_gc.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include + +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +#define SEC_GC_INTERVAL (30 * 60) + +static DEFINE_MUTEX(sec_gc_mutex); +static DEFINE_SPINLOCK(sec_gc_list_lock); +static DEFINE_SPINLOCK(sec_gc_ctx_list_lock); +static LIST_HEAD(sec_gc_list); +static LIST_HEAD(sec_gc_ctx_list); + +static atomic_t sec_gc_wait_del = ATOMIC_INIT(0); + +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec) +{ + LASSERT(sec->ps_policy->sp_cops->gc_ctx); + LASSERT(sec->ps_gc_interval > 0); + LASSERT(list_empty(&sec->ps_gc_list)); + + sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; + + spin_lock(&sec_gc_list_lock); + list_add_tail(&sec->ps_gc_list, &sec_gc_list); + spin_unlock(&sec_gc_list_lock); + + CDEBUG(D_SEC, "added sec %p(%s)\n", sec, sec->ps_policy->sp_name); +} + +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) +{ + if (list_empty(&sec->ps_gc_list)) + return; + + /* signal before list_del to make iteration in gc thread safe */ + atomic_inc(&sec_gc_wait_del); + + spin_lock(&sec_gc_list_lock); + list_del_init(&sec->ps_gc_list); + spin_unlock(&sec_gc_list_lock); + + /* barrier */ + mutex_lock(&sec_gc_mutex); + mutex_unlock(&sec_gc_mutex); + + atomic_dec(&sec_gc_wait_del); + + CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name); +} + +static void sec_gc_main(struct work_struct *ws); +static DECLARE_DELAYED_WORK(sec_gc_work, sec_gc_main); + +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(list_empty(&ctx->cc_gc_chain)); + + CDEBUG(D_SEC, "hand over ctx %p(%u->%s)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + spin_lock(&sec_gc_ctx_list_lock); + list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list); + spin_unlock(&sec_gc_ctx_list_lock); + + mod_delayed_work(system_wq, &sec_gc_work, 0); +} +EXPORT_SYMBOL(sptlrpc_gc_add_ctx); + +static void sec_process_ctx_list(void) +{ + struct ptlrpc_cli_ctx *ctx; + + spin_lock(&sec_gc_ctx_list_lock); + + while ((ctx = list_first_entry_or_null(&sec_gc_ctx_list, + struct ptlrpc_cli_ctx, + cc_gc_chain)) != NULL) { + list_del_init(&ctx->cc_gc_chain); + spin_unlock(&sec_gc_ctx_list_lock); + + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) == 1); + CDEBUG(D_SEC, "gc pick up ctx %p(%u->%s)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + sptlrpc_cli_ctx_put(ctx, 1); + + spin_lock(&sec_gc_ctx_list_lock); + } + + spin_unlock(&sec_gc_ctx_list_lock); +} + +static void sec_do_gc(struct ptlrpc_sec *sec) +{ + LASSERT(sec->ps_policy->sp_cops->gc_ctx); + + if (unlikely(sec->ps_gc_next == 0)) { + CDEBUG(D_SEC, "sec %p(%s) has 0 gc time\n", + sec, sec->ps_policy->sp_name); + return; + } + + CDEBUG(D_SEC, "check on sec %p(%s)\n", sec, sec->ps_policy->sp_name); + + if (sec->ps_gc_next > ktime_get_real_seconds()) + return; + + sec->ps_policy->sp_cops->gc_ctx(sec); + sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; +} + +static void sec_gc_main(struct work_struct *ws) +{ + struct ptlrpc_sec *sec; + + sec_process_ctx_list(); +again: + /* + * go through sec list do gc. + * FIXME here we iterate through the whole list each time which + * is not optimal. we perhaps want to use balanced binary tree + * to trace each sec as order of expiry time. 
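+ * (currently sec_do_gc() below just compares each entry's ps_gc_next
+ * deadline with the current time on every pass.)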
+ * another issue here is we wakeup as fixed interval instead of + * according to each sec's expiry time + */ + mutex_lock(&sec_gc_mutex); + list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { + /* + * if someone is waiting to be deleted, let it + * proceed as soon as possible. + */ + if (atomic_read(&sec_gc_wait_del)) { + CDEBUG(D_SEC, "deletion pending, start over\n"); + mutex_unlock(&sec_gc_mutex); + goto again; + } + + sec_do_gc(sec); + } + mutex_unlock(&sec_gc_mutex); + + /* check ctx list again before sleep */ + sec_process_ctx_list(); + schedule_delayed_work(&sec_gc_work, cfs_time_seconds(SEC_GC_INTERVAL)); +} + +int sptlrpc_gc_init(void) +{ + schedule_delayed_work(&sec_gc_work, cfs_time_seconds(SEC_GC_INTERVAL)); + return 0; +} + +void sptlrpc_gc_fini(void) +{ + cancel_delayed_work_sync(&sec_gc_work); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c new file mode 100644 index 0000000000000..fb25a28700b05 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c @@ -0,0 +1,381 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/sec_lproc.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static char *sec_flags2str(unsigned long flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_SEC_FL_REVERSE) + strlcat(buf, "reverse,", bufsize); + if (flags & PTLRPC_SEC_FL_ROOTONLY) + strlcat(buf, "rootonly,", bufsize); + if (flags & PTLRPC_SEC_FL_UDESC) + strlcat(buf, "udesc,", bufsize); + if (flags & PTLRPC_SEC_FL_BULK) + strlcat(buf, "bulk,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); + + return buf; +} + +static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *obd = seq->private; + struct client_obd *cli = &obd->u.cli; + struct ptlrpc_sec *sec = NULL; + char str[32]; + + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) == 0); + + if (cli->cl_import) + sec = sptlrpc_import_sec_ref(cli->cl_import); + if (sec == NULL) + goto out; + + sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)); + + seq_printf(seq, "rpc flavor: %s\n", + sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc)); + seq_printf(seq, "bulk flavor: %s\n", + sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str))); + seq_printf(seq, "flags: %s\n", + sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str))); + seq_printf(seq, "id: %d\n", sec->ps_id); + seq_printf(seq, "refcount: %d\n", + atomic_read(&sec->ps_refcount)); + seq_printf(seq, "nctx: %d\n", atomic_read(&sec->ps_nctx)); + seq_printf(seq, "gc internal %lld\n", sec->ps_gc_interval); + seq_printf(seq, "gc next %lld\n", + sec->ps_gc_interval ? 
+ (s64)(sec->ps_gc_next - ktime_get_real_seconds()) : 0ll); + + sptlrpc_sec_put(sec); +out: + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(sptlrpc_info_lprocfs); + +static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *obd = seq->private; + struct client_obd *cli = &obd->u.cli; + struct ptlrpc_sec *sec = NULL; + + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) == 0); + + if (cli->cl_import) + sec = sptlrpc_import_sec_ref(cli->cl_import); + if (sec == NULL) + goto out; + + if (sec->ps_policy->sp_cops->display) + sec->ps_policy->sp_cops->display(sec, seq); + + sptlrpc_sec_put(sec); +out: + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 16, 53, 0) +static ssize_t sepol_seq_write_old(struct obd_device *obd, + const char __user *buffer, + size_t count) +{ + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + struct sepol_downcall_data_old *param; + int size = sizeof(*param); + __u16 len; + int rc = 0; + + if (count < size) { + rc = -EINVAL; + CERROR("%s: invalid data count = %lu, size = %d: rc = %d\n", + obd->obd_name, (unsigned long) count, size, rc); + return rc; + } + + OBD_ALLOC(param, size); + if (param == NULL) + return -ENOMEM; + + if (copy_from_user(param, buffer, size)) { + rc = -EFAULT; + CERROR("%s: bad sepol data: rc = %d\n", obd->obd_name, rc); + GOTO(out, rc); + } + + if (param->sdd_magic != SEPOL_DOWNCALL_MAGIC_OLD) { + rc = -EINVAL; + CERROR("%s: sepol downcall bad params: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + + if (param->sdd_sepol_len == 0 || + param->sdd_sepol_len >= sizeof(imp->imp_sec->ps_sepol)) { + rc = -EINVAL; + CERROR("%s: invalid sepol data returned: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + len = param->sdd_sepol_len; /* save sdd_sepol_len */ + OBD_FREE(param, size); + size = offsetof(struct sepol_downcall_data_old, + sdd_sepol[len]); + + if (count < size) { + rc = -EINVAL; + CERROR("%s: invalid sepol count = %lu, size = %d: rc = %d\n", + obd->obd_name, (unsigned long) count, size, rc); + return rc; + } + + /* alloc again with real size */ + OBD_ALLOC(param, size); + if (param == NULL) + return -ENOMEM; + + if (copy_from_user(param, buffer, size)) { + rc = -EFAULT; + CERROR("%s: cannot copy sepol data: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + + spin_lock(&imp->imp_sec->ps_lock); + snprintf(imp->imp_sec->ps_sepol, param->sdd_sepol_len + 1, "%s", + param->sdd_sepol); + imp->imp_sec->ps_sepol_mtime = ktime_set(param->sdd_sepol_mtime, 0); + spin_unlock(&imp->imp_sec->ps_lock); + +out: + if (param != NULL) + OBD_FREE(param, size); + + return rc ? 
rc : count; +} +#endif + +static ssize_t +ldebugfs_sptlrpc_sepol_seq_write(struct file *file, const char __user *buffer, + size_t count, void *data) +{ + struct seq_file *seq = file->private_data; + struct obd_device *obd = seq->private; + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + struct sepol_downcall_data *param; + __u32 magic; + int size = sizeof(magic); + __u16 len; + int rc = 0; + + if (count < size) { + rc = -EINVAL; + CERROR("%s: invalid buffer count = %lu, size = %d: rc = %d\n", + obd->obd_name, (unsigned long) count, size, rc); + return rc; + } + + if (copy_from_user(&magic, buffer, size)) { + rc = -EFAULT; + CERROR("%s: bad sepol magic: rc = %d\n", obd->obd_name, rc); + return rc; + } + + if (magic != SEPOL_DOWNCALL_MAGIC) { +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 16, 53, 0) + if (magic == SEPOL_DOWNCALL_MAGIC_OLD) { + return sepol_seq_write_old(obd, buffer, count); + } +#endif + rc = -EINVAL; + CERROR("%s: sepol downcall bad magic '%#08x': rc = %d\n", + obd->obd_name, magic, rc); + return rc; + } + + size = sizeof(*param); + if (count < size) { + rc = -EINVAL; + CERROR("%s: invalid data count = %lu, size = %d: rc = %d\n", + obd->obd_name, (unsigned long) count, size, rc); + return rc; + } + + OBD_ALLOC(param, size); + if (param == NULL) + return -ENOMEM; + + if (copy_from_user(param, buffer, size)) { + rc = -EFAULT; + CERROR("%s: bad sepol data: rc = %d\n", obd->obd_name, rc); + GOTO(out, rc); + } + + if (param->sdd_sepol_len == 0 || + param->sdd_sepol_len >= sizeof(imp->imp_sec->ps_sepol)) { + rc = -EINVAL; + CERROR("%s: invalid sepol data returned: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + len = param->sdd_sepol_len; /* save sdd_sepol_len */ + OBD_FREE(param, size); + size = offsetof(struct sepol_downcall_data, + sdd_sepol[len]); + + /* alloc again with real size */ + OBD_ALLOC(param, size); + if (param == NULL) + return -ENOMEM; + + if (copy_from_user(param, buffer, size)) { + rc = -EFAULT; + CERROR("%s: cannot copy sepol data: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + + spin_lock(&imp->imp_sec->ps_lock); + snprintf(imp->imp_sec->ps_sepol, param->sdd_sepol_len + 1, "%s", + param->sdd_sepol); + imp->imp_sec->ps_sepol_mtime = ktime_set(param->sdd_sepol_mtime, 0); + spin_unlock(&imp->imp_sec->ps_lock); + +out: + if (param != NULL) + OBD_FREE(param, size); + + return rc ? 
rc : count; +} +LDEBUGFS_FOPS_WR_ONLY(srpc, sptlrpc_sepol); + +int sptlrpc_lprocfs_cliobd_attach(struct obd_device *obd) +{ + if (strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 && + strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) != 0 && + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME) != 0 && + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) != 0 && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) != 0) { + CERROR("can't register lproc for obd type %s\n", + obd->obd_type->typ_name); + return -EINVAL; + } + + debugfs_create_file("srpc_info", 0444, obd->obd_debugfs_entry, obd, + &sptlrpc_info_lprocfs_fops); + + debugfs_create_file("srpc_contexts", 0444, obd->obd_debugfs_entry, obd, + &sptlrpc_ctxs_lprocfs_fops); + + debugfs_create_file("srpc_sepol", 0200, obd->obd_debugfs_entry, obd, + &srpc_sptlrpc_sepol_fops); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach); + +LDEBUGFS_SEQ_FOPS_RO(sptlrpc_proc_enc_pool); + +static struct ldebugfs_vars sptlrpc_lprocfs_vars[] = { + { .name = "encrypt_page_pools", + .fops = &sptlrpc_proc_enc_pool_fops }, + { NULL } +}; + +struct dentry *sptlrpc_debugfs_dir; +EXPORT_SYMBOL(sptlrpc_debugfs_dir); + +struct proc_dir_entry *sptlrpc_lprocfs_dir; +EXPORT_SYMBOL(sptlrpc_lprocfs_dir); + +int sptlrpc_lproc_init(void) +{ + int rc; + + LASSERT(sptlrpc_debugfs_dir == NULL); + + sptlrpc_debugfs_dir = debugfs_create_dir("sptlrpc", + debugfs_lustre_root); + ldebugfs_add_vars(sptlrpc_debugfs_dir, sptlrpc_lprocfs_vars, NULL); + + sptlrpc_lprocfs_dir = lprocfs_register("sptlrpc", proc_lustre_root, + NULL, NULL); + if (IS_ERR_OR_NULL(sptlrpc_lprocfs_dir)) { + rc = PTR_ERR(sptlrpc_lprocfs_dir); + rc = sptlrpc_lprocfs_dir ? PTR_ERR(sptlrpc_lprocfs_dir) + : -ENOMEM; + sptlrpc_lprocfs_dir = NULL; + } + return 0; +} + +void sptlrpc_lproc_fini(void) +{ + debugfs_remove_recursive(sptlrpc_debugfs_dir); + sptlrpc_debugfs_dir = NULL; + + if (!IS_ERR_OR_NULL(sptlrpc_lprocfs_dir)) + lprocfs_remove(&sptlrpc_lprocfs_dir); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c new file mode 100644 index 0000000000000..4fb3a092c634a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c @@ -0,0 +1,451 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/sec_null.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + + +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static struct ptlrpc_sec_policy null_policy; +static struct ptlrpc_sec null_sec; +static struct ptlrpc_cli_ctx null_cli_ctx; +static struct ptlrpc_svc_ctx null_svc_ctx; + +/* + * we can temporarily use the topmost 8-bits of lm_secflvr to identify + * the source sec part. + */ +static inline +void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp) +{ + msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 24; +} + +static inline +enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg) +{ + return (msg->lm_secflvr >> 24) & 0xFF; +} + +static +int null_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + req->rq_reqbuf->lm_secflvr = SPTLRPC_FLVR_NULL; + + if (!req->rq_import->imp_dlm_fake) { + struct obd_device *obd = req->rq_import->imp_obd; + + null_encode_sec_part(req->rq_reqbuf, + obd->u.cli.cl_sp_me); + } + req->rq_reqdata_len = req->rq_reqlen; + return 0; +} + +static +int null_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + __u32 cksums, cksumc; + + LASSERT(req->rq_repdata); + + req->rq_repmsg = req->rq_repdata; + req->rq_replen = req->rq_repdata_len; + + if (req->rq_early) { + cksums = lustre_msg_get_cksum(req->rq_repdata); + cksumc = lustre_msg_calc_cksum(req->rq_repmsg, + MSG_PTLRPC_BODY_OFF); + + if (cksumc != cksums) { + CDEBUG(D_SEC, + "early reply checksum mismatch: %08x != %08x\n", + cksumc, cksums); + return -EINVAL; + } + } + + return 0; +} + +static +struct ptlrpc_sec *null_create_sec(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf) +{ + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL); + + /* + * general layer has take a module reference for us, because we never + * really destroy the sec, simply release the reference here. 
+ */ + sptlrpc_policy_put(&null_policy); + return &null_sec; +} + +static +void null_destroy_sec(struct ptlrpc_sec *sec) +{ + LASSERT(sec == &null_sec); +} + +static +struct ptlrpc_cli_ctx *null_lookup_ctx(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + atomic_inc(&null_cli_ctx.cc_refcount); + return &null_cli_ctx; +} + +static +int null_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid, int grace, + int force) +{ + return 0; +} + +static +int null_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + if (!req->rq_reqbuf) { + int alloc_size = size_roundup_power2(msgsize); + + LASSERT(!req->rq_pool); + OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_size); + if (!req->rq_reqbuf) + return -ENOMEM; + + req->rq_reqbuf_len = alloc_size; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= msgsize); + memset(req->rq_reqbuf, 0, msgsize); + } + + req->rq_reqmsg = req->rq_reqbuf; + return 0; +} + +static +void null_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + if (!req->rq_pool) { + LASSERTF(req->rq_reqmsg == req->rq_reqbuf, + "req %p: reqmsg %p is not reqbuf %p in null sec\n", + req, req->rq_reqmsg, req->rq_reqbuf); + LASSERTF(req->rq_reqbuf_len >= req->rq_reqlen, + "req %p: reqlen %d should smaller than buflen %d\n", + req, req->rq_reqlen, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } +} + +static +int null_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + /* add space for early replied */ + msgsize += lustre_msg_early_size; + + msgsize = size_roundup_power2(msgsize); + + OBD_ALLOC_LARGE(req->rq_repbuf, msgsize); + if (!req->rq_repbuf) + return -ENOMEM; + + req->rq_repbuf_len = msgsize; + return 0; +} + +static +void null_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + LASSERT(req->rq_repbuf); + + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; +} + +static +int null_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + struct lustre_msg *oldbuf = req->rq_reqmsg; + int oldsize, newmsg_size, alloc_size; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf == req->rq_reqmsg); + LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); + LASSERT(req->rq_reqlen == lustre_packed_msg_size(oldbuf)); + + /* compute new message size */ + oldsize = req->rq_reqbuf->lm_buflens[segment]; + req->rq_reqbuf->lm_buflens[segment] = newsize; + newmsg_size = lustre_packed_msg_size(oldbuf); + req->rq_reqbuf->lm_buflens[segment] = oldsize; + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newmsg_size); + + if (req->rq_reqbuf_len < newmsg_size) { + alloc_size = size_roundup_power2(newmsg_size); + + OBD_ALLOC_LARGE(newbuf, alloc_size); + if (newbuf == NULL) + return -ENOMEM; + + /* + * Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. 
See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there + */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + memcpy(newbuf, req->rq_reqbuf, req->rq_reqlen); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = req->rq_reqmsg = newbuf; + req->rq_reqbuf_len = alloc_size; + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + req->rq_reqlen = newmsg_size; + + return 0; +} + +static struct ptlrpc_svc_ctx null_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &null_policy, +}; + +static +int null_accept(struct ptlrpc_request *req) +{ + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_NULL); + + if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) { + CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc); + return SECSVC_DROP; + } + + req->rq_sp_from = null_decode_sec_part(req->rq_reqbuf); + + req->rq_reqmsg = req->rq_reqbuf; + req->rq_reqlen = req->rq_reqdata_len; + + req->rq_svc_ctx = &null_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + return SECSVC_OK; +} + +static +int null_alloc_rs(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_reply_state *rs; + int rs_size = sizeof(*rs) + msgsize; + + LASSERT(msgsize % 8 == 0); + + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + return -ENOMEM; + + rs->rs_size = rs_size; + } + + rs->rs_svc_ctx = req->rq_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = rs_size - sizeof(*rs); + rs->rs_msg = rs->rs_repbuf; + + req->rq_reply_state = rs; + return 0; +} + +static +void null_free_rs(struct ptlrpc_reply_state *rs) +{ + LASSERT_ATOMIC_GT(&rs->rs_svc_ctx->sc_refcount, 1); + atomic_dec(&rs->rs_svc_ctx->sc_refcount); + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); +} + +static +int null_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + + LASSERT(rs); + + rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL; + rs->rs_repdata_len = req->rq_replen; + req->rq_reply_off = 0; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = lustre_msg_early_size; + } else { + __u32 cksum; + + cksum = lustre_msg_calc_cksum(rs->rs_repbuf, + MSG_PTLRPC_BODY_OFF); + lustre_msg_set_cksum(rs->rs_repbuf, cksum); + } + + return 0; +} + +static struct ptlrpc_ctx_ops null_ctx_ops = { + .sign = null_ctx_sign, + .verify = null_ctx_verify, +}; + +static struct ptlrpc_sec_cops null_sec_cops = { + .create_sec = null_create_sec, + .destroy_sec = null_destroy_sec, + .lookup_ctx = null_lookup_ctx, + .flush_ctx_cache = null_flush_ctx_cache, + .alloc_reqbuf = null_alloc_reqbuf, + .alloc_repbuf = null_alloc_repbuf, + .free_reqbuf = null_free_reqbuf, + .free_repbuf = null_free_repbuf, + .enlarge_reqbuf = null_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops null_sec_sops = { + .accept = null_accept, + .alloc_rs = null_alloc_rs, + .authorize = null_authorize, + .free_rs = null_free_rs, +}; + +static struct ptlrpc_sec_policy null_policy = { + .sp_owner = THIS_MODULE, + .sp_name = "sec.null", + .sp_policy = SPTLRPC_POLICY_NULL, + .sp_cops = &null_sec_cops, + .sp_sops = &null_sec_sops, +}; + +static void null_init_internal(void) +{ 
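+ /*
+ * Everything the null policy needs is statically allocated: a single sec
+ * and a single client ctx shared by all users. The refcounts primed
+ * below keep both objects permanently busy so they are never released.
+ */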
+ static HLIST_HEAD(__list); + + null_sec.ps_policy = &null_policy; + atomic_set(&null_sec.ps_refcount, 1); /* always busy */ + null_sec.ps_id = -1; + null_sec.ps_import = NULL; + null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL; + null_sec.ps_flvr.sf_flags = 0; + null_sec.ps_part = LUSTRE_SP_ANY; + null_sec.ps_dying = 0; + spin_lock_init(&null_sec.ps_lock); + atomic_set(&null_sec.ps_nctx, 1); /* for "null_cli_ctx" */ + INIT_LIST_HEAD(&null_sec.ps_gc_list); + null_sec.ps_gc_interval = 0; + null_sec.ps_gc_next = 0; + + hlist_add_head(&null_cli_ctx.cc_cache, &__list); + atomic_set(&null_cli_ctx.cc_refcount, 1); /* for hash */ + null_cli_ctx.cc_sec = &null_sec; + null_cli_ctx.cc_ops = &null_ctx_ops; + null_cli_ctx.cc_expire = 0; + null_cli_ctx.cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_ETERNAL | + PTLRPC_CTX_UPTODATE; + null_cli_ctx.cc_vcred.vc_uid = 0; + spin_lock_init(&null_cli_ctx.cc_lock); + INIT_LIST_HEAD(&null_cli_ctx.cc_req_list); + INIT_LIST_HEAD(&null_cli_ctx.cc_gc_chain); +} + +int sptlrpc_null_init(void) +{ + int rc; + + null_init_internal(); + + rc = sptlrpc_register_policy(&null_policy); + if (rc) + CERROR("failed to register %s: %d\n", null_policy.sp_name, rc); + + return rc; +} + +void sptlrpc_null_fini(void) +{ + int rc; + + rc = sptlrpc_unregister_policy(&null_policy); + if (rc) + CERROR("failed to unregister %s: %d\n", null_policy.sp_name, + rc); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c new file mode 100644 index 0000000000000..7cf3a2c64af6d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c @@ -0,0 +1,1032 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/sec_plain.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + + +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +struct plain_sec { + struct ptlrpc_sec pls_base; + rwlock_t pls_lock; + struct ptlrpc_cli_ctx *pls_ctx; +}; + +static inline struct plain_sec *sec2plsec(struct ptlrpc_sec *sec) +{ + return container_of(sec, struct plain_sec, pls_base); +} + +static struct ptlrpc_sec_policy plain_policy; +static struct ptlrpc_ctx_ops plain_ctx_ops; +static struct ptlrpc_svc_ctx plain_svc_ctx; + +static unsigned int plain_at_offset; + +/* + * for simplicity, plain policy rpc use fixed layout. 
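+ * Every request/reply buffer carries PLAIN_PACK_SEGMENTS segments: the
+ * plain header, the lustre message itself, an optional user descriptor
+ * and an optional bulk security descriptor (see the PLAIN_PACK_*_OFF
+ * offsets below).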
+ */ +#define PLAIN_PACK_SEGMENTS (4) + +#define PLAIN_PACK_HDR_OFF (0) +#define PLAIN_PACK_MSG_OFF (1) +#define PLAIN_PACK_USER_OFF (2) +#define PLAIN_PACK_BULK_OFF (3) + +#define PLAIN_FL_USER (0x01) +#define PLAIN_FL_BULK (0x02) + +struct plain_header { + __u8 ph_ver; /* 0 */ + __u8 ph_flags; + __u8 ph_sp; /* source */ + __u8 ph_bulk_hash_alg; /* complete flavor desc */ + __u8 ph_pad[4]; +}; + +struct plain_bulk_token { + __u8 pbt_hash[8]; +}; + +#define PLAIN_BSD_SIZE \ + (sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token)) + +/* + * bulk checksum helpers + */ + +static int plain_unpack_bsd(struct lustre_msg *msg, int swabbed) +{ + struct ptlrpc_bulk_sec_desc *bsd; + + if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF, swabbed)) + return -EPROTO; + + bsd = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE); + if (bsd == NULL) { + CERROR("bulk sec desc has short size %d\n", + lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF)); + return -EPROTO; + } + + if (bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG) { + CERROR("invalid bulk svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; +} + +static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *token) +{ + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(token->pbt_hash, 0, sizeof(token->pbt_hash)); + return sptlrpc_get_bulk_checksum(desc, hash_alg, token->pbt_hash, + sizeof(token->pbt_hash)); +} + +static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *tokenr) +{ + struct plain_bulk_token tokenv; + int rc; + + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(&tokenv.pbt_hash, 0, sizeof(tokenv.pbt_hash)); + rc = sptlrpc_get_bulk_checksum(desc, hash_alg, tokenv.pbt_hash, + sizeof(tokenv.pbt_hash)); + if (rc) + return rc; + + if (memcmp(tokenr->pbt_hash, tokenv.pbt_hash, sizeof(tokenr->pbt_hash))) + return -EACCES; + return 0; +} + +static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) +{ + char *ptr; + unsigned int off, i; + + for (i = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_vec[i].bv_len == 0) + continue; + + ptr = kmap(desc->bd_vec[i].bv_page); + off = desc->bd_vec[i].bv_offset & ~PAGE_MASK; + ptr[off] ^= 0x1; + kunmap(desc->bd_vec[i].bv_page); + return; + } +} + +/* + * cli_ctx apis + */ + +static +int plain_ctx_refresh(struct ptlrpc_cli_ctx *ctx) +{ + /* should never reach here */ + LBUG(); + return 0; +} + +static +int plain_ctx_validate(struct ptlrpc_cli_ctx *ctx) +{ + return 0; +} + +static +int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + + ENTRY; + + msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_sp = ctx->cc_sec->ps_part; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_udesc) + phdr->ph_flags |= PLAIN_FL_USER; + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; + + req->rq_reqdata_len = lustre_msg_size_v2(msg->lm_bufcount, + msg->lm_buflens); + RETURN(0); +} + +static +int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_repdata; + struct plain_header *phdr; + bool swabbed; + + ENTRY; + if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) { + CERROR("unexpected reply buf count %u\n", msg->lm_bufcount); + RETURN(-EPROTO); + } + + 
swabbed = req_capsule_rep_need_swab(&req->rq_pill); + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + RETURN(-EPROTO); + } + + /* expect no user desc in reply */ + if (phdr->ph_flags & PLAIN_FL_USER) { + CERROR("Unexpected udesc flag in reply\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) { + CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg, + req->rq_flvr.u_bulk.hash.hash_alg); + RETURN(-EPROTO); + } + + if (unlikely(req->rq_early)) { + __u32 cksum = lustre_msg_calc_cksum(msg, PLAIN_PACK_MSG_OFF); + + if (cksum != msg->lm_cksum) { + CDEBUG(D_SEC, + "early reply checksum mismatch: %08x != %08x\n", + cpu_to_le32(cksum), msg->lm_cksum); + RETURN(-EINVAL); + } + } else { + /* + * whether we sent with bulk or not, we expect the same + * in reply, except for early reply + */ + if (!req->rq_early && + !equi(req->rq_pack_bulk == 1, + phdr->ph_flags & PLAIN_FL_BULK)) { + CERROR("%s bulk checksum in reply\n", + req->rq_pack_bulk ? "Missing" : "Unexpected"); + RETURN(-EPROTO); + } + + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg, swabbed)) + RETURN(-EPROTO); + } + } + + req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); + req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF); + RETURN(0); +} + +static +int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_bulk_sec_desc *bsd; + struct plain_bulk_token *token; + int rc; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + + bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + token = (struct plain_bulk_token *) bsd->bsd_data; + + bsd->bsd_version = 0; + bsd->bsd_flags = 0; + bsd->bsd_type = SPTLRPC_BULK_DEFAULT; + bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) + RETURN(0); + + if (req->rq_bulk_read) + RETURN(0); + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + token); + if (rc) { + CERROR("bulk write: failed to compute checksum: %d\n", rc); + } else { + /* + * for sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo + */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) && + req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL) + token->pbt_hash[0] ^= 0x1; + } + + return rc; +} + +static +int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_bulk_sec_desc *bsdv; + struct plain_bulk_token *tokenv; + int rc; + int i, nob; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS); + + bsdv = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + if (req->rq_bulk_write) { + if (bsdv->bsd_flags & BSD_FL_ERR) + return -EIO; + return 0; + } + + /* fix the actual data size */ + for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_vec[i].bv_len + + nob > desc->bd_nob_transferred) { + desc->bd_vec[i].bv_len = + desc->bd_nob_transferred - nob; + } + nob += desc->bd_vec[i].bv_len; + } + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if 
(rc) + CERROR("bulk read: client verify failed: %d\n", rc); + + return rc; +} + +/* + * sec apis + */ + +static +struct ptlrpc_cli_ctx *plain_sec_install_ctx(struct plain_sec *plsec) +{ + struct ptlrpc_cli_ctx *ctx, *ctx_new; + + OBD_ALLOC_PTR(ctx_new); + + write_lock(&plsec->pls_lock); + + ctx = plsec->pls_ctx; + if (ctx) { + atomic_inc(&ctx->cc_refcount); + + if (ctx_new) + OBD_FREE_PTR(ctx_new); + } else if (ctx_new) { + ctx = ctx_new; + + atomic_set(&ctx->cc_refcount, 1); /* for cache */ + ctx->cc_sec = &plsec->pls_base; + ctx->cc_ops = &plain_ctx_ops; + ctx->cc_expire = 0; + ctx->cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_UPTODATE; + ctx->cc_vcred.vc_uid = 0; + spin_lock_init(&ctx->cc_lock); + INIT_LIST_HEAD(&ctx->cc_req_list); + INIT_LIST_HEAD(&ctx->cc_gc_chain); + + plsec->pls_ctx = ctx; + atomic_inc(&plsec->pls_base.ps_nctx); + atomic_inc(&plsec->pls_base.ps_refcount); + + atomic_inc(&ctx->cc_refcount); /* for caller */ + } + + write_unlock(&plsec->pls_lock); + + return ctx; +} + +static +void plain_destroy_sec(struct ptlrpc_sec *sec) +{ + struct plain_sec *plsec = sec2plsec(sec); + + ENTRY; + + LASSERT(sec->ps_policy == &plain_policy); + LASSERT(sec->ps_import); + LASSERT(atomic_read(&sec->ps_refcount) == 0); + LASSERT(atomic_read(&sec->ps_nctx) == 0); + LASSERT(plsec->pls_ctx == NULL); + + class_import_put(sec->ps_import); + + OBD_FREE_PTR(plsec); + EXIT; +} + +static +void plain_kill_sec(struct ptlrpc_sec *sec) +{ + sec->ps_dying = 1; +} + +static +struct ptlrpc_sec *plain_create_sec(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf) +{ + struct plain_sec *plsec; + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + + ENTRY; + + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN); + + OBD_ALLOC_PTR(plsec); + if (plsec == NULL) + RETURN(NULL); + + /* + * initialize plain_sec + */ + rwlock_init(&plsec->pls_lock); + plsec->pls_ctx = NULL; + + sec = &plsec->pls_base; + sec->ps_policy = &plain_policy; + atomic_set(&sec->ps_refcount, 0); + atomic_set(&sec->ps_nctx, 0); + sec->ps_id = sptlrpc_get_next_secid(); + sec->ps_import = class_import_get(imp); + sec->ps_flvr = *sf; + spin_lock_init(&sec->ps_lock); + INIT_LIST_HEAD(&sec->ps_gc_list); + sec->ps_gc_interval = 0; + sec->ps_gc_next = 0; + + /* install ctx immediately if this is a reverse sec */ + if (svc_ctx) { + ctx = plain_sec_install_ctx(plsec); + if (ctx == NULL) { + plain_destroy_sec(sec); + RETURN(NULL); + } + sptlrpc_cli_ctx_put(ctx, 1); + } + + RETURN(sec); +} + +static +struct ptlrpc_cli_ctx *plain_lookup_ctx(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct plain_sec *plsec = sec2plsec(sec); + struct ptlrpc_cli_ctx *ctx; + + ENTRY; + + read_lock(&plsec->pls_lock); + ctx = plsec->pls_ctx; + if (ctx) + atomic_inc(&ctx->cc_refcount); + read_unlock(&plsec->pls_lock); + + if (unlikely(ctx == NULL)) + ctx = plain_sec_install_ctx(plsec); + + RETURN(ctx); +} + +static +void plain_release_ctx(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, int sync) +{ + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(ctx->cc_sec == sec); + + OBD_FREE_PTR(ctx); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static +int plain_flush_ctx_cache(struct ptlrpc_sec *sec, + uid_t uid, int grace, int force) +{ + struct plain_sec *plsec = sec2plsec(sec); + struct ptlrpc_cli_ctx *ctx; + + ENTRY; + + /* do nothing unless caller want to flush for 'all' 
*/ + if (uid != -1) + RETURN(0); + + write_lock(&plsec->pls_lock); + ctx = plsec->pls_ctx; + plsec->pls_ctx = NULL; + write_unlock(&plsec->pls_lock); + + if (ctx) + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); +} + +static +int plain_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int alloc_len; + + ENTRY; + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_udesc) + buflens[PLAIN_PACK_USER_OFF] = sptlrpc_current_user_desc_size(); + + if (req->rq_pack_bulk) { + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + } + + alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + if (!req->rq_reqbuf) { + LASSERT(!req->rq_pool); + + alloc_len = size_roundup_power2(alloc_len); + OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_len); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); + + req->rq_reqbuf_len = alloc_len; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= alloc_len); + memset(req->rq_reqbuf, 0, alloc_len); + } + + lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0); + + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_reqbuf, PLAIN_PACK_USER_OFF); + + RETURN(0); +} + +static +void plain_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + ENTRY; + if (!req->rq_pool) { + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + EXIT; +} + +static +int plain_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int alloc_len; + + ENTRY; + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_bulk) { + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + } + + alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + /* add space for early reply */ + alloc_len += plain_at_offset; + + alloc_len = size_roundup_power2(alloc_len); + + OBD_ALLOC_LARGE(req->rq_repbuf, alloc_len); + if (!req->rq_repbuf) + RETURN(-ENOMEM); + + req->rq_repbuf_len = alloc_len; + RETURN(0); +} + +static +void plain_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + ENTRY; + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + EXIT; +} + +static +int plain_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + int oldsize; + int newmsg_size, newbuf_size; + + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); + LASSERT(lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0) == + req->rq_reqmsg); + + /* compute new embedded msg size. */ + oldsize = req->rq_reqmsg->lm_buflens[segment]; + req->rq_reqmsg->lm_buflens[segment] = newsize; + newmsg_size = lustre_msg_size_v2(req->rq_reqmsg->lm_bufcount, + req->rq_reqmsg->lm_buflens); + req->rq_reqmsg->lm_buflens[segment] = oldsize; + + /* compute new wrapper msg size. 
*/ + oldsize = req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF]; + req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = newmsg_size; + newbuf_size = lustre_msg_size_v2(req->rq_reqbuf->lm_bufcount, + req->rq_reqbuf->lm_buflens); + req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = oldsize; + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size); + + if (req->rq_reqbuf_len < newbuf_size) { + newbuf_size = size_roundup_power2(newbuf_size); + + OBD_ALLOC_LARGE(newbuf, newbuf_size); + if (newbuf == NULL) + RETURN(-ENOMEM); + + /* + * Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there + */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + + memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = newbuf; + req->rq_reqbuf_len = newbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, + PLAIN_PACK_MSG_OFF, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, + newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + + req->rq_reqlen = newmsg_size; + RETURN(0); +} + +/* + * service apis + */ + +static struct ptlrpc_svc_ctx plain_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &plain_policy, +}; + +static int plain_accept(struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + bool swabbed; + + ENTRY; + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_PLAIN); + + if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) || + SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) { + CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc); + RETURN(SECSVC_DROP); + } + + if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) { + CERROR("unexpected request buf count %u\n", msg->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = req_capsule_req_need_swab(&req->rq_pill); + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) { + CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg); + RETURN(-EPROTO); + } + + req->rq_sp_from = phdr->ph_sp; + req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg; + + if (phdr->ph_flags & PLAIN_FL_USER) { + if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF, + swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(SECSVC_DROP); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0); + } + + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg, swabbed)) + RETURN(SECSVC_DROP); + + req->rq_pack_bulk = 1; + } + + req->rq_reqmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); + req->rq_reqlen = msg->lm_buflens[PLAIN_PACK_MSG_OFF]; + + req->rq_svc_ctx = &plain_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + RETURN(SECSVC_OK); +} + +static +int plain_alloc_rs(struct ptlrpc_request *req, int msgsize) +{ + struct 
ptlrpc_reply_state *rs; + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int rs_size = sizeof(*rs); + + ENTRY; + + LASSERT(msgsize % 8 == 0); + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write)) + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + + rs_size += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + RETURN(-ENOMEM); + + rs->rs_size = rs_size; + } + + rs->rs_svc_ctx = req->rq_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = rs_size - sizeof(*rs); + + lustre_init_msg_v2(rs->rs_repbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); + rs->rs_msg = lustre_msg_buf_v2(rs->rs_repbuf, PLAIN_PACK_MSG_OFF, 0); + + req->rq_reply_state = rs; + RETURN(0); +} + +static +void plain_free_rs(struct ptlrpc_reply_state *rs) +{ + ENTRY; + + LASSERT(atomic_read(&rs->rs_svc_ctx->sc_refcount) > 1); + atomic_dec(&rs->rs_svc_ctx->sc_refcount); + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); + EXIT; +} + +static +int plain_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct lustre_msg_v2 *msg = rs->rs_repbuf; + struct plain_header *phdr; + int len; + + ENTRY; + + LASSERT(rs); + LASSERT(msg); + + if (req->rq_replen != msg->lm_buflens[PLAIN_PACK_MSG_OFF]) + len = lustre_shrink_msg(msg, PLAIN_PACK_MSG_OFF, + req->rq_replen, 1); + else + len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + + msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; + + rs->rs_repdata_len = len; + req->rq_reply_off = 0; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = plain_at_offset; + } else { + msg->lm_cksum = lustre_msg_calc_cksum(msg, PLAIN_PACK_MSG_OFF); + } + + RETURN(0); +} + +static +int plain_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenr; + int rc; + + LASSERT(req->rq_bulk_write); + LASSERT(req->rq_pack_bulk); + + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + tokenr = (struct plain_bulk_token *) bsdr->bsd_data; + bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenr); + if (rc) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("bulk write: server verify failed: %d\n", rc); + } + + return rc; +} + +static +int plain_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenv; + int rc; + + LASSERT(req->rq_bulk_read); + LASSERT(req->rq_pack_bulk); + + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + bsdv = 
lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if (rc) { + CERROR("bulk read: server failed to compute checksum: %d\n", + rc); + } else { + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) + corrupt_bulk_data(desc); + } + + return rc; +} + +static struct ptlrpc_ctx_ops plain_ctx_ops = { + .refresh = plain_ctx_refresh, + .validate = plain_ctx_validate, + .sign = plain_ctx_sign, + .verify = plain_ctx_verify, + .wrap_bulk = plain_cli_wrap_bulk, + .unwrap_bulk = plain_cli_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops plain_sec_cops = { + .create_sec = plain_create_sec, + .destroy_sec = plain_destroy_sec, + .kill_sec = plain_kill_sec, + .lookup_ctx = plain_lookup_ctx, + .release_ctx = plain_release_ctx, + .flush_ctx_cache = plain_flush_ctx_cache, + .alloc_reqbuf = plain_alloc_reqbuf, + .free_reqbuf = plain_free_reqbuf, + .alloc_repbuf = plain_alloc_repbuf, + .free_repbuf = plain_free_repbuf, + .enlarge_reqbuf = plain_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops plain_sec_sops = { + .accept = plain_accept, + .alloc_rs = plain_alloc_rs, + .authorize = plain_authorize, + .free_rs = plain_free_rs, + .unwrap_bulk = plain_svc_unwrap_bulk, + .wrap_bulk = plain_svc_wrap_bulk, +}; + +static struct ptlrpc_sec_policy plain_policy = { + .sp_owner = THIS_MODULE, + .sp_name = "plain", + .sp_policy = SPTLRPC_POLICY_PLAIN, + .sp_cops = &plain_sec_cops, + .sp_sops = &plain_sec_sops, +}; + +int sptlrpc_plain_init(void) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int rc; + + buflens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size; + plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + rc = sptlrpc_register_policy(&plain_policy); + if (rc) + CERROR("failed to register: %d\n", rc); + + return rc; +} + +void sptlrpc_plain_fini(void) +{ + int rc; + + rc = sptlrpc_unregister_policy(&plain_policy); + if (rc) + CERROR("cannot unregister: %d\n", rc); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/service.c b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c new file mode 100644 index 0000000000000..16fcaba8f03c5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c @@ -0,0 +1,3672 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include + +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" +#include + +/* The following are visible and mutable through /sys/module/ptlrpc */ +int test_req_buffer_pressure = 0; +module_param(test_req_buffer_pressure, int, 0444); +MODULE_PARM_DESC(test_req_buffer_pressure, "set non-zero to put pressure on request buffer pools"); +module_param(at_min, int, 0644); +MODULE_PARM_DESC(at_min, "Adaptive timeout minimum (sec)"); +module_param(at_max, int, 0644); +MODULE_PARM_DESC(at_max, "Adaptive timeout maximum (sec)"); +module_param(at_history, int, 0644); +MODULE_PARM_DESC(at_history, + "Adaptive timeouts remember the slowest event that took place within this period (sec)"); +module_param(at_early_margin, int, 0644); +MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply"); +module_param(at_extra, int, 0644); +MODULE_PARM_DESC(at_extra, "How much extra time to give with each early reply"); + +/* forward ref */ +static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt); +static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req); +static void ptlrpc_at_remove_timed(struct ptlrpc_request *req); +static int ptlrpc_start_threads(struct ptlrpc_service *svc); +static int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait); + +/** Holds a list of all PTLRPC services */ +LIST_HEAD(ptlrpc_all_services); +/** Used to protect the \e ptlrpc_all_services list */ +struct mutex ptlrpc_all_services_mutex; + +static struct ptlrpc_request_buffer_desc * +ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request_buffer_desc *rqbd; + + OBD_CPT_ALLOC_PTR(rqbd, svc->srv_cptable, svcpt->scp_cpt); + if (rqbd == NULL) + return NULL; + + rqbd->rqbd_svcpt = svcpt; + rqbd->rqbd_refcount = 0; + rqbd->rqbd_cbid.cbid_fn = request_in_callback; + rqbd->rqbd_cbid.cbid_arg = rqbd; + INIT_LIST_HEAD(&rqbd->rqbd_reqs); + OBD_CPT_ALLOC_LARGE(rqbd->rqbd_buffer, svc->srv_cptable, + svcpt->scp_cpt, svc->srv_buf_size); + if (rqbd->rqbd_buffer == NULL) { + OBD_FREE_PTR(rqbd); + return NULL; + } + + spin_lock(&svcpt->scp_lock); + list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); + svcpt->scp_nrqbds_total++; + spin_unlock(&svcpt->scp_lock); + + return rqbd; +} + +static void ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + + LASSERT(rqbd->rqbd_refcount == 0); + LASSERT(list_empty(&rqbd->rqbd_reqs)); + + spin_lock(&svcpt->scp_lock); + list_del(&rqbd->rqbd_list); + svcpt->scp_nrqbds_total--; + spin_unlock(&svcpt->scp_lock); + + OBD_FREE_LARGE(rqbd->rqbd_buffer, svcpt->scp_service->srv_buf_size); + OBD_FREE_PTR(rqbd); +} + +static int ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request_buffer_desc *rqbd; + int rc = 0; + int i; + + if (svcpt->scp_rqbd_allocating) + goto try_post; + + spin_lock(&svcpt->scp_lock); + /* check again with lock */ + if (svcpt->scp_rqbd_allocating) { + /* NB: we might allow more than one thread in the future */ + LASSERT(svcpt->scp_rqbd_allocating == 1); + spin_unlock(&svcpt->scp_lock); + goto try_post; + } + + svcpt->scp_rqbd_allocating++; + spin_unlock(&svcpt->scp_lock); + + + for (i = 0; i < svc->srv_nbuf_per_group; i++) { + /* + * NB: another thread 
might have recycled enough rqbds, we + * need to make sure it wouldn't over-allocate, see LU-1212. + */ + if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group || + (svc->srv_nrqbds_max != 0 && + svcpt->scp_nrqbds_total > svc->srv_nrqbds_max)) + break; + + rqbd = ptlrpc_alloc_rqbd(svcpt); + + if (rqbd == NULL) { + CERROR("%s: Can't allocate request buffer\n", + svc->srv_name); + rc = -ENOMEM; + break; + } + } + + spin_lock(&svcpt->scp_lock); + + LASSERT(svcpt->scp_rqbd_allocating == 1); + svcpt->scp_rqbd_allocating--; + + spin_unlock(&svcpt->scp_lock); + + CDEBUG(D_RPCTRACE, + "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n", + svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted, + svcpt->scp_nrqbds_total, rc); + + try_post: + if (post && rc == 0) + rc = ptlrpc_server_post_idle_rqbds(svcpt); + + return rc; +} + +/** + * Part of Rep-Ack logic. + * Puts a lock and its mode into reply state assotiated to request reply. + */ +void ptlrpc_save_lock(struct ptlrpc_request *req, struct lustre_handle *lock, + int mode, bool no_ack, bool convert_lock) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + int idx; + + LASSERT(rs != NULL); + LASSERT(rs->rs_nlocks < RS_MAX_LOCKS); + + idx = rs->rs_nlocks++; + rs->rs_locks[idx] = *lock; + rs->rs_modes[idx] = mode; + rs->rs_difficult = 1; + rs->rs_no_ack = no_ack; + rs->rs_convert_lock = convert_lock; +} +EXPORT_SYMBOL(ptlrpc_save_lock); + + +struct ptlrpc_hr_partition; + +struct ptlrpc_hr_thread { + int hrt_id; /* thread ID */ + spinlock_t hrt_lock; + wait_queue_head_t hrt_waitq; + struct list_head hrt_queue; + struct ptlrpc_hr_partition *hrt_partition; +}; + +struct ptlrpc_hr_partition { + /* # of started threads */ + atomic_t hrp_nstarted; + /* # of stopped threads */ + atomic_t hrp_nstopped; + /* cpu partition id */ + int hrp_cpt; + /* round-robin rotor for choosing thread */ + int hrp_rotor; + /* total number of threads on this partition */ + int hrp_nthrs; + /* threads table */ + struct ptlrpc_hr_thread *hrp_thrs; +}; + +#define HRT_RUNNING 0 +#define HRT_STOPPING 1 + +struct ptlrpc_hr_service { + /* CPU partition table, it's just cfs_cpt_tab for now */ + struct cfs_cpt_table *hr_cpt_table; + /** controller sleep waitq */ + wait_queue_head_t hr_waitq; + unsigned int hr_stopping; + /** roundrobin rotor for non-affinity service */ + unsigned int hr_rotor; + /* partition data */ + struct ptlrpc_hr_partition **hr_partitions; +}; + +struct rs_batch { + struct list_head rsb_replies; + unsigned int rsb_n_replies; + struct ptlrpc_service_part *rsb_svcpt; +}; + +/** reply handling service. */ +static struct ptlrpc_hr_service ptlrpc_hr; + +/** + * maximum mumber of replies scheduled in one batch + */ +#define MAX_SCHEDULED 256 + +/** + * Initialize a reply batch. + * + * \param b batch + */ +static void rs_batch_init(struct rs_batch *b) +{ + memset(b, 0, sizeof(*b)); + INIT_LIST_HEAD(&b->rsb_replies); +} + +/** + * Choose an hr thread to dispatch requests to. 
+ */ +static +struct ptlrpc_hr_thread *ptlrpc_hr_select(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_hr_partition *hrp; + unsigned int rotor; + + if (svcpt->scp_cpt >= 0 && + svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) { + /* directly match partition */ + hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt]; + + } else { + rotor = ptlrpc_hr.hr_rotor++; + rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table); + + hrp = ptlrpc_hr.hr_partitions[rotor]; + } + + rotor = hrp->hrp_rotor++; + return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs]; +} + +/** + * Dispatch all replies accumulated in the batch to one from + * dedicated reply handling threads. + * + * \param b batch + */ +static void rs_batch_dispatch(struct rs_batch *b) +{ + if (b->rsb_n_replies != 0) { + struct ptlrpc_hr_thread *hrt; + + hrt = ptlrpc_hr_select(b->rsb_svcpt); + + spin_lock(&hrt->hrt_lock); + list_splice_init(&b->rsb_replies, &hrt->hrt_queue); + spin_unlock(&hrt->hrt_lock); + + wake_up(&hrt->hrt_waitq); + b->rsb_n_replies = 0; + } +} + +/** + * Add a reply to a batch. + * Add one reply object to a batch, schedule batched replies if overload. + * + * \param b batch + * \param rs reply + */ +static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + if (svcpt != b->rsb_svcpt || b->rsb_n_replies >= MAX_SCHEDULED) { + if (b->rsb_svcpt != NULL) { + rs_batch_dispatch(b); + spin_unlock(&b->rsb_svcpt->scp_rep_lock); + } + spin_lock(&svcpt->scp_rep_lock); + b->rsb_svcpt = svcpt; + } + spin_lock(&rs->rs_lock); + rs->rs_scheduled_ever = 1; + if (rs->rs_scheduled == 0) { + list_move(&rs->rs_list, &b->rsb_replies); + rs->rs_scheduled = 1; + b->rsb_n_replies++; + } + rs->rs_committed = 1; + spin_unlock(&rs->rs_lock); +} + +/** + * Reply batch finalization. + * Dispatch remaining replies from the batch + * and release remaining spinlock. + * + * \param b batch + */ +static void rs_batch_fini(struct rs_batch *b) +{ + if (b->rsb_svcpt != NULL) { + rs_batch_dispatch(b); + spin_unlock(&b->rsb_svcpt->scp_rep_lock); + } +} + +#define DECLARE_RS_BATCH(b) struct rs_batch b + + +/** + * Put reply state into a queue for processing because we received + * ACK from the client + */ +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_hr_thread *hrt; + + ENTRY; + + LASSERT(list_empty(&rs->rs_list)); + + hrt = ptlrpc_hr_select(rs->rs_svcpt); + + spin_lock(&hrt->hrt_lock); + list_add_tail(&rs->rs_list, &hrt->hrt_queue); + spin_unlock(&hrt->hrt_lock); + + wake_up(&hrt->hrt_waitq); + EXIT; +} + +void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs) +{ + ENTRY; + + assert_spin_locked(&rs->rs_svcpt->scp_rep_lock); + assert_spin_locked(&rs->rs_lock); + LASSERT(rs->rs_difficult); + rs->rs_scheduled_ever = 1; /* flag any notification attempt */ + + if (rs->rs_scheduled) { /* being set up or already notified */ + EXIT; + return; + } + + rs->rs_scheduled = 1; + list_del_init(&rs->rs_list); + ptlrpc_dispatch_difficult_reply(rs); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply); + +void ptlrpc_commit_replies(struct obd_export *exp) +{ + struct ptlrpc_reply_state *rs, *nxt; + DECLARE_RS_BATCH(batch); + + ENTRY; + + rs_batch_init(&batch); + /* + * Find any replies that have been committed and get their service + * to attend to complete them. + */ + + /* CAVEAT EMPTOR: spinlock ordering!!! 
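A hypothetical standalone sketch of the two-level round-robin that ptlrpc_hr_select() performs above: pick a partition (only when the service is not CPT-affine), then rotate over that partition's threads. The real code tolerates racy rotor increments; the sketch is single-threaded for clarity, and the sizes are made up:

#include <stdio.h>

#define NPARTS   2
#define NTHREADS 3

static unsigned int part_rotor;		/* stands in for hr_rotor   */
static unsigned int thr_rotor[NPARTS];	/* stands in for hrp_rotor  */

static void pick(int affine_cpt, int *part, int *thr)
{
	*part = (affine_cpt >= 0) ? affine_cpt
				  : (int)(part_rotor++ % NPARTS);
	*thr = (int)(thr_rotor[*part]++ % NTHREADS);
}

int main(void)
{
	for (int i = 0; i < 5; i++) {
		int p, t;

		pick(-1, &p, &t);	/* -1: no CPT affinity, use the rotor */
		printf("reply batch %d -> partition %d, thread %d\n", i, p, t);
	}
	return 0;
}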
*/ + spin_lock(&exp->exp_uncommitted_replies_lock); + list_for_each_entry_safe(rs, nxt, &exp->exp_uncommitted_replies, + rs_obd_list) { + LASSERT(rs->rs_difficult); + /* VBR: per-export last_committed */ + LASSERT(rs->rs_export); + if (rs->rs_transno <= exp->exp_last_committed) { + list_del_init(&rs->rs_obd_list); + rs_batch_add(&batch, rs); + } + } + spin_unlock(&exp->exp_uncommitted_replies_lock); + rs_batch_fini(&batch); + EXIT; +} + +static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_request_buffer_desc *rqbd; + int rc; + int posted = 0; + + for (;;) { + spin_lock(&svcpt->scp_lock); + + if (list_empty(&svcpt->scp_rqbd_idle)) { + spin_unlock(&svcpt->scp_lock); + return posted; + } + + rqbd = list_first_entry(&svcpt->scp_rqbd_idle, + struct ptlrpc_request_buffer_desc, + rqbd_list); + + /* assume we will post successfully */ + svcpt->scp_nrqbds_posted++; + list_move(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted); + + spin_unlock(&svcpt->scp_lock); + + rc = ptlrpc_register_rqbd(rqbd); + if (rc != 0) + break; + + posted = 1; + } + + spin_lock(&svcpt->scp_lock); + + svcpt->scp_nrqbds_posted--; + list_move_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); + + /* + * Don't complain if no request buffers are posted right now; LNET + * won't drop requests because we set the portal lazy! + */ + + spin_unlock(&svcpt->scp_lock); + + return -1; +} + +static void ptlrpc_at_timer(cfs_timer_cb_arg_t data) +{ + struct ptlrpc_service_part *svcpt; + + svcpt = cfs_from_timer(svcpt, data, scp_at_timer); + + svcpt->scp_at_check = 1; + svcpt->scp_at_checktime = ktime_get(); + wake_up(&svcpt->scp_waitq); +} + +static void ptlrpc_server_nthreads_check(struct ptlrpc_service *svc, + struct ptlrpc_service_conf *conf) +{ + struct ptlrpc_service_thr_conf *tc = &conf->psc_thr; + unsigned int init; + unsigned int total; + unsigned int nthrs; + int weight; + + /* + * Common code for estimating & validating threads number. + * CPT affinity service could have percpt thread-pool instead + * of a global thread-pool, which means user might not always + * get the threads number they give it in conf::tc_nthrs_user + * even they did set. It's because we need to validate threads + * number for each CPT to guarantee each pool will have enough + * threads to keep the service healthy. + */ + init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL); + init = max_t(int, init, tc->tc_nthrs_init); + + /* + * NB: please see comments in lustre_lnet.h for definition + * details of these members + */ + LASSERT(tc->tc_nthrs_max != 0); + + if (tc->tc_nthrs_user != 0) { + /* + * In case there is a reason to test a service with many + * threads, we give a less strict check here, it can + * be up to 8 * nthrs_max + */ + total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user); + nthrs = total / svc->srv_ncpts; + init = max(init, nthrs); + goto out; + } + + total = tc->tc_nthrs_max; + if (tc->tc_nthrs_base == 0) { + /* + * don't care about base threads number per partition, + * this is most for non-affinity service + */ + nthrs = total / svc->srv_ncpts; + goto out; + } + + nthrs = tc->tc_nthrs_base; + if (svc->srv_ncpts == 1) { + int i; + + /* + * NB: Increase the base number if it's single partition + * and total number of cores/HTs is larger or equal to 4. 
+ * result will always < 2 * nthrs_base + */ + weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY); + for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */ + (tc->tc_nthrs_base >> i) != 0; i++) + nthrs += tc->tc_nthrs_base >> i; + } + + if (tc->tc_thr_factor != 0) { + int factor = tc->tc_thr_factor; + const int fade = 4; + + /* + * User wants to increase number of threads with for + * each CPU core/HT, most likely the factor is larger than + * one thread/core because service threads are supposed to + * be blocked by lock or wait for IO. + */ + /* + * Amdahl's law says that adding processors wouldn't give + * a linear increasing of parallelism, so it's nonsense to + * have too many threads no matter how many cores/HTs + * there are. + */ + preempt_disable(); + if (cpumask_weight + (topology_sibling_cpumask(smp_processor_id())) > 1) { + /* weight is # of HTs */ + /* depress thread factor for hyper-thread */ + factor = factor - (factor >> 1) + (factor >> 3); + } + preempt_enable(); + + weight = cfs_cpt_weight(svc->srv_cptable, 0); + + for (; factor > 0 && weight > 0; factor--, weight -= fade) + nthrs += min(weight, fade) * factor; + } + + if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) { + nthrs = max(tc->tc_nthrs_base, + tc->tc_nthrs_max / svc->srv_ncpts); + } + out: + nthrs = max(nthrs, tc->tc_nthrs_init); + svc->srv_nthrs_cpt_limit = nthrs; + svc->srv_nthrs_cpt_init = init; + + if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) { + CDEBUG(D_OTHER, + "%s: This service may have more threads (%d) than the given soft limit (%d)\n", + svc->srv_name, nthrs * svc->srv_ncpts, + tc->tc_nthrs_max); + } +} + +/** + * Initialize percpt data for a service + */ +static int ptlrpc_service_part_init(struct ptlrpc_service *svc, + struct ptlrpc_service_part *svcpt, int cpt) +{ + struct ptlrpc_at_array *array; + int size; + int index; + int rc; + + svcpt->scp_cpt = cpt; + INIT_LIST_HEAD(&svcpt->scp_threads); + + /* rqbd and incoming request queue */ + spin_lock_init(&svcpt->scp_lock); + mutex_init(&svcpt->scp_mutex); + INIT_LIST_HEAD(&svcpt->scp_rqbd_idle); + INIT_LIST_HEAD(&svcpt->scp_rqbd_posted); + INIT_LIST_HEAD(&svcpt->scp_req_incoming); + init_waitqueue_head(&svcpt->scp_waitq); + /* history request & rqbd list */ + INIT_LIST_HEAD(&svcpt->scp_hist_reqs); + INIT_LIST_HEAD(&svcpt->scp_hist_rqbds); + + /* acitve requests and hp requests */ + spin_lock_init(&svcpt->scp_req_lock); + + /* reply states */ + spin_lock_init(&svcpt->scp_rep_lock); + INIT_LIST_HEAD(&svcpt->scp_rep_active); + INIT_LIST_HEAD(&svcpt->scp_rep_idle); + init_waitqueue_head(&svcpt->scp_rep_waitq); + atomic_set(&svcpt->scp_nreps_difficult, 0); + + /* adaptive timeout */ + spin_lock_init(&svcpt->scp_at_lock); + array = &svcpt->scp_at_array; + + size = at_est2timeout(at_max); + array->paa_size = size; + array->paa_count = 0; + array->paa_deadline = -1; + + /* allocate memory for scp_at_array (ptlrpc_at_array) */ + OBD_CPT_ALLOC(array->paa_reqs_array, + svc->srv_cptable, cpt, sizeof(struct list_head) * size); + if (array->paa_reqs_array == NULL) + return -ENOMEM; + + for (index = 0; index < size; index++) + INIT_LIST_HEAD(&array->paa_reqs_array[index]); + + OBD_CPT_ALLOC(array->paa_reqs_count, + svc->srv_cptable, cpt, sizeof(__u32) * size); + if (array->paa_reqs_count == NULL) + goto failed; + + cfs_timer_setup(&svcpt->scp_at_timer, ptlrpc_at_timer, + (unsigned long)svcpt, 0); + + /* + * At SOW, service time should be quick; 10s seems generous. If client + * timeout is less than this, we'll be sending an early reply. 
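A standalone, illustrative-only sketch of the per-CPT thread estimate computed above for a single-partition service: the base count is grown towards (but kept below) 2 * nthrs_base on wide CPTs, then the tc_thr_factor term is added with a "fade" so extra cores contribute progressively less, per the Amdahl's-law comment. The tunables in main() are made up:

#include <stdio.h>

static int min_int(int a, int b) { return a < b ? a : b; }

static int estimate_nthrs(int nthrs_base, int weight, int factor)
{
	const int fade = 4;
	int nthrs = nthrs_base;

	for (int i = 1; (weight >> (i + 1)) != 0 && (nthrs_base >> i) != 0; i++)
		nthrs += nthrs_base >> i;	/* always stays below 2 * nthrs_base */

	for (; factor > 0 && weight > 0; factor--, weight -= fade)
		nthrs += min_int(weight, fade) * factor;

	return nthrs;
}

int main(void)
{
	/* made-up tunables: base 8 threads, 32 cores/HTs, factor 4 */
	printf("estimated threads per CPT: %d\n", estimate_nthrs(8, 32, 4));
	return 0;
}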
+ */ + at_init(&svcpt->scp_at_estimate, 10, 0); + + /* assign this before call ptlrpc_grow_req_bufs */ + svcpt->scp_service = svc; + /* Now allocate the request buffers, but don't post them now */ + rc = ptlrpc_grow_req_bufs(svcpt, 0); + /* + * We shouldn't be under memory pressure at startup, so + * fail if we can't allocate all our buffers at this time. + */ + if (rc != 0) + goto failed; + + return 0; + + failed: + if (array->paa_reqs_count != NULL) { + OBD_FREE_PTR_ARRAY(array->paa_reqs_count, size); + array->paa_reqs_count = NULL; + } + + if (array->paa_reqs_array != NULL) { + OBD_FREE_PTR_ARRAY(array->paa_reqs_array, array->paa_size); + array->paa_reqs_array = NULL; + } + + return -ENOMEM; +} + +/** + * Initialize service on a given portal. + * This includes starting serving threads , allocating and posting rqbds and + * so on. + */ +struct ptlrpc_service *ptlrpc_register_service(struct ptlrpc_service_conf *conf, + struct kset *parent, + struct dentry *debugfs_entry) +{ + struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt; + struct ptlrpc_service *service; + struct ptlrpc_service_part *svcpt; + struct cfs_cpt_table *cptable; + __u32 *cpts = NULL; + int ncpts; + int cpt; + int rc; + int i; + + ENTRY; + + LASSERT(conf->psc_buf.bc_nbufs > 0); + LASSERT(conf->psc_buf.bc_buf_size >= + conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD); + LASSERT(conf->psc_thr.tc_ctx_tags != 0); + + cptable = cconf->cc_cptable; + if (cptable == NULL) + cptable = cfs_cpt_tab; + + if (conf->psc_thr.tc_cpu_bind > 1) { + CERROR("%s: Invalid cpu bind value %d, only 1 or 0 allowed\n", + conf->psc_name, conf->psc_thr.tc_cpu_bind); + RETURN(ERR_PTR(-EINVAL)); + } + + if (!cconf->cc_affinity) { + ncpts = 1; + } else { + ncpts = cfs_cpt_number(cptable); + if (cconf->cc_pattern != NULL) { + struct cfs_expr_list *el; + + rc = cfs_expr_list_parse(cconf->cc_pattern, + strlen(cconf->cc_pattern), + 0, ncpts - 1, &el); + if (rc != 0) { + CERROR("%s: invalid CPT pattern string: %s\n", + conf->psc_name, cconf->cc_pattern); + RETURN(ERR_PTR(-EINVAL)); + } + + rc = cfs_expr_list_values(el, ncpts, &cpts); + cfs_expr_list_free(el); + if (rc <= 0) { + CERROR("%s: failed to parse CPT array %s: %d\n", + conf->psc_name, cconf->cc_pattern, rc); + if (cpts != NULL) + OBD_FREE_PTR_ARRAY(cpts, ncpts); + RETURN(ERR_PTR(rc < 0 ? rc : -EINVAL)); + } + ncpts = rc; + } + } + + OBD_ALLOC(service, offsetof(struct ptlrpc_service, srv_parts[ncpts])); + if (service == NULL) { + if (cpts != NULL) + OBD_FREE_PTR_ARRAY(cpts, ncpts); + RETURN(ERR_PTR(-ENOMEM)); + } + + service->srv_cptable = cptable; + service->srv_cpts = cpts; + service->srv_ncpts = ncpts; + service->srv_cpt_bind = conf->psc_thr.tc_cpu_bind; + + service->srv_cpt_bits = 0; /* it's zero already, easy to read... */ + while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable)) + service->srv_cpt_bits++; + + /* public members */ + spin_lock_init(&service->srv_lock); + service->srv_name = conf->psc_name; + service->srv_watchdog_factor = conf->psc_watchdog_factor; + INIT_LIST_HEAD(&service->srv_list); /* for safty of cleanup */ + + /* buffer configuration */ + service->srv_nbuf_per_group = test_req_buffer_pressure ? 
+ 1 : conf->psc_buf.bc_nbufs; + /* do not limit max number of rqbds by default */ + service->srv_nrqbds_max = 0; + + service->srv_max_req_size = conf->psc_buf.bc_req_max_size + + SPTLRPC_MAX_PAYLOAD; + service->srv_buf_size = conf->psc_buf.bc_buf_size; + service->srv_rep_portal = conf->psc_buf.bc_rep_portal; + service->srv_req_portal = conf->psc_buf.bc_req_portal; + + /* With slab/alloc_pages buffer size will be rounded up to 2^n */ + if (service->srv_buf_size & (service->srv_buf_size - 1)) { + int round = size_roundup_power2(service->srv_buf_size); + + service->srv_buf_size = round; + } + + /* Increase max reply size to next power of two */ + service->srv_max_reply_size = 1; + while (service->srv_max_reply_size < + conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD) + service->srv_max_reply_size <<= 1; + + service->srv_thread_name = conf->psc_thr.tc_thr_name; + service->srv_ctx_tags = conf->psc_thr.tc_ctx_tags; + service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO; + service->srv_ops = conf->psc_ops; + + for (i = 0; i < ncpts; i++) { + if (!cconf->cc_affinity) + cpt = CFS_CPT_ANY; + else + cpt = cpts != NULL ? cpts[i] : i; + + OBD_CPT_ALLOC(svcpt, cptable, cpt, sizeof(*svcpt)); + if (svcpt == NULL) + GOTO(failed, rc = -ENOMEM); + + service->srv_parts[i] = svcpt; + rc = ptlrpc_service_part_init(service, svcpt, cpt); + if (rc != 0) + GOTO(failed, rc); + } + + ptlrpc_server_nthreads_check(service, conf); + + rc = LNetSetLazyPortal(service->srv_req_portal); + LASSERT(rc == 0); + + mutex_lock(&ptlrpc_all_services_mutex); + list_add(&service->srv_list, &ptlrpc_all_services); + mutex_unlock(&ptlrpc_all_services_mutex); + + if (parent) { + rc = ptlrpc_sysfs_register_service(parent, service); + if (rc) + GOTO(failed, rc); + } + + if (debugfs_entry != NULL) + ptlrpc_ldebugfs_register_service(debugfs_entry, service); + + rc = ptlrpc_service_nrs_setup(service); + if (rc != 0) + GOTO(failed, rc); + + CDEBUG(D_NET, "%s: Started, listening on portal %d\n", + service->srv_name, service->srv_req_portal); + + rc = ptlrpc_start_threads(service); + if (rc != 0) { + CERROR("Failed to start threads for service %s: %d\n", + service->srv_name, rc); + GOTO(failed, rc); + } + + RETURN(service); +failed: + ptlrpc_unregister_service(service); + RETURN(ERR_PTR(rc)); +} +EXPORT_SYMBOL(ptlrpc_register_service); + +/** + * to actually free the request, must be called without holding svc_lock. + * note it's caller's responsibility to unlink req->rq_list. + */ +static void ptlrpc_server_free_request(struct ptlrpc_request *req) +{ + LASSERT(atomic_read(&req->rq_refcount) == 0); + LASSERT(list_empty(&req->rq_timed_list)); + + /* + * DEBUG_REQ() assumes the reply state of a request with a valid + * ref will not be destroyed until that reference is dropped. + */ + ptlrpc_req_drop_rs(req); + + sptlrpc_svc_ctx_decref(req); + + if (req != &req->rq_rqbd->rqbd_req) { + /* + * NB request buffers use an embedded + * req if the incoming req unlinked the + * MD; this isn't one of them! + */ + ptlrpc_request_cache_free(req); + } +} + +/** + * drop a reference count of the request. if it reaches 0, we either + * put it into history list, or free it immediately. 
+ */ +void ptlrpc_server_drop_request(struct ptlrpc_request *req) +{ + struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd; + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int refcount; + + if (!atomic_dec_and_test(&req->rq_refcount)) + return; + + if (req->rq_session.lc_state == LCS_ENTERED) { + lu_context_exit(&req->rq_session); + lu_context_fini(&req->rq_session); + } + + if (req->rq_at_linked) { + spin_lock(&svcpt->scp_at_lock); + /* + * recheck with lock, in case it's unlinked by + * ptlrpc_at_check_timed() + */ + if (likely(req->rq_at_linked)) + ptlrpc_at_remove_timed(req); + spin_unlock(&svcpt->scp_at_lock); + } + + LASSERT(list_empty(&req->rq_timed_list)); + + /* finalize request */ + if (req->rq_export) { + class_export_put(req->rq_export); + req->rq_export = NULL; + } + + spin_lock(&svcpt->scp_lock); + + list_add(&req->rq_list, &rqbd->rqbd_reqs); + + refcount = --(rqbd->rqbd_refcount); + if (refcount == 0) { + /* request buffer is now idle: add to history */ + list_move_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds); + svcpt->scp_hist_nrqbds++; + + /* + * cull some history? + * I expect only about 1 or 2 rqbds need to be recycled here + */ + while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) { + rqbd = list_first_entry(&svcpt->scp_hist_rqbds, + struct ptlrpc_request_buffer_desc, + rqbd_list); + + list_del(&rqbd->rqbd_list); + svcpt->scp_hist_nrqbds--; + + /* + * remove rqbd's reqs from svc's req history while + * I've got the service lock + */ + list_for_each_entry(req, &rqbd->rqbd_reqs, rq_list) { + /* Track the highest culled req seq */ + if (req->rq_history_seq > + svcpt->scp_hist_seq_culled) { + svcpt->scp_hist_seq_culled = + req->rq_history_seq; + } + list_del(&req->rq_history_list); + } + + spin_unlock(&svcpt->scp_lock); + + while ((req = list_first_entry_or_null( + &rqbd->rqbd_reqs, + struct ptlrpc_request, rq_list))) { + list_del(&req->rq_list); + ptlrpc_server_free_request(req); + } + + spin_lock(&svcpt->scp_lock); + /* + * now all reqs including the embedded req has been + * disposed, schedule request buffer for re-use + * or free it to drain some in excess. 
+ */ + LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == 0); + if (svcpt->scp_nrqbds_posted >= + svc->srv_nbuf_per_group || + (svc->srv_nrqbds_max != 0 && + svcpt->scp_nrqbds_total > svc->srv_nrqbds_max) || + test_req_buffer_pressure) { + /* like in ptlrpc_free_rqbd() */ + svcpt->scp_nrqbds_total--; + OBD_FREE_LARGE(rqbd->rqbd_buffer, + svc->srv_buf_size); + OBD_FREE_PTR(rqbd); + } else { + list_add_tail(&rqbd->rqbd_list, + &svcpt->scp_rqbd_idle); + } + } + + spin_unlock(&svcpt->scp_lock); + } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) { + /* If we are low on memory, we are not interested in history */ + list_del(&req->rq_list); + list_del_init(&req->rq_history_list); + + /* Track the highest culled req seq */ + if (req->rq_history_seq > svcpt->scp_hist_seq_culled) + svcpt->scp_hist_seq_culled = req->rq_history_seq; + + spin_unlock(&svcpt->scp_lock); + + ptlrpc_server_free_request(req); + } else { + spin_unlock(&svcpt->scp_lock); + } +} + +static void ptlrpc_add_exp_list_nolock(struct ptlrpc_request *req, + struct obd_export *export, bool hp) +{ + __u16 tag = lustre_msg_get_tag(req->rq_reqmsg); + + if (hp) + list_add(&req->rq_exp_list, &export->exp_hp_rpcs); + else + list_add(&req->rq_exp_list, &export->exp_reg_rpcs); + if (tag && export->exp_used_slots) + set_bit(tag - 1, export->exp_used_slots); +} + +static void ptlrpc_del_exp_list(struct ptlrpc_request *req) +{ + __u16 tag = lustre_msg_get_tag(req->rq_reqmsg); + + spin_lock(&req->rq_export->exp_rpc_lock); + list_del_init(&req->rq_exp_list); + if (tag && !req->rq_obsolete && req->rq_export->exp_used_slots) + clear_bit(tag - 1, req->rq_export->exp_used_slots); + spin_unlock(&req->rq_export->exp_rpc_lock); +} + +/** Change request export and move hp request from old export to new */ +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export) +{ + if (req->rq_export != NULL) { + LASSERT(!list_empty(&req->rq_exp_list)); + /* remove rq_exp_list from last export */ + ptlrpc_del_exp_list(req); + /* export has one reference already, so it's safe to + * add req to export queue here and get another + * reference for request later + */ + spin_lock(&export->exp_rpc_lock); + ptlrpc_add_exp_list_nolock(req, export, req->rq_ops != NULL); + spin_unlock(&export->exp_rpc_lock); + + class_export_rpc_dec(req->rq_export); + class_export_put(req->rq_export); + } + + /* request takes one export refcount */ + req->rq_export = class_export_get(export); + class_export_rpc_inc(export); +} + +/** + * to finish a request: stop sending more early replies, and release + * the request. + */ +static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + ptlrpc_server_hpreq_fini(req); + + ptlrpc_server_drop_request(req); +} + +/** + * to finish an active request: stop sending more early replies, and release + * the request. should be called after we finished handling the request. + */ +static void ptlrpc_server_finish_active_request( + struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + spin_lock(&svcpt->scp_req_lock); + ptlrpc_nrs_req_stop_nolock(req); + svcpt->scp_nreqs_active--; + if (req->rq_hp) + svcpt->scp_nhreqs_active--; + spin_unlock(&svcpt->scp_req_lock); + + ptlrpc_nrs_req_finalize(req); + + if (req->rq_export != NULL) + class_export_rpc_dec(req->rq_export); + + ptlrpc_server_finish_request(svcpt, req); +} + +/** + * This function makes sure dead exports are evicted in a timely manner. 
+ * This function is only called when some export receives a message (i.e., + * the network is up.) + */ +void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay) +{ + struct obd_export *oldest_exp; + time64_t oldest_time, new_time; + + ENTRY; + + LASSERT(exp); + + /* + * Compensate for slow machines, etc, by faking our request time + * into the future. Although this can break the strict time-ordering + * of the list, we can be really lazy here - we don't have to evict + * at the exact right moment. Eventually, all silent exports + * will make it to the top of the list. + */ + + /* Do not pay attention on 1sec or smaller renewals. */ + new_time = ktime_get_real_seconds() + extra_delay; + if (exp->exp_last_request_time + 1 /*second */ >= new_time) + RETURN_EXIT; + + exp->exp_last_request_time = new_time; + + /* + * exports may get disconnected from the chain even though the + * export has references, so we must keep the spin lock while + * manipulating the lists + */ + spin_lock(&exp->exp_obd->obd_dev_lock); + + if (list_empty(&exp->exp_obd_chain_timed)) { + /* this one is not timed */ + spin_unlock(&exp->exp_obd->obd_dev_lock); + RETURN_EXIT; + } + + list_move_tail(&exp->exp_obd_chain_timed, + &exp->exp_obd->obd_exports_timed); + + oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next, + struct obd_export, exp_obd_chain_timed); + oldest_time = oldest_exp->exp_last_request_time; + spin_unlock(&exp->exp_obd->obd_dev_lock); + + if (exp->exp_obd->obd_recovering) { + /* be nice to everyone during recovery */ + EXIT; + return; + } + + /* Note - racing to start/reset the obd_eviction timer is safe */ + if (exp->exp_obd->obd_eviction_timer == 0) { + /* Check if the oldest entry is expired. */ + if (ktime_get_real_seconds() > + oldest_time + PING_EVICT_TIMEOUT + extra_delay) { + /* + * We need a second timer, in case the net was down and + * it just came back. Since the pinger may skip every + * other PING_INTERVAL (see note in ptlrpc_pinger_main), + * we better wait for 3. + */ + exp->exp_obd->obd_eviction_timer = + ktime_get_real_seconds() + 3 * PING_INTERVAL; + CDEBUG(D_HA, "%s: Think about evicting %s from %lld\n", + exp->exp_obd->obd_name, + obd_export_nid2str(oldest_exp), oldest_time); + } + } else { + if (ktime_get_real_seconds() > + (exp->exp_obd->obd_eviction_timer + extra_delay)) { + /* + * The evictor won't evict anyone who we've heard from + * recently, so we don't have to check before we start + * it. + */ + if (!ping_evictor_wake(exp)) + exp->exp_obd->obd_eviction_timer = 0; + } + } + + EXIT; +} + +/** + * Sanity check request \a req. + * Return 0 if all is ok, error code otherwise. + */ +static int ptlrpc_check_req(struct ptlrpc_request *req) +{ + struct obd_device *obd = req->rq_export->exp_obd; + int rc = 0; + + if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) < + req->rq_export->exp_conn_cnt)) { + DEBUG_REQ(D_RPCTRACE, req, + "DROPPING req from old connection %d < %d", + lustre_msg_get_conn_cnt(req->rq_reqmsg), + req->rq_export->exp_conn_cnt); + return -EEXIST; + } + if (unlikely(obd == NULL || obd->obd_fail)) { + /* + * Failing over, don't handle any more reqs, + * send error response instead. + */ + CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n", + req, (obd != NULL) ? 
obd->obd_name : "unknown"); + rc = -ENODEV; + } else if (lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE) && + !obd->obd_recovering) { + DEBUG_REQ(D_ERROR, req, + "Invalid replay without recovery"); + class_fail_export(req->rq_export); + rc = -ENODEV; + } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 && + !obd->obd_recovering) { + DEBUG_REQ(D_ERROR, req, + "Invalid req with transno %llu without recovery", + lustre_msg_get_transno(req->rq_reqmsg)); + class_fail_export(req->rq_export); + rc = -ENODEV; + } + + if (unlikely(rc < 0)) { + req->rq_status = rc; + ptlrpc_error(req); + } + return rc; +} + +static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + time64_t next; + + if (array->paa_count == 0) { + del_timer(&svcpt->scp_at_timer); + return; + } + + /* Set timer for closest deadline */ + next = array->paa_deadline - ktime_get_real_seconds() - + at_early_margin; + if (next <= 0) { + ptlrpc_at_timer(cfs_timer_cb_arg(svcpt, scp_at_timer)); + } else { + mod_timer(&svcpt->scp_at_timer, + jiffies + nsecs_to_jiffies(next * NSEC_PER_SEC)); + CDEBUG(D_INFO, "armed %s at %+llds\n", + svcpt->scp_service->srv_name, next); + } +} + +/* Add rpc to early reply check list */ +static int ptlrpc_at_add_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + struct ptlrpc_request *rq = NULL; + __u32 index; + + if (AT_OFF) + return(0); + + if (req->rq_no_reply) + return 0; + + if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0) + return(-ENOSYS); + + spin_lock(&svcpt->scp_at_lock); + LASSERT(list_empty(&req->rq_timed_list)); + + div_u64_rem(req->rq_deadline, array->paa_size, &index); + if (array->paa_reqs_count[index] > 0) { + /* + * latest rpcs will have the latest deadlines in the list, + * so search backward. + */ + list_for_each_entry_reverse(rq, &array->paa_reqs_array[index], + rq_timed_list) { + if (req->rq_deadline >= rq->rq_deadline) { + list_add(&req->rq_timed_list, + &rq->rq_timed_list); + break; + } + } + } + + /* Add the request at the head of the list */ + if (list_empty(&req->rq_timed_list)) + list_add(&req->rq_timed_list, &array->paa_reqs_array[index]); + + spin_lock(&req->rq_lock); + req->rq_at_linked = 1; + spin_unlock(&req->rq_lock); + req->rq_at_index = index; + array->paa_reqs_count[index]++; + array->paa_count++; + if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) { + array->paa_deadline = req->rq_deadline; + ptlrpc_at_set_timer(svcpt); + } + spin_unlock(&svcpt->scp_at_lock); + + return 0; +} + +static void ptlrpc_at_remove_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_at_array *array; + + array = &req->rq_rqbd->rqbd_svcpt->scp_at_array; + + /* NB: must call with hold svcpt::scp_at_lock */ + LASSERT(!list_empty(&req->rq_timed_list)); + list_del_init(&req->rq_timed_list); + + spin_lock(&req->rq_lock); + req->rq_at_linked = 0; + spin_unlock(&req->rq_lock); + + array->paa_reqs_count[req->rq_at_index]--; + array->paa_count--; +} + +/* + * Attempt to extend the request deadline by sending an early reply to the + * client. 
+ */ +static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_request *reqcopy; + struct lustre_msg *reqmsg; + timeout_t olddl = req->rq_deadline - ktime_get_real_seconds(); + time64_t newdl; + int rc; + + ENTRY; + + if (CFS_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_RECONNECT) || + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_ENQ_RESEND)) { + /* don't send early reply */ + RETURN(1); + } + + /* + * deadline is when the client expects us to reply, margin is the + * difference between clients' and servers' expectations + */ + DEBUG_REQ(D_ADAPTTO, req, + "%ssending early reply (deadline %+ds, margin %+ds) for %d+%d", + AT_OFF ? "AT off - not " : "", + olddl, olddl - at_get(&svcpt->scp_at_estimate), + at_get(&svcpt->scp_at_estimate), at_extra); + + if (AT_OFF) + RETURN(0); + + if (olddl < 0) { + /* below message is checked in replay-ost-single.sh test_9 */ + DEBUG_REQ(D_WARNING, req, + "Already past deadline (%+ds), not sending early reply. Consider increasing at_early_margin (%d)?", + olddl, at_early_margin); + + /* Return an error so we're not re-added to the timed list. */ + RETURN(-ETIMEDOUT); + } + + if ((lustre_msghdr_get_flags(req->rq_reqmsg) & + MSGHDR_AT_SUPPORT) == 0) { + DEBUG_REQ(D_INFO, req, + "Wanted to ask client for more time, but no AT support"); + RETURN(-ENOSYS); + } + + if (req->rq_export && + lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) { + struct obd_device *obd_exp = req->rq_export->exp_obd; + + /* + * During recovery, we don't want to send too many early + * replies, but on the other hand we want to make sure the + * client has enough time to resend if the rpc is lost. So + * during the recovery period send at least 4 early replies, + * spacing them every at_extra if we can. at_estimate should + * always equal this fixed value during recovery. + */ + + /* + * Don't account request processing time into AT history + * during recovery, it is not service time we need but + * includes also waiting time for recovering clients + */ + newdl = min_t(time64_t, at_extra, + obd_exp->obd_recovery_timeout / 4) + + ktime_get_real_seconds(); + } else { + /* + * We want to extend the request deadline by at_extra seconds, + * so we set our service estimate to reflect how much time has + * passed since this request arrived plus an additional + * at_extra seconds. The client will calculate the new deadline + * based on this service estimate (plus some additional time to + * account for network latency). 
See ptlrpc_at_recv_early_reply + */ + at_measured(&svcpt->scp_at_estimate, at_extra + + ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec); + newdl = req->rq_arrival_time.tv_sec + + at_get(&svcpt->scp_at_estimate); + } + + /* + * Check to see if we've actually increased the deadline - + * we may be past adaptive_max + */ + if (req->rq_deadline >= newdl) { + DEBUG_REQ(D_WARNING, req, + "Could not add any time (%d/%lld), not sending early reply", + olddl, newdl - ktime_get_real_seconds()); + RETURN(-ETIMEDOUT); + } + + reqcopy = ptlrpc_request_cache_alloc(GFP_NOFS); + if (reqcopy == NULL) + RETURN(-ENOMEM); + OBD_ALLOC_LARGE(reqmsg, req->rq_reqlen); + if (!reqmsg) + GOTO(out_free, rc = -ENOMEM); + + *reqcopy = *req; + spin_lock_init(&reqcopy->rq_early_free_lock); + reqcopy->rq_reply_state = NULL; + reqcopy->rq_rep_swab_mask = 0; + reqcopy->rq_pack_bulk = 0; + reqcopy->rq_pack_udesc = 0; + reqcopy->rq_packed_final = 0; + sptlrpc_svc_ctx_addref(reqcopy); + /* We only need the reqmsg for the magic */ + reqcopy->rq_reqmsg = reqmsg; + memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); + + /* + * tgt_brw_read() and tgt_brw_write() may have decided not to reply. + * Without this check, we would fail the rq_no_reply assertion in + * ptlrpc_send_reply(). + */ + if (reqcopy->rq_no_reply) + GOTO(out, rc = -ETIMEDOUT); + + LASSERT(atomic_read(&req->rq_refcount)); + /* if it is last refcount then early reply isn't needed */ + if (atomic_read(&req->rq_refcount) == 1) { + DEBUG_REQ(D_ADAPTTO, reqcopy, + "Normal reply already sent, abort early reply"); + GOTO(out, rc = -EINVAL); + } + + /* Connection ref */ + reqcopy->rq_export = class_conn2export( + lustre_msg_get_handle(reqcopy->rq_reqmsg)); + if (reqcopy->rq_export == NULL) + GOTO(out, rc = -ENODEV); + + /* RPC ref */ + class_export_rpc_inc(reqcopy->rq_export); + if (reqcopy->rq_export->exp_obd && + reqcopy->rq_export->exp_obd->obd_fail) + GOTO(out_put, rc = -ENODEV); + + rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY); + if (rc) + GOTO(out_put, rc); + + rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY); + + if (!rc) { + /* Adjust our own deadline to what we told the client */ + req->rq_deadline = newdl; + req->rq_early_count++; /* number sent, server side */ + } else { + DEBUG_REQ(D_ERROR, req, "Early reply send failed: rc = %d", rc); + } + + /* + * Free the (early) reply state from lustre_pack_reply. 
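A minimal sketch, not the patch's code, of the deadline arithmetic in the non-recovery branch of ptlrpc_at_send_early_reply(): the running service estimate is fed (elapsed + at_extra), the new deadline becomes arrival time plus that estimate, and the early reply is only worth sending if this actually moves the deadline forward. It assumes at_measured() never lowers the running estimate; the timestamps are arbitrary example values:

#include <stdio.h>

static long estimate;	/* stands in for svcpt->scp_at_estimate */

static void at_measured_sketch(long measured)
{
	if (measured > estimate)
		estimate = measured;	/* assumed monotone for this sketch */
}

int main(void)
{
	long now = 1000, arrival = 990, old_deadline = 1005, at_extra = 30;
	long newdl;

	at_measured_sketch(at_extra + now - arrival);	/* 10s elapsed + 30s extra */
	newdl = arrival + estimate;			/* 990 + 40 = 1030 */

	if (newdl <= old_deadline)
		printf("no gain, skip early reply\n");
	else
		printf("extend deadline %ld -> %ld\n", old_deadline, newdl);
	return 0;
}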
+ * (ptlrpc_send_reply takes it's own rs ref, so this is safe here) + */ + ptlrpc_req_drop_rs(reqcopy); + +out_put: + class_export_rpc_dec(reqcopy->rq_export); + class_export_put(reqcopy->rq_export); +out: + sptlrpc_svc_ctx_decref(reqcopy); + OBD_FREE_LARGE(reqmsg, req->rq_reqlen); +out_free: + ptlrpc_request_cache_free(reqcopy); + RETURN(rc); +} + +/* + * Send early replies to everybody expiring within at_early_margin + * asking for at_extra time + */ +static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + struct ptlrpc_request *rq, *n; + LIST_HEAD(work_list); + __u32 index, count; + time64_t deadline; + time64_t now = ktime_get_real_seconds(); + s64 delay_ms; + int first, counter = 0; + + ENTRY; + spin_lock(&svcpt->scp_at_lock); + if (svcpt->scp_at_check == 0) { + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + delay_ms = ktime_ms_delta(ktime_get(), svcpt->scp_at_checktime); + svcpt->scp_at_check = 0; + + if (array->paa_count == 0) { + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + + /* The timer went off, but maybe the nearest rpc already completed. */ + first = array->paa_deadline - now; + if (first > at_early_margin) { + /* We've still got plenty of time. Reset the timer. */ + ptlrpc_at_set_timer(svcpt); + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + + /* + * We're close to a timeout, and we don't know how much longer the + * server will take. Send early replies to everyone expiring soon. + */ + deadline = -1; + div_u64_rem(array->paa_deadline, array->paa_size, &index); + count = array->paa_count; + while (count > 0) { + count -= array->paa_reqs_count[index]; + list_for_each_entry_safe(rq, n, + &array->paa_reqs_array[index], + rq_timed_list) { + if (rq->rq_deadline > now + at_early_margin) { + /* update the earliest deadline */ + if (deadline == -1 || + rq->rq_deadline < deadline) + deadline = rq->rq_deadline; + break; + } + + /** + * ptlrpc_server_drop_request() may drop + * refcount to 0 already. Let's check this and + * don't add entry to work_list + */ + if (likely(atomic_inc_not_zero(&rq->rq_refcount))) { + ptlrpc_at_remove_timed(rq); + list_add(&rq->rq_timed_list, &work_list); + } else { + ptlrpc_at_remove_timed(rq); + } + + counter++; + } + + if (++index >= array->paa_size) + index = 0; + } + array->paa_deadline = deadline; + /* we have a new earliest deadline, restart the timer */ + ptlrpc_at_set_timer(svcpt); + + spin_unlock(&svcpt->scp_at_lock); + + CDEBUG(D_ADAPTTO, + "timeout in %+ds, asking for %d secs on %d early replies\n", + first, at_extra, counter); + if (first < 0) { + /* + * We're already past request deadlines before we even get a + * chance to send early replies + */ + LCONSOLE_WARN("%s: This server is not able to keep up with request traffic (cpu-bound).\n", + svcpt->scp_service->srv_name); + CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=%lldms\n", + counter, svcpt->scp_nreqs_incoming, + svcpt->scp_nreqs_active, + at_get(&svcpt->scp_at_estimate), delay_ms); + } + + /* + * we took additional refcount so entries can't be deleted from list, no + * locking is needed + */ + while ((rq = list_first_entry_or_null(&work_list, + struct ptlrpc_request, + rq_timed_list)) != NULL) { + list_del_init(&rq->rq_timed_list); + + if (ptlrpc_at_send_early_reply(rq) == 0) + ptlrpc_at_add_timed(rq); + + ptlrpc_server_drop_request(rq); + } + + RETURN(1); /* return "did_something" for liblustre */ +} + +/* + * Check if we are already handling earlier incarnation of this request. 
+ * Called under &req->rq_export->exp_rpc_lock locked + */ +static struct ptlrpc_request* +ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) +{ + struct ptlrpc_request *tmp = NULL; + + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)) + return NULL; + + /* + * This list should not be longer than max_requests in + * flights on the client, so it is not all that long. + * Also we only hit this codepath in case of a resent + * request which makes it even more rarely hit + */ + list_for_each_entry(tmp, &req->rq_export->exp_reg_rpcs, + rq_exp_list) { + /* Found duplicate one */ + if (tmp->rq_xid == req->rq_xid) + goto found; + } + list_for_each_entry(tmp, &req->rq_export->exp_hp_rpcs, + rq_exp_list) { + /* Found duplicate one */ + if (tmp->rq_xid == req->rq_xid) + goto found; + } + return NULL; + +found: + DEBUG_REQ(D_HA, req, "Found duplicate req in processing"); + DEBUG_REQ(D_HA, tmp, "Request being processed"); + return tmp; +} + +#ifdef HAVE_SERVER_SUPPORT +static void ptlrpc_server_mark_obsolete(struct ptlrpc_request *req) +{ + req->rq_obsolete = 1; +} + +static void +ptlrpc_server_mark_in_progress_obsolete(struct ptlrpc_request *req) +{ + struct ptlrpc_request *tmp = NULL; + __u16 tag; + + if (!tgt_is_increasing_xid_client(req->rq_export) || + req->rq_export->exp_used_slots == NULL) + return; + + tag = lustre_msg_get_tag(req->rq_reqmsg); + if (tag == 0) + return; + + if (!test_bit(tag - 1, req->rq_export->exp_used_slots)) + return; + + /* This list should not be longer than max_requests in + * flights on the client, so it is not all that long. + * Also we only hit this codepath in case of a resent + * request which makes it even more rarely hit */ + list_for_each_entry(tmp, &req->rq_export->exp_reg_rpcs, rq_exp_list) { + if (tag == lustre_msg_get_tag(tmp->rq_reqmsg) && + req->rq_xid > tmp->rq_xid) + ptlrpc_server_mark_obsolete(tmp); + + } + list_for_each_entry(tmp, &req->rq_export->exp_hp_rpcs, rq_exp_list) { + if (tag == lustre_msg_get_tag(tmp->rq_reqmsg) && + req->rq_xid > tmp->rq_xid) + ptlrpc_server_mark_obsolete(tmp); + } +} +#endif + +/** + * Check if a request should be assigned with a high priority. + * + * \retval < 0: error occurred + * 0: normal RPC request + * +1: high priority request + */ +static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + int rc = 0; + + ENTRY; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) { + rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req); + if (rc < 0) + RETURN(rc); + + LASSERT(rc == 0); + } + + if (req->rq_export != NULL && req->rq_ops != NULL) { + /* + * Perform request specific check. We should do this + * check before the request is added into exp_hp_rpcs + * list otherwise it may hit swab race at LU-1044. + */ + if (req->rq_ops->hpreq_check != NULL) { + rc = req->rq_ops->hpreq_check(req); + if (rc == -ESTALE) { + req->rq_status = rc; + ptlrpc_error(req); + } + /* + * can only return error, + * 0 for normal request, + * or 1 for high priority request + */ + LASSERT(rc <= 1); + } + } + + RETURN(rc); +} + +/** Remove the request from the export list. */ +static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) +{ + ENTRY; + if (req->rq_export) { + /* + * refresh lock timeout again so that client has more + * room to send lock cancel RPC. 
+ */ + if (req->rq_ops && req->rq_ops->hpreq_fini) + req->rq_ops->hpreq_fini(req); + + ptlrpc_del_exp_list(req); + } + EXIT; +} + +static int ptlrpc_hpreq_check(struct ptlrpc_request *req) +{ + return 1; +} + +static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = { + .hpreq_check = ptlrpc_hpreq_check, +}; + +/* Hi-Priority RPC check by RPC operation code. */ +int ptlrpc_hpreq_handler(struct ptlrpc_request *req) +{ + int opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* + * Check for export to let only reconnects for not yet evicted + * export to become a HP rpc. + */ + if ((req->rq_export != NULL) && + (opc == OBD_PING || opc == MDS_CONNECT || opc == OST_CONNECT)) + req->rq_ops = &ptlrpc_hpreq_common; + + return 0; +} +EXPORT_SYMBOL(ptlrpc_hpreq_handler); + +static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + int rc; + bool hp; + struct ptlrpc_request *orig; + + ENTRY; + + rc = ptlrpc_server_hpreq_init(svcpt, req); + if (rc < 0) + RETURN(rc); + + hp = rc > 0; + ptlrpc_nrs_req_initialize(svcpt, req, hp); + + while (req->rq_export != NULL) { + struct obd_export *exp = req->rq_export; + + /* + * do search for duplicated xid and the adding to the list + * atomically + */ + spin_lock_bh(&exp->exp_rpc_lock); +#ifdef HAVE_SERVER_SUPPORT + ptlrpc_server_mark_in_progress_obsolete(req); +#endif + orig = ptlrpc_server_check_resend_in_progress(req); + if (orig && OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_RESEND_RACE)) { + spin_unlock_bh(&exp->exp_rpc_lock); + + OBD_RACE(OBD_FAIL_PTLRPC_RESEND_RACE); + msleep(4 * MSEC_PER_SEC); + continue; + } + + if (orig && likely(atomic_inc_not_zero(&orig->rq_refcount))) { + bool linked; + + spin_unlock_bh(&exp->exp_rpc_lock); + + /* + * When the client resend request and the server has + * the previous copy of it, we need to update deadlines, + * to be sure that the client and the server have equal + * request deadlines. + */ + + spin_lock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock); + linked = orig->rq_at_linked; + if (likely(linked)) + ptlrpc_at_remove_timed(orig); + spin_unlock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock); + orig->rq_deadline = req->rq_deadline; + orig->rq_rep_mbits = req->rq_rep_mbits; + if (likely(linked)) + ptlrpc_at_add_timed(orig); + ptlrpc_server_drop_request(orig); + ptlrpc_nrs_req_finalize(req); + + /* don't mark slot unused for resend in progress */ + req->rq_obsolete = 1; + + RETURN(-EBUSY); + } + + ptlrpc_add_exp_list_nolock(req, exp, hp || req->rq_ops != NULL); + + spin_unlock_bh(&exp->exp_rpc_lock); + break; + } + + /* + * the current thread is not the processing thread for this request + * since that, but request is in exp_hp_list and can be find there. + * Remove all relations between request and old thread. 
+ */ + req->rq_svc_thread->t_env->le_ses = NULL; + req->rq_svc_thread = NULL; + req->rq_session.lc_thread = NULL; + + ptlrpc_nrs_req_add(svcpt, req, hp); + + RETURN(0); +} + +/** + * Allow to handle high priority request + * User can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_req_lock to get reliable result + */ +static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt, + bool force) +{ + int running = svcpt->scp_nthrs_running; + + if (!nrs_svcpt_has_hp(svcpt)) + return false; + + if (force) + return true; + + if (ptlrpc_nrs_req_throttling_nolock(svcpt, true)) + return false; + + if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { + /* leave just 1 thread for normal RPCs */ + running = PTLRPC_NTHRS_INIT; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) + running += 1; + } + + if (svcpt->scp_nreqs_active >= running - 1) + return false; + + if (svcpt->scp_nhreqs_active == 0) + return true; + + return !ptlrpc_nrs_req_pending_nolock(svcpt, false) || + svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio; +} + +static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_allow_high(svcpt, force) && + ptlrpc_nrs_req_pending_nolock(svcpt, true); +} + +/** + * Only allow normal priority requests on a service that has a high-priority + * queue if forced (i.e. cleanup), if there are other high priority requests + * already being processed (i.e. those threads can service more high-priority + * requests), or if there are enough idle threads that a later thread can do + * a high priority request. + * User can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_req_lock to get reliable result + */ +static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt, + bool force) +{ + int running = svcpt->scp_nthrs_running; + + if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { + /* leave just 1 thread for normal RPCs */ + running = PTLRPC_NTHRS_INIT; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) + running += 1; + } + + if (force) + return true; + + if (ptlrpc_nrs_req_throttling_nolock(svcpt, false)) + return false; + + if (svcpt->scp_nreqs_active < running - 2) + return true; + + if (svcpt->scp_nreqs_active >= running - 1) + return false; + + return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt); +} + +static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_allow_normal(svcpt, force) && + ptlrpc_nrs_req_pending_nolock(svcpt, false); +} + +/** + * Returns true if there are requests available in incoming + * request queue for processing and it is allowed to fetch them. + * User can call it w/o any lock but need to hold ptlrpc_service::scp_req_lock + * to get reliable result + * \see ptlrpc_server_allow_normal + * \see ptlrpc_server_allow high + */ +static inline +bool ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_high_pending(svcpt, force) || + ptlrpc_server_normal_pending(svcpt, force); +} + +/** + * Fetch a request for processing from queue of unprocessed requests. + * Favors high-priority requests. + * Returns a pointer to fetched request. 
+ */ +static struct ptlrpc_request * +ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force) +{ + struct ptlrpc_request *req = NULL; + + ENTRY; + + spin_lock(&svcpt->scp_req_lock); + + if (ptlrpc_server_high_pending(svcpt, force)) { + req = ptlrpc_nrs_req_get_nolock(svcpt, true, force); + if (req != NULL) { + svcpt->scp_hreq_count++; + goto got_request; + } + } + + if (ptlrpc_server_normal_pending(svcpt, force)) { + req = ptlrpc_nrs_req_get_nolock(svcpt, false, force); + if (req != NULL) { + svcpt->scp_hreq_count = 0; + goto got_request; + } + } + + spin_unlock(&svcpt->scp_req_lock); + RETURN(NULL); + +got_request: + svcpt->scp_nreqs_active++; + if (req->rq_hp) + svcpt->scp_nhreqs_active++; + + spin_unlock(&svcpt->scp_req_lock); + + if (likely(req->rq_export)) + class_export_rpc_inc(req->rq_export); + + RETURN(req); +} + +/** + * Handle freshly incoming reqs, add to timed early reply list, + * pass on to regular request queue. + * All incoming requests pass through here before getting into + * ptlrpc_server_handle_req later on. + */ +static int ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request *req; + __u32 deadline; + __u32 opc; + int rc; + + ENTRY; + + spin_lock(&svcpt->scp_lock); + if (list_empty(&svcpt->scp_req_incoming)) { + spin_unlock(&svcpt->scp_lock); + RETURN(0); + } + + req = list_first_entry(&svcpt->scp_req_incoming, + struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); + svcpt->scp_nreqs_incoming--; + /* + * Consider this still a "queued" request as far as stats are + * concerned + */ + spin_unlock(&svcpt->scp_lock); + + /* go through security check/transform */ + rc = sptlrpc_svc_unwrap_request(req); + switch (rc) { + case SECSVC_OK: + break; + case SECSVC_COMPLETE: + target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET); + goto err_req; + case SECSVC_DROP: + goto err_req; + default: + LBUG(); + } + + /* + * for null-flavored rpc, msg has been unpacked by sptlrpc, although + * redo it wouldn't be harmful. 
+ */ + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { + rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen); + if (rc != 0) { + CERROR("error unpacking request: ptl %d from %s x%llu\n", + svc->srv_req_portal, libcfs_id2str(req->rq_peer), + req->rq_xid); + goto err_req; + } + } + + rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + CERROR("error unpacking ptlrpc body: ptl %d from %s x %llu\n", + svc->srv_req_portal, libcfs_id2str(req->rq_peer), + req->rq_xid); + goto err_req; + } + + opc = lustre_msg_get_opc(req->rq_reqmsg); + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) && + opc == cfs_fail_val) { + CERROR("drop incoming rpc opc %u, x%llu\n", + cfs_fail_val, req->rq_xid); + goto err_req; + } + + rc = -EINVAL; + if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { + CERROR("wrong packet type received (type=%u) from %s\n", + lustre_msg_get_type(req->rq_reqmsg), + libcfs_id2str(req->rq_peer)); + goto err_req; + } + + switch (opc) { + case MDS_WRITEPAGE: + case OST_WRITE: + case OUT_UPDATE: + req->rq_bulk_write = 1; + break; + case MDS_READPAGE: + case OST_READ: + case MGS_CONFIG_READ: + req->rq_bulk_read = 1; + break; + } + + CDEBUG(D_RPCTRACE, "got req x%llu\n", req->rq_xid); + + req->rq_export = class_conn2export( + lustre_msg_get_handle(req->rq_reqmsg)); + if (req->rq_export) { + rc = ptlrpc_check_req(req); + if (rc == 0) { + rc = sptlrpc_target_export_check(req->rq_export, req); + if (rc) + DEBUG_REQ(D_ERROR, req, + "DROPPING req with illegal security flavor"); + } + + if (rc) + goto err_req; + ptlrpc_update_export_timer(req->rq_export, 0); + } + + /* req_in handling should/must be fast */ + if (ktime_get_real_seconds() - req->rq_arrival_time.tv_sec > 5) + DEBUG_REQ(D_WARNING, req, "Slow req_in handling %llds", + ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec); + + /* Set rpc server deadline and add it to the timed list */ + deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) & + MSGHDR_AT_SUPPORT) ? 
+ /* The max time the client expects us to take */ + lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout; + + req->rq_deadline = req->rq_arrival_time.tv_sec + deadline; + if (unlikely(deadline == 0)) { + DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout"); + goto err_req; + } + + /* Skip early reply */ + if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_RESEND)) + req->rq_deadline += obd_timeout; + + req->rq_svc_thread = thread; + if (thread != NULL) { + /* + * initialize request session, it is needed for request + * processing by target + */ + rc = lu_context_init(&req->rq_session, LCT_SERVER_SESSION | + LCT_NOREF); + if (rc) { + CERROR("%s: failure to initialize session: rc = %d\n", + thread->t_name, rc); + goto err_req; + } + req->rq_session.lc_thread = thread; + lu_context_enter(&req->rq_session); + thread->t_env->le_ses = &req->rq_session; + } + + + if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_ENQ_RESEND) && + (opc == LDLM_ENQUEUE) && + (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_ENQ_RESEND, 6); + + ptlrpc_at_add_timed(req); + + if (opc != OST_CONNECT && opc != MDS_CONNECT && + opc != MGS_CONNECT && req->rq_export != NULL) { + if (exp_connect_flags2(req->rq_export) & OBD_CONNECT2_REP_MBITS) + req->rq_rep_mbits = lustre_msg_get_mbits(req->rq_reqmsg); + } + + /* Move it over to the request processing queue */ + rc = ptlrpc_server_request_add(svcpt, req); + if (rc) + GOTO(err_req, rc); + + wake_up(&svcpt->scp_waitq); + RETURN(1); + +err_req: + ptlrpc_server_finish_request(svcpt, req); + + RETURN(1); +} + +/** + * Main incoming request handling logic. + * Calls handler function from service to do actual processing. + */ +static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request *request; + ktime_t work_start; + ktime_t work_end; + ktime_t arrived; + s64 timediff_usecs; + s64 arrived_usecs; + int fail_opc = 0; + + ENTRY; + + request = ptlrpc_server_request_get(svcpt, false); + if (request == NULL) + RETURN(0); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT; + else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT; + + if (unlikely(fail_opc)) { + if (request->rq_export && request->rq_ops) + OBD_FAIL_TIMEOUT(fail_opc, 4); + } + + ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG)) + libcfs_debug_dumplog(); + + work_start = ktime_get_real(); + arrived = timespec64_to_ktime(request->rq_arrival_time); + timediff_usecs = ktime_us_delta(work_start, arrived); + if (likely(svc->srv_stats != NULL)) { + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR, + timediff_usecs); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR, + svcpt->scp_nreqs_incoming); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR, + svcpt->scp_nreqs_active); + lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT, + at_get(&svcpt->scp_at_estimate)); + } + + if (likely(request->rq_export)) { + if (unlikely(ptlrpc_check_req(request))) + goto put_conn; + ptlrpc_update_export_timer(request->rq_export, + div_u64(timediff_usecs, + USEC_PER_SEC / 2)); + } + + /* + * Discard requests queued for longer than the deadline. + * The deadline is increased if we send an early reply. 
+ */ + if (ktime_get_real_seconds() > request->rq_deadline) { + DEBUG_REQ(D_ERROR, request, + "Dropping timed-out request from %s: deadline %lld/%llds ago", + libcfs_id2str(request->rq_peer), + request->rq_deadline - + request->rq_arrival_time.tv_sec, + ktime_get_real_seconds() - request->rq_deadline); + goto put_conn; + } + + CDEBUG(D_RPCTRACE, + "Handling RPC req@%p pname:cluuid+ref:pid:xid:nid:opc:job %s:%s+%d:%d:x%llu:%s:%d:%s\n", + request, current->comm, + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? + refcount_read(&request->rq_export->exp_handle.h_ref) : -99), + lustre_msg_get_status(request->rq_reqmsg), request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg), + lustre_msg_get_jobid(request->rq_reqmsg) ?: ""); + + if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING) + CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val); + + CDEBUG(D_NET, "got req %llu\n", request->rq_xid); + + /* re-assign request and sesson thread to the current one */ + request->rq_svc_thread = thread; + if (thread != NULL) { + LASSERT(request->rq_session.lc_thread == NULL); + request->rq_session.lc_thread = thread; + thread->t_env->le_ses = &request->rq_session; + } + svc->srv_ops.so_req_handler(request); + + ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE); + +put_conn: + if (unlikely(ktime_get_real_seconds() > request->rq_deadline)) { + DEBUG_REQ(D_WARNING, request, + "Request took longer than estimated (%lld/%llds); client may timeout", + request->rq_deadline - + request->rq_arrival_time.tv_sec, + ktime_get_real_seconds() - request->rq_deadline); + } + + work_end = ktime_get_real(); + timediff_usecs = ktime_us_delta(work_end, work_start); + arrived_usecs = ktime_us_delta(work_end, arrived); + CDEBUG(D_RPCTRACE, + "Handled RPC req@%p pname:cluuid+ref:pid:xid:nid:opc:job %s:%s+%d:%d:x%llu:%s:%d:%s Request processed in %lldus (%lldus total) trans %llu rc %d/%d\n", + request, current->comm, + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? + refcount_read(&request->rq_export->exp_handle.h_ref) : -99), + lustre_msg_get_status(request->rq_reqmsg), + request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg), + lustre_msg_get_jobid(request->rq_reqmsg) ?: "", + timediff_usecs, + arrived_usecs, + (request->rq_repmsg ? + lustre_msg_get_transno(request->rq_repmsg) : + request->rq_transno), + request->rq_status, + (request->rq_repmsg ? + lustre_msg_get_status(request->rq_repmsg) : -999)); + if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { + __u32 op = lustre_msg_get_opc(request->rq_reqmsg); + int opc = opcode_offset(op); + + if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) { + LASSERT(opc < LUSTRE_MAX_OPCODES); + lprocfs_counter_add(svc->srv_stats, + opc + EXTRA_MAX_OPCODES, + timediff_usecs); + } + } + if (unlikely(request->rq_early_count)) { + DEBUG_REQ(D_ADAPTTO, request, + "sent %d early replies before finishing in %llds", + request->rq_early_count, + div_u64(arrived_usecs, USEC_PER_SEC)); + } + + ptlrpc_server_finish_active_request(svcpt, request); + + RETURN(1); +} + +/** + * An internal function to process a single reply state object. 
+ */ +static int ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + struct obd_export *exp; + int nlocks; + int been_handled; + + ENTRY; + + exp = rs->rs_export; + + LASSERT(rs->rs_difficult); + LASSERT(rs->rs_scheduled); + LASSERT(list_empty(&rs->rs_list)); + + /* + * The disk commit callback holds exp_uncommitted_replies_lock while it + * iterates over newly committed replies, removing them from + * exp_uncommitted_replies. It then drops this lock and schedules the + * replies it found for handling here. + * + * We can avoid contention for exp_uncommitted_replies_lock between the + * HRT threads and further commit callbacks by checking rs_committed + * which is set in the commit callback while it holds both + * rs_lock and exp_uncommitted_reples. + * + * If we see rs_committed clear, the commit callback _may_ not have + * handled this reply yet and we race with it to grab + * exp_uncommitted_replies_lock before removing the reply from + * exp_uncommitted_replies. Note that if we lose the race and the + * reply has already been removed, list_del_init() is a noop. + * + * If we see rs_committed set, we know the commit callback is handling, + * or has handled this reply since store reordering might allow us to + * see rs_committed set out of sequence. But since this is done + * holding rs_lock, we can be sure it has all completed once we hold + * rs_lock, which we do right next. + */ + if (!rs->rs_committed) { + /* + * if rs was commited, no need to convert locks, don't check + * rs_committed here because rs may never be added into + * exp_uncommitted_replies and this flag never be set, see + * target_send_reply() + */ + if (rs->rs_convert_lock && + rs->rs_transno > exp->exp_last_committed) { + struct ldlm_lock *lock; + struct ldlm_lock *ack_locks[RS_MAX_LOCKS] = { NULL }; + + spin_lock(&rs->rs_lock); + if (rs->rs_convert_lock && + rs->rs_transno > exp->exp_last_committed) { + nlocks = rs->rs_nlocks; + while (nlocks-- > 0) { + /* + * NB don't assume rs is always handled + * by the same service thread (see + * ptlrpc_hr_select, so REP-ACK hr may + * race with trans commit, while the + * latter will release locks, get locks + * here early to convert to COS mode + * safely. + */ + lock = ldlm_handle2lock( + &rs->rs_locks[nlocks]); + LASSERT(lock); + ack_locks[nlocks] = lock; + rs->rs_modes[nlocks] = LCK_COS; + } + nlocks = rs->rs_nlocks; + rs->rs_convert_lock = 0; + /* + * clear rs_scheduled so that commit callback + * can schedule again + */ + rs->rs_scheduled = 0; + spin_unlock(&rs->rs_lock); + + while (nlocks-- > 0) { + lock = ack_locks[nlocks]; + ldlm_lock_mode_downgrade(lock, LCK_COS); + LDLM_LOCK_PUT(lock); + } + RETURN(0); + } + spin_unlock(&rs->rs_lock); + } + + spin_lock(&exp->exp_uncommitted_replies_lock); + list_del_init(&rs->rs_obd_list); + spin_unlock(&exp->exp_uncommitted_replies_lock); + } + + spin_lock(&exp->exp_lock); + /* Noop if removed already */ + list_del_init(&rs->rs_exp_list); + spin_unlock(&exp->exp_lock); + + spin_lock(&rs->rs_lock); + + been_handled = rs->rs_handled; + rs->rs_handled = 1; + + nlocks = rs->rs_nlocks; /* atomic "steal", but */ + rs->rs_nlocks = 0; /* locks still on rs_locks! 
*/ + + if (nlocks == 0 && !been_handled) { + /* + * If we see this, we should already have seen the warning + * in mds_steal_ack_locks() + */ + CDEBUG(D_HA, + "All locks stolen from rs %p x%lld.t%lld o%d NID %s\n", + rs, rs->rs_xid, rs->rs_transno, rs->rs_opc, + libcfs_nidstr(&exp->exp_connection->c_peer.nid)); + } + + if ((rs->rs_sent && !rs->rs_unlinked) || nlocks > 0) { + spin_unlock(&rs->rs_lock); + + /* We can unlink if the LNET_EVENT_SEND has occurred. + * If rs_unlinked is set then MD is already unlinked and no + * need to do so here. + */ + if ((rs->rs_sent && !rs->rs_unlinked)) { + LNetMDUnlink(rs->rs_md_h); + /* Ignore return code; we're racing with completion */ + } + + while (nlocks-- > 0) + ldlm_lock_decref(&rs->rs_locks[nlocks], + rs->rs_modes[nlocks]); + + spin_lock(&rs->rs_lock); + } + + rs->rs_scheduled = 0; + rs->rs_convert_lock = 0; + + if (rs->rs_unlinked) { + /* Off the net */ + spin_unlock(&rs->rs_lock); + + class_export_put(exp); + rs->rs_export = NULL; + ptlrpc_rs_decref(rs); + if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) && + svc->srv_is_stopping) + wake_up_all(&svcpt->scp_waitq); + RETURN(1); + } + + /* still on the net; callback will schedule */ + spin_unlock(&rs->rs_lock); + RETURN(1); +} + + +static void ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt) +{ + int avail = svcpt->scp_nrqbds_posted; + int low_water = test_req_buffer_pressure ? 0 : + svcpt->scp_service->srv_nbuf_per_group / 2; + + /* NB I'm not locking; just looking. */ + + /* + * CAVEAT EMPTOR: We might be allocating buffers here because we've + * allowed the request history to grow out of control. We could put a + * sanity check on that here and cull some history if we need the + * space. + */ + + if (avail <= low_water) + ptlrpc_grow_req_bufs(svcpt, 1); + + if (svcpt->scp_service->srv_stats) { + lprocfs_counter_add(svcpt->scp_service->srv_stats, + PTLRPC_REQBUF_AVAIL_CNTR, avail); + } +} + +static inline int ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nreqs_active < + svcpt->scp_nthrs_running - 1 - + (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL); +} + +/** + * allowed to create more threads + * user can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_lock to get reliable result + */ +static inline int ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nthrs_running + + svcpt->scp_nthrs_starting < + svcpt->scp_service->srv_nthrs_cpt_limit; +} + +/** + * too many requests and allowed to create more threads + */ +static inline int ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt) +{ + return !ptlrpc_threads_enough(svcpt) && + ptlrpc_threads_increasable(svcpt); +} + +static inline int ptlrpc_thread_stopping(struct ptlrpc_thread *thread) +{ + return thread_is_stopping(thread) || + thread->t_svcpt->scp_service->srv_is_stopping; +} + +/* stop the highest numbered thread if there are too many threads running */ +static inline bool ptlrpc_thread_should_stop(struct ptlrpc_thread *thread) +{ + struct ptlrpc_service_part *svcpt = thread->t_svcpt; + + return thread->t_id >= svcpt->scp_service->srv_nthrs_cpt_limit && + thread->t_id == svcpt->scp_thr_nextid - 1; +} + +static void ptlrpc_stop_thread(struct ptlrpc_thread *thread) +{ + CDEBUG(D_INFO, "Stopping thread %s #%u\n", + thread->t_svcpt->scp_service->srv_thread_name, thread->t_id); + thread_add_flags(thread, SVC_STOPPING); +} + +static inline void ptlrpc_thread_stop(struct ptlrpc_thread *thread) +{ + struct ptlrpc_service_part 
*svcpt = thread->t_svcpt; + + spin_lock(&svcpt->scp_lock); + if (ptlrpc_thread_should_stop(thread)) { + ptlrpc_stop_thread(thread); + svcpt->scp_thr_nextid--; + } + spin_unlock(&svcpt->scp_lock); +} + +static inline int ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt) +{ + return !list_empty(&svcpt->scp_rqbd_idle) && + svcpt->scp_rqbd_timeout == 0; +} + +static inline int +ptlrpc_at_check(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_at_check; +} + +/* + * If a thread runs too long or spends too much time on a single request, + * we want to know about it, so we set up a delayed work item as a watchdog. + * If it fires, we display a stack trace of the delayed thread, + * provided we aren't rate-limited + * + * Watchdog stack traces are limited to 3 per 'libcfs_watchdog_ratelimit' + * seconds + */ +static struct ratelimit_state watchdog_limit; + +static void ptlrpc_watchdog_fire(struct work_struct *w) +{ + struct ptlrpc_thread *thread = container_of(w, struct ptlrpc_thread, + t_watchdog.work); + u64 ms_lapse = ktime_ms_delta(ktime_get(), thread->t_touched); + u32 ms_frac = do_div(ms_lapse, MSEC_PER_SEC); + + /* ___ratelimit() returns true if the action is NOT ratelimited */ + if (__ratelimit(&watchdog_limit)) { + /* below message is checked in sanity-quota.sh test_6,18 */ + LCONSOLE_WARN("%s: service thread pid %u was inactive for %llu.%03u seconds. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:\n", + thread->t_task->comm, thread->t_task->pid, + ms_lapse, ms_frac); + + libcfs_debug_dumpstack(thread->t_task); + } else { + /* below message is checked in sanity-quota.sh test_6,18 */ + LCONSOLE_WARN("%s: service thread pid %u was inactive for %llu.%03u seconds. Watchdog stack traces are limited to 3 per %u seconds, skipping this one.\n", + thread->t_task->comm, thread->t_task->pid, + ms_lapse, ms_frac, libcfs_watchdog_ratelimit); + } +} + +void ptlrpc_watchdog_init(struct delayed_work *work, timeout_t timeout) +{ + INIT_DELAYED_WORK(work, ptlrpc_watchdog_fire); + schedule_delayed_work(work, cfs_time_seconds(timeout)); +} + +void ptlrpc_watchdog_disable(struct delayed_work *work) +{ + cancel_delayed_work_sync(work); +} + +void ptlrpc_watchdog_touch(struct delayed_work *work, timeout_t timeout) +{ + struct ptlrpc_thread *thread = container_of(&work->work, + struct ptlrpc_thread, + t_watchdog.work); + thread->t_touched = ktime_get(); + mod_delayed_work(system_wq, work, cfs_time_seconds(timeout)); +} + +/** + * requests wait on preprocessing + * user can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_lock to get reliable result + */ +static inline int +ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt) +{ + return !list_empty(&svcpt->scp_req_incoming); +} + +static __attribute__((__noinline__)) int +ptlrpc_wait_event(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + ptlrpc_watchdog_disable(&thread->t_watchdog); + + cond_resched(); + + if (svcpt->scp_rqbd_timeout == 0) + /* Don't exit while there are replies to be handled */ + wait_event_idle_exclusive_lifo( + svcpt->scp_waitq, + ptlrpc_thread_stopping(thread) || + ptlrpc_server_request_incoming(svcpt) || + ptlrpc_server_request_pending(svcpt, false) || + ptlrpc_rqbd_pending(svcpt) || + ptlrpc_at_check(svcpt)); + else if (wait_event_idle_exclusive_lifo_timeout( + svcpt->scp_waitq, + ptlrpc_thread_stopping(thread) || + ptlrpc_server_request_incoming(svcpt) || + ptlrpc_server_request_pending(svcpt, false) 
|| + ptlrpc_rqbd_pending(svcpt) || + ptlrpc_at_check(svcpt), + svcpt->scp_rqbd_timeout) == 0) + svcpt->scp_rqbd_timeout = 0; + + if (ptlrpc_thread_stopping(thread)) + return -EINTR; + + ptlrpc_watchdog_touch(&thread->t_watchdog, + ptlrpc_server_get_timeout(svcpt)); + return 0; +} + +/** + * Main thread body for service threads. + * Waits in a loop waiting for new requests to process to appear. + * Every time an incoming requests is added to its queue, a waitq + * is woken up and one of the threads will handle it. + */ +static int ptlrpc_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + struct ptlrpc_service_part *svcpt = thread->t_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_reply_state *rs; + struct group_info *ginfo = NULL; + struct lu_env *env; + int counter = 0, rc = 0; + + ENTRY; + unshare_fs_struct(); + + thread->t_task = current; + thread->t_pid = current->pid; + + if (svc->srv_cpt_bind) { + rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); + if (rc != 0) { + CWARN("%s: failed to bind %s on CPT %d\n", + svc->srv_name, thread->t_name, svcpt->scp_cpt); + } + } + + ginfo = groups_alloc(0); + if (!ginfo) + GOTO(out, rc = -ENOMEM); + + set_current_groups(ginfo); + put_group_info(ginfo); + + if (svc->srv_ops.so_thr_init != NULL) { + rc = svc->srv_ops.so_thr_init(thread); + if (rc) + GOTO(out, rc); + } + + OBD_ALLOC_PTR(env); + if (env == NULL) + GOTO(out_srv_fini, rc = -ENOMEM); + rc = lu_env_add(env); + if (rc) + GOTO(out_env, rc); + + rc = lu_context_init(&env->le_ctx, + svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); + if (rc) + GOTO(out_env_remove, rc); + + thread->t_env = env; + env->le_ctx.lc_thread = thread; + env->le_ctx.lc_cookie = 0x6; + + while (!list_empty(&svcpt->scp_rqbd_idle)) { + rc = ptlrpc_server_post_idle_rqbds(svcpt); + if (rc >= 0) + continue; + + CERROR("Failed to post rqbd for %s on CPT %d: %d\n", + svc->srv_name, svcpt->scp_cpt, rc); + GOTO(out_ctx_fini, rc); + } + + /* Alloc reply state structure for this one */ + OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size); + if (!rs) + GOTO(out_ctx_fini, rc = -ENOMEM); + + spin_lock(&svcpt->scp_lock); + + LASSERT(thread_is_starting(thread)); + thread_clear_flags(thread, SVC_STARTING); + + LASSERT(svcpt->scp_nthrs_starting == 1); + svcpt->scp_nthrs_starting--; + + /* + * SVC_STOPPING may already be set here if someone else is trying + * to stop the service while this new thread has been dynamically + * forked. We still set SVC_RUNNING to let our creator know that + * we are now running, however we will exit as soon as possible + */ + thread_add_flags(thread, SVC_RUNNING); + svcpt->scp_nthrs_running++; + spin_unlock(&svcpt->scp_lock); + + /* wake up our creator in case he's still waiting. */ + wake_up(&thread->t_ctl_waitq); + + thread->t_touched = ktime_get(); + ptlrpc_watchdog_init(&thread->t_watchdog, + ptlrpc_server_get_timeout(svcpt)); + + spin_lock(&svcpt->scp_rep_lock); + list_add(&rs->rs_list, &svcpt->scp_rep_idle); + wake_up(&svcpt->scp_rep_waitq); + spin_unlock(&svcpt->scp_rep_lock); + + CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id, + svcpt->scp_nthrs_running); + + /* XXX maintain a list of all managed devices: insert here */ + while (!ptlrpc_thread_stopping(thread)) { + if (ptlrpc_wait_event(svcpt, thread)) + break; + + ptlrpc_check_rqbd_pool(svcpt); + + if (ptlrpc_threads_need_create(svcpt)) { + /* Ignore return code - we tried... 
*/ + ptlrpc_start_thread(svcpt, 0); + } + + /* reset le_ses to initial state */ + env->le_ses = NULL; + /* Refill the context before execution to make sure + * all thread keys are allocated */ + lu_env_refill(env); + /* Process all incoming reqs before handling any */ + if (ptlrpc_server_request_incoming(svcpt)) { + lu_context_enter(&env->le_ctx); + ptlrpc_server_handle_req_in(svcpt, thread); + lu_context_exit(&env->le_ctx); + + /* but limit ourselves in case of flood */ + if (counter++ < 100) + continue; + counter = 0; + } + + if (ptlrpc_at_check(svcpt)) + ptlrpc_at_check_timed(svcpt); + + if (ptlrpc_server_request_pending(svcpt, false)) { + lu_context_enter(&env->le_ctx); + ptlrpc_server_handle_request(svcpt, thread); + lu_context_exit(&env->le_ctx); + } + + if (ptlrpc_rqbd_pending(svcpt) && + ptlrpc_server_post_idle_rqbds(svcpt) < 0) { + /* + * I just failed to repost request buffers. + * Wait for a timeout (unless something else + * happens) before I try again + */ + svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10; + CDEBUG(D_RPCTRACE, "Posted buffers: %d\n", + svcpt->scp_nrqbds_posted); + } + /* + * If the number of threads has been tuned downward and this + * thread should be stopped, then stop in reverse order so the + * threads always have contiguous thread index values. + */ + if (unlikely(ptlrpc_thread_should_stop(thread))) + ptlrpc_thread_stop(thread); + } + + ptlrpc_watchdog_disable(&thread->t_watchdog); + +out_ctx_fini: + lu_context_fini(&env->le_ctx); +out_env_remove: + lu_env_remove(env); +out_env: + OBD_FREE_PTR(env); +out_srv_fini: + /* deconstruct service thread state created by ptlrpc_start_thread() */ + if (svc->srv_ops.so_thr_done != NULL) + svc->srv_ops.so_thr_done(thread); +out: + CDEBUG(D_RPCTRACE, "%s: service thread [%p:%u] %d exiting: rc = %d\n", + thread->t_name, thread, thread->t_pid, thread->t_id, rc); + spin_lock(&svcpt->scp_lock); + if (thread_test_and_clear_flags(thread, SVC_STARTING)) + svcpt->scp_nthrs_starting--; + + if (thread_test_and_clear_flags(thread, SVC_RUNNING)) { + /* must know immediately */ + svcpt->scp_nthrs_running--; + } + + thread->t_id = rc; + thread_add_flags(thread, SVC_STOPPED); + + wake_up(&thread->t_ctl_waitq); + spin_unlock(&svcpt->scp_lock); + + return rc; +} + +static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt, + struct list_head *replies) +{ + int result; + + spin_lock(&hrt->hrt_lock); + + list_splice_init(&hrt->hrt_queue, replies); + result = ptlrpc_hr.hr_stopping || !list_empty(replies); + + spin_unlock(&hrt->hrt_lock); + return result; +} + +/** + * Main body of "handle reply" function. 
+ * It processes acked reply states + */ +static int ptlrpc_hr_main(void *arg) +{ + struct ptlrpc_hr_thread *hrt = (struct ptlrpc_hr_thread *)arg; + struct ptlrpc_hr_partition *hrp = hrt->hrt_partition; + LIST_HEAD(replies); + struct lu_env *env; + int rc; + + unshare_fs_struct(); + OBD_ALLOC_PTR(env); + if (env == NULL) + RETURN(-ENOMEM); + + rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt); + if (rc != 0) { + char threadname[20]; + + snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d", + hrp->hrp_cpt, hrt->hrt_id); + CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n", + threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc); + } + + rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD | + LCT_REMEMBER | LCT_NOREF); + if (rc) + GOTO(out_env, rc); + + rc = lu_env_add(env); + if (rc) + GOTO(out_ctx_fini, rc); + + atomic_inc(&hrp->hrp_nstarted); + wake_up(&ptlrpc_hr.hr_waitq); + + while (!ptlrpc_hr.hr_stopping) { + wait_event_idle(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies)); + + while (!list_empty(&replies)) { + struct ptlrpc_reply_state *rs; + + rs = list_entry(replies.prev, + struct ptlrpc_reply_state, + rs_list); + list_del_init(&rs->rs_list); + /* refill keys if needed */ + lu_env_refill(env); + lu_context_enter(&env->le_ctx); + ptlrpc_handle_rs(rs); + lu_context_exit(&env->le_ctx); + } + } + + atomic_inc(&hrp->hrp_nstopped); + wake_up(&ptlrpc_hr.hr_waitq); + + lu_env_remove(env); +out_ctx_fini: + lu_context_fini(&env->le_ctx); +out_env: + OBD_FREE_PTR(env); + return 0; +} + +static void ptlrpc_stop_hr_threads(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + int j; + + ptlrpc_hr.hr_stopping = 1; + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs == NULL) + continue; /* uninitialized */ + for (j = 0; j < hrp->hrp_nthrs; j++) + wake_up(&hrp->hrp_thrs[j].hrt_waitq); + } + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs == NULL) + continue; /* uninitialized */ + wait_event(ptlrpc_hr.hr_waitq, + atomic_read(&hrp->hrp_nstopped) == + atomic_read(&hrp->hrp_nstarted)); + } +} + +static int ptlrpc_start_hr_threads(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + int j; + + ENTRY; + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + int rc = 0; + + for (j = 0; j < hrp->hrp_nthrs; j++) { + struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j]; + struct task_struct *task; + + task = kthread_run(ptlrpc_hr_main, + &hrp->hrp_thrs[j], + "ptlrpc_hr%02d_%03d", + hrp->hrp_cpt, + hrt->hrt_id); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + break; + } + } + + wait_event(ptlrpc_hr.hr_waitq, + atomic_read(&hrp->hrp_nstarted) == j); + + if (rc < 0) { + CERROR("cannot start reply handler thread %d:%d: rc = %d\n", + i, j, rc); + ptlrpc_stop_hr_threads(); + RETURN(rc); + } + } + + RETURN(0); +} + +static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_thread *thread; + LIST_HEAD(zombie); + + ENTRY; + + CDEBUG(D_INFO, "Stopping threads for service %s\n", + svcpt->scp_service->srv_name); + + spin_lock(&svcpt->scp_lock); + /* let the thread know that we would like it to stop asap */ + list_for_each_entry(thread, &svcpt->scp_threads, t_link) + ptlrpc_stop_thread(thread); + + wake_up_all(&svcpt->scp_waitq); + + while ((thread = list_first_entry_or_null(&svcpt->scp_threads, + struct ptlrpc_thread, + t_link)) != NULL) { + if (thread_is_stopped(thread)) { + list_move(&thread->t_link, &zombie); + continue; + } + spin_unlock(&svcpt->scp_lock); + + CDEBUG(D_INFO, "waiting for 
stopping-thread %s #%u\n", + svcpt->scp_service->srv_thread_name, thread->t_id); + wait_event_idle(thread->t_ctl_waitq, + thread_is_stopped(thread)); + + spin_lock(&svcpt->scp_lock); + } + + spin_unlock(&svcpt->scp_lock); + + while ((thread = list_first_entry_or_null(&zombie, + struct ptlrpc_thread, + t_link)) != NULL) { + list_del(&thread->t_link); + OBD_FREE_PTR(thread); + } + EXIT; +} + +/** + * Stops all threads of a particular service \a svc + */ +static void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + ENTRY; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service != NULL) + ptlrpc_svcpt_stop_threads(svcpt); + } + + EXIT; +} + +static int ptlrpc_start_threads(struct ptlrpc_service *svc) +{ + int rc = 0; + int i; + int j; + + ENTRY; + + /* We require 2 threads min, see note in ptlrpc_server_handle_request */ + LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT); + + for (i = 0; i < svc->srv_ncpts; i++) { + for (j = 0; j < svc->srv_nthrs_cpt_init; j++) { + rc = ptlrpc_start_thread(svc->srv_parts[i], 1); + if (rc == 0) + continue; + + if (rc != -EMFILE) + goto failed; + /* We have enough threads, don't start more. b=15759 */ + break; + } + } + + RETURN(0); + failed: + CERROR("cannot start %s thread #%d_%d: rc %d\n", + svc->srv_thread_name, i, j, rc); + ptlrpc_stop_all_threads(svc); + RETURN(rc); +} + +static int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait) +{ + struct ptlrpc_thread *thread; + struct ptlrpc_service *svc; + struct task_struct *task; + int rc; + + ENTRY; + + LASSERT(svcpt != NULL); + + svc = svcpt->scp_service; + + CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n", + svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running, + svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit); + + again: + if (unlikely(svc->srv_is_stopping)) + RETURN(-ESRCH); + + if (!ptlrpc_threads_increasable(svcpt) || + (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) && + svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1)) + RETURN(-EMFILE); + + OBD_CPT_ALLOC_PTR(thread, svc->srv_cptable, svcpt->scp_cpt); + if (thread == NULL) + RETURN(-ENOMEM); + init_waitqueue_head(&thread->t_ctl_waitq); + + spin_lock(&svcpt->scp_lock); + if (!ptlrpc_threads_increasable(svcpt)) { + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + RETURN(-EMFILE); + } + + if (svcpt->scp_nthrs_starting != 0) { + /* + * serialize starting because some modules (obdfilter) + * might require unique and contiguous t_id + */ + LASSERT(svcpt->scp_nthrs_starting == 1); + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + if (wait) { + CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n", + svc->srv_thread_name, svcpt->scp_thr_nextid); + schedule(); + goto again; + } + + CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n", + svc->srv_thread_name, svcpt->scp_thr_nextid); + RETURN(-EAGAIN); + } + + svcpt->scp_nthrs_starting++; + thread->t_id = svcpt->scp_thr_nextid++; + thread_add_flags(thread, SVC_STARTING); + thread->t_svcpt = svcpt; + + list_add(&thread->t_link, &svcpt->scp_threads); + spin_unlock(&svcpt->scp_lock); + + if (svcpt->scp_cpt >= 0) { + snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s%02d_%03d", + svc->srv_thread_name, svcpt->scp_cpt, thread->t_id); + } else { + snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s_%04d", + svc->srv_thread_name, thread->t_id); + } + + CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name); + task = kthread_run(ptlrpc_main, thread, "%s", thread->t_name); + if 
(IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("cannot start thread '%s': rc = %d\n", + thread->t_name, rc); + spin_lock(&svcpt->scp_lock); + --svcpt->scp_nthrs_starting; + if (thread_is_stopping(thread)) { + /* + * this ptlrpc_thread is being hanled + * by ptlrpc_svcpt_stop_threads now + */ + thread_add_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + spin_unlock(&svcpt->scp_lock); + } else { + list_del(&thread->t_link); + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + } + RETURN(rc); + } + + if (!wait) + RETURN(0); + + wait_event_idle(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread)); + + rc = thread_is_stopped(thread) ? thread->t_id : 0; + RETURN(rc); +} + +int ptlrpc_hr_init(void) +{ + struct ptlrpc_hr_partition *hrp; + struct ptlrpc_hr_thread *hrt; + int rc; + int cpt; + int i; + int weight; + + ENTRY; + + memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr)); + ptlrpc_hr.hr_cpt_table = cfs_cpt_tab; + + ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table, + sizeof(*hrp)); + if (ptlrpc_hr.hr_partitions == NULL) + RETURN(-ENOMEM); + + ratelimit_state_init(&watchdog_limit, + cfs_time_seconds(libcfs_watchdog_ratelimit), 3); + + init_waitqueue_head(&ptlrpc_hr.hr_waitq); + + preempt_disable(); + weight = cpumask_weight(topology_sibling_cpumask(smp_processor_id())); + preempt_enable(); + + cfs_percpt_for_each(hrp, cpt, ptlrpc_hr.hr_partitions) { + hrp->hrp_cpt = cpt; + + atomic_set(&hrp->hrp_nstarted, 0); + atomic_set(&hrp->hrp_nstopped, 0); + + hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, cpt); + hrp->hrp_nthrs /= weight; + if (hrp->hrp_nthrs == 0) + hrp->hrp_nthrs = 1; + + OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, cpt, + hrp->hrp_nthrs * sizeof(*hrt)); + if (hrp->hrp_thrs == NULL) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < hrp->hrp_nthrs; i++) { + hrt = &hrp->hrp_thrs[i]; + + hrt->hrt_id = i; + hrt->hrt_partition = hrp; + init_waitqueue_head(&hrt->hrt_waitq); + spin_lock_init(&hrt->hrt_lock); + INIT_LIST_HEAD(&hrt->hrt_queue); + } + } + + rc = ptlrpc_start_hr_threads(); +out: + if (rc != 0) + ptlrpc_hr_fini(); + RETURN(rc); +} + +void ptlrpc_hr_fini(void) +{ + struct ptlrpc_hr_partition *hrp; + int cpt; + + if (ptlrpc_hr.hr_partitions == NULL) + return; + + ptlrpc_stop_hr_threads(); + + cfs_percpt_for_each(hrp, cpt, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs) + OBD_FREE_PTR_ARRAY(hrp->hrp_thrs, hrp->hrp_nthrs); + } + + cfs_percpt_free(ptlrpc_hr.hr_partitions); + ptlrpc_hr.hr_partitions = NULL; +} + + +/** + * Wait until all already scheduled replies are processed. + */ +static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt) +{ + while (1) { + if (wait_event_idle_timeout( + svcpt->scp_waitq, + atomic_read(&svcpt->scp_nreps_difficult) == 0, + cfs_time_seconds(10)) > 0) + break; + CWARN("Unexpectedly long timeout %s %p\n", + svcpt->scp_service->srv_name, svcpt->scp_service); + } +} + +static void +ptlrpc_service_del_atimer(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + /* early disarm AT timer... 
*/ + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service != NULL) + del_timer(&svcpt->scp_at_timer); + } +} + +static void +ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request_buffer_desc *rqbd; + int rc; + int i; + + /* + * All history will be culled when the next request buffer is + * freed in ptlrpc_service_purge_all() + */ + svc->srv_hist_nrqbds_cpt_max = 0; + + rc = LNetClearLazyPortal(svc->srv_req_portal); + LASSERT(rc == 0); + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* + * Unlink all the request buffers. This forces a 'final' + * event with its 'unlink' flag set for each posted rqbd + */ + list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted, + rqbd_list) { + rc = LNetMDUnlink(rqbd->rqbd_md_h); + LASSERT(rc == 0 || rc == -ENOENT); + } + } + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* + * Wait for the network to release any buffers + * it's currently filling + */ + spin_lock(&svcpt->scp_lock); + while (svcpt->scp_nrqbds_posted != 0) { + int seconds = PTLRPC_REQ_LONG_UNLINK; + + spin_unlock(&svcpt->scp_lock); + /* + * Network access will complete in finite time but + * the HUGE timeout lets us CWARN for visibility + * of sluggish NALs + */ + while (seconds > 0 && + wait_event_idle_timeout( + svcpt->scp_waitq, + svcpt->scp_nrqbds_posted == 0, + cfs_time_seconds(1)) == 0) + seconds -= 1; + if (seconds == 0) { + CWARN("Service %s waiting for request buffers\n", + svcpt->scp_service->srv_name); + } + spin_lock(&svcpt->scp_lock); + } + spin_unlock(&svcpt->scp_lock); + } +} + +static void +ptlrpc_service_purge_all(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request_buffer_desc *rqbd; + struct ptlrpc_request *req; + struct ptlrpc_reply_state *rs; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + spin_lock(&svcpt->scp_rep_lock); + while ((rs = list_first_entry_or_null(&svcpt->scp_rep_active, + struct ptlrpc_reply_state, + rs_list)) != NULL) { + spin_lock(&rs->rs_lock); + ptlrpc_schedule_difficult_reply(rs); + spin_unlock(&rs->rs_lock); + } + spin_unlock(&svcpt->scp_rep_lock); + + /* + * purge the request queue. NB No new replies (rqbds + * all unlinked) and no service threads, so I'm the only + * thread noodling the request queue now + */ + while ((req = list_first_entry_or_null(&svcpt->scp_req_incoming, + struct ptlrpc_request, + rq_list)) != NULL) { + list_del(&req->rq_list); + svcpt->scp_nreqs_incoming--; + ptlrpc_server_finish_request(svcpt, req); + } + + while (ptlrpc_server_request_pending(svcpt, true)) { + req = ptlrpc_server_request_get(svcpt, true); + ptlrpc_server_finish_active_request(svcpt, req); + } + + /* + * The portal may be shared by several services (eg:OUT_PORTAL). + * So the request could be referenced by other target. So we + * have to wait the ptlrpc_server_drop_request invoked. + * + * TODO: move the req_buffer as global rather than per service. 
+ */ + spin_lock(&svcpt->scp_lock); + while (!list_empty(&svcpt->scp_rqbd_posted)) { + spin_unlock(&svcpt->scp_lock); + wait_event_idle_timeout(svcpt->scp_waitq, + list_empty(&svcpt->scp_rqbd_posted), + cfs_time_seconds(1)); + spin_lock(&svcpt->scp_lock); + } + spin_unlock(&svcpt->scp_lock); + + LASSERT(svcpt->scp_nreqs_incoming == 0); + LASSERT(svcpt->scp_nreqs_active == 0); + /* + * history should have been culled by + * ptlrpc_server_finish_request + */ + LASSERT(svcpt->scp_hist_nrqbds == 0); + + /* + * Now free all the request buffers since nothing + * references them any more... + */ + while ((rqbd = list_first_entry_or_null(&svcpt->scp_rqbd_idle, + struct ptlrpc_request_buffer_desc, + rqbd_list)) != NULL) + ptlrpc_free_rqbd(rqbd); + + ptlrpc_wait_replies(svcpt); + + while ((rs = list_first_entry_or_null(&svcpt->scp_rep_idle, + struct ptlrpc_reply_state, + rs_list)) != NULL) { + list_del(&rs->rs_list); + OBD_FREE_LARGE(rs, svc->srv_max_reply_size); + } + } +} + +static void +ptlrpc_service_free(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_at_array *array; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* In case somebody rearmed this in the meantime */ + del_timer(&svcpt->scp_at_timer); + array = &svcpt->scp_at_array; + + if (array->paa_reqs_array != NULL) { + OBD_FREE_PTR_ARRAY(array->paa_reqs_array, + array->paa_size); + array->paa_reqs_array = NULL; + } + + if (array->paa_reqs_count != NULL) { + OBD_FREE_PTR_ARRAY(array->paa_reqs_count, + array->paa_size); + array->paa_reqs_count = NULL; + } + } + + ptlrpc_service_for_each_part(svcpt, i, svc) + OBD_FREE_PTR(svcpt); + + if (svc->srv_cpts != NULL) + cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts); + + OBD_FREE(svc, offsetof(struct ptlrpc_service, + srv_parts[svc->srv_ncpts])); +} + +int ptlrpc_unregister_service(struct ptlrpc_service *service) +{ + ENTRY; + + CDEBUG(D_NET, "%s: tearing down\n", service->srv_name); + + service->srv_is_stopping = 1; + + mutex_lock(&ptlrpc_all_services_mutex); + list_del_init(&service->srv_list); + mutex_unlock(&ptlrpc_all_services_mutex); + + ptlrpc_service_del_atimer(service); + ptlrpc_stop_all_threads(service); + + ptlrpc_service_unlink_rqbd(service); + ptlrpc_service_purge_all(service); + ptlrpc_service_nrs_cleanup(service); + + ptlrpc_lprocfs_unregister_service(service); + ptlrpc_sysfs_unregister_service(service); + + ptlrpc_service_free(service); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_unregister_service); + +/** + * Returns 0 if the service is healthy. + * + * Right now, it just checks to make sure that requests aren't languishing + * in the queue. We'll use this health check to govern whether a node needs + * to be shot, so it's intentionally non-aggressive. + */ +static int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_request *request = NULL; + struct timespec64 right_now; + struct timespec64 timediff; + + ktime_get_real_ts64(&right_now); + + spin_lock(&svcpt->scp_req_lock); + /* How long has the next entry been waiting? */ + if (ptlrpc_server_high_pending(svcpt, true)) + request = ptlrpc_nrs_req_peek_nolock(svcpt, true); + else if (ptlrpc_server_normal_pending(svcpt, true)) + request = ptlrpc_nrs_req_peek_nolock(svcpt, false); + + if (request == NULL) { + spin_unlock(&svcpt->scp_req_lock); + return 0; + } + + timediff = timespec64_sub(right_now, request->rq_arrival_time); + spin_unlock(&svcpt->scp_req_lock); + + if ((timediff.tv_sec) > + (AT_OFF ? 
obd_timeout * 3 / 2 : at_max)) { + CERROR("%s: unhealthy - request has been waiting %llds\n", + svcpt->scp_service->srv_name, (s64)timediff.tv_sec); + return -1; + } + + return 0; +} + +int +ptlrpc_service_health_check(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + if (svc == NULL) + return 0; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + int rc = ptlrpc_svcpt_health_check(svcpt); + + if (rc != 0) + return rc; + } + return 0; +} +EXPORT_SYMBOL(ptlrpc_service_health_check); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c new file mode 100644 index 0000000000000..c196eb280f4d5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c @@ -0,0 +1,46 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +# include +# include +#endif /* CONFIG_LUSTRE_FS_POSIX_ACL */ + +#include +#include +#include +#include +#include +#include +#include + diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c new file mode 100644 index 0000000000000..71bf668c1295b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c @@ -0,0 +1,6095 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +# include +# include +#endif /* CONFIG_LUSTRE_FS_POSIX_ACL */ + +#include +#include +#include +#include +#include +#include +#include + + +void lustre_assert_wire_constants(void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * (make -C lustre/utils newwiretest) + */ + + /* Constants... */ + LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n", + (long long)PTL_RPC_MSG_REQUEST); + LASSERTF(PTL_RPC_MSG_ERR == 4712, "found %lld\n", + (long long)PTL_RPC_MSG_ERR); + LASSERTF(PTL_RPC_MSG_REPLY == 4713, "found %lld\n", + (long long)PTL_RPC_MSG_REPLY); + LASSERTF(MDS_DIR_END_OFF == 0xfffffffffffffffeULL, "found 0x%.16llxULL\n", + MDS_DIR_END_OFF); + LASSERTF(DEAD_HANDLE_MAGIC == 0xdeadbeefcafebabeULL, "found 0x%.16llxULL\n", + DEAD_HANDLE_MAGIC); + BUILD_BUG_ON(MTI_NAME_MAXLEN != 64); + LASSERTF(OST_REPLY == 0, "found %lld\n", + (long long)OST_REPLY); + LASSERTF(OST_GETATTR == 1, "found %lld\n", + (long long)OST_GETATTR); + LASSERTF(OST_SETATTR == 2, "found %lld\n", + (long long)OST_SETATTR); + LASSERTF(OST_READ == 3, "found %lld\n", + (long long)OST_READ); + LASSERTF(OST_WRITE == 4, "found %lld\n", + (long long)OST_WRITE); + LASSERTF(OST_CREATE == 5, "found %lld\n", + (long long)OST_CREATE); + LASSERTF(OST_DESTROY == 6, "found %lld\n", + (long long)OST_DESTROY); + LASSERTF(OST_GET_INFO == 7, "found %lld\n", + (long long)OST_GET_INFO); + LASSERTF(OST_CONNECT == 8, "found %lld\n", + (long long)OST_CONNECT); + LASSERTF(OST_DISCONNECT == 9, "found %lld\n", + (long long)OST_DISCONNECT); + LASSERTF(OST_PUNCH == 10, "found %lld\n", + (long long)OST_PUNCH); + LASSERTF(OST_OPEN == 11, "found %lld\n", + (long long)OST_OPEN); + LASSERTF(OST_CLOSE == 12, "found %lld\n", + (long long)OST_CLOSE); + LASSERTF(OST_STATFS == 13, "found %lld\n", + (long long)OST_STATFS); + LASSERTF(OST_SYNC == 16, "found %lld\n", + (long long)OST_SYNC); + LASSERTF(OST_SET_INFO == 17, "found %lld\n", + (long long)OST_SET_INFO); + LASSERTF(OST_QUOTACHECK == 18, "found %lld\n", + (long long)OST_QUOTACHECK); + LASSERTF(OST_QUOTACTL == 19, "found %lld\n", + (long long)OST_QUOTACTL); + LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, "found %lld\n", + (long long)OST_QUOTA_ADJUST_QUNIT); + LASSERTF(OST_LADVISE == 21, "found %lld\n", + (long long)OST_LADVISE); + LASSERTF(OST_FALLOCATE == 22, "found %lld\n", + (long long)OST_FALLOCATE); + LASSERTF(OST_SEEK == 23, "found %lld\n", + (long long)OST_SEEK); + LASSERTF(OST_LAST_OPC == 24, "found %lld\n", + (long long)OST_LAST_OPC); + LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", + OBD_OBJECT_EOF); + LASSERTF(OST_MIN_PRECREATE == 32, "found %lld\n", + (long long)OST_MIN_PRECREATE); + LASSERTF(OST_MAX_PRECREATE == 20000, "found %lld\n", + (long long)OST_MAX_PRECREATE); + LASSERTF(OST_LVB_ERR_INIT == 0xffbadbad80000000ULL, "found 0x%.16llxULL\n", + OST_LVB_ERR_INIT); + LASSERTF(OST_LVB_ERR_MASK == 0xffbadbad00000000ULL, "found 0x%.16llxULL\n", + OST_LVB_ERR_MASK); + LASSERTF(MDS_FIRST_OPC == 33, "found %lld\n", + (long long)MDS_FIRST_OPC); + LASSERTF(MDS_GETATTR == 33, "found %lld\n", + (long long)MDS_GETATTR); + LASSERTF(MDS_GETATTR_NAME == 34, "found %lld\n", + (long long)MDS_GETATTR_NAME); + LASSERTF(MDS_CLOSE == 35, "found %lld\n", + (long long)MDS_CLOSE); + LASSERTF(MDS_REINT == 36, "found %lld\n", + (long long)MDS_REINT); + LASSERTF(MDS_READPAGE == 37, "found %lld\n", + (long long)MDS_READPAGE); + LASSERTF(MDS_CONNECT == 
38, "found %lld\n", + (long long)MDS_CONNECT); + LASSERTF(MDS_DISCONNECT == 39, "found %lld\n", + (long long)MDS_DISCONNECT); + LASSERTF(MDS_GET_ROOT == 40, "found %lld\n", + (long long)MDS_GET_ROOT); + LASSERTF(MDS_STATFS == 41, "found %lld\n", + (long long)MDS_STATFS); + LASSERTF(MDS_PIN == 42, "found %lld\n", + (long long)MDS_PIN); + LASSERTF(MDS_UNPIN == 43, "found %lld\n", + (long long)MDS_UNPIN); + LASSERTF(MDS_SYNC == 44, "found %lld\n", + (long long)MDS_SYNC); + LASSERTF(MDS_DONE_WRITING == 45, "found %lld\n", + (long long)MDS_DONE_WRITING); + LASSERTF(MDS_SET_INFO == 46, "found %lld\n", + (long long)MDS_SET_INFO); + LASSERTF(MDS_QUOTACHECK == 47, "found %lld\n", + (long long)MDS_QUOTACHECK); + LASSERTF(MDS_QUOTACTL == 48, "found %lld\n", + (long long)MDS_QUOTACTL); + LASSERTF(MDS_GETXATTR == 49, "found %lld\n", + (long long)MDS_GETXATTR); + LASSERTF(MDS_SETXATTR == 50, "found %lld\n", + (long long)MDS_SETXATTR); + LASSERTF(MDS_WRITEPAGE == 51, "found %lld\n", + (long long)MDS_WRITEPAGE); + LASSERTF(MDS_IS_SUBDIR == 52, "found %lld\n", + (long long)MDS_IS_SUBDIR); + LASSERTF(MDS_GET_INFO == 53, "found %lld\n", + (long long)MDS_GET_INFO); + LASSERTF(MDS_HSM_STATE_GET == 54, "found %lld\n", + (long long)MDS_HSM_STATE_GET); + LASSERTF(MDS_HSM_STATE_SET == 55, "found %lld\n", + (long long)MDS_HSM_STATE_SET); + LASSERTF(MDS_HSM_ACTION == 56, "found %lld\n", + (long long)MDS_HSM_ACTION); + LASSERTF(MDS_HSM_PROGRESS == 57, "found %lld\n", + (long long)MDS_HSM_PROGRESS); + LASSERTF(MDS_HSM_REQUEST == 58, "found %lld\n", + (long long)MDS_HSM_REQUEST); + LASSERTF(MDS_HSM_CT_REGISTER == 59, "found %lld\n", + (long long)MDS_HSM_CT_REGISTER); + LASSERTF(MDS_HSM_CT_UNREGISTER == 60, "found %lld\n", + (long long)MDS_HSM_CT_UNREGISTER); + LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n", + (long long)MDS_SWAP_LAYOUTS); + LASSERTF(MDS_RMFID == 62, "found %lld\n", + (long long)MDS_RMFID); + LASSERTF(MDS_LAST_OPC == 63, "found %lld\n", + (long long)MDS_LAST_OPC); + LASSERTF(REINT_SETATTR == 1, "found %lld\n", + (long long)REINT_SETATTR); + LASSERTF(REINT_CREATE == 2, "found %lld\n", + (long long)REINT_CREATE); + LASSERTF(REINT_LINK == 3, "found %lld\n", + (long long)REINT_LINK); + LASSERTF(REINT_UNLINK == 4, "found %lld\n", + (long long)REINT_UNLINK); + LASSERTF(REINT_RENAME == 5, "found %lld\n", + (long long)REINT_RENAME); + LASSERTF(REINT_OPEN == 6, "found %lld\n", + (long long)REINT_OPEN); + LASSERTF(REINT_SETXATTR == 7, "found %lld\n", + (long long)REINT_SETXATTR); + LASSERTF(REINT_RMENTRY == 8, "found %lld\n", + (long long)REINT_RMENTRY); + LASSERTF(REINT_MIGRATE == 9, "found %lld\n", + (long long)REINT_MIGRATE); + LASSERTF(REINT_MAX == 11, "found %lld\n", + (long long)REINT_MAX); + LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)DISP_IT_EXECD); + LASSERTF(DISP_LOOKUP_EXECD == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_EXECD); + LASSERTF(DISP_LOOKUP_NEG == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_NEG); + LASSERTF(DISP_LOOKUP_POS == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_POS); + LASSERTF(DISP_OPEN_CREATE == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_CREATE); + LASSERTF(DISP_OPEN_OPEN == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_OPEN); + LASSERTF(DISP_ENQ_COMPLETE == 0x00400000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_COMPLETE); + LASSERTF(DISP_ENQ_OPEN_REF == 0x00800000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_OPEN_REF); + LASSERTF(DISP_ENQ_CREATE_REF == 0x01000000UL, "found 0x%.8xUL\n", + 
(unsigned)DISP_ENQ_CREATE_REF); + LASSERTF(DISP_OPEN_LOCK == 0x02000000UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_LOCK); + LASSERTF(MDS_STATUS_CONN == 1, "found %lld\n", + (long long)MDS_STATUS_CONN); + LASSERTF(MDS_STATUS_LOV == 2, "found %lld\n", + (long long)MDS_STATUS_LOV); + LASSERTF(MDS_ATTR_MODE == 0x0000000000000001ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MODE); + LASSERTF(MDS_ATTR_UID == 0x0000000000000002ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_UID); + LASSERTF(MDS_ATTR_GID == 0x0000000000000004ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_GID); + LASSERTF(MDS_ATTR_SIZE == 0x0000000000000008ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_SIZE); + LASSERTF(MDS_ATTR_ATIME == 0x0000000000000010ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATIME); + LASSERTF(MDS_ATTR_MTIME == 0x0000000000000020ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MTIME); + LASSERTF(MDS_ATTR_CTIME == 0x0000000000000040ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_CTIME); + LASSERTF(MDS_ATTR_ATIME_SET == 0x0000000000000080ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATIME_SET); + LASSERTF(MDS_ATTR_MTIME_SET == 0x0000000000000100ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MTIME_SET); + LASSERTF(MDS_ATTR_FORCE == 0x0000000000000200ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_FORCE); + LASSERTF(MDS_ATTR_ATTR_FLAG == 0x0000000000000400ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATTR_FLAG); + LASSERTF(MDS_ATTR_KILL_SUID == 0x0000000000000800ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_KILL_SUID); + LASSERTF(MDS_ATTR_KILL_SGID == 0x0000000000001000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_KILL_SGID); + LASSERTF(MDS_ATTR_CTIME_SET == 0x0000000000002000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_CTIME_SET); + LASSERTF(MDS_ATTR_FROM_OPEN == 0x0000000000004000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_FROM_OPEN); + LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_BLOCKS); + LASSERTF(MDS_ATTR_PROJID == 0x0000000000010000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_PROJID); + LASSERTF(MDS_ATTR_LSIZE == 0x0000000000020000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_LSIZE); + LASSERTF(MDS_ATTR_LBLOCKS == 0x0000000000040000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_LBLOCKS); + LASSERTF(MDS_ATTR_OVERRIDE == 0x0000000002000000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_OVERRIDE); + LASSERTF(FLD_QUERY == 900, "found %lld\n", + (long long)FLD_QUERY); + LASSERTF(FLD_READ == 901, "found %lld\n", + (long long)FLD_READ); + LASSERTF(FLD_FIRST_OPC == 900, "found %lld\n", + (long long)FLD_FIRST_OPC); + LASSERTF(FLD_LAST_OPC == 902, "found %lld\n", + (long long)FLD_LAST_OPC); + LASSERTF(SEQ_QUERY == 700, "found %lld\n", + (long long)SEQ_QUERY); + LASSERTF(SEQ_FIRST_OPC == 700, "found %lld\n", + (long long)SEQ_FIRST_OPC); + LASSERTF(SEQ_LAST_OPC == 701, "found %lld\n", + (long long)SEQ_LAST_OPC); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF(LFSCK_NOTIFY == 1101, "found %lld\n", + (long long)LFSCK_NOTIFY); + LASSERTF(LFSCK_QUERY == 1102, "found %lld\n", + (long long)LFSCK_QUERY); + LASSERTF(LFSCK_FIRST_OPC == 1101, "found %lld\n", + (long long)LFSCK_FIRST_OPC); + LASSERTF(LFSCK_LAST_OPC == 1103, "found %lld\n", + (long long)LFSCK_LAST_OPC); +#endif /* HAVE_SERVER_SUPPORT */ + LASSERTF(SEQ_ALLOC_SUPER == 0, "found %lld\n", + (long long)SEQ_ALLOC_SUPER); + LASSERTF(SEQ_ALLOC_META == 1, "found %lld\n", + (long long)SEQ_ALLOC_META); + LASSERTF(LDLM_ENQUEUE == 101, "found 
%lld\n", + (long long)LDLM_ENQUEUE); + LASSERTF(LDLM_CONVERT == 102, "found %lld\n", + (long long)LDLM_CONVERT); + LASSERTF(LDLM_CANCEL == 103, "found %lld\n", + (long long)LDLM_CANCEL); + LASSERTF(LDLM_BL_CALLBACK == 104, "found %lld\n", + (long long)LDLM_BL_CALLBACK); + LASSERTF(LDLM_CP_CALLBACK == 105, "found %lld\n", + (long long)LDLM_CP_CALLBACK); + LASSERTF(LDLM_GL_CALLBACK == 106, "found %lld\n", + (long long)LDLM_GL_CALLBACK); + LASSERTF(LDLM_SET_INFO == 107, "found %lld\n", + (long long)LDLM_SET_INFO); + LASSERTF(LDLM_LAST_OPC == 108, "found %lld\n", + (long long)LDLM_LAST_OPC); + LASSERTF(LCK_MINMODE == 0, "found %lld\n", + (long long)LCK_MINMODE); + LASSERTF(LCK_EX == 1, "found %lld\n", + (long long)LCK_EX); + LASSERTF(LCK_PW == 2, "found %lld\n", + (long long)LCK_PW); + LASSERTF(LCK_PR == 4, "found %lld\n", + (long long)LCK_PR); + LASSERTF(LCK_CW == 8, "found %lld\n", + (long long)LCK_CW); + LASSERTF(LCK_CR == 16, "found %lld\n", + (long long)LCK_CR); + LASSERTF(LCK_NL == 32, "found %lld\n", + (long long)LCK_NL); + LASSERTF(LCK_GROUP == 64, "found %lld\n", + (long long)LCK_GROUP); + LASSERTF(LCK_COS == 128, "found %lld\n", + (long long)LCK_COS); + LASSERTF(LCK_MAXMODE == 129, "found %lld\n", + (long long)LCK_MAXMODE); + LASSERTF(LCK_MODE_NUM == 8, "found %lld\n", + (long long)LCK_MODE_NUM); + BUILD_BUG_ON(LDLM_PLAIN != 10); + BUILD_BUG_ON(LDLM_EXTENT != 11); + BUILD_BUG_ON(LDLM_FLOCK != 12); + BUILD_BUG_ON(LDLM_IBITS != 13); + BUILD_BUG_ON(LDLM_MAX_TYPE != 14); + BUILD_BUG_ON(LUSTRE_RES_ID_SEQ_OFF != 0); + BUILD_BUG_ON(LUSTRE_RES_ID_VER_OID_OFF != 1); + BUILD_BUG_ON(LUSTRE_RES_ID_QUOTA_SEQ_OFF != 2); + BUILD_BUG_ON(LUSTRE_RES_ID_QUOTA_VER_OID_OFF != 3); + BUILD_BUG_ON(LUSTRE_RES_ID_HSH_OFF != 3); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF(OUT_UPDATE == 1000, "found %lld\n", + (long long)OUT_UPDATE); + LASSERTF(OUT_UPDATE_LAST_OPC == 1001, "found %lld\n", + (long long)OUT_UPDATE_LAST_OPC); + BUILD_BUG_ON(LQUOTA_TYPE_USR != 0); + BUILD_BUG_ON(LQUOTA_TYPE_GRP != 1); + BUILD_BUG_ON(LQUOTA_RES_MD != 1); + BUILD_BUG_ON(LQUOTA_RES_DT != 2); +#endif /* HAVE_SERVER_SUPPORT */ + LASSERTF(OBD_PING == 400, "found %lld\n", + (long long)OBD_PING); + LASSERTF(OBD_IDX_READ == 403, "found %lld\n", + (long long)OBD_IDX_READ); + LASSERTF(OBD_LAST_OPC == 404, "found %lld\n", + (long long)OBD_LAST_OPC); + LASSERTF(QUOTA_DQACQ == 601, "found %lld\n", + (long long)QUOTA_DQACQ); + LASSERTF(QUOTA_DQREL == 602, "found %lld\n", + (long long)QUOTA_DQREL); + LASSERTF(QUOTA_LAST_OPC == 603, "found %lld\n", + (long long)QUOTA_LAST_OPC); + LASSERTF(MGS_CONNECT == 250, "found %lld\n", + (long long)MGS_CONNECT); + LASSERTF(MGS_DISCONNECT == 251, "found %lld\n", + (long long)MGS_DISCONNECT); + LASSERTF(MGS_EXCEPTION == 252, "found %lld\n", + (long long)MGS_EXCEPTION); + LASSERTF(MGS_TARGET_REG == 253, "found %lld\n", + (long long)MGS_TARGET_REG); + LASSERTF(MGS_TARGET_DEL == 254, "found %lld\n", + (long long)MGS_TARGET_DEL); + LASSERTF(MGS_SET_INFO == 255, "found %lld\n", + (long long)MGS_SET_INFO); + LASSERTF(MGS_CONFIG_READ == 256, "found %lld\n", + (long long)MGS_CONFIG_READ); + LASSERTF(MGS_LAST_OPC == 257, "found %lld\n", + (long long)MGS_LAST_OPC); + LASSERTF(SEC_CTX_INIT == 801, "found %lld\n", + (long long)SEC_CTX_INIT); + LASSERTF(SEC_CTX_INIT_CONT == 802, "found %lld\n", + (long long)SEC_CTX_INIT_CONT); + LASSERTF(SEC_CTX_FINI == 803, "found %lld\n", + (long long)SEC_CTX_FINI); + LASSERTF(SEC_LAST_OPC == 804, "found %lld\n", + (long long)SEC_LAST_OPC); + /* Sizes and Offsets */ + + /* Checks for struct 
obd_uuid */ + LASSERTF((int)sizeof(struct obd_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(struct obd_uuid)); + + /* Checks for struct lu_seq_range */ + LASSERTF((int)sizeof(struct lu_seq_range) == 24, "found %lld\n", + (long long)(int)sizeof(struct lu_seq_range)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_start)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_start)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_end)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_end)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_index) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_index)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_index)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_flags) == 20, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_flags)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_flags)); + LASSERTF(LU_SEQ_RANGE_MDT == 0, "found %lld\n", + (long long)LU_SEQ_RANGE_MDT); + LASSERTF(LU_SEQ_RANGE_OST == 1, "found %lld\n", + (long long)LU_SEQ_RANGE_OST); + + /* Checks for struct lustre_som_attrs */ + LASSERTF((int)sizeof(struct lustre_som_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct lustre_som_attrs)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_valid)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_valid) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_valid)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_reserved) == 2, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_reserved)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_reserved) == 6, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_reserved)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_size) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_size)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_size)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_blocks) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_blocks)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_blocks)); +#ifdef HAVE_SERVER_SUPPORT + + /* Checks for struct lustre_mdt_attrs */ + LASSERTF((int)sizeof(struct lustre_mdt_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct lustre_mdt_attrs)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_compat) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_compat)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, 
lma_incompat) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_incompat)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_self_fid) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_self_fid)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid)); + LASSERTF(LMAC_HSM == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LMAC_HSM); + LASSERTF(LMAC_NOT_IN_OI == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LMAC_NOT_IN_OI); + LASSERTF(LMAC_FID_ON_OST == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LMAC_FID_ON_OST); + LASSERTF(LMAC_STRIPE_INFO == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LMAC_STRIPE_INFO); + LASSERTF(LMAC_COMP_INFO == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)LMAC_COMP_INFO); + LASSERTF(LMAC_IDX_BACKUP == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)LMAC_IDX_BACKUP); + LASSERTF(LMAI_RELEASED == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LMAI_RELEASED); + LASSERTF(LMAI_AGENT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LMAI_AGENT); + LASSERTF(LMAI_REMOTE_PARENT == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LMAI_REMOTE_PARENT); + LASSERTF(LMAI_STRIPED == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LMAI_STRIPED); + LASSERTF(LMAI_ORPHAN == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LMAI_ORPHAN); + LASSERTF(LMAI_ENCRYPT == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)LMAI_ENCRYPT); + + /* Checks for struct lustre_ost_attrs */ + LASSERTF((int)sizeof(struct lustre_ost_attrs) == 64, "found %lld\n", + (long long)(int)sizeof(struct lustre_ost_attrs)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_lma) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_lma)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_lma) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_lma)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_parent_fid) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_parent_fid)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_parent_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_parent_fid)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_stripe_size) == 40, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_stripe_size)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_stripe_size)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_comp_id) == 44, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_comp_id)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_id)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_comp_start) == 48, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_comp_start)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_start)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_comp_end) == 56, "found %lld\n", + (long 
long)(int)offsetof(struct lustre_ost_attrs, loa_comp_end)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_end)); + LASSERTF(OUT_CREATE == 1, "found %lld\n", + (long long)OUT_CREATE); + LASSERTF(OUT_DESTROY == 2, "found %lld\n", + (long long)OUT_DESTROY); + LASSERTF(OUT_REF_ADD == 3, "found %lld\n", + (long long)OUT_REF_ADD); + LASSERTF(OUT_REF_DEL == 4, "found %lld\n", + (long long)OUT_REF_DEL); + LASSERTF(OUT_ATTR_SET == 5, "found %lld\n", + (long long)OUT_ATTR_SET); + LASSERTF(OUT_ATTR_GET == 6, "found %lld\n", + (long long)OUT_ATTR_GET); + LASSERTF(OUT_XATTR_SET == 7, "found %lld\n", + (long long)OUT_XATTR_SET); + LASSERTF(OUT_XATTR_GET == 8, "found %lld\n", + (long long)OUT_XATTR_GET); + LASSERTF(OUT_INDEX_LOOKUP == 9, "found %lld\n", + (long long)OUT_INDEX_LOOKUP); + LASSERTF(OUT_INDEX_LOOKUP == 9, "found %lld\n", + (long long)OUT_INDEX_LOOKUP); + LASSERTF(OUT_INDEX_INSERT == 10, "found %lld\n", + (long long)OUT_INDEX_INSERT); + LASSERTF(OUT_INDEX_DELETE == 11, "found %lld\n", + (long long)OUT_INDEX_DELETE); + LASSERTF(OUT_WRITE == 12, "found %lld\n", + (long long)OUT_WRITE); + LASSERTF(OUT_XATTR_DEL == 13, "found %lld\n", + (long long)OUT_XATTR_DEL); + LASSERTF(OUT_PUNCH == 14, "found %lld\n", + (long long)OUT_PUNCH); + LASSERTF(OUT_READ == 15, "found %lld\n", + (long long)OUT_READ); + LASSERTF(OUT_NOOP == 16, "found %lld\n", + (long long)OUT_NOOP); + LASSERTF(OUT_XATTR_LIST == 17, "found %lld\n", + (long long)OUT_XATTR_LIST); + + /* Checks for struct hsm_attrs */ + LASSERTF((int)sizeof(struct hsm_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_attrs)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_compat) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_compat)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_compat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_compat)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_flags)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_flags)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_arch_id)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_ver) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_arch_ver)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver)); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct ost_id */ + LASSERTF((int)sizeof(struct ost_id) == 16, "found %lld\n", + (long long)(int)sizeof(struct ost_id)); + LASSERTF((int)offsetof(struct ost_id, oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_id, oi)); + LASSERTF((int)sizeof(((struct ost_id *)0)->oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ost_id *)0)->oi)); + LASSERTF(LUSTRE_FID_INIT_OID == 1, "found %lld\n", + (long long)LUSTRE_FID_INIT_OID); + LASSERTF(FID_SEQ_OST_MDT0 == 0, "found %lld\n", + (long long)FID_SEQ_OST_MDT0); + LASSERTF(FID_SEQ_LLOG == 1, "found %lld\n", + (long long)FID_SEQ_LLOG); + LASSERTF(FID_SEQ_ECHO == 2, "found %lld\n", + 
(long long)FID_SEQ_ECHO); + LASSERTF(FID_SEQ_UNUSED_START == 3, "found %lld\n", + (long long)FID_SEQ_UNUSED_START); + LASSERTF(FID_SEQ_UNUSED_END == 9, "found %lld\n", + (long long)FID_SEQ_UNUSED_END); + LASSERTF(FID_SEQ_LLOG_NAME == 10, "found %lld\n", + (long long)FID_SEQ_LLOG_NAME); + LASSERTF(FID_SEQ_RSVD == 11, "found %lld\n", + (long long)FID_SEQ_RSVD); + LASSERTF(FID_SEQ_IGIF == 12, "found %lld\n", + (long long)FID_SEQ_IGIF); + LASSERTF(FID_SEQ_IGIF_MAX == 0x00000000ffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IGIF_MAX); + LASSERTF(FID_SEQ_IDIF == 0x0000000100000000ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IDIF); + LASSERTF(FID_SEQ_IDIF_MAX == 0x00000001ffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IDIF_MAX); + LASSERTF(FID_SEQ_START == 0x0000000200000000ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_START); + LASSERTF(FID_SEQ_LOCAL_FILE == 0x0000000200000001ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOCAL_FILE); + LASSERTF(FID_SEQ_DOT_LUSTRE == 0x0000000200000002ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_DOT_LUSTRE); + LASSERTF(FID_SEQ_LOCAL_NAME == 0x0000000200000003ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOCAL_NAME); + LASSERTF(FID_SEQ_SPECIAL == 0x0000000200000004ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_SPECIAL); + LASSERTF(FID_SEQ_QUOTA == 0x0000000200000005ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_QUOTA); + LASSERTF(FID_SEQ_QUOTA_GLB == 0x0000000200000006ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_QUOTA_GLB); + LASSERTF(FID_SEQ_ROOT == 0x0000000200000007ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_ROOT); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF(FID_SEQ_LAYOUT_RBTREE == 0x0000000200000008ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LAYOUT_RBTREE); + LASSERTF(FID_SEQ_UPDATE_LOG == 0x0000000200000009ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_UPDATE_LOG); + LASSERTF(FID_SEQ_UPDATE_LOG_DIR == 0x000000020000000aULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_UPDATE_LOG_DIR); +#endif /* HAVE_SERVER_SUPPORT */ + LASSERTF(FID_SEQ_NORMAL == 0x0000000200000400ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_NORMAL); + LASSERTF(FID_SEQ_LOV_DEFAULT == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOV_DEFAULT); + LASSERTF(FID_OID_SPECIAL_BFL == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_SPECIAL_BFL); + LASSERTF(FID_OID_DOT_LUSTRE == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE); + LASSERTF(FID_OID_DOT_LUSTRE_OBF == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE_OBF); + LASSERTF(FID_OID_DOT_LUSTRE_LPF == 0x00000003UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE_LPF); + + /* Checks for struct lu_dirent */ + LASSERTF((int)sizeof(struct lu_dirent) == 32, "found %lld\n", + (long long)(int)sizeof(struct lu_dirent)); + LASSERTF((int)offsetof(struct lu_dirent, lde_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_fid)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_fid)); + LASSERTF((int)offsetof(struct lu_dirent, lde_hash) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_hash)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_hash) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_hash)); + LASSERTF((int)offsetof(struct lu_dirent, lde_reclen) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_reclen)); + LASSERTF((int)sizeof(((struct 
lu_dirent *)0)->lde_reclen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_reclen)); + LASSERTF((int)offsetof(struct lu_dirent, lde_namelen) == 26, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_namelen)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_namelen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_namelen)); + LASSERTF((int)offsetof(struct lu_dirent, lde_attrs) == 28, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_attrs)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_attrs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_attrs)); + LASSERTF((int)offsetof(struct lu_dirent, lde_name[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_name[0])); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_name[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_name[0])); + LASSERTF(LUDA_FID == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LUDA_FID); + LASSERTF(LUDA_TYPE == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LUDA_TYPE); + LASSERTF(LUDA_64BITHASH == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LUDA_64BITHASH); + + /* Checks for struct luda_type */ + LASSERTF((int)sizeof(struct luda_type) == 2, "found %lld\n", + (long long)(int)sizeof(struct luda_type)); + LASSERTF((int)offsetof(struct luda_type, lt_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct luda_type, lt_type)); + LASSERTF((int)sizeof(((struct luda_type *)0)->lt_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct luda_type *)0)->lt_type)); + + /* Checks for struct lu_dirpage */ + LASSERTF((int)sizeof(struct lu_dirpage) == 24, "found %lld\n", + (long long)(int)sizeof(struct lu_dirpage)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_hash_start)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_hash_end)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_flags)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_flags)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_pad0) == 20, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_pad0)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_pad0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_pad0)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_entries[0]) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_entries[0])); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]) == 32, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0])); + LASSERTF(LDF_EMPTY == 1, "found %lld\n", + (long long)LDF_EMPTY); + LASSERTF(LDF_COLLIDE == 2, "found %lld\n", + (long long)LDF_COLLIDE); + LASSERTF(LU_PAGE_SIZE == 4096, "found %lld\n", + (long long)LU_PAGE_SIZE); +#ifdef HAVE_SERVER_SUPPORT + /* Checks for union lu_page */ + 
LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n", + (long long)(int)sizeof(union lu_page)); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct lu_ladvise */ + LASSERTF((int)sizeof(struct lu_ladvise) == 32, "found %lld\n", + (long long)(int)sizeof(struct lu_ladvise)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_advice) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_advice)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_advice) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_advice)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value1) == 2, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value1)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value1) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value1)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value2) == 4, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value2)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value2)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_start) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_start)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_start)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_end) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_end)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_end)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value3) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value3)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value3)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value4) == 28, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value4)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4)); + LASSERTF(LU_LADVISE_WILLREAD == 1, "found %lld\n", + (long long)LU_LADVISE_WILLREAD); + LASSERTF(LU_LADVISE_DONTNEED == 2, "found %lld\n", + (long long)LU_LADVISE_DONTNEED); + LASSERTF(LU_LADVISE_LOCKNOEXPAND == 3, "found %lld\n", + (long long)LU_LADVISE_LOCKNOEXPAND); + LASSERTF(LU_LADVISE_LOCKAHEAD == 4, "found %lld\n", + (long long)LU_LADVISE_LOCKAHEAD); + + /* Checks for struct ladvise_hdr */ + LASSERTF((int)sizeof(struct ladvise_hdr) == 32, "found %lld\n", + (long long)(int)sizeof(struct ladvise_hdr)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_magic)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_magic)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_count)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_count)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_flags)); + LASSERTF((int)sizeof(((struct ladvise_hdr 
*)0)->lah_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_flags)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value1) == 16, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value1)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value1)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value2) == 20, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value2)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value2)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value3) == 24, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value3)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value3)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_advise) == 32, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_advise)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise)); + BUILD_BUG_ON(LF_ASYNC != 0x00000001); + BUILD_BUG_ON(LF_UNSET != 0x00000002); + BUILD_BUG_ON(LADVISE_MAGIC != 0x1adf1ce0); + + /* Checks for struct lustre_handle */ + LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n", + (long long)(int)sizeof(struct lustre_handle)); + LASSERTF((int)offsetof(struct lustre_handle, cookie) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_handle, cookie)); + LASSERTF((int)sizeof(((struct lustre_handle *)0)->cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_handle *)0)->cookie)); + + /* Checks for struct lustre_msg_v2 */ + LASSERTF((int)sizeof(struct lustre_msg_v2) == 32, "found %lld\n", + (long long)(int)sizeof(struct lustre_msg_v2)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_bufcount) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_bufcount)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_secflvr) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_secflvr)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_magic) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_magic)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_repsize) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, "found %lld\n", + 
(long long)(int)offsetof(struct lustre_msg_v2, lm_flags)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_3) == 28, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_3)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_buflens[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0])); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0])); + LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0bd00bd3UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MSG_MAGIC_V2); + LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xd30bd00bUL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MSG_MAGIC_V2_SWABBED); + + /* Checks for struct ptlrpc_body */ + LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n", + (long long)(int)sizeof(struct ptlrpc_body_v3)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_handle)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == 8, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_type)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == 12, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_version)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == 16, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_opc)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == 20, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_status)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == 24, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_xid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_tag) == 32, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_tag)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag) == 2, "found %lld\n", + (long long)(int)sizeof(((struct 
ptlrpc_body_v3 *)0)->pb_tag)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding0) == 34, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == 40, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_committed)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == 48, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_transno)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == 56, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == 60, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_op_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == 64, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == 68, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == 76, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_limit)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == 80, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_slv)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv)); + BUILD_BUG_ON(PTLRPC_NUM_VERSIONS != 4); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == 88, "found %lld\n", + (long 
long)(int)offsetof(struct ptlrpc_body_v3, pb_pre_versions)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_mbits) == 120, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_mbits)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_0) == 128, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_1) == 136, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_2) == 144, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_2)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2)); + BUILD_BUG_ON(LUSTRE_JOBID_SIZE != 32); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_jobid) == 152, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_jobid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == (int)offsetof(struct ptlrpc_body_v2, pb_handle), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_handle), (int)offsetof(struct ptlrpc_body_v2, pb_handle)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == (int)offsetof(struct ptlrpc_body_v2, pb_type), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_type), (int)offsetof(struct ptlrpc_body_v2, pb_type)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == (int)offsetof(struct ptlrpc_body_v2, pb_version), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_version), (int)offsetof(struct ptlrpc_body_v2, pb_version)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == (int)offsetof(struct ptlrpc_body_v2, pb_opc), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_opc), (int)offsetof(struct ptlrpc_body_v2, pb_opc)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == (int)sizeof(((struct ptlrpc_body_v2 
*)0)->pb_opc), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == (int)offsetof(struct ptlrpc_body_v2, pb_status), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_status), (int)offsetof(struct ptlrpc_body_v2, pb_status)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == (int)offsetof(struct ptlrpc_body_v2, pb_last_xid), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_xid), (int)offsetof(struct ptlrpc_body_v2, pb_last_xid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_tag) == (int)offsetof(struct ptlrpc_body_v2, pb_tag), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_tag), (int)offsetof(struct ptlrpc_body_v2, pb_tag)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_tag), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_tag)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding0) == (int)offsetof(struct ptlrpc_body_v2, pb_padding0), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding0), (int)offsetof(struct ptlrpc_body_v2, pb_padding0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding0), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding1) == (int)offsetof(struct ptlrpc_body_v2, pb_padding1), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding1), (int)offsetof(struct ptlrpc_body_v2, pb_padding1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding1), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == (int)offsetof(struct ptlrpc_body_v2, pb_last_committed), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_committed), (int)offsetof(struct ptlrpc_body_v2, pb_last_committed)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == (int)offsetof(struct ptlrpc_body_v2, pb_transno), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_transno), (int)offsetof(struct ptlrpc_body_v2, pb_transno)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno)); + 
LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_flags), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_flags), (int)offsetof(struct ptlrpc_body_v2, pb_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_op_flags), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_op_flags), (int)offsetof(struct ptlrpc_body_v2, pb_op_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt), (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == (int)offsetof(struct ptlrpc_body_v2, pb_timeout), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_timeout), (int)offsetof(struct ptlrpc_body_v2, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == (int)offsetof(struct ptlrpc_body_v2, pb_service_time), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_service_time), (int)offsetof(struct ptlrpc_body_v2, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == (int)offsetof(struct ptlrpc_body_v2, pb_limit), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_limit), (int)offsetof(struct ptlrpc_body_v2, pb_limit)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == (int)offsetof(struct ptlrpc_body_v2, pb_slv), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_slv), (int)offsetof(struct ptlrpc_body_v2, pb_slv)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_pre_versions), 
(int)offsetof(struct ptlrpc_body_v2, pb_pre_versions)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_mbits) == (int)offsetof(struct ptlrpc_body_v2, pb_mbits), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_mbits), (int)offsetof(struct ptlrpc_body_v2, pb_mbits)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_mbits), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_mbits)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_0) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_0), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding64_0), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_0), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_1) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_1), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding64_1), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_1), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_2) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_2), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding64_2), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_2)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_2), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_2)); + LASSERTF(MSG_PTLRPC_BODY_OFF == 0, "found %lld\n", + (long long)MSG_PTLRPC_BODY_OFF); + LASSERTF(REQ_REC_OFF == 1, "found %lld\n", + (long long)REQ_REC_OFF); + LASSERTF(REPLY_REC_OFF == 1, "found %lld\n", + (long long)REPLY_REC_OFF); + LASSERTF(DLM_LOCKREQ_OFF == 1, "found %lld\n", + (long long)DLM_LOCKREQ_OFF); + LASSERTF(DLM_REQ_REC_OFF == 2, "found %lld\n", + (long long)DLM_REQ_REC_OFF); + LASSERTF(DLM_INTENT_IT_OFF == 2, "found %lld\n", + (long long)DLM_INTENT_IT_OFF); + LASSERTF(DLM_INTENT_REC_OFF == 3, "found %lld\n", + (long long)DLM_INTENT_REC_OFF); + LASSERTF(DLM_LOCKREPLY_OFF == 1, "found %lld\n", + (long long)DLM_LOCKREPLY_OFF); + LASSERTF(DLM_REPLY_REC_OFF == 2, "found %lld\n", + (long long)DLM_REPLY_REC_OFF); + LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n", + (long long)MSG_PTLRPC_HEADER_OFF); + LASSERTF(PTLRPC_MSG_VERSION == 0x00000003UL, "found 0x%.8xUL\n", + (unsigned)PTLRPC_MSG_VERSION); + LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_VERSION_MASK); + LASSERTF(LUSTRE_OBD_VERSION == 0x00010000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_OBD_VERSION); + LASSERTF(LUSTRE_MDS_VERSION == 0x00020000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MDS_VERSION); 
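
The LASSERTF() lines above and below are wire-compatibility checks: each one re-derives an offset, size, or protocol constant at runtime and prints both sides when the built code no longer matches the on-wire format. As an illustrative aside (not the actual Lustre macro), a minimal stand-in for this idiom could look like the following C sketch; CHECKF and MSG_VERSION are hypothetical names:

    #include <stdio.h>

    /*
     * Illustrative stand-in for the LASSERTF() idiom (not the real Lustre
     * definition): test a wire-format invariant and report the offending
     * values when it does not hold.
     */
    #define CHECKF(cond, fmt, ...)                                        \
            do {                                                          \
                    if (!(cond))                                          \
                            fprintf(stderr, "wirecheck: %s: " fmt,        \
                                    #cond, __VA_ARGS__);                  \
            } while (0)

    /* Example use, mirroring the constant checks nearby (value hypothetical). */
    #define MSG_VERSION 0x00000003UL

    static void check_wire_constants_example(void)
    {
            CHECKF(MSG_VERSION == 0x00000003UL, "found 0x%.8xUL\n",
                   (unsigned int)MSG_VERSION);
    }

Checks of this kind are typically emitted by a generator rather than written by hand, which is why the file repeats the same pattern for every member and constant.
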
+ LASSERTF(LUSTRE_OST_VERSION == 0x00030000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_OST_VERSION); + LASSERTF(LUSTRE_DLM_VERSION == 0x00040000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_DLM_VERSION); + LASSERTF(LUSTRE_LOG_VERSION == 0x00050000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_LOG_VERSION); + LASSERTF(LUSTRE_MGS_VERSION == 0x00060000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MGS_VERSION); + LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n", + (long long)MSGHDR_AT_SUPPORT); + LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n", + (long long)MSGHDR_CKSUM_INCOMPAT18); + LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MSG_RESENT); + LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MSG_REPLAY); + LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MSG_REQ_REPLAY_DONE); + LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MSG_LOCK_REPLAY_DONE); + LASSERTF(MSG_CONNECT_RECOVERING == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_RECOVERING); + LASSERTF(MSG_CONNECT_RECONNECT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_RECONNECT); + LASSERTF(MSG_CONNECT_REPLAYABLE == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_REPLAYABLE); + LASSERTF(MSG_CONNECT_LIBCLIENT == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_LIBCLIENT); + LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_INITIAL); + LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_NEXT_VER); + LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_TRANSNO); + + /* Checks for struct obd_connect_data */ + LASSERTF((int)sizeof(struct obd_connect_data) == 192, "found %lld\n", + (long long)(int)sizeof(struct obd_connect_data)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_version) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_version)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_version)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant) == 12, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_index) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_index)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_index)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_brw_size) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_brw_size)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_ibits_known) == 24, "found %lld\n", + (long 
long)(int)offsetof(struct obd_connect_data, ocd_ibits_known)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_blkbits) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_blkbits)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_blkbits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_blkbits)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_inobits) == 33, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_inobits)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_inobits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_inobits)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_tax_kb) == 34, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_tax_kb)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_tax_kb) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_tax_kb)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_max_blks) == 36, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_max_blks)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_max_blks) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_max_blks)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_transno)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_transno)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_group) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_group)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_group) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_group)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_cksum_types) == 52, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_cksum_types)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_max_easize) == 56, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_max_easize)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_instance) == 60, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_instance)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_instance)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxbytes) == 64, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_maxbytes)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes)); + LASSERTF((int)offsetof(struct 
obd_connect_data, ocd_maxmodrpcs) == 72, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_maxmodrpcs)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxmodrpcs) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxmodrpcs)); + LASSERTF((int)offsetof(struct obd_connect_data, padding0) == 74, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding0)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding0)); + LASSERTF((int)offsetof(struct obd_connect_data, padding1) == 76, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding1)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding1)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags2) == 80, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags2)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags2)); + LASSERTF((int)offsetof(struct obd_connect_data, padding3) == 88, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding3)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding3)); + LASSERTF((int)offsetof(struct obd_connect_data, padding4) == 96, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding4)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding4)); + LASSERTF((int)offsetof(struct obd_connect_data, padding5) == 104, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding5)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding5)); + LASSERTF((int)offsetof(struct obd_connect_data, padding6) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding6)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding6)); + LASSERTF((int)offsetof(struct obd_connect_data, padding7) == 120, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding7)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding7)); + LASSERTF((int)offsetof(struct obd_connect_data, padding8) == 128, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding8)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding8) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding8)); + LASSERTF((int)offsetof(struct obd_connect_data, padding9) == 136, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding9)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding9)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingA) == 144, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingA)); + LASSERTF((int)sizeof(((struct 
obd_connect_data *)0)->paddingA) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingA)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingB) == 152, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingB)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingB) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingB)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingC) == 160, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingC)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingC) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingC)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingD) == 168, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingD)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingD) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingD)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingE) == 176, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingE)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingE) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingE)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingF) == 184, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingF)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingF) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingF)); + LASSERTF(OBD_CONNECT_RDONLY == 0x1ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RDONLY); + LASSERTF(OBD_CONNECT_INDEX == 0x2ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_INDEX); + LASSERTF(OBD_CONNECT_MDS == 0x4ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS); + LASSERTF(OBD_CONNECT_GRANT == 0x8ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT); + LASSERTF(OBD_CONNECT_SRVLOCK == 0x10ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SRVLOCK); + LASSERTF(OBD_CONNECT_VERSION == 0x20ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_VERSION); + LASSERTF(OBD_CONNECT_REQPORTAL == 0x40ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_REQPORTAL); + LASSERTF(OBD_CONNECT_ACL == 0x80ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_ACL); + LASSERTF(OBD_CONNECT_XATTR == 0x100ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_XATTR); + LASSERTF(OBD_CONNECT_LARGE_ACL == 0x200ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LARGE_ACL); + LASSERTF(OBD_CONNECT_TRANSNO == 0x800ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_TRANSNO); + LASSERTF(OBD_CONNECT_IBITS == 0x1000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_IBITS); + LASSERTF(OBD_CONNECT_BARRIER == 0x2000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BARRIER); + LASSERTF(OBD_CONNECT_ATTRFID == 0x4000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_ATTRFID); + LASSERTF(OBD_CONNECT_NODEVOH == 0x8000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_NODEVOH); + LASSERTF(OBD_CONNECT_RMT_CLIENT == 0x10000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RMT_CLIENT); + LASSERTF(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RMT_CLIENT_FORCE); + LASSERTF(OBD_CONNECT_BRW_SIZE == 0x40000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BRW_SIZE); + LASSERTF(OBD_CONNECT_QUOTA64 == 0x80000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_QUOTA64); + LASSERTF(OBD_CONNECT_MDS_CAPA == 0x100000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS_CAPA); + LASSERTF(OBD_CONNECT_OSS_CAPA == 0x200000ULL, "found 
0x%.16llxULL\n", + OBD_CONNECT_OSS_CAPA); + LASSERTF(OBD_CONNECT_CANCELSET == 0x400000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CANCELSET); + LASSERTF(OBD_CONNECT_SOM == 0x800000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SOM); + LASSERTF(OBD_CONNECT_AT == 0x1000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_AT); + LASSERTF(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LRU_RESIZE); + LASSERTF(OBD_CONNECT_MDS_MDS == 0x4000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS_MDS); + LASSERTF(OBD_CONNECT_REAL == 0x8000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_REAL); + LASSERTF(OBD_CONNECT_CHANGE_QS == 0x10000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CHANGE_QS); + LASSERTF(OBD_CONNECT_CKSUM == 0x20000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CKSUM); + LASSERTF(OBD_CONNECT_FID == 0x40000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FID); + LASSERTF(OBD_CONNECT_VBR == 0x80000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_VBR); + LASSERTF(OBD_CONNECT_LOV_V3 == 0x100000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LOV_V3); + LASSERTF(OBD_CONNECT_GRANT_SHRINK == 0x200000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT_SHRINK); + LASSERTF(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SKIP_ORPHAN); + LASSERTF(OBD_CONNECT_MAX_EASIZE == 0x800000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MAX_EASIZE); + LASSERTF(OBD_CONNECT_FULL20 == 0x1000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FULL20); + LASSERTF(OBD_CONNECT_LAYOUTLOCK == 0x2000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LAYOUTLOCK); + LASSERTF(OBD_CONNECT_64BITHASH == 0x4000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_64BITHASH); + LASSERTF(OBD_CONNECT_MAXBYTES == 0x8000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MAXBYTES); + LASSERTF(OBD_CONNECT_IMP_RECOV == 0x10000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_IMP_RECOV); + LASSERTF(OBD_CONNECT_JOBSTATS == 0x20000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_JOBSTATS); + LASSERTF(OBD_CONNECT_UMASK == 0x40000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_UMASK); + LASSERTF(OBD_CONNECT_EINPROGRESS == 0x80000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_EINPROGRESS); + LASSERTF(OBD_CONNECT_GRANT_PARAM == 0x100000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT_PARAM); + LASSERTF(OBD_CONNECT_FLOCK_OWNER == 0x200000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLOCK_OWNER); + LASSERTF(OBD_CONNECT_LVB_TYPE == 0x400000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LVB_TYPE); + LASSERTF(OBD_CONNECT_NANOSEC_TIME == 0x800000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_NANOSEC_TIME); + LASSERTF(OBD_CONNECT_LIGHTWEIGHT == 0x1000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LIGHTWEIGHT); + LASSERTF(OBD_CONNECT_SHORTIO == 0x2000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SHORTIO); + LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_PINGLESS); + LASSERTF(OBD_CONNECT_FLOCK_DEAD == 0x8000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLOCK_DEAD); + LASSERTF(OBD_CONNECT_DISP_STRIPE == 0x10000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_DISP_STRIPE); + LASSERTF(OBD_CONNECT_OPEN_BY_FID == 0x20000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_OPEN_BY_FID); + LASSERTF(OBD_CONNECT_LFSCK == 0x40000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LFSCK); + LASSERTF(OBD_CONNECT_UNLINK_CLOSE == 0x100000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_UNLINK_CLOSE); + LASSERTF(OBD_CONNECT_MULTIMODRPCS == 
0x200000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MULTIMODRPCS); + LASSERTF(OBD_CONNECT_DIR_STRIPE == 0x400000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_DIR_STRIPE); + LASSERTF(OBD_CONNECT_SUBTREE == 0x800000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SUBTREE); + LASSERTF(OBD_CONNECT_BULK_MBITS == 0x2000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BULK_MBITS); + LASSERTF(OBD_CONNECT_OBDOPACK == 0x4000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_OBDOPACK); + LASSERTF(OBD_CONNECT_FLAGS2 == 0x8000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLAGS2); + LASSERTF(OBD_CONNECT2_FILE_SECCTX == 0x1ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_FILE_SECCTX); + LASSERTF(OBD_CONNECT2_LOCKAHEAD == 0x2ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LOCKAHEAD); + LASSERTF(OBD_CONNECT2_DIR_MIGRATE == 0x4ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_DIR_MIGRATE); + LASSERTF(OBD_CONNECT2_SUM_STATFS == 0x8ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_SUM_STATFS); + LASSERTF(OBD_CONNECT2_OVERSTRIPING == 0x10ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_OVERSTRIPING); + LASSERTF(OBD_CONNECT2_FLR == 0x20ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_FLR); + LASSERTF(OBD_CONNECT2_WBC_INTENTS == 0x40ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_WBC_INTENTS); + LASSERTF(OBD_CONNECT2_LOCK_CONVERT == 0x80ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LOCK_CONVERT); + LASSERTF(OBD_CONNECT2_ARCHIVE_ID_ARRAY == 0x100ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ARCHIVE_ID_ARRAY); + LASSERTF(OBD_CONNECT2_INC_XID == 0x200ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_INC_XID); + LASSERTF(OBD_CONNECT2_SELINUX_POLICY == 0x400ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_SELINUX_POLICY); + LASSERTF(OBD_CONNECT2_LSOM == 0x800ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LSOM); + LASSERTF(OBD_CONNECT2_PCC == 0x1000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_PCC); + LASSERTF(OBD_CONNECT2_CRUSH == 0x2000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_CRUSH); + LASSERTF(OBD_CONNECT2_ASYNC_DISCARD == 0x4000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ASYNC_DISCARD); + LASSERTF(OBD_CONNECT2_ENCRYPT == 0x8000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ENCRYPT); + LASSERTF(OBD_CONNECT2_FIDMAP == 0x10000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_FIDMAP); + LASSERTF(OBD_CONNECT2_GETATTR_PFID == 0x20000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_GETATTR_PFID); + LASSERTF(OBD_CONNECT2_LSEEK == 0x40000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LSEEK); + LASSERTF(OBD_CONNECT2_DOM_LVB == 0x80000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_DOM_LVB); + LASSERTF(OBD_CONNECT2_REP_MBITS == 0x100000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_REP_MBITS); + LASSERTF(OBD_CONNECT2_MODE_CONVERT == 0x200000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_MODE_CONVERT); + LASSERTF(OBD_CONNECT2_BATCH_RPC == 0x400000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_BATCH_RPC); + LASSERTF(OBD_CONNECT2_PCCRO == 0x800000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_PCCRO); + LASSERTF(OBD_CONNECT2_ATOMIC_OPEN_LOCK == 0x4000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ATOMIC_OPEN_LOCK); + LASSERTF(OBD_CONNECT2_ENCRYPT_NAME == 0x8000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ENCRYPT_NAME); + LASSERTF(OBD_CONNECT2_MDLL_BYPASS == OBD_CONNECT2_MDLL_BYPASS, "found 0x%.16llxULL\n", + OBD_CONNECT2_MDLL_BYPASS); + LASSERTF(OBD_CONNECT2_MDLL == 0x1000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_MDLL); + LASSERTF(OBD_CONNECT2_MDLL_AUTO_REFRESH == 0x2000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_MDLL_AUTO_REFRESH); + 
LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_CRC32); + LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_ADLER); + LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_CRC32C); + LASSERTF(OBD_CKSUM_RESERVED == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_RESERVED); + LASSERTF(OBD_CKSUM_T10IP512 == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10IP512); + LASSERTF(OBD_CKSUM_T10IP4K == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10IP4K); + LASSERTF(OBD_CKSUM_T10CRC512 == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10CRC512); + LASSERTF(OBD_CKSUM_T10CRC4K == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10CRC4K); + LASSERTF(OBD_CKSUM_T10_TOP == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10_TOP); + + /* Checks for struct ost_layout */ + LASSERTF((int)sizeof(struct ost_layout) == 28, "found %lld\n", + (long long)(int)sizeof(struct ost_layout)); + LASSERTF((int)offsetof(struct ost_layout, ol_stripe_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_stripe_size)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_stripe_size)); + LASSERTF((int)offsetof(struct ost_layout, ol_stripe_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_stripe_count)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_stripe_count)); + LASSERTF((int)offsetof(struct ost_layout, ol_comp_start) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_comp_start)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_comp_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_comp_start)); + LASSERTF((int)offsetof(struct ost_layout, ol_comp_end) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_comp_end)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_comp_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_comp_end)); + LASSERTF((int)offsetof(struct ost_layout, ol_comp_id) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_comp_id)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_comp_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_comp_id)); + + /* Checks for struct obdo */ + LASSERTF((int)sizeof(struct obdo) == 208, "found %lld\n", + (long long)(int)sizeof(struct obdo)); + LASSERTF((int)offsetof(struct obdo, o_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_valid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_valid)); + LASSERTF((int)offsetof(struct obdo, o_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_oi)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_oi)); + LASSERTF((int)offsetof(struct obdo, o_parent_seq) == 24, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_seq)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_seq) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_seq)); + LASSERTF((int)offsetof(struct obdo, o_size) == 32, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_size)); + 
LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_size)); + LASSERTF((int)offsetof(struct obdo, o_mtime) == 40, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_mtime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_mtime)); + LASSERTF((int)offsetof(struct obdo, o_atime) == 48, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_atime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_atime)); + LASSERTF((int)offsetof(struct obdo, o_ctime) == 56, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_ctime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_ctime)); + LASSERTF((int)offsetof(struct obdo, o_blocks) == 64, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_blocks)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_blocks)); + LASSERTF((int)offsetof(struct obdo, o_grant) == 72, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_grant)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_grant) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_grant)); + LASSERTF((int)offsetof(struct obdo, o_blksize) == 80, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_blksize)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_blksize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_blksize)); + LASSERTF((int)offsetof(struct obdo, o_mode) == 84, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_mode)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_mode)); + LASSERTF((int)offsetof(struct obdo, o_uid) == 88, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_uid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_uid)); + LASSERTF((int)offsetof(struct obdo, o_gid) == 92, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_gid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_gid)); + LASSERTF((int)offsetof(struct obdo, o_flags) == 96, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_flags)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_flags)); + LASSERTF((int)offsetof(struct obdo, o_nlink) == 100, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_nlink)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_nlink) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_nlink)); + LASSERTF((int)offsetof(struct obdo, o_parent_oid) == 104, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_oid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_oid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_oid)); + LASSERTF((int)offsetof(struct obdo, o_misc) == 108, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_misc)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_misc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_misc)); + LASSERTF((int)offsetof(struct obdo, o_ioepoch) == 112, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_ioepoch)); + 
LASSERTF((int)sizeof(((struct obdo *)0)->o_ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_ioepoch)); + LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_stripe_idx)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_stripe_idx)); + LASSERTF((int)offsetof(struct obdo, o_parent_ver) == 124, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_ver)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_ver)); + LASSERTF((int)offsetof(struct obdo, o_handle) == 128, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_handle)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_handle)); + LASSERTF((int)offsetof(struct obdo, o_layout) == 136, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_layout)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_layout) == 28, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_layout)); + LASSERTF((int)offsetof(struct obdo, o_layout_version) == 164, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_layout_version)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_layout_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_layout_version)); + LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_uid_h)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_uid_h)); + LASSERTF((int)offsetof(struct obdo, o_gid_h) == 172, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_gid_h)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_gid_h)); + LASSERTF((int)offsetof(struct obdo, o_data_version) == 176, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_data_version)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_data_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_data_version)); + LASSERTF((int)offsetof(struct obdo, o_projid) == 184, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_projid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_projid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_projid)); + LASSERTF((int)offsetof(struct obdo, o_padding_4) == 188, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_4)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_4)); + LASSERTF((int)offsetof(struct obdo, o_padding_5) == 192, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_5)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_5)); + LASSERTF((int)offsetof(struct obdo, o_padding_6) == 200, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_6)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_6)); + LASSERTF(OBD_MD_FLID == (0x00000001ULL), "found 0x%.16llxULL\n", + OBD_MD_FLID); + LASSERTF(OBD_MD_FLATIME == (0x00000002ULL), "found 0x%.16llxULL\n", + OBD_MD_FLATIME); + LASSERTF(OBD_MD_FLMTIME == (0x00000004ULL), "found 
0x%.16llxULL\n", + OBD_MD_FLMTIME); + LASSERTF(OBD_MD_FLCTIME == (0x00000008ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCTIME); + LASSERTF(OBD_MD_FLSIZE == (0x00000010ULL), "found 0x%.16llxULL\n", + OBD_MD_FLSIZE); + LASSERTF(OBD_MD_FLBLOCKS == (0x00000020ULL), "found 0x%.16llxULL\n", + OBD_MD_FLBLOCKS); + LASSERTF(OBD_MD_FLBLKSZ == (0x00000040ULL), "found 0x%.16llxULL\n", + OBD_MD_FLBLKSZ); + LASSERTF(OBD_MD_FLMODE == (0x00000080ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMODE); + LASSERTF(OBD_MD_FLTYPE == (0x00000100ULL), "found 0x%.16llxULL\n", + OBD_MD_FLTYPE); + LASSERTF(OBD_MD_FLUID == (0x00000200ULL), "found 0x%.16llxULL\n", + OBD_MD_FLUID); + LASSERTF(OBD_MD_FLGID == (0x00000400ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGID); + LASSERTF(OBD_MD_FLFLAGS == (0x00000800ULL), "found 0x%.16llxULL\n", + OBD_MD_FLFLAGS); + LASSERTF(OBD_MD_DOM_SIZE == (0X00001000ULL), "found 0x%.16llxULL\n", + OBD_MD_DOM_SIZE); + LASSERTF(OBD_MD_FLNLINK == (0x00002000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLNLINK); + LASSERTF(OBD_MD_FLPARENT == (0x00004000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLPARENT); + LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRDEV); + LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLEASIZE); + LASSERTF(OBD_MD_LINKNAME == (0x00040000ULL), "found 0x%.16llxULL\n", + OBD_MD_LINKNAME); + LASSERTF(OBD_MD_FLHANDLE == (0x00080000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLHANDLE); + LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCKSUM); + LASSERTF(OBD_MD_FLPRJQUOTA == (0x00400000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLPRJQUOTA); + LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGROUP); + LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLFID); + LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGRANT); + LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLDIREA); + LASSERTF(OBD_MD_FLUSRQUOTA == (0x20000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLUSRQUOTA); + LASSERTF(OBD_MD_FLGRPQUOTA == (0x40000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGRPQUOTA); + LASSERTF(OBD_MD_FLMODEASIZE == (0x80000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMODEASIZE); + LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n", + OBD_MD_MDS); + LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n", + OBD_MD_MEA); + LASSERTF(OBD_MD_TSTATE == (0x0000000800000000ULL), "found 0x%.16llxULL\n", + OBD_MD_TSTATE); + LASSERTF(OBD_MD_FLXATTR == (0x0000001000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTR); + LASSERTF(OBD_MD_FLXATTRLS == (0x0000002000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTRLS); + LASSERTF(OBD_MD_FLXATTRRM == (0x0000004000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTRRM); + LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLACL); + LASSERTF(OBD_MD_FLAGSTATFS == (0x0000010000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLAGSTATFS); + LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCROSSREF); + LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGETATTRLOCK); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF(OBD_MD_FLOBJCOUNT == (0x0000400000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLOBJCOUNT); +#endif /* HAVE_SERVER_SUPPORT */ + LASSERTF(OBD_MD_FLDATAVERSION == (0x0010000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLDATAVERSION); + 
LASSERTF(OBD_MD_CLOSE_INTENT_EXECED == (0x0020000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_CLOSE_INTENT_EXECED); + LASSERTF(OBD_MD_DEFAULT_MEA == (0x0040000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_DEFAULT_MEA); + LASSERTF(OBD_MD_FLOSTLAYOUT == (0x0080000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLOSTLAYOUT); + LASSERTF(OBD_MD_FLPROJID == (0x0100000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLPROJID); + LASSERTF(OBD_MD_SECCTX == (0x0200000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_SECCTX); + LASSERTF(OBD_MD_FLLAZYSIZE == (0x0400000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLLAZYSIZE); + LASSERTF(OBD_MD_FLLAZYBLOCKS == (0x0800000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLLAZYBLOCKS); + LASSERTF(OBD_MD_ENCCTX == (0x2000000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_ENCCTX); + BUILD_BUG_ON(OBD_FL_INLINEDATA != 0x00000001); + BUILD_BUG_ON(OBD_FL_OBDMDEXISTS != 0x00000002); + BUILD_BUG_ON(OBD_FL_DELORPHAN != 0x00000004); + BUILD_BUG_ON(OBD_FL_NORPC != 0x00000008); + BUILD_BUG_ON(OBD_FL_IDONLY != 0x00000010); + BUILD_BUG_ON(OBD_FL_RECREATE_OBJS != 0x00000020); + BUILD_BUG_ON(OBD_FL_DEBUG_CHECK != 0x00000040); + BUILD_BUG_ON(OBD_FL_NO_PRJQUOTA != 0x00000080); + BUILD_BUG_ON(OBD_FL_NO_USRQUOTA != 0x00000100); + BUILD_BUG_ON(OBD_FL_NO_GRPQUOTA != 0x00000200); + BUILD_BUG_ON(OBD_FL_CREATE_CROW != 0x00000400); + BUILD_BUG_ON(OBD_FL_SRVLOCK != 0x00000800); + BUILD_BUG_ON(OBD_FL_CKSUM_CRC32 != 0x00001000); + BUILD_BUG_ON(OBD_FL_CKSUM_ADLER != 0x00002000); + BUILD_BUG_ON(OBD_FL_CKSUM_CRC32C != 0x00004000); + BUILD_BUG_ON(OBD_FL_CKSUM_T10IP512 != 0x00005000); + BUILD_BUG_ON(OBD_FL_CKSUM_T10IP4K != 0x00006000); + BUILD_BUG_ON(OBD_FL_CKSUM_T10CRC512 != 0x00007000); + BUILD_BUG_ON(OBD_FL_CKSUM_T10CRC4K != 0x00008000); + BUILD_BUG_ON(OBD_FL_CKSUM_RSVD3 != 0x00010000); + BUILD_BUG_ON(OBD_FL_SHRINK_GRANT != 0x00020000); + BUILD_BUG_ON(OBD_FL_MMAP != 0x00040000); + BUILD_BUG_ON(OBD_FL_RECOV_RESEND != 0x00080000); + BUILD_BUG_ON(OBD_FL_NOSPC_BLK != 0x00100000); + BUILD_BUG_ON(OBD_FL_FLUSH != 0x00200000); + BUILD_BUG_ON(OBD_FL_SHORT_IO != 0x00400000); + + /* Checks for struct lov_ost_data_v1 */ + LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, "found %lld\n", + (long long)(int)sizeof(struct lov_ost_data_v1)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_oi)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx)); + + /* Checks for struct lov_mds_md_v1 */ + LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, "found %lld\n", + (long long)(int)sizeof(struct lov_mds_md_v1)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_magic)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic) == 4, 
"found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_pattern) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_pattern)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_oi)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_size) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_size)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_count)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_layout_gen) == 30, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_objects[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_objects[0])); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0])); + BUILD_BUG_ON(LOV_MAGIC_V1 != (0x0BD10000 | 0x0BD0)); + + /* Checks for struct lov_mds_md_v3 */ + LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, "found %lld\n", + (long long)(int)sizeof(struct lov_mds_md_v3)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_oi)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count)); + LASSERTF((int)sizeof(((struct 
lov_mds_md_v3 *)0)->lmm_stripe_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_layout_gen) == 30, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen)); + BUILD_BUG_ON(LOV_MAXPOOLNAME != 15); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name[15 + 1]) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name[15 + 1])); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[15 + 1]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[15 + 1])); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects[0]) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects[0])); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0])); + BUILD_BUG_ON(LOV_MAGIC_V3 != (0x0BD30000 | 0x0BD0)); + LASSERTF(LOV_PATTERN_RAID0 == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_RAID0); + LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_RAID1); + LASSERTF(LOV_PATTERN_MDT == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_MDT); + LASSERTF(LOV_PATTERN_OVERSTRIPING == 0x00000200UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_OVERSTRIPING); + + /* Checks for struct lov_comp_md_entry_v1 */ + LASSERTF((int)sizeof(struct lov_comp_md_entry_v1) == 48, "found %lld\n", + (long long)(int)sizeof(struct lov_comp_md_entry_v1)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_id) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_id)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_id)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_flags)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_flags)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_extent) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_extent)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_extent)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_offset) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_offset)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_offset) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_offset)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_size) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_size)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_size)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_layout_gen) == 32, "found %lld\n", + (long long)(int)offsetof(struct 
lov_comp_md_entry_v1, lcme_layout_gen)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_layout_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_layout_gen)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_timestamp) == 36, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_timestamp)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_timestamp) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_timestamp)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_padding_1) == 44, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_padding_1)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding_1)); + BUILD_BUG_ON(LCME_FL_STALE != 0x00000001); + BUILD_BUG_ON(LCME_FL_PREF_RD != 0x00000002); + BUILD_BUG_ON(LCME_FL_PREF_WR != 0x00000004); + BUILD_BUG_ON(LCME_FL_PREF_RW != 0x00000006); + BUILD_BUG_ON(LCME_FL_OFFLINE != 0x00000008); + BUILD_BUG_ON(LCME_FL_INIT != 0x00000010); + BUILD_BUG_ON(LCME_FL_NOSYNC != 0x00000020); + BUILD_BUG_ON(LCME_FL_EXTENSION != 0x00000040); + BUILD_BUG_ON(LCME_FL_NEG != 0x80000000); + + /* Checks for struct lov_comp_md_v1 */ + LASSERTF((int)sizeof(struct lov_comp_md_v1) == 32, "found %lld\n", + (long long)(int)sizeof(struct lov_comp_md_v1)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_magic)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_magic)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_size) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_size)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_size)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_layout_gen) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_layout_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_layout_gen)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_flags)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_flags)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_entry_count) == 14, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_entry_count)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_mirror_count) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_mirror_count)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 18, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding1)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 
*)0)->lcm_padding1) == 6, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding2) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding2)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding2)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_entries[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_entries[0])); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]) == 48, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0])); + BUILD_BUG_ON(LOV_MAGIC_COMP_V1 != (0x0BD60000 | 0x0BD0)); + LASSERTF(LCM_FL_NONE == 0, "found %lld\n", + (long long)LCM_FL_NONE); + LASSERTF(LCM_FL_RDONLY == 1, "found %lld\n", + (long long)LCM_FL_RDONLY); + LASSERTF(LCM_FL_WRITE_PENDING == 2, "found %lld\n", + (long long)LCM_FL_WRITE_PENDING); + LASSERTF(LCM_FL_SYNC_PENDING == 3, "found %lld\n", + (long long)LCM_FL_SYNC_PENDING); + LASSERTF(LCM_FL_PCC_RDONLY == 8, "found %lld\n", + (long long)LCM_FL_PCC_RDONLY); + + /* Checks for struct lmv_mds_md_v1 */ + LASSERTF((int)sizeof(struct lmv_mds_md_v1) == 56, "found %lld\n", + (long long)(int)sizeof(struct lmv_mds_md_v1)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_magic)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_magic)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_stripe_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_stripe_count)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_count)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_master_mdt_index) == 8, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_master_mdt_index)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_master_mdt_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_master_mdt_index)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_hash_type) == 12, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_hash_type)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_hash_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_hash_type)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_layout_version) == 16, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_layout_version)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_migrate_offset) == 20, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_migrate_offset)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_offset) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_offset)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_migrate_hash) == 24, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_migrate_hash)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_hash) == 4, "found %lld\n", + 
(long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_hash)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding2) == 28, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding2)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding3) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding3)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding3)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_pool_name[15 + 1]) == 56, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_pool_name[15 + 1])); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_pool_name[15 + 1]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_pool_name[15 + 1])); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_stripe_fids[0]) == 56, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_stripe_fids[0])); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_fids[0]) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_fids[0])); + BUILD_BUG_ON(LMV_MAGIC_V1 != 0x0CD20CD0); + BUILD_BUG_ON(LMV_MAGIC_STRIPE != 0x0CD40CD0); + BUILD_BUG_ON(LMV_HASH_TYPE_MASK != 0x0000ffff); + BUILD_BUG_ON(LMV_HASH_FLAG_FIXED != 0x02000000); + BUILD_BUG_ON(LMV_HASH_FLAG_MERGE != 0x04000000); + BUILD_BUG_ON(LMV_HASH_FLAG_SPLIT != 0x08000000); + BUILD_BUG_ON(LMV_HASH_FLAG_LOST_LMV != 0x10000000); + BUILD_BUG_ON(LMV_HASH_FLAG_BAD_TYPE != 0x20000000); + BUILD_BUG_ON(LMV_HASH_FLAG_MIGRATION != 0x80000000); + BUILD_BUG_ON(LMV_CRUSH_PG_COUNT != 4096); + + /* Checks for struct obd_statfs */ + LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n", + (long long)(int)sizeof(struct obd_statfs)); + LASSERTF((int)offsetof(struct obd_statfs, os_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_type)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_type) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_type)); + LASSERTF((int)offsetof(struct obd_statfs, os_blocks) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_blocks)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_blocks)); + LASSERTF((int)offsetof(struct obd_statfs, os_bfree) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bfree)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bfree) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bfree)); + LASSERTF((int)offsetof(struct obd_statfs, os_bavail) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bavail)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail)); + LASSERTF((int)offsetof(struct obd_statfs, os_files) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_files)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_files) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_files)); + LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_ffree)); + 
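
The obd_statfs checks around this point pin each member to a fixed offset and size so that statfs replies remain decodable across differently built peers. The repeated offsetof/sizeof pattern can be folded into a single macro; the struct below is hypothetical and assumes the usual Linux ABI field sizes:

    #include <stddef.h>     /* offsetof */

    /* Fail the build if a wire-struct member moves or changes size. */
    #define CHECK_WIRE_MEMBER(type, member, off, sz)                      \
            _Static_assert(offsetof(struct type, member) == (off),        \
                           #type "." #member " moved");                   \
            _Static_assert(sizeof(((struct type *)0)->member) == (sz),    \
                           #type "." #member " resized")

    /* Hypothetical reply struct with fixed-width fields. */
    struct fs_stats_wire {
            unsigned long long blocks;      /* offset 0,  size 8 */
            unsigned long long bfree;       /* offset 8,  size 8 */
            unsigned int       bsize;       /* offset 16, size 4 */
            unsigned int       namelen;     /* offset 20, size 4 */
    };

    CHECK_WIRE_MEMBER(fs_stats_wire, blocks,   0, 8);
    CHECK_WIRE_MEMBER(fs_stats_wire, bfree,    8, 8);
    CHECK_WIRE_MEMBER(fs_stats_wire, bsize,   16, 4);
    CHECK_WIRE_MEMBER(fs_stats_wire, namelen, 20, 4);
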
LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_ffree)); + LASSERTF((int)offsetof(struct obd_statfs, os_fsid) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_fsid)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fsid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_fsid)); + LASSERTF((int)offsetof(struct obd_statfs, os_bsize) == 88, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bsize)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bsize)); + LASSERTF((int)offsetof(struct obd_statfs, os_namelen) == 92, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_namelen)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen)); + LASSERTF((int)offsetof(struct obd_statfs, os_maxbytes) == 96, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_maxbytes)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_maxbytes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_maxbytes)); + LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_state)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_state)); + LASSERTF((int)offsetof(struct obd_statfs, os_fprecreated) == 108, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_fprecreated)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated)); + LASSERTF((int)offsetof(struct obd_statfs, os_granted) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_granted)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_granted) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_granted)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare3)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare4)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare5)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare6)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare6)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare7)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs 
*)0)->os_spare7)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare8)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare9)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9)); + LASSERTF(OS_STATFS_DEGRADED == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)OS_STATFS_DEGRADED); + LASSERTF(OS_STATFS_READONLY == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)OS_STATFS_READONLY); + LASSERTF(OS_STATFS_NOPRECREATE == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)OS_STATFS_NOPRECREATE); + LASSERTF(OS_STATFS_ENOSPC == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)OS_STATFS_ENOSPC); + LASSERTF(OS_STATFS_ENOINO == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)OS_STATFS_ENOINO); + LASSERTF(OS_STATFS_SUM == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)OS_STATFS_SUM); + LASSERTF(OS_STATFS_NONROT == 0x00000200UL, "found 0x%.8xUL\n", + (unsigned)OS_STATFS_NONROT); + + /* Checks for struct obd_ioobj */ + LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n", + (long long)(int)sizeof(struct obd_ioobj)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_oid) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_oid)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_oid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_oid)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_max_brw) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_max_brw)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_bufcnt) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_bufcnt)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt)); + LASSERTF(IOOBJ_MAX_BRW_BITS == 16, "found %lld\n", + (long long)IOOBJ_MAX_BRW_BITS); + + /* Checks for union lquota_id */ + LASSERTF((int)sizeof(union lquota_id) == 16, "found %lld\n", + (long long)(int)sizeof(union lquota_id)); + + LASSERTF(QIF_DQBLKSIZE_BITS == 10, "found %lld\n", + (long long)QIF_DQBLKSIZE_BITS); + LASSERTF(QIF_DQBLKSIZE == 1024, "found %lld\n", + (long long)QIF_DQBLKSIZE); + + /* Checks for struct obd_quotactl */ + LASSERTF((int)sizeof(struct obd_quotactl) == 112, "found %lld\n", + (long long)(int)sizeof(struct obd_quotactl)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_cmd) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_cmd)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_cmd) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_cmd)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_type)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_type)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_id)); + LASSERTF((int)sizeof(((struct obd_quotactl 
*)0)->qc_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_id)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_stat) == 12, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_stat)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_stat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_stat)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_dqinfo) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_dqinfo)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo) == 24, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_dqblk) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_dqblk)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqblk) == 72, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqblk)); + + /* Checks for struct obd_dqinfo */ + LASSERTF((int)sizeof(struct obd_dqinfo) == 24, "found %lld\n", + (long long)(int)sizeof(struct obd_dqinfo)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_bgrace) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_bgrace)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_igrace) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_igrace)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_flags)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_flags)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_valid) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_valid)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_valid)); + + /* Checks for struct obd_dqblk */ + LASSERTF((int)sizeof(struct obd_dqblk) == 72, "found %lld\n", + (long long)(int)sizeof(struct obd_dqblk)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_bhardlimit) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_bhardlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_bsoftlimit) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_bsoftlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_curspace) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_curspace)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curspace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curspace)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_ihardlimit) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_ihardlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct 
obd_dqblk *)0)->dqb_ihardlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_isoftlimit) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_isoftlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_curinodes) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_curinodes)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_btime) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_btime)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_btime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_btime)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_itime) == 56, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_itime)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_itime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_itime)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_valid) == 64, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_valid)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_padding) == 68, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_padding)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_padding)); + LASSERTF(Q_QUOTACHECK == 0x800100, "found 0x%.8x\n", + Q_QUOTACHECK); + LASSERTF(Q_INITQUOTA == 0x800101, "found 0x%.8x\n", + Q_INITQUOTA); + LASSERTF(Q_GETOINFO == 0x800102, "found 0x%.8x\n", + Q_GETOINFO); + LASSERTF(Q_GETOQUOTA == 0x800103, "found 0x%.8x\n", + Q_GETOQUOTA); + LASSERTF(Q_FINVALIDATE == 0x800104, "found 0x%.8x\n", + Q_FINVALIDATE); +#ifdef HAVE_SERVER_SUPPORT + + /* Checks for struct lquota_acct_rec */ + LASSERTF((int)sizeof(struct lquota_acct_rec) == 16, "found %lld\n", + (long long)(int)sizeof(struct lquota_acct_rec)); + LASSERTF((int)offsetof(struct lquota_acct_rec, bspace) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_acct_rec, bspace)); + LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->bspace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_acct_rec *)0)->bspace)); + LASSERTF((int)offsetof(struct lquota_acct_rec, ispace) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_acct_rec, ispace)); + LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->ispace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_acct_rec *)0)->ispace)); + + /* Checks for struct lquota_glb_rec */ + LASSERTF((int)sizeof(struct lquota_glb_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct lquota_glb_rec)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_hardlimit) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_hardlimit)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_softlimit) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_softlimit)); + LASSERTF((int)sizeof(((struct lquota_glb_rec 
*)0)->qbr_softlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_time) == 16, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_time)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_time)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_granted) == 24, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_granted)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted)); + + /* Checks for struct lquota_slv_rec */ + LASSERTF((int)sizeof(struct lquota_slv_rec) == 8, "found %lld\n", + (long long)(int)sizeof(struct lquota_slv_rec)); + LASSERTF((int)offsetof(struct lquota_slv_rec, qsr_granted) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_slv_rec, qsr_granted)); + LASSERTF((int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted)); + + /* Checks for struct idx_info */ + LASSERTF((int)sizeof(struct idx_info) == 80, "found %lld\n", + (long long)(int)sizeof(struct idx_info)); + LASSERTF((int)offsetof(struct idx_info, ii_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_magic)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_magic)); + LASSERTF((int)offsetof(struct idx_info, ii_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_flags)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_flags)); + LASSERTF((int)offsetof(struct idx_info, ii_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_count)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_count)); + LASSERTF((int)offsetof(struct idx_info, ii_pad0) == 10, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad0)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad0)); + LASSERTF((int)offsetof(struct idx_info, ii_attrs) == 12, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_attrs)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_attrs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_attrs)); + LASSERTF((int)offsetof(struct idx_info, ii_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_fid)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_fid)); + LASSERTF((int)offsetof(struct idx_info, ii_version) == 32, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_version)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_version)); + LASSERTF((int)offsetof(struct idx_info, ii_hash_start) == 40, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_hash_start)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_start)); + LASSERTF((int)offsetof(struct idx_info, 
ii_hash_end) == 48, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_hash_end)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_end)); + LASSERTF((int)offsetof(struct idx_info, ii_keysize) == 56, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_keysize)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_keysize) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_keysize)); + LASSERTF((int)offsetof(struct idx_info, ii_recsize) == 58, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_recsize)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_recsize) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_recsize)); + LASSERTF((int)offsetof(struct idx_info, ii_pad1) == 60, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad1)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad1)); + LASSERTF((int)offsetof(struct idx_info, ii_pad2) == 64, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad2)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad2)); + LASSERTF((int)offsetof(struct idx_info, ii_pad3) == 72, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad3)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad3)); + BUILD_BUG_ON(IDX_INFO_MAGIC != 0x3D37CC37); + + /* Checks for struct lu_idxpage */ + LASSERTF((int)sizeof(struct lu_idxpage) == 16, "found %lld\n", + (long long)(int)sizeof(struct lu_idxpage)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_magic)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_magic)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_flags)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_flags)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_nr) == 6, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_nr)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_nr) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_nr)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_pad0) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_pad0)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_pad0) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_pad0)); + BUILD_BUG_ON(LIP_MAGIC != 0x8A6D6B6C); + BUILD_BUG_ON(LIP_HDR_SIZE != (__builtin_offsetof (struct lu_idxpage, lip_entries))); + BUILD_BUG_ON(II_FL_NOHASH != 0x00000001); + BUILD_BUG_ON(II_FL_VARKEY != 0x00000002); + BUILD_BUG_ON(II_FL_VARREC != 0x00000004); + BUILD_BUG_ON(II_FL_NONUNQ != 0x00000008); + BUILD_BUG_ON(II_FL_NOKEY != 0x00000010); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct niobuf_remote */ + LASSERTF((int)sizeof(struct niobuf_remote) == 16, "found %lld\n", + (long long)(int)sizeof(struct niobuf_remote)); + LASSERTF((int)offsetof(struct niobuf_remote, rnb_offset) == 0, "found %lld\n", + (long 
long)(int)offsetof(struct niobuf_remote, rnb_offset)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_offset)); + LASSERTF((int)offsetof(struct niobuf_remote, rnb_len) == 8, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, rnb_len)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_len)); + LASSERTF((int)offsetof(struct niobuf_remote, rnb_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, rnb_flags)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_flags)); + LASSERTF(OBD_BRW_READ == 0x01, "found 0x%.8x\n", + OBD_BRW_READ); + LASSERTF(OBD_BRW_WRITE == 0x02, "found 0x%.8x\n", + OBD_BRW_WRITE); + LASSERTF(OBD_BRW_NDELAY == 0x04, "found 0x%.8x\n", + OBD_BRW_NDELAY); + LASSERTF(OBD_BRW_SYNC == 0x08, "found 0x%.8x\n", + OBD_BRW_SYNC); + LASSERTF(OBD_BRW_CHECK == 0x10, "found 0x%.8x\n", + OBD_BRW_CHECK); + LASSERTF(OBD_BRW_FROM_GRANT == 0x20, "found 0x%.8x\n", + OBD_BRW_FROM_GRANT); + LASSERTF(OBD_BRW_GRANTED == 0x40, "found 0x%.8x\n", + OBD_BRW_GRANTED); + LASSERTF(OBD_BRW_NOCACHE == 0x80, "found 0x%.8x\n", + OBD_BRW_NOCACHE); + LASSERTF(OBD_BRW_NOQUOTA == 0x100, "found 0x%.8x\n", + OBD_BRW_NOQUOTA); + LASSERTF(OBD_BRW_SRVLOCK == 0x200, "found 0x%.8x\n", + OBD_BRW_SRVLOCK); + LASSERTF(OBD_BRW_ASYNC == 0x400, "found 0x%.8x\n", + OBD_BRW_ASYNC); + LASSERTF(OBD_BRW_MEMALLOC == 0x800, "found 0x%.8x\n", + OBD_BRW_MEMALLOC); + LASSERTF(OBD_BRW_OVER_USRQUOTA == 0x1000, "found 0x%.8x\n", + OBD_BRW_OVER_USRQUOTA); + LASSERTF(OBD_BRW_OVER_GRPQUOTA == 0x2000, "found 0x%.8x\n", + OBD_BRW_OVER_GRPQUOTA); + LASSERTF(OBD_BRW_SOFT_SYNC == 0x4000, "found 0x%.8x\n", + OBD_BRW_SOFT_SYNC); + LASSERTF(OBD_BRW_OVER_PRJQUOTA == 0x8000, "found 0x%.8x\n", + OBD_BRW_OVER_PRJQUOTA); + LASSERTF(OBD_BRW_RDMA_ONLY == 0x20000, "found 0x%.8x\n", + OBD_BRW_RDMA_ONLY); + LASSERTF(OBD_BRW_SYS_RESOURCE == 0x40000, "found 0x%.8x\n", + OBD_BRW_SYS_RESOURCE); + + /* Checks for struct ost_body */ + LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n", + (long long)(int)sizeof(struct ost_body)); + LASSERTF((int)offsetof(struct ost_body, oa) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_body, oa)); + LASSERTF((int)sizeof(((struct ost_body *)0)->oa) == 208, "found %lld\n", + (long long)(int)sizeof(((struct ost_body *)0)->oa)); + + /* Checks for struct ll_fid */ + LASSERTF((int)sizeof(struct ll_fid) == 16, "found %lld\n", + (long long)(int)sizeof(struct ll_fid)); + LASSERTF((int)offsetof(struct ll_fid, id) == 0, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, id)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->id)); + LASSERTF((int)offsetof(struct ll_fid, generation) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, generation)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->generation) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->generation)); + LASSERTF((int)offsetof(struct ll_fid, f_type) == 12, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, f_type)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->f_type)); + + LASSERTF(MDS_CROSS_REF == 0x00000002UL, "found 0x%.8xUL\n", + 
(unsigned)MDS_CROSS_REF); + LASSERTF(MDS_PERM_BYPASS == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)MDS_PERM_BYPASS); + LASSERTF(MDS_QUOTA_IGNORE == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MDS_QUOTA_IGNORE); + LASSERTF(MDS_KEEP_ORPHAN == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MDS_KEEP_ORPHAN); + LASSERTF(MDS_RECOV_OPEN == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)MDS_RECOV_OPEN); + LASSERTF(MDS_DATA_MODIFIED == 0x00000200UL, "found 0x%.8xUL\n", + (unsigned)MDS_DATA_MODIFIED); + LASSERTF(MDS_CREATE_VOLATILE == 0x00000400UL, "found 0x%.8xUL\n", + (unsigned)MDS_CREATE_VOLATILE); + LASSERTF(MDS_OWNEROVERRIDE == 0x00000800UL, "found 0x%.8xUL\n", + (unsigned)MDS_OWNEROVERRIDE); + LASSERTF(MDS_HSM_RELEASE == 0x00001000UL, "found 0x%.8xUL\n", + (unsigned)MDS_HSM_RELEASE); + LASSERTF(MDS_CLOSE_MIGRATE == 0x00002000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_MIGRATE); + LASSERTF(MDS_CLOSE_LAYOUT_SWAP == 0x00004000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_LAYOUT_SWAP); + LASSERTF(MDS_CLOSE_LAYOUT_MERGE == 0x00008000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_LAYOUT_MERGE); + LASSERTF(MDS_CLOSE_RESYNC_DONE == 0x00010000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_RESYNC_DONE); + LASSERTF(MDS_CLOSE_LAYOUT_SPLIT == 0x00020000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_LAYOUT_SPLIT); + LASSERTF(MDS_TRUNC_KEEP_LEASE == 0x00040000UL, "found 0x%.8xUL\n", + (unsigned)MDS_TRUNC_KEEP_LEASE); + LASSERTF(MDS_PCC_ATTACH == 0x00080000UL, "found 0x%.8xUL\n", + (unsigned)MDS_PCC_ATTACH); + LASSERTF(MDS_CLOSE_UPDATE_TIMES == 0x00100000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_UPDATE_TIMES); + LASSERTF(MDS_SETSTRIPE_CREATE == 0x00200000UL, "found 0x%.8xUL\n", + (unsigned)MDS_SETSTRIPE_CREATE); + LASSERTF(MDS_FID_OP == 0x00400000UL, "found 0x%.8xUL\n", + (unsigned)MDS_FID_OP); + LASSERTF(MDS_MIGRATE_NSONLY == 0x00800000UL, "found 0x%.8xUL\n", + (unsigned)MDS_MIGRATE_NSONLY); + + /* Checks for struct mdt_body */ + LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n", + (long long)(int)sizeof(struct mdt_body)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fid1) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fid1)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid1)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fid2) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fid2)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid2)); + LASSERTF((int)offsetof(struct mdt_body, mbo_open_handle) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_open_handle)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_open_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_open_handle)); + LASSERTF((int)offsetof(struct mdt_body, mbo_valid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_valid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_valid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_size) == 48, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_size)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_size)); + LASSERTF((int)offsetof(struct mdt_body, mbo_mtime) == 56, "found %lld\n", + (long long)(int)offsetof(struct 
mdt_body, mbo_mtime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_mtime)); + LASSERTF((int)offsetof(struct mdt_body, mbo_atime) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_atime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_atime)); + LASSERTF((int)offsetof(struct mdt_body, mbo_ctime) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_ctime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_ctime)); + LASSERTF((int)offsetof(struct mdt_body, mbo_blocks) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_blocks)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_blocks)); + LASSERTF((int)offsetof(struct mdt_body, mbo_version) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_version)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_version)); + LASSERTF((int)offsetof(struct mdt_body, mbo_t_state) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_t_state)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_t_state) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_t_state)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fsuid) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fsuid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fsuid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fsgid) == 108, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fsgid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fsgid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_capability) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_capability)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_capability) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_capability)); + LASSERTF((int)offsetof(struct mdt_body, mbo_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_mode)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_mode)); + LASSERTF((int)offsetof(struct mdt_body, mbo_uid) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_uid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_uid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_gid) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_gid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_gid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_flags) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_flags)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_flags)); + LASSERTF((int)offsetof(struct mdt_body, mbo_rdev) == 132, 
"found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_rdev)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_rdev) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_rdev)); + LASSERTF((int)offsetof(struct mdt_body, mbo_nlink) == 136, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_nlink)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_nlink) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_nlink)); + LASSERTF((int)offsetof(struct mdt_body, mbo_layout_gen) == 140, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_layout_gen)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_layout_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_layout_gen)); + LASSERTF((int)offsetof(struct mdt_body, mbo_suppgid) == 144, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_suppgid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_suppgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_suppgid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_eadatasize) == 148, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_eadatasize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_eadatasize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_eadatasize)); + LASSERTF((int)offsetof(struct mdt_body, mbo_aclsize) == 152, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_aclsize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_aclsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_aclsize)); + LASSERTF((int)offsetof(struct mdt_body, mbo_max_mdsize) == 156, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_max_mdsize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_max_mdsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_max_mdsize)); + LASSERTF((int)offsetof(struct mdt_body, mbo_unused3) == 160, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_unused3)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_unused3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_unused3)); + LASSERTF((int)offsetof(struct mdt_body, mbo_uid_h) == 164, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_uid_h)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_uid_h)); + LASSERTF((int)offsetof(struct mdt_body, mbo_gid_h) == 168, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_gid_h)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_gid_h)); + LASSERTF((int)offsetof(struct mdt_body, mbo_projid) == 172, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_projid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_projid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_projid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_dom_size) == 176, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_dom_size)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_dom_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_dom_size)); + LASSERTF((int)offsetof(struct mdt_body, mbo_dom_blocks) == 184, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_dom_blocks)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_dom_blocks) 
== 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_dom_blocks)); + LASSERTF((int)offsetof(struct mdt_body, mbo_btime) == 192, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_btime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_btime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_btime)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_9) == 200, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_9)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_9)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_10) == 208, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_10)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_10) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_10)); + LASSERTF(MDS_FMODE_CLOSED == 000000000000UL, "found 0%.11oUL\n", + MDS_FMODE_CLOSED); + LASSERTF(MDS_FMODE_EXEC == 000000000004UL, "found 0%.11oUL\n", + MDS_FMODE_EXEC); + LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n", + MDS_OPEN_CREATED); + LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n", + MDS_OPEN_CREAT); + LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n", + MDS_OPEN_EXCL); + LASSERTF(MDS_OPEN_TRUNC == 000000001000UL, "found 0%.11oUL\n", + MDS_OPEN_TRUNC); + LASSERTF(MDS_OPEN_APPEND == 000000002000UL, "found 0%.11oUL\n", + MDS_OPEN_APPEND); + LASSERTF(MDS_OPEN_SYNC == 000000010000UL, "found 0%.11oUL\n", + MDS_OPEN_SYNC); + LASSERTF(MDS_OPEN_DIRECTORY == 000000200000UL, "found 0%.11oUL\n", + MDS_OPEN_DIRECTORY); + LASSERTF(MDS_OPEN_BY_FID == 000040000000UL, "found 0%.11oUL\n", + MDS_OPEN_BY_FID); + LASSERTF(MDS_OPEN_DELAY_CREATE == 000100000000UL, "found 0%.11oUL\n", + MDS_OPEN_DELAY_CREATE); + LASSERTF(MDS_OPEN_OWNEROVERRIDE == 000200000000UL, "found 0%.11oUL\n", + MDS_OPEN_OWNEROVERRIDE); + LASSERTF(MDS_OPEN_JOIN_FILE == 000400000000UL, "found 0%.11oUL\n", + MDS_OPEN_JOIN_FILE); + LASSERTF(MDS_OPEN_LOCK == 004000000000UL, "found 0%.11oUL\n", + MDS_OPEN_LOCK); + LASSERTF(MDS_OPEN_HAS_EA == 010000000000UL, "found 0%.11oUL\n", + MDS_OPEN_HAS_EA); + LASSERTF(MDS_OPEN_HAS_OBJS == 020000000000UL, "found 0%.11oUL\n", + MDS_OPEN_HAS_OBJS); + LASSERTF(MDS_OPEN_NORESTORE == 00000000000100000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_NORESTORE); + LASSERTF(MDS_OPEN_NEWSTRIPE == 00000000000200000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_NEWSTRIPE); + LASSERTF(MDS_OPEN_VOLATILE == 00000000000400000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_VOLATILE); + LASSERTF(MDS_OPEN_LEASE == 00000000001000000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_LEASE); + LASSERTF(MDS_OPEN_RESYNC == 00000000004000000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_RESYNC); + LASSERTF(MDS_OPEN_PCC == 00000000010000000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_PCC); + LASSERTF(MDS_OPEN_DEFAULT_LMV == 00000000040000000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_DEFAULT_LMV); + LASSERTF(LUSTRE_SYNC_FL == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_SYNC_FL); + LASSERTF(LUSTRE_IMMUTABLE_FL == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_IMMUTABLE_FL); + LASSERTF(LUSTRE_APPEND_FL == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_APPEND_FL); + LASSERTF(LUSTRE_NODUMP_FL == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_NODUMP_FL); + 
LASSERTF(LUSTRE_NOATIME_FL == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_NOATIME_FL); + LASSERTF(LUSTRE_INDEX_FL == 0x00001000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_INDEX_FL); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF(LUSTRE_ORPHAN_FL == 0x00002000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_ORPHAN_FL); +#endif /* HAVE_SERVER_SUPPORT */ + LASSERTF(LUSTRE_DIRSYNC_FL == 0x00010000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_DIRSYNC_FL); + LASSERTF(LUSTRE_TOPDIR_FL == 0x00020000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_TOPDIR_FL); + LASSERTF(LUSTRE_INLINE_DATA_FL == 0x10000000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_INLINE_DATA_FL); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF(LUSTRE_SET_SYNC_FL == 0x00040000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_SET_SYNC_FL); +#endif /* HAVE_SERVER_SUPPORT */ + LASSERTF(LUSTRE_ENCRYPT_FL == 0x00800000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_ENCRYPT_FL); + LASSERTF(MDS_INODELOCK_LOOKUP == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MDS_INODELOCK_LOOKUP); + LASSERTF(MDS_INODELOCK_UPDATE == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MDS_INODELOCK_UPDATE); + LASSERTF(MDS_INODELOCK_OPEN == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MDS_INODELOCK_OPEN); + LASSERTF(MDS_INODELOCK_LAYOUT == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)MDS_INODELOCK_LAYOUT); + LASSERTF(MDS_INODELOCK_PERM == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MDS_INODELOCK_PERM); + LASSERTF(MDS_INODELOCK_XATTR == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MDS_INODELOCK_XATTR); + LASSERTF(MDS_INODELOCK_DOM == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MDS_INODELOCK_DOM); + + /* Checks for struct mdt_ioepoch */ + LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n", + (long long)(int)sizeof(struct mdt_ioepoch)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_open_handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_open_handle)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_open_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_open_handle)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_unused1) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_unused1)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_unused1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_unused1)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_unused2) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_unused2)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_unused2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_unused2)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_padding)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_padding)); + + /* Checks for struct mdt_rec_setattr */ + LASSERTF((int)sizeof(struct mdt_rec_setattr) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_setattr)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct 
mdt_rec_setattr, sa_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_valid) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_valid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_uid) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_uid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid)); + 
LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_gid) == 68, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_gid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_size) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_size)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_size)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_blocks) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_blocks)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mtime) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_mtime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_atime) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_atime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_ctime) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_ctime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_attr_flags) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_attr_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_bias) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_projid) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_projid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_projid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_projid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_4) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_5) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5) == 4, "found 
%lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5)); + + /* Checks for struct mdt_rec_create */ + LASSERTF((int)sizeof(struct mdt_rec_create) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_create)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_cap)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create 
*)0)->cr_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_open_handle_old) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_open_handle_old)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_open_handle_old) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_open_handle_old)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_time)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_time)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_rdev) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_rdev)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_rdev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_rdev)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_ioepoch) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_ioepoch)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_1) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_mode) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_mode)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_bias) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_bias)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_l) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_flags_l)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_h) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_flags_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_umask) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_umask)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_umask) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_umask)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_4) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4) == 4, 
"found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4)); + + /* Checks for struct mdt_rec_link */ + LASSERTF((int)sizeof(struct mdt_rec_link) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_link)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_cap)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid2) == 56, "found %lld\n", + 
(long long)(int)offsetof(struct mdt_rec_link, lk_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_time)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_time)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_bias)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_5) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_9) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9)); + + /* Checks for struct mdt_rec_unlink */ + 
LASSERTF((int)sizeof(struct mdt_rec_unlink) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_unlink)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct 
mdt_rec_unlink, ul_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_time)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_time)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_2) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_3) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_4) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_5) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_9) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9)); + + /* 
Checks for struct mdt_rec_rename */ + LASSERTF((int)sizeof(struct mdt_rec_rename) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_rename)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_cap)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid2) == 56, "found %lld\n", + 
(long long)(int)offsetof(struct mdt_rec_rename, rn_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_time)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_time)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_bias)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_mode)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_5) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_6) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_7) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_8) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct 
mdt_rec_rename *)0)->rn_padding_8)); + + /* Checks for struct mdt_rec_setxattr */ + LASSERTF((int)sizeof(struct mdt_rec_setxattr) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_setxattr)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid) == 16, "found %lld\n", + (long 
long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_1) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_2) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_3) == 68, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_valid) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_valid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_time) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_time)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_5) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_6) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_7) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_size) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_size)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_flags) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_8) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, 
sx_padding_9) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_10) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_10)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_11) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_11)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11)); + + /* Checks for struct mdt_rec_resync */ + LASSERTF((int)sizeof(struct mdt_rec_resync) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_resync)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_cap)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2) == 32, 
"found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fid)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding0) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding0)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_lease_handle) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_lease_handle)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_lease_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_lease_handle)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding1)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding2)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding3)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding4)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_bias)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding5) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding5)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding6)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6) == 4, "found %lld\n", + (long 
long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding7)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding8)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_mirror_id) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_mirror_id)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_mirror_id) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_mirror_id)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding9) == 134, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding9)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9)); + + /* Checks for struct mdt_rec_reint */ + LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_reint)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_cap)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1)); + LASSERTF((int)offsetof(struct 
mdt_rec_reint, rr_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mtime) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mtime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_atime) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_atime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_atime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_ctime) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_ctime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_size) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_size)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_size)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_blocks) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_blocks)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_bias)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mode)); + LASSERTF((int)offsetof(struct mdt_rec_reint, 
rr_flags) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags_h) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_flags_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_umask) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_umask)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mirror_id) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mirror_id)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mirror_id) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mirror_id)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 134, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4)); + + /* Checks for struct lmv_desc */ + LASSERTF((int)sizeof(struct lmv_desc) == 88, "found %lld\n", + (long long)(int)sizeof(struct lmv_desc)); + LASSERTF((int)offsetof(struct lmv_desc, ld_tgt_count) == 0, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_tgt_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_tgt_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_active_tgt_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_default_stripe_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_default_stripe_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_pattern) == 12, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_pattern)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_pattern)); + LASSERTF((int)offsetof(struct lmv_desc, ld_default_hash_size) == 16, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_default_hash_size)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_1) == 24, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_1)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_1)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_2) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_2)); + 
LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_2)); + LASSERTF((int)offsetof(struct lmv_desc, ld_qos_maxage) == 36, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_qos_maxage)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_3) == 40, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_3)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_3)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_4) == 44, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_4)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_4)); + LASSERTF((int)offsetof(struct lmv_desc, ld_uuid) == 48, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_uuid)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_uuid)); + + /* Checks for struct lov_desc */ + LASSERTF((int)sizeof(struct lov_desc) == 88, "found %lld\n", + (long long)(int)sizeof(struct lov_desc)); + LASSERTF((int)offsetof(struct lov_desc, ld_tgt_count) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_tgt_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_tgt_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_active_tgt_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_pattern) == 12, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_pattern)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_pattern)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_size) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_size)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_0) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_0)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct 
lov_desc *)0)->ld_padding_0)); + LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_qos_maxage)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_1)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_2)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2)); + LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_uuid)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_uuid)); + BUILD_BUG_ON(LOV_DESC_MAGIC != 0xB0CCDE5C); + + /* Checks for struct ldlm_res_id */ + LASSERTF((int)sizeof(struct ldlm_res_id) == 32, "found %lld\n", + (long long)(int)sizeof(struct ldlm_res_id)); + BUILD_BUG_ON(RES_NAME_SIZE != 4); + LASSERTF((int)offsetof(struct ldlm_res_id, name[4]) == 32, "found %lld\n", + (long long)(int)offsetof(struct ldlm_res_id, name[4])); + LASSERTF((int)sizeof(((struct ldlm_res_id *)0)->name[4]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_res_id *)0)->name[4])); + + /* Checks for struct ldlm_extent */ + LASSERTF((int)sizeof(struct ldlm_extent) == 24, "found %lld\n", + (long long)(int)sizeof(struct ldlm_extent)); + LASSERTF((int)offsetof(struct ldlm_extent, start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, start)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->start)); + LASSERTF((int)offsetof(struct ldlm_extent, end) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, end)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->end)); + LASSERTF((int)offsetof(struct ldlm_extent, gid) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, gid)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->gid)); + + /* Checks for struct ldlm_inodebits */ + LASSERTF((int)sizeof(struct ldlm_inodebits) == 24, "found %lld\n", + (long long)(int)sizeof(struct ldlm_inodebits)); + LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_inodebits, bits)); + LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits)); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF((int)offsetof(struct ldlm_inodebits, try_bits) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_inodebits, try_bits)); + LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->try_bits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_inodebits *)0)->try_bits)); +#else + LASSERTF((int)offsetof(struct ldlm_inodebits, cancel_bits) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_inodebits, cancel_bits)); + 
LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->cancel_bits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_inodebits *)0)->cancel_bits)); +#endif /* HAVE_SERVER_SUPPORT */ + LASSERTF((int)offsetof(struct ldlm_inodebits, li_gid) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_inodebits, li_gid)); + LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->li_gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_inodebits *)0)->li_gid)); + + /* Checks for struct ldlm_flock_wire */ + LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n", + (long long)(int)sizeof(struct ldlm_flock_wire)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_start)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_end)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_owner) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_owner)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_padding) == 24, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_padding)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_pid) == 28, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_pid)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid)); + + /* Checks for struct ldlm_intent */ + LASSERTF((int)sizeof(struct ldlm_intent) == 8, "found %lld\n", + (long long)(int)sizeof(struct ldlm_intent)); + LASSERTF((int)offsetof(struct ldlm_intent, opc) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_intent, opc)); + LASSERTF((int)sizeof(((struct ldlm_intent *)0)->opc) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_intent *)0)->opc)); + BUILD_BUG_ON(IT_OPEN != 0x00000001); + BUILD_BUG_ON(IT_CREAT != 0x00000002); + BUILD_BUG_ON(IT_READDIR != 0x00000004); + BUILD_BUG_ON(IT_GETATTR != 0x00000008); + BUILD_BUG_ON(IT_LOOKUP != 0x00000010); + BUILD_BUG_ON(IT_GETXATTR != 0x00000080); + BUILD_BUG_ON(IT_LAYOUT != 0x00000400); + BUILD_BUG_ON(IT_QUOTA_DQACQ != 0x00000800); + BUILD_BUG_ON(IT_QUOTA_CONN != 0x00001000); + BUILD_BUG_ON(IT_GLIMPSE != 0x00004000); + BUILD_BUG_ON(IT_BRW != 0x00008000); + + /* Checks for struct ldlm_resource_desc */ + LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n", + (long long)(int)sizeof(struct ldlm_resource_desc)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_type)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, 
lr_pad) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_pad)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_pad) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_pad)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_name)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_name)); + + /* Checks for struct ldlm_lock_desc */ + LASSERTF((int)sizeof(struct ldlm_lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(struct ldlm_lock_desc)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_resource) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_resource)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_resource) == 40, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_resource)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_req_mode) == 40, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_req_mode)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_granted_mode) == 44, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_granted_mode)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_policy_data) == 48, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_policy_data)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data)); + + /* Checks for struct ldlm_request */ + LASSERTF((int)sizeof(struct ldlm_request) == 104, "found %lld\n", + (long long)(int)sizeof(struct ldlm_request)); + LASSERTF((int)offsetof(struct ldlm_request, lock_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_flags)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_request, lock_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_count)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_count)); + LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_desc)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_desc)); + LASSERTF((int)offsetof(struct ldlm_request, lock_handle) == 88, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_handle)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_handle) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_handle)); + + /* Checks for struct ldlm_reply */ + LASSERTF((int)sizeof(struct ldlm_reply) == 112, "found %lld\n", + (long long)(int)sizeof(struct ldlm_reply)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_flags) == 0, "found %lld\n", + (long 
long)(int)offsetof(struct ldlm_reply, lock_flags)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_padding) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_padding)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_padding)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_desc) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_desc)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_desc)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_handle) == 88, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_handle)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_handle)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res1) == 96, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_policy_res1)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res2) == 104, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_policy_res2)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2)); + + /* Checks for struct ost_lvb_v1 */ + LASSERTF((int)sizeof(struct ost_lvb_v1) == 40, "found %lld\n", + (long long)(int)sizeof(struct ost_lvb_v1)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_size)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_mtime) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_mtime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_atime) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_atime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_ctime) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_ctime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_blocks) == 32, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_blocks)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks)); + + /* Checks for struct ost_lvb */ + LASSERTF((int)sizeof(struct ost_lvb) == 56, "found %lld\n", + (long long)(int)sizeof(struct ost_lvb)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_size)); + LASSERTF((int)sizeof(((struct 
ost_lvb *)0)->lvb_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_size)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_mtime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_atime) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_atime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_ctime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_blocks) == 32, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_blocks)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime_ns) == 40, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_mtime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_atime_ns) == 44, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_atime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime_ns) == 48, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_ctime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_padding) == 52, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_padding)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_padding)); + + /* Checks for struct lquota_lvb */ + LASSERTF((int)sizeof(struct lquota_lvb) == 40, "found %lld\n", + (long long)(int)sizeof(struct lquota_lvb)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_flags)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_flags)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_may_rel) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_may_rel)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_rel) == 16, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_rel)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_qunit) == 24, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_qunit)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit) == 8, 
"found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_pad1) == 32, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_pad1)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_pad1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_pad1)); + LASSERTF(LQUOTA_FL_EDQUOT == 1, "found %lld\n", + (long long)LQUOTA_FL_EDQUOT); + + /* Checks for struct ldlm_gl_lquota_desc */ + LASSERTF((int)sizeof(struct ldlm_gl_lquota_desc) == 64, "found %lld\n", + (long long)(int)sizeof(struct ldlm_gl_lquota_desc)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_id) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_id)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_flags)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_ver) == 24, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_ver)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit) == 32, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit) == 40, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_time) == 48, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_time)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2) == 56, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2)); +#ifdef HAVE_SERVER_SUPPORT + + /* Checks for struct ldlm_gl_barrier_desc */ + LASSERTF((int)sizeof(struct ldlm_gl_barrier_desc) == 16, "found %lld\n", + (long long)(int)sizeof(struct ldlm_gl_barrier_desc)); + LASSERTF((int)offsetof(struct ldlm_gl_barrier_desc, lgbd_status) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_barrier_desc, lgbd_status)); + LASSERTF((int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_status)); + LASSERTF((int)offsetof(struct ldlm_gl_barrier_desc, lgbd_timeout) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_barrier_desc, lgbd_timeout)); + 
LASSERTF((int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_timeout)); + LASSERTF((int)offsetof(struct ldlm_gl_barrier_desc, lgbd_padding) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_barrier_desc, lgbd_padding)); + LASSERTF((int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_padding) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_padding)); + + /* Checks for struct barrier_lvb */ + LASSERTF((int)sizeof(struct barrier_lvb) == 16, "found %lld\n", + (long long)(int)sizeof(struct barrier_lvb)); + LASSERTF((int)offsetof(struct barrier_lvb, lvb_status) == 0, "found %lld\n", + (long long)(int)offsetof(struct barrier_lvb, lvb_status)); + LASSERTF((int)sizeof(((struct barrier_lvb *)0)->lvb_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct barrier_lvb *)0)->lvb_status)); + LASSERTF((int)offsetof(struct barrier_lvb, lvb_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct barrier_lvb, lvb_index)); + LASSERTF((int)sizeof(((struct barrier_lvb *)0)->lvb_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct barrier_lvb *)0)->lvb_index)); + LASSERTF((int)offsetof(struct barrier_lvb, lvb_padding) == 8, "found %lld\n", + (long long)(int)offsetof(struct barrier_lvb, lvb_padding)); + LASSERTF((int)sizeof(((struct barrier_lvb *)0)->lvb_padding) == 8, "found %lld\n", + (long long)(int)sizeof(((struct barrier_lvb *)0)->lvb_padding)); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct mgs_send_param */ + LASSERTF((int)sizeof(struct mgs_send_param) == 1024, "found %lld\n", + (long long)(int)sizeof(struct mgs_send_param)); + BUILD_BUG_ON(MGS_PARAM_MAXLEN != 1024); + LASSERTF((int)offsetof(struct mgs_send_param, mgs_param[1024]) == 1024, "found %lld\n", + (long long)(int)offsetof(struct mgs_send_param, mgs_param[1024])); + LASSERTF((int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024])); + + /* Checks for struct cfg_marker */ + LASSERTF((int)sizeof(struct cfg_marker) == 160, "found %lld\n", + (long long)(int)sizeof(struct cfg_marker)); + LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_step)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step)); + LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_flags)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags)); + LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_vers)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers)); + LASSERTF((int)offsetof(struct cfg_marker, cm_padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_padding)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_padding)); + LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_createtime)); + LASSERTF((int)sizeof(((struct cfg_marker 
*)0)->cm_createtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime)); + LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_canceltime)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime)); + LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_tgtname)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname)); + LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_comment)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment)); + + /* Checks for struct llog_logid */ + LASSERTF((int)sizeof(struct llog_logid) == 20, "found %lld\n", + (long long)(int)sizeof(struct llog_logid)); + LASSERTF((int)offsetof(struct llog_logid, lgl_oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_logid, lgl_oi)); + LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid *)0)->lgl_oi)); + LASSERTF((int)offsetof(struct llog_logid, lgl_ogen) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_logid, lgl_ogen)); + LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen)); + BUILD_BUG_ON(OST_SZ_REC != 0x10600f00); + BUILD_BUG_ON(MDS_UNLINK_REC != 0x10612404); + BUILD_BUG_ON(MDS_UNLINK64_REC != 0x10692404); + BUILD_BUG_ON(MDS_SETATTR64_REC != 0x10692401); + BUILD_BUG_ON(OBD_CFG_REC != 0x10620000); + BUILD_BUG_ON(LLOG_GEN_REC != 0x10640000); + BUILD_BUG_ON(CHANGELOG_REC != 0x10660000); + BUILD_BUG_ON(CHANGELOG_USER_REC != 0x10670000); + BUILD_BUG_ON(CHANGELOG_USER_REC2 != 0x10670002); + BUILD_BUG_ON(HSM_AGENT_REC != 0x10680000); + BUILD_BUG_ON(UPDATE_REC != 0x106a0000); + BUILD_BUG_ON(LLOG_HDR_MAGIC != 0x10645539); + BUILD_BUG_ON(LLOG_LOGID_MAGIC != 0x1064553b); + + /* Checks for struct llog_catid */ + LASSERTF((int)sizeof(struct llog_catid) == 32, "found %lld\n", + (long long)(int)sizeof(struct llog_catid)); + LASSERTF((int)offsetof(struct llog_catid, lci_logid) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_logid)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding1)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding2)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding3)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, "found %lld\n", + (long 
long)(int)sizeof(((struct llog_catid *)0)->lci_padding3)); + + /* Checks for struct llog_rec_hdr */ + LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(struct llog_rec_hdr)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_len)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_len)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_index)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_index)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_type) == 8, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_type)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_id) == 12, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_id)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_id)); + + /* Checks for struct llog_rec_tail */ + LASSERTF((int)sizeof(struct llog_rec_tail) == 8, "found %lld\n", + (long long)(int)sizeof(struct llog_rec_tail)); + LASSERTF((int)offsetof(struct llog_rec_tail, lrt_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_tail, lrt_len)); + LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_len)); + LASSERTF((int)offsetof(struct llog_rec_tail, lrt_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_tail, lrt_index)); + LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_index)); + + /* Checks for struct llog_logid_rec */ + LASSERTF((int)sizeof(struct llog_logid_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_logid_rec)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_hdr)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_hdr)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_id)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding1)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding1)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding2)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding2)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding3) == 48, "found %lld\n", + (long 
long)(int)offsetof(struct llog_logid_rec, lid_padding3)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding3)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_tail)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_tail)); + + /* Checks for struct llog_unlink_rec */ + LASSERTF((int)sizeof(struct llog_unlink_rec) == 40, "found %lld\n", + (long long)(int)sizeof(struct llog_unlink_rec)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_oid)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oseq) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_oseq)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_count)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_tail)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail)); + /* Checks for struct llog_unlink64_rec */ + LASSERTF((int)sizeof(struct llog_unlink64_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_unlink64_rec)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_fid)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_count) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_count)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_count)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_tail)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec 
*)0)->lur_tail)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding1)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding2)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding3)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3)); + +#ifdef HAVE_SERVER_SUPPORT + /* Checks for struct llog_setattr64_rec */ + LASSERTF((int)sizeof(struct llog_setattr64_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_setattr64_rec)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oi)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_valid) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_valid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail)); + LASSERTF((int)sizeof(((struct 
llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == (int)offsetof(struct llog_setattr64_rec_v2, lsr_hdr), "%d != %d\n", + (int)offsetof(struct llog_setattr64_rec, lsr_hdr), (int)offsetof(struct llog_setattr64_rec_v2, lsr_hdr)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_hdr), "%d != %d\n", + (int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr), (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_hdr)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == (int)offsetof(struct llog_setattr64_rec_v2, lsr_oi), "%d != %d\n", + (int)offsetof(struct llog_setattr64_rec, lsr_oi), (int)offsetof(struct llog_setattr64_rec_v2, lsr_oi)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_oi), "%d != %d\n", + (int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi), (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_oi)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == (int)offsetof(struct llog_setattr64_rec_v2, lsr_uid), "%d != %d\n", + (int)offsetof(struct llog_setattr64_rec, lsr_uid), (int)offsetof(struct llog_setattr64_rec_v2, lsr_uid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_uid), "%d != %d\n", + (int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid), (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_uid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == (int)offsetof(struct llog_setattr64_rec_v2, lsr_uid_h), "%d != %d\n", + (int)offsetof(struct llog_setattr64_rec, lsr_uid_h), (int)offsetof(struct llog_setattr64_rec_v2, lsr_uid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_uid_h), "%d != %d\n", + (int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h), (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_uid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == (int)offsetof(struct llog_setattr64_rec_v2, lsr_gid), "%d != %d\n", + (int)offsetof(struct llog_setattr64_rec, lsr_gid), (int)offsetof(struct llog_setattr64_rec_v2, lsr_gid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_gid), "%d != %d\n", + (int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid), (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_gid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == (int)offsetof(struct llog_setattr64_rec_v2, lsr_gid_h), "%d != %d\n", + (int)offsetof(struct llog_setattr64_rec, lsr_gid_h), (int)offsetof(struct llog_setattr64_rec_v2, lsr_gid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_gid_h), "%d != %d\n", + (int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h), (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_gid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_valid) == (int)offsetof(struct llog_setattr64_rec_v2, lsr_valid), "%d != %d\n", + (int)offsetof(struct llog_setattr64_rec, lsr_valid), (int)offsetof(struct llog_setattr64_rec_v2, lsr_valid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid) == (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_valid), "%d != %d\n", + 
(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid), (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_valid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_projid) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_projid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_projid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_projid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_layout_version) == 60, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_layout_version)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_layout_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_layout_version)); + LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_padding2) == 64, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_padding2)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_padding2)); + LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_padding3) == 72, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_padding3)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_padding3)); + LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_tail) == 80, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_tail)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_tail)); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct llog_size_change_rec */ + LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_size_change_rec)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_hdr)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_fid)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_ioepoch) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_ioepoch)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding1)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding2)); + LASSERTF((int)sizeof(((struct llog_size_change_rec 
*)0)->lsc_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding3)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail)); + + /* Checks for struct changelog_rec */ + LASSERTF((int)sizeof(struct changelog_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct changelog_rec)); + LASSERTF((int)offsetof(struct changelog_rec, cr_namelen) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_namelen)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_namelen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_namelen)); + LASSERTF((int)offsetof(struct changelog_rec, cr_flags) == 2, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_flags)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_flags)); + LASSERTF((int)offsetof(struct changelog_rec, cr_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_type)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_type)); + LASSERTF((int)offsetof(struct changelog_rec, cr_index) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_index)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_index) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_index)); + LASSERTF((int)offsetof(struct changelog_rec, cr_prev) == 16, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_prev)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_prev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_prev)); + LASSERTF((int)offsetof(struct changelog_rec, cr_time) == 24, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_time)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_time)); + LASSERTF((int)offsetof(struct changelog_rec, cr_tfid) == 32, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_tfid)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_tfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_tfid)); + LASSERTF((int)offsetof(struct changelog_rec, cr_pfid) == 48, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_pfid)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_pfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_pfid)); + + /* Checks for struct changelog_ext_rename */ + LASSERTF((int)sizeof(struct changelog_ext_rename) == 32, "found %lld\n", + (long long)(int)sizeof(struct changelog_ext_rename)); + LASSERTF((int)offsetof(struct changelog_ext_rename, cr_sfid) == 0, "found %lld\n", + (long long)(int)offsetof(struct 
changelog_ext_rename, cr_sfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rename *)0)->cr_sfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rename *)0)->cr_sfid)); + LASSERTF((int)offsetof(struct changelog_ext_rename, cr_spfid) == 16, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rename, cr_spfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rename *)0)->cr_spfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rename *)0)->cr_spfid)); + +#ifdef HAVE_SERVER_SUPPORT + /* Checks for struct changelog_ext_jobid */ + LASSERTF((int)sizeof(struct changelog_ext_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(struct changelog_ext_jobid)); + BUILD_BUG_ON(LUSTRE_JOBID_SIZE != 32); + LASSERTF((int)offsetof(struct changelog_ext_jobid, cr_jobid) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_jobid, cr_jobid)); + LASSERTF((int)sizeof(((struct changelog_ext_jobid *)0)->cr_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_jobid *)0)->cr_jobid)); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct changelog_setinfo */ + LASSERTF((int)sizeof(struct changelog_setinfo) == 12, "found %lld\n", + (long long)(int)sizeof(struct changelog_setinfo)); + LASSERTF((int)offsetof(struct changelog_setinfo, cs_recno) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_setinfo, cs_recno)); + LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_recno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_recno)); + LASSERTF((int)offsetof(struct changelog_setinfo, cs_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_setinfo, cs_id)); + LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_id)); + + /* Checks for struct llog_changelog_rec */ + LASSERTF((int)sizeof(struct llog_changelog_rec) == 88, "found %lld\n", + (long long)(int)sizeof(struct llog_changelog_rec)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_hdr)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr) == 64, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_do_not_use) == 80, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_do_not_use)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_do_not_use) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_do_not_use)); +#ifdef HAVE_SERVER_SUPPORT + + /* Checks for struct llog_changelog_user_rec */ + LASSERTF((int)sizeof(struct llog_changelog_user_rec) == 40, "found %lld\n", + (long long)(int)sizeof(struct llog_changelog_user_rec)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_hdr)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr)); + LASSERTF((int)offsetof(struct 
llog_changelog_user_rec, cur_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_time) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_time)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_time)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_tail) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_tail)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail)); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct llog_gen */ + LASSERTF((int)sizeof(struct llog_gen) == 16, "found %lld\n", + (long long)(int)sizeof(struct llog_gen)); + LASSERTF((int)offsetof(struct llog_gen, mnt_cnt) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_gen, mnt_cnt)); + LASSERTF((int)sizeof(((struct llog_gen *)0)->mnt_cnt) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen *)0)->mnt_cnt)); + LASSERTF((int)offsetof(struct llog_gen, conn_cnt) == 8, "found %lld\n", + (long long)(int)offsetof(struct llog_gen, conn_cnt)); + LASSERTF((int)sizeof(((struct llog_gen *)0)->conn_cnt) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen *)0)->conn_cnt)); + + /* Checks for struct llog_gen_rec */ + LASSERTF((int)sizeof(struct llog_gen_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_gen_rec)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_hdr)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_gen) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_gen)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_gen) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_gen)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_tail)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_tail)); + + /* Checks for struct llog_log_hdr */ + LASSERTF((int)sizeof(struct llog_log_hdr) == 8192, "found %lld\n", + (long long)(int)sizeof(struct llog_log_hdr)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_hdr)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_hdr)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_timestamp) == 16, "found %lld\n", + (long 
long)(int)offsetof(struct llog_log_hdr, llh_timestamp)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_count)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_count)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap_offset) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap_offset)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_size) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_size)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_size)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_flags) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_flags)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_flags)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_cat_idx) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_cat_idx)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_tgtuuid) == 44, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_tgtuuid)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid)); + BUILD_BUG_ON(LLOG_F_ZAP_WHEN_EMPTY != 0x00000001); + BUILD_BUG_ON(LLOG_F_IS_CAT != 0x00000002); + BUILD_BUG_ON(LLOG_F_IS_PLAIN != 0x00000004); + BUILD_BUG_ON(LLOG_F_EXT_JOBID != 0x00000008); + BUILD_BUG_ON(LLOG_F_IS_FIXSIZE != 0x00000010); + BUILD_BUG_ON(LLOG_F_EXT_EXTRA_FLAGS != 0x00000020); + BUILD_BUG_ON(LLOG_F_EXT_X_UIDGID != 0x00000040); + BUILD_BUG_ON(LLOG_F_EXT_X_NID != 0x00000080); + BUILD_BUG_ON(LLOG_F_EXT_X_OMODE != 0x00000100); + BUILD_BUG_ON(LLOG_F_EXT_X_XATTR != 0x00000200); + + /* Checks for struct llogd_body */ + LASSERTF((int)sizeof(struct llogd_body) == 48, "found %lld\n", + (long long)(int)sizeof(struct llogd_body)); + LASSERTF((int)offsetof(struct llogd_body, lgd_logid) == 0, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_logid)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_logid)); + LASSERTF((int)offsetof(struct llogd_body, lgd_ctxt_idx) == 20, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_ctxt_idx)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx)); + LASSERTF((int)offsetof(struct llogd_body, lgd_llh_flags) == 24, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_llh_flags)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_llh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_llh_flags)); + 
LASSERTF((int)offsetof(struct llogd_body, lgd_index) == 28, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_index)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_index)); + LASSERTF((int)offsetof(struct llogd_body, lgd_saved_index) == 32, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_saved_index)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_saved_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_saved_index)); + LASSERTF((int)offsetof(struct llogd_body, lgd_len) == 36, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_len)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_len)); + LASSERTF((int)offsetof(struct llogd_body, lgd_cur_offset) == 40, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_cur_offset)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset)); + BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_CREATE != 501); + BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_NEXT_BLOCK != 502); + BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_READ_HEADER != 503); + BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_PREV_BLOCK != 508); + BUILD_BUG_ON(LLOG_FIRST_OPC != 501); + BUILD_BUG_ON(LLOG_LAST_OPC != 510); + BUILD_BUG_ON(LLOG_CONFIG_ORIG_CTXT != 0); + BUILD_BUG_ON(LLOG_CONFIG_REPL_CTXT != 1); + BUILD_BUG_ON(LLOG_MDS_OST_ORIG_CTXT != 2); + BUILD_BUG_ON(LLOG_MDS_OST_REPL_CTXT != 3); + BUILD_BUG_ON(LLOG_SIZE_ORIG_CTXT != 4); + BUILD_BUG_ON(LLOG_SIZE_REPL_CTXT != 5); + BUILD_BUG_ON(LLOG_TEST_ORIG_CTXT != 8); + BUILD_BUG_ON(LLOG_TEST_REPL_CTXT != 9); + BUILD_BUG_ON(LLOG_CHANGELOG_ORIG_CTXT != 12); + BUILD_BUG_ON(LLOG_CHANGELOG_REPL_CTXT != 13); + BUILD_BUG_ON(LLOG_CHANGELOG_USER_ORIG_CTXT != 14); + BUILD_BUG_ON(LLOG_AGENT_ORIG_CTXT != 15); + BUILD_BUG_ON(LLOG_UPDATELOG_ORIG_CTXT != 16); + BUILD_BUG_ON(LLOG_UPDATELOG_REPL_CTXT != 17); + BUILD_BUG_ON(LLOG_MAX_CTXTS != 18); + + /* Checks for struct llogd_conn_body */ + LASSERTF((int)sizeof(struct llogd_conn_body) == 40, "found %lld\n", + (long long)(int)sizeof(struct llogd_conn_body)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_gen) == 0, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_gen)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_logid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_logid)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx) == 36, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx)); + + /* Checks for struct ll_fiemap_info_key */ + LASSERTF((int)sizeof(struct ll_fiemap_info_key) == 248, "found %lld\n", + (long long)(int)sizeof(struct ll_fiemap_info_key)); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_name[8]) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_name[8])); + 
LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_name[8]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_name[8])); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_oa) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_oa)); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_oa) == 208, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_oa)); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_fiemap) == 216, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_fiemap)); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_fiemap) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_fiemap)); +#ifdef HAVE_SERVER_SUPPORT + + /* Checks for struct quota_body */ + LASSERTF((int)sizeof(struct quota_body) == 112, "found %lld\n", + (long long)(int)sizeof(struct quota_body)); + LASSERTF((int)offsetof(struct quota_body, qb_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_fid)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_fid)); + LASSERTF((int)offsetof(struct quota_body, qb_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_id)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_id) == 16, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_id)); + LASSERTF((int)offsetof(struct quota_body, qb_flags) == 32, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_flags)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_flags)); + LASSERTF((int)offsetof(struct quota_body, qb_padding) == 36, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_padding)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_padding)); + LASSERTF((int)offsetof(struct quota_body, qb_count) == 40, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_count)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_count) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_count)); + LASSERTF((int)offsetof(struct quota_body, qb_usage) == 48, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_usage)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_usage) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_usage)); + LASSERTF((int)offsetof(struct quota_body, qb_slv_ver) == 56, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_slv_ver)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_slv_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_slv_ver)); + LASSERTF((int)offsetof(struct quota_body, qb_lockh) == 64, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_lockh)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_lockh) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_lockh)); + LASSERTF((int)offsetof(struct quota_body, qb_glb_lockh) == 72, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_glb_lockh)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_glb_lockh) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_glb_lockh)); + LASSERTF((int)offsetof(struct quota_body, 
qb_padding1[4]) == 112, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_padding1[4])); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding1[4]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_padding1[4])); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct mgs_target_info */ + LASSERTF((int)sizeof(struct mgs_target_info) == 4544, "found %lld\n", + (long long)(int)sizeof(struct mgs_target_info)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_lustre_ver) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_lustre_ver)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_stripe_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_stripe_index)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_config_ver) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_config_ver)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_config_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_config_ver)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_flags)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_flags)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_nid_count) == 16, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_nid_count)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nid_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nid_count)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_instance) == 20, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_instance)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_instance)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_fsname) == 24, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_fsname)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_fsname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_fsname)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_svname) == 88, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_svname)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_svname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_svname)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_uuid) == 152, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_uuid)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_uuid)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_nids) == 192, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_nids)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nids) == 256, "found %lld\n", + (long 
long)(int)sizeof(((struct mgs_target_info *)0)->mti_nids)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_params) == 448, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_params)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_params) == 4096, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_params)); + + /* Checks for struct mgs_nidtbl_entry */ + LASSERTF((int)sizeof(struct mgs_nidtbl_entry) == 24, "found %lld\n", + (long long)(int)sizeof(struct mgs_nidtbl_entry)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_version)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_version)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_instance) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_instance)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_instance)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_index) == 12, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_index)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_index)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_length) == 16, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_length)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_length) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_length)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_type) == 20, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_type)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_type) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_type)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_nid_type) == 21, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_nid_type)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_type) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_type)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_nid_size) == 22, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_nid_size)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_size) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_size)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_nid_count) == 23, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_nid_count)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_count) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_count)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, u.nids[0]) == 24, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, u.nids[0])); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->u.nids[0]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->u.nids[0])); + + /* Checks for struct mgs_config_body */ + LASSERTF((int)sizeof(struct mgs_config_body) == 80, "found %lld\n", + (long long)(int)sizeof(struct mgs_config_body)); + 
LASSERTF((int)offsetof(struct mgs_config_body, mcb_name) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_name)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_name) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_name)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_offset) == 64, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_offset)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_offset)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_type) == 72, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_type)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_type)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_nm_cur_pass) == 74, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_nm_cur_pass)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_nm_cur_pass) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_nm_cur_pass)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_bits) == 75, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_bits)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_bits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_bits)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_units) == 76, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_units)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_units) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_units)); + BUILD_BUG_ON(MGS_CFG_T_CONFIG != 0); + BUILD_BUG_ON(MGS_CFG_T_SPTLRPC != 1); + BUILD_BUG_ON(MGS_CFG_T_RECOVER != 2); + BUILD_BUG_ON(MGS_CFG_T_PARAMS != 3); +#ifdef HAVE_SERVER_SUPPORT + BUILD_BUG_ON(MGS_CFG_T_NODEMAP != 4); + BUILD_BUG_ON(MGS_CFG_T_BARRIER != 5); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct mgs_config_res */ + LASSERTF((int)sizeof(struct mgs_config_res) == 16, "found %lld\n", + (long long)(int)sizeof(struct mgs_config_res)); + LASSERTF((int)offsetof(struct mgs_config_res, mcr_offset) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_res, mcr_offset)); + LASSERTF((int)sizeof(((struct mgs_config_res *)0)->mcr_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_res *)0)->mcr_offset)); + LASSERTF((int)offsetof(struct mgs_config_res, mcr_size) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_res, mcr_size)); + LASSERTF((int)sizeof(((struct mgs_config_res *)0)->mcr_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_res *)0)->mcr_size)); + + /* Checks for struct getinfo_fid2path */ + LASSERTF((int)sizeof(struct getinfo_fid2path) == 32, "found %lld\n", + (long long)(int)sizeof(struct getinfo_fid2path)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_fid)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_fid)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_recno) == 16, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_recno)); + LASSERTF((int)sizeof(((struct getinfo_fid2path 
*)0)->gf_recno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_recno)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_linkno) == 24, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_linkno)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno) == 4, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_pathlen) == 28, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_pathlen)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen)); +#ifdef HAVE_FID2PATH_ANON_UNIONS + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_path[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_path[0])); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0])); +#else + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_u.gf_path[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_u.gf_path[0])); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_u.gf_path[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_u.gf_path[0])); +#endif /* HAVE_FID2PATH_ANON_UNIONS */ + + /* Checks for struct fiemap */ + LASSERTF((int)sizeof(struct fiemap) == 32, "found %lld\n", + (long long)(int)sizeof(struct fiemap)); + LASSERTF((int)offsetof(struct fiemap, fm_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_start)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_start)); + LASSERTF((int)offsetof(struct fiemap, fm_length) == 8, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_length)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_length) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_length)); + LASSERTF((int)offsetof(struct fiemap, fm_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_flags)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_flags)); + LASSERTF((int)offsetof(struct fiemap, fm_mapped_extents) == 20, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_mapped_extents)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_mapped_extents) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_mapped_extents)); + LASSERTF((int)offsetof(struct fiemap, fm_extent_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_extent_count)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_extent_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_extent_count)); + LASSERTF((int)offsetof(struct fiemap, fm_reserved) == 28, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_reserved)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_reserved) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_reserved)); + LASSERTF((int)offsetof(struct fiemap, fm_extents) == 32, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_extents)); + BUILD_BUG_ON(offsetof(struct fiemap, fm_extents) != sizeof(struct fiemap)); + BUILD_BUG_ON(FIEMAP_FLAG_SYNC != 0x00000001); + BUILD_BUG_ON(FIEMAP_FLAG_XATTR != 0x00000002); + 
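The BUILD_BUG_ON() lines above serve a related purpose: they pin the numeric value of each wire flag at compile time, so a reordered or renumbered flag breaks the build rather than the protocol. A minimal sketch of the same idea using C11 static_assert is shown below; the DEMO_FLAG_* macros are hypothetical and stand in for the real flag definitions.

	#include <assert.h>

	/* Hypothetical wire flags; the numeric values are the contract. */
	#define DEMO_FLAG_SYNC		0x00000001
	#define DEMO_FLAG_XATTR		0x00000002
	#define DEMO_FLAG_DEVICE_ORDER	0x40000000

	/* Fails the build, rather than a test run, if a value drifts. */
	static_assert(DEMO_FLAG_SYNC == 0x00000001, "DEMO_FLAG_SYNC changed");
	static_assert(DEMO_FLAG_XATTR == 0x00000002, "DEMO_FLAG_XATTR changed");
	static_assert(DEMO_FLAG_DEVICE_ORDER == 0x40000000,
		      "DEMO_FLAG_DEVICE_ORDER changed");

	int main(void)
	{
		return 0;
	}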
BUILD_BUG_ON(FIEMAP_FLAG_DEVICE_ORDER != 0x40000000); + + /* Checks for struct fiemap_extent */ + LASSERTF((int)sizeof(struct fiemap_extent) == 56, "found %lld\n", + (long long)(int)sizeof(struct fiemap_extent)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_logical) == 0, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_logical)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_logical) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_logical)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_physical) == 8, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_physical)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_physical) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_physical)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_length) == 16, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_length)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_length) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_length)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_flags)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_flags)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_reserved[0]) == 44, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_reserved[0])); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_reserved[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_reserved[0])); + BUILD_BUG_ON(FIEMAP_EXTENT_LAST != 0x00000001); + BUILD_BUG_ON(FIEMAP_EXTENT_UNKNOWN != 0x00000002); + BUILD_BUG_ON(FIEMAP_EXTENT_DELALLOC != 0x00000004); + BUILD_BUG_ON(FIEMAP_EXTENT_ENCODED != 0x00000008); + BUILD_BUG_ON(FIEMAP_EXTENT_DATA_ENCRYPTED != 0x00000080); + BUILD_BUG_ON(FIEMAP_EXTENT_NOT_ALIGNED != 0x00000100); + BUILD_BUG_ON(FIEMAP_EXTENT_DATA_INLINE != 0x00000200); + BUILD_BUG_ON(FIEMAP_EXTENT_DATA_TAIL != 0x00000400); + BUILD_BUG_ON(FIEMAP_EXTENT_UNWRITTEN != 0x00000800); + BUILD_BUG_ON(FIEMAP_EXTENT_MERGED != 0x00001000); + BUILD_BUG_ON(FIEMAP_EXTENT_SHARED != 0x00002000); + BUILD_BUG_ON(FIEMAP_EXTENT_NET != 0x80000000); + +#ifdef CONFIG_FS_POSIX_ACL + /* Checks for type posix_acl_xattr_entry */ + LASSERTF((int)sizeof(posix_acl_xattr_entry) == 8, "found %lld\n", + (long long)(int)sizeof(posix_acl_xattr_entry)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_tag) == 0, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_tag)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_tag) == 2, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_tag)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_perm) == 2, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_perm)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_perm) == 2, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_perm)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_id) == 4, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_id)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_id) == 4, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_id)); +#endif /* CONFIG_FS_POSIX_ACL */ + +#ifdef CONFIG_FS_POSIX_ACL + /* Checks for type posix_acl_xattr_header */ + 
LASSERTF((int)sizeof(posix_acl_xattr_header) == 4, "found %lld\n", + (long long)(int)sizeof(posix_acl_xattr_header)); + LASSERTF((int)offsetof(posix_acl_xattr_header, a_version) == 0, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_header, a_version)); + LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_version) == 4, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_version)); +#ifndef HAVE_STRUCT_POSIX_ACL_XATTR + LASSERTF((int)offsetof(posix_acl_xattr_header, a_entries) == 4, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_header, a_entries)); + LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_entries) == 0, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_entries)); +#endif /* HAVE_STRUCT_POSIX_ACL_XATTR */ +#endif /* CONFIG_FS_POSIX_ACL */ + + /* Checks for struct link_ea_header */ + LASSERTF((int)sizeof(struct link_ea_header) == 24, "found %lld\n", + (long long)(int)sizeof(struct link_ea_header)); + LASSERTF((int)offsetof(struct link_ea_header, leh_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_magic)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_magic)); + LASSERTF((int)offsetof(struct link_ea_header, leh_reccount) == 4, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_reccount)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_reccount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_reccount)); + LASSERTF((int)offsetof(struct link_ea_header, leh_len) == 8, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_len)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_len) == 8, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_len)); + LASSERTF((int)offsetof(struct link_ea_header, leh_overflow_time) == 16, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_overflow_time)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_overflow_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_overflow_time)); + LASSERTF((int)offsetof(struct link_ea_header, leh_padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_padding)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_padding)); + BUILD_BUG_ON(LINK_EA_MAGIC != 0x11EAF1DFUL); + + /* Checks for struct link_ea_entry */ + LASSERTF((int)sizeof(struct link_ea_entry) == 18, "found %lld\n", + (long long)(int)sizeof(struct link_ea_entry)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_reclen) == 0, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_reclen)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_reclen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_reclen)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_parent_fid) == 2, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_parent_fid)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_name) == 18, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_name)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_name) 
== 0, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_name)); + + /* Checks for struct layout_intent */ + LASSERTF((int)sizeof(struct layout_intent) == 24, "found %lld\n", + (long long)(int)sizeof(struct layout_intent)); + LASSERTF((int)offsetof(struct layout_intent, li_opc) == 0, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_opc)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_opc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_opc)); + LASSERTF((int)offsetof(struct layout_intent, li_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_flags)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_flags)); + LASSERTF((int)offsetof(struct layout_intent, li_extent) == 8, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_extent)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_extent)); + LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n", + (long long)LAYOUT_INTENT_ACCESS); + LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n", + (long long)LAYOUT_INTENT_READ); + LASSERTF(LAYOUT_INTENT_WRITE == 2, "found %lld\n", + (long long)LAYOUT_INTENT_WRITE); + LASSERTF(LAYOUT_INTENT_GLIMPSE == 3, "found %lld\n", + (long long)LAYOUT_INTENT_GLIMPSE); + LASSERTF(LAYOUT_INTENT_TRUNC == 4, "found %lld\n", + (long long)LAYOUT_INTENT_TRUNC); + LASSERTF(LAYOUT_INTENT_RELEASE == 5, "found %lld\n", + (long long)LAYOUT_INTENT_RELEASE); + LASSERTF(LAYOUT_INTENT_RESTORE == 6, "found %lld\n", + (long long)LAYOUT_INTENT_RESTORE); + + /* Checks for struct hsm_action_item */ + LASSERTF((int)sizeof(struct hsm_action_item) == 72, "found %lld\n", + (long long)(int)sizeof(struct hsm_action_item)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_len)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_len)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_action) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_action)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_action)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_fid) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_fid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_fid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_dfid) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_dfid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_dfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_dfid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_extent) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_extent)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_extent)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_cookie) == 56, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, 
hai_cookie)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_cookie)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_gid) == 64, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_gid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_gid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_data) == 72, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_data)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_data) == 0, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_data)); + + /* Checks for struct hsm_action_list */ + LASSERTF((int)sizeof(struct hsm_action_list) == 32, "found %lld\n", + (long long)(int)sizeof(struct hsm_action_list)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_version)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_version)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_count)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_count)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_compound_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_compound_id)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_compound_id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_compound_id)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_flags)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_flags)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_archive_id) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_archive_id)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_archive_id)); + LASSERTF((int)offsetof(struct hsm_action_list, padding1) == 28, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, padding1)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->padding1)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_fsname) == 32, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_fsname)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_fsname) == 0, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_fsname)); + + /* Checks for struct hsm_progress */ + LASSERTF((int)sizeof(struct hsm_progress) == 48, "found %lld\n", + (long long)(int)sizeof(struct hsm_progress)); + LASSERTF((int)offsetof(struct hsm_progress, hp_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_fid)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_fid)); + 
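Several of the checks in this file assert a member size of 0 (for example hai_data and hal_fsname above): those members are zero-length or flexible-array tails that mark where the fixed header ends and a variable-length payload begins, with the real length carried in a separate field. The sketch below (hypothetical struct demo_var_rec and helper, not part of the patch) shows the allocation pattern such records imply.

	#include <stddef.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>

	struct demo_var_rec {
		uint32_t dv_len;	/* total record length in bytes */
		uint32_t dv_type;
		char     dv_data[];	/* flexible tail; reported as size 0 */
	};

	static struct demo_var_rec *demo_var_rec_alloc(const char *payload)
	{
		size_t plen = strlen(payload) + 1;
		struct demo_var_rec *rec = malloc(sizeof(*rec) + plen);

		if (!rec)
			return NULL;
		rec->dv_len = (uint32_t)(sizeof(*rec) + plen);
		rec->dv_type = 1;
		memcpy(rec->dv_data, payload, plen);
		return rec;
	}

	int main(void)
	{
		struct demo_var_rec *rec = demo_var_rec_alloc("hello");

		if (!rec)
			return 1;
		/* The tail starts exactly where the fixed header ends, which
		 * is what the "== 0" size checks and the matching trailing
		 * offset checks in the generated code verify. */
		free(rec);
		return 0;
	}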
LASSERTF((int)offsetof(struct hsm_progress, hp_cookie) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_cookie)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_cookie)); + LASSERTF((int)offsetof(struct hsm_progress, hp_extent) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_extent)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_extent)); + LASSERTF((int)offsetof(struct hsm_progress, hp_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_flags)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_flags)); + LASSERTF((int)offsetof(struct hsm_progress, hp_errval) == 42, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_errval)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_errval) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_errval)); + LASSERTF((int)offsetof(struct hsm_progress, padding) == 44, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, padding)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->padding)); + LASSERTF(HP_FLAG_COMPLETED == 0x01, "found 0x%.8x\n", + HP_FLAG_COMPLETED); + LASSERTF(HP_FLAG_RETRY == 0x02, "found 0x%.8x\n", + HP_FLAG_RETRY); + + LASSERTF((int)offsetof(struct hsm_copy, hc_data_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_data_version)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_data_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_data_version)); + LASSERTF((int)offsetof(struct hsm_copy, hc_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_flags)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_flags)); + LASSERTF((int)offsetof(struct hsm_copy, hc_errval) == 10, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_errval)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_errval) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_errval)); + LASSERTF((int)offsetof(struct hsm_copy, padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, padding)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->padding)); + LASSERTF((int)offsetof(struct hsm_copy, hc_hai) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_hai)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_hai) == 72, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_hai)); + + /* Checks for struct hsm_progress_kernel */ + LASSERTF((int)sizeof(struct hsm_progress_kernel) == 64, "found %lld\n", + (long long)(int)sizeof(struct hsm_progress_kernel)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_fid)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_cookie) == 16, "found %lld\n", + (long 
long)(int)offsetof(struct hsm_progress_kernel, hpk_cookie)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_extent) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_extent)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_flags)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_errval) == 42, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_errval)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding1) == 44, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding1)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_data_version) == 48, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_data_version)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding2) == 56, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding2)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2)); + + /* Checks for struct hsm_user_item */ + LASSERTF((int)sizeof(struct hsm_user_item) == 32, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_item)); + LASSERTF((int)offsetof(struct hsm_user_item, hui_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_item, hui_fid)); + LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_fid)); + LASSERTF((int)offsetof(struct hsm_user_item, hui_extent) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_item, hui_extent)); + LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_extent)); + + /* Checks for struct hsm_user_state */ + LASSERTF((int)sizeof(struct hsm_user_state) == 32, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_state)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_states) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_states)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_states) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_states)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_archive_id) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, 
hus_archive_id)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_archive_id)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_state) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_state)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_action) == 12, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_action)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_location) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_location)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location)); + + /* Checks for struct hsm_state_set */ + LASSERTF((int)sizeof(struct hsm_state_set) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_state_set)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_valid)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_valid)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_archive_id) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_archive_id)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_archive_id)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_setmask) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_setmask)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_setmask) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_setmask)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_clearmask) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_clearmask)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_clearmask) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_clearmask)); + BUILD_BUG_ON(HSS_SETMASK != 1); + BUILD_BUG_ON(HSS_CLEARMASK != 2); + BUILD_BUG_ON(HSS_ARCHIVE_ID != 4); + + /* Checks for struct hsm_current_action */ + LASSERTF((int)sizeof(struct hsm_current_action) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_current_action)); + LASSERTF((int)offsetof(struct hsm_current_action, hca_state) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_current_action, hca_state)); + LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_state) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_state)); + LASSERTF((int)offsetof(struct hsm_current_action, hca_action) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_current_action, hca_action)); + LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_action)); + 
LASSERTF((int)offsetof(struct hsm_current_action, hca_location) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_current_action, hca_location)); + LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_location) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_location)); + BUILD_BUG_ON(HPS_NONE != 0); + BUILD_BUG_ON(HPS_WAITING != 1); + BUILD_BUG_ON(HPS_RUNNING != 2); + BUILD_BUG_ON(HPS_DONE != 3); + BUILD_BUG_ON(HUA_NONE != 1); + BUILD_BUG_ON(HUA_ARCHIVE != 10); + BUILD_BUG_ON(HUA_RESTORE != 11); + BUILD_BUG_ON(HUA_RELEASE != 12); + BUILD_BUG_ON(HUA_REMOVE != 13); + BUILD_BUG_ON(HUA_CANCEL != 14); + + /* Checks for struct hsm_request */ + LASSERTF((int)sizeof(struct hsm_request) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_request)); + LASSERTF((int)offsetof(struct hsm_request, hr_action) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_action)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_action)); + LASSERTF((int)offsetof(struct hsm_request, hr_archive_id) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_archive_id)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_archive_id)); + LASSERTF((int)offsetof(struct hsm_request, hr_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_flags)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_flags)); + LASSERTF((int)offsetof(struct hsm_request, hr_itemcount) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_itemcount)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_itemcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_itemcount)); + LASSERTF((int)offsetof(struct hsm_request, hr_data_len) == 20, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_data_len)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_data_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_data_len)); + LASSERTF(HSM_FORCE_ACTION == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)HSM_FORCE_ACTION); + LASSERTF(HSM_GHOST_COPY == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)HSM_GHOST_COPY); + + /* Checks for struct hsm_user_request */ + LASSERTF((int)sizeof(struct hsm_user_request) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_request)); + LASSERTF((int)offsetof(struct hsm_user_request, hur_request) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_request, hur_request)); + LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_request) == 24, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_request)); + LASSERTF((int)offsetof(struct hsm_user_request, hur_user_item) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_request, hur_user_item)); + LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_user_item) == 0, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_user_item)); + + /* Checks for struct hsm_user_import */ + LASSERTF((int)sizeof(struct hsm_user_import) == 48, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_import)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct 
hsm_user_import, hui_size)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_size)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_uid) == 32, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_uid)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_uid)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_gid) == 36, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_gid)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_gid)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_mode) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_mode)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_mode)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_atime) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_atime)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_atime)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_atime_ns) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_atime_ns)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_atime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_atime_ns)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_mtime) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_mtime)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_mtime)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_mtime_ns) == 28, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_mtime_ns)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_mtime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_mtime_ns)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_archive_id) == 44, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_archive_id)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_archive_id)); + + /* Checks for struct netobj_s */ + LASSERTF((int)sizeof(struct netobj_s) == 4, "found %lld\n", + (long long)(int)sizeof(struct netobj_s)); + LASSERTF((int)offsetof(struct netobj_s, len) == 0, "found %lld\n", + (long long)(int)offsetof(struct netobj_s, len)); + LASSERTF((int)sizeof(((struct netobj_s *)0)->len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct netobj_s *)0)->len)); + LASSERTF((int)offsetof(struct netobj_s, data) == 4, "found %lld\n", + (long long)(int)offsetof(struct netobj_s, data)); + LASSERTF((int)sizeof(((struct netobj_s *)0)->data) == 0, "found %lld\n", + (long long)(int)sizeof(((struct netobj_s *)0)->data)); + + /* Checks for struct rawobj_s */ + LASSERTF((int)sizeof(struct rawobj_s) == 16, "found %lld\n", + (long long)(int)sizeof(struct rawobj_s)); + LASSERTF((int)offsetof(struct rawobj_s, len) == 0, "found %lld\n", + (long long)(int)offsetof(struct rawobj_s, len)); + 
LASSERTF((int)sizeof(((struct rawobj_s *)0)->len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct rawobj_s *)0)->len)); + LASSERTF((int)offsetof(struct rawobj_s, data) == 8, "found %lld\n", + (long long)(int)offsetof(struct rawobj_s, data)); + LASSERTF((int)sizeof(((struct rawobj_s *)0)->data) == 8, "found %lld\n", + (long long)(int)sizeof(((struct rawobj_s *)0)->data)); + + /* Checks for struct gss_header */ + LASSERTF((int)sizeof(struct gss_header) == 36, "found %lld\n", + (long long)(int)sizeof(struct gss_header)); + LASSERTF((int)offsetof(struct gss_header, gh_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct gss_header, gh_version)); + LASSERTF((int)sizeof(((struct gss_header *)0)->gh_version) == 1, "found %lld\n", + (long long)(int)sizeof(((struct gss_header *)0)->gh_version)); + LASSERTF((int)offsetof(struct gss_header, gh_sp) == 1, "found %lld\n", + (long long)(int)offsetof(struct gss_header, gh_sp)); + LASSERTF((int)sizeof(((struct gss_header *)0)->gh_sp) == 1, "found %lld\n", + (long long)(int)sizeof(((struct gss_header *)0)->gh_sp)); + LASSERTF((int)offsetof(struct gss_header, gh_pad0) == 2, "found %lld\n", + (long long)(int)offsetof(struct gss_header, gh_pad0)); + LASSERTF((int)sizeof(((struct gss_header *)0)->gh_pad0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct gss_header *)0)->gh_pad0)); + LASSERTF((int)offsetof(struct gss_header, gh_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct gss_header, gh_flags)); + LASSERTF((int)sizeof(((struct gss_header *)0)->gh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_header *)0)->gh_flags)); + LASSERTF((int)offsetof(struct gss_header, gh_proc) == 8, "found %lld\n", + (long long)(int)offsetof(struct gss_header, gh_proc)); + LASSERTF((int)sizeof(((struct gss_header *)0)->gh_proc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_header *)0)->gh_proc)); + LASSERTF((int)offsetof(struct gss_header, gh_seq) == 12, "found %lld\n", + (long long)(int)offsetof(struct gss_header, gh_seq)); + LASSERTF((int)sizeof(((struct gss_header *)0)->gh_seq) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_header *)0)->gh_seq)); + LASSERTF((int)offsetof(struct gss_header, gh_svc) == 16, "found %lld\n", + (long long)(int)offsetof(struct gss_header, gh_svc)); + LASSERTF((int)sizeof(((struct gss_header *)0)->gh_svc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_header *)0)->gh_svc)); + LASSERTF((int)offsetof(struct gss_header, gh_pad1) == 20, "found %lld\n", + (long long)(int)offsetof(struct gss_header, gh_pad1)); + LASSERTF((int)sizeof(((struct gss_header *)0)->gh_pad1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_header *)0)->gh_pad1)); + LASSERTF((int)offsetof(struct gss_header, gh_pad2) == 24, "found %lld\n", + (long long)(int)offsetof(struct gss_header, gh_pad2)); + LASSERTF((int)sizeof(((struct gss_header *)0)->gh_pad2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_header *)0)->gh_pad2)); + LASSERTF((int)offsetof(struct gss_header, gh_pad3) == 28, "found %lld\n", + (long long)(int)offsetof(struct gss_header, gh_pad3)); + LASSERTF((int)sizeof(((struct gss_header *)0)->gh_pad3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_header *)0)->gh_pad3)); + LASSERTF((int)offsetof(struct gss_header, gh_handle) == 32, "found %lld\n", + (long long)(int)offsetof(struct gss_header, gh_handle)); + LASSERTF((int)sizeof(((struct gss_header *)0)->gh_handle) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_header 
*)0)->gh_handle)); + + /* Checks for struct gss_rep_header */ + LASSERTF((int)sizeof(struct gss_rep_header) == 36, "found %lld\n", + (long long)(int)sizeof(struct gss_rep_header)); + LASSERTF((int)offsetof(struct gss_rep_header, gh_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct gss_rep_header, gh_version)); + LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_version) == 1, "found %lld\n", + (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_version)); + LASSERTF((int)offsetof(struct gss_rep_header, gh_sp) == 1, "found %lld\n", + (long long)(int)offsetof(struct gss_rep_header, gh_sp)); + LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_sp) == 1, "found %lld\n", + (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_sp)); + LASSERTF((int)offsetof(struct gss_rep_header, gh_pad0) == 2, "found %lld\n", + (long long)(int)offsetof(struct gss_rep_header, gh_pad0)); + LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_pad0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_pad0)); + LASSERTF((int)offsetof(struct gss_rep_header, gh_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct gss_rep_header, gh_flags)); + LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_flags)); + LASSERTF((int)offsetof(struct gss_rep_header, gh_proc) == 8, "found %lld\n", + (long long)(int)offsetof(struct gss_rep_header, gh_proc)); + LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_proc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_proc)); + LASSERTF((int)offsetof(struct gss_rep_header, gh_major) == 12, "found %lld\n", + (long long)(int)offsetof(struct gss_rep_header, gh_major)); + LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_major) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_major)); + LASSERTF((int)offsetof(struct gss_rep_header, gh_minor) == 16, "found %lld\n", + (long long)(int)offsetof(struct gss_rep_header, gh_minor)); + LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_minor) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_minor)); + LASSERTF((int)offsetof(struct gss_rep_header, gh_seqwin) == 20, "found %lld\n", + (long long)(int)offsetof(struct gss_rep_header, gh_seqwin)); + LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_seqwin) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_seqwin)); + LASSERTF((int)offsetof(struct gss_rep_header, gh_pad2) == 24, "found %lld\n", + (long long)(int)offsetof(struct gss_rep_header, gh_pad2)); + LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_pad2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_pad2)); + LASSERTF((int)offsetof(struct gss_rep_header, gh_pad3) == 28, "found %lld\n", + (long long)(int)offsetof(struct gss_rep_header, gh_pad3)); + LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_pad3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_pad3)); + LASSERTF((int)offsetof(struct gss_rep_header, gh_handle) == 32, "found %lld\n", + (long long)(int)offsetof(struct gss_rep_header, gh_handle)); + LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_handle) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_handle)); + + /* Checks for struct gss_err_header */ + LASSERTF((int)sizeof(struct gss_err_header) == 36, "found %lld\n", + (long long)(int)sizeof(struct 
gss_err_header)); + LASSERTF((int)offsetof(struct gss_err_header, gh_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct gss_err_header, gh_version)); + LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_version) == 1, "found %lld\n", + (long long)(int)sizeof(((struct gss_err_header *)0)->gh_version)); + LASSERTF((int)offsetof(struct gss_err_header, gh_sp) == 1, "found %lld\n", + (long long)(int)offsetof(struct gss_err_header, gh_sp)); + LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_sp) == 1, "found %lld\n", + (long long)(int)sizeof(((struct gss_err_header *)0)->gh_sp)); + LASSERTF((int)offsetof(struct gss_err_header, gh_pad0) == 2, "found %lld\n", + (long long)(int)offsetof(struct gss_err_header, gh_pad0)); + LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_pad0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct gss_err_header *)0)->gh_pad0)); + LASSERTF((int)offsetof(struct gss_err_header, gh_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct gss_err_header, gh_flags)); + LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_err_header *)0)->gh_flags)); + LASSERTF((int)offsetof(struct gss_err_header, gh_proc) == 8, "found %lld\n", + (long long)(int)offsetof(struct gss_err_header, gh_proc)); + LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_proc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_err_header *)0)->gh_proc)); + LASSERTF((int)offsetof(struct gss_err_header, gh_major) == 12, "found %lld\n", + (long long)(int)offsetof(struct gss_err_header, gh_major)); + LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_major) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_err_header *)0)->gh_major)); + LASSERTF((int)offsetof(struct gss_err_header, gh_minor) == 16, "found %lld\n", + (long long)(int)offsetof(struct gss_err_header, gh_minor)); + LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_minor) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_err_header *)0)->gh_minor)); + LASSERTF((int)offsetof(struct gss_err_header, gh_pad1) == 20, "found %lld\n", + (long long)(int)offsetof(struct gss_err_header, gh_pad1)); + LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_pad1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_err_header *)0)->gh_pad1)); + LASSERTF((int)offsetof(struct gss_err_header, gh_pad2) == 24, "found %lld\n", + (long long)(int)offsetof(struct gss_err_header, gh_pad2)); + LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_pad2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_err_header *)0)->gh_pad2)); + LASSERTF((int)offsetof(struct gss_err_header, gh_pad3) == 28, "found %lld\n", + (long long)(int)offsetof(struct gss_err_header, gh_pad3)); + LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_pad3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_err_header *)0)->gh_pad3)); + LASSERTF((int)offsetof(struct gss_err_header, gh_handle) == 32, "found %lld\n", + (long long)(int)offsetof(struct gss_err_header, gh_handle)); + LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_handle) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_err_header *)0)->gh_handle)); + + /* Checks for struct gss_wire_ctx */ + LASSERTF((int)sizeof(struct gss_wire_ctx) == 32, "found %lld\n", + (long long)(int)sizeof(struct gss_wire_ctx)); + LASSERTF((int)offsetof(struct gss_wire_ctx, gw_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct gss_wire_ctx, gw_flags)); + 
LASSERTF((int)sizeof(((struct gss_wire_ctx *)0)->gw_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_wire_ctx *)0)->gw_flags)); + LASSERTF((int)offsetof(struct gss_wire_ctx, gw_proc) == 4, "found %lld\n", + (long long)(int)offsetof(struct gss_wire_ctx, gw_proc)); + LASSERTF((int)sizeof(((struct gss_wire_ctx *)0)->gw_proc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_wire_ctx *)0)->gw_proc)); + LASSERTF((int)offsetof(struct gss_wire_ctx, gw_seq) == 8, "found %lld\n", + (long long)(int)offsetof(struct gss_wire_ctx, gw_seq)); + LASSERTF((int)sizeof(((struct gss_wire_ctx *)0)->gw_seq) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_wire_ctx *)0)->gw_seq)); + LASSERTF((int)offsetof(struct gss_wire_ctx, gw_svc) == 12, "found %lld\n", + (long long)(int)offsetof(struct gss_wire_ctx, gw_svc)); + LASSERTF((int)sizeof(((struct gss_wire_ctx *)0)->gw_svc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct gss_wire_ctx *)0)->gw_svc)); + LASSERTF((int)offsetof(struct gss_wire_ctx, gw_handle) == 16, "found %lld\n", + (long long)(int)offsetof(struct gss_wire_ctx, gw_handle)); + LASSERTF((int)sizeof(((struct gss_wire_ctx *)0)->gw_handle) == 16, "found %lld\n", + (long long)(int)sizeof(((struct gss_wire_ctx *)0)->gw_handle)); + +#ifdef HAVE_SERVER_SUPPORT + + /* Checks for struct object_update_param */ + LASSERTF((int)sizeof(struct object_update_param) == 8, "found %lld\n", + (long long)(int)sizeof(struct object_update_param)); + LASSERTF((int)offsetof(struct object_update_param, oup_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update_param, oup_len)); + LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_len) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_param *)0)->oup_len)); + LASSERTF((int)offsetof(struct object_update_param, oup_padding) == 2, "found %lld\n", + (long long)(int)offsetof(struct object_update_param, oup_padding)); + LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_padding) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_param *)0)->oup_padding)); + LASSERTF((int)offsetof(struct object_update_param, oup_padding2) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update_param, oup_padding2)); + LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_padding2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update_param *)0)->oup_padding2)); + LASSERTF((int)offsetof(struct object_update_param, oup_buf) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update_param, oup_buf)); + LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_buf) == 0, "found %lld\n", + (long long)(int)sizeof(((struct object_update_param *)0)->oup_buf)); + + /* Checks for struct object_update */ + LASSERTF((int)sizeof(struct object_update) == 40, "found %lld\n", + (long long)(int)sizeof(struct object_update)); + LASSERTF((int)offsetof(struct object_update, ou_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_type)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_type)); + LASSERTF((int)offsetof(struct object_update, ou_params_count) == 2, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_params_count)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_params_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_params_count)); + LASSERTF((int)offsetof(struct 
object_update, ou_result_size) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_result_size)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_result_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_result_size)); + LASSERTF((int)offsetof(struct object_update, ou_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_flags)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_flags)); + LASSERTF((int)offsetof(struct object_update, ou_padding1) == 12, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_padding1)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_padding1)); + LASSERTF((int)offsetof(struct object_update, ou_batchid) == 16, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_batchid)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_batchid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_batchid)); + LASSERTF((int)offsetof(struct object_update, ou_fid) == 24, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_fid)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_fid)); + LASSERTF((int)offsetof(struct object_update, ou_params) == 40, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_params)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_params) == 0, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_params)); + BUILD_BUG_ON(UPDATE_FL_OST != 0x00000001); + BUILD_BUG_ON(UPDATE_FL_SYNC != 0x00000002); + BUILD_BUG_ON(UPDATE_FL_COMMITTED != 0x00000004); + BUILD_BUG_ON(UPDATE_FL_NOLOG != 0x00000008); + + /* Checks for struct object_update_request */ + LASSERTF((int)sizeof(struct object_update_request) == 8, "found %lld\n", + (long long)(int)sizeof(struct object_update_request)); + LASSERTF((int)offsetof(struct object_update_request, ourq_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update_request, ourq_magic)); + LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update_request *)0)->ourq_magic)); + LASSERTF((int)offsetof(struct object_update_request, ourq_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update_request, ourq_count)); + LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_request *)0)->ourq_count)); + LASSERTF((int)offsetof(struct object_update_request, ourq_padding) == 6, "found %lld\n", + (long long)(int)offsetof(struct object_update_request, ourq_padding)); + LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_padding) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_request *)0)->ourq_padding)); + LASSERTF((int)offsetof(struct object_update_request, ourq_updates) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update_request, ourq_updates)); + LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_updates) == 0, "found %lld\n", + (long long)(int)sizeof(((struct object_update_request *)0)->ourq_updates)); + BUILD_BUG_ON(UPDATE_REQUEST_MAGIC != 0xBDDE0002); + + /* Checks for struct 
object_update_result */ + LASSERTF((int)sizeof(struct object_update_result) == 8, "found %lld\n", + (long long)(int)sizeof(struct object_update_result)); + LASSERTF((int)offsetof(struct object_update_result, our_rc) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update_result, our_rc)); + LASSERTF((int)sizeof(((struct object_update_result *)0)->our_rc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update_result *)0)->our_rc)); + LASSERTF((int)offsetof(struct object_update_result, our_datalen) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update_result, our_datalen)); + LASSERTF((int)sizeof(((struct object_update_result *)0)->our_datalen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_result *)0)->our_datalen)); + LASSERTF((int)offsetof(struct object_update_result, our_padding) == 6, "found %lld\n", + (long long)(int)offsetof(struct object_update_result, our_padding)); + LASSERTF((int)sizeof(((struct object_update_result *)0)->our_padding) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_result *)0)->our_padding)); + LASSERTF((int)offsetof(struct object_update_result, our_data) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update_result, our_data)); + LASSERTF((int)sizeof(((struct object_update_result *)0)->our_data) == 0, "found %lld\n", + (long long)(int)sizeof(((struct object_update_result *)0)->our_data)); + + /* Checks for struct object_update_reply */ + LASSERTF((int)sizeof(struct object_update_reply) == 8, "found %lld\n", + (long long)(int)sizeof(struct object_update_reply)); + LASSERTF((int)offsetof(struct object_update_reply, ourp_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update_reply, ourp_magic)); + LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_magic)); + LASSERTF((int)offsetof(struct object_update_reply, ourp_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update_reply, ourp_count)); + LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_count)); + LASSERTF((int)offsetof(struct object_update_reply, ourp_padding) == 6, "found %lld\n", + (long long)(int)offsetof(struct object_update_reply, ourp_padding)); + LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_padding) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_padding)); + LASSERTF((int)offsetof(struct object_update_reply, ourp_lens) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update_reply, ourp_lens)); + LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_lens) == 0, "found %lld\n", + (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_lens)); + BUILD_BUG_ON(UPDATE_REPLY_MAGIC != 0x00BD0002); + + /* Checks for struct out_update_header */ + LASSERTF((int)sizeof(struct out_update_header) == 16, "found %lld\n", + (long long)(int)sizeof(struct out_update_header)); + LASSERTF((int)offsetof(struct out_update_header, ouh_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct out_update_header, ouh_magic)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_magic)); + LASSERTF((int)offsetof(struct out_update_header, ouh_count) == 4, "found %lld\n", + (long 
long)(int)offsetof(struct out_update_header, ouh_count)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_count)); + LASSERTF((int)offsetof(struct out_update_header, ouh_inline_length) == 8, "found %lld\n", + (long long)(int)offsetof(struct out_update_header, ouh_inline_length)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_inline_length) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_inline_length)); + LASSERTF((int)offsetof(struct out_update_header, ouh_reply_size) == 12, "found %lld\n", + (long long)(int)offsetof(struct out_update_header, ouh_reply_size)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_reply_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_reply_size)); + LASSERTF((int)offsetof(struct out_update_header, ouh_inline_data) == 16, "found %lld\n", + (long long)(int)offsetof(struct out_update_header, ouh_inline_data)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_inline_data) == 0, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_inline_data)); + BUILD_BUG_ON(OUT_UPDATE_HEADER_MAGIC != 0xBDDF0001); + BUILD_BUG_ON(OUT_UPDATE_MAX_INLINE_SIZE != 4096); + + /* Checks for struct out_update_buffer */ + LASSERTF((int)sizeof(struct out_update_buffer) == 8, "found %lld\n", + (long long)(int)sizeof(struct out_update_buffer)); + LASSERTF((int)offsetof(struct out_update_buffer, oub_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct out_update_buffer, oub_size)); + LASSERTF((int)sizeof(((struct out_update_buffer *)0)->oub_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_buffer *)0)->oub_size)); + LASSERTF((int)offsetof(struct out_update_buffer, oub_padding) == 4, "found %lld\n", + (long long)(int)offsetof(struct out_update_buffer, oub_padding)); + LASSERTF((int)sizeof(((struct out_update_buffer *)0)->oub_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_buffer *)0)->oub_padding)); + + /* Checks for struct nodemap_cluster_rec */ + LASSERTF((int)sizeof(struct nodemap_cluster_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct nodemap_cluster_rec)); + BUILD_BUG_ON(LUSTRE_NODEMAP_NAME_LENGTH != 16); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_name[16 + 1]) == 17, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_name[16 + 1])); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_name[16 + 1]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_name[16 + 1])); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_flags) == 17, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_flags)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_flags) == 1, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_flags)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_padding1) == 18, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_padding1)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_padding1) == 2, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_padding1)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_squash_projid) == 20, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_squash_projid)); + LASSERTF((int)sizeof(((struct 
nodemap_cluster_rec *)0)->ncr_squash_projid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_projid)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_squash_uid) == 24, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_squash_uid)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_uid)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_squash_gid) == 28, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_squash_gid)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_gid)); + + /* Checks for struct nodemap_range_rec */ + LASSERTF((int)sizeof(struct nodemap_range_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct nodemap_range_rec)); + LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_start_nid) == 0, "found %lld\n", + (long long)(int)offsetof(struct nodemap_range_rec, nrr_start_nid)); + LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_start_nid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_start_nid)); + LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_end_nid) == 8, "found %lld\n", + (long long)(int)offsetof(struct nodemap_range_rec, nrr_end_nid)); + LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_end_nid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_end_nid)); + LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_padding1) == 16, "found %lld\n", + (long long)(int)offsetof(struct nodemap_range_rec, nrr_padding1)); + LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding1)); + LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_padding2) == 24, "found %lld\n", + (long long)(int)offsetof(struct nodemap_range_rec, nrr_padding2)); + LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding2)); + + /* Checks for struct nodemap_id_rec */ + LASSERTF((int)sizeof(struct nodemap_id_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct nodemap_id_rec)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_id_fs) == 0, "found %lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_id_fs)); + LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_id_fs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_id_fs)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding1) == 4, "found %lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_padding1)); + LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding1)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding2) == 8, "found %lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_padding2)); + LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding2)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding3) == 16, "found %lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_padding3)); + LASSERTF((int)sizeof(((struct 
nodemap_id_rec *)0)->nir_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding3)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding4) == 24, "found %lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_padding4)); + LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding4)); + + /* Checks for struct nodemap_global_rec */ + LASSERTF((int)sizeof(struct nodemap_global_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct nodemap_global_rec)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_is_active) == 0, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_is_active)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_is_active) == 1, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_is_active)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding1) == 1, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding1)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding1) == 1, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding1)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding2) == 2, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding2)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding2) == 2, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding2)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding3) == 4, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding3)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding3)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding4) == 8, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding4)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding4)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding5) == 16, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding5)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding5)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding6) == 24, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding6)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding6)); + + /* Checks for union nodemap_rec */ + LASSERTF((int)sizeof(union nodemap_rec) == 32, "found %lld\n", + (long long)(int)sizeof(union nodemap_rec)); + + LASSERTF(OFD_ACCESS_READ == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)OFD_ACCESS_READ); + LASSERTF(OFD_ACCESS_WRITE == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)OFD_ACCESS_WRITE); + /* Checks for struct ofd_access_entry_v1 */ + LASSERTF((int)sizeof(struct ofd_access_entry_v1) == 64, "found %lld\n", + (long long)(int)sizeof(struct ofd_access_entry_v1)); + LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_parent_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct 
ofd_access_entry_v1, oae_parent_fid)); + LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_parent_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_parent_fid)); + LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_begin) == 16, "found %lld\n", + (long long)(int)offsetof(struct ofd_access_entry_v1, oae_begin)); + LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_begin) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_begin)); + LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_end) == 24, "found %lld\n", + (long long)(int)offsetof(struct ofd_access_entry_v1, oae_end)); + LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_end)); + LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_time) == 32, "found %lld\n", + (long long)(int)offsetof(struct ofd_access_entry_v1, oae_time)); + LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_time)); + LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_size) == 40, "found %lld\n", + (long long)(int)offsetof(struct ofd_access_entry_v1, oae_size)); + LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_size)); + LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_segment_count) == 44, "found %lld\n", + (long long)(int)offsetof(struct ofd_access_entry_v1, oae_segment_count)); + LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_segment_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_segment_count)); + LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_flags) == 48, "found %lld\n", + (long long)(int)offsetof(struct ofd_access_entry_v1, oae_flags)); + LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_flags)); + LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_reserved1) == 52, "found %lld\n", + (long long)(int)offsetof(struct ofd_access_entry_v1, oae_reserved1)); + LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_reserved1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_reserved1)); + LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_reserved2) == 56, "found %lld\n", + (long long)(int)offsetof(struct ofd_access_entry_v1, oae_reserved2)); + LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_reserved2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_reserved2)); + LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_reserved3) == 60, "found %lld\n", + (long long)(int)offsetof(struct ofd_access_entry_v1, oae_reserved3)); + LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_reserved3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_reserved3)); + + LASSERTF(LUSTRE_ACCESS_LOG_VERSION_1 == 0x00010000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_ACCESS_LOG_VERSION_1); + LASSERTF(LUSTRE_ACCESS_LOG_TYPE_OFD == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_ACCESS_LOG_TYPE_OFD); + /* Checks for struct lustre_access_log_info_v1 */ + LASSERTF((int)sizeof(struct lustre_access_log_info_v1) == 168, "found %lld\n", + (long long)(int)sizeof(struct 
lustre_access_log_info_v1)); + LASSERTF((int)offsetof(struct lustre_access_log_info_v1, lali_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_access_log_info_v1, lali_version)); + LASSERTF((int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_version)); + LASSERTF((int)offsetof(struct lustre_access_log_info_v1, lali_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_access_log_info_v1, lali_type)); + LASSERTF((int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_type)); + LASSERTF((int)offsetof(struct lustre_access_log_info_v1, lali_name) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_access_log_info_v1, lali_name)); + LASSERTF((int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_name) == 128, "found %lld\n", + (long long)(int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_name)); + LASSERTF((int)offsetof(struct lustre_access_log_info_v1, lali_log_size) == 136, "found %lld\n", + (long long)(int)offsetof(struct lustre_access_log_info_v1, lali_log_size)); + LASSERTF((int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_log_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_log_size)); + LASSERTF((int)offsetof(struct lustre_access_log_info_v1, lali_entry_size) == 140, "found %lld\n", + (long long)(int)offsetof(struct lustre_access_log_info_v1, lali_entry_size)); + LASSERTF((int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_entry_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_entry_size)); + + /* Checks for struct lfsck_request */ + LASSERTF((int)sizeof(struct lfsck_request) == 96, "found %lld\n", + (long long)(int)sizeof(struct lfsck_request)); + LASSERTF((int)offsetof(struct lfsck_request, lr_event) == 0, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_event)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_event) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_event)); + LASSERTF((int)offsetof(struct lfsck_request, lr_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_index)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_index)); + LASSERTF((int)offsetof(struct lfsck_request, lr_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_flags)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_flags)); + LASSERTF((int)offsetof(struct lfsck_request, lr_valid) == 12, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_valid)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_valid)); + LASSERTF((int)offsetof(struct lfsck_request, lr_speed) == 16, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_speed)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_speed) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_speed)); + LASSERTF((int)offsetof(struct lfsck_request, lr_version) == 20, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_version)); + 
LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_version) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_version)); + LASSERTF((int)offsetof(struct lfsck_request, lr_active) == 22, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_active)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_active) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_active)); + LASSERTF((int)offsetof(struct lfsck_request, lr_param) == 24, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_param)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_param) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_param)); + LASSERTF((int)offsetof(struct lfsck_request, lr_async_windows) == 26, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_async_windows)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_async_windows) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_async_windows)); + LASSERTF((int)offsetof(struct lfsck_request, lr_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_flags)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_flags)); + LASSERTF((int)offsetof(struct lfsck_request, lr_fid) == 32, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_fid)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_fid)); + LASSERTF((int)offsetof(struct lfsck_request, lr_fid2) == 48, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_fid2)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_fid2)); + LASSERTF((int)offsetof(struct lfsck_request, lr_comp_id) == 64, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_comp_id)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_comp_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_comp_id)); + LASSERTF((int)offsetof(struct lfsck_request, lr_padding_0) == 68, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_padding_0)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_0)); + LASSERTF((int)offsetof(struct lfsck_request, lr_padding_1) == 72, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_padding_1)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_1)); + LASSERTF((int)offsetof(struct lfsck_request, lr_padding_2) == 80, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_padding_2)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_2)); + LASSERTF((int)offsetof(struct lfsck_request, lr_padding_3) == 88, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_padding_3)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_3)); + LASSERTF(LFSCK_TYPE_SCRUB == 0x00000000UL, "found 0x%.8xUL\n", + (unsigned)LFSCK_TYPE_SCRUB); + 
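
The LASSERTF runs in this wirecheck pin every field of the wire structures to a fixed offset and size, so an incompatibly built peer is caught at startup rather than by silent corruption on the wire. As an illustrative aside (not part of the patch), the same invariant for a hypothetical wire record can be expressed with C11 static_assert, which fails at build time on common 64-bit ABIs:

    #include <assert.h>   /* static_assert (C11) */
    #include <stddef.h>   /* offsetof */
    #include <stdint.h>

    /* Hypothetical fixed-layout wire record, analogous to the structs above. */
    struct demo_wire_rec {
            uint32_t dw_magic;      /* offset 0,  size 4 */
            uint16_t dw_count;      /* offset 4,  size 2 */
            uint16_t dw_padding;    /* offset 6,  size 2 */
            uint64_t dw_batchid;    /* offset 8,  size 8 */
    };

    /* The same offset/size checks the LASSERTF calls make, at compile time. */
    static_assert(sizeof(struct demo_wire_rec) == 16, "wire size changed");
    static_assert(offsetof(struct demo_wire_rec, dw_magic) == 0, "dw_magic moved");
    static_assert(offsetof(struct demo_wire_rec, dw_count) == 4, "dw_count moved");
    static_assert(offsetof(struct demo_wire_rec, dw_padding) == 6, "dw_padding moved");
    static_assert(offsetof(struct demo_wire_rec, dw_batchid) == 8, "dw_batchid moved");

The runtime LASSERTF form used by the wirecheck additionally reports the offset or size it actually found, which is more useful when diagnosing a mismatched build.
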
LASSERTF(LFSCK_TYPE_LAYOUT == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LFSCK_TYPE_LAYOUT); + LASSERTF(LFSCK_TYPE_NAMESPACE == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LFSCK_TYPE_NAMESPACE); + LASSERTF(LE_LASTID_REBUILDING == 1, "found %lld\n", + (long long)LE_LASTID_REBUILDING); + LASSERTF(LE_LASTID_REBUILT == 2, "found %lld\n", + (long long)LE_LASTID_REBUILT); + LASSERTF(LE_PHASE1_DONE == 3, "found %lld\n", + (long long)LE_PHASE1_DONE); + LASSERTF(LE_PHASE2_DONE == 4, "found %lld\n", + (long long)LE_PHASE2_DONE); + LASSERTF(LE_START == 5, "found %lld\n", + (long long)LE_START); + LASSERTF(LE_STOP == 6, "found %lld\n", + (long long)LE_STOP); + LASSERTF(LE_QUERY == 7, "found %lld\n", + (long long)LE_QUERY); + LASSERTF(LE_PEER_EXIT == 9, "found %lld\n", + (long long)LE_PEER_EXIT); + LASSERTF(LE_CONDITIONAL_DESTROY == 10, "found %lld\n", + (long long)LE_CONDITIONAL_DESTROY); + LASSERTF(LE_PAIRS_VERIFY == 11, "found %lld\n", + (long long)LE_PAIRS_VERIFY); + LASSERTF(LE_SET_LMV_MASTER == 15, "found %lld\n", + (long long)LE_SET_LMV_MASTER); + LASSERTF(LE_SET_LMV_SLAVE == 16, "found %lld\n", + (long long)LE_SET_LMV_SLAVE); + LASSERTF(LEF_TO_OST == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LEF_TO_OST); + LASSERTF(LEF_FROM_OST == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LEF_FROM_OST); + LASSERTF(LEF_SET_LMV_HASH == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LEF_SET_LMV_HASH); + LASSERTF(LEF_SET_LMV_ALL == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LEF_SET_LMV_ALL); + LASSERTF(LEF_RECHECK_NAME_HASH == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LEF_RECHECK_NAME_HASH); + LASSERTF(LEF_QUERY_ALL == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)LEF_QUERY_ALL); + + /* Checks for struct lfsck_reply */ + LASSERTF((int)sizeof(struct lfsck_reply) == 16, "found %lld\n", + (long long)(int)sizeof(struct lfsck_reply)); + LASSERTF((int)offsetof(struct lfsck_reply, lr_status) == 0, "found %lld\n", + (long long)(int)offsetof(struct lfsck_reply, lr_status)); + LASSERTF((int)sizeof(((struct lfsck_reply *)0)->lr_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_reply *)0)->lr_status)); + LASSERTF((int)offsetof(struct lfsck_reply, lr_padding_1) == 4, "found %lld\n", + (long long)(int)offsetof(struct lfsck_reply, lr_padding_1)); + LASSERTF((int)sizeof(((struct lfsck_reply *)0)->lr_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_reply *)0)->lr_padding_1)); + LASSERTF((int)offsetof(struct lfsck_reply, lr_repaired) == 8, "found %lld\n", + (long long)(int)offsetof(struct lfsck_reply, lr_repaired)); + LASSERTF((int)sizeof(((struct lfsck_reply *)0)->lr_repaired) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_reply *)0)->lr_repaired)); + + /* Checks for struct update_params */ + LASSERTF((int)sizeof(struct update_params) == 0, "found %lld\n", + (long long)(int)sizeof(struct update_params)); + LASSERTF((int)offsetof(struct update_params, up_params) == 0, "found %lld\n", + (long long)(int)offsetof(struct update_params, up_params)); + LASSERTF((int)sizeof(((struct update_params *)0)->up_params) == 0, "found %lld\n", + (long long)(int)sizeof(((struct update_params *)0)->up_params)); + + /* Checks for struct update_op */ + LASSERTF((int)sizeof(struct update_op) == 20, "found %lld\n", + (long long)(int)sizeof(struct update_op)); + LASSERTF((int)offsetof(struct update_op, uop_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct update_op, uop_fid)); + LASSERTF((int)sizeof(((struct update_op *)0)->uop_fid) == 16, "found %lld\n", + (long 
long)(int)sizeof(((struct update_op *)0)->uop_fid)); + LASSERTF((int)offsetof(struct update_op, uop_type) == 16, "found %lld\n", + (long long)(int)offsetof(struct update_op, uop_type)); + LASSERTF((int)sizeof(((struct update_op *)0)->uop_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct update_op *)0)->uop_type)); + LASSERTF((int)offsetof(struct update_op, uop_param_count) == 18, "found %lld\n", + (long long)(int)offsetof(struct update_op, uop_param_count)); + LASSERTF((int)sizeof(((struct update_op *)0)->uop_param_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct update_op *)0)->uop_param_count)); + LASSERTF((int)offsetof(struct update_op, uop_params_off) == 20, "found %lld\n", + (long long)(int)offsetof(struct update_op, uop_params_off)); + + /* Checks for struct update_ops */ + LASSERTF((int)sizeof(struct update_ops) == 0, "found %lld\n", + (long long)(int)sizeof(struct update_ops)); + LASSERTF((int)offsetof(struct update_ops, uops_op) == 0, "found %lld\n", + (long long)(int)offsetof(struct update_ops, uops_op)); + LASSERTF((int)sizeof(((struct update_ops *)0)->uops_op) == 0, "found %lld\n", + (long long)(int)sizeof(((struct update_ops *)0)->uops_op)); + + /* Checks for struct update_records */ + LASSERTF((int)sizeof(struct update_records) == 32, "found %lld\n", + (long long)(int)sizeof(struct update_records)); + LASSERTF((int)offsetof(struct update_records, ur_master_transno) == 0, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_master_transno)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_master_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_master_transno)); + LASSERTF((int)offsetof(struct update_records, ur_batchid) == 8, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_batchid)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_batchid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_batchid)); + LASSERTF((int)offsetof(struct update_records, ur_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_flags)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_flags)); + LASSERTF((int)offsetof(struct update_records, ur_index) == 20, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_index)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_index)); + LASSERTF((int)offsetof(struct update_records, ur_update_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_update_count)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_update_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_update_count)); + LASSERTF((int)offsetof(struct update_records, ur_param_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_param_count)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_param_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_param_count)); + LASSERTF(UPDATE_RECORD_CONTINUE == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)UPDATE_RECORD_CONTINUE); + + /* Checks for struct llog_update_record */ + LASSERTF((int)sizeof(struct llog_update_record) == 48, "found %lld\n", + (long long)(int)sizeof(struct llog_update_record)); + LASSERTF((int)offsetof(struct 
llog_update_record, lur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_update_record, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_update_record *)0)->lur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_update_record *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_update_record, lur_update_rec) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_update_record, lur_update_rec)); + LASSERTF((int)sizeof(((struct llog_update_record *)0)->lur_update_rec) == 32, "found %lld\n", + (long long)(int)sizeof(((struct llog_update_record *)0)->lur_update_rec)); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct lustre_cfg */ + LASSERTF((int)sizeof(struct lustre_cfg) == 32, "found %lld\n", + (long long)(int)sizeof(struct lustre_cfg)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_version)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_version)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_command) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_command)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_command) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_command)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_num) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_num)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_num) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_num)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_flags)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_flags)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_nid) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_nid)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_nid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_nid)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_nal) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_nal)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_nal) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_nal)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_bufcount) == 28, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_bufcount)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_bufcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_bufcount)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_buflens[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_buflens[0])); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_buflens[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_buflens[0])); + LASSERTF(LCFG_ATTACH == 0x000cf001UL, "found 0x%.8xUL\n", + (unsigned)LCFG_ATTACH); + LASSERTF(LCFG_DETACH == 0x000cf002UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DETACH); + LASSERTF(LCFG_SETUP == 0x000cf003UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SETUP); + LASSERTF(LCFG_CLEANUP == 0x000cf004UL, "found 0x%.8xUL\n", + (unsigned)LCFG_CLEANUP); + LASSERTF(LCFG_ADD_UUID == 0x000cf005UL, "found 0x%.8xUL\n", + (unsigned)LCFG_ADD_UUID); + 
LASSERTF(LCFG_DEL_UUID == 0x000cf006UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_UUID); + LASSERTF(LCFG_MOUNTOPT == 0x000cf007UL, "found 0x%.8xUL\n", + (unsigned)LCFG_MOUNTOPT); + LASSERTF(LCFG_DEL_MOUNTOPT == 0x000cf008UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_MOUNTOPT); + LASSERTF(LCFG_SET_TIMEOUT == 0x000cf009UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_TIMEOUT); + LASSERTF(LCFG_SET_UPCALL == 0x000cf00aUL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_UPCALL); + LASSERTF(LCFG_ADD_CONN == 0x000cf00bUL, "found 0x%.8xUL\n", + (unsigned)LCFG_ADD_CONN); + LASSERTF(LCFG_DEL_CONN == 0x000cf00cUL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_CONN); + LASSERTF(LCFG_LOV_ADD_OBD == 0x000cf00dUL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_ADD_OBD); + LASSERTF(LCFG_LOV_DEL_OBD == 0x000cf00eUL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_DEL_OBD); + LASSERTF(LCFG_PARAM == 0x000cf00fUL, "found 0x%.8xUL\n", + (unsigned)LCFG_PARAM); + LASSERTF(LCFG_MARKER == 0x000cf010UL, "found 0x%.8xUL\n", + (unsigned)LCFG_MARKER); + LASSERTF(LCFG_LOG_START == 0x000ce011UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOG_START); + LASSERTF(LCFG_LOG_END == 0x000ce012UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOG_END); + LASSERTF(LCFG_LOV_ADD_INA == 0x000ce013UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_ADD_INA); + LASSERTF(LCFG_ADD_MDC == 0x000cf014UL, "found 0x%.8xUL\n", + (unsigned)LCFG_ADD_MDC); + LASSERTF(LCFG_DEL_MDC == 0x000cf015UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_MDC); + LASSERTF(LCFG_SPTLRPC_CONF == 0x000ce016UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SPTLRPC_CONF); + LASSERTF(LCFG_POOL_NEW == 0x000ce020UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_NEW); + LASSERTF(LCFG_POOL_ADD == 0x000ce021UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_ADD); + LASSERTF(LCFG_POOL_REM == 0x000ce022UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_REM); + LASSERTF(LCFG_POOL_DEL == 0x000ce023UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_DEL); + LASSERTF(LCFG_SET_LDLM_TIMEOUT == 0x000ce030UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_LDLM_TIMEOUT); + LASSERTF(LCFG_PRE_CLEANUP == 0x000cf031UL, "found 0x%.8xUL\n", + (unsigned)LCFG_PRE_CLEANUP); + LASSERTF(LCFG_SET_PARAM == 0x000ce032UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_PARAM); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF(LCFG_NODEMAP_ADD == 0x000ce040UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD); + LASSERTF(LCFG_NODEMAP_DEL == 0x000ce041UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL); + LASSERTF(LCFG_NODEMAP_ADD_RANGE == 0x000ce042UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_RANGE); + LASSERTF(LCFG_NODEMAP_DEL_RANGE == 0x000ce043UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_RANGE); + LASSERTF(LCFG_NODEMAP_ADD_UIDMAP == 0x000ce044UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_UIDMAP); + LASSERTF(LCFG_NODEMAP_DEL_UIDMAP == 0x000ce045UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_UIDMAP); + LASSERTF(LCFG_NODEMAP_ADD_GIDMAP == 0x000ce046UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_GIDMAP); + LASSERTF(LCFG_NODEMAP_DEL_GIDMAP == 0x000ce047UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_GIDMAP); + LASSERTF(LCFG_NODEMAP_ACTIVATE == 0x000ce048UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ACTIVATE); + LASSERTF(LCFG_NODEMAP_ADMIN == 0x000ce049UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADMIN); + LASSERTF(LCFG_NODEMAP_ADD_PROJIDMAP == 0x000ce04aUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_PROJIDMAP); + LASSERTF(LCFG_NODEMAP_DEL_PROJIDMAP == 0x000ce04bUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_PROJIDMAP); + 
LASSERTF(LCFG_NODEMAP_TRUSTED == 0x000ce050UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TRUSTED); + LASSERTF(LCFG_NODEMAP_SQUASH_UID == 0x000ce051UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SQUASH_UID); + LASSERTF(LCFG_NODEMAP_SQUASH_GID == 0x000ce052UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SQUASH_GID); + LASSERTF(LCFG_NODEMAP_ADD_SHKEY == 0x000ce053UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_SHKEY); + LASSERTF(LCFG_NODEMAP_DEL_SHKEY == 0x000ce054UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_SHKEY); + LASSERTF(LCFG_NODEMAP_TEST_NID == 0x000ce055UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TEST_NID); + LASSERTF(LCFG_NODEMAP_TEST_ID == 0x000ce056UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TEST_ID); + LASSERTF(LCFG_NODEMAP_SET_FILESET == 0x000ce057UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SET_FILESET); + LASSERTF(LCFG_NODEMAP_DENY_UNKNOWN == 0x000ce058UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DENY_UNKNOWN); + LASSERTF(LCFG_NODEMAP_MAP_MODE == 0x000ce059UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_MAP_MODE); + LASSERTF(LCFG_NODEMAP_AUDIT_MODE == 0x000ce05aUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_AUDIT_MODE); + LASSERTF(LCFG_NODEMAP_SET_SEPOL == 0x000ce05bUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SET_SEPOL); + LASSERTF(LCFG_NODEMAP_FORBID_ENCRYPT == 0x000ce05cUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_FORBID_ENCRYPT); + LASSERTF(LCFG_NODEMAP_SQUASH_PROJID == 0x000ce05dUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SQUASH_PROJID); +#endif /* HAVE_SERVER_SUPPORT */ + LASSERTF(PORTALS_CFG_TYPE == 1, "found %lld\n", + (long long)PORTALS_CFG_TYPE); + LASSERTF(LUSTRE_CFG_TYPE == 123, "found %lld\n", + (long long)LUSTRE_CFG_TYPE); +} diff --git a/drivers/staging/lustrefsx/lustre/target/barrier.c b/drivers/staging/lustrefsx/lustre/target/barrier.c new file mode 100644 index 0000000000000..4f8c5b3dfdb4d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/barrier.c @@ -0,0 +1,412 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + * + * lustre/target/barrier.c + * + * Currently, the Lustre barrier is implemented as write barrier on all MDTs. + * For each MDT in the system, when it starts, it registers a barrier instance + * that will be used in handling subsequent barrier requests. 
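
The header comment above notes that each MDT registers a barrier instance at start-up; barrier_register() and barrier_instance_find() below keep those instances on a global list keyed by the bottom dt_device and hand out references under a spinlock. As a simplified, purely illustrative userspace analogue (a pthread mutex and a plain counter stand in for the kernel spinlock and atomic_t):

    #include <pthread.h>
    #include <stdlib.h>

    /* Instances live on a global list keyed by an opaque device pointer;
     * a lookup takes a reference and the final put frees the instance. */
    struct demo_instance {
            struct demo_instance *next;
            void *key;              /* stands in for struct dt_device *bi_bottom */
            int refcount;           /* stands in for atomic_t bi_ref */
    };

    static struct demo_instance *demo_list;
    static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

    static struct demo_instance *demo_find(void *key)
    {
            struct demo_instance *it;

            pthread_mutex_lock(&demo_lock);
            for (it = demo_list; it; it = it->next)
                    if (it->key == key) {
                            it->refcount++; /* reference taken under the lock */
                            break;
                    }
            pthread_mutex_unlock(&demo_lock);
            return it;
    }

    static void demo_put(struct demo_instance *it)
    {
            int last;

            pthread_mutex_lock(&demo_lock);
            last = (--it->refcount == 0);
            pthread_mutex_unlock(&demo_lock);
            if (last)
                    free(it);       /* barrier_instance_cleanup() equivalent */
    }

The kernel code drops references with atomic_dec_and_test() so the common path never takes the list lock; the sketch trades that for simplicity.
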
+ * + * Author: Fan, Yong + */ + +#define DEBUG_SUBSYSTEM S_SNAPSHOT + +#include + +#include +#include +#include +#include +#include + +static LIST_HEAD(barrier_instance_list); +static DEFINE_SPINLOCK(barrier_instance_lock); + +struct barrier_instance { + struct list_head bi_link; + struct dt_device *bi_bottom; + struct dt_device *bi_next; + wait_queue_head_t bi_waitq; + rwlock_t bi_rwlock; + struct percpu_counter bi_writers; + atomic_t bi_ref; + time64_t bi_deadline; + __u32 bi_status; +}; + +static inline char *barrier_barrier2name(struct barrier_instance *barrier) +{ + return barrier->bi_bottom->dd_lu_dev.ld_obd->obd_name; +} + +static inline __u32 barrier_dev_idx(struct barrier_instance *barrier) +{ + return lu_site2seq(barrier->bi_bottom->dd_lu_dev.ld_site)->ss_node_id; +} + +static void barrier_instance_cleanup(struct barrier_instance *barrier) +{ + LASSERT(list_empty(&barrier->bi_link)); + + percpu_counter_destroy(&barrier->bi_writers); + OBD_FREE_PTR(barrier); +} + +static inline void barrier_instance_put(struct barrier_instance *barrier) +{ + if (atomic_dec_and_test(&barrier->bi_ref)) + barrier_instance_cleanup(barrier); +} + +static struct barrier_instance * +barrier_instance_find_locked(struct dt_device *key) +{ + struct barrier_instance *barrier; + + list_for_each_entry(barrier, &barrier_instance_list, bi_link) { + if (barrier->bi_bottom == key) + return barrier; + } + + return NULL; +} + +static void barrier_instance_add(struct barrier_instance *barrier) +{ + struct barrier_instance *tmp; + + spin_lock(&barrier_instance_lock); + tmp = barrier_instance_find_locked(barrier->bi_bottom); + LASSERT(!tmp); + + list_add_tail(&barrier->bi_link, &barrier_instance_list); + spin_unlock(&barrier_instance_lock); +} + +static struct barrier_instance *barrier_instance_find(struct dt_device *key) +{ + struct barrier_instance *barrier; + + spin_lock(&barrier_instance_lock); + barrier = barrier_instance_find_locked(key); + if (barrier) + atomic_inc(&barrier->bi_ref); + spin_unlock(&barrier_instance_lock); + + return barrier; +} + +static void barrier_set(struct barrier_instance *barrier, __u32 status) +{ + if (barrier->bi_status != status) { + CDEBUG(D_SNAPSHOT, "%s: change barrier status from %u to %u\n", + barrier_barrier2name(barrier), + barrier->bi_status, status); + + barrier->bi_status = status; + } +} + +/** + * Create the barrier for the given instance. + * + * We use two-phases barrier to guarantee that after the barrier setup: + * 1) All the MDT side pending async modification have been flushed. + * 2) Any subsequent modification will be blocked. + * 3) All async transactions on the MDTs have been committed. + * + * For phase1, we do the following: + * + * Firstly, it sets barrier flag on the instance that will block subsequent + * modifications from clients. (Note: server sponsored modification will be + * allowed for flush pending modifications) + * + * Secondly, it will flush all pending modification via dt_sync(), such as + * async OST-object destroy, async OST-object owner changes, and so on. + * + * If there are some on-handling clients sponsored modifications during the + * barrier freezing, then related modifications may cause pending requests + * after the first dt_sync(), so call dt_sync() again after all on-handling + * modifications done. + * + * With the phase1 barrier set, all pending cross-servers modification have + * been flushed to remote servers, and any new modification will be blocked. 
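
To make the two-phase freeze described in the surrounding comment concrete, here is a compressed and purely illustrative userspace sketch using C11 atomics: a single counter replaces the per-CPU bi_writers counter, an atomic flag replaces bi_status and bi_rwlock, and deadlines, wait queues, and phase 2 are omitted.

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool demo_freezing;   /* stand-in for BS_FREEZING_P1/P2 */
    static atomic_long demo_writers;    /* in-flight modifications        */

    /* Writer side: the analogue of barrier_entry()/barrier_exit(). */
    static bool demo_entry(void)
    {
            if (atomic_load(&demo_freezing))
                    return false;                   /* barrier set: block the write */
            atomic_fetch_add(&demo_writers, 1);
            /* re-check so a freeze racing with us still sees our count */
            if (atomic_load(&demo_freezing)) {
                    atomic_fetch_sub(&demo_writers, 1);
                    return false;
            }
            return true;
    }

    static void demo_exit(void)
    {
            atomic_fetch_sub(&demo_writers, 1);
    }

    /* Freezer side: phase 1. flush() stands in for dt_sync() on the device. */
    static void demo_freeze_phase1(void (*flush)(void))
    {
            atomic_store(&demo_freezing, true);     /* block new writers        */
            flush();                                /* flush pending changes    */
            while (atomic_load(&demo_writers) > 0)
                    ;                               /* kernel code sleeps on
                                                     * bi_waitq with a deadline */
            flush();                                /* sync again afterwards    */
    }

The patch obtains the same publish/observe ordering from explicit smp_mb() calls around the status change and the counter check; the sketch leans on the default sequentially consistent atomics instead.
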
+ * But it does not guarantees that all the updates have been committed to + * storage on remote servers. So when all the instances have done phase1 + * barrier successfully, the MGS will notify all instances to do the phase2 + * barrier as following: + * + * Every barrier instance will call dt_sync() to make all async transactions + * to be committed locally. + * + * \param[in] env pointer to the thread context + * \param[in] barrier pointer to the barrier instance + * \param[in] phase1 indicate whether it is phase1 barrier or not + * + * \retval positive number for timeout + * \retval 0 for success + * \retval negative error number on failure + */ +static int barrier_freeze(const struct lu_env *env, + struct barrier_instance *barrier, bool phase1) +{ + time64_t left; + int rc = 0; + __s64 inflight = 0; + ENTRY; + + write_lock(&barrier->bi_rwlock); + barrier_set(barrier, phase1 ? BS_FREEZING_P1 : BS_FREEZING_P2); + + /* Avoid out-of-order execution the barrier_set() + * and the check of inflight modifications count. */ + smp_mb(); + + if (phase1) + inflight = percpu_counter_sum(&barrier->bi_writers); + write_unlock(&barrier->bi_rwlock); + + rc = dt_sync(env, barrier->bi_next); + if (rc) + RETURN(rc); + + LASSERT(barrier->bi_deadline != 0); + + left = barrier->bi_deadline - ktime_get_real_seconds(); + if (left <= 0) + RETURN(1); + + if (phase1 && inflight != 0) { + rc = wait_event_idle_timeout( + barrier->bi_waitq, + percpu_counter_sum(&barrier->bi_writers) == 0, + cfs_time_seconds(left)); + if (rc <= 0) + RETURN(1); + + /* sync again after all inflight modifications done. */ + rc = dt_sync(env, barrier->bi_next); + if (rc) + RETURN(rc); + + if (ktime_get_real_seconds() > barrier->bi_deadline) + RETURN(1); + } + + CDEBUG(D_SNAPSHOT, "%s: barrier freezing %s done.\n", + barrier_barrier2name(barrier), phase1 ? "phase1" : "phase2"); + + if (!phase1) + barrier_set(barrier, BS_FROZEN); + + RETURN(0); +} + +void barrier_init(void) +{ +} + +void barrier_fini(void) +{ + LASSERT(list_empty(&barrier_instance_list)); +} + +bool barrier_entry(struct dt_device *key) +{ + struct barrier_instance *barrier; + bool entered = false; + ENTRY; + + barrier = barrier_instance_find(key); + if (unlikely(!barrier)) + /* Fail open */ + RETURN(true); + + read_lock(&barrier->bi_rwlock); + if (likely(barrier->bi_status != BS_FREEZING_P1 && + barrier->bi_status != BS_FREEZING_P2 && + barrier->bi_status != BS_FROZEN) || + ktime_get_real_seconds() > barrier->bi_deadline) { + percpu_counter_inc(&barrier->bi_writers); + entered = true; + } + read_unlock(&barrier->bi_rwlock); + + barrier_instance_put(barrier); + return entered; +} +EXPORT_SYMBOL(barrier_entry); + +void barrier_exit(struct dt_device *key) +{ + struct barrier_instance *barrier; + + barrier = barrier_instance_find(key); + if (likely(barrier)) { + percpu_counter_dec(&barrier->bi_writers); + + /* Avoid out-of-order execution the decreasing inflight + * modifications count and the check of barrier status. 
*/ + smp_mb(); + + if (unlikely(barrier->bi_status == BS_FREEZING_P1)) + wake_up(&barrier->bi_waitq); + barrier_instance_put(barrier); + } +} +EXPORT_SYMBOL(barrier_exit); + +int barrier_handler(struct dt_device *key, struct ptlrpc_request *req) +{ + struct ldlm_gl_barrier_desc *desc; + struct barrier_instance *barrier; + struct barrier_lvb *lvb; + struct lu_env env; + int rc = 0; + ENTRY; + + /* glimpse on barrier locks always packs a glimpse descriptor */ + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK_DESC); + desc = req_capsule_client_get(&req->rq_pill, &RMF_DLM_GL_DESC); + if (!desc) + GOTO(out, rc = -EPROTO); + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + sizeof(struct barrier_lvb)); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + + lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB); + barrier = barrier_instance_find(key); + if (!barrier) + GOTO(out, rc = -ENODEV); + + rc = lu_env_init(&env, LCT_MD_THREAD | LCT_DT_THREAD); + if (rc) + GOTO(out_barrier, rc); + + CDEBUG(D_SNAPSHOT, + "%s: handling barrier request: status %u, timeout %u\n", + barrier_barrier2name(barrier), + desc->lgbd_status, desc->lgbd_timeout); + + switch (desc->lgbd_status) { + case BS_RESCAN: + barrier_set(barrier, BS_INIT); + break; + case BS_FREEZING_P1: + case BS_FREEZING_P2: + if (OBD_FAIL_CHECK(OBD_FAIL_BARRIER_FAILURE)) + GOTO(fini, rc = -EINVAL); + + barrier->bi_deadline = ktime_get_real_seconds() + + desc->lgbd_timeout; + rc = barrier_freeze(&env, barrier, + desc->lgbd_status == BS_FREEZING_P1); + break; + case BS_THAWING: + case BS_FAILED: + case BS_EXPIRED: + barrier_set(barrier, BS_THAWED); + break; + default: + CWARN("%s: unexpected barrier status %u\n", + barrier_barrier2name(barrier), desc->lgbd_status); + rc = -EINVAL; + break; + } + + GOTO(fini, rc); + +fini: + lu_env_fini(&env); + +out_barrier: + if (rc < 0) + barrier_set(barrier, BS_FAILED); + else if (rc > 0) + barrier_set(barrier, BS_EXPIRED); + + lvb->lvb_status = barrier->bi_status; + lvb->lvb_index = barrier_dev_idx(barrier); + + CDEBUG(D_SNAPSHOT, "%s: handled barrier request: status %u, " + "deadline %lld: rc = %d\n", barrier_barrier2name(barrier), + lvb->lvb_status, barrier->bi_deadline, rc); + + barrier_instance_put(barrier); + rc = 0; + +out: + req->rq_status = rc; + return rc; +} +EXPORT_SYMBOL(barrier_handler); + +int barrier_register(struct dt_device *key, struct dt_device *next) +{ + struct barrier_instance *barrier; + int rc; + ENTRY; + + OBD_ALLOC_PTR(barrier); + if (!barrier) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&barrier->bi_link); + barrier->bi_bottom = key; + barrier->bi_next = next; + init_waitqueue_head(&barrier->bi_waitq); + rwlock_init(&barrier->bi_rwlock); + atomic_set(&barrier->bi_ref, 1); +#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + rc = percpu_counter_init(&barrier->bi_writers, 0, GFP_KERNEL); +#else + rc = percpu_counter_init(&barrier->bi_writers, 0); +#endif + if (rc) + barrier_instance_cleanup(barrier); + else + barrier_instance_add(barrier); + + RETURN(rc); +} +EXPORT_SYMBOL(barrier_register); + +void barrier_deregister(struct dt_device *key) +{ + struct barrier_instance *barrier; + + spin_lock(&barrier_instance_lock); + barrier = barrier_instance_find_locked(key); + if (barrier) + list_del_init(&barrier->bi_link); + spin_unlock(&barrier_instance_lock); + + if (barrier) + barrier_instance_put(barrier); +} +EXPORT_SYMBOL(barrier_deregister); diff --git a/drivers/staging/lustrefsx/lustre/target/out_handler.c 
b/drivers/staging/lustrefsx/lustre/target/out_handler.c new file mode 100644 index 0000000000000..57c0d914f8ba7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/out_handler.c @@ -0,0 +1,1254 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + * + * lustre/target/out_handler.c + * + * Object update handler between targets. + * + * Author: di.wang + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include "tgt_internal.h" + +static inline void orr_cpu_to_le(struct out_read_reply *orr_dst, + const struct out_read_reply *orr_src) +{ + orr_dst->orr_size = cpu_to_le32(orr_src->orr_size); + orr_dst->orr_padding = cpu_to_le32(orr_src->orr_padding); + orr_dst->orr_offset = cpu_to_le64(orr_dst->orr_offset); +} + +static void out_reconstruct(const struct lu_env *env, struct dt_device *dt, + struct dt_object *obj, + struct object_update_reply *reply, + int index) +{ + CDEBUG(D_HA, "%s: fork reply reply %p index %d: rc = %d\n", + dt_obd_name(dt), reply, index, 0); + + object_update_result_insert(reply, NULL, 0, index, 0); +} + +typedef void (*out_reconstruct_t)(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *obj, + struct object_update_reply *reply, + int index); + +static inline bool out_check_resent(struct ptlrpc_request *req) +{ + if (likely(!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) + return false; + + if (req_xid_is_last(req)) { + struct lsd_client_data *lcd; + + /* XXX this does not support mulitple transactions yet, i.e. 
+ * only 1 update RPC each time betwee MDTs */ + lcd = req->rq_export->exp_target_data.ted_lcd; + + req->rq_transno = lcd->lcd_last_transno; + req->rq_status = lcd->lcd_last_result; + if (req->rq_status != 0) + req->rq_transno = 0; + lustre_msg_set_transno(req->rq_repmsg, req->rq_transno); + lustre_msg_set_status(req->rq_repmsg, req->rq_status); + + DEBUG_REQ(D_HA, req, "reconstruct resent RPC"); + return true; + } + DEBUG_REQ(D_HA, req, "reprocess RESENT req, last_xid is %lld", + req->rq_export->exp_target_data.ted_lcd->lcd_last_xid); + return false; +} + +static int out_create(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct dt_object_format *dof = &tti->tti_u.update.tti_update_dof; + struct obdo *lobdo = &tti->tti_u.update.tti_obdo; + struct lu_attr *attr = &tti->tti_attr; + struct lu_fid *fid = NULL; + struct obdo *wobdo; + size_t size; + int rc; + + ENTRY; + + wobdo = object_update_param_get(update, 0, &size); + if (IS_ERR(wobdo) || size != sizeof(*wobdo)) { + CERROR("%s: obdo is NULL, invalid RPC: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(wobdo)); + RETURN(PTR_ERR(wobdo)); + } + + if (req_capsule_req_need_swab(tsi->tsi_pill)) + lustre_swab_obdo(wobdo); + lustre_get_wire_obdo(NULL, lobdo, wobdo); + la_from_obdo(attr, lobdo, lobdo->o_valid); + + dof->dof_type = dt_mode_to_dft(attr->la_mode); + if (update->ou_params_count > 1) { + fid = object_update_param_get(update, 1, &size); + if (IS_ERR(fid) || size != sizeof(*fid)) { + CERROR("%s: invalid fid: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(fid)); + RETURN(PTR_ERR(fid)); + } + if (req_capsule_req_need_swab(tsi->tsi_pill)) + lustre_swab_lu_fid(fid); + if (!fid_is_sane(fid)) { + CERROR("%s: invalid fid "DFID": rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(fid), -EPROTO); + RETURN(-EPROTO); + } + } + + if (lu_object_exists(&obj->do_lu)) + RETURN(-EEXIST); + + rc = out_tx_create(tsi->tsi_env, obj, attr, fid, dof, + &tti->tti_tea, tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + + RETURN(rc); +} + +static int out_attr_set(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct lu_attr *attr = &tti->tti_attr; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct obdo *lobdo = &tti->tti_u.update.tti_obdo; + struct obdo *wobdo; + size_t size; + int rc; + + ENTRY; + + wobdo = object_update_param_get(update, 0, &size); + if (IS_ERR(wobdo) || size != sizeof(*wobdo)) { + CERROR("%s: empty obdo in the update: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(wobdo)); + RETURN(PTR_ERR(wobdo)); + } + + attr->la_valid = 0; + attr->la_valid = 0; + + if (req_capsule_req_need_swab(tsi->tsi_pill)) + lustre_swab_obdo(wobdo); + lustre_get_wire_obdo(NULL, lobdo, wobdo); + la_from_obdo(attr, lobdo, lobdo->o_valid); + + rc = out_tx_attr_set(tsi->tsi_env, obj, attr, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + + RETURN(rc); +} + +static int out_attr_get(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct object_update *update = tti->tti_u.update.tti_update; + struct obdo *obdo = &tti->tti_u.update.tti_obdo; + struct lu_attr *la = &tti->tti_attr; + 
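
out_create() and the other handlers here all unpack update parameters the same way: fail with -EPROTO if a parameter is missing or not the expected size, byte-swap it only when the request came from a peer of the other endianness, and only then validate its contents (the fid_is_sane() step). A hypothetical, self-contained userspace version of that pattern, with demo_fid and demo_get_fid standing in for the real lu_fid and object_update_param_get() plumbing (byte swaps use the GCC/Clang builtins):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    struct demo_fid {               /* hypothetical wire parameter */
            uint64_t f_seq;
            uint32_t f_oid;
            uint32_t f_ver;
    };

    /*
     * 1. reject a missing or short parameter before touching it,
     * 2. byte-swap in place only when the peer used the other endianness,
     * 3. only then apply semantic validation.
     */
    static int demo_get_fid(const void *param, size_t size, bool need_swab,
                            struct demo_fid *out)
    {
            if (!param || size != sizeof(*out))
                    return -1;                      /* -EPROTO in the handlers */

            memcpy(out, param, sizeof(*out));
            if (need_swab) {
                    out->f_seq = __builtin_bswap64(out->f_seq);
                    out->f_oid = __builtin_bswap32(out->f_oid);
                    out->f_ver = __builtin_bswap32(out->f_ver);
            }
            /* crude stand-in for the fid_is_sane() sanity check */
            return (out->f_seq != 0 || out->f_oid != 0) ? 0 : -1;
    }
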
struct dt_object *obj = tti->tti_u.update.tti_dt_object; + int idx = tti->tti_u.update.tti_update_reply_index; + int rc; + + ENTRY; + + if (unlikely(update->ou_result_size < sizeof(*obdo))) + return -EPROTO; + + if (!lu_object_exists(&obj->do_lu)) { + /* Usually, this will be called when the master MDT try + * to init a remote object(see osp_object_init), so if + * the object does not exist on slave, we need set BANSHEE flag, + * so the object can be removed from the cache immediately */ + set_bit(LU_OBJECT_HEARD_BANSHEE, + &obj->do_lu.lo_header->loh_flags); + RETURN(-ENOENT); + } + + dt_read_lock(env, obj, DT_TGT_CHILD); + rc = dt_attr_get(env, obj, la); + if (rc) + GOTO(out_unlock, rc); + + obdo->o_valid = 0; + obdo_from_la(obdo, la, la->la_valid); + +out_unlock: + dt_read_unlock(env, obj); + + CDEBUG(D_INFO, "%s: insert attr get reply %p index %d: rc = %d\n", + tgt_name(tsi->tsi_tgt), tti->tti_u.update.tti_update_reply, + 0, rc); + + object_update_result_insert(tti->tti_u.update.tti_update_reply, obdo, + sizeof(*obdo), idx, rc); + + RETURN(rc); +} + +static int out_xattr_get(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct object_update *update = tti->tti_u.update.tti_update; + struct lu_buf *lbuf = &tti->tti_buf; + struct object_update_reply *reply = tti->tti_u.update.tti_update_reply; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + struct object_update_result *update_result; + int idx = tti->tti_u.update.tti_update_reply_index; + int rc; + + ENTRY; + + if (!lu_object_exists(&obj->do_lu)) { + set_bit(LU_OBJECT_HEARD_BANSHEE, + &obj->do_lu.lo_header->loh_flags); + RETURN(-ENOENT); + } + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for xattr get: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + update_result = object_update_result_get(reply, idx, NULL); + if (update_result == NULL) { + CERROR("%s: empty name for xattr get: rc = %d\n", + tgt_name(tsi->tsi_tgt), -EPROTO); + RETURN(-EPROTO); + } + + lbuf->lb_len = (int)tti->tti_u.update.tti_update->ou_result_size; + if (lbuf->lb_len == 0) + lbuf->lb_buf = NULL; + else + lbuf->lb_buf = update_result->our_data; + + dt_read_lock(env, obj, DT_TGT_CHILD); + rc = dt_xattr_get(env, obj, lbuf, name); + dt_read_unlock(env, obj); + if (rc <= 0) { + lbuf->lb_len = 0; + if (unlikely(!rc)) + rc = -ENODATA; + } else if (lbuf->lb_buf) { + lbuf->lb_len = rc; + } + CDEBUG(D_INFO, "%s: "DFID" get xattr %s len %d\n", + tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), + name, rc); + + GOTO(out, rc); + +out: + object_update_result_insert(reply, lbuf->lb_buf, lbuf->lb_len, idx, rc); + RETURN(0); +} + +static int out_xattr_list(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct lu_buf *lbuf = &tti->tti_buf; + struct object_update_reply *reply = tti->tti_u.update.tti_update_reply; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct object_update_result *update_result; + int idx = tti->tti_u.update.tti_update_reply_index; + int rc; + + ENTRY; + + if (!lu_object_exists(&obj->do_lu)) { + set_bit(LU_OBJECT_HEARD_BANSHEE, + &obj->do_lu.lo_header->loh_flags); + RETURN(-ENOENT); + } + + update_result = object_update_result_get(reply, 0, NULL); + if (!update_result) { + rc = -EPROTO; + CERROR("%s: empty buf for xattr list: rc = %d\n", + tgt_name(tsi->tsi_tgt), 
rc); + RETURN(rc); + } + + lbuf->lb_len = (int)tti->tti_u.update.tti_update->ou_result_size; + lbuf->lb_buf = update_result->our_data; + if (lbuf->lb_len == 0) + lbuf->lb_buf = 0; + + dt_read_lock(env, obj, DT_TGT_CHILD); + rc = dt_xattr_list(env, obj, lbuf); + dt_read_unlock(env, obj); + if (rc <= 0) { + lbuf->lb_len = 0; + if (unlikely(!rc)) + rc = -ENODATA; + } else if (lbuf->lb_buf) { + lbuf->lb_len = rc; + } + + CDEBUG(D_INFO, "%s: "DFID" list xattr len %d\n", + tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), rc); + + /* Since we directly use update_result->our_data as the lbuf->lb_buf, + * then use NULL for result_insert to avoid unnecessary memory copy. */ + object_update_result_insert(reply, NULL, lbuf->lb_len, idx, rc); + + RETURN(0); +} + +static int out_index_lookup(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + int rc; + + ENTRY; + + if (unlikely(update->ou_result_size < sizeof(tti->tti_fid1))) + return -EPROTO; + + if (!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for lookup: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + dt_read_lock(env, obj, DT_TGT_CHILD); + if (!dt_try_as_dir(env, obj)) + GOTO(out_unlock, rc = -ENOTDIR); + + rc = dt_lookup(env, obj, (struct dt_rec *)&tti->tti_fid1, + (struct dt_key *)name); + + if (rc < 0) + GOTO(out_unlock, rc); + + if (rc == 0) + rc += 1; + +out_unlock: + dt_read_unlock(env, obj); + + CDEBUG(D_INFO, "lookup "DFID" %s get "DFID" rc %d\n", + PFID(lu_object_fid(&obj->do_lu)), name, + PFID(&tti->tti_fid1), rc); + + CDEBUG(D_INFO, "%s: insert lookup reply %p index %d: rc = %d\n", + tgt_name(tsi->tsi_tgt), tti->tti_u.update.tti_update_reply, + 0, rc); + + object_update_result_insert(tti->tti_u.update.tti_update_reply, + &tti->tti_fid1, sizeof(tti->tti_fid1), + tti->tti_u.update.tti_update_reply_index, rc); + RETURN(rc); +} + +static int out_xattr_set(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct lu_buf *lbuf = &tti->tti_buf; + char *name; + char *buf; + __u32 *tmp; + size_t buf_len = 0; + int flag; + size_t size = 0; + int rc; + ENTRY; + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for xattr set: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + /* If buffer == NULL (-ENODATA), then it might mean delete xattr */ + buf = object_update_param_get(update, 1, &buf_len); + if (IS_ERR(buf) && PTR_ERR(buf) != -ENODATA) + RETURN(PTR_ERR(buf)); + + lbuf->lb_buf = buf; + lbuf->lb_len = buf_len; + + tmp = object_update_param_get(update, 2, &size); + if (IS_ERR(tmp) || size != sizeof(*tmp)) { + CERROR("%s: emptry or wrong size %zu flag: rc = %ld\n", + tgt_name(tsi->tsi_tgt), size, PTR_ERR(tmp)); + RETURN(PTR_ERR(tmp)); + } + + if (req_capsule_req_need_swab(tsi->tsi_pill)) + __swab32s(tmp); + flag = *tmp; + + rc = out_tx_xattr_set(tsi->tsi_env, obj, lbuf, name, flag, + &tti->tti_tea, tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int 
out_xattr_del(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + int rc; + ENTRY; + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for xattr set: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + rc = out_tx_xattr_del(tsi->tsi_env, obj, name, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +/** + * increase ref of the object + **/ +static int out_ref_add(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + int rc; + + ENTRY; + + rc = out_tx_ref_add(tsi->tsi_env, obj, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_ref_del(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + int rc; + + ENTRY; + + if (!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + rc = out_tx_ref_del(tsi->tsi_env, obj, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_index_insert(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct dt_insert_rec *rec = &tti->tti_rec; + struct lu_fid *fid; + char *name; + __u32 *ptype; + int rc = 0; + size_t size; + ENTRY; + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for index insert: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + fid = object_update_param_get(update, 1, &size); + if (IS_ERR(fid) || size != sizeof(*fid)) { + CERROR("%s: invalid fid: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(fid)); + RETURN(PTR_ERR(fid)); + } + + if (req_capsule_req_need_swab(tsi->tsi_pill)) + lustre_swab_lu_fid(fid); + + if (!fid_is_sane(fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(fid), -EPROTO); + RETURN(-EPROTO); + } + + ptype = object_update_param_get(update, 2, &size); + if (IS_ERR(ptype) || size != sizeof(*ptype)) { + CERROR("%s: invalid type for index insert: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(ptype)); + RETURN(PTR_ERR(ptype)); + } + + if (req_capsule_req_need_swab(tsi->tsi_pill)) + __swab32s(ptype); + + rec->rec_fid = fid; + rec->rec_type = *ptype; + + rc = out_tx_index_insert(tsi->tsi_env, obj, (const struct dt_rec *)rec, + (const struct dt_key *)name, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + + CDEBUG(D_INFO, "%s: "DFID" index insert %s: rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), + name, rc); + + RETURN(rc); +} + +static int out_index_delete(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + int rc = 0; + + if 
(!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for index delete: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + rc = out_tx_index_delete(tsi->tsi_env, obj, (const struct dt_key *)name, + &tti->tti_tea, tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_destroy(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct lu_fid *fid; + int rc; + ENTRY; + + fid = &update->ou_fid; + if (!fid_is_sane(fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(fid), -EPROTO); + RETURN(-EPROTO); + } + + if (!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + rc = out_tx_destroy(tsi->tsi_env, obj, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + + RETURN(rc); +} + +static int out_write(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct lu_buf *lbuf = &tti->tti_buf; + char *buf; + __u64 *tmp; + size_t size = 0; + size_t buf_len = 0; + loff_t pos; + int rc; + ENTRY; + + buf = object_update_param_get(update, 0, &buf_len); + if (IS_ERR(buf) || buf_len == 0) { + CERROR("%s: empty buf for xattr set: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(buf)); + RETURN(PTR_ERR(buf)); + } + lbuf->lb_buf = buf; + lbuf->lb_len = buf_len; + + tmp = object_update_param_get(update, 1, &size); + if (IS_ERR(tmp) || size != sizeof(*tmp)) { + CERROR("%s: empty or wrong size %zu pos: rc = %ld\n", + tgt_name(tsi->tsi_tgt), size, PTR_ERR(tmp)); + RETURN(PTR_ERR(tmp)); + } + + if (req_capsule_req_need_swab(tsi->tsi_pill)) + __swab64s(tmp); + pos = *tmp; + + rc = out_tx_write(tsi->tsi_env, obj, lbuf, pos, + &tti->tti_tea, tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_read(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct object_update_reply *reply = tti->tti_u.update.tti_update_reply; + int index = tti->tti_u.update.tti_update_reply_index; + struct lu_rdbuf *rdbuf; + struct object_update_result *update_result; + struct out_read_reply *orr; + void *tmp; + size_t size; + size_t total_size = 0; + __u64 pos; + unsigned int i; + unsigned int nbufs; + int rc = 0; + ENTRY; + + update_result = object_update_result_get(reply, index, NULL); + LASSERT(update_result != NULL); + update_result->our_datalen = sizeof(*orr); + + if (!lu_object_exists(&obj->do_lu)) + GOTO(out, rc = -ENOENT); + + tmp = object_update_param_get(update, 0, NULL); + if (IS_ERR(tmp)) { + CERROR("%s: empty size for read: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(tmp)); + GOTO(out, rc = PTR_ERR(tmp)); + } + size = le64_to_cpu(*(size_t *)(tmp)); + + tmp = object_update_param_get(update, 1, NULL); + if (IS_ERR(tmp)) { + CERROR("%s: empty pos for read: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(tmp)); 
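Both out_write() and out_read() above pull 64-bit scalars (offset and size) off the wire and convert them to host byte order before use. The following self-contained illustration shows that conversion without the req_capsule/swab helpers; the function name and buffer are hypothetical.

#include <stdint.h>
#include <stdio.h>

/* Decode a 64-bit little-endian wire field into a host-endian value. */
static uint64_t wire_le64_to_cpu(const void *wire)
{
	const uint8_t *p = wire;
	uint64_t v = 0;
	int i;

	/* assemble byte-by-byte so the result is host-endian on any CPU */
	for (i = 7; i >= 0; i--)
		v = (v << 8) | p[i];
	return v;
}

int main(void)
{
	uint8_t wire[8] = { 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

	printf("pos = %llu\n", (unsigned long long)wire_le64_to_cpu(wire));
	return 0;
}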
+ GOTO(out, rc = PTR_ERR(tmp)); + } + pos = le64_to_cpu(*(__u64 *)(tmp)); + + /* Put the offset into the begining of the buffer in reply */ + orr = (struct out_read_reply *)update_result->our_data; + + nbufs = (size + OUT_BULK_BUFFER_SIZE - 1) / OUT_BULK_BUFFER_SIZE; + OBD_ALLOC(rdbuf, sizeof(*rdbuf) + nbufs * sizeof(rdbuf->rb_bufs[0])); + if (rdbuf == NULL) + GOTO(out, rc = -ENOMEM); + + rdbuf->rb_nbufs = 0; + total_size = 0; + for (i = 0; i < nbufs; i++) { + __u32 read_size; + + read_size = size > OUT_BULK_BUFFER_SIZE ? + OUT_BULK_BUFFER_SIZE : size; + OBD_ALLOC(rdbuf->rb_bufs[i].lb_buf, read_size); + if (rdbuf->rb_bufs[i].lb_buf == NULL) + GOTO(out_free, rc = -ENOMEM); + + rdbuf->rb_bufs[i].lb_len = read_size; + dt_read_lock(env, obj, DT_TGT_CHILD); + rc = dt_read(env, obj, &rdbuf->rb_bufs[i], &pos); + dt_read_unlock(env, obj); + + total_size += rc < 0 ? 0 : rc; + if (rc <= 0) + break; + + rdbuf->rb_nbufs++; + size -= read_size; + } + + /* send pages to client */ + rc = tgt_send_buffer(tsi, rdbuf); + if (rc < 0) + GOTO(out_free, rc); + + orr->orr_size = total_size; + orr->orr_offset = pos; + + orr_cpu_to_le(orr, orr); + update_result->our_datalen += orr->orr_size; +out_free: + for (i = 0; i < nbufs; i++) { + if (rdbuf->rb_bufs[i].lb_buf != NULL) { + OBD_FREE(rdbuf->rb_bufs[i].lb_buf, + rdbuf->rb_bufs[i].lb_len); + } + } + OBD_FREE(rdbuf, sizeof(*rdbuf) + + nbufs * sizeof(rdbuf->rb_bufs[0])); +out: + /* Insert read buffer */ + update_result->our_rc = ptlrpc_status_hton(rc); + reply->ourp_lens[index] = cfs_size_round(update_result->our_datalen + + sizeof(*update_result)); + RETURN(rc); +} + +static int out_noop(struct tgt_session_info *tsi) +{ + return 0; +} + +#define DEF_OUT_HNDL(opc, name, flags, fn) \ +[opc - OUT_CREATE] = { \ + .th_name = name, \ + .th_fail_id = 0, \ + .th_opc = opc, \ + .th_flags = flags, \ + .th_act = fn, \ + .th_fmt = NULL, \ + .th_version = 0, \ +} + +static struct tgt_handler out_update_ops[] = { + DEF_OUT_HNDL(OUT_CREATE, "out_create", IS_MUTABLE | HAS_REPLY, + out_create), + DEF_OUT_HNDL(OUT_DESTROY, "out_create", IS_MUTABLE | HAS_REPLY, + out_destroy), + DEF_OUT_HNDL(OUT_REF_ADD, "out_ref_add", IS_MUTABLE | HAS_REPLY, + out_ref_add), + DEF_OUT_HNDL(OUT_REF_DEL, "out_ref_del", IS_MUTABLE | HAS_REPLY, + out_ref_del), + DEF_OUT_HNDL(OUT_ATTR_SET, "out_attr_set", IS_MUTABLE | HAS_REPLY, + out_attr_set), + DEF_OUT_HNDL(OUT_ATTR_GET, "out_attr_get", HAS_REPLY, + out_attr_get), + DEF_OUT_HNDL(OUT_XATTR_SET, "out_xattr_set", IS_MUTABLE | HAS_REPLY, + out_xattr_set), + DEF_OUT_HNDL(OUT_XATTR_DEL, "out_xattr_del", IS_MUTABLE | HAS_REPLY, + out_xattr_del), + DEF_OUT_HNDL(OUT_XATTR_GET, "out_xattr_get", HAS_REPLY, + out_xattr_get), + DEF_OUT_HNDL(OUT_INDEX_LOOKUP, "out_index_lookup", HAS_REPLY, + out_index_lookup), + DEF_OUT_HNDL(OUT_INDEX_INSERT, "out_index_insert", + IS_MUTABLE | HAS_REPLY, out_index_insert), + DEF_OUT_HNDL(OUT_INDEX_DELETE, "out_index_delete", + IS_MUTABLE | HAS_REPLY, out_index_delete), + DEF_OUT_HNDL(OUT_WRITE, "out_write", IS_MUTABLE | HAS_REPLY, out_write), + DEF_OUT_HNDL(OUT_READ, "out_read", HAS_REPLY, out_read), + DEF_OUT_HNDL(OUT_NOOP, "out_noop", HAS_REPLY, out_noop), + DEF_OUT_HNDL(OUT_XATTR_LIST, "out_xattr_list", HAS_REPLY, + out_xattr_list), +}; + +static struct tgt_handler *out_handler_find(__u32 opc) +{ + struct tgt_handler *h; + + h = NULL; + if (OUT_CREATE <= opc && opc < OUT_LAST) { + h = &out_update_ops[opc - OUT_CREATE]; + LASSERTF(h->th_opc == opc, "opcode mismatch %d != %d\n", + h->th_opc, opc); + } else { + h = NULL; /* 
unsupported opc */ + } + return h; +} + +static int out_tx_start(const struct lu_env *env, struct dt_device *dt, + struct thandle_exec_args *ta, struct obd_export *exp) +{ + ta->ta_argno = 0; + ta->ta_handle = dt_trans_create(env, dt); + if (IS_ERR(ta->ta_handle)) { + int rc; + + rc = PTR_ERR(ta->ta_handle); + ta->ta_handle = NULL; + CERROR("%s: start handle error: rc = %d\n", dt_obd_name(dt), + rc); + return rc; + } + if (exp->exp_need_sync) + ta->ta_handle->th_sync = 1; + + return 0; +} + +static int out_trans_start(const struct lu_env *env, + struct thandle_exec_args *ta) +{ + return dt_trans_start(env, ta->ta_handle->th_dev, ta->ta_handle); +} + +static int out_trans_stop(const struct lu_env *env, + struct thandle_exec_args *ta, int err) +{ + int i; + int rc; + + ta->ta_handle->th_result = err; + rc = dt_trans_stop(env, ta->ta_handle->th_dev, ta->ta_handle); + for (i = 0; i < ta->ta_argno; i++) { + if (ta->ta_args[i]->object != NULL) { + dt_object_put(env, ta->ta_args[i]->object); + ta->ta_args[i]->object = NULL; + } + } + ta->ta_handle = NULL; + ta->ta_argno = 0; + + return rc; +} + +static int out_tx_end(const struct lu_env *env, struct thandle_exec_args *ta, + int declare_ret) +{ + struct tgt_session_info *tsi = tgt_ses_info(env); + int i; + int rc; + int rc1; + ENTRY; + + if (ta->ta_handle == NULL) + RETURN(0); + + if (declare_ret != 0 || ta->ta_argno == 0) + GOTO(stop, rc = declare_ret); + + LASSERT(ta->ta_handle->th_dev != NULL); + rc = out_trans_start(env, ta); + if (unlikely(rc != 0)) + GOTO(stop, rc); + + for (i = 0; i < ta->ta_argno; i++) { + rc = ta->ta_args[i]->exec_fn(env, ta->ta_handle, + ta->ta_args[i]); + if (unlikely(rc != 0)) { + CDEBUG(D_INFO, "error during execution of #%u from" + " %s:%d: rc = %d\n", i, ta->ta_args[i]->file, + ta->ta_args[i]->line, rc); + while (--i >= 0) { + if (ta->ta_args[i]->undo_fn != NULL) + ta->ta_args[i]->undo_fn(env, + ta->ta_handle, + ta->ta_args[i]); + else + CERROR("%s: undo for %s:%d: rc = %d\n", + dt_obd_name(ta->ta_handle->th_dev), + ta->ta_args[i]->file, + ta->ta_args[i]->line, -ENOTSUPP); + } + break; + } + CDEBUG(D_INFO, "%s: executed %u/%u: rc = %d\n", + dt_obd_name(ta->ta_handle->th_dev), i, ta->ta_argno, rc); + } + + /* Only fail for real updates, XXX right now llog updates will be + * ignore, whose updates count is usually 1, so failover test + * case will spot this FAIL_UPDATE_NET_REP precisely, and it will + * be removed after async update patch is landed. */ + if (ta->ta_argno > 1) + tsi->tsi_reply_fail_id = OBD_FAIL_OUT_UPDATE_NET_REP; + +stop: + rc1 = out_trans_stop(env, ta, rc); + if (rc == 0) + rc = rc1; + + ta->ta_handle = NULL; + ta->ta_argno = 0; + + RETURN(rc); +} + +/** + * Object updates between Targets. Because all the updates has been + * dis-assemblied into object updates at sender side, so OUT will + * call OSD API directly to execute these updates. + * + * In DNE phase I all of the updates in the request need to be executed + * in one transaction, and the transaction has to be synchronously. + * + * Please refer to lustre/include/lustre/lustre_idl.h for req/reply + * format. 
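out_tx_end() above executes the queued tx_args in order and, when one of them fails, walks back through the steps that already ran and calls their undo callbacks. Here is that pattern in isolation, with a hypothetical step type rather than the thandle_exec_args API.

#include <stdio.h>

struct tx_step {
	int  (*exec_fn)(void *arg);
	void (*undo_fn)(void *arg);
	void  *arg;
};

/* run every step in order; on failure undo the completed ones, newest first */
static int run_steps(struct tx_step *steps, int nsteps)
{
	int i, rc = 0;

	for (i = 0; i < nsteps; i++) {
		rc = steps[i].exec_fn(steps[i].arg);
		if (rc != 0) {
			while (--i >= 0)
				if (steps[i].undo_fn != NULL)
					steps[i].undo_fn(steps[i].arg);
			break;
		}
	}
	return rc;
}

static int  create_exec(void *arg) { printf("create %s\n", (char *)arg); return 0; }
static void create_undo(void *arg) { printf("destroy %s\n", (char *)arg); }
static int  fail_exec(void *arg)   { (void)arg; return -1; }

int main(void)
{
	struct tx_step steps[] = {
		{ create_exec, create_undo, "objA" },
		{ fail_exec,   NULL,        NULL   },
	};

	return run_steps(steps, 2) ? 1 : 0;
}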
+ */ +int out_handle(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle_exec_args *ta = &tti->tti_tea; + struct req_capsule *pill = tsi->tsi_pill; + struct dt_device *dt = tsi->tsi_tgt->lut_bottom; + struct out_update_header *ouh; + struct out_update_buffer *oub = NULL; + struct object_update *update; + struct object_update_reply *reply; + struct ptlrpc_bulk_desc *desc = NULL; + void **update_bufs; + int current_batchid = -1; + __u32 update_buf_count; + unsigned int i; + unsigned int reply_index = 0; + int rc = 0; + int rc1 = 0; + int ouh_size, reply_size; + int updates; + bool need_reconstruct; + + ENTRY; + + req_capsule_set(pill, &RQF_OUT_UPDATE); + ouh_size = req_capsule_get_size(pill, &RMF_OUT_UPDATE_HEADER, + RCL_CLIENT); + if (ouh_size <= 0) + RETURN(err_serious(-EPROTO)); + + ouh = req_capsule_client_get(pill, &RMF_OUT_UPDATE_HEADER); + if (ouh == NULL) + RETURN(err_serious(-EPROTO)); + + if (ouh->ouh_magic != OUT_UPDATE_HEADER_MAGIC) { + CERROR("%s: invalid update buffer magic %x expect %x: " + "rc = %d\n", tgt_name(tsi->tsi_tgt), ouh->ouh_magic, + UPDATE_REQUEST_MAGIC, -EPROTO); + RETURN(err_serious(-EPROTO)); + } + + update_buf_count = ouh->ouh_count; + if (update_buf_count == 0) + RETURN(err_serious(-EPROTO)); + + OBD_ALLOC_PTR_ARRAY(update_bufs, update_buf_count); + if (update_bufs == NULL) + RETURN(err_serious(-ENOMEM)); + + if (ouh->ouh_inline_length > 0) { + update_bufs[0] = ouh->ouh_inline_data; + } else { + struct out_update_buffer *tmp; + int page_count = 0; + + oub = req_capsule_client_get(pill, &RMF_OUT_UPDATE_BUF); + if (oub == NULL) + GOTO(out_free, rc = err_serious(-EPROTO)); + + for (i = 0; i < update_buf_count; i++) + /* First *and* last might be partial pages, hence +1 */ + page_count += DIV_ROUND_UP(oub[i].oub_size, + PAGE_SIZE) + 1; + + desc = ptlrpc_prep_bulk_exp(pill->rc_req, page_count, + PTLRPC_BULK_OPS_COUNT, + PTLRPC_BULK_GET_SINK, + MDS_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(out_free, rc = err_serious(-ENOMEM)); + + tmp = oub; + for (i = 0; i < update_buf_count; i++, tmp++) { + if (tmp->oub_size >= OUT_MAXREQSIZE) + GOTO(out_free, rc = err_serious(-EPROTO)); + + OBD_ALLOC_LARGE(update_bufs[i], tmp->oub_size); + if (update_bufs[i] == NULL) + GOTO(out_free, rc = err_serious(-ENOMEM)); + + desc->bd_frag_ops->add_iov_frag(desc, update_bufs[i], + tmp->oub_size); + } + + pill->rc_req->rq_bulk_write = 1; + rc = sptlrpc_svc_prep_bulk(pill->rc_req, desc); + if (rc != 0) + GOTO(out_free, rc = err_serious(rc)); + + rc = target_bulk_io(pill->rc_req->rq_export, desc); + if (rc < 0) + GOTO(out_free, rc = err_serious(rc)); + } + /* validate the request and calculate the total update count and + * set it to reply */ + reply_size = 0; + updates = 0; + for (i = 0; i < update_buf_count; i++) { + struct object_update_request *our; + int j; + + our = update_bufs[i]; + if (req_capsule_req_need_swab(pill)) + lustre_swab_object_update_request(our, 0); + + if (our->ourq_magic != UPDATE_REQUEST_MAGIC) { + CERROR("%s: invalid update buffer magic %x" + " expect %x: rc = %d\n", + tgt_name(tsi->tsi_tgt), our->ourq_magic, + UPDATE_REQUEST_MAGIC, -EPROTO); + GOTO(out_free, rc = err_serious(-EPROTO)); + } + updates += our->ourq_count; + + /* need to calculate reply size */ + for (j = 0; j < our->ourq_count; j++) { + update = object_update_request_get(our, j, NULL); + if (update == NULL) + GOTO(out, rc = err_serious(-EPROTO)); + if (req_capsule_req_need_swab(pill)) 
+ lustre_swab_object_update(update); + + if (!fid_is_sane(&update->ou_fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt_name(tsi->tsi_tgt), + PFID(&update->ou_fid), -EPROTO); + GOTO(out, rc = err_serious(-EPROTO)); + } + + /* XXX: what ou_result_size can be considered safe? */ + + reply_size += sizeof(reply->ourp_lens[0]); + reply_size += sizeof(struct object_update_result); + reply_size += update->ou_result_size; + } + } + reply_size += sizeof(*reply); + + if (unlikely(reply_size > ouh->ouh_reply_size)) { + CERROR("%s: too small reply buf %u for %u, need %u at least\n", + tgt_name(tsi->tsi_tgt), ouh->ouh_reply_size, + updates, reply_size); + GOTO(out_free, rc = err_serious(-EPROTO)); + } + + req_capsule_set_size(pill, &RMF_OUT_UPDATE_REPLY, RCL_SERVER, + ouh->ouh_reply_size); + rc = req_capsule_server_pack(pill); + if (rc != 0) { + CERROR("%s: Can't pack response: rc = %d\n", + tgt_name(tsi->tsi_tgt), rc); + GOTO(out_free, rc = err_serious(-EPROTO)); + } + + /* Prepare the update reply buffer */ + reply = req_capsule_server_get(pill, &RMF_OUT_UPDATE_REPLY); + if (reply == NULL) + GOTO(out_free, rc = -EPROTO); + reply->ourp_magic = UPDATE_REPLY_MAGIC; + reply->ourp_count = updates; + tti->tti_u.update.tti_update_reply = reply; + tti->tti_mult_trans = !req_is_replay(tgt_ses_req(tsi)); + + need_reconstruct = out_check_resent(pill->rc_req); + + /* Walk through updates in the request to execute them */ + for (i = 0; i < update_buf_count; i++) { + struct tgt_handler *h; + struct dt_object *dt_obj; + int update_count; + struct object_update_request *our; + int j; + + our = update_bufs[i]; + update_count = our->ourq_count; + for (j = 0; j < update_count; j++) { + struct lu_object_conf conf; + + update = object_update_request_get(our, j, NULL); + if (update->ou_type == OUT_CREATE) + conf.loc_flags = LOC_F_NEW; + else + conf.loc_flags = 0; + + dt_obj = dt_locate_at(env, dt, &update->ou_fid, + dt->dd_lu_dev.ld_site->ls_top_dev, &conf); + if (IS_ERR(dt_obj)) + GOTO(out, rc = PTR_ERR(dt_obj)); + + if (dt->dd_record_fid_accessed) { + struct lfsck_req_local *lrl = &tti->tti_lrl; + + lfsck_pack_rfa(lrl, + lu_object_fid(&dt_obj->do_lu), + LEL_FID_ACCESSED, + LFSCK_TYPE_LAYOUT); + tgt_lfsck_in_notify_local(env, dt, lrl, NULL); + } + + tti->tti_u.update.tti_dt_object = dt_obj; + tti->tti_u.update.tti_update = update; + tti->tti_u.update.tti_update_reply_index = reply_index; + + h = out_handler_find(update->ou_type); + if (unlikely(h == NULL)) { + CERROR("%s: unsupported opc: 0x%x\n", + tgt_name(tsi->tsi_tgt), update->ou_type); + GOTO(next, rc = -ENOTSUPP); + } + + /* Check resend case only for modifying RPC */ + if (h->th_flags & IS_MUTABLE) { + /* sanity check for last XID changing */ + if (unlikely(!need_reconstruct && + req_xid_is_last(pill->rc_req))) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "unexpected last XID change"); + GOTO(next, rc = -EINVAL); + } + + if (need_reconstruct) { + out_reconstruct(env, dt, dt_obj, reply, + reply_index); + GOTO(next, rc = 0); + } + + if (dt->dd_rdonly) + GOTO(next, rc = -EROFS); + } + + /* start transaction for modification RPC only */ + if (h->th_flags & IS_MUTABLE && current_batchid == -1) { + current_batchid = update->ou_batchid; + + if (reply_index == 0) + CFS_RACE(OBD_FAIL_PTLRPC_RESEND_RACE); + + rc = out_tx_start(env, dt, ta, tsi->tsi_exp); + if (rc != 0) + GOTO(next, rc); + + if (update->ou_flags & UPDATE_FL_SYNC) + ta->ta_handle->th_sync = 1; + } + + /* Stop the current update transaction, if the update + * has different batchid, or read-only update */ + 
if (((current_batchid != update->ou_batchid) || + !(h->th_flags & IS_MUTABLE)) && + ta->ta_handle != NULL) { + rc = out_tx_end(env, ta, rc); + current_batchid = -1; + if (rc != 0) + GOTO(next, rc); + + /* start a new transaction if needed */ + if (h->th_flags & IS_MUTABLE) { + rc = out_tx_start(env, dt, ta, + tsi->tsi_exp); + if (rc != 0) + GOTO(next, rc); + if (update->ou_flags & UPDATE_FL_SYNC) + ta->ta_handle->th_sync = 1; + current_batchid = update->ou_batchid; + } + } + + rc = h->th_act(tsi); +next: + reply_index++; + dt_object_put(env, dt_obj); + if (rc < 0) + GOTO(out, rc); + } + } +out: + if (current_batchid != -1) { + rc1 = out_tx_end(env, ta, rc); + if (rc == 0) + rc = rc1; + } + +out_free: + if (update_bufs != NULL) { + if (oub != NULL) { + for (i = 0; i < update_buf_count; i++, oub++) { + if (update_bufs[i] != NULL) + OBD_FREE_LARGE(update_bufs[i], + oub->oub_size); + } + } + + OBD_FREE_PTR_ARRAY(update_bufs, update_buf_count); + } + + if (desc != NULL) + ptlrpc_free_bulk(desc); + + RETURN(rc); +} + +struct tgt_handler tgt_out_handlers[] = { +TGT_UPDATE_HDL(IS_MUTABLE, OUT_UPDATE, out_handle), +}; +EXPORT_SYMBOL(tgt_out_handlers); + diff --git a/drivers/staging/lustrefsx/lustre/target/out_lib.c b/drivers/staging/lustrefsx/lustre/target/out_lib.c new file mode 100644 index 0000000000000..5a0c0da4769bc --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/out_lib.c @@ -0,0 +1,1276 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * lustre/target/out_lib.c + * + * Author: Di Wang + * Author: Fan, Yong + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include + +#include "tgt_internal.h" + +const char *update_op_str(__u16 opc) +{ + static const char *const opc_str[] = { + [OUT_START] = "start", + [OUT_CREATE] = "create", + [OUT_DESTROY] = "destroy", + [OUT_REF_ADD] = "ref_add", + [OUT_REF_DEL] = "ref_del" , + [OUT_ATTR_SET] = "attr_set", + [OUT_ATTR_GET] = "attr_get", + [OUT_XATTR_SET] = "xattr_set", + [OUT_XATTR_GET] = "xattr_get", + [OUT_XATTR_LIST] = "xattr_list", + [OUT_INDEX_LOOKUP] = "lookup", + [OUT_INDEX_INSERT] = "insert", + [OUT_INDEX_DELETE] = "delete", + [OUT_WRITE] = "write", + [OUT_XATTR_DEL] = "xattr_del", + [OUT_PUNCH] = "punch", + [OUT_READ] = "read", + [OUT_NOOP] = "noop", + }; + + if (opc < ARRAY_SIZE(opc_str) && opc_str[opc] != NULL) + return opc_str[opc]; + else + return "unknown"; +} +EXPORT_SYMBOL(update_op_str); + +/** + * Fill object update header + * + * Only fill the object update header, and parameters will be filled later + * in other functions. 
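The batching rule implemented above is: mutable updates that share a batchid run inside one transaction, and a read-only update or a change of batchid closes the open transaction first. A standalone sketch of that control flow, using hypothetical types and stub tx_start()/tx_commit() helpers:

#include <stdbool.h>
#include <stdio.h>

struct demo_update { long batchid; bool is_mutable; };

static void tx_start(long id) { printf("start transaction for batch %ld\n", id); }
static void tx_commit(void)   { printf("commit transaction\n"); }

static void apply_updates(const struct demo_update *u, int n)
{
	long cur = -1;	/* -1 means no transaction is open */
	int i;

	for (i = 0; i < n; i++) {
		/* close the open transaction on a batchid change or read-only op */
		if (cur != -1 && (!u[i].is_mutable || u[i].batchid != cur)) {
			tx_commit();
			cur = -1;
		}
		/* open a transaction only for mutable updates */
		if (u[i].is_mutable && cur == -1) {
			tx_start(u[i].batchid);
			cur = u[i].batchid;
		}
		printf("  execute update %d\n", i);
	}
	if (cur != -1)
		tx_commit();
}

int main(void)
{
	struct demo_update u[] = {
		{ 1, true }, { 1, true }, { 1, false }, { 2, true },
	};

	apply_updates(u, 4);
	return 0;
}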
+ * + * \params[in] env execution environment + * \params[in] update object update to be filled + * \params[in,out] max_update_size maximum object update size, if the + * current update length equals or + * exceeds the size, it will return -E2BIG. + * \params[in] update_op update type + * \params[in] fid object FID of the update + * \params[in] param_count the count of the update parameters + * \params[in] param_sizes the length of each parameters + * + * \retval 0 if packing succeeds. + * \retval -E2BIG if packing exceeds the maximum length. + */ +int out_update_header_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, + enum update_type update_op, + const struct lu_fid *fid, + unsigned int param_count, + __u16 *param_sizes, + __u32 reply_size) +{ + struct object_update_param *param; + unsigned int i; + size_t update_size; + + if (reply_size >= LNET_MTU) + return -EINVAL; + + /* Check whether the packing exceeding the maxima update length */ + update_size = sizeof(*update); + for (i = 0; i < param_count; i++) + update_size += cfs_size_round(sizeof(*param) + param_sizes[i]); + + if (unlikely(update_size >= *max_update_size)) { + *max_update_size = update_size; + return -E2BIG; + } + + update->ou_fid = *fid; + update->ou_type = update_op; + update->ou_params_count = param_count; + update->ou_result_size = reply_size; + param = &update->ou_params[0]; + for (i = 0; i < param_count; i++) { + param->oup_len = param_sizes[i]; + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + } + + return 0; +} + +/** + * Packs one update into the update_buffer. + * + * \param[in] env execution environment + * \param[in] update update to be packed + * \param[in] max_update_size *maximum size of \a update + * \param[in] op update operation (enum update_type) + * \param[in] fid object FID for this update + * \param[in] param_count number of parameters for this update + * \param[in] param_sizes array of parameters length of this update + * \param[in] param_bufs parameter buffers + * + * \retval = 0 if updates packing succeeds + * \retval negative errno if updates packing fails + **/ +int out_update_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, enum update_type op, + const struct lu_fid *fid, unsigned int param_count, + __u16 *param_sizes, const void **param_bufs, + __u32 reply_size) +{ + struct object_update_param *param; + unsigned int i; + int rc; + ENTRY; + + rc = out_update_header_pack(env, update, max_update_size, op, fid, + param_count, param_sizes, reply_size); + if (rc != 0) + RETURN(rc); + + param = &update->ou_params[0]; + for (i = 0; i < param_count; i++) { + memcpy(¶m->oup_buf[0], param_bufs[i], param_sizes[i]); + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + } + + RETURN(0); +} +EXPORT_SYMBOL(out_update_pack); + +/** + * Pack various updates into the update_buffer. + * + * The following functions pack different updates into the update_buffer + * So parameters of these API is basically same as its correspondent OSD/OSP + * API, for detail description of these parameters see osd_handler.c or + * osp_md_object.c. + * + * \param[in] env execution environment + * \param[in] ubuf update buffer + * \param[in] fid fid of this object for the update + * + * \retval 0 if insertion succeeds. + * \retval negative errno if insertion fails. 
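The size check in out_update_header_pack() sums the fixed update header plus each parameter rounded up to an 8-byte boundary, and reports the required size back through max_update_size when the buffer is too small. The same accounting in standalone form; the 32-byte and 8-byte header sizes below are illustrative stand-ins, not the real structure sizes.

#include <stddef.h>
#include <stdio.h>

#define ROUND8(x)      (((x) + 7) & ~(size_t)7)
#define HDR_SIZE       32   /* stand-in for sizeof(struct object_update) */
#define PARAM_HDR_SIZE 8    /* stand-in for sizeof(struct object_update_param) */

static int update_size_check(const unsigned short *param_sizes, int count,
			     size_t *max_size)
{
	size_t total = HDR_SIZE;
	int i;

	for (i = 0; i < count; i++)
		total += ROUND8(PARAM_HDR_SIZE + param_sizes[i]);

	if (total >= *max_size) {
		*max_size = total;  /* tell the caller how much is needed */
		return -1;          /* the caller treats this like -E2BIG */
	}
	return 0;
}

int main(void)
{
	unsigned short sizes[2] = { 13, 200 };
	size_t max = 128;

	if (update_size_check(sizes, 2, &max))
		printf("buffer too small, need %zu bytes\n", max);
	return 0;
}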
+ */ +int out_create_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof) +{ + struct obdo *obdo; + __u16 sizes[2] = {sizeof(*obdo), 0}; + int buf_count = 1; + const struct lu_fid *parent_fid = NULL; + int rc; + ENTRY; + + if (hint != NULL && hint->dah_parent) { + parent_fid = lu_object_fid(&hint->dah_parent->do_lu); + sizes[1] = sizeof(*parent_fid); + buf_count++; + } + + rc = out_update_header_pack(env, update, max_update_size, OUT_CREATE, + fid, buf_count, sizes, 0); + if (rc != 0) + RETURN(rc); + + obdo = object_update_param_get(update, 0, NULL); + if (IS_ERR(obdo)) + RETURN(PTR_ERR(obdo)); + + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + + if (parent_fid != NULL) { + struct lu_fid *tmp; + + tmp = object_update_param_get(update, 1, NULL); + if (IS_ERR(tmp)) + RETURN(PTR_ERR(tmp)); + + fid_cpu_to_le(tmp, parent_fid); + } + + RETURN(0); +} +EXPORT_SYMBOL(out_create_pack); + +int out_ref_del_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_REF_DEL, fid, + 0, NULL, NULL, 0); +} +EXPORT_SYMBOL(out_ref_del_pack); + +int out_ref_add_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_REF_ADD, fid, + 0, NULL, NULL, 0); +} +EXPORT_SYMBOL(out_ref_add_pack); + +int out_attr_set_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_attr *attr) +{ + struct obdo *obdo; + __u16 size = sizeof(*obdo); + int rc; + ENTRY; + + rc = out_update_header_pack(env, update, max_update_size, + OUT_ATTR_SET, fid, 1, &size, 0); + if (rc != 0) + RETURN(rc); + + obdo = object_update_param_get(update, 0, NULL); + if (IS_ERR(obdo)) + RETURN(PTR_ERR(obdo)); + + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + + RETURN(0); +} +EXPORT_SYMBOL(out_attr_set_pack); + +int out_xattr_set_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_buf *buf, const char *name, __u32 flag) +{ + __u16 sizes[3] = {strlen(name) + 1, buf->lb_len, sizeof(flag)}; + const void *bufs[3] = {(char *)name, (char *)buf->lb_buf, + (char *)&flag}; + + return out_update_pack(env, update, max_update_size, OUT_XATTR_SET, + fid, ARRAY_SIZE(sizes), sizes, bufs, 0); +} +EXPORT_SYMBOL(out_xattr_set_pack); + +int out_xattr_del_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const char *name) +{ + __u16 size = strlen(name) + 1; + + return out_update_pack(env, update, max_update_size, OUT_XATTR_DEL, + fid, 1, &size, (const void **)&name, 0); +} +EXPORT_SYMBOL(out_xattr_del_pack); + +int out_index_insert_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct dt_rec *rec, const struct dt_key *key) +{ + struct dt_insert_rec *rec1 = (struct dt_insert_rec *)rec; + struct lu_fid rec_fid; + __u32 type = cpu_to_le32(rec1->rec_type); + __u16 sizes[3] = { strlen((char *)key) + 1, + sizeof(rec_fid), + sizeof(type) }; + const void *bufs[3] = { (char *)key, + (char *)&rec_fid, + (char *)&type }; + + fid_cpu_to_le(&rec_fid, rec1->rec_fid); + + 
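out_create_pack() and out_index_insert_pack() above convert FIDs and record types to little-endian before copying them into the update, so only a receiver with the opposite byte order has to swab. A generic illustration of that sender-side conversion with a hypothetical two-field record (not the real lu_fid layout):

#include <stdint.h>
#include <stdio.h>

struct demo_rec { uint64_t seq; uint32_t oid; };	/* hypothetical FID-like record */

static void put_le64(uint8_t *p, uint64_t v)
{
	int i;

	for (i = 0; i < 8; i++)
		p[i] = (uint8_t)(v >> (8 * i));
}

static void put_le32(uint8_t *p, uint32_t v)
{
	int i;

	for (i = 0; i < 4; i++)
		p[i] = (uint8_t)(v >> (8 * i));
}

/* serialize the record in little-endian order regardless of host endianness */
static void rec_to_wire(uint8_t wire[12], const struct demo_rec *r)
{
	put_le64(wire, r->seq);
	put_le32(wire + 8, r->oid);
}

int main(void)
{
	struct demo_rec r = { .seq = 0x200000401ULL, .oid = 7 };
	uint8_t wire[12];

	rec_to_wire(wire, &r);
	printf("first wire byte: 0x%02x\n", wire[0]);	/* 0x01 on any host */
	return 0;
}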
return out_update_pack(env, update, max_update_size, OUT_INDEX_INSERT, + fid, ARRAY_SIZE(sizes), sizes, bufs, 0); +} +EXPORT_SYMBOL(out_index_insert_pack); + +int out_index_delete_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct dt_key *key) +{ + __u16 size = strlen((char *)key) + 1; + const void *buf = key; + + return out_update_pack(env, update, max_update_size, OUT_INDEX_DELETE, + fid, 1, &size, &buf, 0); +} +EXPORT_SYMBOL(out_index_delete_pack); + +int out_destroy_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_DESTROY, fid, + 0, NULL, NULL, 0); +} +EXPORT_SYMBOL(out_destroy_pack); + +int out_write_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_buf *buf, __u64 pos) +{ + __u16 sizes[2] = {buf->lb_len, sizeof(pos)}; + const void *bufs[2] = {(char *)buf->lb_buf, (char *)&pos}; + int rc; + + pos = cpu_to_le64(pos); + + rc = out_update_pack(env, update, max_update_size, OUT_WRITE, fid, + ARRAY_SIZE(sizes), sizes, bufs, 0); + return rc; +} +EXPORT_SYMBOL(out_write_pack); + +/** + * Pack various readonly updates into the update_buffer. + * + * The following update funcs are only used by read-only ops, lookup, + * getattr etc, so it does not need transaction here. Currently they + * are only used by OSP. + * + * \param[in] env execution environment + * \param[in] fid fid of this object for the update + * \param[in] ubuf update buffer + * + * \retval = 0 pack succeed. + * < 0 pack failed. + **/ +int out_index_lookup_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + struct dt_rec *rec, const struct dt_key *key) +{ + const void *name = key; + __u16 size = strlen((char *)name) + 1; + + /* XXX: this shouldn't be hardcoded */ + return out_update_pack(env, update, max_update_size, OUT_INDEX_LOOKUP, + fid, 1, &size, &name, 256); +} +EXPORT_SYMBOL(out_index_lookup_pack); + +int out_attr_get_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_ATTR_GET, + fid, 0, NULL, NULL, sizeof(struct obdo)); +} +EXPORT_SYMBOL(out_attr_get_pack); + +int out_xattr_get_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const char *name, const int bufsize) +{ + __u16 size; + + LASSERT(name != NULL); + size = strlen(name) + 1; + + return out_update_pack(env, update, max_update_size, OUT_XATTR_GET, + fid, 1, &size, (const void **)&name, bufsize); +} +EXPORT_SYMBOL(out_xattr_get_pack); + +int out_xattr_list_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const int bufsize) +{ + return out_update_pack(env, update, max_update_size, OUT_XATTR_LIST, + fid, 0, NULL, NULL, bufsize); +} +EXPORT_SYMBOL(out_xattr_list_pack); + +int out_read_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + size_t size, loff_t pos) +{ + __u16 sizes[2] = {sizeof(size), sizeof(pos)}; + const void *bufs[2] = {&size, &pos}; + + LASSERT(size > 0); + size = cpu_to_le64(size); + pos = cpu_to_le64(pos); + + return out_update_pack(env, update, max_update_size, OUT_READ, fid, + 
ARRAY_SIZE(sizes), sizes, bufs, size); +} +EXPORT_SYMBOL(out_read_pack); + +static int tx_extend_args(struct thandle_exec_args *ta, int new_alloc_ta) +{ + struct tx_arg **new_ta; + int i; + int rc = 0; + + if (ta->ta_alloc_args >= new_alloc_ta) + return 0; + + OBD_ALLOC_PTR_ARRAY(new_ta, new_alloc_ta); + if (new_ta == NULL) + return -ENOMEM; + + for (i = 0; i < new_alloc_ta; i++) { + if (i < ta->ta_alloc_args) { + /* copy the old args to new one */ + new_ta[i] = ta->ta_args[i]; + } else { + OBD_ALLOC_PTR(new_ta[i]); + if (new_ta[i] == NULL) + GOTO(out, rc = -ENOMEM); + } + } + + /* free the old args */ + if (ta->ta_args != NULL) + OBD_FREE_PTR_ARRAY(ta->ta_args, ta->ta_alloc_args); + + ta->ta_args = new_ta; + ta->ta_alloc_args = new_alloc_ta; +out: + if (rc != 0) { + for (i = 0; i < new_alloc_ta; i++) { + if (new_ta[i] != NULL) + OBD_FREE_PTR(new_ta[i]); + } + OBD_FREE_PTR_ARRAY(new_ta, new_alloc_ta); + } + return rc; +} + +#define TX_ALLOC_STEP 8 +struct tx_arg *tx_add_exec(struct thandle_exec_args *ta, + tx_exec_func_t func, tx_exec_func_t undo, + const char *file, int line) +{ + int rc; + int i; + + LASSERT(ta != NULL); + LASSERT(func != NULL); + + if (ta->ta_argno + 1 >= ta->ta_alloc_args) { + rc = tx_extend_args(ta, ta->ta_alloc_args + TX_ALLOC_STEP); + if (rc != 0) + return ERR_PTR(rc); + } + + i = ta->ta_argno; + + ta->ta_argno++; + + ta->ta_args[i]->exec_fn = func; + ta->ta_args[i]->undo_fn = undo; + ta->ta_args[i]->file = file; + ta->ta_args[i]->line = line; + + return ta->ta_args[i]; +} + +static int out_obj_destroy(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle *th) +{ + int rc; + + CDEBUG(D_INFO, "%s: destroy "DFID"\n", dt_obd_name(th->th_dev), + PFID(lu_object_fid(&dt_obj->do_lu))); + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_destroy(env, dt_obj, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +/** + * All of the xxx_undo will be used once execution failed, + * But because all of the required resource has been reserved in + * declare phase, i.e. if declare succeed, it should make sure + * the following executing phase succeed in anyway, so these undo + * should be useless for most of the time in Phase I + */ +static int out_tx_create_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + int rc; + + rc = out_obj_destroy(env, arg->object, th); + if (rc != 0) + CERROR("%s: undo failure, we are doomed!: rc = %d\n", + dt_obd_name(th->th_dev), rc); + return rc; +} + +int out_tx_create_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_OTHER, "%s: create "DFID": dof %u, mode %o\n", + dt_obd_name(th->th_dev), + PFID(lu_object_fid(&arg->object->do_lu)), + arg->u.create.dof.dof_type, + arg->u.create.attr.la_mode & S_IFMT); + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_create(env, dt_obj, &arg->u.create.attr, + &arg->u.create.hint, &arg->u.create.dof, th); + + dt_write_unlock(env, dt_obj); + + CDEBUG(D_INFO, "%s: insert create reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +/** + * Add create update to thandle + * + * Declare create updates and add the update to the thandle updates + * exec array. 
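tx_extend_args() above grows the per-transaction argument list in fixed steps: it allocates a larger pointer array, carries over the existing entries by pointer, allocates only the new slots, and frees just the old pointer array. The same idea in a standalone sketch, with plain malloc/free standing in for the OBD_ALLOC helpers and hypothetical demo types:

#include <stdlib.h>

struct tx_arg_demo { int dummy; };

struct ta_demo {
	struct tx_arg_demo **args;
	int alloc;  /* slots allocated */
	int used;   /* slots in use */
};

static int ta_extend(struct ta_demo *ta, int new_alloc)
{
	struct tx_arg_demo **new_args;
	int i;

	if (ta->alloc >= new_alloc)
		return 0;

	new_args = calloc(new_alloc, sizeof(*new_args));
	if (new_args == NULL)
		return -1;

	for (i = 0; i < new_alloc; i++) {
		if (i < ta->alloc) {
			new_args[i] = ta->args[i];	/* keep the old entry */
		} else {
			new_args[i] = calloc(1, sizeof(**new_args));
			if (new_args[i] == NULL)
				goto fail;
		}
	}

	free(ta->args);		/* frees only the old pointer array */
	ta->args = new_args;
	ta->alloc = new_alloc;
	return 0;

fail:
	/* release only the slots this call allocated (free(NULL) is a no-op) */
	for (i = ta->alloc; i < new_alloc; i++)
		free(new_args[i]);
	free(new_args);
	return -1;
}

int main(void)
{
	struct ta_demo ta = { NULL, 0, 0 };
	int i;

	if (ta_extend(&ta, 8) == 0) {
		for (i = 0; i < ta.alloc; i++)
			free(ta.args[i]);
		free(ta.args);
	}
	return 0;
}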
+ * + * \param [in] env execution environment + * \param [in] obj object to be created + * \param [in] attr attributes of the creation + * \param [in] parent_fid the fid of the parent + * \param [in] dof dt object format of the creation + * \param [in] ta thandle execuation args where all of updates + * of the transaction are stored + * \param [in] th thandle for this update + * \param [in] reply reply of the updates + * \param [in] index index of the reply + * \param [in] file the file name where the function is called, + * which is only for debugging purpose. + * \param [in] line the line number where the funtion is called, + * which is only for debugging purpose. + * + * \retval 0 if updates is added successfully. + * \retval negative errno if update adding fails. + */ +int out_create_add_exec(const struct lu_env *env, struct dt_object *obj, + struct lu_attr *attr, struct lu_fid *parent_fid, + struct dt_object_format *dof, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + /* LU-13653: ignore quota for DNE directory creation */ + if (dof->dof_type == DFT_DIR) + th->th_ignore_quota = 1; + + rc = dt_declare_create(env, obj, attr, NULL, dof, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_create_exec, out_tx_create_undo, file, + line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + /* release the object in out_trans_stop */ + lu_object_get(&obj->do_lu); + arg->object = obj; + arg->u.create.attr = *attr; + if (parent_fid != NULL) + arg->u.create.fid = *parent_fid; + memset(&arg->u.create.hint, 0, sizeof(arg->u.create.hint)); + arg->u.create.dof = *dof; + arg->reply = reply; + arg->index = index; + + return 0; +} + +static int out_tx_attr_set_undo(const struct lu_env *env, + struct thandle *th, struct tx_arg *arg) +{ + CERROR("%s: attr set undo "DFID" unimplemented yet!: rc = %d\n", + dt_obd_name(th->th_dev), + PFID(lu_object_fid(&arg->object->do_lu)), -ENOTSUPP); + + return -ENOTSUPP; +} + +static int out_tx_attr_set_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_OTHER, "%s: attr set "DFID"\n", dt_obd_name(th->th_dev), + PFID(lu_object_fid(&dt_obj->do_lu))); + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_attr_set(env, dt_obj, &arg->u.attr_set.attr, th); + dt_write_unlock(env, dt_obj); + + CDEBUG(D_INFO, "%s: insert attr_set reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, + arg->index, rc); + + return rc; +} + +int out_attr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_attr *attr, + struct thandle_exec_args *ta, + struct thandle *th, struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_attr_set(env, dt_obj, attr, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_attr_set_exec, out_tx_attr_set_undo, + file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.attr_set.attr = *attr; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_tx_write_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_INFO, "write "DFID" pos %llu buf %p, len 
%lu\n", + PFID(lu_object_fid(&dt_obj->do_lu)), arg->u.write.pos, + arg->u.write.buf.lb_buf, (unsigned long)arg->u.write.buf.lb_len); + + if (OBD_FAIL_CHECK(OBD_FAIL_OUT_ENOSPC)) { + rc = -ENOSPC; + } else { + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_record_write(env, dt_obj, &arg->u.write.buf, + &arg->u.write.pos, th); + dt_write_unlock(env, dt_obj); + + if (rc == 0) + rc = arg->u.write.buf.lb_len; + } + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc < 0 ? rc : 0); + + return rc > 0 ? 0 : rc; +} + +int out_write_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, loff_t pos, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_record_write(env, dt_obj, buf, pos, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_write_exec, NULL, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.write.buf = *buf; + arg->u.write.pos = pos; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_tx_xattr_set_exec(const struct lu_env *env, + struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + ENTRY; + + CDEBUG(D_INFO, "%s: set xattr buf %p name %s flag %d\n", + dt_obd_name(th->th_dev), arg->u.xattr_set.buf.lb_buf, + arg->u.xattr_set.name, arg->u.xattr_set.flags); + + if (!lu_object_exists(&dt_obj->do_lu) || + OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) { + rc = -ENOENT; + } else { + struct linkea_data ldata = { 0 }; + bool linkea; + + ldata.ld_buf = &arg->u.xattr_set.buf; + if (strcmp(arg->u.xattr_set.name, XATTR_NAME_LINK) == 0) { + struct link_ea_header *leh; + + linkea = true; + rc = linkea_init(&ldata); + if (unlikely(rc)) + GOTO(out, rc == -ENODATA ? -EINVAL : rc); + + leh = ldata.ld_leh; + LASSERT(leh != NULL); + + /* If the new linkEA contains overflow timestamp, + * then two cases: + * + * 1. The old linkEA for the object has already + * overflowed before current setting, the new + * linkEA does not contains new link entry. So + * the linkEA overflow timestamp is unchanged. + * + * 2. There are new link entry in the new linkEA, + * so its overflow timestamp is differnt from + * the old one. Usually, the overstamp in the + * given linkEA is newer. But because of clock + * drift among MDTs, the timestamp may become + * older. So here, we convert the timestamp to + * the server local time. Then namespace LFSCK + * that uses local time can handle it easily. */ + if (unlikely(leh->leh_overflow_time)) { + struct lu_buf tbuf = { 0 }; + bool update = false; + + lu_buf_alloc(&tbuf, MAX_LINKEA_SIZE); + if (tbuf.lb_buf == NULL) + GOTO(unlock, rc = -ENOMEM); + + rc = dt_xattr_get(env, dt_obj, &tbuf, + XATTR_NAME_LINK); + if (rc > 0) { + struct linkea_data tdata = { 0 }; + + tdata.ld_buf = &tbuf; + rc = linkea_init(&tdata); + if (rc || leh->leh_overflow_time != + tdata.ld_leh->leh_overflow_time) + update = true; + } else { + /* Update the timestamp by force if + * fail to load the old linkEA. 
*/ + update = true; + } + + lu_buf_free(&tbuf); + if (update) { + leh->leh_overflow_time = ktime_get_real_seconds(); + if (unlikely(!leh->leh_overflow_time)) + leh->leh_overflow_time++; + } + } + } else { + linkea = false; + } + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + +again: + rc = dt_xattr_set(env, dt_obj, ldata.ld_buf, + arg->u.xattr_set.name, arg->u.xattr_set.flags, + th); + if (unlikely(rc == -ENOSPC && linkea)) { + rc = linkea_overflow_shrink(&ldata); + if (likely(rc > 0)) { + arg->u.xattr_set.buf.lb_len = rc; + goto again; + } + } + +unlock: + dt_write_unlock(env, dt_obj); + } + + GOTO(out, rc); + +out: + CDEBUG(D_INFO, "%s: insert xattr set reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +int out_xattr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, const char *name, + int flags, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_xattr_set(env, dt_obj, buf, name, flags, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_xattr_set_exec, NULL, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.xattr_set.name = name; + arg->u.xattr_set.flags = flags; + arg->u.xattr_set.buf = *buf; + arg->reply = reply; + arg->index = index; + arg->u.xattr_set.csum = 0; + return 0; +} + +static int out_tx_xattr_del_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_INFO, "%s: del xattr name '%s' on "DFID"\n", + dt_obd_name(th->th_dev), arg->u.xattr_set.name, + PFID(lu_object_fid(&dt_obj->do_lu))); + + if (!lu_object_exists(&dt_obj->do_lu)) + GOTO(out, rc = -ENOENT); + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_xattr_del(env, dt_obj, arg->u.xattr_set.name, + th); + dt_write_unlock(env, dt_obj); +out: + CDEBUG(D_INFO, "%s: insert xattr del reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +int out_xattr_del_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const char *name, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_xattr_del(env, dt_obj, name, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_xattr_del_exec, NULL, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.xattr_set.name = name; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_obj_ref_add(const struct lu_env *env, + struct dt_object *dt_obj, + struct thandle *th) +{ + int rc; + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_ref_add(env, dt_obj, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_obj_ref_del(const struct lu_env *env, + struct dt_object *dt_obj, + struct thandle *th) +{ + int rc; + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_ref_del(env, dt_obj, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_tx_ref_add_exec(const struct lu_env 
*env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + rc = out_obj_ref_add(env, dt_obj, th); + + CDEBUG(D_INFO, "%s: insert ref_add reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + return rc; +} + +static int out_tx_ref_add_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + return out_obj_ref_del(env, arg->object, th); +} + +int out_ref_add_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_ref_add(env, dt_obj, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_ref_add_exec, out_tx_ref_add_undo, file, + line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_tx_ref_del_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + rc = out_obj_ref_del(env, dt_obj, th); + + CDEBUG(D_INFO, "%s: insert ref_del reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, 0); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +static int out_tx_ref_del_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + return out_obj_ref_add(env, arg->object, th); +} + +int out_ref_del_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_ref_del(env, dt_obj, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_ref_del_exec, out_tx_ref_del_undo, file, + line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_obj_index_insert(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + int rc; + + CDEBUG(D_INFO, "%s: index insert "DFID" name: %s fid "DFID", type %u\n", + dt_obd_name(th->th_dev), PFID(lu_object_fid(&dt_obj->do_lu)), + (char *)key, PFID(((struct dt_insert_rec *)rec)->rec_fid), + ((struct dt_insert_rec *)rec)->rec_type); + + if (dt_try_as_dir(env, dt_obj) == 0) + return -ENOTDIR; + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_insert(env, dt_obj, rec, key, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_obj_index_delete(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_key *key, + struct thandle *th) +{ + int rc; + + CDEBUG(D_INFO, "%s: index delete "DFID" name: %s\n", + dt_obd_name(th->th_dev), PFID(lu_object_fid(&dt_obj->do_lu)), + (char *)key); + + if (dt_try_as_dir(env, dt_obj) == 0) + return -ENOTDIR; + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_delete(env, dt_obj, key, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_tx_index_insert_exec(const struct lu_env *env, + struct thandle *th, struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + if 
(unlikely(!dt_object_exists(dt_obj))) + RETURN(-ESTALE); + + rc = out_obj_index_insert(env, dt_obj, + (const struct dt_rec *)&arg->u.insert.rec, + arg->u.insert.key, th); + + CDEBUG(D_INFO, "%s: insert idx insert reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + return rc; +} + +static int out_tx_index_insert_undo(const struct lu_env *env, + struct thandle *th, struct tx_arg *arg) +{ + return out_obj_index_delete(env, arg->object, arg->u.insert.key, th); +} + +int out_index_insert_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + if (dt_try_as_dir(env, dt_obj) == 0) { + rc = -ENOTDIR; + return rc; + } + + rc = dt_declare_insert(env, dt_obj, rec, key, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_index_insert_exec, + out_tx_index_insert_undo, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + arg->u.insert.rec = *(const struct dt_insert_rec *)rec; + arg->u.insert.key = key; + + return 0; +} + +static int out_tx_index_delete_exec(const struct lu_env *env, + struct thandle *th, + struct tx_arg *arg) +{ + int rc; + + rc = out_obj_index_delete(env, arg->object, arg->u.insert.key, th); + + CDEBUG(D_INFO, "%s: delete idx insert reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +static int out_tx_index_delete_undo(const struct lu_env *env, + struct thandle *th, + struct tx_arg *arg) +{ + CERROR("%s: Oops, can not rollback index_delete yet: rc = %d\n", + dt_obd_name(th->th_dev), -ENOTSUPP); + return -ENOTSUPP; +} + +int out_index_delete_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + if (dt_try_as_dir(env, dt_obj) == 0) { + rc = -ENOTDIR; + return rc; + } + + LASSERT(ta->ta_handle != NULL); + rc = dt_declare_delete(env, dt_obj, key, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_index_delete_exec, + out_tx_index_delete_undo, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + arg->u.insert.key = key; + return 0; +} + +static int out_tx_destroy_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + rc = out_obj_destroy(env, dt_obj, th); + + CDEBUG(D_INFO, "%s: insert destroy reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + RETURN(rc); +} + +static int out_tx_destroy_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + CERROR("%s: not support destroy undo yet!: rc = %d\n", + dt_obd_name(th->th_dev), -ENOTSUPP); + return -ENOTSUPP; +} + +int out_destroy_add_exec(const struct lu_env 
*env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_destroy(env, dt_obj, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_destroy_exec, out_tx_destroy_undo, + file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c b/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c new file mode 100644 index 0000000000000..afbf668e38a70 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c @@ -0,0 +1,363 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + * + * Copyright (c) 2019, DDN Storage Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/target/tgt_fmd.c + * + * This file provides functions to handle Filter Modification Data (FMD). + * The FMD is responsible for file attributes to be applied in + * Transaction ID (XID) order, so older requests can't re-write newer + * attributes. + * + * FMD is organized as per-client list and identified by FID of object. Each + * FMD stores FID of object and the highest received XID of modification + * request for this object. + * + * FMD can expire if there are no updates for a long time to keep the list + * reasonably small. + * + * Author: Andreas Dilger + * Author: Mike Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include "tgt_internal.h" + +/** + * Drop FMD reference and free it if reference drops to zero. + * + * Must be called with ted_fmd_lock held. + * + * \param[in] exp OBD export + * \param[in] fmd FMD to put + */ +static inline void tgt_fmd_put_nolock(struct obd_export *exp, + struct tgt_fmd_data *fmd) +{ + struct tg_export_data *ted = &exp->exp_target_data; + + assert_spin_locked(&ted->ted_fmd_lock); + if (--fmd->fmd_refcount == 0) { + ted->ted_fmd_count--; + list_del(&fmd->fmd_list); + OBD_SLAB_FREE_PTR(fmd, tgt_fmd_kmem); + } +} + +/** + * Wrapper to drop FMD reference with ted_fmd_lock held. + * + * \param[in] exp OBD export + * \param[in] fmd FMD to put + */ +void tgt_fmd_put(struct obd_export *exp, struct tgt_fmd_data *fmd) +{ + struct tg_export_data *ted = &exp->exp_target_data; + + spin_lock(&ted->ted_fmd_lock); + tgt_fmd_put_nolock(exp, fmd); /* caller reference */ + spin_unlock(&ted->ted_fmd_lock); +} + +/** + * Expire FMD entries. 
+ * + * Expire entries from the FMD list if there are too many + * of them or they are too old. + * + * This function must be called with ted_fmd_lock held. + * + * The \a keep FMD is not to be expired in any case. This parameter is used + * by ofd_fmd_find_nolock() to prohibit a FMD that was just found from + * expiring. + * + * \param[in] exp OBD export + * \param[in] keep FMD to keep always + */ +static void tgt_fmd_expire_nolock(struct obd_export *exp, + struct tgt_fmd_data *keep) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + time64_t now = ktime_get_seconds(); + struct tgt_fmd_data *fmd, *tmp; + + list_for_each_entry_safe(fmd, tmp, &ted->ted_fmd_list, fmd_list) { + if (fmd == keep) + break; + + if (now < fmd->fmd_expire && + ted->ted_fmd_count < lut->lut_fmd_max_num) + break; + + list_del_init(&fmd->fmd_list); + tgt_fmd_put_nolock(exp, fmd); /* list reference */ + } +} + +/** + * Expire FMD entries. + * + * This is a wrapper to call ofd_fmd_expire_nolock() with the required lock. + * + * \param[in] exp OBD export + */ +void tgt_fmd_expire(struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + + spin_lock(&ted->ted_fmd_lock); + tgt_fmd_expire_nolock(exp, NULL); + spin_unlock(&ted->ted_fmd_lock); +} + +/** + * Find FMD by specified FID. + * + * Function finds FMD entry by FID in the tg_export_data::ted_fmd_list. + * + * Caller must hold tg_export_data::ted_fmd_lock and take FMD reference. + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * + * \retval struct tgt_fmd_data found by FID + * \retval NULL is FMD is not found + */ +static struct tgt_fmd_data *tgt_fmd_find_nolock(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *found = NULL, *fmd; + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + time64_t now = ktime_get_seconds(); + + assert_spin_locked(&ted->ted_fmd_lock); + + list_for_each_entry_reverse(fmd, &ted->ted_fmd_list, fmd_list) { + if (lu_fid_eq(&fmd->fmd_fid, fid)) { + found = fmd; + list_move_tail(&fmd->fmd_list, &ted->ted_fmd_list); + fmd->fmd_expire = now + lut->lut_fmd_max_age; + break; + } + } + + tgt_fmd_expire_nolock(exp, found); + + return found; +} + +/** + * Find FMD by specified FID with locking. + * + * Wrapper to the ofd_fmd_find_nolock() with correct locks. + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * + * \retval struct tgt_fmd_data found by FID + * \retval NULL indicates FMD is not found + */ +struct tgt_fmd_data *tgt_fmd_find(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *fmd; + + spin_lock(&ted->ted_fmd_lock); + fmd = tgt_fmd_find_nolock(exp, fid); + if (fmd) + fmd->fmd_refcount++; /* caller reference */ + spin_unlock(&ted->ted_fmd_lock); + + return fmd; +} + +/** + * Find FMD by FID or create a new one if none is found. + * + * It is possible for this function to return NULL under memory pressure, + * or if the passed FID is zero (which will only cause old entries to expire). + * Currently this is not fatal because any FMD state is transient and + * may also be freed when it gets sufficiently old. 
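The XID-ordering rule above can be sketched in a few lines of standalone C. The following is only an illustrative userspace model, not code from this patch: the names (fmd_entry, fmd_get, fmd_update, fmd_check, FMD_MAX_AGE) are hypothetical, a fixed-size table replaces the per-export list, and all locking is omitted; it only demonstrates the rule that an attribute update is applied when no newer XID has been recorded for the object.
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#define FMD_MAX_ENTRIES 16
#define FMD_MAX_AGE     600	/* seconds, in the spirit of lut_fmd_max_age */
struct fmd_entry {
	uint64_t fid;			/* stands in for struct lu_fid */
	uint64_t mactime_xid;		/* highest XID that modified [acm]time */
	time_t expire;
	bool used;
};
static struct fmd_entry fmd_table[FMD_MAX_ENTRIES];
/* find-or-create, refreshing the expiry the way tgt_fmd_get() does */
static struct fmd_entry *fmd_get(uint64_t fid)
{
	time_t now = time(NULL);
	struct fmd_entry *free_slot = NULL;
	int i;
	for (i = 0; i < FMD_MAX_ENTRIES; i++) {
		struct fmd_entry *e = &fmd_table[i];
		if (e->used && e->expire < now)	/* expire stale entries */
			e->used = false;
		if (e->used && e->fid == fid) {
			e->expire = now + FMD_MAX_AGE;
			return e;
		}
		if (!e->used)
			free_slot = e;
	}
	if (free_slot)
		*free_slot = (struct fmd_entry){ .fid = fid, .used = true,
						 .expire = now + FMD_MAX_AGE };
	return free_slot;		/* may be NULL, which is not fatal */
}
/* analogue of tgt_fmd_update(): only ever move the recorded XID forward */
static void fmd_update(uint64_t fid, uint64_t xid)
{
	struct fmd_entry *e = fmd_get(fid);
	if (e && e->mactime_xid < xid)
		e->mactime_xid = xid;
}
/* analogue of tgt_fmd_check(): timestamps may only be set by a request
 * carrying a newer XID than the one already recorded for this object */
static bool fmd_check(uint64_t fid, uint64_t xid)
{
	int i;
	for (i = 0; i < FMD_MAX_ENTRIES; i++)
		if (fmd_table[i].used && fmd_table[i].fid == fid)
			return fmd_table[i].mactime_xid < xid;
	return true;			/* nothing recorded, OK to update */
}
int main(void)
{
	fmd_update(42, 100);				/* setattr carried by XID 100 */
	printf("replayed XID 90 may set times: %d\n", fmd_check(42, 90));
	printf("newer XID 110 may set times:   %d\n", fmd_check(42, 110));
	return 0;
}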
+ * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * + * \retval struct tgt_fmd_data found by FID + * \retval NULL indicates FMD is not found + */ +struct tgt_fmd_data *tgt_fmd_get(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *found = NULL, *fmd_new = NULL; + + OBD_SLAB_ALLOC_PTR(fmd_new, tgt_fmd_kmem); + + spin_lock(&ted->ted_fmd_lock); + found = tgt_fmd_find_nolock(exp, fid); + if (fmd_new) { + if (!found) { + list_add_tail(&fmd_new->fmd_list, &ted->ted_fmd_list); + fmd_new->fmd_fid = *fid; + fmd_new->fmd_refcount++; /* list reference */ + found = fmd_new; + ted->ted_fmd_count++; + } else { + OBD_SLAB_FREE_PTR(fmd_new, tgt_fmd_kmem); + } + } + if (found) { + found->fmd_refcount++; /* caller reference */ + found->fmd_expire = ktime_get_seconds() + + class_exp2tgt(exp)->lut_fmd_max_age; + } else { + LCONSOLE_WARN("%s: cannot allocate FMD for "DFID + ", timestamps may be out of sync\n", + exp->exp_obd->obd_name, PFID(fid)); + } + spin_unlock(&ted->ted_fmd_lock); + + return found; +} + +#ifdef DO_FMD_DROP +/** + * Drop FMD list reference so it will disappear when last reference is dropped + * to zero. + * + * This function is called from ofd_destroy() and may only affect + * the one client that is doing the unlink and at worst we have a stale entry + * referencing an object that should never be used again. + * + * NB: this function is used only if DO_FMD_DROP is defined. It is not + * currently defined, so FMD drop doesn't happen and FMDs are dropped only + * when expired. + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to drop + */ +void tgt_fmd_drop(struct obd_export *exp, const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *fmd = NULL; + + spin_lock(&ted->ted_fmd_lock); + fmd = tgt_fmd_find_nolock(exp, fid); + if (fmd) { + list_del_init(&fmd->fmd_list); + tgt_fmd_put_nolock(exp, fmd); + } + spin_unlock(&ted->ted_fmd_lock); +} +EXPORT_SYMBOL(tgt_fmd_drop); +#endif + +/** + * Remove all entries from FMD list. + * + * Cleanup function to free all FMD entries on the given export. + * + * \param[in] exp OBD export + */ +void tgt_fmd_cleanup(struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *fmd = NULL, *tmp; + + spin_lock(&ted->ted_fmd_lock); + list_for_each_entry_safe(fmd, tmp, &ted->ted_fmd_list, fmd_list) { + list_del_init(&fmd->fmd_list); + if (fmd->fmd_refcount > 1) { + CDEBUG(D_INFO, + "fmd %p still referenced (refcount = %d)\n", + fmd, fmd->fmd_refcount); + } + tgt_fmd_put_nolock(exp, fmd); + } + spin_unlock(&ted->ted_fmd_lock); + LASSERT(list_empty(&exp->exp_target_data.ted_fmd_list)); +} + +/** + * Update FMD with the latest request XID. + * + * Save a new setattr/punch XID in the FMD if it exists. + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * \param[in] xid request XID + */ +void tgt_fmd_update(struct obd_export *exp, const struct lu_fid *fid, __u64 xid) +{ + struct tgt_fmd_data *fmd; + + fmd = tgt_fmd_get(exp, fid); + if (fmd) { + if (fmd->fmd_mactime_xid < xid) + fmd->fmd_mactime_xid = xid; + tgt_fmd_put(exp, fmd); + } +} +EXPORT_SYMBOL(tgt_fmd_update); + +/** + * Check that time can be updated by the request with given XID. 
+ * + * Check FMD XID if exists to be less than supplied XID + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * \param[in] xid request XID + * + * \retval true if FMD has no greater XID, so time attr can be updated + */ +bool tgt_fmd_check(struct obd_export *exp, const struct lu_fid *fid, __u64 xid) +{ + struct tgt_fmd_data *fmd; + bool can_update = true; + + fmd = tgt_fmd_find(exp, fid); + if (fmd) { + can_update = fmd->fmd_mactime_xid < xid; + tgt_fmd_put(exp, fmd); + } + + return can_update; +} +EXPORT_SYMBOL(tgt_fmd_check); + diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_grant.c b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c new file mode 100644 index 0000000000000..ac1757ed38905 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c @@ -0,0 +1,1704 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/target/tgt_grant.c + * + * This file provides code related to grant space management on Lustre Targets + * (OSTs and MDTs). Grant is a mechanism used by client nodes to reserve disk + * space on a target for the data writeback cache. The Lustre client is thus + * assured that enough space will be available when flushing dirty pages + * asynchronously. Each client node is granted an initial amount of reserved + * space at connect time and gets additional space back from target in bulk + * write reply. + * + * We actually support three different cases: + * - The client supports the new grant parameters (i.e. OBD_CONNECT_GRANT_PARAM) + * which means that all grant overhead calculation happens on the client side. + * The server reports at connect time the backend filesystem block size, the + * maximum extent size as well as the extent insertion cost and it is then up + * to the osc layer to the track dirty extents and consume grant accordingly + * (see osc_cache.c). In each bulk write request, the client provides how much + * grant space was consumed for this RPC. + * - The client does not support OBD_CONNECT_GRANT_PARAM and always assumes a + * a backend file system block size of 4KB. We then have two cases: + * - If the block size is really 4KB, then the client can deal with grant + * allocation for partial block writes, but won't take extent insertion cost + * into account. For such clients, we inflate grant by 100% on the server + * side. It means that when 32MB of grant is hold by the client, 64MB of + * grant space is actually reserved on the server. All grant counters + * provided by such a client are inflated by 100%. 
+ * - The backend filesystem block size is bigger than 4KB, which isn't + * supported by the client. In this case, we emulate a 4KB block size and + * consume one block size on the server for each 4KB of grant returned to + * client. With a 128KB blocksize, it means that 32MB dirty pages of 4KB + * on the client will actually consume 1GB of grant on the server. + * All grant counters provided by such a client are inflated by the block + * size ratio. + * + * This file handles the core logic for: + * - grant allocation strategy + * - maintaining per-client as well as global grant space accounting + * - processing grant information packed in incoming requests + * - allocating server-side grant space for synchronous write RPCs which did not + * consume grant on the client side (OBD_BRW_FROM_GRANT flag not set). If not + * enough space is available, such RPCs fail with ENOSPC + * + * Author: Johann Lombardi + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include "tgt_internal.h" + +int lbug_on_grant_miscount; +module_param(lbug_on_grant_miscount, int, 0644); +MODULE_PARM_DESC(lbug_on_grant_miscount, "LBUG on grant miscount"); + +/* Clients typically hold 2x their max_rpcs_in_flight of grant space */ +#define TGT_GRANT_SHRINK_LIMIT(exp) (2ULL * 8 * exp_max_brw_size(exp)) + +/* Helpers to inflate/deflate grants for clients that do not support the grant + * parameters */ +static inline u64 tgt_grant_inflate(struct tg_grants_data *tgd, u64 val) +{ + if (tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) + /* Client does not support such large block size, grant + * is thus inflated. We already significantly overestimate + * overhead, no need to add the extent tax in this case */ + return val << (tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT); + return val; +} + +/* Companion of tgt_grant_inflate() */ +static inline u64 tgt_grant_deflate(struct tg_grants_data *tgd, u64 val) +{ + if (tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) + return val >> (tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT); + return val; +} + +/* Grant chunk is used as a unit for grant allocation. It should be inflated + * if the client does not support the grant paramaters. + * Check connection flag against \a data if not NULL. 
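A worked example of the inflation described above may help; it is an illustrative sketch rather than part of the patch (the 128KB block size and the shift mirror tgt_grant_inflate(), the other names are made up). With tgd_blockbits = 17 and COMPAT_BSIZE_SHIFT = 12, every unit of grant seen by a legacy client is multiplied by 32 on the server, so 32MB of client-side dirty data reserves 1GB of server-side grant:
#include <stdint.h>
#include <stdio.h>
#define COMPAT_BSIZE_SHIFT 12		/* legacy clients assume 4KB blocks */
/* same shift as tgt_grant_inflate(): one server block per 4KB of client grant */
static uint64_t grant_inflate(unsigned int blockbits, uint64_t val)
{
	if (blockbits > COMPAT_BSIZE_SHIFT)
		return val << (blockbits - COMPAT_BSIZE_SHIFT);
	return val;
}
int main(void)
{
	unsigned int blockbits = 17;		/* 128KB backend blocks */
	uint64_t client_dirty = 32ULL << 20;	/* 32MB of 4KB client pages */
	/* prints 1024 MB: 32MB of client grant reserves 1GB on the server */
	printf("server-side reservation: %llu MB\n",
	       (unsigned long long)(grant_inflate(blockbits, client_dirty) >> 20));
	return 0;
}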
This is used during + * connection creation where exp->exp_connect_data isn't populated yet */ +static inline u64 tgt_grant_chunk(struct obd_export *exp, + struct lu_target *lut, + struct obd_connect_data *data) +{ + struct tg_grants_data *tgd = &lut->lut_tgd; + u64 chunk = exp_max_brw_size(exp); + u64 tax; + + if (exp->exp_obd->obd_self_export == exp) + /* Grant enough space to handle a big precreate request */ + return OST_MAX_PRECREATE * lut->lut_dt_conf.ddp_inodespace / 2; + + if ((data == NULL && !(exp_grant_param_supp(exp))) || + (data != NULL && !OCD_HAS_FLAG(data, GRANT_PARAM))) + /* Try to grant enough space to send 2 full-size RPCs */ + return tgt_grant_inflate(tgd, chunk) << 1; + + /* Try to return enough to send two full-size RPCs + * = 2 * (BRW_size + #extents_in_BRW * grant_tax) */ + tax = 1ULL << tgd->tgd_blockbits; /* block size */ + tax *= lut->lut_dt_conf.ddp_max_extent_blks; /* max extent size */ + tax = (chunk + tax - 1) / tax; /* #extents in a RPC */ + tax *= lut->lut_dt_conf.ddp_extent_tax; /* extent tax for a RPC */ + chunk = (chunk + tax) * 2; /* we said two full RPCs */ + return chunk; +} + +static int tgt_check_export_grants(struct obd_export *exp, u64 *dirty, + u64 *pending, u64 *granted, u64 maxsize) +{ + struct tg_export_data *ted = &exp->exp_target_data; + int level = D_CACHE; + + if (ted->ted_grant < 0 || ted->ted_pending < 0 || ted->ted_dirty < 0) + level = D_ERROR; + CDEBUG_LIMIT(level, "%s: cli %s/%p dirty %ld pend %ld grant %ld\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, + ted->ted_dirty, ted->ted_pending, ted->ted_grant); + + if (ted->ted_grant + ted->ted_pending > maxsize) { + CERROR("%s: cli %s/%p ted_grant(%ld) + ted_pending(%ld)" + " > maxsize(%llu)\n", exp->exp_obd->obd_name, + exp->exp_client_uuid.uuid, exp, ted->ted_grant, + ted->ted_pending, maxsize); + return -EFAULT; + } + if (ted->ted_dirty > maxsize) { + CERROR("%s: cli %s/%p ted_dirty(%ld) > maxsize(%llu)\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, + exp, ted->ted_dirty, maxsize); + return -EFAULT; + } + *granted += ted->ted_grant + ted->ted_pending; + *pending += ted->ted_pending; + *dirty += ted->ted_dirty; + return 0; +} + +/** + * Perform extra sanity checks for grant accounting. + * + * This function scans the export list, sanity checks per-export grant counters + * and verifies accuracy of global grant accounting. If an inconsistency is + * found, a CERROR is printed with the function name \func that was passed as + * argument. LBUG is only called in case of serious counter corruption (i.e. + * value larger than the device size). + * Those sanity checks can be pretty expensive and are disabled if the OBD + * device has more than 100 connected exports by default. + * + * \param[in] obd OBD device for which grant accounting should be + * verified + * \param[in] func caller's function name + */ +void tgt_grant_sanity_check(struct obd_device *obd, const char *func) +{ + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct obd_export *exp; + struct tg_export_data *ted; + u64 maxsize; + u64 tot_dirty = 0; + u64 tot_pending = 0; + u64 tot_granted = 0; + u64 fo_tot_granted; + u64 fo_tot_pending; + u64 fo_tot_dirty; + int error; + + if (list_empty(&obd->obd_exports)) + return; + + /* + * We don't want to do this for large machines that do lots of + * mounts or unmounts. It burns... 
+ * Use set_param to change obd_grant_check_threshold, which + * is 100 by default, 0 to always check grants + */ + if (obd->obd_num_exports > obd->obd_grant_check_threshold && + obd->obd_grant_check_threshold) + return; + + maxsize = tgd->tgd_osfs.os_blocks << tgd->tgd_blockbits; + + spin_lock(&obd->obd_dev_lock); + spin_lock(&tgd->tgd_grant_lock); + exp = obd->obd_self_export; + ted = &exp->exp_target_data; + CDEBUG(D_CACHE, "%s: processing self export: %ld %ld " + "%ld\n", obd->obd_name, ted->ted_grant, + ted->ted_pending, ted->ted_dirty); + tot_granted += ted->ted_grant + ted->ted_pending; + tot_pending += ted->ted_pending; + tot_dirty += ted->ted_dirty; + + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) { + error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending, + &tot_granted, maxsize); + if (error < 0) { + spin_unlock(&obd->obd_dev_lock); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + } + + /* exports about to be unlinked should also be taken into account since + * they might still hold pending grant space to be released at + * commit time */ + list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain) { + error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending, + &tot_granted, maxsize); + if (error < 0) { + spin_unlock(&obd->obd_dev_lock); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + } + + fo_tot_granted = tgd->tgd_tot_granted; + fo_tot_pending = tgd->tgd_tot_pending; + fo_tot_dirty = tgd->tgd_tot_dirty; + spin_unlock(&obd->obd_dev_lock); + spin_unlock(&tgd->tgd_grant_lock); + + if (tot_granted != fo_tot_granted) + CERROR("%s: tot_granted %llu != fo_tot_granted %llu\n", + func, tot_granted, fo_tot_granted); + if (tot_pending != fo_tot_pending) + CERROR("%s: tot_pending %llu != fo_tot_pending %llu\n", + func, tot_pending, fo_tot_pending); + if (tot_dirty != fo_tot_dirty) + CERROR("%s: tot_dirty %llu != fo_tot_dirty %llu\n", + func, tot_dirty, fo_tot_dirty); + if (tot_pending > tot_granted) + CERROR("%s: tot_pending %llu > tot_granted %llu\n", + func, tot_pending, tot_granted); + if (tot_granted > maxsize) + CERROR("%s: tot_granted %llu > maxsize %llu\n", + func, tot_granted, maxsize); + if (tot_dirty > maxsize) + CERROR("%s: tot_dirty %llu > maxsize %llu\n", + func, tot_dirty, maxsize); +} +EXPORT_SYMBOL(tgt_grant_sanity_check); + +/** + * Get file system statistics of target. + * + * Helper function for statfs(), also used by grant code. + * Implements caching for statistics to avoid calling OSD device each time. + * + * \param[in] env execution environment + * \param[in] lut LU target + * \param[out] osfs statistic data to return + * \param[in] max_age maximum age for cached data + * \param[in] from_cache show that data was get from cache or not + * + * \retval 0 if successful + * \retval negative value on error + */ +int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, + struct obd_statfs *osfs, time64_t max_age, int *from_cache) +{ + struct tg_grants_data *tgd = &lut->lut_tgd; + int rc = 0; + ENTRY; + + spin_lock(&tgd->tgd_osfs_lock); + if (tgd->tgd_osfs_age < max_age || max_age == 0) { + u64 unstable; + + /* statfs data are too old, get up-to-date one. + * we must be cautious here since multiple threads might be + * willing to update statfs data concurrently and we must + * grant that cached statfs data are always consistent */ + + if (tgd->tgd_statfs_inflight == 0) + /* clear inflight counter if no users, although it would + * take a while to overflow this 64-bit counter ... 
*/ + tgd->tgd_osfs_inflight = 0; + /* notify tgt_grant_commit() that we want to track writes + * completed as of now */ + tgd->tgd_statfs_inflight++; + /* record value of inflight counter before running statfs to + * compute the diff once statfs is completed */ + unstable = tgd->tgd_osfs_inflight; + spin_unlock(&tgd->tgd_osfs_lock); + + /* statfs can sleep ... hopefully not for too long since we can + * call it fairly often as space fills up */ + rc = dt_statfs(env, lut->lut_bottom, osfs); + if (unlikely(rc)) + GOTO(out, rc); + + osfs->os_namelen = min_t(__u32, osfs->os_namelen, NAME_MAX); + + spin_lock(&tgd->tgd_grant_lock); + spin_lock(&tgd->tgd_osfs_lock); + /* calculate how much space was written while we released the + * tgd_osfs_lock */ + unstable = tgd->tgd_osfs_inflight - unstable; + tgd->tgd_osfs_unstable = 0; + if (unstable) { + /* some writes committed while we were running statfs + * w/o the tgd_osfs_lock. Those ones got added to + * the cached statfs data that we are about to crunch. + * Take them into account in the new statfs data */ + osfs->os_bavail -= min_t(u64, osfs->os_bavail, + unstable >> tgd->tgd_blockbits); + /* However, we don't really know if those writes got + * accounted in the statfs call, so tell + * tgt_grant_space_left() there is some uncertainty + * on the accounting of those writes. + * The purpose is to prevent spurious error messages in + * tgt_grant_space_left() since those writes might be + * accounted twice. */ + tgd->tgd_osfs_unstable += unstable; + } + /* similarly, there is some uncertainty on write requests + * between prepare & commit */ + tgd->tgd_osfs_unstable += tgd->tgd_tot_pending; + spin_unlock(&tgd->tgd_grant_lock); + + /* finally udpate cached statfs data */ + tgd->tgd_osfs = *osfs; + tgd->tgd_osfs_age = ktime_get_seconds(); + + tgd->tgd_statfs_inflight--; /* stop tracking */ + if (tgd->tgd_statfs_inflight == 0) + tgd->tgd_osfs_inflight = 0; + spin_unlock(&tgd->tgd_osfs_lock); + + if (from_cache) + *from_cache = 0; + } else { + /* use cached statfs data */ + *osfs = tgd->tgd_osfs; + spin_unlock(&tgd->tgd_osfs_lock); + if (from_cache) + *from_cache = 1; + } + GOTO(out, rc); + +out: + return rc; +} +EXPORT_SYMBOL(tgt_statfs_internal); + +/** + * Update cached statfs information from the OSD layer + * + * Refresh statfs information cached in tgd::tgd_osfs if the cache is older + * than 1s or if force is set. The OSD layer is in charge of estimating data & + * metadata overhead. + * This function can sleep so it should not be called with any spinlock held. 
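The caching rule described above (serve statfs from the cache unless it is older than the allowed age, or a refresh is forced) can be sketched independently of the OSD layer. The following userspace model is illustrative only: a stubbed backend call stands in for dt_statfs(), and locking and inflight-write tracking are omitted.
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#define STATFS_CACHE_SECONDS 1		/* mirrors OBD_STATFS_CACHE_SECONDS */
struct fs_stats {			/* stands in for struct obd_statfs */
	uint64_t blocks_avail;
};
static struct fs_stats cached;
static time_t cached_age;		/* when the cache was last refreshed */
/* stub for the (potentially slow, sleeping) backend statfs call */
static struct fs_stats backend_statfs(void)
{
	struct fs_stats s = { .blocks_avail = 123456 };
	return s;
}
/* serve from cache unless it is older than max_age or a refresh is forced
 * (max_age == 0), in the spirit of tgt_statfs_internal()/tgt_grant_statfs() */
static struct fs_stats cached_statfs(bool force, bool *from_cache)
{
	time_t now = time(NULL);
	time_t max_age = force ? 0 : now - STATFS_CACHE_SECONDS;
	if (cached_age < max_age || max_age == 0) {
		cached = backend_statfs();	/* refresh */
		cached_age = now;
		*from_cache = false;
	} else {
		*from_cache = true;		/* recent enough, reuse */
	}
	return cached;
}
int main(void)
{
	bool from_cache;
	cached_statfs(true, &from_cache);	/* first call always refreshes */
	cached_statfs(false, &from_cache);	/* immediate retry hits the cache */
	printf("second call served from cache: %d\n", from_cache);
	return 0;
}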
+ * + * \param[in] env LU environment passed by the caller + * \param[in] exp export used to print client info in debug + * messages + * \param[in] force force a refresh of statfs information + * \param[out] from_cache returns whether the statfs information are + * taken from cache + */ +static void tgt_grant_statfs(const struct lu_env *env, struct obd_export *exp, + int force, int *from_cache) +{ + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct tgt_thread_info *tti; + struct obd_statfs *osfs; + time64_t max_age; + int rc; + + if (force) + max_age = 0; /* get fresh statfs data */ + else + max_age = ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS; + + tti = tgt_th_info(env); + osfs = &tti->tti_u.osfs; + rc = tgt_statfs_internal(env, lut, osfs, max_age, from_cache); + if (unlikely(rc)) { + if (from_cache) + *from_cache = 0; + return; + } + + CDEBUG(D_CACHE, "%s: cli %s/%p free: %llu avail: %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + osfs->os_bfree << tgd->tgd_blockbits, + osfs->os_bavail << tgd->tgd_blockbits); +} + +/** + * Figure out how much space is available on the backend filesystem after + * removing grant space already booked by clients. + * + * This is done by accessing cached statfs data previously populated by + * tgt_grant_statfs(), from which we withdraw the space already granted to + * clients and the reserved space. + * Caller must hold tgd_grant_lock spinlock. + * + * \param[in] exp export associated with the device for which the amount + * of available space is requested + * \retval amount of non-allocated space, in bytes + */ +static u64 tgt_grant_space_left(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + u64 tot_granted; + u64 left; + u64 avail; + u64 unstable; + u64 reserved; + + ENTRY; + assert_spin_locked(&tgd->tgd_grant_lock); + + spin_lock(&tgd->tgd_osfs_lock); + /* get available space from cached statfs data */ + left = tgd->tgd_osfs.os_bavail << tgd->tgd_blockbits; + unstable = tgd->tgd_osfs_unstable; /* those might be accounted twice */ + spin_unlock(&tgd->tgd_osfs_lock); + + reserved = left * tgd->tgd_reserved_pcnt / 100; + tot_granted = tgd->tgd_tot_granted + reserved; + + if (left < tot_granted) { + int mask = (left + unstable < + tot_granted - tgd->tgd_tot_pending) ? + D_ERROR : D_CACHE; + + /* the below message is checked in sanityn.sh test_15 */ + CDEBUG_LIMIT(mask, + "%s: cli %s/%p left=%llu < tot_grant=%llu unstable=%llu pending=%llu dirty=%llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + left, tot_granted, unstable, + tgd->tgd_tot_pending, + tgd->tgd_tot_dirty); + RETURN(0); + } + + avail = left; + /* Withdraw space already granted to clients */ + left -= tot_granted; + + /* Align left on block size */ + left &= ~((1ULL << tgd->tgd_blockbits) - 1); + + CDEBUG(D_CACHE, + "%s: cli %s/%p avail=%llu left=%llu unstable=%llu tot_grant=%llu pending=%llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, avail, left, + unstable, tot_granted, tgd->tgd_tot_pending); + + RETURN(left); +} + +/** + * Process grant information from obdo structure packed in incoming BRW + * and inflate grant counters if required. + * + * Grab the dirty and seen grant announcements from the incoming obdo and + * inflate all grant counters passed in the request if the client does not + * support the grant parameters. 
+ * We will later calculate the client's new grant and return it. + * Caller must hold tgd_grant_lock spinlock. + * + * \param[in] env LU environment supplying osfs storage + * \param[in] exp export for which we received the request + * \param[in,out] oa incoming obdo sent by the client + */ +static void tgt_grant_incoming(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, long chunk) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct obd_device *obd = exp->exp_obd; + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + long long dirty, dropped; + ENTRY; + + assert_spin_locked(&tgd->tgd_grant_lock); + + if ((oa->o_valid & (OBD_MD_FLBLOCKS|OBD_MD_FLGRANT)) != + (OBD_MD_FLBLOCKS|OBD_MD_FLGRANT)) { + oa->o_valid &= ~OBD_MD_FLGRANT; + RETURN_EXIT; + } + + /* Add some margin, since there is a small race if other RPCs arrive + * out-or-order and have already consumed some grant. We want to + * leave this here in case there is a large error in accounting. */ + CDEBUG(D_CACHE, + "%s: cli %s/%p reports grant %llu dropped %u, local %lu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, oa->o_grant, + oa->o_dropped, ted->ted_grant); + + if ((long long)oa->o_dirty < 0) + oa->o_dirty = 0; + + /* inflate grant counters if required */ + if (!exp_grant_param_supp(exp)) { + u64 tmp; + oa->o_grant = tgt_grant_inflate(tgd, oa->o_grant); + oa->o_dirty = tgt_grant_inflate(tgd, oa->o_dirty); + /* inflation can bump client's wish to >4GB which doesn't fit + * 32bit o_undirty, limit that .. */ + tmp = tgt_grant_inflate(tgd, oa->o_undirty); + if (tmp >= OBD_MAX_GRANT) + tmp = OBD_MAX_GRANT & ~(1ULL << tgd->tgd_blockbits); + oa->o_undirty = tmp; + tmp = tgt_grant_inflate(tgd, oa->o_dropped); + if (tmp >= OBD_MAX_GRANT) + tmp = OBD_MAX_GRANT & ~(1ULL << tgd->tgd_blockbits); + oa->o_dropped = tmp; + } + + dirty = oa->o_dirty; + dropped = oa->o_dropped; + + /* Update our accounting now so that statfs takes it into account. + * Note that ted_dirty is only approximate and can become incorrect + * if RPCs arrive out-of-order. No important calculations depend + * on ted_dirty however, but we must check sanity to not assert. */ + if (dirty > ted->ted_grant + 4 * chunk) + dirty = ted->ted_grant + 4 * chunk; + tgd->tgd_tot_dirty += dirty - ted->ted_dirty; + if (ted->ted_grant < dropped) { + CDEBUG(D_CACHE, + "%s: cli %s/%p reports %llu dropped > grant %lu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, dropped, + ted->ted_grant); + dropped = 0; + } + if (tgd->tgd_tot_granted < dropped) { + CERROR("%s: cli %s/%p reports %llu dropped > tot_grant %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + dropped, tgd->tgd_tot_granted); + dropped = 0; + } + tgd->tgd_tot_granted -= dropped; + ted->ted_grant -= dropped; + ted->ted_dirty = dirty; + + if (ted->ted_dirty < 0 || ted->ted_grant < 0 || ted->ted_pending < 0) { + CERROR("%s: cli %s/%p dirty %ld pend %ld grant %ld\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + ted->ted_dirty, ted->ted_pending, ted->ted_grant); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + EXIT; +} + +/** + * Grant shrink request handler. + * + * Client nodes can explicitly release grant space (i.e. process called grant + * shrinking). This function proceeds with the shrink request when there is + * less ungranted space remaining than the amount all of the connected clients + * would consume if they used their full grant. + * Caller must hold tgd_grant_lock spinlock. 
+ * + * \param[in] exp export releasing grant space + * \param[in,out] oa incoming obdo sent by the client + * \param[in] left_space remaining free space with space already granted + * taken out + */ +static void tgt_grant_shrink(struct obd_export *exp, struct obdo *oa, + u64 left_space) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct obd_device *obd = exp->exp_obd; + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + long grant_shrink; + + assert_spin_locked(&tgd->tgd_grant_lock); + LASSERT(exp); + if (left_space >= tgd->tgd_tot_granted_clients * + TGT_GRANT_SHRINK_LIMIT(exp)) + return; + + grant_shrink = oa->o_grant; + + if (ted->ted_grant < grant_shrink) { + CDEBUG(D_CACHE, + "%s: cli %s/%p wants %lu shrinked > grant %lu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + grant_shrink, ted->ted_grant); + grant_shrink = ted->ted_grant; + } + + ted->ted_grant -= grant_shrink; + tgd->tgd_tot_granted -= grant_shrink; + + CDEBUG(D_CACHE, "%s: cli %s/%p shrink %ld ted_grant %ld total %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, grant_shrink, + ted->ted_grant, tgd->tgd_tot_granted); + + /* client has just released some grant, don't grant any space back */ + oa->o_grant = 0; +} + +/** + * Calculate how much space is required to write a given network buffer + * + * This function takes block alignment into account to estimate how much on-disk + * space will be required to successfully write the whole niobuf. + * Estimated space is inflated if the export does not support + * OBD_CONNECT_GRANT_PARAM and if the backend filesystem has a block size + * larger than the minimal supported page size (i.e. 4KB). + * + * \param[in] exp export associated which the write request + * if NULL, then size estimate is done for server-side + * grant allocation. + * \param[in] lut LU target handling the request + * \param[in] rnb network buffer to estimate size of + * + * \retval space (in bytes) that will be consumed to write the + * network buffer + */ +static inline u64 tgt_grant_rnb_size(struct obd_export *exp, + struct lu_target *lut, + struct niobuf_remote *rnb) +{ + struct tg_grants_data *tgd = &lut->lut_tgd; + u64 blksize; + u64 bytes; + u64 end; + + if (exp && !exp_grant_param_supp(exp) && + tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) + blksize = 1ULL << COMPAT_BSIZE_SHIFT; + else + blksize = 1ULL << tgd->tgd_blockbits; + + /* The network buffer might span several blocks, align it on block + * boundaries */ + bytes = rnb->rnb_offset & (blksize - 1); + bytes += rnb->rnb_len; + end = bytes & (blksize - 1); + if (end) + bytes += blksize - end; + + if (exp == NULL || exp_grant_param_supp(exp)) { + /* add per-extent insertion cost */ + u64 max_ext; + int nr_ext; + + max_ext = blksize * lut->lut_dt_conf.ddp_max_extent_blks; + nr_ext = (bytes + max_ext - 1) / max_ext; + bytes += nr_ext * lut->lut_dt_conf.ddp_extent_tax; + } else { + /* Inflate grant space if client does not support extent-based + * grant allocation */ + bytes = tgt_grant_inflate(tgd, (u64)bytes); + } + + return bytes; +} + +/** + * Validate grant accounting for each incoming remote network buffer. + * + * When clients have dirtied as much space as they've been granted they + * fall through to sync writes. These sync writes haven't been expressed + * in grants and need to error with ENOSPC when there isn't room in the + * filesystem for them after grants are taken into account. However, + * writeback of the dirty data that was already granted space can write + * right on through. 
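The space estimate described above (round the buffer out to full backend blocks, then charge a per-extent insertion tax) follows the arithmetic of tgt_grant_rnb_size(). The sketch below is illustrative only; the geometry constants are made up and merely stand in for ddp_max_extent_blks and ddp_extent_tax.
#include <stdint.h>
#include <stdio.h>
/* illustrative backend geometry, not real ddp_* values */
#define BLOCKBITS	17		/* 128KB blocks */
#define MAX_EXTENT_BLKS	32		/* max blocks per extent */
#define EXTENT_TAX	4096		/* bytes charged per extent insert */
/* estimate on-disk space needed to write [offset, offset + len) */
static uint64_t rnb_disk_size(uint64_t offset, uint64_t len)
{
	uint64_t blksize = 1ULL << BLOCKBITS;
	uint64_t bytes, end, max_ext, nr_ext;
	/* round the I/O range out to full blocks */
	bytes = offset & (blksize - 1);		/* head padding */
	bytes += len;
	end = bytes & (blksize - 1);
	if (end)
		bytes += blksize - end;		/* tail padding */
	/* add the per-extent insertion cost */
	max_ext = blksize * MAX_EXTENT_BLKS;
	nr_ext = (bytes + max_ext - 1) / max_ext;
	bytes += nr_ext * EXTENT_TAX;
	return bytes;
}
int main(void)
{
	/* a 1MB write starting 1000 bytes into a block rounds up to 9 blocks
	 * plus one extent tax: 1183744 bytes */
	printf("estimated reservation: %llu bytes\n",
	       (unsigned long long)rnb_disk_size(1000, 1 << 20));
	return 0;
}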
+ * The OBD_BRW_GRANTED flag will be set in the rnb_flags of each network + * buffer which has been granted enough space to proceed. Buffers without + * this flag will fail to be written with -ENOSPC (see tgt_preprw_write(). + * Caller must hold tgd_grant_lock spinlock. + * + * \param[in] env LU environment passed by the caller + * \param[in] exp export identifying the client which sent the RPC + * \param[in] oa incoming obdo in which we should return the pack the + * additional grant + * \param[in,out] rnb the list of network buffers + * \param[in] niocount the number of network buffers in the list + * \param[in] left the remaining free space with space already granted + * taken out + */ +static void tgt_grant_check(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct niobuf_remote *rnb, + int niocount, u64 *left) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + unsigned long ungranted = 0; + unsigned long granted = 0; + int i; + bool skip = false; + + ENTRY; + + assert_spin_locked(&tgd->tgd_grant_lock); + + if (obd->obd_recovering) { + /* Replaying write. Grant info have been processed already so no + * need to do any enforcement here. It is worth noting that only + * bulk writes with all rnbs having OBD_BRW_FROM_GRANT can be + * replayed. If one page hasn't OBD_BRW_FROM_GRANT set, then + * the whole bulk is written synchronously */ + skip = true; + CDEBUG(D_CACHE, "Replaying write, skipping accounting\n"); + } else if ((oa->o_valid & OBD_MD_FLFLAGS) && + (oa->o_flags & OBD_FL_RECOV_RESEND)) { + /* Recoverable resend, grant info have already been processed as + * well */ + skip = true; + CDEBUG(D_CACHE, "Recoverable resend arrived, skipping " + "accounting\n"); + } else if (exp_grant_param_supp(exp) && oa->o_grant_used > 0) { + /* Client supports the new grant parameters and is telling us + * how much grant space it consumed for this bulk write. + * Although all rnbs are supposed to have the OBD_BRW_FROM_GRANT + * flag set, we will scan the rnb list and looks for non-cache + * I/O in case it changes in the future */ + if (ted->ted_grant >= oa->o_grant_used) { + /* skip grant accounting for rnbs with + * OBD_BRW_FROM_GRANT and just used grant consumption + * claimed in the request */ + granted = oa->o_grant_used; + skip = true; + } else { + /* client has used more grants for this request that + * it owns ... */ + CERROR("%s: cli %s claims %lu GRANT, real grant %lu\n", + exp->exp_obd->obd_name, + exp->exp_client_uuid.uuid, + (unsigned long)oa->o_grant_used, ted->ted_grant); + + /* check whether we can fill the gap with unallocated + * grant */ + if (*left > (oa->o_grant_used - ted->ted_grant)) { + /* ouf .. we are safe for now */ + granted = ted->ted_grant; + ungranted = oa->o_grant_used - granted; + *left -= ungranted; + skip = true; + } + /* too bad, but we cannot afford to blow up our grant + * accounting. The loop below will handle each rnb in + * case by case. 
*/ + } + } + + for (i = 0; i < niocount; i++) { + int bytes; + + if ((rnb[i].rnb_flags & OBD_BRW_FROM_GRANT)) { + if (skip) { + rnb[i].rnb_flags |= OBD_BRW_GRANTED; + continue; + } + + /* compute how much grant space is actually needed for + * this rnb, inflate grant if required */ + bytes = tgt_grant_rnb_size(exp, lut, &rnb[i]); + if (ted->ted_grant >= granted + bytes) { + granted += bytes; + rnb[i].rnb_flags |= OBD_BRW_GRANTED; + continue; + } + + CDEBUG(D_CACHE, "%s: cli %s/%p claims %ld+%d GRANT, " + "real grant %lu idx %d\n", obd->obd_name, + exp->exp_client_uuid.uuid, exp, granted, bytes, + ted->ted_grant, i); + } + + if (obd->obd_recovering) + CERROR("%s: cli %s is replaying OST_WRITE while one rnb" + " hasn't OBD_BRW_FROM_GRANT set (0x%x)\n", + obd->obd_name, exp->exp_client_uuid.uuid, + rnb[i].rnb_flags); + + /* Consume grant space on the server. + * Unlike above, tgt_grant_rnb_size() is called with exp = NULL + * so that the required grant space isn't inflated. This is + * done on purpose since the server can deal with large block + * size, unlike some clients */ + bytes = tgt_grant_rnb_size(NULL, lut, &rnb[i]); + if (*left > bytes) { + /* if enough space, pretend it was granted */ + ungranted += bytes; + *left -= bytes; + rnb[i].rnb_flags |= OBD_BRW_GRANTED; + continue; + } + + /* We can't check for already-mapped blocks here (makes sense + * when the backend filesystem does not use COW) as it requires + * dropping the grant lock. + * Instead, we clear OBD_BRW_GRANTED and in that case we need + * to go through and verify if all of the blocks not marked + * BRW_GRANTED are already mapped and we can ignore this error. + */ + rnb[i].rnb_flags &= ~OBD_BRW_GRANTED; + CDEBUG(D_CACHE, "%s: cli %s/%p idx %d no space for %d\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, i, bytes); + } + + /* record in o_grant_used the actual space reserved for the I/O, will be + * used later in tgt_grant_commit() */ + oa->o_grant_used = granted + ungranted; + + /* record space used for the I/O, will be used in tgt_grant_commit() */ + /* Now subtract what the client has used already. We don't subtract + * this from the tot_granted yet, so that other clients can't grab + * that space before we have actually allocated our blocks. That + * happens in tgt_grant_commit() after the writes are done. */ + ted->ted_grant -= granted; + ted->ted_pending += oa->o_grant_used; + tgd->tgd_tot_granted += ungranted; + tgd->tgd_tot_pending += oa->o_grant_used; + + CDEBUG(D_CACHE, + "%s: cli %s/%p granted: %lu ungranted: %lu grant: %lu dirty: %lu" + "\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, + granted, ungranted, ted->ted_grant, ted->ted_dirty); + + if (obd->obd_recovering || (oa->o_valid & OBD_MD_FLGRANT) == 0) + /* don't update dirty accounting during recovery or + * if grant information got discarded (e.g. 
during resend) */ + RETURN_EXIT; + + if (ted->ted_dirty < granted) { + CWARN("%s: cli %s/%p claims granted %lu > ted_dirty %lu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + granted, ted->ted_dirty); + granted = ted->ted_dirty; + } + tgd->tgd_tot_dirty -= granted; + ted->ted_dirty -= granted; + + if (ted->ted_dirty < 0 || ted->ted_grant < 0 || ted->ted_pending < 0) { + CERROR("%s: cli %s/%p dirty %ld pend %ld grant %ld\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + ted->ted_dirty, ted->ted_pending, ted->ted_grant); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + EXIT; +} + +/** + * Allocate additional grant space to a client + * + * Calculate how much grant space to return to client, based on how much space + * is currently free and how much of that is already granted. + * Caller must hold tgd_grant_lock spinlock. + * + * \param[in] exp export of the client which sent the request + * \param[in] curgrant current grant claimed by the client + * \param[in] want how much grant space the client would like to + * have + * \param[in] left remaining free space with granted space taken + * out + * \param[in] chunk grant allocation unit + * \param[in] conservative if set to true, the server should be cautious + * and limit how much space is granted back to the + * client. Otherwise, the server should try hard to + * satisfy the client request. + * + * \retval amount of grant space allocated + */ +static long tgt_grant_alloc(struct obd_export *exp, u64 curgrant, + u64 want, u64 left, long chunk, + bool conservative) +{ + struct obd_device *obd = exp->exp_obd; + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + struct tg_export_data *ted = &exp->exp_target_data; + u64 grant; + + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_NO_GRANT)) + RETURN(0); + + /* When tgd_grant_compat_disable is set, we don't grant any space to + * clients not supporting OBD_CONNECT_GRANT_PARAM. + * Otherwise, space granted to such a client is inflated since it + * consumes PAGE_SIZE of grant space per block */ + if ((obd->obd_self_export != exp && !exp_grant_param_supp(exp) && + tgd->tgd_grant_compat_disable) || left == 0 || exp->exp_failed) + RETURN(0); + + if (want > OBD_MAX_GRANT) { + CERROR("%s: client %s/%p requesting > max (%lu), %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + OBD_MAX_GRANT, want); + RETURN(0); + } + + /* Grant some fraction of the client's requested grant space so that + * they are not always waiting for write credits (not all of it to + * avoid overgranting in face of multiple RPCs in flight). This + * essentially will be able to control the OSC_MAX_RIF for a client. + * + * If we do have a large disparity between what the client thinks it + * has and what we think it has, don't grant very much and let the + * client consume its grant first. Either it just has lots of RPCs + * in flight, or it was evicted and its grants will soon be used up. 
*/ + if (curgrant >= want || curgrant >= ted->ted_grant + chunk) + RETURN(0); + + if (obd->obd_recovering) + conservative = false; + + if (conservative) + /* don't grant more than 1/8th of the remaining free space in + * one chunk */ + left >>= 3; + grant = min(want - curgrant, left); + /* round grant up to the next block size */ + grant = (grant + (1 << tgd->tgd_blockbits) - 1) & + ~((1ULL << tgd->tgd_blockbits) - 1); + + if (!grant) + RETURN(0); + + /* Limit to grant_chunk if not reconnect/recovery */ + if ((grant > chunk) && conservative) + grant = chunk; + + /* + * Limit grant so that export' grant does not exceed what the + * client would like to have by more than grants for 2 full + * RPCs + */ + if (want + chunk <= ted->ted_grant) + RETURN(0); + if (ted->ted_grant + grant > want + chunk) + grant = want + chunk - ted->ted_grant; + + tgd->tgd_tot_granted += grant; + ted->ted_grant += grant; + + if (unlikely(ted->ted_grant < 0 || ted->ted_grant > want + chunk)) { + CERROR("%s: cli %s/%p grant %ld want %llu current %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + ted->ted_grant, want, curgrant); + if (lbug_on_grant_miscount) { + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + } + + CDEBUG(D_CACHE, + "%s: cli %s/%p wants: %llu current grant %llu" + " granting: %llu\n", obd->obd_name, exp->exp_client_uuid.uuid, + exp, want, curgrant, grant); + CDEBUG(D_CACHE, + "%s: cli %s/%p tot cached:%llu granted:%llu" + " num_exports: %d\n", obd->obd_name, exp->exp_client_uuid.uuid, + exp, tgd->tgd_tot_dirty, tgd->tgd_tot_granted, + obd->obd_num_exports); + + RETURN(grant); +} + +/** + * Handle grant space allocation on client connection & reconnection. + * + * A new non-readonly connection gets an initial grant allocation equals to + * tgt_grant_chunk() (i.e. twice the max BRW size in most of the cases). + * On reconnection, grant counters between client & target are resynchronized + * and additional space might be granted back if possible. 
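The allocation policy of tgt_grant_alloc() above reduces to a short piece of arithmetic: take at most 1/8th of the remaining free space when being conservative, round up to a block, cap at one chunk, and never let the client's holding exceed its wish by more than one chunk. The following userspace sketch only models that policy; names and constants are hypothetical, and locking, recovery and failure handling are omitted.
#include <stdint.h>
#include <stdio.h>
#define BLOCKBITS 12				/* illustrative 4KB blocks */
static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }
/* model of the policy above: cur is what the client thinks it holds,
 * *total is what the server thinks it holds */
static uint64_t grant_alloc(uint64_t cur, uint64_t want, uint64_t left,
			    uint64_t chunk, int conservative, uint64_t *total)
{
	uint64_t grant;
	if (cur >= want || cur >= *total + chunk)
		return 0;
	if (conservative)
		left >>= 3;			/* at most 1/8th of free space */
	grant = min_u64(want - cur, left);
	/* round up to a full block */
	grant = (grant + (1ULL << BLOCKBITS) - 1) & ~((1ULL << BLOCKBITS) - 1);
	if (!grant)
		return 0;
	if (conservative && grant > chunk)
		grant = chunk;			/* one chunk per RPC normally */
	if (want + chunk <= *total)
		return 0;
	if (*total + grant > want + chunk)
		grant = want + chunk - *total;
	*total += grant;
	return grant;
}
int main(void)
{
	uint64_t ted_grant = 4 << 20;		/* server view: client holds 4MB */
	uint64_t granted;
	/* client holds 4MB, wants 64MB, 1GB free, 8MB chunk, conservative */
	granted = grant_alloc(4 << 20, 64 << 20, 1ULL << 30, 8 << 20, 1,
			      &ted_grant);
	printf("granted %llu MB, client now holds %llu MB\n",
	       (unsigned long long)(granted >> 20),
	       (unsigned long long)(ted_grant >> 20));
	return 0;
}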
+ * + * \param[in] env LU environment provided by the caller + * \param[in] exp client's export which is (re)connecting + * \param[in,out] data obd_connect_data structure sent by the client in the + * connect request + * \param[in] new_conn must set to true if this is a new connection and false + * for a reconnection + */ +void tgt_grant_connect(const struct lu_env *env, struct obd_export *exp, + struct obd_connect_data *data, bool new_conn) +{ + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct tg_export_data *ted = &exp->exp_target_data; + u64 left = 0; + u64 want; + long chunk; + int from_cache; + int force = 0; /* can use cached data */ + + /* don't grant space to client with read-only access */ + if (OCD_HAS_FLAG(data, RDONLY) || + (!OCD_HAS_FLAG(data, GRANT_PARAM) && + tgd->tgd_grant_compat_disable)) { + data->ocd_grant = 0; + data->ocd_connect_flags &= ~(OBD_CONNECT_GRANT | + OBD_CONNECT_GRANT_PARAM); + RETURN_EXIT; + } + + if (OCD_HAS_FLAG(data, GRANT_PARAM)) + want = data->ocd_grant; + else + want = tgt_grant_inflate(tgd, data->ocd_grant); + chunk = tgt_grant_chunk(exp, lut, data); +refresh: + tgt_grant_statfs(env, exp, force, &from_cache); + + spin_lock(&tgd->tgd_grant_lock); + + /* Grab free space from cached info and take out space already granted + * to clients as well as reserved space */ + left = tgt_grant_space_left(exp); + + /* get fresh statfs data if we are short in ungranted space */ + if (from_cache && left < 32 * chunk) { + spin_unlock(&tgd->tgd_grant_lock); + CDEBUG(D_CACHE, "fs has no space left and statfs too old\n"); + force = 1; + goto refresh; + } + + tgt_grant_alloc(exp, (u64)ted->ted_grant, want, left, chunk, new_conn); + + /* return to client its current grant */ + if (OCD_HAS_FLAG(data, GRANT_PARAM)) + data->ocd_grant = ted->ted_grant; + else + /* deflate grant */ + data->ocd_grant = tgt_grant_deflate(tgd, (u64)ted->ted_grant); + + /* reset dirty accounting */ + tgd->tgd_tot_dirty -= ted->ted_dirty; + ted->ted_dirty = 0; + + if (new_conn && OCD_HAS_FLAG(data, GRANT)) + tgd->tgd_tot_granted_clients++; + + spin_unlock(&tgd->tgd_grant_lock); + + CDEBUG(D_CACHE, "%s: cli %s/%p ocd_grant: %d want: %llu left: %llu\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, + exp, data->ocd_grant, want, left); + + EXIT; +} +EXPORT_SYMBOL(tgt_grant_connect); + +/** + * Release all grant space attached to a given export. + * + * Remove a client from the grant accounting totals. We also remove + * the export from the obd device under the osfs and dev locks to ensure + * that the tgt_grant_sanity_check() calculations are always valid. + * The client should do something similar when it invalidates its import. 
+ * + * \param[in] exp client's export to remove from grant accounting + */ +void tgt_grant_discard(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = class_exp2tgt(exp); + struct tg_export_data *ted = &exp->exp_target_data; + struct tg_grants_data *tgd; + + if (!lut) + return; + + tgd = &lut->lut_tgd; + spin_lock(&tgd->tgd_grant_lock); + if (unlikely(tgd->tgd_tot_granted < ted->ted_grant || + tgd->tgd_tot_dirty < ted->ted_dirty)) { + struct obd_export *e; + u64 ttg = 0; + u64 ttd = 0; + + list_for_each_entry(e, &obd->obd_exports, exp_obd_chain) { + LASSERT(exp != e); + ttg += e->exp_target_data.ted_grant; + ttg += e->exp_target_data.ted_pending; + ttd += e->exp_target_data.ted_dirty; + } + if (tgd->tgd_tot_granted < ted->ted_grant) + CERROR("%s: cli %s/%p: tot_granted %llu < ted_grant %ld, corrected to %llu", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + tgd->tgd_tot_granted, ted->ted_grant, ttg); + if (tgd->tgd_tot_dirty < ted->ted_dirty) + CERROR("%s: cli %s/%p: tot_dirty %llu < ted_dirty %ld, corrected to %llu", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + tgd->tgd_tot_dirty, ted->ted_dirty, ttd); + tgd->tgd_tot_granted = ttg; + tgd->tgd_tot_dirty = ttd; + } else { + tgd->tgd_tot_granted -= ted->ted_grant; + tgd->tgd_tot_dirty -= ted->ted_dirty; + } + ted->ted_grant = 0; + ted->ted_dirty = 0; + + if (tgd->tgd_tot_pending < ted->ted_pending) { + CERROR("%s: tot_pending %llu < cli %s/%p ted_pending %ld\n", + obd->obd_name, tgd->tgd_tot_pending, + exp->exp_client_uuid.uuid, exp, ted->ted_pending); + } + /* tgd_tot_pending is handled in tgt_grant_commit as bulk + * commmits */ + spin_unlock(&tgd->tgd_grant_lock); +} +EXPORT_SYMBOL(tgt_grant_discard); + +/** + * Process grant information from incoming bulk read request. + * + * Extract grant information packed in obdo structure (OBD_MD_FLGRANT set in + * o_valid). Bulk reads usually comes with grant announcements (number of dirty + * blocks, remaining amount of grant space, ...) and could also include a grant + * shrink request. Unlike bulk write, no additional grant space is returned on + * bulk read request. + * + * \param[in] env is the lu environment provided by the caller + * \param[in] exp is the export of the client which sent the request + * \param[in,out] oa is the incoming obdo sent by the client + */ +void tgt_grant_prepare_read(const struct lu_env *env, + struct obd_export *exp, struct obdo *oa) +{ + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + int do_shrink; + u64 left = 0; + + ENTRY; + + if (!oa) + RETURN_EXIT; + + if ((oa->o_valid & OBD_MD_FLGRANT) == 0) + /* The read request does not contain any grant + * information */ + RETURN_EXIT; + + if ((oa->o_valid & OBD_MD_FLFLAGS) && + (oa->o_flags & OBD_FL_SHRINK_GRANT)) { + /* To process grant shrink request, we need to know how much + * available space remains on the backend filesystem. + * Shrink requests are not so common, we always get fresh + * statfs information. 
*/ + tgt_grant_statfs(env, exp, 1, NULL); + + /* protect all grant counters */ + spin_lock(&tgd->tgd_grant_lock); + + /* Grab free space from cached statfs data and take out space + * already granted to clients as well as reserved space */ + left = tgt_grant_space_left(exp); + + /* all set now to proceed with shrinking */ + do_shrink = 1; + } else { + /* no grant shrinking request packed in the obdo and + * since we don't grant space back on reads, no point + * in running statfs, so just skip it and process + * incoming grant data directly. */ + spin_lock(&tgd->tgd_grant_lock); + do_shrink = 0; + } + + /* extract incoming grant information provided by the client and + * inflate grant counters if required */ + tgt_grant_incoming(env, exp, oa, tgt_grant_chunk(exp, lut, NULL)); + + /* unlike writes, we don't return grants back on reads unless a grant + * shrink request was packed and we decided to turn it down. */ + if (do_shrink) + tgt_grant_shrink(exp, oa, left); + else + oa->o_grant = 0; + + if (!exp_grant_param_supp(exp)) + oa->o_grant = tgt_grant_deflate(tgd, oa->o_grant); + spin_unlock(&tgd->tgd_grant_lock); + EXIT; +} +EXPORT_SYMBOL(tgt_grant_prepare_read); + +/** + * Process grant information from incoming bulk write request. + * + * This function extracts client's grant announcements from incoming bulk write + * request and attempts to allocate grant space for network buffers that need it + * (i.e. OBD_BRW_FROM_GRANT not set in rnb_fags). + * Network buffers which aren't granted the OBD_BRW_GRANTED flag should not + * proceed further and should fail with -ENOSPC. + * Whenever possible, additional grant space will be returned to the client + * in the bulk write reply. + * tgt_grant_prepare_write() must be called before writting any buffers to + * the backend storage. This function works in pair with tgt_grant_commit() + * which must be invoked once all buffers have been written to disk in order + * to release space from the pending grant counter. + * + * \param[in] env LU environment provided by the caller + * \param[in] exp export of the client which sent the request + * \param[in] oa incoming obdo sent by the client + * \param[in] rnb list of network buffers + * \param[in] niocount number of network buffers in the list + */ +void tgt_grant_prepare_write(const struct lu_env *env, + struct obd_export *exp, struct obdo *oa, + struct niobuf_remote *rnb, int niocount) +{ + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + u64 left; + int from_cache; + int force = 0; /* can use cached data intially */ + long chunk = tgt_grant_chunk(exp, lut, NULL); + + ENTRY; + +refresh: + /* get statfs information from OSD layer */ + tgt_grant_statfs(env, exp, force, &from_cache); + + spin_lock(&tgd->tgd_grant_lock); /* protect all grant counters */ + + /* Grab free space from cached statfs data and take out space already + * granted to clients as well as reserved space */ + left = tgt_grant_space_left(exp); + + /* Get fresh statfs data if we are short in ungranted space */ + if (from_cache && left < 32 * chunk) { + spin_unlock(&tgd->tgd_grant_lock); + CDEBUG(D_CACHE, "%s: fs has no space left and statfs too old\n", + obd->obd_name); + force = 1; + goto refresh; + } + + /* When close to free space exhaustion, trigger a sync to force + * writeback cache to consume required space immediately and release as + * much space as possible. 
*/ + if (!obd->obd_recovering && force != 2 && left < chunk) { + bool from_grant = true; + int i; + + /* That said, it is worth running a sync only if some pages did + * not consume grant space on the client and could thus fail + * with ENOSPC later in tgt_grant_check() */ + for (i = 0; i < niocount; i++) + if (!(rnb[i].rnb_flags & OBD_BRW_FROM_GRANT)) + from_grant = false; + + if (!from_grant) { + /* at least one network buffer requires acquiring grant + * space on the server */ + spin_unlock(&tgd->tgd_grant_lock); + /* discard errors, at least we tried ... */ + dt_sync(env, lut->lut_bottom); + force = 2; + goto refresh; + } + } + + /* extract incoming grant information provided by the client, + * and inflate grant counters if required */ + tgt_grant_incoming(env, exp, oa, chunk); + + /* check limit */ + tgt_grant_check(env, exp, oa, rnb, niocount, &left); + + if (!(oa->o_valid & OBD_MD_FLGRANT)) { + spin_unlock(&tgd->tgd_grant_lock); + RETURN_EXIT; + } + + /* if OBD_FL_SHRINK_GRANT is set, the client is willing to release some + * grant space. */ + if ((oa->o_valid & OBD_MD_FLFLAGS) && + (oa->o_flags & OBD_FL_SHRINK_GRANT)) + tgt_grant_shrink(exp, oa, left); + else + /* grant more space back to the client if possible */ + oa->o_grant = tgt_grant_alloc(exp, oa->o_grant, oa->o_undirty, + left, chunk, true); + + if (!exp_grant_param_supp(exp)) + oa->o_grant = tgt_grant_deflate(tgd, oa->o_grant); + spin_unlock(&tgd->tgd_grant_lock); + EXIT; +} +EXPORT_SYMBOL(tgt_grant_prepare_write); + +/** + * Consume grant space reserved for object creation. + * + * Grant space is allocated to the local self export for object precreation. + * This is required to prevent object precreation from consuming grant space + * allocated to client nodes for the data writeback cache. + * This function consumes enough space to create \a nr objects and allocates + * more grant space to the self export for future precreation requests, if + * possible. 
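The precreate accounting described above boils down to clamping the object count so that the number of objects times the per-object space cost fits into the grant already held plus the remaining ungranted space. The sketch below is illustrative only; the per-object cost is made up and merely stands in for ddp_inodespace.
#include <stdint.h>
#include <stdio.h>
#define INODESPACE 8192		/* hypothetical bytes consumed per precreated object */
/* clamp *nr so that nr * INODESPACE fits into held grant plus ungranted
 * space; report -1 (ENOSPC-like) when nothing can be created at all */
static int precreate_clamp(uint64_t held_grant, uint64_t left, int64_t *nr)
{
	uint64_t wanted = (uint64_t)*nr * INODESPACE;
	if (wanted > held_grant + left) {
		*nr = (int64_t)((held_grant + left) / INODESPACE);
		if (*nr == 0)
			return -1;	/* no space left for precreation */
	}
	return 0;
}
int main(void)
{
	int64_t nr = 10000;		/* objects requested by the precreate RPC */
	/* 16MB of grant held, 48MB ungranted: only 8192 objects fit */
	if (precreate_clamp(16 << 20, 48 << 20, &nr) == 0)
		printf("precreating %lld objects\n", (long long)nr);
	return 0;
}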
+ * + * \param[in] env LU environment provided by the caller + * \param[in] exp export holding the grant space for precreation (= self + * export currently) + * \param[in] nr number of objects to be created + * + * \retval >= 0 amount of grant space allocated to the precreate request + * \retval -ENOSPC on failure + */ +long tgt_grant_create(const struct lu_env *env, struct obd_export *exp, s64 *nr) +{ + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct tg_export_data *ted = &exp->exp_target_data; + u64 left = 0; + unsigned long wanted; + unsigned long granted; + ENTRY; + + if (exp->exp_obd->obd_recovering || + lut->lut_dt_conf.ddp_inodespace == 0) + /* don't enforce grant during recovery */ + RETURN(0); + + /* Update statfs data if required */ + tgt_grant_statfs(env, exp, 1, NULL); + + /* protect all grant counters */ + spin_lock(&tgd->tgd_grant_lock); + + /* fail precreate request if there is not enough blocks available for + * writing */ + if (tgd->tgd_osfs.os_bavail - (ted->ted_grant >> tgd->tgd_blockbits) < + (tgd->tgd_osfs.os_blocks >> 10)) { + spin_unlock(&tgd->tgd_grant_lock); + CDEBUG(D_RPCTRACE, "%s: not enough space for create %llu\n", + exp->exp_obd->obd_name, + tgd->tgd_osfs.os_bavail * tgd->tgd_osfs.os_blocks); + RETURN(-ENOSPC); + } + + /* Grab free space from cached statfs data and take out space + * already granted to clients as well as reserved space */ + left = tgt_grant_space_left(exp); + + /* compute how much space is required to handle the precreation + * request */ + wanted = *nr * lut->lut_dt_conf.ddp_inodespace; + if (wanted > ted->ted_grant + left) { + /* that's beyond what remains, adjust the number of objects that + * can be safely precreated */ + wanted = ted->ted_grant + left; + *nr = wanted / lut->lut_dt_conf.ddp_inodespace; + if (*nr == 0) { + /* we really have no space any more for precreation, + * fail the precreate request with ENOSPC */ + spin_unlock(&tgd->tgd_grant_lock); + RETURN(-ENOSPC); + } + /* compute space needed for the new number of creations */ + wanted = *nr * lut->lut_dt_conf.ddp_inodespace; + } + LASSERT(wanted <= ted->ted_grant + left); + + if (wanted <= ted->ted_grant) { + /* we've enough grant space to handle this precreate request */ + ted->ted_grant -= wanted; + } else { + /* we need to take some space from the ungranted pool */ + tgd->tgd_tot_granted += wanted - ted->ted_grant; + left -= wanted - ted->ted_grant; + ted->ted_grant = 0; + } + granted = wanted; + ted->ted_pending += granted; + tgd->tgd_tot_pending += granted; + + /* grant more space for precreate purpose if possible. */ + wanted = OST_MAX_PRECREATE * lut->lut_dt_conf.ddp_inodespace / 2; + if (wanted > ted->ted_grant) { + long chunk; + + /* always try to book enough space to handle a large precreate + * request */ + chunk = tgt_grant_chunk(exp, lut, NULL); + wanted -= ted->ted_grant; + tgt_grant_alloc(exp, ted->ted_grant, wanted, left, chunk, + false); + } + spin_unlock(&tgd->tgd_grant_lock); + RETURN(granted); +} +EXPORT_SYMBOL(tgt_grant_create); + +/** + * Release grant space added to the pending counter by tgt_grant_prepare_write() + * + * Update pending grant counter once buffers have been written to the disk. 
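/*
 * Illustrative sketch, not part of the patch: a simplified model of the
 * bookkeeping tgt_grant_commit() below performs once buffers hit disk --
 * the reserved amount is subtracted from the per-export and global pending
 * counters, with asserts standing in for the CERROR()/LBUG() underflow
 * checks. fake_counters and fake_grant_commit() are hypothetical names.
 */
#include <assert.h>
#include <stdint.h>

struct fake_counters {
        uint64_t exp_pending;   /* per-export reserved space */
        uint64_t tot_granted;   /* space granted to all clients */
        uint64_t tot_pending;   /* space reserved for in-flight writes */
};

static void fake_grant_commit(struct fake_counters *c, uint64_t pending)
{
        if (pending == 0)
                return;
        /* each counter must be able to absorb the release */
        assert(c->exp_pending >= pending);
        assert(c->tot_granted >= pending);
        assert(c->tot_pending >= pending);
        c->exp_pending -= pending;
        c->tot_granted -= pending;
        c->tot_pending -= pending;
}

int main(void)
{
        struct fake_counters c = { 4096, 8192, 4096 };

        fake_grant_commit(&c, 4096);
        return c.tot_pending == 0 ? 0 : 1;
}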
+ * + * \param[in] exp export of the client which sent the request + * \param[in] pending amount of reserved space to be released + * \param[in] rc return code of pre-commit operations + */ +void tgt_grant_commit(struct obd_export *exp, unsigned long pending, + int rc) +{ + struct tg_grants_data *tgd = &exp->exp_obd->u.obt.obt_lut->lut_tgd; + + ENTRY; + + /* get space accounted in tot_pending for the I/O, set in + * tgt_grant_check() */ + if (pending == 0) + RETURN_EXIT; + + spin_lock(&tgd->tgd_grant_lock); + /* Don't update statfs data for errors raised before commit (e.g. + * bulk transfer failed, ...) since we know those writes have not been + * processed. For other errors hit during commit, we cannot really tell + * whether or not something was written, so we update statfs data. + * In any case, this should not be fatal since we always get fresh + * statfs data before failing a request with ENOSPC */ + if (rc == 0) { + spin_lock(&tgd->tgd_osfs_lock); + /* Take pending out of cached statfs data */ + tgd->tgd_osfs.os_bavail -= min_t(u64, + tgd->tgd_osfs.os_bavail, + pending >> tgd->tgd_blockbits); + if (tgd->tgd_statfs_inflight) + /* someone is running statfs and want to be notified of + * writes happening meanwhile */ + tgd->tgd_osfs_inflight += pending; + spin_unlock(&tgd->tgd_osfs_lock); + } + + if (exp->exp_target_data.ted_pending < pending) { + CERROR("%s: cli %s/%p ted_pending(%lu) < grant_used(%lu)\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, + exp->exp_target_data.ted_pending, pending); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + exp->exp_target_data.ted_pending -= pending; + + if (tgd->tgd_tot_granted < pending) { + CERROR("%s: cli %s/%p tot_granted(%llu) < grant_used(%lu)\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, + tgd->tgd_tot_granted, pending); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + tgd->tgd_tot_granted -= pending; + + if (tgd->tgd_tot_pending < pending) { + CERROR("%s: cli %s/%p tot_pending(%llu) < grant_used(%lu)\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, + tgd->tgd_tot_pending, pending); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + tgd->tgd_tot_pending -= pending; + spin_unlock(&tgd->tgd_grant_lock); + EXIT; +} +EXPORT_SYMBOL(tgt_grant_commit); + +struct tgt_grant_cb { + /* commit callback structure */ + struct dt_txn_commit_cb tgc_cb; + /* export associated with the bulk write */ + struct obd_export *tgc_exp; + /* pending grant to be released */ + unsigned long tgc_granted; +}; + +/** + * Callback function for grant releasing + * + * Release grant space reserved by the client node. + * + * \param[in] env execution environment + * \param[in] th transaction handle + * \param[in] cb callback data + * \param[in] err error code + */ +static void tgt_grant_commit_cb(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct tgt_grant_cb *tgc; + + tgc = container_of(cb, struct tgt_grant_cb, tgc_cb); + + tgt_grant_commit(tgc->tgc_exp, tgc->tgc_granted, err); + class_export_cb_put(tgc->tgc_exp); + OBD_FREE_PTR(tgc); +} + +/** + * Add callback for grant releasing + * + * Register a commit callback to release grant space. 
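/*
 * Illustrative sketch, not part of the patch: the shape of the
 * commit-callback pattern used by tgt_grant_commit_cb_add() below -- a
 * small context (export reference plus the amount to release) is allocated,
 * queued on the transaction, and freed by the callback once the transaction
 * commits. This user-space model replaces the transaction handle with a
 * plain singly linked list; all names are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

struct fake_cb {
        struct fake_cb *next;
        void (*func)(struct fake_cb *cb, int err);
        unsigned long granted;
};

static struct fake_cb *txn_cbs; /* stands in for the transaction handle */

static void release_grant_cb(struct fake_cb *cb, int err)
{
        printf("commit err=%d: release %lu bytes of pending grant\n",
               err, cb->granted);
        free(cb);
}

static int add_commit_cb(unsigned long granted)
{
        struct fake_cb *cb = malloc(sizeof(*cb));

        if (!cb)
                return -1;      /* -ENOMEM in the real code */
        cb->func = release_grant_cb;
        cb->granted = granted;
        cb->next = txn_cbs;
        txn_cbs = cb;
        return 0;
}

static void commit_transaction(int err)
{
        while (txn_cbs) {
                struct fake_cb *cb = txn_cbs;

                txn_cbs = cb->next;
                cb->func(cb, err);
        }
}

int main(void)
{
        add_commit_cb(65536);
        commit_transaction(0);
        return 0;
}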
+ * + * \param[in] th transaction handle + * \param[in] exp OBD export of client + * \param[in] granted amount of grant space to be released upon commit + * + * \retval 0 on successful callback adding + * \retval negative value on error + */ +int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp, + unsigned long granted) +{ + struct tgt_grant_cb *tgc; + struct dt_txn_commit_cb *dcb; + int rc; + ENTRY; + + OBD_ALLOC_PTR(tgc); + if (tgc == NULL) + RETURN(-ENOMEM); + + tgc->tgc_exp = class_export_cb_get(exp); + tgc->tgc_granted = granted; + + dcb = &tgc->tgc_cb; + dcb->dcb_func = tgt_grant_commit_cb; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "tgt_grant_commit_cb", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) { + class_export_cb_put(tgc->tgc_exp); + OBD_FREE_PTR(tgc); + } + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_grant_commit_cb_add); + +/** + * Show estimate of total amount of dirty data on clients. + * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: 0 on success + * negative value on error + */ +ssize_t tot_dirty_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd; + + tgd = &obd->u.obt.obt_lut->lut_tgd; + return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_dirty); +} +EXPORT_SYMBOL(tot_dirty_show); + +/** + * Show total amount of space granted to clients. + * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: 0 on success + * negative value on error + */ +ssize_t tot_granted_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd; + + tgd = &obd->u.obt.obt_lut->lut_tgd; + return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_granted); +} +EXPORT_SYMBOL(tot_granted_show); + +/** + * Show total amount of space used by IO in progress. + * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: 0 on success + * negative value on error + */ +ssize_t tot_pending_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd; + + tgd = &obd->u.obt.obt_lut->lut_tgd; + return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_pending); +} +EXPORT_SYMBOL(tot_pending_show); + +/** + * Show if grants compatibility mode is disabled. + * + * When tgd_grant_compat_disable is set, we don't grant any space to clients + * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such + * a client is inflated since it consumes PAGE_SIZE of grant space per + * block, (i.e. typically 4kB units), but underlaying file system might have + * block size bigger than page size, e.g. ZFS. See LU-2049 for details. 
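/*
 * Illustrative sketch, not part of the patch: why grant "inflation" exists
 * for clients without OBD_CONNECT_GRANT_PARAM, as described above -- such a
 * client accounts dirty data in PAGE_SIZE units while the backend may
 * allocate in larger blocks (e.g. 128KiB on ZFS), so the server must assume
 * the worst-case per-page overhead. The helper below is a hypothetical
 * worst-case model, not the actual Lustre formula.
 */
#include <stdint.h>
#include <stdio.h>

#define CLIENT_PAGE_SIZE 4096ULL

/* worst case: every client page could land in its own backend block */
static uint64_t inflated_grant(uint64_t client_bytes, uint64_t backend_blksz)
{
        uint64_t pages = (client_bytes + CLIENT_PAGE_SIZE - 1) / CLIENT_PAGE_SIZE;

        return pages * (backend_blksz > CLIENT_PAGE_SIZE ?
                        backend_blksz : CLIENT_PAGE_SIZE);
}

int main(void)
{
        /* 1MiB of dirty 4KiB pages may pin 32MiB worth of 128KiB blocks */
        printf("%llu\n", (unsigned long long)inflated_grant(1 << 20, 128 << 10));
        return 0;
}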
+ * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: string length of @buf output on success + */ +ssize_t grant_compat_disable_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + + return scnprintf(buf, PAGE_SIZE, "%u\n", tgd->tgd_grant_compat_disable); +} +EXPORT_SYMBOL(grant_compat_disable_show); + +/** + * Change grant compatibility mode. + * + * Setting tgd_grant_compat_disable prohibit any space granting to clients + * not supporting OBD_CONNECT_GRANT_PARAM. See details above. + * + * @kobj kobject embedded in obd_device + * @attr unused + * @buffer string which represents mode + * 1: disable compatibility mode + * 0: enable compatibility mode + * @count @buffer length + * + * Return: @count on success + * negative number on error + */ +ssize_t grant_compat_disable_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + tgd->tgd_grant_compat_disable = val; + + return count; +} +EXPORT_SYMBOL(grant_compat_disable_store); diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_handler.c b/drivers/staging/lustrefsx/lustre/target/tgt_handler.c new file mode 100644 index 0000000000000..6be647b20cfa4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_handler.c @@ -0,0 +1,3028 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * lustre/target/tgt_handler.c + * + * Lustre Unified Target request handler code + * + * Author: Brian Behlendorf + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "tgt_internal.h" + +char *tgt_name(struct lu_target *tgt) +{ + LASSERT(tgt->lut_obd != NULL); + return tgt->lut_obd->obd_name; +} +EXPORT_SYMBOL(tgt_name); + +/* + * Generic code handling requests that have struct mdt_body passed in: + * + * - extract mdt_body from request and save it in @tsi, if present; + * + * - create lu_object, corresponding to the fid in mdt_body, and save it in + * @tsi; + * + * - if HAS_BODY flag is set for this request type check whether object + * actually exists on storage (lu_object_exists()). 
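/*
 * Illustrative sketch, not part of the patch: the validation flow that
 * tgt_mdt_body_unpack() (whose body follows) applies, reduced to its
 * skeleton -- ignore a zero FID, reject an insane FID, look the object up,
 * and require that it exists on storage only when the handler declared
 * HAS_BODY. The types and helpers here are hypothetical stand-ins.
 */
#include <errno.h>
#include <stdbool.h>

#define HAS_BODY 0x1u

struct fake_fid { unsigned long long seq, oid; };
struct fake_obj { bool exists; };

static bool fid_zero(const struct fake_fid *f) { return !f->seq && !f->oid; }
static bool fid_sane(const struct fake_fid *f) { return f->seq != 0; }

static int unpack_body(const struct fake_fid *fid, unsigned int flags,
                       struct fake_obj *(*find)(const struct fake_fid *))
{
        struct fake_obj *obj;

        if (fid_zero(fid))
                return 0;       /* old clients may legitimately send a zero FID */
        if (!fid_sane(fid))
                return -EINVAL;
        obj = find(fid);
        if (!obj)
                return -ENOMEM;
        if ((flags & HAS_BODY) && !obj->exists)
                return -ENOENT; /* body required but object missing on disk */
        return 0;
}

static struct fake_obj cached = { .exists = true };

static struct fake_obj *fake_find(const struct fake_fid *fid)
{
        (void)fid;
        return &cached;
}

int main(void)
{
        struct fake_fid fid = { .seq = 0x200000401ULL, .oid = 1 };

        return unpack_body(&fid, HAS_BODY, fake_find);
}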
+ * + */ +static int tgt_mdt_body_unpack(struct tgt_session_info *tsi, __u32 flags) +{ + const struct mdt_body *body; + struct lu_object *obj; + struct req_capsule *pill = tsi->tsi_pill; + int rc; + + ENTRY; + + body = req_capsule_client_get(pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EFAULT); + + tsi->tsi_mdt_body = body; + + if (!(body->mbo_valid & OBD_MD_FLID)) + RETURN(0); + + /* mdc_pack_body() doesn't check if fid is zero and set OBD_ML_FID + * in any case in pre-2.5 clients. Fix that here if needed */ + if (unlikely(fid_is_zero(&body->mbo_fid1))) + RETURN(0); + + if (!fid_is_sane(&body->mbo_fid1)) { + CERROR("%s: invalid FID: "DFID"\n", tgt_name(tsi->tsi_tgt), + PFID(&body->mbo_fid1)); + RETURN(-EINVAL); + } + + obj = lu_object_find(tsi->tsi_env, + &tsi->tsi_tgt->lut_bottom->dd_lu_dev, + &body->mbo_fid1, NULL); + if (!IS_ERR(obj)) { + if ((flags & HAS_BODY) && !lu_object_exists(obj)) { + lu_object_put(tsi->tsi_env, obj); + rc = -ENOENT; + } else { + tsi->tsi_corpus = obj; + rc = 0; + } + } else { + rc = PTR_ERR(obj); + } + + tsi->tsi_fid = body->mbo_fid1; + + RETURN(rc); +} + +/** + * Validate oa from client. + * If the request comes from 2.0 clients, currently only RSVD seq and IDIF + * req are valid. + * a. objects in Single MDT FS seq = FID_SEQ_OST_MDT0, oi_id != 0 + * b. Echo objects(seq = 2), old echo client still use oi_id/oi_seq to + * pack ost_id. Because non-zero oi_seq will make it diffcult to tell + * whether this is oi_fid or real ostid. So it will check + * OBD_CONNECT_FID, then convert the ostid to FID for old client. + * c. Old FID-disable osc will send IDIF. + * d. new FID-enable osc/osp will send normal FID. + * + * And also oi_id/f_oid should always start from 1. oi_id/f_oid = 0 will + * be used for LAST_ID file, and only being accessed inside OST now. + */ +int tgt_validate_obdo(struct tgt_session_info *tsi, struct obdo *oa) +{ + struct ost_id *oi = &oa->o_oi; + u64 seq = ostid_seq(oi); + u64 id = ostid_id(oi); + int rc; + ENTRY; + + if (unlikely(!(exp_connect_flags(tsi->tsi_exp) & OBD_CONNECT_FID) && + fid_seq_is_echo(seq))) { + /* Sigh 2.[123] client still sends echo req with oi_id = 0 + * during create, and we will reset this to 1, since this + * oi_id is basically useless in the following create process, + * but oi_id == 0 will make it difficult to tell whether it is + * real FID or ost_id. */ + oi->oi_fid.f_seq = FID_SEQ_ECHO; + oi->oi_fid.f_oid = id ?: 1; + oi->oi_fid.f_ver = 0; + } else { + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + + if (unlikely((oa->o_valid & OBD_MD_FLID) && id == 0)) + GOTO(out, rc = -EPROTO); + + /* Note: this check might be forced in 2.5 or 2.6, i.e. 
+ * all of the requests are required to setup FLGROUP */ + if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) { + ostid_set_seq_mdt0(oi); + oa->o_valid |= OBD_MD_FLGROUP; + seq = ostid_seq(oi); + } + + if (unlikely(!(fid_seq_is_idif(seq) || fid_seq_is_mdt0(seq) || + fid_seq_is_norm(seq) || fid_seq_is_echo(seq)))) + GOTO(out, rc = -EPROTO); + + rc = ostid_to_fid(&tti->tti_fid1, oi, + tsi->tsi_tgt->lut_lsd.lsd_osd_index); + if (unlikely(rc != 0)) + GOTO(out, rc); + + oi->oi_fid = tti->tti_fid1; + } + + RETURN(0); + +out: + CERROR("%s: client %s sent bad object "DOSTID": rc = %d\n", + tgt_name(tsi->tsi_tgt), obd_export_nid2str(tsi->tsi_exp), + seq, id, rc); + return rc; +} +EXPORT_SYMBOL(tgt_validate_obdo); + +static int tgt_io_data_unpack(struct tgt_session_info *tsi, struct ost_id *oi) +{ + unsigned max_brw; + struct niobuf_remote *rnb; + struct obd_ioobj *ioo; + int obj_count; + + ENTRY; + + ioo = req_capsule_client_get(tsi->tsi_pill, &RMF_OBD_IOOBJ); + if (ioo == NULL) + RETURN(-EPROTO); + + rnb = req_capsule_client_get(tsi->tsi_pill, &RMF_NIOBUF_REMOTE); + if (rnb == NULL) + RETURN(-EPROTO); + + max_brw = ioobj_max_brw_get(ioo); + if (unlikely((max_brw & (max_brw - 1)) != 0)) { + CERROR("%s: client %s sent bad ioobj max %u for "DOSTID + ": rc = %d\n", tgt_name(tsi->tsi_tgt), + obd_export_nid2str(tsi->tsi_exp), max_brw, + POSTID(oi), -EPROTO); + RETURN(-EPROTO); + } + ioo->ioo_oid = *oi; + + obj_count = req_capsule_get_size(tsi->tsi_pill, &RMF_OBD_IOOBJ, + RCL_CLIENT) / sizeof(*ioo); + if (obj_count == 0) { + CERROR("%s: short ioobj\n", tgt_name(tsi->tsi_tgt)); + RETURN(-EPROTO); + } else if (obj_count > 1) { + CERROR("%s: too many ioobjs (%d)\n", tgt_name(tsi->tsi_tgt), + obj_count); + RETURN(-EPROTO); + } + + if (ioo->ioo_bufcnt == 0) { + CERROR("%s: ioo has zero bufcnt\n", tgt_name(tsi->tsi_tgt)); + RETURN(-EPROTO); + } + + if (ioo->ioo_bufcnt > PTLRPC_MAX_BRW_PAGES) { + DEBUG_REQ(D_RPCTRACE, tgt_ses_req(tsi), + "bulk has too many pages (%d)", + ioo->ioo_bufcnt); + RETURN(-EPROTO); + } + + RETURN(0); +} + +static int tgt_ost_body_unpack(struct tgt_session_info *tsi, __u32 flags) +{ + struct ost_body *body; + struct req_capsule *pill = tsi->tsi_pill; + struct lu_nodemap *nodemap; + int rc; + + ENTRY; + + body = req_capsule_client_get(pill, &RMF_OST_BODY); + if (body == NULL) + RETURN(-EFAULT); + + rc = tgt_validate_obdo(tsi, &body->oa); + if (rc) + RETURN(rc); + + nodemap = nodemap_get_from_exp(tsi->tsi_exp); + if (IS_ERR(nodemap)) + RETURN(PTR_ERR(nodemap)); + + body->oa.o_uid = nodemap_map_id(nodemap, NODEMAP_UID, + NODEMAP_CLIENT_TO_FS, + body->oa.o_uid); + body->oa.o_gid = nodemap_map_id(nodemap, NODEMAP_GID, + NODEMAP_CLIENT_TO_FS, + body->oa.o_gid); + body->oa.o_projid = nodemap_map_id(nodemap, NODEMAP_PROJID, + NODEMAP_CLIENT_TO_FS, + body->oa.o_projid); + nodemap_putref(nodemap); + + tsi->tsi_ost_body = body; + tsi->tsi_fid = body->oa.o_oi.oi_fid; + + if (req_capsule_has_field(pill, &RMF_OBD_IOOBJ, RCL_CLIENT)) { + rc = tgt_io_data_unpack(tsi, &body->oa.o_oi); + if (rc < 0) + RETURN(rc); + } + + if (!(body->oa.o_valid & OBD_MD_FLID)) { + if (flags & HAS_BODY) { + CERROR("%s: OBD_MD_FLID flag is not set in ost_body but OID/FID is mandatory with HAS_BODY\n", + tgt_name(tsi->tsi_tgt)); + RETURN(-EPROTO); + } else { + RETURN(0); + } + } + + ost_fid_build_resid(&tsi->tsi_fid, &tsi->tsi_resid); + + /* + * OST doesn't get object in advance for further use to prevent + * situations with nested object_find which is potential deadlock. 
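/*
 * Illustrative sketch, not part of the patch: the sanity checks
 * tgt_io_data_unpack() above applies to a bulk descriptor, in isolation --
 * max_brw must be a power of two, exactly one ioobj may be present, and the
 * buffer count must be non-zero and bounded. MAX_PAGES stands in for
 * PTLRPC_MAX_BRW_PAGES; all names are hypothetical.
 */
#include <errno.h>

#define MAX_PAGES 1024

static int check_ioobj(unsigned int max_brw, int obj_count, int bufcnt)
{
        if (max_brw == 0 || (max_brw & (max_brw - 1)) != 0)
                return -EPROTO; /* max_brw must be a power of two */
        if (obj_count != 1)
                return -EPROTO; /* exactly one object per bulk RPC */
        if (bufcnt <= 0 || bufcnt > MAX_PAGES)
                return -EPROTO; /* empty or oversized page list */
        return 0;
}

int main(void)
{
        return check_ioobj(16, 1, 256); /* a well-formed descriptor */
}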
+ */ + tsi->tsi_corpus = NULL; + RETURN(rc); +} + +/* + * Do necessary preprocessing according to handler ->th_flags. + */ +static int tgt_request_preprocess(struct tgt_session_info *tsi, + struct tgt_handler *h, + struct ptlrpc_request *req) +{ + struct req_capsule *pill = tsi->tsi_pill; + __u32 flags = h->th_flags; + int rc = 0; + + ENTRY; + + if (tsi->tsi_preprocessed) + RETURN(0); + + LASSERT(h->th_act != NULL); + LASSERT(h->th_opc == lustre_msg_get_opc(req->rq_reqmsg)); + LASSERT(current->journal_info == NULL); + + LASSERT(ergo(flags & (HAS_BODY | HAS_REPLY), + h->th_fmt != NULL)); + if (h->th_fmt != NULL) { + req_capsule_set(pill, h->th_fmt); + if (req_capsule_has_field(pill, &RMF_MDT_BODY, RCL_CLIENT) && + req_capsule_field_present(pill, &RMF_MDT_BODY, + RCL_CLIENT)) { + rc = tgt_mdt_body_unpack(tsi, flags); + if (rc < 0) + RETURN(rc); + } else if (req_capsule_has_field(pill, &RMF_OST_BODY, + RCL_CLIENT) && + req_capsule_field_present(pill, &RMF_OST_BODY, + RCL_CLIENT)) { + rc = tgt_ost_body_unpack(tsi, flags); + if (rc < 0) + RETURN(rc); + } + } + + if (flags & IS_MUTABLE && tgt_conn_flags(tsi) & OBD_CONNECT_RDONLY) + RETURN(-EROFS); + + if (flags & HAS_KEY) { + struct ldlm_request *dlm_req; + + LASSERT(h->th_fmt != NULL); + + dlm_req = req_capsule_client_get(pill, &RMF_DLM_REQ); + if (dlm_req != NULL) { + union ldlm_wire_policy_data *policy = + &dlm_req->lock_desc.l_policy_data; + + if (unlikely(dlm_req->lock_desc.l_resource.lr_type == + LDLM_IBITS && + (policy->l_inodebits.bits | + policy->l_inodebits.try_bits) == 0)) { + /* + * Lock without inodebits makes no sense and + * will oops later in ldlm. If client miss to + * set such bits, do not trigger ASSERTION. + * + * For liblustre flock case, it maybe zero. + */ + rc = -EPROTO; + } else { + tsi->tsi_dlm_req = dlm_req; + } + } else { + rc = -EFAULT; + } + } + tsi->tsi_preprocessed = 1; + RETURN(rc); +} + +/* + * Invoke handler for this request opc. Also do necessary preprocessing + * (according to handler ->th_flags), and post-processing (setting of + * ->last_{xid,committed}). + */ +static int tgt_handle_request0(struct tgt_session_info *tsi, + struct tgt_handler *h, + struct ptlrpc_request *req) +{ + int serious = 0; + int rc; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + ENTRY; + + + /* When dealing with sec context requests, no export is associated yet, + * because these requests are sent before *_CONNECT requests. + * A NULL req->rq_export means the normal *_common_slice handlers will + * not be called, because there is no reference to the target. + * So deal with them by hand and jump directly to target_send_reply(). + */ + switch (opc) { + case SEC_CTX_INIT: + case SEC_CTX_INIT_CONT: + case SEC_CTX_FINI: + CFS_FAIL_TIMEOUT(OBD_FAIL_SEC_CTX_HDL_PAUSE, cfs_fail_val); + GOTO(out, rc = 0); + } + + /* + * Checking for various OBD_FAIL_$PREF_$OPC_NET codes. _Do_ not try + * to put same checks into handlers like mdt_close(), mdt_reint(), + * etc., without talking to mdt authors first. Checking same thing + * there again is useless and returning 0 error without packing reply + * is buggy! Handlers either pack reply or return error. + * + * We return 0 here and do not send any reply in order to emulate + * network failure. Do not send any reply in case any of NET related + * fail_id has occured. 
+ */ + if (OBD_FAIL_CHECK_ORSET(h->th_fail_id, OBD_FAIL_ONCE)) + RETURN(0); + if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT && + OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_MULTI_NET))) + RETURN(0); + + /* drop OUT_UPDATE rpc */ + if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == OUT_UPDATE && + OBD_FAIL_CHECK(OBD_FAIL_OUT_UPDATE_DROP))) + RETURN(0); + + rc = tgt_request_preprocess(tsi, h, req); + /* pack reply if reply format is fixed */ + if (rc == 0 && h->th_flags & HAS_REPLY) { + /* Pack reply */ + if (req_capsule_has_field(tsi->tsi_pill, &RMF_MDT_MD, + RCL_SERVER)) + req_capsule_set_size(tsi->tsi_pill, &RMF_MDT_MD, + RCL_SERVER, + tsi->tsi_mdt_body->mbo_eadatasize); + if (req_capsule_has_field(tsi->tsi_pill, &RMF_LOGCOOKIES, + RCL_SERVER)) + req_capsule_set_size(tsi->tsi_pill, &RMF_LOGCOOKIES, + RCL_SERVER, 0); + if (req_capsule_has_field(tsi->tsi_pill, &RMF_ACL, RCL_SERVER)) + req_capsule_set_size(tsi->tsi_pill, + &RMF_ACL, RCL_SERVER, + LUSTRE_POSIX_ACL_MAX_SIZE_OLD); + + if (req_capsule_has_field(tsi->tsi_pill, &RMF_SHORT_IO, + RCL_SERVER)) { + struct niobuf_remote *remote_nb = + req_capsule_client_get(tsi->tsi_pill, + &RMF_NIOBUF_REMOTE); + struct ost_body *body = tsi->tsi_ost_body; + + req_capsule_set_size(tsi->tsi_pill, &RMF_SHORT_IO, + RCL_SERVER, + (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) ? + remote_nb[0].rnb_len : 0); + } + if (req_capsule_has_field(tsi->tsi_pill, &RMF_FILE_ENCCTX, + RCL_SERVER)) + req_capsule_set_size(tsi->tsi_pill, &RMF_FILE_ENCCTX, + RCL_SERVER, 0); + + rc = req_capsule_server_pack(tsi->tsi_pill); + } + + if (likely(rc == 0)) { + /* + * Process request, there can be two types of rc: + * 1) errors with msg unpack/pack, other failures outside the + * operation itself. This is counted as serious errors; + * 2) errors during fs operation, should be placed in rq_status + * only + */ + rc = h->th_act(tsi); + if (!is_serious(rc) && + !req->rq_no_reply && req->rq_reply_state == NULL) { + DEBUG_REQ(D_ERROR, req, + "%s: %s handler did not pack reply but returned no error", + tgt_name(tsi->tsi_tgt), h->th_name); + LBUG(); + } + serious = is_serious(rc); + rc = clear_serious(rc); + } else { + serious = 1; + } + + req->rq_status = rc; + + /* + * ELDLM_* codes which > 0 should be in rq_status only as well as + * all non-serious errors. + */ + if (rc > 0 || !serious) + rc = 0; + + LASSERT(current->journal_info == NULL); + + if (likely(rc == 0 && req->rq_export)) + target_committed_to_req(req); + +out: + target_send_reply(req, rc, tsi->tsi_reply_fail_id); + RETURN(0); +} + +static int tgt_filter_recovery_request(struct ptlrpc_request *req, + struct obd_device *obd, int *process) +{ + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case MDS_DISCONNECT: + case OST_DISCONNECT: + case OBD_IDX_READ: + *process = 1; + RETURN(0); + case MDS_CLOSE: + case MDS_SYNC: /* used in unmounting */ + case OBD_PING: + case MDS_REINT: + case OUT_UPDATE: + case SEQ_QUERY: + case FLD_QUERY: + case FLD_READ: + case LDLM_ENQUEUE: + case OST_CREATE: + case OST_DESTROY: + case OST_PUNCH: + case OST_SETATTR: + case OST_SYNC: + case OST_WRITE: + case MDS_HSM_PROGRESS: + case MDS_HSM_STATE_SET: + case MDS_HSM_REQUEST: + case OST_FALLOCATE: + *process = target_queue_recovery_request(req, obd); + RETURN(0); + + default: + DEBUG_REQ(D_ERROR, req, "not permitted during recovery"); + *process = -EAGAIN; + RETURN(0); + } +} + +/* + * Handle recovery. 
Return: + * +1: continue request processing; + * -ve: abort immediately with the given error code; + * 0: send reply with error code in req->rq_status; + */ +static int tgt_handle_recovery(struct ptlrpc_request *req, int reply_fail_id) +{ + ENTRY; + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case MDS_CONNECT: + case OST_CONNECT: + case MGS_CONNECT: + case SEC_CTX_INIT: + case SEC_CTX_INIT_CONT: + case SEC_CTX_FINI: + RETURN(+1); + } + + if (!req->rq_export->exp_obd->obd_replayable) + RETURN(+1); + + /* sanity check: if the xid matches, the request must be marked as a + * resent or replayed */ + if (req_can_reconstruct(req, NULL) == 1) { + if (!(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY))) { + DEBUG_REQ(D_WARNING, req, + "rq_xid=%llu matches saved XID, expected REPLAY or RESENT flag (%x)", + req->rq_xid, + lustre_msg_get_flags(req->rq_reqmsg)); + req->rq_status = -ENOTCONN; + RETURN(-ENOTCONN); + } + } + /* else: note the opposite is not always true; a RESENT req after a + * failover will usually not match the last_xid, since it was likely + * never committed. A REPLAYed request will almost never match the + * last xid, however it could for a committed, but still retained, + * open. */ + + /* Check for aborted recovery... */ + if (unlikely(req->rq_export->exp_obd->obd_recovering)) { + int rc; + int should_process; + + DEBUG_REQ(D_INFO, req, "Got new replay"); + rc = tgt_filter_recovery_request(req, req->rq_export->exp_obd, + &should_process); + if (rc != 0 || !should_process) + RETURN(rc); + else if (should_process < 0) { + req->rq_status = should_process; + rc = ptlrpc_error(req); + RETURN(rc); + } + } + RETURN(+1); +} + +/* Initial check for request, it is validation mostly */ +static struct tgt_handler *tgt_handler_find_check(struct ptlrpc_request *req) +{ + struct tgt_handler *h; + struct tgt_opc_slice *s; + struct lu_target *tgt; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + ENTRY; + + tgt = class_exp2tgt(req->rq_export); + if (unlikely(tgt == NULL)) { + DEBUG_REQ(D_ERROR, req, "%s: no target for connected export", + class_exp2obd(req->rq_export)->obd_name); + RETURN(ERR_PTR(-EINVAL)); + } + + for (s = tgt->lut_slice; s->tos_hs != NULL; s++) + if (s->tos_opc_start <= opc && opc < s->tos_opc_end) + break; + + /* opcode was not found in slice */ + if (unlikely(s->tos_hs == NULL)) { + static bool printed; + + /* don't spew error messages for unhandled RPCs */ + if (!printed) { + CERROR("%s: no handler for opcode 0x%x from %s\n", + tgt_name(tgt), opc, libcfs_id2str(req->rq_peer)); + printed = true; + } + RETURN(ERR_PTR(-ENOTSUPP)); + } + + LASSERT(opc >= s->tos_opc_start && opc < s->tos_opc_end); + h = s->tos_hs + (opc - s->tos_opc_start); + if (unlikely(h->th_opc == 0)) { + CERROR("%s: unsupported opcode 0x%x\n", tgt_name(tgt), opc); + RETURN(ERR_PTR(-ENOTSUPP)); + } + + RETURN(h); +} + +static int process_req_last_xid(struct ptlrpc_request *req) +{ + __u64 last_xid; + int rc = 0; + struct obd_export *exp = req->rq_export; + struct tg_export_data *ted = &exp->exp_target_data; + bool need_lock = tgt_is_multimodrpcs_client(exp); + ENTRY; + + if (need_lock) + mutex_lock(&ted->ted_lcd_lock); + /* check request's xid is consistent with export's last_xid */ + last_xid = lustre_msg_get_last_xid(req->rq_reqmsg); + if (last_xid > exp->exp_last_xid) + exp->exp_last_xid = last_xid; + + if (req->rq_xid == 0 || req->rq_xid <= exp->exp_last_xid) { + /* Some request is allowed to be sent during replay, + * such as OUT update requests, FLD requests, so it + * is 
possible that replay requests has smaller XID + * than the exp_last_xid. + * + * Some non-replay requests may have smaller XID as + * well: + * + * - Client send a no_resend RPC, like statfs; + * - The RPC timedout (or some other error) on client, + * then it's removed from the unreplied list; + * - Client send some other request to bump the + * exp_last_xid on server; + * - The former RPC got chance to be processed; + */ + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) + rc = -EPROTO; + + DEBUG_REQ(D_WARNING, req, + "unexpected xid=%llx != exp_last_xid=%llx, rc = %d", + req->rq_xid, exp->exp_last_xid, rc); + if (rc) + GOTO(out, rc); + } + + /* The "last_xid" is the minimum xid among unreplied requests, + * if the request is from the previous connection, its xid can + * still be larger than "exp_last_xid", then the above check of + * xid is not enough to determine whether the request is delayed. + * + * For example, if some replay request was delayed and caused + * timeout at client and the replay is restarted, the delayed + * replay request will have the larger xid than "exp_last_xid" + */ + if (req->rq_export->exp_conn_cnt > + lustre_msg_get_conn_cnt(req->rq_reqmsg)) { + CDEBUG(D_RPCTRACE, + "Dropping request %llu from an old epoch %u/%u\n", + req->rq_xid, + lustre_msg_get_conn_cnt(req->rq_reqmsg), + req->rq_export->exp_conn_cnt); + req->rq_no_reply = 1; + GOTO(out, rc = -ESTALE); + } + + /* try to release in-memory reply data */ + if (tgt_is_multimodrpcs_client(exp)) { + tgt_handle_received_xid(exp, last_xid); + rc = tgt_handle_tag(req); + } + +out: + if (need_lock) + mutex_unlock(&ted->ted_lcd_lock); + + RETURN(rc); +} + +int tgt_request_handle(struct ptlrpc_request *req) +{ + struct tgt_session_info *tsi = tgt_ses_info(req->rq_svc_thread->t_env); + + struct lustre_msg *msg = req->rq_reqmsg; + struct tgt_handler *h; + struct lu_target *tgt; + int request_fail_id = 0; + __u32 opc = lustre_msg_get_opc(msg); + struct obd_device *obd; + int rc; + bool is_connect = false; + ENTRY; + + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) { + if (cfs_fail_val == 0 && + lustre_msg_get_opc(msg) != OBD_PING && + lustre_msg_get_flags(msg) & MSG_REQ_REPLAY_DONE) { + cfs_fail_val = 1; + cfs_race_state = 0; + wait_event_idle(cfs_race_waitq, (cfs_race_state == 1)); + } + } + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + tsi->tsi_pill = &req->rq_pill; + tsi->tsi_env = req->rq_svc_thread->t_env; + + /* if request has export then get handlers slice from corresponding + * target, otherwise that should be connect operation */ + if (opc == MDS_CONNECT || opc == OST_CONNECT || + opc == MGS_CONNECT) { + is_connect = true; + req_capsule_set(&req->rq_pill, &RQF_CONNECT); + rc = target_handle_connect(req); + if (rc != 0) { + rc = ptlrpc_error(req); + GOTO(out, rc); + } + /* recovery-small test 18c asks to drop connect reply */ + if (unlikely(opc == OST_CONNECT && + OBD_FAIL_CHECK(OBD_FAIL_OST_CONNECT_NET2))) + GOTO(out, rc = 0); + } + + if (unlikely(!class_connected_export(req->rq_export))) { + if (opc == SEC_CTX_INIT || opc == SEC_CTX_INIT_CONT || + opc == SEC_CTX_FINI) { + /* sec context initialization has to be handled + * by hand in tgt_handle_request0() */ + tsi->tsi_reply_fail_id = OBD_FAIL_SEC_CTX_INIT_NET; + h = NULL; + GOTO(handle_recov, rc = 0); + } + CDEBUG(D_HA, "operation %d on unconnected OST from %s\n", + opc, libcfs_id2str(req->rq_peer)); + req->rq_status = -ENOTCONN; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + + tsi->tsi_tgt = tgt = 
class_exp2tgt(req->rq_export); + tsi->tsi_exp = req->rq_export; + if (exp_connect_flags(req->rq_export) & OBD_CONNECT_JOBSTATS) + tsi->tsi_jobid = lustre_msg_get_jobid(req->rq_reqmsg); + else + tsi->tsi_jobid = NULL; + + if (tgt == NULL) { + DEBUG_REQ(D_ERROR, req, "%s: No target for connected export", + class_exp2obd(req->rq_export)->obd_name); + req->rq_status = -EINVAL; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + + /* Skip last_xid processing for the recovery thread, otherwise, the + * last_xid on same request could be processed twice: first time when + * processing the incoming request, second time when the request is + * being processed by recovery thread. */ + obd = class_exp2obd(req->rq_export); + if (is_connect) { + /* reset the exp_last_xid on each connection. */ + req->rq_export->exp_last_xid = 0; + } else if (obd->obd_recovery_data.trd_processing_task != + current->pid) { + rc = process_req_last_xid(req); + if (rc) { + req->rq_status = rc; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + } + + request_fail_id = tgt->lut_request_fail_id; + tsi->tsi_reply_fail_id = tgt->lut_reply_fail_id; + + h = tgt_handler_find_check(req); + if (IS_ERR(h)) { + req->rq_status = PTR_ERR(h); + rc = ptlrpc_error(req); + GOTO(out, rc); + } + + LASSERTF(h->th_opc == opc, "opcode mismatch %d != %d\n", + h->th_opc, opc); + + if ((cfs_fail_val == 0 || cfs_fail_val == opc) && + CFS_FAIL_CHECK_ORSET(request_fail_id, CFS_FAIL_ONCE)) + GOTO(out, rc = 0); + + rc = lustre_msg_check_version(msg, h->th_version); + if (unlikely(rc)) { + DEBUG_REQ(D_ERROR, req, + "%s: drop malformed request version=%08x expect=%08x", + tgt_name(tgt), lustre_msg_get_version(msg), + h->th_version); + req->rq_status = -EINVAL; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + +handle_recov: + rc = tgt_handle_recovery(req, tsi->tsi_reply_fail_id); + if (likely(rc == 1)) { + rc = tgt_handle_request0(tsi, h, req); + if (rc) + GOTO(out, rc); + } + EXIT; +out: + req_capsule_fini(tsi->tsi_pill); + if (tsi->tsi_corpus != NULL) { + lu_object_put(tsi->tsi_env, tsi->tsi_corpus); + tsi->tsi_corpus = NULL; + } + return rc; +} +EXPORT_SYMBOL(tgt_request_handle); + +/** Assign high priority operations to the request if needed. */ +int tgt_hpreq_handler(struct ptlrpc_request *req) +{ + struct tgt_session_info *tsi = tgt_ses_info(req->rq_svc_thread->t_env); + struct tgt_handler *h; + int rc; + + ENTRY; + + if (req->rq_export == NULL) + RETURN(0); + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + tsi->tsi_pill = &req->rq_pill; + tsi->tsi_env = req->rq_svc_thread->t_env; + tsi->tsi_tgt = class_exp2tgt(req->rq_export); + tsi->tsi_exp = req->rq_export; + + h = tgt_handler_find_check(req); + if (IS_ERR(h)) { + rc = PTR_ERR(h); + RETURN(rc); + } + + rc = tgt_request_preprocess(tsi, h, req); + if (unlikely(rc != 0)) + RETURN(rc); + + if (h->th_hp != NULL) + h->th_hp(tsi); + RETURN(0); +} +EXPORT_SYMBOL(tgt_hpreq_handler); + +void tgt_counter_incr(struct obd_export *exp, int opcode) +{ + lprocfs_counter_incr(exp->exp_obd->obd_stats, opcode); + if (exp->exp_nid_stats && exp->exp_nid_stats->nid_stats != NULL) + lprocfs_counter_incr(exp->exp_nid_stats->nid_stats, opcode); +} +EXPORT_SYMBOL(tgt_counter_incr); + +/* + * Unified target generic handlers. 
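/*
 * Illustrative sketch, not part of the patch: the handler lookup pattern
 * that tgt_handler_find_check() above relies on -- each target publishes an
 * array of opcode "slices", and a request opcode is mapped to a handler by
 * locating the slice whose [start, end) range contains it and indexing into
 * that slice's handler table. All types and the opcode value used in main()
 * are hypothetical.
 */
#include <stddef.h>

typedef int (*fake_handler_t)(void);

struct fake_slice {
        unsigned int start, end;        /* half-open opcode range */
        fake_handler_t *handlers;       /* one entry per opcode in the range */
};

static fake_handler_t find_handler(const struct fake_slice *slices,
                                   size_t nslices, unsigned int opc)
{
        size_t i;

        for (i = 0; i < nslices; i++)
                if (opc >= slices[i].start && opc < slices[i].end)
                        return slices[i].handlers[opc - slices[i].start];
        return NULL;                    /* -ENOTSUPP in the real code */
}

static int ping(void) { return 0; }

int main(void)
{
        fake_handler_t obd_handlers[] = { ping };
        struct fake_slice slices[] = { { 400, 401, obd_handlers } };
        fake_handler_t h = find_handler(slices, 1, 400);

        return h ? h() : 1;
}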
+ */ + +int tgt_connect_check_sptlrpc(struct ptlrpc_request *req, struct obd_export *exp) +{ + struct lu_target *tgt = class_exp2tgt(exp); + struct sptlrpc_flavor flvr; + int rc = 0; + + LASSERT(tgt); + LASSERT(tgt->lut_obd); + LASSERT(tgt->lut_slice); + + /* always allow ECHO client */ + if (unlikely(strcmp(exp->exp_obd->obd_type->typ_name, + LUSTRE_ECHO_NAME) == 0)) { + exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_ANY; + return 0; + } + + if (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { + read_lock(&tgt->lut_sptlrpc_lock); + sptlrpc_target_choose_flavor(&tgt->lut_sptlrpc_rset, + req->rq_sp_from, + req->rq_peer.nid, + &flvr); + read_unlock(&tgt->lut_sptlrpc_lock); + + spin_lock(&exp->exp_lock); + exp->exp_sp_peer = req->rq_sp_from; + exp->exp_flvr = flvr; + + /* when on mgs, if no restriction is set, or if the client + * NID is on the local node, allow any flavor + */ + if ((strcmp(exp->exp_obd->obd_type->typ_name, + LUSTRE_MGS_NAME) == 0) && + (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_NULL || + LNetIsPeerLocal(lnet_nid_to_nid4(&exp->exp_connection->c_peer.nid)))) + exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_ANY; + + if (exp->exp_flvr.sf_rpc != SPTLRPC_FLVR_ANY && + exp->exp_flvr.sf_rpc != req->rq_flvr.sf_rpc) { + CERROR("%s: unauthorized rpc flavor %x from %s, " + "expect %x\n", tgt_name(tgt), + req->rq_flvr.sf_rpc, + libcfs_nid2str(req->rq_peer.nid), + exp->exp_flvr.sf_rpc); + rc = -EACCES; + } + spin_unlock(&exp->exp_lock); + } else { + if (exp->exp_sp_peer != req->rq_sp_from) { + CERROR("%s: RPC source %s doesn't match %s\n", + tgt_name(tgt), + sptlrpc_part2name(req->rq_sp_from), + sptlrpc_part2name(exp->exp_sp_peer)); + rc = -EACCES; + } else { + rc = sptlrpc_target_export_check(exp, req); + } + } + + return rc; +} + +int tgt_adapt_sptlrpc_conf(struct lu_target *tgt) +{ + struct sptlrpc_rule_set tmp_rset; + int rc; + + if (unlikely(tgt == NULL)) { + CERROR("No target passed\n"); + return -EINVAL; + } + + sptlrpc_rule_set_init(&tmp_rset); + rc = sptlrpc_conf_target_get_rules(tgt->lut_obd, &tmp_rset); + if (rc) { + CERROR("%s: failed get sptlrpc rules: rc = %d\n", + tgt_name(tgt), rc); + return rc; + } + + sptlrpc_target_update_exp_flavor(tgt->lut_obd, &tmp_rset); + + write_lock(&tgt->lut_sptlrpc_lock); + sptlrpc_rule_set_free(&tgt->lut_sptlrpc_rset); + tgt->lut_sptlrpc_rset = tmp_rset; + write_unlock(&tgt->lut_sptlrpc_lock); + + return 0; +} +EXPORT_SYMBOL(tgt_adapt_sptlrpc_conf); + +int tgt_connect(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct obd_connect_data *reply; + int rc; + + ENTRY; + + /* XXX: better to call this check right after getting new export but + * before last_rcvd slot allocation to avoid server load upon insecure + * connects. This is to be fixed after unifiyng all targets. + */ + rc = tgt_connect_check_sptlrpc(req, tsi->tsi_exp); + if (rc) + GOTO(out, rc); + + /* To avoid exposing partially initialized connection flags, changes up + * to this point have been staged in reply->ocd_connect_flags. Now that + * connection handling has completed successfully, atomically update + * the connect flags in the shared export data structure. 
LU-1623 */ + reply = req_capsule_server_get(tsi->tsi_pill, &RMF_CONNECT_DATA); + spin_lock(&tsi->tsi_exp->exp_lock); + *exp_connect_flags_ptr(tsi->tsi_exp) = reply->ocd_connect_flags; + if (reply->ocd_connect_flags & OBD_CONNECT_FLAGS2) + *exp_connect_flags2_ptr(tsi->tsi_exp) = + reply->ocd_connect_flags2; + tsi->tsi_exp->exp_connect_data.ocd_brw_size = reply->ocd_brw_size; + spin_unlock(&tsi->tsi_exp->exp_lock); + + if (strcmp(tsi->tsi_exp->exp_obd->obd_type->typ_name, + LUSTRE_MDT_NAME) == 0) { + rc = req_check_sepol(tsi->tsi_pill); + if (rc) + GOTO(out, rc); + + if (reply->ocd_connect_flags & OBD_CONNECT_FLAGS2 && + reply->ocd_connect_flags2 & OBD_CONNECT2_ENCRYPT && + tsi->tsi_pill->rc_req->rq_export) { + bool forbid_encrypt = true; + struct lu_nodemap *nm = + nodemap_get_from_exp(tsi->tsi_pill->rc_req->rq_export); + + if (!nm) { + /* nodemap_get_from_exp returns NULL in case + * nodemap is not active, so we do not forbid + */ + forbid_encrypt = false; + } else if (!IS_ERR(nm)) { + forbid_encrypt = nm->nmf_forbid_encryption; + nodemap_putref(nm); + } + + if (forbid_encrypt) + GOTO(out, rc = -EACCES); + } + } + + RETURN(0); +out: + obd_disconnect(class_export_get(tsi->tsi_exp)); + return rc; +} +EXPORT_SYMBOL(tgt_connect); + +int tgt_disconnect(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_DISCONNECT_DELAY, cfs_fail_val); + + rc = target_handle_disconnect(tgt_ses_req(tsi)); + if (rc) + RETURN(err_serious(rc)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_disconnect); + +/* + * Unified target OBD handlers + */ +int tgt_obd_ping(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + /* The target-specific part of OBD_PING request handling. + * It controls Filter Modification Data (FMD) expiration each time + * PING is received. + * + * Valid only for replayable targets, e.g. 
MDT and OFD + */ + if (tsi->tsi_exp->exp_obd->obd_replayable) + tgt_fmd_expire(tsi->tsi_exp); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (rc) + RETURN(err_serious(rc)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_obd_ping); + +int tgt_obd_log_cancel(struct tgt_session_info *tsi) +{ + return err_serious(-EOPNOTSUPP); +} + +int tgt_send_buffer(struct tgt_session_info *tsi, struct lu_rdbuf *rdbuf) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct obd_export *exp = req->rq_export; + struct ptlrpc_bulk_desc *desc; + int i; + int rc; + int pages = 0; + + ENTRY; + + for (i = 0; i < rdbuf->rb_nbufs; i++) { + unsigned int offset; + + offset = (unsigned long)rdbuf->rb_bufs[i].lb_buf & ~PAGE_MASK; + pages += DIV_ROUND_UP(rdbuf->rb_bufs[i].lb_len + offset, + PAGE_SIZE); + } + + desc = ptlrpc_prep_bulk_exp(req, pages, 1, + PTLRPC_BULK_PUT_SOURCE, + MDS_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + RETURN(-ENOMEM); + + for (i = 0; i < rdbuf->rb_nbufs; i++) + desc->bd_frag_ops->add_iov_frag(desc, + rdbuf->rb_bufs[i].lb_buf, + rdbuf->rb_bufs[i].lb_len); + + rc = target_bulk_io(exp, desc); + ptlrpc_free_bulk(desc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_send_buffer); + +int tgt_sendpage(struct tgt_session_info *tsi, struct lu_rdpg *rdpg, int nob) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct obd_export *exp = req->rq_export; + struct ptlrpc_bulk_desc *desc; + int tmpcount; + int tmpsize; + int i; + int rc; + + ENTRY; + + desc = ptlrpc_prep_bulk_exp(req, rdpg->rp_npages, 1, + PTLRPC_BULK_PUT_SOURCE, + MDS_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + if (desc == NULL) + RETURN(-ENOMEM); + + if (!(exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE)) + /* old client requires reply size in it's PAGE_SIZE, + * which is rdpg->rp_count */ + nob = rdpg->rp_count; + + for (i = 0, tmpcount = nob; i < rdpg->rp_npages && tmpcount > 0; + i++, tmpcount -= tmpsize) { + tmpsize = min_t(int, tmpcount, PAGE_SIZE); + desc->bd_frag_ops->add_kiov_frag(desc, rdpg->rp_pages[i], 0, + tmpsize); + } + + LASSERT(desc->bd_nob == nob); + rc = target_bulk_io(exp, desc); + ptlrpc_free_bulk(desc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_sendpage); + +/* + * OBD_IDX_READ handler + */ +static int tgt_obd_idx_read(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct lu_rdpg *rdpg = &tti->tti_u.rdpg.tti_rdpg; + struct idx_info *req_ii, *rep_ii; + int rc, i; + + ENTRY; + + memset(rdpg, 0, sizeof(*rdpg)); + req_capsule_set(tsi->tsi_pill, &RQF_OBD_IDX_READ); + + /* extract idx_info buffer from request & reply */ + req_ii = req_capsule_client_get(tsi->tsi_pill, &RMF_IDX_INFO); + if (req_ii == NULL || req_ii->ii_magic != IDX_INFO_MAGIC) + RETURN(err_serious(-EPROTO)); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (rc) + RETURN(err_serious(rc)); + + rep_ii = req_capsule_server_get(tsi->tsi_pill, &RMF_IDX_INFO); + if (rep_ii == NULL) + RETURN(err_serious(-EFAULT)); + rep_ii->ii_magic = IDX_INFO_MAGIC; + + /* extract hash to start with */ + rdpg->rp_hash = req_ii->ii_hash_start; + + /* extract requested attributes */ + rdpg->rp_attrs = req_ii->ii_attrs; + + /* check that fid packed in request is valid and supported */ + if (!fid_is_sane(&req_ii->ii_fid)) + RETURN(-EINVAL); + rep_ii->ii_fid = req_ii->ii_fid; + + /* copy flags */ + rep_ii->ii_flags = req_ii->ii_flags; + + /* compute number of pages to allocate, ii_count is the number of 4KB + * containers */ + if (req_ii->ii_count <= 0) + GOTO(out, rc = -EFAULT); + rdpg->rp_count = min_t(unsigned int, 
req_ii->ii_count << LU_PAGE_SHIFT, + exp_max_brw_size(tsi->tsi_exp)); + rdpg->rp_npages = (rdpg->rp_count + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* allocate pages to store the containers */ + OBD_ALLOC_PTR_ARRAY(rdpg->rp_pages, rdpg->rp_npages); + if (rdpg->rp_pages == NULL) + GOTO(out, rc = -ENOMEM); + for (i = 0; i < rdpg->rp_npages; i++) { + rdpg->rp_pages[i] = alloc_page(GFP_NOFS); + if (rdpg->rp_pages[i] == NULL) + GOTO(out, rc = -ENOMEM); + } + + /* populate pages with key/record pairs */ + rc = dt_index_read(tsi->tsi_env, tsi->tsi_tgt->lut_bottom, rep_ii, rdpg); + if (rc < 0) + GOTO(out, rc); + + LASSERTF(rc <= rdpg->rp_count, "dt_index_read() returned more than " + "asked %d > %d\n", rc, rdpg->rp_count); + + /* send pages to client */ + rc = tgt_sendpage(tsi, rdpg, rc); + if (rc) + GOTO(out, rc); + EXIT; +out: + if (rdpg->rp_pages) { + for (i = 0; i < rdpg->rp_npages; i++) + if (rdpg->rp_pages[i]) + __free_page(rdpg->rp_pages[i]); + OBD_FREE_PTR_ARRAY(rdpg->rp_pages, rdpg->rp_npages); + } + return rc; +} + +struct tgt_handler tgt_obd_handlers[] = { +TGT_OBD_HDL (0, OBD_PING, tgt_obd_ping), +TGT_OBD_HDL (0, OBD_IDX_READ, tgt_obd_idx_read) +}; +EXPORT_SYMBOL(tgt_obd_handlers); + +int tgt_sync(const struct lu_env *env, struct lu_target *tgt, + struct dt_object *obj, __u64 start, __u64 end) +{ + int rc = 0; + + ENTRY; + + /* if no objid is specified, it means "sync whole filesystem" */ + if (obj == NULL) { + rc = dt_sync(env, tgt->lut_bottom); + } else if (dt_version_get(env, obj) > + tgt->lut_obd->obd_last_committed) { + rc = dt_object_sync(env, obj, start, end); + } + atomic_inc(&tgt->lut_sync_count); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_sync); +/* + * Unified target DLM handlers. + */ + +/** + * Unified target BAST + * + * Ensure data and metadata are synced to disk when lock is canceled if Sync on + * Cancel (SOC) is enabled. If it's extent lock, normally sync obj is enough, + * but if it's cross-MDT lock, because remote object version is not set, a + * filesystem sync is needed. 
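/*
 * Illustrative sketch, not part of the patch: the decision tgt_blocking_ast()
 * (whose body follows) makes, reduced to a predicate -- sync on lock cancel
 * only for write-capable lock modes, and only when the target's policy is
 * "always" or the lock is being cancelled with a blocking callback pending.
 * The enum and flag names below are hypothetical simplifications.
 */
#include <stdbool.h>

enum sync_policy { SYNC_NEVER, SYNC_BLOCKING, SYNC_ALWAYS };

#define MODE_WRITE 0x1u /* stands in for LCK_EX | LCK_PW | LCK_GROUP */

static bool should_sync_on_cancel(enum sync_policy policy, unsigned int mode,
                                  bool cb_pending)
{
        if (!(mode & MODE_WRITE))
                return false;   /* read locks never dirty the backend */
        return policy == SYNC_ALWAYS ||
               (policy == SYNC_BLOCKING && cb_pending);
}

int main(void)
{
        return should_sync_on_cancel(SYNC_BLOCKING, MODE_WRITE, true) ? 0 : 1;
}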
+ * + * \param lock server side lock + * \param desc lock desc + * \param data ldlm_cb_set_arg + * \param flag indicates whether this cancelling or blocking callback + * \retval 0 on success + * \retval negative number on error + */ +static int tgt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lu_env env; + struct lu_target *tgt; + struct dt_object *obj = NULL; + struct lu_fid fid; + int rc = 0; + + ENTRY; + + tgt = class_exp2tgt(lock->l_export); + + if (unlikely(tgt == NULL)) { + CDEBUG(D_ERROR, "%s: No target for connected export\n", + class_exp2obd(lock->l_export)->obd_name); + RETURN(-EINVAL); + } + + if (flag == LDLM_CB_CANCELING && + (lock->l_granted_mode & (LCK_EX | LCK_PW | LCK_GROUP)) && + (tgt->lut_sync_lock_cancel == SYNC_LOCK_CANCEL_ALWAYS || + (tgt->lut_sync_lock_cancel == SYNC_LOCK_CANCEL_BLOCKING && + ldlm_is_cbpending(lock))) && + ((exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS) || + lock->l_resource->lr_type == LDLM_EXTENT)) { + __u64 start = 0; + __u64 end = OBD_OBJECT_EOF; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (unlikely(rc != 0)) + GOTO(err, rc); + + ost_fid_from_resid(&fid, &lock->l_resource->lr_name, + tgt->lut_lsd.lsd_osd_index); + + if (lock->l_resource->lr_type == LDLM_EXTENT) { + obj = dt_locate(&env, tgt->lut_bottom, &fid); + if (IS_ERR(obj)) + GOTO(err_env, rc = PTR_ERR(obj)); + + if (!dt_object_exists(obj)) + GOTO(err_put, rc = -ENOENT); + + start = lock->l_policy_data.l_extent.start; + end = lock->l_policy_data.l_extent.end; + } + + rc = tgt_sync(&env, tgt, obj, start, end); + if (rc < 0) { + CERROR("%s: syncing "DFID" (%llu-%llu) on lock " + "cancel: rc = %d\n", + tgt_name(tgt), PFID(&fid), + lock->l_policy_data.l_extent.start, + lock->l_policy_data.l_extent.end, rc); + } +err_put: + if (obj != NULL) + dt_object_put(&env, obj); +err_env: + lu_env_fini(&env); + } +err: + rc = ldlm_server_blocking_ast(lock, desc, data, flag); + RETURN(rc); +} + +static struct ldlm_callback_suite tgt_dlm_cbs = { + .lcs_completion = ldlm_server_completion_ast, + .lcs_blocking = tgt_blocking_ast, + .lcs_glimpse = ldlm_server_glimpse_ast +}; + +int tgt_enqueue(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + int rc; + + ENTRY; + /* + * tsi->tsi_dlm_req was already swapped and (if necessary) converted, + * tsi->tsi_dlm_cbs was set by the *_req_handle() function. 
+ */ + LASSERT(tsi->tsi_dlm_req != NULL); + rc = ldlm_handle_enqueue0(tsi->tsi_exp->exp_obd->obd_namespace, req, + tsi->tsi_dlm_req, &tgt_dlm_cbs); + if (rc) + RETURN(err_serious(rc)); + + switch (LUT_FAIL_CLASS(tsi->tsi_reply_fail_id)) { + case LUT_FAIL_MDT: + tsi->tsi_reply_fail_id = OBD_FAIL_MDS_LDLM_REPLY_NET; + break; + case LUT_FAIL_OST: + tsi->tsi_reply_fail_id = OBD_FAIL_OST_LDLM_REPLY_NET; + break; + case LUT_FAIL_MGT: + tsi->tsi_reply_fail_id = OBD_FAIL_MGS_LDLM_REPLY_NET; + break; + default: + tsi->tsi_reply_fail_id = OBD_FAIL_LDLM_REPLY; + break; + } + RETURN(req->rq_status); +} +EXPORT_SYMBOL(tgt_enqueue); + +int tgt_convert(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + int rc; + + ENTRY; + LASSERT(tsi->tsi_dlm_req); + rc = ldlm_handle_convert0(req, tsi->tsi_dlm_req); + if (rc) + RETURN(err_serious(rc)); + + RETURN(req->rq_status); +} + +int tgt_bl_callback(struct tgt_session_info *tsi) +{ + return err_serious(-EOPNOTSUPP); +} + +int tgt_cp_callback(struct tgt_session_info *tsi) +{ + return err_serious(-EOPNOTSUPP); +} + +/* generic LDLM target handler */ +struct tgt_handler tgt_dlm_handlers[] = { +TGT_DLM_HDL(HAS_KEY, LDLM_ENQUEUE, tgt_enqueue), +TGT_DLM_HDL(HAS_KEY, LDLM_CONVERT, tgt_convert), +TGT_DLM_HDL_VAR(0, LDLM_BL_CALLBACK, tgt_bl_callback), +TGT_DLM_HDL_VAR(0, LDLM_CP_CALLBACK, tgt_cp_callback) +}; +EXPORT_SYMBOL(tgt_dlm_handlers); + +/* + * Unified target LLOG handlers. + */ +int tgt_llog_open(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_open(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_open); + +int tgt_llog_read_header(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_read_header(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_read_header); + +int tgt_llog_next_block(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_next_block(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_next_block); + +int tgt_llog_prev_block(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_prev_block(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_prev_block); + +/* generic llog target handler */ +struct tgt_handler tgt_llog_handlers[] = { +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_CREATE, tgt_llog_open), +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_NEXT_BLOCK, tgt_llog_next_block), +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_READ_HEADER, tgt_llog_read_header), +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_PREV_BLOCK, tgt_llog_prev_block), +}; +EXPORT_SYMBOL(tgt_llog_handlers); + +/* + * sec context handlers + */ +/* XXX: Implement based on mdt_sec_ctx_handle()? 
*/ +static int tgt_sec_ctx_handle(struct tgt_session_info *tsi) +{ + return 0; +} + +struct tgt_handler tgt_sec_ctx_handlers[] = { +TGT_SEC_HDL_VAR(0, SEC_CTX_INIT, tgt_sec_ctx_handle), +TGT_SEC_HDL_VAR(0, SEC_CTX_INIT_CONT, tgt_sec_ctx_handle), +TGT_SEC_HDL_VAR(0, SEC_CTX_FINI, tgt_sec_ctx_handle), +}; +EXPORT_SYMBOL(tgt_sec_ctx_handlers); + +int (*tgt_lfsck_in_notify_local)(const struct lu_env *env, + struct dt_device *key, + struct lfsck_req_local *lrl, + struct thandle *th) = NULL; + +void tgt_register_lfsck_in_notify_local(int (*notify)(const struct lu_env *, + struct dt_device *, + struct lfsck_req_local *, + struct thandle *)) +{ + tgt_lfsck_in_notify_local = notify; +} +EXPORT_SYMBOL(tgt_register_lfsck_in_notify_local); + +int (*tgt_lfsck_in_notify)(const struct lu_env *env, + struct dt_device *key, + struct lfsck_request *lr) = NULL; + +void tgt_register_lfsck_in_notify(int (*notify)(const struct lu_env *, + struct dt_device *, + struct lfsck_request *)) +{ + tgt_lfsck_in_notify = notify; +} +EXPORT_SYMBOL(tgt_register_lfsck_in_notify); + +static int (*tgt_lfsck_query)(const struct lu_env *env, + struct dt_device *key, + struct lfsck_request *req, + struct lfsck_reply *rep, + struct lfsck_query *que) = NULL; + +void tgt_register_lfsck_query(int (*query)(const struct lu_env *, + struct dt_device *, + struct lfsck_request *, + struct lfsck_reply *, + struct lfsck_query *)) +{ + tgt_lfsck_query = query; +} +EXPORT_SYMBOL(tgt_register_lfsck_query); + +/* LFSCK request handlers */ +static int tgt_handle_lfsck_notify(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct dt_device *key = tsi->tsi_tgt->lut_bottom; + struct lfsck_request *lr; + int rc; + ENTRY; + + lr = req_capsule_client_get(tsi->tsi_pill, &RMF_LFSCK_REQUEST); + if (lr == NULL) + RETURN(-EPROTO); + + rc = tgt_lfsck_in_notify(env, key, lr); + + RETURN(rc); +} + +static int tgt_handle_lfsck_query(struct tgt_session_info *tsi) +{ + struct lfsck_request *request; + struct lfsck_reply *reply; + int rc; + ENTRY; + + request = req_capsule_client_get(tsi->tsi_pill, &RMF_LFSCK_REQUEST); + if (request == NULL) + RETURN(-EPROTO); + + reply = req_capsule_server_get(tsi->tsi_pill, &RMF_LFSCK_REPLY); + if (reply == NULL) + RETURN(-ENOMEM); + + rc = tgt_lfsck_query(tsi->tsi_env, tsi->tsi_tgt->lut_bottom, + request, reply, NULL); + + RETURN(rc < 0 ? rc : 0); +} + +struct tgt_handler tgt_lfsck_handlers[] = { +TGT_LFSCK_HDL(HAS_REPLY, LFSCK_NOTIFY, tgt_handle_lfsck_notify), +TGT_LFSCK_HDL(HAS_REPLY, LFSCK_QUERY, tgt_handle_lfsck_query), +}; +EXPORT_SYMBOL(tgt_lfsck_handlers); + +/* + * initialize per-thread page pool (bug 5137). + */ +int tgt_io_thread_init(struct ptlrpc_thread *thread) +{ + struct tgt_thread_big_cache *tbc; + + ENTRY; + + LASSERT(thread != NULL); + LASSERT(thread->t_data == NULL); + + OBD_ALLOC_LARGE(tbc, sizeof(*tbc)); + if (tbc == NULL) + RETURN(-ENOMEM); + thread->t_data = tbc; + RETURN(0); +} +EXPORT_SYMBOL(tgt_io_thread_init); + +/* + * free per-thread pool created by tgt_thread_init(). + */ +void tgt_io_thread_done(struct ptlrpc_thread *thread) +{ + struct tgt_thread_big_cache *tbc; + + ENTRY; + + LASSERT(thread != NULL); + + /* + * be prepared to handle partially-initialized pools (because this is + * called from ost_io_thread_init() for cleanup. 
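/*
 * Illustrative sketch, not part of the patch: the init/done pairing the
 * comment above describes for the per-thread page pool -- the cleanup path
 * must tolerate a pool that was never allocated, because it can run against
 * a partially initialized thread. fake_thread and the two helpers are
 * hypothetical stand-ins for ptlrpc_thread and tgt_io_thread_init()/_done().
 */
#include <stdlib.h>

struct fake_thread { void *t_data; };

static int thread_pool_init(struct fake_thread *t, size_t size)
{
        t->t_data = malloc(size);
        return t->t_data ? 0 : -1;      /* -ENOMEM in the real code */
}

static void thread_pool_done(struct fake_thread *t)
{
        free(t->t_data);                /* free(NULL) is a harmless no-op */
        t->t_data = NULL;
}

int main(void)
{
        struct fake_thread t = { NULL };

        thread_pool_done(&t);           /* safe even if init never ran */
        if (thread_pool_init(&t, 4096) == 0)
                thread_pool_done(&t);
        return 0;
}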
+ */ + tbc = thread->t_data; + if (tbc != NULL) { + OBD_FREE_LARGE(tbc, sizeof(*tbc)); + thread->t_data = NULL; + } + EXIT; +} +EXPORT_SYMBOL(tgt_io_thread_done); + +/** + * Helper function for getting Data-on-MDT file server DLM lock + * if asked by client. + */ +int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + struct lustre_handle *lh, int mode, __u64 *flags) +{ + union ldlm_policy_data policy = { + .l_inodebits.bits = MDS_INODELOCK_DOM, + }; + int rc; + + ENTRY; + + LASSERT(lh != NULL); + LASSERT(ns != NULL); + LASSERT(!lustre_handle_is_used(lh)); + + rc = ldlm_cli_enqueue_local(NULL, ns, res_id, LDLM_IBITS, &policy, mode, + flags, ldlm_blocking_ast, + ldlm_completion_ast, ldlm_glimpse_ast, + NULL, 0, LVB_T_NONE, NULL, lh); + + RETURN(rc == ELDLM_OK ? 0 : -EIO); +} +EXPORT_SYMBOL(tgt_mdt_data_lock); + +/** + * Helper function for getting server side [start, start+count] DLM lock + * if asked by client. + */ +int tgt_extent_lock(const struct lu_env *env, struct ldlm_namespace *ns, + struct ldlm_res_id *res_id, __u64 start, __u64 end, + struct lustre_handle *lh, int mode, __u64 *flags) +{ + union ldlm_policy_data policy; + int rc; + + ENTRY; + + LASSERT(lh != NULL); + LASSERT(ns != NULL); + LASSERT(!lustre_handle_is_used(lh)); + + policy.l_extent.gid = 0; + policy.l_extent.start = start & PAGE_MASK; + + /* + * If ->o_blocks is EOF it means "lock till the end of the file". + * Otherwise, it's size of an extent or hole being punched (in bytes). + */ + if (end == OBD_OBJECT_EOF || end < start) + policy.l_extent.end = OBD_OBJECT_EOF; + else + policy.l_extent.end = end | ~PAGE_MASK; + + rc = ldlm_cli_enqueue_local(env, ns, res_id, LDLM_EXTENT, &policy, + mode, flags, ldlm_blocking_ast, + ldlm_completion_ast, ldlm_glimpse_ast, + NULL, 0, LVB_T_NONE, NULL, lh); + RETURN(rc == ELDLM_OK ? 
0 : -EIO); +} +EXPORT_SYMBOL(tgt_extent_lock); + +static int tgt_data_lock(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, __u64 start, __u64 end, + struct lustre_handle *lh, enum ldlm_mode mode) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + __u64 flags = 0; + + /* MDT IO for data-on-mdt */ + if (exp->exp_connect_data.ocd_connect_flags & OBD_CONNECT_IBITS) + return tgt_mdt_data_lock(ns, res_id, lh, mode, &flags); + + return tgt_extent_lock(env, ns, res_id, start, end, lh, mode, &flags); +} + +void tgt_data_unlock(struct lustre_handle *lh, enum ldlm_mode mode) +{ + LASSERT(lustre_handle_is_used(lh)); + ldlm_lock_decref(lh, mode); +} +EXPORT_SYMBOL(tgt_data_unlock); + +static int tgt_brw_lock(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, struct obd_ioobj *obj, + struct niobuf_remote *nb, struct lustre_handle *lh, + enum ldlm_mode mode) +{ + int nrbufs = obj->ioo_bufcnt; + int i; + + ENTRY; + + LASSERT(mode == LCK_PR || mode == LCK_PW); + LASSERT(!lustre_handle_is_used(lh)); + + if (exp->exp_obd->obd_recovering) + RETURN(0); + + if (nrbufs == 0 || !(nb[0].rnb_flags & OBD_BRW_SRVLOCK)) + RETURN(0); + + for (i = 1; i < nrbufs; i++) + if (!(nb[i].rnb_flags & OBD_BRW_SRVLOCK)) + RETURN(-EFAULT); + + return tgt_data_lock(env, exp, res_id, nb[0].rnb_offset, + nb[nrbufs - 1].rnb_offset + + nb[nrbufs - 1].rnb_len - 1, lh, mode); +} + +static void tgt_brw_unlock(struct obd_export *exp, struct obd_ioobj *obj, + struct niobuf_remote *niob, + struct lustre_handle *lh, enum ldlm_mode mode) +{ + ENTRY; + + LASSERT(mode == LCK_PR || mode == LCK_PW); + LASSERT((!exp->exp_obd->obd_recovering && obj->ioo_bufcnt && + niob[0].rnb_flags & OBD_BRW_SRVLOCK) == + lustre_handle_is_used(lh)); + + if (lustre_handle_is_used(lh)) + tgt_data_unlock(lh, mode); + EXIT; +} + +static int tgt_checksum_niobuf(struct lu_target *tgt, + struct niobuf_local *local_nb, int npages, + int opc, enum cksum_types cksum_type, + __u32 *cksum) +{ + struct ahash_request *req; + unsigned int bufsize; + int i, err; + unsigned char cfs_alg = cksum_obd2cfs(cksum_type); + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + CERROR("%s: unable to initialize checksum hash %s\n", + tgt_name(tgt), cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(req); + } + + CDEBUG(D_INFO, "Checksum for algo %s\n", cfs_crypto_hash_name(cfs_alg)); + for (i = 0; i < npages; i++) { + /* corrupt the data before we compute the checksum, to + * simulate a client->OST data error */ + if (i == 0 && opc == OST_WRITE && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) { + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = kmap_atomic(local_nb[i].lnb_page); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad3", min(4, len)); + kunmap_atomic(ptr); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + cfs_crypto_hash_update_page(req, local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len); + + /* corrupt the data after we compute the checksum, to + * simulate an OST->client data error */ + if (i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND)) { + int off = 
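tgt_brw_lock() covers the whole bulk request with one PR or PW lock: the range runs from the first remote niobuf's offset to the last niobuf's offset plus its length minus one, and the OBD_BRW_SRVLOCK flag must be set on all niobufs or on none of them. The range and flag check are sketched below with a simplified stand-in struct; sketch_niobuf and brw_lock_range() are illustrative names only.

#include <stdio.h>
#include <stdint.h>

#define SKETCH_BRW_SRVLOCK 0x1   /* stands in for OBD_BRW_SRVLOCK */

struct sketch_niobuf {
    uint64_t offset;   /* rnb_offset */
    uint32_t len;      /* rnb_len */
    uint32_t flags;    /* rnb_flags */
};

/*
 * Return 0 and fill [*start, *end] if a server-side lock is wanted for the
 * whole set, 1 if no lock is needed, -1 if the SRVLOCK flags are mixed.
 */
static int brw_lock_range(const struct sketch_niobuf *nb, int count,
                          uint64_t *start, uint64_t *end)
{
    int i;

    if (count == 0 || !(nb[0].flags & SKETCH_BRW_SRVLOCK))
        return 1;                          /* client holds the lock */

    for (i = 1; i < count; i++)
        if (!(nb[i].flags & SKETCH_BRW_SRVLOCK))
            return -1;                     /* mixed flags: protocol error */

    *start = nb[0].offset;
    *end = nb[count - 1].offset + nb[count - 1].len - 1;
    return 0;
}

int main(void)
{
    struct sketch_niobuf nb[] = {
        { 0,    4096, SKETCH_BRW_SRVLOCK },
        { 4096, 4096, SKETCH_BRW_SRVLOCK },
        { 8192, 1024, SKETCH_BRW_SRVLOCK },
    };
    uint64_t start, end;

    if (brw_lock_range(nb, 3, &start, &end) == 0)
        printf("lock [%llu-%llu]\n",
               (unsigned long long)start, (unsigned long long)end);  /* [0-9215] */
    return 0;
}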
local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = kmap_atomic(local_nb[i].lnb_page); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad4", min(4, len)); + kunmap_atomic(ptr); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + } + + bufsize = sizeof(*cksum); + err = cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize); + + return 0; +} + +char dbgcksum_file_name[PATH_MAX]; + +static void dump_all_bulk_pages(struct obdo *oa, int count, + struct niobuf_local *local_nb, + __u32 server_cksum, __u32 client_cksum) +{ + struct file *filp; + int rc, i; + unsigned int len; + char *buf; + + /* will only keep dump of pages on first error for the same range in + * file/fid, not during the resends/retries. */ + snprintf(dbgcksum_file_name, sizeof(dbgcksum_file_name), + "%s-checksum_dump-ost-"DFID":[%llu-%llu]-%x-%x", + (strncmp(libcfs_debug_file_path, "NONE", 4) != 0 ? + libcfs_debug_file_path : LIBCFS_DEBUG_FILE_PATH_DEFAULT), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, + local_nb[0].lnb_file_offset, + local_nb[count-1].lnb_file_offset + + local_nb[count-1].lnb_len - 1, client_cksum, server_cksum); + CWARN("dumping checksum data to %s\n", dbgcksum_file_name); + filp = filp_open(dbgcksum_file_name, + O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + if (rc == -EEXIST) + CDEBUG(D_INFO, "%s: can't open to dump pages with " + "checksum error: rc = %d\n", dbgcksum_file_name, + rc); + else + CERROR("%s: can't open to dump pages with checksum " + "error: rc = %d\n", dbgcksum_file_name, rc); + return; + } + + for (i = 0; i < count; i++) { + len = local_nb[i].lnb_len; + buf = kmap(local_nb[i].lnb_page); + while (len != 0) { + rc = cfs_kernel_write(filp, buf, len, &filp->f_pos); + if (rc < 0) { + CERROR("%s: wanted to write %u but got %d " + "error\n", dbgcksum_file_name, len, rc); + break; + } + len -= rc; + buf += rc; + } + kunmap(local_nb[i].lnb_page); + } + + rc = vfs_fsync_range(filp, 0, LLONG_MAX, 1); + if (rc) + CERROR("%s: sync returns %d\n", dbgcksum_file_name, rc); + filp_close(filp, NULL); + + libcfs_debug_dumplog(); +} + +static int check_read_checksum(struct niobuf_local *local_nb, int npages, + struct obd_export *exp, struct obdo *oa, + const struct lnet_process_id *peer, + __u32 client_cksum, __u32 server_cksum, + enum cksum_types server_cksum_type) +{ + char *msg; + enum cksum_types cksum_type; + loff_t start, end; + + if (unlikely(npages <= 0)) + return 0; + + /* unlikely to happen and only if resend does not occur due to cksum + * control failure on Client */ + if (unlikely(server_cksum == client_cksum)) { + CDEBUG(D_PAGE, "checksum %x confirmed upon retry\n", + client_cksum); + return 0; + } + + if (exp->exp_obd->obd_checksum_dump) + dump_all_bulk_pages(oa, npages, local_nb, server_cksum, + client_cksum); + + cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? 
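dump_all_bulk_pages() writes every bulk page to a debug file with a loop that tolerates short writes: it calls the write helper repeatedly, advancing the buffer and shrinking the remaining length by however many bytes were actually written, and it opens the file with O_EXCL so only the first dump for a given range is kept. A userspace equivalent using POSIX write(2) in place of cfs_kernel_write(); the file path and message are illustrative.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write all of buf[0..len) to fd, retrying after short writes. */
static int write_full(int fd, const char *buf, size_t len)
{
    while (len != 0) {
        ssize_t rc = write(fd, buf, len);

        if (rc < 0) {
            perror("write");
            return -1;          /* give up, as the dump helper does */
        }
        len -= (size_t)rc;      /* short write: loop for the rest */
        buf += rc;
    }
    return 0;
}

int main(void)
{
    const char msg[] = "checksum dump placeholder\n";
    int fd = open("/tmp/cksum-dump-sketch",
                  O_CREAT | O_EXCL | O_WRONLY, 0600);

    if (fd < 0) {
        perror("open");         /* e.g. EEXIST: a dump was already taken */
        return 1;
    }
    write_full(fd, msg, sizeof(msg) - 1);
    fsync(fd);                  /* mirrors the vfs_fsync_range() call */
    close(fd);
    return 0;
}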
+ oa->o_flags : 0); + + if (cksum_type != server_cksum_type) + msg = "the server may have not used the checksum type specified" + " in the original request - likely a protocol problem"; + else + msg = "should have changed on the client or in transit"; + + start = local_nb[0].lnb_file_offset; + end = local_nb[npages-1].lnb_file_offset + + local_nb[npages-1].lnb_len - 1; + + LCONSOLE_ERROR_MSG(0x132, "%s: BAD READ CHECKSUM: %s: from %s inode " + DFID " object "DOSTID" extent [%llu-%llu], client returned csum" + " %x (type %x), server csum %x (type %x)\n", + exp->exp_obd->obd_name, + msg, libcfs_nid2str(peer->nid), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, + POSTID(&oa->o_oi), + start, end, client_cksum, cksum_type, server_cksum, + server_cksum_type); + + return 1; +} + +static int tgt_pages2shortio(struct niobuf_local *local, int npages, + unsigned char *buf, int size) +{ + int i, off, len, copied = size; + char *ptr; + + for (i = 0; i < npages; i++) { + off = local[i].lnb_page_offset & ~PAGE_MASK; + len = local[i].lnb_len; + + CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n", + i, off, len, size); + if (len > size) + return -EINVAL; + + ptr = kmap_atomic(local[i].lnb_page); + memcpy(buf, ptr + off, len); + kunmap_atomic(ptr); + buf += len; + size -= len; + } + return copied - size; +} + +static int tgt_checksum_niobuf_t10pi(struct lu_target *tgt, + enum cksum_types cksum_type, + struct niobuf_local *local_nb, int npages, + int opc, obd_dif_csum_fn *fn, + int sector_size, u32 *check_sum, + bool resend) +{ + enum cksum_types t10_cksum_type = tgt->lut_dt_conf.ddp_t10_cksum_type; + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + const char *obd_name = tgt->lut_obd->obd_name; + struct ahash_request *req; + unsigned int bufsize; + unsigned char *buffer; + struct page *__page; + __be16 *guard_start; + int guard_number; + int used_number = 0; + __u32 cksum; + int rc = 0; + int used; + int i; + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + CERROR("%s: unable to initialize checksum hash %s\n", + tgt_name(tgt), cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(req); + } + + buffer = kmap(__page); + guard_start = (__be16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + if (unlikely(resend)) + CDEBUG(D_PAGE | D_HA, "GRD tags per page = %u\n", guard_number); + for (i = 0; i < npages; i++) { + bool use_t10_grd; + + /* corrupt the data before we compute the checksum, to + * simulate a client->OST data error */ + if (i == 0 && opc == OST_WRITE && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) { + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = kmap_atomic(local_nb[i].lnb_page); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad3", min(4, len)); + kunmap_atomic(ptr); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + use_t10_grd = t10_cksum_type && t10_cksum_type == cksum_type && + opc == OST_READ && + 
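tgt_pages2shortio() flattens the read pages into the contiguous short-I/O reply buffer: each local niobuf contributes lnb_len bytes taken at its in-page offset, the destination pointer advances, and the function returns the number of bytes copied, or an error if a segment would overflow the buffer. A standalone sketch with plain memory segments standing in for kmapped pages; sketch_seg and segs_to_shortio() are illustrative names.

#include <stdio.h>
#include <string.h>

struct sketch_seg {
    const char *data;   /* stands in for the kmapped page + offset */
    int len;            /* lnb_len */
};

/* Pack segments into buf; return bytes copied, or -1 on overflow (-EINVAL). */
static int segs_to_shortio(const struct sketch_seg *seg, int nsegs,
                           char *buf, int size)
{
    int i, copied = size;

    for (i = 0; i < nsegs; i++) {
        if (seg[i].len > size)
            return -1;                   /* would overflow the reply buffer */
        memcpy(buf, seg[i].data, seg[i].len);
        buf += seg[i].len;
        size -= seg[i].len;
    }
    return copied - size;                /* total bytes packed */
}

int main(void)
{
    struct sketch_seg segs[] = { { "hello ", 6 }, { "short io", 8 } };
    char out[32] = "";
    int n = segs_to_shortio(segs, 2, out, sizeof(out) - 1);

    printf("packed %d bytes: \"%s\"\n", n, out);   /* 14 bytes */
    return 0;
}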
local_nb[i].lnb_len == PAGE_SIZE && + local_nb[i].lnb_guard_disk; + if (use_t10_grd) { + used = DIV_ROUND_UP(local_nb[i].lnb_len, sector_size); + if (used > (guard_number - used_number)) { + rc = -E2BIG; + break; + } + memcpy(guard_start + used_number, + local_nb[i].lnb_guards, + used * sizeof(*guard_start)); + if (unlikely(resend)) + CDEBUG(D_PAGE | D_HA, + "lnb[%u]: used %u off %u+%u lnb checksum: %*phN\n", + i, used, + local_nb[i].lnb_page_offset, + local_nb[i].lnb_len, + (int)(used * sizeof(*guard_start)), + guard_start + used_number); + } + if (!use_t10_grd || unlikely(resend)) { + __be16 guard_tmp[MAX_GUARD_NUMBER]; + __be16 *guards = guard_start + used_number; + int used_tmp = -1, *usedp = &used; + + if (unlikely(use_t10_grd)) { + guards = guard_tmp; + usedp = &used_tmp; + } + rc = obd_page_dif_generate_buffer(obd_name, + local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len, guards, + guard_number - used_number, usedp, sector_size, + fn); + if (unlikely(resend)) { + bool bad = use_t10_grd && + memcmp(guard_tmp, + local_nb[i].lnb_guards, + used_tmp * sizeof(*guard_tmp)); + + if (bad) + CERROR("lnb[%u]: used %u/%u off %u+%u tmp checksum: %*phN\n", + i, used, used_tmp, + local_nb[i].lnb_page_offset, + local_nb[i].lnb_len, + (int)(used_tmp * sizeof(*guard_start)), + guard_tmp); + CDEBUG_LIMIT(D_PAGE | D_HA | (bad ? D_ERROR : 0), + "lnb[%u]: used %u/%u off %u+%u gen checksum: %*phN\n", + i, used, used_tmp, + local_nb[i].lnb_page_offset, + local_nb[i].lnb_len, + (int)(used * sizeof(*guard_start)), + guard_start + used_number); + } + if (rc) + break; + } + + LASSERT(used <= MAX_GUARD_NUMBER); + /* + * If disk support T10PI checksum, copy guards to local_nb. + * If the write is partial page, do not use the guards for bio + * submission since the data might not be full-sector. The bio + * guards will be generated later based on the full sectors. If + * the sector size is 512B rather than 4 KB, or the page size + * is larger than 4KB, this might drop some useful guards for + * partial page write, but it will only add minimal extra time + * of checksum calculation. 
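For T10-PI checksums, tgt_checksum_niobuf_t10pi() hashes per-sector guard tags rather than the data itself: each page contributes DIV_ROUND_UP(lnb_len, sector_size) 16-bit guards, the guards are staged in a page-sized buffer, and the staging buffer is folded into the hash whenever it fills. The counting and flushing logic is sketched below for 4 KiB pages and 512-byte sectors; flush_guards() is only a stand-in for cfs_crypto_hash_update_page().

#include <stdio.h>
#include <stdint.h>

#define SKETCH_PAGE_SIZE   4096
#define SKETCH_SECTOR_SIZE 512
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Stand-in for feeding the staged guard tags into the running hash. */
static void flush_guards(const uint16_t *guards, int count)
{
    (void)guards;
    printf("hashing %d guard tags (%zu bytes)\n",
           count, count * sizeof(*guards));
}

int main(void)
{
    /* One slot per 16-bit guard that fits in the staging page. */
    uint16_t staging[SKETCH_PAGE_SIZE / sizeof(uint16_t)];
    const int capacity = SKETCH_PAGE_SIZE / sizeof(uint16_t);   /* 2048 */
    int page_lens[] = { 4096, 4096, 1024 };   /* last page is partial */
    int used_number = 0;
    int i;

    for (i = 0; i < 3; i++) {
        int used = DIV_ROUND_UP(page_lens[i], SKETCH_SECTOR_SIZE);

        /* 8 guards for a full page, 2 for the 1024-byte tail */
        printf("page %d: %d guards\n", i, used);
        used_number += used;
        if (used_number == capacity) {
            flush_guards(staging, used_number);
            used_number = 0;
        }
    }
    if (used_number != 0)
        flush_guards(staging, used_number);   /* hash the remainder */
    return 0;
}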
+ */ + if (t10_cksum_type && t10_cksum_type == cksum_type && + opc == OST_WRITE && + local_nb[i].lnb_len == PAGE_SIZE) { + local_nb[i].lnb_guard_rpc = 1; + memcpy(local_nb[i].lnb_guards, + guard_start + used_number, + used * sizeof(*local_nb[i].lnb_guards)); + } + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + + /* corrupt the data after we compute the checksum, to + * simulate an OST->client data error */ + if (unlikely(i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND))) { + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = kmap_atomic(local_nb[i].lnb_page); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad4", min(4, len)); + kunmap_atomic(ptr); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + } + kunmap(__page); + if (rc) + GOTO(out, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); + rc = cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + + if (rc == 0) + *check_sum = cksum; +out: + __free_page(__page); + return rc; +} + +static int tgt_checksum_niobuf_rw(struct lu_target *tgt, + enum cksum_types cksum_type, + struct niobuf_local *local_nb, + int npages, int opc, u32 *check_sum, + bool resend) +{ + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + int rc; + + ENTRY; + obd_t10_cksum2dif(cksum_type, &fn, §or_size); + + if (fn) + rc = tgt_checksum_niobuf_t10pi(tgt, cksum_type, + local_nb, npages, + opc, fn, sector_size, + check_sum, resend); + else + rc = tgt_checksum_niobuf(tgt, local_nb, npages, opc, + cksum_type, check_sum); + + RETURN(rc); +} + +int tgt_brw_read(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct ptlrpc_bulk_desc *desc = NULL; + struct obd_export *exp = tsi->tsi_exp; + struct niobuf_remote *remote_nb; + struct niobuf_local *local_nb; + struct obd_ioobj *ioo; + struct ost_body *body, *repbody; + struct lustre_handle lockh = { 0 }; + int npages, nob = 0, rc, i, no_reply = 0, + npages_read; + struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; + const char *obd_name = exp->exp_obd->obd_name; + ktime_t kstart; + + ENTRY; + + if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL && + ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) { + CERROR("%s: deny read request from %s to portal %u\n", + tgt_name(tsi->tsi_tgt), + obd_export_nid2str(req->rq_export), + ptlrpc_req2svc(req)->srv_req_portal); + RETURN(-EPROTO); + } + + req->rq_bulk_read = 1; + + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK)) { + /* optionally use cfs_fail_val - 1 to select a specific OST on + * this server to fail requests. + */ + char fail_ost_name[MAX_OBD_NAME]; + + if (cfs_fail_val > 0) { + snprintf(fail_ost_name, MAX_OBD_NAME, "OST%04X", + cfs_fail_val - 1); + + if (strstr(obd_name, fail_ost_name)) + RETURN(err_serious(-EIO)); + } else { + RETURN(err_serious(-EIO)); + } + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ? 
+ cfs_fail_val : (obd_timeout + 1) / 4); + + /* Check if there is eviction in progress, and if so, wait for it to + * finish */ + if (unlikely(atomic_read(&exp->exp_obd->obd_evict_inprogress))) { + /* We do not care how long it takes */ + wait_event_idle( + exp->exp_obd->obd_evict_inprogress_waitq, + !atomic_read(&exp->exp_obd->obd_evict_inprogress)); + } + + /* There must be big cache in current thread to process this request + * if it is NULL then something went wrong and it wasn't allocated, + * report -ENOMEM in that case */ + if (tbc == NULL) + RETURN(-ENOMEM); + + body = tsi->tsi_ost_body; + LASSERT(body != NULL); + + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_NORPC) + RETURN(0); + + ioo = req_capsule_client_get(tsi->tsi_pill, &RMF_OBD_IOOBJ); + LASSERT(ioo != NULL); /* must exists after tgt_ost_body_unpack */ + + remote_nb = req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE); + LASSERT(remote_nb != NULL); /* must exists after tgt_ost_body_unpack */ + + local_nb = tbc->local; + + rc = tgt_brw_lock(tsi->tsi_env, exp, &tsi->tsi_resid, ioo, remote_nb, + &lockh, LCK_PR); + if (rc != 0) + RETURN(rc); + + /* + * If getting the lock took more time than + * client was willing to wait, drop it. b=11330 + */ + if (ktime_get_real_seconds() > req->rq_deadline || + OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) { + no_reply = 1; + CERROR("Dropping timed-out read from %s because locking object " DOSTID " took %lld seconds (limit was %lld).\n", + libcfs_id2str(req->rq_peer), POSTID(&ioo->ioo_oid), + ktime_get_real_seconds() - req->rq_arrival_time.tv_sec, + req->rq_deadline - req->rq_arrival_time.tv_sec); + GOTO(out_lock, rc = -ETIMEDOUT); + } + + /* + * Because we already sync grant info with client when + * reconnect, grant info will be cleared for resent req, + * otherwise, outdated grant count in the rpc would de-sync + * grant counters in case of shrink + */ + if (lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) { + DEBUG_REQ(D_CACHE, req, "clear resent/replay req grant info"); + body->oa.o_valid &= ~OBD_MD_FLGRANT; + } + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + repbody->oa = body->oa; + + npages = PTLRPC_MAX_BRW_PAGES; + kstart = ktime_get(); + rc = obd_preprw(tsi->tsi_env, OBD_BRW_READ, exp, &repbody->oa, 1, + ioo, remote_nb, &npages, local_nb); + if (rc != 0) + GOTO(out_lock, rc); + + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + desc = NULL; + } else { + desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), + PTLRPC_BULK_PUT_SOURCE, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(out_commitrw, rc = -ENOMEM); + } + + npages_read = npages; + for (i = 0; i < npages; i++) { + int page_rc = local_nb[i].lnb_rc; + + if (page_rc < 0) { + rc = page_rc; + npages_read = i; + break; + } + + nob += page_rc; + if (page_rc != 0 && desc != NULL) { /* some data! */ + LASSERT(local_nb[i].lnb_page != NULL); + desc->bd_frag_ops->add_kiov_frag + (desc, local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + page_rc); + } + + if (page_rc != local_nb[i].lnb_len) { /* short read */ + local_nb[i].lnb_len = page_rc; + npages_read = i + (page_rc != 0 ? 1 : 0); + /* All subsequent pages should be 0 */ + while (++i < npages) + LASSERT(local_nb[i].lnb_rc == 0); + break; + } + } + + if (body->oa.o_valid & OBD_MD_FLCKSUM) { + u32 flag = body->oa.o_valid & OBD_MD_FLFLAGS ? 
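The read reply in tgt_brw_read() is assembled page by page from local_nb[i].lnb_rc: a negative value aborts, a full page keeps the loop going, and a short count marks the last page that carries data (all later pages must be empty). A compact sketch of that accounting with plain ints in place of the niobuf array; count_read_bytes() is an illustrative name.

#include <stdio.h>

/*
 * page_rc[i] mimics local_nb[i].lnb_rc: bytes produced for page i, or a
 * negative errno. Returns the total byte count (nob) or the first error,
 * and reports how many pages actually carry data.
 */
static long count_read_bytes(const int *page_rc, int npages, int page_size,
                             int *npages_read)
{
    long nob = 0;
    int i;

    *npages_read = npages;
    for (i = 0; i < npages; i++) {
        if (page_rc[i] < 0) {
            *npages_read = i;
            return page_rc[i];                 /* propagate the error */
        }
        nob += page_rc[i];
        if (page_rc[i] != page_size) {         /* short read: last data page */
            *npages_read = i + (page_rc[i] != 0 ? 1 : 0);
            break;
        }
    }
    return nob;
}

int main(void)
{
    int rcs[] = { 4096, 4096, 1000, 0 };       /* short read on page 2 */
    int used;
    long nob = count_read_bytes(rcs, 4, 4096, &used);

    printf("nob=%ld pages_with_data=%d\n", nob, used);   /* 9192, 3 */
    return 0;
}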
+ body->oa.o_flags : 0; + enum cksum_types cksum_type = obd_cksum_type_unpack(flag); + bool resend = (body->oa.o_valid & OBD_MD_FLFLAGS) && + (body->oa.o_flags & OBD_FL_RECOV_RESEND); + + repbody->oa.o_flags = obd_cksum_type_pack(obd_name, + cksum_type); + repbody->oa.o_valid = OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + + rc = tgt_checksum_niobuf_rw(tsi->tsi_tgt, cksum_type, + local_nb, npages_read, OST_READ, + &repbody->oa.o_cksum, resend); + if (rc < 0) + GOTO(out_commitrw, rc); + CDEBUG(D_PAGE | (resend ? D_HA : 0), + "checksum at read origin: %x (%x)\n", + repbody->oa.o_cksum, cksum_type); + + /* if a resend it could be for a cksum error, so check Server + * cksum with returned Client cksum (this should even cover + * zero-cksum case) */ + if (resend) + check_read_checksum(local_nb, npages_read, exp, + &body->oa, &req->rq_peer, + body->oa.o_cksum, + repbody->oa.o_cksum, cksum_type); + } else { + repbody->oa.o_valid = 0; + } + if (body->oa.o_valid & OBD_MD_FLGRANT) + repbody->oa.o_valid |= OBD_MD_FLGRANT; + /* We're finishing using body->oa as an input variable */ + + /* Check if client was evicted while we were doing i/o before touching + * network */ + if (rc == 0) { + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + unsigned char *short_io_buf; + int short_io_size; + + short_io_buf = req_capsule_server_get(&req->rq_pill, + &RMF_SHORT_IO); + short_io_size = req_capsule_get_size(&req->rq_pill, + &RMF_SHORT_IO, + RCL_SERVER); + rc = tgt_pages2shortio(local_nb, npages_read, + short_io_buf, short_io_size); + if (rc >= 0) + req_capsule_shrink(&req->rq_pill, + &RMF_SHORT_IO, rc, + RCL_SERVER); + rc = rc > 0 ? 0 : rc; + } else if (!CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) { + rc = target_bulk_io(exp, desc); + } + no_reply = rc != 0; + } else { + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) + req_capsule_shrink(&req->rq_pill, &RMF_SHORT_IO, 0, + RCL_SERVER); + } + +out_commitrw: + /* Must commit after prep above in all cases */ + rc = obd_commitrw(tsi->tsi_env, OBD_BRW_READ, exp, &repbody->oa, 1, ioo, + remote_nb, npages, local_nb, rc, nob, kstart); +out_lock: + tgt_brw_unlock(exp, ioo, remote_nb, &lockh, LCK_PR); + + if (desc && !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) + ptlrpc_free_bulk(desc); + + LASSERT(rc <= 0); + if (rc == 0) { + rc = nob; + ptlrpc_lprocfs_brw(req, nob); + } else if (no_reply) { + req->rq_no_reply = 1; + /* reply out callback would free */ + ptlrpc_req_drop_rs(req); + LCONSOLE_WARN("%s: Bulk IO read error with %s (at %s), " + "client will retry: rc %d\n", + obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), rc); + } + /* send a bulk after reply to simulate a network delay or reordering + * by a router - Note that !desc implies short io, so there is no bulk + * to reorder. */ + if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) && + desc) { + /* Calculate checksum before request transfer, original + * it is done by target_bulk_io() */ + rc = sptlrpc_svc_wrap_bulk(req, desc); + if (OCD_HAS_FLAG(&exp->exp_connect_data, BULK_MBITS)) + req->rq_mbits = lustre_msg_get_mbits(req->rq_reqmsg); + else /* old version, bulk matchbits is rq_xid */ + req->rq_mbits = req->rq_xid; + + req->rq_status = rc; + target_committed_to_req(req); + target_send_reply(req, 0, 0); + + CDEBUG(D_INFO, "reorder BULK\n"); + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2, + cfs_fail_val ? 
: 3); + + target_bulk_io(exp, desc); + ptlrpc_free_bulk(desc); + } + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_brw_read); + +static int tgt_shortio2pages(struct niobuf_local *local, int npages, + unsigned char *buf, unsigned int size) +{ + int i, off, len; + char *ptr; + + for (i = 0; i < npages; i++) { + off = local[i].lnb_page_offset & ~PAGE_MASK; + len = local[i].lnb_len; + + if (len == 0) + continue; + + CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n", + i, off, len, size); + ptr = kmap_atomic(local[i].lnb_page); + if (ptr == NULL) + return -EINVAL; + memcpy(ptr + off, buf, len < size ? len : size); + kunmap_atomic(ptr); + buf += len; + size -= len; + } + return 0; +} + +static void tgt_warn_on_cksum(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + struct niobuf_local *local_nb, int npages, + u32 client_cksum, u32 server_cksum, + bool mmap) +{ + struct obd_export *exp = req->rq_export; + struct ost_body *body; + char *router = ""; + char *via = ""; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body != NULL); + + if (desc && req->rq_peer.nid != desc->bd_sender) { + via = " via "; + router = libcfs_nid2str(desc->bd_sender); + } + + if (exp->exp_obd->obd_checksum_dump) + dump_all_bulk_pages(&body->oa, npages, local_nb, server_cksum, + client_cksum); + + if (mmap) { + CDEBUG_LIMIT(D_INFO, "client csum %x, server csum %x\n", + client_cksum, server_cksum); + return; + } + + LCONSOLE_ERROR_MSG(0x168, "%s: BAD WRITE CHECKSUM: from %s%s%s inode " + DFID" object "DOSTID" extent [%llu-%llu" + "]: client csum %x, server csum %x\n", + exp->exp_obd->obd_name, libcfs_id2str(req->rq_peer), + via, router, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_seq : (__u64)0, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_oid : 0, + body->oa.o_valid & OBD_MD_FLFID ? 
+ body->oa.o_parent_ver : 0, + POSTID(&body->oa.o_oi), + local_nb[0].lnb_file_offset, + local_nb[npages-1].lnb_file_offset + + local_nb[npages - 1].lnb_len - 1, + client_cksum, server_cksum); +} + +int tgt_brw_write(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct ptlrpc_bulk_desc *desc = NULL; + struct obd_export *exp = req->rq_export; + struct niobuf_remote *remote_nb; + struct niobuf_local *local_nb; + struct obd_ioobj *ioo; + struct ost_body *body, *repbody; + struct lustre_handle lockh = {0}; + __u32 *rcs; + int objcount, niocount, npages; + int rc = 0; + int i, j; + enum cksum_types cksum_type = OBD_CKSUM_CRC32; + bool no_reply = false, mmap; + struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; + bool wait_sync = false; + const char *obd_name = exp->exp_obd->obd_name; + /* '1' for consistency with code that checks !mpflag to restore */ + unsigned int mpflags = 1; + ktime_t kstart; + int nob = 0; + + ENTRY; + + if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL && + ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) { + CERROR("%s: deny write request from %s to portal %u\n", + tgt_name(tsi->tsi_tgt), + obd_export_nid2str(req->rq_export), + ptlrpc_req2svc(req)->srv_req_portal); + RETURN(err_serious(-EPROTO)); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOSPC)) + RETURN(err_serious(-ENOSPC)); + if (OBD_FAIL_TIMEOUT(OBD_FAIL_OST_EROFS, 1)) + RETURN(err_serious(-EROFS)); + + req->rq_bulk_write = 1; + + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK)) + rc = -EIO; + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK2)) + rc = -EFAULT; + if (rc < 0) { + /* optionally use cfs_fail_val - 1 to select a specific OST on + * this server to fail requests. + */ + char fail_ost_name[MAX_OBD_NAME]; + + if (cfs_fail_val > 0) { + snprintf(fail_ost_name, MAX_OBD_NAME, "OST%04X", + cfs_fail_val - 1); + + if (strstr(obd_name, fail_ost_name)) + RETURN(err_serious(rc)); + } else { + RETURN(err_serious(rc)); + } + } + + /* pause before transaction has been started */ + CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ? 
+ cfs_fail_val : (obd_timeout + 1) / 4); + + /* Delay write commit to show stale size information */ + CFS_FAIL_TIMEOUT(OBD_FAIL_OSC_NO_SIZE_DATA, cfs_fail_val); + + /* There must be big cache in current thread to process this request + * if it is NULL then something went wrong and it wasn't allocated, + * report -ENOMEM in that case */ + if (tbc == NULL) + RETURN(-ENOMEM); + + body = tsi->tsi_ost_body; + LASSERT(body != NULL); + + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_NORPC) + RETURN(0); + + + ioo = req_capsule_client_get(&req->rq_pill, &RMF_OBD_IOOBJ); + LASSERT(ioo != NULL); /* must exists after tgt_ost_body_unpack */ + + objcount = req_capsule_get_size(&req->rq_pill, &RMF_OBD_IOOBJ, + RCL_CLIENT) / sizeof(*ioo); + + for (niocount = i = 0; i < objcount; i++) + niocount += ioo[i].ioo_bufcnt; + + remote_nb = req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE); + LASSERT(remote_nb != NULL); /* must exists after tgt_ost_body_unpack */ + if (niocount != req_capsule_get_size(&req->rq_pill, + &RMF_NIOBUF_REMOTE, RCL_CLIENT) / + sizeof(*remote_nb)) + RETURN(err_serious(-EPROTO)); + + if ((remote_nb[0].rnb_flags & OBD_BRW_MEMALLOC) && + ptlrpc_connection_is_local(exp->exp_connection)) + mpflags = memalloc_noreclaim_save(); + + req_capsule_set_size(&req->rq_pill, &RMF_RCS, RCL_SERVER, + niocount * sizeof(*rcs)); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc != 0) + GOTO(out, rc = err_serious(rc)); + + CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_PACK, cfs_fail_val); + rcs = req_capsule_server_get(&req->rq_pill, &RMF_RCS); + + local_nb = tbc->local; + + rc = tgt_brw_lock(tsi->tsi_env, exp, &tsi->tsi_resid, ioo, remote_nb, + &lockh, LCK_PW); + if (rc != 0) + GOTO(out, rc); + + /* + * If getting the lock took more time than + * client was willing to wait, drop it. 
b=11330 + */ + if (ktime_get_real_seconds() > req->rq_deadline || + OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) { + no_reply = true; + CERROR("%s: Dropping timed-out write from %s because locking object " DOSTID " took %lld seconds (limit was %lld).\n", + tgt_name(tsi->tsi_tgt), libcfs_id2str(req->rq_peer), + POSTID(&ioo->ioo_oid), + ktime_get_real_seconds() - req->rq_arrival_time.tv_sec, + req->rq_deadline - req->rq_arrival_time.tv_sec); + GOTO(out_lock, rc = -ETIMEDOUT); + } + + /* Because we already sync grant info with client when reconnect, + * grant info will be cleared for resent req, then fed_grant and + * total_grant will not be modified in following preprw_write */ + if (lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) { + DEBUG_REQ(D_CACHE, req, "clear resent/replay req grant info"); + body->oa.o_valid &= ~OBD_MD_FLGRANT; + } + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (repbody == NULL) + GOTO(out_lock, rc = -ENOMEM); + repbody->oa = body->oa; + + npages = PTLRPC_MAX_BRW_PAGES; + kstart = ktime_get(); + rc = obd_preprw(tsi->tsi_env, OBD_BRW_WRITE, exp, &repbody->oa, + objcount, ioo, remote_nb, &npages, local_nb); + if (rc < 0) + GOTO(out_lock, rc); + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + unsigned int short_io_size; + unsigned char *short_io_buf; + + short_io_size = req_capsule_get_size(&req->rq_pill, + &RMF_SHORT_IO, + RCL_CLIENT); + short_io_buf = req_capsule_client_get(&req->rq_pill, + &RMF_SHORT_IO); + CDEBUG(D_INFO, "Client use short io for data transfer," + " size = %d\n", short_io_size); + + /* Copy short io buf to pages */ + rc = tgt_shortio2pages(local_nb, npages, short_io_buf, + short_io_size); + desc = NULL; + } else { + desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), + PTLRPC_BULK_GET_SINK, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(skip_transfer, rc = -ENOMEM); + + /* NB Having prepped, we must commit... */ + for (i = 0; i < npages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, + local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len); + + rc = sptlrpc_svc_prep_bulk(req, desc); + if (rc != 0) + GOTO(skip_transfer, rc); + + rc = target_bulk_io(exp, desc); + } + + no_reply = rc != 0; + +skip_transfer: + if (body->oa.o_valid & OBD_MD_FLCKSUM && rc == 0) { + static int cksum_counter; + + if (body->oa.o_valid & OBD_MD_FLFLAGS) + cksum_type = obd_cksum_type_unpack(body->oa.o_flags); + + repbody->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + repbody->oa.o_flags &= ~OBD_FL_CKSUM_ALL; + repbody->oa.o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); + + rc = tgt_checksum_niobuf_rw(tsi->tsi_tgt, cksum_type, + local_nb, npages, OST_WRITE, + &repbody->oa.o_cksum, false); + if (rc < 0) + GOTO(out_commitrw, rc); + + cksum_counter++; + + if (unlikely(body->oa.o_cksum != repbody->oa.o_cksum)) { + mmap = (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_MMAP); + + tgt_warn_on_cksum(req, desc, local_nb, npages, + body->oa.o_cksum, + repbody->oa.o_cksum, mmap); + cksum_counter = 0; + } else if ((cksum_counter & (-cksum_counter)) == + cksum_counter) { + CDEBUG(D_INFO, "Checksum %u from %s OK: %x\n", + cksum_counter, libcfs_id2str(req->rq_peer), + repbody->oa.o_cksum); + } + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK2, cfs_fail_val); + +out_commitrw: + /* calculate the expected actual write bytes (nob) for OFD stats. 
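On the write path the server recomputes the bulk checksum and compares it with the client's value; a mismatch is reported loudly, while successful matches are logged only when the running counter is a power of two, using the (x & -x) == x test, so steady traffic does not flood the logs. A tiny sketch of that throttle, printing the counter values that would produce a log line.

#include <stdio.h>

/* True when v has exactly one bit set (or is zero): log-throttling test. */
static int is_pow2_or_zero(unsigned int v)
{
    return (v & -v) == v;
}

int main(void)
{
    unsigned int counter;

    /* Successful checksum matches 1..20: only 1, 2, 4, 8, 16 get logged. */
    for (counter = 1; counter <= 20; counter++)
        if (is_pow2_or_zero(counter))
            printf("would log OK checksum #%u\n", counter);
    return 0;
}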
+ * Technically, if commit fails this would be wrong, but that should be + * very rare + */ + for (i = 0; i < niocount; i++) { + int len = remote_nb[i].rnb_len; + + nob += len; + } + + /* multiple transactions can be assigned during write commit */ + tti->tti_mult_trans = 1; + + /* Must commit after prep above in all cases */ + rc = obd_commitrw(tsi->tsi_env, OBD_BRW_WRITE, exp, &repbody->oa, + objcount, ioo, remote_nb, npages, local_nb, rc, nob, + kstart); + if (rc == -ENOTCONN) + /* quota acquire process has been given up because + * either the client has been evicted or the client + * has timed out the request already + */ + no_reply = true; + + for (i = 0; i < niocount; i++) { + if (!(local_nb[i].lnb_flags & OBD_BRW_ASYNC)) { + wait_sync = true; + break; + } + } + /* + * Disable sending mtime back to the client. If the client locked the + * whole object, then it has already updated the mtime on its side, + * otherwise it will have to glimpse anyway (see bug 21489, comment 32) + */ + repbody->oa.o_valid &= ~(OBD_MD_FLMTIME | OBD_MD_FLATIME); + + if (rc == 0) { + /* set per-requested niobuf return codes */ + for (i = j = 0; i < niocount; i++) { + int len = remote_nb[i].rnb_len; + + rcs[i] = 0; + do { + LASSERT(j < npages); + if (local_nb[j].lnb_rc < 0) + rcs[i] = local_nb[j].lnb_rc; + len -= local_nb[j].lnb_len; + j++; + } while (len > 0); + LASSERT(len == 0); + } + LASSERT(j == npages); + ptlrpc_lprocfs_brw(req, nob); + } +out_lock: + tgt_brw_unlock(exp, ioo, remote_nb, &lockh, LCK_PW); + if (desc) + ptlrpc_free_bulk(desc); +out: + if (unlikely(no_reply || (exp->exp_obd->obd_no_transno && wait_sync))) { + req->rq_no_reply = 1; + /* reply out callback would free */ + ptlrpc_req_drop_rs(req); + if (!exp->exp_obd->obd_no_transno) + LCONSOLE_WARN("%s: Bulk IO write error with %s (at %s)," + " client will retry: rc = %d\n", + obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), rc); + } + + if (mpflags) + memalloc_noreclaim_restore(mpflags); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_brw_write); + +/** + * Common request handler for OST_SEEK RPC. + * + * Unified request handling for OST_SEEK RPC. + * It takes object by its FID, does needed lseek and packs result + * into reply. Only SEEK_HOLE and SEEK_DATA are supported. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +int tgt_lseek(struct tgt_session_info *tsi) +{ + struct lustre_handle lh = { 0 }; + struct dt_object *dob; + struct ost_body *repbody; + loff_t offset = tsi->tsi_ost_body->oa.o_size; + int whence = tsi->tsi_ost_body->oa.o_mode; + bool srvlock; + int rc = 0; + + ENTRY; + + if (whence != SEEK_HOLE && whence != SEEK_DATA) + RETURN(-EPROTO); + + /* Negative offset is prohibited on wire and must be handled on client + * prior sending RPC. 
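After commit, tgt_brw_write() converts per-page results back into per-niobuf return codes: for each remote niobuf it walks the local pages that backed it (their lengths sum to the niobuf's length) and records any negative lnb_rc it finds. A standalone sketch of that mapping; the sketch_remote/sketch_local structs are simplified stand-ins for niobuf_remote and niobuf_local.

#include <stdio.h>

struct sketch_remote { int len; };           /* rnb_len */
struct sketch_local  { int len; int rc; };   /* lnb_len, lnb_rc */

/* Fill rcs[i] with 0, or with the error of any page backing remote niobuf i. */
static void fill_niobuf_rcs(const struct sketch_remote *rnb, int niocount,
                            const struct sketch_local *lnb, int *rcs)
{
    int i, j = 0;

    for (i = 0; i < niocount; i++) {
        int len = rnb[i].len;

        rcs[i] = 0;
        do {
            if (lnb[j].rc < 0)
                rcs[i] = lnb[j].rc;          /* remember the failure */
            len -= lnb[j].len;
            j++;
        } while (len > 0);
    }
}

int main(void)
{
    /* Two remote niobufs of 8K each, backed by four 4K pages. */
    struct sketch_remote rnb[] = { { 8192 }, { 8192 } };
    struct sketch_local  lnb[] = {
        { 4096, 0 }, { 4096, 0 },            /* niobuf 0: fine */
        { 4096, 0 }, { 4096, -28 },          /* niobuf 1: -ENOSPC */
    };
    int rcs[2];

    fill_niobuf_rcs(rnb, 2, lnb, rcs);
    printf("rcs = { %d, %d }\n", rcs[0], rcs[1]);   /* { 0, -28 } */
    return 0;
}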
+ */ + if (offset < 0) + RETURN(-EPROTO); + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (repbody == NULL) + RETURN(-ENOMEM); + repbody->oa = tsi->tsi_ost_body->oa; + + srvlock = tsi->tsi_ost_body->oa.o_valid & OBD_MD_FLFLAGS && + tsi->tsi_ost_body->oa.o_flags & OBD_FL_SRVLOCK; + if (srvlock) { + rc = tgt_data_lock(tsi->tsi_env, tsi->tsi_exp, &tsi->tsi_resid, + offset, OBD_OBJECT_EOF, &lh, LCK_PR); + if (rc) + RETURN(rc); + } + + dob = dt_locate(tsi->tsi_env, tsi->tsi_tgt->lut_bottom, &tsi->tsi_fid); + if (IS_ERR(dob)) + GOTO(out, rc = PTR_ERR(dob)); + + if (!dt_object_exists(dob)) + GOTO(obj_put, rc = -ENOENT); + + repbody->oa.o_size = dt_lseek(tsi->tsi_env, dob, offset, whence); + rc = 0; +obj_put: + dt_object_put(tsi->tsi_env, dob); +out: + if (srvlock) + tgt_data_unlock(&lh, LCK_PR); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_lseek); + +/* Check if request can be reconstructed from saved reply data + * A copy of the reply data is returned in @trd if the pointer is not NULL + */ +int req_can_reconstruct(struct ptlrpc_request *req, + struct tg_reply_data *trd) +{ + struct tg_export_data *ted = &req->rq_export->exp_target_data; + struct lsd_client_data *lcd = ted->ted_lcd; + int found; + + if (tgt_is_multimodrpcs_client(req->rq_export)) + return tgt_lookup_reply(req, trd); + + mutex_lock(&ted->ted_lcd_lock); + found = req->rq_xid == lcd->lcd_last_xid || + req->rq_xid == lcd->lcd_last_close_xid; + + if (found && trd != NULL) { + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE) { + trd->trd_reply.lrd_xid = lcd->lcd_last_close_xid; + trd->trd_reply.lrd_transno = + lcd->lcd_last_close_transno; + trd->trd_reply.lrd_result = lcd->lcd_last_close_result; + } else { + trd->trd_reply.lrd_xid = lcd->lcd_last_xid; + trd->trd_reply.lrd_transno = lcd->lcd_last_transno; + trd->trd_reply.lrd_result = lcd->lcd_last_result; + trd->trd_reply.lrd_data = lcd->lcd_last_data; + trd->trd_pre_versions[0] = lcd->lcd_pre_versions[0]; + trd->trd_pre_versions[1] = lcd->lcd_pre_versions[1]; + trd->trd_pre_versions[2] = lcd->lcd_pre_versions[2]; + trd->trd_pre_versions[3] = lcd->lcd_pre_versions[3]; + } + } + mutex_unlock(&ted->ted_lcd_lock); + + return found; +} +EXPORT_SYMBOL(req_can_reconstruct); + diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_internal.h b/drivers/staging/lustrefsx/lustre/target/tgt_internal.h new file mode 100644 index 0000000000000..39fb4101e6f2c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_internal.h @@ -0,0 +1,302 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. 
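req_can_reconstruct() decides whether a resent request can be answered from saved state: for clients without multiple modifying RPCs it compares the request xid against the last regular xid and the last close xid recorded in the per-client slot and, on a match, hands back the saved transno and result. A simplified userspace sketch with a cut-down struct standing in for lsd_client_data and no locking; all names here are illustrative.

#include <stdio.h>
#include <stdint.h>

struct sketch_client_data {               /* cut-down lsd_client_data */
    uint64_t last_xid, last_transno;
    int      last_result;
    uint64_t last_close_xid, last_close_transno;
    int      last_close_result;
};

struct sketch_reply { uint64_t xid, transno; int result; };

/* Return 1 and fill *out if the xid matches a saved reply, else 0. */
static int can_reconstruct(const struct sketch_client_data *lcd,
                           uint64_t xid, int is_close,
                           struct sketch_reply *out)
{
    if (xid != lcd->last_xid && xid != lcd->last_close_xid)
        return 0;                          /* genuinely new request */

    if (is_close) {
        out->xid = lcd->last_close_xid;
        out->transno = lcd->last_close_transno;
        out->result = lcd->last_close_result;
    } else {
        out->xid = lcd->last_xid;
        out->transno = lcd->last_transno;
        out->result = lcd->last_result;
    }
    return 1;
}

int main(void)
{
    struct sketch_client_data lcd = {
        .last_xid = 42, .last_transno = 1001, .last_result = 0,
        .last_close_xid = 40, .last_close_transno = 998,
        .last_close_result = 0,
    };
    struct sketch_reply r;

    if (can_reconstruct(&lcd, 42, 0, &r))
        printf("resend of xid 42: transno=%llu rc=%d\n",
               (unsigned long long)r.transno, r.result);
    return 0;
}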
+ */ +/* + * lustre/target/tgt_internal.h + * + * Lustre Unified Target header file + * + * Author: Mikhail Pershin + */ + +#ifndef _TG_INTERNAL_H +#define _TG_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include + +extern int (*tgt_lfsck_in_notify_local)(const struct lu_env *env, + struct dt_device *key, + struct lfsck_req_local *lrl, + struct thandle *th); +/** + * Common data shared by tg-level handlers. This is allocated per-thread to + * reduce stack consumption. + */ +struct tgt_thread_info { + /* server and client data buffers */ + struct lr_server_data tti_lsd; + struct lsd_client_data tti_lcd; + struct lsd_reply_data tti_lrd; + struct lu_buf tti_buf; + loff_t tti_off; + + struct lu_attr tti_attr; + struct lu_fid tti_fid1; + + /* transno storage during last_rcvd update */ + __u64 tti_transno; + __u32 tti_has_trans:1, + tti_mult_trans:1; + + /* Updates data for OUT target */ + struct thandle_exec_args tti_tea; + union { + struct { + /* for tgt_readpage() */ + struct lu_rdpg tti_rdpg; + } rdpg; + struct { + struct dt_object_format tti_update_dof; + struct object_update_reply *tti_update_reply; + struct object_update *tti_update; + int tti_update_reply_index; + struct obdo tti_obdo; + struct dt_object *tti_dt_object; + } update; + struct obd_statfs osfs; /* for obd_statfs() in OFD/MDT */ + } tti_u; + struct lfsck_req_local tti_lrl; + struct dt_insert_rec tti_rec; +}; + +extern struct lu_context_key tgt_thread_key; + +static inline struct tgt_thread_info *tgt_th_info(const struct lu_env *env) +{ + struct tgt_thread_info *tti; + + tti = lu_context_key_get(&env->le_ctx, &tgt_thread_key); + LASSERT(tti); + return tti; +} + +#define MGS_SERVICE_WATCHDOG_FACTOR (2) + +int tgt_request_handle(struct ptlrpc_request *req); + +/* check if request's xid is equal to last one or not*/ +static inline int req_xid_is_last(struct ptlrpc_request *req) +{ + struct lsd_client_data *lcd = req->rq_export->exp_target_data.ted_lcd; + + LASSERT(lcd != NULL); + return (req->rq_xid == lcd->lcd_last_xid || + req->rq_xid == lcd->lcd_last_close_xid); +} + +static inline char *dt_obd_name(struct dt_device *dt) +{ + return dt->dd_lu_dev.ld_obd->obd_name; +} + +/* out_lib.c */ +int out_tx_create_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg); +struct tx_arg *tx_add_exec(struct thandle_exec_args *ta, + tx_exec_func_t func, tx_exec_func_t undo, + const char *file, int line); + +int out_create_add_exec(const struct lu_env *env, struct dt_object *obj, + struct lu_attr *attr, struct lu_fid *parent_fid, + struct dt_object_format *dof, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +int out_attr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_attr *attr, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_write_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, loff_t pos, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_xattr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, const char *name, + int flags, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_xattr_del_add_exec(const struct lu_env *env, 
struct dt_object *dt_obj, + const char *name, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_ref_add_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_ref_del_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_index_insert_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +int out_index_delete_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +int out_destroy_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +/* Update handlers */ +int out_handle(struct tgt_session_info *tsi); + +#define out_tx_create(env, obj, attr, fid, dof, ta, th, reply, idx) \ + out_create_add_exec(env, obj, attr, fid, dof, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_attr_set(env, obj, attr, ta, th, reply, idx) \ + out_attr_set_add_exec(env, obj, attr, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_xattr_set(env, obj, buf, name, fl, ta, th, reply, idx) \ + out_xattr_set_add_exec(env, obj, buf, name, fl, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_xattr_del(env, obj, name, ta, th, reply, idx) \ + out_xattr_del_add_exec(env, obj, name, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_ref_add(env, obj, ta, th, reply, idx) \ + out_ref_add_add_exec(env, obj, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_ref_del(env, obj, ta, th, reply, idx) \ + out_ref_del_add_exec(env, obj, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_index_insert(env, obj, rec, key, ta, th, reply, idx) \ + out_index_insert_add_exec(env, obj, rec, key, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_index_delete(env, obj, key, ta, th, reply, idx) \ + out_index_delete_add_exec(env, obj, key, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_destroy(env, obj, ta, th, reply, idx) \ + out_destroy_add_exec(env, obj, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_write(env, obj, buf, pos, ta, th, reply, idx) \ + out_write_add_exec(env, obj, buf, pos, ta, th, reply, idx,\ + __FILE__, __LINE__) + +const char *update_op_str(__u16 opcode); + +extern struct page *tgt_page_to_corrupt; + +int tgt_server_data_init(const struct lu_env *env, struct lu_target *tgt); +int tgt_txn_start_cb(const struct lu_env *env, struct thandle *th, + void *cookie); +int tgt_txn_stop_cb(const struct lu_env *env, struct thandle *th, + void *cookie); +int tgt_handle_received_xid(struct obd_export *exp, __u64 rcvd_xid); +int tgt_handle_tag(struct ptlrpc_request *req); + +void update_records_dump(const struct update_records *records, + unsigned int mask, bool dump_updates); +int check_and_prepare_update_record(const struct lu_env *env, + struct 
thandle_update_records *tur); +struct update_thread_info { + struct lu_attr uti_attr; + struct lu_fid uti_fid; + struct lu_buf uti_buf; + struct thandle_update_records uti_tur; + struct obdo uti_obdo; + struct thandle_exec_args uti_tea; + struct dt_insert_rec uti_rec; + struct distribute_txn_replay_req *uti_dtrq; +}; + +extern struct lu_context_key update_thread_key; + +static inline struct update_thread_info * +update_env_info(const struct lu_env *env) +{ + struct update_thread_info *uti; + + uti = lu_context_key_get(&env->le_ctx, &update_thread_key); + LASSERT(uti != NULL); + return uti; +} + +void update_info_init(void); +void update_info_fini(void); +struct sub_thandle *create_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev); +int sub_thandle_trans_create(const struct lu_env *env, + struct top_thandle *top_th, + struct sub_thandle *st); +void distribute_txn_insert_by_batchid(struct top_multiple_thandle *new); +int top_trans_create_tmt(const struct lu_env *env, + struct top_thandle *top_th); + +void tgt_cancel_slc_locks(struct lu_target *tgt, __u64 transno); +void barrier_init(void); +void barrier_fini(void); + +/* FMD tracking data */ +struct tgt_fmd_data { + struct list_head fmd_list; /* linked to tgt_fmd_list */ + struct lu_fid fmd_fid; /* FID being written to */ + __u64 fmd_mactime_xid; /* xid highest {m,a,c}time setattr */ + time64_t fmd_expire; /* time when the fmd should expire */ + int fmd_refcount; /* reference counter - list holds 1 */ +}; + +/* tgt_fmd.c */ +extern struct kmem_cache *tgt_fmd_kmem; +void tgt_fmd_expire(struct obd_export *exp); +void tgt_fmd_cleanup(struct obd_export *exp); + +#endif /* _TG_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c b/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c new file mode 100644 index 0000000000000..4341d75b0bf38 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c @@ -0,0 +1,2282 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Lustre Unified Target + * These are common function to work with last_received file + * + * Author: Mikhail Pershin + */ +#include +#include +#include + +#include "tgt_internal.h" + +/** version recovery epoch */ +#define LR_EPOCH_BITS 32 + +/* Allocate a bitmap for a chunk of reply data slots */ +static int tgt_bitmap_chunk_alloc(struct lu_target *lut, int chunk) +{ + unsigned long *bm; + + OBD_ALLOC_LARGE(bm, BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + if (bm == NULL) + return -ENOMEM; + + spin_lock(&lut->lut_client_bitmap_lock); + + if (lut->lut_reply_bitmap[chunk] != NULL) { + /* someone else already allocated the bitmap for this chunk */ + spin_unlock(&lut->lut_client_bitmap_lock); + OBD_FREE_LARGE(bm, BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + return 0; + } + + lut->lut_reply_bitmap[chunk] = bm; + + spin_unlock(&lut->lut_client_bitmap_lock); + + return 0; +} + +/* Look for an available reply data slot in the bitmap + * of the target @lut + * Allocate bitmap chunk when first used + * XXX algo could be improved if this routine limits performance + */ +static int tgt_find_free_reply_slot(struct lu_target *lut) +{ + unsigned long *bmp; + int chunk = 0; + int rc; + int b; + + for (chunk = 0; chunk < LUT_REPLY_SLOTS_MAX_CHUNKS; chunk++) { + /* allocate the bitmap chunk if necessary */ + if (unlikely(lut->lut_reply_bitmap[chunk] == NULL)) { + rc = tgt_bitmap_chunk_alloc(lut, chunk); + if (rc != 0) + return rc; + } + bmp = lut->lut_reply_bitmap[chunk]; + + /* look for an available slot in this chunk */ + do { + b = find_first_zero_bit(bmp, LUT_REPLY_SLOTS_PER_CHUNK); + if (b >= LUT_REPLY_SLOTS_PER_CHUNK) + break; + + /* found one */ + if (test_and_set_bit(b, bmp) == 0) + return chunk * LUT_REPLY_SLOTS_PER_CHUNK + b; + } while (true); + } + + return -ENOSPC; +} + +/* Mark the reply data slot @idx 'used' in the corresponding bitmap chunk + * of the target @lut + * Allocate the bitmap chunk if necessary + */ +static int tgt_set_reply_slot(struct lu_target *lut, int idx) +{ + int chunk; + int b; + int rc; + + chunk = idx / LUT_REPLY_SLOTS_PER_CHUNK; + b = idx % LUT_REPLY_SLOTS_PER_CHUNK; + + LASSERT(chunk < LUT_REPLY_SLOTS_MAX_CHUNKS); + LASSERT(b < LUT_REPLY_SLOTS_PER_CHUNK); + + /* allocate the bitmap chunk if necessary */ + if (unlikely(lut->lut_reply_bitmap[chunk] == NULL)) { + rc = tgt_bitmap_chunk_alloc(lut, chunk); + if (rc != 0) + return rc; + } + + /* mark the slot 'used' in this chunk */ + if (test_and_set_bit(b, lut->lut_reply_bitmap[chunk]) != 0) { + CERROR("%s: slot %d already set in bitmap\n", + tgt_name(lut), idx); + return -EALREADY; + } + + return 0; +} + + +/* Mark the reply data slot @idx 'unused' in the corresponding bitmap chunk + * of the target @lut + */ +static int tgt_clear_reply_slot(struct lu_target *lut, int idx) +{ + int chunk; + int b; + + if (lut->lut_obd->obd_stopping) + /* + * in case of failover keep the bit set in order to + * avoid overwriting slots in reply_data which might + * be required by resent rpcs + */ + return 0; + chunk = idx / LUT_REPLY_SLOTS_PER_CHUNK; + b = idx % LUT_REPLY_SLOTS_PER_CHUNK; + + LASSERT(chunk < LUT_REPLY_SLOTS_MAX_CHUNKS); + LASSERT(b < LUT_REPLY_SLOTS_PER_CHUNK); + + if (lut->lut_reply_bitmap[chunk] == NULL) { + CERROR("%s: slot %d not allocated\n", + tgt_name(lut), idx); + return -ENOENT; + } + + if (test_and_clear_bit(b, lut->lut_reply_bitmap[chunk]) == 0) { + CERROR("%s: slot %d already clear in bitmap\n", + 
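Reply slots in tgt_lastrcvd.c are tracked in a chunked bitmap: a chunk's bitmap is allocated on first use, and a slot is claimed by scanning for a zero bit and then setting it with test_and_set_bit(), retrying if another thread wins the race. The scan-and-claim loop is sketched below with a plain byte-per-slot array, so the kernel's atomic bit operation is only noted in comments; the chunk sizes and names are illustrative.

#include <stdio.h>
#include <string.h>

#define SLOTS_PER_CHUNK 64
#define MAX_CHUNKS      4

static unsigned char bitmap[MAX_CHUNKS][SLOTS_PER_CHUNK]; /* 0 = free, 1 = used */

/* Non-atomic stand-in for test_and_set_bit(): returns the old value. */
static int test_and_set_slot(unsigned char *bm, int b)
{
    int old = bm[b];

    bm[b] = 1;
    return old;
}

/* Find and claim a free slot; return its global index or -1 if full. */
static int find_free_reply_slot(void)
{
    int chunk, b;

    for (chunk = 0; chunk < MAX_CHUNKS; chunk++)
        for (b = 0; b < SLOTS_PER_CHUNK; b++)
            /* In the kernel the set is atomic, so losing a race simply
             * means moving on to the next zero bit. */
            if (test_and_set_slot(bitmap[chunk], b) == 0)
                return chunk * SLOTS_PER_CHUNK + b;
    return -1;                                /* no slot: -ENOSPC */
}

int main(void)
{
    memset(bitmap, 0, sizeof(bitmap));
    bitmap[0][0] = 1;                         /* slot 0 already in use */
    printf("claimed slot %d\n", find_free_reply_slot());   /* 1 */
    printf("claimed slot %d\n", find_free_reply_slot());   /* 2 */
    return 0;
}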
tgt_name(lut), idx); + return -EALREADY; + } + + return 0; +} + + +/* Read header of reply_data file of target @tgt into structure @lrh */ +static int tgt_reply_header_read(const struct lu_env *env, + struct lu_target *tgt, + struct lsd_reply_header *lrh) +{ + int rc; + struct lsd_reply_header buf; + struct tgt_thread_info *tti = tgt_th_info(env); + + tti->tti_off = 0; + tti->tti_buf.lb_buf = &buf; + tti->tti_buf.lb_len = sizeof(buf); + + rc = dt_record_read(env, tgt->lut_reply_data, &tti->tti_buf, + &tti->tti_off); + if (rc != 0) + return rc; + + lrh->lrh_magic = le32_to_cpu(buf.lrh_magic); + lrh->lrh_header_size = le32_to_cpu(buf.lrh_header_size); + lrh->lrh_reply_size = le32_to_cpu(buf.lrh_reply_size); + + CDEBUG(D_HA, "%s: read %s header. magic=0x%08x " + "header_size=%d reply_size=%d\n", + tgt->lut_obd->obd_name, REPLY_DATA, + lrh->lrh_magic, lrh->lrh_header_size, lrh->lrh_reply_size); + + return 0; +} + +/* Write header into replay_data file of target @tgt from structure @lrh */ +static int tgt_reply_header_write(const struct lu_env *env, + struct lu_target *tgt, + struct lsd_reply_header *lrh) +{ + int rc; + struct lsd_reply_header buf; + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle *th; + struct dt_object *dto; + + CDEBUG(D_HA, "%s: write %s header. magic=0x%08x " + "header_size=%d reply_size=%d\n", + tgt->lut_obd->obd_name, REPLY_DATA, + lrh->lrh_magic, lrh->lrh_header_size, lrh->lrh_reply_size); + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + buf.lrh_magic = cpu_to_le32(lrh->lrh_magic); + buf.lrh_header_size = cpu_to_le32(lrh->lrh_header_size); + buf.lrh_reply_size = cpu_to_le32(lrh->lrh_reply_size); + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + return PTR_ERR(th); + th->th_sync = 1; + + tti->tti_off = 0; + tti->tti_buf.lb_buf = &buf; + tti->tti_buf.lb_len = sizeof(buf); + + rc = dt_declare_record_write(env, tgt->lut_reply_data, + &tti->tti_buf, tti->tti_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start(env, tgt->lut_bottom, th); + if (rc) + GOTO(out, rc); + + dto = dt_object_locate(tgt->lut_reply_data, th->th_dev); + rc = dt_record_write(env, dto, &tti->tti_buf, &tti->tti_off, th); +out: + dt_trans_stop(env, tgt->lut_bottom, th); + return rc; +} + +/* Write the reply data @lrd into reply_data file of target @tgt + * at offset @off + */ +static int tgt_reply_data_write(const struct lu_env *env, struct lu_target *tgt, + struct lsd_reply_data *lrd, loff_t off, + struct thandle *th) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + struct lsd_reply_data *buf = &tti->tti_lrd; + + lrd->lrd_result = ptlrpc_status_hton(lrd->lrd_result); + + buf->lrd_transno = cpu_to_le64(lrd->lrd_transno); + buf->lrd_xid = cpu_to_le64(lrd->lrd_xid); + buf->lrd_data = cpu_to_le64(lrd->lrd_data); + buf->lrd_result = cpu_to_le32(lrd->lrd_result); + buf->lrd_client_gen = cpu_to_le32(lrd->lrd_client_gen); + + lrd->lrd_result = ptlrpc_status_ntoh(lrd->lrd_result); + + tti->tti_off = off; + tti->tti_buf.lb_buf = buf; + tti->tti_buf.lb_len = sizeof(*buf); + + dto = dt_object_locate(tgt->lut_reply_data, th->th_dev); + return dt_record_write(env, dto, &tti->tti_buf, &tti->tti_off, th); +} + +/* Read the reply data from reply_data file of target @tgt at offset @off + * into structure @lrd + */ +static int tgt_reply_data_read(const struct lu_env *env, struct lu_target *tgt, + struct lsd_reply_data *lrd, loff_t off) +{ + int rc; + struct tgt_thread_info *tti = tgt_th_info(env); + struct lsd_reply_data *buf = &tti->tti_lrd; + + 
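tgt_reply_data_write() and tgt_reply_data_read() keep the on-disk lsd_reply_data in little-endian form and convert on every access, so the file stays portable across CPU endianness. A minimal sketch of that round trip using explicit byte shifts in place of cpu_to_le64()/le64_to_cpu(); put_le64/get_le64 are illustrative helpers, not kernel APIs.

#include <stdio.h>
#include <stdint.h>

/* Store v in buf in little-endian byte order, regardless of host CPU. */
static void put_le64(unsigned char *buf, uint64_t v)
{
    int i;

    for (i = 0; i < 8; i++)
        buf[i] = (unsigned char)(v >> (8 * i));
}

static uint64_t get_le64(const unsigned char *buf)
{
    uint64_t v = 0;
    int i;

    for (i = 0; i < 8; i++)
        v |= (uint64_t)buf[i] << (8 * i);
    return v;
}

int main(void)
{
    /* transno/xid pair as it would be staged before dt_record_write(). */
    unsigned char disk[16];
    uint64_t transno = 0x1122334455667788ULL, xid = 424242;

    put_le64(disk, transno);              /* cpu_to_le64() equivalent */
    put_le64(disk + 8, xid);

    printf("round trip ok: %d\n",
           get_le64(disk) == transno && get_le64(disk + 8) == xid);
    return 0;
}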
tti->tti_off = off; + tti->tti_buf.lb_buf = buf; + tti->tti_buf.lb_len = sizeof(*buf); + + rc = dt_record_read(env, tgt->lut_reply_data, &tti->tti_buf, + &tti->tti_off); + if (rc != 0) + return rc; + + lrd->lrd_transno = le64_to_cpu(buf->lrd_transno); + lrd->lrd_xid = le64_to_cpu(buf->lrd_xid); + lrd->lrd_data = le64_to_cpu(buf->lrd_data); + lrd->lrd_result = le32_to_cpu(buf->lrd_result); + lrd->lrd_client_gen = le32_to_cpu(buf->lrd_client_gen); + + return 0; +} + + +/* Free the in-memory reply data structure @trd and release + * the corresponding slot in the reply_data file of target @lut + * Called with ted_lcd_lock held + */ +static void tgt_free_reply_data(struct lu_target *lut, + struct tg_export_data *ted, + struct tg_reply_data *trd) +{ + CDEBUG(D_TRACE, "%s: free reply data %p: xid %llu, transno %llu, " + "client gen %u, slot idx %d\n", + lut == NULL ? "" : tgt_name(lut), trd, trd->trd_reply.lrd_xid, + trd->trd_reply.lrd_transno, trd->trd_reply.lrd_client_gen, + trd->trd_index); + + LASSERT(mutex_is_locked(&ted->ted_lcd_lock)); + + list_del(&trd->trd_list); + ted->ted_reply_cnt--; + if (lut != NULL && trd->trd_index != TRD_INDEX_MEMORY) + tgt_clear_reply_slot(lut, trd->trd_index); + OBD_FREE_PTR(trd); +} + +/* Release the reply data @trd from target @lut + * The reply data with the highest transno for this export + * is retained to ensure correctness of target recovery + * Called with ted_lcd_lock held + */ +static void tgt_release_reply_data(struct lu_target *lut, + struct tg_export_data *ted, + struct tg_reply_data *trd) +{ + CDEBUG(D_TRACE, "%s: release reply data %p: xid %llu, transno %llu, " + "client gen %u, slot idx %d\n", + lut == NULL ? "" : tgt_name(lut), trd, trd->trd_reply.lrd_xid, + trd->trd_reply.lrd_transno, trd->trd_reply.lrd_client_gen, + trd->trd_index); + + LASSERT(mutex_is_locked(&ted->ted_lcd_lock)); + + /* Do not free the reply data corresponding to the + * highest transno of this export. + * This ensures on-disk reply data is kept and + * last committed transno can be restored from disk in case + * of target recovery + */ + if (trd->trd_reply.lrd_transno == ted->ted_lcd->lcd_last_transno) { + /* free previous retained reply */ + if (ted->ted_reply_last != NULL) + tgt_free_reply_data(lut, ted, ted->ted_reply_last); + /* retain the reply */ + list_del_init(&trd->trd_list); + ted->ted_reply_last = trd; + } else { + tgt_free_reply_data(lut, ted, trd); + } +} + +static inline struct lu_buf *tti_buf_lsd(struct tgt_thread_info *tti) +{ + tti->tti_buf.lb_buf = &tti->tti_lsd; + tti->tti_buf.lb_len = sizeof(tti->tti_lsd); + return &tti->tti_buf; +} + +static inline struct lu_buf *tti_buf_lcd(struct tgt_thread_info *tti) +{ + tti->tti_buf.lb_buf = &tti->tti_lcd; + tti->tti_buf.lb_len = sizeof(tti->tti_lcd); + return &tti->tti_buf; +} + +static inline bool tgt_is_multimodrpcs_record(struct lu_target *tgt, + struct lsd_client_data *lcd) +{ + return tgt->lut_lsd.lsd_feature_incompat & OBD_INCOMPAT_MULTI_RPCS && + lcd->lcd_generation != 0; +} + +/** + * Allocate in-memory data for client slot related to export. 
+ */ +int tgt_client_alloc(struct obd_export *exp) +{ + ENTRY; + LASSERT(exp != exp->exp_obd->obd_self_export); + + spin_lock_init(&exp->exp_target_data.ted_nodemap_lock); + INIT_LIST_HEAD(&exp->exp_target_data.ted_nodemap_member); + spin_lock_init(&exp->exp_target_data.ted_fmd_lock); + INIT_LIST_HEAD(&exp->exp_target_data.ted_fmd_list); + + OBD_ALLOC_PTR(exp->exp_target_data.ted_lcd); + if (exp->exp_target_data.ted_lcd == NULL) + RETURN(-ENOMEM); + /* Mark that slot is not yet valid, 0 doesn't work here */ + exp->exp_target_data.ted_lr_idx = -1; + INIT_LIST_HEAD(&exp->exp_target_data.ted_reply_list); + mutex_init(&exp->exp_target_data.ted_lcd_lock); + RETURN(0); +} +EXPORT_SYMBOL(tgt_client_alloc); + +/** + * Free in-memory data for client slot related to export. + */ +void tgt_client_free(struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = class_exp2tgt(exp); + struct tg_reply_data *trd, *tmp; + + LASSERT(exp != exp->exp_obd->obd_self_export); + + tgt_fmd_cleanup(exp); + + /* free reply data */ + mutex_lock(&ted->ted_lcd_lock); + list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { + tgt_release_reply_data(lut, ted, trd); + } + if (ted->ted_reply_last != NULL) { + tgt_free_reply_data(lut, ted, ted->ted_reply_last); + ted->ted_reply_last = NULL; + } + mutex_unlock(&ted->ted_lcd_lock); + + if (!hlist_unhashed(&exp->exp_gen_hash)) + cfs_hash_del(exp->exp_obd->obd_gen_hash, + &ted->ted_lcd->lcd_generation, + &exp->exp_gen_hash); + + OBD_FREE_PTR(ted->ted_lcd); + ted->ted_lcd = NULL; + + /* Target may have been freed (see LU-7430) + * Slot may be not yet assigned */ + if (exp->exp_obd->u.obt.obt_magic != OBT_MAGIC || + ted->ted_lr_idx < 0) + return; + + /* Clear bit when lcd is freed */ + LASSERT(lut && lut->lut_client_bitmap); + if (!test_and_clear_bit(ted->ted_lr_idx, lut->lut_client_bitmap)) { + CERROR("%s: client %u bit already clear in bitmap\n", + exp->exp_obd->obd_name, ted->ted_lr_idx); + LBUG(); + } +} +EXPORT_SYMBOL(tgt_client_free); + +static inline void tgt_check_lcd(const char *obd_name, int index, + struct lsd_client_data *lcd) +{ + size_t uuid_size = sizeof(lcd->lcd_uuid); + + if (strnlen((char*)lcd->lcd_uuid, uuid_size) == uuid_size) { + lcd->lcd_uuid[uuid_size - 1] = '\0'; + + LCONSOLE_ERROR("the client UUID (%s) on %s for exports stored in last_rcvd(index = %d) is bad!\n", + lcd->lcd_uuid, obd_name, index); + } +} + +static int tgt_client_data_read(const struct lu_env *env, struct lu_target *tgt, + struct lsd_client_data *lcd, + loff_t *off, int index) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + int rc; + + tti_buf_lcd(tti); + rc = dt_record_read(env, tgt->lut_last_rcvd, &tti->tti_buf, off); + if (rc == 0) { + tgt_check_lcd(tgt->lut_obd->obd_name, index, &tti->tti_lcd); + lcd_le_to_cpu(&tti->tti_lcd, lcd); + lcd->lcd_last_result = ptlrpc_status_ntoh(lcd->lcd_last_result); + lcd->lcd_last_close_result = + ptlrpc_status_ntoh(lcd->lcd_last_close_result); + } + + CDEBUG(D_INFO, "%s: read lcd @%lld uuid = %s, last_transno = %llu" + ", last_xid = %llu, last_result = %u, last_data = %u, " + "last_close_transno = %llu, last_close_xid = %llu, " + "last_close_result = %u, rc = %d\n", tgt->lut_obd->obd_name, + *off, lcd->lcd_uuid, lcd->lcd_last_transno, lcd->lcd_last_xid, + lcd->lcd_last_result, lcd->lcd_last_data, + lcd->lcd_last_close_transno, lcd->lcd_last_close_xid, + lcd->lcd_last_close_result, rc); + return rc; +} + +static int tgt_client_data_write(const struct lu_env *env, + struct lu_target *tgt, 
+ struct lsd_client_data *lcd, + loff_t *off, struct thandle *th) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + + lcd->lcd_last_result = ptlrpc_status_hton(lcd->lcd_last_result); + lcd->lcd_last_close_result = + ptlrpc_status_hton(lcd->lcd_last_close_result); + lcd_cpu_to_le(lcd, &tti->tti_lcd); + tti_buf_lcd(tti); + + dto = dt_object_locate(tgt->lut_last_rcvd, th->th_dev); + return dt_record_write(env, dto, &tti->tti_buf, off, th); +} + +struct tgt_new_client_callback { + struct dt_txn_commit_cb lncc_cb; + struct obd_export *lncc_exp; +}; + +static void tgt_cb_new_client(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct tgt_new_client_callback *ccb; + + ccb = container_of(cb, struct tgt_new_client_callback, lncc_cb); + + LASSERT(ccb->lncc_exp->exp_obd); + + CDEBUG(D_RPCTRACE, "%s: committing for initial connect of %s\n", + ccb->lncc_exp->exp_obd->obd_name, + ccb->lncc_exp->exp_client_uuid.uuid); + + spin_lock(&ccb->lncc_exp->exp_lock); + + ccb->lncc_exp->exp_need_sync = 0; + + spin_unlock(&ccb->lncc_exp->exp_lock); + class_export_cb_put(ccb->lncc_exp); + + OBD_FREE_PTR(ccb); +} + +int tgt_new_client_cb_add(struct thandle *th, struct obd_export *exp) +{ + struct tgt_new_client_callback *ccb; + struct dt_txn_commit_cb *dcb; + int rc; + + OBD_ALLOC_PTR(ccb); + if (ccb == NULL) + return -ENOMEM; + + ccb->lncc_exp = class_export_cb_get(exp); + + dcb = &ccb->lncc_cb; + dcb->dcb_func = tgt_cb_new_client; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "tgt_cb_new_client", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) { + class_export_cb_put(exp); + OBD_FREE_PTR(ccb); + } + return rc; +} + +/** + * Update client data in last_rcvd + */ +static int tgt_client_data_update(const struct lu_env *env, + struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle *th; + int rc = 0; + + ENTRY; + + if (unlikely(tgt == NULL)) { + CDEBUG(D_ERROR, "%s: No target for connected export\n", + class_exp2obd(exp)->obd_name); + RETURN(-EINVAL); + } + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + tti_buf_lcd(tti); + rc = dt_declare_record_write(env, tgt->lut_last_rcvd, + &tti->tti_buf, + ted->ted_lr_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start_local(env, tgt->lut_bottom, th); + if (rc) + GOTO(out, rc); + + mutex_lock(&ted->ted_lcd_lock); + + /* + * Until this operations will be committed the sync is needed + * for this export. This should be done _after_ starting the + * transaction so that many connecting clients will not bring + * server down with lots of sync writes. 
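The code that follows implements a pattern that recurs throughout this file: try to attach a commit callback so the expensive synchronous write can be skipped, and fall back to a sync transaction only when the callback cannot be registered. A minimal standalone sketch of that decision is below; the types and the callback-registration helper are stand-ins, not the Lustre dt_txn_commit_cb API.

#include <stdbool.h>
#include <stdio.h>

struct toy_txn {
        bool sync;                      /* force a synchronous commit */
};

/* stand-in for a commit-callback registration; 0 means the callback is queued */
static int toy_add_commit_cb(struct toy_txn *txn, void (*cb)(void *), void *arg)
{
        (void)txn;
        (void)cb;
        (void)arg;
        return -1;                      /* pretend the callback cannot be added */
}

static void toy_on_commit(void *arg)
{
        printf("committed: %s\n", (const char *)arg);
}

int main(void)
{
        struct toy_txn txn = { .sync = false };

        if (toy_add_commit_cb(&txn, toy_on_commit, "new client slot") != 0)
                txn.sync = true;        /* no callback: pay for a sync write now */

        printf("sync forced: %s\n", txn.sync ? "yes" : "no");
        return 0;
}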
+ */ + rc = tgt_new_client_cb_add(th, exp); + if (rc) { + /* can't add callback, do sync now */ + th->th_sync = 1; + } else { + spin_lock(&exp->exp_lock); + exp->exp_need_sync = 1; + spin_unlock(&exp->exp_lock); + } + + tti->tti_off = ted->ted_lr_off; + rc = tgt_client_data_write(env, tgt, ted->ted_lcd, &tti->tti_off, th); + + mutex_unlock(&ted->ted_lcd_lock); + + EXIT; +out: + dt_trans_stop(env, tgt->lut_bottom, th); + CDEBUG(D_INFO, "%s: update last_rcvd client data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + + return rc; +} + +static int tgt_server_data_read(const struct lu_env *env, struct lu_target *tgt) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + int rc; + + tti->tti_off = 0; + tti_buf_lsd(tti); + rc = dt_record_read(env, tgt->lut_last_rcvd, &tti->tti_buf, + &tti->tti_off); + if (rc == 0) + lsd_le_to_cpu(&tti->tti_lsd, &tgt->lut_lsd); + + CDEBUG(D_INFO, "%s: read last_rcvd server data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + return rc; +} + +static int tgt_server_data_write(const struct lu_env *env, + struct lu_target *tgt, struct thandle *th) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + int rc; + + ENTRY; + + tti->tti_off = 0; + tti_buf_lsd(tti); + lsd_cpu_to_le(&tgt->lut_lsd, &tti->tti_lsd); + + dto = dt_object_locate(tgt->lut_last_rcvd, th->th_dev); + rc = dt_record_write(env, dto, &tti->tti_buf, &tti->tti_off, th); + + CDEBUG(D_INFO, "%s: write last_rcvd server data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + + RETURN(rc); +} + +/** + * Update server data in last_rcvd + */ +int tgt_server_data_update(const struct lu_env *env, struct lu_target *tgt, + int sync) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle *th; + int rc = 0; + + ENTRY; + + CDEBUG(D_SUPER, + "%s: mount_count is %llu, last_transno is %llu\n", + tgt->lut_lsd.lsd_uuid, tgt->lut_obd->u.obt.obt_mount_count, + tgt->lut_last_transno); + + /* Always save latest transno to keep it fresh */ + spin_lock(&tgt->lut_translock); + tgt->lut_lsd.lsd_last_transno = tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + th->th_sync = sync; + + tti_buf_lsd(tti); + rc = dt_declare_record_write(env, tgt->lut_last_rcvd, + &tti->tti_buf, tti->tti_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start(env, tgt->lut_bottom, th); + if (rc) + GOTO(out, rc); + + rc = tgt_server_data_write(env, tgt, th); +out: + dt_trans_stop(env, tgt->lut_bottom, th); + + CDEBUG(D_INFO, "%s: update last_rcvd server data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_server_data_update); + +static int tgt_truncate_last_rcvd(const struct lu_env *env, + struct lu_target *tgt, loff_t size) +{ + struct dt_object *dt = tgt->lut_last_rcvd; + struct thandle *th; + struct lu_attr attr; + int rc; + + ENTRY; + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + attr.la_size = size; + attr.la_valid = LA_SIZE; + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + rc = dt_declare_punch(env, dt, size, 
OBD_OBJECT_EOF, th); + if (rc) + GOTO(cleanup, rc); + rc = dt_declare_attr_set(env, dt, &attr, th); + if (rc) + GOTO(cleanup, rc); + rc = dt_trans_start_local(env, tgt->lut_bottom, th); + if (rc) + GOTO(cleanup, rc); + + rc = dt_punch(env, dt, size, OBD_OBJECT_EOF, th); + if (rc == 0) + rc = dt_attr_set(env, dt, &attr, th); + +cleanup: + dt_trans_stop(env, tgt->lut_bottom, th); + + RETURN(rc); +} + +static void tgt_client_epoch_update(const struct lu_env *env, + struct obd_export *exp) +{ + struct lsd_client_data *lcd = exp->exp_target_data.ted_lcd; + struct lu_target *tgt = class_exp2tgt(exp); + + LASSERT(tgt && tgt->lut_bottom); + /** VBR: set client last_epoch to current epoch */ + if (lcd->lcd_last_epoch >= tgt->lut_lsd.lsd_start_epoch) + return; + lcd->lcd_last_epoch = tgt->lut_lsd.lsd_start_epoch; + tgt_client_data_update(env, exp); +} + +/** + * Update boot epoch when recovery ends + */ +void tgt_boot_epoch_update(struct lu_target *tgt) +{ + struct lu_env env; + struct ptlrpc_request *req; + __u32 start_epoch; + LIST_HEAD(client_list); + int rc; + + if (tgt->lut_obd->obd_stopping) + return; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc) { + CERROR("%s: can't initialize environment: rc = %d\n", + tgt->lut_obd->obd_name, rc); + return; + } + + spin_lock(&tgt->lut_translock); + start_epoch = (tgt->lut_last_transno >> LR_EPOCH_BITS) + 1; + tgt->lut_last_transno = (__u64)start_epoch << LR_EPOCH_BITS; + tgt->lut_lsd.lsd_start_epoch = start_epoch; + spin_unlock(&tgt->lut_translock); + + /** + * The recovery is not yet finished and final queue can still be updated + * with resend requests. Move final list to separate one for processing + */ + spin_lock(&tgt->lut_obd->obd_recovery_task_lock); + list_splice_init(&tgt->lut_obd->obd_final_req_queue, &client_list); + spin_unlock(&tgt->lut_obd->obd_recovery_task_lock); + + /** + * go through list of exports participated in recovery and + * set new epoch for them + */ + list_for_each_entry(req, &client_list, rq_list) { + LASSERT(!req->rq_export->exp_delayed); + if (!req->rq_export->exp_vbr_failed) + tgt_client_epoch_update(&env, req->rq_export); + } + /** return list back at once */ + spin_lock(&tgt->lut_obd->obd_recovery_task_lock); + list_splice_init(&client_list, &tgt->lut_obd->obd_final_req_queue); + spin_unlock(&tgt->lut_obd->obd_recovery_task_lock); + + /** + * Clear MULTI RPCS incompatibility flag if there is no multi-rpcs + * client in last_rcvd file + */ + if (atomic_read(&tgt->lut_num_clients) == 0) + tgt->lut_lsd.lsd_feature_incompat &= ~OBD_INCOMPAT_MULTI_RPCS; + + /** update server epoch */ + tgt_server_data_update(&env, tgt, 1); + lu_env_fini(&env); +} + +/** + * commit callback, need to update last_committed value + */ +struct tgt_last_committed_callback { + struct dt_txn_commit_cb llcc_cb; + struct lu_target *llcc_tgt; + struct obd_export *llcc_exp; + __u64 llcc_transno; +}; + +static void tgt_cb_last_committed(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct tgt_last_committed_callback *ccb; + + ccb = container_of(cb, struct tgt_last_committed_callback, llcc_cb); + + LASSERT(ccb->llcc_exp); + LASSERT(ccb->llcc_tgt != NULL); + LASSERT(ccb->llcc_exp->exp_obd == ccb->llcc_tgt->lut_obd); + + if (th->th_reserved_quota.qrr_count > 0) { + struct lu_env temp_env; + int rc; + + CDEBUG(D_QUOTA, "free quota %llu %llu\n", + th->th_reserved_quota.qrr_id.qid_gid, + th->th_reserved_quota.qrr_count); + + rc = lu_env_init(&temp_env, LCT_DT_THREAD); + if (rc) { + CERROR("%s: can't initialize 
environment: rc = %d\n", + ccb->llcc_tgt->lut_obd->obd_name, rc); + goto out; + } + + dt_reserve_or_free_quota(&temp_env, th->th_dev, + th->th_reserved_quota.qrr_type, + th->th_reserved_quota.qrr_id.qid_uid, + th->th_reserved_quota.qrr_id.qid_gid, + -th->th_reserved_quota.qrr_count, + false); + lu_env_fini(&temp_env); + } + + /* error hit, don't update last committed to provide chance to + * replay data after fail */ + if (err != 0) + goto out; + + /* Fast path w/o spinlock, if exp_last_committed was updated + * with higher transno, no need to take spinlock and check, + * also no need to update obd_last_committed. */ + if (ccb->llcc_transno <= ccb->llcc_exp->exp_last_committed) + goto out; + spin_lock(&ccb->llcc_tgt->lut_translock); + if (ccb->llcc_transno > ccb->llcc_tgt->lut_obd->obd_last_committed) + ccb->llcc_tgt->lut_obd->obd_last_committed = ccb->llcc_transno; + + if (ccb->llcc_transno > ccb->llcc_exp->exp_last_committed) { + ccb->llcc_exp->exp_last_committed = ccb->llcc_transno; + spin_unlock(&ccb->llcc_tgt->lut_translock); + + ptlrpc_commit_replies(ccb->llcc_exp); + tgt_cancel_slc_locks(ccb->llcc_tgt, ccb->llcc_transno); + } else { + spin_unlock(&ccb->llcc_tgt->lut_translock); + } + + CDEBUG(D_HA, "%s: transno %lld is committed\n", + ccb->llcc_tgt->lut_obd->obd_name, ccb->llcc_transno); + +out: + class_export_cb_put(ccb->llcc_exp); + OBD_FREE_PTR(ccb); +} + +/** + * Add commit callback function, it returns a non-zero value to inform + * caller to use sync transaction if necessary. + */ +static int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *tgt, + struct obd_export *exp, __u64 transno) +{ + struct tgt_last_committed_callback *ccb; + struct dt_txn_commit_cb *dcb; + int rc; + + OBD_ALLOC_PTR(ccb); + if (ccb == NULL) + return -ENOMEM; + + ccb->llcc_tgt = tgt; + ccb->llcc_exp = class_export_cb_get(exp); + ccb->llcc_transno = transno; + + dcb = &ccb->llcc_cb; + dcb->dcb_func = tgt_cb_last_committed; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "tgt_cb_last_committed", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) { + class_export_cb_put(exp); + OBD_FREE_PTR(ccb); + } + + if (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + /* report failure to force synchronous operation */ + return -EPERM; + + /* if exp_need_sync is set, return non-zero value to force + * a sync transaction. */ + return rc ? rc : exp->exp_need_sync; +} + +static int tgt_is_local_client(const struct lu_env *env, + struct obd_export *exp) +{ + struct lu_target *tgt = class_exp2tgt(exp); + struct tgt_session_info *tsi = tgt_ses_info(env); + struct ptlrpc_request *req = tgt_ses_req(tsi); + + if (exp_connect_flags(exp) & OBD_CONNECT_MDS || + exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) + return 0; + if (tgt->lut_local_recovery) + return 0; + if (!req) + return 0; + if (!LNetIsPeerLocal(req->rq_peer.nid)) + return 0; + + return 1; +} + +/** + * Add new client to the last_rcvd upon new connection. + * + * We use a bitmap to locate a free space in the last_rcvd file and initialize + * tg_export_data. 
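As a rough illustration of that bitmap search plus the per-slot offset arithmetic used in tgt_client_new() below, here is a self-contained, single-threaded sketch. The real code relies on the atomic test_and_set_bit() retry loop, and the sizes here are made-up placeholders rather than the actual last_rcvd layout constants.

#include <stdint.h>
#include <stdio.h>

#define TOY_MAX_CLIENTS  128
#define TOY_CLIENT_START 8192u          /* assumed offset of the first client record */
#define TOY_CLIENT_SIZE  128u           /* assumed size of one client record */

static unsigned char toy_bitmap[TOY_MAX_CLIENTS / 8];

/* find and claim the first clear bit; not atomic, unlike test_and_set_bit() */
static int toy_claim_slot(void)
{
        int i;

        for (i = 0; i < TOY_MAX_CLIENTS; i++) {
                unsigned char mask = 1u << (i % 8);

                if (!(toy_bitmap[i / 8] & mask)) {
                        toy_bitmap[i / 8] |= mask;
                        return i;
                }
        }
        return -1;                      /* no free slot, like -EOVERFLOW above */
}

int main(void)
{
        int idx = toy_claim_slot();
        uint64_t off;

        if (idx < 0)
                return 1;
        /* same shape as lsd_client_start + idx * lsd_client_size */
        off = TOY_CLIENT_START + (uint64_t)idx * TOY_CLIENT_SIZE;
        printf("claimed slot %d at offset %llu\n", idx, (unsigned long long)off);
        return 0;
}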
+ */ +int tgt_client_new(const struct lu_env *env, struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + int rc = 0, idx; + + ENTRY; + + LASSERT(tgt && tgt->lut_client_bitmap != NULL); + if (!strcmp(ted->ted_lcd->lcd_uuid, tgt->lut_obd->obd_uuid.uuid)) + RETURN(0); + + if (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + RETURN(0); + + if (tgt_is_local_client(env, exp)) { + LCONSOLE_WARN("%s: local client %s w/o recovery\n", + exp->exp_obd->obd_name, ted->ted_lcd->lcd_uuid); + exp->exp_no_recovery = 1; + RETURN(0); + } + + /* the bitmap operations can handle cl_idx > sizeof(long) * 8, so + * there's no need for extra complication here + */ + idx = find_first_zero_bit(tgt->lut_client_bitmap, LR_MAX_CLIENTS); +repeat: + if (idx >= LR_MAX_CLIENTS || + OBD_FAIL_CHECK(OBD_FAIL_MDS_CLIENT_ADD)) { + CERROR("%s: no room for %u clients - fix LR_MAX_CLIENTS\n", + tgt->lut_obd->obd_name, idx); + RETURN(-EOVERFLOW); + } + if (test_and_set_bit(idx, tgt->lut_client_bitmap)) { + idx = find_next_zero_bit(tgt->lut_client_bitmap, + LR_MAX_CLIENTS, idx); + goto repeat; + } + + ted->ted_lr_idx = idx; + ted->ted_lr_off = tgt->lut_lsd.lsd_client_start + + idx * tgt->lut_lsd.lsd_client_size; + + LASSERTF(ted->ted_lr_off > 0, "ted_lr_off = %llu\n", ted->ted_lr_off); + + if (tgt_is_multimodrpcs_client(exp)) { + /* Set MULTI RPCS incompatibility flag to prevent previous + * Lustre versions to mount a target with reply_data file */ + if (!(tgt->lut_lsd.lsd_feature_incompat & + OBD_INCOMPAT_MULTI_RPCS)) { + tgt->lut_lsd.lsd_feature_incompat |= + OBD_INCOMPAT_MULTI_RPCS; + rc = tgt_server_data_update(env, tgt, 1); + if (rc < 0) { + CERROR("%s: unable to set MULTI RPCS " + "incompatibility flag\n", + exp->exp_obd->obd_name); + RETURN(rc); + } + } + + /* assign client slot generation */ + ted->ted_lcd->lcd_generation = + atomic_inc_return(&tgt->lut_client_generation); + } else { + ted->ted_lcd->lcd_generation = 0; + } + + CDEBUG(D_INFO, "%s: new client at index %d (%llu) with UUID '%s' " + "generation %d\n", + tgt->lut_obd->obd_name, ted->ted_lr_idx, ted->ted_lr_off, + ted->ted_lcd->lcd_uuid, ted->ted_lcd->lcd_generation); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_CLIENT_ADD)) + RETURN(-ENOSPC); + + rc = tgt_client_data_update(env, exp); + if (rc) { + CERROR("%s: Failed to write client lcd at idx %d, rc %d\n", + tgt->lut_obd->obd_name, idx, rc); + RETURN(rc); + } + + if (tgt_is_multimodrpcs_client(exp)) + atomic_inc(&tgt->lut_num_clients); + + RETURN(0); +} +EXPORT_SYMBOL(tgt_client_new); + +/* Add an existing client to the MDS in-memory state based on + * a client that was previously found in the last_rcvd file and + * already has an assigned slot (idx >= 0). + * + * It should not be possible to fail adding an existing client - otherwise + * mdt_init_server_data() callsite needs to be fixed. 
+ */ +int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int idx) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + + ENTRY; + + LASSERT(tgt && tgt->lut_client_bitmap != NULL); + LASSERTF(idx >= 0, "%d\n", idx); + + if (!strcmp(ted->ted_lcd->lcd_uuid, tgt->lut_obd->obd_uuid.uuid) || + exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + RETURN(0); + + if (test_and_set_bit(idx, tgt->lut_client_bitmap)) { + CERROR("%s: client %d: bit already set in bitmap!!\n", + tgt->lut_obd->obd_name, idx); + LBUG(); + } + + CDEBUG(D_INFO, "%s: client at idx %d with UUID '%s' added, " + "generation %d\n", + tgt->lut_obd->obd_name, idx, ted->ted_lcd->lcd_uuid, + ted->ted_lcd->lcd_generation); + + ted->ted_lr_idx = idx; + ted->ted_lr_off = tgt->lut_lsd.lsd_client_start + + idx * tgt->lut_lsd.lsd_client_size; + + mutex_init(&ted->ted_lcd_lock); + + LASSERTF(ted->ted_lr_off > 0, "ted_lr_off = %llu\n", ted->ted_lr_off); + + RETURN(0); +} + +int tgt_client_del(const struct lu_env *env, struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + int rc; + + ENTRY; + + LASSERT(ted->ted_lcd); + + if (unlikely(tgt == NULL)) { + CDEBUG(D_ERROR, "%s: No target for connected export\n", + class_exp2obd(exp)->obd_name); + RETURN(-EINVAL); + } + + /* XXX if lcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ + if (!strcmp((char *)ted->ted_lcd->lcd_uuid, + (char *)tgt->lut_obd->obd_uuid.uuid) || + exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT || + exp->exp_no_recovery) + RETURN(0); + + /* Slot may be not yet assigned, use case is race between Client + * reconnect and forced eviction */ + if (ted->ted_lr_idx < 0) { + CWARN("%s: client with UUID '%s' not in bitmap\n", + tgt->lut_obd->obd_name, ted->ted_lcd->lcd_uuid); + RETURN(0); + } + + CDEBUG(D_INFO, "%s: del client at idx %u, off %lld, UUID '%s'\n", + tgt->lut_obd->obd_name, ted->ted_lr_idx, ted->ted_lr_off, + ted->ted_lcd->lcd_uuid); + + /* Clear the bit _after_ zeroing out the client so we don't + race with filter_client_add and zero out new clients.*/ + if (!test_bit(ted->ted_lr_idx, tgt->lut_client_bitmap)) { + CERROR("%s: client %u: bit already clear in bitmap!!\n", + tgt->lut_obd->obd_name, ted->ted_lr_idx); + LBUG(); + } + + /* Do not erase record for recoverable client. */ + if (exp->exp_flags & OBD_OPT_FAILOVER) + RETURN(0); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_CLIENT_DEL)) + RETURN(0); + + /* Make sure the server's last_transno is up to date. + * This should be done before zeroing client slot so last_transno will + * be in server data or in client data in case of failure */ + rc = tgt_server_data_update(env, tgt, 0); + if (rc != 0) { + CERROR("%s: failed to update server data, skip client %s " + "zeroing, rc %d\n", tgt->lut_obd->obd_name, + ted->ted_lcd->lcd_uuid, rc); + RETURN(rc); + } + + /* Race between an eviction and a disconnection ?*/ + mutex_lock(&ted->ted_lcd_lock); + if (ted->ted_lcd->lcd_uuid[0] == '\0') { + mutex_unlock(&ted->ted_lcd_lock); + RETURN(rc); + } + + memset(ted->ted_lcd->lcd_uuid, 0, sizeof ted->ted_lcd->lcd_uuid); + mutex_unlock(&ted->ted_lcd_lock); + + rc = tgt_client_data_update(env, exp); + + if (!rc && tgt_is_multimodrpcs_record(tgt, ted->ted_lcd)) + atomic_dec(&tgt->lut_num_clients); + + CDEBUG(rc == 0 ? 
D_INFO : D_ERROR, + "%s: zeroing out client %s at idx %u (%llu), rc %d\n", + tgt->lut_obd->obd_name, ted->ted_lcd->lcd_uuid, + ted->ted_lr_idx, ted->ted_lr_off, rc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_client_del); + +static void tgt_clean_by_tag(struct obd_export *exp, __u64 xid, __u16 tag) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = class_exp2tgt(exp); + struct tg_reply_data *trd, *tmp; + + if (tag == 0) + return; + + list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { + if (trd->trd_tag != tag) + continue; + + LASSERT(ergo(tgt_is_increasing_xid_client(exp), + trd->trd_reply.lrd_xid <= xid)); + + ted->ted_release_tag++; + tgt_release_reply_data(lut, ted, trd); + } +} + +static int tgt_add_reply_data(const struct lu_env *env, struct lu_target *tgt, + struct tg_export_data *ted, struct tg_reply_data *trd, + struct ptlrpc_request *req, + struct thandle *th, bool update_lrd_file) +{ + struct lsd_reply_data *lrd; + int i; + int rc; + + lrd = &trd->trd_reply; + /* update export last transno */ + mutex_lock(&ted->ted_lcd_lock); + if (lrd->lrd_transno > ted->ted_lcd->lcd_last_transno) + ted->ted_lcd->lcd_last_transno = lrd->lrd_transno; + mutex_unlock(&ted->ted_lcd_lock); + + if (tgt != NULL) { + /* find a empty slot */ + i = tgt_find_free_reply_slot(tgt); + if (unlikely(i < 0)) { + CERROR("%s: couldn't find a slot for reply data: " + "rc = %d\n", tgt_name(tgt), i); + RETURN(i); + } + trd->trd_index = i; + + if (update_lrd_file) { + loff_t off; + + /* write reply data to disk */ + off = sizeof(struct lsd_reply_header) + sizeof(*lrd) * i; + rc = tgt_reply_data_write(env, tgt, lrd, off, th); + if (unlikely(rc != 0)) { + CERROR("%s: can't update %s file: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + GOTO(free_slot, rc); + } + } + } else { + trd->trd_index = TRD_INDEX_MEMORY; + } + + /* add reply data to target export's reply list */ + mutex_lock(&ted->ted_lcd_lock); + if (req != NULL) { + int exclude = tgt_is_increasing_xid_client(req->rq_export) ? 
+ MSG_REPLAY : MSG_REPLAY|MSG_RESENT; + + if (req->rq_obsolete) { + CDEBUG(D_INFO, + "drop reply data update for obsolete req xid=%llu," + "transno=%llu, tag=%hu\n", req->rq_xid, + lrd->lrd_transno, trd->trd_tag); + mutex_unlock(&ted->ted_lcd_lock); + GOTO(free_slot, rc = -EBADR); + } + + if (!(lustre_msg_get_flags(req->rq_reqmsg) & exclude)) + tgt_clean_by_tag(req->rq_export, req->rq_xid, + trd->trd_tag); + } + list_add(&trd->trd_list, &ted->ted_reply_list); + ted->ted_reply_cnt++; + if (ted->ted_reply_cnt > ted->ted_reply_max) + ted->ted_reply_max = ted->ted_reply_cnt; + mutex_unlock(&ted->ted_lcd_lock); + + CDEBUG(D_TRACE, "add reply %p: xid %llu, transno %llu, " + "tag %hu, client gen %u, slot idx %d\n", + trd, lrd->lrd_xid, lrd->lrd_transno, + trd->trd_tag, lrd->lrd_client_gen, trd->trd_index); + + RETURN(0); + +free_slot: + if (tgt != NULL) + tgt_clear_reply_slot(tgt, trd->trd_index); + return rc; +} + +int tgt_mk_reply_data(const struct lu_env *env, + struct lu_target *tgt, + struct tg_export_data *ted, + struct ptlrpc_request *req, + __u64 opdata, + struct thandle *th, + bool write_update, + __u64 transno) +{ + struct tg_reply_data *trd; + struct lsd_reply_data *lrd; + __u64 *pre_versions = NULL; + int rc; + struct tgt_session_info *tsi = NULL; + + OBD_ALLOC_PTR(trd); + if (unlikely(trd == NULL)) + RETURN(-ENOMEM); + + if (env != NULL) + tsi = tgt_ses_info(env); + + /* fill reply data information */ + lrd = &trd->trd_reply; + lrd->lrd_transno = transno; + if (req != NULL) { + lrd->lrd_xid = req->rq_xid; + trd->trd_tag = lustre_msg_get_tag(req->rq_reqmsg); + lrd->lrd_client_gen = ted->ted_lcd->lcd_generation; + if (write_update) { + pre_versions = lustre_msg_get_versions(req->rq_repmsg); + lrd->lrd_result = th->th_result; + } + } else { + LASSERT(env != NULL); + LASSERT(tsi->tsi_xid != 0); + + lrd->lrd_xid = tsi->tsi_xid; + lrd->lrd_result = tsi->tsi_result; + lrd->lrd_client_gen = tsi->tsi_client_gen; + } + + lrd->lrd_data = opdata; + if (pre_versions) { + trd->trd_pre_versions[0] = pre_versions[0]; + trd->trd_pre_versions[1] = pre_versions[1]; + trd->trd_pre_versions[2] = pre_versions[2]; + trd->trd_pre_versions[3] = pre_versions[3]; + } + + if (tsi && tsi->tsi_open_obj) + trd->trd_object = *lu_object_fid(&tsi->tsi_open_obj->do_lu); + + rc = tgt_add_reply_data(env, tgt, ted, trd, req, + th, write_update); + if (rc < 0) { + OBD_FREE_PTR(trd); + if (rc == -EBADR) + rc = 0; + } + return rc; + +} +EXPORT_SYMBOL(tgt_mk_reply_data); + +/* + * last_rcvd & last_committed update callbacks + */ +static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt, + struct dt_object *obj, __u64 opdata, + struct thandle *th, struct ptlrpc_request *req) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct tgt_session_info *tsi = tgt_ses_info(env); + struct obd_export *exp = tsi->tsi_exp; + struct tg_export_data *ted; + __u64 *transno_p; + bool nolcd = false; + int rc = 0; + + ENTRY; + + + LASSERT(exp != NULL); + ted = &exp->exp_target_data; + + /* Some clients don't support recovery, and they don't have last_rcvd + * client data: + * 1. lightweight clients. + * 2. local clients on MDS which doesn't enable "localrecov". + * 3. OFD connect may cause transaction before export has last_rcvd + * slot. 
+ */ + if (ted->ted_lr_idx < 0) + nolcd = true; + + if (req != NULL) + tti->tti_transno = lustre_msg_get_transno(req->rq_reqmsg); + else + /* From update replay, tti_transno should be set already */ + LASSERT(tti->tti_transno != 0); + + spin_lock(&tgt->lut_translock); + if (th->th_result != 0) { + if (tti->tti_transno != 0) { + CERROR("%s: replay transno %llu failed: rc = %d\n", + tgt_name(tgt), tti->tti_transno, th->th_result); + } + } else if (tti->tti_transno == 0) { + tti->tti_transno = ++tgt->lut_last_transno; + } else { + /* should be replay */ + if (tti->tti_transno > tgt->lut_last_transno) + tgt->lut_last_transno = tti->tti_transno; + } + spin_unlock(&tgt->lut_translock); + + /** VBR: set new versions */ + if (th->th_result == 0 && obj != NULL) { + struct dt_object *dto = dt_object_locate(obj, th->th_dev); + dt_version_set(env, dto, tti->tti_transno, th); + } + + /* filling reply data */ + CDEBUG(D_INODE, "transno = %llu, last_committed = %llu\n", + tti->tti_transno, tgt->lut_obd->obd_last_committed); + + if (req != NULL) { + req->rq_transno = tti->tti_transno; + lustre_msg_set_transno(req->rq_repmsg, tti->tti_transno); + } + + /* if can't add callback, do sync write */ + th->th_sync |= !!tgt_last_commit_cb_add(th, tgt, exp, tti->tti_transno); + + if (nolcd) { + /* store transno in the last_rcvd header */ + spin_lock(&tgt->lut_translock); + if (tti->tti_transno > tgt->lut_lsd.lsd_last_transno) { + tgt->lut_lsd.lsd_last_transno = tti->tti_transno; + spin_unlock(&tgt->lut_translock); + /* Although current connection doesn't have slot + * in the last_rcvd, we still want to maintain + * the in-memory lsd_client_data structure in order to + * properly handle reply reconstruction. */ + rc = tgt_server_data_write(env, tgt, th); + } else { + spin_unlock(&tgt->lut_translock); + } + } else if (ted->ted_lr_off == 0) { + CERROR("%s: client idx %d has offset %lld\n", + tgt_name(tgt), ted->ted_lr_idx, ted->ted_lr_off); + RETURN(-EINVAL); + } + + /* Target that supports multiple reply data */ + if (tgt_is_multimodrpcs_client(exp)) { + return tgt_mk_reply_data(env, tgt, ted, req, opdata, th, + !!(req != NULL), tti->tti_transno); + } + + /* Enough for update replay, let's return */ + if (req == NULL) + RETURN(rc); + + mutex_lock(&ted->ted_lcd_lock); + LASSERT(ergo(tti->tti_transno == 0, th->th_result != 0)); + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE) { + transno_p = &ted->ted_lcd->lcd_last_close_transno; + ted->ted_lcd->lcd_last_close_xid = req->rq_xid; + ted->ted_lcd->lcd_last_close_result = th->th_result; + } else { + /* VBR: save versions in last_rcvd for reconstruct. */ + __u64 *pre_versions = lustre_msg_get_versions(req->rq_repmsg); + + if (pre_versions) { + ted->ted_lcd->lcd_pre_versions[0] = pre_versions[0]; + ted->ted_lcd->lcd_pre_versions[1] = pre_versions[1]; + ted->ted_lcd->lcd_pre_versions[2] = pre_versions[2]; + ted->ted_lcd->lcd_pre_versions[3] = pre_versions[3]; + } + transno_p = &ted->ted_lcd->lcd_last_transno; + ted->ted_lcd->lcd_last_xid = req->rq_xid; + ted->ted_lcd->lcd_last_result = th->th_result; + /* XXX: lcd_last_data is __u32 but intent_dispostion is __u64, + * see struct ldlm_reply->lock_policy_res1; */ + ted->ted_lcd->lcd_last_data = opdata; + } + + /* Update transno in slot only if non-zero number, i.e. no errors */ + if (likely(tti->tti_transno != 0)) { + /* Don't overwrite bigger transaction number with lower one. + * That is not sign of problem in all cases, but in any case + * this value should be monotonically increased only. 
*/ + if (*transno_p > tti->tti_transno) { + if (!tgt->lut_no_reconstruct) { + CERROR("%s: trying to overwrite bigger transno:" + "on-disk: %llu, new: %llu replay: " + "%d. See LU-617.\n", tgt_name(tgt), + *transno_p, tti->tti_transno, + req_is_replay(req)); + if (req_is_replay(req)) { + spin_lock(&req->rq_export->exp_lock); + req->rq_export->exp_vbr_failed = 1; + spin_unlock(&req->rq_export->exp_lock); + } + mutex_unlock(&ted->ted_lcd_lock); + RETURN(req_is_replay(req) ? -EOVERFLOW : 0); + } + } else { + *transno_p = tti->tti_transno; + } + } + + if (!nolcd) { + tti->tti_off = ted->ted_lr_off; + if (CFS_FAIL_CHECK(OBD_FAIL_TGT_RCVD_EIO)) + rc = -EIO; + else + rc = tgt_client_data_write(env, tgt, ted->ted_lcd, + &tti->tti_off, th); + if (rc < 0) { + mutex_unlock(&ted->ted_lcd_lock); + RETURN(rc); + } + } + mutex_unlock(&ted->ted_lcd_lock); + RETURN(rc); +} + +/* + * last_rcvd update for echo client simulation. + * It updates last_rcvd client slot and version of object in + * simple way but with all locks to simulate all drawbacks + */ +static int tgt_last_rcvd_update_echo(const struct lu_env *env, + struct lu_target *tgt, + struct dt_object *obj, + struct thandle *th, + struct obd_export *exp) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct tg_export_data *ted = &exp->exp_target_data; + int rc = 0; + + ENTRY; + + tti->tti_transno = 0; + + spin_lock(&tgt->lut_translock); + if (th->th_result == 0) + tti->tti_transno = ++tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + /** VBR: set new versions */ + if (th->th_result == 0 && obj != NULL) + dt_version_set(env, obj, tti->tti_transno, th); + + /* if can't add callback, do sync write */ + th->th_sync |= !!tgt_last_commit_cb_add(th, tgt, exp, + tti->tti_transno); + + LASSERT(ted->ted_lr_off > 0); + + mutex_lock(&ted->ted_lcd_lock); + LASSERT(ergo(tti->tti_transno == 0, th->th_result != 0)); + ted->ted_lcd->lcd_last_transno = tti->tti_transno; + ted->ted_lcd->lcd_last_result = th->th_result; + + tti->tti_off = ted->ted_lr_off; + rc = tgt_client_data_write(env, tgt, ted->ted_lcd, &tti->tti_off, th); + mutex_unlock(&ted->ted_lcd_lock); + RETURN(rc); +} + +static int tgt_clients_data_init(const struct lu_env *env, + struct lu_target *tgt, + unsigned long last_size) +{ + struct obd_device *obd = tgt->lut_obd; + struct lr_server_data *lsd = &tgt->lut_lsd; + struct lsd_client_data *lcd = NULL; + struct tg_export_data *ted; + int cl_idx; + int rc = 0; + loff_t off = lsd->lsd_client_start; + __u32 generation = 0; + struct cfs_hash *hash = NULL; + + ENTRY; + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + BUILD_BUG_ON(offsetof(struct lsd_client_data, lcd_padding) + + sizeof(lcd->lcd_padding) != LR_CLIENT_SIZE); + + OBD_ALLOC_PTR(lcd); + if (lcd == NULL) + RETURN(-ENOMEM); + + hash = cfs_hash_getref(tgt->lut_obd->obd_gen_hash); + if (hash == NULL) + GOTO(err_out, rc = -ENODEV); + + for (cl_idx = 0; off < last_size; cl_idx++) { + struct obd_export *exp; + __u64 last_transno; + + /* Don't assume off is incremented properly by + * read_record(), in case sizeof(*lcd) + * isn't the same as fsd->lsd_client_size. 
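That is why the loop below recomputes the offset from the index on every pass instead of trusting the read to advance it: the on-disk stride (lsd_client_size) may be larger than the structure actually read. A tiny standalone sketch of the same idea, with invented sizes:

#include <stdint.h>
#include <stdio.h>

#define TOY_CLIENT_START 8192u  /* assumed start of client records */
#define TOY_CLIENT_SIZE  128u   /* on-disk stride reserved per client */

struct toy_client_rec {         /* in-memory copy, smaller than the stride */
        uint64_t last_transno;
        char uuid[40];
};

int main(void)
{
        int idx;

        for (idx = 0; idx < 4; idx++) {
                /* recompute from the index; never "off += sizeof(rec)" */
                uint64_t off = TOY_CLIENT_START +
                               (uint64_t)idx * TOY_CLIENT_SIZE;

                printf("record %d: read %zu bytes at offset %llu\n",
                       idx, sizeof(struct toy_client_rec),
                       (unsigned long long)off);
        }
        return 0;
}

The reply_data file is walked the same way later in this patch: a fixed header size plus the index times the record size.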
*/ + off = lsd->lsd_client_start + cl_idx * lsd->lsd_client_size; + rc = tgt_client_data_read(env, tgt, lcd, &off, cl_idx); + if (rc) { + CERROR("%s: error reading last_rcvd %s idx %d off " + "%llu: rc = %d\n", tgt_name(tgt), LAST_RCVD, + cl_idx, off, rc); + rc = 0; + break; /* read error shouldn't cause startup to fail */ + } + + if (lcd->lcd_uuid[0] == '\0') { + CDEBUG(D_INFO, "skipping zeroed client at offset %d\n", + cl_idx); + continue; + } + + last_transno = lcd_last_transno(lcd); + + /* These exports are cleaned up by disconnect, so they + * need to be set up like real exports as connect does. + */ + CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: %llu" + " srv lr: %llu lx: %llu gen %u\n", lcd->lcd_uuid, + cl_idx, last_transno, lsd->lsd_last_transno, + lcd_last_xid(lcd), lcd->lcd_generation); + + exp = class_new_export(obd, (struct obd_uuid *)lcd->lcd_uuid); + if (IS_ERR(exp)) { + if (PTR_ERR(exp) == -EALREADY) { + /* export already exists, zero out this one */ + CERROR("%s: Duplicate export %s!\n", + tgt_name(tgt), lcd->lcd_uuid); + continue; + } + GOTO(err_out, rc = PTR_ERR(exp)); + } + + ted = &exp->exp_target_data; + *ted->ted_lcd = *lcd; + + rc = tgt_client_add(env, exp, cl_idx); + LASSERTF(rc == 0, "rc = %d\n", rc); /* can't fail existing */ + /* VBR: set export last committed version */ + exp->exp_last_committed = last_transno; + spin_lock(&exp->exp_lock); + exp->exp_connecting = 0; + exp->exp_in_recovery = 0; + spin_unlock(&exp->exp_lock); + atomic_inc(&obd->obd_max_recoverable_clients); + + if (tgt_is_multimodrpcs_record(tgt, lcd)) { + atomic_inc(&tgt->lut_num_clients); + + /* compute the highest valid client generation */ + generation = max(generation, lcd->lcd_generation); + /* fill client_generation <-> export hash table */ + rc = cfs_hash_add_unique(hash, &lcd->lcd_generation, + &exp->exp_gen_hash); + if (rc != 0) { + CERROR("%s: duplicate export for client " + "generation %u\n", + tgt_name(tgt), lcd->lcd_generation); + class_export_put(exp); + GOTO(err_out, rc); + } + } + + class_export_put(exp); + + rc = rev_import_init(exp); + if (rc != 0) { + class_unlink_export(exp); + GOTO(err_out, rc); + } + + /* Need to check last_rcvd even for duplicated exports. 
*/ + CDEBUG(D_OTHER, "client at idx %d has last_transno = %llu\n", + cl_idx, last_transno); + + spin_lock(&tgt->lut_translock); + tgt->lut_last_transno = max(last_transno, + tgt->lut_last_transno); + spin_unlock(&tgt->lut_translock); + } + + /* record highest valid client generation */ + atomic_set(&tgt->lut_client_generation, generation); + +err_out: + if (hash != NULL) + cfs_hash_putref(hash); + OBD_FREE_PTR(lcd); + RETURN(rc); +} + +struct server_compat_data { + __u32 rocompat; + __u32 incompat; + __u32 rocinit; + __u32 incinit; +}; + +static struct server_compat_data tgt_scd[] = { + [LDD_F_SV_TYPE_MDT] = { + .rocompat = OBD_ROCOMPAT_LOVOBJID, + .incompat = OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR | + OBD_INCOMPAT_FID | OBD_INCOMPAT_IAM_DIR | + OBD_INCOMPAT_LMM_VER | OBD_INCOMPAT_MULTI_OI | + OBD_INCOMPAT_MULTI_RPCS, + .rocinit = OBD_ROCOMPAT_LOVOBJID, + .incinit = OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR | + OBD_INCOMPAT_MULTI_OI, + }, + [LDD_F_SV_TYPE_OST] = { + .rocompat = OBD_ROCOMPAT_IDX_IN_IDIF, + .incompat = OBD_INCOMPAT_OST | OBD_INCOMPAT_COMMON_LR | + OBD_INCOMPAT_FID, + .rocinit = OBD_ROCOMPAT_IDX_IN_IDIF, + .incinit = OBD_INCOMPAT_OST | OBD_INCOMPAT_COMMON_LR, + } +}; + +int tgt_server_data_init(const struct lu_env *env, struct lu_target *tgt) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct lr_server_data *lsd = &tgt->lut_lsd; + unsigned long last_rcvd_size; + __u32 index; + int rc, type; + + rc = dt_attr_get(env, tgt->lut_last_rcvd, &tti->tti_attr); + if (rc) + RETURN(rc); + + last_rcvd_size = (unsigned long)tti->tti_attr.la_size; + + /* ensure padding in the struct is the correct size */ + BUILD_BUG_ON(offsetof(struct lr_server_data, lsd_padding) + + sizeof(lsd->lsd_padding) != LR_SERVER_SIZE); + + rc = server_name2index(tgt_name(tgt), &index, NULL); + if (rc < 0) { + CERROR("%s: Can not get index from name: rc = %d\n", + tgt_name(tgt), rc); + RETURN(rc); + } + /* server_name2index() returns type */ + type = rc; + if (type != LDD_F_SV_TYPE_MDT && type != LDD_F_SV_TYPE_OST) { + CERROR("%s: unknown target type %x\n", tgt_name(tgt), type); + RETURN(-EINVAL); + } + + /* last_rcvd on OST doesn't provide reconstruct support because there + * may be up to 8 in-flight write requests per single slot in + * last_rcvd client data + */ + tgt->lut_no_reconstruct = (type == LDD_F_SV_TYPE_OST); + + if (last_rcvd_size == 0) { + LCONSOLE_WARN("%s: new disk, initializing\n", tgt_name(tgt)); + + memcpy(lsd->lsd_uuid, tgt->lut_obd->obd_uuid.uuid, + sizeof(lsd->lsd_uuid)); + lsd->lsd_last_transno = 0; + lsd->lsd_mount_count = 0; + lsd->lsd_server_size = LR_SERVER_SIZE; + lsd->lsd_client_start = LR_CLIENT_START; + lsd->lsd_client_size = LR_CLIENT_SIZE; + lsd->lsd_subdir_count = OBJ_SUBDIR_COUNT; + lsd->lsd_osd_index = index; + lsd->lsd_feature_rocompat = tgt_scd[type].rocinit; + lsd->lsd_feature_incompat = tgt_scd[type].incinit; + } else { + rc = tgt_server_data_read(env, tgt); + if (rc) { + CERROR("%s: error reading LAST_RCVD: rc= %d\n", + tgt_name(tgt), rc); + RETURN(rc); + } + if (strcmp(lsd->lsd_uuid, tgt->lut_obd->obd_uuid.uuid)) { + if (tgt->lut_bottom->dd_rdonly) { + /* Such difference may be caused by mounting + * up snapshot with new fsname under rd_only + * mode. But even if it was NOT, it will not + * damage the system because of "rd_only". */ + memcpy(lsd->lsd_uuid, + tgt->lut_obd->obd_uuid.uuid, + sizeof(lsd->lsd_uuid)); + } else { + LCONSOLE_ERROR_MSG(0x157, "Trying to start " + "OBD %s using the wrong " + "disk %s. 
Were the /dev/ " + "assignments rearranged?\n", + tgt->lut_obd->obd_uuid.uuid, + lsd->lsd_uuid); + RETURN(-EINVAL); + } + } + + if (lsd->lsd_osd_index != index) { + LCONSOLE_ERROR_MSG(0x157, + "%s: index %d in last rcvd is different with the index %d in config log, It might be disk corruption!\n", + tgt_name(tgt), + lsd->lsd_osd_index, index); + RETURN(-EINVAL); + } + } + + if (lsd->lsd_feature_incompat & ~tgt_scd[type].incompat) { + CERROR("%s: unsupported incompat filesystem feature(s) %x\n", + tgt_name(tgt), + lsd->lsd_feature_incompat & ~tgt_scd[type].incompat); + RETURN(-EINVAL); + } + + if (type == LDD_F_SV_TYPE_MDT) + lsd->lsd_feature_incompat |= OBD_INCOMPAT_FID; + + if (lsd->lsd_feature_rocompat & ~tgt_scd[type].rocompat) { + CERROR("%s: unsupported read-only filesystem feature(s) %x\n", + tgt_name(tgt), + lsd->lsd_feature_rocompat & ~tgt_scd[type].rocompat); + RETURN(-EINVAL); + } + /** Interop: evict all clients at first boot with 1.8 last_rcvd */ + if (type == LDD_F_SV_TYPE_MDT && + !(lsd->lsd_feature_compat & OBD_COMPAT_20)) { + if (last_rcvd_size > lsd->lsd_client_start) { + LCONSOLE_WARN("%s: mounting at first time on 1.8 FS, " + "remove all clients for interop needs\n", + tgt_name(tgt)); + rc = tgt_truncate_last_rcvd(env, tgt, + lsd->lsd_client_start); + if (rc) + RETURN(rc); + last_rcvd_size = lsd->lsd_client_start; + } + /** set 2.0 flag to upgrade/downgrade between 1.8 and 2.0 */ + lsd->lsd_feature_compat |= OBD_COMPAT_20; + } + + spin_lock(&tgt->lut_translock); + tgt->lut_last_transno = lsd->lsd_last_transno; + spin_unlock(&tgt->lut_translock); + + lsd->lsd_mount_count++; + + CDEBUG(D_INODE, "=======,=BEGIN DUMPING LAST_RCVD========\n"); + CDEBUG(D_INODE, "%s: server last_transno: %llu\n", + tgt_name(tgt), tgt->lut_last_transno); + CDEBUG(D_INODE, "%s: server mount_count: %llu\n", + tgt_name(tgt), lsd->lsd_mount_count); + CDEBUG(D_INODE, "%s: server data size: %u\n", + tgt_name(tgt), lsd->lsd_server_size); + CDEBUG(D_INODE, "%s: per-client data start: %u\n", + tgt_name(tgt), lsd->lsd_client_start); + CDEBUG(D_INODE, "%s: per-client data size: %u\n", + tgt_name(tgt), lsd->lsd_client_size); + CDEBUG(D_INODE, "%s: last_rcvd size: %lu\n", + tgt_name(tgt), last_rcvd_size); + CDEBUG(D_INODE, "%s: server subdir_count: %u\n", + tgt_name(tgt), lsd->lsd_subdir_count); + CDEBUG(D_INODE, "%s: last_rcvd clients: %lu\n", tgt_name(tgt), + last_rcvd_size <= lsd->lsd_client_start ? 
0 : + (last_rcvd_size - lsd->lsd_client_start) / + lsd->lsd_client_size); + CDEBUG(D_INODE, "========END DUMPING LAST_RCVD========\n"); + + if (lsd->lsd_server_size == 0 || lsd->lsd_client_start == 0 || + lsd->lsd_client_size == 0) { + CERROR("%s: bad last_rcvd contents!\n", tgt_name(tgt)); + RETURN(-EINVAL); + } + + if (!tgt->lut_obd->obd_replayable) + CWARN("%s: recovery support OFF\n", tgt_name(tgt)); + + rc = tgt_clients_data_init(env, tgt, last_rcvd_size); + if (rc < 0) + GOTO(err_client, rc); + + spin_lock(&tgt->lut_translock); + /* obd_last_committed is used for compatibility + * with other lustre recovery code */ + tgt->lut_obd->obd_last_committed = tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + tgt->lut_obd->u.obt.obt_mount_count = lsd->lsd_mount_count; + tgt->lut_obd->u.obt.obt_instance = (__u32)lsd->lsd_mount_count; + + /* save it, so mount count and last_transno is current */ + rc = tgt_server_data_update(env, tgt, 0); + if (rc < 0) + GOTO(err_client, rc); + + RETURN(0); + +err_client: + class_disconnect_exports(tgt->lut_obd); + return rc; +} + +/* add credits for last_rcvd update */ +int tgt_txn_start_cb(const struct lu_env *env, struct thandle *th, + void *cookie) +{ + struct lu_target *tgt = cookie; + struct tgt_session_info *tsi; + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + int rc; + + /* For readonly case, the caller should have got failure + * when start the transaction. If the logic comes here, + * there must be something wrong. */ + if (unlikely(tgt->lut_bottom->dd_rdonly)) { + dump_stack(); + LBUG(); + } + + /* if there is no session, then this transaction is not result of + * request processing but some local operation */ + if (env->le_ses == NULL) + return 0; + + LASSERT(tgt->lut_last_rcvd); + tsi = tgt_ses_info(env); + /* OFD may start transaction without export assigned */ + if (tsi->tsi_exp == NULL) + return 0; + + if (tgt_is_multimodrpcs_client(tsi->tsi_exp)) { + /* + * Use maximum possible file offset for declaration to ensure + * ZFS will reserve enough credits for a write anywhere in this + * file, since we don't know where in the file the write will be + * because a replay slot has not been assigned. This should be + * replaced by dmu_tx_hold_append() when available. 
+ */ + tti->tti_buf.lb_buf = NULL; + tti->tti_buf.lb_len = sizeof(struct lsd_reply_data); + dto = dt_object_locate(tgt->lut_reply_data, th->th_dev); + rc = dt_declare_record_write(env, dto, &tti->tti_buf, -1, th); + if (rc) + return rc; + } else { + dto = dt_object_locate(tgt->lut_last_rcvd, th->th_dev); + tti_buf_lcd(tti); + tti->tti_off = tsi->tsi_exp->exp_target_data.ted_lr_off; + rc = dt_declare_record_write(env, dto, &tti->tti_buf, + tti->tti_off, th); + if (rc) + return rc; + } + + if (tsi->tsi_vbr_obj != NULL && + !lu_object_remote(&tsi->tsi_vbr_obj->do_lu)) { + dto = dt_object_locate(tsi->tsi_vbr_obj, th->th_dev); + rc = dt_declare_version_set(env, dto, th); + } + + return rc; +} + +/* Update last_rcvd records with latests transaction data */ +int tgt_txn_stop_cb(const struct lu_env *env, struct thandle *th, + void *cookie) +{ + struct lu_target *tgt = cookie; + struct tgt_session_info *tsi; + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *obj = NULL; + int rc; + bool echo_client; + + if (env->le_ses == NULL) + return 0; + + tsi = tgt_ses_info(env); + /* OFD may start transaction without export assigned */ + if (tsi->tsi_exp == NULL) + return 0; + + echo_client = (tgt_ses_req(tsi) == NULL && tsi->tsi_xid == 0); + + if (tti->tti_has_trans && !echo_client) { + if (tti->tti_mult_trans == 0) { + CDEBUG(D_HA, "More than one transaction %llu\n", + tti->tti_transno); + RETURN(0); + } + /* we need another transno to be assigned */ + tti->tti_transno = 0; + } else if (th->th_result == 0) { + tti->tti_has_trans = 1; + } + + if (tsi->tsi_vbr_obj != NULL && + !lu_object_remote(&tsi->tsi_vbr_obj->do_lu)) { + obj = tsi->tsi_vbr_obj; + } + + if (unlikely(echo_client)) /* echo client special case */ + rc = tgt_last_rcvd_update_echo(env, tgt, obj, th, + tsi->tsi_exp); + else + rc = tgt_last_rcvd_update(env, tgt, obj, tsi->tsi_opdata, th, + tgt_ses_req(tsi)); + return rc; +} + +int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct lsd_reply_data *lrd = &tti->tti_lrd; + unsigned long reply_data_size; + int rc; + struct lsd_reply_header *lrh = NULL; + struct tg_reply_data *trd = NULL; + int idx; + loff_t off; + struct cfs_hash *hash = NULL; + struct obd_export *exp; + struct tg_export_data *ted; + int reply_data_recovered = 0; + + rc = dt_attr_get(env, tgt->lut_reply_data, &tti->tti_attr); + if (rc) + GOTO(out, rc); + reply_data_size = (unsigned long)tti->tti_attr.la_size; + + OBD_ALLOC_PTR(lrh); + if (lrh == NULL) + GOTO(out, rc = -ENOMEM); + + if (reply_data_size == 0) { + CDEBUG(D_INFO, "%s: new reply_data file, initializing\n", + tgt_name(tgt)); + lrh->lrh_magic = LRH_MAGIC; + lrh->lrh_header_size = sizeof(struct lsd_reply_header); + lrh->lrh_reply_size = sizeof(struct lsd_reply_data); + rc = tgt_reply_header_write(env, tgt, lrh); + if (rc) { + CERROR("%s: error writing %s: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + GOTO(out, rc); + } + } else { + rc = tgt_reply_header_read(env, tgt, lrh); + if (rc) { + CERROR("%s: error reading %s: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + GOTO(out, rc); + } + if (lrh->lrh_magic != LRH_MAGIC || + lrh->lrh_header_size != sizeof(struct lsd_reply_header) || + lrh->lrh_reply_size != sizeof(struct lsd_reply_data)) { + CERROR("%s: invalid header in %s\n", + tgt_name(tgt), REPLY_DATA); + GOTO(out, rc = -EINVAL); + } + + hash = cfs_hash_getref(tgt->lut_obd->obd_gen_hash); + if (hash == NULL) + GOTO(out, rc = -ENODEV); + + OBD_ALLOC_PTR(trd); + if (trd == NULL) 
+ GOTO(out, rc = -ENOMEM); + + /* Load reply_data from disk */ + for (idx = 0, off = sizeof(struct lsd_reply_header); + off < reply_data_size; + idx++, off += sizeof(struct lsd_reply_data)) { + rc = tgt_reply_data_read(env, tgt, lrd, off); + if (rc) { + CERROR("%s: error reading %s: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + GOTO(out, rc); + } + + exp = cfs_hash_lookup(hash, &lrd->lrd_client_gen); + if (exp == NULL) { + /* old reply data from a disconnected client */ + continue; + } + ted = &exp->exp_target_data; + mutex_lock(&ted->ted_lcd_lock); + + /* create in-memory reply_data and link it to + * target export's reply list */ + rc = tgt_set_reply_slot(tgt, idx); + if (rc != 0) { + mutex_unlock(&ted->ted_lcd_lock); + GOTO(out, rc); + } + trd->trd_reply = *lrd; + trd->trd_pre_versions[0] = 0; + trd->trd_pre_versions[1] = 0; + trd->trd_pre_versions[2] = 0; + trd->trd_pre_versions[3] = 0; + trd->trd_index = idx; + trd->trd_tag = 0; + fid_zero(&trd->trd_object); + list_add(&trd->trd_list, &ted->ted_reply_list); + ted->ted_reply_cnt++; + if (ted->ted_reply_cnt > ted->ted_reply_max) + ted->ted_reply_max = ted->ted_reply_cnt; + + CDEBUG(D_HA, "%s: restore reply %p: xid %llu, " + "transno %llu, client gen %u, slot idx %d\n", + tgt_name(tgt), trd, lrd->lrd_xid, + lrd->lrd_transno, lrd->lrd_client_gen, + trd->trd_index); + + /* update export last committed transation */ + exp->exp_last_committed = max(exp->exp_last_committed, + lrd->lrd_transno); + /* Update lcd_last_transno as well for check in + * tgt_release_reply_data() or the latest client + * transno can be lost. + */ + ted->ted_lcd->lcd_last_transno = + max(ted->ted_lcd->lcd_last_transno, + exp->exp_last_committed); + + mutex_unlock(&ted->ted_lcd_lock); + class_export_put(exp); + + /* update target last committed transaction */ + spin_lock(&tgt->lut_translock); + tgt->lut_last_transno = max(tgt->lut_last_transno, + lrd->lrd_transno); + spin_unlock(&tgt->lut_translock); + + reply_data_recovered++; + + OBD_ALLOC_PTR(trd); + if (trd == NULL) + GOTO(out, rc = -ENOMEM); + } + CDEBUG(D_INFO, "%s: %d reply data have been recovered\n", + tgt_name(tgt), reply_data_recovered); + } + + spin_lock(&tgt->lut_translock); + /* obd_last_committed is used for compatibility + * with other lustre recovery code */ + tgt->lut_obd->obd_last_committed = tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + rc = 0; + +out: + if (hash != NULL) + cfs_hash_putref(hash); + if (trd != NULL) + OBD_FREE_PTR(trd); + if (lrh != NULL) + OBD_FREE_PTR(lrh); + return rc; +} + +static int tgt_check_lookup_req(struct ptlrpc_request *req, int lookup, + struct tg_reply_data *trd) +{ + struct tg_export_data *ted = &req->rq_export->exp_target_data; + struct lu_target *lut = class_exp2tgt(req->rq_export); + __u16 tag = lustre_msg_get_tag(req->rq_reqmsg); + int rc = 0; + struct tg_reply_data *reply; + bool check_increasing; + + if (tag == 0) + return 0; + + check_increasing = tgt_is_increasing_xid_client(req->rq_export) && + !(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY); + if (!lookup && !check_increasing) + return 0; + + list_for_each_entry(reply, &ted->ted_reply_list, trd_list) { + if (lookup && reply->trd_reply.lrd_xid == req->rq_xid) { + rc = 1; + if (trd != NULL) + *trd = *reply; + break; + } else if (check_increasing && reply->trd_tag == tag && + reply->trd_reply.lrd_xid > req->rq_xid) { + rc = -EPROTO; + CERROR("%s: busy tag=%u req_xid=%llu, trd=%p: xid=%llu transno=%llu client_gen=%u slot_idx=%d: rc = %d\n", + tgt_name(lut), tag, req->rq_xid, trd, + 
reply->trd_reply.lrd_xid, + reply->trd_reply.lrd_transno, + reply->trd_reply.lrd_client_gen, + reply->trd_index, rc); + break; + } + } + + return rc; +} + +/* Look for a reply data matching specified request @req + * A copy is returned in @trd if the pointer is not NULL + */ +int tgt_lookup_reply(struct ptlrpc_request *req, struct tg_reply_data *trd) +{ + struct tg_export_data *ted = &req->rq_export->exp_target_data; + int found = 0; + bool not_replay = !(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY); + + mutex_lock(&ted->ted_lcd_lock); + if (not_replay && req->rq_xid <= req->rq_export->exp_last_xid) { + /* A check for the last_xid is needed here in case there is + * no reply data is left in the list. It may happen if another + * RPC on another slot increased the last_xid between our + * process_req_last_xid & tgt_lookup_reply calls */ + found = -EPROTO; + } else { + found = tgt_check_lookup_req(req, 1, trd); + } + mutex_unlock(&ted->ted_lcd_lock); + + CDEBUG(D_TRACE, "%s: lookup reply xid %llu, found %d last_xid %llu\n", + tgt_name(class_exp2tgt(req->rq_export)), req->rq_xid, found, + req->rq_export->exp_last_xid); + + return found; +} +EXPORT_SYMBOL(tgt_lookup_reply); + +int tgt_handle_received_xid(struct obd_export *exp, __u64 rcvd_xid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = class_exp2tgt(exp); + struct tg_reply_data *trd, *tmp; + + + list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { + if (trd->trd_reply.lrd_xid > rcvd_xid) + continue; + ted->ted_release_xid++; + tgt_release_reply_data(lut, ted, trd); + } + + return 0; +} + +int tgt_handle_tag(struct ptlrpc_request *req) +{ + return tgt_check_lookup_req(req, 0, NULL); +} + diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_main.c b/drivers/staging/lustrefsx/lustre/target/tgt_main.c new file mode 100644 index 0000000000000..76ccece817326 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_main.c @@ -0,0 +1,853 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/target/tgt_main.c + * + * Lustre Unified Target main initialization code + * + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include "tgt_internal.h" +#include "../ptlrpc/ptlrpc_internal.h" + +/* This must be longer than the longest string below */ +#define SYNC_STATES_MAXLEN 16 +static const char * const sync_lock_cancel_states[] = { + [SYNC_LOCK_CANCEL_NEVER] = "never", + [SYNC_LOCK_CANCEL_BLOCKING] = "blocking", + [SYNC_LOCK_CANCEL_ALWAYS] = "always", +}; + +/** + * Show policy for handling dirty data under a lock being cancelled. 
+ * + * \param[in] kobj sysfs kobject + * \param[in] attr sysfs attribute + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t sync_lock_cancel_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *tgt = obd->u.obt.obt_lut; + + return sprintf(buf, "%s\n", + sync_lock_cancel_states[tgt->lut_sync_lock_cancel]); +} +EXPORT_SYMBOL(sync_lock_cancel_show); + +/** + * Change policy for handling dirty data under a lock being cancelled. + * + * This variable defines what action target takes upon lock cancel + * There are three possible modes: + * 1) never - never do sync upon lock cancel. This can lead to data + * inconsistencies if both the OST and client crash while writing a file + * that is also concurrently being read by another client. In these cases, + * this may allow the file data to "rewind" to an earlier state. + * 2) blocking - do sync only if there is blocking lock, e.g. if another + * client is trying to access this same object + * 3) always - do sync always + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative value on error + */ +ssize_t sync_lock_cancel_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *tgt = obd->u.obt.obt_lut; + int val = -1; + enum tgt_sync_lock_cancel slc; + + if (count == 0 || count >= SYNC_STATES_MAXLEN) + return -EINVAL; + + for (slc = 0; slc < ARRAY_SIZE(sync_lock_cancel_states); slc++) { + if (strcmp(buffer, sync_lock_cancel_states[slc]) == 0) { + val = slc; + break; + } + } + + /* Legacy numeric codes */ + if (val == -1) { + int rc = kstrtoint(buffer, 0, &val); + if (rc) + return rc; + } + + if (val < 0 || val > 2) + return -EINVAL; + + spin_lock(&tgt->lut_flags_lock); + tgt->lut_sync_lock_cancel = val; + spin_unlock(&tgt->lut_flags_lock); + return count; +} +EXPORT_SYMBOL(sync_lock_cancel_store); +LUSTRE_RW_ATTR(sync_lock_cancel); + +/** + * Show maximum number of Filter Modification Data (FMD) maintained. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t tgt_fmd_count_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + + return sprintf(buf, "%u\n", lut->lut_fmd_max_num); +} + +/** + * Change number of FMDs maintained by target. + * + * This defines how large the list of FMDs can be. 
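The store handlers in this file share one shape: parse the user buffer, validate the range, and return count on success. sync_lock_cancel_store above additionally accepts either a state name or a legacy numeric code. A standalone sketch of that name-or-number parsing follows; it uses plain strtol instead of the kernel's kstrtoint and invented names throughout.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const char *const toy_states[] = { "never", "blocking", "always" };

/* return the state index for a name or a legacy numeric code, or -1 */
static int toy_parse_state(const char *buf)
{
        size_t i;
        char *end;
        long val;

        for (i = 0; i < sizeof(toy_states) / sizeof(toy_states[0]); i++)
                if (strcmp(buf, toy_states[i]) == 0)
                        return (int)i;

        val = strtol(buf, &end, 0);     /* legacy numeric codes */
        if (end == buf || *end != '\0' || val < 0 || val > 2)
                return -1;
        return (int)val;
}

int main(void)
{
        printf("%d %d %d\n", toy_parse_state("blocking"),
               toy_parse_state("2"), toy_parse_state("bogus"));
        return 0;
}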
+ * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative value on error + */ +ssize_t tgt_fmd_count_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + int val, rc; + + rc = kstrtoint(buffer, 0, &val); + if (rc) + return rc; + + if (val < 1 || val > 65536) + return -EINVAL; + + lut->lut_fmd_max_num = val; + + return count; +} +LUSTRE_RW_ATTR(tgt_fmd_count); + +/** + * Show the maximum age of FMD data in seconds. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t tgt_fmd_seconds_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + + return sprintf(buf, "%lld\n", lut->lut_fmd_max_age); +} + +/** + * Set the maximum age of FMD data in seconds. + * + * This defines how long FMD data stays in the FMD list. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative number on error + */ +ssize_t tgt_fmd_seconds_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + time64_t val; + int rc; + + rc = kstrtoll(buffer, 0, &val); + if (rc) + return rc; + + if (val < 1 || val > 65536) /* ~ 18 hour max */ + return -EINVAL; + + lut->lut_fmd_max_age = val; + + return count; +} +LUSTRE_RW_ATTR(tgt_fmd_seconds); + +/* These two aliases are old names and kept for compatibility, they were + * changed to 'tgt_fmd_count' and 'tgt_fmd_seconds'. + * This change was made in Lustre 2.13, so these aliases can be removed + * when back compatibility is not needed with any Lustre version prior 2.13 + */ +static struct lustre_attr tgt_fmd_count_compat = __ATTR(client_cache_count, + 0644, tgt_fmd_count_show, tgt_fmd_count_store); +static struct lustre_attr tgt_fmd_seconds_compat = __ATTR(client_cache_seconds, + 0644, tgt_fmd_seconds_show, tgt_fmd_seconds_store); + +static const struct attribute *tgt_attrs[] = { + &lustre_attr_sync_lock_cancel.attr, + &lustre_attr_tgt_fmd_count.attr, + &lustre_attr_tgt_fmd_seconds.attr, + &tgt_fmd_count_compat.attr, + &tgt_fmd_seconds_compat.attr, + NULL, +}; + +/** + * Decide which checksums both client and OST support, possibly forcing + * the use of T10PI checksums if the hardware supports this. + * + * The clients that have no T10-PI RPC checksum support will use the same + * mechanism to select checksum type as before, and will not be affected by + * the following logic. + * + * For the clients that have T10-PI RPC checksum support: + * + * If the target supports T10-PI feature and T10-PI checksum is enforced, + * clients will have no other choice for RPC checksum type other than using + * the T10PI checksum type. This is useful for enforcing end-to-end integrity + * in the whole system. 
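The FMD tunables above follow a common validate-then-store pattern: parse the written value, reject anything outside the documented 1..65536 window, and only then update the target field. A minimal userspace sketch of that pattern, with a hypothetical helper name:

#include <errno.h>
#include <stdlib.h>

static int store_bounded_uint(const char *buf, unsigned int *out,
			      unsigned int min, unsigned int max)
{
	char *end;
	unsigned long val;

	errno = 0;
	val = strtoul(buf, &end, 0);
	if (errno != 0 || end == buf)
		return -EINVAL;
	if (val < min || val > max)
		return -EINVAL;

	*out = (unsigned int)val;	/* commit only after validation */
	return 0;
}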
+ * + * If the target doesn't support T10-PI feature and T10-PI checksum is + * enforced, together with other checksum with reasonably good speeds (e.g. + * crc32, crc32c, adler, etc.), all T10-PI checksum types understood by the + * client (t10ip512, t10ip4K, t10crc512, t10crc4K) will be added to the + * available checksum types, regardless of the speeds of T10-PI checksums. + * This is useful for testing T10-PI checksum of RPC. + * + * If the target supports T10-PI feature and T10-PI checksum is NOT enforced, + * the corresponding T10-PI checksum type will be added to the checksum type + * list, regardless of the speed of the T10-PI checksum. This provides clients + * the flexibility to choose whether to enable end-to-end integrity or not. + * + * If the target does NOT supports T10-PI feature and T10-PI checksum is NOT + * enforced, together with other checksums with reasonably good speeds, + * all the T10-PI checksum types with good speeds will be added into the + * checksum type list. Note that a T10-PI checksum type with a speed worse + * than half of Alder will NOT be added as a option. In this circumstance, + * T10-PI checksum types has the same behavior like other normal checksum + * types. + */ +void tgt_mask_cksum_types(struct lu_target *lut, enum cksum_types *cksum_types) +{ + bool enforce = lut->lut_cksum_t10pi_enforce; + enum cksum_types tgt_t10_cksum_type; + enum cksum_types client_t10_types = *cksum_types & OBD_CKSUM_T10_ALL; + enum cksum_types server_t10_types; + + /* + * The client set in ocd_cksum_types the checksum types it + * supports. We have to mask off the algorithms that we don't + * support. T10PI checksum types will be added later. + */ + *cksum_types &= (lut->lut_cksum_types_supported & ~OBD_CKSUM_T10_ALL); + server_t10_types = lut->lut_cksum_types_supported & OBD_CKSUM_T10_ALL; + tgt_t10_cksum_type = lut->lut_dt_conf.ddp_t10_cksum_type; + + /* Quick exit if no T10-PI support on client */ + if (!client_t10_types) + return; + + /* + * This OST has NO T10-PI feature. Add all supported T10-PI checksums + * as options if T10-PI checksum is enforced. If the T10-PI checksum is + * not enforced, only add them as options when speed is good. + */ + if (tgt_t10_cksum_type == 0) { + /* + * Server allows all T10PI checksums, and server_t10_types + * include quick ones. + */ + if (enforce) + *cksum_types |= client_t10_types; + else + *cksum_types |= client_t10_types & server_t10_types; + return; + } + + /* + * This OST has T10-PI feature. Disable all other checksum types if + * T10-PI checksum is enforced. If the T10-PI checksum is not enforced, + * add the checksum type as an option. + */ + if (client_t10_types & tgt_t10_cksum_type) { + if (enforce) + *cksum_types = tgt_t10_cksum_type; + else + *cksum_types |= tgt_t10_cksum_type; + } +} +EXPORT_SYMBOL(tgt_mask_cksum_types); + +int tgt_tunables_init(struct lu_target *lut) +{ + int rc; + + rc = sysfs_create_files(&lut->lut_obd->obd_kset.kobj, tgt_attrs); + if (!rc) + lut->lut_attrs = tgt_attrs; + return rc; +} +EXPORT_SYMBOL(tgt_tunables_init); + +void tgt_tunables_fini(struct lu_target *lut) +{ + if (lut->lut_attrs) { + sysfs_remove_files(&lut->lut_obd->obd_kset.kobj, + lut->lut_attrs); + lut->lut_attrs = NULL; + } +} +EXPORT_SYMBOL(tgt_tunables_fini); + +/* + * Save cross-MDT lock in lut_slc_locks. + * + * Lock R/W count is not saved, but released in unlock (not canceled remotely), + * instead only a refcount is taken, so that the remote MDT where the object + * resides can detect conflict with this lock there. 
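The bitmask arithmetic behind tgt_mask_cksum_types() can be summarized with a simplified sketch using hypothetical flag values: the client's advertised set is first reduced to the non-T10 algorithms the server supports, and the T10-PI types are then either forced (enforce plus server T10 support), offered as an option, or limited to the subset the server also advertises.

#include <stdbool.h>
#include <stdint.h>

#define CK_CRC32	0x01u
#define CK_ADLER	0x02u
#define CK_T10_ALL	0xf0u	/* hypothetical T10-PI type mask */

static uint32_t mask_cksum_types(uint32_t client, uint32_t server,
				 uint32_t srv_t10_type, bool enforce)
{
	uint32_t client_t10 = client & CK_T10_ALL;
	uint32_t server_t10 = server & CK_T10_ALL;
	uint32_t types = client & server & ~CK_T10_ALL;

	if (client_t10 == 0)		/* client has no T10-PI support */
		return types;

	if (srv_t10_type == 0)		/* no T10-PI hardware on this target */
		return types | (enforce ? client_t10
					: (client_t10 & server_t10));

	if (client_t10 & srv_t10_type)	/* both sides agree on a T10 type */
		return enforce ? srv_t10_type : types | srv_t10_type;

	return types;
}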
+ * + * \param lut target + * \param lock cross-MDT lock to save + * \param transno when the transaction with this transno is committed, this lock + * can be canceled. + */ +void tgt_save_slc_lock(struct lu_target *lut, struct ldlm_lock *lock, + __u64 transno) +{ + spin_lock(&lut->lut_slc_locks_guard); + lock_res_and_lock(lock); + if (ldlm_is_cbpending(lock)) { + /* if it was canceld by server, don't save, because remote MDT + * will do Sync-on-Cancel. */ + LDLM_LOCK_PUT(lock); + } else { + lock->l_transno = transno; + /* if this lock is in the list already, there are two operations + * both use this lock, and save it after use, so for the second + * one, just put the refcount. */ + if (list_empty(&lock->l_slc_link)) + list_add_tail(&lock->l_slc_link, &lut->lut_slc_locks); + else + LDLM_LOCK_PUT(lock); + } + unlock_res_and_lock(lock); + spin_unlock(&lut->lut_slc_locks_guard); +} +EXPORT_SYMBOL(tgt_save_slc_lock); + +/* + * Discard cross-MDT lock from lut_slc_locks. + * + * This is called upon BAST, just remove lock from lut_slc_locks and put lock + * refcount. The BAST will cancel this lock. + * + * \param lut target + * \param lock cross-MDT lock to discard + */ +void tgt_discard_slc_lock(struct lu_target *lut, struct ldlm_lock *lock) +{ + spin_lock(&lut->lut_slc_locks_guard); + lock_res_and_lock(lock); + /* may race with tgt_cancel_slc_locks() */ + if (lock->l_transno != 0) { + LASSERT(!list_empty(&lock->l_slc_link)); + LASSERT(ldlm_is_cbpending(lock)); + list_del_init(&lock->l_slc_link); + lock->l_transno = 0; + LDLM_LOCK_PUT(lock); + } + unlock_res_and_lock(lock); + spin_unlock(&lut->lut_slc_locks_guard); +} +EXPORT_SYMBOL(tgt_discard_slc_lock); + +/* + * Cancel cross-MDT locks upon transaction commit. + * + * Remove cross-MDT locks from lut_slc_locks, cancel them and put lock refcount. + * + * \param lut target + * \param transno transaction with this number was committed. 
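The reference handling in tgt_save_slc_lock() follows a "save once, drop the duplicate reference" rule: the first save lets the list keep the caller's reference alive, while any later save of the same lock simply releases its own reference. A hypothetical userspace sketch of that rule:

#include <stdbool.h>
#include <stdlib.h>

struct slc_lock {
	int refs;		/* references held on this lock */
	bool on_list;		/* already queued for sync-on-commit? */
	unsigned long long transno;
};

static void lock_put(struct slc_lock *lk)
{
	if (--lk->refs == 0)
		free(lk);
}

static void save_slc_lock(struct slc_lock *lk, unsigned long long transno)
{
	lk->transno = transno;
	if (!lk->on_list)
		lk->on_list = true;	/* the list now owns one reference */
	else
		lock_put(lk);		/* second user: drop the extra ref */
}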
+ */ +void tgt_cancel_slc_locks(struct lu_target *lut, __u64 transno) +{ + struct ldlm_lock *lock, *next; + LIST_HEAD(list); + struct lustre_handle lockh; + + spin_lock(&lut->lut_slc_locks_guard); + list_for_each_entry_safe(lock, next, &lut->lut_slc_locks, + l_slc_link) { + lock_res_and_lock(lock); + LASSERT(lock->l_transno != 0); + if (lock->l_transno > transno) { + unlock_res_and_lock(lock); + continue; + } + /* ouch, another operation is using it after it's saved */ + if (lock->l_readers != 0 || lock->l_writers != 0) { + unlock_res_and_lock(lock); + continue; + } + /* set CBPENDING so that this lock won't be used again */ + ldlm_set_cbpending(lock); + lock->l_transno = 0; + list_move(&lock->l_slc_link, &list); + unlock_res_and_lock(lock); + } + spin_unlock(&lut->lut_slc_locks_guard); + + list_for_each_entry_safe(lock, next, &list, l_slc_link) { + list_del_init(&lock->l_slc_link); + ldlm_lock2handle(lock, &lockh); + ldlm_cli_cancel(&lockh, LCF_ASYNC); + LDLM_LOCK_PUT(lock); + } +} + +int tgt_init(const struct lu_env *env, struct lu_target *lut, + struct obd_device *obd, struct dt_device *dt, + struct tgt_opc_slice *slice, int request_fail_id, + int reply_fail_id) +{ + struct dt_object_format dof; + struct lu_attr attr; + struct lu_fid fid; + struct dt_object *o; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct obd_statfs *osfs; + int i, rc = 0; + + ENTRY; + + LASSERT(lut); + LASSERT(obd); + lut->lut_obd = obd; + lut->lut_bottom = dt; + lut->lut_last_rcvd = NULL; + lut->lut_client_bitmap = NULL; + atomic_set(&lut->lut_num_clients, 0); + atomic_set(&lut->lut_client_generation, 0); + lut->lut_reply_data = NULL; + lut->lut_reply_bitmap = NULL; + obd->u.obt.obt_lut = lut; + obd->u.obt.obt_magic = OBT_MAGIC; + + /* set request handler slice and parameters */ + lut->lut_slice = slice; + lut->lut_reply_fail_id = reply_fail_id; + lut->lut_request_fail_id = request_fail_id; + + /* sptlrcp variables init */ + rwlock_init(&lut->lut_sptlrpc_lock); + sptlrpc_rule_set_init(&lut->lut_sptlrpc_rset); + + spin_lock_init(&lut->lut_flags_lock); + lut->lut_sync_lock_cancel = SYNC_LOCK_CANCEL_NEVER; + lut->lut_cksum_t10pi_enforce = 0; + lut->lut_cksum_types_supported = + obd_cksum_types_supported_server(obd->obd_name); + + spin_lock_init(&lut->lut_slc_locks_guard); + INIT_LIST_HEAD(&lut->lut_slc_locks); + + /* last_rcvd initialization is needed by replayable targets only */ + if (!obd->obd_replayable) + RETURN(0); + + /* initialize grant and statfs data in target */ + dt_conf_get(env, lut->lut_bottom, &lut->lut_dt_conf); + + /* statfs data */ + spin_lock_init(&tgd->tgd_osfs_lock); + tgd->tgd_osfs_age = ktime_get_seconds() - 1000; + tgd->tgd_osfs_unstable = 0; + tgd->tgd_statfs_inflight = 0; + tgd->tgd_osfs_inflight = 0; + + /* grant data */ + spin_lock_init(&tgd->tgd_grant_lock); + tgd->tgd_tot_dirty = 0; + tgd->tgd_tot_granted = 0; + tgd->tgd_tot_pending = 0; + tgd->tgd_grant_compat_disable = 0; + + /* populate cached statfs data */ + osfs = &tgt_th_info(env)->tti_u.osfs; + rc = tgt_statfs_internal(env, lut, osfs, 0, NULL); + if (rc != 0) { + CERROR("%s: can't get statfs data, rc %d\n", tgt_name(lut), + rc); + GOTO(out, rc); + } + if (!is_power_of_2(osfs->os_bsize)) { + CERROR("%s: blocksize (%d) is not a power of 2\n", + tgt_name(lut), osfs->os_bsize); + GOTO(out, rc = -EPROTO); + } + tgd->tgd_blockbits = fls(osfs->os_bsize) - 1; + + spin_lock_init(&lut->lut_translock); + spin_lock_init(&lut->lut_client_bitmap_lock); + + OBD_ALLOC(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3); + if 
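tgt_cancel_slc_locks() uses a two-phase pattern: matching entries are moved to a private list while the guard lock is held, and the potentially blocking cancel is issued afterwards without that lock. A simplified userspace sketch (hypothetical types, pthread mutex standing in for the spinlock):

#include <pthread.h>
#include <stdlib.h>

struct entry {
	unsigned long long transno;
	struct entry *next;
};

static void cancel_committed(struct entry **list, pthread_mutex_t *guard,
			     unsigned long long committed,
			     void (*cancel)(struct entry *))
{
	struct entry *batch = NULL, **pp, *e;

	pthread_mutex_lock(guard);
	for (pp = list; (e = *pp) != NULL; ) {
		if (e->transno <= committed) {
			*pp = e->next;		/* detach under the lock */
			e->next = batch;
			batch = e;
		} else {
			pp = &e->next;
		}
	}
	pthread_mutex_unlock(guard);

	while ((e = batch) != NULL) {		/* cancel outside the lock */
		batch = e->next;
		cancel(e);
		free(e);
	}
}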
(lut->lut_client_bitmap == NULL) + RETURN(-ENOMEM); + + memset(&attr, 0, sizeof(attr)); + attr.la_valid = LA_MODE; + attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dof.dof_type = dt_mode_to_dft(S_IFREG); + + lu_local_obj_fid(&fid, LAST_RECV_OID); + + o = dt_find_or_create(env, lut->lut_bottom, &fid, &dof, &attr); + if (IS_ERR(o)) { + rc = PTR_ERR(o); + CERROR("%s: cannot open LAST_RCVD: rc = %d\n", tgt_name(lut), + rc); + GOTO(out_put, rc); + } + + lut->lut_last_rcvd = o; + rc = tgt_server_data_init(env, lut); + if (rc < 0) + GOTO(out_put, rc); + + /* prepare transactions callbacks */ + lut->lut_txn_cb.dtc_txn_start = tgt_txn_start_cb; + lut->lut_txn_cb.dtc_txn_stop = tgt_txn_stop_cb; + lut->lut_txn_cb.dtc_cookie = lut; + lut->lut_txn_cb.dtc_tag = LCT_DT_THREAD | LCT_MD_THREAD; + INIT_LIST_HEAD(&lut->lut_txn_cb.dtc_linkage); + + dt_txn_callback_add(lut->lut_bottom, &lut->lut_txn_cb); + lut->lut_bottom->dd_lu_dev.ld_site->ls_tgt = lut; + + lut->lut_fmd_max_num = LUT_FMD_MAX_NUM_DEFAULT; + lut->lut_fmd_max_age = LUT_FMD_MAX_AGE_DEFAULT; + + atomic_set(&lut->lut_sync_count, 0); + + /* reply_data is supported by MDT targets only for now */ + if (strncmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) != 0) + RETURN(0); + + OBD_ALLOC(lut->lut_reply_bitmap, + LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *)); + if (lut->lut_reply_bitmap == NULL) + GOTO(out, rc = -ENOMEM); + + memset(&attr, 0, sizeof(attr)); + attr.la_valid = LA_MODE; + attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dof.dof_type = dt_mode_to_dft(S_IFREG); + + lu_local_obj_fid(&fid, REPLY_DATA_OID); + + o = dt_find_or_create(env, lut->lut_bottom, &fid, &dof, &attr); + if (IS_ERR(o)) { + rc = PTR_ERR(o); + CERROR("%s: cannot open REPLY_DATA: rc = %d\n", tgt_name(lut), + rc); + GOTO(out, rc); + } + lut->lut_reply_data = o; + + rc = tgt_reply_data_init(env, lut); + if (rc < 0) + GOTO(out, rc); + + RETURN(0); + +out: + dt_txn_callback_del(lut->lut_bottom, &lut->lut_txn_cb); +out_put: + obd->u.obt.obt_magic = 0; + obd->u.obt.obt_lut = NULL; + if (lut->lut_last_rcvd != NULL) { + dt_object_put(env, lut->lut_last_rcvd); + lut->lut_last_rcvd = NULL; + } + if (lut->lut_client_bitmap != NULL) + OBD_FREE(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3); + lut->lut_client_bitmap = NULL; + if (lut->lut_reply_data != NULL) + dt_object_put(env, lut->lut_reply_data); + lut->lut_reply_data = NULL; + if (lut->lut_reply_bitmap != NULL) { + for (i = 0; i < LUT_REPLY_SLOTS_MAX_CHUNKS; i++) { + if (lut->lut_reply_bitmap[i] != NULL) + OBD_FREE_LARGE(lut->lut_reply_bitmap[i], + BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + lut->lut_reply_bitmap[i] = NULL; + } + OBD_FREE(lut->lut_reply_bitmap, + LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *)); + } + lut->lut_reply_bitmap = NULL; + return rc; +} +EXPORT_SYMBOL(tgt_init); + +void tgt_fini(const struct lu_env *env, struct lu_target *lut) +{ + int i; + int rc; + ENTRY; + + if (lut->lut_lsd.lsd_feature_incompat & OBD_INCOMPAT_MULTI_RPCS && + atomic_read(&lut->lut_num_clients) == 0) { + /* Clear MULTI RPCS incompatibility flag that prevents previous + * Lustre versions to mount a target with reply_data file */ + lut->lut_lsd.lsd_feature_incompat &= ~OBD_INCOMPAT_MULTI_RPCS; + rc = tgt_server_data_update(env, lut, 1); + if (rc < 0) + CERROR("%s: unable to clear MULTI RPCS " + "incompatibility flag\n", + lut->lut_obd->obd_name); + } + + sptlrpc_rule_set_free(&lut->lut_sptlrpc_rset); + + if (lut->lut_reply_data != NULL) + dt_object_put(env, lut->lut_reply_data); + lut->lut_reply_data = NULL; + if 
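The error handling in tgt_init() acquires resources in order and releases them in reverse order through a single goto-based error path. A minimal sketch of that unwinding style, with hypothetical placeholder resources:

#include <errno.h>
#include <stdlib.h>

struct tgt {
	void *client_bitmap;
	void *last_rcvd;
	void *reply_data;
};

static int tgt_init_sketch(struct tgt *t)
{
	int rc;

	t->client_bitmap = calloc(1, 4096);
	if (t->client_bitmap == NULL)
		return -ENOMEM;

	t->last_rcvd = malloc(64);		/* stand-in for LAST_RCVD */
	if (t->last_rcvd == NULL) {
		rc = -ENOMEM;
		goto out_bitmap;
	}

	t->reply_data = malloc(64);		/* stand-in for REPLY_DATA */
	if (t->reply_data == NULL) {
		rc = -ENOMEM;
		goto out_last;
	}
	return 0;

out_last:
	free(t->last_rcvd);
	t->last_rcvd = NULL;
out_bitmap:
	free(t->client_bitmap);
	t->client_bitmap = NULL;
	return rc;
}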
(lut->lut_reply_bitmap != NULL) { + for (i = 0; i < LUT_REPLY_SLOTS_MAX_CHUNKS; i++) { + if (lut->lut_reply_bitmap[i] != NULL) + OBD_FREE_LARGE(lut->lut_reply_bitmap[i], + BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + lut->lut_reply_bitmap[i] = NULL; + } + OBD_FREE(lut->lut_reply_bitmap, + LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *)); + } + lut->lut_reply_bitmap = NULL; + if (lut->lut_client_bitmap) { + OBD_FREE(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3); + lut->lut_client_bitmap = NULL; + } + if (lut->lut_last_rcvd) { + dt_txn_callback_del(lut->lut_bottom, &lut->lut_txn_cb); + dt_object_put(env, lut->lut_last_rcvd); + lut->lut_last_rcvd = NULL; + } + EXIT; +} +EXPORT_SYMBOL(tgt_fini); + +static struct kmem_cache *tgt_thread_kmem; +static struct kmem_cache *tgt_session_kmem; +struct kmem_cache *tgt_fmd_kmem; + +static struct lu_kmem_descr tgt_caches[] = { + { + .ckd_cache = &tgt_thread_kmem, + .ckd_name = "tgt_thread_kmem", + .ckd_size = sizeof(struct tgt_thread_info), + }, + { + .ckd_cache = &tgt_session_kmem, + .ckd_name = "tgt_session_kmem", + .ckd_size = sizeof(struct tgt_session_info) + }, + { + .ckd_cache = &tgt_fmd_kmem, + .ckd_name = "tgt_fmd_cache", + .ckd_size = sizeof(struct tgt_fmd_data) + }, + { + .ckd_cache = NULL + } +}; + + +/* context key constructor/destructor: tg_key_init, tg_key_fini */ +static void *tgt_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct tgt_thread_info *thread; + + OBD_SLAB_ALLOC_PTR_GFP(thread, tgt_thread_kmem, GFP_NOFS); + if (thread == NULL) + return ERR_PTR(-ENOMEM); + + return thread; +} + +static void tgt_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct tgt_thread_info *info = data; + struct thandle_exec_args *args = &info->tti_tea; + int i; + + for (i = 0; i < args->ta_alloc_args; i++) { + if (args->ta_args[i] != NULL) + OBD_FREE_PTR(args->ta_args[i]); + } + + if (args->ta_args != NULL) + OBD_FREE_PTR_ARRAY(args->ta_args, args->ta_alloc_args); + OBD_SLAB_FREE_PTR(info, tgt_thread_kmem); +} + +static void tgt_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct tgt_thread_info *tti = data; + + tti->tti_has_trans = 0; + tti->tti_mult_trans = 0; +} + +/* context key: tg_thread_key */ +struct lu_context_key tgt_thread_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD, + .lct_init = tgt_key_init, + .lct_fini = tgt_key_fini, + .lct_exit = tgt_key_exit, +}; + +LU_KEY_INIT_GENERIC(tgt); + +static void *tgt_ses_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct tgt_session_info *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, tgt_session_kmem, GFP_NOFS); + if (session == NULL) + return ERR_PTR(-ENOMEM); + + return session; +} + +static void tgt_ses_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct tgt_session_info *session = data; + + OBD_SLAB_FREE_PTR(session, tgt_session_kmem); +} + +/* context key: tgt_session_key */ +struct lu_context_key tgt_session_key = { + .lct_tags = LCT_SERVER_SESSION, + .lct_init = tgt_ses_key_init, + .lct_fini = tgt_ses_key_fini, +}; +EXPORT_SYMBOL(tgt_session_key); + +LU_KEY_INIT_GENERIC(tgt_ses); + +/* + * this page is allocated statically when module is initializing + * it is used to simulate data corruptions, see ost_checksum_bulk() + * for details. as the original pages provided by the layers below + * can be remain in the internal cache, we do not want to modify + * them. 
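The reply-slot bitmap is an array of lazily allocated chunks, and tgt_fini() frees each chunk before freeing the pointer array itself. A simplified sketch with hypothetical chunk geometry:

#include <stdlib.h>

#define MAX_CHUNKS	32		/* hypothetical chunk count */

static void free_reply_bitmap(unsigned long **bitmap)
{
	int i;

	if (bitmap == NULL)
		return;
	for (i = 0; i < MAX_CHUNKS; i++)
		free(bitmap[i]);	/* free(NULL) is a no-op */
	free(bitmap);
}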
+ */ +struct page *tgt_page_to_corrupt; + +int tgt_mod_init(void) +{ + int result; + ENTRY; + + result = lu_kmem_init(tgt_caches); + if (result != 0) + RETURN(result); + + tgt_page_to_corrupt = alloc_page(GFP_KERNEL); + + tgt_key_init_generic(&tgt_thread_key, NULL); + lu_context_key_register_many(&tgt_thread_key, NULL); + + tgt_ses_key_init_generic(&tgt_session_key, NULL); + lu_context_key_register_many(&tgt_session_key, NULL); + barrier_init(); + + update_info_init(); + + RETURN(0); +} + +void tgt_mod_exit(void) +{ + barrier_fini(); + if (tgt_page_to_corrupt != NULL) + put_page(tgt_page_to_corrupt); + + lu_context_key_degister(&tgt_thread_key); + lu_context_key_degister(&tgt_session_key); + update_info_fini(); + + lu_kmem_fini(tgt_caches); +} + diff --git a/drivers/staging/lustrefsx/lustre/target/update_records.c b/drivers/staging/lustrefsx/lustre/target/update_records.c new file mode 100644 index 0000000000000..56a833c03069f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/update_records.c @@ -0,0 +1,1232 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, 2017, Intel Corporation. + */ + +/* + * lustre/target/update_records.c + * + * This file implement the methods to pack updates as update records, which + * will be written to the disk as llog record, and might be used during + * recovery. + * + * For cross-MDT operation, all of updates of the operation needs to be + * recorded in the disk, then during recovery phase, the recovery thread + * will retrieve and redo these updates if it needed. + * + * See comments above struct update_records for the format of update_records. + * + * Author: Di Wang + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +#include "tgt_internal.h" + +#define UPDATE_RECORDS_BUFFER_SIZE 8192 +#define UPDATE_PARAMS_BUFFER_SIZE 8192 +/** + * Dump update record. + * + * Dump all of updates in the update_records, mostly for debugging purpose. + * + * \param[in] records update records to be dumpped + * \param[in] mask debug level mask + * \param[in] dump_params if dump all of updates the updates. 
+ * + */ +void update_records_dump(const struct update_records *records, + unsigned int mask, bool dump_updates) +{ + const struct update_ops *ops; + const struct update_op *op = NULL; + struct update_params *params = NULL; + unsigned int i; + + CDEBUG(mask, "master transno = %llu batchid = %llu flags = %x" + " ops = %d params = %d\n", records->ur_master_transno, + records->ur_batchid, records->ur_flags, records->ur_update_count, + records->ur_param_count); + + if (records->ur_update_count == 0) + return; + + if (!dump_updates) + return; + + ops = &records->ur_ops; + if (records->ur_param_count > 0) + params = update_records_get_params(records); + + op = &ops->uops_op[0]; + for (i = 0; i < records->ur_update_count; i++, + op = update_op_next_op(op)) { + unsigned int j; + + CDEBUG(mask, "update %dth "DFID" %s params_count = %hu\n", i, + PFID(&op->uop_fid), update_op_str(op->uop_type), + op->uop_param_count); + + if (params == NULL) + continue; + + for (j = 0; j < op->uop_param_count; j++) { + struct object_update_param *param; + + param = update_params_get_param(params, + (unsigned int)op->uop_params_off[j], + records->ur_param_count); + + if (param == NULL) + continue; + CDEBUG(mask, "param = %p %dth off = %hu size = %hu\n", + param, j, op->uop_params_off[j], param->oup_len); + } + } +} + +/** + * Pack parameters to update records + * + * Find and insert parameter to update records, if the parameter + * already exists in \a params, then just return the offset of this + * parameter, otherwise insert the parameter and return its offset + * + * \param[in] params update params in which to insert parameter + * \param[in] new_param parameters to be inserted. + * \param[in] new_param_size the size of \a new_param + * + * \retval index inside \a params if parameter insertion + * succeeds. + * \retval negative errno if it fails. + */ +static unsigned int update_records_param_pack(struct update_params *params, + const void *new_param, + size_t new_param_size, + unsigned int *param_count) +{ + struct object_update_param *param; + unsigned int i; + + for (i = 0; i < *param_count; i++) { + struct object_update_param *param; + + param = update_params_get_param(params, i, *param_count); + if ((new_param == NULL && param->oup_len == new_param_size) || + (param->oup_len == new_param_size && + memcmp(param->oup_buf, new_param, new_param_size) == 0)) + /* Found the parameter and return its index */ + return i; + } + + param = (struct object_update_param *)((char *)params + + update_params_size(params, *param_count)); + + param->oup_len = new_param_size; + if (new_param != NULL) + memcpy(param->oup_buf, new_param, new_param_size); + + *param_count = *param_count + 1; + + return *param_count - 1; +} + +/** + * Pack update to update records + * + * Pack the update and its parameters to the update records. First it will + * insert parameters, get the offset of these parameter, then fill the + * update with these offset. If insertion exceed the maximum size of + * current update records, it will return -E2BIG here, and the caller might + * extend the update_record size \see lod_updates_pack. + * + * \param[in] env execution environment + * \param[in] fid FID of the update. 
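update_records_param_pack() deduplicates parameters: a new parameter is appended only if no existing parameter has the same length and contents, otherwise the index of the existing one is reused so several updates can share one buffer. A userspace sketch of that idea, with a hypothetical fixed-size parameter layout:

#include <stdint.h>
#include <string.h>

struct param {
	uint16_t len;
	unsigned char buf[64];
};

static int param_pack(struct param *params, unsigned int *count,
		      unsigned int max, const void *data, uint16_t len)
{
	unsigned int i;

	for (i = 0; i < *count; i++)
		if (params[i].len == len &&
		    memcmp(params[i].buf, data, len) == 0)
			return (int)i;	/* reuse the existing slot */

	if (*count >= max || len > sizeof(params[0].buf))
		return -1;		/* caller must extend the buffer */

	params[*count].len = len;
	memcpy(params[*count].buf, data, len);
	(*count)++;
	return (int)(*count - 1);
}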
+ * \param[in] op_type operation type of the update + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] param_bufs buffers of parameters + * \param[in] params_buf_count the count of the parameter buffers + * \param[in] param_size sizes of parameters + * + * \retval 0 if packing succeeds + * \retval negative errno if packing fails + */ +static int update_records_update_pack(const struct lu_env *env, + const struct lu_fid *fid, + enum update_type op_type, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_op_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + unsigned int param_bufs_count, + const void **param_bufs, + size_t *param_sizes) +{ + struct update_op *op; + size_t total_param_sizes = 0; + int index; + unsigned int i; + + /* Check whether the packing exceeding the maximum update size */ + if (unlikely(*max_op_size < update_op_size(param_bufs_count))) { + CDEBUG(D_INFO, "max_op_size = %zu update_op = %zu\n", + *max_op_size, update_op_size(param_bufs_count)); + *max_op_size = update_op_size(param_bufs_count); + return -E2BIG; + } + + for (i = 0; i < param_bufs_count; i++) + total_param_sizes += + cfs_size_round(sizeof(struct object_update_param) + + param_sizes[i]); + + /* Check whether the packing exceeding the maximum parameter size */ + if (unlikely(*max_param_size < total_param_sizes)) { + CDEBUG(D_INFO, "max_param_size = %zu params size = %zu\n", + *max_param_size, total_param_sizes); + + *max_param_size = total_param_sizes; + return -E2BIG; + } + + op = update_ops_get_op(ops, *op_count, *op_count); + op->uop_fid = *fid; + op->uop_type = op_type; + op->uop_param_count = param_bufs_count; + for (i = 0; i < param_bufs_count; i++) { + index = update_records_param_pack(params, param_bufs[i], + param_sizes[i], param_count); + if (index < 0) + return index; + + CDEBUG(D_INFO, "%s %uth param offset = %d size = %zu\n", + update_op_str(op_type), i, index, param_sizes[i]); + + op->uop_params_off[i] = index; + } + CDEBUG(D_INFO, "%huth "DFID" %s param_count = %u\n", + *op_count, PFID(fid), update_op_str(op_type), *param_count); + + *op_count = *op_count + 1; + + return 0; +} + +/** + * Calculate update_records size + * + * Calculate update_records size by param_count and param_sizes array. + * + * \param[in] param_count the count of parameters + * \param[in] sizes the size array of these parameters + * + * \retval the size of this update + */ +static size_t update_records_update_size(__u32 param_count, size_t *sizes) +{ + int i; + size_t size; + + /* Check whether the packing exceeding the maximum update size */ + size = update_op_size(param_count); + + for (i = 0; i < param_count; i++) + size += cfs_size_round(sizeof(struct object_update_param) + + sizes[i]); + + return size; +} + +/** + * Calculate create update size + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in] fid FID of the object to be created + * \param[in] attr attribute of the object to be created + * \param[in] hint creation hint + * \param[in] dof creation format information + * + * \retval size of create update. 
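The size arithmetic in update_records_update_size() adds a fixed update_op header that grows with the number of parameter offsets, plus one rounded header-and-payload block per parameter (the role cfs_size_round() plays in the patch). A worked sketch with illustrative, not on-disk, sizes:

#include <stddef.h>
#include <stdint.h>

#define ROUND8(x)	(((x) + 7u) & ~7u)

#define OP_HDR_SIZE	24u	/* hypothetical fixed update_op header */
#define PARAM_HDR_SIZE	4u	/* hypothetical object_update_param header */

static size_t update_size(uint32_t param_count, const size_t *sizes)
{
	size_t size = ROUND8(OP_HDR_SIZE + param_count * sizeof(uint16_t));
	uint32_t i;

	for (i = 0; i < param_count; i++)
		size += ROUND8(PARAM_HDR_SIZE + sizes[i]);

	return size;
}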
+ */ +size_t update_records_create_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof) +{ + size_t sizes[2]; + int param_count = 0; + + if (attr != NULL) { + sizes[param_count] = sizeof(struct obdo); + param_count++; + } + + if (hint != NULL && hint->dah_parent != NULL) { + sizes[param_count] = sizeof(*fid); + param_count++; + } + + return update_records_update_size(param_count, sizes); +} +EXPORT_SYMBOL(update_records_create_size); + +/** + * Pack create update + * + * Pack create update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to be created + * \param[in] attr attribute of the object to be created + * \param[in] hint creation hint + * \param[in] dof creation format information + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_create_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof) +{ + size_t sizes[2]; + const void *bufs[2]; + int buf_count = 0; + const struct lu_fid *parent_fid = NULL; + struct lu_fid tmp_fid; + int rc; + struct obdo *obdo; + + if (attr != NULL) { + obdo = &update_env_info(env)->uti_obdo; + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + bufs[buf_count] = obdo; + sizes[buf_count] = sizeof(*obdo); + buf_count++; + } + + if (hint != NULL && hint->dah_parent != NULL) { + parent_fid = lu_object_fid(&hint->dah_parent->do_lu); + fid_cpu_to_le(&tmp_fid, parent_fid); + bufs[buf_count] = &tmp_fid; + sizes[buf_count] = sizeof(tmp_fid); + buf_count++; + } + + rc = update_records_update_pack(env, fid, OUT_CREATE, ops, op_count, + max_ops_size, params, param_count, + max_param_size, buf_count, bufs, sizes); + return rc; +} +EXPORT_SYMBOL(update_records_create_pack); + +/** + * Calculate attr set update size + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in] fid FID of the object to set attr + * \param[in] attr attribute of attr set + * + * \retval size of attr set update. + */ +size_t update_records_attr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr) +{ + size_t size = sizeof(struct obdo); + + return update_records_update_size(1, &size); +} +EXPORT_SYMBOL(update_records_attr_set_size); + +/** + * Pack attr set update + * + * Pack attr_set update into update records. 
+ * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to set attr + * \param[in] attr attribute of attr set + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_attr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr) +{ + struct obdo *obdo = &update_env_info(env)->uti_obdo; + size_t size = sizeof(*obdo); + + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + return update_records_update_pack(env, fid, OUT_ATTR_SET, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 1, + (const void **)&obdo, &size); +} +EXPORT_SYMBOL(update_records_attr_set_pack); + +/** + * Calculate ref add update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to add reference + * + * \retval size of ref_add udpate. + */ +size_t update_records_ref_add_size(const struct lu_env *env, + const struct lu_fid *fid) +{ + return update_records_update_size(0, NULL); +} +EXPORT_SYMBOL(update_records_ref_add_size); + +/** + * Pack ref add update + * + * Pack ref add update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to add reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_ref_add_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_REF_ADD, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_ref_add_pack); + +/** + * Pack noop update + * + * Pack no op update into update records. Note: no op means + * the update does not need do anything, which is only used + * in test case to verify large size record. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to add reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. 
+ */ +int update_records_noop_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_NOOP, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_noop_pack); + +/** + * Calculate ref del update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete reference + * + * \retval size of ref_del update. + */ +size_t update_records_ref_del_size(const struct lu_env *env, + const struct lu_fid *fid) +{ + return update_records_update_size(0, NULL); +} +EXPORT_SYMBOL(update_records_ref_del_size); + +/** + * Pack ref del update + * + * Pack ref del update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to delete reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_ref_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_REF_DEL, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_ref_del_pack); + +/** + * Calculate object destroy update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete reference + * + * \retval size of object destroy update. + */ +size_t update_records_destroy_size(const struct lu_env *env, + const struct lu_fid *fid) +{ + return update_records_update_size(0, NULL); +} +EXPORT_SYMBOL(update_records_destroy_size); + +/** + * Pack object destroy update + * + * Pack object destroy update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to delete reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. 
+ */ +int update_records_destroy_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_DESTROY, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_destroy_pack); + +/** + * Calculate index insert update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to insert index + * \param[in] rec record of insertion + * \param[in] key key of insertion + * + * \retval the size of index insert update. + */ +size_t update_records_index_insert_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key) +{ + size_t sizes[3] = { strlen((const char *)key) + 1, + sizeof(struct lu_fid), + sizeof(__u32) }; + return update_records_update_size(3, sizes); +} +EXPORT_SYMBOL(update_records_index_insert_size); + +/** + * Pack index insert update + * + * Pack index insert update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to insert index + * \param[in] rec record of insertion + * \param[in] key key of insertion + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_index_insert_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key) +{ + struct dt_insert_rec *rec1 = (struct dt_insert_rec *)rec; + struct lu_fid rec_fid; + __u32 type = cpu_to_le32(rec1->rec_type); + size_t sizes[3] = { strlen((const char *)key) + 1, + sizeof(rec_fid), + sizeof(type) }; + const void *bufs[3] = { key, + &rec_fid, + &type }; + + fid_cpu_to_le(&rec_fid, rec1->rec_fid); + + return update_records_update_pack(env, fid, OUT_INDEX_INSERT, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 3, bufs, sizes); +} +EXPORT_SYMBOL(update_records_index_insert_pack); + +/** + * Calculate index delete update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete index + * \param[in] key key of deletion + * + * \retval the size of index delete update + */ +size_t update_records_index_delete_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct dt_key *key) +{ + size_t size = strlen((const char *)key) + 1; + + return update_records_update_size(1, &size); +} +EXPORT_SYMBOL(update_records_index_delete_size); + +/** + * Pack index delete update + * + * Pack index delete update into update records. 
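The index-insert packer converts multi-byte fields to little-endian before storing them as parameters, so the llog record is interpreted identically on all architectures. A sketch of that wire discipline with a hypothetical, simplified record layout:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void put_le32(unsigned char *p, uint32_t v)
{
	p[0] = (unsigned char)(v & 0xff);
	p[1] = (unsigned char)((v >> 8) & 0xff);
	p[2] = (unsigned char)((v >> 16) & 0xff);
	p[3] = (unsigned char)((v >> 24) & 0xff);
}

static void put_le64(unsigned char *p, uint64_t v)
{
	put_le32(p, (uint32_t)v);
	put_le32(p + 4, (uint32_t)(v >> 32));
}

/* Pack name + 64-bit id + 32-bit type into a caller-provided buffer. */
static size_t pack_index_insert(unsigned char *buf, const char *name,
				uint64_t id, uint32_t type)
{
	size_t n = strlen(name) + 1;	/* key is stored NUL-terminated */

	memcpy(buf, name, n);
	put_le64(buf + n, id);
	put_le32(buf + n + 8, type);
	return n + 12;
}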
+ * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|ount] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to delete index + * \param[in] key key of deletion + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_index_delete_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_key *key) +{ + size_t size = strlen((const char *)key) + 1; + + return update_records_update_pack(env, fid, OUT_INDEX_DELETE, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 1, (const void **)&key, &size); +} +EXPORT_SYMBOL(update_records_index_delete_pack); + +/** + * Calculate xattr set size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to set xattr + * \param[in] buf xattr to be set + * \param[in] name name of the xattr + * \param[in] flag flag for setting xattr + * + * \retval size of xattr set update. + */ +size_t update_records_xattr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_buf *buf, + const char *name, __u32 flag) +{ + size_t sizes[3] = {strlen(name) + 1, buf->lb_len, sizeof(flag)}; + + return update_records_update_size(3, sizes); +} +EXPORT_SYMBOL(update_records_xattr_set_size); + +/** + * Pack xattr set update + * + * Pack xattr set update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to set xattr + * \param[in] buf xattr to be set + * \param[in] name name of the xattr + * \param[in] flag flag for setting xattr + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_xattr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_buf *buf, const char *name, + __u32 flag) +{ + size_t sizes[3] = {strlen(name) + 1, buf->lb_len, sizeof(flag)}; + const void *bufs[3] = {name, buf->lb_buf, &flag}; + + flag = cpu_to_le32(flag); + + return update_records_update_pack(env, fid, OUT_XATTR_SET, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 3, bufs, sizes); +} +EXPORT_SYMBOL(update_records_xattr_set_pack); + +/** + * Calculate xattr delete update size. + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete xattr + * \param[in] name name of the xattr + * + * \retval size of xattr delet updatee. 
+ */ +size_t update_records_xattr_del_size(const struct lu_env *env, + const struct lu_fid *fid, + const char *name) +{ + size_t size = strlen(name) + 1; + + return update_records_update_size(1, &size); +} +EXPORT_SYMBOL(update_records_xattr_del_size); + +/** + * Pack xattr delete update + * + * Pack xattr delete update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to delete xattr + * \param[in] name name of the xattr + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_xattr_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const char *name) +{ + size_t size = strlen(name) + 1; + + return update_records_update_pack(env, fid, OUT_XATTR_DEL, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 1, (const void **)&name, &size); +} +EXPORT_SYMBOL(update_records_xattr_del_pack); + +/** + * Calculate write update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to write into + * \param[in] buf buffer to write which includes an embedded size field + * \param[in] pos offet in the object to start writing at + * + * \retval size of write udpate. + */ +size_t update_records_write_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_buf *buf, + __u64 pos) +{ + size_t sizes[2] = {buf->lb_len, sizeof(pos)}; + + return update_records_update_size(2, sizes); +} +EXPORT_SYMBOL(update_records_write_size); + +/** + * Pack write update + * + * Pack write update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to write into + * \param[in] buf buffer to write which includes an embedded size field + * \param[in] pos offet in the object to start writing at + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_write_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_buf *buf, + __u64 pos) +{ + size_t sizes[2] = {buf->lb_len, sizeof(pos)}; + const void *bufs[2] = {buf->lb_buf, &pos}; + + pos = cpu_to_le64(pos); + + return update_records_update_pack(env, fid, OUT_WRITE, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 2, bufs, sizes); +} +EXPORT_SYMBOL(update_records_write_pack); + +/** + * Calculate size of punch update. 
+ * + * \param[in] env execution environment + * \param[in] fid FID of the object to write into + * \param[in] start start offset of punch + * \param[in] end end offet of punch + * + * \retval size of update punch. + */ +size_t update_records_punch_size(const struct lu_env *env, + const struct lu_fid *fid, + __u64 start, __u64 end) +{ + size_t sizes[2] = {sizeof(start), sizeof(end)}; + + return update_records_update_size(2, sizes); +} +EXPORT_SYMBOL(update_records_punch_size); + +/** + * Pack punch + * + * Pack punch update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to write into + * \param[in] start start offset of punch + * \param[in] end end offet of punch + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_punch_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + __u64 start, __u64 end) +{ + size_t sizes[2] = {sizeof(start), sizeof(end)}; + const void *bufs[2] = {&start, &end}; + + start = cpu_to_le64(start); + end = cpu_to_le64(end); + + return update_records_update_pack(env, fid, OUT_PUNCH, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 2, bufs, sizes); +} +EXPORT_SYMBOL(update_records_punch_pack); + +/** + * Create update records in thandle_update_records + * + * Allocate update_records for thandle_update_records, the initial size + * will be 4KB. + * + * \param[in] tur thandle_update_records where update_records will be + * allocated + * \retval 0 if allocation succeeds. + * \retval negative errno if allocation fails. + */ +static int tur_update_records_create(struct thandle_update_records *tur) +{ + if (tur->tur_update_records != NULL) + return 0; + + OBD_ALLOC_LARGE(tur->tur_update_records, + UPDATE_RECORDS_BUFFER_SIZE); + + if (tur->tur_update_records == NULL) + return -ENOMEM; + + tur->tur_update_records_buf_size = UPDATE_RECORDS_BUFFER_SIZE; + + return 0; +} + +/** + * Extend update records + * + * Extend update_records to the new size in thandle_update_records. + * + * \param[in] tur thandle_update_records where update_records will be + * extended. + * \retval 0 if extension succeeds. + * \retval negative errno if extension fails. + */ +int tur_update_records_extend(struct thandle_update_records *tur, + size_t new_size) +{ + struct llog_update_record *record; + + OBD_ALLOC_LARGE(record, new_size); + if (record == NULL) + return -ENOMEM; + + if (tur->tur_update_records != NULL) { + memcpy(record, tur->tur_update_records, + tur->tur_update_records_buf_size); + OBD_FREE_LARGE(tur->tur_update_records, + tur->tur_update_records_buf_size); + } + + tur->tur_update_records = record; + tur->tur_update_records_buf_size = new_size; + + return 0; +} +EXPORT_SYMBOL(tur_update_records_extend); + +/** + * Extend update records + * + * Extend update records in thandle to make sure it is able to hold + * the update with certain update_op and params size. 
+ * + * \param [in] tur thandle_update_records to be extend + * \param [in] new_op_size update_op size of the update record + * \param [in] new_param_size params size of the update record + * + * \retval 0 if the update_records is being extended. + * \retval negative errno if the update_records is not being + * extended. + */ +int tur_update_extend(struct thandle_update_records *tur, + size_t new_op_size, size_t new_param_size) +{ + size_t record_size; + size_t params_size; + size_t extend_size; + int rc; + ENTRY; + + record_size = llog_update_record_size(tur->tur_update_records); + /* extend update records buffer */ + if (new_op_size >= (tur->tur_update_records_buf_size - record_size)) { + extend_size = round_up(new_op_size, UPDATE_RECORDS_BUFFER_SIZE); + rc = tur_update_records_extend(tur, + tur->tur_update_records_buf_size + + extend_size); + if (rc != 0) + RETURN(rc); + } + + /* extend parameters buffer */ + params_size = update_params_size(tur->tur_update_params, + tur->tur_update_param_count); + if (new_param_size >= (tur->tur_update_params_buf_size - + params_size)) { + extend_size = round_up(new_param_size, + UPDATE_PARAMS_BUFFER_SIZE); + rc = tur_update_params_extend(tur, + tur->tur_update_params_buf_size + + extend_size); + if (rc != 0) + RETURN(rc); + } + + RETURN(0); +} +EXPORT_SYMBOL(tur_update_extend); + +/** + * Create update params in thandle_update_records + * + * Allocate update_params for thandle_update_records, the initial size + * will be 4KB. + * + * \param[in] tur thandle_update_records where update_params will be + * allocated + * \retval 0 if allocation succeeds. + * \retval negative errno if allocation fails. + */ +static int tur_update_params_create(struct thandle_update_records *tur) +{ + if (tur->tur_update_params != NULL) + return 0; + + OBD_ALLOC_LARGE(tur->tur_update_params, UPDATE_PARAMS_BUFFER_SIZE); + if (tur->tur_update_params == NULL) + return -ENOMEM; + + tur->tur_update_params_buf_size = UPDATE_PARAMS_BUFFER_SIZE; + return 0; +} + +/** + * Extend update params + * + * Extend update_params to the new size in thandle_update_records. + * + * \param[in] tur thandle_update_records where update_params will be + * extended. + * \retval 0 if extension succeeds. + * \retval negative errno if extension fails. + */ +int tur_update_params_extend(struct thandle_update_records *tur, + size_t new_size) +{ + struct update_params *params; + + OBD_ALLOC_LARGE(params, new_size); + if (params == NULL) + return -ENOMEM; + + if (tur->tur_update_params != NULL) { + memcpy(params, tur->tur_update_params, + tur->tur_update_params_buf_size); + OBD_FREE_LARGE(tur->tur_update_params, + tur->tur_update_params_buf_size); + } + + tur->tur_update_params = params; + tur->tur_update_params_buf_size = new_size; + + return 0; +} +EXPORT_SYMBOL(tur_update_params_extend); + +/** + * Check and prepare whether it needs to record update. + * + * Checks if the transaction needs to record updates, and if it + * does, then initialize the update record buffer in the transaction. + * + * \param[in] env execution environment + * \param[in] th transaction handle + * + * \retval 0 if updates recording succeeds. + * \retval negative errno if updates recording fails. 
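tur_update_extend()/tur_update_records_extend() grow the buffers by whole chunks: when the pending update no longer fits, a new buffer is allocated whose size is the old size plus the shortfall rounded up to the chunk size, the old contents are copied over, and the old buffer is released. A simplified userspace sketch of that strategy:

#include <stdlib.h>
#include <string.h>

#define CHUNK_SIZE	8192u	/* mirrors the 8 KiB buffers above */

static int extend_buffer(void **buf, size_t *buf_size,
			 size_t used, size_t need)
{
	size_t grow, new_size;
	void *nbuf;

	if (need < *buf_size - used)
		return 0;			/* still fits */

	grow = ((need + CHUNK_SIZE - 1) / CHUNK_SIZE) * CHUNK_SIZE;
	new_size = *buf_size + grow;

	nbuf = malloc(new_size);
	if (nbuf == NULL)
		return -1;

	memcpy(nbuf, *buf, used);		/* keep the packed records */
	free(*buf);
	*buf = nbuf;
	*buf_size = new_size;
	return 0;
}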
+ */ +int check_and_prepare_update_record(const struct lu_env *env, + struct thandle_update_records *tur) +{ + struct llog_update_record *lur; + int rc; + + if (tur->tur_update_records == NULL) { + rc = tur_update_records_create(tur); + if (rc < 0) + RETURN(rc); + } + + if (tur->tur_update_params == NULL) { + rc = tur_update_params_create(tur); + if (rc < 0) + RETURN(rc); + } + + lur = tur->tur_update_records; + lur->lur_update_rec.ur_update_count = 0; + lur->lur_update_rec.ur_param_count = 0; + lur->lur_update_rec.ur_master_transno = 0; + lur->lur_update_rec.ur_batchid = 0; + lur->lur_update_rec.ur_flags = 0; + lur->lur_hdr.lrh_len = LLOG_MIN_CHUNK_SIZE; + + tur->tur_update_param_count = 0; + + RETURN(0); +} + +static void update_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct update_thread_info *info = data; + struct thandle_exec_args *args = &info->uti_tea; + int i; + + for (i = 0; i < args->ta_alloc_args; i++) { + if (args->ta_args[i] != NULL) + OBD_FREE_PTR(args->ta_args[i]); + } + + if (args->ta_args != NULL) + OBD_FREE_PTR_ARRAY(args->ta_args, args->ta_alloc_args); + + if (info->uti_tur.tur_update_records != NULL) + OBD_FREE_LARGE(info->uti_tur.tur_update_records, + info->uti_tur.tur_update_records_buf_size); + if (info->uti_tur.tur_update_params != NULL) + OBD_FREE_LARGE(info->uti_tur.tur_update_params, + info->uti_tur.tur_update_params_buf_size); + + OBD_FREE_PTR(info); +} + +/* context key constructor/destructor: update_key_init, update_key_fini */ +LU_KEY_INIT(update, struct update_thread_info); +/* context key: update_thread_key */ +LU_CONTEXT_KEY_DEFINE(update, LCT_MD_THREAD | LCT_MG_THREAD | + LCT_DT_THREAD | LCT_LOCAL); +EXPORT_SYMBOL(update_thread_key); +LU_KEY_INIT_GENERIC(update); + +void update_info_init(void) +{ + update_key_init_generic(&update_thread_key, NULL); + lu_context_key_register(&update_thread_key); +} + +void update_info_fini(void) +{ + lu_context_key_degister(&update_thread_key); +} diff --git a/drivers/staging/lustrefsx/lustre/target/update_recovery.c b/drivers/staging/lustrefsx/lustre/target/update_recovery.c new file mode 100644 index 0000000000000..b483a26c5857c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/update_recovery.c @@ -0,0 +1,1451 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, 2017, Intel Corporation. + */ + +/* + * lustre/target/update_recovery.c + * + * This file implement the methods to handle the update recovery. + * + * During DNE recovery, the recovery thread will redo the operation according + * to the transaction no, and these replay are either from client replay req + * or update replay records(for distribute transaction) in the update log. 
+ * For distribute transaction replay, the replay thread will call + * distribute_txn_replay_handle() to handle the updates. + * + * After the Master MDT restarts, it will retrieve the update records from all + * of MDTs, for each distributed operation, it will check updates on all MDTs, + * if some updates records are missing on some MDTs, the replay thread will redo + * updates on these MDTs. + * + * Author: Di Wang + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include + +#include "tgt_internal.h" + +/** + * Lookup distribute_txn_replay req + * + * Lookup distribute_txn_replay in the replay list by batchid. + * It is assumed the list has been locked before calling this function. + * + * \param[in] tdtd distribute_txn_data, which holds the replay + * list. + * \param[in] batchid batchid used by lookup. + * + * \retval pointer of the replay if succeeds. + * \retval NULL if can not find it. + */ +static struct distribute_txn_replay_req * +dtrq_lookup(struct target_distribute_txn_data *tdtd, __u64 batchid) +{ + struct distribute_txn_replay_req *tmp; + struct distribute_txn_replay_req *dtrq = NULL; + + list_for_each_entry(tmp, &tdtd->tdtd_replay_list, dtrq_list) { + if (tmp->dtrq_batchid == batchid) { + dtrq = tmp; + break; + } + } + return dtrq; +} + +/** + * insert distribute txn replay req + * + * Insert distribute txn replay to the replay list, and it assumes the + * list has been looked. Note: the replay list is a sorted list, which + * is sorted by master transno. It is assumed the replay list has been + * locked before calling this function. + * + * \param[in] tdtd target distribute txn data where replay list is + * \param[in] new distribute txn replay to be inserted + * + * \retval 0 if insertion succeeds + * \retval EEXIST if the dtrq already exists + */ +static int dtrq_insert(struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *new) +{ + struct distribute_txn_replay_req *iter; + + /* Check if the dtrq has been added to the list */ + iter = dtrq_lookup(tdtd, new->dtrq_batchid); + if (iter != NULL) + return -EEXIST; + + list_for_each_entry_reverse(iter, &tdtd->tdtd_replay_list, dtrq_list) { + if (iter->dtrq_master_transno > new->dtrq_master_transno) + continue; + + /* If there are mulitple replay req with same transno, then + * sort them with batchid */ + if (iter->dtrq_master_transno == new->dtrq_master_transno && + iter->dtrq_batchid > new->dtrq_batchid) + continue; + + list_add(&new->dtrq_list, &iter->dtrq_list); + break; + } + + if (list_empty(&new->dtrq_list)) + list_add(&new->dtrq_list, &tdtd->tdtd_replay_list); + + return 0; +} + +/** + * create distribute txn replay req + * + * Allocate distribute txn replay req according to the update records. + * + * \param[in] tdtd target distribute txn data where replay list is. + * \param[in] record update records from the update log. + * + * \retval the pointer of distribute txn replay req if + * the creation succeeds. + * \retval NULL if the creation fails. 
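As a purely illustrative aside: dtrq_insert() below keeps the replay list sorted by master transno, using batchid as the tie-breaker, and rejects a duplicate batchid with -EEXIST. A small self-contained sketch of that ordering rule over a plain singly linked list (simplified types, not the kernel's list_head API):

	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>

	struct replay_req {
		uint64_t master_transno;
		uint64_t batchid;
		struct replay_req *next;
	};

	/*
	 * Insert @new into *head, keeping the list sorted by
	 * (master_transno, batchid); refuse a duplicate batchid.
	 */
	static int replay_insert(struct replay_req **head, struct replay_req *new)
	{
		struct replay_req **pos;
		struct replay_req *it;

		/* First pass: a batchid that is already queued is an error. */
		for (it = *head; it != NULL; it = it->next)
			if (it->batchid == new->batchid)
				return -EEXIST;

		/* Second pass: find the first entry that sorts after @new. */
		for (pos = head; *pos != NULL; pos = &(*pos)->next) {
			if ((*pos)->master_transno > new->master_transno ||
			    ((*pos)->master_transno == new->master_transno &&
			     (*pos)->batchid > new->batchid))
				break;
		}
		new->next = *pos;
		*pos = new;
		return 0;
	}

	int main(void)
	{
		struct replay_req a = { 10, 1 }, b = { 10, 2 }, c = { 5, 3 };
		struct replay_req *head = NULL, *it;

		replay_insert(&head, &a);
		replay_insert(&head, &b);
		replay_insert(&head, &c);	/* lower transno, ends up first */

		for (it = head; it != NULL; it = it->next)
			printf("transno=%llu batchid=%llu\n",
			       (unsigned long long)it->master_transno,
			       (unsigned long long)it->batchid);
		return 0;
	}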
+ */ +static struct distribute_txn_replay_req * +dtrq_create(struct target_distribute_txn_data *tdtd, + struct llog_update_record *lur) +{ + struct distribute_txn_replay_req *new; + + OBD_ALLOC_PTR(new); + if (new == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + new->dtrq_lur_size = llog_update_record_size(lur); + OBD_ALLOC_LARGE(new->dtrq_lur, new->dtrq_lur_size); + if (new->dtrq_lur == NULL) { + OBD_FREE_PTR(new); + RETURN(ERR_PTR(-ENOMEM)); + } + + memcpy(new->dtrq_lur, lur, new->dtrq_lur_size); + + /* If the transno in the update record is 0, it means the + * update are from master MDT, and it will use the master + * last committed transno as its master transno. Later, if + * the update records are gotten from slave MDTs, then these + * transno will be replaced. + * See insert_update_records_to_replay_list(). */ + if (lur->lur_update_rec.ur_master_transno == 0) { + new->dtrq_lur->lur_update_rec.ur_master_transno = + tdtd->tdtd_lut->lut_obd->obd_last_committed; + new->dtrq_master_transno = + tdtd->tdtd_lut->lut_obd->obd_last_committed; + } else { + new->dtrq_master_transno = + lur->lur_update_rec.ur_master_transno; + } + + new->dtrq_batchid = lur->lur_update_rec.ur_batchid; + + spin_lock_init(&new->dtrq_sub_list_lock); + INIT_LIST_HEAD(&new->dtrq_sub_list); + INIT_LIST_HEAD(&new->dtrq_list); + + RETURN(new); +} + +/** + * Lookup distribute sub replay + * + * Lookup distribute sub replay in the sub list of distribute_txn_replay by + * mdt_index. + * + * \param[in] distribute_txn_replay_req the distribute txn replay req to lookup + * \param[in] mdt_index the mdt_index as the key of lookup + * + * \retval the pointer of sub replay if it can be found. + * \retval NULL if it can not find. + */ +struct distribute_txn_replay_req_sub * +dtrq_sub_lookup(struct distribute_txn_replay_req *dtrq, __u32 mdt_index) +{ + struct distribute_txn_replay_req_sub *dtrqs = NULL; + struct distribute_txn_replay_req_sub *tmp; + + list_for_each_entry(tmp, &dtrq->dtrq_sub_list, dtrqs_list) { + if (tmp->dtrqs_mdt_index == mdt_index) { + dtrqs = tmp; + break; + } + } + return dtrqs; +} + +/** + * Try to add cookie to sub distribute txn request + * + * Check if the update log cookie has been added to the request, if not, + * add it to the dtrqs_cookie_list. + * + * \param[in] dtrqs sub replay req where cookies to be added. + * \param[in] cookie cookie to be added. + * + * \retval 0 if the cookie is adding succeeds. + * \retval negative errno if adding fails. + */ +static int dtrq_sub_add_cookie(struct distribute_txn_replay_req_sub *dtrqs, + struct llog_cookie *cookie) +{ + struct sub_thandle_cookie *new; + + OBD_ALLOC_PTR(new); + if (new == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&new->stc_list); + new->stc_cookie = *cookie; + /* Note: only single thread will access one sub_request each time, + * so no need lock here */ + list_add(&new->stc_list, &dtrqs->dtrqs_cookie_list); + + return 0; +} + +/** + * Insert distribute txn sub req replay + * + * Allocate sub replay req and insert distribute txn replay list. + * + * \param[in] dtrq d to be added + * \param[in] cookie the cookie of the update record + * \param[in] mdt_index the mdt_index of the update record + * + * \retval 0 if the adding succeeds. + * \retval negative errno if the adding fails. 
+ */ +static int +dtrq_sub_create_and_insert(struct distribute_txn_replay_req *dtrq, + struct llog_cookie *cookie, + __u32 mdt_index) +{ + struct distribute_txn_replay_req_sub *dtrqs = NULL; + struct distribute_txn_replay_req_sub *new; + int rc; + ENTRY; + + spin_lock(&dtrq->dtrq_sub_list_lock); + dtrqs = dtrq_sub_lookup(dtrq, mdt_index); + spin_unlock(&dtrq->dtrq_sub_list_lock); + if (dtrqs != NULL) { + rc = dtrq_sub_add_cookie(dtrqs, cookie); + RETURN(0); + } + + OBD_ALLOC_PTR(new); + if (new == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&new->dtrqs_list); + INIT_LIST_HEAD(&new->dtrqs_cookie_list); + new->dtrqs_mdt_index = mdt_index; + spin_lock(&dtrq->dtrq_sub_list_lock); + dtrqs = dtrq_sub_lookup(dtrq, mdt_index); + if (dtrqs == NULL) { + list_add(&new->dtrqs_list, &dtrq->dtrq_sub_list); + dtrqs = new; + } else { + OBD_FREE_PTR(new); + } + spin_unlock(&dtrq->dtrq_sub_list_lock); + + rc = dtrq_sub_add_cookie(dtrqs, cookie); + + RETURN(rc); +} + +/** + * append updates to the current replay updates + * + * Append more updates to the existent replay update. And this is only + * used when combining mulitple updates into one large updates during + * replay. + * + * \param[in] dtrq the update replay request where the new update + * records will be added. + * \param[in] lur the new update record. + * + * \retval 0 if appending succeeds. + * \retval negative errno if appending fails. + */ +static int dtrq_append_updates(struct distribute_txn_replay_req *dtrq, + struct update_records *record) +{ + struct llog_update_record *new_lur; + size_t lur_size = dtrq->dtrq_lur_size; + void *ptr; + ENTRY; + + /* Because several threads might retrieve the same records from + * different targets, and we only need one copy of records. So + * we will check if the records is in the next one, if not, just + * skip it */ + spin_lock(&dtrq->dtrq_sub_list_lock); + if (dtrq->dtrq_lur->lur_update_rec.ur_index + 1 != record->ur_index) { + spin_unlock(&dtrq->dtrq_sub_list_lock); + RETURN(0); + } + dtrq->dtrq_lur->lur_update_rec.ur_index++; + spin_unlock(&dtrq->dtrq_sub_list_lock); + + lur_size += update_records_size(record); + OBD_ALLOC_LARGE(new_lur, lur_size); + if (new_lur == NULL) { + spin_lock(&dtrq->dtrq_sub_list_lock); + dtrq->dtrq_lur->lur_update_rec.ur_index--; + spin_unlock(&dtrq->dtrq_sub_list_lock); + RETURN(-ENOMEM); + } + + /* Copy the old and new records to the new allocated buffer */ + memcpy(new_lur, dtrq->dtrq_lur, dtrq->dtrq_lur_size); + ptr = (char *)&new_lur->lur_update_rec + + update_records_size(&new_lur->lur_update_rec); + memcpy(ptr, &record->ur_ops, + update_records_size(record) - + offsetof(struct update_records, ur_ops)); + + new_lur->lur_update_rec.ur_update_count += record->ur_update_count; + new_lur->lur_update_rec.ur_param_count += record->ur_param_count; + new_lur->lur_hdr.lrh_len = llog_update_record_size(new_lur); + + /* Replace the records */ + OBD_FREE_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + dtrq->dtrq_lur = new_lur; + dtrq->dtrq_lur_size = lur_size; + dtrq->dtrq_lur->lur_update_rec.ur_flags = record->ur_flags; + update_records_dump(&new_lur->lur_update_rec, D_INFO, true); + RETURN(0); +} + +/** + * Insert update records to the replay list. + * + * Allocate distribute txn replay req and insert it into the replay + * list, then insert the update records into the replay req. + * + * \param[in] tdtd distribute txn replay data where the replay list + * is. 
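As an illustrative aside: the lookup/allocate/re-lookup sequence in dtrq_sub_create_and_insert() above is the classic way to avoid allocating while holding a spinlock - check under the lock, drop it to allocate, then re-check under the lock and discard the allocation if another thread won the race. A standalone pthread-based sketch of the same idea (the table/entry types and find_locked() helper are hypothetical, not Lustre API):

	#include <pthread.h>
	#include <stdlib.h>

	struct entry {
		int key;
		struct entry *next;
	};

	struct table {
		pthread_mutex_t lock;
		struct entry *head;
	};

	/* Must be called with tbl->lock held. */
	static struct entry *find_locked(struct table *tbl, int key)
	{
		struct entry *e;

		for (e = tbl->head; e != NULL; e = e->next)
			if (e->key == key)
				return e;
		return NULL;
	}

	/*
	 * Return the entry for @key, creating it if needed.  The allocation
	 * happens outside the lock; if another thread inserted the same key
	 * in the meantime, the fresh entry is simply freed again -- exactly
	 * what dtrq_sub_create_and_insert() does with OBD_FREE_PTR(new).
	 */
	static struct entry *get_or_create(struct table *tbl, int key)
	{
		struct entry *e, *new;

		pthread_mutex_lock(&tbl->lock);
		e = find_locked(tbl, key);
		pthread_mutex_unlock(&tbl->lock);
		if (e != NULL)
			return e;

		new = calloc(1, sizeof(*new));
		if (new == NULL)
			return NULL;
		new->key = key;

		pthread_mutex_lock(&tbl->lock);
		e = find_locked(tbl, key);
		if (e == NULL) {
			new->next = tbl->head;
			tbl->head = new;
			e = new;
		} else {
			free(new);	/* lost the race */
		}
		pthread_mutex_unlock(&tbl->lock);

		return e;
	}

	int main(void)
	{
		struct table tbl = { PTHREAD_MUTEX_INITIALIZER, NULL };

		get_or_create(&tbl, 1);
		get_or_create(&tbl, 1);	/* second call reuses the entry */
		free(tbl.head);
		return 0;
	}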
+ * \param[in] record the update record + * \param[in] cookie cookie of the record + * \param[in] index mdt index of the record + * + * \retval 0 if the adding succeeds. + * \retval negative errno if the adding fails. + */ +int +insert_update_records_to_replay_list(struct target_distribute_txn_data *tdtd, + struct llog_update_record *lur, + struct llog_cookie *cookie, + __u32 mdt_index) +{ + struct distribute_txn_replay_req *dtrq; + struct update_records *record = &lur->lur_update_rec; + bool replace_record = false; + int rc = 0; + ENTRY; + + CDEBUG(D_HA, "%s: insert record batchid = %llu transno = %llu" + " mdt_index %u\n", tdtd->tdtd_lut->lut_obd->obd_name, + record->ur_batchid, record->ur_master_transno, mdt_index); + + /* Update batchid if necessary */ + spin_lock(&tdtd->tdtd_batchid_lock); + if (record->ur_batchid >= tdtd->tdtd_batchid) { + CDEBUG(D_HA, "%s update batchid from %llu" " to %llu\n", + tdtd->tdtd_lut->lut_obd->obd_name, + tdtd->tdtd_batchid, record->ur_batchid); + tdtd->tdtd_batchid = record->ur_batchid + 1; + } + spin_unlock(&tdtd->tdtd_batchid_lock); + +again: + spin_lock(&tdtd->tdtd_replay_list_lock); + /* First try to build the replay update request with the records */ + dtrq = dtrq_lookup(tdtd, record->ur_batchid); + if (dtrq == NULL) { + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq = dtrq_create(tdtd, lur); + if (IS_ERR(dtrq)) + RETURN(PTR_ERR(dtrq)); + + spin_lock(&tdtd->tdtd_replay_list_lock); + rc = dtrq_insert(tdtd, dtrq); + if (rc < 0) { + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq_destroy(dtrq); + if (rc == -EEXIST) + goto again; + return rc; + } + } else { + /* If the master transno in update header is not + * matched with the one in the record, then it means + * the dtrq is originally created by master record, + * so we need update master transno and reposition + * the dtrq(by master transno) in the list and also + * replace update record */ + if (record->ur_master_transno != 0 && + dtrq->dtrq_master_transno != record->ur_master_transno && + dtrq->dtrq_lur != NULL) { + list_del_init(&dtrq->dtrq_list); + dtrq->dtrq_lur->lur_update_rec.ur_master_transno = + record->ur_master_transno; + + dtrq->dtrq_master_transno = record->ur_master_transno; + replace_record = true; + /* try to insert again */ + rc = dtrq_insert(tdtd, dtrq); + if (rc < 0) { + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq_destroy(dtrq); + return rc; + } + } + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + + /* Because there should be only thread access the update record, so + * we do not need lock here */ + if (replace_record) { + /* Replace the update record and master transno */ + OBD_FREE_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + dtrq->dtrq_lur = NULL; + dtrq->dtrq_lur_size = llog_update_record_size(lur); + OBD_ALLOC_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + if (dtrq->dtrq_lur == NULL) + return -ENOMEM; + + memcpy(dtrq->dtrq_lur, lur, dtrq->dtrq_lur_size); + } + + /* This is a partial update records, let's try to append + * the record to the current replay request */ + if (record->ur_flags & UPDATE_RECORD_CONTINUE) + rc = dtrq_append_updates(dtrq, record); + + /* Then create and add sub update request */ + rc = dtrq_sub_create_and_insert(dtrq, cookie, mdt_index); + + RETURN(rc); +} +EXPORT_SYMBOL(insert_update_records_to_replay_list); + +/** + * Dump updates of distribute txns. + * + * Output all of recovery updates in the distribute txn list to the + * debug log. + * + * \param[in] tdtd distribute txn data where all of distribute txn + * are listed. 
+ * \param[in] mask debug mask + */ +void dtrq_list_dump(struct target_distribute_txn_data *tdtd, unsigned int mask) +{ + struct distribute_txn_replay_req *dtrq; + + spin_lock(&tdtd->tdtd_replay_list_lock); + list_for_each_entry(dtrq, &tdtd->tdtd_replay_list, dtrq_list) + update_records_dump(&dtrq->dtrq_lur->lur_update_rec, mask, + false); + spin_unlock(&tdtd->tdtd_replay_list_lock); +} +EXPORT_SYMBOL(dtrq_list_dump); + +/** + * Destroy distribute txn replay req + * + * Destroy distribute txn replay req and all of subs. + * + * \param[in] dtrq distribute txn replqy req to be destroyed. + */ +void dtrq_destroy(struct distribute_txn_replay_req *dtrq) +{ + struct distribute_txn_replay_req_sub *dtrqs; + struct distribute_txn_replay_req_sub *tmp; + + LASSERT(list_empty(&dtrq->dtrq_list)); + CDEBUG(D_HA, "destroy x%llu t%llu\n", dtrq->dtrq_xid, + dtrq->dtrq_master_transno); + spin_lock(&dtrq->dtrq_sub_list_lock); + list_for_each_entry_safe(dtrqs, tmp, &dtrq->dtrq_sub_list, dtrqs_list) { + struct sub_thandle_cookie *stc; + struct sub_thandle_cookie *tmp; + + list_del(&dtrqs->dtrqs_list); + list_for_each_entry_safe(stc, tmp, &dtrqs->dtrqs_cookie_list, + stc_list) { + list_del(&stc->stc_list); + OBD_FREE_PTR(stc); + } + OBD_FREE_PTR(dtrqs); + } + spin_unlock(&dtrq->dtrq_sub_list_lock); + + if (dtrq->dtrq_lur != NULL) + OBD_FREE_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + + OBD_FREE_PTR(dtrq); +} +EXPORT_SYMBOL(dtrq_destroy); + +/** + * Destroy all of replay req. + * + * Destroy all of replay req in the replay list. + * + * \param[in] tdtd target distribute txn data where the replay list is. + */ +void dtrq_list_destroy(struct target_distribute_txn_data *tdtd) +{ + struct distribute_txn_replay_req *dtrq; + struct distribute_txn_replay_req *tmp; + + spin_lock(&tdtd->tdtd_replay_list_lock); + list_for_each_entry_safe(dtrq, tmp, &tdtd->tdtd_replay_list, + dtrq_list) { + list_del_init(&dtrq->dtrq_list); + dtrq_destroy(dtrq); + } + list_for_each_entry_safe(dtrq, tmp, &tdtd->tdtd_replay_finish_list, + dtrq_list) { + list_del_init(&dtrq->dtrq_list); + dtrq_destroy(dtrq); + } + spin_unlock(&tdtd->tdtd_replay_list_lock); +} +EXPORT_SYMBOL(dtrq_list_destroy); + +/** + * Get next req in the replay list + * + * Get next req needs to be replayed, since it is a sorted list + * (by master MDT transno) + * + * \param[in] tdtd distribute txn data where the replay list is + * + * \retval the pointer of update recovery header + */ +struct distribute_txn_replay_req * +distribute_txn_get_next_req(struct target_distribute_txn_data *tdtd) +{ + struct distribute_txn_replay_req *dtrq = NULL; + + spin_lock(&tdtd->tdtd_replay_list_lock); + if (!list_empty(&tdtd->tdtd_replay_list)) { + dtrq = list_entry(tdtd->tdtd_replay_list.next, + struct distribute_txn_replay_req, dtrq_list); + list_del_init(&dtrq->dtrq_list); + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + + return dtrq; +} +EXPORT_SYMBOL(distribute_txn_get_next_req); + +/** + * Get next transno in the replay list, because this is the sorted + * list, so it will return the transno of next req in the list. 
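As an illustrative aside: dtrq_destroy() and dtrq_list_destroy() above walk their lists with the *_safe iterators because every visited node is freed while iterating. A standalone sketch of freeing a list of sub-lists while saving the next pointer before each free (plain C, illustrative types only):

	#include <stdlib.h>

	struct cookie {
		struct cookie *next;
	};

	struct sub_req {
		struct cookie *cookies;
		struct sub_req *next;
	};

	/*
	 * Free a list of sub requests together with their cookie sub-lists.
	 * The next pointer is read *before* free(), which is the guarantee
	 * list_for_each_entry_safe() provides in the kernel code.
	 */
	static void sub_req_list_destroy(struct sub_req *head)
	{
		struct sub_req *sr, *sr_next;
		struct cookie *c, *c_next;

		for (sr = head; sr != NULL; sr = sr_next) {
			sr_next = sr->next;
			for (c = sr->cookies; c != NULL; c = c_next) {
				c_next = c->next;
				free(c);
			}
			free(sr);
		}
	}

	int main(void)
	{
		struct sub_req *sr = calloc(1, sizeof(*sr));
		struct cookie *c = calloc(1, sizeof(*c));

		if (sr == NULL || c == NULL) {
			free(sr);
			free(c);
			return 1;
		}
		sr->cookies = c;
		sub_req_list_destroy(sr);
		return 0;
	}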
+ * + * \param[in] tdtd distribute txn data where the replay list is + * + * \retval the transno of next update in the list + */ +__u64 distribute_txn_get_next_transno(struct target_distribute_txn_data *tdtd) +{ + struct distribute_txn_replay_req *dtrq = NULL; + __u64 transno = 0; + + spin_lock(&tdtd->tdtd_replay_list_lock); + if (!list_empty(&tdtd->tdtd_replay_list)) { + dtrq = list_entry(tdtd->tdtd_replay_list.next, + struct distribute_txn_replay_req, dtrq_list); + transno = dtrq->dtrq_master_transno; + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + + CDEBUG(D_HA, "%s: Next update transno %llu\n", + tdtd->tdtd_lut->lut_obd->obd_name, transno); + return transno; +} +EXPORT_SYMBOL(distribute_txn_get_next_transno); + +struct distribute_txn_replay_req * +distribute_txn_lookup_finish_list(struct target_distribute_txn_data *tdtd, + __u64 transno) +{ + struct distribute_txn_replay_req *dtrq = NULL; + struct distribute_txn_replay_req *iter; + + spin_lock(&tdtd->tdtd_replay_list_lock); + list_for_each_entry(iter, &tdtd->tdtd_replay_finish_list, dtrq_list) { + if (iter->dtrq_master_transno == transno) { + dtrq = iter; + break; + } + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + return dtrq; +} + +bool is_req_replayed_by_update(struct ptlrpc_request *req) +{ + struct lu_target *tgt = class_exp2tgt(req->rq_export); + struct distribute_txn_replay_req *dtrq; + + if (tgt->lut_tdtd == NULL) + return false; + + dtrq = distribute_txn_lookup_finish_list(tgt->lut_tdtd, + lustre_msg_get_transno(req->rq_reqmsg)); + if (dtrq == NULL) + return false; + + return true; +} +EXPORT_SYMBOL(is_req_replayed_by_update); + +/** + * Check if the update of one object is committed + * + * Check whether the update for the object is committed by checking whether + * the correspondent sub exists in the replay req. If it is committed, mark + * the committed flag in correspondent the sub thandle. + * + * \param[in] env execution environment + * \param[in] dtrq replay request + * \param[in] dt_obj object for the update + * \param[in] top_th top thandle + * \param[in] sub_th sub thandle which the update belongs to + * + * \retval 1 if the update is not committed. + * \retval 0 if the update is committed. + * \retval negative errno if some other failures happen. 
+ */ +static int update_is_committed(const struct lu_env *env, + struct distribute_txn_replay_req *dtrq, + struct dt_object *dt_obj, + struct top_thandle *top_th, + struct sub_thandle *st) +{ + struct seq_server_site *seq_site; + const struct lu_fid *fid = lu_object_fid(&dt_obj->do_lu); + struct distribute_txn_replay_req_sub *dtrqs; + __u32 mdt_index; + ENTRY; + + if (st->st_sub_th != NULL) + RETURN(1); + + if (st->st_committed) + RETURN(0); + + seq_site = lu_site2seq(dt_obj->do_lu.lo_dev->ld_site); + if (fid_is_update_log(fid) || fid_is_update_log_dir(fid)) { + mdt_index = fid_oid(fid); + } else if (!fid_seq_in_fldb(fid_seq(fid))) { + mdt_index = seq_site->ss_node_id; + } else { + struct lu_server_fld *fld; + struct lu_seq_range range = {0}; + int rc; + + fld = seq_site->ss_server_fld; + fld_range_set_type(&range, LU_SEQ_RANGE_MDT); + LASSERT(fld->lsf_seq_lookup != NULL); + rc = fld->lsf_seq_lookup(env, fld, fid_seq(fid), + &range); + if (rc < 0) + RETURN(rc); + mdt_index = range.lsr_index; + } + + dtrqs = dtrq_sub_lookup(dtrq, mdt_index); + if (dtrqs != NULL || top_th->tt_multiple_thandle->tmt_committed) { + st->st_committed = 1; + if (dtrqs != NULL) { + struct sub_thandle_cookie *stc; + struct sub_thandle_cookie *tmp; + + list_for_each_entry_safe(stc, tmp, + &dtrqs->dtrqs_cookie_list, + stc_list) + list_move(&stc->stc_list, &st->st_cookie_list); + } + RETURN(0); + } + + CDEBUG(D_HA, "Update of "DFID "on MDT%u is not committed\n", PFID(fid), + mdt_index); + + RETURN(1); +} + +/** + * Implementation of different update methods for update recovery. + * + * These following functions update_recovery_$(update_name) implement + * different updates recovery methods. They will extract the parameters + * from the common parameters area and call correspondent dt API to redo + * the update. + * + * \param[in] env execution environment + * \param[in] op update operation to be replayed + * \param[in] params common update parameters which holds all parameters + * of the operation + * \param[in] th transaction handle + * \param[in] declare indicate it will do declare or real execution, true + * means declare, false means real execution + * + * \retval 0 if it succeeds. + * \retval negative errno if it fails. 
+ */ +static int update_recovery_create(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + struct llog_update_record *lur = uti->uti_dtrq->dtrq_lur; + struct lu_attr *attr = &uti->uti_attr; + struct obdo *wobdo; + struct obdo *lobdo = &uti->uti_obdo; + struct dt_object_format dof; + __u16 size; + unsigned int param_count; + int rc; + ENTRY; + + if (dt_object_exists(dt_obj)) + RETURN(-EEXIST); + + param_count = lur->lur_update_rec.ur_param_count; + wobdo = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (wobdo == NULL) + RETURN(-EIO); + if (size != sizeof(*wobdo)) + RETURN(-EIO); + + if (LLOG_REC_HDR_NEEDS_SWABBING(&lur->lur_hdr)) + lustre_swab_obdo(wobdo); + + lustre_get_wire_obdo(NULL, lobdo, wobdo); + la_from_obdo(attr, lobdo, lobdo->o_valid); + + dof.dof_type = dt_mode_to_dft(attr->la_mode); + + rc = out_tx_create(env, dt_obj, attr, NULL, &dof, + ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_destroy(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + int rc; + ENTRY; + + rc = out_tx_destroy(env, dt_obj, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_ref_add(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + int rc; + ENTRY; + + rc = out_tx_ref_add(env, dt_obj, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_ref_del(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + int rc; + ENTRY; + + rc = out_tx_ref_del(env, dt_obj, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_attr_set(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + struct llog_update_record *lur = uti->uti_dtrq->dtrq_lur; + struct obdo *wobdo; + struct obdo *lobdo = &uti->uti_obdo; + struct lu_attr *attr = &uti->uti_attr; + __u16 size; + unsigned int param_count; + int rc; + ENTRY; + + param_count = lur->lur_update_rec.ur_param_count; + wobdo = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (wobdo == NULL) + RETURN(-EIO); + if (size != sizeof(*wobdo)) + RETURN(-EIO); + + if (LLOG_REC_HDR_NEEDS_SWABBING(&lur->lur_hdr)) + lustre_swab_obdo(wobdo); + + lustre_get_wire_obdo(NULL, lobdo, wobdo); + la_from_obdo(attr, lobdo, lobdo->o_valid); + + rc = out_tx_attr_set(env, dt_obj, attr, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_xattr_set(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + char *buf; + char *name; + int fl; + __u16 size; + __u32 param_count; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, + op->uop_params_off[0], + 
param_count, &size); + if (name == NULL) + RETURN(-EIO); + + buf = update_params_get_param_buf(params, + op->uop_params_off[1], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + + uti->uti_buf.lb_buf = buf; + uti->uti_buf.lb_len = (size_t)size; + + buf = update_params_get_param_buf(params, op->uop_params_off[2], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + if (size != sizeof(fl)) + RETURN(-EIO); + + fl = le32_to_cpu(*(int *)buf); + + rc = out_tx_xattr_set(env, dt_obj, &uti->uti_buf, name, fl, ta, th, + NULL, 0); + + RETURN(rc); +} + +static int update_recovery_index_insert(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + struct lu_fid *fid; + char *name; + __u32 param_count; + __u32 *ptype; + __u32 type; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (name == NULL) + RETURN(-EIO); + + fid = update_params_get_param_buf(params, op->uop_params_off[1], + param_count, &size); + if (fid == NULL) + RETURN(-EIO); + if (size != sizeof(*fid)) + RETURN(-EIO); + + fid_le_to_cpu(fid, fid); + + ptype = update_params_get_param_buf(params, op->uop_params_off[2], + param_count, &size); + if (ptype == NULL) + RETURN(-EIO); + if (size != sizeof(*ptype)) + RETURN(-EIO); + type = le32_to_cpu(*ptype); + + if (dt_try_as_dir(env, dt_obj) == 0) + RETURN(-ENOTDIR); + + uti->uti_rec.rec_fid = fid; + uti->uti_rec.rec_type = type; + + rc = out_tx_index_insert(env, dt_obj, + (const struct dt_rec *)&uti->uti_rec, + (const struct dt_key *)name, ta, th, + NULL, 0); + + RETURN(rc); +} + +static int update_recovery_index_delete(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + __u32 param_count; + char *name; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (name == NULL) + RETURN(-EIO); + + if (dt_try_as_dir(env, dt_obj) == 0) + RETURN(-ENOTDIR); + + rc = out_tx_index_delete(env, dt_obj, + (const struct dt_key *)name, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_write(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + char *buf; + __u32 param_count; + __u64 pos; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + buf = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + + uti->uti_buf.lb_buf = buf; + uti->uti_buf.lb_len = size; + + buf = update_params_get_param_buf(params, op->uop_params_off[1], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + + pos = le64_to_cpu(*(__u64 *)buf); + + rc = out_tx_write(env, dt_obj, &uti->uti_buf, pos, + ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_xattr_del(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op 
*op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + __u32 param_count; + char *name; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (name == NULL) + RETURN(-EIO); + + rc = out_tx_xattr_del(env, dt_obj, name, ta, th, NULL, 0); + + RETURN(rc); +} + +/** + * Update session information + * + * Update session information so tgt_txn_stop_cb()->tgt_last_rcvd_update() + * can be called correctly during update replay. + * + * \param[in] env execution environment. + * \param[in] tdtd distribute data structure of the recovering tgt. + * \param[in] th thandle of this update replay. + * \param[in] master_th master sub thandle. + * \param[in] ta_arg the tx arg structure to hold the update for updating + * reply data. + */ +static void update_recovery_update_ses(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct thandle *th, + struct thandle *master_th, + struct distribute_txn_replay_req *dtrq, + struct tx_arg *ta_arg) +{ + struct tgt_session_info *tsi; + struct lu_target *lut = tdtd->tdtd_lut; + struct obd_export *export; + struct cfs_hash *hash; + struct top_thandle *top_th; + struct lsd_reply_data *lrd; + size_t size; + + tsi = tgt_ses_info(env); + if (tsi->tsi_exp != NULL) + return; + + size = ta_arg->u.write.buf.lb_len; + lrd = ta_arg->u.write.buf.lb_buf; + if (size != sizeof(*lrd) || lrd == NULL) + return; + + lrd->lrd_transno = le64_to_cpu(lrd->lrd_transno); + lrd->lrd_xid = le64_to_cpu(lrd->lrd_xid); + lrd->lrd_data = le64_to_cpu(lrd->lrd_data); + lrd->lrd_result = le32_to_cpu(lrd->lrd_result); + lrd->lrd_client_gen = le32_to_cpu(lrd->lrd_client_gen); + + CDEBUG(D_HA, "xid=%llu transno=%llu\n", lrd->lrd_xid, lrd->lrd_transno); + if (lrd->lrd_transno != tgt_th_info(env)->tti_transno) + return; + + hash = cfs_hash_getref(lut->lut_obd->obd_gen_hash); + if (hash == NULL) + return; + + export = cfs_hash_lookup(hash, &lrd->lrd_client_gen); + if (export == NULL) { + cfs_hash_putref(hash); + return; + } + + tsi->tsi_exp = export; + tsi->tsi_xid = lrd->lrd_xid; + tsi->tsi_opdata = lrd->lrd_data; + tsi->tsi_result = lrd->lrd_result; + tsi->tsi_client_gen = lrd->lrd_client_gen; + dtrq->dtrq_xid = lrd->lrd_xid; + top_th = container_of(th, struct top_thandle, tt_super); + top_th->tt_master_sub_thandle = master_th; + cfs_hash_putref(hash); +} + +/** + * Execute updates in the update replay records + * + * Declare distribute txn replay by update records and add the updates + * to the execution list. Note: it will check if the update has been + * committed, and only execute the updates if it is not committed to + * disk. + * + * \param[in] env execution environment + * \param[in] tdtd distribute txn replay data which hold all of replay + * reqs and all replay parameters. + * \param[in] dtrq distribute transaction replay req. + * \param[in] ta thandle execute args. + * + * \retval 0 if declare succeeds. + * \retval negative errno if declare fails. 
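As an illustrative aside: every update_recovery_*() handler above follows the same defensive pattern when pulling a parameter out of the packed params area - fetch the buffer for a given index, verify the returned size against the expected one, then convert from little-endian wire order. A standalone sketch of that pattern over a flat byte buffer (the two-byte length prefix below is a simplified stand-in, not the real update_params layout):

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Simplified packed parameter: 2-byte little-endian length, then payload. */
	static const void *get_param(const uint8_t *buf, size_t buf_len,
				     unsigned int index, uint16_t *size)
	{
		size_t off = 0;
		unsigned int i;

		for (i = 0; ; i++) {
			uint16_t len;

			if (off + 2 > buf_len)
				return NULL;		/* ran off the buffer */
			len = (uint16_t)(buf[off] | (buf[off + 1] << 8));
			off += 2;
			if (len > buf_len - off)
				return NULL;		/* truncated payload */
			if (i == index) {
				*size = len;
				return buf + off;
			}
			off += len;
		}
	}

	int main(void)
	{
		/* One 4-byte parameter holding little-endian 0x11223344. */
		const uint8_t buf[] = { 0x04, 0x00, 0x44, 0x33, 0x22, 0x11 };
		uint16_t size;
		const uint8_t *p = get_param(buf, sizeof(buf), 0, &size);
		uint32_t val;

		/* Same checks the handlers make: got a buffer, size matches. */
		if (p == NULL || size != sizeof(val))
			return 1;

		/* le32-to-cpu by hand, like le32_to_cpu() in the handlers. */
		val = p[0] | (p[1] << 8) | ((uint32_t)p[2] << 16) |
		      ((uint32_t)p[3] << 24);
		printf("param 0 = 0x%x\n", (unsigned int)val);
		return 0;
	}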
+ */ +static int update_recovery_exec(const struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq, + struct thandle_exec_args *ta) +{ + struct llog_update_record *lur = dtrq->dtrq_lur; + struct update_records *records = &lur->lur_update_rec; + struct update_ops *ops = &records->ur_ops; + struct update_params *params = update_records_get_params(records); + struct top_thandle *top_th = container_of(ta->ta_handle, + struct top_thandle, + tt_super); + struct top_multiple_thandle *tmt = top_th->tt_multiple_thandle; + struct update_op *op; + unsigned int i; + int rc = 0; + ENTRY; + + /* These records have been swabbed in llog_cat_process() */ + for (i = 0, op = &ops->uops_op[0]; i < records->ur_update_count; + i++, op = update_op_next_op(op)) { + struct lu_fid *fid = &op->uop_fid; + struct dt_object *dt_obj; + struct dt_object *sub_dt_obj; + struct dt_device *sub_dt; + struct sub_thandle *st; + + if (op->uop_type == OUT_NOOP) + continue; + + dt_obj = dt_locate(env, tdtd->tdtd_dt, fid); + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + if (rc == -EREMCHG) + LCONSOLE_WARN("%.16s: hit invalid OI mapping " + "for "DFID" during recovering, " + "that may because auto scrub is " + "disabled on related MDT, and " + "will cause recovery failure. " + "Please enable auto scrub and " + "retry the recovery.\n", + tdtd->tdtd_lut->lut_obd->obd_name, + PFID(fid)); + + break; + } + sub_dt_obj = dt_object_child(dt_obj); + + /* Create sub thandle if not */ + sub_dt = lu2dt_dev(sub_dt_obj->do_lu.lo_dev); + st = lookup_sub_thandle(tmt, sub_dt); + if (st == NULL) { + st = create_sub_thandle(tmt, sub_dt); + if (IS_ERR(st)) + GOTO(next, rc = PTR_ERR(st)); + } + + /* check if updates on the OSD/OSP are committed */ + rc = update_is_committed(env, dtrq, dt_obj, top_th, st); + if (rc == 0) + /* If this is committed, goto next */ + goto next; + + if (rc < 0) + GOTO(next, rc); + + /* Create thandle for sub thandle if needed */ + if (st->st_sub_th == NULL) { + rc = sub_thandle_trans_create(env, top_th, st); + if (rc != 0) + GOTO(next, rc); + } + + CDEBUG(D_HA, "replay %uth update\n", i); + switch (op->uop_type) { + case OUT_CREATE: + rc = update_recovery_create(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_DESTROY: + rc = update_recovery_destroy(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_REF_ADD: + rc = update_recovery_ref_add(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_REF_DEL: + rc = update_recovery_ref_del(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_ATTR_SET: + rc = update_recovery_attr_set(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_XATTR_SET: + rc = update_recovery_xattr_set(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_INDEX_INSERT: + rc = update_recovery_index_insert(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_INDEX_DELETE: + rc = update_recovery_index_delete(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_WRITE: + rc = update_recovery_write(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_XATTR_DEL: + rc = update_recovery_xattr_del(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + default: + CERROR("Unknown update type %u\n", (__u32)op->uop_type); + rc = -EINVAL; + break; + } +next: + dt_object_put(env, dt_obj); + if (rc < 0) + break; + } + + ta->ta_handle->th_result = rc; + RETURN(rc); +} + +/** + * redo 
updates on MDT if needed. + * + * During DNE recovery, the recovery thread (target_recovery_thread) will call + * this function to replay distribute txn updates on all MDTs. It only replay + * updates on the MDT where the update record is missing. + * + * If the update already exists on the MDT, then it does not need replay the + * updates on that MDT, and only mark the sub transaction has been committed + * there. + * + * \param[in] env execution environment + * \param[in] tdtd target distribute txn data, which holds the replay list + * and all parameters needed by replay process. + * \param[in] dtrq distribute txn replay req. + * + * \retval 0 if replay succeeds. + * \retval negative errno if replay failes. + */ +int distribute_txn_replay_handle(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq) +{ + struct update_records *records = &dtrq->dtrq_lur->lur_update_rec; + struct thandle_exec_args *ta; + struct lu_context session_env; + struct thandle *th = NULL; + struct top_thandle *top_th; + struct top_multiple_thandle *tmt; + struct thandle_update_records *tur = NULL; + int i; + int rc = 0; + ENTRY; + + /* initialize session, it is needed for the handler of target */ + rc = lu_context_init(&session_env, LCT_SERVER_SESSION | LCT_NOREF); + if (rc) { + CERROR("%s: failure to initialize session: rc = %d\n", + tdtd->tdtd_lut->lut_obd->obd_name, rc); + RETURN(rc); + } + lu_context_enter(&session_env); + env->le_ses = &session_env; + lu_env_refill(env); + update_records_dump(records, D_HA, true); + th = top_trans_create(env, NULL); + if (IS_ERR(th)) + GOTO(exit_session, rc = PTR_ERR(th)); + + ta = &update_env_info(env)->uti_tea; + ta->ta_argno = 0; + + update_env_info(env)->uti_dtrq = dtrq; + /* Create distribute transaction structure for this top thandle */ + top_th = container_of(th, struct top_thandle, tt_super); + rc = top_trans_create_tmt(env, top_th); + if (rc < 0) + GOTO(stop_trans, rc); + + th->th_dev = tdtd->tdtd_dt; + ta->ta_handle = th; + + /* check if the distribute transaction has been committed */ + tmt = top_th->tt_multiple_thandle; + tmt->tmt_master_sub_dt = tdtd->tdtd_lut->lut_bottom; + tmt->tmt_batchid = dtrq->dtrq_batchid; + tgt_th_info(env)->tti_transno = dtrq->dtrq_master_transno; + + if (tmt->tmt_batchid <= tdtd->tdtd_committed_batchid) + tmt->tmt_committed = 1; + + rc = update_recovery_exec(env, tdtd, dtrq, ta); + if (rc < 0) + GOTO(stop_trans, rc); + + /* If no updates are needed to be replayed, then mark this records as + * committed, so commit thread distribute_txn_commit_thread() will + * delete the record */ + if (ta->ta_argno == 0) + tmt->tmt_committed = 1; + + tur = &update_env_info(env)->uti_tur; + tur->tur_update_records = dtrq->dtrq_lur; + tur->tur_update_records_buf_size = dtrq->dtrq_lur_size; + tur->tur_update_params = NULL; + tur->tur_update_param_count = 0; + tmt->tmt_update_records = tur; + + distribute_txn_insert_by_batchid(tmt); + rc = top_trans_start(env, NULL, th); + if (rc < 0) + GOTO(stop_trans, rc); + + for (i = 0; i < ta->ta_argno; i++) { + struct tx_arg *ta_arg; + struct dt_object *dt_obj; + struct dt_device *sub_dt; + struct sub_thandle *st; + + ta_arg = ta->ta_args[i]; + dt_obj = ta_arg->object; + + LASSERT(tmt->tmt_committed == 0); + sub_dt = lu2dt_dev(dt_obj->do_lu.lo_dev); + st = lookup_sub_thandle(tmt, sub_dt); + + LASSERT(st != NULL); + LASSERT(st->st_sub_th != NULL); + rc = ta->ta_args[i]->exec_fn(env, st->st_sub_th, + ta->ta_args[i]); + + /* If the update is to update the reply data, then 
+ * we need set the session information, so + * tgt_last_rcvd_update() can be called correctly */ + if (rc == 0 && dt_obj == tdtd->tdtd_lut->lut_reply_data) + update_recovery_update_ses(env, tdtd, th, + st->st_sub_th, dtrq, ta_arg); + + if (unlikely(rc < 0)) { + CDEBUG(D_HA, "error during execution of #%u from" + " %s:%d: rc = %d\n", i, ta->ta_args[i]->file, + ta->ta_args[i]->line, rc); + while (--i > 0) { + if (ta->ta_args[i]->undo_fn != NULL) { + dt_obj = ta->ta_args[i]->object; + sub_dt = + lu2dt_dev(dt_obj->do_lu.lo_dev); + st = lookup_sub_thandle(tmt, sub_dt); + LASSERT(st != NULL); + LASSERT(st->st_sub_th != NULL); + + ta->ta_args[i]->undo_fn(env, + st->st_sub_th, + ta->ta_args[i]); + } else { + CERROR("%s: undo for %s:%d: rc = %d\n", + dt_obd_name(ta->ta_handle->th_dev), + ta->ta_args[i]->file, + ta->ta_args[i]->line, -ENOTSUPP); + } + } + break; + } + CDEBUG(D_HA, "%s: executed %u/%u: rc = %d\n", + dt_obd_name(sub_dt), i, ta->ta_argno, rc); + } + +stop_trans: + if (rc < 0) + th->th_result = rc; + rc = top_trans_stop(env, tdtd->tdtd_dt, th); + for (i = 0; i < ta->ta_argno; i++) { + if (ta->ta_args[i]->object != NULL) { + dt_object_put(env, ta->ta_args[i]->object); + ta->ta_args[i]->object = NULL; + } + } + + if (tur != NULL) + tur->tur_update_records = NULL; + + if (tgt_ses_info(env)->tsi_exp != NULL) { + class_export_put(tgt_ses_info(env)->tsi_exp); + tgt_ses_info(env)->tsi_exp = NULL; + } +exit_session: + lu_context_exit(&session_env); + lu_context_fini(&session_env); + RETURN(rc); +} +EXPORT_SYMBOL(distribute_txn_replay_handle); diff --git a/drivers/staging/lustrefsx/lustre/target/update_trans.c b/drivers/staging/lustrefsx/lustre/target/update_trans.c new file mode 100644 index 0000000000000..aa13f9433d3e5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/update_trans.c @@ -0,0 +1,1743 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, 2017, Intel Corporation. + */ +/* + * lustre/target/update_trans.c + * + * This file implements the update distribute transaction API. + * + * To manage the cross-MDT operation (distribute operation) transaction, + * the transaction will also be separated two layers on MD stack, top + * transaction and sub transaction. + * + * During the distribute operation, top transaction is created in the LOD + * layer, and represent the operation. Sub transaction is created by + * each OSD or OSP. Top transaction start/stop will trigger all of its sub + * transaction start/stop. Top transaction (the whole operation) is committed + * only all of its sub transaction are committed. + * + * there are three kinds of transactions + * 1. local transaction: All updates are in a single local OSD. + * 2. 
Remote transaction: All Updates are only in the remote OSD, + * i.e. locally all updates are in OSP. + * 3. Mixed transaction: Updates are both in local OSD and remote + * OSD. + * + * Author: Di Wang + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include + +#include +/** + * Dump top mulitple thandle + * + * Dump top multiple thandle and all of its sub thandle to the debug log. + * + * \param[in]mask debug mask + * \param[in]top_th top_thandle to be dumped + */ +static void top_multiple_thandle_dump(struct top_multiple_thandle *tmt, + __u32 mask) +{ + struct sub_thandle *st; + + LASSERT(tmt->tmt_magic == TOP_THANDLE_MAGIC); + CDEBUG(mask, "%s tmt %p refcount %d committed %d result %d batchid %llu\n", + tmt->tmt_master_sub_dt ? + tmt->tmt_master_sub_dt->dd_lu_dev.ld_obd->obd_name : + "NULL", + tmt, atomic_read(&tmt->tmt_refcount), tmt->tmt_committed, + tmt->tmt_result, tmt->tmt_batchid); + + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + struct sub_thandle_cookie *stc; + + CDEBUG(mask, "st %p obd %s committed %d started %d stopped %d " + "result %d sub_th %p\n", + st, st->st_dt->dd_lu_dev.ld_obd->obd_name, + st->st_committed, st->st_started, st->st_stopped, + st->st_result, st->st_sub_th); + + list_for_each_entry(stc, &st->st_cookie_list, stc_list) { + CDEBUG(mask, " cookie "DFID".%u\n", + PFID(&stc->stc_cookie.lgc_lgl.lgl_oi.oi_fid), + stc->stc_cookie.lgc_index); + } + } +} + +/** + * Declare write update to sub device + * + * Declare Write updates llog records to the sub device during distribute + * transaction. + * + * \param[in] env execution environment + * \param[in] record update records being written + * \param[in] sub_th sub transaction handle + * \param[in] record_size total update record size + * + * \retval 0 if writing succeeds + * \retval negative errno if writing fails + */ +static int sub_declare_updates_write(const struct lu_env *env, + struct llog_update_record *record, + struct thandle *sub_th, size_t record_size) +{ + struct llog_ctxt *ctxt; + struct dt_device *dt = sub_th->th_dev; + int left = record_size; + int rc; + + /* If ctxt is NULL, it means not need to write update, + * for example if the the OSP is used to connect to OST */ + ctxt = llog_get_context(dt->dd_lu_dev.ld_obd, + LLOG_UPDATELOG_ORIG_CTXT); + + /* Not ready to record updates yet. */ + if (ctxt == NULL || ctxt->loc_handle == NULL) { + llog_ctxt_put(ctxt); + return 0; + } + + rc = llog_declare_add(env, ctxt->loc_handle, + &record->lur_hdr, sub_th); + if (rc < 0) + GOTO(out_put, rc); + + while (left > ctxt->loc_chunk_size) { + rc = llog_declare_add(env, ctxt->loc_handle, + &record->lur_hdr, sub_th); + if (rc < 0) + GOTO(out_put, rc); + + left -= ctxt->loc_chunk_size; + } + +out_put: + llog_ctxt_put(ctxt); + + return rc; +} + +/** + * write update to sub device + * + * Write llog update record to the sub device during distribute + * transaction. If it succeeds, llog cookie of the record will be + * returned by @cookie. + * + * \param[in] env execution environment + * \param[in] record update records being written + * \param[in] sub_th sub transaction handle + * \param[out] cookie llog cookie of the update record. 
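As an illustrative aside: the declare loop in sub_declare_updates_write() above reserves one llog_declare_add() credit unconditionally and one more for every full chunk still outstanding, i.e. it reserves enough credits for the record to be split into ceil(record_size / chunk_size) llog records later. A standalone arithmetic sketch of that loop (the chunk and record sizes below are made up for illustration):

	#include <assert.h>
	#include <stdio.h>

	/*
	 * Mirror of the declare loop: one unconditional declaration, then one
	 * more for every full chunk still left of the record.
	 */
	static unsigned int declare_count(size_t record_size, size_t chunk_size)
	{
		size_t left = record_size;
		unsigned int count = 1;

		while (left > chunk_size) {
			count++;
			left -= chunk_size;
		}
		return count;
	}

	int main(void)
	{
		const size_t chunk = 32768;	/* arbitrary example chunk size */
		size_t sizes[] = { 100, chunk, chunk + 1, 5 * chunk / 2 };
		size_t i;

		for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
			unsigned int n = declare_count(sizes[i], chunk);

			/* The loop reserves ceil(record_size / chunk) records. */
			assert(n == (sizes[i] + chunk - 1) / chunk);
			printf("record %zu bytes -> %u llog records declared\n",
			       sizes[i], n);
		}
		return 0;
	}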
+ * + * \retval 1 if writing succeeds + * \retval negative errno if writing fails + */ +static int sub_updates_write(const struct lu_env *env, + struct llog_update_record *record, + struct sub_thandle *sub_th) +{ + struct dt_device *dt = sub_th->st_dt; + struct llog_ctxt *ctxt; + struct llog_update_record *lur = NULL; + __u32 update_count = 0; + __u32 param_count = 0; + __u32 last_update_count = 0; + __u32 last_param_count = 0; + char *start; + char *cur; + char *next; + struct sub_thandle_cookie *stc; + size_t reclen; + bool eof = false; + int rc; + ENTRY; + + ctxt = llog_get_context(dt->dd_lu_dev.ld_obd, + LLOG_UPDATELOG_ORIG_CTXT); + /* If ctxt == NULL, then it means updates on OST (only happens + * during migration), and we do not track those updates for now */ + /* If ctxt->loc_handle == NULL, then it does not need to record + * update, usually happens in error handler path */ + if (ctxt == NULL || ctxt->loc_handle == NULL) { + llog_ctxt_put(ctxt); + RETURN(0); + } + + /* Since the cross-MDT updates will includes both local + * and remote updates, the update ops count must > 1 */ + LASSERT(record->lur_update_rec.ur_update_count > 1); + LASSERTF(record->lur_hdr.lrh_len == llog_update_record_size(record), + "lrh_len %u record_size %zu\n", record->lur_hdr.lrh_len, + llog_update_record_size(record)); + + /* + * If its size > llog chunk_size, then write current chunk to the update + * llog, NB the padding should >= LLOG_MIN_REC_SIZE. + * + * So check padding length is either >= LLOG_MIN_REC_SIZE or is 0 + * (record length just matches the chunk size). + */ + + reclen = record->lur_hdr.lrh_len; + if (reclen + LLOG_MIN_REC_SIZE <= ctxt->loc_chunk_size || + reclen == ctxt->loc_chunk_size) { + OBD_ALLOC_PTR(stc); + if (stc == NULL) + GOTO(llog_put, rc = -ENOMEM); + INIT_LIST_HEAD(&stc->stc_list); + + rc = llog_add(env, ctxt->loc_handle, &record->lur_hdr, + &stc->stc_cookie, sub_th->st_sub_th); + + CDEBUG(D_INFO, "%s: Add update log "DFID".%u: rc = %d\n", + dt->dd_lu_dev.ld_obd->obd_name, + PFID(&stc->stc_cookie.lgc_lgl.lgl_oi.oi_fid), + stc->stc_cookie.lgc_index, rc); + + if (rc > 0) { + list_add(&stc->stc_list, &sub_th->st_cookie_list); + rc = 0; + } else { + OBD_FREE_PTR(stc); + } + + GOTO(llog_put, rc); + } + + /* Split the records into chunk_size update record */ + OBD_ALLOC_LARGE(lur, ctxt->loc_chunk_size); + if (lur == NULL) + GOTO(llog_put, rc = -ENOMEM); + + memcpy(lur, &record->lur_hdr, sizeof(record->lur_hdr)); + lur->lur_update_rec.ur_update_count = 0; + lur->lur_update_rec.ur_param_count = 0; + start = (char *)&record->lur_update_rec.ur_ops; + cur = next = start; + do { + if (update_count < record->lur_update_rec.ur_update_count) + next = (char *)update_op_next_op( + (struct update_op *)cur); + else if (param_count < record->lur_update_rec.ur_param_count) + next = (char *)update_param_next_param( + (struct update_param *)cur); + else + eof = true; + + reclen = __llog_update_record_size( + __update_records_size(next - start)); + if ((reclen + LLOG_MIN_REC_SIZE <= ctxt->loc_chunk_size || + reclen == ctxt->loc_chunk_size) && + !eof) { + cur = next; + + if (update_count < + record->lur_update_rec.ur_update_count) + update_count++; + else if (param_count < + record->lur_update_rec.ur_param_count) + param_count++; + continue; + } + + lur->lur_update_rec.ur_update_count = update_count - + last_update_count; + lur->lur_update_rec.ur_param_count = param_count - + last_param_count; + memcpy(&lur->lur_update_rec.ur_ops, start, cur - start); + lur->lur_hdr.lrh_len = 
llog_update_record_size(lur); + + LASSERT(lur->lur_hdr.lrh_len == + __llog_update_record_size( + __update_records_size(cur - start))); + LASSERT(lur->lur_hdr.lrh_len <= ctxt->loc_chunk_size); + + update_records_dump(&lur->lur_update_rec, D_INFO, true); + + OBD_ALLOC_PTR(stc); + if (stc == NULL) + GOTO(llog_put, rc = -ENOMEM); + INIT_LIST_HEAD(&stc->stc_list); + + rc = llog_add(env, ctxt->loc_handle, &lur->lur_hdr, + &stc->stc_cookie, sub_th->st_sub_th); + + CDEBUG(D_INFO, "%s: Add update log "DFID".%u: rc = %d\n", + dt->dd_lu_dev.ld_obd->obd_name, + PFID(&stc->stc_cookie.lgc_lgl.lgl_oi.oi_fid), + stc->stc_cookie.lgc_index, rc); + + if (rc > 0) { + list_add(&stc->stc_list, &sub_th->st_cookie_list); + rc = 0; + } else { + OBD_FREE_PTR(stc); + GOTO(llog_put, rc); + } + + last_update_count = update_count; + last_param_count = param_count; + start = cur; + lur->lur_update_rec.ur_update_count = 0; + lur->lur_update_rec.ur_param_count = 0; + lur->lur_update_rec.ur_flags |= UPDATE_RECORD_CONTINUE; + } while (!eof); + +llog_put: + if (lur != NULL) + OBD_FREE_LARGE(lur, ctxt->loc_chunk_size); + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +/** + * Prepare the update records. + * + * Merge params and ops into the update records, then initializing + * the update buffer. + * + * During transaction execution phase, parameters and update ops + * are collected in two different buffers (see lod_updates_pack()), + * during transaction stop, it needs to be merged in one buffer, + * so it will be written in the update log. + * + * \param[in] env execution environment + * \param[in] tmt top_multiple_thandle for distribute txn + * + * \retval 0 if merging succeeds. + * \retval negaitive errno if merging fails. + */ +static int prepare_writing_updates(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct thandle_update_records *tur = tmt->tmt_update_records; + struct llog_update_record *lur; + struct update_params *params; + size_t params_size; + size_t update_size; + + if (tur == NULL || tur->tur_update_records == NULL || + tur->tur_update_params == NULL) + return 0; + + lur = tur->tur_update_records; + /* Extends the update records buffer if needed */ + params_size = update_params_size(tur->tur_update_params, + tur->tur_update_param_count); + LASSERT(lur->lur_update_rec.ur_param_count == 0); + update_size = llog_update_record_size(lur); + if (cfs_size_round(update_size + params_size) > + tur->tur_update_records_buf_size) { + int rc; + + rc = tur_update_records_extend(tur, + cfs_size_round(update_size + params_size)); + if (rc < 0) + return rc; + + lur = tur->tur_update_records; + } + + params = update_records_get_params(&lur->lur_update_rec); + memcpy(params, tur->tur_update_params, params_size); + + lur->lur_update_rec.ur_param_count = tur->tur_update_param_count; + lur->lur_update_rec.ur_batchid = tmt->tmt_batchid; + /* Init update record header */ + lur->lur_hdr.lrh_len = llog_update_record_size(lur); + lur->lur_hdr.lrh_type = UPDATE_REC; + + /* Dump updates for debugging purpose */ + update_records_dump(&lur->lur_update_rec, D_INFO, true); + + return 0; +} + +/** + * Top thandle commit callback + * + * This callback will be called when all of sub transactions are committed. + * + * \param[in] th top thandle to be committed. 
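As an illustrative aside: the splitting loop in sub_updates_write() above walks the packed ops and params and cuts a new llog record whenever adding the next element would leave less than LLOG_MIN_REC_SIZE of padding in the chunk (or would not fill the chunk exactly). A standalone sketch of the same cut rule over an array of element sizes (all sizes and the fixed header overhead below are made up, not the real record layout):

	#include <stdio.h>

	#define CHUNK_SIZE	8192	/* stands in for ctxt->loc_chunk_size */
	#define MIN_REC_SIZE	32	/* stands in for LLOG_MIN_REC_SIZE */
	#define HDR_SIZE	64	/* per-record header overhead (made up) */

	/*
	 * A record with @body payload bytes fits a chunk if the padding stays
	 * >= MIN_REC_SIZE or the record fills the chunk exactly -- the same
	 * test the splitting loop applies to reclen.
	 */
	static int record_fits(size_t body)
	{
		size_t reclen = HDR_SIZE + body;

		return reclen + MIN_REC_SIZE <= CHUNK_SIZE ||
		       reclen == CHUNK_SIZE;
	}

	int main(void)
	{
		/* Sizes of the packed update ops/params (made up). */
		size_t elems[] = { 3000, 3000, 3000, 1000, 500 };
		size_t n = sizeof(elems) / sizeof(elems[0]);
		size_t i = 0;

		while (i < n) {
			size_t body = 0, count = 0;

			/* Greedily take elements while the record still fits. */
			while (i < n && record_fits(body + elems[i])) {
				body += elems[i];
				count++;
				i++;
			}
			if (count == 0) {	/* oversized element */
				fprintf(stderr, "element %zu too large\n", i);
				return 1;
			}
			printf("emit llog record: %zu elements, %zu body bytes\n",
			       count, body);
		}
		return 0;
	}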
+ */ +static void top_trans_committed_cb(struct top_multiple_thandle *tmt) +{ + struct lu_target *lut; + ENTRY; + + LASSERT(atomic_read(&tmt->tmt_refcount) > 0); + + top_multiple_thandle_dump(tmt, D_HA); + tmt->tmt_committed = 1; + lut = dt2lu_dev(tmt->tmt_master_sub_dt)->ld_site->ls_tgt; + if (lut->lut_tdtd && lut->lut_tdtd->tdtd_commit_task) + wake_up_process(lut->lut_tdtd->tdtd_commit_task); + + RETURN_EXIT; +} + +struct sub_thandle *lookup_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev) +{ + struct sub_thandle *st; + + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_dt == dt_dev) + return st; + } + return NULL; +} +EXPORT_SYMBOL(lookup_sub_thandle); + +struct sub_thandle *create_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev) +{ + struct sub_thandle *st; + + OBD_ALLOC_PTR(st); + if (st == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + INIT_LIST_HEAD(&st->st_sub_list); + INIT_LIST_HEAD(&st->st_cookie_list); + st->st_dt = dt_dev; + + list_add(&st->st_sub_list, &tmt->tmt_sub_thandle_list); + return st; +} + +static void sub_trans_commit_cb_internal(struct top_multiple_thandle *tmt, + struct thandle *sub_th, int err) +{ + struct sub_thandle *st; + bool all_committed = true; + + /* Check if all sub thandles are committed */ + spin_lock(&tmt->tmt_sub_lock); + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th == sub_th) { + st->st_committed = 1; + st->st_result = err; + } + if (!st->st_committed) + all_committed = false; + } + spin_unlock(&tmt->tmt_sub_lock); + + if (tmt->tmt_result == 0) + tmt->tmt_result = err; + + if (all_committed) + top_trans_committed_cb(tmt); + + top_multiple_thandle_dump(tmt, D_INFO); + top_multiple_thandle_put(tmt); + RETURN_EXIT; +} + +/** + * sub thandle commit callback + * + * Mark the sub thandle to be committed and if all sub thandle are committed + * notify the top thandle. + * + * \param[in] env execution environment + * \param[in] sub_th sub thandle being committed + * \param[in] cb commit callback + * \param[in] err trans result + */ +static void sub_trans_commit_cb(struct lu_env *env, + struct thandle *sub_th, + struct dt_txn_commit_cb *cb, int err) +{ + struct top_multiple_thandle *tmt = cb->dcb_data; + + sub_trans_commit_cb_internal(tmt, sub_th, err); +} + +static void sub_thandle_register_commit_cb(struct sub_thandle *st, + struct top_multiple_thandle *tmt) +{ + LASSERT(st->st_sub_th != NULL); + top_multiple_thandle_get(tmt); + st->st_commit_dcb.dcb_func = sub_trans_commit_cb; + st->st_commit_dcb.dcb_data = tmt; + INIT_LIST_HEAD(&st->st_commit_dcb.dcb_linkage); + dt_trans_cb_add(st->st_sub_th, &st->st_commit_dcb); +} + +/** + * Sub thandle stop call back + * + * After sub thandle is stopped, it will call this callback to notify + * the top thandle. 
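As an illustrative aside: sub_trans_commit_cb_internal() above marks one sub thandle committed and fires the top-level committed callback only once every sub thandle has reported in, keeping the first non-zero result. A standalone pthread sketch of that "last one in notifies" pattern (simplified to a fixed array of flags instead of the sub thandle list):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define NSUB 3

	struct top_txn {
		pthread_mutex_t lock;
		bool sub_committed[NSUB];
		bool committed;		/* set once all subs committed */
		int result;		/* first non-zero sub result wins */
	};

	static void top_committed_cb(struct top_txn *t)
	{
		printf("top transaction committed, result = %d\n", t->result);
	}

	/* Commit callback of one sub transaction. */
	static void sub_commit_cb(struct top_txn *t, int idx, int err)
	{
		bool all = true, fire = false;
		int i;

		pthread_mutex_lock(&t->lock);
		t->sub_committed[idx] = true;
		if (t->result == 0)
			t->result = err;
		for (i = 0; i < NSUB; i++)
			if (!t->sub_committed[i])
				all = false;
		if (all && !t->committed) {
			t->committed = true;
			fire = true;	/* this caller saw the last commit */
		}
		pthread_mutex_unlock(&t->lock);

		/* Only the caller that flipped the flag runs the callback. */
		if (fire)
			top_committed_cb(t);
	}

	int main(void)
	{
		struct top_txn t = { .lock = PTHREAD_MUTEX_INITIALIZER };

		sub_commit_cb(&t, 0, 0);
		sub_commit_cb(&t, 1, 0);
		sub_commit_cb(&t, 2, 0);	/* last sub triggers the callback */
		return 0;
	}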
+ * + * \param[in] th sub thandle to be stopped + * \param[in] rc result of sub trans + */ +static void sub_trans_stop_cb(struct lu_env *env, + struct thandle *sub_th, + struct dt_txn_commit_cb *cb, int err) +{ + struct sub_thandle *st; + struct top_multiple_thandle *tmt = cb->dcb_data; + ENTRY; + + spin_lock(&tmt->tmt_sub_lock); + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_stopped) + continue; + + if (st->st_dt == sub_th->th_dev) { + st->st_stopped = 1; + st->st_result = err; + break; + } + } + spin_unlock(&tmt->tmt_sub_lock); + + wake_up(&tmt->tmt_stop_waitq); + RETURN_EXIT; +} + +static void sub_thandle_register_stop_cb(struct sub_thandle *st, + struct top_multiple_thandle *tmt) +{ + st->st_stop_dcb.dcb_func = sub_trans_stop_cb; + st->st_stop_dcb.dcb_data = tmt; + st->st_stop_dcb.dcb_flags = DCB_TRANS_STOP; + INIT_LIST_HEAD(&st->st_stop_dcb.dcb_linkage); + dt_trans_cb_add(st->st_sub_th, &st->st_stop_dcb); +} + +/** + * Create sub thandle + * + * Create transaction handle for sub_thandle + * + * \param[in] env execution environment + * \param[in] th top thandle + * \param[in] st sub_thandle + * + * \retval 0 if creation succeeds. + * \retval negative errno if creation fails. + */ +int sub_thandle_trans_create(const struct lu_env *env, + struct top_thandle *top_th, + struct sub_thandle *st) +{ + struct thandle *sub_th; + + sub_th = dt_trans_create(env, st->st_dt); + if (IS_ERR(sub_th)) + return PTR_ERR(sub_th); + + sub_th->th_top = &top_th->tt_super; + st->st_sub_th = sub_th; + + sub_th->th_wait_submit = 1; + sub_thandle_register_stop_cb(st, top_th->tt_multiple_thandle); + return 0; +} + +/** + * Create the top transaction. + * + * Create the top transaction on the master device. It will create a top + * thandle and a sub thandle on the master device. + * + * \param[in] env execution environment + * \param[in] master_dev master_dev the top thandle will be created + * + * \retval pointer to the created thandle. + * \retval ERR_PTR(errno) if creation failed. + */ +struct thandle * +top_trans_create(const struct lu_env *env, struct dt_device *master_dev) +{ + struct top_thandle *top_th; + struct thandle *child_th; + + OBD_ALLOC_GFP(top_th, sizeof(*top_th), __GFP_IO); + if (top_th == NULL) + return ERR_PTR(-ENOMEM); + + top_th->tt_super.th_top = &top_th->tt_super; + + if (master_dev != NULL) { + child_th = dt_trans_create(env, master_dev); + if (IS_ERR(child_th)) { + OBD_FREE_PTR(top_th); + return child_th; + } + + child_th->th_top = &top_th->tt_super; + child_th->th_wait_submit = 1; + top_th->tt_master_sub_thandle = child_th; + } + return &top_th->tt_super; +} +EXPORT_SYMBOL(top_trans_create); + +/** + * Declare write update transaction + * + * Check if there are updates being recorded in this transaction, + * it will write the record into the disk. 
+ * + * \param[in] env execution environment + * \param[in] tmt top multiple transaction handle + * + * \retval 0 if writing succeeds + * \retval negative errno if writing fails + */ +static int declare_updates_write(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct llog_update_record *record; + struct sub_thandle *st; + int rc = 0; + + record = tmt->tmt_update_records->tur_update_records; + /* Declare update write for all other target */ + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th == NULL) + continue; + + rc = sub_declare_updates_write(env, record, st->st_sub_th, + tmt->tmt_record_size); + if (rc < 0) + break; + } + + return rc; +} + +/** + * Assign batchid to the distribute transaction. + * + * Assign batchid to the distribute transaction + * + * \param[in] tmt distribute transaction + */ +static void distribute_txn_assign_batchid(struct top_multiple_thandle *new) +{ + struct target_distribute_txn_data *tdtd; + struct dt_device *dt = new->tmt_master_sub_dt; + struct sub_thandle *st; + + LASSERT(dt != NULL); + tdtd = dt2lu_dev(dt)->ld_site->ls_tgt->lut_tdtd; + spin_lock(&tdtd->tdtd_batchid_lock); + new->tmt_batchid = tdtd->tdtd_batchid++; + list_add_tail(&new->tmt_commit_list, &tdtd->tdtd_list); + spin_unlock(&tdtd->tdtd_batchid_lock); + list_for_each_entry(st, &new->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th != NULL) + sub_thandle_register_commit_cb(st, new); + } + top_multiple_thandle_get(new); + top_multiple_thandle_dump(new, D_INFO); +} + +/** + * Insert distribute transaction to the distribute txn list. + * + * Insert distribute transaction to the distribute txn list. + * + * \param[in] new the distribute txn to be inserted. + */ +void distribute_txn_insert_by_batchid(struct top_multiple_thandle *new) +{ + struct dt_device *dt = new->tmt_master_sub_dt; + struct top_multiple_thandle *tmt; + struct target_distribute_txn_data *tdtd; + struct sub_thandle *st; + bool at_head = false; + + LASSERT(dt != NULL); + tdtd = dt2lu_dev(dt)->ld_site->ls_tgt->lut_tdtd; + + spin_lock(&tdtd->tdtd_batchid_lock); + list_for_each_entry_reverse(tmt, &tdtd->tdtd_list, tmt_commit_list) { + if (new->tmt_batchid > tmt->tmt_batchid) { + list_add(&new->tmt_commit_list, &tmt->tmt_commit_list); + break; + } + } + if (list_empty(&new->tmt_commit_list)) { + at_head = true; + list_add(&new->tmt_commit_list, &tdtd->tdtd_list); + } + spin_unlock(&tdtd->tdtd_batchid_lock); + + list_for_each_entry(st, &new->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th != NULL) + sub_thandle_register_commit_cb(st, new); + } + + top_multiple_thandle_get(new); + top_multiple_thandle_dump(new, D_INFO); + if (new->tmt_committed && at_head && tdtd->tdtd_commit_task) + wake_up_process(tdtd->tdtd_commit_task); +} + +/** + * Prepare cross-MDT operation. + * + * Create the update record buffer to record updates for cross-MDT operation, + * add master sub transaction to tt_sub_trans_list, and declare the update + * writes. + * + * During updates packing, all of parameters will be packed in + * tur_update_params, and updates will be packed in tur_update_records. + * Then in transaction stop, parameters and updates will be merged + * into one updates buffer. + * + * And also master thandle will be added to the sub_th list, so it will be + * easy to track the commit status. + * + * \param[in] env execution environment + * \param[in] th top transaction handle + * + * \retval 0 if preparation succeeds. + * \retval negative errno if preparation fails. 
 + */ +static int prepare_multiple_node_trans(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct thandle_update_records *tur; + int rc; + ENTRY; + + if (tmt->tmt_update_records == NULL) { + tur = &update_env_info(env)->uti_tur; + rc = check_and_prepare_update_record(env, tur); + if (rc < 0) + RETURN(rc); + + tmt->tmt_update_records = tur; + distribute_txn_assign_batchid(tmt); + } + + rc = declare_updates_write(env, tmt); + + RETURN(rc); +} + +/** + * Start the top transaction. + * + * Start all of its sub transactions, then start the master sub transaction. + * + * \param[in] env execution environment + * \param[in] master_dev master_dev the top thandle will be started on + * \param[in] th top thandle + * + * \retval 0 if transaction start succeeds. + * \retval negative errno if start fails. + */ +int top_trans_start(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th) +{ + struct top_thandle *top_th = container_of(th, struct top_thandle, + tt_super); + struct sub_thandle *st; + struct top_multiple_thandle *tmt = top_th->tt_multiple_thandle; + int rc = 0; + ENTRY; + + if (tmt == NULL) { + if (th->th_sync) + top_th->tt_master_sub_thandle->th_sync = th->th_sync; + if (th->th_local) + top_th->tt_master_sub_thandle->th_local = th->th_local; + rc = dt_trans_start(env, top_th->tt_master_sub_thandle->th_dev, + top_th->tt_master_sub_thandle); + RETURN(rc); + } + + tmt = top_th->tt_multiple_thandle; + rc = prepare_multiple_node_trans(env, tmt); + if (rc < 0) + RETURN(rc); + + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th == NULL) + continue; + if (th->th_sync) + st->st_sub_th->th_sync = th->th_sync; + if (th->th_local) + st->st_sub_th->th_local = th->th_local; + rc = dt_trans_start(env, st->st_sub_th->th_dev, + st->st_sub_th); + if (rc != 0) + GOTO(out, rc); + + LASSERT(st->st_started == 0); + st->st_started = 1; + } +out: + th->th_result = rc; + RETURN(rc); +} +EXPORT_SYMBOL(top_trans_start); + +/** + * Check whether we need to write the updates record + * + * Check if the updates for the top_thandle need to be written + * to all targets. The updates are only written if the transaction + * succeeds and the update count is at least 2. + * + * \param[in] top_th top thandle. + * + * \retval true if it needs to write updates + * \retval false if it does not need to write updates + **/ +static bool top_check_write_updates(struct top_thandle *top_th) +{ + struct top_multiple_thandle *tmt; + struct thandle_update_records *tur; + + /* Do not write updates to records if the transaction fails */ + if (top_th->tt_super.th_result != 0) + return false; + + tmt = top_th->tt_multiple_thandle; + if (tmt == NULL) + return false; + + tur = tmt->tmt_update_records; + if (tur == NULL) + return false; + + /* False update records: a cross-MDT operation + * should include both local and remote updates, so the + * updates count should be >= 2 */ + if (tur->tur_update_records == NULL || + tur->tur_update_records->lur_update_rec.ur_update_count <= 1) + return false; + + return true; +} + +/** + * Check if the top transaction is stopped + * + * The top transaction is considered stopped only when all of its + * sub transactions are stopped. + * + * \param[in] top_th top thandle + * + * \retval true if the top transaction is stopped. + * \retval false if the top transaction is not stopped. 
+ */ +static bool top_trans_is_stopped(struct top_thandle *top_th) +{ + struct top_multiple_thandle *tmt; + struct sub_thandle *st; + bool all_stopped = true; + + tmt = top_th->tt_multiple_thandle; + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (!st->st_stopped && st->st_sub_th != NULL) { + all_stopped = false; + break; + } + + if (st->st_result != 0 && + top_th->tt_super.th_result == 0) + top_th->tt_super.th_result = st->st_result; + } + + return all_stopped; +} + +/** + * Wait result of top transaction + * + * Wait until all sub transaction get its result. + * + * \param [in] top_th top thandle. + * + * \retval the result of top thandle. + */ +static int top_trans_wait_result(struct top_thandle *top_th) +{ + wait_event_idle(top_th->tt_multiple_thandle->tmt_stop_waitq, + top_trans_is_stopped(top_th)); + + RETURN(top_th->tt_super.th_result); +} + +/** + * Stop the top transaction. + * + * Stop the transaction on the master device first, then stop transactions + * on other sub devices. + * + * \param[in] env execution environment + * \param[in] master_dev master_dev the top thandle will be created + * \param[in] th top thandle + * + * \retval 0 if stop transaction succeeds. + * \retval negative errno if stop transaction fails. + */ +int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th) +{ + struct top_thandle *top_th = container_of(th, struct top_thandle, + tt_super); + struct sub_thandle *st; + struct sub_thandle *master_st; + struct top_multiple_thandle *tmt; + struct thandle_update_records *tur; + bool write_updates = false; + int rc = 0; + ENTRY; + + if (likely(top_th->tt_multiple_thandle == NULL)) { + LASSERT(master_dev != NULL); + + if (th->th_sync) + top_th->tt_master_sub_thandle->th_sync = th->th_sync; + if (th->th_local) + top_th->tt_master_sub_thandle->th_local = th->th_local; + rc = dt_trans_stop(env, master_dev, + top_th->tt_master_sub_thandle); + OBD_FREE_PTR(top_th); + RETURN(rc); + } + + tmt = top_th->tt_multiple_thandle; + tur = tmt->tmt_update_records; + + /* Note: we need stop the master thandle first, then the stop + * callback will fill the master transno in the update logs, + * then these update logs will be sent to other MDTs */ + /* get the master sub thandle */ + master_st = lookup_sub_thandle(tmt, tmt->tmt_master_sub_dt); + write_updates = top_check_write_updates(top_th); + + /* Step 1: write the updates log on Master MDT */ + if (master_st != NULL && master_st->st_sub_th != NULL && + write_updates) { + struct llog_update_record *lur; + + /* Merge the parameters and updates into one buffer */ + rc = prepare_writing_updates(env, tmt); + if (rc < 0) { + CERROR("%s: cannot prepare updates: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); + th->th_result = rc; + write_updates = false; + GOTO(stop_master_trans, rc); + } + + lur = tur->tur_update_records; + /* Write updates to the master MDT */ + rc = sub_updates_write(env, lur, master_st); + + /* Cleanup the common parameters in the update records, + * master transno callback might add more parameters. 
 + * and we need to merge the update records again in the + * following */ + if (tur->tur_update_params != NULL) + lur->lur_update_rec.ur_param_count = 0; + + if (rc < 0) { + CERROR("%s: write updates failed: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); + th->th_result = rc; + write_updates = false; + GOTO(stop_master_trans, rc); + } + } + +stop_master_trans: + /* Step 2: Stop the transaction on the master MDT, and fill the + * master transno in the update logs sent to other MDTs. */ + if (master_st != NULL && master_st->st_sub_th != NULL) { + if (th->th_local) + master_st->st_sub_th->th_local = th->th_local; + if (th->th_sync) + master_st->st_sub_th->th_sync = th->th_sync; + master_st->st_sub_th->th_result = th->th_result; + rc = dt_trans_stop(env, master_st->st_dt, master_st->st_sub_th); + /* If updates are not written, then call the commit callback + * here, otherwise the callback is done through + * osd(osp)_trans_commit_cb() */ + if (!master_st->st_started && + !list_empty(&tmt->tmt_commit_list)) + sub_trans_commit_cb_internal(tmt, + master_st->st_sub_th, rc); + if (rc < 0) { + CERROR("%s: stop trans failed: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); + th->th_result = rc; + GOTO(stop_other_trans, rc); + } else if (tur != NULL && tur->tur_update_records != NULL) { + struct llog_update_record *lur; + + lur = tur->tur_update_records; + if (lur->lur_update_rec.ur_master_transno == 0) + /* Update master transno after the master stop + * callback */ + lur->lur_update_rec.ur_master_transno = + tgt_th_info(env)->tti_transno; + } + } + + /* Step 3: write updates to other MDTs */ + if (write_updates) { + struct llog_update_record *lur; + if (OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) { + if (cfs_fail_val == 1) { + long timeout = cfs_time_seconds(1) / 10; + + OBD_RACE(OBD_FAIL_OUT_OBJECT_MISS); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(timeout); + cfs_fail_loc = 0; + } + cfs_fail_val++; + } + + /* The stop callback of the master will add more updates and also + * update the master transno, so merge the parameters and updates + * into one buffer again */ + rc = prepare_writing_updates(env, tmt); + if (rc < 0) { + CERROR("%s: prepare updates failed: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); + th->th_result = rc; + GOTO(stop_other_trans, rc); + } + lur = tur->tur_update_records; + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, + st_sub_list) { + if (st->st_sub_th == NULL || st == master_st || + st->st_sub_th->th_result < 0) + continue; + + rc = sub_updates_write(env, lur, st); + if (rc < 0) { + CERROR("%s: write updates failed: rc = %d\n", + st->st_dt->dd_lu_dev.ld_obd->obd_name, + rc); + th->th_result = rc; + break; + } + } + } + +stop_other_trans: + /* Step 4: Stop the transaction on other MDTs */ + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st == master_st || st->st_sub_th == NULL) + continue; + + if (th->th_sync) + st->st_sub_th->th_sync = th->th_sync; + if (th->th_local) + st->st_sub_th->th_local = th->th_local; + st->st_sub_th->th_result = th->th_result; + rc = dt_trans_stop(env, st->st_sub_th->th_dev, + st->st_sub_th); + if (rc < 0) { + CERROR("%s: stop trans failed: rc = %d\n", + st->st_dt->dd_lu_dev.ld_obd->obd_name, rc); + if (th->th_result == 0) + th->th_result = rc; + } + } + + rc = top_trans_wait_result(top_th); + + tmt->tmt_result = rc; + + /* Balance the refcount taken in top_trans_create. Note: if it is NOT a + * multiple node transaction, the top transaction will be destroyed. 
*/ + top_multiple_thandle_put(tmt); + OBD_FREE_PTR(top_th); + RETURN(rc); +} +EXPORT_SYMBOL(top_trans_stop); + +/** + * Create top_multiple_thandle for top_thandle + * + * Create top_mutilple_thandle to manage the mutiple node transaction + * for top_thandle, and it also needs to add master sub thandle to the + * sub trans list now. + * + * \param[in] env execution environment + * \param[in] top_th the top thandle + * + * \retval 0 if creation succeeds + * \retval negative errno if creation fails + */ +int top_trans_create_tmt(const struct lu_env *env, + struct top_thandle *top_th) +{ + struct top_multiple_thandle *tmt; + + OBD_ALLOC_PTR(tmt); + if (tmt == NULL) + return -ENOMEM; + + tmt->tmt_magic = TOP_THANDLE_MAGIC; + INIT_LIST_HEAD(&tmt->tmt_sub_thandle_list); + INIT_LIST_HEAD(&tmt->tmt_commit_list); + atomic_set(&tmt->tmt_refcount, 1); + spin_lock_init(&tmt->tmt_sub_lock); + init_waitqueue_head(&tmt->tmt_stop_waitq); + + top_th->tt_multiple_thandle = tmt; + + return 0; +} + +static struct sub_thandle * +create_sub_thandle_with_thandle(struct top_thandle *top_th, + struct thandle *sub_th) +{ + struct sub_thandle *st; + + /* create and init sub th to the top trans list */ + st = create_sub_thandle(top_th->tt_multiple_thandle, + sub_th->th_dev); + if (IS_ERR(st)) + return st; + + st->st_sub_th = sub_th; + + sub_th->th_top = &top_th->tt_super; + sub_thandle_register_stop_cb(st, top_th->tt_multiple_thandle); + return st; +} + +/** + * Get sub thandle. + * + * Get sub thandle from the top thandle according to the sub dt_device. + * + * \param[in] env execution environment + * \param[in] th thandle on the top layer. + * \param[in] sub_dt sub dt_device used to get sub transaction + * + * \retval thandle of sub transaction if succeed + * \retval PTR_ERR(errno) if failed + */ +struct thandle *thandle_get_sub_by_dt(const struct lu_env *env, + struct thandle *th, + struct dt_device *sub_dt) +{ + struct sub_thandle *st = NULL; + struct sub_thandle *master_st = NULL; + struct top_thandle *top_th; + struct thandle *sub_th = NULL; + int rc = 0; + ENTRY; + + top_th = container_of(th, struct top_thandle, tt_super); + + if (likely(sub_dt == top_th->tt_master_sub_thandle->th_dev)) + RETURN(top_th->tt_master_sub_thandle); + + if (top_th->tt_multiple_thandle != NULL) { + st = lookup_sub_thandle(top_th->tt_multiple_thandle, sub_dt); + if (st != NULL) + RETURN(st->st_sub_th); + } + + sub_th = dt_trans_create(env, sub_dt); + if (IS_ERR(sub_th)) + RETURN(sub_th); + + /* Create top_multiple_thandle if necessary */ + if (top_th->tt_multiple_thandle == NULL) { + struct top_multiple_thandle *tmt; + + rc = top_trans_create_tmt(env, top_th); + if (rc < 0) + GOTO(stop_trans, rc); + + tmt = top_th->tt_multiple_thandle; + + /* Add master sub th to the top trans list */ + tmt->tmt_master_sub_dt = + top_th->tt_master_sub_thandle->th_dev; + master_st = create_sub_thandle_with_thandle(top_th, + top_th->tt_master_sub_thandle); + if (IS_ERR(master_st)) { + rc = PTR_ERR(master_st); + master_st = NULL; + GOTO(stop_trans, rc); + } + } + + /* create and init sub th to the top trans list */ + st = create_sub_thandle_with_thandle(top_th, sub_th); + if (IS_ERR(st)) { + rc = PTR_ERR(st); + st = NULL; + GOTO(stop_trans, rc); + } + st->st_sub_th->th_wait_submit = 1; +stop_trans: + if (rc < 0) { + if (master_st != NULL) { + list_del(&master_st->st_sub_list); + OBD_FREE_PTR(master_st); + } + sub_th->th_result = rc; + dt_trans_stop(env, sub_dt, sub_th); + sub_th = ERR_PTR(rc); + } + + RETURN(sub_th); +} 
+EXPORT_SYMBOL(thandle_get_sub_by_dt); + +/** + * Top multiple thandle destroy + * + * Destroy multiple thandle and all its sub thandle. + * + * \param[in] tmt top_multiple_thandle to be destroyed. + */ +void top_multiple_thandle_destroy(struct top_multiple_thandle *tmt) +{ + struct sub_thandle *st; + struct sub_thandle *tmp; + + LASSERT(tmt->tmt_magic == TOP_THANDLE_MAGIC); + list_for_each_entry_safe(st, tmp, &tmt->tmt_sub_thandle_list, + st_sub_list) { + struct sub_thandle_cookie *stc; + struct sub_thandle_cookie *tmp; + + list_del(&st->st_sub_list); + list_for_each_entry_safe(stc, tmp, &st->st_cookie_list, + stc_list) { + list_del(&stc->stc_list); + OBD_FREE_PTR(stc); + } + OBD_FREE_PTR(st); + } + OBD_FREE_PTR(tmt); +} +EXPORT_SYMBOL(top_multiple_thandle_destroy); + +/** + * Cancel the update log on MDTs + * + * Cancel the update log on MDTs then destroy the thandle. + * + * \param[in] env execution environment + * \param[in] tmt the top multiple thandle whose updates records + * will be cancelled. + * + * \retval 0 if cancellation succeeds. + * \retval negative errno if cancellation fails. + */ +static int distribute_txn_cancel_records(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct sub_thandle *st; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_TXN_NO_CANCEL)) + RETURN(0); + + top_multiple_thandle_dump(tmt, D_INFO); + /* Cancel update logs on other MDTs */ + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + struct llog_ctxt *ctxt; + struct obd_device *obd; + struct llog_cookie *cookie; + struct sub_thandle_cookie *stc; + int rc; + + obd = st->st_dt->dd_lu_dev.ld_obd; + ctxt = llog_get_context(obd, LLOG_UPDATELOG_ORIG_CTXT); + if (ctxt == NULL) + continue; + list_for_each_entry(stc, &st->st_cookie_list, stc_list) { + cookie = &stc->stc_cookie; + if (fid_is_zero(&cookie->lgc_lgl.lgl_oi.oi_fid)) + continue; + + rc = llog_cat_cancel_records(env, ctxt->loc_handle, 1, + cookie); + CDEBUG(D_HA, "%s: batchid %llu cancel update log " + DFID".%u: rc = %d\n", obd->obd_name, + tmt->tmt_batchid, + PFID(&cookie->lgc_lgl.lgl_oi.oi_fid), + cookie->lgc_index, rc); + } + + llog_ctxt_put(ctxt); + } + + RETURN(0); +} + +struct distribute_txn_bid_data { + struct dt_txn_commit_cb dtbd_cb; + struct target_distribute_txn_data *dtbd_tdtd; + __u64 dtbd_batchid; +}; + +/** + * callback of updating commit batchid + * + * Updating commit batchid then wake up the commit thread to cancel the + * records. 
+ * + * \param[in]env execution environment + * \param[in]th thandle to updating commit batchid + * \param[in]cb commit callback + * \param[in]err result of thandle + */ +static void distribute_txn_batchid_cb(struct lu_env *env, + struct thandle *th, + struct dt_txn_commit_cb *cb, + int err) +{ + struct distribute_txn_bid_data *dtbd = NULL; + struct target_distribute_txn_data *tdtd; + + dtbd = container_of(cb, struct distribute_txn_bid_data, dtbd_cb); + tdtd = dtbd->dtbd_tdtd; + + CDEBUG(D_HA, "%s: %llu batchid updated\n", + tdtd->tdtd_lut->lut_obd->obd_name, dtbd->dtbd_batchid); + spin_lock(&tdtd->tdtd_batchid_lock); + if (dtbd->dtbd_batchid > tdtd->tdtd_committed_batchid && + !tdtd->tdtd_lut->lut_obd->obd_no_transno) + tdtd->tdtd_committed_batchid = dtbd->dtbd_batchid; + spin_unlock(&tdtd->tdtd_batchid_lock); + if (atomic_dec_and_test(&tdtd->tdtd_refcount)) + wake_up_process(tdtd->tdtd_commit_task); + + OBD_FREE_PTR(dtbd); +} + +/** + * Update the commit batchid in disk + * + * Update commit batchid in the disk, after this is committed, it can start + * to cancel the update records. + * + * \param[in] env execution environment + * \param[in] tdtd distribute transaction structure + * \param[in] batchid commit batchid to be updated + * + * \retval 0 if update succeeds. + * \retval negative errno if update fails. + */ +static int +distribute_txn_commit_batchid_update(const struct lu_env *env, + struct target_distribute_txn_data *tdtd, + __u64 batchid) +{ + struct distribute_txn_bid_data *dtbd = NULL; + struct thandle *th; + struct lu_buf buf; + __u64 tmp; + __u64 off; + int rc; + ENTRY; + + OBD_ALLOC_PTR(dtbd); + if (dtbd == NULL) + RETURN(-ENOMEM); + dtbd->dtbd_batchid = batchid; + dtbd->dtbd_tdtd = tdtd; + dtbd->dtbd_cb.dcb_func = distribute_txn_batchid_cb; + atomic_inc(&tdtd->tdtd_refcount); + + th = dt_trans_create(env, tdtd->tdtd_lut->lut_bottom); + if (IS_ERR(th)) { + atomic_dec(&tdtd->tdtd_refcount); + OBD_FREE_PTR(dtbd); + RETURN(PTR_ERR(th)); + } + + tmp = cpu_to_le64(batchid); + buf.lb_buf = &tmp; + buf.lb_len = sizeof(tmp); + off = 0; + + rc = dt_declare_record_write(env, tdtd->tdtd_batchid_obj, &buf, off, + th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, tdtd->tdtd_lut->lut_bottom, th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_cb_add(th, &dtbd->dtbd_cb); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_record_write(env, tdtd->tdtd_batchid_obj, &buf, + &off, th); + + CDEBUG(D_INFO, "%s: update batchid %llu: rc = %d\n", + tdtd->tdtd_lut->lut_obd->obd_name, batchid, rc); + +stop: + dt_trans_stop(env, tdtd->tdtd_lut->lut_bottom, th); + if (rc < 0) { + atomic_dec(&tdtd->tdtd_refcount); + OBD_FREE_PTR(dtbd); + } + RETURN(rc); +} + +/** + * Init commit batchid for distribute transaction. + * + * Initialize the batchid object and get commit batchid from the object. + * + * \param[in] env execution environment + * \param[in] tdtd distribute transaction whose batchid is initialized. + * + * \retval 0 if initialization succeeds. + * \retval negative errno if initialization fails. 
+ **/ +static int +distribute_txn_commit_batchid_init(const struct lu_env *env, + struct target_distribute_txn_data *tdtd) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct lu_target *lut = tdtd->tdtd_lut; + struct lu_attr *attr = &tti->tti_attr; + struct lu_fid *fid = &tti->tti_fid1; + struct dt_object_format *dof = &tti->tti_u.update.tti_update_dof; + struct dt_object *dt_obj = NULL; + struct lu_buf buf; + __u64 tmp; + __u64 off; + int rc; + ENTRY; + + memset(attr, 0, sizeof(*attr)); + attr->la_valid = LA_MODE; + attr->la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dof->dof_type = dt_mode_to_dft(S_IFREG); + + lu_local_obj_fid(fid, BATCHID_COMMITTED_OID); + + dt_obj = dt_find_or_create(env, lut->lut_bottom, fid, dof, + attr); + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + dt_obj = NULL; + GOTO(out_put, rc); + } + + tdtd->tdtd_batchid_obj = dt_obj; + + buf.lb_buf = &tmp; + buf.lb_len = sizeof(tmp); + off = 0; + rc = dt_read(env, dt_obj, &buf, &off); + if (rc < 0 || (rc < buf.lb_len && rc > 0)) { + CERROR("%s can't read last committed batchid: rc = %d\n", + tdtd->tdtd_lut->lut_obd->obd_name, rc); + if (rc > 0) + rc = -EINVAL; + GOTO(out_put, rc); + } else if (rc == buf.lb_len) { + tdtd->tdtd_committed_batchid = le64_to_cpu(tmp); + CDEBUG(D_HA, "%s: committed batchid %llu\n", + tdtd->tdtd_lut->lut_obd->obd_name, + tdtd->tdtd_committed_batchid); + rc = 0; + } + +out_put: + if (rc < 0 && dt_obj != NULL) { + dt_object_put(env, dt_obj); + tdtd->tdtd_batchid_obj = NULL; + } + return rc; +} + +#ifndef TASK_IDLE +#define TASK_IDLE TASK_INTERRUPTIBLE +#endif + +/** + * manage the distribute transaction thread + * + * Distribute transaction are linked to the list, and once the distribute + * transaction is committed, it will update the last committed batchid first, + * after it is committed, it will cancel the records. + * + * \param[in] _arg argument for commit thread + * + * \retval 0 if thread is running successfully + * \retval negative errno if the thread can not be run. + */ +static int distribute_txn_commit_thread(void *_arg) +{ + struct target_distribute_txn_data *tdtd = _arg; + struct lu_env *env = &tdtd->tdtd_env; + LIST_HEAD(list); + int rc; + struct top_multiple_thandle *tmt; + struct top_multiple_thandle *tmp; + __u64 batchid = 0, committed; + + ENTRY; + + + CDEBUG(D_HA, "%s: start commit thread committed batchid %llu\n", + tdtd->tdtd_lut->lut_obd->obd_name, + tdtd->tdtd_committed_batchid); + + while (({set_current_state(TASK_IDLE); + !kthread_should_stop(); })) { + spin_lock(&tdtd->tdtd_batchid_lock); + list_for_each_entry_safe(tmt, tmp, &tdtd->tdtd_list, + tmt_commit_list) { + if (tmt->tmt_committed == 0) + break; + + /* Note: right now, replay is based on master MDT + * transno, but cancellation is based on batchid. + * so we do not try to cancel the update log until + * the recoverying is done, unless the update records + * batchid < committed_batchid. */ + if (tmt->tmt_batchid <= tdtd->tdtd_committed_batchid) { + __set_current_state(TASK_RUNNING); + list_move_tail(&tmt->tmt_commit_list, &list); + } else if (!tdtd->tdtd_lut->lut_obd->obd_recovering) { + __set_current_state(TASK_RUNNING); + LASSERTF(tmt->tmt_batchid >= batchid, + "tmt %p tmt_batchid: %llu, batchid " + "%llu\n", tmt, tmt->tmt_batchid, + batchid); + /* There are three types of distribution + * transaction result + * + * 1. If tmt_result < 0, it means the + * distribution transaction fails, which should + * be rare, because once declare phase succeeds, + * the operation should succeeds anyway. 
Note in + * this case, we will still update batchid so + * cancellation would be stopped. + * + * 2. If tmt_result == 0, it means the + * distribution transaction succeeds, and we + * will update batchid. + * + * 3. If tmt_result > 0, it means distribute + * transaction is not yet committed on every + * node, but we need release this tmt before + * that, which usuually happens during umount. + */ + if (tmt->tmt_result <= 0) + batchid = tmt->tmt_batchid; + list_move_tail(&tmt->tmt_commit_list, &list); + } + } + spin_unlock(&tdtd->tdtd_batchid_lock); + + CDEBUG(D_HA, "%s: batchid: %llu committed batchid " + "%llu\n", tdtd->tdtd_lut->lut_obd->obd_name, batchid, + tdtd->tdtd_committed_batchid); + /* update globally committed on a storage */ + if (batchid > tdtd->tdtd_committed_batchid) { + rc = distribute_txn_commit_batchid_update(env, tdtd, + batchid); + if (rc == 0) + batchid = 0; + } + /* cancel the records for committed batchid's */ + /* XXX: should we postpone cancel's till the end of recovery? */ + committed = tdtd->tdtd_committed_batchid; + list_for_each_entry_safe(tmt, tmp, &list, tmt_commit_list) { + if (tmt->tmt_batchid > committed) + break; + __set_current_state(TASK_RUNNING); + list_del_init(&tmt->tmt_commit_list); + if (tmt->tmt_result <= 0) + distribute_txn_cancel_records(env, tmt); + top_multiple_thandle_put(tmt); + } + + if (!task_is_running(current)) + schedule(); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(5)); + } + } + + while (({set_current_state(TASK_IDLE); + atomic_read(&tdtd->tdtd_refcount) != 0; })) + schedule(); + __set_current_state(TASK_RUNNING); + + spin_lock(&tdtd->tdtd_batchid_lock); + list_for_each_entry_safe(tmt, tmp, &tdtd->tdtd_list, + tmt_commit_list) + list_move_tail(&tmt->tmt_commit_list, &list); + spin_unlock(&tdtd->tdtd_batchid_lock); + + CDEBUG(D_INFO, "%s stopping distribute txn commit thread.\n", + tdtd->tdtd_lut->lut_obd->obd_name); + list_for_each_entry_safe(tmt, tmp, &list, tmt_commit_list) { + list_del_init(&tmt->tmt_commit_list); + top_multiple_thandle_dump(tmt, D_HA); + top_multiple_thandle_put(tmt); + } + RETURN(0); +} + +/** + * Start llog cancel thread + * + * Start llog cancel(master/slave) thread on LOD + * + * \param[in]lclt cancel log thread to be started. + * + * \retval 0 if the thread is started successfully. + * \retval negative errno if the thread is not being + * started. 
+ */ +int distribute_txn_init(const struct lu_env *env, + struct lu_target *lut, + struct target_distribute_txn_data *tdtd, + __u32 index) +{ + struct task_struct *task; + int rc; + ENTRY; + + INIT_LIST_HEAD(&tdtd->tdtd_list); + INIT_LIST_HEAD(&tdtd->tdtd_replay_finish_list); + INIT_LIST_HEAD(&tdtd->tdtd_replay_list); + spin_lock_init(&tdtd->tdtd_batchid_lock); + spin_lock_init(&tdtd->tdtd_replay_list_lock); + tdtd->tdtd_replay_handler = distribute_txn_replay_handle; + tdtd->tdtd_replay_ready = 0; + + tdtd->tdtd_batchid = lut->lut_last_transno + 1; + + init_waitqueue_head(&tdtd->tdtd_recovery_threads_waitq); + atomic_set(&tdtd->tdtd_refcount, 0); + atomic_set(&tdtd->tdtd_recovery_threads_count, 0); + + tdtd->tdtd_lut = lut; + if (lut->lut_bottom->dd_rdonly) + RETURN(0); + + rc = distribute_txn_commit_batchid_init(env, tdtd); + if (rc != 0) + RETURN(rc); + + rc = lu_env_init(&tdtd->tdtd_env, LCT_LOCAL | LCT_MD_THREAD); + if (rc) + RETURN(rc); + + task = kthread_create(distribute_txn_commit_thread, tdtd, "dist_txn-%u", + index); + if (IS_ERR(task)) { + lu_env_fini(&tdtd->tdtd_env); + RETURN(PTR_ERR(task)); + } + tdtd->tdtd_commit_task = task; + wake_up_process(task); + + RETURN(0); +} +EXPORT_SYMBOL(distribute_txn_init); + +/** + * Stop llog cancel thread + * + * Stop llog cancel(master/slave) thread on LOD and also destory + * all of transaction in the list. + * + * \param[in]lclt cancel log thread to be stopped. + */ +void distribute_txn_fini(const struct lu_env *env, + struct target_distribute_txn_data *tdtd) +{ + struct top_multiple_thandle *tmt; + LIST_HEAD(list); + + /* Stop cancel thread */ + if (!tdtd->tdtd_commit_task) + return; + + kthread_stop(tdtd->tdtd_commit_task); + tdtd->tdtd_commit_task = NULL; + + spin_lock(&tdtd->tdtd_batchid_lock); + list_splice_init(&tdtd->tdtd_list, &list); + spin_unlock(&tdtd->tdtd_batchid_lock); + + CDEBUG(D_INFO, "%s stopping distribute txn commit thread.\n", + tdtd->tdtd_lut->lut_obd->obd_name); + while ((tmt = list_first_entry_or_null(&list, + struct top_multiple_thandle, + tmt_commit_list)) != NULL) { + list_del_init(&tmt->tmt_commit_list); + top_multiple_thandle_dump(tmt, D_HA); + top_multiple_thandle_put(tmt); + } + + lu_env_fini(&tdtd->tdtd_env); + + dtrq_list_destroy(tdtd); + if (tdtd->tdtd_batchid_obj != NULL) { + dt_object_put(env, tdtd->tdtd_batchid_obj); + tdtd->tdtd_batchid_obj = NULL; + } +} +EXPORT_SYMBOL(distribute_txn_fini); diff --git a/drivers/staging/lustrefsx/undef.h b/drivers/staging/lustrefsx/undef.h new file mode 100644 index 0000000000000..324c3eb478bdb --- /dev/null +++ b/drivers/staging/lustrefsx/undef.h @@ -0,0 +1,1256 @@ + +/* enable libcfs CDEBUG, CWARN */ +#undef CDEBUG_ENABLED + +/* enable libcfs ENTRY/EXIT */ +#undef CDEBUG_ENTRY_EXIT + +/* enable page state tracking code */ +#undef CONFIG_DEBUG_PAGESTATE_TRACKING + +/* enable encryption for ldiskfs */ +#undef CONFIG_LDISKFS_FS_ENCRYPTION + +/* posix acls for ldiskfs */ +#undef CONFIG_LDISKFS_FS_POSIX_ACL + +/* enable rw access for ldiskfs */ +#undef CONFIG_LDISKFS_FS_RW + +/* fs security for ldiskfs */ +#undef CONFIG_LDISKFS_FS_SECURITY + +/* extened attributes for ldiskfs */ +#undef CONFIG_LDISKFS_FS_XATTR + +/* embedded llcrypt */ +#undef CONFIG_LL_ENCRYPTION + +/* enable invariant checking */ +#undef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK + +/* enable lu_ref reference tracking code */ +#undef CONFIG_LUSTRE_DEBUG_LU_REF + +/* Use the Pinger */ +#undef CONFIG_LUSTRE_FS_PINGER + +/* Enable POSIX acl */ +#undef CONFIG_LUSTRE_FS_POSIX_ACL + +/* name of ldiskfs debug 
program */ +#undef DEBUGFS + +/* name of ldiskfs dump program */ +#undef DUMPE2FS + +/* name of ldiskfs fsck program */ +#undef E2FSCK + +/* name of ldiskfs e2fsprogs package */ +#undef E2FSPROGS + +/* name of ldiskfs label program */ +#undef E2LABEL + +/* do data checksums */ +#undef ENABLE_CHECKSUM + +/* enable flock by default */ +#undef ENABLE_FLOCK + +/* filldir_t return type is bool or int */ +#undef FILLDIR_TYPE + +/* rhashtable_walk_init() has 3 args */ +#undef HAVE_3ARG_RHASHTABLE_WALK_INIT + +/* account_page_dirtied takes three arguments */ +#undef HAVE_ACCOUNT_PAGE_DIRTIED_3ARGS + +/* account_page_dirtied is exported */ +#undef HAVE_ACCOUNT_PAGE_DIRTIED_EXPORT + +/* 'get_acl' and 'set_acl' use dentry argument */ +#undef HAVE_ACL_WITH_DENTRY + +/* aes-sha2 is supported by krb5 */ +#undef HAVE_AES_SHA2_SUPPORT + +/* aio_complete defined */ +#undef HAVE_AIO_COMPLETE + +/* 'alloc_file_pseudo' exist */ +#undef HAVE_ALLOC_FILE_PSEUDO + +/* alloc_inode_sb() exists */ +#undef HAVE_ALLOC_INODE_SB + +/* struct address_space_operations() has migrate_folio() */ +#undef HAVE_AOPS_MIGRATE_FOLIO + +/* struct address_space_operations() has read_folio() */ +#undef HAVE_AOPS_READ_FOLIO + +/* struct address_space_operations() has release_folio() */ +#undef HAVE_AOPS_RELEASE_FOLIO + +/* Define to 1 if you have the header file. */ +#undef HAVE_ASM_TYPES_H + +/* backing_dev_info exist */ +#undef HAVE_BACKING_DEV_INFO + +/* BDI_CAP_MAP_COPY exist */ +#undef HAVE_BDI_CAP_MAP_COPY + +/* backing_dev_info has io_pages */ +#undef HAVE_BDI_IO_PAGES + +/* struct bio has bi_phys_segments member */ +#undef HAVE_BIO_BI_PHYS_SEGMENTS + +/* bio_endio takes only one argument */ +#undef HAVE_BIO_ENDIO_USES_ONE_ARG + +/* 'bio_integrity_enabled' is available */ +#undef HAVE_BIO_INTEGRITY_ENABLED + +/* kernel has bio_integrity_prep_fn */ +#undef HAVE_BIO_INTEGRITY_PREP_FN + +/* bio_integrity_prep_fn returns bool */ +#undef HAVE_BIO_INTEGRITY_PREP_FN_RETURNS_BOOL + +/* 'bio_set_dev' is available */ +#undef HAVE_BIO_SET_DEV + +/* bio_integrity_payload.bip_iter exist */ +#undef HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD + +/* Linux bitmap can be allocated */ +#undef HAVE_BITMAP_ALLOC + +/* 'bi_bdev' is available */ +#undef HAVE_BI_BDEV + +/* struct bio has bi_opf */ +#undef HAVE_BI_OPF + +/* 'bi_status' is available */ +#undef HAVE_BI_STATUS + +/* kernel has struct blk_integrity_iter */ +#undef HAVE_BLK_INTEGRITY_ITER + +/* kernel hash_64() is broken */ +#undef HAVE_BROKEN_HASH_64 + +/* kernel has struct bvec_iter */ +#undef HAVE_BVEC_ITER + +/* if bvec_iter_all exists for multi-page bvec iternation */ +#undef HAVE_BVEC_ITER_ALL + +/* struct cache_detail has writers */ +#undef HAVE_CACHE_DETAIL_WRITERS + +/* if cache_detail->hash_lock is a spinlock */ +#undef HAVE_CACHE_HASH_SPINLOCK + +/* cache_head has hlist cache_list */ +#undef HAVE_CACHE_HEAD_HLIST + +/* crypto/internal/cipher.h is present */ +#undef HAVE_CIPHER_H + +/* kernel has clean_bdev_aliases */ +#undef HAVE_CLEAN_BDEV_ALIASES + +/* 'clear_and_wake_up_bit' is available */ +#undef HAVE_CLEAR_AND_WAKE_UP_BIT + +/* compat rdma found */ +#undef HAVE_COMPAT_RDMA + +/* copy_file_range() is supported */ +#undef HAVE_COPY_FILE_RANGE + +/* 'cpus_read_lock' exist */ +#undef HAVE_CPUS_READ_LOCK + +/* crypto_alloc_skcipher is defined */ +#undef HAVE_CRYPTO_ALLOC_SKCIPHER + +/* crypto hash helper functions are available */ +#undef HAVE_CRYPTO_HASH_HELPERS + +/* 'CRYPTO_MAX_ALG_NAME' is 128 */ +#undef HAVE_CRYPTO_MAX_ALG_NAME_128 + +/* crypto/sha2.h is present */ +#undef 
HAVE_CRYPTO_SHA2_HEADER + +/* current_time() has replaced CURRENT_TIME */ +#undef HAVE_CURRENT_TIME + +/* Have db_dirty_records list_t */ +#undef HAVE_DB_DIRTY_RECORDS_LIST + +/* default_file_splice_read is exported */ +#undef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT + +/* delete_from_page_cache is exported */ +#undef HAVE_DELETE_FROM_PAGE_CACHE + +/* dentry.d_child exist */ +#undef HAVE_DENTRY_D_CHILD + +/* list dentry.d_u.d_alias exist */ +#undef HAVE_DENTRY_D_U_D_ALIAS + +/* DES3 enctype is supported by krb5 */ +#undef HAVE_DES3_SUPPORT + +/* direct_IO has 2 arguments */ +#undef HAVE_DIRECTIO_2ARGS + +/* direct IO uses iov_iter */ +#undef HAVE_DIRECTIO_ITER + +/* address_spaace_operaions->dirty_folio() member exists */ +#undef HAVE_DIRTY_FOLIO + +/* dir_context exist */ +#undef HAVE_DIR_CONTEXT + +/* Define to 1 if you have the header file. */ +#undef HAVE_DLFCN_H + +/* Have dmu_object_alloc_dnsize in ZFS */ +#undef HAVE_DMU_OBJECT_ALLOC_DNSIZE + +/* Have dmu_objset_disown() with 3 args */ +#undef HAVE_DMU_OBJSET_DISOWN_3ARG + +/* Have dmu_objset_own() with 6 args */ +#undef HAVE_DMU_OBJSET_OWN_6ARG + +/* Have dmu_offset_next() exported */ +#undef HAVE_DMU_OFFSET_NEXT + +/* Have 6 argument dmu_pretch in ZFS */ +#undef HAVE_DMU_PREFETCH_6ARG + +/* Have dmu_read_by_dnode() in ZFS */ +#undef HAVE_DMU_READ_BY_DNODE + +/* Have dmu_tx_hold_write_by_dnode() in ZFS */ +#undef HAVE_DMU_TX_HOLD_WRITE_BY_DNODE + +/* Have dmu_tx_hold_zap_by_dnode() in ZFS */ +#undef HAVE_DMU_TX_HOLD_ZAP_BY_DNODE + +/* Have dmu_tx_mark_netfree */ +#undef HAVE_DMU_TX_MARK_NETFREE + +/* Have native dnode accounting in ZFS */ +#undef HAVE_DMU_USEROBJ_ACCOUNTING + +/* Have dmu_write_by_dnode() in ZFS */ +#undef HAVE_DMU_WRITE_BY_DNODE + +/* down_write_killable function exists */ +#undef HAVE_DOWN_WRITE_KILLABLE + +/* quotactl_ops.set_dqblk takes struct kqid */ +#undef HAVE_DQUOT_KQID + +/* quotactl_ops.set_dqblk takes struct qc_dqblk */ +#undef HAVE_DQUOT_QC_DQBLK + +/* dquot_transfer() has user_ns argument */ +#undef HAVE_DQUOT_TRANSFER_WITH_USER_NS + +/* Have dsl_pool_config_enter/exit in ZFS */ +#undef HAVE_DSL_POOL_CONFIG + +/* Have dsl_sync_task_do_nowait in ZFS */ +#undef HAVE_DSL_SYNC_TASK_DO_NOWAIT + +/* d_compare need 4 arguments */ +#undef HAVE_D_COMPARE_4ARGS + +/* d_compare need 5 arguments */ +#undef HAVE_D_COMPARE_5ARGS + +/* d_count exist */ +#undef HAVE_D_COUNT + +/* 'd_init' exists */ +#undef HAVE_D_INIT + +/* d_in_lookup is defined */ +#undef HAVE_D_IN_LOOKUP + +/* 'd_is_positive' is available */ +#undef HAVE_D_IS_POSITIVE + +/* Define to 1 if you have the header file. */ +#undef HAVE_ENDIAN_H + +/* ethtool_link_settings is defined */ +#undef HAVE_ETHTOOL_LINK_SETTINGS + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_EXT2FS_EXT2FS_H + +/* ext4_bread takes 4 arguments */ +#undef HAVE_EXT4_BREAD_4ARGS + +/* ext4_(inc|dec)_count() has 2 arguments */ +#undef HAVE_EXT4_INC_DEC_COUNT_2ARGS + +/* i_dquot is in ext4_inode_info */ +#undef HAVE_EXT4_INFO_DQUOT + +/* ext4_free_blocks do not require struct buffer_head */ +#undef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD + +/* file handle and related syscalls are supported */ +#undef HAVE_FHANDLE_GLIBC_SUPPORT + +/* union is unnamed */ +#undef HAVE_FID2PATH_ANON_UNIONS + +/* filemap_get_folios_contig() is available */ +#undef HAVE_FILEMAP_GET_FOLIOS_CONTIG + +/* kernel has file_dentry */ +#undef HAVE_FILE_DENTRY + +/* file_operations.[read|write]_iter functions exist */ +#undef HAVE_FILE_OPERATIONS_READ_WRITE_ITER + +/* filldir_t needs struct dir_context as argument */ +#undef HAVE_FILLDIR_USE_CTX + +/* filldir_t needs struct dir_context and returns bool */ +#undef HAVE_FILLDIR_USE_CTX_RETURN_BOOL + +/* FMR pool API is available */ +#undef HAVE_FMR_POOL_API + +/* file_operations has iterate_shared */ +#undef HAVE_FOP_ITERATE_SHARED + +/* force_sig() has task parameter */ +#undef HAVE_FORCE_SIG_WITH_TASK + +/* 'struct fscrypt_digested_name' exists */ +#undef HAVE_FSCRYPT_DIGESTED_NAME + +/* embedded llcrypt uses llcrypt_dummy_context_enabled() */ +#undef HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED + +/* fscrypt_is_nokey_name() exists */ +#undef HAVE_FSCRYPT_IS_NOKEY_NAME + +/* full_name_hash need 3 arguments */ +#undef HAVE_FULL_NAME_HASH_3ARGS + +/* generic_write_sync has 2 arguments */ +#undef HAVE_GENERIC_WRITE_SYNC_2ARGS + +/* struct genl_dumpit_info has family field */ +#undef HAVE_GENL_DUMPIT_INFO + +/* Define to 1 if you have the `gethostbyname' function. */ +#undef HAVE_GETHOSTBYNAME + +/* 'get_acl' has a rcu argument */ +#undef HAVE_GET_ACL_RCU_ARG + +/* get_inode_usage function exists */ +#undef HAVE_GET_INODE_USAGE + +/* get_random_[u32|u64] are available */ +#undef HAVE_GET_RANDOM_U32_AND_U64 + +/* get_random_u32_below() is available */ +#undef HAVE_GET_RANDOM_U32_BELOW + +/* get_request_key_auth() is available */ +#undef HAVE_GET_REQUEST_KEY_AUTH + +/* get_user_pages takes 6 arguments */ +#undef HAVE_GET_USER_PAGES_6ARG + +/* get_user_pages takes gup_flags in arguments */ +#undef HAVE_GET_USER_PAGES_GUP_FLAGS + +/* glob_match() is available */ +#undef HAVE_GLOB + +/* grab_cache_page_write_begin() has flags argument */ +#undef HAVE_GRAB_CACHE_PAGE_WRITE_BEGIN_WITH_FLAGS + +/* struct group_info has member gid */ +#undef HAVE_GROUP_INFO_GID + +/* Define this is if you enable gss */ +#undef HAVE_GSS + +/* Define this if you enable gss keyring backend */ +#undef HAVE_GSS_KEYRING + +/* Define this if the Kerberos GSS library supports gss_krb5_ccache_name */ +#undef HAVE_GSS_KRB5_CCACHE_NAME + +/* '__rhashtable_insert_fast()' returns int */ +#undef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT + +/* Define this if you have Heimdal Kerberos libraries */ +#undef HAVE_HEIMDAL + +/* hlist_add_after is available */ +#undef HAVE_HLIST_ADD_AFTER + +/* hotplug state machine is supported */ +#undef HAVE_HOTPLUG_STATE_MACHINE + +/* hypervisor_is_type function exists */ +#undef HAVE_HYPERVISOR_IS_TYPE + +/* ib_alloc_fast_reg_mr is defined */ +#undef HAVE_IB_ALLOC_FAST_REG_MR + +/* ib_alloc_pd has 2 arguments */ +#undef HAVE_IB_ALLOC_PD_2ARGS + +/* struct ib_cq_init_attr is used by ib_create_cq */ +#undef HAVE_IB_CQ_INIT_ATTR + +/* struct ib_device.attrs is defined */ +#undef HAVE_IB_DEVICE_ATTRS + +/* if struct ib_device_ops is defined */ +#undef HAVE_IB_DEVICE_OPS + +/* 
ib_get_dma_mr is defined */ +#undef HAVE_IB_GET_DMA_MR + +/* function ib_inc_rkey exist */ +#undef HAVE_IB_INC_RKEY + +/* ib_map_mr_sg exists */ +#undef HAVE_IB_MAP_MR_SG + +/* ib_map_mr_sg has 5 arguments */ +#undef HAVE_IB_MAP_MR_SG_5ARGS + +/* ib_post_send and ib_post_recv have const parameters */ +#undef HAVE_IB_POST_SEND_RECV_CONST + +/* struct ib_rdma_wr is defined */ +#undef HAVE_IB_RDMA_WR + +/* if ib_sg_dma_address wrapper exists */ +#undef HAVE_IB_SG_DMA_ADDRESS + +/* inode_operations .getattr member function can gather advance stats */ +#undef HAVE_INODEOPS_ENHANCED_GETATTR + +/* inode_lock is defined */ +#undef HAVE_INODE_LOCK + +/* inode times are using timespec64 */ +#undef HAVE_INODE_TIMESPEC64 + +/* blk_integrity.interval exist */ +#undef HAVE_INTERVAL_BLK_INTEGRITY + +/* blk_integrity.interval_exp exist */ +#undef HAVE_INTERVAL_EXP_BLK_INTEGRITY + +/* interval trees use rb_tree_cached */ +#undef HAVE_INTERVAL_TREE_CACHED + +/* Define to 1 if you have the header file. */ +#undef HAVE_INTTYPES_H + +/* address_spaace_operaions->invalidate_folio() member exists */ +#undef HAVE_INVALIDATE_FOLIO + +/* address_space invalidate_lock member exists */ +#undef HAVE_INVALIDATE_LOCK + +/* address_space_operations.invalidatepage needs 3 arguments */ +#undef HAVE_INVALIDATE_RANGE + +/* have in_compat_syscall */ +#undef HAVE_IN_COMPAT_SYSCALL + +/* 'in_dev_for_each_ifa_rtnl' is defined */ +#undef HAVE_IN_DEV_FOR_EACH_IFA_RTNL + +/* inode_operations->rename need flags as argument */ +#undef HAVE_IOPS_RENAME_WITH_FLAGS + +/* generic_readlink has been removed */ +#undef HAVE_IOP_GENERIC_READLINK + +/* have iop get_link */ +#undef HAVE_IOP_GET_LINK + +/* inode_operations has .set_acl member function */ +#undef HAVE_IOP_SET_ACL + +/* inode_operations has {get,set,remove}xattr members */ +#undef HAVE_IOP_XATTR + +/* iov_iter_get_pages_alloc2() is available */ +#undef HAVE_IOV_ITER_GET_PAGES_ALLOC2 + +/* if iov_iter has member iter_type */ +#undef HAVE_IOV_ITER_HAS_ITER_TYPE_MEMBER + +/* if iov_iter has member type */ +#undef HAVE_IOV_ITER_HAS_TYPE_MEMBER + +/* iov_iter_init handles directional tag */ +#undef HAVE_IOV_ITER_INIT_DIRECTION + +/* iov_iter_rw exist */ +#undef HAVE_IOV_ITER_RW + +/* iov_iter_truncate exists */ +#undef HAVE_IOV_ITER_TRUNCATE + +/* if iov_iter_type exists */ +#undef HAVE_IOV_ITER_TYPE + +/* is_root_inode defined */ +#undef HAVE_IS_ROOT_INODE + +/* 'iter_file_splice_write' exists */ +#undef HAVE_ITER_FILE_SPLICE_WRITE + +/* struct address_space has i_pages */ +#undef HAVE_I_PAGES + +/* if jbd2_journal_get_max_txn_bufs is available */ +#undef HAVE_JBD2_JOURNAL_GET_MAX_TXN_BUFS + +/* struct jbd2_journal_handle has h_total_credits member */ +#undef HAVE_JOURNAL_TOTAL_CREDITS + +/* kallsyms_lookup_name is exported by kernel */ +#undef HAVE_KALLSYMS_LOOKUP_NAME + +/* 'kernel_param_[un]lock' is available */ +#undef HAVE_KERNEL_PARAM_LOCK + +/* 'struct kernel_param_ops' is available */ +#undef HAVE_KERNEL_PARAM_OPS + +/* kernel_read() signature ends with loff_t *pos */ +#undef HAVE_KERNEL_READ_LAST_POSP + +/* kernel_setsockopt still in use */ +#undef HAVE_KERNEL_SETSOCKOPT + +/* 'getname' has two args */ +#undef HAVE_KERN_SOCK_GETNAME_2ARGS + +/* keyring_search has 4 args */ +#undef HAVE_KEYRING_SEARCH_4ARGS + +/* struct key_match_data exist */ +#undef HAVE_KEY_MATCH_DATA + +/* payload.data is an array */ +#undef HAVE_KEY_PAYLOAD_DATA_ARRAY + +/* key_type->instantiate has two args */ +#undef HAVE_KEY_TYPE_INSTANTIATE_2ARGS + +/* key.usage is of type refcount_t */ +#undef 
HAVE_KEY_USAGE_REFCOUNT + +/* kfree_sensitive() is available. */ +#undef HAVE_KFREE_SENSITIVE + +/* kiocb->ki_complete() has 2 arguments */ +#undef HAVE_KIOCB_COMPLETE_2ARGS + +/* ki_left exist */ +#undef HAVE_KIOCB_KI_LEFT + +/* ki_nbytes field exist */ +#undef HAVE_KI_NBYTES + +/* kmap_to_page is exported by the kernel */ +#undef HAVE_KMAP_TO_PAGE + +/* struct kobj_type has 'default_groups' member */ +#undef HAVE_KOBJ_TYPE_DEFAULT_GROUPS + +/* Define this if you have MIT Kerberos libraries */ +#undef HAVE_KRB5 + +/* Define this if the function krb5int_derive_key is available */ +#undef HAVE_KRB5INT_DERIVE_KEY + +/* Define this if the function krb5_derive_key is available */ +#undef HAVE_KRB5_DERIVE_KEY + +/* Define this if the function krb5_get_error_message is available */ +#undef HAVE_KRB5_GET_ERROR_MESSAGE + +/* Define this if the function krb5_get_init_creds_opt_set_addressless is + available */ +#undef HAVE_KRB5_GET_INIT_CREDS_OPT_SET_ADDRESSLESS + +/* kref_read() is available */ +#undef HAVE_KREF_READ + +/* kset_find_obj is exported by the kernel */ +#undef HAVE_KSET_FIND_OBJ + +/* kernel has kstrtobool_from_user */ +#undef HAVE_KSTRTOBOOL_FROM_USER + +/* kthread_worker found */ +#undef HAVE_KTHREAD_WORK + +/* ktime_add is available */ +#undef HAVE_KTIME_ADD + +/* ktime_after is available */ +#undef HAVE_KTIME_AFTER + +/* ktime_before is available */ +#undef HAVE_KTIME_BEFORE + +/* ktime_compare is available */ +#undef HAVE_KTIME_COMPARE + +/* 'ktime_get_real_seconds' is available */ +#undef HAVE_KTIME_GET_REAL_SECONDS + +/* 'ktime_get_real_ts64' is available */ +#undef HAVE_KTIME_GET_REAL_TS64 + +/* 'ktime_get_seconds' is available */ +#undef HAVE_KTIME_GET_SECONDS + +/* 'ktime_get_ts64' is available */ +#undef HAVE_KTIME_GET_TS64 + +/* 'ktime_ms_delta' is available */ +#undef HAVE_KTIME_MS_DELTA + +/* 'ktime_to_timespec64' is available */ +#undef HAVE_KTIME_TO_TIMESPEC64 + +/* ldiskfsfs_dirhash takes an inode argument */ +#undef HAVE_LDISKFSFS_GETHASH_INODE_ARG + +/* enable use of ldiskfsprogs package */ +#undef HAVE_LDISKFSPROGS + +/* EXT4_GET_BLOCKS_KEEP_SIZE exists */ +#undef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE + +/* if ldiskfs_iget takes a flags argument */ +#undef HAVE_LDISKFS_IGET_WITH_FLAGS + +/* 'ext4_journal_ensure_credits' exists */ +#undef HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS + +/* Enable ldiskfs osd */ +#undef HAVE_LDISKFS_OSD + +/* libefence support is requested */ +#undef HAVE_LIBEFENCE + +/* Define to 1 if you have the `keyutils' library (-lkeyutils). 
*/ +#undef HAVE_LIBKEYUTILS + +/* use libpthread for libcfs library */ +#undef HAVE_LIBPTHREAD + +/* readline library is available */ +#undef HAVE_LIBREADLINE + +/* linux/blk-integrity.h is present */ +#undef HAVE_LINUX_BLK_INTEGRITY_HEADER + +/* linux/fortify-string.h header available */ +#undef HAVE_LINUX_FORTIFY_STRING_HEADER + +/* linux/stdarg.h is present */ +#undef HAVE_LINUX_STDARG_HEADER + +/* list_cmp_func_t type is defined */ +#undef HAVE_LIST_CMP_FUNC_T + +/* lock_manager_operations has lm_compare_owner */ +#undef HAVE_LM_COMPARE_OWNER + +/* kernel has locks_lock_file_wait */ +#undef HAVE_LOCKS_LOCK_FILE_WAIT + +/* lock_page_memcg is defined */ +#undef HAVE_LOCK_PAGE_MEMCG + +/* lookup_user_key() is available */ +#undef HAVE_LOOKUP_USER_KEY + +/* Enable lru resize support */ +#undef HAVE_LRU_RESIZE_SUPPORT + +/* lsmcontext_init is available */ +#undef HAVE_LSMCONTEXT_INIT + +/* Define this if the Kerberos GSS library supports + gss_krb5_export_lucid_sec_context */ +#undef HAVE_LUCID_CONTEXT_SUPPORT + +/* Enable Lustre client crypto via embedded llcrypt */ +#undef HAVE_LUSTRE_CRYPTO + +/* enum mapping_flags has AS_EXITING flag */ +#undef HAVE_MAPPING_AS_EXITING_FLAG + +/* match_wildcard() is available */ +#undef HAVE_MATCH_WILDCARD + +/* memalloc_noreclaim_{save,restore}() is supported */ +#undef HAVE_MEMALLOC_RECLAIM + +/* Define to 1 if you have the header file. */ +#undef HAVE_MEMORY_H + +/* mmap_lock API is available. */ +#undef HAVE_MMAP_LOCK + +/* kernel module loading is possible */ +#undef HAVE_MODULE_LOADING_SUPPORT + +/* Define to 1 if you have the `name_to_handle_at' function. */ +#undef HAVE_NAME_TO_HANDLE_AT + +/* support native Linux client */ +#undef HAVE_NATIVE_LINUX_CLIENT + +/* Define to 1 if you have the header file. */ +#undef HAVE_NETDB_H + +/* struct genl_ops has 'start' callback */ +#undef HAVE_NETLINK_CALLBACK_START + +/* DEFINE_TIMER uses only 2 arguements */ +#undef HAVE_NEW_DEFINE_TIMER + +/* 'kernel_write' aligns with read/write helpers */ +#undef HAVE_NEW_KERNEL_WRITE + +/* libnl3 supports nla_get_s32 */ +#undef HAVE_NLA_GET_S32 + +/* libnl3 supports nla_get_s64 */ +#undef HAVE_NLA_GET_S64 + +/* 'nla_strdup' is available */ +#undef HAVE_NLA_STRDUP + +/* 'nla_strlcpy' is available */ +#undef HAVE_NLA_STRLCPY + +/* netlink_ext_ack is handled for Netlink dump handlers */ +#undef HAVE_NL_DUMP_WITH_EXT_ACK + +/* netlink_ext_ack is an argument to nla_parse type function */ +#undef HAVE_NL_PARSE_WITH_EXT_ACK + +/* no_llseek() is available */ +#undef HAVE_NO_LLSEEK + +/* NR_UNSTABLE_NFS is still in use. 
*/ +#undef HAVE_NR_UNSTABLE_NFS + +/* ns_to_timespec64() is available */ +#undef HAVE_NS_TO_TIMESPEC64 + +/* with oldsize */ +#undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE + +/* OpenSSL EVP_PKEY_get_params */ +#undef HAVE_OPENSSL_EVP_PKEY + +/* openssl-devel is present */ +#undef HAVE_OPENSSL_GETSEPOL + +/* OpenSSL HMAC functions needed for SSK */ +#undef HAVE_OPENSSL_SSK + +/* if Oracle OFED Extensions are enabled */ +#undef HAVE_ORACLE_OFED_EXTENSIONS + +/* 'pagevec_init' takes one parameter */ +#undef HAVE_PAGEVEC_INIT_ONE_PARAM + +/* linux/panic_notifier.h is present */ +#undef HAVE_PANIC_NOTIFIER_H + +/* 'param_set_uint_minmax' is available */ +#undef HAVE_PARAM_SET_UINT_MINMAX + +/* percpu_counter_init uses GFP_* flag */ +#undef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + +/* 'struct nsproxy' has 'pid_ns_for_children' */ +#undef HAVE_PID_NS_FOR_CHILDREN + +/* 'posix_acl_update_mode' is available */ +#undef HAVE_POSIX_ACL_UPDATE_MODE + +/* posix_acl_valid takes struct user_namespace */ +#undef HAVE_POSIX_ACL_VALID_USER_NS + +/* 'prepare_to_wait_event' is available */ +#undef HAVE_PREPARE_TO_WAIT_EVENT + +/* processor.h is present */ +#undef HAVE_PROCESSOR_H + +/* struct proc_ops exists */ +#undef HAVE_PROC_OPS + +/* get_projid function exists */ +#undef HAVE_PROJECT_QUOTA + +/* 'PTR_ERR_OR_ZERO' exist */ +#undef HAVE_PTR_ERR_OR_ZERO + +/* If available, contains the Python version number currently in use. */ +#undef HAVE_PYTHON + +/* radix_tree_tag_set exists */ +#undef HAVE_RADIX_TREE_TAG_SET + +/* rdma_connect_locked is defined */ +#undef HAVE_RDMA_CONNECT_LOCKED + +/* rdma_create_id wants 4 args */ +#undef HAVE_RDMA_CREATE_ID_4ARG + +/* rdma_create_id wants 5 args */ +#undef HAVE_RDMA_CREATE_ID_5ARG + +/* rdma_reject has 4 arguments */ +#undef HAVE_RDMA_REJECT_4ARGS + +/* read_cache_page() filler_t needs struct file */ +#undef HAVE_READ_CACHE_PAGE_WANTS_FILE + +/* refcount_t is supported */ +#undef HAVE_REFCOUNT_T + +/* register_shrinker() returns status */ +#undef HAVE_REGISTER_SHRINKER_FORMAT_NAMED + +/* register_shrinker() returns status */ +#undef HAVE_REGISTER_SHRINKER_RET + +/* rhashtable_lookup() is available */ +#undef HAVE_RHASHTABLE_LOOKUP + +/* rhashtable_lookup_get_insert_fast() is available */ +#undef HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST + +/* rhashtable_replace_fast() is available */ +#undef HAVE_RHASHTABLE_REPLACE + +/* rhashtable_walk_enter() is available */ +#undef HAVE_RHASHTABLE_WALK_ENTER + +/* struct rhltable exist */ +#undef HAVE_RHLTABLE + +/* rht_bucket_var() is available */ +#undef HAVE_RHT_BUCKET_VAR + +/* save_stack_trace_tsk is exported */ +#undef HAVE_SAVE_STACK_TRACE_TSK + +/* Have sa_spill_alloc in ZFS */ +#undef HAVE_SA_SPILL_ALLOC + +/* linux/sched header directory exist */ +#undef HAVE_SCHED_HEADERS + +/* security_dentry_init_security needs lsmcontext */ +#undef HAVE_SECURITY_DENTRY_INIT_SECURTY_WITH_CTX + +/* security_dentry_init_security() returns xattr name */ +#undef HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG + +/* security_release_secctx has 1 arg. */ +#undef HAVE_SEC_RELEASE_SECCTX_1ARG + +/* support for selinux */ +#undef HAVE_SELINUX + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_SELINUX_SELINUX_H + +/* support server */ +#undef HAVE_SERVER_SUPPORT + +/* Define this if the Kerberos GSS library supports + gss_krb5_set_allowable_enctypes */ +#undef HAVE_SET_ALLOWABLE_ENCTYPES + +/* shrinker has count_objects member */ +#undef HAVE_SHRINKER_COUNT + +/* sk_data_ready uses only one argument */ +#undef HAVE_SK_DATA_READY_ONE_ARG + +/* sock_create_kern use net as first parameter */ +#undef HAVE_SOCK_CREATE_KERN_USE_NET + +/* Have spa_maxblocksize in ZFS */ +#undef HAVE_SPA_MAXBLOCKSIZE + +/* struct stacktrace_ops exists */ +#undef HAVE_STACKTRACE_OPS + +/* Define to 1 if you have the `statx' function. */ +#undef HAVE_STATX + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDINT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDLIB_H + +/* stringhash.h is present */ +#undef HAVE_STRINGHASH + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRING_H + +/* Define to 1 if you have the `strnlen' function. */ +#undef HAVE_STRNLEN + +/* kernel strscpy is available */ +#undef HAVE_STRSCPY + +/* struct posix_acl_xattr_{header,entry} defined */ +#undef HAVE_STRUCT_POSIX_ACL_XATTR + +/* submit_bio takes two arguments */ +#undef HAVE_SUBMIT_BIO_2ARGS + +/* 'super_setup_bdi_name' is available */ +#undef HAVE_SUPER_SETUP_BDI_NAME + +/* symlink inode operations need struct nameidata argument */ +#undef HAVE_SYMLINK_OPS_USE_NAMEIDATA + +/* new_sync_[read|write] is exported by the kernel */ +#undef HAVE_SYNC_READ_WRITE + +/* Define to 1 if you have . */ +#undef HAVE_SYS_QUOTA_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_STAT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_TYPES_H + +/* 's_uuid' is an uuid_t */ +#undef HAVE_S_UUID_AS_UUID_T + +/* task_is_running() is defined */ +#undef HAVE_TASK_IS_RUNNING + +/* 'tcp_sock_set_keepcnt()' exists */ +#undef HAVE_TCP_SOCK_SET_KEEPCNT + +/* 'tcp_sock_set_keepidle()' exists */ +#undef HAVE_TCP_SOCK_SET_KEEPIDLE + +/* 'tcp_sock_set_keepintvl()' exists */ +#undef HAVE_TCP_SOCK_SET_KEEPINTVL + +/* 'tcp_sock_set_nodelay()' exists */ +#undef HAVE_TCP_SOCK_SET_NODELAY + +/* 'tcp_sock_set_quickack()' exists */ +#undef HAVE_TCP_SOCK_SET_QUICKACK + +/* timer_setup has replaced setup_timer */ +#undef HAVE_TIMER_SETUP + +/* 'struct timespec64' is available */ +#undef HAVE_TIMESPEC64 + +/* 'timespec64_sub' is available */ +#undef HAVE_TIMESPEC64_SUB + +/* 'timespec64_to_ktime' is available */ +#undef HAVE_TIMESPEC64_TO_KTIME + +/* topology_sibling_cpumask is available */ +#undef HAVE_TOPOLOGY_SIBLING_CPUMASK + +/* if totalram_pages is a function */ +#undef HAVE_TOTALRAM_PAGES_AS_FUNC + +/* kernel has truncate_inode_pages_final */ +#undef HAVE_TRUNCATE_INODE_PAGES_FINAL + +/* if MS_RDONLY was moved to uapi/linux/mount.h */ +#undef HAVE_UAPI_LINUX_MOUNT_H + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_UNISTD_H + +/* 'inode_operations' members have user namespace argument */ +#undef HAVE_USER_NAMESPACE_ARG + +/* 'enum nlmsgerr_attrs' exists */ +#undef HAVE_USRSPC_NLMSGERR + +/* RDMA_PS_TCP exists */ +#undef HAVE_USRSPC_RDMA_PS_TCP + +/* 'uuid_t' exist */ +#undef HAVE_UUID_T + +/* kernel has vfs_rename with 5 args */ +#undef HAVE_VFS_RENAME_5ARGS + +/* kernel has vfs_rename with 6 args */ +#undef HAVE_VFS_RENAME_6ARGS + +/* '__vfs_setxattr' is available */ +#undef HAVE_VFS_SETXATTR + +/* kernel has vfs_unlink with 3 args */ +#undef HAVE_VFS_UNLINK_3ARGS + +/* __vmalloc only takes 2 args. */ +#undef HAVE_VMALLOC_2ARGS + +/* virtual_address has been replaced by address field */ +#undef HAVE_VM_FAULT_ADDRESS + +/* if VM_FAULT_RETRY is defined */ +#undef HAVE_VM_FAULT_RETRY + +/* if vm_fault_t type exists */ +#undef HAVE_VM_FAULT_T + +/* 'struct vm_operations' remove struct vm_area_struct argument */ +#undef HAVE_VM_OPS_USE_VM_FAULT_ONLY + +/* wait_bit.h is present */ +#undef HAVE_WAIT_BIT_HEADER_H + +/* if struct wait_bit_queue_entry exists */ +#undef HAVE_WAIT_BIT_QUEUE_ENTRY + +/* 'wait_queue_entry_t' is available */ +#undef HAVE_WAIT_QUEUE_ENTRY + +/* linux wait_queue_head_t list_head is name head */ +#undef HAVE_WAIT_QUEUE_ENTRY_LIST + +/* 'wait_var_event' is available */ +#undef HAVE_WAIT_VAR_EVENT + +/* 'wait_woken, is available' */ +#undef HAVE_WAIT_WOKEN + +/* kernel Xarray implementation lacks 'xa_is_value' */ +#undef HAVE_XARRAY_SUPPORT + +/* needs inode parameter */ +#undef HAVE_XATTR_HANDLER_INODE_PARAM + +/* xattr_handler has a name member */ +#undef HAVE_XATTR_HANDLER_NAME + +/* handler pointer is parameter */ +#undef HAVE_XATTR_HANDLER_SIMPLIFIED + +/* Have zap_add_by_dnode() in ZFS */ +#undef HAVE_ZAP_ADD_BY_DNODE + +/* Have zap_lookup_by_dnode() in ZFS */ +#undef HAVE_ZAP_LOOKUP_BY_DNODE + +/* Have zap_remove_by_dnode() in ZFS */ +#undef HAVE_ZAP_REMOVE_ADD_BY_DNODE + +/* Have inode_timespec_t */ +#undef HAVE_ZFS_INODE_TIMESPEC + +/* Have multihost protection in ZFS */ +#undef HAVE_ZFS_MULTIHOST + +/* Enable zfs osd */ +#undef HAVE_ZFS_OSD + +/* Have zfs_refcount_add */ +#undef HAVE_ZFS_REFCOUNT_ADD + +/* Have zfs_refcount.h */ +#undef HAVE_ZFS_REFCOUNT_HEADER + +/* struct bio has __bi_cnt */ +#undef HAVE___BI_CNT + +/* if __ldiskfs_find_entry is available */ +#undef HAVE___LDISKFS_FIND_ENTRY + +/* function pde_data() available */ +#undef HAVE_pde_data + +/* ext4_journal_start takes 3 arguments */ +#undef JOURNAL_START_HAS_3ARGS + +/* Define this as the Kerberos version number */ +#undef KRB5_VERSION + +/* enable libcfs LASSERT, LASSERTF */ +#undef LIBCFS_DEBUG + +/* use dumplog on panic */ +#undef LNET_DUMP_ON_PANIC + +/* Define to the sub-directory where libtool stores uninstalled libraries. 
*/ +#undef LT_OBJDIR + +/* Fourth number in the Lustre version */ +#undef LUSTRE_FIX + +/* First number in the Lustre version */ +#undef LUSTRE_MAJOR + +/* Second number in the Lustre version */ +#undef LUSTRE_MINOR + +/* Third number in the Lustre version */ +#undef LUSTRE_PATCH + +/* A copy of PACKAGE_VERSION */ +#undef LUSTRE_VERSION_STRING + +/* maximum number of MDS threads */ +#undef MDS_MAX_THREADS + +/* Report minimum OST free space */ +#undef MIN_DF + +/* name of ldiskfs mkfs program */ +#undef MKE2FS + +/* 'ktime_get_ns' is not available */ +#undef NEED_KTIME_GET_NS + +/* 'ktime_get_real_ns' is not available */ +#undef NEED_KTIME_GET_REAL_NS + +/* lockdep_is_held() argument is const */ +#undef NEED_LOCKDEP_IS_HELD_DISCARD_CONST + +/* Name of package */ +#undef PACKAGE + +/* Define to the address where bug reports for this package should be sent. */ +#undef PACKAGE_BUGREPORT + +/* Define to the full name of this package. */ +#undef PACKAGE_NAME + +/* Define to the full name and version of this package. */ +#undef PACKAGE_STRING + +/* Define to the one symbol short name of this package. */ +#undef PACKAGE_TARNAME + +/* Define to the home page for this package. */ +#undef PACKAGE_URL + +/* Define to the version of this package. */ +#undef PACKAGE_VERSION + +/* name of parallel fsck program */ +#undef PFSCK + +/* enable randomly alloc failure */ +#undef RANDOM_FAIL_ALLOC + +/* The size of `unsigned long long', as computed by sizeof. */ +#undef SIZEOF_UNSIGNED_LONG_LONG + +/* use tunable backoff TCP */ +#undef SOCKNAL_BACKOFF + +/* tunable backoff TCP in ms */ +#undef SOCKNAL_BACKOFF_MS + +/* 'struct stacktrace_ops' address function returns an int */ +#undef STACKTRACE_OPS_ADDRESS_RETURN_INT + +/* Define to 1 if you have the ANSI C header files. */ +#undef STDC_HEADERS + +/* name of ldiskfs tune program */ +#undef TUNE2FS + +/* Define this if the private function, gss_krb5_cache_name, must be used to + tell the Kerberos library which credentials cache to use. Otherwise, this + is done by setting the KRB5CCNAME environment variable */ +#undef USE_GSS_KRB5_CCACHE_NAME + +/* Write when Checking Health */ +#undef USE_HEALTH_CHECK_WRITE + +/* Version number of package */ +#undef VERSION + +/* vfs_setxattr() value argument is non-const */ +#undef VFS_SETXATTR_VALUE + +/* zfs fix version */ +#undef ZFS_FIX + +/* zfs major version */ +#undef ZFS_MAJOR + +/* zfs minor version */ +#undef ZFS_MINOR + +/* zfs patch version */ +#undef ZFS_PATCH + +/* get_random_u32() is not available, use prandom_u32 */ +#undef get_random_u32 + +/* get_random_u32_below() is not available */ +#undef get_random_u32_below + +/* function pde_data() unavailable */ +#undef pde_data diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index aa90bd0199d7e..372f8f5262fa7 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * Balloon device works in 4K page units. 
So each page is pointed to by @@ -39,6 +40,13 @@ (1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT)) #define VIRTIO_BALLOON_HINT_BLOCK_PAGES (1 << VIRTIO_BALLOON_HINT_BLOCK_ORDER) +static bool report_offline = false; +module_param(report_offline, bool, 0444); +MODULE_PARM_DESC(report_offline, + "Report offlined pages to the hypervisor"); + +static DEFINE_MUTEX(vb_page_report_lock); + enum virtio_balloon_vq { VIRTIO_BALLOON_VQ_INFLATE, VIRTIO_BALLOON_VQ_DEFLATE, @@ -166,6 +174,15 @@ static int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_i struct virtqueue *vq = vb->reporting_vq; unsigned int unused, err; + /* + * virtqueue callers must make sure that only one thread is + * using a queue. With offline page reporting enabled, multiple + * threads might be calling this function at the same time. + * + * So, make sure they don't get in each other's way. + */ + mutex_lock(&vb_page_report_lock); + /* We should always be able to add these buffers to an empty queue. */ err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN); @@ -174,17 +191,55 @@ static int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_i * are able to trigger an error we will simply display a warning * and exit without actually processing the pages. */ - if (WARN_ON_ONCE(err)) + if (WARN_ON_ONCE(err)) { + mutex_unlock(&vb_page_report_lock); return err; + } virtqueue_kick(vq); /* When host has read buffer, this completes via balloon_ack */ wait_event(vb->acked, virtqueue_get_buf(vq, &unused)); + mutex_unlock(&vb_page_report_lock); + return 0; } +/* + * Callback for memory offline. Takes the offlined range and passes it + * to the normal free page reporting entry point. + * + * Assumptions that are currently all true: + * + * 1) We're in a safe context to sleep. + * 2) The offlined range is <= a memory section (128M on x86, 1G on arm64), + * and so the length will fit in a 32bit field. + */ +static int virtioballoon_free_page_report_offline( + struct page_reporting_dev_info *pr_dev_info, + unsigned long start_pfn, unsigned int nr_pages) +{ + struct scatterlist sgl; + unsigned int len = nr_pages << PAGE_SHIFT; + int err; + + /* + * Set the page to NULL to signal a "pre-mapped" address, + * e.g. the virtio ring code will not touch the page + * structure and will just use the dma_address passed in. + */ + sg_init_table(&sgl, 1); + sg_set_page(&sgl, NULL, len, 0); + sgl.dma_address = PFN_PHYS(start_pfn); + + err = virtballoon_free_page_report(pr_dev_info, &sgl, 1); + if (err) + pr_err("virtio_balloon: offline reporting failed (%d)\n", err); + + return err; +} + static void set_page_pfns(struct virtio_balloon *vb, __virtio32 pfns[], struct page *page) { @@ -957,6 +1012,8 @@ static int virtballoon_probe(struct virtio_device *vdev) } vb->pr_dev_info.report = virtballoon_free_page_report; + if (report_offline) + vb->pr_dev_info.report_offline = virtioballoon_free_page_report_offline; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { unsigned int capacity; diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 7d320f799ca1e..e81a2e2583cfc 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -360,7 +360,8 @@ static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, * depending on the direction. */ kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction); - return (dma_addr_t)sg_phys(sg); + return sg_page(sg) == NULL ? 
sg_dma_address(sg) : + (dma_addr_t)sg_phys(sg); } /* diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 96b96516c9806..048f8ea9c1b1e 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -68,6 +68,10 @@ #include #include +#ifdef CONFIG_ACPI +#include +#endif + #include "events_internal.h" #undef MODULE_PARAM_PREFIX @@ -550,6 +554,14 @@ static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu, channels_on_cpu_inc(info); } +static void xen_evtchn_mask_all(void) +{ + evtchn_port_t evtchn; + + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) + mask_evtchn(evtchn); +} + /** * notify_remote_via_irq - send event to remote end of event channel via irq * @irq: irq of event channel to send event to @@ -2098,6 +2110,7 @@ void xen_irq_resume(void) struct irq_info *info; /* New event-channel space is not 'live' yet. */ + xen_evtchn_mask_all(); xen_evtchn_resume(); /* No IRQ <-> event-channel mappings. */ @@ -2118,6 +2131,19 @@ void xen_irq_resume(void) restore_pirqs(); } +void xen_shutdown_pirqs(void) +{ + struct irq_info *info; + + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ || !VALID_EVTCHN(info->evtchn)) + continue; + + shutdown_pirq(irq_get_irq_data(info->irq)); + irq_state_clr_started(irq_to_desc(info->irq)); + } +} + static struct irq_chip xen_dynamic_chip __read_mostly = { .name = "xen-dyn", @@ -2276,7 +2302,6 @@ static int xen_evtchn_cpu_dead(unsigned int cpu) void __init xen_init_IRQ(void) { int ret = -EINVAL; - evtchn_port_t evtchn; if (xen_fifo_events) ret = xen_evtchn_fifo_init(); @@ -2296,8 +2321,7 @@ void __init xen_init_IRQ(void) BUG_ON(!evtchn_to_irq); /* No event channels are 'live' right now. */ - for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) - mask_evtchn(evtchn); + xen_evtchn_mask_all(); pirq_needs_eoi = pirq_needs_eoi_flag; diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index c16df629907e1..de89fed344ca6 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,31 @@ enum shutdown_state { /* Ignore multiple shutdown requests. 
*/ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; +enum suspend_modes { + NO_SUSPEND = 0, + XEN_SUSPEND, + PM_SUSPEND, + PM_HIBERNATION, +}; + +/* Protected by pm_mutex */ +static enum suspend_modes suspend_mode = NO_SUSPEND; + +bool xen_suspend_mode_is_xen_suspend(void) +{ + return suspend_mode == XEN_SUSPEND; +} + +bool xen_suspend_mode_is_pm_suspend(void) +{ + return suspend_mode == PM_SUSPEND; +} + +bool xen_suspend_mode_is_pm_hibernation(void) +{ + return suspend_mode == PM_HIBERNATION; +} + struct suspend_info { int cancelled; }; @@ -98,6 +124,11 @@ static void do_suspend(void) { int err; struct suspend_info si; + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); + + suspend_mode = XEN_SUSPEND; shutting_down = SHUTDOWN_SUSPEND; @@ -162,6 +193,10 @@ static void do_suspend(void) thaw_processes(); out: shutting_down = SHUTDOWN_INVALID; + + suspend_mode = NO_SUSPEND; + + unlock_system_sleep(sleep_flags); } #endif /* CONFIG_HIBERNATE_CALLBACKS */ @@ -388,3 +423,42 @@ int xen_setup_shutdown_event(void) EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); subsys_initcall(xen_setup_shutdown_event); + +static int xen_pm_notifier(struct notifier_block *notifier, + unsigned long pm_event, void *unused) +{ + switch (pm_event) { + case PM_SUSPEND_PREPARE: + suspend_mode = PM_SUSPEND; + break; + case PM_HIBERNATION_PREPARE: + case PM_RESTORE_PREPARE: + suspend_mode = PM_HIBERNATION; + break; + case PM_POST_SUSPEND: + case PM_POST_RESTORE: + case PM_POST_HIBERNATION: + /* Set back to the default */ + suspend_mode = NO_SUSPEND; + break; + default: + pr_warn("Receive unknown PM event 0x%lx\n", pm_event); + return -EINVAL; + } + + return 0; +}; + +static struct notifier_block xen_pm_notifier_block = { + .notifier_call = xen_pm_notifier +}; + +static int xen_setup_pm_notifier(void) +{ + if (!xen_hvm_domain()) + return -ENODEV; + + return register_pm_notifier(&xen_pm_notifier_block); +} + +subsys_initcall(xen_setup_pm_notifier); diff --git a/drivers/xen/time.c b/drivers/xen/time.c index 152dd33bb2236..bf41e5cf1332d 100644 --- a/drivers/xen/time.c +++ b/drivers/xen/time.c @@ -24,6 +24,9 @@ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); static DEFINE_PER_CPU(u64[4], old_runstate_time); +static DEFINE_PER_CPU(u64, xen_prev_steal_clock); +static DEFINE_PER_CPU(u64, xen_steal_clock_offset); + /* return an consistent snapshot of 64-bit time/counter value */ static u64 get64(const u64 *p) { @@ -150,7 +153,7 @@ bool xen_vcpu_stolen(int vcpu) return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; } -u64 xen_steal_clock(int cpu) +static u64 __xen_steal_clock(int cpu) { struct vcpu_runstate_info state; @@ -158,6 +161,30 @@ u64 xen_steal_clock(int cpu) return state.time[RUNSTATE_runnable] + state.time[RUNSTATE_offline]; } +u64 xen_steal_clock(int cpu) +{ + return __xen_steal_clock(cpu) + per_cpu(xen_steal_clock_offset, cpu); +} + +void xen_save_steal_clock(int cpu) +{ + per_cpu(xen_prev_steal_clock, cpu) = xen_steal_clock(cpu); +} + +void xen_restore_steal_clock(int cpu) +{ + u64 steal_clock = __xen_steal_clock(cpu); + + if (per_cpu(xen_prev_steal_clock, cpu) > steal_clock) { + /* Need to update the offset */ + per_cpu(xen_steal_clock_offset, cpu) = + per_cpu(xen_prev_steal_clock, cpu) - steal_clock; + } else { + /* Avoid unnecessary steal clock warp */ + per_cpu(xen_steal_clock_offset, cpu) = 0; + } +} + void xen_setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; diff --git a/drivers/xen/xenbus/xenbus_probe.c 
b/drivers/xen/xenbus/xenbus_probe.c index 1a9ded0cddcb0..fcf00a41422db 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -668,26 +669,47 @@ int xenbus_dev_suspend(struct device *dev) struct xenbus_driver *drv; struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev); + int (*cb)(struct xenbus_device *) = NULL; + bool xen_suspend = xen_suspend_mode_is_xen_suspend(); DPRINTK("%s", xdev->nodename); if (dev->driver == NULL) return 0; drv = to_xenbus_driver(dev->driver); - if (drv->suspend) - err = drv->suspend(xdev); - if (err) - dev_warn(dev, "suspend failed: %i\n", err); + + if (xen_suspend) + cb = drv->suspend; + else + cb = drv->freeze; + + if (cb) + err = cb(xdev); + + if (err) { + dev_warn(dev, "%s failed: %i\n", xen_suspend ? + "suspend" : "freeze", err); + return err; + } + + if (!xen_suspend) { + /* Forget otherend since this can become stale after restore */ + free_otherend_watch(xdev); + free_otherend_details(xdev); + } + return 0; } EXPORT_SYMBOL_GPL(xenbus_dev_suspend); int xenbus_dev_resume(struct device *dev) { - int err; + int err = 0; struct xenbus_driver *drv; struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev); + int (*cb)(struct xenbus_device *) = NULL; + bool xen_suspend = xen_suspend_mode_is_xen_suspend(); DPRINTK("%s", xdev->nodename); @@ -696,23 +718,32 @@ int xenbus_dev_resume(struct device *dev) drv = to_xenbus_driver(dev->driver); err = talk_to_otherend(xdev); if (err) { - dev_warn(dev, "resume (talk_to_otherend) failed: %i\n", err); + dev_warn(dev, "%s (talk_to_otherend) failed: %i\n", + xen_suspend ? "resume" : "restore", err); return err; } - xdev->state = XenbusStateInitialising; + if (xen_suspend) + xdev->state = XenbusStateInitialising; - if (drv->resume) { - err = drv->resume(xdev); - if (err) { - dev_warn(dev, "resume failed: %i\n", err); - return err; - } + if (xen_suspend) + cb = drv->resume; + else + cb = drv->restore; + + if (cb) + err = cb(xdev); + + if (err) { + dev_warn(dev, "%s failed: %i\n", + xen_suspend ? "resume" : "restore", err); + return err; } err = watch_otherend(xdev); if (err) { - dev_warn(dev, "resume (watch_otherend) failed: %d\n", err); + dev_warn(dev, "%s (watch_otherend) failed: %d.\n", + xen_suspend ? 
"resume" : "restore", err); return err; } @@ -722,8 +753,44 @@ EXPORT_SYMBOL_GPL(xenbus_dev_resume); int xenbus_dev_cancel(struct device *dev) { - /* Do nothing */ - DPRINTK("cancel"); + int err = 0; + struct xenbus_driver *drv; + struct xenbus_device *xdev + = container_of(dev, struct xenbus_device, dev); + bool xen_suspend = xen_suspend_mode_is_xen_suspend(); + + if (xen_suspend) { + /* Do nothing */ + DPRINTK("cancel"); + return 0; + } + + DPRINTK("%s", xdev->nodename); + + if (dev->driver == NULL) + return 0; + drv = to_xenbus_driver(dev->driver); + + err = talk_to_otherend(xdev); + if (err) { + dev_warn(dev, "thaw (talk_to_otherend) failed: %d.\n", err); + return err; + } + + if (drv->thaw) { + err = drv->thaw(xdev); + if (err) { + dev_warn(dev, "thaw failed: %i\n", err); + return err; + } + } + + err = watch_otherend(xdev); + if (err) { + dev_warn(dev, "thaw (watch_otherend) failed: %d.\n", err); + return err; + } + return 0; } EXPORT_SYMBOL_GPL(xenbus_dev_cancel); diff --git a/fs/smb/client/cifs_dfs_ref.c b/fs/smb/client/cifs_dfs_ref.c index 020e71fe1454e..876f9a43a99db 100644 --- a/fs/smb/client/cifs_dfs_ref.c +++ b/fs/smb/client/cifs_dfs_ref.c @@ -258,6 +258,31 @@ char *cifs_compose_mount_options(const char *sb_mountdata, goto compose_mount_options_out; } +static int set_dest_addr(struct smb3_fs_context *ctx, const char *full_path) +{ + struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr; + char *str_addr = NULL; + int rc; + + rc = dns_resolve_server_name_to_ip(full_path, &str_addr, NULL); + if (rc < 0) + goto out; + + rc = cifs_convert_address(addr, str_addr, strlen(str_addr)); + if (!rc) { + cifs_dbg(FYI, "%s: failed to convert ip address\n", __func__); + rc = -EINVAL; + goto out; + } + + cifs_set_port(addr, ctx->port); + rc = 0; + +out: + kfree(str_addr); + return rc; +} + /* * Create a vfsmount that we can automount */ @@ -295,8 +320,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path) ctx = smb3_fc2context(fc); page = alloc_dentry_path(); - /* always use tree name prefix */ - full_path = build_path_from_dentry_optional_prefix(mntpt, page, true); + full_path = dfs_get_automount_devname(mntpt, page); if (IS_ERR(full_path)) { mnt = ERR_CAST(full_path); goto out; @@ -315,6 +339,12 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path) goto out; } + rc = set_dest_addr(ctx, full_path); + if (rc) { + mnt = ERR_PTR(rc); + goto out; + } + rc = smb3_parse_devname(full_path, ctx); if (!rc) mnt = fc_mount(fc); diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index f37e4da0fe405..6dbc9afd67281 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -57,8 +57,26 @@ extern void exit_cifs_idmap(void); extern int init_cifs_spnego(void); extern void exit_cifs_spnego(void); extern const char *build_path_from_dentry(struct dentry *, void *); +char *__build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, + const char *tree, int tree_len, + bool prefix); extern char *build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, bool prefix); +static inline char *dfs_get_automount_devname(struct dentry *dentry, void *page) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct TCP_Server_Info *server = tcon->ses->server; + + if (unlikely(!server->origin_fullpath)) + return ERR_PTR(-EREMOTE); + + return __build_path_from_dentry_optional_prefix(dentry, page, + server->origin_fullpath, + strlen(server->origin_fullpath), + 
true); +} + static inline void *alloc_dentry_path(void) { return __getname(); diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index 863c7bc3db86f..477302157ab3d 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -78,14 +78,13 @@ build_path_from_dentry(struct dentry *direntry, void *page) prefix); } -char * -build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, - bool prefix) +char *__build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, + const char *tree, int tree_len, + bool prefix) { int dfsplen; int pplen = 0; struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); char dirsep = CIFS_DIR_SEP(cifs_sb); char *s; @@ -93,7 +92,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, return ERR_PTR(-ENOMEM); if (prefix) - dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1); + dfsplen = strnlen(tree, tree_len + 1); else dfsplen = 0; @@ -123,7 +122,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, } if (dfsplen) { s -= dfsplen; - memcpy(s, tcon->tree_name, dfsplen); + memcpy(s, tree, dfsplen); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { int i; for (i = 0; i < dfsplen; i++) { @@ -135,6 +134,16 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, return s; } +char *build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, + bool prefix) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + + return __build_path_from_dentry_optional_prefix(direntry, page, tcon->tree_name, + MAX_TREE_SIZE, prefix); +} + /* * Don't allow path components longer than the server max. * Don't allow the separator character in a path component. 
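The cifs hunks above split build_path_from_dentry_optional_prefix() so the prefix is passed in explicitly: the existing wrapper keeps using tcon->tree_name, while dfs_get_automount_devname() prefixes the automount device name with server->origin_fullpath instead. Below is a minimal user-space sketch of just that prefix handling, assuming nothing beyond what the hunks show: the path is assembled backwards from the leaf component and an optional tree prefix is copied in front, with backslashes flipped when POSIX-style paths are requested. The helper name build_path and the sample components are hypothetical; this models the logic, it is not the kernel code.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define PATH_BUF 256

/*
 * Assemble "<tree><sep>comp0<sep>comp1..." from the leaf component backwards,
 * the way the kernel helper walks dentries from the leaf toward the root.
 * 'tree' stands in for either tcon->tree_name (wrapper case) or a DFS origin
 * full path (dfs_get_automount_devname() case).
 */
static char *build_path(char *buf, size_t len, const char *tree,
                        bool posix_paths, const char **comps, int ncomps)
{
        char dirsep = posix_paths ? '/' : '\\';
        size_t treelen = tree ? strlen(tree) : 0;
        char *s = buf + len;    /* write position, moving backwards */
        int i;

        *--s = '\0';
        for (i = ncomps - 1; i >= 0; i--) {
                size_t clen = strlen(comps[i]);

                /* Reserve room for this component, its separator and the prefix. */
                if ((size_t)(s - buf) < clen + 1 + treelen)
                        return NULL;
                s -= clen;
                memcpy(s, comps[i], clen);
                *--s = dirsep;
        }
        if (treelen) {
                if ((size_t)(s - buf) < treelen)
                        return NULL;
                s -= treelen;
                memcpy(s, tree, treelen);
                /* Mirror CIFS_MOUNT_POSIX_PATHS: convert '\' to '/' in the prefix. */
                if (posix_paths)
                        for (i = 0; i < (int)treelen; i++)
                                if (s[i] == '\\')
                                        s[i] = '/';
        }
        return s;
}

int main(void)
{
        char buf[PATH_BUF];
        const char *comps[] = { "dir", "file.txt" };

        /* Wrapper case: prefix with a share name. */
        puts(build_path(buf, sizeof(buf), "\\\\srv\\share", false, comps, 2));
        /* DFS automount case: prefix with an origin full path instead. */
        puts(build_path(buf, sizeof(buf), "\\\\dfsroot\\ns\\link", true, comps, 2));
        return 0;
}

Building right-to-left into a preallocated buffer avoids a second pass to reverse the components, which is presumably why the kernel helper walks the dentry chain from the leaf.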
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 6196b71c5eb58..06d17f8335f91 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -89,6 +89,8 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); vcpu->arch.pmu.events = *kvm_get_pmu_events(); \ } while (0) +u8 kvm_arm_pmu_get_pmuver_limit(void); + #else struct kvm_pmu { }; @@ -154,6 +156,10 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) static inline void kvm_pmu_update_vcpu_events(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {} +static inline u8 kvm_arm_pmu_get_pmuver_limit(void) +{ + return 0; +} #endif diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 439815cc1ab96..9c984ffc8a0aa 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -104,6 +104,7 @@ static inline unsigned long wb_stat_error(void) int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1ca1902af23e9..5c10057a99d9a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -674,7 +674,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ - PTR_TO_DYNPTR, /* reg points to a dynptr */ + CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. 
*/ @@ -2780,7 +2780,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr); +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f080ccf27d256..35cc34a7a625e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -594,11 +594,9 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); -bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg); -bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type); +struct bpf_call_arg_meta; +int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 973a1bfd7ef53..92689cff87d97 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -117,6 +117,14 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, */ #define __stringify_label(n) #n +#define __annotate_reachable(c) ({ \ + asm volatile(__stringify_label(c) ":\n\t" \ + ".pushsection .discard.reachable\n\t" \ + ".long " __stringify_label(c) "b - .\n\t" \ + ".popsection\n\t"); \ +}) +#define annotate_reachable() __annotate_reachable(__COUNTER__) + #define __annotate_unreachable(c) ({ \ asm volatile(__stringify_label(c) ":\n\t" \ ".pushsection .discard.unreachable\n\t" \ @@ -129,6 +137,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __annotate_jump_table __section(".rodata..c_jump_table") #else /* !CONFIG_OBJTOOL */ +#define annotate_reachable() #define annotate_unreachable() #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ diff --git a/include/linux/crypto.h b/include/linux/crypto.h index e3c4be29aaccb..2d36bebc4799c 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -141,6 +141,12 @@ */ #define CRYPTO_ALG_FIPS_INTERNAL 0x00020000 +/* + * Mark an algorithm as approved for FIPS 140-3. This is intended to be used + * for algorithms that are only approved under certain conditions + */ +#define CRYPTO_ALG_FIPS140_COMPLIANT 0x00040000 + /* * Transform masks and values (for crt_flags). */ @@ -150,6 +156,7 @@ #define CRYPTO_TFM_REQ_FORBID_WEAK_KEYS 0x00000100 #define CRYPTO_TFM_REQ_MAY_SLEEP 0x00000200 #define CRYPTO_TFM_REQ_MAY_BACKLOG 0x00000400 +#define CRYPTO_TFM_REQ_NEED_RESEED 0x00000800 /* * Miscellaneous stuff. 
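The annotate_reachable() helper added to include/linux/compiler.h above is the reachable-side twin of the existing annotate_unreachable(): each use emits a unique local label and stores its PC-relative offset in a .discard.reachable section that objtool consumes at build time and the kernel's DISCARDS linker rules then throw away. The standalone sketch below reuses the same macro shape so the mechanism can be compiled and inspected in user space (for instance with objdump -h to see the extra section). The divide() function and the checker the annotation hints at are hypothetical, and nothing here discards the section; it simply ends up in the resulting binary.

#include <stdio.h>

#define __stringify_label(n) #n

/* Same shape as the kernel macro: emit a local label at the call site and
 * record "label - ." (a PC-relative offset) in a dedicated ELF section.
 */
#define __annotate_reachable(c) ({                                      \
        asm volatile(__stringify_label(c) ":\n\t"                       \
                     ".pushsection .discard.reachable\n\t"              \
                     ".long " __stringify_label(c) "b - .\n\t"          \
                     ".popsection\n\t");                                \
})
#define annotate_reachable() __annotate_reachable(__COUNTER__)

static int divide(int a, int b)
{
        if (b == 0) {
                /* Hint to a hypothetical checker that this branch is reachable. */
                annotate_reachable();
                return 0;
        }
        return a / b;
}

int main(void)
{
        printf("%d %d\n", divide(10, 2), divide(10, 0));
        return 0;
}

Built with GCC or Clang on x86-64 or arm64, each annotate_reachable() use contributes one 4-byte entry to .discard.reachable; in the kernel that section is read by objtool and dropped at link time rather than shipped in vmlinux.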
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index d678afeb8a13a..6f548807fc84b 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -7,6 +7,7 @@ #define _LINUX_DMA_MAP_OPS_H #include +#include #include struct cma; @@ -168,12 +169,6 @@ static inline void dma_free_contiguous(struct device *dev, struct page *page, } #endif /* CONFIG_DMA_CMA*/ -#ifdef CONFIG_DMA_PERNUMA_CMA -void dma_pernuma_cma_reserve(void); -#else -static inline void dma_pernuma_cma_reserve(void) { } -#endif /* CONFIG_DMA_PERNUMA_CMA */ - #ifdef CONFIG_DMA_DECLARE_COHERENT int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size); @@ -368,6 +363,10 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent) { +#ifdef CONFIG_DMA_PAGE_TOUCHING + if (!dev->dma_ops) + setup_dma_page_touching_ops(dev); +#endif } #endif /* CONFIG_ARCH_HAS_SETUP_DMA_OPS */ diff --git a/include/linux/dma-page-touching.h b/include/linux/dma-page-touching.h new file mode 100644 index 0000000000000..8ff9856e994c9 --- /dev/null +++ b/include/linux/dma-page-touching.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2021 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * Sets the supplied device's DMA ops to the page toucing DMA ops if + * page touching is enabled and the device does not already have + * DMA ops assigned. 
+ */ +void setup_dma_page_touching_ops(struct device *dev); diff --git a/include/linux/irq.h b/include/linux/irq.h index 06c692cc0accb..9509c21b07452 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -799,6 +799,8 @@ extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry); extern struct irq_data *irq_get_irq_data(unsigned int irq); +extern void irq_state_clr_started(struct irq_desc *desc); + static inline struct irq_chip *irq_get_chip(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 62c54ffbeeaac..bf494bd6e191a 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -45,7 +45,9 @@ struct unwind_hint { #ifdef CONFIG_OBJTOOL +#ifndef CONFIG_ARM64 #include +#endif #ifndef __ASSEMBLY__ diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h index fe648dfa3a7ca..8ca7e623e3146 100644 --- a/include/linux/page_reporting.h +++ b/include/linux/page_reporting.h @@ -12,6 +12,8 @@ struct page_reporting_dev_info { /* function that alters pages to make them "reported" */ int (*report)(struct page_reporting_dev_info *prdev, struct scatterlist *sg, unsigned int nents); + int (*report_offline)(struct page_reporting_dev_info *prdev, + unsigned long start_pfn, unsigned int nr_pages); /* work struct for processing reports */ struct delayed_work work; @@ -23,6 +25,8 @@ struct page_reporting_dev_info { unsigned int order; }; +void page_report_offline(unsigned long start_pfn, unsigned int nr_pages); + /* Tear-down and bring-up for page reporting devices */ void page_reporting_unregister(struct page_reporting_dev_info *prdev); int page_reporting_register(struct page_reporting_dev_info *prdev); diff --git a/include/linux/random.h b/include/linux/random.h index 51133627ba73a..fc5f73f4f2d33 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -10,6 +10,11 @@ #include +struct random_extrng { + ssize_t (*extrng_read)(void __user *buf, size_t buflen, bool reseed); + struct module *owner; +}; + struct notifier_block; void add_device_randomness(const void *buf, size_t len); @@ -139,6 +144,8 @@ void __init random_init_early(const char *command_line); void __init random_init(void); bool rng_is_initialized(void); int wait_for_random_bytes(void); +void random_register_extrng(const struct random_extrng *rng); +void random_unregister_extrng(void); /* Calls wait_for_random_bytes() and then calls get_random_bytes(buf, nbytes). * Returns the result of the call to wait_for_random_bytes. 
*/ diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 867d588314e03..902654ac5f7e7 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -32,6 +32,10 @@ static inline void clear_sched_clock_stable(void) { } +static inline void set_sched_clock_stable(void) +{ +} + static inline void sched_clock_idle_sleep_event(void) { } @@ -51,6 +55,7 @@ static inline u64 local_clock(void) } #else extern int sched_clock_stable(void); +extern void set_sched_clock_stable(void); extern void clear_sched_clock_stable(void); /* diff --git a/include/net/udp.h b/include/net/udp.h index fa4cdbe55552c..17a78e61692b7 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -186,6 +186,11 @@ void udp_lib_rehash(struct sock *sk, u16 new_hash); static inline void udp_lib_close(struct sock *sk, long timeout) { + /* A zerocopy skb has a refcnt of sk and may be + * put into sk_error_queue with TX timestamp + */ + skb_queue_purge(&sk->sk_error_queue); + sk_common_release(sk); } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a17688011440e..eac87609ca170 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5301,7 +5301,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5311,7 +5311,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5321,7 +5321,7 @@ union bpf_attr { * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr or if *flags* is not 0. * - * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description * Get a pointer to the underlying dynptr data. 
* @@ -5422,7 +5422,7 @@ union bpf_attr { * Drain samples from the specified user ring buffer, and invoke * the provided callback for each such sample: * - * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); * * If **callback_fn** returns 0, the helper will continue to try * and drain the next sample, up to a maximum of diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 283dec7e36451..bf7ece5b157b3 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -67,7 +67,7 @@ #define IPVERSION 4 #define MAXTTL 255 -#define IPDEFTTL 64 +#define IPDEFTTL 127 #define IPOPT_OPTVAL 0 #define IPOPT_OLEN 1 diff --git a/include/uapi/linux/psci.h b/include/uapi/linux/psci.h index 42a40ad3fb622..082ed689fdaf6 100644 --- a/include/uapi/linux/psci.h +++ b/include/uapi/linux/psci.h @@ -59,6 +59,8 @@ #define PSCI_1_1_FN_SYSTEM_RESET2 PSCI_0_2_FN(18) #define PSCI_1_1_FN_MEM_PROTECT PSCI_0_2_FN(19) #define PSCI_1_1_FN_MEM_PROTECT_CHECK_RANGE PSCI_0_2_FN(20) +#define PSCI_1_3_FN_SYSTEM_OFF2 PSCI_0_2_FN(21) +#define PSCI_1_3_FN_CLEAN_INV_MEMREGION_ATTRIBUTES PSCI_0_2_FN(23) #define PSCI_1_0_FN64_CPU_DEFAULT_SUSPEND PSCI_0_2_FN64(12) #define PSCI_1_0_FN64_NODE_HW_STATE PSCI_0_2_FN64(13) @@ -68,6 +70,8 @@ #define PSCI_1_1_FN64_SYSTEM_RESET2 PSCI_0_2_FN64(18) #define PSCI_1_1_FN64_MEM_PROTECT_CHECK_RANGE PSCI_0_2_FN64(20) +#define PSCI_1_3_FN64_SYSTEM_OFF2 PSCI_0_2_FN64(21) +#define PSCI_1_3_FN64_CLEAN_INV_MEMREGION PSCI_0_2_FN64(22) /* PSCI v0.2 power state encoding for CPU_SUSPEND function */ #define PSCI_0_2_POWER_STATE_ID_MASK 0xffff @@ -100,6 +104,19 @@ #define PSCI_1_1_RESET_TYPE_SYSTEM_WARM_RESET 0 #define PSCI_1_1_RESET_TYPE_VENDOR_START 0x80000000U +/* PSCI v1.3 hibernate type for SYSTEM_OFF2 */ +#define PSCI_1_3_HIBERNATE_TYPE_OFF 0 + +/* PSCI v1.3 flags for CLEAN_INV_MEMREGION */ +#define PSCI_1_3_CLEAN_INV_MEMREGION_FLAG_DRY_RUN BIT(0) + +/* PSCI v1.3 attributes for CLEAN_INV_MEMREGION_ATTRIBUTES */ +#define PSCI_1_3_CLEAN_INV_MEMREGION_ATTR_OP_TYPE 0 +#define PSCI_1_3_CLEAN_INV_MEMREGION_ATTR_CPU_RDVZ 1 +#define PSCI_1_3_CLEAN_INV_MEMREGION_ATTR_LATENCY 2 +#define PSCI_1_3_CLEAN_INV_MEMREGION_ATTR_RATE_LIMIT 3 +#define PSCI_1_3_CLEAN_INV_MEMREGION_ATTR_TIMEOUT 4 + /* PSCI version decoding (independent of PSCI version) */ #define PSCI_VERSION_MAJOR_SHIFT 16 #define PSCI_VERSION_MINOR_MASK \ @@ -133,5 +150,8 @@ #define PSCI_RET_NOT_PRESENT -7 #define PSCI_RET_DISABLED -8 #define PSCI_RET_INVALID_ADDRESS -9 +#define PSCI_RET_TIMEOUT -10 +#define PSCI_RET_RATE_LIMITED -11 +#define PSCI_RET_BUSY -12 #endif /* _UAPI_LINUX_PSCI_H */ diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h index 163ac79556d68..701e2d567e411 100644 --- a/include/uapi/rdma/efa-abi.h +++ b/include/uapi/rdma/efa-abi.h @@ -1,12 +1,13 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ /* - * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef EFA_ABI_USER_H #define EFA_ABI_USER_H #include +#include /* * Increment this value if any changes that break userspace ABI @@ -120,6 +121,8 @@ enum { EFA_QUERY_DEVICE_CAPS_RNR_RETRY = 1 << 1, EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2, EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID = 1 << 3, + EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4, + EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5, }; struct efa_ibv_ex_query_device_resp { @@ -132,4 +135,22 @@ struct efa_ibv_ex_query_device_resp { __u32 device_caps; }; +enum { + EFA_QUERY_MR_VALIDITY_RECV_IC_ID = 1 << 0, + EFA_QUERY_MR_VALIDITY_RDMA_READ_IC_ID = 1 << 1, + EFA_QUERY_MR_VALIDITY_RDMA_RECV_IC_ID = 1 << 2, +}; + +enum efa_query_mr_attrs { + EFA_IB_ATTR_QUERY_MR_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + EFA_IB_ATTR_QUERY_MR_RESP_IC_ID_VALIDITY, + EFA_IB_ATTR_QUERY_MR_RESP_RECV_IC_ID, + EFA_IB_ATTR_QUERY_MR_RESP_RDMA_READ_IC_ID, + EFA_IB_ATTR_QUERY_MR_RESP_RDMA_RECV_IC_ID, +}; + +enum efa_mr_methods { + EFA_IB_METHOD_MR_QUERY = (1U << UVERBS_ID_NS_SHIFT), +}; + #endif /* EFA_ABI_USER_H */ diff --git a/include/xen/events.h b/include/xen/events.h index b303bd24e2a6c..52f635c6ce94b 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -86,6 +86,8 @@ static inline void notify_remote_via_evtchn(evtchn_port_t port) void notify_remote_via_irq(int irq); void xen_irq_resume(void); +void xen_shutdown_pirqs(void); +void xen_restore_pirqs(void); /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq); diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index a34f4271a2e9f..58231409cdddb 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -38,9 +38,17 @@ void xen_time_setup_guest(void); void xen_manage_runstate_time(int action); void xen_get_runstate_snapshot(struct vcpu_runstate_info *res); u64 xen_steal_clock(int cpu); +void xen_save_steal_clock(int cpu); +void xen_restore_steal_clock(int cpu); int xen_setup_shutdown_event(void); +bool xen_suspend_mode_is_xen_suspend(void); +bool xen_suspend_mode_is_pm_suspend(void); +bool xen_suspend_mode_is_pm_hibernation(void); + +void xen_setup_syscore_ops(void); + extern unsigned long *xen_contiguous_bitmap; #if defined(CONFIG_XEN_PV) diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index eaa932b99d8ac..3d8684d373d59 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -120,6 +120,9 @@ struct xenbus_driver { int (*remove)(struct xenbus_device *dev); int (*suspend)(struct xenbus_device *dev); int (*resume)(struct xenbus_device *dev); + int (*freeze)(struct xenbus_device *dev); + int (*thaw)(struct xenbus_device *dev); + int (*restore)(struct xenbus_device *dev); int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *); struct device_driver driver; int (*read_otherend_details)(struct xenbus_device *dev); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7582ec4fd4131..2571c37360acf 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6491,29 +6491,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, } if (arg_dynptr) { - if (reg->type != PTR_TO_STACK) { - bpf_log(log, "arg#%d pointer type %s %s not to stack\n", + if (reg->type != PTR_TO_STACK && + reg->type != CONST_PTR_TO_DYNPTR) { + bpf_log(log, "arg#%d pointer type %s %s not to stack or dynptr_ptr\n", i, btf_type_str(ref_t), ref_tname); return -EINVAL; } - if (!is_dynptr_reg_valid_init(env, reg)) { - bpf_log(log, - "arg#%d pointer type %s %s must be valid and initialized\n", - i, btf_type_str(ref_t), - ref_tname); - return -EINVAL; 
- } - - if (!is_dynptr_type_expected(env, reg, - ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) { - bpf_log(log, - "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n", - i, btf_type_str(ref_t), - ref_tname); - return -EINVAL; - } + ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL); + if (ret < 0) + return ret; continue; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 758510b46d87b..47a1bdf7ac595 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1422,7 +1422,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } @@ -1432,7 +1432,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ ptr->size |= type << DYNPTR_TYPE_SHIFT; } -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } @@ -1456,7 +1456,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) { u32 size = bpf_dynptr_get_size(ptr); @@ -1501,7 +1501,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, u32, offset, u64, flags) { int err; @@ -1513,7 +1513,11 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src if (err) return err; - memcpy(dst, src->data + src->offset + offset, len); + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst, src->data + src->offset + offset, len); return 0; } @@ -1524,12 +1528,12 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_DYNPTR, + .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len, u64, flags) { int err; @@ -1541,7 +1545,11 @@ BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, if (err) return err; - memcpy(dst->data + dst->offset + offset, src, len); + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. 
+ */ + memmove(dst->data + dst->offset + offset, src, len); return 0; } @@ -1550,14 +1558,14 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) { int err; @@ -1578,7 +1586,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 56a5c8beb553d..20935d865c69f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -572,7 +572,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", - [PTR_TO_DYNPTR] = "dynptr_ptr", + [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { @@ -619,11 +619,34 @@ static void print_liveness(struct bpf_verifier_env *env, verbose(env, "D"); } -static int get_spi(s32 off) +static int __get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; } +static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + int off, spi; + + if (!tnum_is_const(reg->var_off)) { + verbose(env, "dynptr has to be at a constant offset\n"); + return -EINVAL; + } + + off = reg->off + reg->var_off.value; + if (off % BPF_REG_SIZE) { + verbose(env, "cannot pass in dynptr at an offset=%d\n", off); + return -EINVAL; + } + + spi = __get_spi(off); + if (spi < 1) { + verbose(env, "cannot pass in dynptr at an offset=%d\n", off); + return -EINVAL; + } + return spi; +} + static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) { int allocated_slots = state->allocated_stack / BPF_REG_SIZE; @@ -706,6 +729,30 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type) return type == BPF_DYNPTR_TYPE_RINGBUF; } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type, + bool first_slot); + +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); + +static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1, + struct bpf_reg_state *sreg2, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(sreg1, type, true); + __mark_dynptr_reg(sreg2, type, false); +} + +static void mark_dynptr_cb_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(reg, type, true); +} + +static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, + struct bpf_func_state *state, int spi); + static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx) { @@ -713,7 +760,9 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ enum bpf_dynptr_type type; int spi, i, id; - spi = get_spi(reg->off); + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) return -EINVAL; @@ -727,9 +776,8 @@ static int mark_stack_slots_dynptr(struct 
bpf_verifier_env *env, struct bpf_reg_ if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; - state->stack[spi].spilled_ptr.dynptr.first_slot = true; - state->stack[spi].spilled_ptr.dynptr.type = type; - state->stack[spi - 1].spilled_ptr.dynptr.type = type; + mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr, + &state->stack[spi - 1].spilled_ptr, type); if (dynptr_type_refcounted(type)) { /* The id is used to track proper releasing */ @@ -737,10 +785,13 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (id < 0) return id; - state->stack[spi].spilled_ptr.id = id; - state->stack[spi - 1].spilled_ptr.id = id; + state->stack[spi].spilled_ptr.ref_obj_id = id; + state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + return 0; } @@ -749,7 +800,9 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re struct bpf_func_state *state = func(env, reg); int spi, i; - spi = get_spi(reg->off); + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) return -EINVAL; @@ -760,15 +813,84 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re } /* Invalidate any slices associated with this dynptr */ + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) + WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id)); + + __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); + + /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot? + * + * While we don't allow reading STACK_INVALID, it is still possible to + * do <8 byte writes marking some but not all slots as STACK_MISC. Then, + * helpers or insns can do partial read of that part without failing, + * but check_stack_range_initialized, check_stack_read_var_off, and + * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of + * the slot conservatively. Hence we need to prevent those liveness + * marking walks. + * + * This was not a problem before because STACK_INVALID is only set by + * default (where the default reg state has its reg->parent as NULL), or + * in clean_live_states after REG_LIVE_DONE (at which point + * mark_reg_read won't walk reg->parent chain), but not randomly during + * verifier state exploration (like we did above). Hence, for our case + * parentage chain will still be live (i.e. reg->parent may be + * non-NULL), while earlier reg->parent was NULL, so we need + * REG_LIVE_WRITTEN to screen off read marker propagation when it is + * done later on reads or by mark_dynptr_read as well to unnecessary + * mark registers in verifier state. + */ + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + + return 0; +} + +static void __mark_reg_unknown(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); + +static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, + struct bpf_func_state *state, int spi) +{ + int i; + + /* We always ensure that STACK_DYNPTR is never set partially, + * hence just checking for slot_type[0] is enough. This is + * different for STACK_SPILL, where it may be only set for + * 1 byte, so code has to use is_spilled_reg. 
+ */ + if (state->stack[spi].slot_type[0] != STACK_DYNPTR) + return 0; + + /* Reposition spi to first slot */ + if (!state->stack[spi].spilled_ptr.dynptr.first_slot) + spi = spi + 1; + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - release_reference(env, state->stack[spi].spilled_ptr.id); - state->stack[spi].spilled_ptr.id = 0; - state->stack[spi - 1].spilled_ptr.id = 0; + verbose(env, "cannot overwrite referenced dynptr\n"); + return -EINVAL; + } + + mark_stack_slot_scratched(env, spi); + mark_stack_slot_scratched(env, spi - 1); + + /* Writing partially to one dynptr stack slot destroys both. */ + for (i = 0; i < BPF_REG_SIZE; i++) { + state->stack[spi].slot_type[i] = STACK_INVALID; + state->stack[spi - 1].slot_type[i] = STACK_INVALID; } - state->stack[spi].spilled_ptr.dynptr.first_slot = false; - state->stack[spi].spilled_ptr.dynptr.type = 0; - state->stack[spi - 1].spilled_ptr.dynptr.type = 0; + /* TODO: Invalidate any slices associated with this dynptr */ + + /* Do not release reference state, we are destroying dynptr on stack, + * not using some helper to release it. Just reset register. + */ + __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); + + /* Same reason as unmark_stack_slots_dynptr above */ + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; return 0; } @@ -776,9 +898,16 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); - int i; + int spi, i; + + if (reg->type == CONST_PTR_TO_DYNPTR) + return false; + + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return false; + /* We will do check_mem_access to check and update stack bounds later */ if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) return true; @@ -791,13 +920,18 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ return true; } -bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); - int i; + int spi, i; + + /* This already represents first slot of initialized bpf_dynptr */ + if (reg->type == CONST_PTR_TO_DYNPTR) + return true; + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return false; if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || !state->stack[spi].spilled_ptr.dynptr.first_slot) return false; @@ -811,21 +945,26 @@ bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, return true; } -bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type) +static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type dynptr_type; - int spi = get_spi(reg->off); + int spi; /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ if (arg_type == ARG_PTR_TO_DYNPTR) return true; dynptr_type = arg_to_dynptr_type(arg_type); - - return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + if (reg->type == CONST_PTR_TO_DYNPTR) { + return reg->dynptr.type == dynptr_type; + } else { + spi = dynptr_get_spi(env, reg); 
+ if (spi < 0) + return false; + return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + } } /* The reg state of a pointer or a bounded scalar was saved when @@ -1338,9 +1477,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void __mark_reg_not_init(const struct bpf_verifier_env *env, - struct bpf_reg_state *reg); - /* This helper doesn't clear reg->id */ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { @@ -1403,6 +1539,19 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, __mark_reg_known_zero(regs + regno); } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, + bool first_slot) +{ + /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for + * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply + * set it unconditionally as it is ignored for STACK_DYNPTR anyway. + */ + __mark_reg_known_zero(reg); + reg->type = CONST_PTR_TO_DYNPTR; + reg->dynptr.type = type; + reg->dynptr.first_slot = first_slot; +} + static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) { if (base_type(reg->type) == PTR_TO_MAP_VALUE) { @@ -2334,6 +2483,32 @@ static int mark_reg_read(struct bpf_verifier_env *env, return 0; } +static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi, ret; + + /* For CONST_PTR_TO_DYNPTR, it must have already been done by + * check_reg_arg in check_helper_call and mark_btf_func_reg_size in + * check_kfunc_call. + */ + if (reg->type == CONST_PTR_TO_DYNPTR) + return 0; + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; + /* Caller ensures dynptr is valid and initialized, which means spi is in + * bounds and spi is the first dynptr slot. Simply mark stack slot as + * read. + */ + ret = mark_reg_read(env, &state->stack[spi].spilled_ptr, + state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64); + if (ret) + return ret; + return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr, + state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64); +} + /* This function is supposed to be used by the following 32-bit optimization * code only. It returns TRUE if the source or destination register operates * on 64-bit, otherwise return FALSE. @@ -3309,6 +3484,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, env->insn_aux_data[insn_idx].sanitize_stack_spill = true; } + err = destroy_if_dynptr_stack_slot(env, state, spi); + if (err) + return err; + mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { @@ -3433,6 +3612,14 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, if (err) return err; + for (i = min_off; i < max_off; i++) { + int spi; + + spi = __get_spi(i); + err = destroy_if_dynptr_stack_slot(env, state, spi); + if (err) + return err; + } /* Variable offset writes destroy any spilled pointers in range. */ for (i = min_off; i < max_off; i++) { @@ -5411,6 +5598,31 @@ static int check_stack_range_initialized( } if (meta && meta->raw_mode) { + /* Ensure we won't be overwriting dynptrs when simulating byte + * by byte access in check_helper_call using meta.access_size. + * This would be a problem if we have a helper in the future + * which takes: + * + * helper(uninit_mem, len, dynptr) + * + * Now, uninint_mem may overlap with dynptr pointer. 
Hence, it + * may end up writing to dynptr itself when touching memory from + * arg 1. This can be relaxed on a case by case basis for known + * safe cases, but reject due to the possibilitiy of aliasing by + * default. + */ + for (i = min_off; i < max_off + access_size; i++) { + int stack_off = -i - 1; + + spi = __get_spi(i); + /* raw_mode may write past allocated_stack */ + if (state->allocated_stack <= stack_off) + continue; + if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) { + verbose(env, "potential write to dynptr at off=%d disallowed\n", i); + return -EACCES; + } + } meta->access_size = access_size; meta->regno = regno; return 0; @@ -5854,6 +6066,128 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return 0; } +/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK + * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR. + * + * In both cases we deal with the first 8 bytes, but need to mark the next 8 + * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of + * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object. + * + * Mutability of bpf_dynptr is at two levels, one is at the level of struct + * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct + * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can + * mutate the view of the dynptr and also possibly destroy it. In the latter + * case, it cannot mutate the bpf_dynptr itself but it can still mutate the + * memory that dynptr points to. + * + * The verifier will keep track both levels of mutation (bpf_dynptr's in + * reg->type and the memory's in reg->dynptr.type), but there is no support for + * readonly dynptr view yet, hence only the first case is tracked and checked. + * + * This is consistent with how C applies the const modifier to a struct object, + * where the pointer itself inside bpf_dynptr becomes const but not what it + * points to. + * + * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument + * type, and declare it as 'const struct bpf_dynptr *' in their prototype. + */ +int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + + /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an + * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): + */ + if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { + verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); + return -EFAULT; + } + /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to + * check_func_arg_reg_off's logic. We only need to check offset + * and its alignment for PTR_TO_STACK. + */ + if (reg->type == PTR_TO_STACK) { + int err = dynptr_get_spi(env, reg); + + if (err < 0) + return err; + } + + /* MEM_UNINIT - Points to memory that is an appropriate candidate for + * constructing a mutable bpf_dynptr object. + * + * Currently, this is only possible with PTR_TO_STACK + * pointing to a region of at least 16 bytes which doesn't + * contain an existing bpf_dynptr. + * + * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be + * mutated or destroyed. However, the memory it points to + * may be mutated. + * + * None - Points to a initialized dynptr that can be mutated and + * destroyed, including mutation of the memory it points + * to. 
+ */ + if (arg_type & MEM_UNINIT) { + if (!is_dynptr_reg_valid_uninit(env, reg)) { + verbose(env, "Dynptr has to be an uninitialized dynptr\n"); + return -EINVAL; + } + + /* We only support one dynptr being uninitialized at the moment, + * which is sufficient for the helper functions we have right now. + */ + if (meta->uninit_dynptr_regno) { + verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); + return -EFAULT; + } + + meta->uninit_dynptr_regno = regno; + } else /* MEM_RDONLY and None case from above */ { + int err; + + /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ + if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { + verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); + return -EINVAL; + } + + if (!is_dynptr_reg_valid_init(env, reg)) { + verbose(env, + "Expected an initialized dynptr as arg #%d\n", + regno); + return -EINVAL; + } + + /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ + if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { + const char *err_extra = ""; + + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + err_extra = "local"; + break; + case DYNPTR_TYPE_RINGBUF: + err_extra = "ringbuf"; + break; + default: + err_extra = ""; + break; + } + verbose(env, + "Expected a dynptr of type %s as arg #%d\n", + err_extra, regno); + return -EINVAL; + } + + err = mark_dynptr_read(env, reg); + if (err) + return err; + } + return 0; +} + static bool arg_type_is_mem_size(enum bpf_arg_type type) { return type == ARG_CONST_SIZE || @@ -5986,7 +6320,7 @@ static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } } static const struct bpf_reg_types dynptr_types = { .types = { PTR_TO_STACK, - PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL, + CONST_PTR_TO_DYNPTR, } }; @@ -6110,17 +6444,38 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, enum bpf_arg_type arg_type) { - enum bpf_reg_type type = reg->type; - bool fixed_off_ok = false; + u32 type = reg->type; - switch ((u32)type) { - /* Pointer types where reg offset is explicitly allowed: */ - case PTR_TO_STACK: - if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) { - verbose(env, "cannot pass in dynptr at an offset\n"); + /* When referenced register is passed to release function, its fixed + * offset must be 0. + * + * We will check arg_type_is_release reg has ref_obj_id when storing + * meta->release_regno. + */ + if (arg_type_is_release(arg_type)) { + /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it + * may not directly point to the object being released, but to + * dynptr pointing to such object, which might be at some offset + * on the stack. In that case, we simply to fallback to the + * default handling. + */ + if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK) + return 0; + /* Doing check_ptr_off_reg check for the offset will catch this + * because fixed_off_ok is false, but checking here allows us + * to give the user a better error message. 
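A sketch of the two release-argument cases distinguished here (sample obtained from bpf_ringbuf_reserve(), dptr a dynptr living on the stack):

    bpf_ringbuf_submit(sample + 8, 0);     /* rejected: non-zero fixed offset */
    bpf_ringbuf_submit_dynptr(&dptr, 0);   /* fine: &dptr may sit at e.g. fp-16 */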
+ */ + if (reg->off) { + verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", + regno); return -EINVAL; } - fallthrough; + return __check_ptr_off_reg(env, reg, regno, false); + } + + switch (type) { + /* Pointer types where both fixed and variable offset is explicitly allowed: */ + case PTR_TO_STACK: case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_MAP_KEY: @@ -6131,43 +6486,35 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case SCALAR_VALUE: - /* Some of the argument types nevertheless require a - * zero register offset. - */ - if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM) - return 0; - break; + return 0; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows * fixed offset. */ case PTR_TO_BTF_ID: /* When referenced PTR_TO_BTF_ID is passed to release function, - * it's fixed offset must be 0. In the other cases, fixed offset - * can be non-zero. - */ - if (arg_type_is_release(arg_type) && reg->off) { - verbose(env, "R%d must have zero offset when passed to release func\n", - regno); - return -EINVAL; - } - /* For arg is release pointer, fixed_off_ok must be false, but - * we already checked and rejected reg->off != 0 above, so set - * to true to allow fixed offset for all other cases. + * its fixed offset must be 0. In the other cases, fixed offset + * can be non-zero. This was already checked above. So pass + * fixed_off_ok as true to allow fixed offset for all other + * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we + * still need to do checks instead of returning. */ - fixed_off_ok = true; - break; + return __check_ptr_off_reg(env, reg, regno, true); default: - break; + return __check_ptr_off_reg(env, reg, regno, false); } - return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; - return state->stack[spi].spilled_ptr.id; + if (reg->type == CONST_PTR_TO_DYNPTR) + return reg->ref_obj_id; + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; + return state->stack[spi].spilled_ptr.ref_obj_id; } static int check_func_arg(struct bpf_verifier_env *env, u32 arg, @@ -6231,11 +6578,24 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (arg_type_is_release(arg_type)) { if (arg_type_is_dynptr(arg_type)) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || - !state->stack[spi].spilled_ptr.id) { - verbose(env, "arg %d is an unacquired reference\n", regno); + /* Only dynptr created on stack can be released, thus + * the get_spi and stack state checks for spilled_ptr + * should only be done before process_dynptr_func for + * PTR_TO_STACK. 
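A sketch of the acquired vs. unacquired distinction being made here (rb, data and the dynptr names are illustrative):

    bpf_ringbuf_reserve_dynptr(&rb, 8, 0, &dptr);   /* dptr carries the sample's ref_obj_id */
    bpf_ringbuf_submit_dynptr(&dptr, 0);            /* release is accepted */

    bpf_dynptr_from_mem(data, 8, 0, &dptr2);        /* local dynptr, no reference attached */
    bpf_ringbuf_submit_dynptr(&dptr2, 0);           /* rejected as an unacquired reference */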
+ */ + if (reg->type == PTR_TO_STACK) { + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.ref_obj_id) { + verbose(env, "arg %d is an unacquired reference\n", regno); + return -EINVAL; + } + } else { + verbose(env, "cannot release unowned const bpf_dynptr\n"); return -EINVAL; } } else if (!reg->ref_obj_id && !register_is_null(reg)) { @@ -6332,19 +6692,22 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, break; case ARG_PTR_TO_SPIN_LOCK: if (meta->func_id == BPF_FUNC_spin_lock) { - if (process_spin_lock(env, regno, true)) - return -EACCES; + err = process_spin_lock(env, regno, true); + if (err) + return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - if (process_spin_lock(env, regno, false)) - return -EACCES; + err = process_spin_lock(env, regno, false); + if (err) + return err; } else { verbose(env, "verifier internal error\n"); return -EFAULT; } break; case ARG_PTR_TO_TIMER: - if (process_timer_func(env, regno, meta)) - return -EACCES; + err = process_timer_func(env, regno, meta); + if (err) + return err; break; case ARG_PTR_TO_FUNC: meta->subprogno = reg->subprogno; @@ -6367,52 +6730,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: - /* We only need to check for initialized / uninitialized helper - * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the - * assumption is that if it is, that a helper function - * initialized the dynptr on behalf of the BPF program. - */ - if (base_type(reg->type) == PTR_TO_DYNPTR) - break; - if (arg_type & MEM_UNINIT) { - if (!is_dynptr_reg_valid_uninit(env, reg)) { - verbose(env, "Dynptr has to be an uninitialized dynptr\n"); - return -EINVAL; - } - - /* We only support one dynptr being uninitialized at the moment, - * which is sufficient for the helper functions we have right now. 
- */ - if (meta->uninit_dynptr_regno) { - verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); - return -EFAULT; - } - - meta->uninit_dynptr_regno = regno; - } else if (!is_dynptr_reg_valid_init(env, reg)) { - verbose(env, - "Expected an initialized dynptr as arg #%d\n", - arg + 1); - return -EINVAL; - } else if (!is_dynptr_type_expected(env, reg, arg_type)) { - const char *err_extra = ""; - - switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { - case DYNPTR_TYPE_LOCAL: - err_extra = "local"; - break; - case DYNPTR_TYPE_RINGBUF: - err_extra = "ringbuf"; - break; - default: - err_extra = ""; - break; - } - verbose(env, - "Expected a dynptr of type %s as arg #%d\n", - err_extra, arg + 1); - return -EINVAL; - } + err = process_dynptr_func(env, regno, arg_type, meta); + if (err) + return err; break; case ARG_CONST_ALLOC_SIZE_OR_ZERO: if (!tnum_is_const(reg->var_off)) { @@ -6479,8 +6799,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, break; } case ARG_PTR_TO_KPTR: - if (process_kptr_func(env, regno, meta)) - return -EACCES; + err = process_kptr_func(env, regno, meta); + if (err) + return err; break; } @@ -7241,11 +7562,10 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, { /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void * callback_ctx, u64 flags); - * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx); + * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx); */ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); - callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL; - __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; /* unused */ @@ -7631,7 +7951,15 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); + /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr + * is safe to do directly. + */ if (meta.uninit_dynptr_regno) { + if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) { + verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n"); + return -EFAULT; + } /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, @@ -7649,15 +7977,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.release_regno) { err = -EINVAL; - if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) + /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr + * is safe to do directly. + */ + if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { + if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) { + verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n"); + return -EFAULT; + } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); - else if (meta.ref_obj_id) + } else if (meta.ref_obj_id) { err = release_reference(env, meta.ref_obj_id); - /* meta.ref_obj_id can only be 0 if register that is meant to be - * released is NULL, which must be > R0. 
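For the callback case, a minimal bpf_user_ringbuf_drain() sketch (the map name, attach point and record size are illustrative, not taken from this patch): the dynptr handed to the callback is CONST_PTR_TO_DYNPTR, so it can be read through but never released or re-initialized by the program.

    #include <linux/bpf.h>
    #include <linux/types.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_USER_RINGBUF);
            __uint(max_entries, 4096);
    } user_rb SEC(".maps");

    static long handle_sample(struct bpf_dynptr *dynptr, void *ctx)
    {
            __u64 hdr;

            /* reading through the callback dynptr is allowed ... */
            bpf_dynptr_read(&hdr, sizeof(hdr), dynptr, 0, 0);
            /* ... but bpf_ringbuf_submit_dynptr(dynptr, 0) here would be
             * rejected: the dynptr is not owned by this program's stack
             */
            return 0;
    }

    SEC("raw_tp/sys_enter")
    int drain_user_rb(void *ctx)
    {
            bpf_user_ringbuf_drain(&user_rb, handle_sample, NULL, 0);
            return 0;
    }

    char LICENSE[] SEC("license") = "GPL";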
- */ - else if (register_is_null(®s[meta.release_regno])) + } else if (register_is_null(®s[meta.release_regno])) { + /* meta.ref_obj_id can only be 0 if register that is meant to be + * released is NULL, which must be > R0. + */ err = 0; + } if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", func_id_name(func_id), func_id); @@ -7725,17 +8062,19 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { if (arg_type_is_dynptr(fn->arg_type[i])) { struct bpf_reg_state *reg = ®s[BPF_REG_1 + i]; + int ref_obj_id; if (meta.ref_obj_id) { verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); return -EFAULT; } - if (base_type(reg->type) != PTR_TO_DYNPTR) - /* Find the id of the dynptr we're - * tracking the reference of - */ - meta.ref_obj_id = stack_slot_get_id(env, reg); + ref_obj_id = dynptr_ref_obj_id(env, reg); + if (ref_obj_id < 0) { + verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n"); + return ref_obj_id; + } + meta.ref_obj_id = ref_obj_id; break; } } @@ -11990,10 +12329,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, return false; if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1) continue; - if (!is_spilled_reg(&old->stack[spi])) - continue; - if (!regsafe(env, &old->stack[spi].spilled_ptr, - &cur->stack[spi].spilled_ptr, idmap)) + /* Both old and cur are having same slot_type */ + switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) { + case STACK_SPILL: /* when explored and current stack slot are both storing * spilled registers, check that stored pointers types * are the same as well. @@ -12004,7 +12342,30 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, * such verifier states are not equivalent. 
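The same reasoning carries over to the STACK_DYNPTR case handled just below; a sketch of two paths that must not be merged (names are illustrative):

    if (cond)
            bpf_dynptr_from_mem(data, 8, 0, &dptr);         /* LOCAL dynptr   */
    else
            bpf_ringbuf_reserve_dynptr(&rb, 8, 0, &dptr);   /* RINGBUF dynptr */
    /* dynptr.type, first_slot and ref_obj_id all have to match before the
     * two verifier states can be treated as equivalent. */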
* return false to continue verification of this path */ + if (!regsafe(env, &old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, idmap)) + return false; + break; + case STACK_DYNPTR: + { + const struct bpf_reg_state *old_reg, *cur_reg; + + old_reg = &old->stack[spi].spilled_ptr; + cur_reg = &cur->stack[spi].spilled_ptr; + if (old_reg->dynptr.type != cur_reg->dynptr.type || + old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot || + !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + return false; + break; + } + case STACK_MISC: + case STACK_ZERO: + case STACK_INVALID: + continue; + /* Ensure that new unhandled slot types return false by default */ + default: return false; + } } return true; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 97ecca43386d9..2e74ef9750202 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -207,6 +207,8 @@ static u16 have_exit_callback __read_mostly; static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; +static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); + /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .ns.count = REFCOUNT_INIT(2), @@ -1383,7 +1385,9 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_root_count--; } - cgroup_favor_dynmods(root, false); + if (!have_favordynmods) + cgroup_favor_dynmods(root, false); + cgroup_exit_root_id(root); cgroup_unlock(); @@ -2266,9 +2270,9 @@ static int cgroup_init_fs_context(struct fs_context *fc) fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; -#ifdef CONFIG_CGROUP_FAVOR_DYNMODS - ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; -#endif + if (have_favordynmods) + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; + return 0; } @@ -6762,6 +6766,12 @@ static int __init enable_cgroup_debug(char *str) } __setup("cgroup_debug", enable_cgroup_debug); +static int __init cgroup_favordynmods_setup(char *str) +{ + return (kstrtobool(str, &have_favordynmods) == 0); +} +__setup("cgroup_favordynmods=", cgroup_favordynmods_setup); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 56866aaa2ae1a..177a3fa38ce57 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -24,6 +24,16 @@ config DMA_OPS_BYPASS config ARCH_HAS_DMA_MAP_DIRECT bool +config DMA_PAGE_TOUCHING + bool "Support touching pages when allocated for DMA" + help + Builds in support for binding page touching DMA ops to devices which + don't have an IOMMU. Memory mapped for DMA by those devices will be + access by the CPU via the page touching dma_map_ops to ensure that + the memory is resident when running on a memory overcommit host. + The capacility must still be set up at boot time via the + page_touching.dma_page_touching_enable kernel command line param. + config NEED_SG_DMA_LENGTH bool @@ -131,15 +141,16 @@ config DMA_CMA if DMA_CMA -config DMA_PERNUMA_CMA - bool "Enable separate DMA Contiguous Memory Area for each NUMA Node" - default NUMA && ARM64 +config DMA_NUMA_CMA + bool "Enable separate DMA Contiguous Memory Area for NUMA Node" + default NUMA help - Enable this option to get pernuma CMA areas so that devices like - ARM64 SMMU can get local memory by DMA coherent APIs. + Enable this option to get numa CMA areas so that NUMA devices + can get local memory by DMA coherent APIs. 
You can set the size of pernuma CMA by specifying "cma_pernuma=size" - on the kernel's command line. + or set the node id and its size of CMA by specifying "numa_cma= + :size[,:size]" on the kernel's command line. comment "Default contiguous memory area size:" diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 21926e46ef4fb..c552b9831f5b7 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o obj-$(CONFIG_MMU) += remap.o obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o +obj-$(CONFIG_DMA_PAGE_TOUCHING) += page_touching.o diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 6ea80ae426228..f005c66f378c3 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -50,6 +50,7 @@ #include #include #include +#include #ifdef CONFIG_CMA_SIZE_MBYTES #define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES @@ -96,11 +97,44 @@ static int __init early_cma(char *p) } early_param("cma", early_cma); -#ifdef CONFIG_DMA_PERNUMA_CMA +#ifdef CONFIG_DMA_NUMA_CMA +static struct cma *dma_contiguous_numa_area[MAX_NUMNODES]; +static phys_addr_t numa_cma_size[MAX_NUMNODES] __initdata; static struct cma *dma_contiguous_pernuma_area[MAX_NUMNODES]; static phys_addr_t pernuma_size_bytes __initdata; +static int __init early_numa_cma(char *p) +{ + int nid, count = 0; + unsigned long tmp; + char *s = p; + + while (*s) { + if (sscanf(s, "%lu%n", &tmp, &count) != 1) + break; + + if (s[count] == ':') { + if (tmp >= MAX_NUMNODES) + break; + nid = array_index_nospec(tmp, MAX_NUMNODES); + + s += count + 1; + tmp = memparse(s, &s); + numa_cma_size[nid] = tmp; + + if (*s == ',') + s++; + else + break; + } else + break; + } + + return 0; +} +early_param("numa_cma", early_numa_cma); + static int __init early_cma_pernuma(char *p) { pernuma_size_bytes = memparse(p, &p); @@ -127,32 +161,49 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) #endif -#ifdef CONFIG_DMA_PERNUMA_CMA -void __init dma_pernuma_cma_reserve(void) +#ifdef CONFIG_DMA_NUMA_CMA +static void __init dma_numa_cma_reserve(void) { int nid; - if (!pernuma_size_bytes) - return; - - for_each_online_node(nid) { + for_each_node(nid) { int ret; char name[CMA_MAX_NAME]; - struct cma **cma = &dma_contiguous_pernuma_area[nid]; - - snprintf(name, sizeof(name), "pernuma%d", nid); - ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0, - 0, false, name, cma, nid); - if (ret) { - pr_warn("%s: reservation failed: err %d, node %d", __func__, - ret, nid); + struct cma **cma; + + if (!node_online(nid)) { + if (pernuma_size_bytes || numa_cma_size[nid]) + pr_warn("invalid node %d specified\n", nid); continue; } - pr_debug("%s: reserved %llu MiB on node %d\n", __func__, - (unsigned long long)pernuma_size_bytes / SZ_1M, nid); + if (pernuma_size_bytes) { + + cma = &dma_contiguous_pernuma_area[nid]; + snprintf(name, sizeof(name), "pernuma%d", nid); + ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0, + 0, false, name, cma, nid); + if (ret) + pr_warn("%s: reservation failed: err %d, node %d", __func__, + ret, nid); + } + + if (numa_cma_size[nid]) { + + cma = &dma_contiguous_numa_area[nid]; + snprintf(name, sizeof(name), "numa%d", nid); + ret = cma_declare_contiguous_nid(0, numa_cma_size[nid], 0, 0, 0, false, + name, cma, nid); + if (ret) + pr_warn("%s: reservation failed: err %d, node %d", __func__, + ret, nid); + } } } +#else +static inline void __init dma_numa_cma_reserve(void) +{ +} #endif /** @@ -171,6 +222,8 @@ void __init 
dma_contiguous_reserve(phys_addr_t limit) phys_addr_t selected_limit = limit; bool fixed = false; + dma_numa_cma_reserve(); + pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); if (size_cmdline != -1) { @@ -303,7 +356,7 @@ static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp) */ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { -#ifdef CONFIG_DMA_PERNUMA_CMA +#ifdef CONFIG_DMA_NUMA_CMA int nid = dev_to_node(dev); #endif @@ -315,7 +368,7 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) if (size <= PAGE_SIZE) return NULL; -#ifdef CONFIG_DMA_PERNUMA_CMA +#ifdef CONFIG_DMA_NUMA_CMA if (nid != NUMA_NO_NODE && !(gfp & (GFP_DMA | GFP_DMA32))) { struct cma *cma = dma_contiguous_pernuma_area[nid]; struct page *page; @@ -325,6 +378,13 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) if (page) return page; } + + cma = dma_contiguous_numa_area[nid]; + if (cma) { + page = cma_alloc_aligned(cma, size, gfp); + if (page) + return page; + } } #endif if (!dma_contiguous_default_area) @@ -356,10 +416,13 @@ void dma_free_contiguous(struct device *dev, struct page *page, size_t size) /* * otherwise, page is from either per-numa cma or default cma */ -#ifdef CONFIG_DMA_PERNUMA_CMA +#ifdef CONFIG_DMA_NUMA_CMA if (cma_release(dma_contiguous_pernuma_area[page_to_nid(page)], page, count)) return; + if (cma_release(dma_contiguous_numa_area[page_to_nid(page)], + page, count)) + return; #endif if (cma_release(dma_contiguous_default_area, page, count)) return; diff --git a/kernel/dma/page_touching.c b/kernel/dma/page_touching.c new file mode 100644 index 0000000000000..6ec3123beddaf --- /dev/null +++ b/kernel/dma/page_touching.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2020 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "direct.h" +#include +#include + +/* + * A wrapper around dma_direct which does a readb on the memory being mapped + * for DMA to ensure that it becomes resident. + * Useful when running in a memory overcommit environment with lazy allocation + * and free page reporting. 
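With free page reporting, for instance, a page the guest has reported as free may have been dropped on the host side; the readb loop below (touch_each_page()) boils down to one read per PAGE_SIZE chunk, e.g. for a two-page buffer (vaddr is an illustrative name):

    __raw_readb(vaddr);               /* first page becomes resident  */
    __raw_readb(vaddr + PAGE_SIZE);   /* second page becomes resident */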
+ */ + +/* + * Set with kernel cmd line param: + * page_touching.dma_page_touching_enable=y + */ +static bool dma_page_touching_enable __ro_after_init; +module_param_named(dma_page_touching_enable, dma_page_touching_enable, bool, 0400); +MODULE_PARM_DESC(dma_page_touching_enable, + "Touch pages allocated for DMA to ensure they are resident"); + +static void touch_each_page(void *start_addr, size_t size) +{ + int addr_offset; + + for (addr_offset = 0; addr_offset < size; addr_offset += PAGE_SIZE) + __raw_readb((char *)start_addr + addr_offset); +} + +static void *page_touching_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + unsigned long attrs) +{ + char *kaddr = dma_direct_alloc(dev, size, dma_handle, gfp, attrs); + + if (!kaddr) + return NULL; + touch_each_page(kaddr, size); + return kaddr; + +} + +static dma_addr_t page_touching_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + dma_addr_t dma_handle = dma_direct_map_page(dev, page, offset, size, dir, attrs); + + if (!(dma_mapping_error(dev, dma_handle))) + touch_each_page(page_to_virt(page) + offset, size); + return dma_handle; +} + +static int page_touching_dma_map_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *sg; + int i, ret = dma_direct_map_sg(dev, sglist, nents, dir, attrs); + + if (!ret) + goto out; + + for_each_sg(sglist, sg, nents, i) + touch_each_page(page_to_virt(sg_page(sg)) + sg->offset, sg->length); + +out: + return ret; + +} + +/* + * Only a portion of the dma_map_ops interface is implemented here; enough for + * the EC2 ENA / NVMe drivers to work. + * Notibly missing is alloc_pages. 
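For a device bound to these ops, a streaming mapping roughly flows as follows (a sketch; dev, buf and len are illustrative):

    dma_map_single(dev, buf, len, DMA_TO_DEVICE)
        -> page_touching_dma_map_page()
             -> dma_direct_map_page()        /* obtain the bus address      */
             -> touch_each_page(buf, len)    /* readb one byte per page     */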
+ */ +const static struct dma_map_ops page_touching_dma_ops = { + .alloc = page_touching_dma_alloc, + .free = dma_direct_free, + .mmap = dma_common_mmap, + .map_page = page_touching_dma_map_page, + .unmap_page = dma_direct_unmap_page, + .map_sg = page_touching_dma_map_sg, + .unmap_sg = dma_direct_unmap_sg, + .dma_supported = dma_direct_supported, + .sync_single_for_cpu = dma_direct_sync_single_for_cpu, + .sync_single_for_device = dma_direct_sync_single_for_device, + .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, + .dma_supported = dma_direct_supported, + .get_required_mask = dma_direct_get_required_mask, + .max_mapping_size = dma_direct_max_mapping_size, +}; + +void setup_dma_page_touching_ops(struct device *dev) +{ + if (!dma_page_touching_enable || dev->dma_ops) + return; + + dev_info(dev, "binding to page touching DMA ops\n"); + dev->dma_ops = &page_touching_dma_ops; +} + +static const struct dmi_system_id pt_enable_table[] __initconst = { + { + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Amazon EC2"), + DMI_MATCH(DMI_PRODUCT_NAME, "caspian"), + }, + }, + {}, +}; + +static int __init dmi_enable_pt(void) +{ + if (dmi_check_system(pt_enable_table)) { + pr_info("Automatically enabling page touching for Caspian\n"); + dma_page_touching_enable = 1; + } + return 0; +} +arch_initcall(dmi_enable_pt) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 8ac37e8e738a3..891a895ac218d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -170,11 +170,11 @@ static void irq_state_clr_masked(struct irq_desc *desc) irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } -static void irq_state_clr_started(struct irq_desc *desc) +void irq_state_clr_started(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); } - +EXPORT_SYMBOL_GPL(irq_state_clr_started); static void irq_state_set_started(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_STARTED); diff --git a/kernel/module/signing.c b/kernel/module/signing.c index a2ff4242e623d..f0d2be1ee4f1c 100644 --- a/kernel/module/signing.c +++ b/kernel/module/signing.c @@ -61,10 +61,17 @@ int mod_verify_sig(const void *mod, struct load_info *info) modlen -= sig_len + sizeof(ms); info->len = modlen; - return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, + ret = verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, VERIFY_USE_SECONDARY_KEYRING, VERIFYING_MODULE_SIGNATURE, NULL, NULL); + if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) { + ret = verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); + } + return ret; } int module_sig_check(struct load_info *info, int flags) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 30d1274f03f62..d418800661f82 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -670,8 +670,11 @@ static void power_down(void) hibernation_platform_enter(); fallthrough; case HIBERNATION_SHUTDOWN: - if (kernel_can_power_off()) + if (kernel_can_power_off()) { + entering_platform_hibernation = true; kernel_power_off(); + entering_platform_hibernation = false; + } break; } kernel_halt(); diff --git a/kernel/power/user.c b/kernel/power/user.c index 3a4e70366f354..0d9a3f899c380 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -243,6 +243,10 @@ static int snapshot_set_swap_area(struct snapshot_data *data, if (data->swap < 0) return swdev ? 
-ENODEV : -EINVAL; data->dev = swdev; + + swsusp_resume_device = swdev; + swsusp_resume_block = offset; + return 0; } diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e374c0c923dae..1b3fb1604a536 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -114,7 +114,7 @@ notrace static void __scd_stamp(struct sched_clock_data *scd) scd->tick_raw = sched_clock(); } -notrace static void __set_sched_clock_stable(void) +notrace void set_sched_clock_stable(void) { struct sched_clock_data *scd; @@ -234,7 +234,7 @@ static int __init sched_clock_init_late(void) smp_mb(); /* matches {set,clear}_sched_clock_stable() */ if (__sched_clock_stable_early) - __set_sched_clock_stable(); + set_sched_clock_stable(); return 0; } diff --git a/mm/backing-dev.c b/mm/backing-dev.c index bf5525c2e561a..e581102276f23 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -209,11 +209,40 @@ static ssize_t stable_pages_required_show(struct device *dev, } static DEVICE_ATTR_RO(stable_pages_required); +static ssize_t strict_limit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int strict_limit; + ssize_t ret; + + ret = kstrtouint(buf, 10, &strict_limit); + if (ret < 0) + return ret; + + ret = bdi_set_strict_limit(bdi, strict_limit); + if (!ret) + ret = count; + + return ret; +} + +static ssize_t strict_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%d\n", + !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); +} +static DEVICE_ATTR_RW(strict_limit); + static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, &dev_attr_stable_pages_required.attr, + &dev_attr_strict_limit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); diff --git a/mm/cma.c b/mm/cma.c index 01e9d0b2d8757..f6b8d9b3392e1 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -262,6 +262,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, if (alignment && !is_power_of_2(alignment)) return -EINVAL; + if (!IS_ENABLED(CONFIG_NUMA)) + nid = NUMA_NO_NODE; + /* Sanitise input arguments. 
*/ alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); if (fixed && base & (alignment - 1)) { @@ -367,14 +370,15 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, if (ret) goto free_mem; - pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, - &base); + pr_info("Reserved %ld MiB at %pa on node %d\n", (unsigned long)size / SZ_1M, + &base, nid); return 0; free_mem: memblock_phys_free(base, size); err: - pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); + pr_err("Failed to reserve %ld MiB on node %d\n", (unsigned long)size / SZ_1M, + nid); return ret; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4570d3e315cf1..80a8970202213 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2629,6 +2629,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, { unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); int nr_retries = MAX_RECLAIM_RETRIES; + int timeout = 1; struct mem_cgroup *mem_over_limit; struct page_counter *counter; unsigned long nr_reclaimed; @@ -2710,7 +2711,25 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, */ if (mem_cgroup_wait_acct_move(mem_over_limit)) goto retry; - + /* + * Legacy memcg relies on dirty data throttling during the reclaim + * but this cannot be done for GFP_NOFS requests so we might trigger + * the oom way too early. Throttle here if we have way too many + * dirty/writeback pages. + */ + if ((nr_retries < MAX_RECLAIM_RETRIES/2) && + !cgroup_subsys_on_dfl(memory_cgrp_subsys) && + !(gfp_mask & __GFP_FS)) { + unsigned long dirty = memcg_page_state(memcg, NR_FILE_DIRTY); + unsigned long writeback = memcg_page_state(memcg, NR_WRITEBACK); + + if (4*(dirty + writeback) > + 3*page_counter_read(&memcg->memory)) { + schedule_timeout_interruptible(timeout); + if (timeout < 32) + timeout *= 2; + } + } if (nr_retries--) goto retry; @@ -2730,6 +2749,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, get_order(nr_pages * PAGE_SIZE))) { passed_oom = true; nr_retries = MAX_RECLAIM_RETRIES; + timeout = 1; goto retry; } nomem: diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7dbac6ede7242..bc34e3330f3da 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -720,6 +720,21 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) } EXPORT_SYMBOL(bdi_set_max_ratio); +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) +{ + if (strict_limit > 1) + return -EINVAL; + + spin_lock_bh(&bdi_lock); + if (strict_limit) + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + else + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + spin_unlock_bh(&bdi_lock); + + return 0; +} + static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 382958eef8a92..2022508304400 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -317,6 +317,19 @@ static void page_reporting_process(struct work_struct *work) static DEFINE_MUTEX(page_reporting_mutex); DEFINE_STATIC_KEY_FALSE(page_reporting_enabled); +void page_report_offline(unsigned long start_pfn, unsigned int nr_pages) +{ + struct page_reporting_dev_info *prdev; + + mutex_lock(&page_reporting_mutex); + + prdev = rcu_access_pointer(pr_dev_info); + if (prdev && prdev->report_offline) + prdev->report_offline(prdev, start_pfn, nr_pages); + + mutex_unlock(&page_reporting_mutex); +} + int page_reporting_register(struct page_reporting_dev_info *prdev) { int err = 0; diff --git 
a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 3616225c89ef6..fd0fc09a4e304 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -33,8 +33,14 @@ u32 inet6_ehashfn(const struct net *net, net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret)); net_get_random_once(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); - lhash = (__force u32)laddr->s6_addr32[3]; - fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret); + lhash = jhash_3words((__force u32)laddr->s6_addr32[3], + (((u32)lport) << 16) | (__force u32)fport, + (__force u32)faddr->s6_addr32[0], + ipv6_hash_secret); + fhash = jhash_3words((__force u32)faddr->s6_addr32[1], + (__force u32)faddr->s6_addr32[2], + (__force u32)faddr->s6_addr32[3], + ipv6_hash_secret); return __inet6_ehashfn(lhash, lport, fhash, fport, inet6_ehash_secret + net_hash_mix(net)); diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index c2da6ed32104f..d8a7abbe787ee 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -732,6 +732,7 @@ def __init__(self, parser): 'struct bpf_timer', 'struct mptcp_sock', 'struct bpf_dynptr', + 'const struct bpf_dynptr', 'struct iphdr', 'struct ipv6hdr', } diff --git a/scripts/sign-file.c b/scripts/sign-file.c index 3edb156ae52c3..2549c880c2019 100644 --- a/scripts/sign-file.c +++ b/scripts/sign-file.c @@ -77,7 +77,7 @@ static __attribute__((noreturn)) void format(void) { fprintf(stderr, - "Usage: scripts/sign-file [-dp] []\n"); + "Usage: scripts/sign-file [-dpc] []\n"); fprintf(stderr, " scripts/sign-file -s []\n"); exit(2); @@ -222,6 +222,7 @@ int main(int argc, char **argv) bool save_sig = false, replace_orig; bool sign_only = false; bool raw_sig = false; + bool cert_flags = CMS_NOCERTS; unsigned char buf[4096]; unsigned long module_size, sig_size; unsigned int use_signed_attrs; @@ -249,11 +250,12 @@ int main(int argc, char **argv) #endif do { - opt = getopt(argc, argv, "sdpk"); + opt = getopt(argc, argv, "sdpkc"); switch (opt) { case 's': raw_sig = true; break; case 'p': save_sig = true; break; case 'd': sign_only = true; save_sig = true; break; + case 'c': cert_flags = 0; break; #ifndef USE_PKCS7 case 'k': use_keyid = CMS_USE_KEYID; break; #endif @@ -313,16 +315,16 @@ int main(int argc, char **argv) #ifndef USE_PKCS7 /* Load the signature message from the digest buffer. 
*/ cms = CMS_sign(NULL, NULL, NULL, NULL, - CMS_NOCERTS | CMS_PARTIAL | CMS_BINARY | + cert_flags | CMS_PARTIAL | CMS_BINARY | CMS_DETACHED | CMS_STREAM); ERR(!cms, "CMS_sign"); ERR(!CMS_add1_signer(cms, x509, private_key, digest_algo, - CMS_NOCERTS | CMS_BINARY | + cert_flags | CMS_BINARY | CMS_NOSMIMECAP | use_keyid | use_signed_attrs), "CMS_add1_signer"); - ERR(CMS_final(cms, bm, NULL, CMS_NOCERTS | CMS_BINARY) != 1, + ERR(CMS_final(cms, bm, NULL, cert_flags | CMS_BINARY) != 1, "CMS_final"); #else diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index a00d191394365..3f0adb987c1ee 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -293,8 +293,6 @@ static ssize_t sel_write_disable(struct file *file, const char __user *buf, * kernel releases until eventually it is removed */ pr_err("SELinux: Runtime disable is deprecated, use selinux=0 on the kernel cmdline.\n"); - pr_err("SELinux: https://github.com/SELinuxProject/selinux-kernel/wiki/DEPRECATE-runtime-disable\n"); - ssleep(5); if (count >= PAGE_SIZE) return -ENOMEM; diff --git a/tools/arch/arm64/include/asm/insn.h b/tools/arch/arm64/include/asm/insn.h new file mode 100644 index 0000000000000..8393456922147 --- /dev/null +++ b/tools/arch/arm64/include/asm/insn.h @@ -0,0 +1,458 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013 Huawei Ltd. + * Author: Jiang Liu + * + * Copyright (C) 2014 Zi Shen Lim + */ +#ifndef __ASM_INSN_H +#define __ASM_INSN_H +#include +#include + +/* A64 instructions are always 32 bits. */ +#define AARCH64_INSN_SIZE 4 + +#ifndef __ASSEMBLY__ +/* + * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a + * Section C3.1 "A64 instruction index by encoding": + * AArch64 main encoding table + * Bit position + * 28 27 26 25 Encoding Group + * 0 0 - - Unallocated + * 1 0 0 - Data processing, immediate + * 1 0 1 - Branch, exception generation and system instructions + * - 1 - 0 Loads and stores + * - 1 0 1 Data processing - register + * 0 1 1 1 Data processing - SIMD and floating point + * 1 1 1 1 Data processing - SIMD and floating point + * "-" means "don't care" + */ +enum aarch64_insn_encoding_class { + AARCH64_INSN_CLS_UNKNOWN, /* UNALLOCATED */ + AARCH64_INSN_CLS_SVE, /* SVE instructions */ + AARCH64_INSN_CLS_DP_IMM, /* Data processing - immediate */ + AARCH64_INSN_CLS_DP_REG, /* Data processing - register */ + AARCH64_INSN_CLS_DP_FPSIMD, /* Data processing - SIMD and FP */ + AARCH64_INSN_CLS_LDST, /* Loads and stores */ + AARCH64_INSN_CLS_BR_SYS, /* Branch, exception generation and + * system instructions */ +}; + +enum aarch64_insn_hint_cr_op { + AARCH64_INSN_HINT_NOP = 0x0 << 5, + AARCH64_INSN_HINT_YIELD = 0x1 << 5, + AARCH64_INSN_HINT_WFE = 0x2 << 5, + AARCH64_INSN_HINT_WFI = 0x3 << 5, + AARCH64_INSN_HINT_SEV = 0x4 << 5, + AARCH64_INSN_HINT_SEVL = 0x5 << 5, + + AARCH64_INSN_HINT_XPACLRI = 0x07 << 5, + AARCH64_INSN_HINT_PACIA_1716 = 0x08 << 5, + AARCH64_INSN_HINT_PACIB_1716 = 0x0A << 5, + AARCH64_INSN_HINT_AUTIA_1716 = 0x0C << 5, + AARCH64_INSN_HINT_AUTIB_1716 = 0x0E << 5, + AARCH64_INSN_HINT_PACIAZ = 0x18 << 5, + AARCH64_INSN_HINT_PACIASP = 0x19 << 5, + AARCH64_INSN_HINT_PACIBZ = 0x1A << 5, + AARCH64_INSN_HINT_PACIBSP = 0x1B << 5, + AARCH64_INSN_HINT_AUTIAZ = 0x1C << 5, + AARCH64_INSN_HINT_AUTIASP = 0x1D << 5, + AARCH64_INSN_HINT_AUTIBZ = 0x1E << 5, + AARCH64_INSN_HINT_AUTIBSP = 0x1F << 5, + + AARCH64_INSN_HINT_ESB = 0x10 << 5, + AARCH64_INSN_HINT_PSB = 0x11 << 5, + AARCH64_INSN_HINT_TSB = 0x12 << 5, + AARCH64_INSN_HINT_CSDB = 
0x14 << 5, + + AARCH64_INSN_HINT_BTI = 0x20 << 5, + AARCH64_INSN_HINT_BTIC = 0x22 << 5, + AARCH64_INSN_HINT_BTIJ = 0x24 << 5, + AARCH64_INSN_HINT_BTIJC = 0x26 << 5, +}; + +enum aarch64_insn_imm_type { + AARCH64_INSN_IMM_ADR, + AARCH64_INSN_IMM_26, + AARCH64_INSN_IMM_19, + AARCH64_INSN_IMM_16, + AARCH64_INSN_IMM_14, + AARCH64_INSN_IMM_12, + AARCH64_INSN_IMM_9, + AARCH64_INSN_IMM_7, + AARCH64_INSN_IMM_6, + AARCH64_INSN_IMM_S, + AARCH64_INSN_IMM_R, + AARCH64_INSN_IMM_N, + AARCH64_INSN_IMM_MAX +}; + +enum aarch64_insn_register_type { + AARCH64_INSN_REGTYPE_RT, + AARCH64_INSN_REGTYPE_RN, + AARCH64_INSN_REGTYPE_RT2, + AARCH64_INSN_REGTYPE_RM, + AARCH64_INSN_REGTYPE_RD, + AARCH64_INSN_REGTYPE_RA, + AARCH64_INSN_REGTYPE_RS, +}; + +enum aarch64_insn_register { + AARCH64_INSN_REG_0 = 0, + AARCH64_INSN_REG_1 = 1, + AARCH64_INSN_REG_2 = 2, + AARCH64_INSN_REG_3 = 3, + AARCH64_INSN_REG_4 = 4, + AARCH64_INSN_REG_5 = 5, + AARCH64_INSN_REG_6 = 6, + AARCH64_INSN_REG_7 = 7, + AARCH64_INSN_REG_8 = 8, + AARCH64_INSN_REG_9 = 9, + AARCH64_INSN_REG_10 = 10, + AARCH64_INSN_REG_11 = 11, + AARCH64_INSN_REG_12 = 12, + AARCH64_INSN_REG_13 = 13, + AARCH64_INSN_REG_14 = 14, + AARCH64_INSN_REG_15 = 15, + AARCH64_INSN_REG_16 = 16, + AARCH64_INSN_REG_17 = 17, + AARCH64_INSN_REG_18 = 18, + AARCH64_INSN_REG_19 = 19, + AARCH64_INSN_REG_20 = 20, + AARCH64_INSN_REG_21 = 21, + AARCH64_INSN_REG_22 = 22, + AARCH64_INSN_REG_23 = 23, + AARCH64_INSN_REG_24 = 24, + AARCH64_INSN_REG_25 = 25, + AARCH64_INSN_REG_26 = 26, + AARCH64_INSN_REG_27 = 27, + AARCH64_INSN_REG_28 = 28, + AARCH64_INSN_REG_29 = 29, + AARCH64_INSN_REG_FP = 29, /* Frame pointer */ + AARCH64_INSN_REG_30 = 30, + AARCH64_INSN_REG_LR = 30, /* Link register */ + AARCH64_INSN_REG_ZR = 31, /* Zero: as source register */ + AARCH64_INSN_REG_SP = 31 /* Stack pointer: as load/store base reg */ +}; + +enum aarch64_insn_special_register { + AARCH64_INSN_SPCLREG_SPSR_EL1 = 0xC200, + AARCH64_INSN_SPCLREG_ELR_EL1 = 0xC201, + AARCH64_INSN_SPCLREG_SP_EL0 = 0xC208, + AARCH64_INSN_SPCLREG_SPSEL = 0xC210, + AARCH64_INSN_SPCLREG_CURRENTEL = 0xC212, + AARCH64_INSN_SPCLREG_DAIF = 0xDA11, + AARCH64_INSN_SPCLREG_NZCV = 0xDA10, + AARCH64_INSN_SPCLREG_FPCR = 0xDA20, + AARCH64_INSN_SPCLREG_DSPSR_EL0 = 0xDA28, + AARCH64_INSN_SPCLREG_DLR_EL0 = 0xDA29, + AARCH64_INSN_SPCLREG_SPSR_EL2 = 0xE200, + AARCH64_INSN_SPCLREG_ELR_EL2 = 0xE201, + AARCH64_INSN_SPCLREG_SP_EL1 = 0xE208, + AARCH64_INSN_SPCLREG_SPSR_INQ = 0xE218, + AARCH64_INSN_SPCLREG_SPSR_ABT = 0xE219, + AARCH64_INSN_SPCLREG_SPSR_UND = 0xE21A, + AARCH64_INSN_SPCLREG_SPSR_FIQ = 0xE21B, + AARCH64_INSN_SPCLREG_SPSR_EL3 = 0xF200, + AARCH64_INSN_SPCLREG_ELR_EL3 = 0xF201, + AARCH64_INSN_SPCLREG_SP_EL2 = 0xF210 +}; + +enum aarch64_insn_variant { + AARCH64_INSN_VARIANT_32BIT, + AARCH64_INSN_VARIANT_64BIT +}; + +enum aarch64_insn_condition { + AARCH64_INSN_COND_EQ = 0x0, /* == */ + AARCH64_INSN_COND_NE = 0x1, /* != */ + AARCH64_INSN_COND_CS = 0x2, /* unsigned >= */ + AARCH64_INSN_COND_CC = 0x3, /* unsigned < */ + AARCH64_INSN_COND_MI = 0x4, /* < 0 */ + AARCH64_INSN_COND_PL = 0x5, /* >= 0 */ + AARCH64_INSN_COND_VS = 0x6, /* overflow */ + AARCH64_INSN_COND_VC = 0x7, /* no overflow */ + AARCH64_INSN_COND_HI = 0x8, /* unsigned > */ + AARCH64_INSN_COND_LS = 0x9, /* unsigned <= */ + AARCH64_INSN_COND_GE = 0xa, /* signed >= */ + AARCH64_INSN_COND_LT = 0xb, /* signed < */ + AARCH64_INSN_COND_GT = 0xc, /* signed > */ + AARCH64_INSN_COND_LE = 0xd, /* signed <= */ + AARCH64_INSN_COND_AL = 0xe, /* always */ +}; + +enum aarch64_insn_branch_type { + 
AARCH64_INSN_BRANCH_NOLINK, + AARCH64_INSN_BRANCH_LINK, + AARCH64_INSN_BRANCH_RETURN, + AARCH64_INSN_BRANCH_COMP_ZERO, + AARCH64_INSN_BRANCH_COMP_NONZERO, +}; + +enum aarch64_insn_size_type { + AARCH64_INSN_SIZE_8, + AARCH64_INSN_SIZE_16, + AARCH64_INSN_SIZE_32, + AARCH64_INSN_SIZE_64, +}; + +enum aarch64_insn_ldst_type { + AARCH64_INSN_LDST_LOAD_REG_OFFSET, + AARCH64_INSN_LDST_STORE_REG_OFFSET, + AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX, + AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX, + AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX, + AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX, + AARCH64_INSN_LDST_LOAD_EX, + AARCH64_INSN_LDST_STORE_EX, +}; + +enum aarch64_insn_adsb_type { + AARCH64_INSN_ADSB_ADD, + AARCH64_INSN_ADSB_SUB, + AARCH64_INSN_ADSB_ADD_SETFLAGS, + AARCH64_INSN_ADSB_SUB_SETFLAGS +}; + +enum aarch64_insn_movewide_type { + AARCH64_INSN_MOVEWIDE_ZERO, + AARCH64_INSN_MOVEWIDE_KEEP, + AARCH64_INSN_MOVEWIDE_INVERSE +}; + +enum aarch64_insn_bitfield_type { + AARCH64_INSN_BITFIELD_MOVE, + AARCH64_INSN_BITFIELD_MOVE_UNSIGNED, + AARCH64_INSN_BITFIELD_MOVE_SIGNED +}; + +enum aarch64_insn_data1_type { + AARCH64_INSN_DATA1_REVERSE_16, + AARCH64_INSN_DATA1_REVERSE_32, + AARCH64_INSN_DATA1_REVERSE_64, +}; + +enum aarch64_insn_data2_type { + AARCH64_INSN_DATA2_UDIV, + AARCH64_INSN_DATA2_SDIV, + AARCH64_INSN_DATA2_LSLV, + AARCH64_INSN_DATA2_LSRV, + AARCH64_INSN_DATA2_ASRV, + AARCH64_INSN_DATA2_RORV, +}; + +enum aarch64_insn_data3_type { + AARCH64_INSN_DATA3_MADD, + AARCH64_INSN_DATA3_MSUB, +}; + +enum aarch64_insn_logic_type { + AARCH64_INSN_LOGIC_AND, + AARCH64_INSN_LOGIC_BIC, + AARCH64_INSN_LOGIC_ORR, + AARCH64_INSN_LOGIC_ORN, + AARCH64_INSN_LOGIC_EOR, + AARCH64_INSN_LOGIC_EON, + AARCH64_INSN_LOGIC_AND_SETFLAGS, + AARCH64_INSN_LOGIC_BIC_SETFLAGS +}; + +enum aarch64_insn_prfm_type { + AARCH64_INSN_PRFM_TYPE_PLD, + AARCH64_INSN_PRFM_TYPE_PLI, + AARCH64_INSN_PRFM_TYPE_PST, +}; + +enum aarch64_insn_prfm_target { + AARCH64_INSN_PRFM_TARGET_L1, + AARCH64_INSN_PRFM_TARGET_L2, + AARCH64_INSN_PRFM_TARGET_L3, +}; + +enum aarch64_insn_prfm_policy { + AARCH64_INSN_PRFM_POLICY_KEEP, + AARCH64_INSN_PRFM_POLICY_STRM, +}; + +enum aarch64_insn_adr_type { + AARCH64_INSN_ADR_TYPE_ADRP, + AARCH64_INSN_ADR_TYPE_ADR, +}; + +#define __AARCH64_INSN_FUNCS(abbr, mask, val) \ +static __always_inline bool aarch64_insn_is_##abbr(u32 code) \ +{ \ + BUILD_BUG_ON(~(mask) & (val)); \ + return (code & (mask)) == (val); \ +} \ +static __always_inline u32 aarch64_insn_get_##abbr##_value(void) \ +{ \ + return (val); \ +} + +__AARCH64_INSN_FUNCS(adr, 0x9F000000, 0x10000000) +__AARCH64_INSN_FUNCS(adrp, 0x9F000000, 0x90000000) +__AARCH64_INSN_FUNCS(prfm, 0x3FC00000, 0x39800000) +__AARCH64_INSN_FUNCS(prfm_lit, 0xFF000000, 0xD8000000) +__AARCH64_INSN_FUNCS(store_imm, 0x3FC00000, 0x39000000) +__AARCH64_INSN_FUNCS(load_imm, 0x3FC00000, 0x39400000) +__AARCH64_INSN_FUNCS(store_pre, 0x3FE00C00, 0x38000C00) +__AARCH64_INSN_FUNCS(load_pre, 0x3FE00C00, 0x38400C00) +__AARCH64_INSN_FUNCS(store_post, 0x3FE00C00, 0x38000400) +__AARCH64_INSN_FUNCS(load_post, 0x3FE00C00, 0x38400400) +__AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800) +__AARCH64_INSN_FUNCS(ldadd, 0x3F20FC00, 0x38200000) +__AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800) +__AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000) +__AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) +__AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) +__AARCH64_INSN_FUNCS(load_ex, 0x3F400000, 0x08400000) +__AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000) +__AARCH64_INSN_FUNCS(stp, 0x7FC00000, 0x29000000) 
+__AARCH64_INSN_FUNCS(ldp, 0x7FC00000, 0x29400000) +__AARCH64_INSN_FUNCS(stp_post, 0x7FC00000, 0x28800000) +__AARCH64_INSN_FUNCS(ldp_post, 0x7FC00000, 0x28C00000) +__AARCH64_INSN_FUNCS(stp_pre, 0x7FC00000, 0x29800000) +__AARCH64_INSN_FUNCS(ldp_pre, 0x7FC00000, 0x29C00000) +__AARCH64_INSN_FUNCS(add_imm, 0x7F000000, 0x11000000) +__AARCH64_INSN_FUNCS(adds_imm, 0x7F000000, 0x31000000) +__AARCH64_INSN_FUNCS(sub_imm, 0x7F000000, 0x51000000) +__AARCH64_INSN_FUNCS(subs_imm, 0x7F000000, 0x71000000) +__AARCH64_INSN_FUNCS(movn, 0x7F800000, 0x12800000) +__AARCH64_INSN_FUNCS(sbfm, 0x7F800000, 0x13000000) +__AARCH64_INSN_FUNCS(bfm, 0x7F800000, 0x33000000) +__AARCH64_INSN_FUNCS(movz, 0x7F800000, 0x52800000) +__AARCH64_INSN_FUNCS(ubfm, 0x7F800000, 0x53000000) +__AARCH64_INSN_FUNCS(movk, 0x7F800000, 0x72800000) +__AARCH64_INSN_FUNCS(add, 0x7F200000, 0x0B000000) +__AARCH64_INSN_FUNCS(adds, 0x7F200000, 0x2B000000) +__AARCH64_INSN_FUNCS(sub, 0x7F200000, 0x4B000000) +__AARCH64_INSN_FUNCS(subs, 0x7F200000, 0x6B000000) +__AARCH64_INSN_FUNCS(madd, 0x7FE08000, 0x1B000000) +__AARCH64_INSN_FUNCS(msub, 0x7FE08000, 0x1B008000) +__AARCH64_INSN_FUNCS(udiv, 0x7FE0FC00, 0x1AC00800) +__AARCH64_INSN_FUNCS(sdiv, 0x7FE0FC00, 0x1AC00C00) +__AARCH64_INSN_FUNCS(lslv, 0x7FE0FC00, 0x1AC02000) +__AARCH64_INSN_FUNCS(lsrv, 0x7FE0FC00, 0x1AC02400) +__AARCH64_INSN_FUNCS(asrv, 0x7FE0FC00, 0x1AC02800) +__AARCH64_INSN_FUNCS(rorv, 0x7FE0FC00, 0x1AC02C00) +__AARCH64_INSN_FUNCS(rev16, 0x7FFFFC00, 0x5AC00400) +__AARCH64_INSN_FUNCS(rev32, 0x7FFFFC00, 0x5AC00800) +__AARCH64_INSN_FUNCS(rev64, 0x7FFFFC00, 0x5AC00C00) +__AARCH64_INSN_FUNCS(and, 0x7F200000, 0x0A000000) +__AARCH64_INSN_FUNCS(bic, 0x7F200000, 0x0A200000) +__AARCH64_INSN_FUNCS(orr, 0x7F200000, 0x2A000000) +__AARCH64_INSN_FUNCS(mov_reg, 0x7FE0FFE0, 0x2A0003E0) +__AARCH64_INSN_FUNCS(orn, 0x7F200000, 0x2A200000) +__AARCH64_INSN_FUNCS(eor, 0x7F200000, 0x4A000000) +__AARCH64_INSN_FUNCS(eon, 0x7F200000, 0x4A200000) +__AARCH64_INSN_FUNCS(ands, 0x7F200000, 0x6A000000) +__AARCH64_INSN_FUNCS(bics, 0x7F200000, 0x6A200000) +__AARCH64_INSN_FUNCS(and_imm, 0x7F800000, 0x12000000) +__AARCH64_INSN_FUNCS(orr_imm, 0x7F800000, 0x32000000) +__AARCH64_INSN_FUNCS(eor_imm, 0x7F800000, 0x52000000) +__AARCH64_INSN_FUNCS(ands_imm, 0x7F800000, 0x72000000) +__AARCH64_INSN_FUNCS(extr, 0x7FA00000, 0x13800000) +__AARCH64_INSN_FUNCS(b, 0xFC000000, 0x14000000) +__AARCH64_INSN_FUNCS(bl, 0xFC000000, 0x94000000) +__AARCH64_INSN_FUNCS(cbz, 0x7F000000, 0x34000000) +__AARCH64_INSN_FUNCS(cbnz, 0x7F000000, 0x35000000) +__AARCH64_INSN_FUNCS(tbz, 0x7F000000, 0x36000000) +__AARCH64_INSN_FUNCS(tbnz, 0x7F000000, 0x37000000) +__AARCH64_INSN_FUNCS(bcond, 0xFF000010, 0x54000000) +__AARCH64_INSN_FUNCS(svc, 0xFFE0001F, 0xD4000001) +__AARCH64_INSN_FUNCS(hvc, 0xFFE0001F, 0xD4000002) +__AARCH64_INSN_FUNCS(smc, 0xFFE0001F, 0xD4000003) +__AARCH64_INSN_FUNCS(brk, 0xFFE0001F, 0xD4200000) +__AARCH64_INSN_FUNCS(exception, 0xFF000000, 0xD4000000) +__AARCH64_INSN_FUNCS(hint, 0xFFFFF01F, 0xD503201F) +__AARCH64_INSN_FUNCS(br, 0xFFFFFC1F, 0xD61F0000) +__AARCH64_INSN_FUNCS(br_auth, 0xFEFFF800, 0xD61F0800) +__AARCH64_INSN_FUNCS(blr, 0xFFFFFC1F, 0xD63F0000) +__AARCH64_INSN_FUNCS(blr_auth, 0xFEFFF800, 0xD63F0800) +__AARCH64_INSN_FUNCS(ret, 0xFFFFFC1F, 0xD65F0000) +__AARCH64_INSN_FUNCS(ret_auth, 0xFFFFFBFF, 0xD65F0BFF) +__AARCH64_INSN_FUNCS(eret, 0xFFFFFFFF, 0xD69F03E0) +__AARCH64_INSN_FUNCS(eret_auth, 0xFFFFFBFF, 0xD69F0BFF) +__AARCH64_INSN_FUNCS(mrs, 0xFFF00000, 0xD5300000) +__AARCH64_INSN_FUNCS(msr_imm, 0xFFF8F01F, 0xD500401F) 
+__AARCH64_INSN_FUNCS(msr_reg, 0xFFF00000, 0xD5100000) +__AARCH64_INSN_FUNCS(dmb, 0xFFFFF0FF, 0xD50330BF) +__AARCH64_INSN_FUNCS(dsb_base, 0xFFFFF0FF, 0xD503309F) +__AARCH64_INSN_FUNCS(dsb_nxs, 0xFFFFF3FF, 0xD503323F) +__AARCH64_INSN_FUNCS(isb, 0xFFFFF0FF, 0xD50330DF) +__AARCH64_INSN_FUNCS(sb, 0xFFFFFFFF, 0xD50330FF) +__AARCH64_INSN_FUNCS(clrex, 0xFFFFF0FF, 0xD503305F) +__AARCH64_INSN_FUNCS(ssbb, 0xFFFFFFFF, 0xD503309F) +__AARCH64_INSN_FUNCS(pssbb, 0xFFFFFFFF, 0xD503349F) + +#undef __AARCH64_INSN_FUNCS + +bool aarch64_insn_is_steppable_hint(u32 insn); +bool aarch64_insn_is_branch_imm(u32 insn); + +static inline bool aarch64_insn_is_adr_adrp(u32 insn) +{ + return aarch64_insn_is_adr(insn) || aarch64_insn_is_adrp(insn); +} + +static inline bool aarch64_insn_is_dsb(u32 insn) +{ + return (aarch64_insn_is_dsb_base(insn) && (insn & 0xb00)) || + aarch64_insn_is_dsb_nxs(insn); +} + +static inline bool aarch64_insn_is_barrier(u32 insn) +{ + return aarch64_insn_is_dmb(insn) || aarch64_insn_is_dsb(insn) || + aarch64_insn_is_isb(insn) || aarch64_insn_is_sb(insn) || + aarch64_insn_is_clrex(insn) || aarch64_insn_is_ssbb(insn) || + aarch64_insn_is_pssbb(insn); +} + +static inline bool aarch64_insn_is_store_single(u32 insn) +{ + return aarch64_insn_is_store_imm(insn) || + aarch64_insn_is_store_pre(insn) || + aarch64_insn_is_store_post(insn); +} + +static inline bool aarch64_insn_is_store_pair(u32 insn) +{ + return aarch64_insn_is_stp(insn) || + aarch64_insn_is_stp_pre(insn) || + aarch64_insn_is_stp_post(insn); +} + +static inline bool aarch64_insn_is_load_single(u32 insn) +{ + return aarch64_insn_is_load_imm(insn) || + aarch64_insn_is_load_pre(insn) || + aarch64_insn_is_load_post(insn); +} + +static inline bool aarch64_insn_is_load_pair(u32 insn) +{ + return aarch64_insn_is_ldp(insn) || + aarch64_insn_is_ldp_pre(insn) || + aarch64_insn_is_ldp_post(insn); +} + +enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn); +bool aarch64_insn_uses_literal(u32 insn); +bool aarch64_insn_is_branch(u32 insn); +u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn); +u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type, + u32 insn); +u32 aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op); +u32 aarch64_insn_gen_nop(void); +u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, + enum aarch64_insn_branch_type type); +s32 aarch64_get_branch_offset(u32 insn); +s32 aarch64_insn_adrp_get_offset(u32 insn); + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_INSN_H */ diff --git a/tools/arch/arm64/include/asm/unwind_hints.h b/tools/arch/arm64/include/asm/unwind_hints.h new file mode 100644 index 0000000000000..8655058aa63c8 --- /dev/null +++ b/tools/arch/arm64/include/asm/unwind_hints.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_UNWIND_HINTS_H +#define __ASM_UNWIND_HINTS_H + +#include + +#define UNWIND_HINT_REG_UNDEFINED 0xff +#define UNWIND_HINT_REG_SP 31 + +#ifdef __ASSEMBLY__ + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=UNWIND_HINT_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL end=1 +.endm + +.macro UNWIND_HINT_FUNC sp_offset=0 + UNWIND_HINT sp_reg=UNWIND_HINT_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL +.endm + +.macro UNWIND_HINT_REGS base=UNWIND_HINT_REG_SP offset=0 + UNWIND_HINT sp_reg=\base sp_offset=\offset type=UNWIND_HINT_TYPE_REGS +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_UNWIND_HINTS_H */ diff --git a/tools/arch/arm64/lib/insn.c b/tools/arch/arm64/lib/insn.c new file mode 100644 index 
0000000000000..b0cc984fcf6a8 --- /dev/null +++ b/tools/arch/arm64/lib/insn.c @@ -0,0 +1,335 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2013 Huawei Ltd. + * Author: Jiang Liu + * + * Copyright (C) 2014-2016 Zi Shen Lim + */ +#include +#include +#include + +#include +#include + +#define AARCH64_DECODE_FAULT 0xFFFFFFFF + +#define AARCH64_INSN_SF_BIT BIT(31) +#define AARCH64_INSN_N_BIT BIT(22) +#define AARCH64_INSN_LSL_12 BIT(22) + +static const int aarch64_insn_encoding_class[] = { + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_SVE, + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_REG, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_FPSIMD, + AARCH64_INSN_CLS_DP_IMM, + AARCH64_INSN_CLS_DP_IMM, + AARCH64_INSN_CLS_BR_SYS, + AARCH64_INSN_CLS_BR_SYS, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_REG, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_FPSIMD, +}; + +enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn) +{ + return aarch64_insn_encoding_class[(insn >> 25) & 0xf]; +} + +bool aarch64_insn_is_steppable_hint(u32 insn) +{ + if (!aarch64_insn_is_hint(insn)) + return false; + + switch (insn & 0xFE0) { + case AARCH64_INSN_HINT_XPACLRI: + case AARCH64_INSN_HINT_PACIA_1716: + case AARCH64_INSN_HINT_PACIB_1716: + case AARCH64_INSN_HINT_PACIAZ: + case AARCH64_INSN_HINT_PACIASP: + case AARCH64_INSN_HINT_PACIBZ: + case AARCH64_INSN_HINT_PACIBSP: + case AARCH64_INSN_HINT_BTI: + case AARCH64_INSN_HINT_BTIC: + case AARCH64_INSN_HINT_BTIJ: + case AARCH64_INSN_HINT_BTIJC: + case AARCH64_INSN_HINT_NOP: + return true; + default: + return false; + } +} + +bool aarch64_insn_is_branch_imm(u32 insn) +{ + return (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn) || + aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn) || + aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)); +} + +bool aarch64_insn_uses_literal(u32 insn) +{ + /* ldr/ldrsw (literal), prfm */ + + return aarch64_insn_is_ldr_lit(insn) || + aarch64_insn_is_ldrsw_lit(insn) || + aarch64_insn_is_adr_adrp(insn) || + aarch64_insn_is_prfm_lit(insn); +} + +bool aarch64_insn_is_branch(u32 insn) +{ + /* b, bl, cb*, tb*, ret*, b.cond, br*, blr* */ + + return aarch64_insn_is_b(insn) || + aarch64_insn_is_bl(insn) || + aarch64_insn_is_cbz(insn) || + aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_tbz(insn) || + aarch64_insn_is_tbnz(insn) || + aarch64_insn_is_ret(insn) || + aarch64_insn_is_ret_auth(insn) || + aarch64_insn_is_br(insn) || + aarch64_insn_is_br_auth(insn) || + aarch64_insn_is_blr(insn) || + aarch64_insn_is_blr_auth(insn) || + aarch64_insn_is_bcond(insn); +} + +static int aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type, + u32 *maskp, int *shiftp) +{ + u32 mask; + int shift; + + switch (type) { + case AARCH64_INSN_IMM_26: + mask = BIT(26) - 1; + shift = 0; + break; + case AARCH64_INSN_IMM_19: + mask = BIT(19) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_16: + mask = BIT(16) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_14: + mask = BIT(14) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_12: + mask = BIT(12) - 1; + shift = 10; + break; + case AARCH64_INSN_IMM_9: + mask = BIT(9) - 1; + shift = 12; + break; + case AARCH64_INSN_IMM_7: + mask = BIT(7) - 1; + shift = 15; + break; + case AARCH64_INSN_IMM_6: + case AARCH64_INSN_IMM_S: + mask = BIT(6) - 1; + shift = 10; + break; + case AARCH64_INSN_IMM_R: + mask = BIT(6) - 1; + shift = 16; + break; + case AARCH64_INSN_IMM_N: + mask = 1; + shift = 22; + 
break; + default: + return -EINVAL; + } + + *maskp = mask; + *shiftp = shift; + + return 0; +} + +#define ADR_IMM_HILOSPLIT 2 +#define ADR_IMM_SIZE SZ_2M +#define ADR_IMM_LOMASK ((1 << ADR_IMM_HILOSPLIT) - 1) +#define ADR_IMM_HIMASK ((ADR_IMM_SIZE >> ADR_IMM_HILOSPLIT) - 1) +#define ADR_IMM_LOSHIFT 29 +#define ADR_IMM_HISHIFT 5 + +u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn) +{ + u32 immlo, immhi, mask; + int shift; + + switch (type) { + case AARCH64_INSN_IMM_ADR: + shift = 0; + immlo = (insn >> ADR_IMM_LOSHIFT) & ADR_IMM_LOMASK; + immhi = (insn >> ADR_IMM_HISHIFT) & ADR_IMM_HIMASK; + insn = (immhi << ADR_IMM_HILOSPLIT) | immlo; + mask = ADR_IMM_SIZE - 1; + break; + default: + if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { + WARN("aarch64_insn_decode_immediate: unknown immediate encoding %d\n", + type); + return AARCH64_DECODE_FAULT; + } + } + + return (insn >> shift) & mask; +} + +u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type, + u32 insn) +{ + int shift; + + switch (type) { + case AARCH64_INSN_REGTYPE_RT: + case AARCH64_INSN_REGTYPE_RD: + shift = 0; + break; + case AARCH64_INSN_REGTYPE_RN: + shift = 5; + break; + case AARCH64_INSN_REGTYPE_RT2: + case AARCH64_INSN_REGTYPE_RA: + shift = 10; + break; + case AARCH64_INSN_REGTYPE_RM: + shift = 16; + break; + default: + WARN("%s: unknown register type encoding %d\n", __func__, + type); + return AARCH64_DECODE_FAULT; + } + + return (insn >> shift) & GENMASK(4, 0); +} + +u32 aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op) +{ + return aarch64_insn_get_hint_value() | op; +} + +u32 aarch64_insn_gen_nop(void) +{ + return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP); +} + +static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type, + u32 insn, + enum aarch64_insn_register reg) +{ + int shift; + + if (insn == AARCH64_DECODE_FAULT) + return AARCH64_DECODE_FAULT; + + if (reg < AARCH64_INSN_REG_0 || reg > AARCH64_INSN_REG_SP) { + WARN("%s: unknown register encoding %d\n", __func__, reg); + return AARCH64_DECODE_FAULT; + } + + switch (type) { + case AARCH64_INSN_REGTYPE_RT: + case AARCH64_INSN_REGTYPE_RD: + shift = 0; + break; + case AARCH64_INSN_REGTYPE_RN: + shift = 5; + break; + case AARCH64_INSN_REGTYPE_RT2: + case AARCH64_INSN_REGTYPE_RA: + shift = 10; + break; + case AARCH64_INSN_REGTYPE_RM: + case AARCH64_INSN_REGTYPE_RS: + shift = 16; + break; + default: + WARN("%s: unknown register type encoding %d\n", __func__, + type); + return AARCH64_DECODE_FAULT; + } + + insn &= ~(GENMASK(4, 0) << shift); + insn |= reg << shift; + + return insn; +} + +u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, + enum aarch64_insn_branch_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_BRANCH_NOLINK: + insn = aarch64_insn_get_br_value(); + break; + case AARCH64_INSN_BRANCH_LINK: + insn = aarch64_insn_get_blr_value(); + break; + case AARCH64_INSN_BRANCH_RETURN: + insn = aarch64_insn_get_ret_value(); + break; + default: + WARN("%s: unknown branch encoding %d\n", __func__, type); + return AARCH64_DECODE_FAULT; + } + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, reg); +} + +/* + * Decode the imm field of a branch, and return the byte offset as a + * signed value (so it can be used when computing a new branch + * target). 
+ */ +s32 aarch64_get_branch_offset(u32 insn) +{ + s32 imm; + + if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn); + return (imm << 6) >> 4; + } + + if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn); + return (imm << 13) >> 11; + } + + if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn); + return (imm << 18) >> 16; + } + + WARN("Unhandled instruction %x", insn); + return AARCH64_DECODE_FAULT; +} + +s32 aarch64_insn_adrp_get_offset(u32 insn) +{ + if (!aarch64_insn_is_adrp(insn)) { + WARN("Unhandled instruction %x", insn); + return AARCH64_DECODE_FAULT; + } + return aarch64_insn_decode_immediate(AARCH64_INSN_IMM_ADR, insn) << 12; +} diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h index 62c54ffbeeaac..bf494bd6e191a 100644 --- a/tools/include/linux/objtool.h +++ b/tools/include/linux/objtool.h @@ -45,7 +45,9 @@ struct unwind_hint { #ifdef CONFIG_OBJTOOL +#ifndef CONFIG_ARM64 #include +#endif #ifndef __ASSEMBLY__ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a17688011440e..eac87609ca170 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5301,7 +5301,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5311,7 +5311,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5321,7 +5321,7 @@ union bpf_attr { * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr or if *flags* is not 0. * - * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description * Get a pointer to the underlying dynptr data. 
* @@ -5422,7 +5422,7 @@ union bpf_attr { * Drain samples from the specified user ring buffer, and invoke * the provided callback for each such sample: * - * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); * * If **callback_fn** returns 0, the helper will continue to try * and drain the next sample, up to a maximum of diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index a3a9cc24e0e37..9772ac5993ed5 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -45,6 +45,10 @@ ifeq ($(SRCARCH),x86) BUILD_ORC := y endif +ifeq ($(SRCARCH),arm64) + CFLAGS += -Wno-nested-externs +endif + export BUILD_ORC export srctree OUTPUT CFLAGS SRCARCH AWK include $(srctree)/tools/build/Makefile.include @@ -58,7 +62,7 @@ $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) $(LIBSUBCMD): fixdep FORCE - $(Q)$(MAKE) -C $(SUBCMD_SRCDIR) OUTPUT=$(LIBSUBCMD_OUTPUT) + $(Q)$(MAKE) -C $(SUBCMD_SRCDIR) OUTPUT=$(LIBSUBCMD_OUTPUT) AR=$(AR) CC=$(CC) LD=$(LD) clean: $(call QUIET_CLEAN, objtool) $(RM) $(OBJTOOL) diff --git a/tools/objtool/arch/arm64/Build b/tools/objtool/arch/arm64/Build new file mode 100644 index 0000000000000..f3de3a50d5411 --- /dev/null +++ b/tools/objtool/arch/arm64/Build @@ -0,0 +1,8 @@ +objtool-y += special.o +objtool-y += decode.o + +objtool-y += libhweight.o + +$(OUTPUT)arch/arm64/libhweight.o: ../lib/hweight.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c new file mode 100644 index 0000000000000..c585bef3b75a1 --- /dev/null +++ b/tools/objtool/arch/arm64/decode.c @@ -0,0 +1,465 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../../../arch/arm64/lib/insn.c" + +#define is_SP(reg) (reg == AARCH64_INSN_REG_SP) +#define is_FP(reg) (reg == AARCH64_INSN_REG_FP) +#define is_SPFP(reg) (reg == AARCH64_INSN_REG_SP || reg == AARCH64_INSN_REG_FP) + +#define ADD_OP(op) \ + if (!(op = calloc(1, sizeof(*op)))) \ + return -1; \ + else for (list_add_tail(&op->list, ops_list); op; op = NULL) + +static unsigned long sign_extend(unsigned long x, int nbits) +{ + unsigned long sign_bit = (x >> (nbits - 1)) & 1; + + return ((~0UL + (sign_bit ^ 1)) << nbits) | x; +} + +struct insn_loc { + const struct section *sec; + unsigned long offset; + struct hlist_node hnode; +}; + +DEFINE_HASHTABLE(invalid_insns, 16); + +static int record_invalid_insn(const struct section *sec, + unsigned long offset) +{ + struct insn_loc *loc; + struct hlist_head *l; + + l = &invalid_insns[hash_min(offset, HASH_BITS(invalid_insns))]; + if (!hlist_empty(l)) { + loc = hlist_entry(l->first, struct insn_loc, hnode); + return 0; + } + + loc = malloc(sizeof(*loc)); + if (!loc) { + WARN("malloc failed"); + return -1; + } + + loc->sec = sec; + loc->offset = offset; + + hash_add(invalid_insns, &loc->hnode, loc->offset); + + return 0; +} + +int arch_post_process_instructions(struct objtool_file *file) +{ + struct hlist_node *tmp; + struct insn_loc *loc; + unsigned int bkt; + int res = 0; + + hash_for_each_safe(invalid_insns, bkt, tmp, loc, hnode) { + struct instruction *insn; + + insn = find_insn(file, (struct section *) loc->sec, loc->offset); + if (insn) { + list_del(&insn->list); + hash_del(&insn->hash); + free(insn); + } + + hash_del(&loc->hnode); + free(loc); + } + + return res; +} + +bool arch_callee_saved_reg(unsigned char reg) +{ + 
switch (reg) { + case AARCH64_INSN_REG_19: + case AARCH64_INSN_REG_20: + case AARCH64_INSN_REG_21: + case AARCH64_INSN_REG_22: + case AARCH64_INSN_REG_23: + case AARCH64_INSN_REG_24: + case AARCH64_INSN_REG_25: + case AARCH64_INSN_REG_26: + case AARCH64_INSN_REG_27: + case AARCH64_INSN_REG_28: + case AARCH64_INSN_REG_FP: + case AARCH64_INSN_REG_LR: + return true; + default: + return false; + } +} + +void arch_initial_func_cfi_state(struct cfi_init_state *state) +{ + int i; + + for (i = 0; i < CFI_NUM_REGS; i++) { + state->regs[i].base = CFI_UNDEFINED; + state->regs[i].offset = 0; + } + + /* initial CFA (call frame address) */ + state->cfa.base = CFI_SP; + state->cfa.offset = 0; +} + +unsigned long arch_dest_reloc_offset(int addend) +{ + return addend; +} + +unsigned long arch_jump_destination(struct instruction *insn) +{ + return insn->offset + insn->immediate; +} + +const char *arch_nop_insn(int len) +{ + static u32 nop; + + if (len != AARCH64_INSN_SIZE) + WARN("invalid NOP size: %d\n", len); + + if (!nop) + nop = aarch64_insn_gen_nop(); + + return (const char *)&nop; +} + +const char *arch_ret_insn(int len) +{ + static u32 ret; + + if (len != AARCH64_INSN_SIZE) + WARN("invalid RET size: %d\n", len); + + if (!ret) { + ret = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_LR, + AARCH64_INSN_BRANCH_RETURN); + } + + return (const char *)&ret; +} + +static int is_arm64(const struct elf *elf) +{ + switch (elf->ehdr.e_machine) { + case EM_AARCH64: //0xB7 + return 1; + default: + WARN("unexpected ELF machine type %x", + elf->ehdr.e_machine); + return 0; + } +} + +int arch_decode_hint_reg(u8 sp_reg, int *base) +{ + switch (sp_reg) { + case UNWIND_HINT_REG_UNDEFINED: + *base = CFI_UNDEFINED; + break; + case UNWIND_HINT_REG_SP: + *base = CFI_SP; + break; + default: + return -1; + } + + return 0; +} + +static inline void make_add_op(enum aarch64_insn_register dest, + enum aarch64_insn_register src, + int val, struct stack_op *op) +{ + op->dest.type = OP_DEST_REG; + op->dest.reg = dest; + op->src.reg = src; + op->src.type = val != 0 ? 
OP_SRC_ADD : OP_SRC_REG; + op->src.offset = val; +} + +static inline void make_store_op(enum aarch64_insn_register base, + enum aarch64_insn_register reg, + int offset, struct stack_op *op) +{ + op->dest.type = OP_DEST_REG_INDIRECT; + op->dest.reg = base; + op->dest.offset = offset; + op->src.type = OP_SRC_REG; + op->src.reg = reg; + op->src.offset = 0; +} + +static inline void make_load_op(enum aarch64_insn_register base, + enum aarch64_insn_register reg, + int offset, struct stack_op *op) +{ + op->dest.type = OP_DEST_REG; + op->dest.reg = reg; + op->dest.offset = 0; + op->src.type = OP_SRC_REG_INDIRECT; + op->src.reg = base; + op->src.offset = offset; +} + +static inline bool aarch64_insn_is_ldst_pre(u32 insn) +{ + return aarch64_insn_is_store_pre(insn) || + aarch64_insn_is_load_pre(insn) || + aarch64_insn_is_stp_pre(insn) || + aarch64_insn_is_ldp_pre(insn); +} + +static inline bool aarch64_insn_is_ldst_post(u32 insn) +{ + return aarch64_insn_is_store_post(insn) || + aarch64_insn_is_load_post(insn) || + aarch64_insn_is_stp_post(insn) || + aarch64_insn_is_ldp_post(insn); +} + +static int decode_load_store(u32 insn, unsigned long *immediate, + struct list_head *ops_list) +{ + enum aarch64_insn_register base; + enum aarch64_insn_register rt; + struct stack_op *op; + int size; + int offset; + + if (aarch64_insn_is_store_single(insn) || + aarch64_insn_is_load_single(insn)) + size = 1 << ((insn & GENMASK(31, 30)) >> 30); + else + size = 4 << ((insn >> 31) & 1); + + if (aarch64_insn_is_store_pair(insn) || + aarch64_insn_is_load_pair(insn)) + *immediate = size * sign_extend(aarch64_insn_decode_immediate(AARCH64_INSN_IMM_7, + insn), 7); + else if (aarch64_insn_is_store_imm(insn) || + aarch64_insn_is_load_imm(insn)) + *immediate = size * aarch64_insn_decode_immediate(AARCH64_INSN_IMM_12, insn); + else /* load/store_pre/post */ + *immediate = sign_extend(aarch64_insn_decode_immediate(AARCH64_INSN_IMM_9, + insn), 9); + + base = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn); + if (!is_SPFP(base)) + return 0; + + if (aarch64_insn_is_ldst_post(insn)) + offset = 0; + else + offset = *immediate; + + /* First register */ + rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn); + ADD_OP(op) { + if (aarch64_insn_is_store_single(insn) || + aarch64_insn_is_store_pair(insn)) + make_store_op(base, rt, offset, op); + else + make_load_op(base, rt, offset, op); + } + + /* Second register (if present) */ + if (aarch64_insn_is_store_pair(insn) || + aarch64_insn_is_load_pair(insn)) { + rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT2, + insn); + ADD_OP(op) { + if (aarch64_insn_is_store_pair(insn)) + make_store_op(base, rt, offset + size, op); + else + make_load_op(base, rt, offset + size, op); + } + } + + if (aarch64_insn_is_ldst_pre(insn) || + aarch64_insn_is_ldst_post(insn)) { + ADD_OP(op) { + make_add_op(base, base, *immediate, op); + } + } + + return 0; +} + +static void decode_add_sub_imm(u32 instr, bool set_flags, + unsigned long *immediate, + struct stack_op *op) +{ + u32 rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, instr); + u32 rn = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, instr); + + *immediate = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_12, instr); + + if (instr & AARCH64_INSN_LSL_12) + *immediate <<= 12; + + if ((!set_flags && is_SP(rd)) || is_FP(rd) + || is_SPFP(rn)) { + int value; + + if (aarch64_insn_is_subs_imm(instr) || aarch64_insn_is_sub_imm(instr)) + value = -*immediate; + else + value = *immediate; + + make_add_op(rd, rn, value, 
op); + } +} + +int arch_decode_instruction(struct objtool_file *file, const struct section *sec, + unsigned long offset, unsigned int maxlen, + unsigned int *len, enum insn_type *type, + unsigned long *immediate, + struct list_head *ops_list) +{ + const struct elf *elf = file->elf; + struct stack_op *op = NULL; + u32 insn; + + if (!is_arm64(elf)) + return -1; + + if (maxlen < AARCH64_INSN_SIZE) + return 0; + + *len = AARCH64_INSN_SIZE; + *immediate = 0; + *type = INSN_OTHER; + + insn = *(u32 *)(sec->data->d_buf + offset); + + switch (aarch64_get_insn_class(insn)) { + case AARCH64_INSN_CLS_UNKNOWN: + if (insn == 0x0) { + *type = INSN_NOP; + } else { + WARN("undecoded insn at %s:0x%lx", sec->name, offset); + return record_invalid_insn(sec, offset); + } + + break; + case AARCH64_INSN_CLS_DP_IMM: + /* Mov register to and from SP are aliases of add_imm */ + if (aarch64_insn_is_add_imm(insn) || + aarch64_insn_is_sub_imm(insn)) { + ADD_OP(op) { + decode_add_sub_imm(insn, false, immediate, op); + } + } + else if (aarch64_insn_is_adds_imm(insn) || + aarch64_insn_is_subs_imm(insn)) { + ADD_OP(op) { + decode_add_sub_imm(insn, true, immediate, op); + } + } + break; + case AARCH64_INSN_CLS_DP_REG: + if (aarch64_insn_is_mov_reg(insn)) { + enum aarch64_insn_register rd; + enum aarch64_insn_register rm; + + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + rm = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RM, insn); + if (is_FP(rd) || is_FP(rm)) { + ADD_OP(op) { + make_add_op(rd, rm, 0, op); + } + } + } + break; + case AARCH64_INSN_CLS_BR_SYS: + if (aarch64_insn_is_ret(insn) && + aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn) + == AARCH64_INSN_REG_LR) { + *type = INSN_RETURN; + } else if (aarch64_insn_is_bl(insn)) { + *type = INSN_CALL; + *immediate = aarch64_get_branch_offset(insn); + } else if (aarch64_insn_is_blr(insn)) { + *type = INSN_CALL_DYNAMIC; + } else if (aarch64_insn_is_b(insn)) { + *type = INSN_JUMP_UNCONDITIONAL; + *immediate = aarch64_get_branch_offset(insn); + } else if (aarch64_insn_is_br(insn)) { + *type = INSN_JUMP_DYNAMIC; + } else if (aarch64_insn_is_branch_imm(insn)) { + /* Remaining branch opcodes are conditional */ + *type = INSN_JUMP_CONDITIONAL; + *immediate = aarch64_get_branch_offset(insn); + } else if (aarch64_insn_is_eret(insn)) { + *type = INSN_CONTEXT_SWITCH; + } else if (aarch64_insn_is_hint(insn) || + aarch64_insn_is_barrier(insn)) { + *type = INSN_NOP; + } else if (aarch64_insn_is_brk(insn)) { + *type = INSN_BUG; + *immediate = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_16, insn); + } + break; + case AARCH64_INSN_CLS_LDST: + { + int ret; + + ret = decode_load_store(insn, immediate, ops_list); + if (ret <= 0) + return ret; + + /* + * For LDR ops, assembler can generate the data to be + * loaded in the code section + * Record and remove these data because they + * are never excuted + */ + if (aarch64_insn_is_ldr_lit(insn)) { + long pc_offset; + + pc_offset = insn & GENMASK(23, 5); + /* Sign extend and multiply by 4 */ + pc_offset = (pc_offset << (64 - 23)); + pc_offset = ((pc_offset >> (64 - 23)) >> 5) << 2; + + ret = record_invalid_insn(sec, offset + pc_offset); + + /* 64-bit literal */ + if (insn & BIT(30)) + ret = record_invalid_insn(sec, offset + pc_offset + 4); + + return ret; + } + break; + } + default: + break; + } + + return 0; +} diff --git a/tools/objtool/arch/arm64/include/arch/cfi_regs.h b/tools/objtool/arch/arm64/include/arch/cfi_regs.h new file mode 100644 index 0000000000000..a5185649686b7 --- /dev/null +++ 
b/tools/objtool/arch/arm64/include/arch/cfi_regs.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _OBJTOOL_CFI_REGS_H +#define _OBJTOOL_CFI_REGS_H + +#include + +#define CFI_BP AARCH64_INSN_REG_FP +#define CFI_RA AARCH64_INSN_REG_LR +#define CFI_SP AARCH64_INSN_REG_SP + +#define CFI_NUM_REGS 32 + +#endif /* _OBJTOOL_CFI_REGS_H */ diff --git a/tools/objtool/arch/arm64/include/arch/elf.h b/tools/objtool/arch/arm64/include/arch/elf.h new file mode 100644 index 0000000000000..a59888a906b5d --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/elf.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */ + +#ifndef _OBJTOOL_ARCH_ELF +#define _OBJTOOL_ARCH_ELF + +#define R_NTYPE -1 +#define R_NONE R_AARCH64_NONE +#define R_ABS64 R_AARCH64_ABS64 +#define R_REL32 R_AARCH64_PREL32 +#define R_PLT32 R_NTYPE + +#endif /* _OBJTOOL_ARCH_ELF */ diff --git a/tools/objtool/arch/arm64/include/arch/endianness.h b/tools/objtool/arch/arm64/include/arch/endianness.h new file mode 100644 index 0000000000000..7c362527da205 --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/endianness.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ARCH_ENDIANNESS_H +#define _ARCH_ENDIANNESS_H + +#include + +#define __TARGET_BYTE_ORDER __LITTLE_ENDIAN + +#endif /* _ARCH_ENDIANNESS_H */ diff --git a/tools/objtool/arch/arm64/include/arch/special.h b/tools/objtool/arch/arm64/include/arch/special.h new file mode 100644 index 0000000000000..63a705c622a4b --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/special.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _ARM64_ARCH_SPECIAL_H +#define _ARM64_ARCH_SPECIAL_H + +#define EX_ENTRY_SIZE 12 +#define EX_ORIG_OFFSET 0 +#define EX_NEW_OFFSET 4 + +#define JUMP_ENTRY_SIZE 16 +#define JUMP_ORIG_OFFSET 0 +#define JUMP_NEW_OFFSET 4 +#define JUMP_KEY_OFFSET 8 + +#define ALT_ENTRY_SIZE 12 +#define ALT_ORIG_OFFSET 0 +#define ALT_NEW_OFFSET 4 +#define ALT_FEATURE_OFFSET 8 +#define ALT_ORIG_LEN_OFFSET 10 +#define ALT_NEW_LEN_OFFSET 11 + +#endif /* _ARM64_ARCH_SPECIAL_H */ diff --git a/tools/objtool/arch/arm64/special.c b/tools/objtool/arch/arm64/special.c new file mode 100644 index 0000000000000..8bb1ebd2132a3 --- /dev/null +++ b/tools/objtool/arch/arm64/special.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +void arch_handle_alternative(unsigned short feature, struct special_alt *alt) +{ + /* + * ARM64_CB_PATCH has no alternative instruction. + * a callback is called at alternative replacement time + * to dynamically change the original instructions. + * + * ARM64_CB_PATCH is the last ARM64 feature, it's value changes + * every time a new feature is added. 
So the orig/alt region + * length are used to detect those alternatives + */ + if (alt->orig_len && !alt->new_len) + alt->skip_alt = true; +} + +bool arch_support_alt_relocation(struct special_alt *special_alt, + struct instruction *insn, + struct reloc *reloc) +{ + u32 opcode = *(u32 *)(insn->sec->data->d_buf + insn->offset); + + return aarch64_insn_is_branch_imm(opcode) || + aarch64_insn_is_adrp(opcode) || + !aarch64_insn_uses_literal(opcode); +} + + +struct reloc *arch_find_switch_table(struct objtool_file *file, + struct instruction *insn) +{ + return NULL; +} diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 1ed49ab4e871f..8548ef5867e6b 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -699,6 +699,11 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec return 0; } +int arch_post_process_instructions(struct objtool_file *file) +{ + return 0; +} + void arch_initial_func_cfi_state(struct cfi_init_state *state) { int i; diff --git a/tools/objtool/arch/x86/include/arch/elf.h b/tools/objtool/arch/x86/include/arch/elf.h index 69cc4264b28a8..7b737fcfcb9c9 100644 --- a/tools/objtool/arch/x86/include/arch/elf.h +++ b/tools/objtool/arch/x86/include/arch/elf.h @@ -1,6 +1,9 @@ #ifndef _OBJTOOL_ARCH_ELF #define _OBJTOOL_ARCH_ELF -#define R_NONE R_X86_64_NONE +#define R_NONE R_X86_64_NONE +#define R_ABS64 R_X86_64_64 +#define R_REL32 R_X86_64_PC32 +#define R_PLT32 R_X86_64_PLT32 #endif /* _OBJTOOL_ARCH_ELF */ diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c index 7c97b73912799..fbe0745a9ed74 100644 --- a/tools/objtool/arch/x86/special.c +++ b/tools/objtool/arch/x86/special.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include #include #include @@ -108,7 +109,7 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, table_offset = text_reloc->addend; table_sec = text_reloc->sym->sec; - if (text_reloc->type == R_X86_64_PC32) + if (text_reloc->type == R_REL32) table_offset += 4; /* @@ -138,7 +139,7 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, * indicates a rare GCC quirk/bug which can leave dead * code behind. 
*/ - if (text_reloc->type == R_X86_64_PC32) + if (text_reloc->type == R_REL32) file->ignore_unreachables = true; return rodata_reloc; diff --git a/tools/objtool/check.c b/tools/objtool/check.c index cb363b507a329..6eb1ce881533d 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -363,7 +363,7 @@ static int decode_instructions(struct objtool_file *file) { struct section *sec; struct symbol *func; - unsigned long offset; + unsigned long offset, next_offset; struct instruction *insn; int ret; @@ -382,7 +382,15 @@ static int decode_instructions(struct objtool_file *file) !strncmp(sec->name, ".text..__x86.", 13)) sec->noinstr = true; - for (offset = 0; offset < sec->sh.sh_size; offset += insn->len) { + for (offset = 0; offset < sec->sh.sh_size; offset = next_offset) { + struct symbol *obj_sym = find_object_containing(sec, offset); + + if (obj_sym) { + /* This is data in the middle of text section, skip it */ + next_offset = obj_sym->offset + obj_sym->len; + continue; + } + insn = malloc(sizeof(*insn)); if (!insn) { WARN("malloc failed"); @@ -415,6 +423,8 @@ static int decode_instructions(struct objtool_file *file) hash_add(file->insn_hash, &insn->hash, sec_offset_hash(sec, insn->offset)); list_add_tail(&insn->list, &file->insn_list); nr_insns++; + + next_offset = offset + insn->len; } list_for_each_entry(func, &sec->symbol_list, list) { @@ -444,6 +454,9 @@ static int decode_instructions(struct objtool_file *file) if (opts.stats) printf("nr_insns: %lu\n", nr_insns); + if (arch_post_process_instructions(file)) + return -1; + return 0; err: @@ -655,7 +668,7 @@ static int create_static_call_sections(struct objtool_file *file) /* populate reloc for 'addr' */ if (elf_add_reloc_to_insn(file->elf, sec, idx * sizeof(struct static_call_site), - R_X86_64_PC32, + R_REL32, insn->sec, insn->offset)) return -1; @@ -698,7 +711,7 @@ static int create_static_call_sections(struct objtool_file *file) /* populate reloc for 'key' */ if (elf_add_reloc(file->elf, sec, idx * sizeof(struct static_call_site) + 4, - R_X86_64_PC32, key_sym, + R_REL32, key_sym, is_sibling_call(insn) * STATIC_CALL_SITE_TAIL)) return -1; @@ -742,7 +755,7 @@ static int create_retpoline_sites_sections(struct objtool_file *file) if (elf_add_reloc_to_insn(file->elf, sec, idx * sizeof(int), - R_X86_64_PC32, + R_REL32, insn->sec, insn->offset)) { WARN("elf_add_reloc_to_insn: .retpoline_sites"); return -1; @@ -788,7 +801,7 @@ static int create_return_sites_sections(struct objtool_file *file) if (elf_add_reloc_to_insn(file->elf, sec, idx * sizeof(int), - R_X86_64_PC32, + R_REL32, insn->sec, insn->offset)) { WARN("elf_add_reloc_to_insn: .return_sites"); return -1; @@ -840,7 +853,7 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) if (elf_add_reloc_to_insn(file->elf, sec, idx * sizeof(int), - R_X86_64_PC32, + R_REL32, insn->sec, insn->offset)) { WARN("elf_add_reloc_to_insn: .ibt_endbr_seal"); return -1; @@ -885,7 +898,7 @@ static int create_mcount_loc_sections(struct objtool_file *file) if (elf_add_reloc_to_insn(file->elf, sec, idx * sizeof(unsigned long), - R_X86_64_64, + R_ABS64, insn->sec, insn->offset)) return -1; @@ -1815,6 +1828,9 @@ static int add_special_section_alts(struct objtool_file *file) continue; } + if (special_alt->skip_alt && !special_alt->new_len) + continue; + ret = handle_group_alt(file, special_alt, orig_insn, &new_insn); if (ret) @@ -4051,7 +4067,7 @@ static int validate_ibt_insn(struct objtool_file *file, struct instruction *insn continue; off = reloc->sym->offset; - if (reloc->type == 
R_X86_64_PC32 || reloc->type == R_X86_64_PLT32) + if (reloc->type == R_REL32 || reloc->type == R_PLT32) off += arch_dest_reloc_offset(reloc->addend); else off += reloc->addend; diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 7e24b09b1163a..0f76525d6f8e6 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -236,6 +236,20 @@ struct symbol *find_func_containing(struct section *sec, unsigned long offset) return NULL; } +struct symbol *find_object_containing(const struct section *sec, unsigned long offset) +{ + struct rb_node *node; + + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { + struct symbol *s = rb_entry(node, struct symbol, node); + + if (s->type == STT_OBJECT) + return s; + } + + return NULL; +} + struct symbol *find_symbol_by_name(const struct elf *elf, const char *name) { struct symbol *sym; diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 861c0c60ac81e..bc305cbe048a9 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -77,6 +77,8 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec unsigned long *immediate, struct list_head *ops_list); +int arch_post_process_instructions(struct objtool_file *file); + bool arch_callee_saved_reg(unsigned char reg); unsigned long arch_jump_destination(struct instruction *insn); diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index 5d4a841fbd311..4df6f1dcbc64e 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -169,6 +169,7 @@ struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, uns struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len); struct symbol *find_func_containing(struct section *sec, unsigned long offset); +struct symbol *find_object_containing(const struct section *sec, unsigned long offset); #define for_each_sec(file, sec) \ list_for_each_entry(sec, &file->elf->sections, list) diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index cda649644e32d..a7ecc32e35125 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -146,5 +146,7 @@ int main(int argc, const char **argv) exec_cmd_init("objtool", UNUSED, UNUSED, UNUSED); pager_init(UNUSED); - return objtool_run(argc, argv); + objtool_run(argc, argv); + + return 0; } diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index dd3c64af9db23..ca7bba45a46e5 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -101,7 +102,7 @@ static int write_orc_entry(struct elf *elf, struct section *orc_sec, orc->bp_offset = bswap_if_needed(orc->bp_offset); /* populate reloc for ip */ - if (elf_add_reloc_to_insn(elf, ip_sec, idx * sizeof(int), R_X86_64_PC32, + if (elf_add_reloc_to_insn(elf, ip_sec, idx * sizeof(int), R_REL32, insn_sec, insn_off)) return -1; diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3b57fbf8fff4a..31c425a4fff30 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -519,7 +519,7 @@ TRUNNER_BPF_PROGS_DIR := progs TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ network_helpers.c testing_helpers.c \ btf_helpers.c flow_dissector_load.h \ - cap_helpers.c + cap_helpers.c test_loader.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read 
$(OUTPUT)/bpf_testmod.ko \ $(OUTPUT)/liburandom_read.so \ $(OUTPUT)/xdp_synproxy \ diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index 8fc4e6c02bfda..7faaf6d9e0d47 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -5,86 +5,16 @@ #include "dynptr_fail.skel.h" #include "dynptr_success.skel.h" -static size_t log_buf_sz = 1048576; /* 1 MB */ -static char obj_log_buf[1048576]; - static struct { const char *prog_name; const char *expected_err_msg; } dynptr_tests[] = { - /* failure cases */ - {"ringbuf_missing_release1", "Unreleased reference id=1"}, - {"ringbuf_missing_release2", "Unreleased reference id=2"}, - {"ringbuf_missing_release_callback", "Unreleased reference id"}, - {"use_after_invalid", "Expected an initialized dynptr as arg #3"}, - {"ringbuf_invalid_api", "type=mem expected=alloc_mem"}, - {"add_dynptr_to_map1", "invalid indirect read from stack"}, - {"add_dynptr_to_map2", "invalid indirect read from stack"}, - {"data_slice_out_of_bounds_ringbuf", "value is outside of the allowed memory range"}, - {"data_slice_out_of_bounds_map_value", "value is outside of the allowed memory range"}, - {"data_slice_use_after_release1", "invalid mem access 'scalar'"}, - {"data_slice_use_after_release2", "invalid mem access 'scalar'"}, - {"data_slice_missing_null_check1", "invalid mem access 'mem_or_null'"}, - {"data_slice_missing_null_check2", "invalid mem access 'mem_or_null'"}, - {"invalid_helper1", "invalid indirect read from stack"}, - {"invalid_helper2", "Expected an initialized dynptr as arg #3"}, - {"invalid_write1", "Expected an initialized dynptr as arg #1"}, - {"invalid_write2", "Expected an initialized dynptr as arg #3"}, - {"invalid_write3", "Expected an initialized dynptr as arg #1"}, - {"invalid_write4", "arg 1 is an unacquired reference"}, - {"invalid_read1", "invalid read from stack"}, - {"invalid_read2", "cannot pass in dynptr at an offset"}, - {"invalid_read3", "invalid read from stack"}, - {"invalid_read4", "invalid read from stack"}, - {"invalid_offset", "invalid write to stack"}, - {"global", "type=map_value expected=fp"}, - {"release_twice", "arg 1 is an unacquired reference"}, - {"release_twice_callback", "arg 1 is an unacquired reference"}, - {"dynptr_from_mem_invalid_api", - "Unsupported reg type fp for bpf_dynptr_from_mem data"}, - /* success cases */ {"test_read_write", NULL}, {"test_data_slice", NULL}, {"test_ringbuf", NULL}, }; -static void verify_fail(const char *prog_name, const char *expected_err_msg) -{ - LIBBPF_OPTS(bpf_object_open_opts, opts); - struct bpf_program *prog; - struct dynptr_fail *skel; - int err; - - opts.kernel_log_buf = obj_log_buf; - opts.kernel_log_size = log_buf_sz; - opts.kernel_log_level = 1; - - skel = dynptr_fail__open_opts(&opts); - if (!ASSERT_OK_PTR(skel, "dynptr_fail__open_opts")) - goto cleanup; - - prog = bpf_object__find_program_by_name(skel->obj, prog_name); - if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) - goto cleanup; - - bpf_program__set_autoload(prog, true); - - bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize()); - - err = dynptr_fail__load(skel); - if (!ASSERT_ERR(err, "unexpected load success")) - goto cleanup; - - if (!ASSERT_OK_PTR(strstr(obj_log_buf, expected_err_msg), "expected_err_msg")) { - fprintf(stderr, "Expected err_msg: %s\n", expected_err_msg); - fprintf(stderr, "Verifier output: %s\n", obj_log_buf); - } - -cleanup: - dynptr_fail__destroy(skel); -} - static void 
verify_success(const char *prog_name) { struct dynptr_success *skel; @@ -97,8 +27,6 @@ static void verify_success(const char *prog_name) skel->bss->pid = getpid(); - bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize()); - dynptr_success__load(skel); if (!ASSERT_OK_PTR(skel, "dynptr_success__load")) goto cleanup; @@ -129,10 +57,8 @@ void test_dynptr(void) if (!test__start_subtest(dynptr_tests[i].prog_name)) continue; - if (dynptr_tests[i].expected_err_msg) - verify_fail(dynptr_tests[i].prog_name, - dynptr_tests[i].expected_err_msg); - else - verify_success(dynptr_tests[i].prog_name); + verify_success(dynptr_tests[i].prog_name); } + + RUN_TESTS(dynptr_fail); } diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c index c210657d4d0aa..612907d1fbeab 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c @@ -18,11 +18,8 @@ static struct { const char *expected_verifier_err_msg; int expected_runtime_err; } kfunc_dynptr_tests[] = { - {"dynptr_type_not_supp", - "arg#0 pointer type STRUCT bpf_dynptr_kern points to unsupported dynamic pointer type", 0}, - {"not_valid_dynptr", - "arg#0 pointer type STRUCT bpf_dynptr_kern must be valid and initialized", 0}, - {"not_ptr_to_stack", "arg#0 pointer type STRUCT bpf_dynptr_kern not to stack", 0}, + {"not_valid_dynptr", "cannot pass in dynptr at an offset=-8", 0}, + {"not_ptr_to_stack", "arg#0 pointer type STRUCT bpf_dynptr_kern not to stack or dynptr_ptr", 0}, {"dynptr_data_null", NULL, -EBADMSG}, }; diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c index 0d66b15242089..3533a4ecad018 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c @@ -5,83 +5,6 @@ #include "map_kptr.skel.h" #include "map_kptr_fail.skel.h" -static char log_buf[1024 * 1024]; - -struct { - const char *prog_name; - const char *err_msg; -} map_kptr_fail_tests[] = { - { "size_not_bpf_dw", "kptr access size must be BPF_DW" }, - { "non_const_var_off", "kptr access cannot have variable offset" }, - { "non_const_var_off_kptr_xchg", "R1 doesn't have constant offset. 
kptr has to be" }, - { "misaligned_access_write", "kptr access misaligned expected=8 off=7" }, - { "misaligned_access_read", "kptr access misaligned expected=8 off=1" }, - { "reject_var_off_store", "variable untrusted_ptr_ access var_off=(0x0; 0x1e0)" }, - { "reject_bad_type_match", "invalid kptr access, R1 type=untrusted_ptr_prog_test_ref_kfunc" }, - { "marked_as_untrusted_or_null", "R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_" }, - { "correct_btf_id_check_size", "access beyond struct prog_test_ref_kfunc at off 32 size 4" }, - { "inherit_untrusted_on_walk", "R1 type=untrusted_ptr_ expected=percpu_ptr_" }, - { "reject_kptr_xchg_on_unref", "off=8 kptr isn't referenced kptr" }, - { "reject_kptr_get_no_map_val", "arg#0 expected pointer to map value" }, - { "reject_kptr_get_no_null_map_val", "arg#0 expected pointer to map value" }, - { "reject_kptr_get_no_kptr", "arg#0 no referenced kptr at map value offset=0" }, - { "reject_kptr_get_on_unref", "arg#0 no referenced kptr at map value offset=8" }, - { "reject_kptr_get_bad_type_match", "kernel function bpf_kfunc_call_test_kptr_get args#0" }, - { "mark_ref_as_untrusted_or_null", "R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_" }, - { "reject_untrusted_store_to_ref", "store to referenced kptr disallowed" }, - { "reject_bad_type_xchg", "invalid kptr access, R2 type=ptr_prog_test_ref_kfunc expected=ptr_prog_test_member" }, - { "reject_untrusted_xchg", "R2 type=untrusted_ptr_ expected=ptr_" }, - { "reject_member_of_ref_xchg", "invalid kptr access, R2 type=ptr_prog_test_ref_kfunc" }, - { "reject_indirect_helper_access", "kptr cannot be accessed indirectly by helper" }, - { "reject_indirect_global_func_access", "kptr cannot be accessed indirectly by helper" }, - { "kptr_xchg_ref_state", "Unreleased reference id=5 alloc_insn=" }, - { "kptr_get_ref_state", "Unreleased reference id=3 alloc_insn=" }, -}; - -static void test_map_kptr_fail_prog(const char *prog_name, const char *err_msg) -{ - LIBBPF_OPTS(bpf_object_open_opts, opts, .kernel_log_buf = log_buf, - .kernel_log_size = sizeof(log_buf), - .kernel_log_level = 1); - struct map_kptr_fail *skel; - struct bpf_program *prog; - int ret; - - skel = map_kptr_fail__open_opts(&opts); - if (!ASSERT_OK_PTR(skel, "map_kptr_fail__open_opts")) - return; - - prog = bpf_object__find_program_by_name(skel->obj, prog_name); - if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) - goto end; - - bpf_program__set_autoload(prog, true); - - ret = map_kptr_fail__load(skel); - if (!ASSERT_ERR(ret, "map_kptr__load must fail")) - goto end; - - if (!ASSERT_OK_PTR(strstr(log_buf, err_msg), "expected error message")) { - fprintf(stderr, "Expected: %s\n", err_msg); - fprintf(stderr, "Verifier: %s\n", log_buf); - } - -end: - map_kptr_fail__destroy(skel); -} - -static void test_map_kptr_fail(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(map_kptr_fail_tests); i++) { - if (!test__start_subtest(map_kptr_fail_tests[i].prog_name)) - continue; - test_map_kptr_fail_prog(map_kptr_fail_tests[i].prog_name, - map_kptr_fail_tests[i].err_msg); - } -} - static void test_map_kptr_success(bool test_run) { LIBBPF_OPTS(bpf_test_run_opts, opts, @@ -145,5 +68,6 @@ void test_map_kptr(void) */ test_map_kptr_success(true); } - test_map_kptr_fail(); + + RUN_TESTS(map_kptr_fail); } diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c index 02b18d018b36a..aefa0a474e582 100644 --- a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c +++ 
b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c @@ -673,8 +673,8 @@ static struct { {"user_ringbuf_callback_write_forbidden", "invalid mem access 'dynptr_ptr'"}, {"user_ringbuf_callback_null_context_write", "invalid mem access 'scalar'"}, {"user_ringbuf_callback_null_context_read", "invalid mem access 'scalar'"}, - {"user_ringbuf_callback_discard_dynptr", "arg 1 is an unacquired reference"}, - {"user_ringbuf_callback_submit_dynptr", "arg 1 is an unacquired reference"}, + {"user_ringbuf_callback_discard_dynptr", "cannot release unowned const bpf_dynptr"}, + {"user_ringbuf_callback_submit_dynptr", "cannot release unowned const bpf_dynptr"}, {"user_ringbuf_callback_invalid_return", "At callback return the register R0 has value"}, }; diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 5bb11fe595a43..2d7b89b447b27 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -2,6 +2,18 @@ #ifndef __BPF_MISC_H__ #define __BPF_MISC_H__ +#define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" msg))) +#define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) +#define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) +#define __log_level(lvl) __attribute__((btf_decl_tag("comment:test_log_level="#lvl))) + +/* Convenience macro for use with 'asm volatile' blocks */ +#define __naked __attribute__((naked)) +#define __clobber_all "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "memory" +#define __clobber_common "r0", "r1", "r2", "r3", "r4", "r5", "memory" +#define __imm(name) [name]"i"(name) +#define __imm_addr(name) [name]"i"(&name) + #if defined(__TARGET_ARCH_x86) #define SYSCALL_WRAPPER 1 #define SYS_PREFIX "__x64_" diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index b0f08ff024fb8..eb359c95579fa 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -35,6 +35,13 @@ struct { __type(value, __u32); } array_map3 SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} array_map4 SEC(".maps"); + struct sample { int pid; long value; @@ -43,6 +50,7 @@ struct sample { struct { __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 4096); } ringbuf SEC(".maps"); int err, val; @@ -66,6 +74,7 @@ static int get_map_val_dynptr(struct bpf_dynptr *ptr) * bpf_ringbuf_submit/discard_dynptr call */ SEC("?raw_tp") +__failure __msg("Unreleased reference id=1") int ringbuf_missing_release1(void *ctx) { struct bpf_dynptr ptr; @@ -78,6 +87,7 @@ int ringbuf_missing_release1(void *ctx) } SEC("?raw_tp") +__failure __msg("Unreleased reference id=2") int ringbuf_missing_release2(void *ctx) { struct bpf_dynptr ptr1, ptr2; @@ -113,6 +123,7 @@ static int missing_release_callback_fn(__u32 index, void *data) /* Any dynptr initialized within a callback must have bpf_dynptr_put called */ SEC("?raw_tp") +__failure __msg("Unreleased reference id") int ringbuf_missing_release_callback(void *ctx) { bpf_loop(10, missing_release_callback_fn, NULL, 0); @@ -121,6 +132,7 @@ int ringbuf_missing_release_callback(void *ctx) /* Can't call bpf_ringbuf_submit/discard_dynptr on a non-initialized dynptr */ SEC("?raw_tp") +__failure __msg("arg 1 is an unacquired reference") int ringbuf_release_uninit_dynptr(void *ctx) { struct bpf_dynptr ptr; @@ -133,6 +145,7 @@ 
int ringbuf_release_uninit_dynptr(void *ctx) /* A dynptr can't be used after it has been invalidated */ SEC("?raw_tp") +__failure __msg("Expected an initialized dynptr as arg #3") int use_after_invalid(void *ctx) { struct bpf_dynptr ptr; @@ -152,6 +165,7 @@ int use_after_invalid(void *ctx) /* Can't call non-dynptr ringbuf APIs on a dynptr ringbuf sample */ SEC("?raw_tp") +__failure __msg("type=mem expected=alloc_mem") int ringbuf_invalid_api(void *ctx) { struct bpf_dynptr ptr; @@ -174,6 +188,7 @@ int ringbuf_invalid_api(void *ctx) /* Can't add a dynptr to a map */ SEC("?raw_tp") +__failure __msg("invalid indirect read from stack") int add_dynptr_to_map1(void *ctx) { struct bpf_dynptr ptr; @@ -191,6 +206,7 @@ int add_dynptr_to_map1(void *ctx) /* Can't add a struct with an embedded dynptr to a map */ SEC("?raw_tp") +__failure __msg("invalid indirect read from stack") int add_dynptr_to_map2(void *ctx) { struct test_info x; @@ -208,6 +224,7 @@ int add_dynptr_to_map2(void *ctx) /* A data slice can't be accessed out of bounds */ SEC("?raw_tp") +__failure __msg("value is outside of the allowed memory range") int data_slice_out_of_bounds_ringbuf(void *ctx) { struct bpf_dynptr ptr; @@ -228,6 +245,7 @@ int data_slice_out_of_bounds_ringbuf(void *ctx) } SEC("?raw_tp") +__failure __msg("value is outside of the allowed memory range") int data_slice_out_of_bounds_map_value(void *ctx) { __u32 key = 0, map_val; @@ -248,6 +266,7 @@ int data_slice_out_of_bounds_map_value(void *ctx) /* A data slice can't be used after it has been released */ SEC("?raw_tp") +__failure __msg("invalid mem access 'scalar'") int data_slice_use_after_release1(void *ctx) { struct bpf_dynptr ptr; @@ -279,6 +298,7 @@ int data_slice_use_after_release1(void *ctx) * ptr2 is at fp - 16). */ SEC("?raw_tp") +__failure __msg("invalid mem access 'scalar'") int data_slice_use_after_release2(void *ctx) { struct bpf_dynptr ptr1, ptr2; @@ -310,6 +330,7 @@ int data_slice_use_after_release2(void *ctx) /* A data slice must be first checked for NULL */ SEC("?raw_tp") +__failure __msg("invalid mem access 'mem_or_null'") int data_slice_missing_null_check1(void *ctx) { struct bpf_dynptr ptr; @@ -330,6 +351,7 @@ int data_slice_missing_null_check1(void *ctx) /* A data slice can't be dereferenced if it wasn't checked for null */ SEC("?raw_tp") +__failure __msg("invalid mem access 'mem_or_null'") int data_slice_missing_null_check2(void *ctx) { struct bpf_dynptr ptr; @@ -352,6 +374,7 @@ int data_slice_missing_null_check2(void *ctx) * dynptr argument */ SEC("?raw_tp") +__failure __msg("invalid indirect read from stack") int invalid_helper1(void *ctx) { struct bpf_dynptr ptr; @@ -366,6 +389,7 @@ int invalid_helper1(void *ctx) /* A dynptr can't be passed into a helper function at a non-zero offset */ SEC("?raw_tp") +__failure __msg("cannot pass in dynptr at an offset=-8") int invalid_helper2(void *ctx) { struct bpf_dynptr ptr; @@ -381,6 +405,7 @@ int invalid_helper2(void *ctx) /* A bpf_dynptr is invalidated if it's been written into */ SEC("?raw_tp") +__failure __msg("Expected an initialized dynptr as arg #1") int invalid_write1(void *ctx) { struct bpf_dynptr ptr; @@ -402,6 +427,7 @@ int invalid_write1(void *ctx) * offset */ SEC("?raw_tp") +__failure __msg("cannot overwrite referenced dynptr") int invalid_write2(void *ctx) { struct bpf_dynptr ptr; @@ -425,6 +451,7 @@ int invalid_write2(void *ctx) * non-const offset */ SEC("?raw_tp") +__failure __msg("cannot overwrite referenced dynptr") int invalid_write3(void *ctx) { struct bpf_dynptr ptr; @@ -456,6 +483,7 @@ 
static int invalid_write4_callback(__u32 index, void *data) * be invalidated as a dynptr */ SEC("?raw_tp") +__failure __msg("cannot overwrite referenced dynptr") int invalid_write4(void *ctx) { struct bpf_dynptr ptr; @@ -472,7 +500,9 @@ int invalid_write4(void *ctx) /* A globally-defined bpf_dynptr can't be used (it must reside as a stack frame) */ struct bpf_dynptr global_dynptr; + SEC("?raw_tp") +__failure __msg("type=map_value expected=fp") int global(void *ctx) { /* this should fail */ @@ -485,6 +515,7 @@ int global(void *ctx) /* A direct read should fail */ SEC("?raw_tp") +__failure __msg("invalid read from stack") int invalid_read1(void *ctx) { struct bpf_dynptr ptr; @@ -501,6 +532,7 @@ int invalid_read1(void *ctx) /* A direct read at an offset should fail */ SEC("?raw_tp") +__failure __msg("cannot pass in dynptr at an offset") int invalid_read2(void *ctx) { struct bpf_dynptr ptr; @@ -516,6 +548,7 @@ int invalid_read2(void *ctx) /* A direct read at an offset into the lower stack slot should fail */ SEC("?raw_tp") +__failure __msg("invalid read from stack") int invalid_read3(void *ctx) { struct bpf_dynptr ptr1, ptr2; @@ -542,6 +575,7 @@ static int invalid_read4_callback(__u32 index, void *data) /* A direct read within a callback function should fail */ SEC("?raw_tp") +__failure __msg("invalid read from stack") int invalid_read4(void *ctx) { struct bpf_dynptr ptr; @@ -557,6 +591,7 @@ int invalid_read4(void *ctx) /* Initializing a dynptr on an offset should fail */ SEC("?raw_tp") +__failure __msg("cannot pass in dynptr at an offset=0") int invalid_offset(void *ctx) { struct bpf_dynptr ptr; @@ -571,6 +606,7 @@ int invalid_offset(void *ctx) /* Can't release a dynptr twice */ SEC("?raw_tp") +__failure __msg("arg 1 is an unacquired reference") int release_twice(void *ctx) { struct bpf_dynptr ptr; @@ -597,6 +633,7 @@ static int release_twice_callback_fn(__u32 index, void *data) * within a calback function, fails */ SEC("?raw_tp") +__failure __msg("arg 1 is an unacquired reference") int release_twice_callback(void *ctx) { struct bpf_dynptr ptr; @@ -612,6 +649,7 @@ int release_twice_callback(void *ctx) /* Reject unsupported local mem types for dynptr_from_mem API */ SEC("?raw_tp") +__failure __msg("Unsupported reg type fp for bpf_dynptr_from_mem data") int dynptr_from_mem_invalid_api(void *ctx) { struct bpf_dynptr ptr; @@ -622,3 +660,243 @@ int dynptr_from_mem_invalid_api(void *ctx) return 0; } + +SEC("?tc") +__failure __msg("cannot overwrite referenced dynptr") __log_level(2) +int dynptr_pruning_overwrite(struct __sk_buff *ctx) +{ + asm volatile ( + "r9 = 0xeB9F; \ + r6 = %[ringbuf] ll; \ + r1 = r6; \ + r2 = 8; \ + r3 = 0; \ + r4 = r10; \ + r4 += -16; \ + call %[bpf_ringbuf_reserve_dynptr]; \ + if r0 == 0 goto pjmp1; \ + goto pjmp2; \ + pjmp1: \ + *(u64 *)(r10 - 16) = r9; \ + pjmp2: \ + r1 = r10; \ + r1 += -16; \ + r2 = 0; \ + call %[bpf_ringbuf_discard_dynptr]; " + : + : __imm(bpf_ringbuf_reserve_dynptr), + __imm(bpf_ringbuf_discard_dynptr), + __imm_addr(ringbuf) + : __clobber_all + ); + return 0; +} + +SEC("?tc") +__success __msg("12: safe") __log_level(2) +int dynptr_pruning_stacksafe(struct __sk_buff *ctx) +{ + asm volatile ( + "r9 = 0xeB9F; \ + r6 = %[ringbuf] ll; \ + r1 = r6; \ + r2 = 8; \ + r3 = 0; \ + r4 = r10; \ + r4 += -16; \ + call %[bpf_ringbuf_reserve_dynptr]; \ + if r0 == 0 goto stjmp1; \ + goto stjmp2; \ + stjmp1: \ + r9 = r9; \ + stjmp2: \ + r1 = r10; \ + r1 += -16; \ + r2 = 0; \ + call %[bpf_ringbuf_discard_dynptr]; " + : + : __imm(bpf_ringbuf_reserve_dynptr), + 
__imm(bpf_ringbuf_discard_dynptr), + __imm_addr(ringbuf) + : __clobber_all + ); + return 0; +} + +SEC("?tc") +__failure __msg("cannot overwrite referenced dynptr") __log_level(2) +int dynptr_pruning_type_confusion(struct __sk_buff *ctx) +{ + asm volatile ( + "r6 = %[array_map4] ll; \ + r7 = %[ringbuf] ll; \ + r1 = r6; \ + r2 = r10; \ + r2 += -8; \ + r9 = 0; \ + *(u64 *)(r2 + 0) = r9; \ + r3 = r10; \ + r3 += -24; \ + r9 = 0xeB9FeB9F; \ + *(u64 *)(r10 - 16) = r9; \ + *(u64 *)(r10 - 24) = r9; \ + r9 = 0; \ + r4 = 0; \ + r8 = r2; \ + call %[bpf_map_update_elem]; \ + r1 = r6; \ + r2 = r8; \ + call %[bpf_map_lookup_elem]; \ + if r0 != 0 goto tjmp1; \ + exit; \ + tjmp1: \ + r8 = r0; \ + r1 = r7; \ + r2 = 8; \ + r3 = 0; \ + r4 = r10; \ + r4 += -16; \ + r0 = *(u64 *)(r0 + 0); \ + call %[bpf_ringbuf_reserve_dynptr]; \ + if r0 == 0 goto tjmp2; \ + r8 = r8; \ + r8 = r8; \ + r8 = r8; \ + r8 = r8; \ + r8 = r8; \ + r8 = r8; \ + r8 = r8; \ + goto tjmp3; \ + tjmp2: \ + *(u64 *)(r10 - 8) = r9; \ + *(u64 *)(r10 - 16) = r9; \ + r1 = r8; \ + r1 += 8; \ + r2 = 0; \ + r3 = 0; \ + r4 = r10; \ + r4 += -16; \ + call %[bpf_dynptr_from_mem]; \ + tjmp3: \ + r1 = r10; \ + r1 += -16; \ + r2 = 0; \ + call %[bpf_ringbuf_discard_dynptr]; " + : + : __imm(bpf_map_update_elem), + __imm(bpf_map_lookup_elem), + __imm(bpf_ringbuf_reserve_dynptr), + __imm(bpf_dynptr_from_mem), + __imm(bpf_ringbuf_discard_dynptr), + __imm_addr(array_map4), + __imm_addr(ringbuf) + : __clobber_all + ); + return 0; +} + +SEC("?tc") +__failure __msg("dynptr has to be at a constant offset") __log_level(2) +int dynptr_var_off_overwrite(struct __sk_buff *ctx) +{ + asm volatile ( + "r9 = 16; \ + *(u32 *)(r10 - 4) = r9; \ + r8 = *(u32 *)(r10 - 4); \ + if r8 >= 0 goto vjmp1; \ + r0 = 1; \ + exit; \ + vjmp1: \ + if r8 <= 16 goto vjmp2; \ + r0 = 1; \ + exit; \ + vjmp2: \ + r8 &= 16; \ + r1 = %[ringbuf] ll; \ + r2 = 8; \ + r3 = 0; \ + r4 = r10; \ + r4 += -32; \ + r4 += r8; \ + call %[bpf_ringbuf_reserve_dynptr]; \ + r9 = 0xeB9F; \ + *(u64 *)(r10 - 16) = r9; \ + r1 = r10; \ + r1 += -32; \ + r1 += r8; \ + r2 = 0; \ + call %[bpf_ringbuf_discard_dynptr]; " + : + : __imm(bpf_ringbuf_reserve_dynptr), + __imm(bpf_ringbuf_discard_dynptr), + __imm_addr(ringbuf) + : __clobber_all + ); + return 0; +} + +SEC("?tc") +__failure __msg("cannot overwrite referenced dynptr") __log_level(2) +int dynptr_partial_slot_invalidate(struct __sk_buff *ctx) +{ + asm volatile ( + "r6 = %[ringbuf] ll; \ + r7 = %[array_map4] ll; \ + r1 = r7; \ + r2 = r10; \ + r2 += -8; \ + r9 = 0; \ + *(u64 *)(r2 + 0) = r9; \ + r3 = r2; \ + r4 = 0; \ + r8 = r2; \ + call %[bpf_map_update_elem]; \ + r1 = r7; \ + r2 = r8; \ + call %[bpf_map_lookup_elem]; \ + if r0 != 0 goto sjmp1; \ + exit; \ + sjmp1: \ + r7 = r0; \ + r1 = r6; \ + r2 = 8; \ + r3 = 0; \ + r4 = r10; \ + r4 += -24; \ + call %[bpf_ringbuf_reserve_dynptr]; \ + *(u64 *)(r10 - 16) = r9; \ + r1 = r7; \ + r2 = 8; \ + r3 = 0; \ + r4 = r10; \ + r4 += -16; \ + call %[bpf_dynptr_from_mem]; \ + r1 = r10; \ + r1 += -512; \ + r2 = 488; \ + r3 = r10; \ + r3 += -24; \ + r4 = 0; \ + r5 = 0; \ + call %[bpf_dynptr_read]; \ + r8 = 1; \ + if r0 != 0 goto sjmp2; \ + r8 = 0; \ + sjmp2: \ + r1 = r10; \ + r1 += -24; \ + r2 = 0; \ + call %[bpf_ringbuf_discard_dynptr]; " + : + : __imm(bpf_map_update_elem), + __imm(bpf_map_lookup_elem), + __imm(bpf_ringbuf_reserve_dynptr), + __imm(bpf_ringbuf_discard_dynptr), + __imm(bpf_dynptr_from_mem), + __imm(bpf_dynptr_read), + __imm_addr(ringbuf), + __imm_addr(array_map4) + : __clobber_all + ); + return 0; +} diff --git 
a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index a3a6103c85694..35db7c6c1fc74 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -20,6 +20,7 @@ struct sample { struct { __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 4096); } ringbuf SEC(".maps"); struct { diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index 05e209b1b12aa..760e41e1a6326 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -3,6 +3,7 @@ #include #include #include +#include "bpf_misc.h" struct map_value { char buf[8]; @@ -23,6 +24,7 @@ extern struct prog_test_ref_kfunc * bpf_kfunc_call_test_kptr_get(struct prog_test_ref_kfunc **p, int a, int b) __ksym; SEC("?tc") +__failure __msg("kptr access size must be BPF_DW") int size_not_bpf_dw(struct __sk_buff *ctx) { struct map_value *v; @@ -37,6 +39,7 @@ int size_not_bpf_dw(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("kptr access cannot have variable offset") int non_const_var_off(struct __sk_buff *ctx) { struct map_value *v; @@ -55,6 +58,7 @@ int non_const_var_off(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R1 doesn't have constant offset. kptr has to be") int non_const_var_off_kptr_xchg(struct __sk_buff *ctx) { struct map_value *v; @@ -73,6 +77,7 @@ int non_const_var_off_kptr_xchg(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("kptr access misaligned expected=8 off=7") int misaligned_access_write(struct __sk_buff *ctx) { struct map_value *v; @@ -88,6 +93,7 @@ int misaligned_access_write(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("kptr access misaligned expected=8 off=1") int misaligned_access_read(struct __sk_buff *ctx) { struct map_value *v; @@ -101,6 +107,7 @@ int misaligned_access_read(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("variable untrusted_ptr_ access var_off=(0x0; 0x1e0)") int reject_var_off_store(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *unref_ptr; @@ -124,6 +131,7 @@ int reject_var_off_store(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("invalid kptr access, R1 type=untrusted_ptr_prog_test_ref_kfunc") int reject_bad_type_match(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *unref_ptr; @@ -144,6 +152,7 @@ int reject_bad_type_match(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_") int marked_as_untrusted_or_null(struct __sk_buff *ctx) { struct map_value *v; @@ -158,6 +167,7 @@ int marked_as_untrusted_or_null(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("access beyond struct prog_test_ref_kfunc at off 32 size 4") int correct_btf_id_check_size(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; @@ -175,6 +185,7 @@ int correct_btf_id_check_size(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R1 type=untrusted_ptr_ expected=percpu_ptr_") int inherit_untrusted_on_walk(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *unref_ptr; @@ -194,6 +205,7 @@ int inherit_untrusted_on_walk(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("off=8 kptr isn't referenced kptr") int reject_kptr_xchg_on_unref(struct __sk_buff *ctx) { struct map_value *v; @@ -208,6 +220,7 @@ int reject_kptr_xchg_on_unref(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("arg#0 expected pointer to map value") int reject_kptr_get_no_map_val(struct __sk_buff *ctx) { 
bpf_kfunc_call_test_kptr_get((void *)&ctx, 0, 0); @@ -215,6 +228,7 @@ int reject_kptr_get_no_map_val(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("arg#0 expected pointer to map value") int reject_kptr_get_no_null_map_val(struct __sk_buff *ctx) { bpf_kfunc_call_test_kptr_get(bpf_map_lookup_elem(&array_map, &(int){0}), 0, 0); @@ -222,6 +236,7 @@ int reject_kptr_get_no_null_map_val(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("arg#0 no referenced kptr at map value offset=0") int reject_kptr_get_no_kptr(struct __sk_buff *ctx) { struct map_value *v; @@ -236,6 +251,7 @@ int reject_kptr_get_no_kptr(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("arg#0 no referenced kptr at map value offset=8") int reject_kptr_get_on_unref(struct __sk_buff *ctx) { struct map_value *v; @@ -250,6 +266,7 @@ int reject_kptr_get_on_unref(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("kernel function bpf_kfunc_call_test_kptr_get args#0") int reject_kptr_get_bad_type_match(struct __sk_buff *ctx) { struct map_value *v; @@ -264,6 +281,7 @@ int reject_kptr_get_bad_type_match(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_") int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx) { struct map_value *v; @@ -278,6 +296,7 @@ int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("store to referenced kptr disallowed") int reject_untrusted_store_to_ref(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; @@ -297,6 +316,7 @@ int reject_untrusted_store_to_ref(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R2 type=untrusted_ptr_ expected=ptr_") int reject_untrusted_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; @@ -315,6 +335,8 @@ int reject_untrusted_xchg(struct __sk_buff *ctx) } SEC("?tc") +__failure +__msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc expected=ptr_prog_test_member") int reject_bad_type_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; @@ -333,6 +355,7 @@ int reject_bad_type_xchg(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc") int reject_member_of_ref_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; @@ -351,6 +374,7 @@ int reject_member_of_ref_xchg(struct __sk_buff *ctx) } SEC("?syscall") +__failure __msg("kptr cannot be accessed indirectly by helper") int reject_indirect_helper_access(struct __sk_buff *ctx) { struct map_value *v; @@ -371,6 +395,7 @@ int write_func(int *p) } SEC("?tc") +__failure __msg("kptr cannot be accessed indirectly by helper") int reject_indirect_global_func_access(struct __sk_buff *ctx) { struct map_value *v; @@ -384,6 +409,7 @@ int reject_indirect_global_func_access(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("Unreleased reference id=5 alloc_insn=") int kptr_xchg_ref_state(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; @@ -402,6 +428,7 @@ int kptr_xchg_ref_state(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("Unreleased reference id=3 alloc_insn=") int kptr_get_ref_state(struct __sk_buff *ctx) { struct map_value *v; diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c index ce39d096bba34..f4a8250329b2d 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c @@ -32,18 +32,6 @@ int err, pid; char _license[] SEC("license") = "GPL"; -SEC("?lsm.s/bpf") -int 
BPF_PROG(dynptr_type_not_supp, int cmd, union bpf_attr *attr, - unsigned int size) -{ - char write_data[64] = "hello there, world!!"; - struct bpf_dynptr ptr; - - bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(write_data), 0, &ptr); - - return bpf_verify_pkcs7_signature(&ptr, &ptr, NULL); -} - SEC("?lsm.s/bpf") int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size) { diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c new file mode 100644 index 0000000000000..679efb3aa785e --- /dev/null +++ b/tools/testing/selftests/bpf/test_loader.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include + +#define str_has_pfx(str, pfx) \ + (strncmp(str, pfx, __builtin_constant_p(pfx) ? sizeof(pfx) - 1 : strlen(pfx)) == 0) + +#define TEST_LOADER_LOG_BUF_SZ 1048576 + +#define TEST_TAG_EXPECT_FAILURE "comment:test_expect_failure" +#define TEST_TAG_EXPECT_SUCCESS "comment:test_expect_success" +#define TEST_TAG_EXPECT_MSG_PFX "comment:test_expect_msg=" +#define TEST_TAG_LOG_LEVEL_PFX "comment:test_log_level=" + +struct test_spec { + const char *name; + bool expect_failure; + const char *expect_msg; + int log_level; +}; + +static int tester_init(struct test_loader *tester) +{ + if (!tester->log_buf) { + tester->log_buf_sz = TEST_LOADER_LOG_BUF_SZ; + tester->log_buf = malloc(tester->log_buf_sz); + if (!ASSERT_OK_PTR(tester->log_buf, "tester_log_buf")) + return -ENOMEM; + } + + return 0; +} + +void test_loader_fini(struct test_loader *tester) +{ + if (!tester) + return; + + free(tester->log_buf); +} + +static int parse_test_spec(struct test_loader *tester, + struct bpf_object *obj, + struct bpf_program *prog, + struct test_spec *spec) +{ + struct btf *btf; + int func_id, i; + + memset(spec, 0, sizeof(*spec)); + + spec->name = bpf_program__name(prog); + + btf = bpf_object__btf(obj); + if (!btf) { + ASSERT_FAIL("BPF object has no BTF"); + return -EINVAL; + } + + func_id = btf__find_by_name_kind(btf, spec->name, BTF_KIND_FUNC); + if (func_id < 0) { + ASSERT_FAIL("failed to find FUNC BTF type for '%s'", spec->name); + return -EINVAL; + } + + for (i = 1; i < btf__type_cnt(btf); i++) { + const struct btf_type *t; + const char *s; + + t = btf__type_by_id(btf, i); + if (!btf_is_decl_tag(t)) + continue; + + if (t->type != func_id || btf_decl_tag(t)->component_idx != -1) + continue; + + s = btf__str_by_offset(btf, t->name_off); + if (strcmp(s, TEST_TAG_EXPECT_FAILURE) == 0) { + spec->expect_failure = true; + } else if (strcmp(s, TEST_TAG_EXPECT_SUCCESS) == 0) { + spec->expect_failure = false; + } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX)) { + spec->expect_msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX) - 1; + } else if (str_has_pfx(s, TEST_TAG_LOG_LEVEL_PFX)) { + errno = 0; + spec->log_level = strtol(s + sizeof(TEST_TAG_LOG_LEVEL_PFX) - 1, NULL, 0); + if (errno) { + ASSERT_FAIL("failed to parse test log level from '%s'", s); + return -EINVAL; + } + } + } + + return 0; +} + +static void prepare_case(struct test_loader *tester, + struct test_spec *spec, + struct bpf_object *obj, + struct bpf_program *prog) +{ + int min_log_level = 0; + + if (env.verbosity > VERBOSE_NONE) + min_log_level = 1; + if (env.verbosity > VERBOSE_VERY) + min_log_level = 2; + + bpf_program__set_log_buf(prog, tester->log_buf, tester->log_buf_sz); + + /* Make sure we set at least minimal log level, unless test requirest + * even higher level already. 
Make sure to preserve independent log + * level 4 (verifier stats), though. + */ + if ((spec->log_level & 3) < min_log_level) + bpf_program__set_log_level(prog, (spec->log_level & 4) | min_log_level); + else + bpf_program__set_log_level(prog, spec->log_level); + + tester->log_buf[0] = '\0'; +} + +static void emit_verifier_log(const char *log_buf, bool force) +{ + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log_buf); +} + +static void validate_case(struct test_loader *tester, + struct test_spec *spec, + struct bpf_object *obj, + struct bpf_program *prog, + int load_err) +{ + if (spec->expect_msg) { + char *match; + + match = strstr(tester->log_buf, spec->expect_msg); + if (!ASSERT_OK_PTR(match, "expect_msg")) { + /* if we are in verbose mode, we've already emitted log */ + if (env.verbosity == VERBOSE_NONE) + emit_verifier_log(tester->log_buf, true /*force*/); + fprintf(stderr, "EXPECTED MSG: '%s'\n", spec->expect_msg); + return; + } + } +} + +/* this function is forced noinline and has short generic name to look better + * in test_progs output (in case of a failure) + */ +static noinline +void run_subtest(struct test_loader *tester, + const char *skel_name, + skel_elf_bytes_fn elf_bytes_factory) +{ + LIBBPF_OPTS(bpf_object_open_opts, open_opts, .object_name = skel_name); + struct bpf_object *obj = NULL, *tobj; + struct bpf_program *prog, *tprog; + const void *obj_bytes; + size_t obj_byte_cnt; + int err; + + if (tester_init(tester) < 0) + return; /* failed to initialize tester */ + + obj_bytes = elf_bytes_factory(&obj_byte_cnt); + obj = bpf_object__open_mem(obj_bytes, obj_byte_cnt, &open_opts); + if (!ASSERT_OK_PTR(obj, "obj_open_mem")) + return; + + bpf_object__for_each_program(prog, obj) { + const char *prog_name = bpf_program__name(prog); + struct test_spec spec; + + if (!test__start_subtest(prog_name)) + continue; + + /* if we can't derive test specification, go to the next test */ + err = parse_test_spec(tester, obj, prog, &spec); + if (!ASSERT_OK(err, "parse_test_spec")) + continue; + + tobj = bpf_object__open_mem(obj_bytes, obj_byte_cnt, &open_opts); + if (!ASSERT_OK_PTR(tobj, "obj_open_mem")) /* shouldn't happen */ + continue; + + bpf_object__for_each_program(tprog, tobj) + bpf_program__set_autoload(tprog, false); + + bpf_object__for_each_program(tprog, tobj) { + /* only load specified program */ + if (strcmp(bpf_program__name(tprog), prog_name) == 0) { + bpf_program__set_autoload(tprog, true); + break; + } + } + + prepare_case(tester, &spec, tobj, tprog); + + err = bpf_object__load(tobj); + if (spec.expect_failure) { + if (!ASSERT_ERR(err, "unexpected_load_success")) { + emit_verifier_log(tester->log_buf, false /*force*/); + goto tobj_cleanup; + } + } else { + if (!ASSERT_OK(err, "unexpected_load_failure")) { + emit_verifier_log(tester->log_buf, true /*force*/); + goto tobj_cleanup; + } + } + + emit_verifier_log(tester->log_buf, false /*force*/); + validate_case(tester, &spec, tobj, tprog, err); + +tobj_cleanup: + bpf_object__close(tobj); + } + + bpf_object__close(obj); +} + +void test_loader__run_subtests(struct test_loader *tester, + const char *skel_name, + skel_elf_bytes_fn elf_bytes_factory) +{ + /* see comment in run_subtest() for why we do this function nesting */ + run_subtest(tester, skel_name, elf_bytes_factory); +} diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index b090996daee5c..3f058dfadbaf1 100644 --- 
a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TEST_PROGS_H +#define __TEST_PROGS_H + #include #include #include @@ -210,6 +213,12 @@ int test__join_cgroup(const char *path); #define CHECK_ATTR(condition, tag, format...) \ _CHECK(condition, tag, tattr.duration, format) +#define ASSERT_FAIL(fmt, args...) ({ \ + static int duration = 0; \ + CHECK(false, "", fmt"\n", ##args); \ + false; \ +}) + #define ASSERT_TRUE(actual, name) ({ \ static int duration = 0; \ bool ___ok = (actual); \ @@ -397,3 +406,27 @@ int write_sysctl(const char *sysctl, const char *value); #endif #define BPF_TESTMOD_TEST_FILE "/sys/kernel/bpf_testmod" + +struct test_loader { + char *log_buf; + size_t log_buf_sz; + + struct bpf_object *obj; +}; + +typedef const void *(*skel_elf_bytes_fn)(size_t *sz); + +extern void test_loader__run_subtests(struct test_loader *tester, + const char *skel_name, + skel_elf_bytes_fn elf_bytes_factory); + +extern void test_loader_fini(struct test_loader *tester); + +#define RUN_TESTS(skel) ({ \ + struct test_loader tester = {}; \ + \ + test_loader__run_subtests(&tester, #skel, skel##__elf_bytes); \ + test_loader_fini(&tester); \ +}) + +#endif /* __TEST_PROGS_H */ diff --git a/tools/testing/selftests/bpf/verifier/ringbuf.c b/tools/testing/selftests/bpf/verifier/ringbuf.c index b64d33e4833c8..92e3f6a61a798 100644 --- a/tools/testing/selftests/bpf/verifier/ringbuf.c +++ b/tools/testing/selftests/bpf/verifier/ringbuf.c @@ -28,7 +28,7 @@ }, .fixup_map_ringbuf = { 1 }, .result = REJECT, - .errstr = "dereference of modified alloc_mem ptr R1", + .errstr = "R1 must have zero offset when passed to release func", }, { "ringbuf: invalid reservation offset 2",
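/*
 * Editor's sketch (assumption, not part of the patch above): the dynptr_fail.c
 * programs pass their maps into naked inline asm via __imm()/__imm_addr() and
 * clobber every BPF register with __clobber_all.  These helpers are assumed to
 * come from bpf_misc.h and to be thin wrappers over the usual inline-asm
 * operand/clobber syntax, roughly:
 */
#define __imm(name)      [name]"i"(name)    /* immediate operand, by value   */
#define __imm_addr(name) [name]"i"(&name)   /* immediate operand, by address */
#define __clobber_all "r0", "r1", "r2", "r3", "r4", "r5", \
		      "r6", "r7", "r8", "r9", "memory"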
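/*
 * Editor's sketch (assumption, not part of the patch above): the __failure,
 * __msg() and __log_level() annotations added to map_kptr_fail.c are expected
 * to expand to BTF declaration tags whose strings match the TEST_TAG_*
 * prefixes parsed by test_loader.c ("comment:test_expect_failure",
 * "comment:test_expect_msg=", "comment:test_log_level=").  Minimal
 * definitions consistent with that parsing would be:
 */
#define __failure        __attribute__((btf_decl_tag("comment:test_expect_failure")))
#define __success        __attribute__((btf_decl_tag("comment:test_expect_success")))
#define __msg(msg)       __attribute__((btf_decl_tag("comment:test_expect_msg=" msg)))
#define __log_level(lvl) __attribute__((btf_decl_tag("comment:test_log_level=" #lvl)))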
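/*
 * Usage sketch (hypothetical runner; the test and skeleton names are
 * illustrative): with the RUN_TESTS() macro added to test_progs.h, a
 * prog_tests/ file only hands the skeleton's ELF bytes to test_loader.
 * Every annotated program in the object then runs as its own subtest:
 * test_loader opens the object, autoloads only that program, sets the log
 * level from __log_level(), and checks the load result and verifier log
 * against the __failure/__msg() expectations.
 */
#include <test_progs.h>
#include "map_kptr_fail.skel.h"	/* generated skeleton; provides map_kptr_fail__elf_bytes() */

void test_map_kptr_fail(void)
{
	RUN_TESTS(map_kptr_fail);
}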